{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 6.999475890985325, "eval_steps": 500, "global_step": 13350, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.002620545073375262, "grad_norm": 11.724857330322266, "learning_rate": 1.197604790419162e-07, "loss": 0.6087, "loss_nan_ranks": 0, "loss_rank_avg": 0.6668875217437744, "step": 5 }, { "epoch": 0.005241090146750524, "grad_norm": 14.185229301452637, "learning_rate": 2.694610778443114e-07, "loss": 0.6416, "loss_nan_ranks": 0, "loss_rank_avg": 0.6689453125, "step": 10 }, { "epoch": 0.007861635220125786, "grad_norm": 14.250484466552734, "learning_rate": 4.191616766467066e-07, "loss": 0.6312, "loss_nan_ranks": 0, "loss_rank_avg": 0.6533203125, "step": 15 }, { "epoch": 0.010482180293501049, "grad_norm": 12.26307487487793, "learning_rate": 5.688622754491019e-07, "loss": 0.6082, "loss_nan_ranks": 0, "loss_rank_avg": 0.5997869968414307, "step": 20 }, { "epoch": 0.01310272536687631, "grad_norm": 11.114072799682617, "learning_rate": 7.18562874251497e-07, "loss": 0.6011, "loss_nan_ranks": 0, "loss_rank_avg": 0.5580273866653442, "step": 25 }, { "epoch": 0.015723270440251572, "grad_norm": 10.276555061340332, "learning_rate": 8.682634730538923e-07, "loss": 0.6239, "loss_nan_ranks": 0, "loss_rank_avg": 0.5910165309906006, "step": 30 }, { "epoch": 0.018343815513626835, "grad_norm": 7.9180121421813965, "learning_rate": 1.0179640718562875e-06, "loss": 0.6123, "loss_nan_ranks": 0, "loss_rank_avg": 0.5606018900871277, "step": 35 }, { "epoch": 0.020964360587002098, "grad_norm": 7.015914440155029, "learning_rate": 1.1676646706586827e-06, "loss": 0.5562, "loss_nan_ranks": 0, "loss_rank_avg": 0.572616457939148, "step": 40 }, { "epoch": 0.02358490566037736, "grad_norm": 6.32131290435791, "learning_rate": 1.3173652694610781e-06, "loss": 0.5463, "loss_nan_ranks": 0, "loss_rank_avg": 0.5819799900054932, "step": 45 }, { "epoch": 0.02620545073375262, "grad_norm": 4.320750713348389, "learning_rate": 1.4670658682634732e-06, "loss": 0.5135, "loss_nan_ranks": 0, "loss_rank_avg": 0.44384765625, "step": 50 }, { "epoch": 0.028825995807127882, "grad_norm": 3.183272361755371, "learning_rate": 1.6167664670658684e-06, "loss": 0.5003, "loss_nan_ranks": 0, "loss_rank_avg": 0.5493164658546448, "step": 55 }, { "epoch": 0.031446540880503145, "grad_norm": 2.9379472732543945, "learning_rate": 1.7664670658682636e-06, "loss": 0.5153, "loss_nan_ranks": 0, "loss_rank_avg": 0.5202266573905945, "step": 60 }, { "epoch": 0.034067085953878404, "grad_norm": 2.233424425125122, "learning_rate": 1.916167664670659e-06, "loss": 0.4874, "loss_nan_ranks": 0, "loss_rank_avg": 0.48310330510139465, "step": 65 }, { "epoch": 0.03668763102725367, "grad_norm": 1.9633489847183228, "learning_rate": 2.065868263473054e-06, "loss": 0.462, "loss_nan_ranks": 0, "loss_rank_avg": 0.4763810634613037, "step": 70 }, { "epoch": 0.03930817610062893, "grad_norm": 2.0084543228149414, "learning_rate": 2.215568862275449e-06, "loss": 0.4679, "loss_nan_ranks": 0, "loss_rank_avg": 0.4606221914291382, "step": 75 }, { "epoch": 0.041928721174004195, "grad_norm": 1.543745517730713, "learning_rate": 2.3652694610778446e-06, "loss": 0.4551, "loss_nan_ranks": 0, "loss_rank_avg": 0.4795137643814087, "step": 80 }, { "epoch": 0.044549266247379454, "grad_norm": 1.4478498697280884, "learning_rate": 2.5149700598802396e-06, "loss": 0.4271, "loss_nan_ranks": 0, "loss_rank_avg": 0.4189101457595825, "step": 85 }, { "epoch": 0.04716981132075472, "grad_norm": 1.2258864641189575, "learning_rate": 2.664670658682635e-06, "loss": 0.3946, "loss_nan_ranks": 0, "loss_rank_avg": 0.3954263925552368, "step": 90 }, { "epoch": 0.04979035639412998, "grad_norm": 1.826331615447998, "learning_rate": 2.81437125748503e-06, "loss": 0.4298, "loss_nan_ranks": 0, "loss_rank_avg": 0.4531334638595581, "step": 95 }, { "epoch": 0.05241090146750524, "grad_norm": 1.2551552057266235, "learning_rate": 2.9640718562874255e-06, "loss": 0.4199, "loss_nan_ranks": 0, "loss_rank_avg": 0.360871821641922, "step": 100 }, { "epoch": 0.055031446540880505, "grad_norm": 1.3189074993133545, "learning_rate": 3.113772455089821e-06, "loss": 0.4025, "loss_nan_ranks": 0, "loss_rank_avg": 0.36634361743927, "step": 105 }, { "epoch": 0.057651991614255764, "grad_norm": 1.0882036685943604, "learning_rate": 3.263473053892216e-06, "loss": 0.3986, "loss_nan_ranks": 0, "loss_rank_avg": 0.3869917094707489, "step": 110 }, { "epoch": 0.06027253668763103, "grad_norm": 1.296404242515564, "learning_rate": 3.4131736526946114e-06, "loss": 0.4135, "loss_nan_ranks": 0, "loss_rank_avg": 0.4592534005641937, "step": 115 }, { "epoch": 0.06289308176100629, "grad_norm": 1.1988800764083862, "learning_rate": 3.562874251497006e-06, "loss": 0.3877, "loss_nan_ranks": 0, "loss_rank_avg": 0.358315646648407, "step": 120 }, { "epoch": 0.06551362683438156, "grad_norm": 1.1711173057556152, "learning_rate": 3.7125748502994014e-06, "loss": 0.3901, "loss_nan_ranks": 0, "loss_rank_avg": 0.4149848222732544, "step": 125 }, { "epoch": 0.06813417190775681, "grad_norm": 1.0720573663711548, "learning_rate": 3.862275449101797e-06, "loss": 0.3836, "loss_nan_ranks": 0, "loss_rank_avg": 0.4089677929878235, "step": 130 }, { "epoch": 0.07075471698113207, "grad_norm": 1.0855176448822021, "learning_rate": 4.011976047904192e-06, "loss": 0.3835, "loss_nan_ranks": 0, "loss_rank_avg": 0.36083984375, "step": 135 }, { "epoch": 0.07337526205450734, "grad_norm": 1.0493624210357666, "learning_rate": 4.161676646706587e-06, "loss": 0.3664, "loss_nan_ranks": 0, "loss_rank_avg": 0.3388671875, "step": 140 }, { "epoch": 0.0759958071278826, "grad_norm": 1.1222777366638184, "learning_rate": 4.311377245508982e-06, "loss": 0.3917, "loss_nan_ranks": 0, "loss_rank_avg": 0.37147727608680725, "step": 145 }, { "epoch": 0.07861635220125786, "grad_norm": 0.912915825843811, "learning_rate": 4.461077844311378e-06, "loss": 0.3112, "loss_nan_ranks": 0, "loss_rank_avg": 0.2651228904724121, "step": 150 }, { "epoch": 0.08123689727463312, "grad_norm": 0.7332404851913452, "learning_rate": 4.610778443113773e-06, "loss": 0.3405, "loss_nan_ranks": 0, "loss_rank_avg": 0.3047122061252594, "step": 155 }, { "epoch": 0.08385744234800839, "grad_norm": 0.8747320175170898, "learning_rate": 4.760479041916168e-06, "loss": 0.3459, "loss_nan_ranks": 0, "loss_rank_avg": 0.351220965385437, "step": 160 }, { "epoch": 0.08647798742138364, "grad_norm": 1.0423736572265625, "learning_rate": 4.910179640718563e-06, "loss": 0.3364, "loss_nan_ranks": 0, "loss_rank_avg": 0.3234074115753174, "step": 165 }, { "epoch": 0.08909853249475891, "grad_norm": 1.1128911972045898, "learning_rate": 5.059880239520959e-06, "loss": 0.341, "loss_nan_ranks": 0, "loss_rank_avg": 0.37532955408096313, "step": 170 }, { "epoch": 0.09171907756813417, "grad_norm": 0.9953895807266235, "learning_rate": 5.209580838323353e-06, "loss": 0.3598, "loss_nan_ranks": 0, "loss_rank_avg": 0.3447265625, "step": 175 }, { "epoch": 0.09433962264150944, "grad_norm": 1.1297733783721924, "learning_rate": 5.359281437125749e-06, "loss": 0.333, "loss_nan_ranks": 0, "loss_rank_avg": 0.3620813190937042, "step": 180 }, { "epoch": 0.09696016771488469, "grad_norm": 0.9839518070220947, "learning_rate": 5.508982035928144e-06, "loss": 0.3589, "loss_nan_ranks": 0, "loss_rank_avg": 0.3644392788410187, "step": 185 }, { "epoch": 0.09958071278825996, "grad_norm": 0.9680199027061462, "learning_rate": 5.658682634730539e-06, "loss": 0.3303, "loss_nan_ranks": 0, "loss_rank_avg": 0.3452174961566925, "step": 190 }, { "epoch": 0.10220125786163523, "grad_norm": 1.0094338655471802, "learning_rate": 5.808383233532935e-06, "loss": 0.3228, "loss_nan_ranks": 0, "loss_rank_avg": 0.3340649902820587, "step": 195 }, { "epoch": 0.10482180293501048, "grad_norm": 0.9142029881477356, "learning_rate": 5.95808383233533e-06, "loss": 0.334, "loss_nan_ranks": 0, "loss_rank_avg": 0.2673777937889099, "step": 200 }, { "epoch": 0.10744234800838574, "grad_norm": 2.28641939163208, "learning_rate": 6.107784431137725e-06, "loss": 0.3214, "loss_nan_ranks": 0, "loss_rank_avg": 0.3424881398677826, "step": 205 }, { "epoch": 0.11006289308176101, "grad_norm": 1.3070180416107178, "learning_rate": 6.25748502994012e-06, "loss": 0.317, "loss_nan_ranks": 0, "loss_rank_avg": 0.33521905541419983, "step": 210 }, { "epoch": 0.11268343815513626, "grad_norm": 1.0307612419128418, "learning_rate": 6.407185628742516e-06, "loss": 0.351, "loss_nan_ranks": 0, "loss_rank_avg": 0.33849790692329407, "step": 215 }, { "epoch": 0.11530398322851153, "grad_norm": 0.7611124515533447, "learning_rate": 6.556886227544911e-06, "loss": 0.33, "loss_nan_ranks": 0, "loss_rank_avg": 0.31713446974754333, "step": 220 }, { "epoch": 0.1179245283018868, "grad_norm": 1.1252394914627075, "learning_rate": 6.706586826347305e-06, "loss": 0.3151, "loss_nan_ranks": 0, "loss_rank_avg": 0.30296391248703003, "step": 225 }, { "epoch": 0.12054507337526206, "grad_norm": 1.2389367818832397, "learning_rate": 6.8562874251497016e-06, "loss": 0.3268, "loss_nan_ranks": 0, "loss_rank_avg": 0.3277322053909302, "step": 230 }, { "epoch": 0.12316561844863731, "grad_norm": 1.3844778537750244, "learning_rate": 7.005988023952096e-06, "loss": 0.3227, "loss_nan_ranks": 0, "loss_rank_avg": 0.28340739011764526, "step": 235 }, { "epoch": 0.12578616352201258, "grad_norm": 1.3721177577972412, "learning_rate": 7.155688622754492e-06, "loss": 0.3369, "loss_nan_ranks": 0, "loss_rank_avg": 0.32868561148643494, "step": 240 }, { "epoch": 0.12840670859538783, "grad_norm": 0.9161558747291565, "learning_rate": 7.305389221556887e-06, "loss": 0.3311, "loss_nan_ranks": 0, "loss_rank_avg": 0.3234233856201172, "step": 245 }, { "epoch": 0.1310272536687631, "grad_norm": 0.9054167866706848, "learning_rate": 7.4550898203592825e-06, "loss": 0.3368, "loss_nan_ranks": 0, "loss_rank_avg": 0.35056421160697937, "step": 250 }, { "epoch": 0.13364779874213836, "grad_norm": 0.8199090957641602, "learning_rate": 7.604790419161677e-06, "loss": 0.3229, "loss_nan_ranks": 0, "loss_rank_avg": 0.3535444140434265, "step": 255 }, { "epoch": 0.13626834381551362, "grad_norm": 0.8128832578659058, "learning_rate": 7.754491017964072e-06, "loss": 0.3265, "loss_nan_ranks": 0, "loss_rank_avg": 0.3213815689086914, "step": 260 }, { "epoch": 0.1388888888888889, "grad_norm": 1.2015630006790161, "learning_rate": 7.904191616766468e-06, "loss": 0.3267, "loss_nan_ranks": 0, "loss_rank_avg": 0.3715681731700897, "step": 265 }, { "epoch": 0.14150943396226415, "grad_norm": 0.9504061937332153, "learning_rate": 8.053892215568863e-06, "loss": 0.3377, "loss_nan_ranks": 0, "loss_rank_avg": 0.32142454385757446, "step": 270 }, { "epoch": 0.1441299790356394, "grad_norm": 1.0159927606582642, "learning_rate": 8.203592814371259e-06, "loss": 0.3108, "loss_nan_ranks": 0, "loss_rank_avg": 0.3076171875, "step": 275 }, { "epoch": 0.14675052410901468, "grad_norm": 0.9303966164588928, "learning_rate": 8.353293413173653e-06, "loss": 0.3086, "loss_nan_ranks": 0, "loss_rank_avg": 0.2712244391441345, "step": 280 }, { "epoch": 0.14937106918238993, "grad_norm": 2.135909080505371, "learning_rate": 8.50299401197605e-06, "loss": 0.3045, "loss_nan_ranks": 0, "loss_rank_avg": 0.3287564814090729, "step": 285 }, { "epoch": 0.1519916142557652, "grad_norm": 1.1756415367126465, "learning_rate": 8.652694610778444e-06, "loss": 0.3074, "loss_nan_ranks": 0, "loss_rank_avg": 0.3280114233493805, "step": 290 }, { "epoch": 0.15461215932914046, "grad_norm": 0.899340808391571, "learning_rate": 8.802395209580839e-06, "loss": 0.3026, "loss_nan_ranks": 0, "loss_rank_avg": 0.2846500277519226, "step": 295 }, { "epoch": 0.15723270440251572, "grad_norm": 0.9873744249343872, "learning_rate": 8.952095808383234e-06, "loss": 0.3079, "loss_nan_ranks": 0, "loss_rank_avg": 0.33438584208488464, "step": 300 }, { "epoch": 0.159853249475891, "grad_norm": 1.1683752536773682, "learning_rate": 9.10179640718563e-06, "loss": 0.2949, "loss_nan_ranks": 0, "loss_rank_avg": 0.283935546875, "step": 305 }, { "epoch": 0.16247379454926625, "grad_norm": 0.8605905175209045, "learning_rate": 9.251497005988024e-06, "loss": 0.3146, "loss_nan_ranks": 0, "loss_rank_avg": 0.3393379747867584, "step": 310 }, { "epoch": 0.1650943396226415, "grad_norm": 1.0230729579925537, "learning_rate": 9.401197604790419e-06, "loss": 0.2774, "loss_nan_ranks": 0, "loss_rank_avg": 0.30908203125, "step": 315 }, { "epoch": 0.16771488469601678, "grad_norm": 1.3808443546295166, "learning_rate": 9.550898203592815e-06, "loss": 0.2966, "loss_nan_ranks": 0, "loss_rank_avg": 0.2983994781970978, "step": 320 }, { "epoch": 0.17033542976939203, "grad_norm": 1.0363587141036987, "learning_rate": 9.70059880239521e-06, "loss": 0.2853, "loss_nan_ranks": 0, "loss_rank_avg": 0.27548953890800476, "step": 325 }, { "epoch": 0.17295597484276728, "grad_norm": 0.9016790390014648, "learning_rate": 9.850299401197606e-06, "loss": 0.3172, "loss_nan_ranks": 0, "loss_rank_avg": 0.2579153776168823, "step": 330 }, { "epoch": 0.17557651991614256, "grad_norm": 1.2287031412124634, "learning_rate": 1e-05, "loss": 0.3169, "loss_nan_ranks": 0, "loss_rank_avg": 0.3329937756061554, "step": 335 }, { "epoch": 0.17819706498951782, "grad_norm": 0.9567996263504028, "learning_rate": 1.0149700598802397e-05, "loss": 0.3022, "loss_nan_ranks": 0, "loss_rank_avg": 0.3393104672431946, "step": 340 }, { "epoch": 0.18081761006289307, "grad_norm": 1.061110258102417, "learning_rate": 1.029940119760479e-05, "loss": 0.3101, "loss_nan_ranks": 0, "loss_rank_avg": 0.3320685923099518, "step": 345 }, { "epoch": 0.18343815513626835, "grad_norm": 0.909108579158783, "learning_rate": 1.0449101796407186e-05, "loss": 0.281, "loss_nan_ranks": 0, "loss_rank_avg": 0.26276031136512756, "step": 350 }, { "epoch": 0.1860587002096436, "grad_norm": 1.0113543272018433, "learning_rate": 1.0598802395209583e-05, "loss": 0.3055, "loss_nan_ranks": 0, "loss_rank_avg": 0.34771761298179626, "step": 355 }, { "epoch": 0.18867924528301888, "grad_norm": 0.7303060293197632, "learning_rate": 1.0748502994011977e-05, "loss": 0.3203, "loss_nan_ranks": 0, "loss_rank_avg": 0.3523019850254059, "step": 360 }, { "epoch": 0.19129979035639413, "grad_norm": 1.0004197359085083, "learning_rate": 1.0898203592814372e-05, "loss": 0.2992, "loss_nan_ranks": 0, "loss_rank_avg": 0.2729937434196472, "step": 365 }, { "epoch": 0.19392033542976939, "grad_norm": 1.0297443866729736, "learning_rate": 1.1047904191616768e-05, "loss": 0.2786, "loss_nan_ranks": 0, "loss_rank_avg": 0.27099609375, "step": 370 }, { "epoch": 0.19654088050314467, "grad_norm": 2.403956651687622, "learning_rate": 1.1197604790419163e-05, "loss": 0.2917, "loss_nan_ranks": 0, "loss_rank_avg": 0.249267578125, "step": 375 }, { "epoch": 0.19916142557651992, "grad_norm": 2.3001322746276855, "learning_rate": 1.1347305389221557e-05, "loss": 0.2929, "loss_nan_ranks": 0, "loss_rank_avg": 0.3072666525840759, "step": 380 }, { "epoch": 0.20178197064989517, "grad_norm": 0.873443603515625, "learning_rate": 1.1497005988023952e-05, "loss": 0.2645, "loss_nan_ranks": 0, "loss_rank_avg": 0.27001953125, "step": 385 }, { "epoch": 0.20440251572327045, "grad_norm": 1.0583703517913818, "learning_rate": 1.1646706586826348e-05, "loss": 0.2894, "loss_nan_ranks": 0, "loss_rank_avg": 0.259521484375, "step": 390 }, { "epoch": 0.2070230607966457, "grad_norm": 0.9356868267059326, "learning_rate": 1.1796407185628744e-05, "loss": 0.2728, "loss_nan_ranks": 0, "loss_rank_avg": 0.27055835723876953, "step": 395 }, { "epoch": 0.20964360587002095, "grad_norm": 0.9089909195899963, "learning_rate": 1.1946107784431137e-05, "loss": 0.3039, "loss_nan_ranks": 0, "loss_rank_avg": 0.3303355276584625, "step": 400 }, { "epoch": 0.21226415094339623, "grad_norm": 1.0864156484603882, "learning_rate": 1.2095808383233534e-05, "loss": 0.2842, "loss_nan_ranks": 0, "loss_rank_avg": 0.2611331045627594, "step": 405 }, { "epoch": 0.2148846960167715, "grad_norm": 0.903243362903595, "learning_rate": 1.224550898203593e-05, "loss": 0.3116, "loss_nan_ranks": 0, "loss_rank_avg": 0.31614750623703003, "step": 410 }, { "epoch": 0.21750524109014674, "grad_norm": 0.9702324271202087, "learning_rate": 1.2395209580838323e-05, "loss": 0.2772, "loss_nan_ranks": 0, "loss_rank_avg": 0.278564453125, "step": 415 }, { "epoch": 0.22012578616352202, "grad_norm": 0.8921561241149902, "learning_rate": 1.2544910179640719e-05, "loss": 0.282, "loss_nan_ranks": 0, "loss_rank_avg": 0.28161734342575073, "step": 420 }, { "epoch": 0.22274633123689727, "grad_norm": 1.4527266025543213, "learning_rate": 1.2694610778443115e-05, "loss": 0.2858, "loss_nan_ranks": 0, "loss_rank_avg": 0.3099859654903412, "step": 425 }, { "epoch": 0.22536687631027252, "grad_norm": 1.3680222034454346, "learning_rate": 1.284431137724551e-05, "loss": 0.3097, "loss_nan_ranks": 0, "loss_rank_avg": 0.3171539008617401, "step": 430 }, { "epoch": 0.2279874213836478, "grad_norm": 1.5904136896133423, "learning_rate": 1.2994011976047905e-05, "loss": 0.2802, "loss_nan_ranks": 0, "loss_rank_avg": 0.23265689611434937, "step": 435 }, { "epoch": 0.23060796645702306, "grad_norm": 0.776222825050354, "learning_rate": 1.3143712574850301e-05, "loss": 0.2874, "loss_nan_ranks": 0, "loss_rank_avg": 0.2671012878417969, "step": 440 }, { "epoch": 0.23322851153039834, "grad_norm": 1.0297017097473145, "learning_rate": 1.3293413173652696e-05, "loss": 0.255, "loss_nan_ranks": 0, "loss_rank_avg": 0.24852225184440613, "step": 445 }, { "epoch": 0.2358490566037736, "grad_norm": 0.9424745440483093, "learning_rate": 1.3443113772455092e-05, "loss": 0.2994, "loss_nan_ranks": 0, "loss_rank_avg": 0.3500230610370636, "step": 450 }, { "epoch": 0.23846960167714884, "grad_norm": 1.6893078088760376, "learning_rate": 1.3592814371257486e-05, "loss": 0.2994, "loss_nan_ranks": 0, "loss_rank_avg": 0.28594040870666504, "step": 455 }, { "epoch": 0.24109014675052412, "grad_norm": 0.9330493211746216, "learning_rate": 1.3742514970059881e-05, "loss": 0.2906, "loss_nan_ranks": 0, "loss_rank_avg": 0.34921205043792725, "step": 460 }, { "epoch": 0.24371069182389937, "grad_norm": 0.854236364364624, "learning_rate": 1.3892215568862277e-05, "loss": 0.2652, "loss_nan_ranks": 0, "loss_rank_avg": 0.27588099241256714, "step": 465 }, { "epoch": 0.24633123689727462, "grad_norm": 1.0205166339874268, "learning_rate": 1.404191616766467e-05, "loss": 0.2785, "loss_nan_ranks": 0, "loss_rank_avg": 0.2535867989063263, "step": 470 }, { "epoch": 0.2489517819706499, "grad_norm": 0.9875480532646179, "learning_rate": 1.4191616766467067e-05, "loss": 0.283, "loss_nan_ranks": 0, "loss_rank_avg": 0.291537880897522, "step": 475 }, { "epoch": 0.25157232704402516, "grad_norm": 0.8087443113327026, "learning_rate": 1.4341317365269463e-05, "loss": 0.2834, "loss_nan_ranks": 0, "loss_rank_avg": 0.31447193026542664, "step": 480 }, { "epoch": 0.25419287211740044, "grad_norm": 0.8693152666091919, "learning_rate": 1.4491017964071859e-05, "loss": 0.2815, "loss_nan_ranks": 0, "loss_rank_avg": 0.28755509853363037, "step": 485 }, { "epoch": 0.25681341719077566, "grad_norm": 0.8461484909057617, "learning_rate": 1.4640718562874252e-05, "loss": 0.279, "loss_nan_ranks": 0, "loss_rank_avg": 0.30435264110565186, "step": 490 }, { "epoch": 0.25943396226415094, "grad_norm": 0.9464346170425415, "learning_rate": 1.4790419161676648e-05, "loss": 0.2957, "loss_nan_ranks": 0, "loss_rank_avg": 0.31394729018211365, "step": 495 }, { "epoch": 0.2620545073375262, "grad_norm": 0.9506782293319702, "learning_rate": 1.4940119760479045e-05, "loss": 0.2702, "loss_nan_ranks": 0, "loss_rank_avg": 0.2537984848022461, "step": 500 }, { "epoch": 0.26467505241090145, "grad_norm": 0.8543886542320251, "learning_rate": 1.5089820359281437e-05, "loss": 0.2885, "loss_nan_ranks": 0, "loss_rank_avg": 0.2579079866409302, "step": 505 }, { "epoch": 0.2672955974842767, "grad_norm": 1.0954477787017822, "learning_rate": 1.5239520958083834e-05, "loss": 0.2722, "loss_nan_ranks": 0, "loss_rank_avg": 0.25832486152648926, "step": 510 }, { "epoch": 0.269916142557652, "grad_norm": 1.5733797550201416, "learning_rate": 1.538922155688623e-05, "loss": 0.2545, "loss_nan_ranks": 0, "loss_rank_avg": 0.2169189453125, "step": 515 }, { "epoch": 0.27253668763102723, "grad_norm": 0.9599918723106384, "learning_rate": 1.5538922155688625e-05, "loss": 0.269, "loss_nan_ranks": 0, "loss_rank_avg": 0.27587890625, "step": 520 }, { "epoch": 0.2751572327044025, "grad_norm": 0.7465035319328308, "learning_rate": 1.5688622754491018e-05, "loss": 0.303, "loss_nan_ranks": 0, "loss_rank_avg": 0.30516883730888367, "step": 525 }, { "epoch": 0.2777777777777778, "grad_norm": 1.8104451894760132, "learning_rate": 1.5838323353293414e-05, "loss": 0.2497, "loss_nan_ranks": 0, "loss_rank_avg": 0.2503080368041992, "step": 530 }, { "epoch": 0.280398322851153, "grad_norm": 0.8634847402572632, "learning_rate": 1.598802395209581e-05, "loss": 0.2873, "loss_nan_ranks": 0, "loss_rank_avg": 0.30565452575683594, "step": 535 }, { "epoch": 0.2830188679245283, "grad_norm": 1.3197232484817505, "learning_rate": 1.6137724550898203e-05, "loss": 0.2649, "loss_nan_ranks": 0, "loss_rank_avg": 0.2626909613609314, "step": 540 }, { "epoch": 0.2856394129979036, "grad_norm": 1.1044285297393799, "learning_rate": 1.62874251497006e-05, "loss": 0.2716, "loss_nan_ranks": 0, "loss_rank_avg": 0.2565639019012451, "step": 545 }, { "epoch": 0.2882599580712788, "grad_norm": 2.0272889137268066, "learning_rate": 1.6437125748502996e-05, "loss": 0.2676, "loss_nan_ranks": 0, "loss_rank_avg": 0.2064208984375, "step": 550 }, { "epoch": 0.2908805031446541, "grad_norm": 1.0531353950500488, "learning_rate": 1.6586826347305392e-05, "loss": 0.2784, "loss_nan_ranks": 0, "loss_rank_avg": 0.32692980766296387, "step": 555 }, { "epoch": 0.29350104821802936, "grad_norm": 1.0479260683059692, "learning_rate": 1.6736526946107785e-05, "loss": 0.2805, "loss_nan_ranks": 0, "loss_rank_avg": 0.27001953125, "step": 560 }, { "epoch": 0.29612159329140464, "grad_norm": 1.0311917066574097, "learning_rate": 1.688622754491018e-05, "loss": 0.271, "loss_nan_ranks": 0, "loss_rank_avg": 0.2805914878845215, "step": 565 }, { "epoch": 0.29874213836477986, "grad_norm": 0.7885255217552185, "learning_rate": 1.7035928143712577e-05, "loss": 0.2821, "loss_nan_ranks": 0, "loss_rank_avg": 0.3129787743091583, "step": 570 }, { "epoch": 0.30136268343815514, "grad_norm": 0.96186763048172, "learning_rate": 1.718562874251497e-05, "loss": 0.2695, "loss_nan_ranks": 0, "loss_rank_avg": 0.26282820105552673, "step": 575 }, { "epoch": 0.3039832285115304, "grad_norm": 0.9567992687225342, "learning_rate": 1.7335329341317367e-05, "loss": 0.272, "loss_nan_ranks": 0, "loss_rank_avg": 0.27879998087882996, "step": 580 }, { "epoch": 0.30660377358490565, "grad_norm": 1.8625792264938354, "learning_rate": 1.7485029940119763e-05, "loss": 0.2574, "loss_nan_ranks": 0, "loss_rank_avg": 0.29016339778900146, "step": 585 }, { "epoch": 0.30922431865828093, "grad_norm": 1.0454092025756836, "learning_rate": 1.763473053892216e-05, "loss": 0.2617, "loss_nan_ranks": 0, "loss_rank_avg": 0.2546542286872864, "step": 590 }, { "epoch": 0.3118448637316562, "grad_norm": 1.0812140703201294, "learning_rate": 1.7784431137724552e-05, "loss": 0.2361, "loss_nan_ranks": 0, "loss_rank_avg": 0.23095703125, "step": 595 }, { "epoch": 0.31446540880503143, "grad_norm": 0.9106873273849487, "learning_rate": 1.793413173652695e-05, "loss": 0.2618, "loss_nan_ranks": 0, "loss_rank_avg": 0.25149843096733093, "step": 600 }, { "epoch": 0.3170859538784067, "grad_norm": 0.7510280609130859, "learning_rate": 1.8083832335329345e-05, "loss": 0.2822, "loss_nan_ranks": 0, "loss_rank_avg": 0.25785890221595764, "step": 605 }, { "epoch": 0.319706498951782, "grad_norm": 1.044481635093689, "learning_rate": 1.8233532934131738e-05, "loss": 0.2627, "loss_nan_ranks": 0, "loss_rank_avg": 0.2672847807407379, "step": 610 }, { "epoch": 0.3223270440251572, "grad_norm": 0.9411810636520386, "learning_rate": 1.8383233532934134e-05, "loss": 0.2691, "loss_nan_ranks": 0, "loss_rank_avg": 0.30929991602897644, "step": 615 }, { "epoch": 0.3249475890985325, "grad_norm": 1.0617573261260986, "learning_rate": 1.853293413173653e-05, "loss": 0.2738, "loss_nan_ranks": 0, "loss_rank_avg": 0.22705078125, "step": 620 }, { "epoch": 0.3275681341719078, "grad_norm": 0.8805792331695557, "learning_rate": 1.8682634730538923e-05, "loss": 0.2639, "loss_nan_ranks": 0, "loss_rank_avg": 0.239990234375, "step": 625 }, { "epoch": 0.330188679245283, "grad_norm": 1.3362305164337158, "learning_rate": 1.883233532934132e-05, "loss": 0.2726, "loss_nan_ranks": 0, "loss_rank_avg": 0.23389750719070435, "step": 630 }, { "epoch": 0.3328092243186583, "grad_norm": 1.0816547870635986, "learning_rate": 1.8982035928143712e-05, "loss": 0.2577, "loss_nan_ranks": 0, "loss_rank_avg": 0.216552734375, "step": 635 }, { "epoch": 0.33542976939203356, "grad_norm": 1.0076228380203247, "learning_rate": 1.913173652694611e-05, "loss": 0.2518, "loss_nan_ranks": 0, "loss_rank_avg": 0.23681640625, "step": 640 }, { "epoch": 0.3380503144654088, "grad_norm": 1.2804813385009766, "learning_rate": 1.9281437125748505e-05, "loss": 0.2531, "loss_nan_ranks": 0, "loss_rank_avg": 0.239990234375, "step": 645 }, { "epoch": 0.34067085953878407, "grad_norm": 1.0749441385269165, "learning_rate": 1.9431137724550898e-05, "loss": 0.2791, "loss_nan_ranks": 0, "loss_rank_avg": 0.28917208313941956, "step": 650 }, { "epoch": 0.34329140461215935, "grad_norm": 0.7442736029624939, "learning_rate": 1.9580838323353294e-05, "loss": 0.2784, "loss_nan_ranks": 0, "loss_rank_avg": 0.25295543670654297, "step": 655 }, { "epoch": 0.34591194968553457, "grad_norm": 0.8936218023300171, "learning_rate": 1.973053892215569e-05, "loss": 0.2635, "loss_nan_ranks": 0, "loss_rank_avg": 0.27409684658050537, "step": 660 }, { "epoch": 0.34853249475890985, "grad_norm": 0.8553372621536255, "learning_rate": 1.9880239520958083e-05, "loss": 0.2542, "loss_nan_ranks": 0, "loss_rank_avg": 0.2538270652294159, "step": 665 }, { "epoch": 0.35115303983228513, "grad_norm": 0.8348433375358582, "learning_rate": 2.002994011976048e-05, "loss": 0.2731, "loss_nan_ranks": 0, "loss_rank_avg": 0.26132816076278687, "step": 670 }, { "epoch": 0.35377358490566035, "grad_norm": 0.8847088813781738, "learning_rate": 2.0179640718562872e-05, "loss": 0.277, "loss_nan_ranks": 0, "loss_rank_avg": 0.274321973323822, "step": 675 }, { "epoch": 0.35639412997903563, "grad_norm": 0.8822789788246155, "learning_rate": 2.0329341317365272e-05, "loss": 0.2708, "loss_nan_ranks": 0, "loss_rank_avg": 0.272810161113739, "step": 680 }, { "epoch": 0.3590146750524109, "grad_norm": 1.0013290643692017, "learning_rate": 2.0479041916167665e-05, "loss": 0.2665, "loss_nan_ranks": 0, "loss_rank_avg": 0.2875209450721741, "step": 685 }, { "epoch": 0.36163522012578614, "grad_norm": 1.2342756986618042, "learning_rate": 2.0628742514970065e-05, "loss": 0.248, "loss_nan_ranks": 0, "loss_rank_avg": 0.22775958478450775, "step": 690 }, { "epoch": 0.3642557651991614, "grad_norm": 1.2740610837936401, "learning_rate": 2.0778443113772458e-05, "loss": 0.2633, "loss_nan_ranks": 0, "loss_rank_avg": 0.21473848819732666, "step": 695 }, { "epoch": 0.3668763102725367, "grad_norm": 0.8021836280822754, "learning_rate": 2.092814371257485e-05, "loss": 0.2546, "loss_nan_ranks": 0, "loss_rank_avg": 0.26980459690093994, "step": 700 }, { "epoch": 0.3694968553459119, "grad_norm": 1.2527188062667847, "learning_rate": 2.107784431137725e-05, "loss": 0.2689, "loss_nan_ranks": 0, "loss_rank_avg": 0.25510236620903015, "step": 705 }, { "epoch": 0.3721174004192872, "grad_norm": 0.7445383071899414, "learning_rate": 2.1227544910179643e-05, "loss": 0.2668, "loss_nan_ranks": 0, "loss_rank_avg": 0.27235108613967896, "step": 710 }, { "epoch": 0.3747379454926625, "grad_norm": 0.8911032676696777, "learning_rate": 2.1377245508982036e-05, "loss": 0.2808, "loss_nan_ranks": 0, "loss_rank_avg": 0.3051337003707886, "step": 715 }, { "epoch": 0.37735849056603776, "grad_norm": 0.9463456273078918, "learning_rate": 2.1526946107784436e-05, "loss": 0.2538, "loss_nan_ranks": 0, "loss_rank_avg": 0.2794130742549896, "step": 720 }, { "epoch": 0.379979035639413, "grad_norm": 1.009350299835205, "learning_rate": 2.167664670658683e-05, "loss": 0.2627, "loss_nan_ranks": 0, "loss_rank_avg": 0.31796935200691223, "step": 725 }, { "epoch": 0.38259958071278827, "grad_norm": 0.9498868584632874, "learning_rate": 2.182634730538922e-05, "loss": 0.2463, "loss_nan_ranks": 0, "loss_rank_avg": 0.214111328125, "step": 730 }, { "epoch": 0.38522012578616355, "grad_norm": 1.2612749338150024, "learning_rate": 2.197604790419162e-05, "loss": 0.2796, "loss_nan_ranks": 0, "loss_rank_avg": 0.28517019748687744, "step": 735 }, { "epoch": 0.38784067085953877, "grad_norm": 1.0274648666381836, "learning_rate": 2.2125748502994014e-05, "loss": 0.2598, "loss_nan_ranks": 0, "loss_rank_avg": 0.3108254373073578, "step": 740 }, { "epoch": 0.39046121593291405, "grad_norm": 1.821379542350769, "learning_rate": 2.2275449101796407e-05, "loss": 0.2419, "loss_nan_ranks": 0, "loss_rank_avg": 0.22509765625, "step": 745 }, { "epoch": 0.39308176100628933, "grad_norm": 0.9160019755363464, "learning_rate": 2.2425149700598807e-05, "loss": 0.2681, "loss_nan_ranks": 0, "loss_rank_avg": 0.2566436231136322, "step": 750 }, { "epoch": 0.39570230607966456, "grad_norm": 1.0037506818771362, "learning_rate": 2.25748502994012e-05, "loss": 0.2433, "loss_nan_ranks": 0, "loss_rank_avg": 0.23486328125, "step": 755 }, { "epoch": 0.39832285115303984, "grad_norm": 1.2298099994659424, "learning_rate": 2.2724550898203596e-05, "loss": 0.277, "loss_nan_ranks": 0, "loss_rank_avg": 0.255581796169281, "step": 760 }, { "epoch": 0.4009433962264151, "grad_norm": 1.3319263458251953, "learning_rate": 2.287425149700599e-05, "loss": 0.2404, "loss_nan_ranks": 0, "loss_rank_avg": 0.211669921875, "step": 765 }, { "epoch": 0.40356394129979034, "grad_norm": 4.390848159790039, "learning_rate": 2.3023952095808385e-05, "loss": 0.2599, "loss_nan_ranks": 0, "loss_rank_avg": 0.284297913312912, "step": 770 }, { "epoch": 0.4061844863731656, "grad_norm": 2.03829288482666, "learning_rate": 2.317365269461078e-05, "loss": 0.2554, "loss_nan_ranks": 0, "loss_rank_avg": 0.26315489411354065, "step": 775 }, { "epoch": 0.4088050314465409, "grad_norm": 1.0062929391860962, "learning_rate": 2.3323353293413174e-05, "loss": 0.2458, "loss_nan_ranks": 0, "loss_rank_avg": 0.224853515625, "step": 780 }, { "epoch": 0.4114255765199161, "grad_norm": 1.4772664308547974, "learning_rate": 2.347305389221557e-05, "loss": 0.2523, "loss_nan_ranks": 0, "loss_rank_avg": 0.23134742677211761, "step": 785 }, { "epoch": 0.4140461215932914, "grad_norm": 1.4864003658294678, "learning_rate": 2.3622754491017967e-05, "loss": 0.2502, "loss_nan_ranks": 0, "loss_rank_avg": 0.210205078125, "step": 790 }, { "epoch": 0.4166666666666667, "grad_norm": 0.9668591022491455, "learning_rate": 2.377245508982036e-05, "loss": 0.264, "loss_nan_ranks": 0, "loss_rank_avg": 0.2741929590702057, "step": 795 }, { "epoch": 0.4192872117400419, "grad_norm": 0.9752196669578552, "learning_rate": 2.3922155688622756e-05, "loss": 0.2687, "loss_nan_ranks": 0, "loss_rank_avg": 0.2708283066749573, "step": 800 }, { "epoch": 0.4219077568134172, "grad_norm": 1.0176275968551636, "learning_rate": 2.4071856287425152e-05, "loss": 0.2484, "loss_nan_ranks": 0, "loss_rank_avg": 0.253340482711792, "step": 805 }, { "epoch": 0.42452830188679247, "grad_norm": 1.0713645219802856, "learning_rate": 2.4221556886227545e-05, "loss": 0.2452, "loss_nan_ranks": 0, "loss_rank_avg": 0.2933781147003174, "step": 810 }, { "epoch": 0.4271488469601677, "grad_norm": 0.956200122833252, "learning_rate": 2.437125748502994e-05, "loss": 0.2385, "loss_nan_ranks": 0, "loss_rank_avg": 0.26098963618278503, "step": 815 }, { "epoch": 0.429769392033543, "grad_norm": 0.943717896938324, "learning_rate": 2.4520958083832338e-05, "loss": 0.2529, "loss_nan_ranks": 0, "loss_rank_avg": 0.24085500836372375, "step": 820 }, { "epoch": 0.43238993710691825, "grad_norm": 1.687644362449646, "learning_rate": 2.467065868263473e-05, "loss": 0.2317, "loss_nan_ranks": 0, "loss_rank_avg": 0.22789129614830017, "step": 825 }, { "epoch": 0.4350104821802935, "grad_norm": 0.9256353378295898, "learning_rate": 2.482035928143713e-05, "loss": 0.2504, "loss_nan_ranks": 0, "loss_rank_avg": 0.2247050702571869, "step": 830 }, { "epoch": 0.43763102725366876, "grad_norm": 0.8353980779647827, "learning_rate": 2.4970059880239523e-05, "loss": 0.2453, "loss_nan_ranks": 0, "loss_rank_avg": 0.2708541750907898, "step": 835 }, { "epoch": 0.44025157232704404, "grad_norm": 0.9708998799324036, "learning_rate": 2.5119760479041916e-05, "loss": 0.2494, "loss_nan_ranks": 0, "loss_rank_avg": 0.217529296875, "step": 840 }, { "epoch": 0.44287211740041926, "grad_norm": 1.3114147186279297, "learning_rate": 2.5269461077844316e-05, "loss": 0.2517, "loss_nan_ranks": 0, "loss_rank_avg": 0.224853515625, "step": 845 }, { "epoch": 0.44549266247379454, "grad_norm": 1.0196064710617065, "learning_rate": 2.541916167664671e-05, "loss": 0.2538, "loss_nan_ranks": 0, "loss_rank_avg": 0.25506791472435, "step": 850 }, { "epoch": 0.4481132075471698, "grad_norm": 1.0026663541793823, "learning_rate": 2.55688622754491e-05, "loss": 0.2364, "loss_nan_ranks": 0, "loss_rank_avg": 0.22676116228103638, "step": 855 }, { "epoch": 0.45073375262054505, "grad_norm": 1.156481385231018, "learning_rate": 2.57185628742515e-05, "loss": 0.2621, "loss_nan_ranks": 0, "loss_rank_avg": 0.27671852707862854, "step": 860 }, { "epoch": 0.4533542976939203, "grad_norm": 0.8996175527572632, "learning_rate": 2.5868263473053894e-05, "loss": 0.2538, "loss_nan_ranks": 0, "loss_rank_avg": 0.24104604125022888, "step": 865 }, { "epoch": 0.4559748427672956, "grad_norm": 0.9830065369606018, "learning_rate": 2.6017964071856287e-05, "loss": 0.2569, "loss_nan_ranks": 0, "loss_rank_avg": 0.25553032755851746, "step": 870 }, { "epoch": 0.4585953878406709, "grad_norm": 0.9731114506721497, "learning_rate": 2.6167664670658687e-05, "loss": 0.2313, "loss_nan_ranks": 0, "loss_rank_avg": 0.23193359375, "step": 875 }, { "epoch": 0.4612159329140461, "grad_norm": 1.0539577007293701, "learning_rate": 2.631736526946108e-05, "loss": 0.2376, "loss_nan_ranks": 0, "loss_rank_avg": 0.22667427361011505, "step": 880 }, { "epoch": 0.4638364779874214, "grad_norm": 0.9352318644523621, "learning_rate": 2.6467065868263476e-05, "loss": 0.2485, "loss_nan_ranks": 0, "loss_rank_avg": 0.2803516983985901, "step": 885 }, { "epoch": 0.46645702306079667, "grad_norm": 0.9622676968574524, "learning_rate": 2.6616766467065872e-05, "loss": 0.2235, "loss_nan_ranks": 0, "loss_rank_avg": 0.205322265625, "step": 890 }, { "epoch": 0.4690775681341719, "grad_norm": 0.9191893935203552, "learning_rate": 2.6766467065868265e-05, "loss": 0.263, "loss_nan_ranks": 0, "loss_rank_avg": 0.28229767084121704, "step": 895 }, { "epoch": 0.4716981132075472, "grad_norm": 0.8149584531784058, "learning_rate": 2.691616766467066e-05, "loss": 0.2596, "loss_nan_ranks": 0, "loss_rank_avg": 0.2862365245819092, "step": 900 }, { "epoch": 0.47431865828092246, "grad_norm": 0.9685344696044922, "learning_rate": 2.7065868263473058e-05, "loss": 0.2554, "loss_nan_ranks": 0, "loss_rank_avg": 0.22389689087867737, "step": 905 }, { "epoch": 0.4769392033542977, "grad_norm": 0.9538320302963257, "learning_rate": 2.721556886227545e-05, "loss": 0.2529, "loss_nan_ranks": 0, "loss_rank_avg": 0.23646777868270874, "step": 910 }, { "epoch": 0.47955974842767296, "grad_norm": 1.021291971206665, "learning_rate": 2.7365269461077847e-05, "loss": 0.2358, "loss_nan_ranks": 0, "loss_rank_avg": 0.223388671875, "step": 915 }, { "epoch": 0.48218029350104824, "grad_norm": 0.7586928009986877, "learning_rate": 2.751497005988024e-05, "loss": 0.2377, "loss_nan_ranks": 0, "loss_rank_avg": 0.23621279001235962, "step": 920 }, { "epoch": 0.48480083857442346, "grad_norm": 0.8782613277435303, "learning_rate": 2.7664670658682636e-05, "loss": 0.2339, "loss_nan_ranks": 0, "loss_rank_avg": 0.25007951259613037, "step": 925 }, { "epoch": 0.48742138364779874, "grad_norm": 0.8057980537414551, "learning_rate": 2.7814371257485033e-05, "loss": 0.2394, "loss_nan_ranks": 0, "loss_rank_avg": 0.23260241746902466, "step": 930 }, { "epoch": 0.490041928721174, "grad_norm": 0.9458929300308228, "learning_rate": 2.7964071856287425e-05, "loss": 0.2524, "loss_nan_ranks": 0, "loss_rank_avg": 0.30464041233062744, "step": 935 }, { "epoch": 0.49266247379454925, "grad_norm": 1.1031662225723267, "learning_rate": 2.8113772455089822e-05, "loss": 0.2508, "loss_nan_ranks": 0, "loss_rank_avg": 0.21631650626659393, "step": 940 }, { "epoch": 0.49528301886792453, "grad_norm": 1.1147128343582153, "learning_rate": 2.8263473053892218e-05, "loss": 0.2207, "loss_nan_ranks": 0, "loss_rank_avg": 0.1950717717409134, "step": 945 }, { "epoch": 0.4979035639412998, "grad_norm": 0.8075921535491943, "learning_rate": 2.841317365269461e-05, "loss": 0.2521, "loss_nan_ranks": 0, "loss_rank_avg": 0.2745627760887146, "step": 950 }, { "epoch": 0.500524109014675, "grad_norm": 0.7695682644844055, "learning_rate": 2.856287425149701e-05, "loss": 0.2421, "loss_nan_ranks": 0, "loss_rank_avg": 0.2553519606590271, "step": 955 }, { "epoch": 0.5031446540880503, "grad_norm": 0.9481866359710693, "learning_rate": 2.8712574850299403e-05, "loss": 0.2309, "loss_nan_ranks": 0, "loss_rank_avg": 0.2060546875, "step": 960 }, { "epoch": 0.5057651991614256, "grad_norm": 3.3165810108184814, "learning_rate": 2.8862275449101796e-05, "loss": 0.2392, "loss_nan_ranks": 0, "loss_rank_avg": 0.25720077753067017, "step": 965 }, { "epoch": 0.5083857442348009, "grad_norm": 0.8951301574707031, "learning_rate": 2.9011976047904196e-05, "loss": 0.2587, "loss_nan_ranks": 0, "loss_rank_avg": 0.29488053917884827, "step": 970 }, { "epoch": 0.5110062893081762, "grad_norm": 0.8743593096733093, "learning_rate": 2.916167664670659e-05, "loss": 0.2309, "loss_nan_ranks": 0, "loss_rank_avg": 0.212537944316864, "step": 975 }, { "epoch": 0.5136268343815513, "grad_norm": 0.9430364370346069, "learning_rate": 2.9311377245508982e-05, "loss": 0.2448, "loss_nan_ranks": 0, "loss_rank_avg": 0.24036072194576263, "step": 980 }, { "epoch": 0.5162473794549266, "grad_norm": 1.2850607633590698, "learning_rate": 2.946107784431138e-05, "loss": 0.2396, "loss_nan_ranks": 0, "loss_rank_avg": 0.22472365200519562, "step": 985 }, { "epoch": 0.5188679245283019, "grad_norm": 0.9026057124137878, "learning_rate": 2.9610778443113774e-05, "loss": 0.2532, "loss_nan_ranks": 0, "loss_rank_avg": 0.2544049918651581, "step": 990 }, { "epoch": 0.5214884696016772, "grad_norm": 1.04141366481781, "learning_rate": 2.9760479041916167e-05, "loss": 0.2214, "loss_nan_ranks": 0, "loss_rank_avg": 0.17437362670898438, "step": 995 }, { "epoch": 0.5241090146750524, "grad_norm": 0.9281846880912781, "learning_rate": 2.9910179640718567e-05, "loss": 0.2439, "loss_nan_ranks": 0, "loss_rank_avg": 0.23749575018882751, "step": 1000 }, { "epoch": 0.5267295597484277, "grad_norm": 1.122997760772705, "learning_rate": 3.005988023952096e-05, "loss": 0.24, "loss_nan_ranks": 0, "loss_rank_avg": 0.254638671875, "step": 1005 }, { "epoch": 0.5293501048218029, "grad_norm": 0.7896718382835388, "learning_rate": 3.020958083832336e-05, "loss": 0.2474, "loss_nan_ranks": 0, "loss_rank_avg": 0.24245308339595795, "step": 1010 }, { "epoch": 0.5319706498951782, "grad_norm": 1.19742751121521, "learning_rate": 3.0359281437125753e-05, "loss": 0.2397, "loss_nan_ranks": 0, "loss_rank_avg": 0.21044921875, "step": 1015 }, { "epoch": 0.5345911949685535, "grad_norm": 0.7850223183631897, "learning_rate": 3.0508982035928145e-05, "loss": 0.2639, "loss_nan_ranks": 0, "loss_rank_avg": 0.29461705684661865, "step": 1020 }, { "epoch": 0.5372117400419287, "grad_norm": 0.9502226710319519, "learning_rate": 3.0658682634730545e-05, "loss": 0.2455, "loss_nan_ranks": 0, "loss_rank_avg": 0.28391456604003906, "step": 1025 }, { "epoch": 0.539832285115304, "grad_norm": 0.8148714900016785, "learning_rate": 3.0808383233532935e-05, "loss": 0.2493, "loss_nan_ranks": 0, "loss_rank_avg": 0.28964686393737793, "step": 1030 }, { "epoch": 0.5424528301886793, "grad_norm": 0.768586277961731, "learning_rate": 3.095808383233533e-05, "loss": 0.2438, "loss_nan_ranks": 0, "loss_rank_avg": 0.23348668217658997, "step": 1035 }, { "epoch": 0.5450733752620545, "grad_norm": 1.006455898284912, "learning_rate": 3.110778443113773e-05, "loss": 0.2286, "loss_nan_ranks": 0, "loss_rank_avg": 0.21520912647247314, "step": 1040 }, { "epoch": 0.5476939203354297, "grad_norm": 0.782268226146698, "learning_rate": 3.1257485029940124e-05, "loss": 0.2594, "loss_nan_ranks": 0, "loss_rank_avg": 0.27343398332595825, "step": 1045 }, { "epoch": 0.550314465408805, "grad_norm": 0.7622452974319458, "learning_rate": 3.140718562874251e-05, "loss": 0.2368, "loss_nan_ranks": 0, "loss_rank_avg": 0.2747539281845093, "step": 1050 }, { "epoch": 0.5529350104821803, "grad_norm": 1.9257678985595703, "learning_rate": 3.1556886227544916e-05, "loss": 0.2404, "loss_nan_ranks": 0, "loss_rank_avg": 0.24580204486846924, "step": 1055 }, { "epoch": 0.5555555555555556, "grad_norm": 1.0492233037948608, "learning_rate": 3.1706586826347306e-05, "loss": 0.2609, "loss_nan_ranks": 0, "loss_rank_avg": 0.2686328887939453, "step": 1060 }, { "epoch": 0.5581761006289309, "grad_norm": 0.8673623204231262, "learning_rate": 3.18562874251497e-05, "loss": 0.2446, "loss_nan_ranks": 0, "loss_rank_avg": 0.23184172809123993, "step": 1065 }, { "epoch": 0.560796645702306, "grad_norm": 0.86591637134552, "learning_rate": 3.20059880239521e-05, "loss": 0.2449, "loss_nan_ranks": 0, "loss_rank_avg": 0.2521081864833832, "step": 1070 }, { "epoch": 0.5634171907756813, "grad_norm": 1.0577750205993652, "learning_rate": 3.2155688622754494e-05, "loss": 0.2352, "loss_nan_ranks": 0, "loss_rank_avg": 0.22219401597976685, "step": 1075 }, { "epoch": 0.5660377358490566, "grad_norm": 0.9919980764389038, "learning_rate": 3.230538922155689e-05, "loss": 0.2428, "loss_nan_ranks": 0, "loss_rank_avg": 0.22773528099060059, "step": 1080 }, { "epoch": 0.5686582809224319, "grad_norm": 0.9499963521957397, "learning_rate": 3.245508982035929e-05, "loss": 0.2269, "loss_nan_ranks": 0, "loss_rank_avg": 0.208740234375, "step": 1085 }, { "epoch": 0.5712788259958071, "grad_norm": 1.0068999528884888, "learning_rate": 3.2604790419161677e-05, "loss": 0.249, "loss_nan_ranks": 0, "loss_rank_avg": 0.25735247135162354, "step": 1090 }, { "epoch": 0.5738993710691824, "grad_norm": 0.8172402381896973, "learning_rate": 3.275449101796407e-05, "loss": 0.2448, "loss_nan_ranks": 0, "loss_rank_avg": 0.2182607352733612, "step": 1095 }, { "epoch": 0.5765199161425576, "grad_norm": 0.7479739785194397, "learning_rate": 3.290419161676647e-05, "loss": 0.2394, "loss_nan_ranks": 0, "loss_rank_avg": 0.22888004779815674, "step": 1100 }, { "epoch": 0.5791404612159329, "grad_norm": 0.775371789932251, "learning_rate": 3.3053892215568865e-05, "loss": 0.2586, "loss_nan_ranks": 0, "loss_rank_avg": 0.3027857542037964, "step": 1105 }, { "epoch": 0.5817610062893082, "grad_norm": 1.130009651184082, "learning_rate": 3.320359281437126e-05, "loss": 0.2359, "loss_nan_ranks": 0, "loss_rank_avg": 0.207763671875, "step": 1110 }, { "epoch": 0.5843815513626834, "grad_norm": 0.8480820655822754, "learning_rate": 3.335329341317366e-05, "loss": 0.2289, "loss_nan_ranks": 0, "loss_rank_avg": 0.211669921875, "step": 1115 }, { "epoch": 0.5870020964360587, "grad_norm": 1.0589594841003418, "learning_rate": 3.350299401197605e-05, "loss": 0.2468, "loss_nan_ranks": 0, "loss_rank_avg": 0.2513768672943115, "step": 1120 }, { "epoch": 0.589622641509434, "grad_norm": 57.0751953125, "learning_rate": 3.3652694610778444e-05, "loss": 0.3119, "loss_nan_ranks": 0, "loss_rank_avg": 0.6061429381370544, "step": 1125 }, { "epoch": 0.5922431865828093, "grad_norm": 0.8701210618019104, "learning_rate": 3.380239520958084e-05, "loss": 0.2347, "loss_nan_ranks": 0, "loss_rank_avg": 0.2277510166168213, "step": 1130 }, { "epoch": 0.5948637316561844, "grad_norm": 0.867240846157074, "learning_rate": 3.3952095808383236e-05, "loss": 0.2345, "loss_nan_ranks": 0, "loss_rank_avg": 0.22389352321624756, "step": 1135 }, { "epoch": 0.5974842767295597, "grad_norm": 1.079352855682373, "learning_rate": 3.410179640718563e-05, "loss": 0.2404, "loss_nan_ranks": 0, "loss_rank_avg": 0.211181640625, "step": 1140 }, { "epoch": 0.600104821802935, "grad_norm": 0.8751652836799622, "learning_rate": 3.425149700598803e-05, "loss": 0.2282, "loss_nan_ranks": 0, "loss_rank_avg": 0.24226129055023193, "step": 1145 }, { "epoch": 0.6027253668763103, "grad_norm": 0.9897102117538452, "learning_rate": 3.4401197604790425e-05, "loss": 0.2278, "loss_nan_ranks": 0, "loss_rank_avg": 0.1945013552904129, "step": 1150 }, { "epoch": 0.6053459119496856, "grad_norm": 1.09795081615448, "learning_rate": 3.4550898203592815e-05, "loss": 0.2389, "loss_nan_ranks": 0, "loss_rank_avg": 0.2258596420288086, "step": 1155 }, { "epoch": 0.6079664570230608, "grad_norm": 0.9403954148292542, "learning_rate": 3.470059880239521e-05, "loss": 0.2329, "loss_nan_ranks": 0, "loss_rank_avg": 0.27871084213256836, "step": 1160 }, { "epoch": 0.610587002096436, "grad_norm": 1.3040416240692139, "learning_rate": 3.485029940119761e-05, "loss": 0.2263, "loss_nan_ranks": 0, "loss_rank_avg": 0.20994171500205994, "step": 1165 }, { "epoch": 0.6132075471698113, "grad_norm": 0.9638862013816833, "learning_rate": 3.5000000000000004e-05, "loss": 0.2396, "loss_nan_ranks": 0, "loss_rank_avg": 0.20459549129009247, "step": 1170 }, { "epoch": 0.6158280922431866, "grad_norm": 0.9757469296455383, "learning_rate": 3.514970059880239e-05, "loss": 0.2414, "loss_nan_ranks": 0, "loss_rank_avg": 0.260408878326416, "step": 1175 }, { "epoch": 0.6184486373165619, "grad_norm": 0.8674975633621216, "learning_rate": 3.5299401197604796e-05, "loss": 0.2617, "loss_nan_ranks": 0, "loss_rank_avg": 0.2864234447479248, "step": 1180 }, { "epoch": 0.6210691823899371, "grad_norm": 0.890983521938324, "learning_rate": 3.5449101796407186e-05, "loss": 0.2542, "loss_nan_ranks": 0, "loss_rank_avg": 0.259521484375, "step": 1185 }, { "epoch": 0.6236897274633124, "grad_norm": 0.9182384610176086, "learning_rate": 3.559880239520958e-05, "loss": 0.2487, "loss_nan_ranks": 0, "loss_rank_avg": 0.24978932738304138, "step": 1190 }, { "epoch": 0.6263102725366876, "grad_norm": 0.9169577956199646, "learning_rate": 3.574850299401198e-05, "loss": 0.2319, "loss_nan_ranks": 0, "loss_rank_avg": 0.29184383153915405, "step": 1195 }, { "epoch": 0.6289308176100629, "grad_norm": 2.9035658836364746, "learning_rate": 3.5898203592814375e-05, "loss": 0.2276, "loss_nan_ranks": 0, "loss_rank_avg": 0.23210959136486053, "step": 1200 }, { "epoch": 0.6315513626834381, "grad_norm": 0.7462544441223145, "learning_rate": 3.604790419161677e-05, "loss": 0.258, "loss_nan_ranks": 0, "loss_rank_avg": 0.24800057709217072, "step": 1205 }, { "epoch": 0.6341719077568134, "grad_norm": 0.9018626809120178, "learning_rate": 3.619760479041917e-05, "loss": 0.2266, "loss_nan_ranks": 0, "loss_rank_avg": 0.25354671478271484, "step": 1210 }, { "epoch": 0.6367924528301887, "grad_norm": 0.7574321031570435, "learning_rate": 3.634730538922156e-05, "loss": 0.2402, "loss_nan_ranks": 0, "loss_rank_avg": 0.2568485736846924, "step": 1215 }, { "epoch": 0.639412997903564, "grad_norm": 0.8622102737426758, "learning_rate": 3.649700598802396e-05, "loss": 0.2502, "loss_nan_ranks": 0, "loss_rank_avg": 0.2628772258758545, "step": 1220 }, { "epoch": 0.6420335429769392, "grad_norm": 0.910557746887207, "learning_rate": 3.664670658682635e-05, "loss": 0.2517, "loss_nan_ranks": 0, "loss_rank_avg": 0.2687186300754547, "step": 1225 }, { "epoch": 0.6446540880503144, "grad_norm": 0.9514092803001404, "learning_rate": 3.6796407185628746e-05, "loss": 0.2377, "loss_nan_ranks": 0, "loss_rank_avg": 0.23730027675628662, "step": 1230 }, { "epoch": 0.6472746331236897, "grad_norm": 0.9319348335266113, "learning_rate": 3.694610778443114e-05, "loss": 0.238, "loss_nan_ranks": 0, "loss_rank_avg": 0.2547975778579712, "step": 1235 }, { "epoch": 0.649895178197065, "grad_norm": 1.4198458194732666, "learning_rate": 3.709580838323354e-05, "loss": 0.2091, "loss_nan_ranks": 0, "loss_rank_avg": 0.22032234072685242, "step": 1240 }, { "epoch": 0.6525157232704403, "grad_norm": 1.0361565351486206, "learning_rate": 3.724550898203593e-05, "loss": 0.2435, "loss_nan_ranks": 0, "loss_rank_avg": 0.19445861876010895, "step": 1245 }, { "epoch": 0.6551362683438156, "grad_norm": 0.8487932085990906, "learning_rate": 3.739520958083833e-05, "loss": 0.2424, "loss_nan_ranks": 0, "loss_rank_avg": 0.24062740802764893, "step": 1250 }, { "epoch": 0.6577568134171907, "grad_norm": 0.8280177116394043, "learning_rate": 3.754491017964072e-05, "loss": 0.2507, "loss_nan_ranks": 0, "loss_rank_avg": 0.2391597330570221, "step": 1255 }, { "epoch": 0.660377358490566, "grad_norm": 0.7015854120254517, "learning_rate": 3.769461077844312e-05, "loss": 0.2299, "loss_nan_ranks": 0, "loss_rank_avg": 0.23055067658424377, "step": 1260 }, { "epoch": 0.6629979035639413, "grad_norm": 1.7529536485671997, "learning_rate": 3.784431137724551e-05, "loss": 0.2136, "loss_nan_ranks": 0, "loss_rank_avg": 0.1573486328125, "step": 1265 }, { "epoch": 0.6656184486373166, "grad_norm": 1.1261061429977417, "learning_rate": 3.799401197604791e-05, "loss": 0.2385, "loss_nan_ranks": 0, "loss_rank_avg": 0.26623547077178955, "step": 1270 }, { "epoch": 0.6682389937106918, "grad_norm": 0.6807898283004761, "learning_rate": 3.8143712574850306e-05, "loss": 0.2382, "loss_nan_ranks": 0, "loss_rank_avg": 0.2307087481021881, "step": 1275 }, { "epoch": 0.6708595387840671, "grad_norm": 0.6805742979049683, "learning_rate": 3.8293413173652695e-05, "loss": 0.2352, "loss_nan_ranks": 0, "loss_rank_avg": 0.2753327488899231, "step": 1280 }, { "epoch": 0.6734800838574424, "grad_norm": 0.8867018818855286, "learning_rate": 3.844311377245509e-05, "loss": 0.2382, "loss_nan_ranks": 0, "loss_rank_avg": 0.2446499615907669, "step": 1285 }, { "epoch": 0.6761006289308176, "grad_norm": 1.0850870609283447, "learning_rate": 3.859281437125749e-05, "loss": 0.2176, "loss_nan_ranks": 0, "loss_rank_avg": 0.203857421875, "step": 1290 }, { "epoch": 0.6787211740041929, "grad_norm": 0.7444595694541931, "learning_rate": 3.8742514970059884e-05, "loss": 0.2376, "loss_nan_ranks": 0, "loss_rank_avg": 0.2482675313949585, "step": 1295 }, { "epoch": 0.6813417190775681, "grad_norm": 0.9599826335906982, "learning_rate": 3.889221556886228e-05, "loss": 0.2217, "loss_nan_ranks": 0, "loss_rank_avg": 0.1557583212852478, "step": 1300 }, { "epoch": 0.6839622641509434, "grad_norm": 0.9627584218978882, "learning_rate": 3.9041916167664676e-05, "loss": 0.2399, "loss_nan_ranks": 0, "loss_rank_avg": 0.198858380317688, "step": 1305 }, { "epoch": 0.6865828092243187, "grad_norm": 0.8851011991500854, "learning_rate": 3.9191616766467066e-05, "loss": 0.242, "loss_nan_ranks": 0, "loss_rank_avg": 0.2949499189853668, "step": 1310 }, { "epoch": 0.689203354297694, "grad_norm": 0.8310570120811462, "learning_rate": 3.934131736526946e-05, "loss": 0.22, "loss_nan_ranks": 0, "loss_rank_avg": 0.24201838672161102, "step": 1315 }, { "epoch": 0.6918238993710691, "grad_norm": 1.0353339910507202, "learning_rate": 3.949101796407186e-05, "loss": 0.2231, "loss_nan_ranks": 0, "loss_rank_avg": 0.2144128680229187, "step": 1320 }, { "epoch": 0.6944444444444444, "grad_norm": 0.6979736685752869, "learning_rate": 3.9640718562874255e-05, "loss": 0.2445, "loss_nan_ranks": 0, "loss_rank_avg": 0.27705004811286926, "step": 1325 }, { "epoch": 0.6970649895178197, "grad_norm": 0.9733805060386658, "learning_rate": 3.9790419161676644e-05, "loss": 0.2357, "loss_nan_ranks": 0, "loss_rank_avg": 0.20027337968349457, "step": 1330 }, { "epoch": 0.699685534591195, "grad_norm": 1.0096243619918823, "learning_rate": 3.994011976047905e-05, "loss": 0.2304, "loss_nan_ranks": 0, "loss_rank_avg": 0.26803430914878845, "step": 1335 }, { "epoch": 0.7023060796645703, "grad_norm": 0.980590283870697, "learning_rate": 3.999999385200795e-05, "loss": 0.2404, "loss_nan_ranks": 0, "loss_rank_avg": 0.202880859375, "step": 1340 }, { "epoch": 0.7049266247379455, "grad_norm": 0.80577552318573, "learning_rate": 3.999995628095911e-05, "loss": 0.2315, "loss_nan_ranks": 0, "loss_rank_avg": 0.2268478274345398, "step": 1345 }, { "epoch": 0.7075471698113207, "grad_norm": 1.0306898355484009, "learning_rate": 3.999988455447666e-05, "loss": 0.2237, "loss_nan_ranks": 0, "loss_rank_avg": 0.22385285794734955, "step": 1350 }, { "epoch": 0.710167714884696, "grad_norm": 0.8197321891784668, "learning_rate": 3.9999778672683076e-05, "loss": 0.2352, "loss_nan_ranks": 0, "loss_rank_avg": 0.24937386810779572, "step": 1355 }, { "epoch": 0.7127882599580713, "grad_norm": 0.8815612196922302, "learning_rate": 3.99996386357592e-05, "loss": 0.2275, "loss_nan_ranks": 0, "loss_rank_avg": 0.26051896810531616, "step": 1360 }, { "epoch": 0.7154088050314465, "grad_norm": 0.8673317432403564, "learning_rate": 3.999946444394417e-05, "loss": 0.2468, "loss_nan_ranks": 0, "loss_rank_avg": 0.2650872766971588, "step": 1365 }, { "epoch": 0.7180293501048218, "grad_norm": 0.8530757427215576, "learning_rate": 3.9999256097535466e-05, "loss": 0.2257, "loss_nan_ranks": 0, "loss_rank_avg": 0.2251889407634735, "step": 1370 }, { "epoch": 0.7206498951781971, "grad_norm": 0.7286533713340759, "learning_rate": 3.999901359688891e-05, "loss": 0.2184, "loss_nan_ranks": 0, "loss_rank_avg": 0.17423991858959198, "step": 1375 }, { "epoch": 0.7232704402515723, "grad_norm": 1.0834404230117798, "learning_rate": 3.999873694241863e-05, "loss": 0.2337, "loss_nan_ranks": 0, "loss_rank_avg": 0.2013612985610962, "step": 1380 }, { "epoch": 0.7258909853249476, "grad_norm": 0.9604234099388123, "learning_rate": 3.999842613459709e-05, "loss": 0.2242, "loss_nan_ranks": 0, "loss_rank_avg": 0.22143641114234924, "step": 1385 }, { "epoch": 0.7285115303983228, "grad_norm": 1.0743560791015625, "learning_rate": 3.9998081173955076e-05, "loss": 0.2271, "loss_nan_ranks": 0, "loss_rank_avg": 0.1707763671875, "step": 1390 }, { "epoch": 0.7311320754716981, "grad_norm": 0.7798816561698914, "learning_rate": 3.999770206108172e-05, "loss": 0.2222, "loss_nan_ranks": 0, "loss_rank_avg": 0.21589408814907074, "step": 1395 }, { "epoch": 0.7337526205450734, "grad_norm": 0.8090185523033142, "learning_rate": 3.999728879662443e-05, "loss": 0.2375, "loss_nan_ranks": 0, "loss_rank_avg": 0.2567808926105499, "step": 1400 }, { "epoch": 0.7363731656184487, "grad_norm": 0.828900158405304, "learning_rate": 3.9996841381289e-05, "loss": 0.2277, "loss_nan_ranks": 0, "loss_rank_avg": 0.27614784240722656, "step": 1405 }, { "epoch": 0.7389937106918238, "grad_norm": 0.9465203285217285, "learning_rate": 3.99963598158395e-05, "loss": 0.2562, "loss_nan_ranks": 0, "loss_rank_avg": 0.23473942279815674, "step": 1410 }, { "epoch": 0.7416142557651991, "grad_norm": 0.7388080358505249, "learning_rate": 3.999584410109834e-05, "loss": 0.2467, "loss_nan_ranks": 0, "loss_rank_avg": 0.23388850688934326, "step": 1415 }, { "epoch": 0.7442348008385744, "grad_norm": 0.9057663679122925, "learning_rate": 3.999529423794624e-05, "loss": 0.2112, "loss_nan_ranks": 0, "loss_rank_avg": 0.23402154445648193, "step": 1420 }, { "epoch": 0.7468553459119497, "grad_norm": 0.8521957397460938, "learning_rate": 3.9994710227322256e-05, "loss": 0.2233, "loss_nan_ranks": 0, "loss_rank_avg": 0.219970703125, "step": 1425 }, { "epoch": 0.749475890985325, "grad_norm": 0.7135666012763977, "learning_rate": 3.999409207022373e-05, "loss": 0.2209, "loss_nan_ranks": 0, "loss_rank_avg": 0.16821300983428955, "step": 1430 }, { "epoch": 0.7520964360587002, "grad_norm": 0.7875863909721375, "learning_rate": 3.999343976770635e-05, "loss": 0.2493, "loss_nan_ranks": 0, "loss_rank_avg": 0.26239144802093506, "step": 1435 }, { "epoch": 0.7547169811320755, "grad_norm": 0.8010939359664917, "learning_rate": 3.9992753320884086e-05, "loss": 0.2293, "loss_nan_ranks": 0, "loss_rank_avg": 0.18904992938041687, "step": 1440 }, { "epoch": 0.7573375262054507, "grad_norm": 0.7313894629478455, "learning_rate": 3.9992032730929254e-05, "loss": 0.2268, "loss_nan_ranks": 0, "loss_rank_avg": 0.2471504658460617, "step": 1445 }, { "epoch": 0.759958071278826, "grad_norm": 0.9344741106033325, "learning_rate": 3.9991277999072436e-05, "loss": 0.2198, "loss_nan_ranks": 0, "loss_rank_avg": 0.21785646677017212, "step": 1450 }, { "epoch": 0.7625786163522013, "grad_norm": 1.380718469619751, "learning_rate": 3.9990489126602565e-05, "loss": 0.2266, "loss_nan_ranks": 0, "loss_rank_avg": 0.26836979389190674, "step": 1455 }, { "epoch": 0.7651991614255765, "grad_norm": 0.7142491340637207, "learning_rate": 3.998966611486686e-05, "loss": 0.2525, "loss_nan_ranks": 0, "loss_rank_avg": 0.27713894844055176, "step": 1460 }, { "epoch": 0.7678197064989518, "grad_norm": 0.8345506191253662, "learning_rate": 3.998880896527082e-05, "loss": 0.2471, "loss_nan_ranks": 0, "loss_rank_avg": 0.23274078965187073, "step": 1465 }, { "epoch": 0.7704402515723271, "grad_norm": 0.8071177005767822, "learning_rate": 3.998791767927828e-05, "loss": 0.2227, "loss_nan_ranks": 0, "loss_rank_avg": 0.2065439373254776, "step": 1470 }, { "epoch": 0.7730607966457023, "grad_norm": 0.9820889830589294, "learning_rate": 3.9986992258411355e-05, "loss": 0.2318, "loss_nan_ranks": 0, "loss_rank_avg": 0.2510470747947693, "step": 1475 }, { "epoch": 0.7756813417190775, "grad_norm": 0.664943277835846, "learning_rate": 3.998603270425045e-05, "loss": 0.2298, "loss_nan_ranks": 0, "loss_rank_avg": 0.2430516481399536, "step": 1480 }, { "epoch": 0.7783018867924528, "grad_norm": 0.6739146113395691, "learning_rate": 3.998503901843427e-05, "loss": 0.2431, "loss_nan_ranks": 0, "loss_rank_avg": 0.27743998169898987, "step": 1485 }, { "epoch": 0.7809224318658281, "grad_norm": 0.7599772810935974, "learning_rate": 3.998401120265981e-05, "loss": 0.219, "loss_nan_ranks": 0, "loss_rank_avg": 0.2510407567024231, "step": 1490 }, { "epoch": 0.7835429769392034, "grad_norm": 0.8791052103042603, "learning_rate": 3.9982949258682345e-05, "loss": 0.2426, "loss_nan_ranks": 0, "loss_rank_avg": 0.23215271532535553, "step": 1495 }, { "epoch": 0.7861635220125787, "grad_norm": 0.9418508410453796, "learning_rate": 3.9981853188315444e-05, "loss": 0.2438, "loss_nan_ranks": 0, "loss_rank_avg": 0.224778950214386, "step": 1500 }, { "epoch": 0.7887840670859538, "grad_norm": 1.0868874788284302, "learning_rate": 3.998072299343093e-05, "loss": 0.2197, "loss_nan_ranks": 0, "loss_rank_avg": 0.16823123395442963, "step": 1505 }, { "epoch": 0.7914046121593291, "grad_norm": 0.6775776743888855, "learning_rate": 3.997955867595895e-05, "loss": 0.2339, "loss_nan_ranks": 0, "loss_rank_avg": 0.21675719320774078, "step": 1510 }, { "epoch": 0.7940251572327044, "grad_norm": 0.9414287805557251, "learning_rate": 3.9978360237887876e-05, "loss": 0.2325, "loss_nan_ranks": 0, "loss_rank_avg": 0.25559812784194946, "step": 1515 }, { "epoch": 0.7966457023060797, "grad_norm": 0.964307963848114, "learning_rate": 3.997712768126438e-05, "loss": 0.2232, "loss_nan_ranks": 0, "loss_rank_avg": 0.23508593440055847, "step": 1520 }, { "epoch": 0.799266247379455, "grad_norm": 0.6535694003105164, "learning_rate": 3.997586100819338e-05, "loss": 0.2457, "loss_nan_ranks": 0, "loss_rank_avg": 0.2811439335346222, "step": 1525 }, { "epoch": 0.8018867924528302, "grad_norm": 0.822663426399231, "learning_rate": 3.99745602208381e-05, "loss": 0.2246, "loss_nan_ranks": 0, "loss_rank_avg": 0.22198763489723206, "step": 1530 }, { "epoch": 0.8045073375262054, "grad_norm": 0.8911979794502258, "learning_rate": 3.997322532141995e-05, "loss": 0.2447, "loss_nan_ranks": 0, "loss_rank_avg": 0.27367645502090454, "step": 1535 }, { "epoch": 0.8071278825995807, "grad_norm": 1.597509503364563, "learning_rate": 3.9971856312218664e-05, "loss": 0.2121, "loss_nan_ranks": 0, "loss_rank_avg": 0.1748046875, "step": 1540 }, { "epoch": 0.809748427672956, "grad_norm": 0.6583138108253479, "learning_rate": 3.99704531955722e-05, "loss": 0.2247, "loss_nan_ranks": 0, "loss_rank_avg": 0.21396388113498688, "step": 1545 }, { "epoch": 0.8123689727463312, "grad_norm": 0.7443684339523315, "learning_rate": 3.9969015973876765e-05, "loss": 0.2338, "loss_nan_ranks": 0, "loss_rank_avg": 0.22403575479984283, "step": 1550 }, { "epoch": 0.8149895178197065, "grad_norm": 0.7412883639335632, "learning_rate": 3.996754464958681e-05, "loss": 0.2335, "loss_nan_ranks": 0, "loss_rank_avg": 0.24360555410385132, "step": 1555 }, { "epoch": 0.8176100628930818, "grad_norm": 0.635653018951416, "learning_rate": 3.9966039225215025e-05, "loss": 0.2313, "loss_nan_ranks": 0, "loss_rank_avg": 0.2305343598127365, "step": 1560 }, { "epoch": 0.820230607966457, "grad_norm": 0.6679280996322632, "learning_rate": 3.9964499703332334e-05, "loss": 0.2309, "loss_nan_ranks": 0, "loss_rank_avg": 0.20315735042095184, "step": 1565 }, { "epoch": 0.8228511530398323, "grad_norm": 0.7374495267868042, "learning_rate": 3.996292608656791e-05, "loss": 0.2125, "loss_nan_ranks": 0, "loss_rank_avg": 0.21739450097084045, "step": 1570 }, { "epoch": 0.8254716981132075, "grad_norm": 0.7598218321800232, "learning_rate": 3.996131837760912e-05, "loss": 0.2328, "loss_nan_ranks": 0, "loss_rank_avg": 0.186279296875, "step": 1575 }, { "epoch": 0.8280922431865828, "grad_norm": 0.6665475368499756, "learning_rate": 3.9959676579201574e-05, "loss": 0.2134, "loss_nan_ranks": 0, "loss_rank_avg": 0.21755671501159668, "step": 1580 }, { "epoch": 0.8307127882599581, "grad_norm": 0.7411794662475586, "learning_rate": 3.995800069414909e-05, "loss": 0.2175, "loss_nan_ranks": 0, "loss_rank_avg": 0.2294345200061798, "step": 1585 }, { "epoch": 0.8333333333333334, "grad_norm": 0.864759087562561, "learning_rate": 3.995629072531372e-05, "loss": 0.2273, "loss_nan_ranks": 0, "loss_rank_avg": 0.24587363004684448, "step": 1590 }, { "epoch": 0.8359538784067087, "grad_norm": 0.7389988303184509, "learning_rate": 3.995454667561569e-05, "loss": 0.2241, "loss_nan_ranks": 0, "loss_rank_avg": 0.2165318727493286, "step": 1595 }, { "epoch": 0.8385744234800838, "grad_norm": 0.7519676685333252, "learning_rate": 3.9952768548033455e-05, "loss": 0.2535, "loss_nan_ranks": 0, "loss_rank_avg": 0.26860612630844116, "step": 1600 }, { "epoch": 0.8411949685534591, "grad_norm": 0.8388245701789856, "learning_rate": 3.995095634560365e-05, "loss": 0.2347, "loss_nan_ranks": 0, "loss_rank_avg": 0.25020381808280945, "step": 1605 }, { "epoch": 0.8438155136268344, "grad_norm": 0.8016890287399292, "learning_rate": 3.994911007142112e-05, "loss": 0.2426, "loss_nan_ranks": 0, "loss_rank_avg": 0.23435023427009583, "step": 1610 }, { "epoch": 0.8464360587002097, "grad_norm": 1.0219448804855347, "learning_rate": 3.994722972863888e-05, "loss": 0.2278, "loss_nan_ranks": 0, "loss_rank_avg": 0.2231299877166748, "step": 1615 }, { "epoch": 0.8490566037735849, "grad_norm": 0.8658933639526367, "learning_rate": 3.9945315320468125e-05, "loss": 0.2143, "loss_nan_ranks": 0, "loss_rank_avg": 0.22396323084831238, "step": 1620 }, { "epoch": 0.8516771488469602, "grad_norm": 0.8496825098991394, "learning_rate": 3.994336685017825e-05, "loss": 0.2234, "loss_nan_ranks": 0, "loss_rank_avg": 0.18982970714569092, "step": 1625 }, { "epoch": 0.8542976939203354, "grad_norm": 0.9361600875854492, "learning_rate": 3.994138432109679e-05, "loss": 0.2219, "loss_nan_ranks": 0, "loss_rank_avg": 0.202392578125, "step": 1630 }, { "epoch": 0.8569182389937107, "grad_norm": 0.958289623260498, "learning_rate": 3.993936773660948e-05, "loss": 0.2324, "loss_nan_ranks": 0, "loss_rank_avg": 0.21435546875, "step": 1635 }, { "epoch": 0.859538784067086, "grad_norm": 0.7888805866241455, "learning_rate": 3.993731710016018e-05, "loss": 0.2294, "loss_nan_ranks": 0, "loss_rank_avg": 0.20953628420829773, "step": 1640 }, { "epoch": 0.8621593291404612, "grad_norm": 0.8121855854988098, "learning_rate": 3.993523241525091e-05, "loss": 0.2185, "loss_nan_ranks": 0, "loss_rank_avg": 0.201904296875, "step": 1645 }, { "epoch": 0.8647798742138365, "grad_norm": 0.8300842046737671, "learning_rate": 3.9933113685441844e-05, "loss": 0.2146, "loss_nan_ranks": 0, "loss_rank_avg": 0.22567909955978394, "step": 1650 }, { "epoch": 0.8674004192872118, "grad_norm": 0.8639819622039795, "learning_rate": 3.9930960914351316e-05, "loss": 0.2177, "loss_nan_ranks": 0, "loss_rank_avg": 0.22009161114692688, "step": 1655 }, { "epoch": 0.870020964360587, "grad_norm": 0.7249926924705505, "learning_rate": 3.992877410565576e-05, "loss": 0.2372, "loss_nan_ranks": 0, "loss_rank_avg": 0.23612543940544128, "step": 1660 }, { "epoch": 0.8726415094339622, "grad_norm": 0.7727657556533813, "learning_rate": 3.992655326308975e-05, "loss": 0.2077, "loss_nan_ranks": 0, "loss_rank_avg": 0.25405246019363403, "step": 1665 }, { "epoch": 0.8752620545073375, "grad_norm": 0.7392059564590454, "learning_rate": 3.9924298390446e-05, "loss": 0.2251, "loss_nan_ranks": 0, "loss_rank_avg": 0.2551038861274719, "step": 1670 }, { "epoch": 0.8778825995807128, "grad_norm": 0.7574449181556702, "learning_rate": 3.992200949157531e-05, "loss": 0.2532, "loss_nan_ranks": 0, "loss_rank_avg": 0.2989419996738434, "step": 1675 }, { "epoch": 0.8805031446540881, "grad_norm": 1.003849744796753, "learning_rate": 3.991968657038663e-05, "loss": 0.2327, "loss_nan_ranks": 0, "loss_rank_avg": 0.21192163228988647, "step": 1680 }, { "epoch": 0.8831236897274634, "grad_norm": 0.7698060274124146, "learning_rate": 3.9917329630846955e-05, "loss": 0.222, "loss_nan_ranks": 0, "loss_rank_avg": 0.23520153760910034, "step": 1685 }, { "epoch": 0.8857442348008385, "grad_norm": 0.8432544469833374, "learning_rate": 3.991493867698144e-05, "loss": 0.2253, "loss_nan_ranks": 0, "loss_rank_avg": 0.22624069452285767, "step": 1690 }, { "epoch": 0.8883647798742138, "grad_norm": 1.2525861263275146, "learning_rate": 3.991251371287327e-05, "loss": 0.2269, "loss_nan_ranks": 0, "loss_rank_avg": 0.25401657819747925, "step": 1695 }, { "epoch": 0.8909853249475891, "grad_norm": 0.7080795168876648, "learning_rate": 3.991005474266377e-05, "loss": 0.2328, "loss_nan_ranks": 0, "loss_rank_avg": 0.22854046523571014, "step": 1700 }, { "epoch": 0.8936058700209644, "grad_norm": 0.7806035876274109, "learning_rate": 3.990756177055228e-05, "loss": 0.2291, "loss_nan_ranks": 0, "loss_rank_avg": 0.24327123165130615, "step": 1705 }, { "epoch": 0.8962264150943396, "grad_norm": 0.8708102703094482, "learning_rate": 3.990503480079624e-05, "loss": 0.2572, "loss_nan_ranks": 0, "loss_rank_avg": 0.28605660796165466, "step": 1710 }, { "epoch": 0.8988469601677149, "grad_norm": 0.745387613773346, "learning_rate": 3.9902473837711166e-05, "loss": 0.2443, "loss_nan_ranks": 0, "loss_rank_avg": 0.24579143524169922, "step": 1715 }, { "epoch": 0.9014675052410901, "grad_norm": 0.7314765453338623, "learning_rate": 3.9899878885670586e-05, "loss": 0.2166, "loss_nan_ranks": 0, "loss_rank_avg": 0.22977255284786224, "step": 1720 }, { "epoch": 0.9040880503144654, "grad_norm": 0.8882567286491394, "learning_rate": 3.989724994910611e-05, "loss": 0.2615, "loss_nan_ranks": 0, "loss_rank_avg": 0.2808782756328583, "step": 1725 }, { "epoch": 0.9067085953878407, "grad_norm": 0.9777259826660156, "learning_rate": 3.989458703250737e-05, "loss": 0.2397, "loss_nan_ranks": 0, "loss_rank_avg": 0.19820043444633484, "step": 1730 }, { "epoch": 0.9093291404612159, "grad_norm": 0.6205902695655823, "learning_rate": 3.989189014042202e-05, "loss": 0.2199, "loss_nan_ranks": 0, "loss_rank_avg": 0.24654075503349304, "step": 1735 }, { "epoch": 0.9119496855345912, "grad_norm": 0.7724601030349731, "learning_rate": 3.988915927745576e-05, "loss": 0.2001, "loss_nan_ranks": 0, "loss_rank_avg": 0.18580566346645355, "step": 1740 }, { "epoch": 0.9145702306079665, "grad_norm": 0.7739741206169128, "learning_rate": 3.9886394448272274e-05, "loss": 0.2371, "loss_nan_ranks": 0, "loss_rank_avg": 0.26846638321876526, "step": 1745 }, { "epoch": 0.9171907756813418, "grad_norm": 0.8521780371665955, "learning_rate": 3.988359565759328e-05, "loss": 0.2126, "loss_nan_ranks": 0, "loss_rank_avg": 0.1624755859375, "step": 1750 }, { "epoch": 0.9198113207547169, "grad_norm": 1.187021017074585, "learning_rate": 3.988076291019849e-05, "loss": 0.2214, "loss_nan_ranks": 0, "loss_rank_avg": 0.22318154573440552, "step": 1755 }, { "epoch": 0.9224318658280922, "grad_norm": 0.84868323802948, "learning_rate": 3.987789621092558e-05, "loss": 0.2532, "loss_nan_ranks": 0, "loss_rank_avg": 0.23538881540298462, "step": 1760 }, { "epoch": 0.9250524109014675, "grad_norm": 0.8934110403060913, "learning_rate": 3.9874995564670245e-05, "loss": 0.2231, "loss_nan_ranks": 0, "loss_rank_avg": 0.20849609375, "step": 1765 }, { "epoch": 0.9276729559748428, "grad_norm": 1.0175093412399292, "learning_rate": 3.987206097638614e-05, "loss": 0.257, "loss_nan_ranks": 0, "loss_rank_avg": 0.21352073550224304, "step": 1770 }, { "epoch": 0.9302935010482181, "grad_norm": 0.7733330726623535, "learning_rate": 3.986909245108487e-05, "loss": 0.2424, "loss_nan_ranks": 0, "loss_rank_avg": 0.2377007156610489, "step": 1775 }, { "epoch": 0.9329140461215933, "grad_norm": 0.9361969828605652, "learning_rate": 3.9866089993836006e-05, "loss": 0.2168, "loss_nan_ranks": 0, "loss_rank_avg": 0.22012554109096527, "step": 1780 }, { "epoch": 0.9355345911949685, "grad_norm": 0.7702367305755615, "learning_rate": 3.986305360976709e-05, "loss": 0.2432, "loss_nan_ranks": 0, "loss_rank_avg": 0.2497783899307251, "step": 1785 }, { "epoch": 0.9381551362683438, "grad_norm": 0.7354511618614197, "learning_rate": 3.985998330406357e-05, "loss": 0.2343, "loss_nan_ranks": 0, "loss_rank_avg": 0.19119471311569214, "step": 1790 }, { "epoch": 0.9407756813417191, "grad_norm": 3.6996049880981445, "learning_rate": 3.9856879081968846e-05, "loss": 0.2216, "loss_nan_ranks": 0, "loss_rank_avg": 0.255986750125885, "step": 1795 }, { "epoch": 0.9433962264150944, "grad_norm": 0.7672015428543091, "learning_rate": 3.985374094878423e-05, "loss": 0.2202, "loss_nan_ranks": 0, "loss_rank_avg": 0.22132608294487, "step": 1800 }, { "epoch": 0.9460167714884696, "grad_norm": 0.9519333839416504, "learning_rate": 3.985056890986895e-05, "loss": 0.2166, "loss_nan_ranks": 0, "loss_rank_avg": 0.2124580442905426, "step": 1805 }, { "epoch": 0.9486373165618449, "grad_norm": 0.7499129772186279, "learning_rate": 3.984736297064012e-05, "loss": 0.2235, "loss_nan_ranks": 0, "loss_rank_avg": 0.22277656197547913, "step": 1810 }, { "epoch": 0.9512578616352201, "grad_norm": 0.7869206070899963, "learning_rate": 3.984412313657279e-05, "loss": 0.2222, "loss_nan_ranks": 0, "loss_rank_avg": 0.18107184767723083, "step": 1815 }, { "epoch": 0.9538784067085954, "grad_norm": 0.6203080415725708, "learning_rate": 3.984084941319985e-05, "loss": 0.2191, "loss_nan_ranks": 0, "loss_rank_avg": 0.29906970262527466, "step": 1820 }, { "epoch": 0.9564989517819706, "grad_norm": 0.7308595776557922, "learning_rate": 3.983754180611209e-05, "loss": 0.2391, "loss_nan_ranks": 0, "loss_rank_avg": 0.2216796875, "step": 1825 }, { "epoch": 0.9591194968553459, "grad_norm": 0.686708390712738, "learning_rate": 3.983420032095817e-05, "loss": 0.2421, "loss_nan_ranks": 0, "loss_rank_avg": 0.23726975917816162, "step": 1830 }, { "epoch": 0.9617400419287212, "grad_norm": 0.633300244808197, "learning_rate": 3.983082496344458e-05, "loss": 0.2228, "loss_nan_ranks": 0, "loss_rank_avg": 0.22657258808612823, "step": 1835 }, { "epoch": 0.9643605870020965, "grad_norm": 0.6701763868331909, "learning_rate": 3.982741573933568e-05, "loss": 0.2363, "loss_nan_ranks": 0, "loss_rank_avg": 0.22205190360546112, "step": 1840 }, { "epoch": 0.9669811320754716, "grad_norm": 0.749075174331665, "learning_rate": 3.9823972654453664e-05, "loss": 0.2261, "loss_nan_ranks": 0, "loss_rank_avg": 0.22223912179470062, "step": 1845 }, { "epoch": 0.9696016771488469, "grad_norm": 0.9106373190879822, "learning_rate": 3.9820495714678536e-05, "loss": 0.2036, "loss_nan_ranks": 0, "loss_rank_avg": 0.20068359375, "step": 1850 }, { "epoch": 0.9722222222222222, "grad_norm": 2.0793375968933105, "learning_rate": 3.981698492594814e-05, "loss": 0.2129, "loss_nan_ranks": 0, "loss_rank_avg": 0.19463402032852173, "step": 1855 }, { "epoch": 0.9748427672955975, "grad_norm": 0.995448112487793, "learning_rate": 3.981344029425811e-05, "loss": 0.2196, "loss_nan_ranks": 0, "loss_rank_avg": 0.20898227393627167, "step": 1860 }, { "epoch": 0.9774633123689728, "grad_norm": 0.8250730633735657, "learning_rate": 3.980986182566188e-05, "loss": 0.2307, "loss_nan_ranks": 0, "loss_rank_avg": 0.2784325182437897, "step": 1865 }, { "epoch": 0.980083857442348, "grad_norm": 0.7888976335525513, "learning_rate": 3.980624952627067e-05, "loss": 0.232, "loss_nan_ranks": 0, "loss_rank_avg": 0.1858631819486618, "step": 1870 }, { "epoch": 0.9827044025157232, "grad_norm": 0.6838705539703369, "learning_rate": 3.980260340225347e-05, "loss": 0.2289, "loss_nan_ranks": 0, "loss_rank_avg": 0.22918249666690826, "step": 1875 }, { "epoch": 0.9853249475890985, "grad_norm": 0.737946093082428, "learning_rate": 3.979892345983706e-05, "loss": 0.2264, "loss_nan_ranks": 0, "loss_rank_avg": 0.24606642127037048, "step": 1880 }, { "epoch": 0.9879454926624738, "grad_norm": 0.649776816368103, "learning_rate": 3.979520970530594e-05, "loss": 0.2278, "loss_nan_ranks": 0, "loss_rank_avg": 0.22287237644195557, "step": 1885 }, { "epoch": 0.9905660377358491, "grad_norm": 0.8000211715698242, "learning_rate": 3.979146214500237e-05, "loss": 0.2398, "loss_nan_ranks": 0, "loss_rank_avg": 0.2615301012992859, "step": 1890 }, { "epoch": 0.9931865828092243, "grad_norm": 0.6818714141845703, "learning_rate": 3.9787680785326343e-05, "loss": 0.2103, "loss_nan_ranks": 0, "loss_rank_avg": 0.20701850950717926, "step": 1895 }, { "epoch": 0.9958071278825996, "grad_norm": 0.8771920204162598, "learning_rate": 3.978386563273557e-05, "loss": 0.2325, "loss_nan_ranks": 0, "loss_rank_avg": 0.2713809609413147, "step": 1900 }, { "epoch": 0.9984276729559748, "grad_norm": 0.6226481795310974, "learning_rate": 3.978001669374548e-05, "loss": 0.2313, "loss_nan_ranks": 0, "loss_rank_avg": 0.24410386383533478, "step": 1905 }, { "epoch": 1.00104821802935, "grad_norm": 0.7404968738555908, "learning_rate": 3.9776133974929193e-05, "loss": 0.201, "loss_nan_ranks": 0, "loss_rank_avg": 0.20939648151397705, "step": 1910 }, { "epoch": 1.0036687631027255, "grad_norm": 0.8718746900558472, "learning_rate": 3.9772217482917524e-05, "loss": 0.1977, "loss_nan_ranks": 0, "loss_rank_avg": 0.18115234375, "step": 1915 }, { "epoch": 1.0062893081761006, "grad_norm": 0.7572813034057617, "learning_rate": 3.9768267224398956e-05, "loss": 0.2204, "loss_nan_ranks": 0, "loss_rank_avg": 0.21850603818893433, "step": 1920 }, { "epoch": 1.0089098532494758, "grad_norm": 0.7003916501998901, "learning_rate": 3.976428320611965e-05, "loss": 0.2008, "loss_nan_ranks": 0, "loss_rank_avg": 0.14007088541984558, "step": 1925 }, { "epoch": 1.0115303983228512, "grad_norm": 0.808007001876831, "learning_rate": 3.976026543488341e-05, "loss": 0.2156, "loss_nan_ranks": 0, "loss_rank_avg": 0.20655229687690735, "step": 1930 }, { "epoch": 1.0141509433962264, "grad_norm": 0.7994524240493774, "learning_rate": 3.9756213917551685e-05, "loss": 0.2088, "loss_nan_ranks": 0, "loss_rank_avg": 0.21075238287448883, "step": 1935 }, { "epoch": 1.0167714884696017, "grad_norm": 0.7276920676231384, "learning_rate": 3.975212866104356e-05, "loss": 0.2034, "loss_nan_ranks": 0, "loss_rank_avg": 0.20128029584884644, "step": 1940 }, { "epoch": 1.019392033542977, "grad_norm": 0.628477156162262, "learning_rate": 3.974800967233574e-05, "loss": 0.2324, "loss_nan_ranks": 0, "loss_rank_avg": 0.26335054636001587, "step": 1945 }, { "epoch": 1.0220125786163523, "grad_norm": 0.784572958946228, "learning_rate": 3.974385695846252e-05, "loss": 0.2007, "loss_nan_ranks": 0, "loss_rank_avg": 0.21776393055915833, "step": 1950 }, { "epoch": 1.0246331236897275, "grad_norm": 0.7957406044006348, "learning_rate": 3.9739670526515815e-05, "loss": 0.2092, "loss_nan_ranks": 0, "loss_rank_avg": 0.21917438507080078, "step": 1955 }, { "epoch": 1.0272536687631026, "grad_norm": 0.7487369179725647, "learning_rate": 3.9735450383645104e-05, "loss": 0.1986, "loss_nan_ranks": 0, "loss_rank_avg": 0.1875, "step": 1960 }, { "epoch": 1.029874213836478, "grad_norm": 0.7331587672233582, "learning_rate": 3.9731196537057445e-05, "loss": 0.2249, "loss_nan_ranks": 0, "loss_rank_avg": 0.19073855876922607, "step": 1965 }, { "epoch": 1.0324947589098532, "grad_norm": 0.827370285987854, "learning_rate": 3.972690899401745e-05, "loss": 0.2076, "loss_nan_ranks": 0, "loss_rank_avg": 0.20354562997817993, "step": 1970 }, { "epoch": 1.0351153039832286, "grad_norm": 0.6845994591712952, "learning_rate": 3.9722587761847294e-05, "loss": 0.2144, "loss_nan_ranks": 0, "loss_rank_avg": 0.24186748266220093, "step": 1975 }, { "epoch": 1.0377358490566038, "grad_norm": 0.7059889435768127, "learning_rate": 3.971823284792665e-05, "loss": 0.2165, "loss_nan_ranks": 0, "loss_rank_avg": 0.22607070207595825, "step": 1980 }, { "epoch": 1.040356394129979, "grad_norm": 0.6191701889038086, "learning_rate": 3.9713844259692746e-05, "loss": 0.2152, "loss_nan_ranks": 0, "loss_rank_avg": 0.19120293855667114, "step": 1985 }, { "epoch": 1.0429769392033543, "grad_norm": 0.6356191039085388, "learning_rate": 3.970942200464031e-05, "loss": 0.2068, "loss_nan_ranks": 0, "loss_rank_avg": 0.21611721813678741, "step": 1990 }, { "epoch": 1.0455974842767295, "grad_norm": 0.6623996496200562, "learning_rate": 3.9704966090321536e-05, "loss": 0.2161, "loss_nan_ranks": 0, "loss_rank_avg": 0.2316063493490219, "step": 1995 }, { "epoch": 1.0482180293501049, "grad_norm": 0.7194094061851501, "learning_rate": 3.970047652434615e-05, "loss": 0.2052, "loss_nan_ranks": 0, "loss_rank_avg": 0.2116880565881729, "step": 2000 }, { "epoch": 1.05083857442348, "grad_norm": 0.9472671151161194, "learning_rate": 3.9695953314381305e-05, "loss": 0.2152, "loss_nan_ranks": 0, "loss_rank_avg": 0.20667138695716858, "step": 2005 }, { "epoch": 1.0534591194968554, "grad_norm": 0.9666203260421753, "learning_rate": 3.969139646815165e-05, "loss": 0.2201, "loss_nan_ranks": 0, "loss_rank_avg": 0.2363303005695343, "step": 2010 }, { "epoch": 1.0560796645702306, "grad_norm": 0.8482215404510498, "learning_rate": 3.9686805993439226e-05, "loss": 0.2039, "loss_nan_ranks": 0, "loss_rank_avg": 0.21171987056732178, "step": 2015 }, { "epoch": 1.0587002096436058, "grad_norm": 1.468690276145935, "learning_rate": 3.968218189808356e-05, "loss": 0.1857, "loss_nan_ranks": 0, "loss_rank_avg": 0.190673828125, "step": 2020 }, { "epoch": 1.0613207547169812, "grad_norm": 0.7537864446640015, "learning_rate": 3.967752418998155e-05, "loss": 0.2281, "loss_nan_ranks": 0, "loss_rank_avg": 0.2312135249376297, "step": 2025 }, { "epoch": 1.0639412997903563, "grad_norm": 0.7670570015907288, "learning_rate": 3.9672832877087524e-05, "loss": 0.2033, "loss_nan_ranks": 0, "loss_rank_avg": 0.24082399904727936, "step": 2030 }, { "epoch": 1.0665618448637317, "grad_norm": 0.7302185893058777, "learning_rate": 3.966810796741318e-05, "loss": 0.2164, "loss_nan_ranks": 0, "loss_rank_avg": 0.2091417908668518, "step": 2035 }, { "epoch": 1.069182389937107, "grad_norm": 0.7771731615066528, "learning_rate": 3.9663349469027626e-05, "loss": 0.2248, "loss_nan_ranks": 0, "loss_rank_avg": 0.2181556224822998, "step": 2040 }, { "epoch": 1.0718029350104823, "grad_norm": 0.7886734008789062, "learning_rate": 3.9658557390057286e-05, "loss": 0.2197, "loss_nan_ranks": 0, "loss_rank_avg": 0.23269076645374298, "step": 2045 }, { "epoch": 1.0744234800838575, "grad_norm": 1.0671067237854004, "learning_rate": 3.965373173868596e-05, "loss": 0.211, "loss_nan_ranks": 0, "loss_rank_avg": 0.178466796875, "step": 2050 }, { "epoch": 1.0770440251572326, "grad_norm": 0.7055492401123047, "learning_rate": 3.9648872523154785e-05, "loss": 0.1981, "loss_nan_ranks": 0, "loss_rank_avg": 0.18505859375, "step": 2055 }, { "epoch": 1.079664570230608, "grad_norm": 0.7833647131919861, "learning_rate": 3.96439797517622e-05, "loss": 0.2074, "loss_nan_ranks": 0, "loss_rank_avg": 0.21262788772583008, "step": 2060 }, { "epoch": 1.0822851153039832, "grad_norm": 0.8312519192695618, "learning_rate": 3.963905343286396e-05, "loss": 0.2008, "loss_nan_ranks": 0, "loss_rank_avg": 0.2077818363904953, "step": 2065 }, { "epoch": 1.0849056603773586, "grad_norm": 0.7561509609222412, "learning_rate": 3.963409357487312e-05, "loss": 0.2079, "loss_nan_ranks": 0, "loss_rank_avg": 0.23984187841415405, "step": 2070 }, { "epoch": 1.0875262054507338, "grad_norm": 0.6779179573059082, "learning_rate": 3.9629100186259994e-05, "loss": 0.2279, "loss_nan_ranks": 0, "loss_rank_avg": 0.21529117226600647, "step": 2075 }, { "epoch": 1.090146750524109, "grad_norm": 0.7629055976867676, "learning_rate": 3.9624073275552176e-05, "loss": 0.2046, "loss_nan_ranks": 0, "loss_rank_avg": 0.19189453125, "step": 2080 }, { "epoch": 1.0927672955974843, "grad_norm": 0.7624127864837646, "learning_rate": 3.96190128513345e-05, "loss": 0.2097, "loss_nan_ranks": 0, "loss_rank_avg": 0.21261179447174072, "step": 2085 }, { "epoch": 1.0953878406708595, "grad_norm": 0.9637182950973511, "learning_rate": 3.9613918922249025e-05, "loss": 0.1901, "loss_nan_ranks": 0, "loss_rank_avg": 0.1895751953125, "step": 2090 }, { "epoch": 1.0980083857442349, "grad_norm": 0.6332825422286987, "learning_rate": 3.960879149699505e-05, "loss": 0.196, "loss_nan_ranks": 0, "loss_rank_avg": 0.19877511262893677, "step": 2095 }, { "epoch": 1.10062893081761, "grad_norm": 0.7005939483642578, "learning_rate": 3.960363058432906e-05, "loss": 0.2077, "loss_nan_ranks": 0, "loss_rank_avg": 0.171142578125, "step": 2100 }, { "epoch": 1.1032494758909852, "grad_norm": 0.8345542550086975, "learning_rate": 3.959843619306472e-05, "loss": 0.2043, "loss_nan_ranks": 0, "loss_rank_avg": 0.22179073095321655, "step": 2105 }, { "epoch": 1.1058700209643606, "grad_norm": 0.6734275221824646, "learning_rate": 3.959320833207292e-05, "loss": 0.2301, "loss_nan_ranks": 0, "loss_rank_avg": 0.24089553952217102, "step": 2110 }, { "epoch": 1.1084905660377358, "grad_norm": 0.883843183517456, "learning_rate": 3.958794701028164e-05, "loss": 0.2037, "loss_nan_ranks": 0, "loss_rank_avg": 0.231886625289917, "step": 2115 }, { "epoch": 1.1111111111111112, "grad_norm": 1.0141050815582275, "learning_rate": 3.958265223667605e-05, "loss": 0.2103, "loss_nan_ranks": 0, "loss_rank_avg": 0.2082097977399826, "step": 2120 }, { "epoch": 1.1137316561844863, "grad_norm": 0.9002329111099243, "learning_rate": 3.957732402029842e-05, "loss": 0.2059, "loss_nan_ranks": 0, "loss_rank_avg": 0.166748046875, "step": 2125 }, { "epoch": 1.1163522012578617, "grad_norm": 0.7092249989509583, "learning_rate": 3.957196237024817e-05, "loss": 0.2313, "loss_nan_ranks": 0, "loss_rank_avg": 0.27344369888305664, "step": 2130 }, { "epoch": 1.118972746331237, "grad_norm": 0.601761519908905, "learning_rate": 3.956656729568178e-05, "loss": 0.2101, "loss_nan_ranks": 0, "loss_rank_avg": 0.22603391110897064, "step": 2135 }, { "epoch": 1.121593291404612, "grad_norm": 0.7879045605659485, "learning_rate": 3.956113880581282e-05, "loss": 0.2111, "loss_nan_ranks": 0, "loss_rank_avg": 0.20690859854221344, "step": 2140 }, { "epoch": 1.1242138364779874, "grad_norm": 0.6645146608352661, "learning_rate": 3.955567690991195e-05, "loss": 0.223, "loss_nan_ranks": 0, "loss_rank_avg": 0.21006427705287933, "step": 2145 }, { "epoch": 1.1268343815513626, "grad_norm": 0.6369447112083435, "learning_rate": 3.9550181617306845e-05, "loss": 0.2318, "loss_nan_ranks": 0, "loss_rank_avg": 0.23548580706119537, "step": 2150 }, { "epoch": 1.129454926624738, "grad_norm": 0.7624822854995728, "learning_rate": 3.9544652937382235e-05, "loss": 0.1966, "loss_nan_ranks": 0, "loss_rank_avg": 0.175537109375, "step": 2155 }, { "epoch": 1.1320754716981132, "grad_norm": 0.8087216019630432, "learning_rate": 3.953909087957987e-05, "loss": 0.1976, "loss_nan_ranks": 0, "loss_rank_avg": 0.16578355431556702, "step": 2160 }, { "epoch": 1.1346960167714886, "grad_norm": 1.0704227685928345, "learning_rate": 3.9533495453398485e-05, "loss": 0.2171, "loss_nan_ranks": 0, "loss_rank_avg": 0.1859961748123169, "step": 2165 }, { "epoch": 1.1373165618448637, "grad_norm": 0.8290188908576965, "learning_rate": 3.952786666839382e-05, "loss": 0.2155, "loss_nan_ranks": 0, "loss_rank_avg": 0.22795447707176208, "step": 2170 }, { "epoch": 1.139937106918239, "grad_norm": 0.7330989837646484, "learning_rate": 3.9522204534178574e-05, "loss": 0.1959, "loss_nan_ranks": 0, "loss_rank_avg": 0.20861802995204926, "step": 2175 }, { "epoch": 1.1425576519916143, "grad_norm": 0.7780997157096863, "learning_rate": 3.9516509060422395e-05, "loss": 0.2103, "loss_nan_ranks": 0, "loss_rank_avg": 0.21568194031715393, "step": 2180 }, { "epoch": 1.1451781970649895, "grad_norm": 0.7849951982498169, "learning_rate": 3.9510780256851886e-05, "loss": 0.2216, "loss_nan_ranks": 0, "loss_rank_avg": 0.2479293793439865, "step": 2185 }, { "epoch": 1.1477987421383649, "grad_norm": 0.9373758435249329, "learning_rate": 3.950501813325054e-05, "loss": 0.2058, "loss_nan_ranks": 0, "loss_rank_avg": 0.19455841183662415, "step": 2190 }, { "epoch": 1.15041928721174, "grad_norm": 0.7577291131019592, "learning_rate": 3.949922269945878e-05, "loss": 0.2258, "loss_nan_ranks": 0, "loss_rank_avg": 0.19436845183372498, "step": 2195 }, { "epoch": 1.1530398322851152, "grad_norm": 0.7371213436126709, "learning_rate": 3.9493393965373904e-05, "loss": 0.219, "loss_nan_ranks": 0, "loss_rank_avg": 0.2209862470626831, "step": 2200 }, { "epoch": 1.1556603773584906, "grad_norm": 0.632513701915741, "learning_rate": 3.948753194095008e-05, "loss": 0.2091, "loss_nan_ranks": 0, "loss_rank_avg": 0.2065919041633606, "step": 2205 }, { "epoch": 1.1582809224318658, "grad_norm": 0.8861731886863708, "learning_rate": 3.9481636636198325e-05, "loss": 0.2027, "loss_nan_ranks": 0, "loss_rank_avg": 0.19612561166286469, "step": 2210 }, { "epoch": 1.1609014675052411, "grad_norm": 1.6322046518325806, "learning_rate": 3.94757080611865e-05, "loss": 0.1947, "loss_nan_ranks": 0, "loss_rank_avg": 0.185546875, "step": 2215 }, { "epoch": 1.1635220125786163, "grad_norm": 0.7297495603561401, "learning_rate": 3.9469746226039285e-05, "loss": 0.2012, "loss_nan_ranks": 0, "loss_rank_avg": 0.1977044939994812, "step": 2220 }, { "epoch": 1.1661425576519917, "grad_norm": 0.7956055402755737, "learning_rate": 3.946375114093816e-05, "loss": 0.1991, "loss_nan_ranks": 0, "loss_rank_avg": 0.19768553972244263, "step": 2225 }, { "epoch": 1.1687631027253669, "grad_norm": 0.6843878626823425, "learning_rate": 3.9457722816121354e-05, "loss": 0.2115, "loss_nan_ranks": 0, "loss_rank_avg": 0.23284509778022766, "step": 2230 }, { "epoch": 1.171383647798742, "grad_norm": 0.8009164333343506, "learning_rate": 3.945166126188392e-05, "loss": 0.1903, "loss_nan_ranks": 0, "loss_rank_avg": 0.176334410905838, "step": 2235 }, { "epoch": 1.1740041928721174, "grad_norm": 0.8924102187156677, "learning_rate": 3.9445566488577624e-05, "loss": 0.201, "loss_nan_ranks": 0, "loss_rank_avg": 0.16902442276477814, "step": 2240 }, { "epoch": 1.1766247379454926, "grad_norm": 0.6214645504951477, "learning_rate": 3.943943850661097e-05, "loss": 0.2011, "loss_nan_ranks": 0, "loss_rank_avg": 0.19259414076805115, "step": 2245 }, { "epoch": 1.179245283018868, "grad_norm": 1.5890018939971924, "learning_rate": 3.943327732644917e-05, "loss": 0.2003, "loss_nan_ranks": 0, "loss_rank_avg": 0.23410651087760925, "step": 2250 }, { "epoch": 1.1818658280922432, "grad_norm": 0.6572086215019226, "learning_rate": 3.942708295861415e-05, "loss": 0.2193, "loss_nan_ranks": 0, "loss_rank_avg": 0.2210836112499237, "step": 2255 }, { "epoch": 1.1844863731656186, "grad_norm": 0.9096604585647583, "learning_rate": 3.942085541368448e-05, "loss": 0.2123, "loss_nan_ranks": 0, "loss_rank_avg": 0.1952608823776245, "step": 2260 }, { "epoch": 1.1871069182389937, "grad_norm": 0.7038206458091736, "learning_rate": 3.941459470229542e-05, "loss": 0.1928, "loss_nan_ranks": 0, "loss_rank_avg": 0.19516688585281372, "step": 2265 }, { "epoch": 1.189727463312369, "grad_norm": 0.8303375840187073, "learning_rate": 3.940830083513885e-05, "loss": 0.2058, "loss_nan_ranks": 0, "loss_rank_avg": 0.22259192168712616, "step": 2270 }, { "epoch": 1.1923480083857443, "grad_norm": 0.708311140537262, "learning_rate": 3.940197382296329e-05, "loss": 0.214, "loss_nan_ranks": 0, "loss_rank_avg": 0.23684288561344147, "step": 2275 }, { "epoch": 1.1949685534591195, "grad_norm": 0.6626346707344055, "learning_rate": 3.9395613676573863e-05, "loss": 0.2181, "loss_nan_ranks": 0, "loss_rank_avg": 0.245641827583313, "step": 2280 }, { "epoch": 1.1975890985324948, "grad_norm": 0.7180384397506714, "learning_rate": 3.9389220406832256e-05, "loss": 0.2016, "loss_nan_ranks": 0, "loss_rank_avg": 0.19217683374881744, "step": 2285 }, { "epoch": 1.20020964360587, "grad_norm": 0.6733828186988831, "learning_rate": 3.938279402465674e-05, "loss": 0.1839, "loss_nan_ranks": 0, "loss_rank_avg": 0.17612981796264648, "step": 2290 }, { "epoch": 1.2028301886792452, "grad_norm": 0.5625250339508057, "learning_rate": 3.937633454102214e-05, "loss": 0.1945, "loss_nan_ranks": 0, "loss_rank_avg": 0.1847318410873413, "step": 2295 }, { "epoch": 1.2054507337526206, "grad_norm": 0.5799747705459595, "learning_rate": 3.93698419669598e-05, "loss": 0.2065, "loss_nan_ranks": 0, "loss_rank_avg": 0.21696288883686066, "step": 2300 }, { "epoch": 1.2080712788259957, "grad_norm": 0.6593776345252991, "learning_rate": 3.936331631355757e-05, "loss": 0.2048, "loss_nan_ranks": 0, "loss_rank_avg": 0.18715828657150269, "step": 2305 }, { "epoch": 1.2106918238993711, "grad_norm": 0.7460365295410156, "learning_rate": 3.9356757591959815e-05, "loss": 0.2206, "loss_nan_ranks": 0, "loss_rank_avg": 0.21599701046943665, "step": 2310 }, { "epoch": 1.2133123689727463, "grad_norm": 0.6883477568626404, "learning_rate": 3.9350165813367344e-05, "loss": 0.2114, "loss_nan_ranks": 0, "loss_rank_avg": 0.1934875249862671, "step": 2315 }, { "epoch": 1.2159329140461215, "grad_norm": 0.7474843263626099, "learning_rate": 3.9343540989037455e-05, "loss": 0.2111, "loss_nan_ranks": 0, "loss_rank_avg": 0.21142578125, "step": 2320 }, { "epoch": 1.2185534591194969, "grad_norm": 0.7178785800933838, "learning_rate": 3.933688313028384e-05, "loss": 0.2146, "loss_nan_ranks": 0, "loss_rank_avg": 0.2500786781311035, "step": 2325 }, { "epoch": 1.221174004192872, "grad_norm": 0.7264668941497803, "learning_rate": 3.933019224847663e-05, "loss": 0.2121, "loss_nan_ranks": 0, "loss_rank_avg": 0.22011686861515045, "step": 2330 }, { "epoch": 1.2237945492662474, "grad_norm": 0.6529995203018188, "learning_rate": 3.9323468355042354e-05, "loss": 0.2178, "loss_nan_ranks": 0, "loss_rank_avg": 0.19955116510391235, "step": 2335 }, { "epoch": 1.2264150943396226, "grad_norm": 0.7731424570083618, "learning_rate": 3.931671146146391e-05, "loss": 0.1926, "loss_nan_ranks": 0, "loss_rank_avg": 0.201904296875, "step": 2340 }, { "epoch": 1.229035639412998, "grad_norm": 0.6317564845085144, "learning_rate": 3.930992157928056e-05, "loss": 0.1962, "loss_nan_ranks": 0, "loss_rank_avg": 0.21055123209953308, "step": 2345 }, { "epoch": 1.2316561844863732, "grad_norm": 0.7882787585258484, "learning_rate": 3.930309872008788e-05, "loss": 0.2244, "loss_nan_ranks": 0, "loss_rank_avg": 0.21452301740646362, "step": 2350 }, { "epoch": 1.2342767295597485, "grad_norm": 0.6930122971534729, "learning_rate": 3.92962428955378e-05, "loss": 0.2011, "loss_nan_ranks": 0, "loss_rank_avg": 0.19605226814746857, "step": 2355 }, { "epoch": 1.2368972746331237, "grad_norm": 0.7814037799835205, "learning_rate": 3.928935411733852e-05, "loss": 0.2076, "loss_nan_ranks": 0, "loss_rank_avg": 0.17131970822811127, "step": 2360 }, { "epoch": 1.2395178197064989, "grad_norm": 0.6624770760536194, "learning_rate": 3.928243239725453e-05, "loss": 0.2002, "loss_nan_ranks": 0, "loss_rank_avg": 0.22540496289730072, "step": 2365 }, { "epoch": 1.2421383647798743, "grad_norm": 0.8233559727668762, "learning_rate": 3.927547774710658e-05, "loss": 0.2039, "loss_nan_ranks": 0, "loss_rank_avg": 0.2119140625, "step": 2370 }, { "epoch": 1.2447589098532494, "grad_norm": 0.7578710317611694, "learning_rate": 3.926849017877163e-05, "loss": 0.1919, "loss_nan_ranks": 0, "loss_rank_avg": 0.20740604400634766, "step": 2375 }, { "epoch": 1.2473794549266248, "grad_norm": 0.786601722240448, "learning_rate": 3.926146970418289e-05, "loss": 0.2168, "loss_nan_ranks": 0, "loss_rank_avg": 0.24346387386322021, "step": 2380 }, { "epoch": 1.25, "grad_norm": 0.7464667558670044, "learning_rate": 3.925441633532976e-05, "loss": 0.2058, "loss_nan_ranks": 0, "loss_rank_avg": 0.185876727104187, "step": 2385 }, { "epoch": 1.2526205450733752, "grad_norm": 0.6592541337013245, "learning_rate": 3.92473300842578e-05, "loss": 0.1942, "loss_nan_ranks": 0, "loss_rank_avg": 0.18294116854667664, "step": 2390 }, { "epoch": 1.2552410901467506, "grad_norm": 0.6724171042442322, "learning_rate": 3.9240210963068734e-05, "loss": 0.2013, "loss_nan_ranks": 0, "loss_rank_avg": 0.21190500259399414, "step": 2395 }, { "epoch": 1.2578616352201257, "grad_norm": 0.8747267127037048, "learning_rate": 3.923305898392043e-05, "loss": 0.213, "loss_nan_ranks": 0, "loss_rank_avg": 0.25538158416748047, "step": 2400 }, { "epoch": 1.2604821802935011, "grad_norm": 0.7435500621795654, "learning_rate": 3.922587415902686e-05, "loss": 0.2191, "loss_nan_ranks": 0, "loss_rank_avg": 0.2292519509792328, "step": 2405 }, { "epoch": 1.2631027253668763, "grad_norm": 0.790035605430603, "learning_rate": 3.921865650065809e-05, "loss": 0.209, "loss_nan_ranks": 0, "loss_rank_avg": 0.21131622791290283, "step": 2410 }, { "epoch": 1.2657232704402515, "grad_norm": 0.7259888648986816, "learning_rate": 3.921140602114026e-05, "loss": 0.2129, "loss_nan_ranks": 0, "loss_rank_avg": 0.22094830870628357, "step": 2415 }, { "epoch": 1.2683438155136268, "grad_norm": 0.5739971399307251, "learning_rate": 3.920412273285556e-05, "loss": 0.1962, "loss_nan_ranks": 0, "loss_rank_avg": 0.18913546204566956, "step": 2420 }, { "epoch": 1.270964360587002, "grad_norm": 0.8419296145439148, "learning_rate": 3.9196806648242216e-05, "loss": 0.2064, "loss_nan_ranks": 0, "loss_rank_avg": 0.203369140625, "step": 2425 }, { "epoch": 1.2735849056603774, "grad_norm": 0.6042289137840271, "learning_rate": 3.9189457779794446e-05, "loss": 0.2043, "loss_nan_ranks": 0, "loss_rank_avg": 0.2455616444349289, "step": 2430 }, { "epoch": 1.2762054507337526, "grad_norm": 0.6255780458450317, "learning_rate": 3.9182076140062475e-05, "loss": 0.2042, "loss_nan_ranks": 0, "loss_rank_avg": 0.22539886832237244, "step": 2435 }, { "epoch": 1.2788259958071277, "grad_norm": 0.6214666366577148, "learning_rate": 3.9174661741652483e-05, "loss": 0.2057, "loss_nan_ranks": 0, "loss_rank_avg": 0.20764382183551788, "step": 2440 }, { "epoch": 1.2814465408805031, "grad_norm": 0.8051807880401611, "learning_rate": 3.91672145972266e-05, "loss": 0.1999, "loss_nan_ranks": 0, "loss_rank_avg": 0.19399192929267883, "step": 2445 }, { "epoch": 1.2840670859538785, "grad_norm": 0.6633710265159607, "learning_rate": 3.915973471950287e-05, "loss": 0.225, "loss_nan_ranks": 0, "loss_rank_avg": 0.22795894742012024, "step": 2450 }, { "epoch": 1.2866876310272537, "grad_norm": 0.6695340871810913, "learning_rate": 3.915222212125526e-05, "loss": 0.2077, "loss_nan_ranks": 0, "loss_rank_avg": 0.167012557387352, "step": 2455 }, { "epoch": 1.2893081761006289, "grad_norm": 14.448149681091309, "learning_rate": 3.914467681531358e-05, "loss": 0.2177, "loss_nan_ranks": 0, "loss_rank_avg": 0.23487228155136108, "step": 2460 }, { "epoch": 1.2919287211740043, "grad_norm": 0.696930468082428, "learning_rate": 3.9137098814563535e-05, "loss": 0.2161, "loss_nan_ranks": 0, "loss_rank_avg": 0.2489500641822815, "step": 2465 }, { "epoch": 1.2945492662473794, "grad_norm": 0.7310530543327332, "learning_rate": 3.912948813194663e-05, "loss": 0.2189, "loss_nan_ranks": 0, "loss_rank_avg": 0.20653784275054932, "step": 2470 }, { "epoch": 1.2971698113207548, "grad_norm": 0.6279557943344116, "learning_rate": 3.9121844780460226e-05, "loss": 0.1909, "loss_nan_ranks": 0, "loss_rank_avg": 0.19262602925300598, "step": 2475 }, { "epoch": 1.29979035639413, "grad_norm": 0.8946002125740051, "learning_rate": 3.911416877315743e-05, "loss": 0.2144, "loss_nan_ranks": 0, "loss_rank_avg": 0.237060546875, "step": 2480 }, { "epoch": 1.3024109014675052, "grad_norm": 0.701214075088501, "learning_rate": 3.9106460123147145e-05, "loss": 0.2005, "loss_nan_ranks": 0, "loss_rank_avg": 0.182861328125, "step": 2485 }, { "epoch": 1.3050314465408805, "grad_norm": 0.718437671661377, "learning_rate": 3.909871884359401e-05, "loss": 0.2309, "loss_nan_ranks": 0, "loss_rank_avg": 0.2242903858423233, "step": 2490 }, { "epoch": 1.3076519916142557, "grad_norm": 0.6559104323387146, "learning_rate": 3.90909449477184e-05, "loss": 0.2131, "loss_nan_ranks": 0, "loss_rank_avg": 0.1921052485704422, "step": 2495 }, { "epoch": 1.310272536687631, "grad_norm": 0.8101383447647095, "learning_rate": 3.9083138448796385e-05, "loss": 0.1957, "loss_nan_ranks": 0, "loss_rank_avg": 0.1594591736793518, "step": 2500 }, { "epoch": 1.3128930817610063, "grad_norm": 0.7414583563804626, "learning_rate": 3.907529936015971e-05, "loss": 0.2013, "loss_nan_ranks": 0, "loss_rank_avg": 0.21024464070796967, "step": 2505 }, { "epoch": 1.3155136268343814, "grad_norm": 0.6516440510749817, "learning_rate": 3.9067427695195764e-05, "loss": 0.1987, "loss_nan_ranks": 0, "loss_rank_avg": 0.2437392771244049, "step": 2510 }, { "epoch": 1.3181341719077568, "grad_norm": 0.8260552883148193, "learning_rate": 3.905952346734759e-05, "loss": 0.2162, "loss_nan_ranks": 0, "loss_rank_avg": 0.21447709202766418, "step": 2515 }, { "epoch": 1.320754716981132, "grad_norm": 0.8752504587173462, "learning_rate": 3.905158669011385e-05, "loss": 0.2065, "loss_nan_ranks": 0, "loss_rank_avg": 0.16129451990127563, "step": 2520 }, { "epoch": 1.3233752620545074, "grad_norm": 0.7147773504257202, "learning_rate": 3.904361737704876e-05, "loss": 0.1988, "loss_nan_ranks": 0, "loss_rank_avg": 0.21069785952568054, "step": 2525 }, { "epoch": 1.3259958071278826, "grad_norm": 0.6199226379394531, "learning_rate": 3.903561554176213e-05, "loss": 0.1971, "loss_nan_ranks": 0, "loss_rank_avg": 0.2516867518424988, "step": 2530 }, { "epoch": 1.3286163522012577, "grad_norm": 0.8225551843643188, "learning_rate": 3.902758119791928e-05, "loss": 0.2136, "loss_nan_ranks": 0, "loss_rank_avg": 0.1904865801334381, "step": 2535 }, { "epoch": 1.3312368972746331, "grad_norm": 0.6185528039932251, "learning_rate": 3.901951435924107e-05, "loss": 0.2117, "loss_nan_ranks": 0, "loss_rank_avg": 0.23471373319625854, "step": 2540 }, { "epoch": 1.3338574423480085, "grad_norm": 0.7478176355361938, "learning_rate": 3.901141503950386e-05, "loss": 0.2252, "loss_nan_ranks": 0, "loss_rank_avg": 0.20767131447792053, "step": 2545 }, { "epoch": 1.3364779874213837, "grad_norm": 1.0327668190002441, "learning_rate": 3.900328325253946e-05, "loss": 0.1844, "loss_nan_ranks": 0, "loss_rank_avg": 0.18234822154045105, "step": 2550 }, { "epoch": 1.3390985324947589, "grad_norm": 0.6867483854293823, "learning_rate": 3.8995119012235134e-05, "loss": 0.1999, "loss_nan_ranks": 0, "loss_rank_avg": 0.1513671875, "step": 2555 }, { "epoch": 1.3417190775681342, "grad_norm": 2.9362030029296875, "learning_rate": 3.898692233253358e-05, "loss": 0.2128, "loss_nan_ranks": 0, "loss_rank_avg": 0.22732852399349213, "step": 2560 }, { "epoch": 1.3443396226415094, "grad_norm": 0.7277422547340393, "learning_rate": 3.8978693227432874e-05, "loss": 0.2124, "loss_nan_ranks": 0, "loss_rank_avg": 0.21256864070892334, "step": 2565 }, { "epoch": 1.3469601677148848, "grad_norm": 0.7678915858268738, "learning_rate": 3.897043171098649e-05, "loss": 0.2056, "loss_nan_ranks": 0, "loss_rank_avg": 0.21991431713104248, "step": 2570 }, { "epoch": 1.34958071278826, "grad_norm": 0.6507485508918762, "learning_rate": 3.8962137797303235e-05, "loss": 0.2197, "loss_nan_ranks": 0, "loss_rank_avg": 0.23043885827064514, "step": 2575 }, { "epoch": 1.3522012578616351, "grad_norm": 0.8911487460136414, "learning_rate": 3.8953811500547266e-05, "loss": 0.1867, "loss_nan_ranks": 0, "loss_rank_avg": 0.1748046875, "step": 2580 }, { "epoch": 1.3548218029350105, "grad_norm": 0.6960018277168274, "learning_rate": 3.8945452834938005e-05, "loss": 0.182, "loss_nan_ranks": 0, "loss_rank_avg": 0.1771518588066101, "step": 2585 }, { "epoch": 1.3574423480083857, "grad_norm": 0.6837010979652405, "learning_rate": 3.8937061814750194e-05, "loss": 0.1966, "loss_nan_ranks": 0, "loss_rank_avg": 0.23394589126110077, "step": 2590 }, { "epoch": 1.360062893081761, "grad_norm": 0.6866763234138489, "learning_rate": 3.8928638454313795e-05, "loss": 0.2259, "loss_nan_ranks": 0, "loss_rank_avg": 0.2412635087966919, "step": 2595 }, { "epoch": 1.3626834381551363, "grad_norm": 0.8366163372993469, "learning_rate": 3.8920182768014034e-05, "loss": 0.2101, "loss_nan_ranks": 0, "loss_rank_avg": 0.22028440237045288, "step": 2600 }, { "epoch": 1.3653039832285114, "grad_norm": 0.7486369013786316, "learning_rate": 3.891169477029131e-05, "loss": 0.1983, "loss_nan_ranks": 0, "loss_rank_avg": 0.20773348212242126, "step": 2605 }, { "epoch": 1.3679245283018868, "grad_norm": 0.6341990232467651, "learning_rate": 3.890317447564123e-05, "loss": 0.2024, "loss_nan_ranks": 0, "loss_rank_avg": 0.1899259388446808, "step": 2610 }, { "epoch": 1.370545073375262, "grad_norm": 0.6667135953903198, "learning_rate": 3.889462189861452e-05, "loss": 0.2089, "loss_nan_ranks": 0, "loss_rank_avg": 0.1973501443862915, "step": 2615 }, { "epoch": 1.3731656184486374, "grad_norm": 0.7557380199432373, "learning_rate": 3.888603705381709e-05, "loss": 0.1967, "loss_nan_ranks": 0, "loss_rank_avg": 0.2168242335319519, "step": 2620 }, { "epoch": 1.3757861635220126, "grad_norm": 0.731421172618866, "learning_rate": 3.8877419955909905e-05, "loss": 0.2162, "loss_nan_ranks": 0, "loss_rank_avg": 0.20348013937473297, "step": 2625 }, { "epoch": 1.3784067085953877, "grad_norm": 0.6229634284973145, "learning_rate": 3.886877061960905e-05, "loss": 0.1907, "loss_nan_ranks": 0, "loss_rank_avg": 0.2174549698829651, "step": 2630 }, { "epoch": 1.381027253668763, "grad_norm": 0.8051416277885437, "learning_rate": 3.886008905968563e-05, "loss": 0.2412, "loss_nan_ranks": 0, "loss_rank_avg": 0.22031381726264954, "step": 2635 }, { "epoch": 1.3836477987421385, "grad_norm": 0.6593210101127625, "learning_rate": 3.8851375290965816e-05, "loss": 0.1999, "loss_nan_ranks": 0, "loss_rank_avg": 0.17712393403053284, "step": 2640 }, { "epoch": 1.3862683438155137, "grad_norm": 0.6895683407783508, "learning_rate": 3.884262932833076e-05, "loss": 0.2126, "loss_nan_ranks": 0, "loss_rank_avg": 0.1672450453042984, "step": 2645 }, { "epoch": 1.3888888888888888, "grad_norm": 0.8762910962104797, "learning_rate": 3.88338511867166e-05, "loss": 0.1984, "loss_nan_ranks": 0, "loss_rank_avg": 0.16539700329303741, "step": 2650 }, { "epoch": 1.3915094339622642, "grad_norm": 0.6993823647499084, "learning_rate": 3.882504088111444e-05, "loss": 0.2219, "loss_nan_ranks": 0, "loss_rank_avg": 0.2100042849779129, "step": 2655 }, { "epoch": 1.3941299790356394, "grad_norm": 0.7604408860206604, "learning_rate": 3.8816198426570296e-05, "loss": 0.2167, "loss_nan_ranks": 0, "loss_rank_avg": 0.2532285749912262, "step": 2660 }, { "epoch": 1.3967505241090148, "grad_norm": 0.6632134318351746, "learning_rate": 3.880732383818509e-05, "loss": 0.2115, "loss_nan_ranks": 0, "loss_rank_avg": 0.2341681867837906, "step": 2665 }, { "epoch": 1.39937106918239, "grad_norm": 0.7571194767951965, "learning_rate": 3.879841713111463e-05, "loss": 0.1962, "loss_nan_ranks": 0, "loss_rank_avg": 0.19808389246463776, "step": 2670 }, { "epoch": 1.4019916142557651, "grad_norm": 0.985468864440918, "learning_rate": 3.8789478320569585e-05, "loss": 0.208, "loss_nan_ranks": 0, "loss_rank_avg": 0.21600691974163055, "step": 2675 }, { "epoch": 1.4046121593291405, "grad_norm": 0.7668294906616211, "learning_rate": 3.878050742181542e-05, "loss": 0.2127, "loss_nan_ranks": 0, "loss_rank_avg": 0.22027012705802917, "step": 2680 }, { "epoch": 1.4072327044025157, "grad_norm": 0.7187151312828064, "learning_rate": 3.8771504450172415e-05, "loss": 0.2065, "loss_nan_ranks": 0, "loss_rank_avg": 0.1777723878622055, "step": 2685 }, { "epoch": 1.409853249475891, "grad_norm": 0.6384240388870239, "learning_rate": 3.876246942101563e-05, "loss": 0.193, "loss_nan_ranks": 0, "loss_rank_avg": 0.21956053376197815, "step": 2690 }, { "epoch": 1.4124737945492662, "grad_norm": 0.8472657799720764, "learning_rate": 3.875340234977486e-05, "loss": 0.174, "loss_nan_ranks": 0, "loss_rank_avg": 0.17662659287452698, "step": 2695 }, { "epoch": 1.4150943396226414, "grad_norm": 0.7782993912696838, "learning_rate": 3.874430325193464e-05, "loss": 0.2001, "loss_nan_ranks": 0, "loss_rank_avg": 0.20671376585960388, "step": 2700 }, { "epoch": 1.4177148846960168, "grad_norm": 1.1483628749847412, "learning_rate": 3.873517214303417e-05, "loss": 0.1982, "loss_nan_ranks": 0, "loss_rank_avg": 0.1874844878911972, "step": 2705 }, { "epoch": 1.420335429769392, "grad_norm": 0.936142086982727, "learning_rate": 3.872600903866733e-05, "loss": 0.2218, "loss_nan_ranks": 0, "loss_rank_avg": 0.16953758895397186, "step": 2710 }, { "epoch": 1.4229559748427674, "grad_norm": 0.9390134811401367, "learning_rate": 3.871681395448266e-05, "loss": 0.1916, "loss_nan_ranks": 0, "loss_rank_avg": 0.19737115502357483, "step": 2715 }, { "epoch": 1.4255765199161425, "grad_norm": 0.762726366519928, "learning_rate": 3.8707586906183294e-05, "loss": 0.2084, "loss_nan_ranks": 0, "loss_rank_avg": 0.2282119244337082, "step": 2720 }, { "epoch": 1.4281970649895177, "grad_norm": 0.6033948063850403, "learning_rate": 3.869832790952695e-05, "loss": 0.1972, "loss_nan_ranks": 0, "loss_rank_avg": 0.20079247653484344, "step": 2725 }, { "epoch": 1.430817610062893, "grad_norm": 0.8036638498306274, "learning_rate": 3.868903698032593e-05, "loss": 0.2063, "loss_nan_ranks": 0, "loss_rank_avg": 0.216064453125, "step": 2730 }, { "epoch": 1.4334381551362683, "grad_norm": 0.6354225277900696, "learning_rate": 3.867971413444704e-05, "loss": 0.198, "loss_nan_ranks": 0, "loss_rank_avg": 0.2076335847377777, "step": 2735 }, { "epoch": 1.4360587002096437, "grad_norm": 0.6323251724243164, "learning_rate": 3.867035938781161e-05, "loss": 0.2135, "loss_nan_ranks": 0, "loss_rank_avg": 0.19645678997039795, "step": 2740 }, { "epoch": 1.4386792452830188, "grad_norm": 0.6425936222076416, "learning_rate": 3.866097275639545e-05, "loss": 0.202, "loss_nan_ranks": 0, "loss_rank_avg": 0.19408687949180603, "step": 2745 }, { "epoch": 1.441299790356394, "grad_norm": 1.0641465187072754, "learning_rate": 3.865155425622882e-05, "loss": 0.1956, "loss_nan_ranks": 0, "loss_rank_avg": 0.16387057304382324, "step": 2750 }, { "epoch": 1.4439203354297694, "grad_norm": 0.6685094833374023, "learning_rate": 3.86421039033964e-05, "loss": 0.2016, "loss_nan_ranks": 0, "loss_rank_avg": 0.15119390189647675, "step": 2755 }, { "epoch": 1.4465408805031448, "grad_norm": 0.7995813488960266, "learning_rate": 3.8632621714037266e-05, "loss": 0.1831, "loss_nan_ranks": 0, "loss_rank_avg": 0.23123614490032196, "step": 2760 }, { "epoch": 1.44916142557652, "grad_norm": 0.6669260263442993, "learning_rate": 3.862310770434487e-05, "loss": 0.2173, "loss_nan_ranks": 0, "loss_rank_avg": 0.2532452344894409, "step": 2765 }, { "epoch": 1.4517819706498951, "grad_norm": 0.7086100578308105, "learning_rate": 3.861356189056701e-05, "loss": 0.2179, "loss_nan_ranks": 0, "loss_rank_avg": 0.2617954611778259, "step": 2770 }, { "epoch": 1.4544025157232705, "grad_norm": 0.638028085231781, "learning_rate": 3.8603984289005786e-05, "loss": 0.2173, "loss_nan_ranks": 0, "loss_rank_avg": 0.26442041993141174, "step": 2775 }, { "epoch": 1.4570230607966457, "grad_norm": 0.5652633309364319, "learning_rate": 3.85943749160176e-05, "loss": 0.1881, "loss_nan_ranks": 0, "loss_rank_avg": 0.19999033212661743, "step": 2780 }, { "epoch": 1.459643605870021, "grad_norm": 0.6860530972480774, "learning_rate": 3.858473378801309e-05, "loss": 0.1957, "loss_nan_ranks": 0, "loss_rank_avg": 0.20773279666900635, "step": 2785 }, { "epoch": 1.4622641509433962, "grad_norm": 0.677933394908905, "learning_rate": 3.857506092145714e-05, "loss": 0.1997, "loss_nan_ranks": 0, "loss_rank_avg": 0.21120098233222961, "step": 2790 }, { "epoch": 1.4648846960167714, "grad_norm": 0.7018347978591919, "learning_rate": 3.856535633286884e-05, "loss": 0.1804, "loss_nan_ranks": 0, "loss_rank_avg": 0.1769612580537796, "step": 2795 }, { "epoch": 1.4675052410901468, "grad_norm": 0.6802659630775452, "learning_rate": 3.855562003882144e-05, "loss": 0.2116, "loss_nan_ranks": 0, "loss_rank_avg": 0.22609364986419678, "step": 2800 }, { "epoch": 1.470125786163522, "grad_norm": 0.6637560129165649, "learning_rate": 3.854585205594235e-05, "loss": 0.2103, "loss_nan_ranks": 0, "loss_rank_avg": 0.173095703125, "step": 2805 }, { "epoch": 1.4727463312368974, "grad_norm": 0.560377836227417, "learning_rate": 3.853605240091309e-05, "loss": 0.2141, "loss_nan_ranks": 0, "loss_rank_avg": 0.2323237955570221, "step": 2810 }, { "epoch": 1.4753668763102725, "grad_norm": 0.7306016087532043, "learning_rate": 3.8526221090469266e-05, "loss": 0.2135, "loss_nan_ranks": 0, "loss_rank_avg": 0.1875, "step": 2815 }, { "epoch": 1.4779874213836477, "grad_norm": 0.7161574959754944, "learning_rate": 3.851635814140055e-05, "loss": 0.2107, "loss_nan_ranks": 0, "loss_rank_avg": 0.20000721514225006, "step": 2820 }, { "epoch": 1.480607966457023, "grad_norm": 0.617828369140625, "learning_rate": 3.850646357055065e-05, "loss": 0.1961, "loss_nan_ranks": 0, "loss_rank_avg": 0.16674214601516724, "step": 2825 }, { "epoch": 1.4832285115303983, "grad_norm": 0.5676167607307434, "learning_rate": 3.8496537394817264e-05, "loss": 0.2158, "loss_nan_ranks": 0, "loss_rank_avg": 0.20249316096305847, "step": 2830 }, { "epoch": 1.4858490566037736, "grad_norm": 1.0004444122314453, "learning_rate": 3.8486579631152067e-05, "loss": 0.2021, "loss_nan_ranks": 0, "loss_rank_avg": 0.19540755450725555, "step": 2835 }, { "epoch": 1.4884696016771488, "grad_norm": 0.8708783388137817, "learning_rate": 3.84765902965607e-05, "loss": 0.1794, "loss_nan_ranks": 0, "loss_rank_avg": 0.1630859375, "step": 2840 }, { "epoch": 1.491090146750524, "grad_norm": 1.0635863542556763, "learning_rate": 3.846656940810269e-05, "loss": 0.2061, "loss_nan_ranks": 0, "loss_rank_avg": 0.1687188297510147, "step": 2845 }, { "epoch": 1.4937106918238994, "grad_norm": 0.7047156095504761, "learning_rate": 3.845651698289145e-05, "loss": 0.1936, "loss_nan_ranks": 0, "loss_rank_avg": 0.171891450881958, "step": 2850 }, { "epoch": 1.4963312368972748, "grad_norm": 1.9242242574691772, "learning_rate": 3.844643303809429e-05, "loss": 0.2316, "loss_nan_ranks": 0, "loss_rank_avg": 0.19759413599967957, "step": 2855 }, { "epoch": 1.49895178197065, "grad_norm": 0.6161275506019592, "learning_rate": 3.8436317590932315e-05, "loss": 0.2087, "loss_nan_ranks": 0, "loss_rank_avg": 0.213210791349411, "step": 2860 }, { "epoch": 1.501572327044025, "grad_norm": 0.6815282702445984, "learning_rate": 3.842617065868043e-05, "loss": 0.2058, "loss_nan_ranks": 0, "loss_rank_avg": 0.18007223308086395, "step": 2865 }, { "epoch": 1.5041928721174003, "grad_norm": 0.7069063782691956, "learning_rate": 3.841599225866733e-05, "loss": 0.2074, "loss_nan_ranks": 0, "loss_rank_avg": 0.15446972846984863, "step": 2870 }, { "epoch": 1.5068134171907757, "grad_norm": 0.6494333744049072, "learning_rate": 3.8405782408275425e-05, "loss": 0.1953, "loss_nan_ranks": 0, "loss_rank_avg": 0.21661195158958435, "step": 2875 }, { "epoch": 1.509433962264151, "grad_norm": 0.6672247648239136, "learning_rate": 3.8395541124940843e-05, "loss": 0.1981, "loss_nan_ranks": 0, "loss_rank_avg": 0.21872758865356445, "step": 2880 }, { "epoch": 1.5120545073375262, "grad_norm": 0.6205090880393982, "learning_rate": 3.8385268426153415e-05, "loss": 0.2226, "loss_nan_ranks": 0, "loss_rank_avg": 0.24721340835094452, "step": 2885 }, { "epoch": 1.5146750524109014, "grad_norm": 0.8091459274291992, "learning_rate": 3.8374964329456574e-05, "loss": 0.1934, "loss_nan_ranks": 0, "loss_rank_avg": 0.21637308597564697, "step": 2890 }, { "epoch": 1.5172955974842768, "grad_norm": 0.7390249967575073, "learning_rate": 3.8364628852447424e-05, "loss": 0.2159, "loss_nan_ranks": 0, "loss_rank_avg": 0.16475994884967804, "step": 2895 }, { "epoch": 1.519916142557652, "grad_norm": 0.7167327404022217, "learning_rate": 3.835426201277664e-05, "loss": 0.2099, "loss_nan_ranks": 0, "loss_rank_avg": 0.21945439279079437, "step": 2900 }, { "epoch": 1.5225366876310273, "grad_norm": 0.6998141407966614, "learning_rate": 3.834386382814845e-05, "loss": 0.1926, "loss_nan_ranks": 0, "loss_rank_avg": 0.205071821808815, "step": 2905 }, { "epoch": 1.5251572327044025, "grad_norm": 0.7944632768630981, "learning_rate": 3.833343431632062e-05, "loss": 0.2031, "loss_nan_ranks": 0, "loss_rank_avg": 0.18502473831176758, "step": 2910 }, { "epoch": 1.5277777777777777, "grad_norm": 0.795866072177887, "learning_rate": 3.83229734951044e-05, "loss": 0.2106, "loss_nan_ranks": 0, "loss_rank_avg": 0.177978515625, "step": 2915 }, { "epoch": 1.530398322851153, "grad_norm": 0.6461061835289001, "learning_rate": 3.831248138236455e-05, "loss": 0.2041, "loss_nan_ranks": 0, "loss_rank_avg": 0.21445702016353607, "step": 2920 }, { "epoch": 1.5330188679245285, "grad_norm": 0.6679595708847046, "learning_rate": 3.830195799601922e-05, "loss": 0.2058, "loss_nan_ranks": 0, "loss_rank_avg": 0.19536687433719635, "step": 2925 }, { "epoch": 1.5356394129979036, "grad_norm": 0.7739956974983215, "learning_rate": 3.829140335404e-05, "loss": 0.2032, "loss_nan_ranks": 0, "loss_rank_avg": 0.20849609375, "step": 2930 }, { "epoch": 1.5382599580712788, "grad_norm": 0.6194927096366882, "learning_rate": 3.8280817474451845e-05, "loss": 0.2061, "loss_nan_ranks": 0, "loss_rank_avg": 0.20353049039840698, "step": 2935 }, { "epoch": 1.540880503144654, "grad_norm": 0.8548099398612976, "learning_rate": 3.827020037533306e-05, "loss": 0.2187, "loss_nan_ranks": 0, "loss_rank_avg": 0.2137005627155304, "step": 2940 }, { "epoch": 1.5435010482180294, "grad_norm": 0.6645764112472534, "learning_rate": 3.825955207481527e-05, "loss": 0.2019, "loss_nan_ranks": 0, "loss_rank_avg": 0.2067786604166031, "step": 2945 }, { "epoch": 1.5461215932914047, "grad_norm": 0.7723944187164307, "learning_rate": 3.824887259108337e-05, "loss": 0.1972, "loss_nan_ranks": 0, "loss_rank_avg": 0.2169840931892395, "step": 2950 }, { "epoch": 1.54874213836478, "grad_norm": 0.6538282632827759, "learning_rate": 3.8238161942375534e-05, "loss": 0.208, "loss_nan_ranks": 0, "loss_rank_avg": 0.19555526971817017, "step": 2955 }, { "epoch": 1.551362683438155, "grad_norm": 0.682479977607727, "learning_rate": 3.8227420146983134e-05, "loss": 0.2061, "loss_nan_ranks": 0, "loss_rank_avg": 0.197683647274971, "step": 2960 }, { "epoch": 1.5539832285115303, "grad_norm": 0.8361935019493103, "learning_rate": 3.821664722325075e-05, "loss": 0.1957, "loss_nan_ranks": 0, "loss_rank_avg": 0.2167164385318756, "step": 2965 }, { "epoch": 1.5566037735849056, "grad_norm": 0.7050533294677734, "learning_rate": 3.820584318957611e-05, "loss": 0.1982, "loss_nan_ranks": 0, "loss_rank_avg": 0.22836749255657196, "step": 2970 }, { "epoch": 1.559224318658281, "grad_norm": 0.6500424146652222, "learning_rate": 3.819500806441009e-05, "loss": 0.212, "loss_nan_ranks": 0, "loss_rank_avg": 0.2061050683259964, "step": 2975 }, { "epoch": 1.5618448637316562, "grad_norm": 0.6116467714309692, "learning_rate": 3.8184141866256636e-05, "loss": 0.1931, "loss_nan_ranks": 0, "loss_rank_avg": 0.19401168823242188, "step": 2980 }, { "epoch": 1.5644654088050314, "grad_norm": 0.6403232216835022, "learning_rate": 3.8173244613672785e-05, "loss": 0.2197, "loss_nan_ranks": 0, "loss_rank_avg": 0.22676880657672882, "step": 2985 }, { "epoch": 1.5670859538784065, "grad_norm": 0.6228274703025818, "learning_rate": 3.816231632526858e-05, "loss": 0.1936, "loss_nan_ranks": 0, "loss_rank_avg": 0.19302821159362793, "step": 2990 }, { "epoch": 1.569706498951782, "grad_norm": 0.888628363609314, "learning_rate": 3.815135701970711e-05, "loss": 0.2239, "loss_nan_ranks": 0, "loss_rank_avg": 0.2211262583732605, "step": 2995 }, { "epoch": 1.5723270440251573, "grad_norm": 0.7154015302658081, "learning_rate": 3.814036671570438e-05, "loss": 0.2115, "loss_nan_ranks": 0, "loss_rank_avg": 0.20973986387252808, "step": 3000 }, { "epoch": 1.5749475890985325, "grad_norm": 0.6683697700500488, "learning_rate": 3.8129345432029376e-05, "loss": 0.206, "loss_nan_ranks": 0, "loss_rank_avg": 0.24227727949619293, "step": 3005 }, { "epoch": 1.5775681341719077, "grad_norm": 0.817297101020813, "learning_rate": 3.8118293187503975e-05, "loss": 0.2006, "loss_nan_ranks": 0, "loss_rank_avg": 0.22506284713745117, "step": 3010 }, { "epoch": 1.580188679245283, "grad_norm": 1.3425382375717163, "learning_rate": 3.810721000100293e-05, "loss": 0.2164, "loss_nan_ranks": 0, "loss_rank_avg": 0.23591598868370056, "step": 3015 }, { "epoch": 1.5828092243186582, "grad_norm": 0.6170550584793091, "learning_rate": 3.8096095891453824e-05, "loss": 0.1934, "loss_nan_ranks": 0, "loss_rank_avg": 0.19402346014976501, "step": 3020 }, { "epoch": 1.5854297693920336, "grad_norm": 0.6385833621025085, "learning_rate": 3.808495087783707e-05, "loss": 0.1951, "loss_nan_ranks": 0, "loss_rank_avg": 0.19327062368392944, "step": 3025 }, { "epoch": 1.5880503144654088, "grad_norm": 0.5318775177001953, "learning_rate": 3.8073774979185845e-05, "loss": 0.1849, "loss_nan_ranks": 0, "loss_rank_avg": 0.19630634784698486, "step": 3030 }, { "epoch": 1.590670859538784, "grad_norm": 0.7159228324890137, "learning_rate": 3.8062568214586076e-05, "loss": 0.1948, "loss_nan_ranks": 0, "loss_rank_avg": 0.23486372828483582, "step": 3035 }, { "epoch": 1.5932914046121593, "grad_norm": 0.5865247845649719, "learning_rate": 3.80513306031764e-05, "loss": 0.2026, "loss_nan_ranks": 0, "loss_rank_avg": 0.19960322976112366, "step": 3040 }, { "epoch": 1.5959119496855347, "grad_norm": 0.7836723327636719, "learning_rate": 3.804006216414812e-05, "loss": 0.1981, "loss_nan_ranks": 0, "loss_rank_avg": 0.2001953125, "step": 3045 }, { "epoch": 1.59853249475891, "grad_norm": 0.6605933904647827, "learning_rate": 3.8028762916745216e-05, "loss": 0.2052, "loss_nan_ranks": 0, "loss_rank_avg": 0.22346261143684387, "step": 3050 }, { "epoch": 1.601153039832285, "grad_norm": 0.6677698493003845, "learning_rate": 3.801743288026426e-05, "loss": 0.1956, "loss_nan_ranks": 0, "loss_rank_avg": 0.19307313859462738, "step": 3055 }, { "epoch": 1.6037735849056602, "grad_norm": 0.7019439339637756, "learning_rate": 3.8006072074054415e-05, "loss": 0.2077, "loss_nan_ranks": 0, "loss_rank_avg": 0.2159917950630188, "step": 3060 }, { "epoch": 1.6063941299790356, "grad_norm": 0.6048671007156372, "learning_rate": 3.7994680517517374e-05, "loss": 0.2109, "loss_nan_ranks": 0, "loss_rank_avg": 0.21212241053581238, "step": 3065 }, { "epoch": 1.609014675052411, "grad_norm": 0.5906047821044922, "learning_rate": 3.798325823010737e-05, "loss": 0.2181, "loss_nan_ranks": 0, "loss_rank_avg": 0.18436996638774872, "step": 3070 }, { "epoch": 1.6116352201257862, "grad_norm": 1.061728596687317, "learning_rate": 3.7971805231331096e-05, "loss": 0.1971, "loss_nan_ranks": 0, "loss_rank_avg": 0.17529296875, "step": 3075 }, { "epoch": 1.6142557651991614, "grad_norm": 0.6743407845497131, "learning_rate": 3.79603215407477e-05, "loss": 0.2033, "loss_nan_ranks": 0, "loss_rank_avg": 0.19340994954109192, "step": 3080 }, { "epoch": 1.6168763102725365, "grad_norm": 0.8559533953666687, "learning_rate": 3.7948807177968755e-05, "loss": 0.2104, "loss_nan_ranks": 0, "loss_rank_avg": 0.22995486855506897, "step": 3085 }, { "epoch": 1.619496855345912, "grad_norm": 0.66473788022995, "learning_rate": 3.79372621626582e-05, "loss": 0.2091, "loss_nan_ranks": 0, "loss_rank_avg": 0.20205003023147583, "step": 3090 }, { "epoch": 1.6221174004192873, "grad_norm": 0.6741588115692139, "learning_rate": 3.792568651453233e-05, "loss": 0.1963, "loss_nan_ranks": 0, "loss_rank_avg": 0.2298995554447174, "step": 3095 }, { "epoch": 1.6247379454926625, "grad_norm": 0.7121372818946838, "learning_rate": 3.7914080253359754e-05, "loss": 0.2077, "loss_nan_ranks": 0, "loss_rank_avg": 0.23662368953227997, "step": 3100 }, { "epoch": 1.6273584905660377, "grad_norm": 0.7736591100692749, "learning_rate": 3.790244339896136e-05, "loss": 0.2117, "loss_nan_ranks": 0, "loss_rank_avg": 0.19614163041114807, "step": 3105 }, { "epoch": 1.629979035639413, "grad_norm": 0.7388415932655334, "learning_rate": 3.7890775971210286e-05, "loss": 0.1973, "loss_nan_ranks": 0, "loss_rank_avg": 0.19440676271915436, "step": 3110 }, { "epoch": 1.6325995807127882, "grad_norm": 0.7736803889274597, "learning_rate": 3.787907799003186e-05, "loss": 0.1976, "loss_nan_ranks": 0, "loss_rank_avg": 0.18566857278347015, "step": 3115 }, { "epoch": 1.6352201257861636, "grad_norm": 0.6769443154335022, "learning_rate": 3.786734947540363e-05, "loss": 0.1983, "loss_nan_ranks": 0, "loss_rank_avg": 0.19205525517463684, "step": 3120 }, { "epoch": 1.6378406708595388, "grad_norm": 0.6849276423454285, "learning_rate": 3.7855590447355243e-05, "loss": 0.1969, "loss_nan_ranks": 0, "loss_rank_avg": 0.20867285132408142, "step": 3125 }, { "epoch": 1.640461215932914, "grad_norm": 0.7193086743354797, "learning_rate": 3.7843800925968495e-05, "loss": 0.2015, "loss_nan_ranks": 0, "loss_rank_avg": 0.20997187495231628, "step": 3130 }, { "epoch": 1.6430817610062893, "grad_norm": 0.642825722694397, "learning_rate": 3.7831980931377234e-05, "loss": 0.206, "loss_nan_ranks": 0, "loss_rank_avg": 0.17490190267562866, "step": 3135 }, { "epoch": 1.6457023060796647, "grad_norm": 1.0223510265350342, "learning_rate": 3.782013048376736e-05, "loss": 0.1954, "loss_nan_ranks": 0, "loss_rank_avg": 0.1529541015625, "step": 3140 }, { "epoch": 1.64832285115304, "grad_norm": 0.7299026250839233, "learning_rate": 3.7808249603376773e-05, "loss": 0.2199, "loss_nan_ranks": 0, "loss_rank_avg": 0.23865553736686707, "step": 3145 }, { "epoch": 1.650943396226415, "grad_norm": 0.6869032382965088, "learning_rate": 3.779633831049535e-05, "loss": 0.1996, "loss_nan_ranks": 0, "loss_rank_avg": 0.18986861407756805, "step": 3150 }, { "epoch": 1.6535639412997902, "grad_norm": 0.6794070601463318, "learning_rate": 3.7784396625464896e-05, "loss": 0.1862, "loss_nan_ranks": 0, "loss_rank_avg": 0.18808311223983765, "step": 3155 }, { "epoch": 1.6561844863731656, "grad_norm": 0.6022070646286011, "learning_rate": 3.777242456867914e-05, "loss": 0.1893, "loss_nan_ranks": 0, "loss_rank_avg": 0.221364825963974, "step": 3160 }, { "epoch": 1.658805031446541, "grad_norm": 0.7784854173660278, "learning_rate": 3.776042216058365e-05, "loss": 0.2058, "loss_nan_ranks": 0, "loss_rank_avg": 0.222900390625, "step": 3165 }, { "epoch": 1.6614255765199162, "grad_norm": 0.6530669331550598, "learning_rate": 3.774838942167587e-05, "loss": 0.2133, "loss_nan_ranks": 0, "loss_rank_avg": 0.2069305181503296, "step": 3170 }, { "epoch": 1.6640461215932913, "grad_norm": 0.654613733291626, "learning_rate": 3.773632637250498e-05, "loss": 0.196, "loss_nan_ranks": 0, "loss_rank_avg": 0.24302370846271515, "step": 3175 }, { "epoch": 1.6666666666666665, "grad_norm": 0.9669556617736816, "learning_rate": 3.772423303367199e-05, "loss": 0.199, "loss_nan_ranks": 0, "loss_rank_avg": 0.20167219638824463, "step": 3180 }, { "epoch": 1.669287211740042, "grad_norm": 0.7367469668388367, "learning_rate": 3.77121094258296e-05, "loss": 0.2186, "loss_nan_ranks": 0, "loss_rank_avg": 0.18835796415805817, "step": 3185 }, { "epoch": 1.6719077568134173, "grad_norm": 0.7441819310188293, "learning_rate": 3.7699955569682185e-05, "loss": 0.2425, "loss_nan_ranks": 0, "loss_rank_avg": 0.3099658787250519, "step": 3190 }, { "epoch": 1.6745283018867925, "grad_norm": 0.7120652198791504, "learning_rate": 3.7687771485985834e-05, "loss": 0.1976, "loss_nan_ranks": 0, "loss_rank_avg": 0.1901395171880722, "step": 3195 }, { "epoch": 1.6771488469601676, "grad_norm": 0.8141222596168518, "learning_rate": 3.767555719554821e-05, "loss": 0.2126, "loss_nan_ranks": 0, "loss_rank_avg": 0.24998538196086884, "step": 3200 }, { "epoch": 1.679769392033543, "grad_norm": 0.7448544502258301, "learning_rate": 3.766331271922858e-05, "loss": 0.2117, "loss_nan_ranks": 0, "loss_rank_avg": 0.203841432929039, "step": 3205 }, { "epoch": 1.6823899371069182, "grad_norm": 0.5663997530937195, "learning_rate": 3.765103807793776e-05, "loss": 0.173, "loss_nan_ranks": 0, "loss_rank_avg": 0.1611168533563614, "step": 3210 }, { "epoch": 1.6850104821802936, "grad_norm": 0.6563591361045837, "learning_rate": 3.763873329263808e-05, "loss": 0.2195, "loss_nan_ranks": 0, "loss_rank_avg": 0.1935395896434784, "step": 3215 }, { "epoch": 1.6876310272536688, "grad_norm": 0.7698992490768433, "learning_rate": 3.762639838434335e-05, "loss": 0.1918, "loss_nan_ranks": 0, "loss_rank_avg": 0.15444648265838623, "step": 3220 }, { "epoch": 1.690251572327044, "grad_norm": 0.8108476996421814, "learning_rate": 3.7614033374118826e-05, "loss": 0.2056, "loss_nan_ranks": 0, "loss_rank_avg": 0.2104421854019165, "step": 3225 }, { "epoch": 1.6928721174004193, "grad_norm": 0.7241305708885193, "learning_rate": 3.760163828308116e-05, "loss": 0.1915, "loss_nan_ranks": 0, "loss_rank_avg": 0.19673112034797668, "step": 3230 }, { "epoch": 1.6954926624737947, "grad_norm": 0.6991493105888367, "learning_rate": 3.75892131323984e-05, "loss": 0.2169, "loss_nan_ranks": 0, "loss_rank_avg": 0.2223394215106964, "step": 3235 }, { "epoch": 1.6981132075471699, "grad_norm": 0.6248032450675964, "learning_rate": 3.757675794328989e-05, "loss": 0.2, "loss_nan_ranks": 0, "loss_rank_avg": 0.2107730656862259, "step": 3240 }, { "epoch": 1.700733752620545, "grad_norm": 0.6475279331207275, "learning_rate": 3.756427273702632e-05, "loss": 0.2194, "loss_nan_ranks": 0, "loss_rank_avg": 0.21362945437431335, "step": 3245 }, { "epoch": 1.7033542976939202, "grad_norm": 0.7653929591178894, "learning_rate": 3.75517575349296e-05, "loss": 0.2126, "loss_nan_ranks": 0, "loss_rank_avg": 0.2135382890701294, "step": 3250 }, { "epoch": 1.7059748427672956, "grad_norm": 0.6877974271774292, "learning_rate": 3.7539212358372885e-05, "loss": 0.1971, "loss_nan_ranks": 0, "loss_rank_avg": 0.2015942931175232, "step": 3255 }, { "epoch": 1.708595387840671, "grad_norm": 0.5155228972434998, "learning_rate": 3.752663722878053e-05, "loss": 0.1953, "loss_nan_ranks": 0, "loss_rank_avg": 0.1829644739627838, "step": 3260 }, { "epoch": 1.7112159329140462, "grad_norm": 0.6721019148826599, "learning_rate": 3.751403216762803e-05, "loss": 0.2138, "loss_nan_ranks": 0, "loss_rank_avg": 0.21119743585586548, "step": 3265 }, { "epoch": 1.7138364779874213, "grad_norm": 0.6383962035179138, "learning_rate": 3.7501397196441996e-05, "loss": 0.2119, "loss_nan_ranks": 0, "loss_rank_avg": 0.19863849878311157, "step": 3270 }, { "epoch": 1.7164570230607965, "grad_norm": 0.53727126121521, "learning_rate": 3.748873233680012e-05, "loss": 0.2058, "loss_nan_ranks": 0, "loss_rank_avg": 0.18619093298912048, "step": 3275 }, { "epoch": 1.719077568134172, "grad_norm": 0.6815948486328125, "learning_rate": 3.7476037610331135e-05, "loss": 0.2131, "loss_nan_ranks": 0, "loss_rank_avg": 0.20206259191036224, "step": 3280 }, { "epoch": 1.7216981132075473, "grad_norm": 0.7351505160331726, "learning_rate": 3.746331303871479e-05, "loss": 0.1897, "loss_nan_ranks": 0, "loss_rank_avg": 0.21092607080936432, "step": 3285 }, { "epoch": 1.7243186582809225, "grad_norm": 0.7017987966537476, "learning_rate": 3.745055864368179e-05, "loss": 0.2144, "loss_nan_ranks": 0, "loss_rank_avg": 0.23522168397903442, "step": 3290 }, { "epoch": 1.7269392033542976, "grad_norm": 0.7407639622688293, "learning_rate": 3.743777444701378e-05, "loss": 0.1942, "loss_nan_ranks": 0, "loss_rank_avg": 0.2146407663822174, "step": 3295 }, { "epoch": 1.7295597484276728, "grad_norm": 0.8630208373069763, "learning_rate": 3.7424960470543294e-05, "loss": 0.2071, "loss_nan_ranks": 0, "loss_rank_avg": 0.18362826108932495, "step": 3300 }, { "epoch": 1.7321802935010482, "grad_norm": 0.662527859210968, "learning_rate": 3.741211673615374e-05, "loss": 0.2256, "loss_nan_ranks": 0, "loss_rank_avg": 0.21645821630954742, "step": 3305 }, { "epoch": 1.7348008385744236, "grad_norm": 0.7605148553848267, "learning_rate": 3.7399243265779305e-05, "loss": 0.1955, "loss_nan_ranks": 0, "loss_rank_avg": 0.19093088805675507, "step": 3310 }, { "epoch": 1.7374213836477987, "grad_norm": 0.6030600070953369, "learning_rate": 3.7386340081405004e-05, "loss": 0.2298, "loss_nan_ranks": 0, "loss_rank_avg": 0.2234543412923813, "step": 3315 }, { "epoch": 1.740041928721174, "grad_norm": 0.6327378153800964, "learning_rate": 3.737340720506657e-05, "loss": 0.1985, "loss_nan_ranks": 0, "loss_rank_avg": 0.2092643678188324, "step": 3320 }, { "epoch": 1.7426624737945493, "grad_norm": 0.6156671047210693, "learning_rate": 3.736044465885046e-05, "loss": 0.1898, "loss_nan_ranks": 0, "loss_rank_avg": 0.220514714717865, "step": 3325 }, { "epoch": 1.7452830188679245, "grad_norm": 0.643059253692627, "learning_rate": 3.734745246489379e-05, "loss": 0.2048, "loss_nan_ranks": 0, "loss_rank_avg": 0.2512093186378479, "step": 3330 }, { "epoch": 1.7479035639412999, "grad_norm": 0.7039141654968262, "learning_rate": 3.73344306453843e-05, "loss": 0.2092, "loss_nan_ranks": 0, "loss_rank_avg": 0.22273048758506775, "step": 3335 }, { "epoch": 1.750524109014675, "grad_norm": 0.6144287586212158, "learning_rate": 3.732137922256035e-05, "loss": 0.2167, "loss_nan_ranks": 0, "loss_rank_avg": 0.17802879214286804, "step": 3340 }, { "epoch": 1.7531446540880502, "grad_norm": 0.7636461853981018, "learning_rate": 3.7308298218710816e-05, "loss": 0.1947, "loss_nan_ranks": 0, "loss_rank_avg": 0.186279296875, "step": 3345 }, { "epoch": 1.7557651991614256, "grad_norm": 0.6746559739112854, "learning_rate": 3.729518765617513e-05, "loss": 0.2199, "loss_nan_ranks": 0, "loss_rank_avg": 0.20318138599395752, "step": 3350 }, { "epoch": 1.758385744234801, "grad_norm": 0.6666935682296753, "learning_rate": 3.7282047557343195e-05, "loss": 0.2148, "loss_nan_ranks": 0, "loss_rank_avg": 0.1769717037677765, "step": 3355 }, { "epoch": 1.7610062893081762, "grad_norm": 0.5784667134284973, "learning_rate": 3.726887794465533e-05, "loss": 0.1945, "loss_nan_ranks": 0, "loss_rank_avg": 0.18660691380500793, "step": 3360 }, { "epoch": 1.7636268343815513, "grad_norm": 0.7593470811843872, "learning_rate": 3.725567884060229e-05, "loss": 0.2122, "loss_nan_ranks": 0, "loss_rank_avg": 0.2277555763721466, "step": 3365 }, { "epoch": 1.7662473794549265, "grad_norm": 0.6410986185073853, "learning_rate": 3.724245026772518e-05, "loss": 0.2203, "loss_nan_ranks": 0, "loss_rank_avg": 0.19018101692199707, "step": 3370 }, { "epoch": 1.7688679245283019, "grad_norm": 0.7130261659622192, "learning_rate": 3.7229192248615416e-05, "loss": 0.2081, "loss_nan_ranks": 0, "loss_rank_avg": 0.2189968228340149, "step": 3375 }, { "epoch": 1.7714884696016773, "grad_norm": 0.7862639427185059, "learning_rate": 3.721590480591474e-05, "loss": 0.1946, "loss_nan_ranks": 0, "loss_rank_avg": 0.20198023319244385, "step": 3380 }, { "epoch": 1.7741090146750524, "grad_norm": 0.6717849969863892, "learning_rate": 3.72025879623151e-05, "loss": 0.1959, "loss_nan_ranks": 0, "loss_rank_avg": 0.20632538199424744, "step": 3385 }, { "epoch": 1.7767295597484276, "grad_norm": 0.6856803297996521, "learning_rate": 3.718924174055868e-05, "loss": 0.1926, "loss_nan_ranks": 0, "loss_rank_avg": 0.18674078583717346, "step": 3390 }, { "epoch": 1.7793501048218028, "grad_norm": 0.6597937941551208, "learning_rate": 3.717586616343784e-05, "loss": 0.1913, "loss_nan_ranks": 0, "loss_rank_avg": 0.2081529200077057, "step": 3395 }, { "epoch": 1.7819706498951782, "grad_norm": 0.548897922039032, "learning_rate": 3.716246125379504e-05, "loss": 0.2059, "loss_nan_ranks": 0, "loss_rank_avg": 0.18970248103141785, "step": 3400 }, { "epoch": 1.7845911949685536, "grad_norm": 0.6445600986480713, "learning_rate": 3.714902703452288e-05, "loss": 0.2047, "loss_nan_ranks": 0, "loss_rank_avg": 0.21960783004760742, "step": 3405 }, { "epoch": 1.7872117400419287, "grad_norm": 0.6185094714164734, "learning_rate": 3.713556352856398e-05, "loss": 0.2154, "loss_nan_ranks": 0, "loss_rank_avg": 0.2151571810245514, "step": 3410 }, { "epoch": 1.789832285115304, "grad_norm": 0.6007769107818604, "learning_rate": 3.712207075891097e-05, "loss": 0.1987, "loss_nan_ranks": 0, "loss_rank_avg": 0.23578466475009918, "step": 3415 }, { "epoch": 1.7924528301886793, "grad_norm": 0.7605302333831787, "learning_rate": 3.7108548748606496e-05, "loss": 0.1991, "loss_nan_ranks": 0, "loss_rank_avg": 0.1687735766172409, "step": 3420 }, { "epoch": 1.7950733752620545, "grad_norm": 0.5006226897239685, "learning_rate": 3.70949975207431e-05, "loss": 0.1915, "loss_nan_ranks": 0, "loss_rank_avg": 0.208736851811409, "step": 3425 }, { "epoch": 1.7976939203354299, "grad_norm": 0.5228962302207947, "learning_rate": 3.708141709846323e-05, "loss": 0.2108, "loss_nan_ranks": 0, "loss_rank_avg": 0.17655402421951294, "step": 3430 }, { "epoch": 1.800314465408805, "grad_norm": 0.6512285470962524, "learning_rate": 3.70678075049592e-05, "loss": 0.1878, "loss_nan_ranks": 0, "loss_rank_avg": 0.21836760640144348, "step": 3435 }, { "epoch": 1.8029350104821802, "grad_norm": 1.5839763879776, "learning_rate": 3.7054168763473155e-05, "loss": 0.1929, "loss_nan_ranks": 0, "loss_rank_avg": 0.177001953125, "step": 3440 }, { "epoch": 1.8055555555555556, "grad_norm": 0.6855888962745667, "learning_rate": 3.704050089729699e-05, "loss": 0.1745, "loss_nan_ranks": 0, "loss_rank_avg": 0.19496369361877441, "step": 3445 }, { "epoch": 1.808176100628931, "grad_norm": 0.6062113046646118, "learning_rate": 3.702680392977235e-05, "loss": 0.198, "loss_nan_ranks": 0, "loss_rank_avg": 0.16827154159545898, "step": 3450 }, { "epoch": 1.8107966457023061, "grad_norm": 1.070608377456665, "learning_rate": 3.7013077884290576e-05, "loss": 0.2063, "loss_nan_ranks": 0, "loss_rank_avg": 0.20059406757354736, "step": 3455 }, { "epoch": 1.8134171907756813, "grad_norm": 0.5718883872032166, "learning_rate": 3.699932278429268e-05, "loss": 0.2137, "loss_nan_ranks": 0, "loss_rank_avg": 0.22236788272857666, "step": 3460 }, { "epoch": 1.8160377358490565, "grad_norm": 0.6839277148246765, "learning_rate": 3.698553865326928e-05, "loss": 0.2077, "loss_nan_ranks": 0, "loss_rank_avg": 0.221435546875, "step": 3465 }, { "epoch": 1.8186582809224319, "grad_norm": 0.7797523736953735, "learning_rate": 3.6971725514760576e-05, "loss": 0.1978, "loss_nan_ranks": 0, "loss_rank_avg": 0.20486325025558472, "step": 3470 }, { "epoch": 1.8212788259958073, "grad_norm": 0.7781976461410522, "learning_rate": 3.69578833923563e-05, "loss": 0.2032, "loss_nan_ranks": 0, "loss_rank_avg": 0.18268337845802307, "step": 3475 }, { "epoch": 1.8238993710691824, "grad_norm": 0.5963297486305237, "learning_rate": 3.6944012309695707e-05, "loss": 0.1952, "loss_nan_ranks": 0, "loss_rank_avg": 0.17451441287994385, "step": 3480 }, { "epoch": 1.8265199161425576, "grad_norm": 0.7559292316436768, "learning_rate": 3.693011229046747e-05, "loss": 0.2074, "loss_nan_ranks": 0, "loss_rank_avg": 0.20954449474811554, "step": 3485 }, { "epoch": 1.8291404612159328, "grad_norm": 0.5257536172866821, "learning_rate": 3.691618335840972e-05, "loss": 0.2107, "loss_nan_ranks": 0, "loss_rank_avg": 0.21420666575431824, "step": 3490 }, { "epoch": 1.8317610062893082, "grad_norm": 0.7904267311096191, "learning_rate": 3.690222553730992e-05, "loss": 0.2208, "loss_nan_ranks": 0, "loss_rank_avg": 0.23457945883274078, "step": 3495 }, { "epoch": 1.8343815513626835, "grad_norm": 0.6557732820510864, "learning_rate": 3.688823885100491e-05, "loss": 0.1912, "loss_nan_ranks": 0, "loss_rank_avg": 0.20230236649513245, "step": 3500 }, { "epoch": 1.8370020964360587, "grad_norm": 0.7868777513504028, "learning_rate": 3.6874223323380804e-05, "loss": 0.1962, "loss_nan_ranks": 0, "loss_rank_avg": 0.212323397397995, "step": 3505 }, { "epoch": 1.8396226415094339, "grad_norm": 0.6201764941215515, "learning_rate": 3.686017897837298e-05, "loss": 0.1904, "loss_nan_ranks": 0, "loss_rank_avg": 0.19490529596805573, "step": 3510 }, { "epoch": 1.8422431865828093, "grad_norm": 0.8452920317649841, "learning_rate": 3.684610583996602e-05, "loss": 0.1993, "loss_nan_ranks": 0, "loss_rank_avg": 0.1875, "step": 3515 }, { "epoch": 1.8448637316561844, "grad_norm": 0.7257571816444397, "learning_rate": 3.683200393219369e-05, "loss": 0.1929, "loss_nan_ranks": 0, "loss_rank_avg": 0.19361209869384766, "step": 3520 }, { "epoch": 1.8474842767295598, "grad_norm": 0.5986731052398682, "learning_rate": 3.681787327913888e-05, "loss": 0.1942, "loss_nan_ranks": 0, "loss_rank_avg": 0.197706937789917, "step": 3525 }, { "epoch": 1.850104821802935, "grad_norm": 0.8025481104850769, "learning_rate": 3.680371390493356e-05, "loss": 0.1956, "loss_nan_ranks": 0, "loss_rank_avg": 0.21123269200325012, "step": 3530 }, { "epoch": 1.8527253668763102, "grad_norm": 0.7123976349830627, "learning_rate": 3.678952583375878e-05, "loss": 0.1891, "loss_nan_ranks": 0, "loss_rank_avg": 0.166748046875, "step": 3535 }, { "epoch": 1.8553459119496856, "grad_norm": 0.6865976452827454, "learning_rate": 3.6775309089844566e-05, "loss": 0.1934, "loss_nan_ranks": 0, "loss_rank_avg": 0.1982835829257965, "step": 3540 }, { "epoch": 1.857966457023061, "grad_norm": 0.6656506061553955, "learning_rate": 3.676106369746993e-05, "loss": 0.1954, "loss_nan_ranks": 0, "loss_rank_avg": 0.2171439528465271, "step": 3545 }, { "epoch": 1.8605870020964361, "grad_norm": 0.6462177634239197, "learning_rate": 3.67467896809628e-05, "loss": 0.1928, "loss_nan_ranks": 0, "loss_rank_avg": 0.18315325677394867, "step": 3550 }, { "epoch": 1.8632075471698113, "grad_norm": 0.6657865643501282, "learning_rate": 3.673248706469999e-05, "loss": 0.2062, "loss_nan_ranks": 0, "loss_rank_avg": 0.207036554813385, "step": 3555 }, { "epoch": 1.8658280922431865, "grad_norm": 0.7065151929855347, "learning_rate": 3.6718155873107156e-05, "loss": 0.1859, "loss_nan_ranks": 0, "loss_rank_avg": 0.18963903188705444, "step": 3560 }, { "epoch": 1.8684486373165619, "grad_norm": 1.0115692615509033, "learning_rate": 3.670379613065875e-05, "loss": 0.1891, "loss_nan_ranks": 0, "loss_rank_avg": 0.174276202917099, "step": 3565 }, { "epoch": 1.8710691823899372, "grad_norm": 0.6386069655418396, "learning_rate": 3.668940786187801e-05, "loss": 0.2028, "loss_nan_ranks": 0, "loss_rank_avg": 0.24545133113861084, "step": 3570 }, { "epoch": 1.8736897274633124, "grad_norm": 0.6867290139198303, "learning_rate": 3.667499109133683e-05, "loss": 0.2071, "loss_nan_ranks": 0, "loss_rank_avg": 0.2165694534778595, "step": 3575 }, { "epoch": 1.8763102725366876, "grad_norm": 2.2979297637939453, "learning_rate": 3.6660545843655856e-05, "loss": 0.2067, "loss_nan_ranks": 0, "loss_rank_avg": 0.18144424259662628, "step": 3580 }, { "epoch": 1.8789308176100628, "grad_norm": 0.6569699645042419, "learning_rate": 3.664607214350429e-05, "loss": 0.2059, "loss_nan_ranks": 0, "loss_rank_avg": 0.21263781189918518, "step": 3585 }, { "epoch": 1.8815513626834381, "grad_norm": 0.6411572098731995, "learning_rate": 3.66315700156e-05, "loss": 0.1996, "loss_nan_ranks": 0, "loss_rank_avg": 0.18590950965881348, "step": 3590 }, { "epoch": 1.8841719077568135, "grad_norm": 0.6736159324645996, "learning_rate": 3.6617039484709324e-05, "loss": 0.2142, "loss_nan_ranks": 0, "loss_rank_avg": 0.20901615917682648, "step": 3595 }, { "epoch": 1.8867924528301887, "grad_norm": 0.7786797285079956, "learning_rate": 3.660248057564717e-05, "loss": 0.2083, "loss_nan_ranks": 0, "loss_rank_avg": 0.21553653478622437, "step": 3600 }, { "epoch": 1.8894129979035639, "grad_norm": 0.6000698208808899, "learning_rate": 3.658789331327688e-05, "loss": 0.1961, "loss_nan_ranks": 0, "loss_rank_avg": 0.18407447636127472, "step": 3605 }, { "epoch": 1.892033542976939, "grad_norm": 0.5410313010215759, "learning_rate": 3.657327772251022e-05, "loss": 0.2014, "loss_nan_ranks": 0, "loss_rank_avg": 0.19665025174617767, "step": 3610 }, { "epoch": 1.8946540880503144, "grad_norm": 0.6245362758636475, "learning_rate": 3.6558633828307335e-05, "loss": 0.2012, "loss_nan_ranks": 0, "loss_rank_avg": 0.21915487945079803, "step": 3615 }, { "epoch": 1.8972746331236898, "grad_norm": 0.6884681582450867, "learning_rate": 3.654396165567671e-05, "loss": 0.1811, "loss_nan_ranks": 0, "loss_rank_avg": 0.22231367230415344, "step": 3620 }, { "epoch": 1.899895178197065, "grad_norm": 0.8241666555404663, "learning_rate": 3.6529261229675134e-05, "loss": 0.2135, "loss_nan_ranks": 0, "loss_rank_avg": 0.179931640625, "step": 3625 }, { "epoch": 1.9025157232704402, "grad_norm": 0.6740656495094299, "learning_rate": 3.6514532575407606e-05, "loss": 0.1868, "loss_nan_ranks": 0, "loss_rank_avg": 0.19351506233215332, "step": 3630 }, { "epoch": 1.9051362683438156, "grad_norm": 1.3846477270126343, "learning_rate": 3.6499775718027374e-05, "loss": 0.1935, "loss_nan_ranks": 0, "loss_rank_avg": 0.21881192922592163, "step": 3635 }, { "epoch": 1.9077568134171907, "grad_norm": 0.6799472570419312, "learning_rate": 3.648499068273584e-05, "loss": 0.1874, "loss_nan_ranks": 0, "loss_rank_avg": 0.1958981156349182, "step": 3640 }, { "epoch": 1.9103773584905661, "grad_norm": 0.7112404108047485, "learning_rate": 3.6470177494782525e-05, "loss": 0.2124, "loss_nan_ranks": 0, "loss_rank_avg": 0.2296728789806366, "step": 3645 }, { "epoch": 1.9129979035639413, "grad_norm": 0.6902780532836914, "learning_rate": 3.6455336179465006e-05, "loss": 0.218, "loss_nan_ranks": 0, "loss_rank_avg": 0.2503230571746826, "step": 3650 }, { "epoch": 1.9156184486373165, "grad_norm": 0.6022567749023438, "learning_rate": 3.6440466762128945e-05, "loss": 0.1969, "loss_nan_ranks": 0, "loss_rank_avg": 0.17072564363479614, "step": 3655 }, { "epoch": 1.9182389937106918, "grad_norm": 0.618741512298584, "learning_rate": 3.642556926816795e-05, "loss": 0.2026, "loss_nan_ranks": 0, "loss_rank_avg": 0.21050477027893066, "step": 3660 }, { "epoch": 1.9208595387840672, "grad_norm": 0.7455676794052124, "learning_rate": 3.64106437230236e-05, "loss": 0.1904, "loss_nan_ranks": 0, "loss_rank_avg": 0.1719132959842682, "step": 3665 }, { "epoch": 1.9234800838574424, "grad_norm": 0.6303054690361023, "learning_rate": 3.639569015218537e-05, "loss": 0.1935, "loss_nan_ranks": 0, "loss_rank_avg": 0.17934748530387878, "step": 3670 }, { "epoch": 1.9261006289308176, "grad_norm": 0.6130335330963135, "learning_rate": 3.638070858119061e-05, "loss": 0.2027, "loss_nan_ranks": 0, "loss_rank_avg": 0.154541015625, "step": 3675 }, { "epoch": 1.9287211740041927, "grad_norm": 0.6253183484077454, "learning_rate": 3.6365699035624465e-05, "loss": 0.2027, "loss_nan_ranks": 0, "loss_rank_avg": 0.19237437844276428, "step": 3680 }, { "epoch": 1.9313417190775681, "grad_norm": 0.7445136308670044, "learning_rate": 3.635066154111989e-05, "loss": 0.2132, "loss_nan_ranks": 0, "loss_rank_avg": 0.2229272425174713, "step": 3685 }, { "epoch": 1.9339622641509435, "grad_norm": 0.6628245115280151, "learning_rate": 3.6335596123357515e-05, "loss": 0.1794, "loss_nan_ranks": 0, "loss_rank_avg": 0.18328478932380676, "step": 3690 }, { "epoch": 1.9365828092243187, "grad_norm": 0.667185366153717, "learning_rate": 3.6320502808065716e-05, "loss": 0.2002, "loss_nan_ranks": 0, "loss_rank_avg": 0.20585229992866516, "step": 3695 }, { "epoch": 1.9392033542976939, "grad_norm": 0.5516197085380554, "learning_rate": 3.630538162102048e-05, "loss": 0.2059, "loss_nan_ranks": 0, "loss_rank_avg": 0.18774524331092834, "step": 3700 }, { "epoch": 1.941823899371069, "grad_norm": 0.6480304002761841, "learning_rate": 3.62902325880454e-05, "loss": 0.2146, "loss_nan_ranks": 0, "loss_rank_avg": 0.26353558897972107, "step": 3705 }, { "epoch": 1.9444444444444444, "grad_norm": 0.5691676735877991, "learning_rate": 3.627505573501162e-05, "loss": 0.2207, "loss_nan_ranks": 0, "loss_rank_avg": 0.2253093421459198, "step": 3710 }, { "epoch": 1.9470649895178198, "grad_norm": 0.651718020439148, "learning_rate": 3.6259851087837785e-05, "loss": 0.1969, "loss_nan_ranks": 0, "loss_rank_avg": 0.18974995613098145, "step": 3715 }, { "epoch": 1.949685534591195, "grad_norm": 0.7716585397720337, "learning_rate": 3.6244618672490036e-05, "loss": 0.2007, "loss_nan_ranks": 0, "loss_rank_avg": 0.2197091281414032, "step": 3720 }, { "epoch": 1.9523060796645701, "grad_norm": 0.6229172348976135, "learning_rate": 3.622935851498191e-05, "loss": 0.1959, "loss_nan_ranks": 0, "loss_rank_avg": 0.197265625, "step": 3725 }, { "epoch": 1.9549266247379455, "grad_norm": 0.5668113231658936, "learning_rate": 3.621407064137433e-05, "loss": 0.2107, "loss_nan_ranks": 0, "loss_rank_avg": 0.24369171261787415, "step": 3730 }, { "epoch": 1.9575471698113207, "grad_norm": 0.588336706161499, "learning_rate": 3.619875507777555e-05, "loss": 0.1989, "loss_nan_ranks": 0, "loss_rank_avg": 0.21228131651878357, "step": 3735 }, { "epoch": 1.960167714884696, "grad_norm": 0.6574887633323669, "learning_rate": 3.6183411850341106e-05, "loss": 0.1961, "loss_nan_ranks": 0, "loss_rank_avg": 0.18895301222801208, "step": 3740 }, { "epoch": 1.9627882599580713, "grad_norm": 0.7427954077720642, "learning_rate": 3.616804098527379e-05, "loss": 0.202, "loss_nan_ranks": 0, "loss_rank_avg": 0.18273983895778656, "step": 3745 }, { "epoch": 1.9654088050314464, "grad_norm": 1.0722060203552246, "learning_rate": 3.615264250882359e-05, "loss": 0.183, "loss_nan_ranks": 0, "loss_rank_avg": 0.151123046875, "step": 3750 }, { "epoch": 1.9680293501048218, "grad_norm": 0.590573251247406, "learning_rate": 3.613721644728765e-05, "loss": 0.1934, "loss_nan_ranks": 0, "loss_rank_avg": 0.17236328125, "step": 3755 }, { "epoch": 1.9706498951781972, "grad_norm": 0.7106091976165771, "learning_rate": 3.6121762827010206e-05, "loss": 0.2109, "loss_nan_ranks": 0, "loss_rank_avg": 0.22868366539478302, "step": 3760 }, { "epoch": 1.9732704402515724, "grad_norm": 0.6222302913665771, "learning_rate": 3.610628167438258e-05, "loss": 0.2062, "loss_nan_ranks": 0, "loss_rank_avg": 0.21538963913917542, "step": 3765 }, { "epoch": 1.9758909853249476, "grad_norm": 0.6666224002838135, "learning_rate": 3.60907730158431e-05, "loss": 0.198, "loss_nan_ranks": 0, "loss_rank_avg": 0.1969054937362671, "step": 3770 }, { "epoch": 1.9785115303983227, "grad_norm": 0.7065848112106323, "learning_rate": 3.607523687787707e-05, "loss": 0.2255, "loss_nan_ranks": 0, "loss_rank_avg": 0.24432584643363953, "step": 3775 }, { "epoch": 1.9811320754716981, "grad_norm": 0.6347010135650635, "learning_rate": 3.605967328701673e-05, "loss": 0.2109, "loss_nan_ranks": 0, "loss_rank_avg": 0.19049328565597534, "step": 3780 }, { "epoch": 1.9837526205450735, "grad_norm": 0.9422894716262817, "learning_rate": 3.60440822698412e-05, "loss": 0.1974, "loss_nan_ranks": 0, "loss_rank_avg": 0.17729416489601135, "step": 3785 }, { "epoch": 1.9863731656184487, "grad_norm": 0.738005518913269, "learning_rate": 3.602846385297642e-05, "loss": 0.2035, "loss_nan_ranks": 0, "loss_rank_avg": 0.21540319919586182, "step": 3790 }, { "epoch": 1.9889937106918238, "grad_norm": 0.6134724617004395, "learning_rate": 3.601281806309516e-05, "loss": 0.1867, "loss_nan_ranks": 0, "loss_rank_avg": 0.16434405744075775, "step": 3795 }, { "epoch": 1.991614255765199, "grad_norm": 0.7037052512168884, "learning_rate": 3.599714492691689e-05, "loss": 0.1896, "loss_nan_ranks": 0, "loss_rank_avg": 0.21146099269390106, "step": 3800 }, { "epoch": 1.9942348008385744, "grad_norm": 0.5308715105056763, "learning_rate": 3.598144447120783e-05, "loss": 0.2222, "loss_nan_ranks": 0, "loss_rank_avg": 0.235158771276474, "step": 3805 }, { "epoch": 1.9968553459119498, "grad_norm": 0.7394761443138123, "learning_rate": 3.596571672278083e-05, "loss": 0.1989, "loss_nan_ranks": 0, "loss_rank_avg": 0.2079474925994873, "step": 3810 }, { "epoch": 1.999475890985325, "grad_norm": 0.594095766544342, "learning_rate": 3.5949961708495335e-05, "loss": 0.1817, "loss_nan_ranks": 0, "loss_rank_avg": 0.18311980366706848, "step": 3815 }, { "epoch": 2.002620545073375, "grad_norm": 0.7084780931472778, "learning_rate": 3.593417945525739e-05, "loss": 0.1562, "loss_nan_ranks": 0, "loss_rank_avg": 0.1945737898349762, "step": 3820 }, { "epoch": 2.0052410901467503, "grad_norm": 0.6283460259437561, "learning_rate": 3.591836999001952e-05, "loss": 0.1817, "loss_nan_ranks": 0, "loss_rank_avg": 0.203782320022583, "step": 3825 }, { "epoch": 2.007861635220126, "grad_norm": 0.7033699154853821, "learning_rate": 3.5902533339780756e-05, "loss": 0.1565, "loss_nan_ranks": 0, "loss_rank_avg": 0.154052734375, "step": 3830 }, { "epoch": 2.010482180293501, "grad_norm": 0.7841500043869019, "learning_rate": 3.588666953158653e-05, "loss": 0.1972, "loss_nan_ranks": 0, "loss_rank_avg": 0.18807217478752136, "step": 3835 }, { "epoch": 2.0131027253668763, "grad_norm": 0.6844927668571472, "learning_rate": 3.587077859252868e-05, "loss": 0.1767, "loss_nan_ranks": 0, "loss_rank_avg": 0.18625213205814362, "step": 3840 }, { "epoch": 2.0157232704402515, "grad_norm": 0.7283322215080261, "learning_rate": 3.585486054974535e-05, "loss": 0.1643, "loss_nan_ranks": 0, "loss_rank_avg": 0.20896628499031067, "step": 3845 }, { "epoch": 2.0183438155136266, "grad_norm": 0.8837510943412781, "learning_rate": 3.583891543042097e-05, "loss": 0.1938, "loss_nan_ranks": 0, "loss_rank_avg": 0.17919921875, "step": 3850 }, { "epoch": 2.0209643605870022, "grad_norm": 0.7254173755645752, "learning_rate": 3.582294326178624e-05, "loss": 0.1861, "loss_nan_ranks": 0, "loss_rank_avg": 0.17642877995967865, "step": 3855 }, { "epoch": 2.0235849056603774, "grad_norm": 0.5675681233406067, "learning_rate": 3.5806944071118036e-05, "loss": 0.1695, "loss_nan_ranks": 0, "loss_rank_avg": 0.17149925231933594, "step": 3860 }, { "epoch": 2.0262054507337526, "grad_norm": 0.5747088193893433, "learning_rate": 3.579091788573938e-05, "loss": 0.1863, "loss_nan_ranks": 0, "loss_rank_avg": 0.16926203668117523, "step": 3865 }, { "epoch": 2.0288259958071277, "grad_norm": 0.6015620231628418, "learning_rate": 3.577486473301939e-05, "loss": 0.1848, "loss_nan_ranks": 0, "loss_rank_avg": 0.1605832278728485, "step": 3870 }, { "epoch": 2.0314465408805034, "grad_norm": 0.9510326385498047, "learning_rate": 3.575878464037325e-05, "loss": 0.1608, "loss_nan_ranks": 0, "loss_rank_avg": 0.16941693425178528, "step": 3875 }, { "epoch": 2.0340670859538785, "grad_norm": 0.6109377145767212, "learning_rate": 3.574267763526215e-05, "loss": 0.1656, "loss_nan_ranks": 0, "loss_rank_avg": 0.15363606810569763, "step": 3880 }, { "epoch": 2.0366876310272537, "grad_norm": 0.5519078969955444, "learning_rate": 3.5726543745193236e-05, "loss": 0.1764, "loss_nan_ranks": 0, "loss_rank_avg": 0.19953450560569763, "step": 3885 }, { "epoch": 2.039308176100629, "grad_norm": 0.7259485721588135, "learning_rate": 3.571038299771957e-05, "loss": 0.1949, "loss_nan_ranks": 0, "loss_rank_avg": 0.19868190586566925, "step": 3890 }, { "epoch": 2.041928721174004, "grad_norm": 0.7612906694412231, "learning_rate": 3.569419542044008e-05, "loss": 0.1569, "loss_nan_ranks": 0, "loss_rank_avg": 0.172119140625, "step": 3895 }, { "epoch": 2.0445492662473796, "grad_norm": 0.7879012823104858, "learning_rate": 3.56779810409995e-05, "loss": 0.1616, "loss_nan_ranks": 0, "loss_rank_avg": 0.1760033667087555, "step": 3900 }, { "epoch": 2.047169811320755, "grad_norm": 0.6542850136756897, "learning_rate": 3.566173988708836e-05, "loss": 0.1752, "loss_nan_ranks": 0, "loss_rank_avg": 0.2122265100479126, "step": 3905 }, { "epoch": 2.04979035639413, "grad_norm": 0.6596713066101074, "learning_rate": 3.5645471986442905e-05, "loss": 0.1859, "loss_nan_ranks": 0, "loss_rank_avg": 0.18576733767986298, "step": 3910 }, { "epoch": 2.052410901467505, "grad_norm": 0.7132096290588379, "learning_rate": 3.5629177366845055e-05, "loss": 0.1656, "loss_nan_ranks": 0, "loss_rank_avg": 0.1605224609375, "step": 3915 }, { "epoch": 2.0550314465408803, "grad_norm": 0.6687309741973877, "learning_rate": 3.561285605612236e-05, "loss": 0.1921, "loss_nan_ranks": 0, "loss_rank_avg": 0.21139320731163025, "step": 3920 }, { "epoch": 2.057651991614256, "grad_norm": 0.6709274053573608, "learning_rate": 3.5596508082147944e-05, "loss": 0.1641, "loss_nan_ranks": 0, "loss_rank_avg": 0.1706489622592926, "step": 3925 }, { "epoch": 2.060272536687631, "grad_norm": 0.6589915156364441, "learning_rate": 3.558013347284049e-05, "loss": 0.1727, "loss_nan_ranks": 0, "loss_rank_avg": 0.17125006020069122, "step": 3930 }, { "epoch": 2.0628930817610063, "grad_norm": 0.6327504515647888, "learning_rate": 3.5563732256164136e-05, "loss": 0.1861, "loss_nan_ranks": 0, "loss_rank_avg": 0.1846763789653778, "step": 3935 }, { "epoch": 2.0655136268343814, "grad_norm": 0.6399828791618347, "learning_rate": 3.554730446012849e-05, "loss": 0.1818, "loss_nan_ranks": 0, "loss_rank_avg": 0.14534403383731842, "step": 3940 }, { "epoch": 2.0681341719077566, "grad_norm": 0.5941177010536194, "learning_rate": 3.553085011278854e-05, "loss": 0.1811, "loss_nan_ranks": 0, "loss_rank_avg": 0.2060641646385193, "step": 3945 }, { "epoch": 2.0707547169811322, "grad_norm": 0.7381300926208496, "learning_rate": 3.551436924224461e-05, "loss": 0.1802, "loss_nan_ranks": 0, "loss_rank_avg": 0.1723439246416092, "step": 3950 }, { "epoch": 2.0733752620545074, "grad_norm": 1.0024296045303345, "learning_rate": 3.549786187664231e-05, "loss": 0.1814, "loss_nan_ranks": 0, "loss_rank_avg": 0.21678271889686584, "step": 3955 }, { "epoch": 2.0759958071278826, "grad_norm": 0.6965764164924622, "learning_rate": 3.548132804417255e-05, "loss": 0.1823, "loss_nan_ranks": 0, "loss_rank_avg": 0.1474609375, "step": 3960 }, { "epoch": 2.0786163522012577, "grad_norm": 0.6827486753463745, "learning_rate": 3.546476777307137e-05, "loss": 0.1682, "loss_nan_ranks": 0, "loss_rank_avg": 0.17864082753658295, "step": 3965 }, { "epoch": 2.081236897274633, "grad_norm": 0.6874611377716064, "learning_rate": 3.544818109162e-05, "loss": 0.1697, "loss_nan_ranks": 0, "loss_rank_avg": 0.18936476111412048, "step": 3970 }, { "epoch": 2.0838574423480085, "grad_norm": 0.6896753311157227, "learning_rate": 3.543156802814478e-05, "loss": 0.1934, "loss_nan_ranks": 0, "loss_rank_avg": 0.20261545479297638, "step": 3975 }, { "epoch": 2.0864779874213837, "grad_norm": 0.719116747379303, "learning_rate": 3.5414928611017085e-05, "loss": 0.1565, "loss_nan_ranks": 0, "loss_rank_avg": 0.1512451171875, "step": 3980 }, { "epoch": 2.089098532494759, "grad_norm": 0.6998587846755981, "learning_rate": 3.53982628686533e-05, "loss": 0.1615, "loss_nan_ranks": 0, "loss_rank_avg": 0.170166015625, "step": 3985 }, { "epoch": 2.091719077568134, "grad_norm": 0.801823616027832, "learning_rate": 3.538157082951477e-05, "loss": 0.1749, "loss_nan_ranks": 0, "loss_rank_avg": 0.144775390625, "step": 3990 }, { "epoch": 2.0943396226415096, "grad_norm": 0.6100899577140808, "learning_rate": 3.536485252210775e-05, "loss": 0.1884, "loss_nan_ranks": 0, "loss_rank_avg": 0.16361545026302338, "step": 3995 }, { "epoch": 2.096960167714885, "grad_norm": 0.7097408771514893, "learning_rate": 3.534810797498335e-05, "loss": 0.1948, "loss_nan_ranks": 0, "loss_rank_avg": 0.15076175332069397, "step": 4000 }, { "epoch": 2.09958071278826, "grad_norm": 0.5789570212364197, "learning_rate": 3.533133721673751e-05, "loss": 0.1774, "loss_nan_ranks": 0, "loss_rank_avg": 0.1568412184715271, "step": 4005 }, { "epoch": 2.102201257861635, "grad_norm": 0.6784456968307495, "learning_rate": 3.5314540276010895e-05, "loss": 0.1762, "loss_nan_ranks": 0, "loss_rank_avg": 0.2062300741672516, "step": 4010 }, { "epoch": 2.1048218029350103, "grad_norm": 0.5736457109451294, "learning_rate": 3.529771718148893e-05, "loss": 0.1567, "loss_nan_ranks": 0, "loss_rank_avg": 0.17741183936595917, "step": 4015 }, { "epoch": 2.107442348008386, "grad_norm": 0.6271811127662659, "learning_rate": 3.528086796190167e-05, "loss": 0.1985, "loss_nan_ranks": 0, "loss_rank_avg": 0.19951367378234863, "step": 4020 }, { "epoch": 2.110062893081761, "grad_norm": 1.0117782354354858, "learning_rate": 3.526399264602381e-05, "loss": 0.1669, "loss_nan_ranks": 0, "loss_rank_avg": 0.14599609375, "step": 4025 }, { "epoch": 2.1126834381551363, "grad_norm": 0.6283865571022034, "learning_rate": 3.524709126267458e-05, "loss": 0.1947, "loss_nan_ranks": 0, "loss_rank_avg": 0.18431368470191956, "step": 4030 }, { "epoch": 2.1153039832285114, "grad_norm": 0.7869764566421509, "learning_rate": 3.523016384071777e-05, "loss": 0.1799, "loss_nan_ranks": 0, "loss_rank_avg": 0.17630180716514587, "step": 4035 }, { "epoch": 2.1179245283018866, "grad_norm": 0.6120043992996216, "learning_rate": 3.521321040906159e-05, "loss": 0.1684, "loss_nan_ranks": 0, "loss_rank_avg": 0.16492129862308502, "step": 4040 }, { "epoch": 2.120545073375262, "grad_norm": 0.6537008881568909, "learning_rate": 3.5196230996658704e-05, "loss": 0.1653, "loss_nan_ranks": 0, "loss_rank_avg": 0.19163097441196442, "step": 4045 }, { "epoch": 2.1231656184486374, "grad_norm": 0.6703867316246033, "learning_rate": 3.517922563250615e-05, "loss": 0.1657, "loss_nan_ranks": 0, "loss_rank_avg": 0.2131059169769287, "step": 4050 }, { "epoch": 2.1257861635220126, "grad_norm": 0.6623901128768921, "learning_rate": 3.5162194345645256e-05, "loss": 0.1715, "loss_nan_ranks": 0, "loss_rank_avg": 0.18095040321350098, "step": 4055 }, { "epoch": 2.1284067085953877, "grad_norm": 0.7682641744613647, "learning_rate": 3.514513716516164e-05, "loss": 0.1871, "loss_nan_ranks": 0, "loss_rank_avg": 0.15569807589054108, "step": 4060 }, { "epoch": 2.131027253668763, "grad_norm": 0.6966356635093689, "learning_rate": 3.512805412018512e-05, "loss": 0.174, "loss_nan_ranks": 0, "loss_rank_avg": 0.14013671875, "step": 4065 }, { "epoch": 2.1336477987421385, "grad_norm": 0.6977257132530212, "learning_rate": 3.5110945239889725e-05, "loss": 0.1674, "loss_nan_ranks": 0, "loss_rank_avg": 0.13693030178546906, "step": 4070 }, { "epoch": 2.1362683438155137, "grad_norm": 0.5598815679550171, "learning_rate": 3.509381055349357e-05, "loss": 0.1678, "loss_nan_ranks": 0, "loss_rank_avg": 0.18437781929969788, "step": 4075 }, { "epoch": 2.138888888888889, "grad_norm": 0.5758344531059265, "learning_rate": 3.507665009025885e-05, "loss": 0.1977, "loss_nan_ranks": 0, "loss_rank_avg": 0.2025020569562912, "step": 4080 }, { "epoch": 2.141509433962264, "grad_norm": 0.5952780246734619, "learning_rate": 3.505946387949177e-05, "loss": 0.1588, "loss_nan_ranks": 0, "loss_rank_avg": 0.1604464054107666, "step": 4085 }, { "epoch": 2.1441299790356396, "grad_norm": 0.6980146169662476, "learning_rate": 3.5042251950542536e-05, "loss": 0.1744, "loss_nan_ranks": 0, "loss_rank_avg": 0.17276695370674133, "step": 4090 }, { "epoch": 2.146750524109015, "grad_norm": 0.5248650908470154, "learning_rate": 3.502501433280525e-05, "loss": 0.1729, "loss_nan_ranks": 0, "loss_rank_avg": 0.17749759554862976, "step": 4095 }, { "epoch": 2.14937106918239, "grad_norm": 0.6207015514373779, "learning_rate": 3.5007751055717895e-05, "loss": 0.1886, "loss_nan_ranks": 0, "loss_rank_avg": 0.21023103594779968, "step": 4100 }, { "epoch": 2.151991614255765, "grad_norm": 0.6230864524841309, "learning_rate": 3.499046214876227e-05, "loss": 0.1804, "loss_nan_ranks": 0, "loss_rank_avg": 0.16375795006752014, "step": 4105 }, { "epoch": 2.1546121593291403, "grad_norm": 0.6781209111213684, "learning_rate": 3.497314764146394e-05, "loss": 0.1666, "loss_nan_ranks": 0, "loss_rank_avg": 0.1437818706035614, "step": 4110 }, { "epoch": 2.157232704402516, "grad_norm": 0.5978240966796875, "learning_rate": 3.49558075633922e-05, "loss": 0.1889, "loss_nan_ranks": 0, "loss_rank_avg": 0.20453524589538574, "step": 4115 }, { "epoch": 2.159853249475891, "grad_norm": 0.8283042907714844, "learning_rate": 3.493844194416001e-05, "loss": 0.1694, "loss_nan_ranks": 0, "loss_rank_avg": 0.15572591125965118, "step": 4120 }, { "epoch": 2.1624737945492662, "grad_norm": 0.6346971988677979, "learning_rate": 3.4921050813423944e-05, "loss": 0.2003, "loss_nan_ranks": 0, "loss_rank_avg": 0.167236328125, "step": 4125 }, { "epoch": 2.1650943396226414, "grad_norm": 0.7900070548057556, "learning_rate": 3.490363420088415e-05, "loss": 0.1688, "loss_nan_ranks": 0, "loss_rank_avg": 0.16584350168704987, "step": 4130 }, { "epoch": 2.1677148846960166, "grad_norm": 0.9483361840248108, "learning_rate": 3.488619213628429e-05, "loss": 0.1778, "loss_nan_ranks": 0, "loss_rank_avg": 0.20415958762168884, "step": 4135 }, { "epoch": 2.170335429769392, "grad_norm": 0.6414875984191895, "learning_rate": 3.4868724649411486e-05, "loss": 0.1903, "loss_nan_ranks": 0, "loss_rank_avg": 0.1789773851633072, "step": 4140 }, { "epoch": 2.1729559748427674, "grad_norm": 0.6929765343666077, "learning_rate": 3.48512317700963e-05, "loss": 0.1942, "loss_nan_ranks": 0, "loss_rank_avg": 0.21046538650989532, "step": 4145 }, { "epoch": 2.1755765199161425, "grad_norm": 0.6449279189109802, "learning_rate": 3.483371352821263e-05, "loss": 0.1766, "loss_nan_ranks": 0, "loss_rank_avg": 0.16827432811260223, "step": 4150 }, { "epoch": 2.1781970649895177, "grad_norm": 0.606066107749939, "learning_rate": 3.48161699536777e-05, "loss": 0.1789, "loss_nan_ranks": 0, "loss_rank_avg": 0.16351351141929626, "step": 4155 }, { "epoch": 2.180817610062893, "grad_norm": 0.7101812362670898, "learning_rate": 3.4798601076451986e-05, "loss": 0.1637, "loss_nan_ranks": 0, "loss_rank_avg": 0.19216609001159668, "step": 4160 }, { "epoch": 2.1834381551362685, "grad_norm": 1.1365573406219482, "learning_rate": 3.47810069265392e-05, "loss": 0.1704, "loss_nan_ranks": 0, "loss_rank_avg": 0.18184298276901245, "step": 4165 }, { "epoch": 2.1860587002096437, "grad_norm": 0.6491653919219971, "learning_rate": 3.476338753398618e-05, "loss": 0.1813, "loss_nan_ranks": 0, "loss_rank_avg": 0.18884286284446716, "step": 4170 }, { "epoch": 2.188679245283019, "grad_norm": 0.6283389329910278, "learning_rate": 3.474574292888292e-05, "loss": 0.1889, "loss_nan_ranks": 0, "loss_rank_avg": 0.19282126426696777, "step": 4175 }, { "epoch": 2.191299790356394, "grad_norm": 0.7391582727432251, "learning_rate": 3.472807314136242e-05, "loss": 0.1874, "loss_nan_ranks": 0, "loss_rank_avg": 0.1503807157278061, "step": 4180 }, { "epoch": 2.1939203354297696, "grad_norm": 0.5754171013832092, "learning_rate": 3.471037820160072e-05, "loss": 0.1783, "loss_nan_ranks": 0, "loss_rank_avg": 0.18280450999736786, "step": 4185 }, { "epoch": 2.1965408805031448, "grad_norm": 0.6991333961486816, "learning_rate": 3.469265813981679e-05, "loss": 0.1833, "loss_nan_ranks": 0, "loss_rank_avg": 0.20395539700984955, "step": 4190 }, { "epoch": 2.19916142557652, "grad_norm": 0.7103485465049744, "learning_rate": 3.467491298627252e-05, "loss": 0.1648, "loss_nan_ranks": 0, "loss_rank_avg": 0.1837499439716339, "step": 4195 }, { "epoch": 2.201781970649895, "grad_norm": 0.6320298314094543, "learning_rate": 3.465714277127266e-05, "loss": 0.1764, "loss_nan_ranks": 0, "loss_rank_avg": 0.16248497366905212, "step": 4200 }, { "epoch": 2.2044025157232703, "grad_norm": 0.7922936677932739, "learning_rate": 3.463934752516474e-05, "loss": 0.1824, "loss_nan_ranks": 0, "loss_rank_avg": 0.17559239268302917, "step": 4205 }, { "epoch": 2.207023060796646, "grad_norm": 0.5327322483062744, "learning_rate": 3.4621527278339025e-05, "loss": 0.1808, "loss_nan_ranks": 0, "loss_rank_avg": 0.18227726221084595, "step": 4210 }, { "epoch": 2.209643605870021, "grad_norm": 0.688612163066864, "learning_rate": 3.460368206122852e-05, "loss": 0.184, "loss_nan_ranks": 0, "loss_rank_avg": 0.17920489609241486, "step": 4215 }, { "epoch": 2.2122641509433962, "grad_norm": 0.5882250070571899, "learning_rate": 3.458581190430884e-05, "loss": 0.1766, "loss_nan_ranks": 0, "loss_rank_avg": 0.16983014345169067, "step": 4220 }, { "epoch": 2.2148846960167714, "grad_norm": 0.6443110108375549, "learning_rate": 3.4567916838098195e-05, "loss": 0.1695, "loss_nan_ranks": 0, "loss_rank_avg": 0.1739184558391571, "step": 4225 }, { "epoch": 2.2175052410901466, "grad_norm": 0.8445901870727539, "learning_rate": 3.454999689315734e-05, "loss": 0.166, "loss_nan_ranks": 0, "loss_rank_avg": 0.18260544538497925, "step": 4230 }, { "epoch": 2.220125786163522, "grad_norm": 0.6660024523735046, "learning_rate": 3.453205210008952e-05, "loss": 0.1714, "loss_nan_ranks": 0, "loss_rank_avg": 0.173113614320755, "step": 4235 }, { "epoch": 2.2227463312368974, "grad_norm": 0.6402644515037537, "learning_rate": 3.4514082489540415e-05, "loss": 0.1779, "loss_nan_ranks": 0, "loss_rank_avg": 0.16839855909347534, "step": 4240 }, { "epoch": 2.2253668763102725, "grad_norm": 0.6560665369033813, "learning_rate": 3.4496088092198076e-05, "loss": 0.188, "loss_nan_ranks": 0, "loss_rank_avg": 0.1756685972213745, "step": 4245 }, { "epoch": 2.2279874213836477, "grad_norm": 0.8085160255432129, "learning_rate": 3.44780689387929e-05, "loss": 0.1656, "loss_nan_ranks": 0, "loss_rank_avg": 0.14646513760089874, "step": 4250 }, { "epoch": 2.230607966457023, "grad_norm": 0.5747509002685547, "learning_rate": 3.446002506009754e-05, "loss": 0.2022, "loss_nan_ranks": 0, "loss_rank_avg": 0.2267492413520813, "step": 4255 }, { "epoch": 2.2332285115303985, "grad_norm": 1.1871565580368042, "learning_rate": 3.44419564869269e-05, "loss": 0.1893, "loss_nan_ranks": 0, "loss_rank_avg": 0.20584087073802948, "step": 4260 }, { "epoch": 2.2358490566037736, "grad_norm": 0.6333910822868347, "learning_rate": 3.4423863250138045e-05, "loss": 0.1925, "loss_nan_ranks": 0, "loss_rank_avg": 0.22530904412269592, "step": 4265 }, { "epoch": 2.238469601677149, "grad_norm": 0.6745782494544983, "learning_rate": 3.440574538063016e-05, "loss": 0.1719, "loss_nan_ranks": 0, "loss_rank_avg": 0.1735236644744873, "step": 4270 }, { "epoch": 2.241090146750524, "grad_norm": 0.6441916823387146, "learning_rate": 3.438760290934449e-05, "loss": 0.1873, "loss_nan_ranks": 0, "loss_rank_avg": 0.21418049931526184, "step": 4275 }, { "epoch": 2.2437106918238996, "grad_norm": 0.6465288400650024, "learning_rate": 3.436943586726431e-05, "loss": 0.1842, "loss_nan_ranks": 0, "loss_rank_avg": 0.1877419799566269, "step": 4280 }, { "epoch": 2.2463312368972748, "grad_norm": 0.6912493109703064, "learning_rate": 3.435124428541484e-05, "loss": 0.1759, "loss_nan_ranks": 0, "loss_rank_avg": 0.11175537109375, "step": 4285 }, { "epoch": 2.24895178197065, "grad_norm": 0.670586347579956, "learning_rate": 3.433302819486322e-05, "loss": 0.1896, "loss_nan_ranks": 0, "loss_rank_avg": 0.15557770431041718, "step": 4290 }, { "epoch": 2.251572327044025, "grad_norm": 0.6130393147468567, "learning_rate": 3.431478762671844e-05, "loss": 0.1829, "loss_nan_ranks": 0, "loss_rank_avg": 0.1899413913488388, "step": 4295 }, { "epoch": 2.2541928721174003, "grad_norm": 0.609475314617157, "learning_rate": 3.42965226121313e-05, "loss": 0.1755, "loss_nan_ranks": 0, "loss_rank_avg": 0.15828870236873627, "step": 4300 }, { "epoch": 2.2568134171907754, "grad_norm": 0.6548188328742981, "learning_rate": 3.4278233182294335e-05, "loss": 0.1743, "loss_nan_ranks": 0, "loss_rank_avg": 0.20844203233718872, "step": 4305 }, { "epoch": 2.259433962264151, "grad_norm": 0.7024995684623718, "learning_rate": 3.4259919368441794e-05, "loss": 0.1871, "loss_nan_ranks": 0, "loss_rank_avg": 0.2090778946876526, "step": 4310 }, { "epoch": 2.262054507337526, "grad_norm": 0.7198363542556763, "learning_rate": 3.424158120184955e-05, "loss": 0.1699, "loss_nan_ranks": 0, "loss_rank_avg": 0.1468142420053482, "step": 4315 }, { "epoch": 2.2646750524109014, "grad_norm": 0.600344717502594, "learning_rate": 3.422321871383507e-05, "loss": 0.1984, "loss_nan_ranks": 0, "loss_rank_avg": 0.16410166025161743, "step": 4320 }, { "epoch": 2.2672955974842766, "grad_norm": 0.6233639717102051, "learning_rate": 3.4204831935757365e-05, "loss": 0.1848, "loss_nan_ranks": 0, "loss_rank_avg": 0.20671750605106354, "step": 4325 }, { "epoch": 2.269916142557652, "grad_norm": 0.8082700967788696, "learning_rate": 3.418642089901692e-05, "loss": 0.1604, "loss_nan_ranks": 0, "loss_rank_avg": 0.15148866176605225, "step": 4330 }, { "epoch": 2.2725366876310273, "grad_norm": 0.6904008984565735, "learning_rate": 3.4167985635055655e-05, "loss": 0.1875, "loss_nan_ranks": 0, "loss_rank_avg": 0.17664828896522522, "step": 4335 }, { "epoch": 2.2751572327044025, "grad_norm": 0.5504633188247681, "learning_rate": 3.4149526175356854e-05, "loss": 0.1996, "loss_nan_ranks": 0, "loss_rank_avg": 0.19843214750289917, "step": 4340 }, { "epoch": 2.2777777777777777, "grad_norm": 0.60943603515625, "learning_rate": 3.413104255144514e-05, "loss": 0.1808, "loss_nan_ranks": 0, "loss_rank_avg": 0.20119717717170715, "step": 4345 }, { "epoch": 2.280398322851153, "grad_norm": 0.668181836605072, "learning_rate": 3.4112534794886376e-05, "loss": 0.1682, "loss_nan_ranks": 0, "loss_rank_avg": 0.19905316829681396, "step": 4350 }, { "epoch": 2.2830188679245285, "grad_norm": 0.7229729294776917, "learning_rate": 3.409400293728767e-05, "loss": 0.1764, "loss_nan_ranks": 0, "loss_rank_avg": 0.1654052734375, "step": 4355 }, { "epoch": 2.2856394129979036, "grad_norm": 0.5337548851966858, "learning_rate": 3.407544701029725e-05, "loss": 0.1838, "loss_nan_ranks": 0, "loss_rank_avg": 0.21104368567466736, "step": 4360 }, { "epoch": 2.288259958071279, "grad_norm": 0.6288163661956787, "learning_rate": 3.40568670456045e-05, "loss": 0.1871, "loss_nan_ranks": 0, "loss_rank_avg": 0.1397732049226761, "step": 4365 }, { "epoch": 2.290880503144654, "grad_norm": 0.6795221567153931, "learning_rate": 3.4038263074939805e-05, "loss": 0.1685, "loss_nan_ranks": 0, "loss_rank_avg": 0.17288152873516083, "step": 4370 }, { "epoch": 2.2935010482180296, "grad_norm": 0.6144188046455383, "learning_rate": 3.401963513007458e-05, "loss": 0.1591, "loss_nan_ranks": 0, "loss_rank_avg": 0.1681046187877655, "step": 4375 }, { "epoch": 2.2961215932914047, "grad_norm": 0.6623589396476746, "learning_rate": 3.400098324282116e-05, "loss": 0.1726, "loss_nan_ranks": 0, "loss_rank_avg": 0.17139089107513428, "step": 4380 }, { "epoch": 2.29874213836478, "grad_norm": 0.7196406126022339, "learning_rate": 3.39823074450328e-05, "loss": 0.1684, "loss_nan_ranks": 0, "loss_rank_avg": 0.1435546875, "step": 4385 }, { "epoch": 2.301362683438155, "grad_norm": 0.7317269444465637, "learning_rate": 3.3963607768603545e-05, "loss": 0.1755, "loss_nan_ranks": 0, "loss_rank_avg": 0.1617431640625, "step": 4390 }, { "epoch": 2.3039832285115303, "grad_norm": 0.6227354407310486, "learning_rate": 3.3944884245468255e-05, "loss": 0.1767, "loss_nan_ranks": 0, "loss_rank_avg": 0.1522216796875, "step": 4395 }, { "epoch": 2.3066037735849054, "grad_norm": 0.7358293533325195, "learning_rate": 3.3926136907602503e-05, "loss": 0.1931, "loss_nan_ranks": 0, "loss_rank_avg": 0.1415230929851532, "step": 4400 }, { "epoch": 2.309224318658281, "grad_norm": 0.6808892488479614, "learning_rate": 3.390736578702253e-05, "loss": 0.1744, "loss_nan_ranks": 0, "loss_rank_avg": 0.1495361328125, "step": 4405 }, { "epoch": 2.311844863731656, "grad_norm": 0.5879157781600952, "learning_rate": 3.388857091578519e-05, "loss": 0.185, "loss_nan_ranks": 0, "loss_rank_avg": 0.21268212795257568, "step": 4410 }, { "epoch": 2.3144654088050314, "grad_norm": 0.6618671417236328, "learning_rate": 3.3869752325987915e-05, "loss": 0.17, "loss_nan_ranks": 0, "loss_rank_avg": 0.17198902368545532, "step": 4415 }, { "epoch": 2.3170859538784065, "grad_norm": 0.6409109234809875, "learning_rate": 3.385091004976861e-05, "loss": 0.1916, "loss_nan_ranks": 0, "loss_rank_avg": 0.18544016778469086, "step": 4420 }, { "epoch": 2.319706498951782, "grad_norm": 0.5145808458328247, "learning_rate": 3.3832044119305666e-05, "loss": 0.1731, "loss_nan_ranks": 0, "loss_rank_avg": 0.17380250990390778, "step": 4425 }, { "epoch": 2.3223270440251573, "grad_norm": 0.5956165790557861, "learning_rate": 3.381315456681785e-05, "loss": 0.18, "loss_nan_ranks": 0, "loss_rank_avg": 0.14881974458694458, "step": 4430 }, { "epoch": 2.3249475890985325, "grad_norm": 0.7136877179145813, "learning_rate": 3.3794241424564275e-05, "loss": 0.1901, "loss_nan_ranks": 0, "loss_rank_avg": 0.17421871423721313, "step": 4435 }, { "epoch": 2.3275681341719077, "grad_norm": 0.6358838081359863, "learning_rate": 3.377530472484435e-05, "loss": 0.1824, "loss_nan_ranks": 0, "loss_rank_avg": 0.21158553659915924, "step": 4440 }, { "epoch": 2.330188679245283, "grad_norm": 0.5762045383453369, "learning_rate": 3.375634449999769e-05, "loss": 0.1819, "loss_nan_ranks": 0, "loss_rank_avg": 0.19911928474903107, "step": 4445 }, { "epoch": 2.3328092243186584, "grad_norm": 0.6769942045211792, "learning_rate": 3.373736078240411e-05, "loss": 0.1786, "loss_nan_ranks": 0, "loss_rank_avg": 0.16046220064163208, "step": 4450 }, { "epoch": 2.3354297693920336, "grad_norm": 0.6089766025543213, "learning_rate": 3.371835360448353e-05, "loss": 0.1738, "loss_nan_ranks": 0, "loss_rank_avg": 0.19723621010780334, "step": 4455 }, { "epoch": 2.338050314465409, "grad_norm": 0.780342698097229, "learning_rate": 3.369932299869594e-05, "loss": 0.1798, "loss_nan_ranks": 0, "loss_rank_avg": 0.1385498046875, "step": 4460 }, { "epoch": 2.340670859538784, "grad_norm": 0.6272355318069458, "learning_rate": 3.368026899754136e-05, "loss": 0.1776, "loss_nan_ranks": 0, "loss_rank_avg": 0.16443294286727905, "step": 4465 }, { "epoch": 2.3432914046121596, "grad_norm": 0.6277251243591309, "learning_rate": 3.366119163355972e-05, "loss": 0.1727, "loss_nan_ranks": 0, "loss_rank_avg": 0.17139068245887756, "step": 4470 }, { "epoch": 2.3459119496855347, "grad_norm": 0.6159740090370178, "learning_rate": 3.364209093933088e-05, "loss": 0.1861, "loss_nan_ranks": 0, "loss_rank_avg": 0.15588770806789398, "step": 4475 }, { "epoch": 2.34853249475891, "grad_norm": 0.6222087740898132, "learning_rate": 3.362296694747455e-05, "loss": 0.1718, "loss_nan_ranks": 0, "loss_rank_avg": 0.16325759887695312, "step": 4480 }, { "epoch": 2.351153039832285, "grad_norm": 0.6561034917831421, "learning_rate": 3.36038196906502e-05, "loss": 0.1975, "loss_nan_ranks": 0, "loss_rank_avg": 0.2421393245458603, "step": 4485 }, { "epoch": 2.3537735849056602, "grad_norm": 0.6397557258605957, "learning_rate": 3.358464920155704e-05, "loss": 0.1722, "loss_nan_ranks": 0, "loss_rank_avg": 0.14547546207904816, "step": 4490 }, { "epoch": 2.3563941299790354, "grad_norm": 0.5537540912628174, "learning_rate": 3.3565455512933974e-05, "loss": 0.1929, "loss_nan_ranks": 0, "loss_rank_avg": 0.19911175966262817, "step": 4495 }, { "epoch": 2.359014675052411, "grad_norm": 0.6250444054603577, "learning_rate": 3.35462386575595e-05, "loss": 0.1836, "loss_nan_ranks": 0, "loss_rank_avg": 0.1976991593837738, "step": 4500 }, { "epoch": 2.361635220125786, "grad_norm": 0.5753092765808105, "learning_rate": 3.3526998668251696e-05, "loss": 0.1819, "loss_nan_ranks": 0, "loss_rank_avg": 0.19621963798999786, "step": 4505 }, { "epoch": 2.3642557651991614, "grad_norm": 0.6889363527297974, "learning_rate": 3.3507735577868144e-05, "loss": 0.1878, "loss_nan_ranks": 0, "loss_rank_avg": 0.17461004853248596, "step": 4510 }, { "epoch": 2.3668763102725365, "grad_norm": 0.5696137547492981, "learning_rate": 3.3488449419305876e-05, "loss": 0.174, "loss_nan_ranks": 0, "loss_rank_avg": 0.18081456422805786, "step": 4515 }, { "epoch": 2.369496855345912, "grad_norm": 0.6583285331726074, "learning_rate": 3.3469140225501316e-05, "loss": 0.176, "loss_nan_ranks": 0, "loss_rank_avg": 0.20351186394691467, "step": 4520 }, { "epoch": 2.3721174004192873, "grad_norm": 0.7312102317810059, "learning_rate": 3.344980802943023e-05, "loss": 0.1638, "loss_nan_ranks": 0, "loss_rank_avg": 0.18066838383674622, "step": 4525 }, { "epoch": 2.3747379454926625, "grad_norm": 0.8290305733680725, "learning_rate": 3.3430452864107674e-05, "loss": 0.178, "loss_nan_ranks": 0, "loss_rank_avg": 0.18665027618408203, "step": 4530 }, { "epoch": 2.3773584905660377, "grad_norm": 0.5993841290473938, "learning_rate": 3.341107476258792e-05, "loss": 0.1706, "loss_nan_ranks": 0, "loss_rank_avg": 0.12851500511169434, "step": 4535 }, { "epoch": 2.379979035639413, "grad_norm": 0.6254639029502869, "learning_rate": 3.3391673757964404e-05, "loss": 0.185, "loss_nan_ranks": 0, "loss_rank_avg": 0.18954774737358093, "step": 4540 }, { "epoch": 2.3825995807127884, "grad_norm": 0.678989589214325, "learning_rate": 3.33722498833697e-05, "loss": 0.162, "loss_nan_ranks": 0, "loss_rank_avg": 0.153564453125, "step": 4545 }, { "epoch": 2.3852201257861636, "grad_norm": 0.6548760533332825, "learning_rate": 3.3352803171975415e-05, "loss": 0.1615, "loss_nan_ranks": 0, "loss_rank_avg": 0.19703298807144165, "step": 4550 }, { "epoch": 2.3878406708595388, "grad_norm": 0.5945609211921692, "learning_rate": 3.3333333656992166e-05, "loss": 0.1868, "loss_nan_ranks": 0, "loss_rank_avg": 0.17958563566207886, "step": 4555 }, { "epoch": 2.390461215932914, "grad_norm": 0.5302448868751526, "learning_rate": 3.331384137166951e-05, "loss": 0.1787, "loss_nan_ranks": 0, "loss_rank_avg": 0.16719529032707214, "step": 4560 }, { "epoch": 2.3930817610062896, "grad_norm": 0.5767799019813538, "learning_rate": 3.32943263492959e-05, "loss": 0.1985, "loss_nan_ranks": 0, "loss_rank_avg": 0.19227570295333862, "step": 4565 }, { "epoch": 2.3957023060796647, "grad_norm": 0.8207547664642334, "learning_rate": 3.32747886231986e-05, "loss": 0.1711, "loss_nan_ranks": 0, "loss_rank_avg": 0.1671919822692871, "step": 4570 }, { "epoch": 2.39832285115304, "grad_norm": 0.8117937445640564, "learning_rate": 3.325522822674366e-05, "loss": 0.1726, "loss_nan_ranks": 0, "loss_rank_avg": 0.19688598811626434, "step": 4575 }, { "epoch": 2.400943396226415, "grad_norm": 0.8320883512496948, "learning_rate": 3.323564519333586e-05, "loss": 0.1831, "loss_nan_ranks": 0, "loss_rank_avg": 0.20656168460845947, "step": 4580 }, { "epoch": 2.4035639412997902, "grad_norm": 0.6701602339744568, "learning_rate": 3.321603955641861e-05, "loss": 0.1677, "loss_nan_ranks": 0, "loss_rank_avg": 0.16803507506847382, "step": 4585 }, { "epoch": 2.4061844863731654, "grad_norm": 0.7060126662254333, "learning_rate": 3.319641134947393e-05, "loss": 0.1774, "loss_nan_ranks": 0, "loss_rank_avg": 0.17268584668636322, "step": 4590 }, { "epoch": 2.408805031446541, "grad_norm": 0.6497982740402222, "learning_rate": 3.31767606060224e-05, "loss": 0.1638, "loss_nan_ranks": 0, "loss_rank_avg": 0.16267868876457214, "step": 4595 }, { "epoch": 2.411425576519916, "grad_norm": 0.5415063500404358, "learning_rate": 3.315708735962307e-05, "loss": 0.1727, "loss_nan_ranks": 0, "loss_rank_avg": 0.1775253713130951, "step": 4600 }, { "epoch": 2.4140461215932913, "grad_norm": 0.6065043807029724, "learning_rate": 3.313739164387343e-05, "loss": 0.1805, "loss_nan_ranks": 0, "loss_rank_avg": 0.1512911468744278, "step": 4605 }, { "epoch": 2.4166666666666665, "grad_norm": 0.6144443154335022, "learning_rate": 3.311767349240934e-05, "loss": 0.1715, "loss_nan_ranks": 0, "loss_rank_avg": 0.19169479608535767, "step": 4610 }, { "epoch": 2.419287211740042, "grad_norm": 0.6852019429206848, "learning_rate": 3.309793293890497e-05, "loss": 0.1777, "loss_nan_ranks": 0, "loss_rank_avg": 0.1656094640493393, "step": 4615 }, { "epoch": 2.4219077568134173, "grad_norm": 0.5891123414039612, "learning_rate": 3.3078170017072744e-05, "loss": 0.1746, "loss_nan_ranks": 0, "loss_rank_avg": 0.180824413895607, "step": 4620 }, { "epoch": 2.4245283018867925, "grad_norm": 0.618079423904419, "learning_rate": 3.305838476066331e-05, "loss": 0.1792, "loss_nan_ranks": 0, "loss_rank_avg": 0.17430126667022705, "step": 4625 }, { "epoch": 2.4271488469601676, "grad_norm": 0.6806156635284424, "learning_rate": 3.303857720346544e-05, "loss": 0.1528, "loss_nan_ranks": 0, "loss_rank_avg": 0.16445207595825195, "step": 4630 }, { "epoch": 2.429769392033543, "grad_norm": 0.7231004238128662, "learning_rate": 3.3018747379305994e-05, "loss": 0.1815, "loss_nan_ranks": 0, "loss_rank_avg": 0.153076171875, "step": 4635 }, { "epoch": 2.4323899371069184, "grad_norm": 0.669330358505249, "learning_rate": 3.299889532204985e-05, "loss": 0.1649, "loss_nan_ranks": 0, "loss_rank_avg": 0.1876327395439148, "step": 4640 }, { "epoch": 2.4350104821802936, "grad_norm": 0.603354811668396, "learning_rate": 3.2979021065599864e-05, "loss": 0.1595, "loss_nan_ranks": 0, "loss_rank_avg": 0.16586905717849731, "step": 4645 }, { "epoch": 2.4376310272536688, "grad_norm": 0.6138980984687805, "learning_rate": 3.29591246438968e-05, "loss": 0.1655, "loss_nan_ranks": 0, "loss_rank_avg": 0.17583346366882324, "step": 4650 }, { "epoch": 2.440251572327044, "grad_norm": 0.6296398639678955, "learning_rate": 3.293920609091929e-05, "loss": 0.176, "loss_nan_ranks": 0, "loss_rank_avg": 0.16585686802864075, "step": 4655 }, { "epoch": 2.442872117400419, "grad_norm": 0.7336766719818115, "learning_rate": 3.291926544068375e-05, "loss": 0.1761, "loss_nan_ranks": 0, "loss_rank_avg": 0.14485588669776917, "step": 4660 }, { "epoch": 2.4454926624737947, "grad_norm": 0.5738866925239563, "learning_rate": 3.289930272724431e-05, "loss": 0.1731, "loss_nan_ranks": 0, "loss_rank_avg": 0.21165721118450165, "step": 4665 }, { "epoch": 2.44811320754717, "grad_norm": 0.575883686542511, "learning_rate": 3.2879317984692825e-05, "loss": 0.1609, "loss_nan_ranks": 0, "loss_rank_avg": 0.1656447947025299, "step": 4670 }, { "epoch": 2.450733752620545, "grad_norm": 0.5860551595687866, "learning_rate": 3.2859311247158734e-05, "loss": 0.1738, "loss_nan_ranks": 0, "loss_rank_avg": 0.17280437052249908, "step": 4675 }, { "epoch": 2.45335429769392, "grad_norm": 1.0741225481033325, "learning_rate": 3.283928254880906e-05, "loss": 0.1765, "loss_nan_ranks": 0, "loss_rank_avg": 0.13873291015625, "step": 4680 }, { "epoch": 2.4559748427672954, "grad_norm": 0.599844753742218, "learning_rate": 3.2819231923848316e-05, "loss": 0.166, "loss_nan_ranks": 0, "loss_rank_avg": 0.14130263030529022, "step": 4685 }, { "epoch": 2.458595387840671, "grad_norm": 0.6118952035903931, "learning_rate": 3.2799159406518464e-05, "loss": 0.1817, "loss_nan_ranks": 0, "loss_rank_avg": 0.1787024289369583, "step": 4690 }, { "epoch": 2.461215932914046, "grad_norm": 0.6382375955581665, "learning_rate": 3.277906503109885e-05, "loss": 0.1542, "loss_nan_ranks": 0, "loss_rank_avg": 0.17545664310455322, "step": 4695 }, { "epoch": 2.4638364779874213, "grad_norm": 0.7123702764511108, "learning_rate": 3.275894883190618e-05, "loss": 0.1869, "loss_nan_ranks": 0, "loss_rank_avg": 0.2054811716079712, "step": 4700 }, { "epoch": 2.4664570230607965, "grad_norm": 0.630284309387207, "learning_rate": 3.273881084329438e-05, "loss": 0.1611, "loss_nan_ranks": 0, "loss_rank_avg": 0.1761714667081833, "step": 4705 }, { "epoch": 2.469077568134172, "grad_norm": 0.6742514371871948, "learning_rate": 3.271865109965462e-05, "loss": 0.1876, "loss_nan_ranks": 0, "loss_rank_avg": 0.18644127249717712, "step": 4710 }, { "epoch": 2.4716981132075473, "grad_norm": 0.7128267288208008, "learning_rate": 3.269846963541521e-05, "loss": 0.1915, "loss_nan_ranks": 0, "loss_rank_avg": 0.16147366166114807, "step": 4715 }, { "epoch": 2.4743186582809225, "grad_norm": 0.5690448880195618, "learning_rate": 3.267826648504157e-05, "loss": 0.1735, "loss_nan_ranks": 0, "loss_rank_avg": 0.169921875, "step": 4720 }, { "epoch": 2.4769392033542976, "grad_norm": 0.6697767376899719, "learning_rate": 3.2658041683036124e-05, "loss": 0.18, "loss_nan_ranks": 0, "loss_rank_avg": 0.1703624427318573, "step": 4725 }, { "epoch": 2.479559748427673, "grad_norm": 0.6302558183670044, "learning_rate": 3.263779526393831e-05, "loss": 0.1729, "loss_nan_ranks": 0, "loss_rank_avg": 0.17672468721866608, "step": 4730 }, { "epoch": 2.4821802935010484, "grad_norm": 0.6719659566879272, "learning_rate": 3.261752726232446e-05, "loss": 0.1776, "loss_nan_ranks": 0, "loss_rank_avg": 0.15950210392475128, "step": 4735 }, { "epoch": 2.4848008385744236, "grad_norm": 0.6178978085517883, "learning_rate": 3.2597237712807764e-05, "loss": 0.1775, "loss_nan_ranks": 0, "loss_rank_avg": 0.19610846042633057, "step": 4740 }, { "epoch": 2.4874213836477987, "grad_norm": 0.6141926050186157, "learning_rate": 3.2576926650038225e-05, "loss": 0.1719, "loss_nan_ranks": 0, "loss_rank_avg": 0.14078906178474426, "step": 4745 }, { "epoch": 2.490041928721174, "grad_norm": 0.5836093425750732, "learning_rate": 3.255659410870257e-05, "loss": 0.1773, "loss_nan_ranks": 0, "loss_rank_avg": 0.17038002610206604, "step": 4750 }, { "epoch": 2.492662473794549, "grad_norm": 0.6279257535934448, "learning_rate": 3.253624012352421e-05, "loss": 0.1893, "loss_nan_ranks": 0, "loss_rank_avg": 0.22794798016548157, "step": 4755 }, { "epoch": 2.4952830188679247, "grad_norm": 0.6302879452705383, "learning_rate": 3.251586472926317e-05, "loss": 0.181, "loss_nan_ranks": 0, "loss_rank_avg": 0.1786530762910843, "step": 4760 }, { "epoch": 2.4979035639413, "grad_norm": 0.6581209301948547, "learning_rate": 3.249546796071608e-05, "loss": 0.1679, "loss_nan_ranks": 0, "loss_rank_avg": 0.1811736524105072, "step": 4765 }, { "epoch": 2.500524109014675, "grad_norm": 0.6320407390594482, "learning_rate": 3.2475049852716014e-05, "loss": 0.1704, "loss_nan_ranks": 0, "loss_rank_avg": 0.17114879190921783, "step": 4770 }, { "epoch": 2.50314465408805, "grad_norm": 0.7551243305206299, "learning_rate": 3.245461044013253e-05, "loss": 0.177, "loss_nan_ranks": 0, "loss_rank_avg": 0.18058930337429047, "step": 4775 }, { "epoch": 2.5057651991614254, "grad_norm": 0.6153421998023987, "learning_rate": 3.243414975787154e-05, "loss": 0.1892, "loss_nan_ranks": 0, "loss_rank_avg": 0.1871110498905182, "step": 4780 }, { "epoch": 2.508385744234801, "grad_norm": 0.6175753474235535, "learning_rate": 3.24136678408753e-05, "loss": 0.1781, "loss_nan_ranks": 0, "loss_rank_avg": 0.19896206259727478, "step": 4785 }, { "epoch": 2.511006289308176, "grad_norm": 0.6540406346321106, "learning_rate": 3.239316472412233e-05, "loss": 0.1667, "loss_nan_ranks": 0, "loss_rank_avg": 0.1719357818365097, "step": 4790 }, { "epoch": 2.5136268343815513, "grad_norm": 0.6842586398124695, "learning_rate": 3.237264044262734e-05, "loss": 0.1893, "loss_nan_ranks": 0, "loss_rank_avg": 0.23990176618099213, "step": 4795 }, { "epoch": 2.5162473794549265, "grad_norm": 0.5659977197647095, "learning_rate": 3.23520950314412e-05, "loss": 0.1691, "loss_nan_ranks": 0, "loss_rank_avg": 0.19691407680511475, "step": 4800 }, { "epoch": 2.518867924528302, "grad_norm": 0.736707866191864, "learning_rate": 3.233152852565085e-05, "loss": 0.1854, "loss_nan_ranks": 0, "loss_rank_avg": 0.16525039076805115, "step": 4805 }, { "epoch": 2.5214884696016773, "grad_norm": 0.6787170171737671, "learning_rate": 3.231094096037927e-05, "loss": 0.1847, "loss_nan_ranks": 0, "loss_rank_avg": 0.17906887829303741, "step": 4810 }, { "epoch": 2.5241090146750524, "grad_norm": 0.6288094520568848, "learning_rate": 3.22903323707854e-05, "loss": 0.1665, "loss_nan_ranks": 0, "loss_rank_avg": 0.18031513690948486, "step": 4815 }, { "epoch": 2.5267295597484276, "grad_norm": 0.630490243434906, "learning_rate": 3.2269702792064066e-05, "loss": 0.1532, "loss_nan_ranks": 0, "loss_rank_avg": 0.1419677734375, "step": 4820 }, { "epoch": 2.529350104821803, "grad_norm": 0.5428565740585327, "learning_rate": 3.224905225944598e-05, "loss": 0.1846, "loss_nan_ranks": 0, "loss_rank_avg": 0.2248590588569641, "step": 4825 }, { "epoch": 2.531970649895178, "grad_norm": 0.6126314401626587, "learning_rate": 3.2228380808197594e-05, "loss": 0.1842, "loss_nan_ranks": 0, "loss_rank_avg": 0.2059944123029709, "step": 4830 }, { "epoch": 2.5345911949685536, "grad_norm": 0.6274798512458801, "learning_rate": 3.2207688473621116e-05, "loss": 0.1706, "loss_nan_ranks": 0, "loss_rank_avg": 0.19701413810253143, "step": 4835 }, { "epoch": 2.5372117400419287, "grad_norm": 0.5996156334877014, "learning_rate": 3.2186975291054406e-05, "loss": 0.1693, "loss_nan_ranks": 0, "loss_rank_avg": 0.14583313465118408, "step": 4840 }, { "epoch": 2.539832285115304, "grad_norm": 0.9609743356704712, "learning_rate": 3.2166241295870915e-05, "loss": 0.1756, "loss_nan_ranks": 0, "loss_rank_avg": 0.13916015625, "step": 4845 }, { "epoch": 2.5424528301886795, "grad_norm": 0.645316481590271, "learning_rate": 3.2145486523479664e-05, "loss": 0.1883, "loss_nan_ranks": 0, "loss_rank_avg": 0.19266776740550995, "step": 4850 }, { "epoch": 2.5450733752620547, "grad_norm": 0.7192069292068481, "learning_rate": 3.212471100932513e-05, "loss": 0.1727, "loss_nan_ranks": 0, "loss_rank_avg": 0.152587890625, "step": 4855 }, { "epoch": 2.54769392033543, "grad_norm": 0.6661515235900879, "learning_rate": 3.210391478888725e-05, "loss": 0.1878, "loss_nan_ranks": 0, "loss_rank_avg": 0.22219903767108917, "step": 4860 }, { "epoch": 2.550314465408805, "grad_norm": 0.5181549191474915, "learning_rate": 3.208309789768127e-05, "loss": 0.1611, "loss_nan_ranks": 0, "loss_rank_avg": 0.14796018600463867, "step": 4865 }, { "epoch": 2.55293501048218, "grad_norm": 0.6805993914604187, "learning_rate": 3.206226037125778e-05, "loss": 0.1704, "loss_nan_ranks": 0, "loss_rank_avg": 0.1424269825220108, "step": 4870 }, { "epoch": 2.5555555555555554, "grad_norm": 0.6539928317070007, "learning_rate": 3.204140224520259e-05, "loss": 0.1805, "loss_nan_ranks": 0, "loss_rank_avg": 0.17243647575378418, "step": 4875 }, { "epoch": 2.558176100628931, "grad_norm": 0.7448137998580933, "learning_rate": 3.20205235551367e-05, "loss": 0.1734, "loss_nan_ranks": 0, "loss_rank_avg": 0.1661716103553772, "step": 4880 }, { "epoch": 2.560796645702306, "grad_norm": 0.6820452213287354, "learning_rate": 3.1999624336716207e-05, "loss": 0.2037, "loss_nan_ranks": 0, "loss_rank_avg": 0.174968421459198, "step": 4885 }, { "epoch": 2.5634171907756813, "grad_norm": 0.5573581457138062, "learning_rate": 3.197870462563231e-05, "loss": 0.167, "loss_nan_ranks": 0, "loss_rank_avg": 0.1981693059206009, "step": 4890 }, { "epoch": 2.5660377358490565, "grad_norm": 0.7350602149963379, "learning_rate": 3.195776445761116e-05, "loss": 0.1739, "loss_nan_ranks": 0, "loss_rank_avg": 0.163818359375, "step": 4895 }, { "epoch": 2.568658280922432, "grad_norm": 0.6017246842384338, "learning_rate": 3.1936803868413865e-05, "loss": 0.1815, "loss_nan_ranks": 0, "loss_rank_avg": 0.18484187126159668, "step": 4900 }, { "epoch": 2.5712788259958073, "grad_norm": 0.5709414482116699, "learning_rate": 3.1915822893836394e-05, "loss": 0.1902, "loss_nan_ranks": 0, "loss_rank_avg": 0.1708984375, "step": 4905 }, { "epoch": 2.5738993710691824, "grad_norm": 0.5710614323616028, "learning_rate": 3.189482156970956e-05, "loss": 0.181, "loss_nan_ranks": 0, "loss_rank_avg": 0.2219957709312439, "step": 4910 }, { "epoch": 2.5765199161425576, "grad_norm": 0.7193589210510254, "learning_rate": 3.187379993189889e-05, "loss": 0.1828, "loss_nan_ranks": 0, "loss_rank_avg": 0.2016737461090088, "step": 4915 }, { "epoch": 2.5791404612159328, "grad_norm": 0.6822991967201233, "learning_rate": 3.1852758016304625e-05, "loss": 0.1698, "loss_nan_ranks": 0, "loss_rank_avg": 0.1361376941204071, "step": 4920 }, { "epoch": 2.581761006289308, "grad_norm": 0.6136971116065979, "learning_rate": 3.1831695858861635e-05, "loss": 0.1793, "loss_nan_ranks": 0, "loss_rank_avg": 0.18536555767059326, "step": 4925 }, { "epoch": 2.5843815513626835, "grad_norm": 0.7393411993980408, "learning_rate": 3.181061349553935e-05, "loss": 0.1828, "loss_nan_ranks": 0, "loss_rank_avg": 0.18716517090797424, "step": 4930 }, { "epoch": 2.5870020964360587, "grad_norm": 0.7095828056335449, "learning_rate": 3.178951096234172e-05, "loss": 0.1614, "loss_nan_ranks": 0, "loss_rank_avg": 0.158935546875, "step": 4935 }, { "epoch": 2.589622641509434, "grad_norm": 0.6684853434562683, "learning_rate": 3.176838829530712e-05, "loss": 0.1751, "loss_nan_ranks": 0, "loss_rank_avg": 0.14074695110321045, "step": 4940 }, { "epoch": 2.5922431865828095, "grad_norm": 0.6725984811782837, "learning_rate": 3.174724553050833e-05, "loss": 0.186, "loss_nan_ranks": 0, "loss_rank_avg": 0.19328799843788147, "step": 4945 }, { "epoch": 2.5948637316561847, "grad_norm": 0.6634368896484375, "learning_rate": 3.172608270405244e-05, "loss": 0.1759, "loss_nan_ranks": 0, "loss_rank_avg": 0.18889939785003662, "step": 4950 }, { "epoch": 2.59748427672956, "grad_norm": 0.7839432954788208, "learning_rate": 3.1704899852080816e-05, "loss": 0.1771, "loss_nan_ranks": 0, "loss_rank_avg": 0.15167236328125, "step": 4955 }, { "epoch": 2.600104821802935, "grad_norm": 0.7503156065940857, "learning_rate": 3.1683697010768995e-05, "loss": 0.1758, "loss_nan_ranks": 0, "loss_rank_avg": 0.146728515625, "step": 4960 }, { "epoch": 2.60272536687631, "grad_norm": 0.5571033954620361, "learning_rate": 3.166247421632668e-05, "loss": 0.1674, "loss_nan_ranks": 0, "loss_rank_avg": 0.17306920886039734, "step": 4965 }, { "epoch": 2.6053459119496853, "grad_norm": 0.6313229203224182, "learning_rate": 3.1641231504997624e-05, "loss": 0.1832, "loss_nan_ranks": 0, "loss_rank_avg": 0.20780566334724426, "step": 4970 }, { "epoch": 2.607966457023061, "grad_norm": 0.689773678779602, "learning_rate": 3.161996891305962e-05, "loss": 0.2011, "loss_nan_ranks": 0, "loss_rank_avg": 0.20171569287776947, "step": 4975 }, { "epoch": 2.610587002096436, "grad_norm": 0.5588131546974182, "learning_rate": 3.15986864768244e-05, "loss": 0.1743, "loss_nan_ranks": 0, "loss_rank_avg": 0.21974435448646545, "step": 4980 }, { "epoch": 2.6132075471698113, "grad_norm": 0.5979903340339661, "learning_rate": 3.1577384232637575e-05, "loss": 0.1797, "loss_nan_ranks": 0, "loss_rank_avg": 0.16760723292827606, "step": 4985 }, { "epoch": 2.6158280922431865, "grad_norm": 0.6243706345558167, "learning_rate": 3.15560622168786e-05, "loss": 0.1713, "loss_nan_ranks": 0, "loss_rank_avg": 0.16010913252830505, "step": 4990 }, { "epoch": 2.618448637316562, "grad_norm": 0.7762710452079773, "learning_rate": 3.1534720465960694e-05, "loss": 0.1827, "loss_nan_ranks": 0, "loss_rank_avg": 0.20435717701911926, "step": 4995 }, { "epoch": 2.6210691823899372, "grad_norm": 0.7093287706375122, "learning_rate": 3.151335901633077e-05, "loss": 0.1817, "loss_nan_ranks": 0, "loss_rank_avg": 0.20002923905849457, "step": 5000 }, { "epoch": 2.6236897274633124, "grad_norm": 0.7017514705657959, "learning_rate": 3.1491977904469384e-05, "loss": 0.1895, "loss_nan_ranks": 0, "loss_rank_avg": 0.17623648047447205, "step": 5005 }, { "epoch": 2.6263102725366876, "grad_norm": 0.5963422060012817, "learning_rate": 3.147057716689068e-05, "loss": 0.1818, "loss_nan_ranks": 0, "loss_rank_avg": 0.18271183967590332, "step": 5010 }, { "epoch": 2.6289308176100628, "grad_norm": 0.5557655096054077, "learning_rate": 3.14491568401423e-05, "loss": 0.1917, "loss_nan_ranks": 0, "loss_rank_avg": 0.16214272379875183, "step": 5015 }, { "epoch": 2.631551362683438, "grad_norm": 0.7692015171051025, "learning_rate": 3.142771696080536e-05, "loss": 0.179, "loss_nan_ranks": 0, "loss_rank_avg": 0.16943359375, "step": 5020 }, { "epoch": 2.6341719077568135, "grad_norm": 0.6690882444381714, "learning_rate": 3.140625756549436e-05, "loss": 0.1832, "loss_nan_ranks": 0, "loss_rank_avg": 0.18332576751708984, "step": 5025 }, { "epoch": 2.6367924528301887, "grad_norm": 0.733025074005127, "learning_rate": 3.138477869085712e-05, "loss": 0.1763, "loss_nan_ranks": 0, "loss_rank_avg": 0.197998046875, "step": 5030 }, { "epoch": 2.639412997903564, "grad_norm": 0.6765258312225342, "learning_rate": 3.1363280373574744e-05, "loss": 0.1795, "loss_nan_ranks": 0, "loss_rank_avg": 0.19180743396282196, "step": 5035 }, { "epoch": 2.642033542976939, "grad_norm": 0.6777477264404297, "learning_rate": 3.134176265036153e-05, "loss": 0.1869, "loss_nan_ranks": 0, "loss_rank_avg": 0.1671317219734192, "step": 5040 }, { "epoch": 2.6446540880503147, "grad_norm": 0.8515976071357727, "learning_rate": 3.1320225557964896e-05, "loss": 0.183, "loss_nan_ranks": 0, "loss_rank_avg": 0.161865234375, "step": 5045 }, { "epoch": 2.64727463312369, "grad_norm": 0.5167196393013, "learning_rate": 3.129866913316538e-05, "loss": 0.184, "loss_nan_ranks": 0, "loss_rank_avg": 0.21703512966632843, "step": 5050 }, { "epoch": 2.649895178197065, "grad_norm": 0.6473361253738403, "learning_rate": 3.127709341277651e-05, "loss": 0.1802, "loss_nan_ranks": 0, "loss_rank_avg": 0.18052998185157776, "step": 5055 }, { "epoch": 2.65251572327044, "grad_norm": 0.5987588167190552, "learning_rate": 3.125549843364477e-05, "loss": 0.1859, "loss_nan_ranks": 0, "loss_rank_avg": 0.20642319321632385, "step": 5060 }, { "epoch": 2.6551362683438153, "grad_norm": 0.5620844960212708, "learning_rate": 3.1233884232649534e-05, "loss": 0.1877, "loss_nan_ranks": 0, "loss_rank_avg": 0.19808673858642578, "step": 5065 }, { "epoch": 2.6577568134171905, "grad_norm": 0.7065621018409729, "learning_rate": 3.1212250846703e-05, "loss": 0.1898, "loss_nan_ranks": 0, "loss_rank_avg": 0.155029296875, "step": 5070 }, { "epoch": 2.660377358490566, "grad_norm": 0.6492888927459717, "learning_rate": 3.1190598312750145e-05, "loss": 0.1729, "loss_nan_ranks": 0, "loss_rank_avg": 0.2139112949371338, "step": 5075 }, { "epoch": 2.6629979035639413, "grad_norm": 0.7838486433029175, "learning_rate": 3.116892666776861e-05, "loss": 0.1805, "loss_nan_ranks": 0, "loss_rank_avg": 0.18071314692497253, "step": 5080 }, { "epoch": 2.6656184486373165, "grad_norm": 0.6746211647987366, "learning_rate": 3.114723594876872e-05, "loss": 0.1799, "loss_nan_ranks": 0, "loss_rank_avg": 0.16943359375, "step": 5085 }, { "epoch": 2.668238993710692, "grad_norm": 0.5902780294418335, "learning_rate": 3.112552619279335e-05, "loss": 0.1627, "loss_nan_ranks": 0, "loss_rank_avg": 0.1680593490600586, "step": 5090 }, { "epoch": 2.6708595387840672, "grad_norm": 0.6151570677757263, "learning_rate": 3.1103797436917874e-05, "loss": 0.1832, "loss_nan_ranks": 0, "loss_rank_avg": 0.2051677107810974, "step": 5095 }, { "epoch": 2.6734800838574424, "grad_norm": 0.6031655669212341, "learning_rate": 3.108204971825013e-05, "loss": 0.1811, "loss_nan_ranks": 0, "loss_rank_avg": 0.202290877699852, "step": 5100 }, { "epoch": 2.6761006289308176, "grad_norm": 0.7880269289016724, "learning_rate": 3.106028307393034e-05, "loss": 0.1813, "loss_nan_ranks": 0, "loss_rank_avg": 0.19555190205574036, "step": 5105 }, { "epoch": 2.6787211740041927, "grad_norm": 0.5152765512466431, "learning_rate": 3.103849754113106e-05, "loss": 0.1666, "loss_nan_ranks": 0, "loss_rank_avg": 0.1540350317955017, "step": 5110 }, { "epoch": 2.681341719077568, "grad_norm": 0.5880343914031982, "learning_rate": 3.101669315705706e-05, "loss": 0.1736, "loss_nan_ranks": 0, "loss_rank_avg": 0.18420341610908508, "step": 5115 }, { "epoch": 2.6839622641509435, "grad_norm": 0.7327151894569397, "learning_rate": 3.099486995894535e-05, "loss": 0.1902, "loss_nan_ranks": 0, "loss_rank_avg": 0.21202674508094788, "step": 5120 }, { "epoch": 2.6865828092243187, "grad_norm": 0.6346898078918457, "learning_rate": 3.097302798406504e-05, "loss": 0.1915, "loss_nan_ranks": 0, "loss_rank_avg": 0.19018851220607758, "step": 5125 }, { "epoch": 2.689203354297694, "grad_norm": 0.5346789360046387, "learning_rate": 3.0951167269717326e-05, "loss": 0.1935, "loss_nan_ranks": 0, "loss_rank_avg": 0.19498604536056519, "step": 5130 }, { "epoch": 2.691823899371069, "grad_norm": 0.6258835792541504, "learning_rate": 3.092928785323539e-05, "loss": 0.1711, "loss_nan_ranks": 0, "loss_rank_avg": 0.18919895589351654, "step": 5135 }, { "epoch": 2.6944444444444446, "grad_norm": 0.6863317489624023, "learning_rate": 3.090738977198437e-05, "loss": 0.168, "loss_nan_ranks": 0, "loss_rank_avg": 0.16847869753837585, "step": 5140 }, { "epoch": 2.69706498951782, "grad_norm": 0.5800849199295044, "learning_rate": 3.088547306336126e-05, "loss": 0.179, "loss_nan_ranks": 0, "loss_rank_avg": 0.17609632015228271, "step": 5145 }, { "epoch": 2.699685534591195, "grad_norm": 0.636333167552948, "learning_rate": 3.08635377647949e-05, "loss": 0.1696, "loss_nan_ranks": 0, "loss_rank_avg": 0.19171679019927979, "step": 5150 }, { "epoch": 2.70230607966457, "grad_norm": 0.6417024731636047, "learning_rate": 3.084158391374583e-05, "loss": 0.1658, "loss_nan_ranks": 0, "loss_rank_avg": 0.18236128985881805, "step": 5155 }, { "epoch": 2.7049266247379453, "grad_norm": 0.5931425094604492, "learning_rate": 3.08196115477063e-05, "loss": 0.1753, "loss_nan_ranks": 0, "loss_rank_avg": 0.17998279631137848, "step": 5160 }, { "epoch": 2.7075471698113205, "grad_norm": 0.5689296126365662, "learning_rate": 3.0797620704200186e-05, "loss": 0.159, "loss_nan_ranks": 0, "loss_rank_avg": 0.15252912044525146, "step": 5165 }, { "epoch": 2.710167714884696, "grad_norm": 0.7568827271461487, "learning_rate": 3.07756114207829e-05, "loss": 0.1622, "loss_nan_ranks": 0, "loss_rank_avg": 0.157958984375, "step": 5170 }, { "epoch": 2.7127882599580713, "grad_norm": 0.6383182406425476, "learning_rate": 3.0753583735041365e-05, "loss": 0.1955, "loss_nan_ranks": 0, "loss_rank_avg": 0.22475168108940125, "step": 5175 }, { "epoch": 2.7154088050314464, "grad_norm": 0.7478203773498535, "learning_rate": 3.073153768459391e-05, "loss": 0.1759, "loss_nan_ranks": 0, "loss_rank_avg": 0.1389104276895523, "step": 5180 }, { "epoch": 2.718029350104822, "grad_norm": 0.8033480644226074, "learning_rate": 3.0709473307090244e-05, "loss": 0.1814, "loss_nan_ranks": 0, "loss_rank_avg": 0.17884859442710876, "step": 5185 }, { "epoch": 2.720649895178197, "grad_norm": 0.6497260332107544, "learning_rate": 3.0687390640211374e-05, "loss": 0.1926, "loss_nan_ranks": 0, "loss_rank_avg": 0.20827674865722656, "step": 5190 }, { "epoch": 2.7232704402515724, "grad_norm": 0.5410135388374329, "learning_rate": 3.0665289721669526e-05, "loss": 0.1703, "loss_nan_ranks": 0, "loss_rank_avg": 0.1410600244998932, "step": 5195 }, { "epoch": 2.7258909853249476, "grad_norm": 0.661300003528595, "learning_rate": 3.064317058920811e-05, "loss": 0.1626, "loss_nan_ranks": 0, "loss_rank_avg": 0.15847226977348328, "step": 5200 }, { "epoch": 2.7285115303983227, "grad_norm": 0.6560903787612915, "learning_rate": 3.062103328060164e-05, "loss": 0.1691, "loss_nan_ranks": 0, "loss_rank_avg": 0.16378501057624817, "step": 5205 }, { "epoch": 2.731132075471698, "grad_norm": 0.5811970233917236, "learning_rate": 3.0598877833655654e-05, "loss": 0.1674, "loss_nan_ranks": 0, "loss_rank_avg": 0.17363318800926208, "step": 5210 }, { "epoch": 2.7337526205450735, "grad_norm": 0.5654155015945435, "learning_rate": 3.057670428620669e-05, "loss": 0.1885, "loss_nan_ranks": 0, "loss_rank_avg": 0.16175119578838348, "step": 5215 }, { "epoch": 2.7363731656184487, "grad_norm": 0.6081898212432861, "learning_rate": 3.0554512676122196e-05, "loss": 0.1812, "loss_nan_ranks": 0, "loss_rank_avg": 0.16766831278800964, "step": 5220 }, { "epoch": 2.738993710691824, "grad_norm": 0.6194442510604858, "learning_rate": 3.053230304130043e-05, "loss": 0.1618, "loss_nan_ranks": 0, "loss_rank_avg": 0.15402266383171082, "step": 5225 }, { "epoch": 2.741614255765199, "grad_norm": 0.6389869451522827, "learning_rate": 3.0510075419670496e-05, "loss": 0.1867, "loss_nan_ranks": 0, "loss_rank_avg": 0.1833145022392273, "step": 5230 }, { "epoch": 2.7442348008385746, "grad_norm": 0.6671640276908875, "learning_rate": 3.048782984919215e-05, "loss": 0.167, "loss_nan_ranks": 0, "loss_rank_avg": 0.1536865234375, "step": 5235 }, { "epoch": 2.74685534591195, "grad_norm": 0.6398336887359619, "learning_rate": 3.0465566367855847e-05, "loss": 0.1835, "loss_nan_ranks": 0, "loss_rank_avg": 0.20122002065181732, "step": 5240 }, { "epoch": 2.749475890985325, "grad_norm": 0.6063717007637024, "learning_rate": 3.044328501368261e-05, "loss": 0.1724, "loss_nan_ranks": 0, "loss_rank_avg": 0.22785818576812744, "step": 5245 }, { "epoch": 2.7520964360587, "grad_norm": 0.6103911399841309, "learning_rate": 3.0420985824723984e-05, "loss": 0.162, "loss_nan_ranks": 0, "loss_rank_avg": 0.18639829754829407, "step": 5250 }, { "epoch": 2.7547169811320753, "grad_norm": 0.6719363927841187, "learning_rate": 3.0398668839061978e-05, "loss": 0.1801, "loss_nan_ranks": 0, "loss_rank_avg": 0.18195107579231262, "step": 5255 }, { "epoch": 2.7573375262054505, "grad_norm": 0.6338904500007629, "learning_rate": 3.037633409480899e-05, "loss": 0.1782, "loss_nan_ranks": 0, "loss_rank_avg": 0.17371147871017456, "step": 5260 }, { "epoch": 2.759958071278826, "grad_norm": 0.6469314098358154, "learning_rate": 3.0353981630107748e-05, "loss": 0.1709, "loss_nan_ranks": 0, "loss_rank_avg": 0.13950802385807037, "step": 5265 }, { "epoch": 2.7625786163522013, "grad_norm": 0.6446741819381714, "learning_rate": 3.0331611483131245e-05, "loss": 0.1882, "loss_nan_ranks": 0, "loss_rank_avg": 0.1679340898990631, "step": 5270 }, { "epoch": 2.7651991614255764, "grad_norm": 0.7273684144020081, "learning_rate": 3.0309223692082663e-05, "loss": 0.1651, "loss_nan_ranks": 0, "loss_rank_avg": 0.13474009931087494, "step": 5275 }, { "epoch": 2.767819706498952, "grad_norm": 0.6342905759811401, "learning_rate": 3.028681829519532e-05, "loss": 0.1864, "loss_nan_ranks": 0, "loss_rank_avg": 0.21595671772956848, "step": 5280 }, { "epoch": 2.770440251572327, "grad_norm": 0.7033205032348633, "learning_rate": 3.026439533073261e-05, "loss": 0.1768, "loss_nan_ranks": 0, "loss_rank_avg": 0.20636935532093048, "step": 5285 }, { "epoch": 2.7730607966457024, "grad_norm": 0.6942194700241089, "learning_rate": 3.0241954836987916e-05, "loss": 0.1695, "loss_nan_ranks": 0, "loss_rank_avg": 0.1712348610162735, "step": 5290 }, { "epoch": 2.7756813417190775, "grad_norm": 0.5782949924468994, "learning_rate": 3.0219496852284558e-05, "loss": 0.1783, "loss_nan_ranks": 0, "loss_rank_avg": 0.18249915540218353, "step": 5295 }, { "epoch": 2.7783018867924527, "grad_norm": 0.6877314448356628, "learning_rate": 3.0197021414975735e-05, "loss": 0.1678, "loss_nan_ranks": 0, "loss_rank_avg": 0.166259765625, "step": 5300 }, { "epoch": 2.780922431865828, "grad_norm": 0.6748788952827454, "learning_rate": 3.0174528563444447e-05, "loss": 0.1734, "loss_nan_ranks": 0, "loss_rank_avg": 0.14892578125, "step": 5305 }, { "epoch": 2.7835429769392035, "grad_norm": 0.8148460984230042, "learning_rate": 3.0152018336103427e-05, "loss": 0.1756, "loss_nan_ranks": 0, "loss_rank_avg": 0.14063332974910736, "step": 5310 }, { "epoch": 2.7861635220125787, "grad_norm": 1.9282485246658325, "learning_rate": 3.0129490771395086e-05, "loss": 0.1776, "loss_nan_ranks": 0, "loss_rank_avg": 0.19029554724693298, "step": 5315 }, { "epoch": 2.788784067085954, "grad_norm": 0.6980833411216736, "learning_rate": 3.0106945907791455e-05, "loss": 0.1888, "loss_nan_ranks": 0, "loss_rank_avg": 0.147705078125, "step": 5320 }, { "epoch": 2.791404612159329, "grad_norm": 0.5489820241928101, "learning_rate": 3.0084383783794094e-05, "loss": 0.1751, "loss_nan_ranks": 0, "loss_rank_avg": 0.1689077913761139, "step": 5325 }, { "epoch": 2.7940251572327046, "grad_norm": 0.6219632029533386, "learning_rate": 3.0061804437934037e-05, "loss": 0.1762, "loss_nan_ranks": 0, "loss_rank_avg": 0.18127012252807617, "step": 5330 }, { "epoch": 2.79664570230608, "grad_norm": 0.5704904794692993, "learning_rate": 3.0039207908771747e-05, "loss": 0.1701, "loss_nan_ranks": 0, "loss_rank_avg": 0.17348316311836243, "step": 5335 }, { "epoch": 2.799266247379455, "grad_norm": 0.6238038539886475, "learning_rate": 3.0016594234897015e-05, "loss": 0.1823, "loss_nan_ranks": 0, "loss_rank_avg": 0.156494140625, "step": 5340 }, { "epoch": 2.80188679245283, "grad_norm": 0.6026485562324524, "learning_rate": 2.9993963454928914e-05, "loss": 0.1852, "loss_nan_ranks": 0, "loss_rank_avg": 0.18142390251159668, "step": 5345 }, { "epoch": 2.8045073375262053, "grad_norm": 0.7556951642036438, "learning_rate": 2.997131560751574e-05, "loss": 0.164, "loss_nan_ranks": 0, "loss_rank_avg": 0.1475830078125, "step": 5350 }, { "epoch": 2.8071278825995805, "grad_norm": 0.6020495295524597, "learning_rate": 2.994865073133492e-05, "loss": 0.1864, "loss_nan_ranks": 0, "loss_rank_avg": 0.20331259071826935, "step": 5355 }, { "epoch": 2.809748427672956, "grad_norm": 0.582724928855896, "learning_rate": 2.9925968865092994e-05, "loss": 0.1591, "loss_nan_ranks": 0, "loss_rank_avg": 0.16749607026576996, "step": 5360 }, { "epoch": 2.8123689727463312, "grad_norm": 0.5949836373329163, "learning_rate": 2.9903270047525467e-05, "loss": 0.1806, "loss_nan_ranks": 0, "loss_rank_avg": 0.19134676456451416, "step": 5365 }, { "epoch": 2.8149895178197064, "grad_norm": 0.5697270035743713, "learning_rate": 2.9880554317396843e-05, "loss": 0.1871, "loss_nan_ranks": 0, "loss_rank_avg": 0.21274207532405853, "step": 5370 }, { "epoch": 2.817610062893082, "grad_norm": 0.5818173885345459, "learning_rate": 2.985782171350048e-05, "loss": 0.1716, "loss_nan_ranks": 0, "loss_rank_avg": 0.16496126353740692, "step": 5375 }, { "epoch": 2.820230607966457, "grad_norm": 0.9107900261878967, "learning_rate": 2.9835072274658556e-05, "loss": 0.1782, "loss_nan_ranks": 0, "loss_rank_avg": 0.1845887154340744, "step": 5380 }, { "epoch": 2.8228511530398324, "grad_norm": 0.6681698560714722, "learning_rate": 2.9812306039722016e-05, "loss": 0.171, "loss_nan_ranks": 0, "loss_rank_avg": 0.14111310243606567, "step": 5385 }, { "epoch": 2.8254716981132075, "grad_norm": 0.58545982837677, "learning_rate": 2.978952304757045e-05, "loss": 0.1884, "loss_nan_ranks": 0, "loss_rank_avg": 0.17585648596286774, "step": 5390 }, { "epoch": 2.8280922431865827, "grad_norm": 0.5710182189941406, "learning_rate": 2.9766723337112124e-05, "loss": 0.1856, "loss_nan_ranks": 0, "loss_rank_avg": 0.1905125230550766, "step": 5395 }, { "epoch": 2.830712788259958, "grad_norm": 0.6644915342330933, "learning_rate": 2.974390694728381e-05, "loss": 0.1737, "loss_nan_ranks": 0, "loss_rank_avg": 0.18250641226768494, "step": 5400 }, { "epoch": 2.8333333333333335, "grad_norm": 0.7214245200157166, "learning_rate": 2.972107391705077e-05, "loss": 0.1482, "loss_nan_ranks": 0, "loss_rank_avg": 0.1199951171875, "step": 5405 }, { "epoch": 2.8359538784067087, "grad_norm": 0.5463569164276123, "learning_rate": 2.9698224285406697e-05, "loss": 0.1874, "loss_nan_ranks": 0, "loss_rank_avg": 0.22129079699516296, "step": 5410 }, { "epoch": 2.838574423480084, "grad_norm": 0.6725033521652222, "learning_rate": 2.9675358091373634e-05, "loss": 0.1566, "loss_nan_ranks": 0, "loss_rank_avg": 0.16396062076091766, "step": 5415 }, { "epoch": 2.841194968553459, "grad_norm": 0.7897777557373047, "learning_rate": 2.9652475374001898e-05, "loss": 0.1646, "loss_nan_ranks": 0, "loss_rank_avg": 0.1701788604259491, "step": 5420 }, { "epoch": 2.8438155136268346, "grad_norm": 0.6413025856018066, "learning_rate": 2.9629576172370035e-05, "loss": 0.1797, "loss_nan_ranks": 0, "loss_rank_avg": 0.18991181254386902, "step": 5425 }, { "epoch": 2.8464360587002098, "grad_norm": 0.5760217905044556, "learning_rate": 2.960666052558474e-05, "loss": 0.1713, "loss_nan_ranks": 0, "loss_rank_avg": 0.16620710492134094, "step": 5430 }, { "epoch": 2.849056603773585, "grad_norm": 0.5823150873184204, "learning_rate": 2.9583728472780787e-05, "loss": 0.1664, "loss_nan_ranks": 0, "loss_rank_avg": 0.1885368973016739, "step": 5435 }, { "epoch": 2.85167714884696, "grad_norm": 0.5929847359657288, "learning_rate": 2.9560780053120982e-05, "loss": 0.1579, "loss_nan_ranks": 0, "loss_rank_avg": 0.1745993196964264, "step": 5440 }, { "epoch": 2.8542976939203353, "grad_norm": 0.5878052711486816, "learning_rate": 2.9537815305796056e-05, "loss": 0.1917, "loss_nan_ranks": 0, "loss_rank_avg": 0.2162623107433319, "step": 5445 }, { "epoch": 2.8569182389937104, "grad_norm": 0.595242977142334, "learning_rate": 2.951483427002465e-05, "loss": 0.1645, "loss_nan_ranks": 0, "loss_rank_avg": 0.16339543461799622, "step": 5450 }, { "epoch": 2.859538784067086, "grad_norm": 0.6756539940834045, "learning_rate": 2.9491836985053215e-05, "loss": 0.1804, "loss_nan_ranks": 0, "loss_rank_avg": 0.19334867596626282, "step": 5455 }, { "epoch": 2.8621593291404612, "grad_norm": 0.6055642366409302, "learning_rate": 2.946882349015594e-05, "loss": 0.1744, "loss_nan_ranks": 0, "loss_rank_avg": 0.22528602182865143, "step": 5460 }, { "epoch": 2.8647798742138364, "grad_norm": 0.6377682089805603, "learning_rate": 2.9445793824634715e-05, "loss": 0.199, "loss_nan_ranks": 0, "loss_rank_avg": 0.17041015625, "step": 5465 }, { "epoch": 2.867400419287212, "grad_norm": 0.6229009032249451, "learning_rate": 2.9422748027819025e-05, "loss": 0.1665, "loss_nan_ranks": 0, "loss_rank_avg": 0.162109375, "step": 5470 }, { "epoch": 2.870020964360587, "grad_norm": 0.6600066423416138, "learning_rate": 2.9399686139065924e-05, "loss": 0.1772, "loss_nan_ranks": 0, "loss_rank_avg": 0.2039642333984375, "step": 5475 }, { "epoch": 2.8726415094339623, "grad_norm": 0.5950238704681396, "learning_rate": 2.9376608197759934e-05, "loss": 0.1907, "loss_nan_ranks": 0, "loss_rank_avg": 0.16557833552360535, "step": 5480 }, { "epoch": 2.8752620545073375, "grad_norm": 0.7862025499343872, "learning_rate": 2.9353514243313004e-05, "loss": 0.1623, "loss_nan_ranks": 0, "loss_rank_avg": 0.14168497920036316, "step": 5485 }, { "epoch": 2.8778825995807127, "grad_norm": 0.6496447324752808, "learning_rate": 2.9330404315164413e-05, "loss": 0.1996, "loss_nan_ranks": 0, "loss_rank_avg": 0.2142472267150879, "step": 5490 }, { "epoch": 2.880503144654088, "grad_norm": 0.6302199959754944, "learning_rate": 2.9307278452780726e-05, "loss": 0.1654, "loss_nan_ranks": 0, "loss_rank_avg": 0.13268312811851501, "step": 5495 }, { "epoch": 2.8831236897274635, "grad_norm": 0.8944174647331238, "learning_rate": 2.928413669565573e-05, "loss": 0.1755, "loss_nan_ranks": 0, "loss_rank_avg": 0.16154518723487854, "step": 5500 }, { "epoch": 2.8857442348008386, "grad_norm": 0.6077420711517334, "learning_rate": 2.9260979083310345e-05, "loss": 0.1724, "loss_nan_ranks": 0, "loss_rank_avg": 0.16899073123931885, "step": 5505 }, { "epoch": 2.888364779874214, "grad_norm": 0.7525168061256409, "learning_rate": 2.9237805655292572e-05, "loss": 0.1757, "loss_nan_ranks": 0, "loss_rank_avg": 0.14932957291603088, "step": 5510 }, { "epoch": 2.890985324947589, "grad_norm": 0.7273001074790955, "learning_rate": 2.921461645117743e-05, "loss": 0.1752, "loss_nan_ranks": 0, "loss_rank_avg": 0.17291836440563202, "step": 5515 }, { "epoch": 2.8936058700209646, "grad_norm": 0.6908173561096191, "learning_rate": 2.9191411510566852e-05, "loss": 0.1871, "loss_nan_ranks": 0, "loss_rank_avg": 0.1522650569677353, "step": 5520 }, { "epoch": 2.8962264150943398, "grad_norm": 0.6004059910774231, "learning_rate": 2.9168190873089685e-05, "loss": 0.1668, "loss_nan_ranks": 0, "loss_rank_avg": 0.18317842483520508, "step": 5525 }, { "epoch": 2.898846960167715, "grad_norm": 0.7183316349983215, "learning_rate": 2.9144954578401558e-05, "loss": 0.1937, "loss_nan_ranks": 0, "loss_rank_avg": 0.158203125, "step": 5530 }, { "epoch": 2.90146750524109, "grad_norm": 0.6961800456047058, "learning_rate": 2.912170266618483e-05, "loss": 0.1768, "loss_nan_ranks": 0, "loss_rank_avg": 0.18650424480438232, "step": 5535 }, { "epoch": 2.9040880503144653, "grad_norm": 0.9425960183143616, "learning_rate": 2.9098435176148567e-05, "loss": 0.1725, "loss_nan_ranks": 0, "loss_rank_avg": 0.20058107376098633, "step": 5540 }, { "epoch": 2.9067085953878404, "grad_norm": 0.5319423675537109, "learning_rate": 2.9075152148028394e-05, "loss": 0.1646, "loss_nan_ranks": 0, "loss_rank_avg": 0.13040751218795776, "step": 5545 }, { "epoch": 2.909329140461216, "grad_norm": 0.6470221877098083, "learning_rate": 2.9051853621586513e-05, "loss": 0.1841, "loss_nan_ranks": 0, "loss_rank_avg": 0.20197980105876923, "step": 5550 }, { "epoch": 2.911949685534591, "grad_norm": 0.7041011452674866, "learning_rate": 2.9028539636611567e-05, "loss": 0.1671, "loss_nan_ranks": 0, "loss_rank_avg": 0.18308019638061523, "step": 5555 }, { "epoch": 2.9145702306079664, "grad_norm": 0.6379269957542419, "learning_rate": 2.9005210232918596e-05, "loss": 0.1714, "loss_nan_ranks": 0, "loss_rank_avg": 0.1962052583694458, "step": 5560 }, { "epoch": 2.917190775681342, "grad_norm": 0.6516327261924744, "learning_rate": 2.8981865450349006e-05, "loss": 0.1909, "loss_nan_ranks": 0, "loss_rank_avg": 0.18401208519935608, "step": 5565 }, { "epoch": 2.919811320754717, "grad_norm": 0.5459520220756531, "learning_rate": 2.8958505328770415e-05, "loss": 0.2012, "loss_nan_ranks": 0, "loss_rank_avg": 0.23481181263923645, "step": 5570 }, { "epoch": 2.9224318658280923, "grad_norm": 0.7334117889404297, "learning_rate": 2.893512990807669e-05, "loss": 0.1936, "loss_nan_ranks": 0, "loss_rank_avg": 0.19439546763896942, "step": 5575 }, { "epoch": 2.9250524109014675, "grad_norm": 0.6233519315719604, "learning_rate": 2.8911739228187782e-05, "loss": 0.1503, "loss_nan_ranks": 0, "loss_rank_avg": 0.1621095985174179, "step": 5580 }, { "epoch": 2.9276729559748427, "grad_norm": 0.5482489466667175, "learning_rate": 2.8888333329049728e-05, "loss": 0.1725, "loss_nan_ranks": 0, "loss_rank_avg": 0.19045138359069824, "step": 5585 }, { "epoch": 2.930293501048218, "grad_norm": 0.6794843077659607, "learning_rate": 2.8864912250634543e-05, "loss": 0.1766, "loss_nan_ranks": 0, "loss_rank_avg": 0.16787664592266083, "step": 5590 }, { "epoch": 2.9329140461215935, "grad_norm": 0.5271611213684082, "learning_rate": 2.8841476032940162e-05, "loss": 0.1553, "loss_nan_ranks": 0, "loss_rank_avg": 0.15002946555614471, "step": 5595 }, { "epoch": 2.9355345911949686, "grad_norm": 0.5626645684242249, "learning_rate": 2.88180247159904e-05, "loss": 0.1659, "loss_nan_ranks": 0, "loss_rank_avg": 0.15474967658519745, "step": 5600 }, { "epoch": 2.938155136268344, "grad_norm": 0.5897443294525146, "learning_rate": 2.8794558339834825e-05, "loss": 0.1713, "loss_nan_ranks": 0, "loss_rank_avg": 0.16423803567886353, "step": 5605 }, { "epoch": 2.940775681341719, "grad_norm": 0.5693492889404297, "learning_rate": 2.877107694454874e-05, "loss": 0.1895, "loss_nan_ranks": 0, "loss_rank_avg": 0.1666092872619629, "step": 5610 }, { "epoch": 2.9433962264150946, "grad_norm": 0.5590540170669556, "learning_rate": 2.8747580570233098e-05, "loss": 0.1765, "loss_nan_ranks": 0, "loss_rank_avg": 0.1851804554462433, "step": 5615 }, { "epoch": 2.9460167714884697, "grad_norm": 0.5822227001190186, "learning_rate": 2.8724069257014425e-05, "loss": 0.1761, "loss_nan_ranks": 0, "loss_rank_avg": 0.174712672829628, "step": 5620 }, { "epoch": 2.948637316561845, "grad_norm": 0.5724350214004517, "learning_rate": 2.8700543045044767e-05, "loss": 0.1769, "loss_nan_ranks": 0, "loss_rank_avg": 0.2074946165084839, "step": 5625 }, { "epoch": 2.95125786163522, "grad_norm": 0.6528369188308716, "learning_rate": 2.8677001974501607e-05, "loss": 0.1674, "loss_nan_ranks": 0, "loss_rank_avg": 0.12680822610855103, "step": 5630 }, { "epoch": 2.9538784067085953, "grad_norm": 0.5912206172943115, "learning_rate": 2.865344608558781e-05, "loss": 0.186, "loss_nan_ranks": 0, "loss_rank_avg": 0.1962258368730545, "step": 5635 }, { "epoch": 2.9564989517819704, "grad_norm": 0.9319872856140137, "learning_rate": 2.8629875418531542e-05, "loss": 0.1814, "loss_nan_ranks": 0, "loss_rank_avg": 0.151123046875, "step": 5640 }, { "epoch": 2.959119496855346, "grad_norm": 0.5565921664237976, "learning_rate": 2.860629001358621e-05, "loss": 0.1837, "loss_nan_ranks": 0, "loss_rank_avg": 0.20103786885738373, "step": 5645 }, { "epoch": 2.961740041928721, "grad_norm": 0.7086861729621887, "learning_rate": 2.8582689911030383e-05, "loss": 0.1653, "loss_nan_ranks": 0, "loss_rank_avg": 0.2012900412082672, "step": 5650 }, { "epoch": 2.9643605870020964, "grad_norm": 0.508179247379303, "learning_rate": 2.8559075151167745e-05, "loss": 0.1869, "loss_nan_ranks": 0, "loss_rank_avg": 0.1763106882572174, "step": 5655 }, { "epoch": 2.9669811320754715, "grad_norm": 0.5380913019180298, "learning_rate": 2.8535445774326994e-05, "loss": 0.1653, "loss_nan_ranks": 0, "loss_rank_avg": 0.20369559526443481, "step": 5660 }, { "epoch": 2.969601677148847, "grad_norm": 0.6445121169090271, "learning_rate": 2.8511801820861807e-05, "loss": 0.1866, "loss_nan_ranks": 0, "loss_rank_avg": 0.1454983353614807, "step": 5665 }, { "epoch": 2.9722222222222223, "grad_norm": 0.6975541114807129, "learning_rate": 2.8488143331150743e-05, "loss": 0.1544, "loss_nan_ranks": 0, "loss_rank_avg": 0.15515579283237457, "step": 5670 }, { "epoch": 2.9748427672955975, "grad_norm": 0.6473163366317749, "learning_rate": 2.8464470345597184e-05, "loss": 0.1895, "loss_nan_ranks": 0, "loss_rank_avg": 0.20755667984485626, "step": 5675 }, { "epoch": 2.9774633123689727, "grad_norm": 0.6191983222961426, "learning_rate": 2.844078290462928e-05, "loss": 0.1623, "loss_nan_ranks": 0, "loss_rank_avg": 0.17519129812717438, "step": 5680 }, { "epoch": 2.980083857442348, "grad_norm": 0.6182118654251099, "learning_rate": 2.8417081048699855e-05, "loss": 0.1912, "loss_nan_ranks": 0, "loss_rank_avg": 0.17867231369018555, "step": 5685 }, { "epoch": 2.982704402515723, "grad_norm": 0.6826756596565247, "learning_rate": 2.8393364818286363e-05, "loss": 0.1789, "loss_nan_ranks": 0, "loss_rank_avg": 0.159912109375, "step": 5690 }, { "epoch": 2.9853249475890986, "grad_norm": 0.5628302097320557, "learning_rate": 2.8369634253890797e-05, "loss": 0.1651, "loss_nan_ranks": 0, "loss_rank_avg": 0.1419781893491745, "step": 5695 }, { "epoch": 2.987945492662474, "grad_norm": 0.6937417387962341, "learning_rate": 2.8345889396039615e-05, "loss": 0.1657, "loss_nan_ranks": 0, "loss_rank_avg": 0.15064573287963867, "step": 5700 }, { "epoch": 2.990566037735849, "grad_norm": 0.5172299742698669, "learning_rate": 2.8322130285283725e-05, "loss": 0.1806, "loss_nan_ranks": 0, "loss_rank_avg": 0.2054455578327179, "step": 5705 }, { "epoch": 2.9931865828092246, "grad_norm": 0.6245455741882324, "learning_rate": 2.829835696219834e-05, "loss": 0.1897, "loss_nan_ranks": 0, "loss_rank_avg": 0.1823698729276657, "step": 5710 }, { "epoch": 2.9958071278825997, "grad_norm": 0.6723186373710632, "learning_rate": 2.8274569467382962e-05, "loss": 0.1753, "loss_nan_ranks": 0, "loss_rank_avg": 0.17229504883289337, "step": 5715 }, { "epoch": 2.998427672955975, "grad_norm": 0.87140291929245, "learning_rate": 2.8250767841461283e-05, "loss": 0.1684, "loss_nan_ranks": 0, "loss_rank_avg": 0.17452089488506317, "step": 5720 }, { "epoch": 3.001572327044025, "grad_norm": 0.6093711853027344, "learning_rate": 2.822695212508114e-05, "loss": 0.1416, "loss_nan_ranks": 0, "loss_rank_avg": 0.15838420391082764, "step": 5725 }, { "epoch": 3.0041928721174003, "grad_norm": 0.6168466210365295, "learning_rate": 2.820312235891443e-05, "loss": 0.1788, "loss_nan_ranks": 0, "loss_rank_avg": 0.1453549861907959, "step": 5730 }, { "epoch": 3.006813417190776, "grad_norm": 0.6319136619567871, "learning_rate": 2.8179278583657034e-05, "loss": 0.1445, "loss_nan_ranks": 0, "loss_rank_avg": 0.165153369307518, "step": 5735 }, { "epoch": 3.009433962264151, "grad_norm": 0.6942062973976135, "learning_rate": 2.8155420840028767e-05, "loss": 0.1551, "loss_nan_ranks": 0, "loss_rank_avg": 0.125732421875, "step": 5740 }, { "epoch": 3.012054507337526, "grad_norm": 0.7496764659881592, "learning_rate": 2.81315491687733e-05, "loss": 0.1497, "loss_nan_ranks": 0, "loss_rank_avg": 0.1387939453125, "step": 5745 }, { "epoch": 3.0146750524109014, "grad_norm": 0.7578096985816956, "learning_rate": 2.8107663610658087e-05, "loss": 0.1531, "loss_nan_ranks": 0, "loss_rank_avg": 0.1143798828125, "step": 5750 }, { "epoch": 3.0172955974842766, "grad_norm": 0.5947731137275696, "learning_rate": 2.80837642064743e-05, "loss": 0.1244, "loss_nan_ranks": 0, "loss_rank_avg": 0.13372820615768433, "step": 5755 }, { "epoch": 3.019916142557652, "grad_norm": 0.7277220487594604, "learning_rate": 2.8059850997036745e-05, "loss": 0.1628, "loss_nan_ranks": 0, "loss_rank_avg": 0.15905359387397766, "step": 5760 }, { "epoch": 3.0225366876310273, "grad_norm": 0.558275580406189, "learning_rate": 2.8035924023183816e-05, "loss": 0.1528, "loss_nan_ranks": 0, "loss_rank_avg": 0.15842102468013763, "step": 5765 }, { "epoch": 3.0251572327044025, "grad_norm": 0.7598258852958679, "learning_rate": 2.8011983325777415e-05, "loss": 0.1481, "loss_nan_ranks": 0, "loss_rank_avg": 0.1642048954963684, "step": 5770 }, { "epoch": 3.0277777777777777, "grad_norm": 0.7024770379066467, "learning_rate": 2.7988028945702874e-05, "loss": 0.1359, "loss_nan_ranks": 0, "loss_rank_avg": 0.12566912174224854, "step": 5775 }, { "epoch": 3.030398322851153, "grad_norm": 0.6844687461853027, "learning_rate": 2.7964060923868888e-05, "loss": 0.159, "loss_nan_ranks": 0, "loss_rank_avg": 0.14308127760887146, "step": 5780 }, { "epoch": 3.0330188679245285, "grad_norm": 0.7333658337593079, "learning_rate": 2.794007930120747e-05, "loss": 0.1508, "loss_nan_ranks": 0, "loss_rank_avg": 0.13852116465568542, "step": 5785 }, { "epoch": 3.0356394129979036, "grad_norm": 0.7345966100692749, "learning_rate": 2.791608411867383e-05, "loss": 0.1697, "loss_nan_ranks": 0, "loss_rank_avg": 0.1815803349018097, "step": 5790 }, { "epoch": 3.038259958071279, "grad_norm": 0.5574893355369568, "learning_rate": 2.789207541724636e-05, "loss": 0.1569, "loss_nan_ranks": 0, "loss_rank_avg": 0.17574036121368408, "step": 5795 }, { "epoch": 3.040880503144654, "grad_norm": 0.7342729568481445, "learning_rate": 2.7868053237926527e-05, "loss": 0.1498, "loss_nan_ranks": 0, "loss_rank_avg": 0.1334531009197235, "step": 5800 }, { "epoch": 3.043501048218029, "grad_norm": 0.8768147826194763, "learning_rate": 2.784401762173882e-05, "loss": 0.13, "loss_nan_ranks": 0, "loss_rank_avg": 0.12249755859375, "step": 5805 }, { "epoch": 3.0461215932914047, "grad_norm": 0.7460582256317139, "learning_rate": 2.7819968609730677e-05, "loss": 0.1546, "loss_nan_ranks": 0, "loss_rank_avg": 0.1651483029127121, "step": 5810 }, { "epoch": 3.04874213836478, "grad_norm": 0.6598678827285767, "learning_rate": 2.7795906242972396e-05, "loss": 0.1402, "loss_nan_ranks": 0, "loss_rank_avg": 0.13518714904785156, "step": 5815 }, { "epoch": 3.051362683438155, "grad_norm": 0.8546496033668518, "learning_rate": 2.7771830562557104e-05, "loss": 0.1446, "loss_nan_ranks": 0, "loss_rank_avg": 0.11572265625, "step": 5820 }, { "epoch": 3.0539832285115303, "grad_norm": 0.6218474507331848, "learning_rate": 2.774774160960066e-05, "loss": 0.1598, "loss_nan_ranks": 0, "loss_rank_avg": 0.17280012369155884, "step": 5825 }, { "epoch": 3.056603773584906, "grad_norm": 0.9477034211158752, "learning_rate": 2.7723639425241585e-05, "loss": 0.1439, "loss_nan_ranks": 0, "loss_rank_avg": 0.1212158203125, "step": 5830 }, { "epoch": 3.059224318658281, "grad_norm": 0.7466812133789062, "learning_rate": 2.769952405064099e-05, "loss": 0.15, "loss_nan_ranks": 0, "loss_rank_avg": 0.14890992641448975, "step": 5835 }, { "epoch": 3.061844863731656, "grad_norm": 0.7301759719848633, "learning_rate": 2.767539552698252e-05, "loss": 0.1327, "loss_nan_ranks": 0, "loss_rank_avg": 0.15387779474258423, "step": 5840 }, { "epoch": 3.0644654088050314, "grad_norm": 0.6939190030097961, "learning_rate": 2.7651253895472284e-05, "loss": 0.1638, "loss_nan_ranks": 0, "loss_rank_avg": 0.12840767204761505, "step": 5845 }, { "epoch": 3.0670859538784065, "grad_norm": 0.5036671757698059, "learning_rate": 2.7627099197338757e-05, "loss": 0.1701, "loss_nan_ranks": 0, "loss_rank_avg": 0.19116505980491638, "step": 5850 }, { "epoch": 3.069706498951782, "grad_norm": 0.6274353861808777, "learning_rate": 2.7602931473832736e-05, "loss": 0.1669, "loss_nan_ranks": 0, "loss_rank_avg": 0.1835746318101883, "step": 5855 }, { "epoch": 3.0723270440251573, "grad_norm": 0.7170372605323792, "learning_rate": 2.7578750766227272e-05, "loss": 0.1611, "loss_nan_ranks": 0, "loss_rank_avg": 0.1607617437839508, "step": 5860 }, { "epoch": 3.0749475890985325, "grad_norm": 0.5228092074394226, "learning_rate": 2.7554557115817588e-05, "loss": 0.1605, "loss_nan_ranks": 0, "loss_rank_avg": 0.15039002895355225, "step": 5865 }, { "epoch": 3.0775681341719077, "grad_norm": 0.6270290017127991, "learning_rate": 2.753035056392099e-05, "loss": 0.1513, "loss_nan_ranks": 0, "loss_rank_avg": 0.14429956674575806, "step": 5870 }, { "epoch": 3.080188679245283, "grad_norm": 0.7377024292945862, "learning_rate": 2.750613115187685e-05, "loss": 0.1174, "loss_nan_ranks": 0, "loss_rank_avg": 0.1204833984375, "step": 5875 }, { "epoch": 3.0828092243186584, "grad_norm": 0.7889018654823303, "learning_rate": 2.7481898921046462e-05, "loss": 0.1323, "loss_nan_ranks": 0, "loss_rank_avg": 0.10128215700387955, "step": 5880 }, { "epoch": 3.0854297693920336, "grad_norm": 0.8123045563697815, "learning_rate": 2.745765391281306e-05, "loss": 0.1408, "loss_nan_ranks": 0, "loss_rank_avg": 0.1146240234375, "step": 5885 }, { "epoch": 3.088050314465409, "grad_norm": 0.5776121616363525, "learning_rate": 2.7433396168581654e-05, "loss": 0.1621, "loss_nan_ranks": 0, "loss_rank_avg": 0.17960049211978912, "step": 5890 }, { "epoch": 3.090670859538784, "grad_norm": 0.72197026014328, "learning_rate": 2.740912572977903e-05, "loss": 0.1402, "loss_nan_ranks": 0, "loss_rank_avg": 0.14819848537445068, "step": 5895 }, { "epoch": 3.093291404612159, "grad_norm": 0.6737906336784363, "learning_rate": 2.738484263785365e-05, "loss": 0.1372, "loss_nan_ranks": 0, "loss_rank_avg": 0.1411348134279251, "step": 5900 }, { "epoch": 3.0959119496855347, "grad_norm": 0.6078870892524719, "learning_rate": 2.736054693427557e-05, "loss": 0.1614, "loss_nan_ranks": 0, "loss_rank_avg": 0.18547850847244263, "step": 5905 }, { "epoch": 3.09853249475891, "grad_norm": 0.6489272713661194, "learning_rate": 2.7336238660536413e-05, "loss": 0.15, "loss_nan_ranks": 0, "loss_rank_avg": 0.14268571138381958, "step": 5910 }, { "epoch": 3.101153039832285, "grad_norm": 0.6875291466712952, "learning_rate": 2.7311917858149226e-05, "loss": 0.1626, "loss_nan_ranks": 0, "loss_rank_avg": 0.17005501687526703, "step": 5915 }, { "epoch": 3.1037735849056602, "grad_norm": 0.7525482773780823, "learning_rate": 2.7287584568648507e-05, "loss": 0.1463, "loss_nan_ranks": 0, "loss_rank_avg": 0.15657764673233032, "step": 5920 }, { "epoch": 3.1063941299790354, "grad_norm": 0.6066417694091797, "learning_rate": 2.726323883359003e-05, "loss": 0.1491, "loss_nan_ranks": 0, "loss_rank_avg": 0.15884283185005188, "step": 5925 }, { "epoch": 3.109014675052411, "grad_norm": 0.6516925096511841, "learning_rate": 2.723888069455084e-05, "loss": 0.1502, "loss_nan_ranks": 0, "loss_rank_avg": 0.12646484375, "step": 5930 }, { "epoch": 3.111635220125786, "grad_norm": 0.6193258166313171, "learning_rate": 2.7214510193129186e-05, "loss": 0.1591, "loss_nan_ranks": 0, "loss_rank_avg": 0.18081727623939514, "step": 5935 }, { "epoch": 3.1142557651991614, "grad_norm": 0.6331426501274109, "learning_rate": 2.719012737094439e-05, "loss": 0.1487, "loss_nan_ranks": 0, "loss_rank_avg": 0.14574037492275238, "step": 5940 }, { "epoch": 3.1168763102725365, "grad_norm": 0.801331639289856, "learning_rate": 2.7165732269636863e-05, "loss": 0.1364, "loss_nan_ranks": 0, "loss_rank_avg": 0.109619140625, "step": 5945 }, { "epoch": 3.119496855345912, "grad_norm": 0.7135635614395142, "learning_rate": 2.714132493086793e-05, "loss": 0.1685, "loss_nan_ranks": 0, "loss_rank_avg": 0.16051837801933289, "step": 5950 }, { "epoch": 3.1221174004192873, "grad_norm": 0.6845545172691345, "learning_rate": 2.7116905396319863e-05, "loss": 0.1537, "loss_nan_ranks": 0, "loss_rank_avg": 0.18284091353416443, "step": 5955 }, { "epoch": 3.1247379454926625, "grad_norm": 0.8440403342247009, "learning_rate": 2.7092473707695737e-05, "loss": 0.1544, "loss_nan_ranks": 0, "loss_rank_avg": 0.16937442123889923, "step": 5960 }, { "epoch": 3.1273584905660377, "grad_norm": 0.6862518787384033, "learning_rate": 2.706802990671939e-05, "loss": 0.1583, "loss_nan_ranks": 0, "loss_rank_avg": 0.1287841796875, "step": 5965 }, { "epoch": 3.129979035639413, "grad_norm": 0.8747920393943787, "learning_rate": 2.704357403513534e-05, "loss": 0.1488, "loss_nan_ranks": 0, "loss_rank_avg": 0.14729902148246765, "step": 5970 }, { "epoch": 3.1325995807127884, "grad_norm": 0.8272216320037842, "learning_rate": 2.701910613470873e-05, "loss": 0.1534, "loss_nan_ranks": 0, "loss_rank_avg": 0.1635628640651703, "step": 5975 }, { "epoch": 3.1352201257861636, "grad_norm": 0.6622633337974548, "learning_rate": 2.699462624722523e-05, "loss": 0.1574, "loss_nan_ranks": 0, "loss_rank_avg": 0.16519419848918915, "step": 5980 }, { "epoch": 3.1378406708595388, "grad_norm": 0.6107540726661682, "learning_rate": 2.6970134414491e-05, "loss": 0.155, "loss_nan_ranks": 0, "loss_rank_avg": 0.17579969763755798, "step": 5985 }, { "epoch": 3.140461215932914, "grad_norm": 0.734446108341217, "learning_rate": 2.6945630678332584e-05, "loss": 0.1342, "loss_nan_ranks": 0, "loss_rank_avg": 0.15087890625, "step": 5990 }, { "epoch": 3.143081761006289, "grad_norm": 0.7071894407272339, "learning_rate": 2.692111508059686e-05, "loss": 0.1557, "loss_nan_ranks": 0, "loss_rank_avg": 0.15364445745944977, "step": 5995 }, { "epoch": 3.1457023060796647, "grad_norm": 0.5757654905319214, "learning_rate": 2.6896587663150965e-05, "loss": 0.1566, "loss_nan_ranks": 0, "loss_rank_avg": 0.16788052022457123, "step": 6000 }, { "epoch": 3.14832285115304, "grad_norm": 0.6631264090538025, "learning_rate": 2.687204846788222e-05, "loss": 0.1484, "loss_nan_ranks": 0, "loss_rank_avg": 0.13493452966213226, "step": 6005 }, { "epoch": 3.150943396226415, "grad_norm": 0.7087613344192505, "learning_rate": 2.6847497536698058e-05, "loss": 0.1547, "loss_nan_ranks": 0, "loss_rank_avg": 0.16542352735996246, "step": 6010 }, { "epoch": 3.1535639412997902, "grad_norm": 0.6962686777114868, "learning_rate": 2.6822934911525958e-05, "loss": 0.1514, "loss_nan_ranks": 0, "loss_rank_avg": 0.1555195450782776, "step": 6015 }, { "epoch": 3.1561844863731654, "grad_norm": 0.6705347895622253, "learning_rate": 2.679836063431336e-05, "loss": 0.1678, "loss_nan_ranks": 0, "loss_rank_avg": 0.1522950977087021, "step": 6020 }, { "epoch": 3.158805031446541, "grad_norm": 0.5783252716064453, "learning_rate": 2.677377474702762e-05, "loss": 0.1593, "loss_nan_ranks": 0, "loss_rank_avg": 0.17791618406772614, "step": 6025 }, { "epoch": 3.161425576519916, "grad_norm": 0.6777276396751404, "learning_rate": 2.6749177291655905e-05, "loss": 0.161, "loss_nan_ranks": 0, "loss_rank_avg": 0.15510901808738708, "step": 6030 }, { "epoch": 3.1640461215932913, "grad_norm": 0.6445097327232361, "learning_rate": 2.6724568310205153e-05, "loss": 0.1342, "loss_nan_ranks": 0, "loss_rank_avg": 0.18770700693130493, "step": 6035 }, { "epoch": 3.1666666666666665, "grad_norm": 0.6930022835731506, "learning_rate": 2.6699947844701967e-05, "loss": 0.1444, "loss_nan_ranks": 0, "loss_rank_avg": 0.17430582642555237, "step": 6040 }, { "epoch": 3.169287211740042, "grad_norm": 0.6261948943138123, "learning_rate": 2.6675315937192574e-05, "loss": 0.1619, "loss_nan_ranks": 0, "loss_rank_avg": 0.1623680591583252, "step": 6045 }, { "epoch": 3.1719077568134173, "grad_norm": 0.7149227261543274, "learning_rate": 2.665067262974275e-05, "loss": 0.1442, "loss_nan_ranks": 0, "loss_rank_avg": 0.1636398881673813, "step": 6050 }, { "epoch": 3.1745283018867925, "grad_norm": 0.6243942379951477, "learning_rate": 2.6626017964437726e-05, "loss": 0.1384, "loss_nan_ranks": 0, "loss_rank_avg": 0.13991810381412506, "step": 6055 }, { "epoch": 3.1771488469601676, "grad_norm": 0.649437665939331, "learning_rate": 2.6601351983382123e-05, "loss": 0.1553, "loss_nan_ranks": 0, "loss_rank_avg": 0.1520819365978241, "step": 6060 }, { "epoch": 3.179769392033543, "grad_norm": 0.6788381338119507, "learning_rate": 2.6576674728699905e-05, "loss": 0.1423, "loss_nan_ranks": 0, "loss_rank_avg": 0.16024520993232727, "step": 6065 }, { "epoch": 3.1823899371069184, "grad_norm": 0.7072592377662659, "learning_rate": 2.655198624253428e-05, "loss": 0.1478, "loss_nan_ranks": 0, "loss_rank_avg": 0.1642773151397705, "step": 6070 }, { "epoch": 3.1850104821802936, "grad_norm": 0.616181492805481, "learning_rate": 2.6527286567047634e-05, "loss": 0.1567, "loss_nan_ranks": 0, "loss_rank_avg": 0.177655428647995, "step": 6075 }, { "epoch": 3.1876310272536688, "grad_norm": 0.6885983943939209, "learning_rate": 2.6502575744421473e-05, "loss": 0.1589, "loss_nan_ranks": 0, "loss_rank_avg": 0.14435414969921112, "step": 6080 }, { "epoch": 3.190251572327044, "grad_norm": 0.7557274699211121, "learning_rate": 2.647785381685633e-05, "loss": 0.1676, "loss_nan_ranks": 0, "loss_rank_avg": 0.1525026261806488, "step": 6085 }, { "epoch": 3.192872117400419, "grad_norm": 0.5805322527885437, "learning_rate": 2.6453120826571705e-05, "loss": 0.1587, "loss_nan_ranks": 0, "loss_rank_avg": 0.13723093271255493, "step": 6090 }, { "epoch": 3.1954926624737947, "grad_norm": 0.5989310145378113, "learning_rate": 2.6428376815805984e-05, "loss": 0.1775, "loss_nan_ranks": 0, "loss_rank_avg": 0.18171349167823792, "step": 6095 }, { "epoch": 3.19811320754717, "grad_norm": 0.6322781443595886, "learning_rate": 2.6403621826816385e-05, "loss": 0.1446, "loss_nan_ranks": 0, "loss_rank_avg": 0.13691356778144836, "step": 6100 }, { "epoch": 3.200733752620545, "grad_norm": 0.6193546056747437, "learning_rate": 2.637885590187888e-05, "loss": 0.1678, "loss_nan_ranks": 0, "loss_rank_avg": 0.15641272068023682, "step": 6105 }, { "epoch": 3.20335429769392, "grad_norm": 0.6977561116218567, "learning_rate": 2.6354079083288087e-05, "loss": 0.1477, "loss_nan_ranks": 0, "loss_rank_avg": 0.17221683263778687, "step": 6110 }, { "epoch": 3.2059748427672954, "grad_norm": 0.6117526888847351, "learning_rate": 2.6329291413357263e-05, "loss": 0.1449, "loss_nan_ranks": 0, "loss_rank_avg": 0.18370969593524933, "step": 6115 }, { "epoch": 3.208595387840671, "grad_norm": 0.6177181601524353, "learning_rate": 2.630449293441818e-05, "loss": 0.1675, "loss_nan_ranks": 0, "loss_rank_avg": 0.14681090414524078, "step": 6120 }, { "epoch": 3.211215932914046, "grad_norm": 0.7336227297782898, "learning_rate": 2.6279683688821056e-05, "loss": 0.1809, "loss_nan_ranks": 0, "loss_rank_avg": 0.17036855220794678, "step": 6125 }, { "epoch": 3.2138364779874213, "grad_norm": 0.6561317443847656, "learning_rate": 2.6254863718934525e-05, "loss": 0.149, "loss_nan_ranks": 0, "loss_rank_avg": 0.15689235925674438, "step": 6130 }, { "epoch": 3.2164570230607965, "grad_norm": 0.5736268162727356, "learning_rate": 2.6230033067145516e-05, "loss": 0.1472, "loss_nan_ranks": 0, "loss_rank_avg": 0.17615295946598053, "step": 6135 }, { "epoch": 3.219077568134172, "grad_norm": 0.5321448445320129, "learning_rate": 2.620519177585921e-05, "loss": 0.1582, "loss_nan_ranks": 0, "loss_rank_avg": 0.20982366800308228, "step": 6140 }, { "epoch": 3.2216981132075473, "grad_norm": 0.6963729858398438, "learning_rate": 2.618033988749895e-05, "loss": 0.1672, "loss_nan_ranks": 0, "loss_rank_avg": 0.15652288496494293, "step": 6145 }, { "epoch": 3.2243186582809225, "grad_norm": 0.7338701486587524, "learning_rate": 2.615547744450618e-05, "loss": 0.1603, "loss_nan_ranks": 0, "loss_rank_avg": 0.184444397687912, "step": 6150 }, { "epoch": 3.2269392033542976, "grad_norm": 0.6008429527282715, "learning_rate": 2.6130604489340367e-05, "loss": 0.1651, "loss_nan_ranks": 0, "loss_rank_avg": 0.17205990850925446, "step": 6155 }, { "epoch": 3.229559748427673, "grad_norm": 0.677862823009491, "learning_rate": 2.610572106447894e-05, "loss": 0.1605, "loss_nan_ranks": 0, "loss_rank_avg": 0.19454920291900635, "step": 6160 }, { "epoch": 3.2321802935010484, "grad_norm": 0.6206879615783691, "learning_rate": 2.608082721241719e-05, "loss": 0.1636, "loss_nan_ranks": 0, "loss_rank_avg": 0.19852474331855774, "step": 6165 }, { "epoch": 3.2348008385744236, "grad_norm": 0.6645183563232422, "learning_rate": 2.6055922975668235e-05, "loss": 0.157, "loss_nan_ranks": 0, "loss_rank_avg": 0.14958696067333221, "step": 6170 }, { "epoch": 3.2374213836477987, "grad_norm": 0.7822976112365723, "learning_rate": 2.6031008396762908e-05, "loss": 0.1566, "loss_nan_ranks": 0, "loss_rank_avg": 0.14029009640216827, "step": 6175 }, { "epoch": 3.240041928721174, "grad_norm": 0.6289648413658142, "learning_rate": 2.6006083518249724e-05, "loss": 0.1766, "loss_nan_ranks": 0, "loss_rank_avg": 0.2056703120470047, "step": 6180 }, { "epoch": 3.242662473794549, "grad_norm": 0.5854620933532715, "learning_rate": 2.5981148382694773e-05, "loss": 0.1331, "loss_nan_ranks": 0, "loss_rank_avg": 0.1278085708618164, "step": 6185 }, { "epoch": 3.2452830188679247, "grad_norm": 0.7568283081054688, "learning_rate": 2.5956203032681667e-05, "loss": 0.1524, "loss_nan_ranks": 0, "loss_rank_avg": 0.12183448672294617, "step": 6190 }, { "epoch": 3.2479035639413, "grad_norm": 0.6330800652503967, "learning_rate": 2.5931247510811464e-05, "loss": 0.1484, "loss_nan_ranks": 0, "loss_rank_avg": 0.1322021484375, "step": 6195 }, { "epoch": 3.250524109014675, "grad_norm": 0.6357014179229736, "learning_rate": 2.5906281859702582e-05, "loss": 0.1583, "loss_nan_ranks": 0, "loss_rank_avg": 0.17333781719207764, "step": 6200 }, { "epoch": 3.25314465408805, "grad_norm": 0.7124892473220825, "learning_rate": 2.5881306121990758e-05, "loss": 0.162, "loss_nan_ranks": 0, "loss_rank_avg": 0.15561464428901672, "step": 6205 }, { "epoch": 3.2557651991614254, "grad_norm": 0.717791736125946, "learning_rate": 2.5856320340328934e-05, "loss": 0.1622, "loss_nan_ranks": 0, "loss_rank_avg": 0.15265150368213654, "step": 6210 }, { "epoch": 3.258385744234801, "grad_norm": 0.7117369771003723, "learning_rate": 2.5831324557387216e-05, "loss": 0.1658, "loss_nan_ranks": 0, "loss_rank_avg": 0.1384900063276291, "step": 6215 }, { "epoch": 3.261006289308176, "grad_norm": 0.6919196844100952, "learning_rate": 2.580631881585279e-05, "loss": 0.1444, "loss_nan_ranks": 0, "loss_rank_avg": 0.1550731062889099, "step": 6220 }, { "epoch": 3.2636268343815513, "grad_norm": 0.7333115339279175, "learning_rate": 2.5781303158429844e-05, "loss": 0.1526, "loss_nan_ranks": 0, "loss_rank_avg": 0.12255859375, "step": 6225 }, { "epoch": 3.2662473794549265, "grad_norm": 0.6055130362510681, "learning_rate": 2.575627762783951e-05, "loss": 0.1606, "loss_nan_ranks": 0, "loss_rank_avg": 0.23269812762737274, "step": 6230 }, { "epoch": 3.268867924528302, "grad_norm": 0.6821181178092957, "learning_rate": 2.573124226681976e-05, "loss": 0.1522, "loss_nan_ranks": 0, "loss_rank_avg": 0.13218367099761963, "step": 6235 }, { "epoch": 3.2714884696016773, "grad_norm": 0.7422898411750793, "learning_rate": 2.5706197118125375e-05, "loss": 0.1479, "loss_nan_ranks": 0, "loss_rank_avg": 0.15026451647281647, "step": 6240 }, { "epoch": 3.2741090146750524, "grad_norm": 0.66852205991745, "learning_rate": 2.568114222452785e-05, "loss": 0.1579, "loss_nan_ranks": 0, "loss_rank_avg": 0.1492151916027069, "step": 6245 }, { "epoch": 3.2767295597484276, "grad_norm": 0.6432539820671082, "learning_rate": 2.5656077628815305e-05, "loss": 0.163, "loss_nan_ranks": 0, "loss_rank_avg": 0.15138231217861176, "step": 6250 }, { "epoch": 3.279350104821803, "grad_norm": 0.6414669156074524, "learning_rate": 2.5631003373792452e-05, "loss": 0.1558, "loss_nan_ranks": 0, "loss_rank_avg": 0.19593289494514465, "step": 6255 }, { "epoch": 3.281970649895178, "grad_norm": 0.7199597954750061, "learning_rate": 2.5605919502280482e-05, "loss": 0.1368, "loss_nan_ranks": 0, "loss_rank_avg": 0.1262434870004654, "step": 6260 }, { "epoch": 3.2845911949685536, "grad_norm": 0.7075192928314209, "learning_rate": 2.5580826057117002e-05, "loss": 0.1455, "loss_nan_ranks": 0, "loss_rank_avg": 0.14794921875, "step": 6265 }, { "epoch": 3.2872117400419287, "grad_norm": 0.6006430983543396, "learning_rate": 2.5555723081156005e-05, "loss": 0.1533, "loss_nan_ranks": 0, "loss_rank_avg": 0.11506500095129013, "step": 6270 }, { "epoch": 3.289832285115304, "grad_norm": 0.5886109471321106, "learning_rate": 2.5530610617267718e-05, "loss": 0.1624, "loss_nan_ranks": 0, "loss_rank_avg": 0.14662334322929382, "step": 6275 }, { "epoch": 3.292452830188679, "grad_norm": 0.6336468458175659, "learning_rate": 2.5505488708338596e-05, "loss": 0.1612, "loss_nan_ranks": 0, "loss_rank_avg": 0.16814139485359192, "step": 6280 }, { "epoch": 3.2950733752620547, "grad_norm": 0.6334772706031799, "learning_rate": 2.5480357397271222e-05, "loss": 0.1519, "loss_nan_ranks": 0, "loss_rank_avg": 0.12603622674942017, "step": 6285 }, { "epoch": 3.29769392033543, "grad_norm": 0.779043436050415, "learning_rate": 2.5455216726984215e-05, "loss": 0.1492, "loss_nan_ranks": 0, "loss_rank_avg": 0.1370849609375, "step": 6290 }, { "epoch": 3.300314465408805, "grad_norm": 0.795427143573761, "learning_rate": 2.5430066740412214e-05, "loss": 0.1554, "loss_nan_ranks": 0, "loss_rank_avg": 0.11929628252983093, "step": 6295 }, { "epoch": 3.30293501048218, "grad_norm": 0.6863541603088379, "learning_rate": 2.5404907480505735e-05, "loss": 0.1562, "loss_nan_ranks": 0, "loss_rank_avg": 0.15934820473194122, "step": 6300 }, { "epoch": 3.3055555555555554, "grad_norm": 0.7187139391899109, "learning_rate": 2.537973899023114e-05, "loss": 0.1516, "loss_nan_ranks": 0, "loss_rank_avg": 0.15310056507587433, "step": 6305 }, { "epoch": 3.308176100628931, "grad_norm": 0.7239200472831726, "learning_rate": 2.535456131257057e-05, "loss": 0.1542, "loss_nan_ranks": 0, "loss_rank_avg": 0.15294574201107025, "step": 6310 }, { "epoch": 3.310796645702306, "grad_norm": 0.5413670539855957, "learning_rate": 2.5329374490521836e-05, "loss": 0.1567, "loss_nan_ranks": 0, "loss_rank_avg": 0.110429547727108, "step": 6315 }, { "epoch": 3.3134171907756813, "grad_norm": 0.6516697406768799, "learning_rate": 2.5304178567098374e-05, "loss": 0.1599, "loss_nan_ranks": 0, "loss_rank_avg": 0.17944911122322083, "step": 6320 }, { "epoch": 3.3160377358490565, "grad_norm": 0.669211745262146, "learning_rate": 2.5278973585329168e-05, "loss": 0.1404, "loss_nan_ranks": 0, "loss_rank_avg": 0.1333250105381012, "step": 6325 }, { "epoch": 3.318658280922432, "grad_norm": 0.7142298221588135, "learning_rate": 2.525375958825866e-05, "loss": 0.1642, "loss_nan_ranks": 0, "loss_rank_avg": 0.15886715054512024, "step": 6330 }, { "epoch": 3.3212788259958073, "grad_norm": 0.8329634666442871, "learning_rate": 2.52285366189467e-05, "loss": 0.152, "loss_nan_ranks": 0, "loss_rank_avg": 0.1321033090353012, "step": 6335 }, { "epoch": 3.3238993710691824, "grad_norm": 0.6707449555397034, "learning_rate": 2.5203304720468445e-05, "loss": 0.1566, "loss_nan_ranks": 0, "loss_rank_avg": 0.15341900289058685, "step": 6340 }, { "epoch": 3.3265199161425576, "grad_norm": 0.5349269509315491, "learning_rate": 2.5178063935914324e-05, "loss": 0.1453, "loss_nan_ranks": 0, "loss_rank_avg": 0.15282392501831055, "step": 6345 }, { "epoch": 3.3291404612159328, "grad_norm": 0.8673381209373474, "learning_rate": 2.515281430838992e-05, "loss": 0.169, "loss_nan_ranks": 0, "loss_rank_avg": 0.1706235706806183, "step": 6350 }, { "epoch": 3.331761006289308, "grad_norm": 0.6685834527015686, "learning_rate": 2.5127555881015923e-05, "loss": 0.1757, "loss_nan_ranks": 0, "loss_rank_avg": 0.2108490765094757, "step": 6355 }, { "epoch": 3.3343815513626835, "grad_norm": 0.7365850210189819, "learning_rate": 2.5102288696928066e-05, "loss": 0.15, "loss_nan_ranks": 0, "loss_rank_avg": 0.130615234375, "step": 6360 }, { "epoch": 3.3370020964360587, "grad_norm": 0.8285936713218689, "learning_rate": 2.5077012799277006e-05, "loss": 0.1495, "loss_nan_ranks": 0, "loss_rank_avg": 0.1509149670600891, "step": 6365 }, { "epoch": 3.339622641509434, "grad_norm": 0.6469640135765076, "learning_rate": 2.5051728231228322e-05, "loss": 0.158, "loss_nan_ranks": 0, "loss_rank_avg": 0.135266974568367, "step": 6370 }, { "epoch": 3.342243186582809, "grad_norm": 0.5959151387214661, "learning_rate": 2.502643503596237e-05, "loss": 0.1437, "loss_nan_ranks": 0, "loss_rank_avg": 0.151039719581604, "step": 6375 }, { "epoch": 3.3448637316561847, "grad_norm": 0.7715269327163696, "learning_rate": 2.5001133256674233e-05, "loss": 0.1503, "loss_nan_ranks": 0, "loss_rank_avg": 0.17917034029960632, "step": 6380 }, { "epoch": 3.34748427672956, "grad_norm": 0.6614984273910522, "learning_rate": 2.4975822936573684e-05, "loss": 0.1751, "loss_nan_ranks": 0, "loss_rank_avg": 0.1380983591079712, "step": 6385 }, { "epoch": 3.350104821802935, "grad_norm": 0.5487889051437378, "learning_rate": 2.495050411888506e-05, "loss": 0.1651, "loss_nan_ranks": 0, "loss_rank_avg": 0.20875106751918793, "step": 6390 }, { "epoch": 3.35272536687631, "grad_norm": 1.8391904830932617, "learning_rate": 2.4925176846847214e-05, "loss": 0.1486, "loss_nan_ranks": 0, "loss_rank_avg": 0.11009234189987183, "step": 6395 }, { "epoch": 3.3553459119496853, "grad_norm": 0.6677514314651489, "learning_rate": 2.489984116371344e-05, "loss": 0.1545, "loss_nan_ranks": 0, "loss_rank_avg": 0.1642158031463623, "step": 6400 }, { "epoch": 3.357966457023061, "grad_norm": 0.6820788383483887, "learning_rate": 2.4874497112751394e-05, "loss": 0.1502, "loss_nan_ranks": 0, "loss_rank_avg": 0.1800456941127777, "step": 6405 }, { "epoch": 3.360587002096436, "grad_norm": 0.6414058804512024, "learning_rate": 2.4849144737243026e-05, "loss": 0.1627, "loss_nan_ranks": 0, "loss_rank_avg": 0.17555800080299377, "step": 6410 }, { "epoch": 3.3632075471698113, "grad_norm": 0.7890809178352356, "learning_rate": 2.4823784080484495e-05, "loss": 0.1306, "loss_nan_ranks": 0, "loss_rank_avg": 0.12165604531764984, "step": 6415 }, { "epoch": 3.3658280922431865, "grad_norm": 0.8478096723556519, "learning_rate": 2.479841518578611e-05, "loss": 0.1442, "loss_nan_ranks": 0, "loss_rank_avg": 0.17640218138694763, "step": 6420 }, { "epoch": 3.368448637316562, "grad_norm": 0.6587736010551453, "learning_rate": 2.4773038096472247e-05, "loss": 0.1659, "loss_nan_ranks": 0, "loss_rank_avg": 0.15387940406799316, "step": 6425 }, { "epoch": 3.3710691823899372, "grad_norm": 0.6497980356216431, "learning_rate": 2.474765285588127e-05, "loss": 0.1407, "loss_nan_ranks": 0, "loss_rank_avg": 0.11815936118364334, "step": 6430 }, { "epoch": 3.3736897274633124, "grad_norm": 0.6267364621162415, "learning_rate": 2.4722259507365475e-05, "loss": 0.1585, "loss_nan_ranks": 0, "loss_rank_avg": 0.16179129481315613, "step": 6435 }, { "epoch": 3.3763102725366876, "grad_norm": 0.7024030089378357, "learning_rate": 2.4696858094290992e-05, "loss": 0.1309, "loss_nan_ranks": 0, "loss_rank_avg": 0.1190185546875, "step": 6440 }, { "epoch": 3.3789308176100628, "grad_norm": 0.7756052017211914, "learning_rate": 2.4671448660037732e-05, "loss": 0.1497, "loss_nan_ranks": 0, "loss_rank_avg": 0.12463849037885666, "step": 6445 }, { "epoch": 3.381551362683438, "grad_norm": 0.7756056189537048, "learning_rate": 2.464603124799931e-05, "loss": 0.151, "loss_nan_ranks": 0, "loss_rank_avg": 0.177520290017128, "step": 6450 }, { "epoch": 3.3841719077568135, "grad_norm": 0.6775057315826416, "learning_rate": 2.4620605901582943e-05, "loss": 0.1485, "loss_nan_ranks": 0, "loss_rank_avg": 0.14761006832122803, "step": 6455 }, { "epoch": 3.3867924528301887, "grad_norm": 0.6803090572357178, "learning_rate": 2.4595172664209425e-05, "loss": 0.1345, "loss_nan_ranks": 0, "loss_rank_avg": 0.14740537106990814, "step": 6460 }, { "epoch": 3.389412997903564, "grad_norm": 0.6456284523010254, "learning_rate": 2.4569731579313007e-05, "loss": 0.162, "loss_nan_ranks": 0, "loss_rank_avg": 0.15551471710205078, "step": 6465 }, { "epoch": 3.392033542976939, "grad_norm": 0.6624186038970947, "learning_rate": 2.4544282690341344e-05, "loss": 0.1647, "loss_nan_ranks": 0, "loss_rank_avg": 0.17889371514320374, "step": 6470 }, { "epoch": 3.3946540880503147, "grad_norm": 0.6124078631401062, "learning_rate": 2.4518826040755435e-05, "loss": 0.1451, "loss_nan_ranks": 0, "loss_rank_avg": 0.13483217358589172, "step": 6475 }, { "epoch": 3.39727463312369, "grad_norm": 0.6263321042060852, "learning_rate": 2.4493361674029515e-05, "loss": 0.1498, "loss_nan_ranks": 0, "loss_rank_avg": 0.15271490812301636, "step": 6480 }, { "epoch": 3.399895178197065, "grad_norm": 0.648196280002594, "learning_rate": 2.4467889633650996e-05, "loss": 0.1419, "loss_nan_ranks": 0, "loss_rank_avg": 0.13612696528434753, "step": 6485 }, { "epoch": 3.40251572327044, "grad_norm": 0.672197163105011, "learning_rate": 2.444240996312041e-05, "loss": 0.1494, "loss_nan_ranks": 0, "loss_rank_avg": 0.11425255239009857, "step": 6490 }, { "epoch": 3.4051362683438153, "grad_norm": 0.5785332918167114, "learning_rate": 2.4416922705951312e-05, "loss": 0.1634, "loss_nan_ranks": 0, "loss_rank_avg": 0.21997715532779694, "step": 6495 }, { "epoch": 3.407756813417191, "grad_norm": 0.8766397833824158, "learning_rate": 2.4391427905670215e-05, "loss": 0.1456, "loss_nan_ranks": 0, "loss_rank_avg": 0.1305435299873352, "step": 6500 }, { "epoch": 3.410377358490566, "grad_norm": 0.745764434337616, "learning_rate": 2.436592560581651e-05, "loss": 0.1353, "loss_nan_ranks": 0, "loss_rank_avg": 0.10039962828159332, "step": 6505 }, { "epoch": 3.4129979035639413, "grad_norm": 0.672865629196167, "learning_rate": 2.4340415849942386e-05, "loss": 0.1482, "loss_nan_ranks": 0, "loss_rank_avg": 0.11167111992835999, "step": 6510 }, { "epoch": 3.4156184486373165, "grad_norm": 0.5722970366477966, "learning_rate": 2.4314898681612794e-05, "loss": 0.153, "loss_nan_ranks": 0, "loss_rank_avg": 0.136262446641922, "step": 6515 }, { "epoch": 3.418238993710692, "grad_norm": 0.7656019926071167, "learning_rate": 2.4289374144405318e-05, "loss": 0.1447, "loss_nan_ranks": 0, "loss_rank_avg": 0.12890625, "step": 6520 }, { "epoch": 3.4208595387840672, "grad_norm": 0.6020995378494263, "learning_rate": 2.426384228191014e-05, "loss": 0.1343, "loss_nan_ranks": 0, "loss_rank_avg": 0.1420569270849228, "step": 6525 }, { "epoch": 3.4234800838574424, "grad_norm": 0.6600314378738403, "learning_rate": 2.4238303137729945e-05, "loss": 0.1441, "loss_nan_ranks": 0, "loss_rank_avg": 0.11346963047981262, "step": 6530 }, { "epoch": 3.4261006289308176, "grad_norm": 0.6210017800331116, "learning_rate": 2.421275675547985e-05, "loss": 0.1696, "loss_nan_ranks": 0, "loss_rank_avg": 0.1867697685956955, "step": 6535 }, { "epoch": 3.4287211740041927, "grad_norm": 0.7550157904624939, "learning_rate": 2.4187203178787347e-05, "loss": 0.1615, "loss_nan_ranks": 0, "loss_rank_avg": 0.16229265928268433, "step": 6540 }, { "epoch": 3.431341719077568, "grad_norm": 0.5855455994606018, "learning_rate": 2.41616424512922e-05, "loss": 0.1537, "loss_nan_ranks": 0, "loss_rank_avg": 0.16221889853477478, "step": 6545 }, { "epoch": 3.4339622641509435, "grad_norm": 0.5946267247200012, "learning_rate": 2.4136074616646396e-05, "loss": 0.1532, "loss_nan_ranks": 0, "loss_rank_avg": 0.16361533105373383, "step": 6550 }, { "epoch": 3.4365828092243187, "grad_norm": 0.7061434984207153, "learning_rate": 2.411049971851405e-05, "loss": 0.1546, "loss_nan_ranks": 0, "loss_rank_avg": 0.15311777591705322, "step": 6555 }, { "epoch": 3.439203354297694, "grad_norm": 0.6284918785095215, "learning_rate": 2.4084917800571344e-05, "loss": 0.1611, "loss_nan_ranks": 0, "loss_rank_avg": 0.1728515625, "step": 6560 }, { "epoch": 3.441823899371069, "grad_norm": 0.5714501142501831, "learning_rate": 2.405932890650645e-05, "loss": 0.1425, "loss_nan_ranks": 0, "loss_rank_avg": 0.1305420696735382, "step": 6565 }, { "epoch": 3.4444444444444446, "grad_norm": 0.67555832862854, "learning_rate": 2.4033733080019453e-05, "loss": 0.1564, "loss_nan_ranks": 0, "loss_rank_avg": 0.16778060793876648, "step": 6570 }, { "epoch": 3.44706498951782, "grad_norm": 0.823415994644165, "learning_rate": 2.400813036482228e-05, "loss": 0.1805, "loss_nan_ranks": 0, "loss_rank_avg": 0.18518805503845215, "step": 6575 }, { "epoch": 3.449685534591195, "grad_norm": 0.7290722727775574, "learning_rate": 2.398252080463861e-05, "loss": 0.157, "loss_nan_ranks": 0, "loss_rank_avg": 0.145263671875, "step": 6580 }, { "epoch": 3.45230607966457, "grad_norm": 0.6552735567092896, "learning_rate": 2.3956904443203825e-05, "loss": 0.138, "loss_nan_ranks": 0, "loss_rank_avg": 0.1675729751586914, "step": 6585 }, { "epoch": 3.4549266247379453, "grad_norm": 0.8209438323974609, "learning_rate": 2.3931281324264918e-05, "loss": 0.1727, "loss_nan_ranks": 0, "loss_rank_avg": 0.10845947265625, "step": 6590 }, { "epoch": 3.457547169811321, "grad_norm": 0.5601760745048523, "learning_rate": 2.3905651491580423e-05, "loss": 0.1579, "loss_nan_ranks": 0, "loss_rank_avg": 0.1745111048221588, "step": 6595 }, { "epoch": 3.460167714884696, "grad_norm": 0.5397576689720154, "learning_rate": 2.3880014988920327e-05, "loss": 0.1584, "loss_nan_ranks": 0, "loss_rank_avg": 0.2028835266828537, "step": 6600 }, { "epoch": 3.4627882599580713, "grad_norm": 0.6182676553726196, "learning_rate": 2.3854371860066034e-05, "loss": 0.1548, "loss_nan_ranks": 0, "loss_rank_avg": 0.15692049264907837, "step": 6605 }, { "epoch": 3.4654088050314464, "grad_norm": 0.6674803495407104, "learning_rate": 2.3828722148810236e-05, "loss": 0.1416, "loss_nan_ranks": 0, "loss_rank_avg": 0.15361738204956055, "step": 6610 }, { "epoch": 3.468029350104822, "grad_norm": 0.614689826965332, "learning_rate": 2.380306589895689e-05, "loss": 0.1576, "loss_nan_ranks": 0, "loss_rank_avg": 0.1334121972322464, "step": 6615 }, { "epoch": 3.470649895178197, "grad_norm": 0.6762018799781799, "learning_rate": 2.3777403154321107e-05, "loss": 0.1456, "loss_nan_ranks": 0, "loss_rank_avg": 0.12771591544151306, "step": 6620 }, { "epoch": 3.4732704402515724, "grad_norm": 0.6038153171539307, "learning_rate": 2.3751733958729083e-05, "loss": 0.1672, "loss_nan_ranks": 0, "loss_rank_avg": 0.14504867792129517, "step": 6625 }, { "epoch": 3.4758909853249476, "grad_norm": 0.7279376983642578, "learning_rate": 2.372605835601805e-05, "loss": 0.1443, "loss_nan_ranks": 0, "loss_rank_avg": 0.14499513804912567, "step": 6630 }, { "epoch": 3.4785115303983227, "grad_norm": 0.7328532934188843, "learning_rate": 2.370037639003616e-05, "loss": 0.1529, "loss_nan_ranks": 0, "loss_rank_avg": 0.16632357239723206, "step": 6635 }, { "epoch": 3.481132075471698, "grad_norm": 0.6980881690979004, "learning_rate": 2.3674688104642453e-05, "loss": 0.1481, "loss_nan_ranks": 0, "loss_rank_avg": 0.13219040632247925, "step": 6640 }, { "epoch": 3.4837526205450735, "grad_norm": 0.6118700504302979, "learning_rate": 2.364899354370675e-05, "loss": 0.1552, "loss_nan_ranks": 0, "loss_rank_avg": 0.14038139581680298, "step": 6645 }, { "epoch": 3.4863731656184487, "grad_norm": 0.6516825556755066, "learning_rate": 2.3623292751109582e-05, "loss": 0.1544, "loss_nan_ranks": 0, "loss_rank_avg": 0.1734481155872345, "step": 6650 }, { "epoch": 3.488993710691824, "grad_norm": 0.5811892151832581, "learning_rate": 2.3597585770742138e-05, "loss": 0.1768, "loss_nan_ranks": 0, "loss_rank_avg": 0.197306290268898, "step": 6655 }, { "epoch": 3.491614255765199, "grad_norm": 0.6686034202575684, "learning_rate": 2.3571872646506165e-05, "loss": 0.1491, "loss_nan_ranks": 0, "loss_rank_avg": 0.14100056886672974, "step": 6660 }, { "epoch": 3.4942348008385746, "grad_norm": 0.6539227962493896, "learning_rate": 2.3546153422313903e-05, "loss": 0.1453, "loss_nan_ranks": 0, "loss_rank_avg": 0.1873638778924942, "step": 6665 }, { "epoch": 3.49685534591195, "grad_norm": 0.6214085817337036, "learning_rate": 2.3520428142088018e-05, "loss": 0.1656, "loss_nan_ranks": 0, "loss_rank_avg": 0.2092817723751068, "step": 6670 }, { "epoch": 3.499475890985325, "grad_norm": 0.726628839969635, "learning_rate": 2.3494696849761497e-05, "loss": 0.1596, "loss_nan_ranks": 0, "loss_rank_avg": 0.14948055148124695, "step": 6675 }, { "epoch": 3.5020964360587, "grad_norm": 0.6781795024871826, "learning_rate": 2.3468959589277623e-05, "loss": 0.1524, "loss_nan_ranks": 0, "loss_rank_avg": 0.1506710946559906, "step": 6680 }, { "epoch": 3.5047169811320753, "grad_norm": 0.7744038105010986, "learning_rate": 2.3443216404589844e-05, "loss": 0.1555, "loss_nan_ranks": 0, "loss_rank_avg": 0.15132437646389008, "step": 6685 }, { "epoch": 3.5073375262054505, "grad_norm": 0.6445820331573486, "learning_rate": 2.3417467339661757e-05, "loss": 0.1635, "loss_nan_ranks": 0, "loss_rank_avg": 0.16688039898872375, "step": 6690 }, { "epoch": 3.509958071278826, "grad_norm": 0.5953259468078613, "learning_rate": 2.3391712438466962e-05, "loss": 0.1394, "loss_nan_ranks": 0, "loss_rank_avg": 0.15444037318229675, "step": 6695 }, { "epoch": 3.5125786163522013, "grad_norm": 0.7253164649009705, "learning_rate": 2.3365951744989054e-05, "loss": 0.1632, "loss_nan_ranks": 0, "loss_rank_avg": 0.102294921875, "step": 6700 }, { "epoch": 3.5151991614255764, "grad_norm": 0.668173611164093, "learning_rate": 2.334018530322151e-05, "loss": 0.1522, "loss_nan_ranks": 0, "loss_rank_avg": 0.12434506416320801, "step": 6705 }, { "epoch": 3.517819706498952, "grad_norm": 0.59187251329422, "learning_rate": 2.331441315716763e-05, "loss": 0.1506, "loss_nan_ranks": 0, "loss_rank_avg": 0.1513979732990265, "step": 6710 }, { "epoch": 3.520440251572327, "grad_norm": 0.6256042122840881, "learning_rate": 2.3288635350840445e-05, "loss": 0.1472, "loss_nan_ranks": 0, "loss_rank_avg": 0.12702324986457825, "step": 6715 }, { "epoch": 3.5230607966457024, "grad_norm": 0.6789478063583374, "learning_rate": 2.3262851928262665e-05, "loss": 0.1521, "loss_nan_ranks": 0, "loss_rank_avg": 0.1434326171875, "step": 6720 }, { "epoch": 3.5256813417190775, "grad_norm": 0.7792441844940186, "learning_rate": 2.323706293346658e-05, "loss": 0.1694, "loss_nan_ranks": 0, "loss_rank_avg": 0.1643582582473755, "step": 6725 }, { "epoch": 3.5283018867924527, "grad_norm": 0.7054676413536072, "learning_rate": 2.321126841049401e-05, "loss": 0.1538, "loss_nan_ranks": 0, "loss_rank_avg": 0.17248061299324036, "step": 6730 }, { "epoch": 3.530922431865828, "grad_norm": 0.7300561666488647, "learning_rate": 2.3185468403396198e-05, "loss": 0.1452, "loss_nan_ranks": 0, "loss_rank_avg": 0.13691771030426025, "step": 6735 }, { "epoch": 3.5335429769392035, "grad_norm": 0.7270481586456299, "learning_rate": 2.315966295623376e-05, "loss": 0.1607, "loss_nan_ranks": 0, "loss_rank_avg": 0.1448974609375, "step": 6740 }, { "epoch": 3.5361635220125787, "grad_norm": 0.6351275444030762, "learning_rate": 2.3133852113076616e-05, "loss": 0.1631, "loss_nan_ranks": 0, "loss_rank_avg": 0.18207350373268127, "step": 6745 }, { "epoch": 3.538784067085954, "grad_norm": 0.7486494779586792, "learning_rate": 2.3108035918003875e-05, "loss": 0.1489, "loss_nan_ranks": 0, "loss_rank_avg": 0.10829313099384308, "step": 6750 }, { "epoch": 3.541404612159329, "grad_norm": 0.7265515327453613, "learning_rate": 2.308221441510382e-05, "loss": 0.1397, "loss_nan_ranks": 0, "loss_rank_avg": 0.12119436264038086, "step": 6755 }, { "epoch": 3.5440251572327046, "grad_norm": 0.5404002666473389, "learning_rate": 2.3056387648473753e-05, "loss": 0.1611, "loss_nan_ranks": 0, "loss_rank_avg": 0.16169658303260803, "step": 6760 }, { "epoch": 3.54664570230608, "grad_norm": 0.7101503014564514, "learning_rate": 2.303055566222001e-05, "loss": 0.1422, "loss_nan_ranks": 0, "loss_rank_avg": 0.16694357991218567, "step": 6765 }, { "epoch": 3.549266247379455, "grad_norm": 0.7440077066421509, "learning_rate": 2.300471850045782e-05, "loss": 0.1285, "loss_nan_ranks": 0, "loss_rank_avg": 0.13593001663684845, "step": 6770 }, { "epoch": 3.55188679245283, "grad_norm": 0.693570613861084, "learning_rate": 2.297887620731124e-05, "loss": 0.1623, "loss_nan_ranks": 0, "loss_rank_avg": 0.14585727453231812, "step": 6775 }, { "epoch": 3.5545073375262053, "grad_norm": 0.6090971231460571, "learning_rate": 2.295302882691312e-05, "loss": 0.1649, "loss_nan_ranks": 0, "loss_rank_avg": 0.1292724609375, "step": 6780 }, { "epoch": 3.5571278825995805, "grad_norm": 0.6697745323181152, "learning_rate": 2.2927176403404978e-05, "loss": 0.1774, "loss_nan_ranks": 0, "loss_rank_avg": 0.149550199508667, "step": 6785 }, { "epoch": 3.559748427672956, "grad_norm": 0.883256196975708, "learning_rate": 2.290131898093693e-05, "loss": 0.1402, "loss_nan_ranks": 0, "loss_rank_avg": 0.13478536903858185, "step": 6790 }, { "epoch": 3.5623689727463312, "grad_norm": 0.6198585033416748, "learning_rate": 2.2875456603667664e-05, "loss": 0.1481, "loss_nan_ranks": 0, "loss_rank_avg": 0.15345340967178345, "step": 6795 }, { "epoch": 3.5649895178197064, "grad_norm": 0.6998546123504639, "learning_rate": 2.2849589315764303e-05, "loss": 0.1718, "loss_nan_ranks": 0, "loss_rank_avg": 0.17192421853542328, "step": 6800 }, { "epoch": 3.567610062893082, "grad_norm": 0.7005147337913513, "learning_rate": 2.2823717161402375e-05, "loss": 0.1624, "loss_nan_ranks": 0, "loss_rank_avg": 0.16433775424957275, "step": 6805 }, { "epoch": 3.570230607966457, "grad_norm": 0.7533726096153259, "learning_rate": 2.27978401847657e-05, "loss": 0.1605, "loss_nan_ranks": 0, "loss_rank_avg": 0.11318092048168182, "step": 6810 }, { "epoch": 3.5728511530398324, "grad_norm": 0.776470959186554, "learning_rate": 2.2771958430046342e-05, "loss": 0.1533, "loss_nan_ranks": 0, "loss_rank_avg": 0.11972829699516296, "step": 6815 }, { "epoch": 3.5754716981132075, "grad_norm": 0.6570600867271423, "learning_rate": 2.2746071941444537e-05, "loss": 0.174, "loss_nan_ranks": 0, "loss_rank_avg": 0.16138510406017303, "step": 6820 }, { "epoch": 3.5780922431865827, "grad_norm": 0.6985480785369873, "learning_rate": 2.2720180763168576e-05, "loss": 0.1497, "loss_nan_ranks": 0, "loss_rank_avg": 0.12646484375, "step": 6825 }, { "epoch": 3.580712788259958, "grad_norm": 0.5580952763557434, "learning_rate": 2.269428493943479e-05, "loss": 0.1564, "loss_nan_ranks": 0, "loss_rank_avg": 0.18356704711914062, "step": 6830 }, { "epoch": 3.5833333333333335, "grad_norm": 0.6391990780830383, "learning_rate": 2.2668384514467427e-05, "loss": 0.1493, "loss_nan_ranks": 0, "loss_rank_avg": 0.1443229615688324, "step": 6835 }, { "epoch": 3.5859538784067087, "grad_norm": 0.6866182684898376, "learning_rate": 2.2642479532498597e-05, "loss": 0.1751, "loss_nan_ranks": 0, "loss_rank_avg": 0.15328383445739746, "step": 6840 }, { "epoch": 3.588574423480084, "grad_norm": 0.69412761926651, "learning_rate": 2.2616570037768187e-05, "loss": 0.1639, "loss_nan_ranks": 0, "loss_rank_avg": 0.137939453125, "step": 6845 }, { "epoch": 3.591194968553459, "grad_norm": 0.6370830535888672, "learning_rate": 2.25906560745238e-05, "loss": 0.1833, "loss_nan_ranks": 0, "loss_rank_avg": 0.15124990046024323, "step": 6850 }, { "epoch": 3.5938155136268346, "grad_norm": 0.7414199709892273, "learning_rate": 2.256473768702066e-05, "loss": 0.1351, "loss_nan_ranks": 0, "loss_rank_avg": 0.1309814453125, "step": 6855 }, { "epoch": 3.5964360587002098, "grad_norm": 0.702250599861145, "learning_rate": 2.2538814919521556e-05, "loss": 0.1599, "loss_nan_ranks": 0, "loss_rank_avg": 0.130854994058609, "step": 6860 }, { "epoch": 3.599056603773585, "grad_norm": 0.6545102000236511, "learning_rate": 2.2512887816296755e-05, "loss": 0.1561, "loss_nan_ranks": 0, "loss_rank_avg": 0.1243896484375, "step": 6865 }, { "epoch": 3.60167714884696, "grad_norm": 0.6784298419952393, "learning_rate": 2.2486956421623917e-05, "loss": 0.1549, "loss_nan_ranks": 0, "loss_rank_avg": 0.15646380186080933, "step": 6870 }, { "epoch": 3.6042976939203353, "grad_norm": 0.6086201071739197, "learning_rate": 2.2461020779788054e-05, "loss": 0.14, "loss_nan_ranks": 0, "loss_rank_avg": 0.14351555705070496, "step": 6875 }, { "epoch": 3.6069182389937104, "grad_norm": 0.7209429144859314, "learning_rate": 2.2435080935081402e-05, "loss": 0.156, "loss_nan_ranks": 0, "loss_rank_avg": 0.1248779296875, "step": 6880 }, { "epoch": 3.609538784067086, "grad_norm": 0.703329861164093, "learning_rate": 2.240913693180341e-05, "loss": 0.1317, "loss_nan_ranks": 0, "loss_rank_avg": 0.13986779749393463, "step": 6885 }, { "epoch": 3.6121593291404612, "grad_norm": 0.6641044616699219, "learning_rate": 2.2383188814260585e-05, "loss": 0.1485, "loss_nan_ranks": 0, "loss_rank_avg": 0.17025399208068848, "step": 6890 }, { "epoch": 3.6147798742138364, "grad_norm": 0.6589853763580322, "learning_rate": 2.2357236626766504e-05, "loss": 0.1404, "loss_nan_ranks": 0, "loss_rank_avg": 0.1534072756767273, "step": 6895 }, { "epoch": 3.617400419287212, "grad_norm": 0.7012056112289429, "learning_rate": 2.233128041364166e-05, "loss": 0.148, "loss_nan_ranks": 0, "loss_rank_avg": 0.1505126953125, "step": 6900 }, { "epoch": 3.620020964360587, "grad_norm": 0.8724771738052368, "learning_rate": 2.230532021921345e-05, "loss": 0.1397, "loss_nan_ranks": 0, "loss_rank_avg": 0.13603194057941437, "step": 6905 }, { "epoch": 3.6226415094339623, "grad_norm": 0.809339702129364, "learning_rate": 2.2279356087816044e-05, "loss": 0.15, "loss_nan_ranks": 0, "loss_rank_avg": 0.14090360701084137, "step": 6910 }, { "epoch": 3.6252620545073375, "grad_norm": 0.6809837222099304, "learning_rate": 2.2253388063790356e-05, "loss": 0.1513, "loss_nan_ranks": 0, "loss_rank_avg": 0.19587504863739014, "step": 6915 }, { "epoch": 3.6278825995807127, "grad_norm": 0.6588430404663086, "learning_rate": 2.2227416191483928e-05, "loss": 0.1612, "loss_nan_ranks": 0, "loss_rank_avg": 0.1605924516916275, "step": 6920 }, { "epoch": 3.630503144654088, "grad_norm": 0.6808624863624573, "learning_rate": 2.2201440515250897e-05, "loss": 0.1438, "loss_nan_ranks": 0, "loss_rank_avg": 0.1401149183511734, "step": 6925 }, { "epoch": 3.6331236897274635, "grad_norm": 0.5970581769943237, "learning_rate": 2.217546107945188e-05, "loss": 0.1597, "loss_nan_ranks": 0, "loss_rank_avg": 0.12778399884700775, "step": 6930 }, { "epoch": 3.6357442348008386, "grad_norm": 0.647731602191925, "learning_rate": 2.2149477928453914e-05, "loss": 0.1554, "loss_nan_ranks": 0, "loss_rank_avg": 0.16550582647323608, "step": 6935 }, { "epoch": 3.638364779874214, "grad_norm": 0.6262145638465881, "learning_rate": 2.212349110663039e-05, "loss": 0.1538, "loss_nan_ranks": 0, "loss_rank_avg": 0.15969672799110413, "step": 6940 }, { "epoch": 3.640985324947589, "grad_norm": 0.7175219655036926, "learning_rate": 2.209750065836096e-05, "loss": 0.1564, "loss_nan_ranks": 0, "loss_rank_avg": 0.1234130859375, "step": 6945 }, { "epoch": 3.6436058700209646, "grad_norm": 0.6249467730522156, "learning_rate": 2.207150662803148e-05, "loss": 0.1583, "loss_nan_ranks": 0, "loss_rank_avg": 0.18427444994449615, "step": 6950 }, { "epoch": 3.6462264150943398, "grad_norm": 0.7114428281784058, "learning_rate": 2.204550906003391e-05, "loss": 0.135, "loss_nan_ranks": 0, "loss_rank_avg": 0.14605392515659332, "step": 6955 }, { "epoch": 3.648846960167715, "grad_norm": 0.7079446315765381, "learning_rate": 2.2019507998766253e-05, "loss": 0.1465, "loss_nan_ranks": 0, "loss_rank_avg": 0.14103522896766663, "step": 6960 }, { "epoch": 3.65146750524109, "grad_norm": 0.6679487228393555, "learning_rate": 2.199350348863249e-05, "loss": 0.1482, "loss_nan_ranks": 0, "loss_rank_avg": 0.16830191016197205, "step": 6965 }, { "epoch": 3.6540880503144653, "grad_norm": 0.7247354388237, "learning_rate": 2.1967495574042484e-05, "loss": 0.1617, "loss_nan_ranks": 0, "loss_rank_avg": 0.15935799479484558, "step": 6970 }, { "epoch": 3.6567085953878404, "grad_norm": 0.7235015630722046, "learning_rate": 2.194148429941191e-05, "loss": 0.1639, "loss_nan_ranks": 0, "loss_rank_avg": 0.1375732421875, "step": 6975 }, { "epoch": 3.659329140461216, "grad_norm": 0.6391201019287109, "learning_rate": 2.191546970916218e-05, "loss": 0.1629, "loss_nan_ranks": 0, "loss_rank_avg": 0.14263559877872467, "step": 6980 }, { "epoch": 3.661949685534591, "grad_norm": 0.6432464718818665, "learning_rate": 2.1889451847720372e-05, "loss": 0.1576, "loss_nan_ranks": 0, "loss_rank_avg": 0.14456170797348022, "step": 6985 }, { "epoch": 3.6645702306079664, "grad_norm": 0.5945658087730408, "learning_rate": 2.186343075951916e-05, "loss": 0.137, "loss_nan_ranks": 0, "loss_rank_avg": 0.12893876433372498, "step": 6990 }, { "epoch": 3.667190775681342, "grad_norm": 0.6155629754066467, "learning_rate": 2.1837406488996703e-05, "loss": 0.1493, "loss_nan_ranks": 0, "loss_rank_avg": 0.19243334233760834, "step": 6995 }, { "epoch": 3.669811320754717, "grad_norm": 0.6677305698394775, "learning_rate": 2.181137908059663e-05, "loss": 0.1479, "loss_nan_ranks": 0, "loss_rank_avg": 0.15798315405845642, "step": 7000 }, { "epoch": 3.6724318658280923, "grad_norm": 0.5484825968742371, "learning_rate": 2.1785348578767893e-05, "loss": 0.1683, "loss_nan_ranks": 0, "loss_rank_avg": 0.14938029646873474, "step": 7005 }, { "epoch": 3.6750524109014675, "grad_norm": 0.6847086548805237, "learning_rate": 2.1759315027964743e-05, "loss": 0.1725, "loss_nan_ranks": 0, "loss_rank_avg": 0.15053415298461914, "step": 7010 }, { "epoch": 3.6776729559748427, "grad_norm": 0.6554835438728333, "learning_rate": 2.173327847264665e-05, "loss": 0.1654, "loss_nan_ranks": 0, "loss_rank_avg": 0.1666020154953003, "step": 7015 }, { "epoch": 3.680293501048218, "grad_norm": 0.5411213636398315, "learning_rate": 2.170723895727819e-05, "loss": 0.1455, "loss_nan_ranks": 0, "loss_rank_avg": 0.10216250270605087, "step": 7020 }, { "epoch": 3.6829140461215935, "grad_norm": 1.39761221408844, "learning_rate": 2.1681196526329015e-05, "loss": 0.1503, "loss_nan_ranks": 0, "loss_rank_avg": 0.15248467028141022, "step": 7025 }, { "epoch": 3.6855345911949686, "grad_norm": 0.6272345781326294, "learning_rate": 2.1655151224273747e-05, "loss": 0.1737, "loss_nan_ranks": 0, "loss_rank_avg": 0.2032470405101776, "step": 7030 }, { "epoch": 3.688155136268344, "grad_norm": 0.6710824966430664, "learning_rate": 2.162910309559191e-05, "loss": 0.1628, "loss_nan_ranks": 0, "loss_rank_avg": 0.1658201515674591, "step": 7035 }, { "epoch": 3.690775681341719, "grad_norm": 0.6399701833724976, "learning_rate": 2.1603052184767863e-05, "loss": 0.1395, "loss_nan_ranks": 0, "loss_rank_avg": 0.120361328125, "step": 7040 }, { "epoch": 3.6933962264150946, "grad_norm": 0.7688432335853577, "learning_rate": 2.1576998536290706e-05, "loss": 0.1565, "loss_nan_ranks": 0, "loss_rank_avg": 0.11587116122245789, "step": 7045 }, { "epoch": 3.6960167714884697, "grad_norm": 0.6019247174263, "learning_rate": 2.155094219465422e-05, "loss": 0.148, "loss_nan_ranks": 0, "loss_rank_avg": 0.17584499716758728, "step": 7050 }, { "epoch": 3.698637316561845, "grad_norm": 0.6681298613548279, "learning_rate": 2.1524883204356786e-05, "loss": 0.1672, "loss_nan_ranks": 0, "loss_rank_avg": 0.1889631301164627, "step": 7055 }, { "epoch": 3.70125786163522, "grad_norm": 0.6414555311203003, "learning_rate": 2.1498821609901306e-05, "loss": 0.1612, "loss_nan_ranks": 0, "loss_rank_avg": 0.15406014025211334, "step": 7060 }, { "epoch": 3.7038784067085953, "grad_norm": 0.601821780204773, "learning_rate": 2.1472757455795135e-05, "loss": 0.1614, "loss_nan_ranks": 0, "loss_rank_avg": 0.14945435523986816, "step": 7065 }, { "epoch": 3.7064989517819704, "grad_norm": 0.6997553706169128, "learning_rate": 2.1446690786549986e-05, "loss": 0.1613, "loss_nan_ranks": 0, "loss_rank_avg": 0.16845703125, "step": 7070 }, { "epoch": 3.709119496855346, "grad_norm": 0.6811888813972473, "learning_rate": 2.142062164668188e-05, "loss": 0.1677, "loss_nan_ranks": 0, "loss_rank_avg": 0.13721466064453125, "step": 7075 }, { "epoch": 3.711740041928721, "grad_norm": 0.6376758217811584, "learning_rate": 2.1394550080711056e-05, "loss": 0.1508, "loss_nan_ranks": 0, "loss_rank_avg": 0.15672701597213745, "step": 7080 }, { "epoch": 3.7143605870020964, "grad_norm": 0.75472092628479, "learning_rate": 2.1368476133161885e-05, "loss": 0.1318, "loss_nan_ranks": 0, "loss_rank_avg": 0.1363525390625, "step": 7085 }, { "epoch": 3.7169811320754715, "grad_norm": 0.6204655766487122, "learning_rate": 2.1342399848562826e-05, "loss": 0.1537, "loss_nan_ranks": 0, "loss_rank_avg": 0.160270094871521, "step": 7090 }, { "epoch": 3.719601677148847, "grad_norm": 0.7598617076873779, "learning_rate": 2.1316321271446306e-05, "loss": 0.1665, "loss_nan_ranks": 0, "loss_rank_avg": 0.1729973554611206, "step": 7095 }, { "epoch": 3.7222222222222223, "grad_norm": 0.6129899024963379, "learning_rate": 2.129024044634868e-05, "loss": 0.1551, "loss_nan_ranks": 0, "loss_rank_avg": 0.14653250575065613, "step": 7100 }, { "epoch": 3.7248427672955975, "grad_norm": 0.8236209750175476, "learning_rate": 2.1264157417810153e-05, "loss": 0.1477, "loss_nan_ranks": 0, "loss_rank_avg": 0.09567755460739136, "step": 7105 }, { "epoch": 3.7274633123689727, "grad_norm": 0.7946968078613281, "learning_rate": 2.1238072230374655e-05, "loss": 0.1463, "loss_nan_ranks": 0, "loss_rank_avg": 0.145263671875, "step": 7110 }, { "epoch": 3.730083857442348, "grad_norm": 0.9173590540885925, "learning_rate": 2.121198492858985e-05, "loss": 0.1483, "loss_nan_ranks": 0, "loss_rank_avg": 0.1373291015625, "step": 7115 }, { "epoch": 3.732704402515723, "grad_norm": 0.6552045345306396, "learning_rate": 2.1185895557006982e-05, "loss": 0.1159, "loss_nan_ranks": 0, "loss_rank_avg": 0.12957391142845154, "step": 7120 }, { "epoch": 3.7353249475890986, "grad_norm": 0.67323899269104, "learning_rate": 2.1159804160180826e-05, "loss": 0.1445, "loss_nan_ranks": 0, "loss_rank_avg": 0.1298828125, "step": 7125 }, { "epoch": 3.737945492662474, "grad_norm": 0.7391826510429382, "learning_rate": 2.1133710782669653e-05, "loss": 0.146, "loss_nan_ranks": 0, "loss_rank_avg": 0.1224365234375, "step": 7130 }, { "epoch": 3.740566037735849, "grad_norm": 0.6076016426086426, "learning_rate": 2.1107615469035078e-05, "loss": 0.1649, "loss_nan_ranks": 0, "loss_rank_avg": 0.1857558786869049, "step": 7135 }, { "epoch": 3.7431865828092246, "grad_norm": 0.6100286245346069, "learning_rate": 2.1081518263842032e-05, "loss": 0.1354, "loss_nan_ranks": 0, "loss_rank_avg": 0.13551019132137299, "step": 7140 }, { "epoch": 3.7458071278825997, "grad_norm": 0.6343952417373657, "learning_rate": 2.1055419211658687e-05, "loss": 0.1561, "loss_nan_ranks": 0, "loss_rank_avg": 0.1612548828125, "step": 7145 }, { "epoch": 3.748427672955975, "grad_norm": 0.6255004405975342, "learning_rate": 2.102931835705636e-05, "loss": 0.162, "loss_nan_ranks": 0, "loss_rank_avg": 0.20365479588508606, "step": 7150 }, { "epoch": 3.75104821802935, "grad_norm": 0.6127068400382996, "learning_rate": 2.1003215744609452e-05, "loss": 0.15, "loss_nan_ranks": 0, "loss_rank_avg": 0.1393146812915802, "step": 7155 }, { "epoch": 3.7536687631027252, "grad_norm": 0.6279597282409668, "learning_rate": 2.0977111418895363e-05, "loss": 0.1551, "loss_nan_ranks": 0, "loss_rank_avg": 0.151123046875, "step": 7160 }, { "epoch": 3.7562893081761004, "grad_norm": 0.6501970291137695, "learning_rate": 2.095100542449441e-05, "loss": 0.1384, "loss_nan_ranks": 0, "loss_rank_avg": 0.1400650590658188, "step": 7165 }, { "epoch": 3.758909853249476, "grad_norm": 0.6475972533226013, "learning_rate": 2.0924897805989778e-05, "loss": 0.1563, "loss_nan_ranks": 0, "loss_rank_avg": 0.1405632048845291, "step": 7170 }, { "epoch": 3.761530398322851, "grad_norm": 0.6913469433784485, "learning_rate": 2.089878860796741e-05, "loss": 0.1457, "loss_nan_ranks": 0, "loss_rank_avg": 0.15082892775535583, "step": 7175 }, { "epoch": 3.7641509433962264, "grad_norm": 0.6681004762649536, "learning_rate": 2.087267787501596e-05, "loss": 0.1561, "loss_nan_ranks": 0, "loss_rank_avg": 0.1363525390625, "step": 7180 }, { "epoch": 3.7667714884696015, "grad_norm": 0.6305525898933411, "learning_rate": 2.0846565651726688e-05, "loss": 0.155, "loss_nan_ranks": 0, "loss_rank_avg": 0.16507667303085327, "step": 7185 }, { "epoch": 3.769392033542977, "grad_norm": 0.7154058218002319, "learning_rate": 2.0820451982693406e-05, "loss": 0.1441, "loss_nan_ranks": 0, "loss_rank_avg": 0.118896484375, "step": 7190 }, { "epoch": 3.7720125786163523, "grad_norm": 0.8042317032814026, "learning_rate": 2.0794336912512403e-05, "loss": 0.1681, "loss_nan_ranks": 0, "loss_rank_avg": 0.1513671875, "step": 7195 }, { "epoch": 3.7746331236897275, "grad_norm": 0.7067472338676453, "learning_rate": 2.076822048578235e-05, "loss": 0.1472, "loss_nan_ranks": 0, "loss_rank_avg": 0.1341552734375, "step": 7200 }, { "epoch": 3.7772536687631026, "grad_norm": 0.6020834445953369, "learning_rate": 2.0742102747104243e-05, "loss": 0.1564, "loss_nan_ranks": 0, "loss_rank_avg": 0.18506141006946564, "step": 7205 }, { "epoch": 3.779874213836478, "grad_norm": 0.6437265276908875, "learning_rate": 2.0715983741081306e-05, "loss": 0.1585, "loss_nan_ranks": 0, "loss_rank_avg": 0.15423917770385742, "step": 7210 }, { "epoch": 3.782494758909853, "grad_norm": 0.6600501537322998, "learning_rate": 2.068986351231894e-05, "loss": 0.1715, "loss_nan_ranks": 0, "loss_rank_avg": 0.14562933146953583, "step": 7215 }, { "epoch": 3.7851153039832286, "grad_norm": 0.6256850957870483, "learning_rate": 2.0663742105424626e-05, "loss": 0.1437, "loss_nan_ranks": 0, "loss_rank_avg": 0.12755300104618073, "step": 7220 }, { "epoch": 3.7877358490566038, "grad_norm": 0.622725248336792, "learning_rate": 2.063761956500786e-05, "loss": 0.1449, "loss_nan_ranks": 0, "loss_rank_avg": 0.167802631855011, "step": 7225 }, { "epoch": 3.790356394129979, "grad_norm": 0.5516610145568848, "learning_rate": 2.0611495935680085e-05, "loss": 0.1541, "loss_nan_ranks": 0, "loss_rank_avg": 0.177067369222641, "step": 7230 }, { "epoch": 3.7929769392033545, "grad_norm": 0.5736691951751709, "learning_rate": 2.0585371262054584e-05, "loss": 0.1458, "loss_nan_ranks": 0, "loss_rank_avg": 0.17594820261001587, "step": 7235 }, { "epoch": 3.7955974842767297, "grad_norm": 0.8078172206878662, "learning_rate": 2.0559245588746433e-05, "loss": 0.155, "loss_nan_ranks": 0, "loss_rank_avg": 0.1312255859375, "step": 7240 }, { "epoch": 3.798218029350105, "grad_norm": 0.6607158184051514, "learning_rate": 2.0533118960372418e-05, "loss": 0.146, "loss_nan_ranks": 0, "loss_rank_avg": 0.15671448409557343, "step": 7245 }, { "epoch": 3.80083857442348, "grad_norm": 0.6096231341362, "learning_rate": 2.0506991421550948e-05, "loss": 0.1362, "loss_nan_ranks": 0, "loss_rank_avg": 0.1362011730670929, "step": 7250 }, { "epoch": 3.8034591194968552, "grad_norm": 0.5730605125427246, "learning_rate": 2.0480863016901988e-05, "loss": 0.1255, "loss_nan_ranks": 0, "loss_rank_avg": 0.15595975518226624, "step": 7255 }, { "epoch": 3.8060796645702304, "grad_norm": 0.510726809501648, "learning_rate": 2.0454733791046996e-05, "loss": 0.1509, "loss_nan_ranks": 0, "loss_rank_avg": 0.19028598070144653, "step": 7260 }, { "epoch": 3.808700209643606, "grad_norm": 0.6243789792060852, "learning_rate": 2.042860378860881e-05, "loss": 0.1503, "loss_nan_ranks": 0, "loss_rank_avg": 0.1603560745716095, "step": 7265 }, { "epoch": 3.811320754716981, "grad_norm": 0.6757491827011108, "learning_rate": 2.040247305421162e-05, "loss": 0.1571, "loss_nan_ranks": 0, "loss_rank_avg": 0.18410971760749817, "step": 7270 }, { "epoch": 3.8139412997903563, "grad_norm": 0.5957233905792236, "learning_rate": 2.037634163248084e-05, "loss": 0.1606, "loss_nan_ranks": 0, "loss_rank_avg": 0.17781707644462585, "step": 7275 }, { "epoch": 3.8165618448637315, "grad_norm": 0.7109359502792358, "learning_rate": 2.0350209568043068e-05, "loss": 0.1502, "loss_nan_ranks": 0, "loss_rank_avg": 0.1116943359375, "step": 7280 }, { "epoch": 3.819182389937107, "grad_norm": 0.7432882189750671, "learning_rate": 2.0324076905526012e-05, "loss": 0.1431, "loss_nan_ranks": 0, "loss_rank_avg": 0.1364685595035553, "step": 7285 }, { "epoch": 3.8218029350104823, "grad_norm": 0.60526043176651, "learning_rate": 2.029794368955838e-05, "loss": 0.1599, "loss_nan_ranks": 0, "loss_rank_avg": 0.16713152825832367, "step": 7290 }, { "epoch": 3.8244234800838575, "grad_norm": 0.5748690962791443, "learning_rate": 2.0271809964769842e-05, "loss": 0.1496, "loss_nan_ranks": 0, "loss_rank_avg": 0.20099319517612457, "step": 7295 }, { "epoch": 3.8270440251572326, "grad_norm": 0.6210798025131226, "learning_rate": 2.0245675775790934e-05, "loss": 0.1686, "loss_nan_ranks": 0, "loss_rank_avg": 0.17095521092414856, "step": 7300 }, { "epoch": 3.829664570230608, "grad_norm": 0.6181237101554871, "learning_rate": 2.0219541167252968e-05, "loss": 0.1476, "loss_nan_ranks": 0, "loss_rank_avg": 0.13480696082115173, "step": 7305 }, { "epoch": 3.832285115303983, "grad_norm": 0.7305977940559387, "learning_rate": 2.0193406183788e-05, "loss": 0.1478, "loss_nan_ranks": 0, "loss_rank_avg": 0.1747744083404541, "step": 7310 }, { "epoch": 3.8349056603773586, "grad_norm": 0.6723249554634094, "learning_rate": 2.01672708700287e-05, "loss": 0.1605, "loss_nan_ranks": 0, "loss_rank_avg": 0.17026358842849731, "step": 7315 }, { "epoch": 3.8375262054507338, "grad_norm": 0.5861697793006897, "learning_rate": 2.0141135270608326e-05, "loss": 0.1451, "loss_nan_ranks": 0, "loss_rank_avg": 0.17438700795173645, "step": 7320 }, { "epoch": 3.840146750524109, "grad_norm": 0.5344928503036499, "learning_rate": 2.0114999430160607e-05, "loss": 0.1534, "loss_nan_ranks": 0, "loss_rank_avg": 0.1679978370666504, "step": 7325 }, { "epoch": 3.8427672955974845, "grad_norm": 0.6318730115890503, "learning_rate": 2.0088863393319684e-05, "loss": 0.1438, "loss_nan_ranks": 0, "loss_rank_avg": 0.19917961955070496, "step": 7330 }, { "epoch": 3.8453878406708597, "grad_norm": 0.7025963664054871, "learning_rate": 2.006272720472005e-05, "loss": 0.169, "loss_nan_ranks": 0, "loss_rank_avg": 0.16618266701698303, "step": 7335 }, { "epoch": 3.848008385744235, "grad_norm": 0.6110601425170898, "learning_rate": 2.0036590908996433e-05, "loss": 0.1616, "loss_nan_ranks": 0, "loss_rank_avg": 0.19544419646263123, "step": 7340 }, { "epoch": 3.85062893081761, "grad_norm": 0.6725945472717285, "learning_rate": 2.001045455078376e-05, "loss": 0.1497, "loss_nan_ranks": 0, "loss_rank_avg": 0.14471367001533508, "step": 7345 }, { "epoch": 3.853249475890985, "grad_norm": 0.7329033017158508, "learning_rate": 1.9984318174717063e-05, "loss": 0.1416, "loss_nan_ranks": 0, "loss_rank_avg": 0.1296122521162033, "step": 7350 }, { "epoch": 3.8558700209643604, "grad_norm": 0.6029754281044006, "learning_rate": 1.9958181825431408e-05, "loss": 0.1799, "loss_nan_ranks": 0, "loss_rank_avg": 0.17006736993789673, "step": 7355 }, { "epoch": 3.858490566037736, "grad_norm": 0.6240707039833069, "learning_rate": 1.9932045547561794e-05, "loss": 0.1568, "loss_nan_ranks": 0, "loss_rank_avg": 0.14021959900856018, "step": 7360 }, { "epoch": 3.861111111111111, "grad_norm": 0.6396682858467102, "learning_rate": 1.9905909385743127e-05, "loss": 0.1522, "loss_nan_ranks": 0, "loss_rank_avg": 0.136474609375, "step": 7365 }, { "epoch": 3.8637316561844863, "grad_norm": 0.6918826103210449, "learning_rate": 1.9879773384610097e-05, "loss": 0.1446, "loss_nan_ranks": 0, "loss_rank_avg": 0.10507458448410034, "step": 7370 }, { "epoch": 3.8663522012578615, "grad_norm": 0.6699333786964417, "learning_rate": 1.985363758879713e-05, "loss": 0.1695, "loss_nan_ranks": 0, "loss_rank_avg": 0.1435866504907608, "step": 7375 }, { "epoch": 3.868972746331237, "grad_norm": 0.588420033454895, "learning_rate": 1.9827502042938287e-05, "loss": 0.1576, "loss_nan_ranks": 0, "loss_rank_avg": 0.14458289742469788, "step": 7380 }, { "epoch": 3.8715932914046123, "grad_norm": 0.7174352407455444, "learning_rate": 1.9801366791667208e-05, "loss": 0.1492, "loss_nan_ranks": 0, "loss_rank_avg": 0.14362692832946777, "step": 7385 }, { "epoch": 3.8742138364779874, "grad_norm": 0.6377548575401306, "learning_rate": 1.9775231879617046e-05, "loss": 0.1611, "loss_nan_ranks": 0, "loss_rank_avg": 0.17670518159866333, "step": 7390 }, { "epoch": 3.8768343815513626, "grad_norm": 0.6253235936164856, "learning_rate": 1.9749097351420352e-05, "loss": 0.1663, "loss_nan_ranks": 0, "loss_rank_avg": 0.1592959463596344, "step": 7395 }, { "epoch": 3.879454926624738, "grad_norm": 0.5513318777084351, "learning_rate": 1.9722963251709033e-05, "loss": 0.1468, "loss_nan_ranks": 0, "loss_rank_avg": 0.12524718046188354, "step": 7400 }, { "epoch": 3.882075471698113, "grad_norm": 0.6378021240234375, "learning_rate": 1.9696829625114262e-05, "loss": 0.1651, "loss_nan_ranks": 0, "loss_rank_avg": 0.17486140131950378, "step": 7405 }, { "epoch": 3.8846960167714886, "grad_norm": 0.6169103384017944, "learning_rate": 1.9670696516266402e-05, "loss": 0.1503, "loss_nan_ranks": 0, "loss_rank_avg": 0.1326904296875, "step": 7410 }, { "epoch": 3.8873165618448637, "grad_norm": 0.6443046927452087, "learning_rate": 1.9644563969794937e-05, "loss": 0.1561, "loss_nan_ranks": 0, "loss_rank_avg": 0.17288607358932495, "step": 7415 }, { "epoch": 3.889937106918239, "grad_norm": 0.7150650024414062, "learning_rate": 1.961843203032838e-05, "loss": 0.1621, "loss_nan_ranks": 0, "loss_rank_avg": 0.17397406697273254, "step": 7420 }, { "epoch": 3.8925576519916145, "grad_norm": 1.0315990447998047, "learning_rate": 1.9592300742494227e-05, "loss": 0.1583, "loss_nan_ranks": 0, "loss_rank_avg": 0.16239425539970398, "step": 7425 }, { "epoch": 3.8951781970649897, "grad_norm": 0.8368551731109619, "learning_rate": 1.9566170150918842e-05, "loss": 0.1383, "loss_nan_ranks": 0, "loss_rank_avg": 0.1172657310962677, "step": 7430 }, { "epoch": 3.897798742138365, "grad_norm": 0.6009153127670288, "learning_rate": 1.95400403002274e-05, "loss": 0.1507, "loss_nan_ranks": 0, "loss_rank_avg": 0.1353941410779953, "step": 7435 }, { "epoch": 3.90041928721174, "grad_norm": 0.5876301527023315, "learning_rate": 1.9513911235043833e-05, "loss": 0.1464, "loss_nan_ranks": 0, "loss_rank_avg": 0.1405186504125595, "step": 7440 }, { "epoch": 3.903039832285115, "grad_norm": 0.6061398386955261, "learning_rate": 1.9487782999990707e-05, "loss": 0.1625, "loss_nan_ranks": 0, "loss_rank_avg": 0.1707381010055542, "step": 7445 }, { "epoch": 3.9056603773584904, "grad_norm": 0.682398796081543, "learning_rate": 1.9461655639689176e-05, "loss": 0.1542, "loss_nan_ranks": 0, "loss_rank_avg": 0.10355198383331299, "step": 7450 }, { "epoch": 3.908280922431866, "grad_norm": 0.5958063006401062, "learning_rate": 1.943552919875891e-05, "loss": 0.1439, "loss_nan_ranks": 0, "loss_rank_avg": 0.14010480046272278, "step": 7455 }, { "epoch": 3.910901467505241, "grad_norm": 0.6603404879570007, "learning_rate": 1.9409403721817997e-05, "loss": 0.1493, "loss_nan_ranks": 0, "loss_rank_avg": 0.1476980298757553, "step": 7460 }, { "epoch": 3.9135220125786163, "grad_norm": 0.6378297805786133, "learning_rate": 1.938327925348289e-05, "loss": 0.15, "loss_nan_ranks": 0, "loss_rank_avg": 0.19074776768684387, "step": 7465 }, { "epoch": 3.9161425576519915, "grad_norm": 0.531608521938324, "learning_rate": 1.9357155838368314e-05, "loss": 0.1593, "loss_nan_ranks": 0, "loss_rank_avg": 0.1572052240371704, "step": 7470 }, { "epoch": 3.918763102725367, "grad_norm": 0.619661271572113, "learning_rate": 1.9331033521087187e-05, "loss": 0.1634, "loss_nan_ranks": 0, "loss_rank_avg": 0.17789630591869354, "step": 7475 }, { "epoch": 3.9213836477987423, "grad_norm": 0.6324253678321838, "learning_rate": 1.9304912346250567e-05, "loss": 0.152, "loss_nan_ranks": 0, "loss_rank_avg": 0.15719234943389893, "step": 7480 }, { "epoch": 3.9240041928721174, "grad_norm": 0.6691383123397827, "learning_rate": 1.9278792358467552e-05, "loss": 0.1669, "loss_nan_ranks": 0, "loss_rank_avg": 0.17446158826351166, "step": 7485 }, { "epoch": 3.9266247379454926, "grad_norm": 0.6252411007881165, "learning_rate": 1.925267360234522e-05, "loss": 0.1534, "loss_nan_ranks": 0, "loss_rank_avg": 0.1358642578125, "step": 7490 }, { "epoch": 3.9292452830188678, "grad_norm": 0.5381175875663757, "learning_rate": 1.9226556122488533e-05, "loss": 0.1516, "loss_nan_ranks": 0, "loss_rank_avg": 0.1207832396030426, "step": 7495 }, { "epoch": 3.931865828092243, "grad_norm": 0.5609357953071594, "learning_rate": 1.9200439963500282e-05, "loss": 0.1411, "loss_nan_ranks": 0, "loss_rank_avg": 0.1350494772195816, "step": 7500 }, { "epoch": 3.9344863731656186, "grad_norm": 0.6950834393501282, "learning_rate": 1.917432516998101e-05, "loss": 0.1378, "loss_nan_ranks": 0, "loss_rank_avg": 0.14841407537460327, "step": 7505 }, { "epoch": 3.9371069182389937, "grad_norm": 0.6106055378913879, "learning_rate": 1.9148211786528904e-05, "loss": 0.1737, "loss_nan_ranks": 0, "loss_rank_avg": 0.2322182059288025, "step": 7510 }, { "epoch": 3.939727463312369, "grad_norm": 0.641797661781311, "learning_rate": 1.912209985773977e-05, "loss": 0.1572, "loss_nan_ranks": 0, "loss_rank_avg": 0.16424429416656494, "step": 7515 }, { "epoch": 3.9423480083857445, "grad_norm": 0.7005577087402344, "learning_rate": 1.9095989428206917e-05, "loss": 0.1415, "loss_nan_ranks": 0, "loss_rank_avg": 0.1828462779521942, "step": 7520 }, { "epoch": 3.9449685534591197, "grad_norm": 0.7762574553489685, "learning_rate": 1.906988054252109e-05, "loss": 0.1454, "loss_nan_ranks": 0, "loss_rank_avg": 0.11207491159439087, "step": 7525 }, { "epoch": 3.947589098532495, "grad_norm": 0.6599898934364319, "learning_rate": 1.9043773245270406e-05, "loss": 0.1456, "loss_nan_ranks": 0, "loss_rank_avg": 0.1344425529241562, "step": 7530 }, { "epoch": 3.95020964360587, "grad_norm": 0.5022367238998413, "learning_rate": 1.9017667581040264e-05, "loss": 0.1435, "loss_nan_ranks": 0, "loss_rank_avg": 0.11625401675701141, "step": 7535 }, { "epoch": 3.952830188679245, "grad_norm": 0.687852680683136, "learning_rate": 1.8991563594413274e-05, "loss": 0.15, "loss_nan_ranks": 0, "loss_rank_avg": 0.16913428902626038, "step": 7540 }, { "epoch": 3.9554507337526204, "grad_norm": 0.636171817779541, "learning_rate": 1.8965461329969186e-05, "loss": 0.1457, "loss_nan_ranks": 0, "loss_rank_avg": 0.14577579498291016, "step": 7545 }, { "epoch": 3.958071278825996, "grad_norm": 0.7088593244552612, "learning_rate": 1.89393608322848e-05, "loss": 0.1567, "loss_nan_ranks": 0, "loss_rank_avg": 0.18438458442687988, "step": 7550 }, { "epoch": 3.960691823899371, "grad_norm": 0.8053141236305237, "learning_rate": 1.891326214593391e-05, "loss": 0.1409, "loss_nan_ranks": 0, "loss_rank_avg": 0.1474609375, "step": 7555 }, { "epoch": 3.9633123689727463, "grad_norm": 0.5420475006103516, "learning_rate": 1.888716531548721e-05, "loss": 0.1469, "loss_nan_ranks": 0, "loss_rank_avg": 0.1517164409160614, "step": 7560 }, { "epoch": 3.9659329140461215, "grad_norm": 0.6235396265983582, "learning_rate": 1.8861070385512222e-05, "loss": 0.1712, "loss_nan_ranks": 0, "loss_rank_avg": 0.15240763127803802, "step": 7565 }, { "epoch": 3.968553459119497, "grad_norm": 0.6375654339790344, "learning_rate": 1.883497740057323e-05, "loss": 0.1439, "loss_nan_ranks": 0, "loss_rank_avg": 0.14083002507686615, "step": 7570 }, { "epoch": 3.9711740041928723, "grad_norm": 0.6710426807403564, "learning_rate": 1.8808886405231184e-05, "loss": 0.1404, "loss_nan_ranks": 0, "loss_rank_avg": 0.15685389935970306, "step": 7575 }, { "epoch": 3.9737945492662474, "grad_norm": 0.7832931876182556, "learning_rate": 1.8782797444043657e-05, "loss": 0.155, "loss_nan_ranks": 0, "loss_rank_avg": 0.18243786692619324, "step": 7580 }, { "epoch": 3.9764150943396226, "grad_norm": 0.5275224447250366, "learning_rate": 1.8756710561564728e-05, "loss": 0.1409, "loss_nan_ranks": 0, "loss_rank_avg": 0.09319180250167847, "step": 7585 }, { "epoch": 3.9790356394129978, "grad_norm": 0.7512395977973938, "learning_rate": 1.8730625802344927e-05, "loss": 0.1651, "loss_nan_ranks": 0, "loss_rank_avg": 0.16206032037734985, "step": 7590 }, { "epoch": 3.981656184486373, "grad_norm": 0.7400315999984741, "learning_rate": 1.870454321093118e-05, "loss": 0.1516, "loss_nan_ranks": 0, "loss_rank_avg": 0.1293114572763443, "step": 7595 }, { "epoch": 3.9842767295597485, "grad_norm": 0.6416282057762146, "learning_rate": 1.8678462831866684e-05, "loss": 0.1424, "loss_nan_ranks": 0, "loss_rank_avg": 0.12756970524787903, "step": 7600 }, { "epoch": 3.9868972746331237, "grad_norm": 1.3672630786895752, "learning_rate": 1.8652384709690875e-05, "loss": 0.1557, "loss_nan_ranks": 0, "loss_rank_avg": 0.1692837029695511, "step": 7605 }, { "epoch": 3.989517819706499, "grad_norm": 0.670573890209198, "learning_rate": 1.8626308888939323e-05, "loss": 0.1431, "loss_nan_ranks": 0, "loss_rank_avg": 0.1169547587633133, "step": 7610 }, { "epoch": 3.992138364779874, "grad_norm": 0.7186034321784973, "learning_rate": 1.8600235414143676e-05, "loss": 0.1531, "loss_nan_ranks": 0, "loss_rank_avg": 0.15219153463840485, "step": 7615 }, { "epoch": 3.9947589098532497, "grad_norm": 0.5585160851478577, "learning_rate": 1.8574164329831578e-05, "loss": 0.1543, "loss_nan_ranks": 0, "loss_rank_avg": 0.18344330787658691, "step": 7620 }, { "epoch": 3.997379454926625, "grad_norm": 0.6685236692428589, "learning_rate": 1.8548095680526577e-05, "loss": 0.1516, "loss_nan_ranks": 0, "loss_rank_avg": 0.18843969702720642, "step": 7625 }, { "epoch": 4.000524109014675, "grad_norm": 0.5740066170692444, "learning_rate": 1.852202951074808e-05, "loss": 0.158, "loss_nan_ranks": 0, "loss_rank_avg": 0.14020760357379913, "step": 7630 }, { "epoch": 4.00314465408805, "grad_norm": 0.6754254102706909, "learning_rate": 1.8495965865011247e-05, "loss": 0.1491, "loss_nan_ranks": 0, "loss_rank_avg": 0.1560601145029068, "step": 7635 }, { "epoch": 4.005765199161425, "grad_norm": 0.7221053838729858, "learning_rate": 1.8469904787826928e-05, "loss": 0.1332, "loss_nan_ranks": 0, "loss_rank_avg": 0.137377068400383, "step": 7640 }, { "epoch": 4.0083857442348005, "grad_norm": 0.5754213929176331, "learning_rate": 1.8443846323701596e-05, "loss": 0.1319, "loss_nan_ranks": 0, "loss_rank_avg": 0.1294102668762207, "step": 7645 }, { "epoch": 4.011006289308176, "grad_norm": 0.7324739694595337, "learning_rate": 1.841779051713725e-05, "loss": 0.1341, "loss_nan_ranks": 0, "loss_rank_avg": 0.10472945868968964, "step": 7650 }, { "epoch": 4.013626834381552, "grad_norm": 0.6781483888626099, "learning_rate": 1.839173741263136e-05, "loss": 0.1378, "loss_nan_ranks": 0, "loss_rank_avg": 0.11547206342220306, "step": 7655 }, { "epoch": 4.016247379454927, "grad_norm": 0.7400633692741394, "learning_rate": 1.836568705467678e-05, "loss": 0.1368, "loss_nan_ranks": 0, "loss_rank_avg": 0.18429158627986908, "step": 7660 }, { "epoch": 4.018867924528302, "grad_norm": 0.7450043559074402, "learning_rate": 1.8339639487761663e-05, "loss": 0.1047, "loss_nan_ranks": 0, "loss_rank_avg": 0.11984512209892273, "step": 7665 }, { "epoch": 4.021488469601677, "grad_norm": 0.570132851600647, "learning_rate": 1.8313594756369407e-05, "loss": 0.1428, "loss_nan_ranks": 0, "loss_rank_avg": 0.157229483127594, "step": 7670 }, { "epoch": 4.024109014675052, "grad_norm": 0.6760542988777161, "learning_rate": 1.8287552904978566e-05, "loss": 0.127, "loss_nan_ranks": 0, "loss_rank_avg": 0.1246337890625, "step": 7675 }, { "epoch": 4.026729559748428, "grad_norm": 0.833230197429657, "learning_rate": 1.8261513978062768e-05, "loss": 0.1374, "loss_nan_ranks": 0, "loss_rank_avg": 0.12040312588214874, "step": 7680 }, { "epoch": 4.029350104821803, "grad_norm": 0.6724702715873718, "learning_rate": 1.8235478020090658e-05, "loss": 0.1364, "loss_nan_ranks": 0, "loss_rank_avg": 0.13720819354057312, "step": 7685 }, { "epoch": 4.031970649895178, "grad_norm": 0.6574586033821106, "learning_rate": 1.82094450755258e-05, "loss": 0.1323, "loss_nan_ranks": 0, "loss_rank_avg": 0.1236087828874588, "step": 7690 }, { "epoch": 4.034591194968553, "grad_norm": 0.6379922032356262, "learning_rate": 1.8183415188826623e-05, "loss": 0.1386, "loss_nan_ranks": 0, "loss_rank_avg": 0.15040497481822968, "step": 7695 }, { "epoch": 4.037211740041928, "grad_norm": 0.7528687715530396, "learning_rate": 1.8157388404446324e-05, "loss": 0.1287, "loss_nan_ranks": 0, "loss_rank_avg": 0.11081687361001968, "step": 7700 }, { "epoch": 4.039832285115304, "grad_norm": 0.6273317337036133, "learning_rate": 1.81313647668328e-05, "loss": 0.1485, "loss_nan_ranks": 0, "loss_rank_avg": 0.15230341255664825, "step": 7705 }, { "epoch": 4.0424528301886795, "grad_norm": 0.7650076150894165, "learning_rate": 1.810534432042859e-05, "loss": 0.1303, "loss_nan_ranks": 0, "loss_rank_avg": 0.13308845460414886, "step": 7710 }, { "epoch": 4.045073375262055, "grad_norm": 0.8710198998451233, "learning_rate": 1.8079327109670762e-05, "loss": 0.1127, "loss_nan_ranks": 0, "loss_rank_avg": 0.10908554494380951, "step": 7715 }, { "epoch": 4.04769392033543, "grad_norm": 0.6835393905639648, "learning_rate": 1.805331317899088e-05, "loss": 0.1233, "loss_nan_ranks": 0, "loss_rank_avg": 0.14658352732658386, "step": 7720 }, { "epoch": 4.050314465408805, "grad_norm": 0.6922931671142578, "learning_rate": 1.802730257281489e-05, "loss": 0.1291, "loss_nan_ranks": 0, "loss_rank_avg": 0.15485787391662598, "step": 7725 }, { "epoch": 4.05293501048218, "grad_norm": 0.6863040328025818, "learning_rate": 1.800129533556306e-05, "loss": 0.1353, "loss_nan_ranks": 0, "loss_rank_avg": 0.11109860241413116, "step": 7730 }, { "epoch": 4.055555555555555, "grad_norm": 0.7296552062034607, "learning_rate": 1.797529151164992e-05, "loss": 0.1286, "loss_nan_ranks": 0, "loss_rank_avg": 0.12409099191427231, "step": 7735 }, { "epoch": 4.0581761006289305, "grad_norm": 0.6822858452796936, "learning_rate": 1.7949291145484153e-05, "loss": 0.1251, "loss_nan_ranks": 0, "loss_rank_avg": 0.09227552264928818, "step": 7740 }, { "epoch": 4.060796645702306, "grad_norm": 0.6392486095428467, "learning_rate": 1.7923294281468552e-05, "loss": 0.1305, "loss_nan_ranks": 0, "loss_rank_avg": 0.10040283203125, "step": 7745 }, { "epoch": 4.063417190775682, "grad_norm": 0.5952988862991333, "learning_rate": 1.789730096399992e-05, "loss": 0.1339, "loss_nan_ranks": 0, "loss_rank_avg": 0.15841393172740936, "step": 7750 }, { "epoch": 4.066037735849057, "grad_norm": 0.6478767991065979, "learning_rate": 1.7871311237468997e-05, "loss": 0.1508, "loss_nan_ranks": 0, "loss_rank_avg": 0.17274433374404907, "step": 7755 }, { "epoch": 4.068658280922432, "grad_norm": 0.7296462059020996, "learning_rate": 1.7845325146260416e-05, "loss": 0.1223, "loss_nan_ranks": 0, "loss_rank_avg": 0.11344051361083984, "step": 7760 }, { "epoch": 4.071278825995807, "grad_norm": 0.7088618874549866, "learning_rate": 1.7819342734752573e-05, "loss": 0.1368, "loss_nan_ranks": 0, "loss_rank_avg": 0.13282841444015503, "step": 7765 }, { "epoch": 4.073899371069182, "grad_norm": 0.6578029990196228, "learning_rate": 1.7793364047317588e-05, "loss": 0.1177, "loss_nan_ranks": 0, "loss_rank_avg": 0.1344364732503891, "step": 7770 }, { "epoch": 4.076519916142558, "grad_norm": 0.6128595471382141, "learning_rate": 1.7767389128321235e-05, "loss": 0.1414, "loss_nan_ranks": 0, "loss_rank_avg": 0.13483229279518127, "step": 7775 }, { "epoch": 4.079140461215933, "grad_norm": 0.7561476826667786, "learning_rate": 1.7741418022122835e-05, "loss": 0.1144, "loss_nan_ranks": 0, "loss_rank_avg": 0.1260986328125, "step": 7780 }, { "epoch": 4.081761006289308, "grad_norm": 0.8561640381813049, "learning_rate": 1.771545077307521e-05, "loss": 0.1358, "loss_nan_ranks": 0, "loss_rank_avg": 0.14445775747299194, "step": 7785 }, { "epoch": 4.084381551362683, "grad_norm": 0.6901652216911316, "learning_rate": 1.7689487425524587e-05, "loss": 0.1344, "loss_nan_ranks": 0, "loss_rank_avg": 0.105224609375, "step": 7790 }, { "epoch": 4.087002096436058, "grad_norm": 0.6184242963790894, "learning_rate": 1.7663528023810528e-05, "loss": 0.1376, "loss_nan_ranks": 0, "loss_rank_avg": 0.16451911628246307, "step": 7795 }, { "epoch": 4.089622641509434, "grad_norm": 0.5835419297218323, "learning_rate": 1.763757261226587e-05, "loss": 0.1423, "loss_nan_ranks": 0, "loss_rank_avg": 0.12152544409036636, "step": 7800 }, { "epoch": 4.0922431865828095, "grad_norm": 0.7047265768051147, "learning_rate": 1.7611621235216614e-05, "loss": 0.1317, "loss_nan_ranks": 0, "loss_rank_avg": 0.15300840139389038, "step": 7805 }, { "epoch": 4.094863731656185, "grad_norm": 0.7085900902748108, "learning_rate": 1.7585673936981903e-05, "loss": 0.1267, "loss_nan_ranks": 0, "loss_rank_avg": 0.13223685324192047, "step": 7810 }, { "epoch": 4.09748427672956, "grad_norm": 0.7397329211235046, "learning_rate": 1.755973076187388e-05, "loss": 0.1283, "loss_nan_ranks": 0, "loss_rank_avg": 0.11137672513723373, "step": 7815 }, { "epoch": 4.100104821802935, "grad_norm": 0.7091107964515686, "learning_rate": 1.753379175419766e-05, "loss": 0.1518, "loss_nan_ranks": 0, "loss_rank_avg": 0.14884835481643677, "step": 7820 }, { "epoch": 4.10272536687631, "grad_norm": 0.8747248649597168, "learning_rate": 1.750785695825125e-05, "loss": 0.1462, "loss_nan_ranks": 0, "loss_rank_avg": 0.14700660109519958, "step": 7825 }, { "epoch": 4.105345911949685, "grad_norm": 0.7035658955574036, "learning_rate": 1.7481926418325453e-05, "loss": 0.1209, "loss_nan_ranks": 0, "loss_rank_avg": 0.1455896496772766, "step": 7830 }, { "epoch": 4.1079664570230605, "grad_norm": 0.6393303275108337, "learning_rate": 1.7456000178703816e-05, "loss": 0.1478, "loss_nan_ranks": 0, "loss_rank_avg": 0.12629064917564392, "step": 7835 }, { "epoch": 4.110587002096436, "grad_norm": 0.6895425915718079, "learning_rate": 1.7430078283662522e-05, "loss": 0.1489, "loss_nan_ranks": 0, "loss_rank_avg": 0.14216358959674835, "step": 7840 }, { "epoch": 4.113207547169812, "grad_norm": 0.7577569484710693, "learning_rate": 1.7404160777470352e-05, "loss": 0.1231, "loss_nan_ranks": 0, "loss_rank_avg": 0.11018848419189453, "step": 7845 }, { "epoch": 4.115828092243187, "grad_norm": 0.7650940418243408, "learning_rate": 1.7378247704388585e-05, "loss": 0.1098, "loss_nan_ranks": 0, "loss_rank_avg": 0.11372706294059753, "step": 7850 }, { "epoch": 4.118448637316562, "grad_norm": 0.5672314763069153, "learning_rate": 1.7352339108670925e-05, "loss": 0.1268, "loss_nan_ranks": 0, "loss_rank_avg": 0.12074200809001923, "step": 7855 }, { "epoch": 4.121069182389937, "grad_norm": 0.6222376227378845, "learning_rate": 1.7326435034563447e-05, "loss": 0.1243, "loss_nan_ranks": 0, "loss_rank_avg": 0.1330033838748932, "step": 7860 }, { "epoch": 4.123689727463312, "grad_norm": 0.6126108169555664, "learning_rate": 1.730053552630448e-05, "loss": 0.131, "loss_nan_ranks": 0, "loss_rank_avg": 0.12257729470729828, "step": 7865 }, { "epoch": 4.126310272536688, "grad_norm": 0.7355426549911499, "learning_rate": 1.727464062812457e-05, "loss": 0.1332, "loss_nan_ranks": 0, "loss_rank_avg": 0.11887337267398834, "step": 7870 }, { "epoch": 4.128930817610063, "grad_norm": 0.653061032295227, "learning_rate": 1.7248750384246396e-05, "loss": 0.1411, "loss_nan_ranks": 0, "loss_rank_avg": 0.14558589458465576, "step": 7875 }, { "epoch": 4.131551362683438, "grad_norm": 0.6871939897537231, "learning_rate": 1.7222864838884672e-05, "loss": 0.1166, "loss_nan_ranks": 0, "loss_rank_avg": 0.103271484375, "step": 7880 }, { "epoch": 4.134171907756813, "grad_norm": 0.6735332608222961, "learning_rate": 1.7196984036246093e-05, "loss": 0.1372, "loss_nan_ranks": 0, "loss_rank_avg": 0.1126708984375, "step": 7885 }, { "epoch": 4.136792452830188, "grad_norm": 0.7757828831672668, "learning_rate": 1.7171108020529267e-05, "loss": 0.1272, "loss_nan_ranks": 0, "loss_rank_avg": 0.09894275665283203, "step": 7890 }, { "epoch": 4.139412997903564, "grad_norm": 0.658591091632843, "learning_rate": 1.7145236835924603e-05, "loss": 0.1348, "loss_nan_ranks": 0, "loss_rank_avg": 0.18379972875118256, "step": 7895 }, { "epoch": 4.1420335429769395, "grad_norm": 0.6948657631874084, "learning_rate": 1.711937052661429e-05, "loss": 0.131, "loss_nan_ranks": 0, "loss_rank_avg": 0.15797369182109833, "step": 7900 }, { "epoch": 4.144654088050315, "grad_norm": 0.670413613319397, "learning_rate": 1.709350913677217e-05, "loss": 0.1251, "loss_nan_ranks": 0, "loss_rank_avg": 0.13112309575080872, "step": 7905 }, { "epoch": 4.14727463312369, "grad_norm": 0.8161856532096863, "learning_rate": 1.7067652710563682e-05, "loss": 0.1369, "loss_nan_ranks": 0, "loss_rank_avg": 0.15099668502807617, "step": 7910 }, { "epoch": 4.149895178197065, "grad_norm": 0.557365894317627, "learning_rate": 1.7041801292145807e-05, "loss": 0.1201, "loss_nan_ranks": 0, "loss_rank_avg": 0.1425701081752777, "step": 7915 }, { "epoch": 4.15251572327044, "grad_norm": 0.69334876537323, "learning_rate": 1.7015954925666945e-05, "loss": 0.1261, "loss_nan_ranks": 0, "loss_rank_avg": 0.149521604180336, "step": 7920 }, { "epoch": 4.155136268343815, "grad_norm": 0.6496643424034119, "learning_rate": 1.69901136552669e-05, "loss": 0.1308, "loss_nan_ranks": 0, "loss_rank_avg": 0.1431989073753357, "step": 7925 }, { "epoch": 4.1577568134171905, "grad_norm": 0.7024996876716614, "learning_rate": 1.6964277525076757e-05, "loss": 0.1239, "loss_nan_ranks": 0, "loss_rank_avg": 0.0966796875, "step": 7930 }, { "epoch": 4.160377358490566, "grad_norm": 0.5853867530822754, "learning_rate": 1.6938446579218815e-05, "loss": 0.1344, "loss_nan_ranks": 0, "loss_rank_avg": 0.12605339288711548, "step": 7935 }, { "epoch": 4.162997903563942, "grad_norm": 0.5790950059890747, "learning_rate": 1.6912620861806536e-05, "loss": 0.1245, "loss_nan_ranks": 0, "loss_rank_avg": 0.12532863020896912, "step": 7940 }, { "epoch": 4.165618448637317, "grad_norm": 0.6582181453704834, "learning_rate": 1.688680041694444e-05, "loss": 0.1588, "loss_nan_ranks": 0, "loss_rank_avg": 0.1869627833366394, "step": 7945 }, { "epoch": 4.168238993710692, "grad_norm": 0.7480989098548889, "learning_rate": 1.6860985288728052e-05, "loss": 0.1326, "loss_nan_ranks": 0, "loss_rank_avg": 0.1243896484375, "step": 7950 }, { "epoch": 4.170859538784067, "grad_norm": 0.7857028245925903, "learning_rate": 1.683517552124381e-05, "loss": 0.1315, "loss_nan_ranks": 0, "loss_rank_avg": 0.13845235109329224, "step": 7955 }, { "epoch": 4.173480083857442, "grad_norm": 0.7153283357620239, "learning_rate": 1.6809371158569002e-05, "loss": 0.1169, "loss_nan_ranks": 0, "loss_rank_avg": 0.10776656866073608, "step": 7960 }, { "epoch": 4.176100628930818, "grad_norm": 0.6991490125656128, "learning_rate": 1.678357224477169e-05, "loss": 0.1456, "loss_nan_ranks": 0, "loss_rank_avg": 0.15616081655025482, "step": 7965 }, { "epoch": 4.178721174004193, "grad_norm": 0.6320271492004395, "learning_rate": 1.6757778823910612e-05, "loss": 0.1542, "loss_nan_ranks": 0, "loss_rank_avg": 0.15206626057624817, "step": 7970 }, { "epoch": 4.181341719077568, "grad_norm": 0.6700145602226257, "learning_rate": 1.673199094003515e-05, "loss": 0.1251, "loss_nan_ranks": 0, "loss_rank_avg": 0.111328125, "step": 7975 }, { "epoch": 4.183962264150943, "grad_norm": 0.7927061915397644, "learning_rate": 1.670620863718521e-05, "loss": 0.1168, "loss_nan_ranks": 0, "loss_rank_avg": 0.114990234375, "step": 7980 }, { "epoch": 4.186582809224318, "grad_norm": 0.6464593410491943, "learning_rate": 1.668043195939118e-05, "loss": 0.1485, "loss_nan_ranks": 0, "loss_rank_avg": 0.11941881477832794, "step": 7985 }, { "epoch": 4.189203354297694, "grad_norm": 0.6977652311325073, "learning_rate": 1.6654660950673834e-05, "loss": 0.1219, "loss_nan_ranks": 0, "loss_rank_avg": 0.10890907049179077, "step": 7990 }, { "epoch": 4.1918238993710695, "grad_norm": 0.7073503732681274, "learning_rate": 1.6628895655044272e-05, "loss": 0.1468, "loss_nan_ranks": 0, "loss_rank_avg": 0.15555106103420258, "step": 7995 }, { "epoch": 4.194444444444445, "grad_norm": 0.6783562302589417, "learning_rate": 1.660313611650382e-05, "loss": 0.1229, "loss_nan_ranks": 0, "loss_rank_avg": 0.13440021872520447, "step": 8000 }, { "epoch": 4.19706498951782, "grad_norm": 0.6996365189552307, "learning_rate": 1.6577382379043997e-05, "loss": 0.1238, "loss_nan_ranks": 0, "loss_rank_avg": 0.16365362703800201, "step": 8005 }, { "epoch": 4.199685534591195, "grad_norm": 0.7659716010093689, "learning_rate": 1.6551634486646394e-05, "loss": 0.1161, "loss_nan_ranks": 0, "loss_rank_avg": 0.11299479007720947, "step": 8010 }, { "epoch": 4.20230607966457, "grad_norm": 0.7514837980270386, "learning_rate": 1.652589248328264e-05, "loss": 0.127, "loss_nan_ranks": 0, "loss_rank_avg": 0.13047926127910614, "step": 8015 }, { "epoch": 4.204926624737945, "grad_norm": 0.8097921013832092, "learning_rate": 1.6500156412914286e-05, "loss": 0.1252, "loss_nan_ranks": 0, "loss_rank_avg": 0.12737253308296204, "step": 8020 }, { "epoch": 4.2075471698113205, "grad_norm": 0.9040654301643372, "learning_rate": 1.6474426319492758e-05, "loss": 0.135, "loss_nan_ranks": 0, "loss_rank_avg": 0.13547413051128387, "step": 8025 }, { "epoch": 4.210167714884696, "grad_norm": 0.6679673790931702, "learning_rate": 1.6448702246959286e-05, "loss": 0.1303, "loss_nan_ranks": 0, "loss_rank_avg": 0.11428596079349518, "step": 8030 }, { "epoch": 4.212788259958071, "grad_norm": 0.6575073599815369, "learning_rate": 1.6422984239244803e-05, "loss": 0.1332, "loss_nan_ranks": 0, "loss_rank_avg": 0.09771472215652466, "step": 8035 }, { "epoch": 4.215408805031447, "grad_norm": 0.8515417575836182, "learning_rate": 1.6397272340269892e-05, "loss": 0.1305, "loss_nan_ranks": 0, "loss_rank_avg": 0.0955810546875, "step": 8040 }, { "epoch": 4.218029350104822, "grad_norm": 0.6153324842453003, "learning_rate": 1.63715665939447e-05, "loss": 0.1209, "loss_nan_ranks": 0, "loss_rank_avg": 0.15846960246562958, "step": 8045 }, { "epoch": 4.220649895178197, "grad_norm": 0.7781662344932556, "learning_rate": 1.6345867044168867e-05, "loss": 0.1344, "loss_nan_ranks": 0, "loss_rank_avg": 0.1522197723388672, "step": 8050 }, { "epoch": 4.223270440251572, "grad_norm": 0.7323370575904846, "learning_rate": 1.6320173734831463e-05, "loss": 0.151, "loss_nan_ranks": 0, "loss_rank_avg": 0.14353422820568085, "step": 8055 }, { "epoch": 4.225890985324948, "grad_norm": 0.6446478962898254, "learning_rate": 1.6294486709810875e-05, "loss": 0.1264, "loss_nan_ranks": 0, "loss_rank_avg": 0.11535608768463135, "step": 8060 }, { "epoch": 4.228511530398323, "grad_norm": 0.8349292874336243, "learning_rate": 1.6268806012974785e-05, "loss": 0.1177, "loss_nan_ranks": 0, "loss_rank_avg": 0.09714643657207489, "step": 8065 }, { "epoch": 4.231132075471698, "grad_norm": 0.6118739247322083, "learning_rate": 1.6243131688180048e-05, "loss": 0.1475, "loss_nan_ranks": 0, "loss_rank_avg": 0.11294543743133545, "step": 8070 }, { "epoch": 4.233752620545073, "grad_norm": 0.790520966053009, "learning_rate": 1.6217463779272647e-05, "loss": 0.1275, "loss_nan_ranks": 0, "loss_rank_avg": 0.0958251953125, "step": 8075 }, { "epoch": 4.236373165618448, "grad_norm": 0.7248908281326294, "learning_rate": 1.6191802330087606e-05, "loss": 0.1226, "loss_nan_ranks": 0, "loss_rank_avg": 0.1535649597644806, "step": 8080 }, { "epoch": 4.238993710691824, "grad_norm": 0.7249651551246643, "learning_rate": 1.6166147384448915e-05, "loss": 0.1464, "loss_nan_ranks": 0, "loss_rank_avg": 0.13027583062648773, "step": 8085 }, { "epoch": 4.2416142557651995, "grad_norm": 0.8868975639343262, "learning_rate": 1.614049898616947e-05, "loss": 0.1238, "loss_nan_ranks": 0, "loss_rank_avg": 0.0872802734375, "step": 8090 }, { "epoch": 4.244234800838575, "grad_norm": 0.6978241801261902, "learning_rate": 1.611485717905096e-05, "loss": 0.1422, "loss_nan_ranks": 0, "loss_rank_avg": 0.1411801278591156, "step": 8095 }, { "epoch": 4.24685534591195, "grad_norm": 0.6798672676086426, "learning_rate": 1.6089222006883835e-05, "loss": 0.1306, "loss_nan_ranks": 0, "loss_rank_avg": 0.10818228125572205, "step": 8100 }, { "epoch": 4.249475890985325, "grad_norm": 0.6767073273658752, "learning_rate": 1.6063593513447223e-05, "loss": 0.1477, "loss_nan_ranks": 0, "loss_rank_avg": 0.17610739171504974, "step": 8105 }, { "epoch": 4.2520964360587, "grad_norm": 0.6395707726478577, "learning_rate": 1.6037971742508826e-05, "loss": 0.1433, "loss_nan_ranks": 0, "loss_rank_avg": 0.1640520840883255, "step": 8110 }, { "epoch": 4.254716981132075, "grad_norm": 0.7487215399742126, "learning_rate": 1.6012356737824873e-05, "loss": 0.1339, "loss_nan_ranks": 0, "loss_rank_avg": 0.11843766272068024, "step": 8115 }, { "epoch": 4.2573375262054505, "grad_norm": 0.698691725730896, "learning_rate": 1.598674854314005e-05, "loss": 0.1273, "loss_nan_ranks": 0, "loss_rank_avg": 0.13690641522407532, "step": 8120 }, { "epoch": 4.259958071278826, "grad_norm": 0.6861567497253418, "learning_rate": 1.5961147202187385e-05, "loss": 0.1326, "loss_nan_ranks": 0, "loss_rank_avg": 0.14201557636260986, "step": 8125 }, { "epoch": 4.262578616352201, "grad_norm": 0.639648973941803, "learning_rate": 1.5935552758688237e-05, "loss": 0.1236, "loss_nan_ranks": 0, "loss_rank_avg": 0.14524777233600616, "step": 8130 }, { "epoch": 4.265199161425577, "grad_norm": 0.7146438360214233, "learning_rate": 1.5909965256352156e-05, "loss": 0.1466, "loss_nan_ranks": 0, "loss_rank_avg": 0.1595865935087204, "step": 8135 }, { "epoch": 4.267819706498952, "grad_norm": 0.6025714874267578, "learning_rate": 1.588438473887685e-05, "loss": 0.1268, "loss_nan_ranks": 0, "loss_rank_avg": 0.12222793698310852, "step": 8140 }, { "epoch": 4.270440251572327, "grad_norm": 0.6809830665588379, "learning_rate": 1.5858811249948104e-05, "loss": 0.1303, "loss_nan_ranks": 0, "loss_rank_avg": 0.1384069174528122, "step": 8145 }, { "epoch": 4.273060796645702, "grad_norm": 0.8114305734634399, "learning_rate": 1.5833244833239686e-05, "loss": 0.1554, "loss_nan_ranks": 0, "loss_rank_avg": 0.15316900610923767, "step": 8150 }, { "epoch": 4.2756813417190775, "grad_norm": 0.6200181841850281, "learning_rate": 1.58076855324133e-05, "loss": 0.1261, "loss_nan_ranks": 0, "loss_rank_avg": 0.14065788686275482, "step": 8155 }, { "epoch": 4.278301886792453, "grad_norm": 0.8265540599822998, "learning_rate": 1.578213339111849e-05, "loss": 0.1271, "loss_nan_ranks": 0, "loss_rank_avg": 0.1082763671875, "step": 8160 }, { "epoch": 4.280922431865828, "grad_norm": 0.6733942627906799, "learning_rate": 1.575658845299257e-05, "loss": 0.1283, "loss_nan_ranks": 0, "loss_rank_avg": 0.13398826122283936, "step": 8165 }, { "epoch": 4.283542976939203, "grad_norm": 0.5992264151573181, "learning_rate": 1.5731050761660563e-05, "loss": 0.1376, "loss_nan_ranks": 0, "loss_rank_avg": 0.16435667872428894, "step": 8170 }, { "epoch": 4.286163522012578, "grad_norm": 0.6579468846321106, "learning_rate": 1.570552036073511e-05, "loss": 0.1274, "loss_nan_ranks": 0, "loss_rank_avg": 0.13947761058807373, "step": 8175 }, { "epoch": 4.288784067085954, "grad_norm": 0.689167857170105, "learning_rate": 1.5679997293816397e-05, "loss": 0.1327, "loss_nan_ranks": 0, "loss_rank_avg": 0.119140625, "step": 8180 }, { "epoch": 4.2914046121593294, "grad_norm": 0.6589269042015076, "learning_rate": 1.56544816044921e-05, "loss": 0.1325, "loss_nan_ranks": 0, "loss_rank_avg": 0.1422399878501892, "step": 8185 }, { "epoch": 4.294025157232705, "grad_norm": 0.8908357620239258, "learning_rate": 1.5628973336337273e-05, "loss": 0.1415, "loss_nan_ranks": 0, "loss_rank_avg": 0.12804053723812103, "step": 8190 }, { "epoch": 4.29664570230608, "grad_norm": 0.7792320847511292, "learning_rate": 1.560347253291432e-05, "loss": 0.1274, "loss_nan_ranks": 0, "loss_rank_avg": 0.1163330078125, "step": 8195 }, { "epoch": 4.299266247379455, "grad_norm": 0.6766254305839539, "learning_rate": 1.557797923777288e-05, "loss": 0.1242, "loss_nan_ranks": 0, "loss_rank_avg": 0.15499071776866913, "step": 8200 }, { "epoch": 4.30188679245283, "grad_norm": 0.7019739747047424, "learning_rate": 1.5552493494449775e-05, "loss": 0.1366, "loss_nan_ranks": 0, "loss_rank_avg": 0.1348932981491089, "step": 8205 }, { "epoch": 4.304507337526205, "grad_norm": 0.8038340210914612, "learning_rate": 1.552701534646894e-05, "loss": 0.1193, "loss_nan_ranks": 0, "loss_rank_avg": 0.11774717271327972, "step": 8210 }, { "epoch": 4.3071278825995805, "grad_norm": 0.7018369436264038, "learning_rate": 1.5501544837341316e-05, "loss": 0.1406, "loss_nan_ranks": 0, "loss_rank_avg": 0.14359250664710999, "step": 8215 }, { "epoch": 4.309748427672956, "grad_norm": 0.5905582308769226, "learning_rate": 1.5476082010564825e-05, "loss": 0.1418, "loss_nan_ranks": 0, "loss_rank_avg": 0.14024695754051208, "step": 8220 }, { "epoch": 4.312368972746331, "grad_norm": 0.6606282591819763, "learning_rate": 1.545062690962425e-05, "loss": 0.1323, "loss_nan_ranks": 0, "loss_rank_avg": 0.13133929669857025, "step": 8225 }, { "epoch": 4.314989517819707, "grad_norm": 0.6196896433830261, "learning_rate": 1.5425179577991182e-05, "loss": 0.1191, "loss_nan_ranks": 0, "loss_rank_avg": 0.12420549988746643, "step": 8230 }, { "epoch": 4.317610062893082, "grad_norm": 0.7207083106040955, "learning_rate": 1.539974005912396e-05, "loss": 0.1318, "loss_nan_ranks": 0, "loss_rank_avg": 0.12050847709178925, "step": 8235 }, { "epoch": 4.320230607966457, "grad_norm": 0.6700676679611206, "learning_rate": 1.5374308396467555e-05, "loss": 0.1432, "loss_nan_ranks": 0, "loss_rank_avg": 0.13546261191368103, "step": 8240 }, { "epoch": 4.322851153039832, "grad_norm": 0.7328806519508362, "learning_rate": 1.534888463345355e-05, "loss": 0.1341, "loss_nan_ranks": 0, "loss_rank_avg": 0.1011962890625, "step": 8245 }, { "epoch": 4.3254716981132075, "grad_norm": 0.6616674065589905, "learning_rate": 1.5323468813500016e-05, "loss": 0.1167, "loss_nan_ranks": 0, "loss_rank_avg": 0.13191799819469452, "step": 8250 }, { "epoch": 4.328092243186583, "grad_norm": 0.9578312039375305, "learning_rate": 1.529806098001146e-05, "loss": 0.1403, "loss_nan_ranks": 0, "loss_rank_avg": 0.1744530200958252, "step": 8255 }, { "epoch": 4.330712788259958, "grad_norm": 0.591022253036499, "learning_rate": 1.5272661176378765e-05, "loss": 0.1331, "loss_nan_ranks": 0, "loss_rank_avg": 0.14783424139022827, "step": 8260 }, { "epoch": 4.333333333333333, "grad_norm": 0.7101303935050964, "learning_rate": 1.524726944597908e-05, "loss": 0.1455, "loss_nan_ranks": 0, "loss_rank_avg": 0.11623962223529816, "step": 8265 }, { "epoch": 4.335953878406708, "grad_norm": 0.6866541504859924, "learning_rate": 1.5221885832175791e-05, "loss": 0.1284, "loss_nan_ranks": 0, "loss_rank_avg": 0.12282105535268784, "step": 8270 }, { "epoch": 4.338574423480084, "grad_norm": 0.827079713344574, "learning_rate": 1.51965103783184e-05, "loss": 0.1276, "loss_nan_ranks": 0, "loss_rank_avg": 0.133544921875, "step": 8275 }, { "epoch": 4.341194968553459, "grad_norm": 0.7082659006118774, "learning_rate": 1.5171143127742483e-05, "loss": 0.1504, "loss_nan_ranks": 0, "loss_rank_avg": 0.13507716357707977, "step": 8280 }, { "epoch": 4.343815513626835, "grad_norm": 0.7135539054870605, "learning_rate": 1.5145784123769614e-05, "loss": 0.1308, "loss_nan_ranks": 0, "loss_rank_avg": 0.11422479152679443, "step": 8285 }, { "epoch": 4.34643605870021, "grad_norm": 0.6408969759941101, "learning_rate": 1.5120433409707267e-05, "loss": 0.1253, "loss_nan_ranks": 0, "loss_rank_avg": 0.13356246054172516, "step": 8290 }, { "epoch": 4.349056603773585, "grad_norm": 0.7403706908226013, "learning_rate": 1.5095091028848778e-05, "loss": 0.1166, "loss_nan_ranks": 0, "loss_rank_avg": 0.11962890625, "step": 8295 }, { "epoch": 4.35167714884696, "grad_norm": 0.7667035460472107, "learning_rate": 1.506975702447324e-05, "loss": 0.1386, "loss_nan_ranks": 0, "loss_rank_avg": 0.15348896384239197, "step": 8300 }, { "epoch": 4.354297693920335, "grad_norm": 0.61168372631073, "learning_rate": 1.5044431439845433e-05, "loss": 0.1311, "loss_nan_ranks": 0, "loss_rank_avg": 0.1137775182723999, "step": 8305 }, { "epoch": 4.3569182389937104, "grad_norm": 0.699834942817688, "learning_rate": 1.5019114318215779e-05, "loss": 0.138, "loss_nan_ranks": 0, "loss_rank_avg": 0.14548508822917938, "step": 8310 }, { "epoch": 4.359538784067086, "grad_norm": 0.744339108467102, "learning_rate": 1.4993805702820234e-05, "loss": 0.117, "loss_nan_ranks": 0, "loss_rank_avg": 0.08885983377695084, "step": 8315 }, { "epoch": 4.362159329140461, "grad_norm": 0.6417985558509827, "learning_rate": 1.496850563688022e-05, "loss": 0.1414, "loss_nan_ranks": 0, "loss_rank_avg": 0.1308688521385193, "step": 8320 }, { "epoch": 4.364779874213837, "grad_norm": 0.7600770592689514, "learning_rate": 1.4943214163602582e-05, "loss": 0.1155, "loss_nan_ranks": 0, "loss_rank_avg": 0.11033851653337479, "step": 8325 }, { "epoch": 4.367400419287212, "grad_norm": 0.7353156805038452, "learning_rate": 1.4917931326179462e-05, "loss": 0.1276, "loss_nan_ranks": 0, "loss_rank_avg": 0.15977749228477478, "step": 8330 }, { "epoch": 4.370020964360587, "grad_norm": 0.7362678050994873, "learning_rate": 1.489265716778828e-05, "loss": 0.1316, "loss_nan_ranks": 0, "loss_rank_avg": 0.16832853853702545, "step": 8335 }, { "epoch": 4.372641509433962, "grad_norm": 0.8924562335014343, "learning_rate": 1.4867391731591618e-05, "loss": 0.107, "loss_nan_ranks": 0, "loss_rank_avg": 0.081787109375, "step": 8340 }, { "epoch": 4.3752620545073375, "grad_norm": 0.8289515972137451, "learning_rate": 1.4842135060737162e-05, "loss": 0.1248, "loss_nan_ranks": 0, "loss_rank_avg": 0.09356689453125, "step": 8345 }, { "epoch": 4.377882599580713, "grad_norm": 0.7817913293838501, "learning_rate": 1.4816887198357642e-05, "loss": 0.1404, "loss_nan_ranks": 0, "loss_rank_avg": 0.15546870231628418, "step": 8350 }, { "epoch": 4.380503144654088, "grad_norm": 0.7873523235321045, "learning_rate": 1.4791648187570727e-05, "loss": 0.1471, "loss_nan_ranks": 0, "loss_rank_avg": 0.12130207568407059, "step": 8355 }, { "epoch": 4.383123689727463, "grad_norm": 0.8174839019775391, "learning_rate": 1.4766418071478987e-05, "loss": 0.1245, "loss_nan_ranks": 0, "loss_rank_avg": 0.0906982421875, "step": 8360 }, { "epoch": 4.385744234800838, "grad_norm": 0.6824502348899841, "learning_rate": 1.4741196893169793e-05, "loss": 0.1358, "loss_nan_ranks": 0, "loss_rank_avg": 0.1260560303926468, "step": 8365 }, { "epoch": 4.388364779874214, "grad_norm": 0.7154617309570312, "learning_rate": 1.4715984695715247e-05, "loss": 0.1342, "loss_nan_ranks": 0, "loss_rank_avg": 0.13355013728141785, "step": 8370 }, { "epoch": 4.390985324947589, "grad_norm": 0.7102477550506592, "learning_rate": 1.4690781522172129e-05, "loss": 0.125, "loss_nan_ranks": 0, "loss_rank_avg": 0.1316867172718048, "step": 8375 }, { "epoch": 4.393605870020965, "grad_norm": 0.6073978543281555, "learning_rate": 1.4665587415581791e-05, "loss": 0.1393, "loss_nan_ranks": 0, "loss_rank_avg": 0.18074709177017212, "step": 8380 }, { "epoch": 4.39622641509434, "grad_norm": 0.7010768055915833, "learning_rate": 1.4640402418970116e-05, "loss": 0.1313, "loss_nan_ranks": 0, "loss_rank_avg": 0.15058743953704834, "step": 8385 }, { "epoch": 4.398846960167715, "grad_norm": 0.7449144721031189, "learning_rate": 1.4615226575347419e-05, "loss": 0.139, "loss_nan_ranks": 0, "loss_rank_avg": 0.14759394526481628, "step": 8390 }, { "epoch": 4.40146750524109, "grad_norm": 0.8424119353294373, "learning_rate": 1.4590059927708379e-05, "loss": 0.1351, "loss_nan_ranks": 0, "loss_rank_avg": 0.1319580078125, "step": 8395 }, { "epoch": 4.404088050314465, "grad_norm": 0.7126275300979614, "learning_rate": 1.4564902519031992e-05, "loss": 0.1356, "loss_nan_ranks": 0, "loss_rank_avg": 0.09475725889205933, "step": 8400 }, { "epoch": 4.40670859538784, "grad_norm": 0.6241462230682373, "learning_rate": 1.453975439228145e-05, "loss": 0.1341, "loss_nan_ranks": 0, "loss_rank_avg": 0.10678377747535706, "step": 8405 }, { "epoch": 4.409329140461216, "grad_norm": 0.6343846917152405, "learning_rate": 1.4514615590404115e-05, "loss": 0.1486, "loss_nan_ranks": 0, "loss_rank_avg": 0.16542157530784607, "step": 8410 }, { "epoch": 4.411949685534591, "grad_norm": 0.7402783036231995, "learning_rate": 1.4489486156331412e-05, "loss": 0.1455, "loss_nan_ranks": 0, "loss_rank_avg": 0.13801810145378113, "step": 8415 }, { "epoch": 4.414570230607967, "grad_norm": 0.7390091419219971, "learning_rate": 1.4464366132978764e-05, "loss": 0.1332, "loss_nan_ranks": 0, "loss_rank_avg": 0.13943634927272797, "step": 8420 }, { "epoch": 4.417190775681342, "grad_norm": 0.6415871381759644, "learning_rate": 1.4439255563245539e-05, "loss": 0.1362, "loss_nan_ranks": 0, "loss_rank_avg": 0.12398558855056763, "step": 8425 }, { "epoch": 4.419811320754717, "grad_norm": 2.288583278656006, "learning_rate": 1.4414154490014944e-05, "loss": 0.1488, "loss_nan_ranks": 0, "loss_rank_avg": 0.18700379133224487, "step": 8430 }, { "epoch": 4.422431865828092, "grad_norm": 0.659214437007904, "learning_rate": 1.4389062956153974e-05, "loss": 0.1365, "loss_nan_ranks": 0, "loss_rank_avg": 0.13834397494792938, "step": 8435 }, { "epoch": 4.4250524109014675, "grad_norm": 0.7340996265411377, "learning_rate": 1.436398100451334e-05, "loss": 0.1238, "loss_nan_ranks": 0, "loss_rank_avg": 0.11822886765003204, "step": 8440 }, { "epoch": 4.427672955974843, "grad_norm": 0.7849664688110352, "learning_rate": 1.4338908677927377e-05, "loss": 0.1481, "loss_nan_ranks": 0, "loss_rank_avg": 0.13987746834754944, "step": 8445 }, { "epoch": 4.430293501048218, "grad_norm": 0.7479705214500427, "learning_rate": 1.4313846019213995e-05, "loss": 0.1333, "loss_nan_ranks": 0, "loss_rank_avg": 0.129150390625, "step": 8450 }, { "epoch": 4.432914046121593, "grad_norm": 0.6910154223442078, "learning_rate": 1.4288793071174578e-05, "loss": 0.1515, "loss_nan_ranks": 0, "loss_rank_avg": 0.14056730270385742, "step": 8455 }, { "epoch": 4.435534591194968, "grad_norm": 0.748752772808075, "learning_rate": 1.4263749876593936e-05, "loss": 0.1242, "loss_nan_ranks": 0, "loss_rank_avg": 0.1345258355140686, "step": 8460 }, { "epoch": 4.438155136268344, "grad_norm": 0.7462594509124756, "learning_rate": 1.4238716478240225e-05, "loss": 0.1303, "loss_nan_ranks": 0, "loss_rank_avg": 0.10595703125, "step": 8465 }, { "epoch": 4.440775681341719, "grad_norm": 0.6870795488357544, "learning_rate": 1.421369291886486e-05, "loss": 0.1448, "loss_nan_ranks": 0, "loss_rank_avg": 0.15087157487869263, "step": 8470 }, { "epoch": 4.443396226415095, "grad_norm": 0.8474176526069641, "learning_rate": 1.4188679241202472e-05, "loss": 0.1355, "loss_nan_ranks": 0, "loss_rank_avg": 0.1037876084446907, "step": 8475 }, { "epoch": 4.44601677148847, "grad_norm": 0.7224940061569214, "learning_rate": 1.4163675487970796e-05, "loss": 0.1211, "loss_nan_ranks": 0, "loss_rank_avg": 0.11664362996816635, "step": 8480 }, { "epoch": 4.448637316561845, "grad_norm": 0.625144898891449, "learning_rate": 1.4138681701870626e-05, "loss": 0.1501, "loss_nan_ranks": 0, "loss_rank_avg": 0.17151805758476257, "step": 8485 }, { "epoch": 4.45125786163522, "grad_norm": 0.701992392539978, "learning_rate": 1.4113697925585745e-05, "loss": 0.1443, "loss_nan_ranks": 0, "loss_rank_avg": 0.11887902021408081, "step": 8490 }, { "epoch": 4.453878406708595, "grad_norm": 0.6782894730567932, "learning_rate": 1.408872420178282e-05, "loss": 0.1193, "loss_nan_ranks": 0, "loss_rank_avg": 0.13557225465774536, "step": 8495 }, { "epoch": 4.45649895178197, "grad_norm": 0.7478619813919067, "learning_rate": 1.4063760573111372e-05, "loss": 0.1427, "loss_nan_ranks": 0, "loss_rank_avg": 0.14224380254745483, "step": 8500 }, { "epoch": 4.459119496855346, "grad_norm": 0.8462481498718262, "learning_rate": 1.4038807082203668e-05, "loss": 0.1311, "loss_nan_ranks": 0, "loss_rank_avg": 0.13301105797290802, "step": 8505 }, { "epoch": 4.461740041928721, "grad_norm": 0.7129939794540405, "learning_rate": 1.4013863771674662e-05, "loss": 0.1282, "loss_nan_ranks": 0, "loss_rank_avg": 0.11456423997879028, "step": 8510 }, { "epoch": 4.464360587002097, "grad_norm": 0.5032153725624084, "learning_rate": 1.3988930684121935e-05, "loss": 0.1153, "loss_nan_ranks": 0, "loss_rank_avg": 0.1306523084640503, "step": 8515 }, { "epoch": 4.466981132075472, "grad_norm": 0.6775069236755371, "learning_rate": 1.3964007862125595e-05, "loss": 0.1307, "loss_nan_ranks": 0, "loss_rank_avg": 0.12584161758422852, "step": 8520 }, { "epoch": 4.469601677148847, "grad_norm": 0.6101483702659607, "learning_rate": 1.3939095348248231e-05, "loss": 0.1407, "loss_nan_ranks": 0, "loss_rank_avg": 0.1297893524169922, "step": 8525 }, { "epoch": 4.472222222222222, "grad_norm": 0.8307243585586548, "learning_rate": 1.3914193185034814e-05, "loss": 0.1525, "loss_nan_ranks": 0, "loss_rank_avg": 0.1431855857372284, "step": 8530 }, { "epoch": 4.4748427672955975, "grad_norm": 0.7188790440559387, "learning_rate": 1.3889301415012648e-05, "loss": 0.1405, "loss_nan_ranks": 0, "loss_rank_avg": 0.1144997626543045, "step": 8535 }, { "epoch": 4.477463312368973, "grad_norm": 0.6485366225242615, "learning_rate": 1.386442008069129e-05, "loss": 0.1412, "loss_nan_ranks": 0, "loss_rank_avg": 0.1250472068786621, "step": 8540 }, { "epoch": 4.480083857442348, "grad_norm": 0.7434397339820862, "learning_rate": 1.3839549224562469e-05, "loss": 0.1304, "loss_nan_ranks": 0, "loss_rank_avg": 0.13347342610359192, "step": 8545 }, { "epoch": 4.482704402515723, "grad_norm": 0.6738256812095642, "learning_rate": 1.3814688889100016e-05, "loss": 0.1245, "loss_nan_ranks": 0, "loss_rank_avg": 0.14393022656440735, "step": 8550 }, { "epoch": 4.485324947589098, "grad_norm": 0.6852513551712036, "learning_rate": 1.3789839116759812e-05, "loss": 0.1226, "loss_nan_ranks": 0, "loss_rank_avg": 0.1534811109304428, "step": 8555 }, { "epoch": 4.487945492662474, "grad_norm": 0.8099626302719116, "learning_rate": 1.3764999949979677e-05, "loss": 0.1303, "loss_nan_ranks": 0, "loss_rank_avg": 0.11964830011129379, "step": 8560 }, { "epoch": 4.490566037735849, "grad_norm": 0.7180936336517334, "learning_rate": 1.3740171431179335e-05, "loss": 0.1397, "loss_nan_ranks": 0, "loss_rank_avg": 0.1126708984375, "step": 8565 }, { "epoch": 4.493186582809225, "grad_norm": 0.6698822975158691, "learning_rate": 1.3715353602760318e-05, "loss": 0.1274, "loss_nan_ranks": 0, "loss_rank_avg": 0.12569712102413177, "step": 8570 }, { "epoch": 4.4958071278826, "grad_norm": 0.7475360035896301, "learning_rate": 1.3690546507105898e-05, "loss": 0.1267, "loss_nan_ranks": 0, "loss_rank_avg": 0.14391563832759857, "step": 8575 }, { "epoch": 4.498427672955975, "grad_norm": 0.6875959634780884, "learning_rate": 1.3665750186581035e-05, "loss": 0.134, "loss_nan_ranks": 0, "loss_rank_avg": 0.12100572884082794, "step": 8580 }, { "epoch": 4.50104821802935, "grad_norm": 0.574510931968689, "learning_rate": 1.3640964683532265e-05, "loss": 0.1378, "loss_nan_ranks": 0, "loss_rank_avg": 0.15553082525730133, "step": 8585 }, { "epoch": 4.503668763102725, "grad_norm": 0.641708254814148, "learning_rate": 1.361619004028767e-05, "loss": 0.1406, "loss_nan_ranks": 0, "loss_rank_avg": 0.14186792075634003, "step": 8590 }, { "epoch": 4.5062893081761, "grad_norm": 0.697419285774231, "learning_rate": 1.3591426299156766e-05, "loss": 0.1329, "loss_nan_ranks": 0, "loss_rank_avg": 0.11964790523052216, "step": 8595 }, { "epoch": 4.508909853249476, "grad_norm": 0.6248086094856262, "learning_rate": 1.3566673502430465e-05, "loss": 0.1265, "loss_nan_ranks": 0, "loss_rank_avg": 0.1105770617723465, "step": 8600 }, { "epoch": 4.511530398322851, "grad_norm": 0.6745949387550354, "learning_rate": 1.3541931692380992e-05, "loss": 0.1315, "loss_nan_ranks": 0, "loss_rank_avg": 0.1352331042289734, "step": 8605 }, { "epoch": 4.514150943396227, "grad_norm": 0.7878817319869995, "learning_rate": 1.3517200911261792e-05, "loss": 0.1362, "loss_nan_ranks": 0, "loss_rank_avg": 0.15772098302841187, "step": 8610 }, { "epoch": 4.516771488469602, "grad_norm": 0.7717771530151367, "learning_rate": 1.3492481201307493e-05, "loss": 0.1361, "loss_nan_ranks": 0, "loss_rank_avg": 0.15128159523010254, "step": 8615 }, { "epoch": 4.519392033542977, "grad_norm": 0.6273295283317566, "learning_rate": 1.3467772604733803e-05, "loss": 0.142, "loss_nan_ranks": 0, "loss_rank_avg": 0.17118710279464722, "step": 8620 }, { "epoch": 4.522012578616352, "grad_norm": 0.5775997638702393, "learning_rate": 1.3443075163737454e-05, "loss": 0.1148, "loss_nan_ranks": 0, "loss_rank_avg": 0.1448180377483368, "step": 8625 }, { "epoch": 4.5246331236897275, "grad_norm": 0.7417553663253784, "learning_rate": 1.3418388920496132e-05, "loss": 0.1089, "loss_nan_ranks": 0, "loss_rank_avg": 0.1005859375, "step": 8630 }, { "epoch": 4.527253668763103, "grad_norm": 1.3467016220092773, "learning_rate": 1.3393713917168398e-05, "loss": 0.1439, "loss_nan_ranks": 0, "loss_rank_avg": 0.13034547865390778, "step": 8635 }, { "epoch": 4.529874213836478, "grad_norm": 0.7181364297866821, "learning_rate": 1.336905019589361e-05, "loss": 0.1552, "loss_nan_ranks": 0, "loss_rank_avg": 0.12785106897354126, "step": 8640 }, { "epoch": 4.532494758909853, "grad_norm": 0.6840164661407471, "learning_rate": 1.3344397798791872e-05, "loss": 0.1242, "loss_nan_ranks": 0, "loss_rank_avg": 0.1290283203125, "step": 8645 }, { "epoch": 4.535115303983228, "grad_norm": 0.6312596797943115, "learning_rate": 1.3319756767963931e-05, "loss": 0.1397, "loss_nan_ranks": 0, "loss_rank_avg": 0.15783429145812988, "step": 8650 }, { "epoch": 4.537735849056604, "grad_norm": 0.5777861475944519, "learning_rate": 1.329512714549115e-05, "loss": 0.1187, "loss_nan_ranks": 0, "loss_rank_avg": 0.11967919766902924, "step": 8655 }, { "epoch": 4.540356394129979, "grad_norm": 0.715965986251831, "learning_rate": 1.327050897343538e-05, "loss": 0.1261, "loss_nan_ranks": 0, "loss_rank_avg": 0.15220537781715393, "step": 8660 }, { "epoch": 4.5429769392033545, "grad_norm": 0.6857814788818359, "learning_rate": 1.324590229383893e-05, "loss": 0.1579, "loss_nan_ranks": 0, "loss_rank_avg": 0.12025237083435059, "step": 8665 }, { "epoch": 4.54559748427673, "grad_norm": 0.6896271705627441, "learning_rate": 1.3221307148724488e-05, "loss": 0.1533, "loss_nan_ranks": 0, "loss_rank_avg": 0.1713636815547943, "step": 8670 }, { "epoch": 4.548218029350105, "grad_norm": 0.7923415899276733, "learning_rate": 1.3196723580095037e-05, "loss": 0.1269, "loss_nan_ranks": 0, "loss_rank_avg": 0.12427599728107452, "step": 8675 }, { "epoch": 4.55083857442348, "grad_norm": 0.6951085925102234, "learning_rate": 1.317215162993379e-05, "loss": 0.1268, "loss_nan_ranks": 0, "loss_rank_avg": 0.13826394081115723, "step": 8680 }, { "epoch": 4.553459119496855, "grad_norm": 0.7655824422836304, "learning_rate": 1.3147591340204118e-05, "loss": 0.129, "loss_nan_ranks": 0, "loss_rank_avg": 0.107421875, "step": 8685 }, { "epoch": 4.55607966457023, "grad_norm": 0.7229164242744446, "learning_rate": 1.312304275284948e-05, "loss": 0.1236, "loss_nan_ranks": 0, "loss_rank_avg": 0.11998310685157776, "step": 8690 }, { "epoch": 4.558700209643606, "grad_norm": 0.9815998077392578, "learning_rate": 1.3098505909793356e-05, "loss": 0.1317, "loss_nan_ranks": 0, "loss_rank_avg": 0.10400390625, "step": 8695 }, { "epoch": 4.561320754716981, "grad_norm": 0.7889739871025085, "learning_rate": 1.3073980852939148e-05, "loss": 0.1304, "loss_nan_ranks": 0, "loss_rank_avg": 0.1103515625, "step": 8700 }, { "epoch": 4.563941299790356, "grad_norm": 0.7040817737579346, "learning_rate": 1.304946762417016e-05, "loss": 0.1401, "loss_nan_ranks": 0, "loss_rank_avg": 0.13906672596931458, "step": 8705 }, { "epoch": 4.566561844863732, "grad_norm": 0.6110599040985107, "learning_rate": 1.3024966265349481e-05, "loss": 0.1582, "loss_nan_ranks": 0, "loss_rank_avg": 0.15552054345607758, "step": 8710 }, { "epoch": 4.569182389937107, "grad_norm": 0.8028501868247986, "learning_rate": 1.3000476818319928e-05, "loss": 0.1492, "loss_nan_ranks": 0, "loss_rank_avg": 0.12405084073543549, "step": 8715 }, { "epoch": 4.571802935010482, "grad_norm": 0.5874795913696289, "learning_rate": 1.2975999324903968e-05, "loss": 0.1452, "loss_nan_ranks": 0, "loss_rank_avg": 0.1393960416316986, "step": 8720 }, { "epoch": 4.5744234800838575, "grad_norm": 0.5880939364433289, "learning_rate": 1.295153382690367e-05, "loss": 0.1352, "loss_nan_ranks": 0, "loss_rank_avg": 0.11091320216655731, "step": 8725 }, { "epoch": 4.577044025157233, "grad_norm": 0.7161071300506592, "learning_rate": 1.292708036610061e-05, "loss": 0.1234, "loss_nan_ranks": 0, "loss_rank_avg": 0.12063398957252502, "step": 8730 }, { "epoch": 4.579664570230608, "grad_norm": 0.7140787243843079, "learning_rate": 1.2902638984255801e-05, "loss": 0.142, "loss_nan_ranks": 0, "loss_rank_avg": 0.09417724609375, "step": 8735 }, { "epoch": 4.582285115303983, "grad_norm": 0.6972057819366455, "learning_rate": 1.2878209723109645e-05, "loss": 0.1538, "loss_nan_ranks": 0, "loss_rank_avg": 0.1201431006193161, "step": 8740 }, { "epoch": 4.584905660377358, "grad_norm": 0.7499229907989502, "learning_rate": 1.2853792624381823e-05, "loss": 0.1475, "loss_nan_ranks": 0, "loss_rank_avg": 0.1542520821094513, "step": 8745 }, { "epoch": 4.587526205450734, "grad_norm": 0.7541307806968689, "learning_rate": 1.2829387729771262e-05, "loss": 0.1259, "loss_nan_ranks": 0, "loss_rank_avg": 0.15811032056808472, "step": 8750 }, { "epoch": 4.590146750524109, "grad_norm": 0.7339674830436707, "learning_rate": 1.2804995080956038e-05, "loss": 0.135, "loss_nan_ranks": 0, "loss_rank_avg": 0.12139548361301422, "step": 8755 }, { "epoch": 4.5927672955974845, "grad_norm": 0.6182024478912354, "learning_rate": 1.2780614719593312e-05, "loss": 0.1391, "loss_nan_ranks": 0, "loss_rank_avg": 0.16165617108345032, "step": 8760 }, { "epoch": 4.59538784067086, "grad_norm": 0.6684214472770691, "learning_rate": 1.2756246687319278e-05, "loss": 0.1475, "loss_nan_ranks": 0, "loss_rank_avg": 0.13279467821121216, "step": 8765 }, { "epoch": 4.598008385744235, "grad_norm": 0.6575222015380859, "learning_rate": 1.273189102574905e-05, "loss": 0.1388, "loss_nan_ranks": 0, "loss_rank_avg": 0.100341796875, "step": 8770 }, { "epoch": 4.60062893081761, "grad_norm": 0.6948766708374023, "learning_rate": 1.2707547776476641e-05, "loss": 0.1257, "loss_nan_ranks": 0, "loss_rank_avg": 0.08551025390625, "step": 8775 }, { "epoch": 4.603249475890985, "grad_norm": 0.7402828335762024, "learning_rate": 1.2683216981074847e-05, "loss": 0.1264, "loss_nan_ranks": 0, "loss_rank_avg": 0.071685791015625, "step": 8780 }, { "epoch": 4.60587002096436, "grad_norm": 0.6999821066856384, "learning_rate": 1.2658898681095193e-05, "loss": 0.1327, "loss_nan_ranks": 0, "loss_rank_avg": 0.10844863206148148, "step": 8785 }, { "epoch": 4.6084905660377355, "grad_norm": 0.6915682554244995, "learning_rate": 1.2634592918067889e-05, "loss": 0.1165, "loss_nan_ranks": 0, "loss_rank_avg": 0.11777526885271072, "step": 8790 }, { "epoch": 4.611111111111111, "grad_norm": 0.775738537311554, "learning_rate": 1.261029973350171e-05, "loss": 0.1202, "loss_nan_ranks": 0, "loss_rank_avg": 0.1317407190799713, "step": 8795 }, { "epoch": 4.613731656184486, "grad_norm": 0.7596877217292786, "learning_rate": 1.2586019168883965e-05, "loss": 0.1515, "loss_nan_ranks": 0, "loss_rank_avg": 0.1843775510787964, "step": 8800 }, { "epoch": 4.616352201257862, "grad_norm": 0.6865528225898743, "learning_rate": 1.2561751265680405e-05, "loss": 0.1374, "loss_nan_ranks": 0, "loss_rank_avg": 0.1653093695640564, "step": 8805 }, { "epoch": 4.618972746331237, "grad_norm": 0.6690707802772522, "learning_rate": 1.2537496065335148e-05, "loss": 0.1333, "loss_nan_ranks": 0, "loss_rank_avg": 0.15160250663757324, "step": 8810 }, { "epoch": 4.621593291404612, "grad_norm": 0.5920203924179077, "learning_rate": 1.2513253609270644e-05, "loss": 0.1368, "loss_nan_ranks": 0, "loss_rank_avg": 0.138197124004364, "step": 8815 }, { "epoch": 4.6242138364779874, "grad_norm": 0.733187198638916, "learning_rate": 1.248902393888755e-05, "loss": 0.1471, "loss_nan_ranks": 0, "loss_rank_avg": 0.1322871744632721, "step": 8820 }, { "epoch": 4.626834381551363, "grad_norm": 0.7857728600502014, "learning_rate": 1.2464807095564712e-05, "loss": 0.1362, "loss_nan_ranks": 0, "loss_rank_avg": 0.12924149632453918, "step": 8825 }, { "epoch": 4.629454926624738, "grad_norm": 0.6490778923034668, "learning_rate": 1.2440603120659058e-05, "loss": 0.1274, "loss_nan_ranks": 0, "loss_rank_avg": 0.12592902779579163, "step": 8830 }, { "epoch": 4.632075471698113, "grad_norm": 0.6917393803596497, "learning_rate": 1.2416412055505532e-05, "loss": 0.1212, "loss_nan_ranks": 0, "loss_rank_avg": 0.12949758768081665, "step": 8835 }, { "epoch": 4.634696016771488, "grad_norm": 0.7808963656425476, "learning_rate": 1.2392233941417051e-05, "loss": 0.1447, "loss_nan_ranks": 0, "loss_rank_avg": 0.1295260488986969, "step": 8840 }, { "epoch": 4.637316561844864, "grad_norm": 0.6719533801078796, "learning_rate": 1.2368068819684402e-05, "loss": 0.1206, "loss_nan_ranks": 0, "loss_rank_avg": 0.10296630859375, "step": 8845 }, { "epoch": 4.639937106918239, "grad_norm": 0.6666312217712402, "learning_rate": 1.2343916731576178e-05, "loss": 0.1284, "loss_nan_ranks": 0, "loss_rank_avg": 0.13879592716693878, "step": 8850 }, { "epoch": 4.6425576519916145, "grad_norm": 0.7610325217247009, "learning_rate": 1.231977771833873e-05, "loss": 0.1306, "loss_nan_ranks": 0, "loss_rank_avg": 0.0919189453125, "step": 8855 }, { "epoch": 4.64517819706499, "grad_norm": 0.6327428221702576, "learning_rate": 1.2295651821196061e-05, "loss": 0.1262, "loss_nan_ranks": 0, "loss_rank_avg": 0.14302411675453186, "step": 8860 }, { "epoch": 4.647798742138365, "grad_norm": 0.6388610005378723, "learning_rate": 1.22715390813498e-05, "loss": 0.1259, "loss_nan_ranks": 0, "loss_rank_avg": 0.15068340301513672, "step": 8865 }, { "epoch": 4.65041928721174, "grad_norm": 0.7151187658309937, "learning_rate": 1.2247439539979085e-05, "loss": 0.1454, "loss_nan_ranks": 0, "loss_rank_avg": 0.11161041259765625, "step": 8870 }, { "epoch": 4.653039832285115, "grad_norm": 0.8098499774932861, "learning_rate": 1.2223353238240512e-05, "loss": 0.1168, "loss_nan_ranks": 0, "loss_rank_avg": 0.098876953125, "step": 8875 }, { "epoch": 4.65566037735849, "grad_norm": 0.6818023920059204, "learning_rate": 1.2199280217268085e-05, "loss": 0.1217, "loss_nan_ranks": 0, "loss_rank_avg": 0.13234014809131622, "step": 8880 }, { "epoch": 4.6582809224318655, "grad_norm": 0.9082877039909363, "learning_rate": 1.2175220518173112e-05, "loss": 0.1437, "loss_nan_ranks": 0, "loss_rank_avg": 0.13379597663879395, "step": 8885 }, { "epoch": 4.660901467505241, "grad_norm": 0.7240698337554932, "learning_rate": 1.2151174182044159e-05, "loss": 0.1297, "loss_nan_ranks": 0, "loss_rank_avg": 0.14722000062465668, "step": 8890 }, { "epoch": 4.663522012578616, "grad_norm": 0.5974624752998352, "learning_rate": 1.2127141249946966e-05, "loss": 0.1389, "loss_nan_ranks": 0, "loss_rank_avg": 0.11771177500486374, "step": 8895 }, { "epoch": 4.666142557651992, "grad_norm": 0.7507408261299133, "learning_rate": 1.2103121762924382e-05, "loss": 0.1476, "loss_nan_ranks": 0, "loss_rank_avg": 0.155793696641922, "step": 8900 }, { "epoch": 4.668763102725367, "grad_norm": 0.6103518009185791, "learning_rate": 1.2079115761996298e-05, "loss": 0.1276, "loss_nan_ranks": 0, "loss_rank_avg": 0.14163264632225037, "step": 8905 }, { "epoch": 4.671383647798742, "grad_norm": 0.6876264810562134, "learning_rate": 1.205512328815957e-05, "loss": 0.1416, "loss_nan_ranks": 0, "loss_rank_avg": 0.14926040172576904, "step": 8910 }, { "epoch": 4.674004192872117, "grad_norm": 0.6459044218063354, "learning_rate": 1.2031144382387963e-05, "loss": 0.1443, "loss_nan_ranks": 0, "loss_rank_avg": 0.15989676117897034, "step": 8915 }, { "epoch": 4.676624737945493, "grad_norm": 0.7647832036018372, "learning_rate": 1.2007179085632055e-05, "loss": 0.1132, "loss_nan_ranks": 0, "loss_rank_avg": 0.088623046875, "step": 8920 }, { "epoch": 4.679245283018868, "grad_norm": 0.773857057094574, "learning_rate": 1.1983227438819189e-05, "loss": 0.125, "loss_nan_ranks": 0, "loss_rank_avg": 0.12303590774536133, "step": 8925 }, { "epoch": 4.681865828092243, "grad_norm": 0.6240308880805969, "learning_rate": 1.1959289482853404e-05, "loss": 0.1479, "loss_nan_ranks": 0, "loss_rank_avg": 0.14185336232185364, "step": 8930 }, { "epoch": 4.684486373165618, "grad_norm": 0.6923026442527771, "learning_rate": 1.1935365258615347e-05, "loss": 0.1281, "loss_nan_ranks": 0, "loss_rank_avg": 0.10647977888584137, "step": 8935 }, { "epoch": 4.687106918238994, "grad_norm": 0.6889780759811401, "learning_rate": 1.1911454806962231e-05, "loss": 0.1448, "loss_nan_ranks": 0, "loss_rank_avg": 0.14253661036491394, "step": 8940 }, { "epoch": 4.689727463312369, "grad_norm": 0.8454015851020813, "learning_rate": 1.1887558168727726e-05, "loss": 0.1263, "loss_nan_ranks": 0, "loss_rank_avg": 0.12030220776796341, "step": 8945 }, { "epoch": 4.6923480083857445, "grad_norm": 0.6042851209640503, "learning_rate": 1.1863675384721927e-05, "loss": 0.1425, "loss_nan_ranks": 0, "loss_rank_avg": 0.13166247308254242, "step": 8950 }, { "epoch": 4.69496855345912, "grad_norm": 0.7077848315238953, "learning_rate": 1.1839806495731265e-05, "loss": 0.1171, "loss_nan_ranks": 0, "loss_rank_avg": 0.12613347172737122, "step": 8955 }, { "epoch": 4.697589098532495, "grad_norm": 0.6560515761375427, "learning_rate": 1.1815951542518447e-05, "loss": 0.1216, "loss_nan_ranks": 0, "loss_rank_avg": 0.14886903762817383, "step": 8960 }, { "epoch": 4.70020964360587, "grad_norm": 0.834663987159729, "learning_rate": 1.1792110565822363e-05, "loss": 0.1458, "loss_nan_ranks": 0, "loss_rank_avg": 0.1405927985906601, "step": 8965 }, { "epoch": 4.702830188679245, "grad_norm": 0.6441571712493896, "learning_rate": 1.1768283606358062e-05, "loss": 0.1383, "loss_nan_ranks": 0, "loss_rank_avg": 0.17624220252037048, "step": 8970 }, { "epoch": 4.70545073375262, "grad_norm": 0.6325451135635376, "learning_rate": 1.1744470704816626e-05, "loss": 0.1326, "loss_nan_ranks": 0, "loss_rank_avg": 0.11755526810884476, "step": 8975 }, { "epoch": 4.7080712788259955, "grad_norm": 0.7124117612838745, "learning_rate": 1.1720671901865158e-05, "loss": 0.1238, "loss_nan_ranks": 0, "loss_rank_avg": 0.08987025916576385, "step": 8980 }, { "epoch": 4.710691823899371, "grad_norm": 0.7094039916992188, "learning_rate": 1.1696887238146655e-05, "loss": 0.133, "loss_nan_ranks": 0, "loss_rank_avg": 0.14215922355651855, "step": 8985 }, { "epoch": 4.713312368972746, "grad_norm": 0.6898460388183594, "learning_rate": 1.1673116754279982e-05, "loss": 0.1452, "loss_nan_ranks": 0, "loss_rank_avg": 0.15627765655517578, "step": 8990 }, { "epoch": 4.715932914046122, "grad_norm": 0.7450935244560242, "learning_rate": 1.1649360490859794e-05, "loss": 0.1368, "loss_nan_ranks": 0, "loss_rank_avg": 0.14466890692710876, "step": 8995 }, { "epoch": 4.718553459119497, "grad_norm": 0.7478159070014954, "learning_rate": 1.1625618488456452e-05, "loss": 0.1239, "loss_nan_ranks": 0, "loss_rank_avg": 0.1301785111427307, "step": 9000 }, { "epoch": 4.721174004192872, "grad_norm": 0.666782796382904, "learning_rate": 1.1601890787615962e-05, "loss": 0.1317, "loss_nan_ranks": 0, "loss_rank_avg": 0.1365184485912323, "step": 9005 }, { "epoch": 4.723794549266247, "grad_norm": 0.7336409687995911, "learning_rate": 1.1578177428859899e-05, "loss": 0.1411, "loss_nan_ranks": 0, "loss_rank_avg": 0.11938644200563431, "step": 9010 }, { "epoch": 4.726415094339623, "grad_norm": 0.6312332153320312, "learning_rate": 1.1554478452685372e-05, "loss": 0.1234, "loss_nan_ranks": 0, "loss_rank_avg": 0.13300299644470215, "step": 9015 }, { "epoch": 4.729035639412998, "grad_norm": 0.7238254547119141, "learning_rate": 1.1530793899564903e-05, "loss": 0.1191, "loss_nan_ranks": 0, "loss_rank_avg": 0.10100212693214417, "step": 9020 }, { "epoch": 4.731656184486373, "grad_norm": 0.6394257545471191, "learning_rate": 1.1507123809946385e-05, "loss": 0.1231, "loss_nan_ranks": 0, "loss_rank_avg": 0.1652788668870926, "step": 9025 }, { "epoch": 4.734276729559748, "grad_norm": 0.6746800541877747, "learning_rate": 1.1483468224253018e-05, "loss": 0.1246, "loss_nan_ranks": 0, "loss_rank_avg": 0.16844013333320618, "step": 9030 }, { "epoch": 4.736897274633124, "grad_norm": 0.6455603837966919, "learning_rate": 1.1459827182883223e-05, "loss": 0.1432, "loss_nan_ranks": 0, "loss_rank_avg": 0.12587332725524902, "step": 9035 }, { "epoch": 4.739517819706499, "grad_norm": 0.7118901014328003, "learning_rate": 1.1436200726210603e-05, "loss": 0.1363, "loss_nan_ranks": 0, "loss_rank_avg": 0.1436968594789505, "step": 9040 }, { "epoch": 4.7421383647798745, "grad_norm": 0.6509990096092224, "learning_rate": 1.1412588894583832e-05, "loss": 0.1466, "loss_nan_ranks": 0, "loss_rank_avg": 0.17771729826927185, "step": 9045 }, { "epoch": 4.74475890985325, "grad_norm": 0.71531742811203, "learning_rate": 1.1388991728326615e-05, "loss": 0.1525, "loss_nan_ranks": 0, "loss_rank_avg": 0.13924604654312134, "step": 9050 }, { "epoch": 4.747379454926625, "grad_norm": 0.7064411640167236, "learning_rate": 1.1365409267737615e-05, "loss": 0.1102, "loss_nan_ranks": 0, "loss_rank_avg": 0.07723115384578705, "step": 9055 }, { "epoch": 4.75, "grad_norm": 0.6295397877693176, "learning_rate": 1.1341841553090369e-05, "loss": 0.1221, "loss_nan_ranks": 0, "loss_rank_avg": 0.1593475043773651, "step": 9060 }, { "epoch": 4.752620545073375, "grad_norm": 0.6610961556434631, "learning_rate": 1.1318288624633258e-05, "loss": 0.1627, "loss_nan_ranks": 0, "loss_rank_avg": 0.14274698495864868, "step": 9065 }, { "epoch": 4.75524109014675, "grad_norm": 0.7972453236579895, "learning_rate": 1.129475052258938e-05, "loss": 0.1557, "loss_nan_ranks": 0, "loss_rank_avg": 0.13769108057022095, "step": 9070 }, { "epoch": 4.7578616352201255, "grad_norm": 0.657624363899231, "learning_rate": 1.1271227287156536e-05, "loss": 0.1289, "loss_nan_ranks": 0, "loss_rank_avg": 0.13159124553203583, "step": 9075 }, { "epoch": 4.760482180293501, "grad_norm": 0.7180423140525818, "learning_rate": 1.1247718958507121e-05, "loss": 0.1235, "loss_nan_ranks": 0, "loss_rank_avg": 0.13836832344532013, "step": 9080 }, { "epoch": 4.763102725366876, "grad_norm": 0.7991257309913635, "learning_rate": 1.122422557678808e-05, "loss": 0.1346, "loss_nan_ranks": 0, "loss_rank_avg": 0.1132948100566864, "step": 9085 }, { "epoch": 4.765723270440252, "grad_norm": 0.71286940574646, "learning_rate": 1.1200747182120842e-05, "loss": 0.1339, "loss_nan_ranks": 0, "loss_rank_avg": 0.15693451464176178, "step": 9090 }, { "epoch": 4.768343815513627, "grad_norm": 0.6712046265602112, "learning_rate": 1.1177283814601227e-05, "loss": 0.1249, "loss_nan_ranks": 0, "loss_rank_avg": 0.135392963886261, "step": 9095 }, { "epoch": 4.770964360587002, "grad_norm": 0.6733860969543457, "learning_rate": 1.11538355142994e-05, "loss": 0.1383, "loss_nan_ranks": 0, "loss_rank_avg": 0.13899490237236023, "step": 9100 }, { "epoch": 4.773584905660377, "grad_norm": 0.6214482188224792, "learning_rate": 1.1130402321259788e-05, "loss": 0.1392, "loss_nan_ranks": 0, "loss_rank_avg": 0.18730586767196655, "step": 9105 }, { "epoch": 4.776205450733753, "grad_norm": 0.7401185631752014, "learning_rate": 1.1106984275501014e-05, "loss": 0.1418, "loss_nan_ranks": 0, "loss_rank_avg": 0.1090477854013443, "step": 9110 }, { "epoch": 4.778825995807128, "grad_norm": 0.6611148118972778, "learning_rate": 1.1083581417015858e-05, "loss": 0.1579, "loss_nan_ranks": 0, "loss_rank_avg": 0.15509769320487976, "step": 9115 }, { "epoch": 4.781446540880503, "grad_norm": 0.7677807211875916, "learning_rate": 1.1060193785771139e-05, "loss": 0.1167, "loss_nan_ranks": 0, "loss_rank_avg": 0.09381103515625, "step": 9120 }, { "epoch": 4.784067085953878, "grad_norm": 0.8474097847938538, "learning_rate": 1.1036821421707677e-05, "loss": 0.1044, "loss_nan_ranks": 0, "loss_rank_avg": 0.1416015625, "step": 9125 }, { "epoch": 4.786687631027254, "grad_norm": 0.6468724608421326, "learning_rate": 1.1013464364740223e-05, "loss": 0.1401, "loss_nan_ranks": 0, "loss_rank_avg": 0.15126362442970276, "step": 9130 }, { "epoch": 4.789308176100629, "grad_norm": 0.6623623967170715, "learning_rate": 1.0990122654757373e-05, "loss": 0.1327, "loss_nan_ranks": 0, "loss_rank_avg": 0.14331893622875214, "step": 9135 }, { "epoch": 4.7919287211740045, "grad_norm": 0.6035583019256592, "learning_rate": 1.0966796331621546e-05, "loss": 0.1316, "loss_nan_ranks": 0, "loss_rank_avg": 0.13826671242713928, "step": 9140 }, { "epoch": 4.79454926624738, "grad_norm": 0.661192774772644, "learning_rate": 1.094348543516885e-05, "loss": 0.1335, "loss_nan_ranks": 0, "loss_rank_avg": 0.12409496307373047, "step": 9145 }, { "epoch": 4.797169811320755, "grad_norm": 0.6580976247787476, "learning_rate": 1.0920190005209066e-05, "loss": 0.1552, "loss_nan_ranks": 0, "loss_rank_avg": 0.19092583656311035, "step": 9150 }, { "epoch": 4.79979035639413, "grad_norm": 0.684177041053772, "learning_rate": 1.0896910081525554e-05, "loss": 0.138, "loss_nan_ranks": 0, "loss_rank_avg": 0.15298570692539215, "step": 9155 }, { "epoch": 4.802410901467505, "grad_norm": 0.6210129857063293, "learning_rate": 1.0873645703875186e-05, "loss": 0.1275, "loss_nan_ranks": 0, "loss_rank_avg": 0.13470567762851715, "step": 9160 }, { "epoch": 4.80503144654088, "grad_norm": 0.7088992595672607, "learning_rate": 1.0850396911988312e-05, "loss": 0.1267, "loss_nan_ranks": 0, "loss_rank_avg": 0.16002580523490906, "step": 9165 }, { "epoch": 4.8076519916142555, "grad_norm": 0.7904854416847229, "learning_rate": 1.0827163745568638e-05, "loss": 0.1169, "loss_nan_ranks": 0, "loss_rank_avg": 0.1083984375, "step": 9170 }, { "epoch": 4.810272536687631, "grad_norm": 0.7116056680679321, "learning_rate": 1.08039462442932e-05, "loss": 0.1408, "loss_nan_ranks": 0, "loss_rank_avg": 0.14277006685733795, "step": 9175 }, { "epoch": 4.812893081761006, "grad_norm": 0.7433311343193054, "learning_rate": 1.0780744447812266e-05, "loss": 0.1097, "loss_nan_ranks": 0, "loss_rank_avg": 0.10260009765625, "step": 9180 }, { "epoch": 4.815513626834382, "grad_norm": 0.9626566171646118, "learning_rate": 1.0757558395749292e-05, "loss": 0.141, "loss_nan_ranks": 0, "loss_rank_avg": 0.1836649775505066, "step": 9185 }, { "epoch": 4.818134171907757, "grad_norm": 0.7340207695960999, "learning_rate": 1.0734388127700863e-05, "loss": 0.1154, "loss_nan_ranks": 0, "loss_rank_avg": 0.12539434432983398, "step": 9190 }, { "epoch": 4.820754716981132, "grad_norm": 0.8621208667755127, "learning_rate": 1.0711233683236584e-05, "loss": 0.1297, "loss_nan_ranks": 0, "loss_rank_avg": 0.1251404583454132, "step": 9195 }, { "epoch": 4.823375262054507, "grad_norm": 0.69330894947052, "learning_rate": 1.0688095101899046e-05, "loss": 0.1113, "loss_nan_ranks": 0, "loss_rank_avg": 0.08990478515625, "step": 9200 }, { "epoch": 4.825995807127883, "grad_norm": 0.7332335710525513, "learning_rate": 1.0664972423203748e-05, "loss": 0.1268, "loss_nan_ranks": 0, "loss_rank_avg": 0.12357576936483383, "step": 9205 }, { "epoch": 4.828616352201258, "grad_norm": 0.7066367864608765, "learning_rate": 1.0641865686639025e-05, "loss": 0.1308, "loss_nan_ranks": 0, "loss_rank_avg": 0.14807681739330292, "step": 9210 }, { "epoch": 4.831236897274633, "grad_norm": 0.6947424411773682, "learning_rate": 1.0618774931666014e-05, "loss": 0.1304, "loss_nan_ranks": 0, "loss_rank_avg": 0.11925715208053589, "step": 9215 }, { "epoch": 4.833857442348008, "grad_norm": 0.7224918603897095, "learning_rate": 1.0595700197718526e-05, "loss": 0.12, "loss_nan_ranks": 0, "loss_rank_avg": 0.10990692675113678, "step": 9220 }, { "epoch": 4.836477987421384, "grad_norm": 0.6890212297439575, "learning_rate": 1.0572641524203028e-05, "loss": 0.1456, "loss_nan_ranks": 0, "loss_rank_avg": 0.15643475949764252, "step": 9225 }, { "epoch": 4.839098532494759, "grad_norm": 0.7679932117462158, "learning_rate": 1.054959895049855e-05, "loss": 0.1387, "loss_nan_ranks": 0, "loss_rank_avg": 0.1330183744430542, "step": 9230 }, { "epoch": 4.8417190775681345, "grad_norm": 0.6915727853775024, "learning_rate": 1.0526572515956635e-05, "loss": 0.1298, "loss_nan_ranks": 0, "loss_rank_avg": 0.1190517470240593, "step": 9235 }, { "epoch": 4.84433962264151, "grad_norm": 0.7194444537162781, "learning_rate": 1.0503562259901257e-05, "loss": 0.1151, "loss_nan_ranks": 0, "loss_rank_avg": 0.1297692060470581, "step": 9240 }, { "epoch": 4.846960167714885, "grad_norm": 0.7311253547668457, "learning_rate": 1.0480568221628778e-05, "loss": 0.1024, "loss_nan_ranks": 0, "loss_rank_avg": 0.10064697265625, "step": 9245 }, { "epoch": 4.84958071278826, "grad_norm": 0.7731415033340454, "learning_rate": 1.0457590440407848e-05, "loss": 0.1332, "loss_nan_ranks": 0, "loss_rank_avg": 0.13664931058883667, "step": 9250 }, { "epoch": 4.852201257861635, "grad_norm": 0.6909032464027405, "learning_rate": 1.043462895547935e-05, "loss": 0.1157, "loss_nan_ranks": 0, "loss_rank_avg": 0.13590867817401886, "step": 9255 }, { "epoch": 4.85482180293501, "grad_norm": 0.622092068195343, "learning_rate": 1.0411683806056345e-05, "loss": 0.14, "loss_nan_ranks": 0, "loss_rank_avg": 0.1474238932132721, "step": 9260 }, { "epoch": 4.8574423480083855, "grad_norm": 0.7725269198417664, "learning_rate": 1.0388755031323993e-05, "loss": 0.1103, "loss_nan_ranks": 0, "loss_rank_avg": 0.13723696768283844, "step": 9265 }, { "epoch": 4.860062893081761, "grad_norm": 0.6898708343505859, "learning_rate": 1.0365842670439502e-05, "loss": 0.1253, "loss_nan_ranks": 0, "loss_rank_avg": 0.12042510509490967, "step": 9270 }, { "epoch": 4.862683438155136, "grad_norm": 0.719258725643158, "learning_rate": 1.034294676253203e-05, "loss": 0.1155, "loss_nan_ranks": 0, "loss_rank_avg": 0.0797119140625, "step": 9275 }, { "epoch": 4.865303983228512, "grad_norm": 0.63101726770401, "learning_rate": 1.0320067346702652e-05, "loss": 0.1285, "loss_nan_ranks": 0, "loss_rank_avg": 0.1552150547504425, "step": 9280 }, { "epoch": 4.867924528301887, "grad_norm": 0.7382326722145081, "learning_rate": 1.0297204462024265e-05, "loss": 0.1382, "loss_nan_ranks": 0, "loss_rank_avg": 0.1240234375, "step": 9285 }, { "epoch": 4.870545073375262, "grad_norm": 0.7246947288513184, "learning_rate": 1.0274358147541536e-05, "loss": 0.1245, "loss_nan_ranks": 0, "loss_rank_avg": 0.10537716001272202, "step": 9290 }, { "epoch": 4.873165618448637, "grad_norm": 0.6631788611412048, "learning_rate": 1.0251528442270855e-05, "loss": 0.1251, "loss_nan_ranks": 0, "loss_rank_avg": 0.14953824877738953, "step": 9295 }, { "epoch": 4.8757861635220126, "grad_norm": 0.7744284272193909, "learning_rate": 1.0228715385200224e-05, "loss": 0.1342, "loss_nan_ranks": 0, "loss_rank_avg": 0.1300048828125, "step": 9300 }, { "epoch": 4.878406708595388, "grad_norm": 0.6991315484046936, "learning_rate": 1.0205919015289221e-05, "loss": 0.1412, "loss_nan_ranks": 0, "loss_rank_avg": 0.13060849905014038, "step": 9305 }, { "epoch": 4.881027253668763, "grad_norm": 0.6638711094856262, "learning_rate": 1.0183139371468926e-05, "loss": 0.1523, "loss_nan_ranks": 0, "loss_rank_avg": 0.16712617874145508, "step": 9310 }, { "epoch": 4.883647798742138, "grad_norm": 0.7683344483375549, "learning_rate": 1.0160376492641846e-05, "loss": 0.1387, "loss_nan_ranks": 0, "loss_rank_avg": 0.11959155648946762, "step": 9315 }, { "epoch": 4.886268343815514, "grad_norm": 0.6923864483833313, "learning_rate": 1.013763041768188e-05, "loss": 0.129, "loss_nan_ranks": 0, "loss_rank_avg": 0.15276679396629333, "step": 9320 }, { "epoch": 4.888888888888889, "grad_norm": 1.0996633768081665, "learning_rate": 1.0114901185434211e-05, "loss": 0.1312, "loss_nan_ranks": 0, "loss_rank_avg": 0.15459886193275452, "step": 9325 }, { "epoch": 4.8915094339622645, "grad_norm": 0.7718807458877563, "learning_rate": 1.0092188834715262e-05, "loss": 0.15, "loss_nan_ranks": 0, "loss_rank_avg": 0.1363377869129181, "step": 9330 }, { "epoch": 4.89412997903564, "grad_norm": 0.7277114391326904, "learning_rate": 1.0069493404312627e-05, "loss": 0.1174, "loss_nan_ranks": 0, "loss_rank_avg": 0.15835943818092346, "step": 9335 }, { "epoch": 4.896750524109015, "grad_norm": 0.802894115447998, "learning_rate": 1.0046814932984996e-05, "loss": 0.1247, "loss_nan_ranks": 0, "loss_rank_avg": 0.1007080078125, "step": 9340 }, { "epoch": 4.89937106918239, "grad_norm": 0.700408399105072, "learning_rate": 1.0024153459462119e-05, "loss": 0.1238, "loss_nan_ranks": 0, "loss_rank_avg": 0.114013671875, "step": 9345 }, { "epoch": 4.901991614255765, "grad_norm": 0.838503360748291, "learning_rate": 1.0001509022444698e-05, "loss": 0.1162, "loss_nan_ranks": 0, "loss_rank_avg": 0.12203644216060638, "step": 9350 }, { "epoch": 4.90461215932914, "grad_norm": 0.5777682662010193, "learning_rate": 9.978881660604345e-06, "loss": 0.1509, "loss_nan_ranks": 0, "loss_rank_avg": 0.15224018692970276, "step": 9355 }, { "epoch": 4.9072327044025155, "grad_norm": 0.9380687475204468, "learning_rate": 9.956271412583512e-06, "loss": 0.1245, "loss_nan_ranks": 0, "loss_rank_avg": 0.09423828125, "step": 9360 }, { "epoch": 4.909853249475891, "grad_norm": 0.70067298412323, "learning_rate": 9.933678316995414e-06, "loss": 0.1293, "loss_nan_ranks": 0, "loss_rank_avg": 0.1251220703125, "step": 9365 }, { "epoch": 4.912473794549266, "grad_norm": 0.6209483742713928, "learning_rate": 9.911102412424006e-06, "loss": 0.138, "loss_nan_ranks": 0, "loss_rank_avg": 0.14406529068946838, "step": 9370 }, { "epoch": 4.915094339622642, "grad_norm": 0.6728774309158325, "learning_rate": 9.88854373742385e-06, "loss": 0.1212, "loss_nan_ranks": 0, "loss_rank_avg": 0.1072998046875, "step": 9375 }, { "epoch": 4.917714884696017, "grad_norm": 0.6713111996650696, "learning_rate": 9.866002330520098e-06, "loss": 0.1416, "loss_nan_ranks": 0, "loss_rank_avg": 0.12103331089019775, "step": 9380 }, { "epoch": 4.920335429769392, "grad_norm": 0.6407737135887146, "learning_rate": 9.843478230208411e-06, "loss": 0.1268, "loss_nan_ranks": 0, "loss_rank_avg": 0.13770996034145355, "step": 9385 }, { "epoch": 4.922955974842767, "grad_norm": 0.7303878664970398, "learning_rate": 9.820971474954887e-06, "loss": 0.1269, "loss_nan_ranks": 0, "loss_rank_avg": 0.12143614143133163, "step": 9390 }, { "epoch": 4.9255765199161425, "grad_norm": 0.6689024567604065, "learning_rate": 9.798482103196023e-06, "loss": 0.1258, "loss_nan_ranks": 0, "loss_rank_avg": 0.12794151902198792, "step": 9395 }, { "epoch": 4.928197064989518, "grad_norm": 0.750069260597229, "learning_rate": 9.776010153338606e-06, "loss": 0.1226, "loss_nan_ranks": 0, "loss_rank_avg": 0.11221548169851303, "step": 9400 }, { "epoch": 4.930817610062893, "grad_norm": 0.7729977369308472, "learning_rate": 9.753555663759683e-06, "loss": 0.1399, "loss_nan_ranks": 0, "loss_rank_avg": 0.143976092338562, "step": 9405 }, { "epoch": 4.933438155136268, "grad_norm": 0.6980475187301636, "learning_rate": 9.731118672806476e-06, "loss": 0.1387, "loss_nan_ranks": 0, "loss_rank_avg": 0.1155320480465889, "step": 9410 }, { "epoch": 4.936058700209644, "grad_norm": 0.7056296467781067, "learning_rate": 9.70869921879632e-06, "loss": 0.1063, "loss_nan_ranks": 0, "loss_rank_avg": 0.11625757813453674, "step": 9415 }, { "epoch": 4.938679245283019, "grad_norm": 0.9438043236732483, "learning_rate": 9.686297340016624e-06, "loss": 0.1404, "loss_nan_ranks": 0, "loss_rank_avg": 0.1055908203125, "step": 9420 }, { "epoch": 4.941299790356394, "grad_norm": 0.6738733053207397, "learning_rate": 9.663913074724758e-06, "loss": 0.1327, "loss_nan_ranks": 0, "loss_rank_avg": 0.16575837135314941, "step": 9425 }, { "epoch": 4.94392033542977, "grad_norm": 0.6602095365524292, "learning_rate": 9.641546461148016e-06, "loss": 0.1319, "loss_nan_ranks": 0, "loss_rank_avg": 0.13226762413978577, "step": 9430 }, { "epoch": 4.946540880503145, "grad_norm": 0.8503509163856506, "learning_rate": 9.619197537483558e-06, "loss": 0.142, "loss_nan_ranks": 0, "loss_rank_avg": 0.10953769087791443, "step": 9435 }, { "epoch": 4.94916142557652, "grad_norm": 0.6526133418083191, "learning_rate": 9.596866341898318e-06, "loss": 0.1219, "loss_nan_ranks": 0, "loss_rank_avg": 0.11833551526069641, "step": 9440 }, { "epoch": 4.951781970649895, "grad_norm": 0.6714123487472534, "learning_rate": 9.574552912528962e-06, "loss": 0.1407, "loss_nan_ranks": 0, "loss_rank_avg": 0.14650098979473114, "step": 9445 }, { "epoch": 4.95440251572327, "grad_norm": 0.6884752511978149, "learning_rate": 9.55225728748183e-06, "loss": 0.1433, "loss_nan_ranks": 0, "loss_rank_avg": 0.11290435492992401, "step": 9450 }, { "epoch": 4.9570230607966455, "grad_norm": 0.7303773760795593, "learning_rate": 9.529979504832832e-06, "loss": 0.1117, "loss_nan_ranks": 0, "loss_rank_avg": 0.1060909554362297, "step": 9455 }, { "epoch": 4.959643605870021, "grad_norm": 0.6748950481414795, "learning_rate": 9.507719602627417e-06, "loss": 0.1331, "loss_nan_ranks": 0, "loss_rank_avg": 0.1291377991437912, "step": 9460 }, { "epoch": 4.962264150943396, "grad_norm": 0.678915798664093, "learning_rate": 9.485477618880501e-06, "loss": 0.1416, "loss_nan_ranks": 0, "loss_rank_avg": 0.14954330027103424, "step": 9465 }, { "epoch": 4.964884696016772, "grad_norm": 0.6271790266036987, "learning_rate": 9.463253591576392e-06, "loss": 0.1313, "loss_nan_ranks": 0, "loss_rank_avg": 0.15000781416893005, "step": 9470 }, { "epoch": 4.967505241090147, "grad_norm": 0.6814647912979126, "learning_rate": 9.441047558668746e-06, "loss": 0.1375, "loss_nan_ranks": 0, "loss_rank_avg": 0.14967915415763855, "step": 9475 }, { "epoch": 4.970125786163522, "grad_norm": 0.6391023397445679, "learning_rate": 9.418859558080478e-06, "loss": 0.1309, "loss_nan_ranks": 0, "loss_rank_avg": 0.16454076766967773, "step": 9480 }, { "epoch": 4.972746331236897, "grad_norm": 0.6564544439315796, "learning_rate": 9.396689627703706e-06, "loss": 0.1416, "loss_nan_ranks": 0, "loss_rank_avg": 0.14007678627967834, "step": 9485 }, { "epoch": 4.9753668763102725, "grad_norm": 0.6617010235786438, "learning_rate": 9.374537805399695e-06, "loss": 0.1338, "loss_nan_ranks": 0, "loss_rank_avg": 0.17006102204322815, "step": 9490 }, { "epoch": 4.977987421383648, "grad_norm": 0.7234999537467957, "learning_rate": 9.352404128998774e-06, "loss": 0.1515, "loss_nan_ranks": 0, "loss_rank_avg": 0.11756335198879242, "step": 9495 }, { "epoch": 4.980607966457023, "grad_norm": 0.6654846668243408, "learning_rate": 9.330288636300306e-06, "loss": 0.1231, "loss_nan_ranks": 0, "loss_rank_avg": 0.08251953125, "step": 9500 }, { "epoch": 4.983228511530398, "grad_norm": 0.7360222935676575, "learning_rate": 9.308191365072578e-06, "loss": 0.1359, "loss_nan_ranks": 0, "loss_rank_avg": 0.1528562754392624, "step": 9505 }, { "epoch": 4.985849056603773, "grad_norm": 0.8071749806404114, "learning_rate": 9.28611235305277e-06, "loss": 0.1197, "loss_nan_ranks": 0, "loss_rank_avg": 0.09523770958185196, "step": 9510 }, { "epoch": 4.988469601677149, "grad_norm": 0.8710860013961792, "learning_rate": 9.26405163794687e-06, "loss": 0.1439, "loss_nan_ranks": 0, "loss_rank_avg": 0.13526538014411926, "step": 9515 }, { "epoch": 4.991090146750524, "grad_norm": 0.7235217094421387, "learning_rate": 9.24200925742962e-06, "loss": 0.1351, "loss_nan_ranks": 0, "loss_rank_avg": 0.1304163634777069, "step": 9520 }, { "epoch": 4.9937106918239, "grad_norm": 0.6458355784416199, "learning_rate": 9.219985249144472e-06, "loss": 0.1345, "loss_nan_ranks": 0, "loss_rank_avg": 0.16256138682365417, "step": 9525 }, { "epoch": 4.996331236897275, "grad_norm": 0.7006630897521973, "learning_rate": 9.197979650703476e-06, "loss": 0.135, "loss_nan_ranks": 0, "loss_rank_avg": 0.11299961060285568, "step": 9530 }, { "epoch": 4.99895178197065, "grad_norm": 0.7289224863052368, "learning_rate": 9.175992499687254e-06, "loss": 0.123, "loss_nan_ranks": 0, "loss_rank_avg": 0.12888827919960022, "step": 9535 }, { "epoch": 5.0020964360587, "grad_norm": 0.6883781552314758, "learning_rate": 9.154023833644923e-06, "loss": 0.1171, "loss_nan_ranks": 0, "loss_rank_avg": 0.13848619163036346, "step": 9540 }, { "epoch": 5.004716981132075, "grad_norm": 0.6228224635124207, "learning_rate": 9.132073690094018e-06, "loss": 0.1454, "loss_nan_ranks": 0, "loss_rank_avg": 0.14985814690589905, "step": 9545 }, { "epoch": 5.0073375262054505, "grad_norm": 0.7366200089454651, "learning_rate": 9.110142106520474e-06, "loss": 0.121, "loss_nan_ranks": 0, "loss_rank_avg": 0.10620991885662079, "step": 9550 }, { "epoch": 5.009958071278826, "grad_norm": 0.7490249276161194, "learning_rate": 9.088229120378503e-06, "loss": 0.1206, "loss_nan_ranks": 0, "loss_rank_avg": 0.13648374378681183, "step": 9555 }, { "epoch": 5.012578616352202, "grad_norm": 0.6735389232635498, "learning_rate": 9.066334769090559e-06, "loss": 0.1307, "loss_nan_ranks": 0, "loss_rank_avg": 0.19621829688549042, "step": 9560 }, { "epoch": 5.015199161425577, "grad_norm": 0.7313094735145569, "learning_rate": 9.044459090047284e-06, "loss": 0.1182, "loss_nan_ranks": 0, "loss_rank_avg": 0.14157983660697937, "step": 9565 }, { "epoch": 5.017819706498952, "grad_norm": 0.8101787567138672, "learning_rate": 9.022602120607411e-06, "loss": 0.119, "loss_nan_ranks": 0, "loss_rank_avg": 0.11646367609500885, "step": 9570 }, { "epoch": 5.020440251572327, "grad_norm": 0.6893128156661987, "learning_rate": 9.000763898097756e-06, "loss": 0.1123, "loss_nan_ranks": 0, "loss_rank_avg": 0.13469628989696503, "step": 9575 }, { "epoch": 5.023060796645702, "grad_norm": 0.8310616612434387, "learning_rate": 8.978944459813084e-06, "loss": 0.1142, "loss_nan_ranks": 0, "loss_rank_avg": 0.11655493825674057, "step": 9580 }, { "epoch": 5.0256813417190775, "grad_norm": 0.7457146644592285, "learning_rate": 8.9571438430161e-06, "loss": 0.1122, "loss_nan_ranks": 0, "loss_rank_avg": 0.12168244272470474, "step": 9585 }, { "epoch": 5.028301886792453, "grad_norm": 0.7926444411277771, "learning_rate": 8.93536208493736e-06, "loss": 0.1226, "loss_nan_ranks": 0, "loss_rank_avg": 0.12764409184455872, "step": 9590 }, { "epoch": 5.030922431865828, "grad_norm": 0.6953998804092407, "learning_rate": 8.91359922277521e-06, "loss": 0.1281, "loss_nan_ranks": 0, "loss_rank_avg": 0.08046749979257584, "step": 9595 }, { "epoch": 5.033542976939203, "grad_norm": 0.6795186400413513, "learning_rate": 8.891855293695741e-06, "loss": 0.1202, "loss_nan_ranks": 0, "loss_rank_avg": 0.1052897572517395, "step": 9600 }, { "epoch": 5.036163522012578, "grad_norm": 0.7303555607795715, "learning_rate": 8.870130334832695e-06, "loss": 0.1153, "loss_nan_ranks": 0, "loss_rank_avg": 0.08300430327653885, "step": 9605 }, { "epoch": 5.038784067085954, "grad_norm": 0.851283073425293, "learning_rate": 8.848424383287427e-06, "loss": 0.1156, "loss_nan_ranks": 0, "loss_rank_avg": 0.074462890625, "step": 9610 }, { "epoch": 5.0414046121593294, "grad_norm": 0.6680898666381836, "learning_rate": 8.826737476128822e-06, "loss": 0.123, "loss_nan_ranks": 0, "loss_rank_avg": 0.1081383228302002, "step": 9615 }, { "epoch": 5.044025157232705, "grad_norm": 0.7580778002738953, "learning_rate": 8.805069650393239e-06, "loss": 0.1132, "loss_nan_ranks": 0, "loss_rank_avg": 0.1034734845161438, "step": 9620 }, { "epoch": 5.04664570230608, "grad_norm": 0.7712807059288025, "learning_rate": 8.783420943084477e-06, "loss": 0.0895, "loss_nan_ranks": 0, "loss_rank_avg": 0.07745361328125, "step": 9625 }, { "epoch": 5.049266247379455, "grad_norm": 0.750026524066925, "learning_rate": 8.761791391173656e-06, "loss": 0.103, "loss_nan_ranks": 0, "loss_rank_avg": 0.11945658922195435, "step": 9630 }, { "epoch": 5.05188679245283, "grad_norm": 0.6891661882400513, "learning_rate": 8.740181031599194e-06, "loss": 0.1311, "loss_nan_ranks": 0, "loss_rank_avg": 0.12674002349376678, "step": 9635 }, { "epoch": 5.054507337526205, "grad_norm": 0.7564114332199097, "learning_rate": 8.71858990126673e-06, "loss": 0.1147, "loss_nan_ranks": 0, "loss_rank_avg": 0.14253893494606018, "step": 9640 }, { "epoch": 5.0571278825995805, "grad_norm": 0.8268321752548218, "learning_rate": 8.697018037049061e-06, "loss": 0.1068, "loss_nan_ranks": 0, "loss_rank_avg": 0.07177734375, "step": 9645 }, { "epoch": 5.059748427672956, "grad_norm": 0.6657952070236206, "learning_rate": 8.6754654757861e-06, "loss": 0.1259, "loss_nan_ranks": 0, "loss_rank_avg": 0.07985062152147293, "step": 9650 }, { "epoch": 5.062368972746331, "grad_norm": 0.7740989923477173, "learning_rate": 8.653932254284772e-06, "loss": 0.1137, "loss_nan_ranks": 0, "loss_rank_avg": 0.09762389957904816, "step": 9655 }, { "epoch": 5.064989517819707, "grad_norm": 0.6569227576255798, "learning_rate": 8.632418409318985e-06, "loss": 0.1091, "loss_nan_ranks": 0, "loss_rank_avg": 0.11397574096918106, "step": 9660 }, { "epoch": 5.067610062893082, "grad_norm": 0.7708141803741455, "learning_rate": 8.610923977629555e-06, "loss": 0.1188, "loss_nan_ranks": 0, "loss_rank_avg": 0.082061767578125, "step": 9665 }, { "epoch": 5.070230607966457, "grad_norm": 0.6552038788795471, "learning_rate": 8.589448995924144e-06, "loss": 0.1144, "loss_nan_ranks": 0, "loss_rank_avg": 0.08863822370767593, "step": 9670 }, { "epoch": 5.072851153039832, "grad_norm": 0.7292214035987854, "learning_rate": 8.567993500877188e-06, "loss": 0.1182, "loss_nan_ranks": 0, "loss_rank_avg": 0.11741748452186584, "step": 9675 }, { "epoch": 5.0754716981132075, "grad_norm": 0.7536949515342712, "learning_rate": 8.54655752912987e-06, "loss": 0.0983, "loss_nan_ranks": 0, "loss_rank_avg": 0.08876580744981766, "step": 9680 }, { "epoch": 5.078092243186583, "grad_norm": 0.7254672050476074, "learning_rate": 8.52514111729001e-06, "loss": 0.1312, "loss_nan_ranks": 0, "loss_rank_avg": 0.1279667466878891, "step": 9685 }, { "epoch": 5.080712788259958, "grad_norm": 0.6217379570007324, "learning_rate": 8.503744301932026e-06, "loss": 0.1263, "loss_nan_ranks": 0, "loss_rank_avg": 0.15234941244125366, "step": 9690 }, { "epoch": 5.083333333333333, "grad_norm": 0.6784071922302246, "learning_rate": 8.482367119596876e-06, "loss": 0.1214, "loss_nan_ranks": 0, "loss_rank_avg": 0.1081690639257431, "step": 9695 }, { "epoch": 5.085953878406708, "grad_norm": 0.7554691433906555, "learning_rate": 8.46100960679198e-06, "loss": 0.122, "loss_nan_ranks": 0, "loss_rank_avg": 0.0892043262720108, "step": 9700 }, { "epoch": 5.088574423480084, "grad_norm": 0.7500377297401428, "learning_rate": 8.439671799991184e-06, "loss": 0.1165, "loss_nan_ranks": 0, "loss_rank_avg": 0.1136789470911026, "step": 9705 }, { "epoch": 5.091194968553459, "grad_norm": 0.645708441734314, "learning_rate": 8.418353735634666e-06, "loss": 0.1256, "loss_nan_ranks": 0, "loss_rank_avg": 0.13962142169475555, "step": 9710 }, { "epoch": 5.093815513626835, "grad_norm": 0.6366642117500305, "learning_rate": 8.39705545012889e-06, "loss": 0.1193, "loss_nan_ranks": 0, "loss_rank_avg": 0.10192590206861496, "step": 9715 }, { "epoch": 5.09643605870021, "grad_norm": 0.6908004879951477, "learning_rate": 8.375776979846546e-06, "loss": 0.1141, "loss_nan_ranks": 0, "loss_rank_avg": 0.13654020428657532, "step": 9720 }, { "epoch": 5.099056603773585, "grad_norm": 0.7502008080482483, "learning_rate": 8.354518361126475e-06, "loss": 0.107, "loss_nan_ranks": 0, "loss_rank_avg": 0.09822264313697815, "step": 9725 }, { "epoch": 5.10167714884696, "grad_norm": 0.7582405805587769, "learning_rate": 8.333279630273636e-06, "loss": 0.1012, "loss_nan_ranks": 0, "loss_rank_avg": 0.0853373259305954, "step": 9730 }, { "epoch": 5.104297693920335, "grad_norm": 0.7920611500740051, "learning_rate": 8.312060823559006e-06, "loss": 0.1254, "loss_nan_ranks": 0, "loss_rank_avg": 0.13328997790813446, "step": 9735 }, { "epoch": 5.1069182389937104, "grad_norm": 0.8097739219665527, "learning_rate": 8.290861977219542e-06, "loss": 0.1136, "loss_nan_ranks": 0, "loss_rank_avg": 0.0924072265625, "step": 9740 }, { "epoch": 5.109538784067086, "grad_norm": 0.7220619916915894, "learning_rate": 8.26968312745811e-06, "loss": 0.1171, "loss_nan_ranks": 0, "loss_rank_avg": 0.12379848957061768, "step": 9745 }, { "epoch": 5.112159329140461, "grad_norm": 0.9506103992462158, "learning_rate": 8.248524310443424e-06, "loss": 0.1119, "loss_nan_ranks": 0, "loss_rank_avg": 0.10959365218877792, "step": 9750 }, { "epoch": 5.114779874213837, "grad_norm": 0.6837508678436279, "learning_rate": 8.227385562310004e-06, "loss": 0.1236, "loss_nan_ranks": 0, "loss_rank_avg": 0.1349945366382599, "step": 9755 }, { "epoch": 5.117400419287212, "grad_norm": 0.9021229147911072, "learning_rate": 8.206266919158079e-06, "loss": 0.0918, "loss_nan_ranks": 0, "loss_rank_avg": 0.0799560546875, "step": 9760 }, { "epoch": 5.120020964360587, "grad_norm": 0.738277018070221, "learning_rate": 8.185168417053548e-06, "loss": 0.1285, "loss_nan_ranks": 0, "loss_rank_avg": 0.1264088898897171, "step": 9765 }, { "epoch": 5.122641509433962, "grad_norm": 0.633439838886261, "learning_rate": 8.164090092027914e-06, "loss": 0.1013, "loss_nan_ranks": 0, "loss_rank_avg": 0.10584621131420135, "step": 9770 }, { "epoch": 5.1252620545073375, "grad_norm": 0.5706722736358643, "learning_rate": 8.143031980078213e-06, "loss": 0.1156, "loss_nan_ranks": 0, "loss_rank_avg": 0.10877234488725662, "step": 9775 }, { "epoch": 5.127882599580713, "grad_norm": 0.7367931604385376, "learning_rate": 8.12199411716699e-06, "loss": 0.121, "loss_nan_ranks": 0, "loss_rank_avg": 0.10262522101402283, "step": 9780 }, { "epoch": 5.130503144654088, "grad_norm": 0.6430312991142273, "learning_rate": 8.100976539222179e-06, "loss": 0.1093, "loss_nan_ranks": 0, "loss_rank_avg": 0.1129990965127945, "step": 9785 }, { "epoch": 5.133123689727463, "grad_norm": 0.8123562335968018, "learning_rate": 8.079979282137083e-06, "loss": 0.1176, "loss_nan_ranks": 0, "loss_rank_avg": 0.08623982965946198, "step": 9790 }, { "epoch": 5.135744234800838, "grad_norm": 0.8307282328605652, "learning_rate": 8.059002381770303e-06, "loss": 0.1304, "loss_nan_ranks": 0, "loss_rank_avg": 0.11143720149993896, "step": 9795 }, { "epoch": 5.138364779874214, "grad_norm": 0.8005284070968628, "learning_rate": 8.038045873945664e-06, "loss": 0.114, "loss_nan_ranks": 0, "loss_rank_avg": 0.10873573273420334, "step": 9800 }, { "epoch": 5.140985324947589, "grad_norm": 0.7001315355300903, "learning_rate": 8.017109794452194e-06, "loss": 0.1005, "loss_nan_ranks": 0, "loss_rank_avg": 0.1235947459936142, "step": 9805 }, { "epoch": 5.143605870020965, "grad_norm": 0.7119617462158203, "learning_rate": 7.996194179044003e-06, "loss": 0.1262, "loss_nan_ranks": 0, "loss_rank_avg": 0.13383308053016663, "step": 9810 }, { "epoch": 5.14622641509434, "grad_norm": 0.678630530834198, "learning_rate": 7.975299063440268e-06, "loss": 0.1119, "loss_nan_ranks": 0, "loss_rank_avg": 0.13731247186660767, "step": 9815 }, { "epoch": 5.148846960167715, "grad_norm": 0.8533686995506287, "learning_rate": 7.95442448332515e-06, "loss": 0.1258, "loss_nan_ranks": 0, "loss_rank_avg": 0.1360068917274475, "step": 9820 }, { "epoch": 5.15146750524109, "grad_norm": 0.8401104211807251, "learning_rate": 7.933570474347738e-06, "loss": 0.1267, "loss_nan_ranks": 0, "loss_rank_avg": 0.08184814453125, "step": 9825 }, { "epoch": 5.154088050314465, "grad_norm": 0.8351700305938721, "learning_rate": 7.912737072122012e-06, "loss": 0.133, "loss_nan_ranks": 0, "loss_rank_avg": 0.12352661788463593, "step": 9830 }, { "epoch": 5.15670859538784, "grad_norm": 0.8089619874954224, "learning_rate": 7.891924312226738e-06, "loss": 0.1134, "loss_nan_ranks": 0, "loss_rank_avg": 0.12224520742893219, "step": 9835 }, { "epoch": 5.159329140461216, "grad_norm": 0.7358208298683167, "learning_rate": 7.87113223020543e-06, "loss": 0.1097, "loss_nan_ranks": 0, "loss_rank_avg": 0.09696923196315765, "step": 9840 }, { "epoch": 5.161949685534591, "grad_norm": 0.6873857378959656, "learning_rate": 7.8503608615663e-06, "loss": 0.1111, "loss_nan_ranks": 0, "loss_rank_avg": 0.10394269227981567, "step": 9845 }, { "epoch": 5.164570230607967, "grad_norm": 0.7390537858009338, "learning_rate": 7.829610241782171e-06, "loss": 0.1224, "loss_nan_ranks": 0, "loss_rank_avg": 0.1274482011795044, "step": 9850 }, { "epoch": 5.167190775681342, "grad_norm": 0.6764417290687561, "learning_rate": 7.808880406290455e-06, "loss": 0.126, "loss_nan_ranks": 0, "loss_rank_avg": 0.11439989507198334, "step": 9855 }, { "epoch": 5.169811320754717, "grad_norm": 0.8547785878181458, "learning_rate": 7.78817139049305e-06, "loss": 0.1115, "loss_nan_ranks": 0, "loss_rank_avg": 0.10105232149362564, "step": 9860 }, { "epoch": 5.172431865828092, "grad_norm": 0.7551053762435913, "learning_rate": 7.767483229756303e-06, "loss": 0.0992, "loss_nan_ranks": 0, "loss_rank_avg": 0.13016045093536377, "step": 9865 }, { "epoch": 5.1750524109014675, "grad_norm": 0.7922313213348389, "learning_rate": 7.746815959410947e-06, "loss": 0.1517, "loss_nan_ranks": 0, "loss_rank_avg": 0.1217041015625, "step": 9870 }, { "epoch": 5.177672955974843, "grad_norm": 0.7648522257804871, "learning_rate": 7.726169614752036e-06, "loss": 0.1115, "loss_nan_ranks": 0, "loss_rank_avg": 0.09638641774654388, "step": 9875 }, { "epoch": 5.180293501048218, "grad_norm": 0.6685067415237427, "learning_rate": 7.705544231038887e-06, "loss": 0.1164, "loss_nan_ranks": 0, "loss_rank_avg": 0.10683226585388184, "step": 9880 }, { "epoch": 5.182914046121593, "grad_norm": 0.7491695284843445, "learning_rate": 7.684939843495035e-06, "loss": 0.1084, "loss_nan_ranks": 0, "loss_rank_avg": 0.10484476387500763, "step": 9885 }, { "epoch": 5.185534591194968, "grad_norm": 0.7718347907066345, "learning_rate": 7.664356487308136e-06, "loss": 0.1181, "loss_nan_ranks": 0, "loss_rank_avg": 0.0891261026263237, "step": 9890 }, { "epoch": 5.188155136268344, "grad_norm": 0.7778302431106567, "learning_rate": 7.643794197629946e-06, "loss": 0.1072, "loss_nan_ranks": 0, "loss_rank_avg": 0.10242325812578201, "step": 9895 }, { "epoch": 5.190775681341719, "grad_norm": 0.7481498122215271, "learning_rate": 7.623253009576233e-06, "loss": 0.1067, "loss_nan_ranks": 0, "loss_rank_avg": 0.10735617578029633, "step": 9900 }, { "epoch": 5.193396226415095, "grad_norm": 0.6880243420600891, "learning_rate": 7.6027329582267266e-06, "loss": 0.1129, "loss_nan_ranks": 0, "loss_rank_avg": 0.093168243765831, "step": 9905 }, { "epoch": 5.19601677148847, "grad_norm": 0.7669791579246521, "learning_rate": 7.582234078625082e-06, "loss": 0.1221, "loss_nan_ranks": 0, "loss_rank_avg": 0.13155482709407806, "step": 9910 }, { "epoch": 5.198637316561845, "grad_norm": 0.7284143567085266, "learning_rate": 7.561756405778773e-06, "loss": 0.1393, "loss_nan_ranks": 0, "loss_rank_avg": 0.16921260952949524, "step": 9915 }, { "epoch": 5.20125786163522, "grad_norm": 0.7484710812568665, "learning_rate": 7.541299974659066e-06, "loss": 0.1146, "loss_nan_ranks": 0, "loss_rank_avg": 0.09271240234375, "step": 9920 }, { "epoch": 5.203878406708595, "grad_norm": 0.7038311958312988, "learning_rate": 7.520864820200953e-06, "loss": 0.1162, "loss_nan_ranks": 0, "loss_rank_avg": 0.10301141440868378, "step": 9925 }, { "epoch": 5.20649895178197, "grad_norm": 0.7804470658302307, "learning_rate": 7.50045097730308e-06, "loss": 0.1243, "loss_nan_ranks": 0, "loss_rank_avg": 0.12956282496452332, "step": 9930 }, { "epoch": 5.209119496855346, "grad_norm": 0.7377161979675293, "learning_rate": 7.480058480827719e-06, "loss": 0.127, "loss_nan_ranks": 0, "loss_rank_avg": 0.11771019548177719, "step": 9935 }, { "epoch": 5.211740041928721, "grad_norm": 0.6337024569511414, "learning_rate": 7.45968736560067e-06, "loss": 0.1126, "loss_nan_ranks": 0, "loss_rank_avg": 0.1121087297797203, "step": 9940 }, { "epoch": 5.214360587002097, "grad_norm": 0.6332011818885803, "learning_rate": 7.439337666411219e-06, "loss": 0.1256, "loss_nan_ranks": 0, "loss_rank_avg": 0.08941650390625, "step": 9945 }, { "epoch": 5.216981132075472, "grad_norm": 0.9817304611206055, "learning_rate": 7.419009418012084e-06, "loss": 0.1091, "loss_nan_ranks": 0, "loss_rank_avg": 0.09293422847986221, "step": 9950 }, { "epoch": 5.219601677148847, "grad_norm": 0.6849780082702637, "learning_rate": 7.398702655119341e-06, "loss": 0.1176, "loss_nan_ranks": 0, "loss_rank_avg": 0.15425044298171997, "step": 9955 }, { "epoch": 5.222222222222222, "grad_norm": 0.7723156809806824, "learning_rate": 7.378417412412393e-06, "loss": 0.1191, "loss_nan_ranks": 0, "loss_rank_avg": 0.08868657052516937, "step": 9960 }, { "epoch": 5.2248427672955975, "grad_norm": 0.645289957523346, "learning_rate": 7.358153724533874e-06, "loss": 0.1124, "loss_nan_ranks": 0, "loss_rank_avg": 0.12060213088989258, "step": 9965 }, { "epoch": 5.227463312368973, "grad_norm": 0.7643070220947266, "learning_rate": 7.337911626089611e-06, "loss": 0.1226, "loss_nan_ranks": 0, "loss_rank_avg": 0.10711387544870377, "step": 9970 }, { "epoch": 5.230083857442348, "grad_norm": 0.6850946545600891, "learning_rate": 7.3176911516485605e-06, "loss": 0.1297, "loss_nan_ranks": 0, "loss_rank_avg": 0.12889599800109863, "step": 9975 }, { "epoch": 5.232704402515723, "grad_norm": 0.6873239874839783, "learning_rate": 7.297492335742746e-06, "loss": 0.1221, "loss_nan_ranks": 0, "loss_rank_avg": 0.11314275115728378, "step": 9980 }, { "epoch": 5.235324947589098, "grad_norm": 0.9334907531738281, "learning_rate": 7.277315212867224e-06, "loss": 0.109, "loss_nan_ranks": 0, "loss_rank_avg": 0.1055908203125, "step": 9985 }, { "epoch": 5.237945492662474, "grad_norm": 0.7646591067314148, "learning_rate": 7.25715981747998e-06, "loss": 0.1135, "loss_nan_ranks": 0, "loss_rank_avg": 0.11272291839122772, "step": 9990 }, { "epoch": 5.240566037735849, "grad_norm": 0.6887595057487488, "learning_rate": 7.2370261840019e-06, "loss": 0.1328, "loss_nan_ranks": 0, "loss_rank_avg": 0.1418076455593109, "step": 9995 }, { "epoch": 5.243186582809225, "grad_norm": 0.7327706813812256, "learning_rate": 7.216914346816715e-06, "loss": 0.1287, "loss_nan_ranks": 0, "loss_rank_avg": 0.08446444571018219, "step": 10000 }, { "epoch": 5.2458071278826, "grad_norm": 0.7251383662223816, "learning_rate": 7.196824340270916e-06, "loss": 0.1055, "loss_nan_ranks": 0, "loss_rank_avg": 0.09525515139102936, "step": 10005 }, { "epoch": 5.248427672955975, "grad_norm": 0.8185909390449524, "learning_rate": 7.176756198673734e-06, "loss": 0.1123, "loss_nan_ranks": 0, "loss_rank_avg": 0.14663901925086975, "step": 10010 }, { "epoch": 5.25104821802935, "grad_norm": 0.7505481839179993, "learning_rate": 7.156709956297041e-06, "loss": 0.1298, "loss_nan_ranks": 0, "loss_rank_avg": 0.11667823791503906, "step": 10015 }, { "epoch": 5.253668763102725, "grad_norm": 0.8053024411201477, "learning_rate": 7.136685647375321e-06, "loss": 0.1149, "loss_nan_ranks": 0, "loss_rank_avg": 0.11640357971191406, "step": 10020 }, { "epoch": 5.2562893081761, "grad_norm": 0.8202130198478699, "learning_rate": 7.116683306105592e-06, "loss": 0.107, "loss_nan_ranks": 0, "loss_rank_avg": 0.08791359513998032, "step": 10025 }, { "epoch": 5.258909853249476, "grad_norm": 0.6813123226165771, "learning_rate": 7.096702966647358e-06, "loss": 0.1226, "loss_nan_ranks": 0, "loss_rank_avg": 0.1360882818698883, "step": 10030 }, { "epoch": 5.261530398322851, "grad_norm": 0.7559633255004883, "learning_rate": 7.076744663122561e-06, "loss": 0.1186, "loss_nan_ranks": 0, "loss_rank_avg": 0.09497682005167007, "step": 10035 }, { "epoch": 5.264150943396227, "grad_norm": 0.6545113921165466, "learning_rate": 7.0568084296154955e-06, "loss": 0.1066, "loss_nan_ranks": 0, "loss_rank_avg": 0.1339862048625946, "step": 10040 }, { "epoch": 5.266771488469602, "grad_norm": 0.7296022772789001, "learning_rate": 7.036894300172774e-06, "loss": 0.1054, "loss_nan_ranks": 0, "loss_rank_avg": 0.099609375, "step": 10045 }, { "epoch": 5.269392033542977, "grad_norm": 0.7355965971946716, "learning_rate": 7.0170023088032534e-06, "loss": 0.1334, "loss_nan_ranks": 0, "loss_rank_avg": 0.1306404173374176, "step": 10050 }, { "epoch": 5.272012578616352, "grad_norm": 0.771010160446167, "learning_rate": 6.997132489477981e-06, "loss": 0.1078, "loss_nan_ranks": 0, "loss_rank_avg": 0.08247143030166626, "step": 10055 }, { "epoch": 5.2746331236897275, "grad_norm": 0.7686759829521179, "learning_rate": 6.977284876130162e-06, "loss": 0.1238, "loss_nan_ranks": 0, "loss_rank_avg": 0.11266212165355682, "step": 10060 }, { "epoch": 5.277253668763103, "grad_norm": 0.7104469537734985, "learning_rate": 6.957459502655053e-06, "loss": 0.1079, "loss_nan_ranks": 0, "loss_rank_avg": 0.1234317421913147, "step": 10065 }, { "epoch": 5.279874213836478, "grad_norm": 0.7212462425231934, "learning_rate": 6.937656402909938e-06, "loss": 0.1101, "loss_nan_ranks": 0, "loss_rank_avg": 0.1201360747218132, "step": 10070 }, { "epoch": 5.282494758909853, "grad_norm": 0.6866129636764526, "learning_rate": 6.917875610714069e-06, "loss": 0.1351, "loss_nan_ranks": 0, "loss_rank_avg": 0.20779114961624146, "step": 10075 }, { "epoch": 5.285115303983228, "grad_norm": 0.8186706304550171, "learning_rate": 6.898117159848594e-06, "loss": 0.1092, "loss_nan_ranks": 0, "loss_rank_avg": 0.07537841796875, "step": 10080 }, { "epoch": 5.287735849056604, "grad_norm": 0.835132896900177, "learning_rate": 6.878381084056503e-06, "loss": 0.1245, "loss_nan_ranks": 0, "loss_rank_avg": 0.12258610129356384, "step": 10085 }, { "epoch": 5.290356394129979, "grad_norm": 0.7027671337127686, "learning_rate": 6.858667417042593e-06, "loss": 0.1188, "loss_nan_ranks": 0, "loss_rank_avg": 0.13058139383792877, "step": 10090 }, { "epoch": 5.2929769392033545, "grad_norm": 0.7041785717010498, "learning_rate": 6.838976192473372e-06, "loss": 0.1051, "loss_nan_ranks": 0, "loss_rank_avg": 0.1019287109375, "step": 10095 }, { "epoch": 5.29559748427673, "grad_norm": 0.7192649841308594, "learning_rate": 6.819307443977035e-06, "loss": 0.0987, "loss_nan_ranks": 0, "loss_rank_avg": 0.09088463336229324, "step": 10100 }, { "epoch": 5.298218029350105, "grad_norm": 0.6497074365615845, "learning_rate": 6.799661205143382e-06, "loss": 0.13, "loss_nan_ranks": 0, "loss_rank_avg": 0.17226599156856537, "step": 10105 }, { "epoch": 5.30083857442348, "grad_norm": 0.6994401812553406, "learning_rate": 6.780037509523771e-06, "loss": 0.1304, "loss_nan_ranks": 0, "loss_rank_avg": 0.10001536458730698, "step": 10110 }, { "epoch": 5.303459119496855, "grad_norm": 0.7387732863426208, "learning_rate": 6.7604363906310825e-06, "loss": 0.1224, "loss_nan_ranks": 0, "loss_rank_avg": 0.18090754747390747, "step": 10115 }, { "epoch": 5.30607966457023, "grad_norm": 0.7848963737487793, "learning_rate": 6.7408578819396155e-06, "loss": 0.121, "loss_nan_ranks": 0, "loss_rank_avg": 0.11017556488513947, "step": 10120 }, { "epoch": 5.308700209643606, "grad_norm": 0.7292885184288025, "learning_rate": 6.721302016885067e-06, "loss": 0.11, "loss_nan_ranks": 0, "loss_rank_avg": 0.09594656527042389, "step": 10125 }, { "epoch": 5.311320754716981, "grad_norm": 0.6747096180915833, "learning_rate": 6.701768828864466e-06, "loss": 0.1258, "loss_nan_ranks": 0, "loss_rank_avg": 0.12736985087394714, "step": 10130 }, { "epoch": 5.313941299790357, "grad_norm": 0.817677915096283, "learning_rate": 6.6822583512360975e-06, "loss": 0.1137, "loss_nan_ranks": 0, "loss_rank_avg": 0.10613100230693817, "step": 10135 }, { "epoch": 5.316561844863732, "grad_norm": 0.7237040996551514, "learning_rate": 6.662770617319494e-06, "loss": 0.1203, "loss_nan_ranks": 0, "loss_rank_avg": 0.14210447669029236, "step": 10140 }, { "epoch": 5.319182389937107, "grad_norm": 0.6779884099960327, "learning_rate": 6.643305660395318e-06, "loss": 0.1289, "loss_nan_ranks": 0, "loss_rank_avg": 0.14578291773796082, "step": 10145 }, { "epoch": 5.321802935010482, "grad_norm": 0.766578733921051, "learning_rate": 6.623863513705348e-06, "loss": 0.1179, "loss_nan_ranks": 0, "loss_rank_avg": 0.1254599392414093, "step": 10150 }, { "epoch": 5.3244234800838575, "grad_norm": 0.7032365798950195, "learning_rate": 6.604444210452403e-06, "loss": 0.0913, "loss_nan_ranks": 0, "loss_rank_avg": 0.09285767376422882, "step": 10155 }, { "epoch": 5.327044025157233, "grad_norm": 0.5468205809593201, "learning_rate": 6.585047783800285e-06, "loss": 0.1162, "loss_nan_ranks": 0, "loss_rank_avg": 0.09434093534946442, "step": 10160 }, { "epoch": 5.329664570230608, "grad_norm": 0.7430211901664734, "learning_rate": 6.565674266873745e-06, "loss": 0.1124, "loss_nan_ranks": 0, "loss_rank_avg": 0.10109928250312805, "step": 10165 }, { "epoch": 5.332285115303983, "grad_norm": 0.6913556456565857, "learning_rate": 6.546323692758396e-06, "loss": 0.1145, "loss_nan_ranks": 0, "loss_rank_avg": 0.11161768436431885, "step": 10170 }, { "epoch": 5.334905660377358, "grad_norm": 0.7440565824508667, "learning_rate": 6.52699609450067e-06, "loss": 0.1282, "loss_nan_ranks": 0, "loss_rank_avg": 0.1132674366235733, "step": 10175 }, { "epoch": 5.337526205450734, "grad_norm": 0.5860890746116638, "learning_rate": 6.5076915051077675e-06, "loss": 0.1343, "loss_nan_ranks": 0, "loss_rank_avg": 0.13795891404151917, "step": 10180 }, { "epoch": 5.340146750524109, "grad_norm": 0.6547601222991943, "learning_rate": 6.488409957547581e-06, "loss": 0.1227, "loss_nan_ranks": 0, "loss_rank_avg": 0.15738970041275024, "step": 10185 }, { "epoch": 5.3427672955974845, "grad_norm": 0.8251211643218994, "learning_rate": 6.469151484748679e-06, "loss": 0.1297, "loss_nan_ranks": 0, "loss_rank_avg": 0.1241455078125, "step": 10190 }, { "epoch": 5.34538784067086, "grad_norm": 0.6415407657623291, "learning_rate": 6.449916119600201e-06, "loss": 0.1169, "loss_nan_ranks": 0, "loss_rank_avg": 0.12454849481582642, "step": 10195 }, { "epoch": 5.348008385744235, "grad_norm": 0.6995936632156372, "learning_rate": 6.4307038949518305e-06, "loss": 0.1332, "loss_nan_ranks": 0, "loss_rank_avg": 0.1687215268611908, "step": 10200 }, { "epoch": 5.35062893081761, "grad_norm": 0.7267442345619202, "learning_rate": 6.411514843613725e-06, "loss": 0.122, "loss_nan_ranks": 0, "loss_rank_avg": 0.11219580471515656, "step": 10205 }, { "epoch": 5.353249475890985, "grad_norm": 0.7010683417320251, "learning_rate": 6.39234899835649e-06, "loss": 0.1134, "loss_nan_ranks": 0, "loss_rank_avg": 0.12867935001850128, "step": 10210 }, { "epoch": 5.35587002096436, "grad_norm": 0.6663080453872681, "learning_rate": 6.373206391911069e-06, "loss": 0.1043, "loss_nan_ranks": 0, "loss_rank_avg": 0.11025137454271317, "step": 10215 }, { "epoch": 5.3584905660377355, "grad_norm": 0.732987105846405, "learning_rate": 6.354087056968748e-06, "loss": 0.107, "loss_nan_ranks": 0, "loss_rank_avg": 0.10928837954998016, "step": 10220 }, { "epoch": 5.361111111111111, "grad_norm": 0.9579931497573853, "learning_rate": 6.334991026181052e-06, "loss": 0.1174, "loss_nan_ranks": 0, "loss_rank_avg": 0.11715349555015564, "step": 10225 }, { "epoch": 5.363731656184487, "grad_norm": 0.728870689868927, "learning_rate": 6.315918332159714e-06, "loss": 0.1265, "loss_nan_ranks": 0, "loss_rank_avg": 0.13661688566207886, "step": 10230 }, { "epoch": 5.366352201257862, "grad_norm": 0.7422712445259094, "learning_rate": 6.296869007476609e-06, "loss": 0.119, "loss_nan_ranks": 0, "loss_rank_avg": 0.143565371632576, "step": 10235 }, { "epoch": 5.368972746331237, "grad_norm": 0.7388545870780945, "learning_rate": 6.277843084663701e-06, "loss": 0.1104, "loss_nan_ranks": 0, "loss_rank_avg": 0.1201258972287178, "step": 10240 }, { "epoch": 5.371593291404612, "grad_norm": 0.8039642572402954, "learning_rate": 6.258840596213005e-06, "loss": 0.1198, "loss_nan_ranks": 0, "loss_rank_avg": 0.10702769458293915, "step": 10245 }, { "epoch": 5.3742138364779874, "grad_norm": 0.6975395679473877, "learning_rate": 6.239861574576498e-06, "loss": 0.1127, "loss_nan_ranks": 0, "loss_rank_avg": 0.11157487332820892, "step": 10250 }, { "epoch": 5.376834381551363, "grad_norm": 0.6503974199295044, "learning_rate": 6.220906052166085e-06, "loss": 0.109, "loss_nan_ranks": 0, "loss_rank_avg": 0.11010022461414337, "step": 10255 }, { "epoch": 5.379454926624738, "grad_norm": 0.7469361424446106, "learning_rate": 6.201974061353542e-06, "loss": 0.1236, "loss_nan_ranks": 0, "loss_rank_avg": 0.09031128138303757, "step": 10260 }, { "epoch": 5.382075471698113, "grad_norm": 0.6768584847450256, "learning_rate": 6.183065634470453e-06, "loss": 0.1116, "loss_nan_ranks": 0, "loss_rank_avg": 0.09780827164649963, "step": 10265 }, { "epoch": 5.384696016771488, "grad_norm": 0.6973583102226257, "learning_rate": 6.164180803808173e-06, "loss": 0.1359, "loss_nan_ranks": 0, "loss_rank_avg": 0.15749692916870117, "step": 10270 }, { "epoch": 5.387316561844864, "grad_norm": 0.6480283737182617, "learning_rate": 6.145319601617749e-06, "loss": 0.1148, "loss_nan_ranks": 0, "loss_rank_avg": 0.14943397045135498, "step": 10275 }, { "epoch": 5.389937106918239, "grad_norm": 0.7254815101623535, "learning_rate": 6.126482060109877e-06, "loss": 0.1082, "loss_nan_ranks": 0, "loss_rank_avg": 0.08257943391799927, "step": 10280 }, { "epoch": 5.3925576519916145, "grad_norm": 0.7758432030677795, "learning_rate": 6.1076682114548465e-06, "loss": 0.1277, "loss_nan_ranks": 0, "loss_rank_avg": 0.12551628053188324, "step": 10285 }, { "epoch": 5.39517819706499, "grad_norm": 0.7967376708984375, "learning_rate": 6.088878087782488e-06, "loss": 0.1206, "loss_nan_ranks": 0, "loss_rank_avg": 0.11224541068077087, "step": 10290 }, { "epoch": 5.397798742138365, "grad_norm": 0.7282092571258545, "learning_rate": 6.070111721182104e-06, "loss": 0.114, "loss_nan_ranks": 0, "loss_rank_avg": 0.12284990400075912, "step": 10295 }, { "epoch": 5.40041928721174, "grad_norm": 0.7961103320121765, "learning_rate": 6.051369143702446e-06, "loss": 0.115, "loss_nan_ranks": 0, "loss_rank_avg": 0.075592041015625, "step": 10300 }, { "epoch": 5.403039832285115, "grad_norm": 0.6946623921394348, "learning_rate": 6.032650387351624e-06, "loss": 0.1156, "loss_nan_ranks": 0, "loss_rank_avg": 0.09668971598148346, "step": 10305 }, { "epoch": 5.40566037735849, "grad_norm": 0.6397780179977417, "learning_rate": 6.013955484097067e-06, "loss": 0.1289, "loss_nan_ranks": 0, "loss_rank_avg": 0.18690338730812073, "step": 10310 }, { "epoch": 5.4082809224318655, "grad_norm": 0.6478099822998047, "learning_rate": 5.9952844658654744e-06, "loss": 0.1214, "loss_nan_ranks": 0, "loss_rank_avg": 0.14790332317352295, "step": 10315 }, { "epoch": 5.410901467505241, "grad_norm": 0.7488334774971008, "learning_rate": 5.9766373645427415e-06, "loss": 0.1162, "loss_nan_ranks": 0, "loss_rank_avg": 0.11961181461811066, "step": 10320 }, { "epoch": 5.413522012578617, "grad_norm": 0.8306141495704651, "learning_rate": 5.958014211973943e-06, "loss": 0.1195, "loss_nan_ranks": 0, "loss_rank_avg": 0.1055527925491333, "step": 10325 }, { "epoch": 5.416142557651992, "grad_norm": 0.7220061421394348, "learning_rate": 5.9394150399632385e-06, "loss": 0.1405, "loss_nan_ranks": 0, "loss_rank_avg": 0.16748718917369843, "step": 10330 }, { "epoch": 5.418763102725367, "grad_norm": 0.6414279937744141, "learning_rate": 5.920839880273832e-06, "loss": 0.1092, "loss_nan_ranks": 0, "loss_rank_avg": 0.09563075006008148, "step": 10335 }, { "epoch": 5.421383647798742, "grad_norm": 0.7283735871315002, "learning_rate": 5.902288764627928e-06, "loss": 0.1274, "loss_nan_ranks": 0, "loss_rank_avg": 0.11764983832836151, "step": 10340 }, { "epoch": 5.424004192872117, "grad_norm": 0.6365904211997986, "learning_rate": 5.883761724706656e-06, "loss": 0.1285, "loss_nan_ranks": 0, "loss_rank_avg": 0.13443568348884583, "step": 10345 }, { "epoch": 5.426624737945493, "grad_norm": 0.7272641658782959, "learning_rate": 5.8652587921500544e-06, "loss": 0.1148, "loss_nan_ranks": 0, "loss_rank_avg": 0.08130456507205963, "step": 10350 }, { "epoch": 5.429245283018868, "grad_norm": 0.6290727853775024, "learning_rate": 5.846779998556971e-06, "loss": 0.1188, "loss_nan_ranks": 0, "loss_rank_avg": 0.1324918270111084, "step": 10355 }, { "epoch": 5.431865828092243, "grad_norm": 0.6942636966705322, "learning_rate": 5.828325375485033e-06, "loss": 0.1261, "loss_nan_ranks": 0, "loss_rank_avg": 0.10782308876514435, "step": 10360 }, { "epoch": 5.434486373165618, "grad_norm": 0.856620192527771, "learning_rate": 5.809894954450592e-06, "loss": 0.1264, "loss_nan_ranks": 0, "loss_rank_avg": 0.11584212630987167, "step": 10365 }, { "epoch": 5.437106918238994, "grad_norm": 0.6576579809188843, "learning_rate": 5.791488766928664e-06, "loss": 0.1114, "loss_nan_ranks": 0, "loss_rank_avg": 0.1250620186328888, "step": 10370 }, { "epoch": 5.439727463312369, "grad_norm": 0.6821559071540833, "learning_rate": 5.773106844352894e-06, "loss": 0.1307, "loss_nan_ranks": 0, "loss_rank_avg": 0.11624176800251007, "step": 10375 }, { "epoch": 5.4423480083857445, "grad_norm": 0.6944690346717834, "learning_rate": 5.7547492181154715e-06, "loss": 0.1139, "loss_nan_ranks": 0, "loss_rank_avg": 0.11554088443517685, "step": 10380 }, { "epoch": 5.44496855345912, "grad_norm": 0.6521162390708923, "learning_rate": 5.7364159195670975e-06, "loss": 0.1103, "loss_nan_ranks": 0, "loss_rank_avg": 0.11223713308572769, "step": 10385 }, { "epoch": 5.447589098532495, "grad_norm": 0.844800591468811, "learning_rate": 5.718106980016933e-06, "loss": 0.1011, "loss_nan_ranks": 0, "loss_rank_avg": 0.10950352251529694, "step": 10390 }, { "epoch": 5.45020964360587, "grad_norm": 0.6249414086341858, "learning_rate": 5.6998224307325246e-06, "loss": 0.1355, "loss_nan_ranks": 0, "loss_rank_avg": 0.17765092849731445, "step": 10395 }, { "epoch": 5.452830188679245, "grad_norm": 0.7176016569137573, "learning_rate": 5.68156230293979e-06, "loss": 0.1104, "loss_nan_ranks": 0, "loss_rank_avg": 0.10899336636066437, "step": 10400 }, { "epoch": 5.45545073375262, "grad_norm": 0.7706964015960693, "learning_rate": 5.66332662782292e-06, "loss": 0.0888, "loss_nan_ranks": 0, "loss_rank_avg": 0.074981689453125, "step": 10405 }, { "epoch": 5.4580712788259955, "grad_norm": 0.8711444139480591, "learning_rate": 5.645115436524353e-06, "loss": 0.0983, "loss_nan_ranks": 0, "loss_rank_avg": 0.11633491516113281, "step": 10410 }, { "epoch": 5.460691823899371, "grad_norm": 0.7096289396286011, "learning_rate": 5.626928760144712e-06, "loss": 0.1167, "loss_nan_ranks": 0, "loss_rank_avg": 0.10099075734615326, "step": 10415 }, { "epoch": 5.463312368972747, "grad_norm": 0.6985383629798889, "learning_rate": 5.6087666297427526e-06, "loss": 0.1129, "loss_nan_ranks": 0, "loss_rank_avg": 0.08926206827163696, "step": 10420 }, { "epoch": 5.465932914046122, "grad_norm": 0.7082505822181702, "learning_rate": 5.590629076335323e-06, "loss": 0.1181, "loss_nan_ranks": 0, "loss_rank_avg": 0.11006351560354233, "step": 10425 }, { "epoch": 5.468553459119497, "grad_norm": 0.7185156345367432, "learning_rate": 5.572516130897288e-06, "loss": 0.111, "loss_nan_ranks": 0, "loss_rank_avg": 0.13114425539970398, "step": 10430 }, { "epoch": 5.471174004192872, "grad_norm": 0.7408719658851624, "learning_rate": 5.554427824361488e-06, "loss": 0.1102, "loss_nan_ranks": 0, "loss_rank_avg": 0.13163840770721436, "step": 10435 }, { "epoch": 5.473794549266247, "grad_norm": 1.1774944067001343, "learning_rate": 5.5363641876186905e-06, "loss": 0.1239, "loss_nan_ranks": 0, "loss_rank_avg": 0.15403783321380615, "step": 10440 }, { "epoch": 5.476415094339623, "grad_norm": 0.6841291785240173, "learning_rate": 5.518325251517522e-06, "loss": 0.1325, "loss_nan_ranks": 0, "loss_rank_avg": 0.1528058797121048, "step": 10445 }, { "epoch": 5.479035639412998, "grad_norm": 0.7858748435974121, "learning_rate": 5.500311046864448e-06, "loss": 0.1127, "loss_nan_ranks": 0, "loss_rank_avg": 0.12168926000595093, "step": 10450 }, { "epoch": 5.481656184486373, "grad_norm": 0.8195909261703491, "learning_rate": 5.482321604423679e-06, "loss": 0.1234, "loss_nan_ranks": 0, "loss_rank_avg": 0.13854841887950897, "step": 10455 }, { "epoch": 5.484276729559748, "grad_norm": 0.7601556181907654, "learning_rate": 5.4643569549171385e-06, "loss": 0.1214, "loss_nan_ranks": 0, "loss_rank_avg": 0.13748139142990112, "step": 10460 }, { "epoch": 5.486897274633124, "grad_norm": 0.8059583306312561, "learning_rate": 5.446417129024417e-06, "loss": 0.111, "loss_nan_ranks": 0, "loss_rank_avg": 0.15006282925605774, "step": 10465 }, { "epoch": 5.489517819706499, "grad_norm": 0.7704102396965027, "learning_rate": 5.428502157382702e-06, "loss": 0.1394, "loss_nan_ranks": 0, "loss_rank_avg": 0.10409608483314514, "step": 10470 }, { "epoch": 5.4921383647798745, "grad_norm": 0.8262134194374084, "learning_rate": 5.410612070586752e-06, "loss": 0.1104, "loss_nan_ranks": 0, "loss_rank_avg": 0.10782438516616821, "step": 10475 }, { "epoch": 5.49475890985325, "grad_norm": 0.6677366495132446, "learning_rate": 5.39274689918881e-06, "loss": 0.106, "loss_nan_ranks": 0, "loss_rank_avg": 0.1179676502943039, "step": 10480 }, { "epoch": 5.497379454926625, "grad_norm": 0.9120731353759766, "learning_rate": 5.374906673698581e-06, "loss": 0.1039, "loss_nan_ranks": 0, "loss_rank_avg": 0.0929182767868042, "step": 10485 }, { "epoch": 5.5, "grad_norm": 0.6679109930992126, "learning_rate": 5.357091424583159e-06, "loss": 0.1193, "loss_nan_ranks": 0, "loss_rank_avg": 0.10127764195203781, "step": 10490 }, { "epoch": 5.502620545073375, "grad_norm": 0.6787980198860168, "learning_rate": 5.339301182266985e-06, "loss": 0.11, "loss_nan_ranks": 0, "loss_rank_avg": 0.1122751459479332, "step": 10495 }, { "epoch": 5.50524109014675, "grad_norm": 0.7567312121391296, "learning_rate": 5.321535977131809e-06, "loss": 0.0949, "loss_nan_ranks": 0, "loss_rank_avg": 0.08058898150920868, "step": 10500 }, { "epoch": 5.5078616352201255, "grad_norm": 0.8747363090515137, "learning_rate": 5.303795839516606e-06, "loss": 0.1096, "loss_nan_ranks": 0, "loss_rank_avg": 0.0567626953125, "step": 10505 }, { "epoch": 5.510482180293501, "grad_norm": 0.8775766491889954, "learning_rate": 5.286080799717543e-06, "loss": 0.1133, "loss_nan_ranks": 0, "loss_rank_avg": 0.1152627170085907, "step": 10510 }, { "epoch": 5.513102725366876, "grad_norm": 0.6846725940704346, "learning_rate": 5.268390887987935e-06, "loss": 0.1278, "loss_nan_ranks": 0, "loss_rank_avg": 0.1599602997303009, "step": 10515 }, { "epoch": 5.515723270440252, "grad_norm": 0.6617890000343323, "learning_rate": 5.250726134538177e-06, "loss": 0.1324, "loss_nan_ranks": 0, "loss_rank_avg": 0.14858025312423706, "step": 10520 }, { "epoch": 5.518343815513627, "grad_norm": 0.701453447341919, "learning_rate": 5.233086569535692e-06, "loss": 0.1162, "loss_nan_ranks": 0, "loss_rank_avg": 0.12819528579711914, "step": 10525 }, { "epoch": 5.520964360587002, "grad_norm": 0.8231136202812195, "learning_rate": 5.215472223104909e-06, "loss": 0.102, "loss_nan_ranks": 0, "loss_rank_avg": 0.14212852716445923, "step": 10530 }, { "epoch": 5.523584905660377, "grad_norm": 0.7562174797058105, "learning_rate": 5.19788312532717e-06, "loss": 0.1151, "loss_nan_ranks": 0, "loss_rank_avg": 0.1172027587890625, "step": 10535 }, { "epoch": 5.526205450733753, "grad_norm": 0.668741762638092, "learning_rate": 5.180319306240702e-06, "loss": 0.1247, "loss_nan_ranks": 0, "loss_rank_avg": 0.09932608157396317, "step": 10540 }, { "epoch": 5.528825995807128, "grad_norm": 0.7404559254646301, "learning_rate": 5.162780795840567e-06, "loss": 0.1096, "loss_nan_ranks": 0, "loss_rank_avg": 0.11360190808773041, "step": 10545 }, { "epoch": 5.531446540880503, "grad_norm": 0.6470763683319092, "learning_rate": 5.145267624078594e-06, "loss": 0.1069, "loss_nan_ranks": 0, "loss_rank_avg": 0.10094386339187622, "step": 10550 }, { "epoch": 5.534067085953878, "grad_norm": 0.6381714344024658, "learning_rate": 5.1277798208633565e-06, "loss": 0.1298, "loss_nan_ranks": 0, "loss_rank_avg": 0.12636935710906982, "step": 10555 }, { "epoch": 5.536687631027254, "grad_norm": 1.1980273723602295, "learning_rate": 5.110317416060093e-06, "loss": 0.1098, "loss_nan_ranks": 0, "loss_rank_avg": 0.09641356766223907, "step": 10560 }, { "epoch": 5.539308176100629, "grad_norm": 0.6636906862258911, "learning_rate": 5.092880439490666e-06, "loss": 0.1299, "loss_nan_ranks": 0, "loss_rank_avg": 0.11983007192611694, "step": 10565 }, { "epoch": 5.5419287211740045, "grad_norm": 0.7041227221488953, "learning_rate": 5.075468920933517e-06, "loss": 0.1233, "loss_nan_ranks": 0, "loss_rank_avg": 0.11246918141841888, "step": 10570 }, { "epoch": 5.54454926624738, "grad_norm": 0.6758451461791992, "learning_rate": 5.058082890123605e-06, "loss": 0.1149, "loss_nan_ranks": 0, "loss_rank_avg": 0.131456658244133, "step": 10575 }, { "epoch": 5.547169811320755, "grad_norm": 0.7600293159484863, "learning_rate": 5.040722376752374e-06, "loss": 0.1128, "loss_nan_ranks": 0, "loss_rank_avg": 0.11408248543739319, "step": 10580 }, { "epoch": 5.54979035639413, "grad_norm": 0.7650055885314941, "learning_rate": 5.02338741046768e-06, "loss": 0.0888, "loss_nan_ranks": 0, "loss_rank_avg": 0.08961398899555206, "step": 10585 }, { "epoch": 5.552410901467505, "grad_norm": 0.6169334053993225, "learning_rate": 5.006078020873748e-06, "loss": 0.1246, "loss_nan_ranks": 0, "loss_rank_avg": 0.1591559648513794, "step": 10590 }, { "epoch": 5.55503144654088, "grad_norm": 0.7118397355079651, "learning_rate": 4.988794237531129e-06, "loss": 0.1172, "loss_nan_ranks": 0, "loss_rank_avg": 0.09753826260566711, "step": 10595 }, { "epoch": 5.5576519916142555, "grad_norm": 0.7271219491958618, "learning_rate": 4.971536089956641e-06, "loss": 0.1066, "loss_nan_ranks": 0, "loss_rank_avg": 0.12421578168869019, "step": 10600 }, { "epoch": 5.560272536687631, "grad_norm": 0.6494109630584717, "learning_rate": 4.954303607623332e-06, "loss": 0.1185, "loss_nan_ranks": 0, "loss_rank_avg": 0.0917762741446495, "step": 10605 }, { "epoch": 5.562893081761006, "grad_norm": 0.8092367053031921, "learning_rate": 4.937096819960408e-06, "loss": 0.1077, "loss_nan_ranks": 0, "loss_rank_avg": 0.09707881510257721, "step": 10610 }, { "epoch": 5.565513626834382, "grad_norm": 0.9010205864906311, "learning_rate": 4.919915756353198e-06, "loss": 0.1247, "loss_nan_ranks": 0, "loss_rank_avg": 0.10653796792030334, "step": 10615 }, { "epoch": 5.568134171907757, "grad_norm": 0.7276707887649536, "learning_rate": 4.902760446143096e-06, "loss": 0.1152, "loss_nan_ranks": 0, "loss_rank_avg": 0.11297433078289032, "step": 10620 }, { "epoch": 5.570754716981132, "grad_norm": 0.8087003231048584, "learning_rate": 4.885630918627518e-06, "loss": 0.1053, "loss_nan_ranks": 0, "loss_rank_avg": 0.1004638671875, "step": 10625 }, { "epoch": 5.573375262054507, "grad_norm": 0.7698782086372375, "learning_rate": 4.86852720305986e-06, "loss": 0.1012, "loss_nan_ranks": 0, "loss_rank_avg": 0.1025627851486206, "step": 10630 }, { "epoch": 5.575995807127883, "grad_norm": 0.7195867300033569, "learning_rate": 4.85144932864942e-06, "loss": 0.0957, "loss_nan_ranks": 0, "loss_rank_avg": 0.0869738757610321, "step": 10635 }, { "epoch": 5.578616352201258, "grad_norm": 0.7677379846572876, "learning_rate": 4.834397324561375e-06, "loss": 0.1042, "loss_nan_ranks": 0, "loss_rank_avg": 0.0792236328125, "step": 10640 }, { "epoch": 5.581236897274633, "grad_norm": 0.7311434149742126, "learning_rate": 4.817371219916713e-06, "loss": 0.098, "loss_nan_ranks": 0, "loss_rank_avg": 0.10641766339540482, "step": 10645 }, { "epoch": 5.583857442348008, "grad_norm": 0.6872872710227966, "learning_rate": 4.800371043792198e-06, "loss": 0.1121, "loss_nan_ranks": 0, "loss_rank_avg": 0.09992322325706482, "step": 10650 }, { "epoch": 5.586477987421384, "grad_norm": 0.7587729692459106, "learning_rate": 4.783396825220319e-06, "loss": 0.0983, "loss_nan_ranks": 0, "loss_rank_avg": 0.0865052118897438, "step": 10655 }, { "epoch": 5.589098532494759, "grad_norm": 0.8601624369621277, "learning_rate": 4.766448593189226e-06, "loss": 0.1104, "loss_nan_ranks": 0, "loss_rank_avg": 0.0723876953125, "step": 10660 }, { "epoch": 5.5917190775681345, "grad_norm": 0.6991299390792847, "learning_rate": 4.7495263766426905e-06, "loss": 0.1246, "loss_nan_ranks": 0, "loss_rank_avg": 0.12729085981845856, "step": 10665 }, { "epoch": 5.59433962264151, "grad_norm": 0.7156105637550354, "learning_rate": 4.732630204480059e-06, "loss": 0.1214, "loss_nan_ranks": 0, "loss_rank_avg": 0.11206556856632233, "step": 10670 }, { "epoch": 5.596960167714885, "grad_norm": 0.7052958011627197, "learning_rate": 4.715760105556198e-06, "loss": 0.1139, "loss_nan_ranks": 0, "loss_rank_avg": 0.10625350475311279, "step": 10675 }, { "epoch": 5.59958071278826, "grad_norm": 0.8378977179527283, "learning_rate": 4.69891610868145e-06, "loss": 0.1212, "loss_nan_ranks": 0, "loss_rank_avg": 0.13235339522361755, "step": 10680 }, { "epoch": 5.602201257861635, "grad_norm": 0.8395311236381531, "learning_rate": 4.68209824262158e-06, "loss": 0.1233, "loss_nan_ranks": 0, "loss_rank_avg": 0.0919189453125, "step": 10685 }, { "epoch": 5.60482180293501, "grad_norm": 0.702622652053833, "learning_rate": 4.665306536097725e-06, "loss": 0.1078, "loss_nan_ranks": 0, "loss_rank_avg": 0.11648667603731155, "step": 10690 }, { "epoch": 5.6074423480083855, "grad_norm": 0.8039387464523315, "learning_rate": 4.648541017786345e-06, "loss": 0.1117, "loss_nan_ranks": 0, "loss_rank_avg": 0.11662253737449646, "step": 10695 }, { "epoch": 5.610062893081761, "grad_norm": 0.7079611420631409, "learning_rate": 4.63180171631918e-06, "loss": 0.1347, "loss_nan_ranks": 0, "loss_rank_avg": 0.12504009902477264, "step": 10700 }, { "epoch": 5.612683438155136, "grad_norm": 0.7281865477561951, "learning_rate": 4.615088660283202e-06, "loss": 0.1232, "loss_nan_ranks": 0, "loss_rank_avg": 0.12699584662914276, "step": 10705 }, { "epoch": 5.615303983228512, "grad_norm": 0.7949917316436768, "learning_rate": 4.598401878220557e-06, "loss": 0.1132, "loss_nan_ranks": 0, "loss_rank_avg": 0.08386878669261932, "step": 10710 }, { "epoch": 5.617924528301887, "grad_norm": 0.768836259841919, "learning_rate": 4.581741398628521e-06, "loss": 0.1057, "loss_nan_ranks": 0, "loss_rank_avg": 0.10535287111997604, "step": 10715 }, { "epoch": 5.620545073375262, "grad_norm": 0.6575037240982056, "learning_rate": 4.565107249959449e-06, "loss": 0.1075, "loss_nan_ranks": 0, "loss_rank_avg": 0.06976318359375, "step": 10720 }, { "epoch": 5.623165618448637, "grad_norm": 0.6681073904037476, "learning_rate": 4.54849946062073e-06, "loss": 0.1108, "loss_nan_ranks": 0, "loss_rank_avg": 0.1465725600719452, "step": 10725 }, { "epoch": 5.6257861635220126, "grad_norm": 0.7189939022064209, "learning_rate": 4.531918058974736e-06, "loss": 0.118, "loss_nan_ranks": 0, "loss_rank_avg": 0.13257884979248047, "step": 10730 }, { "epoch": 5.628406708595388, "grad_norm": 0.7120517492294312, "learning_rate": 4.515363073338788e-06, "loss": 0.1044, "loss_nan_ranks": 0, "loss_rank_avg": 0.07698134332895279, "step": 10735 }, { "epoch": 5.631027253668763, "grad_norm": 0.7609550952911377, "learning_rate": 4.498834531985075e-06, "loss": 0.1312, "loss_nan_ranks": 0, "loss_rank_avg": 0.11576057970523834, "step": 10740 }, { "epoch": 5.633647798742138, "grad_norm": 0.6521153450012207, "learning_rate": 4.482332463140635e-06, "loss": 0.141, "loss_nan_ranks": 0, "loss_rank_avg": 0.129469633102417, "step": 10745 }, { "epoch": 5.636268343815514, "grad_norm": 0.7928845882415771, "learning_rate": 4.465856894987297e-06, "loss": 0.1304, "loss_nan_ranks": 0, "loss_rank_avg": 0.1300087422132492, "step": 10750 }, { "epoch": 5.638888888888889, "grad_norm": 0.7226012349128723, "learning_rate": 4.4494078556616246e-06, "loss": 0.1184, "loss_nan_ranks": 0, "loss_rank_avg": 0.13354867696762085, "step": 10755 }, { "epoch": 5.6415094339622645, "grad_norm": 0.7212560176849365, "learning_rate": 4.4329853732548925e-06, "loss": 0.1117, "loss_nan_ranks": 0, "loss_rank_avg": 0.08856201171875, "step": 10760 }, { "epoch": 5.64412997903564, "grad_norm": 0.7965877652168274, "learning_rate": 4.416589475813009e-06, "loss": 0.1241, "loss_nan_ranks": 0, "loss_rank_avg": 0.0980224609375, "step": 10765 }, { "epoch": 5.646750524109015, "grad_norm": 0.6735566854476929, "learning_rate": 4.400220191336484e-06, "loss": 0.1381, "loss_nan_ranks": 0, "loss_rank_avg": 0.1463538408279419, "step": 10770 }, { "epoch": 5.64937106918239, "grad_norm": 0.7604423761367798, "learning_rate": 4.383877547780378e-06, "loss": 0.1159, "loss_nan_ranks": 0, "loss_rank_avg": 0.1259516477584839, "step": 10775 }, { "epoch": 5.651991614255765, "grad_norm": 0.7322911024093628, "learning_rate": 4.3675615730542505e-06, "loss": 0.1254, "loss_nan_ranks": 0, "loss_rank_avg": 0.125588059425354, "step": 10780 }, { "epoch": 5.65461215932914, "grad_norm": 0.9131481647491455, "learning_rate": 4.351272295022133e-06, "loss": 0.1448, "loss_nan_ranks": 0, "loss_rank_avg": 0.1126449927687645, "step": 10785 }, { "epoch": 5.6572327044025155, "grad_norm": 0.6480017304420471, "learning_rate": 4.335009741502452e-06, "loss": 0.1121, "loss_nan_ranks": 0, "loss_rank_avg": 0.09486915916204453, "step": 10790 }, { "epoch": 5.659853249475891, "grad_norm": 0.8091151714324951, "learning_rate": 4.318773940267991e-06, "loss": 0.107, "loss_nan_ranks": 0, "loss_rank_avg": 0.096101313829422, "step": 10795 }, { "epoch": 5.662473794549266, "grad_norm": 0.6447314023971558, "learning_rate": 4.302564919045855e-06, "loss": 0.111, "loss_nan_ranks": 0, "loss_rank_avg": 0.09380515664815903, "step": 10800 }, { "epoch": 5.665094339622642, "grad_norm": 0.6767293810844421, "learning_rate": 4.286382705517407e-06, "loss": 0.1232, "loss_nan_ranks": 0, "loss_rank_avg": 0.11664150655269623, "step": 10805 }, { "epoch": 5.667714884696017, "grad_norm": 0.9022296667098999, "learning_rate": 4.270227327318244e-06, "loss": 0.1116, "loss_nan_ranks": 0, "loss_rank_avg": 0.0869140625, "step": 10810 }, { "epoch": 5.670335429769392, "grad_norm": 0.6763343811035156, "learning_rate": 4.2540988120381165e-06, "loss": 0.1316, "loss_nan_ranks": 0, "loss_rank_avg": 0.12650880217552185, "step": 10815 }, { "epoch": 5.672955974842767, "grad_norm": 0.7594171166419983, "learning_rate": 4.23799718722091e-06, "loss": 0.1104, "loss_nan_ranks": 0, "loss_rank_avg": 0.10680019855499268, "step": 10820 }, { "epoch": 5.6755765199161425, "grad_norm": 0.694604754447937, "learning_rate": 4.2219224803645795e-06, "loss": 0.1084, "loss_nan_ranks": 0, "loss_rank_avg": 0.0924072265625, "step": 10825 }, { "epoch": 5.678197064989518, "grad_norm": 0.752474308013916, "learning_rate": 4.20587471892111e-06, "loss": 0.141, "loss_nan_ranks": 0, "loss_rank_avg": 0.1477525681257248, "step": 10830 }, { "epoch": 5.680817610062893, "grad_norm": 0.7720589637756348, "learning_rate": 4.189853930296486e-06, "loss": 0.1147, "loss_nan_ranks": 0, "loss_rank_avg": 0.08408753573894501, "step": 10835 }, { "epoch": 5.683438155136268, "grad_norm": 0.7506592273712158, "learning_rate": 4.173860141850612e-06, "loss": 0.1201, "loss_nan_ranks": 0, "loss_rank_avg": 0.10575760155916214, "step": 10840 }, { "epoch": 5.686058700209644, "grad_norm": 0.7803653478622437, "learning_rate": 4.157893380897282e-06, "loss": 0.1344, "loss_nan_ranks": 0, "loss_rank_avg": 0.12382243573665619, "step": 10845 }, { "epoch": 5.688679245283019, "grad_norm": 0.7440863847732544, "learning_rate": 4.1419536747041425e-06, "loss": 0.124, "loss_nan_ranks": 0, "loss_rank_avg": 0.10115906596183777, "step": 10850 }, { "epoch": 5.691299790356394, "grad_norm": 0.7986370325088501, "learning_rate": 4.126041050492624e-06, "loss": 0.1184, "loss_nan_ranks": 0, "loss_rank_avg": 0.0830078125, "step": 10855 }, { "epoch": 5.69392033542977, "grad_norm": 0.6744484901428223, "learning_rate": 4.110155535437927e-06, "loss": 0.1042, "loss_nan_ranks": 0, "loss_rank_avg": 0.11421257257461548, "step": 10860 }, { "epoch": 5.696540880503145, "grad_norm": 0.8917695879936218, "learning_rate": 4.094297156668936e-06, "loss": 0.1081, "loss_nan_ranks": 0, "loss_rank_avg": 0.10523484647274017, "step": 10865 }, { "epoch": 5.69916142557652, "grad_norm": 0.6464312076568604, "learning_rate": 4.078465941268204e-06, "loss": 0.1231, "loss_nan_ranks": 0, "loss_rank_avg": 0.14706212282180786, "step": 10870 }, { "epoch": 5.701781970649895, "grad_norm": 0.7350417375564575, "learning_rate": 4.062661916271889e-06, "loss": 0.1126, "loss_nan_ranks": 0, "loss_rank_avg": 0.08206360787153244, "step": 10875 }, { "epoch": 5.70440251572327, "grad_norm": 0.7763562202453613, "learning_rate": 4.046885108669709e-06, "loss": 0.1184, "loss_nan_ranks": 0, "loss_rank_avg": 0.12558457255363464, "step": 10880 }, { "epoch": 5.7070230607966455, "grad_norm": 0.7393556237220764, "learning_rate": 4.031135545404923e-06, "loss": 0.1203, "loss_nan_ranks": 0, "loss_rank_avg": 0.1080063134431839, "step": 10885 }, { "epoch": 5.709643605870021, "grad_norm": 0.697335422039032, "learning_rate": 4.015413253374239e-06, "loss": 0.1077, "loss_nan_ranks": 0, "loss_rank_avg": 0.09124629944562912, "step": 10890 }, { "epoch": 5.712264150943396, "grad_norm": 0.7845637202262878, "learning_rate": 3.999718259427805e-06, "loss": 0.1115, "loss_nan_ranks": 0, "loss_rank_avg": 0.08410429209470749, "step": 10895 }, { "epoch": 5.714884696016772, "grad_norm": 0.8138260841369629, "learning_rate": 3.9840505903691414e-06, "loss": 0.1092, "loss_nan_ranks": 0, "loss_rank_avg": 0.124274343252182, "step": 10900 }, { "epoch": 5.717505241090147, "grad_norm": 12.967086791992188, "learning_rate": 3.968410272955106e-06, "loss": 0.1189, "loss_nan_ranks": 0, "loss_rank_avg": 0.14421498775482178, "step": 10905 }, { "epoch": 5.720125786163522, "grad_norm": 0.7884721159934998, "learning_rate": 3.952797333895855e-06, "loss": 0.1322, "loss_nan_ranks": 0, "loss_rank_avg": 0.08462636172771454, "step": 10910 }, { "epoch": 5.722746331236897, "grad_norm": 0.754406750202179, "learning_rate": 3.937211799854781e-06, "loss": 0.1356, "loss_nan_ranks": 0, "loss_rank_avg": 0.14049619436264038, "step": 10915 }, { "epoch": 5.7253668763102725, "grad_norm": 0.7726715803146362, "learning_rate": 3.921653697448475e-06, "loss": 0.0934, "loss_nan_ranks": 0, "loss_rank_avg": 0.07763671875, "step": 10920 }, { "epoch": 5.727987421383648, "grad_norm": 0.7438585162162781, "learning_rate": 3.90612305324668e-06, "loss": 0.1021, "loss_nan_ranks": 0, "loss_rank_avg": 0.090576171875, "step": 10925 }, { "epoch": 5.730607966457023, "grad_norm": 0.776539146900177, "learning_rate": 3.890619893772245e-06, "loss": 0.1162, "loss_nan_ranks": 0, "loss_rank_avg": 0.12244457006454468, "step": 10930 }, { "epoch": 5.733228511530398, "grad_norm": 0.7650336623191833, "learning_rate": 3.875144245501093e-06, "loss": 0.1151, "loss_nan_ranks": 0, "loss_rank_avg": 0.08882482349872589, "step": 10935 }, { "epoch": 5.735849056603773, "grad_norm": 0.7434911131858826, "learning_rate": 3.859696134862152e-06, "loss": 0.1162, "loss_nan_ranks": 0, "loss_rank_avg": 0.09627208113670349, "step": 10940 }, { "epoch": 5.738469601677149, "grad_norm": 0.7924013137817383, "learning_rate": 3.844275588237325e-06, "loss": 0.1037, "loss_nan_ranks": 0, "loss_rank_avg": 0.12574627995491028, "step": 10945 }, { "epoch": 5.741090146750524, "grad_norm": 0.806307315826416, "learning_rate": 3.828882631961442e-06, "loss": 0.1078, "loss_nan_ranks": 0, "loss_rank_avg": 0.10893553495407104, "step": 10950 }, { "epoch": 5.7437106918239, "grad_norm": 0.7530555725097656, "learning_rate": 3.813517292322215e-06, "loss": 0.1162, "loss_nan_ranks": 0, "loss_rank_avg": 0.13084349036216736, "step": 10955 }, { "epoch": 5.746331236897275, "grad_norm": 0.7580758929252625, "learning_rate": 3.7981795955601896e-06, "loss": 0.0995, "loss_nan_ranks": 0, "loss_rank_avg": 0.10415025055408478, "step": 10960 }, { "epoch": 5.74895178197065, "grad_norm": 0.6946631669998169, "learning_rate": 3.7828695678687166e-06, "loss": 0.116, "loss_nan_ranks": 0, "loss_rank_avg": 0.11199931800365448, "step": 10965 }, { "epoch": 5.751572327044025, "grad_norm": 0.7735135555267334, "learning_rate": 3.7675872353938814e-06, "loss": 0.1215, "loss_nan_ranks": 0, "loss_rank_avg": 0.12082436680793762, "step": 10970 }, { "epoch": 5.7541928721174, "grad_norm": 0.8254667520523071, "learning_rate": 3.7523326242344717e-06, "loss": 0.1167, "loss_nan_ranks": 0, "loss_rank_avg": 0.11641869693994522, "step": 10975 }, { "epoch": 5.756813417190775, "grad_norm": 0.7062658071517944, "learning_rate": 3.7371057604419415e-06, "loss": 0.13, "loss_nan_ranks": 0, "loss_rank_avg": 0.13770698010921478, "step": 10980 }, { "epoch": 5.759433962264151, "grad_norm": 0.6900745034217834, "learning_rate": 3.7219066700203455e-06, "loss": 0.1072, "loss_nan_ranks": 0, "loss_rank_avg": 0.1039891242980957, "step": 10985 }, { "epoch": 5.762054507337526, "grad_norm": 0.7733207941055298, "learning_rate": 3.7067353789263294e-06, "loss": 0.1303, "loss_nan_ranks": 0, "loss_rank_avg": 0.12287549674510956, "step": 10990 }, { "epoch": 5.764675052410902, "grad_norm": 0.9098268151283264, "learning_rate": 3.691591913069048e-06, "loss": 0.1093, "loss_nan_ranks": 0, "loss_rank_avg": 0.08826844394207001, "step": 10995 }, { "epoch": 5.767295597484277, "grad_norm": 0.7530337572097778, "learning_rate": 3.6764762983101344e-06, "loss": 0.1079, "loss_nan_ranks": 0, "loss_rank_avg": 0.12231530994176865, "step": 11000 }, { "epoch": 5.769916142557652, "grad_norm": 0.669246256351471, "learning_rate": 3.6613885604636703e-06, "loss": 0.1088, "loss_nan_ranks": 0, "loss_rank_avg": 0.13573946058750153, "step": 11005 }, { "epoch": 5.772536687631027, "grad_norm": 0.7240214347839355, "learning_rate": 3.6463287252961134e-06, "loss": 0.1233, "loss_nan_ranks": 0, "loss_rank_avg": 0.14697837829589844, "step": 11010 }, { "epoch": 5.7751572327044025, "grad_norm": 0.6674785614013672, "learning_rate": 3.6312968185262908e-06, "loss": 0.1284, "loss_nan_ranks": 0, "loss_rank_avg": 0.13692863285541534, "step": 11015 }, { "epoch": 5.777777777777778, "grad_norm": 0.7756078839302063, "learning_rate": 3.6162928658253195e-06, "loss": 0.1187, "loss_nan_ranks": 0, "loss_rank_avg": 0.12126202881336212, "step": 11020 }, { "epoch": 5.780398322851153, "grad_norm": 0.7499447464942932, "learning_rate": 3.601316892816582e-06, "loss": 0.1173, "loss_nan_ranks": 0, "loss_rank_avg": 0.14638075232505798, "step": 11025 }, { "epoch": 5.783018867924528, "grad_norm": 1.003929853439331, "learning_rate": 3.586368925075674e-06, "loss": 0.1114, "loss_nan_ranks": 0, "loss_rank_avg": 0.16575685143470764, "step": 11030 }, { "epoch": 5.785639412997903, "grad_norm": 0.722449541091919, "learning_rate": 3.571448988130364e-06, "loss": 0.1231, "loss_nan_ranks": 0, "loss_rank_avg": 0.11033296585083008, "step": 11035 }, { "epoch": 5.788259958071279, "grad_norm": 0.8108361959457397, "learning_rate": 3.556557107460565e-06, "loss": 0.1049, "loss_nan_ranks": 0, "loss_rank_avg": 0.084716796875, "step": 11040 }, { "epoch": 5.790880503144654, "grad_norm": 0.7148273587226868, "learning_rate": 3.5416933084982576e-06, "loss": 0.1154, "loss_nan_ranks": 0, "loss_rank_avg": 0.14857742190361023, "step": 11045 }, { "epoch": 5.79350104821803, "grad_norm": 0.7880802750587463, "learning_rate": 3.52685761662747e-06, "loss": 0.1209, "loss_nan_ranks": 0, "loss_rank_avg": 0.08220598101615906, "step": 11050 }, { "epoch": 5.796121593291405, "grad_norm": 0.7853556275367737, "learning_rate": 3.5120500571842375e-06, "loss": 0.1079, "loss_nan_ranks": 0, "loss_rank_avg": 0.09490779042243958, "step": 11055 }, { "epoch": 5.79874213836478, "grad_norm": 0.7573359608650208, "learning_rate": 3.497270655456537e-06, "loss": 0.1134, "loss_nan_ranks": 0, "loss_rank_avg": 0.12122859060764313, "step": 11060 }, { "epoch": 5.801362683438155, "grad_norm": 0.69629967212677, "learning_rate": 3.4825194366842797e-06, "loss": 0.1126, "loss_nan_ranks": 0, "loss_rank_avg": 0.1040906012058258, "step": 11065 }, { "epoch": 5.80398322851153, "grad_norm": 0.7572780251502991, "learning_rate": 3.4677964260592267e-06, "loss": 0.1278, "loss_nan_ranks": 0, "loss_rank_avg": 0.1337701380252838, "step": 11070 }, { "epoch": 5.806603773584905, "grad_norm": 0.8365534543991089, "learning_rate": 3.4531016487249747e-06, "loss": 0.1145, "loss_nan_ranks": 0, "loss_rank_avg": 0.07947821915149689, "step": 11075 }, { "epoch": 5.809224318658281, "grad_norm": 0.771355152130127, "learning_rate": 3.438435129776905e-06, "loss": 0.1152, "loss_nan_ranks": 0, "loss_rank_avg": 0.10697413980960846, "step": 11080 }, { "epoch": 5.811844863731656, "grad_norm": 0.7912852764129639, "learning_rate": 3.42379689426213e-06, "loss": 0.1272, "loss_nan_ranks": 0, "loss_rank_avg": 0.13008055090904236, "step": 11085 }, { "epoch": 5.814465408805032, "grad_norm": 0.7045580744743347, "learning_rate": 3.409186967179483e-06, "loss": 0.1188, "loss_nan_ranks": 0, "loss_rank_avg": 0.1028495579957962, "step": 11090 }, { "epoch": 5.817085953878407, "grad_norm": 0.7422047853469849, "learning_rate": 3.394605373479427e-06, "loss": 0.107, "loss_nan_ranks": 0, "loss_rank_avg": 0.10188481211662292, "step": 11095 }, { "epoch": 5.819706498951782, "grad_norm": 0.7149353623390198, "learning_rate": 3.3800521380640538e-06, "loss": 0.116, "loss_nan_ranks": 0, "loss_rank_avg": 0.12615402042865753, "step": 11100 }, { "epoch": 5.822327044025157, "grad_norm": 0.7516360282897949, "learning_rate": 3.3655272857870202e-06, "loss": 0.1117, "loss_nan_ranks": 0, "loss_rank_avg": 0.1172906905412674, "step": 11105 }, { "epoch": 5.8249475890985325, "grad_norm": 0.718627393245697, "learning_rate": 3.3510308414535062e-06, "loss": 0.1053, "loss_nan_ranks": 0, "loss_rank_avg": 0.12420696020126343, "step": 11110 }, { "epoch": 5.827568134171908, "grad_norm": 0.6898729205131531, "learning_rate": 3.3365628298201935e-06, "loss": 0.1181, "loss_nan_ranks": 0, "loss_rank_avg": 0.11398045718669891, "step": 11115 }, { "epoch": 5.830188679245283, "grad_norm": 0.7775851488113403, "learning_rate": 3.3221232755951903e-06, "loss": 0.1196, "loss_nan_ranks": 0, "loss_rank_avg": 0.1542092263698578, "step": 11120 }, { "epoch": 5.832809224318658, "grad_norm": 0.7814466953277588, "learning_rate": 3.307712203438014e-06, "loss": 0.129, "loss_nan_ranks": 0, "loss_rank_avg": 0.10209138691425323, "step": 11125 }, { "epoch": 5.835429769392033, "grad_norm": 0.851301908493042, "learning_rate": 3.2933296379595394e-06, "loss": 0.1073, "loss_nan_ranks": 0, "loss_rank_avg": 0.0756775438785553, "step": 11130 }, { "epoch": 5.838050314465409, "grad_norm": 0.8173316717147827, "learning_rate": 3.2789756037219524e-06, "loss": 0.1013, "loss_nan_ranks": 0, "loss_rank_avg": 0.1055392324924469, "step": 11135 }, { "epoch": 5.840670859538784, "grad_norm": 0.7540410757064819, "learning_rate": 3.2646501252387287e-06, "loss": 0.1248, "loss_nan_ranks": 0, "loss_rank_avg": 0.09263626486063004, "step": 11140 }, { "epoch": 5.84329140461216, "grad_norm": 0.8481481671333313, "learning_rate": 3.2503532269745654e-06, "loss": 0.1139, "loss_nan_ranks": 0, "loss_rank_avg": 0.13721612095832825, "step": 11145 }, { "epoch": 5.845911949685535, "grad_norm": 0.734102725982666, "learning_rate": 3.2360849333453515e-06, "loss": 0.1061, "loss_nan_ranks": 0, "loss_rank_avg": 0.11949004232883453, "step": 11150 }, { "epoch": 5.84853249475891, "grad_norm": 0.8791471123695374, "learning_rate": 3.221845268718129e-06, "loss": 0.1198, "loss_nan_ranks": 0, "loss_rank_avg": 0.11480453610420227, "step": 11155 }, { "epoch": 5.851153039832285, "grad_norm": 0.7812302708625793, "learning_rate": 3.20763425741105e-06, "loss": 0.0916, "loss_nan_ranks": 0, "loss_rank_avg": 0.04547119140625, "step": 11160 }, { "epoch": 5.85377358490566, "grad_norm": 0.8186652064323425, "learning_rate": 3.1934519236933204e-06, "loss": 0.1066, "loss_nan_ranks": 0, "loss_rank_avg": 0.10640643537044525, "step": 11165 }, { "epoch": 5.856394129979035, "grad_norm": 0.802610456943512, "learning_rate": 3.1792982917851932e-06, "loss": 0.1217, "loss_nan_ranks": 0, "loss_rank_avg": 0.08154296875, "step": 11170 }, { "epoch": 5.859014675052411, "grad_norm": 0.7426859140396118, "learning_rate": 3.165173385857889e-06, "loss": 0.1099, "loss_nan_ranks": 0, "loss_rank_avg": 0.0936279296875, "step": 11175 }, { "epoch": 5.861635220125786, "grad_norm": 0.7227411866188049, "learning_rate": 3.1510772300335747e-06, "loss": 0.1246, "loss_nan_ranks": 0, "loss_rank_avg": 0.12074403464794159, "step": 11180 }, { "epoch": 5.864255765199162, "grad_norm": 0.9096740484237671, "learning_rate": 3.1370098483853173e-06, "loss": 0.1039, "loss_nan_ranks": 0, "loss_rank_avg": 0.09936092048883438, "step": 11185 }, { "epoch": 5.866876310272537, "grad_norm": 0.6973651647567749, "learning_rate": 3.1229712649370403e-06, "loss": 0.1027, "loss_nan_ranks": 0, "loss_rank_avg": 0.1065673828125, "step": 11190 }, { "epoch": 5.869496855345912, "grad_norm": 0.6310497522354126, "learning_rate": 3.1089615036635034e-06, "loss": 0.129, "loss_nan_ranks": 0, "loss_rank_avg": 0.1379322111606598, "step": 11195 }, { "epoch": 5.872117400419287, "grad_norm": 0.6595391035079956, "learning_rate": 3.094980588490224e-06, "loss": 0.1342, "loss_nan_ranks": 0, "loss_rank_avg": 0.13927483558654785, "step": 11200 }, { "epoch": 5.8747379454926625, "grad_norm": 0.6636078357696533, "learning_rate": 3.0810285432934695e-06, "loss": 0.1142, "loss_nan_ranks": 0, "loss_rank_avg": 0.08349460363388062, "step": 11205 }, { "epoch": 5.877358490566038, "grad_norm": 0.7393668293952942, "learning_rate": 3.0671053919001957e-06, "loss": 0.101, "loss_nan_ranks": 0, "loss_rank_avg": 0.07989126443862915, "step": 11210 }, { "epoch": 5.879979035639413, "grad_norm": 0.6790786981582642, "learning_rate": 3.0532111580880163e-06, "loss": 0.1229, "loss_nan_ranks": 0, "loss_rank_avg": 0.10980883240699768, "step": 11215 }, { "epoch": 5.882599580712788, "grad_norm": 0.7920456528663635, "learning_rate": 3.039345865585168e-06, "loss": 0.121, "loss_nan_ranks": 0, "loss_rank_avg": 0.0809326171875, "step": 11220 }, { "epoch": 5.885220125786163, "grad_norm": 0.8068638443946838, "learning_rate": 3.025509538070457e-06, "loss": 0.1089, "loss_nan_ranks": 0, "loss_rank_avg": 0.11082607507705688, "step": 11225 }, { "epoch": 5.887840670859539, "grad_norm": 0.6608523726463318, "learning_rate": 3.011702199173221e-06, "loss": 0.1042, "loss_nan_ranks": 0, "loss_rank_avg": 0.12623870372772217, "step": 11230 }, { "epoch": 5.890461215932914, "grad_norm": 0.68641197681427, "learning_rate": 2.997923872473292e-06, "loss": 0.1031, "loss_nan_ranks": 0, "loss_rank_avg": 0.11337362229824066, "step": 11235 }, { "epoch": 5.8930817610062896, "grad_norm": 0.8305394053459167, "learning_rate": 2.9841745815009558e-06, "loss": 0.1139, "loss_nan_ranks": 0, "loss_rank_avg": 0.109619140625, "step": 11240 }, { "epoch": 5.895702306079665, "grad_norm": 0.6855704188346863, "learning_rate": 2.97045434973692e-06, "loss": 0.1058, "loss_nan_ranks": 0, "loss_rank_avg": 0.073974609375, "step": 11245 }, { "epoch": 5.89832285115304, "grad_norm": 0.7410557866096497, "learning_rate": 2.956763200612256e-06, "loss": 0.1075, "loss_nan_ranks": 0, "loss_rank_avg": 0.10438671708106995, "step": 11250 }, { "epoch": 5.900943396226415, "grad_norm": 0.8150331974029541, "learning_rate": 2.9431011575083723e-06, "loss": 0.1139, "loss_nan_ranks": 0, "loss_rank_avg": 0.0777587890625, "step": 11255 }, { "epoch": 5.90356394129979, "grad_norm": 0.6317834854125977, "learning_rate": 2.92946824375697e-06, "loss": 0.1081, "loss_nan_ranks": 0, "loss_rank_avg": 0.12193545699119568, "step": 11260 }, { "epoch": 5.906184486373165, "grad_norm": 0.7798405289649963, "learning_rate": 2.9158644826399986e-06, "loss": 0.1347, "loss_nan_ranks": 0, "loss_rank_avg": 0.11343325674533844, "step": 11265 }, { "epoch": 5.908805031446541, "grad_norm": 0.7575414180755615, "learning_rate": 2.902289897389634e-06, "loss": 0.1362, "loss_nan_ranks": 0, "loss_rank_avg": 0.14311093091964722, "step": 11270 }, { "epoch": 5.911425576519916, "grad_norm": 0.9065014123916626, "learning_rate": 2.8887445111882194e-06, "loss": 0.1096, "loss_nan_ranks": 0, "loss_rank_avg": 0.08463536202907562, "step": 11275 }, { "epoch": 5.914046121593291, "grad_norm": 0.8347724080085754, "learning_rate": 2.8752283471682284e-06, "loss": 0.1131, "loss_nan_ranks": 0, "loss_rank_avg": 0.08719807118177414, "step": 11280 }, { "epoch": 5.916666666666667, "grad_norm": 0.7125123739242554, "learning_rate": 2.861741428412237e-06, "loss": 0.1446, "loss_nan_ranks": 0, "loss_rank_avg": 0.1454043686389923, "step": 11285 }, { "epoch": 5.919287211740042, "grad_norm": 0.8080534338951111, "learning_rate": 2.848283777952865e-06, "loss": 0.1146, "loss_nan_ranks": 0, "loss_rank_avg": 0.11547332257032394, "step": 11290 }, { "epoch": 5.921907756813417, "grad_norm": 0.7206215262413025, "learning_rate": 2.8348554187727685e-06, "loss": 0.1172, "loss_nan_ranks": 0, "loss_rank_avg": 0.13346555829048157, "step": 11295 }, { "epoch": 5.9245283018867925, "grad_norm": 0.6634690165519714, "learning_rate": 2.821456373804563e-06, "loss": 0.1191, "loss_nan_ranks": 0, "loss_rank_avg": 0.09581804275512695, "step": 11300 }, { "epoch": 5.927148846960168, "grad_norm": 0.6794573664665222, "learning_rate": 2.8080866659308114e-06, "loss": 0.1303, "loss_nan_ranks": 0, "loss_rank_avg": 0.13424135744571686, "step": 11305 }, { "epoch": 5.929769392033543, "grad_norm": 0.6448708772659302, "learning_rate": 2.794746317983967e-06, "loss": 0.1451, "loss_nan_ranks": 0, "loss_rank_avg": 0.15221071243286133, "step": 11310 }, { "epoch": 5.932389937106918, "grad_norm": 0.5301897525787354, "learning_rate": 2.7814353527463488e-06, "loss": 0.1203, "loss_nan_ranks": 0, "loss_rank_avg": 0.17727480828762054, "step": 11315 }, { "epoch": 5.935010482180293, "grad_norm": 0.7591878175735474, "learning_rate": 2.7681537929501034e-06, "loss": 0.1077, "loss_nan_ranks": 0, "loss_rank_avg": 0.11045671999454498, "step": 11320 }, { "epoch": 5.937631027253669, "grad_norm": 0.7652134895324707, "learning_rate": 2.754901661277145e-06, "loss": 0.1388, "loss_nan_ranks": 0, "loss_rank_avg": 0.135330468416214, "step": 11325 }, { "epoch": 5.940251572327044, "grad_norm": 0.7571619153022766, "learning_rate": 2.7416789803591394e-06, "loss": 0.112, "loss_nan_ranks": 0, "loss_rank_avg": 0.12743711471557617, "step": 11330 }, { "epoch": 5.9428721174004195, "grad_norm": 0.8233155012130737, "learning_rate": 2.7284857727774584e-06, "loss": 0.1181, "loss_nan_ranks": 0, "loss_rank_avg": 0.08775623887777328, "step": 11335 }, { "epoch": 5.945492662473795, "grad_norm": 0.7032680511474609, "learning_rate": 2.71532206106313e-06, "loss": 0.1151, "loss_nan_ranks": 0, "loss_rank_avg": 0.15328362584114075, "step": 11340 }, { "epoch": 5.94811320754717, "grad_norm": 0.749940812587738, "learning_rate": 2.7021878676968285e-06, "loss": 0.1085, "loss_nan_ranks": 0, "loss_rank_avg": 0.118592768907547, "step": 11345 }, { "epoch": 5.950733752620545, "grad_norm": 0.8226772546768188, "learning_rate": 2.689083215108799e-06, "loss": 0.1318, "loss_nan_ranks": 0, "loss_rank_avg": 0.12129893153905869, "step": 11350 }, { "epoch": 5.95335429769392, "grad_norm": 0.6892474293708801, "learning_rate": 2.6760081256788482e-06, "loss": 0.1345, "loss_nan_ranks": 0, "loss_rank_avg": 0.12827900052070618, "step": 11355 }, { "epoch": 5.955974842767295, "grad_norm": 0.7389814257621765, "learning_rate": 2.6629626217362914e-06, "loss": 0.118, "loss_nan_ranks": 0, "loss_rank_avg": 0.1389472335577011, "step": 11360 }, { "epoch": 5.9585953878406706, "grad_norm": 0.7197535037994385, "learning_rate": 2.6499467255599153e-06, "loss": 0.09, "loss_nan_ranks": 0, "loss_rank_avg": 0.11740542948246002, "step": 11365 }, { "epoch": 5.961215932914046, "grad_norm": 0.6489915251731873, "learning_rate": 2.636960459377955e-06, "loss": 0.1172, "loss_nan_ranks": 0, "loss_rank_avg": 0.1260216236114502, "step": 11370 }, { "epoch": 5.963836477987421, "grad_norm": 0.7277437448501587, "learning_rate": 2.624003845368035e-06, "loss": 0.1164, "loss_nan_ranks": 0, "loss_rank_avg": 0.10765425115823746, "step": 11375 }, { "epoch": 5.966457023060797, "grad_norm": 0.6941485404968262, "learning_rate": 2.6110769056571394e-06, "loss": 0.1109, "loss_nan_ranks": 0, "loss_rank_avg": 0.0910947397351265, "step": 11380 }, { "epoch": 5.969077568134172, "grad_norm": 0.6762799024581909, "learning_rate": 2.598179662321576e-06, "loss": 0.1256, "loss_nan_ranks": 0, "loss_rank_avg": 0.1689760833978653, "step": 11385 }, { "epoch": 5.971698113207547, "grad_norm": 0.7307757139205933, "learning_rate": 2.585312137386946e-06, "loss": 0.1161, "loss_nan_ranks": 0, "loss_rank_avg": 0.0966796875, "step": 11390 }, { "epoch": 5.9743186582809225, "grad_norm": 0.7055436372756958, "learning_rate": 2.5724743528280828e-06, "loss": 0.1224, "loss_nan_ranks": 0, "loss_rank_avg": 0.14848220348358154, "step": 11395 }, { "epoch": 5.976939203354298, "grad_norm": 0.7922831773757935, "learning_rate": 2.5596663305690506e-06, "loss": 0.1132, "loss_nan_ranks": 0, "loss_rank_avg": 0.11176641285419464, "step": 11400 }, { "epoch": 5.979559748427673, "grad_norm": 0.6653242707252502, "learning_rate": 2.5468880924830685e-06, "loss": 0.1065, "loss_nan_ranks": 0, "loss_rank_avg": 0.09289851784706116, "step": 11405 }, { "epoch": 5.982180293501048, "grad_norm": 0.6594162583351135, "learning_rate": 2.5341396603924984e-06, "loss": 0.12, "loss_nan_ranks": 0, "loss_rank_avg": 0.12634675204753876, "step": 11410 }, { "epoch": 5.984800838574423, "grad_norm": 0.7897002696990967, "learning_rate": 2.5214210560688e-06, "loss": 0.1205, "loss_nan_ranks": 0, "loss_rank_avg": 0.1091359332203865, "step": 11415 }, { "epoch": 5.987421383647799, "grad_norm": 0.8033177852630615, "learning_rate": 2.508732301232486e-06, "loss": 0.1163, "loss_nan_ranks": 0, "loss_rank_avg": 0.12310639023780823, "step": 11420 }, { "epoch": 5.990041928721174, "grad_norm": 0.7952110171318054, "learning_rate": 2.496073417553113e-06, "loss": 0.1052, "loss_nan_ranks": 0, "loss_rank_avg": 0.09496569633483887, "step": 11425 }, { "epoch": 5.9926624737945495, "grad_norm": 0.8437222838401794, "learning_rate": 2.483444426649202e-06, "loss": 0.1054, "loss_nan_ranks": 0, "loss_rank_avg": 0.0804443359375, "step": 11430 }, { "epoch": 5.995283018867925, "grad_norm": 0.7011773586273193, "learning_rate": 2.470845350088238e-06, "loss": 0.1297, "loss_nan_ranks": 0, "loss_rank_avg": 0.14462977647781372, "step": 11435 }, { "epoch": 5.9979035639413, "grad_norm": 0.7636061906814575, "learning_rate": 2.45827620938661e-06, "loss": 0.1086, "loss_nan_ranks": 0, "loss_rank_avg": 0.0979013666510582, "step": 11440 }, { "epoch": 6.00104821802935, "grad_norm": 0.6727458238601685, "learning_rate": 2.4457370260095846e-06, "loss": 0.1008, "loss_nan_ranks": 0, "loss_rank_avg": 0.12848661839962006, "step": 11445 }, { "epoch": 6.003668763102725, "grad_norm": 0.9353975653648376, "learning_rate": 2.4332278213712824e-06, "loss": 0.1281, "loss_nan_ranks": 0, "loss_rank_avg": 0.11782145500183105, "step": 11450 }, { "epoch": 6.0062893081761, "grad_norm": 0.7893511652946472, "learning_rate": 2.420748616834607e-06, "loss": 0.1017, "loss_nan_ranks": 0, "loss_rank_avg": 0.12847980856895447, "step": 11455 }, { "epoch": 6.008909853249476, "grad_norm": 0.7531543374061584, "learning_rate": 2.4082994337112386e-06, "loss": 0.0989, "loss_nan_ranks": 0, "loss_rank_avg": 0.0888490155339241, "step": 11460 }, { "epoch": 6.011530398322851, "grad_norm": 0.6247485876083374, "learning_rate": 2.3958802932615856e-06, "loss": 0.1116, "loss_nan_ranks": 0, "loss_rank_avg": 0.12102822959423065, "step": 11465 }, { "epoch": 6.014150943396227, "grad_norm": 0.7316791415214539, "learning_rate": 2.3834912166947487e-06, "loss": 0.1007, "loss_nan_ranks": 0, "loss_rank_avg": 0.0911519005894661, "step": 11470 }, { "epoch": 6.016771488469602, "grad_norm": 0.7604213953018188, "learning_rate": 2.3711322251684956e-06, "loss": 0.0862, "loss_nan_ranks": 0, "loss_rank_avg": 0.0772705078125, "step": 11475 }, { "epoch": 6.019392033542977, "grad_norm": 0.8012372851371765, "learning_rate": 2.358803339789202e-06, "loss": 0.1082, "loss_nan_ranks": 0, "loss_rank_avg": 0.11557720601558685, "step": 11480 }, { "epoch": 6.022012578616352, "grad_norm": 0.8371411561965942, "learning_rate": 2.346504581611837e-06, "loss": 0.1016, "loss_nan_ranks": 0, "loss_rank_avg": 0.10546975582838058, "step": 11485 }, { "epoch": 6.0246331236897275, "grad_norm": 0.6822553873062134, "learning_rate": 2.3342359716399175e-06, "loss": 0.1137, "loss_nan_ranks": 0, "loss_rank_avg": 0.15785232186317444, "step": 11490 }, { "epoch": 6.027253668763103, "grad_norm": 0.673607349395752, "learning_rate": 2.3219975308254684e-06, "loss": 0.1141, "loss_nan_ranks": 0, "loss_rank_avg": 0.11714327335357666, "step": 11495 }, { "epoch": 6.029874213836478, "grad_norm": 0.7590088248252869, "learning_rate": 2.309789280069008e-06, "loss": 0.1172, "loss_nan_ranks": 0, "loss_rank_avg": 0.10126854479312897, "step": 11500 }, { "epoch": 6.032494758909853, "grad_norm": 0.73857581615448, "learning_rate": 2.297611240219484e-06, "loss": 0.1008, "loss_nan_ranks": 0, "loss_rank_avg": 0.10918985307216644, "step": 11505 }, { "epoch": 6.035115303983228, "grad_norm": 0.7437694072723389, "learning_rate": 2.2854634320742487e-06, "loss": 0.1031, "loss_nan_ranks": 0, "loss_rank_avg": 0.08391405642032623, "step": 11510 }, { "epoch": 6.037735849056604, "grad_norm": 0.7067047357559204, "learning_rate": 2.273345876379036e-06, "loss": 0.11, "loss_nan_ranks": 0, "loss_rank_avg": 0.136289581656456, "step": 11515 }, { "epoch": 6.040356394129979, "grad_norm": 0.6687672138214111, "learning_rate": 2.2612585938278996e-06, "loss": 0.1113, "loss_nan_ranks": 0, "loss_rank_avg": 0.12116891145706177, "step": 11520 }, { "epoch": 6.0429769392033545, "grad_norm": 0.6922745704650879, "learning_rate": 2.249201605063216e-06, "loss": 0.1183, "loss_nan_ranks": 0, "loss_rank_avg": 0.1263124644756317, "step": 11525 }, { "epoch": 6.04559748427673, "grad_norm": 0.6233412623405457, "learning_rate": 2.2371749306756073e-06, "loss": 0.1169, "loss_nan_ranks": 0, "loss_rank_avg": 0.14116165041923523, "step": 11530 }, { "epoch": 6.048218029350105, "grad_norm": 0.7557879686355591, "learning_rate": 2.2251785912039357e-06, "loss": 0.1279, "loss_nan_ranks": 0, "loss_rank_avg": 0.14125700294971466, "step": 11535 }, { "epoch": 6.05083857442348, "grad_norm": 0.7540751099586487, "learning_rate": 2.213212607135251e-06, "loss": 0.1182, "loss_nan_ranks": 0, "loss_rank_avg": 0.07376109063625336, "step": 11540 }, { "epoch": 6.053459119496855, "grad_norm": 0.7563124895095825, "learning_rate": 2.2012769989047665e-06, "loss": 0.1105, "loss_nan_ranks": 0, "loss_rank_avg": 0.0994349867105484, "step": 11545 }, { "epoch": 6.05607966457023, "grad_norm": 0.7129912972450256, "learning_rate": 2.1893717868958243e-06, "loss": 0.1056, "loss_nan_ranks": 0, "loss_rank_avg": 0.12413402646780014, "step": 11550 }, { "epoch": 6.058700209643606, "grad_norm": 0.7121222019195557, "learning_rate": 2.177496991439851e-06, "loss": 0.1313, "loss_nan_ranks": 0, "loss_rank_avg": 0.12695494294166565, "step": 11555 }, { "epoch": 6.061320754716981, "grad_norm": 0.7974750995635986, "learning_rate": 2.165652632816331e-06, "loss": 0.1082, "loss_nan_ranks": 0, "loss_rank_avg": 0.06833033263683319, "step": 11560 }, { "epoch": 6.063941299790357, "grad_norm": 0.676853597164154, "learning_rate": 2.1538387312527665e-06, "loss": 0.1165, "loss_nan_ranks": 0, "loss_rank_avg": 0.15408313274383545, "step": 11565 }, { "epoch": 6.066561844863732, "grad_norm": 0.6463554501533508, "learning_rate": 2.1420553069246462e-06, "loss": 0.0979, "loss_nan_ranks": 0, "loss_rank_avg": 0.08351656794548035, "step": 11570 }, { "epoch": 6.069182389937107, "grad_norm": 0.7661669850349426, "learning_rate": 2.13030237995542e-06, "loss": 0.0975, "loss_nan_ranks": 0, "loss_rank_avg": 0.08501426875591278, "step": 11575 }, { "epoch": 6.071802935010482, "grad_norm": 0.7557024955749512, "learning_rate": 2.1185799704164432e-06, "loss": 0.091, "loss_nan_ranks": 0, "loss_rank_avg": 0.0771484375, "step": 11580 }, { "epoch": 6.0744234800838575, "grad_norm": 0.7830671072006226, "learning_rate": 2.10688809832696e-06, "loss": 0.1031, "loss_nan_ranks": 0, "loss_rank_avg": 0.0934370681643486, "step": 11585 }, { "epoch": 6.077044025157233, "grad_norm": 0.6532689929008484, "learning_rate": 2.0952267836540608e-06, "loss": 0.1198, "loss_nan_ranks": 0, "loss_rank_avg": 0.1707833707332611, "step": 11590 }, { "epoch": 6.079664570230608, "grad_norm": 0.8896797895431519, "learning_rate": 2.08359604631265e-06, "loss": 0.1157, "loss_nan_ranks": 0, "loss_rank_avg": 0.10823449492454529, "step": 11595 }, { "epoch": 6.082285115303983, "grad_norm": 0.7435277700424194, "learning_rate": 2.0719959061654137e-06, "loss": 0.1184, "loss_nan_ranks": 0, "loss_rank_avg": 0.1346738636493683, "step": 11600 }, { "epoch": 6.084905660377358, "grad_norm": 0.6748736500740051, "learning_rate": 2.0604263830227957e-06, "loss": 0.1234, "loss_nan_ranks": 0, "loss_rank_avg": 0.1394091546535492, "step": 11605 }, { "epoch": 6.087526205450734, "grad_norm": 0.8019354343414307, "learning_rate": 2.0488874966429352e-06, "loss": 0.0817, "loss_nan_ranks": 0, "loss_rank_avg": 0.06707763671875, "step": 11610 }, { "epoch": 6.090146750524109, "grad_norm": 0.8450024724006653, "learning_rate": 2.0373792667316604e-06, "loss": 0.0987, "loss_nan_ranks": 0, "loss_rank_avg": 0.072906494140625, "step": 11615 }, { "epoch": 6.0927672955974845, "grad_norm": 0.6614810228347778, "learning_rate": 2.0259017129424417e-06, "loss": 0.1231, "loss_nan_ranks": 0, "loss_rank_avg": 0.12428902834653854, "step": 11620 }, { "epoch": 6.09538784067086, "grad_norm": 0.7350912094116211, "learning_rate": 2.0144548548763643e-06, "loss": 0.108, "loss_nan_ranks": 0, "loss_rank_avg": 0.10314503312110901, "step": 11625 }, { "epoch": 6.098008385744235, "grad_norm": 0.6813164353370667, "learning_rate": 2.0030387120820927e-06, "loss": 0.0986, "loss_nan_ranks": 0, "loss_rank_avg": 0.08183074742555618, "step": 11630 }, { "epoch": 6.10062893081761, "grad_norm": 0.7465524077415466, "learning_rate": 1.991653304055836e-06, "loss": 0.1131, "loss_nan_ranks": 0, "loss_rank_avg": 0.10792015492916107, "step": 11635 }, { "epoch": 6.103249475890985, "grad_norm": 0.723408579826355, "learning_rate": 1.9802986502413103e-06, "loss": 0.0986, "loss_nan_ranks": 0, "loss_rank_avg": 0.0633544921875, "step": 11640 }, { "epoch": 6.10587002096436, "grad_norm": 0.7599599361419678, "learning_rate": 1.9689747700297167e-06, "loss": 0.1017, "loss_nan_ranks": 0, "loss_rank_avg": 0.07861328125, "step": 11645 }, { "epoch": 6.1084905660377355, "grad_norm": 0.7111448645591736, "learning_rate": 1.9576816827596934e-06, "loss": 0.103, "loss_nan_ranks": 0, "loss_rank_avg": 0.15611860156059265, "step": 11650 }, { "epoch": 6.111111111111111, "grad_norm": 0.7046041488647461, "learning_rate": 1.946419407717308e-06, "loss": 0.1103, "loss_nan_ranks": 0, "loss_rank_avg": 0.12058594822883606, "step": 11655 }, { "epoch": 6.113731656184487, "grad_norm": 0.7179438471794128, "learning_rate": 1.9351879641359895e-06, "loss": 0.0852, "loss_nan_ranks": 0, "loss_rank_avg": 0.0791943222284317, "step": 11660 }, { "epoch": 6.116352201257862, "grad_norm": 0.7104761600494385, "learning_rate": 1.923987371196523e-06, "loss": 0.1172, "loss_nan_ranks": 0, "loss_rank_avg": 0.11489333212375641, "step": 11665 }, { "epoch": 6.118972746331237, "grad_norm": 0.7625128626823425, "learning_rate": 1.9128176480270057e-06, "loss": 0.1134, "loss_nan_ranks": 0, "loss_rank_avg": 0.1123046875, "step": 11670 }, { "epoch": 6.121593291404612, "grad_norm": 0.7586326599121094, "learning_rate": 1.9016788137028142e-06, "loss": 0.0924, "loss_nan_ranks": 0, "loss_rank_avg": 0.10534888505935669, "step": 11675 }, { "epoch": 6.1242138364779874, "grad_norm": 0.77974933385849, "learning_rate": 1.8905708872465788e-06, "loss": 0.1094, "loss_nan_ranks": 0, "loss_rank_avg": 0.06809628754854202, "step": 11680 }, { "epoch": 6.126834381551363, "grad_norm": 0.8171315789222717, "learning_rate": 1.8794938876281432e-06, "loss": 0.1131, "loss_nan_ranks": 0, "loss_rank_avg": 0.08233642578125, "step": 11685 }, { "epoch": 6.129454926624738, "grad_norm": 0.8048850893974304, "learning_rate": 1.868447833764535e-06, "loss": 0.1052, "loss_nan_ranks": 0, "loss_rank_avg": 0.12104252725839615, "step": 11690 }, { "epoch": 6.132075471698113, "grad_norm": 0.7302783727645874, "learning_rate": 1.8574327445199315e-06, "loss": 0.1192, "loss_nan_ranks": 0, "loss_rank_avg": 0.13073480129241943, "step": 11695 }, { "epoch": 6.134696016771488, "grad_norm": 0.7510926127433777, "learning_rate": 1.8464486387056291e-06, "loss": 0.1074, "loss_nan_ranks": 0, "loss_rank_avg": 0.11252527683973312, "step": 11700 }, { "epoch": 6.137316561844864, "grad_norm": 0.6938128471374512, "learning_rate": 1.8354955350800163e-06, "loss": 0.1319, "loss_nan_ranks": 0, "loss_rank_avg": 0.17961004376411438, "step": 11705 }, { "epoch": 6.139937106918239, "grad_norm": 0.7121496200561523, "learning_rate": 1.824573452348537e-06, "loss": 0.119, "loss_nan_ranks": 0, "loss_rank_avg": 0.11327329277992249, "step": 11710 }, { "epoch": 6.1425576519916145, "grad_norm": 0.7868003249168396, "learning_rate": 1.8136824091636552e-06, "loss": 0.1068, "loss_nan_ranks": 0, "loss_rank_avg": 0.11851361393928528, "step": 11715 }, { "epoch": 6.14517819706499, "grad_norm": 0.7592499256134033, "learning_rate": 1.8028224241248238e-06, "loss": 0.1218, "loss_nan_ranks": 0, "loss_rank_avg": 0.12001194059848785, "step": 11720 }, { "epoch": 6.147798742138365, "grad_norm": 0.7448607087135315, "learning_rate": 1.7919935157784585e-06, "loss": 0.1018, "loss_nan_ranks": 0, "loss_rank_avg": 0.14102524518966675, "step": 11725 }, { "epoch": 6.15041928721174, "grad_norm": 0.7317970395088196, "learning_rate": 1.7811957026179017e-06, "loss": 0.1073, "loss_nan_ranks": 0, "loss_rank_avg": 0.1257852464914322, "step": 11730 }, { "epoch": 6.153039832285115, "grad_norm": 0.7459965348243713, "learning_rate": 1.770429003083396e-06, "loss": 0.1186, "loss_nan_ranks": 0, "loss_rank_avg": 0.12738224864006042, "step": 11735 }, { "epoch": 6.15566037735849, "grad_norm": 0.7520008683204651, "learning_rate": 1.7596934355620465e-06, "loss": 0.1376, "loss_nan_ranks": 0, "loss_rank_avg": 0.15343382954597473, "step": 11740 }, { "epoch": 6.1582809224318655, "grad_norm": 0.7318786382675171, "learning_rate": 1.74898901838779e-06, "loss": 0.1016, "loss_nan_ranks": 0, "loss_rank_avg": 0.061004638671875, "step": 11745 }, { "epoch": 6.160901467505241, "grad_norm": 0.7465088367462158, "learning_rate": 1.7383157698413676e-06, "loss": 0.1032, "loss_nan_ranks": 0, "loss_rank_avg": 0.11167745292186737, "step": 11750 }, { "epoch": 6.163522012578617, "grad_norm": 0.6921051740646362, "learning_rate": 1.727673708150286e-06, "loss": 0.1165, "loss_nan_ranks": 0, "loss_rank_avg": 0.15981853008270264, "step": 11755 }, { "epoch": 6.166142557651992, "grad_norm": 0.7283172607421875, "learning_rate": 1.7170628514888044e-06, "loss": 0.1056, "loss_nan_ranks": 0, "loss_rank_avg": 0.052154541015625, "step": 11760 }, { "epoch": 6.168763102725367, "grad_norm": 0.7145721316337585, "learning_rate": 1.7064832179778768e-06, "loss": 0.1035, "loss_nan_ranks": 0, "loss_rank_avg": 0.10321943461894989, "step": 11765 }, { "epoch": 6.171383647798742, "grad_norm": 0.7393687963485718, "learning_rate": 1.695934825685146e-06, "loss": 0.1071, "loss_nan_ranks": 0, "loss_rank_avg": 0.10480769723653793, "step": 11770 }, { "epoch": 6.174004192872117, "grad_norm": 0.7510984539985657, "learning_rate": 1.6854176926248956e-06, "loss": 0.1122, "loss_nan_ranks": 0, "loss_rank_avg": 0.12445969879627228, "step": 11775 }, { "epoch": 6.176624737945493, "grad_norm": 0.7398217916488647, "learning_rate": 1.6749318367580203e-06, "loss": 0.1281, "loss_nan_ranks": 0, "loss_rank_avg": 0.10730582475662231, "step": 11780 }, { "epoch": 6.179245283018868, "grad_norm": 0.6224637627601624, "learning_rate": 1.6644772759920157e-06, "loss": 0.1002, "loss_nan_ranks": 0, "loss_rank_avg": 0.1414836049079895, "step": 11785 }, { "epoch": 6.181865828092243, "grad_norm": 0.8317475318908691, "learning_rate": 1.6540540281809225e-06, "loss": 0.1098, "loss_nan_ranks": 0, "loss_rank_avg": 0.0701904296875, "step": 11790 }, { "epoch": 6.184486373165618, "grad_norm": 0.7141923904418945, "learning_rate": 1.6436621111253036e-06, "loss": 0.1059, "loss_nan_ranks": 0, "loss_rank_avg": 0.1531287431716919, "step": 11795 }, { "epoch": 6.187106918238993, "grad_norm": 0.7269362807273865, "learning_rate": 1.633301542572221e-06, "loss": 0.1121, "loss_nan_ranks": 0, "loss_rank_avg": 0.12649017572402954, "step": 11800 }, { "epoch": 6.189727463312369, "grad_norm": 0.7622941136360168, "learning_rate": 1.6229723402151987e-06, "loss": 0.1087, "loss_nan_ranks": 0, "loss_rank_avg": 0.12106990069150925, "step": 11805 }, { "epoch": 6.1923480083857445, "grad_norm": 0.8292533159255981, "learning_rate": 1.6126745216941908e-06, "loss": 0.1017, "loss_nan_ranks": 0, "loss_rank_avg": 0.0821533203125, "step": 11810 }, { "epoch": 6.19496855345912, "grad_norm": 0.8477041721343994, "learning_rate": 1.6024081045955652e-06, "loss": 0.0963, "loss_nan_ranks": 0, "loss_rank_avg": 0.09291965514421463, "step": 11815 }, { "epoch": 6.197589098532495, "grad_norm": 0.6917146444320679, "learning_rate": 1.5921731064520552e-06, "loss": 0.1021, "loss_nan_ranks": 0, "loss_rank_avg": 0.13043524324893951, "step": 11820 }, { "epoch": 6.20020964360587, "grad_norm": 0.7436804175376892, "learning_rate": 1.5819695447427364e-06, "loss": 0.1305, "loss_nan_ranks": 0, "loss_rank_avg": 0.13131611049175262, "step": 11825 }, { "epoch": 6.202830188679245, "grad_norm": 0.7311981320381165, "learning_rate": 1.5717974368930033e-06, "loss": 0.1249, "loss_nan_ranks": 0, "loss_rank_avg": 0.10772877931594849, "step": 11830 }, { "epoch": 6.20545073375262, "grad_norm": 0.8186302185058594, "learning_rate": 1.5616568002745247e-06, "loss": 0.1019, "loss_nan_ranks": 0, "loss_rank_avg": 0.0828857421875, "step": 11835 }, { "epoch": 6.2080712788259955, "grad_norm": 0.8164761662483215, "learning_rate": 1.551547652205241e-06, "loss": 0.1094, "loss_nan_ranks": 0, "loss_rank_avg": 0.1388622224330902, "step": 11840 }, { "epoch": 6.210691823899371, "grad_norm": 0.7896164059638977, "learning_rate": 1.541470009949302e-06, "loss": 0.1099, "loss_nan_ranks": 0, "loss_rank_avg": 0.1046142578125, "step": 11845 }, { "epoch": 6.213312368972747, "grad_norm": 0.7347028851509094, "learning_rate": 1.5314238907170565e-06, "loss": 0.1133, "loss_nan_ranks": 0, "loss_rank_avg": 0.09812355786561966, "step": 11850 }, { "epoch": 6.215932914046122, "grad_norm": 0.7074013948440552, "learning_rate": 1.5214093116650208e-06, "loss": 0.1052, "loss_nan_ranks": 0, "loss_rank_avg": 0.07903372496366501, "step": 11855 }, { "epoch": 6.218553459119497, "grad_norm": 0.7754842638969421, "learning_rate": 1.511426289895841e-06, "loss": 0.1125, "loss_nan_ranks": 0, "loss_rank_avg": 0.12341928482055664, "step": 11860 }, { "epoch": 6.221174004192872, "grad_norm": 0.8584949970245361, "learning_rate": 1.5014748424582859e-06, "loss": 0.0933, "loss_nan_ranks": 0, "loss_rank_avg": 0.07411631941795349, "step": 11865 }, { "epoch": 6.223794549266247, "grad_norm": 0.7787871956825256, "learning_rate": 1.4915549863471901e-06, "loss": 0.1041, "loss_nan_ranks": 0, "loss_rank_avg": 0.10432527214288712, "step": 11870 }, { "epoch": 6.226415094339623, "grad_norm": 0.7362038493156433, "learning_rate": 1.4816667385034378e-06, "loss": 0.0907, "loss_nan_ranks": 0, "loss_rank_avg": 0.12271437793970108, "step": 11875 }, { "epoch": 6.229035639412998, "grad_norm": 0.7616882920265198, "learning_rate": 1.4718101158139343e-06, "loss": 0.1054, "loss_nan_ranks": 0, "loss_rank_avg": 0.08734153211116791, "step": 11880 }, { "epoch": 6.231656184486373, "grad_norm": 0.6927313804626465, "learning_rate": 1.4619851351115787e-06, "loss": 0.1097, "loss_nan_ranks": 0, "loss_rank_avg": 0.0926610603928566, "step": 11885 }, { "epoch": 6.234276729559748, "grad_norm": 0.78810054063797, "learning_rate": 1.4521918131752345e-06, "loss": 0.1093, "loss_nan_ranks": 0, "loss_rank_avg": 0.0959969088435173, "step": 11890 }, { "epoch": 6.236897274633123, "grad_norm": 0.8270073533058167, "learning_rate": 1.4424301667296936e-06, "loss": 0.1047, "loss_nan_ranks": 0, "loss_rank_avg": 0.07994962483644485, "step": 11895 }, { "epoch": 6.239517819706499, "grad_norm": 0.7329707741737366, "learning_rate": 1.4327002124456545e-06, "loss": 0.1167, "loss_nan_ranks": 0, "loss_rank_avg": 0.11037370562553406, "step": 11900 }, { "epoch": 6.2421383647798745, "grad_norm": 0.7574793696403503, "learning_rate": 1.4230019669396966e-06, "loss": 0.1232, "loss_nan_ranks": 0, "loss_rank_avg": 0.13014623522758484, "step": 11905 }, { "epoch": 6.24475890985325, "grad_norm": 0.7613834738731384, "learning_rate": 1.4133354467742422e-06, "loss": 0.1094, "loss_nan_ranks": 0, "loss_rank_avg": 0.09843027591705322, "step": 11910 }, { "epoch": 6.247379454926625, "grad_norm": 0.7687358260154724, "learning_rate": 1.4037006684575393e-06, "loss": 0.1007, "loss_nan_ranks": 0, "loss_rank_avg": 0.09907668828964233, "step": 11915 }, { "epoch": 6.25, "grad_norm": 0.7559594511985779, "learning_rate": 1.3940976484436264e-06, "loss": 0.1068, "loss_nan_ranks": 0, "loss_rank_avg": 0.10573053359985352, "step": 11920 }, { "epoch": 6.252620545073375, "grad_norm": 0.6567190289497375, "learning_rate": 1.3845264031323025e-06, "loss": 0.132, "loss_nan_ranks": 0, "loss_rank_avg": 0.1519894301891327, "step": 11925 }, { "epoch": 6.25524109014675, "grad_norm": 0.6811748743057251, "learning_rate": 1.3749869488691037e-06, "loss": 0.1237, "loss_nan_ranks": 0, "loss_rank_avg": 0.13222289085388184, "step": 11930 }, { "epoch": 6.2578616352201255, "grad_norm": 0.7532901763916016, "learning_rate": 1.3654793019452761e-06, "loss": 0.0992, "loss_nan_ranks": 0, "loss_rank_avg": 0.09068848937749863, "step": 11935 }, { "epoch": 6.260482180293501, "grad_norm": 0.7820981740951538, "learning_rate": 1.3560034785977515e-06, "loss": 0.1049, "loss_nan_ranks": 0, "loss_rank_avg": 0.09503139555454254, "step": 11940 }, { "epoch": 6.263102725366876, "grad_norm": 0.7701623439788818, "learning_rate": 1.346559495009101e-06, "loss": 0.0999, "loss_nan_ranks": 0, "loss_rank_avg": 0.11630750447511673, "step": 11945 }, { "epoch": 6.265723270440252, "grad_norm": 0.7650212049484253, "learning_rate": 1.3371473673075298e-06, "loss": 0.1123, "loss_nan_ranks": 0, "loss_rank_avg": 0.09112050384283066, "step": 11950 }, { "epoch": 6.268343815513627, "grad_norm": 0.7648396492004395, "learning_rate": 1.32776711156684e-06, "loss": 0.1104, "loss_nan_ranks": 0, "loss_rank_avg": 0.11140650510787964, "step": 11955 }, { "epoch": 6.270964360587002, "grad_norm": 0.5840007066726685, "learning_rate": 1.3184187438063956e-06, "loss": 0.1036, "loss_nan_ranks": 0, "loss_rank_avg": 0.06048639118671417, "step": 11960 }, { "epoch": 6.273584905660377, "grad_norm": 0.9658522605895996, "learning_rate": 1.3091022799911168e-06, "loss": 0.1148, "loss_nan_ranks": 0, "loss_rank_avg": 0.102294921875, "step": 11965 }, { "epoch": 6.276205450733753, "grad_norm": 0.7145773768424988, "learning_rate": 1.2998177360314279e-06, "loss": 0.1029, "loss_nan_ranks": 0, "loss_rank_avg": 0.10112839937210083, "step": 11970 }, { "epoch": 6.278825995807128, "grad_norm": 0.6984833478927612, "learning_rate": 1.2905651277832454e-06, "loss": 0.1059, "loss_nan_ranks": 0, "loss_rank_avg": 0.10083213448524475, "step": 11975 }, { "epoch": 6.281446540880503, "grad_norm": 0.7433158159255981, "learning_rate": 1.281344471047945e-06, "loss": 0.1192, "loss_nan_ranks": 0, "loss_rank_avg": 0.13840986788272858, "step": 11980 }, { "epoch": 6.284067085953878, "grad_norm": 0.6915749907493591, "learning_rate": 1.2721557815723373e-06, "loss": 0.1155, "loss_nan_ranks": 0, "loss_rank_avg": 0.11917416751384735, "step": 11985 }, { "epoch": 6.286687631027253, "grad_norm": 0.8083169460296631, "learning_rate": 1.2629990750486431e-06, "loss": 0.0931, "loss_nan_ranks": 0, "loss_rank_avg": 0.0660400390625, "step": 11990 }, { "epoch": 6.289308176100629, "grad_norm": 0.6940762400627136, "learning_rate": 1.2538743671144605e-06, "loss": 0.1013, "loss_nan_ranks": 0, "loss_rank_avg": 0.09669125825166702, "step": 11995 }, { "epoch": 6.2919287211740045, "grad_norm": 0.8445092439651489, "learning_rate": 1.2447816733527374e-06, "loss": 0.1009, "loss_nan_ranks": 0, "loss_rank_avg": 0.09276933968067169, "step": 12000 }, { "epoch": 6.29454926624738, "grad_norm": 0.7265570163726807, "learning_rate": 1.235721009291757e-06, "loss": 0.1011, "loss_nan_ranks": 0, "loss_rank_avg": 0.11874869465827942, "step": 12005 }, { "epoch": 6.297169811320755, "grad_norm": 0.7973197102546692, "learning_rate": 1.2266923904050954e-06, "loss": 0.1101, "loss_nan_ranks": 0, "loss_rank_avg": 0.12004204094409943, "step": 12010 }, { "epoch": 6.29979035639413, "grad_norm": 0.798222005367279, "learning_rate": 1.2176958321116073e-06, "loss": 0.1287, "loss_nan_ranks": 0, "loss_rank_avg": 0.1184849962592125, "step": 12015 }, { "epoch": 6.302410901467505, "grad_norm": 0.736455500125885, "learning_rate": 1.208731349775394e-06, "loss": 0.1181, "loss_nan_ranks": 0, "loss_rank_avg": 0.12855270504951477, "step": 12020 }, { "epoch": 6.30503144654088, "grad_norm": 0.7713168859481812, "learning_rate": 1.1997989587057779e-06, "loss": 0.1143, "loss_nan_ranks": 0, "loss_rank_avg": 0.09117522835731506, "step": 12025 }, { "epoch": 6.3076519916142555, "grad_norm": 0.7758849263191223, "learning_rate": 1.190898674157277e-06, "loss": 0.1067, "loss_nan_ranks": 0, "loss_rank_avg": 0.08564572036266327, "step": 12030 }, { "epoch": 6.310272536687631, "grad_norm": 0.802696704864502, "learning_rate": 1.1820305113295794e-06, "loss": 0.1209, "loss_nan_ranks": 0, "loss_rank_avg": 0.14391744136810303, "step": 12035 }, { "epoch": 6.312893081761006, "grad_norm": 0.8025701642036438, "learning_rate": 1.1731944853675103e-06, "loss": 0.0951, "loss_nan_ranks": 0, "loss_rank_avg": 0.09395985305309296, "step": 12040 }, { "epoch": 6.315513626834382, "grad_norm": 0.7337117791175842, "learning_rate": 1.164390611361026e-06, "loss": 0.108, "loss_nan_ranks": 0, "loss_rank_avg": 0.111670583486557, "step": 12045 }, { "epoch": 6.318134171907757, "grad_norm": 0.8311132192611694, "learning_rate": 1.1556189043451593e-06, "loss": 0.1209, "loss_nan_ranks": 0, "loss_rank_avg": 0.12455315887928009, "step": 12050 }, { "epoch": 6.320754716981132, "grad_norm": 0.7437119483947754, "learning_rate": 1.1468793793000189e-06, "loss": 0.1154, "loss_nan_ranks": 0, "loss_rank_avg": 0.11051569133996964, "step": 12055 }, { "epoch": 6.323375262054507, "grad_norm": 0.7538794875144958, "learning_rate": 1.138172051150752e-06, "loss": 0.0883, "loss_nan_ranks": 0, "loss_rank_avg": 0.08596548438072205, "step": 12060 }, { "epoch": 6.325995807127883, "grad_norm": 0.7586971521377563, "learning_rate": 1.1294969347675133e-06, "loss": 0.1255, "loss_nan_ranks": 0, "loss_rank_avg": 0.10219697654247284, "step": 12065 }, { "epoch": 6.328616352201258, "grad_norm": 0.8248292207717896, "learning_rate": 1.1208540449654603e-06, "loss": 0.1081, "loss_nan_ranks": 0, "loss_rank_avg": 0.10271692276000977, "step": 12070 }, { "epoch": 6.331236897274633, "grad_norm": 0.8947778940200806, "learning_rate": 1.1122433965047063e-06, "loss": 0.1082, "loss_nan_ranks": 0, "loss_rank_avg": 0.089599609375, "step": 12075 }, { "epoch": 6.333857442348008, "grad_norm": 0.8684989213943481, "learning_rate": 1.1036650040903018e-06, "loss": 0.1089, "loss_nan_ranks": 0, "loss_rank_avg": 0.10005056113004684, "step": 12080 }, { "epoch": 6.336477987421383, "grad_norm": 0.7829846143722534, "learning_rate": 1.095118882372217e-06, "loss": 0.1073, "loss_nan_ranks": 0, "loss_rank_avg": 0.089599609375, "step": 12085 }, { "epoch": 6.339098532494759, "grad_norm": 0.6981558799743652, "learning_rate": 1.086605045945306e-06, "loss": 0.1166, "loss_nan_ranks": 0, "loss_rank_avg": 0.1020331084728241, "step": 12090 }, { "epoch": 6.3417190775681345, "grad_norm": 0.6989213824272156, "learning_rate": 1.0781235093492937e-06, "loss": 0.1106, "loss_nan_ranks": 0, "loss_rank_avg": 0.0916748046875, "step": 12095 }, { "epoch": 6.34433962264151, "grad_norm": 0.7455583810806274, "learning_rate": 1.069674287068736e-06, "loss": 0.1024, "loss_nan_ranks": 0, "loss_rank_avg": 0.0858154296875, "step": 12100 }, { "epoch": 6.346960167714885, "grad_norm": 0.6839850544929504, "learning_rate": 1.0612573935330084e-06, "loss": 0.1126, "loss_nan_ranks": 0, "loss_rank_avg": 0.10747925192117691, "step": 12105 }, { "epoch": 6.34958071278826, "grad_norm": 0.9391524195671082, "learning_rate": 1.052872843116277e-06, "loss": 0.1024, "loss_nan_ranks": 0, "loss_rank_avg": 0.09704363346099854, "step": 12110 }, { "epoch": 6.352201257861635, "grad_norm": 0.7337355613708496, "learning_rate": 1.0445206501374638e-06, "loss": 0.0974, "loss_nan_ranks": 0, "loss_rank_avg": 0.09076334536075592, "step": 12115 }, { "epoch": 6.35482180293501, "grad_norm": 0.7760245203971863, "learning_rate": 1.0362008288602454e-06, "loss": 0.1143, "loss_nan_ranks": 0, "loss_rank_avg": 0.09361493587493896, "step": 12120 }, { "epoch": 6.3574423480083855, "grad_norm": 0.8386433124542236, "learning_rate": 1.0279133934930074e-06, "loss": 0.1114, "loss_nan_ranks": 0, "loss_rank_avg": 0.13330651819705963, "step": 12125 }, { "epoch": 6.360062893081761, "grad_norm": 0.7231929302215576, "learning_rate": 1.0196583581888264e-06, "loss": 0.1103, "loss_nan_ranks": 0, "loss_rank_avg": 0.10863436013460159, "step": 12130 }, { "epoch": 6.362683438155136, "grad_norm": 0.8203999996185303, "learning_rate": 1.0114357370454475e-06, "loss": 0.1133, "loss_nan_ranks": 0, "loss_rank_avg": 0.09395897388458252, "step": 12135 }, { "epoch": 6.365303983228512, "grad_norm": 0.6534098982810974, "learning_rate": 1.0032455441052602e-06, "loss": 0.1131, "loss_nan_ranks": 0, "loss_rank_avg": 0.08841674029827118, "step": 12140 }, { "epoch": 6.367924528301887, "grad_norm": 0.7433450222015381, "learning_rate": 9.950877933552804e-07, "loss": 0.0976, "loss_nan_ranks": 0, "loss_rank_avg": 0.09493722021579742, "step": 12145 }, { "epoch": 6.370545073375262, "grad_norm": 0.6835120916366577, "learning_rate": 9.869624987271108e-07, "loss": 0.0972, "loss_nan_ranks": 0, "loss_rank_avg": 0.07848665118217468, "step": 12150 }, { "epoch": 6.373165618448637, "grad_norm": 0.7295222878456116, "learning_rate": 9.788696740969295e-07, "loss": 0.1217, "loss_nan_ranks": 0, "loss_rank_avg": 0.15375928580760956, "step": 12155 }, { "epoch": 6.3757861635220126, "grad_norm": 0.6435827016830444, "learning_rate": 9.70809333285463e-07, "loss": 0.105, "loss_nan_ranks": 0, "loss_rank_avg": 0.11237955093383789, "step": 12160 }, { "epoch": 6.378406708595388, "grad_norm": 0.6830844879150391, "learning_rate": 9.627814900579624e-07, "loss": 0.1062, "loss_nan_ranks": 0, "loss_rank_avg": 0.12887975573539734, "step": 12165 }, { "epoch": 6.381027253668763, "grad_norm": 0.7148608565330505, "learning_rate": 9.547861581241834e-07, "loss": 0.1243, "loss_nan_ranks": 0, "loss_rank_avg": 0.09197840094566345, "step": 12170 }, { "epoch": 6.383647798742138, "grad_norm": 0.7562742233276367, "learning_rate": 9.468233511383573e-07, "loss": 0.0994, "loss_nan_ranks": 0, "loss_rank_avg": 0.0972900390625, "step": 12175 }, { "epoch": 6.386268343815513, "grad_norm": 0.8822031617164612, "learning_rate": 9.388930826991682e-07, "loss": 0.0895, "loss_nan_ranks": 0, "loss_rank_avg": 0.0692138671875, "step": 12180 }, { "epoch": 6.388888888888889, "grad_norm": 0.671484649181366, "learning_rate": 9.309953663497362e-07, "loss": 0.1081, "loss_nan_ranks": 0, "loss_rank_avg": 0.10735087841749191, "step": 12185 }, { "epoch": 6.3915094339622645, "grad_norm": 0.6452421545982361, "learning_rate": 9.231302155775812e-07, "loss": 0.123, "loss_nan_ranks": 0, "loss_rank_avg": 0.1407989263534546, "step": 12190 }, { "epoch": 6.39412997903564, "grad_norm": 0.8478002548217773, "learning_rate": 9.152976438146211e-07, "loss": 0.0896, "loss_nan_ranks": 0, "loss_rank_avg": 0.11116098612546921, "step": 12195 }, { "epoch": 6.396750524109015, "grad_norm": 0.8632563352584839, "learning_rate": 9.074976644371269e-07, "loss": 0.097, "loss_nan_ranks": 0, "loss_rank_avg": 0.10032831877470016, "step": 12200 }, { "epoch": 6.39937106918239, "grad_norm": 0.7530129551887512, "learning_rate": 8.997302907657124e-07, "loss": 0.1005, "loss_nan_ranks": 0, "loss_rank_avg": 0.11775366961956024, "step": 12205 }, { "epoch": 6.401991614255765, "grad_norm": 0.6896882057189941, "learning_rate": 8.919955360653066e-07, "loss": 0.0986, "loss_nan_ranks": 0, "loss_rank_avg": 0.08039015531539917, "step": 12210 }, { "epoch": 6.40461215932914, "grad_norm": 0.7121017575263977, "learning_rate": 8.842934135451297e-07, "loss": 0.1064, "loss_nan_ranks": 0, "loss_rank_avg": 0.10184863209724426, "step": 12215 }, { "epoch": 6.4072327044025155, "grad_norm": 0.7153964638710022, "learning_rate": 8.766239363586826e-07, "loss": 0.1221, "loss_nan_ranks": 0, "loss_rank_avg": 0.10507543385028839, "step": 12220 }, { "epoch": 6.409853249475891, "grad_norm": 0.6596793532371521, "learning_rate": 8.689871176037102e-07, "loss": 0.1182, "loss_nan_ranks": 0, "loss_rank_avg": 0.0855712890625, "step": 12225 }, { "epoch": 6.412473794549266, "grad_norm": 0.7590340375900269, "learning_rate": 8.613829703221799e-07, "loss": 0.1135, "loss_nan_ranks": 0, "loss_rank_avg": 0.14787140488624573, "step": 12230 }, { "epoch": 6.415094339622642, "grad_norm": 0.6925032138824463, "learning_rate": 8.538115075002707e-07, "loss": 0.0817, "loss_nan_ranks": 0, "loss_rank_avg": 0.07074858248233795, "step": 12235 }, { "epoch": 6.417714884696017, "grad_norm": 0.7162047624588013, "learning_rate": 8.46272742068337e-07, "loss": 0.1257, "loss_nan_ranks": 0, "loss_rank_avg": 0.11975658684968948, "step": 12240 }, { "epoch": 6.420335429769392, "grad_norm": 0.857296884059906, "learning_rate": 8.387666869008981e-07, "loss": 0.1075, "loss_nan_ranks": 0, "loss_rank_avg": 0.06390380859375, "step": 12245 }, { "epoch": 6.422955974842767, "grad_norm": 0.6864899396896362, "learning_rate": 8.312933548166136e-07, "loss": 0.1032, "loss_nan_ranks": 0, "loss_rank_avg": 0.089599609375, "step": 12250 }, { "epoch": 6.4255765199161425, "grad_norm": 0.6637594699859619, "learning_rate": 8.238527585782563e-07, "loss": 0.1222, "loss_nan_ranks": 0, "loss_rank_avg": 0.12327419221401215, "step": 12255 }, { "epoch": 6.428197064989518, "grad_norm": 0.7350227236747742, "learning_rate": 8.164449108926887e-07, "loss": 0.1149, "loss_nan_ranks": 0, "loss_rank_avg": 0.13680747151374817, "step": 12260 }, { "epoch": 6.430817610062893, "grad_norm": 0.6388922929763794, "learning_rate": 8.090698244108553e-07, "loss": 0.1061, "loss_nan_ranks": 0, "loss_rank_avg": 0.13326486945152283, "step": 12265 }, { "epoch": 6.433438155136268, "grad_norm": 0.7068225741386414, "learning_rate": 8.017275117277434e-07, "loss": 0.1112, "loss_nan_ranks": 0, "loss_rank_avg": 0.14121992886066437, "step": 12270 }, { "epoch": 6.436058700209643, "grad_norm": 0.7588358521461487, "learning_rate": 7.944179853823786e-07, "loss": 0.122, "loss_nan_ranks": 0, "loss_rank_avg": 0.09675169736146927, "step": 12275 }, { "epoch": 6.438679245283019, "grad_norm": 0.793441653251648, "learning_rate": 7.871412578577886e-07, "loss": 0.1011, "loss_nan_ranks": 0, "loss_rank_avg": 0.13168825209140778, "step": 12280 }, { "epoch": 6.441299790356394, "grad_norm": 0.7840567231178284, "learning_rate": 7.798973415809885e-07, "loss": 0.1221, "loss_nan_ranks": 0, "loss_rank_avg": 0.12841647863388062, "step": 12285 }, { "epoch": 6.44392033542977, "grad_norm": 0.6803513765335083, "learning_rate": 7.726862489229625e-07, "loss": 0.107, "loss_nan_ranks": 0, "loss_rank_avg": 0.11007022857666016, "step": 12290 }, { "epoch": 6.446540880503145, "grad_norm": 0.7925393581390381, "learning_rate": 7.65507992198633e-07, "loss": 0.1033, "loss_nan_ranks": 0, "loss_rank_avg": 0.10711553692817688, "step": 12295 }, { "epoch": 6.44916142557652, "grad_norm": 0.7759236693382263, "learning_rate": 7.583625836668562e-07, "loss": 0.0751, "loss_nan_ranks": 0, "loss_rank_avg": 0.05218505859375, "step": 12300 }, { "epoch": 6.451781970649895, "grad_norm": 0.7366616129875183, "learning_rate": 7.51250035530382e-07, "loss": 0.1242, "loss_nan_ranks": 0, "loss_rank_avg": 0.10968296229839325, "step": 12305 }, { "epoch": 6.45440251572327, "grad_norm": 0.7767581939697266, "learning_rate": 7.441703599358474e-07, "loss": 0.1171, "loss_nan_ranks": 0, "loss_rank_avg": 0.11889313906431198, "step": 12310 }, { "epoch": 6.4570230607966455, "grad_norm": 0.7596882581710815, "learning_rate": 7.371235689737455e-07, "loss": 0.1193, "loss_nan_ranks": 0, "loss_rank_avg": 0.11869388073682785, "step": 12315 }, { "epoch": 6.459643605870021, "grad_norm": 0.8227205872535706, "learning_rate": 7.301096746784098e-07, "loss": 0.1134, "loss_nan_ranks": 0, "loss_rank_avg": 0.10239917039871216, "step": 12320 }, { "epoch": 6.462264150943396, "grad_norm": 0.7390924096107483, "learning_rate": 7.231286890280053e-07, "loss": 0.1133, "loss_nan_ranks": 0, "loss_rank_avg": 0.13894927501678467, "step": 12325 }, { "epoch": 6.464884696016772, "grad_norm": 0.7605389356613159, "learning_rate": 7.161806239444824e-07, "loss": 0.1186, "loss_nan_ranks": 0, "loss_rank_avg": 0.1234489157795906, "step": 12330 }, { "epoch": 6.467505241090147, "grad_norm": 0.7993542551994324, "learning_rate": 7.092654912935759e-07, "loss": 0.1117, "loss_nan_ranks": 0, "loss_rank_avg": 0.07925504446029663, "step": 12335 }, { "epoch": 6.470125786163522, "grad_norm": 0.7740378379821777, "learning_rate": 7.023833028847793e-07, "loss": 0.1023, "loss_nan_ranks": 0, "loss_rank_avg": 0.12133189290761948, "step": 12340 }, { "epoch": 6.472746331236897, "grad_norm": 0.7178128957748413, "learning_rate": 6.955340704713243e-07, "loss": 0.0887, "loss_nan_ranks": 0, "loss_rank_avg": 0.07630637288093567, "step": 12345 }, { "epoch": 6.4753668763102725, "grad_norm": 0.7819163799285889, "learning_rate": 6.887178057501632e-07, "loss": 0.1234, "loss_nan_ranks": 0, "loss_rank_avg": 0.1381218135356903, "step": 12350 }, { "epoch": 6.477987421383648, "grad_norm": 0.7058833241462708, "learning_rate": 6.819345203619443e-07, "loss": 0.1192, "loss_nan_ranks": 0, "loss_rank_avg": 0.13427695631980896, "step": 12355 }, { "epoch": 6.480607966457023, "grad_norm": 0.7779912352561951, "learning_rate": 6.751842258909969e-07, "loss": 0.12, "loss_nan_ranks": 0, "loss_rank_avg": 0.103742316365242, "step": 12360 }, { "epoch": 6.483228511530398, "grad_norm": 0.7333693504333496, "learning_rate": 6.684669338653083e-07, "loss": 0.1047, "loss_nan_ranks": 0, "loss_rank_avg": 0.08863117545843124, "step": 12365 }, { "epoch": 6.485849056603773, "grad_norm": 0.7994443774223328, "learning_rate": 6.617826557564977e-07, "loss": 0.1077, "loss_nan_ranks": 0, "loss_rank_avg": 0.12241359055042267, "step": 12370 }, { "epoch": 6.488469601677149, "grad_norm": 0.7210835218429565, "learning_rate": 6.551314029798206e-07, "loss": 0.0933, "loss_nan_ranks": 0, "loss_rank_avg": 0.07509548217058182, "step": 12375 }, { "epoch": 6.491090146750524, "grad_norm": 0.7040566802024841, "learning_rate": 6.485131868941197e-07, "loss": 0.098, "loss_nan_ranks": 0, "loss_rank_avg": 0.07635965943336487, "step": 12380 }, { "epoch": 6.4937106918239, "grad_norm": 0.742750346660614, "learning_rate": 6.419280188018207e-07, "loss": 0.099, "loss_nan_ranks": 0, "loss_rank_avg": 0.0703125, "step": 12385 }, { "epoch": 6.496331236897275, "grad_norm": 0.7622641324996948, "learning_rate": 6.353759099489121e-07, "loss": 0.115, "loss_nan_ranks": 0, "loss_rank_avg": 0.11391450464725494, "step": 12390 }, { "epoch": 6.49895178197065, "grad_norm": 0.7823505997657776, "learning_rate": 6.28856871524921e-07, "loss": 0.1001, "loss_nan_ranks": 0, "loss_rank_avg": 0.08123779296875, "step": 12395 }, { "epoch": 6.501572327044025, "grad_norm": 0.7582354545593262, "learning_rate": 6.223709146629064e-07, "loss": 0.1226, "loss_nan_ranks": 0, "loss_rank_avg": 0.1327885389328003, "step": 12400 }, { "epoch": 6.5041928721174, "grad_norm": 0.756651759147644, "learning_rate": 6.159180504394236e-07, "loss": 0.1, "loss_nan_ranks": 0, "loss_rank_avg": 0.13122040033340454, "step": 12405 }, { "epoch": 6.506813417190775, "grad_norm": 0.7833355069160461, "learning_rate": 6.09498289874515e-07, "loss": 0.1101, "loss_nan_ranks": 0, "loss_rank_avg": 0.1171574741601944, "step": 12410 }, { "epoch": 6.509433962264151, "grad_norm": 0.699315071105957, "learning_rate": 6.031116439316931e-07, "loss": 0.12, "loss_nan_ranks": 0, "loss_rank_avg": 0.12373475730419159, "step": 12415 }, { "epoch": 6.512054507337526, "grad_norm": 0.7405616044998169, "learning_rate": 5.967581235179065e-07, "loss": 0.1108, "loss_nan_ranks": 0, "loss_rank_avg": 0.08771055191755295, "step": 12420 }, { "epoch": 6.514675052410902, "grad_norm": 0.6853896379470825, "learning_rate": 5.904377394835514e-07, "loss": 0.1082, "loss_nan_ranks": 0, "loss_rank_avg": 0.12733067572116852, "step": 12425 }, { "epoch": 6.517295597484277, "grad_norm": 0.5839729905128479, "learning_rate": 5.841505026224181e-07, "loss": 0.1167, "loss_nan_ranks": 0, "loss_rank_avg": 0.14495626091957092, "step": 12430 }, { "epoch": 6.519916142557652, "grad_norm": 0.7660456299781799, "learning_rate": 5.778964236716977e-07, "loss": 0.0963, "loss_nan_ranks": 0, "loss_rank_avg": 0.11170623451471329, "step": 12435 }, { "epoch": 6.522536687631027, "grad_norm": 0.7043365836143494, "learning_rate": 5.716755133119512e-07, "loss": 0.1082, "loss_nan_ranks": 0, "loss_rank_avg": 0.10523848235607147, "step": 12440 }, { "epoch": 6.5251572327044025, "grad_norm": 0.7790372967720032, "learning_rate": 5.654877821670979e-07, "loss": 0.107, "loss_nan_ranks": 0, "loss_rank_avg": 0.10113519430160522, "step": 12445 }, { "epoch": 6.527777777777778, "grad_norm": 0.7878185510635376, "learning_rate": 5.593332408043872e-07, "loss": 0.1115, "loss_nan_ranks": 0, "loss_rank_avg": 0.10966823250055313, "step": 12450 }, { "epoch": 6.530398322851153, "grad_norm": 0.6953631043434143, "learning_rate": 5.532118997344027e-07, "loss": 0.1036, "loss_nan_ranks": 0, "loss_rank_avg": 0.09158299118280411, "step": 12455 }, { "epoch": 6.533018867924528, "grad_norm": 0.7004905939102173, "learning_rate": 5.471237694110132e-07, "loss": 0.095, "loss_nan_ranks": 0, "loss_rank_avg": 0.08766491711139679, "step": 12460 }, { "epoch": 6.535639412997903, "grad_norm": 0.7077845931053162, "learning_rate": 5.410688602313797e-07, "loss": 0.1069, "loss_nan_ranks": 0, "loss_rank_avg": 0.1242421418428421, "step": 12465 }, { "epoch": 6.538259958071279, "grad_norm": 0.8396255373954773, "learning_rate": 5.350471825359305e-07, "loss": 0.1207, "loss_nan_ranks": 0, "loss_rank_avg": 0.15242958068847656, "step": 12470 }, { "epoch": 6.540880503144654, "grad_norm": 0.6612614989280701, "learning_rate": 5.290587466083308e-07, "loss": 0.114, "loss_nan_ranks": 0, "loss_rank_avg": 0.12696123123168945, "step": 12475 }, { "epoch": 6.54350104821803, "grad_norm": 0.7015223503112793, "learning_rate": 5.231035626754932e-07, "loss": 0.1, "loss_nan_ranks": 0, "loss_rank_avg": 0.14191541075706482, "step": 12480 }, { "epoch": 6.546121593291405, "grad_norm": 0.8182874917984009, "learning_rate": 5.171816409075314e-07, "loss": 0.1002, "loss_nan_ranks": 0, "loss_rank_avg": 0.09325539320707321, "step": 12485 }, { "epoch": 6.54874213836478, "grad_norm": 0.8494343161582947, "learning_rate": 5.112929914177556e-07, "loss": 0.1066, "loss_nan_ranks": 0, "loss_rank_avg": 0.075439453125, "step": 12490 }, { "epoch": 6.551362683438155, "grad_norm": 0.717740535736084, "learning_rate": 5.054376242626591e-07, "loss": 0.0841, "loss_nan_ranks": 0, "loss_rank_avg": 0.06298828125, "step": 12495 }, { "epoch": 6.55398322851153, "grad_norm": 0.8419182896614075, "learning_rate": 4.996155494418897e-07, "loss": 0.102, "loss_nan_ranks": 0, "loss_rank_avg": 0.10433823615312576, "step": 12500 }, { "epoch": 6.556603773584905, "grad_norm": 0.7867080569267273, "learning_rate": 4.938267768982496e-07, "loss": 0.1087, "loss_nan_ranks": 0, "loss_rank_avg": 0.11582699418067932, "step": 12505 }, { "epoch": 6.559224318658281, "grad_norm": 0.6588146090507507, "learning_rate": 4.880713165176598e-07, "loss": 0.1068, "loss_nan_ranks": 0, "loss_rank_avg": 0.10758035629987717, "step": 12510 }, { "epoch": 6.561844863731656, "grad_norm": 0.7257134914398193, "learning_rate": 4.823491781291534e-07, "loss": 0.1042, "loss_nan_ranks": 0, "loss_rank_avg": 0.10703811049461365, "step": 12515 }, { "epoch": 6.564465408805032, "grad_norm": 0.7567834258079529, "learning_rate": 4.766603715048557e-07, "loss": 0.1144, "loss_nan_ranks": 0, "loss_rank_avg": 0.10694126784801483, "step": 12520 }, { "epoch": 6.567085953878407, "grad_norm": 0.7547696828842163, "learning_rate": 4.710049063599753e-07, "loss": 0.1093, "loss_nan_ranks": 0, "loss_rank_avg": 0.08135071396827698, "step": 12525 }, { "epoch": 6.569706498951782, "grad_norm": 0.7312501072883606, "learning_rate": 4.6538279235277315e-07, "loss": 0.1159, "loss_nan_ranks": 0, "loss_rank_avg": 0.10318690538406372, "step": 12530 }, { "epoch": 6.572327044025157, "grad_norm": 0.7074016332626343, "learning_rate": 4.597940390845601e-07, "loss": 0.086, "loss_nan_ranks": 0, "loss_rank_avg": 0.10734502971172333, "step": 12535 }, { "epoch": 6.5749475890985325, "grad_norm": 0.7671141624450684, "learning_rate": 4.542386560996681e-07, "loss": 0.1028, "loss_nan_ranks": 0, "loss_rank_avg": 0.1200064942240715, "step": 12540 }, { "epoch": 6.577568134171908, "grad_norm": 0.7219176888465881, "learning_rate": 4.487166528854459e-07, "loss": 0.1005, "loss_nan_ranks": 0, "loss_rank_avg": 0.08383356034755707, "step": 12545 }, { "epoch": 6.580188679245283, "grad_norm": 0.878116250038147, "learning_rate": 4.432280388722343e-07, "loss": 0.1106, "loss_nan_ranks": 0, "loss_rank_avg": 0.0889734998345375, "step": 12550 }, { "epoch": 6.582809224318658, "grad_norm": 0.6791242957115173, "learning_rate": 4.377728234333534e-07, "loss": 0.1212, "loss_nan_ranks": 0, "loss_rank_avg": 0.13134799897670746, "step": 12555 }, { "epoch": 6.585429769392033, "grad_norm": 0.6606261730194092, "learning_rate": 4.3235101588508633e-07, "loss": 0.1135, "loss_nan_ranks": 0, "loss_rank_avg": 0.10282780230045319, "step": 12560 }, { "epoch": 6.588050314465409, "grad_norm": 0.9038112163543701, "learning_rate": 4.269626254866643e-07, "loss": 0.1017, "loss_nan_ranks": 0, "loss_rank_avg": 0.09375, "step": 12565 }, { "epoch": 6.590670859538784, "grad_norm": 0.7051216959953308, "learning_rate": 4.216076614402442e-07, "loss": 0.121, "loss_nan_ranks": 0, "loss_rank_avg": 0.0945984348654747, "step": 12570 }, { "epoch": 6.59329140461216, "grad_norm": 0.7136359214782715, "learning_rate": 4.162861328909018e-07, "loss": 0.1033, "loss_nan_ranks": 0, "loss_rank_avg": 0.07963068783283234, "step": 12575 }, { "epoch": 6.595911949685535, "grad_norm": 0.683529794216156, "learning_rate": 4.1099804892661855e-07, "loss": 0.1049, "loss_nan_ranks": 0, "loss_rank_avg": 0.1307920664548874, "step": 12580 }, { "epoch": 6.59853249475891, "grad_norm": 0.7927236557006836, "learning_rate": 4.0574341857824826e-07, "loss": 0.1067, "loss_nan_ranks": 0, "loss_rank_avg": 0.12521082162857056, "step": 12585 }, { "epoch": 6.601153039832285, "grad_norm": 0.7777563333511353, "learning_rate": 4.005222508195217e-07, "loss": 0.0964, "loss_nan_ranks": 0, "loss_rank_avg": 0.085205078125, "step": 12590 }, { "epoch": 6.60377358490566, "grad_norm": 0.8762742280960083, "learning_rate": 3.9533455456702173e-07, "loss": 0.1133, "loss_nan_ranks": 0, "loss_rank_avg": 0.08815234154462814, "step": 12595 }, { "epoch": 6.606394129979035, "grad_norm": 0.7172031998634338, "learning_rate": 3.9018033868016616e-07, "loss": 0.0846, "loss_nan_ranks": 0, "loss_rank_avg": 0.0655192881822586, "step": 12600 }, { "epoch": 6.609014675052411, "grad_norm": 0.6910161972045898, "learning_rate": 3.8505961196120044e-07, "loss": 0.1, "loss_nan_ranks": 0, "loss_rank_avg": 0.12381216883659363, "step": 12605 }, { "epoch": 6.611635220125786, "grad_norm": 0.7001184821128845, "learning_rate": 3.7997238315517606e-07, "loss": 0.1314, "loss_nan_ranks": 0, "loss_rank_avg": 0.1847023069858551, "step": 12610 }, { "epoch": 6.614255765199162, "grad_norm": 0.7264262437820435, "learning_rate": 3.7491866094993446e-07, "loss": 0.1191, "loss_nan_ranks": 0, "loss_rank_avg": 0.10422031581401825, "step": 12615 }, { "epoch": 6.616876310272537, "grad_norm": 0.7688695192337036, "learning_rate": 3.698984539761008e-07, "loss": 0.1028, "loss_nan_ranks": 0, "loss_rank_avg": 0.07764077186584473, "step": 12620 }, { "epoch": 6.619496855345912, "grad_norm": 0.7013809084892273, "learning_rate": 3.649117708070571e-07, "loss": 0.1065, "loss_nan_ranks": 0, "loss_rank_avg": 0.11546897888183594, "step": 12625 }, { "epoch": 6.622117400419287, "grad_norm": 0.7238118052482605, "learning_rate": 3.5995861995894444e-07, "loss": 0.1211, "loss_nan_ranks": 0, "loss_rank_avg": 0.13196802139282227, "step": 12630 }, { "epoch": 6.6247379454926625, "grad_norm": 0.7912337183952332, "learning_rate": 3.5503900989062755e-07, "loss": 0.1153, "loss_nan_ranks": 0, "loss_rank_avg": 0.12135768681764603, "step": 12635 }, { "epoch": 6.627358490566038, "grad_norm": 0.7144173979759216, "learning_rate": 3.5015294900369703e-07, "loss": 0.1241, "loss_nan_ranks": 0, "loss_rank_avg": 0.11887726187705994, "step": 12640 }, { "epoch": 6.629979035639413, "grad_norm": 0.6379136443138123, "learning_rate": 3.453004456424491e-07, "loss": 0.1067, "loss_nan_ranks": 0, "loss_rank_avg": 0.12516003847122192, "step": 12645 }, { "epoch": 6.632599580712788, "grad_norm": 0.7322754859924316, "learning_rate": 3.404815080938639e-07, "loss": 0.1227, "loss_nan_ranks": 0, "loss_rank_avg": 0.12225642800331116, "step": 12650 }, { "epoch": 6.635220125786163, "grad_norm": 0.7234863638877869, "learning_rate": 3.356961445876117e-07, "loss": 0.1203, "loss_nan_ranks": 0, "loss_rank_avg": 0.11731066554784775, "step": 12655 }, { "epoch": 6.637840670859539, "grad_norm": 0.7147253155708313, "learning_rate": 3.309443632960152e-07, "loss": 0.1064, "loss_nan_ranks": 0, "loss_rank_avg": 0.14796984195709229, "step": 12660 }, { "epoch": 6.640461215932914, "grad_norm": 0.7932282090187073, "learning_rate": 3.2622617233404985e-07, "loss": 0.1041, "loss_nan_ranks": 0, "loss_rank_avg": 0.09972994774580002, "step": 12665 }, { "epoch": 6.6430817610062896, "grad_norm": 0.828298032283783, "learning_rate": 3.21541579759328e-07, "loss": 0.1015, "loss_nan_ranks": 0, "loss_rank_avg": 0.10119494795799255, "step": 12670 }, { "epoch": 6.645702306079665, "grad_norm": 0.6335622668266296, "learning_rate": 3.1689059357207674e-07, "loss": 0.1104, "loss_nan_ranks": 0, "loss_rank_avg": 0.09908124059438705, "step": 12675 }, { "epoch": 6.64832285115304, "grad_norm": 0.7649173736572266, "learning_rate": 3.122732217151403e-07, "loss": 0.1135, "loss_nan_ranks": 0, "loss_rank_avg": 0.10169924795627594, "step": 12680 }, { "epoch": 6.650943396226415, "grad_norm": 0.7193025350570679, "learning_rate": 3.076894720739532e-07, "loss": 0.0907, "loss_nan_ranks": 0, "loss_rank_avg": 0.11542670428752899, "step": 12685 }, { "epoch": 6.65356394129979, "grad_norm": 0.7355470657348633, "learning_rate": 3.0313935247652695e-07, "loss": 0.1199, "loss_nan_ranks": 0, "loss_rank_avg": 0.11160104721784592, "step": 12690 }, { "epoch": 6.656184486373165, "grad_norm": 0.6548497080802917, "learning_rate": 2.9862287069344575e-07, "loss": 0.0909, "loss_nan_ranks": 0, "loss_rank_avg": 0.13360291719436646, "step": 12695 }, { "epoch": 6.658805031446541, "grad_norm": 0.859886646270752, "learning_rate": 2.9414003443784867e-07, "loss": 0.0886, "loss_nan_ranks": 0, "loss_rank_avg": 0.059539794921875, "step": 12700 }, { "epoch": 6.661425576519916, "grad_norm": 0.8096195459365845, "learning_rate": 2.896908513654073e-07, "loss": 0.1051, "loss_nan_ranks": 0, "loss_rank_avg": 0.12544667720794678, "step": 12705 }, { "epoch": 6.664046121593291, "grad_norm": 0.695798933506012, "learning_rate": 2.852753290743326e-07, "loss": 0.1213, "loss_nan_ranks": 0, "loss_rank_avg": 0.1460527777671814, "step": 12710 }, { "epoch": 6.666666666666667, "grad_norm": 0.7370837330818176, "learning_rate": 2.808934751053438e-07, "loss": 0.0949, "loss_nan_ranks": 0, "loss_rank_avg": 0.0875244140625, "step": 12715 }, { "epoch": 6.669287211740042, "grad_norm": 0.7537276148796082, "learning_rate": 2.7654529694166157e-07, "loss": 0.1094, "loss_nan_ranks": 0, "loss_rank_avg": 0.1236792579293251, "step": 12720 }, { "epoch": 6.671907756813417, "grad_norm": 0.7928598523139954, "learning_rate": 2.722308020089992e-07, "loss": 0.104, "loss_nan_ranks": 0, "loss_rank_avg": 0.0823974609375, "step": 12725 }, { "epoch": 6.6745283018867925, "grad_norm": 0.8015418648719788, "learning_rate": 2.6794999767554287e-07, "loss": 0.1022, "loss_nan_ranks": 0, "loss_rank_avg": 0.059906005859375, "step": 12730 }, { "epoch": 6.677148846960168, "grad_norm": 0.7586283683776855, "learning_rate": 2.637028912519468e-07, "loss": 0.1218, "loss_nan_ranks": 0, "loss_rank_avg": 0.10995981842279434, "step": 12735 }, { "epoch": 6.679769392033543, "grad_norm": 0.7164087295532227, "learning_rate": 2.5948948999131585e-07, "loss": 0.1094, "loss_nan_ranks": 0, "loss_rank_avg": 0.10501286387443542, "step": 12740 }, { "epoch": 6.682389937106918, "grad_norm": 1.1453492641448975, "learning_rate": 2.553098010891919e-07, "loss": 0.1128, "loss_nan_ranks": 0, "loss_rank_avg": 0.17530810832977295, "step": 12745 }, { "epoch": 6.685010482180293, "grad_norm": 0.7735922932624817, "learning_rate": 2.511638316835474e-07, "loss": 0.1226, "loss_nan_ranks": 0, "loss_rank_avg": 0.10268770903348923, "step": 12750 }, { "epoch": 6.687631027253669, "grad_norm": 0.8022793531417847, "learning_rate": 2.470515888547609e-07, "loss": 0.1135, "loss_nan_ranks": 0, "loss_rank_avg": 0.11690790951251984, "step": 12755 }, { "epoch": 6.690251572327044, "grad_norm": 0.672698974609375, "learning_rate": 2.429730796256236e-07, "loss": 0.1013, "loss_nan_ranks": 0, "loss_rank_avg": 0.08837890625, "step": 12760 }, { "epoch": 6.6928721174004195, "grad_norm": 0.6244108080863953, "learning_rate": 2.3892831096131494e-07, "loss": 0.118, "loss_nan_ranks": 0, "loss_rank_avg": 0.1031256839632988, "step": 12765 }, { "epoch": 6.695492662473795, "grad_norm": 0.7368753552436829, "learning_rate": 2.3491728976938742e-07, "loss": 0.111, "loss_nan_ranks": 0, "loss_rank_avg": 0.12542890012264252, "step": 12770 }, { "epoch": 6.69811320754717, "grad_norm": 0.760115921497345, "learning_rate": 2.3094002289976824e-07, "loss": 0.1168, "loss_nan_ranks": 0, "loss_rank_avg": 0.11905722320079803, "step": 12775 }, { "epoch": 6.700733752620545, "grad_norm": 0.6448925137519836, "learning_rate": 2.2699651714473302e-07, "loss": 0.1176, "loss_nan_ranks": 0, "loss_rank_avg": 0.11876273900270462, "step": 12780 }, { "epoch": 6.70335429769392, "grad_norm": 0.7480875849723816, "learning_rate": 2.2308677923890576e-07, "loss": 0.0937, "loss_nan_ranks": 0, "loss_rank_avg": 0.080322265625, "step": 12785 }, { "epoch": 6.705974842767295, "grad_norm": 0.7540961503982544, "learning_rate": 2.1921081585923875e-07, "loss": 0.1167, "loss_nan_ranks": 0, "loss_rank_avg": 0.15754400193691254, "step": 12790 }, { "epoch": 6.7085953878406706, "grad_norm": 0.7379507422447205, "learning_rate": 2.153686336250105e-07, "loss": 0.1101, "loss_nan_ranks": 0, "loss_rank_avg": 0.07952894270420074, "step": 12795 }, { "epoch": 6.711215932914046, "grad_norm": 0.7464003562927246, "learning_rate": 2.1156023909780111e-07, "loss": 0.1047, "loss_nan_ranks": 0, "loss_rank_avg": 0.12541179358959198, "step": 12800 }, { "epoch": 6.713836477987421, "grad_norm": 0.7177678346633911, "learning_rate": 2.0778563878149471e-07, "loss": 0.1075, "loss_nan_ranks": 0, "loss_rank_avg": 0.12439662963151932, "step": 12805 }, { "epoch": 6.716457023060797, "grad_norm": 0.7599697113037109, "learning_rate": 2.0404483912226158e-07, "loss": 0.1051, "loss_nan_ranks": 0, "loss_rank_avg": 0.10825493931770325, "step": 12810 }, { "epoch": 6.719077568134172, "grad_norm": 1.1192086935043335, "learning_rate": 2.0033784650854927e-07, "loss": 0.117, "loss_nan_ranks": 0, "loss_rank_avg": 0.17269468307495117, "step": 12815 }, { "epoch": 6.721698113207547, "grad_norm": 0.719208300113678, "learning_rate": 1.9666466727106481e-07, "loss": 0.1048, "loss_nan_ranks": 0, "loss_rank_avg": 0.0932646170258522, "step": 12820 }, { "epoch": 6.7243186582809225, "grad_norm": 0.7651455402374268, "learning_rate": 1.9302530768277706e-07, "loss": 0.1014, "loss_nan_ranks": 0, "loss_rank_avg": 0.12158767879009247, "step": 12825 }, { "epoch": 6.726939203354298, "grad_norm": 0.7352505326271057, "learning_rate": 1.8941977395888988e-07, "loss": 0.1102, "loss_nan_ranks": 0, "loss_rank_avg": 0.11577711254358292, "step": 12830 }, { "epoch": 6.729559748427673, "grad_norm": 0.7424346804618835, "learning_rate": 1.8584807225684898e-07, "loss": 0.0986, "loss_nan_ranks": 0, "loss_rank_avg": 0.08648681640625, "step": 12835 }, { "epoch": 6.732180293501048, "grad_norm": 0.7273895144462585, "learning_rate": 1.8231020867631955e-07, "loss": 0.1103, "loss_nan_ranks": 0, "loss_rank_avg": 0.15292765200138092, "step": 12840 }, { "epoch": 6.734800838574423, "grad_norm": 0.6363976001739502, "learning_rate": 1.7880618925917526e-07, "loss": 0.1156, "loss_nan_ranks": 0, "loss_rank_avg": 0.1068999320268631, "step": 12845 }, { "epoch": 6.737421383647799, "grad_norm": 0.6364636421203613, "learning_rate": 1.753360199894938e-07, "loss": 0.111, "loss_nan_ranks": 0, "loss_rank_avg": 0.14879164099693298, "step": 12850 }, { "epoch": 6.740041928721174, "grad_norm": 0.6741135120391846, "learning_rate": 1.7189970679354794e-07, "loss": 0.1166, "loss_nan_ranks": 0, "loss_rank_avg": 0.13488692045211792, "step": 12855 }, { "epoch": 6.7426624737945495, "grad_norm": 0.6478638648986816, "learning_rate": 1.6849725553978792e-07, "loss": 0.1091, "loss_nan_ranks": 0, "loss_rank_avg": 0.1508275717496872, "step": 12860 }, { "epoch": 6.745283018867925, "grad_norm": 0.7270845770835876, "learning_rate": 1.6512867203883453e-07, "loss": 0.0985, "loss_nan_ranks": 0, "loss_rank_avg": 0.12802273035049438, "step": 12865 }, { "epoch": 6.7479035639413, "grad_norm": 0.716550886631012, "learning_rate": 1.6179396204347497e-07, "loss": 0.1195, "loss_nan_ranks": 0, "loss_rank_avg": 0.1199185699224472, "step": 12870 }, { "epoch": 6.750524109014675, "grad_norm": 0.8040631413459778, "learning_rate": 1.5849313124864262e-07, "loss": 0.1147, "loss_nan_ranks": 0, "loss_rank_avg": 0.10303539782762527, "step": 12875 }, { "epoch": 6.75314465408805, "grad_norm": 0.6849897503852844, "learning_rate": 1.55226185291415e-07, "loss": 0.1013, "loss_nan_ranks": 0, "loss_rank_avg": 0.13410991430282593, "step": 12880 }, { "epoch": 6.755765199161425, "grad_norm": 0.7915083169937134, "learning_rate": 1.5199312975100243e-07, "loss": 0.0865, "loss_nan_ranks": 0, "loss_rank_avg": 0.09747314453125, "step": 12885 }, { "epoch": 6.7583857442348005, "grad_norm": 0.7480253577232361, "learning_rate": 1.4879397014873954e-07, "loss": 0.1164, "loss_nan_ranks": 0, "loss_rank_avg": 0.10575560480356216, "step": 12890 }, { "epoch": 6.761006289308176, "grad_norm": 0.7745283246040344, "learning_rate": 1.4562871194806926e-07, "loss": 0.0933, "loss_nan_ranks": 0, "loss_rank_avg": 0.08332160860300064, "step": 12895 }, { "epoch": 6.763626834381551, "grad_norm": 0.672770082950592, "learning_rate": 1.4249736055454545e-07, "loss": 0.1078, "loss_nan_ranks": 0, "loss_rank_avg": 0.1391640454530716, "step": 12900 }, { "epoch": 6.766247379454927, "grad_norm": 0.83051598072052, "learning_rate": 1.3939992131581038e-07, "loss": 0.1181, "loss_nan_ranks": 0, "loss_rank_avg": 0.1187271699309349, "step": 12905 }, { "epoch": 6.768867924528302, "grad_norm": 0.7149649262428284, "learning_rate": 1.3633639952159495e-07, "loss": 0.1183, "loss_nan_ranks": 0, "loss_rank_avg": 0.11559177190065384, "step": 12910 }, { "epoch": 6.771488469601677, "grad_norm": 0.8395388126373291, "learning_rate": 1.3330680040370525e-07, "loss": 0.0976, "loss_nan_ranks": 0, "loss_rank_avg": 0.0789794921875, "step": 12915 }, { "epoch": 6.774109014675052, "grad_norm": 0.8058507442474365, "learning_rate": 1.303111291360204e-07, "loss": 0.1129, "loss_nan_ranks": 0, "loss_rank_avg": 0.0765380859375, "step": 12920 }, { "epoch": 6.776729559748428, "grad_norm": 0.7542924284934998, "learning_rate": 1.2734939083447028e-07, "loss": 0.0907, "loss_nan_ranks": 0, "loss_rank_avg": 0.12192582339048386, "step": 12925 }, { "epoch": 6.779350104821803, "grad_norm": 0.6925415396690369, "learning_rate": 1.2442159055703785e-07, "loss": 0.0985, "loss_nan_ranks": 0, "loss_rank_avg": 0.11571474373340607, "step": 12930 }, { "epoch": 6.781970649895178, "grad_norm": 0.8176593780517578, "learning_rate": 1.2152773330375233e-07, "loss": 0.1079, "loss_nan_ranks": 0, "loss_rank_avg": 0.115481436252594, "step": 12935 }, { "epoch": 6.784591194968553, "grad_norm": 0.7937777042388916, "learning_rate": 1.1866782401666943e-07, "loss": 0.0997, "loss_nan_ranks": 0, "loss_rank_avg": 0.11660611629486084, "step": 12940 }, { "epoch": 6.787211740041929, "grad_norm": 0.7690672874450684, "learning_rate": 1.1584186757987336e-07, "loss": 0.1024, "loss_nan_ranks": 0, "loss_rank_avg": 0.10116563737392426, "step": 12945 }, { "epoch": 6.789832285115304, "grad_norm": 0.6998053193092346, "learning_rate": 1.1304986881946145e-07, "loss": 0.1104, "loss_nan_ranks": 0, "loss_rank_avg": 0.15394994616508484, "step": 12950 }, { "epoch": 6.7924528301886795, "grad_norm": 0.876414954662323, "learning_rate": 1.1029183250354181e-07, "loss": 0.1151, "loss_nan_ranks": 0, "loss_rank_avg": 0.09614819288253784, "step": 12955 }, { "epoch": 6.795073375262055, "grad_norm": 0.7758975625038147, "learning_rate": 1.0756776334222008e-07, "loss": 0.1035, "loss_nan_ranks": 0, "loss_rank_avg": 0.07234509289264679, "step": 12960 }, { "epoch": 6.79769392033543, "grad_norm": 0.7724853754043579, "learning_rate": 1.0487766598759496e-07, "loss": 0.1176, "loss_nan_ranks": 0, "loss_rank_avg": 0.12443333864212036, "step": 12965 }, { "epoch": 6.800314465408805, "grad_norm": 0.7516874074935913, "learning_rate": 1.0222154503374937e-07, "loss": 0.1127, "loss_nan_ranks": 0, "loss_rank_avg": 0.12476566433906555, "step": 12970 }, { "epoch": 6.80293501048218, "grad_norm": 0.6587119698524475, "learning_rate": 9.959940501674148e-08, "loss": 0.1145, "loss_nan_ranks": 0, "loss_rank_avg": 0.10120025277137756, "step": 12975 }, { "epoch": 6.805555555555555, "grad_norm": 0.8887156844139099, "learning_rate": 9.701125041459592e-08, "loss": 0.1128, "loss_nan_ranks": 0, "loss_rank_avg": 0.17159056663513184, "step": 12980 }, { "epoch": 6.8081761006289305, "grad_norm": 0.7278609871864319, "learning_rate": 9.445708564729927e-08, "loss": 0.1181, "loss_nan_ranks": 0, "loss_rank_avg": 0.0933285504579544, "step": 12985 }, { "epoch": 6.810796645702306, "grad_norm": 0.682710587978363, "learning_rate": 9.193691507679126e-08, "loss": 0.0947, "loss_nan_ranks": 0, "loss_rank_avg": 0.06728173047304153, "step": 12990 }, { "epoch": 6.813417190775681, "grad_norm": 0.7441592216491699, "learning_rate": 8.945074300696022e-08, "loss": 0.1128, "loss_nan_ranks": 0, "loss_rank_avg": 0.08619271218776703, "step": 12995 }, { "epoch": 6.816037735849057, "grad_norm": 0.794715940952301, "learning_rate": 8.699857368362985e-08, "loss": 0.1019, "loss_nan_ranks": 0, "loss_rank_avg": 0.13376472890377045, "step": 13000 }, { "epoch": 6.818658280922432, "grad_norm": 0.804305911064148, "learning_rate": 8.45804112945503e-08, "loss": 0.1031, "loss_nan_ranks": 0, "loss_rank_avg": 0.07986754924058914, "step": 13005 }, { "epoch": 6.821278825995807, "grad_norm": 0.7131937742233276, "learning_rate": 8.21962599694004e-08, "loss": 0.1236, "loss_nan_ranks": 0, "loss_rank_avg": 0.12417107820510864, "step": 13010 }, { "epoch": 6.823899371069182, "grad_norm": 0.7585114240646362, "learning_rate": 7.984612377977874e-08, "loss": 0.1005, "loss_nan_ranks": 0, "loss_rank_avg": 0.12065385282039642, "step": 13015 }, { "epoch": 6.826519916142558, "grad_norm": 0.6928541660308838, "learning_rate": 7.753000673919042e-08, "loss": 0.1095, "loss_nan_ranks": 0, "loss_rank_avg": 0.1194111630320549, "step": 13020 }, { "epoch": 6.829140461215933, "grad_norm": 0.7906724214553833, "learning_rate": 7.524791280303812e-08, "loss": 0.1093, "loss_nan_ranks": 0, "loss_rank_avg": 0.09456080198287964, "step": 13025 }, { "epoch": 6.831761006289308, "grad_norm": 0.7523605227470398, "learning_rate": 7.299984586862874e-08, "loss": 0.1006, "loss_nan_ranks": 0, "loss_rank_avg": 0.09011338651180267, "step": 13030 }, { "epoch": 6.834381551362683, "grad_norm": 0.7620218992233276, "learning_rate": 7.07858097751557e-08, "loss": 0.1162, "loss_nan_ranks": 0, "loss_rank_avg": 0.12259824573993683, "step": 13035 }, { "epoch": 6.837002096436059, "grad_norm": 0.7315376996994019, "learning_rate": 6.860580830369668e-08, "loss": 0.094, "loss_nan_ranks": 0, "loss_rank_avg": 0.080322265625, "step": 13040 }, { "epoch": 6.839622641509434, "grad_norm": 0.7305793762207031, "learning_rate": 6.64598451772025e-08, "loss": 0.0954, "loss_nan_ranks": 0, "loss_rank_avg": 0.06353759765625, "step": 13045 }, { "epoch": 6.8422431865828095, "grad_norm": 0.798496425151825, "learning_rate": 6.434792406049717e-08, "loss": 0.1067, "loss_nan_ranks": 0, "loss_rank_avg": 0.0895010381937027, "step": 13050 }, { "epoch": 6.844863731656185, "grad_norm": 2.8543035984039307, "learning_rate": 6.227004856026897e-08, "loss": 0.1237, "loss_nan_ranks": 0, "loss_rank_avg": 0.11394209414720535, "step": 13055 }, { "epoch": 6.84748427672956, "grad_norm": 0.8442890048027039, "learning_rate": 6.022622222505936e-08, "loss": 0.0876, "loss_nan_ranks": 0, "loss_rank_avg": 0.0831298828125, "step": 13060 }, { "epoch": 6.850104821802935, "grad_norm": 0.7585592865943909, "learning_rate": 5.8216448545265205e-08, "loss": 0.1171, "loss_nan_ranks": 0, "loss_rank_avg": 0.09647373855113983, "step": 13065 }, { "epoch": 6.85272536687631, "grad_norm": 0.7444117665290833, "learning_rate": 5.6240730953132096e-08, "loss": 0.1108, "loss_nan_ranks": 0, "loss_rank_avg": 0.11333517730236053, "step": 13070 }, { "epoch": 6.855345911949685, "grad_norm": 0.7710156440734863, "learning_rate": 5.429907282273883e-08, "loss": 0.1169, "loss_nan_ranks": 0, "loss_rank_avg": 0.10573013871908188, "step": 13075 }, { "epoch": 6.8579664570230605, "grad_norm": 0.7597822546958923, "learning_rate": 5.239147747000406e-08, "loss": 0.1135, "loss_nan_ranks": 0, "loss_rank_avg": 0.08099365234375, "step": 13080 }, { "epoch": 6.860587002096436, "grad_norm": 0.6881340146064758, "learning_rate": 5.051794815266853e-08, "loss": 0.0996, "loss_nan_ranks": 0, "loss_rank_avg": 0.10512430965900421, "step": 13085 }, { "epoch": 6.863207547169811, "grad_norm": 0.8049854636192322, "learning_rate": 4.867848807029951e-08, "loss": 0.0969, "loss_nan_ranks": 0, "loss_rank_avg": 0.0690917819738388, "step": 13090 }, { "epoch": 6.865828092243187, "grad_norm": 0.7548791766166687, "learning_rate": 4.687310036428638e-08, "loss": 0.0914, "loss_nan_ranks": 0, "loss_rank_avg": 0.06378173828125, "step": 13095 }, { "epoch": 6.868448637316562, "grad_norm": 0.7037566304206848, "learning_rate": 4.510178811782284e-08, "loss": 0.1019, "loss_nan_ranks": 0, "loss_rank_avg": 0.11814156919717789, "step": 13100 }, { "epoch": 6.871069182389937, "grad_norm": 0.7759885787963867, "learning_rate": 4.336455435591358e-08, "loss": 0.0909, "loss_nan_ranks": 0, "loss_rank_avg": 0.10180674493312836, "step": 13105 }, { "epoch": 6.873689727463312, "grad_norm": 0.730364978313446, "learning_rate": 4.166140204536096e-08, "loss": 0.0902, "loss_nan_ranks": 0, "loss_rank_avg": 0.064483642578125, "step": 13110 }, { "epoch": 6.876310272536688, "grad_norm": 0.6661535501480103, "learning_rate": 3.999233409476943e-08, "loss": 0.0872, "loss_nan_ranks": 0, "loss_rank_avg": 0.08388853818178177, "step": 13115 }, { "epoch": 6.878930817610063, "grad_norm": 0.787157416343689, "learning_rate": 3.835735335453228e-08, "loss": 0.1164, "loss_nan_ranks": 0, "loss_rank_avg": 0.14255258440971375, "step": 13120 }, { "epoch": 6.881551362683438, "grad_norm": 0.7189652919769287, "learning_rate": 3.6756462616827084e-08, "loss": 0.1029, "loss_nan_ranks": 0, "loss_rank_avg": 0.08161546289920807, "step": 13125 }, { "epoch": 6.884171907756813, "grad_norm": 0.8704705238342285, "learning_rate": 3.5189664615615795e-08, "loss": 0.1195, "loss_nan_ranks": 0, "loss_rank_avg": 0.12327359616756439, "step": 13130 }, { "epoch": 6.886792452830189, "grad_norm": 0.7432771921157837, "learning_rate": 3.365696202664026e-08, "loss": 0.0895, "loss_nan_ranks": 0, "loss_rank_avg": 0.11050796508789062, "step": 13135 }, { "epoch": 6.889412997903564, "grad_norm": 0.6858586668968201, "learning_rate": 3.215835746741114e-08, "loss": 0.0943, "loss_nan_ranks": 0, "loss_rank_avg": 0.10106977075338364, "step": 13140 }, { "epoch": 6.8920335429769395, "grad_norm": 0.6864309310913086, "learning_rate": 3.069385349720788e-08, "loss": 0.1089, "loss_nan_ranks": 0, "loss_rank_avg": 0.11738134920597076, "step": 13145 }, { "epoch": 6.894654088050315, "grad_norm": 0.7780885696411133, "learning_rate": 2.9263452617074306e-08, "loss": 0.1044, "loss_nan_ranks": 0, "loss_rank_avg": 0.09812843799591064, "step": 13150 }, { "epoch": 6.89727463312369, "grad_norm": 0.7669959664344788, "learning_rate": 2.7867157269814147e-08, "loss": 0.1251, "loss_nan_ranks": 0, "loss_rank_avg": 0.14291706681251526, "step": 13155 }, { "epoch": 6.899895178197065, "grad_norm": 0.7585298418998718, "learning_rate": 2.6504969839986627e-08, "loss": 0.1129, "loss_nan_ranks": 0, "loss_rank_avg": 0.08612832427024841, "step": 13160 }, { "epoch": 6.90251572327044, "grad_norm": 0.7068600654602051, "learning_rate": 2.5176892653899777e-08, "loss": 0.1045, "loss_nan_ranks": 0, "loss_rank_avg": 0.12019000947475433, "step": 13165 }, { "epoch": 6.905136268343815, "grad_norm": 0.7073253393173218, "learning_rate": 2.3882927979614888e-08, "loss": 0.1107, "loss_nan_ranks": 0, "loss_rank_avg": 0.11398753523826599, "step": 13170 }, { "epoch": 6.9077568134171905, "grad_norm": 0.7321997284889221, "learning_rate": 2.2623078026930978e-08, "loss": 0.1133, "loss_nan_ranks": 0, "loss_rank_avg": 0.12146317958831787, "step": 13175 }, { "epoch": 6.910377358490566, "grad_norm": 0.7757676243782043, "learning_rate": 2.139734494738699e-08, "loss": 0.1248, "loss_nan_ranks": 0, "loss_rank_avg": 0.11253353208303452, "step": 13180 }, { "epoch": 6.912997903563941, "grad_norm": 0.783780574798584, "learning_rate": 2.0205730834264027e-08, "loss": 0.103, "loss_nan_ranks": 0, "loss_rank_avg": 0.07848242670297623, "step": 13185 }, { "epoch": 6.915618448637317, "grad_norm": 0.7034590840339661, "learning_rate": 1.9048237722567586e-08, "loss": 0.1072, "loss_nan_ranks": 0, "loss_rank_avg": 0.12093672901391983, "step": 13190 }, { "epoch": 6.918238993710692, "grad_norm": 0.7753239870071411, "learning_rate": 1.7924867589038663e-08, "loss": 0.1061, "loss_nan_ranks": 0, "loss_rank_avg": 0.10527370125055313, "step": 13195 }, { "epoch": 6.920859538784067, "grad_norm": 0.7496992349624634, "learning_rate": 1.6835622352138203e-08, "loss": 0.1114, "loss_nan_ranks": 0, "loss_rank_avg": 0.07733316719532013, "step": 13200 }, { "epoch": 6.923480083857442, "grad_norm": 0.7619433999061584, "learning_rate": 1.5780503872055986e-08, "loss": 0.112, "loss_nan_ranks": 0, "loss_rank_avg": 0.09532599151134491, "step": 13205 }, { "epoch": 6.926100628930818, "grad_norm": 0.7757188677787781, "learning_rate": 1.475951395069286e-08, "loss": 0.1092, "loss_nan_ranks": 0, "loss_rank_avg": 0.0898994505405426, "step": 13210 }, { "epoch": 6.928721174004193, "grad_norm": 0.6868317723274231, "learning_rate": 1.3772654331674073e-08, "loss": 0.1064, "loss_nan_ranks": 0, "loss_rank_avg": 0.08451978862285614, "step": 13215 }, { "epoch": 6.931341719077568, "grad_norm": 0.7334279417991638, "learning_rate": 1.2819926700333718e-08, "loss": 0.1152, "loss_nan_ranks": 0, "loss_rank_avg": 0.10934390872716904, "step": 13220 }, { "epoch": 6.933962264150943, "grad_norm": 0.8165897727012634, "learning_rate": 1.190133268371696e-08, "loss": 0.1112, "loss_nan_ranks": 0, "loss_rank_avg": 0.12218495458364487, "step": 13225 }, { "epoch": 6.936582809224319, "grad_norm": 0.8433283567428589, "learning_rate": 1.1016873850573372e-08, "loss": 0.1083, "loss_nan_ranks": 0, "loss_rank_avg": 0.10986328125, "step": 13230 }, { "epoch": 6.939203354297694, "grad_norm": 0.7000241279602051, "learning_rate": 1.0166551711363604e-08, "loss": 0.1102, "loss_nan_ranks": 0, "loss_rank_avg": 0.0697021484375, "step": 13235 }, { "epoch": 6.9418238993710695, "grad_norm": 0.8155228495597839, "learning_rate": 9.350367718243825e-09, "loss": 0.1145, "loss_nan_ranks": 0, "loss_rank_avg": 0.11756150424480438, "step": 13240 }, { "epoch": 6.944444444444445, "grad_norm": 0.7745731472969055, "learning_rate": 8.568323265074618e-09, "loss": 0.0991, "loss_nan_ranks": 0, "loss_rank_avg": 0.11353258788585663, "step": 13245 }, { "epoch": 6.94706498951782, "grad_norm": 0.9809739589691162, "learning_rate": 7.820419687409874e-09, "loss": 0.0959, "loss_nan_ranks": 0, "loss_rank_avg": 0.09842462837696075, "step": 13250 }, { "epoch": 6.949685534591195, "grad_norm": 0.8062945604324341, "learning_rate": 7.106658262505672e-09, "loss": 0.1105, "loss_nan_ranks": 0, "loss_rank_avg": 0.09640898555517197, "step": 13255 }, { "epoch": 6.95230607966457, "grad_norm": 0.7400102615356445, "learning_rate": 6.427040209302515e-09, "loss": 0.1079, "loss_nan_ranks": 0, "loss_rank_avg": 0.12121498584747314, "step": 13260 }, { "epoch": 6.954926624737945, "grad_norm": 0.7922000885009766, "learning_rate": 5.781566688436435e-09, "loss": 0.1174, "loss_nan_ranks": 0, "loss_rank_avg": 0.12481498718261719, "step": 13265 }, { "epoch": 6.9575471698113205, "grad_norm": 0.8348727822303772, "learning_rate": 5.17023880223011e-09, "loss": 0.1, "loss_nan_ranks": 0, "loss_rank_avg": 0.0753173828125, "step": 13270 }, { "epoch": 6.960167714884696, "grad_norm": 0.6546790599822998, "learning_rate": 4.593057594697304e-09, "loss": 0.0947, "loss_nan_ranks": 0, "loss_rank_avg": 0.1255844235420227, "step": 13275 }, { "epoch": 6.962788259958071, "grad_norm": 0.7211824059486389, "learning_rate": 4.050024051531765e-09, "loss": 0.1156, "loss_nan_ranks": 0, "loss_rank_avg": 0.10128942131996155, "step": 13280 }, { "epoch": 6.965408805031447, "grad_norm": 0.7522569298744202, "learning_rate": 3.541139100111668e-09, "loss": 0.1063, "loss_nan_ranks": 0, "loss_rank_avg": 0.081298828125, "step": 13285 }, { "epoch": 6.968029350104822, "grad_norm": 0.7041332721710205, "learning_rate": 3.066403609499613e-09, "loss": 0.0966, "loss_nan_ranks": 0, "loss_rank_avg": 0.10874584317207336, "step": 13290 }, { "epoch": 6.970649895178197, "grad_norm": 0.8444633483886719, "learning_rate": 2.625818390438184e-09, "loss": 0.1052, "loss_nan_ranks": 0, "loss_rank_avg": 0.1311793029308319, "step": 13295 }, { "epoch": 6.973270440251572, "grad_norm": 0.7568145394325256, "learning_rate": 2.219384195345509e-09, "loss": 0.0964, "loss_nan_ranks": 0, "loss_rank_avg": 0.08481334149837494, "step": 13300 }, { "epoch": 6.975890985324948, "grad_norm": 0.699336051940918, "learning_rate": 1.8471017183241401e-09, "loss": 0.1204, "loss_nan_ranks": 0, "loss_rank_avg": 0.18024323880672455, "step": 13305 }, { "epoch": 6.978511530398323, "grad_norm": 0.8261719346046448, "learning_rate": 1.5089715951432937e-09, "loss": 0.1143, "loss_nan_ranks": 0, "loss_rank_avg": 0.09984341263771057, "step": 13310 }, { "epoch": 6.981132075471698, "grad_norm": 0.7103723883628845, "learning_rate": 1.2049944032566096e-09, "loss": 0.1313, "loss_nan_ranks": 0, "loss_rank_avg": 0.09648270159959793, "step": 13315 }, { "epoch": 6.983752620545073, "grad_norm": 0.807367742061615, "learning_rate": 9.351706617910516e-10, "loss": 0.1036, "loss_nan_ranks": 0, "loss_rank_avg": 0.08563232421875, "step": 13320 }, { "epoch": 6.986373165618449, "grad_norm": 0.8853840827941895, "learning_rate": 6.995008315402451e-10, "loss": 0.1018, "loss_nan_ranks": 0, "loss_rank_avg": 0.069091796875, "step": 13325 }, { "epoch": 6.988993710691824, "grad_norm": 0.7650611996650696, "learning_rate": 4.979853149755797e-10, "loss": 0.1116, "loss_nan_ranks": 0, "loss_rank_avg": 0.15118631720542908, "step": 13330 }, { "epoch": 6.9916142557651995, "grad_norm": 0.7030468583106995, "learning_rate": 3.3062445624398864e-10, "loss": 0.1057, "loss_nan_ranks": 0, "loss_rank_avg": 0.0745849609375, "step": 13335 }, { "epoch": 6.994234800838575, "grad_norm": 0.8280528783798218, "learning_rate": 1.9741854115906679e-10, "loss": 0.1018, "loss_nan_ranks": 0, "loss_rank_avg": 0.0935848206281662, "step": 13340 }, { "epoch": 6.99685534591195, "grad_norm": 0.667475700378418, "learning_rate": 9.836779720551193e-11, "loss": 0.1284, "loss_nan_ranks": 0, "loss_rank_avg": 0.11847852170467377, "step": 13345 }, { "epoch": 6.999475890985325, "grad_norm": 0.6849798560142517, "learning_rate": 3.347239353912457e-11, "loss": 0.099, "loss_nan_ranks": 0, "loss_rank_avg": 0.15633749961853027, "step": 13350 }, { "epoch": 6.999475890985325, "loss_nan_ranks": 0, "loss_rank_avg": 0.15633749961853027, "step": 13350, "total_flos": 1.29440113754112e+16, "train_loss": 0.16645406688197276, "train_runtime": 120834.4603, "train_samples_per_second": 0.111, "train_steps_per_second": 0.111 } ], "logging_steps": 5, "max_steps": 13356, "num_input_tokens_seen": 0, "num_train_epochs": 7, "save_steps": 1500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.29440113754112e+16, "train_batch_size": 1, "trial_name": null, "trial_params": null }