{ "best_global_step": 12000, "best_metric": 0.3009350597858429, "best_model_checkpoint": null, "epoch": 2.99971659791542, "eval_steps": 1000, "global_step": 23817, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00012595648203545676, "grad_norm": 107.34553527832031, "learning_rate": 0.0, "loss": 4.9992, "step": 1 }, { "epoch": 0.0006297824101772837, "grad_norm": 132.82723999023438, "learning_rate": 5.037783375314861e-07, "loss": 5.2462, "step": 5 }, { "epoch": 0.0012595648203545674, "grad_norm": 61.73713684082031, "learning_rate": 1.1335012594458437e-06, "loss": 4.9216, "step": 10 }, { "epoch": 0.0018893472305318512, "grad_norm": 29.1369571685791, "learning_rate": 1.7632241813602012e-06, "loss": 4.3103, "step": 15 }, { "epoch": 0.0025191296407091348, "grad_norm": 13.688713073730469, "learning_rate": 2.392947103274559e-06, "loss": 3.7045, "step": 20 }, { "epoch": 0.0031489120508864186, "grad_norm": 7.063476085662842, "learning_rate": 3.0226700251889166e-06, "loss": 3.255, "step": 25 }, { "epoch": 0.0037786944610637024, "grad_norm": 4.568771839141846, "learning_rate": 3.6523929471032744e-06, "loss": 2.9188, "step": 30 }, { "epoch": 0.004408476871240986, "grad_norm": 4.0639753341674805, "learning_rate": 4.282115869017632e-06, "loss": 2.6194, "step": 35 }, { "epoch": 0.0050382592814182696, "grad_norm": 3.363410234451294, "learning_rate": 4.911838790931989e-06, "loss": 2.3438, "step": 40 }, { "epoch": 0.005668041691595554, "grad_norm": 3.086527109146118, "learning_rate": 5.541561712846347e-06, "loss": 2.21, "step": 45 }, { "epoch": 0.006297824101772837, "grad_norm": 2.987258195877075, "learning_rate": 6.171284634760705e-06, "loss": 2.063, "step": 50 }, { "epoch": 0.006927606511950121, "grad_norm": 2.956822156906128, "learning_rate": 6.801007556675062e-06, "loss": 1.9592, "step": 55 }, { "epoch": 0.007557388922127405, "grad_norm": 2.7860183715820312, "learning_rate": 7.43073047858942e-06, "loss": 1.8058, "step": 60 }, { "epoch": 0.008187171332304689, "grad_norm": 2.930774211883545, "learning_rate": 8.060453400503778e-06, "loss": 1.7843, "step": 65 }, { "epoch": 0.008816953742481972, "grad_norm": 2.797520637512207, "learning_rate": 8.690176322418136e-06, "loss": 1.6922, "step": 70 }, { "epoch": 0.009446736152659256, "grad_norm": 2.945425271987915, "learning_rate": 9.319899244332492e-06, "loss": 1.6863, "step": 75 }, { "epoch": 0.010076518562836539, "grad_norm": 3.775211811065674, "learning_rate": 9.94962216624685e-06, "loss": 1.6918, "step": 80 }, { "epoch": 0.010706300973013824, "grad_norm": 4.795872688293457, "learning_rate": 1.0579345088161209e-05, "loss": 1.5709, "step": 85 }, { "epoch": 0.011336083383191108, "grad_norm": 6.182225227355957, "learning_rate": 1.1209068010075565e-05, "loss": 1.582, "step": 90 }, { "epoch": 0.011965865793368391, "grad_norm": 2.548555612564087, "learning_rate": 1.1838790931989923e-05, "loss": 1.4157, "step": 95 }, { "epoch": 0.012595648203545674, "grad_norm": 2.5125293731689453, "learning_rate": 1.246851385390428e-05, "loss": 1.4848, "step": 100 }, { "epoch": 0.01322543061372296, "grad_norm": 2.3581464290618896, "learning_rate": 1.309823677581864e-05, "loss": 1.4169, "step": 105 }, { "epoch": 0.013855213023900243, "grad_norm": 2.4205543994903564, "learning_rate": 1.3727959697732996e-05, "loss": 1.3803, "step": 110 }, { "epoch": 0.014484995434077526, "grad_norm": 2.251526355743408, "learning_rate": 1.4357682619647355e-05, "loss": 1.3928, "step": 115 }, { "epoch": 0.01511477784425481, "grad_norm": 2.3966078758239746, "learning_rate": 1.4987405541561711e-05, "loss": 1.3994, "step": 120 }, { "epoch": 0.015744560254432095, "grad_norm": 2.2904422283172607, "learning_rate": 1.561712846347607e-05, "loss": 1.3884, "step": 125 }, { "epoch": 0.016374342664609378, "grad_norm": 2.238481283187866, "learning_rate": 1.6246851385390428e-05, "loss": 1.3419, "step": 130 }, { "epoch": 0.01700412507478666, "grad_norm": 2.3208041191101074, "learning_rate": 1.6876574307304786e-05, "loss": 1.3282, "step": 135 }, { "epoch": 0.017633907484963945, "grad_norm": 2.34198260307312, "learning_rate": 1.750629722921914e-05, "loss": 1.3398, "step": 140 }, { "epoch": 0.018263689895141228, "grad_norm": 2.316502332687378, "learning_rate": 1.81360201511335e-05, "loss": 1.3171, "step": 145 }, { "epoch": 0.01889347230531851, "grad_norm": 2.1474759578704834, "learning_rate": 1.8765743073047857e-05, "loss": 1.2896, "step": 150 }, { "epoch": 0.019523254715495795, "grad_norm": 2.2437920570373535, "learning_rate": 1.9395465994962215e-05, "loss": 1.3504, "step": 155 }, { "epoch": 0.020153037125673078, "grad_norm": 2.3530609607696533, "learning_rate": 2.002518891687657e-05, "loss": 1.2937, "step": 160 }, { "epoch": 0.020782819535850365, "grad_norm": 3.199941635131836, "learning_rate": 2.065491183879093e-05, "loss": 1.2841, "step": 165 }, { "epoch": 0.02141260194602765, "grad_norm": 2.3354921340942383, "learning_rate": 2.128463476070529e-05, "loss": 1.2573, "step": 170 }, { "epoch": 0.022042384356204932, "grad_norm": 1.9546146392822266, "learning_rate": 2.1914357682619645e-05, "loss": 1.2179, "step": 175 }, { "epoch": 0.022672166766382215, "grad_norm": 2.329948902130127, "learning_rate": 2.2544080604534003e-05, "loss": 1.2417, "step": 180 }, { "epoch": 0.0233019491765595, "grad_norm": 2.110067367553711, "learning_rate": 2.3173803526448358e-05, "loss": 1.289, "step": 185 }, { "epoch": 0.023931731586736782, "grad_norm": 1.8875372409820557, "learning_rate": 2.380352644836272e-05, "loss": 1.2183, "step": 190 }, { "epoch": 0.024561513996914065, "grad_norm": 2.3789267539978027, "learning_rate": 2.4433249370277077e-05, "loss": 1.3136, "step": 195 }, { "epoch": 0.02519129640709135, "grad_norm": 2.4072821140289307, "learning_rate": 2.5062972292191432e-05, "loss": 1.2241, "step": 200 }, { "epoch": 0.025821078817268632, "grad_norm": 1.936230182647705, "learning_rate": 2.569269521410579e-05, "loss": 1.2715, "step": 205 }, { "epoch": 0.02645086122744592, "grad_norm": 1.8356448411941528, "learning_rate": 2.632241813602015e-05, "loss": 1.1728, "step": 210 }, { "epoch": 0.027080643637623202, "grad_norm": 1.7642450332641602, "learning_rate": 2.6952141057934507e-05, "loss": 1.2022, "step": 215 }, { "epoch": 0.027710426047800486, "grad_norm": 1.8820688724517822, "learning_rate": 2.7581863979848865e-05, "loss": 1.1563, "step": 220 }, { "epoch": 0.02834020845797777, "grad_norm": 1.8875645399093628, "learning_rate": 2.821158690176322e-05, "loss": 1.1473, "step": 225 }, { "epoch": 0.028969990868155052, "grad_norm": 1.950432538986206, "learning_rate": 2.884130982367758e-05, "loss": 1.1478, "step": 230 }, { "epoch": 0.029599773278332336, "grad_norm": 1.8610332012176514, "learning_rate": 2.9471032745591936e-05, "loss": 1.2026, "step": 235 }, { "epoch": 0.03022955568850962, "grad_norm": 1.865122675895691, "learning_rate": 3.0100755667506295e-05, "loss": 1.1748, "step": 240 }, { "epoch": 0.030859338098686902, "grad_norm": 1.7126635313034058, "learning_rate": 3.0730478589420656e-05, "loss": 1.0982, "step": 245 }, { "epoch": 0.03148912050886419, "grad_norm": 1.809869408607483, "learning_rate": 3.136020151133501e-05, "loss": 1.1032, "step": 250 }, { "epoch": 0.03211890291904147, "grad_norm": 1.9480305910110474, "learning_rate": 3.1989924433249366e-05, "loss": 1.1195, "step": 255 }, { "epoch": 0.032748685329218756, "grad_norm": 1.762191653251648, "learning_rate": 3.2619647355163724e-05, "loss": 1.1413, "step": 260 }, { "epoch": 0.03337846773939604, "grad_norm": 1.5951004028320312, "learning_rate": 3.324937027707808e-05, "loss": 1.1514, "step": 265 }, { "epoch": 0.03400825014957332, "grad_norm": 1.9166375398635864, "learning_rate": 3.387909319899244e-05, "loss": 1.1129, "step": 270 }, { "epoch": 0.034638032559750606, "grad_norm": 1.8051133155822754, "learning_rate": 3.45088161209068e-05, "loss": 1.1827, "step": 275 }, { "epoch": 0.03526781496992789, "grad_norm": 1.8178284168243408, "learning_rate": 3.513853904282116e-05, "loss": 1.1567, "step": 280 }, { "epoch": 0.03589759738010517, "grad_norm": 1.5286771059036255, "learning_rate": 3.5768261964735515e-05, "loss": 1.011, "step": 285 }, { "epoch": 0.036527379790282456, "grad_norm": 1.6900701522827148, "learning_rate": 3.639798488664987e-05, "loss": 1.059, "step": 290 }, { "epoch": 0.03715716220045974, "grad_norm": 1.5855501890182495, "learning_rate": 3.702770780856423e-05, "loss": 1.0943, "step": 295 }, { "epoch": 0.03778694461063702, "grad_norm": 1.650087833404541, "learning_rate": 3.765743073047858e-05, "loss": 1.0843, "step": 300 }, { "epoch": 0.038416727020814306, "grad_norm": 1.772127389907837, "learning_rate": 3.828715365239294e-05, "loss": 1.1161, "step": 305 }, { "epoch": 0.03904650943099159, "grad_norm": 1.8052891492843628, "learning_rate": 3.8916876574307306e-05, "loss": 1.1493, "step": 310 }, { "epoch": 0.03967629184116887, "grad_norm": 1.816603183746338, "learning_rate": 3.954659949622166e-05, "loss": 1.1113, "step": 315 }, { "epoch": 0.040306074251346156, "grad_norm": 1.7114448547363281, "learning_rate": 4.0176322418136016e-05, "loss": 1.1124, "step": 320 }, { "epoch": 0.04093585666152345, "grad_norm": 1.693428874015808, "learning_rate": 4.080604534005038e-05, "loss": 1.0994, "step": 325 }, { "epoch": 0.04156563907170073, "grad_norm": 1.6820402145385742, "learning_rate": 4.143576826196473e-05, "loss": 1.0989, "step": 330 }, { "epoch": 0.042195421481878014, "grad_norm": 1.5841305255889893, "learning_rate": 4.206549118387909e-05, "loss": 1.123, "step": 335 }, { "epoch": 0.0428252038920553, "grad_norm": 1.714936375617981, "learning_rate": 4.269521410579344e-05, "loss": 1.0054, "step": 340 }, { "epoch": 0.04345498630223258, "grad_norm": 1.6568523645401, "learning_rate": 4.332493702770781e-05, "loss": 1.0397, "step": 345 }, { "epoch": 0.044084768712409864, "grad_norm": 1.9612834453582764, "learning_rate": 4.3954659949622165e-05, "loss": 1.0481, "step": 350 }, { "epoch": 0.04471455112258715, "grad_norm": 1.5859555006027222, "learning_rate": 4.4584382871536516e-05, "loss": 1.0754, "step": 355 }, { "epoch": 0.04534433353276443, "grad_norm": 1.6280044317245483, "learning_rate": 4.521410579345088e-05, "loss": 1.0696, "step": 360 }, { "epoch": 0.045974115942941714, "grad_norm": 1.5000571012496948, "learning_rate": 4.584382871536523e-05, "loss": 1.0508, "step": 365 }, { "epoch": 0.046603898353119, "grad_norm": 1.4447442293167114, "learning_rate": 4.647355163727959e-05, "loss": 1.0924, "step": 370 }, { "epoch": 0.04723368076329628, "grad_norm": 1.6207387447357178, "learning_rate": 4.7103274559193956e-05, "loss": 1.0477, "step": 375 }, { "epoch": 0.047863463173473564, "grad_norm": 1.5458993911743164, "learning_rate": 4.773299748110831e-05, "loss": 1.0905, "step": 380 }, { "epoch": 0.04849324558365085, "grad_norm": 1.5125916004180908, "learning_rate": 4.8362720403022666e-05, "loss": 1.0858, "step": 385 }, { "epoch": 0.04912302799382813, "grad_norm": 1.5438019037246704, "learning_rate": 4.899244332493702e-05, "loss": 1.1133, "step": 390 }, { "epoch": 0.049752810404005414, "grad_norm": 1.5938135385513306, "learning_rate": 4.962216624685138e-05, "loss": 1.0814, "step": 395 }, { "epoch": 0.0503825928141827, "grad_norm": 1.4425631761550903, "learning_rate": 5.025188916876574e-05, "loss": 1.0668, "step": 400 }, { "epoch": 0.05101237522435998, "grad_norm": 1.505650520324707, "learning_rate": 5.088161209068009e-05, "loss": 1.1201, "step": 405 }, { "epoch": 0.051642157634537264, "grad_norm": 1.421877145767212, "learning_rate": 5.151133501259446e-05, "loss": 1.0199, "step": 410 }, { "epoch": 0.052271940044714554, "grad_norm": 1.6243743896484375, "learning_rate": 5.2141057934508815e-05, "loss": 1.0791, "step": 415 }, { "epoch": 0.05290172245489184, "grad_norm": 1.5055480003356934, "learning_rate": 5.2770780856423166e-05, "loss": 1.0553, "step": 420 }, { "epoch": 0.05353150486506912, "grad_norm": 1.5624281167984009, "learning_rate": 5.340050377833753e-05, "loss": 1.0778, "step": 425 }, { "epoch": 0.054161287275246404, "grad_norm": 1.4252930879592896, "learning_rate": 5.403022670025188e-05, "loss": 1.0354, "step": 430 }, { "epoch": 0.05479106968542369, "grad_norm": 1.4582849740982056, "learning_rate": 5.465994962216624e-05, "loss": 0.9825, "step": 435 }, { "epoch": 0.05542085209560097, "grad_norm": 1.4336227178573608, "learning_rate": 5.52896725440806e-05, "loss": 1.0753, "step": 440 }, { "epoch": 0.056050634505778255, "grad_norm": 1.4182472229003906, "learning_rate": 5.591939546599496e-05, "loss": 1.0005, "step": 445 }, { "epoch": 0.05668041691595554, "grad_norm": 1.4060423374176025, "learning_rate": 5.6549118387909316e-05, "loss": 1.0311, "step": 450 }, { "epoch": 0.05731019932613282, "grad_norm": 1.5005807876586914, "learning_rate": 5.717884130982367e-05, "loss": 1.0815, "step": 455 }, { "epoch": 0.057939981736310105, "grad_norm": 1.3563963174819946, "learning_rate": 5.780856423173803e-05, "loss": 1.0099, "step": 460 }, { "epoch": 0.05856976414648739, "grad_norm": 1.4664981365203857, "learning_rate": 5.843828715365239e-05, "loss": 1.0221, "step": 465 }, { "epoch": 0.05919954655666467, "grad_norm": 1.3635119199752808, "learning_rate": 5.906801007556674e-05, "loss": 1.0864, "step": 470 }, { "epoch": 0.059829328966841955, "grad_norm": 1.3736735582351685, "learning_rate": 5.9697732997481107e-05, "loss": 1.0949, "step": 475 }, { "epoch": 0.06045911137701924, "grad_norm": 1.3973110914230347, "learning_rate": 6.0327455919395465e-05, "loss": 1.0936, "step": 480 }, { "epoch": 0.06108889378719652, "grad_norm": 1.4158260822296143, "learning_rate": 6.0957178841309816e-05, "loss": 1.0389, "step": 485 }, { "epoch": 0.061718676197373805, "grad_norm": 1.3810491561889648, "learning_rate": 6.158690176322417e-05, "loss": 1.0027, "step": 490 }, { "epoch": 0.06234845860755109, "grad_norm": 1.3191838264465332, "learning_rate": 6.221662468513854e-05, "loss": 1.0626, "step": 495 }, { "epoch": 0.06297824101772838, "grad_norm": 1.2959917783737183, "learning_rate": 6.28463476070529e-05, "loss": 1.064, "step": 500 }, { "epoch": 0.06360802342790566, "grad_norm": 1.321399450302124, "learning_rate": 6.347607052896724e-05, "loss": 1.0407, "step": 505 }, { "epoch": 0.06423780583808295, "grad_norm": 1.464593768119812, "learning_rate": 6.410579345088161e-05, "loss": 1.0961, "step": 510 }, { "epoch": 0.06486758824826022, "grad_norm": 1.1570991277694702, "learning_rate": 6.473551637279596e-05, "loss": 1.0373, "step": 515 }, { "epoch": 0.06549737065843751, "grad_norm": 1.2346997261047363, "learning_rate": 6.536523929471032e-05, "loss": 1.0016, "step": 520 }, { "epoch": 0.06612715306861479, "grad_norm": 1.2645131349563599, "learning_rate": 6.599496221662469e-05, "loss": 1.0175, "step": 525 }, { "epoch": 0.06675693547879208, "grad_norm": 1.7016206979751587, "learning_rate": 6.662468513853903e-05, "loss": 1.0517, "step": 530 }, { "epoch": 0.06738671788896936, "grad_norm": 1.2666237354278564, "learning_rate": 6.725440806045339e-05, "loss": 0.9876, "step": 535 }, { "epoch": 0.06801650029914665, "grad_norm": 1.365417242050171, "learning_rate": 6.788413098236775e-05, "loss": 1.0262, "step": 540 }, { "epoch": 0.06864628270932392, "grad_norm": 1.1083537340164185, "learning_rate": 6.851385390428211e-05, "loss": 0.979, "step": 545 }, { "epoch": 0.06927606511950121, "grad_norm": 1.2279447317123413, "learning_rate": 6.914357682619647e-05, "loss": 1.0566, "step": 550 }, { "epoch": 0.0699058475296785, "grad_norm": 1.2323870658874512, "learning_rate": 6.977329974811082e-05, "loss": 1.0047, "step": 555 }, { "epoch": 0.07053562993985578, "grad_norm": 1.2166273593902588, "learning_rate": 7.040302267002518e-05, "loss": 0.989, "step": 560 }, { "epoch": 0.07116541235003307, "grad_norm": 1.3858731985092163, "learning_rate": 7.103274559193954e-05, "loss": 1.0389, "step": 565 }, { "epoch": 0.07179519476021035, "grad_norm": 1.1922657489776611, "learning_rate": 7.16624685138539e-05, "loss": 1.0214, "step": 570 }, { "epoch": 0.07242497717038764, "grad_norm": 1.2922885417938232, "learning_rate": 7.229219143576826e-05, "loss": 1.0647, "step": 575 }, { "epoch": 0.07305475958056491, "grad_norm": 1.174737811088562, "learning_rate": 7.292191435768262e-05, "loss": 1.039, "step": 580 }, { "epoch": 0.0736845419907422, "grad_norm": 1.4469586610794067, "learning_rate": 7.355163727959697e-05, "loss": 0.9553, "step": 585 }, { "epoch": 0.07431432440091948, "grad_norm": 1.2497671842575073, "learning_rate": 7.418136020151133e-05, "loss": 1.0676, "step": 590 }, { "epoch": 0.07494410681109677, "grad_norm": 1.0431822538375854, "learning_rate": 7.481108312342569e-05, "loss": 0.9664, "step": 595 }, { "epoch": 0.07557388922127405, "grad_norm": 1.1643774509429932, "learning_rate": 7.544080604534005e-05, "loss": 1.0354, "step": 600 }, { "epoch": 0.07620367163145134, "grad_norm": 1.277813196182251, "learning_rate": 7.60705289672544e-05, "loss": 1.0078, "step": 605 }, { "epoch": 0.07683345404162861, "grad_norm": 1.17416250705719, "learning_rate": 7.670025188916876e-05, "loss": 0.9802, "step": 610 }, { "epoch": 0.0774632364518059, "grad_norm": 1.3622347116470337, "learning_rate": 7.732997481108312e-05, "loss": 0.9323, "step": 615 }, { "epoch": 0.07809301886198318, "grad_norm": 1.1814721822738647, "learning_rate": 7.795969773299747e-05, "loss": 0.989, "step": 620 }, { "epoch": 0.07872280127216047, "grad_norm": 1.1661953926086426, "learning_rate": 7.858942065491183e-05, "loss": 0.9873, "step": 625 }, { "epoch": 0.07935258368233775, "grad_norm": 1.1950370073318481, "learning_rate": 7.921914357682618e-05, "loss": 0.9997, "step": 630 }, { "epoch": 0.07998236609251504, "grad_norm": 1.2635341882705688, "learning_rate": 7.984886649874056e-05, "loss": 0.9939, "step": 635 }, { "epoch": 0.08061214850269231, "grad_norm": 1.094709038734436, "learning_rate": 8.047858942065491e-05, "loss": 1.035, "step": 640 }, { "epoch": 0.0812419309128696, "grad_norm": 1.0304499864578247, "learning_rate": 8.110831234256926e-05, "loss": 1.0385, "step": 645 }, { "epoch": 0.0818717133230469, "grad_norm": 1.203204870223999, "learning_rate": 8.173803526448362e-05, "loss": 1.0159, "step": 650 }, { "epoch": 0.08250149573322417, "grad_norm": 1.317717432975769, "learning_rate": 8.236775818639797e-05, "loss": 0.9972, "step": 655 }, { "epoch": 0.08313127814340146, "grad_norm": 1.1352335214614868, "learning_rate": 8.299748110831233e-05, "loss": 1.0027, "step": 660 }, { "epoch": 0.08376106055357874, "grad_norm": 1.1126285791397095, "learning_rate": 8.362720403022669e-05, "loss": 0.9824, "step": 665 }, { "epoch": 0.08439084296375603, "grad_norm": 1.1858975887298584, "learning_rate": 8.425692695214106e-05, "loss": 1.0084, "step": 670 }, { "epoch": 0.0850206253739333, "grad_norm": 1.1154075860977173, "learning_rate": 8.488664987405541e-05, "loss": 0.968, "step": 675 }, { "epoch": 0.0856504077841106, "grad_norm": 1.1235395669937134, "learning_rate": 8.551637279596977e-05, "loss": 0.9828, "step": 680 }, { "epoch": 0.08628019019428787, "grad_norm": 1.251595139503479, "learning_rate": 8.614609571788412e-05, "loss": 0.9945, "step": 685 }, { "epoch": 0.08690997260446516, "grad_norm": 1.2071354389190674, "learning_rate": 8.677581863979848e-05, "loss": 1.0263, "step": 690 }, { "epoch": 0.08753975501464244, "grad_norm": 1.1954138278961182, "learning_rate": 8.740554156171283e-05, "loss": 0.9987, "step": 695 }, { "epoch": 0.08816953742481973, "grad_norm": 1.088759422302246, "learning_rate": 8.803526448362719e-05, "loss": 1.0323, "step": 700 }, { "epoch": 0.088799319834997, "grad_norm": 1.0866519212722778, "learning_rate": 8.866498740554156e-05, "loss": 1.0009, "step": 705 }, { "epoch": 0.0894291022451743, "grad_norm": 1.1450897455215454, "learning_rate": 8.929471032745592e-05, "loss": 1.0493, "step": 710 }, { "epoch": 0.09005888465535157, "grad_norm": 1.0619937181472778, "learning_rate": 8.992443324937027e-05, "loss": 1.0007, "step": 715 }, { "epoch": 0.09068866706552886, "grad_norm": 1.1548367738723755, "learning_rate": 9.055415617128463e-05, "loss": 1.006, "step": 720 }, { "epoch": 0.09131844947570614, "grad_norm": 1.0847381353378296, "learning_rate": 9.118387909319898e-05, "loss": 0.9703, "step": 725 }, { "epoch": 0.09194823188588343, "grad_norm": 1.1227048635482788, "learning_rate": 9.181360201511333e-05, "loss": 0.957, "step": 730 }, { "epoch": 0.09257801429606072, "grad_norm": 1.0912779569625854, "learning_rate": 9.24433249370277e-05, "loss": 0.9866, "step": 735 }, { "epoch": 0.093207796706238, "grad_norm": 1.0490583181381226, "learning_rate": 9.307304785894206e-05, "loss": 1.0605, "step": 740 }, { "epoch": 0.09383757911641528, "grad_norm": 1.0930982828140259, "learning_rate": 9.370277078085642e-05, "loss": 1.0061, "step": 745 }, { "epoch": 0.09446736152659256, "grad_norm": 1.1814302206039429, "learning_rate": 9.433249370277077e-05, "loss": 1.0187, "step": 750 }, { "epoch": 0.09509714393676985, "grad_norm": 1.113162636756897, "learning_rate": 9.496221662468513e-05, "loss": 0.9929, "step": 755 }, { "epoch": 0.09572692634694713, "grad_norm": 1.0789388418197632, "learning_rate": 9.559193954659948e-05, "loss": 0.9699, "step": 760 }, { "epoch": 0.09635670875712442, "grad_norm": 1.0445059537887573, "learning_rate": 9.622166246851384e-05, "loss": 0.929, "step": 765 }, { "epoch": 0.0969864911673017, "grad_norm": 1.0267964601516724, "learning_rate": 9.685138539042821e-05, "loss": 0.9581, "step": 770 }, { "epoch": 0.09761627357747898, "grad_norm": 1.0283193588256836, "learning_rate": 9.748110831234256e-05, "loss": 1.0331, "step": 775 }, { "epoch": 0.09824605598765626, "grad_norm": 1.0210477113723755, "learning_rate": 9.811083123425692e-05, "loss": 0.96, "step": 780 }, { "epoch": 0.09887583839783355, "grad_norm": 1.0163402557373047, "learning_rate": 9.874055415617127e-05, "loss": 0.9923, "step": 785 }, { "epoch": 0.09950562080801083, "grad_norm": 1.0012452602386475, "learning_rate": 9.937027707808563e-05, "loss": 0.9722, "step": 790 }, { "epoch": 0.10013540321818812, "grad_norm": 0.9810453653335571, "learning_rate": 9.999999999999999e-05, "loss": 0.991, "step": 795 }, { "epoch": 0.1007651856283654, "grad_norm": 1.1151692867279053, "learning_rate": 0.00010062972292191434, "loss": 0.9783, "step": 800 }, { "epoch": 0.10139496803854268, "grad_norm": 1.1338117122650146, "learning_rate": 0.00010125944584382871, "loss": 0.9979, "step": 805 }, { "epoch": 0.10202475044871996, "grad_norm": 0.9878106117248535, "learning_rate": 0.00010188916876574307, "loss": 0.9068, "step": 810 }, { "epoch": 0.10265453285889725, "grad_norm": 1.0334627628326416, "learning_rate": 0.00010251889168765742, "loss": 0.9389, "step": 815 }, { "epoch": 0.10328431526907453, "grad_norm": 0.9542704224586487, "learning_rate": 0.00010314861460957178, "loss": 0.9699, "step": 820 }, { "epoch": 0.10391409767925182, "grad_norm": 1.003753423690796, "learning_rate": 0.00010377833753148613, "loss": 0.9309, "step": 825 }, { "epoch": 0.10454388008942911, "grad_norm": 0.9803423285484314, "learning_rate": 0.00010440806045340049, "loss": 0.9711, "step": 830 }, { "epoch": 0.10517366249960639, "grad_norm": 0.9765311479568481, "learning_rate": 0.00010503778337531486, "loss": 1.0237, "step": 835 }, { "epoch": 0.10580344490978368, "grad_norm": 1.035510540008545, "learning_rate": 0.00010566750629722922, "loss": 0.9737, "step": 840 }, { "epoch": 0.10643322731996095, "grad_norm": 3.4597954750061035, "learning_rate": 0.00010629722921914357, "loss": 1.038, "step": 845 }, { "epoch": 0.10706300973013824, "grad_norm": 1.0254745483398438, "learning_rate": 0.00010692695214105793, "loss": 1.0044, "step": 850 }, { "epoch": 0.10769279214031552, "grad_norm": 4.8941521644592285, "learning_rate": 0.00010755667506297228, "loss": 1.1038, "step": 855 }, { "epoch": 0.10832257455049281, "grad_norm": 1.6676890850067139, "learning_rate": 0.00010818639798488663, "loss": 1.0043, "step": 860 }, { "epoch": 0.10895235696067009, "grad_norm": 5.738070964813232, "learning_rate": 0.00010881612090680099, "loss": 1.0502, "step": 865 }, { "epoch": 0.10958213937084738, "grad_norm": 1.1913108825683594, "learning_rate": 0.00010944584382871536, "loss": 1.0071, "step": 870 }, { "epoch": 0.11021192178102465, "grad_norm": 1.0302019119262695, "learning_rate": 0.00011007556675062972, "loss": 0.9732, "step": 875 }, { "epoch": 0.11084170419120194, "grad_norm": 0.92161625623703, "learning_rate": 0.00011070528967254407, "loss": 0.9414, "step": 880 }, { "epoch": 0.11147148660137922, "grad_norm": 0.954598605632782, "learning_rate": 0.00011133501259445843, "loss": 0.9772, "step": 885 }, { "epoch": 0.11210126901155651, "grad_norm": 0.9241647720336914, "learning_rate": 0.00011196473551637278, "loss": 0.9498, "step": 890 }, { "epoch": 0.11273105142173379, "grad_norm": 0.9744060039520264, "learning_rate": 0.00011259445843828714, "loss": 0.9501, "step": 895 }, { "epoch": 0.11336083383191108, "grad_norm": 1.0800458192825317, "learning_rate": 0.0001132241813602015, "loss": 0.9823, "step": 900 }, { "epoch": 0.11399061624208835, "grad_norm": 1.0275344848632812, "learning_rate": 0.00011385390428211587, "loss": 1.0426, "step": 905 }, { "epoch": 0.11462039865226564, "grad_norm": 1.0069867372512817, "learning_rate": 0.00011448362720403022, "loss": 0.9933, "step": 910 }, { "epoch": 0.11525018106244292, "grad_norm": 1.0309741497039795, "learning_rate": 0.00011511335012594457, "loss": 0.9792, "step": 915 }, { "epoch": 0.11587996347262021, "grad_norm": 0.9738866090774536, "learning_rate": 0.00011574307304785893, "loss": 1.0193, "step": 920 }, { "epoch": 0.1165097458827975, "grad_norm": 0.9231003522872925, "learning_rate": 0.00011637279596977329, "loss": 0.9741, "step": 925 }, { "epoch": 0.11713952829297478, "grad_norm": 1.1318124532699585, "learning_rate": 0.00011700251889168764, "loss": 0.9644, "step": 930 }, { "epoch": 0.11776931070315207, "grad_norm": 1.033288598060608, "learning_rate": 0.00011763224181360201, "loss": 0.9216, "step": 935 }, { "epoch": 0.11839909311332934, "grad_norm": 1.003190517425537, "learning_rate": 0.00011826196473551637, "loss": 0.9521, "step": 940 }, { "epoch": 0.11902887552350663, "grad_norm": 1.0145738124847412, "learning_rate": 0.00011889168765743072, "loss": 0.9676, "step": 945 }, { "epoch": 0.11965865793368391, "grad_norm": 1.1370879411697388, "learning_rate": 0.00011952141057934508, "loss": 0.9987, "step": 950 }, { "epoch": 0.1202884403438612, "grad_norm": 0.9657129645347595, "learning_rate": 0.00012015113350125943, "loss": 0.9622, "step": 955 }, { "epoch": 0.12091822275403848, "grad_norm": 0.9489335417747498, "learning_rate": 0.00012078085642317378, "loss": 0.9402, "step": 960 }, { "epoch": 0.12154800516421577, "grad_norm": 1.0598636865615845, "learning_rate": 0.00012141057934508814, "loss": 1.0047, "step": 965 }, { "epoch": 0.12217778757439304, "grad_norm": 0.9747732281684875, "learning_rate": 0.00012204030226700251, "loss": 1.009, "step": 970 }, { "epoch": 0.12280756998457033, "grad_norm": 0.9424954056739807, "learning_rate": 0.00012267002518891686, "loss": 0.9603, "step": 975 }, { "epoch": 0.12343735239474761, "grad_norm": 1.0061867237091064, "learning_rate": 0.00012329974811083123, "loss": 0.9494, "step": 980 }, { "epoch": 0.1240671348049249, "grad_norm": 0.924182116985321, "learning_rate": 0.00012392947103274558, "loss": 0.9804, "step": 985 }, { "epoch": 0.12469691721510218, "grad_norm": 0.983267605304718, "learning_rate": 0.00012455919395465995, "loss": 0.9814, "step": 990 }, { "epoch": 0.12532669962527945, "grad_norm": 0.896524965763092, "learning_rate": 0.0001251889168765743, "loss": 0.9533, "step": 995 }, { "epoch": 0.12595648203545676, "grad_norm": 0.8669747710227966, "learning_rate": 0.00012581863979848864, "loss": 0.9544, "step": 1000 }, { "epoch": 0.12595648203545676, "eval_loss": 0.383962482213974, "eval_runtime": 6.2938, "eval_samples_per_second": 158.887, "eval_steps_per_second": 10.01, "step": 1000 }, { "epoch": 0.12658626444563403, "grad_norm": 0.9055171012878418, "learning_rate": 0.000126448362720403, "loss": 0.9353, "step": 1005 }, { "epoch": 0.1272160468558113, "grad_norm": 0.9889428019523621, "learning_rate": 0.00012707808564231738, "loss": 0.9508, "step": 1010 }, { "epoch": 0.12784582926598861, "grad_norm": 0.8966602683067322, "learning_rate": 0.00012770780856423173, "loss": 0.995, "step": 1015 }, { "epoch": 0.1284756116761659, "grad_norm": 0.9995138645172119, "learning_rate": 0.0001283375314861461, "loss": 0.9624, "step": 1020 }, { "epoch": 0.12910539408634317, "grad_norm": 0.8536145687103271, "learning_rate": 0.00012896725440806044, "loss": 0.9549, "step": 1025 }, { "epoch": 0.12973517649652044, "grad_norm": 0.8860256671905518, "learning_rate": 0.00012959697732997479, "loss": 1.0021, "step": 1030 }, { "epoch": 0.13036495890669775, "grad_norm": 0.8574298620223999, "learning_rate": 0.00013022670025188916, "loss": 0.9798, "step": 1035 }, { "epoch": 0.13099474131687502, "grad_norm": 1.1180200576782227, "learning_rate": 0.00013085642317380353, "loss": 0.9225, "step": 1040 }, { "epoch": 0.1316245237270523, "grad_norm": 0.9391751289367676, "learning_rate": 0.00013148614609571787, "loss": 0.9467, "step": 1045 }, { "epoch": 0.13225430613722958, "grad_norm": 0.8861620426177979, "learning_rate": 0.00013211586901763222, "loss": 0.9413, "step": 1050 }, { "epoch": 0.13288408854740688, "grad_norm": 0.8499036431312561, "learning_rate": 0.0001327455919395466, "loss": 0.9644, "step": 1055 }, { "epoch": 0.13351387095758416, "grad_norm": 0.9816482067108154, "learning_rate": 0.00013337531486146094, "loss": 0.9552, "step": 1060 }, { "epoch": 0.13414365336776143, "grad_norm": 0.9725036025047302, "learning_rate": 0.0001340050377833753, "loss": 0.9461, "step": 1065 }, { "epoch": 0.1347734357779387, "grad_norm": 0.9366094470024109, "learning_rate": 0.00013463476070528968, "loss": 0.9305, "step": 1070 }, { "epoch": 0.13540321818811601, "grad_norm": 0.9212390780448914, "learning_rate": 0.00013526448362720402, "loss": 0.9551, "step": 1075 }, { "epoch": 0.1360330005982933, "grad_norm": 0.8980582356452942, "learning_rate": 0.00013589420654911837, "loss": 0.9491, "step": 1080 }, { "epoch": 0.13666278300847057, "grad_norm": 0.9107893109321594, "learning_rate": 0.00013652392947103274, "loss": 0.9366, "step": 1085 }, { "epoch": 0.13729256541864784, "grad_norm": 0.8583124876022339, "learning_rate": 0.00013715365239294708, "loss": 0.9628, "step": 1090 }, { "epoch": 0.13792234782882515, "grad_norm": 0.877052903175354, "learning_rate": 0.00013778337531486146, "loss": 0.9675, "step": 1095 }, { "epoch": 0.13855213023900242, "grad_norm": 0.8020456433296204, "learning_rate": 0.0001384130982367758, "loss": 0.9015, "step": 1100 }, { "epoch": 0.1391819126491797, "grad_norm": 0.8703967928886414, "learning_rate": 0.00013904282115869017, "loss": 0.9658, "step": 1105 }, { "epoch": 0.139811695059357, "grad_norm": 0.7955961227416992, "learning_rate": 0.00013967254408060452, "loss": 0.9084, "step": 1110 }, { "epoch": 0.14044147746953428, "grad_norm": 0.893059492111206, "learning_rate": 0.0001403022670025189, "loss": 0.9591, "step": 1115 }, { "epoch": 0.14107125987971156, "grad_norm": 0.8481057286262512, "learning_rate": 0.00014093198992443323, "loss": 0.9588, "step": 1120 }, { "epoch": 0.14170104228988883, "grad_norm": 0.8342163562774658, "learning_rate": 0.00014156171284634758, "loss": 0.947, "step": 1125 }, { "epoch": 0.14233082470006614, "grad_norm": 0.790868878364563, "learning_rate": 0.00014219143576826195, "loss": 0.9366, "step": 1130 }, { "epoch": 0.14296060711024342, "grad_norm": 0.8430061340332031, "learning_rate": 0.0001428211586901763, "loss": 0.9014, "step": 1135 }, { "epoch": 0.1435903895204207, "grad_norm": 0.9150258302688599, "learning_rate": 0.00014345088161209067, "loss": 0.9546, "step": 1140 }, { "epoch": 0.14422017193059797, "grad_norm": 0.8204888105392456, "learning_rate": 0.00014408060453400504, "loss": 0.9159, "step": 1145 }, { "epoch": 0.14484995434077527, "grad_norm": 0.7595349550247192, "learning_rate": 0.00014471032745591938, "loss": 0.9179, "step": 1150 }, { "epoch": 0.14547973675095255, "grad_norm": 0.8642888069152832, "learning_rate": 0.00014534005037783373, "loss": 1.0338, "step": 1155 }, { "epoch": 0.14610951916112982, "grad_norm": 0.9633650183677673, "learning_rate": 0.0001459697732997481, "loss": 0.9638, "step": 1160 }, { "epoch": 0.1467393015713071, "grad_norm": 0.8363626599311829, "learning_rate": 0.00014659949622166244, "loss": 0.8828, "step": 1165 }, { "epoch": 0.1473690839814844, "grad_norm": 0.8199290633201599, "learning_rate": 0.00014722921914357682, "loss": 0.9577, "step": 1170 }, { "epoch": 0.14799886639166168, "grad_norm": 0.7671203017234802, "learning_rate": 0.0001478589420654912, "loss": 0.9381, "step": 1175 }, { "epoch": 0.14862864880183896, "grad_norm": 0.8354636430740356, "learning_rate": 0.00014848866498740553, "loss": 1.0019, "step": 1180 }, { "epoch": 0.14925843121201623, "grad_norm": 0.911165714263916, "learning_rate": 0.00014911838790931988, "loss": 0.8985, "step": 1185 }, { "epoch": 0.14988821362219354, "grad_norm": 0.8125472664833069, "learning_rate": 0.00014974811083123425, "loss": 0.9628, "step": 1190 }, { "epoch": 0.15051799603237082, "grad_norm": 0.8937430381774902, "learning_rate": 0.00015037783375314862, "loss": 0.9843, "step": 1195 }, { "epoch": 0.1511477784425481, "grad_norm": 0.9609346985816956, "learning_rate": 0.00015100755667506297, "loss": 0.9552, "step": 1200 }, { "epoch": 0.1517775608527254, "grad_norm": 0.7975132465362549, "learning_rate": 0.0001516372795969773, "loss": 0.9799, "step": 1205 }, { "epoch": 0.15240734326290267, "grad_norm": 0.8690225481987, "learning_rate": 0.00015226700251889168, "loss": 0.9604, "step": 1210 }, { "epoch": 0.15303712567307995, "grad_norm": 0.7486653923988342, "learning_rate": 0.00015289672544080603, "loss": 0.9022, "step": 1215 }, { "epoch": 0.15366690808325723, "grad_norm": 0.8420302271842957, "learning_rate": 0.0001535264483627204, "loss": 0.8791, "step": 1220 }, { "epoch": 0.15429669049343453, "grad_norm": 0.8187466263771057, "learning_rate": 0.00015415617128463474, "loss": 0.9332, "step": 1225 }, { "epoch": 0.1549264729036118, "grad_norm": 0.8711130619049072, "learning_rate": 0.0001547858942065491, "loss": 0.8924, "step": 1230 }, { "epoch": 0.15555625531378908, "grad_norm": 0.8086002469062805, "learning_rate": 0.00015541561712846346, "loss": 0.9491, "step": 1235 }, { "epoch": 0.15618603772396636, "grad_norm": 0.8274957537651062, "learning_rate": 0.0001560453400503778, "loss": 0.9392, "step": 1240 }, { "epoch": 0.15681582013414366, "grad_norm": 0.831676721572876, "learning_rate": 0.00015667506297229218, "loss": 1.0327, "step": 1245 }, { "epoch": 0.15744560254432094, "grad_norm": 0.8806201219558716, "learning_rate": 0.00015730478589420652, "loss": 0.8607, "step": 1250 }, { "epoch": 0.15807538495449822, "grad_norm": 0.905436635017395, "learning_rate": 0.00015793450881612092, "loss": 0.9301, "step": 1255 }, { "epoch": 0.1587051673646755, "grad_norm": 0.8631262183189392, "learning_rate": 0.00015856423173803526, "loss": 0.9443, "step": 1260 }, { "epoch": 0.1593349497748528, "grad_norm": 0.7483521699905396, "learning_rate": 0.0001591939546599496, "loss": 0.901, "step": 1265 }, { "epoch": 0.15996473218503007, "grad_norm": 0.8273198008537292, "learning_rate": 0.00015982367758186398, "loss": 0.9608, "step": 1270 }, { "epoch": 0.16059451459520735, "grad_norm": 0.7562909722328186, "learning_rate": 0.00016045340050377832, "loss": 0.9219, "step": 1275 }, { "epoch": 0.16122429700538463, "grad_norm": 0.8585835099220276, "learning_rate": 0.0001610831234256927, "loss": 0.942, "step": 1280 }, { "epoch": 0.16185407941556193, "grad_norm": 0.8192921876907349, "learning_rate": 0.00016171284634760704, "loss": 0.9531, "step": 1285 }, { "epoch": 0.1624838618257392, "grad_norm": 0.8301946520805359, "learning_rate": 0.00016234256926952139, "loss": 0.8972, "step": 1290 }, { "epoch": 0.16311364423591648, "grad_norm": 0.8291681408882141, "learning_rate": 0.00016297229219143576, "loss": 0.9653, "step": 1295 }, { "epoch": 0.1637434266460938, "grad_norm": 0.8672564625740051, "learning_rate": 0.0001636020151133501, "loss": 0.9498, "step": 1300 }, { "epoch": 0.16437320905627106, "grad_norm": 0.7432397603988647, "learning_rate": 0.00016423173803526445, "loss": 0.8782, "step": 1305 }, { "epoch": 0.16500299146644834, "grad_norm": 0.7710584402084351, "learning_rate": 0.00016486146095717882, "loss": 0.8872, "step": 1310 }, { "epoch": 0.16563277387662562, "grad_norm": 0.7810630798339844, "learning_rate": 0.00016549118387909316, "loss": 0.9357, "step": 1315 }, { "epoch": 0.16626255628680292, "grad_norm": 0.7368482947349548, "learning_rate": 0.00016612090680100756, "loss": 0.8935, "step": 1320 }, { "epoch": 0.1668923386969802, "grad_norm": 0.7725487947463989, "learning_rate": 0.0001667506297229219, "loss": 0.9241, "step": 1325 }, { "epoch": 0.16752212110715747, "grad_norm": 0.7551338076591492, "learning_rate": 0.00016738035264483628, "loss": 0.8878, "step": 1330 }, { "epoch": 0.16815190351733475, "grad_norm": 0.8027164340019226, "learning_rate": 0.00016801007556675062, "loss": 0.9149, "step": 1335 }, { "epoch": 0.16878168592751205, "grad_norm": 0.7476945519447327, "learning_rate": 0.00016863979848866497, "loss": 0.9241, "step": 1340 }, { "epoch": 0.16941146833768933, "grad_norm": 0.7967312335968018, "learning_rate": 0.00016926952141057934, "loss": 0.932, "step": 1345 }, { "epoch": 0.1700412507478666, "grad_norm": 0.809727668762207, "learning_rate": 0.00016989924433249368, "loss": 0.922, "step": 1350 }, { "epoch": 0.17067103315804388, "grad_norm": 0.7631811499595642, "learning_rate": 0.00017052896725440806, "loss": 0.94, "step": 1355 }, { "epoch": 0.1713008155682212, "grad_norm": 0.6545524001121521, "learning_rate": 0.0001711586901763224, "loss": 0.8898, "step": 1360 }, { "epoch": 0.17193059797839846, "grad_norm": 0.8232229351997375, "learning_rate": 0.00017178841309823675, "loss": 0.9235, "step": 1365 }, { "epoch": 0.17256038038857574, "grad_norm": 0.8617391586303711, "learning_rate": 0.00017241813602015112, "loss": 0.9491, "step": 1370 }, { "epoch": 0.17319016279875304, "grad_norm": 0.7971004247665405, "learning_rate": 0.00017304785894206546, "loss": 0.8749, "step": 1375 }, { "epoch": 0.17381994520893032, "grad_norm": 0.7876558899879456, "learning_rate": 0.0001736775818639798, "loss": 0.9954, "step": 1380 }, { "epoch": 0.1744497276191076, "grad_norm": 0.8051108121871948, "learning_rate": 0.00017430730478589418, "loss": 0.897, "step": 1385 }, { "epoch": 0.17507951002928487, "grad_norm": 0.8449770212173462, "learning_rate": 0.00017493702770780855, "loss": 0.8881, "step": 1390 }, { "epoch": 0.17570929243946218, "grad_norm": 0.8217072486877441, "learning_rate": 0.00017556675062972292, "loss": 0.9027, "step": 1395 }, { "epoch": 0.17633907484963945, "grad_norm": 0.806914210319519, "learning_rate": 0.00017619647355163727, "loss": 0.9222, "step": 1400 }, { "epoch": 0.17696885725981673, "grad_norm": 0.8344951868057251, "learning_rate": 0.00017682619647355164, "loss": 0.9462, "step": 1405 }, { "epoch": 0.177598639669994, "grad_norm": 0.7249205112457275, "learning_rate": 0.00017745591939546598, "loss": 0.917, "step": 1410 }, { "epoch": 0.1782284220801713, "grad_norm": 0.8052341341972351, "learning_rate": 0.00017808564231738033, "loss": 0.9168, "step": 1415 }, { "epoch": 0.1788582044903486, "grad_norm": 0.7675748467445374, "learning_rate": 0.0001787153652392947, "loss": 0.9186, "step": 1420 }, { "epoch": 0.17948798690052586, "grad_norm": 0.7672801613807678, "learning_rate": 0.00017934508816120904, "loss": 0.8637, "step": 1425 }, { "epoch": 0.18011776931070314, "grad_norm": 0.7517289519309998, "learning_rate": 0.00017997481108312342, "loss": 0.9053, "step": 1430 }, { "epoch": 0.18074755172088044, "grad_norm": 0.7253280878067017, "learning_rate": 0.00018060453400503776, "loss": 0.9047, "step": 1435 }, { "epoch": 0.18137733413105772, "grad_norm": 0.7113356590270996, "learning_rate": 0.0001812342569269521, "loss": 0.9288, "step": 1440 }, { "epoch": 0.182007116541235, "grad_norm": 0.6626010537147522, "learning_rate": 0.00018186397984886648, "loss": 0.8942, "step": 1445 }, { "epoch": 0.18263689895141227, "grad_norm": 0.7033849358558655, "learning_rate": 0.00018249370277078082, "loss": 0.9086, "step": 1450 }, { "epoch": 0.18326668136158958, "grad_norm": 0.701263427734375, "learning_rate": 0.00018312342569269522, "loss": 0.9429, "step": 1455 }, { "epoch": 0.18389646377176685, "grad_norm": 0.7362795472145081, "learning_rate": 0.00018375314861460957, "loss": 0.8955, "step": 1460 }, { "epoch": 0.18452624618194413, "grad_norm": 0.7902641296386719, "learning_rate": 0.00018438287153652394, "loss": 0.8535, "step": 1465 }, { "epoch": 0.18515602859212144, "grad_norm": 0.6855788230895996, "learning_rate": 0.00018501259445843828, "loss": 0.9166, "step": 1470 }, { "epoch": 0.1857858110022987, "grad_norm": 0.6782147884368896, "learning_rate": 0.00018564231738035263, "loss": 0.8755, "step": 1475 }, { "epoch": 0.186415593412476, "grad_norm": 0.6875694990158081, "learning_rate": 0.000186272040302267, "loss": 0.8712, "step": 1480 }, { "epoch": 0.18704537582265326, "grad_norm": 0.7253673672676086, "learning_rate": 0.00018690176322418134, "loss": 0.8933, "step": 1485 }, { "epoch": 0.18767515823283057, "grad_norm": 0.8096954822540283, "learning_rate": 0.0001875314861460957, "loss": 0.9359, "step": 1490 }, { "epoch": 0.18830494064300785, "grad_norm": 0.7597787380218506, "learning_rate": 0.00018816120906801006, "loss": 0.9341, "step": 1495 }, { "epoch": 0.18893472305318512, "grad_norm": 0.7736676931381226, "learning_rate": 0.0001887909319899244, "loss": 0.9281, "step": 1500 }, { "epoch": 0.1895645054633624, "grad_norm": 0.6343753337860107, "learning_rate": 0.00018942065491183878, "loss": 0.9177, "step": 1505 }, { "epoch": 0.1901942878735397, "grad_norm": 0.728712260723114, "learning_rate": 0.00019005037783375312, "loss": 0.9371, "step": 1510 }, { "epoch": 0.19082407028371698, "grad_norm": 0.7092194557189941, "learning_rate": 0.00019068010075566746, "loss": 0.8902, "step": 1515 }, { "epoch": 0.19145385269389426, "grad_norm": 0.7485836744308472, "learning_rate": 0.00019130982367758186, "loss": 0.8931, "step": 1520 }, { "epoch": 0.19208363510407153, "grad_norm": 0.7485086917877197, "learning_rate": 0.0001919395465994962, "loss": 0.9368, "step": 1525 }, { "epoch": 0.19271341751424884, "grad_norm": 0.7100546360015869, "learning_rate": 0.00019256926952141058, "loss": 0.8803, "step": 1530 }, { "epoch": 0.1933431999244261, "grad_norm": 0.7371817827224731, "learning_rate": 0.00019319899244332492, "loss": 0.8963, "step": 1535 }, { "epoch": 0.1939729823346034, "grad_norm": 0.6849647164344788, "learning_rate": 0.0001938287153652393, "loss": 0.9137, "step": 1540 }, { "epoch": 0.19460276474478067, "grad_norm": 0.7108625173568726, "learning_rate": 0.00019445843828715364, "loss": 0.9078, "step": 1545 }, { "epoch": 0.19523254715495797, "grad_norm": 0.7581806182861328, "learning_rate": 0.00019508816120906799, "loss": 0.9002, "step": 1550 }, { "epoch": 0.19586232956513525, "grad_norm": 0.7299503087997437, "learning_rate": 0.00019571788413098236, "loss": 0.8897, "step": 1555 }, { "epoch": 0.19649211197531252, "grad_norm": 0.7815247774124146, "learning_rate": 0.0001963476070528967, "loss": 0.8454, "step": 1560 }, { "epoch": 0.19712189438548983, "grad_norm": 0.7475869655609131, "learning_rate": 0.00019697732997481105, "loss": 0.9482, "step": 1565 }, { "epoch": 0.1977516767956671, "grad_norm": 0.7469599843025208, "learning_rate": 0.00019760705289672542, "loss": 0.9048, "step": 1570 }, { "epoch": 0.19838145920584438, "grad_norm": 0.6186767220497131, "learning_rate": 0.00019823677581863976, "loss": 0.8698, "step": 1575 }, { "epoch": 0.19901124161602166, "grad_norm": 0.843999445438385, "learning_rate": 0.00019886649874055413, "loss": 0.9567, "step": 1580 }, { "epoch": 0.19964102402619896, "grad_norm": 0.749344527721405, "learning_rate": 0.00019949622166246848, "loss": 0.9234, "step": 1585 }, { "epoch": 0.20027080643637624, "grad_norm": 0.6822441220283508, "learning_rate": 0.00020012594458438288, "loss": 0.8915, "step": 1590 }, { "epoch": 0.2009005888465535, "grad_norm": 0.7193272113800049, "learning_rate": 0.00020075566750629722, "loss": 0.8922, "step": 1595 }, { "epoch": 0.2015303712567308, "grad_norm": 0.7202250361442566, "learning_rate": 0.00020138539042821157, "loss": 0.9026, "step": 1600 }, { "epoch": 0.2021601536669081, "grad_norm": 0.6946163773536682, "learning_rate": 0.00020201511335012594, "loss": 0.9181, "step": 1605 }, { "epoch": 0.20278993607708537, "grad_norm": 0.7185525894165039, "learning_rate": 0.00020264483627204028, "loss": 0.8809, "step": 1610 }, { "epoch": 0.20341971848726265, "grad_norm": 0.6290002465248108, "learning_rate": 0.00020327455919395466, "loss": 0.9033, "step": 1615 }, { "epoch": 0.20404950089743992, "grad_norm": 0.6773431897163391, "learning_rate": 0.000203904282115869, "loss": 0.838, "step": 1620 }, { "epoch": 0.20467928330761723, "grad_norm": 0.7076095342636108, "learning_rate": 0.00020453400503778335, "loss": 0.9158, "step": 1625 }, { "epoch": 0.2053090657177945, "grad_norm": 0.7354462146759033, "learning_rate": 0.00020516372795969772, "loss": 0.8336, "step": 1630 }, { "epoch": 0.20593884812797178, "grad_norm": 0.6885705590248108, "learning_rate": 0.00020579345088161206, "loss": 0.8971, "step": 1635 }, { "epoch": 0.20656863053814906, "grad_norm": 0.697887659072876, "learning_rate": 0.00020642317380352643, "loss": 0.851, "step": 1640 }, { "epoch": 0.20719841294832636, "grad_norm": 0.7369652986526489, "learning_rate": 0.00020705289672544078, "loss": 0.8567, "step": 1645 }, { "epoch": 0.20782819535850364, "grad_norm": 0.7226613759994507, "learning_rate": 0.00020768261964735512, "loss": 0.9038, "step": 1650 }, { "epoch": 0.2084579777686809, "grad_norm": 0.6973157525062561, "learning_rate": 0.00020831234256926952, "loss": 0.8443, "step": 1655 }, { "epoch": 0.20908776017885822, "grad_norm": 0.7276191115379333, "learning_rate": 0.00020894206549118387, "loss": 0.8985, "step": 1660 }, { "epoch": 0.2097175425890355, "grad_norm": 0.694542646408081, "learning_rate": 0.00020957178841309824, "loss": 0.8914, "step": 1665 }, { "epoch": 0.21034732499921277, "grad_norm": 0.8255221843719482, "learning_rate": 0.00021020151133501258, "loss": 0.9072, "step": 1670 }, { "epoch": 0.21097710740939005, "grad_norm": 0.637487530708313, "learning_rate": 0.00021083123425692693, "loss": 0.8637, "step": 1675 }, { "epoch": 0.21160688981956735, "grad_norm": 0.6839597821235657, "learning_rate": 0.0002114609571788413, "loss": 0.8736, "step": 1680 }, { "epoch": 0.21223667222974463, "grad_norm": 0.6435440182685852, "learning_rate": 0.00021209068010075564, "loss": 0.8725, "step": 1685 }, { "epoch": 0.2128664546399219, "grad_norm": 0.7100492715835571, "learning_rate": 0.00021272040302267002, "loss": 0.9169, "step": 1690 }, { "epoch": 0.21349623705009918, "grad_norm": 0.6926056742668152, "learning_rate": 0.00021335012594458436, "loss": 0.8549, "step": 1695 }, { "epoch": 0.21412601946027648, "grad_norm": 0.8507684469223022, "learning_rate": 0.0002139798488664987, "loss": 0.9011, "step": 1700 }, { "epoch": 0.21475580187045376, "grad_norm": 0.7276325821876526, "learning_rate": 0.00021460957178841308, "loss": 0.8607, "step": 1705 }, { "epoch": 0.21538558428063104, "grad_norm": 0.6535823941230774, "learning_rate": 0.00021523929471032742, "loss": 0.8558, "step": 1710 }, { "epoch": 0.2160153666908083, "grad_norm": 0.6517070531845093, "learning_rate": 0.0002158690176322418, "loss": 0.8703, "step": 1715 }, { "epoch": 0.21664514910098562, "grad_norm": 0.7442309260368347, "learning_rate": 0.00021649874055415614, "loss": 0.8961, "step": 1720 }, { "epoch": 0.2172749315111629, "grad_norm": 0.7261196374893188, "learning_rate": 0.00021712846347607054, "loss": 0.8902, "step": 1725 }, { "epoch": 0.21790471392134017, "grad_norm": 0.7019686698913574, "learning_rate": 0.00021775818639798488, "loss": 0.8929, "step": 1730 }, { "epoch": 0.21853449633151745, "grad_norm": 0.7852956056594849, "learning_rate": 0.00021838790931989923, "loss": 0.8766, "step": 1735 }, { "epoch": 0.21916427874169475, "grad_norm": 0.7370544672012329, "learning_rate": 0.0002190176322418136, "loss": 0.855, "step": 1740 }, { "epoch": 0.21979406115187203, "grad_norm": 0.6246267557144165, "learning_rate": 0.00021964735516372794, "loss": 0.9127, "step": 1745 }, { "epoch": 0.2204238435620493, "grad_norm": 0.6939797401428223, "learning_rate": 0.0002202770780856423, "loss": 0.8878, "step": 1750 }, { "epoch": 0.2210536259722266, "grad_norm": 0.6594600081443787, "learning_rate": 0.00022090680100755666, "loss": 0.9105, "step": 1755 }, { "epoch": 0.22168340838240388, "grad_norm": 0.6578107476234436, "learning_rate": 0.000221536523929471, "loss": 0.9016, "step": 1760 }, { "epoch": 0.22231319079258116, "grad_norm": 0.6889748573303223, "learning_rate": 0.00022216624685138538, "loss": 0.9091, "step": 1765 }, { "epoch": 0.22294297320275844, "grad_norm": 0.6207224130630493, "learning_rate": 0.00022279596977329972, "loss": 0.9058, "step": 1770 }, { "epoch": 0.22357275561293574, "grad_norm": 0.6724773645401001, "learning_rate": 0.00022342569269521406, "loss": 0.9144, "step": 1775 }, { "epoch": 0.22420253802311302, "grad_norm": 0.702472448348999, "learning_rate": 0.00022405541561712844, "loss": 0.9, "step": 1780 }, { "epoch": 0.2248323204332903, "grad_norm": 0.6482950448989868, "learning_rate": 0.00022468513853904278, "loss": 0.88, "step": 1785 }, { "epoch": 0.22546210284346757, "grad_norm": 0.7253268957138062, "learning_rate": 0.00022531486146095718, "loss": 0.9147, "step": 1790 }, { "epoch": 0.22609188525364488, "grad_norm": 0.7196680307388306, "learning_rate": 0.00022594458438287152, "loss": 0.8687, "step": 1795 }, { "epoch": 0.22672166766382215, "grad_norm": 0.6720924973487854, "learning_rate": 0.0002265743073047859, "loss": 0.9173, "step": 1800 }, { "epoch": 0.22735145007399943, "grad_norm": 0.6656882762908936, "learning_rate": 0.00022720403022670024, "loss": 0.8237, "step": 1805 }, { "epoch": 0.2279812324841767, "grad_norm": 0.6303510665893555, "learning_rate": 0.00022783375314861459, "loss": 0.891, "step": 1810 }, { "epoch": 0.228611014894354, "grad_norm": 0.6595205068588257, "learning_rate": 0.00022846347607052896, "loss": 0.8745, "step": 1815 }, { "epoch": 0.22924079730453129, "grad_norm": 0.6373685002326965, "learning_rate": 0.0002290931989924433, "loss": 0.895, "step": 1820 }, { "epoch": 0.22987057971470856, "grad_norm": 0.6187670230865479, "learning_rate": 0.00022972292191435767, "loss": 0.8954, "step": 1825 }, { "epoch": 0.23050036212488584, "grad_norm": 0.6348496079444885, "learning_rate": 0.00023035264483627202, "loss": 0.8462, "step": 1830 }, { "epoch": 0.23113014453506314, "grad_norm": 0.6880120038986206, "learning_rate": 0.00023098236775818636, "loss": 0.883, "step": 1835 }, { "epoch": 0.23175992694524042, "grad_norm": 0.7668615579605103, "learning_rate": 0.00023161209068010073, "loss": 0.9134, "step": 1840 }, { "epoch": 0.2323897093554177, "grad_norm": 0.6664952635765076, "learning_rate": 0.00023224181360201508, "loss": 0.9276, "step": 1845 }, { "epoch": 0.233019491765595, "grad_norm": 0.754509449005127, "learning_rate": 0.00023287153652392942, "loss": 0.858, "step": 1850 }, { "epoch": 0.23364927417577228, "grad_norm": 0.6345789432525635, "learning_rate": 0.00023350125944584382, "loss": 0.9048, "step": 1855 }, { "epoch": 0.23427905658594955, "grad_norm": 0.6877152323722839, "learning_rate": 0.00023413098236775817, "loss": 0.9023, "step": 1860 }, { "epoch": 0.23490883899612683, "grad_norm": 0.6173678636550903, "learning_rate": 0.00023476070528967254, "loss": 0.8951, "step": 1865 }, { "epoch": 0.23553862140630413, "grad_norm": 0.6912857294082642, "learning_rate": 0.00023539042821158688, "loss": 0.8412, "step": 1870 }, { "epoch": 0.2361684038164814, "grad_norm": 0.6385686993598938, "learning_rate": 0.00023602015113350126, "loss": 0.8954, "step": 1875 }, { "epoch": 0.23679818622665869, "grad_norm": 0.6755088567733765, "learning_rate": 0.0002366498740554156, "loss": 0.8964, "step": 1880 }, { "epoch": 0.23742796863683596, "grad_norm": 0.6391545534133911, "learning_rate": 0.00023727959697732995, "loss": 0.9294, "step": 1885 }, { "epoch": 0.23805775104701327, "grad_norm": 0.7155817747116089, "learning_rate": 0.00023790931989924432, "loss": 0.8967, "step": 1890 }, { "epoch": 0.23868753345719054, "grad_norm": 0.681224524974823, "learning_rate": 0.00023853904282115866, "loss": 0.8997, "step": 1895 }, { "epoch": 0.23931731586736782, "grad_norm": 0.6473144888877869, "learning_rate": 0.00023916876574307303, "loss": 0.9172, "step": 1900 }, { "epoch": 0.2399470982775451, "grad_norm": 0.6562004685401917, "learning_rate": 0.00023979848866498738, "loss": 0.8488, "step": 1905 }, { "epoch": 0.2405768806877224, "grad_norm": 0.6842007637023926, "learning_rate": 0.00024042821158690172, "loss": 0.9183, "step": 1910 }, { "epoch": 0.24120666309789968, "grad_norm": 0.5957079529762268, "learning_rate": 0.0002410579345088161, "loss": 0.8293, "step": 1915 }, { "epoch": 0.24183644550807695, "grad_norm": 0.6745590567588806, "learning_rate": 0.00024168765743073044, "loss": 0.8219, "step": 1920 }, { "epoch": 0.24246622791825426, "grad_norm": 0.6895525455474854, "learning_rate": 0.00024231738035264484, "loss": 0.9034, "step": 1925 }, { "epoch": 0.24309601032843153, "grad_norm": 0.7394620776176453, "learning_rate": 0.00024294710327455918, "loss": 0.8702, "step": 1930 }, { "epoch": 0.2437257927386088, "grad_norm": 0.7846884727478027, "learning_rate": 0.00024357682619647353, "loss": 0.9143, "step": 1935 }, { "epoch": 0.24435557514878609, "grad_norm": 0.594127893447876, "learning_rate": 0.00024420654911838787, "loss": 0.8838, "step": 1940 }, { "epoch": 0.2449853575589634, "grad_norm": 0.6737518906593323, "learning_rate": 0.00024483627204030224, "loss": 0.8657, "step": 1945 }, { "epoch": 0.24561513996914067, "grad_norm": 0.6851866245269775, "learning_rate": 0.0002454659949622166, "loss": 0.9133, "step": 1950 }, { "epoch": 0.24624492237931794, "grad_norm": 0.6238758563995361, "learning_rate": 0.000246095717884131, "loss": 0.8816, "step": 1955 }, { "epoch": 0.24687470478949522, "grad_norm": 0.6002854704856873, "learning_rate": 0.0002467254408060453, "loss": 0.8888, "step": 1960 }, { "epoch": 0.24750448719967252, "grad_norm": 0.6201847791671753, "learning_rate": 0.0002473551637279597, "loss": 0.8299, "step": 1965 }, { "epoch": 0.2481342696098498, "grad_norm": 0.6619172692298889, "learning_rate": 0.00024798488664987405, "loss": 0.9297, "step": 1970 }, { "epoch": 0.24876405202002708, "grad_norm": 0.6359203457832336, "learning_rate": 0.00024861460957178837, "loss": 0.8811, "step": 1975 }, { "epoch": 0.24939383443020435, "grad_norm": 0.6441104412078857, "learning_rate": 0.00024924433249370274, "loss": 0.8704, "step": 1980 }, { "epoch": 0.25002361684038166, "grad_norm": 0.7083386778831482, "learning_rate": 0.0002498740554156171, "loss": 0.8877, "step": 1985 }, { "epoch": 0.2506533992505589, "grad_norm": 0.642206072807312, "learning_rate": 0.0002505037783375315, "loss": 0.8661, "step": 1990 }, { "epoch": 0.2512831816607362, "grad_norm": 0.6782190203666687, "learning_rate": 0.00025113350125944585, "loss": 0.901, "step": 1995 }, { "epoch": 0.2519129640709135, "grad_norm": 0.6277428269386292, "learning_rate": 0.00025176322418136017, "loss": 0.8212, "step": 2000 }, { "epoch": 0.2519129640709135, "eval_loss": 0.35735848546028137, "eval_runtime": 6.2326, "eval_samples_per_second": 160.447, "eval_steps_per_second": 10.108, "step": 2000 }, { "epoch": 0.25254274648109076, "grad_norm": 0.5980456471443176, "learning_rate": 0.00025239294710327454, "loss": 0.8808, "step": 2005 }, { "epoch": 0.25317252889126807, "grad_norm": 0.6398759484291077, "learning_rate": 0.0002530226700251889, "loss": 0.8817, "step": 2010 }, { "epoch": 0.25380231130144537, "grad_norm": 0.5681187510490417, "learning_rate": 0.00025365239294710323, "loss": 0.8672, "step": 2015 }, { "epoch": 0.2544320937116226, "grad_norm": 0.6202912926673889, "learning_rate": 0.0002542821158690176, "loss": 0.8627, "step": 2020 }, { "epoch": 0.2550618761217999, "grad_norm": 0.5921783447265625, "learning_rate": 0.000254911838790932, "loss": 0.8214, "step": 2025 }, { "epoch": 0.25569165853197723, "grad_norm": 0.629782497882843, "learning_rate": 0.00025554156171284635, "loss": 0.8995, "step": 2030 }, { "epoch": 0.2563214409421545, "grad_norm": 0.6545585989952087, "learning_rate": 0.00025617128463476066, "loss": 0.8422, "step": 2035 }, { "epoch": 0.2569512233523318, "grad_norm": 0.6024030447006226, "learning_rate": 0.00025680100755667504, "loss": 0.8341, "step": 2040 }, { "epoch": 0.25758100576250903, "grad_norm": 0.6795976161956787, "learning_rate": 0.0002574307304785894, "loss": 0.852, "step": 2045 }, { "epoch": 0.25821078817268633, "grad_norm": 0.6465495228767395, "learning_rate": 0.0002580604534005037, "loss": 0.8514, "step": 2050 }, { "epoch": 0.25884057058286364, "grad_norm": 0.6498434543609619, "learning_rate": 0.0002586901763224181, "loss": 0.8906, "step": 2055 }, { "epoch": 0.2594703529930409, "grad_norm": 0.7072421908378601, "learning_rate": 0.00025931989924433247, "loss": 0.9061, "step": 2060 }, { "epoch": 0.2601001354032182, "grad_norm": 0.5902896523475647, "learning_rate": 0.00025994962216624684, "loss": 0.8327, "step": 2065 }, { "epoch": 0.2607299178133955, "grad_norm": 0.6410335302352905, "learning_rate": 0.0002605793450881612, "loss": 0.9002, "step": 2070 }, { "epoch": 0.26135970022357274, "grad_norm": 1.628951072692871, "learning_rate": 0.00026120906801007553, "loss": 0.8944, "step": 2075 }, { "epoch": 0.26198948263375005, "grad_norm": 0.6544843316078186, "learning_rate": 0.0002618387909319899, "loss": 0.8656, "step": 2080 }, { "epoch": 0.2626192650439273, "grad_norm": 15.444189071655273, "learning_rate": 0.0002624685138539043, "loss": 0.9639, "step": 2085 }, { "epoch": 0.2632490474541046, "grad_norm": 8.399425506591797, "learning_rate": 0.0002630982367758186, "loss": 1.1367, "step": 2090 }, { "epoch": 0.2638788298642819, "grad_norm": 24.009044647216797, "learning_rate": 0.00026372795969773296, "loss": 1.1429, "step": 2095 }, { "epoch": 0.26450861227445915, "grad_norm": 4.226770877838135, "learning_rate": 0.00026435768261964733, "loss": 0.974, "step": 2100 }, { "epoch": 0.26513839468463646, "grad_norm": 1.0910799503326416, "learning_rate": 0.0002649874055415617, "loss": 1.0182, "step": 2105 }, { "epoch": 0.26576817709481376, "grad_norm": 2.8835411071777344, "learning_rate": 0.000265617128463476, "loss": 1.0283, "step": 2110 }, { "epoch": 0.266397959504991, "grad_norm": 2.8626575469970703, "learning_rate": 0.0002662468513853904, "loss": 0.9273, "step": 2115 }, { "epoch": 0.2670277419151683, "grad_norm": 1.4587650299072266, "learning_rate": 0.00026687657430730477, "loss": 0.9578, "step": 2120 }, { "epoch": 0.2676575243253456, "grad_norm": 0.7692992091178894, "learning_rate": 0.00026750629722921914, "loss": 0.8701, "step": 2125 }, { "epoch": 0.26828730673552287, "grad_norm": 0.8609071373939514, "learning_rate": 0.0002681360201511335, "loss": 0.8718, "step": 2130 }, { "epoch": 0.26891708914570017, "grad_norm": 0.7419576048851013, "learning_rate": 0.00026876574307304783, "loss": 0.8732, "step": 2135 }, { "epoch": 0.2695468715558774, "grad_norm": 0.8134281635284424, "learning_rate": 0.0002693954659949622, "loss": 0.9112, "step": 2140 }, { "epoch": 0.2701766539660547, "grad_norm": 0.7559547424316406, "learning_rate": 0.00027002518891687657, "loss": 0.8804, "step": 2145 }, { "epoch": 0.27080643637623203, "grad_norm": 0.7497460842132568, "learning_rate": 0.0002706549118387909, "loss": 0.8439, "step": 2150 }, { "epoch": 0.2714362187864093, "grad_norm": 0.775444746017456, "learning_rate": 0.00027128463476070526, "loss": 0.888, "step": 2155 }, { "epoch": 0.2720660011965866, "grad_norm": 0.7074035406112671, "learning_rate": 0.00027191435768261963, "loss": 0.8628, "step": 2160 }, { "epoch": 0.2726957836067639, "grad_norm": 0.730311393737793, "learning_rate": 0.00027254408060453395, "loss": 0.8908, "step": 2165 }, { "epoch": 0.27332556601694113, "grad_norm": 0.7610625624656677, "learning_rate": 0.0002731738035264483, "loss": 0.8954, "step": 2170 }, { "epoch": 0.27395534842711844, "grad_norm": 0.6473423838615417, "learning_rate": 0.0002738035264483627, "loss": 0.8488, "step": 2175 }, { "epoch": 0.2745851308372957, "grad_norm": 0.7084975838661194, "learning_rate": 0.00027443324937027707, "loss": 0.8631, "step": 2180 }, { "epoch": 0.275214913247473, "grad_norm": 0.6844817996025085, "learning_rate": 0.0002750629722921914, "loss": 0.9021, "step": 2185 }, { "epoch": 0.2758446956576503, "grad_norm": 0.641327440738678, "learning_rate": 0.0002756926952141058, "loss": 0.9002, "step": 2190 }, { "epoch": 0.27647447806782754, "grad_norm": 0.7175489664077759, "learning_rate": 0.0002763224181360201, "loss": 0.8794, "step": 2195 }, { "epoch": 0.27710426047800485, "grad_norm": 0.6306767463684082, "learning_rate": 0.0002769521410579345, "loss": 0.8732, "step": 2200 }, { "epoch": 0.27773404288818215, "grad_norm": 0.6501113176345825, "learning_rate": 0.00027758186397984887, "loss": 0.8725, "step": 2205 }, { "epoch": 0.2783638252983594, "grad_norm": 0.5996410250663757, "learning_rate": 0.0002782115869017632, "loss": 0.8828, "step": 2210 }, { "epoch": 0.2789936077085367, "grad_norm": 0.6551349759101868, "learning_rate": 0.00027884130982367756, "loss": 0.8725, "step": 2215 }, { "epoch": 0.279623390118714, "grad_norm": 0.6475560069084167, "learning_rate": 0.00027947103274559193, "loss": 0.9333, "step": 2220 }, { "epoch": 0.28025317252889126, "grad_norm": 0.6957899928092957, "learning_rate": 0.00028010075566750625, "loss": 0.8933, "step": 2225 }, { "epoch": 0.28088295493906856, "grad_norm": 0.6194736361503601, "learning_rate": 0.0002807304785894206, "loss": 0.9268, "step": 2230 }, { "epoch": 0.2815127373492458, "grad_norm": 0.6293075084686279, "learning_rate": 0.000281360201511335, "loss": 0.8985, "step": 2235 }, { "epoch": 0.2821425197594231, "grad_norm": 0.6805360317230225, "learning_rate": 0.0002819899244332493, "loss": 0.854, "step": 2240 }, { "epoch": 0.2827723021696004, "grad_norm": 0.6671084761619568, "learning_rate": 0.0002826196473551637, "loss": 0.8774, "step": 2245 }, { "epoch": 0.28340208457977767, "grad_norm": 0.5680047273635864, "learning_rate": 0.00028324937027707805, "loss": 0.8273, "step": 2250 }, { "epoch": 0.284031866989955, "grad_norm": 0.5691477060317993, "learning_rate": 0.0002838790931989924, "loss": 0.8633, "step": 2255 }, { "epoch": 0.2846616494001323, "grad_norm": 0.6509323120117188, "learning_rate": 0.0002845088161209068, "loss": 0.8991, "step": 2260 }, { "epoch": 0.2852914318103095, "grad_norm": 0.714750349521637, "learning_rate": 0.00028513853904282117, "loss": 0.8863, "step": 2265 }, { "epoch": 0.28592121422048683, "grad_norm": 0.6934742331504822, "learning_rate": 0.0002857682619647355, "loss": 0.8699, "step": 2270 }, { "epoch": 0.2865509966306641, "grad_norm": 0.6048073172569275, "learning_rate": 0.00028639798488664986, "loss": 0.8983, "step": 2275 }, { "epoch": 0.2871807790408414, "grad_norm": 0.6630669236183167, "learning_rate": 0.00028702770780856423, "loss": 0.9142, "step": 2280 }, { "epoch": 0.2878105614510187, "grad_norm": 0.6518734693527222, "learning_rate": 0.00028765743073047855, "loss": 0.8734, "step": 2285 }, { "epoch": 0.28844034386119594, "grad_norm": 0.5939868688583374, "learning_rate": 0.0002882871536523929, "loss": 0.8873, "step": 2290 }, { "epoch": 0.28907012627137324, "grad_norm": 0.6081305742263794, "learning_rate": 0.0002889168765743073, "loss": 0.8735, "step": 2295 }, { "epoch": 0.28969990868155054, "grad_norm": 0.5869495272636414, "learning_rate": 0.0002895465994962216, "loss": 0.8694, "step": 2300 }, { "epoch": 0.2903296910917278, "grad_norm": 0.6381964683532715, "learning_rate": 0.000290176322418136, "loss": 0.8638, "step": 2305 }, { "epoch": 0.2909594735019051, "grad_norm": 0.5546308755874634, "learning_rate": 0.00029080604534005035, "loss": 0.8748, "step": 2310 }, { "epoch": 0.2915892559120824, "grad_norm": 0.7318828701972961, "learning_rate": 0.0002914357682619647, "loss": 0.8594, "step": 2315 }, { "epoch": 0.29221903832225965, "grad_norm": 0.5685531497001648, "learning_rate": 0.00029206549118387904, "loss": 0.8815, "step": 2320 }, { "epoch": 0.29284882073243695, "grad_norm": 0.6351069808006287, "learning_rate": 0.00029269521410579347, "loss": 0.8351, "step": 2325 }, { "epoch": 0.2934786031426142, "grad_norm": 0.5828582048416138, "learning_rate": 0.0002933249370277078, "loss": 0.8678, "step": 2330 }, { "epoch": 0.2941083855527915, "grad_norm": 0.5991604924201965, "learning_rate": 0.00029395465994962216, "loss": 0.8939, "step": 2335 }, { "epoch": 0.2947381679629688, "grad_norm": 0.5732405781745911, "learning_rate": 0.00029458438287153653, "loss": 0.8594, "step": 2340 }, { "epoch": 0.29536795037314606, "grad_norm": 0.5813714265823364, "learning_rate": 0.00029521410579345085, "loss": 0.8412, "step": 2345 }, { "epoch": 0.29599773278332336, "grad_norm": 0.5281296968460083, "learning_rate": 0.0002958438287153652, "loss": 0.9049, "step": 2350 }, { "epoch": 0.29662751519350067, "grad_norm": 0.6491068005561829, "learning_rate": 0.0002964735516372796, "loss": 0.8955, "step": 2355 }, { "epoch": 0.2972572976036779, "grad_norm": 0.6236696839332581, "learning_rate": 0.0002971032745591939, "loss": 0.8792, "step": 2360 }, { "epoch": 0.2978870800138552, "grad_norm": 0.605625331401825, "learning_rate": 0.0002977329974811083, "loss": 0.8448, "step": 2365 }, { "epoch": 0.29851686242403247, "grad_norm": 0.6011054515838623, "learning_rate": 0.00029836272040302265, "loss": 0.911, "step": 2370 }, { "epoch": 0.2991466448342098, "grad_norm": 0.5662422180175781, "learning_rate": 0.00029899244332493697, "loss": 0.872, "step": 2375 }, { "epoch": 0.2997764272443871, "grad_norm": 0.8375005125999451, "learning_rate": 0.00029962216624685134, "loss": 0.7924, "step": 2380 }, { "epoch": 0.3004062096545643, "grad_norm": 0.5916186571121216, "learning_rate": 0.0002999999935557256, "loss": 0.9007, "step": 2385 }, { "epoch": 0.30103599206474163, "grad_norm": 0.6436251997947693, "learning_rate": 0.00029999992105764553, "loss": 0.8247, "step": 2390 }, { "epoch": 0.30166577447491894, "grad_norm": 0.6368377208709717, "learning_rate": 0.0002999997680061815, "loss": 0.891, "step": 2395 }, { "epoch": 0.3022955568850962, "grad_norm": 0.5848705172538757, "learning_rate": 0.0002999995344014156, "loss": 0.8335, "step": 2400 }, { "epoch": 0.3029253392952735, "grad_norm": 0.5829634070396423, "learning_rate": 0.0002999992202434735, "loss": 0.8705, "step": 2405 }, { "epoch": 0.3035551217054508, "grad_norm": 0.6242154240608215, "learning_rate": 0.0002999988255325237, "loss": 0.8819, "step": 2410 }, { "epoch": 0.30418490411562804, "grad_norm": 0.5757481455802917, "learning_rate": 0.0002999983502687783, "loss": 0.8748, "step": 2415 }, { "epoch": 0.30481468652580535, "grad_norm": 0.5024969577789307, "learning_rate": 0.00029999779445249243, "loss": 0.8534, "step": 2420 }, { "epoch": 0.3054444689359826, "grad_norm": 0.5515364408493042, "learning_rate": 0.00029999715808396463, "loss": 0.8535, "step": 2425 }, { "epoch": 0.3060742513461599, "grad_norm": 0.5151112079620361, "learning_rate": 0.00029999644116353666, "loss": 0.8686, "step": 2430 }, { "epoch": 0.3067040337563372, "grad_norm": 0.5231375098228455, "learning_rate": 0.0002999956436915935, "loss": 0.8465, "step": 2435 }, { "epoch": 0.30733381616651445, "grad_norm": 0.5415048003196716, "learning_rate": 0.0002999947656685634, "loss": 0.853, "step": 2440 }, { "epoch": 0.30796359857669175, "grad_norm": 0.5642004609107971, "learning_rate": 0.00029999380709491794, "loss": 0.8827, "step": 2445 }, { "epoch": 0.30859338098686906, "grad_norm": 0.6197057366371155, "learning_rate": 0.0002999927679711718, "loss": 0.9072, "step": 2450 }, { "epoch": 0.3092231633970463, "grad_norm": 0.5865146517753601, "learning_rate": 0.0002999916482978831, "loss": 0.837, "step": 2455 }, { "epoch": 0.3098529458072236, "grad_norm": 0.5961802005767822, "learning_rate": 0.0002999904480756531, "loss": 0.8657, "step": 2460 }, { "epoch": 0.31048272821740086, "grad_norm": 0.5736685395240784, "learning_rate": 0.0002999891673051263, "loss": 0.872, "step": 2465 }, { "epoch": 0.31111251062757816, "grad_norm": 0.5412915945053101, "learning_rate": 0.0002999878059869905, "loss": 0.8327, "step": 2470 }, { "epoch": 0.31174229303775547, "grad_norm": 0.5011366605758667, "learning_rate": 0.0002999863641219769, "loss": 0.8418, "step": 2475 }, { "epoch": 0.3123720754479327, "grad_norm": 0.5566514134407043, "learning_rate": 0.0002999848417108597, "loss": 0.8768, "step": 2480 }, { "epoch": 0.31300185785811, "grad_norm": 0.5639830231666565, "learning_rate": 0.0002999832387544564, "loss": 0.8178, "step": 2485 }, { "epoch": 0.3136316402682873, "grad_norm": 0.5784679055213928, "learning_rate": 0.000299981555253628, "loss": 0.8698, "step": 2490 }, { "epoch": 0.3142614226784646, "grad_norm": 0.5428637266159058, "learning_rate": 0.00029997979120927846, "loss": 0.8671, "step": 2495 }, { "epoch": 0.3148912050886419, "grad_norm": 0.5629287362098694, "learning_rate": 0.00029997794662235515, "loss": 0.873, "step": 2500 }, { "epoch": 0.3155209874988192, "grad_norm": 0.5561172366142273, "learning_rate": 0.00029997602149384856, "loss": 0.8664, "step": 2505 }, { "epoch": 0.31615076990899643, "grad_norm": 0.5451831221580505, "learning_rate": 0.0002999740158247927, "loss": 0.8349, "step": 2510 }, { "epoch": 0.31678055231917374, "grad_norm": 0.5645403861999512, "learning_rate": 0.00029997192961626456, "loss": 0.8924, "step": 2515 }, { "epoch": 0.317410334729351, "grad_norm": 0.5120379328727722, "learning_rate": 0.00029996976286938444, "loss": 0.8606, "step": 2520 }, { "epoch": 0.3180401171395283, "grad_norm": 0.45988166332244873, "learning_rate": 0.0002999675155853161, "loss": 0.8285, "step": 2525 }, { "epoch": 0.3186698995497056, "grad_norm": 0.5446504950523376, "learning_rate": 0.00029996518776526614, "loss": 0.8913, "step": 2530 }, { "epoch": 0.31929968195988284, "grad_norm": 0.648369550704956, "learning_rate": 0.00029996277941048485, "loss": 0.8753, "step": 2535 }, { "epoch": 0.31992946437006015, "grad_norm": 0.6404165029525757, "learning_rate": 0.0002999602905222655, "loss": 0.8747, "step": 2540 }, { "epoch": 0.32055924678023745, "grad_norm": 0.46791502833366394, "learning_rate": 0.0002999577211019447, "loss": 0.8132, "step": 2545 }, { "epoch": 0.3211890291904147, "grad_norm": 0.5365081429481506, "learning_rate": 0.00029995507115090225, "loss": 0.8363, "step": 2550 }, { "epoch": 0.321818811600592, "grad_norm": 0.5029319524765015, "learning_rate": 0.00029995234067056124, "loss": 0.8297, "step": 2555 }, { "epoch": 0.32244859401076925, "grad_norm": 0.509843647480011, "learning_rate": 0.00029994952966238804, "loss": 0.828, "step": 2560 }, { "epoch": 0.32307837642094656, "grad_norm": 0.479045569896698, "learning_rate": 0.0002999466381278922, "loss": 0.8689, "step": 2565 }, { "epoch": 0.32370815883112386, "grad_norm": 0.5639600157737732, "learning_rate": 0.0002999436660686265, "loss": 0.8521, "step": 2570 }, { "epoch": 0.3243379412413011, "grad_norm": 0.5077898502349854, "learning_rate": 0.00029994061348618715, "loss": 0.835, "step": 2575 }, { "epoch": 0.3249677236514784, "grad_norm": 0.45198580622673035, "learning_rate": 0.00029993748038221324, "loss": 0.8394, "step": 2580 }, { "epoch": 0.3255975060616557, "grad_norm": 0.5617688894271851, "learning_rate": 0.0002999342667583875, "loss": 0.8285, "step": 2585 }, { "epoch": 0.32622728847183297, "grad_norm": 0.5159285664558411, "learning_rate": 0.0002999309726164356, "loss": 0.7654, "step": 2590 }, { "epoch": 0.32685707088201027, "grad_norm": 0.526965320110321, "learning_rate": 0.00029992759795812666, "loss": 0.8392, "step": 2595 }, { "epoch": 0.3274868532921876, "grad_norm": 0.4861494302749634, "learning_rate": 0.0002999241427852729, "loss": 0.8177, "step": 2600 }, { "epoch": 0.3281166357023648, "grad_norm": 0.5498744249343872, "learning_rate": 0.0002999206070997298, "loss": 0.8006, "step": 2605 }, { "epoch": 0.3287464181125421, "grad_norm": 0.526978075504303, "learning_rate": 0.0002999169909033962, "loss": 0.8261, "step": 2610 }, { "epoch": 0.3293762005227194, "grad_norm": 0.5078813433647156, "learning_rate": 0.0002999132941982139, "loss": 0.8396, "step": 2615 }, { "epoch": 0.3300059829328967, "grad_norm": 0.5390729308128357, "learning_rate": 0.00029990951698616834, "loss": 0.8695, "step": 2620 }, { "epoch": 0.330635765343074, "grad_norm": 0.520889401435852, "learning_rate": 0.00029990565926928787, "loss": 0.8489, "step": 2625 }, { "epoch": 0.33126554775325123, "grad_norm": 0.6547030210494995, "learning_rate": 0.00029990172104964413, "loss": 0.8821, "step": 2630 }, { "epoch": 0.33189533016342854, "grad_norm": 0.5034601092338562, "learning_rate": 0.00029989770232935204, "loss": 0.8202, "step": 2635 }, { "epoch": 0.33252511257360584, "grad_norm": 0.5204071402549744, "learning_rate": 0.0002998936031105698, "loss": 0.852, "step": 2640 }, { "epoch": 0.3331548949837831, "grad_norm": 0.499221533536911, "learning_rate": 0.0002998894233954988, "loss": 0.8338, "step": 2645 }, { "epoch": 0.3337846773939604, "grad_norm": 0.5096358060836792, "learning_rate": 0.0002998851631863835, "loss": 0.8149, "step": 2650 }, { "epoch": 0.3344144598041377, "grad_norm": 0.4654362201690674, "learning_rate": 0.0002998808224855119, "loss": 0.8461, "step": 2655 }, { "epoch": 0.33504424221431495, "grad_norm": 0.7029035091400146, "learning_rate": 0.00029987640129521497, "loss": 0.8137, "step": 2660 }, { "epoch": 0.33567402462449225, "grad_norm": 0.5634217262268066, "learning_rate": 0.000299871899617867, "loss": 0.8434, "step": 2665 }, { "epoch": 0.3363038070346695, "grad_norm": 0.5168646574020386, "learning_rate": 0.0002998673174558855, "loss": 0.8554, "step": 2670 }, { "epoch": 0.3369335894448468, "grad_norm": 0.4693644344806671, "learning_rate": 0.00029986265481173123, "loss": 0.8246, "step": 2675 }, { "epoch": 0.3375633718550241, "grad_norm": 0.44928330183029175, "learning_rate": 0.00029985791168790805, "loss": 0.8554, "step": 2680 }, { "epoch": 0.33819315426520136, "grad_norm": 0.5288846492767334, "learning_rate": 0.0002998530880869632, "loss": 0.8319, "step": 2685 }, { "epoch": 0.33882293667537866, "grad_norm": 0.4755760431289673, "learning_rate": 0.00029984818401148706, "loss": 0.874, "step": 2690 }, { "epoch": 0.33945271908555597, "grad_norm": 0.541684091091156, "learning_rate": 0.0002998431994641132, "loss": 0.8526, "step": 2695 }, { "epoch": 0.3400825014957332, "grad_norm": 0.5160995125770569, "learning_rate": 0.0002998381344475184, "loss": 0.8749, "step": 2700 }, { "epoch": 0.3407122839059105, "grad_norm": 0.5409444570541382, "learning_rate": 0.00029983298896442276, "loss": 0.8118, "step": 2705 }, { "epoch": 0.34134206631608777, "grad_norm": 0.5148081183433533, "learning_rate": 0.00029982776301758956, "loss": 0.8685, "step": 2710 }, { "epoch": 0.34197184872626507, "grad_norm": 0.5689860582351685, "learning_rate": 0.0002998224566098251, "loss": 0.8476, "step": 2715 }, { "epoch": 0.3426016311364424, "grad_norm": 0.520268440246582, "learning_rate": 0.00029981706974397917, "loss": 0.8128, "step": 2720 }, { "epoch": 0.3432314135466196, "grad_norm": 0.49906817078590393, "learning_rate": 0.00029981160242294457, "loss": 0.836, "step": 2725 }, { "epoch": 0.34386119595679693, "grad_norm": 0.47317516803741455, "learning_rate": 0.0002998060546496575, "loss": 0.8251, "step": 2730 }, { "epoch": 0.34449097836697423, "grad_norm": 0.49573519825935364, "learning_rate": 0.0002998004264270971, "loss": 0.832, "step": 2735 }, { "epoch": 0.3451207607771515, "grad_norm": 0.43803608417510986, "learning_rate": 0.0002997947177582859, "loss": 0.7875, "step": 2740 }, { "epoch": 0.3457505431873288, "grad_norm": 0.5324883460998535, "learning_rate": 0.0002997889286462896, "loss": 0.824, "step": 2745 }, { "epoch": 0.3463803255975061, "grad_norm": 0.5902321934700012, "learning_rate": 0.00029978305909421707, "loss": 0.8265, "step": 2750 }, { "epoch": 0.34701010800768334, "grad_norm": 0.5052042007446289, "learning_rate": 0.0002997771091052204, "loss": 0.7715, "step": 2755 }, { "epoch": 0.34763989041786064, "grad_norm": 0.4439961314201355, "learning_rate": 0.0002997710786824949, "loss": 0.8387, "step": 2760 }, { "epoch": 0.3482696728280379, "grad_norm": 0.5099385976791382, "learning_rate": 0.0002997649678292789, "loss": 0.8424, "step": 2765 }, { "epoch": 0.3488994552382152, "grad_norm": 0.4415825605392456, "learning_rate": 0.00029975877654885426, "loss": 0.8066, "step": 2770 }, { "epoch": 0.3495292376483925, "grad_norm": 0.5088052153587341, "learning_rate": 0.0002997525048445458, "loss": 0.8172, "step": 2775 }, { "epoch": 0.35015902005856975, "grad_norm": 0.5503986477851868, "learning_rate": 0.00029974615271972146, "loss": 0.873, "step": 2780 }, { "epoch": 0.35078880246874705, "grad_norm": 0.4609704613685608, "learning_rate": 0.0002997397201777926, "loss": 0.8041, "step": 2785 }, { "epoch": 0.35141858487892436, "grad_norm": 0.49494004249572754, "learning_rate": 0.00029973320722221356, "loss": 0.916, "step": 2790 }, { "epoch": 0.3520483672891016, "grad_norm": 0.4820273518562317, "learning_rate": 0.00029972661385648197, "loss": 0.8597, "step": 2795 }, { "epoch": 0.3526781496992789, "grad_norm": 0.467641681432724, "learning_rate": 0.0002997199400841386, "loss": 0.7944, "step": 2800 }, { "epoch": 0.35330793210945616, "grad_norm": 0.49666082859039307, "learning_rate": 0.00029971318590876745, "loss": 0.8204, "step": 2805 }, { "epoch": 0.35393771451963346, "grad_norm": 0.4578961133956909, "learning_rate": 0.00029970635133399565, "loss": 0.8426, "step": 2810 }, { "epoch": 0.35456749692981077, "grad_norm": 0.48815369606018066, "learning_rate": 0.00029969943636349363, "loss": 0.8277, "step": 2815 }, { "epoch": 0.355197279339988, "grad_norm": 0.4686887562274933, "learning_rate": 0.0002996924410009747, "loss": 0.7936, "step": 2820 }, { "epoch": 0.3558270617501653, "grad_norm": 0.4245474636554718, "learning_rate": 0.0002996853652501956, "loss": 0.8085, "step": 2825 }, { "epoch": 0.3564568441603426, "grad_norm": 0.5085129737854004, "learning_rate": 0.0002996782091149562, "loss": 0.8584, "step": 2830 }, { "epoch": 0.35708662657051987, "grad_norm": 0.4415908455848694, "learning_rate": 0.0002996709725990995, "loss": 0.8234, "step": 2835 }, { "epoch": 0.3577164089806972, "grad_norm": 0.44018128514289856, "learning_rate": 0.00029966365570651164, "loss": 0.8566, "step": 2840 }, { "epoch": 0.3583461913908745, "grad_norm": 0.4675704836845398, "learning_rate": 0.000299656258441122, "loss": 0.8164, "step": 2845 }, { "epoch": 0.35897597380105173, "grad_norm": 0.47553756833076477, "learning_rate": 0.0002996487808069031, "loss": 0.8177, "step": 2850 }, { "epoch": 0.35960575621122903, "grad_norm": 0.5298905372619629, "learning_rate": 0.00029964122280787053, "loss": 0.8537, "step": 2855 }, { "epoch": 0.3602355386214063, "grad_norm": 0.484838604927063, "learning_rate": 0.0002996335844480832, "loss": 0.8495, "step": 2860 }, { "epoch": 0.3608653210315836, "grad_norm": 0.4366026818752289, "learning_rate": 0.000299625865731643, "loss": 0.8073, "step": 2865 }, { "epoch": 0.3614951034417609, "grad_norm": 0.4988342225551605, "learning_rate": 0.00029961806666269503, "loss": 0.8127, "step": 2870 }, { "epoch": 0.36212488585193814, "grad_norm": 0.5028805732727051, "learning_rate": 0.00029961018724542767, "loss": 0.8711, "step": 2875 }, { "epoch": 0.36275466826211544, "grad_norm": 0.5009424686431885, "learning_rate": 0.00029960222748407226, "loss": 0.8015, "step": 2880 }, { "epoch": 0.36338445067229275, "grad_norm": 0.4522798955440521, "learning_rate": 0.00029959418738290344, "loss": 0.8261, "step": 2885 }, { "epoch": 0.36401423308247, "grad_norm": 0.49349725246429443, "learning_rate": 0.00029958606694623893, "loss": 0.8006, "step": 2890 }, { "epoch": 0.3646440154926473, "grad_norm": 0.46870625019073486, "learning_rate": 0.00029957786617843956, "loss": 0.8285, "step": 2895 }, { "epoch": 0.36527379790282455, "grad_norm": 0.5234463810920715, "learning_rate": 0.0002995695850839093, "loss": 0.8497, "step": 2900 }, { "epoch": 0.36590358031300185, "grad_norm": 0.487884521484375, "learning_rate": 0.0002995612236670953, "loss": 0.8033, "step": 2905 }, { "epoch": 0.36653336272317916, "grad_norm": 0.4760074317455292, "learning_rate": 0.0002995527819324879, "loss": 0.8412, "step": 2910 }, { "epoch": 0.3671631451333564, "grad_norm": 0.4630395472049713, "learning_rate": 0.0002995442598846205, "loss": 0.8244, "step": 2915 }, { "epoch": 0.3677929275435337, "grad_norm": 0.4981043040752411, "learning_rate": 0.0002995356575280695, "loss": 0.798, "step": 2920 }, { "epoch": 0.368422709953711, "grad_norm": 0.4630597233772278, "learning_rate": 0.00029952697486745466, "loss": 0.8032, "step": 2925 }, { "epoch": 0.36905249236388826, "grad_norm": 0.4962010979652405, "learning_rate": 0.00029951821190743884, "loss": 0.8183, "step": 2930 }, { "epoch": 0.36968227477406557, "grad_norm": 0.47193852066993713, "learning_rate": 0.00029950936865272775, "loss": 0.841, "step": 2935 }, { "epoch": 0.37031205718424287, "grad_norm": 0.4802277982234955, "learning_rate": 0.0002995004451080706, "loss": 0.7433, "step": 2940 }, { "epoch": 0.3709418395944201, "grad_norm": 0.43486830592155457, "learning_rate": 0.00029949144127825947, "loss": 0.8051, "step": 2945 }, { "epoch": 0.3715716220045974, "grad_norm": 0.5078021287918091, "learning_rate": 0.0002994823571681296, "loss": 0.8662, "step": 2950 }, { "epoch": 0.3722014044147747, "grad_norm": 0.44738146662712097, "learning_rate": 0.0002994731927825594, "loss": 0.7997, "step": 2955 }, { "epoch": 0.372831186824952, "grad_norm": 0.44323739409446716, "learning_rate": 0.0002994639481264704, "loss": 0.8481, "step": 2960 }, { "epoch": 0.3734609692351293, "grad_norm": 0.525050938129425, "learning_rate": 0.0002994546232048271, "loss": 0.8375, "step": 2965 }, { "epoch": 0.37409075164530653, "grad_norm": 0.4817000925540924, "learning_rate": 0.00029944521802263723, "loss": 0.8, "step": 2970 }, { "epoch": 0.37472053405548383, "grad_norm": 0.5709639191627502, "learning_rate": 0.00029943573258495165, "loss": 0.8104, "step": 2975 }, { "epoch": 0.37535031646566114, "grad_norm": 0.518618643283844, "learning_rate": 0.00029942616689686416, "loss": 0.7948, "step": 2980 }, { "epoch": 0.3759800988758384, "grad_norm": 0.4226647615432739, "learning_rate": 0.00029941652096351174, "loss": 0.7599, "step": 2985 }, { "epoch": 0.3766098812860157, "grad_norm": 0.4751405119895935, "learning_rate": 0.0002994067947900746, "loss": 0.8015, "step": 2990 }, { "epoch": 0.37723966369619294, "grad_norm": 0.4653600752353668, "learning_rate": 0.0002993969883817758, "loss": 0.7758, "step": 2995 }, { "epoch": 0.37786944610637024, "grad_norm": 0.512941837310791, "learning_rate": 0.00029938710174388163, "loss": 0.8188, "step": 3000 }, { "epoch": 0.37786944610637024, "eval_loss": 0.3438470661640167, "eval_runtime": 6.225, "eval_samples_per_second": 160.642, "eval_steps_per_second": 10.12, "step": 3000 }, { "epoch": 0.37849922851654755, "grad_norm": 0.43524855375289917, "learning_rate": 0.0002993771348817015, "loss": 0.803, "step": 3005 }, { "epoch": 0.3791290109267248, "grad_norm": 0.4569668173789978, "learning_rate": 0.0002993670878005878, "loss": 0.8777, "step": 3010 }, { "epoch": 0.3797587933369021, "grad_norm": 0.4643417000770569, "learning_rate": 0.00029935696050593604, "loss": 0.7621, "step": 3015 }, { "epoch": 0.3803885757470794, "grad_norm": 0.4604712128639221, "learning_rate": 0.00029934675300318485, "loss": 0.8216, "step": 3020 }, { "epoch": 0.38101835815725665, "grad_norm": 0.4307630956172943, "learning_rate": 0.0002993364652978158, "loss": 0.8163, "step": 3025 }, { "epoch": 0.38164814056743396, "grad_norm": 0.44455698132514954, "learning_rate": 0.00029932609739535365, "loss": 0.818, "step": 3030 }, { "epoch": 0.38227792297761126, "grad_norm": 0.43203669786453247, "learning_rate": 0.0002993156493013663, "loss": 0.8168, "step": 3035 }, { "epoch": 0.3829077053877885, "grad_norm": 0.42328670620918274, "learning_rate": 0.00029930512102146453, "loss": 0.8025, "step": 3040 }, { "epoch": 0.3835374877979658, "grad_norm": 0.43900108337402344, "learning_rate": 0.0002992945125613023, "loss": 0.7595, "step": 3045 }, { "epoch": 0.38416727020814306, "grad_norm": 0.46638986468315125, "learning_rate": 0.00029928382392657656, "loss": 0.8208, "step": 3050 }, { "epoch": 0.38479705261832037, "grad_norm": 0.4279174208641052, "learning_rate": 0.00029927305512302736, "loss": 0.8151, "step": 3055 }, { "epoch": 0.38542683502849767, "grad_norm": 0.4648323357105255, "learning_rate": 0.0002992622061564378, "loss": 0.7666, "step": 3060 }, { "epoch": 0.3860566174386749, "grad_norm": 0.45052894949913025, "learning_rate": 0.000299251277032634, "loss": 0.7995, "step": 3065 }, { "epoch": 0.3866863998488522, "grad_norm": 0.46262305974960327, "learning_rate": 0.0002992402677574852, "loss": 0.8175, "step": 3070 }, { "epoch": 0.38731618225902953, "grad_norm": 0.4934038519859314, "learning_rate": 0.00029922917833690365, "loss": 0.821, "step": 3075 }, { "epoch": 0.3879459646692068, "grad_norm": 0.46494096517562866, "learning_rate": 0.0002992180087768445, "loss": 0.8081, "step": 3080 }, { "epoch": 0.3885757470793841, "grad_norm": 0.9760459661483765, "learning_rate": 0.0002992067590833062, "loss": 0.7673, "step": 3085 }, { "epoch": 0.38920552948956133, "grad_norm": 0.7070348262786865, "learning_rate": 0.00029919542926233, "loss": 0.8017, "step": 3090 }, { "epoch": 0.38983531189973863, "grad_norm": 0.6773821711540222, "learning_rate": 0.00029918401932000027, "loss": 0.7946, "step": 3095 }, { "epoch": 0.39046509430991594, "grad_norm": 0.4955935478210449, "learning_rate": 0.0002991725292624445, "loss": 0.8431, "step": 3100 }, { "epoch": 0.3910948767200932, "grad_norm": 0.7728490829467773, "learning_rate": 0.000299160959095833, "loss": 0.8024, "step": 3105 }, { "epoch": 0.3917246591302705, "grad_norm": 0.6880044341087341, "learning_rate": 0.00029914930882637926, "loss": 0.788, "step": 3110 }, { "epoch": 0.3923544415404478, "grad_norm": 0.4916500747203827, "learning_rate": 0.0002991375784603398, "loss": 0.7878, "step": 3115 }, { "epoch": 0.39298422395062504, "grad_norm": 0.5188093781471252, "learning_rate": 0.00029912576800401403, "loss": 0.8404, "step": 3120 }, { "epoch": 0.39361400636080235, "grad_norm": 0.5084378123283386, "learning_rate": 0.0002991138774637444, "loss": 0.8277, "step": 3125 }, { "epoch": 0.39424378877097965, "grad_norm": 0.4139776825904846, "learning_rate": 0.0002991019068459165, "loss": 0.7672, "step": 3130 }, { "epoch": 0.3948735711811569, "grad_norm": 0.4756976366043091, "learning_rate": 0.0002990898561569588, "loss": 0.7936, "step": 3135 }, { "epoch": 0.3955033535913342, "grad_norm": 0.46053630113601685, "learning_rate": 0.0002990777254033427, "loss": 0.8102, "step": 3140 }, { "epoch": 0.39613313600151145, "grad_norm": 0.48546189069747925, "learning_rate": 0.00029906551459158283, "loss": 0.8184, "step": 3145 }, { "epoch": 0.39676291841168876, "grad_norm": 0.477192223072052, "learning_rate": 0.0002990532237282366, "loss": 0.828, "step": 3150 }, { "epoch": 0.39739270082186606, "grad_norm": 0.48900333046913147, "learning_rate": 0.00029904085281990447, "loss": 0.8183, "step": 3155 }, { "epoch": 0.3980224832320433, "grad_norm": 0.5019087791442871, "learning_rate": 0.0002990284018732299, "loss": 0.8002, "step": 3160 }, { "epoch": 0.3986522656422206, "grad_norm": 0.5127068758010864, "learning_rate": 0.0002990158708948994, "loss": 0.8088, "step": 3165 }, { "epoch": 0.3992820480523979, "grad_norm": 0.44172775745391846, "learning_rate": 0.00029900325989164233, "loss": 0.8013, "step": 3170 }, { "epoch": 0.39991183046257517, "grad_norm": 0.5318475961685181, "learning_rate": 0.0002989905688702311, "loss": 0.8239, "step": 3175 }, { "epoch": 0.4005416128727525, "grad_norm": 0.4257467985153198, "learning_rate": 0.0002989777978374811, "loss": 0.7714, "step": 3180 }, { "epoch": 0.4011713952829297, "grad_norm": 0.42196664214134216, "learning_rate": 0.0002989649468002506, "loss": 0.7987, "step": 3185 }, { "epoch": 0.401801177693107, "grad_norm": 0.47977736592292786, "learning_rate": 0.000298952015765441, "loss": 0.7699, "step": 3190 }, { "epoch": 0.40243096010328433, "grad_norm": 0.4841693639755249, "learning_rate": 0.0002989390047399965, "loss": 0.7916, "step": 3195 }, { "epoch": 0.4030607425134616, "grad_norm": 0.5104061961174011, "learning_rate": 0.0002989259137309043, "loss": 0.8244, "step": 3200 }, { "epoch": 0.4036905249236389, "grad_norm": 0.46594473719596863, "learning_rate": 0.00029891274274519464, "loss": 0.786, "step": 3205 }, { "epoch": 0.4043203073338162, "grad_norm": 0.4309998154640198, "learning_rate": 0.0002988994917899405, "loss": 0.8266, "step": 3210 }, { "epoch": 0.40495008974399344, "grad_norm": 0.4976588785648346, "learning_rate": 0.00029888616087225817, "loss": 0.7911, "step": 3215 }, { "epoch": 0.40557987215417074, "grad_norm": 0.47657066583633423, "learning_rate": 0.00029887274999930647, "loss": 0.7926, "step": 3220 }, { "epoch": 0.40620965456434804, "grad_norm": 0.42497026920318604, "learning_rate": 0.0002988592591782874, "loss": 0.7838, "step": 3225 }, { "epoch": 0.4068394369745253, "grad_norm": 0.4974801540374756, "learning_rate": 0.00029884568841644587, "loss": 0.7854, "step": 3230 }, { "epoch": 0.4074692193847026, "grad_norm": 0.43505486845970154, "learning_rate": 0.00029883203772106966, "loss": 0.8689, "step": 3235 }, { "epoch": 0.40809900179487985, "grad_norm": 0.5216085314750671, "learning_rate": 0.0002988183070994895, "loss": 0.8445, "step": 3240 }, { "epoch": 0.40872878420505715, "grad_norm": 0.5993830561637878, "learning_rate": 0.0002988044965590791, "loss": 0.7944, "step": 3245 }, { "epoch": 0.40935856661523445, "grad_norm": 0.7245651483535767, "learning_rate": 0.00029879060610725494, "loss": 0.8175, "step": 3250 }, { "epoch": 0.4099883490254117, "grad_norm": 0.4758714735507965, "learning_rate": 0.00029877663575147653, "loss": 0.7862, "step": 3255 }, { "epoch": 0.410618131435589, "grad_norm": 0.5264742970466614, "learning_rate": 0.0002987625854992464, "loss": 0.7625, "step": 3260 }, { "epoch": 0.4112479138457663, "grad_norm": 0.46857550740242004, "learning_rate": 0.0002987484553581097, "loss": 0.7878, "step": 3265 }, { "epoch": 0.41187769625594356, "grad_norm": 0.4588899314403534, "learning_rate": 0.0002987342453356547, "loss": 0.8435, "step": 3270 }, { "epoch": 0.41250747866612086, "grad_norm": 0.47005462646484375, "learning_rate": 0.0002987199554395125, "loss": 0.8343, "step": 3275 }, { "epoch": 0.4131372610762981, "grad_norm": 0.4855548143386841, "learning_rate": 0.00029870558567735716, "loss": 0.7944, "step": 3280 }, { "epoch": 0.4137670434864754, "grad_norm": 0.4832567572593689, "learning_rate": 0.00029869113605690545, "loss": 0.7999, "step": 3285 }, { "epoch": 0.4143968258966527, "grad_norm": 0.4483296573162079, "learning_rate": 0.00029867660658591724, "loss": 0.8074, "step": 3290 }, { "epoch": 0.41502660830682997, "grad_norm": 0.5084306001663208, "learning_rate": 0.00029866199727219514, "loss": 0.8173, "step": 3295 }, { "epoch": 0.4156563907170073, "grad_norm": 0.43247321248054504, "learning_rate": 0.00029864730812358473, "loss": 0.7904, "step": 3300 }, { "epoch": 0.4162861731271846, "grad_norm": 0.4278540313243866, "learning_rate": 0.0002986325391479744, "loss": 0.79, "step": 3305 }, { "epoch": 0.4169159555373618, "grad_norm": 0.4396720230579376, "learning_rate": 0.00029861769035329546, "loss": 0.7737, "step": 3310 }, { "epoch": 0.41754573794753913, "grad_norm": 0.4305702745914459, "learning_rate": 0.0002986027617475219, "loss": 0.8133, "step": 3315 }, { "epoch": 0.41817552035771643, "grad_norm": 0.4455117881298065, "learning_rate": 0.0002985877533386709, "loss": 0.7932, "step": 3320 }, { "epoch": 0.4188053027678937, "grad_norm": 0.45051881670951843, "learning_rate": 0.00029857266513480226, "loss": 0.8162, "step": 3325 }, { "epoch": 0.419435085178071, "grad_norm": 0.47537773847579956, "learning_rate": 0.0002985574971440187, "loss": 0.7931, "step": 3330 }, { "epoch": 0.42006486758824824, "grad_norm": 0.46828627586364746, "learning_rate": 0.0002985422493744657, "loss": 0.8399, "step": 3335 }, { "epoch": 0.42069464999842554, "grad_norm": 0.4528372585773468, "learning_rate": 0.00029852692183433176, "loss": 0.7821, "step": 3340 }, { "epoch": 0.42132443240860284, "grad_norm": 0.4476306736469269, "learning_rate": 0.00029851151453184807, "loss": 0.7986, "step": 3345 }, { "epoch": 0.4219542148187801, "grad_norm": 0.4092450439929962, "learning_rate": 0.00029849602747528874, "loss": 0.7827, "step": 3350 }, { "epoch": 0.4225839972289574, "grad_norm": 0.4776279330253601, "learning_rate": 0.00029848046067297064, "loss": 0.8269, "step": 3355 }, { "epoch": 0.4232137796391347, "grad_norm": 0.45867466926574707, "learning_rate": 0.00029846481413325346, "loss": 0.8094, "step": 3360 }, { "epoch": 0.42384356204931195, "grad_norm": 0.4665123522281647, "learning_rate": 0.00029844908786453986, "loss": 0.7288, "step": 3365 }, { "epoch": 0.42447334445948925, "grad_norm": 0.4820057451725006, "learning_rate": 0.0002984332818752751, "loss": 0.8012, "step": 3370 }, { "epoch": 0.4251031268696665, "grad_norm": 0.46213796734809875, "learning_rate": 0.00029841739617394737, "loss": 0.7693, "step": 3375 }, { "epoch": 0.4257329092798438, "grad_norm": 0.4329429864883423, "learning_rate": 0.0002984014307690878, "loss": 0.7215, "step": 3380 }, { "epoch": 0.4263626916900211, "grad_norm": 0.437621146440506, "learning_rate": 0.00029838538566926993, "loss": 0.7839, "step": 3385 }, { "epoch": 0.42699247410019836, "grad_norm": 0.4661789536476135, "learning_rate": 0.0002983692608831105, "loss": 0.7827, "step": 3390 }, { "epoch": 0.42762225651037566, "grad_norm": 0.4203425645828247, "learning_rate": 0.0002983530564192689, "loss": 0.8096, "step": 3395 }, { "epoch": 0.42825203892055297, "grad_norm": 0.4614803194999695, "learning_rate": 0.00029833677228644726, "loss": 0.8189, "step": 3400 }, { "epoch": 0.4288818213307302, "grad_norm": 0.4247860908508301, "learning_rate": 0.0002983204084933905, "loss": 0.8123, "step": 3405 }, { "epoch": 0.4295116037409075, "grad_norm": 0.4418291449546814, "learning_rate": 0.0002983039650488864, "loss": 0.8036, "step": 3410 }, { "epoch": 0.4301413861510848, "grad_norm": 0.46780282258987427, "learning_rate": 0.00029828744196176547, "loss": 0.8122, "step": 3415 }, { "epoch": 0.4307711685612621, "grad_norm": 0.44973024725914, "learning_rate": 0.0002982708392409009, "loss": 0.7813, "step": 3420 }, { "epoch": 0.4314009509714394, "grad_norm": 0.3922254741191864, "learning_rate": 0.00029825415689520887, "loss": 0.7809, "step": 3425 }, { "epoch": 0.4320307333816166, "grad_norm": 0.391084223985672, "learning_rate": 0.00029823739493364804, "loss": 0.7757, "step": 3430 }, { "epoch": 0.43266051579179393, "grad_norm": 0.4502919316291809, "learning_rate": 0.00029822055336522005, "loss": 0.7688, "step": 3435 }, { "epoch": 0.43329029820197124, "grad_norm": 0.475436270236969, "learning_rate": 0.0002982036321989692, "loss": 0.7667, "step": 3440 }, { "epoch": 0.4339200806121485, "grad_norm": 0.42362409830093384, "learning_rate": 0.00029818663144398253, "loss": 0.8098, "step": 3445 }, { "epoch": 0.4345498630223258, "grad_norm": 0.45329517126083374, "learning_rate": 0.0002981695511093898, "loss": 0.7706, "step": 3450 }, { "epoch": 0.4351796454325031, "grad_norm": 0.4297536313533783, "learning_rate": 0.00029815239120436365, "loss": 0.808, "step": 3455 }, { "epoch": 0.43580942784268034, "grad_norm": 0.4800092577934265, "learning_rate": 0.0002981351517381192, "loss": 0.7973, "step": 3460 }, { "epoch": 0.43643921025285765, "grad_norm": 0.5014777779579163, "learning_rate": 0.00029811783271991454, "loss": 0.8098, "step": 3465 }, { "epoch": 0.4370689926630349, "grad_norm": 0.4412321448326111, "learning_rate": 0.00029810043415905027, "loss": 0.7669, "step": 3470 }, { "epoch": 0.4376987750732122, "grad_norm": 0.4491146206855774, "learning_rate": 0.00029808295606486993, "loss": 0.7599, "step": 3475 }, { "epoch": 0.4383285574833895, "grad_norm": 0.42482897639274597, "learning_rate": 0.0002980653984467596, "loss": 0.7501, "step": 3480 }, { "epoch": 0.43895833989356675, "grad_norm": 0.4166581332683563, "learning_rate": 0.0002980477613141482, "loss": 0.7807, "step": 3485 }, { "epoch": 0.43958812230374406, "grad_norm": 0.48076120018959045, "learning_rate": 0.0002980300446765071, "loss": 0.821, "step": 3490 }, { "epoch": 0.44021790471392136, "grad_norm": 0.4148639142513275, "learning_rate": 0.00029801224854335073, "loss": 0.781, "step": 3495 }, { "epoch": 0.4408476871240986, "grad_norm": 0.41731131076812744, "learning_rate": 0.00029799437292423586, "loss": 0.7784, "step": 3500 }, { "epoch": 0.4414774695342759, "grad_norm": 0.4514264762401581, "learning_rate": 0.00029797641782876224, "loss": 0.8066, "step": 3505 }, { "epoch": 0.4421072519444532, "grad_norm": 0.44717252254486084, "learning_rate": 0.00029795838326657204, "loss": 0.7761, "step": 3510 }, { "epoch": 0.44273703435463047, "grad_norm": 0.42850586771965027, "learning_rate": 0.00029794026924735034, "loss": 0.783, "step": 3515 }, { "epoch": 0.44336681676480777, "grad_norm": 0.7937319278717041, "learning_rate": 0.00029792207578082476, "loss": 0.7894, "step": 3520 }, { "epoch": 0.443996599174985, "grad_norm": 0.4401470124721527, "learning_rate": 0.0002979038028767656, "loss": 0.8046, "step": 3525 }, { "epoch": 0.4446263815851623, "grad_norm": 0.45515474677085876, "learning_rate": 0.00029788545054498577, "loss": 0.8095, "step": 3530 }, { "epoch": 0.4452561639953396, "grad_norm": 0.4676735997200012, "learning_rate": 0.00029786701879534093, "loss": 0.7969, "step": 3535 }, { "epoch": 0.4458859464055169, "grad_norm": 0.42322975397109985, "learning_rate": 0.0002978485076377294, "loss": 0.8336, "step": 3540 }, { "epoch": 0.4465157288156942, "grad_norm": 0.4256497025489807, "learning_rate": 0.000297829917082092, "loss": 0.7773, "step": 3545 }, { "epoch": 0.4471455112258715, "grad_norm": 0.40527772903442383, "learning_rate": 0.00029781124713841237, "loss": 0.8058, "step": 3550 }, { "epoch": 0.44777529363604873, "grad_norm": 0.4047418534755707, "learning_rate": 0.0002977924978167166, "loss": 0.7769, "step": 3555 }, { "epoch": 0.44840507604622604, "grad_norm": 0.4016299545764923, "learning_rate": 0.00029777366912707366, "loss": 0.7531, "step": 3560 }, { "epoch": 0.4490348584564033, "grad_norm": 0.4176371395587921, "learning_rate": 0.00029775476107959486, "loss": 0.7865, "step": 3565 }, { "epoch": 0.4496646408665806, "grad_norm": 0.44107553362846375, "learning_rate": 0.00029773577368443426, "loss": 0.7735, "step": 3570 }, { "epoch": 0.4502944232767579, "grad_norm": 0.43897122144699097, "learning_rate": 0.00029771670695178857, "loss": 0.7715, "step": 3575 }, { "epoch": 0.45092420568693514, "grad_norm": 0.4024025499820709, "learning_rate": 0.000297697560891897, "loss": 0.7442, "step": 3580 }, { "epoch": 0.45155398809711245, "grad_norm": 0.4896734356880188, "learning_rate": 0.0002976783355150415, "loss": 0.829, "step": 3585 }, { "epoch": 0.45218377050728975, "grad_norm": 0.47621142864227295, "learning_rate": 0.0002976590308315465, "loss": 0.7915, "step": 3590 }, { "epoch": 0.452813552917467, "grad_norm": 0.3869519829750061, "learning_rate": 0.00029763964685177905, "loss": 0.7696, "step": 3595 }, { "epoch": 0.4534433353276443, "grad_norm": 0.4245865046977997, "learning_rate": 0.0002976201835861488, "loss": 0.7281, "step": 3600 }, { "epoch": 0.4540731177378216, "grad_norm": 0.40193241834640503, "learning_rate": 0.0002976006410451079, "loss": 0.7435, "step": 3605 }, { "epoch": 0.45470290014799886, "grad_norm": 0.4142551124095917, "learning_rate": 0.00029758101923915123, "loss": 0.7627, "step": 3610 }, { "epoch": 0.45533268255817616, "grad_norm": 0.4214671552181244, "learning_rate": 0.0002975613181788162, "loss": 0.8084, "step": 3615 }, { "epoch": 0.4559624649683534, "grad_norm": 0.46768540143966675, "learning_rate": 0.0002975415378746826, "loss": 0.7502, "step": 3620 }, { "epoch": 0.4565922473785307, "grad_norm": 0.43812718987464905, "learning_rate": 0.00029752167833737295, "loss": 0.7555, "step": 3625 }, { "epoch": 0.457222029788708, "grad_norm": 0.44065701961517334, "learning_rate": 0.00029750173957755223, "loss": 0.7824, "step": 3630 }, { "epoch": 0.45785181219888527, "grad_norm": 0.46632614731788635, "learning_rate": 0.00029748172160592816, "loss": 0.7787, "step": 3635 }, { "epoch": 0.45848159460906257, "grad_norm": 0.3616549074649811, "learning_rate": 0.00029746162443325066, "loss": 0.766, "step": 3640 }, { "epoch": 0.4591113770192399, "grad_norm": 0.39852583408355713, "learning_rate": 0.00029744144807031253, "loss": 0.7318, "step": 3645 }, { "epoch": 0.4597411594294171, "grad_norm": 0.4082608222961426, "learning_rate": 0.0002974211925279488, "loss": 0.7726, "step": 3650 }, { "epoch": 0.4603709418395944, "grad_norm": 0.4750503599643707, "learning_rate": 0.00029740085781703726, "loss": 0.7953, "step": 3655 }, { "epoch": 0.4610007242497717, "grad_norm": 0.439531534910202, "learning_rate": 0.0002973804439484981, "loss": 0.788, "step": 3660 }, { "epoch": 0.461630506659949, "grad_norm": 0.41563692688941956, "learning_rate": 0.000297359950933294, "loss": 0.7935, "step": 3665 }, { "epoch": 0.4622602890701263, "grad_norm": 0.4645535945892334, "learning_rate": 0.00029733937878243015, "loss": 0.7716, "step": 3670 }, { "epoch": 0.46289007148030353, "grad_norm": 0.4334595501422882, "learning_rate": 0.0002973187275069544, "loss": 0.7455, "step": 3675 }, { "epoch": 0.46351985389048084, "grad_norm": 0.4452027678489685, "learning_rate": 0.0002972979971179568, "loss": 0.7533, "step": 3680 }, { "epoch": 0.46414963630065814, "grad_norm": 0.4289001226425171, "learning_rate": 0.0002972771876265701, "loss": 0.8066, "step": 3685 }, { "epoch": 0.4647794187108354, "grad_norm": 0.44446882605552673, "learning_rate": 0.0002972562990439694, "loss": 0.8017, "step": 3690 }, { "epoch": 0.4654092011210127, "grad_norm": 0.4466266930103302, "learning_rate": 0.00029723533138137256, "loss": 0.7686, "step": 3695 }, { "epoch": 0.46603898353119, "grad_norm": 0.44262874126434326, "learning_rate": 0.0002972142846500395, "loss": 0.7835, "step": 3700 }, { "epoch": 0.46666876594136725, "grad_norm": 0.40932217240333557, "learning_rate": 0.0002971931588612729, "loss": 0.7844, "step": 3705 }, { "epoch": 0.46729854835154455, "grad_norm": 0.38220685720443726, "learning_rate": 0.0002971719540264177, "loss": 0.7682, "step": 3710 }, { "epoch": 0.4679283307617218, "grad_norm": 0.4890231788158417, "learning_rate": 0.0002971506701568614, "loss": 0.7883, "step": 3715 }, { "epoch": 0.4685581131718991, "grad_norm": 0.44211700558662415, "learning_rate": 0.00029712930726403397, "loss": 0.7287, "step": 3720 }, { "epoch": 0.4691878955820764, "grad_norm": 0.4585074782371521, "learning_rate": 0.0002971078653594078, "loss": 0.7452, "step": 3725 }, { "epoch": 0.46981767799225366, "grad_norm": 0.3818283975124359, "learning_rate": 0.00029708634445449754, "loss": 0.751, "step": 3730 }, { "epoch": 0.47044746040243096, "grad_norm": 0.4351007640361786, "learning_rate": 0.00029706474456086054, "loss": 0.7665, "step": 3735 }, { "epoch": 0.47107724281260827, "grad_norm": 0.4167363941669464, "learning_rate": 0.0002970430656900964, "loss": 0.7421, "step": 3740 }, { "epoch": 0.4717070252227855, "grad_norm": 0.40461474657058716, "learning_rate": 0.0002970213078538472, "loss": 0.7496, "step": 3745 }, { "epoch": 0.4723368076329628, "grad_norm": 0.38994473218917847, "learning_rate": 0.00029699947106379734, "loss": 0.773, "step": 3750 }, { "epoch": 0.4729665900431401, "grad_norm": 0.42335331439971924, "learning_rate": 0.0002969775553316737, "loss": 0.7496, "step": 3755 }, { "epoch": 0.47359637245331737, "grad_norm": 0.39755743741989136, "learning_rate": 0.0002969555606692455, "loss": 0.7794, "step": 3760 }, { "epoch": 0.4742261548634947, "grad_norm": 0.4671246409416199, "learning_rate": 0.0002969334870883244, "loss": 0.8289, "step": 3765 }, { "epoch": 0.4748559372736719, "grad_norm": 0.4498395621776581, "learning_rate": 0.00029691133460076443, "loss": 0.7856, "step": 3770 }, { "epoch": 0.47548571968384923, "grad_norm": 0.4068240225315094, "learning_rate": 0.00029688910321846193, "loss": 0.7572, "step": 3775 }, { "epoch": 0.47611550209402653, "grad_norm": 0.4673171043395996, "learning_rate": 0.0002968667929533557, "loss": 0.7972, "step": 3780 }, { "epoch": 0.4767452845042038, "grad_norm": 0.4210684597492218, "learning_rate": 0.00029684440381742697, "loss": 0.7566, "step": 3785 }, { "epoch": 0.4773750669143811, "grad_norm": 0.4167214632034302, "learning_rate": 0.000296821935822699, "loss": 0.7383, "step": 3790 }, { "epoch": 0.4780048493245584, "grad_norm": 0.3826481103897095, "learning_rate": 0.0002967993889812378, "loss": 0.7749, "step": 3795 }, { "epoch": 0.47863463173473564, "grad_norm": 0.416202187538147, "learning_rate": 0.0002967767633051514, "loss": 0.7755, "step": 3800 }, { "epoch": 0.47926441414491294, "grad_norm": 0.38089507818222046, "learning_rate": 0.0002967540588065904, "loss": 0.7813, "step": 3805 }, { "epoch": 0.4798941965550902, "grad_norm": 0.5257447957992554, "learning_rate": 0.0002967312754977476, "loss": 0.7408, "step": 3810 }, { "epoch": 0.4805239789652675, "grad_norm": 0.4472402334213257, "learning_rate": 0.00029670841339085813, "loss": 0.7946, "step": 3815 }, { "epoch": 0.4811537613754448, "grad_norm": 0.39956340193748474, "learning_rate": 0.00029668547249819957, "loss": 0.7469, "step": 3820 }, { "epoch": 0.48178354378562205, "grad_norm": 0.41508588194847107, "learning_rate": 0.00029666245283209154, "loss": 0.7328, "step": 3825 }, { "epoch": 0.48241332619579935, "grad_norm": 0.3888874053955078, "learning_rate": 0.00029663935440489624, "loss": 0.7529, "step": 3830 }, { "epoch": 0.48304310860597666, "grad_norm": 0.39619430899620056, "learning_rate": 0.00029661617722901806, "loss": 0.7406, "step": 3835 }, { "epoch": 0.4836728910161539, "grad_norm": 0.38242536783218384, "learning_rate": 0.0002965929213169036, "loss": 0.7355, "step": 3840 }, { "epoch": 0.4843026734263312, "grad_norm": 0.37065988779067993, "learning_rate": 0.0002965695866810419, "loss": 0.7087, "step": 3845 }, { "epoch": 0.4849324558365085, "grad_norm": 0.4015233814716339, "learning_rate": 0.0002965461733339641, "loss": 0.7531, "step": 3850 }, { "epoch": 0.48556223824668576, "grad_norm": 0.4394242465496063, "learning_rate": 0.0002965226812882438, "loss": 0.7619, "step": 3855 }, { "epoch": 0.48619202065686307, "grad_norm": 0.39809074997901917, "learning_rate": 0.00029649911055649666, "loss": 0.7702, "step": 3860 }, { "epoch": 0.4868218030670403, "grad_norm": 0.5184118747711182, "learning_rate": 0.0002964754611513808, "loss": 0.7931, "step": 3865 }, { "epoch": 0.4874515854772176, "grad_norm": 0.5002544522285461, "learning_rate": 0.00029645173308559644, "loss": 0.7989, "step": 3870 }, { "epoch": 0.4880813678873949, "grad_norm": 0.551458477973938, "learning_rate": 0.0002964279263718861, "loss": 0.7345, "step": 3875 }, { "epoch": 0.48871115029757217, "grad_norm": 0.499612420797348, "learning_rate": 0.0002964040410230345, "loss": 0.7885, "step": 3880 }, { "epoch": 0.4893409327077495, "grad_norm": 0.5279458165168762, "learning_rate": 0.0002963800770518687, "loss": 0.795, "step": 3885 }, { "epoch": 0.4899707151179268, "grad_norm": 0.47077476978302, "learning_rate": 0.0002963560344712578, "loss": 0.7716, "step": 3890 }, { "epoch": 0.49060049752810403, "grad_norm": 0.8377729058265686, "learning_rate": 0.0002963319132941133, "loss": 0.7625, "step": 3895 }, { "epoch": 0.49123027993828133, "grad_norm": 0.43871793150901794, "learning_rate": 0.0002963077135333888, "loss": 0.734, "step": 3900 }, { "epoch": 0.4918600623484586, "grad_norm": 0.44589656591415405, "learning_rate": 0.00029628343520208004, "loss": 0.7735, "step": 3905 }, { "epoch": 0.4924898447586359, "grad_norm": 0.7113938927650452, "learning_rate": 0.00029625907831322515, "loss": 0.7611, "step": 3910 }, { "epoch": 0.4931196271688132, "grad_norm": 0.3830680847167969, "learning_rate": 0.0002962346428799043, "loss": 0.7399, "step": 3915 }, { "epoch": 0.49374940957899044, "grad_norm": 0.4787169396877289, "learning_rate": 0.00029621012891523985, "loss": 0.7572, "step": 3920 }, { "epoch": 0.49437919198916774, "grad_norm": 0.428469717502594, "learning_rate": 0.0002961855364323964, "loss": 0.7548, "step": 3925 }, { "epoch": 0.49500897439934505, "grad_norm": 0.3982272148132324, "learning_rate": 0.00029616086544458065, "loss": 0.7846, "step": 3930 }, { "epoch": 0.4956387568095223, "grad_norm": 0.45961281657218933, "learning_rate": 0.00029613611596504146, "loss": 0.8041, "step": 3935 }, { "epoch": 0.4962685392196996, "grad_norm": 0.400146484375, "learning_rate": 0.00029611128800706996, "loss": 0.7395, "step": 3940 }, { "epoch": 0.4968983216298769, "grad_norm": 0.3984740674495697, "learning_rate": 0.00029608638158399925, "loss": 0.7569, "step": 3945 }, { "epoch": 0.49752810404005415, "grad_norm": 0.4343632161617279, "learning_rate": 0.0002960613967092046, "loss": 0.7958, "step": 3950 }, { "epoch": 0.49815788645023146, "grad_norm": 0.4569413363933563, "learning_rate": 0.0002960363333961036, "loss": 0.7673, "step": 3955 }, { "epoch": 0.4987876688604087, "grad_norm": 0.4818612039089203, "learning_rate": 0.0002960111916581557, "loss": 0.7621, "step": 3960 }, { "epoch": 0.499417451270586, "grad_norm": 0.4524887800216675, "learning_rate": 0.0002959859715088626, "loss": 0.7793, "step": 3965 }, { "epoch": 0.5000472336807633, "grad_norm": 0.4414517283439636, "learning_rate": 0.0002959606729617682, "loss": 0.7557, "step": 3970 }, { "epoch": 0.5006770160909406, "grad_norm": 0.3883852958679199, "learning_rate": 0.0002959352960304583, "loss": 0.7146, "step": 3975 }, { "epoch": 0.5013067985011178, "grad_norm": 0.34819212555885315, "learning_rate": 0.00029590984072856084, "loss": 0.7271, "step": 3980 }, { "epoch": 0.5019365809112951, "grad_norm": 0.3943585753440857, "learning_rate": 0.0002958843070697461, "loss": 0.817, "step": 3985 }, { "epoch": 0.5025663633214724, "grad_norm": 0.3881372809410095, "learning_rate": 0.000295858695067726, "loss": 0.6997, "step": 3990 }, { "epoch": 0.5031961457316497, "grad_norm": 0.4077765941619873, "learning_rate": 0.00029583300473625497, "loss": 0.789, "step": 3995 }, { "epoch": 0.503825928141827, "grad_norm": 0.4519467353820801, "learning_rate": 0.0002958072360891292, "loss": 0.7081, "step": 4000 }, { "epoch": 0.503825928141827, "eval_loss": 0.3237769305706024, "eval_runtime": 6.2328, "eval_samples_per_second": 160.442, "eval_steps_per_second": 10.108, "step": 4000 }, { "epoch": 0.5044557105520043, "grad_norm": 0.4161011278629303, "learning_rate": 0.00029578138914018704, "loss": 0.7426, "step": 4005 }, { "epoch": 0.5050854929621815, "grad_norm": 0.4170863926410675, "learning_rate": 0.0002957554639033089, "loss": 0.7614, "step": 4010 }, { "epoch": 0.5057152753723588, "grad_norm": 0.41827666759490967, "learning_rate": 0.0002957294603924172, "loss": 0.7339, "step": 4015 }, { "epoch": 0.5063450577825361, "grad_norm": 0.4575699269771576, "learning_rate": 0.0002957033786214766, "loss": 0.7506, "step": 4020 }, { "epoch": 0.5069748401927134, "grad_norm": 0.39499175548553467, "learning_rate": 0.00029567721860449333, "loss": 0.7227, "step": 4025 }, { "epoch": 0.5076046226028907, "grad_norm": 0.4190625548362732, "learning_rate": 0.00029565098035551606, "loss": 0.7375, "step": 4030 }, { "epoch": 0.5082344050130679, "grad_norm": 0.41470181941986084, "learning_rate": 0.00029562466388863534, "loss": 0.7953, "step": 4035 }, { "epoch": 0.5088641874232452, "grad_norm": 0.5376180410385132, "learning_rate": 0.00029559826921798373, "loss": 0.7927, "step": 4040 }, { "epoch": 0.5094939698334225, "grad_norm": 0.41073331236839294, "learning_rate": 0.0002955717963577357, "loss": 0.7175, "step": 4045 }, { "epoch": 0.5101237522435998, "grad_norm": 0.3894195854663849, "learning_rate": 0.0002955452453221078, "loss": 0.743, "step": 4050 }, { "epoch": 0.5107535346537772, "grad_norm": 0.37404918670654297, "learning_rate": 0.00029551861612535856, "loss": 0.6833, "step": 4055 }, { "epoch": 0.5113833170639545, "grad_norm": 0.4107655882835388, "learning_rate": 0.0002954919087817885, "loss": 0.7588, "step": 4060 }, { "epoch": 0.5120130994741316, "grad_norm": 0.4086790382862091, "learning_rate": 0.00029546512330574004, "loss": 0.7328, "step": 4065 }, { "epoch": 0.512642881884309, "grad_norm": 0.4246830344200134, "learning_rate": 0.0002954382597115976, "loss": 0.7171, "step": 4070 }, { "epoch": 0.5132726642944863, "grad_norm": 0.4021676182746887, "learning_rate": 0.00029541131801378743, "loss": 0.8009, "step": 4075 }, { "epoch": 0.5139024467046636, "grad_norm": 0.42611315846443176, "learning_rate": 0.00029538429822677806, "loss": 0.7338, "step": 4080 }, { "epoch": 0.5145322291148409, "grad_norm": 0.40971845388412476, "learning_rate": 0.0002953572003650795, "loss": 0.7883, "step": 4085 }, { "epoch": 0.5151620115250181, "grad_norm": 0.4226076304912567, "learning_rate": 0.0002953300244432441, "loss": 0.7696, "step": 4090 }, { "epoch": 0.5157917939351954, "grad_norm": 0.4504645764827728, "learning_rate": 0.0002953027704758659, "loss": 0.7123, "step": 4095 }, { "epoch": 0.5164215763453727, "grad_norm": 0.4032662510871887, "learning_rate": 0.00029527543847758086, "loss": 0.6786, "step": 4100 }, { "epoch": 0.51705135875555, "grad_norm": 0.4030795097351074, "learning_rate": 0.00029524802846306694, "loss": 0.7335, "step": 4105 }, { "epoch": 0.5176811411657273, "grad_norm": 0.4887290596961975, "learning_rate": 0.0002952205404470439, "loss": 0.7238, "step": 4110 }, { "epoch": 0.5183109235759045, "grad_norm": 0.4061615467071533, "learning_rate": 0.00029519297444427343, "loss": 0.7733, "step": 4115 }, { "epoch": 0.5189407059860818, "grad_norm": 0.4060840308666229, "learning_rate": 0.00029516533046955917, "loss": 0.7268, "step": 4120 }, { "epoch": 0.5195704883962591, "grad_norm": 0.4293743371963501, "learning_rate": 0.0002951376085377465, "loss": 0.7234, "step": 4125 }, { "epoch": 0.5202002708064364, "grad_norm": 0.410264790058136, "learning_rate": 0.00029510980866372273, "loss": 0.774, "step": 4130 }, { "epoch": 0.5208300532166137, "grad_norm": 0.3944232761859894, "learning_rate": 0.0002950819308624171, "loss": 0.7517, "step": 4135 }, { "epoch": 0.521459835626791, "grad_norm": 0.4245687425136566, "learning_rate": 0.0002950539751488005, "loss": 0.7612, "step": 4140 }, { "epoch": 0.5220896180369682, "grad_norm": 0.3795585632324219, "learning_rate": 0.00029502594153788593, "loss": 0.7778, "step": 4145 }, { "epoch": 0.5227194004471455, "grad_norm": 0.42911338806152344, "learning_rate": 0.000294997830044728, "loss": 0.8033, "step": 4150 }, { "epoch": 0.5233491828573228, "grad_norm": 0.4150259494781494, "learning_rate": 0.0002949696406844232, "loss": 0.7326, "step": 4155 }, { "epoch": 0.5239789652675001, "grad_norm": 0.3846616744995117, "learning_rate": 0.0002949413734721099, "loss": 0.7085, "step": 4160 }, { "epoch": 0.5246087476776774, "grad_norm": 0.3434165418148041, "learning_rate": 0.00029491302842296824, "loss": 0.711, "step": 4165 }, { "epoch": 0.5252385300878546, "grad_norm": 0.33985382318496704, "learning_rate": 0.0002948846055522202, "loss": 0.7493, "step": 4170 }, { "epoch": 0.5258683124980319, "grad_norm": 0.3809979259967804, "learning_rate": 0.0002948561048751294, "loss": 0.7224, "step": 4175 }, { "epoch": 0.5264980949082092, "grad_norm": 0.45042338967323303, "learning_rate": 0.00029482752640700143, "loss": 0.7554, "step": 4180 }, { "epoch": 0.5271278773183865, "grad_norm": 0.4068913757801056, "learning_rate": 0.00029479887016318357, "loss": 0.7267, "step": 4185 }, { "epoch": 0.5277576597285638, "grad_norm": 0.41964098811149597, "learning_rate": 0.0002947701361590649, "loss": 0.7255, "step": 4190 }, { "epoch": 0.5283874421387411, "grad_norm": 0.3956906795501709, "learning_rate": 0.0002947413244100762, "loss": 0.7272, "step": 4195 }, { "epoch": 0.5290172245489183, "grad_norm": 0.41254857182502747, "learning_rate": 0.0002947124349316901, "loss": 0.7155, "step": 4200 }, { "epoch": 0.5296470069590956, "grad_norm": 0.4162386655807495, "learning_rate": 0.0002946834677394208, "loss": 0.7729, "step": 4205 }, { "epoch": 0.5302767893692729, "grad_norm": 0.4521070420742035, "learning_rate": 0.00029465442284882436, "loss": 0.7328, "step": 4210 }, { "epoch": 0.5309065717794502, "grad_norm": 0.3702057898044586, "learning_rate": 0.00029462530027549866, "loss": 0.7592, "step": 4215 }, { "epoch": 0.5315363541896275, "grad_norm": 0.4132764935493469, "learning_rate": 0.00029459610003508313, "loss": 0.7238, "step": 4220 }, { "epoch": 0.5321661365998047, "grad_norm": 0.3817763328552246, "learning_rate": 0.0002945668221432589, "loss": 0.7524, "step": 4225 }, { "epoch": 0.532795919009982, "grad_norm": 0.41137659549713135, "learning_rate": 0.000294537466615749, "loss": 0.7405, "step": 4230 }, { "epoch": 0.5334257014201593, "grad_norm": 0.446150541305542, "learning_rate": 0.00029450803346831787, "loss": 0.7481, "step": 4235 }, { "epoch": 0.5340554838303366, "grad_norm": 0.37535202503204346, "learning_rate": 0.0002944785227167719, "loss": 0.7505, "step": 4240 }, { "epoch": 0.5346852662405139, "grad_norm": 0.4109747111797333, "learning_rate": 0.000294448934376959, "loss": 0.7406, "step": 4245 }, { "epoch": 0.5353150486506912, "grad_norm": 0.4233269989490509, "learning_rate": 0.00029441926846476873, "loss": 0.7823, "step": 4250 }, { "epoch": 0.5359448310608684, "grad_norm": 0.40127456188201904, "learning_rate": 0.00029438952499613244, "loss": 0.7486, "step": 4255 }, { "epoch": 0.5365746134710457, "grad_norm": 0.40279653668403625, "learning_rate": 0.000294359703987023, "loss": 0.7157, "step": 4260 }, { "epoch": 0.537204395881223, "grad_norm": 0.34208250045776367, "learning_rate": 0.000294329805453455, "loss": 0.7158, "step": 4265 }, { "epoch": 0.5378341782914003, "grad_norm": 0.41574689745903015, "learning_rate": 0.0002942998294114846, "loss": 0.7668, "step": 4270 }, { "epoch": 0.5384639607015776, "grad_norm": 0.401426762342453, "learning_rate": 0.0002942697758772097, "loss": 0.734, "step": 4275 }, { "epoch": 0.5390937431117548, "grad_norm": 0.4085477292537689, "learning_rate": 0.00029423964486676964, "loss": 0.7448, "step": 4280 }, { "epoch": 0.5397235255219321, "grad_norm": 0.43037959933280945, "learning_rate": 0.0002942094363963456, "loss": 0.7618, "step": 4285 }, { "epoch": 0.5403533079321095, "grad_norm": 0.34685570001602173, "learning_rate": 0.00029417915048216003, "loss": 0.7314, "step": 4290 }, { "epoch": 0.5409830903422868, "grad_norm": 0.3967381417751312, "learning_rate": 0.00029414878714047725, "loss": 0.7465, "step": 4295 }, { "epoch": 0.5416128727524641, "grad_norm": 0.36378154158592224, "learning_rate": 0.0002941183463876031, "loss": 0.7372, "step": 4300 }, { "epoch": 0.5422426551626414, "grad_norm": 0.3804253339767456, "learning_rate": 0.00029408782823988494, "loss": 0.7488, "step": 4305 }, { "epoch": 0.5428724375728186, "grad_norm": 0.3679543137550354, "learning_rate": 0.00029405723271371166, "loss": 0.7253, "step": 4310 }, { "epoch": 0.5435022199829959, "grad_norm": 0.35688257217407227, "learning_rate": 0.0002940265598255138, "loss": 0.7523, "step": 4315 }, { "epoch": 0.5441320023931732, "grad_norm": 0.40890881419181824, "learning_rate": 0.00029399580959176344, "loss": 0.756, "step": 4320 }, { "epoch": 0.5447617848033505, "grad_norm": 0.478547602891922, "learning_rate": 0.00029396498202897406, "loss": 0.7249, "step": 4325 }, { "epoch": 0.5453915672135278, "grad_norm": 0.40117356181144714, "learning_rate": 0.0002939340771537009, "loss": 0.7466, "step": 4330 }, { "epoch": 0.546021349623705, "grad_norm": 0.42868953943252563, "learning_rate": 0.0002939030949825404, "loss": 0.7894, "step": 4335 }, { "epoch": 0.5466511320338823, "grad_norm": 0.41796940565109253, "learning_rate": 0.0002938720355321309, "loss": 0.7446, "step": 4340 }, { "epoch": 0.5472809144440596, "grad_norm": 0.427336186170578, "learning_rate": 0.0002938408988191519, "loss": 0.7824, "step": 4345 }, { "epoch": 0.5479106968542369, "grad_norm": 0.38179048895835876, "learning_rate": 0.00029380968486032456, "loss": 0.7427, "step": 4350 }, { "epoch": 0.5485404792644142, "grad_norm": 0.39974477887153625, "learning_rate": 0.0002937783936724115, "loss": 0.7347, "step": 4355 }, { "epoch": 0.5491702616745914, "grad_norm": 0.3805896043777466, "learning_rate": 0.00029374702527221674, "loss": 0.7547, "step": 4360 }, { "epoch": 0.5498000440847687, "grad_norm": 0.43362486362457275, "learning_rate": 0.0002937155796765859, "loss": 0.7651, "step": 4365 }, { "epoch": 0.550429826494946, "grad_norm": 0.38877996802330017, "learning_rate": 0.000293684056902406, "loss": 0.7054, "step": 4370 }, { "epoch": 0.5510596089051233, "grad_norm": 0.393184095621109, "learning_rate": 0.00029365245696660544, "loss": 0.7453, "step": 4375 }, { "epoch": 0.5516893913153006, "grad_norm": 0.3892836570739746, "learning_rate": 0.0002936207798861541, "loss": 0.7036, "step": 4380 }, { "epoch": 0.5523191737254779, "grad_norm": 0.3737259805202484, "learning_rate": 0.0002935890256780633, "loss": 0.7403, "step": 4385 }, { "epoch": 0.5529489561356551, "grad_norm": 0.36731937527656555, "learning_rate": 0.00029355719435938585, "loss": 0.7098, "step": 4390 }, { "epoch": 0.5535787385458324, "grad_norm": 0.40238016843795776, "learning_rate": 0.00029352528594721577, "loss": 0.7625, "step": 4395 }, { "epoch": 0.5542085209560097, "grad_norm": 0.3878697454929352, "learning_rate": 0.0002934933004586887, "loss": 0.7486, "step": 4400 }, { "epoch": 0.554838303366187, "grad_norm": 0.36463412642478943, "learning_rate": 0.00029346123791098157, "loss": 0.7489, "step": 4405 }, { "epoch": 0.5554680857763643, "grad_norm": 0.3860667049884796, "learning_rate": 0.0002934290983213126, "loss": 0.7503, "step": 4410 }, { "epoch": 0.5560978681865415, "grad_norm": 0.40702390670776367, "learning_rate": 0.0002933968817069417, "loss": 0.6892, "step": 4415 }, { "epoch": 0.5567276505967188, "grad_norm": 0.4769366979598999, "learning_rate": 0.0002933645880851697, "loss": 0.7285, "step": 4420 }, { "epoch": 0.5573574330068961, "grad_norm": 0.37400034070014954, "learning_rate": 0.00029333221747333913, "loss": 0.7055, "step": 4425 }, { "epoch": 0.5579872154170734, "grad_norm": 0.4280668795108795, "learning_rate": 0.00029329976988883374, "loss": 0.7629, "step": 4430 }, { "epoch": 0.5586169978272507, "grad_norm": 0.3710954189300537, "learning_rate": 0.00029326724534907856, "loss": 0.696, "step": 4435 }, { "epoch": 0.559246780237428, "grad_norm": 0.4311872720718384, "learning_rate": 0.0002932346438715401, "loss": 0.726, "step": 4440 }, { "epoch": 0.5598765626476052, "grad_norm": 0.3708207309246063, "learning_rate": 0.000293201965473726, "loss": 0.7308, "step": 4445 }, { "epoch": 0.5605063450577825, "grad_norm": 0.36177051067352295, "learning_rate": 0.00029316921017318536, "loss": 0.7403, "step": 4450 }, { "epoch": 0.5611361274679598, "grad_norm": 0.4313011169433594, "learning_rate": 0.0002931363779875086, "loss": 0.7053, "step": 4455 }, { "epoch": 0.5617659098781371, "grad_norm": 0.36055561900138855, "learning_rate": 0.0002931034689343272, "loss": 0.7544, "step": 4460 }, { "epoch": 0.5623956922883144, "grad_norm": 0.37126588821411133, "learning_rate": 0.0002930704830313142, "loss": 0.7444, "step": 4465 }, { "epoch": 0.5630254746984916, "grad_norm": 0.35056549310684204, "learning_rate": 0.00029303742029618377, "loss": 0.7251, "step": 4470 }, { "epoch": 0.5636552571086689, "grad_norm": 0.3944834768772125, "learning_rate": 0.0002930042807466913, "loss": 0.771, "step": 4475 }, { "epoch": 0.5642850395188462, "grad_norm": 0.39250391721725464, "learning_rate": 0.0002929710644006334, "loss": 0.7177, "step": 4480 }, { "epoch": 0.5649148219290235, "grad_norm": 0.41848230361938477, "learning_rate": 0.00029293777127584826, "loss": 0.7362, "step": 4485 }, { "epoch": 0.5655446043392008, "grad_norm": 0.3222586214542389, "learning_rate": 0.00029290440139021477, "loss": 0.6746, "step": 4490 }, { "epoch": 0.5661743867493781, "grad_norm": 0.4275425672531128, "learning_rate": 0.00029287095476165356, "loss": 0.7641, "step": 4495 }, { "epoch": 0.5668041691595553, "grad_norm": 0.37914222478866577, "learning_rate": 0.0002928374314081261, "loss": 0.7367, "step": 4500 }, { "epoch": 0.5674339515697326, "grad_norm": 0.36903491616249084, "learning_rate": 0.00029280383134763516, "loss": 0.726, "step": 4505 }, { "epoch": 0.56806373397991, "grad_norm": 0.45876601338386536, "learning_rate": 0.0002927701545982249, "loss": 0.7285, "step": 4510 }, { "epoch": 0.5686935163900873, "grad_norm": 0.3885752856731415, "learning_rate": 0.0002927364011779803, "loss": 0.7111, "step": 4515 }, { "epoch": 0.5693232988002646, "grad_norm": 0.3738529086112976, "learning_rate": 0.00029270257110502784, "loss": 0.7381, "step": 4520 }, { "epoch": 0.5699530812104417, "grad_norm": 0.38678133487701416, "learning_rate": 0.0002926686643975351, "loss": 0.7069, "step": 4525 }, { "epoch": 0.570582863620619, "grad_norm": 0.38699817657470703, "learning_rate": 0.0002926346810737106, "loss": 0.7456, "step": 4530 }, { "epoch": 0.5712126460307964, "grad_norm": 0.39948272705078125, "learning_rate": 0.0002926006211518043, "loss": 0.7018, "step": 4535 }, { "epoch": 0.5718424284409737, "grad_norm": 0.36441704630851746, "learning_rate": 0.00029256648465010706, "loss": 0.7155, "step": 4540 }, { "epoch": 0.572472210851151, "grad_norm": 0.38412773609161377, "learning_rate": 0.00029253227158695103, "loss": 0.7131, "step": 4545 }, { "epoch": 0.5731019932613282, "grad_norm": 0.3713320791721344, "learning_rate": 0.0002924979819807094, "loss": 0.7109, "step": 4550 }, { "epoch": 0.5737317756715055, "grad_norm": 0.41460588574409485, "learning_rate": 0.00029246361584979637, "loss": 0.7218, "step": 4555 }, { "epoch": 0.5743615580816828, "grad_norm": 0.37706735730171204, "learning_rate": 0.0002924291732126675, "loss": 0.7364, "step": 4560 }, { "epoch": 0.5749913404918601, "grad_norm": 0.3931211829185486, "learning_rate": 0.00029239465408781914, "loss": 0.793, "step": 4565 }, { "epoch": 0.5756211229020374, "grad_norm": 0.4280949831008911, "learning_rate": 0.0002923600584937889, "loss": 0.7577, "step": 4570 }, { "epoch": 0.5762509053122147, "grad_norm": 0.408357173204422, "learning_rate": 0.0002923253864491554, "loss": 0.6866, "step": 4575 }, { "epoch": 0.5768806877223919, "grad_norm": 0.3654685318470001, "learning_rate": 0.0002922906379725383, "loss": 0.7409, "step": 4580 }, { "epoch": 0.5775104701325692, "grad_norm": 0.3723433017730713, "learning_rate": 0.0002922558130825984, "loss": 0.7106, "step": 4585 }, { "epoch": 0.5781402525427465, "grad_norm": 0.40489017963409424, "learning_rate": 0.00029222091179803735, "loss": 0.7311, "step": 4590 }, { "epoch": 0.5787700349529238, "grad_norm": 0.40270909667015076, "learning_rate": 0.000292185934137598, "loss": 0.7393, "step": 4595 }, { "epoch": 0.5793998173631011, "grad_norm": 0.4228857159614563, "learning_rate": 0.0002921508801200642, "loss": 0.7253, "step": 4600 }, { "epoch": 0.5800295997732783, "grad_norm": 0.39830881357192993, "learning_rate": 0.0002921157497642607, "loss": 0.7413, "step": 4605 }, { "epoch": 0.5806593821834556, "grad_norm": 0.40520498156547546, "learning_rate": 0.00029208054308905323, "loss": 0.6902, "step": 4610 }, { "epoch": 0.5812891645936329, "grad_norm": 0.3546881377696991, "learning_rate": 0.0002920452601133487, "loss": 0.7104, "step": 4615 }, { "epoch": 0.5819189470038102, "grad_norm": 0.40294864773750305, "learning_rate": 0.0002920099008560949, "loss": 0.7258, "step": 4620 }, { "epoch": 0.5825487294139875, "grad_norm": 0.36979302763938904, "learning_rate": 0.0002919744653362804, "loss": 0.708, "step": 4625 }, { "epoch": 0.5831785118241648, "grad_norm": 0.42616382241249084, "learning_rate": 0.000291938953572935, "loss": 0.7044, "step": 4630 }, { "epoch": 0.583808294234342, "grad_norm": 0.3644506335258484, "learning_rate": 0.0002919033655851293, "loss": 0.7277, "step": 4635 }, { "epoch": 0.5844380766445193, "grad_norm": 0.34578534960746765, "learning_rate": 0.0002918677013919749, "loss": 0.7233, "step": 4640 }, { "epoch": 0.5850678590546966, "grad_norm": 0.3914281725883484, "learning_rate": 0.00029183196101262423, "loss": 0.6829, "step": 4645 }, { "epoch": 0.5856976414648739, "grad_norm": 0.35399550199508667, "learning_rate": 0.0002917961444662707, "loss": 0.7371, "step": 4650 }, { "epoch": 0.5863274238750512, "grad_norm": 0.3999468684196472, "learning_rate": 0.0002917602517721486, "loss": 0.7228, "step": 4655 }, { "epoch": 0.5869572062852284, "grad_norm": 0.4196580946445465, "learning_rate": 0.0002917242829495332, "loss": 0.7013, "step": 4660 }, { "epoch": 0.5875869886954057, "grad_norm": 0.38301941752433777, "learning_rate": 0.0002916882380177405, "loss": 0.7409, "step": 4665 }, { "epoch": 0.588216771105583, "grad_norm": 0.3997241258621216, "learning_rate": 0.0002916521169961275, "loss": 0.7216, "step": 4670 }, { "epoch": 0.5888465535157603, "grad_norm": 0.3389524817466736, "learning_rate": 0.00029161591990409203, "loss": 0.7109, "step": 4675 }, { "epoch": 0.5894763359259376, "grad_norm": 0.38282495737075806, "learning_rate": 0.0002915796467610727, "loss": 0.7608, "step": 4680 }, { "epoch": 0.5901061183361149, "grad_norm": 0.40406349301338196, "learning_rate": 0.000291543297586549, "loss": 0.7062, "step": 4685 }, { "epoch": 0.5907359007462921, "grad_norm": 0.37658101320266724, "learning_rate": 0.0002915068724000413, "loss": 0.7305, "step": 4690 }, { "epoch": 0.5913656831564694, "grad_norm": 0.397401362657547, "learning_rate": 0.0002914703712211108, "loss": 0.7276, "step": 4695 }, { "epoch": 0.5919954655666467, "grad_norm": 0.4348791539669037, "learning_rate": 0.0002914337940693594, "loss": 0.7572, "step": 4700 }, { "epoch": 0.592625247976824, "grad_norm": 0.372659295797348, "learning_rate": 0.0002913971409644299, "loss": 0.7436, "step": 4705 }, { "epoch": 0.5932550303870013, "grad_norm": 0.3933033049106598, "learning_rate": 0.0002913604119260059, "loss": 0.7229, "step": 4710 }, { "epoch": 0.5938848127971785, "grad_norm": 0.35579994320869446, "learning_rate": 0.0002913236069738116, "loss": 0.7055, "step": 4715 }, { "epoch": 0.5945145952073558, "grad_norm": 0.40102267265319824, "learning_rate": 0.0002912867261276122, "loss": 0.7167, "step": 4720 }, { "epoch": 0.5951443776175331, "grad_norm": 0.3881862163543701, "learning_rate": 0.0002912497694072136, "loss": 0.7395, "step": 4725 }, { "epoch": 0.5957741600277104, "grad_norm": 0.43878352642059326, "learning_rate": 0.00029121273683246234, "loss": 0.7251, "step": 4730 }, { "epoch": 0.5964039424378877, "grad_norm": 0.3500851094722748, "learning_rate": 0.0002911756284232457, "loss": 0.6989, "step": 4735 }, { "epoch": 0.5970337248480649, "grad_norm": 0.3887772262096405, "learning_rate": 0.00029113844419949184, "loss": 0.7324, "step": 4740 }, { "epoch": 0.5976635072582422, "grad_norm": 0.376321405172348, "learning_rate": 0.0002911011841811695, "loss": 0.7239, "step": 4745 }, { "epoch": 0.5982932896684195, "grad_norm": 0.3770900070667267, "learning_rate": 0.00029106384838828816, "loss": 0.6973, "step": 4750 }, { "epoch": 0.5989230720785969, "grad_norm": 0.366205096244812, "learning_rate": 0.000291026436840898, "loss": 0.749, "step": 4755 }, { "epoch": 0.5995528544887742, "grad_norm": 0.40675458312034607, "learning_rate": 0.00029098894955908983, "loss": 0.7155, "step": 4760 }, { "epoch": 0.6001826368989515, "grad_norm": 0.4125664532184601, "learning_rate": 0.0002909513865629953, "loss": 0.7455, "step": 4765 }, { "epoch": 0.6008124193091287, "grad_norm": 0.39872869849205017, "learning_rate": 0.0002909137478727864, "loss": 0.7194, "step": 4770 }, { "epoch": 0.601442201719306, "grad_norm": 0.3824906051158905, "learning_rate": 0.00029087603350867616, "loss": 0.742, "step": 4775 }, { "epoch": 0.6020719841294833, "grad_norm": 0.37914398312568665, "learning_rate": 0.00029083824349091794, "loss": 0.692, "step": 4780 }, { "epoch": 0.6027017665396606, "grad_norm": 0.35589495301246643, "learning_rate": 0.0002908003778398059, "loss": 0.6706, "step": 4785 }, { "epoch": 0.6033315489498379, "grad_norm": 0.34000104665756226, "learning_rate": 0.0002907624365756748, "loss": 0.7506, "step": 4790 }, { "epoch": 0.6039613313600151, "grad_norm": 0.34795689582824707, "learning_rate": 0.0002907244197188998, "loss": 0.7097, "step": 4795 }, { "epoch": 0.6045911137701924, "grad_norm": 0.38767385482788086, "learning_rate": 0.00029068632728989697, "loss": 0.6986, "step": 4800 }, { "epoch": 0.6052208961803697, "grad_norm": 0.40651988983154297, "learning_rate": 0.00029064815930912276, "loss": 0.7159, "step": 4805 }, { "epoch": 0.605850678590547, "grad_norm": 0.37715932726860046, "learning_rate": 0.00029060991579707424, "loss": 0.7189, "step": 4810 }, { "epoch": 0.6064804610007243, "grad_norm": 0.3925745487213135, "learning_rate": 0.0002905715967742891, "loss": 0.6956, "step": 4815 }, { "epoch": 0.6071102434109016, "grad_norm": 0.33669450879096985, "learning_rate": 0.0002905332022613455, "loss": 0.6806, "step": 4820 }, { "epoch": 0.6077400258210788, "grad_norm": 0.38812607526779175, "learning_rate": 0.00029049473227886214, "loss": 0.6997, "step": 4825 }, { "epoch": 0.6083698082312561, "grad_norm": 0.3890033960342407, "learning_rate": 0.00029045618684749833, "loss": 0.7306, "step": 4830 }, { "epoch": 0.6089995906414334, "grad_norm": 0.4020345211029053, "learning_rate": 0.00029041756598795383, "loss": 0.7357, "step": 4835 }, { "epoch": 0.6096293730516107, "grad_norm": 0.39244237542152405, "learning_rate": 0.0002903788697209689, "loss": 0.6956, "step": 4840 }, { "epoch": 0.610259155461788, "grad_norm": 0.35866880416870117, "learning_rate": 0.0002903400980673243, "loss": 0.7219, "step": 4845 }, { "epoch": 0.6108889378719652, "grad_norm": 0.3912501931190491, "learning_rate": 0.0002903012510478414, "loss": 0.7194, "step": 4850 }, { "epoch": 0.6115187202821425, "grad_norm": 0.3933585584163666, "learning_rate": 0.00029026232868338184, "loss": 0.7136, "step": 4855 }, { "epoch": 0.6121485026923198, "grad_norm": 0.37603482604026794, "learning_rate": 0.0002902233309948479, "loss": 0.7208, "step": 4860 }, { "epoch": 0.6127782851024971, "grad_norm": 0.36919742822647095, "learning_rate": 0.00029018425800318205, "loss": 0.7499, "step": 4865 }, { "epoch": 0.6134080675126744, "grad_norm": 0.38067975640296936, "learning_rate": 0.0002901451097293676, "loss": 0.7468, "step": 4870 }, { "epoch": 0.6140378499228517, "grad_norm": 0.33954986929893494, "learning_rate": 0.00029010588619442793, "loss": 0.6894, "step": 4875 }, { "epoch": 0.6146676323330289, "grad_norm": 0.38103437423706055, "learning_rate": 0.000290066587419427, "loss": 0.6661, "step": 4880 }, { "epoch": 0.6152974147432062, "grad_norm": 0.3855966031551361, "learning_rate": 0.00029002721342546924, "loss": 0.7138, "step": 4885 }, { "epoch": 0.6159271971533835, "grad_norm": 0.5084123611450195, "learning_rate": 0.00028998776423369923, "loss": 0.7005, "step": 4890 }, { "epoch": 0.6165569795635608, "grad_norm": 0.36192139983177185, "learning_rate": 0.0002899482398653022, "loss": 0.7386, "step": 4895 }, { "epoch": 0.6171867619737381, "grad_norm": 0.37423619627952576, "learning_rate": 0.0002899086403415037, "loss": 0.7172, "step": 4900 }, { "epoch": 0.6178165443839153, "grad_norm": 0.3741579055786133, "learning_rate": 0.00028986896568356933, "loss": 0.7519, "step": 4905 }, { "epoch": 0.6184463267940926, "grad_norm": 0.4323353171348572, "learning_rate": 0.0002898292159128055, "loss": 0.7325, "step": 4910 }, { "epoch": 0.6190761092042699, "grad_norm": 0.3273026645183563, "learning_rate": 0.00028978939105055873, "loss": 0.7211, "step": 4915 }, { "epoch": 0.6197058916144472, "grad_norm": 0.38831016421318054, "learning_rate": 0.0002897494911182158, "loss": 0.6435, "step": 4920 }, { "epoch": 0.6203356740246245, "grad_norm": 0.36923748254776, "learning_rate": 0.00028970951613720397, "loss": 0.7184, "step": 4925 }, { "epoch": 0.6209654564348017, "grad_norm": 0.3658188283443451, "learning_rate": 0.0002896694661289906, "loss": 0.7171, "step": 4930 }, { "epoch": 0.621595238844979, "grad_norm": 0.3589092493057251, "learning_rate": 0.00028962934111508357, "loss": 0.7173, "step": 4935 }, { "epoch": 0.6222250212551563, "grad_norm": 0.41886886954307556, "learning_rate": 0.00028958914111703086, "loss": 0.7412, "step": 4940 }, { "epoch": 0.6228548036653336, "grad_norm": 0.34496763348579407, "learning_rate": 0.0002895488661564208, "loss": 0.6608, "step": 4945 }, { "epoch": 0.6234845860755109, "grad_norm": 0.3527592122554779, "learning_rate": 0.000289508516254882, "loss": 0.7179, "step": 4950 }, { "epoch": 0.6241143684856882, "grad_norm": 0.3406129479408264, "learning_rate": 0.0002894680914340833, "loss": 0.6862, "step": 4955 }, { "epoch": 0.6247441508958654, "grad_norm": 0.33078086376190186, "learning_rate": 0.00028942759171573374, "loss": 0.6804, "step": 4960 }, { "epoch": 0.6253739333060427, "grad_norm": 0.3582599461078644, "learning_rate": 0.00028938701712158247, "loss": 0.6681, "step": 4965 }, { "epoch": 0.62600371571622, "grad_norm": 0.3656567633152008, "learning_rate": 0.0002893463676734191, "loss": 0.6714, "step": 4970 }, { "epoch": 0.6266334981263973, "grad_norm": 0.35537272691726685, "learning_rate": 0.00028930564339307337, "loss": 0.6917, "step": 4975 }, { "epoch": 0.6272632805365747, "grad_norm": 0.35100945830345154, "learning_rate": 0.0002892648443024149, "loss": 0.7217, "step": 4980 }, { "epoch": 0.6278930629467518, "grad_norm": 0.34070494771003723, "learning_rate": 0.000289223970423354, "loss": 0.7237, "step": 4985 }, { "epoch": 0.6285228453569291, "grad_norm": 0.3810268044471741, "learning_rate": 0.00028918302177784075, "loss": 0.7513, "step": 4990 }, { "epoch": 0.6291526277671065, "grad_norm": 0.3511486053466797, "learning_rate": 0.0002891419983878655, "loss": 0.7112, "step": 4995 }, { "epoch": 0.6297824101772838, "grad_norm": 0.30101874470710754, "learning_rate": 0.0002891009002754588, "loss": 0.6666, "step": 5000 }, { "epoch": 0.6297824101772838, "eval_loss": 0.31327521800994873, "eval_runtime": 6.2403, "eval_samples_per_second": 160.248, "eval_steps_per_second": 10.096, "step": 5000 }, { "epoch": 0.6304121925874611, "grad_norm": 0.3446876108646393, "learning_rate": 0.00028905972746269125, "loss": 0.6651, "step": 5005 }, { "epoch": 0.6310419749976384, "grad_norm": 0.3606228232383728, "learning_rate": 0.0002890184799716736, "loss": 0.7387, "step": 5010 }, { "epoch": 0.6316717574078156, "grad_norm": 0.37057119607925415, "learning_rate": 0.0002889771578245567, "loss": 0.7044, "step": 5015 }, { "epoch": 0.6323015398179929, "grad_norm": 0.36304429173469543, "learning_rate": 0.0002889357610435314, "loss": 0.7391, "step": 5020 }, { "epoch": 0.6329313222281702, "grad_norm": 0.38329148292541504, "learning_rate": 0.00028889428965082886, "loss": 0.7045, "step": 5025 }, { "epoch": 0.6335611046383475, "grad_norm": 0.3362608850002289, "learning_rate": 0.00028885274366872006, "loss": 0.6865, "step": 5030 }, { "epoch": 0.6341908870485248, "grad_norm": 0.4079527258872986, "learning_rate": 0.00028881112311951625, "loss": 0.6892, "step": 5035 }, { "epoch": 0.634820669458702, "grad_norm": 0.35261860489845276, "learning_rate": 0.00028876942802556847, "loss": 0.7189, "step": 5040 }, { "epoch": 0.6354504518688793, "grad_norm": 0.40486040711402893, "learning_rate": 0.00028872765840926804, "loss": 0.7385, "step": 5045 }, { "epoch": 0.6360802342790566, "grad_norm": 0.32852765917778015, "learning_rate": 0.0002886858142930462, "loss": 0.6267, "step": 5050 }, { "epoch": 0.6367100166892339, "grad_norm": 0.31455445289611816, "learning_rate": 0.0002886438956993741, "loss": 0.6813, "step": 5055 }, { "epoch": 0.6373397990994112, "grad_norm": 0.3047012686729431, "learning_rate": 0.00028860190265076304, "loss": 0.6862, "step": 5060 }, { "epoch": 0.6379695815095885, "grad_norm": 0.34203359484672546, "learning_rate": 0.0002885598351697643, "loss": 0.6996, "step": 5065 }, { "epoch": 0.6385993639197657, "grad_norm": 0.4077922999858856, "learning_rate": 0.0002885176932789691, "loss": 0.7018, "step": 5070 }, { "epoch": 0.639229146329943, "grad_norm": 0.3590135872364044, "learning_rate": 0.00028847547700100836, "loss": 0.6741, "step": 5075 }, { "epoch": 0.6398589287401203, "grad_norm": 0.33030763268470764, "learning_rate": 0.0002884331863585535, "loss": 0.6775, "step": 5080 }, { "epoch": 0.6404887111502976, "grad_norm": 0.3921838104724884, "learning_rate": 0.0002883908213743153, "loss": 0.7359, "step": 5085 }, { "epoch": 0.6411184935604749, "grad_norm": 0.35765379667282104, "learning_rate": 0.0002883483820710449, "loss": 0.6953, "step": 5090 }, { "epoch": 0.6417482759706521, "grad_norm": 0.3486902415752411, "learning_rate": 0.0002883058684715331, "loss": 0.6848, "step": 5095 }, { "epoch": 0.6423780583808294, "grad_norm": 0.35446256399154663, "learning_rate": 0.0002882632805986108, "loss": 0.7031, "step": 5100 }, { "epoch": 0.6430078407910067, "grad_norm": 0.3666916489601135, "learning_rate": 0.00028822061847514843, "loss": 0.7135, "step": 5105 }, { "epoch": 0.643637623201184, "grad_norm": 0.38766369223594666, "learning_rate": 0.00028817788212405666, "loss": 0.6623, "step": 5110 }, { "epoch": 0.6442674056113613, "grad_norm": 0.3532891273498535, "learning_rate": 0.0002881350715682859, "loss": 0.699, "step": 5115 }, { "epoch": 0.6448971880215385, "grad_norm": 0.36512479186058044, "learning_rate": 0.0002880921868308263, "loss": 0.6859, "step": 5120 }, { "epoch": 0.6455269704317158, "grad_norm": 0.34285515546798706, "learning_rate": 0.0002880492279347081, "loss": 0.7254, "step": 5125 }, { "epoch": 0.6461567528418931, "grad_norm": 0.3731713891029358, "learning_rate": 0.00028800619490300107, "loss": 0.6995, "step": 5130 }, { "epoch": 0.6467865352520704, "grad_norm": 0.37182632088661194, "learning_rate": 0.000287963087758815, "loss": 0.7262, "step": 5135 }, { "epoch": 0.6474163176622477, "grad_norm": 0.371231347322464, "learning_rate": 0.0002879199065252994, "loss": 0.7051, "step": 5140 }, { "epoch": 0.648046100072425, "grad_norm": 0.35507723689079285, "learning_rate": 0.00028787665122564357, "loss": 0.6799, "step": 5145 }, { "epoch": 0.6486758824826022, "grad_norm": 0.4001401662826538, "learning_rate": 0.0002878333218830766, "loss": 0.7718, "step": 5150 }, { "epoch": 0.6493056648927795, "grad_norm": 0.36585733294487, "learning_rate": 0.0002877899185208673, "loss": 0.6652, "step": 5155 }, { "epoch": 0.6499354473029568, "grad_norm": 0.3719576895236969, "learning_rate": 0.00028774644116232436, "loss": 0.7232, "step": 5160 }, { "epoch": 0.6505652297131341, "grad_norm": 0.40236014127731323, "learning_rate": 0.000287702889830796, "loss": 0.6697, "step": 5165 }, { "epoch": 0.6511950121233114, "grad_norm": 0.4343264400959015, "learning_rate": 0.00028765926454967037, "loss": 0.6877, "step": 5170 }, { "epoch": 0.6518247945334886, "grad_norm": 0.3576568067073822, "learning_rate": 0.00028761556534237514, "loss": 0.7239, "step": 5175 }, { "epoch": 0.6524545769436659, "grad_norm": 0.33383145928382874, "learning_rate": 0.00028757179223237793, "loss": 0.6822, "step": 5180 }, { "epoch": 0.6530843593538432, "grad_norm": 0.353253573179245, "learning_rate": 0.0002875279452431858, "loss": 0.6925, "step": 5185 }, { "epoch": 0.6537141417640205, "grad_norm": 0.3755667209625244, "learning_rate": 0.0002874840243983455, "loss": 0.6872, "step": 5190 }, { "epoch": 0.6543439241741978, "grad_norm": 0.3973848521709442, "learning_rate": 0.00028744002972144376, "loss": 0.7251, "step": 5195 }, { "epoch": 0.6549737065843751, "grad_norm": 0.3476422131061554, "learning_rate": 0.0002873959612361066, "loss": 0.6964, "step": 5200 }, { "epoch": 0.6556034889945523, "grad_norm": 0.42737796902656555, "learning_rate": 0.0002873518189659997, "loss": 0.7106, "step": 5205 }, { "epoch": 0.6562332714047296, "grad_norm": 0.3009507358074188, "learning_rate": 0.00028730760293482863, "loss": 0.6614, "step": 5210 }, { "epoch": 0.656863053814907, "grad_norm": 0.38053247332572937, "learning_rate": 0.00028726331316633835, "loss": 0.6963, "step": 5215 }, { "epoch": 0.6574928362250843, "grad_norm": 0.4153291583061218, "learning_rate": 0.00028721894968431345, "loss": 0.7471, "step": 5220 }, { "epoch": 0.6581226186352616, "grad_norm": 0.36470016837120056, "learning_rate": 0.0002871745125125782, "loss": 0.6558, "step": 5225 }, { "epoch": 0.6587524010454388, "grad_norm": 0.3935704827308655, "learning_rate": 0.00028713000167499627, "loss": 0.7025, "step": 5230 }, { "epoch": 0.659382183455616, "grad_norm": 0.36777618527412415, "learning_rate": 0.0002870854171954711, "loss": 0.7386, "step": 5235 }, { "epoch": 0.6600119658657934, "grad_norm": 0.36549127101898193, "learning_rate": 0.0002870407590979455, "loss": 0.703, "step": 5240 }, { "epoch": 0.6606417482759707, "grad_norm": 0.37523144483566284, "learning_rate": 0.00028699602740640194, "loss": 0.6708, "step": 5245 }, { "epoch": 0.661271530686148, "grad_norm": 0.3451475203037262, "learning_rate": 0.00028695122214486237, "loss": 0.6776, "step": 5250 }, { "epoch": 0.6619013130963253, "grad_norm": 0.35215169191360474, "learning_rate": 0.00028690634333738816, "loss": 0.6983, "step": 5255 }, { "epoch": 0.6625310955065025, "grad_norm": 0.37627631425857544, "learning_rate": 0.00028686139100808037, "loss": 0.6844, "step": 5260 }, { "epoch": 0.6631608779166798, "grad_norm": 0.34171178936958313, "learning_rate": 0.0002868163651810793, "loss": 0.7068, "step": 5265 }, { "epoch": 0.6637906603268571, "grad_norm": 0.3566179573535919, "learning_rate": 0.0002867712658805649, "loss": 0.6618, "step": 5270 }, { "epoch": 0.6644204427370344, "grad_norm": 0.3453030586242676, "learning_rate": 0.00028672609313075664, "loss": 0.7046, "step": 5275 }, { "epoch": 0.6650502251472117, "grad_norm": 0.40633949637413025, "learning_rate": 0.00028668084695591316, "loss": 0.6931, "step": 5280 }, { "epoch": 0.6656800075573889, "grad_norm": 0.3927484154701233, "learning_rate": 0.00028663552738033275, "loss": 0.7051, "step": 5285 }, { "epoch": 0.6663097899675662, "grad_norm": 0.35829389095306396, "learning_rate": 0.000286590134428353, "loss": 0.7051, "step": 5290 }, { "epoch": 0.6669395723777435, "grad_norm": 0.4202066957950592, "learning_rate": 0.00028654466812435105, "loss": 0.7179, "step": 5295 }, { "epoch": 0.6675693547879208, "grad_norm": 0.37852293252944946, "learning_rate": 0.0002864991284927433, "loss": 0.7107, "step": 5300 }, { "epoch": 0.6681991371980981, "grad_norm": 0.3831678330898285, "learning_rate": 0.0002864535155579856, "loss": 0.659, "step": 5305 }, { "epoch": 0.6688289196082754, "grad_norm": 0.3563750684261322, "learning_rate": 0.0002864078293445731, "loss": 0.7111, "step": 5310 }, { "epoch": 0.6694587020184526, "grad_norm": 0.3460354804992676, "learning_rate": 0.0002863620698770403, "loss": 0.6822, "step": 5315 }, { "epoch": 0.6700884844286299, "grad_norm": 0.36469632387161255, "learning_rate": 0.0002863162371799612, "loss": 0.6298, "step": 5320 }, { "epoch": 0.6707182668388072, "grad_norm": 0.3730217218399048, "learning_rate": 0.00028627033127794896, "loss": 0.7137, "step": 5325 }, { "epoch": 0.6713480492489845, "grad_norm": 0.347002774477005, "learning_rate": 0.00028622435219565606, "loss": 0.6873, "step": 5330 }, { "epoch": 0.6719778316591618, "grad_norm": 0.35723358392715454, "learning_rate": 0.00028617829995777433, "loss": 0.7055, "step": 5335 }, { "epoch": 0.672607614069339, "grad_norm": 0.3175225257873535, "learning_rate": 0.0002861321745890349, "loss": 0.6702, "step": 5340 }, { "epoch": 0.6732373964795163, "grad_norm": 0.3599521517753601, "learning_rate": 0.00028608597611420807, "loss": 0.6646, "step": 5345 }, { "epoch": 0.6738671788896936, "grad_norm": 0.4381812810897827, "learning_rate": 0.00028603970455810357, "loss": 0.7122, "step": 5350 }, { "epoch": 0.6744969612998709, "grad_norm": 0.3400894105434418, "learning_rate": 0.00028599335994557027, "loss": 0.705, "step": 5355 }, { "epoch": 0.6751267437100482, "grad_norm": 0.3332962989807129, "learning_rate": 0.00028594694230149625, "loss": 0.6497, "step": 5360 }, { "epoch": 0.6757565261202254, "grad_norm": 0.386343389749527, "learning_rate": 0.00028590045165080883, "loss": 0.6344, "step": 5365 }, { "epoch": 0.6763863085304027, "grad_norm": 0.4404468834400177, "learning_rate": 0.0002858538880184746, "loss": 0.7115, "step": 5370 }, { "epoch": 0.67701609094058, "grad_norm": 0.35227730870246887, "learning_rate": 0.00028580725142949925, "loss": 0.702, "step": 5375 }, { "epoch": 0.6776458733507573, "grad_norm": 0.38216719031333923, "learning_rate": 0.00028576054190892775, "loss": 0.6845, "step": 5380 }, { "epoch": 0.6782756557609346, "grad_norm": 0.3602873682975769, "learning_rate": 0.0002857137594818441, "loss": 0.7156, "step": 5385 }, { "epoch": 0.6789054381711119, "grad_norm": 0.38896870613098145, "learning_rate": 0.00028566690417337166, "loss": 0.7029, "step": 5390 }, { "epoch": 0.6795352205812891, "grad_norm": 0.3434313237667084, "learning_rate": 0.0002856199760086726, "loss": 0.687, "step": 5395 }, { "epoch": 0.6801650029914664, "grad_norm": 0.381331205368042, "learning_rate": 0.0002855729750129487, "loss": 0.6597, "step": 5400 }, { "epoch": 0.6807947854016437, "grad_norm": 0.35004013776779175, "learning_rate": 0.0002855259012114403, "loss": 0.6604, "step": 5405 }, { "epoch": 0.681424567811821, "grad_norm": 0.3601452112197876, "learning_rate": 0.0002854787546294272, "loss": 0.6949, "step": 5410 }, { "epoch": 0.6820543502219983, "grad_norm": 0.3827126920223236, "learning_rate": 0.0002854315352922282, "loss": 0.7121, "step": 5415 }, { "epoch": 0.6826841326321755, "grad_norm": 0.35859569907188416, "learning_rate": 0.0002853842432252012, "loss": 0.6662, "step": 5420 }, { "epoch": 0.6833139150423528, "grad_norm": 0.36607855558395386, "learning_rate": 0.00028533687845374304, "loss": 0.6716, "step": 5425 }, { "epoch": 0.6839436974525301, "grad_norm": 0.3658086061477661, "learning_rate": 0.00028528944100328975, "loss": 0.6718, "step": 5430 }, { "epoch": 0.6845734798627074, "grad_norm": 0.3442821800708771, "learning_rate": 0.00028524193089931633, "loss": 0.6474, "step": 5435 }, { "epoch": 0.6852032622728847, "grad_norm": 0.38460132479667664, "learning_rate": 0.0002851943481673367, "loss": 0.6973, "step": 5440 }, { "epoch": 0.685833044683062, "grad_norm": 0.3717944622039795, "learning_rate": 0.000285146692832904, "loss": 0.6962, "step": 5445 }, { "epoch": 0.6864628270932392, "grad_norm": 0.42136862874031067, "learning_rate": 0.00028509896492161013, "loss": 0.6783, "step": 5450 }, { "epoch": 0.6870926095034166, "grad_norm": 0.37208443880081177, "learning_rate": 0.0002850511644590862, "loss": 0.6915, "step": 5455 }, { "epoch": 0.6877223919135939, "grad_norm": 0.3807058036327362, "learning_rate": 0.000285003291471002, "loss": 0.7269, "step": 5460 }, { "epoch": 0.6883521743237712, "grad_norm": 0.38431763648986816, "learning_rate": 0.00028495534598306645, "loss": 0.6589, "step": 5465 }, { "epoch": 0.6889819567339485, "grad_norm": 0.372773140668869, "learning_rate": 0.0002849073280210274, "loss": 0.6922, "step": 5470 }, { "epoch": 0.6896117391441257, "grad_norm": 0.3280029892921448, "learning_rate": 0.00028485923761067164, "loss": 0.6887, "step": 5475 }, { "epoch": 0.690241521554303, "grad_norm": 0.3463418483734131, "learning_rate": 0.0002848110747778247, "loss": 0.6565, "step": 5480 }, { "epoch": 0.6908713039644803, "grad_norm": 0.3423214256763458, "learning_rate": 0.00028476283954835123, "loss": 0.6412, "step": 5485 }, { "epoch": 0.6915010863746576, "grad_norm": 0.3461606204509735, "learning_rate": 0.0002847145319481546, "loss": 0.6803, "step": 5490 }, { "epoch": 0.6921308687848349, "grad_norm": 0.38746729493141174, "learning_rate": 0.0002846661520031772, "loss": 0.6424, "step": 5495 }, { "epoch": 0.6927606511950122, "grad_norm": 0.32353097200393677, "learning_rate": 0.00028461769973939997, "loss": 0.6761, "step": 5500 }, { "epoch": 0.6933904336051894, "grad_norm": 0.3790241777896881, "learning_rate": 0.00028456917518284304, "loss": 0.6683, "step": 5505 }, { "epoch": 0.6940202160153667, "grad_norm": 0.3713475465774536, "learning_rate": 0.0002845205783595651, "loss": 0.6663, "step": 5510 }, { "epoch": 0.694649998425544, "grad_norm": 0.3859196901321411, "learning_rate": 0.00028447190929566384, "loss": 0.6717, "step": 5515 }, { "epoch": 0.6952797808357213, "grad_norm": 0.34451383352279663, "learning_rate": 0.0002844231680172756, "loss": 0.6368, "step": 5520 }, { "epoch": 0.6959095632458986, "grad_norm": 0.3519328534603119, "learning_rate": 0.00028437435455057564, "loss": 0.6882, "step": 5525 }, { "epoch": 0.6965393456560758, "grad_norm": 0.382755309343338, "learning_rate": 0.0002843254689217778, "loss": 0.6415, "step": 5530 }, { "epoch": 0.6971691280662531, "grad_norm": 0.35310298204421997, "learning_rate": 0.0002842765111571349, "loss": 0.6744, "step": 5535 }, { "epoch": 0.6977989104764304, "grad_norm": 0.3392702341079712, "learning_rate": 0.0002842274812829382, "loss": 0.6705, "step": 5540 }, { "epoch": 0.6984286928866077, "grad_norm": 0.36502036452293396, "learning_rate": 0.00028417837932551805, "loss": 0.6777, "step": 5545 }, { "epoch": 0.699058475296785, "grad_norm": 0.36270782351493835, "learning_rate": 0.0002841292053112432, "loss": 0.6988, "step": 5550 }, { "epoch": 0.6996882577069622, "grad_norm": 0.3752531111240387, "learning_rate": 0.0002840799592665213, "loss": 0.6745, "step": 5555 }, { "epoch": 0.7003180401171395, "grad_norm": 0.32373905181884766, "learning_rate": 0.00028403064121779853, "loss": 0.664, "step": 5560 }, { "epoch": 0.7009478225273168, "grad_norm": 0.4017639756202698, "learning_rate": 0.0002839812511915599, "loss": 0.6793, "step": 5565 }, { "epoch": 0.7015776049374941, "grad_norm": 0.33867186307907104, "learning_rate": 0.00028393178921432883, "loss": 0.6811, "step": 5570 }, { "epoch": 0.7022073873476714, "grad_norm": 0.3769174814224243, "learning_rate": 0.0002838822553126677, "loss": 0.7118, "step": 5575 }, { "epoch": 0.7028371697578487, "grad_norm": 0.36820533871650696, "learning_rate": 0.00028383264951317727, "loss": 0.6581, "step": 5580 }, { "epoch": 0.7034669521680259, "grad_norm": 0.37128061056137085, "learning_rate": 0.00028378297184249694, "loss": 0.6722, "step": 5585 }, { "epoch": 0.7040967345782032, "grad_norm": 0.39225873351097107, "learning_rate": 0.00028373322232730483, "loss": 0.6846, "step": 5590 }, { "epoch": 0.7047265169883805, "grad_norm": 0.3394504189491272, "learning_rate": 0.0002836834009943175, "loss": 0.6815, "step": 5595 }, { "epoch": 0.7053562993985578, "grad_norm": 0.37265124917030334, "learning_rate": 0.0002836335078702903, "loss": 0.6614, "step": 5600 }, { "epoch": 0.7059860818087351, "grad_norm": 0.33066150546073914, "learning_rate": 0.00028358354298201673, "loss": 0.6701, "step": 5605 }, { "epoch": 0.7066158642189123, "grad_norm": 0.35536128282546997, "learning_rate": 0.0002835335063563293, "loss": 0.6149, "step": 5610 }, { "epoch": 0.7072456466290896, "grad_norm": 0.35491225123405457, "learning_rate": 0.0002834833980200987, "loss": 0.6773, "step": 5615 }, { "epoch": 0.7078754290392669, "grad_norm": 0.37837696075439453, "learning_rate": 0.0002834332180002343, "loss": 0.6899, "step": 5620 }, { "epoch": 0.7085052114494442, "grad_norm": 0.3391937017440796, "learning_rate": 0.0002833829663236838, "loss": 0.7041, "step": 5625 }, { "epoch": 0.7091349938596215, "grad_norm": 0.3482423424720764, "learning_rate": 0.00028333264301743375, "loss": 0.6597, "step": 5630 }, { "epoch": 0.7097647762697988, "grad_norm": 0.4188586175441742, "learning_rate": 0.00028328224810850866, "loss": 0.6916, "step": 5635 }, { "epoch": 0.710394558679976, "grad_norm": 0.32832324504852295, "learning_rate": 0.0002832317816239718, "loss": 0.6791, "step": 5640 }, { "epoch": 0.7110243410901533, "grad_norm": 0.343058705329895, "learning_rate": 0.00028318124359092496, "loss": 0.6423, "step": 5645 }, { "epoch": 0.7116541235003306, "grad_norm": 0.37011584639549255, "learning_rate": 0.0002831306340365081, "loss": 0.6783, "step": 5650 }, { "epoch": 0.7122839059105079, "grad_norm": 0.38297170400619507, "learning_rate": 0.00028307995298789974, "loss": 0.6751, "step": 5655 }, { "epoch": 0.7129136883206852, "grad_norm": 0.38705122470855713, "learning_rate": 0.00028302920047231677, "loss": 0.6844, "step": 5660 }, { "epoch": 0.7135434707308624, "grad_norm": 0.3647492527961731, "learning_rate": 0.0002829783765170144, "loss": 0.6811, "step": 5665 }, { "epoch": 0.7141732531410397, "grad_norm": 0.3796983063220978, "learning_rate": 0.0002829274811492863, "loss": 0.6766, "step": 5670 }, { "epoch": 0.714803035551217, "grad_norm": 0.36972787976264954, "learning_rate": 0.00028287651439646444, "loss": 0.6701, "step": 5675 }, { "epoch": 0.7154328179613944, "grad_norm": 0.37298983335494995, "learning_rate": 0.0002828254762859192, "loss": 0.6439, "step": 5680 }, { "epoch": 0.7160626003715717, "grad_norm": 0.3464621603488922, "learning_rate": 0.0002827743668450591, "loss": 0.6626, "step": 5685 }, { "epoch": 0.716692382781749, "grad_norm": 0.34213629364967346, "learning_rate": 0.00028272318610133104, "loss": 0.6987, "step": 5690 }, { "epoch": 0.7173221651919262, "grad_norm": 0.38596463203430176, "learning_rate": 0.0002826719340822204, "loss": 0.6846, "step": 5695 }, { "epoch": 0.7179519476021035, "grad_norm": 0.3410765826702118, "learning_rate": 0.0002826206108152506, "loss": 0.6769, "step": 5700 }, { "epoch": 0.7185817300122808, "grad_norm": 0.3370499610900879, "learning_rate": 0.0002825692163279834, "loss": 0.6563, "step": 5705 }, { "epoch": 0.7192115124224581, "grad_norm": 0.3973693549633026, "learning_rate": 0.0002825177506480189, "loss": 0.6587, "step": 5710 }, { "epoch": 0.7198412948326354, "grad_norm": 0.3341182470321655, "learning_rate": 0.0002824662138029952, "loss": 0.6489, "step": 5715 }, { "epoch": 0.7204710772428126, "grad_norm": 0.3598056733608246, "learning_rate": 0.00028241460582058883, "loss": 0.6623, "step": 5720 }, { "epoch": 0.7211008596529899, "grad_norm": 0.34275728464126587, "learning_rate": 0.00028236292672851443, "loss": 0.6987, "step": 5725 }, { "epoch": 0.7217306420631672, "grad_norm": 0.3606712222099304, "learning_rate": 0.000282311176554525, "loss": 0.6947, "step": 5730 }, { "epoch": 0.7223604244733445, "grad_norm": 0.32409214973449707, "learning_rate": 0.0002822593553264114, "loss": 0.6468, "step": 5735 }, { "epoch": 0.7229902068835218, "grad_norm": 0.3465891182422638, "learning_rate": 0.00028220746307200287, "loss": 0.647, "step": 5740 }, { "epoch": 0.723619989293699, "grad_norm": 0.3540678918361664, "learning_rate": 0.0002821554998191667, "loss": 0.6964, "step": 5745 }, { "epoch": 0.7242497717038763, "grad_norm": 0.35845157504081726, "learning_rate": 0.0002821034655958084, "loss": 0.6599, "step": 5750 }, { "epoch": 0.7248795541140536, "grad_norm": 0.3469247817993164, "learning_rate": 0.00028205136042987156, "loss": 0.6518, "step": 5755 }, { "epoch": 0.7255093365242309, "grad_norm": 0.3693814277648926, "learning_rate": 0.0002819991843493377, "loss": 0.6339, "step": 5760 }, { "epoch": 0.7261391189344082, "grad_norm": 0.35166436433792114, "learning_rate": 0.0002819469373822268, "loss": 0.6593, "step": 5765 }, { "epoch": 0.7267689013445855, "grad_norm": 0.376717746257782, "learning_rate": 0.00028189461955659644, "loss": 0.6583, "step": 5770 }, { "epoch": 0.7273986837547627, "grad_norm": 0.36365002393722534, "learning_rate": 0.0002818422309005426, "loss": 0.707, "step": 5775 }, { "epoch": 0.72802846616494, "grad_norm": 0.3356451392173767, "learning_rate": 0.00028178977144219914, "loss": 0.6439, "step": 5780 }, { "epoch": 0.7286582485751173, "grad_norm": 0.33520832657814026, "learning_rate": 0.00028173724120973806, "loss": 0.6276, "step": 5785 }, { "epoch": 0.7292880309852946, "grad_norm": 0.3459213376045227, "learning_rate": 0.00028168464023136926, "loss": 0.648, "step": 5790 }, { "epoch": 0.7299178133954719, "grad_norm": 0.3563973903656006, "learning_rate": 0.0002816319685353406, "loss": 0.6579, "step": 5795 }, { "epoch": 0.7305475958056491, "grad_norm": 0.3637474775314331, "learning_rate": 0.0002815792261499381, "loss": 0.6828, "step": 5800 }, { "epoch": 0.7311773782158264, "grad_norm": 0.38304394483566284, "learning_rate": 0.00028152641310348554, "loss": 0.6348, "step": 5805 }, { "epoch": 0.7318071606260037, "grad_norm": 0.33336034417152405, "learning_rate": 0.0002814735294243448, "loss": 0.6337, "step": 5810 }, { "epoch": 0.732436943036181, "grad_norm": 0.34154805541038513, "learning_rate": 0.0002814205751409156, "loss": 0.6885, "step": 5815 }, { "epoch": 0.7330667254463583, "grad_norm": 0.3780697286128998, "learning_rate": 0.00028136755028163556, "loss": 0.6558, "step": 5820 }, { "epoch": 0.7336965078565356, "grad_norm": 0.3496229946613312, "learning_rate": 0.0002813144548749802, "loss": 0.7058, "step": 5825 }, { "epoch": 0.7343262902667128, "grad_norm": 0.36560389399528503, "learning_rate": 0.0002812612889494631, "loss": 0.6991, "step": 5830 }, { "epoch": 0.7349560726768901, "grad_norm": 0.3215349316596985, "learning_rate": 0.00028120805253363545, "loss": 0.612, "step": 5835 }, { "epoch": 0.7355858550870674, "grad_norm": 0.36016130447387695, "learning_rate": 0.00028115474565608656, "loss": 0.6905, "step": 5840 }, { "epoch": 0.7362156374972447, "grad_norm": 0.3493592441082001, "learning_rate": 0.00028110136834544336, "loss": 0.6922, "step": 5845 }, { "epoch": 0.736845419907422, "grad_norm": 0.34350746870040894, "learning_rate": 0.00028104792063037064, "loss": 0.6238, "step": 5850 }, { "epoch": 0.7374752023175992, "grad_norm": 0.3633589446544647, "learning_rate": 0.0002809944025395711, "loss": 0.6775, "step": 5855 }, { "epoch": 0.7381049847277765, "grad_norm": 0.3892457187175751, "learning_rate": 0.00028094081410178515, "loss": 0.6756, "step": 5860 }, { "epoch": 0.7387347671379538, "grad_norm": 0.33569657802581787, "learning_rate": 0.00028088715534579104, "loss": 0.63, "step": 5865 }, { "epoch": 0.7393645495481311, "grad_norm": 0.36327067017555237, "learning_rate": 0.0002808334263004047, "loss": 0.6653, "step": 5870 }, { "epoch": 0.7399943319583084, "grad_norm": 0.32698652148246765, "learning_rate": 0.00028077962699448, "loss": 0.655, "step": 5875 }, { "epoch": 0.7406241143684857, "grad_norm": 0.35473042726516724, "learning_rate": 0.0002807257574569082, "loss": 0.6341, "step": 5880 }, { "epoch": 0.7412538967786629, "grad_norm": 0.33008939027786255, "learning_rate": 0.0002806718177166185, "loss": 0.6614, "step": 5885 }, { "epoch": 0.7418836791888402, "grad_norm": 0.3434574007987976, "learning_rate": 0.0002806178078025779, "loss": 0.6313, "step": 5890 }, { "epoch": 0.7425134615990175, "grad_norm": 0.30766573548316956, "learning_rate": 0.00028056372774379085, "loss": 0.6296, "step": 5895 }, { "epoch": 0.7431432440091948, "grad_norm": 0.3676775097846985, "learning_rate": 0.00028050957756929965, "loss": 0.628, "step": 5900 }, { "epoch": 0.7437730264193722, "grad_norm": 0.3424786925315857, "learning_rate": 0.0002804553573081841, "loss": 0.6141, "step": 5905 }, { "epoch": 0.7444028088295493, "grad_norm": 0.391250878572464, "learning_rate": 0.0002804010669895618, "loss": 0.6615, "step": 5910 }, { "epoch": 0.7450325912397266, "grad_norm": 0.34186193346977234, "learning_rate": 0.0002803467066425878, "loss": 0.6389, "step": 5915 }, { "epoch": 0.745662373649904, "grad_norm": 0.37509649991989136, "learning_rate": 0.0002802922762964549, "loss": 0.6397, "step": 5920 }, { "epoch": 0.7462921560600813, "grad_norm": 0.3327299654483795, "learning_rate": 0.00028023777598039346, "loss": 0.6241, "step": 5925 }, { "epoch": 0.7469219384702586, "grad_norm": 0.37098389863967896, "learning_rate": 0.0002801832057236714, "loss": 0.7004, "step": 5930 }, { "epoch": 0.7475517208804358, "grad_norm": 0.36630627512931824, "learning_rate": 0.00028012856555559415, "loss": 0.6201, "step": 5935 }, { "epoch": 0.7481815032906131, "grad_norm": 0.3580261170864105, "learning_rate": 0.00028007385550550475, "loss": 0.6969, "step": 5940 }, { "epoch": 0.7488112857007904, "grad_norm": 0.3491668105125427, "learning_rate": 0.0002800190756027837, "loss": 0.6457, "step": 5945 }, { "epoch": 0.7494410681109677, "grad_norm": 0.2999480664730072, "learning_rate": 0.0002799642258768491, "loss": 0.6398, "step": 5950 }, { "epoch": 0.750070850521145, "grad_norm": 0.33795973658561707, "learning_rate": 0.00027990930635715655, "loss": 0.6672, "step": 5955 }, { "epoch": 0.7507006329313223, "grad_norm": 0.39881202578544617, "learning_rate": 0.00027985431707319903, "loss": 0.6796, "step": 5960 }, { "epoch": 0.7513304153414995, "grad_norm": 0.4092641770839691, "learning_rate": 0.0002797992580545071, "loss": 0.6488, "step": 5965 }, { "epoch": 0.7519601977516768, "grad_norm": 0.33037346601486206, "learning_rate": 0.0002797441293306486, "loss": 0.667, "step": 5970 }, { "epoch": 0.7525899801618541, "grad_norm": 0.35514095425605774, "learning_rate": 0.00027968893093122896, "loss": 0.6984, "step": 5975 }, { "epoch": 0.7532197625720314, "grad_norm": 0.4268254339694977, "learning_rate": 0.0002796336628858911, "loss": 0.6762, "step": 5980 }, { "epoch": 0.7538495449822087, "grad_norm": 0.33386656641960144, "learning_rate": 0.00027957832522431503, "loss": 0.6438, "step": 5985 }, { "epoch": 0.7544793273923859, "grad_norm": 0.374845415353775, "learning_rate": 0.00027952291797621846, "loss": 0.6422, "step": 5990 }, { "epoch": 0.7551091098025632, "grad_norm": 0.32742077112197876, "learning_rate": 0.0002794674411713563, "loss": 0.6685, "step": 5995 }, { "epoch": 0.7557388922127405, "grad_norm": 0.3118845820426941, "learning_rate": 0.0002794118948395209, "loss": 0.6273, "step": 6000 }, { "epoch": 0.7557388922127405, "eval_loss": 0.3097546696662903, "eval_runtime": 6.2567, "eval_samples_per_second": 159.828, "eval_steps_per_second": 10.069, "step": 6000 }, { "epoch": 0.7563686746229178, "grad_norm": 0.3407754898071289, "learning_rate": 0.00027935627901054197, "loss": 0.6712, "step": 6005 }, { "epoch": 0.7569984570330951, "grad_norm": 0.34817007184028625, "learning_rate": 0.0002793005937142863, "loss": 0.6492, "step": 6010 }, { "epoch": 0.7576282394432724, "grad_norm": 0.36492645740509033, "learning_rate": 0.00027924483898065833, "loss": 0.6467, "step": 6015 }, { "epoch": 0.7582580218534496, "grad_norm": 0.33556580543518066, "learning_rate": 0.0002791890148395995, "loss": 0.6486, "step": 6020 }, { "epoch": 0.7588878042636269, "grad_norm": 0.36699965596199036, "learning_rate": 0.00027913312132108874, "loss": 0.6909, "step": 6025 }, { "epoch": 0.7595175866738042, "grad_norm": 0.32526010274887085, "learning_rate": 0.0002790771584551421, "loss": 0.6234, "step": 6030 }, { "epoch": 0.7601473690839815, "grad_norm": 0.38366591930389404, "learning_rate": 0.00027902112627181295, "loss": 0.6195, "step": 6035 }, { "epoch": 0.7607771514941588, "grad_norm": 0.33587443828582764, "learning_rate": 0.0002789650248011918, "loss": 0.6546, "step": 6040 }, { "epoch": 0.761406933904336, "grad_norm": 0.36170026659965515, "learning_rate": 0.00027890885407340653, "loss": 0.6294, "step": 6045 }, { "epoch": 0.7620367163145133, "grad_norm": 0.34692490100860596, "learning_rate": 0.000278852614118622, "loss": 0.6468, "step": 6050 }, { "epoch": 0.7626664987246906, "grad_norm": 0.346608966588974, "learning_rate": 0.0002787963049670404, "loss": 0.6714, "step": 6055 }, { "epoch": 0.7632962811348679, "grad_norm": 0.3632940948009491, "learning_rate": 0.00027873992664890097, "loss": 0.6772, "step": 6060 }, { "epoch": 0.7639260635450452, "grad_norm": 0.38135001063346863, "learning_rate": 0.00027868347919448027, "loss": 0.658, "step": 6065 }, { "epoch": 0.7645558459552225, "grad_norm": 0.3518752455711365, "learning_rate": 0.00027862696263409177, "loss": 0.6445, "step": 6070 }, { "epoch": 0.7651856283653997, "grad_norm": 0.33004361391067505, "learning_rate": 0.00027857037699808613, "loss": 0.6553, "step": 6075 }, { "epoch": 0.765815410775577, "grad_norm": 0.36370858550071716, "learning_rate": 0.0002785137223168512, "loss": 0.6632, "step": 6080 }, { "epoch": 0.7664451931857543, "grad_norm": 0.3472859561443329, "learning_rate": 0.0002784569986208119, "loss": 0.626, "step": 6085 }, { "epoch": 0.7670749755959316, "grad_norm": 0.3560635447502136, "learning_rate": 0.00027840020594043, "loss": 0.6628, "step": 6090 }, { "epoch": 0.7677047580061089, "grad_norm": 0.3515082895755768, "learning_rate": 0.00027834334430620455, "loss": 0.7061, "step": 6095 }, { "epoch": 0.7683345404162861, "grad_norm": 0.3222733736038208, "learning_rate": 0.00027828641374867154, "loss": 0.617, "step": 6100 }, { "epoch": 0.7689643228264634, "grad_norm": 0.3362828493118286, "learning_rate": 0.00027822941429840397, "loss": 0.6825, "step": 6105 }, { "epoch": 0.7695941052366407, "grad_norm": 0.34228187799453735, "learning_rate": 0.0002781723459860119, "loss": 0.6306, "step": 6110 }, { "epoch": 0.770223887646818, "grad_norm": 0.3672444820404053, "learning_rate": 0.0002781152088421422, "loss": 0.6601, "step": 6115 }, { "epoch": 0.7708536700569953, "grad_norm": 0.3703080415725708, "learning_rate": 0.00027805800289747894, "loss": 0.6385, "step": 6120 }, { "epoch": 0.7714834524671725, "grad_norm": 0.34456151723861694, "learning_rate": 0.0002780007281827429, "loss": 0.6635, "step": 6125 }, { "epoch": 0.7721132348773498, "grad_norm": 0.3449029326438904, "learning_rate": 0.00027794338472869205, "loss": 0.6258, "step": 6130 }, { "epoch": 0.7727430172875271, "grad_norm": 0.3441922068595886, "learning_rate": 0.0002778859725661211, "loss": 0.627, "step": 6135 }, { "epoch": 0.7733727996977044, "grad_norm": 0.3855600357055664, "learning_rate": 0.00027782849172586156, "loss": 0.6205, "step": 6140 }, { "epoch": 0.7740025821078818, "grad_norm": 0.3838488757610321, "learning_rate": 0.0002777709422387821, "loss": 0.6463, "step": 6145 }, { "epoch": 0.7746323645180591, "grad_norm": 0.3128564953804016, "learning_rate": 0.00027771332413578805, "loss": 0.6639, "step": 6150 }, { "epoch": 0.7752621469282363, "grad_norm": 0.32142025232315063, "learning_rate": 0.00027765563744782166, "loss": 0.6187, "step": 6155 }, { "epoch": 0.7758919293384136, "grad_norm": 0.34378373622894287, "learning_rate": 0.000277597882205862, "loss": 0.659, "step": 6160 }, { "epoch": 0.7765217117485909, "grad_norm": 0.35872867703437805, "learning_rate": 0.0002775400584409249, "loss": 0.6245, "step": 6165 }, { "epoch": 0.7771514941587682, "grad_norm": 0.32217180728912354, "learning_rate": 0.00027748216618406316, "loss": 0.6216, "step": 6170 }, { "epoch": 0.7777812765689455, "grad_norm": 0.3139524757862091, "learning_rate": 0.00027742420546636616, "loss": 0.6831, "step": 6175 }, { "epoch": 0.7784110589791227, "grad_norm": 0.3159128427505493, "learning_rate": 0.00027736617631896017, "loss": 0.6417, "step": 6180 }, { "epoch": 0.7790408413893, "grad_norm": 0.36738142371177673, "learning_rate": 0.0002773080787730081, "loss": 0.6592, "step": 6185 }, { "epoch": 0.7796706237994773, "grad_norm": 0.31971079111099243, "learning_rate": 0.0002772499128597097, "loss": 0.6296, "step": 6190 }, { "epoch": 0.7803004062096546, "grad_norm": 0.3699764609336853, "learning_rate": 0.00027719167861030145, "loss": 0.6161, "step": 6195 }, { "epoch": 0.7809301886198319, "grad_norm": 0.3316752016544342, "learning_rate": 0.0002771333760560564, "loss": 0.6698, "step": 6200 }, { "epoch": 0.7815599710300092, "grad_norm": 0.34318891167640686, "learning_rate": 0.00027707500522828433, "loss": 0.6312, "step": 6205 }, { "epoch": 0.7821897534401864, "grad_norm": 0.3325194716453552, "learning_rate": 0.00027701656615833185, "loss": 0.6515, "step": 6210 }, { "epoch": 0.7828195358503637, "grad_norm": 0.3374411463737488, "learning_rate": 0.0002769580588775819, "loss": 0.6811, "step": 6215 }, { "epoch": 0.783449318260541, "grad_norm": 0.3507198989391327, "learning_rate": 0.00027689948341745433, "loss": 0.6177, "step": 6220 }, { "epoch": 0.7840791006707183, "grad_norm": 0.3619876205921173, "learning_rate": 0.00027684083980940543, "loss": 0.6812, "step": 6225 }, { "epoch": 0.7847088830808956, "grad_norm": 0.3660729229450226, "learning_rate": 0.00027678212808492824, "loss": 0.6888, "step": 6230 }, { "epoch": 0.7853386654910728, "grad_norm": 0.37557917833328247, "learning_rate": 0.00027672334827555226, "loss": 0.6516, "step": 6235 }, { "epoch": 0.7859684479012501, "grad_norm": 0.37117084860801697, "learning_rate": 0.00027666450041284363, "loss": 0.6503, "step": 6240 }, { "epoch": 0.7865982303114274, "grad_norm": 0.3434617519378662, "learning_rate": 0.00027660558452840487, "loss": 0.6582, "step": 6245 }, { "epoch": 0.7872280127216047, "grad_norm": 0.3878399431705475, "learning_rate": 0.0002765466006538753, "loss": 0.6309, "step": 6250 }, { "epoch": 0.787857795131782, "grad_norm": 0.3379189968109131, "learning_rate": 0.0002764875488209305, "loss": 0.6802, "step": 6255 }, { "epoch": 0.7884875775419593, "grad_norm": 0.3534158170223236, "learning_rate": 0.0002764284290612827, "loss": 0.6248, "step": 6260 }, { "epoch": 0.7891173599521365, "grad_norm": 0.3273150324821472, "learning_rate": 0.0002763692414066806, "loss": 0.617, "step": 6265 }, { "epoch": 0.7897471423623138, "grad_norm": 0.4256115257740021, "learning_rate": 0.0002763099858889093, "loss": 0.6452, "step": 6270 }, { "epoch": 0.7903769247724911, "grad_norm": 0.34881314635276794, "learning_rate": 0.0002762506625397903, "loss": 0.6545, "step": 6275 }, { "epoch": 0.7910067071826684, "grad_norm": 0.3283347487449646, "learning_rate": 0.0002761912713911817, "loss": 0.6819, "step": 6280 }, { "epoch": 0.7916364895928457, "grad_norm": 0.33939605951309204, "learning_rate": 0.0002761318124749778, "loss": 0.6188, "step": 6285 }, { "epoch": 0.7922662720030229, "grad_norm": 0.3786788582801819, "learning_rate": 0.00027607228582310947, "loss": 0.6583, "step": 6290 }, { "epoch": 0.7928960544132002, "grad_norm": 0.34528714418411255, "learning_rate": 0.0002760126914675439, "loss": 0.6594, "step": 6295 }, { "epoch": 0.7935258368233775, "grad_norm": 0.3494967818260193, "learning_rate": 0.00027595302944028447, "loss": 0.6241, "step": 6300 }, { "epoch": 0.7941556192335548, "grad_norm": 0.350005179643631, "learning_rate": 0.00027589329977337126, "loss": 0.6724, "step": 6305 }, { "epoch": 0.7947854016437321, "grad_norm": 0.3381168246269226, "learning_rate": 0.0002758335024988803, "loss": 0.6062, "step": 6310 }, { "epoch": 0.7954151840539094, "grad_norm": 0.32583653926849365, "learning_rate": 0.0002757736376489242, "loss": 0.6602, "step": 6315 }, { "epoch": 0.7960449664640866, "grad_norm": 0.33687326312065125, "learning_rate": 0.0002757137052556517, "loss": 0.6391, "step": 6320 }, { "epoch": 0.7966747488742639, "grad_norm": 0.35395026206970215, "learning_rate": 0.00027565370535124784, "loss": 0.6445, "step": 6325 }, { "epoch": 0.7973045312844412, "grad_norm": 0.3484829068183899, "learning_rate": 0.000275593637967934, "loss": 0.6242, "step": 6330 }, { "epoch": 0.7979343136946185, "grad_norm": 0.32783517241477966, "learning_rate": 0.0002755335031379677, "loss": 0.6481, "step": 6335 }, { "epoch": 0.7985640961047958, "grad_norm": 0.3683319389820099, "learning_rate": 0.0002754733008936427, "loss": 0.6506, "step": 6340 }, { "epoch": 0.799193878514973, "grad_norm": 0.360219269990921, "learning_rate": 0.00027541303126728907, "loss": 0.6377, "step": 6345 }, { "epoch": 0.7998236609251503, "grad_norm": 0.3323548436164856, "learning_rate": 0.00027535269429127283, "loss": 0.6278, "step": 6350 }, { "epoch": 0.8004534433353276, "grad_norm": 0.33823835849761963, "learning_rate": 0.0002752922899979965, "loss": 0.5999, "step": 6355 }, { "epoch": 0.801083225745505, "grad_norm": 0.35394924879074097, "learning_rate": 0.0002752318184198984, "loss": 0.6873, "step": 6360 }, { "epoch": 0.8017130081556822, "grad_norm": 0.35529881715774536, "learning_rate": 0.00027517127958945315, "loss": 0.6183, "step": 6365 }, { "epoch": 0.8023427905658594, "grad_norm": 0.35854044556617737, "learning_rate": 0.00027511067353917166, "loss": 0.6394, "step": 6370 }, { "epoch": 0.8029725729760367, "grad_norm": 0.32757097482681274, "learning_rate": 0.0002750500003016006, "loss": 0.6383, "step": 6375 }, { "epoch": 0.803602355386214, "grad_norm": 0.3267909586429596, "learning_rate": 0.0002749892599093229, "loss": 0.5951, "step": 6380 }, { "epoch": 0.8042321377963914, "grad_norm": 0.31262004375457764, "learning_rate": 0.0002749284523949576, "loss": 0.6497, "step": 6385 }, { "epoch": 0.8048619202065687, "grad_norm": 0.34036824107170105, "learning_rate": 0.00027486757779115973, "loss": 0.6295, "step": 6390 }, { "epoch": 0.805491702616746, "grad_norm": 0.3461470901966095, "learning_rate": 0.0002748066361306203, "loss": 0.6537, "step": 6395 }, { "epoch": 0.8061214850269232, "grad_norm": 0.35146886110305786, "learning_rate": 0.00027474562744606636, "loss": 0.6217, "step": 6400 }, { "epoch": 0.8067512674371005, "grad_norm": 0.37654054164886475, "learning_rate": 0.000274684551770261, "loss": 0.6417, "step": 6405 }, { "epoch": 0.8073810498472778, "grad_norm": 0.36115625500679016, "learning_rate": 0.0002746234091360032, "loss": 0.6638, "step": 6410 }, { "epoch": 0.8080108322574551, "grad_norm": 0.3503740727901459, "learning_rate": 0.00027456219957612804, "loss": 0.6652, "step": 6415 }, { "epoch": 0.8086406146676324, "grad_norm": 0.3303118646144867, "learning_rate": 0.0002745009231235064, "loss": 0.614, "step": 6420 }, { "epoch": 0.8092703970778096, "grad_norm": 0.35880813002586365, "learning_rate": 0.00027443957981104517, "loss": 0.6449, "step": 6425 }, { "epoch": 0.8099001794879869, "grad_norm": 0.3664454221725464, "learning_rate": 0.000274378169671687, "loss": 0.6448, "step": 6430 }, { "epoch": 0.8105299618981642, "grad_norm": 0.38473254442214966, "learning_rate": 0.00027431669273841067, "loss": 0.6576, "step": 6435 }, { "epoch": 0.8111597443083415, "grad_norm": 0.3694675862789154, "learning_rate": 0.0002742551490442307, "loss": 0.6365, "step": 6440 }, { "epoch": 0.8117895267185188, "grad_norm": 0.32066047191619873, "learning_rate": 0.0002741935386221973, "loss": 0.6563, "step": 6445 }, { "epoch": 0.8124193091286961, "grad_norm": 0.3764455020427704, "learning_rate": 0.0002741318615053968, "loss": 0.61, "step": 6450 }, { "epoch": 0.8130490915388733, "grad_norm": 0.3913812041282654, "learning_rate": 0.00027407011772695124, "loss": 0.6606, "step": 6455 }, { "epoch": 0.8136788739490506, "grad_norm": 0.2876626253128052, "learning_rate": 0.0002740083073200184, "loss": 0.6123, "step": 6460 }, { "epoch": 0.8143086563592279, "grad_norm": 0.37668120861053467, "learning_rate": 0.0002739464303177919, "loss": 0.6323, "step": 6465 }, { "epoch": 0.8149384387694052, "grad_norm": 0.3343159854412079, "learning_rate": 0.000273884486753501, "loss": 0.6051, "step": 6470 }, { "epoch": 0.8155682211795825, "grad_norm": 0.3852281868457794, "learning_rate": 0.00027382247666041097, "loss": 0.6614, "step": 6475 }, { "epoch": 0.8161980035897597, "grad_norm": 0.36491283774375916, "learning_rate": 0.0002737604000718225, "loss": 0.6383, "step": 6480 }, { "epoch": 0.816827785999937, "grad_norm": 0.32019633054733276, "learning_rate": 0.00027369825702107224, "loss": 0.623, "step": 6485 }, { "epoch": 0.8174575684101143, "grad_norm": 0.3173837661743164, "learning_rate": 0.0002736360475415324, "loss": 0.599, "step": 6490 }, { "epoch": 0.8180873508202916, "grad_norm": 0.31505605578422546, "learning_rate": 0.00027357377166661086, "loss": 0.6341, "step": 6495 }, { "epoch": 0.8187171332304689, "grad_norm": 0.3370759189128876, "learning_rate": 0.00027351142942975124, "loss": 0.6296, "step": 6500 }, { "epoch": 0.8193469156406462, "grad_norm": 0.3554564416408539, "learning_rate": 0.0002734490208644327, "loss": 0.6587, "step": 6505 }, { "epoch": 0.8199766980508234, "grad_norm": 0.3487757444381714, "learning_rate": 0.0002733865460041701, "loss": 0.6292, "step": 6510 }, { "epoch": 0.8206064804610007, "grad_norm": 0.3280607759952545, "learning_rate": 0.0002733240048825138, "loss": 0.5964, "step": 6515 }, { "epoch": 0.821236262871178, "grad_norm": 0.35416868329048157, "learning_rate": 0.0002732613975330499, "loss": 0.6089, "step": 6520 }, { "epoch": 0.8218660452813553, "grad_norm": 0.3558996915817261, "learning_rate": 0.00027319872398939995, "loss": 0.5791, "step": 6525 }, { "epoch": 0.8224958276915326, "grad_norm": 0.35394206643104553, "learning_rate": 0.000273135984285221, "loss": 0.6183, "step": 6530 }, { "epoch": 0.8231256101017098, "grad_norm": 0.33172932267189026, "learning_rate": 0.0002730731784542058, "loss": 0.605, "step": 6535 }, { "epoch": 0.8237553925118871, "grad_norm": 0.3498142957687378, "learning_rate": 0.00027301030653008253, "loss": 0.6199, "step": 6540 }, { "epoch": 0.8243851749220644, "grad_norm": 0.3364173471927643, "learning_rate": 0.0002729473685466148, "loss": 0.6352, "step": 6545 }, { "epoch": 0.8250149573322417, "grad_norm": 0.38148370385169983, "learning_rate": 0.00027288436453760164, "loss": 0.6216, "step": 6550 }, { "epoch": 0.825644739742419, "grad_norm": 0.33975306153297424, "learning_rate": 0.0002728212945368778, "loss": 0.6155, "step": 6555 }, { "epoch": 0.8262745221525962, "grad_norm": 0.3361944854259491, "learning_rate": 0.0002727581585783133, "loss": 0.6084, "step": 6560 }, { "epoch": 0.8269043045627735, "grad_norm": 0.3503773808479309, "learning_rate": 0.00027269495669581353, "loss": 0.6355, "step": 6565 }, { "epoch": 0.8275340869729508, "grad_norm": 0.35406753420829773, "learning_rate": 0.00027263168892331934, "loss": 0.624, "step": 6570 }, { "epoch": 0.8281638693831281, "grad_norm": 0.3337428569793701, "learning_rate": 0.00027256835529480697, "loss": 0.6451, "step": 6575 }, { "epoch": 0.8287936517933054, "grad_norm": 0.3431616425514221, "learning_rate": 0.00027250495584428807, "loss": 0.5969, "step": 6580 }, { "epoch": 0.8294234342034827, "grad_norm": 0.4032285511493683, "learning_rate": 0.0002724414906058096, "loss": 0.5954, "step": 6585 }, { "epoch": 0.8300532166136599, "grad_norm": 0.3352124094963074, "learning_rate": 0.00027237795961345383, "loss": 0.6077, "step": 6590 }, { "epoch": 0.8306829990238372, "grad_norm": 0.3181077837944031, "learning_rate": 0.0002723143629013383, "loss": 0.6107, "step": 6595 }, { "epoch": 0.8313127814340145, "grad_norm": 0.32390958070755005, "learning_rate": 0.000272250700503616, "loss": 0.6345, "step": 6600 }, { "epoch": 0.8319425638441919, "grad_norm": 0.3199234902858734, "learning_rate": 0.0002721869724544749, "loss": 0.6268, "step": 6605 }, { "epoch": 0.8325723462543692, "grad_norm": 0.38811957836151123, "learning_rate": 0.00027212317878813863, "loss": 0.643, "step": 6610 }, { "epoch": 0.8332021286645463, "grad_norm": 0.38010820746421814, "learning_rate": 0.00027205931953886575, "loss": 0.6055, "step": 6615 }, { "epoch": 0.8338319110747237, "grad_norm": 0.3288145065307617, "learning_rate": 0.00027199539474095013, "loss": 0.6311, "step": 6620 }, { "epoch": 0.834461693484901, "grad_norm": 0.33361807465553284, "learning_rate": 0.0002719314044287209, "loss": 0.6083, "step": 6625 }, { "epoch": 0.8350914758950783, "grad_norm": 0.350864440202713, "learning_rate": 0.0002718673486365423, "loss": 0.5969, "step": 6630 }, { "epoch": 0.8357212583052556, "grad_norm": 0.35795754194259644, "learning_rate": 0.0002718032273988137, "loss": 0.6623, "step": 6635 }, { "epoch": 0.8363510407154329, "grad_norm": 0.3748815357685089, "learning_rate": 0.0002717390407499697, "loss": 0.6301, "step": 6640 }, { "epoch": 0.8369808231256101, "grad_norm": 0.3146851360797882, "learning_rate": 0.00027167478872448, "loss": 0.62, "step": 6645 }, { "epoch": 0.8376106055357874, "grad_norm": 0.3758367598056793, "learning_rate": 0.0002716104713568495, "loss": 0.6202, "step": 6650 }, { "epoch": 0.8382403879459647, "grad_norm": 0.4035817086696625, "learning_rate": 0.0002715460886816179, "loss": 0.606, "step": 6655 }, { "epoch": 0.838870170356142, "grad_norm": 0.3586306869983673, "learning_rate": 0.00027148164073336026, "loss": 0.6523, "step": 6660 }, { "epoch": 0.8394999527663193, "grad_norm": 0.33375057578086853, "learning_rate": 0.0002714171275466866, "loss": 0.6193, "step": 6665 }, { "epoch": 0.8401297351764965, "grad_norm": 0.30258163809776306, "learning_rate": 0.0002713525491562421, "loss": 0.6225, "step": 6670 }, { "epoch": 0.8407595175866738, "grad_norm": 0.33032524585723877, "learning_rate": 0.00027128790559670667, "loss": 0.628, "step": 6675 }, { "epoch": 0.8413892999968511, "grad_norm": 0.36689457297325134, "learning_rate": 0.00027122319690279535, "loss": 0.6341, "step": 6680 }, { "epoch": 0.8420190824070284, "grad_norm": 0.35744035243988037, "learning_rate": 0.00027115842310925837, "loss": 0.5945, "step": 6685 }, { "epoch": 0.8426488648172057, "grad_norm": 0.3377218246459961, "learning_rate": 0.0002710935842508806, "loss": 0.6216, "step": 6690 }, { "epoch": 0.843278647227383, "grad_norm": 0.3244309723377228, "learning_rate": 0.000271028680362482, "loss": 0.6045, "step": 6695 }, { "epoch": 0.8439084296375602, "grad_norm": 0.34593185782432556, "learning_rate": 0.00027096371147891744, "loss": 0.6277, "step": 6700 }, { "epoch": 0.8445382120477375, "grad_norm": 0.3151993751525879, "learning_rate": 0.0002708986776350767, "loss": 0.5929, "step": 6705 }, { "epoch": 0.8451679944579148, "grad_norm": 0.38307860493659973, "learning_rate": 0.0002708335788658845, "loss": 0.5934, "step": 6710 }, { "epoch": 0.8457977768680921, "grad_norm": 0.3155449330806732, "learning_rate": 0.0002707684152063003, "loss": 0.5838, "step": 6715 }, { "epoch": 0.8464275592782694, "grad_norm": 0.3827744424343109, "learning_rate": 0.00027070318669131845, "loss": 0.5976, "step": 6720 }, { "epoch": 0.8470573416884466, "grad_norm": 0.35382625460624695, "learning_rate": 0.00027063789335596825, "loss": 0.5997, "step": 6725 }, { "epoch": 0.8476871240986239, "grad_norm": 0.36884164810180664, "learning_rate": 0.00027057253523531365, "loss": 0.6373, "step": 6730 }, { "epoch": 0.8483169065088012, "grad_norm": 0.35557276010513306, "learning_rate": 0.0002705071123644534, "loss": 0.6717, "step": 6735 }, { "epoch": 0.8489466889189785, "grad_norm": 0.3088480234146118, "learning_rate": 0.00027044162477852124, "loss": 0.6011, "step": 6740 }, { "epoch": 0.8495764713291558, "grad_norm": 0.33665964007377625, "learning_rate": 0.0002703760725126853, "loss": 0.6039, "step": 6745 }, { "epoch": 0.850206253739333, "grad_norm": 0.3297666311264038, "learning_rate": 0.0002703104556021488, "loss": 0.6226, "step": 6750 }, { "epoch": 0.8508360361495103, "grad_norm": 0.33897268772125244, "learning_rate": 0.00027024477408214945, "loss": 0.5564, "step": 6755 }, { "epoch": 0.8514658185596876, "grad_norm": 0.3549680709838867, "learning_rate": 0.0002701790279879597, "loss": 0.5989, "step": 6760 }, { "epoch": 0.8520956009698649, "grad_norm": 0.31162139773368835, "learning_rate": 0.0002701132173548868, "loss": 0.6363, "step": 6765 }, { "epoch": 0.8527253833800422, "grad_norm": 0.35543885827064514, "learning_rate": 0.0002700473422182724, "loss": 0.6228, "step": 6770 }, { "epoch": 0.8533551657902195, "grad_norm": 0.3361263871192932, "learning_rate": 0.0002699814026134932, "loss": 0.5957, "step": 6775 }, { "epoch": 0.8539849482003967, "grad_norm": 0.2764013707637787, "learning_rate": 0.00026991539857596, "loss": 0.5982, "step": 6780 }, { "epoch": 0.854614730610574, "grad_norm": 0.3229328691959381, "learning_rate": 0.0002698493301411187, "loss": 0.6562, "step": 6785 }, { "epoch": 0.8552445130207513, "grad_norm": 0.3163946270942688, "learning_rate": 0.00026978319734444943, "loss": 0.6125, "step": 6790 }, { "epoch": 0.8558742954309286, "grad_norm": 0.38809090852737427, "learning_rate": 0.0002697170002214671, "loss": 0.6308, "step": 6795 }, { "epoch": 0.8565040778411059, "grad_norm": 0.3416973650455475, "learning_rate": 0.0002696507388077209, "loss": 0.6565, "step": 6800 }, { "epoch": 0.8571338602512831, "grad_norm": 0.3220008909702301, "learning_rate": 0.00026958441313879494, "loss": 0.6211, "step": 6805 }, { "epoch": 0.8577636426614604, "grad_norm": 0.34507647156715393, "learning_rate": 0.00026951802325030755, "loss": 0.6384, "step": 6810 }, { "epoch": 0.8583934250716377, "grad_norm": 0.3345770239830017, "learning_rate": 0.00026945156917791154, "loss": 0.6566, "step": 6815 }, { "epoch": 0.859023207481815, "grad_norm": 0.32488980889320374, "learning_rate": 0.0002693850509572943, "loss": 0.626, "step": 6820 }, { "epoch": 0.8596529898919923, "grad_norm": 0.3537434935569763, "learning_rate": 0.00026931846862417766, "loss": 0.6539, "step": 6825 }, { "epoch": 0.8602827723021697, "grad_norm": 0.3165736794471741, "learning_rate": 0.0002692518222143179, "loss": 0.6468, "step": 6830 }, { "epoch": 0.8609125547123468, "grad_norm": 0.34746891260147095, "learning_rate": 0.0002691851117635056, "loss": 0.6498, "step": 6835 }, { "epoch": 0.8615423371225241, "grad_norm": 0.3370078206062317, "learning_rate": 0.00026911833730756577, "loss": 0.5951, "step": 6840 }, { "epoch": 0.8621721195327015, "grad_norm": 0.3180099427700043, "learning_rate": 0.00026905149888235787, "loss": 0.609, "step": 6845 }, { "epoch": 0.8628019019428788, "grad_norm": 0.34123897552490234, "learning_rate": 0.0002689845965237757, "loss": 0.6228, "step": 6850 }, { "epoch": 0.8634316843530561, "grad_norm": 0.3529733717441559, "learning_rate": 0.00026891763026774725, "loss": 0.6101, "step": 6855 }, { "epoch": 0.8640614667632333, "grad_norm": 0.3116464614868164, "learning_rate": 0.00026885060015023496, "loss": 0.5734, "step": 6860 }, { "epoch": 0.8646912491734106, "grad_norm": 0.3331621587276459, "learning_rate": 0.00026878350620723556, "loss": 0.6004, "step": 6865 }, { "epoch": 0.8653210315835879, "grad_norm": 0.3215835690498352, "learning_rate": 0.00026871634847478007, "loss": 0.6105, "step": 6870 }, { "epoch": 0.8659508139937652, "grad_norm": 0.3454177677631378, "learning_rate": 0.0002686491269889336, "loss": 0.6203, "step": 6875 }, { "epoch": 0.8665805964039425, "grad_norm": 0.3336181640625, "learning_rate": 0.0002685818417857958, "loss": 0.6179, "step": 6880 }, { "epoch": 0.8672103788141198, "grad_norm": 0.3452587127685547, "learning_rate": 0.00026851449290150024, "loss": 0.5918, "step": 6885 }, { "epoch": 0.867840161224297, "grad_norm": 0.37552833557128906, "learning_rate": 0.0002684470803722148, "loss": 0.6284, "step": 6890 }, { "epoch": 0.8684699436344743, "grad_norm": 0.33525559306144714, "learning_rate": 0.0002683796042341416, "loss": 0.6465, "step": 6895 }, { "epoch": 0.8690997260446516, "grad_norm": 0.3272569477558136, "learning_rate": 0.00026831206452351683, "loss": 0.636, "step": 6900 }, { "epoch": 0.8697295084548289, "grad_norm": 0.35215091705322266, "learning_rate": 0.0002682444612766109, "loss": 0.6415, "step": 6905 }, { "epoch": 0.8703592908650062, "grad_norm": 0.33025211095809937, "learning_rate": 0.0002681767945297282, "loss": 0.6677, "step": 6910 }, { "epoch": 0.8709890732751834, "grad_norm": 0.34073176980018616, "learning_rate": 0.0002681090643192075, "loss": 0.6386, "step": 6915 }, { "epoch": 0.8716188556853607, "grad_norm": 0.4070134162902832, "learning_rate": 0.0002680412706814213, "loss": 0.6365, "step": 6920 }, { "epoch": 0.872248638095538, "grad_norm": 0.33693283796310425, "learning_rate": 0.00026797341365277644, "loss": 0.6465, "step": 6925 }, { "epoch": 0.8728784205057153, "grad_norm": 0.3678983747959137, "learning_rate": 0.0002679054932697136, "loss": 0.594, "step": 6930 }, { "epoch": 0.8735082029158926, "grad_norm": 0.31632333993911743, "learning_rate": 0.00026783750956870764, "loss": 0.6128, "step": 6935 }, { "epoch": 0.8741379853260698, "grad_norm": 0.3184865713119507, "learning_rate": 0.0002677694625862674, "loss": 0.5955, "step": 6940 }, { "epoch": 0.8747677677362471, "grad_norm": 0.33729860186576843, "learning_rate": 0.00026770135235893556, "loss": 0.609, "step": 6945 }, { "epoch": 0.8753975501464244, "grad_norm": 0.3195466995239258, "learning_rate": 0.0002676331789232889, "loss": 0.6399, "step": 6950 }, { "epoch": 0.8760273325566017, "grad_norm": 0.35504212975502014, "learning_rate": 0.0002675649423159382, "loss": 0.6162, "step": 6955 }, { "epoch": 0.876657114966779, "grad_norm": 0.3598940372467041, "learning_rate": 0.000267496642573528, "loss": 0.6117, "step": 6960 }, { "epoch": 0.8772868973769563, "grad_norm": 0.32016637921333313, "learning_rate": 0.0002674282797327368, "loss": 0.6129, "step": 6965 }, { "epoch": 0.8779166797871335, "grad_norm": 0.36968451738357544, "learning_rate": 0.00026735985383027704, "loss": 0.619, "step": 6970 }, { "epoch": 0.8785464621973108, "grad_norm": 0.3299955427646637, "learning_rate": 0.000267291364902895, "loss": 0.5894, "step": 6975 }, { "epoch": 0.8791762446074881, "grad_norm": 0.34892305731773376, "learning_rate": 0.0002672228129873708, "loss": 0.6152, "step": 6980 }, { "epoch": 0.8798060270176654, "grad_norm": 0.379016637802124, "learning_rate": 0.00026715419812051833, "loss": 0.6633, "step": 6985 }, { "epoch": 0.8804358094278427, "grad_norm": 0.3378797173500061, "learning_rate": 0.00026708552033918544, "loss": 0.5911, "step": 6990 }, { "epoch": 0.8810655918380199, "grad_norm": 0.3348138928413391, "learning_rate": 0.0002670167796802536, "loss": 0.5841, "step": 6995 }, { "epoch": 0.8816953742481972, "grad_norm": 0.36374861001968384, "learning_rate": 0.0002669479761806381, "loss": 0.5973, "step": 7000 }, { "epoch": 0.8816953742481972, "eval_loss": 0.3066178560256958, "eval_runtime": 6.2494, "eval_samples_per_second": 160.014, "eval_steps_per_second": 10.081, "step": 7000 }, { "epoch": 0.8823251566583745, "grad_norm": 0.31616318225860596, "learning_rate": 0.000266879109877288, "loss": 0.6302, "step": 7005 }, { "epoch": 0.8829549390685518, "grad_norm": 0.37413114309310913, "learning_rate": 0.00026681018080718615, "loss": 0.6141, "step": 7010 }, { "epoch": 0.8835847214787291, "grad_norm": 0.3616124987602234, "learning_rate": 0.0002667411890073489, "loss": 0.6081, "step": 7015 }, { "epoch": 0.8842145038889064, "grad_norm": 0.3536156713962555, "learning_rate": 0.00026667213451482655, "loss": 0.6101, "step": 7020 }, { "epoch": 0.8848442862990836, "grad_norm": 0.2826579809188843, "learning_rate": 0.00026660301736670293, "loss": 0.5803, "step": 7025 }, { "epoch": 0.8854740687092609, "grad_norm": 0.3352709710597992, "learning_rate": 0.00026653383760009546, "loss": 0.5994, "step": 7030 }, { "epoch": 0.8861038511194382, "grad_norm": 0.320122092962265, "learning_rate": 0.00026646459525215524, "loss": 0.6159, "step": 7035 }, { "epoch": 0.8867336335296155, "grad_norm": 0.3512963652610779, "learning_rate": 0.0002663952903600671, "loss": 0.6034, "step": 7040 }, { "epoch": 0.8873634159397928, "grad_norm": 0.358071506023407, "learning_rate": 0.00026632592296104926, "loss": 0.6155, "step": 7045 }, { "epoch": 0.88799319834997, "grad_norm": 0.342318058013916, "learning_rate": 0.0002662564930923536, "loss": 0.5997, "step": 7050 }, { "epoch": 0.8886229807601473, "grad_norm": 0.291960746049881, "learning_rate": 0.0002661870007912656, "loss": 0.5721, "step": 7055 }, { "epoch": 0.8892527631703246, "grad_norm": 0.3608805239200592, "learning_rate": 0.0002661174460951042, "loss": 0.6248, "step": 7060 }, { "epoch": 0.889882545580502, "grad_norm": 0.329289972782135, "learning_rate": 0.0002660478290412218, "loss": 0.6163, "step": 7065 }, { "epoch": 0.8905123279906793, "grad_norm": 0.352383553981781, "learning_rate": 0.0002659781496670044, "loss": 0.6252, "step": 7070 }, { "epoch": 0.8911421104008566, "grad_norm": 0.3424574136734009, "learning_rate": 0.0002659084080098714, "loss": 0.5562, "step": 7075 }, { "epoch": 0.8917718928110338, "grad_norm": 0.32095563411712646, "learning_rate": 0.0002658386041072757, "loss": 0.6232, "step": 7080 }, { "epoch": 0.892401675221211, "grad_norm": 0.3307218849658966, "learning_rate": 0.00026576873799670356, "loss": 0.5958, "step": 7085 }, { "epoch": 0.8930314576313884, "grad_norm": 0.31858259439468384, "learning_rate": 0.00026569880971567464, "loss": 0.6128, "step": 7090 }, { "epoch": 0.8936612400415657, "grad_norm": 0.3014832139015198, "learning_rate": 0.00026562881930174213, "loss": 0.5886, "step": 7095 }, { "epoch": 0.894291022451743, "grad_norm": 0.35925576090812683, "learning_rate": 0.00026555876679249234, "loss": 0.6032, "step": 7100 }, { "epoch": 0.8949208048619202, "grad_norm": 0.337100625038147, "learning_rate": 0.0002654886522255452, "loss": 0.6217, "step": 7105 }, { "epoch": 0.8955505872720975, "grad_norm": 0.34906861186027527, "learning_rate": 0.00026541847563855373, "loss": 0.5999, "step": 7110 }, { "epoch": 0.8961803696822748, "grad_norm": 0.2829444110393524, "learning_rate": 0.00026534823706920443, "loss": 0.5747, "step": 7115 }, { "epoch": 0.8968101520924521, "grad_norm": 0.3298097550868988, "learning_rate": 0.00026527793655521697, "loss": 0.5959, "step": 7120 }, { "epoch": 0.8974399345026294, "grad_norm": 0.3762158453464508, "learning_rate": 0.0002652075741343444, "loss": 0.6325, "step": 7125 }, { "epoch": 0.8980697169128066, "grad_norm": 0.3318065106868744, "learning_rate": 0.00026513714984437284, "loss": 0.6015, "step": 7130 }, { "epoch": 0.8986994993229839, "grad_norm": 0.3132246434688568, "learning_rate": 0.0002650666637231218, "loss": 0.6317, "step": 7135 }, { "epoch": 0.8993292817331612, "grad_norm": 0.3308473527431488, "learning_rate": 0.00026499611580844403, "loss": 0.6364, "step": 7140 }, { "epoch": 0.8999590641433385, "grad_norm": 0.31450155377388, "learning_rate": 0.0002649255061382252, "loss": 0.6186, "step": 7145 }, { "epoch": 0.9005888465535158, "grad_norm": 0.3408615291118622, "learning_rate": 0.00026485483475038445, "loss": 0.5954, "step": 7150 }, { "epoch": 0.9012186289636931, "grad_norm": 0.34355321526527405, "learning_rate": 0.0002647841016828738, "loss": 0.6143, "step": 7155 }, { "epoch": 0.9018484113738703, "grad_norm": 0.35341107845306396, "learning_rate": 0.00026471330697367865, "loss": 0.5887, "step": 7160 }, { "epoch": 0.9024781937840476, "grad_norm": 0.3439336121082306, "learning_rate": 0.0002646424506608173, "loss": 0.6152, "step": 7165 }, { "epoch": 0.9031079761942249, "grad_norm": 0.32301509380340576, "learning_rate": 0.00026457153278234126, "loss": 0.6191, "step": 7170 }, { "epoch": 0.9037377586044022, "grad_norm": 0.3085480034351349, "learning_rate": 0.000264500553376335, "loss": 0.5993, "step": 7175 }, { "epoch": 0.9043675410145795, "grad_norm": 0.3285475969314575, "learning_rate": 0.0002644295124809161, "loss": 0.5832, "step": 7180 }, { "epoch": 0.9049973234247567, "grad_norm": 0.3160327076911926, "learning_rate": 0.0002643584101342352, "loss": 0.6258, "step": 7185 }, { "epoch": 0.905627105834934, "grad_norm": 0.30449238419532776, "learning_rate": 0.0002642872463744759, "loss": 0.62, "step": 7190 }, { "epoch": 0.9062568882451113, "grad_norm": 0.31154754757881165, "learning_rate": 0.00026421602123985455, "loss": 0.5888, "step": 7195 }, { "epoch": 0.9068866706552886, "grad_norm": 0.32224607467651367, "learning_rate": 0.0002641447347686209, "loss": 0.5971, "step": 7200 }, { "epoch": 0.9075164530654659, "grad_norm": 0.33809399604797363, "learning_rate": 0.0002640733869990573, "loss": 0.5942, "step": 7205 }, { "epoch": 0.9081462354756432, "grad_norm": 0.337990403175354, "learning_rate": 0.0002640019779694792, "loss": 0.5996, "step": 7210 }, { "epoch": 0.9087760178858204, "grad_norm": 0.33843520283699036, "learning_rate": 0.0002639305077182348, "loss": 0.6009, "step": 7215 }, { "epoch": 0.9094058002959977, "grad_norm": 0.31854307651519775, "learning_rate": 0.00026385897628370536, "loss": 0.5929, "step": 7220 }, { "epoch": 0.910035582706175, "grad_norm": 0.31263160705566406, "learning_rate": 0.0002637873837043049, "loss": 0.5861, "step": 7225 }, { "epoch": 0.9106653651163523, "grad_norm": 0.3141006827354431, "learning_rate": 0.00026371573001848005, "loss": 0.6204, "step": 7230 }, { "epoch": 0.9112951475265296, "grad_norm": 0.3565130829811096, "learning_rate": 0.00026364401526471077, "loss": 0.6051, "step": 7235 }, { "epoch": 0.9119249299367068, "grad_norm": 0.3886755108833313, "learning_rate": 0.0002635722394815094, "loss": 0.6162, "step": 7240 }, { "epoch": 0.9125547123468841, "grad_norm": 0.32173478603363037, "learning_rate": 0.0002635004027074211, "loss": 0.5908, "step": 7245 }, { "epoch": 0.9131844947570614, "grad_norm": 0.3483346998691559, "learning_rate": 0.0002634285049810239, "loss": 0.5934, "step": 7250 }, { "epoch": 0.9138142771672387, "grad_norm": 0.31829094886779785, "learning_rate": 0.00026335654634092857, "loss": 0.6205, "step": 7255 }, { "epoch": 0.914444059577416, "grad_norm": 0.2864934206008911, "learning_rate": 0.0002632845268257785, "loss": 0.5486, "step": 7260 }, { "epoch": 0.9150738419875933, "grad_norm": 0.34583529829978943, "learning_rate": 0.0002632124464742499, "loss": 0.5994, "step": 7265 }, { "epoch": 0.9157036243977705, "grad_norm": 0.3405662775039673, "learning_rate": 0.00026314030532505146, "loss": 0.5941, "step": 7270 }, { "epoch": 0.9163334068079478, "grad_norm": 0.319985568523407, "learning_rate": 0.00026306810341692464, "loss": 0.5949, "step": 7275 }, { "epoch": 0.9169631892181251, "grad_norm": 0.3206420838832855, "learning_rate": 0.00026299584078864354, "loss": 0.5895, "step": 7280 }, { "epoch": 0.9175929716283024, "grad_norm": 0.33022215962409973, "learning_rate": 0.00026292351747901486, "loss": 0.6018, "step": 7285 }, { "epoch": 0.9182227540384797, "grad_norm": 0.3440692722797394, "learning_rate": 0.00026285113352687785, "loss": 0.5818, "step": 7290 }, { "epoch": 0.9188525364486569, "grad_norm": 0.3580811619758606, "learning_rate": 0.0002627786889711043, "loss": 0.6024, "step": 7295 }, { "epoch": 0.9194823188588342, "grad_norm": 0.3101358413696289, "learning_rate": 0.0002627061838505987, "loss": 0.6241, "step": 7300 }, { "epoch": 0.9201121012690116, "grad_norm": 0.3681425452232361, "learning_rate": 0.00026263361820429783, "loss": 0.5759, "step": 7305 }, { "epoch": 0.9207418836791889, "grad_norm": 0.3331769108772278, "learning_rate": 0.0002625609920711712, "loss": 0.5696, "step": 7310 }, { "epoch": 0.9213716660893662, "grad_norm": 0.34252071380615234, "learning_rate": 0.00026248830549022064, "loss": 0.6171, "step": 7315 }, { "epoch": 0.9220014484995434, "grad_norm": 0.31009170413017273, "learning_rate": 0.00026241555850048056, "loss": 0.5758, "step": 7320 }, { "epoch": 0.9226312309097207, "grad_norm": 0.33126717805862427, "learning_rate": 0.00026234275114101765, "loss": 0.557, "step": 7325 }, { "epoch": 0.923261013319898, "grad_norm": 0.35423141717910767, "learning_rate": 0.00026226988345093126, "loss": 0.6239, "step": 7330 }, { "epoch": 0.9238907957300753, "grad_norm": 0.31321558356285095, "learning_rate": 0.0002621969554693529, "loss": 0.5796, "step": 7335 }, { "epoch": 0.9245205781402526, "grad_norm": 0.38709312677383423, "learning_rate": 0.00026212396723544664, "loss": 0.5831, "step": 7340 }, { "epoch": 0.9251503605504299, "grad_norm": 0.3205506205558777, "learning_rate": 0.0002620509187884088, "loss": 0.5577, "step": 7345 }, { "epoch": 0.9257801429606071, "grad_norm": 0.3263196647167206, "learning_rate": 0.00026197781016746804, "loss": 0.5729, "step": 7350 }, { "epoch": 0.9264099253707844, "grad_norm": 0.3553536534309387, "learning_rate": 0.0002619046414118854, "loss": 0.5968, "step": 7355 }, { "epoch": 0.9270397077809617, "grad_norm": 0.4170524477958679, "learning_rate": 0.0002618314125609541, "loss": 0.5731, "step": 7360 }, { "epoch": 0.927669490191139, "grad_norm": 0.3739701807498932, "learning_rate": 0.00026175812365399976, "loss": 0.5785, "step": 7365 }, { "epoch": 0.9282992726013163, "grad_norm": 0.32139813899993896, "learning_rate": 0.0002616847747303802, "loss": 0.5909, "step": 7370 }, { "epoch": 0.9289290550114935, "grad_norm": 0.3099890947341919, "learning_rate": 0.00026161136582948544, "loss": 0.5579, "step": 7375 }, { "epoch": 0.9295588374216708, "grad_norm": 0.349729984998703, "learning_rate": 0.0002615378969907378, "loss": 0.5762, "step": 7380 }, { "epoch": 0.9301886198318481, "grad_norm": 0.3257734775543213, "learning_rate": 0.00026146436825359167, "loss": 0.6216, "step": 7385 }, { "epoch": 0.9308184022420254, "grad_norm": 0.3399578332901001, "learning_rate": 0.0002613907796575337, "loss": 0.5694, "step": 7390 }, { "epoch": 0.9314481846522027, "grad_norm": 0.3863985240459442, "learning_rate": 0.0002613171312420826, "loss": 0.6416, "step": 7395 }, { "epoch": 0.93207796706238, "grad_norm": 0.3288150429725647, "learning_rate": 0.0002612434230467892, "loss": 0.5839, "step": 7400 }, { "epoch": 0.9327077494725572, "grad_norm": 0.37783902883529663, "learning_rate": 0.00026116965511123664, "loss": 0.5919, "step": 7405 }, { "epoch": 0.9333375318827345, "grad_norm": 0.36346110701560974, "learning_rate": 0.00026109582747503986, "loss": 0.5796, "step": 7410 }, { "epoch": 0.9339673142929118, "grad_norm": 0.3194875419139862, "learning_rate": 0.00026102194017784606, "loss": 0.5808, "step": 7415 }, { "epoch": 0.9345970967030891, "grad_norm": 0.286823570728302, "learning_rate": 0.00026094799325933435, "loss": 0.5605, "step": 7420 }, { "epoch": 0.9352268791132664, "grad_norm": 0.3147251307964325, "learning_rate": 0.0002608739867592159, "loss": 0.572, "step": 7425 }, { "epoch": 0.9358566615234436, "grad_norm": 0.34172821044921875, "learning_rate": 0.000260799920717234, "loss": 0.5763, "step": 7430 }, { "epoch": 0.9364864439336209, "grad_norm": 0.32804232835769653, "learning_rate": 0.0002607257951731637, "loss": 0.5925, "step": 7435 }, { "epoch": 0.9371162263437982, "grad_norm": 0.2969893515110016, "learning_rate": 0.0002606516101668122, "loss": 0.5754, "step": 7440 }, { "epoch": 0.9377460087539755, "grad_norm": 0.3364142179489136, "learning_rate": 0.00026057736573801844, "loss": 0.6248, "step": 7445 }, { "epoch": 0.9383757911641528, "grad_norm": 0.3493711054325104, "learning_rate": 0.0002605030619266534, "loss": 0.5828, "step": 7450 }, { "epoch": 0.9390055735743301, "grad_norm": 0.3338306248188019, "learning_rate": 0.00026042869877262, "loss": 0.5947, "step": 7455 }, { "epoch": 0.9396353559845073, "grad_norm": 0.30441364645957947, "learning_rate": 0.0002603542763158529, "loss": 0.5743, "step": 7460 }, { "epoch": 0.9402651383946846, "grad_norm": 0.31838342547416687, "learning_rate": 0.0002602797945963186, "loss": 0.5493, "step": 7465 }, { "epoch": 0.9408949208048619, "grad_norm": 0.3308780789375305, "learning_rate": 0.0002602052536540156, "loss": 0.5984, "step": 7470 }, { "epoch": 0.9415247032150392, "grad_norm": 0.30487555265426636, "learning_rate": 0.00026013065352897407, "loss": 0.5687, "step": 7475 }, { "epoch": 0.9421544856252165, "grad_norm": 0.33297523856163025, "learning_rate": 0.0002600559942612559, "loss": 0.5728, "step": 7480 }, { "epoch": 0.9427842680353937, "grad_norm": 0.3194848299026489, "learning_rate": 0.00025998127589095483, "loss": 0.5939, "step": 7485 }, { "epoch": 0.943414050445571, "grad_norm": 0.3401489555835724, "learning_rate": 0.0002599064984581964, "loss": 0.5282, "step": 7490 }, { "epoch": 0.9440438328557483, "grad_norm": 0.3722991943359375, "learning_rate": 0.0002598316620031378, "loss": 0.6044, "step": 7495 }, { "epoch": 0.9446736152659256, "grad_norm": 0.3582395613193512, "learning_rate": 0.0002597567665659678, "loss": 0.574, "step": 7500 }, { "epoch": 0.9453033976761029, "grad_norm": 0.30922654271125793, "learning_rate": 0.0002596818121869071, "loss": 0.6086, "step": 7505 }, { "epoch": 0.9459331800862802, "grad_norm": 0.34381213784217834, "learning_rate": 0.00025960679890620785, "loss": 0.6032, "step": 7510 }, { "epoch": 0.9465629624964574, "grad_norm": 0.3153468072414398, "learning_rate": 0.0002595317267641539, "loss": 0.5758, "step": 7515 }, { "epoch": 0.9471927449066347, "grad_norm": 0.30763527750968933, "learning_rate": 0.0002594565958010607, "loss": 0.6036, "step": 7520 }, { "epoch": 0.947822527316812, "grad_norm": 0.33897343277931213, "learning_rate": 0.00025938140605727536, "loss": 0.5879, "step": 7525 }, { "epoch": 0.9484523097269894, "grad_norm": 0.2996034324169159, "learning_rate": 0.00025930615757317635, "loss": 0.6095, "step": 7530 }, { "epoch": 0.9490820921371667, "grad_norm": 0.37265533208847046, "learning_rate": 0.00025923085038917395, "loss": 0.5718, "step": 7535 }, { "epoch": 0.9497118745473438, "grad_norm": 0.32904815673828125, "learning_rate": 0.00025915548454570977, "loss": 0.5689, "step": 7540 }, { "epoch": 0.9503416569575212, "grad_norm": 0.3493824005126953, "learning_rate": 0.000259080060083257, "loss": 0.594, "step": 7545 }, { "epoch": 0.9509714393676985, "grad_norm": 0.33561789989471436, "learning_rate": 0.0002590045770423204, "loss": 0.5604, "step": 7550 }, { "epoch": 0.9516012217778758, "grad_norm": 0.3272433876991272, "learning_rate": 0.00025892903546343587, "loss": 0.5819, "step": 7555 }, { "epoch": 0.9522310041880531, "grad_norm": 0.34539222717285156, "learning_rate": 0.00025885343538717116, "loss": 0.591, "step": 7560 }, { "epoch": 0.9528607865982303, "grad_norm": 0.3331897258758545, "learning_rate": 0.0002587777768541252, "loss": 0.5885, "step": 7565 }, { "epoch": 0.9534905690084076, "grad_norm": 0.3285147547721863, "learning_rate": 0.00025870205990492827, "loss": 0.5561, "step": 7570 }, { "epoch": 0.9541203514185849, "grad_norm": 0.3221907317638397, "learning_rate": 0.0002586262845802422, "loss": 0.5837, "step": 7575 }, { "epoch": 0.9547501338287622, "grad_norm": 0.4986007511615753, "learning_rate": 0.00025855045092076, "loss": 0.5645, "step": 7580 }, { "epoch": 0.9553799162389395, "grad_norm": 0.33891043066978455, "learning_rate": 0.00025847455896720615, "loss": 0.5801, "step": 7585 }, { "epoch": 0.9560096986491168, "grad_norm": 0.345480740070343, "learning_rate": 0.00025839860876033626, "loss": 0.5876, "step": 7590 }, { "epoch": 0.956639481059294, "grad_norm": 0.39212220907211304, "learning_rate": 0.0002583226003409374, "loss": 0.5949, "step": 7595 }, { "epoch": 0.9572692634694713, "grad_norm": 0.3195202648639679, "learning_rate": 0.00025824653374982776, "loss": 0.592, "step": 7600 }, { "epoch": 0.9578990458796486, "grad_norm": 0.31688785552978516, "learning_rate": 0.00025817040902785694, "loss": 0.5432, "step": 7605 }, { "epoch": 0.9585288282898259, "grad_norm": 0.3165288269519806, "learning_rate": 0.00025809422621590554, "loss": 0.552, "step": 7610 }, { "epoch": 0.9591586107000032, "grad_norm": 0.33528926968574524, "learning_rate": 0.0002580179853548856, "loss": 0.5745, "step": 7615 }, { "epoch": 0.9597883931101804, "grad_norm": 0.34123846888542175, "learning_rate": 0.0002579416864857401, "loss": 0.6019, "step": 7620 }, { "epoch": 0.9604181755203577, "grad_norm": 0.3223724663257599, "learning_rate": 0.0002578653296494433, "loss": 0.5725, "step": 7625 }, { "epoch": 0.961047957930535, "grad_norm": 0.349751740694046, "learning_rate": 0.0002577889148870006, "loss": 0.5739, "step": 7630 }, { "epoch": 0.9616777403407123, "grad_norm": 0.3111324608325958, "learning_rate": 0.0002577124422394484, "loss": 0.5555, "step": 7635 }, { "epoch": 0.9623075227508896, "grad_norm": 0.364615797996521, "learning_rate": 0.00025763591174785433, "loss": 0.5789, "step": 7640 }, { "epoch": 0.9629373051610669, "grad_norm": 0.31817707419395447, "learning_rate": 0.000257559323453317, "loss": 0.5799, "step": 7645 }, { "epoch": 0.9635670875712441, "grad_norm": 0.33710840344429016, "learning_rate": 0.000257482677396966, "loss": 0.6, "step": 7650 }, { "epoch": 0.9641968699814214, "grad_norm": 0.3512105345726013, "learning_rate": 0.00025740597361996215, "loss": 0.5772, "step": 7655 }, { "epoch": 0.9648266523915987, "grad_norm": 0.32505640387535095, "learning_rate": 0.00025732921216349705, "loss": 0.5872, "step": 7660 }, { "epoch": 0.965456434801776, "grad_norm": 0.32156363129615784, "learning_rate": 0.0002572523930687933, "loss": 0.5842, "step": 7665 }, { "epoch": 0.9660862172119533, "grad_norm": 0.313147634267807, "learning_rate": 0.0002571755163771046, "loss": 0.5697, "step": 7670 }, { "epoch": 0.9667159996221305, "grad_norm": 0.3494894504547119, "learning_rate": 0.00025709858212971545, "loss": 0.5651, "step": 7675 }, { "epoch": 0.9673457820323078, "grad_norm": 0.317107230424881, "learning_rate": 0.00025702159036794135, "loss": 0.5563, "step": 7680 }, { "epoch": 0.9679755644424851, "grad_norm": 0.3228907585144043, "learning_rate": 0.00025694454113312854, "loss": 0.5642, "step": 7685 }, { "epoch": 0.9686053468526624, "grad_norm": 0.33400991559028625, "learning_rate": 0.00025686743446665426, "loss": 0.5738, "step": 7690 }, { "epoch": 0.9692351292628397, "grad_norm": 0.35151737928390503, "learning_rate": 0.0002567902704099266, "loss": 0.562, "step": 7695 }, { "epoch": 0.969864911673017, "grad_norm": 0.33582988381385803, "learning_rate": 0.00025671304900438437, "loss": 0.5724, "step": 7700 }, { "epoch": 0.9704946940831942, "grad_norm": 0.4050043523311615, "learning_rate": 0.00025663577029149727, "loss": 0.6038, "step": 7705 }, { "epoch": 0.9711244764933715, "grad_norm": 0.3320407271385193, "learning_rate": 0.00025655843431276565, "loss": 0.5725, "step": 7710 }, { "epoch": 0.9717542589035488, "grad_norm": 0.33253729343414307, "learning_rate": 0.00025648104110972074, "loss": 0.559, "step": 7715 }, { "epoch": 0.9723840413137261, "grad_norm": 0.3316608667373657, "learning_rate": 0.0002564035907239245, "loss": 0.5813, "step": 7720 }, { "epoch": 0.9730138237239034, "grad_norm": 0.35272932052612305, "learning_rate": 0.0002563260831969695, "loss": 0.5544, "step": 7725 }, { "epoch": 0.9736436061340806, "grad_norm": 0.2942962348461151, "learning_rate": 0.00025624851857047914, "loss": 0.5741, "step": 7730 }, { "epoch": 0.9742733885442579, "grad_norm": 0.30799049139022827, "learning_rate": 0.0002561708968861073, "loss": 0.5604, "step": 7735 }, { "epoch": 0.9749031709544352, "grad_norm": 0.2929095923900604, "learning_rate": 0.00025609321818553864, "loss": 0.5399, "step": 7740 }, { "epoch": 0.9755329533646125, "grad_norm": 0.3074556291103363, "learning_rate": 0.00025601548251048833, "loss": 0.5714, "step": 7745 }, { "epoch": 0.9761627357747898, "grad_norm": 0.3233494162559509, "learning_rate": 0.0002559376899027024, "loss": 0.5559, "step": 7750 }, { "epoch": 0.976792518184967, "grad_norm": 0.3106531500816345, "learning_rate": 0.000255859840403957, "loss": 0.5462, "step": 7755 }, { "epoch": 0.9774223005951443, "grad_norm": 0.35069772601127625, "learning_rate": 0.00025578193405605923, "loss": 0.5635, "step": 7760 }, { "epoch": 0.9780520830053216, "grad_norm": 0.310811311006546, "learning_rate": 0.00025570397090084656, "loss": 0.5658, "step": 7765 }, { "epoch": 0.978681865415499, "grad_norm": 0.36216944456100464, "learning_rate": 0.000255625950980187, "loss": 0.5785, "step": 7770 }, { "epoch": 0.9793116478256763, "grad_norm": 0.30353617668151855, "learning_rate": 0.000255547874335979, "loss": 0.5347, "step": 7775 }, { "epoch": 0.9799414302358536, "grad_norm": 0.3112618029117584, "learning_rate": 0.00025546974101015154, "loss": 0.5559, "step": 7780 }, { "epoch": 0.9805712126460308, "grad_norm": 0.3782903552055359, "learning_rate": 0.00025539155104466394, "loss": 0.5717, "step": 7785 }, { "epoch": 0.9812009950562081, "grad_norm": 0.3308548331260681, "learning_rate": 0.000255313304481506, "loss": 0.5511, "step": 7790 }, { "epoch": 0.9818307774663854, "grad_norm": 0.2971625328063965, "learning_rate": 0.000255235001362698, "loss": 0.5411, "step": 7795 }, { "epoch": 0.9824605598765627, "grad_norm": 0.3594948351383209, "learning_rate": 0.0002551566417302904, "loss": 0.5817, "step": 7800 }, { "epoch": 0.98309034228674, "grad_norm": 0.3537582755088806, "learning_rate": 0.0002550782256263642, "loss": 0.5631, "step": 7805 }, { "epoch": 0.9837201246969172, "grad_norm": 0.3132795989513397, "learning_rate": 0.0002549997530930306, "loss": 0.5725, "step": 7810 }, { "epoch": 0.9843499071070945, "grad_norm": 0.3250652551651001, "learning_rate": 0.00025492122417243113, "loss": 0.5786, "step": 7815 }, { "epoch": 0.9849796895172718, "grad_norm": 0.3318973183631897, "learning_rate": 0.0002548426389067376, "loss": 0.5399, "step": 7820 }, { "epoch": 0.9856094719274491, "grad_norm": 0.3335192799568176, "learning_rate": 0.00025476399733815214, "loss": 0.5693, "step": 7825 }, { "epoch": 0.9862392543376264, "grad_norm": 0.31399449706077576, "learning_rate": 0.00025468529950890703, "loss": 0.5821, "step": 7830 }, { "epoch": 0.9868690367478037, "grad_norm": 0.33886855840682983, "learning_rate": 0.00025460654546126485, "loss": 0.556, "step": 7835 }, { "epoch": 0.9874988191579809, "grad_norm": 0.3620472848415375, "learning_rate": 0.0002545277352375183, "loss": 0.6104, "step": 7840 }, { "epoch": 0.9881286015681582, "grad_norm": 0.31123921275138855, "learning_rate": 0.0002544488688799902, "loss": 0.5802, "step": 7845 }, { "epoch": 0.9887583839783355, "grad_norm": 0.33104339241981506, "learning_rate": 0.0002543699464310337, "loss": 0.5882, "step": 7850 }, { "epoch": 0.9893881663885128, "grad_norm": 0.3223660886287689, "learning_rate": 0.00025429096793303186, "loss": 0.5649, "step": 7855 }, { "epoch": 0.9900179487986901, "grad_norm": 0.3436056077480316, "learning_rate": 0.000254211933428398, "loss": 0.5546, "step": 7860 }, { "epoch": 0.9906477312088673, "grad_norm": 0.29697200655937195, "learning_rate": 0.00025413284295957547, "loss": 0.5434, "step": 7865 }, { "epoch": 0.9912775136190446, "grad_norm": 0.32985180616378784, "learning_rate": 0.0002540536965690376, "loss": 0.5737, "step": 7870 }, { "epoch": 0.9919072960292219, "grad_norm": 0.31599709391593933, "learning_rate": 0.0002539744942992878, "loss": 0.5452, "step": 7875 }, { "epoch": 0.9925370784393992, "grad_norm": 0.30331170558929443, "learning_rate": 0.00025389523619285956, "loss": 0.5593, "step": 7880 }, { "epoch": 0.9931668608495765, "grad_norm": 0.3150465786457062, "learning_rate": 0.0002538159222923163, "loss": 0.5518, "step": 7885 }, { "epoch": 0.9937966432597538, "grad_norm": 0.3179359436035156, "learning_rate": 0.00025373655264025134, "loss": 0.5546, "step": 7890 }, { "epoch": 0.994426425669931, "grad_norm": 0.3226470947265625, "learning_rate": 0.000253657127279288, "loss": 0.58, "step": 7895 }, { "epoch": 0.9950562080801083, "grad_norm": 0.3453287184238434, "learning_rate": 0.0002535776462520795, "loss": 0.5681, "step": 7900 }, { "epoch": 0.9956859904902856, "grad_norm": 0.3329002261161804, "learning_rate": 0.0002534981096013091, "loss": 0.548, "step": 7905 }, { "epoch": 0.9963157729004629, "grad_norm": 0.32592061161994934, "learning_rate": 0.00025341851736968956, "loss": 0.5244, "step": 7910 }, { "epoch": 0.9969455553106402, "grad_norm": 0.32833319902420044, "learning_rate": 0.00025333886959996396, "loss": 0.5558, "step": 7915 }, { "epoch": 0.9975753377208174, "grad_norm": 0.3146878182888031, "learning_rate": 0.00025325916633490487, "loss": 0.595, "step": 7920 }, { "epoch": 0.9982051201309947, "grad_norm": 0.3828830122947693, "learning_rate": 0.00025317940761731476, "loss": 0.5675, "step": 7925 }, { "epoch": 0.998834902541172, "grad_norm": 0.3208398222923279, "learning_rate": 0.0002530995934900259, "loss": 0.5439, "step": 7930 }, { "epoch": 0.9994646849513493, "grad_norm": 0.3446502983570099, "learning_rate": 0.00025301972399590023, "loss": 0.5276, "step": 7935 }, { "epoch": 1.0, "grad_norm": 0.31275373697280884, "learning_rate": 0.0002529397991778297, "loss": 0.543, "step": 7940 }, { "epoch": 1.0006297824101773, "grad_norm": 0.3219754099845886, "learning_rate": 0.0002528598190787355, "loss": 0.4901, "step": 7945 }, { "epoch": 1.0012595648203546, "grad_norm": 0.33292412757873535, "learning_rate": 0.0002527797837415689, "loss": 0.4794, "step": 7950 }, { "epoch": 1.001889347230532, "grad_norm": 0.36561062932014465, "learning_rate": 0.00025269969320931065, "loss": 0.4948, "step": 7955 }, { "epoch": 1.0025191296407092, "grad_norm": 0.2977091372013092, "learning_rate": 0.0002526195475249713, "loss": 0.5172, "step": 7960 }, { "epoch": 1.0031489120508865, "grad_norm": 0.3075500428676605, "learning_rate": 0.00025253934673159084, "loss": 0.4755, "step": 7965 }, { "epoch": 1.0037786944610636, "grad_norm": 0.30956047773361206, "learning_rate": 0.00025245909087223895, "loss": 0.4783, "step": 7970 }, { "epoch": 1.004408476871241, "grad_norm": 0.34965232014656067, "learning_rate": 0.00025237877999001484, "loss": 0.4876, "step": 7975 }, { "epoch": 1.0050382592814182, "grad_norm": 0.3290039896965027, "learning_rate": 0.00025229841412804726, "loss": 0.501, "step": 7980 }, { "epoch": 1.0056680416915955, "grad_norm": 0.3144761323928833, "learning_rate": 0.00025221799332949456, "loss": 0.4923, "step": 7985 }, { "epoch": 1.0062978241017728, "grad_norm": 0.3586188554763794, "learning_rate": 0.0002521375176375446, "loss": 0.487, "step": 7990 }, { "epoch": 1.0069276065119501, "grad_norm": 0.3210572302341461, "learning_rate": 0.0002520569870954146, "loss": 0.4916, "step": 7995 }, { "epoch": 1.0075573889221274, "grad_norm": 0.3171830177307129, "learning_rate": 0.0002519764017463512, "loss": 0.4834, "step": 8000 }, { "epoch": 1.0075573889221274, "eval_loss": 0.30723655223846436, "eval_runtime": 6.2539, "eval_samples_per_second": 159.899, "eval_steps_per_second": 10.074, "step": 8000 }, { "epoch": 1.0081871713323047, "grad_norm": 0.3511858880519867, "learning_rate": 0.00025189576163363076, "loss": 0.4937, "step": 8005 }, { "epoch": 1.008816953742482, "grad_norm": 0.3305964171886444, "learning_rate": 0.00025181506680055875, "loss": 0.4665, "step": 8010 }, { "epoch": 1.0094467361526593, "grad_norm": 0.3735099732875824, "learning_rate": 0.00025173431729047014, "loss": 0.5116, "step": 8015 }, { "epoch": 1.0100765185628366, "grad_norm": 0.34169599413871765, "learning_rate": 0.0002516535131467293, "loss": 0.475, "step": 8020 }, { "epoch": 1.0107063009730137, "grad_norm": 0.3473950922489166, "learning_rate": 0.00025157265441272993, "loss": 0.4812, "step": 8025 }, { "epoch": 1.011336083383191, "grad_norm": 0.31877681612968445, "learning_rate": 0.00025149174113189496, "loss": 0.4906, "step": 8030 }, { "epoch": 1.0119658657933683, "grad_norm": 0.364511638879776, "learning_rate": 0.0002514107733476766, "loss": 0.4926, "step": 8035 }, { "epoch": 1.0125956482035456, "grad_norm": 0.3073696792125702, "learning_rate": 0.00025132975110355664, "loss": 0.4994, "step": 8040 }, { "epoch": 1.013225430613723, "grad_norm": 0.3270637094974518, "learning_rate": 0.0002512486744430456, "loss": 0.468, "step": 8045 }, { "epoch": 1.0138552130239002, "grad_norm": 0.3626968264579773, "learning_rate": 0.0002511675434096837, "loss": 0.5139, "step": 8050 }, { "epoch": 1.0144849954340776, "grad_norm": 0.30527931451797485, "learning_rate": 0.00025108635804704, "loss": 0.4922, "step": 8055 }, { "epoch": 1.0151147778442549, "grad_norm": 0.3518252968788147, "learning_rate": 0.000251005118398713, "loss": 0.5297, "step": 8060 }, { "epoch": 1.0157445602544322, "grad_norm": 0.3298850655555725, "learning_rate": 0.0002509238245083302, "loss": 0.5292, "step": 8065 }, { "epoch": 1.0163743426646095, "grad_norm": 0.3175168037414551, "learning_rate": 0.0002508424764195484, "loss": 0.4907, "step": 8070 }, { "epoch": 1.0170041250747868, "grad_norm": 0.33489352464675903, "learning_rate": 0.0002507610741760531, "loss": 0.4869, "step": 8075 }, { "epoch": 1.0176339074849639, "grad_norm": 0.2922315299510956, "learning_rate": 0.0002506796178215595, "loss": 0.474, "step": 8080 }, { "epoch": 1.0182636898951412, "grad_norm": 0.32073619961738586, "learning_rate": 0.00025059810739981125, "loss": 0.4951, "step": 8085 }, { "epoch": 1.0188934723053185, "grad_norm": 0.2875652611255646, "learning_rate": 0.0002505165429545815, "loss": 0.5104, "step": 8090 }, { "epoch": 1.0195232547154958, "grad_norm": 0.33247148990631104, "learning_rate": 0.0002504349245296721, "loss": 0.489, "step": 8095 }, { "epoch": 1.020153037125673, "grad_norm": 0.29777953028678894, "learning_rate": 0.0002503532521689141, "loss": 0.5172, "step": 8100 }, { "epoch": 1.0207828195358504, "grad_norm": 0.3418375253677368, "learning_rate": 0.0002502715259161673, "loss": 0.4464, "step": 8105 }, { "epoch": 1.0214126019460277, "grad_norm": 0.39162155985832214, "learning_rate": 0.0002501897458153207, "loss": 0.4953, "step": 8110 }, { "epoch": 1.022042384356205, "grad_norm": 0.32206737995147705, "learning_rate": 0.000250107911910292, "loss": 0.4732, "step": 8115 }, { "epoch": 1.0226721667663823, "grad_norm": 0.37178757786750793, "learning_rate": 0.0002500260242450279, "loss": 0.504, "step": 8120 }, { "epoch": 1.0233019491765596, "grad_norm": 0.33448055386543274, "learning_rate": 0.0002499440828635039, "loss": 0.4774, "step": 8125 }, { "epoch": 1.0239317315867367, "grad_norm": 0.344594806432724, "learning_rate": 0.00024986208780972455, "loss": 0.4948, "step": 8130 }, { "epoch": 1.024561513996914, "grad_norm": 0.3440978527069092, "learning_rate": 0.00024978003912772283, "loss": 0.4979, "step": 8135 }, { "epoch": 1.0251912964070913, "grad_norm": 0.2915257513523102, "learning_rate": 0.000249697936861561, "loss": 0.4875, "step": 8140 }, { "epoch": 1.0258210788172686, "grad_norm": 0.271371990442276, "learning_rate": 0.0002496157810553296, "loss": 0.4929, "step": 8145 }, { "epoch": 1.0264508612274459, "grad_norm": 0.3228522539138794, "learning_rate": 0.0002495335717531484, "loss": 0.4706, "step": 8150 }, { "epoch": 1.0270806436376232, "grad_norm": 0.3222556412220001, "learning_rate": 0.00024945130899916554, "loss": 0.487, "step": 8155 }, { "epoch": 1.0277104260478005, "grad_norm": 0.32311001420021057, "learning_rate": 0.00024936899283755807, "loss": 0.5144, "step": 8160 }, { "epoch": 1.0283402084579778, "grad_norm": 0.2946212589740753, "learning_rate": 0.0002492866233125316, "loss": 0.4867, "step": 8165 }, { "epoch": 1.028969990868155, "grad_norm": 0.32464465498924255, "learning_rate": 0.0002492042004683205, "loss": 0.4729, "step": 8170 }, { "epoch": 1.0295997732783324, "grad_norm": 0.3378526270389557, "learning_rate": 0.0002491217243491876, "loss": 0.4843, "step": 8175 }, { "epoch": 1.0302295556885097, "grad_norm": 0.35685908794403076, "learning_rate": 0.0002490391949994246, "loss": 0.4941, "step": 8180 }, { "epoch": 1.0308593380986868, "grad_norm": 0.30618053674697876, "learning_rate": 0.0002489566124633516, "loss": 0.4985, "step": 8185 }, { "epoch": 1.031489120508864, "grad_norm": 0.34786808490753174, "learning_rate": 0.0002488739767853173, "loss": 0.4914, "step": 8190 }, { "epoch": 1.0321189029190414, "grad_norm": 0.35167476534843445, "learning_rate": 0.00024879128800969893, "loss": 0.493, "step": 8195 }, { "epoch": 1.0327486853292187, "grad_norm": 0.3278263509273529, "learning_rate": 0.00024870854618090225, "loss": 0.4676, "step": 8200 }, { "epoch": 1.033378467739396, "grad_norm": 0.36896881461143494, "learning_rate": 0.00024862575134336154, "loss": 0.4995, "step": 8205 }, { "epoch": 1.0340082501495733, "grad_norm": 0.3700760304927826, "learning_rate": 0.00024854290354153953, "loss": 0.5189, "step": 8210 }, { "epoch": 1.0346380325597506, "grad_norm": 0.3370974063873291, "learning_rate": 0.00024846000281992733, "loss": 0.5044, "step": 8215 }, { "epoch": 1.035267814969928, "grad_norm": 0.3200768232345581, "learning_rate": 0.00024837704922304457, "loss": 0.4779, "step": 8220 }, { "epoch": 1.0358975973801052, "grad_norm": 0.2786978781223297, "learning_rate": 0.0002482940427954392, "loss": 0.4677, "step": 8225 }, { "epoch": 1.0365273797902825, "grad_norm": 0.3220120668411255, "learning_rate": 0.00024821098358168757, "loss": 0.4503, "step": 8230 }, { "epoch": 1.0371571622004598, "grad_norm": 0.3315715491771698, "learning_rate": 0.00024812787162639444, "loss": 0.4715, "step": 8235 }, { "epoch": 1.037786944610637, "grad_norm": 0.3595867455005646, "learning_rate": 0.00024804470697419273, "loss": 0.4712, "step": 8240 }, { "epoch": 1.0384167270208142, "grad_norm": 0.29993361234664917, "learning_rate": 0.00024796148966974376, "loss": 0.47, "step": 8245 }, { "epoch": 1.0390465094309915, "grad_norm": 0.39950379729270935, "learning_rate": 0.00024787821975773717, "loss": 0.5233, "step": 8250 }, { "epoch": 1.0396762918411688, "grad_norm": 0.312003493309021, "learning_rate": 0.0002477948972828908, "loss": 0.4836, "step": 8255 }, { "epoch": 1.0403060742513461, "grad_norm": 0.29678481817245483, "learning_rate": 0.0002477115222899507, "loss": 0.4928, "step": 8260 }, { "epoch": 1.0409358566615234, "grad_norm": 0.35694456100463867, "learning_rate": 0.0002476280948236912, "loss": 0.4925, "step": 8265 }, { "epoch": 1.0415656390717007, "grad_norm": 0.3164297640323639, "learning_rate": 0.00024754461492891474, "loss": 0.4828, "step": 8270 }, { "epoch": 1.042195421481878, "grad_norm": 0.37906938791275024, "learning_rate": 0.00024746108265045184, "loss": 0.4989, "step": 8275 }, { "epoch": 1.0428252038920554, "grad_norm": 0.3458475172519684, "learning_rate": 0.0002473774980331614, "loss": 0.5072, "step": 8280 }, { "epoch": 1.0434549863022327, "grad_norm": 0.36052700877189636, "learning_rate": 0.0002472938611219301, "loss": 0.4872, "step": 8285 }, { "epoch": 1.04408476871241, "grad_norm": 0.4497036337852478, "learning_rate": 0.00024721017196167297, "loss": 0.4921, "step": 8290 }, { "epoch": 1.044714551122587, "grad_norm": 0.357461154460907, "learning_rate": 0.000247126430597333, "loss": 0.5035, "step": 8295 }, { "epoch": 1.0453443335327643, "grad_norm": 0.3499346375465393, "learning_rate": 0.00024704263707388117, "loss": 0.5242, "step": 8300 }, { "epoch": 1.0459741159429417, "grad_norm": 0.2994784414768219, "learning_rate": 0.0002469587914363166, "loss": 0.4575, "step": 8305 }, { "epoch": 1.046603898353119, "grad_norm": 0.3699876666069031, "learning_rate": 0.0002468748937296662, "loss": 0.4804, "step": 8310 }, { "epoch": 1.0472336807632963, "grad_norm": 0.37695133686065674, "learning_rate": 0.000246790943998985, "loss": 0.4914, "step": 8315 }, { "epoch": 1.0478634631734736, "grad_norm": 0.30732589960098267, "learning_rate": 0.0002467069422893559, "loss": 0.458, "step": 8320 }, { "epoch": 1.0484932455836509, "grad_norm": 0.3094361424446106, "learning_rate": 0.0002466228886458899, "loss": 0.4584, "step": 8325 }, { "epoch": 1.0491230279938282, "grad_norm": 0.3499257564544678, "learning_rate": 0.0002465387831137255, "loss": 0.4717, "step": 8330 }, { "epoch": 1.0497528104040055, "grad_norm": 0.32755059003829956, "learning_rate": 0.0002464546257380294, "loss": 0.49, "step": 8335 }, { "epoch": 1.0503825928141828, "grad_norm": 0.3201046884059906, "learning_rate": 0.000246370416563996, "loss": 0.4833, "step": 8340 }, { "epoch": 1.05101237522436, "grad_norm": 0.2581581771373749, "learning_rate": 0.0002462861556368476, "loss": 0.465, "step": 8345 }, { "epoch": 1.0516421576345372, "grad_norm": 0.3480297923088074, "learning_rate": 0.00024620184300183423, "loss": 0.4756, "step": 8350 }, { "epoch": 1.0522719400447145, "grad_norm": 0.36630478501319885, "learning_rate": 0.00024611747870423366, "loss": 0.5051, "step": 8355 }, { "epoch": 1.0529017224548918, "grad_norm": 0.3450157940387726, "learning_rate": 0.0002460330627893515, "loss": 0.4996, "step": 8360 }, { "epoch": 1.053531504865069, "grad_norm": 0.30790945887565613, "learning_rate": 0.000245948595302521, "loss": 0.4826, "step": 8365 }, { "epoch": 1.0541612872752464, "grad_norm": 0.39590683579444885, "learning_rate": 0.00024586407628910306, "loss": 0.4963, "step": 8370 }, { "epoch": 1.0547910696854237, "grad_norm": 0.3294634521007538, "learning_rate": 0.00024577950579448643, "loss": 0.4868, "step": 8375 }, { "epoch": 1.055420852095601, "grad_norm": 0.33493947982788086, "learning_rate": 0.00024569488386408736, "loss": 0.4773, "step": 8380 }, { "epoch": 1.0560506345057783, "grad_norm": 0.32626229524612427, "learning_rate": 0.00024561021054334974, "loss": 0.4898, "step": 8385 }, { "epoch": 1.0566804169159556, "grad_norm": 0.3181340992450714, "learning_rate": 0.00024552548587774507, "loss": 0.4757, "step": 8390 }, { "epoch": 1.057310199326133, "grad_norm": 0.3592873215675354, "learning_rate": 0.0002454407099127725, "loss": 0.5034, "step": 8395 }, { "epoch": 1.0579399817363102, "grad_norm": 0.3184007406234741, "learning_rate": 0.00024535588269395856, "loss": 0.4929, "step": 8400 }, { "epoch": 1.0585697641464873, "grad_norm": 0.3555738627910614, "learning_rate": 0.00024527100426685746, "loss": 0.503, "step": 8405 }, { "epoch": 1.0591995465566646, "grad_norm": 0.29637908935546875, "learning_rate": 0.0002451860746770509, "loss": 0.4716, "step": 8410 }, { "epoch": 1.059829328966842, "grad_norm": 0.3031441569328308, "learning_rate": 0.0002451010939701479, "loss": 0.4757, "step": 8415 }, { "epoch": 1.0604591113770192, "grad_norm": 0.28256094455718994, "learning_rate": 0.0002450160621917851, "loss": 0.4558, "step": 8420 }, { "epoch": 1.0610888937871965, "grad_norm": 0.3192931115627289, "learning_rate": 0.0002449309793876266, "loss": 0.499, "step": 8425 }, { "epoch": 1.0617186761973738, "grad_norm": 0.2788430154323578, "learning_rate": 0.00024484584560336363, "loss": 0.4616, "step": 8430 }, { "epoch": 1.0623484586075511, "grad_norm": 0.35733649134635925, "learning_rate": 0.00024476066088471507, "loss": 0.4926, "step": 8435 }, { "epoch": 1.0629782410177284, "grad_norm": 0.3398718535900116, "learning_rate": 0.00024467542527742707, "loss": 0.4944, "step": 8440 }, { "epoch": 1.0636080234279057, "grad_norm": 0.3338175117969513, "learning_rate": 0.000244590138827273, "loss": 0.5181, "step": 8445 }, { "epoch": 1.064237805838083, "grad_norm": 0.33433952927589417, "learning_rate": 0.00024450480158005384, "loss": 0.4837, "step": 8450 }, { "epoch": 1.0648675882482603, "grad_norm": 0.3656097948551178, "learning_rate": 0.0002444194135815974, "loss": 0.4639, "step": 8455 }, { "epoch": 1.0654973706584374, "grad_norm": 0.31470635533332825, "learning_rate": 0.0002443339748777592, "loss": 0.4718, "step": 8460 }, { "epoch": 1.0661271530686147, "grad_norm": 0.29020166397094727, "learning_rate": 0.00024424848551442166, "loss": 0.4712, "step": 8465 }, { "epoch": 1.066756935478792, "grad_norm": 0.34259042143821716, "learning_rate": 0.00024416294553749446, "loss": 0.5252, "step": 8470 }, { "epoch": 1.0673867178889693, "grad_norm": 0.33828607201576233, "learning_rate": 0.0002440773549929146, "loss": 0.4663, "step": 8475 }, { "epoch": 1.0680165002991466, "grad_norm": 0.35722973942756653, "learning_rate": 0.00024399171392664622, "loss": 0.4868, "step": 8480 }, { "epoch": 1.068646282709324, "grad_norm": 0.3226557970046997, "learning_rate": 0.00024390602238468043, "loss": 0.4785, "step": 8485 }, { "epoch": 1.0692760651195012, "grad_norm": 0.3097434639930725, "learning_rate": 0.0002438202804130356, "loss": 0.4677, "step": 8490 }, { "epoch": 1.0699058475296785, "grad_norm": 0.3146856725215912, "learning_rate": 0.00024373448805775709, "loss": 0.4802, "step": 8495 }, { "epoch": 1.0705356299398558, "grad_norm": 0.3576582372188568, "learning_rate": 0.00024364864536491739, "loss": 0.5113, "step": 8500 }, { "epoch": 1.0711654123500332, "grad_norm": 0.33004313707351685, "learning_rate": 0.0002435627523806159, "loss": 0.4625, "step": 8505 }, { "epoch": 1.0717951947602105, "grad_norm": 0.3689037263393402, "learning_rate": 0.00024347680915097928, "loss": 0.4923, "step": 8510 }, { "epoch": 1.0724249771703875, "grad_norm": 0.28334125876426697, "learning_rate": 0.00024339081572216084, "loss": 0.4818, "step": 8515 }, { "epoch": 1.0730547595805648, "grad_norm": 0.3461993336677551, "learning_rate": 0.00024330477214034113, "loss": 0.4807, "step": 8520 }, { "epoch": 1.0736845419907421, "grad_norm": 0.32148951292037964, "learning_rate": 0.00024321867845172743, "loss": 0.4829, "step": 8525 }, { "epoch": 1.0743143244009195, "grad_norm": 0.31461793184280396, "learning_rate": 0.0002431325347025541, "loss": 0.5045, "step": 8530 }, { "epoch": 1.0749441068110968, "grad_norm": 0.30194273591041565, "learning_rate": 0.00024304634093908224, "loss": 0.4747, "step": 8535 }, { "epoch": 1.075573889221274, "grad_norm": 0.27379968762397766, "learning_rate": 0.0002429600972075999, "loss": 0.4382, "step": 8540 }, { "epoch": 1.0762036716314514, "grad_norm": 0.3732368052005768, "learning_rate": 0.0002428738035544219, "loss": 0.4704, "step": 8545 }, { "epoch": 1.0768334540416287, "grad_norm": 0.3252260088920593, "learning_rate": 0.00024278746002588997, "loss": 0.4929, "step": 8550 }, { "epoch": 1.077463236451806, "grad_norm": 0.31606802344322205, "learning_rate": 0.00024270106666837246, "loss": 0.4698, "step": 8555 }, { "epoch": 1.0780930188619833, "grad_norm": 0.320529967546463, "learning_rate": 0.00024261462352826468, "loss": 0.4531, "step": 8560 }, { "epoch": 1.0787228012721606, "grad_norm": 0.36827871203422546, "learning_rate": 0.00024252813065198852, "loss": 0.4948, "step": 8565 }, { "epoch": 1.0793525836823377, "grad_norm": 0.3132867217063904, "learning_rate": 0.00024244158808599264, "loss": 0.4836, "step": 8570 }, { "epoch": 1.079982366092515, "grad_norm": 0.32383888959884644, "learning_rate": 0.00024235499587675236, "loss": 0.4749, "step": 8575 }, { "epoch": 1.0806121485026923, "grad_norm": 0.32294297218322754, "learning_rate": 0.0002422683540707697, "loss": 0.4616, "step": 8580 }, { "epoch": 1.0812419309128696, "grad_norm": 0.3049245774745941, "learning_rate": 0.00024218166271457322, "loss": 0.4871, "step": 8585 }, { "epoch": 1.0818717133230469, "grad_norm": 0.3330252170562744, "learning_rate": 0.00024209492185471826, "loss": 0.4908, "step": 8590 }, { "epoch": 1.0825014957332242, "grad_norm": 0.35933157801628113, "learning_rate": 0.00024200813153778654, "loss": 0.4936, "step": 8595 }, { "epoch": 1.0831312781434015, "grad_norm": 0.345434308052063, "learning_rate": 0.00024192129181038654, "loss": 0.4637, "step": 8600 }, { "epoch": 1.0837610605535788, "grad_norm": 0.3012515604496002, "learning_rate": 0.0002418344027191531, "loss": 0.4719, "step": 8605 }, { "epoch": 1.084390842963756, "grad_norm": 0.3081362247467041, "learning_rate": 0.0002417474643107477, "loss": 0.4852, "step": 8610 }, { "epoch": 1.0850206253739334, "grad_norm": 0.367389053106308, "learning_rate": 0.00024166047663185826, "loss": 0.5046, "step": 8615 }, { "epoch": 1.0856504077841107, "grad_norm": 0.3392958641052246, "learning_rate": 0.0002415734397291991, "loss": 0.5087, "step": 8620 }, { "epoch": 1.0862801901942878, "grad_norm": 0.2843685746192932, "learning_rate": 0.000241486353649511, "loss": 0.4722, "step": 8625 }, { "epoch": 1.086909972604465, "grad_norm": 0.29619672894477844, "learning_rate": 0.00024139921843956128, "loss": 0.4645, "step": 8630 }, { "epoch": 1.0875397550146424, "grad_norm": 0.30029621720314026, "learning_rate": 0.00024131203414614347, "loss": 0.4434, "step": 8635 }, { "epoch": 1.0881695374248197, "grad_norm": 0.3630850911140442, "learning_rate": 0.00024122480081607755, "loss": 0.4772, "step": 8640 }, { "epoch": 1.088799319834997, "grad_norm": 0.32482001185417175, "learning_rate": 0.00024113751849620974, "loss": 0.4441, "step": 8645 }, { "epoch": 1.0894291022451743, "grad_norm": 0.3149590492248535, "learning_rate": 0.00024105018723341275, "loss": 0.468, "step": 8650 }, { "epoch": 1.0900588846553516, "grad_norm": 0.34652113914489746, "learning_rate": 0.0002409628070745854, "loss": 0.4706, "step": 8655 }, { "epoch": 1.090688667065529, "grad_norm": 0.31633374094963074, "learning_rate": 0.00024087537806665279, "loss": 0.4693, "step": 8660 }, { "epoch": 1.0913184494757062, "grad_norm": 0.31668806076049805, "learning_rate": 0.00024078790025656638, "loss": 0.4619, "step": 8665 }, { "epoch": 1.0919482318858835, "grad_norm": 0.3093356490135193, "learning_rate": 0.00024070037369130375, "loss": 0.485, "step": 8670 }, { "epoch": 1.0925780142960608, "grad_norm": 0.31765609979629517, "learning_rate": 0.0002406127984178686, "loss": 0.4696, "step": 8675 }, { "epoch": 1.093207796706238, "grad_norm": 0.35910454392433167, "learning_rate": 0.00024052517448329086, "loss": 0.4781, "step": 8680 }, { "epoch": 1.0938375791164152, "grad_norm": 0.37290528416633606, "learning_rate": 0.00024043750193462665, "loss": 0.4824, "step": 8685 }, { "epoch": 1.0944673615265925, "grad_norm": 0.3106020390987396, "learning_rate": 0.00024034978081895807, "loss": 0.4607, "step": 8690 }, { "epoch": 1.0950971439367698, "grad_norm": 0.3306252062320709, "learning_rate": 0.0002402620111833934, "loss": 0.4725, "step": 8695 }, { "epoch": 1.0957269263469471, "grad_norm": 0.2956124544143677, "learning_rate": 0.00024017419307506687, "loss": 0.4784, "step": 8700 }, { "epoch": 1.0963567087571244, "grad_norm": 0.3285719156265259, "learning_rate": 0.00024008632654113894, "loss": 0.4856, "step": 8705 }, { "epoch": 1.0969864911673017, "grad_norm": 0.3430241644382477, "learning_rate": 0.00023999841162879583, "loss": 0.5017, "step": 8710 }, { "epoch": 1.097616273577479, "grad_norm": 0.33543142676353455, "learning_rate": 0.00023991044838524985, "loss": 0.516, "step": 8715 }, { "epoch": 1.0982460559876563, "grad_norm": 0.28755661845207214, "learning_rate": 0.0002398224368577394, "loss": 0.4645, "step": 8720 }, { "epoch": 1.0988758383978336, "grad_norm": 0.34112608432769775, "learning_rate": 0.00023973437709352851, "loss": 0.5134, "step": 8725 }, { "epoch": 1.0995056208080107, "grad_norm": 0.3198321759700775, "learning_rate": 0.00023964626913990743, "loss": 0.4939, "step": 8730 }, { "epoch": 1.100135403218188, "grad_norm": 0.2985571026802063, "learning_rate": 0.00023955811304419205, "loss": 0.4817, "step": 8735 }, { "epoch": 1.1007651856283653, "grad_norm": 0.32038047909736633, "learning_rate": 0.0002394699088537243, "loss": 0.524, "step": 8740 }, { "epoch": 1.1013949680385426, "grad_norm": 0.3562256395816803, "learning_rate": 0.00023938165661587175, "loss": 0.4779, "step": 8745 }, { "epoch": 1.10202475044872, "grad_norm": 0.3481481969356537, "learning_rate": 0.00023929335637802788, "loss": 0.4861, "step": 8750 }, { "epoch": 1.1026545328588973, "grad_norm": 0.3087615966796875, "learning_rate": 0.00023920500818761198, "loss": 0.473, "step": 8755 }, { "epoch": 1.1032843152690746, "grad_norm": 0.34575629234313965, "learning_rate": 0.00023911661209206903, "loss": 0.4709, "step": 8760 }, { "epoch": 1.1039140976792519, "grad_norm": 0.3505946099758148, "learning_rate": 0.0002390281681388697, "loss": 0.4766, "step": 8765 }, { "epoch": 1.1045438800894292, "grad_norm": 0.40102317929267883, "learning_rate": 0.0002389396763755105, "loss": 0.5048, "step": 8770 }, { "epoch": 1.1051736624996065, "grad_norm": 0.3319726884365082, "learning_rate": 0.0002388511368495135, "loss": 0.4768, "step": 8775 }, { "epoch": 1.1058034449097838, "grad_norm": 0.3191297948360443, "learning_rate": 0.00023876254960842645, "loss": 0.5009, "step": 8780 }, { "epoch": 1.1064332273199609, "grad_norm": 0.3122735619544983, "learning_rate": 0.00023867391469982268, "loss": 0.4777, "step": 8785 }, { "epoch": 1.1070630097301382, "grad_norm": 0.33340710401535034, "learning_rate": 0.0002385852321713012, "loss": 0.459, "step": 8790 }, { "epoch": 1.1076927921403155, "grad_norm": 0.32803764939308167, "learning_rate": 0.00023849650207048655, "loss": 0.4784, "step": 8795 }, { "epoch": 1.1083225745504928, "grad_norm": 0.35463786125183105, "learning_rate": 0.00023840772444502878, "loss": 0.4739, "step": 8800 }, { "epoch": 1.10895235696067, "grad_norm": 0.3237099349498749, "learning_rate": 0.00023831889934260357, "loss": 0.4652, "step": 8805 }, { "epoch": 1.1095821393708474, "grad_norm": 0.34681713581085205, "learning_rate": 0.000238230026810912, "loss": 0.4872, "step": 8810 }, { "epoch": 1.1102119217810247, "grad_norm": 0.3360891342163086, "learning_rate": 0.00023814110689768066, "loss": 0.496, "step": 8815 }, { "epoch": 1.110841704191202, "grad_norm": 0.32971322536468506, "learning_rate": 0.0002380521396506615, "loss": 0.4468, "step": 8820 }, { "epoch": 1.1114714866013793, "grad_norm": 0.3112764060497284, "learning_rate": 0.00023796312511763205, "loss": 0.4985, "step": 8825 }, { "epoch": 1.1121012690115566, "grad_norm": 0.30539095401763916, "learning_rate": 0.0002378740633463951, "loss": 0.4835, "step": 8830 }, { "epoch": 1.1127310514217337, "grad_norm": 0.274139940738678, "learning_rate": 0.00023778495438477894, "loss": 0.5014, "step": 8835 }, { "epoch": 1.113360833831911, "grad_norm": 0.2877870500087738, "learning_rate": 0.000237695798280637, "loss": 0.4842, "step": 8840 }, { "epoch": 1.1139906162420883, "grad_norm": 0.262893944978714, "learning_rate": 0.00023760659508184823, "loss": 0.4754, "step": 8845 }, { "epoch": 1.1146203986522656, "grad_norm": 0.3255792260169983, "learning_rate": 0.00023751734483631672, "loss": 0.489, "step": 8850 }, { "epoch": 1.115250181062443, "grad_norm": 0.3453415632247925, "learning_rate": 0.00023742804759197195, "loss": 0.4624, "step": 8855 }, { "epoch": 1.1158799634726202, "grad_norm": 0.3276025354862213, "learning_rate": 0.00023733870339676856, "loss": 0.4629, "step": 8860 }, { "epoch": 1.1165097458827975, "grad_norm": 0.32096150517463684, "learning_rate": 0.0002372493122986864, "loss": 0.4482, "step": 8865 }, { "epoch": 1.1171395282929748, "grad_norm": 0.33016180992126465, "learning_rate": 0.00023715987434573055, "loss": 0.493, "step": 8870 }, { "epoch": 1.117769310703152, "grad_norm": 0.2946653366088867, "learning_rate": 0.00023707038958593126, "loss": 0.4365, "step": 8875 }, { "epoch": 1.1183990931133294, "grad_norm": 0.37148308753967285, "learning_rate": 0.00023698085806734385, "loss": 0.4974, "step": 8880 }, { "epoch": 1.1190288755235067, "grad_norm": 0.3068748116493225, "learning_rate": 0.00023689127983804882, "loss": 0.4886, "step": 8885 }, { "epoch": 1.1196586579336838, "grad_norm": 0.3096564710140228, "learning_rate": 0.00023680165494615167, "loss": 0.4592, "step": 8890 }, { "epoch": 1.120288440343861, "grad_norm": 0.3341507613658905, "learning_rate": 0.00023671198343978308, "loss": 0.4258, "step": 8895 }, { "epoch": 1.1209182227540384, "grad_norm": 0.30653128027915955, "learning_rate": 0.00023662226536709868, "loss": 0.486, "step": 8900 }, { "epoch": 1.1215480051642157, "grad_norm": 0.30991849303245544, "learning_rate": 0.00023653250077627908, "loss": 0.4879, "step": 8905 }, { "epoch": 1.122177787574393, "grad_norm": 0.3082162141799927, "learning_rate": 0.00023644268971552998, "loss": 0.4538, "step": 8910 }, { "epoch": 1.1228075699845703, "grad_norm": 0.30248114466667175, "learning_rate": 0.00023635283223308193, "loss": 0.4501, "step": 8915 }, { "epoch": 1.1234373523947476, "grad_norm": 0.34090158343315125, "learning_rate": 0.00023626292837719047, "loss": 0.4825, "step": 8920 }, { "epoch": 1.124067134804925, "grad_norm": 0.28670960664749146, "learning_rate": 0.00023617297819613598, "loss": 0.4422, "step": 8925 }, { "epoch": 1.1246969172151022, "grad_norm": 0.37079116702079773, "learning_rate": 0.0002360829817382239, "loss": 0.4725, "step": 8930 }, { "epoch": 1.1253266996252795, "grad_norm": 0.35876086354255676, "learning_rate": 0.00023599293905178417, "loss": 0.4672, "step": 8935 }, { "epoch": 1.1259564820354568, "grad_norm": 0.28581666946411133, "learning_rate": 0.00023590285018517196, "loss": 0.4597, "step": 8940 }, { "epoch": 1.126586264445634, "grad_norm": 0.34076693654060364, "learning_rate": 0.00023581271518676694, "loss": 0.4894, "step": 8945 }, { "epoch": 1.1272160468558112, "grad_norm": 0.29919254779815674, "learning_rate": 0.0002357225341049737, "loss": 0.4538, "step": 8950 }, { "epoch": 1.1278458292659885, "grad_norm": 0.2799806594848633, "learning_rate": 0.00023563230698822154, "loss": 0.4814, "step": 8955 }, { "epoch": 1.1284756116761658, "grad_norm": 0.3249780833721161, "learning_rate": 0.00023554203388496446, "loss": 0.4825, "step": 8960 }, { "epoch": 1.1291053940863431, "grad_norm": 0.3509981036186218, "learning_rate": 0.0002354517148436812, "loss": 0.4468, "step": 8965 }, { "epoch": 1.1297351764965204, "grad_norm": 0.33016157150268555, "learning_rate": 0.0002353613499128752, "loss": 0.449, "step": 8970 }, { "epoch": 1.1303649589066977, "grad_norm": 0.2889571487903595, "learning_rate": 0.00023527093914107436, "loss": 0.4584, "step": 8975 }, { "epoch": 1.130994741316875, "grad_norm": 0.31957536935806274, "learning_rate": 0.00023518048257683145, "loss": 0.4807, "step": 8980 }, { "epoch": 1.1316245237270524, "grad_norm": 0.31418105959892273, "learning_rate": 0.00023508998026872365, "loss": 0.4755, "step": 8985 }, { "epoch": 1.1322543061372297, "grad_norm": 0.3458874523639679, "learning_rate": 0.00023499943226535278, "loss": 0.4906, "step": 8990 }, { "epoch": 1.132884088547407, "grad_norm": 0.3091862201690674, "learning_rate": 0.0002349088386153452, "loss": 0.4786, "step": 8995 }, { "epoch": 1.133513870957584, "grad_norm": 0.2758231461048126, "learning_rate": 0.00023481819936735178, "loss": 0.4189, "step": 9000 }, { "epoch": 1.133513870957584, "eval_loss": 0.3038506805896759, "eval_runtime": 6.258, "eval_samples_per_second": 159.795, "eval_steps_per_second": 10.067, "step": 9000 }, { "epoch": 1.1341436533677614, "grad_norm": 0.3153883218765259, "learning_rate": 0.00023472751457004782, "loss": 0.4802, "step": 9005 }, { "epoch": 1.1347734357779387, "grad_norm": 0.3110881745815277, "learning_rate": 0.00023463678427213317, "loss": 0.4488, "step": 9010 }, { "epoch": 1.135403218188116, "grad_norm": 0.30957111716270447, "learning_rate": 0.00023454600852233206, "loss": 0.476, "step": 9015 }, { "epoch": 1.1360330005982933, "grad_norm": 0.3130200207233429, "learning_rate": 0.00023445518736939312, "loss": 0.4396, "step": 9020 }, { "epoch": 1.1366627830084706, "grad_norm": 0.31500178575515747, "learning_rate": 0.0002343643208620894, "loss": 0.4644, "step": 9025 }, { "epoch": 1.1372925654186479, "grad_norm": 0.3096972703933716, "learning_rate": 0.00023427340904921834, "loss": 0.4775, "step": 9030 }, { "epoch": 1.1379223478288252, "grad_norm": 0.3503490388393402, "learning_rate": 0.00023418245197960155, "loss": 0.4617, "step": 9035 }, { "epoch": 1.1385521302390025, "grad_norm": 0.31281721591949463, "learning_rate": 0.00023409144970208516, "loss": 0.4703, "step": 9040 }, { "epoch": 1.1391819126491798, "grad_norm": 0.3011356592178345, "learning_rate": 0.0002340004022655394, "loss": 0.4472, "step": 9045 }, { "epoch": 1.139811695059357, "grad_norm": 0.3240005075931549, "learning_rate": 0.00023390930971885888, "loss": 0.4726, "step": 9050 }, { "epoch": 1.1404414774695342, "grad_norm": 0.35690784454345703, "learning_rate": 0.0002338181721109623, "loss": 0.4601, "step": 9055 }, { "epoch": 1.1410712598797115, "grad_norm": 0.30888888239860535, "learning_rate": 0.0002337269894907927, "loss": 0.45, "step": 9060 }, { "epoch": 1.1417010422898888, "grad_norm": 0.3118223249912262, "learning_rate": 0.00023363576190731726, "loss": 0.4456, "step": 9065 }, { "epoch": 1.142330824700066, "grad_norm": 0.3156544864177704, "learning_rate": 0.0002335444894095272, "loss": 0.4744, "step": 9070 }, { "epoch": 1.1429606071102434, "grad_norm": 0.33679795265197754, "learning_rate": 0.00023345317204643797, "loss": 0.4662, "step": 9075 }, { "epoch": 1.1435903895204207, "grad_norm": 0.32647955417633057, "learning_rate": 0.00023336180986708904, "loss": 0.4573, "step": 9080 }, { "epoch": 1.144220171930598, "grad_norm": 0.3759111762046814, "learning_rate": 0.00023327040292054412, "loss": 0.4439, "step": 9085 }, { "epoch": 1.1448499543407753, "grad_norm": 0.31271886825561523, "learning_rate": 0.00023317895125589066, "loss": 0.4778, "step": 9090 }, { "epoch": 1.1454797367509526, "grad_norm": 0.2915593385696411, "learning_rate": 0.0002330874549222404, "loss": 0.4646, "step": 9095 }, { "epoch": 1.14610951916113, "grad_norm": 0.3337639570236206, "learning_rate": 0.00023299591396872893, "loss": 0.4597, "step": 9100 }, { "epoch": 1.1467393015713072, "grad_norm": 0.3345816433429718, "learning_rate": 0.0002329043284445158, "loss": 0.519, "step": 9105 }, { "epoch": 1.1473690839814843, "grad_norm": 0.31568819284439087, "learning_rate": 0.0002328126983987846, "loss": 0.455, "step": 9110 }, { "epoch": 1.1479988663916616, "grad_norm": 0.3630363643169403, "learning_rate": 0.00023272102388074265, "loss": 0.4544, "step": 9115 }, { "epoch": 1.148628648801839, "grad_norm": 0.30382248759269714, "learning_rate": 0.00023262930493962142, "loss": 0.485, "step": 9120 }, { "epoch": 1.1492584312120162, "grad_norm": 0.30339518189430237, "learning_rate": 0.0002325375416246759, "loss": 0.474, "step": 9125 }, { "epoch": 1.1498882136221935, "grad_norm": 0.33041009306907654, "learning_rate": 0.00023244573398518523, "loss": 0.447, "step": 9130 }, { "epoch": 1.1505179960323708, "grad_norm": 0.35708925127983093, "learning_rate": 0.00023235388207045214, "loss": 0.4801, "step": 9135 }, { "epoch": 1.1511477784425481, "grad_norm": 0.3497597575187683, "learning_rate": 0.00023226198592980318, "loss": 0.4753, "step": 9140 }, { "epoch": 1.1517775608527254, "grad_norm": 0.31747546792030334, "learning_rate": 0.00023217004561258876, "loss": 0.4642, "step": 9145 }, { "epoch": 1.1524073432629027, "grad_norm": 0.31225451827049255, "learning_rate": 0.00023207806116818283, "loss": 0.501, "step": 9150 }, { "epoch": 1.15303712567308, "grad_norm": 0.31150931119918823, "learning_rate": 0.00023198603264598327, "loss": 0.447, "step": 9155 }, { "epoch": 1.1536669080832573, "grad_norm": 0.29207199811935425, "learning_rate": 0.00023189396009541135, "loss": 0.448, "step": 9160 }, { "epoch": 1.1542966904934344, "grad_norm": 0.33640962839126587, "learning_rate": 0.00023180184356591223, "loss": 0.4725, "step": 9165 }, { "epoch": 1.1549264729036117, "grad_norm": 0.292582631111145, "learning_rate": 0.00023170968310695457, "loss": 0.4603, "step": 9170 }, { "epoch": 1.155556255313789, "grad_norm": 0.3217863142490387, "learning_rate": 0.00023161747876803066, "loss": 0.4386, "step": 9175 }, { "epoch": 1.1561860377239663, "grad_norm": 0.32607826590538025, "learning_rate": 0.00023152523059865622, "loss": 0.4747, "step": 9180 }, { "epoch": 1.1568158201341436, "grad_norm": 0.35956209897994995, "learning_rate": 0.00023143293864837078, "loss": 0.4563, "step": 9185 }, { "epoch": 1.157445602544321, "grad_norm": 0.3542852997779846, "learning_rate": 0.00023134060296673716, "loss": 0.4907, "step": 9190 }, { "epoch": 1.1580753849544982, "grad_norm": 0.3324996829032898, "learning_rate": 0.0002312482236033417, "loss": 0.4539, "step": 9195 }, { "epoch": 1.1587051673646755, "grad_norm": 0.3436378836631775, "learning_rate": 0.00023115580060779429, "loss": 0.5107, "step": 9200 }, { "epoch": 1.1593349497748529, "grad_norm": 0.2886941730976105, "learning_rate": 0.00023106333402972813, "loss": 0.4547, "step": 9205 }, { "epoch": 1.1599647321850302, "grad_norm": 0.30411913990974426, "learning_rate": 0.00023097082391879993, "loss": 0.4517, "step": 9210 }, { "epoch": 1.1605945145952075, "grad_norm": 0.3265014886856079, "learning_rate": 0.00023087827032468975, "loss": 0.4589, "step": 9215 }, { "epoch": 1.1612242970053845, "grad_norm": 0.2876526713371277, "learning_rate": 0.00023078567329710091, "loss": 0.466, "step": 9220 }, { "epoch": 1.1618540794155618, "grad_norm": 0.29947248101234436, "learning_rate": 0.0002306930328857602, "loss": 0.4459, "step": 9225 }, { "epoch": 1.1624838618257392, "grad_norm": 0.33246028423309326, "learning_rate": 0.00023060034914041753, "loss": 0.4826, "step": 9230 }, { "epoch": 1.1631136442359165, "grad_norm": 0.29653674364089966, "learning_rate": 0.0002305076221108463, "loss": 0.4394, "step": 9235 }, { "epoch": 1.1637434266460938, "grad_norm": 0.30506858229637146, "learning_rate": 0.00023041485184684308, "loss": 0.4645, "step": 9240 }, { "epoch": 1.164373209056271, "grad_norm": 0.2603437304496765, "learning_rate": 0.00023032203839822748, "loss": 0.4536, "step": 9245 }, { "epoch": 1.1650029914664484, "grad_norm": 0.3310236632823944, "learning_rate": 0.00023022918181484254, "loss": 0.4653, "step": 9250 }, { "epoch": 1.1656327738766257, "grad_norm": 0.3645521104335785, "learning_rate": 0.0002301362821465543, "loss": 0.4404, "step": 9255 }, { "epoch": 1.166262556286803, "grad_norm": 0.33431464433670044, "learning_rate": 0.00023004333944325208, "loss": 0.4389, "step": 9260 }, { "epoch": 1.1668923386969803, "grad_norm": 0.31086647510528564, "learning_rate": 0.00022995035375484817, "loss": 0.493, "step": 9265 }, { "epoch": 1.1675221211071576, "grad_norm": 0.34322085976600647, "learning_rate": 0.00022985732513127805, "loss": 0.4839, "step": 9270 }, { "epoch": 1.1681519035173347, "grad_norm": 0.3111884593963623, "learning_rate": 0.0002297642536225002, "loss": 0.4473, "step": 9275 }, { "epoch": 1.168781685927512, "grad_norm": 0.3494400084018707, "learning_rate": 0.00022967113927849613, "loss": 0.469, "step": 9280 }, { "epoch": 1.1694114683376893, "grad_norm": 0.27351829409599304, "learning_rate": 0.00022957798214927037, "loss": 0.4617, "step": 9285 }, { "epoch": 1.1700412507478666, "grad_norm": 0.3605945408344269, "learning_rate": 0.00022948478228485046, "loss": 0.444, "step": 9290 }, { "epoch": 1.1706710331580439, "grad_norm": 0.31383225321769714, "learning_rate": 0.0002293915397352869, "loss": 0.4716, "step": 9295 }, { "epoch": 1.1713008155682212, "grad_norm": 0.3261600732803345, "learning_rate": 0.00022929825455065292, "loss": 0.4646, "step": 9300 }, { "epoch": 1.1719305979783985, "grad_norm": 0.29624396562576294, "learning_rate": 0.00022920492678104492, "loss": 0.4636, "step": 9305 }, { "epoch": 1.1725603803885758, "grad_norm": 0.39078545570373535, "learning_rate": 0.00022911155647658201, "loss": 0.4933, "step": 9310 }, { "epoch": 1.173190162798753, "grad_norm": 0.2990373373031616, "learning_rate": 0.00022901814368740615, "loss": 0.4726, "step": 9315 }, { "epoch": 1.1738199452089304, "grad_norm": 0.28325891494750977, "learning_rate": 0.00022892468846368217, "loss": 0.4428, "step": 9320 }, { "epoch": 1.1744497276191077, "grad_norm": 0.3357643187046051, "learning_rate": 0.0002288311908555977, "loss": 0.4618, "step": 9325 }, { "epoch": 1.1750795100292848, "grad_norm": 0.31550613045692444, "learning_rate": 0.00022873765091336302, "loss": 0.4607, "step": 9330 }, { "epoch": 1.175709292439462, "grad_norm": 0.30639806389808655, "learning_rate": 0.00022864406868721118, "loss": 0.458, "step": 9335 }, { "epoch": 1.1763390748496394, "grad_norm": 0.3836449086666107, "learning_rate": 0.0002285504442273981, "loss": 0.4788, "step": 9340 }, { "epoch": 1.1769688572598167, "grad_norm": 0.2955804467201233, "learning_rate": 0.00022845677758420217, "loss": 0.4636, "step": 9345 }, { "epoch": 1.177598639669994, "grad_norm": 0.3264003098011017, "learning_rate": 0.0002283630688079245, "loss": 0.4769, "step": 9350 }, { "epoch": 1.1782284220801713, "grad_norm": 0.34578555822372437, "learning_rate": 0.00022826931794888894, "loss": 0.4784, "step": 9355 }, { "epoch": 1.1788582044903486, "grad_norm": 0.37039560079574585, "learning_rate": 0.00022817552505744178, "loss": 0.5042, "step": 9360 }, { "epoch": 1.179487986900526, "grad_norm": 0.319118857383728, "learning_rate": 0.00022808169018395192, "loss": 0.4607, "step": 9365 }, { "epoch": 1.1801177693107032, "grad_norm": 0.32380104064941406, "learning_rate": 0.00022798781337881086, "loss": 0.4606, "step": 9370 }, { "epoch": 1.1807475517208805, "grad_norm": 0.3038274943828583, "learning_rate": 0.00022789389469243256, "loss": 0.448, "step": 9375 }, { "epoch": 1.1813773341310578, "grad_norm": 0.3078247308731079, "learning_rate": 0.00022779993417525356, "loss": 0.4683, "step": 9380 }, { "epoch": 1.182007116541235, "grad_norm": 0.2909676432609558, "learning_rate": 0.00022770593187773275, "loss": 0.4778, "step": 9385 }, { "epoch": 1.1826368989514122, "grad_norm": 0.3095955550670624, "learning_rate": 0.00022761188785035155, "loss": 0.4523, "step": 9390 }, { "epoch": 1.1832666813615895, "grad_norm": 0.2969966530799866, "learning_rate": 0.0002275178021436137, "loss": 0.4735, "step": 9395 }, { "epoch": 1.1838964637717668, "grad_norm": 0.2896679937839508, "learning_rate": 0.00022742367480804544, "loss": 0.45, "step": 9400 }, { "epoch": 1.1845262461819441, "grad_norm": 0.31511151790618896, "learning_rate": 0.0002273295058941952, "loss": 0.4614, "step": 9405 }, { "epoch": 1.1851560285921214, "grad_norm": 0.3440285623073578, "learning_rate": 0.00022723529545263399, "loss": 0.4593, "step": 9410 }, { "epoch": 1.1857858110022987, "grad_norm": 0.29399538040161133, "learning_rate": 0.00022714104353395483, "loss": 0.4519, "step": 9415 }, { "epoch": 1.186415593412476, "grad_norm": 0.3958999812602997, "learning_rate": 0.00022704675018877322, "loss": 0.4838, "step": 9420 }, { "epoch": 1.1870453758226533, "grad_norm": 0.2960554361343384, "learning_rate": 0.0002269524154677268, "loss": 0.459, "step": 9425 }, { "epoch": 1.1876751582328307, "grad_norm": 0.32369253039360046, "learning_rate": 0.00022685803942147555, "loss": 0.4542, "step": 9430 }, { "epoch": 1.188304940643008, "grad_norm": 0.320547491312027, "learning_rate": 0.00022676362210070144, "loss": 0.4853, "step": 9435 }, { "epoch": 1.188934723053185, "grad_norm": 0.2984744906425476, "learning_rate": 0.00022666916355610885, "loss": 0.4201, "step": 9440 }, { "epoch": 1.1895645054633623, "grad_norm": 0.34194597601890564, "learning_rate": 0.00022657466383842407, "loss": 0.4705, "step": 9445 }, { "epoch": 1.1901942878735396, "grad_norm": 0.29718858003616333, "learning_rate": 0.0002264801229983957, "loss": 0.4403, "step": 9450 }, { "epoch": 1.190824070283717, "grad_norm": 0.29723846912384033, "learning_rate": 0.0002263855410867943, "loss": 0.4841, "step": 9455 }, { "epoch": 1.1914538526938943, "grad_norm": 0.31662440299987793, "learning_rate": 0.00022629091815441245, "loss": 0.456, "step": 9460 }, { "epoch": 1.1920836351040716, "grad_norm": 0.3458605408668518, "learning_rate": 0.0002261962542520649, "loss": 0.4504, "step": 9465 }, { "epoch": 1.1927134175142489, "grad_norm": 0.31829431653022766, "learning_rate": 0.00022610154943058833, "loss": 0.4821, "step": 9470 }, { "epoch": 1.1933431999244262, "grad_norm": 0.3380287289619446, "learning_rate": 0.00022600680374084138, "loss": 0.4963, "step": 9475 }, { "epoch": 1.1939729823346035, "grad_norm": 0.3048580288887024, "learning_rate": 0.00022591201723370458, "loss": 0.4443, "step": 9480 }, { "epoch": 1.1946027647447806, "grad_norm": 0.34586548805236816, "learning_rate": 0.0002258171899600806, "loss": 0.473, "step": 9485 }, { "epoch": 1.195232547154958, "grad_norm": 0.2828037440776825, "learning_rate": 0.0002257223219708937, "loss": 0.4539, "step": 9490 }, { "epoch": 1.1958623295651352, "grad_norm": 0.31300345063209534, "learning_rate": 0.00022562741331709024, "loss": 0.4353, "step": 9495 }, { "epoch": 1.1964921119753125, "grad_norm": 0.311260849237442, "learning_rate": 0.0002255324640496383, "loss": 0.4553, "step": 9500 }, { "epoch": 1.1971218943854898, "grad_norm": 0.2941080331802368, "learning_rate": 0.0002254374742195279, "loss": 0.4464, "step": 9505 }, { "epoch": 1.197751676795667, "grad_norm": 0.26669132709503174, "learning_rate": 0.00022534244387777057, "loss": 0.4368, "step": 9510 }, { "epoch": 1.1983814592058444, "grad_norm": 0.2933709919452667, "learning_rate": 0.00022524737307539995, "loss": 0.4526, "step": 9515 }, { "epoch": 1.1990112416160217, "grad_norm": 0.338360458612442, "learning_rate": 0.0002251522618634711, "loss": 0.4625, "step": 9520 }, { "epoch": 1.199641024026199, "grad_norm": 0.31670835614204407, "learning_rate": 0.00022505711029306098, "loss": 0.4553, "step": 9525 }, { "epoch": 1.2002708064363763, "grad_norm": 0.3221518099308014, "learning_rate": 0.00022496191841526813, "loss": 0.475, "step": 9530 }, { "epoch": 1.2009005888465536, "grad_norm": 0.32984668016433716, "learning_rate": 0.00022486668628121282, "loss": 0.4526, "step": 9535 }, { "epoch": 1.2015303712567307, "grad_norm": 0.2793140113353729, "learning_rate": 0.00022477141394203678, "loss": 0.4374, "step": 9540 }, { "epoch": 1.2021601536669082, "grad_norm": 0.3125605881214142, "learning_rate": 0.00022467610144890357, "loss": 0.4569, "step": 9545 }, { "epoch": 1.2027899360770853, "grad_norm": 0.2892754375934601, "learning_rate": 0.00022458074885299808, "loss": 0.4747, "step": 9550 }, { "epoch": 1.2034197184872626, "grad_norm": 0.3224146068096161, "learning_rate": 0.00022448535620552684, "loss": 0.4372, "step": 9555 }, { "epoch": 1.20404950089744, "grad_norm": 0.33973759412765503, "learning_rate": 0.00022438992355771787, "loss": 0.4368, "step": 9560 }, { "epoch": 1.2046792833076172, "grad_norm": 0.37665504217147827, "learning_rate": 0.00022429445096082073, "loss": 0.4747, "step": 9565 }, { "epoch": 1.2053090657177945, "grad_norm": 0.2834467589855194, "learning_rate": 0.00022419893846610634, "loss": 0.4841, "step": 9570 }, { "epoch": 1.2059388481279718, "grad_norm": 0.3729229271411896, "learning_rate": 0.00022410338612486715, "loss": 0.475, "step": 9575 }, { "epoch": 1.2065686305381491, "grad_norm": 0.30668923258781433, "learning_rate": 0.00022400779398841684, "loss": 0.4271, "step": 9580 }, { "epoch": 1.2071984129483264, "grad_norm": 0.33016908168792725, "learning_rate": 0.00022391216210809072, "loss": 0.4553, "step": 9585 }, { "epoch": 1.2078281953585037, "grad_norm": 0.30926114320755005, "learning_rate": 0.00022381649053524518, "loss": 0.4512, "step": 9590 }, { "epoch": 1.2084579777686808, "grad_norm": 0.3481772840023041, "learning_rate": 0.00022372077932125809, "loss": 0.4707, "step": 9595 }, { "epoch": 1.2090877601788583, "grad_norm": 0.2549537420272827, "learning_rate": 0.0002236250285175285, "loss": 0.4686, "step": 9600 }, { "epoch": 1.2097175425890354, "grad_norm": 0.3111298978328705, "learning_rate": 0.00022352923817547688, "loss": 0.4535, "step": 9605 }, { "epoch": 1.2103473249992127, "grad_norm": 0.29062095284461975, "learning_rate": 0.00022343340834654472, "loss": 0.4612, "step": 9610 }, { "epoch": 1.21097710740939, "grad_norm": 0.3373335897922516, "learning_rate": 0.0002233375390821949, "loss": 0.4233, "step": 9615 }, { "epoch": 1.2116068898195673, "grad_norm": 0.308648943901062, "learning_rate": 0.0002232416304339114, "loss": 0.4535, "step": 9620 }, { "epoch": 1.2122366722297446, "grad_norm": 0.32941722869873047, "learning_rate": 0.00022314568245319935, "loss": 0.4564, "step": 9625 }, { "epoch": 1.212866454639922, "grad_norm": 0.33229124546051025, "learning_rate": 0.00022304969519158495, "loss": 0.458, "step": 9630 }, { "epoch": 1.2134962370500992, "grad_norm": 0.29093366861343384, "learning_rate": 0.00022295366870061565, "loss": 0.4315, "step": 9635 }, { "epoch": 1.2141260194602765, "grad_norm": 0.3482106328010559, "learning_rate": 0.00022285760303185982, "loss": 0.4311, "step": 9640 }, { "epoch": 1.2147558018704538, "grad_norm": 0.29717814922332764, "learning_rate": 0.0002227614982369069, "loss": 0.4261, "step": 9645 }, { "epoch": 1.215385584280631, "grad_norm": 0.3359118700027466, "learning_rate": 0.00022266535436736738, "loss": 0.4698, "step": 9650 }, { "epoch": 1.2160153666908082, "grad_norm": 0.3095514476299286, "learning_rate": 0.0002225691714748727, "loss": 0.4463, "step": 9655 }, { "epoch": 1.2166451491009855, "grad_norm": 0.29095733165740967, "learning_rate": 0.0002224729496110753, "loss": 0.4662, "step": 9660 }, { "epoch": 1.2172749315111628, "grad_norm": 0.34425532817840576, "learning_rate": 0.00022237668882764847, "loss": 0.4579, "step": 9665 }, { "epoch": 1.2179047139213401, "grad_norm": 0.32856446504592896, "learning_rate": 0.0002222803891762865, "loss": 0.4648, "step": 9670 }, { "epoch": 1.2185344963315174, "grad_norm": 0.35708895325660706, "learning_rate": 0.00022218405070870451, "loss": 0.4579, "step": 9675 }, { "epoch": 1.2191642787416948, "grad_norm": 0.26759231090545654, "learning_rate": 0.0002220876734766384, "loss": 0.4321, "step": 9680 }, { "epoch": 1.219794061151872, "grad_norm": 0.27995094656944275, "learning_rate": 0.00022199125753184497, "loss": 0.4552, "step": 9685 }, { "epoch": 1.2204238435620494, "grad_norm": 0.3591984510421753, "learning_rate": 0.00022189480292610187, "loss": 0.4685, "step": 9690 }, { "epoch": 1.2210536259722267, "grad_norm": 0.2892036736011505, "learning_rate": 0.00022179830971120722, "loss": 0.4609, "step": 9695 }, { "epoch": 1.221683408382404, "grad_norm": 0.3287111520767212, "learning_rate": 0.00022170177793898028, "loss": 0.479, "step": 9700 }, { "epoch": 1.222313190792581, "grad_norm": 0.3088148832321167, "learning_rate": 0.00022160520766126074, "loss": 0.4597, "step": 9705 }, { "epoch": 1.2229429732027584, "grad_norm": 0.3263307511806488, "learning_rate": 0.0002215085989299091, "loss": 0.4801, "step": 9710 }, { "epoch": 1.2235727556129357, "grad_norm": 0.283078134059906, "learning_rate": 0.0002214119517968063, "loss": 0.4476, "step": 9715 }, { "epoch": 1.224202538023113, "grad_norm": 0.3226225674152374, "learning_rate": 0.00022131526631385422, "loss": 0.4644, "step": 9720 }, { "epoch": 1.2248323204332903, "grad_norm": 0.32242435216903687, "learning_rate": 0.00022121854253297514, "loss": 0.4477, "step": 9725 }, { "epoch": 1.2254621028434676, "grad_norm": 0.3373146057128906, "learning_rate": 0.0002211217805061119, "loss": 0.4541, "step": 9730 }, { "epoch": 1.2260918852536449, "grad_norm": 0.28866246342658997, "learning_rate": 0.00022102498028522786, "loss": 0.4388, "step": 9735 }, { "epoch": 1.2267216676638222, "grad_norm": 0.308704674243927, "learning_rate": 0.00022092814192230711, "loss": 0.425, "step": 9740 }, { "epoch": 1.2273514500739995, "grad_norm": 0.3144040107727051, "learning_rate": 0.00022083126546935394, "loss": 0.4532, "step": 9745 }, { "epoch": 1.2279812324841768, "grad_norm": 0.29848021268844604, "learning_rate": 0.00022073435097839329, "loss": 0.457, "step": 9750 }, { "epoch": 1.228611014894354, "grad_norm": 0.35102754831314087, "learning_rate": 0.00022063739850147036, "loss": 0.4258, "step": 9755 }, { "epoch": 1.2292407973045312, "grad_norm": 0.32105547189712524, "learning_rate": 0.000220540408090651, "loss": 0.4226, "step": 9760 }, { "epoch": 1.2298705797147085, "grad_norm": 0.3647817075252533, "learning_rate": 0.0002204433797980211, "loss": 0.4556, "step": 9765 }, { "epoch": 1.2305003621248858, "grad_norm": 0.3260333836078644, "learning_rate": 0.00022034631367568718, "loss": 0.4834, "step": 9770 }, { "epoch": 1.231130144535063, "grad_norm": 0.30218422412872314, "learning_rate": 0.00022024920977577596, "loss": 0.4327, "step": 9775 }, { "epoch": 1.2317599269452404, "grad_norm": 0.3666177988052368, "learning_rate": 0.0002201520681504344, "loss": 0.4361, "step": 9780 }, { "epoch": 1.2323897093554177, "grad_norm": 0.3113807737827301, "learning_rate": 0.00022005488885182975, "loss": 0.4554, "step": 9785 }, { "epoch": 1.233019491765595, "grad_norm": 0.31085875630378723, "learning_rate": 0.00021995767193214963, "loss": 0.4391, "step": 9790 }, { "epoch": 1.2336492741757723, "grad_norm": 0.304509699344635, "learning_rate": 0.0002198604174436017, "loss": 0.4754, "step": 9795 }, { "epoch": 1.2342790565859496, "grad_norm": 0.2930733263492584, "learning_rate": 0.0002197631254384138, "loss": 0.4194, "step": 9800 }, { "epoch": 1.234908838996127, "grad_norm": 0.30277615785598755, "learning_rate": 0.00021966579596883394, "loss": 0.4506, "step": 9805 }, { "epoch": 1.2355386214063042, "grad_norm": 0.2824211120605469, "learning_rate": 0.00021956842908713037, "loss": 0.4398, "step": 9810 }, { "epoch": 1.2361684038164813, "grad_norm": 0.31834569573402405, "learning_rate": 0.00021947102484559121, "loss": 0.4756, "step": 9815 }, { "epoch": 1.2367981862266586, "grad_norm": 0.355283260345459, "learning_rate": 0.00021937358329652488, "loss": 0.456, "step": 9820 }, { "epoch": 1.237427968636836, "grad_norm": 0.2955317497253418, "learning_rate": 0.00021927610449225962, "loss": 0.4462, "step": 9825 }, { "epoch": 1.2380577510470132, "grad_norm": 0.2653120756149292, "learning_rate": 0.00021917858848514383, "loss": 0.4197, "step": 9830 }, { "epoch": 1.2386875334571905, "grad_norm": 0.3773416578769684, "learning_rate": 0.0002190810353275458, "loss": 0.4263, "step": 9835 }, { "epoch": 1.2393173158673678, "grad_norm": 0.28635114431381226, "learning_rate": 0.00021898344507185384, "loss": 0.4705, "step": 9840 }, { "epoch": 1.2399470982775451, "grad_norm": 0.3044835031032562, "learning_rate": 0.00021888581777047608, "loss": 0.4671, "step": 9845 }, { "epoch": 1.2405768806877224, "grad_norm": 0.293748676776886, "learning_rate": 0.0002187881534758407, "loss": 0.436, "step": 9850 }, { "epoch": 1.2412066630978997, "grad_norm": 0.3891184628009796, "learning_rate": 0.00021869045224039564, "loss": 0.456, "step": 9855 }, { "epoch": 1.241836445508077, "grad_norm": 0.3140691816806793, "learning_rate": 0.0002185927141166086, "loss": 0.4402, "step": 9860 }, { "epoch": 1.2424662279182543, "grad_norm": 0.33889827132225037, "learning_rate": 0.00021849493915696738, "loss": 0.4363, "step": 9865 }, { "epoch": 1.2430960103284314, "grad_norm": 0.3084375858306885, "learning_rate": 0.0002183971274139791, "loss": 0.4295, "step": 9870 }, { "epoch": 1.2437257927386087, "grad_norm": 0.3091178834438324, "learning_rate": 0.00021829927894017115, "loss": 0.4263, "step": 9875 }, { "epoch": 1.244355575148786, "grad_norm": 0.3208729922771454, "learning_rate": 0.00021820139378809025, "loss": 0.4233, "step": 9880 }, { "epoch": 1.2449853575589633, "grad_norm": 0.30196666717529297, "learning_rate": 0.000218103472010303, "loss": 0.4265, "step": 9885 }, { "epoch": 1.2456151399691406, "grad_norm": 0.3044353127479553, "learning_rate": 0.0002180055136593956, "loss": 0.48, "step": 9890 }, { "epoch": 1.246244922379318, "grad_norm": 0.31633850932121277, "learning_rate": 0.000217907518787974, "loss": 0.4708, "step": 9895 }, { "epoch": 1.2468747047894952, "grad_norm": 0.29174062609672546, "learning_rate": 0.0002178094874486636, "loss": 0.4135, "step": 9900 }, { "epoch": 1.2475044871996726, "grad_norm": 0.33092647790908813, "learning_rate": 0.00021771141969410956, "loss": 0.4541, "step": 9905 }, { "epoch": 1.2481342696098499, "grad_norm": 0.30151379108428955, "learning_rate": 0.00021761331557697635, "loss": 0.4397, "step": 9910 }, { "epoch": 1.2487640520200272, "grad_norm": 0.31203630566596985, "learning_rate": 0.00021751517514994836, "loss": 0.454, "step": 9915 }, { "epoch": 1.2493938344302045, "grad_norm": 0.30847153067588806, "learning_rate": 0.00021741699846572902, "loss": 0.4309, "step": 9920 }, { "epoch": 1.2500236168403815, "grad_norm": 0.2937026619911194, "learning_rate": 0.00021731878557704158, "loss": 0.4206, "step": 9925 }, { "epoch": 1.2506533992505589, "grad_norm": 0.2875721752643585, "learning_rate": 0.0002172205365366285, "loss": 0.4385, "step": 9930 }, { "epoch": 1.2512831816607362, "grad_norm": 0.2834903299808502, "learning_rate": 0.00021712225139725188, "loss": 0.423, "step": 9935 }, { "epoch": 1.2519129640709135, "grad_norm": 0.3069617748260498, "learning_rate": 0.000217023930211693, "loss": 0.4536, "step": 9940 }, { "epoch": 1.2525427464810908, "grad_norm": 0.32263246178627014, "learning_rate": 0.0002169255730327526, "loss": 0.4281, "step": 9945 }, { "epoch": 1.253172528891268, "grad_norm": 0.2980237603187561, "learning_rate": 0.00021682717991325075, "loss": 0.4163, "step": 9950 }, { "epoch": 1.2538023113014454, "grad_norm": 0.3552669584751129, "learning_rate": 0.0002167287509060268, "loss": 0.4378, "step": 9955 }, { "epoch": 1.2544320937116227, "grad_norm": 0.3207598924636841, "learning_rate": 0.00021663028606393932, "loss": 0.4411, "step": 9960 }, { "epoch": 1.2550618761218, "grad_norm": 0.3187711238861084, "learning_rate": 0.0002165317854398663, "loss": 0.4384, "step": 9965 }, { "epoch": 1.2556916585319773, "grad_norm": 0.3156946897506714, "learning_rate": 0.00021643324908670472, "loss": 0.4227, "step": 9970 }, { "epoch": 1.2563214409421546, "grad_norm": 0.3305997848510742, "learning_rate": 0.00021633467705737085, "loss": 0.4521, "step": 9975 }, { "epoch": 1.2569512233523317, "grad_norm": 0.2964983880519867, "learning_rate": 0.00021623606940480015, "loss": 0.4373, "step": 9980 }, { "epoch": 1.257581005762509, "grad_norm": 0.29807519912719727, "learning_rate": 0.00021613742618194727, "loss": 0.4591, "step": 9985 }, { "epoch": 1.2582107881726863, "grad_norm": 0.29127413034439087, "learning_rate": 0.00021603874744178576, "loss": 0.43, "step": 9990 }, { "epoch": 1.2588405705828636, "grad_norm": 0.339418888092041, "learning_rate": 0.00021594003323730836, "loss": 0.4407, "step": 9995 }, { "epoch": 1.2594703529930409, "grad_norm": 0.3419913053512573, "learning_rate": 0.0002158412836215269, "loss": 0.4678, "step": 10000 }, { "epoch": 1.2594703529930409, "eval_loss": 0.30844178795814514, "eval_runtime": 6.157, "eval_samples_per_second": 162.416, "eval_steps_per_second": 10.232, "step": 10000 }, { "epoch": 1.2601001354032182, "grad_norm": 0.3139461576938629, "learning_rate": 0.00021574249864747216, "loss": 0.4491, "step": 10005 }, { "epoch": 1.2607299178133955, "grad_norm": 0.319892555475235, "learning_rate": 0.00021564367836819393, "loss": 0.4648, "step": 10010 }, { "epoch": 1.2613597002235728, "grad_norm": 0.30732426047325134, "learning_rate": 0.00021554482283676093, "loss": 0.4113, "step": 10015 }, { "epoch": 1.26198948263375, "grad_norm": 0.3234427571296692, "learning_rate": 0.00021544593210626092, "loss": 0.4461, "step": 10020 }, { "epoch": 1.2626192650439272, "grad_norm": 0.3298225998878479, "learning_rate": 0.00021534700622980038, "loss": 0.4487, "step": 10025 }, { "epoch": 1.2632490474541047, "grad_norm": 0.3394641578197479, "learning_rate": 0.0002152480452605048, "loss": 0.4653, "step": 10030 }, { "epoch": 1.2638788298642818, "grad_norm": 0.29091107845306396, "learning_rate": 0.00021514904925151854, "loss": 0.4639, "step": 10035 }, { "epoch": 1.264508612274459, "grad_norm": 0.27975961565971375, "learning_rate": 0.00021505001825600461, "loss": 0.4094, "step": 10040 }, { "epoch": 1.2651383946846364, "grad_norm": 0.2882293164730072, "learning_rate": 0.00021495095232714503, "loss": 0.4212, "step": 10045 }, { "epoch": 1.2657681770948137, "grad_norm": 0.31701260805130005, "learning_rate": 0.0002148518515181404, "loss": 0.4427, "step": 10050 }, { "epoch": 1.266397959504991, "grad_norm": 0.33051052689552307, "learning_rate": 0.00021475271588221014, "loss": 0.4331, "step": 10055 }, { "epoch": 1.2670277419151683, "grad_norm": 0.32075920701026917, "learning_rate": 0.00021465354547259234, "loss": 0.4486, "step": 10060 }, { "epoch": 1.2676575243253456, "grad_norm": 0.3044838309288025, "learning_rate": 0.00021455434034254375, "loss": 0.4141, "step": 10065 }, { "epoch": 1.268287306735523, "grad_norm": 0.31618407368659973, "learning_rate": 0.00021445510054533983, "loss": 0.446, "step": 10070 }, { "epoch": 1.2689170891457002, "grad_norm": 0.3025960624217987, "learning_rate": 0.0002143558261342746, "loss": 0.4233, "step": 10075 }, { "epoch": 1.2695468715558773, "grad_norm": 0.2974034249782562, "learning_rate": 0.0002142565171626607, "loss": 0.4078, "step": 10080 }, { "epoch": 1.2701766539660548, "grad_norm": 0.34097397327423096, "learning_rate": 0.0002141571736838293, "loss": 0.4555, "step": 10085 }, { "epoch": 1.270806436376232, "grad_norm": 0.30995890498161316, "learning_rate": 0.0002140577957511302, "loss": 0.4388, "step": 10090 }, { "epoch": 1.2714362187864092, "grad_norm": 0.24191588163375854, "learning_rate": 0.00021395838341793145, "loss": 0.4114, "step": 10095 }, { "epoch": 1.2720660011965865, "grad_norm": 0.31779953837394714, "learning_rate": 0.00021385893673761986, "loss": 0.4169, "step": 10100 }, { "epoch": 1.2726957836067638, "grad_norm": 0.31599584221839905, "learning_rate": 0.0002137594557636006, "loss": 0.4081, "step": 10105 }, { "epoch": 1.2733255660169411, "grad_norm": 0.31904011964797974, "learning_rate": 0.00021365994054929713, "loss": 0.4406, "step": 10110 }, { "epoch": 1.2739553484271184, "grad_norm": 0.2923012375831604, "learning_rate": 0.00021356039114815145, "loss": 0.4335, "step": 10115 }, { "epoch": 1.2745851308372957, "grad_norm": 0.27983418107032776, "learning_rate": 0.00021346080761362385, "loss": 0.4039, "step": 10120 }, { "epoch": 1.275214913247473, "grad_norm": 0.29870182275772095, "learning_rate": 0.000213361189999193, "loss": 0.4311, "step": 10125 }, { "epoch": 1.2758446956576504, "grad_norm": 0.3060225546360016, "learning_rate": 0.00021326153835835574, "loss": 0.4722, "step": 10130 }, { "epoch": 1.2764744780678274, "grad_norm": 0.38860756158828735, "learning_rate": 0.00021316185274462734, "loss": 0.4276, "step": 10135 }, { "epoch": 1.277104260478005, "grad_norm": 0.32171720266342163, "learning_rate": 0.0002130621332115413, "loss": 0.4334, "step": 10140 }, { "epoch": 1.277734042888182, "grad_norm": 0.2947072684764862, "learning_rate": 0.00021296237981264916, "loss": 0.411, "step": 10145 }, { "epoch": 1.2783638252983593, "grad_norm": 0.2904439866542816, "learning_rate": 0.00021286259260152088, "loss": 0.4222, "step": 10150 }, { "epoch": 1.2789936077085367, "grad_norm": 0.2517947554588318, "learning_rate": 0.00021276277163174444, "loss": 0.4336, "step": 10155 }, { "epoch": 1.279623390118714, "grad_norm": 0.295692503452301, "learning_rate": 0.00021266291695692602, "loss": 0.4617, "step": 10160 }, { "epoch": 1.2802531725288913, "grad_norm": 0.3214627802371979, "learning_rate": 0.00021256302863068976, "loss": 0.4327, "step": 10165 }, { "epoch": 1.2808829549390686, "grad_norm": 0.3030719459056854, "learning_rate": 0.00021246310670667808, "loss": 0.4289, "step": 10170 }, { "epoch": 1.2815127373492459, "grad_norm": 0.32924139499664307, "learning_rate": 0.00021236315123855128, "loss": 0.4391, "step": 10175 }, { "epoch": 1.2821425197594232, "grad_norm": 0.2978973984718323, "learning_rate": 0.00021226316227998773, "loss": 0.4356, "step": 10180 }, { "epoch": 1.2827723021696005, "grad_norm": 0.289858341217041, "learning_rate": 0.00021216313988468375, "loss": 0.4302, "step": 10185 }, { "epoch": 1.2834020845797776, "grad_norm": 0.28235578536987305, "learning_rate": 0.00021206308410635376, "loss": 0.4581, "step": 10190 }, { "epoch": 1.284031866989955, "grad_norm": 0.28610706329345703, "learning_rate": 0.0002119629949987299, "loss": 0.4233, "step": 10195 }, { "epoch": 1.2846616494001322, "grad_norm": 0.347464382648468, "learning_rate": 0.00021186287261556238, "loss": 0.4191, "step": 10200 }, { "epoch": 1.2852914318103095, "grad_norm": 0.3228091299533844, "learning_rate": 0.00021176271701061914, "loss": 0.4162, "step": 10205 }, { "epoch": 1.2859212142204868, "grad_norm": 0.34487780928611755, "learning_rate": 0.00021166252823768606, "loss": 0.4383, "step": 10210 }, { "epoch": 1.286550996630664, "grad_norm": 0.34411466121673584, "learning_rate": 0.00021156230635056676, "loss": 0.4532, "step": 10215 }, { "epoch": 1.2871807790408414, "grad_norm": 0.38219863176345825, "learning_rate": 0.00021146205140308273, "loss": 0.4656, "step": 10220 }, { "epoch": 1.2878105614510187, "grad_norm": 0.3240879774093628, "learning_rate": 0.00021136176344907322, "loss": 0.4174, "step": 10225 }, { "epoch": 1.288440343861196, "grad_norm": 0.34157487750053406, "learning_rate": 0.00021126144254239503, "loss": 0.4297, "step": 10230 }, { "epoch": 1.2890701262713733, "grad_norm": 0.2788861095905304, "learning_rate": 0.00021116108873692286, "loss": 0.429, "step": 10235 }, { "epoch": 1.2896999086815506, "grad_norm": 0.28119325637817383, "learning_rate": 0.00021106070208654895, "loss": 0.4145, "step": 10240 }, { "epoch": 1.2903296910917277, "grad_norm": 0.32004043459892273, "learning_rate": 0.00021096028264518325, "loss": 0.4361, "step": 10245 }, { "epoch": 1.2909594735019052, "grad_norm": 0.3054758310317993, "learning_rate": 0.0002108598304667533, "loss": 0.4331, "step": 10250 }, { "epoch": 1.2915892559120823, "grad_norm": 0.3827783167362213, "learning_rate": 0.0002107593456052042, "loss": 0.4246, "step": 10255 }, { "epoch": 1.2922190383222596, "grad_norm": 0.3008691370487213, "learning_rate": 0.00021065882811449862, "loss": 0.4448, "step": 10260 }, { "epoch": 1.292848820732437, "grad_norm": 0.3227977752685547, "learning_rate": 0.00021055827804861675, "loss": 0.4308, "step": 10265 }, { "epoch": 1.2934786031426142, "grad_norm": 0.32592520117759705, "learning_rate": 0.00021045769546155623, "loss": 0.4472, "step": 10270 }, { "epoch": 1.2941083855527915, "grad_norm": 0.30866268277168274, "learning_rate": 0.00021035708040733231, "loss": 0.4193, "step": 10275 }, { "epoch": 1.2947381679629688, "grad_norm": 0.36590054631233215, "learning_rate": 0.0002102564329399775, "loss": 0.4554, "step": 10280 }, { "epoch": 1.2953679503731461, "grad_norm": 0.34002235531806946, "learning_rate": 0.00021015575311354175, "loss": 0.465, "step": 10285 }, { "epoch": 1.2959977327833234, "grad_norm": 0.26847660541534424, "learning_rate": 0.00021005504098209248, "loss": 0.4226, "step": 10290 }, { "epoch": 1.2966275151935007, "grad_norm": 0.2904103398323059, "learning_rate": 0.00020995429659971445, "loss": 0.4135, "step": 10295 }, { "epoch": 1.2972572976036778, "grad_norm": 0.2799352705478668, "learning_rate": 0.00020985352002050962, "loss": 0.4241, "step": 10300 }, { "epoch": 1.2978870800138553, "grad_norm": 0.3527425229549408, "learning_rate": 0.00020975271129859734, "loss": 0.4397, "step": 10305 }, { "epoch": 1.2985168624240324, "grad_norm": 0.30795904994010925, "learning_rate": 0.00020965187048811417, "loss": 0.4248, "step": 10310 }, { "epoch": 1.2991466448342097, "grad_norm": 0.31814008951187134, "learning_rate": 0.00020955099764321402, "loss": 0.4501, "step": 10315 }, { "epoch": 1.299776427244387, "grad_norm": 0.29917100071907043, "learning_rate": 0.0002094500928180678, "loss": 0.4511, "step": 10320 }, { "epoch": 1.3004062096545643, "grad_norm": 0.32853367924690247, "learning_rate": 0.00020934915606686373, "loss": 0.4055, "step": 10325 }, { "epoch": 1.3010359920647416, "grad_norm": 0.420550137758255, "learning_rate": 0.00020924818744380723, "loss": 0.4417, "step": 10330 }, { "epoch": 1.301665774474919, "grad_norm": 0.3183051347732544, "learning_rate": 0.0002091471870031207, "loss": 0.4256, "step": 10335 }, { "epoch": 1.3022955568850962, "grad_norm": 0.30520761013031006, "learning_rate": 0.00020904615479904362, "loss": 0.4213, "step": 10340 }, { "epoch": 1.3029253392952735, "grad_norm": 0.3484478294849396, "learning_rate": 0.0002089450908858327, "loss": 0.4202, "step": 10345 }, { "epoch": 1.3035551217054508, "grad_norm": 0.3063777983188629, "learning_rate": 0.00020884399531776154, "loss": 0.4121, "step": 10350 }, { "epoch": 1.304184904115628, "grad_norm": 0.35436901450157166, "learning_rate": 0.00020874286814912072, "loss": 0.4351, "step": 10355 }, { "epoch": 1.3048146865258055, "grad_norm": 0.3233969211578369, "learning_rate": 0.00020864170943421786, "loss": 0.4326, "step": 10360 }, { "epoch": 1.3054444689359825, "grad_norm": 0.34073448181152344, "learning_rate": 0.0002085405192273776, "loss": 0.4454, "step": 10365 }, { "epoch": 1.3060742513461598, "grad_norm": 0.28455135226249695, "learning_rate": 0.00020843929758294121, "loss": 0.4511, "step": 10370 }, { "epoch": 1.3067040337563371, "grad_norm": 0.31585589051246643, "learning_rate": 0.0002083380445552672, "loss": 0.4258, "step": 10375 }, { "epoch": 1.3073338161665145, "grad_norm": 0.31528952717781067, "learning_rate": 0.00020823676019873064, "loss": 0.424, "step": 10380 }, { "epoch": 1.3079635985766918, "grad_norm": 0.3014485836029053, "learning_rate": 0.00020813544456772362, "loss": 0.4429, "step": 10385 }, { "epoch": 1.308593380986869, "grad_norm": 0.2870473861694336, "learning_rate": 0.00020803409771665484, "loss": 0.439, "step": 10390 }, { "epoch": 1.3092231633970464, "grad_norm": 0.2971458435058594, "learning_rate": 0.00020793271969994997, "loss": 0.4233, "step": 10395 }, { "epoch": 1.3098529458072237, "grad_norm": 0.2853131890296936, "learning_rate": 0.00020783131057205135, "loss": 0.4164, "step": 10400 }, { "epoch": 1.310482728217401, "grad_norm": 0.29392004013061523, "learning_rate": 0.00020772987038741793, "loss": 0.4234, "step": 10405 }, { "epoch": 1.311112510627578, "grad_norm": 0.2874060273170471, "learning_rate": 0.00020762839920052543, "loss": 0.4413, "step": 10410 }, { "epoch": 1.3117422930377556, "grad_norm": 0.2806376516819, "learning_rate": 0.00020752689706586615, "loss": 0.4223, "step": 10415 }, { "epoch": 1.3123720754479327, "grad_norm": 0.28510767221450806, "learning_rate": 0.00020742536403794908, "loss": 0.4183, "step": 10420 }, { "epoch": 1.31300185785811, "grad_norm": 0.3087919056415558, "learning_rate": 0.00020732380017129983, "loss": 0.4241, "step": 10425 }, { "epoch": 1.3136316402682873, "grad_norm": 0.2965323328971863, "learning_rate": 0.00020722220552046048, "loss": 0.4225, "step": 10430 }, { "epoch": 1.3142614226784646, "grad_norm": 0.2907772660255432, "learning_rate": 0.00020712058013998963, "loss": 0.4176, "step": 10435 }, { "epoch": 1.3148912050886419, "grad_norm": 0.3242434859275818, "learning_rate": 0.0002070189240844625, "loss": 0.4377, "step": 10440 }, { "epoch": 1.3155209874988192, "grad_norm": 0.28129857778549194, "learning_rate": 0.00020691723740847066, "loss": 0.425, "step": 10445 }, { "epoch": 1.3161507699089965, "grad_norm": 0.3053089380264282, "learning_rate": 0.00020681552016662224, "loss": 0.4066, "step": 10450 }, { "epoch": 1.3167805523191738, "grad_norm": 0.27167361974716187, "learning_rate": 0.00020671377241354168, "loss": 0.4458, "step": 10455 }, { "epoch": 1.317410334729351, "grad_norm": 0.29331174492836, "learning_rate": 0.00020661199420386986, "loss": 0.427, "step": 10460 }, { "epoch": 1.3180401171395282, "grad_norm": 0.329908162355423, "learning_rate": 0.00020651018559226394, "loss": 0.4292, "step": 10465 }, { "epoch": 1.3186698995497057, "grad_norm": 0.32669904828071594, "learning_rate": 0.0002064083466333976, "loss": 0.4118, "step": 10470 }, { "epoch": 1.3192996819598828, "grad_norm": 0.35706159472465515, "learning_rate": 0.00020630647738196058, "loss": 0.4433, "step": 10475 }, { "epoch": 1.31992946437006, "grad_norm": 0.3119877278804779, "learning_rate": 0.00020620457789265905, "loss": 0.4206, "step": 10480 }, { "epoch": 1.3205592467802374, "grad_norm": 0.34798958897590637, "learning_rate": 0.00020610264822021532, "loss": 0.39, "step": 10485 }, { "epoch": 1.3211890291904147, "grad_norm": 0.36972302198410034, "learning_rate": 0.000206000688419368, "loss": 0.4402, "step": 10490 }, { "epoch": 1.321818811600592, "grad_norm": 0.27949050068855286, "learning_rate": 0.00020589869854487175, "loss": 0.4221, "step": 10495 }, { "epoch": 1.3224485940107693, "grad_norm": 0.30757853388786316, "learning_rate": 0.00020579667865149758, "loss": 0.4402, "step": 10500 }, { "epoch": 1.3230783764209466, "grad_norm": 0.3018808364868164, "learning_rate": 0.0002056946287940324, "loss": 0.4088, "step": 10505 }, { "epoch": 1.323708158831124, "grad_norm": 0.2630440592765808, "learning_rate": 0.00020559254902727942, "loss": 0.4062, "step": 10510 }, { "epoch": 1.3243379412413012, "grad_norm": 0.3145885169506073, "learning_rate": 0.00020549043940605767, "loss": 0.4301, "step": 10515 }, { "epoch": 1.3249677236514783, "grad_norm": 0.3040730655193329, "learning_rate": 0.0002053882999852025, "loss": 0.4267, "step": 10520 }, { "epoch": 1.3255975060616558, "grad_norm": 0.2861897945404053, "learning_rate": 0.00020528613081956498, "loss": 0.4115, "step": 10525 }, { "epoch": 1.326227288471833, "grad_norm": 0.2938830256462097, "learning_rate": 0.00020518393196401234, "loss": 0.4315, "step": 10530 }, { "epoch": 1.3268570708820102, "grad_norm": 0.24550281465053558, "learning_rate": 0.0002050817034734277, "loss": 0.4181, "step": 10535 }, { "epoch": 1.3274868532921875, "grad_norm": 0.30074000358581543, "learning_rate": 0.00020497944540271017, "loss": 0.4016, "step": 10540 }, { "epoch": 1.3281166357023648, "grad_norm": 0.34675145149230957, "learning_rate": 0.0002048771578067745, "loss": 0.4157, "step": 10545 }, { "epoch": 1.3287464181125421, "grad_norm": 0.3144848644733429, "learning_rate": 0.00020477484074055157, "loss": 0.4024, "step": 10550 }, { "epoch": 1.3293762005227194, "grad_norm": 0.32153722643852234, "learning_rate": 0.00020467249425898805, "loss": 0.4114, "step": 10555 }, { "epoch": 1.3300059829328967, "grad_norm": 0.301707923412323, "learning_rate": 0.0002045701184170462, "loss": 0.423, "step": 10560 }, { "epoch": 1.330635765343074, "grad_norm": 0.25224459171295166, "learning_rate": 0.00020446771326970424, "loss": 0.4037, "step": 10565 }, { "epoch": 1.3312655477532513, "grad_norm": 0.3072243928909302, "learning_rate": 0.00020436527887195607, "loss": 0.4279, "step": 10570 }, { "epoch": 1.3318953301634284, "grad_norm": 0.36949509382247925, "learning_rate": 0.00020426281527881137, "loss": 0.4259, "step": 10575 }, { "epoch": 1.332525112573606, "grad_norm": 0.30465519428253174, "learning_rate": 0.00020416032254529535, "loss": 0.457, "step": 10580 }, { "epoch": 1.333154894983783, "grad_norm": 0.2719140350818634, "learning_rate": 0.00020405780072644896, "loss": 0.3927, "step": 10585 }, { "epoch": 1.3337846773939603, "grad_norm": 0.33556681871414185, "learning_rate": 0.00020395524987732876, "loss": 0.4341, "step": 10590 }, { "epoch": 1.3344144598041376, "grad_norm": 0.3145639896392822, "learning_rate": 0.0002038526700530069, "loss": 0.4176, "step": 10595 }, { "epoch": 1.335044242214315, "grad_norm": 0.31328147649765015, "learning_rate": 0.00020375006130857111, "loss": 0.4332, "step": 10600 }, { "epoch": 1.3356740246244923, "grad_norm": 0.3016543388366699, "learning_rate": 0.00020364742369912464, "loss": 0.4173, "step": 10605 }, { "epoch": 1.3363038070346696, "grad_norm": 0.31259703636169434, "learning_rate": 0.0002035447572797862, "loss": 0.4091, "step": 10610 }, { "epoch": 1.3369335894448469, "grad_norm": 0.34624606370925903, "learning_rate": 0.00020344206210569, "loss": 0.4408, "step": 10615 }, { "epoch": 1.3375633718550242, "grad_norm": 0.3144773542881012, "learning_rate": 0.00020333933823198566, "loss": 0.3863, "step": 10620 }, { "epoch": 1.3381931542652015, "grad_norm": 0.3231208026409149, "learning_rate": 0.00020323658571383833, "loss": 0.4151, "step": 10625 }, { "epoch": 1.3388229366753785, "grad_norm": 0.3022227883338928, "learning_rate": 0.00020313380460642842, "loss": 0.4108, "step": 10630 }, { "epoch": 1.339452719085556, "grad_norm": 0.2899850606918335, "learning_rate": 0.00020303099496495172, "loss": 0.412, "step": 10635 }, { "epoch": 1.3400825014957332, "grad_norm": 0.31005537509918213, "learning_rate": 0.00020292815684461936, "loss": 0.4114, "step": 10640 }, { "epoch": 1.3407122839059105, "grad_norm": 0.29457420110702515, "learning_rate": 0.00020282529030065784, "loss": 0.4292, "step": 10645 }, { "epoch": 1.3413420663160878, "grad_norm": 0.31712374091148376, "learning_rate": 0.00020272239538830867, "loss": 0.4029, "step": 10650 }, { "epoch": 1.341971848726265, "grad_norm": 0.3228032886981964, "learning_rate": 0.00020261947216282896, "loss": 0.414, "step": 10655 }, { "epoch": 1.3426016311364424, "grad_norm": 0.305351197719574, "learning_rate": 0.00020251652067949068, "loss": 0.4233, "step": 10660 }, { "epoch": 1.3432314135466197, "grad_norm": 0.30317017436027527, "learning_rate": 0.00020241354099358123, "loss": 0.3816, "step": 10665 }, { "epoch": 1.343861195956797, "grad_norm": 0.3036525845527649, "learning_rate": 0.00020231053316040293, "loss": 0.4115, "step": 10670 }, { "epoch": 1.3444909783669743, "grad_norm": 0.33367687463760376, "learning_rate": 0.00020220749723527353, "loss": 0.449, "step": 10675 }, { "epoch": 1.3451207607771516, "grad_norm": 0.28938767313957214, "learning_rate": 0.00020210443327352553, "loss": 0.3919, "step": 10680 }, { "epoch": 1.3457505431873287, "grad_norm": 0.2946431338787079, "learning_rate": 0.00020200134133050666, "loss": 0.4043, "step": 10685 }, { "epoch": 1.3463803255975062, "grad_norm": 0.31588709354400635, "learning_rate": 0.00020189822146157962, "loss": 0.4136, "step": 10690 }, { "epoch": 1.3470101080076833, "grad_norm": 0.2830824851989746, "learning_rate": 0.00020179507372212224, "loss": 0.4164, "step": 10695 }, { "epoch": 1.3476398904178606, "grad_norm": 0.31364426016807556, "learning_rate": 0.0002016918981675271, "loss": 0.4197, "step": 10700 }, { "epoch": 1.348269672828038, "grad_norm": 0.32086437940597534, "learning_rate": 0.00020158869485320194, "loss": 0.4346, "step": 10705 }, { "epoch": 1.3488994552382152, "grad_norm": 0.30549678206443787, "learning_rate": 0.0002014854638345692, "loss": 0.4134, "step": 10710 }, { "epoch": 1.3495292376483925, "grad_norm": 0.2996455132961273, "learning_rate": 0.00020138220516706634, "loss": 0.3846, "step": 10715 }, { "epoch": 1.3501590200585698, "grad_norm": 0.3013511002063751, "learning_rate": 0.00020127891890614556, "loss": 0.3994, "step": 10720 }, { "epoch": 1.350788802468747, "grad_norm": 0.28055283427238464, "learning_rate": 0.00020117560510727402, "loss": 0.4163, "step": 10725 }, { "epoch": 1.3514185848789244, "grad_norm": 0.3024522364139557, "learning_rate": 0.00020107226382593357, "loss": 0.4042, "step": 10730 }, { "epoch": 1.3520483672891017, "grad_norm": 0.28080272674560547, "learning_rate": 0.00020096889511762083, "loss": 0.4176, "step": 10735 }, { "epoch": 1.3526781496992788, "grad_norm": 0.3069353997707367, "learning_rate": 0.00020086549903784715, "loss": 0.4189, "step": 10740 }, { "epoch": 1.353307932109456, "grad_norm": 0.2898117005825043, "learning_rate": 0.00020076207564213866, "loss": 0.4342, "step": 10745 }, { "epoch": 1.3539377145196334, "grad_norm": 0.3365933299064636, "learning_rate": 0.00020065862498603592, "loss": 0.3944, "step": 10750 }, { "epoch": 1.3545674969298107, "grad_norm": 0.29901427030563354, "learning_rate": 0.00020055514712509446, "loss": 0.4059, "step": 10755 }, { "epoch": 1.355197279339988, "grad_norm": 0.2927230894565582, "learning_rate": 0.00020045164211488417, "loss": 0.4137, "step": 10760 }, { "epoch": 1.3558270617501653, "grad_norm": 0.35867777466773987, "learning_rate": 0.00020034811001098964, "loss": 0.4108, "step": 10765 }, { "epoch": 1.3564568441603426, "grad_norm": 0.2955409586429596, "learning_rate": 0.00020024455086900994, "loss": 0.4328, "step": 10770 }, { "epoch": 1.35708662657052, "grad_norm": 0.29247814416885376, "learning_rate": 0.00020014096474455873, "loss": 0.4014, "step": 10775 }, { "epoch": 1.3577164089806972, "grad_norm": 0.30858153104782104, "learning_rate": 0.00020003735169326413, "loss": 0.4112, "step": 10780 }, { "epoch": 1.3583461913908745, "grad_norm": 0.4134693145751953, "learning_rate": 0.0001999337117707687, "loss": 0.4062, "step": 10785 }, { "epoch": 1.3589759738010518, "grad_norm": 0.3120553195476532, "learning_rate": 0.0001998300450327294, "loss": 0.4049, "step": 10790 }, { "epoch": 1.359605756211229, "grad_norm": 0.3146657645702362, "learning_rate": 0.00019972635153481767, "loss": 0.4029, "step": 10795 }, { "epoch": 1.3602355386214062, "grad_norm": 0.2997225821018219, "learning_rate": 0.00019962263133271933, "loss": 0.3792, "step": 10800 }, { "epoch": 1.3608653210315835, "grad_norm": 0.32136911153793335, "learning_rate": 0.0001995188844821345, "loss": 0.3987, "step": 10805 }, { "epoch": 1.3614951034417608, "grad_norm": 0.30875489115715027, "learning_rate": 0.0001994151110387775, "loss": 0.4211, "step": 10810 }, { "epoch": 1.3621248858519381, "grad_norm": 0.30939677357673645, "learning_rate": 0.00019931131105837714, "loss": 0.451, "step": 10815 }, { "epoch": 1.3627546682621154, "grad_norm": 0.27874892950057983, "learning_rate": 0.0001992074845966764, "loss": 0.4102, "step": 10820 }, { "epoch": 1.3633844506722927, "grad_norm": 0.28371527791023254, "learning_rate": 0.00019910363170943233, "loss": 0.4153, "step": 10825 }, { "epoch": 1.36401423308247, "grad_norm": 0.2852970063686371, "learning_rate": 0.00019899975245241643, "loss": 0.409, "step": 10830 }, { "epoch": 1.3646440154926474, "grad_norm": 0.300521582365036, "learning_rate": 0.00019889584688141418, "loss": 0.4032, "step": 10835 }, { "epoch": 1.3652737979028244, "grad_norm": 0.30631181597709656, "learning_rate": 0.00019879191505222526, "loss": 0.4299, "step": 10840 }, { "epoch": 1.365903580313002, "grad_norm": 0.3514620363712311, "learning_rate": 0.00019868795702066342, "loss": 0.4051, "step": 10845 }, { "epoch": 1.366533362723179, "grad_norm": 0.27533403038978577, "learning_rate": 0.00019858397284255657, "loss": 0.4108, "step": 10850 }, { "epoch": 1.3671631451333564, "grad_norm": 0.3143390119075775, "learning_rate": 0.00019847996257374645, "loss": 0.426, "step": 10855 }, { "epoch": 1.3677929275435337, "grad_norm": 0.3388061821460724, "learning_rate": 0.00019837592627008904, "loss": 0.4163, "step": 10860 }, { "epoch": 1.368422709953711, "grad_norm": 0.34078383445739746, "learning_rate": 0.00019827186398745417, "loss": 0.4015, "step": 10865 }, { "epoch": 1.3690524923638883, "grad_norm": 0.33532068133354187, "learning_rate": 0.00019816777578172582, "loss": 0.4436, "step": 10870 }, { "epoch": 1.3696822747740656, "grad_norm": 0.3230116069316864, "learning_rate": 0.0001980636617088015, "loss": 0.4239, "step": 10875 }, { "epoch": 1.3703120571842429, "grad_norm": 0.31974872946739197, "learning_rate": 0.00019795952182459297, "loss": 0.4313, "step": 10880 }, { "epoch": 1.3709418395944202, "grad_norm": 0.2825758159160614, "learning_rate": 0.0001978553561850257, "loss": 0.4045, "step": 10885 }, { "epoch": 1.3715716220045975, "grad_norm": 0.2678980529308319, "learning_rate": 0.00019775116484603908, "loss": 0.3899, "step": 10890 }, { "epoch": 1.3722014044147746, "grad_norm": 0.3492506146430969, "learning_rate": 0.00019764694786358612, "loss": 0.3807, "step": 10895 }, { "epoch": 1.372831186824952, "grad_norm": 0.30808547139167786, "learning_rate": 0.00019754270529363384, "loss": 0.4163, "step": 10900 }, { "epoch": 1.3734609692351292, "grad_norm": 0.30980342626571655, "learning_rate": 0.0001974384371921628, "loss": 0.3843, "step": 10905 }, { "epoch": 1.3740907516453065, "grad_norm": 0.2915787100791931, "learning_rate": 0.00019733414361516736, "loss": 0.4208, "step": 10910 }, { "epoch": 1.3747205340554838, "grad_norm": 0.30979228019714355, "learning_rate": 0.00019722982461865555, "loss": 0.4188, "step": 10915 }, { "epoch": 1.375350316465661, "grad_norm": 0.28953999280929565, "learning_rate": 0.00019712548025864918, "loss": 0.3934, "step": 10920 }, { "epoch": 1.3759800988758384, "grad_norm": 0.31495416164398193, "learning_rate": 0.00019702111059118334, "loss": 0.4117, "step": 10925 }, { "epoch": 1.3766098812860157, "grad_norm": 0.38459569215774536, "learning_rate": 0.00019691671567230714, "loss": 0.4229, "step": 10930 }, { "epoch": 1.377239663696193, "grad_norm": 0.31138870120048523, "learning_rate": 0.00019681229555808285, "loss": 0.4284, "step": 10935 }, { "epoch": 1.3778694461063703, "grad_norm": 0.2761414051055908, "learning_rate": 0.0001967078503045866, "loss": 0.3838, "step": 10940 }, { "epoch": 1.3784992285165476, "grad_norm": 0.31627506017684937, "learning_rate": 0.00019660337996790772, "loss": 0.4008, "step": 10945 }, { "epoch": 1.3791290109267247, "grad_norm": 0.29025107622146606, "learning_rate": 0.00019649888460414937, "loss": 0.409, "step": 10950 }, { "epoch": 1.3797587933369022, "grad_norm": 0.3379102349281311, "learning_rate": 0.0001963943642694278, "loss": 0.4213, "step": 10955 }, { "epoch": 1.3803885757470793, "grad_norm": 0.3209204375743866, "learning_rate": 0.00019628981901987285, "loss": 0.3834, "step": 10960 }, { "epoch": 1.3810183581572566, "grad_norm": 0.31717419624328613, "learning_rate": 0.0001961852489116277, "loss": 0.4499, "step": 10965 }, { "epoch": 1.381648140567434, "grad_norm": 0.27936458587646484, "learning_rate": 0.00019608065400084898, "loss": 0.3987, "step": 10970 }, { "epoch": 1.3822779229776112, "grad_norm": 0.28877684473991394, "learning_rate": 0.00019597603434370637, "loss": 0.4252, "step": 10975 }, { "epoch": 1.3829077053877885, "grad_norm": 0.3423072397708893, "learning_rate": 0.00019587138999638316, "loss": 0.421, "step": 10980 }, { "epoch": 1.3835374877979658, "grad_norm": 0.26486262679100037, "learning_rate": 0.00019576672101507568, "loss": 0.4104, "step": 10985 }, { "epoch": 1.3841672702081431, "grad_norm": 0.2929472029209137, "learning_rate": 0.00019566202745599365, "loss": 0.4127, "step": 10990 }, { "epoch": 1.3847970526183204, "grad_norm": 0.2696884870529175, "learning_rate": 0.00019555730937535976, "loss": 0.4067, "step": 10995 }, { "epoch": 1.3854268350284977, "grad_norm": 0.32420167326927185, "learning_rate": 0.0001954525668294102, "loss": 0.4136, "step": 11000 }, { "epoch": 1.3854268350284977, "eval_loss": 0.3039778470993042, "eval_runtime": 6.1549, "eval_samples_per_second": 162.472, "eval_steps_per_second": 10.236, "step": 11000 }, { "epoch": 1.3860566174386748, "grad_norm": 0.3149106502532959, "learning_rate": 0.00019534779987439395, "loss": 0.3954, "step": 11005 }, { "epoch": 1.3866863998488523, "grad_norm": 0.332868367433548, "learning_rate": 0.0001952430085665733, "loss": 0.4178, "step": 11010 }, { "epoch": 1.3873161822590294, "grad_norm": 0.285671591758728, "learning_rate": 0.00019513819296222362, "loss": 0.3788, "step": 11015 }, { "epoch": 1.3879459646692067, "grad_norm": 0.3317325711250305, "learning_rate": 0.0001950333531176332, "loss": 0.4091, "step": 11020 }, { "epoch": 1.388575747079384, "grad_norm": 0.27808326482772827, "learning_rate": 0.00019492848908910356, "loss": 0.4104, "step": 11025 }, { "epoch": 1.3892055294895613, "grad_norm": 0.29725268483161926, "learning_rate": 0.00019482360093294897, "loss": 0.3981, "step": 11030 }, { "epoch": 1.3898353118997386, "grad_norm": 0.24770186841487885, "learning_rate": 0.0001947186887054968, "loss": 0.4052, "step": 11035 }, { "epoch": 1.390465094309916, "grad_norm": 0.31627580523490906, "learning_rate": 0.00019461375246308734, "loss": 0.4051, "step": 11040 }, { "epoch": 1.3910948767200932, "grad_norm": 0.2721163332462311, "learning_rate": 0.00019450879226207368, "loss": 0.3962, "step": 11045 }, { "epoch": 1.3917246591302705, "grad_norm": 0.31926798820495605, "learning_rate": 0.00019440380815882187, "loss": 0.3964, "step": 11050 }, { "epoch": 1.3923544415404479, "grad_norm": 0.3047574460506439, "learning_rate": 0.0001942988002097108, "loss": 0.3818, "step": 11055 }, { "epoch": 1.392984223950625, "grad_norm": 0.35394978523254395, "learning_rate": 0.00019419376847113216, "loss": 0.4398, "step": 11060 }, { "epoch": 1.3936140063608025, "grad_norm": 0.2855307459831238, "learning_rate": 0.00019408871299949037, "loss": 0.4089, "step": 11065 }, { "epoch": 1.3942437887709795, "grad_norm": 0.3066868484020233, "learning_rate": 0.00019398363385120254, "loss": 0.3987, "step": 11070 }, { "epoch": 1.3948735711811568, "grad_norm": 0.312775194644928, "learning_rate": 0.0001938785310826987, "loss": 0.3794, "step": 11075 }, { "epoch": 1.3955033535913342, "grad_norm": 0.3235652446746826, "learning_rate": 0.00019377340475042136, "loss": 0.3852, "step": 11080 }, { "epoch": 1.3961331360015115, "grad_norm": 0.33732032775878906, "learning_rate": 0.00019366825491082574, "loss": 0.4003, "step": 11085 }, { "epoch": 1.3967629184116888, "grad_norm": 0.33549800515174866, "learning_rate": 0.00019356308162037976, "loss": 0.3699, "step": 11090 }, { "epoch": 1.397392700821866, "grad_norm": 0.3360839784145355, "learning_rate": 0.00019345788493556394, "loss": 0.394, "step": 11095 }, { "epoch": 1.3980224832320434, "grad_norm": 0.3089699447154999, "learning_rate": 0.00019335266491287112, "loss": 0.4016, "step": 11100 }, { "epoch": 1.3986522656422207, "grad_norm": 0.30863386392593384, "learning_rate": 0.00019324742160880702, "loss": 0.3973, "step": 11105 }, { "epoch": 1.399282048052398, "grad_norm": 0.30803561210632324, "learning_rate": 0.00019314215507988965, "loss": 0.4119, "step": 11110 }, { "epoch": 1.399911830462575, "grad_norm": 0.2869633138179779, "learning_rate": 0.0001930368653826495, "loss": 0.4098, "step": 11115 }, { "epoch": 1.4005416128727526, "grad_norm": 0.25851666927337646, "learning_rate": 0.00019293155257362957, "loss": 0.4034, "step": 11120 }, { "epoch": 1.4011713952829297, "grad_norm": 0.32763540744781494, "learning_rate": 0.00019282621670938527, "loss": 0.4121, "step": 11125 }, { "epoch": 1.401801177693107, "grad_norm": 0.3531438410282135, "learning_rate": 0.00019272085784648432, "loss": 0.4021, "step": 11130 }, { "epoch": 1.4024309601032843, "grad_norm": 0.27890294790267944, "learning_rate": 0.00019261547604150687, "loss": 0.3872, "step": 11135 }, { "epoch": 1.4030607425134616, "grad_norm": 0.26616647839546204, "learning_rate": 0.00019251007135104534, "loss": 0.4293, "step": 11140 }, { "epoch": 1.4036905249236389, "grad_norm": 0.3214140236377716, "learning_rate": 0.0001924046438317045, "loss": 0.3974, "step": 11145 }, { "epoch": 1.4043203073338162, "grad_norm": 0.31075042486190796, "learning_rate": 0.00019229919354010126, "loss": 0.3978, "step": 11150 }, { "epoch": 1.4049500897439935, "grad_norm": 0.31546491384506226, "learning_rate": 0.00019219372053286485, "loss": 0.3937, "step": 11155 }, { "epoch": 1.4055798721541708, "grad_norm": 0.33116820454597473, "learning_rate": 0.00019208822486663677, "loss": 0.3779, "step": 11160 }, { "epoch": 1.406209654564348, "grad_norm": 0.30159297585487366, "learning_rate": 0.0001919827065980705, "loss": 0.3822, "step": 11165 }, { "epoch": 1.4068394369745252, "grad_norm": 0.29656147956848145, "learning_rate": 0.00019187716578383178, "loss": 0.4047, "step": 11170 }, { "epoch": 1.4074692193847027, "grad_norm": 0.3193992078304291, "learning_rate": 0.0001917716024805985, "loss": 0.4088, "step": 11175 }, { "epoch": 1.4080990017948798, "grad_norm": 0.29688236117362976, "learning_rate": 0.0001916660167450605, "loss": 0.3693, "step": 11180 }, { "epoch": 1.408728784205057, "grad_norm": 0.33146485686302185, "learning_rate": 0.00019156040863391977, "loss": 0.3865, "step": 11185 }, { "epoch": 1.4093585666152344, "grad_norm": 0.3015727698802948, "learning_rate": 0.00019145477820389027, "loss": 0.3857, "step": 11190 }, { "epoch": 1.4099883490254117, "grad_norm": 0.27797931432724, "learning_rate": 0.00019134912551169796, "loss": 0.4148, "step": 11195 }, { "epoch": 1.410618131435589, "grad_norm": 0.30010297894477844, "learning_rate": 0.00019124345061408067, "loss": 0.4076, "step": 11200 }, { "epoch": 1.4112479138457663, "grad_norm": 0.29101455211639404, "learning_rate": 0.00019113775356778833, "loss": 0.3802, "step": 11205 }, { "epoch": 1.4118776962559436, "grad_norm": 0.29706794023513794, "learning_rate": 0.00019103203442958266, "loss": 0.3867, "step": 11210 }, { "epoch": 1.412507478666121, "grad_norm": 0.2546458840370178, "learning_rate": 0.00019092629325623723, "loss": 0.3964, "step": 11215 }, { "epoch": 1.4131372610762982, "grad_norm": 0.3409089148044586, "learning_rate": 0.0001908205301045375, "loss": 0.4171, "step": 11220 }, { "epoch": 1.4137670434864753, "grad_norm": 0.27688878774642944, "learning_rate": 0.00019071474503128057, "loss": 0.405, "step": 11225 }, { "epoch": 1.4143968258966528, "grad_norm": 0.30704399943351746, "learning_rate": 0.00019060893809327563, "loss": 0.4024, "step": 11230 }, { "epoch": 1.41502660830683, "grad_norm": 0.2823016941547394, "learning_rate": 0.00019050310934734326, "loss": 0.3908, "step": 11235 }, { "epoch": 1.4156563907170072, "grad_norm": 0.3309246897697449, "learning_rate": 0.000190397258850316, "loss": 0.4049, "step": 11240 }, { "epoch": 1.4162861731271845, "grad_norm": 0.2959790527820587, "learning_rate": 0.00019029138665903794, "loss": 0.4031, "step": 11245 }, { "epoch": 1.4169159555373618, "grad_norm": 0.29836803674697876, "learning_rate": 0.00019018549283036497, "loss": 0.4103, "step": 11250 }, { "epoch": 1.4175457379475391, "grad_norm": 0.3187415301799774, "learning_rate": 0.00019007957742116433, "loss": 0.4055, "step": 11255 }, { "epoch": 1.4181755203577164, "grad_norm": 0.3521386981010437, "learning_rate": 0.00018997364048831515, "loss": 0.3839, "step": 11260 }, { "epoch": 1.4188053027678937, "grad_norm": 0.3985449969768524, "learning_rate": 0.00018986768208870792, "loss": 0.4058, "step": 11265 }, { "epoch": 1.419435085178071, "grad_norm": 0.30885374546051025, "learning_rate": 0.00018976170227924473, "loss": 0.394, "step": 11270 }, { "epoch": 1.4200648675882483, "grad_norm": 0.2981209456920624, "learning_rate": 0.00018965570111683917, "loss": 0.3917, "step": 11275 }, { "epoch": 1.4206946499984254, "grad_norm": 0.2993827164173126, "learning_rate": 0.00018954967865841629, "loss": 0.4016, "step": 11280 }, { "epoch": 1.421324432408603, "grad_norm": 0.283632755279541, "learning_rate": 0.00018944363496091254, "loss": 0.3873, "step": 11285 }, { "epoch": 1.42195421481878, "grad_norm": 0.2871907353401184, "learning_rate": 0.0001893375700812758, "loss": 0.4136, "step": 11290 }, { "epoch": 1.4225839972289573, "grad_norm": 0.3341853618621826, "learning_rate": 0.00018923148407646537, "loss": 0.409, "step": 11295 }, { "epoch": 1.4232137796391346, "grad_norm": 0.32463696599006653, "learning_rate": 0.00018912537700345192, "loss": 0.3912, "step": 11300 }, { "epoch": 1.423843562049312, "grad_norm": 0.33242395520210266, "learning_rate": 0.00018901924891921726, "loss": 0.4158, "step": 11305 }, { "epoch": 1.4244733444594893, "grad_norm": 0.301289439201355, "learning_rate": 0.00018891309988075463, "loss": 0.4012, "step": 11310 }, { "epoch": 1.4251031268696666, "grad_norm": 0.28636494278907776, "learning_rate": 0.00018880692994506845, "loss": 0.3817, "step": 11315 }, { "epoch": 1.4257329092798439, "grad_norm": 0.2837861478328705, "learning_rate": 0.00018870073916917455, "loss": 0.4116, "step": 11320 }, { "epoch": 1.4263626916900212, "grad_norm": 0.31169527769088745, "learning_rate": 0.0001885945276100996, "loss": 0.3967, "step": 11325 }, { "epoch": 1.4269924741001985, "grad_norm": 0.31035301089286804, "learning_rate": 0.00018848829532488177, "loss": 0.407, "step": 11330 }, { "epoch": 1.4276222565103756, "grad_norm": 0.3047008812427521, "learning_rate": 0.00018838204237057023, "loss": 0.3939, "step": 11335 }, { "epoch": 1.428252038920553, "grad_norm": 0.2646077871322632, "learning_rate": 0.00018827576880422515, "loss": 0.3881, "step": 11340 }, { "epoch": 1.4288818213307302, "grad_norm": 0.31041520833969116, "learning_rate": 0.00018816947468291788, "loss": 0.3822, "step": 11345 }, { "epoch": 1.4295116037409075, "grad_norm": 0.2699204385280609, "learning_rate": 0.00018806316006373086, "loss": 0.3895, "step": 11350 }, { "epoch": 1.4301413861510848, "grad_norm": 0.285363107919693, "learning_rate": 0.00018795682500375742, "loss": 0.4027, "step": 11355 }, { "epoch": 1.430771168561262, "grad_norm": 0.27154308557510376, "learning_rate": 0.00018785046956010194, "loss": 0.3815, "step": 11360 }, { "epoch": 1.4314009509714394, "grad_norm": 0.29652640223503113, "learning_rate": 0.00018774409378987972, "loss": 0.4003, "step": 11365 }, { "epoch": 1.4320307333816167, "grad_norm": 0.2921524941921234, "learning_rate": 0.00018763769775021695, "loss": 0.3828, "step": 11370 }, { "epoch": 1.432660515791794, "grad_norm": 0.26934945583343506, "learning_rate": 0.00018753128149825074, "loss": 0.3999, "step": 11375 }, { "epoch": 1.4332902982019713, "grad_norm": 0.29320502281188965, "learning_rate": 0.00018742484509112907, "loss": 0.4034, "step": 11380 }, { "epoch": 1.4339200806121486, "grad_norm": 0.2842418849468231, "learning_rate": 0.00018731838858601074, "loss": 0.3877, "step": 11385 }, { "epoch": 1.4345498630223257, "grad_norm": 0.31208139657974243, "learning_rate": 0.00018721191204006525, "loss": 0.3731, "step": 11390 }, { "epoch": 1.4351796454325032, "grad_norm": 0.2809062600135803, "learning_rate": 0.00018710541551047303, "loss": 0.3939, "step": 11395 }, { "epoch": 1.4358094278426803, "grad_norm": 0.308969646692276, "learning_rate": 0.00018699889905442508, "loss": 0.3874, "step": 11400 }, { "epoch": 1.4364392102528576, "grad_norm": 0.3051275610923767, "learning_rate": 0.00018689236272912316, "loss": 0.3676, "step": 11405 }, { "epoch": 1.437068992663035, "grad_norm": 0.31084486842155457, "learning_rate": 0.0001867858065917798, "loss": 0.3954, "step": 11410 }, { "epoch": 1.4376987750732122, "grad_norm": 0.28356167674064636, "learning_rate": 0.000186679230699618, "loss": 0.3701, "step": 11415 }, { "epoch": 1.4383285574833895, "grad_norm": 0.3026244044303894, "learning_rate": 0.0001865726351098715, "loss": 0.3797, "step": 11420 }, { "epoch": 1.4389583398935668, "grad_norm": 0.2909928560256958, "learning_rate": 0.00018646601987978452, "loss": 0.4022, "step": 11425 }, { "epoch": 1.439588122303744, "grad_norm": 0.3085511326789856, "learning_rate": 0.00018635938506661183, "loss": 0.4099, "step": 11430 }, { "epoch": 1.4402179047139214, "grad_norm": 0.28047701716423035, "learning_rate": 0.0001862527307276189, "loss": 0.3789, "step": 11435 }, { "epoch": 1.4408476871240987, "grad_norm": 0.2697209119796753, "learning_rate": 0.00018614605692008146, "loss": 0.3864, "step": 11440 }, { "epoch": 1.4414774695342758, "grad_norm": 0.40744665265083313, "learning_rate": 0.0001860393637012858, "loss": 0.4085, "step": 11445 }, { "epoch": 1.4421072519444533, "grad_norm": 0.25875118374824524, "learning_rate": 0.00018593265112852854, "loss": 0.4033, "step": 11450 }, { "epoch": 1.4427370343546304, "grad_norm": 0.2960642874240875, "learning_rate": 0.00018582591925911694, "loss": 0.4214, "step": 11455 }, { "epoch": 1.4433668167648077, "grad_norm": 0.2711925506591797, "learning_rate": 0.00018571916815036824, "loss": 0.3537, "step": 11460 }, { "epoch": 1.443996599174985, "grad_norm": 0.28002485632896423, "learning_rate": 0.0001856123978596104, "loss": 0.3787, "step": 11465 }, { "epoch": 1.4446263815851623, "grad_norm": 0.3143458366394043, "learning_rate": 0.00018550560844418138, "loss": 0.3553, "step": 11470 }, { "epoch": 1.4452561639953396, "grad_norm": 0.3184334337711334, "learning_rate": 0.00018539879996142962, "loss": 0.385, "step": 11475 }, { "epoch": 1.445885946405517, "grad_norm": 0.3327188789844513, "learning_rate": 0.00018529197246871368, "loss": 0.4074, "step": 11480 }, { "epoch": 1.4465157288156942, "grad_norm": 0.317942351102829, "learning_rate": 0.0001851851260234024, "loss": 0.3995, "step": 11485 }, { "epoch": 1.4471455112258715, "grad_norm": 0.2567351758480072, "learning_rate": 0.00018507826068287473, "loss": 0.3661, "step": 11490 }, { "epoch": 1.4477752936360488, "grad_norm": 0.29439592361450195, "learning_rate": 0.0001849713765045198, "loss": 0.3759, "step": 11495 }, { "epoch": 1.448405076046226, "grad_norm": 0.3125048279762268, "learning_rate": 0.0001848644735457368, "loss": 0.4107, "step": 11500 }, { "epoch": 1.4490348584564032, "grad_norm": 0.2855313718318939, "learning_rate": 0.00018475755186393516, "loss": 0.4061, "step": 11505 }, { "epoch": 1.4496646408665805, "grad_norm": 0.3040854036808014, "learning_rate": 0.00018465061151653423, "loss": 0.3902, "step": 11510 }, { "epoch": 1.4502944232767578, "grad_norm": 0.28425633907318115, "learning_rate": 0.0001845436525609634, "loss": 0.3861, "step": 11515 }, { "epoch": 1.4509242056869351, "grad_norm": 0.31335607171058655, "learning_rate": 0.00018443667505466205, "loss": 0.3949, "step": 11520 }, { "epoch": 1.4515539880971124, "grad_norm": 0.2725260555744171, "learning_rate": 0.00018432967905507967, "loss": 0.3979, "step": 11525 }, { "epoch": 1.4521837705072898, "grad_norm": 0.2674049437046051, "learning_rate": 0.00018422266461967537, "loss": 0.3747, "step": 11530 }, { "epoch": 1.452813552917467, "grad_norm": 0.3076520562171936, "learning_rate": 0.0001841156318059185, "loss": 0.385, "step": 11535 }, { "epoch": 1.4534433353276444, "grad_norm": 0.23340527713298798, "learning_rate": 0.00018400858067128806, "loss": 0.3736, "step": 11540 }, { "epoch": 1.4540731177378217, "grad_norm": 0.29402169585227966, "learning_rate": 0.00018390151127327295, "loss": 0.3994, "step": 11545 }, { "epoch": 1.454702900147999, "grad_norm": 0.32409217953681946, "learning_rate": 0.00018379442366937187, "loss": 0.3979, "step": 11550 }, { "epoch": 1.455332682558176, "grad_norm": 0.28875911235809326, "learning_rate": 0.00018368731791709337, "loss": 0.365, "step": 11555 }, { "epoch": 1.4559624649683534, "grad_norm": 0.26838234066963196, "learning_rate": 0.0001835801940739556, "loss": 0.3912, "step": 11560 }, { "epoch": 1.4565922473785307, "grad_norm": 0.31797516345977783, "learning_rate": 0.00018347305219748665, "loss": 0.3622, "step": 11565 }, { "epoch": 1.457222029788708, "grad_norm": 0.31115812063217163, "learning_rate": 0.00018336589234522398, "loss": 0.4283, "step": 11570 }, { "epoch": 1.4578518121988853, "grad_norm": 0.2730168402194977, "learning_rate": 0.00018325871457471496, "loss": 0.3864, "step": 11575 }, { "epoch": 1.4584815946090626, "grad_norm": 0.28333088755607605, "learning_rate": 0.00018315151894351657, "loss": 0.3451, "step": 11580 }, { "epoch": 1.4591113770192399, "grad_norm": 0.3169468343257904, "learning_rate": 0.00018304430550919522, "loss": 0.3719, "step": 11585 }, { "epoch": 1.4597411594294172, "grad_norm": 0.3411467969417572, "learning_rate": 0.000182937074329327, "loss": 0.4073, "step": 11590 }, { "epoch": 1.4603709418395945, "grad_norm": 0.3131183385848999, "learning_rate": 0.0001828298254614975, "loss": 0.4117, "step": 11595 }, { "epoch": 1.4610007242497716, "grad_norm": 0.25929832458496094, "learning_rate": 0.0001827225589633018, "loss": 0.3834, "step": 11600 }, { "epoch": 1.461630506659949, "grad_norm": 0.32609832286834717, "learning_rate": 0.00018261527489234444, "loss": 0.3972, "step": 11605 }, { "epoch": 1.4622602890701262, "grad_norm": 0.3089287579059601, "learning_rate": 0.00018250797330623953, "loss": 0.3727, "step": 11610 }, { "epoch": 1.4628900714803035, "grad_norm": 0.2891997992992401, "learning_rate": 0.00018240065426261033, "loss": 0.3891, "step": 11615 }, { "epoch": 1.4635198538904808, "grad_norm": 0.3119528293609619, "learning_rate": 0.00018229331781908971, "loss": 0.388, "step": 11620 }, { "epoch": 1.464149636300658, "grad_norm": 0.3314844071865082, "learning_rate": 0.00018218596403331977, "loss": 0.3803, "step": 11625 }, { "epoch": 1.4647794187108354, "grad_norm": 0.27267536520957947, "learning_rate": 0.00018207859296295197, "loss": 0.3665, "step": 11630 }, { "epoch": 1.4654092011210127, "grad_norm": 0.30490440130233765, "learning_rate": 0.00018197120466564693, "loss": 0.4051, "step": 11635 }, { "epoch": 1.46603898353119, "grad_norm": 0.3182273209095001, "learning_rate": 0.00018186379919907472, "loss": 0.38, "step": 11640 }, { "epoch": 1.4666687659413673, "grad_norm": 0.3026832044124603, "learning_rate": 0.00018175637662091448, "loss": 0.3371, "step": 11645 }, { "epoch": 1.4672985483515446, "grad_norm": 0.3287534713745117, "learning_rate": 0.0001816489369888546, "loss": 0.4234, "step": 11650 }, { "epoch": 1.4679283307617217, "grad_norm": 0.28076720237731934, "learning_rate": 0.00018154148036059263, "loss": 0.3825, "step": 11655 }, { "epoch": 1.4685581131718992, "grad_norm": 0.304766446352005, "learning_rate": 0.0001814340067938352, "loss": 0.3905, "step": 11660 }, { "epoch": 1.4691878955820763, "grad_norm": 0.30473533272743225, "learning_rate": 0.00018132651634629812, "loss": 0.409, "step": 11665 }, { "epoch": 1.4698176779922536, "grad_norm": 0.32186418771743774, "learning_rate": 0.00018121900907570618, "loss": 0.3741, "step": 11670 }, { "epoch": 1.470447460402431, "grad_norm": 0.33314061164855957, "learning_rate": 0.00018111148503979326, "loss": 0.3981, "step": 11675 }, { "epoch": 1.4710772428126082, "grad_norm": 0.3202495872974396, "learning_rate": 0.00018100394429630223, "loss": 0.4014, "step": 11680 }, { "epoch": 1.4717070252227855, "grad_norm": 0.2801063656806946, "learning_rate": 0.00018089638690298488, "loss": 0.3827, "step": 11685 }, { "epoch": 1.4723368076329628, "grad_norm": 0.3252180516719818, "learning_rate": 0.000180788812917602, "loss": 0.4207, "step": 11690 }, { "epoch": 1.4729665900431401, "grad_norm": 0.279823899269104, "learning_rate": 0.0001806812223979233, "loss": 0.4092, "step": 11695 }, { "epoch": 1.4735963724533174, "grad_norm": 0.29136526584625244, "learning_rate": 0.00018057361540172733, "loss": 0.3939, "step": 11700 }, { "epoch": 1.4742261548634947, "grad_norm": 0.2708832621574402, "learning_rate": 0.00018046599198680153, "loss": 0.3645, "step": 11705 }, { "epoch": 1.4748559372736718, "grad_norm": 0.34708496928215027, "learning_rate": 0.00018035835221094214, "loss": 0.3814, "step": 11710 }, { "epoch": 1.4754857196838493, "grad_norm": 0.3081948161125183, "learning_rate": 0.00018025069613195413, "loss": 0.3738, "step": 11715 }, { "epoch": 1.4761155020940264, "grad_norm": 0.26891911029815674, "learning_rate": 0.0001801430238076513, "loss": 0.3724, "step": 11720 }, { "epoch": 1.4767452845042037, "grad_norm": 0.3266797363758087, "learning_rate": 0.00018003533529585612, "loss": 0.3749, "step": 11725 }, { "epoch": 1.477375066914381, "grad_norm": 0.25788089632987976, "learning_rate": 0.00017992763065439982, "loss": 0.3661, "step": 11730 }, { "epoch": 1.4780048493245583, "grad_norm": 0.301270067691803, "learning_rate": 0.00017981990994112227, "loss": 0.3832, "step": 11735 }, { "epoch": 1.4786346317347356, "grad_norm": 0.2785583734512329, "learning_rate": 0.0001797121732138719, "loss": 0.357, "step": 11740 }, { "epoch": 1.479264414144913, "grad_norm": 0.3153518736362457, "learning_rate": 0.00017960442053050583, "loss": 0.3964, "step": 11745 }, { "epoch": 1.4798941965550902, "grad_norm": 0.2862750291824341, "learning_rate": 0.00017949665194888972, "loss": 0.3781, "step": 11750 }, { "epoch": 1.4805239789652676, "grad_norm": 0.31263992190361023, "learning_rate": 0.00017938886752689765, "loss": 0.3822, "step": 11755 }, { "epoch": 1.4811537613754449, "grad_norm": 0.31964340806007385, "learning_rate": 0.00017928106732241248, "loss": 0.3757, "step": 11760 }, { "epoch": 1.481783543785622, "grad_norm": 0.29111340641975403, "learning_rate": 0.0001791732513933253, "loss": 0.362, "step": 11765 }, { "epoch": 1.4824133261957995, "grad_norm": 0.32248637080192566, "learning_rate": 0.00017906541979753572, "loss": 0.3978, "step": 11770 }, { "epoch": 1.4830431086059765, "grad_norm": 0.2964222729206085, "learning_rate": 0.0001789575725929518, "loss": 0.3853, "step": 11775 }, { "epoch": 1.4836728910161538, "grad_norm": 0.32823482155799866, "learning_rate": 0.0001788497098374899, "loss": 0.3828, "step": 11780 }, { "epoch": 1.4843026734263312, "grad_norm": 0.30054226517677307, "learning_rate": 0.0001787418315890748, "loss": 0.38, "step": 11785 }, { "epoch": 1.4849324558365085, "grad_norm": 0.30829596519470215, "learning_rate": 0.0001786339379056397, "loss": 0.3645, "step": 11790 }, { "epoch": 1.4855622382466858, "grad_norm": 0.3095497786998749, "learning_rate": 0.00017852602884512584, "loss": 0.3727, "step": 11795 }, { "epoch": 1.486192020656863, "grad_norm": 0.29647621512413025, "learning_rate": 0.00017841810446548283, "loss": 0.3764, "step": 11800 }, { "epoch": 1.4868218030670404, "grad_norm": 0.3227784037590027, "learning_rate": 0.00017831016482466864, "loss": 0.3797, "step": 11805 }, { "epoch": 1.4874515854772177, "grad_norm": 0.32365646958351135, "learning_rate": 0.00017820220998064927, "loss": 0.3766, "step": 11810 }, { "epoch": 1.488081367887395, "grad_norm": 0.36090198159217834, "learning_rate": 0.0001780942399913989, "loss": 0.4015, "step": 11815 }, { "epoch": 1.488711150297572, "grad_norm": 0.28814610838890076, "learning_rate": 0.00017798625491489994, "loss": 0.3616, "step": 11820 }, { "epoch": 1.4893409327077496, "grad_norm": 0.2654825747013092, "learning_rate": 0.00017787825480914283, "loss": 0.3462, "step": 11825 }, { "epoch": 1.4899707151179267, "grad_norm": 0.2913071811199188, "learning_rate": 0.000177770239732126, "loss": 0.3707, "step": 11830 }, { "epoch": 1.490600497528104, "grad_norm": 0.33099865913391113, "learning_rate": 0.0001776622097418562, "loss": 0.3644, "step": 11835 }, { "epoch": 1.4912302799382813, "grad_norm": 0.2980974018573761, "learning_rate": 0.0001775541648963478, "loss": 0.3839, "step": 11840 }, { "epoch": 1.4918600623484586, "grad_norm": 0.2673074007034302, "learning_rate": 0.00017744610525362352, "loss": 0.3736, "step": 11845 }, { "epoch": 1.4924898447586359, "grad_norm": 0.26277023553848267, "learning_rate": 0.00017733803087171372, "loss": 0.3463, "step": 11850 }, { "epoch": 1.4931196271688132, "grad_norm": 0.27924680709838867, "learning_rate": 0.00017722994180865696, "loss": 0.4095, "step": 11855 }, { "epoch": 1.4937494095789905, "grad_norm": 0.2761695086956024, "learning_rate": 0.00017712183812249938, "loss": 0.3748, "step": 11860 }, { "epoch": 1.4943791919891678, "grad_norm": 0.312854528427124, "learning_rate": 0.00017701371987129523, "loss": 0.3748, "step": 11865 }, { "epoch": 1.495008974399345, "grad_norm": 0.3033592998981476, "learning_rate": 0.00017690558711310644, "loss": 0.3728, "step": 11870 }, { "epoch": 1.4956387568095222, "grad_norm": 0.2711508572101593, "learning_rate": 0.00017679743990600281, "loss": 0.3748, "step": 11875 }, { "epoch": 1.4962685392196997, "grad_norm": 0.28003159165382385, "learning_rate": 0.00017668927830806177, "loss": 0.3658, "step": 11880 }, { "epoch": 1.4968983216298768, "grad_norm": 0.2750314772129059, "learning_rate": 0.0001765811023773687, "loss": 0.3705, "step": 11885 }, { "epoch": 1.497528104040054, "grad_norm": 0.31037452816963196, "learning_rate": 0.00017647291217201644, "loss": 0.3718, "step": 11890 }, { "epoch": 1.4981578864502314, "grad_norm": 0.33681520819664, "learning_rate": 0.00017636470775010563, "loss": 0.37, "step": 11895 }, { "epoch": 1.4987876688604087, "grad_norm": 0.2735719084739685, "learning_rate": 0.00017625648916974452, "loss": 0.3898, "step": 11900 }, { "epoch": 1.499417451270586, "grad_norm": 0.2873845398426056, "learning_rate": 0.00017614825648904902, "loss": 0.387, "step": 11905 }, { "epoch": 1.5000472336807633, "grad_norm": 0.2826070189476013, "learning_rate": 0.00017604000976614243, "loss": 0.3656, "step": 11910 }, { "epoch": 1.5006770160909406, "grad_norm": 0.2709527015686035, "learning_rate": 0.00017593174905915581, "loss": 0.3583, "step": 11915 }, { "epoch": 1.5013067985011177, "grad_norm": 0.3088144063949585, "learning_rate": 0.00017582347442622755, "loss": 0.3715, "step": 11920 }, { "epoch": 1.5019365809112952, "grad_norm": 0.27996301651000977, "learning_rate": 0.0001757151859255038, "loss": 0.3636, "step": 11925 }, { "epoch": 1.5025663633214723, "grad_norm": 0.3117114007472992, "learning_rate": 0.00017560688361513766, "loss": 0.351, "step": 11930 }, { "epoch": 1.5031961457316498, "grad_norm": 0.32614433765411377, "learning_rate": 0.00017549856755329012, "loss": 0.3711, "step": 11935 }, { "epoch": 1.503825928141827, "grad_norm": 0.23831017315387726, "learning_rate": 0.0001753902377981294, "loss": 0.3645, "step": 11940 }, { "epoch": 1.5044557105520044, "grad_norm": 0.27338019013404846, "learning_rate": 0.000175281894407831, "loss": 0.3606, "step": 11945 }, { "epoch": 1.5050854929621815, "grad_norm": 0.2813990116119385, "learning_rate": 0.0001751735374405778, "loss": 0.3637, "step": 11950 }, { "epoch": 1.5057152753723588, "grad_norm": 0.2607782781124115, "learning_rate": 0.00017506516695455992, "loss": 0.3493, "step": 11955 }, { "epoch": 1.5063450577825361, "grad_norm": 0.2825680077075958, "learning_rate": 0.0001749567830079749, "loss": 0.3474, "step": 11960 }, { "epoch": 1.5069748401927134, "grad_norm": 0.2957023084163666, "learning_rate": 0.00017484838565902735, "loss": 0.3852, "step": 11965 }, { "epoch": 1.5076046226028907, "grad_norm": 0.31363338232040405, "learning_rate": 0.00017473997496592904, "loss": 0.3944, "step": 11970 }, { "epoch": 1.5082344050130678, "grad_norm": 0.271010160446167, "learning_rate": 0.00017463155098689908, "loss": 0.3667, "step": 11975 }, { "epoch": 1.5088641874232454, "grad_norm": 0.28360188007354736, "learning_rate": 0.00017452311378016362, "loss": 0.3564, "step": 11980 }, { "epoch": 1.5094939698334224, "grad_norm": 0.28345590829849243, "learning_rate": 0.00017441466340395583, "loss": 0.358, "step": 11985 }, { "epoch": 1.5101237522436, "grad_norm": 0.23574601113796234, "learning_rate": 0.00017430619991651614, "loss": 0.3588, "step": 11990 }, { "epoch": 1.510753534653777, "grad_norm": 0.32633906602859497, "learning_rate": 0.0001741977233760919, "loss": 0.3786, "step": 11995 }, { "epoch": 1.5113833170639546, "grad_norm": 0.31216609477996826, "learning_rate": 0.00017408923384093746, "loss": 0.3949, "step": 12000 }, { "epoch": 1.5113833170639546, "eval_loss": 0.3009350597858429, "eval_runtime": 6.1573, "eval_samples_per_second": 162.409, "eval_steps_per_second": 10.232, "step": 12000 }, { "epoch": 1.5120130994741316, "grad_norm": 0.2735341191291809, "learning_rate": 0.00017398073136931416, "loss": 0.3667, "step": 12005 }, { "epoch": 1.512642881884309, "grad_norm": 0.3168368637561798, "learning_rate": 0.0001738722160194904, "loss": 0.3693, "step": 12010 }, { "epoch": 1.5132726642944863, "grad_norm": 0.27563655376434326, "learning_rate": 0.0001737636878497413, "loss": 0.3721, "step": 12015 }, { "epoch": 1.5139024467046636, "grad_norm": 0.27887552976608276, "learning_rate": 0.00017365514691834898, "loss": 0.402, "step": 12020 }, { "epoch": 1.5145322291148409, "grad_norm": 0.30676189064979553, "learning_rate": 0.0001735465932836024, "loss": 0.3875, "step": 12025 }, { "epoch": 1.515162011525018, "grad_norm": 0.30623871088027954, "learning_rate": 0.00017343802700379746, "loss": 0.3644, "step": 12030 }, { "epoch": 1.5157917939351955, "grad_norm": 0.2534305453300476, "learning_rate": 0.00017332944813723658, "loss": 0.3753, "step": 12035 }, { "epoch": 1.5164215763453726, "grad_norm": 0.29374125599861145, "learning_rate": 0.00017322085674222916, "loss": 0.3964, "step": 12040 }, { "epoch": 1.51705135875555, "grad_norm": 0.2833009362220764, "learning_rate": 0.00017311225287709126, "loss": 0.3778, "step": 12045 }, { "epoch": 1.5176811411657272, "grad_norm": 0.273299902677536, "learning_rate": 0.0001730036366001456, "loss": 0.3661, "step": 12050 }, { "epoch": 1.5183109235759045, "grad_norm": 0.32840752601623535, "learning_rate": 0.00017289500796972165, "loss": 0.3564, "step": 12055 }, { "epoch": 1.5189407059860818, "grad_norm": 0.289202481508255, "learning_rate": 0.00017278636704415545, "loss": 0.3885, "step": 12060 }, { "epoch": 1.519570488396259, "grad_norm": 0.28327277302742004, "learning_rate": 0.0001726777138817896, "loss": 0.376, "step": 12065 }, { "epoch": 1.5202002708064364, "grad_norm": 0.2617267370223999, "learning_rate": 0.00017256904854097343, "loss": 0.353, "step": 12070 }, { "epoch": 1.5208300532166137, "grad_norm": 0.2693130671977997, "learning_rate": 0.00017246037108006266, "loss": 0.386, "step": 12075 }, { "epoch": 1.521459835626791, "grad_norm": 0.260217547416687, "learning_rate": 0.00017235168155741956, "loss": 0.3773, "step": 12080 }, { "epoch": 1.522089618036968, "grad_norm": 0.2806963622570038, "learning_rate": 0.0001722429800314129, "loss": 0.3703, "step": 12085 }, { "epoch": 1.5227194004471456, "grad_norm": 0.2797011435031891, "learning_rate": 0.00017213426656041787, "loss": 0.3523, "step": 12090 }, { "epoch": 1.5233491828573227, "grad_norm": 0.3413710296154022, "learning_rate": 0.00017202554120281612, "loss": 0.3825, "step": 12095 }, { "epoch": 1.5239789652675002, "grad_norm": 0.2759542167186737, "learning_rate": 0.0001719168040169956, "loss": 0.346, "step": 12100 }, { "epoch": 1.5246087476776773, "grad_norm": 0.28816723823547363, "learning_rate": 0.00017180805506135068, "loss": 0.3772, "step": 12105 }, { "epoch": 1.5252385300878546, "grad_norm": 0.2563376724720001, "learning_rate": 0.00017169929439428207, "loss": 0.3661, "step": 12110 }, { "epoch": 1.525868312498032, "grad_norm": 0.29572755098342896, "learning_rate": 0.0001715905220741967, "loss": 0.3428, "step": 12115 }, { "epoch": 1.5264980949082092, "grad_norm": 0.28491732478141785, "learning_rate": 0.0001714817381595078, "loss": 0.3778, "step": 12120 }, { "epoch": 1.5271278773183865, "grad_norm": 0.28429144620895386, "learning_rate": 0.0001713729427086348, "loss": 0.351, "step": 12125 }, { "epoch": 1.5277576597285638, "grad_norm": 0.3044835925102234, "learning_rate": 0.00017126413578000342, "loss": 0.3651, "step": 12130 }, { "epoch": 1.5283874421387411, "grad_norm": 0.30945730209350586, "learning_rate": 0.0001711553174320453, "loss": 0.3731, "step": 12135 }, { "epoch": 1.5290172245489182, "grad_norm": 0.26389655470848083, "learning_rate": 0.00017104648772319853, "loss": 0.3527, "step": 12140 }, { "epoch": 1.5296470069590957, "grad_norm": 0.3144720196723938, "learning_rate": 0.0001709376467119071, "loss": 0.3776, "step": 12145 }, { "epoch": 1.5302767893692728, "grad_norm": 0.2860710918903351, "learning_rate": 0.00017082879445662113, "loss": 0.3575, "step": 12150 }, { "epoch": 1.5309065717794503, "grad_norm": 0.2869095802307129, "learning_rate": 0.00017071993101579674, "loss": 0.3322, "step": 12155 }, { "epoch": 1.5315363541896274, "grad_norm": 0.2524400055408478, "learning_rate": 0.00017061105644789612, "loss": 0.3743, "step": 12160 }, { "epoch": 1.5321661365998047, "grad_norm": 0.2670304477214813, "learning_rate": 0.00017050217081138736, "loss": 0.3735, "step": 12165 }, { "epoch": 1.532795919009982, "grad_norm": 0.2701478898525238, "learning_rate": 0.00017039327416474456, "loss": 0.3467, "step": 12170 }, { "epoch": 1.5334257014201593, "grad_norm": 0.2941682040691376, "learning_rate": 0.0001702843665664477, "loss": 0.3895, "step": 12175 }, { "epoch": 1.5340554838303366, "grad_norm": 0.28004932403564453, "learning_rate": 0.00017017544807498264, "loss": 0.3666, "step": 12180 }, { "epoch": 1.534685266240514, "grad_norm": 0.29110807180404663, "learning_rate": 0.00017006651874884116, "loss": 0.3628, "step": 12185 }, { "epoch": 1.5353150486506912, "grad_norm": 0.2467578798532486, "learning_rate": 0.00016995757864652066, "loss": 0.35, "step": 12190 }, { "epoch": 1.5359448310608683, "grad_norm": 0.3148331046104431, "learning_rate": 0.00016984862782652463, "loss": 0.3535, "step": 12195 }, { "epoch": 1.5365746134710458, "grad_norm": 0.28578343987464905, "learning_rate": 0.00016973966634736202, "loss": 0.3477, "step": 12200 }, { "epoch": 1.537204395881223, "grad_norm": 0.24588525295257568, "learning_rate": 0.0001696306942675477, "loss": 0.3459, "step": 12205 }, { "epoch": 1.5378341782914005, "grad_norm": 0.2754054665565491, "learning_rate": 0.00016952171164560213, "loss": 0.3555, "step": 12210 }, { "epoch": 1.5384639607015775, "grad_norm": 0.28237447142601013, "learning_rate": 0.00016941271854005148, "loss": 0.3446, "step": 12215 }, { "epoch": 1.5390937431117548, "grad_norm": 0.27689647674560547, "learning_rate": 0.00016930371500942755, "loss": 0.3651, "step": 12220 }, { "epoch": 1.5397235255219321, "grad_norm": 0.29644525051116943, "learning_rate": 0.0001691947011122677, "loss": 0.3482, "step": 12225 }, { "epoch": 1.5403533079321095, "grad_norm": 0.3168468475341797, "learning_rate": 0.0001690856769071149, "loss": 0.3859, "step": 12230 }, { "epoch": 1.5409830903422868, "grad_norm": 0.282879501581192, "learning_rate": 0.0001689766424525177, "loss": 0.3742, "step": 12235 }, { "epoch": 1.541612872752464, "grad_norm": 0.2539578676223755, "learning_rate": 0.00016886759780702996, "loss": 0.3467, "step": 12240 }, { "epoch": 1.5422426551626414, "grad_norm": 0.3353635370731354, "learning_rate": 0.00016875854302921122, "loss": 0.3831, "step": 12245 }, { "epoch": 1.5428724375728184, "grad_norm": 0.2890516519546509, "learning_rate": 0.0001686494781776264, "loss": 0.3672, "step": 12250 }, { "epoch": 1.543502219982996, "grad_norm": 0.3136516213417053, "learning_rate": 0.00016854040331084583, "loss": 0.37, "step": 12255 }, { "epoch": 1.544132002393173, "grad_norm": 0.29757821559906006, "learning_rate": 0.0001684313184874451, "loss": 0.3681, "step": 12260 }, { "epoch": 1.5447617848033506, "grad_norm": 0.3504684269428253, "learning_rate": 0.0001683222237660054, "loss": 0.3868, "step": 12265 }, { "epoch": 1.5453915672135277, "grad_norm": 0.25241127610206604, "learning_rate": 0.00016821311920511297, "loss": 0.3425, "step": 12270 }, { "epoch": 1.546021349623705, "grad_norm": 0.27286654710769653, "learning_rate": 0.00016810400486335953, "loss": 0.3604, "step": 12275 }, { "epoch": 1.5466511320338823, "grad_norm": 0.3104652166366577, "learning_rate": 0.0001679948807993419, "loss": 0.3487, "step": 12280 }, { "epoch": 1.5472809144440596, "grad_norm": 0.2972196042537689, "learning_rate": 0.00016788574707166226, "loss": 0.3555, "step": 12285 }, { "epoch": 1.5479106968542369, "grad_norm": 0.29232388734817505, "learning_rate": 0.00016777660373892787, "loss": 0.3654, "step": 12290 }, { "epoch": 1.5485404792644142, "grad_norm": 0.29798245429992676, "learning_rate": 0.00016766745085975126, "loss": 0.3575, "step": 12295 }, { "epoch": 1.5491702616745915, "grad_norm": 0.2721775472164154, "learning_rate": 0.0001675582884927499, "loss": 0.3409, "step": 12300 }, { "epoch": 1.5498000440847686, "grad_norm": 0.3131150007247925, "learning_rate": 0.00016744911669654662, "loss": 0.3695, "step": 12305 }, { "epoch": 1.550429826494946, "grad_norm": 0.29543280601501465, "learning_rate": 0.00016733993552976901, "loss": 0.3572, "step": 12310 }, { "epoch": 1.5510596089051232, "grad_norm": 0.3287052512168884, "learning_rate": 0.00016723074505105, "loss": 0.3681, "step": 12315 }, { "epoch": 1.5516893913153007, "grad_norm": 0.2833183705806732, "learning_rate": 0.0001671215453190273, "loss": 0.3709, "step": 12320 }, { "epoch": 1.5523191737254778, "grad_norm": 0.2558510899543762, "learning_rate": 0.00016701233639234363, "loss": 0.3404, "step": 12325 }, { "epoch": 1.552948956135655, "grad_norm": 0.2524779438972473, "learning_rate": 0.0001669031183296467, "loss": 0.3492, "step": 12330 }, { "epoch": 1.5535787385458324, "grad_norm": 0.2844880521297455, "learning_rate": 0.00016679389118958918, "loss": 0.3538, "step": 12335 }, { "epoch": 1.5542085209560097, "grad_norm": 0.28060173988342285, "learning_rate": 0.0001666846550308285, "loss": 0.3615, "step": 12340 }, { "epoch": 1.554838303366187, "grad_norm": 0.2490835040807724, "learning_rate": 0.00016657540991202687, "loss": 0.3655, "step": 12345 }, { "epoch": 1.5554680857763643, "grad_norm": 0.27524054050445557, "learning_rate": 0.00016646615589185153, "loss": 0.3412, "step": 12350 }, { "epoch": 1.5560978681865416, "grad_norm": 0.31142935156822205, "learning_rate": 0.00016635689302897435, "loss": 0.347, "step": 12355 }, { "epoch": 1.5567276505967187, "grad_norm": 0.28053995966911316, "learning_rate": 0.00016624762138207197, "loss": 0.3838, "step": 12360 }, { "epoch": 1.5573574330068962, "grad_norm": 0.2476169764995575, "learning_rate": 0.0001661383410098258, "loss": 0.3636, "step": 12365 }, { "epoch": 1.5579872154170733, "grad_norm": 0.4054109752178192, "learning_rate": 0.00016602905197092183, "loss": 0.3657, "step": 12370 }, { "epoch": 1.5586169978272508, "grad_norm": 0.2735072672367096, "learning_rate": 0.00016591975432405084, "loss": 0.3593, "step": 12375 }, { "epoch": 1.559246780237428, "grad_norm": 0.2994532883167267, "learning_rate": 0.00016581044812790817, "loss": 0.3641, "step": 12380 }, { "epoch": 1.5598765626476052, "grad_norm": 0.263090044260025, "learning_rate": 0.0001657011334411936, "loss": 0.3711, "step": 12385 }, { "epoch": 1.5605063450577825, "grad_norm": 0.25073063373565674, "learning_rate": 0.0001655918103226118, "loss": 0.3554, "step": 12390 }, { "epoch": 1.5611361274679598, "grad_norm": 0.2575080096721649, "learning_rate": 0.00016548247883087168, "loss": 0.3744, "step": 12395 }, { "epoch": 1.5617659098781371, "grad_norm": 0.2630578577518463, "learning_rate": 0.00016537313902468677, "loss": 0.3501, "step": 12400 }, { "epoch": 1.5623956922883144, "grad_norm": 0.3097805678844452, "learning_rate": 0.00016526379096277503, "loss": 0.3586, "step": 12405 }, { "epoch": 1.5630254746984917, "grad_norm": 0.3104281723499298, "learning_rate": 0.0001651544347038589, "loss": 0.3643, "step": 12410 }, { "epoch": 1.5636552571086688, "grad_norm": 0.3604758381843567, "learning_rate": 0.0001650450703066652, "loss": 0.3645, "step": 12415 }, { "epoch": 1.5642850395188463, "grad_norm": 0.30638590455055237, "learning_rate": 0.000164935697829925, "loss": 0.3572, "step": 12420 }, { "epoch": 1.5649148219290234, "grad_norm": 0.27940669655799866, "learning_rate": 0.00016482631733237397, "loss": 0.3636, "step": 12425 }, { "epoch": 1.565544604339201, "grad_norm": 0.28857216238975525, "learning_rate": 0.00016471692887275185, "loss": 0.3601, "step": 12430 }, { "epoch": 1.566174386749378, "grad_norm": 0.2992657721042633, "learning_rate": 0.0001646075325098027, "loss": 0.3621, "step": 12435 }, { "epoch": 1.5668041691595553, "grad_norm": 0.28050917387008667, "learning_rate": 0.00016449812830227498, "loss": 0.3623, "step": 12440 }, { "epoch": 1.5674339515697326, "grad_norm": 0.269634485244751, "learning_rate": 0.0001643887163089212, "loss": 0.3375, "step": 12445 }, { "epoch": 1.56806373397991, "grad_norm": 0.2825991213321686, "learning_rate": 0.00016427929658849807, "loss": 0.3523, "step": 12450 }, { "epoch": 1.5686935163900873, "grad_norm": 0.3219839334487915, "learning_rate": 0.00016416986919976645, "loss": 0.3588, "step": 12455 }, { "epoch": 1.5693232988002646, "grad_norm": 0.2681691646575928, "learning_rate": 0.00016406043420149146, "loss": 0.3466, "step": 12460 }, { "epoch": 1.5699530812104419, "grad_norm": 0.2719057500362396, "learning_rate": 0.0001639509916524421, "loss": 0.3599, "step": 12465 }, { "epoch": 1.570582863620619, "grad_norm": 0.24405649304389954, "learning_rate": 0.00016384154161139158, "loss": 0.3402, "step": 12470 }, { "epoch": 1.5712126460307965, "grad_norm": 0.306537002325058, "learning_rate": 0.00016373208413711696, "loss": 0.3283, "step": 12475 }, { "epoch": 1.5718424284409735, "grad_norm": 0.28490665555000305, "learning_rate": 0.0001636226192883996, "loss": 0.3529, "step": 12480 }, { "epoch": 1.572472210851151, "grad_norm": 0.2510652542114258, "learning_rate": 0.00016351314712402442, "loss": 0.3228, "step": 12485 }, { "epoch": 1.5731019932613282, "grad_norm": 0.2670060694217682, "learning_rate": 0.0001634036677027806, "loss": 0.3592, "step": 12490 }, { "epoch": 1.5737317756715055, "grad_norm": 0.29240545630455017, "learning_rate": 0.00016329418108346105, "loss": 0.3717, "step": 12495 }, { "epoch": 1.5743615580816828, "grad_norm": 0.29088887572288513, "learning_rate": 0.00016318468732486255, "loss": 0.3679, "step": 12500 }, { "epoch": 1.57499134049186, "grad_norm": 0.25105100870132446, "learning_rate": 0.0001630751864857858, "loss": 0.3464, "step": 12505 }, { "epoch": 1.5756211229020374, "grad_norm": 0.26624953746795654, "learning_rate": 0.00016296567862503526, "loss": 0.3552, "step": 12510 }, { "epoch": 1.5762509053122147, "grad_norm": 0.28500837087631226, "learning_rate": 0.00016285616380141914, "loss": 0.3591, "step": 12515 }, { "epoch": 1.576880687722392, "grad_norm": 0.2937677502632141, "learning_rate": 0.00016274664207374936, "loss": 0.3664, "step": 12520 }, { "epoch": 1.577510470132569, "grad_norm": 0.28588148951530457, "learning_rate": 0.00016263711350084165, "loss": 0.3767, "step": 12525 }, { "epoch": 1.5781402525427466, "grad_norm": 0.31547772884368896, "learning_rate": 0.0001625275781415153, "loss": 0.3521, "step": 12530 }, { "epoch": 1.5787700349529237, "grad_norm": 0.29322996735572815, "learning_rate": 0.00016241803605459334, "loss": 0.3777, "step": 12535 }, { "epoch": 1.5793998173631012, "grad_norm": 0.29141756892204285, "learning_rate": 0.00016230848729890238, "loss": 0.3367, "step": 12540 }, { "epoch": 1.5800295997732783, "grad_norm": 0.2583523094654083, "learning_rate": 0.00016219893193327258, "loss": 0.3473, "step": 12545 }, { "epoch": 1.5806593821834556, "grad_norm": 0.26929906010627747, "learning_rate": 0.00016208937001653765, "loss": 0.3622, "step": 12550 }, { "epoch": 1.581289164593633, "grad_norm": 0.2727062702178955, "learning_rate": 0.0001619798016075349, "loss": 0.3607, "step": 12555 }, { "epoch": 1.5819189470038102, "grad_norm": 0.35366252064704895, "learning_rate": 0.000161870226765105, "loss": 0.3453, "step": 12560 }, { "epoch": 1.5825487294139875, "grad_norm": 0.30689889192581177, "learning_rate": 0.00016176064554809225, "loss": 0.3672, "step": 12565 }, { "epoch": 1.5831785118241648, "grad_norm": 0.2855357825756073, "learning_rate": 0.00016165105801534414, "loss": 0.3715, "step": 12570 }, { "epoch": 1.583808294234342, "grad_norm": 0.33706697821617126, "learning_rate": 0.00016154146422571176, "loss": 0.3645, "step": 12575 }, { "epoch": 1.5844380766445192, "grad_norm": 0.24410569667816162, "learning_rate": 0.00016143186423804944, "loss": 0.3576, "step": 12580 }, { "epoch": 1.5850678590546967, "grad_norm": 0.33356451988220215, "learning_rate": 0.00016132225811121492, "loss": 0.3774, "step": 12585 }, { "epoch": 1.5856976414648738, "grad_norm": 0.2804293632507324, "learning_rate": 0.00016121264590406912, "loss": 0.3656, "step": 12590 }, { "epoch": 1.5863274238750513, "grad_norm": 0.29394668340682983, "learning_rate": 0.0001611030276754764, "loss": 0.3468, "step": 12595 }, { "epoch": 1.5869572062852284, "grad_norm": 0.2657965421676636, "learning_rate": 0.0001609934034843042, "loss": 0.3518, "step": 12600 }, { "epoch": 1.5875869886954057, "grad_norm": 0.25583842396736145, "learning_rate": 0.00016088377338942318, "loss": 0.361, "step": 12605 }, { "epoch": 1.588216771105583, "grad_norm": 0.27986687421798706, "learning_rate": 0.00016077413744970722, "loss": 0.3771, "step": 12610 }, { "epoch": 1.5888465535157603, "grad_norm": 0.3200220763683319, "learning_rate": 0.0001606644957240334, "loss": 0.3666, "step": 12615 }, { "epoch": 1.5894763359259376, "grad_norm": 0.29622554779052734, "learning_rate": 0.00016055484827128173, "loss": 0.3469, "step": 12620 }, { "epoch": 1.590106118336115, "grad_norm": 0.3073137700557709, "learning_rate": 0.00016044519515033545, "loss": 0.3382, "step": 12625 }, { "epoch": 1.5907359007462922, "grad_norm": 0.31342241168022156, "learning_rate": 0.00016033553642008077, "loss": 0.357, "step": 12630 }, { "epoch": 1.5913656831564693, "grad_norm": 0.2913351058959961, "learning_rate": 0.00016022587213940698, "loss": 0.3487, "step": 12635 }, { "epoch": 1.5919954655666468, "grad_norm": 0.2823300361633301, "learning_rate": 0.00016011620236720621, "loss": 0.3367, "step": 12640 }, { "epoch": 1.592625247976824, "grad_norm": 0.3134678304195404, "learning_rate": 0.00016000652716237373, "loss": 0.3393, "step": 12645 }, { "epoch": 1.5932550303870014, "grad_norm": 0.3235761821269989, "learning_rate": 0.0001598968465838076, "loss": 0.3752, "step": 12650 }, { "epoch": 1.5938848127971785, "grad_norm": 0.26606664061546326, "learning_rate": 0.00015978716069040875, "loss": 0.3413, "step": 12655 }, { "epoch": 1.5945145952073558, "grad_norm": 0.30575528740882874, "learning_rate": 0.0001596774695410811, "loss": 0.3715, "step": 12660 }, { "epoch": 1.5951443776175331, "grad_norm": 0.3017826974391937, "learning_rate": 0.0001595677731947312, "loss": 0.3586, "step": 12665 }, { "epoch": 1.5957741600277104, "grad_norm": 0.3203217089176178, "learning_rate": 0.00015945807171026855, "loss": 0.3753, "step": 12670 }, { "epoch": 1.5964039424378877, "grad_norm": 0.2615835666656494, "learning_rate": 0.00015934836514660536, "loss": 0.3641, "step": 12675 }, { "epoch": 1.5970337248480648, "grad_norm": 0.2814265191555023, "learning_rate": 0.00015923865356265652, "loss": 0.3467, "step": 12680 }, { "epoch": 1.5976635072582424, "grad_norm": 0.28240394592285156, "learning_rate": 0.00015912893701733975, "loss": 0.3405, "step": 12685 }, { "epoch": 1.5982932896684194, "grad_norm": 0.2675967514514923, "learning_rate": 0.0001590192155695752, "loss": 0.3341, "step": 12690 }, { "epoch": 1.598923072078597, "grad_norm": 0.33408063650131226, "learning_rate": 0.00015890948927828593, "loss": 0.3431, "step": 12695 }, { "epoch": 1.599552854488774, "grad_norm": 0.2793383300304413, "learning_rate": 0.00015879975820239737, "loss": 0.3334, "step": 12700 }, { "epoch": 1.6001826368989516, "grad_norm": 0.29299893975257874, "learning_rate": 0.00015869002240083765, "loss": 0.3479, "step": 12705 }, { "epoch": 1.6008124193091287, "grad_norm": 0.2782432436943054, "learning_rate": 0.0001585802819325374, "loss": 0.3491, "step": 12710 }, { "epoch": 1.601442201719306, "grad_norm": 0.2812289297580719, "learning_rate": 0.00015847053685642977, "loss": 0.3406, "step": 12715 }, { "epoch": 1.6020719841294833, "grad_norm": 0.2464970499277115, "learning_rate": 0.00015836078723145032, "loss": 0.3539, "step": 12720 }, { "epoch": 1.6027017665396606, "grad_norm": 0.3087675869464874, "learning_rate": 0.0001582510331165372, "loss": 0.356, "step": 12725 }, { "epoch": 1.6033315489498379, "grad_norm": 0.2726532816886902, "learning_rate": 0.0001581412745706308, "loss": 0.3443, "step": 12730 }, { "epoch": 1.603961331360015, "grad_norm": 0.28401410579681396, "learning_rate": 0.00015803151165267397, "loss": 0.3359, "step": 12735 }, { "epoch": 1.6045911137701925, "grad_norm": 0.2700473666191101, "learning_rate": 0.00015792174442161194, "loss": 0.3523, "step": 12740 }, { "epoch": 1.6052208961803696, "grad_norm": 0.32183146476745605, "learning_rate": 0.00015781197293639223, "loss": 0.3765, "step": 12745 }, { "epoch": 1.605850678590547, "grad_norm": 0.2805304229259491, "learning_rate": 0.0001577021972559646, "loss": 0.3546, "step": 12750 }, { "epoch": 1.6064804610007242, "grad_norm": 0.30137768387794495, "learning_rate": 0.00015759241743928108, "loss": 0.3721, "step": 12755 }, { "epoch": 1.6071102434109017, "grad_norm": 0.25476494431495667, "learning_rate": 0.00015748263354529597, "loss": 0.3281, "step": 12760 }, { "epoch": 1.6077400258210788, "grad_norm": 0.3311167061328888, "learning_rate": 0.0001573728456329657, "loss": 0.3875, "step": 12765 }, { "epoch": 1.608369808231256, "grad_norm": 0.27148258686065674, "learning_rate": 0.00015726305376124897, "loss": 0.3547, "step": 12770 }, { "epoch": 1.6089995906414334, "grad_norm": 0.25366437435150146, "learning_rate": 0.00015715325798910644, "loss": 0.3423, "step": 12775 }, { "epoch": 1.6096293730516107, "grad_norm": 0.2699160873889923, "learning_rate": 0.000157043458375501, "loss": 0.3347, "step": 12780 }, { "epoch": 1.610259155461788, "grad_norm": 0.2672334611415863, "learning_rate": 0.00015693365497939743, "loss": 0.3354, "step": 12785 }, { "epoch": 1.610888937871965, "grad_norm": 0.3269018828868866, "learning_rate": 0.00015682384785976284, "loss": 0.3427, "step": 12790 }, { "epoch": 1.6115187202821426, "grad_norm": 0.2637929320335388, "learning_rate": 0.00015671403707556605, "loss": 0.3501, "step": 12795 }, { "epoch": 1.6121485026923197, "grad_norm": 0.2606005072593689, "learning_rate": 0.00015660422268577801, "loss": 0.3387, "step": 12800 }, { "epoch": 1.6127782851024972, "grad_norm": 0.30220791697502136, "learning_rate": 0.00015649440474937152, "loss": 0.3489, "step": 12805 }, { "epoch": 1.6134080675126743, "grad_norm": 0.29726284742355347, "learning_rate": 0.0001563845833253213, "loss": 0.3358, "step": 12810 }, { "epoch": 1.6140378499228518, "grad_norm": 0.2928326427936554, "learning_rate": 0.000156274758472604, "loss": 0.3236, "step": 12815 }, { "epoch": 1.614667632333029, "grad_norm": 0.31645599007606506, "learning_rate": 0.0001561649302501981, "loss": 0.3571, "step": 12820 }, { "epoch": 1.6152974147432062, "grad_norm": 0.26705339550971985, "learning_rate": 0.00015605509871708382, "loss": 0.3671, "step": 12825 }, { "epoch": 1.6159271971533835, "grad_norm": 0.2691219449043274, "learning_rate": 0.00015594526393224322, "loss": 0.3452, "step": 12830 }, { "epoch": 1.6165569795635608, "grad_norm": 0.2822478413581848, "learning_rate": 0.00015583542595466005, "loss": 0.3273, "step": 12835 }, { "epoch": 1.6171867619737381, "grad_norm": 0.2974461019039154, "learning_rate": 0.00015572558484331994, "loss": 0.3652, "step": 12840 }, { "epoch": 1.6178165443839152, "grad_norm": 0.2611928880214691, "learning_rate": 0.00015561574065720986, "loss": 0.3445, "step": 12845 }, { "epoch": 1.6184463267940927, "grad_norm": 0.2836850583553314, "learning_rate": 0.00015550589345531885, "loss": 0.3326, "step": 12850 }, { "epoch": 1.6190761092042698, "grad_norm": 0.2735482156276703, "learning_rate": 0.00015539604329663725, "loss": 0.3532, "step": 12855 }, { "epoch": 1.6197058916144473, "grad_norm": 0.34394770860671997, "learning_rate": 0.0001552861902401572, "loss": 0.3532, "step": 12860 }, { "epoch": 1.6203356740246244, "grad_norm": 0.2786300778388977, "learning_rate": 0.0001551763343448722, "loss": 0.3591, "step": 12865 }, { "epoch": 1.6209654564348017, "grad_norm": 0.26574140787124634, "learning_rate": 0.00015506647566977737, "loss": 0.3527, "step": 12870 }, { "epoch": 1.621595238844979, "grad_norm": 0.23986276984214783, "learning_rate": 0.00015495661427386944, "loss": 0.3437, "step": 12875 }, { "epoch": 1.6222250212551563, "grad_norm": 0.29332754015922546, "learning_rate": 0.0001548467502161464, "loss": 0.3323, "step": 12880 }, { "epoch": 1.6228548036653336, "grad_norm": 0.34338971972465515, "learning_rate": 0.0001547368835556078, "loss": 0.3367, "step": 12885 }, { "epoch": 1.623484586075511, "grad_norm": 0.3112575113773346, "learning_rate": 0.00015462701435125451, "loss": 0.3392, "step": 12890 }, { "epoch": 1.6241143684856882, "grad_norm": 0.26299479603767395, "learning_rate": 0.0001545171426620888, "loss": 0.3194, "step": 12895 }, { "epoch": 1.6247441508958653, "grad_norm": 0.27403828501701355, "learning_rate": 0.00015440726854711436, "loss": 0.3344, "step": 12900 }, { "epoch": 1.6253739333060429, "grad_norm": 0.2603330910205841, "learning_rate": 0.000154297392065336, "loss": 0.3564, "step": 12905 }, { "epoch": 1.62600371571622, "grad_norm": 0.2812626361846924, "learning_rate": 0.00015418751327575994, "loss": 0.3583, "step": 12910 }, { "epoch": 1.6266334981263975, "grad_norm": 0.27280038595199585, "learning_rate": 0.0001540776322373936, "loss": 0.3568, "step": 12915 }, { "epoch": 1.6272632805365745, "grad_norm": 0.3073441982269287, "learning_rate": 0.0001539677490092456, "loss": 0.3336, "step": 12920 }, { "epoch": 1.6278930629467518, "grad_norm": 0.2868177890777588, "learning_rate": 0.00015385786365032576, "loss": 0.3455, "step": 12925 }, { "epoch": 1.6285228453569291, "grad_norm": 0.2661624550819397, "learning_rate": 0.000153747976219645, "loss": 0.3377, "step": 12930 }, { "epoch": 1.6291526277671065, "grad_norm": 0.3179323673248291, "learning_rate": 0.0001536380867762154, "loss": 0.3706, "step": 12935 }, { "epoch": 1.6297824101772838, "grad_norm": 0.30941662192344666, "learning_rate": 0.0001535281953790501, "loss": 0.3514, "step": 12940 }, { "epoch": 1.630412192587461, "grad_norm": 0.3018413782119751, "learning_rate": 0.0001534183020871633, "loss": 0.3516, "step": 12945 }, { "epoch": 1.6310419749976384, "grad_norm": 0.34621462225914, "learning_rate": 0.00015330840695957019, "loss": 0.3522, "step": 12950 }, { "epoch": 1.6316717574078154, "grad_norm": 0.2858521342277527, "learning_rate": 0.000153198510055287, "loss": 0.3378, "step": 12955 }, { "epoch": 1.632301539817993, "grad_norm": 0.2880783975124359, "learning_rate": 0.00015308861143333076, "loss": 0.3615, "step": 12960 }, { "epoch": 1.63293132222817, "grad_norm": 0.24324443936347961, "learning_rate": 0.00015297871115271976, "loss": 0.3346, "step": 12965 }, { "epoch": 1.6335611046383476, "grad_norm": 0.26982635259628296, "learning_rate": 0.00015286880927247273, "loss": 0.3423, "step": 12970 }, { "epoch": 1.6341908870485247, "grad_norm": 0.27813199162483215, "learning_rate": 0.00015275890585160961, "loss": 0.3545, "step": 12975 }, { "epoch": 1.634820669458702, "grad_norm": 0.27575090527534485, "learning_rate": 0.00015264900094915106, "loss": 0.3357, "step": 12980 }, { "epoch": 1.6354504518688793, "grad_norm": 0.25838521122932434, "learning_rate": 0.00015253909462411847, "loss": 0.3244, "step": 12985 }, { "epoch": 1.6360802342790566, "grad_norm": 0.2889041602611542, "learning_rate": 0.00015242918693553404, "loss": 0.3297, "step": 12990 }, { "epoch": 1.6367100166892339, "grad_norm": 0.3074316680431366, "learning_rate": 0.0001523192779424208, "loss": 0.3525, "step": 12995 }, { "epoch": 1.6373397990994112, "grad_norm": 0.26425209641456604, "learning_rate": 0.00015220936770380227, "loss": 0.3493, "step": 13000 }, { "epoch": 1.6373397990994112, "eval_loss": 0.30248695611953735, "eval_runtime": 6.1682, "eval_samples_per_second": 162.123, "eval_steps_per_second": 10.214, "step": 13000 }, { "epoch": 1.6379695815095885, "grad_norm": 0.2873767018318176, "learning_rate": 0.00015209945627870283, "loss": 0.3838, "step": 13005 }, { "epoch": 1.6385993639197656, "grad_norm": 0.2895953059196472, "learning_rate": 0.0001519895437261474, "loss": 0.3509, "step": 13010 }, { "epoch": 1.639229146329943, "grad_norm": 0.2910915017127991, "learning_rate": 0.0001518796301051616, "loss": 0.326, "step": 13015 }, { "epoch": 1.6398589287401202, "grad_norm": 0.2735256552696228, "learning_rate": 0.00015176971547477142, "loss": 0.366, "step": 13020 }, { "epoch": 1.6404887111502977, "grad_norm": 0.3099430501461029, "learning_rate": 0.00015165979989400366, "loss": 0.3226, "step": 13025 }, { "epoch": 1.6411184935604748, "grad_norm": 0.2963193655014038, "learning_rate": 0.00015154988342188543, "loss": 0.3301, "step": 13030 }, { "epoch": 1.641748275970652, "grad_norm": 0.28377631306648254, "learning_rate": 0.0001514399661174444, "loss": 0.3143, "step": 13035 }, { "epoch": 1.6423780583808294, "grad_norm": 0.25847306847572327, "learning_rate": 0.00015133004803970866, "loss": 0.325, "step": 13040 }, { "epoch": 1.6430078407910067, "grad_norm": 0.2919864058494568, "learning_rate": 0.00015122012924770675, "loss": 0.3543, "step": 13045 }, { "epoch": 1.643637623201184, "grad_norm": 0.31185677647590637, "learning_rate": 0.00015111020980046756, "loss": 0.3546, "step": 13050 }, { "epoch": 1.6442674056113613, "grad_norm": 0.27933362126350403, "learning_rate": 0.00015100028975702036, "loss": 0.3344, "step": 13055 }, { "epoch": 1.6448971880215386, "grad_norm": 0.2898799777030945, "learning_rate": 0.00015089036917639468, "loss": 0.3473, "step": 13060 }, { "epoch": 1.6455269704317157, "grad_norm": 0.31464672088623047, "learning_rate": 0.00015078044811762047, "loss": 0.3418, "step": 13065 }, { "epoch": 1.6461567528418932, "grad_norm": 0.2676648199558258, "learning_rate": 0.00015067052663972775, "loss": 0.3331, "step": 13070 }, { "epoch": 1.6467865352520703, "grad_norm": 0.30420759320259094, "learning_rate": 0.0001505606048017469, "loss": 0.3544, "step": 13075 }, { "epoch": 1.6474163176622478, "grad_norm": 0.3160271942615509, "learning_rate": 0.00015045068266270848, "loss": 0.3526, "step": 13080 }, { "epoch": 1.648046100072425, "grad_norm": 0.31276562809944153, "learning_rate": 0.0001503407602816432, "loss": 0.3213, "step": 13085 }, { "epoch": 1.6486758824826022, "grad_norm": 0.316756933927536, "learning_rate": 0.00015023083771758183, "loss": 0.3446, "step": 13090 }, { "epoch": 1.6493056648927795, "grad_norm": 0.23935994505882263, "learning_rate": 0.00015012091502955533, "loss": 0.3416, "step": 13095 }, { "epoch": 1.6499354473029568, "grad_norm": 0.2719472348690033, "learning_rate": 0.00015001099227659475, "loss": 0.3567, "step": 13100 }, { "epoch": 1.6505652297131341, "grad_norm": 0.3108009696006775, "learning_rate": 0.00014990106951773098, "loss": 0.3524, "step": 13105 }, { "epoch": 1.6511950121233114, "grad_norm": 0.3002628982067108, "learning_rate": 0.00014979114681199524, "loss": 0.3314, "step": 13110 }, { "epoch": 1.6518247945334887, "grad_norm": 0.32287389039993286, "learning_rate": 0.0001496812242184184, "loss": 0.3376, "step": 13115 }, { "epoch": 1.6524545769436658, "grad_norm": 0.27193522453308105, "learning_rate": 0.0001495713017960314, "loss": 0.3443, "step": 13120 }, { "epoch": 1.6530843593538433, "grad_norm": 0.30429700016975403, "learning_rate": 0.00014946137960386512, "loss": 0.3345, "step": 13125 }, { "epoch": 1.6537141417640204, "grad_norm": 0.2757263481616974, "learning_rate": 0.00014935145770095034, "loss": 0.3405, "step": 13130 }, { "epoch": 1.654343924174198, "grad_norm": 0.274728000164032, "learning_rate": 0.00014924153614631754, "loss": 0.3199, "step": 13135 }, { "epoch": 1.654973706584375, "grad_norm": 0.2992052137851715, "learning_rate": 0.0001491316149989972, "loss": 0.3641, "step": 13140 }, { "epoch": 1.6556034889945523, "grad_norm": 0.28687140345573425, "learning_rate": 0.00014902169431801947, "loss": 0.3586, "step": 13145 }, { "epoch": 1.6562332714047296, "grad_norm": 0.31748563051223755, "learning_rate": 0.00014891177416241416, "loss": 0.3318, "step": 13150 }, { "epoch": 1.656863053814907, "grad_norm": 0.2876995801925659, "learning_rate": 0.00014880185459121103, "loss": 0.3446, "step": 13155 }, { "epoch": 1.6574928362250843, "grad_norm": 0.2874261736869812, "learning_rate": 0.00014869193566343934, "loss": 0.3058, "step": 13160 }, { "epoch": 1.6581226186352616, "grad_norm": 0.2720824182033539, "learning_rate": 0.00014858201743812806, "loss": 0.3332, "step": 13165 }, { "epoch": 1.6587524010454389, "grad_norm": 0.27765411138534546, "learning_rate": 0.00014847209997430582, "loss": 0.3428, "step": 13170 }, { "epoch": 1.659382183455616, "grad_norm": 0.28871631622314453, "learning_rate": 0.0001483621833310008, "loss": 0.3325, "step": 13175 }, { "epoch": 1.6600119658657935, "grad_norm": 0.2875865697860718, "learning_rate": 0.00014825226756724077, "loss": 0.3527, "step": 13180 }, { "epoch": 1.6606417482759706, "grad_norm": 0.2774711549282074, "learning_rate": 0.00014814235274205297, "loss": 0.335, "step": 13185 }, { "epoch": 1.661271530686148, "grad_norm": 0.2727283537387848, "learning_rate": 0.00014803243891446416, "loss": 0.3393, "step": 13190 }, { "epoch": 1.6619013130963252, "grad_norm": 0.27977532148361206, "learning_rate": 0.00014792252614350055, "loss": 0.3566, "step": 13195 }, { "epoch": 1.6625310955065025, "grad_norm": 0.29823413491249084, "learning_rate": 0.0001478126144881879, "loss": 0.3287, "step": 13200 }, { "epoch": 1.6631608779166798, "grad_norm": 0.2849923372268677, "learning_rate": 0.00014770270400755125, "loss": 0.3166, "step": 13205 }, { "epoch": 1.663790660326857, "grad_norm": 0.259219229221344, "learning_rate": 0.00014759279476061503, "loss": 0.336, "step": 13210 }, { "epoch": 1.6644204427370344, "grad_norm": 0.2877882719039917, "learning_rate": 0.00014748288680640302, "loss": 0.3506, "step": 13215 }, { "epoch": 1.6650502251472117, "grad_norm": 0.2952651381492615, "learning_rate": 0.00014737298020393828, "loss": 0.3562, "step": 13220 }, { "epoch": 1.665680007557389, "grad_norm": 0.25878390669822693, "learning_rate": 0.00014726307501224312, "loss": 0.3289, "step": 13225 }, { "epoch": 1.666309789967566, "grad_norm": 0.29914605617523193, "learning_rate": 0.00014715317129033924, "loss": 0.3321, "step": 13230 }, { "epoch": 1.6669395723777436, "grad_norm": 0.27533242106437683, "learning_rate": 0.00014704326909724738, "loss": 0.3234, "step": 13235 }, { "epoch": 1.6675693547879207, "grad_norm": 0.2584016025066376, "learning_rate": 0.0001469333684919876, "loss": 0.3181, "step": 13240 }, { "epoch": 1.6681991371980982, "grad_norm": 0.262953519821167, "learning_rate": 0.00014682346953357898, "loss": 0.3127, "step": 13245 }, { "epoch": 1.6688289196082753, "grad_norm": 0.3399054706096649, "learning_rate": 0.00014671357228103978, "loss": 0.3529, "step": 13250 }, { "epoch": 1.6694587020184526, "grad_norm": 0.26437637209892273, "learning_rate": 0.00014660367679338732, "loss": 0.318, "step": 13255 }, { "epoch": 1.67008848442863, "grad_norm": 0.28796815872192383, "learning_rate": 0.000146493783129638, "loss": 0.3226, "step": 13260 }, { "epoch": 1.6707182668388072, "grad_norm": 0.3208424150943756, "learning_rate": 0.00014638389134880722, "loss": 0.3661, "step": 13265 }, { "epoch": 1.6713480492489845, "grad_norm": 0.2934640347957611, "learning_rate": 0.00014627400150990941, "loss": 0.3414, "step": 13270 }, { "epoch": 1.6719778316591618, "grad_norm": 0.28860223293304443, "learning_rate": 0.0001461641136719579, "loss": 0.3386, "step": 13275 }, { "epoch": 1.672607614069339, "grad_norm": 0.2960747182369232, "learning_rate": 0.00014605422789396494, "loss": 0.3466, "step": 13280 }, { "epoch": 1.6732373964795162, "grad_norm": 0.25040510296821594, "learning_rate": 0.00014594434423494178, "loss": 0.3366, "step": 13285 }, { "epoch": 1.6738671788896937, "grad_norm": 0.2958894371986389, "learning_rate": 0.0001458344627538984, "loss": 0.3614, "step": 13290 }, { "epoch": 1.6744969612998708, "grad_norm": 0.26937004923820496, "learning_rate": 0.00014572458350984362, "loss": 0.3499, "step": 13295 }, { "epoch": 1.6751267437100483, "grad_norm": 0.267607182264328, "learning_rate": 0.00014561470656178517, "loss": 0.3268, "step": 13300 }, { "epoch": 1.6757565261202254, "grad_norm": 0.30197760462760925, "learning_rate": 0.0001455048319687295, "loss": 0.3212, "step": 13305 }, { "epoch": 1.6763863085304027, "grad_norm": 0.29999008774757385, "learning_rate": 0.0001453949597896817, "loss": 0.3492, "step": 13310 }, { "epoch": 1.67701609094058, "grad_norm": 0.30626264214515686, "learning_rate": 0.00014528509008364572, "loss": 0.3541, "step": 13315 }, { "epoch": 1.6776458733507573, "grad_norm": 0.2915571630001068, "learning_rate": 0.0001451752229096241, "loss": 0.3231, "step": 13320 }, { "epoch": 1.6782756557609346, "grad_norm": 0.2660951018333435, "learning_rate": 0.0001450653583266179, "loss": 0.321, "step": 13325 }, { "epoch": 1.678905438171112, "grad_norm": 0.2831597924232483, "learning_rate": 0.00014495549639362707, "loss": 0.3243, "step": 13330 }, { "epoch": 1.6795352205812892, "grad_norm": 0.2856467664241791, "learning_rate": 0.0001448456371696499, "loss": 0.3134, "step": 13335 }, { "epoch": 1.6801650029914663, "grad_norm": 0.31137335300445557, "learning_rate": 0.00014473578071368324, "loss": 0.3266, "step": 13340 }, { "epoch": 1.6807947854016438, "grad_norm": 0.3102738857269287, "learning_rate": 0.0001446259270847226, "loss": 0.3368, "step": 13345 }, { "epoch": 1.681424567811821, "grad_norm": 0.2788311839103699, "learning_rate": 0.00014451607634176196, "loss": 0.345, "step": 13350 }, { "epoch": 1.6820543502219985, "grad_norm": 0.26762083172798157, "learning_rate": 0.0001444062285437935, "loss": 0.3112, "step": 13355 }, { "epoch": 1.6826841326321755, "grad_norm": 0.30155837535858154, "learning_rate": 0.00014429638374980814, "loss": 0.3353, "step": 13360 }, { "epoch": 1.6833139150423528, "grad_norm": 0.3196204602718353, "learning_rate": 0.00014418654201879498, "loss": 0.3738, "step": 13365 }, { "epoch": 1.6839436974525301, "grad_norm": 0.29560673236846924, "learning_rate": 0.0001440767034097415, "loss": 0.3458, "step": 13370 }, { "epoch": 1.6845734798627074, "grad_norm": 0.30189448595046997, "learning_rate": 0.00014396686798163365, "loss": 0.3577, "step": 13375 }, { "epoch": 1.6852032622728847, "grad_norm": 0.29545098543167114, "learning_rate": 0.00014385703579345544, "loss": 0.3299, "step": 13380 }, { "epoch": 1.685833044683062, "grad_norm": 0.3403629660606384, "learning_rate": 0.00014374720690418942, "loss": 0.3349, "step": 13385 }, { "epoch": 1.6864628270932394, "grad_norm": 0.2561693489551544, "learning_rate": 0.0001436373813728161, "loss": 0.321, "step": 13390 }, { "epoch": 1.6870926095034164, "grad_norm": 0.2968713641166687, "learning_rate": 0.00014352755925831428, "loss": 0.3314, "step": 13395 }, { "epoch": 1.687722391913594, "grad_norm": 0.25213027000427246, "learning_rate": 0.00014341774061966096, "loss": 0.3245, "step": 13400 }, { "epoch": 1.688352174323771, "grad_norm": 0.26504096388816833, "learning_rate": 0.00014330792551583133, "loss": 0.324, "step": 13405 }, { "epoch": 1.6889819567339486, "grad_norm": 0.31459683179855347, "learning_rate": 0.00014319811400579854, "loss": 0.33, "step": 13410 }, { "epoch": 1.6896117391441257, "grad_norm": 0.31566324830055237, "learning_rate": 0.00014308830614853392, "loss": 0.3097, "step": 13415 }, { "epoch": 1.690241521554303, "grad_norm": 0.3083827793598175, "learning_rate": 0.00014297850200300683, "loss": 0.3345, "step": 13420 }, { "epoch": 1.6908713039644803, "grad_norm": 0.29203763604164124, "learning_rate": 0.0001428687016281845, "loss": 0.3459, "step": 13425 }, { "epoch": 1.6915010863746576, "grad_norm": 0.28596800565719604, "learning_rate": 0.00014275890508303225, "loss": 0.3188, "step": 13430 }, { "epoch": 1.6921308687848349, "grad_norm": 0.3753102421760559, "learning_rate": 0.00014264911242651342, "loss": 0.3457, "step": 13435 }, { "epoch": 1.6927606511950122, "grad_norm": 0.28502312302589417, "learning_rate": 0.0001425393237175891, "loss": 0.3295, "step": 13440 }, { "epoch": 1.6933904336051895, "grad_norm": 0.3175462782382965, "learning_rate": 0.00014242953901521838, "loss": 0.3094, "step": 13445 }, { "epoch": 1.6940202160153666, "grad_norm": 0.25370490550994873, "learning_rate": 0.00014231975837835815, "loss": 0.3446, "step": 13450 }, { "epoch": 1.694649998425544, "grad_norm": 0.2589857876300812, "learning_rate": 0.00014220998186596315, "loss": 0.3258, "step": 13455 }, { "epoch": 1.6952797808357212, "grad_norm": 0.31022030115127563, "learning_rate": 0.00014210020953698573, "loss": 0.344, "step": 13460 }, { "epoch": 1.6959095632458987, "grad_norm": 0.3099876046180725, "learning_rate": 0.0001419904414503763, "loss": 0.3425, "step": 13465 }, { "epoch": 1.6965393456560758, "grad_norm": 0.27715328335762024, "learning_rate": 0.00014188067766508273, "loss": 0.3309, "step": 13470 }, { "epoch": 1.697169128066253, "grad_norm": 0.2700579762458801, "learning_rate": 0.00014177091824005075, "loss": 0.3191, "step": 13475 }, { "epoch": 1.6977989104764304, "grad_norm": 0.2773703336715698, "learning_rate": 0.00014166116323422365, "loss": 0.3321, "step": 13480 }, { "epoch": 1.6984286928866077, "grad_norm": 0.2699192464351654, "learning_rate": 0.00014155141270654232, "loss": 0.3318, "step": 13485 }, { "epoch": 1.699058475296785, "grad_norm": 0.26127228140830994, "learning_rate": 0.00014144166671594544, "loss": 0.2982, "step": 13490 }, { "epoch": 1.699688257706962, "grad_norm": 0.37218350172042847, "learning_rate": 0.000141331925321369, "loss": 0.3335, "step": 13495 }, { "epoch": 1.7003180401171396, "grad_norm": 0.26352524757385254, "learning_rate": 0.0001412221885817466, "loss": 0.3246, "step": 13500 }, { "epoch": 1.7009478225273167, "grad_norm": 0.27649009227752686, "learning_rate": 0.00014111245655600948, "loss": 0.3117, "step": 13505 }, { "epoch": 1.7015776049374942, "grad_norm": 0.26316478848457336, "learning_rate": 0.00014100272930308623, "loss": 0.3268, "step": 13510 }, { "epoch": 1.7022073873476713, "grad_norm": 0.26319512724876404, "learning_rate": 0.0001408930068819028, "loss": 0.3083, "step": 13515 }, { "epoch": 1.7028371697578488, "grad_norm": 0.26792389154434204, "learning_rate": 0.00014078328935138276, "loss": 0.3317, "step": 13520 }, { "epoch": 1.703466952168026, "grad_norm": 0.2627207338809967, "learning_rate": 0.0001406735767704469, "loss": 0.3225, "step": 13525 }, { "epoch": 1.7040967345782032, "grad_norm": 0.30815207958221436, "learning_rate": 0.00014056386919801325, "loss": 0.3201, "step": 13530 }, { "epoch": 1.7047265169883805, "grad_norm": 0.296520471572876, "learning_rate": 0.00014045416669299747, "loss": 0.3189, "step": 13535 }, { "epoch": 1.7053562993985578, "grad_norm": 0.2739796042442322, "learning_rate": 0.0001403444693143122, "loss": 0.3023, "step": 13540 }, { "epoch": 1.7059860818087351, "grad_norm": 0.311927855014801, "learning_rate": 0.00014023477712086743, "loss": 0.3311, "step": 13545 }, { "epoch": 1.7066158642189122, "grad_norm": 0.2842674255371094, "learning_rate": 0.0001401250901715704, "loss": 0.3376, "step": 13550 }, { "epoch": 1.7072456466290897, "grad_norm": 0.30459704995155334, "learning_rate": 0.00014001540852532553, "loss": 0.3276, "step": 13555 }, { "epoch": 1.7078754290392668, "grad_norm": 0.26651817560195923, "learning_rate": 0.00013990573224103442, "loss": 0.3309, "step": 13560 }, { "epoch": 1.7085052114494443, "grad_norm": 0.32419687509536743, "learning_rate": 0.00013979606137759563, "loss": 0.314, "step": 13565 }, { "epoch": 1.7091349938596214, "grad_norm": 0.2715966999530792, "learning_rate": 0.000139686395993905, "loss": 0.3293, "step": 13570 }, { "epoch": 1.709764776269799, "grad_norm": 0.29049497842788696, "learning_rate": 0.0001395767361488552, "loss": 0.3159, "step": 13575 }, { "epoch": 1.710394558679976, "grad_norm": 0.3235701024532318, "learning_rate": 0.00013946708190133627, "loss": 0.3422, "step": 13580 }, { "epoch": 1.7110243410901533, "grad_norm": 0.2732395529747009, "learning_rate": 0.00013935743331023492, "loss": 0.317, "step": 13585 }, { "epoch": 1.7116541235003306, "grad_norm": 0.2833672761917114, "learning_rate": 0.000139247790434435, "loss": 0.3619, "step": 13590 }, { "epoch": 1.712283905910508, "grad_norm": 0.2510261535644531, "learning_rate": 0.00013913815333281728, "loss": 0.3215, "step": 13595 }, { "epoch": 1.7129136883206852, "grad_norm": 0.29638463258743286, "learning_rate": 0.00013902852206425925, "loss": 0.3341, "step": 13600 }, { "epoch": 1.7135434707308623, "grad_norm": 0.26883918046951294, "learning_rate": 0.0001389188966876355, "loss": 0.3198, "step": 13605 }, { "epoch": 1.7141732531410399, "grad_norm": 0.280301958322525, "learning_rate": 0.00013880927726181737, "loss": 0.3232, "step": 13610 }, { "epoch": 1.714803035551217, "grad_norm": 0.25223594903945923, "learning_rate": 0.00013869966384567293, "loss": 0.3362, "step": 13615 }, { "epoch": 1.7154328179613945, "grad_norm": 0.29902294278144836, "learning_rate": 0.00013859005649806717, "loss": 0.3169, "step": 13620 }, { "epoch": 1.7160626003715715, "grad_norm": 0.3142664134502411, "learning_rate": 0.00013848045527786168, "loss": 0.3149, "step": 13625 }, { "epoch": 1.716692382781749, "grad_norm": 0.312800794839859, "learning_rate": 0.0001383708602439149, "loss": 0.3327, "step": 13630 }, { "epoch": 1.7173221651919262, "grad_norm": 0.3177478015422821, "learning_rate": 0.00013826127145508176, "loss": 0.3215, "step": 13635 }, { "epoch": 1.7179519476021035, "grad_norm": 0.2900395691394806, "learning_rate": 0.00013815168897021398, "loss": 0.3169, "step": 13640 }, { "epoch": 1.7185817300122808, "grad_norm": 0.2877413332462311, "learning_rate": 0.00013804211284815986, "loss": 0.3247, "step": 13645 }, { "epoch": 1.719211512422458, "grad_norm": 0.25947847962379456, "learning_rate": 0.00013793254314776432, "loss": 0.3091, "step": 13650 }, { "epoch": 1.7198412948326354, "grad_norm": 0.270942747592926, "learning_rate": 0.00013782297992786873, "loss": 0.3318, "step": 13655 }, { "epoch": 1.7204710772428125, "grad_norm": 0.2605541944503784, "learning_rate": 0.00013771342324731106, "loss": 0.3247, "step": 13660 }, { "epoch": 1.72110085965299, "grad_norm": 0.25236964225769043, "learning_rate": 0.00013760387316492584, "loss": 0.3111, "step": 13665 }, { "epoch": 1.721730642063167, "grad_norm": 0.2639407217502594, "learning_rate": 0.00013749432973954385, "loss": 0.305, "step": 13670 }, { "epoch": 1.7223604244733446, "grad_norm": 0.3111459016799927, "learning_rate": 0.0001373847930299924, "loss": 0.3367, "step": 13675 }, { "epoch": 1.7229902068835217, "grad_norm": 0.31038767099380493, "learning_rate": 0.00013727526309509531, "loss": 0.3223, "step": 13680 }, { "epoch": 1.723619989293699, "grad_norm": 0.2571181058883667, "learning_rate": 0.00013716573999367259, "loss": 0.3057, "step": 13685 }, { "epoch": 1.7242497717038763, "grad_norm": 0.24940542876720428, "learning_rate": 0.0001370562237845406, "loss": 0.319, "step": 13690 }, { "epoch": 1.7248795541140536, "grad_norm": 0.2301412671804428, "learning_rate": 0.00013694671452651216, "loss": 0.3099, "step": 13695 }, { "epoch": 1.7255093365242309, "grad_norm": 0.27043718099594116, "learning_rate": 0.00013683721227839623, "loss": 0.3345, "step": 13700 }, { "epoch": 1.7261391189344082, "grad_norm": 0.26595422625541687, "learning_rate": 0.00013672771709899792, "loss": 0.3162, "step": 13705 }, { "epoch": 1.7267689013445855, "grad_norm": 0.26224714517593384, "learning_rate": 0.0001366182290471187, "loss": 0.322, "step": 13710 }, { "epoch": 1.7273986837547626, "grad_norm": 0.26390886306762695, "learning_rate": 0.00013650874818155618, "loss": 0.2964, "step": 13715 }, { "epoch": 1.72802846616494, "grad_norm": 0.3042176365852356, "learning_rate": 0.00013639927456110402, "loss": 0.3128, "step": 13720 }, { "epoch": 1.7286582485751172, "grad_norm": 0.269771009683609, "learning_rate": 0.00013628980824455212, "loss": 0.2963, "step": 13725 }, { "epoch": 1.7292880309852947, "grad_norm": 0.3462948203086853, "learning_rate": 0.00013618034929068634, "loss": 0.3445, "step": 13730 }, { "epoch": 1.7299178133954718, "grad_norm": 0.270379900932312, "learning_rate": 0.0001360708977582887, "loss": 0.3174, "step": 13735 }, { "epoch": 1.730547595805649, "grad_norm": 0.23746255040168762, "learning_rate": 0.00013596145370613715, "loss": 0.3006, "step": 13740 }, { "epoch": 1.7311773782158264, "grad_norm": 0.30519574880599976, "learning_rate": 0.00013585201719300562, "loss": 0.3272, "step": 13745 }, { "epoch": 1.7318071606260037, "grad_norm": 0.3508155941963196, "learning_rate": 0.000135742588277664, "loss": 0.3385, "step": 13750 }, { "epoch": 1.732436943036181, "grad_norm": 0.2649688720703125, "learning_rate": 0.00013563316701887816, "loss": 0.3191, "step": 13755 }, { "epoch": 1.7330667254463583, "grad_norm": 0.25044509768486023, "learning_rate": 0.0001355237534754098, "loss": 0.3114, "step": 13760 }, { "epoch": 1.7336965078565356, "grad_norm": 0.27739325165748596, "learning_rate": 0.00013541434770601653, "loss": 0.3555, "step": 13765 }, { "epoch": 1.7343262902667127, "grad_norm": 0.27952834963798523, "learning_rate": 0.00013530494976945172, "loss": 0.3287, "step": 13770 }, { "epoch": 1.7349560726768902, "grad_norm": 0.29794949293136597, "learning_rate": 0.00013519555972446454, "loss": 0.3248, "step": 13775 }, { "epoch": 1.7355858550870673, "grad_norm": 0.3177776634693146, "learning_rate": 0.00013508617762979992, "loss": 0.3311, "step": 13780 }, { "epoch": 1.7362156374972448, "grad_norm": 0.29036352038383484, "learning_rate": 0.0001349768035441986, "loss": 0.3021, "step": 13785 }, { "epoch": 1.736845419907422, "grad_norm": 0.2803820073604584, "learning_rate": 0.00013486743752639694, "loss": 0.3021, "step": 13790 }, { "epoch": 1.7374752023175992, "grad_norm": 0.25854361057281494, "learning_rate": 0.000134758079635127, "loss": 0.3215, "step": 13795 }, { "epoch": 1.7381049847277765, "grad_norm": 0.2606901228427887, "learning_rate": 0.0001346487299291165, "loss": 0.3093, "step": 13800 }, { "epoch": 1.7387347671379538, "grad_norm": 0.25198522210121155, "learning_rate": 0.00013453938846708864, "loss": 0.2954, "step": 13805 }, { "epoch": 1.7393645495481311, "grad_norm": 0.27399036288261414, "learning_rate": 0.00013443005530776233, "loss": 0.3212, "step": 13810 }, { "epoch": 1.7399943319583084, "grad_norm": 0.2777753174304962, "learning_rate": 0.000134320730509852, "loss": 0.32, "step": 13815 }, { "epoch": 1.7406241143684857, "grad_norm": 0.28130999207496643, "learning_rate": 0.0001342114141320675, "loss": 0.305, "step": 13820 }, { "epoch": 1.7412538967786628, "grad_norm": 0.28102371096611023, "learning_rate": 0.00013410210623311428, "loss": 0.3066, "step": 13825 }, { "epoch": 1.7418836791888404, "grad_norm": 0.21866032481193542, "learning_rate": 0.00013399280687169312, "loss": 0.3181, "step": 13830 }, { "epoch": 1.7425134615990174, "grad_norm": 0.27159667015075684, "learning_rate": 0.00013388351610650045, "loss": 0.2983, "step": 13835 }, { "epoch": 1.743143244009195, "grad_norm": 0.26473724842071533, "learning_rate": 0.00013377423399622764, "loss": 0.3041, "step": 13840 }, { "epoch": 1.743773026419372, "grad_norm": 0.30044063925743103, "learning_rate": 0.00013366496059956184, "loss": 0.3391, "step": 13845 }, { "epoch": 1.7444028088295493, "grad_norm": 0.3015748858451843, "learning_rate": 0.00013355569597518532, "loss": 0.3033, "step": 13850 }, { "epoch": 1.7450325912397266, "grad_norm": 0.27009138464927673, "learning_rate": 0.00013344644018177572, "loss": 0.2973, "step": 13855 }, { "epoch": 1.745662373649904, "grad_norm": 0.28925400972366333, "learning_rate": 0.00013333719327800585, "loss": 0.3137, "step": 13860 }, { "epoch": 1.7462921560600813, "grad_norm": 0.27679139375686646, "learning_rate": 0.00013322795532254379, "loss": 0.3119, "step": 13865 }, { "epoch": 1.7469219384702586, "grad_norm": 0.283965140581131, "learning_rate": 0.0001331187263740529, "loss": 0.3151, "step": 13870 }, { "epoch": 1.7475517208804359, "grad_norm": 0.24927465617656708, "learning_rate": 0.0001330095064911915, "loss": 0.2968, "step": 13875 }, { "epoch": 1.748181503290613, "grad_norm": 0.2976732850074768, "learning_rate": 0.0001329002957326132, "loss": 0.3257, "step": 13880 }, { "epoch": 1.7488112857007905, "grad_norm": 0.27860409021377563, "learning_rate": 0.00013279109415696672, "loss": 0.2988, "step": 13885 }, { "epoch": 1.7494410681109676, "grad_norm": 0.28782716393470764, "learning_rate": 0.0001326819018228958, "loss": 0.3098, "step": 13890 }, { "epoch": 1.750070850521145, "grad_norm": 0.24729984998703003, "learning_rate": 0.0001325727187890391, "loss": 0.3123, "step": 13895 }, { "epoch": 1.7507006329313222, "grad_norm": 0.23218853771686554, "learning_rate": 0.00013246354511403058, "loss": 0.3025, "step": 13900 }, { "epoch": 1.7513304153414995, "grad_norm": 0.2634672522544861, "learning_rate": 0.00013235438085649893, "loss": 0.3123, "step": 13905 }, { "epoch": 1.7519601977516768, "grad_norm": 0.3087509572505951, "learning_rate": 0.00013224522607506776, "loss": 0.3515, "step": 13910 }, { "epoch": 1.752589980161854, "grad_norm": 0.28160160779953003, "learning_rate": 0.00013213608082835576, "loss": 0.3141, "step": 13915 }, { "epoch": 1.7532197625720314, "grad_norm": 0.2643168866634369, "learning_rate": 0.0001320269451749764, "loss": 0.297, "step": 13920 }, { "epoch": 1.7538495449822087, "grad_norm": 0.34547582268714905, "learning_rate": 0.00013191781917353803, "loss": 0.3194, "step": 13925 }, { "epoch": 1.754479327392386, "grad_norm": 0.29079994559288025, "learning_rate": 0.00013180870288264385, "loss": 0.3334, "step": 13930 }, { "epoch": 1.755109109802563, "grad_norm": 0.2323244959115982, "learning_rate": 0.00013169959636089167, "loss": 0.3106, "step": 13935 }, { "epoch": 1.7557388922127406, "grad_norm": 0.29080161452293396, "learning_rate": 0.00013159049966687437, "loss": 0.2978, "step": 13940 }, { "epoch": 1.7563686746229177, "grad_norm": 0.2688988149166107, "learning_rate": 0.00013148141285917924, "loss": 0.3184, "step": 13945 }, { "epoch": 1.7569984570330952, "grad_norm": 0.25353583693504333, "learning_rate": 0.0001313723359963884, "loss": 0.2956, "step": 13950 }, { "epoch": 1.7576282394432723, "grad_norm": 0.32606688141822815, "learning_rate": 0.0001312632691370786, "loss": 0.3136, "step": 13955 }, { "epoch": 1.7582580218534496, "grad_norm": 0.24126961827278137, "learning_rate": 0.0001311542123398213, "loss": 0.304, "step": 13960 }, { "epoch": 1.758887804263627, "grad_norm": 0.2840232253074646, "learning_rate": 0.0001310451656631824, "loss": 0.3126, "step": 13965 }, { "epoch": 1.7595175866738042, "grad_norm": 0.30879929661750793, "learning_rate": 0.0001309361291657226, "loss": 0.3115, "step": 13970 }, { "epoch": 1.7601473690839815, "grad_norm": 0.29478558897972107, "learning_rate": 0.0001308271029059969, "loss": 0.3035, "step": 13975 }, { "epoch": 1.7607771514941588, "grad_norm": 0.29496970772743225, "learning_rate": 0.00013071808694255484, "loss": 0.3417, "step": 13980 }, { "epoch": 1.7614069339043361, "grad_norm": 0.27189967036247253, "learning_rate": 0.00013060908133394054, "loss": 0.3146, "step": 13985 }, { "epoch": 1.7620367163145132, "grad_norm": 0.2737963795661926, "learning_rate": 0.00013050008613869256, "loss": 0.3223, "step": 13990 }, { "epoch": 1.7626664987246907, "grad_norm": 0.2881993055343628, "learning_rate": 0.00013039110141534367, "loss": 0.3039, "step": 13995 }, { "epoch": 1.7632962811348678, "grad_norm": 0.29045918583869934, "learning_rate": 0.00013028212722242127, "loss": 0.3193, "step": 14000 }, { "epoch": 1.7632962811348678, "eval_loss": 0.3040441870689392, "eval_runtime": 6.1585, "eval_samples_per_second": 162.378, "eval_steps_per_second": 10.23, "step": 14000 }, { "epoch": 1.7639260635450453, "grad_norm": 0.24037687480449677, "learning_rate": 0.00013017316361844692, "loss": 0.2918, "step": 14005 }, { "epoch": 1.7645558459552224, "grad_norm": 0.25562503933906555, "learning_rate": 0.0001300642106619367, "loss": 0.2967, "step": 14010 }, { "epoch": 1.7651856283653997, "grad_norm": 0.3410753905773163, "learning_rate": 0.00012995526841140068, "loss": 0.3158, "step": 14015 }, { "epoch": 1.765815410775577, "grad_norm": 0.2569274306297302, "learning_rate": 0.00012984633692534337, "loss": 0.306, "step": 14020 }, { "epoch": 1.7664451931857543, "grad_norm": 0.26620200276374817, "learning_rate": 0.00012973741626226348, "loss": 0.3122, "step": 14025 }, { "epoch": 1.7670749755959316, "grad_norm": 0.2842133045196533, "learning_rate": 0.00012962850648065393, "loss": 0.3253, "step": 14030 }, { "epoch": 1.767704758006109, "grad_norm": 0.27718397974967957, "learning_rate": 0.00012951960763900173, "loss": 0.3187, "step": 14035 }, { "epoch": 1.7683345404162862, "grad_norm": 0.27699559926986694, "learning_rate": 0.00012941071979578805, "loss": 0.33, "step": 14040 }, { "epoch": 1.7689643228264633, "grad_norm": 0.21499434113502502, "learning_rate": 0.00012930184300948819, "loss": 0.2765, "step": 14045 }, { "epoch": 1.7695941052366408, "grad_norm": 0.29474014043807983, "learning_rate": 0.00012919297733857138, "loss": 0.32, "step": 14050 }, { "epoch": 1.770223887646818, "grad_norm": 0.3570992052555084, "learning_rate": 0.00012908412284150104, "loss": 0.3088, "step": 14055 }, { "epoch": 1.7708536700569955, "grad_norm": 0.2408706545829773, "learning_rate": 0.00012897527957673446, "loss": 0.2991, "step": 14060 }, { "epoch": 1.7714834524671725, "grad_norm": 0.23086212575435638, "learning_rate": 0.00012886644760272306, "loss": 0.2959, "step": 14065 }, { "epoch": 1.7721132348773498, "grad_norm": 0.25117409229278564, "learning_rate": 0.00012875762697791199, "loss": 0.2933, "step": 14070 }, { "epoch": 1.7727430172875271, "grad_norm": 0.28731420636177063, "learning_rate": 0.0001286488177607405, "loss": 0.3234, "step": 14075 }, { "epoch": 1.7733727996977044, "grad_norm": 0.23875364661216736, "learning_rate": 0.0001285400200096416, "loss": 0.2952, "step": 14080 }, { "epoch": 1.7740025821078818, "grad_norm": 0.2722354829311371, "learning_rate": 0.0001284312337830421, "loss": 0.2997, "step": 14085 }, { "epoch": 1.774632364518059, "grad_norm": 0.27776023745536804, "learning_rate": 0.00012832245913936278, "loss": 0.3256, "step": 14090 }, { "epoch": 1.7752621469282364, "grad_norm": 0.26422828435897827, "learning_rate": 0.00012821369613701808, "loss": 0.2983, "step": 14095 }, { "epoch": 1.7758919293384134, "grad_norm": 0.23418962955474854, "learning_rate": 0.00012810494483441614, "loss": 0.3024, "step": 14100 }, { "epoch": 1.776521711748591, "grad_norm": 0.300912082195282, "learning_rate": 0.000127996205289959, "loss": 0.3001, "step": 14105 }, { "epoch": 1.777151494158768, "grad_norm": 0.2872162461280823, "learning_rate": 0.00012788747756204222, "loss": 0.3074, "step": 14110 }, { "epoch": 1.7777812765689456, "grad_norm": 0.2784421145915985, "learning_rate": 0.00012777876170905515, "loss": 0.2978, "step": 14115 }, { "epoch": 1.7784110589791227, "grad_norm": 0.28062257170677185, "learning_rate": 0.00012767005778938062, "loss": 0.2993, "step": 14120 }, { "epoch": 1.7790408413893, "grad_norm": 0.3496231734752655, "learning_rate": 0.0001275613658613951, "loss": 0.3147, "step": 14125 }, { "epoch": 1.7796706237994773, "grad_norm": 0.2595261037349701, "learning_rate": 0.00012745268598346864, "loss": 0.2943, "step": 14130 }, { "epoch": 1.7803004062096546, "grad_norm": 0.2795499563217163, "learning_rate": 0.00012734401821396486, "loss": 0.3123, "step": 14135 }, { "epoch": 1.7809301886198319, "grad_norm": 0.2615763247013092, "learning_rate": 0.0001272353626112408, "loss": 0.3059, "step": 14140 }, { "epoch": 1.7815599710300092, "grad_norm": 0.2783886790275574, "learning_rate": 0.00012712671923364706, "loss": 0.3134, "step": 14145 }, { "epoch": 1.7821897534401865, "grad_norm": 0.2884584367275238, "learning_rate": 0.0001270180881395276, "loss": 0.3151, "step": 14150 }, { "epoch": 1.7828195358503636, "grad_norm": 0.2677745521068573, "learning_rate": 0.0001269094693872197, "loss": 0.3146, "step": 14155 }, { "epoch": 1.783449318260541, "grad_norm": 0.25956082344055176, "learning_rate": 0.0001268008630350542, "loss": 0.3118, "step": 14160 }, { "epoch": 1.7840791006707182, "grad_norm": 0.2646723985671997, "learning_rate": 0.0001266922691413552, "loss": 0.2861, "step": 14165 }, { "epoch": 1.7847088830808957, "grad_norm": 0.29946067929267883, "learning_rate": 0.00012658368776444004, "loss": 0.3349, "step": 14170 }, { "epoch": 1.7853386654910728, "grad_norm": 0.24171167612075806, "learning_rate": 0.00012647511896261943, "loss": 0.2805, "step": 14175 }, { "epoch": 1.78596844790125, "grad_norm": 0.26428696513175964, "learning_rate": 0.0001263665627941973, "loss": 0.3231, "step": 14180 }, { "epoch": 1.7865982303114274, "grad_norm": 0.2787708044052124, "learning_rate": 0.0001262580193174709, "loss": 0.2961, "step": 14185 }, { "epoch": 1.7872280127216047, "grad_norm": 0.2826111614704132, "learning_rate": 0.00012614948859073036, "loss": 0.3343, "step": 14190 }, { "epoch": 1.787857795131782, "grad_norm": 0.278361052274704, "learning_rate": 0.00012604097067225927, "loss": 0.2919, "step": 14195 }, { "epoch": 1.7884875775419593, "grad_norm": 0.24778404831886292, "learning_rate": 0.00012593246562033419, "loss": 0.316, "step": 14200 }, { "epoch": 1.7891173599521366, "grad_norm": 0.28171002864837646, "learning_rate": 0.00012582397349322484, "loss": 0.3076, "step": 14205 }, { "epoch": 1.7897471423623137, "grad_norm": 0.26361143589019775, "learning_rate": 0.00012571549434919392, "loss": 0.2953, "step": 14210 }, { "epoch": 1.7903769247724912, "grad_norm": 0.27602389454841614, "learning_rate": 0.0001256070282464973, "loss": 0.3266, "step": 14215 }, { "epoch": 1.7910067071826683, "grad_norm": 0.2887786328792572, "learning_rate": 0.00012549857524338378, "loss": 0.3166, "step": 14220 }, { "epoch": 1.7916364895928458, "grad_norm": 0.272359162569046, "learning_rate": 0.00012539013539809493, "loss": 0.3053, "step": 14225 }, { "epoch": 1.792266272003023, "grad_norm": 0.2615000903606415, "learning_rate": 0.00012528170876886555, "loss": 0.2974, "step": 14230 }, { "epoch": 1.7928960544132002, "grad_norm": 0.2882770597934723, "learning_rate": 0.00012517329541392316, "loss": 0.301, "step": 14235 }, { "epoch": 1.7935258368233775, "grad_norm": 0.29980406165122986, "learning_rate": 0.00012506489539148823, "loss": 0.3009, "step": 14240 }, { "epoch": 1.7941556192335548, "grad_norm": 0.2714889943599701, "learning_rate": 0.0001249565087597741, "loss": 0.2897, "step": 14245 }, { "epoch": 1.7947854016437321, "grad_norm": 0.3578423261642456, "learning_rate": 0.00012484813557698678, "loss": 0.3021, "step": 14250 }, { "epoch": 1.7954151840539094, "grad_norm": 0.29889971017837524, "learning_rate": 0.00012473977590132524, "loss": 0.3039, "step": 14255 }, { "epoch": 1.7960449664640867, "grad_norm": 0.27244943380355835, "learning_rate": 0.000124631429790981, "loss": 0.3068, "step": 14260 }, { "epoch": 1.7966747488742638, "grad_norm": 0.2793833613395691, "learning_rate": 0.00012452309730413843, "loss": 0.3081, "step": 14265 }, { "epoch": 1.7973045312844413, "grad_norm": 0.27198326587677, "learning_rate": 0.00012441477849897461, "loss": 0.2957, "step": 14270 }, { "epoch": 1.7979343136946184, "grad_norm": 0.24795940518379211, "learning_rate": 0.0001243064734336591, "loss": 0.3094, "step": 14275 }, { "epoch": 1.798564096104796, "grad_norm": 0.29008451104164124, "learning_rate": 0.0001241981821663543, "loss": 0.3306, "step": 14280 }, { "epoch": 1.799193878514973, "grad_norm": 0.24478363990783691, "learning_rate": 0.00012408990475521508, "loss": 0.291, "step": 14285 }, { "epoch": 1.7998236609251503, "grad_norm": 0.2566664218902588, "learning_rate": 0.00012398164125838881, "loss": 0.3087, "step": 14290 }, { "epoch": 1.8004534433353276, "grad_norm": 0.24992555379867554, "learning_rate": 0.00012387339173401552, "loss": 0.318, "step": 14295 }, { "epoch": 1.801083225745505, "grad_norm": 0.244164377450943, "learning_rate": 0.00012376515624022767, "loss": 0.3096, "step": 14300 }, { "epoch": 1.8017130081556822, "grad_norm": 0.2495235651731491, "learning_rate": 0.00012365693483515016, "loss": 0.283, "step": 14305 }, { "epoch": 1.8023427905658593, "grad_norm": 0.2685554027557373, "learning_rate": 0.00012354872757690038, "loss": 0.3359, "step": 14310 }, { "epoch": 1.8029725729760369, "grad_norm": 0.23964886367321014, "learning_rate": 0.0001234405345235881, "loss": 0.3074, "step": 14315 }, { "epoch": 1.803602355386214, "grad_norm": 0.24736544489860535, "learning_rate": 0.00012333235573331556, "loss": 0.2891, "step": 14320 }, { "epoch": 1.8042321377963915, "grad_norm": 0.2994007170200348, "learning_rate": 0.00012322419126417706, "loss": 0.3109, "step": 14325 }, { "epoch": 1.8048619202065685, "grad_norm": 0.26516586542129517, "learning_rate": 0.0001231160411742595, "loss": 0.2974, "step": 14330 }, { "epoch": 1.805491702616746, "grad_norm": 0.27139636874198914, "learning_rate": 0.0001230079055216419, "loss": 0.3023, "step": 14335 }, { "epoch": 1.8061214850269232, "grad_norm": 0.26109209656715393, "learning_rate": 0.00012289978436439558, "loss": 0.3059, "step": 14340 }, { "epoch": 1.8067512674371005, "grad_norm": 0.29744458198547363, "learning_rate": 0.0001227916777605841, "loss": 0.3088, "step": 14345 }, { "epoch": 1.8073810498472778, "grad_norm": 0.27332085371017456, "learning_rate": 0.0001226835857682632, "loss": 0.2888, "step": 14350 }, { "epoch": 1.808010832257455, "grad_norm": 0.2586978077888489, "learning_rate": 0.00012257550844548074, "loss": 0.328, "step": 14355 }, { "epoch": 1.8086406146676324, "grad_norm": 0.29042935371398926, "learning_rate": 0.00012246744585027667, "loss": 0.3113, "step": 14360 }, { "epoch": 1.8092703970778095, "grad_norm": 0.271710067987442, "learning_rate": 0.000122359398040683, "loss": 0.2888, "step": 14365 }, { "epoch": 1.809900179487987, "grad_norm": 0.2969205379486084, "learning_rate": 0.00012225136507472406, "loss": 0.312, "step": 14370 }, { "epoch": 1.810529961898164, "grad_norm": 0.301145076751709, "learning_rate": 0.00012214334701041586, "loss": 0.2952, "step": 14375 }, { "epoch": 1.8111597443083416, "grad_norm": 0.250630259513855, "learning_rate": 0.00012203534390576666, "loss": 0.3073, "step": 14380 }, { "epoch": 1.8117895267185187, "grad_norm": 0.24282781779766083, "learning_rate": 0.00012192735581877654, "loss": 0.2863, "step": 14385 }, { "epoch": 1.8124193091286962, "grad_norm": 0.2824462652206421, "learning_rate": 0.00012181938280743769, "loss": 0.2999, "step": 14390 }, { "epoch": 1.8130490915388733, "grad_norm": 0.2740934491157532, "learning_rate": 0.00012171142492973388, "loss": 0.3131, "step": 14395 }, { "epoch": 1.8136788739490506, "grad_norm": 0.23533669114112854, "learning_rate": 0.00012160348224364109, "loss": 0.2846, "step": 14400 }, { "epoch": 1.814308656359228, "grad_norm": 0.26320409774780273, "learning_rate": 0.00012149555480712697, "loss": 0.2954, "step": 14405 }, { "epoch": 1.8149384387694052, "grad_norm": 0.2816338837146759, "learning_rate": 0.00012138764267815105, "loss": 0.2811, "step": 14410 }, { "epoch": 1.8155682211795825, "grad_norm": 0.23801551759243011, "learning_rate": 0.00012127974591466455, "loss": 0.2846, "step": 14415 }, { "epoch": 1.8161980035897596, "grad_norm": 0.3131721317768097, "learning_rate": 0.00012117186457461056, "loss": 0.2969, "step": 14420 }, { "epoch": 1.816827785999937, "grad_norm": 0.2892078757286072, "learning_rate": 0.00012106399871592385, "loss": 0.3, "step": 14425 }, { "epoch": 1.8174575684101142, "grad_norm": 0.253273606300354, "learning_rate": 0.00012095614839653074, "loss": 0.3005, "step": 14430 }, { "epoch": 1.8180873508202917, "grad_norm": 0.2675528824329376, "learning_rate": 0.00012084831367434937, "loss": 0.2947, "step": 14435 }, { "epoch": 1.8187171332304688, "grad_norm": 0.2665347158908844, "learning_rate": 0.00012074049460728945, "loss": 0.3012, "step": 14440 }, { "epoch": 1.8193469156406463, "grad_norm": 0.2987824082374573, "learning_rate": 0.00012063269125325228, "loss": 0.2986, "step": 14445 }, { "epoch": 1.8199766980508234, "grad_norm": 0.2429313212633133, "learning_rate": 0.00012052490367013076, "loss": 0.3035, "step": 14450 }, { "epoch": 1.8206064804610007, "grad_norm": 0.28424081206321716, "learning_rate": 0.00012041713191580925, "loss": 0.2948, "step": 14455 }, { "epoch": 1.821236262871178, "grad_norm": 0.25087571144104004, "learning_rate": 0.00012030937604816365, "loss": 0.2949, "step": 14460 }, { "epoch": 1.8218660452813553, "grad_norm": 0.23633217811584473, "learning_rate": 0.00012020163612506127, "loss": 0.2669, "step": 14465 }, { "epoch": 1.8224958276915326, "grad_norm": 0.26396888494491577, "learning_rate": 0.000120093912204361, "loss": 0.2912, "step": 14470 }, { "epoch": 1.8231256101017097, "grad_norm": 0.2898525297641754, "learning_rate": 0.00011998620434391299, "loss": 0.319, "step": 14475 }, { "epoch": 1.8237553925118872, "grad_norm": 0.25507113337516785, "learning_rate": 0.00011987851260155881, "loss": 0.3028, "step": 14480 }, { "epoch": 1.8243851749220643, "grad_norm": 0.2405284345149994, "learning_rate": 0.00011977083703513145, "loss": 0.2879, "step": 14485 }, { "epoch": 1.8250149573322418, "grad_norm": 0.27114009857177734, "learning_rate": 0.00011966317770245507, "loss": 0.3094, "step": 14490 }, { "epoch": 1.825644739742419, "grad_norm": 0.2708043158054352, "learning_rate": 0.0001195555346613453, "loss": 0.3062, "step": 14495 }, { "epoch": 1.8262745221525962, "grad_norm": 0.2507513463497162, "learning_rate": 0.00011944790796960878, "loss": 0.2832, "step": 14500 }, { "epoch": 1.8269043045627735, "grad_norm": 0.2864154577255249, "learning_rate": 0.0001193402976850436, "loss": 0.3067, "step": 14505 }, { "epoch": 1.8275340869729508, "grad_norm": 0.26530271768569946, "learning_rate": 0.00011923270386543886, "loss": 0.2816, "step": 14510 }, { "epoch": 1.8281638693831281, "grad_norm": 0.24444885551929474, "learning_rate": 0.00011912512656857498, "loss": 0.2993, "step": 14515 }, { "epoch": 1.8287936517933054, "grad_norm": 0.2591851055622101, "learning_rate": 0.00011901756585222334, "loss": 0.2926, "step": 14520 }, { "epoch": 1.8294234342034827, "grad_norm": 0.2942061424255371, "learning_rate": 0.0001189100217741466, "loss": 0.3032, "step": 14525 }, { "epoch": 1.8300532166136598, "grad_norm": 0.28199318051338196, "learning_rate": 0.00011880249439209836, "loss": 0.291, "step": 14530 }, { "epoch": 1.8306829990238374, "grad_norm": 0.2743484377861023, "learning_rate": 0.00011869498376382324, "loss": 0.3101, "step": 14535 }, { "epoch": 1.8313127814340144, "grad_norm": 0.24012960493564606, "learning_rate": 0.00011858748994705689, "loss": 0.3, "step": 14540 }, { "epoch": 1.831942563844192, "grad_norm": 0.2856425344944, "learning_rate": 0.00011848001299952598, "loss": 0.3042, "step": 14545 }, { "epoch": 1.832572346254369, "grad_norm": 0.2720118463039398, "learning_rate": 0.00011837255297894808, "loss": 0.298, "step": 14550 }, { "epoch": 1.8332021286645463, "grad_norm": 0.26973578333854675, "learning_rate": 0.0001182651099430317, "loss": 0.2734, "step": 14555 }, { "epoch": 1.8338319110747237, "grad_norm": 0.35720425844192505, "learning_rate": 0.00011815768394947616, "loss": 0.3174, "step": 14560 }, { "epoch": 1.834461693484901, "grad_norm": 0.2649666666984558, "learning_rate": 0.00011805027505597178, "loss": 0.3009, "step": 14565 }, { "epoch": 1.8350914758950783, "grad_norm": 0.2809504270553589, "learning_rate": 0.00011794288332019939, "loss": 0.3075, "step": 14570 }, { "epoch": 1.8357212583052556, "grad_norm": 0.247705340385437, "learning_rate": 0.00011783550879983097, "loss": 0.2929, "step": 14575 }, { "epoch": 1.8363510407154329, "grad_norm": 0.3010486364364624, "learning_rate": 0.00011772815155252901, "loss": 0.2923, "step": 14580 }, { "epoch": 1.83698082312561, "grad_norm": 0.29634296894073486, "learning_rate": 0.00011762081163594686, "loss": 0.2956, "step": 14585 }, { "epoch": 1.8376106055357875, "grad_norm": 0.3235035538673401, "learning_rate": 0.00011751348910772844, "loss": 0.299, "step": 14590 }, { "epoch": 1.8382403879459646, "grad_norm": 0.27069565653800964, "learning_rate": 0.00011740618402550849, "loss": 0.2885, "step": 14595 }, { "epoch": 1.838870170356142, "grad_norm": 0.26986175775527954, "learning_rate": 0.00011729889644691227, "loss": 0.2974, "step": 14600 }, { "epoch": 1.8394999527663192, "grad_norm": 0.24633704125881195, "learning_rate": 0.00011719162642955559, "loss": 0.3011, "step": 14605 }, { "epoch": 1.8401297351764965, "grad_norm": 0.2659735381603241, "learning_rate": 0.00011708437403104491, "loss": 0.2802, "step": 14610 }, { "epoch": 1.8407595175866738, "grad_norm": 0.2634638547897339, "learning_rate": 0.00011697713930897728, "loss": 0.2834, "step": 14615 }, { "epoch": 1.841389299996851, "grad_norm": 0.2891436517238617, "learning_rate": 0.00011686992232094012, "loss": 0.2892, "step": 14620 }, { "epoch": 1.8420190824070284, "grad_norm": 0.26533305644989014, "learning_rate": 0.0001167627231245115, "loss": 0.2954, "step": 14625 }, { "epoch": 1.8426488648172057, "grad_norm": 0.26114416122436523, "learning_rate": 0.00011665554177725977, "loss": 0.2936, "step": 14630 }, { "epoch": 1.843278647227383, "grad_norm": 0.24782754480838776, "learning_rate": 0.00011654837833674379, "loss": 0.283, "step": 14635 }, { "epoch": 1.84390842963756, "grad_norm": 0.2653804123401642, "learning_rate": 0.00011644123286051274, "loss": 0.2911, "step": 14640 }, { "epoch": 1.8445382120477376, "grad_norm": 0.2524818181991577, "learning_rate": 0.00011633410540610621, "loss": 0.29, "step": 14645 }, { "epoch": 1.8451679944579147, "grad_norm": 0.2844378352165222, "learning_rate": 0.00011622699603105404, "loss": 0.298, "step": 14650 }, { "epoch": 1.8457977768680922, "grad_norm": 0.2608543038368225, "learning_rate": 0.0001161199047928765, "loss": 0.2807, "step": 14655 }, { "epoch": 1.8464275592782693, "grad_norm": 0.2596459984779358, "learning_rate": 0.000116012831749084, "loss": 0.29, "step": 14660 }, { "epoch": 1.8470573416884466, "grad_norm": 0.2654721140861511, "learning_rate": 0.00011590577695717717, "loss": 0.2878, "step": 14665 }, { "epoch": 1.847687124098624, "grad_norm": 0.283388614654541, "learning_rate": 0.00011579874047464696, "loss": 0.2751, "step": 14670 }, { "epoch": 1.8483169065088012, "grad_norm": 0.24917341768741608, "learning_rate": 0.00011569172235897433, "loss": 0.3, "step": 14675 }, { "epoch": 1.8489466889189785, "grad_norm": 0.2464076280593872, "learning_rate": 0.00011558472266763049, "loss": 0.2848, "step": 14680 }, { "epoch": 1.8495764713291558, "grad_norm": 0.2884039282798767, "learning_rate": 0.00011547774145807665, "loss": 0.2698, "step": 14685 }, { "epoch": 1.8502062537393331, "grad_norm": 0.2762083411216736, "learning_rate": 0.00011537077878776425, "loss": 0.3151, "step": 14690 }, { "epoch": 1.8508360361495102, "grad_norm": 0.22906774282455444, "learning_rate": 0.00011526383471413463, "loss": 0.2669, "step": 14695 }, { "epoch": 1.8514658185596877, "grad_norm": 0.28603047132492065, "learning_rate": 0.00011515690929461928, "loss": 0.2922, "step": 14700 }, { "epoch": 1.8520956009698648, "grad_norm": 0.26245948672294617, "learning_rate": 0.00011505000258663954, "loss": 0.3095, "step": 14705 }, { "epoch": 1.8527253833800423, "grad_norm": 0.2754320800304413, "learning_rate": 0.00011494311464760673, "loss": 0.2843, "step": 14710 }, { "epoch": 1.8533551657902194, "grad_norm": 0.24283255636692047, "learning_rate": 0.00011483624553492212, "loss": 0.3039, "step": 14715 }, { "epoch": 1.8539849482003967, "grad_norm": 0.299950510263443, "learning_rate": 0.00011472939530597691, "loss": 0.3108, "step": 14720 }, { "epoch": 1.854614730610574, "grad_norm": 0.23872928321361542, "learning_rate": 0.00011462256401815205, "loss": 0.3221, "step": 14725 }, { "epoch": 1.8552445130207513, "grad_norm": 0.32674193382263184, "learning_rate": 0.00011451575172881845, "loss": 0.3066, "step": 14730 }, { "epoch": 1.8558742954309286, "grad_norm": 0.2620803415775299, "learning_rate": 0.00011440895849533675, "loss": 0.2855, "step": 14735 }, { "epoch": 1.856504077841106, "grad_norm": 0.2653051018714905, "learning_rate": 0.0001143021843750573, "loss": 0.2827, "step": 14740 }, { "epoch": 1.8571338602512832, "grad_norm": 0.29697105288505554, "learning_rate": 0.00011419542942532023, "loss": 0.2848, "step": 14745 }, { "epoch": 1.8577636426614603, "grad_norm": 0.26711151003837585, "learning_rate": 0.00011408869370345545, "loss": 0.301, "step": 14750 }, { "epoch": 1.8583934250716379, "grad_norm": 0.26371288299560547, "learning_rate": 0.0001139819772667824, "loss": 0.2994, "step": 14755 }, { "epoch": 1.859023207481815, "grad_norm": 0.34920862317085266, "learning_rate": 0.00011387528017261035, "loss": 0.2968, "step": 14760 }, { "epoch": 1.8596529898919925, "grad_norm": 0.2951182723045349, "learning_rate": 0.000113768602478238, "loss": 0.2935, "step": 14765 }, { "epoch": 1.8602827723021695, "grad_norm": 0.22442401945590973, "learning_rate": 0.00011366194424095381, "loss": 0.2952, "step": 14770 }, { "epoch": 1.8609125547123468, "grad_norm": 0.26102137565612793, "learning_rate": 0.00011355530551803553, "loss": 0.2823, "step": 14775 }, { "epoch": 1.8615423371225241, "grad_norm": 0.27118000388145447, "learning_rate": 0.0001134486863667507, "loss": 0.286, "step": 14780 }, { "epoch": 1.8621721195327015, "grad_norm": 0.2869999408721924, "learning_rate": 0.00011334208684435617, "loss": 0.2734, "step": 14785 }, { "epoch": 1.8628019019428788, "grad_norm": 0.32944396138191223, "learning_rate": 0.0001132355070080984, "loss": 0.3038, "step": 14790 }, { "epoch": 1.863431684353056, "grad_norm": 0.28535759449005127, "learning_rate": 0.00011312894691521312, "loss": 0.3213, "step": 14795 }, { "epoch": 1.8640614667632334, "grad_norm": 0.23751592636108398, "learning_rate": 0.00011302240662292561, "loss": 0.2972, "step": 14800 }, { "epoch": 1.8646912491734104, "grad_norm": 0.2352185994386673, "learning_rate": 0.00011291588618845043, "loss": 0.2772, "step": 14805 }, { "epoch": 1.865321031583588, "grad_norm": 0.24066108465194702, "learning_rate": 0.00011280938566899142, "loss": 0.3053, "step": 14810 }, { "epoch": 1.865950813993765, "grad_norm": 0.33842501044273376, "learning_rate": 0.0001127029051217418, "loss": 0.2992, "step": 14815 }, { "epoch": 1.8665805964039426, "grad_norm": 0.24873322248458862, "learning_rate": 0.00011259644460388412, "loss": 0.2887, "step": 14820 }, { "epoch": 1.8672103788141197, "grad_norm": 0.27127575874328613, "learning_rate": 0.00011249000417259005, "loss": 0.2619, "step": 14825 }, { "epoch": 1.867840161224297, "grad_norm": 0.28289374709129333, "learning_rate": 0.00011238358388502059, "loss": 0.2815, "step": 14830 }, { "epoch": 1.8684699436344743, "grad_norm": 0.29764994978904724, "learning_rate": 0.00011227718379832583, "loss": 0.3006, "step": 14835 }, { "epoch": 1.8690997260446516, "grad_norm": 0.2869538366794586, "learning_rate": 0.00011217080396964507, "loss": 0.2707, "step": 14840 }, { "epoch": 1.8697295084548289, "grad_norm": 0.2732262909412384, "learning_rate": 0.00011206444445610663, "loss": 0.2768, "step": 14845 }, { "epoch": 1.8703592908650062, "grad_norm": 0.3032742738723755, "learning_rate": 0.0001119581053148281, "loss": 0.2715, "step": 14850 }, { "epoch": 1.8709890732751835, "grad_norm": 0.26171359419822693, "learning_rate": 0.00011185178660291594, "loss": 0.2701, "step": 14855 }, { "epoch": 1.8716188556853606, "grad_norm": 0.30940353870391846, "learning_rate": 0.00011174548837746581, "loss": 0.2843, "step": 14860 }, { "epoch": 1.872248638095538, "grad_norm": 0.2774769067764282, "learning_rate": 0.00011163921069556224, "loss": 0.2951, "step": 14865 }, { "epoch": 1.8728784205057152, "grad_norm": 0.29232633113861084, "learning_rate": 0.00011153295361427876, "loss": 0.2938, "step": 14870 }, { "epoch": 1.8735082029158927, "grad_norm": 0.28283149003982544, "learning_rate": 0.00011142671719067793, "loss": 0.2875, "step": 14875 }, { "epoch": 1.8741379853260698, "grad_norm": 0.24245183169841766, "learning_rate": 0.00011132050148181103, "loss": 0.2829, "step": 14880 }, { "epoch": 1.874767767736247, "grad_norm": 0.27495938539505005, "learning_rate": 0.00011121430654471837, "loss": 0.2923, "step": 14885 }, { "epoch": 1.8753975501464244, "grad_norm": 0.3106895089149475, "learning_rate": 0.00011110813243642906, "loss": 0.2855, "step": 14890 }, { "epoch": 1.8760273325566017, "grad_norm": 0.263810396194458, "learning_rate": 0.00011100197921396102, "loss": 0.271, "step": 14895 }, { "epoch": 1.876657114966779, "grad_norm": 0.23044048249721527, "learning_rate": 0.00011089584693432091, "loss": 0.2608, "step": 14900 }, { "epoch": 1.8772868973769563, "grad_norm": 0.29268765449523926, "learning_rate": 0.00011078973565450427, "loss": 0.2835, "step": 14905 }, { "epoch": 1.8779166797871336, "grad_norm": 0.2691350281238556, "learning_rate": 0.00011068364543149527, "loss": 0.291, "step": 14910 }, { "epoch": 1.8785464621973107, "grad_norm": 0.26748213171958923, "learning_rate": 0.00011057757632226672, "loss": 0.2994, "step": 14915 }, { "epoch": 1.8791762446074882, "grad_norm": 0.2624029815196991, "learning_rate": 0.00011047152838378018, "loss": 0.2832, "step": 14920 }, { "epoch": 1.8798060270176653, "grad_norm": 0.2670036554336548, "learning_rate": 0.00011036550167298583, "loss": 0.284, "step": 14925 }, { "epoch": 1.8804358094278428, "grad_norm": 0.2848396599292755, "learning_rate": 0.0001102594962468224, "loss": 0.2831, "step": 14930 }, { "epoch": 1.88106559183802, "grad_norm": 0.2502748668193817, "learning_rate": 0.0001101535121622173, "loss": 0.3038, "step": 14935 }, { "epoch": 1.8816953742481972, "grad_norm": 0.2998834252357483, "learning_rate": 0.0001100475494760863, "loss": 0.2847, "step": 14940 }, { "epoch": 1.8823251566583745, "grad_norm": 0.229685977101326, "learning_rate": 0.00010994160824533398, "loss": 0.261, "step": 14945 }, { "epoch": 1.8829549390685518, "grad_norm": 0.26833808422088623, "learning_rate": 0.00010983568852685294, "loss": 0.2923, "step": 14950 }, { "epoch": 1.8835847214787291, "grad_norm": 0.2380465716123581, "learning_rate": 0.00010972979037752465, "loss": 0.2664, "step": 14955 }, { "epoch": 1.8842145038889064, "grad_norm": 0.2505188286304474, "learning_rate": 0.00010962391385421876, "loss": 0.2914, "step": 14960 }, { "epoch": 1.8848442862990837, "grad_norm": 0.33335885405540466, "learning_rate": 0.00010951805901379346, "loss": 0.3092, "step": 14965 }, { "epoch": 1.8854740687092608, "grad_norm": 0.22425580024719238, "learning_rate": 0.0001094122259130951, "loss": 0.2583, "step": 14970 }, { "epoch": 1.8861038511194383, "grad_norm": 0.25008514523506165, "learning_rate": 0.00010930641460895863, "loss": 0.2936, "step": 14975 }, { "epoch": 1.8867336335296154, "grad_norm": 0.2543163299560547, "learning_rate": 0.00010920062515820707, "loss": 0.2855, "step": 14980 }, { "epoch": 1.887363415939793, "grad_norm": 0.25144490599632263, "learning_rate": 0.00010909485761765172, "loss": 0.2788, "step": 14985 }, { "epoch": 1.88799319834997, "grad_norm": 0.23470145463943481, "learning_rate": 0.00010898911204409218, "loss": 0.2709, "step": 14990 }, { "epoch": 1.8886229807601473, "grad_norm": 0.27916932106018066, "learning_rate": 0.00010888338849431629, "loss": 0.279, "step": 14995 }, { "epoch": 1.8892527631703246, "grad_norm": 0.24980424344539642, "learning_rate": 0.00010877768702509996, "loss": 0.2982, "step": 15000 }, { "epoch": 1.8892527631703246, "eval_loss": 0.3032541871070862, "eval_runtime": 6.1659, "eval_samples_per_second": 162.182, "eval_steps_per_second": 10.217, "step": 15000 }, { "epoch": 1.889882545580502, "grad_norm": 0.24535268545150757, "learning_rate": 0.00010867200769320732, "loss": 0.2667, "step": 15005 }, { "epoch": 1.8905123279906793, "grad_norm": 0.2690826654434204, "learning_rate": 0.0001085663505553906, "loss": 0.2703, "step": 15010 }, { "epoch": 1.8911421104008566, "grad_norm": 0.2511346936225891, "learning_rate": 0.00010846071566839008, "loss": 0.3011, "step": 15015 }, { "epoch": 1.8917718928110339, "grad_norm": 0.28077587485313416, "learning_rate": 0.00010835510308893407, "loss": 0.285, "step": 15020 }, { "epoch": 1.892401675221211, "grad_norm": 0.309238463640213, "learning_rate": 0.000108249512873739, "loss": 0.2894, "step": 15025 }, { "epoch": 1.8930314576313885, "grad_norm": 0.26940178871154785, "learning_rate": 0.00010814394507950917, "loss": 0.2864, "step": 15030 }, { "epoch": 1.8936612400415656, "grad_norm": 0.27850431203842163, "learning_rate": 0.00010803839976293694, "loss": 0.2716, "step": 15035 }, { "epoch": 1.894291022451743, "grad_norm": 0.24114792048931122, "learning_rate": 0.00010793287698070256, "loss": 0.2695, "step": 15040 }, { "epoch": 1.8949208048619202, "grad_norm": 0.3137163817882538, "learning_rate": 0.0001078273767894741, "loss": 0.3063, "step": 15045 }, { "epoch": 1.8955505872720975, "grad_norm": 0.27090078592300415, "learning_rate": 0.00010772189924590773, "loss": 0.2643, "step": 15050 }, { "epoch": 1.8961803696822748, "grad_norm": 0.27956193685531616, "learning_rate": 0.00010761644440664714, "loss": 0.271, "step": 15055 }, { "epoch": 1.896810152092452, "grad_norm": 0.24823328852653503, "learning_rate": 0.00010751101232832401, "loss": 0.2849, "step": 15060 }, { "epoch": 1.8974399345026294, "grad_norm": 0.2675158977508545, "learning_rate": 0.00010740560306755787, "loss": 0.2744, "step": 15065 }, { "epoch": 1.8980697169128065, "grad_norm": 0.2589218318462372, "learning_rate": 0.0001073002166809558, "loss": 0.2834, "step": 15070 }, { "epoch": 1.898699499322984, "grad_norm": 0.277705579996109, "learning_rate": 0.00010719485322511273, "loss": 0.2826, "step": 15075 }, { "epoch": 1.899329281733161, "grad_norm": 0.23539955914020538, "learning_rate": 0.0001070895127566113, "loss": 0.2589, "step": 15080 }, { "epoch": 1.8999590641433386, "grad_norm": 0.3010064661502838, "learning_rate": 0.00010698419533202172, "loss": 0.2804, "step": 15085 }, { "epoch": 1.9005888465535157, "grad_norm": 0.25453826785087585, "learning_rate": 0.00010687890100790175, "loss": 0.2863, "step": 15090 }, { "epoch": 1.9012186289636932, "grad_norm": 0.2774878144264221, "learning_rate": 0.00010677362984079699, "loss": 0.2933, "step": 15095 }, { "epoch": 1.9018484113738703, "grad_norm": 0.26002323627471924, "learning_rate": 0.00010666838188724038, "loss": 0.2891, "step": 15100 }, { "epoch": 1.9024781937840476, "grad_norm": 0.25788870453834534, "learning_rate": 0.00010656315720375246, "loss": 0.2934, "step": 15105 }, { "epoch": 1.903107976194225, "grad_norm": 0.24301236867904663, "learning_rate": 0.00010645795584684138, "loss": 0.2848, "step": 15110 }, { "epoch": 1.9037377586044022, "grad_norm": 0.309514582157135, "learning_rate": 0.00010635277787300256, "loss": 0.2846, "step": 15115 }, { "epoch": 1.9043675410145795, "grad_norm": 0.274870902299881, "learning_rate": 0.00010624762333871913, "loss": 0.2956, "step": 15120 }, { "epoch": 1.9049973234247566, "grad_norm": 0.24861137568950653, "learning_rate": 0.00010614249230046129, "loss": 0.2777, "step": 15125 }, { "epoch": 1.905627105834934, "grad_norm": 0.26125532388687134, "learning_rate": 0.00010603738481468693, "loss": 0.2794, "step": 15130 }, { "epoch": 1.9062568882451112, "grad_norm": 0.24094760417938232, "learning_rate": 0.0001059323009378411, "loss": 0.2633, "step": 15135 }, { "epoch": 1.9068866706552887, "grad_norm": 0.3418034315109253, "learning_rate": 0.0001058272407263563, "loss": 0.3045, "step": 15140 }, { "epoch": 1.9075164530654658, "grad_norm": 0.2657215893268585, "learning_rate": 0.00010572220423665222, "loss": 0.3085, "step": 15145 }, { "epoch": 1.9081462354756433, "grad_norm": 0.23728597164154053, "learning_rate": 0.00010561719152513591, "loss": 0.2788, "step": 15150 }, { "epoch": 1.9087760178858204, "grad_norm": 0.2741139829158783, "learning_rate": 0.0001055122026482016, "loss": 0.2855, "step": 15155 }, { "epoch": 1.9094058002959977, "grad_norm": 0.2415517419576645, "learning_rate": 0.00010540723766223064, "loss": 0.2799, "step": 15160 }, { "epoch": 1.910035582706175, "grad_norm": 0.2724277675151825, "learning_rate": 0.00010530229662359162, "loss": 0.2821, "step": 15165 }, { "epoch": 1.9106653651163523, "grad_norm": 0.28418639302253723, "learning_rate": 0.00010519737958864036, "loss": 0.2899, "step": 15170 }, { "epoch": 1.9112951475265296, "grad_norm": 0.26423749327659607, "learning_rate": 0.00010509248661371962, "loss": 0.3033, "step": 15175 }, { "epoch": 1.9119249299367067, "grad_norm": 0.2523916959762573, "learning_rate": 0.00010498761775515941, "loss": 0.2763, "step": 15180 }, { "epoch": 1.9125547123468842, "grad_norm": 0.29665645956993103, "learning_rate": 0.00010488277306927663, "loss": 0.2918, "step": 15185 }, { "epoch": 1.9131844947570613, "grad_norm": 0.2941978871822357, "learning_rate": 0.00010477795261237537, "loss": 0.2753, "step": 15190 }, { "epoch": 1.9138142771672388, "grad_norm": 0.2701078951358795, "learning_rate": 0.00010467315644074646, "loss": 0.2925, "step": 15195 }, { "epoch": 1.914444059577416, "grad_norm": 0.2497081160545349, "learning_rate": 0.00010456838461066793, "loss": 0.2669, "step": 15200 }, { "epoch": 1.9150738419875935, "grad_norm": 0.2448865920305252, "learning_rate": 0.00010446363717840462, "loss": 0.2766, "step": 15205 }, { "epoch": 1.9157036243977705, "grad_norm": 0.26188936829566956, "learning_rate": 0.00010435891420020833, "loss": 0.2935, "step": 15210 }, { "epoch": 1.9163334068079478, "grad_norm": 0.3044489622116089, "learning_rate": 0.00010425421573231767, "loss": 0.2791, "step": 15215 }, { "epoch": 1.9169631892181251, "grad_norm": 0.30361208319664, "learning_rate": 0.00010414954183095813, "loss": 0.277, "step": 15220 }, { "epoch": 1.9175929716283024, "grad_norm": 0.31100359559059143, "learning_rate": 0.00010404489255234191, "loss": 0.2687, "step": 15225 }, { "epoch": 1.9182227540384797, "grad_norm": 0.26500749588012695, "learning_rate": 0.00010394026795266814, "loss": 0.2804, "step": 15230 }, { "epoch": 1.9188525364486568, "grad_norm": 0.33220374584198, "learning_rate": 0.00010383566808812257, "loss": 0.284, "step": 15235 }, { "epoch": 1.9194823188588344, "grad_norm": 0.23146981000900269, "learning_rate": 0.00010373109301487777, "loss": 0.2949, "step": 15240 }, { "epoch": 1.9201121012690114, "grad_norm": 0.24833330512046814, "learning_rate": 0.00010362654278909292, "loss": 0.2685, "step": 15245 }, { "epoch": 1.920741883679189, "grad_norm": 0.22905099391937256, "learning_rate": 0.00010352201746691381, "loss": 0.248, "step": 15250 }, { "epoch": 1.921371666089366, "grad_norm": 0.2544589936733246, "learning_rate": 0.00010341751710447308, "loss": 0.2763, "step": 15255 }, { "epoch": 1.9220014484995434, "grad_norm": 0.24207763373851776, "learning_rate": 0.0001033130417578897, "loss": 0.2691, "step": 15260 }, { "epoch": 1.9226312309097207, "grad_norm": 0.3025490939617157, "learning_rate": 0.0001032085914832693, "loss": 0.2902, "step": 15265 }, { "epoch": 1.923261013319898, "grad_norm": 0.2563372552394867, "learning_rate": 0.00010310416633670413, "loss": 0.2937, "step": 15270 }, { "epoch": 1.9238907957300753, "grad_norm": 0.22143816947937012, "learning_rate": 0.00010299976637427285, "loss": 0.2615, "step": 15275 }, { "epoch": 1.9245205781402526, "grad_norm": 0.26383697986602783, "learning_rate": 0.00010289539165204058, "loss": 0.2834, "step": 15280 }, { "epoch": 1.9251503605504299, "grad_norm": 0.2607567310333252, "learning_rate": 0.00010279104222605903, "loss": 0.2875, "step": 15285 }, { "epoch": 1.925780142960607, "grad_norm": 0.23255427181720734, "learning_rate": 0.0001026867181523662, "loss": 0.2645, "step": 15290 }, { "epoch": 1.9264099253707845, "grad_norm": 0.2203371226787567, "learning_rate": 0.00010258241948698641, "loss": 0.276, "step": 15295 }, { "epoch": 1.9270397077809616, "grad_norm": 0.2557859718799591, "learning_rate": 0.00010247814628593052, "loss": 0.2877, "step": 15300 }, { "epoch": 1.927669490191139, "grad_norm": 0.2551586925983429, "learning_rate": 0.00010237389860519557, "loss": 0.2678, "step": 15305 }, { "epoch": 1.9282992726013162, "grad_norm": 0.2592737376689911, "learning_rate": 0.00010226967650076495, "loss": 0.2645, "step": 15310 }, { "epoch": 1.9289290550114935, "grad_norm": 0.25076064467430115, "learning_rate": 0.00010216548002860836, "loss": 0.2595, "step": 15315 }, { "epoch": 1.9295588374216708, "grad_norm": 0.28892189264297485, "learning_rate": 0.0001020613092446816, "loss": 0.2658, "step": 15320 }, { "epoch": 1.930188619831848, "grad_norm": 0.28119730949401855, "learning_rate": 0.00010195716420492692, "loss": 0.2783, "step": 15325 }, { "epoch": 1.9308184022420254, "grad_norm": 0.23143291473388672, "learning_rate": 0.00010185304496527239, "loss": 0.2745, "step": 15330 }, { "epoch": 1.9314481846522027, "grad_norm": 0.23947221040725708, "learning_rate": 0.00010174895158163252, "loss": 0.2642, "step": 15335 }, { "epoch": 1.93207796706238, "grad_norm": 0.27924421429634094, "learning_rate": 0.00010164488410990779, "loss": 0.2895, "step": 15340 }, { "epoch": 1.932707749472557, "grad_norm": 0.2736763656139374, "learning_rate": 0.00010154084260598488, "loss": 0.2798, "step": 15345 }, { "epoch": 1.9333375318827346, "grad_norm": 0.26288047432899475, "learning_rate": 0.00010143682712573639, "loss": 0.2799, "step": 15350 }, { "epoch": 1.9339673142929117, "grad_norm": 0.2662082016468048, "learning_rate": 0.00010133283772502105, "loss": 0.2708, "step": 15355 }, { "epoch": 1.9345970967030892, "grad_norm": 0.2595316767692566, "learning_rate": 0.00010122887445968358, "loss": 0.2631, "step": 15360 }, { "epoch": 1.9352268791132663, "grad_norm": 0.22839054465293884, "learning_rate": 0.00010112493738555453, "loss": 0.2533, "step": 15365 }, { "epoch": 1.9358566615234436, "grad_norm": 0.25195086002349854, "learning_rate": 0.0001010210265584505, "loss": 0.26, "step": 15370 }, { "epoch": 1.936486443933621, "grad_norm": 0.2431613951921463, "learning_rate": 0.00010091714203417404, "loss": 0.2802, "step": 15375 }, { "epoch": 1.9371162263437982, "grad_norm": 0.24503393471240997, "learning_rate": 0.00010081328386851342, "loss": 0.2968, "step": 15380 }, { "epoch": 1.9377460087539755, "grad_norm": 0.26283174753189087, "learning_rate": 0.00010070945211724298, "loss": 0.2831, "step": 15385 }, { "epoch": 1.9383757911641528, "grad_norm": 0.23644685745239258, "learning_rate": 0.00010060564683612264, "loss": 0.2843, "step": 15390 }, { "epoch": 1.9390055735743301, "grad_norm": 0.271457314491272, "learning_rate": 0.00010050186808089828, "loss": 0.2736, "step": 15395 }, { "epoch": 1.9396353559845072, "grad_norm": 0.2437523454427719, "learning_rate": 0.00010039811590730137, "loss": 0.2839, "step": 15400 }, { "epoch": 1.9402651383946847, "grad_norm": 0.25611042976379395, "learning_rate": 0.00010029439037104925, "loss": 0.2671, "step": 15405 }, { "epoch": 1.9408949208048618, "grad_norm": 0.2646775245666504, "learning_rate": 0.00010019069152784486, "loss": 0.3072, "step": 15410 }, { "epoch": 1.9415247032150393, "grad_norm": 0.26959145069122314, "learning_rate": 0.00010008701943337695, "loss": 0.2655, "step": 15415 }, { "epoch": 1.9421544856252164, "grad_norm": 0.28409838676452637, "learning_rate": 9.998337414331971e-05, "loss": 0.2643, "step": 15420 }, { "epoch": 1.9427842680353937, "grad_norm": 0.288766086101532, "learning_rate": 9.987975571333303e-05, "loss": 0.2849, "step": 15425 }, { "epoch": 1.943414050445571, "grad_norm": 0.28650057315826416, "learning_rate": 9.977616419906247e-05, "loss": 0.2672, "step": 15430 }, { "epoch": 1.9440438328557483, "grad_norm": 0.28229546546936035, "learning_rate": 9.967259965613893e-05, "loss": 0.2649, "step": 15435 }, { "epoch": 1.9446736152659256, "grad_norm": 0.21892526745796204, "learning_rate": 9.956906214017894e-05, "loss": 0.2668, "step": 15440 }, { "epoch": 1.945303397676103, "grad_norm": 0.27021822333335876, "learning_rate": 9.946555170678458e-05, "loss": 0.2725, "step": 15445 }, { "epoch": 1.9459331800862802, "grad_norm": 0.2574271857738495, "learning_rate": 9.936206841154328e-05, "loss": 0.2643, "step": 15450 }, { "epoch": 1.9465629624964573, "grad_norm": 0.2907993495464325, "learning_rate": 9.925861231002792e-05, "loss": 0.3103, "step": 15455 }, { "epoch": 1.9471927449066349, "grad_norm": 0.225221186876297, "learning_rate": 9.915518345779681e-05, "loss": 0.2804, "step": 15460 }, { "epoch": 1.947822527316812, "grad_norm": 0.2557651400566101, "learning_rate": 9.905178191039365e-05, "loss": 0.2735, "step": 15465 }, { "epoch": 1.9484523097269895, "grad_norm": 0.26498880982398987, "learning_rate": 9.894840772334733e-05, "loss": 0.2664, "step": 15470 }, { "epoch": 1.9490820921371665, "grad_norm": 0.2424790859222412, "learning_rate": 9.884506095217222e-05, "loss": 0.2693, "step": 15475 }, { "epoch": 1.9497118745473438, "grad_norm": 0.25802767276763916, "learning_rate": 9.87417416523679e-05, "loss": 0.2831, "step": 15480 }, { "epoch": 1.9503416569575212, "grad_norm": 0.2601839005947113, "learning_rate": 9.863844987941912e-05, "loss": 0.2629, "step": 15485 }, { "epoch": 1.9509714393676985, "grad_norm": 0.26015961170196533, "learning_rate": 9.853518568879602e-05, "loss": 0.2634, "step": 15490 }, { "epoch": 1.9516012217778758, "grad_norm": 0.2370160073041916, "learning_rate": 9.843194913595374e-05, "loss": 0.2557, "step": 15495 }, { "epoch": 1.952231004188053, "grad_norm": 0.2519363462924957, "learning_rate": 9.832874027633281e-05, "loss": 0.255, "step": 15500 }, { "epoch": 1.9528607865982304, "grad_norm": 0.3419806659221649, "learning_rate": 9.822555916535858e-05, "loss": 0.2744, "step": 15505 }, { "epoch": 1.9534905690084075, "grad_norm": 0.24397574365139008, "learning_rate": 9.812240585844176e-05, "loss": 0.2619, "step": 15510 }, { "epoch": 1.954120351418585, "grad_norm": 0.2432924211025238, "learning_rate": 9.801928041097795e-05, "loss": 0.2581, "step": 15515 }, { "epoch": 1.954750133828762, "grad_norm": 0.27478650212287903, "learning_rate": 9.791618287834797e-05, "loss": 0.2606, "step": 15520 }, { "epoch": 1.9553799162389396, "grad_norm": 0.29080766439437866, "learning_rate": 9.781311331591747e-05, "loss": 0.2656, "step": 15525 }, { "epoch": 1.9560096986491167, "grad_norm": 0.24801793694496155, "learning_rate": 9.771007177903723e-05, "loss": 0.2651, "step": 15530 }, { "epoch": 1.956639481059294, "grad_norm": 0.22467739880084991, "learning_rate": 9.76070583230429e-05, "loss": 0.2663, "step": 15535 }, { "epoch": 1.9572692634694713, "grad_norm": 0.24151213467121124, "learning_rate": 9.750407300325502e-05, "loss": 0.2612, "step": 15540 }, { "epoch": 1.9578990458796486, "grad_norm": 0.262352854013443, "learning_rate": 9.7401115874979e-05, "loss": 0.2508, "step": 15545 }, { "epoch": 1.9585288282898259, "grad_norm": 0.2491580843925476, "learning_rate": 9.72981869935053e-05, "loss": 0.2735, "step": 15550 }, { "epoch": 1.9591586107000032, "grad_norm": 0.27000558376312256, "learning_rate": 9.719528641410898e-05, "loss": 0.2794, "step": 15555 }, { "epoch": 1.9597883931101805, "grad_norm": 0.2562926113605499, "learning_rate": 9.709241419205008e-05, "loss": 0.2829, "step": 15560 }, { "epoch": 1.9604181755203576, "grad_norm": 0.2559642493724823, "learning_rate": 9.69895703825733e-05, "loss": 0.2768, "step": 15565 }, { "epoch": 1.961047957930535, "grad_norm": 0.23282787203788757, "learning_rate": 9.688675504090811e-05, "loss": 0.2648, "step": 15570 }, { "epoch": 1.9616777403407122, "grad_norm": 0.2280416637659073, "learning_rate": 9.678396822226868e-05, "loss": 0.2474, "step": 15575 }, { "epoch": 1.9623075227508897, "grad_norm": 0.2516798973083496, "learning_rate": 9.668120998185392e-05, "loss": 0.2855, "step": 15580 }, { "epoch": 1.9629373051610668, "grad_norm": 0.24892964959144592, "learning_rate": 9.657848037484726e-05, "loss": 0.2731, "step": 15585 }, { "epoch": 1.963567087571244, "grad_norm": 0.2524420917034149, "learning_rate": 9.647577945641699e-05, "loss": 0.275, "step": 15590 }, { "epoch": 1.9641968699814214, "grad_norm": 0.2617582380771637, "learning_rate": 9.637310728171577e-05, "loss": 0.293, "step": 15595 }, { "epoch": 1.9648266523915987, "grad_norm": 0.2635948061943054, "learning_rate": 9.627046390588086e-05, "loss": 0.2642, "step": 15600 }, { "epoch": 1.965456434801776, "grad_norm": 0.22701425850391388, "learning_rate": 9.61678493840342e-05, "loss": 0.2647, "step": 15605 }, { "epoch": 1.9660862172119533, "grad_norm": 0.2594752609729767, "learning_rate": 9.606526377128207e-05, "loss": 0.2846, "step": 15610 }, { "epoch": 1.9667159996221306, "grad_norm": 0.25541216135025024, "learning_rate": 9.596270712271524e-05, "loss": 0.2712, "step": 15615 }, { "epoch": 1.9673457820323077, "grad_norm": 0.26473337411880493, "learning_rate": 9.586017949340909e-05, "loss": 0.2515, "step": 15620 }, { "epoch": 1.9679755644424852, "grad_norm": 0.2370501607656479, "learning_rate": 9.575768093842321e-05, "loss": 0.2569, "step": 15625 }, { "epoch": 1.9686053468526623, "grad_norm": 0.25999268889427185, "learning_rate": 9.565521151280168e-05, "loss": 0.2846, "step": 15630 }, { "epoch": 1.9692351292628398, "grad_norm": 0.2597227394580841, "learning_rate": 9.555277127157294e-05, "loss": 0.2814, "step": 15635 }, { "epoch": 1.969864911673017, "grad_norm": 0.22267143428325653, "learning_rate": 9.545036026974979e-05, "loss": 0.2703, "step": 15640 }, { "epoch": 1.9704946940831942, "grad_norm": 0.2599702477455139, "learning_rate": 9.534797856232913e-05, "loss": 0.2741, "step": 15645 }, { "epoch": 1.9711244764933715, "grad_norm": 0.23703083395957947, "learning_rate": 9.524562620429243e-05, "loss": 0.2657, "step": 15650 }, { "epoch": 1.9717542589035488, "grad_norm": 0.24194732308387756, "learning_rate": 9.514330325060515e-05, "loss": 0.2613, "step": 15655 }, { "epoch": 1.9723840413137261, "grad_norm": 0.2648374140262604, "learning_rate": 9.504100975621709e-05, "loss": 0.2808, "step": 15660 }, { "epoch": 1.9730138237239034, "grad_norm": 0.2491552084684372, "learning_rate": 9.493874577606218e-05, "loss": 0.2622, "step": 15665 }, { "epoch": 1.9736436061340807, "grad_norm": 0.25322696566581726, "learning_rate": 9.483651136505857e-05, "loss": 0.2716, "step": 15670 }, { "epoch": 1.9742733885442578, "grad_norm": 0.22565199434757233, "learning_rate": 9.473430657810838e-05, "loss": 0.2947, "step": 15675 }, { "epoch": 1.9749031709544353, "grad_norm": 0.2594245672225952, "learning_rate": 9.463213147009795e-05, "loss": 0.2793, "step": 15680 }, { "epoch": 1.9755329533646124, "grad_norm": 0.2432025521993637, "learning_rate": 9.452998609589769e-05, "loss": 0.2559, "step": 15685 }, { "epoch": 1.97616273577479, "grad_norm": 0.2537454068660736, "learning_rate": 9.442787051036192e-05, "loss": 0.2842, "step": 15690 }, { "epoch": 1.976792518184967, "grad_norm": 0.2597581446170807, "learning_rate": 9.432578476832911e-05, "loss": 0.2755, "step": 15695 }, { "epoch": 1.9774223005951443, "grad_norm": 0.25382810831069946, "learning_rate": 9.42237289246216e-05, "loss": 0.2653, "step": 15700 }, { "epoch": 1.9780520830053216, "grad_norm": 0.23822832107543945, "learning_rate": 9.412170303404579e-05, "loss": 0.2624, "step": 15705 }, { "epoch": 1.978681865415499, "grad_norm": 0.2722800076007843, "learning_rate": 9.40197071513918e-05, "loss": 0.2712, "step": 15710 }, { "epoch": 1.9793116478256763, "grad_norm": 0.2273283749818802, "learning_rate": 9.39177413314338e-05, "loss": 0.2545, "step": 15715 }, { "epoch": 1.9799414302358536, "grad_norm": 0.24674946069717407, "learning_rate": 9.381580562892972e-05, "loss": 0.2606, "step": 15720 }, { "epoch": 1.9805712126460309, "grad_norm": 0.23100855946540833, "learning_rate": 9.371390009862145e-05, "loss": 0.2632, "step": 15725 }, { "epoch": 1.981200995056208, "grad_norm": 0.23489323258399963, "learning_rate": 9.361202479523448e-05, "loss": 0.2833, "step": 15730 }, { "epoch": 1.9818307774663855, "grad_norm": 0.26526087522506714, "learning_rate": 9.35101797734783e-05, "loss": 0.2717, "step": 15735 }, { "epoch": 1.9824605598765626, "grad_norm": 0.27042046189308167, "learning_rate": 9.340836508804595e-05, "loss": 0.2664, "step": 15740 }, { "epoch": 1.98309034228674, "grad_norm": 0.28461650013923645, "learning_rate": 9.330658079361422e-05, "loss": 0.26, "step": 15745 }, { "epoch": 1.9837201246969172, "grad_norm": 0.26529213786125183, "learning_rate": 9.320482694484356e-05, "loss": 0.2808, "step": 15750 }, { "epoch": 1.9843499071070945, "grad_norm": 0.32026639580726624, "learning_rate": 9.310310359637823e-05, "loss": 0.2631, "step": 15755 }, { "epoch": 1.9849796895172718, "grad_norm": 0.2596029043197632, "learning_rate": 9.300141080284588e-05, "loss": 0.2771, "step": 15760 }, { "epoch": 1.985609471927449, "grad_norm": 0.25400853157043457, "learning_rate": 9.289974861885796e-05, "loss": 0.2532, "step": 15765 }, { "epoch": 1.9862392543376264, "grad_norm": 0.29176244139671326, "learning_rate": 9.279811709900934e-05, "loss": 0.2719, "step": 15770 }, { "epoch": 1.9868690367478037, "grad_norm": 0.2530720829963684, "learning_rate": 9.26965162978785e-05, "loss": 0.2676, "step": 15775 }, { "epoch": 1.987498819157981, "grad_norm": 0.23311518132686615, "learning_rate": 9.259494627002728e-05, "loss": 0.2632, "step": 15780 }, { "epoch": 1.988128601568158, "grad_norm": 0.2402007132768631, "learning_rate": 9.249340707000123e-05, "loss": 0.2419, "step": 15785 }, { "epoch": 1.9887583839783356, "grad_norm": 0.24241755902767181, "learning_rate": 9.239189875232914e-05, "loss": 0.2567, "step": 15790 }, { "epoch": 1.9893881663885127, "grad_norm": 0.31180015206336975, "learning_rate": 9.229042137152337e-05, "loss": 0.2864, "step": 15795 }, { "epoch": 1.9900179487986902, "grad_norm": 0.29330986738204956, "learning_rate": 9.218897498207952e-05, "loss": 0.2626, "step": 15800 }, { "epoch": 1.9906477312088673, "grad_norm": 0.24687117338180542, "learning_rate": 9.208755963847663e-05, "loss": 0.2721, "step": 15805 }, { "epoch": 1.9912775136190446, "grad_norm": 0.22886700928211212, "learning_rate": 9.198617539517714e-05, "loss": 0.2626, "step": 15810 }, { "epoch": 1.991907296029222, "grad_norm": 0.2331649363040924, "learning_rate": 9.188482230662662e-05, "loss": 0.2484, "step": 15815 }, { "epoch": 1.9925370784393992, "grad_norm": 0.23371124267578125, "learning_rate": 9.178350042725397e-05, "loss": 0.291, "step": 15820 }, { "epoch": 1.9931668608495765, "grad_norm": 0.2478175163269043, "learning_rate": 9.168220981147143e-05, "loss": 0.2748, "step": 15825 }, { "epoch": 1.9937966432597538, "grad_norm": 0.25952714681625366, "learning_rate": 9.158095051367433e-05, "loss": 0.2568, "step": 15830 }, { "epoch": 1.9944264256699311, "grad_norm": 0.2522846758365631, "learning_rate": 9.14797225882412e-05, "loss": 0.2414, "step": 15835 }, { "epoch": 1.9950562080801082, "grad_norm": 0.24577966332435608, "learning_rate": 9.137852608953384e-05, "loss": 0.2573, "step": 15840 }, { "epoch": 1.9956859904902857, "grad_norm": 0.2714809775352478, "learning_rate": 9.127736107189705e-05, "loss": 0.2703, "step": 15845 }, { "epoch": 1.9963157729004628, "grad_norm": 0.25562793016433716, "learning_rate": 9.117622758965866e-05, "loss": 0.2601, "step": 15850 }, { "epoch": 1.9969455553106403, "grad_norm": 0.23811009526252747, "learning_rate": 9.107512569712975e-05, "loss": 0.2474, "step": 15855 }, { "epoch": 1.9975753377208174, "grad_norm": 0.24953188002109528, "learning_rate": 9.097405544860437e-05, "loss": 0.2582, "step": 15860 }, { "epoch": 1.9982051201309947, "grad_norm": 0.23611120879650116, "learning_rate": 9.087301689835944e-05, "loss": 0.253, "step": 15865 }, { "epoch": 1.998834902541172, "grad_norm": 0.24487170577049255, "learning_rate": 9.077201010065509e-05, "loss": 0.2508, "step": 15870 }, { "epoch": 1.9994646849513493, "grad_norm": 0.2839270830154419, "learning_rate": 9.06710351097342e-05, "loss": 0.2748, "step": 15875 }, { "epoch": 2.0, "grad_norm": 0.2304636538028717, "learning_rate": 9.057009197982272e-05, "loss": 0.2653, "step": 15880 }, { "epoch": 2.000629782410177, "grad_norm": 0.19537301361560822, "learning_rate": 9.046918076512935e-05, "loss": 0.1844, "step": 15885 }, { "epoch": 2.0012595648203546, "grad_norm": 0.22054894268512726, "learning_rate": 9.036830151984571e-05, "loss": 0.196, "step": 15890 }, { "epoch": 2.0018893472305317, "grad_norm": 0.23987694084644318, "learning_rate": 9.02674542981463e-05, "loss": 0.1962, "step": 15895 }, { "epoch": 2.002519129640709, "grad_norm": 0.24562768638134003, "learning_rate": 9.016663915418835e-05, "loss": 0.1826, "step": 15900 }, { "epoch": 2.0031489120508863, "grad_norm": 0.27111175656318665, "learning_rate": 9.00658561421119e-05, "loss": 0.2025, "step": 15905 }, { "epoch": 2.003778694461064, "grad_norm": 0.21321839094161987, "learning_rate": 8.99651053160398e-05, "loss": 0.1743, "step": 15910 }, { "epoch": 2.004408476871241, "grad_norm": 0.2295263558626175, "learning_rate": 8.986438673007749e-05, "loss": 0.1856, "step": 15915 }, { "epoch": 2.0050382592814184, "grad_norm": 0.22658327221870422, "learning_rate": 8.976370043831313e-05, "loss": 0.1896, "step": 15920 }, { "epoch": 2.0056680416915955, "grad_norm": 0.21595464646816254, "learning_rate": 8.966304649481753e-05, "loss": 0.1865, "step": 15925 }, { "epoch": 2.006297824101773, "grad_norm": 0.23339222371578217, "learning_rate": 8.956242495364426e-05, "loss": 0.1866, "step": 15930 }, { "epoch": 2.00692760651195, "grad_norm": 0.20041927695274353, "learning_rate": 8.946183586882929e-05, "loss": 0.1745, "step": 15935 }, { "epoch": 2.007557388922127, "grad_norm": 0.19914592802524567, "learning_rate": 8.936127929439131e-05, "loss": 0.1885, "step": 15940 }, { "epoch": 2.0081871713323047, "grad_norm": 0.20688550174236298, "learning_rate": 8.926075528433149e-05, "loss": 0.1932, "step": 15945 }, { "epoch": 2.008816953742482, "grad_norm": 0.23507048189640045, "learning_rate": 8.916026389263358e-05, "loss": 0.1865, "step": 15950 }, { "epoch": 2.0094467361526593, "grad_norm": 0.2366725355386734, "learning_rate": 8.905980517326358e-05, "loss": 0.1867, "step": 15955 }, { "epoch": 2.0100765185628364, "grad_norm": 0.20678187906742096, "learning_rate": 8.895937918017028e-05, "loss": 0.1785, "step": 15960 }, { "epoch": 2.010706300973014, "grad_norm": 0.2642296850681305, "learning_rate": 8.885898596728463e-05, "loss": 0.1812, "step": 15965 }, { "epoch": 2.011336083383191, "grad_norm": 0.20598894357681274, "learning_rate": 8.875862558852016e-05, "loss": 0.1861, "step": 15970 }, { "epoch": 2.0119658657933686, "grad_norm": 0.23556114733219147, "learning_rate": 8.865829809777265e-05, "loss": 0.1873, "step": 15975 }, { "epoch": 2.0125956482035456, "grad_norm": 0.25772175192832947, "learning_rate": 8.855800354892022e-05, "loss": 0.1858, "step": 15980 }, { "epoch": 2.013225430613723, "grad_norm": 0.21538549661636353, "learning_rate": 8.845774199582344e-05, "loss": 0.1738, "step": 15985 }, { "epoch": 2.0138552130239002, "grad_norm": 0.22819840908050537, "learning_rate": 8.835751349232496e-05, "loss": 0.1843, "step": 15990 }, { "epoch": 2.0144849954340773, "grad_norm": 0.23319579660892487, "learning_rate": 8.825731809224976e-05, "loss": 0.1878, "step": 15995 }, { "epoch": 2.015114777844255, "grad_norm": 0.24107947945594788, "learning_rate": 8.815715584940511e-05, "loss": 0.1867, "step": 16000 }, { "epoch": 2.015114777844255, "eval_loss": 0.3415575921535492, "eval_runtime": 6.166, "eval_samples_per_second": 162.181, "eval_steps_per_second": 10.217, "step": 16000 }, { "epoch": 2.015744560254432, "grad_norm": 0.2272019386291504, "learning_rate": 8.805702681758042e-05, "loss": 0.1718, "step": 16005 }, { "epoch": 2.0163743426646095, "grad_norm": 0.22147491574287415, "learning_rate": 8.795693105054723e-05, "loss": 0.175, "step": 16010 }, { "epoch": 2.0170041250747865, "grad_norm": 0.21899926662445068, "learning_rate": 8.785686860205929e-05, "loss": 0.1749, "step": 16015 }, { "epoch": 2.017633907484964, "grad_norm": 0.24299047887325287, "learning_rate": 8.775683952585246e-05, "loss": 0.1902, "step": 16020 }, { "epoch": 2.018263689895141, "grad_norm": 0.24278461933135986, "learning_rate": 8.765684387564454e-05, "loss": 0.1872, "step": 16025 }, { "epoch": 2.0188934723053187, "grad_norm": 0.24929705262184143, "learning_rate": 8.75568817051355e-05, "loss": 0.1838, "step": 16030 }, { "epoch": 2.0195232547154958, "grad_norm": 0.20675018429756165, "learning_rate": 8.745695306800738e-05, "loss": 0.1734, "step": 16035 }, { "epoch": 2.0201530371256733, "grad_norm": 0.25064778327941895, "learning_rate": 8.73570580179241e-05, "loss": 0.1821, "step": 16040 }, { "epoch": 2.0207828195358504, "grad_norm": 0.23618988692760468, "learning_rate": 8.725719660853157e-05, "loss": 0.1935, "step": 16045 }, { "epoch": 2.0214126019460275, "grad_norm": 0.2201015204191208, "learning_rate": 8.715736889345766e-05, "loss": 0.1806, "step": 16050 }, { "epoch": 2.022042384356205, "grad_norm": 0.23748455941677094, "learning_rate": 8.705757492631214e-05, "loss": 0.1807, "step": 16055 }, { "epoch": 2.022672166766382, "grad_norm": 0.2563530504703522, "learning_rate": 8.695781476068664e-05, "loss": 0.1825, "step": 16060 }, { "epoch": 2.0233019491765596, "grad_norm": 0.27659016847610474, "learning_rate": 8.685808845015464e-05, "loss": 0.1861, "step": 16065 }, { "epoch": 2.0239317315867367, "grad_norm": 0.19301186501979828, "learning_rate": 8.675839604827146e-05, "loss": 0.1804, "step": 16070 }, { "epoch": 2.024561513996914, "grad_norm": 0.245374858379364, "learning_rate": 8.665873760857415e-05, "loss": 0.1785, "step": 16075 }, { "epoch": 2.0251912964070913, "grad_norm": 0.21472232043743134, "learning_rate": 8.655911318458166e-05, "loss": 0.1785, "step": 16080 }, { "epoch": 2.025821078817269, "grad_norm": 0.22257132828235626, "learning_rate": 8.645952282979453e-05, "loss": 0.1812, "step": 16085 }, { "epoch": 2.026450861227446, "grad_norm": 0.25223472714424133, "learning_rate": 8.635996659769512e-05, "loss": 0.1934, "step": 16090 }, { "epoch": 2.0270806436376234, "grad_norm": 0.22251825034618378, "learning_rate": 8.626044454174724e-05, "loss": 0.1895, "step": 16095 }, { "epoch": 2.0277104260478005, "grad_norm": 0.2073337882757187, "learning_rate": 8.616095671539663e-05, "loss": 0.1851, "step": 16100 }, { "epoch": 2.0283402084579776, "grad_norm": 0.21960042417049408, "learning_rate": 8.606150317207053e-05, "loss": 0.1809, "step": 16105 }, { "epoch": 2.028969990868155, "grad_norm": 0.23633064329624176, "learning_rate": 8.596208396517771e-05, "loss": 0.1839, "step": 16110 }, { "epoch": 2.029599773278332, "grad_norm": 0.21128375828266144, "learning_rate": 8.586269914810855e-05, "loss": 0.1828, "step": 16115 }, { "epoch": 2.0302295556885097, "grad_norm": 0.24467304348945618, "learning_rate": 8.576334877423505e-05, "loss": 0.1784, "step": 16120 }, { "epoch": 2.030859338098687, "grad_norm": 0.24976873397827148, "learning_rate": 8.566403289691062e-05, "loss": 0.1924, "step": 16125 }, { "epoch": 2.0314891205088643, "grad_norm": 0.258323609828949, "learning_rate": 8.556475156947008e-05, "loss": 0.1889, "step": 16130 }, { "epoch": 2.0321189029190414, "grad_norm": 0.24420535564422607, "learning_rate": 8.546550484522973e-05, "loss": 0.197, "step": 16135 }, { "epoch": 2.032748685329219, "grad_norm": 0.2438700944185257, "learning_rate": 8.536629277748746e-05, "loss": 0.1958, "step": 16140 }, { "epoch": 2.033378467739396, "grad_norm": 0.25343936681747437, "learning_rate": 8.526711541952236e-05, "loss": 0.1877, "step": 16145 }, { "epoch": 2.0340082501495735, "grad_norm": 0.24403081834316254, "learning_rate": 8.516797282459493e-05, "loss": 0.1774, "step": 16150 }, { "epoch": 2.0346380325597506, "grad_norm": 0.24733777344226837, "learning_rate": 8.506886504594704e-05, "loss": 0.1792, "step": 16155 }, { "epoch": 2.0352678149699277, "grad_norm": 0.22619028389453888, "learning_rate": 8.496979213680177e-05, "loss": 0.1807, "step": 16160 }, { "epoch": 2.0358975973801052, "grad_norm": 0.23040007054805756, "learning_rate": 8.48707541503636e-05, "loss": 0.1804, "step": 16165 }, { "epoch": 2.0365273797902823, "grad_norm": 0.21034270524978638, "learning_rate": 8.477175113981813e-05, "loss": 0.1787, "step": 16170 }, { "epoch": 2.03715716220046, "grad_norm": 0.21682168543338776, "learning_rate": 8.467278315833224e-05, "loss": 0.1817, "step": 16175 }, { "epoch": 2.037786944610637, "grad_norm": 0.2700116038322449, "learning_rate": 8.457385025905407e-05, "loss": 0.1896, "step": 16180 }, { "epoch": 2.0384167270208144, "grad_norm": 0.214239239692688, "learning_rate": 8.44749524951128e-05, "loss": 0.1827, "step": 16185 }, { "epoch": 2.0390465094309915, "grad_norm": 0.2243194878101349, "learning_rate": 8.437608991961885e-05, "loss": 0.1833, "step": 16190 }, { "epoch": 2.039676291841169, "grad_norm": 0.28487569093704224, "learning_rate": 8.427726258566353e-05, "loss": 0.1901, "step": 16195 }, { "epoch": 2.040306074251346, "grad_norm": 0.24857446551322937, "learning_rate": 8.41784705463195e-05, "loss": 0.192, "step": 16200 }, { "epoch": 2.0409358566615237, "grad_norm": 0.22208547592163086, "learning_rate": 8.407971385464032e-05, "loss": 0.1907, "step": 16205 }, { "epoch": 2.0415656390717007, "grad_norm": 0.22752498090267181, "learning_rate": 8.398099256366057e-05, "loss": 0.1827, "step": 16210 }, { "epoch": 2.042195421481878, "grad_norm": 0.25674304366111755, "learning_rate": 8.388230672639584e-05, "loss": 0.1889, "step": 16215 }, { "epoch": 2.0428252038920554, "grad_norm": 0.22372281551361084, "learning_rate": 8.378365639584264e-05, "loss": 0.1816, "step": 16220 }, { "epoch": 2.0434549863022324, "grad_norm": 0.25298216938972473, "learning_rate": 8.368504162497859e-05, "loss": 0.1813, "step": 16225 }, { "epoch": 2.04408476871241, "grad_norm": 0.21058551967144012, "learning_rate": 8.358646246676197e-05, "loss": 0.1855, "step": 16230 }, { "epoch": 2.044714551122587, "grad_norm": 0.2757975459098816, "learning_rate": 8.348791897413196e-05, "loss": 0.1749, "step": 16235 }, { "epoch": 2.0453443335327646, "grad_norm": 0.22646676003932953, "learning_rate": 8.338941120000884e-05, "loss": 0.1852, "step": 16240 }, { "epoch": 2.0459741159429417, "grad_norm": 0.23769816756248474, "learning_rate": 8.329093919729342e-05, "loss": 0.1869, "step": 16245 }, { "epoch": 2.046603898353119, "grad_norm": 0.22907455265522003, "learning_rate": 8.319250301886746e-05, "loss": 0.1876, "step": 16250 }, { "epoch": 2.0472336807632963, "grad_norm": 0.22925196588039398, "learning_rate": 8.309410271759342e-05, "loss": 0.1885, "step": 16255 }, { "epoch": 2.0478634631734733, "grad_norm": 0.22043700516223907, "learning_rate": 8.299573834631454e-05, "loss": 0.181, "step": 16260 }, { "epoch": 2.048493245583651, "grad_norm": 0.23858542740345, "learning_rate": 8.289740995785468e-05, "loss": 0.1898, "step": 16265 }, { "epoch": 2.049123027993828, "grad_norm": 0.23982049524784088, "learning_rate": 8.279911760501846e-05, "loss": 0.1838, "step": 16270 }, { "epoch": 2.0497528104040055, "grad_norm": 0.21694807708263397, "learning_rate": 8.270086134059113e-05, "loss": 0.1795, "step": 16275 }, { "epoch": 2.0503825928141826, "grad_norm": 0.20050913095474243, "learning_rate": 8.260264121733846e-05, "loss": 0.175, "step": 16280 }, { "epoch": 2.05101237522436, "grad_norm": 0.2118636816740036, "learning_rate": 8.250445728800706e-05, "loss": 0.1778, "step": 16285 }, { "epoch": 2.051642157634537, "grad_norm": 0.2250407338142395, "learning_rate": 8.240630960532382e-05, "loss": 0.1885, "step": 16290 }, { "epoch": 2.0522719400447147, "grad_norm": 0.2565051019191742, "learning_rate": 8.230819822199642e-05, "loss": 0.1901, "step": 16295 }, { "epoch": 2.0529017224548918, "grad_norm": 0.24367564916610718, "learning_rate": 8.221012319071268e-05, "loss": 0.1798, "step": 16300 }, { "epoch": 2.0535315048650693, "grad_norm": 0.24313905835151672, "learning_rate": 8.211208456414135e-05, "loss": 0.1908, "step": 16305 }, { "epoch": 2.0541612872752464, "grad_norm": 0.23950958251953125, "learning_rate": 8.201408239493131e-05, "loss": 0.1815, "step": 16310 }, { "epoch": 2.0547910696854235, "grad_norm": 0.24551273882389069, "learning_rate": 8.1916116735712e-05, "loss": 0.1941, "step": 16315 }, { "epoch": 2.055420852095601, "grad_norm": 0.21070988476276398, "learning_rate": 8.181818763909314e-05, "loss": 0.1868, "step": 16320 }, { "epoch": 2.056050634505778, "grad_norm": 0.21926933526992798, "learning_rate": 8.172029515766502e-05, "loss": 0.1848, "step": 16325 }, { "epoch": 2.0566804169159556, "grad_norm": 0.22517934441566467, "learning_rate": 8.162243934399812e-05, "loss": 0.1912, "step": 16330 }, { "epoch": 2.0573101993261327, "grad_norm": 0.2571990489959717, "learning_rate": 8.152462025064315e-05, "loss": 0.1834, "step": 16335 }, { "epoch": 2.05793998173631, "grad_norm": 0.22555163502693176, "learning_rate": 8.14268379301312e-05, "loss": 0.19, "step": 16340 }, { "epoch": 2.0585697641464873, "grad_norm": 0.2326682209968567, "learning_rate": 8.13290924349737e-05, "loss": 0.1786, "step": 16345 }, { "epoch": 2.059199546556665, "grad_norm": 0.22472088038921356, "learning_rate": 8.123138381766218e-05, "loss": 0.1843, "step": 16350 }, { "epoch": 2.059829328966842, "grad_norm": 0.2206810563802719, "learning_rate": 8.113371213066838e-05, "loss": 0.1781, "step": 16355 }, { "epoch": 2.0604591113770194, "grad_norm": 0.2740577757358551, "learning_rate": 8.103607742644426e-05, "loss": 0.1875, "step": 16360 }, { "epoch": 2.0610888937871965, "grad_norm": 0.22217485308647156, "learning_rate": 8.093847975742185e-05, "loss": 0.1748, "step": 16365 }, { "epoch": 2.0617186761973736, "grad_norm": 0.2460946887731552, "learning_rate": 8.084091917601336e-05, "loss": 0.1839, "step": 16370 }, { "epoch": 2.062348458607551, "grad_norm": 0.2489384114742279, "learning_rate": 8.074339573461101e-05, "loss": 0.1818, "step": 16375 }, { "epoch": 2.062978241017728, "grad_norm": 0.22755055129528046, "learning_rate": 8.06459094855871e-05, "loss": 0.1885, "step": 16380 }, { "epoch": 2.0636080234279057, "grad_norm": 0.22558000683784485, "learning_rate": 8.054846048129406e-05, "loss": 0.1805, "step": 16385 }, { "epoch": 2.064237805838083, "grad_norm": 0.2083364725112915, "learning_rate": 8.045104877406418e-05, "loss": 0.1809, "step": 16390 }, { "epoch": 2.0648675882482603, "grad_norm": 0.23679542541503906, "learning_rate": 8.035367441620976e-05, "loss": 0.181, "step": 16395 }, { "epoch": 2.0654973706584374, "grad_norm": 0.2173621654510498, "learning_rate": 8.025633746002311e-05, "loss": 0.1857, "step": 16400 }, { "epoch": 2.066127153068615, "grad_norm": 0.22376009821891785, "learning_rate": 8.015903795777634e-05, "loss": 0.1832, "step": 16405 }, { "epoch": 2.066756935478792, "grad_norm": 0.24444858729839325, "learning_rate": 8.00617759617215e-05, "loss": 0.1959, "step": 16410 }, { "epoch": 2.0673867178889695, "grad_norm": 0.21472635865211487, "learning_rate": 7.996455152409055e-05, "loss": 0.17, "step": 16415 }, { "epoch": 2.0680165002991466, "grad_norm": 0.22463464736938477, "learning_rate": 7.986736469709521e-05, "loss": 0.1847, "step": 16420 }, { "epoch": 2.0686462827093237, "grad_norm": 0.2251402884721756, "learning_rate": 7.977021553292696e-05, "loss": 0.1822, "step": 16425 }, { "epoch": 2.0692760651195012, "grad_norm": 0.21793001890182495, "learning_rate": 7.967310408375725e-05, "loss": 0.1862, "step": 16430 }, { "epoch": 2.0699058475296783, "grad_norm": 0.2344975620508194, "learning_rate": 7.957603040173714e-05, "loss": 0.1791, "step": 16435 }, { "epoch": 2.070535629939856, "grad_norm": 0.23466047644615173, "learning_rate": 7.947899453899725e-05, "loss": 0.1867, "step": 16440 }, { "epoch": 2.071165412350033, "grad_norm": 0.2190965861082077, "learning_rate": 7.93819965476482e-05, "loss": 0.1831, "step": 16445 }, { "epoch": 2.0717951947602105, "grad_norm": 0.22384218871593475, "learning_rate": 7.928503647978012e-05, "loss": 0.1745, "step": 16450 }, { "epoch": 2.0724249771703875, "grad_norm": 0.23837679624557495, "learning_rate": 7.918811438746272e-05, "loss": 0.1875, "step": 16455 }, { "epoch": 2.073054759580565, "grad_norm": 0.2510152757167816, "learning_rate": 7.909123032274542e-05, "loss": 0.1849, "step": 16460 }, { "epoch": 2.073684541990742, "grad_norm": 0.2514597475528717, "learning_rate": 7.899438433765711e-05, "loss": 0.1882, "step": 16465 }, { "epoch": 2.0743143244009197, "grad_norm": 0.20441210269927979, "learning_rate": 7.889757648420648e-05, "loss": 0.1754, "step": 16470 }, { "epoch": 2.0749441068110968, "grad_norm": 0.25783875584602356, "learning_rate": 7.880080681438134e-05, "loss": 0.1859, "step": 16475 }, { "epoch": 2.075573889221274, "grad_norm": 0.2234499454498291, "learning_rate": 7.870407538014933e-05, "loss": 0.1842, "step": 16480 }, { "epoch": 2.0762036716314514, "grad_norm": 0.24572981894016266, "learning_rate": 7.860738223345734e-05, "loss": 0.1728, "step": 16485 }, { "epoch": 2.0768334540416284, "grad_norm": 0.23702028393745422, "learning_rate": 7.851072742623194e-05, "loss": 0.1748, "step": 16490 }, { "epoch": 2.077463236451806, "grad_norm": 0.23450568318367004, "learning_rate": 7.84141110103789e-05, "loss": 0.1826, "step": 16495 }, { "epoch": 2.078093018861983, "grad_norm": 0.23022450506687164, "learning_rate": 7.831753303778342e-05, "loss": 0.1684, "step": 16500 }, { "epoch": 2.0787228012721606, "grad_norm": 0.22727181017398834, "learning_rate": 7.822099356031014e-05, "loss": 0.1751, "step": 16505 }, { "epoch": 2.0793525836823377, "grad_norm": 0.20935000479221344, "learning_rate": 7.812449262980289e-05, "loss": 0.1748, "step": 16510 }, { "epoch": 2.079982366092515, "grad_norm": 0.2445985972881317, "learning_rate": 7.802803029808492e-05, "loss": 0.1869, "step": 16515 }, { "epoch": 2.0806121485026923, "grad_norm": 0.21021974086761475, "learning_rate": 7.793160661695867e-05, "loss": 0.1778, "step": 16520 }, { "epoch": 2.08124193091287, "grad_norm": 0.20149335265159607, "learning_rate": 7.783522163820587e-05, "loss": 0.1685, "step": 16525 }, { "epoch": 2.081871713323047, "grad_norm": 0.2342994064092636, "learning_rate": 7.773887541358749e-05, "loss": 0.1714, "step": 16530 }, { "epoch": 2.082501495733224, "grad_norm": 0.2518448829650879, "learning_rate": 7.764256799484364e-05, "loss": 0.1899, "step": 16535 }, { "epoch": 2.0831312781434015, "grad_norm": 0.22891752421855927, "learning_rate": 7.754629943369365e-05, "loss": 0.1724, "step": 16540 }, { "epoch": 2.0837610605535786, "grad_norm": 0.2348988950252533, "learning_rate": 7.74500697818358e-05, "loss": 0.1772, "step": 16545 }, { "epoch": 2.084390842963756, "grad_norm": 0.21126072108745575, "learning_rate": 7.735387909094772e-05, "loss": 0.182, "step": 16550 }, { "epoch": 2.085020625373933, "grad_norm": 0.2134072482585907, "learning_rate": 7.725772741268598e-05, "loss": 0.1861, "step": 16555 }, { "epoch": 2.0856504077841107, "grad_norm": 0.22559498250484467, "learning_rate": 7.716161479868623e-05, "loss": 0.1745, "step": 16560 }, { "epoch": 2.086280190194288, "grad_norm": 0.2076030671596527, "learning_rate": 7.706554130056315e-05, "loss": 0.1811, "step": 16565 }, { "epoch": 2.0869099726044653, "grad_norm": 0.24279461801052094, "learning_rate": 7.696950696991032e-05, "loss": 0.1829, "step": 16570 }, { "epoch": 2.0875397550146424, "grad_norm": 0.21790249645709991, "learning_rate": 7.687351185830058e-05, "loss": 0.1835, "step": 16575 }, { "epoch": 2.08816953742482, "grad_norm": 0.2210235744714737, "learning_rate": 7.677755601728527e-05, "loss": 0.1678, "step": 16580 }, { "epoch": 2.088799319834997, "grad_norm": 0.21354030072689056, "learning_rate": 7.668163949839492e-05, "loss": 0.1863, "step": 16585 }, { "epoch": 2.089429102245174, "grad_norm": 0.264240026473999, "learning_rate": 7.658576235313896e-05, "loss": 0.1879, "step": 16590 }, { "epoch": 2.0900588846553516, "grad_norm": 0.2348974198102951, "learning_rate": 7.648992463300561e-05, "loss": 0.1796, "step": 16595 }, { "epoch": 2.0906886670655287, "grad_norm": 0.23128418624401093, "learning_rate": 7.639412638946186e-05, "loss": 0.1793, "step": 16600 }, { "epoch": 2.091318449475706, "grad_norm": 0.2405007928609848, "learning_rate": 7.629836767395359e-05, "loss": 0.1856, "step": 16605 }, { "epoch": 2.0919482318858833, "grad_norm": 0.23123788833618164, "learning_rate": 7.620264853790539e-05, "loss": 0.1752, "step": 16610 }, { "epoch": 2.092578014296061, "grad_norm": 0.22082751989364624, "learning_rate": 7.610696903272062e-05, "loss": 0.1731, "step": 16615 }, { "epoch": 2.093207796706238, "grad_norm": 0.23356421291828156, "learning_rate": 7.601132920978139e-05, "loss": 0.1839, "step": 16620 }, { "epoch": 2.0938375791164154, "grad_norm": 0.2418486326932907, "learning_rate": 7.591572912044846e-05, "loss": 0.1883, "step": 16625 }, { "epoch": 2.0944673615265925, "grad_norm": 0.2357870191335678, "learning_rate": 7.58201688160612e-05, "loss": 0.176, "step": 16630 }, { "epoch": 2.09509714393677, "grad_norm": 0.27169832587242126, "learning_rate": 7.572464834793778e-05, "loss": 0.1824, "step": 16635 }, { "epoch": 2.095726926346947, "grad_norm": 0.23245801031589508, "learning_rate": 7.562916776737488e-05, "loss": 0.1937, "step": 16640 }, { "epoch": 2.096356708757124, "grad_norm": 0.2312193661928177, "learning_rate": 7.55337271256476e-05, "loss": 0.1873, "step": 16645 }, { "epoch": 2.0969864911673017, "grad_norm": 0.2394751012325287, "learning_rate": 7.543832647400989e-05, "loss": 0.1748, "step": 16650 }, { "epoch": 2.097616273577479, "grad_norm": 0.2679862976074219, "learning_rate": 7.534296586369402e-05, "loss": 0.1868, "step": 16655 }, { "epoch": 2.0982460559876563, "grad_norm": 0.2397966831922531, "learning_rate": 7.524764534591086e-05, "loss": 0.1768, "step": 16660 }, { "epoch": 2.0988758383978334, "grad_norm": 0.22550681233406067, "learning_rate": 7.515236497184965e-05, "loss": 0.1764, "step": 16665 }, { "epoch": 2.099505620808011, "grad_norm": 0.23124639689922333, "learning_rate": 7.505712479267809e-05, "loss": 0.1828, "step": 16670 }, { "epoch": 2.100135403218188, "grad_norm": 0.2034096121788025, "learning_rate": 7.496192485954254e-05, "loss": 0.179, "step": 16675 }, { "epoch": 2.1007651856283656, "grad_norm": 0.2237498164176941, "learning_rate": 7.486676522356732e-05, "loss": 0.1867, "step": 16680 }, { "epoch": 2.1013949680385426, "grad_norm": 0.22583693265914917, "learning_rate": 7.477164593585537e-05, "loss": 0.1882, "step": 16685 }, { "epoch": 2.10202475044872, "grad_norm": 0.20145735144615173, "learning_rate": 7.467656704748792e-05, "loss": 0.1749, "step": 16690 }, { "epoch": 2.1026545328588973, "grad_norm": 0.204311341047287, "learning_rate": 7.458152860952458e-05, "loss": 0.1803, "step": 16695 }, { "epoch": 2.1032843152690743, "grad_norm": 0.23768644034862518, "learning_rate": 7.448653067300313e-05, "loss": 0.1915, "step": 16700 }, { "epoch": 2.103914097679252, "grad_norm": 0.21348991990089417, "learning_rate": 7.439157328893961e-05, "loss": 0.1778, "step": 16705 }, { "epoch": 2.104543880089429, "grad_norm": 0.22427400946617126, "learning_rate": 7.429665650832831e-05, "loss": 0.1712, "step": 16710 }, { "epoch": 2.1051736624996065, "grad_norm": 0.22512148320674896, "learning_rate": 7.420178038214172e-05, "loss": 0.1889, "step": 16715 }, { "epoch": 2.1058034449097836, "grad_norm": 0.22715777158737183, "learning_rate": 7.410694496133048e-05, "loss": 0.1737, "step": 16720 }, { "epoch": 2.106433227319961, "grad_norm": 0.2505483627319336, "learning_rate": 7.401215029682339e-05, "loss": 0.1809, "step": 16725 }, { "epoch": 2.107063009730138, "grad_norm": 0.2218826860189438, "learning_rate": 7.391739643952725e-05, "loss": 0.1766, "step": 16730 }, { "epoch": 2.1076927921403157, "grad_norm": 0.2085668295621872, "learning_rate": 7.38226834403272e-05, "loss": 0.1739, "step": 16735 }, { "epoch": 2.1083225745504928, "grad_norm": 0.21690475940704346, "learning_rate": 7.372801135008622e-05, "loss": 0.1738, "step": 16740 }, { "epoch": 2.1089523569606703, "grad_norm": 0.263988733291626, "learning_rate": 7.363338021964545e-05, "loss": 0.1951, "step": 16745 }, { "epoch": 2.1095821393708474, "grad_norm": 0.24228844046592712, "learning_rate": 7.353879009982377e-05, "loss": 0.1775, "step": 16750 }, { "epoch": 2.1102119217810245, "grad_norm": 0.2030615508556366, "learning_rate": 7.344424104141843e-05, "loss": 0.1754, "step": 16755 }, { "epoch": 2.110841704191202, "grad_norm": 0.22505883872509003, "learning_rate": 7.334973309520438e-05, "loss": 0.1814, "step": 16760 }, { "epoch": 2.111471486601379, "grad_norm": 0.28446871042251587, "learning_rate": 7.32552663119345e-05, "loss": 0.2009, "step": 16765 }, { "epoch": 2.1121012690115566, "grad_norm": 0.2320084124803543, "learning_rate": 7.316084074233968e-05, "loss": 0.1866, "step": 16770 }, { "epoch": 2.1127310514217337, "grad_norm": 0.23432306945323944, "learning_rate": 7.306645643712851e-05, "loss": 0.1838, "step": 16775 }, { "epoch": 2.113360833831911, "grad_norm": 0.20252206921577454, "learning_rate": 7.297211344698769e-05, "loss": 0.1753, "step": 16780 }, { "epoch": 2.1139906162420883, "grad_norm": 0.25251004099845886, "learning_rate": 7.28778118225814e-05, "loss": 0.1836, "step": 16785 }, { "epoch": 2.114620398652266, "grad_norm": 0.2514311373233795, "learning_rate": 7.278355161455176e-05, "loss": 0.1838, "step": 16790 }, { "epoch": 2.115250181062443, "grad_norm": 0.21513232588768005, "learning_rate": 7.268933287351876e-05, "loss": 0.1745, "step": 16795 }, { "epoch": 2.1158799634726204, "grad_norm": 0.2200087606906891, "learning_rate": 7.259515565007999e-05, "loss": 0.1839, "step": 16800 }, { "epoch": 2.1165097458827975, "grad_norm": 0.22383321821689606, "learning_rate": 7.250101999481073e-05, "loss": 0.1865, "step": 16805 }, { "epoch": 2.1171395282929746, "grad_norm": 0.2382001131772995, "learning_rate": 7.2406925958264e-05, "loss": 0.1862, "step": 16810 }, { "epoch": 2.117769310703152, "grad_norm": 0.2178415209054947, "learning_rate": 7.231287359097045e-05, "loss": 0.1799, "step": 16815 }, { "epoch": 2.118399093113329, "grad_norm": 0.22616611421108246, "learning_rate": 7.221886294343834e-05, "loss": 0.1819, "step": 16820 }, { "epoch": 2.1190288755235067, "grad_norm": 0.24810658395290375, "learning_rate": 7.212489406615355e-05, "loss": 0.181, "step": 16825 }, { "epoch": 2.119658657933684, "grad_norm": 0.2408507764339447, "learning_rate": 7.20309670095795e-05, "loss": 0.1867, "step": 16830 }, { "epoch": 2.1202884403438613, "grad_norm": 0.20721390843391418, "learning_rate": 7.19370818241571e-05, "loss": 0.175, "step": 16835 }, { "epoch": 2.1209182227540384, "grad_norm": 0.22691728174686432, "learning_rate": 7.184323856030497e-05, "loss": 0.1753, "step": 16840 }, { "epoch": 2.121548005164216, "grad_norm": 0.22788456082344055, "learning_rate": 7.174943726841902e-05, "loss": 0.1829, "step": 16845 }, { "epoch": 2.122177787574393, "grad_norm": 0.21744227409362793, "learning_rate": 7.165567799887268e-05, "loss": 0.1797, "step": 16850 }, { "epoch": 2.1228075699845705, "grad_norm": 0.211074560880661, "learning_rate": 7.156196080201685e-05, "loss": 0.1875, "step": 16855 }, { "epoch": 2.1234373523947476, "grad_norm": 0.27859583497047424, "learning_rate": 7.146828572817975e-05, "loss": 0.1791, "step": 16860 }, { "epoch": 2.1240671348049247, "grad_norm": 0.202862948179245, "learning_rate": 7.13746528276671e-05, "loss": 0.1752, "step": 16865 }, { "epoch": 2.1246969172151022, "grad_norm": 0.2529730498790741, "learning_rate": 7.128106215076187e-05, "loss": 0.1734, "step": 16870 }, { "epoch": 2.1253266996252793, "grad_norm": 0.22796177864074707, "learning_rate": 7.118751374772433e-05, "loss": 0.1807, "step": 16875 }, { "epoch": 2.125956482035457, "grad_norm": 0.20112904906272888, "learning_rate": 7.109400766879223e-05, "loss": 0.1711, "step": 16880 }, { "epoch": 2.126586264445634, "grad_norm": 0.22492708265781403, "learning_rate": 7.100054396418048e-05, "loss": 0.1784, "step": 16885 }, { "epoch": 2.1272160468558114, "grad_norm": 0.25224363803863525, "learning_rate": 7.09071226840811e-05, "loss": 0.185, "step": 16890 }, { "epoch": 2.1278458292659885, "grad_norm": 0.24734210968017578, "learning_rate": 7.081374387866346e-05, "loss": 0.1739, "step": 16895 }, { "epoch": 2.128475611676166, "grad_norm": 0.21726474165916443, "learning_rate": 7.07204075980742e-05, "loss": 0.1695, "step": 16900 }, { "epoch": 2.129105394086343, "grad_norm": 0.2073916345834732, "learning_rate": 7.062711389243703e-05, "loss": 0.1782, "step": 16905 }, { "epoch": 2.1297351764965207, "grad_norm": 0.2361113578081131, "learning_rate": 7.053386281185274e-05, "loss": 0.1787, "step": 16910 }, { "epoch": 2.1303649589066977, "grad_norm": 0.22586499154567719, "learning_rate": 7.044065440639933e-05, "loss": 0.1738, "step": 16915 }, { "epoch": 2.130994741316875, "grad_norm": 0.23469188809394836, "learning_rate": 7.034748872613184e-05, "loss": 0.1805, "step": 16920 }, { "epoch": 2.1316245237270524, "grad_norm": 0.1897682100534439, "learning_rate": 7.025436582108234e-05, "loss": 0.171, "step": 16925 }, { "epoch": 2.1322543061372294, "grad_norm": 0.22100795805454254, "learning_rate": 7.016128574126e-05, "loss": 0.1736, "step": 16930 }, { "epoch": 2.132884088547407, "grad_norm": 0.2332223504781723, "learning_rate": 7.006824853665085e-05, "loss": 0.1729, "step": 16935 }, { "epoch": 2.133513870957584, "grad_norm": 0.23929065465927124, "learning_rate": 6.997525425721814e-05, "loss": 0.1736, "step": 16940 }, { "epoch": 2.1341436533677616, "grad_norm": 0.26240813732147217, "learning_rate": 6.988230295290185e-05, "loss": 0.1798, "step": 16945 }, { "epoch": 2.1347734357779387, "grad_norm": 0.22387517988681793, "learning_rate": 6.978939467361895e-05, "loss": 0.1734, "step": 16950 }, { "epoch": 2.135403218188116, "grad_norm": 0.246952623128891, "learning_rate": 6.969652946926332e-05, "loss": 0.1834, "step": 16955 }, { "epoch": 2.1360330005982933, "grad_norm": 0.25226834416389465, "learning_rate": 6.960370738970568e-05, "loss": 0.1798, "step": 16960 }, { "epoch": 2.136662783008471, "grad_norm": 0.22118602693080902, "learning_rate": 6.951092848479364e-05, "loss": 0.1863, "step": 16965 }, { "epoch": 2.137292565418648, "grad_norm": 0.2567583918571472, "learning_rate": 6.941819280435155e-05, "loss": 0.1828, "step": 16970 }, { "epoch": 2.137922347828825, "grad_norm": 0.28791603446006775, "learning_rate": 6.93255003981806e-05, "loss": 0.1817, "step": 16975 }, { "epoch": 2.1385521302390025, "grad_norm": 0.2655430734157562, "learning_rate": 6.923285131605871e-05, "loss": 0.1789, "step": 16980 }, { "epoch": 2.1391819126491796, "grad_norm": 0.24513307213783264, "learning_rate": 6.914024560774061e-05, "loss": 0.1885, "step": 16985 }, { "epoch": 2.139811695059357, "grad_norm": 0.211643248796463, "learning_rate": 6.904768332295772e-05, "loss": 0.188, "step": 16990 }, { "epoch": 2.140441477469534, "grad_norm": 0.2373894900083542, "learning_rate": 6.895516451141791e-05, "loss": 0.1819, "step": 16995 }, { "epoch": 2.1410712598797117, "grad_norm": 0.22991600632667542, "learning_rate": 6.88626892228061e-05, "loss": 0.189, "step": 17000 }, { "epoch": 2.1410712598797117, "eval_loss": 0.3501429557800293, "eval_runtime": 6.1606, "eval_samples_per_second": 162.322, "eval_steps_per_second": 10.226, "step": 17000 }, { "epoch": 2.141701042289889, "grad_norm": 0.23578788340091705, "learning_rate": 6.877025750678352e-05, "loss": 0.1804, "step": 17005 }, { "epoch": 2.1423308247000663, "grad_norm": 0.20814631879329681, "learning_rate": 6.867786941298816e-05, "loss": 0.1776, "step": 17010 }, { "epoch": 2.1429606071102434, "grad_norm": 0.24113385379314423, "learning_rate": 6.858552499103451e-05, "loss": 0.171, "step": 17015 }, { "epoch": 2.143590389520421, "grad_norm": 0.2317270189523697, "learning_rate": 6.84932242905136e-05, "loss": 0.1881, "step": 17020 }, { "epoch": 2.144220171930598, "grad_norm": 0.26681753993034363, "learning_rate": 6.840096736099314e-05, "loss": 0.1792, "step": 17025 }, { "epoch": 2.144849954340775, "grad_norm": 0.2119479924440384, "learning_rate": 6.83087542520171e-05, "loss": 0.178, "step": 17030 }, { "epoch": 2.1454797367509526, "grad_norm": 0.20759105682373047, "learning_rate": 6.821658501310604e-05, "loss": 0.1754, "step": 17035 }, { "epoch": 2.1461095191611297, "grad_norm": 0.23515643179416656, "learning_rate": 6.812445969375691e-05, "loss": 0.1854, "step": 17040 }, { "epoch": 2.146739301571307, "grad_norm": 0.20694191753864288, "learning_rate": 6.803237834344322e-05, "loss": 0.1801, "step": 17045 }, { "epoch": 2.1473690839814843, "grad_norm": 0.21541932225227356, "learning_rate": 6.794034101161469e-05, "loss": 0.1752, "step": 17050 }, { "epoch": 2.147998866391662, "grad_norm": 0.20586980879306793, "learning_rate": 6.784834774769748e-05, "loss": 0.1803, "step": 17055 }, { "epoch": 2.148628648801839, "grad_norm": 0.23750190436840057, "learning_rate": 6.775639860109406e-05, "loss": 0.1842, "step": 17060 }, { "epoch": 2.1492584312120164, "grad_norm": 0.2041424959897995, "learning_rate": 6.766449362118324e-05, "loss": 0.1729, "step": 17065 }, { "epoch": 2.1498882136221935, "grad_norm": 0.24630430340766907, "learning_rate": 6.757263285732009e-05, "loss": 0.1821, "step": 17070 }, { "epoch": 2.150517996032371, "grad_norm": 0.23113587498664856, "learning_rate": 6.748081635883594e-05, "loss": 0.1821, "step": 17075 }, { "epoch": 2.151147778442548, "grad_norm": 0.203240305185318, "learning_rate": 6.738904417503829e-05, "loss": 0.1767, "step": 17080 }, { "epoch": 2.151777560852725, "grad_norm": 0.2500320374965668, "learning_rate": 6.7297316355211e-05, "loss": 0.1852, "step": 17085 }, { "epoch": 2.1524073432629027, "grad_norm": 0.2349621206521988, "learning_rate": 6.720563294861403e-05, "loss": 0.1764, "step": 17090 }, { "epoch": 2.15303712567308, "grad_norm": 0.2351408451795578, "learning_rate": 6.71139940044833e-05, "loss": 0.1835, "step": 17095 }, { "epoch": 2.1536669080832573, "grad_norm": 0.2078278511762619, "learning_rate": 6.702239957203108e-05, "loss": 0.1783, "step": 17100 }, { "epoch": 2.1542966904934344, "grad_norm": 0.23805204033851624, "learning_rate": 6.693084970044574e-05, "loss": 0.1858, "step": 17105 }, { "epoch": 2.154926472903612, "grad_norm": 0.22789132595062256, "learning_rate": 6.683934443889161e-05, "loss": 0.1839, "step": 17110 }, { "epoch": 2.155556255313789, "grad_norm": 0.27035263180732727, "learning_rate": 6.674788383650911e-05, "loss": 0.1878, "step": 17115 }, { "epoch": 2.1561860377239666, "grad_norm": 0.21787506341934204, "learning_rate": 6.665646794241468e-05, "loss": 0.1854, "step": 17120 }, { "epoch": 2.1568158201341436, "grad_norm": 0.2302270233631134, "learning_rate": 6.656509680570073e-05, "loss": 0.1822, "step": 17125 }, { "epoch": 2.157445602544321, "grad_norm": 0.21228045225143433, "learning_rate": 6.647377047543563e-05, "loss": 0.1855, "step": 17130 }, { "epoch": 2.1580753849544982, "grad_norm": 0.22131386399269104, "learning_rate": 6.638248900066375e-05, "loss": 0.1763, "step": 17135 }, { "epoch": 2.1587051673646753, "grad_norm": 0.2691584527492523, "learning_rate": 6.629125243040524e-05, "loss": 0.1815, "step": 17140 }, { "epoch": 2.159334949774853, "grad_norm": 0.22926035523414612, "learning_rate": 6.620006081365634e-05, "loss": 0.1833, "step": 17145 }, { "epoch": 2.15996473218503, "grad_norm": 0.20654956996440887, "learning_rate": 6.610891419938899e-05, "loss": 0.1755, "step": 17150 }, { "epoch": 2.1605945145952075, "grad_norm": 0.22390377521514893, "learning_rate": 6.601781263655096e-05, "loss": 0.1839, "step": 17155 }, { "epoch": 2.1612242970053845, "grad_norm": 0.23877164721488953, "learning_rate": 6.592675617406593e-05, "loss": 0.1739, "step": 17160 }, { "epoch": 2.161854079415562, "grad_norm": 0.24347762763500214, "learning_rate": 6.583574486083325e-05, "loss": 0.1863, "step": 17165 }, { "epoch": 2.162483861825739, "grad_norm": 0.23407521843910217, "learning_rate": 6.574477874572811e-05, "loss": 0.1741, "step": 17170 }, { "epoch": 2.1631136442359167, "grad_norm": 0.23338505625724792, "learning_rate": 6.565385787760137e-05, "loss": 0.1754, "step": 17175 }, { "epoch": 2.1637434266460938, "grad_norm": 0.2206541895866394, "learning_rate": 6.556298230527962e-05, "loss": 0.1706, "step": 17180 }, { "epoch": 2.1643732090562713, "grad_norm": 0.20819810032844543, "learning_rate": 6.547215207756504e-05, "loss": 0.1735, "step": 17185 }, { "epoch": 2.1650029914664484, "grad_norm": 0.22891941666603088, "learning_rate": 6.53813672432357e-05, "loss": 0.187, "step": 17190 }, { "epoch": 2.1656327738766254, "grad_norm": 0.2094859778881073, "learning_rate": 6.52906278510451e-05, "loss": 0.1795, "step": 17195 }, { "epoch": 2.166262556286803, "grad_norm": 0.20969723165035248, "learning_rate": 6.519993394972219e-05, "loss": 0.1679, "step": 17200 }, { "epoch": 2.16689233869698, "grad_norm": 0.25252285599708557, "learning_rate": 6.510928558797185e-05, "loss": 0.183, "step": 17205 }, { "epoch": 2.1675221211071576, "grad_norm": 0.22556447982788086, "learning_rate": 6.501868281447424e-05, "loss": 0.1694, "step": 17210 }, { "epoch": 2.1681519035173347, "grad_norm": 0.2429586797952652, "learning_rate": 6.492812567788516e-05, "loss": 0.18, "step": 17215 }, { "epoch": 2.168781685927512, "grad_norm": 0.2400483787059784, "learning_rate": 6.483761422683582e-05, "loss": 0.1818, "step": 17220 }, { "epoch": 2.1694114683376893, "grad_norm": 0.228154718875885, "learning_rate": 6.47471485099329e-05, "loss": 0.1744, "step": 17225 }, { "epoch": 2.170041250747867, "grad_norm": 0.21748559176921844, "learning_rate": 6.465672857575875e-05, "loss": 0.1765, "step": 17230 }, { "epoch": 2.170671033158044, "grad_norm": 0.2296319603919983, "learning_rate": 6.456635447287073e-05, "loss": 0.1881, "step": 17235 }, { "epoch": 2.1713008155682214, "grad_norm": 0.2402602881193161, "learning_rate": 6.447602624980186e-05, "loss": 0.1769, "step": 17240 }, { "epoch": 2.1719305979783985, "grad_norm": 0.2783866226673126, "learning_rate": 6.438574395506043e-05, "loss": 0.1836, "step": 17245 }, { "epoch": 2.1725603803885756, "grad_norm": 0.20301677286624908, "learning_rate": 6.429550763713017e-05, "loss": 0.1655, "step": 17250 }, { "epoch": 2.173190162798753, "grad_norm": 0.21163971722126007, "learning_rate": 6.420531734447e-05, "loss": 0.1764, "step": 17255 }, { "epoch": 2.17381994520893, "grad_norm": 0.24942253530025482, "learning_rate": 6.41151731255142e-05, "loss": 0.1833, "step": 17260 }, { "epoch": 2.1744497276191077, "grad_norm": 0.22958967089653015, "learning_rate": 6.402507502867222e-05, "loss": 0.1703, "step": 17265 }, { "epoch": 2.175079510029285, "grad_norm": 0.21424312889575958, "learning_rate": 6.393502310232886e-05, "loss": 0.1757, "step": 17270 }, { "epoch": 2.1757092924394623, "grad_norm": 0.20825864374637604, "learning_rate": 6.384501739484401e-05, "loss": 0.1715, "step": 17275 }, { "epoch": 2.1763390748496394, "grad_norm": 0.21387939155101776, "learning_rate": 6.375505795455281e-05, "loss": 0.1697, "step": 17280 }, { "epoch": 2.176968857259817, "grad_norm": 0.2073564976453781, "learning_rate": 6.366514482976546e-05, "loss": 0.1846, "step": 17285 }, { "epoch": 2.177598639669994, "grad_norm": 0.21405762434005737, "learning_rate": 6.35752780687675e-05, "loss": 0.1777, "step": 17290 }, { "epoch": 2.1782284220801715, "grad_norm": 0.22343981266021729, "learning_rate": 6.348545771981938e-05, "loss": 0.1801, "step": 17295 }, { "epoch": 2.1788582044903486, "grad_norm": 0.22697073221206665, "learning_rate": 6.339568383115668e-05, "loss": 0.1829, "step": 17300 }, { "epoch": 2.1794879869005257, "grad_norm": 0.2561056613922119, "learning_rate": 6.330595645098996e-05, "loss": 0.185, "step": 17305 }, { "epoch": 2.1801177693107032, "grad_norm": 0.2563771903514862, "learning_rate": 6.321627562750495e-05, "loss": 0.1752, "step": 17310 }, { "epoch": 2.1807475517208803, "grad_norm": 0.21171104907989502, "learning_rate": 6.312664140886228e-05, "loss": 0.166, "step": 17315 }, { "epoch": 2.181377334131058, "grad_norm": 0.23899543285369873, "learning_rate": 6.303705384319757e-05, "loss": 0.1828, "step": 17320 }, { "epoch": 2.182007116541235, "grad_norm": 0.26108884811401367, "learning_rate": 6.29475129786214e-05, "loss": 0.1829, "step": 17325 }, { "epoch": 2.1826368989514124, "grad_norm": 0.2397276908159256, "learning_rate": 6.285801886321919e-05, "loss": 0.1733, "step": 17330 }, { "epoch": 2.1832666813615895, "grad_norm": 0.22638286650180817, "learning_rate": 6.27685715450515e-05, "loss": 0.1719, "step": 17335 }, { "epoch": 2.183896463771767, "grad_norm": 0.2424623966217041, "learning_rate": 6.26791710721534e-05, "loss": 0.1749, "step": 17340 }, { "epoch": 2.184526246181944, "grad_norm": 0.23895704746246338, "learning_rate": 6.2589817492535e-05, "loss": 0.178, "step": 17345 }, { "epoch": 2.1851560285921217, "grad_norm": 0.2223139852285385, "learning_rate": 6.250051085418133e-05, "loss": 0.1872, "step": 17350 }, { "epoch": 2.1857858110022987, "grad_norm": 0.22255347669124603, "learning_rate": 6.241125120505204e-05, "loss": 0.1791, "step": 17355 }, { "epoch": 2.186415593412476, "grad_norm": 0.23792186379432678, "learning_rate": 6.232203859308157e-05, "loss": 0.1738, "step": 17360 }, { "epoch": 2.1870453758226533, "grad_norm": 0.24884961545467377, "learning_rate": 6.223287306617915e-05, "loss": 0.1778, "step": 17365 }, { "epoch": 2.1876751582328304, "grad_norm": 0.2130117118358612, "learning_rate": 6.214375467222873e-05, "loss": 0.1666, "step": 17370 }, { "epoch": 2.188304940643008, "grad_norm": 0.20538979768753052, "learning_rate": 6.205468345908888e-05, "loss": 0.1716, "step": 17375 }, { "epoch": 2.188934723053185, "grad_norm": 0.2519354224205017, "learning_rate": 6.196565947459292e-05, "loss": 0.1885, "step": 17380 }, { "epoch": 2.1895645054633626, "grad_norm": 0.2644721567630768, "learning_rate": 6.187668276654872e-05, "loss": 0.1923, "step": 17385 }, { "epoch": 2.1901942878735396, "grad_norm": 0.22676245868206024, "learning_rate": 6.178775338273876e-05, "loss": 0.1745, "step": 17390 }, { "epoch": 2.190824070283717, "grad_norm": 0.21329110860824585, "learning_rate": 6.169887137092029e-05, "loss": 0.1782, "step": 17395 }, { "epoch": 2.1914538526938943, "grad_norm": 0.2096760869026184, "learning_rate": 6.161003677882489e-05, "loss": 0.1705, "step": 17400 }, { "epoch": 2.1920836351040713, "grad_norm": 0.20192061364650726, "learning_rate": 6.15212496541588e-05, "loss": 0.1662, "step": 17405 }, { "epoch": 2.192713417514249, "grad_norm": 0.2351575493812561, "learning_rate": 6.14325100446027e-05, "loss": 0.1716, "step": 17410 }, { "epoch": 2.193343199924426, "grad_norm": 0.23202987015247345, "learning_rate": 6.13438179978118e-05, "loss": 0.1848, "step": 17415 }, { "epoch": 2.1939729823346035, "grad_norm": 0.22229251265525818, "learning_rate": 6.125517356141576e-05, "loss": 0.1757, "step": 17420 }, { "epoch": 2.1946027647447806, "grad_norm": 0.20741891860961914, "learning_rate": 6.116657678301868e-05, "loss": 0.1804, "step": 17425 }, { "epoch": 2.195232547154958, "grad_norm": 0.2023356705904007, "learning_rate": 6.107802771019895e-05, "loss": 0.168, "step": 17430 }, { "epoch": 2.195862329565135, "grad_norm": 0.30032244324684143, "learning_rate": 6.098952639050961e-05, "loss": 0.176, "step": 17435 }, { "epoch": 2.1964921119753127, "grad_norm": 0.2093886286020279, "learning_rate": 6.090107287147786e-05, "loss": 0.171, "step": 17440 }, { "epoch": 2.1971218943854898, "grad_norm": 0.20918086171150208, "learning_rate": 6.081266720060517e-05, "loss": 0.1705, "step": 17445 }, { "epoch": 2.1977516767956673, "grad_norm": 0.2089412659406662, "learning_rate": 6.072430942536737e-05, "loss": 0.1797, "step": 17450 }, { "epoch": 2.1983814592058444, "grad_norm": 0.2460128515958786, "learning_rate": 6.0635999593214765e-05, "loss": 0.1752, "step": 17455 }, { "epoch": 2.1990112416160215, "grad_norm": 0.25952646136283875, "learning_rate": 6.0547737751571654e-05, "loss": 0.1784, "step": 17460 }, { "epoch": 2.199641024026199, "grad_norm": 0.2011132687330246, "learning_rate": 6.0459523947836674e-05, "loss": 0.1714, "step": 17465 }, { "epoch": 2.200270806436376, "grad_norm": 0.19077162444591522, "learning_rate": 6.03713582293826e-05, "loss": 0.174, "step": 17470 }, { "epoch": 2.2009005888465536, "grad_norm": 0.22354647517204285, "learning_rate": 6.02832406435566e-05, "loss": 0.1754, "step": 17475 }, { "epoch": 2.2015303712567307, "grad_norm": 0.22434799373149872, "learning_rate": 6.019517123767968e-05, "loss": 0.1747, "step": 17480 }, { "epoch": 2.202160153666908, "grad_norm": 0.22911998629570007, "learning_rate": 6.010715005904716e-05, "loss": 0.1812, "step": 17485 }, { "epoch": 2.2027899360770853, "grad_norm": 0.23919759690761566, "learning_rate": 6.0019177154928364e-05, "loss": 0.1771, "step": 17490 }, { "epoch": 2.203419718487263, "grad_norm": 0.21539629995822906, "learning_rate": 5.993125257256687e-05, "loss": 0.1799, "step": 17495 }, { "epoch": 2.20404950089744, "grad_norm": 0.22069337964057922, "learning_rate": 5.984337635918014e-05, "loss": 0.177, "step": 17500 }, { "epoch": 2.2046792833076174, "grad_norm": 0.20763671398162842, "learning_rate": 5.97555485619597e-05, "loss": 0.1664, "step": 17505 }, { "epoch": 2.2053090657177945, "grad_norm": 0.1950199007987976, "learning_rate": 5.966776922807109e-05, "loss": 0.1648, "step": 17510 }, { "epoch": 2.2059388481279716, "grad_norm": 0.25142478942871094, "learning_rate": 5.95800384046538e-05, "loss": 0.1754, "step": 17515 }, { "epoch": 2.206568630538149, "grad_norm": 0.2232702225446701, "learning_rate": 5.94923561388213e-05, "loss": 0.1716, "step": 17520 }, { "epoch": 2.207198412948326, "grad_norm": 0.27657321095466614, "learning_rate": 5.940472247766097e-05, "loss": 0.1878, "step": 17525 }, { "epoch": 2.2078281953585037, "grad_norm": 0.21436183154582977, "learning_rate": 5.9317137468234083e-05, "loss": 0.1727, "step": 17530 }, { "epoch": 2.208457977768681, "grad_norm": 0.19741742312908173, "learning_rate": 5.9229601157575744e-05, "loss": 0.1694, "step": 17535 }, { "epoch": 2.2090877601788583, "grad_norm": 0.2042321413755417, "learning_rate": 5.914211359269509e-05, "loss": 0.17, "step": 17540 }, { "epoch": 2.2097175425890354, "grad_norm": 0.21126088500022888, "learning_rate": 5.9054674820574814e-05, "loss": 0.1703, "step": 17545 }, { "epoch": 2.210347324999213, "grad_norm": 0.20463821291923523, "learning_rate": 5.896728488817151e-05, "loss": 0.172, "step": 17550 }, { "epoch": 2.21097710740939, "grad_norm": 0.204604834318161, "learning_rate": 5.887994384241569e-05, "loss": 0.1723, "step": 17555 }, { "epoch": 2.2116068898195675, "grad_norm": 0.18806815147399902, "learning_rate": 5.879265173021141e-05, "loss": 0.161, "step": 17560 }, { "epoch": 2.2122366722297446, "grad_norm": 0.22745926678180695, "learning_rate": 5.870540859843656e-05, "loss": 0.1653, "step": 17565 }, { "epoch": 2.2128664546399217, "grad_norm": 0.1888933777809143, "learning_rate": 5.8618214493942675e-05, "loss": 0.1685, "step": 17570 }, { "epoch": 2.2134962370500992, "grad_norm": 0.19480280578136444, "learning_rate": 5.853106946355501e-05, "loss": 0.1676, "step": 17575 }, { "epoch": 2.2141260194602763, "grad_norm": 0.2703428864479065, "learning_rate": 5.8443973554072383e-05, "loss": 0.1788, "step": 17580 }, { "epoch": 2.214755801870454, "grad_norm": 0.21035927534103394, "learning_rate": 5.8356926812267335e-05, "loss": 0.1806, "step": 17585 }, { "epoch": 2.215385584280631, "grad_norm": 0.21794281899929047, "learning_rate": 5.826992928488594e-05, "loss": 0.1641, "step": 17590 }, { "epoch": 2.2160153666908085, "grad_norm": 0.2512260675430298, "learning_rate": 5.818298101864779e-05, "loss": 0.1697, "step": 17595 }, { "epoch": 2.2166451491009855, "grad_norm": 0.2089598923921585, "learning_rate": 5.8096082060246226e-05, "loss": 0.1656, "step": 17600 }, { "epoch": 2.217274931511163, "grad_norm": 0.2160467952489853, "learning_rate": 5.80092324563479e-05, "loss": 0.185, "step": 17605 }, { "epoch": 2.21790471392134, "grad_norm": 0.20858334004878998, "learning_rate": 5.7922432253593025e-05, "loss": 0.1721, "step": 17610 }, { "epoch": 2.2185344963315172, "grad_norm": 0.2090991735458374, "learning_rate": 5.7835681498595327e-05, "loss": 0.1706, "step": 17615 }, { "epoch": 2.2191642787416948, "grad_norm": 0.21040284633636475, "learning_rate": 5.77489802379419e-05, "loss": 0.1789, "step": 17620 }, { "epoch": 2.219794061151872, "grad_norm": 0.22497640550136566, "learning_rate": 5.766232851819332e-05, "loss": 0.1779, "step": 17625 }, { "epoch": 2.2204238435620494, "grad_norm": 0.2845938801765442, "learning_rate": 5.757572638588356e-05, "loss": 0.1771, "step": 17630 }, { "epoch": 2.2210536259722264, "grad_norm": 0.21166571974754333, "learning_rate": 5.748917388751985e-05, "loss": 0.1741, "step": 17635 }, { "epoch": 2.221683408382404, "grad_norm": 0.26706454157829285, "learning_rate": 5.7402671069583004e-05, "loss": 0.1715, "step": 17640 }, { "epoch": 2.222313190792581, "grad_norm": 0.2745297849178314, "learning_rate": 5.731621797852698e-05, "loss": 0.1843, "step": 17645 }, { "epoch": 2.2229429732027586, "grad_norm": 0.2507629990577698, "learning_rate": 5.7229814660778985e-05, "loss": 0.186, "step": 17650 }, { "epoch": 2.2235727556129357, "grad_norm": 0.21768365800380707, "learning_rate": 5.7143461162739545e-05, "loss": 0.1731, "step": 17655 }, { "epoch": 2.224202538023113, "grad_norm": 0.22099876403808594, "learning_rate": 5.705715753078259e-05, "loss": 0.1802, "step": 17660 }, { "epoch": 2.2248323204332903, "grad_norm": 0.20643608272075653, "learning_rate": 5.697090381125507e-05, "loss": 0.1769, "step": 17665 }, { "epoch": 2.2254621028434673, "grad_norm": 0.2723044455051422, "learning_rate": 5.688470005047722e-05, "loss": 0.1882, "step": 17670 }, { "epoch": 2.226091885253645, "grad_norm": 0.23548351228237152, "learning_rate": 5.679854629474238e-05, "loss": 0.1702, "step": 17675 }, { "epoch": 2.226721667663822, "grad_norm": 0.24578404426574707, "learning_rate": 5.671244259031722e-05, "loss": 0.1736, "step": 17680 }, { "epoch": 2.2273514500739995, "grad_norm": 0.21030524373054504, "learning_rate": 5.662638898344125e-05, "loss": 0.1711, "step": 17685 }, { "epoch": 2.2279812324841766, "grad_norm": 0.24249999225139618, "learning_rate": 5.6540385520327275e-05, "loss": 0.1742, "step": 17690 }, { "epoch": 2.228611014894354, "grad_norm": 0.23971515893936157, "learning_rate": 5.645443224716106e-05, "loss": 0.1655, "step": 17695 }, { "epoch": 2.229240797304531, "grad_norm": 0.2133120596408844, "learning_rate": 5.636852921010161e-05, "loss": 0.1786, "step": 17700 }, { "epoch": 2.2298705797147087, "grad_norm": 0.23475615680217743, "learning_rate": 5.628267645528073e-05, "loss": 0.1753, "step": 17705 }, { "epoch": 2.230500362124886, "grad_norm": 0.22111907601356506, "learning_rate": 5.619687402880332e-05, "loss": 0.1617, "step": 17710 }, { "epoch": 2.2311301445350633, "grad_norm": 0.2323450744152069, "learning_rate": 5.611112197674725e-05, "loss": 0.167, "step": 17715 }, { "epoch": 2.2317599269452404, "grad_norm": 0.18698996305465698, "learning_rate": 5.602542034516333e-05, "loss": 0.1632, "step": 17720 }, { "epoch": 2.2323897093554175, "grad_norm": 0.2252064198255539, "learning_rate": 5.5939769180075286e-05, "loss": 0.1709, "step": 17725 }, { "epoch": 2.233019491765595, "grad_norm": 0.2561705410480499, "learning_rate": 5.5854168527479756e-05, "loss": 0.1826, "step": 17730 }, { "epoch": 2.233649274175772, "grad_norm": 0.2448531985282898, "learning_rate": 5.576861843334625e-05, "loss": 0.1819, "step": 17735 }, { "epoch": 2.2342790565859496, "grad_norm": 0.238671213388443, "learning_rate": 5.568311894361707e-05, "loss": 0.1839, "step": 17740 }, { "epoch": 2.2349088389961267, "grad_norm": 0.22651298344135284, "learning_rate": 5.5597670104207485e-05, "loss": 0.172, "step": 17745 }, { "epoch": 2.235538621406304, "grad_norm": 0.23881249129772186, "learning_rate": 5.551227196100549e-05, "loss": 0.1698, "step": 17750 }, { "epoch": 2.2361684038164813, "grad_norm": 0.23065873980522156, "learning_rate": 5.542692455987167e-05, "loss": 0.1727, "step": 17755 }, { "epoch": 2.236798186226659, "grad_norm": 0.19607169926166534, "learning_rate": 5.534162794663969e-05, "loss": 0.1719, "step": 17760 }, { "epoch": 2.237427968636836, "grad_norm": 0.2033766806125641, "learning_rate": 5.525638216711573e-05, "loss": 0.171, "step": 17765 }, { "epoch": 2.2380577510470134, "grad_norm": 0.20412589609622955, "learning_rate": 5.5171187267078733e-05, "loss": 0.1633, "step": 17770 }, { "epoch": 2.2386875334571905, "grad_norm": 0.21895913779735565, "learning_rate": 5.508604329228028e-05, "loss": 0.1801, "step": 17775 }, { "epoch": 2.2393173158673676, "grad_norm": 0.19198501110076904, "learning_rate": 5.50009502884446e-05, "loss": 0.1764, "step": 17780 }, { "epoch": 2.239947098277545, "grad_norm": 0.21897682547569275, "learning_rate": 5.4915908301268724e-05, "loss": 0.1719, "step": 17785 }, { "epoch": 2.240576880687722, "grad_norm": 0.22070536017417908, "learning_rate": 5.483091737642198e-05, "loss": 0.1678, "step": 17790 }, { "epoch": 2.2412066630978997, "grad_norm": 0.2158748209476471, "learning_rate": 5.474597755954651e-05, "loss": 0.1703, "step": 17795 }, { "epoch": 2.241836445508077, "grad_norm": 0.21174906194210052, "learning_rate": 5.466108889625687e-05, "loss": 0.1698, "step": 17800 }, { "epoch": 2.2424662279182543, "grad_norm": 0.23331063985824585, "learning_rate": 5.457625143214029e-05, "loss": 0.1855, "step": 17805 }, { "epoch": 2.2430960103284314, "grad_norm": 0.2186896651983261, "learning_rate": 5.449146521275643e-05, "loss": 0.1629, "step": 17810 }, { "epoch": 2.243725792738609, "grad_norm": 0.22406966984272003, "learning_rate": 5.440673028363738e-05, "loss": 0.1731, "step": 17815 }, { "epoch": 2.244355575148786, "grad_norm": 0.21894322335720062, "learning_rate": 5.432204669028777e-05, "loss": 0.1671, "step": 17820 }, { "epoch": 2.2449853575589636, "grad_norm": 0.19151312112808228, "learning_rate": 5.4237414478184585e-05, "loss": 0.1634, "step": 17825 }, { "epoch": 2.2456151399691406, "grad_norm": 0.20597226917743683, "learning_rate": 5.415283369277729e-05, "loss": 0.1594, "step": 17830 }, { "epoch": 2.2462449223793177, "grad_norm": 0.23415236175060272, "learning_rate": 5.406830437948767e-05, "loss": 0.1667, "step": 17835 }, { "epoch": 2.2468747047894952, "grad_norm": 0.21160747110843658, "learning_rate": 5.398382658370986e-05, "loss": 0.1694, "step": 17840 }, { "epoch": 2.2475044871996723, "grad_norm": 0.2644958198070526, "learning_rate": 5.3899400350810466e-05, "loss": 0.1767, "step": 17845 }, { "epoch": 2.24813426960985, "grad_norm": 0.23654960095882416, "learning_rate": 5.381502572612826e-05, "loss": 0.1684, "step": 17850 }, { "epoch": 2.248764052020027, "grad_norm": 0.22581151127815247, "learning_rate": 5.373070275497439e-05, "loss": 0.1805, "step": 17855 }, { "epoch": 2.2493938344302045, "grad_norm": 0.21524479985237122, "learning_rate": 5.364643148263205e-05, "loss": 0.1753, "step": 17860 }, { "epoch": 2.2500236168403815, "grad_norm": 0.22853802144527435, "learning_rate": 5.3562211954357006e-05, "loss": 0.1752, "step": 17865 }, { "epoch": 2.250653399250559, "grad_norm": 0.19708101451396942, "learning_rate": 5.347804421537701e-05, "loss": 0.1701, "step": 17870 }, { "epoch": 2.251283181660736, "grad_norm": 0.22857971489429474, "learning_rate": 5.339392831089209e-05, "loss": 0.1662, "step": 17875 }, { "epoch": 2.2519129640709137, "grad_norm": 0.2373005598783493, "learning_rate": 5.33098642860743e-05, "loss": 0.1878, "step": 17880 }, { "epoch": 2.2525427464810908, "grad_norm": 0.22458739578723907, "learning_rate": 5.322585218606811e-05, "loss": 0.1711, "step": 17885 }, { "epoch": 2.253172528891268, "grad_norm": 0.24684731662273407, "learning_rate": 5.314189205598987e-05, "loss": 0.1833, "step": 17890 }, { "epoch": 2.2538023113014454, "grad_norm": 0.22604569792747498, "learning_rate": 5.3057983940928046e-05, "loss": 0.1683, "step": 17895 }, { "epoch": 2.2544320937116225, "grad_norm": 0.23015649616718292, "learning_rate": 5.2974127885943166e-05, "loss": 0.1793, "step": 17900 }, { "epoch": 2.2550618761218, "grad_norm": 0.2156984657049179, "learning_rate": 5.289032393606797e-05, "loss": 0.1816, "step": 17905 }, { "epoch": 2.255691658531977, "grad_norm": 0.2468300610780716, "learning_rate": 5.280657213630704e-05, "loss": 0.1795, "step": 17910 }, { "epoch": 2.2563214409421546, "grad_norm": 0.19326730072498322, "learning_rate": 5.2722872531637024e-05, "loss": 0.1726, "step": 17915 }, { "epoch": 2.2569512233523317, "grad_norm": 0.19111455976963043, "learning_rate": 5.2639225167006475e-05, "loss": 0.1709, "step": 17920 }, { "epoch": 2.257581005762509, "grad_norm": 0.24302569031715393, "learning_rate": 5.255563008733599e-05, "loss": 0.1752, "step": 17925 }, { "epoch": 2.2582107881726863, "grad_norm": 0.20797547698020935, "learning_rate": 5.247208733751801e-05, "loss": 0.1792, "step": 17930 }, { "epoch": 2.258840570582864, "grad_norm": 0.21642006933689117, "learning_rate": 5.238859696241689e-05, "loss": 0.1673, "step": 17935 }, { "epoch": 2.259470352993041, "grad_norm": 0.22728614509105682, "learning_rate": 5.2305159006868885e-05, "loss": 0.1793, "step": 17940 }, { "epoch": 2.260100135403218, "grad_norm": 0.24052174389362335, "learning_rate": 5.2221773515682035e-05, "loss": 0.1791, "step": 17945 }, { "epoch": 2.2607299178133955, "grad_norm": 0.21312139928340912, "learning_rate": 5.213844053363635e-05, "loss": 0.177, "step": 17950 }, { "epoch": 2.2613597002235726, "grad_norm": 0.22087723016738892, "learning_rate": 5.205516010548349e-05, "loss": 0.1764, "step": 17955 }, { "epoch": 2.26198948263375, "grad_norm": 0.24077439308166504, "learning_rate": 5.1971932275946967e-05, "loss": 0.1884, "step": 17960 }, { "epoch": 2.262619265043927, "grad_norm": 0.2120356261730194, "learning_rate": 5.188875708972198e-05, "loss": 0.173, "step": 17965 }, { "epoch": 2.2632490474541047, "grad_norm": 0.24573729932308197, "learning_rate": 5.1805634591475555e-05, "loss": 0.1824, "step": 17970 }, { "epoch": 2.263878829864282, "grad_norm": 0.20354896783828735, "learning_rate": 5.1722564825846336e-05, "loss": 0.1738, "step": 17975 }, { "epoch": 2.2645086122744593, "grad_norm": 0.2105248123407364, "learning_rate": 5.1639547837444725e-05, "loss": 0.1694, "step": 17980 }, { "epoch": 2.2651383946846364, "grad_norm": 0.21009747684001923, "learning_rate": 5.1556583670852636e-05, "loss": 0.1773, "step": 17985 }, { "epoch": 2.265768177094814, "grad_norm": 0.21542850136756897, "learning_rate": 5.147367237062387e-05, "loss": 0.1682, "step": 17990 }, { "epoch": 2.266397959504991, "grad_norm": 0.20584627985954285, "learning_rate": 5.1390813981283676e-05, "loss": 0.1734, "step": 17995 }, { "epoch": 2.267027741915168, "grad_norm": 0.2486305981874466, "learning_rate": 5.130800854732877e-05, "loss": 0.1825, "step": 18000 }, { "epoch": 2.267027741915168, "eval_loss": 0.35427358746528625, "eval_runtime": 6.1591, "eval_samples_per_second": 162.361, "eval_steps_per_second": 10.229, "step": 18000 }, { "epoch": 2.2676575243253456, "grad_norm": 0.19808907806873322, "learning_rate": 5.122525611322761e-05, "loss": 0.1625, "step": 18005 }, { "epoch": 2.2682873067355227, "grad_norm": 0.24098962545394897, "learning_rate": 5.114255672342022e-05, "loss": 0.1687, "step": 18010 }, { "epoch": 2.2689170891457002, "grad_norm": 0.22834831476211548, "learning_rate": 5.105991042231799e-05, "loss": 0.1695, "step": 18015 }, { "epoch": 2.2695468715558773, "grad_norm": 0.19950784742832184, "learning_rate": 5.097731725430392e-05, "loss": 0.1692, "step": 18020 }, { "epoch": 2.270176653966055, "grad_norm": 0.23613286018371582, "learning_rate": 5.0894777263732405e-05, "loss": 0.176, "step": 18025 }, { "epoch": 2.270806436376232, "grad_norm": 0.2248247116804123, "learning_rate": 5.081229049492929e-05, "loss": 0.1638, "step": 18030 }, { "epoch": 2.2714362187864094, "grad_norm": 0.21063442528247833, "learning_rate": 5.072985699219186e-05, "loss": 0.1696, "step": 18035 }, { "epoch": 2.2720660011965865, "grad_norm": 0.26251456141471863, "learning_rate": 5.064747679978881e-05, "loss": 0.1784, "step": 18040 }, { "epoch": 2.272695783606764, "grad_norm": 0.20396436750888824, "learning_rate": 5.056514996196011e-05, "loss": 0.1733, "step": 18045 }, { "epoch": 2.273325566016941, "grad_norm": 0.21515126526355743, "learning_rate": 5.048287652291728e-05, "loss": 0.1625, "step": 18050 }, { "epoch": 2.273955348427118, "grad_norm": 0.24371370673179626, "learning_rate": 5.0400656526842946e-05, "loss": 0.1739, "step": 18055 }, { "epoch": 2.2745851308372957, "grad_norm": 0.22852087020874023, "learning_rate": 5.03184900178912e-05, "loss": 0.171, "step": 18060 }, { "epoch": 2.275214913247473, "grad_norm": 0.22659562528133392, "learning_rate": 5.023637704018719e-05, "loss": 0.1769, "step": 18065 }, { "epoch": 2.2758446956576504, "grad_norm": 0.2462269514799118, "learning_rate": 5.01543176378276e-05, "loss": 0.1731, "step": 18070 }, { "epoch": 2.2764744780678274, "grad_norm": 0.21395175158977509, "learning_rate": 5.007231185488016e-05, "loss": 0.1705, "step": 18075 }, { "epoch": 2.277104260478005, "grad_norm": 0.2166956514120102, "learning_rate": 4.9990359735383837e-05, "loss": 0.1671, "step": 18080 }, { "epoch": 2.277734042888182, "grad_norm": 0.23139755427837372, "learning_rate": 4.9908461323348754e-05, "loss": 0.1785, "step": 18085 }, { "epoch": 2.2783638252983596, "grad_norm": 0.23193643987178802, "learning_rate": 4.982661666275632e-05, "loss": 0.1746, "step": 18090 }, { "epoch": 2.2789936077085367, "grad_norm": 0.21008536219596863, "learning_rate": 4.974482579755899e-05, "loss": 0.1784, "step": 18095 }, { "epoch": 2.279623390118714, "grad_norm": 0.23688139021396637, "learning_rate": 4.9663088771680235e-05, "loss": 0.1812, "step": 18100 }, { "epoch": 2.2802531725288913, "grad_norm": 0.20811019837856293, "learning_rate": 4.958140562901468e-05, "loss": 0.1721, "step": 18105 }, { "epoch": 2.2808829549390683, "grad_norm": 0.2096734642982483, "learning_rate": 4.9499776413428167e-05, "loss": 0.1697, "step": 18110 }, { "epoch": 2.281512737349246, "grad_norm": 0.22839121520519257, "learning_rate": 4.9418201168757386e-05, "loss": 0.1729, "step": 18115 }, { "epoch": 2.282142519759423, "grad_norm": 0.21908484399318695, "learning_rate": 4.9336679938810106e-05, "loss": 0.1659, "step": 18120 }, { "epoch": 2.2827723021696005, "grad_norm": 0.20620904862880707, "learning_rate": 4.925521276736511e-05, "loss": 0.1636, "step": 18125 }, { "epoch": 2.2834020845797776, "grad_norm": 0.28344854712486267, "learning_rate": 4.9173799698172095e-05, "loss": 0.1753, "step": 18130 }, { "epoch": 2.284031866989955, "grad_norm": 0.2172774374485016, "learning_rate": 4.909244077495175e-05, "loss": 0.1702, "step": 18135 }, { "epoch": 2.284661649400132, "grad_norm": 0.19668060541152954, "learning_rate": 4.90111360413957e-05, "loss": 0.1715, "step": 18140 }, { "epoch": 2.2852914318103097, "grad_norm": 0.19766007363796234, "learning_rate": 4.892988554116642e-05, "loss": 0.1608, "step": 18145 }, { "epoch": 2.2859212142204868, "grad_norm": 0.2108301967382431, "learning_rate": 4.884868931789724e-05, "loss": 0.1633, "step": 18150 }, { "epoch": 2.2865509966306643, "grad_norm": 0.25781720876693726, "learning_rate": 4.8767547415192476e-05, "loss": 0.1634, "step": 18155 }, { "epoch": 2.2871807790408414, "grad_norm": 0.21515868604183197, "learning_rate": 4.8686459876627164e-05, "loss": 0.1687, "step": 18160 }, { "epoch": 2.2878105614510185, "grad_norm": 0.23936854302883148, "learning_rate": 4.860542674574713e-05, "loss": 0.1786, "step": 18165 }, { "epoch": 2.288440343861196, "grad_norm": 0.2083710879087448, "learning_rate": 4.852444806606904e-05, "loss": 0.1727, "step": 18170 }, { "epoch": 2.289070126271373, "grad_norm": 0.24087072908878326, "learning_rate": 4.844352388108028e-05, "loss": 0.1646, "step": 18175 }, { "epoch": 2.2896999086815506, "grad_norm": 0.22956833243370056, "learning_rate": 4.836265423423898e-05, "loss": 0.1667, "step": 18180 }, { "epoch": 2.2903296910917277, "grad_norm": 0.2500525414943695, "learning_rate": 4.828183916897402e-05, "loss": 0.1788, "step": 18185 }, { "epoch": 2.290959473501905, "grad_norm": 0.23779354989528656, "learning_rate": 4.820107872868486e-05, "loss": 0.1687, "step": 18190 }, { "epoch": 2.2915892559120823, "grad_norm": 0.21519017219543457, "learning_rate": 4.81203729567418e-05, "loss": 0.173, "step": 18195 }, { "epoch": 2.29221903832226, "grad_norm": 0.2123459428548813, "learning_rate": 4.803972189648568e-05, "loss": 0.1648, "step": 18200 }, { "epoch": 2.292848820732437, "grad_norm": 0.2364078015089035, "learning_rate": 4.795912559122789e-05, "loss": 0.1743, "step": 18205 }, { "epoch": 2.2934786031426144, "grad_norm": 0.23717305064201355, "learning_rate": 4.787858408425045e-05, "loss": 0.1827, "step": 18210 }, { "epoch": 2.2941083855527915, "grad_norm": 0.197091206908226, "learning_rate": 4.7798097418806134e-05, "loss": 0.1713, "step": 18215 }, { "epoch": 2.2947381679629686, "grad_norm": 0.19760344922542572, "learning_rate": 4.771766563811803e-05, "loss": 0.1612, "step": 18220 }, { "epoch": 2.295367950373146, "grad_norm": 0.22046242654323578, "learning_rate": 4.763728878537984e-05, "loss": 0.1691, "step": 18225 }, { "epoch": 2.295997732783323, "grad_norm": 0.22356641292572021, "learning_rate": 4.755696690375574e-05, "loss": 0.1684, "step": 18230 }, { "epoch": 2.2966275151935007, "grad_norm": 0.20664890110492706, "learning_rate": 4.7476700036380565e-05, "loss": 0.1656, "step": 18235 }, { "epoch": 2.297257297603678, "grad_norm": 0.2873956859111786, "learning_rate": 4.73964882263593e-05, "loss": 0.1811, "step": 18240 }, { "epoch": 2.2978870800138553, "grad_norm": 0.23324726521968842, "learning_rate": 4.7316331516767575e-05, "loss": 0.17, "step": 18245 }, { "epoch": 2.2985168624240324, "grad_norm": 0.22407886385917664, "learning_rate": 4.7236229950651314e-05, "loss": 0.1589, "step": 18250 }, { "epoch": 2.29914664483421, "grad_norm": 0.2202986776828766, "learning_rate": 4.7156183571026985e-05, "loss": 0.1806, "step": 18255 }, { "epoch": 2.299776427244387, "grad_norm": 0.1998445987701416, "learning_rate": 4.707619242088129e-05, "loss": 0.1571, "step": 18260 }, { "epoch": 2.3004062096545645, "grad_norm": 0.24477636814117432, "learning_rate": 4.69962565431713e-05, "loss": 0.1788, "step": 18265 }, { "epoch": 2.3010359920647416, "grad_norm": 0.2186649590730667, "learning_rate": 4.691637598082439e-05, "loss": 0.1837, "step": 18270 }, { "epoch": 2.3016657744749187, "grad_norm": 0.19296254217624664, "learning_rate": 4.683655077673826e-05, "loss": 0.1609, "step": 18275 }, { "epoch": 2.3022955568850962, "grad_norm": 0.234447181224823, "learning_rate": 4.675678097378086e-05, "loss": 0.1711, "step": 18280 }, { "epoch": 2.3029253392952733, "grad_norm": 0.19974513351917267, "learning_rate": 4.667706661479041e-05, "loss": 0.1666, "step": 18285 }, { "epoch": 2.303555121705451, "grad_norm": 0.23064357042312622, "learning_rate": 4.659740774257527e-05, "loss": 0.1684, "step": 18290 }, { "epoch": 2.304184904115628, "grad_norm": 0.19428302347660065, "learning_rate": 4.6517804399914214e-05, "loss": 0.166, "step": 18295 }, { "epoch": 2.3048146865258055, "grad_norm": 0.23040397465229034, "learning_rate": 4.6438256629555956e-05, "loss": 0.1687, "step": 18300 }, { "epoch": 2.3054444689359825, "grad_norm": 0.22161847352981567, "learning_rate": 4.635876447421955e-05, "loss": 0.1784, "step": 18305 }, { "epoch": 2.30607425134616, "grad_norm": 0.22831936180591583, "learning_rate": 4.6279327976593924e-05, "loss": 0.1731, "step": 18310 }, { "epoch": 2.306704033756337, "grad_norm": 0.25957801938056946, "learning_rate": 4.619994717933848e-05, "loss": 0.1823, "step": 18315 }, { "epoch": 2.3073338161665147, "grad_norm": 0.23449194431304932, "learning_rate": 4.6120622125082426e-05, "loss": 0.1725, "step": 18320 }, { "epoch": 2.3079635985766918, "grad_norm": 0.24584275484085083, "learning_rate": 4.604135285642514e-05, "loss": 0.1857, "step": 18325 }, { "epoch": 2.308593380986869, "grad_norm": 0.21245352923870087, "learning_rate": 4.5962139415936056e-05, "loss": 0.164, "step": 18330 }, { "epoch": 2.3092231633970464, "grad_norm": 0.2068212777376175, "learning_rate": 4.588298184615453e-05, "loss": 0.1661, "step": 18335 }, { "epoch": 2.3098529458072234, "grad_norm": 0.21349553763866425, "learning_rate": 4.580388018959013e-05, "loss": 0.1707, "step": 18340 }, { "epoch": 2.310482728217401, "grad_norm": 0.2073366641998291, "learning_rate": 4.5724834488722106e-05, "loss": 0.1608, "step": 18345 }, { "epoch": 2.311112510627578, "grad_norm": 0.2493850737810135, "learning_rate": 4.564584478599982e-05, "loss": 0.176, "step": 18350 }, { "epoch": 2.3117422930377556, "grad_norm": 0.25253990292549133, "learning_rate": 4.556691112384262e-05, "loss": 0.1744, "step": 18355 }, { "epoch": 2.3123720754479327, "grad_norm": 0.24499280750751495, "learning_rate": 4.548803354463967e-05, "loss": 0.1755, "step": 18360 }, { "epoch": 2.31300185785811, "grad_norm": 0.21188803017139435, "learning_rate": 4.540921209075e-05, "loss": 0.1675, "step": 18365 }, { "epoch": 2.3136316402682873, "grad_norm": 0.2255249321460724, "learning_rate": 4.5330446804502543e-05, "loss": 0.1668, "step": 18370 }, { "epoch": 2.314261422678465, "grad_norm": 0.2088666409254074, "learning_rate": 4.525173772819606e-05, "loss": 0.173, "step": 18375 }, { "epoch": 2.314891205088642, "grad_norm": 0.24474313855171204, "learning_rate": 4.517308490409912e-05, "loss": 0.1672, "step": 18380 }, { "epoch": 2.315520987498819, "grad_norm": 0.2033611238002777, "learning_rate": 4.5094488374450085e-05, "loss": 0.1677, "step": 18385 }, { "epoch": 2.3161507699089965, "grad_norm": 0.22693341970443726, "learning_rate": 4.50159481814571e-05, "loss": 0.1653, "step": 18390 }, { "epoch": 2.3167805523191736, "grad_norm": 0.24162709712982178, "learning_rate": 4.493746436729797e-05, "loss": 0.1668, "step": 18395 }, { "epoch": 2.317410334729351, "grad_norm": 0.21281133592128754, "learning_rate": 4.485903697412041e-05, "loss": 0.167, "step": 18400 }, { "epoch": 2.318040117139528, "grad_norm": 0.2348182648420334, "learning_rate": 4.478066604404168e-05, "loss": 0.1683, "step": 18405 }, { "epoch": 2.3186698995497057, "grad_norm": 0.2391456663608551, "learning_rate": 4.470235161914878e-05, "loss": 0.1708, "step": 18410 }, { "epoch": 2.319299681959883, "grad_norm": 0.2014867216348648, "learning_rate": 4.462409374149822e-05, "loss": 0.1679, "step": 18415 }, { "epoch": 2.3199294643700603, "grad_norm": 0.19464534521102905, "learning_rate": 4.4545892453116414e-05, "loss": 0.167, "step": 18420 }, { "epoch": 2.3205592467802374, "grad_norm": 0.18525034189224243, "learning_rate": 4.446774779599918e-05, "loss": 0.16, "step": 18425 }, { "epoch": 2.321189029190415, "grad_norm": 0.220379039645195, "learning_rate": 4.438965981211201e-05, "loss": 0.1728, "step": 18430 }, { "epoch": 2.321818811600592, "grad_norm": 0.22186563909053802, "learning_rate": 4.431162854338985e-05, "loss": 0.1651, "step": 18435 }, { "epoch": 2.322448594010769, "grad_norm": 0.22272159159183502, "learning_rate": 4.423365403173739e-05, "loss": 0.171, "step": 18440 }, { "epoch": 2.3230783764209466, "grad_norm": 0.220636785030365, "learning_rate": 4.4155736319028725e-05, "loss": 0.1691, "step": 18445 }, { "epoch": 2.3237081588311237, "grad_norm": 0.22500810027122498, "learning_rate": 4.4077875447107356e-05, "loss": 0.1648, "step": 18450 }, { "epoch": 2.324337941241301, "grad_norm": 0.2163766771554947, "learning_rate": 4.4000071457786335e-05, "loss": 0.1655, "step": 18455 }, { "epoch": 2.3249677236514783, "grad_norm": 0.2258923053741455, "learning_rate": 4.392232439284829e-05, "loss": 0.1704, "step": 18460 }, { "epoch": 2.325597506061656, "grad_norm": 0.23461341857910156, "learning_rate": 4.384463429404511e-05, "loss": 0.1686, "step": 18465 }, { "epoch": 2.326227288471833, "grad_norm": 0.22406549751758575, "learning_rate": 4.376700120309816e-05, "loss": 0.1655, "step": 18470 }, { "epoch": 2.3268570708820104, "grad_norm": 0.21646642684936523, "learning_rate": 4.368942516169819e-05, "loss": 0.1682, "step": 18475 }, { "epoch": 2.3274868532921875, "grad_norm": 0.23925819993019104, "learning_rate": 4.3611906211505284e-05, "loss": 0.1746, "step": 18480 }, { "epoch": 2.328116635702365, "grad_norm": 0.19920630753040314, "learning_rate": 4.35344443941489e-05, "loss": 0.158, "step": 18485 }, { "epoch": 2.328746418112542, "grad_norm": 0.2575379014015198, "learning_rate": 4.345703975122783e-05, "loss": 0.1708, "step": 18490 }, { "epoch": 2.329376200522719, "grad_norm": 0.19556741416454315, "learning_rate": 4.3379692324310056e-05, "loss": 0.1677, "step": 18495 }, { "epoch": 2.3300059829328967, "grad_norm": 0.2595387101173401, "learning_rate": 4.3302402154933005e-05, "loss": 0.1705, "step": 18500 }, { "epoch": 2.330635765343074, "grad_norm": 0.21318422257900238, "learning_rate": 4.322516928460325e-05, "loss": 0.1676, "step": 18505 }, { "epoch": 2.3312655477532513, "grad_norm": 0.2212359607219696, "learning_rate": 4.3147993754796624e-05, "loss": 0.1661, "step": 18510 }, { "epoch": 2.3318953301634284, "grad_norm": 0.1886136680841446, "learning_rate": 4.3070875606958006e-05, "loss": 0.1613, "step": 18515 }, { "epoch": 2.332525112573606, "grad_norm": 0.23505628108978271, "learning_rate": 4.2993814882501754e-05, "loss": 0.1687, "step": 18520 }, { "epoch": 2.333154894983783, "grad_norm": 0.18686296045780182, "learning_rate": 4.2916811622811195e-05, "loss": 0.1613, "step": 18525 }, { "epoch": 2.3337846773939606, "grad_norm": 0.21165959537029266, "learning_rate": 4.2839865869238845e-05, "loss": 0.1604, "step": 18530 }, { "epoch": 2.3344144598041376, "grad_norm": 0.29806169867515564, "learning_rate": 4.27629776631063e-05, "loss": 0.1682, "step": 18535 }, { "epoch": 2.335044242214315, "grad_norm": 0.2488899528980255, "learning_rate": 4.268614704570426e-05, "loss": 0.1758, "step": 18540 }, { "epoch": 2.3356740246244923, "grad_norm": 0.21834008395671844, "learning_rate": 4.2609374058292666e-05, "loss": 0.1587, "step": 18545 }, { "epoch": 2.3363038070346693, "grad_norm": 0.22900566458702087, "learning_rate": 4.253265874210022e-05, "loss": 0.1798, "step": 18550 }, { "epoch": 2.336933589444847, "grad_norm": 0.22346030175685883, "learning_rate": 4.2456001138324794e-05, "loss": 0.1656, "step": 18555 }, { "epoch": 2.337563371855024, "grad_norm": 0.22244654595851898, "learning_rate": 4.237940128813336e-05, "loss": 0.1734, "step": 18560 }, { "epoch": 2.3381931542652015, "grad_norm": 0.19254350662231445, "learning_rate": 4.230285923266175e-05, "loss": 0.1619, "step": 18565 }, { "epoch": 2.3388229366753785, "grad_norm": 0.22871673107147217, "learning_rate": 4.222637501301481e-05, "loss": 0.166, "step": 18570 }, { "epoch": 2.339452719085556, "grad_norm": 0.20270411670207977, "learning_rate": 4.2149948670266284e-05, "loss": 0.1637, "step": 18575 }, { "epoch": 2.340082501495733, "grad_norm": 0.23636558651924133, "learning_rate": 4.2073580245458874e-05, "loss": 0.1839, "step": 18580 }, { "epoch": 2.3407122839059107, "grad_norm": 0.24934862554073334, "learning_rate": 4.1997269779604185e-05, "loss": 0.1661, "step": 18585 }, { "epoch": 2.3413420663160878, "grad_norm": 0.2234071046113968, "learning_rate": 4.192101731368267e-05, "loss": 0.1699, "step": 18590 }, { "epoch": 2.3419718487262653, "grad_norm": 0.20725548267364502, "learning_rate": 4.1844822888643634e-05, "loss": 0.1663, "step": 18595 }, { "epoch": 2.3426016311364424, "grad_norm": 0.22668230533599854, "learning_rate": 4.1768686545405186e-05, "loss": 0.1647, "step": 18600 }, { "epoch": 2.3432314135466195, "grad_norm": 0.23123641312122345, "learning_rate": 4.1692608324854384e-05, "loss": 0.171, "step": 18605 }, { "epoch": 2.343861195956797, "grad_norm": 0.21715596318244934, "learning_rate": 4.161658826784692e-05, "loss": 0.1631, "step": 18610 }, { "epoch": 2.344490978366974, "grad_norm": 0.24206319451332092, "learning_rate": 4.154062641520732e-05, "loss": 0.1724, "step": 18615 }, { "epoch": 2.3451207607771516, "grad_norm": 0.21535861492156982, "learning_rate": 4.1464722807728724e-05, "loss": 0.1673, "step": 18620 }, { "epoch": 2.3457505431873287, "grad_norm": 0.24345341324806213, "learning_rate": 4.1388877486173245e-05, "loss": 0.1648, "step": 18625 }, { "epoch": 2.346380325597506, "grad_norm": 0.2361554056406021, "learning_rate": 4.131309049127149e-05, "loss": 0.1624, "step": 18630 }, { "epoch": 2.3470101080076833, "grad_norm": 0.20666177570819855, "learning_rate": 4.1237361863722816e-05, "loss": 0.1662, "step": 18635 }, { "epoch": 2.347639890417861, "grad_norm": 0.22876566648483276, "learning_rate": 4.1161691644195165e-05, "loss": 0.1767, "step": 18640 }, { "epoch": 2.348269672828038, "grad_norm": 0.19370432198047638, "learning_rate": 4.108607987332529e-05, "loss": 0.1604, "step": 18645 }, { "epoch": 2.3488994552382154, "grad_norm": 0.22485142946243286, "learning_rate": 4.101052659171842e-05, "loss": 0.1667, "step": 18650 }, { "epoch": 2.3495292376483925, "grad_norm": 0.2446049600839615, "learning_rate": 4.0935031839948315e-05, "loss": 0.1719, "step": 18655 }, { "epoch": 2.3501590200585696, "grad_norm": 0.22652800381183624, "learning_rate": 4.0859595658557367e-05, "loss": 0.1666, "step": 18660 }, { "epoch": 2.350788802468747, "grad_norm": 0.1760840266942978, "learning_rate": 4.078421808805663e-05, "loss": 0.1516, "step": 18665 }, { "epoch": 2.351418584878924, "grad_norm": 0.20791617035865784, "learning_rate": 4.070889916892553e-05, "loss": 0.164, "step": 18670 }, { "epoch": 2.3520483672891017, "grad_norm": 0.2205626517534256, "learning_rate": 4.063363894161206e-05, "loss": 0.1669, "step": 18675 }, { "epoch": 2.352678149699279, "grad_norm": 0.23379269242286682, "learning_rate": 4.055843744653266e-05, "loss": 0.1593, "step": 18680 }, { "epoch": 2.3533079321094563, "grad_norm": 0.23451068997383118, "learning_rate": 4.0483294724072254e-05, "loss": 0.1633, "step": 18685 }, { "epoch": 2.3539377145196334, "grad_norm": 0.28889602422714233, "learning_rate": 4.040821081458422e-05, "loss": 0.1752, "step": 18690 }, { "epoch": 2.354567496929811, "grad_norm": 0.2054235339164734, "learning_rate": 4.0333185758390307e-05, "loss": 0.1666, "step": 18695 }, { "epoch": 2.355197279339988, "grad_norm": 0.19770711660385132, "learning_rate": 4.025821959578067e-05, "loss": 0.1701, "step": 18700 }, { "epoch": 2.3558270617501655, "grad_norm": 0.25233033299446106, "learning_rate": 4.0183312367013906e-05, "loss": 0.1722, "step": 18705 }, { "epoch": 2.3564568441603426, "grad_norm": 0.20867769420146942, "learning_rate": 4.010846411231689e-05, "loss": 0.1601, "step": 18710 }, { "epoch": 2.3570866265705197, "grad_norm": 0.21671661734580994, "learning_rate": 4.003367487188483e-05, "loss": 0.1658, "step": 18715 }, { "epoch": 2.3577164089806972, "grad_norm": 0.17957130074501038, "learning_rate": 3.9958944685881265e-05, "loss": 0.1619, "step": 18720 }, { "epoch": 2.3583461913908743, "grad_norm": 0.21048414707183838, "learning_rate": 3.988427359443802e-05, "loss": 0.1668, "step": 18725 }, { "epoch": 2.358975973801052, "grad_norm": 0.21969716250896454, "learning_rate": 3.980966163765513e-05, "loss": 0.1619, "step": 18730 }, { "epoch": 2.359605756211229, "grad_norm": 0.22368858754634857, "learning_rate": 3.9735108855600984e-05, "loss": 0.168, "step": 18735 }, { "epoch": 2.3602355386214064, "grad_norm": 0.2626504600048065, "learning_rate": 3.966061528831209e-05, "loss": 0.1651, "step": 18740 }, { "epoch": 2.3608653210315835, "grad_norm": 0.21985310316085815, "learning_rate": 3.958618097579316e-05, "loss": 0.1671, "step": 18745 }, { "epoch": 2.361495103441761, "grad_norm": 0.22451792657375336, "learning_rate": 3.9511805958017205e-05, "loss": 0.1609, "step": 18750 }, { "epoch": 2.362124885851938, "grad_norm": 0.2123977243900299, "learning_rate": 3.943749027492532e-05, "loss": 0.1719, "step": 18755 }, { "epoch": 2.3627546682621157, "grad_norm": 0.2234313040971756, "learning_rate": 3.936323396642658e-05, "loss": 0.1556, "step": 18760 }, { "epoch": 2.3633844506722927, "grad_norm": 0.19645099341869354, "learning_rate": 3.928903707239846e-05, "loss": 0.1673, "step": 18765 }, { "epoch": 2.36401423308247, "grad_norm": 0.22249870002269745, "learning_rate": 3.9214899632686334e-05, "loss": 0.1589, "step": 18770 }, { "epoch": 2.3646440154926474, "grad_norm": 0.2180803418159485, "learning_rate": 3.914082168710369e-05, "loss": 0.1685, "step": 18775 }, { "epoch": 2.3652737979028244, "grad_norm": 0.2156234085559845, "learning_rate": 3.906680327543212e-05, "loss": 0.1613, "step": 18780 }, { "epoch": 2.365903580313002, "grad_norm": 0.2180781066417694, "learning_rate": 3.899284443742112e-05, "loss": 0.1654, "step": 18785 }, { "epoch": 2.366533362723179, "grad_norm": 0.2102290391921997, "learning_rate": 3.89189452127884e-05, "loss": 0.1635, "step": 18790 }, { "epoch": 2.3671631451333566, "grad_norm": 0.26211512088775635, "learning_rate": 3.884510564121944e-05, "loss": 0.174, "step": 18795 }, { "epoch": 2.3677929275435337, "grad_norm": 0.1999218463897705, "learning_rate": 3.877132576236778e-05, "loss": 0.1619, "step": 18800 }, { "epoch": 2.368422709953711, "grad_norm": 0.21774223446846008, "learning_rate": 3.8697605615854875e-05, "loss": 0.1616, "step": 18805 }, { "epoch": 2.3690524923638883, "grad_norm": 0.2304651439189911, "learning_rate": 3.862394524127023e-05, "loss": 0.1705, "step": 18810 }, { "epoch": 2.369682274774066, "grad_norm": 0.24826854467391968, "learning_rate": 3.8550344678171084e-05, "loss": 0.1734, "step": 18815 }, { "epoch": 2.370312057184243, "grad_norm": 0.21676623821258545, "learning_rate": 3.847680396608262e-05, "loss": 0.1669, "step": 18820 }, { "epoch": 2.37094183959442, "grad_norm": 0.21203717589378357, "learning_rate": 3.840332314449788e-05, "loss": 0.1633, "step": 18825 }, { "epoch": 2.3715716220045975, "grad_norm": 0.2409755140542984, "learning_rate": 3.832990225287776e-05, "loss": 0.1687, "step": 18830 }, { "epoch": 2.3722014044147746, "grad_norm": 0.19998323917388916, "learning_rate": 3.825654133065094e-05, "loss": 0.1578, "step": 18835 }, { "epoch": 2.372831186824952, "grad_norm": 0.22083517909049988, "learning_rate": 3.818324041721391e-05, "loss": 0.1721, "step": 18840 }, { "epoch": 2.373460969235129, "grad_norm": 0.19865678250789642, "learning_rate": 3.8109999551930914e-05, "loss": 0.1613, "step": 18845 }, { "epoch": 2.3740907516453067, "grad_norm": 0.2167719304561615, "learning_rate": 3.8036818774134037e-05, "loss": 0.1569, "step": 18850 }, { "epoch": 2.374720534055484, "grad_norm": 0.2173914611339569, "learning_rate": 3.796369812312298e-05, "loss": 0.1676, "step": 18855 }, { "epoch": 2.3753503164656613, "grad_norm": 0.22559495270252228, "learning_rate": 3.7890637638165255e-05, "loss": 0.169, "step": 18860 }, { "epoch": 2.3759800988758384, "grad_norm": 0.2124035507440567, "learning_rate": 3.781763735849589e-05, "loss": 0.1715, "step": 18865 }, { "epoch": 2.376609881286016, "grad_norm": 0.23133991658687592, "learning_rate": 3.774469732331782e-05, "loss": 0.162, "step": 18870 }, { "epoch": 2.377239663696193, "grad_norm": 0.1754513680934906, "learning_rate": 3.7671817571801464e-05, "loss": 0.1602, "step": 18875 }, { "epoch": 2.37786944610637, "grad_norm": 0.2158019244670868, "learning_rate": 3.7598998143084924e-05, "loss": 0.1571, "step": 18880 }, { "epoch": 2.3784992285165476, "grad_norm": 0.20694270730018616, "learning_rate": 3.752623907627388e-05, "loss": 0.162, "step": 18885 }, { "epoch": 2.3791290109267247, "grad_norm": 0.250929057598114, "learning_rate": 3.7453540410441604e-05, "loss": 0.1744, "step": 18890 }, { "epoch": 2.379758793336902, "grad_norm": 0.23653000593185425, "learning_rate": 3.738090218462903e-05, "loss": 0.1789, "step": 18895 }, { "epoch": 2.3803885757470793, "grad_norm": 0.1936427801847458, "learning_rate": 3.730832443784443e-05, "loss": 0.1532, "step": 18900 }, { "epoch": 2.381018358157257, "grad_norm": 0.188064306974411, "learning_rate": 3.7235807209063716e-05, "loss": 0.1629, "step": 18905 }, { "epoch": 2.381648140567434, "grad_norm": 0.2069697082042694, "learning_rate": 3.71633505372304e-05, "loss": 0.1568, "step": 18910 }, { "epoch": 2.3822779229776114, "grad_norm": 0.21809029579162598, "learning_rate": 3.709095446125529e-05, "loss": 0.1717, "step": 18915 }, { "epoch": 2.3829077053877885, "grad_norm": 0.23560817539691925, "learning_rate": 3.701861902001675e-05, "loss": 0.1662, "step": 18920 }, { "epoch": 2.383537487797966, "grad_norm": 0.19693933427333832, "learning_rate": 3.694634425236057e-05, "loss": 0.1558, "step": 18925 }, { "epoch": 2.384167270208143, "grad_norm": 0.19060872495174408, "learning_rate": 3.687413019709994e-05, "loss": 0.1621, "step": 18930 }, { "epoch": 2.38479705261832, "grad_norm": 0.2021481990814209, "learning_rate": 3.680197689301548e-05, "loss": 0.1551, "step": 18935 }, { "epoch": 2.3854268350284977, "grad_norm": 0.22511224448680878, "learning_rate": 3.672988437885512e-05, "loss": 0.1587, "step": 18940 }, { "epoch": 2.386056617438675, "grad_norm": 0.2018289864063263, "learning_rate": 3.665785269333423e-05, "loss": 0.1654, "step": 18945 }, { "epoch": 2.3866863998488523, "grad_norm": 0.21350149810314178, "learning_rate": 3.65858818751354e-05, "loss": 0.1673, "step": 18950 }, { "epoch": 2.3873161822590294, "grad_norm": 0.21213771402835846, "learning_rate": 3.65139719629087e-05, "loss": 0.1791, "step": 18955 }, { "epoch": 2.387945964669207, "grad_norm": 0.24175149202346802, "learning_rate": 3.644212299527139e-05, "loss": 0.1714, "step": 18960 }, { "epoch": 2.388575747079384, "grad_norm": 0.2541513741016388, "learning_rate": 3.63703350108079e-05, "loss": 0.1685, "step": 18965 }, { "epoch": 2.389205529489561, "grad_norm": 0.24447733163833618, "learning_rate": 3.629860804807011e-05, "loss": 0.1728, "step": 18970 }, { "epoch": 2.3898353118997386, "grad_norm": 0.1830032914876938, "learning_rate": 3.622694214557702e-05, "loss": 0.1698, "step": 18975 }, { "epoch": 2.390465094309916, "grad_norm": 0.23851166665554047, "learning_rate": 3.6155337341814844e-05, "loss": 0.1754, "step": 18980 }, { "epoch": 2.3910948767200932, "grad_norm": 0.1973876655101776, "learning_rate": 3.608379367523702e-05, "loss": 0.1703, "step": 18985 }, { "epoch": 2.3917246591302703, "grad_norm": 0.2209198772907257, "learning_rate": 3.6012311184264046e-05, "loss": 0.1674, "step": 18990 }, { "epoch": 2.392354441540448, "grad_norm": 0.216825932264328, "learning_rate": 3.5940889907283834e-05, "loss": 0.1677, "step": 18995 }, { "epoch": 2.392984223950625, "grad_norm": 0.1855764538049698, "learning_rate": 3.586952988265106e-05, "loss": 0.1592, "step": 19000 }, { "epoch": 2.392984223950625, "eval_loss": 0.35235053300857544, "eval_runtime": 6.1677, "eval_samples_per_second": 162.135, "eval_steps_per_second": 10.215, "step": 19000 }, { "epoch": 2.3936140063608025, "grad_norm": 0.25512510538101196, "learning_rate": 3.579823114868778e-05, "loss": 0.1649, "step": 19005 }, { "epoch": 2.3942437887709795, "grad_norm": 0.20220012962818146, "learning_rate": 3.572699374368296e-05, "loss": 0.1638, "step": 19010 }, { "epoch": 2.394873571181157, "grad_norm": 0.21141274273395538, "learning_rate": 3.5655817705892814e-05, "loss": 0.1697, "step": 19015 }, { "epoch": 2.395503353591334, "grad_norm": 0.23368066549301147, "learning_rate": 3.558470307354046e-05, "loss": 0.1653, "step": 19020 }, { "epoch": 2.3961331360015112, "grad_norm": 0.20436784625053406, "learning_rate": 3.5513649884816064e-05, "loss": 0.1561, "step": 19025 }, { "epoch": 2.3967629184116888, "grad_norm": 0.223700612783432, "learning_rate": 3.5442658177876835e-05, "loss": 0.1693, "step": 19030 }, { "epoch": 2.3973927008218663, "grad_norm": 0.26057204604148865, "learning_rate": 3.5371727990846944e-05, "loss": 0.1767, "step": 19035 }, { "epoch": 2.3980224832320434, "grad_norm": 0.21637168526649475, "learning_rate": 3.53008593618175e-05, "loss": 0.1671, "step": 19040 }, { "epoch": 2.3986522656422204, "grad_norm": 0.23995353281497955, "learning_rate": 3.5230052328846585e-05, "loss": 0.1788, "step": 19045 }, { "epoch": 2.399282048052398, "grad_norm": 0.2069759964942932, "learning_rate": 3.5159306929959144e-05, "loss": 0.1655, "step": 19050 }, { "epoch": 2.399911830462575, "grad_norm": 0.20498618483543396, "learning_rate": 3.508862320314717e-05, "loss": 0.1589, "step": 19055 }, { "epoch": 2.4005416128727526, "grad_norm": 0.20835870504379272, "learning_rate": 3.501800118636939e-05, "loss": 0.1556, "step": 19060 }, { "epoch": 2.4011713952829297, "grad_norm": 0.22261159121990204, "learning_rate": 3.4947440917551475e-05, "loss": 0.1645, "step": 19065 }, { "epoch": 2.401801177693107, "grad_norm": 0.20514576137065887, "learning_rate": 3.487694243458578e-05, "loss": 0.1558, "step": 19070 }, { "epoch": 2.4024309601032843, "grad_norm": 0.18798956274986267, "learning_rate": 3.480650577533175e-05, "loss": 0.1635, "step": 19075 }, { "epoch": 2.4030607425134614, "grad_norm": 0.1777620017528534, "learning_rate": 3.47361309776154e-05, "loss": 0.1613, "step": 19080 }, { "epoch": 2.403690524923639, "grad_norm": 0.22258161008358002, "learning_rate": 3.466581807922962e-05, "loss": 0.1657, "step": 19085 }, { "epoch": 2.4043203073338164, "grad_norm": 0.20584604144096375, "learning_rate": 3.4595567117934045e-05, "loss": 0.1609, "step": 19090 }, { "epoch": 2.4049500897439935, "grad_norm": 0.26850444078445435, "learning_rate": 3.452537813145501e-05, "loss": 0.165, "step": 19095 }, { "epoch": 2.4055798721541706, "grad_norm": 0.1789688616991043, "learning_rate": 3.4455251157485706e-05, "loss": 0.1597, "step": 19100 }, { "epoch": 2.406209654564348, "grad_norm": 0.24336829781532288, "learning_rate": 3.438518623368581e-05, "loss": 0.1582, "step": 19105 }, { "epoch": 2.406839436974525, "grad_norm": 0.20159520208835602, "learning_rate": 3.4315183397681806e-05, "loss": 0.1572, "step": 19110 }, { "epoch": 2.4074692193847027, "grad_norm": 0.22423477470874786, "learning_rate": 3.424524268706686e-05, "loss": 0.1611, "step": 19115 }, { "epoch": 2.40809900179488, "grad_norm": 0.22861574590206146, "learning_rate": 3.417536413940073e-05, "loss": 0.1708, "step": 19120 }, { "epoch": 2.4087287842050573, "grad_norm": 0.22517502307891846, "learning_rate": 3.4105547792209766e-05, "loss": 0.1498, "step": 19125 }, { "epoch": 2.4093585666152344, "grad_norm": 0.22406402230262756, "learning_rate": 3.403579368298694e-05, "loss": 0.1722, "step": 19130 }, { "epoch": 2.4099883490254115, "grad_norm": 0.21624189615249634, "learning_rate": 3.3966101849191807e-05, "loss": 0.165, "step": 19135 }, { "epoch": 2.410618131435589, "grad_norm": 0.2186998724937439, "learning_rate": 3.389647232825048e-05, "loss": 0.1545, "step": 19140 }, { "epoch": 2.4112479138457665, "grad_norm": 0.20615451037883759, "learning_rate": 3.38269051575556e-05, "loss": 0.1653, "step": 19145 }, { "epoch": 2.4118776962559436, "grad_norm": 0.21351304650306702, "learning_rate": 3.3757400374466323e-05, "loss": 0.1667, "step": 19150 }, { "epoch": 2.4125074786661207, "grad_norm": 0.2263455092906952, "learning_rate": 3.368795801630826e-05, "loss": 0.1635, "step": 19155 }, { "epoch": 2.4131372610762982, "grad_norm": 0.20655429363250732, "learning_rate": 3.361857812037365e-05, "loss": 0.1657, "step": 19160 }, { "epoch": 2.4137670434864753, "grad_norm": 0.1987982541322708, "learning_rate": 3.354926072392101e-05, "loss": 0.1554, "step": 19165 }, { "epoch": 2.414396825896653, "grad_norm": 0.20431582629680634, "learning_rate": 3.348000586417539e-05, "loss": 0.1552, "step": 19170 }, { "epoch": 2.41502660830683, "grad_norm": 0.241183340549469, "learning_rate": 3.34108135783282e-05, "loss": 0.1758, "step": 19175 }, { "epoch": 2.4156563907170074, "grad_norm": 0.1910007894039154, "learning_rate": 3.3341683903537295e-05, "loss": 0.1609, "step": 19180 }, { "epoch": 2.4162861731271845, "grad_norm": 0.2089349776506424, "learning_rate": 3.3272616876926916e-05, "loss": 0.1608, "step": 19185 }, { "epoch": 2.4169159555373616, "grad_norm": 0.20799914002418518, "learning_rate": 3.3203612535587594e-05, "loss": 0.1636, "step": 19190 }, { "epoch": 2.417545737947539, "grad_norm": 0.2071768194437027, "learning_rate": 3.313467091657622e-05, "loss": 0.1643, "step": 19195 }, { "epoch": 2.4181755203577167, "grad_norm": 0.22981540858745575, "learning_rate": 3.3065792056916077e-05, "loss": 0.1749, "step": 19200 }, { "epoch": 2.4188053027678937, "grad_norm": 0.22096100449562073, "learning_rate": 3.2996975993596706e-05, "loss": 0.1671, "step": 19205 }, { "epoch": 2.419435085178071, "grad_norm": 0.19851092994213104, "learning_rate": 3.292822276357382e-05, "loss": 0.1605, "step": 19210 }, { "epoch": 2.4200648675882483, "grad_norm": 0.21692755818367004, "learning_rate": 3.285953240376947e-05, "loss": 0.1629, "step": 19215 }, { "epoch": 2.4206946499984254, "grad_norm": 0.1912955939769745, "learning_rate": 3.279090495107204e-05, "loss": 0.1626, "step": 19220 }, { "epoch": 2.421324432408603, "grad_norm": 0.17941045761108398, "learning_rate": 3.2722340442335993e-05, "loss": 0.1528, "step": 19225 }, { "epoch": 2.42195421481878, "grad_norm": 0.24879097938537598, "learning_rate": 3.265383891438203e-05, "loss": 0.1622, "step": 19230 }, { "epoch": 2.4225839972289576, "grad_norm": 0.27064043283462524, "learning_rate": 3.258540040399703e-05, "loss": 0.1677, "step": 19235 }, { "epoch": 2.4232137796391346, "grad_norm": 0.20788533985614777, "learning_rate": 3.2517024947934046e-05, "loss": 0.1742, "step": 19240 }, { "epoch": 2.4238435620493117, "grad_norm": 0.20137952268123627, "learning_rate": 3.2448712582912265e-05, "loss": 0.1656, "step": 19245 }, { "epoch": 2.4244733444594893, "grad_norm": 0.22439540922641754, "learning_rate": 3.2380463345616986e-05, "loss": 0.1704, "step": 19250 }, { "epoch": 2.4251031268696663, "grad_norm": 0.22877377271652222, "learning_rate": 3.231227727269956e-05, "loss": 0.1655, "step": 19255 }, { "epoch": 2.425732909279844, "grad_norm": 0.19454975426197052, "learning_rate": 3.224415440077757e-05, "loss": 0.1711, "step": 19260 }, { "epoch": 2.426362691690021, "grad_norm": 0.21811099350452423, "learning_rate": 3.217609476643447e-05, "loss": 0.1602, "step": 19265 }, { "epoch": 2.4269924741001985, "grad_norm": 0.21663612127304077, "learning_rate": 3.2108098406219884e-05, "loss": 0.1626, "step": 19270 }, { "epoch": 2.4276222565103756, "grad_norm": 0.21124348044395447, "learning_rate": 3.204016535664937e-05, "loss": 0.1621, "step": 19275 }, { "epoch": 2.428252038920553, "grad_norm": 0.20466844737529755, "learning_rate": 3.1972295654204554e-05, "loss": 0.1608, "step": 19280 }, { "epoch": 2.42888182133073, "grad_norm": 0.20153960585594177, "learning_rate": 3.1904489335333014e-05, "loss": 0.1699, "step": 19285 }, { "epoch": 2.4295116037409077, "grad_norm": 0.2766586244106293, "learning_rate": 3.1836746436448294e-05, "loss": 0.1716, "step": 19290 }, { "epoch": 2.4301413861510848, "grad_norm": 0.1904149353504181, "learning_rate": 3.176906699392986e-05, "loss": 0.1756, "step": 19295 }, { "epoch": 2.430771168561262, "grad_norm": 0.19312819838523865, "learning_rate": 3.170145104412309e-05, "loss": 0.1666, "step": 19300 }, { "epoch": 2.4314009509714394, "grad_norm": 0.19623906910419464, "learning_rate": 3.163389862333939e-05, "loss": 0.1541, "step": 19305 }, { "epoch": 2.4320307333816165, "grad_norm": 0.1920466423034668, "learning_rate": 3.156640976785592e-05, "loss": 0.1575, "step": 19310 }, { "epoch": 2.432660515791794, "grad_norm": 0.2178039401769638, "learning_rate": 3.149898451391565e-05, "loss": 0.1533, "step": 19315 }, { "epoch": 2.433290298201971, "grad_norm": 0.21117891371250153, "learning_rate": 3.143162289772757e-05, "loss": 0.1529, "step": 19320 }, { "epoch": 2.4339200806121486, "grad_norm": 0.21997326612472534, "learning_rate": 3.1364324955466405e-05, "loss": 0.167, "step": 19325 }, { "epoch": 2.4345498630223257, "grad_norm": 0.2015310823917389, "learning_rate": 3.129709072327264e-05, "loss": 0.1608, "step": 19330 }, { "epoch": 2.435179645432503, "grad_norm": 0.21516267955303192, "learning_rate": 3.122992023725263e-05, "loss": 0.159, "step": 19335 }, { "epoch": 2.4358094278426803, "grad_norm": 0.22670945525169373, "learning_rate": 3.116281353347841e-05, "loss": 0.1703, "step": 19340 }, { "epoch": 2.436439210252858, "grad_norm": 0.19361603260040283, "learning_rate": 3.109577064798793e-05, "loss": 0.1647, "step": 19345 }, { "epoch": 2.437068992663035, "grad_norm": 0.18795832991600037, "learning_rate": 3.1028791616784624e-05, "loss": 0.1532, "step": 19350 }, { "epoch": 2.437698775073212, "grad_norm": 0.24311493337154388, "learning_rate": 3.0961876475837814e-05, "loss": 0.1599, "step": 19355 }, { "epoch": 2.4383285574833895, "grad_norm": 0.20328237116336823, "learning_rate": 3.089502526108242e-05, "loss": 0.1604, "step": 19360 }, { "epoch": 2.4389583398935666, "grad_norm": 0.2067318707704544, "learning_rate": 3.082823800841914e-05, "loss": 0.161, "step": 19365 }, { "epoch": 2.439588122303744, "grad_norm": 0.22590148448944092, "learning_rate": 3.0761514753714235e-05, "loss": 0.1711, "step": 19370 }, { "epoch": 2.440217904713921, "grad_norm": 0.2264234572649002, "learning_rate": 3.069485553279958e-05, "loss": 0.1625, "step": 19375 }, { "epoch": 2.4408476871240987, "grad_norm": 0.21357667446136475, "learning_rate": 3.062826038147274e-05, "loss": 0.162, "step": 19380 }, { "epoch": 2.441477469534276, "grad_norm": 0.19787681102752686, "learning_rate": 3.0561729335496816e-05, "loss": 0.1566, "step": 19385 }, { "epoch": 2.4421072519444533, "grad_norm": 0.20055502653121948, "learning_rate": 3.0495262430600487e-05, "loss": 0.1612, "step": 19390 }, { "epoch": 2.4427370343546304, "grad_norm": 0.2178819328546524, "learning_rate": 3.0428859702478003e-05, "loss": 0.1701, "step": 19395 }, { "epoch": 2.443366816764808, "grad_norm": 0.2206692099571228, "learning_rate": 3.0362521186789125e-05, "loss": 0.1668, "step": 19400 }, { "epoch": 2.443996599174985, "grad_norm": 0.22452767193317413, "learning_rate": 3.0296246919159218e-05, "loss": 0.1713, "step": 19405 }, { "epoch": 2.444626381585162, "grad_norm": 0.23711274564266205, "learning_rate": 3.023003693517908e-05, "loss": 0.1637, "step": 19410 }, { "epoch": 2.4452561639953396, "grad_norm": 0.2252456545829773, "learning_rate": 3.0163891270404904e-05, "loss": 0.1685, "step": 19415 }, { "epoch": 2.4458859464055167, "grad_norm": 0.2477557361125946, "learning_rate": 3.0097809960358427e-05, "loss": 0.1669, "step": 19420 }, { "epoch": 2.4465157288156942, "grad_norm": 0.21543872356414795, "learning_rate": 3.003179304052689e-05, "loss": 0.1624, "step": 19425 }, { "epoch": 2.4471455112258713, "grad_norm": 0.18810850381851196, "learning_rate": 2.9965840546362858e-05, "loss": 0.1531, "step": 19430 }, { "epoch": 2.447775293636049, "grad_norm": 0.2468540370464325, "learning_rate": 2.9899952513284307e-05, "loss": 0.1644, "step": 19435 }, { "epoch": 2.448405076046226, "grad_norm": 0.2639712393283844, "learning_rate": 2.9834128976674643e-05, "loss": 0.166, "step": 19440 }, { "epoch": 2.4490348584564035, "grad_norm": 0.18254607915878296, "learning_rate": 2.9768369971882598e-05, "loss": 0.1478, "step": 19445 }, { "epoch": 2.4496646408665805, "grad_norm": 0.2060953974723816, "learning_rate": 2.9702675534222265e-05, "loss": 0.161, "step": 19450 }, { "epoch": 2.450294423276758, "grad_norm": 0.20919503271579742, "learning_rate": 2.963704569897305e-05, "loss": 0.1635, "step": 19455 }, { "epoch": 2.450924205686935, "grad_norm": 0.20381583273410797, "learning_rate": 2.957148050137963e-05, "loss": 0.1677, "step": 19460 }, { "epoch": 2.4515539880971122, "grad_norm": 0.23379412293434143, "learning_rate": 2.9505979976652106e-05, "loss": 0.1669, "step": 19465 }, { "epoch": 2.4521837705072898, "grad_norm": 0.21713408827781677, "learning_rate": 2.9440544159965707e-05, "loss": 0.1639, "step": 19470 }, { "epoch": 2.452813552917467, "grad_norm": 0.2360960692167282, "learning_rate": 2.9375173086460975e-05, "loss": 0.1682, "step": 19475 }, { "epoch": 2.4534433353276444, "grad_norm": 0.21496212482452393, "learning_rate": 2.9309866791243643e-05, "loss": 0.1508, "step": 19480 }, { "epoch": 2.4540731177378214, "grad_norm": 0.19526614248752594, "learning_rate": 2.9244625309384706e-05, "loss": 0.1607, "step": 19485 }, { "epoch": 2.454702900147999, "grad_norm": 0.2625288665294647, "learning_rate": 2.917944867592031e-05, "loss": 0.1708, "step": 19490 }, { "epoch": 2.455332682558176, "grad_norm": 0.19196555018424988, "learning_rate": 2.9114336925851818e-05, "loss": 0.1715, "step": 19495 }, { "epoch": 2.4559624649683536, "grad_norm": 0.21597431600093842, "learning_rate": 2.9049290094145726e-05, "loss": 0.1508, "step": 19500 }, { "epoch": 2.4565922473785307, "grad_norm": 0.2433023750782013, "learning_rate": 2.8984308215733615e-05, "loss": 0.1568, "step": 19505 }, { "epoch": 2.457222029788708, "grad_norm": 0.231834277510643, "learning_rate": 2.8919391325512314e-05, "loss": 0.1552, "step": 19510 }, { "epoch": 2.4578518121988853, "grad_norm": 0.21281488239765167, "learning_rate": 2.885453945834369e-05, "loss": 0.161, "step": 19515 }, { "epoch": 2.4584815946090623, "grad_norm": 0.21355679631233215, "learning_rate": 2.878975264905455e-05, "loss": 0.1515, "step": 19520 }, { "epoch": 2.45911137701924, "grad_norm": 0.20718532800674438, "learning_rate": 2.8725030932437025e-05, "loss": 0.1622, "step": 19525 }, { "epoch": 2.459741159429417, "grad_norm": 0.21609242260456085, "learning_rate": 2.8660374343248087e-05, "loss": 0.1531, "step": 19530 }, { "epoch": 2.4603709418395945, "grad_norm": 0.2453998625278473, "learning_rate": 2.8595782916209825e-05, "loss": 0.1605, "step": 19535 }, { "epoch": 2.4610007242497716, "grad_norm": 0.27632614970207214, "learning_rate": 2.8531256686009306e-05, "loss": 0.1598, "step": 19540 }, { "epoch": 2.461630506659949, "grad_norm": 0.19357621669769287, "learning_rate": 2.846679568729855e-05, "loss": 0.1527, "step": 19545 }, { "epoch": 2.462260289070126, "grad_norm": 0.19920161366462708, "learning_rate": 2.8402399954694692e-05, "loss": 0.1561, "step": 19550 }, { "epoch": 2.4628900714803037, "grad_norm": 0.19081860780715942, "learning_rate": 2.8338069522779595e-05, "loss": 0.1524, "step": 19555 }, { "epoch": 2.463519853890481, "grad_norm": 0.22451332211494446, "learning_rate": 2.8273804426100234e-05, "loss": 0.1628, "step": 19560 }, { "epoch": 2.4641496363006583, "grad_norm": 0.19204290211200714, "learning_rate": 2.820960469916837e-05, "loss": 0.1499, "step": 19565 }, { "epoch": 2.4647794187108354, "grad_norm": 0.20258976519107819, "learning_rate": 2.814547037646081e-05, "loss": 0.1514, "step": 19570 }, { "epoch": 2.4654092011210125, "grad_norm": 0.21591047942638397, "learning_rate": 2.8081401492419102e-05, "loss": 0.1555, "step": 19575 }, { "epoch": 2.46603898353119, "grad_norm": 0.20639857649803162, "learning_rate": 2.8017398081449728e-05, "loss": 0.1597, "step": 19580 }, { "epoch": 2.466668765941367, "grad_norm": 0.18190859258174896, "learning_rate": 2.7953460177923953e-05, "loss": 0.1676, "step": 19585 }, { "epoch": 2.4672985483515446, "grad_norm": 0.23196272552013397, "learning_rate": 2.7889587816177884e-05, "loss": 0.1644, "step": 19590 }, { "epoch": 2.4679283307617217, "grad_norm": 0.23499402403831482, "learning_rate": 2.782578103051248e-05, "loss": 0.1596, "step": 19595 }, { "epoch": 2.468558113171899, "grad_norm": 0.19516189396381378, "learning_rate": 2.7762039855193398e-05, "loss": 0.1592, "step": 19600 }, { "epoch": 2.4691878955820763, "grad_norm": 0.25550252199172974, "learning_rate": 2.769836432445109e-05, "loss": 0.1652, "step": 19605 }, { "epoch": 2.469817677992254, "grad_norm": 0.20900960266590118, "learning_rate": 2.7634754472480852e-05, "loss": 0.1576, "step": 19610 }, { "epoch": 2.470447460402431, "grad_norm": 0.19483284652233124, "learning_rate": 2.757121033344258e-05, "loss": 0.1671, "step": 19615 }, { "epoch": 2.4710772428126084, "grad_norm": 0.21054719388484955, "learning_rate": 2.7507731941460952e-05, "loss": 0.1572, "step": 19620 }, { "epoch": 2.4717070252227855, "grad_norm": 0.23577210307121277, "learning_rate": 2.7444319330625243e-05, "loss": 0.1657, "step": 19625 }, { "epoch": 2.4723368076329626, "grad_norm": 0.21099181473255157, "learning_rate": 2.7380972534989538e-05, "loss": 0.1696, "step": 19630 }, { "epoch": 2.47296659004314, "grad_norm": 0.20165832340717316, "learning_rate": 2.7317691588572495e-05, "loss": 0.1529, "step": 19635 }, { "epoch": 2.473596372453317, "grad_norm": 0.19725088775157928, "learning_rate": 2.7254476525357443e-05, "loss": 0.1503, "step": 19640 }, { "epoch": 2.4742261548634947, "grad_norm": 0.23867055773735046, "learning_rate": 2.7191327379292283e-05, "loss": 0.1766, "step": 19645 }, { "epoch": 2.474855937273672, "grad_norm": 0.21567271649837494, "learning_rate": 2.712824418428955e-05, "loss": 0.1562, "step": 19650 }, { "epoch": 2.4754857196838493, "grad_norm": 0.2127571702003479, "learning_rate": 2.7065226974226444e-05, "loss": 0.1588, "step": 19655 }, { "epoch": 2.4761155020940264, "grad_norm": 0.192424476146698, "learning_rate": 2.700227578294455e-05, "loss": 0.1632, "step": 19660 }, { "epoch": 2.476745284504204, "grad_norm": 0.19549550116062164, "learning_rate": 2.693939064425007e-05, "loss": 0.1666, "step": 19665 }, { "epoch": 2.477375066914381, "grad_norm": 0.21715867519378662, "learning_rate": 2.6876571591913874e-05, "loss": 0.1637, "step": 19670 }, { "epoch": 2.4780048493245586, "grad_norm": 0.246476948261261, "learning_rate": 2.6813818659671167e-05, "loss": 0.1691, "step": 19675 }, { "epoch": 2.4786346317347356, "grad_norm": 0.19329246878623962, "learning_rate": 2.6751131881221698e-05, "loss": 0.1576, "step": 19680 }, { "epoch": 2.4792644141449127, "grad_norm": 0.1897173672914505, "learning_rate": 2.6688511290229714e-05, "loss": 0.1566, "step": 19685 }, { "epoch": 2.4798941965550902, "grad_norm": 0.19795387983322144, "learning_rate": 2.662595692032391e-05, "loss": 0.159, "step": 19690 }, { "epoch": 2.4805239789652673, "grad_norm": 0.19520628452301025, "learning_rate": 2.65634688050974e-05, "loss": 0.1577, "step": 19695 }, { "epoch": 2.481153761375445, "grad_norm": 0.21223746240139008, "learning_rate": 2.650104697810772e-05, "loss": 0.1674, "step": 19700 }, { "epoch": 2.481783543785622, "grad_norm": 0.19204822182655334, "learning_rate": 2.6438691472876828e-05, "loss": 0.1492, "step": 19705 }, { "epoch": 2.4824133261957995, "grad_norm": 0.2568466067314148, "learning_rate": 2.6376402322891032e-05, "loss": 0.1557, "step": 19710 }, { "epoch": 2.4830431086059765, "grad_norm": 0.21695761382579803, "learning_rate": 2.6314179561601078e-05, "loss": 0.1715, "step": 19715 }, { "epoch": 2.483672891016154, "grad_norm": 0.21485815942287445, "learning_rate": 2.625202322242197e-05, "loss": 0.1599, "step": 19720 }, { "epoch": 2.484302673426331, "grad_norm": 0.18373069167137146, "learning_rate": 2.6189933338733122e-05, "loss": 0.1636, "step": 19725 }, { "epoch": 2.4849324558365087, "grad_norm": 0.2190975546836853, "learning_rate": 2.6127909943878177e-05, "loss": 0.1613, "step": 19730 }, { "epoch": 2.4855622382466858, "grad_norm": 0.22146424651145935, "learning_rate": 2.606595307116513e-05, "loss": 0.1554, "step": 19735 }, { "epoch": 2.486192020656863, "grad_norm": 0.22576889395713806, "learning_rate": 2.6004062753866228e-05, "loss": 0.1723, "step": 19740 }, { "epoch": 2.4868218030670404, "grad_norm": 0.22661438584327698, "learning_rate": 2.5942239025218004e-05, "loss": 0.1616, "step": 19745 }, { "epoch": 2.4874515854772175, "grad_norm": 0.20992781221866608, "learning_rate": 2.588048191842118e-05, "loss": 0.1666, "step": 19750 }, { "epoch": 2.488081367887395, "grad_norm": 0.18685118854045868, "learning_rate": 2.581879146664078e-05, "loss": 0.163, "step": 19755 }, { "epoch": 2.488711150297572, "grad_norm": 0.2547582983970642, "learning_rate": 2.5757167703005987e-05, "loss": 0.1683, "step": 19760 }, { "epoch": 2.4893409327077496, "grad_norm": 0.18066510558128357, "learning_rate": 2.569561066061013e-05, "loss": 0.1581, "step": 19765 }, { "epoch": 2.4899707151179267, "grad_norm": 0.22709952294826508, "learning_rate": 2.5634120372510708e-05, "loss": 0.1655, "step": 19770 }, { "epoch": 2.490600497528104, "grad_norm": 0.18300481140613556, "learning_rate": 2.5572696871729496e-05, "loss": 0.1634, "step": 19775 }, { "epoch": 2.4912302799382813, "grad_norm": 0.23889437317848206, "learning_rate": 2.5511340191252294e-05, "loss": 0.1653, "step": 19780 }, { "epoch": 2.491860062348459, "grad_norm": 0.18972428143024445, "learning_rate": 2.545005036402904e-05, "loss": 0.1522, "step": 19785 }, { "epoch": 2.492489844758636, "grad_norm": 0.1869877278804779, "learning_rate": 2.5388827422973722e-05, "loss": 0.1587, "step": 19790 }, { "epoch": 2.493119627168813, "grad_norm": 0.200529083609581, "learning_rate": 2.5327671400964562e-05, "loss": 0.1621, "step": 19795 }, { "epoch": 2.4937494095789905, "grad_norm": 0.20414294302463531, "learning_rate": 2.526658233084365e-05, "loss": 0.1619, "step": 19800 }, { "epoch": 2.4943791919891676, "grad_norm": 0.2506503760814667, "learning_rate": 2.5205560245417227e-05, "loss": 0.1711, "step": 19805 }, { "epoch": 2.495008974399345, "grad_norm": 0.2258518785238266, "learning_rate": 2.5144605177455534e-05, "loss": 0.1718, "step": 19810 }, { "epoch": 2.495638756809522, "grad_norm": 0.22719348967075348, "learning_rate": 2.5083717159692902e-05, "loss": 0.1611, "step": 19815 }, { "epoch": 2.4962685392196997, "grad_norm": 0.18670164048671722, "learning_rate": 2.502289622482752e-05, "loss": 0.155, "step": 19820 }, { "epoch": 2.496898321629877, "grad_norm": 0.19051162898540497, "learning_rate": 2.4962142405521666e-05, "loss": 0.1528, "step": 19825 }, { "epoch": 2.4975281040400543, "grad_norm": 0.2364228218793869, "learning_rate": 2.4901455734401508e-05, "loss": 0.1642, "step": 19830 }, { "epoch": 2.4981578864502314, "grad_norm": 0.1748083382844925, "learning_rate": 2.484083624405716e-05, "loss": 0.1536, "step": 19835 }, { "epoch": 2.498787668860409, "grad_norm": 0.21124523878097534, "learning_rate": 2.4780283967042697e-05, "loss": 0.1641, "step": 19840 }, { "epoch": 2.499417451270586, "grad_norm": 0.21559958159923553, "learning_rate": 2.4719798935876073e-05, "loss": 0.1522, "step": 19845 }, { "epoch": 2.500047233680763, "grad_norm": 0.20545977354049683, "learning_rate": 2.4659381183039105e-05, "loss": 0.1492, "step": 19850 }, { "epoch": 2.5006770160909406, "grad_norm": 0.21759046614170074, "learning_rate": 2.459903074097749e-05, "loss": 0.1637, "step": 19855 }, { "epoch": 2.5013067985011177, "grad_norm": 0.2807125151157379, "learning_rate": 2.4538747642100927e-05, "loss": 0.1701, "step": 19860 }, { "epoch": 2.5019365809112952, "grad_norm": 0.1915740966796875, "learning_rate": 2.4478531918782656e-05, "loss": 0.1551, "step": 19865 }, { "epoch": 2.5025663633214723, "grad_norm": 0.1929636150598526, "learning_rate": 2.441838360335992e-05, "loss": 0.1561, "step": 19870 }, { "epoch": 2.50319614573165, "grad_norm": 0.23392513394355774, "learning_rate": 2.4358302728133827e-05, "loss": 0.1606, "step": 19875 }, { "epoch": 2.503825928141827, "grad_norm": 0.21680179238319397, "learning_rate": 2.4298289325369137e-05, "loss": 0.166, "step": 19880 }, { "epoch": 2.5044557105520044, "grad_norm": 0.20863774418830872, "learning_rate": 2.42383434272944e-05, "loss": 0.153, "step": 19885 }, { "epoch": 2.5050854929621815, "grad_norm": 0.2562030851840973, "learning_rate": 2.4178465066101933e-05, "loss": 0.1591, "step": 19890 }, { "epoch": 2.505715275372359, "grad_norm": 0.22802165150642395, "learning_rate": 2.4118654273947796e-05, "loss": 0.1664, "step": 19895 }, { "epoch": 2.506345057782536, "grad_norm": 0.23240098357200623, "learning_rate": 2.4058911082951764e-05, "loss": 0.1585, "step": 19900 }, { "epoch": 2.506974840192713, "grad_norm": 0.21342670917510986, "learning_rate": 2.3999235525197275e-05, "loss": 0.1471, "step": 19905 }, { "epoch": 2.5076046226028907, "grad_norm": 0.20659485459327698, "learning_rate": 2.3939627632731458e-05, "loss": 0.1593, "step": 19910 }, { "epoch": 2.508234405013068, "grad_norm": 0.21293510496616364, "learning_rate": 2.3880087437565104e-05, "loss": 0.1575, "step": 19915 }, { "epoch": 2.5088641874232454, "grad_norm": 0.2346251904964447, "learning_rate": 2.382061497167271e-05, "loss": 0.1639, "step": 19920 }, { "epoch": 2.5094939698334224, "grad_norm": 0.22029395401477814, "learning_rate": 2.376121026699232e-05, "loss": 0.1537, "step": 19925 }, { "epoch": 2.5101237522436, "grad_norm": 0.1979423314332962, "learning_rate": 2.3701873355425606e-05, "loss": 0.154, "step": 19930 }, { "epoch": 2.510753534653777, "grad_norm": 0.1969837099313736, "learning_rate": 2.3642604268837873e-05, "loss": 0.1623, "step": 19935 }, { "epoch": 2.5113833170639546, "grad_norm": 0.23190250992774963, "learning_rate": 2.3583403039057946e-05, "loss": 0.1673, "step": 19940 }, { "epoch": 2.5120130994741316, "grad_norm": 0.20579595863819122, "learning_rate": 2.3524269697878244e-05, "loss": 0.1638, "step": 19945 }, { "epoch": 2.512642881884309, "grad_norm": 0.2181597501039505, "learning_rate": 2.3465204277054734e-05, "loss": 0.1535, "step": 19950 }, { "epoch": 2.5132726642944863, "grad_norm": 0.27504584193229675, "learning_rate": 2.3406206808306854e-05, "loss": 0.1687, "step": 19955 }, { "epoch": 2.5139024467046633, "grad_norm": 0.21288301050662994, "learning_rate": 2.334727732331765e-05, "loss": 0.1611, "step": 19960 }, { "epoch": 2.514532229114841, "grad_norm": 0.20768193900585175, "learning_rate": 2.3288415853733615e-05, "loss": 0.1595, "step": 19965 }, { "epoch": 2.515162011525018, "grad_norm": 0.1934243142604828, "learning_rate": 2.322962243116464e-05, "loss": 0.1573, "step": 19970 }, { "epoch": 2.5157917939351955, "grad_norm": 0.21198545396327972, "learning_rate": 2.3170897087184133e-05, "loss": 0.1549, "step": 19975 }, { "epoch": 2.5164215763453726, "grad_norm": 0.25898632407188416, "learning_rate": 2.3112239853328996e-05, "loss": 0.171, "step": 19980 }, { "epoch": 2.51705135875555, "grad_norm": 0.2114986777305603, "learning_rate": 2.3053650761099485e-05, "loss": 0.1544, "step": 19985 }, { "epoch": 2.517681141165727, "grad_norm": 0.21560825407505035, "learning_rate": 2.2995129841959266e-05, "loss": 0.1736, "step": 19990 }, { "epoch": 2.5183109235759042, "grad_norm": 0.18198496103286743, "learning_rate": 2.2936677127335395e-05, "loss": 0.154, "step": 19995 }, { "epoch": 2.5189407059860818, "grad_norm": 0.20246680080890656, "learning_rate": 2.287829264861842e-05, "loss": 0.1598, "step": 20000 }, { "epoch": 2.5189407059860818, "eval_loss": 0.3499235212802887, "eval_runtime": 6.1623, "eval_samples_per_second": 162.278, "eval_steps_per_second": 10.224, "step": 20000 }, { "epoch": 2.5195704883962593, "grad_norm": 0.2162911742925644, "learning_rate": 2.2819976437162e-05, "loss": 0.1623, "step": 20005 }, { "epoch": 2.5202002708064364, "grad_norm": 0.16897226870059967, "learning_rate": 2.2761728524283344e-05, "loss": 0.1511, "step": 20010 }, { "epoch": 2.5208300532166135, "grad_norm": 0.19399495422840118, "learning_rate": 2.2703548941262877e-05, "loss": 0.1615, "step": 20015 }, { "epoch": 2.521459835626791, "grad_norm": 0.24832330644130707, "learning_rate": 2.2645437719344424e-05, "loss": 0.1596, "step": 20020 }, { "epoch": 2.522089618036968, "grad_norm": 0.1990746706724167, "learning_rate": 2.2587394889734982e-05, "loss": 0.1517, "step": 20025 }, { "epoch": 2.5227194004471456, "grad_norm": 0.2356463521718979, "learning_rate": 2.252942048360491e-05, "loss": 0.1666, "step": 20030 }, { "epoch": 2.5233491828573227, "grad_norm": 0.2032928168773651, "learning_rate": 2.2471514532087766e-05, "loss": 0.1463, "step": 20035 }, { "epoch": 2.5239789652675, "grad_norm": 0.24515411257743835, "learning_rate": 2.2413677066280388e-05, "loss": 0.1629, "step": 20040 }, { "epoch": 2.5246087476776773, "grad_norm": 0.24597153067588806, "learning_rate": 2.2355908117242803e-05, "loss": 0.1507, "step": 20045 }, { "epoch": 2.5252385300878544, "grad_norm": 0.1958838254213333, "learning_rate": 2.2298207715998246e-05, "loss": 0.167, "step": 20050 }, { "epoch": 2.525868312498032, "grad_norm": 0.18343359231948853, "learning_rate": 2.2240575893533176e-05, "loss": 0.1582, "step": 20055 }, { "epoch": 2.5264980949082094, "grad_norm": 0.2554282248020172, "learning_rate": 2.218301268079715e-05, "loss": 0.1701, "step": 20060 }, { "epoch": 2.5271278773183865, "grad_norm": 0.2655259072780609, "learning_rate": 2.2125518108703e-05, "loss": 0.1666, "step": 20065 }, { "epoch": 2.5277576597285636, "grad_norm": 0.23147699236869812, "learning_rate": 2.206809220812662e-05, "loss": 0.1646, "step": 20070 }, { "epoch": 2.528387442138741, "grad_norm": 0.19453732669353485, "learning_rate": 2.2010735009906926e-05, "loss": 0.1595, "step": 20075 }, { "epoch": 2.529017224548918, "grad_norm": 0.21716727316379547, "learning_rate": 2.195344654484615e-05, "loss": 0.1656, "step": 20080 }, { "epoch": 2.5296470069590957, "grad_norm": 0.19851936399936676, "learning_rate": 2.1896226843709475e-05, "loss": 0.1545, "step": 20085 }, { "epoch": 2.530276789369273, "grad_norm": 0.20362606644630432, "learning_rate": 2.1839075937225192e-05, "loss": 0.1534, "step": 20090 }, { "epoch": 2.5309065717794503, "grad_norm": 0.23197387158870697, "learning_rate": 2.1781993856084633e-05, "loss": 0.1624, "step": 20095 }, { "epoch": 2.5315363541896274, "grad_norm": 0.2547961473464966, "learning_rate": 2.1724980630942145e-05, "loss": 0.1539, "step": 20100 }, { "epoch": 2.5321661365998045, "grad_norm": 0.1867532879114151, "learning_rate": 2.1668036292415237e-05, "loss": 0.1518, "step": 20105 }, { "epoch": 2.532795919009982, "grad_norm": 0.19402964413166046, "learning_rate": 2.161116087108421e-05, "loss": 0.1522, "step": 20110 }, { "epoch": 2.5334257014201595, "grad_norm": 0.20450226962566376, "learning_rate": 2.1554354397492517e-05, "loss": 0.155, "step": 20115 }, { "epoch": 2.5340554838303366, "grad_norm": 0.22179925441741943, "learning_rate": 2.149761690214649e-05, "loss": 0.1557, "step": 20120 }, { "epoch": 2.5346852662405137, "grad_norm": 0.2105506807565689, "learning_rate": 2.1440948415515524e-05, "loss": 0.1668, "step": 20125 }, { "epoch": 2.5353150486506912, "grad_norm": 0.24963414669036865, "learning_rate": 2.1384348968031857e-05, "loss": 0.1597, "step": 20130 }, { "epoch": 2.5359448310608683, "grad_norm": 0.23433445394039154, "learning_rate": 2.132781859009069e-05, "loss": 0.1579, "step": 20135 }, { "epoch": 2.536574613471046, "grad_norm": 0.19620360434055328, "learning_rate": 2.1271357312050126e-05, "loss": 0.1492, "step": 20140 }, { "epoch": 2.537204395881223, "grad_norm": 0.23040203750133514, "learning_rate": 2.1214965164231157e-05, "loss": 0.1585, "step": 20145 }, { "epoch": 2.5378341782914005, "grad_norm": 0.23273873329162598, "learning_rate": 2.1158642176917647e-05, "loss": 0.1589, "step": 20150 }, { "epoch": 2.5384639607015775, "grad_norm": 0.2472730576992035, "learning_rate": 2.1102388380356344e-05, "loss": 0.1677, "step": 20155 }, { "epoch": 2.5390937431117546, "grad_norm": 0.19982990622520447, "learning_rate": 2.104620380475679e-05, "loss": 0.1515, "step": 20160 }, { "epoch": 2.539723525521932, "grad_norm": 0.21257297694683075, "learning_rate": 2.099008848029143e-05, "loss": 0.165, "step": 20165 }, { "epoch": 2.5403533079321097, "grad_norm": 0.20112313330173492, "learning_rate": 2.0934042437095457e-05, "loss": 0.1497, "step": 20170 }, { "epoch": 2.5409830903422868, "grad_norm": 0.24434730410575867, "learning_rate": 2.087806570526691e-05, "loss": 0.1583, "step": 20175 }, { "epoch": 2.541612872752464, "grad_norm": 0.20866596698760986, "learning_rate": 2.0822158314866467e-05, "loss": 0.1584, "step": 20180 }, { "epoch": 2.5422426551626414, "grad_norm": 0.1903751641511917, "learning_rate": 2.076632029591777e-05, "loss": 0.1447, "step": 20185 }, { "epoch": 2.5428724375728184, "grad_norm": 0.24377766251564026, "learning_rate": 2.071055167840709e-05, "loss": 0.1636, "step": 20190 }, { "epoch": 2.543502219982996, "grad_norm": 0.25960245728492737, "learning_rate": 2.0654852492283446e-05, "loss": 0.164, "step": 20195 }, { "epoch": 2.544132002393173, "grad_norm": 0.23870185017585754, "learning_rate": 2.0599222767458533e-05, "loss": 0.1579, "step": 20200 }, { "epoch": 2.5447617848033506, "grad_norm": 0.2245192676782608, "learning_rate": 2.0543662533806855e-05, "loss": 0.1655, "step": 20205 }, { "epoch": 2.5453915672135277, "grad_norm": 0.23136839270591736, "learning_rate": 2.048817182116554e-05, "loss": 0.1591, "step": 20210 }, { "epoch": 2.5460213496237047, "grad_norm": 0.21092520654201508, "learning_rate": 2.043275065933427e-05, "loss": 0.1536, "step": 20215 }, { "epoch": 2.5466511320338823, "grad_norm": 0.18601630628108978, "learning_rate": 2.0377399078075485e-05, "loss": 0.1523, "step": 20220 }, { "epoch": 2.54728091444406, "grad_norm": 0.21489211916923523, "learning_rate": 2.0322117107114343e-05, "loss": 0.1554, "step": 20225 }, { "epoch": 2.547910696854237, "grad_norm": 0.2098049521446228, "learning_rate": 2.026690477613845e-05, "loss": 0.1522, "step": 20230 }, { "epoch": 2.548540479264414, "grad_norm": 0.2187887281179428, "learning_rate": 2.021176211479813e-05, "loss": 0.15, "step": 20235 }, { "epoch": 2.5491702616745915, "grad_norm": 0.2641262710094452, "learning_rate": 2.0156689152706216e-05, "loss": 0.1725, "step": 20240 }, { "epoch": 2.5498000440847686, "grad_norm": 0.22713615000247955, "learning_rate": 2.010168591943817e-05, "loss": 0.1528, "step": 20245 }, { "epoch": 2.550429826494946, "grad_norm": 0.20724020898342133, "learning_rate": 2.0046752444531976e-05, "loss": 0.1646, "step": 20250 }, { "epoch": 2.551059608905123, "grad_norm": 0.19516219198703766, "learning_rate": 1.9991888757488156e-05, "loss": 0.1574, "step": 20255 }, { "epoch": 2.5516893913153007, "grad_norm": 0.22299246490001678, "learning_rate": 1.993709488776979e-05, "loss": 0.1656, "step": 20260 }, { "epoch": 2.552319173725478, "grad_norm": 0.1897648572921753, "learning_rate": 1.9882370864802373e-05, "loss": 0.1639, "step": 20265 }, { "epoch": 2.552948956135655, "grad_norm": 0.23607775568962097, "learning_rate": 1.9827716717974048e-05, "loss": 0.1618, "step": 20270 }, { "epoch": 2.5535787385458324, "grad_norm": 0.250823438167572, "learning_rate": 1.9773132476635285e-05, "loss": 0.1628, "step": 20275 }, { "epoch": 2.55420852095601, "grad_norm": 0.2012414038181305, "learning_rate": 1.9718618170099087e-05, "loss": 0.1536, "step": 20280 }, { "epoch": 2.554838303366187, "grad_norm": 0.17350980639457703, "learning_rate": 1.9664173827640873e-05, "loss": 0.1524, "step": 20285 }, { "epoch": 2.555468085776364, "grad_norm": 0.18761439621448517, "learning_rate": 1.96097994784985e-05, "loss": 0.1452, "step": 20290 }, { "epoch": 2.5560978681865416, "grad_norm": 0.2061910331249237, "learning_rate": 1.955549515187223e-05, "loss": 0.1507, "step": 20295 }, { "epoch": 2.5567276505967187, "grad_norm": 0.20667202770709991, "learning_rate": 1.9501260876924736e-05, "loss": 0.1484, "step": 20300 }, { "epoch": 2.557357433006896, "grad_norm": 0.19904933869838715, "learning_rate": 1.9447096682781015e-05, "loss": 0.1562, "step": 20305 }, { "epoch": 2.5579872154170733, "grad_norm": 0.20500166714191437, "learning_rate": 1.9393002598528555e-05, "loss": 0.1505, "step": 20310 }, { "epoch": 2.558616997827251, "grad_norm": 0.21382258832454681, "learning_rate": 1.933897865321712e-05, "loss": 0.1606, "step": 20315 }, { "epoch": 2.559246780237428, "grad_norm": 0.22117263078689575, "learning_rate": 1.928502487585873e-05, "loss": 0.1626, "step": 20320 }, { "epoch": 2.559876562647605, "grad_norm": 0.2301877737045288, "learning_rate": 1.9231141295427794e-05, "loss": 0.1566, "step": 20325 }, { "epoch": 2.5605063450577825, "grad_norm": 0.23893754184246063, "learning_rate": 1.917732794086108e-05, "loss": 0.1571, "step": 20330 }, { "epoch": 2.56113612746796, "grad_norm": 0.2627946734428406, "learning_rate": 1.9123584841057578e-05, "loss": 0.1658, "step": 20335 }, { "epoch": 2.561765909878137, "grad_norm": 0.19281533360481262, "learning_rate": 1.906991202487854e-05, "loss": 0.1525, "step": 20340 }, { "epoch": 2.562395692288314, "grad_norm": 0.2772383689880371, "learning_rate": 1.901630952114752e-05, "loss": 0.1661, "step": 20345 }, { "epoch": 2.5630254746984917, "grad_norm": 0.216465026140213, "learning_rate": 1.896277735865027e-05, "loss": 0.1538, "step": 20350 }, { "epoch": 2.563655257108669, "grad_norm": 0.23878604173660278, "learning_rate": 1.8909315566134782e-05, "loss": 0.1601, "step": 20355 }, { "epoch": 2.5642850395188463, "grad_norm": 0.2141411155462265, "learning_rate": 1.8855924172311248e-05, "loss": 0.1631, "step": 20360 }, { "epoch": 2.5649148219290234, "grad_norm": 0.2064596712589264, "learning_rate": 1.8802603205852073e-05, "loss": 0.1578, "step": 20365 }, { "epoch": 2.565544604339201, "grad_norm": 0.19963258504867554, "learning_rate": 1.8749352695391867e-05, "loss": 0.1609, "step": 20370 }, { "epoch": 2.566174386749378, "grad_norm": 0.1846475601196289, "learning_rate": 1.8696172669527336e-05, "loss": 0.1544, "step": 20375 }, { "epoch": 2.566804169159555, "grad_norm": 0.1911933869123459, "learning_rate": 1.8643063156817423e-05, "loss": 0.1565, "step": 20380 }, { "epoch": 2.5674339515697326, "grad_norm": 0.2263742834329605, "learning_rate": 1.8590024185783042e-05, "loss": 0.1673, "step": 20385 }, { "epoch": 2.56806373397991, "grad_norm": 0.21621178090572357, "learning_rate": 1.8537055784907413e-05, "loss": 0.1587, "step": 20390 }, { "epoch": 2.5686935163900873, "grad_norm": 0.20875446498394012, "learning_rate": 1.848415798263576e-05, "loss": 0.1564, "step": 20395 }, { "epoch": 2.5693232988002643, "grad_norm": 0.20144003629684448, "learning_rate": 1.8431330807375417e-05, "loss": 0.156, "step": 20400 }, { "epoch": 2.569953081210442, "grad_norm": 0.24883227050304413, "learning_rate": 1.837857428749575e-05, "loss": 0.1568, "step": 20405 }, { "epoch": 2.570582863620619, "grad_norm": 0.18426820635795593, "learning_rate": 1.832588845132827e-05, "loss": 0.1549, "step": 20410 }, { "epoch": 2.5712126460307965, "grad_norm": 0.2462303191423416, "learning_rate": 1.827327332716649e-05, "loss": 0.1625, "step": 20415 }, { "epoch": 2.5718424284409735, "grad_norm": 0.21659249067306519, "learning_rate": 1.8220728943265837e-05, "loss": 0.1521, "step": 20420 }, { "epoch": 2.572472210851151, "grad_norm": 0.17811377346515656, "learning_rate": 1.8168255327843882e-05, "loss": 0.1586, "step": 20425 }, { "epoch": 2.573101993261328, "grad_norm": 0.19524861872196198, "learning_rate": 1.8115852509080197e-05, "loss": 0.1474, "step": 20430 }, { "epoch": 2.5737317756715052, "grad_norm": 0.17528071999549866, "learning_rate": 1.806352051511627e-05, "loss": 0.1599, "step": 20435 }, { "epoch": 2.5743615580816828, "grad_norm": 0.19344571232795715, "learning_rate": 1.801125937405557e-05, "loss": 0.1579, "step": 20440 }, { "epoch": 2.5749913404918603, "grad_norm": 0.20909984409809113, "learning_rate": 1.795906911396353e-05, "loss": 0.1584, "step": 20445 }, { "epoch": 2.5756211229020374, "grad_norm": 0.17548586428165436, "learning_rate": 1.790694976286752e-05, "loss": 0.1535, "step": 20450 }, { "epoch": 2.5762509053122145, "grad_norm": 0.25540080666542053, "learning_rate": 1.7854901348756807e-05, "loss": 0.1637, "step": 20455 }, { "epoch": 2.576880687722392, "grad_norm": 0.1803160160779953, "learning_rate": 1.780292389958257e-05, "loss": 0.1526, "step": 20460 }, { "epoch": 2.577510470132569, "grad_norm": 0.260122686624527, "learning_rate": 1.775101744325792e-05, "loss": 0.1704, "step": 20465 }, { "epoch": 2.5781402525427466, "grad_norm": 0.19697842001914978, "learning_rate": 1.7699182007657736e-05, "loss": 0.1568, "step": 20470 }, { "epoch": 2.5787700349529237, "grad_norm": 0.2179180532693863, "learning_rate": 1.7647417620618936e-05, "loss": 0.1612, "step": 20475 }, { "epoch": 2.579399817363101, "grad_norm": 0.2509031593799591, "learning_rate": 1.7595724309940117e-05, "loss": 0.1531, "step": 20480 }, { "epoch": 2.5800295997732783, "grad_norm": 0.19090527296066284, "learning_rate": 1.754410210338179e-05, "loss": 0.1477, "step": 20485 }, { "epoch": 2.5806593821834554, "grad_norm": 0.20662526786327362, "learning_rate": 1.749255102866623e-05, "loss": 0.1584, "step": 20490 }, { "epoch": 2.581289164593633, "grad_norm": 0.2258034199476242, "learning_rate": 1.7441071113477572e-05, "loss": 0.1597, "step": 20495 }, { "epoch": 2.5819189470038104, "grad_norm": 0.22661426663398743, "learning_rate": 1.738966238546169e-05, "loss": 0.1582, "step": 20500 }, { "epoch": 2.5825487294139875, "grad_norm": 0.24459710717201233, "learning_rate": 1.7338324872226227e-05, "loss": 0.1523, "step": 20505 }, { "epoch": 2.5831785118241646, "grad_norm": 0.18816480040550232, "learning_rate": 1.728705860134062e-05, "loss": 0.1554, "step": 20510 }, { "epoch": 2.583808294234342, "grad_norm": 0.18354368209838867, "learning_rate": 1.7235863600336042e-05, "loss": 0.1413, "step": 20515 }, { "epoch": 2.584438076644519, "grad_norm": 0.1984662562608719, "learning_rate": 1.71847398967054e-05, "loss": 0.1566, "step": 20520 }, { "epoch": 2.5850678590546967, "grad_norm": 0.2628153860569, "learning_rate": 1.713368751790322e-05, "loss": 0.1592, "step": 20525 }, { "epoch": 2.585697641464874, "grad_norm": 0.18952016532421112, "learning_rate": 1.7082706491345806e-05, "loss": 0.1531, "step": 20530 }, { "epoch": 2.5863274238750513, "grad_norm": 0.16905049979686737, "learning_rate": 1.7031796844411198e-05, "loss": 0.1556, "step": 20535 }, { "epoch": 2.5869572062852284, "grad_norm": 0.20969530940055847, "learning_rate": 1.6980958604438988e-05, "loss": 0.1585, "step": 20540 }, { "epoch": 2.5875869886954055, "grad_norm": 0.2143043577671051, "learning_rate": 1.693019179873048e-05, "loss": 0.1636, "step": 20545 }, { "epoch": 2.588216771105583, "grad_norm": 0.24208824336528778, "learning_rate": 1.6879496454548585e-05, "loss": 0.1572, "step": 20550 }, { "epoch": 2.5888465535157605, "grad_norm": 0.22409161925315857, "learning_rate": 1.6828872599117958e-05, "loss": 0.162, "step": 20555 }, { "epoch": 2.5894763359259376, "grad_norm": 0.20685546100139618, "learning_rate": 1.6778320259624654e-05, "loss": 0.1587, "step": 20560 }, { "epoch": 2.5901061183361147, "grad_norm": 0.19393740594387054, "learning_rate": 1.672783946321649e-05, "loss": 0.1491, "step": 20565 }, { "epoch": 2.5907359007462922, "grad_norm": 0.1944616734981537, "learning_rate": 1.667743023700275e-05, "loss": 0.1565, "step": 20570 }, { "epoch": 2.5913656831564693, "grad_norm": 0.21134309470653534, "learning_rate": 1.662709260805442e-05, "loss": 0.1575, "step": 20575 }, { "epoch": 2.591995465566647, "grad_norm": 0.20300306379795074, "learning_rate": 1.657682660340392e-05, "loss": 0.1509, "step": 20580 }, { "epoch": 2.592625247976824, "grad_norm": 0.209407240152359, "learning_rate": 1.6526632250045237e-05, "loss": 0.1568, "step": 20585 }, { "epoch": 2.5932550303870014, "grad_norm": 0.18960040807724, "learning_rate": 1.6476509574933888e-05, "loss": 0.1561, "step": 20590 }, { "epoch": 2.5938848127971785, "grad_norm": 0.2009792923927307, "learning_rate": 1.6426458604986897e-05, "loss": 0.1584, "step": 20595 }, { "epoch": 2.5945145952073556, "grad_norm": 0.2359851896762848, "learning_rate": 1.6376479367082796e-05, "loss": 0.1573, "step": 20600 }, { "epoch": 2.595144377617533, "grad_norm": 0.2108912616968155, "learning_rate": 1.632657188806153e-05, "loss": 0.1537, "step": 20605 }, { "epoch": 2.5957741600277107, "grad_norm": 0.22792066633701324, "learning_rate": 1.6276736194724575e-05, "loss": 0.1611, "step": 20610 }, { "epoch": 2.5964039424378877, "grad_norm": 0.1896820068359375, "learning_rate": 1.622697231383488e-05, "loss": 0.1623, "step": 20615 }, { "epoch": 2.597033724848065, "grad_norm": 0.19234326481819153, "learning_rate": 1.6177280272116728e-05, "loss": 0.1448, "step": 20620 }, { "epoch": 2.5976635072582424, "grad_norm": 0.17547307908535004, "learning_rate": 1.6127660096255955e-05, "loss": 0.1479, "step": 20625 }, { "epoch": 2.5982932896684194, "grad_norm": 0.20076265931129456, "learning_rate": 1.6078111812899618e-05, "loss": 0.1504, "step": 20630 }, { "epoch": 2.598923072078597, "grad_norm": 0.1888744831085205, "learning_rate": 1.6028635448656364e-05, "loss": 0.1587, "step": 20635 }, { "epoch": 2.599552854488774, "grad_norm": 0.2309001237154007, "learning_rate": 1.59792310300961e-05, "loss": 0.1662, "step": 20640 }, { "epoch": 2.6001826368989516, "grad_norm": 0.29581940174102783, "learning_rate": 1.592989858375013e-05, "loss": 0.1708, "step": 20645 }, { "epoch": 2.6008124193091287, "grad_norm": 0.19039921462535858, "learning_rate": 1.588063813611112e-05, "loss": 0.1548, "step": 20650 }, { "epoch": 2.6014422017193057, "grad_norm": 0.18266427516937256, "learning_rate": 1.5831449713632993e-05, "loss": 0.1523, "step": 20655 }, { "epoch": 2.6020719841294833, "grad_norm": 0.1932811439037323, "learning_rate": 1.5782333342731174e-05, "loss": 0.1516, "step": 20660 }, { "epoch": 2.602701766539661, "grad_norm": 0.17890222370624542, "learning_rate": 1.5733289049782177e-05, "loss": 0.1511, "step": 20665 }, { "epoch": 2.603331548949838, "grad_norm": 0.20573283731937408, "learning_rate": 1.5684316861123935e-05, "loss": 0.1525, "step": 20670 }, { "epoch": 2.603961331360015, "grad_norm": 0.21194593608379364, "learning_rate": 1.5635416803055596e-05, "loss": 0.1599, "step": 20675 }, { "epoch": 2.6045911137701925, "grad_norm": 0.17930278182029724, "learning_rate": 1.558658890183768e-05, "loss": 0.1571, "step": 20680 }, { "epoch": 2.6052208961803696, "grad_norm": 0.1965799480676651, "learning_rate": 1.5537833183691857e-05, "loss": 0.1552, "step": 20685 }, { "epoch": 2.605850678590547, "grad_norm": 0.20715682208538055, "learning_rate": 1.5489149674801054e-05, "loss": 0.1588, "step": 20690 }, { "epoch": 2.606480461000724, "grad_norm": 0.1894584596157074, "learning_rate": 1.544053840130943e-05, "loss": 0.1547, "step": 20695 }, { "epoch": 2.6071102434109017, "grad_norm": 0.20791690051555634, "learning_rate": 1.539199938932234e-05, "loss": 0.1479, "step": 20700 }, { "epoch": 2.6077400258210788, "grad_norm": 0.20393605530261993, "learning_rate": 1.534353266490636e-05, "loss": 0.1491, "step": 20705 }, { "epoch": 2.608369808231256, "grad_norm": 0.20201466977596283, "learning_rate": 1.5295138254089206e-05, "loss": 0.1664, "step": 20710 }, { "epoch": 2.6089995906414334, "grad_norm": 0.220575213432312, "learning_rate": 1.5246816182859773e-05, "loss": 0.16, "step": 20715 }, { "epoch": 2.609629373051611, "grad_norm": 0.1888882964849472, "learning_rate": 1.5198566477168166e-05, "loss": 0.1592, "step": 20720 }, { "epoch": 2.610259155461788, "grad_norm": 0.2035285383462906, "learning_rate": 1.5150389162925564e-05, "loss": 0.149, "step": 20725 }, { "epoch": 2.610888937871965, "grad_norm": 0.21430674195289612, "learning_rate": 1.5102284266004282e-05, "loss": 0.1568, "step": 20730 }, { "epoch": 2.6115187202821426, "grad_norm": 0.2220098227262497, "learning_rate": 1.5054251812237695e-05, "loss": 0.1601, "step": 20735 }, { "epoch": 2.6121485026923197, "grad_norm": 0.18914029002189636, "learning_rate": 1.5006291827420397e-05, "loss": 0.1524, "step": 20740 }, { "epoch": 2.612778285102497, "grad_norm": 0.19741562008857727, "learning_rate": 1.4958404337307972e-05, "loss": 0.1418, "step": 20745 }, { "epoch": 2.6134080675126743, "grad_norm": 0.22962430119514465, "learning_rate": 1.49105893676171e-05, "loss": 0.1523, "step": 20750 }, { "epoch": 2.614037849922852, "grad_norm": 0.17770111560821533, "learning_rate": 1.4862846944025469e-05, "loss": 0.1585, "step": 20755 }, { "epoch": 2.614667632333029, "grad_norm": 0.22975338995456696, "learning_rate": 1.481517709217191e-05, "loss": 0.1608, "step": 20760 }, { "epoch": 2.615297414743206, "grad_norm": 0.21070002019405365, "learning_rate": 1.476757983765624e-05, "loss": 0.1442, "step": 20765 }, { "epoch": 2.6159271971533835, "grad_norm": 0.19414427876472473, "learning_rate": 1.47200552060392e-05, "loss": 0.1445, "step": 20770 }, { "epoch": 2.616556979563561, "grad_norm": 0.18657416105270386, "learning_rate": 1.4672603222842605e-05, "loss": 0.1534, "step": 20775 }, { "epoch": 2.617186761973738, "grad_norm": 0.2389591485261917, "learning_rate": 1.4625223913549323e-05, "loss": 0.158, "step": 20780 }, { "epoch": 2.617816544383915, "grad_norm": 0.19741186499595642, "learning_rate": 1.4577917303603081e-05, "loss": 0.1585, "step": 20785 }, { "epoch": 2.6184463267940927, "grad_norm": 0.18730677664279938, "learning_rate": 1.4530683418408612e-05, "loss": 0.1487, "step": 20790 }, { "epoch": 2.61907610920427, "grad_norm": 0.2060120701789856, "learning_rate": 1.4483522283331606e-05, "loss": 0.1499, "step": 20795 }, { "epoch": 2.6197058916144473, "grad_norm": 0.2186814844608307, "learning_rate": 1.4436433923698638e-05, "loss": 0.1562, "step": 20800 }, { "epoch": 2.6203356740246244, "grad_norm": 0.21503032743930817, "learning_rate": 1.4389418364797279e-05, "loss": 0.1456, "step": 20805 }, { "epoch": 2.620965456434802, "grad_norm": 0.17447194457054138, "learning_rate": 1.4342475631875916e-05, "loss": 0.1454, "step": 20810 }, { "epoch": 2.621595238844979, "grad_norm": 0.18272021412849426, "learning_rate": 1.4295605750143851e-05, "loss": 0.149, "step": 20815 }, { "epoch": 2.622225021255156, "grad_norm": 0.2014734447002411, "learning_rate": 1.424880874477135e-05, "loss": 0.1582, "step": 20820 }, { "epoch": 2.6228548036653336, "grad_norm": 0.21231862902641296, "learning_rate": 1.4202084640889443e-05, "loss": 0.152, "step": 20825 }, { "epoch": 2.623484586075511, "grad_norm": 0.19817417860031128, "learning_rate": 1.415543346359006e-05, "loss": 0.1492, "step": 20830 }, { "epoch": 2.6241143684856882, "grad_norm": 0.20216423273086548, "learning_rate": 1.410885523792586e-05, "loss": 0.1452, "step": 20835 }, { "epoch": 2.6247441508958653, "grad_norm": 0.20939548313617706, "learning_rate": 1.4062349988910515e-05, "loss": 0.1512, "step": 20840 }, { "epoch": 2.625373933306043, "grad_norm": 0.19018815457820892, "learning_rate": 1.4015917741518384e-05, "loss": 0.1579, "step": 20845 }, { "epoch": 2.62600371571622, "grad_norm": 0.20512887835502625, "learning_rate": 1.396955852068462e-05, "loss": 0.1624, "step": 20850 }, { "epoch": 2.6266334981263975, "grad_norm": 0.24390068650245667, "learning_rate": 1.3923272351305193e-05, "loss": 0.1663, "step": 20855 }, { "epoch": 2.6272632805365745, "grad_norm": 0.21338611841201782, "learning_rate": 1.38770592582368e-05, "loss": 0.1695, "step": 20860 }, { "epoch": 2.627893062946752, "grad_norm": 0.21631261706352234, "learning_rate": 1.3830919266297025e-05, "loss": 0.1598, "step": 20865 }, { "epoch": 2.628522845356929, "grad_norm": 0.21297498047351837, "learning_rate": 1.3784852400264013e-05, "loss": 0.1554, "step": 20870 }, { "epoch": 2.6291526277671062, "grad_norm": 0.22182469069957733, "learning_rate": 1.3738858684876724e-05, "loss": 0.1618, "step": 20875 }, { "epoch": 2.6297824101772838, "grad_norm": 0.2602301239967346, "learning_rate": 1.369293814483487e-05, "loss": 0.1616, "step": 20880 }, { "epoch": 2.6304121925874613, "grad_norm": 0.20623180270195007, "learning_rate": 1.3647090804798822e-05, "loss": 0.1574, "step": 20885 }, { "epoch": 2.6310419749976384, "grad_norm": 0.22888506948947906, "learning_rate": 1.3601316689389635e-05, "loss": 0.1476, "step": 20890 }, { "epoch": 2.6316717574078154, "grad_norm": 0.1814454197883606, "learning_rate": 1.3555615823189065e-05, "loss": 0.1505, "step": 20895 }, { "epoch": 2.632301539817993, "grad_norm": 0.19201195240020752, "learning_rate": 1.350998823073951e-05, "loss": 0.1559, "step": 20900 }, { "epoch": 2.63293132222817, "grad_norm": 0.21456103026866913, "learning_rate": 1.3464433936544055e-05, "loss": 0.1519, "step": 20905 }, { "epoch": 2.6335611046383476, "grad_norm": 0.21142180263996124, "learning_rate": 1.3418952965066365e-05, "loss": 0.153, "step": 20910 }, { "epoch": 2.6341908870485247, "grad_norm": 0.2231752574443817, "learning_rate": 1.3373545340730785e-05, "loss": 0.1641, "step": 20915 }, { "epoch": 2.634820669458702, "grad_norm": 0.19116418063640594, "learning_rate": 1.3328211087922192e-05, "loss": 0.1503, "step": 20920 }, { "epoch": 2.6354504518688793, "grad_norm": 0.18010330200195312, "learning_rate": 1.3282950230986194e-05, "loss": 0.1434, "step": 20925 }, { "epoch": 2.6360802342790564, "grad_norm": 0.179785817861557, "learning_rate": 1.3237762794228884e-05, "loss": 0.1502, "step": 20930 }, { "epoch": 2.636710016689234, "grad_norm": 0.1842677742242813, "learning_rate": 1.319264880191695e-05, "loss": 0.1421, "step": 20935 }, { "epoch": 2.6373397990994114, "grad_norm": 0.22725196182727814, "learning_rate": 1.314760827827756e-05, "loss": 0.1464, "step": 20940 }, { "epoch": 2.6379695815095885, "grad_norm": 0.21761812269687653, "learning_rate": 1.3102641247498585e-05, "loss": 0.1492, "step": 20945 }, { "epoch": 2.6385993639197656, "grad_norm": 0.2054702490568161, "learning_rate": 1.305774773372834e-05, "loss": 0.1626, "step": 20950 }, { "epoch": 2.639229146329943, "grad_norm": 0.20189544558525085, "learning_rate": 1.3012927761075658e-05, "loss": 0.1672, "step": 20955 }, { "epoch": 2.63985892874012, "grad_norm": 0.2214374542236328, "learning_rate": 1.2968181353609852e-05, "loss": 0.159, "step": 20960 }, { "epoch": 2.6404887111502977, "grad_norm": 0.20227175951004028, "learning_rate": 1.2923508535360833e-05, "loss": 0.1668, "step": 20965 }, { "epoch": 2.641118493560475, "grad_norm": 0.2125934362411499, "learning_rate": 1.2878909330318893e-05, "loss": 0.1587, "step": 20970 }, { "epoch": 2.6417482759706523, "grad_norm": 0.20071247220039368, "learning_rate": 1.2834383762434807e-05, "loss": 0.1515, "step": 20975 }, { "epoch": 2.6423780583808294, "grad_norm": 0.18576478958129883, "learning_rate": 1.2789931855619817e-05, "loss": 0.152, "step": 20980 }, { "epoch": 2.6430078407910065, "grad_norm": 0.2210751622915268, "learning_rate": 1.2745553633745642e-05, "loss": 0.1542, "step": 20985 }, { "epoch": 2.643637623201184, "grad_norm": 0.20466601848602295, "learning_rate": 1.2701249120644402e-05, "loss": 0.1599, "step": 20990 }, { "epoch": 2.6442674056113615, "grad_norm": 0.18101972341537476, "learning_rate": 1.2657018340108616e-05, "loss": 0.1434, "step": 20995 }, { "epoch": 2.6448971880215386, "grad_norm": 0.19254201650619507, "learning_rate": 1.2612861315891215e-05, "loss": 0.1492, "step": 21000 }, { "epoch": 2.6448971880215386, "eval_loss": 0.35662394762039185, "eval_runtime": 6.1657, "eval_samples_per_second": 162.189, "eval_steps_per_second": 10.218, "step": 21000 }, { "epoch": 2.6455269704317157, "grad_norm": 0.21622531116008759, "learning_rate": 1.2568778071705564e-05, "loss": 0.1508, "step": 21005 }, { "epoch": 2.6461567528418932, "grad_norm": 0.20254169404506683, "learning_rate": 1.2524768631225329e-05, "loss": 0.1541, "step": 21010 }, { "epoch": 2.6467865352520703, "grad_norm": 0.18706361949443817, "learning_rate": 1.2480833018084619e-05, "loss": 0.1554, "step": 21015 }, { "epoch": 2.647416317662248, "grad_norm": 0.19682380557060242, "learning_rate": 1.2436971255877825e-05, "loss": 0.1527, "step": 21020 }, { "epoch": 2.648046100072425, "grad_norm": 0.193098783493042, "learning_rate": 1.2393183368159759e-05, "loss": 0.1505, "step": 21025 }, { "epoch": 2.6486758824826024, "grad_norm": 0.1954520344734192, "learning_rate": 1.2349469378445493e-05, "loss": 0.1463, "step": 21030 }, { "epoch": 2.6493056648927795, "grad_norm": 0.2523531913757324, "learning_rate": 1.2305829310210446e-05, "loss": 0.1655, "step": 21035 }, { "epoch": 2.6499354473029566, "grad_norm": 0.20331156253814697, "learning_rate": 1.2262263186890325e-05, "loss": 0.1514, "step": 21040 }, { "epoch": 2.650565229713134, "grad_norm": 0.2400408834218979, "learning_rate": 1.221877103188113e-05, "loss": 0.1673, "step": 21045 }, { "epoch": 2.6511950121233117, "grad_norm": 0.18541163206100464, "learning_rate": 1.2175352868539162e-05, "loss": 0.1582, "step": 21050 }, { "epoch": 2.6518247945334887, "grad_norm": 0.24442121386528015, "learning_rate": 1.2132008720180953e-05, "loss": 0.1525, "step": 21055 }, { "epoch": 2.652454576943666, "grad_norm": 0.23133227229118347, "learning_rate": 1.2088738610083282e-05, "loss": 0.1547, "step": 21060 }, { "epoch": 2.6530843593538433, "grad_norm": 0.21159769594669342, "learning_rate": 1.2045542561483196e-05, "loss": 0.1451, "step": 21065 }, { "epoch": 2.6537141417640204, "grad_norm": 0.19382759928703308, "learning_rate": 1.2002420597577972e-05, "loss": 0.1532, "step": 21070 }, { "epoch": 2.654343924174198, "grad_norm": 0.18077696859836578, "learning_rate": 1.1959372741525135e-05, "loss": 0.1493, "step": 21075 }, { "epoch": 2.654973706584375, "grad_norm": 0.24647746980190277, "learning_rate": 1.1916399016442264e-05, "loss": 0.1533, "step": 21080 }, { "epoch": 2.6556034889945526, "grad_norm": 0.19929052889347076, "learning_rate": 1.1873499445407291e-05, "loss": 0.1418, "step": 21085 }, { "epoch": 2.6562332714047296, "grad_norm": 0.2208561897277832, "learning_rate": 1.1830674051458277e-05, "loss": 0.1628, "step": 21090 }, { "epoch": 2.6568630538149067, "grad_norm": 0.18301671743392944, "learning_rate": 1.1787922857593406e-05, "loss": 0.1495, "step": 21095 }, { "epoch": 2.6574928362250843, "grad_norm": 0.1876724660396576, "learning_rate": 1.1745245886771065e-05, "loss": 0.1526, "step": 21100 }, { "epoch": 2.658122618635262, "grad_norm": 0.18740510940551758, "learning_rate": 1.1702643161909736e-05, "loss": 0.1469, "step": 21105 }, { "epoch": 2.658752401045439, "grad_norm": 0.2159937024116516, "learning_rate": 1.1660114705888119e-05, "loss": 0.1534, "step": 21110 }, { "epoch": 2.659382183455616, "grad_norm": 0.19949863851070404, "learning_rate": 1.1617660541544893e-05, "loss": 0.1446, "step": 21115 }, { "epoch": 2.6600119658657935, "grad_norm": 0.18760685622692108, "learning_rate": 1.1575280691678956e-05, "loss": 0.1495, "step": 21120 }, { "epoch": 2.6606417482759706, "grad_norm": 0.1935281902551651, "learning_rate": 1.153297517904922e-05, "loss": 0.1563, "step": 21125 }, { "epoch": 2.661271530686148, "grad_norm": 0.20625917613506317, "learning_rate": 1.1490744026374743e-05, "loss": 0.1527, "step": 21130 }, { "epoch": 2.661901313096325, "grad_norm": 0.22293558716773987, "learning_rate": 1.1448587256334618e-05, "loss": 0.1573, "step": 21135 }, { "epoch": 2.6625310955065027, "grad_norm": 0.22753605246543884, "learning_rate": 1.1406504891567986e-05, "loss": 0.1563, "step": 21140 }, { "epoch": 2.6631608779166798, "grad_norm": 0.1857980489730835, "learning_rate": 1.1364496954674035e-05, "loss": 0.1542, "step": 21145 }, { "epoch": 2.663790660326857, "grad_norm": 0.20376616716384888, "learning_rate": 1.1322563468212003e-05, "loss": 0.1533, "step": 21150 }, { "epoch": 2.6644204427370344, "grad_norm": 0.18928895890712738, "learning_rate": 1.1280704454701111e-05, "loss": 0.151, "step": 21155 }, { "epoch": 2.665050225147212, "grad_norm": 0.2215338945388794, "learning_rate": 1.1238919936620593e-05, "loss": 0.1484, "step": 21160 }, { "epoch": 2.665680007557389, "grad_norm": 0.26164811849594116, "learning_rate": 1.1197209936409702e-05, "loss": 0.1604, "step": 21165 }, { "epoch": 2.666309789967566, "grad_norm": 0.1930347979068756, "learning_rate": 1.1155574476467682e-05, "loss": 0.1578, "step": 21170 }, { "epoch": 2.6669395723777436, "grad_norm": 0.18400873243808746, "learning_rate": 1.1114013579153719e-05, "loss": 0.1559, "step": 21175 }, { "epoch": 2.6675693547879207, "grad_norm": 0.19113576412200928, "learning_rate": 1.1072527266786974e-05, "loss": 0.1583, "step": 21180 }, { "epoch": 2.668199137198098, "grad_norm": 0.1980462521314621, "learning_rate": 1.1031115561646476e-05, "loss": 0.1516, "step": 21185 }, { "epoch": 2.6688289196082753, "grad_norm": 0.24164487421512604, "learning_rate": 1.0989778485971334e-05, "loss": 0.1578, "step": 21190 }, { "epoch": 2.669458702018453, "grad_norm": 0.18318641185760498, "learning_rate": 1.0948516061960478e-05, "loss": 0.1517, "step": 21195 }, { "epoch": 2.67008848442863, "grad_norm": 0.20504848659038544, "learning_rate": 1.0907328311772778e-05, "loss": 0.1619, "step": 21200 }, { "epoch": 2.670718266838807, "grad_norm": 0.214483380317688, "learning_rate": 1.0866215257526978e-05, "loss": 0.1445, "step": 21205 }, { "epoch": 2.6713480492489845, "grad_norm": 0.24230434000492096, "learning_rate": 1.0825176921301698e-05, "loss": 0.1521, "step": 21210 }, { "epoch": 2.671977831659162, "grad_norm": 0.22668616473674774, "learning_rate": 1.0784213325135577e-05, "loss": 0.1539, "step": 21215 }, { "epoch": 2.672607614069339, "grad_norm": 0.18815076351165771, "learning_rate": 1.0743324491026883e-05, "loss": 0.1496, "step": 21220 }, { "epoch": 2.673237396479516, "grad_norm": 0.24532602727413177, "learning_rate": 1.070251044093387e-05, "loss": 0.1572, "step": 21225 }, { "epoch": 2.6738671788896937, "grad_norm": 0.2050776183605194, "learning_rate": 1.066177119677467e-05, "loss": 0.1585, "step": 21230 }, { "epoch": 2.674496961299871, "grad_norm": 0.1992231011390686, "learning_rate": 1.062110678042717e-05, "loss": 0.1493, "step": 21235 }, { "epoch": 2.6751267437100483, "grad_norm": 0.2188093513250351, "learning_rate": 1.0580517213729062e-05, "loss": 0.1526, "step": 21240 }, { "epoch": 2.6757565261202254, "grad_norm": 0.17839093506336212, "learning_rate": 1.0540002518477898e-05, "loss": 0.146, "step": 21245 }, { "epoch": 2.676386308530403, "grad_norm": 0.20759402215480804, "learning_rate": 1.0499562716430987e-05, "loss": 0.1527, "step": 21250 }, { "epoch": 2.67701609094058, "grad_norm": 0.20209045708179474, "learning_rate": 1.0459197829305427e-05, "loss": 0.1507, "step": 21255 }, { "epoch": 2.677645873350757, "grad_norm": 0.24553018808364868, "learning_rate": 1.0418907878778077e-05, "loss": 0.1568, "step": 21260 }, { "epoch": 2.6782756557609346, "grad_norm": 0.24322043359279633, "learning_rate": 1.0378692886485563e-05, "loss": 0.1527, "step": 21265 }, { "epoch": 2.678905438171112, "grad_norm": 0.20755696296691895, "learning_rate": 1.0338552874024242e-05, "loss": 0.1497, "step": 21270 }, { "epoch": 2.6795352205812892, "grad_norm": 0.19075340032577515, "learning_rate": 1.0298487862950256e-05, "loss": 0.1514, "step": 21275 }, { "epoch": 2.6801650029914663, "grad_norm": 0.20733466744422913, "learning_rate": 1.0258497874779426e-05, "loss": 0.1531, "step": 21280 }, { "epoch": 2.680794785401644, "grad_norm": 0.20700550079345703, "learning_rate": 1.0218582930987224e-05, "loss": 0.1547, "step": 21285 }, { "epoch": 2.681424567811821, "grad_norm": 0.1864207237958908, "learning_rate": 1.0178743053008969e-05, "loss": 0.1507, "step": 21290 }, { "epoch": 2.6820543502219985, "grad_norm": 0.21107596158981323, "learning_rate": 1.0138978262239532e-05, "loss": 0.1511, "step": 21295 }, { "epoch": 2.6826841326321755, "grad_norm": 0.25058072805404663, "learning_rate": 1.0099288580033548e-05, "loss": 0.1573, "step": 21300 }, { "epoch": 2.683313915042353, "grad_norm": 0.18913084268569946, "learning_rate": 1.005967402770525e-05, "loss": 0.1415, "step": 21305 }, { "epoch": 2.68394369745253, "grad_norm": 0.18435829877853394, "learning_rate": 1.002013462652857e-05, "loss": 0.1432, "step": 21310 }, { "epoch": 2.6845734798627072, "grad_norm": 0.19929499924182892, "learning_rate": 9.980670397737106e-06, "loss": 0.1562, "step": 21315 }, { "epoch": 2.6852032622728847, "grad_norm": 0.2412646859884262, "learning_rate": 9.941281362524007e-06, "loss": 0.1544, "step": 21320 }, { "epoch": 2.6858330446830623, "grad_norm": 0.23384952545166016, "learning_rate": 9.9019675420421e-06, "loss": 0.1598, "step": 21325 }, { "epoch": 2.6864628270932394, "grad_norm": 0.1778135895729065, "learning_rate": 9.862728957403766e-06, "loss": 0.1515, "step": 21330 }, { "epoch": 2.6870926095034164, "grad_norm": 0.20782922208309174, "learning_rate": 9.823565629681079e-06, "loss": 0.1504, "step": 21335 }, { "epoch": 2.687722391913594, "grad_norm": 0.18523196876049042, "learning_rate": 9.78447757990562e-06, "loss": 0.1425, "step": 21340 }, { "epoch": 2.688352174323771, "grad_norm": 0.18965183198451996, "learning_rate": 9.745464829068561e-06, "loss": 0.1541, "step": 21345 }, { "epoch": 2.6889819567339486, "grad_norm": 0.18834419548511505, "learning_rate": 9.706527398120645e-06, "loss": 0.1536, "step": 21350 }, { "epoch": 2.6896117391441257, "grad_norm": 0.18705077469348907, "learning_rate": 9.66766530797216e-06, "loss": 0.1451, "step": 21355 }, { "epoch": 2.690241521554303, "grad_norm": 0.1886008232831955, "learning_rate": 9.628878579492932e-06, "loss": 0.1484, "step": 21360 }, { "epoch": 2.6908713039644803, "grad_norm": 0.19375784695148468, "learning_rate": 9.590167233512314e-06, "loss": 0.1554, "step": 21365 }, { "epoch": 2.6915010863746573, "grad_norm": 0.18573135137557983, "learning_rate": 9.551531290819192e-06, "loss": 0.1608, "step": 21370 }, { "epoch": 2.692130868784835, "grad_norm": 0.18215128779411316, "learning_rate": 9.512970772161955e-06, "loss": 0.1564, "step": 21375 }, { "epoch": 2.6927606511950124, "grad_norm": 0.1941639482975006, "learning_rate": 9.474485698248469e-06, "loss": 0.1551, "step": 21380 }, { "epoch": 2.6933904336051895, "grad_norm": 0.19289493560791016, "learning_rate": 9.436076089746153e-06, "loss": 0.1537, "step": 21385 }, { "epoch": 2.6940202160153666, "grad_norm": 0.19275395572185516, "learning_rate": 9.397741967281724e-06, "loss": 0.1441, "step": 21390 }, { "epoch": 2.694649998425544, "grad_norm": 0.18316887319087982, "learning_rate": 9.359483351441599e-06, "loss": 0.1496, "step": 21395 }, { "epoch": 2.695279780835721, "grad_norm": 0.16594599187374115, "learning_rate": 9.321300262771475e-06, "loss": 0.1408, "step": 21400 }, { "epoch": 2.6959095632458987, "grad_norm": 0.2479625940322876, "learning_rate": 9.28319272177655e-06, "loss": 0.1565, "step": 21405 }, { "epoch": 2.696539345656076, "grad_norm": 0.18492808938026428, "learning_rate": 9.245160748921454e-06, "loss": 0.143, "step": 21410 }, { "epoch": 2.6971691280662533, "grad_norm": 0.22853007912635803, "learning_rate": 9.207204364630182e-06, "loss": 0.1668, "step": 21415 }, { "epoch": 2.6977989104764304, "grad_norm": 0.1997872143983841, "learning_rate": 9.169323589286264e-06, "loss": 0.1563, "step": 21420 }, { "epoch": 2.6984286928866075, "grad_norm": 0.23863272368907928, "learning_rate": 9.131518443232476e-06, "loss": 0.1554, "step": 21425 }, { "epoch": 2.699058475296785, "grad_norm": 0.17353664338588715, "learning_rate": 9.09378894677103e-06, "loss": 0.147, "step": 21430 }, { "epoch": 2.699688257706962, "grad_norm": 0.2168291211128235, "learning_rate": 9.056135120163582e-06, "loss": 0.1553, "step": 21435 }, { "epoch": 2.7003180401171396, "grad_norm": 0.23211082816123962, "learning_rate": 9.018556983631076e-06, "loss": 0.1493, "step": 21440 }, { "epoch": 2.7009478225273167, "grad_norm": 0.22088773548603058, "learning_rate": 8.981054557353834e-06, "loss": 0.1567, "step": 21445 }, { "epoch": 2.701577604937494, "grad_norm": 0.20668818056583405, "learning_rate": 8.943627861471497e-06, "loss": 0.1559, "step": 21450 }, { "epoch": 2.7022073873476713, "grad_norm": 0.22993560135364532, "learning_rate": 8.906276916083072e-06, "loss": 0.1628, "step": 21455 }, { "epoch": 2.702837169757849, "grad_norm": 0.214871346950531, "learning_rate": 8.869001741246862e-06, "loss": 0.1567, "step": 21460 }, { "epoch": 2.703466952168026, "grad_norm": 0.20056143403053284, "learning_rate": 8.831802356980505e-06, "loss": 0.1494, "step": 21465 }, { "epoch": 2.7040967345782034, "grad_norm": 0.18365876376628876, "learning_rate": 8.79467878326089e-06, "loss": 0.1547, "step": 21470 }, { "epoch": 2.7047265169883805, "grad_norm": 0.1938326060771942, "learning_rate": 8.757631040024215e-06, "loss": 0.1591, "step": 21475 }, { "epoch": 2.7053562993985576, "grad_norm": 0.207264244556427, "learning_rate": 8.72065914716602e-06, "loss": 0.1588, "step": 21480 }, { "epoch": 2.705986081808735, "grad_norm": 0.23815831542015076, "learning_rate": 8.683763124541021e-06, "loss": 0.1551, "step": 21485 }, { "epoch": 2.706615864218912, "grad_norm": 0.20644132792949677, "learning_rate": 8.646942991963236e-06, "loss": 0.1496, "step": 21490 }, { "epoch": 2.7072456466290897, "grad_norm": 0.19380377233028412, "learning_rate": 8.610198769205895e-06, "loss": 0.1499, "step": 21495 }, { "epoch": 2.707875429039267, "grad_norm": 0.1877509504556656, "learning_rate": 8.5735304760015e-06, "loss": 0.1582, "step": 21500 }, { "epoch": 2.7085052114494443, "grad_norm": 0.20092125236988068, "learning_rate": 8.536938132041781e-06, "loss": 0.1541, "step": 21505 }, { "epoch": 2.7091349938596214, "grad_norm": 0.20917046070098877, "learning_rate": 8.500421756977637e-06, "loss": 0.1555, "step": 21510 }, { "epoch": 2.709764776269799, "grad_norm": 0.18814347684383392, "learning_rate": 8.463981370419165e-06, "loss": 0.1511, "step": 21515 }, { "epoch": 2.710394558679976, "grad_norm": 0.2021394819021225, "learning_rate": 8.427616991935759e-06, "loss": 0.1539, "step": 21520 }, { "epoch": 2.7110243410901536, "grad_norm": 0.19899116456508636, "learning_rate": 8.3913286410559e-06, "loss": 0.1553, "step": 21525 }, { "epoch": 2.7116541235003306, "grad_norm": 0.2093294858932495, "learning_rate": 8.355116337267231e-06, "loss": 0.1581, "step": 21530 }, { "epoch": 2.7122839059105077, "grad_norm": 0.215724378824234, "learning_rate": 8.318980100016564e-06, "loss": 0.1516, "step": 21535 }, { "epoch": 2.7129136883206852, "grad_norm": 0.21019119024276733, "learning_rate": 8.28291994870996e-06, "loss": 0.1521, "step": 21540 }, { "epoch": 2.7135434707308623, "grad_norm": 0.20992571115493774, "learning_rate": 8.246935902712493e-06, "loss": 0.1401, "step": 21545 }, { "epoch": 2.71417325314104, "grad_norm": 0.1939440220594406, "learning_rate": 8.21102798134844e-06, "loss": 0.1572, "step": 21550 }, { "epoch": 2.714803035551217, "grad_norm": 0.2128129005432129, "learning_rate": 8.175196203901157e-06, "loss": 0.1624, "step": 21555 }, { "epoch": 2.7154328179613945, "grad_norm": 0.22001588344573975, "learning_rate": 8.139440589613122e-06, "loss": 0.1498, "step": 21560 }, { "epoch": 2.7160626003715715, "grad_norm": 0.24958358705043793, "learning_rate": 8.103761157685939e-06, "loss": 0.1614, "step": 21565 }, { "epoch": 2.716692382781749, "grad_norm": 0.21756353974342346, "learning_rate": 8.068157927280284e-06, "loss": 0.1515, "step": 21570 }, { "epoch": 2.717322165191926, "grad_norm": 0.19753116369247437, "learning_rate": 8.032630917515842e-06, "loss": 0.1504, "step": 21575 }, { "epoch": 2.7179519476021037, "grad_norm": 0.2083761841058731, "learning_rate": 7.997180147471505e-06, "loss": 0.1488, "step": 21580 }, { "epoch": 2.7185817300122808, "grad_norm": 0.2009708434343338, "learning_rate": 7.961805636185126e-06, "loss": 0.1475, "step": 21585 }, { "epoch": 2.719211512422458, "grad_norm": 0.23513175547122955, "learning_rate": 7.926507402653609e-06, "loss": 0.1479, "step": 21590 }, { "epoch": 2.7198412948326354, "grad_norm": 0.1990012526512146, "learning_rate": 7.891285465832909e-06, "loss": 0.1498, "step": 21595 }, { "epoch": 2.7204710772428125, "grad_norm": 0.2000730186700821, "learning_rate": 7.856139844638044e-06, "loss": 0.1553, "step": 21600 }, { "epoch": 2.72110085965299, "grad_norm": 0.17009419202804565, "learning_rate": 7.821070557942966e-06, "loss": 0.138, "step": 21605 }, { "epoch": 2.721730642063167, "grad_norm": 0.19666020572185516, "learning_rate": 7.786077624580728e-06, "loss": 0.1505, "step": 21610 }, { "epoch": 2.7223604244733446, "grad_norm": 0.20230218768119812, "learning_rate": 7.751161063343314e-06, "loss": 0.1459, "step": 21615 }, { "epoch": 2.7229902068835217, "grad_norm": 0.20249028503894806, "learning_rate": 7.716320892981692e-06, "loss": 0.1481, "step": 21620 }, { "epoch": 2.7236199892936988, "grad_norm": 0.183380126953125, "learning_rate": 7.681557132205861e-06, "loss": 0.1513, "step": 21625 }, { "epoch": 2.7242497717038763, "grad_norm": 0.22188283503055573, "learning_rate": 7.646869799684791e-06, "loss": 0.1534, "step": 21630 }, { "epoch": 2.724879554114054, "grad_norm": 0.19538500905036926, "learning_rate": 7.6122589140462766e-06, "loss": 0.1524, "step": 21635 }, { "epoch": 2.725509336524231, "grad_norm": 0.1824834644794464, "learning_rate": 7.577724493877219e-06, "loss": 0.1564, "step": 21640 }, { "epoch": 2.726139118934408, "grad_norm": 0.18397974967956543, "learning_rate": 7.543266557723398e-06, "loss": 0.1467, "step": 21645 }, { "epoch": 2.7267689013445855, "grad_norm": 0.22993116080760956, "learning_rate": 7.508885124089481e-06, "loss": 0.1546, "step": 21650 }, { "epoch": 2.7273986837547626, "grad_norm": 0.18351049721240997, "learning_rate": 7.47458021143908e-06, "loss": 0.1616, "step": 21655 }, { "epoch": 2.72802846616494, "grad_norm": 0.20072756707668304, "learning_rate": 7.440351838194724e-06, "loss": 0.1451, "step": 21660 }, { "epoch": 2.728658248575117, "grad_norm": 0.19199103116989136, "learning_rate": 7.406200022737879e-06, "loss": 0.1518, "step": 21665 }, { "epoch": 2.7292880309852947, "grad_norm": 0.21039634943008423, "learning_rate": 7.372124783408789e-06, "loss": 0.154, "step": 21670 }, { "epoch": 2.729917813395472, "grad_norm": 0.2162015289068222, "learning_rate": 7.33812613850665e-06, "loss": 0.1459, "step": 21675 }, { "epoch": 2.730547595805649, "grad_norm": 0.192021444439888, "learning_rate": 7.304204106289507e-06, "loss": 0.1547, "step": 21680 }, { "epoch": 2.7311773782158264, "grad_norm": 0.20860375463962555, "learning_rate": 7.270358704974289e-06, "loss": 0.1501, "step": 21685 }, { "epoch": 2.731807160626004, "grad_norm": 0.1841016709804535, "learning_rate": 7.236589952736738e-06, "loss": 0.1538, "step": 21690 }, { "epoch": 2.732436943036181, "grad_norm": 0.23411309719085693, "learning_rate": 7.202897867711449e-06, "loss": 0.153, "step": 21695 }, { "epoch": 2.733066725446358, "grad_norm": 0.2005651742219925, "learning_rate": 7.1692824679918325e-06, "loss": 0.1505, "step": 21700 }, { "epoch": 2.7336965078565356, "grad_norm": 0.18157663941383362, "learning_rate": 7.135743771630131e-06, "loss": 0.1424, "step": 21705 }, { "epoch": 2.7343262902667127, "grad_norm": 0.20939917862415314, "learning_rate": 7.102281796637388e-06, "loss": 0.1585, "step": 21710 }, { "epoch": 2.7349560726768902, "grad_norm": 0.17006689310073853, "learning_rate": 7.068896560983445e-06, "loss": 0.1529, "step": 21715 }, { "epoch": 2.7355858550870673, "grad_norm": 0.23061016201972961, "learning_rate": 7.035588082596927e-06, "loss": 0.1556, "step": 21720 }, { "epoch": 2.736215637497245, "grad_norm": 0.20175643265247345, "learning_rate": 7.002356379365276e-06, "loss": 0.1559, "step": 21725 }, { "epoch": 2.736845419907422, "grad_norm": 0.19943305850028992, "learning_rate": 6.969201469134683e-06, "loss": 0.147, "step": 21730 }, { "epoch": 2.737475202317599, "grad_norm": 0.22196878492832184, "learning_rate": 6.936123369710056e-06, "loss": 0.1517, "step": 21735 }, { "epoch": 2.7381049847277765, "grad_norm": 0.19505414366722107, "learning_rate": 6.903122098855085e-06, "loss": 0.1464, "step": 21740 }, { "epoch": 2.738734767137954, "grad_norm": 0.19797982275485992, "learning_rate": 6.870197674292227e-06, "loss": 0.1407, "step": 21745 }, { "epoch": 2.739364549548131, "grad_norm": 0.2223568856716156, "learning_rate": 6.837350113702672e-06, "loss": 0.1524, "step": 21750 }, { "epoch": 2.739994331958308, "grad_norm": 0.22087423503398895, "learning_rate": 6.804579434726276e-06, "loss": 0.1578, "step": 21755 }, { "epoch": 2.7406241143684857, "grad_norm": 0.19389192759990692, "learning_rate": 6.771885654961662e-06, "loss": 0.1445, "step": 21760 }, { "epoch": 2.741253896778663, "grad_norm": 0.20979470014572144, "learning_rate": 6.739268791966118e-06, "loss": 0.1548, "step": 21765 }, { "epoch": 2.7418836791888404, "grad_norm": 0.22365309298038483, "learning_rate": 6.7067288632556505e-06, "loss": 0.1471, "step": 21770 }, { "epoch": 2.7425134615990174, "grad_norm": 0.20007841289043427, "learning_rate": 6.674265886304964e-06, "loss": 0.1548, "step": 21775 }, { "epoch": 2.743143244009195, "grad_norm": 0.1756853312253952, "learning_rate": 6.641879878547379e-06, "loss": 0.1443, "step": 21780 }, { "epoch": 2.743773026419372, "grad_norm": 0.21500404179096222, "learning_rate": 6.609570857374952e-06, "loss": 0.1584, "step": 21785 }, { "epoch": 2.744402808829549, "grad_norm": 0.1938805729150772, "learning_rate": 6.577338840138369e-06, "loss": 0.155, "step": 21790 }, { "epoch": 2.7450325912397266, "grad_norm": 0.20673929154872894, "learning_rate": 6.545183844146951e-06, "loss": 0.1526, "step": 21795 }, { "epoch": 2.745662373649904, "grad_norm": 0.19749803841114044, "learning_rate": 6.513105886668668e-06, "loss": 0.1533, "step": 21800 }, { "epoch": 2.7462921560600813, "grad_norm": 0.212607279419899, "learning_rate": 6.481104984930107e-06, "loss": 0.1565, "step": 21805 }, { "epoch": 2.7469219384702583, "grad_norm": 0.1796950250864029, "learning_rate": 6.449181156116473e-06, "loss": 0.1464, "step": 21810 }, { "epoch": 2.747551720880436, "grad_norm": 0.18281513452529907, "learning_rate": 6.417334417371616e-06, "loss": 0.1482, "step": 21815 }, { "epoch": 2.748181503290613, "grad_norm": 0.23321060836315155, "learning_rate": 6.385564785797958e-06, "loss": 0.1489, "step": 21820 }, { "epoch": 2.7488112857007905, "grad_norm": 0.2202220857143402, "learning_rate": 6.353872278456501e-06, "loss": 0.1477, "step": 21825 }, { "epoch": 2.7494410681109676, "grad_norm": 0.283456951379776, "learning_rate": 6.3222569123668635e-06, "loss": 0.1582, "step": 21830 }, { "epoch": 2.750070850521145, "grad_norm": 0.18883143365383148, "learning_rate": 6.29071870450723e-06, "loss": 0.1469, "step": 21835 }, { "epoch": 2.750700632931322, "grad_norm": 0.20364224910736084, "learning_rate": 6.259257671814272e-06, "loss": 0.1567, "step": 21840 }, { "epoch": 2.7513304153414992, "grad_norm": 0.19058570265769958, "learning_rate": 6.227873831183355e-06, "loss": 0.1449, "step": 21845 }, { "epoch": 2.7519601977516768, "grad_norm": 0.20439192652702332, "learning_rate": 6.196567199468299e-06, "loss": 0.1486, "step": 21850 }, { "epoch": 2.7525899801618543, "grad_norm": 0.1962665468454361, "learning_rate": 6.165337793481473e-06, "loss": 0.1499, "step": 21855 }, { "epoch": 2.7532197625720314, "grad_norm": 0.22097113728523254, "learning_rate": 6.134185629993793e-06, "loss": 0.153, "step": 21860 }, { "epoch": 2.7538495449822085, "grad_norm": 0.20070448517799377, "learning_rate": 6.103110725734644e-06, "loss": 0.1463, "step": 21865 }, { "epoch": 2.754479327392386, "grad_norm": 0.20577707886695862, "learning_rate": 6.072113097392028e-06, "loss": 0.1549, "step": 21870 }, { "epoch": 2.755109109802563, "grad_norm": 0.1798795461654663, "learning_rate": 6.041192761612313e-06, "loss": 0.1454, "step": 21875 }, { "epoch": 2.7557388922127406, "grad_norm": 0.20694920420646667, "learning_rate": 6.010349735000464e-06, "loss": 0.1524, "step": 21880 }, { "epoch": 2.7563686746229177, "grad_norm": 0.19873858988285065, "learning_rate": 5.979584034119867e-06, "loss": 0.1523, "step": 21885 }, { "epoch": 2.756998457033095, "grad_norm": 0.2215358465909958, "learning_rate": 5.948895675492421e-06, "loss": 0.1508, "step": 21890 }, { "epoch": 2.7576282394432723, "grad_norm": 0.21731533110141754, "learning_rate": 5.918284675598478e-06, "loss": 0.149, "step": 21895 }, { "epoch": 2.7582580218534494, "grad_norm": 0.21298860013484955, "learning_rate": 5.887751050876837e-06, "loss": 0.156, "step": 21900 }, { "epoch": 2.758887804263627, "grad_norm": 0.20131991803646088, "learning_rate": 5.85729481772475e-06, "loss": 0.1403, "step": 21905 }, { "epoch": 2.7595175866738044, "grad_norm": 0.17870669066905975, "learning_rate": 5.826915992497932e-06, "loss": 0.1483, "step": 21910 }, { "epoch": 2.7601473690839815, "grad_norm": 0.2430955022573471, "learning_rate": 5.796614591510468e-06, "loss": 0.1484, "step": 21915 }, { "epoch": 2.7607771514941586, "grad_norm": 0.1986503154039383, "learning_rate": 5.766390631034939e-06, "loss": 0.1524, "step": 21920 }, { "epoch": 2.761406933904336, "grad_norm": 0.1926422268152237, "learning_rate": 5.7362441273022645e-06, "loss": 0.1484, "step": 21925 }, { "epoch": 2.762036716314513, "grad_norm": 0.23347438871860504, "learning_rate": 5.706175096501825e-06, "loss": 0.1512, "step": 21930 }, { "epoch": 2.7626664987246907, "grad_norm": 0.20513305068016052, "learning_rate": 5.676183554781405e-06, "loss": 0.1518, "step": 21935 }, { "epoch": 2.763296281134868, "grad_norm": 0.18283484876155853, "learning_rate": 5.64626951824712e-06, "loss": 0.1381, "step": 21940 }, { "epoch": 2.7639260635450453, "grad_norm": 0.17075172066688538, "learning_rate": 5.616433002963472e-06, "loss": 0.1501, "step": 21945 }, { "epoch": 2.7645558459552224, "grad_norm": 0.2107374221086502, "learning_rate": 5.5866740249533746e-06, "loss": 0.1581, "step": 21950 }, { "epoch": 2.7651856283653995, "grad_norm": 0.23205851018428802, "learning_rate": 5.556992600198079e-06, "loss": 0.1467, "step": 21955 }, { "epoch": 2.765815410775577, "grad_norm": 0.1973281055688858, "learning_rate": 5.527388744637201e-06, "loss": 0.1434, "step": 21960 }, { "epoch": 2.7664451931857545, "grad_norm": 0.20235906541347504, "learning_rate": 5.497862474168657e-06, "loss": 0.1454, "step": 21965 }, { "epoch": 2.7670749755959316, "grad_norm": 0.21266506612300873, "learning_rate": 5.4684138046487134e-06, "loss": 0.1454, "step": 21970 }, { "epoch": 2.7677047580061087, "grad_norm": 0.1890571415424347, "learning_rate": 5.43904275189207e-06, "loss": 0.1414, "step": 21975 }, { "epoch": 2.7683345404162862, "grad_norm": 0.1897963136434555, "learning_rate": 5.409749331671559e-06, "loss": 0.1493, "step": 21980 }, { "epoch": 2.7689643228264633, "grad_norm": 0.18935035169124603, "learning_rate": 5.380533559718414e-06, "loss": 0.1543, "step": 21985 }, { "epoch": 2.769594105236641, "grad_norm": 0.20879988372325897, "learning_rate": 5.351395451722251e-06, "loss": 0.151, "step": 21990 }, { "epoch": 2.770223887646818, "grad_norm": 0.20008423924446106, "learning_rate": 5.322335023330837e-06, "loss": 0.1515, "step": 21995 }, { "epoch": 2.7708536700569955, "grad_norm": 0.18473681807518005, "learning_rate": 5.293352290150321e-06, "loss": 0.1464, "step": 22000 }, { "epoch": 2.7708536700569955, "eval_loss": 0.3584047257900238, "eval_runtime": 6.1661, "eval_samples_per_second": 162.176, "eval_steps_per_second": 10.217, "step": 22000 }, { "epoch": 2.7714834524671725, "grad_norm": 0.19308076798915863, "learning_rate": 5.264447267745053e-06, "loss": 0.1582, "step": 22005 }, { "epoch": 2.7721132348773496, "grad_norm": 0.23008759319782257, "learning_rate": 5.235619971637734e-06, "loss": 0.1546, "step": 22010 }, { "epoch": 2.772743017287527, "grad_norm": 0.21323955059051514, "learning_rate": 5.206870417309245e-06, "loss": 0.1536, "step": 22015 }, { "epoch": 2.7733727996977047, "grad_norm": 0.23257404565811157, "learning_rate": 5.17819862019877e-06, "loss": 0.1516, "step": 22020 }, { "epoch": 2.7740025821078818, "grad_norm": 0.22094878554344177, "learning_rate": 5.14960459570371e-06, "loss": 0.1546, "step": 22025 }, { "epoch": 2.774632364518059, "grad_norm": 0.21868959069252014, "learning_rate": 5.121088359179698e-06, "loss": 0.1567, "step": 22030 }, { "epoch": 2.7752621469282364, "grad_norm": 0.19147329032421112, "learning_rate": 5.09264992594065e-06, "loss": 0.1502, "step": 22035 }, { "epoch": 2.7758919293384134, "grad_norm": 0.17076317965984344, "learning_rate": 5.064289311258618e-06, "loss": 0.1511, "step": 22040 }, { "epoch": 2.776521711748591, "grad_norm": 0.23041397333145142, "learning_rate": 5.036006530363917e-06, "loss": 0.1611, "step": 22045 }, { "epoch": 2.777151494158768, "grad_norm": 0.21972966194152832, "learning_rate": 5.007801598445033e-06, "loss": 0.1493, "step": 22050 }, { "epoch": 2.7777812765689456, "grad_norm": 0.17348721623420715, "learning_rate": 4.979674530648664e-06, "loss": 0.1481, "step": 22055 }, { "epoch": 2.7784110589791227, "grad_norm": 0.19225727021694183, "learning_rate": 4.9516253420796795e-06, "loss": 0.1493, "step": 22060 }, { "epoch": 2.7790408413892997, "grad_norm": 0.19729195535182953, "learning_rate": 4.9236540478011625e-06, "loss": 0.1442, "step": 22065 }, { "epoch": 2.7796706237994773, "grad_norm": 0.17798985540866852, "learning_rate": 4.8957606628342805e-06, "loss": 0.1507, "step": 22070 }, { "epoch": 2.780300406209655, "grad_norm": 0.19311825931072235, "learning_rate": 4.867945202158469e-06, "loss": 0.149, "step": 22075 }, { "epoch": 2.780930188619832, "grad_norm": 0.18525920808315277, "learning_rate": 4.840207680711278e-06, "loss": 0.1635, "step": 22080 }, { "epoch": 2.781559971030009, "grad_norm": 0.18988420069217682, "learning_rate": 4.812548113388342e-06, "loss": 0.153, "step": 22085 }, { "epoch": 2.7821897534401865, "grad_norm": 0.18699151277542114, "learning_rate": 4.784966515043498e-06, "loss": 0.147, "step": 22090 }, { "epoch": 2.7828195358503636, "grad_norm": 0.23182329535484314, "learning_rate": 4.757462900488695e-06, "loss": 0.1496, "step": 22095 }, { "epoch": 2.783449318260541, "grad_norm": 0.20079541206359863, "learning_rate": 4.730037284494021e-06, "loss": 0.1583, "step": 22100 }, { "epoch": 2.784079100670718, "grad_norm": 0.21548844873905182, "learning_rate": 4.702689681787625e-06, "loss": 0.1481, "step": 22105 }, { "epoch": 2.7847088830808957, "grad_norm": 0.1968826800584793, "learning_rate": 4.6754201070558105e-06, "loss": 0.1452, "step": 22110 }, { "epoch": 2.785338665491073, "grad_norm": 0.20061470568180084, "learning_rate": 4.648228574942997e-06, "loss": 0.1472, "step": 22115 }, { "epoch": 2.78596844790125, "grad_norm": 0.19061359763145447, "learning_rate": 4.621115100051604e-06, "loss": 0.1478, "step": 22120 }, { "epoch": 2.7865982303114274, "grad_norm": 0.23252861201763153, "learning_rate": 4.594079696942199e-06, "loss": 0.1527, "step": 22125 }, { "epoch": 2.787228012721605, "grad_norm": 0.1698002964258194, "learning_rate": 4.56712238013342e-06, "loss": 0.1379, "step": 22130 }, { "epoch": 2.787857795131782, "grad_norm": 0.19811010360717773, "learning_rate": 4.540243164101954e-06, "loss": 0.1417, "step": 22135 }, { "epoch": 2.788487577541959, "grad_norm": 0.2089819759130478, "learning_rate": 4.513442063282585e-06, "loss": 0.1517, "step": 22140 }, { "epoch": 2.7891173599521366, "grad_norm": 0.21028514206409454, "learning_rate": 4.486719092068086e-06, "loss": 0.1536, "step": 22145 }, { "epoch": 2.7897471423623137, "grad_norm": 0.20895244181156158, "learning_rate": 4.46007426480931e-06, "loss": 0.1421, "step": 22150 }, { "epoch": 2.790376924772491, "grad_norm": 0.1925353854894638, "learning_rate": 4.4335075958151275e-06, "loss": 0.1506, "step": 22155 }, { "epoch": 2.7910067071826683, "grad_norm": 0.21809720993041992, "learning_rate": 4.407019099352477e-06, "loss": 0.1537, "step": 22160 }, { "epoch": 2.791636489592846, "grad_norm": 0.23316286504268646, "learning_rate": 4.380608789646245e-06, "loss": 0.1593, "step": 22165 }, { "epoch": 2.792266272003023, "grad_norm": 0.20298117399215698, "learning_rate": 4.354276680879404e-06, "loss": 0.1469, "step": 22170 }, { "epoch": 2.7928960544132, "grad_norm": 0.18828284740447998, "learning_rate": 4.328022787192875e-06, "loss": 0.1478, "step": 22175 }, { "epoch": 2.7935258368233775, "grad_norm": 0.19351090490818024, "learning_rate": 4.301847122685614e-06, "loss": 0.139, "step": 22180 }, { "epoch": 2.794155619233555, "grad_norm": 0.19426658749580383, "learning_rate": 4.27574970141456e-06, "loss": 0.148, "step": 22185 }, { "epoch": 2.794785401643732, "grad_norm": 0.18554694950580597, "learning_rate": 4.2497305373945855e-06, "loss": 0.1484, "step": 22190 }, { "epoch": 2.795415184053909, "grad_norm": 0.21555371582508087, "learning_rate": 4.223789644598613e-06, "loss": 0.1537, "step": 22195 }, { "epoch": 2.7960449664640867, "grad_norm": 0.20736396312713623, "learning_rate": 4.197927036957499e-06, "loss": 0.1533, "step": 22200 }, { "epoch": 2.796674748874264, "grad_norm": 0.2143113762140274, "learning_rate": 4.172142728360017e-06, "loss": 0.1509, "step": 22205 }, { "epoch": 2.7973045312844413, "grad_norm": 0.1888829916715622, "learning_rate": 4.146436732652958e-06, "loss": 0.1507, "step": 22210 }, { "epoch": 2.7979343136946184, "grad_norm": 0.19072696566581726, "learning_rate": 4.1208090636410286e-06, "loss": 0.153, "step": 22215 }, { "epoch": 2.798564096104796, "grad_norm": 0.23674504458904266, "learning_rate": 4.09525973508687e-06, "loss": 0.1584, "step": 22220 }, { "epoch": 2.799193878514973, "grad_norm": 0.23174551129341125, "learning_rate": 4.06978876071104e-06, "loss": 0.1475, "step": 22225 }, { "epoch": 2.79982366092515, "grad_norm": 0.2185906022787094, "learning_rate": 4.044396154192031e-06, "loss": 0.1494, "step": 22230 }, { "epoch": 2.8004534433353276, "grad_norm": 0.1940082162618637, "learning_rate": 4.019081929166268e-06, "loss": 0.1497, "step": 22235 }, { "epoch": 2.801083225745505, "grad_norm": 0.1945921629667282, "learning_rate": 3.993846099228093e-06, "loss": 0.1524, "step": 22240 }, { "epoch": 2.8017130081556822, "grad_norm": 0.21760894358158112, "learning_rate": 3.968688677929682e-06, "loss": 0.1459, "step": 22245 }, { "epoch": 2.8023427905658593, "grad_norm": 0.19670112431049347, "learning_rate": 3.943609678781162e-06, "loss": 0.151, "step": 22250 }, { "epoch": 2.802972572976037, "grad_norm": 0.2076457440853119, "learning_rate": 3.918609115250509e-06, "loss": 0.1515, "step": 22255 }, { "epoch": 2.803602355386214, "grad_norm": 0.20138059556484222, "learning_rate": 3.893687000763635e-06, "loss": 0.1492, "step": 22260 }, { "epoch": 2.8042321377963915, "grad_norm": 0.20619480311870575, "learning_rate": 3.868843348704265e-06, "loss": 0.1516, "step": 22265 }, { "epoch": 2.8048619202065685, "grad_norm": 0.17885464429855347, "learning_rate": 3.844078172413994e-06, "loss": 0.1413, "step": 22270 }, { "epoch": 2.805491702616746, "grad_norm": 0.17029553651809692, "learning_rate": 3.8193914851922855e-06, "loss": 0.143, "step": 22275 }, { "epoch": 2.806121485026923, "grad_norm": 0.18624289333820343, "learning_rate": 3.794783300296483e-06, "loss": 0.1448, "step": 22280 }, { "epoch": 2.8067512674371002, "grad_norm": 0.20082144439220428, "learning_rate": 3.7702536309417497e-06, "loss": 0.1498, "step": 22285 }, { "epoch": 2.8073810498472778, "grad_norm": 0.213558167219162, "learning_rate": 3.745802490301031e-06, "loss": 0.165, "step": 22290 }, { "epoch": 2.8080108322574553, "grad_norm": 0.23692555725574493, "learning_rate": 3.721429891505173e-06, "loss": 0.1568, "step": 22295 }, { "epoch": 2.8086406146676324, "grad_norm": 0.18088509142398834, "learning_rate": 3.6971358476428237e-06, "loss": 0.1508, "step": 22300 }, { "epoch": 2.8092703970778095, "grad_norm": 0.20369915664196014, "learning_rate": 3.672920371760446e-06, "loss": 0.1469, "step": 22305 }, { "epoch": 2.809900179487987, "grad_norm": 0.18801896274089813, "learning_rate": 3.6487834768622883e-06, "loss": 0.1417, "step": 22310 }, { "epoch": 2.810529961898164, "grad_norm": 0.2028091549873352, "learning_rate": 3.6247251759104145e-06, "loss": 0.157, "step": 22315 }, { "epoch": 2.8111597443083416, "grad_norm": 0.16689006984233856, "learning_rate": 3.600745481824707e-06, "loss": 0.1393, "step": 22320 }, { "epoch": 2.8117895267185187, "grad_norm": 0.20889881253242493, "learning_rate": 3.576844407482765e-06, "loss": 0.1586, "step": 22325 }, { "epoch": 2.812419309128696, "grad_norm": 0.21049942076206207, "learning_rate": 3.5530219657200543e-06, "loss": 0.155, "step": 22330 }, { "epoch": 2.8130490915388733, "grad_norm": 0.23332563042640686, "learning_rate": 3.5292781693297247e-06, "loss": 0.1557, "step": 22335 }, { "epoch": 2.8136788739490504, "grad_norm": 0.1706390082836151, "learning_rate": 3.505613031062776e-06, "loss": 0.1421, "step": 22340 }, { "epoch": 2.814308656359228, "grad_norm": 0.17925478518009186, "learning_rate": 3.4820265636279265e-06, "loss": 0.1433, "step": 22345 }, { "epoch": 2.8149384387694054, "grad_norm": 0.15641047060489655, "learning_rate": 3.458518779691627e-06, "loss": 0.1423, "step": 22350 }, { "epoch": 2.8155682211795825, "grad_norm": 0.18733102083206177, "learning_rate": 3.435089691878112e-06, "loss": 0.1533, "step": 22355 }, { "epoch": 2.8161980035897596, "grad_norm": 0.22065778076648712, "learning_rate": 3.4117393127693183e-06, "loss": 0.157, "step": 22360 }, { "epoch": 2.816827785999937, "grad_norm": 0.20604351162910461, "learning_rate": 3.388467654904947e-06, "loss": 0.1438, "step": 22365 }, { "epoch": 2.817457568410114, "grad_norm": 0.17883001267910004, "learning_rate": 3.365274730782419e-06, "loss": 0.1465, "step": 22370 }, { "epoch": 2.8180873508202917, "grad_norm": 0.18118852376937866, "learning_rate": 3.3421605528568374e-06, "loss": 0.1501, "step": 22375 }, { "epoch": 2.818717133230469, "grad_norm": 0.2178465574979782, "learning_rate": 3.3191251335410564e-06, "loss": 0.1467, "step": 22380 }, { "epoch": 2.8193469156406463, "grad_norm": 0.18433082103729248, "learning_rate": 3.29616848520563e-06, "loss": 0.1478, "step": 22385 }, { "epoch": 2.8199766980508234, "grad_norm": 0.19671761989593506, "learning_rate": 3.273290620178831e-06, "loss": 0.144, "step": 22390 }, { "epoch": 2.8206064804610005, "grad_norm": 0.2000323235988617, "learning_rate": 3.2504915507465144e-06, "loss": 0.1443, "step": 22395 }, { "epoch": 2.821236262871178, "grad_norm": 0.19443731009960175, "learning_rate": 3.22777128915237e-06, "loss": 0.1537, "step": 22400 }, { "epoch": 2.8218660452813555, "grad_norm": 0.19904273748397827, "learning_rate": 3.2051298475976707e-06, "loss": 0.1581, "step": 22405 }, { "epoch": 2.8224958276915326, "grad_norm": 0.1972033828496933, "learning_rate": 3.18256723824139e-06, "loss": 0.1383, "step": 22410 }, { "epoch": 2.8231256101017097, "grad_norm": 0.21138480305671692, "learning_rate": 3.16008347320017e-06, "loss": 0.1442, "step": 22415 }, { "epoch": 2.8237553925118872, "grad_norm": 0.22747448086738586, "learning_rate": 3.1376785645483016e-06, "loss": 0.1485, "step": 22420 }, { "epoch": 2.8243851749220643, "grad_norm": 0.23757314682006836, "learning_rate": 3.11535252431776e-06, "loss": 0.1568, "step": 22425 }, { "epoch": 2.825014957332242, "grad_norm": 0.2193070352077484, "learning_rate": 3.0931053644980885e-06, "loss": 0.1605, "step": 22430 }, { "epoch": 2.825644739742419, "grad_norm": 0.2223901003599167, "learning_rate": 3.0709370970365464e-06, "loss": 0.1453, "step": 22435 }, { "epoch": 2.8262745221525964, "grad_norm": 0.23655427992343903, "learning_rate": 3.0488477338379944e-06, "loss": 0.1484, "step": 22440 }, { "epoch": 2.8269043045627735, "grad_norm": 0.20859979093074799, "learning_rate": 3.026837286764944e-06, "loss": 0.154, "step": 22445 }, { "epoch": 2.8275340869729506, "grad_norm": 0.1994808316230774, "learning_rate": 3.004905767637472e-06, "loss": 0.1634, "step": 22450 }, { "epoch": 2.828163869383128, "grad_norm": 0.19530266523361206, "learning_rate": 2.983053188233342e-06, "loss": 0.1458, "step": 22455 }, { "epoch": 2.8287936517933057, "grad_norm": 0.19528019428253174, "learning_rate": 2.9612795602878827e-06, "loss": 0.1472, "step": 22460 }, { "epoch": 2.8294234342034827, "grad_norm": 0.20543955266475677, "learning_rate": 2.939584895494007e-06, "loss": 0.1544, "step": 22465 }, { "epoch": 2.83005321661366, "grad_norm": 0.18907050788402557, "learning_rate": 2.917969205502263e-06, "loss": 0.1469, "step": 22470 }, { "epoch": 2.8306829990238374, "grad_norm": 0.2009141594171524, "learning_rate": 2.896432501920748e-06, "loss": 0.1463, "step": 22475 }, { "epoch": 2.8313127814340144, "grad_norm": 0.1845710128545761, "learning_rate": 2.8749747963151937e-06, "loss": 0.1523, "step": 22480 }, { "epoch": 2.831942563844192, "grad_norm": 0.22671662271022797, "learning_rate": 2.853596100208866e-06, "loss": 0.1553, "step": 22485 }, { "epoch": 2.832572346254369, "grad_norm": 0.1716582477092743, "learning_rate": 2.832296425082614e-06, "loss": 0.1423, "step": 22490 }, { "epoch": 2.8332021286645466, "grad_norm": 0.17477920651435852, "learning_rate": 2.8110757823748554e-06, "loss": 0.142, "step": 22495 }, { "epoch": 2.8338319110747237, "grad_norm": 0.22391197085380554, "learning_rate": 2.7899341834815236e-06, "loss": 0.1576, "step": 22500 }, { "epoch": 2.8344616934849007, "grad_norm": 0.19235247373580933, "learning_rate": 2.7688716397561874e-06, "loss": 0.1432, "step": 22505 }, { "epoch": 2.8350914758950783, "grad_norm": 0.21828468143939972, "learning_rate": 2.747888162509898e-06, "loss": 0.1461, "step": 22510 }, { "epoch": 2.835721258305256, "grad_norm": 0.19712364673614502, "learning_rate": 2.726983763011259e-06, "loss": 0.1461, "step": 22515 }, { "epoch": 2.836351040715433, "grad_norm": 0.22868654131889343, "learning_rate": 2.7061584524864066e-06, "loss": 0.1546, "step": 22520 }, { "epoch": 2.83698082312561, "grad_norm": 0.18876421451568604, "learning_rate": 2.685412242119012e-06, "loss": 0.1481, "step": 22525 }, { "epoch": 2.8376106055357875, "grad_norm": 0.21973784267902374, "learning_rate": 2.664745143050295e-06, "loss": 0.1568, "step": 22530 }, { "epoch": 2.8382403879459646, "grad_norm": 0.24478502571582794, "learning_rate": 2.6441571663788963e-06, "loss": 0.1558, "step": 22535 }, { "epoch": 2.838870170356142, "grad_norm": 0.18952693045139313, "learning_rate": 2.6236483231610707e-06, "loss": 0.1461, "step": 22540 }, { "epoch": 2.839499952766319, "grad_norm": 0.20026876032352448, "learning_rate": 2.603218624410525e-06, "loss": 0.1466, "step": 22545 }, { "epoch": 2.8401297351764967, "grad_norm": 0.21935871243476868, "learning_rate": 2.5828680810984824e-06, "loss": 0.1563, "step": 22550 }, { "epoch": 2.8407595175866738, "grad_norm": 0.22446821630001068, "learning_rate": 2.5625967041536354e-06, "loss": 0.1462, "step": 22555 }, { "epoch": 2.841389299996851, "grad_norm": 0.2072252631187439, "learning_rate": 2.5424045044621922e-06, "loss": 0.1505, "step": 22560 }, { "epoch": 2.8420190824070284, "grad_norm": 0.19828562438488007, "learning_rate": 2.5222914928678285e-06, "loss": 0.1462, "step": 22565 }, { "epoch": 2.842648864817206, "grad_norm": 0.18411174416542053, "learning_rate": 2.502257680171671e-06, "loss": 0.1415, "step": 22570 }, { "epoch": 2.843278647227383, "grad_norm": 0.20017574727535248, "learning_rate": 2.482303077132347e-06, "loss": 0.1556, "step": 22575 }, { "epoch": 2.84390842963756, "grad_norm": 0.1881314069032669, "learning_rate": 2.462427694465935e-06, "loss": 0.1464, "step": 22580 }, { "epoch": 2.8445382120477376, "grad_norm": 0.2211647629737854, "learning_rate": 2.4426315428459466e-06, "loss": 0.1471, "step": 22585 }, { "epoch": 2.8451679944579147, "grad_norm": 0.20288364589214325, "learning_rate": 2.4229146329033944e-06, "loss": 0.146, "step": 22590 }, { "epoch": 2.845797776868092, "grad_norm": 0.22115926444530487, "learning_rate": 2.4032769752267087e-06, "loss": 0.1422, "step": 22595 }, { "epoch": 2.8464275592782693, "grad_norm": 0.196670264005661, "learning_rate": 2.3837185803617544e-06, "loss": 0.153, "step": 22600 }, { "epoch": 2.847057341688447, "grad_norm": 0.23514890670776367, "learning_rate": 2.3642394588118285e-06, "loss": 0.1573, "step": 22605 }, { "epoch": 2.847687124098624, "grad_norm": 0.1987423151731491, "learning_rate": 2.3448396210376807e-06, "loss": 0.1457, "step": 22610 }, { "epoch": 2.848316906508801, "grad_norm": 0.18859946727752686, "learning_rate": 2.3255190774574605e-06, "loss": 0.1533, "step": 22615 }, { "epoch": 2.8489466889189785, "grad_norm": 0.21700045466423035, "learning_rate": 2.306277838446735e-06, "loss": 0.1416, "step": 22620 }, { "epoch": 2.849576471329156, "grad_norm": 0.17610225081443787, "learning_rate": 2.2871159143384723e-06, "loss": 0.1498, "step": 22625 }, { "epoch": 2.850206253739333, "grad_norm": 0.2066749781370163, "learning_rate": 2.26803331542309e-06, "loss": 0.1587, "step": 22630 }, { "epoch": 2.85083603614951, "grad_norm": 0.19877871870994568, "learning_rate": 2.2490300519484082e-06, "loss": 0.1526, "step": 22635 }, { "epoch": 2.8514658185596877, "grad_norm": 0.19332483410835266, "learning_rate": 2.230106134119547e-06, "loss": 0.1562, "step": 22640 }, { "epoch": 2.852095600969865, "grad_norm": 0.21806974709033966, "learning_rate": 2.21126157209911e-06, "loss": 0.1508, "step": 22645 }, { "epoch": 2.8527253833800423, "grad_norm": 0.20896165072917938, "learning_rate": 2.192496376007069e-06, "loss": 0.1612, "step": 22650 }, { "epoch": 2.8533551657902194, "grad_norm": 0.2381521761417389, "learning_rate": 2.1738105559207465e-06, "loss": 0.1545, "step": 22655 }, { "epoch": 2.853984948200397, "grad_norm": 0.24022352695465088, "learning_rate": 2.155204121874882e-06, "loss": 0.1548, "step": 22660 }, { "epoch": 2.854614730610574, "grad_norm": 0.20042377710342407, "learning_rate": 2.1366770838615322e-06, "loss": 0.1423, "step": 22665 }, { "epoch": 2.855244513020751, "grad_norm": 0.1943242996931076, "learning_rate": 2.118229451830139e-06, "loss": 0.1453, "step": 22670 }, { "epoch": 2.8558742954309286, "grad_norm": 0.20173771679401398, "learning_rate": 2.0998612356874944e-06, "loss": 0.1406, "step": 22675 }, { "epoch": 2.856504077841106, "grad_norm": 0.21339194476604462, "learning_rate": 2.081572445297791e-06, "loss": 0.1447, "step": 22680 }, { "epoch": 2.8571338602512832, "grad_norm": 0.18814577162265778, "learning_rate": 2.0633630904824727e-06, "loss": 0.144, "step": 22685 }, { "epoch": 2.8577636426614603, "grad_norm": 0.1956281065940857, "learning_rate": 2.045233181020417e-06, "loss": 0.1503, "step": 22690 }, { "epoch": 2.858393425071638, "grad_norm": 0.22954149544239044, "learning_rate": 2.027182726647786e-06, "loss": 0.1491, "step": 22695 }, { "epoch": 2.859023207481815, "grad_norm": 0.18004447221755981, "learning_rate": 2.009211737058092e-06, "loss": 0.1492, "step": 22700 }, { "epoch": 2.8596529898919925, "grad_norm": 0.226220041513443, "learning_rate": 1.991320221902165e-06, "loss": 0.159, "step": 22705 }, { "epoch": 2.8602827723021695, "grad_norm": 0.1808856725692749, "learning_rate": 1.9735081907881367e-06, "loss": 0.1473, "step": 22710 }, { "epoch": 2.860912554712347, "grad_norm": 0.2538818418979645, "learning_rate": 1.9557756532815216e-06, "loss": 0.1523, "step": 22715 }, { "epoch": 2.861542337122524, "grad_norm": 0.18744130432605743, "learning_rate": 1.9381226189050524e-06, "loss": 0.1501, "step": 22720 }, { "epoch": 2.8621721195327012, "grad_norm": 0.2162604182958603, "learning_rate": 1.920549097138813e-06, "loss": 0.1505, "step": 22725 }, { "epoch": 2.8628019019428788, "grad_norm": 0.2076927125453949, "learning_rate": 1.9030550974202197e-06, "loss": 0.1493, "step": 22730 }, { "epoch": 2.8634316843530563, "grad_norm": 0.2153797596693039, "learning_rate": 1.885640629143942e-06, "loss": 0.1409, "step": 22735 }, { "epoch": 2.8640614667632334, "grad_norm": 0.19790925085544586, "learning_rate": 1.868305701661932e-06, "loss": 0.1596, "step": 22740 }, { "epoch": 2.8646912491734104, "grad_norm": 0.18141327798366547, "learning_rate": 1.8510503242834263e-06, "loss": 0.1459, "step": 22745 }, { "epoch": 2.865321031583588, "grad_norm": 0.20295578241348267, "learning_rate": 1.833874506274996e-06, "loss": 0.1485, "step": 22750 }, { "epoch": 2.865950813993765, "grad_norm": 0.20226307213306427, "learning_rate": 1.8167782568604127e-06, "loss": 0.1507, "step": 22755 }, { "epoch": 2.8665805964039426, "grad_norm": 0.19584356248378754, "learning_rate": 1.7997615852207825e-06, "loss": 0.1526, "step": 22760 }, { "epoch": 2.8672103788141197, "grad_norm": 0.17093075811862946, "learning_rate": 1.7828245004944286e-06, "loss": 0.1481, "step": 22765 }, { "epoch": 2.867840161224297, "grad_norm": 0.17845821380615234, "learning_rate": 1.7659670117769587e-06, "loss": 0.1459, "step": 22770 }, { "epoch": 2.8684699436344743, "grad_norm": 0.1874646097421646, "learning_rate": 1.749189128121231e-06, "loss": 0.1493, "step": 22775 }, { "epoch": 2.8690997260446514, "grad_norm": 0.18263909220695496, "learning_rate": 1.7324908585373387e-06, "loss": 0.1438, "step": 22780 }, { "epoch": 2.869729508454829, "grad_norm": 0.1789528727531433, "learning_rate": 1.7158722119926583e-06, "loss": 0.1476, "step": 22785 }, { "epoch": 2.8703592908650064, "grad_norm": 0.21683086454868317, "learning_rate": 1.6993331974117508e-06, "loss": 0.1499, "step": 22790 }, { "epoch": 2.8709890732751835, "grad_norm": 0.24182718992233276, "learning_rate": 1.6828738236764617e-06, "loss": 0.1541, "step": 22795 }, { "epoch": 2.8716188556853606, "grad_norm": 0.21868962049484253, "learning_rate": 1.6664940996258702e-06, "loss": 0.1471, "step": 22800 }, { "epoch": 2.872248638095538, "grad_norm": 0.1993272453546524, "learning_rate": 1.6501940340562236e-06, "loss": 0.1526, "step": 22805 }, { "epoch": 2.872878420505715, "grad_norm": 0.201304093003273, "learning_rate": 1.6339736357210697e-06, "loss": 0.1516, "step": 22810 }, { "epoch": 2.8735082029158927, "grad_norm": 0.25056761503219604, "learning_rate": 1.6178329133310908e-06, "loss": 0.151, "step": 22815 }, { "epoch": 2.87413798532607, "grad_norm": 0.19561152160167694, "learning_rate": 1.6017718755542696e-06, "loss": 0.143, "step": 22820 }, { "epoch": 2.8747677677362473, "grad_norm": 0.22097674012184143, "learning_rate": 1.5857905310157071e-06, "loss": 0.1512, "step": 22825 }, { "epoch": 2.8753975501464244, "grad_norm": 0.21212686598300934, "learning_rate": 1.5698888882977712e-06, "loss": 0.1541, "step": 22830 }, { "epoch": 2.8760273325566015, "grad_norm": 0.20324502885341644, "learning_rate": 1.5540669559399977e-06, "loss": 0.1533, "step": 22835 }, { "epoch": 2.876657114966779, "grad_norm": 0.169882133603096, "learning_rate": 1.5383247424391564e-06, "loss": 0.1406, "step": 22840 }, { "epoch": 2.8772868973769565, "grad_norm": 0.23402316868305206, "learning_rate": 1.5226622562491352e-06, "loss": 0.1569, "step": 22845 }, { "epoch": 2.8779166797871336, "grad_norm": 0.17247354984283447, "learning_rate": 1.5070795057810559e-06, "loss": 0.1432, "step": 22850 }, { "epoch": 2.8785464621973107, "grad_norm": 0.21733173727989197, "learning_rate": 1.4915764994032409e-06, "loss": 0.1523, "step": 22855 }, { "epoch": 2.879176244607488, "grad_norm": 0.19319911301136017, "learning_rate": 1.4761532454411306e-06, "loss": 0.1391, "step": 22860 }, { "epoch": 2.8798060270176653, "grad_norm": 0.19645391404628754, "learning_rate": 1.4608097521773664e-06, "loss": 0.1499, "step": 22865 }, { "epoch": 2.880435809427843, "grad_norm": 0.18772046267986298, "learning_rate": 1.4455460278517572e-06, "loss": 0.1483, "step": 22870 }, { "epoch": 2.88106559183802, "grad_norm": 0.22282320261001587, "learning_rate": 1.4303620806612792e-06, "loss": 0.1468, "step": 22875 }, { "epoch": 2.8816953742481974, "grad_norm": 0.19287440180778503, "learning_rate": 1.4152579187600599e-06, "loss": 0.1495, "step": 22880 }, { "epoch": 2.8823251566583745, "grad_norm": 0.1981481909751892, "learning_rate": 1.400233550259361e-06, "loss": 0.1506, "step": 22885 }, { "epoch": 2.8829549390685516, "grad_norm": 0.21331623196601868, "learning_rate": 1.385288983227628e-06, "loss": 0.1483, "step": 22890 }, { "epoch": 2.883584721478729, "grad_norm": 0.20138582587242126, "learning_rate": 1.3704242256904252e-06, "loss": 0.1504, "step": 22895 }, { "epoch": 2.8842145038889067, "grad_norm": 0.18493309617042542, "learning_rate": 1.3556392856304831e-06, "loss": 0.1539, "step": 22900 }, { "epoch": 2.8848442862990837, "grad_norm": 0.22465452551841736, "learning_rate": 1.3409341709876343e-06, "loss": 0.1511, "step": 22905 }, { "epoch": 2.885474068709261, "grad_norm": 0.19788892567157745, "learning_rate": 1.326308889658878e-06, "loss": 0.1504, "step": 22910 }, { "epoch": 2.8861038511194383, "grad_norm": 0.20821528136730194, "learning_rate": 1.3117634494982986e-06, "loss": 0.1511, "step": 22915 }, { "epoch": 2.8867336335296154, "grad_norm": 0.24520978331565857, "learning_rate": 1.2972978583171644e-06, "loss": 0.1503, "step": 22920 }, { "epoch": 2.887363415939793, "grad_norm": 0.18331633508205414, "learning_rate": 1.2829121238837947e-06, "loss": 0.1485, "step": 22925 }, { "epoch": 2.88799319834997, "grad_norm": 0.23501911759376526, "learning_rate": 1.2686062539236762e-06, "loss": 0.1452, "step": 22930 }, { "epoch": 2.8886229807601476, "grad_norm": 0.19778122007846832, "learning_rate": 1.2543802561193806e-06, "loss": 0.1523, "step": 22935 }, { "epoch": 2.8892527631703246, "grad_norm": 0.21170009672641754, "learning_rate": 1.2402341381105962e-06, "loss": 0.1627, "step": 22940 }, { "epoch": 2.8898825455805017, "grad_norm": 0.2205863893032074, "learning_rate": 1.22616790749413e-06, "loss": 0.1507, "step": 22945 }, { "epoch": 2.8905123279906793, "grad_norm": 0.1905989944934845, "learning_rate": 1.2121815718238393e-06, "loss": 0.15, "step": 22950 }, { "epoch": 2.891142110400857, "grad_norm": 0.19767914712429047, "learning_rate": 1.1982751386107159e-06, "loss": 0.1499, "step": 22955 }, { "epoch": 2.891771892811034, "grad_norm": 0.23298701643943787, "learning_rate": 1.1844486153228361e-06, "loss": 0.1507, "step": 22960 }, { "epoch": 2.892401675221211, "grad_norm": 0.17696991562843323, "learning_rate": 1.1707020093853602e-06, "loss": 0.1458, "step": 22965 }, { "epoch": 2.8930314576313885, "grad_norm": 0.22472181916236877, "learning_rate": 1.1570353281805334e-06, "loss": 0.154, "step": 22970 }, { "epoch": 2.8936612400415656, "grad_norm": 0.18597114086151123, "learning_rate": 1.1434485790476512e-06, "loss": 0.1531, "step": 22975 }, { "epoch": 2.894291022451743, "grad_norm": 0.20187944173812866, "learning_rate": 1.1299417692831436e-06, "loss": 0.1558, "step": 22980 }, { "epoch": 2.89492080486192, "grad_norm": 0.19914616644382477, "learning_rate": 1.1165149061404422e-06, "loss": 0.1418, "step": 22985 }, { "epoch": 2.8955505872720977, "grad_norm": 0.2554416060447693, "learning_rate": 1.1031679968301122e-06, "loss": 0.1556, "step": 22990 }, { "epoch": 2.8961803696822748, "grad_norm": 0.2221318930387497, "learning_rate": 1.08990104851972e-06, "loss": 0.1523, "step": 22995 }, { "epoch": 2.896810152092452, "grad_norm": 0.2058124542236328, "learning_rate": 1.0767140683339336e-06, "loss": 0.1403, "step": 23000 }, { "epoch": 2.896810152092452, "eval_loss": 0.36229029297828674, "eval_runtime": 6.1614, "eval_samples_per_second": 162.302, "eval_steps_per_second": 10.225, "step": 23000 }, { "epoch": 2.8974399345026294, "grad_norm": 0.2248660773038864, "learning_rate": 1.0636070633544547e-06, "loss": 0.1514, "step": 23005 }, { "epoch": 2.8980697169128065, "grad_norm": 0.1835104525089264, "learning_rate": 1.0505800406200526e-06, "loss": 0.1405, "step": 23010 }, { "epoch": 2.898699499322984, "grad_norm": 0.22200733423233032, "learning_rate": 1.0376330071265482e-06, "loss": 0.1503, "step": 23015 }, { "epoch": 2.899329281733161, "grad_norm": 0.18144001066684723, "learning_rate": 1.024765969826763e-06, "loss": 0.1516, "step": 23020 }, { "epoch": 2.8999590641433386, "grad_norm": 0.17609558999538422, "learning_rate": 1.0119789356306196e-06, "loss": 0.1517, "step": 23025 }, { "epoch": 2.9005888465535157, "grad_norm": 0.2412068098783493, "learning_rate": 9.99271911405025e-07, "loss": 0.157, "step": 23030 }, { "epoch": 2.901218628963693, "grad_norm": 0.19257797300815582, "learning_rate": 9.866449039739544e-07, "loss": 0.1486, "step": 23035 }, { "epoch": 2.9018484113738703, "grad_norm": 0.22018341720104218, "learning_rate": 9.74097920118383e-07, "loss": 0.1517, "step": 23040 }, { "epoch": 2.902478193784048, "grad_norm": 0.19569897651672363, "learning_rate": 9.616309665763544e-07, "loss": 0.144, "step": 23045 }, { "epoch": 2.903107976194225, "grad_norm": 0.20541365444660187, "learning_rate": 9.492440500428966e-07, "loss": 0.1535, "step": 23050 }, { "epoch": 2.903737758604402, "grad_norm": 0.1934703141450882, "learning_rate": 9.369371771700552e-07, "loss": 0.1369, "step": 23055 }, { "epoch": 2.9043675410145795, "grad_norm": 0.16967949271202087, "learning_rate": 9.247103545669266e-07, "loss": 0.1447, "step": 23060 }, { "epoch": 2.9049973234247566, "grad_norm": 0.15972602367401123, "learning_rate": 9.125635887995586e-07, "loss": 0.1484, "step": 23065 }, { "epoch": 2.905627105834934, "grad_norm": 0.21369343996047974, "learning_rate": 9.004968863910667e-07, "loss": 0.153, "step": 23070 }, { "epoch": 2.906256888245111, "grad_norm": 0.18001650273799896, "learning_rate": 8.885102538215338e-07, "loss": 0.1499, "step": 23075 }, { "epoch": 2.9068866706552887, "grad_norm": 0.20539362728595734, "learning_rate": 8.766036975280777e-07, "loss": 0.1495, "step": 23080 }, { "epoch": 2.907516453065466, "grad_norm": 0.2048049122095108, "learning_rate": 8.647772239047667e-07, "loss": 0.1477, "step": 23085 }, { "epoch": 2.9081462354756433, "grad_norm": 0.2018105536699295, "learning_rate": 8.530308393027041e-07, "loss": 0.1457, "step": 23090 }, { "epoch": 2.9087760178858204, "grad_norm": 0.21647921204566956, "learning_rate": 8.413645500299437e-07, "loss": 0.1511, "step": 23095 }, { "epoch": 2.909405800295998, "grad_norm": 0.19362643361091614, "learning_rate": 8.297783623515741e-07, "loss": 0.1491, "step": 23100 }, { "epoch": 2.910035582706175, "grad_norm": 0.18819986283779144, "learning_rate": 8.182722824896182e-07, "loss": 0.1482, "step": 23105 }, { "epoch": 2.910665365116352, "grad_norm": 0.18341930210590363, "learning_rate": 8.068463166231332e-07, "loss": 0.1468, "step": 23110 }, { "epoch": 2.9112951475265296, "grad_norm": 0.17555510997772217, "learning_rate": 7.955004708881107e-07, "loss": 0.1538, "step": 23115 }, { "epoch": 2.9119249299367067, "grad_norm": 0.1862919181585312, "learning_rate": 7.842347513775271e-07, "loss": 0.1485, "step": 23120 }, { "epoch": 2.9125547123468842, "grad_norm": 0.23576001822948456, "learning_rate": 7.730491641413262e-07, "loss": 0.15, "step": 23125 }, { "epoch": 2.9131844947570613, "grad_norm": 0.20176522433757782, "learning_rate": 7.619437151864194e-07, "loss": 0.1528, "step": 23130 }, { "epoch": 2.913814277167239, "grad_norm": 0.1988651603460312, "learning_rate": 7.50918410476703e-07, "loss": 0.1513, "step": 23135 }, { "epoch": 2.914444059577416, "grad_norm": 0.19462084770202637, "learning_rate": 7.399732559330074e-07, "loss": 0.1471, "step": 23140 }, { "epoch": 2.9150738419875935, "grad_norm": 0.2095441371202469, "learning_rate": 7.291082574331309e-07, "loss": 0.1488, "step": 23145 }, { "epoch": 2.9157036243977705, "grad_norm": 0.19712376594543457, "learning_rate": 7.18323420811856e-07, "loss": 0.146, "step": 23150 }, { "epoch": 2.916333406807948, "grad_norm": 0.20228298008441925, "learning_rate": 7.076187518608168e-07, "loss": 0.1408, "step": 23155 }, { "epoch": 2.916963189218125, "grad_norm": 0.21097783744335175, "learning_rate": 6.969942563287311e-07, "loss": 0.1463, "step": 23160 }, { "epoch": 2.9175929716283022, "grad_norm": 0.1911788433790207, "learning_rate": 6.864499399211687e-07, "loss": 0.1359, "step": 23165 }, { "epoch": 2.9182227540384797, "grad_norm": 0.19333137571811676, "learning_rate": 6.759858083006831e-07, "loss": 0.1521, "step": 23170 }, { "epoch": 2.918852536448657, "grad_norm": 0.20187996327877045, "learning_rate": 6.656018670867125e-07, "loss": 0.1391, "step": 23175 }, { "epoch": 2.9194823188588344, "grad_norm": 0.26705697178840637, "learning_rate": 6.55298121855713e-07, "loss": 0.1531, "step": 23180 }, { "epoch": 2.9201121012690114, "grad_norm": 0.2374356985092163, "learning_rate": 6.450745781410249e-07, "loss": 0.1604, "step": 23185 }, { "epoch": 2.920741883679189, "grad_norm": 0.18227587640285492, "learning_rate": 6.349312414329067e-07, "loss": 0.1486, "step": 23190 }, { "epoch": 2.921371666089366, "grad_norm": 0.22778551280498505, "learning_rate": 6.248681171785675e-07, "loss": 0.1475, "step": 23195 }, { "epoch": 2.922001448499543, "grad_norm": 0.2099718153476715, "learning_rate": 6.148852107821511e-07, "loss": 0.1442, "step": 23200 }, { "epoch": 2.9226312309097207, "grad_norm": 0.19987498223781586, "learning_rate": 6.04982527604686e-07, "loss": 0.1504, "step": 23205 }, { "epoch": 2.923261013319898, "grad_norm": 0.1993655115365982, "learning_rate": 5.951600729641515e-07, "loss": 0.154, "step": 23210 }, { "epoch": 2.9238907957300753, "grad_norm": 0.19234336912631989, "learning_rate": 5.854178521354113e-07, "loss": 0.1531, "step": 23215 }, { "epoch": 2.9245205781402523, "grad_norm": 0.20532859861850739, "learning_rate": 5.757558703502973e-07, "loss": 0.1522, "step": 23220 }, { "epoch": 2.92515036055043, "grad_norm": 0.23982007801532745, "learning_rate": 5.661741327974755e-07, "loss": 0.15, "step": 23225 }, { "epoch": 2.925780142960607, "grad_norm": 0.18457266688346863, "learning_rate": 5.5667264462258e-07, "loss": 0.1457, "step": 23230 }, { "epoch": 2.9264099253707845, "grad_norm": 0.1986248642206192, "learning_rate": 5.472514109281123e-07, "loss": 0.145, "step": 23235 }, { "epoch": 2.9270397077809616, "grad_norm": 0.2265506535768509, "learning_rate": 5.379104367735087e-07, "loss": 0.147, "step": 23240 }, { "epoch": 2.927669490191139, "grad_norm": 0.21421676874160767, "learning_rate": 5.286497271750733e-07, "loss": 0.1492, "step": 23245 }, { "epoch": 2.928299272601316, "grad_norm": 0.1608610451221466, "learning_rate": 5.19469287106028e-07, "loss": 0.1339, "step": 23250 }, { "epoch": 2.9289290550114933, "grad_norm": 0.18140849471092224, "learning_rate": 5.103691214964789e-07, "loss": 0.1469, "step": 23255 }, { "epoch": 2.929558837421671, "grad_norm": 0.21651338040828705, "learning_rate": 5.013492352334003e-07, "loss": 0.1596, "step": 23260 }, { "epoch": 2.9301886198318483, "grad_norm": 0.18771570920944214, "learning_rate": 4.924096331607008e-07, "loss": 0.1482, "step": 23265 }, { "epoch": 2.9308184022420254, "grad_norm": 0.2401566356420517, "learning_rate": 4.835503200791402e-07, "loss": 0.1557, "step": 23270 }, { "epoch": 2.9314481846522025, "grad_norm": 0.22064423561096191, "learning_rate": 4.747713007463627e-07, "loss": 0.1621, "step": 23275 }, { "epoch": 2.93207796706238, "grad_norm": 0.18799829483032227, "learning_rate": 4.660725798769305e-07, "loss": 0.1517, "step": 23280 }, { "epoch": 2.932707749472557, "grad_norm": 0.24584966897964478, "learning_rate": 4.574541621422401e-07, "loss": 0.1563, "step": 23285 }, { "epoch": 2.9333375318827346, "grad_norm": 0.18013089895248413, "learning_rate": 4.489160521705726e-07, "loss": 0.1388, "step": 23290 }, { "epoch": 2.9339673142929117, "grad_norm": 0.20351989567279816, "learning_rate": 4.404582545470936e-07, "loss": 0.1451, "step": 23295 }, { "epoch": 2.934597096703089, "grad_norm": 0.17512726783752441, "learning_rate": 4.3208077381383655e-07, "loss": 0.1413, "step": 23300 }, { "epoch": 2.9352268791132663, "grad_norm": 0.2058653086423874, "learning_rate": 4.2378361446970267e-07, "loss": 0.1578, "step": 23305 }, { "epoch": 2.9358566615234434, "grad_norm": 0.23548051714897156, "learning_rate": 4.155667809704444e-07, "loss": 0.154, "step": 23310 }, { "epoch": 2.936486443933621, "grad_norm": 0.19606271386146545, "learning_rate": 4.074302777286986e-07, "loss": 0.1523, "step": 23315 }, { "epoch": 2.9371162263437984, "grad_norm": 0.19439321756362915, "learning_rate": 3.993741091139369e-07, "loss": 0.1447, "step": 23320 }, { "epoch": 2.9377460087539755, "grad_norm": 0.20347769558429718, "learning_rate": 3.9139827945253167e-07, "loss": 0.1466, "step": 23325 }, { "epoch": 2.9383757911641526, "grad_norm": 0.23724155128002167, "learning_rate": 3.835027930276735e-07, "loss": 0.1597, "step": 23330 }, { "epoch": 2.93900557357433, "grad_norm": 0.1813487559556961, "learning_rate": 3.7568765407940406e-07, "loss": 0.1358, "step": 23335 }, { "epoch": 2.939635355984507, "grad_norm": 0.21012306213378906, "learning_rate": 3.679528668046494e-07, "loss": 0.1508, "step": 23340 }, { "epoch": 2.9402651383946847, "grad_norm": 0.21550029516220093, "learning_rate": 3.602984353571703e-07, "loss": 0.1449, "step": 23345 }, { "epoch": 2.940894920804862, "grad_norm": 0.203638955950737, "learning_rate": 3.5272436384756186e-07, "loss": 0.1491, "step": 23350 }, { "epoch": 2.9415247032150393, "grad_norm": 0.21173645555973053, "learning_rate": 3.452306563432872e-07, "loss": 0.1553, "step": 23355 }, { "epoch": 2.9421544856252164, "grad_norm": 0.22810731828212738, "learning_rate": 3.3781731686861047e-07, "loss": 0.1606, "step": 23360 }, { "epoch": 2.9427842680353935, "grad_norm": 0.19044247269630432, "learning_rate": 3.3048434940469713e-07, "loss": 0.1421, "step": 23365 }, { "epoch": 2.943414050445571, "grad_norm": 0.21632073819637299, "learning_rate": 3.232317578894805e-07, "loss": 0.1473, "step": 23370 }, { "epoch": 2.9440438328557486, "grad_norm": 0.17703349888324738, "learning_rate": 3.160595462178117e-07, "loss": 0.1503, "step": 23375 }, { "epoch": 2.9446736152659256, "grad_norm": 0.19983936846256256, "learning_rate": 3.089677182412931e-07, "loss": 0.1427, "step": 23380 }, { "epoch": 2.9453033976761027, "grad_norm": 0.18913906812667847, "learning_rate": 3.019562777684115e-07, "loss": 0.1443, "step": 23385 }, { "epoch": 2.9459331800862802, "grad_norm": 0.2024787813425064, "learning_rate": 2.950252285644883e-07, "loss": 0.1501, "step": 23390 }, { "epoch": 2.9465629624964573, "grad_norm": 0.22307011485099792, "learning_rate": 2.8817457435164614e-07, "loss": 0.1526, "step": 23395 }, { "epoch": 2.947192744906635, "grad_norm": 0.23350244760513306, "learning_rate": 2.814043188088255e-07, "loss": 0.1583, "step": 23400 }, { "epoch": 2.947822527316812, "grad_norm": 0.18705366551876068, "learning_rate": 2.7471446557181807e-07, "loss": 0.1515, "step": 23405 }, { "epoch": 2.9484523097269895, "grad_norm": 0.18902996182441711, "learning_rate": 2.681050182332334e-07, "loss": 0.1489, "step": 23410 }, { "epoch": 2.9490820921371665, "grad_norm": 0.18764075636863708, "learning_rate": 2.6157598034249885e-07, "loss": 0.1519, "step": 23415 }, { "epoch": 2.9497118745473436, "grad_norm": 0.20529431104660034, "learning_rate": 2.5512735540584305e-07, "loss": 0.1504, "step": 23420 }, { "epoch": 2.950341656957521, "grad_norm": 0.21828597784042358, "learning_rate": 2.487591468863293e-07, "loss": 0.147, "step": 23425 }, { "epoch": 2.9509714393676987, "grad_norm": 0.2224801480770111, "learning_rate": 2.424713582038551e-07, "loss": 0.1542, "step": 23430 }, { "epoch": 2.9516012217778758, "grad_norm": 0.1950775682926178, "learning_rate": 2.3626399273506957e-07, "loss": 0.1462, "step": 23435 }, { "epoch": 2.952231004188053, "grad_norm": 0.17124883830547333, "learning_rate": 2.3013705381348946e-07, "loss": 0.1374, "step": 23440 }, { "epoch": 2.9528607865982304, "grad_norm": 0.20156201720237732, "learning_rate": 2.2409054472941613e-07, "loss": 0.1471, "step": 23445 }, { "epoch": 2.9534905690084075, "grad_norm": 0.19449764490127563, "learning_rate": 2.1812446872995214e-07, "loss": 0.158, "step": 23450 }, { "epoch": 2.954120351418585, "grad_norm": 0.18049843609333038, "learning_rate": 2.1223882901905132e-07, "loss": 0.1518, "step": 23455 }, { "epoch": 2.954750133828762, "grad_norm": 0.19583791494369507, "learning_rate": 2.06433628757402e-07, "loss": 0.1556, "step": 23460 }, { "epoch": 2.9553799162389396, "grad_norm": 0.1842242181301117, "learning_rate": 2.0070887106254373e-07, "loss": 0.1494, "step": 23465 }, { "epoch": 2.9560096986491167, "grad_norm": 0.22290416061878204, "learning_rate": 1.950645590088007e-07, "loss": 0.1454, "step": 23470 }, { "epoch": 2.9566394810592938, "grad_norm": 0.18742886185646057, "learning_rate": 1.895006956272982e-07, "loss": 0.1432, "step": 23475 }, { "epoch": 2.9572692634694713, "grad_norm": 0.181674525141716, "learning_rate": 1.8401728390594617e-07, "loss": 0.1534, "step": 23480 }, { "epoch": 2.957899045879649, "grad_norm": 0.2029273808002472, "learning_rate": 1.786143267894724e-07, "loss": 0.1515, "step": 23485 }, { "epoch": 2.958528828289826, "grad_norm": 0.23291803896427155, "learning_rate": 1.7329182717940594e-07, "loss": 0.144, "step": 23490 }, { "epoch": 2.959158610700003, "grad_norm": 0.24005140364170074, "learning_rate": 1.6804978793401036e-07, "loss": 0.1508, "step": 23495 }, { "epoch": 2.9597883931101805, "grad_norm": 0.18763796985149384, "learning_rate": 1.6288821186841716e-07, "loss": 0.1477, "step": 23500 }, { "epoch": 2.9604181755203576, "grad_norm": 0.18949641287326813, "learning_rate": 1.578071017544924e-07, "loss": 0.1501, "step": 23505 }, { "epoch": 2.961047957930535, "grad_norm": 0.21106213331222534, "learning_rate": 1.5280646032092003e-07, "loss": 0.152, "step": 23510 }, { "epoch": 2.961677740340712, "grad_norm": 0.231742724776268, "learning_rate": 1.4788629025313526e-07, "loss": 0.1546, "step": 23515 }, { "epoch": 2.9623075227508897, "grad_norm": 0.21462422609329224, "learning_rate": 1.430465941934078e-07, "loss": 0.149, "step": 23520 }, { "epoch": 2.962937305161067, "grad_norm": 0.18480440974235535, "learning_rate": 1.382873747407587e-07, "loss": 0.1486, "step": 23525 }, { "epoch": 2.963567087571244, "grad_norm": 0.18864907324314117, "learning_rate": 1.3360863445097682e-07, "loss": 0.1556, "step": 23530 }, { "epoch": 2.9641968699814214, "grad_norm": 0.17919088900089264, "learning_rate": 1.2901037583668562e-07, "loss": 0.1435, "step": 23535 }, { "epoch": 2.964826652391599, "grad_norm": 0.22338031232357025, "learning_rate": 1.2449260136722649e-07, "loss": 0.1538, "step": 23540 }, { "epoch": 2.965456434801776, "grad_norm": 0.23020599782466888, "learning_rate": 1.200553134687754e-07, "loss": 0.1559, "step": 23545 }, { "epoch": 2.966086217211953, "grad_norm": 0.20723643898963928, "learning_rate": 1.1569851452422618e-07, "loss": 0.1415, "step": 23550 }, { "epoch": 2.9667159996221306, "grad_norm": 0.2118474692106247, "learning_rate": 1.1142220687330727e-07, "loss": 0.1567, "step": 23555 }, { "epoch": 2.9673457820323077, "grad_norm": 0.20279090106487274, "learning_rate": 1.0722639281246503e-07, "loss": 0.145, "step": 23560 }, { "epoch": 2.9679755644424852, "grad_norm": 0.21842657029628754, "learning_rate": 1.0311107459498035e-07, "loss": 0.1557, "step": 23565 }, { "epoch": 2.9686053468526623, "grad_norm": 0.2776351571083069, "learning_rate": 9.90762544308521e-08, "loss": 0.1638, "step": 23570 }, { "epoch": 2.96923512926284, "grad_norm": 0.19698885083198547, "learning_rate": 9.512193448686367e-08, "loss": 0.1457, "step": 23575 }, { "epoch": 2.969864911673017, "grad_norm": 0.1835564374923706, "learning_rate": 9.124811688659972e-08, "loss": 0.1569, "step": 23580 }, { "epoch": 2.970494694083194, "grad_norm": 0.20081757009029388, "learning_rate": 8.745480371036284e-08, "loss": 0.1451, "step": 23585 }, { "epoch": 2.9711244764933715, "grad_norm": 0.2095508873462677, "learning_rate": 8.37419969952735e-08, "loss": 0.1461, "step": 23590 }, { "epoch": 2.971754258903549, "grad_norm": 0.24606873095035553, "learning_rate": 8.010969873517015e-08, "loss": 0.1555, "step": 23595 }, { "epoch": 2.972384041313726, "grad_norm": 0.2295389175415039, "learning_rate": 7.65579108806924e-08, "loss": 0.1534, "step": 23600 }, { "epoch": 2.973013823723903, "grad_norm": 0.217758446931839, "learning_rate": 7.308663533924786e-08, "loss": 0.1647, "step": 23605 }, { "epoch": 2.9736436061340807, "grad_norm": 0.21592317521572113, "learning_rate": 6.969587397496201e-08, "loss": 0.1482, "step": 23610 }, { "epoch": 2.974273388544258, "grad_norm": 0.2219853401184082, "learning_rate": 6.638562860876162e-08, "loss": 0.151, "step": 23615 }, { "epoch": 2.9749031709544353, "grad_norm": 0.21385011076927185, "learning_rate": 6.315590101832468e-08, "loss": 0.1477, "step": 23620 }, { "epoch": 2.9755329533646124, "grad_norm": 0.18287068605422974, "learning_rate": 6.000669293808048e-08, "loss": 0.1478, "step": 23625 }, { "epoch": 2.97616273577479, "grad_norm": 0.23458221554756165, "learning_rate": 5.693800605924281e-08, "loss": 0.1521, "step": 23630 }, { "epoch": 2.976792518184967, "grad_norm": 0.2393937110900879, "learning_rate": 5.394984202976016e-08, "loss": 0.1443, "step": 23635 }, { "epoch": 2.977422300595144, "grad_norm": 0.2523866891860962, "learning_rate": 5.104220245434887e-08, "loss": 0.16, "step": 23640 }, { "epoch": 2.9780520830053216, "grad_norm": 0.20749832689762115, "learning_rate": 4.821508889445991e-08, "loss": 0.1493, "step": 23645 }, { "epoch": 2.978681865415499, "grad_norm": 0.20586150884628296, "learning_rate": 4.546850286834547e-08, "loss": 0.1425, "step": 23650 }, { "epoch": 2.9793116478256763, "grad_norm": 0.20742247998714447, "learning_rate": 4.2802445850959046e-08, "loss": 0.1556, "step": 23655 }, { "epoch": 2.9799414302358533, "grad_norm": 0.178171768784523, "learning_rate": 4.0216919274038696e-08, "loss": 0.1532, "step": 23660 }, { "epoch": 2.980571212646031, "grad_norm": 0.24389080703258514, "learning_rate": 3.771192452607374e-08, "loss": 0.1551, "step": 23665 }, { "epoch": 2.981200995056208, "grad_norm": 0.18905188143253326, "learning_rate": 3.528746295232143e-08, "loss": 0.144, "step": 23670 }, { "epoch": 2.9818307774663855, "grad_norm": 0.258633017539978, "learning_rate": 3.2943535854756956e-08, "loss": 0.1624, "step": 23675 }, { "epoch": 2.9824605598765626, "grad_norm": 0.19491221010684967, "learning_rate": 3.0680144492123416e-08, "loss": 0.1429, "step": 23680 }, { "epoch": 2.98309034228674, "grad_norm": 0.19454774260520935, "learning_rate": 2.8497290079898537e-08, "loss": 0.1439, "step": 23685 }, { "epoch": 2.983720124696917, "grad_norm": 0.21658724546432495, "learning_rate": 2.6394973790361262e-08, "loss": 0.1397, "step": 23690 }, { "epoch": 2.9843499071070942, "grad_norm": 0.18729209899902344, "learning_rate": 2.4373196752475177e-08, "loss": 0.1494, "step": 23695 }, { "epoch": 2.9849796895172718, "grad_norm": 0.19640378654003143, "learning_rate": 2.243196005198844e-08, "loss": 0.1497, "step": 23700 }, { "epoch": 2.9856094719274493, "grad_norm": 0.20427659153938293, "learning_rate": 2.0571264731383817e-08, "loss": 0.1533, "step": 23705 }, { "epoch": 2.9862392543376264, "grad_norm": 0.217566579580307, "learning_rate": 1.8791111789911995e-08, "loss": 0.1539, "step": 23710 }, { "epoch": 2.9868690367478035, "grad_norm": 0.2069326490163803, "learning_rate": 1.7091502183541606e-08, "loss": 0.1481, "step": 23715 }, { "epoch": 2.987498819157981, "grad_norm": 0.17276856303215027, "learning_rate": 1.5472436825009205e-08, "loss": 0.1457, "step": 23720 }, { "epoch": 2.988128601568158, "grad_norm": 0.21325160562992096, "learning_rate": 1.3933916583785954e-08, "loss": 0.1453, "step": 23725 }, { "epoch": 2.9887583839783356, "grad_norm": 0.2071818858385086, "learning_rate": 1.2475942286094275e-08, "loss": 0.1522, "step": 23730 }, { "epoch": 2.9893881663885127, "grad_norm": 0.19544194638729095, "learning_rate": 1.1098514714891205e-08, "loss": 0.1427, "step": 23735 }, { "epoch": 2.99001794879869, "grad_norm": 0.2268587052822113, "learning_rate": 9.801634609901688e-09, "loss": 0.1574, "step": 23740 }, { "epoch": 2.9906477312088673, "grad_norm": 0.1800483763217926, "learning_rate": 8.585302667585281e-09, "loss": 0.1491, "step": 23745 }, { "epoch": 2.9912775136190444, "grad_norm": 0.1946074366569519, "learning_rate": 7.449519541119498e-09, "loss": 0.1504, "step": 23750 }, { "epoch": 2.991907296029222, "grad_norm": 0.19125299155712128, "learning_rate": 6.394285840449764e-09, "loss": 0.148, "step": 23755 }, { "epoch": 2.9925370784393994, "grad_norm": 0.18596796691417694, "learning_rate": 5.419602132272771e-09, "loss": 0.1418, "step": 23760 }, { "epoch": 2.9931668608495765, "grad_norm": 0.20286522805690765, "learning_rate": 4.525468940003163e-09, "loss": 0.1425, "step": 23765 }, { "epoch": 2.9937966432597536, "grad_norm": 0.1934228241443634, "learning_rate": 3.7118867438068465e-09, "loss": 0.1456, "step": 23770 }, { "epoch": 2.994426425669931, "grad_norm": 0.19864603877067566, "learning_rate": 2.9788559806176447e-09, "loss": 0.141, "step": 23775 }, { "epoch": 2.995056208080108, "grad_norm": 0.1812632828950882, "learning_rate": 2.326377044070682e-09, "loss": 0.1414, "step": 23780 }, { "epoch": 2.9956859904902857, "grad_norm": 0.18837498128414154, "learning_rate": 1.7544502845856512e-09, "loss": 0.1466, "step": 23785 }, { "epoch": 2.996315772900463, "grad_norm": 0.19479897618293762, "learning_rate": 1.2630760092668946e-09, "loss": 0.1434, "step": 23790 }, { "epoch": 2.9969455553106403, "grad_norm": 0.2550322413444519, "learning_rate": 8.522544820199761e-10, "loss": 0.1527, "step": 23795 }, { "epoch": 2.9975753377208174, "grad_norm": 0.22663426399230957, "learning_rate": 5.21985923451762e-10, "loss": 0.154, "step": 23800 }, { "epoch": 2.9982051201309945, "grad_norm": 0.19043132662773132, "learning_rate": 2.7227051092038043e-10, "loss": 0.1413, "step": 23805 }, { "epoch": 2.998834902541172, "grad_norm": 0.17332880198955536, "learning_rate": 1.0310837855187492e-10, "loss": 0.137, "step": 23810 }, { "epoch": 2.9994646849513495, "grad_norm": 0.2147851288318634, "learning_rate": 1.4499617156937815e-11, "loss": 0.1501, "step": 23815 } ], "logging_steps": 5, "max_steps": 23817, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.0258751160588435e+19, "train_batch_size": 32, "trial_name": null, "trial_params": null }