{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9998897828722584, "eval_steps": 500, "global_step": 4536, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0011021712774165104, "grad_norm": 59.11821880494146, "learning_rate": 3.303964757709251e-06, "loss": 3.825, "step": 5 }, { "epoch": 0.002204342554833021, "grad_norm": 42.17582597601867, "learning_rate": 6.607929515418502e-06, "loss": 3.6223, "step": 10 }, { "epoch": 0.0033065138322495315, "grad_norm": 18.347454692288814, "learning_rate": 9.911894273127752e-06, "loss": 2.8332, "step": 15 }, { "epoch": 0.004408685109666042, "grad_norm": 7.929104229119195, "learning_rate": 1.3215859030837005e-05, "loss": 2.0196, "step": 20 }, { "epoch": 0.005510856387082552, "grad_norm": 3.3860390518924492, "learning_rate": 1.6519823788546254e-05, "loss": 1.6062, "step": 25 }, { "epoch": 0.006613027664499063, "grad_norm": 1.5094502156188854, "learning_rate": 1.9823788546255504e-05, "loss": 1.3491, "step": 30 }, { "epoch": 0.007715198941915574, "grad_norm": 0.8243795908744608, "learning_rate": 2.3127753303964757e-05, "loss": 1.1719, "step": 35 }, { "epoch": 0.008817370219332083, "grad_norm": 0.47607081050989597, "learning_rate": 2.643171806167401e-05, "loss": 1.1158, "step": 40 }, { "epoch": 0.009919541496748594, "grad_norm": 0.34776556828145655, "learning_rate": 2.9735682819383256e-05, "loss": 1.0865, "step": 45 }, { "epoch": 0.011021712774165105, "grad_norm": 0.34304699002193145, "learning_rate": 3.303964757709251e-05, "loss": 1.0369, "step": 50 }, { "epoch": 0.012123884051581615, "grad_norm": 0.2531395849305058, "learning_rate": 3.634361233480176e-05, "loss": 1.0383, "step": 55 }, { "epoch": 0.013226055328998126, "grad_norm": 0.21165915652546113, "learning_rate": 3.964757709251101e-05, "loss": 1.021, "step": 60 }, { "epoch": 0.014328226606414637, "grad_norm": 0.1944258505202001, "learning_rate": 4.295154185022026e-05, "loss": 0.9806, "step": 65 }, { "epoch": 0.015430397883831147, "grad_norm": 0.2225104161521765, "learning_rate": 4.625550660792951e-05, "loss": 0.9731, "step": 70 }, { "epoch": 0.016532569161247658, "grad_norm": 0.18722320005454055, "learning_rate": 4.9559471365638766e-05, "loss": 0.9756, "step": 75 }, { "epoch": 0.017634740438664167, "grad_norm": 0.18788262680578474, "learning_rate": 5.286343612334802e-05, "loss": 0.9827, "step": 80 }, { "epoch": 0.01873691171608068, "grad_norm": 0.15447512609368866, "learning_rate": 5.6167400881057265e-05, "loss": 0.9681, "step": 85 }, { "epoch": 0.019839082993497188, "grad_norm": 0.16941375745095524, "learning_rate": 5.947136563876651e-05, "loss": 0.9765, "step": 90 }, { "epoch": 0.0209412542709137, "grad_norm": 0.14612889382972916, "learning_rate": 6.277533039647576e-05, "loss": 0.9359, "step": 95 }, { "epoch": 0.02204342554833021, "grad_norm": 0.1405739682413469, "learning_rate": 6.607929515418502e-05, "loss": 0.9543, "step": 100 }, { "epoch": 0.023145596825746722, "grad_norm": 0.130605536696256, "learning_rate": 6.938325991189426e-05, "loss": 0.9136, "step": 105 }, { "epoch": 0.02424776810316323, "grad_norm": 0.11300041363364274, "learning_rate": 7.268722466960352e-05, "loss": 0.9478, "step": 110 }, { "epoch": 0.025349939380579743, "grad_norm": 0.09643276514367859, "learning_rate": 7.599118942731278e-05, "loss": 0.9125, "step": 115 }, { "epoch": 0.026452110657996252, "grad_norm": 0.10011972487944946, "learning_rate": 7.929515418502201e-05, "loss": 0.9309, "step": 120 }, { "epoch": 0.027554281935412765, "grad_norm": 0.08587719618641608, "learning_rate": 8.259911894273126e-05, "loss": 0.9023, "step": 125 }, { "epoch": 0.028656453212829273, "grad_norm": 0.08726681655122204, "learning_rate": 8.590308370044052e-05, "loss": 0.9058, "step": 130 }, { "epoch": 0.029758624490245786, "grad_norm": 0.09342883329751345, "learning_rate": 8.920704845814977e-05, "loss": 0.9054, "step": 135 }, { "epoch": 0.030860795767662295, "grad_norm": 0.08738078317709104, "learning_rate": 9.251101321585903e-05, "loss": 0.8833, "step": 140 }, { "epoch": 0.031962967045078804, "grad_norm": 0.07421598879157193, "learning_rate": 9.581497797356827e-05, "loss": 0.9187, "step": 145 }, { "epoch": 0.033065138322495316, "grad_norm": 0.05948715305053877, "learning_rate": 9.911894273127753e-05, "loss": 0.8747, "step": 150 }, { "epoch": 0.03416730959991183, "grad_norm": 0.07139242065324014, "learning_rate": 0.00010242290748898678, "loss": 0.8821, "step": 155 }, { "epoch": 0.035269480877328334, "grad_norm": 0.0688597307361162, "learning_rate": 0.00010572687224669604, "loss": 0.8818, "step": 160 }, { "epoch": 0.036371652154744846, "grad_norm": 0.06917882261732754, "learning_rate": 0.00010903083700440527, "loss": 0.9223, "step": 165 }, { "epoch": 0.03747382343216136, "grad_norm": 0.06477308185348316, "learning_rate": 0.00011233480176211453, "loss": 0.8979, "step": 170 }, { "epoch": 0.03857599470957787, "grad_norm": 0.07148480842885613, "learning_rate": 0.00011563876651982378, "loss": 0.8864, "step": 175 }, { "epoch": 0.039678165986994376, "grad_norm": 0.06963902310697093, "learning_rate": 0.00011894273127753302, "loss": 0.8924, "step": 180 }, { "epoch": 0.04078033726441089, "grad_norm": 0.06681188993008794, "learning_rate": 0.00012224669603524228, "loss": 0.8853, "step": 185 }, { "epoch": 0.0418825085418274, "grad_norm": 0.07882523551134729, "learning_rate": 0.00012555066079295151, "loss": 0.8752, "step": 190 }, { "epoch": 0.042984679819243914, "grad_norm": 0.07046808160734085, "learning_rate": 0.00012885462555066077, "loss": 0.9005, "step": 195 }, { "epoch": 0.04408685109666042, "grad_norm": 0.08831018054166795, "learning_rate": 0.00013215859030837003, "loss": 0.8779, "step": 200 }, { "epoch": 0.04518902237407693, "grad_norm": 0.06786610627531549, "learning_rate": 0.0001354625550660793, "loss": 0.8865, "step": 205 }, { "epoch": 0.046291193651493444, "grad_norm": 0.06898944984160912, "learning_rate": 0.00013876651982378853, "loss": 0.8951, "step": 210 }, { "epoch": 0.04739336492890995, "grad_norm": 0.07193213519196924, "learning_rate": 0.00014207048458149779, "loss": 0.91, "step": 215 }, { "epoch": 0.04849553620632646, "grad_norm": 0.06470248286974109, "learning_rate": 0.00014537444933920705, "loss": 0.8713, "step": 220 }, { "epoch": 0.049597707483742974, "grad_norm": 0.07558920025422085, "learning_rate": 0.0001486784140969163, "loss": 0.9003, "step": 225 }, { "epoch": 0.050699878761159486, "grad_norm": 0.08225650056399321, "learning_rate": 0.00015198237885462556, "loss": 0.8744, "step": 230 }, { "epoch": 0.05180205003857599, "grad_norm": 0.08830347321776405, "learning_rate": 0.0001552863436123348, "loss": 0.8543, "step": 235 }, { "epoch": 0.052904221315992504, "grad_norm": 0.06812983519818898, "learning_rate": 0.00015859030837004403, "loss": 0.9006, "step": 240 }, { "epoch": 0.05400639259340902, "grad_norm": 0.08404581821873025, "learning_rate": 0.0001618942731277533, "loss": 0.8915, "step": 245 }, { "epoch": 0.05510856387082553, "grad_norm": 0.07050034877556227, "learning_rate": 0.00016519823788546252, "loss": 0.8737, "step": 250 }, { "epoch": 0.056210735148242034, "grad_norm": 0.060716071571555404, "learning_rate": 0.0001685022026431718, "loss": 0.8551, "step": 255 }, { "epoch": 0.05731290642565855, "grad_norm": 0.06881772285936742, "learning_rate": 0.00017180616740088104, "loss": 0.8895, "step": 260 }, { "epoch": 0.05841507770307506, "grad_norm": 0.0616288543822739, "learning_rate": 0.0001751101321585903, "loss": 0.8761, "step": 265 }, { "epoch": 0.05951724898049157, "grad_norm": 0.06164986983886088, "learning_rate": 0.00017841409691629953, "loss": 0.8948, "step": 270 }, { "epoch": 0.06061942025790808, "grad_norm": 0.08556452067304546, "learning_rate": 0.00018171806167400882, "loss": 0.8965, "step": 275 }, { "epoch": 0.06172159153532459, "grad_norm": 0.06708897999556158, "learning_rate": 0.00018502202643171805, "loss": 0.8717, "step": 280 }, { "epoch": 0.0628237628127411, "grad_norm": 0.06626854890594584, "learning_rate": 0.00018832599118942728, "loss": 0.8967, "step": 285 }, { "epoch": 0.06392593409015761, "grad_norm": 0.06798823718932381, "learning_rate": 0.00019162995594713654, "loss": 0.879, "step": 290 }, { "epoch": 0.06502810536757413, "grad_norm": 0.07021508305766244, "learning_rate": 0.0001949339207048458, "loss": 0.8761, "step": 295 }, { "epoch": 0.06613027664499063, "grad_norm": 0.06347368995094158, "learning_rate": 0.00019823788546255506, "loss": 0.853, "step": 300 }, { "epoch": 0.06723244792240714, "grad_norm": 0.07136142919506049, "learning_rate": 0.0002015418502202643, "loss": 0.8771, "step": 305 }, { "epoch": 0.06833461919982366, "grad_norm": 0.06846198864454035, "learning_rate": 0.00020484581497797356, "loss": 0.8903, "step": 310 }, { "epoch": 0.06943679047724016, "grad_norm": 0.06137623690084353, "learning_rate": 0.0002081497797356828, "loss": 0.853, "step": 315 }, { "epoch": 0.07053896175465667, "grad_norm": 0.07068069693448537, "learning_rate": 0.00021145374449339208, "loss": 0.8868, "step": 320 }, { "epoch": 0.07164113303207319, "grad_norm": 0.0633263499263589, "learning_rate": 0.0002147577092511013, "loss": 0.8847, "step": 325 }, { "epoch": 0.07274330430948969, "grad_norm": 0.06653681784940939, "learning_rate": 0.00021806167400881054, "loss": 0.8725, "step": 330 }, { "epoch": 0.07384547558690621, "grad_norm": 0.06583964059153263, "learning_rate": 0.0002213656387665198, "loss": 0.878, "step": 335 }, { "epoch": 0.07494764686432272, "grad_norm": 0.08061748319197447, "learning_rate": 0.00022466960352422906, "loss": 0.8984, "step": 340 }, { "epoch": 0.07604981814173922, "grad_norm": 0.0731004519013094, "learning_rate": 0.00022797356828193832, "loss": 0.8629, "step": 345 }, { "epoch": 0.07715198941915574, "grad_norm": 0.06044943906403856, "learning_rate": 0.00023127753303964755, "loss": 0.8783, "step": 350 }, { "epoch": 0.07825416069657225, "grad_norm": 0.06920900396817772, "learning_rate": 0.0002345814977973568, "loss": 0.8882, "step": 355 }, { "epoch": 0.07935633197398875, "grad_norm": 0.06396348587422171, "learning_rate": 0.00023788546255506604, "loss": 0.8557, "step": 360 }, { "epoch": 0.08045850325140527, "grad_norm": 0.06844620445410649, "learning_rate": 0.00024118942731277533, "loss": 0.8973, "step": 365 }, { "epoch": 0.08156067452882178, "grad_norm": 0.06653654136399571, "learning_rate": 0.00024449339207048456, "loss": 0.8916, "step": 370 }, { "epoch": 0.08266284580623828, "grad_norm": 0.06216710353519921, "learning_rate": 0.0002477973568281938, "loss": 0.8499, "step": 375 }, { "epoch": 0.0837650170836548, "grad_norm": 0.05880885627082627, "learning_rate": 0.00025110132158590303, "loss": 0.9042, "step": 380 }, { "epoch": 0.08486718836107131, "grad_norm": 0.06754842514493627, "learning_rate": 0.0002544052863436123, "loss": 0.8641, "step": 385 }, { "epoch": 0.08596935963848783, "grad_norm": 0.06140819068848091, "learning_rate": 0.00025770925110132155, "loss": 0.8943, "step": 390 }, { "epoch": 0.08707153091590433, "grad_norm": 0.061754148705009615, "learning_rate": 0.00026101321585903083, "loss": 0.8718, "step": 395 }, { "epoch": 0.08817370219332084, "grad_norm": 0.06597271175452663, "learning_rate": 0.00026431718061674007, "loss": 0.8791, "step": 400 }, { "epoch": 0.08927587347073736, "grad_norm": 0.058815224093820354, "learning_rate": 0.00026762114537444935, "loss": 0.8822, "step": 405 }, { "epoch": 0.09037804474815386, "grad_norm": 0.06335803406426932, "learning_rate": 0.0002709251101321586, "loss": 0.8555, "step": 410 }, { "epoch": 0.09148021602557037, "grad_norm": 0.061177086710910725, "learning_rate": 0.0002742290748898678, "loss": 0.8999, "step": 415 }, { "epoch": 0.09258238730298689, "grad_norm": 0.0640511083565074, "learning_rate": 0.00027753303964757705, "loss": 0.8907, "step": 420 }, { "epoch": 0.0936845585804034, "grad_norm": 0.06233966383032991, "learning_rate": 0.0002808370044052863, "loss": 0.8794, "step": 425 }, { "epoch": 0.0947867298578199, "grad_norm": 0.057022002708166875, "learning_rate": 0.00028414096916299557, "loss": 0.9005, "step": 430 }, { "epoch": 0.09588890113523642, "grad_norm": 0.0617487231342341, "learning_rate": 0.0002874449339207048, "loss": 0.8574, "step": 435 }, { "epoch": 0.09699107241265292, "grad_norm": 0.07666912755096209, "learning_rate": 0.0002907488986784141, "loss": 0.89, "step": 440 }, { "epoch": 0.09809324369006944, "grad_norm": 0.06908006068504674, "learning_rate": 0.0002940528634361233, "loss": 0.9058, "step": 445 }, { "epoch": 0.09919541496748595, "grad_norm": 0.06466123609412561, "learning_rate": 0.0002973568281938326, "loss": 0.8852, "step": 450 }, { "epoch": 0.10029758624490245, "grad_norm": 0.05620325485859758, "learning_rate": 0.0002999999555762735, "loss": 0.882, "step": 455 }, { "epoch": 0.10139975752231897, "grad_norm": 0.05608554381774001, "learning_rate": 0.0002999984007486092, "loss": 0.8602, "step": 460 }, { "epoch": 0.10250192879973548, "grad_norm": 0.06300983111668536, "learning_rate": 0.0002999946247609333, "loss": 0.8939, "step": 465 }, { "epoch": 0.10360410007715198, "grad_norm": 0.05359836181219753, "learning_rate": 0.00029998862766916014, "loss": 0.8719, "step": 470 }, { "epoch": 0.1047062713545685, "grad_norm": 0.05972412283076983, "learning_rate": 0.0002999804095620941, "loss": 0.8567, "step": 475 }, { "epoch": 0.10580844263198501, "grad_norm": 0.055736449030028, "learning_rate": 0.00029996997056142786, "loss": 0.8928, "step": 480 }, { "epoch": 0.10691061390940153, "grad_norm": 0.05535452244152051, "learning_rate": 0.0002999573108217412, "loss": 0.8815, "step": 485 }, { "epoch": 0.10801278518681803, "grad_norm": 0.05571951712028587, "learning_rate": 0.00029994243053049795, "loss": 0.8273, "step": 490 }, { "epoch": 0.10911495646423454, "grad_norm": 0.0572367057417111, "learning_rate": 0.000299925329908044, "loss": 0.8891, "step": 495 }, { "epoch": 0.11021712774165106, "grad_norm": 0.05206161611783487, "learning_rate": 0.00029990600920760355, "loss": 0.8467, "step": 500 }, { "epoch": 0.11131929901906756, "grad_norm": 0.057618220952806956, "learning_rate": 0.0002998844687152753, "loss": 0.84, "step": 505 }, { "epoch": 0.11242147029648407, "grad_norm": 0.0731502466891883, "learning_rate": 0.0002998607087500286, "loss": 0.8899, "step": 510 }, { "epoch": 0.11352364157390059, "grad_norm": 0.06531011144403108, "learning_rate": 0.00029983472966369835, "loss": 0.8805, "step": 515 }, { "epoch": 0.1146258128513171, "grad_norm": 0.05719733051934535, "learning_rate": 0.0002998065318409801, "loss": 0.8998, "step": 520 }, { "epoch": 0.1157279841287336, "grad_norm": 0.04786336091320019, "learning_rate": 0.0002997761156994242, "loss": 0.8454, "step": 525 }, { "epoch": 0.11683015540615012, "grad_norm": 0.061209707796165434, "learning_rate": 0.00029974348168942944, "loss": 0.894, "step": 530 }, { "epoch": 0.11793232668356662, "grad_norm": 0.05567071955981122, "learning_rate": 0.0002997086302942368, "loss": 0.8791, "step": 535 }, { "epoch": 0.11903449796098314, "grad_norm": 0.051378774159055826, "learning_rate": 0.00029967156202992184, "loss": 0.8908, "step": 540 }, { "epoch": 0.12013666923839965, "grad_norm": 0.06416620508611666, "learning_rate": 0.0002996322774453875, "loss": 0.912, "step": 545 }, { "epoch": 0.12123884051581615, "grad_norm": 0.049933851892099514, "learning_rate": 0.0002995907771223556, "loss": 0.8819, "step": 550 }, { "epoch": 0.12234101179323267, "grad_norm": 0.052571228924698295, "learning_rate": 0.00029954706167535834, "loss": 0.8926, "step": 555 }, { "epoch": 0.12344318307064918, "grad_norm": 0.05946512503425042, "learning_rate": 0.0002995011317517294, "loss": 0.8767, "step": 560 }, { "epoch": 0.12454535434806568, "grad_norm": 0.05379133265550323, "learning_rate": 0.0002994529880315941, "loss": 0.8541, "step": 565 }, { "epoch": 0.1256475256254822, "grad_norm": 0.04996259410414281, "learning_rate": 0.00029940263122785936, "loss": 0.8975, "step": 570 }, { "epoch": 0.12674969690289872, "grad_norm": 0.06293201636932731, "learning_rate": 0.0002993500620862033, "loss": 0.8538, "step": 575 }, { "epoch": 0.12785186818031521, "grad_norm": 0.0549834121731086, "learning_rate": 0.000299295281385064, "loss": 0.8766, "step": 580 }, { "epoch": 0.12895403945773173, "grad_norm": 0.06031562586347478, "learning_rate": 0.00029923828993562814, "loss": 0.8519, "step": 585 }, { "epoch": 0.13005621073514825, "grad_norm": 0.05792440007594611, "learning_rate": 0.00029917908858181897, "loss": 0.8295, "step": 590 }, { "epoch": 0.13115838201256474, "grad_norm": 0.22881705102396294, "learning_rate": 0.00029911767820028364, "loss": 0.8934, "step": 595 }, { "epoch": 0.13226055328998126, "grad_norm": 0.2022359302630765, "learning_rate": 0.0002990540597003804, "loss": 0.9332, "step": 600 }, { "epoch": 0.13336272456739778, "grad_norm": 0.07299327465806, "learning_rate": 0.0002989882340241651, "loss": 0.8848, "step": 605 }, { "epoch": 0.13446489584481428, "grad_norm": 0.06761831265802426, "learning_rate": 0.0002989202021463772, "loss": 0.8613, "step": 610 }, { "epoch": 0.1355670671222308, "grad_norm": 0.08020708226646049, "learning_rate": 0.0002988499650744254, "loss": 0.8961, "step": 615 }, { "epoch": 0.1366692383996473, "grad_norm": 0.08560660775941882, "learning_rate": 0.0002987775238483725, "loss": 0.9122, "step": 620 }, { "epoch": 0.1377714096770638, "grad_norm": 0.05636244041478988, "learning_rate": 0.0002987028795409204, "loss": 0.8427, "step": 625 }, { "epoch": 0.13887358095448032, "grad_norm": 1.0715539508022556, "learning_rate": 0.0002986260332573939, "loss": 0.8535, "step": 630 }, { "epoch": 0.13997575223189684, "grad_norm": 0.07028650609498803, "learning_rate": 0.0002985469861357243, "loss": 0.8843, "step": 635 }, { "epoch": 0.14107792350931334, "grad_norm": 0.10912168092090672, "learning_rate": 0.0002984657393464329, "loss": 0.8802, "step": 640 }, { "epoch": 0.14218009478672985, "grad_norm": 0.07320669082767357, "learning_rate": 0.0002983822940926133, "loss": 0.8534, "step": 645 }, { "epoch": 0.14328226606414637, "grad_norm": 0.049879806960918614, "learning_rate": 0.0002982966516099137, "loss": 0.8661, "step": 650 }, { "epoch": 0.14438443734156287, "grad_norm": 0.05068348497813321, "learning_rate": 0.00029820881316651866, "loss": 0.881, "step": 655 }, { "epoch": 0.14548660861897939, "grad_norm": 0.05280972060027596, "learning_rate": 0.00029811878006313046, "loss": 0.8552, "step": 660 }, { "epoch": 0.1465887798963959, "grad_norm": 0.060948440074130784, "learning_rate": 0.00029802655363294934, "loss": 0.8694, "step": 665 }, { "epoch": 0.14769095117381242, "grad_norm": 0.051871602193463616, "learning_rate": 0.0002979321352416543, "loss": 0.8482, "step": 670 }, { "epoch": 0.14879312245122892, "grad_norm": 0.05140620390257059, "learning_rate": 0.0002978355262873826, "loss": 0.871, "step": 675 }, { "epoch": 0.14989529372864543, "grad_norm": 0.053923668745928, "learning_rate": 0.00029773672820070915, "loss": 0.8617, "step": 680 }, { "epoch": 0.15099746500606195, "grad_norm": 0.05491254789252112, "learning_rate": 0.0002976357424446253, "loss": 0.8688, "step": 685 }, { "epoch": 0.15209963628347845, "grad_norm": 0.057725313291247395, "learning_rate": 0.00029753257051451707, "loss": 0.8725, "step": 690 }, { "epoch": 0.15320180756089496, "grad_norm": 0.06175295050381468, "learning_rate": 0.0002974272139381433, "loss": 0.8721, "step": 695 }, { "epoch": 0.15430397883831148, "grad_norm": 0.05416095170725182, "learning_rate": 0.00029731967427561266, "loss": 0.8477, "step": 700 }, { "epoch": 0.15540615011572798, "grad_norm": 0.05008825843504415, "learning_rate": 0.00029720995311936077, "loss": 0.8539, "step": 705 }, { "epoch": 0.1565083213931445, "grad_norm": 0.048098359842914856, "learning_rate": 0.0002970980520941266, "loss": 0.8391, "step": 710 }, { "epoch": 0.15761049267056101, "grad_norm": 0.05792884649127905, "learning_rate": 0.00029698397285692833, "loss": 0.836, "step": 715 }, { "epoch": 0.1587126639479775, "grad_norm": 0.04553225276243662, "learning_rate": 0.000296867717097039, "loss": 0.8407, "step": 720 }, { "epoch": 0.15981483522539403, "grad_norm": 0.04857966778373228, "learning_rate": 0.0002967492865359611, "loss": 0.843, "step": 725 }, { "epoch": 0.16091700650281054, "grad_norm": 0.05464944380446163, "learning_rate": 0.00029662868292740165, "loss": 0.85, "step": 730 }, { "epoch": 0.16201917778022704, "grad_norm": 0.045834951820991766, "learning_rate": 0.00029650590805724574, "loss": 0.8661, "step": 735 }, { "epoch": 0.16312134905764356, "grad_norm": 0.053929904093643635, "learning_rate": 0.0002963809637435303, "loss": 0.9115, "step": 740 }, { "epoch": 0.16422352033506007, "grad_norm": 0.04871618742526512, "learning_rate": 0.00029625385183641706, "loss": 0.845, "step": 745 }, { "epoch": 0.16532569161247657, "grad_norm": 0.05247129804792461, "learning_rate": 0.00029612457421816546, "loss": 0.8772, "step": 750 }, { "epoch": 0.16642786288989309, "grad_norm": 0.04850518585643222, "learning_rate": 0.0002959931328031043, "loss": 0.8687, "step": 755 }, { "epoch": 0.1675300341673096, "grad_norm": 0.05475798664220526, "learning_rate": 0.00029585952953760386, "loss": 0.8666, "step": 760 }, { "epoch": 0.16863220544472612, "grad_norm": 0.04884060269417904, "learning_rate": 0.00029572376640004674, "loss": 0.8681, "step": 765 }, { "epoch": 0.16973437672214262, "grad_norm": 0.04948889650089674, "learning_rate": 0.00029558584540079864, "loss": 0.8822, "step": 770 }, { "epoch": 0.17083654799955914, "grad_norm": 0.044346461043723126, "learning_rate": 0.0002954457685821789, "loss": 0.8656, "step": 775 }, { "epoch": 0.17193871927697565, "grad_norm": 0.05856011647955299, "learning_rate": 0.0002953035380184296, "loss": 0.8487, "step": 780 }, { "epoch": 0.17304089055439215, "grad_norm": 0.05004499788895783, "learning_rate": 0.0002951591558156856, "loss": 0.8219, "step": 785 }, { "epoch": 0.17414306183180867, "grad_norm": 0.04373207037056602, "learning_rate": 0.0002950126241119429, "loss": 0.8712, "step": 790 }, { "epoch": 0.17524523310922518, "grad_norm": 0.04536155490811825, "learning_rate": 0.0002948639450770269, "loss": 0.8616, "step": 795 }, { "epoch": 0.17634740438664168, "grad_norm": 0.04345801787038758, "learning_rate": 0.0002947131209125607, "loss": 0.859, "step": 800 }, { "epoch": 0.1774495756640582, "grad_norm": 0.045860871445007584, "learning_rate": 0.0002945601538519321, "loss": 0.8497, "step": 805 }, { "epoch": 0.17855174694147471, "grad_norm": 0.04997067347593218, "learning_rate": 0.0002944050461602607, "loss": 0.8428, "step": 810 }, { "epoch": 0.1796539182188912, "grad_norm": 0.04961878609503156, "learning_rate": 0.00029424780013436434, "loss": 0.8582, "step": 815 }, { "epoch": 0.18075608949630773, "grad_norm": 0.047930042373250895, "learning_rate": 0.0002940884181027251, "loss": 0.8523, "step": 820 }, { "epoch": 0.18185826077372425, "grad_norm": 0.05532903520929939, "learning_rate": 0.0002939269024254547, "loss": 0.8544, "step": 825 }, { "epoch": 0.18296043205114074, "grad_norm": 0.051023957267831634, "learning_rate": 0.0002937632554942598, "loss": 0.8419, "step": 830 }, { "epoch": 0.18406260332855726, "grad_norm": 0.0453229581659907, "learning_rate": 0.0002935974797324064, "loss": 0.8335, "step": 835 }, { "epoch": 0.18516477460597378, "grad_norm": 0.04606389790907126, "learning_rate": 0.0002934295775946839, "loss": 0.8368, "step": 840 }, { "epoch": 0.18626694588339027, "grad_norm": 0.045473867263415464, "learning_rate": 0.00029325955156736885, "loss": 0.8304, "step": 845 }, { "epoch": 0.1873691171608068, "grad_norm": 0.044416090960029395, "learning_rate": 0.0002930874041681883, "loss": 0.8526, "step": 850 }, { "epoch": 0.1884712884382233, "grad_norm": 0.05975247770022884, "learning_rate": 0.0002929131379462821, "loss": 0.8442, "step": 855 }, { "epoch": 0.1895734597156398, "grad_norm": 0.048173050625599255, "learning_rate": 0.00029273675548216563, "loss": 0.8725, "step": 860 }, { "epoch": 0.19067563099305632, "grad_norm": 0.04762107623243284, "learning_rate": 0.0002925582593876912, "loss": 0.8666, "step": 865 }, { "epoch": 0.19177780227047284, "grad_norm": 0.05283512920588099, "learning_rate": 0.0002923776523060095, "loss": 0.8584, "step": 870 }, { "epoch": 0.19287997354788936, "grad_norm": 0.05051732507674948, "learning_rate": 0.0002921949369115307, "loss": 0.8685, "step": 875 }, { "epoch": 0.19398214482530585, "grad_norm": 0.04737129568146193, "learning_rate": 0.00029201011590988444, "loss": 0.8108, "step": 880 }, { "epoch": 0.19508431610272237, "grad_norm": 0.052972441854251665, "learning_rate": 0.00029182319203788, "loss": 0.8554, "step": 885 }, { "epoch": 0.19618648738013889, "grad_norm": 0.045056064701286494, "learning_rate": 0.0002916341680634657, "loss": 0.8271, "step": 890 }, { "epoch": 0.19728865865755538, "grad_norm": 0.04699409375298448, "learning_rate": 0.00029144304678568807, "loss": 0.836, "step": 895 }, { "epoch": 0.1983908299349719, "grad_norm": 0.048175359720427025, "learning_rate": 0.00029124983103465026, "loss": 0.8541, "step": 900 }, { "epoch": 0.19949300121238842, "grad_norm": 0.04727829823889592, "learning_rate": 0.00029105452367147, "loss": 0.8502, "step": 905 }, { "epoch": 0.2005951724898049, "grad_norm": 0.05148206426329491, "learning_rate": 0.0002908571275882376, "loss": 0.8453, "step": 910 }, { "epoch": 0.20169734376722143, "grad_norm": 0.0514916225658676, "learning_rate": 0.00029065764570797276, "loss": 0.8609, "step": 915 }, { "epoch": 0.20279951504463795, "grad_norm": 0.04883020506266138, "learning_rate": 0.0002904560809845814, "loss": 0.8461, "step": 920 }, { "epoch": 0.20390168632205444, "grad_norm": 0.04568888763813157, "learning_rate": 0.00029025243640281223, "loss": 0.8827, "step": 925 }, { "epoch": 0.20500385759947096, "grad_norm": 0.05356399559573866, "learning_rate": 0.0002900467149782118, "loss": 0.8606, "step": 930 }, { "epoch": 0.20610602887688748, "grad_norm": 0.04851687061223907, "learning_rate": 0.0002898389197570808, "loss": 0.8586, "step": 935 }, { "epoch": 0.20720820015430397, "grad_norm": 0.04669683817318346, "learning_rate": 0.00028962905381642827, "loss": 0.834, "step": 940 }, { "epoch": 0.2083103714317205, "grad_norm": 0.04842521203522015, "learning_rate": 0.0002894171202639262, "loss": 0.8352, "step": 945 }, { "epoch": 0.209412542709137, "grad_norm": 0.0440579152345845, "learning_rate": 0.0002892031222378635, "loss": 0.8324, "step": 950 }, { "epoch": 0.2105147139865535, "grad_norm": 0.04627079587508343, "learning_rate": 0.0002889870629070998, "loss": 0.8253, "step": 955 }, { "epoch": 0.21161688526397002, "grad_norm": 0.049721876458835836, "learning_rate": 0.0002887689454710182, "loss": 0.8322, "step": 960 }, { "epoch": 0.21271905654138654, "grad_norm": 0.04459805370567298, "learning_rate": 0.0002885487731594779, "loss": 0.8522, "step": 965 }, { "epoch": 0.21382122781880306, "grad_norm": 0.04335466084634585, "learning_rate": 0.0002883265492327666, "loss": 0.8385, "step": 970 }, { "epoch": 0.21492339909621955, "grad_norm": 0.04968671178179274, "learning_rate": 0.000288102276981552, "loss": 0.8293, "step": 975 }, { "epoch": 0.21602557037363607, "grad_norm": 0.04449544356929534, "learning_rate": 0.00028787595972683326, "loss": 0.8444, "step": 980 }, { "epoch": 0.21712774165105259, "grad_norm": 0.043972214438083426, "learning_rate": 0.0002876476008198917, "loss": 0.8337, "step": 985 }, { "epoch": 0.21822991292846908, "grad_norm": 0.04311951621843307, "learning_rate": 0.00028741720364224113, "loss": 0.851, "step": 990 }, { "epoch": 0.2193320842058856, "grad_norm": 0.045060530710752784, "learning_rate": 0.000287184771605578, "loss": 0.8404, "step": 995 }, { "epoch": 0.22043425548330212, "grad_norm": 0.045664652704444204, "learning_rate": 0.0002869503081517305, "loss": 0.8181, "step": 1000 }, { "epoch": 0.2215364267607186, "grad_norm": 0.048272419665718394, "learning_rate": 0.0002867138167526081, "loss": 0.851, "step": 1005 }, { "epoch": 0.22263859803813513, "grad_norm": 0.047040645763608745, "learning_rate": 0.0002864753009101497, "loss": 0.8187, "step": 1010 }, { "epoch": 0.22374076931555165, "grad_norm": 0.0428558339017769, "learning_rate": 0.00028623476415627185, "loss": 0.8425, "step": 1015 }, { "epoch": 0.22484294059296814, "grad_norm": 0.04197047231886558, "learning_rate": 0.0002859922100528168, "loss": 0.8565, "step": 1020 }, { "epoch": 0.22594511187038466, "grad_norm": 0.044195983390617165, "learning_rate": 0.0002857476421914993, "loss": 0.8265, "step": 1025 }, { "epoch": 0.22704728314780118, "grad_norm": 0.04490807738250268, "learning_rate": 0.0002855010641938536, "loss": 0.8273, "step": 1030 }, { "epoch": 0.22814945442521767, "grad_norm": 0.045467911300816795, "learning_rate": 0.00028525247971118, "loss": 0.8448, "step": 1035 }, { "epoch": 0.2292516257026342, "grad_norm": 0.06605089099746904, "learning_rate": 0.0002850018924244903, "loss": 0.8452, "step": 1040 }, { "epoch": 0.2303537969800507, "grad_norm": 0.04096505035492289, "learning_rate": 0.00028474930604445404, "loss": 0.8205, "step": 1045 }, { "epoch": 0.2314559682574672, "grad_norm": 0.04304934507293174, "learning_rate": 0.0002844947243113427, "loss": 0.8488, "step": 1050 }, { "epoch": 0.23255813953488372, "grad_norm": 0.04230208217283045, "learning_rate": 0.000284238150994975, "loss": 0.8376, "step": 1055 }, { "epoch": 0.23366031081230024, "grad_norm": 0.04708086600574921, "learning_rate": 0.00028397958989466064, "loss": 0.8231, "step": 1060 }, { "epoch": 0.23476248208971673, "grad_norm": 0.046838021251746416, "learning_rate": 0.00028371904483914437, "loss": 0.8284, "step": 1065 }, { "epoch": 0.23586465336713325, "grad_norm": 0.05160315046793239, "learning_rate": 0.00028345651968654897, "loss": 0.8489, "step": 1070 }, { "epoch": 0.23696682464454977, "grad_norm": 0.04831039086387138, "learning_rate": 0.0002831920183243184, "loss": 0.8611, "step": 1075 }, { "epoch": 0.2380689959219663, "grad_norm": 0.05287417954296867, "learning_rate": 0.00028292554466916004, "loss": 0.8323, "step": 1080 }, { "epoch": 0.23917116719938278, "grad_norm": 0.04693015235603077, "learning_rate": 0.00028265710266698685, "loss": 0.8632, "step": 1085 }, { "epoch": 0.2402733384767993, "grad_norm": 0.041988241282154073, "learning_rate": 0.00028238669629285885, "loss": 0.8068, "step": 1090 }, { "epoch": 0.24137550975421582, "grad_norm": 0.04810130529903119, "learning_rate": 0.0002821143295509241, "loss": 0.8193, "step": 1095 }, { "epoch": 0.2424776810316323, "grad_norm": 0.04870507456288239, "learning_rate": 0.0002818400064743599, "loss": 0.8726, "step": 1100 }, { "epoch": 0.24357985230904883, "grad_norm": 0.04918127091170888, "learning_rate": 0.00028156373112531234, "loss": 0.8501, "step": 1105 }, { "epoch": 0.24468202358646535, "grad_norm": 0.04501871928173725, "learning_rate": 0.0002812855075948369, "loss": 0.8623, "step": 1110 }, { "epoch": 0.24578419486388184, "grad_norm": 0.04700064721205868, "learning_rate": 0.00028100534000283727, "loss": 0.8334, "step": 1115 }, { "epoch": 0.24688636614129836, "grad_norm": 0.043956386695142506, "learning_rate": 0.0002807232324980048, "loss": 0.8729, "step": 1120 }, { "epoch": 0.24798853741871488, "grad_norm": 0.044369171455137094, "learning_rate": 0.00028043918925775666, "loss": 0.8198, "step": 1125 }, { "epoch": 0.24909070869613137, "grad_norm": 0.04924934475039299, "learning_rate": 0.00028015321448817435, "loss": 0.8425, "step": 1130 }, { "epoch": 0.2501928799735479, "grad_norm": 0.052400033371239746, "learning_rate": 0.0002798653124239411, "loss": 0.8627, "step": 1135 }, { "epoch": 0.2512950512509644, "grad_norm": 0.04354647857372681, "learning_rate": 0.0002795754873282794, "loss": 0.8052, "step": 1140 }, { "epoch": 0.2523972225283809, "grad_norm": 0.050485861813274184, "learning_rate": 0.0002792837434928878, "loss": 0.8437, "step": 1145 }, { "epoch": 0.25349939380579745, "grad_norm": 0.04827775784152042, "learning_rate": 0.00027899008523787726, "loss": 0.8595, "step": 1150 }, { "epoch": 0.2546015650832139, "grad_norm": 0.04298338268947314, "learning_rate": 0.0002786945169117073, "loss": 0.8306, "step": 1155 }, { "epoch": 0.25570373636063043, "grad_norm": 0.04907197256608099, "learning_rate": 0.0002783970428911216, "loss": 0.8305, "step": 1160 }, { "epoch": 0.25680590763804695, "grad_norm": 0.049467752786233936, "learning_rate": 0.000278097667581083, "loss": 0.848, "step": 1165 }, { "epoch": 0.25790807891546347, "grad_norm": 0.04656416214482384, "learning_rate": 0.0002777963954147087, "loss": 0.8165, "step": 1170 }, { "epoch": 0.25901025019288, "grad_norm": 0.051521287489935036, "learning_rate": 0.0002774932308532041, "loss": 0.8362, "step": 1175 }, { "epoch": 0.2601124214702965, "grad_norm": 0.053549824085333694, "learning_rate": 0.00027718817838579706, "loss": 0.8267, "step": 1180 }, { "epoch": 0.26121459274771297, "grad_norm": 0.054270101506079374, "learning_rate": 0.0002768812425296714, "loss": 0.8119, "step": 1185 }, { "epoch": 0.2623167640251295, "grad_norm": 0.05333532704116092, "learning_rate": 0.00027657242782989987, "loss": 0.8099, "step": 1190 }, { "epoch": 0.263418935302546, "grad_norm": 0.046490067153489543, "learning_rate": 0.00027626173885937703, "loss": 0.806, "step": 1195 }, { "epoch": 0.26452110657996253, "grad_norm": 0.04891723101243592, "learning_rate": 0.0002759491802187513, "loss": 0.8336, "step": 1200 }, { "epoch": 0.26562327785737905, "grad_norm": 0.052736333806997764, "learning_rate": 0.00027563475653635713, "loss": 0.8471, "step": 1205 }, { "epoch": 0.26672544913479557, "grad_norm": 0.04347575493484375, "learning_rate": 0.00027531847246814613, "loss": 0.8388, "step": 1210 }, { "epoch": 0.26782762041221203, "grad_norm": 0.044934221766930585, "learning_rate": 0.00027500033269761855, "loss": 0.8382, "step": 1215 }, { "epoch": 0.26892979168962855, "grad_norm": 0.04472911222526219, "learning_rate": 0.0002746803419357534, "loss": 0.823, "step": 1220 }, { "epoch": 0.27003196296704507, "grad_norm": 0.042907862680138985, "learning_rate": 0.0002743585049209391, "loss": 0.8217, "step": 1225 }, { "epoch": 0.2711341342444616, "grad_norm": 0.0430952497583985, "learning_rate": 0.00027403482641890324, "loss": 0.8148, "step": 1230 }, { "epoch": 0.2722363055218781, "grad_norm": 0.04625178973487922, "learning_rate": 0.0002737093112226418, "loss": 0.8633, "step": 1235 }, { "epoch": 0.2733384767992946, "grad_norm": 0.05941599713124254, "learning_rate": 0.00027338196415234857, "loss": 0.8307, "step": 1240 }, { "epoch": 0.27444064807671115, "grad_norm": 0.055472061651380494, "learning_rate": 0.0002730527900553432, "loss": 0.8527, "step": 1245 }, { "epoch": 0.2755428193541276, "grad_norm": 0.04678339387739163, "learning_rate": 0.00027272179380600006, "loss": 0.849, "step": 1250 }, { "epoch": 0.27664499063154413, "grad_norm": 0.04692059237171032, "learning_rate": 0.0002723889803056756, "loss": 0.8706, "step": 1255 }, { "epoch": 0.27774716190896065, "grad_norm": 0.04559570312510178, "learning_rate": 0.00027205435448263593, "loss": 0.8418, "step": 1260 }, { "epoch": 0.27884933318637717, "grad_norm": 0.0419292041399672, "learning_rate": 0.0002717179212919838, "loss": 0.8583, "step": 1265 }, { "epoch": 0.2799515044637937, "grad_norm": 0.04430167866474891, "learning_rate": 0.00027137968571558553, "loss": 0.8333, "step": 1270 }, { "epoch": 0.2810536757412102, "grad_norm": 0.0456691993264945, "learning_rate": 0.00027103965276199647, "loss": 0.8447, "step": 1275 }, { "epoch": 0.28215584701862667, "grad_norm": 0.04665005990926171, "learning_rate": 0.0002706978274663879, "loss": 0.7695, "step": 1280 }, { "epoch": 0.2832580182960432, "grad_norm": 0.043712884590464574, "learning_rate": 0.0002703542148904715, "loss": 0.8267, "step": 1285 }, { "epoch": 0.2843601895734597, "grad_norm": 0.04652550786503771, "learning_rate": 0.00027000882012242496, "loss": 0.8437, "step": 1290 }, { "epoch": 0.28546236085087623, "grad_norm": 0.045612398832845166, "learning_rate": 0.00026966164827681643, "loss": 0.8138, "step": 1295 }, { "epoch": 0.28656453212829275, "grad_norm": 0.04839127808608333, "learning_rate": 0.00026931270449452897, "loss": 0.8372, "step": 1300 }, { "epoch": 0.28766670340570927, "grad_norm": 0.0445074064874431, "learning_rate": 0.000268961993942684, "loss": 0.8214, "step": 1305 }, { "epoch": 0.28876887468312573, "grad_norm": 0.0402466463654265, "learning_rate": 0.0002686095218145654, "loss": 0.8086, "step": 1310 }, { "epoch": 0.28987104596054225, "grad_norm": 0.05239340243182599, "learning_rate": 0.000268255293329542, "loss": 0.8368, "step": 1315 }, { "epoch": 0.29097321723795877, "grad_norm": 0.04600417253087546, "learning_rate": 0.0002678993137329908, "loss": 0.8081, "step": 1320 }, { "epoch": 0.2920753885153753, "grad_norm": 0.04275984799543839, "learning_rate": 0.0002675415882962189, "loss": 0.8257, "step": 1325 }, { "epoch": 0.2931775597927918, "grad_norm": 0.044570824361962434, "learning_rate": 0.0002671821223163858, "loss": 0.8208, "step": 1330 }, { "epoch": 0.29427973107020833, "grad_norm": 0.045119447979777474, "learning_rate": 0.0002668209211164244, "loss": 0.8488, "step": 1335 }, { "epoch": 0.29538190234762485, "grad_norm": 0.04036708835023624, "learning_rate": 0.00026645799004496306, "loss": 0.8512, "step": 1340 }, { "epoch": 0.2964840736250413, "grad_norm": 0.041845363942691824, "learning_rate": 0.0002660933344762455, "loss": 0.8228, "step": 1345 }, { "epoch": 0.29758624490245783, "grad_norm": 0.052565125884796726, "learning_rate": 0.0002657269598100518, "loss": 0.833, "step": 1350 }, { "epoch": 0.29868841617987435, "grad_norm": 0.05263005472080342, "learning_rate": 0.0002653588714716181, "loss": 0.8482, "step": 1355 }, { "epoch": 0.29979058745729087, "grad_norm": 0.04489153452747211, "learning_rate": 0.00026498907491155665, "loss": 0.7975, "step": 1360 }, { "epoch": 0.3008927587347074, "grad_norm": 0.0477109850798695, "learning_rate": 0.0002646175756057745, "loss": 0.8168, "step": 1365 }, { "epoch": 0.3019949300121239, "grad_norm": 0.04476257759581266, "learning_rate": 0.00026424437905539315, "loss": 0.8062, "step": 1370 }, { "epoch": 0.30309710128954037, "grad_norm": 0.04415673060651443, "learning_rate": 0.00026386949078666653, "loss": 0.8352, "step": 1375 }, { "epoch": 0.3041992725669569, "grad_norm": 0.04044906513322972, "learning_rate": 0.0002634929163508993, "loss": 0.8299, "step": 1380 }, { "epoch": 0.3053014438443734, "grad_norm": 0.047407594480881006, "learning_rate": 0.0002631146613243648, "loss": 0.8509, "step": 1385 }, { "epoch": 0.30640361512178993, "grad_norm": 0.04212908230670688, "learning_rate": 0.00026273473130822235, "loss": 0.8348, "step": 1390 }, { "epoch": 0.30750578639920645, "grad_norm": 0.04777049716354304, "learning_rate": 0.0002623531319284343, "loss": 0.8477, "step": 1395 }, { "epoch": 0.30860795767662297, "grad_norm": 0.048704830358582564, "learning_rate": 0.00026196986883568284, "loss": 0.8514, "step": 1400 }, { "epoch": 0.30971012895403943, "grad_norm": 0.05238527702204111, "learning_rate": 0.00026158494770528614, "loss": 0.82, "step": 1405 }, { "epoch": 0.31081230023145595, "grad_norm": 0.05352166328603628, "learning_rate": 0.0002611983742371144, "loss": 0.8293, "step": 1410 }, { "epoch": 0.31191447150887247, "grad_norm": 0.04537796100997172, "learning_rate": 0.0002608101541555056, "loss": 0.8001, "step": 1415 }, { "epoch": 0.313016642786289, "grad_norm": 0.04950116503834519, "learning_rate": 0.0002604202932091805, "loss": 0.8406, "step": 1420 }, { "epoch": 0.3141188140637055, "grad_norm": 0.042514906932797684, "learning_rate": 0.0002600287971711576, "loss": 0.8467, "step": 1425 }, { "epoch": 0.31522098534112203, "grad_norm": 0.03957295732321364, "learning_rate": 0.0002596356718386676, "loss": 0.8457, "step": 1430 }, { "epoch": 0.31632315661853855, "grad_norm": 0.04155394927306569, "learning_rate": 0.0002592409230330677, "loss": 0.8087, "step": 1435 }, { "epoch": 0.317425327895955, "grad_norm": 0.04562885194483694, "learning_rate": 0.0002588445565997554, "loss": 0.8394, "step": 1440 }, { "epoch": 0.31852749917337153, "grad_norm": 0.04641875664850794, "learning_rate": 0.0002584465784080817, "loss": 0.8407, "step": 1445 }, { "epoch": 0.31962967045078805, "grad_norm": 0.0430053369111926, "learning_rate": 0.0002580469943512644, "loss": 0.8494, "step": 1450 }, { "epoch": 0.32073184172820457, "grad_norm": 0.04973211543509925, "learning_rate": 0.0002576458103463007, "loss": 0.798, "step": 1455 }, { "epoch": 0.3218340130056211, "grad_norm": 0.04129288667946417, "learning_rate": 0.00025724303233387987, "loss": 0.8446, "step": 1460 }, { "epoch": 0.3229361842830376, "grad_norm": 0.04473055100882414, "learning_rate": 0.00025683866627829486, "loss": 0.8455, "step": 1465 }, { "epoch": 0.32403835556045407, "grad_norm": 0.046468538153449125, "learning_rate": 0.00025643271816735416, "loss": 0.8194, "step": 1470 }, { "epoch": 0.3251405268378706, "grad_norm": 0.039997009480838126, "learning_rate": 0.0002560251940122935, "loss": 0.8198, "step": 1475 }, { "epoch": 0.3262426981152871, "grad_norm": 0.045094913214168995, "learning_rate": 0.000255616099847686, "loss": 0.8099, "step": 1480 }, { "epoch": 0.32734486939270363, "grad_norm": 0.04423782452900588, "learning_rate": 0.0002552054417313538, "loss": 0.8205, "step": 1485 }, { "epoch": 0.32844704067012015, "grad_norm": 0.044248198901321145, "learning_rate": 0.0002547932257442775, "loss": 0.8115, "step": 1490 }, { "epoch": 0.32954921194753667, "grad_norm": 0.04981089857434275, "learning_rate": 0.00025437945799050674, "loss": 0.8398, "step": 1495 }, { "epoch": 0.33065138322495313, "grad_norm": 0.04545569085042973, "learning_rate": 0.00025396414459706926, "loss": 0.8086, "step": 1500 }, { "epoch": 0.33175355450236965, "grad_norm": 0.0428291316212456, "learning_rate": 0.00025354729171388077, "loss": 0.813, "step": 1505 }, { "epoch": 0.33285572577978617, "grad_norm": 0.04223393291828146, "learning_rate": 0.0002531289055136535, "loss": 0.8322, "step": 1510 }, { "epoch": 0.3339578970572027, "grad_norm": 0.04755472767341248, "learning_rate": 0.0002527089921918047, "loss": 0.8496, "step": 1515 }, { "epoch": 0.3350600683346192, "grad_norm": 0.0440763985224576, "learning_rate": 0.00025228755796636524, "loss": 0.8317, "step": 1520 }, { "epoch": 0.33616223961203573, "grad_norm": 0.04739432394265274, "learning_rate": 0.00025186460907788733, "loss": 0.8291, "step": 1525 }, { "epoch": 0.33726441088945225, "grad_norm": 0.04365113581228294, "learning_rate": 0.0002514401517893521, "loss": 0.8314, "step": 1530 }, { "epoch": 0.3383665821668687, "grad_norm": 0.05295299465042367, "learning_rate": 0.0002510141923860769, "loss": 0.8386, "step": 1535 }, { "epoch": 0.33946875344428523, "grad_norm": 0.04480906078196552, "learning_rate": 0.0002505867371756224, "loss": 0.8087, "step": 1540 }, { "epoch": 0.34057092472170175, "grad_norm": 0.04166659673582314, "learning_rate": 0.0002501577924876987, "loss": 0.8336, "step": 1545 }, { "epoch": 0.34167309599911827, "grad_norm": 0.04392236356125392, "learning_rate": 0.0002497273646740723, "loss": 0.8221, "step": 1550 }, { "epoch": 0.3427752672765348, "grad_norm": 0.037583836271015914, "learning_rate": 0.0002492954601084713, "loss": 0.8347, "step": 1555 }, { "epoch": 0.3438774385539513, "grad_norm": 0.04245959731615741, "learning_rate": 0.00024886208518649173, "loss": 0.8341, "step": 1560 }, { "epoch": 0.3449796098313678, "grad_norm": 0.04299442216812149, "learning_rate": 0.00024842724632550216, "loss": 0.8143, "step": 1565 }, { "epoch": 0.3460817811087843, "grad_norm": 0.044416364158721855, "learning_rate": 0.00024799094996454926, "loss": 0.817, "step": 1570 }, { "epoch": 0.3471839523862008, "grad_norm": 0.04080370665097008, "learning_rate": 0.0002475532025642621, "loss": 0.8404, "step": 1575 }, { "epoch": 0.34828612366361733, "grad_norm": 0.043404652747125856, "learning_rate": 0.0002471140106067565, "loss": 0.8056, "step": 1580 }, { "epoch": 0.34938829494103385, "grad_norm": 0.04702543231191137, "learning_rate": 0.0002466733805955394, "loss": 0.8364, "step": 1585 }, { "epoch": 0.35049046621845037, "grad_norm": 0.04379810092412971, "learning_rate": 0.000246231319055412, "loss": 0.7982, "step": 1590 }, { "epoch": 0.35159263749586683, "grad_norm": 0.04861448921339962, "learning_rate": 0.0002457878325323735, "loss": 0.8108, "step": 1595 }, { "epoch": 0.35269480877328335, "grad_norm": 0.05366919048317618, "learning_rate": 0.00024534292759352414, "loss": 0.8406, "step": 1600 }, { "epoch": 0.35379698005069987, "grad_norm": 0.04637591531118319, "learning_rate": 0.000244896610826968, "loss": 0.7878, "step": 1605 }, { "epoch": 0.3548991513281164, "grad_norm": 0.0451952378709979, "learning_rate": 0.00024444888884171505, "loss": 0.8073, "step": 1610 }, { "epoch": 0.3560013226055329, "grad_norm": 0.04229445731850634, "learning_rate": 0.00024399976826758392, "loss": 0.8133, "step": 1615 }, { "epoch": 0.35710349388294943, "grad_norm": 0.04303409290387014, "learning_rate": 0.00024354925575510315, "loss": 0.7969, "step": 1620 }, { "epoch": 0.3582056651603659, "grad_norm": 0.04070575320795186, "learning_rate": 0.00024309735797541318, "loss": 0.8192, "step": 1625 }, { "epoch": 0.3593078364377824, "grad_norm": 0.04197565777082017, "learning_rate": 0.0002426440816201671, "loss": 0.8239, "step": 1630 }, { "epoch": 0.36041000771519893, "grad_norm": 0.044014973357833005, "learning_rate": 0.00024218943340143182, "loss": 0.8334, "step": 1635 }, { "epoch": 0.36151217899261545, "grad_norm": 0.04571887208233434, "learning_rate": 0.00024173342005158894, "loss": 0.8432, "step": 1640 }, { "epoch": 0.36261435027003197, "grad_norm": 0.04094752957919555, "learning_rate": 0.00024127604832323445, "loss": 0.7932, "step": 1645 }, { "epoch": 0.3637165215474485, "grad_norm": 0.04013508208109058, "learning_rate": 0.0002408173249890792, "loss": 0.8034, "step": 1650 }, { "epoch": 0.364818692824865, "grad_norm": 0.040311829670245755, "learning_rate": 0.00024035725684184845, "loss": 0.7866, "step": 1655 }, { "epoch": 0.3659208641022815, "grad_norm": 0.041899754058127306, "learning_rate": 0.00023989585069418134, "loss": 0.7872, "step": 1660 }, { "epoch": 0.367023035379698, "grad_norm": 0.043805964643653376, "learning_rate": 0.0002394331133785299, "loss": 0.8146, "step": 1665 }, { "epoch": 0.3681252066571145, "grad_norm": 0.04065431765935861, "learning_rate": 0.000238969051747058, "loss": 0.8394, "step": 1670 }, { "epoch": 0.36922737793453103, "grad_norm": 0.04291000131859918, "learning_rate": 0.00023850367267153985, "loss": 0.8414, "step": 1675 }, { "epoch": 0.37032954921194755, "grad_norm": 0.04361025776347538, "learning_rate": 0.00023803698304325824, "loss": 0.7826, "step": 1680 }, { "epoch": 0.37143172048936407, "grad_norm": 0.04765203149725648, "learning_rate": 0.00023756898977290235, "loss": 0.8113, "step": 1685 }, { "epoch": 0.37253389176678053, "grad_norm": 0.04008616793311488, "learning_rate": 0.00023709969979046576, "loss": 0.8291, "step": 1690 }, { "epoch": 0.37363606304419705, "grad_norm": 0.04271593974112689, "learning_rate": 0.00023662912004514345, "loss": 0.82, "step": 1695 }, { "epoch": 0.3747382343216136, "grad_norm": 0.04110249253005406, "learning_rate": 0.00023615725750522913, "loss": 0.8305, "step": 1700 }, { "epoch": 0.3758404055990301, "grad_norm": 0.0403637537968849, "learning_rate": 0.00023568411915801205, "loss": 0.8177, "step": 1705 }, { "epoch": 0.3769425768764466, "grad_norm": 0.044289241987404186, "learning_rate": 0.00023520971200967334, "loss": 0.8215, "step": 1710 }, { "epoch": 0.37804474815386313, "grad_norm": 0.04129576160875138, "learning_rate": 0.00023473404308518256, "loss": 0.8337, "step": 1715 }, { "epoch": 0.3791469194312796, "grad_norm": 0.04173364684891999, "learning_rate": 0.00023425711942819333, "loss": 0.8067, "step": 1720 }, { "epoch": 0.3802490907086961, "grad_norm": 0.04440013453397569, "learning_rate": 0.00023377894810093944, "loss": 0.8396, "step": 1725 }, { "epoch": 0.38135126198611263, "grad_norm": 0.04364153613366134, "learning_rate": 0.00023329953618412985, "loss": 0.8126, "step": 1730 }, { "epoch": 0.38245343326352915, "grad_norm": 0.03880573318839816, "learning_rate": 0.0002328188907768441, "loss": 0.7861, "step": 1735 }, { "epoch": 0.38355560454094567, "grad_norm": 0.044630935282676455, "learning_rate": 0.00023233701899642712, "loss": 0.8041, "step": 1740 }, { "epoch": 0.3846577758183622, "grad_norm": 0.045036425102858116, "learning_rate": 0.0002318539279783839, "loss": 0.8389, "step": 1745 }, { "epoch": 0.3857599470957787, "grad_norm": 0.044227375694242045, "learning_rate": 0.0002313696248762737, "loss": 0.8024, "step": 1750 }, { "epoch": 0.3868621183731952, "grad_norm": 0.04607651991018229, "learning_rate": 0.00023088411686160415, "loss": 0.8087, "step": 1755 }, { "epoch": 0.3879642896506117, "grad_norm": 0.04430605414125412, "learning_rate": 0.00023039741112372528, "loss": 0.8279, "step": 1760 }, { "epoch": 0.3890664609280282, "grad_norm": 0.047410669524145625, "learning_rate": 0.00022990951486972258, "loss": 0.8104, "step": 1765 }, { "epoch": 0.39016863220544473, "grad_norm": 0.041525912031087554, "learning_rate": 0.0002294204353243109, "loss": 0.7937, "step": 1770 }, { "epoch": 0.39127080348286125, "grad_norm": 0.03803908868674592, "learning_rate": 0.00022893017972972686, "loss": 0.8099, "step": 1775 }, { "epoch": 0.39237297476027777, "grad_norm": 0.04663481529200365, "learning_rate": 0.00022843875534562204, "loss": 0.7985, "step": 1780 }, { "epoch": 0.39347514603769423, "grad_norm": 0.04209155232991889, "learning_rate": 0.0002279461694489553, "loss": 0.7984, "step": 1785 }, { "epoch": 0.39457731731511075, "grad_norm": 0.04273613495319002, "learning_rate": 0.00022745242933388507, "loss": 0.7856, "step": 1790 }, { "epoch": 0.3956794885925273, "grad_norm": 0.04007855374098208, "learning_rate": 0.00022695754231166125, "loss": 0.798, "step": 1795 }, { "epoch": 0.3967816598699438, "grad_norm": 0.03874216976594234, "learning_rate": 0.0002264615157105171, "loss": 0.8303, "step": 1800 }, { "epoch": 0.3978838311473603, "grad_norm": 0.042291279075714026, "learning_rate": 0.00022596435687556067, "loss": 0.8284, "step": 1805 }, { "epoch": 0.39898600242477683, "grad_norm": 0.04134434249625502, "learning_rate": 0.00022546607316866583, "loss": 0.8143, "step": 1810 }, { "epoch": 0.4000881737021933, "grad_norm": 0.0426391584789927, "learning_rate": 0.00022496667196836358, "loss": 0.8291, "step": 1815 }, { "epoch": 0.4011903449796098, "grad_norm": 0.03943254080642002, "learning_rate": 0.0002244661606697326, "loss": 0.8093, "step": 1820 }, { "epoch": 0.40229251625702633, "grad_norm": 0.04164893383401447, "learning_rate": 0.00022396454668428982, "loss": 0.8135, "step": 1825 }, { "epoch": 0.40339468753444285, "grad_norm": 0.04110460672662909, "learning_rate": 0.00022346183743988056, "loss": 0.8083, "step": 1830 }, { "epoch": 0.40449685881185937, "grad_norm": 0.04143399066869764, "learning_rate": 0.00022295804038056867, "loss": 0.798, "step": 1835 }, { "epoch": 0.4055990300892759, "grad_norm": 0.04010672194926782, "learning_rate": 0.0002224531629665263, "loss": 0.8132, "step": 1840 }, { "epoch": 0.4067012013666924, "grad_norm": 0.040718372885152405, "learning_rate": 0.00022194721267392324, "loss": 0.8237, "step": 1845 }, { "epoch": 0.4078033726441089, "grad_norm": 0.044019144470429636, "learning_rate": 0.0002214401969948164, "loss": 0.7955, "step": 1850 }, { "epoch": 0.4089055439215254, "grad_norm": 0.0449280577770788, "learning_rate": 0.00022093212343703893, "loss": 0.7929, "step": 1855 }, { "epoch": 0.4100077151989419, "grad_norm": 0.050365477985191316, "learning_rate": 0.00022042299952408872, "loss": 0.8389, "step": 1860 }, { "epoch": 0.41110988647635843, "grad_norm": 0.04032290135169436, "learning_rate": 0.00021991283279501744, "loss": 0.796, "step": 1865 }, { "epoch": 0.41221205775377495, "grad_norm": 0.03859944961488867, "learning_rate": 0.0002194016308043185, "loss": 0.7977, "step": 1870 }, { "epoch": 0.41331422903119147, "grad_norm": 0.04159446854663543, "learning_rate": 0.00021888940112181542, "loss": 0.826, "step": 1875 }, { "epoch": 0.41441640030860794, "grad_norm": 0.04359969871410925, "learning_rate": 0.0002183761513325496, "loss": 0.8251, "step": 1880 }, { "epoch": 0.41551857158602445, "grad_norm": 0.04343781370305786, "learning_rate": 0.0002178618890366682, "loss": 0.7984, "step": 1885 }, { "epoch": 0.416620742863441, "grad_norm": 0.04412546972897982, "learning_rate": 0.00021734662184931137, "loss": 0.8275, "step": 1890 }, { "epoch": 0.4177229141408575, "grad_norm": 0.046901376458696624, "learning_rate": 0.00021683035740049952, "loss": 0.8286, "step": 1895 }, { "epoch": 0.418825085418274, "grad_norm": 0.04296065580749498, "learning_rate": 0.00021631310333502062, "loss": 0.8245, "step": 1900 }, { "epoch": 0.41992725669569053, "grad_norm": 0.042120916788582694, "learning_rate": 0.00021579486731231653, "loss": 0.7803, "step": 1905 }, { "epoch": 0.421029427973107, "grad_norm": 0.040883347305042415, "learning_rate": 0.00021527565700637003, "loss": 0.8347, "step": 1910 }, { "epoch": 0.4221315992505235, "grad_norm": 0.04111643166609028, "learning_rate": 0.0002147554801055908, "loss": 0.7808, "step": 1915 }, { "epoch": 0.42323377052794003, "grad_norm": 0.042728251907987964, "learning_rate": 0.0002142343443127018, "loss": 0.8306, "step": 1920 }, { "epoch": 0.42433594180535655, "grad_norm": 0.042082803091619436, "learning_rate": 0.0002137122573446254, "loss": 0.8057, "step": 1925 }, { "epoch": 0.4254381130827731, "grad_norm": 0.04346598671784763, "learning_rate": 0.00021318922693236845, "loss": 0.812, "step": 1930 }, { "epoch": 0.4265402843601896, "grad_norm": 0.04311043189883461, "learning_rate": 0.00021266526082090858, "loss": 0.7732, "step": 1935 }, { "epoch": 0.4276424556376061, "grad_norm": 0.03914987981413652, "learning_rate": 0.00021214036676907888, "loss": 0.7875, "step": 1940 }, { "epoch": 0.4287446269150226, "grad_norm": 0.03762591331444608, "learning_rate": 0.00021161455254945354, "loss": 0.8256, "step": 1945 }, { "epoch": 0.4298467981924391, "grad_norm": 0.04123350564815786, "learning_rate": 0.00021108782594823227, "loss": 0.8177, "step": 1950 }, { "epoch": 0.4309489694698556, "grad_norm": 0.0396041073736449, "learning_rate": 0.00021056019476512532, "loss": 0.8145, "step": 1955 }, { "epoch": 0.43205114074727213, "grad_norm": 0.0380641273178196, "learning_rate": 0.00021003166681323794, "loss": 0.7952, "step": 1960 }, { "epoch": 0.43315331202468865, "grad_norm": 0.038313042170990505, "learning_rate": 0.00020950224991895456, "loss": 0.7872, "step": 1965 }, { "epoch": 0.43425548330210517, "grad_norm": 0.04604786797820384, "learning_rate": 0.00020897195192182299, "loss": 0.8094, "step": 1970 }, { "epoch": 0.43535765457952164, "grad_norm": 0.04044192743630967, "learning_rate": 0.00020844078067443835, "loss": 0.8141, "step": 1975 }, { "epoch": 0.43645982585693816, "grad_norm": 0.04177226121543752, "learning_rate": 0.00020790874404232667, "loss": 0.8181, "step": 1980 }, { "epoch": 0.4375619971343547, "grad_norm": 0.04439891759072682, "learning_rate": 0.00020737584990382862, "loss": 0.7925, "step": 1985 }, { "epoch": 0.4386641684117712, "grad_norm": 0.04270949977662141, "learning_rate": 0.0002068421061499826, "loss": 0.7786, "step": 1990 }, { "epoch": 0.4397663396891877, "grad_norm": 0.040991624121934515, "learning_rate": 0.0002063075206844082, "loss": 0.8308, "step": 1995 }, { "epoch": 0.44086851096660423, "grad_norm": 0.041275862470859626, "learning_rate": 0.00020577210142318876, "loss": 0.8342, "step": 2000 }, { "epoch": 0.4419706822440207, "grad_norm": 0.040817642657457055, "learning_rate": 0.00020523585629475457, "loss": 0.8274, "step": 2005 }, { "epoch": 0.4430728535214372, "grad_norm": 0.04417509610771959, "learning_rate": 0.00020469879323976517, "loss": 0.8176, "step": 2010 }, { "epoch": 0.44417502479885373, "grad_norm": 0.04251521072112325, "learning_rate": 0.00020416092021099193, "loss": 0.8049, "step": 2015 }, { "epoch": 0.44527719607627025, "grad_norm": 0.042166821439132904, "learning_rate": 0.00020362224517320014, "loss": 0.8014, "step": 2020 }, { "epoch": 0.4463793673536868, "grad_norm": 0.041857744422797494, "learning_rate": 0.0002030827761030312, "loss": 0.7916, "step": 2025 }, { "epoch": 0.4474815386311033, "grad_norm": 0.0411849481904813, "learning_rate": 0.00020254252098888447, "loss": 0.7706, "step": 2030 }, { "epoch": 0.44858370990851976, "grad_norm": 0.04092873755686162, "learning_rate": 0.00020200148783079892, "loss": 0.7896, "step": 2035 }, { "epoch": 0.4496858811859363, "grad_norm": 0.04336026242745058, "learning_rate": 0.0002014596846403348, "loss": 0.7672, "step": 2040 }, { "epoch": 0.4507880524633528, "grad_norm": 0.03910209178598118, "learning_rate": 0.0002009171194404548, "loss": 0.7752, "step": 2045 }, { "epoch": 0.4518902237407693, "grad_norm": 0.04605371401305054, "learning_rate": 0.00020037380026540543, "loss": 0.8172, "step": 2050 }, { "epoch": 0.45299239501818583, "grad_norm": 0.04099809926171493, "learning_rate": 0.000199829735160598, "loss": 0.7939, "step": 2055 }, { "epoch": 0.45409456629560235, "grad_norm": 0.04031696273456619, "learning_rate": 0.0001992849321824894, "loss": 0.7852, "step": 2060 }, { "epoch": 0.4551967375730189, "grad_norm": 0.038355940898486575, "learning_rate": 0.0001987393993984629, "loss": 0.7772, "step": 2065 }, { "epoch": 0.45629890885043534, "grad_norm": 0.04164706481380678, "learning_rate": 0.00019819314488670866, "loss": 0.8031, "step": 2070 }, { "epoch": 0.45740108012785186, "grad_norm": 0.042502765015544626, "learning_rate": 0.00019764617673610413, "loss": 0.8199, "step": 2075 }, { "epoch": 0.4585032514052684, "grad_norm": 0.0394129908093004, "learning_rate": 0.0001970985030460942, "loss": 0.7861, "step": 2080 }, { "epoch": 0.4596054226826849, "grad_norm": 0.03849198049749736, "learning_rate": 0.00019655013192657135, "loss": 0.79, "step": 2085 }, { "epoch": 0.4607075939601014, "grad_norm": 0.044138018952375395, "learning_rate": 0.0001960010714977555, "loss": 0.7813, "step": 2090 }, { "epoch": 0.46180976523751793, "grad_norm": 0.04225359100682865, "learning_rate": 0.00019545132989007375, "loss": 0.7865, "step": 2095 }, { "epoch": 0.4629119365149344, "grad_norm": 0.041786418031829003, "learning_rate": 0.00019490091524404016, "loss": 0.7911, "step": 2100 }, { "epoch": 0.4640141077923509, "grad_norm": 0.04528452410421116, "learning_rate": 0.00019434983571013485, "loss": 0.7932, "step": 2105 }, { "epoch": 0.46511627906976744, "grad_norm": 0.03898208334364371, "learning_rate": 0.00019379809944868376, "loss": 0.8061, "step": 2110 }, { "epoch": 0.46621845034718395, "grad_norm": 0.039265674569634, "learning_rate": 0.00019324571462973737, "loss": 0.7707, "step": 2115 }, { "epoch": 0.4673206216246005, "grad_norm": 0.040002387303416556, "learning_rate": 0.00019269268943295013, "loss": 0.7777, "step": 2120 }, { "epoch": 0.468422792902017, "grad_norm": 0.04208726677362012, "learning_rate": 0.00019213903204745895, "loss": 0.7979, "step": 2125 }, { "epoch": 0.46952496417943346, "grad_norm": 0.03816406171273399, "learning_rate": 0.0001915847506717622, "loss": 0.806, "step": 2130 }, { "epoch": 0.47062713545685, "grad_norm": 0.04211319505603739, "learning_rate": 0.00019102985351359832, "loss": 0.7887, "step": 2135 }, { "epoch": 0.4717293067342665, "grad_norm": 0.041883426079474324, "learning_rate": 0.00019047434878982403, "loss": 0.7814, "step": 2140 }, { "epoch": 0.472831478011683, "grad_norm": 0.037960810503627374, "learning_rate": 0.00018991824472629293, "loss": 0.7698, "step": 2145 }, { "epoch": 0.47393364928909953, "grad_norm": 0.03832451540546507, "learning_rate": 0.0001893615495577335, "loss": 0.7953, "step": 2150 }, { "epoch": 0.47503582056651605, "grad_norm": 0.0399683526421958, "learning_rate": 0.0001888042715276273, "loss": 0.7875, "step": 2155 }, { "epoch": 0.4761379918439326, "grad_norm": 0.03995007716254075, "learning_rate": 0.00018824641888808683, "loss": 0.7958, "step": 2160 }, { "epoch": 0.47724016312134904, "grad_norm": 0.04082501729363916, "learning_rate": 0.0001876879998997333, "loss": 0.8004, "step": 2165 }, { "epoch": 0.47834233439876556, "grad_norm": 0.03753976072129883, "learning_rate": 0.00018712902283157438, "loss": 0.7862, "step": 2170 }, { "epoch": 0.4794445056761821, "grad_norm": 0.039247708791779135, "learning_rate": 0.00018656949596088177, "loss": 0.7846, "step": 2175 }, { "epoch": 0.4805466769535986, "grad_norm": 0.040194874528480064, "learning_rate": 0.00018600942757306853, "loss": 0.7948, "step": 2180 }, { "epoch": 0.4816488482310151, "grad_norm": 0.042684703351327506, "learning_rate": 0.00018544882596156643, "loss": 0.8328, "step": 2185 }, { "epoch": 0.48275101950843163, "grad_norm": 0.04256364237578435, "learning_rate": 0.0001848876994277032, "loss": 0.8036, "step": 2190 }, { "epoch": 0.4838531907858481, "grad_norm": 0.04084201277383364, "learning_rate": 0.0001843260562805796, "loss": 0.7838, "step": 2195 }, { "epoch": 0.4849553620632646, "grad_norm": 0.03951302995836591, "learning_rate": 0.0001837639048369462, "loss": 0.7729, "step": 2200 }, { "epoch": 0.48605753334068114, "grad_norm": 0.0440116634757643, "learning_rate": 0.00018320125342108058, "loss": 0.8097, "step": 2205 }, { "epoch": 0.48715970461809766, "grad_norm": 0.03880553291311738, "learning_rate": 0.0001826381103646636, "loss": 0.7858, "step": 2210 }, { "epoch": 0.4882618758955142, "grad_norm": 0.04347096499790543, "learning_rate": 0.00018207448400665656, "loss": 0.7931, "step": 2215 }, { "epoch": 0.4893640471729307, "grad_norm": 0.04137171191530775, "learning_rate": 0.0001815103826931772, "loss": 0.7904, "step": 2220 }, { "epoch": 0.49046621845034716, "grad_norm": 0.03924451193006248, "learning_rate": 0.00018094581477737652, "loss": 0.7892, "step": 2225 }, { "epoch": 0.4915683897277637, "grad_norm": 0.035326553011083374, "learning_rate": 0.00018038078861931482, "loss": 0.7699, "step": 2230 }, { "epoch": 0.4926705610051802, "grad_norm": 0.037295368892913544, "learning_rate": 0.00017981531258583794, "loss": 0.7688, "step": 2235 }, { "epoch": 0.4937727322825967, "grad_norm": 0.04102492750892726, "learning_rate": 0.00017924939505045364, "loss": 0.7959, "step": 2240 }, { "epoch": 0.49487490356001324, "grad_norm": 0.038911715276106644, "learning_rate": 0.0001786830443932071, "loss": 0.8129, "step": 2245 }, { "epoch": 0.49597707483742975, "grad_norm": 0.0401123359070641, "learning_rate": 0.00017811626900055748, "loss": 0.8031, "step": 2250 }, { "epoch": 0.4970792461148463, "grad_norm": 0.044018814366690515, "learning_rate": 0.00017754907726525302, "loss": 0.7963, "step": 2255 }, { "epoch": 0.49818141739226274, "grad_norm": 0.03902942910566316, "learning_rate": 0.00017698147758620736, "loss": 0.7607, "step": 2260 }, { "epoch": 0.49928358866967926, "grad_norm": 0.04968656280514321, "learning_rate": 0.0001764134783683748, "loss": 0.8039, "step": 2265 }, { "epoch": 0.5003857599470958, "grad_norm": 0.04093320446137599, "learning_rate": 0.00017584508802262602, "loss": 0.8126, "step": 2270 }, { "epoch": 0.5014879312245123, "grad_norm": 0.04617686206504717, "learning_rate": 0.00017527631496562352, "loss": 0.8063, "step": 2275 }, { "epoch": 0.5025901025019288, "grad_norm": 0.038529497279601546, "learning_rate": 0.0001747071676196968, "loss": 0.7816, "step": 2280 }, { "epoch": 0.5036922737793453, "grad_norm": 0.03888023182945524, "learning_rate": 0.000174137654412718, "loss": 0.8077, "step": 2285 }, { "epoch": 0.5047944450567619, "grad_norm": 0.04001737897478531, "learning_rate": 0.00017356778377797664, "loss": 0.8262, "step": 2290 }, { "epoch": 0.5058966163341784, "grad_norm": 0.039323331641178884, "learning_rate": 0.00017299756415405524, "loss": 0.795, "step": 2295 }, { "epoch": 0.5069987876115949, "grad_norm": 0.04467088321827032, "learning_rate": 0.00017242700398470393, "loss": 0.7939, "step": 2300 }, { "epoch": 0.5081009588890113, "grad_norm": 0.043810507147911466, "learning_rate": 0.00017185611171871573, "loss": 0.7669, "step": 2305 }, { "epoch": 0.5092031301664278, "grad_norm": 0.040939929874561506, "learning_rate": 0.0001712848958098012, "loss": 0.8017, "step": 2310 }, { "epoch": 0.5103053014438443, "grad_norm": 0.03898184972185426, "learning_rate": 0.00017071336471646348, "loss": 0.8045, "step": 2315 }, { "epoch": 0.5114074727212609, "grad_norm": 0.038962684331486475, "learning_rate": 0.0001701415269018728, "loss": 0.8071, "step": 2320 }, { "epoch": 0.5125096439986774, "grad_norm": 0.04491474953288176, "learning_rate": 0.0001695693908337414, "loss": 0.7909, "step": 2325 }, { "epoch": 0.5136118152760939, "grad_norm": 0.04437277653745269, "learning_rate": 0.00016899696498419794, "loss": 0.7973, "step": 2330 }, { "epoch": 0.5147139865535104, "grad_norm": 0.04158284754618555, "learning_rate": 0.00016842425782966224, "loss": 0.7778, "step": 2335 }, { "epoch": 0.5158161578309269, "grad_norm": 0.04116323767907288, "learning_rate": 0.00016785127785071949, "loss": 0.8043, "step": 2340 }, { "epoch": 0.5169183291083435, "grad_norm": 0.034684359802969134, "learning_rate": 0.000167278033531995, "loss": 0.79, "step": 2345 }, { "epoch": 0.51802050038576, "grad_norm": 0.04229406791148304, "learning_rate": 0.0001667045333620283, "loss": 0.7795, "step": 2350 }, { "epoch": 0.5191226716631765, "grad_norm": 0.03764426814798344, "learning_rate": 0.00016613078583314756, "loss": 0.7781, "step": 2355 }, { "epoch": 0.520224842940593, "grad_norm": 0.03973158416131846, "learning_rate": 0.00016555679944134382, "loss": 0.7873, "step": 2360 }, { "epoch": 0.5213270142180095, "grad_norm": 0.039706342704437673, "learning_rate": 0.00016498258268614514, "loss": 0.761, "step": 2365 }, { "epoch": 0.5224291854954259, "grad_norm": 0.04246187330898323, "learning_rate": 0.00016440814407049092, "loss": 0.7904, "step": 2370 }, { "epoch": 0.5235313567728425, "grad_norm": 0.037315547695955284, "learning_rate": 0.00016383349210060555, "loss": 0.7916, "step": 2375 }, { "epoch": 0.524633528050259, "grad_norm": 0.03680043140841983, "learning_rate": 0.000163258635285873, "loss": 0.7839, "step": 2380 }, { "epoch": 0.5257356993276755, "grad_norm": 0.04240483271155321, "learning_rate": 0.00016268358213871058, "loss": 0.7717, "step": 2385 }, { "epoch": 0.526837870605092, "grad_norm": 0.038360421064488774, "learning_rate": 0.0001621083411744427, "loss": 0.8082, "step": 2390 }, { "epoch": 0.5279400418825085, "grad_norm": 0.03923682854690225, "learning_rate": 0.00016153292091117505, "loss": 0.7675, "step": 2395 }, { "epoch": 0.5290422131599251, "grad_norm": 0.04266472133785625, "learning_rate": 0.00016095732986966824, "loss": 0.7826, "step": 2400 }, { "epoch": 0.5301443844373416, "grad_norm": 0.04032119205943179, "learning_rate": 0.00016038157657321202, "loss": 0.7694, "step": 2405 }, { "epoch": 0.5312465557147581, "grad_norm": 0.038493359106615645, "learning_rate": 0.0001598056695474984, "loss": 0.7851, "step": 2410 }, { "epoch": 0.5323487269921746, "grad_norm": 0.04788764713041358, "learning_rate": 0.00015922961732049617, "loss": 0.8041, "step": 2415 }, { "epoch": 0.5334508982695911, "grad_norm": 0.03867840390035932, "learning_rate": 0.000158653428422324, "loss": 0.763, "step": 2420 }, { "epoch": 0.5345530695470077, "grad_norm": 0.03926183742629788, "learning_rate": 0.00015807711138512458, "loss": 0.774, "step": 2425 }, { "epoch": 0.5356552408244241, "grad_norm": 0.04275754859474233, "learning_rate": 0.00015750067474293774, "loss": 0.8008, "step": 2430 }, { "epoch": 0.5367574121018406, "grad_norm": 0.041396745192273696, "learning_rate": 0.00015692412703157478, "loss": 0.7899, "step": 2435 }, { "epoch": 0.5378595833792571, "grad_norm": 0.03771777455755809, "learning_rate": 0.00015634747678849146, "loss": 0.7662, "step": 2440 }, { "epoch": 0.5389617546566736, "grad_norm": 0.037105475754751184, "learning_rate": 0.00015577073255266185, "loss": 0.7963, "step": 2445 }, { "epoch": 0.5400639259340901, "grad_norm": 0.042466734201556076, "learning_rate": 0.00015519390286445201, "loss": 0.7795, "step": 2450 }, { "epoch": 0.5411660972115067, "grad_norm": 0.03800624488466557, "learning_rate": 0.00015461699626549314, "loss": 0.7789, "step": 2455 }, { "epoch": 0.5422682684889232, "grad_norm": 0.03799071186124082, "learning_rate": 0.00015404002129855557, "loss": 0.7621, "step": 2460 }, { "epoch": 0.5433704397663397, "grad_norm": 0.04246315994111129, "learning_rate": 0.00015346298650742177, "loss": 0.7898, "step": 2465 }, { "epoch": 0.5444726110437562, "grad_norm": 0.0383853594452228, "learning_rate": 0.00015288590043676027, "loss": 0.7838, "step": 2470 }, { "epoch": 0.5455747823211727, "grad_norm": 0.037342426062281935, "learning_rate": 0.00015230877163199878, "loss": 0.7746, "step": 2475 }, { "epoch": 0.5466769535985893, "grad_norm": 0.03967766879530587, "learning_rate": 0.000151731608639198, "loss": 0.7807, "step": 2480 }, { "epoch": 0.5477791248760058, "grad_norm": 0.038046687905520335, "learning_rate": 0.0001511544200049247, "loss": 0.7624, "step": 2485 }, { "epoch": 0.5488812961534223, "grad_norm": 0.038282722756821576, "learning_rate": 0.00015057721427612548, "loss": 0.7781, "step": 2490 }, { "epoch": 0.5499834674308387, "grad_norm": 0.04204297605214361, "learning_rate": 0.00015, "loss": 0.7889, "step": 2495 }, { "epoch": 0.5510856387082552, "grad_norm": 0.04253941444925998, "learning_rate": 0.00014942278572387452, "loss": 0.7874, "step": 2500 }, { "epoch": 0.5521878099856717, "grad_norm": 0.04099337109892425, "learning_rate": 0.00014884557999507528, "loss": 0.7932, "step": 2505 }, { "epoch": 0.5532899812630883, "grad_norm": 0.043225237652168194, "learning_rate": 0.00014826839136080204, "loss": 0.8035, "step": 2510 }, { "epoch": 0.5543921525405048, "grad_norm": 0.04237211794633771, "learning_rate": 0.00014769122836800122, "loss": 0.782, "step": 2515 }, { "epoch": 0.5554943238179213, "grad_norm": 0.0390643188084349, "learning_rate": 0.00014711409956323976, "loss": 0.8021, "step": 2520 }, { "epoch": 0.5565964950953378, "grad_norm": 0.038912412857210685, "learning_rate": 0.00014653701349257823, "loss": 0.7713, "step": 2525 }, { "epoch": 0.5576986663727543, "grad_norm": 0.04021618253944335, "learning_rate": 0.00014595997870144443, "loss": 0.7711, "step": 2530 }, { "epoch": 0.5588008376501709, "grad_norm": 0.04054714580080947, "learning_rate": 0.00014538300373450683, "loss": 0.7959, "step": 2535 }, { "epoch": 0.5599030089275874, "grad_norm": 0.0378078133538945, "learning_rate": 0.00014480609713554796, "loss": 0.7533, "step": 2540 }, { "epoch": 0.5610051802050039, "grad_norm": 0.03566763348348747, "learning_rate": 0.0001442292674473381, "loss": 0.7842, "step": 2545 }, { "epoch": 0.5621073514824204, "grad_norm": 0.04162087831710151, "learning_rate": 0.0001436525232115086, "loss": 0.7765, "step": 2550 }, { "epoch": 0.5632095227598369, "grad_norm": 0.039192296915950345, "learning_rate": 0.00014307587296842524, "loss": 0.7761, "step": 2555 }, { "epoch": 0.5643116940372533, "grad_norm": 0.040771917142651264, "learning_rate": 0.00014249932525706223, "loss": 0.7637, "step": 2560 }, { "epoch": 0.5654138653146699, "grad_norm": 0.0404557988061511, "learning_rate": 0.00014192288861487545, "loss": 0.7809, "step": 2565 }, { "epoch": 0.5665160365920864, "grad_norm": 0.0380287543914902, "learning_rate": 0.00014134657157767593, "loss": 0.7744, "step": 2570 }, { "epoch": 0.5676182078695029, "grad_norm": 0.037545959627603626, "learning_rate": 0.00014077038267950383, "loss": 0.7705, "step": 2575 }, { "epoch": 0.5687203791469194, "grad_norm": 0.04121660123612755, "learning_rate": 0.00014019433045250158, "loss": 0.7969, "step": 2580 }, { "epoch": 0.5698225504243359, "grad_norm": 0.03543171857215221, "learning_rate": 0.00013961842342678798, "loss": 0.7706, "step": 2585 }, { "epoch": 0.5709247217017525, "grad_norm": 0.03970445149727504, "learning_rate": 0.0001390426701303317, "loss": 0.7813, "step": 2590 }, { "epoch": 0.572026892979169, "grad_norm": 0.04263582518492721, "learning_rate": 0.00013846707908882498, "loss": 0.8044, "step": 2595 }, { "epoch": 0.5731290642565855, "grad_norm": 0.03919599782168947, "learning_rate": 0.0001378916588255573, "loss": 0.7709, "step": 2600 }, { "epoch": 0.574231235534002, "grad_norm": 0.04085513710063447, "learning_rate": 0.0001373164178612894, "loss": 0.7916, "step": 2605 }, { "epoch": 0.5753334068114185, "grad_norm": 0.03947664604661718, "learning_rate": 0.0001367413647141269, "loss": 0.7829, "step": 2610 }, { "epoch": 0.576435578088835, "grad_norm": 0.03819848642986916, "learning_rate": 0.00013616650789939443, "loss": 0.7736, "step": 2615 }, { "epoch": 0.5775377493662515, "grad_norm": 0.03928261920892333, "learning_rate": 0.0001355918559295091, "loss": 0.7934, "step": 2620 }, { "epoch": 0.578639920643668, "grad_norm": 0.04465492341767027, "learning_rate": 0.00013501741731385483, "loss": 0.7872, "step": 2625 }, { "epoch": 0.5797420919210845, "grad_norm": 0.04518050772542813, "learning_rate": 0.00013444320055865618, "loss": 0.7978, "step": 2630 }, { "epoch": 0.580844263198501, "grad_norm": 0.03823568510951906, "learning_rate": 0.00013386921416685239, "loss": 0.8026, "step": 2635 }, { "epoch": 0.5819464344759175, "grad_norm": 0.03860337235855855, "learning_rate": 0.0001332954666379717, "loss": 0.7819, "step": 2640 }, { "epoch": 0.5830486057533341, "grad_norm": 0.040848904672585555, "learning_rate": 0.00013272196646800497, "loss": 0.7718, "step": 2645 }, { "epoch": 0.5841507770307506, "grad_norm": 0.039991424568808075, "learning_rate": 0.0001321487221492805, "loss": 0.7737, "step": 2650 }, { "epoch": 0.5852529483081671, "grad_norm": 0.04406998907502384, "learning_rate": 0.00013157574217033773, "loss": 0.7804, "step": 2655 }, { "epoch": 0.5863551195855836, "grad_norm": 0.042736167667461564, "learning_rate": 0.00013100303501580206, "loss": 0.7864, "step": 2660 }, { "epoch": 0.5874572908630001, "grad_norm": 0.039658625402537326, "learning_rate": 0.0001304306091662586, "loss": 0.7879, "step": 2665 }, { "epoch": 0.5885594621404167, "grad_norm": 0.03732667476656254, "learning_rate": 0.0001298584730981272, "loss": 0.7958, "step": 2670 }, { "epoch": 0.5896616334178332, "grad_norm": 0.0385663925190591, "learning_rate": 0.00012928663528353652, "loss": 0.7532, "step": 2675 }, { "epoch": 0.5907638046952497, "grad_norm": 0.039100737225537294, "learning_rate": 0.00012871510419019876, "loss": 0.8146, "step": 2680 }, { "epoch": 0.5918659759726661, "grad_norm": 0.04322733978868932, "learning_rate": 0.0001281438882812843, "loss": 0.7844, "step": 2685 }, { "epoch": 0.5929681472500826, "grad_norm": 0.03855540614705994, "learning_rate": 0.00012757299601529604, "loss": 0.7444, "step": 2690 }, { "epoch": 0.5940703185274991, "grad_norm": 0.03976869418403505, "learning_rate": 0.00012700243584594479, "loss": 0.7706, "step": 2695 }, { "epoch": 0.5951724898049157, "grad_norm": 0.038039179428976305, "learning_rate": 0.00012643221622202336, "loss": 0.7497, "step": 2700 }, { "epoch": 0.5962746610823322, "grad_norm": 0.03951382972148692, "learning_rate": 0.00012586234558728207, "loss": 0.7571, "step": 2705 }, { "epoch": 0.5973768323597487, "grad_norm": 0.04180256816699712, "learning_rate": 0.0001252928323803032, "loss": 0.7538, "step": 2710 }, { "epoch": 0.5984790036371652, "grad_norm": 0.037309356744730904, "learning_rate": 0.00012472368503437648, "loss": 0.7924, "step": 2715 }, { "epoch": 0.5995811749145817, "grad_norm": 0.04216451657557382, "learning_rate": 0.00012415491197737395, "loss": 0.7816, "step": 2720 }, { "epoch": 0.6006833461919983, "grad_norm": 0.039059339374739786, "learning_rate": 0.00012358652163162523, "loss": 0.7394, "step": 2725 }, { "epoch": 0.6017855174694148, "grad_norm": 0.03934478577799933, "learning_rate": 0.00012301852241379267, "loss": 0.7903, "step": 2730 }, { "epoch": 0.6028876887468313, "grad_norm": 0.03677529471356991, "learning_rate": 0.00012245092273474695, "loss": 0.7688, "step": 2735 }, { "epoch": 0.6039898600242478, "grad_norm": 0.04379989553642847, "learning_rate": 0.00012188373099944252, "loss": 0.7791, "step": 2740 }, { "epoch": 0.6050920313016642, "grad_norm": 0.03853225124689197, "learning_rate": 0.00012131695560679285, "loss": 0.7842, "step": 2745 }, { "epoch": 0.6061942025790807, "grad_norm": 0.04243715932465795, "learning_rate": 0.0001207506049495464, "loss": 0.7633, "step": 2750 }, { "epoch": 0.6072963738564973, "grad_norm": 0.042350895868304664, "learning_rate": 0.00012018468741416206, "loss": 0.7992, "step": 2755 }, { "epoch": 0.6083985451339138, "grad_norm": 0.042511366861454146, "learning_rate": 0.00011961921138068517, "loss": 0.7628, "step": 2760 }, { "epoch": 0.6095007164113303, "grad_norm": 0.03945967376445848, "learning_rate": 0.00011905418522262343, "loss": 0.7798, "step": 2765 }, { "epoch": 0.6106028876887468, "grad_norm": 0.04134467702346202, "learning_rate": 0.00011848961730682276, "loss": 0.7736, "step": 2770 }, { "epoch": 0.6117050589661633, "grad_norm": 0.04321593442131728, "learning_rate": 0.00011792551599334342, "loss": 0.7729, "step": 2775 }, { "epoch": 0.6128072302435799, "grad_norm": 0.03753481531786513, "learning_rate": 0.00011736188963533636, "loss": 0.7868, "step": 2780 }, { "epoch": 0.6139094015209964, "grad_norm": 0.046677240052044086, "learning_rate": 0.0001167987465789194, "loss": 0.788, "step": 2785 }, { "epoch": 0.6150115727984129, "grad_norm": 0.04004956558986007, "learning_rate": 0.00011623609516305375, "loss": 0.7669, "step": 2790 }, { "epoch": 0.6161137440758294, "grad_norm": 0.03877661195736084, "learning_rate": 0.0001156739437194204, "loss": 0.7403, "step": 2795 }, { "epoch": 0.6172159153532459, "grad_norm": 0.037935876666486346, "learning_rate": 0.00011511230057229678, "loss": 0.7373, "step": 2800 }, { "epoch": 0.6183180866306625, "grad_norm": 0.038133591224345446, "learning_rate": 0.00011455117403843358, "loss": 0.7626, "step": 2805 }, { "epoch": 0.6194202579080789, "grad_norm": 0.040488281712004456, "learning_rate": 0.00011399057242693143, "loss": 0.7748, "step": 2810 }, { "epoch": 0.6205224291854954, "grad_norm": 0.045588727331748555, "learning_rate": 0.00011343050403911823, "loss": 0.7566, "step": 2815 }, { "epoch": 0.6216246004629119, "grad_norm": 0.04322927430800435, "learning_rate": 0.0001128709771684256, "loss": 0.7405, "step": 2820 }, { "epoch": 0.6227267717403284, "grad_norm": 0.03941622010511477, "learning_rate": 0.00011231200010026668, "loss": 0.7699, "step": 2825 }, { "epoch": 0.6238289430177449, "grad_norm": 0.040341244924510265, "learning_rate": 0.00011175358111191316, "loss": 0.7546, "step": 2830 }, { "epoch": 0.6249311142951615, "grad_norm": 0.0370716397793875, "learning_rate": 0.00011119572847237272, "loss": 0.7524, "step": 2835 }, { "epoch": 0.626033285572578, "grad_norm": 0.04088692753580051, "learning_rate": 0.00011063845044226649, "loss": 0.7737, "step": 2840 }, { "epoch": 0.6271354568499945, "grad_norm": 0.04149954061127777, "learning_rate": 0.00011008175527370708, "loss": 0.7635, "step": 2845 }, { "epoch": 0.628237628127411, "grad_norm": 0.04067030099074359, "learning_rate": 0.00010952565121017595, "loss": 0.7781, "step": 2850 }, { "epoch": 0.6293397994048275, "grad_norm": 0.037995610342879724, "learning_rate": 0.00010897014648640164, "loss": 0.7536, "step": 2855 }, { "epoch": 0.6304419706822441, "grad_norm": 0.038638564159584964, "learning_rate": 0.0001084152493282378, "loss": 0.7582, "step": 2860 }, { "epoch": 0.6315441419596606, "grad_norm": 0.03865577853003443, "learning_rate": 0.00010786096795254105, "loss": 0.743, "step": 2865 }, { "epoch": 0.6326463132370771, "grad_norm": 0.037742667105078676, "learning_rate": 0.00010730731056704987, "loss": 0.7738, "step": 2870 }, { "epoch": 0.6337484845144935, "grad_norm": 0.04087500905979875, "learning_rate": 0.0001067542853702626, "loss": 0.7454, "step": 2875 }, { "epoch": 0.63485065579191, "grad_norm": 0.03681788140604627, "learning_rate": 0.00010620190055131628, "loss": 0.7513, "step": 2880 }, { "epoch": 0.6359528270693265, "grad_norm": 0.039530646965778786, "learning_rate": 0.00010565016428986515, "loss": 0.7863, "step": 2885 }, { "epoch": 0.6370549983467431, "grad_norm": 0.03637232048859645, "learning_rate": 0.00010509908475595984, "loss": 0.7871, "step": 2890 }, { "epoch": 0.6381571696241596, "grad_norm": 0.037532035474368244, "learning_rate": 0.0001045486701099262, "loss": 0.7868, "step": 2895 }, { "epoch": 0.6392593409015761, "grad_norm": 0.04139117865057032, "learning_rate": 0.0001039989285022445, "loss": 0.7544, "step": 2900 }, { "epoch": 0.6403615121789926, "grad_norm": 0.04037843881677972, "learning_rate": 0.00010344986807342866, "loss": 0.7612, "step": 2905 }, { "epoch": 0.6414636834564091, "grad_norm": 0.039565719386194985, "learning_rate": 0.00010290149695390581, "loss": 0.7616, "step": 2910 }, { "epoch": 0.6425658547338257, "grad_norm": 0.038310169105771584, "learning_rate": 0.00010235382326389586, "loss": 0.7576, "step": 2915 }, { "epoch": 0.6436680260112422, "grad_norm": 0.03936468865097429, "learning_rate": 0.00010180685511329131, "loss": 0.7702, "step": 2920 }, { "epoch": 0.6447701972886587, "grad_norm": 0.0398256871335756, "learning_rate": 0.00010126060060153713, "loss": 0.7822, "step": 2925 }, { "epoch": 0.6458723685660752, "grad_norm": 0.0413034275637383, "learning_rate": 0.00010071506781751063, "loss": 0.7542, "step": 2930 }, { "epoch": 0.6469745398434916, "grad_norm": 0.03871254029791003, "learning_rate": 0.000100170264839402, "loss": 0.7335, "step": 2935 }, { "epoch": 0.6480767111209081, "grad_norm": 0.036091901012243334, "learning_rate": 9.962619973459453e-05, "loss": 0.7748, "step": 2940 }, { "epoch": 0.6491788823983247, "grad_norm": 0.04138951063703848, "learning_rate": 9.90828805595452e-05, "loss": 0.7524, "step": 2945 }, { "epoch": 0.6502810536757412, "grad_norm": 0.03858848182433528, "learning_rate": 9.854031535966521e-05, "loss": 0.7517, "step": 2950 }, { "epoch": 0.6513832249531577, "grad_norm": 0.03602604340055366, "learning_rate": 9.799851216920107e-05, "loss": 0.7337, "step": 2955 }, { "epoch": 0.6524853962305742, "grad_norm": 0.03579532104123597, "learning_rate": 9.745747901111552e-05, "loss": 0.7623, "step": 2960 }, { "epoch": 0.6535875675079907, "grad_norm": 0.0381744125557122, "learning_rate": 9.691722389696879e-05, "loss": 0.7683, "step": 2965 }, { "epoch": 0.6546897387854073, "grad_norm": 0.03608442329560764, "learning_rate": 9.637775482679988e-05, "loss": 0.7732, "step": 2970 }, { "epoch": 0.6557919100628238, "grad_norm": 0.03784439256503652, "learning_rate": 9.583907978900807e-05, "loss": 0.7739, "step": 2975 }, { "epoch": 0.6568940813402403, "grad_norm": 0.03945771278463694, "learning_rate": 9.530120676023482e-05, "loss": 0.7442, "step": 2980 }, { "epoch": 0.6579962526176568, "grad_norm": 0.03667458666034089, "learning_rate": 9.476414370524538e-05, "loss": 0.7456, "step": 2985 }, { "epoch": 0.6590984238950733, "grad_norm": 0.03940939789339881, "learning_rate": 9.422789857681124e-05, "loss": 0.7438, "step": 2990 }, { "epoch": 0.6602005951724899, "grad_norm": 0.03803960926078932, "learning_rate": 9.36924793155918e-05, "loss": 0.778, "step": 2995 }, { "epoch": 0.6613027664499063, "grad_norm": 0.035231525831095054, "learning_rate": 9.315789385001738e-05, "loss": 0.7647, "step": 3000 }, { "epoch": 0.6624049377273228, "grad_norm": 0.03745699399316068, "learning_rate": 9.262415009617139e-05, "loss": 0.7684, "step": 3005 }, { "epoch": 0.6635071090047393, "grad_norm": 0.03586306355233504, "learning_rate": 9.209125595767336e-05, "loss": 0.7458, "step": 3010 }, { "epoch": 0.6646092802821558, "grad_norm": 0.036839355886467764, "learning_rate": 9.15592193255617e-05, "loss": 0.7706, "step": 3015 }, { "epoch": 0.6657114515595723, "grad_norm": 0.03906285720782755, "learning_rate": 9.102804807817699e-05, "loss": 0.781, "step": 3020 }, { "epoch": 0.6668136228369889, "grad_norm": 0.03869457663028768, "learning_rate": 9.049775008104542e-05, "loss": 0.7486, "step": 3025 }, { "epoch": 0.6679157941144054, "grad_norm": 0.0393845820755994, "learning_rate": 8.996833318676204e-05, "loss": 0.7473, "step": 3030 }, { "epoch": 0.6690179653918219, "grad_norm": 0.03941498428248022, "learning_rate": 8.943980523487469e-05, "loss": 0.7846, "step": 3035 }, { "epoch": 0.6701201366692384, "grad_norm": 0.0420251439088419, "learning_rate": 8.891217405176774e-05, "loss": 0.7608, "step": 3040 }, { "epoch": 0.6712223079466549, "grad_norm": 0.035649123267955884, "learning_rate": 8.838544745054645e-05, "loss": 0.7719, "step": 3045 }, { "epoch": 0.6723244792240715, "grad_norm": 0.03877525844159031, "learning_rate": 8.785963323092108e-05, "loss": 0.7582, "step": 3050 }, { "epoch": 0.673426650501488, "grad_norm": 0.037091463531239946, "learning_rate": 8.733473917909144e-05, "loss": 0.7411, "step": 3055 }, { "epoch": 0.6745288217789045, "grad_norm": 0.036271594287721816, "learning_rate": 8.68107730676315e-05, "loss": 0.7849, "step": 3060 }, { "epoch": 0.6756309930563209, "grad_norm": 0.03541699632121048, "learning_rate": 8.628774265537462e-05, "loss": 0.7514, "step": 3065 }, { "epoch": 0.6767331643337374, "grad_norm": 0.0362990364448018, "learning_rate": 8.576565568729813e-05, "loss": 0.7474, "step": 3070 }, { "epoch": 0.6778353356111539, "grad_norm": 0.03921802869726997, "learning_rate": 8.524451989440918e-05, "loss": 0.754, "step": 3075 }, { "epoch": 0.6789375068885705, "grad_norm": 0.037337016378149755, "learning_rate": 8.472434299362998e-05, "loss": 0.75, "step": 3080 }, { "epoch": 0.680039678165987, "grad_norm": 0.03891038958554527, "learning_rate": 8.420513268768347e-05, "loss": 0.7859, "step": 3085 }, { "epoch": 0.6811418494434035, "grad_norm": 0.035894066909538044, "learning_rate": 8.368689666497938e-05, "loss": 0.7329, "step": 3090 }, { "epoch": 0.68224402072082, "grad_norm": 0.039979769321902066, "learning_rate": 8.31696425995004e-05, "loss": 0.7503, "step": 3095 }, { "epoch": 0.6833461919982365, "grad_norm": 0.03936168369937806, "learning_rate": 8.26533781506887e-05, "loss": 0.7726, "step": 3100 }, { "epoch": 0.6844483632756531, "grad_norm": 0.04019560215837221, "learning_rate": 8.21381109633318e-05, "loss": 0.7432, "step": 3105 }, { "epoch": 0.6855505345530696, "grad_norm": 0.039269909994462844, "learning_rate": 8.162384866745036e-05, "loss": 0.7538, "step": 3110 }, { "epoch": 0.6866527058304861, "grad_norm": 0.03904037612780349, "learning_rate": 8.111059887818459e-05, "loss": 0.744, "step": 3115 }, { "epoch": 0.6877548771079026, "grad_norm": 0.03885534367589714, "learning_rate": 8.059836919568152e-05, "loss": 0.7328, "step": 3120 }, { "epoch": 0.688857048385319, "grad_norm": 0.03858452994442185, "learning_rate": 8.008716720498253e-05, "loss": 0.7701, "step": 3125 }, { "epoch": 0.6899592196627355, "grad_norm": 0.03519383464662896, "learning_rate": 7.957700047591121e-05, "loss": 0.7451, "step": 3130 }, { "epoch": 0.6910613909401521, "grad_norm": 0.0380259068615794, "learning_rate": 7.906787656296107e-05, "loss": 0.7556, "step": 3135 }, { "epoch": 0.6921635622175686, "grad_norm": 0.038944765250869484, "learning_rate": 7.855980300518354e-05, "loss": 0.7389, "step": 3140 }, { "epoch": 0.6932657334949851, "grad_norm": 0.04470561682300718, "learning_rate": 7.805278732607678e-05, "loss": 0.7568, "step": 3145 }, { "epoch": 0.6943679047724016, "grad_norm": 0.04297680477477516, "learning_rate": 7.754683703347372e-05, "loss": 0.7626, "step": 3150 }, { "epoch": 0.6954700760498181, "grad_norm": 0.034697565387832634, "learning_rate": 7.704195961943129e-05, "loss": 0.7721, "step": 3155 }, { "epoch": 0.6965722473272347, "grad_norm": 0.04072535239631796, "learning_rate": 7.653816256011941e-05, "loss": 0.7757, "step": 3160 }, { "epoch": 0.6976744186046512, "grad_norm": 0.035271346752025576, "learning_rate": 7.603545331571018e-05, "loss": 0.7629, "step": 3165 }, { "epoch": 0.6987765898820677, "grad_norm": 0.037357663884374157, "learning_rate": 7.553383933026741e-05, "loss": 0.7549, "step": 3170 }, { "epoch": 0.6998787611594842, "grad_norm": 0.03637211308219045, "learning_rate": 7.503332803163641e-05, "loss": 0.7529, "step": 3175 }, { "epoch": 0.7009809324369007, "grad_norm": 0.039519222558093065, "learning_rate": 7.453392683133415e-05, "loss": 0.7879, "step": 3180 }, { "epoch": 0.7020831037143173, "grad_norm": 0.03323807896911395, "learning_rate": 7.403564312443932e-05, "loss": 0.7189, "step": 3185 }, { "epoch": 0.7031852749917337, "grad_norm": 0.03896304992954701, "learning_rate": 7.353848428948288e-05, "loss": 0.732, "step": 3190 }, { "epoch": 0.7042874462691502, "grad_norm": 0.03533752667617695, "learning_rate": 7.304245768833872e-05, "loss": 0.7499, "step": 3195 }, { "epoch": 0.7053896175465667, "grad_norm": 0.04289239640414403, "learning_rate": 7.25475706661149e-05, "loss": 0.7518, "step": 3200 }, { "epoch": 0.7064917888239832, "grad_norm": 0.036700606939269846, "learning_rate": 7.20538305510447e-05, "loss": 0.7444, "step": 3205 }, { "epoch": 0.7075939601013997, "grad_norm": 0.039301039880959406, "learning_rate": 7.156124465437799e-05, "loss": 0.7647, "step": 3210 }, { "epoch": 0.7086961313788163, "grad_norm": 0.039132429979466775, "learning_rate": 7.106982027027314e-05, "loss": 0.7464, "step": 3215 }, { "epoch": 0.7097983026562328, "grad_norm": 0.03668166024260441, "learning_rate": 7.057956467568913e-05, "loss": 0.768, "step": 3220 }, { "epoch": 0.7109004739336493, "grad_norm": 0.03763349214718496, "learning_rate": 7.009048513027738e-05, "loss": 0.7627, "step": 3225 }, { "epoch": 0.7120026452110658, "grad_norm": 0.03910255645252377, "learning_rate": 6.960258887627474e-05, "loss": 0.7393, "step": 3230 }, { "epoch": 0.7131048164884823, "grad_norm": 0.03847225495364757, "learning_rate": 6.911588313839579e-05, "loss": 0.758, "step": 3235 }, { "epoch": 0.7142069877658989, "grad_norm": 0.038410315616110316, "learning_rate": 6.86303751237263e-05, "loss": 0.7385, "step": 3240 }, { "epoch": 0.7153091590433154, "grad_norm": 0.038761774380026405, "learning_rate": 6.814607202161606e-05, "loss": 0.7382, "step": 3245 }, { "epoch": 0.7164113303207318, "grad_norm": 0.03723197930881741, "learning_rate": 6.766298100357281e-05, "loss": 0.7359, "step": 3250 }, { "epoch": 0.7175135015981483, "grad_norm": 0.040413685922780995, "learning_rate": 6.718110922315593e-05, "loss": 0.7342, "step": 3255 }, { "epoch": 0.7186156728755648, "grad_norm": 0.03978308906273803, "learning_rate": 6.670046381587016e-05, "loss": 0.7645, "step": 3260 }, { "epoch": 0.7197178441529813, "grad_norm": 0.03897559054869522, "learning_rate": 6.622105189906052e-05, "loss": 0.7455, "step": 3265 }, { "epoch": 0.7208200154303979, "grad_norm": 0.03815718519490893, "learning_rate": 6.574288057180663e-05, "loss": 0.7615, "step": 3270 }, { "epoch": 0.7219221867078144, "grad_norm": 0.03856332628344952, "learning_rate": 6.526595691481746e-05, "loss": 0.7599, "step": 3275 }, { "epoch": 0.7230243579852309, "grad_norm": 0.037326383736852486, "learning_rate": 6.479028799032664e-05, "loss": 0.7727, "step": 3280 }, { "epoch": 0.7241265292626474, "grad_norm": 0.03759649917895476, "learning_rate": 6.431588084198791e-05, "loss": 0.733, "step": 3285 }, { "epoch": 0.7252287005400639, "grad_norm": 0.0351188278300472, "learning_rate": 6.384274249477086e-05, "loss": 0.7603, "step": 3290 }, { "epoch": 0.7263308718174805, "grad_norm": 0.0387440053943191, "learning_rate": 6.337087995485658e-05, "loss": 0.7401, "step": 3295 }, { "epoch": 0.727433043094897, "grad_norm": 0.03680120173341686, "learning_rate": 6.290030020953423e-05, "loss": 0.7811, "step": 3300 }, { "epoch": 0.7285352143723135, "grad_norm": 0.037694080907078036, "learning_rate": 6.243101022709761e-05, "loss": 0.7279, "step": 3305 }, { "epoch": 0.72963738564973, "grad_norm": 0.04067783323042442, "learning_rate": 6.196301695674176e-05, "loss": 0.7827, "step": 3310 }, { "epoch": 0.7307395569271464, "grad_norm": 0.038537648221183, "learning_rate": 6.14963273284601e-05, "loss": 0.7586, "step": 3315 }, { "epoch": 0.731841728204563, "grad_norm": 0.038447173395684923, "learning_rate": 6.1030948252941985e-05, "loss": 0.7599, "step": 3320 }, { "epoch": 0.7329438994819795, "grad_norm": 0.037364125258692316, "learning_rate": 6.056688662147012e-05, "loss": 0.7546, "step": 3325 }, { "epoch": 0.734046070759396, "grad_norm": 0.03886268408011641, "learning_rate": 6.010414930581866e-05, "loss": 0.7451, "step": 3330 }, { "epoch": 0.7351482420368125, "grad_norm": 0.03769869242431956, "learning_rate": 5.96427431581515e-05, "loss": 0.768, "step": 3335 }, { "epoch": 0.736250413314229, "grad_norm": 0.037020275271875513, "learning_rate": 5.918267501092078e-05, "loss": 0.7392, "step": 3340 }, { "epoch": 0.7373525845916455, "grad_norm": 0.03729781683672499, "learning_rate": 5.872395167676555e-05, "loss": 0.7541, "step": 3345 }, { "epoch": 0.7384547558690621, "grad_norm": 0.03769698709111463, "learning_rate": 5.826657994841104e-05, "loss": 0.7464, "step": 3350 }, { "epoch": 0.7395569271464786, "grad_norm": 0.035749490646957455, "learning_rate": 5.78105665985681e-05, "loss": 0.783, "step": 3355 }, { "epoch": 0.7406590984238951, "grad_norm": 0.03815712387980432, "learning_rate": 5.7355918379832925e-05, "loss": 0.7415, "step": 3360 }, { "epoch": 0.7417612697013116, "grad_norm": 0.03756149653556473, "learning_rate": 5.690264202458685e-05, "loss": 0.7754, "step": 3365 }, { "epoch": 0.7428634409787281, "grad_norm": 0.03958578609528177, "learning_rate": 5.64507442448968e-05, "loss": 0.7835, "step": 3370 }, { "epoch": 0.7439656122561447, "grad_norm": 0.038064835951232556, "learning_rate": 5.6000231732416045e-05, "loss": 0.7938, "step": 3375 }, { "epoch": 0.7450677835335611, "grad_norm": 0.0371943932393074, "learning_rate": 5.555111115828492e-05, "loss": 0.7406, "step": 3380 }, { "epoch": 0.7461699548109776, "grad_norm": 0.03776336387841464, "learning_rate": 5.510338917303204e-05, "loss": 0.7459, "step": 3385 }, { "epoch": 0.7472721260883941, "grad_norm": 0.04142505105864296, "learning_rate": 5.4657072406475816e-05, "loss": 0.7419, "step": 3390 }, { "epoch": 0.7483742973658106, "grad_norm": 0.03664875530168412, "learning_rate": 5.421216746762651e-05, "loss": 0.7701, "step": 3395 }, { "epoch": 0.7494764686432271, "grad_norm": 0.03963080933205579, "learning_rate": 5.3768680944588006e-05, "loss": 0.7449, "step": 3400 }, { "epoch": 0.7505786399206437, "grad_norm": 0.03875301352382599, "learning_rate": 5.3326619404460594e-05, "loss": 0.7512, "step": 3405 }, { "epoch": 0.7516808111980602, "grad_norm": 0.03812435605779621, "learning_rate": 5.2885989393243446e-05, "loss": 0.7524, "step": 3410 }, { "epoch": 0.7527829824754767, "grad_norm": 0.036201552008782494, "learning_rate": 5.244679743573793e-05, "loss": 0.7313, "step": 3415 }, { "epoch": 0.7538851537528932, "grad_norm": 0.03508489489495532, "learning_rate": 5.200905003545072e-05, "loss": 0.7143, "step": 3420 }, { "epoch": 0.7549873250303097, "grad_norm": 0.03873819352052367, "learning_rate": 5.1572753674497784e-05, "loss": 0.7262, "step": 3425 }, { "epoch": 0.7560894963077263, "grad_norm": 0.03878661096421005, "learning_rate": 5.11379148135083e-05, "loss": 0.7388, "step": 3430 }, { "epoch": 0.7571916675851428, "grad_norm": 0.03624350442263521, "learning_rate": 5.070453989152865e-05, "loss": 0.7516, "step": 3435 }, { "epoch": 0.7582938388625592, "grad_norm": 0.03462784231516503, "learning_rate": 5.0272635325927666e-05, "loss": 0.735, "step": 3440 }, { "epoch": 0.7593960101399757, "grad_norm": 0.03546841031831082, "learning_rate": 4.9842207512301255e-05, "loss": 0.7688, "step": 3445 }, { "epoch": 0.7604981814173922, "grad_norm": 0.03765881036086525, "learning_rate": 4.941326282437765e-05, "loss": 0.7584, "step": 3450 }, { "epoch": 0.7616003526948087, "grad_norm": 0.04070540653962422, "learning_rate": 4.8985807613923084e-05, "loss": 0.7658, "step": 3455 }, { "epoch": 0.7627025239722253, "grad_norm": 0.041025307893189714, "learning_rate": 4.855984821064789e-05, "loss": 0.753, "step": 3460 }, { "epoch": 0.7638046952496418, "grad_norm": 0.03747182722869465, "learning_rate": 4.8135390922112687e-05, "loss": 0.7481, "step": 3465 }, { "epoch": 0.7649068665270583, "grad_norm": 0.03475376097595749, "learning_rate": 4.771244203363478e-05, "loss": 0.7322, "step": 3470 }, { "epoch": 0.7660090378044748, "grad_norm": 0.03620242697594977, "learning_rate": 4.72910078081953e-05, "loss": 0.7289, "step": 3475 }, { "epoch": 0.7671112090818913, "grad_norm": 0.039201952070474604, "learning_rate": 4.687109448634647e-05, "loss": 0.7663, "step": 3480 }, { "epoch": 0.7682133803593079, "grad_norm": 0.038508731501384584, "learning_rate": 4.6452708286119176e-05, "loss": 0.7554, "step": 3485 }, { "epoch": 0.7693155516367244, "grad_norm": 0.03899698694328063, "learning_rate": 4.603585540293071e-05, "loss": 0.7736, "step": 3490 }, { "epoch": 0.7704177229141409, "grad_norm": 0.0368565333958254, "learning_rate": 4.5620542009493304e-05, "loss": 0.7516, "step": 3495 }, { "epoch": 0.7715198941915574, "grad_norm": 0.035388497953352936, "learning_rate": 4.5206774255722504e-05, "loss": 0.7484, "step": 3500 }, { "epoch": 0.7726220654689738, "grad_norm": 0.03538316494242759, "learning_rate": 4.4794558268646194e-05, "loss": 0.7581, "step": 3505 }, { "epoch": 0.7737242367463903, "grad_norm": 0.037362884464824934, "learning_rate": 4.4383900152313926e-05, "loss": 0.7459, "step": 3510 }, { "epoch": 0.7748264080238069, "grad_norm": 0.036038446641414534, "learning_rate": 4.397480598770652e-05, "loss": 0.7606, "step": 3515 }, { "epoch": 0.7759285793012234, "grad_norm": 0.0402761096628342, "learning_rate": 4.3567281832645815e-05, "loss": 0.7813, "step": 3520 }, { "epoch": 0.7770307505786399, "grad_norm": 0.03506614642106647, "learning_rate": 4.3161333721705146e-05, "loss": 0.7303, "step": 3525 }, { "epoch": 0.7781329218560564, "grad_norm": 0.03710237528152325, "learning_rate": 4.275696766612007e-05, "loss": 0.7658, "step": 3530 }, { "epoch": 0.779235093133473, "grad_norm": 0.039207961974188736, "learning_rate": 4.2354189653699234e-05, "loss": 0.7686, "step": 3535 }, { "epoch": 0.7803372644108895, "grad_norm": 0.0400226786429818, "learning_rate": 4.1953005648735606e-05, "loss": 0.7365, "step": 3540 }, { "epoch": 0.781439435688306, "grad_norm": 0.038069210231566904, "learning_rate": 4.1553421591918264e-05, "loss": 0.7612, "step": 3545 }, { "epoch": 0.7825416069657225, "grad_norm": 0.036731650972072025, "learning_rate": 4.115544340024456e-05, "loss": 0.7276, "step": 3550 }, { "epoch": 0.783643778243139, "grad_norm": 0.03761683304943094, "learning_rate": 4.075907696693224e-05, "loss": 0.7397, "step": 3555 }, { "epoch": 0.7847459495205555, "grad_norm": 0.039130062081128986, "learning_rate": 4.036432816133241e-05, "loss": 0.7412, "step": 3560 }, { "epoch": 0.785848120797972, "grad_norm": 0.03725082169003722, "learning_rate": 3.99712028288424e-05, "loss": 0.7378, "step": 3565 }, { "epoch": 0.7869502920753885, "grad_norm": 0.03581598167403878, "learning_rate": 3.957970679081948e-05, "loss": 0.7377, "step": 3570 }, { "epoch": 0.788052463352805, "grad_norm": 0.036766846530443355, "learning_rate": 3.918984584449435e-05, "loss": 0.7606, "step": 3575 }, { "epoch": 0.7891546346302215, "grad_norm": 0.03708226420234272, "learning_rate": 3.880162576288557e-05, "loss": 0.763, "step": 3580 }, { "epoch": 0.790256805907638, "grad_norm": 0.035646641087147025, "learning_rate": 3.841505229471386e-05, "loss": 0.7472, "step": 3585 }, { "epoch": 0.7913589771850545, "grad_norm": 0.03623704020179618, "learning_rate": 3.803013116431716e-05, "loss": 0.7371, "step": 3590 }, { "epoch": 0.7924611484624711, "grad_norm": 0.03685938607213482, "learning_rate": 3.764686807156565e-05, "loss": 0.7636, "step": 3595 }, { "epoch": 0.7935633197398876, "grad_norm": 0.03581401378985991, "learning_rate": 3.72652686917776e-05, "loss": 0.7436, "step": 3600 }, { "epoch": 0.7946654910173041, "grad_norm": 0.03576091311918202, "learning_rate": 3.6885338675635215e-05, "loss": 0.741, "step": 3605 }, { "epoch": 0.7957676622947206, "grad_norm": 0.03938541436587627, "learning_rate": 3.65070836491007e-05, "loss": 0.7511, "step": 3610 }, { "epoch": 0.7968698335721371, "grad_norm": 0.03831985268675037, "learning_rate": 3.613050921333345e-05, "loss": 0.7581, "step": 3615 }, { "epoch": 0.7979720048495537, "grad_norm": 0.036183263282557804, "learning_rate": 3.575562094460682e-05, "loss": 0.7519, "step": 3620 }, { "epoch": 0.7990741761269702, "grad_norm": 0.039441336486759127, "learning_rate": 3.5382424394225506e-05, "loss": 0.7566, "step": 3625 }, { "epoch": 0.8001763474043866, "grad_norm": 0.03791319055471918, "learning_rate": 3.501092508844339e-05, "loss": 0.7483, "step": 3630 }, { "epoch": 0.8012785186818031, "grad_norm": 0.034917608244421146, "learning_rate": 3.464112852838184e-05, "loss": 0.7434, "step": 3635 }, { "epoch": 0.8023806899592196, "grad_norm": 0.03606780306247915, "learning_rate": 3.427304018994821e-05, "loss": 0.7478, "step": 3640 }, { "epoch": 0.8034828612366361, "grad_norm": 0.03680798923538717, "learning_rate": 3.3906665523754504e-05, "loss": 0.7496, "step": 3645 }, { "epoch": 0.8045850325140527, "grad_norm": 0.03865353271265747, "learning_rate": 3.354200995503692e-05, "loss": 0.7397, "step": 3650 }, { "epoch": 0.8056872037914692, "grad_norm": 0.03635669491983459, "learning_rate": 3.3179078883575536e-05, "loss": 0.7718, "step": 3655 }, { "epoch": 0.8067893750688857, "grad_norm": 0.0363260735871896, "learning_rate": 3.2817877683614244e-05, "loss": 0.7209, "step": 3660 }, { "epoch": 0.8078915463463022, "grad_norm": 0.03739024777521627, "learning_rate": 3.245841170378106e-05, "loss": 0.7276, "step": 3665 }, { "epoch": 0.8089937176237187, "grad_norm": 0.04056944004711678, "learning_rate": 3.21006862670092e-05, "loss": 0.7427, "step": 3670 }, { "epoch": 0.8100958889011353, "grad_norm": 0.038893468288220996, "learning_rate": 3.174470667045801e-05, "loss": 0.7337, "step": 3675 }, { "epoch": 0.8111980601785518, "grad_norm": 0.037502075980572286, "learning_rate": 3.139047818543462e-05, "loss": 0.7536, "step": 3680 }, { "epoch": 0.8123002314559683, "grad_norm": 0.03650106907146565, "learning_rate": 3.103800605731598e-05, "loss": 0.7533, "step": 3685 }, { "epoch": 0.8134024027333848, "grad_norm": 0.03819525064272321, "learning_rate": 3.068729550547105e-05, "loss": 0.7681, "step": 3690 }, { "epoch": 0.8145045740108012, "grad_norm": 0.03711022658424072, "learning_rate": 3.033835172318355e-05, "loss": 0.7449, "step": 3695 }, { "epoch": 0.8156067452882177, "grad_norm": 0.03418347940782499, "learning_rate": 2.9991179877575032e-05, "loss": 0.7393, "step": 3700 }, { "epoch": 0.8167089165656343, "grad_norm": 0.03626371876045612, "learning_rate": 2.964578510952847e-05, "loss": 0.7371, "step": 3705 }, { "epoch": 0.8178110878430508, "grad_norm": 0.03878297701004356, "learning_rate": 2.9302172533612077e-05, "loss": 0.747, "step": 3710 }, { "epoch": 0.8189132591204673, "grad_norm": 0.038112632260334955, "learning_rate": 2.8960347238003488e-05, "loss": 0.7579, "step": 3715 }, { "epoch": 0.8200154303978838, "grad_norm": 0.03926882455039221, "learning_rate": 2.8620314284414486e-05, "loss": 0.7529, "step": 3720 }, { "epoch": 0.8211176016753003, "grad_norm": 0.03648970703950733, "learning_rate": 2.8282078708016163e-05, "loss": 0.7473, "step": 3725 }, { "epoch": 0.8222197729527169, "grad_norm": 0.03630414782533231, "learning_rate": 2.7945645517364064e-05, "loss": 0.7355, "step": 3730 }, { "epoch": 0.8233219442301334, "grad_norm": 0.036468505132406466, "learning_rate": 2.7611019694324415e-05, "loss": 0.7101, "step": 3735 }, { "epoch": 0.8244241155075499, "grad_norm": 0.037520748694235606, "learning_rate": 2.727820619399992e-05, "loss": 0.7431, "step": 3740 }, { "epoch": 0.8255262867849664, "grad_norm": 0.03483967252567691, "learning_rate": 2.6947209944656784e-05, "loss": 0.7008, "step": 3745 }, { "epoch": 0.8266284580623829, "grad_norm": 0.03683830824123343, "learning_rate": 2.661803584765143e-05, "loss": 0.7397, "step": 3750 }, { "epoch": 0.8277306293397994, "grad_norm": 0.037699100377305624, "learning_rate": 2.6290688777358164e-05, "loss": 0.7663, "step": 3755 }, { "epoch": 0.8288328006172159, "grad_norm": 0.04181925311652994, "learning_rate": 2.5965173581096748e-05, "loss": 0.7553, "step": 3760 }, { "epoch": 0.8299349718946324, "grad_norm": 0.03779442944677197, "learning_rate": 2.564149507906089e-05, "loss": 0.7589, "step": 3765 }, { "epoch": 0.8310371431720489, "grad_norm": 0.03577568203273454, "learning_rate": 2.5319658064246595e-05, "loss": 0.7446, "step": 3770 }, { "epoch": 0.8321393144494654, "grad_norm": 0.03931081842755508, "learning_rate": 2.4999667302381404e-05, "loss": 0.751, "step": 3775 }, { "epoch": 0.833241485726882, "grad_norm": 0.03745824041736122, "learning_rate": 2.4681527531853835e-05, "loss": 0.7123, "step": 3780 }, { "epoch": 0.8343436570042985, "grad_norm": 0.03527193664557951, "learning_rate": 2.436524346364286e-05, "loss": 0.7025, "step": 3785 }, { "epoch": 0.835445828281715, "grad_norm": 0.03606947650450607, "learning_rate": 2.4050819781248647e-05, "loss": 0.7206, "step": 3790 }, { "epoch": 0.8365479995591315, "grad_norm": 0.035888622608410504, "learning_rate": 2.373826114062296e-05, "loss": 0.7537, "step": 3795 }, { "epoch": 0.837650170836548, "grad_norm": 0.036850404718823324, "learning_rate": 2.3427572170100112e-05, "loss": 0.7638, "step": 3800 }, { "epoch": 0.8387523421139645, "grad_norm": 0.037724339996222885, "learning_rate": 2.311875747032858e-05, "loss": 0.7557, "step": 3805 }, { "epoch": 0.8398545133913811, "grad_norm": 0.03518696843854021, "learning_rate": 2.2811821614202897e-05, "loss": 0.7602, "step": 3810 }, { "epoch": 0.8409566846687976, "grad_norm": 0.03634301372317722, "learning_rate": 2.2506769146795893e-05, "loss": 0.7427, "step": 3815 }, { "epoch": 0.842058855946214, "grad_norm": 0.03565812651945666, "learning_rate": 2.2203604585291303e-05, "loss": 0.7336, "step": 3820 }, { "epoch": 0.8431610272236305, "grad_norm": 0.03768119047665601, "learning_rate": 2.1902332418916956e-05, "loss": 0.7661, "step": 3825 }, { "epoch": 0.844263198501047, "grad_norm": 0.035791641873146804, "learning_rate": 2.1602957108878434e-05, "loss": 0.7589, "step": 3830 }, { "epoch": 0.8453653697784635, "grad_norm": 0.03888367446787956, "learning_rate": 2.130548308829267e-05, "loss": 0.7395, "step": 3835 }, { "epoch": 0.8464675410558801, "grad_norm": 0.03780624024053704, "learning_rate": 2.1009914762122694e-05, "loss": 0.7324, "step": 3840 }, { "epoch": 0.8475697123332966, "grad_norm": 0.03701658755711745, "learning_rate": 2.071625650711217e-05, "loss": 0.7261, "step": 3845 }, { "epoch": 0.8486718836107131, "grad_norm": 0.03349203678484499, "learning_rate": 2.0424512671720566e-05, "loss": 0.7285, "step": 3850 }, { "epoch": 0.8497740548881296, "grad_norm": 0.035807885382574296, "learning_rate": 2.0134687576058878e-05, "loss": 0.7513, "step": 3855 }, { "epoch": 0.8508762261655461, "grad_norm": 0.03540300293328915, "learning_rate": 1.9846785511825618e-05, "loss": 0.7506, "step": 3860 }, { "epoch": 0.8519783974429627, "grad_norm": 0.036548562423081375, "learning_rate": 1.9560810742243298e-05, "loss": 0.7486, "step": 3865 }, { "epoch": 0.8530805687203792, "grad_norm": 0.03688805027139149, "learning_rate": 1.9276767501995206e-05, "loss": 0.756, "step": 3870 }, { "epoch": 0.8541827399977957, "grad_norm": 0.03511921343987327, "learning_rate": 1.8994659997162687e-05, "loss": 0.7188, "step": 3875 }, { "epoch": 0.8552849112752122, "grad_norm": 0.03593480104178096, "learning_rate": 1.8714492405163072e-05, "loss": 0.7241, "step": 3880 }, { "epoch": 0.8563870825526286, "grad_norm": 0.03912290065785617, "learning_rate": 1.843626887468764e-05, "loss": 0.735, "step": 3885 }, { "epoch": 0.8574892538300452, "grad_norm": 0.036540249139284824, "learning_rate": 1.8159993525640115e-05, "loss": 0.7629, "step": 3890 }, { "epoch": 0.8585914251074617, "grad_norm": 0.03586116680339337, "learning_rate": 1.788567044907585e-05, "loss": 0.728, "step": 3895 }, { "epoch": 0.8596935963848782, "grad_norm": 0.04108515819021104, "learning_rate": 1.7613303707141164e-05, "loss": 0.7544, "step": 3900 }, { "epoch": 0.8607957676622947, "grad_norm": 0.04154327787956279, "learning_rate": 1.7342897333013112e-05, "loss": 0.715, "step": 3905 }, { "epoch": 0.8618979389397112, "grad_norm": 0.0375147580423421, "learning_rate": 1.7074455330839943e-05, "loss": 0.7325, "step": 3910 }, { "epoch": 0.8630001102171277, "grad_norm": 0.03505267883405521, "learning_rate": 1.6807981675681587e-05, "loss": 0.7463, "step": 3915 }, { "epoch": 0.8641022814945443, "grad_norm": 0.035319705305492576, "learning_rate": 1.654348031345104e-05, "loss": 0.7225, "step": 3920 }, { "epoch": 0.8652044527719608, "grad_norm": 0.038996518100504834, "learning_rate": 1.6280955160855628e-05, "loss": 0.7537, "step": 3925 }, { "epoch": 0.8663066240493773, "grad_norm": 0.03826307362758949, "learning_rate": 1.602041010533934e-05, "loss": 0.7287, "step": 3930 }, { "epoch": 0.8674087953267938, "grad_norm": 0.03711543681709775, "learning_rate": 1.5761849005024985e-05, "loss": 0.7709, "step": 3935 }, { "epoch": 0.8685109666042103, "grad_norm": 0.03886269645051025, "learning_rate": 1.5505275688657275e-05, "loss": 0.733, "step": 3940 }, { "epoch": 0.8696131378816268, "grad_norm": 0.039368459771957895, "learning_rate": 1.5250693955545929e-05, "loss": 0.7377, "step": 3945 }, { "epoch": 0.8707153091590433, "grad_norm": 0.03542501994719629, "learning_rate": 1.4998107575509633e-05, "loss": 0.7509, "step": 3950 }, { "epoch": 0.8718174804364598, "grad_norm": 0.03662399917022349, "learning_rate": 1.4747520288820014e-05, "loss": 0.7221, "step": 3955 }, { "epoch": 0.8729196517138763, "grad_norm": 0.03657298741205027, "learning_rate": 1.449893580614636e-05, "loss": 0.7497, "step": 3960 }, { "epoch": 0.8740218229912928, "grad_norm": 0.03781150680434851, "learning_rate": 1.425235780850067e-05, "loss": 0.7582, "step": 3965 }, { "epoch": 0.8751239942687093, "grad_norm": 0.04032655125637331, "learning_rate": 1.4007789947183168e-05, "loss": 0.7447, "step": 3970 }, { "epoch": 0.8762261655461259, "grad_norm": 0.03589883470863497, "learning_rate": 1.3765235843728129e-05, "loss": 0.7276, "step": 3975 }, { "epoch": 0.8773283368235424, "grad_norm": 0.03717658884234944, "learning_rate": 1.3524699089850328e-05, "loss": 0.7401, "step": 3980 }, { "epoch": 0.8784305081009589, "grad_norm": 0.03497351776047112, "learning_rate": 1.3286183247391868e-05, "loss": 0.7392, "step": 3985 }, { "epoch": 0.8795326793783754, "grad_norm": 0.03680345383865433, "learning_rate": 1.3049691848269461e-05, "loss": 0.7397, "step": 3990 }, { "epoch": 0.880634850655792, "grad_norm": 0.037067904261301174, "learning_rate": 1.2815228394421995e-05, "loss": 0.7543, "step": 3995 }, { "epoch": 0.8817370219332085, "grad_norm": 0.03635951971560963, "learning_rate": 1.2582796357758829e-05, "loss": 0.7268, "step": 4000 }, { "epoch": 0.882839193210625, "grad_norm": 0.037149225593830666, "learning_rate": 1.2352399180108286e-05, "loss": 0.7447, "step": 4005 }, { "epoch": 0.8839413644880414, "grad_norm": 0.034848783140386315, "learning_rate": 1.2124040273166691e-05, "loss": 0.7311, "step": 4010 }, { "epoch": 0.8850435357654579, "grad_norm": 0.03985554439292569, "learning_rate": 1.1897723018447946e-05, "loss": 0.7288, "step": 4015 }, { "epoch": 0.8861457070428744, "grad_norm": 0.035706356323576875, "learning_rate": 1.1673450767233388e-05, "loss": 0.7326, "step": 4020 }, { "epoch": 0.887247878320291, "grad_norm": 0.036740845098738074, "learning_rate": 1.1451226840522077e-05, "loss": 0.7496, "step": 4025 }, { "epoch": 0.8883500495977075, "grad_norm": 0.037031961816322526, "learning_rate": 1.1231054528981765e-05, "loss": 0.7524, "step": 4030 }, { "epoch": 0.889452220875124, "grad_norm": 0.03756810225346114, "learning_rate": 1.1012937092900126e-05, "loss": 0.7312, "step": 4035 }, { "epoch": 0.8905543921525405, "grad_norm": 0.037472375253416255, "learning_rate": 1.0796877762136458e-05, "loss": 0.7544, "step": 4040 }, { "epoch": 0.891656563429957, "grad_norm": 0.0359317417796455, "learning_rate": 1.0582879736073819e-05, "loss": 0.7354, "step": 4045 }, { "epoch": 0.8927587347073735, "grad_norm": 0.03749781446172768, "learning_rate": 1.03709461835717e-05, "loss": 0.7546, "step": 4050 }, { "epoch": 0.8938609059847901, "grad_norm": 0.03733411968021827, "learning_rate": 1.0161080242919129e-05, "loss": 0.7259, "step": 4055 }, { "epoch": 0.8949630772622066, "grad_norm": 0.03761288676599098, "learning_rate": 9.953285021788143e-06, "loss": 0.7489, "step": 4060 }, { "epoch": 0.8960652485396231, "grad_norm": 0.03412569507517913, "learning_rate": 9.747563597187791e-06, "loss": 0.7286, "step": 4065 }, { "epoch": 0.8971674198170395, "grad_norm": 0.03525939402552114, "learning_rate": 9.543919015418516e-06, "loss": 0.7513, "step": 4070 }, { "epoch": 0.898269591094456, "grad_norm": 0.03720593815884494, "learning_rate": 9.342354292027215e-06, "loss": 0.7474, "step": 4075 }, { "epoch": 0.8993717623718726, "grad_norm": 0.0375416497695239, "learning_rate": 9.142872411762354e-06, "loss": 0.7685, "step": 4080 }, { "epoch": 0.9004739336492891, "grad_norm": 0.03906878754343892, "learning_rate": 8.945476328529949e-06, "loss": 0.732, "step": 4085 }, { "epoch": 0.9015761049267056, "grad_norm": 0.035056491589514995, "learning_rate": 8.750168965349713e-06, "loss": 0.7436, "step": 4090 }, { "epoch": 0.9026782762041221, "grad_norm": 0.03470624619912502, "learning_rate": 8.556953214311896e-06, "loss": 0.6928, "step": 4095 }, { "epoch": 0.9037804474815386, "grad_norm": 0.03557139688126229, "learning_rate": 8.365831936534289e-06, "loss": 0.7236, "step": 4100 }, { "epoch": 0.9048826187589551, "grad_norm": 0.03860114842216224, "learning_rate": 8.17680796212003e-06, "loss": 0.7367, "step": 4105 }, { "epoch": 0.9059847900363717, "grad_norm": 0.0358454686914662, "learning_rate": 7.989884090115579e-06, "loss": 0.7393, "step": 4110 }, { "epoch": 0.9070869613137882, "grad_norm": 0.036131468273917416, "learning_rate": 7.80506308846927e-06, "loss": 0.7187, "step": 4115 }, { "epoch": 0.9081891325912047, "grad_norm": 0.03431907826116676, "learning_rate": 7.622347693990438e-06, "loss": 0.7368, "step": 4120 }, { "epoch": 0.9092913038686212, "grad_norm": 0.036378482006826245, "learning_rate": 7.4417406123088e-06, "loss": 0.7123, "step": 4125 }, { "epoch": 0.9103934751460377, "grad_norm": 0.03610068662140505, "learning_rate": 7.263244517834365e-06, "loss": 0.7298, "step": 4130 }, { "epoch": 0.9114956464234542, "grad_norm": 0.035675737631172474, "learning_rate": 7.086862053717867e-06, "loss": 0.7329, "step": 4135 }, { "epoch": 0.9125978177008707, "grad_norm": 0.03488018397228746, "learning_rate": 6.91259583181169e-06, "loss": 0.7459, "step": 4140 }, { "epoch": 0.9136999889782872, "grad_norm": 0.037711098080699654, "learning_rate": 6.740448432631118e-06, "loss": 0.7456, "step": 4145 }, { "epoch": 0.9148021602557037, "grad_norm": 0.03683158716361964, "learning_rate": 6.570422405316117e-06, "loss": 0.7477, "step": 4150 }, { "epoch": 0.9159043315331202, "grad_norm": 0.03722178270525642, "learning_rate": 6.4025202675935635e-06, "loss": 0.7668, "step": 4155 }, { "epoch": 0.9170065028105367, "grad_norm": 0.03548538866880862, "learning_rate": 6.236744505740126e-06, "loss": 0.7612, "step": 4160 }, { "epoch": 0.9181086740879533, "grad_norm": 0.03573889132178891, "learning_rate": 6.073097574545244e-06, "loss": 0.7374, "step": 4165 }, { "epoch": 0.9192108453653698, "grad_norm": 0.03490127726620679, "learning_rate": 5.91158189727487e-06, "loss": 0.7233, "step": 4170 }, { "epoch": 0.9203130166427863, "grad_norm": 0.04015606965771483, "learning_rate": 5.752199865635604e-06, "loss": 0.7356, "step": 4175 }, { "epoch": 0.9214151879202028, "grad_norm": 0.038889913565105266, "learning_rate": 5.594953839739252e-06, "loss": 0.7571, "step": 4180 }, { "epoch": 0.9225173591976193, "grad_norm": 0.03632902640398831, "learning_rate": 5.439846148067856e-06, "loss": 0.7478, "step": 4185 }, { "epoch": 0.9236195304750359, "grad_norm": 0.036911826600897674, "learning_rate": 5.2868790874392495e-06, "loss": 0.7351, "step": 4190 }, { "epoch": 0.9247217017524524, "grad_norm": 0.035394959022726914, "learning_rate": 5.13605492297306e-06, "loss": 0.7513, "step": 4195 }, { "epoch": 0.9258238730298688, "grad_norm": 0.038584292984150205, "learning_rate": 4.98737588805711e-06, "loss": 0.7503, "step": 4200 }, { "epoch": 0.9269260443072853, "grad_norm": 0.03443755510774623, "learning_rate": 4.840844184314368e-06, "loss": 0.7467, "step": 4205 }, { "epoch": 0.9280282155847018, "grad_norm": 0.03507393739539605, "learning_rate": 4.696461981570371e-06, "loss": 0.7479, "step": 4210 }, { "epoch": 0.9291303868621184, "grad_norm": 0.03572284290016147, "learning_rate": 4.554231417821147e-06, "loss": 0.7438, "step": 4215 }, { "epoch": 0.9302325581395349, "grad_norm": 0.03939452564404578, "learning_rate": 4.414154599201314e-06, "loss": 0.7528, "step": 4220 }, { "epoch": 0.9313347294169514, "grad_norm": 0.0343058423932778, "learning_rate": 4.2762335999532494e-06, "loss": 0.7123, "step": 4225 }, { "epoch": 0.9324369006943679, "grad_norm": 0.03688347815350498, "learning_rate": 4.140470462396101e-06, "loss": 0.7363, "step": 4230 }, { "epoch": 0.9335390719717844, "grad_norm": 0.03669486942706104, "learning_rate": 4.006867196895641e-06, "loss": 0.7285, "step": 4235 }, { "epoch": 0.934641243249201, "grad_norm": 0.035278173584095886, "learning_rate": 3.8754257818345125e-06, "loss": 0.7273, "step": 4240 }, { "epoch": 0.9357434145266175, "grad_norm": 0.03838555636403366, "learning_rate": 3.7461481635828793e-06, "loss": 0.7406, "step": 4245 }, { "epoch": 0.936845585804034, "grad_norm": 0.03449587188356417, "learning_rate": 3.619036256469704e-06, "loss": 0.719, "step": 4250 }, { "epoch": 0.9379477570814505, "grad_norm": 0.03617056410014076, "learning_rate": 3.4940919427542345e-06, "loss": 0.7074, "step": 4255 }, { "epoch": 0.9390499283588669, "grad_norm": 0.03720545719113871, "learning_rate": 3.371317072598312e-06, "loss": 0.761, "step": 4260 }, { "epoch": 0.9401520996362834, "grad_norm": 0.03733982842334115, "learning_rate": 3.2507134640388566e-06, "loss": 0.7373, "step": 4265 }, { "epoch": 0.9412542709137, "grad_norm": 0.03832577468879599, "learning_rate": 3.132282902961025e-06, "loss": 0.7744, "step": 4270 }, { "epoch": 0.9423564421911165, "grad_norm": 0.03561487376514044, "learning_rate": 3.016027143071631e-06, "loss": 0.7367, "step": 4275 }, { "epoch": 0.943458613468533, "grad_norm": 0.037260538854841714, "learning_rate": 2.9019479058733974e-06, "loss": 0.7412, "step": 4280 }, { "epoch": 0.9445607847459495, "grad_norm": 0.033914247331261665, "learning_rate": 2.7900468806392128e-06, "loss": 0.7191, "step": 4285 }, { "epoch": 0.945662956023366, "grad_norm": 0.036359448459938014, "learning_rate": 2.6803257243873165e-06, "loss": 0.7236, "step": 4290 }, { "epoch": 0.9467651273007825, "grad_norm": 0.036671268303379516, "learning_rate": 2.572786061856652e-06, "loss": 0.706, "step": 4295 }, { "epoch": 0.9478672985781991, "grad_norm": 0.03688898129821375, "learning_rate": 2.467429485482869e-06, "loss": 0.7719, "step": 4300 }, { "epoch": 0.9489694698556156, "grad_norm": 0.035848288240590206, "learning_rate": 2.3642575553746933e-06, "loss": 0.7375, "step": 4305 }, { "epoch": 0.9500716411330321, "grad_norm": 0.03649010626732304, "learning_rate": 2.2632717992908278e-06, "loss": 0.7492, "step": 4310 }, { "epoch": 0.9511738124104486, "grad_norm": 0.035622089704062276, "learning_rate": 2.164473712617387e-06, "loss": 0.7277, "step": 4315 }, { "epoch": 0.9522759836878651, "grad_norm": 0.034793596935363234, "learning_rate": 2.0678647583456995e-06, "loss": 0.7167, "step": 4320 }, { "epoch": 0.9533781549652816, "grad_norm": 0.038314433912779305, "learning_rate": 1.973446367050674e-06, "loss": 0.694, "step": 4325 }, { "epoch": 0.9544803262426981, "grad_norm": 0.032175274076676946, "learning_rate": 1.8812199368695325e-06, "loss": 0.7399, "step": 4330 }, { "epoch": 0.9555824975201146, "grad_norm": 0.03747625530620859, "learning_rate": 1.7911868334812618e-06, "loss": 0.7543, "step": 4335 }, { "epoch": 0.9566846687975311, "grad_norm": 0.03469398206914549, "learning_rate": 1.7033483900862953e-06, "loss": 0.719, "step": 4340 }, { "epoch": 0.9577868400749476, "grad_norm": 0.03716371251632767, "learning_rate": 1.617705907386696e-06, "loss": 0.7242, "step": 4345 }, { "epoch": 0.9588890113523642, "grad_norm": 0.033096194137156504, "learning_rate": 1.5342606535670877e-06, "loss": 0.7395, "step": 4350 }, { "epoch": 0.9599911826297807, "grad_norm": 0.038402200718579416, "learning_rate": 1.4530138642756872e-06, "loss": 0.75, "step": 4355 }, { "epoch": 0.9610933539071972, "grad_norm": 0.03629339352134901, "learning_rate": 1.3739667426061196e-06, "loss": 0.7121, "step": 4360 }, { "epoch": 0.9621955251846137, "grad_norm": 0.03480704310265325, "learning_rate": 1.2971204590795813e-06, "loss": 0.7347, "step": 4365 }, { "epoch": 0.9632976964620302, "grad_norm": 0.03454475792553819, "learning_rate": 1.2224761516274883e-06, "loss": 0.7322, "step": 4370 }, { "epoch": 0.9643998677394467, "grad_norm": 0.0348941999387763, "learning_rate": 1.1500349255746055e-06, "loss": 0.744, "step": 4375 }, { "epoch": 0.9655020390168633, "grad_norm": 0.03661845220058871, "learning_rate": 1.0797978536227602e-06, "loss": 0.7429, "step": 4380 }, { "epoch": 0.9666042102942798, "grad_norm": 0.03670130845399715, "learning_rate": 1.011765975834855e-06, "loss": 0.7508, "step": 4385 }, { "epoch": 0.9677063815716962, "grad_norm": 0.037482737750151054, "learning_rate": 9.459402996195797e-07, "loss": 0.7199, "step": 4390 }, { "epoch": 0.9688085528491127, "grad_norm": 0.03609308663664764, "learning_rate": 8.823217997163401e-07, "loss": 0.7202, "step": 4395 }, { "epoch": 0.9699107241265292, "grad_norm": 0.03562801152195268, "learning_rate": 8.209114181810029e-07, "loss": 0.7519, "step": 4400 }, { "epoch": 0.9710128954039458, "grad_norm": 0.037875623718305995, "learning_rate": 7.617100643718066e-07, "loss": 0.736, "step": 4405 }, { "epoch": 0.9721150666813623, "grad_norm": 0.0343579172461373, "learning_rate": 7.04718614935973e-07, "loss": 0.7319, "step": 4410 }, { "epoch": 0.9732172379587788, "grad_norm": 0.0351522271270778, "learning_rate": 6.499379137966831e-07, "loss": 0.7158, "step": 4415 }, { "epoch": 0.9743194092361953, "grad_norm": 0.03685325990487051, "learning_rate": 5.973687721405884e-07, "loss": 0.7343, "step": 4420 }, { "epoch": 0.9754215805136118, "grad_norm": 0.03353546299147592, "learning_rate": 5.470119684058527e-07, "loss": 0.73, "step": 4425 }, { "epoch": 0.9765237517910283, "grad_norm": 0.03850593917914308, "learning_rate": 4.988682482705286e-07, "loss": 0.756, "step": 4430 }, { "epoch": 0.9776259230684449, "grad_norm": 0.040410693859280186, "learning_rate": 4.5293832464159965e-07, "loss": 0.7463, "step": 4435 }, { "epoch": 0.9787280943458614, "grad_norm": 0.03585489720397234, "learning_rate": 4.0922287764438843e-07, "loss": 0.7227, "step": 4440 }, { "epoch": 0.9798302656232779, "grad_norm": 0.03643983778724927, "learning_rate": 3.677225546124818e-07, "loss": 0.7297, "step": 4445 }, { "epoch": 0.9809324369006943, "grad_norm": 0.03527743845105325, "learning_rate": 3.2843797007812147e-07, "loss": 0.7343, "step": 4450 }, { "epoch": 0.9820346081781108, "grad_norm": 0.03420044354382039, "learning_rate": 2.913697057632114e-07, "loss": 0.7211, "step": 4455 }, { "epoch": 0.9831367794555274, "grad_norm": 0.03541898225117583, "learning_rate": 2.565183105705415e-07, "loss": 0.7596, "step": 4460 }, { "epoch": 0.9842389507329439, "grad_norm": 0.03803822863693149, "learning_rate": 2.23884300575794e-07, "loss": 0.7484, "step": 4465 }, { "epoch": 0.9853411220103604, "grad_norm": 0.0339643099555736, "learning_rate": 1.9346815901984947e-07, "loss": 0.7259, "step": 4470 }, { "epoch": 0.9864432932877769, "grad_norm": 0.035078257970822355, "learning_rate": 1.6527033630162613e-07, "loss": 0.7306, "step": 4475 }, { "epoch": 0.9875454645651934, "grad_norm": 0.0345533584530425, "learning_rate": 1.392912499714016e-07, "loss": 0.734, "step": 4480 }, { "epoch": 0.98864763584261, "grad_norm": 0.03944350230563913, "learning_rate": 1.1553128472468476e-07, "loss": 0.764, "step": 4485 }, { "epoch": 0.9897498071200265, "grad_norm": 0.0357818169230016, "learning_rate": 9.39907923964367e-08, "loss": 0.7235, "step": 4490 }, { "epoch": 0.990851978397443, "grad_norm": 0.03566232250402245, "learning_rate": 7.467009195594176e-08, "loss": 0.7433, "step": 4495 }, { "epoch": 0.9919541496748595, "grad_norm": 0.03513694284161295, "learning_rate": 5.7569469502011247e-08, "loss": 0.7419, "step": 4500 }, { "epoch": 0.993056320952276, "grad_norm": 0.033604401356817075, "learning_rate": 4.2689178258820125e-08, "loss": 0.7302, "step": 4505 }, { "epoch": 0.9941584922296925, "grad_norm": 0.038757934920082, "learning_rate": 3.0029438572110045e-08, "loss": 0.7509, "step": 4510 }, { "epoch": 0.995260663507109, "grad_norm": 0.03542769692423219, "learning_rate": 1.959043790590864e-08, "loss": 0.7425, "step": 4515 }, { "epoch": 0.9963628347845255, "grad_norm": 0.037185347711832795, "learning_rate": 1.137233083983169e-08, "loss": 0.7187, "step": 4520 }, { "epoch": 0.997465006061942, "grad_norm": 0.03758234390906879, "learning_rate": 5.375239066685022e-09, "loss": 0.7592, "step": 4525 }, { "epoch": 0.9985671773393585, "grad_norm": 0.036473312127062285, "learning_rate": 1.5992513907658878e-09, "loss": 0.7272, "step": 4530 }, { "epoch": 0.999669348616775, "grad_norm": 0.03634038177846733, "learning_rate": 4.442372649737791e-11, "loss": 0.7268, "step": 4535 }, { "epoch": 0.9998897828722584, "eval_loss": 1.1339110136032104, "eval_runtime": 1020.4828, "eval_samples_per_second": 187.325, "eval_steps_per_second": 5.854, "step": 4536 }, { "epoch": 0.9998897828722584, "step": 4536, "total_flos": 693442503278592.0, "train_loss": 0.8111531063651491, "train_runtime": 19545.8709, "train_samples_per_second": 29.708, "train_steps_per_second": 0.232 } ], "logging_steps": 5, "max_steps": 4536, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 693442503278592.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }