{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 5280, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.000946969696969697, "grad_norm": 49.75650598702225, "learning_rate": 2.8409090909090907e-06, "loss": 3.9178, "step": 5 }, { "epoch": 0.001893939393939394, "grad_norm": 39.27361859039198, "learning_rate": 5.6818181818181815e-06, "loss": 3.7038, "step": 10 }, { "epoch": 0.002840909090909091, "grad_norm": 22.806730377223552, "learning_rate": 8.522727272727271e-06, "loss": 2.8608, "step": 15 }, { "epoch": 0.003787878787878788, "grad_norm": 9.34674382037142, "learning_rate": 1.1363636363636363e-05, "loss": 2.207, "step": 20 }, { "epoch": 0.004734848484848485, "grad_norm": 4.014713370693969, "learning_rate": 1.4204545454545453e-05, "loss": 1.74, "step": 25 }, { "epoch": 0.005681818181818182, "grad_norm": 2.1906667080696756, "learning_rate": 1.7045454545454543e-05, "loss": 1.4441, "step": 30 }, { "epoch": 0.006628787878787879, "grad_norm": 1.0752279902100068, "learning_rate": 1.9886363636363634e-05, "loss": 1.2429, "step": 35 }, { "epoch": 0.007575757575757576, "grad_norm": 0.7187175898896523, "learning_rate": 2.2727272727272726e-05, "loss": 1.1147, "step": 40 }, { "epoch": 0.008522727272727272, "grad_norm": 0.49281093084047145, "learning_rate": 2.5568181818181814e-05, "loss": 1.1034, "step": 45 }, { "epoch": 0.00946969696969697, "grad_norm": 0.3269633043103454, "learning_rate": 2.8409090909090906e-05, "loss": 1.058, "step": 50 }, { "epoch": 0.010416666666666666, "grad_norm": 0.33257710673863067, "learning_rate": 3.125e-05, "loss": 1.0722, "step": 55 }, { "epoch": 0.011363636363636364, "grad_norm": 0.25938022961715196, "learning_rate": 3.4090909090909085e-05, "loss": 1.0246, "step": 60 }, { "epoch": 0.01231060606060606, "grad_norm": 0.23813343449521693, "learning_rate": 3.693181818181818e-05, "loss": 1.0074, "step": 65 }, { "epoch": 0.013257575757575758, "grad_norm": 0.2200079499271612, "learning_rate": 3.977272727272727e-05, "loss": 1.0025, "step": 70 }, { "epoch": 0.014204545454545454, "grad_norm": 0.20277353873378182, "learning_rate": 4.261363636363637e-05, "loss": 0.9877, "step": 75 }, { "epoch": 0.015151515151515152, "grad_norm": 0.20338107431083782, "learning_rate": 4.545454545454545e-05, "loss": 0.9579, "step": 80 }, { "epoch": 0.016098484848484848, "grad_norm": 0.20166600929973375, "learning_rate": 4.8295454545454537e-05, "loss": 1.0121, "step": 85 }, { "epoch": 0.017045454545454544, "grad_norm": 0.14601915731061527, "learning_rate": 5.113636363636363e-05, "loss": 0.9809, "step": 90 }, { "epoch": 0.017992424242424244, "grad_norm": 0.1910575503845415, "learning_rate": 5.3977272727272727e-05, "loss": 0.9687, "step": 95 }, { "epoch": 0.01893939393939394, "grad_norm": 0.1710125261741899, "learning_rate": 5.681818181818181e-05, "loss": 0.9759, "step": 100 }, { "epoch": 0.019886363636363636, "grad_norm": 0.1546310725877226, "learning_rate": 5.96590909090909e-05, "loss": 0.9449, "step": 105 }, { "epoch": 0.020833333333333332, "grad_norm": 0.15669899681499375, "learning_rate": 6.25e-05, "loss": 0.9379, "step": 110 }, { "epoch": 0.021780303030303032, "grad_norm": 0.10549598423376465, "learning_rate": 6.534090909090909e-05, "loss": 0.9521, "step": 115 }, { "epoch": 0.022727272727272728, "grad_norm": 0.09265444874772286, "learning_rate": 6.818181818181817e-05, "loss": 0.9402, "step": 120 }, { "epoch": 0.023674242424242424, "grad_norm": 0.10012187642369699, "learning_rate": 7.102272727272727e-05, "loss": 0.9509, "step": 125 }, { "epoch": 0.02462121212121212, "grad_norm": 0.10405267547853224, "learning_rate": 7.386363636363635e-05, "loss": 0.943, "step": 130 }, { "epoch": 0.02556818181818182, "grad_norm": 0.0932862459532729, "learning_rate": 7.670454545454545e-05, "loss": 0.923, "step": 135 }, { "epoch": 0.026515151515151516, "grad_norm": 0.08799539522039788, "learning_rate": 7.954545454545454e-05, "loss": 0.9221, "step": 140 }, { "epoch": 0.027462121212121212, "grad_norm": 0.07936971459661492, "learning_rate": 8.238636363636362e-05, "loss": 0.9385, "step": 145 }, { "epoch": 0.028409090909090908, "grad_norm": 0.09395245331857886, "learning_rate": 8.522727272727273e-05, "loss": 0.9098, "step": 150 }, { "epoch": 0.029356060606060608, "grad_norm": 0.08346381824724323, "learning_rate": 8.806818181818182e-05, "loss": 0.9592, "step": 155 }, { "epoch": 0.030303030303030304, "grad_norm": 0.0683897969731906, "learning_rate": 9.09090909090909e-05, "loss": 0.9162, "step": 160 }, { "epoch": 0.03125, "grad_norm": 0.07364507428375824, "learning_rate": 9.374999999999999e-05, "loss": 0.905, "step": 165 }, { "epoch": 0.032196969696969696, "grad_norm": 0.06957507038154116, "learning_rate": 9.659090909090907e-05, "loss": 0.9277, "step": 170 }, { "epoch": 0.03314393939393939, "grad_norm": 0.07231783801209996, "learning_rate": 9.943181818181817e-05, "loss": 0.8865, "step": 175 }, { "epoch": 0.03409090909090909, "grad_norm": 0.08108886238015861, "learning_rate": 0.00010227272727272726, "loss": 0.9221, "step": 180 }, { "epoch": 0.035037878787878785, "grad_norm": 0.06746334152643936, "learning_rate": 0.00010511363636363635, "loss": 0.8921, "step": 185 }, { "epoch": 0.03598484848484849, "grad_norm": 0.07474975292416153, "learning_rate": 0.00010795454545454545, "loss": 0.9067, "step": 190 }, { "epoch": 0.036931818181818184, "grad_norm": 0.06954632694727424, "learning_rate": 0.00011079545454545454, "loss": 0.9274, "step": 195 }, { "epoch": 0.03787878787878788, "grad_norm": 0.1071194914420164, "learning_rate": 0.00011363636363636362, "loss": 0.9174, "step": 200 }, { "epoch": 0.038825757575757576, "grad_norm": 0.08047063933324308, "learning_rate": 0.00011647727272727271, "loss": 0.8853, "step": 205 }, { "epoch": 0.03977272727272727, "grad_norm": 0.06720262982444847, "learning_rate": 0.0001193181818181818, "loss": 0.8936, "step": 210 }, { "epoch": 0.04071969696969697, "grad_norm": 0.06874990083102131, "learning_rate": 0.0001221590909090909, "loss": 0.8966, "step": 215 }, { "epoch": 0.041666666666666664, "grad_norm": 0.09031821629007566, "learning_rate": 0.000125, "loss": 0.9002, "step": 220 }, { "epoch": 0.04261363636363636, "grad_norm": 0.08250031333079004, "learning_rate": 0.00012784090909090907, "loss": 0.9314, "step": 225 }, { "epoch": 0.043560606060606064, "grad_norm": 0.06517851552105172, "learning_rate": 0.00013068181818181817, "loss": 0.9264, "step": 230 }, { "epoch": 0.04450757575757576, "grad_norm": 0.0682659812110987, "learning_rate": 0.00013352272727272727, "loss": 0.8933, "step": 235 }, { "epoch": 0.045454545454545456, "grad_norm": 0.07147589587336683, "learning_rate": 0.00013636363636363634, "loss": 0.9181, "step": 240 }, { "epoch": 0.04640151515151515, "grad_norm": 0.06876166962712452, "learning_rate": 0.00013920454545454544, "loss": 0.9221, "step": 245 }, { "epoch": 0.04734848484848485, "grad_norm": 0.06370293403172177, "learning_rate": 0.00014204545454545454, "loss": 0.8685, "step": 250 }, { "epoch": 0.048295454545454544, "grad_norm": 0.06615942930120759, "learning_rate": 0.00014488636363636364, "loss": 0.8895, "step": 255 }, { "epoch": 0.04924242424242424, "grad_norm": 0.06664909205706883, "learning_rate": 0.0001477272727272727, "loss": 0.9207, "step": 260 }, { "epoch": 0.050189393939393936, "grad_norm": 0.07595494362406784, "learning_rate": 0.00015056818181818183, "loss": 0.8644, "step": 265 }, { "epoch": 0.05113636363636364, "grad_norm": 0.07408826723348173, "learning_rate": 0.0001534090909090909, "loss": 0.9062, "step": 270 }, { "epoch": 0.052083333333333336, "grad_norm": 0.06808841073565555, "learning_rate": 0.00015625, "loss": 0.9151, "step": 275 }, { "epoch": 0.05303030303030303, "grad_norm": 0.07953899365614112, "learning_rate": 0.00015909090909090907, "loss": 0.8909, "step": 280 }, { "epoch": 0.05397727272727273, "grad_norm": 0.07167080509602292, "learning_rate": 0.00016193181818181817, "loss": 0.9009, "step": 285 }, { "epoch": 0.054924242424242424, "grad_norm": 0.07064676898002652, "learning_rate": 0.00016477272727272724, "loss": 0.8908, "step": 290 }, { "epoch": 0.05587121212121212, "grad_norm": 0.07214540293164669, "learning_rate": 0.00016761363636363634, "loss": 0.9244, "step": 295 }, { "epoch": 0.056818181818181816, "grad_norm": 0.06617148307042509, "learning_rate": 0.00017045454545454547, "loss": 0.8841, "step": 300 }, { "epoch": 0.05776515151515151, "grad_norm": 0.06536482732681019, "learning_rate": 0.00017329545454545454, "loss": 0.8926, "step": 305 }, { "epoch": 0.058712121212121215, "grad_norm": 0.0770022545469697, "learning_rate": 0.00017613636363636364, "loss": 0.8918, "step": 310 }, { "epoch": 0.05965909090909091, "grad_norm": 0.07796650440153677, "learning_rate": 0.0001789772727272727, "loss": 0.8861, "step": 315 }, { "epoch": 0.06060606060606061, "grad_norm": 0.06664140681353005, "learning_rate": 0.0001818181818181818, "loss": 0.883, "step": 320 }, { "epoch": 0.061553030303030304, "grad_norm": 0.06505572579245275, "learning_rate": 0.00018465909090909088, "loss": 0.9046, "step": 325 }, { "epoch": 0.0625, "grad_norm": 0.07411769991572527, "learning_rate": 0.00018749999999999998, "loss": 0.8935, "step": 330 }, { "epoch": 0.0634469696969697, "grad_norm": 0.06839084238999557, "learning_rate": 0.00019034090909090908, "loss": 0.8994, "step": 335 }, { "epoch": 0.06439393939393939, "grad_norm": 0.06991741218579048, "learning_rate": 0.00019318181818181815, "loss": 0.9011, "step": 340 }, { "epoch": 0.06534090909090909, "grad_norm": 0.06765382117363, "learning_rate": 0.00019602272727272727, "loss": 0.8757, "step": 345 }, { "epoch": 0.06628787878787878, "grad_norm": 0.07394479834842242, "learning_rate": 0.00019886363636363634, "loss": 0.8869, "step": 350 }, { "epoch": 0.06723484848484848, "grad_norm": 0.07779852408072253, "learning_rate": 0.00020170454545454544, "loss": 0.8721, "step": 355 }, { "epoch": 0.06818181818181818, "grad_norm": 0.07182147114935328, "learning_rate": 0.0002045454545454545, "loss": 0.9053, "step": 360 }, { "epoch": 0.06912878787878787, "grad_norm": 0.07763475442947491, "learning_rate": 0.0002073863636363636, "loss": 0.8886, "step": 365 }, { "epoch": 0.07007575757575757, "grad_norm": 0.06639383299470691, "learning_rate": 0.0002102272727272727, "loss": 0.9216, "step": 370 }, { "epoch": 0.07102272727272728, "grad_norm": 0.07582408978067692, "learning_rate": 0.00021306818181818178, "loss": 0.9187, "step": 375 }, { "epoch": 0.07196969696969698, "grad_norm": 0.067889778321114, "learning_rate": 0.0002159090909090909, "loss": 0.8848, "step": 380 }, { "epoch": 0.07291666666666667, "grad_norm": 0.06350237430320019, "learning_rate": 0.00021874999999999998, "loss": 0.8991, "step": 385 }, { "epoch": 0.07386363636363637, "grad_norm": 0.06463105152473327, "learning_rate": 0.00022159090909090908, "loss": 0.8993, "step": 390 }, { "epoch": 0.07481060606060606, "grad_norm": 0.06289812753072489, "learning_rate": 0.00022443181818181815, "loss": 0.8977, "step": 395 }, { "epoch": 0.07575757575757576, "grad_norm": 0.06451182368963407, "learning_rate": 0.00022727272727272725, "loss": 0.9109, "step": 400 }, { "epoch": 0.07670454545454546, "grad_norm": 0.06417545375628221, "learning_rate": 0.00023011363636363634, "loss": 0.8689, "step": 405 }, { "epoch": 0.07765151515151515, "grad_norm": 0.06624677302997224, "learning_rate": 0.00023295454545454542, "loss": 0.9096, "step": 410 }, { "epoch": 0.07859848484848485, "grad_norm": 0.06713767944662469, "learning_rate": 0.00023579545454545454, "loss": 0.9128, "step": 415 }, { "epoch": 0.07954545454545454, "grad_norm": 0.06632474263833514, "learning_rate": 0.0002386363636363636, "loss": 0.8992, "step": 420 }, { "epoch": 0.08049242424242424, "grad_norm": 0.06326893641363093, "learning_rate": 0.0002414772727272727, "loss": 0.8838, "step": 425 }, { "epoch": 0.08143939393939394, "grad_norm": 0.05270584817938461, "learning_rate": 0.0002443181818181818, "loss": 0.8604, "step": 430 }, { "epoch": 0.08238636363636363, "grad_norm": 0.06950851335464077, "learning_rate": 0.0002471590909090909, "loss": 0.8928, "step": 435 }, { "epoch": 0.08333333333333333, "grad_norm": 0.06031142221337703, "learning_rate": 0.00025, "loss": 0.8997, "step": 440 }, { "epoch": 0.08428030303030302, "grad_norm": 0.0598802579058441, "learning_rate": 0.00025284090909090905, "loss": 0.888, "step": 445 }, { "epoch": 0.08522727272727272, "grad_norm": 0.05979293999494916, "learning_rate": 0.00025568181818181815, "loss": 0.914, "step": 450 }, { "epoch": 0.08617424242424243, "grad_norm": 0.06332115337692762, "learning_rate": 0.00025852272727272725, "loss": 0.8897, "step": 455 }, { "epoch": 0.08712121212121213, "grad_norm": 0.05664133712393486, "learning_rate": 0.00026136363636363634, "loss": 0.8958, "step": 460 }, { "epoch": 0.08806818181818182, "grad_norm": 0.06262104837726735, "learning_rate": 0.00026420454545454544, "loss": 0.8773, "step": 465 }, { "epoch": 0.08901515151515152, "grad_norm": 0.06325434933754956, "learning_rate": 0.00026704545454545454, "loss": 0.8941, "step": 470 }, { "epoch": 0.08996212121212122, "grad_norm": 0.06454144975644246, "learning_rate": 0.00026988636363636364, "loss": 0.9055, "step": 475 }, { "epoch": 0.09090909090909091, "grad_norm": 0.058848918660389354, "learning_rate": 0.0002727272727272727, "loss": 0.9066, "step": 480 }, { "epoch": 0.09185606060606061, "grad_norm": 0.0643339517437263, "learning_rate": 0.0002755681818181818, "loss": 0.9207, "step": 485 }, { "epoch": 0.0928030303030303, "grad_norm": 0.06062790165341026, "learning_rate": 0.0002784090909090909, "loss": 0.9096, "step": 490 }, { "epoch": 0.09375, "grad_norm": 0.06483851920476219, "learning_rate": 0.00028125, "loss": 0.8924, "step": 495 }, { "epoch": 0.0946969696969697, "grad_norm": 0.06599789444637924, "learning_rate": 0.0002840909090909091, "loss": 0.9052, "step": 500 }, { "epoch": 0.09564393939393939, "grad_norm": 0.06622053779375818, "learning_rate": 0.0002869318181818182, "loss": 0.9261, "step": 505 }, { "epoch": 0.09659090909090909, "grad_norm": 0.06986338841915192, "learning_rate": 0.0002897727272727273, "loss": 0.9147, "step": 510 }, { "epoch": 0.09753787878787878, "grad_norm": 0.0566839679645091, "learning_rate": 0.0002926136363636363, "loss": 0.8702, "step": 515 }, { "epoch": 0.09848484848484848, "grad_norm": 0.06286540817635865, "learning_rate": 0.0002954545454545454, "loss": 0.9081, "step": 520 }, { "epoch": 0.09943181818181818, "grad_norm": 0.1319474057131131, "learning_rate": 0.0002982954545454545, "loss": 0.9121, "step": 525 }, { "epoch": 0.10037878787878787, "grad_norm": 0.05848327672137334, "learning_rate": 0.0002999998688802619, "loss": 0.9124, "step": 530 }, { "epoch": 0.10132575757575757, "grad_norm": 0.06169209923879713, "learning_rate": 0.0002999983937858416, "loss": 0.9065, "step": 535 }, { "epoch": 0.10227272727272728, "grad_norm": 0.06107286390796437, "learning_rate": 0.0002999952797134999, "loss": 0.9061, "step": 540 }, { "epoch": 0.10321969696969698, "grad_norm": 0.051714218670872523, "learning_rate": 0.00029999052669726326, "loss": 0.9188, "step": 545 }, { "epoch": 0.10416666666666667, "grad_norm": 0.05186568461682868, "learning_rate": 0.00029998413478906613, "loss": 0.8956, "step": 550 }, { "epoch": 0.10511363636363637, "grad_norm": 0.06237137033014081, "learning_rate": 0.00029997610405875047, "loss": 0.913, "step": 555 }, { "epoch": 0.10606060606060606, "grad_norm": 0.0627745121471118, "learning_rate": 0.00029996643459406525, "loss": 0.8781, "step": 560 }, { "epoch": 0.10700757575757576, "grad_norm": 0.05775548150329091, "learning_rate": 0.00029995512650066516, "loss": 0.8961, "step": 565 }, { "epoch": 0.10795454545454546, "grad_norm": 0.058742015004762956, "learning_rate": 0.0002999421799021097, "loss": 0.9081, "step": 570 }, { "epoch": 0.10890151515151515, "grad_norm": 0.06278642260383162, "learning_rate": 0.00029992759493986144, "loss": 0.9065, "step": 575 }, { "epoch": 0.10984848484848485, "grad_norm": 0.05368202960228582, "learning_rate": 0.0002999113717732852, "loss": 0.8793, "step": 580 }, { "epoch": 0.11079545454545454, "grad_norm": 0.06412640518837653, "learning_rate": 0.0002998935105796455, "loss": 0.8537, "step": 585 }, { "epoch": 0.11174242424242424, "grad_norm": 0.06196513917098741, "learning_rate": 0.00029987401155410516, "loss": 0.8954, "step": 590 }, { "epoch": 0.11268939393939394, "grad_norm": 0.0605833197123053, "learning_rate": 0.00029985287490972293, "loss": 0.8945, "step": 595 }, { "epoch": 0.11363636363636363, "grad_norm": 0.06504637583052113, "learning_rate": 0.0002998301008774512, "loss": 0.9008, "step": 600 }, { "epoch": 0.11458333333333333, "grad_norm": 0.05164915733505111, "learning_rate": 0.0002998056897061335, "loss": 0.9051, "step": 605 }, { "epoch": 0.11553030303030302, "grad_norm": 0.051685554793978426, "learning_rate": 0.000299779641662502, "loss": 0.8529, "step": 610 }, { "epoch": 0.11647727272727272, "grad_norm": 0.062362055131338696, "learning_rate": 0.00029975195703117405, "loss": 0.8691, "step": 615 }, { "epoch": 0.11742424242424243, "grad_norm": 0.06117736368428696, "learning_rate": 0.00029972263611464966, "loss": 0.8849, "step": 620 }, { "epoch": 0.11837121212121213, "grad_norm": 0.050904058331399764, "learning_rate": 0.00029969167923330766, "loss": 0.8576, "step": 625 }, { "epoch": 0.11931818181818182, "grad_norm": 0.053607823287624916, "learning_rate": 0.0002996590867254028, "loss": 0.9272, "step": 630 }, { "epoch": 0.12026515151515152, "grad_norm": 0.06037742366776271, "learning_rate": 0.00029962485894706155, "loss": 0.882, "step": 635 }, { "epoch": 0.12121212121212122, "grad_norm": 0.05143566912627675, "learning_rate": 0.00029958899627227837, "loss": 0.8828, "step": 640 }, { "epoch": 0.12215909090909091, "grad_norm": 0.05625636209624713, "learning_rate": 0.00029955149909291154, "loss": 0.9344, "step": 645 }, { "epoch": 0.12310606060606061, "grad_norm": 0.056332501852780985, "learning_rate": 0.00029951236781867937, "loss": 0.8857, "step": 650 }, { "epoch": 0.1240530303030303, "grad_norm": 0.05582242562634511, "learning_rate": 0.0002994716028771549, "loss": 0.8911, "step": 655 }, { "epoch": 0.125, "grad_norm": 0.05726658160474268, "learning_rate": 0.0002994292047137618, "loss": 0.9116, "step": 660 }, { "epoch": 0.1259469696969697, "grad_norm": 0.06041167916510802, "learning_rate": 0.0002993851737917695, "loss": 0.8898, "step": 665 }, { "epoch": 0.1268939393939394, "grad_norm": 0.05471735073845254, "learning_rate": 0.00029933951059228777, "loss": 0.8831, "step": 670 }, { "epoch": 0.1278409090909091, "grad_norm": 0.05947780307997745, "learning_rate": 0.0002992922156142619, "loss": 0.8745, "step": 675 }, { "epoch": 0.12878787878787878, "grad_norm": 0.06439001370901883, "learning_rate": 0.00029924328937446686, "loss": 0.8786, "step": 680 }, { "epoch": 0.12973484848484848, "grad_norm": 0.05448875777529848, "learning_rate": 0.0002991927324075019, "loss": 0.8619, "step": 685 }, { "epoch": 0.13068181818181818, "grad_norm": 0.05664358337517303, "learning_rate": 0.0002991405452657846, "loss": 0.8997, "step": 690 }, { "epoch": 0.13162878787878787, "grad_norm": 0.06476323651067244, "learning_rate": 0.0002990867285195449, "loss": 0.8965, "step": 695 }, { "epoch": 0.13257575757575757, "grad_norm": 0.052072472586110224, "learning_rate": 0.0002990312827568188, "loss": 0.9026, "step": 700 }, { "epoch": 0.13352272727272727, "grad_norm": 0.058261512452499706, "learning_rate": 0.00029897420858344205, "loss": 0.8927, "step": 705 }, { "epoch": 0.13446969696969696, "grad_norm": 0.048984858535432614, "learning_rate": 0.0002989155066230433, "loss": 0.8755, "step": 710 }, { "epoch": 0.13541666666666666, "grad_norm": 0.05290095759260302, "learning_rate": 0.0002988551775170377, "loss": 0.8848, "step": 715 }, { "epoch": 0.13636363636363635, "grad_norm": 0.05687350078562719, "learning_rate": 0.00029879322192461925, "loss": 0.8539, "step": 720 }, { "epoch": 0.13731060606060605, "grad_norm": 0.05103479492140091, "learning_rate": 0.0002987296405227543, "loss": 0.8953, "step": 725 }, { "epoch": 0.13825757575757575, "grad_norm": 0.06301978431499398, "learning_rate": 0.0002986644340061738, "loss": 0.8679, "step": 730 }, { "epoch": 0.13920454545454544, "grad_norm": 0.06819363260699743, "learning_rate": 0.0002985976030873655, "loss": 0.8767, "step": 735 }, { "epoch": 0.14015151515151514, "grad_norm": 0.06229636016637017, "learning_rate": 0.0002985291484965666, "loss": 0.8764, "step": 740 }, { "epoch": 0.14109848484848486, "grad_norm": 0.05610530939029699, "learning_rate": 0.0002984590709817555, "loss": 0.9009, "step": 745 }, { "epoch": 0.14204545454545456, "grad_norm": 0.04941941919339848, "learning_rate": 0.0002983873713086439, "loss": 0.8986, "step": 750 }, { "epoch": 0.14299242424242425, "grad_norm": 0.054105335971047615, "learning_rate": 0.00029831405026066785, "loss": 0.9131, "step": 755 }, { "epoch": 0.14393939393939395, "grad_norm": 0.055466909315876986, "learning_rate": 0.0002982391086389799, "loss": 0.8663, "step": 760 }, { "epoch": 0.14488636363636365, "grad_norm": 0.05043550339837371, "learning_rate": 0.00029816254726243983, "loss": 0.8959, "step": 765 }, { "epoch": 0.14583333333333334, "grad_norm": 0.05417610489948402, "learning_rate": 0.0002980843669676061, "loss": 0.8616, "step": 770 }, { "epoch": 0.14678030303030304, "grad_norm": 0.05041852142450034, "learning_rate": 0.0002980045686087262, "loss": 0.8855, "step": 775 }, { "epoch": 0.14772727272727273, "grad_norm": 0.06901010647152613, "learning_rate": 0.00029792315305772796, "loss": 0.9032, "step": 780 }, { "epoch": 0.14867424242424243, "grad_norm": 2.738390350230735, "learning_rate": 0.00029784012120420944, "loss": 0.888, "step": 785 }, { "epoch": 0.14962121212121213, "grad_norm": 0.1937615852009521, "learning_rate": 0.0002977554739554294, "loss": 1.0592, "step": 790 }, { "epoch": 0.15056818181818182, "grad_norm": 0.11234392360677802, "learning_rate": 0.00029766921223629774, "loss": 0.9652, "step": 795 }, { "epoch": 0.15151515151515152, "grad_norm": 0.12436979475241608, "learning_rate": 0.00029758133698936485, "loss": 0.9394, "step": 800 }, { "epoch": 0.15246212121212122, "grad_norm": 0.06657911701516095, "learning_rate": 0.00029749184917481157, "loss": 0.9099, "step": 805 }, { "epoch": 0.1534090909090909, "grad_norm": 0.0824368743598765, "learning_rate": 0.00029740074977043873, "loss": 0.8753, "step": 810 }, { "epoch": 0.1543560606060606, "grad_norm": 0.059895919738978086, "learning_rate": 0.00029730803977165643, "loss": 0.9159, "step": 815 }, { "epoch": 0.1553030303030303, "grad_norm": 0.04706104790187168, "learning_rate": 0.00029721372019147314, "loss": 0.9117, "step": 820 }, { "epoch": 0.15625, "grad_norm": 0.05202762617344226, "learning_rate": 0.00029711779206048454, "loss": 0.8807, "step": 825 }, { "epoch": 0.1571969696969697, "grad_norm": 0.050944436154957536, "learning_rate": 0.0002970202564268625, "loss": 0.8665, "step": 830 }, { "epoch": 0.1581439393939394, "grad_norm": 0.050530106988524406, "learning_rate": 0.00029692111435634347, "loss": 0.853, "step": 835 }, { "epoch": 0.1590909090909091, "grad_norm": 0.047944409481566634, "learning_rate": 0.0002968203669322168, "loss": 0.8719, "step": 840 }, { "epoch": 0.16003787878787878, "grad_norm": 0.05621479623321537, "learning_rate": 0.0002967180152553129, "loss": 0.8602, "step": 845 }, { "epoch": 0.16098484848484848, "grad_norm": 0.053970797332012134, "learning_rate": 0.0002966140604439914, "loss": 0.8804, "step": 850 }, { "epoch": 0.16193181818181818, "grad_norm": 0.05575444277143619, "learning_rate": 0.0002965085036341287, "loss": 0.8672, "step": 855 }, { "epoch": 0.16287878787878787, "grad_norm": 0.054447528875280926, "learning_rate": 0.0002964013459791057, "loss": 0.8705, "step": 860 }, { "epoch": 0.16382575757575757, "grad_norm": 0.04819722412795564, "learning_rate": 0.0002962925886497952, "loss": 0.885, "step": 865 }, { "epoch": 0.16477272727272727, "grad_norm": 0.04927047066910389, "learning_rate": 0.00029618223283454893, "loss": 0.8793, "step": 870 }, { "epoch": 0.16571969696969696, "grad_norm": 0.05156180447905288, "learning_rate": 0.0002960702797391848, "loss": 0.8697, "step": 875 }, { "epoch": 0.16666666666666666, "grad_norm": 0.047498663975991506, "learning_rate": 0.00029595673058697357, "loss": 0.8944, "step": 880 }, { "epoch": 0.16761363636363635, "grad_norm": 0.04931121452183828, "learning_rate": 0.0002958415866186255, "loss": 0.8708, "step": 885 }, { "epoch": 0.16856060606060605, "grad_norm": 0.04971806922102403, "learning_rate": 0.000295724849092277, "loss": 0.886, "step": 890 }, { "epoch": 0.16950757575757575, "grad_norm": 0.04782190553843517, "learning_rate": 0.0002956065192834765, "loss": 0.8625, "step": 895 }, { "epoch": 0.17045454545454544, "grad_norm": 0.05985135374827048, "learning_rate": 0.00029548659848517073, "loss": 0.8572, "step": 900 }, { "epoch": 0.17140151515151514, "grad_norm": 0.052563040632905934, "learning_rate": 0.00029536508800769083, "loss": 0.8527, "step": 905 }, { "epoch": 0.17234848484848486, "grad_norm": 0.04857882149280908, "learning_rate": 0.0002952419891787375, "loss": 0.8739, "step": 910 }, { "epoch": 0.17329545454545456, "grad_norm": 0.0479060897399876, "learning_rate": 0.00029511730334336693, "loss": 0.8905, "step": 915 }, { "epoch": 0.17424242424242425, "grad_norm": 0.05040950791010248, "learning_rate": 0.00029499103186397596, "loss": 0.8738, "step": 920 }, { "epoch": 0.17518939393939395, "grad_norm": 0.04549261460480799, "learning_rate": 0.00029486317612028705, "loss": 0.8697, "step": 925 }, { "epoch": 0.17613636363636365, "grad_norm": 0.04640647025353498, "learning_rate": 0.00029473373750933354, "loss": 0.8697, "step": 930 }, { "epoch": 0.17708333333333334, "grad_norm": 0.04813238779352585, "learning_rate": 0.0002946027174454439, "loss": 0.8691, "step": 935 }, { "epoch": 0.17803030303030304, "grad_norm": 0.04735624926206103, "learning_rate": 0.0002944701173602269, "loss": 0.8785, "step": 940 }, { "epoch": 0.17897727272727273, "grad_norm": 0.04918343048516045, "learning_rate": 0.00029433593870255547, "loss": 0.8832, "step": 945 }, { "epoch": 0.17992424242424243, "grad_norm": 0.05187324555019995, "learning_rate": 0.00029420018293855097, "loss": 0.8931, "step": 950 }, { "epoch": 0.18087121212121213, "grad_norm": 0.051833428541254264, "learning_rate": 0.0002940628515515673, "loss": 0.8505, "step": 955 }, { "epoch": 0.18181818181818182, "grad_norm": 0.04842441750953903, "learning_rate": 0.0002939239460421746, "loss": 0.8619, "step": 960 }, { "epoch": 0.18276515151515152, "grad_norm": 0.04450318082424258, "learning_rate": 0.00029378346792814284, "loss": 0.8935, "step": 965 }, { "epoch": 0.18371212121212122, "grad_norm": 0.050265508339128746, "learning_rate": 0.00029364141874442534, "loss": 0.8875, "step": 970 }, { "epoch": 0.1846590909090909, "grad_norm": 0.0489312969341679, "learning_rate": 0.00029349780004314196, "loss": 0.8707, "step": 975 }, { "epoch": 0.1856060606060606, "grad_norm": 0.0448678465223849, "learning_rate": 0.0002933526133935619, "loss": 0.8759, "step": 980 }, { "epoch": 0.1865530303030303, "grad_norm": 0.04933677179150076, "learning_rate": 0.000293205860382087, "loss": 0.8761, "step": 985 }, { "epoch": 0.1875, "grad_norm": 0.04538600611093541, "learning_rate": 0.000293057542612234, "loss": 0.8683, "step": 990 }, { "epoch": 0.1884469696969697, "grad_norm": 0.04816654163118486, "learning_rate": 0.00029290766170461733, "loss": 0.8575, "step": 995 }, { "epoch": 0.1893939393939394, "grad_norm": 0.05162840473736974, "learning_rate": 0.0002927562192969312, "loss": 0.8788, "step": 1000 }, { "epoch": 0.1903409090909091, "grad_norm": 0.0526139634089819, "learning_rate": 0.00029260321704393166, "loss": 0.8842, "step": 1005 }, { "epoch": 0.19128787878787878, "grad_norm": 0.06020706080827499, "learning_rate": 0.0002924486566174187, "loss": 0.8873, "step": 1010 }, { "epoch": 0.19223484848484848, "grad_norm": 0.05225939758166581, "learning_rate": 0.00029229253970621796, "loss": 0.8354, "step": 1015 }, { "epoch": 0.19318181818181818, "grad_norm": 0.05284878294440371, "learning_rate": 0.0002921348680161622, "loss": 0.9025, "step": 1020 }, { "epoch": 0.19412878787878787, "grad_norm": 0.059173148849731974, "learning_rate": 0.00029197564327007266, "loss": 0.8405, "step": 1025 }, { "epoch": 0.19507575757575757, "grad_norm": 0.05723447002828778, "learning_rate": 0.00029181486720774024, "loss": 0.9033, "step": 1030 }, { "epoch": 0.19602272727272727, "grad_norm": 0.04961291522370079, "learning_rate": 0.0002916525415859065, "loss": 0.8517, "step": 1035 }, { "epoch": 0.19696969696969696, "grad_norm": 0.04405850577071398, "learning_rate": 0.0002914886681782445, "loss": 0.8605, "step": 1040 }, { "epoch": 0.19791666666666666, "grad_norm": 0.052549109623340674, "learning_rate": 0.00029132324877533943, "loss": 0.8903, "step": 1045 }, { "epoch": 0.19886363636363635, "grad_norm": 0.052002448553744814, "learning_rate": 0.000291156285184669, "loss": 0.8673, "step": 1050 }, { "epoch": 0.19981060606060605, "grad_norm": 0.057528545450935206, "learning_rate": 0.0002909877792305836, "loss": 0.8693, "step": 1055 }, { "epoch": 0.20075757575757575, "grad_norm": 0.05849975352441284, "learning_rate": 0.0002908177327542866, "loss": 0.8806, "step": 1060 }, { "epoch": 0.20170454545454544, "grad_norm": 0.0668330093981767, "learning_rate": 0.00029064614761381395, "loss": 0.8573, "step": 1065 }, { "epoch": 0.20265151515151514, "grad_norm": 0.08085163002687007, "learning_rate": 0.0002904730256840142, "loss": 0.8588, "step": 1070 }, { "epoch": 0.20359848484848486, "grad_norm": 0.06115289967256575, "learning_rate": 0.0002902983688565276, "loss": 0.8489, "step": 1075 }, { "epoch": 0.20454545454545456, "grad_norm": 0.04578661935010964, "learning_rate": 0.000290122179039766, "loss": 0.8647, "step": 1080 }, { "epoch": 0.20549242424242425, "grad_norm": 0.04738987289722607, "learning_rate": 0.00028994445815889135, "loss": 0.8928, "step": 1085 }, { "epoch": 0.20643939393939395, "grad_norm": 0.048922711283470234, "learning_rate": 0.00028976520815579516, "loss": 0.8571, "step": 1090 }, { "epoch": 0.20738636363636365, "grad_norm": 0.042842101095754544, "learning_rate": 0.000289584430989077, "loss": 0.8994, "step": 1095 }, { "epoch": 0.20833333333333334, "grad_norm": 0.049240088817609676, "learning_rate": 0.0002894021286340233, "loss": 0.8703, "step": 1100 }, { "epoch": 0.20928030303030304, "grad_norm": 0.04405475980914324, "learning_rate": 0.0002892183030825857, "loss": 0.8697, "step": 1105 }, { "epoch": 0.21022727272727273, "grad_norm": 0.052077127621998655, "learning_rate": 0.00028903295634335904, "loss": 0.8995, "step": 1110 }, { "epoch": 0.21117424242424243, "grad_norm": 0.059590406683339404, "learning_rate": 0.00028884609044155983, "loss": 0.8798, "step": 1115 }, { "epoch": 0.21212121212121213, "grad_norm": 0.044865973163293216, "learning_rate": 0.0002886577074190038, "loss": 0.8965, "step": 1120 }, { "epoch": 0.21306818181818182, "grad_norm": 0.05169860969332574, "learning_rate": 0.0002884678093340838, "loss": 0.8554, "step": 1125 }, { "epoch": 0.21401515151515152, "grad_norm": 0.05156894746619203, "learning_rate": 0.00028827639826174716, "loss": 0.8727, "step": 1130 }, { "epoch": 0.21496212121212122, "grad_norm": 0.04493193351896756, "learning_rate": 0.0002880834762934731, "loss": 0.8659, "step": 1135 }, { "epoch": 0.2159090909090909, "grad_norm": 0.04709600714027476, "learning_rate": 0.0002878890455372498, "loss": 0.8494, "step": 1140 }, { "epoch": 0.2168560606060606, "grad_norm": 0.040578982776887075, "learning_rate": 0.00028769310811755153, "loss": 0.8345, "step": 1145 }, { "epoch": 0.2178030303030303, "grad_norm": 0.0454120547462, "learning_rate": 0.0002874956661753152, "loss": 0.893, "step": 1150 }, { "epoch": 0.21875, "grad_norm": 0.0458718950734734, "learning_rate": 0.00028729672186791704, "loss": 0.8453, "step": 1155 }, { "epoch": 0.2196969696969697, "grad_norm": 0.04315005418940203, "learning_rate": 0.0002870962773691493, "loss": 0.8389, "step": 1160 }, { "epoch": 0.2206439393939394, "grad_norm": 0.04752379784930806, "learning_rate": 0.00028689433486919617, "loss": 0.8673, "step": 1165 }, { "epoch": 0.2215909090909091, "grad_norm": 0.05684552525580732, "learning_rate": 0.00028669089657460984, "loss": 0.867, "step": 1170 }, { "epoch": 0.22253787878787878, "grad_norm": 0.050983860484211566, "learning_rate": 0.00028648596470828673, "loss": 0.8647, "step": 1175 }, { "epoch": 0.22348484848484848, "grad_norm": 0.046503504621597226, "learning_rate": 0.0002862795415094427, "loss": 0.8697, "step": 1180 }, { "epoch": 0.22443181818181818, "grad_norm": 0.0450753851181475, "learning_rate": 0.0002860716292335891, "loss": 0.8249, "step": 1185 }, { "epoch": 0.22537878787878787, "grad_norm": 0.04485421555009402, "learning_rate": 0.0002858622301525078, "loss": 0.8637, "step": 1190 }, { "epoch": 0.22632575757575757, "grad_norm": 0.04710476072245473, "learning_rate": 0.0002856513465542263, "loss": 0.8712, "step": 1195 }, { "epoch": 0.22727272727272727, "grad_norm": 0.04590515210754422, "learning_rate": 0.00028543898074299317, "loss": 0.8899, "step": 1200 }, { "epoch": 0.22821969696969696, "grad_norm": 0.046677806922824555, "learning_rate": 0.00028522513503925236, "loss": 0.8331, "step": 1205 }, { "epoch": 0.22916666666666666, "grad_norm": 0.05326891301520586, "learning_rate": 0.00028500981177961816, "loss": 0.8506, "step": 1210 }, { "epoch": 0.23011363636363635, "grad_norm": 0.04687694995176648, "learning_rate": 0.0002847930133168495, "loss": 0.8718, "step": 1215 }, { "epoch": 0.23106060606060605, "grad_norm": 0.043766596329374095, "learning_rate": 0.0002845747420198245, "loss": 0.8355, "step": 1220 }, { "epoch": 0.23200757575757575, "grad_norm": 0.051852335183307466, "learning_rate": 0.00028435500027351415, "loss": 0.9018, "step": 1225 }, { "epoch": 0.23295454545454544, "grad_norm": 0.046871786280526524, "learning_rate": 0.00028413379047895665, "loss": 0.8773, "step": 1230 }, { "epoch": 0.23390151515151514, "grad_norm": 0.05319047340562311, "learning_rate": 0.0002839111150532311, "loss": 0.8744, "step": 1235 }, { "epoch": 0.23484848484848486, "grad_norm": 0.0480557659328724, "learning_rate": 0.0002836869764294308, "loss": 0.8543, "step": 1240 }, { "epoch": 0.23579545454545456, "grad_norm": 0.045235971797863456, "learning_rate": 0.0002834613770566371, "loss": 0.8811, "step": 1245 }, { "epoch": 0.23674242424242425, "grad_norm": 0.051086084811488776, "learning_rate": 0.0002832343193998923, "loss": 0.8688, "step": 1250 }, { "epoch": 0.23768939393939395, "grad_norm": 0.04921720651133015, "learning_rate": 0.00028300580594017296, "loss": 0.8556, "step": 1255 }, { "epoch": 0.23863636363636365, "grad_norm": 0.046260991109867027, "learning_rate": 0.00028277583917436246, "loss": 0.8536, "step": 1260 }, { "epoch": 0.23958333333333334, "grad_norm": 0.04507558602085682, "learning_rate": 0.00028254442161522415, "loss": 0.8606, "step": 1265 }, { "epoch": 0.24053030303030304, "grad_norm": 0.04347528822105258, "learning_rate": 0.00028231155579137347, "loss": 0.8224, "step": 1270 }, { "epoch": 0.24147727272727273, "grad_norm": 0.044645406213969646, "learning_rate": 0.00028207724424725067, "loss": 0.8103, "step": 1275 }, { "epoch": 0.24242424242424243, "grad_norm": 0.04886991006139869, "learning_rate": 0.0002818414895430929, "loss": 0.8681, "step": 1280 }, { "epoch": 0.24337121212121213, "grad_norm": 0.050211499173204034, "learning_rate": 0.000281604294254906, "loss": 0.8465, "step": 1285 }, { "epoch": 0.24431818181818182, "grad_norm": 0.04407450456467826, "learning_rate": 0.0002813656609744367, "loss": 0.8587, "step": 1290 }, { "epoch": 0.24526515151515152, "grad_norm": 0.046396078514299374, "learning_rate": 0.00028112559230914413, "loss": 0.8836, "step": 1295 }, { "epoch": 0.24621212121212122, "grad_norm": 0.04992457772876899, "learning_rate": 0.0002808840908821713, "loss": 0.847, "step": 1300 }, { "epoch": 0.2471590909090909, "grad_norm": 0.06899088573327124, "learning_rate": 0.00028064115933231653, "loss": 0.8284, "step": 1305 }, { "epoch": 0.2481060606060606, "grad_norm": 0.047887893856181384, "learning_rate": 0.00028039680031400455, "loss": 0.8428, "step": 1310 }, { "epoch": 0.2490530303030303, "grad_norm": 0.04582410718511601, "learning_rate": 0.00028015101649725747, "loss": 0.8384, "step": 1315 }, { "epoch": 0.25, "grad_norm": 0.045636060323846026, "learning_rate": 0.0002799038105676658, "loss": 0.843, "step": 1320 }, { "epoch": 0.2509469696969697, "grad_norm": 0.04519189300518443, "learning_rate": 0.0002796551852263588, "loss": 0.8908, "step": 1325 }, { "epoch": 0.2518939393939394, "grad_norm": 0.045578564236409914, "learning_rate": 0.00027940514318997516, "loss": 0.8572, "step": 1330 }, { "epoch": 0.2528409090909091, "grad_norm": 0.051686507213315025, "learning_rate": 0.0002791536871906334, "loss": 0.8619, "step": 1335 }, { "epoch": 0.2537878787878788, "grad_norm": 0.04345687899653502, "learning_rate": 0.0002789008199759018, "loss": 0.8459, "step": 1340 }, { "epoch": 0.2547348484848485, "grad_norm": 0.046507137153713074, "learning_rate": 0.0002786465443087685, "loss": 0.8607, "step": 1345 }, { "epoch": 0.2556818181818182, "grad_norm": 0.042657023216453836, "learning_rate": 0.0002783908629676112, "loss": 0.8548, "step": 1350 }, { "epoch": 0.2566287878787879, "grad_norm": 0.040538053747547015, "learning_rate": 0.00027813377874616707, "loss": 0.8389, "step": 1355 }, { "epoch": 0.25757575757575757, "grad_norm": 0.04525382597081455, "learning_rate": 0.0002778752944535019, "loss": 0.8372, "step": 1360 }, { "epoch": 0.2585227272727273, "grad_norm": 0.04234205633912051, "learning_rate": 0.00027761541291397964, "loss": 0.8426, "step": 1365 }, { "epoch": 0.25946969696969696, "grad_norm": 0.04757806675030589, "learning_rate": 0.00027735413696723123, "loss": 0.8459, "step": 1370 }, { "epoch": 0.2604166666666667, "grad_norm": 0.043206314979838206, "learning_rate": 0.00027709146946812413, "loss": 0.8384, "step": 1375 }, { "epoch": 0.26136363636363635, "grad_norm": 0.04812617395176995, "learning_rate": 0.00027682741328673063, "loss": 0.83, "step": 1380 }, { "epoch": 0.2623106060606061, "grad_norm": 0.04595880004567738, "learning_rate": 0.0002765619713082965, "loss": 0.8704, "step": 1385 }, { "epoch": 0.26325757575757575, "grad_norm": 0.04342248392045638, "learning_rate": 0.0002762951464332098, "loss": 0.8545, "step": 1390 }, { "epoch": 0.26420454545454547, "grad_norm": 0.0468715942875786, "learning_rate": 0.0002760269415769691, "loss": 0.854, "step": 1395 }, { "epoch": 0.26515151515151514, "grad_norm": 0.045557679264808, "learning_rate": 0.0002757573596701511, "loss": 0.8543, "step": 1400 }, { "epoch": 0.26609848484848486, "grad_norm": 0.045277257320571196, "learning_rate": 0.0002754864036583795, "loss": 0.8519, "step": 1405 }, { "epoch": 0.26704545454545453, "grad_norm": 0.04446480155061431, "learning_rate": 0.000275214076502292, "loss": 0.852, "step": 1410 }, { "epoch": 0.26799242424242425, "grad_norm": 0.04286461978075112, "learning_rate": 0.00027494038117750855, "loss": 0.873, "step": 1415 }, { "epoch": 0.2689393939393939, "grad_norm": 0.04635120751169076, "learning_rate": 0.0002746653206745984, "loss": 0.8675, "step": 1420 }, { "epoch": 0.26988636363636365, "grad_norm": 0.04897795119080024, "learning_rate": 0.0002743888979990477, "loss": 0.8489, "step": 1425 }, { "epoch": 0.2708333333333333, "grad_norm": 0.045351523544172836, "learning_rate": 0.00027411111617122656, "loss": 0.8815, "step": 1430 }, { "epoch": 0.27178030303030304, "grad_norm": 0.046888003925487816, "learning_rate": 0.00027383197822635597, "loss": 0.8619, "step": 1435 }, { "epoch": 0.2727272727272727, "grad_norm": 0.050747409625094775, "learning_rate": 0.0002735514872144749, "loss": 0.877, "step": 1440 }, { "epoch": 0.27367424242424243, "grad_norm": 0.04867163124702627, "learning_rate": 0.0002732696462004066, "loss": 0.86, "step": 1445 }, { "epoch": 0.2746212121212121, "grad_norm": 0.053312130647735934, "learning_rate": 0.00027298645826372527, "loss": 0.8609, "step": 1450 }, { "epoch": 0.2755681818181818, "grad_norm": 0.04023418732684079, "learning_rate": 0.0002727019264987227, "loss": 0.8598, "step": 1455 }, { "epoch": 0.2765151515151515, "grad_norm": 0.04347366104067643, "learning_rate": 0.000272416054014374, "loss": 0.8443, "step": 1460 }, { "epoch": 0.2774621212121212, "grad_norm": 0.042854675811405736, "learning_rate": 0.00027212884393430396, "loss": 0.8632, "step": 1465 }, { "epoch": 0.2784090909090909, "grad_norm": 0.04461599878281101, "learning_rate": 0.0002718402993967526, "loss": 0.8469, "step": 1470 }, { "epoch": 0.2793560606060606, "grad_norm": 0.0458799502796299, "learning_rate": 0.0002715504235545412, "loss": 0.8675, "step": 1475 }, { "epoch": 0.2803030303030303, "grad_norm": 0.041761756053765885, "learning_rate": 0.0002712592195750378, "loss": 0.8751, "step": 1480 }, { "epoch": 0.28125, "grad_norm": 0.04293009223271159, "learning_rate": 0.0002709666906401224, "loss": 0.8591, "step": 1485 }, { "epoch": 0.2821969696969697, "grad_norm": 0.042628404150602366, "learning_rate": 0.00027067283994615225, "loss": 0.8314, "step": 1490 }, { "epoch": 0.2831439393939394, "grad_norm": 0.043803929434188336, "learning_rate": 0.0002703776707039271, "loss": 0.8515, "step": 1495 }, { "epoch": 0.2840909090909091, "grad_norm": 0.047256485311155456, "learning_rate": 0.00027008118613865406, "loss": 0.8376, "step": 1500 }, { "epoch": 0.2850378787878788, "grad_norm": 0.046926959946348615, "learning_rate": 0.00026978338948991206, "loss": 0.8423, "step": 1505 }, { "epoch": 0.2859848484848485, "grad_norm": 0.04941952831110132, "learning_rate": 0.0002694842840116169, "loss": 0.8564, "step": 1510 }, { "epoch": 0.2869318181818182, "grad_norm": 0.04638823285342314, "learning_rate": 0.0002691838729719854, "loss": 0.851, "step": 1515 }, { "epoch": 0.2878787878787879, "grad_norm": 0.051062848616744594, "learning_rate": 0.0002688821596534997, "loss": 0.8592, "step": 1520 }, { "epoch": 0.28882575757575757, "grad_norm": 0.048642765971924094, "learning_rate": 0.00026857914735287173, "loss": 0.8651, "step": 1525 }, { "epoch": 0.2897727272727273, "grad_norm": 0.041614396540575575, "learning_rate": 0.0002682748393810066, "loss": 0.853, "step": 1530 }, { "epoch": 0.29071969696969696, "grad_norm": 0.04037850703898104, "learning_rate": 0.0002679692390629669, "loss": 0.8714, "step": 1535 }, { "epoch": 0.2916666666666667, "grad_norm": 0.045919213909734996, "learning_rate": 0.0002676623497379363, "loss": 0.8526, "step": 1540 }, { "epoch": 0.29261363636363635, "grad_norm": 0.0435916558717206, "learning_rate": 0.00026735417475918285, "loss": 0.8474, "step": 1545 }, { "epoch": 0.2935606060606061, "grad_norm": 0.04829847525687287, "learning_rate": 0.00026704471749402256, "loss": 0.8548, "step": 1550 }, { "epoch": 0.29450757575757575, "grad_norm": 0.04805727825764429, "learning_rate": 0.0002667339813237824, "loss": 0.8453, "step": 1555 }, { "epoch": 0.29545454545454547, "grad_norm": 0.05010295983510741, "learning_rate": 0.0002664219696437635, "loss": 0.8416, "step": 1560 }, { "epoch": 0.29640151515151514, "grad_norm": 0.04388851666661931, "learning_rate": 0.00026610868586320416, "loss": 0.8341, "step": 1565 }, { "epoch": 0.29734848484848486, "grad_norm": 0.045305461318018866, "learning_rate": 0.00026579413340524233, "loss": 0.8322, "step": 1570 }, { "epoch": 0.29829545454545453, "grad_norm": 0.04178070112825466, "learning_rate": 0.0002654783157068785, "loss": 0.8798, "step": 1575 }, { "epoch": 0.29924242424242425, "grad_norm": 0.039660241288771, "learning_rate": 0.00026516123621893756, "loss": 0.8512, "step": 1580 }, { "epoch": 0.3001893939393939, "grad_norm": 0.04956139252725399, "learning_rate": 0.0002648428984060321, "loss": 0.8531, "step": 1585 }, { "epoch": 0.30113636363636365, "grad_norm": 0.04050711765679051, "learning_rate": 0.0002645233057465235, "loss": 0.8714, "step": 1590 }, { "epoch": 0.3020833333333333, "grad_norm": 0.044882658138218526, "learning_rate": 0.00026420246173248466, "loss": 0.8576, "step": 1595 }, { "epoch": 0.30303030303030304, "grad_norm": 0.0443601837928335, "learning_rate": 0.00026388036986966146, "loss": 0.8458, "step": 1600 }, { "epoch": 0.3039772727272727, "grad_norm": 0.04445201847842639, "learning_rate": 0.00026355703367743463, "loss": 0.8262, "step": 1605 }, { "epoch": 0.30492424242424243, "grad_norm": 0.04296295815749959, "learning_rate": 0.0002632324566887811, "loss": 0.852, "step": 1610 }, { "epoch": 0.3058712121212121, "grad_norm": 0.04595484323544366, "learning_rate": 0.0002629066424502358, "loss": 0.8712, "step": 1615 }, { "epoch": 0.3068181818181818, "grad_norm": 0.05135389183761192, "learning_rate": 0.0002625795945218523, "loss": 0.8686, "step": 1620 }, { "epoch": 0.3077651515151515, "grad_norm": 0.05402260665782284, "learning_rate": 0.00026225131647716454, "loss": 0.8705, "step": 1625 }, { "epoch": 0.3087121212121212, "grad_norm": 0.04647458435053281, "learning_rate": 0.00026192181190314734, "loss": 0.8497, "step": 1630 }, { "epoch": 0.3096590909090909, "grad_norm": 0.04501320267899854, "learning_rate": 0.0002615910844001774, "loss": 0.8699, "step": 1635 }, { "epoch": 0.3106060606060606, "grad_norm": 0.044016650809303635, "learning_rate": 0.0002612591375819939, "loss": 0.8451, "step": 1640 }, { "epoch": 0.3115530303030303, "grad_norm": 0.04270641863707379, "learning_rate": 0.0002609259750756591, "loss": 0.8264, "step": 1645 }, { "epoch": 0.3125, "grad_norm": 0.0454677296060317, "learning_rate": 0.0002605916005215186, "loss": 0.8344, "step": 1650 }, { "epoch": 0.3134469696969697, "grad_norm": 0.04056658550140151, "learning_rate": 0.0002602560175731615, "loss": 0.8187, "step": 1655 }, { "epoch": 0.3143939393939394, "grad_norm": 0.047795906369322495, "learning_rate": 0.0002599192298973808, "loss": 0.8596, "step": 1660 }, { "epoch": 0.3153409090909091, "grad_norm": 0.04746275776015859, "learning_rate": 0.00025958124117413296, "loss": 0.8373, "step": 1665 }, { "epoch": 0.3162878787878788, "grad_norm": 0.0490167919610274, "learning_rate": 0.0002592420550964979, "loss": 0.8605, "step": 1670 }, { "epoch": 0.3172348484848485, "grad_norm": 0.0425174441426416, "learning_rate": 0.00025890167537063856, "loss": 0.8466, "step": 1675 }, { "epoch": 0.3181818181818182, "grad_norm": 0.04266217832901279, "learning_rate": 0.0002585601057157605, "loss": 0.853, "step": 1680 }, { "epoch": 0.3191287878787879, "grad_norm": 0.042298923569418494, "learning_rate": 0.00025821734986407113, "loss": 0.852, "step": 1685 }, { "epoch": 0.32007575757575757, "grad_norm": 0.03839109699626175, "learning_rate": 0.00025787341156073915, "loss": 0.8079, "step": 1690 }, { "epoch": 0.3210227272727273, "grad_norm": 0.046090433170696436, "learning_rate": 0.0002575282945638532, "loss": 0.8622, "step": 1695 }, { "epoch": 0.32196969696969696, "grad_norm": 0.043394806500603955, "learning_rate": 0.0002571820026443814, "loss": 0.8569, "step": 1700 }, { "epoch": 0.3229166666666667, "grad_norm": 0.04364474268105156, "learning_rate": 0.00025683453958612963, "loss": 0.859, "step": 1705 }, { "epoch": 0.32386363636363635, "grad_norm": 0.04738452037799788, "learning_rate": 0.0002564859091857004, "loss": 0.8639, "step": 1710 }, { "epoch": 0.3248106060606061, "grad_norm": 0.046706389086655475, "learning_rate": 0.0002561361152524513, "loss": 0.8685, "step": 1715 }, { "epoch": 0.32575757575757575, "grad_norm": 0.04065493744655141, "learning_rate": 0.0002557851616084536, "loss": 0.8287, "step": 1720 }, { "epoch": 0.32670454545454547, "grad_norm": 0.042170889184096456, "learning_rate": 0.00025543305208845015, "loss": 0.8397, "step": 1725 }, { "epoch": 0.32765151515151514, "grad_norm": 0.057009989738631466, "learning_rate": 0.0002550797905398136, "loss": 0.8424, "step": 1730 }, { "epoch": 0.32859848484848486, "grad_norm": 0.054999335357357654, "learning_rate": 0.0002547253808225045, "loss": 0.8493, "step": 1735 }, { "epoch": 0.32954545454545453, "grad_norm": 0.04878510537743201, "learning_rate": 0.0002543698268090291, "loss": 0.8687, "step": 1740 }, { "epoch": 0.33049242424242425, "grad_norm": 0.04555832298125919, "learning_rate": 0.0002540131323843968, "loss": 0.848, "step": 1745 }, { "epoch": 0.3314393939393939, "grad_norm": 0.04559121786828398, "learning_rate": 0.0002536553014460778, "loss": 0.8422, "step": 1750 }, { "epoch": 0.33238636363636365, "grad_norm": 0.0412750415651045, "learning_rate": 0.00025329633790396086, "loss": 0.8528, "step": 1755 }, { "epoch": 0.3333333333333333, "grad_norm": 0.041659064100662745, "learning_rate": 0.00025293624568031, "loss": 0.8587, "step": 1760 }, { "epoch": 0.33428030303030304, "grad_norm": 0.04017536408178578, "learning_rate": 0.0002525750287097221, "loss": 0.8273, "step": 1765 }, { "epoch": 0.3352272727272727, "grad_norm": 0.043594334725810896, "learning_rate": 0.00025221269093908365, "loss": 0.8344, "step": 1770 }, { "epoch": 0.33617424242424243, "grad_norm": 0.03975857698007793, "learning_rate": 0.00025184923632752776, "loss": 0.8312, "step": 1775 }, { "epoch": 0.3371212121212121, "grad_norm": 0.043307661902050605, "learning_rate": 0.0002514846688463909, "loss": 0.8384, "step": 1780 }, { "epoch": 0.3380681818181818, "grad_norm": 0.041752317967921206, "learning_rate": 0.00025111899247916926, "loss": 0.8407, "step": 1785 }, { "epoch": 0.3390151515151515, "grad_norm": 0.042776187451381745, "learning_rate": 0.0002507522112214758, "loss": 0.8217, "step": 1790 }, { "epoch": 0.3399621212121212, "grad_norm": 0.04588631965260685, "learning_rate": 0.0002503843290809958, "loss": 0.8546, "step": 1795 }, { "epoch": 0.3409090909090909, "grad_norm": 0.043083702336913526, "learning_rate": 0.00025001535007744373, "loss": 0.8378, "step": 1800 }, { "epoch": 0.3418560606060606, "grad_norm": 0.04134116740011592, "learning_rate": 0.00024964527824251903, "loss": 0.8525, "step": 1805 }, { "epoch": 0.3428030303030303, "grad_norm": 0.04016855550268295, "learning_rate": 0.00024927411761986216, "loss": 0.8114, "step": 1810 }, { "epoch": 0.34375, "grad_norm": 0.04440153926149279, "learning_rate": 0.0002489018722650103, "loss": 0.8502, "step": 1815 }, { "epoch": 0.3446969696969697, "grad_norm": 0.056819456110476556, "learning_rate": 0.00024852854624535307, "loss": 0.8235, "step": 1820 }, { "epoch": 0.3456439393939394, "grad_norm": 0.05226275677923263, "learning_rate": 0.00024815414364008826, "loss": 0.8361, "step": 1825 }, { "epoch": 0.3465909090909091, "grad_norm": 0.044209574304730354, "learning_rate": 0.0002477786685401769, "loss": 0.8408, "step": 1830 }, { "epoch": 0.3475378787878788, "grad_norm": 0.04524443871766083, "learning_rate": 0.0002474021250482991, "loss": 0.837, "step": 1835 }, { "epoch": 0.3484848484848485, "grad_norm": 0.045412892342884655, "learning_rate": 0.0002470245172788086, "loss": 0.8386, "step": 1840 }, { "epoch": 0.3494318181818182, "grad_norm": 0.04952839628457116, "learning_rate": 0.0002466458493576882, "loss": 0.8396, "step": 1845 }, { "epoch": 0.3503787878787879, "grad_norm": 0.05660754811136995, "learning_rate": 0.0002462661254225047, "loss": 0.881, "step": 1850 }, { "epoch": 0.35132575757575757, "grad_norm": 0.04478496122656017, "learning_rate": 0.00024588534962236344, "loss": 0.8725, "step": 1855 }, { "epoch": 0.3522727272727273, "grad_norm": 0.047218982842698924, "learning_rate": 0.0002455035261178632, "loss": 0.8637, "step": 1860 }, { "epoch": 0.35321969696969696, "grad_norm": 0.041481601312663416, "learning_rate": 0.0002451206590810506, "loss": 0.8217, "step": 1865 }, { "epoch": 0.3541666666666667, "grad_norm": 0.04270379380461237, "learning_rate": 0.0002447367526953746, "loss": 0.8779, "step": 1870 }, { "epoch": 0.35511363636363635, "grad_norm": 0.03961220993813286, "learning_rate": 0.0002443518111556407, "loss": 0.8625, "step": 1875 }, { "epoch": 0.3560606060606061, "grad_norm": 0.042995120300460946, "learning_rate": 0.00024396583866796517, "loss": 0.8335, "step": 1880 }, { "epoch": 0.35700757575757575, "grad_norm": 0.04563005281967126, "learning_rate": 0.00024357883944972904, "loss": 0.8734, "step": 1885 }, { "epoch": 0.35795454545454547, "grad_norm": 0.046070346094399604, "learning_rate": 0.00024319081772953213, "loss": 0.8503, "step": 1890 }, { "epoch": 0.35890151515151514, "grad_norm": 0.046399508495104796, "learning_rate": 0.0002428017777471467, "loss": 0.8468, "step": 1895 }, { "epoch": 0.35984848484848486, "grad_norm": 0.04153432929534662, "learning_rate": 0.0002424117237534712, "loss": 0.8511, "step": 1900 }, { "epoch": 0.36079545454545453, "grad_norm": 0.04277554441903786, "learning_rate": 0.0002420206600104839, "loss": 0.8517, "step": 1905 }, { "epoch": 0.36174242424242425, "grad_norm": 0.041758108039158866, "learning_rate": 0.0002416285907911961, "loss": 0.8114, "step": 1910 }, { "epoch": 0.3626893939393939, "grad_norm": 0.05362310318010687, "learning_rate": 0.0002412355203796056, "loss": 0.8584, "step": 1915 }, { "epoch": 0.36363636363636365, "grad_norm": 0.0415342758524878, "learning_rate": 0.00024084145307064997, "loss": 0.8338, "step": 1920 }, { "epoch": 0.3645833333333333, "grad_norm": 0.04660724457587357, "learning_rate": 0.00024044639317015942, "loss": 0.8458, "step": 1925 }, { "epoch": 0.36553030303030304, "grad_norm": 0.0420525971980156, "learning_rate": 0.00024005034499480983, "loss": 0.8127, "step": 1930 }, { "epoch": 0.3664772727272727, "grad_norm": 0.04142360417103535, "learning_rate": 0.0002396533128720757, "loss": 0.8255, "step": 1935 }, { "epoch": 0.36742424242424243, "grad_norm": 0.04258598086280745, "learning_rate": 0.0002392553011401827, "loss": 0.8083, "step": 1940 }, { "epoch": 0.3683712121212121, "grad_norm": 0.04277812754465849, "learning_rate": 0.00023885631414806026, "loss": 0.8093, "step": 1945 }, { "epoch": 0.3693181818181818, "grad_norm": 0.048152407374910465, "learning_rate": 0.0002384563562552943, "loss": 0.8265, "step": 1950 }, { "epoch": 0.3702651515151515, "grad_norm": 0.047713607129726394, "learning_rate": 0.00023805543183207927, "loss": 0.8302, "step": 1955 }, { "epoch": 0.3712121212121212, "grad_norm": 0.04316835360005878, "learning_rate": 0.00023765354525917063, "loss": 0.8699, "step": 1960 }, { "epoch": 0.3721590909090909, "grad_norm": 0.04318616029335284, "learning_rate": 0.0002372507009278368, "loss": 0.8369, "step": 1965 }, { "epoch": 0.3731060606060606, "grad_norm": 0.04234533104631891, "learning_rate": 0.00023684690323981142, "loss": 0.8252, "step": 1970 }, { "epoch": 0.3740530303030303, "grad_norm": 0.03731733862303514, "learning_rate": 0.00023644215660724503, "loss": 0.8043, "step": 1975 }, { "epoch": 0.375, "grad_norm": 0.04858576668810662, "learning_rate": 0.00023603646545265687, "loss": 0.8011, "step": 1980 }, { "epoch": 0.3759469696969697, "grad_norm": 0.03947493260183539, "learning_rate": 0.00023562983420888684, "loss": 0.8456, "step": 1985 }, { "epoch": 0.3768939393939394, "grad_norm": 0.03987615045295327, "learning_rate": 0.00023522226731904664, "loss": 0.8081, "step": 1990 }, { "epoch": 0.3778409090909091, "grad_norm": 0.04467453055781031, "learning_rate": 0.0002348137692364715, "loss": 0.8196, "step": 1995 }, { "epoch": 0.3787878787878788, "grad_norm": 0.04289778568763919, "learning_rate": 0.00023440434442467152, "loss": 0.8242, "step": 2000 }, { "epoch": 0.3797348484848485, "grad_norm": 0.03957134697373444, "learning_rate": 0.00023399399735728277, "loss": 0.8271, "step": 2005 }, { "epoch": 0.3806818181818182, "grad_norm": 0.04057581593775104, "learning_rate": 0.00023358273251801847, "loss": 0.7991, "step": 2010 }, { "epoch": 0.3816287878787879, "grad_norm": 0.044395062218471865, "learning_rate": 0.00023317055440062, "loss": 0.8398, "step": 2015 }, { "epoch": 0.38257575757575757, "grad_norm": 0.04570817682106231, "learning_rate": 0.00023275746750880784, "loss": 0.8499, "step": 2020 }, { "epoch": 0.3835227272727273, "grad_norm": 0.04410857496355408, "learning_rate": 0.00023234347635623233, "loss": 0.8344, "step": 2025 }, { "epoch": 0.38446969696969696, "grad_norm": 0.044494824681638484, "learning_rate": 0.0002319285854664242, "loss": 0.8177, "step": 2030 }, { "epoch": 0.3854166666666667, "grad_norm": 0.05291529187135586, "learning_rate": 0.00023151279937274548, "loss": 0.8162, "step": 2035 }, { "epoch": 0.38636363636363635, "grad_norm": 0.0475425548888116, "learning_rate": 0.00023109612261833963, "loss": 0.836, "step": 2040 }, { "epoch": 0.3873106060606061, "grad_norm": 0.04795570932520818, "learning_rate": 0.00023067855975608204, "loss": 0.8017, "step": 2045 }, { "epoch": 0.38825757575757575, "grad_norm": 0.04351083558421903, "learning_rate": 0.0002302601153485304, "loss": 0.8304, "step": 2050 }, { "epoch": 0.38920454545454547, "grad_norm": 0.03951679906520293, "learning_rate": 0.00022984079396787453, "loss": 0.8141, "step": 2055 }, { "epoch": 0.39015151515151514, "grad_norm": 0.04196816207750975, "learning_rate": 0.00022942060019588681, "loss": 0.8152, "step": 2060 }, { "epoch": 0.39109848484848486, "grad_norm": 0.04118592006087123, "learning_rate": 0.00022899953862387182, "loss": 0.8221, "step": 2065 }, { "epoch": 0.39204545454545453, "grad_norm": 0.043398944942405406, "learning_rate": 0.00022857761385261624, "loss": 0.8784, "step": 2070 }, { "epoch": 0.39299242424242425, "grad_norm": 0.048877313557472005, "learning_rate": 0.0002281548304923387, "loss": 0.8301, "step": 2075 }, { "epoch": 0.3939393939393939, "grad_norm": 0.04660152034578749, "learning_rate": 0.0002277311931626393, "loss": 0.8383, "step": 2080 }, { "epoch": 0.39488636363636365, "grad_norm": 0.043844576323543855, "learning_rate": 0.00022730670649244913, "loss": 0.8598, "step": 2085 }, { "epoch": 0.3958333333333333, "grad_norm": 0.04598334857841716, "learning_rate": 0.00022688137511997977, "loss": 0.8339, "step": 2090 }, { "epoch": 0.39678030303030304, "grad_norm": 0.044197199133415584, "learning_rate": 0.00022645520369267246, "loss": 0.8444, "step": 2095 }, { "epoch": 0.3977272727272727, "grad_norm": 0.04324972778637147, "learning_rate": 0.00022602819686714745, "loss": 0.8347, "step": 2100 }, { "epoch": 0.39867424242424243, "grad_norm": 0.07113871411866793, "learning_rate": 0.00022560035930915308, "loss": 0.8084, "step": 2105 }, { "epoch": 0.3996212121212121, "grad_norm": 0.04394138759860652, "learning_rate": 0.0002251716956935149, "loss": 0.7981, "step": 2110 }, { "epoch": 0.4005681818181818, "grad_norm": 0.0418228499942872, "learning_rate": 0.00022474221070408436, "loss": 0.8289, "step": 2115 }, { "epoch": 0.4015151515151515, "grad_norm": 0.04161882179160139, "learning_rate": 0.00022431190903368786, "loss": 0.847, "step": 2120 }, { "epoch": 0.4024621212121212, "grad_norm": 0.043508618425632965, "learning_rate": 0.00022388079538407523, "loss": 0.8437, "step": 2125 }, { "epoch": 0.4034090909090909, "grad_norm": 0.0411877941963478, "learning_rate": 0.00022344887446586865, "loss": 0.8397, "step": 2130 }, { "epoch": 0.4043560606060606, "grad_norm": 0.044653146921684504, "learning_rate": 0.00022301615099851104, "loss": 0.8387, "step": 2135 }, { "epoch": 0.4053030303030303, "grad_norm": 0.039794410056582685, "learning_rate": 0.00022258262971021437, "loss": 0.8602, "step": 2140 }, { "epoch": 0.40625, "grad_norm": 0.04159725529141693, "learning_rate": 0.00022214831533790813, "loss": 0.8418, "step": 2145 }, { "epoch": 0.4071969696969697, "grad_norm": 0.04333570277742372, "learning_rate": 0.00022171321262718765, "loss": 0.8405, "step": 2150 }, { "epoch": 0.4081439393939394, "grad_norm": 0.041080619204016525, "learning_rate": 0.00022127732633226205, "loss": 0.812, "step": 2155 }, { "epoch": 0.4090909090909091, "grad_norm": 0.040348407175793716, "learning_rate": 0.0002208406612159024, "loss": 0.814, "step": 2160 }, { "epoch": 0.4100378787878788, "grad_norm": 0.04031482700237761, "learning_rate": 0.0002204032220493897, "loss": 0.8147, "step": 2165 }, { "epoch": 0.4109848484848485, "grad_norm": 0.039485799909013825, "learning_rate": 0.00021996501361246277, "loss": 0.8176, "step": 2170 }, { "epoch": 0.4119318181818182, "grad_norm": 0.04055755183348957, "learning_rate": 0.00021952604069326579, "loss": 0.7957, "step": 2175 }, { "epoch": 0.4128787878787879, "grad_norm": 0.042423918520902, "learning_rate": 0.0002190863080882964, "loss": 0.8233, "step": 2180 }, { "epoch": 0.41382575757575757, "grad_norm": 0.0425176465118192, "learning_rate": 0.00021864582060235278, "loss": 0.8248, "step": 2185 }, { "epoch": 0.4147727272727273, "grad_norm": 0.04617541469982274, "learning_rate": 0.00021820458304848165, "loss": 0.8517, "step": 2190 }, { "epoch": 0.41571969696969696, "grad_norm": 0.04672199768475857, "learning_rate": 0.0002177626002479254, "loss": 0.8431, "step": 2195 }, { "epoch": 0.4166666666666667, "grad_norm": 0.04438315570807549, "learning_rate": 0.00021731987703006933, "loss": 0.8259, "step": 2200 }, { "epoch": 0.41761363636363635, "grad_norm": 0.04427543226414618, "learning_rate": 0.00021687641823238914, "loss": 0.8297, "step": 2205 }, { "epoch": 0.4185606060606061, "grad_norm": 0.046143213680102727, "learning_rate": 0.00021643222870039788, "loss": 0.8183, "step": 2210 }, { "epoch": 0.41950757575757575, "grad_norm": 0.05133388935241887, "learning_rate": 0.00021598731328759316, "loss": 0.8433, "step": 2215 }, { "epoch": 0.42045454545454547, "grad_norm": 0.04409942894634961, "learning_rate": 0.0002155416768554039, "loss": 0.8341, "step": 2220 }, { "epoch": 0.42140151515151514, "grad_norm": 0.040874633772368184, "learning_rate": 0.00021509532427313745, "loss": 0.8257, "step": 2225 }, { "epoch": 0.42234848484848486, "grad_norm": 0.04536497602517255, "learning_rate": 0.00021464826041792616, "loss": 0.8265, "step": 2230 }, { "epoch": 0.42329545454545453, "grad_norm": 0.04063288891442798, "learning_rate": 0.0002142004901746743, "loss": 0.8157, "step": 2235 }, { "epoch": 0.42424242424242425, "grad_norm": 0.04304936310139827, "learning_rate": 0.00021375201843600448, "loss": 0.8154, "step": 2240 }, { "epoch": 0.4251893939393939, "grad_norm": 0.03886996255738611, "learning_rate": 0.00021330285010220444, "loss": 0.8064, "step": 2245 }, { "epoch": 0.42613636363636365, "grad_norm": 0.0439951415809052, "learning_rate": 0.00021285299008117327, "loss": 0.8189, "step": 2250 }, { "epoch": 0.4270833333333333, "grad_norm": 0.04105867295635526, "learning_rate": 0.00021240244328836786, "loss": 0.8042, "step": 2255 }, { "epoch": 0.42803030303030304, "grad_norm": 0.0413161086382512, "learning_rate": 0.0002119512146467492, "loss": 0.8416, "step": 2260 }, { "epoch": 0.4289772727272727, "grad_norm": 0.04014206208769556, "learning_rate": 0.00021149930908672868, "loss": 0.8185, "step": 2265 }, { "epoch": 0.42992424242424243, "grad_norm": 0.04829740014687004, "learning_rate": 0.00021104673154611408, "loss": 0.8361, "step": 2270 }, { "epoch": 0.4308712121212121, "grad_norm": 0.03905046798702236, "learning_rate": 0.0002105934869700556, "loss": 0.8242, "step": 2275 }, { "epoch": 0.4318181818181818, "grad_norm": 0.053526559819707935, "learning_rate": 0.00021013958031099205, "loss": 0.8426, "step": 2280 }, { "epoch": 0.4327651515151515, "grad_norm": 0.04462350019901078, "learning_rate": 0.0002096850165285964, "loss": 0.8408, "step": 2285 }, { "epoch": 0.4337121212121212, "grad_norm": 0.04177938781732431, "learning_rate": 0.00020922980058972194, "loss": 0.8295, "step": 2290 }, { "epoch": 0.4346590909090909, "grad_norm": 0.03988703975583103, "learning_rate": 0.00020877393746834768, "loss": 0.8324, "step": 2295 }, { "epoch": 0.4356060606060606, "grad_norm": 0.04147526586188008, "learning_rate": 0.0002083174321455243, "loss": 0.8388, "step": 2300 }, { "epoch": 0.4365530303030303, "grad_norm": 0.038044695307941134, "learning_rate": 0.0002078602896093194, "loss": 0.7954, "step": 2305 }, { "epoch": 0.4375, "grad_norm": 0.045559472118838124, "learning_rate": 0.00020740251485476345, "loss": 0.8813, "step": 2310 }, { "epoch": 0.4384469696969697, "grad_norm": 0.06279101091973911, "learning_rate": 0.0002069441128837947, "loss": 0.839, "step": 2315 }, { "epoch": 0.4393939393939394, "grad_norm": 0.04692715704201686, "learning_rate": 0.00020648508870520476, "loss": 0.8352, "step": 2320 }, { "epoch": 0.4403409090909091, "grad_norm": 0.051468594729952064, "learning_rate": 0.00020602544733458418, "loss": 0.839, "step": 2325 }, { "epoch": 0.4412878787878788, "grad_norm": 0.049853534068296984, "learning_rate": 0.00020556519379426693, "loss": 0.8457, "step": 2330 }, { "epoch": 0.4422348484848485, "grad_norm": 0.043981249031014996, "learning_rate": 0.0002051043331132762, "loss": 0.8371, "step": 2335 }, { "epoch": 0.4431818181818182, "grad_norm": 0.044420240602267035, "learning_rate": 0.00020464287032726913, "loss": 0.889, "step": 2340 }, { "epoch": 0.4441287878787879, "grad_norm": 0.0386102436252357, "learning_rate": 0.00020418081047848187, "loss": 0.8372, "step": 2345 }, { "epoch": 0.44507575757575757, "grad_norm": 0.04140201905259435, "learning_rate": 0.00020371815861567428, "loss": 0.8336, "step": 2350 }, { "epoch": 0.4460227272727273, "grad_norm": 0.04249364186602932, "learning_rate": 0.00020325491979407523, "loss": 0.8116, "step": 2355 }, { "epoch": 0.44696969696969696, "grad_norm": 0.04505263140825889, "learning_rate": 0.00020279109907532693, "loss": 0.8089, "step": 2360 }, { "epoch": 0.4479166666666667, "grad_norm": 0.04905451772451616, "learning_rate": 0.0002023267015274296, "loss": 0.8161, "step": 2365 }, { "epoch": 0.44886363636363635, "grad_norm": 0.044083131589713095, "learning_rate": 0.0002018617322246866, "loss": 0.7928, "step": 2370 }, { "epoch": 0.4498106060606061, "grad_norm": 0.04087182152366722, "learning_rate": 0.0002013961962476484, "loss": 0.8176, "step": 2375 }, { "epoch": 0.45075757575757575, "grad_norm": 0.03992981513666836, "learning_rate": 0.0002009300986830574, "loss": 0.8202, "step": 2380 }, { "epoch": 0.45170454545454547, "grad_norm": 0.04118122404075756, "learning_rate": 0.00020046344462379222, "loss": 0.8084, "step": 2385 }, { "epoch": 0.45265151515151514, "grad_norm": 0.03836846591943074, "learning_rate": 0.00019999623916881217, "loss": 0.7813, "step": 2390 }, { "epoch": 0.45359848484848486, "grad_norm": 0.042256616559667545, "learning_rate": 0.0001995284874231014, "loss": 0.8405, "step": 2395 }, { "epoch": 0.45454545454545453, "grad_norm": 0.041816536986922656, "learning_rate": 0.00019906019449761325, "loss": 0.8265, "step": 2400 }, { "epoch": 0.45549242424242425, "grad_norm": 0.03818514732175339, "learning_rate": 0.0001985913655092142, "loss": 0.829, "step": 2405 }, { "epoch": 0.4564393939393939, "grad_norm": 0.040830601356944725, "learning_rate": 0.00019812200558062817, "loss": 0.833, "step": 2410 }, { "epoch": 0.45738636363636365, "grad_norm": 0.04011921388896301, "learning_rate": 0.0001976521198403806, "loss": 0.7861, "step": 2415 }, { "epoch": 0.4583333333333333, "grad_norm": 0.044386214010181065, "learning_rate": 0.00019718171342274205, "loss": 0.8065, "step": 2420 }, { "epoch": 0.45928030303030304, "grad_norm": 0.039421528176134195, "learning_rate": 0.00019671079146767244, "loss": 0.8064, "step": 2425 }, { "epoch": 0.4602272727272727, "grad_norm": 0.039658502246534034, "learning_rate": 0.00019623935912076488, "loss": 0.8319, "step": 2430 }, { "epoch": 0.46117424242424243, "grad_norm": 0.04350429112923609, "learning_rate": 0.00019576742153318914, "loss": 0.7962, "step": 2435 }, { "epoch": 0.4621212121212121, "grad_norm": 0.042692790817445286, "learning_rate": 0.0001952949838616357, "loss": 0.8373, "step": 2440 }, { "epoch": 0.4630681818181818, "grad_norm": 0.04339398756235945, "learning_rate": 0.00019482205126825937, "loss": 0.8022, "step": 2445 }, { "epoch": 0.4640151515151515, "grad_norm": 0.04304166274280883, "learning_rate": 0.0001943486289206225, "loss": 0.8106, "step": 2450 }, { "epoch": 0.4649621212121212, "grad_norm": 0.043750220078402964, "learning_rate": 0.0001938747219916391, "loss": 0.8435, "step": 2455 }, { "epoch": 0.4659090909090909, "grad_norm": 0.04664680007453336, "learning_rate": 0.0001934003356595179, "loss": 0.8472, "step": 2460 }, { "epoch": 0.4668560606060606, "grad_norm": 0.041222110220423554, "learning_rate": 0.00019292547510770585, "loss": 0.7787, "step": 2465 }, { "epoch": 0.4678030303030303, "grad_norm": 0.044722804250566385, "learning_rate": 0.00019245014552483162, "loss": 0.8394, "step": 2470 }, { "epoch": 0.46875, "grad_norm": 0.0404146031607715, "learning_rate": 0.00019197435210464882, "loss": 0.8154, "step": 2475 }, { "epoch": 0.4696969696969697, "grad_norm": 0.04023149404763966, "learning_rate": 0.00019149810004597903, "loss": 0.8191, "step": 2480 }, { "epoch": 0.4706439393939394, "grad_norm": 0.042874304030485456, "learning_rate": 0.00019102139455265556, "loss": 0.815, "step": 2485 }, { "epoch": 0.4715909090909091, "grad_norm": 0.042901399276201645, "learning_rate": 0.00019054424083346592, "loss": 0.8254, "step": 2490 }, { "epoch": 0.4725378787878788, "grad_norm": 0.048218277435419815, "learning_rate": 0.00019006664410209533, "loss": 0.8005, "step": 2495 }, { "epoch": 0.4734848484848485, "grad_norm": 0.04801481036784791, "learning_rate": 0.00018958860957706973, "loss": 0.7971, "step": 2500 }, { "epoch": 0.4744318181818182, "grad_norm": 0.045807113572995425, "learning_rate": 0.00018911014248169862, "loss": 0.8308, "step": 2505 }, { "epoch": 0.4753787878787879, "grad_norm": 0.04157428170316455, "learning_rate": 0.00018863124804401792, "loss": 0.7937, "step": 2510 }, { "epoch": 0.47632575757575757, "grad_norm": 0.044060198787386526, "learning_rate": 0.0001881519314967331, "loss": 0.8345, "step": 2515 }, { "epoch": 0.4772727272727273, "grad_norm": 0.04229775014082669, "learning_rate": 0.00018767219807716185, "loss": 0.7952, "step": 2520 }, { "epoch": 0.47821969696969696, "grad_norm": 0.038797572756557264, "learning_rate": 0.00018719205302717687, "loss": 0.8176, "step": 2525 }, { "epoch": 0.4791666666666667, "grad_norm": 0.03699768912795355, "learning_rate": 0.00018671150159314855, "loss": 0.8063, "step": 2530 }, { "epoch": 0.48011363636363635, "grad_norm": 0.044197153844287275, "learning_rate": 0.00018623054902588775, "loss": 0.8083, "step": 2535 }, { "epoch": 0.4810606060606061, "grad_norm": 0.04091887354099648, "learning_rate": 0.00018574920058058824, "loss": 0.807, "step": 2540 }, { "epoch": 0.48200757575757575, "grad_norm": 0.48718224838979884, "learning_rate": 0.0001852674615167696, "loss": 0.8124, "step": 2545 }, { "epoch": 0.48295454545454547, "grad_norm": 0.08791099719174246, "learning_rate": 0.00018478533709821946, "loss": 0.8227, "step": 2550 }, { "epoch": 0.48390151515151514, "grad_norm": 0.049480601037611455, "learning_rate": 0.000184302832592936, "loss": 0.8321, "step": 2555 }, { "epoch": 0.48484848484848486, "grad_norm": 0.0429607961370786, "learning_rate": 0.00018381995327307067, "loss": 0.8178, "step": 2560 }, { "epoch": 0.48579545454545453, "grad_norm": 0.0432423766004113, "learning_rate": 0.0001833367044148701, "loss": 0.7845, "step": 2565 }, { "epoch": 0.48674242424242425, "grad_norm": 0.04315753569890211, "learning_rate": 0.00018285309129861905, "loss": 0.8346, "step": 2570 }, { "epoch": 0.4876893939393939, "grad_norm": 0.04241261149768791, "learning_rate": 0.00018236911920858215, "loss": 0.8322, "step": 2575 }, { "epoch": 0.48863636363636365, "grad_norm": 0.042058874287831324, "learning_rate": 0.00018188479343294648, "loss": 0.8246, "step": 2580 }, { "epoch": 0.4895833333333333, "grad_norm": 0.04305771883252671, "learning_rate": 0.0001814001192637638, "loss": 0.826, "step": 2585 }, { "epoch": 0.49053030303030304, "grad_norm": 0.041842967176783166, "learning_rate": 0.0001809151019968925, "loss": 0.7911, "step": 2590 }, { "epoch": 0.4914772727272727, "grad_norm": 0.04061701352923131, "learning_rate": 0.00018042974693193998, "loss": 0.797, "step": 2595 }, { "epoch": 0.49242424242424243, "grad_norm": 0.04647922786872379, "learning_rate": 0.0001799440593722046, "loss": 0.7946, "step": 2600 }, { "epoch": 0.4933712121212121, "grad_norm": 0.04039980658232629, "learning_rate": 0.00017945804462461776, "loss": 0.8, "step": 2605 }, { "epoch": 0.4943181818181818, "grad_norm": 0.03886277509185924, "learning_rate": 0.00017897170799968583, "loss": 0.7849, "step": 2610 }, { "epoch": 0.4952651515151515, "grad_norm": 0.038833022410046734, "learning_rate": 0.00017848505481143253, "loss": 0.844, "step": 2615 }, { "epoch": 0.4962121212121212, "grad_norm": 0.03944433422520882, "learning_rate": 0.00017799809037734017, "loss": 0.8163, "step": 2620 }, { "epoch": 0.4971590909090909, "grad_norm": 0.043008540505396285, "learning_rate": 0.00017751082001829215, "loss": 0.8258, "step": 2625 }, { "epoch": 0.4981060606060606, "grad_norm": 0.040199050500284966, "learning_rate": 0.00017702324905851456, "loss": 0.8315, "step": 2630 }, { "epoch": 0.4990530303030303, "grad_norm": 0.040791783573880774, "learning_rate": 0.00017653538282551805, "loss": 0.7863, "step": 2635 }, { "epoch": 0.5, "grad_norm": 0.0442708357721425, "learning_rate": 0.00017604722665003956, "loss": 0.8213, "step": 2640 }, { "epoch": 0.5009469696969697, "grad_norm": 0.04301849579782737, "learning_rate": 0.00017555878586598413, "loss": 0.8236, "step": 2645 }, { "epoch": 0.5018939393939394, "grad_norm": 0.04101785865093793, "learning_rate": 0.00017507006581036678, "loss": 0.8062, "step": 2650 }, { "epoch": 0.5028409090909091, "grad_norm": 0.04131104487003713, "learning_rate": 0.00017458107182325374, "loss": 0.8257, "step": 2655 }, { "epoch": 0.5037878787878788, "grad_norm": 0.043009445132835604, "learning_rate": 0.00017409180924770468, "loss": 0.8165, "step": 2660 }, { "epoch": 0.5047348484848485, "grad_norm": 0.03958181194344823, "learning_rate": 0.00017360228342971383, "loss": 0.8325, "step": 2665 }, { "epoch": 0.5056818181818182, "grad_norm": 0.0397542558833111, "learning_rate": 0.00017311249971815185, "loss": 0.798, "step": 2670 }, { "epoch": 0.5066287878787878, "grad_norm": 0.04623083615061935, "learning_rate": 0.00017262246346470733, "loss": 0.8354, "step": 2675 }, { "epoch": 0.5075757575757576, "grad_norm": 0.04071727047371299, "learning_rate": 0.0001721321800238283, "loss": 0.7985, "step": 2680 }, { "epoch": 0.5085227272727273, "grad_norm": 0.03863023788708821, "learning_rate": 0.00017164165475266362, "loss": 0.8162, "step": 2685 }, { "epoch": 0.509469696969697, "grad_norm": 0.03916331940406132, "learning_rate": 0.0001711508930110047, "loss": 0.7845, "step": 2690 }, { "epoch": 0.5104166666666666, "grad_norm": 0.03931132854415474, "learning_rate": 0.0001706599001612266, "loss": 0.7776, "step": 2695 }, { "epoch": 0.5113636363636364, "grad_norm": 0.043368245986302636, "learning_rate": 0.00017016868156822978, "loss": 0.8054, "step": 2700 }, { "epoch": 0.5123106060606061, "grad_norm": 0.03992828095880944, "learning_rate": 0.00016967724259938123, "loss": 0.7988, "step": 2705 }, { "epoch": 0.5132575757575758, "grad_norm": 0.04739921469866324, "learning_rate": 0.00016918558862445582, "loss": 0.7943, "step": 2710 }, { "epoch": 0.5142045454545454, "grad_norm": 0.04332958886748095, "learning_rate": 0.00016869372501557788, "loss": 0.819, "step": 2715 }, { "epoch": 0.5151515151515151, "grad_norm": 0.04003282248966938, "learning_rate": 0.00016820165714716227, "loss": 0.8292, "step": 2720 }, { "epoch": 0.5160984848484849, "grad_norm": 0.04862073540042466, "learning_rate": 0.00016770939039585571, "loss": 0.827, "step": 2725 }, { "epoch": 0.5170454545454546, "grad_norm": 0.04320764468844852, "learning_rate": 0.00016721693014047805, "loss": 0.804, "step": 2730 }, { "epoch": 0.5179924242424242, "grad_norm": 0.041252462956353965, "learning_rate": 0.00016672428176196344, "loss": 0.7767, "step": 2735 }, { "epoch": 0.5189393939393939, "grad_norm": 0.040985992228162016, "learning_rate": 0.00016623145064330162, "loss": 0.8092, "step": 2740 }, { "epoch": 0.5198863636363636, "grad_norm": 0.042258864981091555, "learning_rate": 0.0001657384421694791, "loss": 0.7994, "step": 2745 }, { "epoch": 0.5208333333333334, "grad_norm": 0.04021897972485824, "learning_rate": 0.00016524526172742026, "loss": 0.784, "step": 2750 }, { "epoch": 0.521780303030303, "grad_norm": 0.04326415978694196, "learning_rate": 0.0001647519147059285, "loss": 0.8047, "step": 2755 }, { "epoch": 0.5227272727272727, "grad_norm": 0.04410213043970681, "learning_rate": 0.00016425840649562736, "loss": 0.8126, "step": 2760 }, { "epoch": 0.5236742424242424, "grad_norm": 0.04353103158360417, "learning_rate": 0.00016376474248890171, "loss": 0.8286, "step": 2765 }, { "epoch": 0.5246212121212122, "grad_norm": 0.040750168292872986, "learning_rate": 0.00016327092807983865, "loss": 0.808, "step": 2770 }, { "epoch": 0.5255681818181818, "grad_norm": 0.03802927883237263, "learning_rate": 0.0001627769686641687, "loss": 0.8053, "step": 2775 }, { "epoch": 0.5265151515151515, "grad_norm": 0.04157749031417506, "learning_rate": 0.0001622828696392069, "loss": 0.8244, "step": 2780 }, { "epoch": 0.5274621212121212, "grad_norm": 0.03901280596306352, "learning_rate": 0.00016178863640379357, "loss": 0.8057, "step": 2785 }, { "epoch": 0.5284090909090909, "grad_norm": 0.04452601323911491, "learning_rate": 0.0001612942743582357, "loss": 0.8382, "step": 2790 }, { "epoch": 0.5293560606060606, "grad_norm": 0.05541519438754282, "learning_rate": 0.0001607997889042476, "loss": 0.841, "step": 2795 }, { "epoch": 0.5303030303030303, "grad_norm": 0.058989969809454576, "learning_rate": 0.00016030518544489213, "loss": 0.8176, "step": 2800 }, { "epoch": 0.53125, "grad_norm": 0.05002098578052214, "learning_rate": 0.00015981046938452146, "loss": 0.8002, "step": 2805 }, { "epoch": 0.5321969696969697, "grad_norm": 0.08021694387476866, "learning_rate": 0.00015931564612871812, "loss": 0.81, "step": 2810 }, { "epoch": 0.5331439393939394, "grad_norm": 0.045649438932203974, "learning_rate": 0.00015882072108423594, "loss": 0.7931, "step": 2815 }, { "epoch": 0.5340909090909091, "grad_norm": 0.042928318104272216, "learning_rate": 0.000158325699658941, "loss": 0.8097, "step": 2820 }, { "epoch": 0.5350378787878788, "grad_norm": 0.041290327628681206, "learning_rate": 0.0001578305872617525, "loss": 0.8009, "step": 2825 }, { "epoch": 0.5359848484848485, "grad_norm": 0.043370278921034656, "learning_rate": 0.0001573353893025835, "loss": 0.8072, "step": 2830 }, { "epoch": 0.5369318181818182, "grad_norm": 0.04062057924038298, "learning_rate": 0.00015684011119228224, "loss": 0.8135, "step": 2835 }, { "epoch": 0.5378787878787878, "grad_norm": 0.03956842259978757, "learning_rate": 0.00015634475834257246, "loss": 0.8083, "step": 2840 }, { "epoch": 0.5388257575757576, "grad_norm": 0.04065029502341851, "learning_rate": 0.00015584933616599473, "loss": 0.8252, "step": 2845 }, { "epoch": 0.5397727272727273, "grad_norm": 0.0406473529931244, "learning_rate": 0.00015535385007584706, "loss": 0.788, "step": 2850 }, { "epoch": 0.540719696969697, "grad_norm": 0.040142414490938486, "learning_rate": 0.0001548583054861259, "loss": 0.7869, "step": 2855 }, { "epoch": 0.5416666666666666, "grad_norm": 0.03672349497309872, "learning_rate": 0.0001543627078114667, "loss": 0.7999, "step": 2860 }, { "epoch": 0.5426136363636364, "grad_norm": 0.0430040050376934, "learning_rate": 0.00015386706246708524, "loss": 0.8061, "step": 2865 }, { "epoch": 0.5435606060606061, "grad_norm": 0.0395889490877207, "learning_rate": 0.00015337137486871796, "loss": 0.7938, "step": 2870 }, { "epoch": 0.5445075757575758, "grad_norm": 0.04178894808034304, "learning_rate": 0.00015287565043256302, "loss": 0.7898, "step": 2875 }, { "epoch": 0.5454545454545454, "grad_norm": 0.04229269873654704, "learning_rate": 0.00015237989457522118, "loss": 0.8025, "step": 2880 }, { "epoch": 0.5464015151515151, "grad_norm": 0.04274569352435655, "learning_rate": 0.00015188411271363646, "loss": 0.8477, "step": 2885 }, { "epoch": 0.5473484848484849, "grad_norm": 0.04018997606928589, "learning_rate": 0.00015138831026503702, "loss": 0.8121, "step": 2890 }, { "epoch": 0.5482954545454546, "grad_norm": 0.04487654715701671, "learning_rate": 0.00015089249264687603, "loss": 0.7961, "step": 2895 }, { "epoch": 0.5492424242424242, "grad_norm": 0.03928757645472925, "learning_rate": 0.00015039666527677233, "loss": 0.8406, "step": 2900 }, { "epoch": 0.5501893939393939, "grad_norm": 0.042611840013764696, "learning_rate": 0.00014990083357245128, "loss": 0.7913, "step": 2905 }, { "epoch": 0.5511363636363636, "grad_norm": 0.04292928983412645, "learning_rate": 0.0001494050029516858, "loss": 0.7977, "step": 2910 }, { "epoch": 0.5520833333333334, "grad_norm": 0.038369849549461224, "learning_rate": 0.00014890917883223677, "loss": 0.8199, "step": 2915 }, { "epoch": 0.553030303030303, "grad_norm": 0.04541317576278365, "learning_rate": 0.00014841336663179406, "loss": 0.8091, "step": 2920 }, { "epoch": 0.5539772727272727, "grad_norm": 0.03684570730569431, "learning_rate": 0.00014791757176791742, "loss": 0.8195, "step": 2925 }, { "epoch": 0.5549242424242424, "grad_norm": 0.03872377757151605, "learning_rate": 0.00014742179965797705, "loss": 0.8107, "step": 2930 }, { "epoch": 0.5558712121212122, "grad_norm": 0.0395990381746492, "learning_rate": 0.00014692605571909462, "loss": 0.8034, "step": 2935 }, { "epoch": 0.5568181818181818, "grad_norm": 0.03886126907028109, "learning_rate": 0.00014643034536808387, "loss": 0.7968, "step": 2940 }, { "epoch": 0.5577651515151515, "grad_norm": 0.039986116602809194, "learning_rate": 0.00014593467402139164, "loss": 0.7946, "step": 2945 }, { "epoch": 0.5587121212121212, "grad_norm": 0.03812573349649333, "learning_rate": 0.00014543904709503854, "loss": 0.7866, "step": 2950 }, { "epoch": 0.5596590909090909, "grad_norm": 0.03932184467330868, "learning_rate": 0.0001449434700045599, "loss": 0.8019, "step": 2955 }, { "epoch": 0.5606060606060606, "grad_norm": 0.04445798754887963, "learning_rate": 0.00014444794816494626, "loss": 0.825, "step": 2960 }, { "epoch": 0.5615530303030303, "grad_norm": 0.04248501207056126, "learning_rate": 0.0001439524869905848, "loss": 0.8226, "step": 2965 }, { "epoch": 0.5625, "grad_norm": 0.040603736504944546, "learning_rate": 0.0001434570918951996, "loss": 0.8263, "step": 2970 }, { "epoch": 0.5634469696969697, "grad_norm": 0.044713609887935456, "learning_rate": 0.00014296176829179275, "loss": 0.7915, "step": 2975 }, { "epoch": 0.5643939393939394, "grad_norm": 0.042449864997604524, "learning_rate": 0.00014246652159258526, "loss": 0.7896, "step": 2980 }, { "epoch": 0.5653409090909091, "grad_norm": 0.03798962681468922, "learning_rate": 0.0001419713572089577, "loss": 0.9055, "step": 2985 }, { "epoch": 0.5662878787878788, "grad_norm": 0.04786757520604103, "learning_rate": 0.0001414762805513914, "loss": 0.8006, "step": 2990 }, { "epoch": 0.5672348484848485, "grad_norm": 0.04252660263305811, "learning_rate": 0.00014098129702940892, "loss": 0.7907, "step": 2995 }, { "epoch": 0.5681818181818182, "grad_norm": 0.043829106135149745, "learning_rate": 0.00014048641205151533, "loss": 0.7872, "step": 3000 }, { "epoch": 0.5691287878787878, "grad_norm": 0.03916395834671137, "learning_rate": 0.0001399916310251388, "loss": 0.7761, "step": 3005 }, { "epoch": 0.5700757575757576, "grad_norm": 0.03967852059796232, "learning_rate": 0.00013949695935657193, "loss": 0.7951, "step": 3010 }, { "epoch": 0.5710227272727273, "grad_norm": 0.04226287650595886, "learning_rate": 0.00013900240245091203, "loss": 0.7765, "step": 3015 }, { "epoch": 0.571969696969697, "grad_norm": 0.04195996621258936, "learning_rate": 0.00013850796571200264, "loss": 0.8174, "step": 3020 }, { "epoch": 0.5729166666666666, "grad_norm": 0.04343356999935917, "learning_rate": 0.00013801365454237444, "loss": 0.8048, "step": 3025 }, { "epoch": 0.5738636363636364, "grad_norm": 0.038284495114294666, "learning_rate": 0.00013751947434318564, "loss": 0.7818, "step": 3030 }, { "epoch": 0.5748106060606061, "grad_norm": 0.04286762969801166, "learning_rate": 0.00013702543051416383, "loss": 0.7904, "step": 3035 }, { "epoch": 0.5757575757575758, "grad_norm": 0.03931256543968111, "learning_rate": 0.00013653152845354623, "loss": 0.8209, "step": 3040 }, { "epoch": 0.5767045454545454, "grad_norm": 0.04052581793415016, "learning_rate": 0.0001360377735580212, "loss": 0.7895, "step": 3045 }, { "epoch": 0.5776515151515151, "grad_norm": 0.04013847290742192, "learning_rate": 0.00013554417122266888, "loss": 0.7997, "step": 3050 }, { "epoch": 0.5785984848484849, "grad_norm": 0.04225466886048973, "learning_rate": 0.00013505072684090263, "loss": 0.8018, "step": 3055 }, { "epoch": 0.5795454545454546, "grad_norm": 0.04063423310025803, "learning_rate": 0.00013455744580440982, "loss": 0.8103, "step": 3060 }, { "epoch": 0.5804924242424242, "grad_norm": 0.041112600968304276, "learning_rate": 0.00013406433350309304, "loss": 0.771, "step": 3065 }, { "epoch": 0.5814393939393939, "grad_norm": 0.043166425761174104, "learning_rate": 0.0001335713953250111, "loss": 0.7813, "step": 3070 }, { "epoch": 0.5823863636363636, "grad_norm": 0.042559737261154675, "learning_rate": 0.0001330786366563203, "loss": 0.7795, "step": 3075 }, { "epoch": 0.5833333333333334, "grad_norm": 0.039294962688869624, "learning_rate": 0.00013258606288121542, "loss": 0.7852, "step": 3080 }, { "epoch": 0.584280303030303, "grad_norm": 0.039625489168065825, "learning_rate": 0.00013209367938187125, "loss": 0.7602, "step": 3085 }, { "epoch": 0.5852272727272727, "grad_norm": 0.038997900760427306, "learning_rate": 0.000131601491538383, "loss": 0.78, "step": 3090 }, { "epoch": 0.5861742424242424, "grad_norm": 0.040146555417594515, "learning_rate": 0.00013110950472870853, "loss": 0.8004, "step": 3095 }, { "epoch": 0.5871212121212122, "grad_norm": 0.039349933500868364, "learning_rate": 0.00013061772432860886, "loss": 0.8254, "step": 3100 }, { "epoch": 0.5880681818181818, "grad_norm": 0.040521068783339456, "learning_rate": 0.0001301261557115895, "loss": 0.7688, "step": 3105 }, { "epoch": 0.5890151515151515, "grad_norm": 0.04280174220822872, "learning_rate": 0.00012963480424884214, "loss": 0.7883, "step": 3110 }, { "epoch": 0.5899621212121212, "grad_norm": 0.04000965172907218, "learning_rate": 0.00012914367530918557, "loss": 0.7733, "step": 3115 }, { "epoch": 0.5909090909090909, "grad_norm": 0.04058021099794167, "learning_rate": 0.00012865277425900724, "loss": 0.7816, "step": 3120 }, { "epoch": 0.5918560606060606, "grad_norm": 0.044091148466870456, "learning_rate": 0.00012816210646220437, "loss": 0.7797, "step": 3125 }, { "epoch": 0.5928030303030303, "grad_norm": 0.042915659449647994, "learning_rate": 0.00012767167728012566, "loss": 0.787, "step": 3130 }, { "epoch": 0.59375, "grad_norm": 0.04044950460083324, "learning_rate": 0.00012718149207151247, "loss": 0.8153, "step": 3135 }, { "epoch": 0.5946969696969697, "grad_norm": 0.035974336074393466, "learning_rate": 0.00012669155619244048, "loss": 0.7665, "step": 3140 }, { "epoch": 0.5956439393939394, "grad_norm": 0.03990766970124255, "learning_rate": 0.00012620187499626082, "loss": 0.7814, "step": 3145 }, { "epoch": 0.5965909090909091, "grad_norm": 0.04063094274983586, "learning_rate": 0.00012571245383354192, "loss": 0.8079, "step": 3150 }, { "epoch": 0.5975378787878788, "grad_norm": 0.04192680886047405, "learning_rate": 0.00012522329805201104, "loss": 0.7851, "step": 3155 }, { "epoch": 0.5984848484848485, "grad_norm": 0.0405476068701757, "learning_rate": 0.00012473441299649544, "loss": 0.8231, "step": 3160 }, { "epoch": 0.5994318181818182, "grad_norm": 0.040569633945997545, "learning_rate": 0.0001242458040088644, "loss": 0.7737, "step": 3165 }, { "epoch": 0.6003787878787878, "grad_norm": 0.038360447057587385, "learning_rate": 0.00012375747642797083, "loss": 0.7874, "step": 3170 }, { "epoch": 0.6013257575757576, "grad_norm": 0.04006212810733869, "learning_rate": 0.00012326943558959265, "loss": 0.7899, "step": 3175 }, { "epoch": 0.6022727272727273, "grad_norm": 0.04200526076077111, "learning_rate": 0.0001227816868263746, "loss": 0.8006, "step": 3180 }, { "epoch": 0.603219696969697, "grad_norm": 0.04132950189249958, "learning_rate": 0.0001222942354677702, "loss": 0.7927, "step": 3185 }, { "epoch": 0.6041666666666666, "grad_norm": 0.039846023645240154, "learning_rate": 0.00012180708683998321, "loss": 0.8127, "step": 3190 }, { "epoch": 0.6051136363636364, "grad_norm": 0.0370381211582106, "learning_rate": 0.00012132024626590963, "loss": 0.7977, "step": 3195 }, { "epoch": 0.6060606060606061, "grad_norm": 0.03637545691166675, "learning_rate": 0.00012083371906507937, "loss": 0.7972, "step": 3200 }, { "epoch": 0.6070075757575758, "grad_norm": 0.03719020082784945, "learning_rate": 0.00012034751055359836, "loss": 0.7944, "step": 3205 }, { "epoch": 0.6079545454545454, "grad_norm": 0.04061887180440516, "learning_rate": 0.00011986162604409015, "loss": 0.8207, "step": 3210 }, { "epoch": 0.6089015151515151, "grad_norm": 0.03857442410511439, "learning_rate": 0.00011937607084563836, "loss": 0.7841, "step": 3215 }, { "epoch": 0.6098484848484849, "grad_norm": 0.03544743527411389, "learning_rate": 0.00011889085026372792, "loss": 0.7499, "step": 3220 }, { "epoch": 0.6107954545454546, "grad_norm": 0.040036064198766305, "learning_rate": 0.00011840596960018779, "loss": 0.7856, "step": 3225 }, { "epoch": 0.6117424242424242, "grad_norm": 0.03717988537059713, "learning_rate": 0.00011792143415313285, "loss": 0.7884, "step": 3230 }, { "epoch": 0.6126893939393939, "grad_norm": 0.038107331310669845, "learning_rate": 0.00011743724921690557, "loss": 0.8106, "step": 3235 }, { "epoch": 0.6136363636363636, "grad_norm": 0.04482871942046956, "learning_rate": 0.00011695342008201888, "loss": 0.7865, "step": 3240 }, { "epoch": 0.6145833333333334, "grad_norm": 0.04317100302292292, "learning_rate": 0.00011646995203509786, "loss": 0.7826, "step": 3245 }, { "epoch": 0.615530303030303, "grad_norm": 0.039413143234785654, "learning_rate": 0.00011598685035882209, "loss": 0.8101, "step": 3250 }, { "epoch": 0.6164772727272727, "grad_norm": 0.04160937673240829, "learning_rate": 0.00011550412033186792, "loss": 0.8075, "step": 3255 }, { "epoch": 0.6174242424242424, "grad_norm": 0.04265804651359686, "learning_rate": 0.00011502176722885092, "loss": 0.7775, "step": 3260 }, { "epoch": 0.6183712121212122, "grad_norm": 0.03635722191647411, "learning_rate": 0.00011453979632026809, "loss": 0.791, "step": 3265 }, { "epoch": 0.6193181818181818, "grad_norm": 0.03705221554060922, "learning_rate": 0.00011405821287244035, "loss": 0.8008, "step": 3270 }, { "epoch": 0.6202651515151515, "grad_norm": 0.042403810547206766, "learning_rate": 0.00011357702214745493, "loss": 0.7652, "step": 3275 }, { "epoch": 0.6212121212121212, "grad_norm": 0.03979753737480537, "learning_rate": 0.00011309622940310798, "loss": 0.7991, "step": 3280 }, { "epoch": 0.6221590909090909, "grad_norm": 0.03836363646080294, "learning_rate": 0.00011261583989284712, "loss": 0.803, "step": 3285 }, { "epoch": 0.6231060606060606, "grad_norm": 0.04244615364903799, "learning_rate": 0.00011213585886571376, "loss": 0.8072, "step": 3290 }, { "epoch": 0.6240530303030303, "grad_norm": 0.04283582850640676, "learning_rate": 0.00011165629156628613, "loss": 0.7861, "step": 3295 }, { "epoch": 0.625, "grad_norm": 0.038461780382639685, "learning_rate": 0.00011117714323462186, "loss": 0.7835, "step": 3300 }, { "epoch": 0.6259469696969697, "grad_norm": 0.03744497970084062, "learning_rate": 0.00011069841910620057, "loss": 0.8062, "step": 3305 }, { "epoch": 0.6268939393939394, "grad_norm": 0.04483128738934721, "learning_rate": 0.00011022012441186671, "loss": 0.7961, "step": 3310 }, { "epoch": 0.6278409090909091, "grad_norm": 0.04424941740338033, "learning_rate": 0.00010974226437777261, "loss": 0.7949, "step": 3315 }, { "epoch": 0.6287878787878788, "grad_norm": 0.04251454254286352, "learning_rate": 0.0001092648442253211, "loss": 0.7725, "step": 3320 }, { "epoch": 0.6297348484848485, "grad_norm": 0.040105374119544505, "learning_rate": 0.0001087878691711087, "loss": 0.8147, "step": 3325 }, { "epoch": 0.6306818181818182, "grad_norm": 0.04164828498994665, "learning_rate": 0.00010831134442686835, "loss": 0.8076, "step": 3330 }, { "epoch": 0.6316287878787878, "grad_norm": 0.04194724008380887, "learning_rate": 0.00010783527519941272, "loss": 0.7514, "step": 3335 }, { "epoch": 0.6325757575757576, "grad_norm": 0.04282010148959667, "learning_rate": 0.00010735966669057723, "loss": 0.8084, "step": 3340 }, { "epoch": 0.6335227272727273, "grad_norm": 0.037751992950868556, "learning_rate": 0.00010688452409716325, "loss": 0.7971, "step": 3345 }, { "epoch": 0.634469696969697, "grad_norm": 0.040981833047628674, "learning_rate": 0.00010640985261088102, "loss": 0.8259, "step": 3350 }, { "epoch": 0.6354166666666666, "grad_norm": 0.03623074593719334, "learning_rate": 0.00010593565741829331, "loss": 0.7584, "step": 3355 }, { "epoch": 0.6363636363636364, "grad_norm": 0.04085407578588483, "learning_rate": 0.00010546194370075881, "loss": 0.7941, "step": 3360 }, { "epoch": 0.6373106060606061, "grad_norm": 0.04107679689904555, "learning_rate": 0.00010498871663437485, "loss": 0.7985, "step": 3365 }, { "epoch": 0.6382575757575758, "grad_norm": 0.03850210602630568, "learning_rate": 0.00010451598138992173, "loss": 0.7737, "step": 3370 }, { "epoch": 0.6392045454545454, "grad_norm": 0.0375973308222491, "learning_rate": 0.00010404374313280557, "loss": 0.7849, "step": 3375 }, { "epoch": 0.6401515151515151, "grad_norm": 0.03545282006804828, "learning_rate": 0.00010357200702300214, "loss": 0.7993, "step": 3380 }, { "epoch": 0.6410984848484849, "grad_norm": 0.04099405038912456, "learning_rate": 0.0001031007782150004, "loss": 0.7879, "step": 3385 }, { "epoch": 0.6420454545454546, "grad_norm": 0.04269554474417421, "learning_rate": 0.00010263006185774627, "loss": 0.7559, "step": 3390 }, { "epoch": 0.6429924242424242, "grad_norm": 0.039655024113479126, "learning_rate": 0.00010215986309458622, "loss": 0.7633, "step": 3395 }, { "epoch": 0.6439393939393939, "grad_norm": 0.040202236041103546, "learning_rate": 0.0001016901870632113, "loss": 0.7795, "step": 3400 }, { "epoch": 0.6448863636363636, "grad_norm": 0.038440162083217946, "learning_rate": 0.00010122103889560066, "loss": 0.788, "step": 3405 }, { "epoch": 0.6458333333333334, "grad_norm": 0.0380210653037665, "learning_rate": 0.00010075242371796585, "loss": 0.7796, "step": 3410 }, { "epoch": 0.646780303030303, "grad_norm": 0.038714184645298265, "learning_rate": 0.00010028434665069456, "loss": 0.7505, "step": 3415 }, { "epoch": 0.6477272727272727, "grad_norm": 0.036301784575765876, "learning_rate": 9.981681280829472e-05, "loss": 0.7863, "step": 3420 }, { "epoch": 0.6486742424242424, "grad_norm": 0.04273246901454883, "learning_rate": 9.934982729933864e-05, "loss": 0.7936, "step": 3425 }, { "epoch": 0.6496212121212122, "grad_norm": 0.04096752213327176, "learning_rate": 9.888339522640727e-05, "loss": 0.7848, "step": 3430 }, { "epoch": 0.6505681818181818, "grad_norm": 0.03654932535140771, "learning_rate": 9.84175216860344e-05, "loss": 0.801, "step": 3435 }, { "epoch": 0.6515151515151515, "grad_norm": 0.03977277870704206, "learning_rate": 9.795221176865064e-05, "loss": 0.7817, "step": 3440 }, { "epoch": 0.6524621212121212, "grad_norm": 0.03945778648109342, "learning_rate": 9.748747055852845e-05, "loss": 0.8034, "step": 3445 }, { "epoch": 0.6534090909090909, "grad_norm": 0.03750951937019652, "learning_rate": 9.702330313372607e-05, "loss": 0.8047, "step": 3450 }, { "epoch": 0.6543560606060606, "grad_norm": 0.04224253753307829, "learning_rate": 9.655971456603222e-05, "loss": 0.7741, "step": 3455 }, { "epoch": 0.6553030303030303, "grad_norm": 0.04193635128292089, "learning_rate": 9.609670992091063e-05, "loss": 0.7686, "step": 3460 }, { "epoch": 0.65625, "grad_norm": 0.0383707593111435, "learning_rate": 9.563429425744476e-05, "loss": 0.7937, "step": 3465 }, { "epoch": 0.6571969696969697, "grad_norm": 0.04221940705987869, "learning_rate": 9.517247262828245e-05, "loss": 0.7589, "step": 3470 }, { "epoch": 0.6581439393939394, "grad_norm": 0.03796604736644861, "learning_rate": 9.47112500795808e-05, "loss": 0.7673, "step": 3475 }, { "epoch": 0.6590909090909091, "grad_norm": 0.03883212329330115, "learning_rate": 9.425063165095088e-05, "loss": 0.7899, "step": 3480 }, { "epoch": 0.6600378787878788, "grad_norm": 0.03775017282994837, "learning_rate": 9.379062237540282e-05, "loss": 0.7824, "step": 3485 }, { "epoch": 0.6609848484848485, "grad_norm": 0.040969682549424714, "learning_rate": 9.333122727929086e-05, "loss": 0.7744, "step": 3490 }, { "epoch": 0.6619318181818182, "grad_norm": 0.043909710244610795, "learning_rate": 9.287245138225807e-05, "loss": 0.7844, "step": 3495 }, { "epoch": 0.6628787878787878, "grad_norm": 0.03979263612757763, "learning_rate": 9.241429969718193e-05, "loss": 0.7771, "step": 3500 }, { "epoch": 0.6638257575757576, "grad_norm": 0.036720996927973024, "learning_rate": 9.195677723011943e-05, "loss": 0.7787, "step": 3505 }, { "epoch": 0.6647727272727273, "grad_norm": 0.03931047020350426, "learning_rate": 9.149988898025224e-05, "loss": 0.7924, "step": 3510 }, { "epoch": 0.665719696969697, "grad_norm": 0.037239986158338824, "learning_rate": 9.10436399398321e-05, "loss": 0.763, "step": 3515 }, { "epoch": 0.6666666666666666, "grad_norm": 0.03903031387121002, "learning_rate": 9.058803509412646e-05, "loss": 0.7948, "step": 3520 }, { "epoch": 0.6676136363636364, "grad_norm": 0.037974042366075905, "learning_rate": 9.013307942136387e-05, "loss": 0.7958, "step": 3525 }, { "epoch": 0.6685606060606061, "grad_norm": 0.03996518480569968, "learning_rate": 8.967877789267957e-05, "loss": 0.7961, "step": 3530 }, { "epoch": 0.6695075757575758, "grad_norm": 0.04156879401863253, "learning_rate": 8.92251354720612e-05, "loss": 0.7805, "step": 3535 }, { "epoch": 0.6704545454545454, "grad_norm": 0.0379764100996147, "learning_rate": 8.877215711629457e-05, "loss": 0.776, "step": 3540 }, { "epoch": 0.6714015151515151, "grad_norm": 0.03967109810939677, "learning_rate": 8.831984777490954e-05, "loss": 0.7884, "step": 3545 }, { "epoch": 0.6723484848484849, "grad_norm": 0.04222761106798767, "learning_rate": 8.786821239012582e-05, "loss": 0.7714, "step": 3550 }, { "epoch": 0.6732954545454546, "grad_norm": 0.04051009224725267, "learning_rate": 8.741725589679912e-05, "loss": 0.7656, "step": 3555 }, { "epoch": 0.6742424242424242, "grad_norm": 0.03816042826696097, "learning_rate": 8.696698322236706e-05, "loss": 0.7609, "step": 3560 }, { "epoch": 0.6751893939393939, "grad_norm": 0.03985241002351074, "learning_rate": 8.651739928679556e-05, "loss": 0.7982, "step": 3565 }, { "epoch": 0.6761363636363636, "grad_norm": 0.039265226899704256, "learning_rate": 8.606850900252478e-05, "loss": 0.7886, "step": 3570 }, { "epoch": 0.6770833333333334, "grad_norm": 0.03925154055754282, "learning_rate": 8.562031727441567e-05, "loss": 0.7963, "step": 3575 }, { "epoch": 0.678030303030303, "grad_norm": 0.039792050085331696, "learning_rate": 8.517282899969629e-05, "loss": 0.8051, "step": 3580 }, { "epoch": 0.6789772727272727, "grad_norm": 0.03879414602215694, "learning_rate": 8.472604906790852e-05, "loss": 0.8024, "step": 3585 }, { "epoch": 0.6799242424242424, "grad_norm": 0.0418000610573599, "learning_rate": 8.427998236085404e-05, "loss": 0.762, "step": 3590 }, { "epoch": 0.6808712121212122, "grad_norm": 0.045760675836821356, "learning_rate": 8.38346337525417e-05, "loss": 0.7923, "step": 3595 }, { "epoch": 0.6818181818181818, "grad_norm": 0.040543801463440665, "learning_rate": 8.339000810913386e-05, "loss": 0.7809, "step": 3600 }, { "epoch": 0.6827651515151515, "grad_norm": 0.03937397555590882, "learning_rate": 8.294611028889332e-05, "loss": 0.7985, "step": 3605 }, { "epoch": 0.6837121212121212, "grad_norm": 0.04054190343244338, "learning_rate": 8.250294514213009e-05, "loss": 0.8063, "step": 3610 }, { "epoch": 0.6846590909090909, "grad_norm": 0.039503064995133216, "learning_rate": 8.206051751114875e-05, "loss": 0.8033, "step": 3615 }, { "epoch": 0.6856060606060606, "grad_norm": 0.03961793432281865, "learning_rate": 8.161883223019513e-05, "loss": 0.7841, "step": 3620 }, { "epoch": 0.6865530303030303, "grad_norm": 0.03964303335275124, "learning_rate": 8.11778941254037e-05, "loss": 0.793, "step": 3625 }, { "epoch": 0.6875, "grad_norm": 0.03665153936722208, "learning_rate": 8.073770801474495e-05, "loss": 0.776, "step": 3630 }, { "epoch": 0.6884469696969697, "grad_norm": 0.04064557439554845, "learning_rate": 8.029827870797233e-05, "loss": 0.7622, "step": 3635 }, { "epoch": 0.6893939393939394, "grad_norm": 0.038999462198328914, "learning_rate": 7.985961100657029e-05, "loss": 0.7945, "step": 3640 }, { "epoch": 0.6903409090909091, "grad_norm": 0.03814629462651061, "learning_rate": 7.942170970370128e-05, "loss": 0.7907, "step": 3645 }, { "epoch": 0.6912878787878788, "grad_norm": 0.03936834359810894, "learning_rate": 7.898457958415362e-05, "loss": 0.8105, "step": 3650 }, { "epoch": 0.6922348484848485, "grad_norm": 0.043208861683073814, "learning_rate": 7.854822542428923e-05, "loss": 0.7829, "step": 3655 }, { "epoch": 0.6931818181818182, "grad_norm": 0.04118046552241357, "learning_rate": 7.811265199199152e-05, "loss": 0.7881, "step": 3660 }, { "epoch": 0.6941287878787878, "grad_norm": 0.04008925612177105, "learning_rate": 7.76778640466128e-05, "loss": 0.7898, "step": 3665 }, { "epoch": 0.6950757575757576, "grad_norm": 0.03889735909259863, "learning_rate": 7.724386633892306e-05, "loss": 0.7829, "step": 3670 }, { "epoch": 0.6960227272727273, "grad_norm": 0.04151816317577747, "learning_rate": 7.681066361105756e-05, "loss": 0.7767, "step": 3675 }, { "epoch": 0.696969696969697, "grad_norm": 0.036944282946496376, "learning_rate": 7.63782605964648e-05, "loss": 0.7765, "step": 3680 }, { "epoch": 0.6979166666666666, "grad_norm": 0.03615917598734965, "learning_rate": 7.594666201985545e-05, "loss": 0.7861, "step": 3685 }, { "epoch": 0.6988636363636364, "grad_norm": 0.04067200248262229, "learning_rate": 7.551587259715034e-05, "loss": 0.8289, "step": 3690 }, { "epoch": 0.6998106060606061, "grad_norm": 0.037365461143322884, "learning_rate": 7.508589703542878e-05, "loss": 0.811, "step": 3695 }, { "epoch": 0.7007575757575758, "grad_norm": 0.04185500665647231, "learning_rate": 7.465674003287745e-05, "loss": 0.7682, "step": 3700 }, { "epoch": 0.7017045454545454, "grad_norm": 0.040446504718946015, "learning_rate": 7.422840627873897e-05, "loss": 0.795, "step": 3705 }, { "epoch": 0.7026515151515151, "grad_norm": 0.03757009814629214, "learning_rate": 7.380090045326045e-05, "loss": 0.7504, "step": 3710 }, { "epoch": 0.7035984848484849, "grad_norm": 0.038548413203444785, "learning_rate": 7.337422722764275e-05, "loss": 0.8075, "step": 3715 }, { "epoch": 0.7045454545454546, "grad_norm": 0.03999887548841091, "learning_rate": 7.294839126398908e-05, "loss": 0.774, "step": 3720 }, { "epoch": 0.7054924242424242, "grad_norm": 0.04141191338877119, "learning_rate": 7.252339721525412e-05, "loss": 0.8107, "step": 3725 }, { "epoch": 0.7064393939393939, "grad_norm": 0.0427189690829545, "learning_rate": 7.209924972519343e-05, "loss": 0.783, "step": 3730 }, { "epoch": 0.7073863636363636, "grad_norm": 0.041829790074471566, "learning_rate": 7.167595342831253e-05, "loss": 0.8037, "step": 3735 }, { "epoch": 0.7083333333333334, "grad_norm": 0.03932038001837439, "learning_rate": 7.125351294981598e-05, "loss": 0.7577, "step": 3740 }, { "epoch": 0.709280303030303, "grad_norm": 0.044979053856984176, "learning_rate": 7.083193290555744e-05, "loss": 0.7623, "step": 3745 }, { "epoch": 0.7102272727272727, "grad_norm": 0.040516807472682444, "learning_rate": 7.041121790198881e-05, "loss": 0.7796, "step": 3750 }, { "epoch": 0.7111742424242424, "grad_norm": 0.04076058081578708, "learning_rate": 6.999137253611e-05, "loss": 0.789, "step": 3755 }, { "epoch": 0.7121212121212122, "grad_norm": 0.03822915371523041, "learning_rate": 6.95724013954186e-05, "loss": 0.784, "step": 3760 }, { "epoch": 0.7130681818181818, "grad_norm": 0.04000916349581932, "learning_rate": 6.91543090578601e-05, "loss": 0.7722, "step": 3765 }, { "epoch": 0.7140151515151515, "grad_norm": 0.044726470002604886, "learning_rate": 6.87371000917774e-05, "loss": 0.7575, "step": 3770 }, { "epoch": 0.7149621212121212, "grad_norm": 0.04471754304335626, "learning_rate": 6.832077905586119e-05, "loss": 0.7691, "step": 3775 }, { "epoch": 0.7159090909090909, "grad_norm": 0.03949943632638639, "learning_rate": 6.790535049910017e-05, "loss": 0.784, "step": 3780 }, { "epoch": 0.7168560606060606, "grad_norm": 0.038373759541872915, "learning_rate": 6.749081896073106e-05, "loss": 0.7601, "step": 3785 }, { "epoch": 0.7178030303030303, "grad_norm": 0.03616627695182055, "learning_rate": 6.707718897018941e-05, "loss": 0.7591, "step": 3790 }, { "epoch": 0.71875, "grad_norm": 0.04327639838927876, "learning_rate": 6.66644650470597e-05, "loss": 0.7846, "step": 3795 }, { "epoch": 0.7196969696969697, "grad_norm": 0.043231180710510686, "learning_rate": 6.625265170102615e-05, "loss": 0.752, "step": 3800 }, { "epoch": 0.7206439393939394, "grad_norm": 0.039624521674453655, "learning_rate": 6.584175343182359e-05, "loss": 0.7995, "step": 3805 }, { "epoch": 0.7215909090909091, "grad_norm": 0.04268727987190514, "learning_rate": 6.543177472918794e-05, "loss": 0.7877, "step": 3810 }, { "epoch": 0.7225378787878788, "grad_norm": 0.0402914394741491, "learning_rate": 6.502272007280755e-05, "loss": 0.7539, "step": 3815 }, { "epoch": 0.7234848484848485, "grad_norm": 0.03869463859402182, "learning_rate": 6.461459393227385e-05, "loss": 0.7583, "step": 3820 }, { "epoch": 0.7244318181818182, "grad_norm": 0.03726113086293714, "learning_rate": 6.420740076703291e-05, "loss": 0.7435, "step": 3825 }, { "epoch": 0.7253787878787878, "grad_norm": 0.04242697385724998, "learning_rate": 6.38011450263364e-05, "loss": 0.7909, "step": 3830 }, { "epoch": 0.7263257575757576, "grad_norm": 0.041072190514661835, "learning_rate": 6.339583114919301e-05, "loss": 0.7938, "step": 3835 }, { "epoch": 0.7272727272727273, "grad_norm": 0.04140101649993429, "learning_rate": 6.299146356432029e-05, "loss": 0.7724, "step": 3840 }, { "epoch": 0.728219696969697, "grad_norm": 0.04276312404745283, "learning_rate": 6.258804669009575e-05, "loss": 0.8042, "step": 3845 }, { "epoch": 0.7291666666666666, "grad_norm": 0.03951731344689689, "learning_rate": 6.218558493450893e-05, "loss": 0.7555, "step": 3850 }, { "epoch": 0.7301136363636364, "grad_norm": 0.03936898937096199, "learning_rate": 6.178408269511312e-05, "loss": 0.7863, "step": 3855 }, { "epoch": 0.7310606060606061, "grad_norm": 0.038716693149565, "learning_rate": 6.138354435897748e-05, "loss": 0.7745, "step": 3860 }, { "epoch": 0.7320075757575758, "grad_norm": 0.04072421020095559, "learning_rate": 6.098397430263858e-05, "loss": 0.7956, "step": 3865 }, { "epoch": 0.7329545454545454, "grad_norm": 0.04100101910580714, "learning_rate": 6.058537689205328e-05, "loss": 0.7578, "step": 3870 }, { "epoch": 0.7339015151515151, "grad_norm": 0.039583105949787575, "learning_rate": 6.0187756482550645e-05, "loss": 0.796, "step": 3875 }, { "epoch": 0.7348484848484849, "grad_norm": 0.0398070275621675, "learning_rate": 5.9791117418784274e-05, "loss": 0.7667, "step": 3880 }, { "epoch": 0.7357954545454546, "grad_norm": 0.03854944447288334, "learning_rate": 5.939546403468501e-05, "loss": 0.7499, "step": 3885 }, { "epoch": 0.7367424242424242, "grad_norm": 0.04368614364362331, "learning_rate": 5.900080065341363e-05, "loss": 0.78, "step": 3890 }, { "epoch": 0.7376893939393939, "grad_norm": 0.042603936273429066, "learning_rate": 5.860713158731333e-05, "loss": 0.7636, "step": 3895 }, { "epoch": 0.7386363636363636, "grad_norm": 0.043008266779781215, "learning_rate": 5.821446113786302e-05, "loss": 0.7631, "step": 3900 }, { "epoch": 0.7395833333333334, "grad_norm": 0.03910221102626979, "learning_rate": 5.782279359562988e-05, "loss": 0.7691, "step": 3905 }, { "epoch": 0.740530303030303, "grad_norm": 0.042051023569034784, "learning_rate": 5.743213324022272e-05, "loss": 0.7905, "step": 3910 }, { "epoch": 0.7414772727272727, "grad_norm": 0.044919920953571925, "learning_rate": 5.7042484340245265e-05, "loss": 0.7715, "step": 3915 }, { "epoch": 0.7424242424242424, "grad_norm": 0.041073163710179876, "learning_rate": 5.665385115324953e-05, "loss": 0.7468, "step": 3920 }, { "epoch": 0.7433712121212122, "grad_norm": 0.03567748888143746, "learning_rate": 5.626623792568885e-05, "loss": 0.7902, "step": 3925 }, { "epoch": 0.7443181818181818, "grad_norm": 0.040223394497797826, "learning_rate": 5.587964889287218e-05, "loss": 0.8142, "step": 3930 }, { "epoch": 0.7452651515151515, "grad_norm": 0.03937628241815354, "learning_rate": 5.5494088278917434e-05, "loss": 0.7561, "step": 3935 }, { "epoch": 0.7462121212121212, "grad_norm": 0.039022205324506364, "learning_rate": 5.5109560296705066e-05, "loss": 0.7761, "step": 3940 }, { "epoch": 0.7471590909090909, "grad_norm": 0.037705226948508606, "learning_rate": 5.472606914783266e-05, "loss": 0.7697, "step": 3945 }, { "epoch": 0.7481060606060606, "grad_norm": 0.03949092410910686, "learning_rate": 5.434361902256868e-05, "loss": 0.7804, "step": 3950 }, { "epoch": 0.7490530303030303, "grad_norm": 0.040225291682004415, "learning_rate": 5.396221409980653e-05, "loss": 0.7895, "step": 3955 }, { "epoch": 0.75, "grad_norm": 0.03696261298169589, "learning_rate": 5.358185854701909e-05, "loss": 0.7715, "step": 3960 }, { "epoch": 0.7509469696969697, "grad_norm": 0.03863523973389968, "learning_rate": 5.320255652021336e-05, "loss": 0.7748, "step": 3965 }, { "epoch": 0.7518939393939394, "grad_norm": 0.041539206419021424, "learning_rate": 5.282431216388457e-05, "loss": 0.7556, "step": 3970 }, { "epoch": 0.7528409090909091, "grad_norm": 0.040538971828186623, "learning_rate": 5.244712961097142e-05, "loss": 0.7843, "step": 3975 }, { "epoch": 0.7537878787878788, "grad_norm": 0.042618390094256595, "learning_rate": 5.207101298281049e-05, "loss": 0.7666, "step": 3980 }, { "epoch": 0.7547348484848485, "grad_norm": 0.03893190636534372, "learning_rate": 5.1695966389091396e-05, "loss": 0.7793, "step": 3985 }, { "epoch": 0.7556818181818182, "grad_norm": 0.03755845511933422, "learning_rate": 5.132199392781205e-05, "loss": 0.77, "step": 3990 }, { "epoch": 0.7566287878787878, "grad_norm": 0.04022540043473745, "learning_rate": 5.094909968523351e-05, "loss": 0.78, "step": 3995 }, { "epoch": 0.7575757575757576, "grad_norm": 0.04253171860319729, "learning_rate": 5.057728773583559e-05, "loss": 0.7478, "step": 4000 }, { "epoch": 0.7585227272727273, "grad_norm": 0.042102270133092194, "learning_rate": 5.0206562142272334e-05, "loss": 0.7817, "step": 4005 }, { "epoch": 0.759469696969697, "grad_norm": 0.04424433693890534, "learning_rate": 4.9836926955327656e-05, "loss": 0.7774, "step": 4010 }, { "epoch": 0.7604166666666666, "grad_norm": 0.03727474161719155, "learning_rate": 4.946838621387063e-05, "loss": 0.7548, "step": 4015 }, { "epoch": 0.7613636363636364, "grad_norm": 0.038729203633207226, "learning_rate": 4.9100943944812114e-05, "loss": 0.7723, "step": 4020 }, { "epoch": 0.7623106060606061, "grad_norm": 0.04033403314672209, "learning_rate": 4.873460416306023e-05, "loss": 0.7815, "step": 4025 }, { "epoch": 0.7632575757575758, "grad_norm": 0.03663235670257645, "learning_rate": 4.836937087147655e-05, "loss": 0.7968, "step": 4030 }, { "epoch": 0.7642045454545454, "grad_norm": 0.038393805155331855, "learning_rate": 4.8005248060832446e-05, "loss": 0.7572, "step": 4035 }, { "epoch": 0.7651515151515151, "grad_norm": 0.03692754409484954, "learning_rate": 4.7642239709765596e-05, "loss": 0.7707, "step": 4040 }, { "epoch": 0.7660984848484849, "grad_norm": 0.03765360322249641, "learning_rate": 4.728034978473621e-05, "loss": 0.7886, "step": 4045 }, { "epoch": 0.7670454545454546, "grad_norm": 0.04124091795917703, "learning_rate": 4.691958223998401e-05, "loss": 0.7693, "step": 4050 }, { "epoch": 0.7679924242424242, "grad_norm": 0.038689870405394365, "learning_rate": 4.655994101748477e-05, "loss": 0.7921, "step": 4055 }, { "epoch": 0.7689393939393939, "grad_norm": 0.03609443794000663, "learning_rate": 4.620143004690736e-05, "loss": 0.7289, "step": 4060 }, { "epoch": 0.7698863636363636, "grad_norm": 0.03987735376629914, "learning_rate": 4.584405324557092e-05, "loss": 0.7605, "step": 4065 }, { "epoch": 0.7708333333333334, "grad_norm": 0.04097229768216734, "learning_rate": 4.548781451840179e-05, "loss": 0.7663, "step": 4070 }, { "epoch": 0.771780303030303, "grad_norm": 0.040742694287622665, "learning_rate": 4.513271775789099e-05, "loss": 0.8028, "step": 4075 }, { "epoch": 0.7727272727272727, "grad_norm": 0.04002660818121977, "learning_rate": 4.477876684405179e-05, "loss": 0.7613, "step": 4080 }, { "epoch": 0.7736742424242424, "grad_norm": 0.03889418322921735, "learning_rate": 4.4425965644377206e-05, "loss": 0.7551, "step": 4085 }, { "epoch": 0.7746212121212122, "grad_norm": 0.041611350969633386, "learning_rate": 4.407431801379765e-05, "loss": 0.7626, "step": 4090 }, { "epoch": 0.7755681818181818, "grad_norm": 0.038681908003403036, "learning_rate": 4.37238277946389e-05, "loss": 0.7903, "step": 4095 }, { "epoch": 0.7765151515151515, "grad_norm": 0.03548309908592482, "learning_rate": 4.337449881658027e-05, "loss": 0.7786, "step": 4100 }, { "epoch": 0.7774621212121212, "grad_norm": 0.039380169633909605, "learning_rate": 4.3026334896612454e-05, "loss": 0.7403, "step": 4105 }, { "epoch": 0.7784090909090909, "grad_norm": 0.04079678253532297, "learning_rate": 4.267933983899601e-05, "loss": 0.7436, "step": 4110 }, { "epoch": 0.7793560606060606, "grad_norm": 0.039301682797346464, "learning_rate": 4.233351743521987e-05, "loss": 0.7671, "step": 4115 }, { "epoch": 0.7803030303030303, "grad_norm": 0.03847308151820405, "learning_rate": 4.19888714639597e-05, "loss": 0.7448, "step": 4120 }, { "epoch": 0.78125, "grad_norm": 0.041300407433686306, "learning_rate": 4.164540569103667e-05, "loss": 0.7589, "step": 4125 }, { "epoch": 0.7821969696969697, "grad_norm": 0.03941592905737965, "learning_rate": 4.1303123869376535e-05, "loss": 0.757, "step": 4130 }, { "epoch": 0.7831439393939394, "grad_norm": 0.037406188407398566, "learning_rate": 4.096202973896825e-05, "loss": 0.7725, "step": 4135 }, { "epoch": 0.7840909090909091, "grad_norm": 0.03990816917288711, "learning_rate": 4.0622127026823445e-05, "loss": 0.7317, "step": 4140 }, { "epoch": 0.7850378787878788, "grad_norm": 0.03511090719697071, "learning_rate": 4.028341944693543e-05, "loss": 0.7529, "step": 4145 }, { "epoch": 0.7859848484848485, "grad_norm": 0.0379918188663595, "learning_rate": 3.9945910700238865e-05, "loss": 0.7766, "step": 4150 }, { "epoch": 0.7869318181818182, "grad_norm": 0.04057440463664927, "learning_rate": 3.960960447456907e-05, "loss": 0.7828, "step": 4155 }, { "epoch": 0.7878787878787878, "grad_norm": 0.03820049794907823, "learning_rate": 3.9274504444622016e-05, "loss": 0.7687, "step": 4160 }, { "epoch": 0.7888257575757576, "grad_norm": 0.04059223380009775, "learning_rate": 3.894061427191384e-05, "loss": 0.7736, "step": 4165 }, { "epoch": 0.7897727272727273, "grad_norm": 0.03586736668288177, "learning_rate": 3.860793760474105e-05, "loss": 0.7504, "step": 4170 }, { "epoch": 0.790719696969697, "grad_norm": 0.03808778963155071, "learning_rate": 3.8276478078140746e-05, "loss": 0.7827, "step": 4175 }, { "epoch": 0.7916666666666666, "grad_norm": 0.04112934182070688, "learning_rate": 3.794623931385062e-05, "loss": 0.7754, "step": 4180 }, { "epoch": 0.7926136363636364, "grad_norm": 0.03890850582072587, "learning_rate": 3.7617224920269607e-05, "loss": 0.7529, "step": 4185 }, { "epoch": 0.7935606060606061, "grad_norm": 0.03888836620814126, "learning_rate": 3.7289438492418375e-05, "loss": 0.7797, "step": 4190 }, { "epoch": 0.7945075757575758, "grad_norm": 0.04186176891014295, "learning_rate": 3.696288361190015e-05, "loss": 0.7735, "step": 4195 }, { "epoch": 0.7954545454545454, "grad_norm": 0.03922361327888292, "learning_rate": 3.663756384686127e-05, "loss": 0.7431, "step": 4200 }, { "epoch": 0.7964015151515151, "grad_norm": 0.037477031699552646, "learning_rate": 3.631348275195259e-05, "loss": 0.7477, "step": 4205 }, { "epoch": 0.7973484848484849, "grad_norm": 0.03804879504351854, "learning_rate": 3.599064386829051e-05, "loss": 0.7873, "step": 4210 }, { "epoch": 0.7982954545454546, "grad_norm": 0.041760751407490776, "learning_rate": 3.5669050723418074e-05, "loss": 0.7644, "step": 4215 }, { "epoch": 0.7992424242424242, "grad_norm": 0.035448569098769006, "learning_rate": 3.534870683126664e-05, "loss": 0.7786, "step": 4220 }, { "epoch": 0.8001893939393939, "grad_norm": 0.042355358195710444, "learning_rate": 3.5029615692117555e-05, "loss": 0.7576, "step": 4225 }, { "epoch": 0.8011363636363636, "grad_norm": 0.039815508314079394, "learning_rate": 3.47117807925636e-05, "loss": 0.7678, "step": 4230 }, { "epoch": 0.8020833333333334, "grad_norm": 0.04043653268326337, "learning_rate": 3.4395205605471286e-05, "loss": 0.7763, "step": 4235 }, { "epoch": 0.803030303030303, "grad_norm": 0.03871480607482675, "learning_rate": 3.4079893589942543e-05, "loss": 0.761, "step": 4240 }, { "epoch": 0.8039772727272727, "grad_norm": 0.041056778138514105, "learning_rate": 3.376584819127712e-05, "loss": 0.7686, "step": 4245 }, { "epoch": 0.8049242424242424, "grad_norm": 0.0402161423571865, "learning_rate": 3.3453072840935e-05, "loss": 0.7704, "step": 4250 }, { "epoch": 0.8058712121212122, "grad_norm": 0.03864045638204508, "learning_rate": 3.314157095649868e-05, "loss": 0.7707, "step": 4255 }, { "epoch": 0.8068181818181818, "grad_norm": 0.042228336396620804, "learning_rate": 3.283134594163599e-05, "loss": 0.7482, "step": 4260 }, { "epoch": 0.8077651515151515, "grad_norm": 0.04047310966274174, "learning_rate": 3.252240118606293e-05, "loss": 0.7587, "step": 4265 }, { "epoch": 0.8087121212121212, "grad_norm": 0.03977289729530792, "learning_rate": 3.221474006550662e-05, "loss": 0.768, "step": 4270 }, { "epoch": 0.8096590909090909, "grad_norm": 0.03903161422504673, "learning_rate": 3.1908365941668115e-05, "loss": 0.7433, "step": 4275 }, { "epoch": 0.8106060606060606, "grad_norm": 0.03937683373863951, "learning_rate": 3.160328216218617e-05, "loss": 0.7889, "step": 4280 }, { "epoch": 0.8115530303030303, "grad_norm": 0.039794170439367574, "learning_rate": 3.129949206060039e-05, "loss": 0.7418, "step": 4285 }, { "epoch": 0.8125, "grad_norm": 0.041917271276222724, "learning_rate": 3.099699895631474e-05, "loss": 0.7451, "step": 4290 }, { "epoch": 0.8134469696969697, "grad_norm": 0.037936541801760024, "learning_rate": 3.069580615456137e-05, "loss": 0.7627, "step": 4295 }, { "epoch": 0.8143939393939394, "grad_norm": 0.037614637220929004, "learning_rate": 3.03959169463646e-05, "loss": 0.7674, "step": 4300 }, { "epoch": 0.8153409090909091, "grad_norm": 0.03881575160344137, "learning_rate": 3.009733460850473e-05, "loss": 0.7646, "step": 4305 }, { "epoch": 0.8162878787878788, "grad_norm": 0.041064989397716814, "learning_rate": 2.9800062403482493e-05, "loss": 0.7554, "step": 4310 }, { "epoch": 0.8172348484848485, "grad_norm": 0.03751500962917203, "learning_rate": 2.9504103579483163e-05, "loss": 0.772, "step": 4315 }, { "epoch": 0.8181818181818182, "grad_norm": 0.03762830736526142, "learning_rate": 2.9209461370341204e-05, "loss": 0.7419, "step": 4320 }, { "epoch": 0.8191287878787878, "grad_norm": 0.043952738002868426, "learning_rate": 2.891613899550499e-05, "loss": 0.7876, "step": 4325 }, { "epoch": 0.8200757575757576, "grad_norm": 0.04012795700450522, "learning_rate": 2.8624139660001448e-05, "loss": 0.7589, "step": 4330 }, { "epoch": 0.8210227272727273, "grad_norm": 0.03722728875218868, "learning_rate": 2.8333466554401125e-05, "loss": 0.7521, "step": 4335 }, { "epoch": 0.821969696969697, "grad_norm": 0.037476550396028804, "learning_rate": 2.804412285478343e-05, "loss": 0.7393, "step": 4340 }, { "epoch": 0.8229166666666666, "grad_norm": 0.037123539268036035, "learning_rate": 2.775611172270185e-05, "loss": 0.7654, "step": 4345 }, { "epoch": 0.8238636363636364, "grad_norm": 0.03776861945697112, "learning_rate": 2.7469436305149172e-05, "loss": 0.7629, "step": 4350 }, { "epoch": 0.8248106060606061, "grad_norm": 0.0397986113112224, "learning_rate": 2.7184099734523567e-05, "loss": 0.776, "step": 4355 }, { "epoch": 0.8257575757575758, "grad_norm": 0.04058484357450401, "learning_rate": 2.690010512859403e-05, "loss": 0.7563, "step": 4360 }, { "epoch": 0.8267045454545454, "grad_norm": 0.03896827353754326, "learning_rate": 2.6617455590466363e-05, "loss": 0.7457, "step": 4365 }, { "epoch": 0.8276515151515151, "grad_norm": 0.03825814102977763, "learning_rate": 2.633615420854928e-05, "loss": 0.75, "step": 4370 }, { "epoch": 0.8285984848484849, "grad_norm": 0.03716265282693635, "learning_rate": 2.6056204056520795e-05, "loss": 0.758, "step": 4375 }, { "epoch": 0.8295454545454546, "grad_norm": 0.04444901558357328, "learning_rate": 2.5777608193294396e-05, "loss": 0.7576, "step": 4380 }, { "epoch": 0.8304924242424242, "grad_norm": 0.03624822046682716, "learning_rate": 2.550036966298581e-05, "loss": 0.7483, "step": 4385 }, { "epoch": 0.8314393939393939, "grad_norm": 0.038466147912297376, "learning_rate": 2.5224491494879705e-05, "loss": 0.7735, "step": 4390 }, { "epoch": 0.8323863636363636, "grad_norm": 0.04216736810756713, "learning_rate": 2.4949976703396486e-05, "loss": 0.7666, "step": 4395 }, { "epoch": 0.8333333333333334, "grad_norm": 0.037577989344391036, "learning_rate": 2.4676828288059558e-05, "loss": 0.7504, "step": 4400 }, { "epoch": 0.834280303030303, "grad_norm": 0.04096373784722742, "learning_rate": 2.4405049233462316e-05, "loss": 0.7541, "step": 4405 }, { "epoch": 0.8352272727272727, "grad_norm": 0.03828220611720389, "learning_rate": 2.413464250923566e-05, "loss": 0.7512, "step": 4410 }, { "epoch": 0.8361742424242424, "grad_norm": 0.03788635961789358, "learning_rate": 2.3865611070015605e-05, "loss": 0.7544, "step": 4415 }, { "epoch": 0.8371212121212122, "grad_norm": 0.04139269826653916, "learning_rate": 2.3597957855410932e-05, "loss": 0.7847, "step": 4420 }, { "epoch": 0.8380681818181818, "grad_norm": 0.04098240265367287, "learning_rate": 2.3331685789970978e-05, "loss": 0.7548, "step": 4425 }, { "epoch": 0.8390151515151515, "grad_norm": 0.0366730676630168, "learning_rate": 2.3066797783153767e-05, "loss": 0.7546, "step": 4430 }, { "epoch": 0.8399621212121212, "grad_norm": 0.03855939364168934, "learning_rate": 2.280329672929434e-05, "loss": 0.7526, "step": 4435 }, { "epoch": 0.8409090909090909, "grad_norm": 0.03769398425525407, "learning_rate": 2.2541185507572858e-05, "loss": 0.7659, "step": 4440 }, { "epoch": 0.8418560606060606, "grad_norm": 0.037763802543836905, "learning_rate": 2.228046698198336e-05, "loss": 0.7492, "step": 4445 }, { "epoch": 0.8428030303030303, "grad_norm": 0.038504139823869195, "learning_rate": 2.202114400130246e-05, "loss": 0.7532, "step": 4450 }, { "epoch": 0.84375, "grad_norm": 0.03839729099480198, "learning_rate": 2.1763219399058042e-05, "loss": 0.7716, "step": 4455 }, { "epoch": 0.8446969696969697, "grad_norm": 0.03634271413981629, "learning_rate": 2.150669599349845e-05, "loss": 0.781, "step": 4460 }, { "epoch": 0.8456439393939394, "grad_norm": 0.038799770819478115, "learning_rate": 2.1251576587561774e-05, "loss": 0.7471, "step": 4465 }, { "epoch": 0.8465909090909091, "grad_norm": 0.037447866031002947, "learning_rate": 2.0997863968844914e-05, "loss": 0.7454, "step": 4470 }, { "epoch": 0.8475378787878788, "grad_norm": 0.03812532250323499, "learning_rate": 2.0745560909573534e-05, "loss": 0.7487, "step": 4475 }, { "epoch": 0.8484848484848485, "grad_norm": 0.03967052575801908, "learning_rate": 2.0494670166571353e-05, "loss": 0.7448, "step": 4480 }, { "epoch": 0.8494318181818182, "grad_norm": 0.038717835231477656, "learning_rate": 2.0245194481230386e-05, "loss": 0.746, "step": 4485 }, { "epoch": 0.8503787878787878, "grad_norm": 0.03978509491909852, "learning_rate": 1.9997136579480698e-05, "loss": 0.7591, "step": 4490 }, { "epoch": 0.8513257575757576, "grad_norm": 0.040392976405494746, "learning_rate": 1.9750499171760864e-05, "loss": 0.7437, "step": 4495 }, { "epoch": 0.8522727272727273, "grad_norm": 0.03839961150423484, "learning_rate": 1.9505284952988154e-05, "loss": 0.7191, "step": 4500 }, { "epoch": 0.853219696969697, "grad_norm": 0.03701174541805748, "learning_rate": 1.9261496602529163e-05, "loss": 0.7614, "step": 4505 }, { "epoch": 0.8541666666666666, "grad_norm": 0.03820961258437268, "learning_rate": 1.9019136784170635e-05, "loss": 0.7914, "step": 4510 }, { "epoch": 0.8551136363636364, "grad_norm": 0.039034746383769636, "learning_rate": 1.877820814609018e-05, "loss": 0.7378, "step": 4515 }, { "epoch": 0.8560606060606061, "grad_norm": 0.035548177827413464, "learning_rate": 1.8538713320827398e-05, "loss": 0.7587, "step": 4520 }, { "epoch": 0.8570075757575758, "grad_norm": 0.03927586449468295, "learning_rate": 1.8300654925255227e-05, "loss": 0.7505, "step": 4525 }, { "epoch": 0.8579545454545454, "grad_norm": 0.03808728080301323, "learning_rate": 1.8064035560551254e-05, "loss": 0.7546, "step": 4530 }, { "epoch": 0.8589015151515151, "grad_norm": 0.03971353114564455, "learning_rate": 1.7828857812169183e-05, "loss": 0.7481, "step": 4535 }, { "epoch": 0.8598484848484849, "grad_norm": 0.038394015305144635, "learning_rate": 1.7595124249810798e-05, "loss": 0.7512, "step": 4540 }, { "epoch": 0.8607954545454546, "grad_norm": 0.04035971231008132, "learning_rate": 1.736283742739781e-05, "loss": 0.7514, "step": 4545 }, { "epoch": 0.8617424242424242, "grad_norm": 0.03852526269337616, "learning_rate": 1.7131999883043864e-05, "loss": 0.7324, "step": 4550 }, { "epoch": 0.8626893939393939, "grad_norm": 0.0380464207669555, "learning_rate": 1.690261413902685e-05, "loss": 0.778, "step": 4555 }, { "epoch": 0.8636363636363636, "grad_norm": 0.03809307250418814, "learning_rate": 1.6674682701761493e-05, "loss": 0.741, "step": 4560 }, { "epoch": 0.8645833333333334, "grad_norm": 0.03730545835154613, "learning_rate": 1.644820806177165e-05, "loss": 0.7494, "step": 4565 }, { "epoch": 0.865530303030303, "grad_norm": 0.0410319046669476, "learning_rate": 1.622319269366349e-05, "loss": 0.7774, "step": 4570 }, { "epoch": 0.8664772727272727, "grad_norm": 0.0365044250054721, "learning_rate": 1.599963905609807e-05, "loss": 0.7404, "step": 4575 }, { "epoch": 0.8674242424242424, "grad_norm": 0.039763773803898096, "learning_rate": 1.5777549591764705e-05, "loss": 0.7789, "step": 4580 }, { "epoch": 0.8683712121212122, "grad_norm": 0.03734007441823457, "learning_rate": 1.555692672735431e-05, "loss": 0.7487, "step": 4585 }, { "epoch": 0.8693181818181818, "grad_norm": 0.040126269127401595, "learning_rate": 1.5337772873532696e-05, "loss": 0.7653, "step": 4590 }, { "epoch": 0.8702651515151515, "grad_norm": 0.04012812890151172, "learning_rate": 1.5120090424914305e-05, "loss": 0.7763, "step": 4595 }, { "epoch": 0.8712121212121212, "grad_norm": 0.03851463594249241, "learning_rate": 1.4903881760036163e-05, "loss": 0.7654, "step": 4600 }, { "epoch": 0.8721590909090909, "grad_norm": 0.03700030549429758, "learning_rate": 1.46891492413318e-05, "loss": 0.7481, "step": 4605 }, { "epoch": 0.8731060606060606, "grad_norm": 0.037764961164442196, "learning_rate": 1.4475895215105299e-05, "loss": 0.751, "step": 4610 }, { "epoch": 0.8740530303030303, "grad_norm": 0.039658099635677214, "learning_rate": 1.4264122011505919e-05, "loss": 0.7454, "step": 4615 }, { "epoch": 0.875, "grad_norm": 0.037877514658421034, "learning_rate": 1.4053831944502508e-05, "loss": 0.7311, "step": 4620 }, { "epoch": 0.8759469696969697, "grad_norm": 0.03981948814673359, "learning_rate": 1.3845027311858149e-05, "loss": 0.7701, "step": 4625 }, { "epoch": 0.8768939393939394, "grad_norm": 0.03725499979086121, "learning_rate": 1.3637710395105134e-05, "loss": 0.7496, "step": 4630 }, { "epoch": 0.8778409090909091, "grad_norm": 0.03845165051106696, "learning_rate": 1.3431883459520115e-05, "loss": 0.7598, "step": 4635 }, { "epoch": 0.8787878787878788, "grad_norm": 0.03921039831701288, "learning_rate": 1.3227548754099148e-05, "loss": 0.7576, "step": 4640 }, { "epoch": 0.8797348484848485, "grad_norm": 0.03677027674299414, "learning_rate": 1.3024708511533266e-05, "loss": 0.7536, "step": 4645 }, { "epoch": 0.8806818181818182, "grad_norm": 0.03769732908420657, "learning_rate": 1.2823364948184095e-05, "loss": 0.7631, "step": 4650 }, { "epoch": 0.8816287878787878, "grad_norm": 0.038277457641516056, "learning_rate": 1.2623520264059528e-05, "loss": 0.758, "step": 4655 }, { "epoch": 0.8825757575757576, "grad_norm": 0.03835316890120053, "learning_rate": 1.2425176642789841e-05, "loss": 0.7545, "step": 4660 }, { "epoch": 0.8835227272727273, "grad_norm": 0.040317569065410515, "learning_rate": 1.2228336251603632e-05, "loss": 0.7211, "step": 4665 }, { "epoch": 0.884469696969697, "grad_norm": 0.03605361368261573, "learning_rate": 1.2033001241304285e-05, "loss": 0.7356, "step": 4670 }, { "epoch": 0.8854166666666666, "grad_norm": 0.042368074274969164, "learning_rate": 1.1839173746246462e-05, "loss": 0.7643, "step": 4675 }, { "epoch": 0.8863636363636364, "grad_norm": 0.04206007952837537, "learning_rate": 1.164685588431281e-05, "loss": 0.7694, "step": 4680 }, { "epoch": 0.8873106060606061, "grad_norm": 0.03587287003409619, "learning_rate": 1.14560497568906e-05, "loss": 0.7336, "step": 4685 }, { "epoch": 0.8882575757575758, "grad_norm": 0.04055727525356863, "learning_rate": 1.126675744884904e-05, "loss": 0.7858, "step": 4690 }, { "epoch": 0.8892045454545454, "grad_norm": 0.03731194721410893, "learning_rate": 1.1078981028516421e-05, "loss": 0.7546, "step": 4695 }, { "epoch": 0.8901515151515151, "grad_norm": 0.03913350636593797, "learning_rate": 1.08927225476574e-05, "loss": 0.7555, "step": 4700 }, { "epoch": 0.8910984848484849, "grad_norm": 0.03620266304429595, "learning_rate": 1.0707984041450673e-05, "loss": 0.7393, "step": 4705 }, { "epoch": 0.8920454545454546, "grad_norm": 0.0372176814841684, "learning_rate": 1.0524767528466766e-05, "loss": 0.7815, "step": 4710 }, { "epoch": 0.8929924242424242, "grad_norm": 0.04163117308071071, "learning_rate": 1.034307501064589e-05, "loss": 0.7744, "step": 4715 }, { "epoch": 0.8939393939393939, "grad_norm": 0.03841314072028053, "learning_rate": 1.0162908473276133e-05, "loss": 0.7441, "step": 4720 }, { "epoch": 0.8948863636363636, "grad_norm": 0.03658511014566751, "learning_rate": 9.984269884971796e-06, "loss": 0.7534, "step": 4725 }, { "epoch": 0.8958333333333334, "grad_norm": 0.035726934558083914, "learning_rate": 9.807161197651742e-06, "loss": 0.7561, "step": 4730 }, { "epoch": 0.896780303030303, "grad_norm": 0.03786917865045401, "learning_rate": 9.63158434651825e-06, "loss": 0.753, "step": 4735 }, { "epoch": 0.8977272727272727, "grad_norm": 0.03878081614015611, "learning_rate": 9.45754125003576e-06, "loss": 0.7665, "step": 4740 }, { "epoch": 0.8986742424242424, "grad_norm": 0.03776273136819908, "learning_rate": 9.285033809909863e-06, "loss": 0.7882, "step": 4745 }, { "epoch": 0.8996212121212122, "grad_norm": 0.04079662714361428, "learning_rate": 9.114063911066676e-06, "loss": 0.7775, "step": 4750 }, { "epoch": 0.9005681818181818, "grad_norm": 0.04107251149823735, "learning_rate": 8.944633421632169e-06, "loss": 0.7785, "step": 4755 }, { "epoch": 0.9015151515151515, "grad_norm": 0.043937037368177494, "learning_rate": 8.776744192911666e-06, "loss": 0.7709, "step": 4760 }, { "epoch": 0.9024621212121212, "grad_norm": 0.03806032575275296, "learning_rate": 8.610398059369733e-06, "loss": 0.7398, "step": 4765 }, { "epoch": 0.9034090909090909, "grad_norm": 0.03989849682979902, "learning_rate": 8.445596838610136e-06, "loss": 0.7839, "step": 4770 }, { "epoch": 0.9043560606060606, "grad_norm": 0.03804089571024527, "learning_rate": 8.282342331355896e-06, "loss": 0.737, "step": 4775 }, { "epoch": 0.9053030303030303, "grad_norm": 0.036823538728651795, "learning_rate": 8.120636321429618e-06, "loss": 0.7365, "step": 4780 }, { "epoch": 0.90625, "grad_norm": 0.039238403212191623, "learning_rate": 7.960480575734162e-06, "loss": 0.7679, "step": 4785 }, { "epoch": 0.9071969696969697, "grad_norm": 0.03655300704953951, "learning_rate": 7.801876844233102e-06, "loss": 0.7276, "step": 4790 }, { "epoch": 0.9081439393939394, "grad_norm": 0.038671267549804565, "learning_rate": 7.64482685993174e-06, "loss": 0.754, "step": 4795 }, { "epoch": 0.9090909090909091, "grad_norm": 0.04012488210471297, "learning_rate": 7.489332338858201e-06, "loss": 0.7706, "step": 4800 }, { "epoch": 0.9100378787878788, "grad_norm": 0.039340313195190324, "learning_rate": 7.3353949800445625e-06, "loss": 0.7437, "step": 4805 }, { "epoch": 0.9109848484848485, "grad_norm": 0.03623402554079789, "learning_rate": 7.1830164655084175e-06, "loss": 0.747, "step": 4810 }, { "epoch": 0.9119318181818182, "grad_norm": 0.040697271432715135, "learning_rate": 7.032198460234367e-06, "loss": 0.7624, "step": 4815 }, { "epoch": 0.9128787878787878, "grad_norm": 0.03876794678188874, "learning_rate": 6.88294261215595e-06, "loss": 0.7132, "step": 4820 }, { "epoch": 0.9138257575757576, "grad_norm": 0.039259366616565435, "learning_rate": 6.7352505521375445e-06, "loss": 0.768, "step": 4825 }, { "epoch": 0.9147727272727273, "grad_norm": 0.04205245328852447, "learning_rate": 6.5891238939566275e-06, "loss": 0.78, "step": 4830 }, { "epoch": 0.915719696969697, "grad_norm": 0.041430998012228624, "learning_rate": 6.444564234286059e-06, "loss": 0.7476, "step": 4835 }, { "epoch": 0.9166666666666666, "grad_norm": 0.04090068304483327, "learning_rate": 6.301573152676664e-06, "loss": 0.7832, "step": 4840 }, { "epoch": 0.9176136363636364, "grad_norm": 0.03893377913410083, "learning_rate": 6.160152211540059e-06, "loss": 0.766, "step": 4845 }, { "epoch": 0.9185606060606061, "grad_norm": 0.03562213581544829, "learning_rate": 6.020302956131434e-06, "loss": 0.7506, "step": 4850 }, { "epoch": 0.9195075757575758, "grad_norm": 0.03858250177735203, "learning_rate": 5.8820269145327335e-06, "loss": 0.7449, "step": 4855 }, { "epoch": 0.9204545454545454, "grad_norm": 0.03698269538357442, "learning_rate": 5.7453255976360526e-06, "loss": 0.7419, "step": 4860 }, { "epoch": 0.9214015151515151, "grad_norm": 0.039313884060948906, "learning_rate": 5.6102004991269655e-06, "loss": 0.7509, "step": 4865 }, { "epoch": 0.9223484848484849, "grad_norm": 0.038202779909671226, "learning_rate": 5.476653095468292e-06, "loss": 0.7404, "step": 4870 }, { "epoch": 0.9232954545454546, "grad_norm": 0.038488820882748215, "learning_rate": 5.344684845883957e-06, "loss": 0.7584, "step": 4875 }, { "epoch": 0.9242424242424242, "grad_norm": 0.03673421514324292, "learning_rate": 5.214297192343104e-06, "loss": 0.7493, "step": 4880 }, { "epoch": 0.9251893939393939, "grad_norm": 0.0350920577902006, "learning_rate": 5.085491559544175e-06, "loss": 0.7834, "step": 4885 }, { "epoch": 0.9261363636363636, "grad_norm": 0.03508459667371372, "learning_rate": 4.9582693548994914e-06, "loss": 0.761, "step": 4890 }, { "epoch": 0.9270833333333334, "grad_norm": 0.03866865889378931, "learning_rate": 4.832631968519862e-06, "loss": 0.7536, "step": 4895 }, { "epoch": 0.928030303030303, "grad_norm": 0.03900577265735235, "learning_rate": 4.708580773199333e-06, "loss": 0.7588, "step": 4900 }, { "epoch": 0.9289772727272727, "grad_norm": 0.03975550249874538, "learning_rate": 4.586117124400196e-06, "loss": 0.7301, "step": 4905 }, { "epoch": 0.9299242424242424, "grad_norm": 0.03658042099322432, "learning_rate": 4.465242360238269e-06, "loss": 0.7192, "step": 4910 }, { "epoch": 0.9308712121212122, "grad_norm": 0.03538458441608596, "learning_rate": 4.345957801468092e-06, "loss": 0.7537, "step": 4915 }, { "epoch": 0.9318181818181818, "grad_norm": 0.039696089459974056, "learning_rate": 4.228264751468752e-06, "loss": 0.7578, "step": 4920 }, { "epoch": 0.9327651515151515, "grad_norm": 0.03891493239675872, "learning_rate": 4.112164496229381e-06, "loss": 0.7988, "step": 4925 }, { "epoch": 0.9337121212121212, "grad_norm": 0.0391191766137019, "learning_rate": 3.997658304335249e-06, "loss": 0.748, "step": 4930 }, { "epoch": 0.9346590909090909, "grad_norm": 0.041273803822080235, "learning_rate": 3.88474742695391e-06, "loss": 0.7444, "step": 4935 }, { "epoch": 0.9356060606060606, "grad_norm": 0.03833780199551714, "learning_rate": 3.77343309782151e-06, "loss": 0.7535, "step": 4940 }, { "epoch": 0.9365530303030303, "grad_norm": 0.0377506973575768, "learning_rate": 3.663716533229183e-06, "loss": 0.7603, "step": 4945 }, { "epoch": 0.9375, "grad_norm": 0.03920529885627104, "learning_rate": 3.5555989320099952e-06, "loss": 0.7346, "step": 4950 }, { "epoch": 0.9384469696969697, "grad_norm": 0.03926812513866438, "learning_rate": 3.4490814755256724e-06, "loss": 0.7882, "step": 4955 }, { "epoch": 0.9393939393939394, "grad_norm": 0.03915603389468844, "learning_rate": 3.344165327653725e-06, "loss": 0.7804, "step": 4960 }, { "epoch": 0.9403409090909091, "grad_norm": 0.03692758597782573, "learning_rate": 3.2408516347747606e-06, "loss": 0.7615, "step": 4965 }, { "epoch": 0.9412878787878788, "grad_norm": 0.040111477519722376, "learning_rate": 3.1391415257599583e-06, "loss": 0.7624, "step": 4970 }, { "epoch": 0.9422348484848485, "grad_norm": 0.036054816584654786, "learning_rate": 3.039036111958715e-06, "loss": 0.7595, "step": 4975 }, { "epoch": 0.9431818181818182, "grad_norm": 0.03402332704452141, "learning_rate": 2.9405364871864514e-06, "loss": 0.7569, "step": 4980 }, { "epoch": 0.9441287878787878, "grad_norm": 0.039782631917374064, "learning_rate": 2.8436437277128075e-06, "loss": 0.7616, "step": 4985 }, { "epoch": 0.9450757575757576, "grad_norm": 0.03902157823658662, "learning_rate": 2.7483588922497025e-06, "loss": 0.7324, "step": 4990 }, { "epoch": 0.9460227272727273, "grad_norm": 0.039325622745832914, "learning_rate": 2.6546830219399405e-06, "loss": 0.7597, "step": 4995 }, { "epoch": 0.946969696969697, "grad_norm": 0.03881835950586153, "learning_rate": 2.562617140345691e-06, "loss": 0.7473, "step": 5000 }, { "epoch": 0.9479166666666666, "grad_norm": 0.039364545515671236, "learning_rate": 2.472162253437343e-06, "loss": 0.7553, "step": 5005 }, { "epoch": 0.9488636363636364, "grad_norm": 0.03556550322704117, "learning_rate": 2.3833193495825853e-06, "loss": 0.7329, "step": 5010 }, { "epoch": 0.9498106060606061, "grad_norm": 0.03769878462512779, "learning_rate": 2.2960893995355443e-06, "loss": 0.7677, "step": 5015 }, { "epoch": 0.9507575757575758, "grad_norm": 0.04179821670604519, "learning_rate": 2.210473356426146e-06, "loss": 0.7329, "step": 5020 }, { "epoch": 0.9517045454545454, "grad_norm": 0.03551927757361974, "learning_rate": 2.1264721557497866e-06, "loss": 0.745, "step": 5025 }, { "epoch": 0.9526515151515151, "grad_norm": 0.035762866838308725, "learning_rate": 2.0440867153570627e-06, "loss": 0.757, "step": 5030 }, { "epoch": 0.9535984848484849, "grad_norm": 0.03810233771587777, "learning_rate": 1.9633179354437257e-06, "loss": 0.737, "step": 5035 }, { "epoch": 0.9545454545454546, "grad_norm": 0.03824899164000916, "learning_rate": 1.8841666985408566e-06, "loss": 0.7708, "step": 5040 }, { "epoch": 0.9554924242424242, "grad_norm": 0.03784087071597624, "learning_rate": 1.8066338695052585e-06, "loss": 0.7791, "step": 5045 }, { "epoch": 0.9564393939393939, "grad_norm": 0.03875228767293192, "learning_rate": 1.730720295509963e-06, "loss": 0.757, "step": 5050 }, { "epoch": 0.9573863636363636, "grad_norm": 0.03959979451862991, "learning_rate": 1.6564268060349884e-06, "loss": 0.7581, "step": 5055 }, { "epoch": 0.9583333333333334, "grad_norm": 0.03746814864806074, "learning_rate": 1.583754212858329e-06, "loss": 0.7492, "step": 5060 }, { "epoch": 0.959280303030303, "grad_norm": 0.03840085310645041, "learning_rate": 1.5127033100469477e-06, "loss": 0.7428, "step": 5065 }, { "epoch": 0.9602272727272727, "grad_norm": 0.04015902576846351, "learning_rate": 1.4432748739482468e-06, "loss": 0.7601, "step": 5070 }, { "epoch": 0.9611742424242424, "grad_norm": 0.03815333482717027, "learning_rate": 1.3754696631815276e-06, "loss": 0.7781, "step": 5075 }, { "epoch": 0.9621212121212122, "grad_norm": 0.039134552740557424, "learning_rate": 1.3092884186296282e-06, "loss": 0.7605, "step": 5080 }, { "epoch": 0.9630681818181818, "grad_norm": 0.03873683575508503, "learning_rate": 1.2447318634309977e-06, "loss": 0.7465, "step": 5085 }, { "epoch": 0.9640151515151515, "grad_norm": 0.038640801639052244, "learning_rate": 1.1818007029716525e-06, "loss": 0.7616, "step": 5090 }, { "epoch": 0.9649621212121212, "grad_norm": 0.04251912678550345, "learning_rate": 1.1204956248774655e-06, "loss": 0.747, "step": 5095 }, { "epoch": 0.9659090909090909, "grad_norm": 0.039291943037907916, "learning_rate": 1.0608172990067553e-06, "loss": 0.7628, "step": 5100 }, { "epoch": 0.9668560606060606, "grad_norm": 0.04115060966519561, "learning_rate": 1.0027663774429096e-06, "loss": 0.7533, "step": 5105 }, { "epoch": 0.9678030303030303, "grad_norm": 0.040550851971633786, "learning_rate": 9.463434944872395e-07, "loss": 0.77, "step": 5110 }, { "epoch": 0.96875, "grad_norm": 0.036797005792547945, "learning_rate": 8.91549266652053e-07, "loss": 0.7296, "step": 5115 }, { "epoch": 0.9696969696969697, "grad_norm": 0.03621498987532269, "learning_rate": 8.383842926539929e-07, "loss": 0.7682, "step": 5120 }, { "epoch": 0.9706439393939394, "grad_norm": 0.03987403439986009, "learning_rate": 7.868491534073928e-07, "loss": 0.793, "step": 5125 }, { "epoch": 0.9715909090909091, "grad_norm": 0.03862093235236962, "learning_rate": 7.369444120179647e-07, "loss": 0.7388, "step": 5130 }, { "epoch": 0.9725378787878788, "grad_norm": 0.03845042714550149, "learning_rate": 6.88670613776704e-07, "loss": 0.7571, "step": 5135 }, { "epoch": 0.9734848484848485, "grad_norm": 0.03537218356309702, "learning_rate": 6.420282861538283e-07, "loss": 0.7192, "step": 5140 }, { "epoch": 0.9744318181818182, "grad_norm": 0.03716360855745044, "learning_rate": 5.970179387931151e-07, "loss": 0.7498, "step": 5145 }, { "epoch": 0.9753787878787878, "grad_norm": 0.03704971797049268, "learning_rate": 5.536400635062721e-07, "loss": 0.7639, "step": 5150 }, { "epoch": 0.9763257575757576, "grad_norm": 0.03658375948794085, "learning_rate": 5.118951342675592e-07, "loss": 0.7607, "step": 5155 }, { "epoch": 0.9772727272727273, "grad_norm": 0.038160851981614306, "learning_rate": 4.717836072086589e-07, "loss": 0.7761, "step": 5160 }, { "epoch": 0.978219696969697, "grad_norm": 0.03554298384663066, "learning_rate": 4.3330592061361357e-07, "loss": 0.7515, "step": 5165 }, { "epoch": 0.9791666666666666, "grad_norm": 0.035988940616932245, "learning_rate": 3.964624949141626e-07, "loss": 0.7287, "step": 5170 }, { "epoch": 0.9801136363636364, "grad_norm": 0.038573387129357734, "learning_rate": 3.6125373268499625e-07, "loss": 0.7584, "step": 5175 }, { "epoch": 0.9810606060606061, "grad_norm": 0.03560535292438474, "learning_rate": 3.2768001863945905e-07, "loss": 0.7381, "step": 5180 }, { "epoch": 0.9820075757575758, "grad_norm": 0.03470144345138998, "learning_rate": 2.9574171962533644e-07, "loss": 0.7447, "step": 5185 }, { "epoch": 0.9829545454545454, "grad_norm": 0.038083967145801485, "learning_rate": 2.654391846207915e-07, "loss": 0.7667, "step": 5190 }, { "epoch": 0.9839015151515151, "grad_norm": 0.03704077024461041, "learning_rate": 2.3677274473063444e-07, "loss": 0.7666, "step": 5195 }, { "epoch": 0.9848484848484849, "grad_norm": 0.04114138125476826, "learning_rate": 2.0974271318260905e-07, "loss": 0.7681, "step": 5200 }, { "epoch": 0.9857954545454546, "grad_norm": 0.03557878772125844, "learning_rate": 1.8434938532406186e-07, "loss": 0.7482, "step": 5205 }, { "epoch": 0.9867424242424242, "grad_norm": 0.03606070157019983, "learning_rate": 1.6059303861862826e-07, "loss": 0.7404, "step": 5210 }, { "epoch": 0.9876893939393939, "grad_norm": 0.037415707092603924, "learning_rate": 1.3847393264330153e-07, "loss": 0.768, "step": 5215 }, { "epoch": 0.9886363636363636, "grad_norm": 0.03725866754101771, "learning_rate": 1.1799230908550173e-07, "loss": 0.7409, "step": 5220 }, { "epoch": 0.9895833333333334, "grad_norm": 0.039547937064916217, "learning_rate": 9.914839174049449e-08, "loss": 0.7408, "step": 5225 }, { "epoch": 0.990530303030303, "grad_norm": 0.03604106643192906, "learning_rate": 8.194238650889307e-08, "loss": 0.7571, "step": 5230 }, { "epoch": 0.9914772727272727, "grad_norm": 0.03696081603757769, "learning_rate": 6.637448139447666e-08, "loss": 0.7416, "step": 5235 }, { "epoch": 0.9924242424242424, "grad_norm": 0.037770041786195266, "learning_rate": 5.244484650207548e-08, "loss": 0.756, "step": 5240 }, { "epoch": 0.9933712121212122, "grad_norm": 0.04046458037414051, "learning_rate": 4.01536340357389e-08, "loss": 0.759, "step": 5245 }, { "epoch": 0.9943181818181818, "grad_norm": 0.03827692567603896, "learning_rate": 2.9500978297103407e-08, "loss": 0.7787, "step": 5250 }, { "epoch": 0.9952651515151515, "grad_norm": 0.03835615698389389, "learning_rate": 2.0486995683860476e-08, "loss": 0.7284, "step": 5255 }, { "epoch": 0.9962121212121212, "grad_norm": 0.04008817912232536, "learning_rate": 1.3111784688507599e-08, "loss": 0.7493, "step": 5260 }, { "epoch": 0.9971590909090909, "grad_norm": 0.03940503331438013, "learning_rate": 7.375425897299115e-09, "loss": 0.7522, "step": 5265 }, { "epoch": 0.9981060606060606, "grad_norm": 0.03788195266551941, "learning_rate": 3.277981989346923e-09, "loss": 0.746, "step": 5270 }, { "epoch": 0.9990530303030303, "grad_norm": 0.03792409661384259, "learning_rate": 8.194977359210486e-10, "loss": 0.7443, "step": 5275 }, { "epoch": 1.0, "grad_norm": 0.04145596279829835, "learning_rate": 0.0, "loss": 0.7751, "step": 5280 }, { "epoch": 1.0, "eval_loss": 1.116625189781189, "eval_runtime": 1241.8314, "eval_samples_per_second": 194.261, "eval_steps_per_second": 6.071, "step": 5280 }, { "epoch": 1.0, "step": 5280, "total_flos": 771937243234304.0, "train_loss": 0.8302312182657646, "train_runtime": 21905.5744, "train_samples_per_second": 30.851, "train_steps_per_second": 0.241 } ], "logging_steps": 5, "max_steps": 5280, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 771937243234304.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }