{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 1000, "global_step": 12091, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0008270614506657845, "grad_norm": 14.711603164672852, "learning_rate": 1.487603305785124e-07, "loss": 2.7309, "step": 10 }, { "epoch": 0.001654122901331569, "grad_norm": 14.97842025756836, "learning_rate": 3.1404958677685957e-07, "loss": 2.7435, "step": 20 }, { "epoch": 0.0024811843519973536, "grad_norm": 12.011658668518066, "learning_rate": 4.793388429752067e-07, "loss": 2.7037, "step": 30 }, { "epoch": 0.003308245802663138, "grad_norm": 11.382739067077637, "learning_rate": 6.446280991735538e-07, "loss": 2.5787, "step": 40 }, { "epoch": 0.0041353072533289225, "grad_norm": 7.4454193115234375, "learning_rate": 8.099173553719009e-07, "loss": 2.2764, "step": 50 }, { "epoch": 0.004962368703994707, "grad_norm": 5.262652397155762, "learning_rate": 9.75206611570248e-07, "loss": 1.9743, "step": 60 }, { "epoch": 0.005789430154660491, "grad_norm": 3.370814323425293, "learning_rate": 1.140495867768595e-06, "loss": 1.5944, "step": 70 }, { "epoch": 0.006616491605326276, "grad_norm": 3.199523687362671, "learning_rate": 1.3057851239669423e-06, "loss": 1.3237, "step": 80 }, { "epoch": 0.00744355305599206, "grad_norm": 2.578493118286133, "learning_rate": 1.4710743801652894e-06, "loss": 1.0191, "step": 90 }, { "epoch": 0.008270614506657845, "grad_norm": 2.9655439853668213, "learning_rate": 1.6363636363636365e-06, "loss": 0.7199, "step": 100 }, { "epoch": 0.00909767595732363, "grad_norm": 5.320194244384766, "learning_rate": 1.8016528925619835e-06, "loss": 0.5692, "step": 110 }, { "epoch": 0.009924737407989414, "grad_norm": 1.893227458000183, "learning_rate": 1.966942148760331e-06, "loss": 0.4681, "step": 120 }, { "epoch": 0.010751798858655199, "grad_norm": 1.1032103300094604, "learning_rate": 2.132231404958678e-06, "loss": 0.3882, "step": 130 }, { "epoch": 0.011578860309320982, "grad_norm": 1.873787760734558, "learning_rate": 2.297520661157025e-06, "loss": 0.3182, "step": 140 }, { "epoch": 0.012405921759986767, "grad_norm": 3.0224239826202393, "learning_rate": 2.462809917355372e-06, "loss": 0.2801, "step": 150 }, { "epoch": 0.013232983210652551, "grad_norm": 1.4292343854904175, "learning_rate": 2.628099173553719e-06, "loss": 0.2074, "step": 160 }, { "epoch": 0.014060044661318336, "grad_norm": 5.8895368576049805, "learning_rate": 2.7933884297520662e-06, "loss": 0.2002, "step": 170 }, { "epoch": 0.01488710611198412, "grad_norm": 1.3134126663208008, "learning_rate": 2.9586776859504133e-06, "loss": 0.1672, "step": 180 }, { "epoch": 0.015714167562649903, "grad_norm": 0.5309329628944397, "learning_rate": 3.123966942148761e-06, "loss": 0.1528, "step": 190 }, { "epoch": 0.01654122901331569, "grad_norm": 0.7183464169502258, "learning_rate": 3.289256198347108e-06, "loss": 0.1324, "step": 200 }, { "epoch": 0.017368290463981473, "grad_norm": 0.7033889293670654, "learning_rate": 3.454545454545455e-06, "loss": 0.1214, "step": 210 }, { "epoch": 0.01819535191464726, "grad_norm": 0.39757004380226135, "learning_rate": 3.619834710743802e-06, "loss": 0.1116, "step": 220 }, { "epoch": 0.019022413365313042, "grad_norm": 1.0405199527740479, "learning_rate": 3.785123966942149e-06, "loss": 0.1011, "step": 230 }, { "epoch": 0.01984947481597883, "grad_norm": 1.6865506172180176, "learning_rate": 3.950413223140496e-06, "loss": 0.1023, "step": 240 }, { "epoch": 0.02067653626664461, "grad_norm": 0.746986985206604, "learning_rate": 4.115702479338843e-06, "loss": 0.092, "step": 250 }, { "epoch": 0.021503597717310398, "grad_norm": 0.482876718044281, "learning_rate": 4.28099173553719e-06, "loss": 0.0803, "step": 260 }, { "epoch": 0.02233065916797618, "grad_norm": 0.4853907525539398, "learning_rate": 4.4462809917355374e-06, "loss": 0.0782, "step": 270 }, { "epoch": 0.023157720618641964, "grad_norm": 0.2537175714969635, "learning_rate": 4.6115702479338845e-06, "loss": 0.0776, "step": 280 }, { "epoch": 0.02398478206930775, "grad_norm": 0.7867515683174133, "learning_rate": 4.776859504132232e-06, "loss": 0.073, "step": 290 }, { "epoch": 0.024811843519973533, "grad_norm": 0.43127694725990295, "learning_rate": 4.942148760330579e-06, "loss": 0.0734, "step": 300 }, { "epoch": 0.02563890497063932, "grad_norm": 0.20157092809677124, "learning_rate": 5.107438016528926e-06, "loss": 0.0737, "step": 310 }, { "epoch": 0.026465966421305102, "grad_norm": 0.5229440927505493, "learning_rate": 5.272727272727273e-06, "loss": 0.0692, "step": 320 }, { "epoch": 0.02729302787197089, "grad_norm": 0.7331608533859253, "learning_rate": 5.438016528925621e-06, "loss": 0.0688, "step": 330 }, { "epoch": 0.028120089322636672, "grad_norm": 0.2658148407936096, "learning_rate": 5.603305785123967e-06, "loss": 0.0603, "step": 340 }, { "epoch": 0.028947150773302455, "grad_norm": 0.3650042414665222, "learning_rate": 5.768595041322315e-06, "loss": 0.0593, "step": 350 }, { "epoch": 0.02977421222396824, "grad_norm": 0.2189350426197052, "learning_rate": 5.933884297520661e-06, "loss": 0.0561, "step": 360 }, { "epoch": 0.030601273674634024, "grad_norm": 0.2192901372909546, "learning_rate": 6.099173553719009e-06, "loss": 0.0624, "step": 370 }, { "epoch": 0.03142833512529981, "grad_norm": 0.2812904715538025, "learning_rate": 6.264462809917355e-06, "loss": 0.0556, "step": 380 }, { "epoch": 0.03225539657596559, "grad_norm": 0.6402881145477295, "learning_rate": 6.429752066115703e-06, "loss": 0.0572, "step": 390 }, { "epoch": 0.03308245802663138, "grad_norm": 0.17161770164966583, "learning_rate": 6.5950413223140495e-06, "loss": 0.0537, "step": 400 }, { "epoch": 0.033909519477297166, "grad_norm": 0.3633708357810974, "learning_rate": 6.760330578512397e-06, "loss": 0.0504, "step": 410 }, { "epoch": 0.034736580927962946, "grad_norm": 0.3227091133594513, "learning_rate": 6.925619834710744e-06, "loss": 0.0527, "step": 420 }, { "epoch": 0.03556364237862873, "grad_norm": 0.1883084774017334, "learning_rate": 7.0909090909090916e-06, "loss": 0.0496, "step": 430 }, { "epoch": 0.03639070382929452, "grad_norm": 0.404940664768219, "learning_rate": 7.256198347107438e-06, "loss": 0.0515, "step": 440 }, { "epoch": 0.0372177652799603, "grad_norm": 0.2766735553741455, "learning_rate": 7.421487603305786e-06, "loss": 0.0482, "step": 450 }, { "epoch": 0.038044826730626084, "grad_norm": 0.14233002066612244, "learning_rate": 7.586776859504133e-06, "loss": 0.0495, "step": 460 }, { "epoch": 0.03887188818129187, "grad_norm": 0.17358863353729248, "learning_rate": 7.75206611570248e-06, "loss": 0.0464, "step": 470 }, { "epoch": 0.03969894963195766, "grad_norm": 0.24469003081321716, "learning_rate": 7.917355371900827e-06, "loss": 0.0479, "step": 480 }, { "epoch": 0.04052601108262344, "grad_norm": 0.20702078938484192, "learning_rate": 8.082644628099174e-06, "loss": 0.042, "step": 490 }, { "epoch": 0.04135307253328922, "grad_norm": 0.38820740580558777, "learning_rate": 8.247933884297521e-06, "loss": 0.0486, "step": 500 }, { "epoch": 0.04218013398395501, "grad_norm": 0.17128099501132965, "learning_rate": 8.413223140495868e-06, "loss": 0.0432, "step": 510 }, { "epoch": 0.043007195434620796, "grad_norm": 0.15014755725860596, "learning_rate": 8.578512396694215e-06, "loss": 0.0471, "step": 520 }, { "epoch": 0.043834256885286575, "grad_norm": 0.31599992513656616, "learning_rate": 8.743801652892562e-06, "loss": 0.0431, "step": 530 }, { "epoch": 0.04466131833595236, "grad_norm": 0.2722884714603424, "learning_rate": 8.90909090909091e-06, "loss": 0.0422, "step": 540 }, { "epoch": 0.04548837978661815, "grad_norm": 0.2727777361869812, "learning_rate": 9.074380165289256e-06, "loss": 0.0411, "step": 550 }, { "epoch": 0.04631544123728393, "grad_norm": 0.11177966743707657, "learning_rate": 9.239669421487604e-06, "loss": 0.0422, "step": 560 }, { "epoch": 0.047142502687949714, "grad_norm": 0.603720486164093, "learning_rate": 9.40495867768595e-06, "loss": 0.0434, "step": 570 }, { "epoch": 0.0479695641386155, "grad_norm": 0.13153497874736786, "learning_rate": 9.570247933884298e-06, "loss": 0.0392, "step": 580 }, { "epoch": 0.04879662558928129, "grad_norm": 0.11294803768396378, "learning_rate": 9.735537190082645e-06, "loss": 0.0421, "step": 590 }, { "epoch": 0.049623687039947066, "grad_norm": 0.08017970621585846, "learning_rate": 9.900826446280992e-06, "loss": 0.0395, "step": 600 }, { "epoch": 0.05045074849061285, "grad_norm": 0.10552022606134415, "learning_rate": 9.999997007583302e-06, "loss": 0.0409, "step": 610 }, { "epoch": 0.05127780994127864, "grad_norm": 0.09761521220207214, "learning_rate": 9.999963342936584e-06, "loss": 0.041, "step": 620 }, { "epoch": 0.05210487139194442, "grad_norm": 0.19305112957954407, "learning_rate": 9.999892273374958e-06, "loss": 0.0387, "step": 630 }, { "epoch": 0.052931932842610205, "grad_norm": 0.09766830503940582, "learning_rate": 9.999783799430103e-06, "loss": 0.0407, "step": 640 }, { "epoch": 0.05375899429327599, "grad_norm": 0.14489105343818665, "learning_rate": 9.999637921913512e-06, "loss": 0.0389, "step": 650 }, { "epoch": 0.05458605574394178, "grad_norm": 0.32930856943130493, "learning_rate": 9.999454641916505e-06, "loss": 0.038, "step": 660 }, { "epoch": 0.05541311719460756, "grad_norm": 0.14028801023960114, "learning_rate": 9.9992339608102e-06, "loss": 0.0389, "step": 670 }, { "epoch": 0.056240178645273343, "grad_norm": 0.18841058015823364, "learning_rate": 9.998975880245528e-06, "loss": 0.0377, "step": 680 }, { "epoch": 0.05706724009593913, "grad_norm": 0.22034569084644318, "learning_rate": 9.998680402153193e-06, "loss": 0.0375, "step": 690 }, { "epoch": 0.05789430154660491, "grad_norm": 0.09396768361330032, "learning_rate": 9.998347528743684e-06, "loss": 0.0373, "step": 700 }, { "epoch": 0.058721362997270696, "grad_norm": 0.21181143820285797, "learning_rate": 9.997977262507234e-06, "loss": 0.0368, "step": 710 }, { "epoch": 0.05954842444793648, "grad_norm": 0.09176570922136307, "learning_rate": 9.997569606213822e-06, "loss": 0.0402, "step": 720 }, { "epoch": 0.06037548589860227, "grad_norm": 0.1015128493309021, "learning_rate": 9.997124562913138e-06, "loss": 0.037, "step": 730 }, { "epoch": 0.06120254734926805, "grad_norm": 0.10508076846599579, "learning_rate": 9.996642135934571e-06, "loss": 0.0359, "step": 740 }, { "epoch": 0.062029608799933834, "grad_norm": 0.1171686202287674, "learning_rate": 9.996122328887173e-06, "loss": 0.0355, "step": 750 }, { "epoch": 0.06285667025059961, "grad_norm": 0.0857829749584198, "learning_rate": 9.99556514565964e-06, "loss": 0.0373, "step": 760 }, { "epoch": 0.0636837317012654, "grad_norm": 0.10609547048807144, "learning_rate": 9.994970590420284e-06, "loss": 0.0358, "step": 770 }, { "epoch": 0.06451079315193119, "grad_norm": 0.08259180933237076, "learning_rate": 9.994338667616989e-06, "loss": 0.0357, "step": 780 }, { "epoch": 0.06533785460259697, "grad_norm": 0.09571733325719833, "learning_rate": 9.9936693819772e-06, "loss": 0.0384, "step": 790 }, { "epoch": 0.06616491605326276, "grad_norm": 0.15978851914405823, "learning_rate": 9.992962738507862e-06, "loss": 0.0365, "step": 800 }, { "epoch": 0.06699197750392855, "grad_norm": 0.14394982159137726, "learning_rate": 9.992218742495409e-06, "loss": 0.0371, "step": 810 }, { "epoch": 0.06781903895459433, "grad_norm": 0.09818094223737717, "learning_rate": 9.991437399505697e-06, "loss": 0.0375, "step": 820 }, { "epoch": 0.0686461004052601, "grad_norm": 0.23322197794914246, "learning_rate": 9.990618715383985e-06, "loss": 0.0349, "step": 830 }, { "epoch": 0.06947316185592589, "grad_norm": 0.09629665315151215, "learning_rate": 9.98976269625488e-06, "loss": 0.0349, "step": 840 }, { "epoch": 0.07030022330659168, "grad_norm": 0.1651093065738678, "learning_rate": 9.988869348522293e-06, "loss": 0.035, "step": 850 }, { "epoch": 0.07112728475725746, "grad_norm": 0.12863469123840332, "learning_rate": 9.98793867886939e-06, "loss": 0.0364, "step": 860 }, { "epoch": 0.07195434620792325, "grad_norm": 0.3346972167491913, "learning_rate": 9.98697069425855e-06, "loss": 0.0337, "step": 870 }, { "epoch": 0.07278140765858904, "grad_norm": 0.146303191781044, "learning_rate": 9.9859654019313e-06, "loss": 0.0363, "step": 880 }, { "epoch": 0.07360846910925482, "grad_norm": 0.14840497076511383, "learning_rate": 9.984922809408272e-06, "loss": 0.0349, "step": 890 }, { "epoch": 0.0744355305599206, "grad_norm": 0.10886628180742264, "learning_rate": 9.983842924489137e-06, "loss": 0.0344, "step": 900 }, { "epoch": 0.07526259201058638, "grad_norm": 0.09753160178661346, "learning_rate": 9.982725755252557e-06, "loss": 0.0327, "step": 910 }, { "epoch": 0.07608965346125217, "grad_norm": 0.09709116816520691, "learning_rate": 9.981571310056116e-06, "loss": 0.0361, "step": 920 }, { "epoch": 0.07691671491191795, "grad_norm": 0.08936941623687744, "learning_rate": 9.980379597536263e-06, "loss": 0.039, "step": 930 }, { "epoch": 0.07774377636258374, "grad_norm": 0.07184750586748123, "learning_rate": 9.979150626608246e-06, "loss": 0.034, "step": 940 }, { "epoch": 0.07857083781324953, "grad_norm": 0.07059776037931442, "learning_rate": 9.97788440646604e-06, "loss": 0.0314, "step": 950 }, { "epoch": 0.07939789926391531, "grad_norm": 0.07418540120124817, "learning_rate": 9.976580946582289e-06, "loss": 0.0338, "step": 960 }, { "epoch": 0.08022496071458109, "grad_norm": 0.14634068310260773, "learning_rate": 9.975240256708222e-06, "loss": 0.0344, "step": 970 }, { "epoch": 0.08105202216524687, "grad_norm": 0.10202177613973618, "learning_rate": 9.973862346873594e-06, "loss": 0.0312, "step": 980 }, { "epoch": 0.08187908361591266, "grad_norm": 0.08847320824861526, "learning_rate": 9.9724472273866e-06, "loss": 0.0335, "step": 990 }, { "epoch": 0.08270614506657845, "grad_norm": 0.1381935477256775, "learning_rate": 9.9709949088338e-06, "loss": 0.0399, "step": 1000 }, { "epoch": 0.08270614506657845, "eval_loss": 0.0343376062810421, "eval_runtime": 1220.1317, "eval_samples_per_second": 4.917, "eval_steps_per_second": 0.307, "step": 1000 }, { "epoch": 0.08353320651724423, "grad_norm": 0.15219560265541077, "learning_rate": 9.969505402080044e-06, "loss": 0.0337, "step": 1010 }, { "epoch": 0.08436026796791002, "grad_norm": 0.20263217389583588, "learning_rate": 9.967978718268391e-06, "loss": 0.0315, "step": 1020 }, { "epoch": 0.0851873294185758, "grad_norm": 0.10303157567977905, "learning_rate": 9.966414868820022e-06, "loss": 0.0354, "step": 1030 }, { "epoch": 0.08601439086924159, "grad_norm": 0.10471872240304947, "learning_rate": 9.964813865434149e-06, "loss": 0.035, "step": 1040 }, { "epoch": 0.08684145231990736, "grad_norm": 0.08253839612007141, "learning_rate": 9.963175720087941e-06, "loss": 0.0317, "step": 1050 }, { "epoch": 0.08766851377057315, "grad_norm": 0.08755608648061752, "learning_rate": 9.961500445036428e-06, "loss": 0.0314, "step": 1060 }, { "epoch": 0.08849557522123894, "grad_norm": 0.15729674696922302, "learning_rate": 9.9597880528124e-06, "loss": 0.0371, "step": 1070 }, { "epoch": 0.08932263667190472, "grad_norm": 0.14116688072681427, "learning_rate": 9.958038556226332e-06, "loss": 0.0317, "step": 1080 }, { "epoch": 0.09014969812257051, "grad_norm": 0.1923297643661499, "learning_rate": 9.956251968366276e-06, "loss": 0.035, "step": 1090 }, { "epoch": 0.0909767595732363, "grad_norm": 0.07124887406826019, "learning_rate": 9.954428302597759e-06, "loss": 0.0308, "step": 1100 }, { "epoch": 0.09180382102390208, "grad_norm": 0.07786933332681656, "learning_rate": 9.952567572563696e-06, "loss": 0.0304, "step": 1110 }, { "epoch": 0.09263088247456785, "grad_norm": 0.15352018177509308, "learning_rate": 9.950669792184279e-06, "loss": 0.0332, "step": 1120 }, { "epoch": 0.09345794392523364, "grad_norm": 0.0767594501376152, "learning_rate": 9.948734975656874e-06, "loss": 0.032, "step": 1130 }, { "epoch": 0.09428500537589943, "grad_norm": 0.06958391517400742, "learning_rate": 9.946763137455915e-06, "loss": 0.032, "step": 1140 }, { "epoch": 0.09511206682656521, "grad_norm": 0.09188707917928696, "learning_rate": 9.944754292332802e-06, "loss": 0.0318, "step": 1150 }, { "epoch": 0.095939128277231, "grad_norm": 0.07205051183700562, "learning_rate": 9.942708455315779e-06, "loss": 0.03, "step": 1160 }, { "epoch": 0.09676618972789679, "grad_norm": 0.07053325325250626, "learning_rate": 9.94062564170983e-06, "loss": 0.0314, "step": 1170 }, { "epoch": 0.09759325117856257, "grad_norm": 0.09830465912818909, "learning_rate": 9.938505867096563e-06, "loss": 0.031, "step": 1180 }, { "epoch": 0.09842031262922835, "grad_norm": 0.08823514729738235, "learning_rate": 9.93634914733409e-06, "loss": 0.0308, "step": 1190 }, { "epoch": 0.09924737407989413, "grad_norm": 0.16881339251995087, "learning_rate": 9.934155498556919e-06, "loss": 0.0319, "step": 1200 }, { "epoch": 0.10007443553055992, "grad_norm": 0.07580441236495972, "learning_rate": 9.931924937175813e-06, "loss": 0.0304, "step": 1210 }, { "epoch": 0.1009014969812257, "grad_norm": 0.12182483077049255, "learning_rate": 9.929657479877688e-06, "loss": 0.03, "step": 1220 }, { "epoch": 0.10172855843189149, "grad_norm": 0.1227443590760231, "learning_rate": 9.92735314362548e-06, "loss": 0.0297, "step": 1230 }, { "epoch": 0.10255561988255728, "grad_norm": 0.1861189603805542, "learning_rate": 9.925011945658012e-06, "loss": 0.0298, "step": 1240 }, { "epoch": 0.10338268133322306, "grad_norm": 0.07195472717285156, "learning_rate": 9.922633903489878e-06, "loss": 0.0348, "step": 1250 }, { "epoch": 0.10420974278388884, "grad_norm": 0.08541199564933777, "learning_rate": 9.9202190349113e-06, "loss": 0.0329, "step": 1260 }, { "epoch": 0.10503680423455462, "grad_norm": 0.07149334251880646, "learning_rate": 9.917767357988e-06, "loss": 0.0295, "step": 1270 }, { "epoch": 0.10586386568522041, "grad_norm": 0.07702941447496414, "learning_rate": 9.915278891061069e-06, "loss": 0.0317, "step": 1280 }, { "epoch": 0.1066909271358862, "grad_norm": 0.09982211887836456, "learning_rate": 9.912753652746819e-06, "loss": 0.0296, "step": 1290 }, { "epoch": 0.10751798858655198, "grad_norm": 0.15653453767299652, "learning_rate": 9.910191661936654e-06, "loss": 0.0312, "step": 1300 }, { "epoch": 0.10834505003721777, "grad_norm": 0.09917636215686798, "learning_rate": 9.907592937796927e-06, "loss": 0.0304, "step": 1310 }, { "epoch": 0.10917211148788356, "grad_norm": 0.07035510987043381, "learning_rate": 9.904957499768787e-06, "loss": 0.0314, "step": 1320 }, { "epoch": 0.10999917293854933, "grad_norm": 0.07983675599098206, "learning_rate": 9.902285367568049e-06, "loss": 0.0301, "step": 1330 }, { "epoch": 0.11082623438921511, "grad_norm": 0.06464583426713943, "learning_rate": 9.899576561185034e-06, "loss": 0.0305, "step": 1340 }, { "epoch": 0.1116532958398809, "grad_norm": 0.1578930765390396, "learning_rate": 9.896831100884424e-06, "loss": 0.0303, "step": 1350 }, { "epoch": 0.11248035729054669, "grad_norm": 0.06734715402126312, "learning_rate": 9.894049007205112e-06, "loss": 0.0281, "step": 1360 }, { "epoch": 0.11330741874121247, "grad_norm": 0.09131414443254471, "learning_rate": 9.891230300960049e-06, "loss": 0.0302, "step": 1370 }, { "epoch": 0.11413448019187826, "grad_norm": 0.06612879037857056, "learning_rate": 9.888375003236078e-06, "loss": 0.032, "step": 1380 }, { "epoch": 0.11496154164254405, "grad_norm": 0.07150176167488098, "learning_rate": 9.885483135393792e-06, "loss": 0.031, "step": 1390 }, { "epoch": 0.11578860309320982, "grad_norm": 0.06837069243192673, "learning_rate": 9.882554719067363e-06, "loss": 0.0292, "step": 1400 }, { "epoch": 0.1166156645438756, "grad_norm": 0.09854337573051453, "learning_rate": 9.879589776164387e-06, "loss": 0.0302, "step": 1410 }, { "epoch": 0.11744272599454139, "grad_norm": 0.06270977109670639, "learning_rate": 9.87658832886571e-06, "loss": 0.0285, "step": 1420 }, { "epoch": 0.11826978744520718, "grad_norm": 0.09647868573665619, "learning_rate": 9.873550399625275e-06, "loss": 0.0283, "step": 1430 }, { "epoch": 0.11909684889587296, "grad_norm": 0.06852090358734131, "learning_rate": 9.870476011169948e-06, "loss": 0.0299, "step": 1440 }, { "epoch": 0.11992391034653875, "grad_norm": 0.06666602194309235, "learning_rate": 9.867365186499337e-06, "loss": 0.0338, "step": 1450 }, { "epoch": 0.12075097179720454, "grad_norm": 0.07159853726625443, "learning_rate": 9.864217948885648e-06, "loss": 0.0281, "step": 1460 }, { "epoch": 0.12157803324787032, "grad_norm": 0.1366601437330246, "learning_rate": 9.861034321873481e-06, "loss": 0.0309, "step": 1470 }, { "epoch": 0.1224050946985361, "grad_norm": 0.08372735232114792, "learning_rate": 9.85781432927967e-06, "loss": 0.0308, "step": 1480 }, { "epoch": 0.12323215614920188, "grad_norm": 0.10882294178009033, "learning_rate": 9.854557995193102e-06, "loss": 0.0289, "step": 1490 }, { "epoch": 0.12405921759986767, "grad_norm": 0.07682844996452332, "learning_rate": 9.851265343974534e-06, "loss": 0.031, "step": 1500 }, { "epoch": 0.12488627905053346, "grad_norm": 0.12793898582458496, "learning_rate": 9.847936400256415e-06, "loss": 0.0291, "step": 1510 }, { "epoch": 0.12571334050119923, "grad_norm": 0.07250412553548813, "learning_rate": 9.844571188942701e-06, "loss": 0.029, "step": 1520 }, { "epoch": 0.12654040195186503, "grad_norm": 0.06386396288871765, "learning_rate": 9.841169735208662e-06, "loss": 0.0307, "step": 1530 }, { "epoch": 0.1273674634025308, "grad_norm": 0.05723176896572113, "learning_rate": 9.837732064500705e-06, "loss": 0.0286, "step": 1540 }, { "epoch": 0.1281945248531966, "grad_norm": 0.07307655364274979, "learning_rate": 9.834258202536173e-06, "loss": 0.0304, "step": 1550 }, { "epoch": 0.12902158630386237, "grad_norm": 0.06836479902267456, "learning_rate": 9.830748175303157e-06, "loss": 0.0286, "step": 1560 }, { "epoch": 0.12984864775452817, "grad_norm": 0.0659850612282753, "learning_rate": 9.827202009060307e-06, "loss": 0.0271, "step": 1570 }, { "epoch": 0.13067570920519395, "grad_norm": 0.06328194588422775, "learning_rate": 9.823619730336624e-06, "loss": 0.028, "step": 1580 }, { "epoch": 0.13150277065585972, "grad_norm": 0.05650272220373154, "learning_rate": 9.820001365931273e-06, "loss": 0.0279, "step": 1590 }, { "epoch": 0.13232983210652552, "grad_norm": 0.10159070044755936, "learning_rate": 9.816346942913376e-06, "loss": 0.029, "step": 1600 }, { "epoch": 0.1331568935571913, "grad_norm": 0.4464583098888397, "learning_rate": 9.812656488621804e-06, "loss": 0.0298, "step": 1610 }, { "epoch": 0.1339839550078571, "grad_norm": 0.06621966511011124, "learning_rate": 9.808930030664989e-06, "loss": 0.0303, "step": 1620 }, { "epoch": 0.13481101645852286, "grad_norm": 0.07061782479286194, "learning_rate": 9.805167596920707e-06, "loss": 0.0283, "step": 1630 }, { "epoch": 0.13563807790918866, "grad_norm": 0.06339192390441895, "learning_rate": 9.80136921553586e-06, "loss": 0.0274, "step": 1640 }, { "epoch": 0.13646513935985444, "grad_norm": 0.09378033131361008, "learning_rate": 9.797534914926289e-06, "loss": 0.028, "step": 1650 }, { "epoch": 0.1372922008105202, "grad_norm": 0.11731720715761185, "learning_rate": 9.793664723776539e-06, "loss": 0.0289, "step": 1660 }, { "epoch": 0.138119262261186, "grad_norm": 0.07038633525371552, "learning_rate": 9.789758671039658e-06, "loss": 0.0279, "step": 1670 }, { "epoch": 0.13894632371185178, "grad_norm": 0.08343333005905151, "learning_rate": 9.785816785936973e-06, "loss": 0.0278, "step": 1680 }, { "epoch": 0.13977338516251758, "grad_norm": 0.08339129388332367, "learning_rate": 9.781839097957875e-06, "loss": 0.0302, "step": 1690 }, { "epoch": 0.14060044661318336, "grad_norm": 0.15373483300209045, "learning_rate": 9.777825636859599e-06, "loss": 0.0293, "step": 1700 }, { "epoch": 0.14142750806384916, "grad_norm": 0.07383430004119873, "learning_rate": 9.773776432667e-06, "loss": 0.0295, "step": 1710 }, { "epoch": 0.14225456951451493, "grad_norm": 0.07228893786668777, "learning_rate": 9.769691515672328e-06, "loss": 0.0276, "step": 1720 }, { "epoch": 0.1430816309651807, "grad_norm": 0.09278323501348495, "learning_rate": 9.765570916434998e-06, "loss": 0.0289, "step": 1730 }, { "epoch": 0.1439086924158465, "grad_norm": 0.09062926471233368, "learning_rate": 9.761414665781374e-06, "loss": 0.028, "step": 1740 }, { "epoch": 0.14473575386651227, "grad_norm": 0.06459420919418335, "learning_rate": 9.757222794804522e-06, "loss": 0.0279, "step": 1750 }, { "epoch": 0.14556281531717807, "grad_norm": 0.06160286068916321, "learning_rate": 9.752995334863985e-06, "loss": 0.028, "step": 1760 }, { "epoch": 0.14638987676784385, "grad_norm": 0.07651007920503616, "learning_rate": 9.748732317585557e-06, "loss": 0.0295, "step": 1770 }, { "epoch": 0.14721693821850965, "grad_norm": 0.08549734950065613, "learning_rate": 9.744433774861024e-06, "loss": 0.028, "step": 1780 }, { "epoch": 0.14804399966917542, "grad_norm": 0.06986084580421448, "learning_rate": 9.74009973884795e-06, "loss": 0.029, "step": 1790 }, { "epoch": 0.1488710611198412, "grad_norm": 0.0717945545911789, "learning_rate": 9.735730241969425e-06, "loss": 0.0287, "step": 1800 }, { "epoch": 0.149698122570507, "grad_norm": 0.055828921496868134, "learning_rate": 9.731325316913816e-06, "loss": 0.0279, "step": 1810 }, { "epoch": 0.15052518402117276, "grad_norm": 0.09485841542482376, "learning_rate": 9.726884996634535e-06, "loss": 0.0288, "step": 1820 }, { "epoch": 0.15135224547183856, "grad_norm": 0.09525461494922638, "learning_rate": 9.72240931434979e-06, "loss": 0.0266, "step": 1830 }, { "epoch": 0.15217930692250434, "grad_norm": 0.058921415358781815, "learning_rate": 9.717898303542324e-06, "loss": 0.0278, "step": 1840 }, { "epoch": 0.15300636837317014, "grad_norm": 0.08154033869504929, "learning_rate": 9.713351997959184e-06, "loss": 0.0348, "step": 1850 }, { "epoch": 0.1538334298238359, "grad_norm": 0.059776682406663895, "learning_rate": 9.70877043161145e-06, "loss": 0.0275, "step": 1860 }, { "epoch": 0.15466049127450168, "grad_norm": 0.089345782995224, "learning_rate": 9.704153638773996e-06, "loss": 0.0253, "step": 1870 }, { "epoch": 0.15548755272516748, "grad_norm": 0.0736837238073349, "learning_rate": 9.699501653985223e-06, "loss": 0.0263, "step": 1880 }, { "epoch": 0.15631461417583326, "grad_norm": 0.09357444941997528, "learning_rate": 9.694814512046805e-06, "loss": 0.0278, "step": 1890 }, { "epoch": 0.15714167562649906, "grad_norm": 0.06266128271818161, "learning_rate": 9.690092248023428e-06, "loss": 0.0277, "step": 1900 }, { "epoch": 0.15796873707716483, "grad_norm": 0.081548310816288, "learning_rate": 9.68533489724253e-06, "loss": 0.0307, "step": 1910 }, { "epoch": 0.15879579852783063, "grad_norm": 0.05208978429436684, "learning_rate": 9.680542495294027e-06, "loss": 0.0277, "step": 1920 }, { "epoch": 0.1596228599784964, "grad_norm": 0.055970244109630585, "learning_rate": 9.675715078030063e-06, "loss": 0.027, "step": 1930 }, { "epoch": 0.16044992142916217, "grad_norm": 0.05580395460128784, "learning_rate": 9.67085268156473e-06, "loss": 0.0277, "step": 1940 }, { "epoch": 0.16127698287982797, "grad_norm": 0.058690398931503296, "learning_rate": 9.665955342273799e-06, "loss": 0.0274, "step": 1950 }, { "epoch": 0.16210404433049375, "grad_norm": 0.06293683499097824, "learning_rate": 9.661023096794449e-06, "loss": 0.0267, "step": 1960 }, { "epoch": 0.16293110578115955, "grad_norm": 0.08121343702077866, "learning_rate": 9.656055982024995e-06, "loss": 0.0279, "step": 1970 }, { "epoch": 0.16375816723182532, "grad_norm": 0.07257858663797379, "learning_rate": 9.651054035124614e-06, "loss": 0.0264, "step": 1980 }, { "epoch": 0.16458522868249112, "grad_norm": 0.06639426201581955, "learning_rate": 9.646017293513056e-06, "loss": 0.0265, "step": 1990 }, { "epoch": 0.1654122901331569, "grad_norm": 0.06022842600941658, "learning_rate": 9.640945794870377e-06, "loss": 0.0261, "step": 2000 }, { "epoch": 0.1654122901331569, "eval_loss": 0.028430579230189323, "eval_runtime": 1220.0038, "eval_samples_per_second": 4.917, "eval_steps_per_second": 0.307, "step": 2000 }, { "epoch": 0.16623935158382266, "grad_norm": 0.05734001100063324, "learning_rate": 9.63583957713665e-06, "loss": 0.0277, "step": 2010 }, { "epoch": 0.16706641303448846, "grad_norm": 0.08106731623411179, "learning_rate": 9.630698678511684e-06, "loss": 0.0266, "step": 2020 }, { "epoch": 0.16789347448515424, "grad_norm": 0.056222159415483475, "learning_rate": 9.625523137454736e-06, "loss": 0.0261, "step": 2030 }, { "epoch": 0.16872053593582004, "grad_norm": 0.06166260689496994, "learning_rate": 9.620312992684223e-06, "loss": 0.0265, "step": 2040 }, { "epoch": 0.1695475973864858, "grad_norm": 0.08784622699022293, "learning_rate": 9.615068283177434e-06, "loss": 0.0281, "step": 2050 }, { "epoch": 0.1703746588371516, "grad_norm": 0.0706792026758194, "learning_rate": 9.609789048170243e-06, "loss": 0.029, "step": 2060 }, { "epoch": 0.17120172028781738, "grad_norm": 0.05976736173033714, "learning_rate": 9.604475327156804e-06, "loss": 0.0254, "step": 2070 }, { "epoch": 0.17202878173848318, "grad_norm": 0.05874831974506378, "learning_rate": 9.599127159889266e-06, "loss": 0.0279, "step": 2080 }, { "epoch": 0.17285584318914896, "grad_norm": 0.06354232132434845, "learning_rate": 9.593744586377472e-06, "loss": 0.0266, "step": 2090 }, { "epoch": 0.17368290463981473, "grad_norm": 0.06033729389309883, "learning_rate": 9.588327646888655e-06, "loss": 0.0266, "step": 2100 }, { "epoch": 0.17450996609048053, "grad_norm": 0.18101929128170013, "learning_rate": 9.582876381947145e-06, "loss": 0.0266, "step": 2110 }, { "epoch": 0.1753370275411463, "grad_norm": 0.26323285698890686, "learning_rate": 9.577390832334064e-06, "loss": 0.0265, "step": 2120 }, { "epoch": 0.1761640889918121, "grad_norm": 0.05492362007498741, "learning_rate": 9.571871039087013e-06, "loss": 0.0266, "step": 2130 }, { "epoch": 0.17699115044247787, "grad_norm": 0.05727216601371765, "learning_rate": 9.566317043499773e-06, "loss": 0.0263, "step": 2140 }, { "epoch": 0.17781821189314367, "grad_norm": 0.14531953632831573, "learning_rate": 9.560728887122e-06, "loss": 0.0286, "step": 2150 }, { "epoch": 0.17864527334380945, "grad_norm": 0.06639876216650009, "learning_rate": 9.5551066117589e-06, "loss": 0.0262, "step": 2160 }, { "epoch": 0.17947233479447522, "grad_norm": 0.06139986589550972, "learning_rate": 9.549450259470927e-06, "loss": 0.0272, "step": 2170 }, { "epoch": 0.18029939624514102, "grad_norm": 0.07039148360490799, "learning_rate": 9.543759872573469e-06, "loss": 0.0282, "step": 2180 }, { "epoch": 0.1811264576958068, "grad_norm": 0.08487813919782639, "learning_rate": 9.538035493636524e-06, "loss": 0.0284, "step": 2190 }, { "epoch": 0.1819535191464726, "grad_norm": 0.07776181399822235, "learning_rate": 9.532277165484387e-06, "loss": 0.0279, "step": 2200 }, { "epoch": 0.18278058059713836, "grad_norm": 0.061026498675346375, "learning_rate": 9.52648493119533e-06, "loss": 0.0256, "step": 2210 }, { "epoch": 0.18360764204780416, "grad_norm": 0.061437539756298065, "learning_rate": 9.520658834101275e-06, "loss": 0.027, "step": 2220 }, { "epoch": 0.18443470349846994, "grad_norm": 0.06019297242164612, "learning_rate": 9.514798917787477e-06, "loss": 0.0305, "step": 2230 }, { "epoch": 0.1852617649491357, "grad_norm": 0.08646666258573532, "learning_rate": 9.50890522609219e-06, "loss": 0.0263, "step": 2240 }, { "epoch": 0.1860888263998015, "grad_norm": 0.1908756047487259, "learning_rate": 9.502977803106346e-06, "loss": 0.0259, "step": 2250 }, { "epoch": 0.18691588785046728, "grad_norm": 0.24025577306747437, "learning_rate": 9.497016693173218e-06, "loss": 0.0294, "step": 2260 }, { "epoch": 0.18774294930113308, "grad_norm": 0.07112468034029007, "learning_rate": 9.491021940888096e-06, "loss": 0.0266, "step": 2270 }, { "epoch": 0.18857001075179886, "grad_norm": 0.08155805617570877, "learning_rate": 9.484993591097952e-06, "loss": 0.0258, "step": 2280 }, { "epoch": 0.18939707220246466, "grad_norm": 0.05596913397312164, "learning_rate": 9.478931688901095e-06, "loss": 0.0264, "step": 2290 }, { "epoch": 0.19022413365313043, "grad_norm": 0.059164054691791534, "learning_rate": 9.472836279646844e-06, "loss": 0.0272, "step": 2300 }, { "epoch": 0.1910511951037962, "grad_norm": 0.06571198254823685, "learning_rate": 9.466707408935189e-06, "loss": 0.0272, "step": 2310 }, { "epoch": 0.191878256554462, "grad_norm": 0.07002273201942444, "learning_rate": 9.460545122616442e-06, "loss": 0.0275, "step": 2320 }, { "epoch": 0.19270531800512777, "grad_norm": 0.06005439907312393, "learning_rate": 9.4543494667909e-06, "loss": 0.028, "step": 2330 }, { "epoch": 0.19353237945579357, "grad_norm": 0.11123955994844437, "learning_rate": 9.4481204878085e-06, "loss": 0.0268, "step": 2340 }, { "epoch": 0.19435944090645935, "grad_norm": 0.0618051253259182, "learning_rate": 9.441858232268467e-06, "loss": 0.0259, "step": 2350 }, { "epoch": 0.19518650235712515, "grad_norm": 0.06244316324591637, "learning_rate": 9.435562747018976e-06, "loss": 0.0262, "step": 2360 }, { "epoch": 0.19601356380779092, "grad_norm": 0.08520273864269257, "learning_rate": 9.429234079156787e-06, "loss": 0.0267, "step": 2370 }, { "epoch": 0.1968406252584567, "grad_norm": 0.06388260424137115, "learning_rate": 9.422872276026902e-06, "loss": 0.0263, "step": 2380 }, { "epoch": 0.1976676867091225, "grad_norm": 0.06510653346776962, "learning_rate": 9.416477385222213e-06, "loss": 0.0281, "step": 2390 }, { "epoch": 0.19849474815978826, "grad_norm": 0.12499203532934189, "learning_rate": 9.41004945458314e-06, "loss": 0.0268, "step": 2400 }, { "epoch": 0.19932180961045406, "grad_norm": 0.06236669421195984, "learning_rate": 9.403588532197277e-06, "loss": 0.0262, "step": 2410 }, { "epoch": 0.20014887106111984, "grad_norm": 0.06980706751346588, "learning_rate": 9.397094666399025e-06, "loss": 0.0264, "step": 2420 }, { "epoch": 0.20097593251178564, "grad_norm": 0.05483941361308098, "learning_rate": 9.390567905769242e-06, "loss": 0.025, "step": 2430 }, { "epoch": 0.2018029939624514, "grad_norm": 0.08971832692623138, "learning_rate": 9.384008299134871e-06, "loss": 0.0243, "step": 2440 }, { "epoch": 0.20263005541311718, "grad_norm": 0.07181546092033386, "learning_rate": 9.377415895568578e-06, "loss": 0.0257, "step": 2450 }, { "epoch": 0.20345711686378298, "grad_norm": 0.06366802752017975, "learning_rate": 9.370790744388381e-06, "loss": 0.026, "step": 2460 }, { "epoch": 0.20428417831444876, "grad_norm": 0.065264031291008, "learning_rate": 9.36413289515729e-06, "loss": 0.0274, "step": 2470 }, { "epoch": 0.20511123976511456, "grad_norm": 0.09247761219739914, "learning_rate": 9.357442397682924e-06, "loss": 0.0251, "step": 2480 }, { "epoch": 0.20593830121578033, "grad_norm": 0.04937649890780449, "learning_rate": 9.350719302017148e-06, "loss": 0.0277, "step": 2490 }, { "epoch": 0.20676536266644613, "grad_norm": 0.06501265615224838, "learning_rate": 9.343963658455698e-06, "loss": 0.0266, "step": 2500 }, { "epoch": 0.2075924241171119, "grad_norm": 0.07810965925455093, "learning_rate": 9.337175517537796e-06, "loss": 0.0302, "step": 2510 }, { "epoch": 0.20841948556777767, "grad_norm": 0.056350335478782654, "learning_rate": 9.330354930045782e-06, "loss": 0.0275, "step": 2520 }, { "epoch": 0.20924654701844347, "grad_norm": 0.070098377764225, "learning_rate": 9.323501947004727e-06, "loss": 0.0268, "step": 2530 }, { "epoch": 0.21007360846910925, "grad_norm": 0.07072274386882782, "learning_rate": 9.316616619682059e-06, "loss": 0.0256, "step": 2540 }, { "epoch": 0.21090066991977505, "grad_norm": 0.05314943194389343, "learning_rate": 9.309698999587174e-06, "loss": 0.0256, "step": 2550 }, { "epoch": 0.21172773137044082, "grad_norm": 0.05929897353053093, "learning_rate": 9.302749138471046e-06, "loss": 0.0274, "step": 2560 }, { "epoch": 0.21255479282110662, "grad_norm": 0.07132343202829361, "learning_rate": 9.295767088325848e-06, "loss": 0.0256, "step": 2570 }, { "epoch": 0.2133818542717724, "grad_norm": 0.05408504605293274, "learning_rate": 9.288752901384563e-06, "loss": 0.0323, "step": 2580 }, { "epoch": 0.21420891572243816, "grad_norm": 0.0529806949198246, "learning_rate": 9.281706630120592e-06, "loss": 0.0252, "step": 2590 }, { "epoch": 0.21503597717310396, "grad_norm": 0.09713909775018692, "learning_rate": 9.274628327247353e-06, "loss": 0.0249, "step": 2600 }, { "epoch": 0.21586303862376974, "grad_norm": 0.07577594369649887, "learning_rate": 9.267518045717897e-06, "loss": 0.0283, "step": 2610 }, { "epoch": 0.21669010007443554, "grad_norm": 0.05945679545402527, "learning_rate": 9.260375838724511e-06, "loss": 0.0263, "step": 2620 }, { "epoch": 0.2175171615251013, "grad_norm": 0.06303580105304718, "learning_rate": 9.253201759698317e-06, "loss": 0.0297, "step": 2630 }, { "epoch": 0.2183442229757671, "grad_norm": 0.06167830526828766, "learning_rate": 9.245995862308867e-06, "loss": 0.0275, "step": 2640 }, { "epoch": 0.21917128442643288, "grad_norm": 0.05566466599702835, "learning_rate": 9.238758200463756e-06, "loss": 0.0279, "step": 2650 }, { "epoch": 0.21999834587709866, "grad_norm": 0.06275132298469543, "learning_rate": 9.231488828308205e-06, "loss": 0.0248, "step": 2660 }, { "epoch": 0.22082540732776446, "grad_norm": 0.07304584234952927, "learning_rate": 9.224187800224661e-06, "loss": 0.0273, "step": 2670 }, { "epoch": 0.22165246877843023, "grad_norm": 0.05730755627155304, "learning_rate": 9.216855170832393e-06, "loss": 0.0271, "step": 2680 }, { "epoch": 0.22247953022909603, "grad_norm": 0.05435599759221077, "learning_rate": 9.209490994987079e-06, "loss": 0.0248, "step": 2690 }, { "epoch": 0.2233065916797618, "grad_norm": 0.05061393231153488, "learning_rate": 9.202095327780394e-06, "loss": 0.0258, "step": 2700 }, { "epoch": 0.2241336531304276, "grad_norm": 0.05590864270925522, "learning_rate": 9.194668224539608e-06, "loss": 0.0256, "step": 2710 }, { "epoch": 0.22496071458109337, "grad_norm": 0.04720637574791908, "learning_rate": 9.187209740827159e-06, "loss": 0.0243, "step": 2720 }, { "epoch": 0.22578777603175915, "grad_norm": 0.055518608540296555, "learning_rate": 9.179719932440245e-06, "loss": 0.026, "step": 2730 }, { "epoch": 0.22661483748242495, "grad_norm": 0.060342345386743546, "learning_rate": 9.172198855410408e-06, "loss": 0.0254, "step": 2740 }, { "epoch": 0.22744189893309072, "grad_norm": 0.06279141455888748, "learning_rate": 9.164646566003109e-06, "loss": 0.0262, "step": 2750 }, { "epoch": 0.22826896038375652, "grad_norm": 0.050773248076438904, "learning_rate": 9.15706312071731e-06, "loss": 0.0271, "step": 2760 }, { "epoch": 0.2290960218344223, "grad_norm": 0.052737098187208176, "learning_rate": 9.149448576285055e-06, "loss": 0.0259, "step": 2770 }, { "epoch": 0.2299230832850881, "grad_norm": 0.055731095373630524, "learning_rate": 9.141802989671036e-06, "loss": 0.0255, "step": 2780 }, { "epoch": 0.23075014473575386, "grad_norm": 0.05351400747895241, "learning_rate": 9.134126418072175e-06, "loss": 0.0255, "step": 2790 }, { "epoch": 0.23157720618641964, "grad_norm": 0.05681459978222847, "learning_rate": 9.126418918917197e-06, "loss": 0.0268, "step": 2800 }, { "epoch": 0.23240426763708544, "grad_norm": 0.05028412118554115, "learning_rate": 9.118680549866193e-06, "loss": 0.0239, "step": 2810 }, { "epoch": 0.2332313290877512, "grad_norm": 0.05494118854403496, "learning_rate": 9.110911368810193e-06, "loss": 0.0239, "step": 2820 }, { "epoch": 0.234058390538417, "grad_norm": 0.04639596492052078, "learning_rate": 9.10311143387074e-06, "loss": 0.0246, "step": 2830 }, { "epoch": 0.23488545198908278, "grad_norm": 0.06322944909334183, "learning_rate": 9.095280803399437e-06, "loss": 0.0245, "step": 2840 }, { "epoch": 0.23571251343974858, "grad_norm": 0.08805207163095474, "learning_rate": 9.08741953597753e-06, "loss": 0.0247, "step": 2850 }, { "epoch": 0.23653957489041436, "grad_norm": 0.058313675224781036, "learning_rate": 9.079527690415455e-06, "loss": 0.0258, "step": 2860 }, { "epoch": 0.23736663634108016, "grad_norm": 0.057351235300302505, "learning_rate": 9.07160532575241e-06, "loss": 0.0254, "step": 2870 }, { "epoch": 0.23819369779174593, "grad_norm": 0.06278271973133087, "learning_rate": 9.063652501255904e-06, "loss": 0.0247, "step": 2880 }, { "epoch": 0.2390207592424117, "grad_norm": 0.05352174490690231, "learning_rate": 9.055669276421315e-06, "loss": 0.026, "step": 2890 }, { "epoch": 0.2398478206930775, "grad_norm": 0.05026556923985481, "learning_rate": 9.047655710971455e-06, "loss": 0.0266, "step": 2900 }, { "epoch": 0.24067488214374327, "grad_norm": 0.0480768196284771, "learning_rate": 9.039611864856105e-06, "loss": 0.0247, "step": 2910 }, { "epoch": 0.24150194359440907, "grad_norm": 0.07413890212774277, "learning_rate": 9.031537798251589e-06, "loss": 0.0284, "step": 2920 }, { "epoch": 0.24232900504507485, "grad_norm": 0.07313451170921326, "learning_rate": 9.023433571560297e-06, "loss": 0.0256, "step": 2930 }, { "epoch": 0.24315606649574065, "grad_norm": 0.05278393253684044, "learning_rate": 9.015299245410258e-06, "loss": 0.0249, "step": 2940 }, { "epoch": 0.24398312794640642, "grad_norm": 0.08677669614553452, "learning_rate": 9.007134880654677e-06, "loss": 0.026, "step": 2950 }, { "epoch": 0.2448101893970722, "grad_norm": 0.060126129537820816, "learning_rate": 8.998940538371472e-06, "loss": 0.0259, "step": 2960 }, { "epoch": 0.245637250847738, "grad_norm": 0.05079201981425285, "learning_rate": 8.99071627986283e-06, "loss": 0.0243, "step": 2970 }, { "epoch": 0.24646431229840376, "grad_norm": 0.053754109889268875, "learning_rate": 8.982462166654737e-06, "loss": 0.0257, "step": 2980 }, { "epoch": 0.24729137374906957, "grad_norm": 0.05371469631791115, "learning_rate": 8.974178260496529e-06, "loss": 0.0253, "step": 2990 }, { "epoch": 0.24811843519973534, "grad_norm": 0.060160018503665924, "learning_rate": 8.965864623360418e-06, "loss": 0.0283, "step": 3000 }, { "epoch": 0.24811843519973534, "eval_loss": 0.026417342945933342, "eval_runtime": 1220.3014, "eval_samples_per_second": 4.916, "eval_steps_per_second": 0.307, "step": 3000 }, { "epoch": 0.24894549665040114, "grad_norm": 0.06683066487312317, "learning_rate": 8.957521317441043e-06, "loss": 0.0245, "step": 3010 }, { "epoch": 0.2497725581010669, "grad_norm": 0.045557327568531036, "learning_rate": 8.949148405154986e-06, "loss": 0.0251, "step": 3020 }, { "epoch": 0.2505996195517327, "grad_norm": 0.05416623502969742, "learning_rate": 8.940745949140323e-06, "loss": 0.0247, "step": 3030 }, { "epoch": 0.25142668100239846, "grad_norm": 0.17342466115951538, "learning_rate": 8.932314012256147e-06, "loss": 0.0249, "step": 3040 }, { "epoch": 0.25225374245306426, "grad_norm": 0.06348035484552383, "learning_rate": 8.923852657582092e-06, "loss": 0.0258, "step": 3050 }, { "epoch": 0.25308080390373006, "grad_norm": 0.05559645593166351, "learning_rate": 8.915361948417878e-06, "loss": 0.0361, "step": 3060 }, { "epoch": 0.25390786535439586, "grad_norm": 0.050857000052928925, "learning_rate": 8.906841948282818e-06, "loss": 0.0257, "step": 3070 }, { "epoch": 0.2547349268050616, "grad_norm": 0.04826486483216286, "learning_rate": 8.898292720915354e-06, "loss": 0.0257, "step": 3080 }, { "epoch": 0.2555619882557274, "grad_norm": 0.06656019389629364, "learning_rate": 8.889714330272584e-06, "loss": 0.0261, "step": 3090 }, { "epoch": 0.2563890497063932, "grad_norm": 0.06416959315538406, "learning_rate": 8.881106840529769e-06, "loss": 0.0252, "step": 3100 }, { "epoch": 0.25721611115705895, "grad_norm": 0.04848102107644081, "learning_rate": 8.872470316079866e-06, "loss": 0.024, "step": 3110 }, { "epoch": 0.25804317260772475, "grad_norm": 0.06827887147665024, "learning_rate": 8.863804821533043e-06, "loss": 0.0236, "step": 3120 }, { "epoch": 0.25887023405839055, "grad_norm": 0.0632987692952156, "learning_rate": 8.855110421716191e-06, "loss": 0.0261, "step": 3130 }, { "epoch": 0.25969729550905635, "grad_norm": 0.05443909019231796, "learning_rate": 8.846387181672443e-06, "loss": 0.0245, "step": 3140 }, { "epoch": 0.2605243569597221, "grad_norm": 0.050953421741724014, "learning_rate": 8.837635166660689e-06, "loss": 0.0258, "step": 3150 }, { "epoch": 0.2613514184103879, "grad_norm": 0.04987896978855133, "learning_rate": 8.828854442155087e-06, "loss": 0.0259, "step": 3160 }, { "epoch": 0.2621784798610537, "grad_norm": 0.05325448885560036, "learning_rate": 8.820045073844563e-06, "loss": 0.0263, "step": 3170 }, { "epoch": 0.26300554131171944, "grad_norm": 0.06813682615756989, "learning_rate": 8.81120712763234e-06, "loss": 0.024, "step": 3180 }, { "epoch": 0.26383260276238524, "grad_norm": 0.053441476076841354, "learning_rate": 8.802340669635423e-06, "loss": 0.0255, "step": 3190 }, { "epoch": 0.26465966421305104, "grad_norm": 0.061251021921634674, "learning_rate": 8.793445766184126e-06, "loss": 0.0329, "step": 3200 }, { "epoch": 0.26548672566371684, "grad_norm": 0.06079159677028656, "learning_rate": 8.784522483821554e-06, "loss": 0.0271, "step": 3210 }, { "epoch": 0.2663137871143826, "grad_norm": 0.04815410450100899, "learning_rate": 8.77557088930312e-06, "loss": 0.0256, "step": 3220 }, { "epoch": 0.2671408485650484, "grad_norm": 0.058222122490406036, "learning_rate": 8.766591049596043e-06, "loss": 0.0239, "step": 3230 }, { "epoch": 0.2679679100157142, "grad_norm": 0.06425308436155319, "learning_rate": 8.75758303187884e-06, "loss": 0.0248, "step": 3240 }, { "epoch": 0.26879497146637993, "grad_norm": 0.05385325476527214, "learning_rate": 8.748546903540838e-06, "loss": 0.0249, "step": 3250 }, { "epoch": 0.26962203291704573, "grad_norm": 0.04803679138422012, "learning_rate": 8.739482732181648e-06, "loss": 0.0313, "step": 3260 }, { "epoch": 0.27044909436771153, "grad_norm": 0.05667194724082947, "learning_rate": 8.730390585610685e-06, "loss": 0.025, "step": 3270 }, { "epoch": 0.27127615581837733, "grad_norm": 0.04525600001215935, "learning_rate": 8.72127053184664e-06, "loss": 0.0254, "step": 3280 }, { "epoch": 0.2721032172690431, "grad_norm": 0.07599420845508575, "learning_rate": 8.712122639116975e-06, "loss": 0.0243, "step": 3290 }, { "epoch": 0.2729302787197089, "grad_norm": 0.052151359617710114, "learning_rate": 8.70294697585743e-06, "loss": 0.0234, "step": 3300 }, { "epoch": 0.2737573401703747, "grad_norm": 0.05731287971138954, "learning_rate": 8.693743610711482e-06, "loss": 0.0248, "step": 3310 }, { "epoch": 0.2745844016210404, "grad_norm": 0.04920828342437744, "learning_rate": 8.684512612529857e-06, "loss": 0.0245, "step": 3320 }, { "epoch": 0.2754114630717062, "grad_norm": 0.05730625241994858, "learning_rate": 8.67525405037e-06, "loss": 0.0264, "step": 3330 }, { "epoch": 0.276238524522372, "grad_norm": 0.04498128592967987, "learning_rate": 8.665967993495568e-06, "loss": 0.0244, "step": 3340 }, { "epoch": 0.2770655859730378, "grad_norm": 0.0674099400639534, "learning_rate": 8.656654511375902e-06, "loss": 0.0285, "step": 3350 }, { "epoch": 0.27789264742370356, "grad_norm": 0.06094598397612572, "learning_rate": 8.64731367368551e-06, "loss": 0.0258, "step": 3360 }, { "epoch": 0.27871970887436937, "grad_norm": 0.07126502692699432, "learning_rate": 8.637945550303557e-06, "loss": 0.0279, "step": 3370 }, { "epoch": 0.27954677032503517, "grad_norm": 0.08413068950176239, "learning_rate": 8.628550211313328e-06, "loss": 0.0441, "step": 3380 }, { "epoch": 0.2803738317757009, "grad_norm": 0.04862065240740776, "learning_rate": 8.619127727001708e-06, "loss": 0.0238, "step": 3390 }, { "epoch": 0.2812008932263667, "grad_norm": 0.0653972402215004, "learning_rate": 8.60967816785866e-06, "loss": 0.0245, "step": 3400 }, { "epoch": 0.2820279546770325, "grad_norm": 0.05237039551138878, "learning_rate": 8.60020160457669e-06, "loss": 0.0255, "step": 3410 }, { "epoch": 0.2828550161276983, "grad_norm": 0.06689222902059555, "learning_rate": 8.59069810805033e-06, "loss": 0.0286, "step": 3420 }, { "epoch": 0.28368207757836406, "grad_norm": 0.06750566512346268, "learning_rate": 8.581167749375596e-06, "loss": 0.0373, "step": 3430 }, { "epoch": 0.28450913902902986, "grad_norm": 0.04513133317232132, "learning_rate": 8.571610599849462e-06, "loss": 0.0266, "step": 3440 }, { "epoch": 0.28533620047969566, "grad_norm": 0.05559685453772545, "learning_rate": 8.562026730969325e-06, "loss": 0.0253, "step": 3450 }, { "epoch": 0.2861632619303614, "grad_norm": 0.04561685398221016, "learning_rate": 8.552416214432469e-06, "loss": 0.0259, "step": 3460 }, { "epoch": 0.2869903233810272, "grad_norm": 0.054727304726839066, "learning_rate": 8.542779122135532e-06, "loss": 0.0254, "step": 3470 }, { "epoch": 0.287817384831693, "grad_norm": 0.05550670251250267, "learning_rate": 8.533115526173969e-06, "loss": 0.025, "step": 3480 }, { "epoch": 0.2886444462823588, "grad_norm": 0.04571954905986786, "learning_rate": 8.523425498841505e-06, "loss": 0.0272, "step": 3490 }, { "epoch": 0.28947150773302455, "grad_norm": 0.07001665234565735, "learning_rate": 8.513709112629599e-06, "loss": 0.0245, "step": 3500 }, { "epoch": 0.29029856918369035, "grad_norm": 0.05153432488441467, "learning_rate": 8.503966440226908e-06, "loss": 0.0424, "step": 3510 }, { "epoch": 0.29112563063435615, "grad_norm": 0.05176723748445511, "learning_rate": 8.494197554518729e-06, "loss": 0.0245, "step": 3520 }, { "epoch": 0.2919526920850219, "grad_norm": 0.07877220213413239, "learning_rate": 8.484402528586469e-06, "loss": 0.0241, "step": 3530 }, { "epoch": 0.2927797535356877, "grad_norm": 0.0443316325545311, "learning_rate": 8.474581435707085e-06, "loss": 0.0245, "step": 3540 }, { "epoch": 0.2936068149863535, "grad_norm": 0.05324044078588486, "learning_rate": 8.464734349352544e-06, "loss": 0.024, "step": 3550 }, { "epoch": 0.2944338764370193, "grad_norm": 0.0497773103415966, "learning_rate": 8.454861343189274e-06, "loss": 0.0236, "step": 3560 }, { "epoch": 0.29526093788768504, "grad_norm": 0.04881919547915459, "learning_rate": 8.444962491077604e-06, "loss": 0.0236, "step": 3570 }, { "epoch": 0.29608799933835084, "grad_norm": 0.054020971059799194, "learning_rate": 8.435037867071225e-06, "loss": 0.0264, "step": 3580 }, { "epoch": 0.29691506078901664, "grad_norm": 0.04821145534515381, "learning_rate": 8.425087545416622e-06, "loss": 0.0235, "step": 3590 }, { "epoch": 0.2977421222396824, "grad_norm": 0.04773546755313873, "learning_rate": 8.41511160055253e-06, "loss": 0.0406, "step": 3600 }, { "epoch": 0.2985691836903482, "grad_norm": 0.06340964883565903, "learning_rate": 8.405110107109365e-06, "loss": 0.0252, "step": 3610 }, { "epoch": 0.299396245141014, "grad_norm": 0.0523238480091095, "learning_rate": 8.395083139908684e-06, "loss": 0.0245, "step": 3620 }, { "epoch": 0.3002233065916798, "grad_norm": 0.04797879606485367, "learning_rate": 8.385030773962605e-06, "loss": 0.0257, "step": 3630 }, { "epoch": 0.30105036804234553, "grad_norm": 0.05554933100938797, "learning_rate": 8.37495308447326e-06, "loss": 0.0233, "step": 3640 }, { "epoch": 0.30187742949301133, "grad_norm": 0.08046616613864899, "learning_rate": 8.364850146832218e-06, "loss": 0.0237, "step": 3650 }, { "epoch": 0.30270449094367713, "grad_norm": 0.04799005016684532, "learning_rate": 8.354722036619947e-06, "loss": 0.0244, "step": 3660 }, { "epoch": 0.3035315523943429, "grad_norm": 0.05324197933077812, "learning_rate": 8.344568829605216e-06, "loss": 0.0232, "step": 3670 }, { "epoch": 0.3043586138450087, "grad_norm": 0.04944256320595741, "learning_rate": 8.334390601744556e-06, "loss": 0.0255, "step": 3680 }, { "epoch": 0.3051856752956745, "grad_norm": 0.0510077141225338, "learning_rate": 8.324187429181669e-06, "loss": 0.0252, "step": 3690 }, { "epoch": 0.3060127367463403, "grad_norm": 0.045672621577978134, "learning_rate": 8.313959388246882e-06, "loss": 0.0257, "step": 3700 }, { "epoch": 0.306839798197006, "grad_norm": 0.04965253919363022, "learning_rate": 8.303706555456547e-06, "loss": 0.0291, "step": 3710 }, { "epoch": 0.3076668596476718, "grad_norm": 0.043674346059560776, "learning_rate": 8.293429007512503e-06, "loss": 0.0253, "step": 3720 }, { "epoch": 0.3084939210983376, "grad_norm": 0.04634533450007439, "learning_rate": 8.283126821301468e-06, "loss": 0.0236, "step": 3730 }, { "epoch": 0.30932098254900336, "grad_norm": 0.06959991902112961, "learning_rate": 8.272800073894492e-06, "loss": 0.0245, "step": 3740 }, { "epoch": 0.31014804399966917, "grad_norm": 0.04980204254388809, "learning_rate": 8.26244884254636e-06, "loss": 0.0237, "step": 3750 }, { "epoch": 0.31097510545033497, "grad_norm": 0.052351828664541245, "learning_rate": 8.252073204695025e-06, "loss": 0.0257, "step": 3760 }, { "epoch": 0.31180216690100077, "grad_norm": 0.04672665148973465, "learning_rate": 8.241673237961027e-06, "loss": 0.0238, "step": 3770 }, { "epoch": 0.3126292283516665, "grad_norm": 0.041996221989393234, "learning_rate": 8.231249020146913e-06, "loss": 0.024, "step": 3780 }, { "epoch": 0.3134562898023323, "grad_norm": 0.05913085490465164, "learning_rate": 8.220800629236647e-06, "loss": 0.0244, "step": 3790 }, { "epoch": 0.3142833512529981, "grad_norm": 0.04715942218899727, "learning_rate": 8.21032814339504e-06, "loss": 0.0239, "step": 3800 }, { "epoch": 0.31511041270366386, "grad_norm": 0.04261414706707001, "learning_rate": 8.19983164096715e-06, "loss": 0.0231, "step": 3810 }, { "epoch": 0.31593747415432966, "grad_norm": 0.05027526617050171, "learning_rate": 8.189311200477713e-06, "loss": 0.0245, "step": 3820 }, { "epoch": 0.31676453560499546, "grad_norm": 0.19037795066833496, "learning_rate": 8.17876690063054e-06, "loss": 0.0242, "step": 3830 }, { "epoch": 0.31759159705566126, "grad_norm": 0.09254226088523865, "learning_rate": 8.168198820307938e-06, "loss": 0.0234, "step": 3840 }, { "epoch": 0.318418658506327, "grad_norm": 0.04657592624425888, "learning_rate": 8.157607038570117e-06, "loss": 0.0241, "step": 3850 }, { "epoch": 0.3192457199569928, "grad_norm": 0.06853280961513519, "learning_rate": 8.146991634654595e-06, "loss": 0.0261, "step": 3860 }, { "epoch": 0.3200727814076586, "grad_norm": 0.05595746263861656, "learning_rate": 8.136352687975609e-06, "loss": 0.0242, "step": 3870 }, { "epoch": 0.32089984285832435, "grad_norm": 0.04363076388835907, "learning_rate": 8.125690278123524e-06, "loss": 0.0235, "step": 3880 }, { "epoch": 0.32172690430899015, "grad_norm": 0.06170443445444107, "learning_rate": 8.115004484864231e-06, "loss": 0.0233, "step": 3890 }, { "epoch": 0.32255396575965595, "grad_norm": 0.04467644914984703, "learning_rate": 8.104295388138553e-06, "loss": 0.0245, "step": 3900 }, { "epoch": 0.32338102721032175, "grad_norm": 0.06176682561635971, "learning_rate": 8.093563068061649e-06, "loss": 0.0232, "step": 3910 }, { "epoch": 0.3242080886609875, "grad_norm": 0.047685880213975906, "learning_rate": 8.082807604922409e-06, "loss": 0.0248, "step": 3920 }, { "epoch": 0.3250351501116533, "grad_norm": 0.05187467485666275, "learning_rate": 8.072029079182862e-06, "loss": 0.0245, "step": 3930 }, { "epoch": 0.3258622115623191, "grad_norm": 0.04737105965614319, "learning_rate": 8.061227571477565e-06, "loss": 0.0268, "step": 3940 }, { "epoch": 0.32668927301298484, "grad_norm": 0.04560704901814461, "learning_rate": 8.050403162613007e-06, "loss": 0.024, "step": 3950 }, { "epoch": 0.32751633446365064, "grad_norm": 0.057890139520168304, "learning_rate": 8.039555933567e-06, "loss": 0.0267, "step": 3960 }, { "epoch": 0.32834339591431644, "grad_norm": 0.04416472092270851, "learning_rate": 8.028685965488074e-06, "loss": 0.0241, "step": 3970 }, { "epoch": 0.32917045736498224, "grad_norm": 0.04871301352977753, "learning_rate": 8.017793339694873e-06, "loss": 0.0237, "step": 3980 }, { "epoch": 0.329997518815648, "grad_norm": 0.05144352838397026, "learning_rate": 8.00687813767554e-06, "loss": 0.0236, "step": 3990 }, { "epoch": 0.3308245802663138, "grad_norm": 0.06144755333662033, "learning_rate": 7.995940441087117e-06, "loss": 0.0228, "step": 4000 }, { "epoch": 0.3308245802663138, "eval_loss": 0.025024140253663063, "eval_runtime": 1220.32, "eval_samples_per_second": 4.916, "eval_steps_per_second": 0.307, "step": 4000 }, { "epoch": 0.3316516417169796, "grad_norm": 0.07986024022102356, "learning_rate": 7.984980331754924e-06, "loss": 0.0249, "step": 4010 }, { "epoch": 0.33247870316764533, "grad_norm": 0.04930829629302025, "learning_rate": 7.973997891671953e-06, "loss": 0.024, "step": 4020 }, { "epoch": 0.33330576461831113, "grad_norm": 0.07743251323699951, "learning_rate": 7.962993202998257e-06, "loss": 0.0234, "step": 4030 }, { "epoch": 0.33413282606897693, "grad_norm": 0.05702010914683342, "learning_rate": 7.951966348060325e-06, "loss": 0.025, "step": 4040 }, { "epoch": 0.33495988751964273, "grad_norm": 0.042675841599702835, "learning_rate": 7.940917409350476e-06, "loss": 0.0245, "step": 4050 }, { "epoch": 0.3357869489703085, "grad_norm": 0.04492352157831192, "learning_rate": 7.929846469526242e-06, "loss": 0.025, "step": 4060 }, { "epoch": 0.3366140104209743, "grad_norm": 0.07774407416582108, "learning_rate": 7.91875361140974e-06, "loss": 0.0226, "step": 4070 }, { "epoch": 0.3374410718716401, "grad_norm": 0.06625732779502869, "learning_rate": 7.90763891798706e-06, "loss": 0.0235, "step": 4080 }, { "epoch": 0.3382681333223059, "grad_norm": 0.048172276467084885, "learning_rate": 7.896502472407644e-06, "loss": 0.0236, "step": 4090 }, { "epoch": 0.3390951947729716, "grad_norm": 0.05588380619883537, "learning_rate": 7.885344357983665e-06, "loss": 0.0365, "step": 4100 }, { "epoch": 0.3399222562236374, "grad_norm": 0.04697740450501442, "learning_rate": 7.874164658189398e-06, "loss": 0.0261, "step": 4110 }, { "epoch": 0.3407493176743032, "grad_norm": 0.14661569893360138, "learning_rate": 7.8629634566606e-06, "loss": 0.0422, "step": 4120 }, { "epoch": 0.34157637912496897, "grad_norm": 0.050860997289419174, "learning_rate": 7.851740837193883e-06, "loss": 0.0253, "step": 4130 }, { "epoch": 0.34240344057563477, "grad_norm": 0.06831306964159012, "learning_rate": 7.840496883746089e-06, "loss": 0.0236, "step": 4140 }, { "epoch": 0.34323050202630057, "grad_norm": 0.07154014706611633, "learning_rate": 7.829231680433658e-06, "loss": 0.0241, "step": 4150 }, { "epoch": 0.34405756347696637, "grad_norm": 0.060069840401411057, "learning_rate": 7.817945311532001e-06, "loss": 0.0233, "step": 4160 }, { "epoch": 0.3448846249276321, "grad_norm": 0.06343766301870346, "learning_rate": 7.806637861474873e-06, "loss": 0.029, "step": 4170 }, { "epoch": 0.3457116863782979, "grad_norm": 0.046083442866802216, "learning_rate": 7.795309414853735e-06, "loss": 0.0233, "step": 4180 }, { "epoch": 0.3465387478289637, "grad_norm": 0.04395199194550514, "learning_rate": 7.783960056417123e-06, "loss": 0.024, "step": 4190 }, { "epoch": 0.34736580927962946, "grad_norm": 0.04960530623793602, "learning_rate": 7.77258987107002e-06, "loss": 0.0252, "step": 4200 }, { "epoch": 0.34819287073029526, "grad_norm": 0.053416695445775986, "learning_rate": 7.76119894387321e-06, "loss": 0.0233, "step": 4210 }, { "epoch": 0.34901993218096106, "grad_norm": 0.06489969789981842, "learning_rate": 7.749787360042651e-06, "loss": 0.0225, "step": 4220 }, { "epoch": 0.34984699363162686, "grad_norm": 0.054353874176740646, "learning_rate": 7.738355204948833e-06, "loss": 0.025, "step": 4230 }, { "epoch": 0.3506740550822926, "grad_norm": 0.05458907410502434, "learning_rate": 7.726902564116141e-06, "loss": 0.0234, "step": 4240 }, { "epoch": 0.3515011165329584, "grad_norm": 0.04842905327677727, "learning_rate": 7.715429523222214e-06, "loss": 0.0221, "step": 4250 }, { "epoch": 0.3523281779836242, "grad_norm": 0.0519806407392025, "learning_rate": 7.703936168097306e-06, "loss": 0.0239, "step": 4260 }, { "epoch": 0.35315523943428995, "grad_norm": 0.05236365273594856, "learning_rate": 7.692422584723641e-06, "loss": 0.0235, "step": 4270 }, { "epoch": 0.35398230088495575, "grad_norm": 0.04914037883281708, "learning_rate": 7.68088885923477e-06, "loss": 0.0235, "step": 4280 }, { "epoch": 0.35480936233562155, "grad_norm": 0.05043815076351166, "learning_rate": 7.669335077914932e-06, "loss": 0.0241, "step": 4290 }, { "epoch": 0.35563642378628735, "grad_norm": 0.04599103704094887, "learning_rate": 7.657761327198404e-06, "loss": 0.0242, "step": 4300 }, { "epoch": 0.3564634852369531, "grad_norm": 0.04246712476015091, "learning_rate": 7.646167693668846e-06, "loss": 0.0241, "step": 4310 }, { "epoch": 0.3572905466876189, "grad_norm": 0.04617106169462204, "learning_rate": 7.634554264058676e-06, "loss": 0.0235, "step": 4320 }, { "epoch": 0.3581176081382847, "grad_norm": 0.046657975763082504, "learning_rate": 7.6229211252483956e-06, "loss": 0.0233, "step": 4330 }, { "epoch": 0.35894466958895044, "grad_norm": 0.047864075750112534, "learning_rate": 7.611268364265958e-06, "loss": 0.0241, "step": 4340 }, { "epoch": 0.35977173103961624, "grad_norm": 0.054371584206819534, "learning_rate": 7.599596068286111e-06, "loss": 0.0238, "step": 4350 }, { "epoch": 0.36059879249028204, "grad_norm": 0.04631248489022255, "learning_rate": 7.58790432462974e-06, "loss": 0.0268, "step": 4360 }, { "epoch": 0.36142585394094784, "grad_norm": 0.06476343423128128, "learning_rate": 7.576193220763221e-06, "loss": 0.0246, "step": 4370 }, { "epoch": 0.3622529153916136, "grad_norm": 0.057965509593486786, "learning_rate": 7.564462844297766e-06, "loss": 0.0233, "step": 4380 }, { "epoch": 0.3630799768422794, "grad_norm": 0.05117254704236984, "learning_rate": 7.552713282988765e-06, "loss": 0.024, "step": 4390 }, { "epoch": 0.3639070382929452, "grad_norm": 0.0481458455324173, "learning_rate": 7.540944624735132e-06, "loss": 0.0233, "step": 4400 }, { "epoch": 0.36473409974361093, "grad_norm": 0.0458373986184597, "learning_rate": 7.529156957578641e-06, "loss": 0.0228, "step": 4410 }, { "epoch": 0.36556116119427673, "grad_norm": 0.043816305696964264, "learning_rate": 7.517350369703279e-06, "loss": 0.0234, "step": 4420 }, { "epoch": 0.36638822264494253, "grad_norm": 0.050691138952970505, "learning_rate": 7.505524949434575e-06, "loss": 0.0219, "step": 4430 }, { "epoch": 0.36721528409560833, "grad_norm": 0.0413176566362381, "learning_rate": 7.493680785238948e-06, "loss": 0.0231, "step": 4440 }, { "epoch": 0.3680423455462741, "grad_norm": 0.04249545931816101, "learning_rate": 7.481817965723035e-06, "loss": 0.0226, "step": 4450 }, { "epoch": 0.3688694069969399, "grad_norm": 0.05581935495138168, "learning_rate": 7.4699365796330395e-06, "loss": 0.0265, "step": 4460 }, { "epoch": 0.3696964684476057, "grad_norm": 0.0569755993783474, "learning_rate": 7.458036715854059e-06, "loss": 0.0232, "step": 4470 }, { "epoch": 0.3705235298982714, "grad_norm": 0.05333729833364487, "learning_rate": 7.4461184634094256e-06, "loss": 0.0242, "step": 4480 }, { "epoch": 0.3713505913489372, "grad_norm": 0.05248766019940376, "learning_rate": 7.434181911460036e-06, "loss": 0.0307, "step": 4490 }, { "epoch": 0.372177652799603, "grad_norm": 0.043839454650878906, "learning_rate": 7.4222271493036875e-06, "loss": 0.0241, "step": 4500 }, { "epoch": 0.3730047142502688, "grad_norm": 0.05857829377055168, "learning_rate": 7.41025426637441e-06, "loss": 0.0223, "step": 4510 }, { "epoch": 0.37383177570093457, "grad_norm": 0.041583914309740067, "learning_rate": 7.398263352241788e-06, "loss": 0.0225, "step": 4520 }, { "epoch": 0.37465883715160037, "grad_norm": 0.043787844479084015, "learning_rate": 7.386254496610309e-06, "loss": 0.0215, "step": 4530 }, { "epoch": 0.37548589860226617, "grad_norm": 0.04298454895615578, "learning_rate": 7.374227789318673e-06, "loss": 0.0229, "step": 4540 }, { "epoch": 0.3763129600529319, "grad_norm": 0.05074107274413109, "learning_rate": 7.362183320339133e-06, "loss": 0.023, "step": 4550 }, { "epoch": 0.3771400215035977, "grad_norm": 0.06284487992525101, "learning_rate": 7.350121179776819e-06, "loss": 0.0231, "step": 4560 }, { "epoch": 0.3779670829542635, "grad_norm": 0.053102780133485794, "learning_rate": 7.33804145786906e-06, "loss": 0.0255, "step": 4570 }, { "epoch": 0.3787941444049293, "grad_norm": 0.04331573098897934, "learning_rate": 7.325944244984711e-06, "loss": 0.0228, "step": 4580 }, { "epoch": 0.37962120585559506, "grad_norm": 0.051730215549468994, "learning_rate": 7.31382963162348e-06, "loss": 0.0216, "step": 4590 }, { "epoch": 0.38044826730626086, "grad_norm": 0.03934797644615173, "learning_rate": 7.301697708415248e-06, "loss": 0.0242, "step": 4600 }, { "epoch": 0.38127532875692666, "grad_norm": 0.04784635826945305, "learning_rate": 7.289548566119391e-06, "loss": 0.0221, "step": 4610 }, { "epoch": 0.3821023902075924, "grad_norm": 0.1260228306055069, "learning_rate": 7.277382295624104e-06, "loss": 0.0282, "step": 4620 }, { "epoch": 0.3829294516582582, "grad_norm": 0.06200871989130974, "learning_rate": 7.265198987945714e-06, "loss": 0.0261, "step": 4630 }, { "epoch": 0.383756513108924, "grad_norm": 0.061095982789993286, "learning_rate": 7.252998734228007e-06, "loss": 0.0245, "step": 4640 }, { "epoch": 0.3845835745595898, "grad_norm": 0.053159236907958984, "learning_rate": 7.240781625741545e-06, "loss": 0.0233, "step": 4650 }, { "epoch": 0.38541063601025555, "grad_norm": 0.0482206866145134, "learning_rate": 7.228547753882976e-06, "loss": 0.0261, "step": 4660 }, { "epoch": 0.38623769746092135, "grad_norm": 0.05078030377626419, "learning_rate": 7.216297210174361e-06, "loss": 0.0244, "step": 4670 }, { "epoch": 0.38706475891158715, "grad_norm": 0.044170767068862915, "learning_rate": 7.204030086262478e-06, "loss": 0.0238, "step": 4680 }, { "epoch": 0.3878918203622529, "grad_norm": 0.04695448279380798, "learning_rate": 7.191746473918148e-06, "loss": 0.0223, "step": 4690 }, { "epoch": 0.3887188818129187, "grad_norm": 0.052788231521844864, "learning_rate": 7.179446465035535e-06, "loss": 0.0249, "step": 4700 }, { "epoch": 0.3895459432635845, "grad_norm": 0.05831609293818474, "learning_rate": 7.167130151631475e-06, "loss": 0.0244, "step": 4710 }, { "epoch": 0.3903730047142503, "grad_norm": 0.05152612552046776, "learning_rate": 7.154797625844773e-06, "loss": 0.0224, "step": 4720 }, { "epoch": 0.39120006616491604, "grad_norm": 0.047528255730867386, "learning_rate": 7.142448979935521e-06, "loss": 0.0236, "step": 4730 }, { "epoch": 0.39202712761558184, "grad_norm": 0.051114026457071304, "learning_rate": 7.130084306284406e-06, "loss": 0.0235, "step": 4740 }, { "epoch": 0.39285418906624764, "grad_norm": 0.04298287630081177, "learning_rate": 7.11770369739202e-06, "loss": 0.0224, "step": 4750 }, { "epoch": 0.3936812505169134, "grad_norm": 0.05048811435699463, "learning_rate": 7.105307245878166e-06, "loss": 0.0238, "step": 4760 }, { "epoch": 0.3945083119675792, "grad_norm": 0.04245224595069885, "learning_rate": 7.092895044481165e-06, "loss": 0.0235, "step": 4770 }, { "epoch": 0.395335373418245, "grad_norm": 0.05021793767809868, "learning_rate": 7.080467186057168e-06, "loss": 0.0228, "step": 4780 }, { "epoch": 0.3961624348689108, "grad_norm": 0.04611439257860184, "learning_rate": 7.068023763579453e-06, "loss": 0.0304, "step": 4790 }, { "epoch": 0.39698949631957653, "grad_norm": 0.050482239574193954, "learning_rate": 7.055564870137733e-06, "loss": 0.0241, "step": 4800 }, { "epoch": 0.39781655777024233, "grad_norm": 0.050899263471364975, "learning_rate": 7.043090598937463e-06, "loss": 0.0246, "step": 4810 }, { "epoch": 0.39864361922090813, "grad_norm": 0.05052196979522705, "learning_rate": 7.030601043299138e-06, "loss": 0.0238, "step": 4820 }, { "epoch": 0.3994706806715739, "grad_norm": 0.04977920651435852, "learning_rate": 7.018096296657595e-06, "loss": 0.0234, "step": 4830 }, { "epoch": 0.4002977421222397, "grad_norm": 0.0429433137178421, "learning_rate": 7.005576452561314e-06, "loss": 0.0249, "step": 4840 }, { "epoch": 0.4011248035729055, "grad_norm": 0.04633225128054619, "learning_rate": 6.993041604671727e-06, "loss": 0.0221, "step": 4850 }, { "epoch": 0.4019518650235713, "grad_norm": 0.044517192989587784, "learning_rate": 6.980491846762503e-06, "loss": 0.023, "step": 4860 }, { "epoch": 0.402778926474237, "grad_norm": 0.04668491706252098, "learning_rate": 6.967927272718855e-06, "loss": 0.023, "step": 4870 }, { "epoch": 0.4036059879249028, "grad_norm": 0.13357175886631012, "learning_rate": 6.955347976536841e-06, "loss": 0.0218, "step": 4880 }, { "epoch": 0.4044330493755686, "grad_norm": 0.04721111059188843, "learning_rate": 6.942754052322645e-06, "loss": 0.0222, "step": 4890 }, { "epoch": 0.40526011082623437, "grad_norm": 0.07329542189836502, "learning_rate": 6.9301455942918934e-06, "loss": 0.0219, "step": 4900 }, { "epoch": 0.40608717227690017, "grad_norm": 0.04098494350910187, "learning_rate": 6.9175226967689395e-06, "loss": 0.0224, "step": 4910 }, { "epoch": 0.40691423372756597, "grad_norm": 0.0693870559334755, "learning_rate": 6.904885454186155e-06, "loss": 0.0239, "step": 4920 }, { "epoch": 0.40774129517823177, "grad_norm": 0.04788215458393097, "learning_rate": 6.89223396108323e-06, "loss": 0.0278, "step": 4930 }, { "epoch": 0.4085683566288975, "grad_norm": 0.041839174926280975, "learning_rate": 6.879568312106462e-06, "loss": 0.0215, "step": 4940 }, { "epoch": 0.4093954180795633, "grad_norm": 0.04695757478475571, "learning_rate": 6.866888602008053e-06, "loss": 0.0235, "step": 4950 }, { "epoch": 0.4102224795302291, "grad_norm": 0.05025403946638107, "learning_rate": 6.854194925645392e-06, "loss": 0.023, "step": 4960 }, { "epoch": 0.41104954098089486, "grad_norm": 0.05418792739510536, "learning_rate": 6.841487377980353e-06, "loss": 0.0247, "step": 4970 }, { "epoch": 0.41187660243156066, "grad_norm": 0.05611952021718025, "learning_rate": 6.82876605407858e-06, "loss": 0.023, "step": 4980 }, { "epoch": 0.41270366388222646, "grad_norm": 0.04246920347213745, "learning_rate": 6.816031049108777e-06, "loss": 0.024, "step": 4990 }, { "epoch": 0.41353072533289226, "grad_norm": 0.044995930045843124, "learning_rate": 6.803282458342e-06, "loss": 0.0215, "step": 5000 }, { "epoch": 0.41353072533289226, "eval_loss": 0.024215074256062508, "eval_runtime": 1219.5377, "eval_samples_per_second": 4.919, "eval_steps_per_second": 0.307, "step": 5000 }, { "epoch": 0.414357786783558, "grad_norm": 0.05199587345123291, "learning_rate": 6.790520377150939e-06, "loss": 0.0233, "step": 5010 }, { "epoch": 0.4151848482342238, "grad_norm": 0.04450158774852753, "learning_rate": 6.777744901009204e-06, "loss": 0.023, "step": 5020 }, { "epoch": 0.4160119096848896, "grad_norm": 0.0536041297018528, "learning_rate": 6.764956125490616e-06, "loss": 0.022, "step": 5030 }, { "epoch": 0.41683897113555535, "grad_norm": 0.04742833226919174, "learning_rate": 6.752154146268491e-06, "loss": 0.0267, "step": 5040 }, { "epoch": 0.41766603258622115, "grad_norm": 0.05334756523370743, "learning_rate": 6.739339059114916e-06, "loss": 0.0232, "step": 5050 }, { "epoch": 0.41849309403688695, "grad_norm": 0.0501900352537632, "learning_rate": 6.726510959900046e-06, "loss": 0.0248, "step": 5060 }, { "epoch": 0.41932015548755275, "grad_norm": 0.04328269138932228, "learning_rate": 6.713669944591375e-06, "loss": 0.0229, "step": 5070 }, { "epoch": 0.4201472169382185, "grad_norm": 0.04845112934708595, "learning_rate": 6.700816109253023e-06, "loss": 0.0242, "step": 5080 }, { "epoch": 0.4209742783888843, "grad_norm": 0.051792118698358536, "learning_rate": 6.6879495500450184e-06, "loss": 0.0224, "step": 5090 }, { "epoch": 0.4218013398395501, "grad_norm": 0.03820064663887024, "learning_rate": 6.675070363222581e-06, "loss": 0.0225, "step": 5100 }, { "epoch": 0.42262840129021584, "grad_norm": 0.04609294980764389, "learning_rate": 6.662178645135392e-06, "loss": 0.0222, "step": 5110 }, { "epoch": 0.42345546274088164, "grad_norm": 0.043115533888339996, "learning_rate": 6.649274492226882e-06, "loss": 0.0229, "step": 5120 }, { "epoch": 0.42428252419154744, "grad_norm": 0.04883312061429024, "learning_rate": 6.636358001033508e-06, "loss": 0.0228, "step": 5130 }, { "epoch": 0.42510958564221324, "grad_norm": 0.062484171241521835, "learning_rate": 6.623429268184027e-06, "loss": 0.0237, "step": 5140 }, { "epoch": 0.425936647092879, "grad_norm": 0.0440596267580986, "learning_rate": 6.6104883903987815e-06, "loss": 0.0264, "step": 5150 }, { "epoch": 0.4267637085435448, "grad_norm": 0.04892463609576225, "learning_rate": 6.5975354644889665e-06, "loss": 0.0217, "step": 5160 }, { "epoch": 0.4275907699942106, "grad_norm": 0.04017140343785286, "learning_rate": 6.5845705873559094e-06, "loss": 0.0225, "step": 5170 }, { "epoch": 0.42841783144487633, "grad_norm": 0.04880579188466072, "learning_rate": 6.571593855990348e-06, "loss": 0.023, "step": 5180 }, { "epoch": 0.42924489289554213, "grad_norm": 0.06134543567895889, "learning_rate": 6.5586053674717e-06, "loss": 0.0227, "step": 5190 }, { "epoch": 0.43007195434620793, "grad_norm": 0.03942278400063515, "learning_rate": 6.545605218967341e-06, "loss": 0.0222, "step": 5200 }, { "epoch": 0.43089901579687373, "grad_norm": 0.04633478447794914, "learning_rate": 6.5325935077318705e-06, "loss": 0.0226, "step": 5210 }, { "epoch": 0.4317260772475395, "grad_norm": 0.06766749918460846, "learning_rate": 6.519570331106395e-06, "loss": 0.0226, "step": 5220 }, { "epoch": 0.4325531386982053, "grad_norm": 0.04740046337246895, "learning_rate": 6.506535786517789e-06, "loss": 0.0261, "step": 5230 }, { "epoch": 0.4333802001488711, "grad_norm": 0.05168073996901512, "learning_rate": 6.493489971477977e-06, "loss": 0.0242, "step": 5240 }, { "epoch": 0.4342072615995368, "grad_norm": 0.05117257684469223, "learning_rate": 6.480432983583194e-06, "loss": 0.0276, "step": 5250 }, { "epoch": 0.4350343230502026, "grad_norm": 0.05560829117894173, "learning_rate": 6.467364920513257e-06, "loss": 0.0235, "step": 5260 }, { "epoch": 0.4358613845008684, "grad_norm": 0.04257509857416153, "learning_rate": 6.454285880030844e-06, "loss": 0.022, "step": 5270 }, { "epoch": 0.4366884459515342, "grad_norm": 0.047841571271419525, "learning_rate": 6.441195959980749e-06, "loss": 0.0235, "step": 5280 }, { "epoch": 0.43751550740219997, "grad_norm": 0.04220358282327652, "learning_rate": 6.428095258289162e-06, "loss": 0.0227, "step": 5290 }, { "epoch": 0.43834256885286577, "grad_norm": 0.04904833808541298, "learning_rate": 6.414983872962924e-06, "loss": 0.023, "step": 5300 }, { "epoch": 0.43916963030353157, "grad_norm": 0.041855499148368835, "learning_rate": 6.401861902088809e-06, "loss": 0.0247, "step": 5310 }, { "epoch": 0.4399966917541973, "grad_norm": 0.046882931143045425, "learning_rate": 6.388729443832774e-06, "loss": 0.0218, "step": 5320 }, { "epoch": 0.4408237532048631, "grad_norm": 0.06054188311100006, "learning_rate": 6.375586596439237e-06, "loss": 0.0239, "step": 5330 }, { "epoch": 0.4416508146555289, "grad_norm": 0.04277319461107254, "learning_rate": 6.362433458230337e-06, "loss": 0.0232, "step": 5340 }, { "epoch": 0.4424778761061947, "grad_norm": 0.050606515258550644, "learning_rate": 6.349270127605198e-06, "loss": 0.0224, "step": 5350 }, { "epoch": 0.44330493755686046, "grad_norm": 0.050200313329696655, "learning_rate": 6.336096703039196e-06, "loss": 0.0225, "step": 5360 }, { "epoch": 0.44413199900752626, "grad_norm": 0.0431785061955452, "learning_rate": 6.322913283083214e-06, "loss": 0.0223, "step": 5370 }, { "epoch": 0.44495906045819206, "grad_norm": 0.04577941447496414, "learning_rate": 6.309719966362922e-06, "loss": 0.0219, "step": 5380 }, { "epoch": 0.4457861219088578, "grad_norm": 0.04745447263121605, "learning_rate": 6.296516851578016e-06, "loss": 0.0239, "step": 5390 }, { "epoch": 0.4466131833595236, "grad_norm": 0.0505000539124012, "learning_rate": 6.283304037501501e-06, "loss": 0.0238, "step": 5400 }, { "epoch": 0.4474402448101894, "grad_norm": 0.0681275799870491, "learning_rate": 6.270081622978934e-06, "loss": 0.0238, "step": 5410 }, { "epoch": 0.4482673062608552, "grad_norm": 0.05186863988637924, "learning_rate": 6.256849706927703e-06, "loss": 0.0225, "step": 5420 }, { "epoch": 0.44909436771152095, "grad_norm": 0.04716340824961662, "learning_rate": 6.2436083883362706e-06, "loss": 0.022, "step": 5430 }, { "epoch": 0.44992142916218675, "grad_norm": 0.042241595685482025, "learning_rate": 6.230357766263442e-06, "loss": 0.0216, "step": 5440 }, { "epoch": 0.45074849061285255, "grad_norm": 0.04572228342294693, "learning_rate": 6.217097939837623e-06, "loss": 0.0219, "step": 5450 }, { "epoch": 0.4515755520635183, "grad_norm": 0.05299137532711029, "learning_rate": 6.203829008256075e-06, "loss": 0.0222, "step": 5460 }, { "epoch": 0.4524026135141841, "grad_norm": 0.04044192656874657, "learning_rate": 6.190551070784179e-06, "loss": 0.0233, "step": 5470 }, { "epoch": 0.4532296749648499, "grad_norm": 0.04427442327141762, "learning_rate": 6.177264226754685e-06, "loss": 0.0239, "step": 5480 }, { "epoch": 0.4540567364155157, "grad_norm": 0.0423441156744957, "learning_rate": 6.163968575566979e-06, "loss": 0.0243, "step": 5490 }, { "epoch": 0.45488379786618144, "grad_norm": 0.052600838243961334, "learning_rate": 6.150664216686329e-06, "loss": 0.0231, "step": 5500 }, { "epoch": 0.45571085931684724, "grad_norm": 0.04956282302737236, "learning_rate": 6.137351249643147e-06, "loss": 0.0238, "step": 5510 }, { "epoch": 0.45653792076751304, "grad_norm": 0.037822380661964417, "learning_rate": 6.124029774032242e-06, "loss": 0.0224, "step": 5520 }, { "epoch": 0.4573649822181788, "grad_norm": 0.04192859306931496, "learning_rate": 6.110699889512077e-06, "loss": 0.0273, "step": 5530 }, { "epoch": 0.4581920436688446, "grad_norm": 0.04586039483547211, "learning_rate": 6.0973616958040265e-06, "loss": 0.0223, "step": 5540 }, { "epoch": 0.4590191051195104, "grad_norm": 0.049864090979099274, "learning_rate": 6.084015292691617e-06, "loss": 0.0237, "step": 5550 }, { "epoch": 0.4598461665701762, "grad_norm": 0.061950068920850754, "learning_rate": 6.070660780019797e-06, "loss": 0.0228, "step": 5560 }, { "epoch": 0.46067322802084193, "grad_norm": 0.04114188626408577, "learning_rate": 6.057298257694182e-06, "loss": 0.0233, "step": 5570 }, { "epoch": 0.46150028947150773, "grad_norm": 0.048220761120319366, "learning_rate": 6.043927825680305e-06, "loss": 0.0285, "step": 5580 }, { "epoch": 0.46232735092217353, "grad_norm": 0.047901052981615067, "learning_rate": 6.030549584002876e-06, "loss": 0.0247, "step": 5590 }, { "epoch": 0.4631544123728393, "grad_norm": 0.04301442205905914, "learning_rate": 6.017163632745025e-06, "loss": 0.0222, "step": 5600 }, { "epoch": 0.4639814738235051, "grad_norm": 0.059639185667037964, "learning_rate": 6.003770072047559e-06, "loss": 0.0224, "step": 5610 }, { "epoch": 0.4648085352741709, "grad_norm": 0.05088592320680618, "learning_rate": 5.990369002108215e-06, "loss": 0.0255, "step": 5620 }, { "epoch": 0.4656355967248367, "grad_norm": 0.04898575693368912, "learning_rate": 5.976960523180904e-06, "loss": 0.0221, "step": 5630 }, { "epoch": 0.4664626581755024, "grad_norm": 0.04929777607321739, "learning_rate": 5.963544735574961e-06, "loss": 0.023, "step": 5640 }, { "epoch": 0.4672897196261682, "grad_norm": 0.04379523918032646, "learning_rate": 5.9501217396544034e-06, "loss": 0.023, "step": 5650 }, { "epoch": 0.468116781076834, "grad_norm": 0.049279894679784775, "learning_rate": 5.93669163583717e-06, "loss": 0.0232, "step": 5660 }, { "epoch": 0.46894384252749977, "grad_norm": 0.044354990124702454, "learning_rate": 5.923254524594376e-06, "loss": 0.0229, "step": 5670 }, { "epoch": 0.46977090397816557, "grad_norm": 0.05658494308590889, "learning_rate": 5.9098105064495606e-06, "loss": 0.0221, "step": 5680 }, { "epoch": 0.47059796542883137, "grad_norm": 0.041339486837387085, "learning_rate": 5.896359681977928e-06, "loss": 0.0226, "step": 5690 }, { "epoch": 0.47142502687949717, "grad_norm": 0.052800171077251434, "learning_rate": 5.8829021518056095e-06, "loss": 0.0237, "step": 5700 }, { "epoch": 0.4722520883301629, "grad_norm": 0.04378625750541687, "learning_rate": 5.869438016608893e-06, "loss": 0.0241, "step": 5710 }, { "epoch": 0.4730791497808287, "grad_norm": 0.08634616434574127, "learning_rate": 5.855967377113487e-06, "loss": 0.0263, "step": 5720 }, { "epoch": 0.4739062112314945, "grad_norm": 0.0738649070262909, "learning_rate": 5.842490334093752e-06, "loss": 0.0231, "step": 5730 }, { "epoch": 0.4747332726821603, "grad_norm": 0.04509838670492172, "learning_rate": 5.829006988371959e-06, "loss": 0.0231, "step": 5740 }, { "epoch": 0.47556033413282606, "grad_norm": 0.044409893453121185, "learning_rate": 5.815517440817526e-06, "loss": 0.0222, "step": 5750 }, { "epoch": 0.47638739558349186, "grad_norm": 0.04454704746603966, "learning_rate": 5.8020217923462696e-06, "loss": 0.022, "step": 5760 }, { "epoch": 0.47721445703415766, "grad_norm": 0.04391258582472801, "learning_rate": 5.788520143919647e-06, "loss": 0.0223, "step": 5770 }, { "epoch": 0.4780415184848234, "grad_norm": 0.039742667227983475, "learning_rate": 5.775012596543999e-06, "loss": 0.0236, "step": 5780 }, { "epoch": 0.4788685799354892, "grad_norm": 0.04627054184675217, "learning_rate": 5.761499251269798e-06, "loss": 0.0225, "step": 5790 }, { "epoch": 0.479695641386155, "grad_norm": 0.03860992565751076, "learning_rate": 5.7479802091908945e-06, "loss": 0.0268, "step": 5800 }, { "epoch": 0.4805227028368208, "grad_norm": 0.04734113812446594, "learning_rate": 5.734455571443751e-06, "loss": 0.0233, "step": 5810 }, { "epoch": 0.48134976428748655, "grad_norm": 0.07089436799287796, "learning_rate": 5.720925439206695e-06, "loss": 0.0267, "step": 5820 }, { "epoch": 0.48217682573815235, "grad_norm": 0.04937206953763962, "learning_rate": 5.707389913699157e-06, "loss": 0.0225, "step": 5830 }, { "epoch": 0.48300388718881815, "grad_norm": 0.04481448978185654, "learning_rate": 5.693849096180917e-06, "loss": 0.0221, "step": 5840 }, { "epoch": 0.4838309486394839, "grad_norm": 0.051826462149620056, "learning_rate": 5.680303087951339e-06, "loss": 0.0237, "step": 5850 }, { "epoch": 0.4846580100901497, "grad_norm": 0.13001324236392975, "learning_rate": 5.666751990348627e-06, "loss": 0.0223, "step": 5860 }, { "epoch": 0.4854850715408155, "grad_norm": 0.04917273670434952, "learning_rate": 5.653195904749054e-06, "loss": 0.0219, "step": 5870 }, { "epoch": 0.4863121329914813, "grad_norm": 0.04470530524849892, "learning_rate": 5.639634932566208e-06, "loss": 0.0307, "step": 5880 }, { "epoch": 0.48713919444214704, "grad_norm": 0.04076725244522095, "learning_rate": 5.626069175250236e-06, "loss": 0.0223, "step": 5890 }, { "epoch": 0.48796625589281284, "grad_norm": 0.050211817026138306, "learning_rate": 5.61249873428708e-06, "loss": 0.0227, "step": 5900 }, { "epoch": 0.48879331734347864, "grad_norm": 0.03654312714934349, "learning_rate": 5.5989237111977255e-06, "loss": 0.0216, "step": 5910 }, { "epoch": 0.4896203787941444, "grad_norm": 0.050298597663640976, "learning_rate": 5.58534420753743e-06, "loss": 0.0217, "step": 5920 }, { "epoch": 0.4904474402448102, "grad_norm": 0.04905930534005165, "learning_rate": 5.571760324894977e-06, "loss": 0.0227, "step": 5930 }, { "epoch": 0.491274501695476, "grad_norm": 0.045814525336027145, "learning_rate": 5.558172164891903e-06, "loss": 0.0225, "step": 5940 }, { "epoch": 0.4921015631461418, "grad_norm": 0.06343957781791687, "learning_rate": 5.544579829181751e-06, "loss": 0.023, "step": 5950 }, { "epoch": 0.49292862459680753, "grad_norm": 0.042192984372377396, "learning_rate": 5.530983419449296e-06, "loss": 0.021, "step": 5960 }, { "epoch": 0.49375568604747333, "grad_norm": 0.04143495857715607, "learning_rate": 5.517383037409794e-06, "loss": 0.0253, "step": 5970 }, { "epoch": 0.49458274749813913, "grad_norm": 0.04273596778512001, "learning_rate": 5.503778784808218e-06, "loss": 0.0226, "step": 5980 }, { "epoch": 0.4954098089488049, "grad_norm": 0.047943755984306335, "learning_rate": 5.490170763418496e-06, "loss": 0.022, "step": 5990 }, { "epoch": 0.4962368703994707, "grad_norm": 0.045045025646686554, "learning_rate": 5.476559075042751e-06, "loss": 0.0216, "step": 6000 }, { "epoch": 0.4962368703994707, "eval_loss": 0.02347772754728794, "eval_runtime": 1220.4355, "eval_samples_per_second": 4.915, "eval_steps_per_second": 0.307, "step": 6000 }, { "epoch": 0.4970639318501365, "grad_norm": 0.04491131007671356, "learning_rate": 5.4629438215105375e-06, "loss": 0.0228, "step": 6010 }, { "epoch": 0.4978909933008023, "grad_norm": 0.053035978227853775, "learning_rate": 5.449325104678085e-06, "loss": 0.0233, "step": 6020 }, { "epoch": 0.498718054751468, "grad_norm": 0.04346757382154465, "learning_rate": 5.4357030264275256e-06, "loss": 0.0218, "step": 6030 }, { "epoch": 0.4995451162021338, "grad_norm": 0.03982304036617279, "learning_rate": 5.422077688666145e-06, "loss": 0.0216, "step": 6040 }, { "epoch": 0.5003721776527996, "grad_norm": 0.0594533309340477, "learning_rate": 5.4084491933256086e-06, "loss": 0.0228, "step": 6050 }, { "epoch": 0.5011992391034654, "grad_norm": 0.03943202272057533, "learning_rate": 5.394817642361206e-06, "loss": 0.0231, "step": 6060 }, { "epoch": 0.5020263005541312, "grad_norm": 0.03965817019343376, "learning_rate": 5.381183137751087e-06, "loss": 0.0234, "step": 6070 }, { "epoch": 0.5028533620047969, "grad_norm": 0.05061696469783783, "learning_rate": 5.367545781495495e-06, "loss": 0.0252, "step": 6080 }, { "epoch": 0.5036804234554627, "grad_norm": 0.0856064036488533, "learning_rate": 5.353905675616008e-06, "loss": 0.0228, "step": 6090 }, { "epoch": 0.5045074849061285, "grad_norm": 0.05830984562635422, "learning_rate": 5.340262922154773e-06, "loss": 0.0239, "step": 6100 }, { "epoch": 0.5053345463567943, "grad_norm": 0.042031850665807724, "learning_rate": 5.326617623173747e-06, "loss": 0.0218, "step": 6110 }, { "epoch": 0.5061616078074601, "grad_norm": 0.04255002364516258, "learning_rate": 5.312969880753928e-06, "loss": 0.0257, "step": 6120 }, { "epoch": 0.5069886692581259, "grad_norm": 0.046407558023929596, "learning_rate": 5.299319796994591e-06, "loss": 0.0214, "step": 6130 }, { "epoch": 0.5078157307087917, "grad_norm": 0.044977955520153046, "learning_rate": 5.285667474012529e-06, "loss": 0.0243, "step": 6140 }, { "epoch": 0.5086427921594574, "grad_norm": 0.041169311851263046, "learning_rate": 5.272013013941289e-06, "loss": 0.0221, "step": 6150 }, { "epoch": 0.5094698536101232, "grad_norm": 0.04349064826965332, "learning_rate": 5.258356518930403e-06, "loss": 0.0222, "step": 6160 }, { "epoch": 0.510296915060789, "grad_norm": 0.051616426557302475, "learning_rate": 5.244698091144624e-06, "loss": 0.0226, "step": 6170 }, { "epoch": 0.5111239765114548, "grad_norm": 0.04476653039455414, "learning_rate": 5.2310378327631695e-06, "loss": 0.0225, "step": 6180 }, { "epoch": 0.5119510379621206, "grad_norm": 0.04472777247428894, "learning_rate": 5.21737584597895e-06, "loss": 0.0231, "step": 6190 }, { "epoch": 0.5127780994127864, "grad_norm": 0.05034750699996948, "learning_rate": 5.203712232997801e-06, "loss": 0.0215, "step": 6200 }, { "epoch": 0.5136051608634522, "grad_norm": 0.04265570640563965, "learning_rate": 5.190047096037734e-06, "loss": 0.0246, "step": 6210 }, { "epoch": 0.5144322223141179, "grad_norm": 0.0414557047188282, "learning_rate": 5.176380537328149e-06, "loss": 0.0224, "step": 6220 }, { "epoch": 0.5152592837647837, "grad_norm": 0.047177575528621674, "learning_rate": 5.1627126591090945e-06, "loss": 0.0248, "step": 6230 }, { "epoch": 0.5160863452154495, "grad_norm": 0.03995126485824585, "learning_rate": 5.149043563630481e-06, "loss": 0.0222, "step": 6240 }, { "epoch": 0.5169134066661153, "grad_norm": 0.038500089198350906, "learning_rate": 5.135373353151333e-06, "loss": 0.0226, "step": 6250 }, { "epoch": 0.5177404681167811, "grad_norm": 0.04477696493268013, "learning_rate": 5.1217021299390055e-06, "loss": 0.0252, "step": 6260 }, { "epoch": 0.5185675295674469, "grad_norm": 0.04252477362751961, "learning_rate": 5.108029996268442e-06, "loss": 0.0208, "step": 6270 }, { "epoch": 0.5193945910181127, "grad_norm": 0.04710827022790909, "learning_rate": 5.09435705442139e-06, "loss": 0.0208, "step": 6280 }, { "epoch": 0.5202216524687784, "grad_norm": 0.04434856027364731, "learning_rate": 5.080683406685644e-06, "loss": 0.0223, "step": 6290 }, { "epoch": 0.5210487139194442, "grad_norm": 0.04365675151348114, "learning_rate": 5.067009155354281e-06, "loss": 0.0219, "step": 6300 }, { "epoch": 0.52187577537011, "grad_norm": 0.04527043551206589, "learning_rate": 5.053334402724891e-06, "loss": 0.0216, "step": 6310 }, { "epoch": 0.5227028368207758, "grad_norm": 0.04446522891521454, "learning_rate": 5.039659251098818e-06, "loss": 0.0325, "step": 6320 }, { "epoch": 0.5235298982714416, "grad_norm": 0.03923187032341957, "learning_rate": 5.025983802780387e-06, "loss": 0.0225, "step": 6330 }, { "epoch": 0.5243569597221074, "grad_norm": 0.0494740828871727, "learning_rate": 5.012308160076143e-06, "loss": 0.0236, "step": 6340 }, { "epoch": 0.5251840211727732, "grad_norm": 0.048305340111255646, "learning_rate": 4.998632425294089e-06, "loss": 0.0219, "step": 6350 }, { "epoch": 0.5260110826234389, "grad_norm": 0.05675299093127251, "learning_rate": 4.984956700742914e-06, "loss": 0.023, "step": 6360 }, { "epoch": 0.5268381440741047, "grad_norm": 0.05156668648123741, "learning_rate": 4.9712810887312285e-06, "loss": 0.021, "step": 6370 }, { "epoch": 0.5276652055247705, "grad_norm": 0.0496770441532135, "learning_rate": 4.957605691566806e-06, "loss": 0.0226, "step": 6380 }, { "epoch": 0.5284922669754363, "grad_norm": 0.044166844338178635, "learning_rate": 4.943930611555807e-06, "loss": 0.0285, "step": 6390 }, { "epoch": 0.5293193284261021, "grad_norm": 0.0438714399933815, "learning_rate": 4.930255951002023e-06, "loss": 0.0235, "step": 6400 }, { "epoch": 0.5301463898767679, "grad_norm": 0.049872253090143204, "learning_rate": 4.91658181220611e-06, "loss": 0.0213, "step": 6410 }, { "epoch": 0.5309734513274337, "grad_norm": 0.05873720347881317, "learning_rate": 4.902908297464815e-06, "loss": 0.0214, "step": 6420 }, { "epoch": 0.5318005127780994, "grad_norm": 0.04734335094690323, "learning_rate": 4.8892355090702195e-06, "loss": 0.0219, "step": 6430 }, { "epoch": 0.5326275742287652, "grad_norm": 0.04171719029545784, "learning_rate": 4.875563549308971e-06, "loss": 0.0217, "step": 6440 }, { "epoch": 0.533454635679431, "grad_norm": 0.04020686820149422, "learning_rate": 4.861892520461514e-06, "loss": 0.0229, "step": 6450 }, { "epoch": 0.5342816971300968, "grad_norm": 0.04311240091919899, "learning_rate": 4.848222524801341e-06, "loss": 0.0232, "step": 6460 }, { "epoch": 0.5351087585807626, "grad_norm": 0.05833645164966583, "learning_rate": 4.834553664594197e-06, "loss": 0.022, "step": 6470 }, { "epoch": 0.5359358200314284, "grad_norm": 0.0407719612121582, "learning_rate": 4.820886042097349e-06, "loss": 0.0233, "step": 6480 }, { "epoch": 0.5367628814820942, "grad_norm": 0.03404640033841133, "learning_rate": 4.807219759558794e-06, "loss": 0.0222, "step": 6490 }, { "epoch": 0.5375899429327599, "grad_norm": 0.04761282354593277, "learning_rate": 4.7935549192165116e-06, "loss": 0.0224, "step": 6500 }, { "epoch": 0.5384170043834257, "grad_norm": 0.04644225910305977, "learning_rate": 4.779891623297688e-06, "loss": 0.0231, "step": 6510 }, { "epoch": 0.5392440658340915, "grad_norm": 0.04503655433654785, "learning_rate": 4.7662299740179544e-06, "loss": 0.0226, "step": 6520 }, { "epoch": 0.5400711272847573, "grad_norm": 0.04182233288884163, "learning_rate": 4.752570073580632e-06, "loss": 0.0207, "step": 6530 }, { "epoch": 0.5408981887354231, "grad_norm": 0.04567556828260422, "learning_rate": 4.738912024175945e-06, "loss": 0.0218, "step": 6540 }, { "epoch": 0.5417252501860889, "grad_norm": 0.04384360834956169, "learning_rate": 4.725255927980283e-06, "loss": 0.0214, "step": 6550 }, { "epoch": 0.5425523116367547, "grad_norm": 0.0403946228325367, "learning_rate": 4.711601887155417e-06, "loss": 0.0264, "step": 6560 }, { "epoch": 0.5433793730874203, "grad_norm": 0.038516897708177567, "learning_rate": 4.6979500038477425e-06, "loss": 0.0221, "step": 6570 }, { "epoch": 0.5442064345380861, "grad_norm": 0.0414847806096077, "learning_rate": 4.684300380187516e-06, "loss": 0.0204, "step": 6580 }, { "epoch": 0.545033495988752, "grad_norm": 0.04257076978683472, "learning_rate": 4.670653118288085e-06, "loss": 0.0211, "step": 6590 }, { "epoch": 0.5458605574394177, "grad_norm": 0.04257350042462349, "learning_rate": 4.657008320245136e-06, "loss": 0.0218, "step": 6600 }, { "epoch": 0.5466876188900835, "grad_norm": 0.04577566310763359, "learning_rate": 4.643366088135918e-06, "loss": 0.0221, "step": 6610 }, { "epoch": 0.5475146803407493, "grad_norm": 0.11741481721401215, "learning_rate": 4.629726524018486e-06, "loss": 0.0222, "step": 6620 }, { "epoch": 0.5483417417914151, "grad_norm": 0.04335429146885872, "learning_rate": 4.616089729930932e-06, "loss": 0.0252, "step": 6630 }, { "epoch": 0.5491688032420808, "grad_norm": 0.04533402994275093, "learning_rate": 4.602455807890634e-06, "loss": 0.0218, "step": 6640 }, { "epoch": 0.5499958646927466, "grad_norm": 0.042610831558704376, "learning_rate": 4.588824859893473e-06, "loss": 0.022, "step": 6650 }, { "epoch": 0.5508229261434124, "grad_norm": 0.03981228917837143, "learning_rate": 4.57519698791309e-06, "loss": 0.0227, "step": 6660 }, { "epoch": 0.5516499875940782, "grad_norm": 0.0377313606441021, "learning_rate": 4.561572293900109e-06, "loss": 0.0226, "step": 6670 }, { "epoch": 0.552477049044744, "grad_norm": 0.08314741402864456, "learning_rate": 4.547950879781382e-06, "loss": 0.0229, "step": 6680 }, { "epoch": 0.5533041104954098, "grad_norm": 0.04389451816678047, "learning_rate": 4.534332847459225e-06, "loss": 0.0212, "step": 6690 }, { "epoch": 0.5541311719460756, "grad_norm": 0.04181825742125511, "learning_rate": 4.520718298810649e-06, "loss": 0.0203, "step": 6700 }, { "epoch": 0.5549582333967413, "grad_norm": 0.042209409177303314, "learning_rate": 4.507107335686611e-06, "loss": 0.0234, "step": 6710 }, { "epoch": 0.5557852948474071, "grad_norm": 0.03632921725511551, "learning_rate": 4.49350005991124e-06, "loss": 0.0213, "step": 6720 }, { "epoch": 0.5566123562980729, "grad_norm": 0.03909287229180336, "learning_rate": 4.47989657328108e-06, "loss": 0.0259, "step": 6730 }, { "epoch": 0.5574394177487387, "grad_norm": 0.04961128160357475, "learning_rate": 4.466296977564331e-06, "loss": 0.0229, "step": 6740 }, { "epoch": 0.5582664791994045, "grad_norm": 0.04496648535132408, "learning_rate": 4.452701374500079e-06, "loss": 0.0207, "step": 6750 }, { "epoch": 0.5590935406500703, "grad_norm": 0.045161984860897064, "learning_rate": 4.43910986579755e-06, "loss": 0.0233, "step": 6760 }, { "epoch": 0.5599206021007361, "grad_norm": 0.047101061791181564, "learning_rate": 4.42552255313533e-06, "loss": 0.0327, "step": 6770 }, { "epoch": 0.5607476635514018, "grad_norm": 0.044754352420568466, "learning_rate": 4.411939538160621e-06, "loss": 0.0221, "step": 6780 }, { "epoch": 0.5615747250020676, "grad_norm": 0.04385341331362724, "learning_rate": 4.398360922488474e-06, "loss": 0.0266, "step": 6790 }, { "epoch": 0.5624017864527334, "grad_norm": 0.05165982246398926, "learning_rate": 4.384786807701024e-06, "loss": 0.0218, "step": 6800 }, { "epoch": 0.5632288479033992, "grad_norm": 0.03928116336464882, "learning_rate": 4.371217295346738e-06, "loss": 0.022, "step": 6810 }, { "epoch": 0.564055909354065, "grad_norm": 0.038528576493263245, "learning_rate": 4.357652486939649e-06, "loss": 0.0218, "step": 6820 }, { "epoch": 0.5648829708047308, "grad_norm": 0.04096828028559685, "learning_rate": 4.3440924839586045e-06, "loss": 0.0221, "step": 6830 }, { "epoch": 0.5657100322553966, "grad_norm": 0.04172588139772415, "learning_rate": 4.3305373878465e-06, "loss": 0.0214, "step": 6840 }, { "epoch": 0.5665370937060623, "grad_norm": 0.04250342398881912, "learning_rate": 4.316987300009521e-06, "loss": 0.0216, "step": 6850 }, { "epoch": 0.5673641551567281, "grad_norm": 0.04389472305774689, "learning_rate": 4.303442321816388e-06, "loss": 0.0225, "step": 6860 }, { "epoch": 0.5681912166073939, "grad_norm": 0.04604129120707512, "learning_rate": 4.2899025545975935e-06, "loss": 0.025, "step": 6870 }, { "epoch": 0.5690182780580597, "grad_norm": 0.04432059824466705, "learning_rate": 4.276368099644649e-06, "loss": 0.0223, "step": 6880 }, { "epoch": 0.5698453395087255, "grad_norm": 0.04254218190908432, "learning_rate": 4.262839058209325e-06, "loss": 0.0254, "step": 6890 }, { "epoch": 0.5706724009593913, "grad_norm": 0.04665306955575943, "learning_rate": 4.249315531502892e-06, "loss": 0.0233, "step": 6900 }, { "epoch": 0.5714994624100571, "grad_norm": 0.06424245983362198, "learning_rate": 4.235797620695365e-06, "loss": 0.0223, "step": 6910 }, { "epoch": 0.5723265238607228, "grad_norm": 0.04606041684746742, "learning_rate": 4.222285426914744e-06, "loss": 0.0226, "step": 6920 }, { "epoch": 0.5731535853113886, "grad_norm": 0.055455636233091354, "learning_rate": 4.208779051246264e-06, "loss": 0.0217, "step": 6930 }, { "epoch": 0.5739806467620544, "grad_norm": 0.05722310021519661, "learning_rate": 4.1952785947316335e-06, "loss": 0.0287, "step": 6940 }, { "epoch": 0.5748077082127202, "grad_norm": 0.047114696353673935, "learning_rate": 4.181784158368274e-06, "loss": 0.0213, "step": 6950 }, { "epoch": 0.575634769663386, "grad_norm": 0.041593633592128754, "learning_rate": 4.1682958431085784e-06, "loss": 0.0226, "step": 6960 }, { "epoch": 0.5764618311140518, "grad_norm": 0.044355396181344986, "learning_rate": 4.1548137498591415e-06, "loss": 0.0214, "step": 6970 }, { "epoch": 0.5772888925647176, "grad_norm": 0.043452925980091095, "learning_rate": 4.141337979480014e-06, "loss": 0.022, "step": 6980 }, { "epoch": 0.5781159540153833, "grad_norm": 0.04600623995065689, "learning_rate": 4.127868632783943e-06, "loss": 0.0219, "step": 6990 }, { "epoch": 0.5789430154660491, "grad_norm": 0.045817919075489044, "learning_rate": 4.114405810535619e-06, "loss": 0.0228, "step": 7000 }, { "epoch": 0.5789430154660491, "eval_loss": 0.022890722379088402, "eval_runtime": 1220.8476, "eval_samples_per_second": 4.914, "eval_steps_per_second": 0.307, "step": 7000 }, { "epoch": 0.5797700769167149, "grad_norm": 0.04360632598400116, "learning_rate": 4.100949613450929e-06, "loss": 0.0232, "step": 7010 }, { "epoch": 0.5805971383673807, "grad_norm": 0.11677900701761246, "learning_rate": 4.087500142196188e-06, "loss": 0.0239, "step": 7020 }, { "epoch": 0.5814241998180465, "grad_norm": 0.03949005529284477, "learning_rate": 4.074057497387402e-06, "loss": 0.0215, "step": 7030 }, { "epoch": 0.5822512612687123, "grad_norm": 0.04393787682056427, "learning_rate": 4.060621779589505e-06, "loss": 0.0224, "step": 7040 }, { "epoch": 0.5830783227193781, "grad_norm": 0.05478642135858536, "learning_rate": 4.047193089315608e-06, "loss": 0.0217, "step": 7050 }, { "epoch": 0.5839053841700438, "grad_norm": 0.05870141461491585, "learning_rate": 4.033771527026252e-06, "loss": 0.0218, "step": 7060 }, { "epoch": 0.5847324456207096, "grad_norm": 0.04158046096563339, "learning_rate": 4.020357193128655e-06, "loss": 0.021, "step": 7070 }, { "epoch": 0.5855595070713754, "grad_norm": 0.05177818983793259, "learning_rate": 4.006950187975951e-06, "loss": 0.0202, "step": 7080 }, { "epoch": 0.5863865685220412, "grad_norm": 0.04415697604417801, "learning_rate": 3.993550611866458e-06, "loss": 0.0222, "step": 7090 }, { "epoch": 0.587213629972707, "grad_norm": 0.06037106364965439, "learning_rate": 3.980158565042908e-06, "loss": 0.022, "step": 7100 }, { "epoch": 0.5880406914233728, "grad_norm": 0.03998905047774315, "learning_rate": 3.96677414769171e-06, "loss": 0.0227, "step": 7110 }, { "epoch": 0.5888677528740386, "grad_norm": 0.035650502890348434, "learning_rate": 3.9533974599422e-06, "loss": 0.0218, "step": 7120 }, { "epoch": 0.5896948143247043, "grad_norm": 0.04198850691318512, "learning_rate": 3.940028601865881e-06, "loss": 0.0229, "step": 7130 }, { "epoch": 0.5905218757753701, "grad_norm": 0.041859325021505356, "learning_rate": 3.9266676734756894e-06, "loss": 0.0217, "step": 7140 }, { "epoch": 0.5913489372260359, "grad_norm": 0.04040461406111717, "learning_rate": 3.913314774725234e-06, "loss": 0.0212, "step": 7150 }, { "epoch": 0.5921759986767017, "grad_norm": 0.04604990780353546, "learning_rate": 3.899970005508053e-06, "loss": 0.022, "step": 7160 }, { "epoch": 0.5930030601273675, "grad_norm": 0.04515118896961212, "learning_rate": 3.8866334656568765e-06, "loss": 0.022, "step": 7170 }, { "epoch": 0.5938301215780333, "grad_norm": 0.04524078220129013, "learning_rate": 3.8733052549428566e-06, "loss": 0.0215, "step": 7180 }, { "epoch": 0.5946571830286991, "grad_norm": 0.04891633987426758, "learning_rate": 3.859985473074847e-06, "loss": 0.0226, "step": 7190 }, { "epoch": 0.5954842444793648, "grad_norm": 0.042289573699235916, "learning_rate": 3.846674219698635e-06, "loss": 0.0213, "step": 7200 }, { "epoch": 0.5963113059300306, "grad_norm": 0.04168631508946419, "learning_rate": 3.833371594396214e-06, "loss": 0.0228, "step": 7210 }, { "epoch": 0.5971383673806964, "grad_norm": 0.04345110431313515, "learning_rate": 3.820077696685027e-06, "loss": 0.0213, "step": 7220 }, { "epoch": 0.5979654288313622, "grad_norm": 0.04696614667773247, "learning_rate": 3.8067926260172234e-06, "loss": 0.0226, "step": 7230 }, { "epoch": 0.598792490282028, "grad_norm": 0.041647132486104965, "learning_rate": 3.793516481778924e-06, "loss": 0.022, "step": 7240 }, { "epoch": 0.5996195517326938, "grad_norm": 0.04166780784726143, "learning_rate": 3.780249363289459e-06, "loss": 0.0253, "step": 7250 }, { "epoch": 0.6004466131833596, "grad_norm": 0.04384204372763634, "learning_rate": 3.766991369800649e-06, "loss": 0.0219, "step": 7260 }, { "epoch": 0.6012736746340253, "grad_norm": 0.03765762969851494, "learning_rate": 3.7537426004960446e-06, "loss": 0.0207, "step": 7270 }, { "epoch": 0.6021007360846911, "grad_norm": 0.04585011675953865, "learning_rate": 3.7405031544901884e-06, "loss": 0.0209, "step": 7280 }, { "epoch": 0.6029277975353569, "grad_norm": 0.04622683674097061, "learning_rate": 3.7272731308278777e-06, "loss": 0.0225, "step": 7290 }, { "epoch": 0.6037548589860227, "grad_norm": 0.061212386935949326, "learning_rate": 3.714052628483417e-06, "loss": 0.0202, "step": 7300 }, { "epoch": 0.6045819204366885, "grad_norm": 0.0689668282866478, "learning_rate": 3.700841746359889e-06, "loss": 0.0222, "step": 7310 }, { "epoch": 0.6054089818873543, "grad_norm": 0.041381001472473145, "learning_rate": 3.6876405832884016e-06, "loss": 0.0214, "step": 7320 }, { "epoch": 0.6062360433380201, "grad_norm": 0.0484529472887516, "learning_rate": 3.6744492380273533e-06, "loss": 0.0219, "step": 7330 }, { "epoch": 0.6070631047886857, "grad_norm": 0.04558572545647621, "learning_rate": 3.661267809261698e-06, "loss": 0.0212, "step": 7340 }, { "epoch": 0.6078901662393515, "grad_norm": 0.03745023533701897, "learning_rate": 3.648096395602202e-06, "loss": 0.0231, "step": 7350 }, { "epoch": 0.6087172276900173, "grad_norm": 0.04229872673749924, "learning_rate": 3.6349350955847094e-06, "loss": 0.0215, "step": 7360 }, { "epoch": 0.6095442891406831, "grad_norm": 0.06078009679913521, "learning_rate": 3.6217840076694066e-06, "loss": 0.0233, "step": 7370 }, { "epoch": 0.610371350591349, "grad_norm": 0.04391666501760483, "learning_rate": 3.6086432302400754e-06, "loss": 0.0218, "step": 7380 }, { "epoch": 0.6111984120420147, "grad_norm": 0.04776912182569504, "learning_rate": 3.5955128616033717e-06, "loss": 0.0238, "step": 7390 }, { "epoch": 0.6120254734926805, "grad_norm": 0.04561059549450874, "learning_rate": 3.582392999988078e-06, "loss": 0.0229, "step": 7400 }, { "epoch": 0.6128525349433462, "grad_norm": 0.043533895164728165, "learning_rate": 3.569283743544375e-06, "loss": 0.022, "step": 7410 }, { "epoch": 0.613679596394012, "grad_norm": 0.03526020050048828, "learning_rate": 3.55618519034311e-06, "loss": 0.0214, "step": 7420 }, { "epoch": 0.6145066578446778, "grad_norm": 0.03638261556625366, "learning_rate": 3.5430974383750503e-06, "loss": 0.0208, "step": 7430 }, { "epoch": 0.6153337192953436, "grad_norm": 0.04244010150432587, "learning_rate": 3.530020585550166e-06, "loss": 0.0224, "step": 7440 }, { "epoch": 0.6161607807460094, "grad_norm": 0.03991573676466942, "learning_rate": 3.5169547296968874e-06, "loss": 0.0218, "step": 7450 }, { "epoch": 0.6169878421966752, "grad_norm": 0.03916684165596962, "learning_rate": 3.5038999685613752e-06, "loss": 0.0212, "step": 7460 }, { "epoch": 0.617814903647341, "grad_norm": 0.037909045815467834, "learning_rate": 3.4908563998067945e-06, "loss": 0.0222, "step": 7470 }, { "epoch": 0.6186419650980067, "grad_norm": 0.048196956515312195, "learning_rate": 3.4778241210125718e-06, "loss": 0.021, "step": 7480 }, { "epoch": 0.6194690265486725, "grad_norm": 0.04458421468734741, "learning_rate": 3.4648032296736805e-06, "loss": 0.0236, "step": 7490 }, { "epoch": 0.6202960879993383, "grad_norm": 0.039592791348695755, "learning_rate": 3.4517938231999026e-06, "loss": 0.0228, "step": 7500 }, { "epoch": 0.6211231494500041, "grad_norm": 0.048372309654951096, "learning_rate": 3.4387959989150977e-06, "loss": 0.0215, "step": 7510 }, { "epoch": 0.6219502109006699, "grad_norm": 0.041564539074897766, "learning_rate": 3.425809854056482e-06, "loss": 0.0219, "step": 7520 }, { "epoch": 0.6227772723513357, "grad_norm": 0.043838802725076675, "learning_rate": 3.4128354857738942e-06, "loss": 0.0208, "step": 7530 }, { "epoch": 0.6236043338020015, "grad_norm": 0.04145396873354912, "learning_rate": 3.3998729911290775e-06, "loss": 0.0212, "step": 7540 }, { "epoch": 0.6244313952526672, "grad_norm": 0.04556450992822647, "learning_rate": 3.386922467094944e-06, "loss": 0.023, "step": 7550 }, { "epoch": 0.625258456703333, "grad_norm": 0.04290676862001419, "learning_rate": 3.3739840105548528e-06, "loss": 0.021, "step": 7560 }, { "epoch": 0.6260855181539988, "grad_norm": 0.042239073663949966, "learning_rate": 3.3610577183018877e-06, "loss": 0.0225, "step": 7570 }, { "epoch": 0.6269125796046646, "grad_norm": 0.04751691594719887, "learning_rate": 3.348143687038128e-06, "loss": 0.0215, "step": 7580 }, { "epoch": 0.6277396410553304, "grad_norm": 0.04237852618098259, "learning_rate": 3.3352420133739304e-06, "loss": 0.0218, "step": 7590 }, { "epoch": 0.6285667025059962, "grad_norm": 0.03740919381380081, "learning_rate": 3.3223527938272076e-06, "loss": 0.0213, "step": 7600 }, { "epoch": 0.629393763956662, "grad_norm": 0.036210279911756516, "learning_rate": 3.3094761248226948e-06, "loss": 0.0255, "step": 7610 }, { "epoch": 0.6302208254073277, "grad_norm": 0.04506264254450798, "learning_rate": 3.296612102691241e-06, "loss": 0.0224, "step": 7620 }, { "epoch": 0.6310478868579935, "grad_norm": 0.04092979431152344, "learning_rate": 3.283760823669082e-06, "loss": 0.0206, "step": 7630 }, { "epoch": 0.6318749483086593, "grad_norm": 0.04056790471076965, "learning_rate": 3.270922383897121e-06, "loss": 0.0213, "step": 7640 }, { "epoch": 0.6327020097593251, "grad_norm": 0.03952750191092491, "learning_rate": 3.258096879420216e-06, "loss": 0.021, "step": 7650 }, { "epoch": 0.6335290712099909, "grad_norm": 0.04810957983136177, "learning_rate": 3.245284406186446e-06, "loss": 0.0226, "step": 7660 }, { "epoch": 0.6343561326606567, "grad_norm": 0.038928598165512085, "learning_rate": 3.232485060046412e-06, "loss": 0.0231, "step": 7670 }, { "epoch": 0.6351831941113225, "grad_norm": 0.03903147578239441, "learning_rate": 3.2196989367525035e-06, "loss": 0.0255, "step": 7680 }, { "epoch": 0.6360102555619882, "grad_norm": 0.04532884061336517, "learning_rate": 3.2069261319581922e-06, "loss": 0.02, "step": 7690 }, { "epoch": 0.636837317012654, "grad_norm": 0.0435151644051075, "learning_rate": 3.19416674121732e-06, "loss": 0.022, "step": 7700 }, { "epoch": 0.6376643784633198, "grad_norm": 0.04332192242145538, "learning_rate": 3.1814208599833634e-06, "loss": 0.0273, "step": 7710 }, { "epoch": 0.6384914399139856, "grad_norm": 0.0369616374373436, "learning_rate": 3.168688583608748e-06, "loss": 0.0214, "step": 7720 }, { "epoch": 0.6393185013646514, "grad_norm": 0.07783352583646774, "learning_rate": 3.1559700073441123e-06, "loss": 0.0213, "step": 7730 }, { "epoch": 0.6401455628153172, "grad_norm": 0.0504750981926918, "learning_rate": 3.1432652263376073e-06, "loss": 0.0202, "step": 7740 }, { "epoch": 0.640972624265983, "grad_norm": 0.0557858943939209, "learning_rate": 3.130574335634181e-06, "loss": 0.0222, "step": 7750 }, { "epoch": 0.6417996857166487, "grad_norm": 0.0438205786049366, "learning_rate": 3.117897430174863e-06, "loss": 0.0211, "step": 7760 }, { "epoch": 0.6426267471673145, "grad_norm": 0.04007831588387489, "learning_rate": 3.1052346047960696e-06, "loss": 0.0223, "step": 7770 }, { "epoch": 0.6434538086179803, "grad_norm": 0.04356636852025986, "learning_rate": 3.0925859542288695e-06, "loss": 0.021, "step": 7780 }, { "epoch": 0.6442808700686461, "grad_norm": 0.044068679213523865, "learning_rate": 3.0799515730982987e-06, "loss": 0.0239, "step": 7790 }, { "epoch": 0.6451079315193119, "grad_norm": 0.058787260204553604, "learning_rate": 3.0673315559226426e-06, "loss": 0.0223, "step": 7800 }, { "epoch": 0.6459349929699777, "grad_norm": 0.04351416230201721, "learning_rate": 3.054725997112724e-06, "loss": 0.0227, "step": 7810 }, { "epoch": 0.6467620544206435, "grad_norm": 0.0457034632563591, "learning_rate": 3.042134990971205e-06, "loss": 0.021, "step": 7820 }, { "epoch": 0.6475891158713092, "grad_norm": 0.04021298885345459, "learning_rate": 3.0295586316918816e-06, "loss": 0.0205, "step": 7830 }, { "epoch": 0.648416177321975, "grad_norm": 0.045050378888845444, "learning_rate": 3.0169970133589714e-06, "loss": 0.0217, "step": 7840 }, { "epoch": 0.6492432387726408, "grad_norm": 0.036717429757118225, "learning_rate": 3.004450229946418e-06, "loss": 0.0218, "step": 7850 }, { "epoch": 0.6500703002233066, "grad_norm": 0.05614123493432999, "learning_rate": 2.99191837531718e-06, "loss": 0.0234, "step": 7860 }, { "epoch": 0.6508973616739724, "grad_norm": 0.037934400141239166, "learning_rate": 2.9794015432225363e-06, "loss": 0.022, "step": 7870 }, { "epoch": 0.6517244231246382, "grad_norm": 0.04340437054634094, "learning_rate": 2.966899827301386e-06, "loss": 0.0286, "step": 7880 }, { "epoch": 0.652551484575304, "grad_norm": 0.04128657281398773, "learning_rate": 2.9544133210795317e-06, "loss": 0.0217, "step": 7890 }, { "epoch": 0.6533785460259697, "grad_norm": 0.04219742491841316, "learning_rate": 2.9419421179690044e-06, "loss": 0.0207, "step": 7900 }, { "epoch": 0.6542056074766355, "grad_norm": 0.04193083569407463, "learning_rate": 2.929486311267343e-06, "loss": 0.0218, "step": 7910 }, { "epoch": 0.6550326689273013, "grad_norm": 0.03400260955095291, "learning_rate": 2.9170459941569094e-06, "loss": 0.0215, "step": 7920 }, { "epoch": 0.6558597303779671, "grad_norm": 0.04170495644211769, "learning_rate": 2.904621259704188e-06, "loss": 0.0219, "step": 7930 }, { "epoch": 0.6566867918286329, "grad_norm": 0.04302512854337692, "learning_rate": 2.892212200859086e-06, "loss": 0.0244, "step": 7940 }, { "epoch": 0.6575138532792987, "grad_norm": 0.043327417224645615, "learning_rate": 2.8798189104542436e-06, "loss": 0.022, "step": 7950 }, { "epoch": 0.6583409147299645, "grad_norm": 0.05167660862207413, "learning_rate": 2.8674414812043317e-06, "loss": 0.0205, "step": 7960 }, { "epoch": 0.6591679761806302, "grad_norm": 0.061974212527275085, "learning_rate": 2.855080005705367e-06, "loss": 0.0243, "step": 7970 }, { "epoch": 0.659995037631296, "grad_norm": 0.04321138933300972, "learning_rate": 2.842734576434021e-06, "loss": 0.0212, "step": 7980 }, { "epoch": 0.6608220990819618, "grad_norm": 0.05327922850847244, "learning_rate": 2.8304052857469107e-06, "loss": 0.021, "step": 7990 }, { "epoch": 0.6616491605326276, "grad_norm": 0.04471385106444359, "learning_rate": 2.8180922258799286e-06, "loss": 0.0214, "step": 8000 }, { "epoch": 0.6616491605326276, "eval_loss": 0.022467145696282387, "eval_runtime": 1221.4961, "eval_samples_per_second": 4.911, "eval_steps_per_second": 0.307, "step": 8000 }, { "epoch": 0.6624762219832934, "grad_norm": 0.045148443430662155, "learning_rate": 2.8057954889475415e-06, "loss": 0.0216, "step": 8010 }, { "epoch": 0.6633032834339592, "grad_norm": 0.04102947190403938, "learning_rate": 2.7935151669421033e-06, "loss": 0.0208, "step": 8020 }, { "epoch": 0.664130344884625, "grad_norm": 0.0464673787355423, "learning_rate": 2.7812513517331695e-06, "loss": 0.0206, "step": 8030 }, { "epoch": 0.6649574063352907, "grad_norm": 0.04477581009268761, "learning_rate": 2.7690041350667995e-06, "loss": 0.0215, "step": 8040 }, { "epoch": 0.6657844677859565, "grad_norm": 0.043795693665742874, "learning_rate": 2.7567736085648935e-06, "loss": 0.0219, "step": 8050 }, { "epoch": 0.6666115292366223, "grad_norm": 0.04666496440768242, "learning_rate": 2.7445598637244746e-06, "loss": 0.021, "step": 8060 }, { "epoch": 0.6674385906872881, "grad_norm": 0.039747051894664764, "learning_rate": 2.7323629919170334e-06, "loss": 0.0219, "step": 8070 }, { "epoch": 0.6682656521379539, "grad_norm": 0.037099067121744156, "learning_rate": 2.72018308438783e-06, "loss": 0.02, "step": 8080 }, { "epoch": 0.6690927135886197, "grad_norm": 0.0401119664311409, "learning_rate": 2.7080202322552126e-06, "loss": 0.0214, "step": 8090 }, { "epoch": 0.6699197750392855, "grad_norm": 0.0409838892519474, "learning_rate": 2.6958745265099397e-06, "loss": 0.0205, "step": 8100 }, { "epoch": 0.6707468364899511, "grad_norm": 0.035290639847517014, "learning_rate": 2.683746058014489e-06, "loss": 0.0209, "step": 8110 }, { "epoch": 0.671573897940617, "grad_norm": 0.03809922933578491, "learning_rate": 2.6716349175023997e-06, "loss": 0.022, "step": 8120 }, { "epoch": 0.6724009593912827, "grad_norm": 0.044197600334882736, "learning_rate": 2.659541195577571e-06, "loss": 0.02, "step": 8130 }, { "epoch": 0.6732280208419485, "grad_norm": 0.041063982993364334, "learning_rate": 2.6474649827135913e-06, "loss": 0.0203, "step": 8140 }, { "epoch": 0.6740550822926143, "grad_norm": 0.039071984589099884, "learning_rate": 2.635406369253066e-06, "loss": 0.0216, "step": 8150 }, { "epoch": 0.6748821437432801, "grad_norm": 0.038477640599012375, "learning_rate": 2.6233654454069397e-06, "loss": 0.0217, "step": 8160 }, { "epoch": 0.675709205193946, "grad_norm": 0.05265484377741814, "learning_rate": 2.6113423012538184e-06, "loss": 0.0223, "step": 8170 }, { "epoch": 0.6765362666446118, "grad_norm": 0.04026918113231659, "learning_rate": 2.5993370267392998e-06, "loss": 0.0212, "step": 8180 }, { "epoch": 0.6773633280952774, "grad_norm": 0.040949251502752304, "learning_rate": 2.5873497116752955e-06, "loss": 0.0218, "step": 8190 }, { "epoch": 0.6781903895459432, "grad_norm": 0.04553502798080444, "learning_rate": 2.575380445739363e-06, "loss": 0.0224, "step": 8200 }, { "epoch": 0.679017450996609, "grad_norm": 0.040991537272930145, "learning_rate": 2.5634293184740337e-06, "loss": 0.0207, "step": 8210 }, { "epoch": 0.6798445124472748, "grad_norm": 0.04071825370192528, "learning_rate": 2.551496419286143e-06, "loss": 0.0215, "step": 8220 }, { "epoch": 0.6806715738979406, "grad_norm": 0.04148703068494797, "learning_rate": 2.5395818374461626e-06, "loss": 0.0215, "step": 8230 }, { "epoch": 0.6814986353486064, "grad_norm": 0.04831210896372795, "learning_rate": 2.5276856620875267e-06, "loss": 0.0204, "step": 8240 }, { "epoch": 0.6823256967992722, "grad_norm": 0.05425499007105827, "learning_rate": 2.5158079822059726e-06, "loss": 0.0214, "step": 8250 }, { "epoch": 0.6831527582499379, "grad_norm": 0.042809970676898956, "learning_rate": 2.503948886658879e-06, "loss": 0.0204, "step": 8260 }, { "epoch": 0.6839798197006037, "grad_norm": 0.039092812687158585, "learning_rate": 2.492108464164582e-06, "loss": 0.0209, "step": 8270 }, { "epoch": 0.6848068811512695, "grad_norm": 0.0440199077129364, "learning_rate": 2.4802868033017325e-06, "loss": 0.0205, "step": 8280 }, { "epoch": 0.6856339426019353, "grad_norm": 0.04389241337776184, "learning_rate": 2.4684839925086222e-06, "loss": 0.0218, "step": 8290 }, { "epoch": 0.6864610040526011, "grad_norm": 0.03971746936440468, "learning_rate": 2.4567001200825257e-06, "loss": 0.0211, "step": 8300 }, { "epoch": 0.6872880655032669, "grad_norm": 0.03864897042512894, "learning_rate": 2.44493527417904e-06, "loss": 0.0224, "step": 8310 }, { "epoch": 0.6881151269539327, "grad_norm": 0.04412490129470825, "learning_rate": 2.4331895428114167e-06, "loss": 0.0206, "step": 8320 }, { "epoch": 0.6889421884045984, "grad_norm": 0.045335933566093445, "learning_rate": 2.4214630138499235e-06, "loss": 0.0203, "step": 8330 }, { "epoch": 0.6897692498552642, "grad_norm": 0.040548257529735565, "learning_rate": 2.4097557750211627e-06, "loss": 0.0208, "step": 8340 }, { "epoch": 0.69059631130593, "grad_norm": 0.043131329119205475, "learning_rate": 2.3980679139074314e-06, "loss": 0.021, "step": 8350 }, { "epoch": 0.6914233727565958, "grad_norm": 0.039993565529584885, "learning_rate": 2.3863995179460612e-06, "loss": 0.0222, "step": 8360 }, { "epoch": 0.6922504342072616, "grad_norm": 0.037337690591812134, "learning_rate": 2.374750674428764e-06, "loss": 0.0218, "step": 8370 }, { "epoch": 0.6930774956579274, "grad_norm": 0.042838480323553085, "learning_rate": 2.3631214705009806e-06, "loss": 0.0208, "step": 8380 }, { "epoch": 0.6939045571085932, "grad_norm": 0.036257416009902954, "learning_rate": 2.3515119931612196e-06, "loss": 0.02, "step": 8390 }, { "epoch": 0.6947316185592589, "grad_norm": 0.042761024087667465, "learning_rate": 2.339922329260426e-06, "loss": 0.0223, "step": 8400 }, { "epoch": 0.6955586800099247, "grad_norm": 0.04689721390604973, "learning_rate": 2.328352565501314e-06, "loss": 0.0235, "step": 8410 }, { "epoch": 0.6963857414605905, "grad_norm": 0.04648851230740547, "learning_rate": 2.316802788437719e-06, "loss": 0.0217, "step": 8420 }, { "epoch": 0.6972128029112563, "grad_norm": 0.04260076582431793, "learning_rate": 2.3052730844739636e-06, "loss": 0.0216, "step": 8430 }, { "epoch": 0.6980398643619221, "grad_norm": 0.042848605662584305, "learning_rate": 2.293763539864199e-06, "loss": 0.0214, "step": 8440 }, { "epoch": 0.6988669258125879, "grad_norm": 0.039934489876031876, "learning_rate": 2.2822742407117625e-06, "loss": 0.0202, "step": 8450 }, { "epoch": 0.6996939872632537, "grad_norm": 0.03947708010673523, "learning_rate": 2.270805272968537e-06, "loss": 0.0207, "step": 8460 }, { "epoch": 0.7005210487139194, "grad_norm": 0.03535833582282066, "learning_rate": 2.2593567224343037e-06, "loss": 0.0225, "step": 8470 }, { "epoch": 0.7013481101645852, "grad_norm": 0.04926292970776558, "learning_rate": 2.2479286747561037e-06, "loss": 0.0221, "step": 8480 }, { "epoch": 0.702175171615251, "grad_norm": 0.03978796303272247, "learning_rate": 2.2365212154275908e-06, "loss": 0.0226, "step": 8490 }, { "epoch": 0.7030022330659168, "grad_norm": 0.04777059331536293, "learning_rate": 2.2251344297883996e-06, "loss": 0.0204, "step": 8500 }, { "epoch": 0.7038292945165826, "grad_norm": 0.04967991262674332, "learning_rate": 2.2137684030235095e-06, "loss": 0.0203, "step": 8510 }, { "epoch": 0.7046563559672484, "grad_norm": 0.04070328548550606, "learning_rate": 2.202423220162591e-06, "loss": 0.0214, "step": 8520 }, { "epoch": 0.7054834174179142, "grad_norm": 0.036942508071660995, "learning_rate": 2.191098966079389e-06, "loss": 0.0205, "step": 8530 }, { "epoch": 0.7063104788685799, "grad_norm": 0.042975060641765594, "learning_rate": 2.1797957254910757e-06, "loss": 0.0218, "step": 8540 }, { "epoch": 0.7071375403192457, "grad_norm": 0.044698718935251236, "learning_rate": 2.168513582957622e-06, "loss": 0.0225, "step": 8550 }, { "epoch": 0.7079646017699115, "grad_norm": 0.0593951940536499, "learning_rate": 2.1572526228811645e-06, "loss": 0.0205, "step": 8560 }, { "epoch": 0.7087916632205773, "grad_norm": 0.042812854051589966, "learning_rate": 2.1460129295053666e-06, "loss": 0.0215, "step": 8570 }, { "epoch": 0.7096187246712431, "grad_norm": 0.05073460936546326, "learning_rate": 2.134794586914806e-06, "loss": 0.0234, "step": 8580 }, { "epoch": 0.7104457861219089, "grad_norm": 0.03609664365649223, "learning_rate": 2.123597679034324e-06, "loss": 0.02, "step": 8590 }, { "epoch": 0.7112728475725747, "grad_norm": 0.040147822350263596, "learning_rate": 2.112422289628412e-06, "loss": 0.0205, "step": 8600 }, { "epoch": 0.7120999090232404, "grad_norm": 0.039646077901124954, "learning_rate": 2.101268502300582e-06, "loss": 0.0213, "step": 8610 }, { "epoch": 0.7129269704739062, "grad_norm": 0.04966466873884201, "learning_rate": 2.090136400492739e-06, "loss": 0.0244, "step": 8620 }, { "epoch": 0.713754031924572, "grad_norm": 0.04764994978904724, "learning_rate": 2.0790260674845563e-06, "loss": 0.0202, "step": 8630 }, { "epoch": 0.7145810933752378, "grad_norm": 0.04711426794528961, "learning_rate": 2.0679375863928576e-06, "loss": 0.0214, "step": 8640 }, { "epoch": 0.7154081548259036, "grad_norm": 0.04078923165798187, "learning_rate": 2.056871040170988e-06, "loss": 0.0199, "step": 8650 }, { "epoch": 0.7162352162765694, "grad_norm": 0.039174020290374756, "learning_rate": 2.0458265116082002e-06, "loss": 0.021, "step": 8660 }, { "epoch": 0.7170622777272352, "grad_norm": 0.04337885230779648, "learning_rate": 2.034804083329027e-06, "loss": 0.0208, "step": 8670 }, { "epoch": 0.7178893391779009, "grad_norm": 0.04172796383500099, "learning_rate": 2.0238038377926715e-06, "loss": 0.0218, "step": 8680 }, { "epoch": 0.7187164006285667, "grad_norm": 0.043501630425453186, "learning_rate": 2.012825857292392e-06, "loss": 0.0232, "step": 8690 }, { "epoch": 0.7195434620792325, "grad_norm": 0.04335128515958786, "learning_rate": 2.00187022395487e-06, "loss": 0.0215, "step": 8700 }, { "epoch": 0.7203705235298983, "grad_norm": 0.04587217792868614, "learning_rate": 1.9909370197396148e-06, "loss": 0.0246, "step": 8710 }, { "epoch": 0.7211975849805641, "grad_norm": 0.037936147302389145, "learning_rate": 1.9800263264383405e-06, "loss": 0.0206, "step": 8720 }, { "epoch": 0.7220246464312299, "grad_norm": 0.0373714417219162, "learning_rate": 1.969138225674358e-06, "loss": 0.0213, "step": 8730 }, { "epoch": 0.7228517078818957, "grad_norm": 0.04090265929698944, "learning_rate": 1.9582727989019607e-06, "loss": 0.021, "step": 8740 }, { "epoch": 0.7236787693325614, "grad_norm": 0.033642202615737915, "learning_rate": 1.9474301274058125e-06, "loss": 0.0198, "step": 8750 }, { "epoch": 0.7245058307832272, "grad_norm": 0.0450110137462616, "learning_rate": 1.9366102923003578e-06, "loss": 0.0202, "step": 8760 }, { "epoch": 0.725332892233893, "grad_norm": 0.03714507818222046, "learning_rate": 1.9258133745291845e-06, "loss": 0.0211, "step": 8770 }, { "epoch": 0.7261599536845588, "grad_norm": 0.04287153109908104, "learning_rate": 1.9150394548644463e-06, "loss": 0.02, "step": 8780 }, { "epoch": 0.7269870151352246, "grad_norm": 0.041864458471536636, "learning_rate": 1.9042886139062427e-06, "loss": 0.0218, "step": 8790 }, { "epoch": 0.7278140765858904, "grad_norm": 0.11404255032539368, "learning_rate": 1.893560932082023e-06, "loss": 0.0224, "step": 8800 }, { "epoch": 0.7286411380365562, "grad_norm": 0.04448498412966728, "learning_rate": 1.8828564896459795e-06, "loss": 0.0217, "step": 8810 }, { "epoch": 0.7294681994872219, "grad_norm": 0.038884907960891724, "learning_rate": 1.872175366678451e-06, "loss": 0.0206, "step": 8820 }, { "epoch": 0.7302952609378877, "grad_norm": 0.041435256600379944, "learning_rate": 1.8615176430853231e-06, "loss": 0.0211, "step": 8830 }, { "epoch": 0.7311223223885535, "grad_norm": 0.04282752797007561, "learning_rate": 1.8508833985974306e-06, "loss": 0.0209, "step": 8840 }, { "epoch": 0.7319493838392193, "grad_norm": 0.043493740260601044, "learning_rate": 1.8402727127699537e-06, "loss": 0.02, "step": 8850 }, { "epoch": 0.7327764452898851, "grad_norm": 0.036238010972738266, "learning_rate": 1.8296856649818418e-06, "loss": 0.0211, "step": 8860 }, { "epoch": 0.7336035067405509, "grad_norm": 0.04608851671218872, "learning_rate": 1.8191223344351932e-06, "loss": 0.0222, "step": 8870 }, { "epoch": 0.7344305681912167, "grad_norm": 0.04390549659729004, "learning_rate": 1.8085828001546869e-06, "loss": 0.0207, "step": 8880 }, { "epoch": 0.7352576296418823, "grad_norm": 0.04080136865377426, "learning_rate": 1.798067140986976e-06, "loss": 0.0215, "step": 8890 }, { "epoch": 0.7360846910925481, "grad_norm": 0.05361476168036461, "learning_rate": 1.7875754356001052e-06, "loss": 0.0215, "step": 8900 }, { "epoch": 0.736911752543214, "grad_norm": 0.041193023324012756, "learning_rate": 1.7771077624829213e-06, "loss": 0.0226, "step": 8910 }, { "epoch": 0.7377388139938797, "grad_norm": 0.05157098174095154, "learning_rate": 1.7666641999444777e-06, "loss": 0.0213, "step": 8920 }, { "epoch": 0.7385658754445455, "grad_norm": 0.038595810532569885, "learning_rate": 1.7562448261134658e-06, "loss": 0.0204, "step": 8930 }, { "epoch": 0.7393929368952114, "grad_norm": 0.12353651970624924, "learning_rate": 1.7458497189376145e-06, "loss": 0.0208, "step": 8940 }, { "epoch": 0.7402199983458772, "grad_norm": 0.04080955684185028, "learning_rate": 1.735478956183112e-06, "loss": 0.0203, "step": 8950 }, { "epoch": 0.7410470597965428, "grad_norm": 0.0376775749027729, "learning_rate": 1.725132615434027e-06, "loss": 0.0214, "step": 8960 }, { "epoch": 0.7418741212472086, "grad_norm": 0.04121479019522667, "learning_rate": 1.7148107740917269e-06, "loss": 0.0222, "step": 8970 }, { "epoch": 0.7427011826978744, "grad_norm": 0.03592400997877121, "learning_rate": 1.7045135093742976e-06, "loss": 0.0207, "step": 8980 }, { "epoch": 0.7435282441485402, "grad_norm": 0.03217403218150139, "learning_rate": 1.6942408983159648e-06, "loss": 0.0208, "step": 8990 }, { "epoch": 0.744355305599206, "grad_norm": 0.038189876824617386, "learning_rate": 1.6839930177665208e-06, "loss": 0.0232, "step": 9000 }, { "epoch": 0.744355305599206, "eval_loss": 0.02212439477443695, "eval_runtime": 1220.8964, "eval_samples_per_second": 4.914, "eval_steps_per_second": 0.307, "step": 9000 }, { "epoch": 0.7451823670498718, "grad_norm": 0.04287660866975784, "learning_rate": 1.6737699443907486e-06, "loss": 0.0203, "step": 9010 }, { "epoch": 0.7460094285005376, "grad_norm": 0.03629004582762718, "learning_rate": 1.663571754667847e-06, "loss": 0.0209, "step": 9020 }, { "epoch": 0.7468364899512033, "grad_norm": 0.042142104357481, "learning_rate": 1.6533985248908551e-06, "loss": 0.0203, "step": 9030 }, { "epoch": 0.7476635514018691, "grad_norm": 0.042162686586380005, "learning_rate": 1.6432503311660963e-06, "loss": 0.0195, "step": 9040 }, { "epoch": 0.7484906128525349, "grad_norm": 0.042325787246227264, "learning_rate": 1.6331272494125865e-06, "loss": 0.025, "step": 9050 }, { "epoch": 0.7493176743032007, "grad_norm": 0.03958788514137268, "learning_rate": 1.6230293553614851e-06, "loss": 0.0208, "step": 9060 }, { "epoch": 0.7501447357538665, "grad_norm": 0.04631664603948593, "learning_rate": 1.612956724555519e-06, "loss": 0.0222, "step": 9070 }, { "epoch": 0.7509717972045323, "grad_norm": 0.03541667386889458, "learning_rate": 1.6029094323484207e-06, "loss": 0.0188, "step": 9080 }, { "epoch": 0.7517988586551981, "grad_norm": 0.04303780198097229, "learning_rate": 1.5928875539043649e-06, "loss": 0.0218, "step": 9090 }, { "epoch": 0.7526259201058638, "grad_norm": 0.045209407806396484, "learning_rate": 1.5828911641973981e-06, "loss": 0.0216, "step": 9100 }, { "epoch": 0.7534529815565296, "grad_norm": 0.0393962636590004, "learning_rate": 1.5729203380108955e-06, "loss": 0.0201, "step": 9110 }, { "epoch": 0.7542800430071954, "grad_norm": 0.04141068086028099, "learning_rate": 1.5629751499369839e-06, "loss": 0.0221, "step": 9120 }, { "epoch": 0.7551071044578612, "grad_norm": 0.04183319956064224, "learning_rate": 1.553055674375989e-06, "loss": 0.0207, "step": 9130 }, { "epoch": 0.755934165908527, "grad_norm": 0.04659945145249367, "learning_rate": 1.5431619855358842e-06, "loss": 0.0228, "step": 9140 }, { "epoch": 0.7567612273591928, "grad_norm": 0.04036922752857208, "learning_rate": 1.5332941574317294e-06, "loss": 0.0218, "step": 9150 }, { "epoch": 0.7575882888098586, "grad_norm": 0.04024342820048332, "learning_rate": 1.5234522638851213e-06, "loss": 0.0213, "step": 9160 }, { "epoch": 0.7584153502605243, "grad_norm": 0.04086223989725113, "learning_rate": 1.5136363785236362e-06, "loss": 0.0206, "step": 9170 }, { "epoch": 0.7592424117111901, "grad_norm": 0.045924026519060135, "learning_rate": 1.503846574780285e-06, "loss": 0.0212, "step": 9180 }, { "epoch": 0.7600694731618559, "grad_norm": 0.0389275960624218, "learning_rate": 1.4940829258929606e-06, "loss": 0.0217, "step": 9190 }, { "epoch": 0.7608965346125217, "grad_norm": 0.042410727590322495, "learning_rate": 1.4843455049038869e-06, "loss": 0.0206, "step": 9200 }, { "epoch": 0.7617235960631875, "grad_norm": 0.04143417999148369, "learning_rate": 1.4746343846590783e-06, "loss": 0.0218, "step": 9210 }, { "epoch": 0.7625506575138533, "grad_norm": 0.04118340089917183, "learning_rate": 1.4649496378077983e-06, "loss": 0.0203, "step": 9220 }, { "epoch": 0.7633777189645191, "grad_norm": 0.04239552468061447, "learning_rate": 1.455291336801999e-06, "loss": 0.0222, "step": 9230 }, { "epoch": 0.7642047804151848, "grad_norm": 0.041403092443943024, "learning_rate": 1.4456595538957974e-06, "loss": 0.0211, "step": 9240 }, { "epoch": 0.7650318418658506, "grad_norm": 0.12348439544439316, "learning_rate": 1.436054361144925e-06, "loss": 0.0215, "step": 9250 }, { "epoch": 0.7658589033165164, "grad_norm": 0.04165393486618996, "learning_rate": 1.4264758304061938e-06, "loss": 0.0202, "step": 9260 }, { "epoch": 0.7666859647671822, "grad_norm": 0.044469356536865234, "learning_rate": 1.4169240333369543e-06, "loss": 0.0207, "step": 9270 }, { "epoch": 0.767513026217848, "grad_norm": 0.04392145201563835, "learning_rate": 1.4073990413945582e-06, "loss": 0.0208, "step": 9280 }, { "epoch": 0.7683400876685138, "grad_norm": 0.043122172355651855, "learning_rate": 1.3979009258358367e-06, "loss": 0.021, "step": 9290 }, { "epoch": 0.7691671491191796, "grad_norm": 0.0898752361536026, "learning_rate": 1.3884297577165462e-06, "loss": 0.0212, "step": 9300 }, { "epoch": 0.7699942105698453, "grad_norm": 0.04254557564854622, "learning_rate": 1.378985607890856e-06, "loss": 0.0219, "step": 9310 }, { "epoch": 0.7708212720205111, "grad_norm": 0.05117588862776756, "learning_rate": 1.3695685470108078e-06, "loss": 0.0219, "step": 9320 }, { "epoch": 0.7716483334711769, "grad_norm": 0.04056469351053238, "learning_rate": 1.3601786455257905e-06, "loss": 0.0207, "step": 9330 }, { "epoch": 0.7724753949218427, "grad_norm": 0.05269391089677811, "learning_rate": 1.3508159736820132e-06, "loss": 0.0217, "step": 9340 }, { "epoch": 0.7733024563725085, "grad_norm": 0.036445554345846176, "learning_rate": 1.341480601521974e-06, "loss": 0.0211, "step": 9350 }, { "epoch": 0.7741295178231743, "grad_norm": 0.04814046248793602, "learning_rate": 1.33217259888395e-06, "loss": 0.0212, "step": 9360 }, { "epoch": 0.7749565792738401, "grad_norm": 0.038837458938360214, "learning_rate": 1.3228920354014607e-06, "loss": 0.0209, "step": 9370 }, { "epoch": 0.7757836407245058, "grad_norm": 0.0410507507622242, "learning_rate": 1.31363898050275e-06, "loss": 0.0205, "step": 9380 }, { "epoch": 0.7766107021751716, "grad_norm": 0.03645321726799011, "learning_rate": 1.3044135034102711e-06, "loss": 0.0207, "step": 9390 }, { "epoch": 0.7774377636258374, "grad_norm": 0.040732916444540024, "learning_rate": 1.2952156731401716e-06, "loss": 0.0202, "step": 9400 }, { "epoch": 0.7782648250765032, "grad_norm": 0.043882377445697784, "learning_rate": 1.2860455585017634e-06, "loss": 0.0204, "step": 9410 }, { "epoch": 0.779091886527169, "grad_norm": 0.038812581449747086, "learning_rate": 1.2769032280970222e-06, "loss": 0.0209, "step": 9420 }, { "epoch": 0.7799189479778348, "grad_norm": 0.049354683607816696, "learning_rate": 1.2677887503200681e-06, "loss": 0.0197, "step": 9430 }, { "epoch": 0.7807460094285006, "grad_norm": 0.030933791771531105, "learning_rate": 1.258702193356654e-06, "loss": 0.0223, "step": 9440 }, { "epoch": 0.7815730708791663, "grad_norm": 0.03828246891498566, "learning_rate": 1.2496436251836563e-06, "loss": 0.0231, "step": 9450 }, { "epoch": 0.7824001323298321, "grad_norm": 0.04567508026957512, "learning_rate": 1.2406131135685656e-06, "loss": 0.0217, "step": 9460 }, { "epoch": 0.7832271937804979, "grad_norm": 0.04124726355075836, "learning_rate": 1.231610726068983e-06, "loss": 0.0207, "step": 9470 }, { "epoch": 0.7840542552311637, "grad_norm": 0.03909214958548546, "learning_rate": 1.2226365300321063e-06, "loss": 0.021, "step": 9480 }, { "epoch": 0.7848813166818295, "grad_norm": 0.03634734824299812, "learning_rate": 1.2136905925942367e-06, "loss": 0.0214, "step": 9490 }, { "epoch": 0.7857083781324953, "grad_norm": 0.04165159910917282, "learning_rate": 1.2047729806802739e-06, "loss": 0.0205, "step": 9500 }, { "epoch": 0.7865354395831611, "grad_norm": 0.042302560061216354, "learning_rate": 1.195883761003206e-06, "loss": 0.0216, "step": 9510 }, { "epoch": 0.7873625010338268, "grad_norm": 0.05284997075796127, "learning_rate": 1.187023000063623e-06, "loss": 0.0205, "step": 9520 }, { "epoch": 0.7881895624844926, "grad_norm": 0.037878263741731644, "learning_rate": 1.1781907641492129e-06, "loss": 0.0224, "step": 9530 }, { "epoch": 0.7890166239351584, "grad_norm": 0.04263276234269142, "learning_rate": 1.169387119334266e-06, "loss": 0.0222, "step": 9540 }, { "epoch": 0.7898436853858242, "grad_norm": 0.042502984404563904, "learning_rate": 1.1606121314791846e-06, "loss": 0.0216, "step": 9550 }, { "epoch": 0.79067074683649, "grad_norm": 0.041224028915166855, "learning_rate": 1.1518658662299798e-06, "loss": 0.0232, "step": 9560 }, { "epoch": 0.7914978082871558, "grad_norm": 0.04057363048195839, "learning_rate": 1.1431483890177991e-06, "loss": 0.0209, "step": 9570 }, { "epoch": 0.7923248697378216, "grad_norm": 0.038158901035785675, "learning_rate": 1.1344597650584139e-06, "loss": 0.0212, "step": 9580 }, { "epoch": 0.7931519311884873, "grad_norm": 0.038800351321697235, "learning_rate": 1.1258000593517516e-06, "loss": 0.0201, "step": 9590 }, { "epoch": 0.7939789926391531, "grad_norm": 0.044678255915641785, "learning_rate": 1.1171693366813967e-06, "loss": 0.0209, "step": 9600 }, { "epoch": 0.7948060540898189, "grad_norm": 0.038671303540468216, "learning_rate": 1.1085676616141133e-06, "loss": 0.021, "step": 9610 }, { "epoch": 0.7956331155404847, "grad_norm": 0.03674842417240143, "learning_rate": 1.0999950984993584e-06, "loss": 0.0221, "step": 9620 }, { "epoch": 0.7964601769911505, "grad_norm": 0.04083636775612831, "learning_rate": 1.0914517114687973e-06, "loss": 0.0285, "step": 9630 }, { "epoch": 0.7972872384418163, "grad_norm": 0.03498871251940727, "learning_rate": 1.0829375644358352e-06, "loss": 0.0197, "step": 9640 }, { "epoch": 0.7981142998924821, "grad_norm": 0.04270506650209427, "learning_rate": 1.074452721095129e-06, "loss": 0.0199, "step": 9650 }, { "epoch": 0.7989413613431477, "grad_norm": 0.04146299883723259, "learning_rate": 1.065997244922109e-06, "loss": 0.0209, "step": 9660 }, { "epoch": 0.7997684227938135, "grad_norm": 0.0386267714202404, "learning_rate": 1.057571199172514e-06, "loss": 0.0204, "step": 9670 }, { "epoch": 0.8005954842444793, "grad_norm": 0.03857827186584473, "learning_rate": 1.0491746468819114e-06, "loss": 0.0216, "step": 9680 }, { "epoch": 0.8014225456951451, "grad_norm": 0.03981781378388405, "learning_rate": 1.040807650865226e-06, "loss": 0.0207, "step": 9690 }, { "epoch": 0.802249607145811, "grad_norm": 0.03532428294420242, "learning_rate": 1.0324702737162717e-06, "loss": 0.0207, "step": 9700 }, { "epoch": 0.8030766685964768, "grad_norm": 0.059968218207359314, "learning_rate": 1.0241625778072823e-06, "loss": 0.0216, "step": 9710 }, { "epoch": 0.8039037300471426, "grad_norm": 0.042843446135520935, "learning_rate": 1.0158846252884464e-06, "loss": 0.0196, "step": 9720 }, { "epoch": 0.8047307914978082, "grad_norm": 0.04270855337381363, "learning_rate": 1.007636478087437e-06, "loss": 0.0247, "step": 9730 }, { "epoch": 0.805557852948474, "grad_norm": 0.037198904901742935, "learning_rate": 9.994181979089563e-07, "loss": 0.0249, "step": 9740 }, { "epoch": 0.8063849143991398, "grad_norm": 0.04327964037656784, "learning_rate": 9.912298462342724e-07, "loss": 0.0214, "step": 9750 }, { "epoch": 0.8072119758498056, "grad_norm": 0.0518951341509819, "learning_rate": 9.8307148432075e-07, "loss": 0.0313, "step": 9760 }, { "epoch": 0.8080390373004714, "grad_norm": 0.04297772794961929, "learning_rate": 9.749431732014047e-07, "loss": 0.0201, "step": 9770 }, { "epoch": 0.8088660987511372, "grad_norm": 0.04605748876929283, "learning_rate": 9.668449736844392e-07, "loss": 0.0229, "step": 9780 }, { "epoch": 0.809693160201803, "grad_norm": 0.09627640247344971, "learning_rate": 9.587769463527908e-07, "loss": 0.0231, "step": 9790 }, { "epoch": 0.8105202216524687, "grad_norm": 0.05746271833777428, "learning_rate": 9.507391515636783e-07, "loss": 0.0201, "step": 9800 }, { "epoch": 0.8113472831031345, "grad_norm": 0.038759179413318634, "learning_rate": 9.427316494481447e-07, "loss": 0.0201, "step": 9810 }, { "epoch": 0.8121743445538003, "grad_norm": 0.04053572565317154, "learning_rate": 9.347544999106195e-07, "loss": 0.0213, "step": 9820 }, { "epoch": 0.8130014060044661, "grad_norm": 0.03577113896608353, "learning_rate": 9.26807762628461e-07, "loss": 0.0212, "step": 9830 }, { "epoch": 0.8138284674551319, "grad_norm": 0.04762836545705795, "learning_rate": 9.188914970515089e-07, "loss": 0.0229, "step": 9840 }, { "epoch": 0.8146555289057977, "grad_norm": 0.03575866296887398, "learning_rate": 9.110057624016461e-07, "loss": 0.0213, "step": 9850 }, { "epoch": 0.8154825903564635, "grad_norm": 0.04291502758860588, "learning_rate": 9.03150617672352e-07, "loss": 0.0211, "step": 9860 }, { "epoch": 0.8163096518071292, "grad_norm": 0.05030713975429535, "learning_rate": 8.953261216282616e-07, "loss": 0.0195, "step": 9870 }, { "epoch": 0.817136713257795, "grad_norm": 0.03975163400173187, "learning_rate": 8.875323328047258e-07, "loss": 0.0199, "step": 9880 }, { "epoch": 0.8179637747084608, "grad_norm": 0.049601927399635315, "learning_rate": 8.797693095073733e-07, "loss": 0.0213, "step": 9890 }, { "epoch": 0.8187908361591266, "grad_norm": 0.04565184563398361, "learning_rate": 8.72037109811677e-07, "loss": 0.0213, "step": 9900 }, { "epoch": 0.8196178976097924, "grad_norm": 0.04753655195236206, "learning_rate": 8.643357915625122e-07, "loss": 0.0217, "step": 9910 }, { "epoch": 0.8204449590604582, "grad_norm": 0.0501379668712616, "learning_rate": 8.566654123737322e-07, "loss": 0.0215, "step": 9920 }, { "epoch": 0.821272020511124, "grad_norm": 0.037206389009952545, "learning_rate": 8.490260296277375e-07, "loss": 0.0284, "step": 9930 }, { "epoch": 0.8220990819617897, "grad_norm": 0.040716107934713364, "learning_rate": 8.414177004750357e-07, "loss": 0.0219, "step": 9940 }, { "epoch": 0.8229261434124555, "grad_norm": 0.05191744118928909, "learning_rate": 8.338404818338264e-07, "loss": 0.0219, "step": 9950 }, { "epoch": 0.8237532048631213, "grad_norm": 0.04306924715638161, "learning_rate": 8.262944303895687e-07, "loss": 0.0199, "step": 9960 }, { "epoch": 0.8245802663137871, "grad_norm": 0.050016891211271286, "learning_rate": 8.187796025945588e-07, "loss": 0.0207, "step": 9970 }, { "epoch": 0.8254073277644529, "grad_norm": 0.036976251751184464, "learning_rate": 8.112960546675091e-07, "loss": 0.021, "step": 9980 }, { "epoch": 0.8262343892151187, "grad_norm": 0.03663003817200661, "learning_rate": 8.038438425931216e-07, "loss": 0.0204, "step": 9990 }, { "epoch": 0.8270614506657845, "grad_norm": 0.0474594421684742, "learning_rate": 7.964230221216806e-07, "loss": 0.0205, "step": 10000 }, { "epoch": 0.8270614506657845, "eval_loss": 0.021883510053157806, "eval_runtime": 1220.4916, "eval_samples_per_second": 4.915, "eval_steps_per_second": 0.307, "step": 10000 }, { "epoch": 0.8278885121164502, "grad_norm": 0.03651763126254082, "learning_rate": 7.890336487686218e-07, "loss": 0.0205, "step": 10010 }, { "epoch": 0.828715573567116, "grad_norm": 0.038936734199523926, "learning_rate": 7.816757778141281e-07, "loss": 0.0224, "step": 10020 }, { "epoch": 0.8295426350177818, "grad_norm": 0.05199515074491501, "learning_rate": 7.743494643027094e-07, "loss": 0.021, "step": 10030 }, { "epoch": 0.8303696964684476, "grad_norm": 0.03698007017374039, "learning_rate": 7.670547630427954e-07, "loss": 0.0202, "step": 10040 }, { "epoch": 0.8311967579191134, "grad_norm": 0.043272070586681366, "learning_rate": 7.597917286063233e-07, "loss": 0.021, "step": 10050 }, { "epoch": 0.8320238193697792, "grad_norm": 0.04484783858060837, "learning_rate": 7.525604153283239e-07, "loss": 0.0211, "step": 10060 }, { "epoch": 0.832850880820445, "grad_norm": 0.03705143555998802, "learning_rate": 7.453608773065296e-07, "loss": 0.0203, "step": 10070 }, { "epoch": 0.8336779422711107, "grad_norm": 0.039405085146427155, "learning_rate": 7.381931684009569e-07, "loss": 0.0212, "step": 10080 }, { "epoch": 0.8345050037217765, "grad_norm": 0.044893745332956314, "learning_rate": 7.310573422335044e-07, "loss": 0.0203, "step": 10090 }, { "epoch": 0.8353320651724423, "grad_norm": 0.03983564302325249, "learning_rate": 7.23953452187559e-07, "loss": 0.0216, "step": 10100 }, { "epoch": 0.8361591266231081, "grad_norm": 0.044955916702747345, "learning_rate": 7.16881551407591e-07, "loss": 0.0331, "step": 10110 }, { "epoch": 0.8369861880737739, "grad_norm": 0.035963404923677444, "learning_rate": 7.098416927987578e-07, "loss": 0.0198, "step": 10120 }, { "epoch": 0.8378132495244397, "grad_norm": 0.052064284682273865, "learning_rate": 7.028339290265068e-07, "loss": 0.0219, "step": 10130 }, { "epoch": 0.8386403109751055, "grad_norm": 0.04226592183113098, "learning_rate": 6.958583125161855e-07, "loss": 0.0208, "step": 10140 }, { "epoch": 0.8394673724257712, "grad_norm": 0.040528856217861176, "learning_rate": 6.889148954526448e-07, "loss": 0.0201, "step": 10150 }, { "epoch": 0.840294433876437, "grad_norm": 0.04366208612918854, "learning_rate": 6.820037297798476e-07, "loss": 0.0217, "step": 10160 }, { "epoch": 0.8411214953271028, "grad_norm": 0.03569814935326576, "learning_rate": 6.75124867200489e-07, "loss": 0.0203, "step": 10170 }, { "epoch": 0.8419485567777686, "grad_norm": 0.03940269351005554, "learning_rate": 6.682783591755998e-07, "loss": 0.0277, "step": 10180 }, { "epoch": 0.8427756182284344, "grad_norm": 0.04888477176427841, "learning_rate": 6.614642569241642e-07, "loss": 0.0201, "step": 10190 }, { "epoch": 0.8436026796791002, "grad_norm": 0.037041421979665756, "learning_rate": 6.546826114227378e-07, "loss": 0.0215, "step": 10200 }, { "epoch": 0.844429741129766, "grad_norm": 0.054255735129117966, "learning_rate": 6.479334734050713e-07, "loss": 0.0204, "step": 10210 }, { "epoch": 0.8452568025804317, "grad_norm": 0.04159407690167427, "learning_rate": 6.41216893361718e-07, "loss": 0.0199, "step": 10220 }, { "epoch": 0.8460838640310975, "grad_norm": 0.042949602007865906, "learning_rate": 6.345329215396678e-07, "loss": 0.0217, "step": 10230 }, { "epoch": 0.8469109254817633, "grad_norm": 0.04219160974025726, "learning_rate": 6.278816079419675e-07, "loss": 0.0214, "step": 10240 }, { "epoch": 0.8477379869324291, "grad_norm": 0.0364801287651062, "learning_rate": 6.212630023273452e-07, "loss": 0.0224, "step": 10250 }, { "epoch": 0.8485650483830949, "grad_norm": 0.03598921000957489, "learning_rate": 6.146771542098418e-07, "loss": 0.0203, "step": 10260 }, { "epoch": 0.8493921098337607, "grad_norm": 0.04587122052907944, "learning_rate": 6.08124112858432e-07, "loss": 0.0203, "step": 10270 }, { "epoch": 0.8502191712844265, "grad_norm": 0.05026087537407875, "learning_rate": 6.0160392729667e-07, "loss": 0.0212, "step": 10280 }, { "epoch": 0.8510462327350922, "grad_norm": 0.03648602217435837, "learning_rate": 5.951166463023089e-07, "loss": 0.0209, "step": 10290 }, { "epoch": 0.851873294185758, "grad_norm": 0.03705860301852226, "learning_rate": 5.886623184069434e-07, "loss": 0.0206, "step": 10300 }, { "epoch": 0.8527003556364238, "grad_norm": 0.03770057111978531, "learning_rate": 5.822409918956445e-07, "loss": 0.0207, "step": 10310 }, { "epoch": 0.8535274170870896, "grad_norm": 0.038956765085458755, "learning_rate": 5.758527148065989e-07, "loss": 0.0248, "step": 10320 }, { "epoch": 0.8543544785377554, "grad_norm": 0.04223814234137535, "learning_rate": 5.694975349307503e-07, "loss": 0.0211, "step": 10330 }, { "epoch": 0.8551815399884212, "grad_norm": 0.04067877680063248, "learning_rate": 5.631754998114369e-07, "loss": 0.021, "step": 10340 }, { "epoch": 0.856008601439087, "grad_norm": 0.03736858442425728, "learning_rate": 5.568866567440451e-07, "loss": 0.0209, "step": 10350 }, { "epoch": 0.8568356628897527, "grad_norm": 0.03979681432247162, "learning_rate": 5.506310527756481e-07, "loss": 0.0206, "step": 10360 }, { "epoch": 0.8576627243404185, "grad_norm": 0.04270453378558159, "learning_rate": 5.444087347046534e-07, "loss": 0.0222, "step": 10370 }, { "epoch": 0.8584897857910843, "grad_norm": 0.03582329303026199, "learning_rate": 5.382197490804597e-07, "loss": 0.0193, "step": 10380 }, { "epoch": 0.8593168472417501, "grad_norm": 0.040274590253829956, "learning_rate": 5.32064142203102e-07, "loss": 0.0215, "step": 10390 }, { "epoch": 0.8601439086924159, "grad_norm": 0.035478100180625916, "learning_rate": 5.259419601229076e-07, "loss": 0.0193, "step": 10400 }, { "epoch": 0.8609709701430817, "grad_norm": 0.05305038392543793, "learning_rate": 5.198532486401536e-07, "loss": 0.0208, "step": 10410 }, { "epoch": 0.8617980315937475, "grad_norm": 0.03564458340406418, "learning_rate": 5.137980533047204e-07, "loss": 0.0208, "step": 10420 }, { "epoch": 0.8626250930444131, "grad_norm": 0.03643946349620819, "learning_rate": 5.077764194157536e-07, "loss": 0.0201, "step": 10430 }, { "epoch": 0.863452154495079, "grad_norm": 0.03864193707704544, "learning_rate": 5.017883920213229e-07, "loss": 0.0208, "step": 10440 }, { "epoch": 0.8642792159457447, "grad_norm": 0.04269906133413315, "learning_rate": 4.95834015918088e-07, "loss": 0.0205, "step": 10450 }, { "epoch": 0.8651062773964106, "grad_norm": 0.04071857035160065, "learning_rate": 4.899133356509639e-07, "loss": 0.0218, "step": 10460 }, { "epoch": 0.8659333388470764, "grad_norm": 0.05047852545976639, "learning_rate": 4.840263955127811e-07, "loss": 0.02, "step": 10470 }, { "epoch": 0.8667604002977422, "grad_norm": 0.03635663166642189, "learning_rate": 4.78173239543962e-07, "loss": 0.0206, "step": 10480 }, { "epoch": 0.867587461748408, "grad_norm": 0.05464612692594528, "learning_rate": 4.72353911532189e-07, "loss": 0.0201, "step": 10490 }, { "epoch": 0.8684145231990736, "grad_norm": 0.04334511607885361, "learning_rate": 4.665684550120736e-07, "loss": 0.0213, "step": 10500 }, { "epoch": 0.8692415846497394, "grad_norm": 0.036809250712394714, "learning_rate": 4.608169132648371e-07, "loss": 0.0205, "step": 10510 }, { "epoch": 0.8700686461004052, "grad_norm": 0.04075481742620468, "learning_rate": 4.5509932931797727e-07, "loss": 0.0202, "step": 10520 }, { "epoch": 0.870895707551071, "grad_norm": 0.041940901428461075, "learning_rate": 4.4941574594495994e-07, "loss": 0.0201, "step": 10530 }, { "epoch": 0.8717227690017368, "grad_norm": 0.040375709533691406, "learning_rate": 4.437662056648845e-07, "loss": 0.0219, "step": 10540 }, { "epoch": 0.8725498304524026, "grad_norm": 0.0415097214281559, "learning_rate": 4.3815075074217615e-07, "loss": 0.0204, "step": 10550 }, { "epoch": 0.8733768919030684, "grad_norm": 0.0475936122238636, "learning_rate": 4.325694231862665e-07, "loss": 0.0217, "step": 10560 }, { "epoch": 0.8742039533537341, "grad_norm": 0.04286682605743408, "learning_rate": 4.2702226475127675e-07, "loss": 0.0214, "step": 10570 }, { "epoch": 0.8750310148043999, "grad_norm": 0.043143562972545624, "learning_rate": 4.2150931693570986e-07, "loss": 0.0209, "step": 10580 }, { "epoch": 0.8758580762550657, "grad_norm": 0.03890874236822128, "learning_rate": 4.1603062098213685e-07, "loss": 0.0207, "step": 10590 }, { "epoch": 0.8766851377057315, "grad_norm": 0.03903120383620262, "learning_rate": 4.1058621787688934e-07, "loss": 0.0378, "step": 10600 }, { "epoch": 0.8775121991563973, "grad_norm": 0.03754309564828873, "learning_rate": 4.051761483497541e-07, "loss": 0.036, "step": 10610 }, { "epoch": 0.8783392606070631, "grad_norm": 0.04438405483961105, "learning_rate": 3.998004528736632e-07, "loss": 0.0213, "step": 10620 }, { "epoch": 0.8791663220577289, "grad_norm": 0.037207264453172684, "learning_rate": 3.9445917166439915e-07, "loss": 0.0198, "step": 10630 }, { "epoch": 0.8799933835083946, "grad_norm": 0.04333435744047165, "learning_rate": 3.8915234468029027e-07, "loss": 0.0202, "step": 10640 }, { "epoch": 0.8808204449590604, "grad_norm": 0.034282222390174866, "learning_rate": 3.838800116219082e-07, "loss": 0.0205, "step": 10650 }, { "epoch": 0.8816475064097262, "grad_norm": 0.04391239210963249, "learning_rate": 3.786422119317762e-07, "loss": 0.0197, "step": 10660 }, { "epoch": 0.882474567860392, "grad_norm": 0.037362392991781235, "learning_rate": 3.7343898479407227e-07, "loss": 0.0204, "step": 10670 }, { "epoch": 0.8833016293110578, "grad_norm": 0.03807242214679718, "learning_rate": 3.682703691343353e-07, "loss": 0.0209, "step": 10680 }, { "epoch": 0.8841286907617236, "grad_norm": 0.0376625694334507, "learning_rate": 3.6313640361917535e-07, "loss": 0.0203, "step": 10690 }, { "epoch": 0.8849557522123894, "grad_norm": 0.039894696325063705, "learning_rate": 3.580371266559801e-07, "loss": 0.0203, "step": 10700 }, { "epoch": 0.8857828136630551, "grad_norm": 0.03698920086026192, "learning_rate": 3.529725763926367e-07, "loss": 0.025, "step": 10710 }, { "epoch": 0.8866098751137209, "grad_norm": 0.04414455220103264, "learning_rate": 3.4794279071723503e-07, "loss": 0.0212, "step": 10720 }, { "epoch": 0.8874369365643867, "grad_norm": 0.041200559586286545, "learning_rate": 3.4294780725779296e-07, "loss": 0.022, "step": 10730 }, { "epoch": 0.8882639980150525, "grad_norm": 0.03711444512009621, "learning_rate": 3.379876633819701e-07, "loss": 0.0214, "step": 10740 }, { "epoch": 0.8890910594657183, "grad_norm": 0.046689391136169434, "learning_rate": 3.3306239619679106e-07, "loss": 0.0207, "step": 10750 }, { "epoch": 0.8899181209163841, "grad_norm": 0.04057691618800163, "learning_rate": 3.281720425483653e-07, "loss": 0.0206, "step": 10760 }, { "epoch": 0.8907451823670499, "grad_norm": 0.04174448922276497, "learning_rate": 3.2331663902161416e-07, "loss": 0.0212, "step": 10770 }, { "epoch": 0.8915722438177156, "grad_norm": 0.03700239583849907, "learning_rate": 3.184962219399945e-07, "loss": 0.0206, "step": 10780 }, { "epoch": 0.8923993052683814, "grad_norm": 0.03949680179357529, "learning_rate": 3.137108273652301e-07, "loss": 0.0235, "step": 10790 }, { "epoch": 0.8932263667190472, "grad_norm": 0.03353721275925636, "learning_rate": 3.0896049109703616e-07, "loss": 0.0199, "step": 10800 }, { "epoch": 0.894053428169713, "grad_norm": 0.040740326046943665, "learning_rate": 3.0424524867286085e-07, "loss": 0.02, "step": 10810 }, { "epoch": 0.8948804896203788, "grad_norm": 0.04401690140366554, "learning_rate": 2.9956513536760934e-07, "loss": 0.0307, "step": 10820 }, { "epoch": 0.8957075510710446, "grad_norm": 0.033138833940029144, "learning_rate": 2.9492018619338703e-07, "loss": 0.0206, "step": 10830 }, { "epoch": 0.8965346125217104, "grad_norm": 0.03981183469295502, "learning_rate": 2.9031043589923426e-07, "loss": 0.0211, "step": 10840 }, { "epoch": 0.8973616739723761, "grad_norm": 0.036721404641866684, "learning_rate": 2.857359189708669e-07, "loss": 0.0215, "step": 10850 }, { "epoch": 0.8981887354230419, "grad_norm": 0.049144960939884186, "learning_rate": 2.8119666963042025e-07, "loss": 0.0218, "step": 10860 }, { "epoch": 0.8990157968737077, "grad_norm": 0.0382852703332901, "learning_rate": 2.766927218361887e-07, "loss": 0.0209, "step": 10870 }, { "epoch": 0.8998428583243735, "grad_norm": 0.04550480842590332, "learning_rate": 2.722241092823774e-07, "loss": 0.0208, "step": 10880 }, { "epoch": 0.9006699197750393, "grad_norm": 0.04339880868792534, "learning_rate": 2.677908653988465e-07, "loss": 0.0211, "step": 10890 }, { "epoch": 0.9014969812257051, "grad_norm": 0.036550212651491165, "learning_rate": 2.6339302335085914e-07, "loss": 0.0197, "step": 10900 }, { "epoch": 0.9023240426763709, "grad_norm": 0.034907784312963486, "learning_rate": 2.5903061603883897e-07, "loss": 0.0207, "step": 10910 }, { "epoch": 0.9031511041270366, "grad_norm": 0.043843794614076614, "learning_rate": 2.5470367609812084e-07, "loss": 0.0207, "step": 10920 }, { "epoch": 0.9039781655777024, "grad_norm": 0.03882720693945885, "learning_rate": 2.504122358987049e-07, "loss": 0.0206, "step": 10930 }, { "epoch": 0.9048052270283682, "grad_norm": 0.04570434242486954, "learning_rate": 2.461563275450185e-07, "loss": 0.0203, "step": 10940 }, { "epoch": 0.905632288479034, "grad_norm": 0.03797876834869385, "learning_rate": 2.4193598287567287e-07, "loss": 0.0203, "step": 10950 }, { "epoch": 0.9064593499296998, "grad_norm": 0.04128405451774597, "learning_rate": 2.3775123346322593e-07, "loss": 0.0213, "step": 10960 }, { "epoch": 0.9072864113803656, "grad_norm": 0.036211997270584106, "learning_rate": 2.3360211061394743e-07, "loss": 0.0209, "step": 10970 }, { "epoch": 0.9081134728310314, "grad_norm": 0.0425226129591465, "learning_rate": 2.2948864536757985e-07, "loss": 0.0208, "step": 10980 }, { "epoch": 0.9089405342816971, "grad_norm": 0.042280830442905426, "learning_rate": 2.2541086849711514e-07, "loss": 0.021, "step": 10990 }, { "epoch": 0.9097675957323629, "grad_norm": 0.03988664597272873, "learning_rate": 2.213688105085543e-07, "loss": 0.0214, "step": 11000 }, { "epoch": 0.9097675957323629, "eval_loss": 0.021760277450084686, "eval_runtime": 1221.1845, "eval_samples_per_second": 4.912, "eval_steps_per_second": 0.307, "step": 11000 }, { "epoch": 0.9105946571830287, "grad_norm": 0.038166627287864685, "learning_rate": 2.1736250164068662e-07, "loss": 0.0211, "step": 11010 }, { "epoch": 0.9114217186336945, "grad_norm": 0.03990177437663078, "learning_rate": 2.1339197186486027e-07, "loss": 0.0199, "step": 11020 }, { "epoch": 0.9122487800843603, "grad_norm": 0.03698251396417618, "learning_rate": 2.0945725088475921e-07, "loss": 0.0213, "step": 11030 }, { "epoch": 0.9130758415350261, "grad_norm": 0.03812938556075096, "learning_rate": 2.0555836813618003e-07, "loss": 0.0214, "step": 11040 }, { "epoch": 0.9139029029856919, "grad_norm": 0.03743589296936989, "learning_rate": 2.0169535278680984e-07, "loss": 0.0204, "step": 11050 }, { "epoch": 0.9147299644363576, "grad_norm": 0.040262360125780106, "learning_rate": 1.978682337360155e-07, "loss": 0.0205, "step": 11060 }, { "epoch": 0.9155570258870234, "grad_norm": 0.03923022374510765, "learning_rate": 1.940770396146191e-07, "loss": 0.0189, "step": 11070 }, { "epoch": 0.9163840873376892, "grad_norm": 0.038444485515356064, "learning_rate": 1.903217987846856e-07, "loss": 0.0219, "step": 11080 }, { "epoch": 0.917211148788355, "grad_norm": 0.04547708109021187, "learning_rate": 1.866025393393145e-07, "loss": 0.0206, "step": 11090 }, { "epoch": 0.9180382102390208, "grad_norm": 0.03776419907808304, "learning_rate": 1.8291928910242618e-07, "loss": 0.0194, "step": 11100 }, { "epoch": 0.9188652716896866, "grad_norm": 0.037485282868146896, "learning_rate": 1.792720756285554e-07, "loss": 0.0206, "step": 11110 }, { "epoch": 0.9196923331403524, "grad_norm": 0.0491410493850708, "learning_rate": 1.7566092620264374e-07, "loss": 0.0208, "step": 11120 }, { "epoch": 0.9205193945910181, "grad_norm": 0.04069705307483673, "learning_rate": 1.720858678398374e-07, "loss": 0.0211, "step": 11130 }, { "epoch": 0.9213464560416839, "grad_norm": 0.04729039594531059, "learning_rate": 1.6854692728528298e-07, "loss": 0.0211, "step": 11140 }, { "epoch": 0.9221735174923497, "grad_norm": 0.041814010590314865, "learning_rate": 1.650441310139278e-07, "loss": 0.0201, "step": 11150 }, { "epoch": 0.9230005789430155, "grad_norm": 0.04224241524934769, "learning_rate": 1.615775052303231e-07, "loss": 0.0205, "step": 11160 }, { "epoch": 0.9238276403936813, "grad_norm": 0.13403134047985077, "learning_rate": 1.5814707586842948e-07, "loss": 0.021, "step": 11170 }, { "epoch": 0.9246547018443471, "grad_norm": 0.03628065064549446, "learning_rate": 1.5475286859141736e-07, "loss": 0.0208, "step": 11180 }, { "epoch": 0.9254817632950129, "grad_norm": 0.03844155743718147, "learning_rate": 1.5139490879147955e-07, "loss": 0.0206, "step": 11190 }, { "epoch": 0.9263088247456785, "grad_norm": 0.03782174736261368, "learning_rate": 1.4807322158964021e-07, "loss": 0.0218, "step": 11200 }, { "epoch": 0.9271358861963443, "grad_norm": 0.04245174303650856, "learning_rate": 1.4478783183556834e-07, "loss": 0.0204, "step": 11210 }, { "epoch": 0.9279629476470102, "grad_norm": 0.038376543670892715, "learning_rate": 1.4153876410738787e-07, "loss": 0.0209, "step": 11220 }, { "epoch": 0.928790009097676, "grad_norm": 0.03631012141704559, "learning_rate": 1.3832604271149742e-07, "loss": 0.0202, "step": 11230 }, { "epoch": 0.9296170705483418, "grad_norm": 0.03546414151787758, "learning_rate": 1.35149691682388e-07, "loss": 0.0211, "step": 11240 }, { "epoch": 0.9304441319990076, "grad_norm": 0.03860907629132271, "learning_rate": 1.320097347824606e-07, "loss": 0.0203, "step": 11250 }, { "epoch": 0.9312711934496734, "grad_norm": 0.04019659012556076, "learning_rate": 1.2890619550185225e-07, "loss": 0.0224, "step": 11260 }, { "epoch": 0.932098254900339, "grad_norm": 0.03943018242716789, "learning_rate": 1.2583909705825792e-07, "loss": 0.0199, "step": 11270 }, { "epoch": 0.9329253163510048, "grad_norm": 0.044663459062576294, "learning_rate": 1.228084623967568e-07, "loss": 0.0218, "step": 11280 }, { "epoch": 0.9337523778016706, "grad_norm": 0.041751962155103683, "learning_rate": 1.1981431418964185e-07, "loss": 0.0241, "step": 11290 }, { "epoch": 0.9345794392523364, "grad_norm": 0.03652814030647278, "learning_rate": 1.1685667483624763e-07, "loss": 0.0202, "step": 11300 }, { "epoch": 0.9354065007030022, "grad_norm": 0.040455400943756104, "learning_rate": 1.139355664627878e-07, "loss": 0.022, "step": 11310 }, { "epoch": 0.936233562153668, "grad_norm": 0.03818695247173309, "learning_rate": 1.1105101092218462e-07, "loss": 0.0207, "step": 11320 }, { "epoch": 0.9370606236043338, "grad_norm": 0.04204050451517105, "learning_rate": 1.0820302979390574e-07, "loss": 0.0213, "step": 11330 }, { "epoch": 0.9378876850549995, "grad_norm": 0.04151742160320282, "learning_rate": 1.0539164438380655e-07, "loss": 0.0204, "step": 11340 }, { "epoch": 0.9387147465056653, "grad_norm": 0.038962677121162415, "learning_rate": 1.0261687572396762e-07, "loss": 0.0209, "step": 11350 }, { "epoch": 0.9395418079563311, "grad_norm": 0.045446451753377914, "learning_rate": 9.987874457253799e-08, "loss": 0.0207, "step": 11360 }, { "epoch": 0.9403688694069969, "grad_norm": 0.038088973611593246, "learning_rate": 9.717727141358046e-08, "loss": 0.0214, "step": 11370 }, { "epoch": 0.9411959308576627, "grad_norm": 0.04547916352748871, "learning_rate": 9.45124764569183e-08, "loss": 0.0245, "step": 11380 }, { "epoch": 0.9420229923083285, "grad_norm": 0.038108475506305695, "learning_rate": 9.188437963798314e-08, "loss": 0.0262, "step": 11390 }, { "epoch": 0.9428500537589943, "grad_norm": 0.041740551590919495, "learning_rate": 8.929300061766677e-08, "loss": 0.0208, "step": 11400 }, { "epoch": 0.94367711520966, "grad_norm": 0.04090382158756256, "learning_rate": 8.673835878217351e-08, "loss": 0.0219, "step": 11410 }, { "epoch": 0.9445041766603258, "grad_norm": 0.03528539836406708, "learning_rate": 8.42204732428764e-08, "loss": 0.02, "step": 11420 }, { "epoch": 0.9453312381109916, "grad_norm": 0.03898858278989792, "learning_rate": 8.173936283617068e-08, "loss": 0.0198, "step": 11430 }, { "epoch": 0.9461582995616574, "grad_norm": 0.03353780135512352, "learning_rate": 7.929504612333827e-08, "loss": 0.0204, "step": 11440 }, { "epoch": 0.9469853610123232, "grad_norm": 0.03581018000841141, "learning_rate": 7.688754139040522e-08, "loss": 0.02, "step": 11450 }, { "epoch": 0.947812422462989, "grad_norm": 0.0409320667386055, "learning_rate": 7.451686664800505e-08, "loss": 0.0201, "step": 11460 }, { "epoch": 0.9486394839136548, "grad_norm": 0.04594825208187103, "learning_rate": 7.218303963124507e-08, "loss": 0.0206, "step": 11470 }, { "epoch": 0.9494665453643206, "grad_norm": 0.036496005952358246, "learning_rate": 6.988607779957357e-08, "loss": 0.0216, "step": 11480 }, { "epoch": 0.9502936068149863, "grad_norm": 0.043786004185676575, "learning_rate": 6.762599833664896e-08, "loss": 0.0225, "step": 11490 }, { "epoch": 0.9511206682656521, "grad_norm": 0.04733569920063019, "learning_rate": 6.540281815021198e-08, "loss": 0.0202, "step": 11500 }, { "epoch": 0.9519477297163179, "grad_norm": 0.033522963523864746, "learning_rate": 6.321655387195591e-08, "loss": 0.0211, "step": 11510 }, { "epoch": 0.9527747911669837, "grad_norm": 0.04227704182267189, "learning_rate": 6.106722185740821e-08, "loss": 0.02, "step": 11520 }, { "epoch": 0.9536018526176495, "grad_norm": 0.04079505801200867, "learning_rate": 5.8954838185801834e-08, "loss": 0.0205, "step": 11530 }, { "epoch": 0.9544289140683153, "grad_norm": 0.0362294465303421, "learning_rate": 5.6879418659959716e-08, "loss": 0.0215, "step": 11540 }, { "epoch": 0.9552559755189811, "grad_norm": 0.039718855172395706, "learning_rate": 5.4840978806173786e-08, "loss": 0.0199, "step": 11550 }, { "epoch": 0.9560830369696468, "grad_norm": 0.038249652832746506, "learning_rate": 5.283953387408891e-08, "loss": 0.02, "step": 11560 }, { "epoch": 0.9569100984203126, "grad_norm": 0.04237562417984009, "learning_rate": 5.087509883659136e-08, "loss": 0.0206, "step": 11570 }, { "epoch": 0.9577371598709784, "grad_norm": 0.042977023869752884, "learning_rate": 4.8947688389693325e-08, "loss": 0.0213, "step": 11580 }, { "epoch": 0.9585642213216442, "grad_norm": 0.03681021183729172, "learning_rate": 4.705731695242521e-08, "loss": 0.0197, "step": 11590 }, { "epoch": 0.95939128277231, "grad_norm": 0.05688457190990448, "learning_rate": 4.520399866672798e-08, "loss": 0.0203, "step": 11600 }, { "epoch": 0.9602183442229758, "grad_norm": 0.05174125358462334, "learning_rate": 4.338774739734541e-08, "loss": 0.0204, "step": 11610 }, { "epoch": 0.9610454056736416, "grad_norm": 0.041839614510536194, "learning_rate": 4.160857673172147e-08, "loss": 0.021, "step": 11620 }, { "epoch": 0.9618724671243073, "grad_norm": 0.04174220189452171, "learning_rate": 3.986649997989922e-08, "loss": 0.0222, "step": 11630 }, { "epoch": 0.9626995285749731, "grad_norm": 0.04051094874739647, "learning_rate": 3.816153017442148e-08, "loss": 0.0211, "step": 11640 }, { "epoch": 0.9635265900256389, "grad_norm": 0.04232405498623848, "learning_rate": 3.649368007023202e-08, "loss": 0.0206, "step": 11650 }, { "epoch": 0.9643536514763047, "grad_norm": 0.03878673538565636, "learning_rate": 3.486296214457952e-08, "loss": 0.0213, "step": 11660 }, { "epoch": 0.9651807129269705, "grad_norm": 0.04240000247955322, "learning_rate": 3.326938859692708e-08, "loss": 0.0209, "step": 11670 }, { "epoch": 0.9660077743776363, "grad_norm": 0.04394629970192909, "learning_rate": 3.171297134885842e-08, "loss": 0.0215, "step": 11680 }, { "epoch": 0.9668348358283021, "grad_norm": 0.03907477483153343, "learning_rate": 3.019372204399018e-08, "loss": 0.0202, "step": 11690 }, { "epoch": 0.9676618972789678, "grad_norm": 0.03773649409413338, "learning_rate": 2.8711652047884176e-08, "loss": 0.0203, "step": 11700 }, { "epoch": 0.9684889587296336, "grad_norm": 0.041162021458148956, "learning_rate": 2.7266772447961387e-08, "loss": 0.0204, "step": 11710 }, { "epoch": 0.9693160201802994, "grad_norm": 0.03880157694220543, "learning_rate": 2.585909405342091e-08, "loss": 0.021, "step": 11720 }, { "epoch": 0.9701430816309652, "grad_norm": 0.04054776951670647, "learning_rate": 2.4488627395157783e-08, "loss": 0.0203, "step": 11730 }, { "epoch": 0.970970143081631, "grad_norm": 0.03724834322929382, "learning_rate": 2.315538272568585e-08, "loss": 0.0206, "step": 11740 }, { "epoch": 0.9717972045322968, "grad_norm": 0.04481399431824684, "learning_rate": 2.1859370019058913e-08, "loss": 0.0245, "step": 11750 }, { "epoch": 0.9726242659829626, "grad_norm": 0.04241091012954712, "learning_rate": 2.0600598970795804e-08, "loss": 0.02, "step": 11760 }, { "epoch": 0.9734513274336283, "grad_norm": 0.04424299672245979, "learning_rate": 1.9379078997810995e-08, "loss": 0.0245, "step": 11770 }, { "epoch": 0.9742783888842941, "grad_norm": 0.038491178303956985, "learning_rate": 1.8194819238341877e-08, "loss": 0.0201, "step": 11780 }, { "epoch": 0.9751054503349599, "grad_norm": 0.04305344447493553, "learning_rate": 1.7047828551880475e-08, "loss": 0.0204, "step": 11790 }, { "epoch": 0.9759325117856257, "grad_norm": 0.03870062530040741, "learning_rate": 1.59381155191074e-08, "loss": 0.0204, "step": 11800 }, { "epoch": 0.9767595732362915, "grad_norm": 0.040076758712530136, "learning_rate": 1.4865688441828008e-08, "loss": 0.0216, "step": 11810 }, { "epoch": 0.9775866346869573, "grad_norm": 0.0319267176091671, "learning_rate": 1.3830555342909113e-08, "loss": 0.0229, "step": 11820 }, { "epoch": 0.9784136961376231, "grad_norm": 0.03797907382249832, "learning_rate": 1.283272396622126e-08, "loss": 0.0194, "step": 11830 }, { "epoch": 0.9792407575882888, "grad_norm": 0.03610234335064888, "learning_rate": 1.1872201776578219e-08, "loss": 0.0213, "step": 11840 }, { "epoch": 0.9800678190389546, "grad_norm": 0.048893995583057404, "learning_rate": 1.0948995959683683e-08, "loss": 0.022, "step": 11850 }, { "epoch": 0.9808948804896204, "grad_norm": 0.1131172701716423, "learning_rate": 1.0063113422074667e-08, "loss": 0.0202, "step": 11860 }, { "epoch": 0.9817219419402862, "grad_norm": 0.038660723716020584, "learning_rate": 9.21456079107208e-09, "loss": 0.0205, "step": 11870 }, { "epoch": 0.982549003390952, "grad_norm": 0.03787451982498169, "learning_rate": 8.40334441473023e-09, "loss": 0.0219, "step": 11880 }, { "epoch": 0.9833760648416178, "grad_norm": 0.0439179353415966, "learning_rate": 7.629470361789071e-09, "loss": 0.0206, "step": 11890 }, { "epoch": 0.9842031262922836, "grad_norm": 0.04450133442878723, "learning_rate": 6.892944421630354e-09, "loss": 0.0236, "step": 11900 }, { "epoch": 0.9850301877429493, "grad_norm": 0.04056097939610481, "learning_rate": 6.193772104232665e-09, "loss": 0.0218, "step": 11910 }, { "epoch": 0.9858572491936151, "grad_norm": 0.04353098198771477, "learning_rate": 5.531958640129787e-09, "loss": 0.0228, "step": 11920 }, { "epoch": 0.9866843106442809, "grad_norm": 0.03741453215479851, "learning_rate": 4.90750898037351e-09, "loss": 0.0226, "step": 11930 }, { "epoch": 0.9875113720949467, "grad_norm": 0.04343261569738388, "learning_rate": 4.32042779649533e-09, "loss": 0.0197, "step": 11940 }, { "epoch": 0.9883384335456125, "grad_norm": 0.04530951753258705, "learning_rate": 3.7707194804725846e-09, "loss": 0.0196, "step": 11950 }, { "epoch": 0.9891654949962783, "grad_norm": 0.05223441496491432, "learning_rate": 3.2583881446929256e-09, "loss": 0.0206, "step": 11960 }, { "epoch": 0.9899925564469441, "grad_norm": 0.03598857298493385, "learning_rate": 2.783437621926566e-09, "loss": 0.0223, "step": 11970 }, { "epoch": 0.9908196178976098, "grad_norm": 0.044676005840301514, "learning_rate": 2.345871465296856e-09, "loss": 0.0203, "step": 11980 }, { "epoch": 0.9916466793482756, "grad_norm": 0.035769447684288025, "learning_rate": 1.945692948253086e-09, "loss": 0.0254, "step": 11990 }, { "epoch": 0.9924737407989414, "grad_norm": 0.050190046429634094, "learning_rate": 1.5829050645449484e-09, "loss": 0.0241, "step": 12000 }, { "epoch": 0.9924737407989414, "eval_loss": 0.021732060238718987, "eval_runtime": 1221.3996, "eval_samples_per_second": 4.912, "eval_steps_per_second": 0.307, "step": 12000 }, { "epoch": 0.9933008022496072, "grad_norm": 0.038614947348833084, "learning_rate": 1.2575105282025545e-09, "loss": 0.0251, "step": 12010 }, { "epoch": 0.994127863700273, "grad_norm": 0.03808825463056564, "learning_rate": 9.695117735147863e-10, "loss": 0.0197, "step": 12020 }, { "epoch": 0.9949549251509388, "grad_norm": 0.037998467683792114, "learning_rate": 7.189109550115314e-10, "loss": 0.0211, "step": 12030 }, { "epoch": 0.9957819866016046, "grad_norm": 0.03407185524702072, "learning_rate": 5.057099474470306e-10, "loss": 0.0192, "step": 12040 }, { "epoch": 0.9966090480522702, "grad_norm": 0.03972679004073143, "learning_rate": 3.299103457854447e-10, "loss": 0.0231, "step": 12050 }, { "epoch": 0.997436109502936, "grad_norm": 0.0530787818133831, "learning_rate": 1.9151346519086233e-10, "loss": 0.0203, "step": 12060 }, { "epoch": 0.9982631709536018, "grad_norm": 0.03939739987254143, "learning_rate": 9.052034101508789e-11, "loss": 0.021, "step": 12070 }, { "epoch": 0.9990902324042676, "grad_norm": 0.039279334247112274, "learning_rate": 2.693172879209005e-11, "loss": 0.0211, "step": 12080 }, { "epoch": 0.9999172938549334, "grad_norm": 0.04228970408439636, "learning_rate": 7.481042302304175e-13, "loss": 0.0213, "step": 12090 }, { "epoch": 1.0, "step": 12091, "total_flos": 2.46882716505537e+20, "train_loss": 0.04353406456638118, "train_runtime": 186963.3453, "train_samples_per_second": 1.035, "train_steps_per_second": 0.065 } ], "logging_steps": 10, "max_steps": 12091, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.46882716505537e+20, "train_batch_size": 2, "trial_name": null, "trial_params": null }