{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 1000, "global_step": 4717, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00211999152003392, "grad_norm": 32.50752996739497, "learning_rate": 1.6949152542372883e-07, "loss": 3.7461, "step": 10 }, { "epoch": 0.00423998304006784, "grad_norm": 19.7783715335098, "learning_rate": 5.93220338983051e-07, "loss": 3.5757, "step": 20 }, { "epoch": 0.006359974560101759, "grad_norm": 9.354428743861847, "learning_rate": 1.016949152542373e-06, "loss": 2.7484, "step": 30 }, { "epoch": 0.00847996608013568, "grad_norm": 5.7293369283524935, "learning_rate": 1.4406779661016951e-06, "loss": 1.9415, "step": 40 }, { "epoch": 0.0105999576001696, "grad_norm": 6.878907941943918, "learning_rate": 1.8644067796610171e-06, "loss": 1.2673, "step": 50 }, { "epoch": 0.012719949120203519, "grad_norm": 8.215857672525798, "learning_rate": 2.288135593220339e-06, "loss": 0.8349, "step": 60 }, { "epoch": 0.014839940640237439, "grad_norm": 2.0985232088184698, "learning_rate": 2.7118644067796613e-06, "loss": 0.6788, "step": 70 }, { "epoch": 0.01695993216027136, "grad_norm": 4.181552201854574, "learning_rate": 3.135593220338983e-06, "loss": 0.5306, "step": 80 }, { "epoch": 0.01907992368030528, "grad_norm": 1.3578290731521812, "learning_rate": 3.5593220338983053e-06, "loss": 0.428, "step": 90 }, { "epoch": 0.0211999152003392, "grad_norm": 8.019239361403569, "learning_rate": 3.9830508474576275e-06, "loss": 0.3659, "step": 100 }, { "epoch": 0.02331990672037312, "grad_norm": 1.0371743917362437, "learning_rate": 4.40677966101695e-06, "loss": 0.3098, "step": 110 }, { "epoch": 0.025439898240407037, "grad_norm": 1.0725520171241911, "learning_rate": 4.830508474576272e-06, "loss": 0.272, "step": 120 }, { "epoch": 0.027559889760440957, "grad_norm": 4.320619473460736, "learning_rate": 5.254237288135594e-06, "loss": 0.2408, "step": 130 }, { "epoch": 0.029679881280474878, "grad_norm": 9.318265850779325, "learning_rate": 5.677966101694916e-06, "loss": 0.2177, "step": 140 }, { "epoch": 0.0317998728005088, "grad_norm": 0.9802919759217678, "learning_rate": 6.1016949152542385e-06, "loss": 0.2049, "step": 150 }, { "epoch": 0.03391986432054272, "grad_norm": 2.4459592955831337, "learning_rate": 6.52542372881356e-06, "loss": 0.1998, "step": 160 }, { "epoch": 0.03603985584057664, "grad_norm": 1.5882881176464134, "learning_rate": 6.949152542372882e-06, "loss": 0.1803, "step": 170 }, { "epoch": 0.03815984736061056, "grad_norm": 3.036655132788213, "learning_rate": 7.372881355932204e-06, "loss": 0.1643, "step": 180 }, { "epoch": 0.04027983888064448, "grad_norm": 0.4963099670959003, "learning_rate": 7.796610169491526e-06, "loss": 0.1595, "step": 190 }, { "epoch": 0.0423998304006784, "grad_norm": 0.8389581242220279, "learning_rate": 8.220338983050849e-06, "loss": 0.1556, "step": 200 }, { "epoch": 0.04451982192071232, "grad_norm": 0.5257252909788135, "learning_rate": 8.64406779661017e-06, "loss": 0.1471, "step": 210 }, { "epoch": 0.04663981344074624, "grad_norm": 0.3592742330584675, "learning_rate": 9.067796610169493e-06, "loss": 0.1374, "step": 220 }, { "epoch": 0.048759804960780154, "grad_norm": 0.3187507665176873, "learning_rate": 9.491525423728815e-06, "loss": 0.1398, "step": 230 }, { "epoch": 0.050879796480814074, "grad_norm": 0.3482668638560697, "learning_rate": 9.915254237288137e-06, "loss": 0.1293, "step": 240 }, { "epoch": 0.052999788000847994, "grad_norm": 0.44242263127368797, "learning_rate": 9.999921355437334e-06, "loss": 0.1281, "step": 250 }, { "epoch": 0.055119779520881915, "grad_norm": 0.43066449617717567, "learning_rate": 9.999601866141578e-06, "loss": 0.1236, "step": 260 }, { "epoch": 0.057239771040915835, "grad_norm": 0.2632241978204935, "learning_rate": 9.999036632519274e-06, "loss": 0.1198, "step": 270 }, { "epoch": 0.059359762560949755, "grad_norm": 0.5107458203310429, "learning_rate": 9.998225682353224e-06, "loss": 0.1219, "step": 280 }, { "epoch": 0.061479754080983676, "grad_norm": 0.4495473347031151, "learning_rate": 9.997169055503885e-06, "loss": 0.1215, "step": 290 }, { "epoch": 0.0635997456010176, "grad_norm": 0.852881887488011, "learning_rate": 9.995866803907402e-06, "loss": 0.1113, "step": 300 }, { "epoch": 0.06571973712105152, "grad_norm": 0.3612657381641809, "learning_rate": 9.99431899157306e-06, "loss": 0.1111, "step": 310 }, { "epoch": 0.06783972864108544, "grad_norm": 0.44988476523435184, "learning_rate": 9.992525694580135e-06, "loss": 0.1072, "step": 320 }, { "epoch": 0.06995972016111936, "grad_norm": 0.29666583291365195, "learning_rate": 9.990487001074161e-06, "loss": 0.1124, "step": 330 }, { "epoch": 0.07207971168115328, "grad_norm": 0.28345381185455476, "learning_rate": 9.988203011262589e-06, "loss": 0.1075, "step": 340 }, { "epoch": 0.0741997032011872, "grad_norm": 0.3252373191017647, "learning_rate": 9.985673837409865e-06, "loss": 0.1012, "step": 350 }, { "epoch": 0.07631969472122112, "grad_norm": 0.34083940392900397, "learning_rate": 9.982899603831912e-06, "loss": 0.1031, "step": 360 }, { "epoch": 0.07843968624125504, "grad_norm": 0.4836055260534099, "learning_rate": 9.979880446890025e-06, "loss": 0.0996, "step": 370 }, { "epoch": 0.08055967776128896, "grad_norm": 0.3984011129854127, "learning_rate": 9.976616514984152e-06, "loss": 0.1009, "step": 380 }, { "epoch": 0.08267966928132288, "grad_norm": 0.35365486849367245, "learning_rate": 9.973107968545623e-06, "loss": 0.0976, "step": 390 }, { "epoch": 0.0847996608013568, "grad_norm": 0.30027024887187254, "learning_rate": 9.969354980029243e-06, "loss": 0.0969, "step": 400 }, { "epoch": 0.08691965232139072, "grad_norm": 0.25355683531168066, "learning_rate": 9.96535773390483e-06, "loss": 0.1002, "step": 410 }, { "epoch": 0.08903964384142464, "grad_norm": 0.20391383326557452, "learning_rate": 9.961116426648138e-06, "loss": 0.0995, "step": 420 }, { "epoch": 0.09115963536145856, "grad_norm": 0.29085751915140184, "learning_rate": 9.956631266731207e-06, "loss": 0.0992, "step": 430 }, { "epoch": 0.09327962688149248, "grad_norm": 0.2774668505220891, "learning_rate": 9.951902474612112e-06, "loss": 0.0981, "step": 440 }, { "epoch": 0.09539961840152639, "grad_norm": 0.23046127720501614, "learning_rate": 9.946930282724128e-06, "loss": 0.0946, "step": 450 }, { "epoch": 0.09751960992156031, "grad_norm": 0.21609489095737885, "learning_rate": 9.941714935464303e-06, "loss": 0.0903, "step": 460 }, { "epoch": 0.09963960144159423, "grad_norm": 0.3223533405901417, "learning_rate": 9.936256689181454e-06, "loss": 0.0996, "step": 470 }, { "epoch": 0.10175959296162815, "grad_norm": 0.33768759021085587, "learning_rate": 9.930555812163552e-06, "loss": 0.094, "step": 480 }, { "epoch": 0.10387958448166207, "grad_norm": 0.22549215526677444, "learning_rate": 9.924612584624545e-06, "loss": 0.094, "step": 490 }, { "epoch": 0.10599957600169599, "grad_norm": 0.20645922834306707, "learning_rate": 9.918427298690585e-06, "loss": 0.0909, "step": 500 }, { "epoch": 0.10811956752172991, "grad_norm": 0.3341574730006992, "learning_rate": 9.912000258385669e-06, "loss": 0.0873, "step": 510 }, { "epoch": 0.11023955904176383, "grad_norm": 0.22497042484649127, "learning_rate": 9.905331779616683e-06, "loss": 0.091, "step": 520 }, { "epoch": 0.11235955056179775, "grad_norm": 0.25629246939970207, "learning_rate": 9.898422190157897e-06, "loss": 0.0908, "step": 530 }, { "epoch": 0.11447954208183167, "grad_norm": 0.3221978930825859, "learning_rate": 9.891271829634837e-06, "loss": 0.0958, "step": 540 }, { "epoch": 0.11659953360186559, "grad_norm": 0.24323805543228721, "learning_rate": 9.883881049507592e-06, "loss": 0.0931, "step": 550 }, { "epoch": 0.11871952512189951, "grad_norm": 1.4298692529820525, "learning_rate": 9.876250213053542e-06, "loss": 0.0899, "step": 560 }, { "epoch": 0.12083951664193343, "grad_norm": 0.2343862275786179, "learning_rate": 9.868379695349514e-06, "loss": 0.0954, "step": 570 }, { "epoch": 0.12295950816196735, "grad_norm": 0.2786132714201901, "learning_rate": 9.860269883253321e-06, "loss": 0.0909, "step": 580 }, { "epoch": 0.12507949968200127, "grad_norm": 0.3051144098888441, "learning_rate": 9.851921175384769e-06, "loss": 0.0875, "step": 590 }, { "epoch": 0.1271994912020352, "grad_norm": 0.19169696652783166, "learning_rate": 9.843333982106052e-06, "loss": 0.0877, "step": 600 }, { "epoch": 0.1293194827220691, "grad_norm": 0.17535067812063176, "learning_rate": 9.834508725501584e-06, "loss": 0.088, "step": 610 }, { "epoch": 0.13143947424210303, "grad_norm": 0.24821593393490932, "learning_rate": 9.825445839357256e-06, "loss": 0.0869, "step": 620 }, { "epoch": 0.13355946576213695, "grad_norm": 0.25345703968329003, "learning_rate": 9.816145769139107e-06, "loss": 0.0882, "step": 630 }, { "epoch": 0.13567945728217087, "grad_norm": 0.2081953809171809, "learning_rate": 9.806608971971436e-06, "loss": 0.0862, "step": 640 }, { "epoch": 0.1377994488022048, "grad_norm": 0.18784011735994216, "learning_rate": 9.796835916614329e-06, "loss": 0.0872, "step": 650 }, { "epoch": 0.13991944032223871, "grad_norm": 0.24880072040487056, "learning_rate": 9.786827083440616e-06, "loss": 0.0845, "step": 660 }, { "epoch": 0.14203943184227263, "grad_norm": 0.24058730019755548, "learning_rate": 9.776582964412267e-06, "loss": 0.0862, "step": 670 }, { "epoch": 0.14415942336230655, "grad_norm": 0.2565530024882537, "learning_rate": 9.766104063056201e-06, "loss": 0.0867, "step": 680 }, { "epoch": 0.14627941488234047, "grad_norm": 0.2716229887520234, "learning_rate": 9.75539089443954e-06, "loss": 0.0847, "step": 690 }, { "epoch": 0.1483994064023744, "grad_norm": 0.18039433691787665, "learning_rate": 9.7444439851443e-06, "loss": 0.084, "step": 700 }, { "epoch": 0.15051939792240832, "grad_norm": 0.19975307362406894, "learning_rate": 9.733263873241494e-06, "loss": 0.085, "step": 710 }, { "epoch": 0.15263938944244224, "grad_norm": 0.20672720831030839, "learning_rate": 9.721851108264692e-06, "loss": 0.0854, "step": 720 }, { "epoch": 0.15475938096247616, "grad_norm": 0.22685728560638968, "learning_rate": 9.710206251183015e-06, "loss": 0.0822, "step": 730 }, { "epoch": 0.15687937248251008, "grad_norm": 0.19518820234055553, "learning_rate": 9.698329874373547e-06, "loss": 0.0841, "step": 740 }, { "epoch": 0.158999364002544, "grad_norm": 0.1711051072902191, "learning_rate": 9.686222561593218e-06, "loss": 0.0813, "step": 750 }, { "epoch": 0.16111935552257792, "grad_norm": 0.21782045297391434, "learning_rate": 9.6738849079501e-06, "loss": 0.0811, "step": 760 }, { "epoch": 0.16323934704261184, "grad_norm": 0.24001365708386638, "learning_rate": 9.661317519874156e-06, "loss": 0.0839, "step": 770 }, { "epoch": 0.16535933856264576, "grad_norm": 0.24788434673856172, "learning_rate": 9.648521015087437e-06, "loss": 0.0821, "step": 780 }, { "epoch": 0.16747933008267968, "grad_norm": 0.2361652593535425, "learning_rate": 9.63549602257372e-06, "loss": 0.0815, "step": 790 }, { "epoch": 0.1695993216027136, "grad_norm": 0.20131220073696676, "learning_rate": 9.622243182547584e-06, "loss": 0.0814, "step": 800 }, { "epoch": 0.17171931312274752, "grad_norm": 0.1910839922084592, "learning_rate": 9.608763146422947e-06, "loss": 0.0805, "step": 810 }, { "epoch": 0.17383930464278144, "grad_norm": 0.19343246908459, "learning_rate": 9.59505657678105e-06, "loss": 0.0817, "step": 820 }, { "epoch": 0.17595929616281536, "grad_norm": 0.20795991387602794, "learning_rate": 9.581124147337886e-06, "loss": 0.0829, "step": 830 }, { "epoch": 0.17807928768284928, "grad_norm": 0.2384907168932879, "learning_rate": 9.566966542911079e-06, "loss": 0.0828, "step": 840 }, { "epoch": 0.1801992792028832, "grad_norm": 0.2661800926148604, "learning_rate": 9.552584459386234e-06, "loss": 0.0807, "step": 850 }, { "epoch": 0.18231927072291712, "grad_norm": 0.16793437681402207, "learning_rate": 9.537978603682728e-06, "loss": 0.0808, "step": 860 }, { "epoch": 0.18443926224295104, "grad_norm": 0.17934828150494173, "learning_rate": 9.52314969371896e-06, "loss": 0.084, "step": 870 }, { "epoch": 0.18655925376298496, "grad_norm": 0.2375412871314736, "learning_rate": 9.50809845837707e-06, "loss": 0.0816, "step": 880 }, { "epoch": 0.18867924528301888, "grad_norm": 0.19298906022439435, "learning_rate": 9.492825637467103e-06, "loss": 0.0823, "step": 890 }, { "epoch": 0.19079923680305277, "grad_norm": 0.21813774555943088, "learning_rate": 9.47733198169065e-06, "loss": 0.0783, "step": 900 }, { "epoch": 0.1929192283230867, "grad_norm": 0.20165689985494703, "learning_rate": 9.461618252603956e-06, "loss": 0.0799, "step": 910 }, { "epoch": 0.19503921984312061, "grad_norm": 0.19889073544853322, "learning_rate": 9.44568522258048e-06, "loss": 0.0824, "step": 920 }, { "epoch": 0.19715921136315454, "grad_norm": 0.1677587269136254, "learning_rate": 9.42953367477292e-06, "loss": 0.0817, "step": 930 }, { "epoch": 0.19927920288318846, "grad_norm": 0.27029655878775144, "learning_rate": 9.413164403074744e-06, "loss": 0.0771, "step": 940 }, { "epoch": 0.20139919440322238, "grad_norm": 0.20752899646185938, "learning_rate": 9.398246569397352e-06, "loss": 0.083, "step": 950 }, { "epoch": 0.2035191859232563, "grad_norm": 0.1755855686731489, "learning_rate": 9.381465847779896e-06, "loss": 0.0773, "step": 960 }, { "epoch": 0.20563917744329022, "grad_norm": 0.20944110815210132, "learning_rate": 9.364469764939109e-06, "loss": 0.0856, "step": 970 }, { "epoch": 0.20775916896332414, "grad_norm": 0.20414341977081546, "learning_rate": 9.347259156279697e-06, "loss": 0.0814, "step": 980 }, { "epoch": 0.20987916048335806, "grad_norm": 0.23534197904825535, "learning_rate": 9.329834867750912e-06, "loss": 0.0782, "step": 990 }, { "epoch": 0.21199915200339198, "grad_norm": 0.19706741061553582, "learning_rate": 9.312197755804957e-06, "loss": 0.0813, "step": 1000 }, { "epoch": 0.21199915200339198, "eval_loss": 0.07808271795511246, "eval_runtime": 489.1656, "eval_samples_per_second": 4.183, "eval_steps_per_second": 0.301, "step": 1000 }, { "epoch": 0.2141191435234259, "grad_norm": 0.20255197961584584, "learning_rate": 9.294348687354899e-06, "loss": 0.0786, "step": 1010 }, { "epoch": 0.21623913504345982, "grad_norm": 0.18756797354546684, "learning_rate": 9.278104027838603e-06, "loss": 0.0904, "step": 1020 }, { "epoch": 0.21835912656349374, "grad_norm": 0.1803465926904638, "learning_rate": 9.259854667654485e-06, "loss": 0.0794, "step": 1030 }, { "epoch": 0.22047911808352766, "grad_norm": 0.23334153257650003, "learning_rate": 9.24139592377452e-06, "loss": 0.0787, "step": 1040 }, { "epoch": 0.22259910960356158, "grad_norm": 0.2179720848023895, "learning_rate": 9.222728703497267e-06, "loss": 0.082, "step": 1050 }, { "epoch": 0.2247191011235955, "grad_norm": 0.19658693620368076, "learning_rate": 9.203853924368488e-06, "loss": 0.0774, "step": 1060 }, { "epoch": 0.22683909264362942, "grad_norm": 0.19207266252736302, "learning_rate": 9.18477251413603e-06, "loss": 0.075, "step": 1070 }, { "epoch": 0.22895908416366334, "grad_norm": 0.26026047947148445, "learning_rate": 9.165485410704238e-06, "loss": 0.0767, "step": 1080 }, { "epoch": 0.23107907568369726, "grad_norm": 0.20808377225507274, "learning_rate": 9.145993562087848e-06, "loss": 0.0784, "step": 1090 }, { "epoch": 0.23319906720373118, "grad_norm": 0.18546577944881873, "learning_rate": 9.12629792636539e-06, "loss": 0.0761, "step": 1100 }, { "epoch": 0.2353190587237651, "grad_norm": 0.17056106678355273, "learning_rate": 9.1063994716321e-06, "loss": 0.079, "step": 1110 }, { "epoch": 0.23743905024379902, "grad_norm": 0.23748473849259574, "learning_rate": 9.086299175952327e-06, "loss": 0.0769, "step": 1120 }, { "epoch": 0.23955904176383294, "grad_norm": 0.21686782311110087, "learning_rate": 9.065998027311467e-06, "loss": 0.0783, "step": 1130 }, { "epoch": 0.24167903328386686, "grad_norm": 0.18252100885280442, "learning_rate": 9.045497023567396e-06, "loss": 0.08, "step": 1140 }, { "epoch": 0.24379902480390078, "grad_norm": 0.24544524196615075, "learning_rate": 9.024797172401426e-06, "loss": 0.08, "step": 1150 }, { "epoch": 0.2459190163239347, "grad_norm": 0.19645758097319682, "learning_rate": 9.003899491268768e-06, "loss": 0.0798, "step": 1160 }, { "epoch": 0.24803900784396862, "grad_norm": 0.21351292431439828, "learning_rate": 8.982805007348531e-06, "loss": 0.0754, "step": 1170 }, { "epoch": 0.25015899936400254, "grad_norm": 0.17911187274119753, "learning_rate": 8.961514757493224e-06, "loss": 0.0772, "step": 1180 }, { "epoch": 0.2522789908840365, "grad_norm": 0.22385841458052336, "learning_rate": 8.940029788177795e-06, "loss": 0.0773, "step": 1190 }, { "epoch": 0.2543989824040704, "grad_norm": 0.1683385451298309, "learning_rate": 8.9183511554482e-06, "loss": 0.0747, "step": 1200 }, { "epoch": 0.2565189739241043, "grad_norm": 0.2372277626069385, "learning_rate": 8.896479924869483e-06, "loss": 0.076, "step": 1210 }, { "epoch": 0.2586389654441382, "grad_norm": 0.2022762684424488, "learning_rate": 8.874417171473415e-06, "loss": 0.074, "step": 1220 }, { "epoch": 0.2607589569641721, "grad_norm": 0.19827726532798173, "learning_rate": 8.852163979705639e-06, "loss": 0.0782, "step": 1230 }, { "epoch": 0.26287894848420607, "grad_norm": 0.2343304853478425, "learning_rate": 8.829721443372378e-06, "loss": 0.0756, "step": 1240 }, { "epoch": 0.26499894000423996, "grad_norm": 0.2217975820152699, "learning_rate": 8.807090665586664e-06, "loss": 0.0777, "step": 1250 }, { "epoch": 0.2671189315242739, "grad_norm": 0.20059042051568582, "learning_rate": 8.784272758714118e-06, "loss": 0.0738, "step": 1260 }, { "epoch": 0.2692389230443078, "grad_norm": 0.18668008819406168, "learning_rate": 8.761268844318282e-06, "loss": 0.0757, "step": 1270 }, { "epoch": 0.27135891456434175, "grad_norm": 0.24287290051699115, "learning_rate": 8.73808005310548e-06, "loss": 0.0762, "step": 1280 }, { "epoch": 0.27347890608437564, "grad_norm": 0.18654864916400832, "learning_rate": 8.714707524869245e-06, "loss": 0.0795, "step": 1290 }, { "epoch": 0.2755988976044096, "grad_norm": 0.2341640233496617, "learning_rate": 8.691152408434296e-06, "loss": 0.0732, "step": 1300 }, { "epoch": 0.2777188891244435, "grad_norm": 0.2082849165240921, "learning_rate": 8.66741586160007e-06, "loss": 0.0774, "step": 1310 }, { "epoch": 0.27983888064447743, "grad_norm": 0.22099657211974957, "learning_rate": 8.643499051083812e-06, "loss": 0.0738, "step": 1320 }, { "epoch": 0.2819588721645113, "grad_norm": 0.20266362710361852, "learning_rate": 8.619403152463231e-06, "loss": 0.0765, "step": 1330 }, { "epoch": 0.28407886368454527, "grad_norm": 0.2061785256600231, "learning_rate": 8.595129350118707e-06, "loss": 0.0743, "step": 1340 }, { "epoch": 0.28619885520457916, "grad_norm": 0.16451845890324332, "learning_rate": 8.570678837175089e-06, "loss": 0.0731, "step": 1350 }, { "epoch": 0.2883188467246131, "grad_norm": 0.2039051062049253, "learning_rate": 8.546052815443041e-06, "loss": 0.075, "step": 1360 }, { "epoch": 0.290438838244647, "grad_norm": 0.20907838865950076, "learning_rate": 8.521252495359971e-06, "loss": 0.0779, "step": 1370 }, { "epoch": 0.29255882976468095, "grad_norm": 0.24041556209946813, "learning_rate": 8.496279095930535e-06, "loss": 0.0752, "step": 1380 }, { "epoch": 0.29467882128471484, "grad_norm": 0.749878332883915, "learning_rate": 8.471133844666721e-06, "loss": 0.0736, "step": 1390 }, { "epoch": 0.2967988128047488, "grad_norm": 0.20890550228302898, "learning_rate": 8.445817977527513e-06, "loss": 0.075, "step": 1400 }, { "epoch": 0.2989188043247827, "grad_norm": 0.19623374633823948, "learning_rate": 8.420332738858136e-06, "loss": 0.0764, "step": 1410 }, { "epoch": 0.30103879584481663, "grad_norm": 0.1803239880391424, "learning_rate": 8.394679381328904e-06, "loss": 0.0782, "step": 1420 }, { "epoch": 0.3031587873648505, "grad_norm": 0.20782215083834218, "learning_rate": 8.368859165873629e-06, "loss": 0.075, "step": 1430 }, { "epoch": 0.30527877888488447, "grad_norm": 0.15864327444836024, "learning_rate": 8.342873361627663e-06, "loss": 0.0736, "step": 1440 }, { "epoch": 0.30739877040491836, "grad_norm": 0.16758113802075347, "learning_rate": 8.316723245865503e-06, "loss": 0.0743, "step": 1450 }, { "epoch": 0.3095187619249523, "grad_norm": 0.18187491458078606, "learning_rate": 8.290410103938015e-06, "loss": 0.0763, "step": 1460 }, { "epoch": 0.3116387534449862, "grad_norm": 0.21015841421612225, "learning_rate": 8.263935229209255e-06, "loss": 0.0778, "step": 1470 }, { "epoch": 0.31375874496502015, "grad_norm": 0.18123545189165477, "learning_rate": 8.237299922992894e-06, "loss": 0.0737, "step": 1480 }, { "epoch": 0.31587873648505405, "grad_norm": 0.22166407582198208, "learning_rate": 8.210505494488257e-06, "loss": 0.0747, "step": 1490 }, { "epoch": 0.317998728005088, "grad_norm": 0.20907147507686014, "learning_rate": 8.183553260715971e-06, "loss": 0.0753, "step": 1500 }, { "epoch": 0.3201187195251219, "grad_norm": 0.18137783305550276, "learning_rate": 8.15644454645323e-06, "loss": 0.076, "step": 1510 }, { "epoch": 0.32223871104515583, "grad_norm": 0.18468699366411487, "learning_rate": 8.129180684168683e-06, "loss": 0.0756, "step": 1520 }, { "epoch": 0.3243587025651897, "grad_norm": 0.25213749529106116, "learning_rate": 8.101763013956933e-06, "loss": 0.0746, "step": 1530 }, { "epoch": 0.3264786940852237, "grad_norm": 0.2161642309592381, "learning_rate": 8.074192883472667e-06, "loss": 0.0759, "step": 1540 }, { "epoch": 0.32859868560525757, "grad_norm": 0.20176374830183055, "learning_rate": 8.04647164786442e-06, "loss": 0.0731, "step": 1550 }, { "epoch": 0.3307186771252915, "grad_norm": 0.1969763529692524, "learning_rate": 8.01860066970797e-06, "loss": 0.0747, "step": 1560 }, { "epoch": 0.3328386686453254, "grad_norm": 0.2060140719520926, "learning_rate": 7.990581318939346e-06, "loss": 0.0776, "step": 1570 }, { "epoch": 0.33495866016535936, "grad_norm": 0.19063310891295998, "learning_rate": 7.962414972787513e-06, "loss": 0.0732, "step": 1580 }, { "epoch": 0.33707865168539325, "grad_norm": 0.32170213151067406, "learning_rate": 7.934103015706665e-06, "loss": 0.0718, "step": 1590 }, { "epoch": 0.3391986432054272, "grad_norm": 0.3017816089089381, "learning_rate": 7.905646839308171e-06, "loss": 0.0713, "step": 1600 }, { "epoch": 0.3413186347254611, "grad_norm": 0.23008639936227068, "learning_rate": 7.877047842292193e-06, "loss": 0.0761, "step": 1610 }, { "epoch": 0.34343862624549504, "grad_norm": 0.21592149981422365, "learning_rate": 7.84830743037891e-06, "loss": 0.0743, "step": 1620 }, { "epoch": 0.34555861776552893, "grad_norm": 0.19186122210621318, "learning_rate": 7.819427016239447e-06, "loss": 0.0727, "step": 1630 }, { "epoch": 0.3476786092855629, "grad_norm": 0.19028094584781277, "learning_rate": 7.790408019426424e-06, "loss": 0.0732, "step": 1640 }, { "epoch": 0.34979860080559677, "grad_norm": 0.18613264864125317, "learning_rate": 7.761251866304176e-06, "loss": 0.0735, "step": 1650 }, { "epoch": 0.3519185923256307, "grad_norm": 0.25693606830513216, "learning_rate": 7.731959989978667e-06, "loss": 0.0761, "step": 1660 }, { "epoch": 0.3540385838456646, "grad_norm": 0.21174972493824354, "learning_rate": 7.702533830227024e-06, "loss": 0.073, "step": 1670 }, { "epoch": 0.35615857536569856, "grad_norm": 0.17143400242703105, "learning_rate": 7.672974833426779e-06, "loss": 0.0737, "step": 1680 }, { "epoch": 0.35827856688573245, "grad_norm": 0.1896798767328968, "learning_rate": 7.643284452484773e-06, "loss": 0.0725, "step": 1690 }, { "epoch": 0.3603985584057664, "grad_norm": 0.20989812969482, "learning_rate": 7.613464146765748e-06, "loss": 0.0728, "step": 1700 }, { "epoch": 0.3625185499258003, "grad_norm": 0.19528230055400297, "learning_rate": 7.583515382020603e-06, "loss": 0.0732, "step": 1710 }, { "epoch": 0.36463854144583424, "grad_norm": 0.21719433509159808, "learning_rate": 7.5534396303143605e-06, "loss": 0.0704, "step": 1720 }, { "epoch": 0.36675853296586813, "grad_norm": 0.17922009396269356, "learning_rate": 7.523238369953802e-06, "loss": 0.0683, "step": 1730 }, { "epoch": 0.3688785244859021, "grad_norm": 0.43299284172403135, "learning_rate": 7.4929130854148105e-06, "loss": 0.0724, "step": 1740 }, { "epoch": 0.370998516005936, "grad_norm": 0.17140377305963891, "learning_rate": 7.4624652672693984e-06, "loss": 0.0748, "step": 1750 }, { "epoch": 0.3731185075259699, "grad_norm": 0.21895097629186294, "learning_rate": 7.43189641211245e-06, "loss": 0.0731, "step": 1760 }, { "epoch": 0.3752384990460038, "grad_norm": 0.217978788931641, "learning_rate": 7.401208022488152e-06, "loss": 0.0742, "step": 1770 }, { "epoch": 0.37735849056603776, "grad_norm": 0.1778572661187928, "learning_rate": 7.370401606816142e-06, "loss": 0.0699, "step": 1780 }, { "epoch": 0.37947848208607166, "grad_norm": 0.22879874120541474, "learning_rate": 7.339478679317369e-06, "loss": 0.0762, "step": 1790 }, { "epoch": 0.38159847360610555, "grad_norm": 0.20685054508670736, "learning_rate": 7.308440759939659e-06, "loss": 0.0717, "step": 1800 }, { "epoch": 0.3837184651261395, "grad_norm": 0.23785935863776791, "learning_rate": 7.277289374283009e-06, "loss": 0.0736, "step": 1810 }, { "epoch": 0.3858384566461734, "grad_norm": 0.25317055966913016, "learning_rate": 7.246026053524603e-06, "loss": 0.0729, "step": 1820 }, { "epoch": 0.38795844816620734, "grad_norm": 0.17578925874031257, "learning_rate": 7.214652334343539e-06, "loss": 0.0736, "step": 1830 }, { "epoch": 0.39007843968624123, "grad_norm": 0.20527746180283624, "learning_rate": 7.183169758845308e-06, "loss": 0.0738, "step": 1840 }, { "epoch": 0.3921984312062752, "grad_norm": 0.17762579012389196, "learning_rate": 7.151579874485995e-06, "loss": 0.0713, "step": 1850 }, { "epoch": 0.39431842272630907, "grad_norm": 0.200855955585812, "learning_rate": 7.119884233996208e-06, "loss": 0.0712, "step": 1860 }, { "epoch": 0.396438414246343, "grad_norm": 0.18153714925752748, "learning_rate": 7.088084395304765e-06, "loss": 0.0716, "step": 1870 }, { "epoch": 0.3985584057663769, "grad_norm": 0.30888829292022363, "learning_rate": 7.0561819214621186e-06, "loss": 0.0709, "step": 1880 }, { "epoch": 0.40067839728641086, "grad_norm": 0.1677914580623982, "learning_rate": 7.024178380563517e-06, "loss": 0.0686, "step": 1890 }, { "epoch": 0.40279838880644475, "grad_norm": 0.23978151499288752, "learning_rate": 6.99207534567194e-06, "loss": 0.0733, "step": 1900 }, { "epoch": 0.4049183803264787, "grad_norm": 0.1496936015385523, "learning_rate": 6.959874394740775e-06, "loss": 0.0703, "step": 1910 }, { "epoch": 0.4070383718465126, "grad_norm": 0.17861927332885064, "learning_rate": 6.927577110536251e-06, "loss": 0.0709, "step": 1920 }, { "epoch": 0.40915836336654654, "grad_norm": 0.20321672481515973, "learning_rate": 6.895185080559649e-06, "loss": 0.0718, "step": 1930 }, { "epoch": 0.41127835488658043, "grad_norm": 0.1702613475343327, "learning_rate": 6.862699896969262e-06, "loss": 0.0726, "step": 1940 }, { "epoch": 0.4133983464066144, "grad_norm": 0.19820271991017135, "learning_rate": 6.830123156502147e-06, "loss": 0.0722, "step": 1950 }, { "epoch": 0.4155183379266483, "grad_norm": 0.21659807580282067, "learning_rate": 6.7974564603956395e-06, "loss": 0.072, "step": 1960 }, { "epoch": 0.4176383294466822, "grad_norm": 0.2484602203487257, "learning_rate": 6.7647014143086334e-06, "loss": 0.0707, "step": 1970 }, { "epoch": 0.4197583209667161, "grad_norm": 0.2428781266233325, "learning_rate": 6.7318596282426796e-06, "loss": 0.0726, "step": 1980 }, { "epoch": 0.42187831248675006, "grad_norm": 0.19890188197759706, "learning_rate": 6.6989327164628375e-06, "loss": 0.0735, "step": 1990 }, { "epoch": 0.42399830400678395, "grad_norm": 0.17618893341728534, "learning_rate": 6.665922297418328e-06, "loss": 0.0717, "step": 2000 }, { "epoch": 0.42399830400678395, "eval_loss": 0.07119767367839813, "eval_runtime": 489.4309, "eval_samples_per_second": 4.18, "eval_steps_per_second": 0.3, "step": 2000 }, { "epoch": 0.4261182955268179, "grad_norm": 0.2226486798390119, "learning_rate": 6.632829993662994e-06, "loss": 0.0698, "step": 2010 }, { "epoch": 0.4282382870468518, "grad_norm": 0.15591714439754456, "learning_rate": 6.599657431775529e-06, "loss": 0.073, "step": 2020 }, { "epoch": 0.43035827856688574, "grad_norm": 0.18569107867432982, "learning_rate": 6.566406242279546e-06, "loss": 0.0701, "step": 2030 }, { "epoch": 0.43247827008691964, "grad_norm": 0.2044929271888512, "learning_rate": 6.53307805956342e-06, "loss": 0.0684, "step": 2040 }, { "epoch": 0.4345982616069536, "grad_norm": 0.1591048247213101, "learning_rate": 6.4996745217999566e-06, "loss": 0.0712, "step": 2050 }, { "epoch": 0.4367182531269875, "grad_norm": 0.18457570554796743, "learning_rate": 6.4661972708658715e-06, "loss": 0.0682, "step": 2060 }, { "epoch": 0.4388382446470214, "grad_norm": 0.18024866180958676, "learning_rate": 6.4326479522610855e-06, "loss": 0.0703, "step": 2070 }, { "epoch": 0.4409582361670553, "grad_norm": 0.17393779181333482, "learning_rate": 6.399028215027849e-06, "loss": 0.0677, "step": 2080 }, { "epoch": 0.44307822768708927, "grad_norm": 0.15822079895374294, "learning_rate": 6.365339711669687e-06, "loss": 0.0696, "step": 2090 }, { "epoch": 0.44519821920712316, "grad_norm": 0.17783185791820674, "learning_rate": 6.331584098070159e-06, "loss": 0.0729, "step": 2100 }, { "epoch": 0.4473182107271571, "grad_norm": 0.17784945554783102, "learning_rate": 6.2977630334114904e-06, "loss": 0.0706, "step": 2110 }, { "epoch": 0.449438202247191, "grad_norm": 0.21655542057286598, "learning_rate": 6.263878180093004e-06, "loss": 0.0734, "step": 2120 }, { "epoch": 0.45155819376722495, "grad_norm": 0.1933797514771672, "learning_rate": 6.2299312036494134e-06, "loss": 0.069, "step": 2130 }, { "epoch": 0.45367818528725884, "grad_norm": 0.15757976242950295, "learning_rate": 6.195923772668955e-06, "loss": 0.0722, "step": 2140 }, { "epoch": 0.4557981768072928, "grad_norm": 0.20409740685630307, "learning_rate": 6.161857558711372e-06, "loss": 0.0705, "step": 2150 }, { "epoch": 0.4579181683273267, "grad_norm": 0.15041431962094184, "learning_rate": 6.12773423622576e-06, "loss": 0.0695, "step": 2160 }, { "epoch": 0.46003815984736063, "grad_norm": 0.281897607782115, "learning_rate": 6.0935554824682556e-06, "loss": 0.0704, "step": 2170 }, { "epoch": 0.4621581513673945, "grad_norm": 0.22084672726453938, "learning_rate": 6.059322977419591e-06, "loss": 0.0705, "step": 2180 }, { "epoch": 0.46427814288742847, "grad_norm": 0.20019812476026203, "learning_rate": 6.02503840370253e-06, "loss": 0.0703, "step": 2190 }, { "epoch": 0.46639813440746236, "grad_norm": 0.17909334136517222, "learning_rate": 5.990703446499153e-06, "loss": 0.0706, "step": 2200 }, { "epoch": 0.4685181259274963, "grad_norm": 0.16644185623431462, "learning_rate": 5.9563197934680325e-06, "loss": 0.0746, "step": 2210 }, { "epoch": 0.4706381174475302, "grad_norm": 0.23611788687622157, "learning_rate": 5.921889134661272e-06, "loss": 0.0715, "step": 2220 }, { "epoch": 0.47275810896756415, "grad_norm": 0.1692697227784412, "learning_rate": 5.887413162441438e-06, "loss": 0.0703, "step": 2230 }, { "epoch": 0.47487810048759804, "grad_norm": 0.16272992258196417, "learning_rate": 5.852893571398385e-06, "loss": 0.0703, "step": 2240 }, { "epoch": 0.476998092007632, "grad_norm": 0.16602591153652455, "learning_rate": 5.818332058265948e-06, "loss": 0.0682, "step": 2250 }, { "epoch": 0.4791180835276659, "grad_norm": 0.15187588978068958, "learning_rate": 5.783730321838548e-06, "loss": 0.0658, "step": 2260 }, { "epoch": 0.48123807504769983, "grad_norm": 0.21228071370192056, "learning_rate": 5.749090062887697e-06, "loss": 0.07, "step": 2270 }, { "epoch": 0.4833580665677337, "grad_norm": 0.1935655119130272, "learning_rate": 5.714412984078393e-06, "loss": 0.0699, "step": 2280 }, { "epoch": 0.48547805808776767, "grad_norm": 0.1611360908597304, "learning_rate": 5.679700789885436e-06, "loss": 0.0715, "step": 2290 }, { "epoch": 0.48759804960780156, "grad_norm": 0.2436477600612657, "learning_rate": 5.644955186509641e-06, "loss": 0.0689, "step": 2300 }, { "epoch": 0.4897180411278355, "grad_norm": 0.24133950450204542, "learning_rate": 5.610177881793976e-06, "loss": 0.0693, "step": 2310 }, { "epoch": 0.4918380326478694, "grad_norm": 0.20263042804166118, "learning_rate": 5.5753705851396236e-06, "loss": 0.0692, "step": 2320 }, { "epoch": 0.49395802416790335, "grad_norm": 0.1758643154419978, "learning_rate": 5.54053500742195e-06, "loss": 0.0717, "step": 2330 }, { "epoch": 0.49607801568793725, "grad_norm": 0.17041444204200845, "learning_rate": 5.505672860906412e-06, "loss": 0.0731, "step": 2340 }, { "epoch": 0.4981980072079712, "grad_norm": 0.16318236116620452, "learning_rate": 5.470785859164402e-06, "loss": 0.0717, "step": 2350 }, { "epoch": 0.5003179987280051, "grad_norm": 0.1684480788354608, "learning_rate": 5.435875716989013e-06, "loss": 0.0731, "step": 2360 }, { "epoch": 0.502437990248039, "grad_norm": 0.16940752138117054, "learning_rate": 5.400944150310754e-06, "loss": 0.0686, "step": 2370 }, { "epoch": 0.504557981768073, "grad_norm": 0.18543062436184285, "learning_rate": 5.3659928761132084e-06, "loss": 0.0712, "step": 2380 }, { "epoch": 0.5066779732881068, "grad_norm": 0.18981591633920203, "learning_rate": 5.3310236123486396e-06, "loss": 0.0713, "step": 2390 }, { "epoch": 0.5087979648081408, "grad_norm": 0.20107039697147697, "learning_rate": 5.296038077853545e-06, "loss": 0.0724, "step": 2400 }, { "epoch": 0.5109179563281747, "grad_norm": 0.15561965976521763, "learning_rate": 5.261037992264182e-06, "loss": 0.0691, "step": 2410 }, { "epoch": 0.5130379478482086, "grad_norm": 0.18814302974879546, "learning_rate": 5.226025075932024e-06, "loss": 0.0725, "step": 2420 }, { "epoch": 0.5151579393682425, "grad_norm": 0.19409510196146995, "learning_rate": 5.191001049839218e-06, "loss": 0.0718, "step": 2430 }, { "epoch": 0.5172779308882764, "grad_norm": 0.1948905204885732, "learning_rate": 5.155967635513985e-06, "loss": 0.0689, "step": 2440 }, { "epoch": 0.5193979224083104, "grad_norm": 0.15910287909271553, "learning_rate": 5.120926554946003e-06, "loss": 0.07, "step": 2450 }, { "epoch": 0.5215179139283442, "grad_norm": 0.16754971258212684, "learning_rate": 5.0858795305017696e-06, "loss": 0.0697, "step": 2460 }, { "epoch": 0.5236379054483782, "grad_norm": 0.19912027070603852, "learning_rate": 5.050828284839936e-06, "loss": 0.0707, "step": 2470 }, { "epoch": 0.5257578969684121, "grad_norm": 0.1770839557299797, "learning_rate": 5.015774540826639e-06, "loss": 0.0708, "step": 2480 }, { "epoch": 0.5278778884884461, "grad_norm": 0.1879664250171856, "learning_rate": 4.980720021450822e-06, "loss": 0.0719, "step": 2490 }, { "epoch": 0.5299978800084799, "grad_norm": 0.2038214395747643, "learning_rate": 4.945666449739534e-06, "loss": 0.0724, "step": 2500 }, { "epoch": 0.5321178715285139, "grad_norm": 0.14313855897723543, "learning_rate": 4.910615548673245e-06, "loss": 0.0671, "step": 2510 }, { "epoch": 0.5342378630485478, "grad_norm": 0.1667988114624785, "learning_rate": 4.875569041101152e-06, "loss": 0.0704, "step": 2520 }, { "epoch": 0.5363578545685818, "grad_norm": 0.15027735583780358, "learning_rate": 4.840528649656507e-06, "loss": 0.0683, "step": 2530 }, { "epoch": 0.5384778460886156, "grad_norm": 0.18256318111022773, "learning_rate": 4.805496096671933e-06, "loss": 0.0723, "step": 2540 }, { "epoch": 0.5405978376086495, "grad_norm": 0.1581087386884916, "learning_rate": 4.77047310409477e-06, "loss": 0.0678, "step": 2550 }, { "epoch": 0.5427178291286835, "grad_norm": 0.15440122313131024, "learning_rate": 4.735461393402437e-06, "loss": 0.0683, "step": 2560 }, { "epoch": 0.5448378206487174, "grad_norm": 0.16903444896901648, "learning_rate": 4.700462685517822e-06, "loss": 0.069, "step": 2570 }, { "epoch": 0.5469578121687513, "grad_norm": 0.16890011975539956, "learning_rate": 4.665478700724684e-06, "loss": 0.0684, "step": 2580 }, { "epoch": 0.5490778036887852, "grad_norm": 0.20866405931584792, "learning_rate": 4.630511158583102e-06, "loss": 0.0698, "step": 2590 }, { "epoch": 0.5511977952088192, "grad_norm": 0.17297937706938452, "learning_rate": 4.595561777844954e-06, "loss": 0.0683, "step": 2600 }, { "epoch": 0.5533177867288531, "grad_norm": 0.19334365079463428, "learning_rate": 4.560632276369436e-06, "loss": 0.071, "step": 2610 }, { "epoch": 0.555437778248887, "grad_norm": 0.1566806985390742, "learning_rate": 4.525724371038616e-06, "loss": 0.0681, "step": 2620 }, { "epoch": 0.5575577697689209, "grad_norm": 0.19883045801104277, "learning_rate": 4.4908397776730634e-06, "loss": 0.0693, "step": 2630 }, { "epoch": 0.5596777612889549, "grad_norm": 0.18015525710269312, "learning_rate": 4.455980210947488e-06, "loss": 0.0694, "step": 2640 }, { "epoch": 0.5617977528089888, "grad_norm": 0.19996437083442065, "learning_rate": 4.421147384306476e-06, "loss": 0.0724, "step": 2650 }, { "epoch": 0.5639177443290226, "grad_norm": 0.1576506824802755, "learning_rate": 4.3863430098802674e-06, "loss": 0.0676, "step": 2660 }, { "epoch": 0.5660377358490566, "grad_norm": 0.15643885696863916, "learning_rate": 4.35156879840059e-06, "loss": 0.0711, "step": 2670 }, { "epoch": 0.5681577273690905, "grad_norm": 0.1810361041257664, "learning_rate": 4.3168264591165825e-06, "loss": 0.0673, "step": 2680 }, { "epoch": 0.5702777188891245, "grad_norm": 0.18342485320088614, "learning_rate": 4.282117699710775e-06, "loss": 0.0693, "step": 2690 }, { "epoch": 0.5723977104091583, "grad_norm": 0.1715806871966692, "learning_rate": 4.247444226215157e-06, "loss": 0.0663, "step": 2700 }, { "epoch": 0.5745177019291923, "grad_norm": 0.182483141924479, "learning_rate": 4.212807742927315e-06, "loss": 0.0679, "step": 2710 }, { "epoch": 0.5766376934492262, "grad_norm": 0.17017972956968302, "learning_rate": 4.178209952326659e-06, "loss": 0.0708, "step": 2720 }, { "epoch": 0.5787576849692602, "grad_norm": 0.17249912512947316, "learning_rate": 4.143652554990756e-06, "loss": 0.0665, "step": 2730 }, { "epoch": 0.580877676489294, "grad_norm": 0.16601147330223942, "learning_rate": 4.109137249511726e-06, "loss": 0.0663, "step": 2740 }, { "epoch": 0.582997668009328, "grad_norm": 0.18185554052245853, "learning_rate": 4.074665732412753e-06, "loss": 0.0678, "step": 2750 }, { "epoch": 0.5851176595293619, "grad_norm": 0.16710135698081338, "learning_rate": 4.040239698064712e-06, "loss": 0.0679, "step": 2760 }, { "epoch": 0.5872376510493958, "grad_norm": 0.14000708323466857, "learning_rate": 4.005860838602863e-06, "loss": 0.0697, "step": 2770 }, { "epoch": 0.5893576425694297, "grad_norm": 0.14927867611572637, "learning_rate": 3.971530843843694e-06, "loss": 0.0688, "step": 2780 }, { "epoch": 0.5914776340894636, "grad_norm": 0.15171093238665134, "learning_rate": 3.9372514012018596e-06, "loss": 0.0699, "step": 2790 }, { "epoch": 0.5935976256094976, "grad_norm": 0.1804183937869093, "learning_rate": 3.903024195607232e-06, "loss": 0.0716, "step": 2800 }, { "epoch": 0.5957176171295315, "grad_norm": 0.15215129472796332, "learning_rate": 3.868850909422092e-06, "loss": 0.0698, "step": 2810 }, { "epoch": 0.5978376086495654, "grad_norm": 0.15088502756447672, "learning_rate": 3.834733222358427e-06, "loss": 0.0687, "step": 2820 }, { "epoch": 0.5999576001695993, "grad_norm": 0.18595108870060142, "learning_rate": 3.80067281139538e-06, "loss": 0.0724, "step": 2830 }, { "epoch": 0.6020775916896333, "grad_norm": 0.16954135988504473, "learning_rate": 3.7666713506968052e-06, "loss": 0.0691, "step": 2840 }, { "epoch": 0.6041975832096672, "grad_norm": 0.18097495350034157, "learning_rate": 3.7327305115289938e-06, "loss": 0.066, "step": 2850 }, { "epoch": 0.606317574729701, "grad_norm": 0.15243124437822092, "learning_rate": 3.69885196217852e-06, "loss": 0.0682, "step": 2860 }, { "epoch": 0.608437566249735, "grad_norm": 0.1614824984725212, "learning_rate": 3.66503736787024e-06, "loss": 0.0637, "step": 2870 }, { "epoch": 0.6105575577697689, "grad_norm": 0.16257796428966814, "learning_rate": 3.6312883906854376e-06, "loss": 0.0674, "step": 2880 }, { "epoch": 0.6126775492898029, "grad_norm": 0.1786290065781706, "learning_rate": 3.5976066894801386e-06, "loss": 0.0657, "step": 2890 }, { "epoch": 0.6147975408098367, "grad_norm": 0.1489676922818998, "learning_rate": 3.5639939198035655e-06, "loss": 0.0662, "step": 2900 }, { "epoch": 0.6169175323298707, "grad_norm": 0.15203380554832843, "learning_rate": 3.530451733816762e-06, "loss": 0.0682, "step": 2910 }, { "epoch": 0.6190375238499046, "grad_norm": 0.20295326197958097, "learning_rate": 3.496981780211392e-06, "loss": 0.0685, "step": 2920 }, { "epoch": 0.6211575153699386, "grad_norm": 0.18783757751530197, "learning_rate": 3.4635857041286922e-06, "loss": 0.0696, "step": 2930 }, { "epoch": 0.6232775068899724, "grad_norm": 0.14570978880022487, "learning_rate": 3.430265147078616e-06, "loss": 0.0702, "step": 2940 }, { "epoch": 0.6253974984100064, "grad_norm": 0.14375379036873775, "learning_rate": 3.3970217468591486e-06, "loss": 0.0664, "step": 2950 }, { "epoch": 0.6275174899300403, "grad_norm": 0.173702914525196, "learning_rate": 3.3638571374758e-06, "loss": 0.0657, "step": 2960 }, { "epoch": 0.6296374814500743, "grad_norm": 0.15569914868699147, "learning_rate": 3.3307729490612896e-06, "loss": 0.0659, "step": 2970 }, { "epoch": 0.6317574729701081, "grad_norm": 0.18252127530194195, "learning_rate": 3.297770807795425e-06, "loss": 0.0665, "step": 2980 }, { "epoch": 0.633877464490142, "grad_norm": 0.18473089844858295, "learning_rate": 3.2648523358251726e-06, "loss": 0.068, "step": 2990 }, { "epoch": 0.635997456010176, "grad_norm": 0.156014132437691, "learning_rate": 3.232019151184913e-06, "loss": 0.0664, "step": 3000 }, { "epoch": 0.635997456010176, "eval_loss": 0.06693108379840851, "eval_runtime": 487.8882, "eval_samples_per_second": 4.194, "eval_steps_per_second": 0.301, "step": 3000 }, { "epoch": 0.6381174475302098, "grad_norm": 0.17571197418197826, "learning_rate": 3.1992728677169214e-06, "loss": 0.0688, "step": 3010 }, { "epoch": 0.6402374390502438, "grad_norm": 0.14947601720845555, "learning_rate": 3.1666150949920393e-06, "loss": 0.0665, "step": 3020 }, { "epoch": 0.6423574305702777, "grad_norm": 0.15331032877554068, "learning_rate": 3.1340474382305585e-06, "loss": 0.0655, "step": 3030 }, { "epoch": 0.6444774220903117, "grad_norm": 0.18933623167552627, "learning_rate": 3.101571498223317e-06, "loss": 0.0649, "step": 3040 }, { "epoch": 0.6465974136103455, "grad_norm": 0.15247439973376195, "learning_rate": 3.069188871253026e-06, "loss": 0.0649, "step": 3050 }, { "epoch": 0.6487174051303795, "grad_norm": 0.16943772711604502, "learning_rate": 3.0369011490157984e-06, "loss": 0.0692, "step": 3060 }, { "epoch": 0.6508373966504134, "grad_norm": 0.15521523385110902, "learning_rate": 3.0047099185429142e-06, "loss": 0.0654, "step": 3070 }, { "epoch": 0.6529573881704474, "grad_norm": 0.14728105383234777, "learning_rate": 2.9726167621228187e-06, "loss": 0.0657, "step": 3080 }, { "epoch": 0.6550773796904812, "grad_norm": 0.1832509363427932, "learning_rate": 2.940623257223341e-06, "loss": 0.0665, "step": 3090 }, { "epoch": 0.6571973712105151, "grad_norm": 0.15168423601274655, "learning_rate": 2.9087309764141613e-06, "loss": 0.0665, "step": 3100 }, { "epoch": 0.6593173627305491, "grad_norm": 0.1483275502933062, "learning_rate": 2.876941487289522e-06, "loss": 0.072, "step": 3110 }, { "epoch": 0.661437354250583, "grad_norm": 0.15452416173310596, "learning_rate": 2.845256352391157e-06, "loss": 0.0687, "step": 3120 }, { "epoch": 0.6635573457706169, "grad_norm": 0.16759174006680952, "learning_rate": 2.8136771291315063e-06, "loss": 0.0669, "step": 3130 }, { "epoch": 0.6656773372906508, "grad_norm": 0.14998494541872762, "learning_rate": 2.7822053697171588e-06, "loss": 0.0666, "step": 3140 }, { "epoch": 0.6677973288106848, "grad_norm": 0.17131639340630408, "learning_rate": 2.7508426210725546e-06, "loss": 0.0672, "step": 3150 }, { "epoch": 0.6699173203307187, "grad_norm": 0.19399216153317256, "learning_rate": 2.7195904247639544e-06, "loss": 0.0662, "step": 3160 }, { "epoch": 0.6720373118507526, "grad_norm": 0.15393012051599972, "learning_rate": 2.68845031692366e-06, "loss": 0.0685, "step": 3170 }, { "epoch": 0.6741573033707865, "grad_norm": 0.1761419745993989, "learning_rate": 2.657423828174518e-06, "loss": 0.0644, "step": 3180 }, { "epoch": 0.6762772948908204, "grad_norm": 0.16292970391303543, "learning_rate": 2.626512483554678e-06, "loss": 0.0673, "step": 3190 }, { "epoch": 0.6783972864108544, "grad_norm": 0.15248743923822936, "learning_rate": 2.595717802442636e-06, "loss": 0.0636, "step": 3200 }, { "epoch": 0.6805172779308882, "grad_norm": 0.17164291620759312, "learning_rate": 2.5650412984825535e-06, "loss": 0.0661, "step": 3210 }, { "epoch": 0.6826372694509222, "grad_norm": 0.14003403542018764, "learning_rate": 2.5344844795098577e-06, "loss": 0.0644, "step": 3220 }, { "epoch": 0.6847572609709561, "grad_norm": 0.13906331383996035, "learning_rate": 2.5040488474771183e-06, "loss": 0.0664, "step": 3230 }, { "epoch": 0.6868772524909901, "grad_norm": 0.1654974091386292, "learning_rate": 2.4737358983802417e-06, "loss": 0.0657, "step": 3240 }, { "epoch": 0.6889972440110239, "grad_norm": 0.17123238672779562, "learning_rate": 2.443547122184921e-06, "loss": 0.0684, "step": 3250 }, { "epoch": 0.6911172355310579, "grad_norm": 0.13771748743849033, "learning_rate": 2.416484617979397e-06, "loss": 0.0718, "step": 3260 }, { "epoch": 0.6932372270510918, "grad_norm": 0.14445999863423453, "learning_rate": 2.386535853234254e-06, "loss": 0.0703, "step": 3270 }, { "epoch": 0.6953572185711258, "grad_norm": 0.15853968355485656, "learning_rate": 2.356715547515228e-06, "loss": 0.071, "step": 3280 }, { "epoch": 0.6974772100911596, "grad_norm": 0.16059029746896103, "learning_rate": 2.3270251665732236e-06, "loss": 0.0682, "step": 3290 }, { "epoch": 0.6995972016111935, "grad_norm": 0.1311794653898363, "learning_rate": 2.2974661697729777e-06, "loss": 0.0656, "step": 3300 }, { "epoch": 0.7017171931312275, "grad_norm": 0.14003109623808868, "learning_rate": 2.268040010021334e-06, "loss": 0.0658, "step": 3310 }, { "epoch": 0.7038371846512614, "grad_norm": 0.13679184892364368, "learning_rate": 2.2387481336958243e-06, "loss": 0.0676, "step": 3320 }, { "epoch": 0.7059571761712953, "grad_norm": 0.1553274432738983, "learning_rate": 2.2095919805735786e-06, "loss": 0.0654, "step": 3330 }, { "epoch": 0.7080771676913292, "grad_norm": 0.16561059697374547, "learning_rate": 2.1805729837605533e-06, "loss": 0.0677, "step": 3340 }, { "epoch": 0.7101971592113632, "grad_norm": 0.1504235594519663, "learning_rate": 2.1516925696210917e-06, "loss": 0.0666, "step": 3350 }, { "epoch": 0.7123171507313971, "grad_norm": 0.15286590047529391, "learning_rate": 2.122952157707808e-06, "loss": 0.0684, "step": 3360 }, { "epoch": 0.714437142251431, "grad_norm": 0.1598473142296576, "learning_rate": 2.0943531606918304e-06, "loss": 0.0665, "step": 3370 }, { "epoch": 0.7165571337714649, "grad_norm": 0.14455546823633267, "learning_rate": 2.0658969842933386e-06, "loss": 0.0694, "step": 3380 }, { "epoch": 0.7186771252914989, "grad_norm": 0.14684140177879562, "learning_rate": 2.0375850272124865e-06, "loss": 0.063, "step": 3390 }, { "epoch": 0.7207971168115328, "grad_norm": 0.1558774790348137, "learning_rate": 2.0094186810606553e-06, "loss": 0.0664, "step": 3400 }, { "epoch": 0.7229171083315666, "grad_norm": 0.14875490426420004, "learning_rate": 1.9813993302920325e-06, "loss": 0.065, "step": 3410 }, { "epoch": 0.7250370998516006, "grad_norm": 0.14376290785675833, "learning_rate": 1.9535283521355807e-06, "loss": 0.0645, "step": 3420 }, { "epoch": 0.7271570913716345, "grad_norm": 0.1774603135257143, "learning_rate": 1.925807116527336e-06, "loss": 0.0628, "step": 3430 }, { "epoch": 0.7292770828916685, "grad_norm": 0.15648377782580034, "learning_rate": 1.8982369860430693e-06, "loss": 0.0669, "step": 3440 }, { "epoch": 0.7313970744117023, "grad_norm": 0.1410744866971639, "learning_rate": 1.8708193158313175e-06, "loss": 0.0652, "step": 3450 }, { "epoch": 0.7335170659317363, "grad_norm": 0.13848781419789238, "learning_rate": 1.8435554535467709e-06, "loss": 0.0668, "step": 3460 }, { "epoch": 0.7356370574517702, "grad_norm": 0.1565698593715598, "learning_rate": 1.8164467392840306e-06, "loss": 0.065, "step": 3470 }, { "epoch": 0.7377570489718042, "grad_norm": 0.14483319470448863, "learning_rate": 1.7894945055117462e-06, "loss": 0.0689, "step": 3480 }, { "epoch": 0.739877040491838, "grad_norm": 0.14690155750172293, "learning_rate": 1.7627000770071062e-06, "loss": 0.0643, "step": 3490 }, { "epoch": 0.741997032011872, "grad_norm": 0.1523880735964551, "learning_rate": 1.7360647707907447e-06, "loss": 0.0666, "step": 3500 }, { "epoch": 0.7441170235319059, "grad_norm": 0.16195859504926405, "learning_rate": 1.7095898960619862e-06, "loss": 0.0657, "step": 3510 }, { "epoch": 0.7462370150519398, "grad_norm": 0.13638473122547523, "learning_rate": 1.6832767541344974e-06, "loss": 0.0655, "step": 3520 }, { "epoch": 0.7483570065719737, "grad_norm": 0.13278921458936405, "learning_rate": 1.6571266383723388e-06, "loss": 0.0672, "step": 3530 }, { "epoch": 0.7504769980920076, "grad_norm": 0.14406828983312037, "learning_rate": 1.631140834126373e-06, "loss": 0.066, "step": 3540 }, { "epoch": 0.7525969896120416, "grad_norm": 0.1395384360254768, "learning_rate": 1.6053206186710967e-06, "loss": 0.0652, "step": 3550 }, { "epoch": 0.7547169811320755, "grad_norm": 0.1579964196169218, "learning_rate": 1.5796672611418645e-06, "loss": 0.0656, "step": 3560 }, { "epoch": 0.7568369726521094, "grad_norm": 0.1539176914379727, "learning_rate": 1.5541820224724884e-06, "loss": 0.0659, "step": 3570 }, { "epoch": 0.7589569641721433, "grad_norm": 0.1432268965723713, "learning_rate": 1.5288661553332802e-06, "loss": 0.068, "step": 3580 }, { "epoch": 0.7610769556921773, "grad_norm": 0.1475776868236256, "learning_rate": 1.5037209040694668e-06, "loss": 0.0674, "step": 3590 }, { "epoch": 0.7631969472122111, "grad_norm": 0.13942686520284647, "learning_rate": 1.4787475046400307e-06, "loss": 0.0658, "step": 3600 }, { "epoch": 0.765316938732245, "grad_norm": 0.16612542851996417, "learning_rate": 1.4539471845569598e-06, "loss": 0.0673, "step": 3610 }, { "epoch": 0.767436930252279, "grad_norm": 0.13347560560880484, "learning_rate": 1.4293211628249115e-06, "loss": 0.0651, "step": 3620 }, { "epoch": 0.7695569217723129, "grad_norm": 0.14290885311257007, "learning_rate": 1.4048706498812936e-06, "loss": 0.0632, "step": 3630 }, { "epoch": 0.7716769132923468, "grad_norm": 0.15900916314465804, "learning_rate": 1.380596847536772e-06, "loss": 0.0662, "step": 3640 }, { "epoch": 0.7737969048123807, "grad_norm": 0.15826198491620722, "learning_rate": 1.3565009489161878e-06, "loss": 0.0669, "step": 3650 }, { "epoch": 0.7759168963324147, "grad_norm": 0.1338916105091316, "learning_rate": 1.3325841383999321e-06, "loss": 0.0661, "step": 3660 }, { "epoch": 0.7780368878524486, "grad_norm": 0.14647123286090982, "learning_rate": 1.3088475915657066e-06, "loss": 0.0653, "step": 3670 }, { "epoch": 0.7801568793724825, "grad_norm": 0.12519200539181277, "learning_rate": 1.2852924751307555e-06, "loss": 0.065, "step": 3680 }, { "epoch": 0.7822768708925164, "grad_norm": 0.15737674167435736, "learning_rate": 1.2619199468945215e-06, "loss": 0.0647, "step": 3690 }, { "epoch": 0.7843968624125504, "grad_norm": 0.14864208572307655, "learning_rate": 1.2387311556817183e-06, "loss": 0.0671, "step": 3700 }, { "epoch": 0.7865168539325843, "grad_norm": 0.14386823191288503, "learning_rate": 1.2157272412858811e-06, "loss": 0.0672, "step": 3710 }, { "epoch": 0.7886368454526181, "grad_norm": 0.15384247542083423, "learning_rate": 1.192909334413338e-06, "loss": 0.0654, "step": 3720 }, { "epoch": 0.7907568369726521, "grad_norm": 0.14067508984359764, "learning_rate": 1.1702785566276236e-06, "loss": 0.0644, "step": 3730 }, { "epoch": 0.792876828492686, "grad_norm": 0.1437217497105591, "learning_rate": 1.1478360202943618e-06, "loss": 0.0645, "step": 3740 }, { "epoch": 0.79499682001272, "grad_norm": 0.15519712474428182, "learning_rate": 1.1255828285265862e-06, "loss": 0.0649, "step": 3750 }, { "epoch": 0.7971168115327538, "grad_norm": 0.14145423148178207, "learning_rate": 1.1035200751305176e-06, "loss": 0.0653, "step": 3760 }, { "epoch": 0.7992368030527878, "grad_norm": 0.13536631332448693, "learning_rate": 1.0816488445518014e-06, "loss": 0.0663, "step": 3770 }, { "epoch": 0.8013567945728217, "grad_norm": 0.17054020151205723, "learning_rate": 1.0599702118222054e-06, "loss": 0.072, "step": 3780 }, { "epoch": 0.8034767860928557, "grad_norm": 0.15685256143417375, "learning_rate": 1.038485242506777e-06, "loss": 0.0656, "step": 3790 }, { "epoch": 0.8055967776128895, "grad_norm": 0.14095243780908379, "learning_rate": 1.0171949926514706e-06, "loss": 0.0647, "step": 3800 }, { "epoch": 0.8077167691329235, "grad_norm": 0.13455803530480603, "learning_rate": 9.96100508731232e-07, "loss": 0.0656, "step": 3810 }, { "epoch": 0.8098367606529574, "grad_norm": 0.1334874446417613, "learning_rate": 9.75202827598576e-07, "loss": 0.0646, "step": 3820 }, { "epoch": 0.8119567521729913, "grad_norm": 0.14692274148907183, "learning_rate": 9.54502976432606e-07, "loss": 0.069, "step": 3830 }, { "epoch": 0.8140767436930252, "grad_norm": 0.15341984926535815, "learning_rate": 9.340019726885341e-07, "loss": 0.0673, "step": 3840 }, { "epoch": 0.8161967352130591, "grad_norm": 0.13727695678950919, "learning_rate": 9.137008240476752e-07, "loss": 0.0644, "step": 3850 }, { "epoch": 0.8183167267330931, "grad_norm": 0.1301351491962901, "learning_rate": 8.936005283679022e-07, "loss": 0.0653, "step": 3860 }, { "epoch": 0.820436718253127, "grad_norm": 0.15417587256216225, "learning_rate": 8.737020736346114e-07, "loss": 0.0687, "step": 3870 }, { "epoch": 0.8225567097731609, "grad_norm": 0.13849301619933713, "learning_rate": 8.540064379121537e-07, "loss": 0.0643, "step": 3880 }, { "epoch": 0.8246767012931948, "grad_norm": 0.12751613533813724, "learning_rate": 8.345145892957635e-07, "loss": 0.0675, "step": 3890 }, { "epoch": 0.8267966928132288, "grad_norm": 0.13641022859652724, "learning_rate": 8.152274858639709e-07, "loss": 0.0644, "step": 3900 }, { "epoch": 0.8289166843332627, "grad_norm": 0.13498806105829741, "learning_rate": 7.961460756315131e-07, "loss": 0.0661, "step": 3910 }, { "epoch": 0.8310366758532965, "grad_norm": 0.1649413393713791, "learning_rate": 7.772712965027329e-07, "loss": 0.0681, "step": 3920 }, { "epoch": 0.8331566673733305, "grad_norm": 0.14352566951876747, "learning_rate": 7.586040762254831e-07, "loss": 0.0666, "step": 3930 }, { "epoch": 0.8352766588933644, "grad_norm": 0.13644803212350157, "learning_rate": 7.40145332345516e-07, "loss": 0.0703, "step": 3940 }, { "epoch": 0.8373966504133984, "grad_norm": 0.13207832198888683, "learning_rate": 7.218959721613966e-07, "loss": 0.0677, "step": 3950 }, { "epoch": 0.8395166419334322, "grad_norm": 0.12801202011992016, "learning_rate": 7.038568926798972e-07, "loss": 0.0669, "step": 3960 }, { "epoch": 0.8416366334534662, "grad_norm": 0.1446132283031493, "learning_rate": 6.860289805719051e-07, "loss": 0.0657, "step": 3970 }, { "epoch": 0.8437566249735001, "grad_norm": 0.15537910760985132, "learning_rate": 6.684131121288506e-07, "loss": 0.0645, "step": 3980 }, { "epoch": 0.8458766164935341, "grad_norm": 0.138745128040943, "learning_rate": 6.510101532196228e-07, "loss": 0.0663, "step": 3990 }, { "epoch": 0.8479966080135679, "grad_norm": 0.1422532471037774, "learning_rate": 6.338209592480187e-07, "loss": 0.0659, "step": 4000 }, { "epoch": 0.8479966080135679, "eval_loss": 0.06505845487117767, "eval_runtime": 488.4948, "eval_samples_per_second": 4.188, "eval_steps_per_second": 0.301, "step": 4000 }, { "epoch": 0.8501165995336019, "grad_norm": 0.13051273275584138, "learning_rate": 6.168463751106973e-07, "loss": 0.0676, "step": 4010 }, { "epoch": 0.8522365910536358, "grad_norm": 0.1584048424567531, "learning_rate": 6.000872351556402e-07, "loss": 0.0647, "step": 4020 }, { "epoch": 0.8543565825736698, "grad_norm": 0.14980989572464892, "learning_rate": 5.835443631411548e-07, "loss": 0.0656, "step": 4030 }, { "epoch": 0.8564765740937036, "grad_norm": 0.14192254512990504, "learning_rate": 5.672185721953761e-07, "loss": 0.0664, "step": 4040 }, { "epoch": 0.8585965656137375, "grad_norm": 0.14482703076509035, "learning_rate": 5.51110664776302e-07, "loss": 0.0672, "step": 4050 }, { "epoch": 0.8607165571337715, "grad_norm": 0.134823932240745, "learning_rate": 5.352214326323485e-07, "loss": 0.0675, "step": 4060 }, { "epoch": 0.8628365486538054, "grad_norm": 0.13549221587996976, "learning_rate": 5.195516567634345e-07, "loss": 0.0643, "step": 4070 }, { "epoch": 0.8649565401738393, "grad_norm": 0.14157113823645306, "learning_rate": 5.041021073825935e-07, "loss": 0.0681, "step": 4080 }, { "epoch": 0.8670765316938732, "grad_norm": 0.13268190112303166, "learning_rate": 4.888735438781156e-07, "loss": 0.0634, "step": 4090 }, { "epoch": 0.8691965232139072, "grad_norm": 0.15044371596526965, "learning_rate": 4.738667147762177e-07, "loss": 0.0638, "step": 4100 }, { "epoch": 0.8713165147339411, "grad_norm": 0.15554565316213642, "learning_rate": 4.590823577042597e-07, "loss": 0.0673, "step": 4110 }, { "epoch": 0.873436506253975, "grad_norm": 0.13924045324828926, "learning_rate": 4.4452119935447844e-07, "loss": 0.0684, "step": 4120 }, { "epoch": 0.8755564977740089, "grad_norm": 0.14738905954975795, "learning_rate": 4.301839554482745e-07, "loss": 0.0646, "step": 4130 }, { "epoch": 0.8776764892940428, "grad_norm": 0.17734811072141277, "learning_rate": 4.160713307010339e-07, "loss": 0.0627, "step": 4140 }, { "epoch": 0.8797964808140768, "grad_norm": 0.14667184777407763, "learning_rate": 4.021840187874831e-07, "loss": 0.0665, "step": 4150 }, { "epoch": 0.8819164723341106, "grad_norm": 0.13247047796191325, "learning_rate": 3.8852270230759715e-07, "loss": 0.068, "step": 4160 }, { "epoch": 0.8840364638541446, "grad_norm": 0.1318359100305846, "learning_rate": 3.750880527530515e-07, "loss": 0.0642, "step": 4170 }, { "epoch": 0.8861564553741785, "grad_norm": 0.14660978947680608, "learning_rate": 3.618807304742067e-07, "loss": 0.064, "step": 4180 }, { "epoch": 0.8882764468942124, "grad_norm": 0.16073812743121169, "learning_rate": 3.4890138464765854e-07, "loss": 0.0624, "step": 4190 }, { "epoch": 0.8903964384142463, "grad_norm": 0.1317816842544379, "learning_rate": 3.361506532443265e-07, "loss": 0.0637, "step": 4200 }, { "epoch": 0.8925164299342803, "grad_norm": 0.17027046123997486, "learning_rate": 3.2362916299809643e-07, "loss": 0.066, "step": 4210 }, { "epoch": 0.8946364214543142, "grad_norm": 0.13836358324093678, "learning_rate": 3.113375293750137e-07, "loss": 0.0676, "step": 4220 }, { "epoch": 0.896756412974348, "grad_norm": 0.13744563225532516, "learning_rate": 2.992763565430301e-07, "loss": 0.064, "step": 4230 }, { "epoch": 0.898876404494382, "grad_norm": 0.13017524095673055, "learning_rate": 2.874462373423115e-07, "loss": 0.0682, "step": 4240 }, { "epoch": 0.900996396014416, "grad_norm": 0.13351598157626, "learning_rate": 2.7584775325609546e-07, "loss": 0.0684, "step": 4250 }, { "epoch": 0.9031163875344499, "grad_norm": 0.14042699267228834, "learning_rate": 2.6448147438210725e-07, "loss": 0.0652, "step": 4260 }, { "epoch": 0.9052363790544837, "grad_norm": 0.1347635143628056, "learning_rate": 2.5334795940454514e-07, "loss": 0.0687, "step": 4270 }, { "epoch": 0.9073563705745177, "grad_norm": 0.14711313054197867, "learning_rate": 2.424477555666105e-07, "loss": 0.0642, "step": 4280 }, { "epoch": 0.9094763620945516, "grad_norm": 0.12669127686809334, "learning_rate": 2.3178139864361514e-07, "loss": 0.0662, "step": 4290 }, { "epoch": 0.9115963536145856, "grad_norm": 0.1482558394168092, "learning_rate": 2.213494129166477e-07, "loss": 0.0663, "step": 4300 }, { "epoch": 0.9137163451346194, "grad_norm": 0.13767908522932615, "learning_rate": 2.111523111467978e-07, "loss": 0.0662, "step": 4310 }, { "epoch": 0.9158363366546534, "grad_norm": 0.1307957796839651, "learning_rate": 2.0119059454995705e-07, "loss": 0.0637, "step": 4320 }, { "epoch": 0.9179563281746873, "grad_norm": 0.1387365413487702, "learning_rate": 1.9146475277218247e-07, "loss": 0.066, "step": 4330 }, { "epoch": 0.9200763196947213, "grad_norm": 0.13783938524006778, "learning_rate": 1.8197526386562637e-07, "loss": 0.0656, "step": 4340 }, { "epoch": 0.9221963112147551, "grad_norm": 0.15258547100463352, "learning_rate": 1.7272259426504178e-07, "loss": 0.0635, "step": 4350 }, { "epoch": 0.924316302734789, "grad_norm": 0.12836303549043818, "learning_rate": 1.6370719876485474e-07, "loss": 0.0654, "step": 4360 }, { "epoch": 0.926436294254823, "grad_norm": 0.16082006996334058, "learning_rate": 1.5492952049680987e-07, "loss": 0.0665, "step": 4370 }, { "epoch": 0.9285562857748569, "grad_norm": 0.15509787140465903, "learning_rate": 1.463899909081884e-07, "loss": 0.0701, "step": 4380 }, { "epoch": 0.9306762772948908, "grad_norm": 0.15582542247867595, "learning_rate": 1.3808902974060234e-07, "loss": 0.0663, "step": 4390 }, { "epoch": 0.9327962688149247, "grad_norm": 0.1285740918117715, "learning_rate": 1.3002704500936324e-07, "loss": 0.0666, "step": 4400 }, { "epoch": 0.9349162603349587, "grad_norm": 0.12844195029643576, "learning_rate": 1.222044329834271e-07, "loss": 0.0649, "step": 4410 }, { "epoch": 0.9370362518549926, "grad_norm": 0.14024654119089836, "learning_rate": 1.1462157816591435e-07, "loss": 0.0653, "step": 4420 }, { "epoch": 0.9391562433750265, "grad_norm": 0.13569299209873553, "learning_rate": 1.0727885327521448e-07, "loss": 0.0636, "step": 4430 }, { "epoch": 0.9412762348950604, "grad_norm": 0.1506002944720704, "learning_rate": 1.0017661922666177e-07, "loss": 0.0666, "step": 4440 }, { "epoch": 0.9433962264150944, "grad_norm": 0.1454846226979092, "learning_rate": 9.331522511479785e-08, "loss": 0.0666, "step": 4450 }, { "epoch": 0.9455162179351283, "grad_norm": 0.12532906503884286, "learning_rate": 8.669500819621424e-08, "loss": 0.0633, "step": 4460 }, { "epoch": 0.9476362094551621, "grad_norm": 0.13843242796856053, "learning_rate": 8.031629387296958e-08, "loss": 0.065, "step": 4470 }, { "epoch": 0.9497562009751961, "grad_norm": 0.14800544603658328, "learning_rate": 7.41793956766007e-08, "loss": 0.068, "step": 4480 }, { "epoch": 0.95187619249523, "grad_norm": 0.1302604634301598, "learning_rate": 6.828461525271057e-08, "loss": 0.0669, "step": 4490 }, { "epoch": 0.953996184015264, "grad_norm": 0.1317179792652165, "learning_rate": 6.26322423461384e-08, "loss": 0.0669, "step": 4500 }, { "epoch": 0.9561161755352978, "grad_norm": 0.14912594874221324, "learning_rate": 5.7222554786722784e-08, "loss": 0.0656, "step": 4510 }, { "epoch": 0.9582361670553318, "grad_norm": 0.14837310552453115, "learning_rate": 5.20558184756409e-08, "loss": 0.0637, "step": 4520 }, { "epoch": 0.9603561585753657, "grad_norm": 0.13757532372594639, "learning_rate": 4.7132287372341764e-08, "loss": 0.0648, "step": 4530 }, { "epoch": 0.9624761500953997, "grad_norm": 0.1412166618656987, "learning_rate": 4.245220348206347e-08, "loss": 0.0652, "step": 4540 }, { "epoch": 0.9645961416154335, "grad_norm": 0.13183693219990808, "learning_rate": 3.801579684393486e-08, "loss": 0.0641, "step": 4550 }, { "epoch": 0.9667161331354674, "grad_norm": 0.13113013926781358, "learning_rate": 3.382328551967296e-08, "loss": 0.062, "step": 4560 }, { "epoch": 0.9688361246555014, "grad_norm": 0.1413505271426179, "learning_rate": 2.9874875582860395e-08, "loss": 0.0645, "step": 4570 }, { "epoch": 0.9709561161755353, "grad_norm": 0.1412996190385206, "learning_rate": 2.6170761108818554e-08, "loss": 0.0663, "step": 4580 }, { "epoch": 0.9730761076955692, "grad_norm": 0.12681619081672574, "learning_rate": 2.2711124165069043e-08, "loss": 0.0642, "step": 4590 }, { "epoch": 0.9751960992156031, "grad_norm": 0.12962052445070302, "learning_rate": 1.949613480238255e-08, "loss": 0.069, "step": 4600 }, { "epoch": 0.9773160907356371, "grad_norm": 0.12947117948159637, "learning_rate": 1.652595104642052e-08, "loss": 0.0664, "step": 4610 }, { "epoch": 0.979436082255671, "grad_norm": 0.1306635495902112, "learning_rate": 1.3800718889970255e-08, "loss": 0.0631, "step": 4620 }, { "epoch": 0.9815560737757049, "grad_norm": 0.14160335253570747, "learning_rate": 1.1320572285765663e-08, "loss": 0.0655, "step": 4630 }, { "epoch": 0.9836760652957388, "grad_norm": 0.1331013714780897, "learning_rate": 9.085633139905292e-09, "loss": 0.0679, "step": 4640 }, { "epoch": 0.9857960568157728, "grad_norm": 0.1521352520849485, "learning_rate": 7.096011305859352e-09, "loss": 0.0659, "step": 4650 }, { "epoch": 0.9879160483358067, "grad_norm": 0.1490651261311961, "learning_rate": 5.351804579070696e-09, "loss": 0.0663, "step": 4660 }, { "epoch": 0.9900360398558405, "grad_norm": 0.14100706470294022, "learning_rate": 3.853098692147006e-09, "loss": 0.0658, "step": 4670 }, { "epoch": 0.9921560313758745, "grad_norm": 0.1375637676528597, "learning_rate": 2.5999673106480438e-09, "loss": 0.0638, "step": 4680 }, { "epoch": 0.9942760228959084, "grad_norm": 0.1362445388806314, "learning_rate": 1.5924720294641093e-09, "loss": 0.0645, "step": 4690 }, { "epoch": 0.9963960144159424, "grad_norm": 0.12875364924264648, "learning_rate": 8.306623697884597e-10, "loss": 0.0669, "step": 4700 }, { "epoch": 0.9985160059359762, "grad_norm": 0.12918021464281548, "learning_rate": 3.1457577668259074e-10, "loss": 0.0658, "step": 4710 }, { "epoch": 1.0, "step": 4717, "total_flos": 3995069728161792.0, "train_loss": 0.10921624170816473, "train_runtime": 69121.8578, "train_samples_per_second": 0.955, "train_steps_per_second": 0.068 } ], "logging_steps": 10, "max_steps": 4717, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3995069728161792.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }