{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 1044, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.014406627048442283, "grad_norm": 2.235235207512626, "learning_rate": 3.8095238095238102e-06, "loss": 0.7567409992218017, "step": 5 }, { "epoch": 0.028813254096884566, "grad_norm": 3.1220907369972015, "learning_rate": 8.571428571428573e-06, "loss": 0.6131507873535156, "step": 10 }, { "epoch": 0.04321988114532685, "grad_norm": 1.26071593847269, "learning_rate": 1.3333333333333333e-05, "loss": 0.4174152374267578, "step": 15 }, { "epoch": 0.05762650819376913, "grad_norm": 0.7001329999687771, "learning_rate": 1.8095238095238094e-05, "loss": 0.3269367218017578, "step": 20 }, { "epoch": 0.07203313524221142, "grad_norm": 0.45760881472851334, "learning_rate": 2.2857142857142858e-05, "loss": 0.2579814910888672, "step": 25 }, { "epoch": 0.0864397622906537, "grad_norm": 0.3195758164531423, "learning_rate": 2.7619047619047622e-05, "loss": 0.22403466701507568, "step": 30 }, { "epoch": 0.10084638933909598, "grad_norm": 0.3937202043873978, "learning_rate": 3.2380952380952386e-05, "loss": 0.19430849552154542, "step": 35 }, { "epoch": 0.11525301638753827, "grad_norm": 0.4622992203768602, "learning_rate": 3.7142857142857143e-05, "loss": 0.17950987815856934, "step": 40 }, { "epoch": 0.12965964343598055, "grad_norm": 0.6153699707363115, "learning_rate": 4.190476190476191e-05, "loss": 0.16267883777618408, "step": 45 }, { "epoch": 0.14406627048442283, "grad_norm": 0.44308873981945024, "learning_rate": 4.666666666666667e-05, "loss": 0.15465893745422363, "step": 50 }, { "epoch": 0.15847289753286511, "grad_norm": 0.2667361601014973, "learning_rate": 5.142857142857143e-05, "loss": 0.1339721441268921, "step": 55 }, { "epoch": 0.1728795245813074, "grad_norm": 0.2856336328456399, "learning_rate": 5.619047619047619e-05, "loss": 0.12487690448760987, "step": 60 }, { "epoch": 0.18728615162974968, "grad_norm": 0.25342260570898284, "learning_rate": 6.0952380952380964e-05, "loss": 0.12374393939971924, "step": 65 }, { "epoch": 0.20169277867819196, "grad_norm": 0.30856178034874804, "learning_rate": 6.571428571428571e-05, "loss": 0.11958651542663574, "step": 70 }, { "epoch": 0.21609940572663425, "grad_norm": 0.37526519775047024, "learning_rate": 7.047619047619048e-05, "loss": 0.1187552571296692, "step": 75 }, { "epoch": 0.23050603277507653, "grad_norm": 0.23950000695755433, "learning_rate": 7.523809523809524e-05, "loss": 0.11327266693115234, "step": 80 }, { "epoch": 0.2449126598235188, "grad_norm": 0.2205068195876155, "learning_rate": 8e-05, "loss": 0.10409235954284668, "step": 85 }, { "epoch": 0.2593192868719611, "grad_norm": 0.38702416724389993, "learning_rate": 8.476190476190477e-05, "loss": 0.10459071397781372, "step": 90 }, { "epoch": 0.2737259139204034, "grad_norm": 1.5049886602582672, "learning_rate": 8.952380952380953e-05, "loss": 0.10260200500488281, "step": 95 }, { "epoch": 0.28813254096884566, "grad_norm": 0.7639386034441172, "learning_rate": 9.428571428571429e-05, "loss": 0.15102967023849487, "step": 100 }, { "epoch": 0.302539168017288, "grad_norm": 1.2918337466488035, "learning_rate": 9.904761904761905e-05, "loss": 0.21126885414123536, "step": 105 }, { "epoch": 0.31694579506573023, "grad_norm": 0.3993035259049567, "learning_rate": 9.99955226394288e-05, "loss": 0.1489308714866638, "step": 110 }, { "epoch": 0.33135242211417254, "grad_norm": 0.4337702538884852, "learning_rate": 9.997733473639876e-05, "loss": 0.1149595022201538, "step": 115 }, { "epoch": 0.3457590491626148, "grad_norm": 0.19718286454816564, "learning_rate": 9.994516154152849e-05, "loss": 0.10596739053726197, "step": 120 }, { "epoch": 0.3601656762110571, "grad_norm": 0.20133933569463766, "learning_rate": 9.989901205792952e-05, "loss": 0.09920316338539123, "step": 125 }, { "epoch": 0.37457230325949936, "grad_norm": 0.1627871886908846, "learning_rate": 9.983889919973586e-05, "loss": 0.08917503952980041, "step": 130 }, { "epoch": 0.3889789303079417, "grad_norm": 0.1732609558706655, "learning_rate": 9.976483978849007e-05, "loss": 0.08957574367523194, "step": 135 }, { "epoch": 0.40338555735638393, "grad_norm": 0.17068559939024425, "learning_rate": 9.967685454843618e-05, "loss": 0.08561774492263793, "step": 140 }, { "epoch": 0.41779218440482624, "grad_norm": 0.1627969165227149, "learning_rate": 9.957496810072027e-05, "loss": 0.083004629611969, "step": 145 }, { "epoch": 0.4321988114532685, "grad_norm": 0.1467656284113813, "learning_rate": 9.945920895650071e-05, "loss": 0.0808147668838501, "step": 150 }, { "epoch": 0.4466054385017108, "grad_norm": 0.13322475158297778, "learning_rate": 9.932960950896981e-05, "loss": 0.07847496271133422, "step": 155 }, { "epoch": 0.46101206555015306, "grad_norm": 0.12265959792803287, "learning_rate": 9.918620602428915e-05, "loss": 0.07879123687744141, "step": 160 }, { "epoch": 0.47541869259859537, "grad_norm": 0.11616840330675544, "learning_rate": 9.902903863144107e-05, "loss": 0.07581273913383484, "step": 165 }, { "epoch": 0.4898253196470376, "grad_norm": 0.12203330893770489, "learning_rate": 9.885815131099934e-05, "loss": 0.07368603944778443, "step": 170 }, { "epoch": 0.5042319466954799, "grad_norm": 0.1546308315564488, "learning_rate": 9.867359188282192e-05, "loss": 0.06976621150970459, "step": 175 }, { "epoch": 0.5186385737439222, "grad_norm": 0.2238419658216969, "learning_rate": 9.847541199266941e-05, "loss": 0.07270271778106689, "step": 180 }, { "epoch": 0.5330452007923645, "grad_norm": 0.1778940838233776, "learning_rate": 9.826366709775286e-05, "loss": 0.06899308562278747, "step": 185 }, { "epoch": 0.5474518278408068, "grad_norm": 0.1236118996782577, "learning_rate": 9.803841645121504e-05, "loss": 0.06641653776168824, "step": 190 }, { "epoch": 0.561858454889249, "grad_norm": 0.20583115161000629, "learning_rate": 9.779972308554952e-05, "loss": 0.06647136211395263, "step": 195 }, { "epoch": 0.5762650819376913, "grad_norm": 0.13075349423709637, "learning_rate": 9.754765379496202e-05, "loss": 0.06856078505516053, "step": 200 }, { "epoch": 0.5906717089861336, "grad_norm": 0.17339943774041067, "learning_rate": 9.728227911667934e-05, "loss": 0.06724534034729004, "step": 205 }, { "epoch": 0.605078336034576, "grad_norm": 0.14182602625367816, "learning_rate": 9.700367331121054e-05, "loss": 0.06738802194595336, "step": 210 }, { "epoch": 0.6194849630830181, "grad_norm": 0.16393619983216387, "learning_rate": 9.67119143415667e-05, "loss": 0.07073606252670288, "step": 215 }, { "epoch": 0.6338915901314605, "grad_norm": 0.1328841830481596, "learning_rate": 9.640708385144403e-05, "loss": 0.06382153034210206, "step": 220 }, { "epoch": 0.6482982171799028, "grad_norm": 0.13571266215544603, "learning_rate": 9.608926714237754e-05, "loss": 0.06776301860809326, "step": 225 }, { "epoch": 0.6627048442283451, "grad_norm": 0.12351937744356929, "learning_rate": 9.575855314987068e-05, "loss": 0.06309446096420288, "step": 230 }, { "epoch": 0.6771114712767873, "grad_norm": 0.12290269065618897, "learning_rate": 9.541503441850843e-05, "loss": 0.06422497630119324, "step": 235 }, { "epoch": 0.6915180983252296, "grad_norm": 0.12486344744276894, "learning_rate": 9.505880707606024e-05, "loss": 0.06324135661125183, "step": 240 }, { "epoch": 0.7059247253736719, "grad_norm": 0.1371721876913286, "learning_rate": 9.468997080658031e-05, "loss": 0.06205494403839111, "step": 245 }, { "epoch": 0.7203313524221142, "grad_norm": 0.12440196659006258, "learning_rate": 9.430862882251278e-05, "loss": 0.057729125022888184, "step": 250 }, { "epoch": 0.7347379794705564, "grad_norm": 0.11085006544539791, "learning_rate": 9.391488783580955e-05, "loss": 0.059876751899719236, "step": 255 }, { "epoch": 0.7491446065189987, "grad_norm": 0.11611256342361528, "learning_rate": 9.350885802806863e-05, "loss": 0.05882802605628967, "step": 260 }, { "epoch": 0.763551233567441, "grad_norm": 0.1279259846460798, "learning_rate": 9.309065301970193e-05, "loss": 0.06077917814254761, "step": 265 }, { "epoch": 0.7779578606158833, "grad_norm": 0.11105876561542377, "learning_rate": 9.266038983814039e-05, "loss": 0.05303559303283691, "step": 270 }, { "epoch": 0.7923644876643255, "grad_norm": 0.11671310410423168, "learning_rate": 9.221818888508602e-05, "loss": 0.06124954223632813, "step": 275 }, { "epoch": 0.8067711147127679, "grad_norm": 0.11537085211038406, "learning_rate": 9.176417390281944e-05, "loss": 0.055888807773590087, "step": 280 }, { "epoch": 0.8211777417612102, "grad_norm": 0.1480823536245831, "learning_rate": 9.129847193957282e-05, "loss": 0.056972581148147586, "step": 285 }, { "epoch": 0.8355843688096525, "grad_norm": 0.15744268133880865, "learning_rate": 9.08212133139776e-05, "loss": 0.05824898481369019, "step": 290 }, { "epoch": 0.8499909958580947, "grad_norm": 0.1397867333597395, "learning_rate": 9.033253157859714e-05, "loss": 0.05415785312652588, "step": 295 }, { "epoch": 0.864397622906537, "grad_norm": 0.12034022108734013, "learning_rate": 8.983256348255423e-05, "loss": 0.05467197895050049, "step": 300 }, { "epoch": 0.8788042499549793, "grad_norm": 0.12682573622924756, "learning_rate": 8.932144893326432e-05, "loss": 0.06181464791297912, "step": 305 }, { "epoch": 0.8932108770034216, "grad_norm": 0.11321366531316682, "learning_rate": 8.879933095728485e-05, "loss": 0.05511963367462158, "step": 310 }, { "epoch": 0.9076175040518638, "grad_norm": 0.1076394497380973, "learning_rate": 8.826635566029166e-05, "loss": 0.05229709148406982, "step": 315 }, { "epoch": 0.9220241311003061, "grad_norm": 0.11249447920151531, "learning_rate": 8.772267218619388e-05, "loss": 0.05275582075119019, "step": 320 }, { "epoch": 0.9364307581487484, "grad_norm": 0.11401150417345533, "learning_rate": 8.716843267539869e-05, "loss": 0.05470834374427795, "step": 325 }, { "epoch": 0.9508373851971907, "grad_norm": 0.13321527980254963, "learning_rate": 8.660379222223727e-05, "loss": 0.05563476085662842, "step": 330 }, { "epoch": 0.9652440122456329, "grad_norm": 0.10771804020098895, "learning_rate": 8.602890883156454e-05, "loss": 0.054843342304229735, "step": 335 }, { "epoch": 0.9796506392940753, "grad_norm": 0.12601833333178913, "learning_rate": 8.544394337454409e-05, "loss": 0.05721263885498047, "step": 340 }, { "epoch": 0.9940572663425176, "grad_norm": 0.12322820499048608, "learning_rate": 8.484905954363123e-05, "loss": 0.05096786618232727, "step": 345 }, { "epoch": 1.0057626508193769, "grad_norm": 0.14089468629356533, "learning_rate": 8.424442380676647e-05, "loss": 0.05167339444160461, "step": 350 }, { "epoch": 1.0201692778678193, "grad_norm": 0.1705872004915626, "learning_rate": 8.363020536079239e-05, "loss": 0.05249757170677185, "step": 355 }, { "epoch": 1.0345759049162615, "grad_norm": 0.15358181481824462, "learning_rate": 8.300657608410678e-05, "loss": 0.05038872957229614, "step": 360 }, { "epoch": 1.0489825319647037, "grad_norm": 0.13895400680332037, "learning_rate": 8.237371048856546e-05, "loss": 0.050058400630950926, "step": 365 }, { "epoch": 1.0633891590131461, "grad_norm": 0.09560889181658183, "learning_rate": 8.17317856706482e-05, "loss": 0.04919912219047547, "step": 370 }, { "epoch": 1.0777957860615883, "grad_norm": 0.10951811698505555, "learning_rate": 8.108098126190129e-05, "loss": 0.04963598847389221, "step": 375 }, { "epoch": 1.0922024131100305, "grad_norm": 0.09853927812254934, "learning_rate": 8.042147937867079e-05, "loss": 0.046415746212005615, "step": 380 }, { "epoch": 1.106609040158473, "grad_norm": 0.09238299590671381, "learning_rate": 7.975346457114034e-05, "loss": 0.04439312219619751, "step": 385 }, { "epoch": 1.1210156672069151, "grad_norm": 0.10940030307745394, "learning_rate": 7.907712377168817e-05, "loss": 0.051634716987609866, "step": 390 }, { "epoch": 1.1354222942553576, "grad_norm": 0.09338579936215781, "learning_rate": 7.839264624257712e-05, "loss": 0.04415662288665771, "step": 395 }, { "epoch": 1.1498289213037998, "grad_norm": 0.10999587309136662, "learning_rate": 7.770022352299293e-05, "loss": 0.047378170490264895, "step": 400 }, { "epoch": 1.164235548352242, "grad_norm": 0.10109309983264758, "learning_rate": 7.700004937544542e-05, "loss": 0.04249417781829834, "step": 405 }, { "epoch": 1.1786421754006844, "grad_norm": 0.10231496239314469, "learning_rate": 7.629231973154725e-05, "loss": 0.04593285918235779, "step": 410 }, { "epoch": 1.1930488024491266, "grad_norm": 0.1000912342655061, "learning_rate": 7.557723263718596e-05, "loss": 0.05370241403579712, "step": 415 }, { "epoch": 1.2074554294975688, "grad_norm": 0.08355578823714238, "learning_rate": 7.485498819710417e-05, "loss": 0.04612640142440796, "step": 420 }, { "epoch": 1.2218620565460112, "grad_norm": 0.087036754767847, "learning_rate": 7.412578851890384e-05, "loss": 0.043773263692855835, "step": 425 }, { "epoch": 1.2362686835944534, "grad_norm": 0.09341830589519805, "learning_rate": 7.338983765648985e-05, "loss": 0.046638333797454835, "step": 430 }, { "epoch": 1.2506753106428956, "grad_norm": 0.09163918271970233, "learning_rate": 7.264734155296912e-05, "loss": 0.045640939474105836, "step": 435 }, { "epoch": 1.265081937691338, "grad_norm": 0.09623135416486957, "learning_rate": 7.189850798302099e-05, "loss": 0.04710923135280609, "step": 440 }, { "epoch": 1.2794885647397802, "grad_norm": 0.09010925699278292, "learning_rate": 7.114354649475499e-05, "loss": 0.04437531530857086, "step": 445 }, { "epoch": 1.2938951917882227, "grad_norm": 0.09828854110045074, "learning_rate": 7.038266835107257e-05, "loss": 0.04155453443527222, "step": 450 }, { "epoch": 1.3083018188366649, "grad_norm": 0.09261388252893078, "learning_rate": 6.961608647054873e-05, "loss": 0.04477185308933258, "step": 455 }, { "epoch": 1.322708445885107, "grad_norm": 0.09199618999958105, "learning_rate": 6.884401536785045e-05, "loss": 0.045587533712387086, "step": 460 }, { "epoch": 1.3371150729335495, "grad_norm": 0.10296954226773448, "learning_rate": 6.806667109370853e-05, "loss": 0.04496743679046631, "step": 465 }, { "epoch": 1.3515216999819917, "grad_norm": 0.0991741419475408, "learning_rate": 6.728427117445948e-05, "loss": 0.04124987423419953, "step": 470 }, { "epoch": 1.365928327030434, "grad_norm": 0.08767468242608127, "learning_rate": 6.649703455117458e-05, "loss": 0.044256627559661865, "step": 475 }, { "epoch": 1.3803349540788763, "grad_norm": 0.08419233546507805, "learning_rate": 6.5705181518393e-05, "loss": 0.047923988103866576, "step": 480 }, { "epoch": 1.3947415811273185, "grad_norm": 0.15529323580619178, "learning_rate": 6.490893366247612e-05, "loss": 0.040982422232627866, "step": 485 }, { "epoch": 1.409148208175761, "grad_norm": 0.08719252163236856, "learning_rate": 6.41085137996006e-05, "loss": 0.0431306004524231, "step": 490 }, { "epoch": 1.4235548352242031, "grad_norm": 0.09381117178448978, "learning_rate": 6.330414591340689e-05, "loss": 0.039784133434295654, "step": 495 }, { "epoch": 1.4379614622726455, "grad_norm": 0.08334433128110437, "learning_rate": 6.249605509232149e-05, "loss": 0.04327746033668518, "step": 500 }, { "epoch": 1.4523680893210877, "grad_norm": 0.09141409005562276, "learning_rate": 6.168446746656973e-05, "loss": 0.04065501093864441, "step": 505 }, { "epoch": 1.46677471636953, "grad_norm": 0.10836927533553822, "learning_rate": 6.0869610144897215e-05, "loss": 0.040621763467788695, "step": 510 }, { "epoch": 1.4811813434179721, "grad_norm": 0.11429670482454558, "learning_rate": 6.005171115101735e-05, "loss": 0.042708945274353025, "step": 515 }, { "epoch": 1.4955879704664146, "grad_norm": 0.10265027708777795, "learning_rate": 5.9230999359802784e-05, "loss": 0.03845831751823425, "step": 520 }, { "epoch": 1.509994597514857, "grad_norm": 0.0937825232136341, "learning_rate": 5.84077044332389e-05, "loss": 0.04369714856147766, "step": 525 }, { "epoch": 1.5244012245632992, "grad_norm": 0.14710934296521627, "learning_rate": 5.7582056756156665e-05, "loss": 0.04057990908622742, "step": 530 }, { "epoch": 1.5388078516117414, "grad_norm": 0.08557873748617338, "learning_rate": 5.675428737176367e-05, "loss": 0.03988811373710632, "step": 535 }, { "epoch": 1.5532144786601836, "grad_norm": 0.08304731519894865, "learning_rate": 5.5924627916990446e-05, "loss": 0.040156081318855286, "step": 540 }, { "epoch": 1.567621105708626, "grad_norm": 0.09009100140646863, "learning_rate": 5.5093310557671074e-05, "loss": 0.04313129186630249, "step": 545 }, { "epoch": 1.5820277327570682, "grad_norm": 0.09229023810015868, "learning_rate": 5.426056792357551e-05, "loss": 0.04041691720485687, "step": 550 }, { "epoch": 1.5964343598055106, "grad_norm": 0.08400211717158966, "learning_rate": 5.342663304331211e-05, "loss": 0.04093085825443268, "step": 555 }, { "epoch": 1.6108409868539528, "grad_norm": 0.09614326424875454, "learning_rate": 5.25917392791188e-05, "loss": 0.039686673879623414, "step": 560 }, { "epoch": 1.625247613902395, "grad_norm": 0.1067845470194038, "learning_rate": 5.1756120261560446e-05, "loss": 0.039973828196525577, "step": 565 }, { "epoch": 1.6396542409508372, "grad_norm": 0.08943621090417164, "learning_rate": 5.092000982415162e-05, "loss": 0.03885244131088257, "step": 570 }, { "epoch": 1.6540608679992796, "grad_norm": 0.08753082979407804, "learning_rate": 5.0083641937922145e-05, "loss": 0.03913732171058655, "step": 575 }, { "epoch": 1.668467495047722, "grad_norm": 0.09803669811995008, "learning_rate": 4.924725064594447e-05, "loss": 0.038859084248542786, "step": 580 }, { "epoch": 1.6828741220961643, "grad_norm": 0.08541143736458823, "learning_rate": 4.8411069997840756e-05, "loss": 0.037244629859924314, "step": 585 }, { "epoch": 1.6972807491446065, "grad_norm": 0.08650694144802851, "learning_rate": 4.757533398428812e-05, "loss": 0.04225952625274658, "step": 590 }, { "epoch": 1.7116873761930487, "grad_norm": 0.09490787276668022, "learning_rate": 4.674027647154037e-05, "loss": 0.03874731659889221, "step": 595 }, { "epoch": 1.726094003241491, "grad_norm": 0.07772058542302925, "learning_rate": 4.590613113598461e-05, "loss": 0.03750569224357605, "step": 600 }, { "epoch": 1.7405006302899335, "grad_norm": 0.07856101825582532, "learning_rate": 4.507313139875102e-05, "loss": 0.03765683174133301, "step": 605 }, { "epoch": 1.7549072573383757, "grad_norm": 0.07088260858693515, "learning_rate": 4.4241510360393804e-05, "loss": 0.03841148316860199, "step": 610 }, { "epoch": 1.769313884386818, "grad_norm": 0.08315598782355023, "learning_rate": 4.341150073566227e-05, "loss": 0.03978689610958099, "step": 615 }, { "epoch": 1.7837205114352601, "grad_norm": 0.08933153255691949, "learning_rate": 4.258333478837947e-05, "loss": 0.038895291090011594, "step": 620 }, { "epoch": 1.7981271384837025, "grad_norm": 0.08396668543385523, "learning_rate": 4.1757244266447245e-05, "loss": 0.04072596728801727, "step": 625 }, { "epoch": 1.8125337655321447, "grad_norm": 0.07957802106126194, "learning_rate": 4.093346033699557e-05, "loss": 0.03865320086479187, "step": 630 }, { "epoch": 1.8269403925805872, "grad_norm": 0.08958406118221353, "learning_rate": 4.011221352169447e-05, "loss": 0.04185936748981476, "step": 635 }, { "epoch": 1.8413470196290294, "grad_norm": 0.08961676019198377, "learning_rate": 3.9293733632246544e-05, "loss": 0.04408974051475525, "step": 640 }, { "epoch": 1.8557536466774716, "grad_norm": 0.07858278806552751, "learning_rate": 3.847824970607797e-05, "loss": 0.04014042019844055, "step": 645 }, { "epoch": 1.8701602737259138, "grad_norm": 0.07419667584622487, "learning_rate": 3.7665989942246625e-05, "loss": 0.03581300973892212, "step": 650 }, { "epoch": 1.8845669007743562, "grad_norm": 0.08037951897237189, "learning_rate": 3.685718163758427e-05, "loss": 0.04189331531524658, "step": 655 }, { "epoch": 1.8989735278227986, "grad_norm": 0.08133067284522653, "learning_rate": 3.6052051123091634e-05, "loss": 0.03912949562072754, "step": 660 }, { "epoch": 1.9133801548712408, "grad_norm": 0.08974888658045152, "learning_rate": 3.5250823700603496e-05, "loss": 0.03808005452156067, "step": 665 }, { "epoch": 1.927786781919683, "grad_norm": 0.07193212698550007, "learning_rate": 3.445372357974194e-05, "loss": 0.03524368405342102, "step": 670 }, { "epoch": 1.9421934089681252, "grad_norm": 0.07439568567213939, "learning_rate": 3.3660973815175165e-05, "loss": 0.03650209903717041, "step": 675 }, { "epoch": 1.9566000360165676, "grad_norm": 0.07586041788325688, "learning_rate": 3.287279624419945e-05, "loss": 0.036546701192855836, "step": 680 }, { "epoch": 1.97100666306501, "grad_norm": 0.08294122441026296, "learning_rate": 3.208941142466187e-05, "loss": 0.03591431975364685, "step": 685 }, { "epoch": 1.9854132901134522, "grad_norm": 0.08528763303850583, "learning_rate": 3.1311038573240975e-05, "loss": 0.03485568761825562, "step": 690 }, { "epoch": 1.9998199171618944, "grad_norm": 0.0756456466151007, "learning_rate": 3.0537895504102874e-05, "loss": 0.037538421154022214, "step": 695 }, { "epoch": 2.0115253016387538, "grad_norm": 0.0987258257656567, "learning_rate": 2.9770198567949546e-05, "loss": 0.027647560834884642, "step": 700 }, { "epoch": 2.025931928687196, "grad_norm": 0.10342059226496335, "learning_rate": 2.900816259147705e-05, "loss": 0.03239924311637878, "step": 705 }, { "epoch": 2.0403385557356386, "grad_norm": 0.08947622183974005, "learning_rate": 2.8252000817259837e-05, "loss": 0.02974867820739746, "step": 710 }, { "epoch": 2.054745182784081, "grad_norm": 0.07819720124564082, "learning_rate": 2.7501924844078534e-05, "loss": 0.027856966853141783, "step": 715 }, { "epoch": 2.069151809832523, "grad_norm": 0.07255651027166257, "learning_rate": 2.6758144567707754e-05, "loss": 0.028209209442138672, "step": 720 }, { "epoch": 2.083558436880965, "grad_norm": 0.0777676865315773, "learning_rate": 2.6020868122180385e-05, "loss": 0.02793322205543518, "step": 725 }, { "epoch": 2.0979650639294074, "grad_norm": 0.08664972293238134, "learning_rate": 2.5290301821544825e-05, "loss": 0.02801375389099121, "step": 730 }, { "epoch": 2.1123716909778496, "grad_norm": 0.08559466896073407, "learning_rate": 2.4566650102131573e-05, "loss": 0.02737850546836853, "step": 735 }, { "epoch": 2.1267783180262922, "grad_norm": 0.07852535239386964, "learning_rate": 2.3850115465345324e-05, "loss": 0.030919501185417177, "step": 740 }, { "epoch": 2.1411849450747344, "grad_norm": 0.08182892636530964, "learning_rate": 2.3140898420998426e-05, "loss": 0.028718733787536622, "step": 745 }, { "epoch": 2.1555915721231766, "grad_norm": 0.07295529971805709, "learning_rate": 2.2439197431201646e-05, "loss": 0.028903046250343324, "step": 750 }, { "epoch": 2.169998199171619, "grad_norm": 0.07624400365106067, "learning_rate": 2.1745208854828058e-05, "loss": 0.024923816323280334, "step": 755 }, { "epoch": 2.184404826220061, "grad_norm": 0.07567603422035397, "learning_rate": 2.105912689256533e-05, "loss": 0.026013752818107604, "step": 760 }, { "epoch": 2.1988114532685037, "grad_norm": 0.07427613549699529, "learning_rate": 2.0381143532572082e-05, "loss": 0.026708921790122984, "step": 765 }, { "epoch": 2.213218080316946, "grad_norm": 0.0721068508797536, "learning_rate": 1.9711448496753297e-05, "loss": 0.02909781038761139, "step": 770 }, { "epoch": 2.227624707365388, "grad_norm": 0.09841381262275949, "learning_rate": 1.905022918766995e-05, "loss": 0.027940624952316286, "step": 775 }, { "epoch": 2.2420313344138303, "grad_norm": 0.0816958462956758, "learning_rate": 1.8397670636097636e-05, "loss": 0.026423072814941405, "step": 780 }, { "epoch": 2.2564379614622725, "grad_norm": 0.07936813973695164, "learning_rate": 1.775395544924885e-05, "loss": 0.028386065363883974, "step": 785 }, { "epoch": 2.270844588510715, "grad_norm": 0.07710097062295308, "learning_rate": 1.7119263759673675e-05, "loss": 0.02769894599914551, "step": 790 }, { "epoch": 2.2852512155591573, "grad_norm": 0.08498281330072474, "learning_rate": 1.6493773174852673e-05, "loss": 0.02839537858963013, "step": 795 }, { "epoch": 2.2996578426075995, "grad_norm": 0.07674813377075432, "learning_rate": 1.587765872749649e-05, "loss": 0.02569463849067688, "step": 800 }, { "epoch": 2.3140644696560417, "grad_norm": 0.06662948325098497, "learning_rate": 1.527109282656611e-05, "loss": 0.028371796011924744, "step": 805 }, { "epoch": 2.328471096704484, "grad_norm": 0.08015839069477317, "learning_rate": 1.4674245209027066e-05, "loss": 0.026229003071784975, "step": 810 }, { "epoch": 2.3428777237529266, "grad_norm": 0.08019588118318016, "learning_rate": 1.4087282892351623e-05, "loss": 0.029995208978652953, "step": 815 }, { "epoch": 2.3572843508013688, "grad_norm": 0.08221863155956374, "learning_rate": 1.3510370127781635e-05, "loss": 0.029001206159591675, "step": 820 }, { "epoch": 2.371690977849811, "grad_norm": 0.07480678399512465, "learning_rate": 1.2943668354365878e-05, "loss": 0.02766028940677643, "step": 825 }, { "epoch": 2.386097604898253, "grad_norm": 0.07477452302806815, "learning_rate": 1.2387336153784018e-05, "loss": 0.02593517005443573, "step": 830 }, { "epoch": 2.4005042319466954, "grad_norm": 0.07081183958851973, "learning_rate": 1.184152920597028e-05, "loss": 0.026943469047546388, "step": 835 }, { "epoch": 2.4149108589951376, "grad_norm": 0.07536754957279856, "learning_rate": 1.1306400245549158e-05, "loss": 0.024954386055469513, "step": 840 }, { "epoch": 2.42931748604358, "grad_norm": 0.06344152496317775, "learning_rate": 1.0782099019095238e-05, "loss": 0.028272977471351622, "step": 845 }, { "epoch": 2.4437241130920224, "grad_norm": 0.0644553682371491, "learning_rate": 1.026877224322923e-05, "loss": 0.02370927333831787, "step": 850 }, { "epoch": 2.4581307401404646, "grad_norm": 0.07529675849595874, "learning_rate": 9.766563563561799e-06, "loss": 0.025498074293136597, "step": 855 }, { "epoch": 2.472537367188907, "grad_norm": 0.08420954265091966, "learning_rate": 9.275613514496977e-06, "loss": 0.02770912051200867, "step": 860 }, { "epoch": 2.486943994237349, "grad_norm": 0.0744332415489311, "learning_rate": 8.7960594799059e-06, "loss": 0.027615338563919067, "step": 865 }, { "epoch": 2.501350621285791, "grad_norm": 0.07212967627396147, "learning_rate": 8.328035654682325e-06, "loss": 0.027428582310676575, "step": 870 }, { "epoch": 2.515757248334234, "grad_norm": 0.08246547759863139, "learning_rate": 7.871673007190599e-06, "loss": 0.026888126134872438, "step": 875 }, { "epoch": 2.530163875382676, "grad_norm": 0.06863337011207567, "learning_rate": 7.427099242616348e-06, "loss": 0.025411182641983034, "step": 880 }, { "epoch": 2.5445705024311183, "grad_norm": 0.06777467806972155, "learning_rate": 6.994438767230466e-06, "loss": 0.024811127781867982, "step": 885 }, { "epoch": 2.5589771294795605, "grad_norm": 0.07029495896606512, "learning_rate": 6.573812653576062e-06, "loss": 0.02613699436187744, "step": 890 }, { "epoch": 2.5733837565280027, "grad_norm": 0.07134936463967867, "learning_rate": 6.1653386065885165e-06, "loss": 0.026964515447616577, "step": 895 }, { "epoch": 2.5877903835764453, "grad_norm": 0.07711841632882044, "learning_rate": 5.769130930657734e-06, "loss": 0.028112486004829407, "step": 900 }, { "epoch": 2.6021970106248875, "grad_norm": 0.08360128959008864, "learning_rate": 5.38530049764206e-06, "loss": 0.02626214623451233, "step": 905 }, { "epoch": 2.6166036376733297, "grad_norm": 0.07456201121764428, "learning_rate": 5.0139547158427e-06, "loss": 0.02669944763183594, "step": 910 }, { "epoch": 2.631010264721772, "grad_norm": 0.07740576081667884, "learning_rate": 4.655197499947378e-06, "loss": 0.029006192088127138, "step": 915 }, { "epoch": 2.645416891770214, "grad_norm": 0.06845350619031464, "learning_rate": 4.309129241951587e-06, "loss": 0.02491077184677124, "step": 920 }, { "epoch": 2.6598235188186568, "grad_norm": 0.07501903308333313, "learning_rate": 3.975846783065662e-06, "loss": 0.026326572895050047, "step": 925 }, { "epoch": 2.674230145867099, "grad_norm": 0.07580375293031513, "learning_rate": 3.6554433866154036e-06, "loss": 0.026823589205741884, "step": 930 }, { "epoch": 2.688636772915541, "grad_norm": 0.06969116474563261, "learning_rate": 3.3480087119440063e-06, "loss": 0.025913709402084352, "step": 935 }, { "epoch": 2.7030433999639834, "grad_norm": 0.0714630826160477, "learning_rate": 3.0536287893223604e-06, "loss": 0.026928871870040894, "step": 940 }, { "epoch": 2.7174500270124256, "grad_norm": 0.07358152299227637, "learning_rate": 2.7723859958750486e-06, "loss": 0.02748822569847107, "step": 945 }, { "epoch": 2.731856654060868, "grad_norm": 0.06838564316740577, "learning_rate": 2.5043590325285195e-06, "loss": 0.025952500104904175, "step": 950 }, { "epoch": 2.7462632811093104, "grad_norm": 0.07787109185214655, "learning_rate": 2.249622901987963e-06, "loss": 0.02589995265007019, "step": 955 }, { "epoch": 2.7606699081577526, "grad_norm": 0.07156945963749864, "learning_rate": 2.0082488877491033e-06, "loss": 0.027577921748161316, "step": 960 }, { "epoch": 2.775076535206195, "grad_norm": 0.06514188446012159, "learning_rate": 1.7803045341507952e-06, "loss": 0.025488072633743288, "step": 965 }, { "epoch": 2.789483162254637, "grad_norm": 0.0712195602884753, "learning_rate": 1.5658536274738621e-06, "loss": 0.02348570078611374, "step": 970 }, { "epoch": 2.8038897893030796, "grad_norm": 0.0680133235009968, "learning_rate": 1.3649561780916199e-06, "loss": 0.02316732406616211, "step": 975 }, { "epoch": 2.818296416351522, "grad_norm": 0.0824565977146897, "learning_rate": 1.1776684036770347e-06, "loss": 0.02901957035064697, "step": 980 }, { "epoch": 2.832703043399964, "grad_norm": 0.08111572063117606, "learning_rate": 1.004042713471165e-06, "loss": 0.02710677683353424, "step": 985 }, { "epoch": 2.8471096704484062, "grad_norm": 0.07416113908713114, "learning_rate": 8.441276936173193e-07, "loss": 0.024537976086139678, "step": 990 }, { "epoch": 2.8615162974968484, "grad_norm": 0.06645937685734804, "learning_rate": 6.9796809356511e-07, "loss": 0.025470972061157227, "step": 995 }, { "epoch": 2.875922924545291, "grad_norm": 0.07056688302520532, "learning_rate": 5.656048135480763e-07, "loss": 0.025230163335800172, "step": 1000 }, { "epoch": 2.8903295515937333, "grad_norm": 0.07480029198072068, "learning_rate": 4.470748931384494e-07, "loss": 0.026770299673080443, "step": 1005 }, { "epoch": 2.9047361786421755, "grad_norm": 0.06476290220031579, "learning_rate": 3.424115008822726e-07, "loss": 0.026645660400390625, "step": 1010 }, { "epoch": 2.9191428056906177, "grad_norm": 0.07374044092567203, "learning_rate": 2.5164392501777487e-07, "loss": 0.025820019841194152, "step": 1015 }, { "epoch": 2.93354943273906, "grad_norm": 0.07098709082144111, "learning_rate": 1.7479756527955527e-07, "loss": 0.025720816850662232, "step": 1020 }, { "epoch": 2.9479560597875025, "grad_norm": 0.07593395611493338, "learning_rate": 1.1189392579090129e-07, "loss": 0.024733534455299376, "step": 1025 }, { "epoch": 2.9623626868359443, "grad_norm": 0.07179585283776127, "learning_rate": 6.295060904623617e-08, "loss": 0.02832019031047821, "step": 1030 }, { "epoch": 2.976769313884387, "grad_norm": 0.06802635060193646, "learning_rate": 2.7981310985369935e-08, "loss": 0.025465887784957886, "step": 1035 }, { "epoch": 2.991175940932829, "grad_norm": 0.0759224455019542, "learning_rate": 6.995817160920792e-09, "loss": 0.0264853298664093, "step": 1040 }, { "epoch": 3.0, "step": 1044, "total_flos": 1577088536150016.0, "train_loss": 0.06165807318099385, "train_runtime": 23128.4215, "train_samples_per_second": 2.881, "train_steps_per_second": 0.045 } ], "logging_steps": 5, "max_steps": 1044, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 207, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1577088536150016.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }