{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.4999945685825087, "eval_steps": 500, "global_step": 11507, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00043451339930695114, "grad_norm": 2.7321550846099854, "learning_rate": 5.780346820809249e-07, "loss": 0.2142, "step": 10 }, { "epoch": 0.0008690267986139023, "grad_norm": 1.0791596174240112, "learning_rate": 1.1560693641618499e-06, "loss": 0.1757, "step": 20 }, { "epoch": 0.0013035401979208535, "grad_norm": 0.9877675175666809, "learning_rate": 1.7341040462427746e-06, "loss": 0.1539, "step": 30 }, { "epoch": 0.0017380535972278046, "grad_norm": 0.6228306293487549, "learning_rate": 2.3121387283236997e-06, "loss": 0.111, "step": 40 }, { "epoch": 0.0021725669965347557, "grad_norm": 0.626401960849762, "learning_rate": 2.890173410404625e-06, "loss": 0.0919, "step": 50 }, { "epoch": 0.002607080395841707, "grad_norm": 0.47861582040786743, "learning_rate": 3.468208092485549e-06, "loss": 0.0713, "step": 60 }, { "epoch": 0.003041593795148658, "grad_norm": 0.6577467918395996, "learning_rate": 4.046242774566474e-06, "loss": 0.0638, "step": 70 }, { "epoch": 0.003476107194455609, "grad_norm": 0.49629291892051697, "learning_rate": 4.6242774566473994e-06, "loss": 0.0648, "step": 80 }, { "epoch": 0.00391062059376256, "grad_norm": 0.4986942708492279, "learning_rate": 5.202312138728324e-06, "loss": 0.0547, "step": 90 }, { "epoch": 0.004345133993069511, "grad_norm": 0.48853418231010437, "learning_rate": 5.78034682080925e-06, "loss": 0.0578, "step": 100 }, { "epoch": 0.004779647392376463, "grad_norm": 0.39011338353157043, "learning_rate": 6.358381502890174e-06, "loss": 0.0392, "step": 110 }, { "epoch": 0.005214160791683414, "grad_norm": 0.6059409976005554, "learning_rate": 6.936416184971098e-06, "loss": 0.0455, "step": 120 }, { "epoch": 0.005648674190990364, "grad_norm": 0.3396739363670349, "learning_rate": 7.5144508670520235e-06, "loss": 0.0415, "step": 130 }, { "epoch": 0.006083187590297316, "grad_norm": 0.6290504336357117, "learning_rate": 8.092485549132949e-06, "loss": 0.0452, "step": 140 }, { "epoch": 0.006517700989604267, "grad_norm": 0.38049280643463135, "learning_rate": 8.670520231213873e-06, "loss": 0.04, "step": 150 }, { "epoch": 0.006952214388911218, "grad_norm": 0.32164865732192993, "learning_rate": 9.248554913294799e-06, "loss": 0.0396, "step": 160 }, { "epoch": 0.00738672778821817, "grad_norm": 0.5390863418579102, "learning_rate": 9.826589595375723e-06, "loss": 0.0466, "step": 170 }, { "epoch": 0.00782124118752512, "grad_norm": 0.31393370032310486, "learning_rate": 1.0404624277456647e-05, "loss": 0.0397, "step": 180 }, { "epoch": 0.008255754586832071, "grad_norm": 0.2798372805118561, "learning_rate": 1.0982658959537573e-05, "loss": 0.0329, "step": 190 }, { "epoch": 0.008690267986139023, "grad_norm": 0.3006483018398285, "learning_rate": 1.15606936416185e-05, "loss": 0.0462, "step": 200 }, { "epoch": 0.009124781385445974, "grad_norm": 0.40412917733192444, "learning_rate": 1.2138728323699422e-05, "loss": 0.0371, "step": 210 }, { "epoch": 0.009559294784752925, "grad_norm": 0.366756796836853, "learning_rate": 1.2716763005780348e-05, "loss": 0.0362, "step": 220 }, { "epoch": 0.009993808184059877, "grad_norm": 0.32743439078330994, "learning_rate": 1.3294797687861272e-05, "loss": 0.0343, "step": 230 }, { "epoch": 0.010428321583366828, "grad_norm": 0.3373115360736847, "learning_rate": 1.3872832369942197e-05, "loss": 0.0376, "step": 240 }, { "epoch": 0.010862834982673777, "grad_norm": 0.3147127032279968, "learning_rate": 1.4450867052023123e-05, "loss": 0.0376, "step": 250 }, { "epoch": 0.011297348381980729, "grad_norm": 0.33591964840888977, "learning_rate": 1.5028901734104047e-05, "loss": 0.0408, "step": 260 }, { "epoch": 0.01173186178128768, "grad_norm": 0.21263393759727478, "learning_rate": 1.5606936416184973e-05, "loss": 0.029, "step": 270 }, { "epoch": 0.012166375180594631, "grad_norm": 0.2800604999065399, "learning_rate": 1.6184971098265897e-05, "loss": 0.0318, "step": 280 }, { "epoch": 0.012600888579901583, "grad_norm": 0.38907432556152344, "learning_rate": 1.676300578034682e-05, "loss": 0.0307, "step": 290 }, { "epoch": 0.013035401979208534, "grad_norm": 0.25566425919532776, "learning_rate": 1.7341040462427746e-05, "loss": 0.0384, "step": 300 }, { "epoch": 0.013469915378515485, "grad_norm": 0.28023627400398254, "learning_rate": 1.7919075144508673e-05, "loss": 0.0285, "step": 310 }, { "epoch": 0.013904428777822437, "grad_norm": 0.3044200837612152, "learning_rate": 1.8497109826589598e-05, "loss": 0.0343, "step": 320 }, { "epoch": 0.014338942177129388, "grad_norm": 0.2846660017967224, "learning_rate": 1.9075144508670522e-05, "loss": 0.0315, "step": 330 }, { "epoch": 0.01477345557643634, "grad_norm": 0.27756503224372864, "learning_rate": 1.9653179190751446e-05, "loss": 0.027, "step": 340 }, { "epoch": 0.015207968975743289, "grad_norm": 0.34100356698036194, "learning_rate": 1.9999993661544142e-05, "loss": 0.0268, "step": 350 }, { "epoch": 0.01564248237505024, "grad_norm": 0.17514333128929138, "learning_rate": 1.999992235400801e-05, "loss": 0.0312, "step": 360 }, { "epoch": 0.016076995774357193, "grad_norm": 0.26727843284606934, "learning_rate": 1.9999771816432783e-05, "loss": 0.0328, "step": 370 }, { "epoch": 0.016511509173664143, "grad_norm": 0.2973346412181854, "learning_rate": 1.9999542050011175e-05, "loss": 0.0291, "step": 380 }, { "epoch": 0.016946022572971092, "grad_norm": 0.21407945454120636, "learning_rate": 1.999923305656364e-05, "loss": 0.0257, "step": 390 }, { "epoch": 0.017380535972278045, "grad_norm": 0.23754067718982697, "learning_rate": 1.999884483853836e-05, "loss": 0.0313, "step": 400 }, { "epoch": 0.017815049371584995, "grad_norm": 0.2189449518918991, "learning_rate": 1.9998377399011203e-05, "loss": 0.0334, "step": 410 }, { "epoch": 0.018249562770891948, "grad_norm": 0.2209245264530182, "learning_rate": 1.9997830741685734e-05, "loss": 0.0308, "step": 420 }, { "epoch": 0.018684076170198897, "grad_norm": 0.1769685447216034, "learning_rate": 1.9997204870893147e-05, "loss": 0.0304, "step": 430 }, { "epoch": 0.01911858956950585, "grad_norm": 0.20189526677131653, "learning_rate": 1.9996499791592257e-05, "loss": 0.0272, "step": 440 }, { "epoch": 0.0195531029688128, "grad_norm": 0.20217107236385345, "learning_rate": 1.9995715509369456e-05, "loss": 0.029, "step": 450 }, { "epoch": 0.019987616368119753, "grad_norm": 0.2422274649143219, "learning_rate": 1.999485203043866e-05, "loss": 0.0258, "step": 460 }, { "epoch": 0.020422129767426703, "grad_norm": 0.22238817811012268, "learning_rate": 1.9993909361641272e-05, "loss": 0.0279, "step": 470 }, { "epoch": 0.020856643166733656, "grad_norm": 0.18254493176937103, "learning_rate": 1.999288751044612e-05, "loss": 0.0261, "step": 480 }, { "epoch": 0.021291156566040605, "grad_norm": 0.3444249629974365, "learning_rate": 1.9991786484949397e-05, "loss": 0.0319, "step": 490 }, { "epoch": 0.021725669965347555, "grad_norm": 0.22629131376743317, "learning_rate": 1.99906062938746e-05, "loss": 0.0311, "step": 500 }, { "epoch": 0.022160183364654508, "grad_norm": 0.22771018743515015, "learning_rate": 1.9989346946572455e-05, "loss": 0.0317, "step": 510 }, { "epoch": 0.022594696763961458, "grad_norm": 0.22105267643928528, "learning_rate": 1.998800845302086e-05, "loss": 0.0271, "step": 520 }, { "epoch": 0.02302921016326841, "grad_norm": 0.20002543926239014, "learning_rate": 1.9986590823824785e-05, "loss": 0.0285, "step": 530 }, { "epoch": 0.02346372356257536, "grad_norm": 0.27326658368110657, "learning_rate": 1.99850940702162e-05, "loss": 0.0242, "step": 540 }, { "epoch": 0.023898236961882313, "grad_norm": 0.1711287945508957, "learning_rate": 1.9983518204053976e-05, "loss": 0.022, "step": 550 }, { "epoch": 0.024332750361189263, "grad_norm": 0.15666693449020386, "learning_rate": 1.9981863237823807e-05, "loss": 0.0248, "step": 560 }, { "epoch": 0.024767263760496216, "grad_norm": 0.2459748536348343, "learning_rate": 1.9980129184638103e-05, "loss": 0.0259, "step": 570 }, { "epoch": 0.025201777159803165, "grad_norm": 0.17908240854740143, "learning_rate": 1.9978316058235875e-05, "loss": 0.0263, "step": 580 }, { "epoch": 0.025636290559110115, "grad_norm": 0.17861013114452362, "learning_rate": 1.9976423872982646e-05, "loss": 0.0235, "step": 590 }, { "epoch": 0.026070803958417068, "grad_norm": 0.19937822222709656, "learning_rate": 1.9974452643870318e-05, "loss": 0.0294, "step": 600 }, { "epoch": 0.026505317357724018, "grad_norm": 0.14718875288963318, "learning_rate": 1.9972402386517076e-05, "loss": 0.0175, "step": 610 }, { "epoch": 0.02693983075703097, "grad_norm": 0.16689449548721313, "learning_rate": 1.997027311716724e-05, "loss": 0.0219, "step": 620 }, { "epoch": 0.02737434415633792, "grad_norm": 0.191191628575325, "learning_rate": 1.9968064852691145e-05, "loss": 0.021, "step": 630 }, { "epoch": 0.027808857555644873, "grad_norm": 0.16465428471565247, "learning_rate": 1.9965777610585023e-05, "loss": 0.025, "step": 640 }, { "epoch": 0.028243370954951823, "grad_norm": 0.1445729285478592, "learning_rate": 1.9963411408970837e-05, "loss": 0.0262, "step": 650 }, { "epoch": 0.028677884354258776, "grad_norm": 0.17803451418876648, "learning_rate": 1.9960966266596164e-05, "loss": 0.0215, "step": 660 }, { "epoch": 0.029112397753565725, "grad_norm": 0.11042923480272293, "learning_rate": 1.995844220283402e-05, "loss": 0.0246, "step": 670 }, { "epoch": 0.02954691115287268, "grad_norm": 0.2461649775505066, "learning_rate": 1.995583923768273e-05, "loss": 0.0242, "step": 680 }, { "epoch": 0.029981424552179628, "grad_norm": 0.17665310204029083, "learning_rate": 1.995315739176576e-05, "loss": 0.0266, "step": 690 }, { "epoch": 0.030415937951486578, "grad_norm": 0.18192529678344727, "learning_rate": 1.9950396686331543e-05, "loss": 0.0224, "step": 700 }, { "epoch": 0.03085045135079353, "grad_norm": 0.18379367887973785, "learning_rate": 1.9947557143253337e-05, "loss": 0.021, "step": 710 }, { "epoch": 0.03128496475010048, "grad_norm": 0.1325334906578064, "learning_rate": 1.9944638785029024e-05, "loss": 0.0211, "step": 720 }, { "epoch": 0.03171947814940743, "grad_norm": 0.1236315667629242, "learning_rate": 1.9941641634780942e-05, "loss": 0.0176, "step": 730 }, { "epoch": 0.032153991548714386, "grad_norm": 0.17992247641086578, "learning_rate": 1.993856571625572e-05, "loss": 0.022, "step": 740 }, { "epoch": 0.03258850494802133, "grad_norm": 0.21584922075271606, "learning_rate": 1.9935411053824057e-05, "loss": 0.0218, "step": 750 }, { "epoch": 0.033023018347328285, "grad_norm": 0.1667386144399643, "learning_rate": 1.9932177672480544e-05, "loss": 0.0236, "step": 760 }, { "epoch": 0.03345753174663524, "grad_norm": 0.1880834996700287, "learning_rate": 1.992886559784348e-05, "loss": 0.0233, "step": 770 }, { "epoch": 0.033892045145942185, "grad_norm": 0.16038943827152252, "learning_rate": 1.9925474856154645e-05, "loss": 0.0174, "step": 780 }, { "epoch": 0.03432655854524914, "grad_norm": 0.22902469336986542, "learning_rate": 1.992200547427911e-05, "loss": 0.0245, "step": 790 }, { "epoch": 0.03476107194455609, "grad_norm": 0.1996387541294098, "learning_rate": 1.9918457479705012e-05, "loss": 0.02, "step": 800 }, { "epoch": 0.035195585343863044, "grad_norm": 0.13516375422477722, "learning_rate": 1.9914830900543346e-05, "loss": 0.0238, "step": 810 }, { "epoch": 0.03563009874316999, "grad_norm": 0.21028189361095428, "learning_rate": 1.991112576552774e-05, "loss": 0.0265, "step": 820 }, { "epoch": 0.03606461214247694, "grad_norm": 0.1437552571296692, "learning_rate": 1.9907342104014213e-05, "loss": 0.0198, "step": 830 }, { "epoch": 0.036499125541783896, "grad_norm": 0.12813818454742432, "learning_rate": 1.9903479945980964e-05, "loss": 0.0186, "step": 840 }, { "epoch": 0.03693363894109085, "grad_norm": 0.13648340106010437, "learning_rate": 1.9899539322028128e-05, "loss": 0.0216, "step": 850 }, { "epoch": 0.037368152340397795, "grad_norm": 0.23872444033622742, "learning_rate": 1.9895520263377523e-05, "loss": 0.0279, "step": 860 }, { "epoch": 0.03780266573970475, "grad_norm": 0.17388087511062622, "learning_rate": 1.989142280187242e-05, "loss": 0.0247, "step": 870 }, { "epoch": 0.0382371791390117, "grad_norm": 0.26048898696899414, "learning_rate": 1.9887246969977266e-05, "loss": 0.0193, "step": 880 }, { "epoch": 0.03867169253831865, "grad_norm": 0.15297527611255646, "learning_rate": 1.9882992800777458e-05, "loss": 0.018, "step": 890 }, { "epoch": 0.0391062059376256, "grad_norm": 0.21532462537288666, "learning_rate": 1.987866032797905e-05, "loss": 0.0195, "step": 900 }, { "epoch": 0.03954071933693255, "grad_norm": 0.16272452473640442, "learning_rate": 1.987424958590851e-05, "loss": 0.0188, "step": 910 }, { "epoch": 0.039975232736239506, "grad_norm": 0.16188514232635498, "learning_rate": 1.9869760609512435e-05, "loss": 0.0193, "step": 920 }, { "epoch": 0.04040974613554645, "grad_norm": 0.16015928983688354, "learning_rate": 1.9865193434357284e-05, "loss": 0.021, "step": 930 }, { "epoch": 0.040844259534853405, "grad_norm": 0.14305295050144196, "learning_rate": 1.9860548096629084e-05, "loss": 0.0199, "step": 940 }, { "epoch": 0.04127877293416036, "grad_norm": 0.17772972583770752, "learning_rate": 1.985582463313314e-05, "loss": 0.0233, "step": 950 }, { "epoch": 0.04171328633346731, "grad_norm": 0.17599040269851685, "learning_rate": 1.985102308129377e-05, "loss": 0.0225, "step": 960 }, { "epoch": 0.04214779973277426, "grad_norm": 0.17291192710399628, "learning_rate": 1.9846143479153986e-05, "loss": 0.0224, "step": 970 }, { "epoch": 0.04258231313208121, "grad_norm": 0.15880149602890015, "learning_rate": 1.9841185865375186e-05, "loss": 0.0235, "step": 980 }, { "epoch": 0.043016826531388164, "grad_norm": 0.147139310836792, "learning_rate": 1.9836150279236878e-05, "loss": 0.0181, "step": 990 }, { "epoch": 0.04345133993069511, "grad_norm": 0.17359748482704163, "learning_rate": 1.9831036760636334e-05, "loss": 0.0204, "step": 1000 }, { "epoch": 0.04388585333000206, "grad_norm": 0.1651877760887146, "learning_rate": 1.98258453500883e-05, "loss": 0.0223, "step": 1010 }, { "epoch": 0.044320366729309016, "grad_norm": 0.2016420215368271, "learning_rate": 1.982057608872466e-05, "loss": 0.0263, "step": 1020 }, { "epoch": 0.04475488012861597, "grad_norm": 0.20891492068767548, "learning_rate": 1.981522901829411e-05, "loss": 0.0236, "step": 1030 }, { "epoch": 0.045189393527922915, "grad_norm": 0.1273709386587143, "learning_rate": 1.9809804181161857e-05, "loss": 0.0237, "step": 1040 }, { "epoch": 0.04562390692722987, "grad_norm": 0.22251392900943756, "learning_rate": 1.9804301620309223e-05, "loss": 0.0232, "step": 1050 }, { "epoch": 0.04605842032653682, "grad_norm": 0.20716802775859833, "learning_rate": 1.9798721379333363e-05, "loss": 0.0194, "step": 1060 }, { "epoch": 0.046492933725843774, "grad_norm": 0.18907934427261353, "learning_rate": 1.9793063502446894e-05, "loss": 0.0192, "step": 1070 }, { "epoch": 0.04692744712515072, "grad_norm": 0.13406594097614288, "learning_rate": 1.978732803447754e-05, "loss": 0.0187, "step": 1080 }, { "epoch": 0.04736196052445767, "grad_norm": 0.16759559512138367, "learning_rate": 1.9781515020867793e-05, "loss": 0.0207, "step": 1090 }, { "epoch": 0.047796473923764626, "grad_norm": 0.1420600861310959, "learning_rate": 1.9775624507674543e-05, "loss": 0.0157, "step": 1100 }, { "epoch": 0.04823098732307157, "grad_norm": 0.08774017542600632, "learning_rate": 1.9769656541568703e-05, "loss": 0.0172, "step": 1110 }, { "epoch": 0.048665500722378525, "grad_norm": 0.13488835096359253, "learning_rate": 1.9763611169834865e-05, "loss": 0.0192, "step": 1120 }, { "epoch": 0.04910001412168548, "grad_norm": 0.34726813435554504, "learning_rate": 1.9757488440370904e-05, "loss": 0.0182, "step": 1130 }, { "epoch": 0.04953452752099243, "grad_norm": 0.12690973281860352, "learning_rate": 1.9751288401687603e-05, "loss": 0.0159, "step": 1140 }, { "epoch": 0.04996904092029938, "grad_norm": 0.16107550263404846, "learning_rate": 1.9745011102908277e-05, "loss": 0.0229, "step": 1150 }, { "epoch": 0.05040355431960633, "grad_norm": 0.2785297930240631, "learning_rate": 1.9738656593768372e-05, "loss": 0.0178, "step": 1160 }, { "epoch": 0.050838067718913284, "grad_norm": 0.14942985773086548, "learning_rate": 1.9732224924615083e-05, "loss": 0.0194, "step": 1170 }, { "epoch": 0.05127258111822023, "grad_norm": 0.1952151656150818, "learning_rate": 1.9725716146406948e-05, "loss": 0.0257, "step": 1180 }, { "epoch": 0.05170709451752718, "grad_norm": 0.10802031308412552, "learning_rate": 1.9719130310713438e-05, "loss": 0.0163, "step": 1190 }, { "epoch": 0.052141607916834136, "grad_norm": 0.15618370473384857, "learning_rate": 1.971246746971456e-05, "loss": 0.0219, "step": 1200 }, { "epoch": 0.05257612131614109, "grad_norm": 0.11094862222671509, "learning_rate": 1.9705727676200443e-05, "loss": 0.0158, "step": 1210 }, { "epoch": 0.053010634715448035, "grad_norm": 0.20805992186069489, "learning_rate": 1.9698910983570907e-05, "loss": 0.0213, "step": 1220 }, { "epoch": 0.05344514811475499, "grad_norm": 0.14978080987930298, "learning_rate": 1.9692017445835057e-05, "loss": 0.0195, "step": 1230 }, { "epoch": 0.05387966151406194, "grad_norm": 0.17037218809127808, "learning_rate": 1.968504711761084e-05, "loss": 0.0241, "step": 1240 }, { "epoch": 0.054314174913368894, "grad_norm": 0.1423705667257309, "learning_rate": 1.9678000054124626e-05, "loss": 0.0227, "step": 1250 }, { "epoch": 0.05474868831267584, "grad_norm": 0.1888200342655182, "learning_rate": 1.9670876311210763e-05, "loss": 0.0201, "step": 1260 }, { "epoch": 0.05518320171198279, "grad_norm": 0.1742173284292221, "learning_rate": 1.9663675945311125e-05, "loss": 0.0199, "step": 1270 }, { "epoch": 0.055617715111289746, "grad_norm": 0.17522349953651428, "learning_rate": 1.9656399013474686e-05, "loss": 0.0191, "step": 1280 }, { "epoch": 0.05605222851059669, "grad_norm": 0.1633007824420929, "learning_rate": 1.9649045573357053e-05, "loss": 0.0197, "step": 1290 }, { "epoch": 0.056486741909903646, "grad_norm": 0.1677575409412384, "learning_rate": 1.9641615683220017e-05, "loss": 0.0204, "step": 1300 }, { "epoch": 0.0569212553092106, "grad_norm": 0.17271777987480164, "learning_rate": 1.9634109401931076e-05, "loss": 0.0189, "step": 1310 }, { "epoch": 0.05735576870851755, "grad_norm": 0.13161471486091614, "learning_rate": 1.962652678896299e-05, "loss": 0.0209, "step": 1320 }, { "epoch": 0.0577902821078245, "grad_norm": 0.1275712102651596, "learning_rate": 1.9618867904393303e-05, "loss": 0.0165, "step": 1330 }, { "epoch": 0.05822479550713145, "grad_norm": 0.1670198291540146, "learning_rate": 1.9611132808903854e-05, "loss": 0.0187, "step": 1340 }, { "epoch": 0.058659308906438404, "grad_norm": 0.14722135663032532, "learning_rate": 1.960332156378031e-05, "loss": 0.0186, "step": 1350 }, { "epoch": 0.05909382230574536, "grad_norm": 0.21211297810077667, "learning_rate": 1.9595434230911676e-05, "loss": 0.0188, "step": 1360 }, { "epoch": 0.0595283357050523, "grad_norm": 0.14945900440216064, "learning_rate": 1.9587470872789813e-05, "loss": 0.0184, "step": 1370 }, { "epoch": 0.059962849104359256, "grad_norm": 0.12674035131931305, "learning_rate": 1.957943155250892e-05, "loss": 0.0168, "step": 1380 }, { "epoch": 0.06039736250366621, "grad_norm": 0.14971709251403809, "learning_rate": 1.9571316333765066e-05, "loss": 0.0169, "step": 1390 }, { "epoch": 0.060831875902973155, "grad_norm": 0.1633756160736084, "learning_rate": 1.9563125280855655e-05, "loss": 0.0182, "step": 1400 }, { "epoch": 0.06126638930228011, "grad_norm": 0.15784451365470886, "learning_rate": 1.9554858458678935e-05, "loss": 0.0177, "step": 1410 }, { "epoch": 0.06170090270158706, "grad_norm": 0.1466844081878662, "learning_rate": 1.9546515932733482e-05, "loss": 0.0204, "step": 1420 }, { "epoch": 0.062135416100894014, "grad_norm": 0.11465369910001755, "learning_rate": 1.9538097769117673e-05, "loss": 0.0189, "step": 1430 }, { "epoch": 0.06256992950020096, "grad_norm": 0.1450086385011673, "learning_rate": 1.9529604034529167e-05, "loss": 0.0216, "step": 1440 }, { "epoch": 0.06300444289950792, "grad_norm": 0.1369352489709854, "learning_rate": 1.952103479626438e-05, "loss": 0.0185, "step": 1450 }, { "epoch": 0.06343895629881487, "grad_norm": 0.1455765813589096, "learning_rate": 1.951239012221795e-05, "loss": 0.0164, "step": 1460 }, { "epoch": 0.06387346969812181, "grad_norm": 0.1134624183177948, "learning_rate": 1.9503670080882196e-05, "loss": 0.0202, "step": 1470 }, { "epoch": 0.06430798309742877, "grad_norm": 0.16108715534210205, "learning_rate": 1.949487474134657e-05, "loss": 0.02, "step": 1480 }, { "epoch": 0.06474249649673572, "grad_norm": 0.13906016945838928, "learning_rate": 1.9486004173297127e-05, "loss": 0.0155, "step": 1490 }, { "epoch": 0.06517700989604266, "grad_norm": 0.18072283267974854, "learning_rate": 1.9477058447015958e-05, "loss": 0.0199, "step": 1500 }, { "epoch": 0.06561152329534962, "grad_norm": 0.13988341391086578, "learning_rate": 1.9468037633380638e-05, "loss": 0.0176, "step": 1510 }, { "epoch": 0.06604603669465657, "grad_norm": 0.21149617433547974, "learning_rate": 1.9458941803863662e-05, "loss": 0.018, "step": 1520 }, { "epoch": 0.06648055009396352, "grad_norm": 0.13330025970935822, "learning_rate": 1.9449771030531884e-05, "loss": 0.0159, "step": 1530 }, { "epoch": 0.06691506349327048, "grad_norm": 0.11512052267789841, "learning_rate": 1.9440525386045938e-05, "loss": 0.0184, "step": 1540 }, { "epoch": 0.06734957689257742, "grad_norm": 0.14134028553962708, "learning_rate": 1.9431204943659673e-05, "loss": 0.0185, "step": 1550 }, { "epoch": 0.06778409029188437, "grad_norm": 0.1068054661154747, "learning_rate": 1.9421809777219566e-05, "loss": 0.0156, "step": 1560 }, { "epoch": 0.06821860369119133, "grad_norm": 0.19036848843097687, "learning_rate": 1.9412339961164125e-05, "loss": 0.0224, "step": 1570 }, { "epoch": 0.06865311709049828, "grad_norm": 0.1468607634305954, "learning_rate": 1.9402795570523337e-05, "loss": 0.0206, "step": 1580 }, { "epoch": 0.06908763048980524, "grad_norm": 0.11990370601415634, "learning_rate": 1.9393176680918023e-05, "loss": 0.0189, "step": 1590 }, { "epoch": 0.06952214388911218, "grad_norm": 0.18098638951778412, "learning_rate": 1.938348336855928e-05, "loss": 0.0217, "step": 1600 }, { "epoch": 0.06995665728841913, "grad_norm": 0.15582935512065887, "learning_rate": 1.9373715710247855e-05, "loss": 0.0208, "step": 1610 }, { "epoch": 0.07039117068772609, "grad_norm": 0.11491195857524872, "learning_rate": 1.9363873783373538e-05, "loss": 0.0187, "step": 1620 }, { "epoch": 0.07082568408703303, "grad_norm": 0.11585468053817749, "learning_rate": 1.935395766591457e-05, "loss": 0.0172, "step": 1630 }, { "epoch": 0.07126019748633998, "grad_norm": 0.21553751826286316, "learning_rate": 1.934396743643699e-05, "loss": 0.0154, "step": 1640 }, { "epoch": 0.07169471088564694, "grad_norm": 0.1415535807609558, "learning_rate": 1.9333903174094042e-05, "loss": 0.0164, "step": 1650 }, { "epoch": 0.07212922428495389, "grad_norm": 0.1466447412967682, "learning_rate": 1.9323764958625538e-05, "loss": 0.0177, "step": 1660 }, { "epoch": 0.07256373768426083, "grad_norm": 0.16668741405010223, "learning_rate": 1.931355287035722e-05, "loss": 0.0221, "step": 1670 }, { "epoch": 0.07299825108356779, "grad_norm": 0.15750114619731903, "learning_rate": 1.9303266990200132e-05, "loss": 0.0194, "step": 1680 }, { "epoch": 0.07343276448287474, "grad_norm": 0.13911347091197968, "learning_rate": 1.9292907399649974e-05, "loss": 0.0154, "step": 1690 }, { "epoch": 0.0738672778821817, "grad_norm": 0.12891244888305664, "learning_rate": 1.928247418078646e-05, "loss": 0.0168, "step": 1700 }, { "epoch": 0.07430179128148864, "grad_norm": 0.11099886149168015, "learning_rate": 1.927196741627267e-05, "loss": 0.0206, "step": 1710 }, { "epoch": 0.07473630468079559, "grad_norm": 0.1321699619293213, "learning_rate": 1.926138718935438e-05, "loss": 0.0174, "step": 1720 }, { "epoch": 0.07517081808010255, "grad_norm": 0.12956419587135315, "learning_rate": 1.9250733583859426e-05, "loss": 0.0183, "step": 1730 }, { "epoch": 0.0756053314794095, "grad_norm": 0.14954937994480133, "learning_rate": 1.9240006684197018e-05, "loss": 0.0219, "step": 1740 }, { "epoch": 0.07603984487871644, "grad_norm": 0.1368705779314041, "learning_rate": 1.9229206575357086e-05, "loss": 0.0223, "step": 1750 }, { "epoch": 0.0764743582780234, "grad_norm": 0.09426697343587875, "learning_rate": 1.9218333342909595e-05, "loss": 0.0178, "step": 1760 }, { "epoch": 0.07690887167733035, "grad_norm": 0.13452351093292236, "learning_rate": 1.9207387073003882e-05, "loss": 0.0159, "step": 1770 }, { "epoch": 0.0773433850766373, "grad_norm": 0.13616427779197693, "learning_rate": 1.9196367852367958e-05, "loss": 0.0163, "step": 1780 }, { "epoch": 0.07777789847594425, "grad_norm": 0.09108922630548477, "learning_rate": 1.918527576830783e-05, "loss": 0.0167, "step": 1790 }, { "epoch": 0.0782124118752512, "grad_norm": 0.1489681601524353, "learning_rate": 1.9174110908706803e-05, "loss": 0.0191, "step": 1800 }, { "epoch": 0.07864692527455816, "grad_norm": 0.1320810317993164, "learning_rate": 1.916287336202479e-05, "loss": 0.015, "step": 1810 }, { "epoch": 0.0790814386738651, "grad_norm": 0.1843460649251938, "learning_rate": 1.9151563217297612e-05, "loss": 0.0153, "step": 1820 }, { "epoch": 0.07951595207317205, "grad_norm": 0.1414232701063156, "learning_rate": 1.9140180564136285e-05, "loss": 0.0164, "step": 1830 }, { "epoch": 0.07995046547247901, "grad_norm": 0.2440994381904602, "learning_rate": 1.912872549272631e-05, "loss": 0.021, "step": 1840 }, { "epoch": 0.08038497887178596, "grad_norm": 0.15282997488975525, "learning_rate": 1.9117198093826973e-05, "loss": 0.0177, "step": 1850 }, { "epoch": 0.0808194922710929, "grad_norm": 0.1545763909816742, "learning_rate": 1.910559845877061e-05, "loss": 0.0162, "step": 1860 }, { "epoch": 0.08125400567039986, "grad_norm": 0.11866192519664764, "learning_rate": 1.9093926679461883e-05, "loss": 0.0169, "step": 1870 }, { "epoch": 0.08168851906970681, "grad_norm": 0.16106340289115906, "learning_rate": 1.9082182848377066e-05, "loss": 0.0175, "step": 1880 }, { "epoch": 0.08212303246901376, "grad_norm": 0.11041273176670074, "learning_rate": 1.90703670585633e-05, "loss": 0.0185, "step": 1890 }, { "epoch": 0.08255754586832072, "grad_norm": 0.1270485520362854, "learning_rate": 1.9058479403637867e-05, "loss": 0.0181, "step": 1900 }, { "epoch": 0.08299205926762766, "grad_norm": 0.16733244061470032, "learning_rate": 1.9046519977787424e-05, "loss": 0.0139, "step": 1910 }, { "epoch": 0.08342657266693462, "grad_norm": 0.13699039816856384, "learning_rate": 1.9034488875767296e-05, "loss": 0.013, "step": 1920 }, { "epoch": 0.08386108606624157, "grad_norm": 0.10159099847078323, "learning_rate": 1.9022386192900682e-05, "loss": 0.019, "step": 1930 }, { "epoch": 0.08429559946554852, "grad_norm": 0.13112778961658478, "learning_rate": 1.9010212025077938e-05, "loss": 0.0149, "step": 1940 }, { "epoch": 0.08473011286485548, "grad_norm": 0.11180630326271057, "learning_rate": 1.8997966468755785e-05, "loss": 0.0169, "step": 1950 }, { "epoch": 0.08516462626416242, "grad_norm": 0.100865937769413, "learning_rate": 1.898564962095657e-05, "loss": 0.0129, "step": 1960 }, { "epoch": 0.08559913966346937, "grad_norm": 0.11768165975809097, "learning_rate": 1.8973261579267486e-05, "loss": 0.0154, "step": 1970 }, { "epoch": 0.08603365306277633, "grad_norm": 0.17192742228507996, "learning_rate": 1.8960802441839794e-05, "loss": 0.0149, "step": 1980 }, { "epoch": 0.08646816646208327, "grad_norm": 0.21123839914798737, "learning_rate": 1.894827230738806e-05, "loss": 0.0163, "step": 1990 }, { "epoch": 0.08690267986139022, "grad_norm": 0.10695893317461014, "learning_rate": 1.8935671275189356e-05, "loss": 0.0163, "step": 2000 }, { "epoch": 0.08733719326069718, "grad_norm": 0.14011739194393158, "learning_rate": 1.8922999445082484e-05, "loss": 0.0148, "step": 2010 }, { "epoch": 0.08777170666000413, "grad_norm": 0.13201646506786346, "learning_rate": 1.8910256917467183e-05, "loss": 0.0149, "step": 2020 }, { "epoch": 0.08820622005931109, "grad_norm": 0.18131311237812042, "learning_rate": 1.8897443793303334e-05, "loss": 0.0162, "step": 2030 }, { "epoch": 0.08864073345861803, "grad_norm": 0.1718572974205017, "learning_rate": 1.888456017411016e-05, "loss": 0.0168, "step": 2040 }, { "epoch": 0.08907524685792498, "grad_norm": 0.13650089502334595, "learning_rate": 1.8871606161965416e-05, "loss": 0.0176, "step": 2050 }, { "epoch": 0.08950976025723194, "grad_norm": 0.12102147191762924, "learning_rate": 1.8858581859504587e-05, "loss": 0.0169, "step": 2060 }, { "epoch": 0.08994427365653888, "grad_norm": 0.11048076301813126, "learning_rate": 1.8845487369920076e-05, "loss": 0.0119, "step": 2070 }, { "epoch": 0.09037878705584583, "grad_norm": 0.11731571704149246, "learning_rate": 1.883232279696038e-05, "loss": 0.0155, "step": 2080 }, { "epoch": 0.09081330045515279, "grad_norm": 0.13679373264312744, "learning_rate": 1.8819088244929275e-05, "loss": 0.015, "step": 2090 }, { "epoch": 0.09124781385445974, "grad_norm": 0.1238800659775734, "learning_rate": 1.8805783818684976e-05, "loss": 0.0151, "step": 2100 }, { "epoch": 0.09168232725376668, "grad_norm": 0.15139545500278473, "learning_rate": 1.8792409623639325e-05, "loss": 0.0196, "step": 2110 }, { "epoch": 0.09211684065307364, "grad_norm": 0.19011171162128448, "learning_rate": 1.8778965765756946e-05, "loss": 0.0199, "step": 2120 }, { "epoch": 0.09255135405238059, "grad_norm": 0.1415945589542389, "learning_rate": 1.8765452351554408e-05, "loss": 0.0165, "step": 2130 }, { "epoch": 0.09298586745168755, "grad_norm": 0.15646842122077942, "learning_rate": 1.875186948809937e-05, "loss": 0.0132, "step": 2140 }, { "epoch": 0.0934203808509945, "grad_norm": 0.09835656732320786, "learning_rate": 1.8738217283009747e-05, "loss": 0.0181, "step": 2150 }, { "epoch": 0.09385489425030144, "grad_norm": 0.12268593162298203, "learning_rate": 1.872449584445286e-05, "loss": 0.0137, "step": 2160 }, { "epoch": 0.0942894076496084, "grad_norm": 0.14363738894462585, "learning_rate": 1.8710705281144557e-05, "loss": 0.0165, "step": 2170 }, { "epoch": 0.09472392104891535, "grad_norm": 0.13551345467567444, "learning_rate": 1.869684570234838e-05, "loss": 0.0151, "step": 2180 }, { "epoch": 0.09515843444822229, "grad_norm": 0.16080650687217712, "learning_rate": 1.8682917217874675e-05, "loss": 0.0204, "step": 2190 }, { "epoch": 0.09559294784752925, "grad_norm": 0.20290249586105347, "learning_rate": 1.8668919938079738e-05, "loss": 0.0196, "step": 2200 }, { "epoch": 0.0960274612468362, "grad_norm": 0.1393677294254303, "learning_rate": 1.8654853973864928e-05, "loss": 0.0184, "step": 2210 }, { "epoch": 0.09646197464614314, "grad_norm": 0.13181516528129578, "learning_rate": 1.8640719436675806e-05, "loss": 0.0156, "step": 2220 }, { "epoch": 0.0968964880454501, "grad_norm": 0.16097570955753326, "learning_rate": 1.862651643850123e-05, "loss": 0.0166, "step": 2230 }, { "epoch": 0.09733100144475705, "grad_norm": 0.1397980898618698, "learning_rate": 1.8612245091872484e-05, "loss": 0.0135, "step": 2240 }, { "epoch": 0.097765514844064, "grad_norm": 0.10855773836374283, "learning_rate": 1.8597905509862386e-05, "loss": 0.0182, "step": 2250 }, { "epoch": 0.09820002824337096, "grad_norm": 0.1343299150466919, "learning_rate": 1.8583497806084377e-05, "loss": 0.0145, "step": 2260 }, { "epoch": 0.0986345416426779, "grad_norm": 0.09601090848445892, "learning_rate": 1.856902209469164e-05, "loss": 0.0217, "step": 2270 }, { "epoch": 0.09906905504198486, "grad_norm": 0.16081653535366058, "learning_rate": 1.8554478490376186e-05, "loss": 0.0187, "step": 2280 }, { "epoch": 0.09950356844129181, "grad_norm": 0.12331274151802063, "learning_rate": 1.8539867108367937e-05, "loss": 0.0136, "step": 2290 }, { "epoch": 0.09993808184059876, "grad_norm": 0.11327315866947174, "learning_rate": 1.8525188064433827e-05, "loss": 0.018, "step": 2300 }, { "epoch": 0.10037259523990572, "grad_norm": 0.18709249794483185, "learning_rate": 1.8510441474876893e-05, "loss": 0.0173, "step": 2310 }, { "epoch": 0.10080710863921266, "grad_norm": 0.1772821843624115, "learning_rate": 1.8495627456535316e-05, "loss": 0.0161, "step": 2320 }, { "epoch": 0.10124162203851961, "grad_norm": 0.15738612413406372, "learning_rate": 1.8480746126781544e-05, "loss": 0.0158, "step": 2330 }, { "epoch": 0.10167613543782657, "grad_norm": 0.10788944363594055, "learning_rate": 1.846579760352132e-05, "loss": 0.0155, "step": 2340 }, { "epoch": 0.10211064883713351, "grad_norm": 0.11907012015581131, "learning_rate": 1.845078200519277e-05, "loss": 0.0226, "step": 2350 }, { "epoch": 0.10254516223644046, "grad_norm": 0.1725740134716034, "learning_rate": 1.8435699450765467e-05, "loss": 0.0163, "step": 2360 }, { "epoch": 0.10297967563574742, "grad_norm": 0.12773928046226501, "learning_rate": 1.8420550059739476e-05, "loss": 0.0158, "step": 2370 }, { "epoch": 0.10341418903505437, "grad_norm": 0.14268366992473602, "learning_rate": 1.840533395214441e-05, "loss": 0.0147, "step": 2380 }, { "epoch": 0.10384870243436133, "grad_norm": 0.13451974093914032, "learning_rate": 1.839005124853849e-05, "loss": 0.019, "step": 2390 }, { "epoch": 0.10428321583366827, "grad_norm": 0.13277988135814667, "learning_rate": 1.837470207000757e-05, "loss": 0.0151, "step": 2400 }, { "epoch": 0.10471772923297522, "grad_norm": 0.12170128524303436, "learning_rate": 1.83592865381642e-05, "loss": 0.0141, "step": 2410 }, { "epoch": 0.10515224263228218, "grad_norm": 0.1353437900543213, "learning_rate": 1.8343804775146646e-05, "loss": 0.0149, "step": 2420 }, { "epoch": 0.10558675603158912, "grad_norm": 0.129754900932312, "learning_rate": 1.8328256903617928e-05, "loss": 0.0173, "step": 2430 }, { "epoch": 0.10602126943089607, "grad_norm": 0.11942622065544128, "learning_rate": 1.8312643046764854e-05, "loss": 0.018, "step": 2440 }, { "epoch": 0.10645578283020303, "grad_norm": 0.1126435250043869, "learning_rate": 1.829696332829703e-05, "loss": 0.0158, "step": 2450 }, { "epoch": 0.10689029622950998, "grad_norm": 0.178188294172287, "learning_rate": 1.8281217872445894e-05, "loss": 0.0139, "step": 2460 }, { "epoch": 0.10732480962881692, "grad_norm": 0.1451696902513504, "learning_rate": 1.8265406803963723e-05, "loss": 0.0175, "step": 2470 }, { "epoch": 0.10775932302812388, "grad_norm": 0.1306012123823166, "learning_rate": 1.8249530248122643e-05, "loss": 0.0155, "step": 2480 }, { "epoch": 0.10819383642743083, "grad_norm": 0.09586621820926666, "learning_rate": 1.8233588330713648e-05, "loss": 0.0182, "step": 2490 }, { "epoch": 0.10862834982673779, "grad_norm": 0.13471537828445435, "learning_rate": 1.8217581178045588e-05, "loss": 0.0119, "step": 2500 }, { "epoch": 0.10906286322604473, "grad_norm": 0.1438797116279602, "learning_rate": 1.8201508916944187e-05, "loss": 0.0177, "step": 2510 }, { "epoch": 0.10949737662535168, "grad_norm": 0.16950510442256927, "learning_rate": 1.818537167475102e-05, "loss": 0.0167, "step": 2520 }, { "epoch": 0.10993189002465864, "grad_norm": 0.1016625165939331, "learning_rate": 1.816916957932251e-05, "loss": 0.0135, "step": 2530 }, { "epoch": 0.11036640342396559, "grad_norm": 0.12382980436086655, "learning_rate": 1.815290275902892e-05, "loss": 0.0173, "step": 2540 }, { "epoch": 0.11080091682327253, "grad_norm": 0.10736893862485886, "learning_rate": 1.813657134275333e-05, "loss": 0.015, "step": 2550 }, { "epoch": 0.11123543022257949, "grad_norm": 0.09122450649738312, "learning_rate": 1.812017545989063e-05, "loss": 0.015, "step": 2560 }, { "epoch": 0.11166994362188644, "grad_norm": 0.1347617357969284, "learning_rate": 1.810371524034646e-05, "loss": 0.0138, "step": 2570 }, { "epoch": 0.11210445702119338, "grad_norm": 0.13458900153636932, "learning_rate": 1.808719081453622e-05, "loss": 0.0148, "step": 2580 }, { "epoch": 0.11253897042050034, "grad_norm": 0.10396227240562439, "learning_rate": 1.8070602313384018e-05, "loss": 0.014, "step": 2590 }, { "epoch": 0.11297348381980729, "grad_norm": 0.115020751953125, "learning_rate": 1.8053949868321637e-05, "loss": 0.0141, "step": 2600 }, { "epoch": 0.11340799721911425, "grad_norm": 0.1418481320142746, "learning_rate": 1.803723361128748e-05, "loss": 0.0158, "step": 2610 }, { "epoch": 0.1138425106184212, "grad_norm": 0.11479926854372025, "learning_rate": 1.8020453674725557e-05, "loss": 0.016, "step": 2620 }, { "epoch": 0.11427702401772814, "grad_norm": 0.13422134518623352, "learning_rate": 1.8003610191584394e-05, "loss": 0.0122, "step": 2630 }, { "epoch": 0.1147115374170351, "grad_norm": 0.10288428515195847, "learning_rate": 1.7986703295316018e-05, "loss": 0.0152, "step": 2640 }, { "epoch": 0.11514605081634205, "grad_norm": 0.14369966089725494, "learning_rate": 1.7969733119874866e-05, "loss": 0.0176, "step": 2650 }, { "epoch": 0.115580564215649, "grad_norm": 0.0837118998169899, "learning_rate": 1.795269979971675e-05, "loss": 0.0145, "step": 2660 }, { "epoch": 0.11601507761495596, "grad_norm": 0.1787196844816208, "learning_rate": 1.7935603469797784e-05, "loss": 0.0145, "step": 2670 }, { "epoch": 0.1164495910142629, "grad_norm": 0.07823372632265091, "learning_rate": 1.7918444265573308e-05, "loss": 0.0169, "step": 2680 }, { "epoch": 0.11688410441356985, "grad_norm": 0.18035945296287537, "learning_rate": 1.7901222322996815e-05, "loss": 0.0161, "step": 2690 }, { "epoch": 0.11731861781287681, "grad_norm": 0.11666823923587799, "learning_rate": 1.788393777851889e-05, "loss": 0.0161, "step": 2700 }, { "epoch": 0.11775313121218375, "grad_norm": 0.11370805650949478, "learning_rate": 1.7866590769086104e-05, "loss": 0.0171, "step": 2710 }, { "epoch": 0.11818764461149071, "grad_norm": 0.11509191244840622, "learning_rate": 1.7849181432139946e-05, "loss": 0.016, "step": 2720 }, { "epoch": 0.11862215801079766, "grad_norm": 0.09989365190267563, "learning_rate": 1.7831709905615744e-05, "loss": 0.0162, "step": 2730 }, { "epoch": 0.1190566714101046, "grad_norm": 0.14785495400428772, "learning_rate": 1.781417632794153e-05, "loss": 0.0176, "step": 2740 }, { "epoch": 0.11949118480941157, "grad_norm": 0.13436812162399292, "learning_rate": 1.7796580838037e-05, "loss": 0.0132, "step": 2750 }, { "epoch": 0.11992569820871851, "grad_norm": 0.1204938143491745, "learning_rate": 1.777892357531236e-05, "loss": 0.0146, "step": 2760 }, { "epoch": 0.12036021160802546, "grad_norm": 0.10923068225383759, "learning_rate": 1.776120467966727e-05, "loss": 0.0155, "step": 2770 }, { "epoch": 0.12079472500733242, "grad_norm": 0.10814539343118668, "learning_rate": 1.774342429148969e-05, "loss": 0.0153, "step": 2780 }, { "epoch": 0.12122923840663936, "grad_norm": 0.09558166563510895, "learning_rate": 1.7725582551654804e-05, "loss": 0.0133, "step": 2790 }, { "epoch": 0.12166375180594631, "grad_norm": 0.10233845561742783, "learning_rate": 1.7707679601523882e-05, "loss": 0.0137, "step": 2800 }, { "epoch": 0.12209826520525327, "grad_norm": 0.14606024324893951, "learning_rate": 1.7689715582943167e-05, "loss": 0.0171, "step": 2810 }, { "epoch": 0.12253277860456022, "grad_norm": 0.14307714998722076, "learning_rate": 1.7671690638242763e-05, "loss": 0.0161, "step": 2820 }, { "epoch": 0.12296729200386716, "grad_norm": 0.1591099351644516, "learning_rate": 1.7653604910235474e-05, "loss": 0.0164, "step": 2830 }, { "epoch": 0.12340180540317412, "grad_norm": 0.14573435485363007, "learning_rate": 1.763545854221571e-05, "loss": 0.0129, "step": 2840 }, { "epoch": 0.12383631880248107, "grad_norm": 0.10369982570409775, "learning_rate": 1.761725167795834e-05, "loss": 0.016, "step": 2850 }, { "epoch": 0.12427083220178803, "grad_norm": 0.16531281173229218, "learning_rate": 1.7598984461717532e-05, "loss": 0.0177, "step": 2860 }, { "epoch": 0.12470534560109497, "grad_norm": 0.14084115624427795, "learning_rate": 1.758065703822564e-05, "loss": 0.0134, "step": 2870 }, { "epoch": 0.12513985900040192, "grad_norm": 0.14803741872310638, "learning_rate": 1.756226955269204e-05, "loss": 0.0136, "step": 2880 }, { "epoch": 0.12557437239970887, "grad_norm": 0.09318149834871292, "learning_rate": 1.7543822150801975e-05, "loss": 0.0167, "step": 2890 }, { "epoch": 0.12600888579901584, "grad_norm": 0.13491356372833252, "learning_rate": 1.7525314978715425e-05, "loss": 0.015, "step": 2900 }, { "epoch": 0.1264433991983228, "grad_norm": 0.1445305049419403, "learning_rate": 1.7506748183065925e-05, "loss": 0.0168, "step": 2910 }, { "epoch": 0.12687791259762973, "grad_norm": 0.1350657045841217, "learning_rate": 1.7488121910959405e-05, "loss": 0.0146, "step": 2920 }, { "epoch": 0.12731242599693668, "grad_norm": 0.13050585985183716, "learning_rate": 1.7469436309973046e-05, "loss": 0.0171, "step": 2930 }, { "epoch": 0.12774693939624363, "grad_norm": 0.12927724421024323, "learning_rate": 1.7450691528154087e-05, "loss": 0.0176, "step": 2940 }, { "epoch": 0.12818145279555057, "grad_norm": 0.10419710725545883, "learning_rate": 1.7431887714018653e-05, "loss": 0.0203, "step": 2950 }, { "epoch": 0.12861596619485755, "grad_norm": 0.0957477018237114, "learning_rate": 1.74130250165506e-05, "loss": 0.0134, "step": 2960 }, { "epoch": 0.1290504795941645, "grad_norm": 0.14931203424930573, "learning_rate": 1.7394103585200316e-05, "loss": 0.016, "step": 2970 }, { "epoch": 0.12948499299347144, "grad_norm": 0.14293114840984344, "learning_rate": 1.737512356988353e-05, "loss": 0.0176, "step": 2980 }, { "epoch": 0.12991950639277838, "grad_norm": 0.08967836201190948, "learning_rate": 1.7356085120980154e-05, "loss": 0.0131, "step": 2990 }, { "epoch": 0.13035401979208533, "grad_norm": 0.09333787113428116, "learning_rate": 1.7336988389333064e-05, "loss": 0.0158, "step": 3000 }, { "epoch": 0.1307885331913923, "grad_norm": 0.10451024025678635, "learning_rate": 1.731783352624691e-05, "loss": 0.0151, "step": 3010 }, { "epoch": 0.13122304659069925, "grad_norm": 0.10734964907169342, "learning_rate": 1.7298620683486927e-05, "loss": 0.015, "step": 3020 }, { "epoch": 0.1316575599900062, "grad_norm": 0.11963914334774017, "learning_rate": 1.7279350013277725e-05, "loss": 0.0149, "step": 3030 }, { "epoch": 0.13209207338931314, "grad_norm": 0.11979930102825165, "learning_rate": 1.726002166830209e-05, "loss": 0.0125, "step": 3040 }, { "epoch": 0.1325265867886201, "grad_norm": 0.12624527513980865, "learning_rate": 1.7240635801699755e-05, "loss": 0.0162, "step": 3050 }, { "epoch": 0.13296110018792703, "grad_norm": 0.1427086591720581, "learning_rate": 1.7221192567066215e-05, "loss": 0.0131, "step": 3060 }, { "epoch": 0.133395613587234, "grad_norm": 0.12968061864376068, "learning_rate": 1.720169211845149e-05, "loss": 0.0164, "step": 3070 }, { "epoch": 0.13383012698654095, "grad_norm": 0.09234379976987839, "learning_rate": 1.718213461035891e-05, "loss": 0.0151, "step": 3080 }, { "epoch": 0.1342646403858479, "grad_norm": 0.09016980230808258, "learning_rate": 1.716252019774389e-05, "loss": 0.0141, "step": 3090 }, { "epoch": 0.13469915378515485, "grad_norm": 0.12514056265354156, "learning_rate": 1.7142849036012706e-05, "loss": 0.0139, "step": 3100 }, { "epoch": 0.1351336671844618, "grad_norm": 0.11477980017662048, "learning_rate": 1.712312128102126e-05, "loss": 0.0133, "step": 3110 }, { "epoch": 0.13556818058376874, "grad_norm": 0.11837895959615707, "learning_rate": 1.710333708907384e-05, "loss": 0.0157, "step": 3120 }, { "epoch": 0.1360026939830757, "grad_norm": 0.08836041390895844, "learning_rate": 1.70834966169219e-05, "loss": 0.0152, "step": 3130 }, { "epoch": 0.13643720738238266, "grad_norm": 0.10545707494020462, "learning_rate": 1.7063600021762798e-05, "loss": 0.014, "step": 3140 }, { "epoch": 0.1368717207816896, "grad_norm": 0.14428788423538208, "learning_rate": 1.7043647461238557e-05, "loss": 0.0159, "step": 3150 }, { "epoch": 0.13730623418099655, "grad_norm": 0.1318032443523407, "learning_rate": 1.702363909343462e-05, "loss": 0.0137, "step": 3160 }, { "epoch": 0.1377407475803035, "grad_norm": 0.09827178716659546, "learning_rate": 1.7003575076878593e-05, "loss": 0.0151, "step": 3170 }, { "epoch": 0.13817526097961047, "grad_norm": 0.10411065816879272, "learning_rate": 1.6983455570538996e-05, "loss": 0.0137, "step": 3180 }, { "epoch": 0.13860977437891742, "grad_norm": 0.09401621669530869, "learning_rate": 1.696328073382399e-05, "loss": 0.014, "step": 3190 }, { "epoch": 0.13904428777822436, "grad_norm": 0.13308677077293396, "learning_rate": 1.694305072658013e-05, "loss": 0.0179, "step": 3200 }, { "epoch": 0.1394788011775313, "grad_norm": 0.13722048699855804, "learning_rate": 1.6922765709091085e-05, "loss": 0.0131, "step": 3210 }, { "epoch": 0.13991331457683825, "grad_norm": 0.09373093396425247, "learning_rate": 1.6902425842076372e-05, "loss": 0.0129, "step": 3220 }, { "epoch": 0.1403478279761452, "grad_norm": 0.12741738557815552, "learning_rate": 1.6882031286690095e-05, "loss": 0.0138, "step": 3230 }, { "epoch": 0.14078234137545217, "grad_norm": 0.08921542763710022, "learning_rate": 1.6861582204519648e-05, "loss": 0.0139, "step": 3240 }, { "epoch": 0.14121685477475912, "grad_norm": 0.1536066234111786, "learning_rate": 1.6841078757584445e-05, "loss": 0.0143, "step": 3250 }, { "epoch": 0.14165136817406607, "grad_norm": 0.12931448221206665, "learning_rate": 1.6820521108334643e-05, "loss": 0.0158, "step": 3260 }, { "epoch": 0.142085881573373, "grad_norm": 0.14044098556041718, "learning_rate": 1.6799909419649835e-05, "loss": 0.0134, "step": 3270 }, { "epoch": 0.14252039497267996, "grad_norm": 0.10403291136026382, "learning_rate": 1.6779243854837784e-05, "loss": 0.0136, "step": 3280 }, { "epoch": 0.14295490837198693, "grad_norm": 0.13179700076580048, "learning_rate": 1.675852457763311e-05, "loss": 0.0144, "step": 3290 }, { "epoch": 0.14338942177129388, "grad_norm": 0.11001206189393997, "learning_rate": 1.6737751752196005e-05, "loss": 0.014, "step": 3300 }, { "epoch": 0.14382393517060083, "grad_norm": 0.1299135833978653, "learning_rate": 1.6716925543110916e-05, "loss": 0.0135, "step": 3310 }, { "epoch": 0.14425844856990777, "grad_norm": 0.09712474048137665, "learning_rate": 1.669604611538527e-05, "loss": 0.0141, "step": 3320 }, { "epoch": 0.14469296196921472, "grad_norm": 0.08611998707056046, "learning_rate": 1.6675113634448136e-05, "loss": 0.0133, "step": 3330 }, { "epoch": 0.14512747536852166, "grad_norm": 0.14184758067131042, "learning_rate": 1.6654128266148926e-05, "loss": 0.0176, "step": 3340 }, { "epoch": 0.14556198876782864, "grad_norm": 0.10312198847532272, "learning_rate": 1.6633090176756092e-05, "loss": 0.0131, "step": 3350 }, { "epoch": 0.14599650216713558, "grad_norm": 0.07793601602315903, "learning_rate": 1.6611999532955783e-05, "loss": 0.0148, "step": 3360 }, { "epoch": 0.14643101556644253, "grad_norm": 0.058829378336668015, "learning_rate": 1.6590856501850562e-05, "loss": 0.0123, "step": 3370 }, { "epoch": 0.14686552896574948, "grad_norm": 0.08373578637838364, "learning_rate": 1.6569661250958042e-05, "loss": 0.012, "step": 3380 }, { "epoch": 0.14730004236505642, "grad_norm": 0.12803837656974792, "learning_rate": 1.6548413948209584e-05, "loss": 0.0172, "step": 3390 }, { "epoch": 0.1477345557643634, "grad_norm": 0.11021724343299866, "learning_rate": 1.6527114761948957e-05, "loss": 0.0172, "step": 3400 }, { "epoch": 0.14816906916367034, "grad_norm": 0.07495907694101334, "learning_rate": 1.6505763860931002e-05, "loss": 0.0137, "step": 3410 }, { "epoch": 0.1486035825629773, "grad_norm": 0.13075798749923706, "learning_rate": 1.6484361414320312e-05, "loss": 0.0161, "step": 3420 }, { "epoch": 0.14903809596228423, "grad_norm": 0.1488410085439682, "learning_rate": 1.6462907591689875e-05, "loss": 0.0154, "step": 3430 }, { "epoch": 0.14947260936159118, "grad_norm": 0.13530975580215454, "learning_rate": 1.644140256301972e-05, "loss": 0.0141, "step": 3440 }, { "epoch": 0.14990712276089813, "grad_norm": 0.12314094603061676, "learning_rate": 1.6419846498695605e-05, "loss": 0.015, "step": 3450 }, { "epoch": 0.1503416361602051, "grad_norm": 0.13297641277313232, "learning_rate": 1.639823956950764e-05, "loss": 0.0181, "step": 3460 }, { "epoch": 0.15077614955951205, "grad_norm": 0.18126749992370605, "learning_rate": 1.6376581946648928e-05, "loss": 0.0147, "step": 3470 }, { "epoch": 0.151210662958819, "grad_norm": 0.09250223636627197, "learning_rate": 1.6354873801714236e-05, "loss": 0.0121, "step": 3480 }, { "epoch": 0.15164517635812594, "grad_norm": 0.1678650975227356, "learning_rate": 1.6333115306698625e-05, "loss": 0.0116, "step": 3490 }, { "epoch": 0.15207968975743288, "grad_norm": 0.09464786946773529, "learning_rate": 1.6311306633996064e-05, "loss": 0.0118, "step": 3500 }, { "epoch": 0.15251420315673986, "grad_norm": 0.14440122246742249, "learning_rate": 1.62894479563981e-05, "loss": 0.0145, "step": 3510 }, { "epoch": 0.1529487165560468, "grad_norm": 0.08724182844161987, "learning_rate": 1.6267539447092463e-05, "loss": 0.0121, "step": 3520 }, { "epoch": 0.15338322995535375, "grad_norm": 0.11894796788692474, "learning_rate": 1.6245581279661708e-05, "loss": 0.0133, "step": 3530 }, { "epoch": 0.1538177433546607, "grad_norm": 0.08838991075754166, "learning_rate": 1.6223573628081826e-05, "loss": 0.0125, "step": 3540 }, { "epoch": 0.15425225675396764, "grad_norm": 0.27632540464401245, "learning_rate": 1.620151666672089e-05, "loss": 0.0124, "step": 3550 }, { "epoch": 0.1546867701532746, "grad_norm": 0.11539135873317719, "learning_rate": 1.617941057033764e-05, "loss": 0.017, "step": 3560 }, { "epoch": 0.15512128355258156, "grad_norm": 0.14245297014713287, "learning_rate": 1.6157255514080134e-05, "loss": 0.0143, "step": 3570 }, { "epoch": 0.1555557969518885, "grad_norm": 0.0976201668381691, "learning_rate": 1.6135051673484323e-05, "loss": 0.0151, "step": 3580 }, { "epoch": 0.15599031035119545, "grad_norm": 0.11722775548696518, "learning_rate": 1.6112799224472686e-05, "loss": 0.0141, "step": 3590 }, { "epoch": 0.1564248237505024, "grad_norm": 0.05707870051264763, "learning_rate": 1.6090498343352844e-05, "loss": 0.0131, "step": 3600 }, { "epoch": 0.15685933714980935, "grad_norm": 0.08693547546863556, "learning_rate": 1.606814920681613e-05, "loss": 0.0135, "step": 3610 }, { "epoch": 0.15729385054911632, "grad_norm": 0.0932309627532959, "learning_rate": 1.6045751991936213e-05, "loss": 0.0126, "step": 3620 }, { "epoch": 0.15772836394842327, "grad_norm": 0.16721123456954956, "learning_rate": 1.60233068761677e-05, "loss": 0.0148, "step": 3630 }, { "epoch": 0.1581628773477302, "grad_norm": 0.11662253737449646, "learning_rate": 1.60008140373447e-05, "loss": 0.0111, "step": 3640 }, { "epoch": 0.15859739074703716, "grad_norm": 0.10414501279592514, "learning_rate": 1.5978273653679458e-05, "loss": 0.0124, "step": 3650 }, { "epoch": 0.1590319041463441, "grad_norm": 0.2175975888967514, "learning_rate": 1.5955685903760905e-05, "loss": 0.0134, "step": 3660 }, { "epoch": 0.15946641754565105, "grad_norm": 0.1993752419948578, "learning_rate": 1.593305096655326e-05, "loss": 0.0138, "step": 3670 }, { "epoch": 0.15990093094495803, "grad_norm": 0.10459215939044952, "learning_rate": 1.591036902139461e-05, "loss": 0.0147, "step": 3680 }, { "epoch": 0.16033544434426497, "grad_norm": 0.0869443491101265, "learning_rate": 1.5887640247995495e-05, "loss": 0.0164, "step": 3690 }, { "epoch": 0.16076995774357192, "grad_norm": 0.09978344291448593, "learning_rate": 1.5864864826437473e-05, "loss": 0.0112, "step": 3700 }, { "epoch": 0.16120447114287886, "grad_norm": 0.10143698751926422, "learning_rate": 1.5842042937171696e-05, "loss": 0.013, "step": 3710 }, { "epoch": 0.1616389845421858, "grad_norm": 0.13116247951984406, "learning_rate": 1.5819174761017485e-05, "loss": 0.0157, "step": 3720 }, { "epoch": 0.16207349794149278, "grad_norm": 0.08238431811332703, "learning_rate": 1.57962604791609e-05, "loss": 0.0132, "step": 3730 }, { "epoch": 0.16250801134079973, "grad_norm": 0.13452813029289246, "learning_rate": 1.5773300273153296e-05, "loss": 0.0152, "step": 3740 }, { "epoch": 0.16294252474010668, "grad_norm": 0.18914894759655, "learning_rate": 1.5750294324909886e-05, "loss": 0.0141, "step": 3750 }, { "epoch": 0.16337703813941362, "grad_norm": 0.08433225005865097, "learning_rate": 1.57272428167083e-05, "loss": 0.0148, "step": 3760 }, { "epoch": 0.16381155153872057, "grad_norm": 0.1387450397014618, "learning_rate": 1.570414593118715e-05, "loss": 0.0124, "step": 3770 }, { "epoch": 0.16424606493802751, "grad_norm": 0.11928314715623856, "learning_rate": 1.5681003851344568e-05, "loss": 0.013, "step": 3780 }, { "epoch": 0.1646805783373345, "grad_norm": 0.09446173161268234, "learning_rate": 1.5657816760536767e-05, "loss": 0.0115, "step": 3790 }, { "epoch": 0.16511509173664143, "grad_norm": 0.14342348277568817, "learning_rate": 1.5634584842476588e-05, "loss": 0.0146, "step": 3800 }, { "epoch": 0.16554960513594838, "grad_norm": 0.13533058762550354, "learning_rate": 1.5611308281232038e-05, "loss": 0.0164, "step": 3810 }, { "epoch": 0.16598411853525533, "grad_norm": 0.10803470760583878, "learning_rate": 1.5587987261224827e-05, "loss": 0.0129, "step": 3820 }, { "epoch": 0.16641863193456227, "grad_norm": 0.16718289256095886, "learning_rate": 1.556462196722893e-05, "loss": 0.0163, "step": 3830 }, { "epoch": 0.16685314533386925, "grad_norm": 0.12127949297428131, "learning_rate": 1.55412125843691e-05, "loss": 0.012, "step": 3840 }, { "epoch": 0.1672876587331762, "grad_norm": 0.12097957730293274, "learning_rate": 1.5517759298119406e-05, "loss": 0.014, "step": 3850 }, { "epoch": 0.16772217213248314, "grad_norm": 0.10893377661705017, "learning_rate": 1.5494262294301768e-05, "loss": 0.0141, "step": 3860 }, { "epoch": 0.16815668553179008, "grad_norm": 0.12233606725931168, "learning_rate": 1.547072175908449e-05, "loss": 0.0125, "step": 3870 }, { "epoch": 0.16859119893109703, "grad_norm": 0.10938113927841187, "learning_rate": 1.5447137878980768e-05, "loss": 0.01, "step": 3880 }, { "epoch": 0.16902571233040398, "grad_norm": 0.1092139258980751, "learning_rate": 1.5423510840847228e-05, "loss": 0.0166, "step": 3890 }, { "epoch": 0.16946022572971095, "grad_norm": 0.1419563740491867, "learning_rate": 1.5399840831882442e-05, "loss": 0.0183, "step": 3900 }, { "epoch": 0.1698947391290179, "grad_norm": 0.09912795573472977, "learning_rate": 1.5376128039625438e-05, "loss": 0.0138, "step": 3910 }, { "epoch": 0.17032925252832484, "grad_norm": 0.11845508217811584, "learning_rate": 1.535237265195422e-05, "loss": 0.0127, "step": 3920 }, { "epoch": 0.1707637659276318, "grad_norm": 0.11280318349599838, "learning_rate": 1.5328574857084277e-05, "loss": 0.0151, "step": 3930 }, { "epoch": 0.17119827932693873, "grad_norm": 0.1355055868625641, "learning_rate": 1.53047348435671e-05, "loss": 0.0104, "step": 3940 }, { "epoch": 0.1716327927262457, "grad_norm": 0.11777744442224503, "learning_rate": 1.5280852800288672e-05, "loss": 0.0112, "step": 3950 }, { "epoch": 0.17206730612555265, "grad_norm": 0.15552428364753723, "learning_rate": 1.5256928916467986e-05, "loss": 0.01, "step": 3960 }, { "epoch": 0.1725018195248596, "grad_norm": 0.08679380267858505, "learning_rate": 1.5232963381655536e-05, "loss": 0.0128, "step": 3970 }, { "epoch": 0.17293633292416655, "grad_norm": 0.0947830006480217, "learning_rate": 1.5208956385731824e-05, "loss": 0.0099, "step": 3980 }, { "epoch": 0.1733708463234735, "grad_norm": 0.09130040556192398, "learning_rate": 1.5184908118905853e-05, "loss": 0.0151, "step": 3990 }, { "epoch": 0.17380535972278044, "grad_norm": 0.08566273748874664, "learning_rate": 1.5160818771713609e-05, "loss": 0.0121, "step": 4000 }, { "epoch": 0.1742398731220874, "grad_norm": 0.08710242807865143, "learning_rate": 1.5136688535016571e-05, "loss": 0.0146, "step": 4010 }, { "epoch": 0.17467438652139436, "grad_norm": 0.09466756135225296, "learning_rate": 1.5112517600000179e-05, "loss": 0.0164, "step": 4020 }, { "epoch": 0.1751088999207013, "grad_norm": 0.08716245740652084, "learning_rate": 1.5088306158172334e-05, "loss": 0.0134, "step": 4030 }, { "epoch": 0.17554341332000825, "grad_norm": 0.14040999114513397, "learning_rate": 1.5064054401361872e-05, "loss": 0.012, "step": 4040 }, { "epoch": 0.1759779267193152, "grad_norm": 0.11178477108478546, "learning_rate": 1.5039762521717054e-05, "loss": 0.0116, "step": 4050 }, { "epoch": 0.17641244011862217, "grad_norm": 0.06869493424892426, "learning_rate": 1.5015430711704027e-05, "loss": 0.0121, "step": 4060 }, { "epoch": 0.17684695351792912, "grad_norm": 0.11685600876808167, "learning_rate": 1.4991059164105318e-05, "loss": 0.0153, "step": 4070 }, { "epoch": 0.17728146691723606, "grad_norm": 0.12035491317510605, "learning_rate": 1.496664807201829e-05, "loss": 0.0129, "step": 4080 }, { "epoch": 0.177715980316543, "grad_norm": 0.10753650963306427, "learning_rate": 1.494219762885362e-05, "loss": 0.0121, "step": 4090 }, { "epoch": 0.17815049371584996, "grad_norm": 0.08452361077070236, "learning_rate": 1.4917708028333779e-05, "loss": 0.0157, "step": 4100 }, { "epoch": 0.1785850071151569, "grad_norm": 0.08389674127101898, "learning_rate": 1.4893179464491461e-05, "loss": 0.013, "step": 4110 }, { "epoch": 0.17901952051446388, "grad_norm": 0.15167665481567383, "learning_rate": 1.4868612131668095e-05, "loss": 0.0123, "step": 4120 }, { "epoch": 0.17945403391377082, "grad_norm": 0.15268860757350922, "learning_rate": 1.4844006224512254e-05, "loss": 0.0129, "step": 4130 }, { "epoch": 0.17988854731307777, "grad_norm": 0.08754164725542068, "learning_rate": 1.4819361937978162e-05, "loss": 0.0121, "step": 4140 }, { "epoch": 0.18032306071238471, "grad_norm": 0.11443080008029938, "learning_rate": 1.4794679467324106e-05, "loss": 0.0114, "step": 4150 }, { "epoch": 0.18075757411169166, "grad_norm": 0.09617436677217484, "learning_rate": 1.4769959008110922e-05, "loss": 0.012, "step": 4160 }, { "epoch": 0.18119208751099863, "grad_norm": 0.10229325294494629, "learning_rate": 1.4745200756200418e-05, "loss": 0.0104, "step": 4170 }, { "epoch": 0.18162660091030558, "grad_norm": 0.09325513988733292, "learning_rate": 1.4720404907753849e-05, "loss": 0.0102, "step": 4180 }, { "epoch": 0.18206111430961253, "grad_norm": 0.11858367919921875, "learning_rate": 1.4695571659230343e-05, "loss": 0.0138, "step": 4190 }, { "epoch": 0.18249562770891947, "grad_norm": 0.09570284187793732, "learning_rate": 1.4670701207385354e-05, "loss": 0.0143, "step": 4200 }, { "epoch": 0.18293014110822642, "grad_norm": 0.07322904467582703, "learning_rate": 1.46457937492691e-05, "loss": 0.0117, "step": 4210 }, { "epoch": 0.18336465450753336, "grad_norm": 0.16292162239551544, "learning_rate": 1.4620849482224996e-05, "loss": 0.0135, "step": 4220 }, { "epoch": 0.18379916790684034, "grad_norm": 0.13735614717006683, "learning_rate": 1.459586860388811e-05, "loss": 0.0138, "step": 4230 }, { "epoch": 0.18423368130614728, "grad_norm": 0.09884654730558395, "learning_rate": 1.4570851312183572e-05, "loss": 0.0097, "step": 4240 }, { "epoch": 0.18466819470545423, "grad_norm": 0.09125562757253647, "learning_rate": 1.4545797805325017e-05, "loss": 0.0113, "step": 4250 }, { "epoch": 0.18510270810476118, "grad_norm": 0.0886969044804573, "learning_rate": 1.4520708281813023e-05, "loss": 0.0132, "step": 4260 }, { "epoch": 0.18553722150406812, "grad_norm": 0.1494559645652771, "learning_rate": 1.4495582940433525e-05, "loss": 0.012, "step": 4270 }, { "epoch": 0.1859717349033751, "grad_norm": 0.13574957847595215, "learning_rate": 1.4470421980256253e-05, "loss": 0.0156, "step": 4280 }, { "epoch": 0.18640624830268204, "grad_norm": 0.1322220414876938, "learning_rate": 1.4445225600633128e-05, "loss": 0.0111, "step": 4290 }, { "epoch": 0.186840761701989, "grad_norm": 0.10809938609600067, "learning_rate": 1.4419994001196727e-05, "loss": 0.013, "step": 4300 }, { "epoch": 0.18727527510129593, "grad_norm": 0.12031327188014984, "learning_rate": 1.4394727381858655e-05, "loss": 0.0095, "step": 4310 }, { "epoch": 0.18770978850060288, "grad_norm": 0.0926700234413147, "learning_rate": 1.4369425942807994e-05, "loss": 0.0102, "step": 4320 }, { "epoch": 0.18814430189990983, "grad_norm": 0.1475488394498825, "learning_rate": 1.4344089884509702e-05, "loss": 0.0145, "step": 4330 }, { "epoch": 0.1885788152992168, "grad_norm": 0.08219380676746368, "learning_rate": 1.4318719407703022e-05, "loss": 0.0103, "step": 4340 }, { "epoch": 0.18901332869852375, "grad_norm": 0.1727781891822815, "learning_rate": 1.4293314713399904e-05, "loss": 0.0113, "step": 4350 }, { "epoch": 0.1894478420978307, "grad_norm": 0.09111393988132477, "learning_rate": 1.4267876002883406e-05, "loss": 0.0165, "step": 4360 }, { "epoch": 0.18988235549713764, "grad_norm": 0.1518193483352661, "learning_rate": 1.424240347770609e-05, "loss": 0.0128, "step": 4370 }, { "epoch": 0.19031686889644459, "grad_norm": 0.11269281804561615, "learning_rate": 1.4216897339688446e-05, "loss": 0.014, "step": 4380 }, { "epoch": 0.19075138229575153, "grad_norm": 0.08917608112096786, "learning_rate": 1.419135779091727e-05, "loss": 0.012, "step": 4390 }, { "epoch": 0.1911858956950585, "grad_norm": 0.08260834217071533, "learning_rate": 1.4165785033744081e-05, "loss": 0.0146, "step": 4400 }, { "epoch": 0.19162040909436545, "grad_norm": 0.1132323145866394, "learning_rate": 1.4140179270783506e-05, "loss": 0.014, "step": 4410 }, { "epoch": 0.1920549224936724, "grad_norm": 0.14547128975391388, "learning_rate": 1.4114540704911679e-05, "loss": 0.0129, "step": 4420 }, { "epoch": 0.19248943589297934, "grad_norm": 0.10227222740650177, "learning_rate": 1.4088869539264636e-05, "loss": 0.0148, "step": 4430 }, { "epoch": 0.1929239492922863, "grad_norm": 0.12026118487119675, "learning_rate": 1.40631659772367e-05, "loss": 0.0142, "step": 4440 }, { "epoch": 0.19335846269159326, "grad_norm": 0.08430317789316177, "learning_rate": 1.4037430222478876e-05, "loss": 0.0125, "step": 4450 }, { "epoch": 0.1937929760909002, "grad_norm": 0.12531019747257233, "learning_rate": 1.4011662478897239e-05, "loss": 0.0147, "step": 4460 }, { "epoch": 0.19422748949020716, "grad_norm": 0.1069069430232048, "learning_rate": 1.3985862950651296e-05, "loss": 0.0131, "step": 4470 }, { "epoch": 0.1946620028895141, "grad_norm": 0.08250004053115845, "learning_rate": 1.3960031842152404e-05, "loss": 0.0136, "step": 4480 }, { "epoch": 0.19509651628882105, "grad_norm": 0.12113544344902039, "learning_rate": 1.3934169358062128e-05, "loss": 0.0131, "step": 4490 }, { "epoch": 0.195531029688128, "grad_norm": 0.07268256694078445, "learning_rate": 1.3908275703290616e-05, "loss": 0.0123, "step": 4500 }, { "epoch": 0.19596554308743497, "grad_norm": 0.10087154060602188, "learning_rate": 1.3882351082994996e-05, "loss": 0.01, "step": 4510 }, { "epoch": 0.19640005648674191, "grad_norm": 0.1169930100440979, "learning_rate": 1.385639570257772e-05, "loss": 0.0112, "step": 4520 }, { "epoch": 0.19683456988604886, "grad_norm": 0.161058247089386, "learning_rate": 1.3830409767684976e-05, "loss": 0.0102, "step": 4530 }, { "epoch": 0.1972690832853558, "grad_norm": 0.1058209165930748, "learning_rate": 1.380439348420502e-05, "loss": 0.01, "step": 4540 }, { "epoch": 0.19770359668466275, "grad_norm": 0.12935835123062134, "learning_rate": 1.377834705826657e-05, "loss": 0.0111, "step": 4550 }, { "epoch": 0.19813811008396973, "grad_norm": 0.08859094232320786, "learning_rate": 1.3752270696237164e-05, "loss": 0.0098, "step": 4560 }, { "epoch": 0.19857262348327667, "grad_norm": 0.07520358264446259, "learning_rate": 1.3726164604721523e-05, "loss": 0.0122, "step": 4570 }, { "epoch": 0.19900713688258362, "grad_norm": 0.08760765194892883, "learning_rate": 1.370002899055992e-05, "loss": 0.0115, "step": 4580 }, { "epoch": 0.19944165028189056, "grad_norm": 0.12289754301309586, "learning_rate": 1.3673864060826531e-05, "loss": 0.0119, "step": 4590 }, { "epoch": 0.1998761636811975, "grad_norm": 0.089107945561409, "learning_rate": 1.3647670022827815e-05, "loss": 0.0098, "step": 4600 }, { "epoch": 0.20031067708050446, "grad_norm": 0.1029924601316452, "learning_rate": 1.3621447084100843e-05, "loss": 0.0157, "step": 4610 }, { "epoch": 0.20074519047981143, "grad_norm": 0.07722745835781097, "learning_rate": 1.3595195452411674e-05, "loss": 0.0149, "step": 4620 }, { "epoch": 0.20117970387911838, "grad_norm": 0.07080460339784622, "learning_rate": 1.3568915335753704e-05, "loss": 0.012, "step": 4630 }, { "epoch": 0.20161421727842532, "grad_norm": 0.08929922431707382, "learning_rate": 1.3542606942346019e-05, "loss": 0.0109, "step": 4640 }, { "epoch": 0.20204873067773227, "grad_norm": 0.10253210365772247, "learning_rate": 1.3516270480631738e-05, "loss": 0.0122, "step": 4650 }, { "epoch": 0.20248324407703922, "grad_norm": 0.08288437128067017, "learning_rate": 1.3489906159276374e-05, "loss": 0.012, "step": 4660 }, { "epoch": 0.2029177574763462, "grad_norm": 0.10819882154464722, "learning_rate": 1.3463514187166169e-05, "loss": 0.0131, "step": 4670 }, { "epoch": 0.20335227087565313, "grad_norm": 0.11791016161441803, "learning_rate": 1.343709477340644e-05, "loss": 0.013, "step": 4680 }, { "epoch": 0.20378678427496008, "grad_norm": 0.08969360589981079, "learning_rate": 1.3410648127319941e-05, "loss": 0.0128, "step": 4690 }, { "epoch": 0.20422129767426703, "grad_norm": 0.09490638226270676, "learning_rate": 1.3384174458445167e-05, "loss": 0.0085, "step": 4700 }, { "epoch": 0.20465581107357397, "grad_norm": 0.11180137097835541, "learning_rate": 1.335767397653474e-05, "loss": 0.0109, "step": 4710 }, { "epoch": 0.20509032447288092, "grad_norm": 0.07162585109472275, "learning_rate": 1.3331146891553708e-05, "loss": 0.0114, "step": 4720 }, { "epoch": 0.2055248378721879, "grad_norm": 0.07085591554641724, "learning_rate": 1.3304593413677893e-05, "loss": 0.0103, "step": 4730 }, { "epoch": 0.20595935127149484, "grad_norm": 0.1729234755039215, "learning_rate": 1.327801375329225e-05, "loss": 0.0104, "step": 4740 }, { "epoch": 0.20639386467080179, "grad_norm": 0.09915310144424438, "learning_rate": 1.325140812098916e-05, "loss": 0.0146, "step": 4750 }, { "epoch": 0.20682837807010873, "grad_norm": 0.1390744298696518, "learning_rate": 1.322477672756679e-05, "loss": 0.0131, "step": 4760 }, { "epoch": 0.20726289146941568, "grad_norm": 0.11264286935329437, "learning_rate": 1.3198119784027415e-05, "loss": 0.0123, "step": 4770 }, { "epoch": 0.20769740486872265, "grad_norm": 0.1154993548989296, "learning_rate": 1.3171437501575739e-05, "loss": 0.013, "step": 4780 }, { "epoch": 0.2081319182680296, "grad_norm": 0.08860547095537186, "learning_rate": 1.3144730091617235e-05, "loss": 0.0129, "step": 4790 }, { "epoch": 0.20856643166733654, "grad_norm": 0.11519289761781693, "learning_rate": 1.3117997765756455e-05, "loss": 0.014, "step": 4800 }, { "epoch": 0.2090009450666435, "grad_norm": 0.08243907243013382, "learning_rate": 1.3091240735795372e-05, "loss": 0.011, "step": 4810 }, { "epoch": 0.20943545846595044, "grad_norm": 0.08150099217891693, "learning_rate": 1.3064459213731679e-05, "loss": 0.0105, "step": 4820 }, { "epoch": 0.20986997186525738, "grad_norm": 0.09725738316774368, "learning_rate": 1.3037653411757134e-05, "loss": 0.014, "step": 4830 }, { "epoch": 0.21030448526456436, "grad_norm": 0.0995921865105629, "learning_rate": 1.301082354225585e-05, "loss": 0.0111, "step": 4840 }, { "epoch": 0.2107389986638713, "grad_norm": 0.09498723596334457, "learning_rate": 1.2983969817802653e-05, "loss": 0.0121, "step": 4850 }, { "epoch": 0.21117351206317825, "grad_norm": 0.07547599077224731, "learning_rate": 1.2957092451161344e-05, "loss": 0.0098, "step": 4860 }, { "epoch": 0.2116080254624852, "grad_norm": 0.11737942695617676, "learning_rate": 1.293019165528307e-05, "loss": 0.0121, "step": 4870 }, { "epoch": 0.21204253886179214, "grad_norm": 0.07418479025363922, "learning_rate": 1.2903267643304588e-05, "loss": 0.0133, "step": 4880 }, { "epoch": 0.21247705226109911, "grad_norm": 0.14015620946884155, "learning_rate": 1.2876320628546608e-05, "loss": 0.0109, "step": 4890 }, { "epoch": 0.21291156566040606, "grad_norm": 0.09719277173280716, "learning_rate": 1.2849350824512097e-05, "loss": 0.0114, "step": 4900 }, { "epoch": 0.213346079059713, "grad_norm": 0.07804874330759048, "learning_rate": 1.2822358444884568e-05, "loss": 0.0113, "step": 4910 }, { "epoch": 0.21378059245901995, "grad_norm": 0.06252479553222656, "learning_rate": 1.279534370352642e-05, "loss": 0.0099, "step": 4920 }, { "epoch": 0.2142151058583269, "grad_norm": 0.07044639438390732, "learning_rate": 1.276830681447721e-05, "loss": 0.0096, "step": 4930 }, { "epoch": 0.21464961925763384, "grad_norm": 0.15297789871692657, "learning_rate": 1.2741247991951976e-05, "loss": 0.0108, "step": 4940 }, { "epoch": 0.21508413265694082, "grad_norm": 0.10401252657175064, "learning_rate": 1.2714167450339551e-05, "loss": 0.0128, "step": 4950 }, { "epoch": 0.21551864605624776, "grad_norm": 0.13289690017700195, "learning_rate": 1.268706540420083e-05, "loss": 0.0119, "step": 4960 }, { "epoch": 0.2159531594555547, "grad_norm": 0.09449134021997452, "learning_rate": 1.2659942068267097e-05, "loss": 0.0085, "step": 4970 }, { "epoch": 0.21638767285486166, "grad_norm": 0.11930648237466812, "learning_rate": 1.2632797657438317e-05, "loss": 0.0109, "step": 4980 }, { "epoch": 0.2168221862541686, "grad_norm": 0.10005539655685425, "learning_rate": 1.2605632386781442e-05, "loss": 0.0117, "step": 4990 }, { "epoch": 0.21725669965347558, "grad_norm": 0.11891493946313858, "learning_rate": 1.2578446471528678e-05, "loss": 0.0125, "step": 5000 }, { "epoch": 0.21769121305278252, "grad_norm": 0.12067518383264542, "learning_rate": 1.2551240127075815e-05, "loss": 0.0124, "step": 5010 }, { "epoch": 0.21812572645208947, "grad_norm": 0.12736202776432037, "learning_rate": 1.2524013568980496e-05, "loss": 0.012, "step": 5020 }, { "epoch": 0.21856023985139642, "grad_norm": 0.08547815680503845, "learning_rate": 1.249676701296053e-05, "loss": 0.0108, "step": 5030 }, { "epoch": 0.21899475325070336, "grad_norm": 0.08990458399057388, "learning_rate": 1.2469500674892159e-05, "loss": 0.0107, "step": 5040 }, { "epoch": 0.2194292666500103, "grad_norm": 0.09064441174268723, "learning_rate": 1.244221477080836e-05, "loss": 0.011, "step": 5050 }, { "epoch": 0.21986378004931728, "grad_norm": 0.10774874687194824, "learning_rate": 1.2414909516897145e-05, "loss": 0.01, "step": 5060 }, { "epoch": 0.22029829344862423, "grad_norm": 0.11564863473176956, "learning_rate": 1.2387585129499815e-05, "loss": 0.0116, "step": 5070 }, { "epoch": 0.22073280684793117, "grad_norm": 0.1112249344587326, "learning_rate": 1.2360241825109293e-05, "loss": 0.0122, "step": 5080 }, { "epoch": 0.22116732024723812, "grad_norm": 0.09462766349315643, "learning_rate": 1.2332879820368358e-05, "loss": 0.0128, "step": 5090 }, { "epoch": 0.22160183364654507, "grad_norm": 0.11477925628423691, "learning_rate": 1.2305499332067967e-05, "loss": 0.0105, "step": 5100 }, { "epoch": 0.22203634704585204, "grad_norm": 0.08903530240058899, "learning_rate": 1.2278100577145526e-05, "loss": 0.0121, "step": 5110 }, { "epoch": 0.22247086044515899, "grad_norm": 0.0783642828464508, "learning_rate": 1.2250683772683151e-05, "loss": 0.0144, "step": 5120 }, { "epoch": 0.22290537384446593, "grad_norm": 0.08656443655490875, "learning_rate": 1.222324913590599e-05, "loss": 0.0163, "step": 5130 }, { "epoch": 0.22333988724377288, "grad_norm": 0.08687745034694672, "learning_rate": 1.2195796884180458e-05, "loss": 0.0102, "step": 5140 }, { "epoch": 0.22377440064307982, "grad_norm": 0.09154532104730606, "learning_rate": 1.2168327235012544e-05, "loss": 0.0097, "step": 5150 }, { "epoch": 0.22420891404238677, "grad_norm": 0.14180970191955566, "learning_rate": 1.2140840406046075e-05, "loss": 0.0111, "step": 5160 }, { "epoch": 0.22464342744169374, "grad_norm": 0.09861394762992859, "learning_rate": 1.2113336615060996e-05, "loss": 0.0099, "step": 5170 }, { "epoch": 0.2250779408410007, "grad_norm": 0.1602022647857666, "learning_rate": 1.2085816079971639e-05, "loss": 0.0114, "step": 5180 }, { "epoch": 0.22551245424030764, "grad_norm": 0.13536742329597473, "learning_rate": 1.205827901882501e-05, "loss": 0.0098, "step": 5190 }, { "epoch": 0.22594696763961458, "grad_norm": 0.1036570593714714, "learning_rate": 1.2030725649799043e-05, "loss": 0.011, "step": 5200 }, { "epoch": 0.22638148103892153, "grad_norm": 0.14041714370250702, "learning_rate": 1.2003156191200885e-05, "loss": 0.0102, "step": 5210 }, { "epoch": 0.2268159944382285, "grad_norm": 0.13318133354187012, "learning_rate": 1.1975570861465156e-05, "loss": 0.0107, "step": 5220 }, { "epoch": 0.22725050783753545, "grad_norm": 0.05623549222946167, "learning_rate": 1.194796987915223e-05, "loss": 0.0145, "step": 5230 }, { "epoch": 0.2276850212368424, "grad_norm": 0.04628011956810951, "learning_rate": 1.1920353462946503e-05, "loss": 0.014, "step": 5240 }, { "epoch": 0.22811953463614934, "grad_norm": 0.10823997110128403, "learning_rate": 1.1892721831654638e-05, "loss": 0.0096, "step": 5250 }, { "epoch": 0.2285540480354563, "grad_norm": 0.08705588430166245, "learning_rate": 1.1865075204203866e-05, "loss": 0.0093, "step": 5260 }, { "epoch": 0.22898856143476323, "grad_norm": 0.10484207421541214, "learning_rate": 1.1837413799640216e-05, "loss": 0.0113, "step": 5270 }, { "epoch": 0.2294230748340702, "grad_norm": 0.13769450783729553, "learning_rate": 1.1809737837126812e-05, "loss": 0.0117, "step": 5280 }, { "epoch": 0.22985758823337715, "grad_norm": 0.09259689599275589, "learning_rate": 1.1782047535942117e-05, "loss": 0.0104, "step": 5290 }, { "epoch": 0.2302921016326841, "grad_norm": 0.1792672574520111, "learning_rate": 1.1754343115478193e-05, "loss": 0.0117, "step": 5300 }, { "epoch": 0.23072661503199104, "grad_norm": 0.17120079696178436, "learning_rate": 1.1726624795238981e-05, "loss": 0.0111, "step": 5310 }, { "epoch": 0.231161128431298, "grad_norm": 0.08341917395591736, "learning_rate": 1.1698892794838546e-05, "loss": 0.0116, "step": 5320 }, { "epoch": 0.23159564183060496, "grad_norm": 0.07231700420379639, "learning_rate": 1.167114733399934e-05, "loss": 0.0106, "step": 5330 }, { "epoch": 0.2320301552299119, "grad_norm": 0.06728389859199524, "learning_rate": 1.1643388632550468e-05, "loss": 0.0102, "step": 5340 }, { "epoch": 0.23246466862921886, "grad_norm": 0.0841410830616951, "learning_rate": 1.1615616910425928e-05, "loss": 0.0113, "step": 5350 }, { "epoch": 0.2328991820285258, "grad_norm": 0.10476098209619522, "learning_rate": 1.15878323876629e-05, "loss": 0.0103, "step": 5360 }, { "epoch": 0.23333369542783275, "grad_norm": 0.08550181239843369, "learning_rate": 1.1560035284399977e-05, "loss": 0.0142, "step": 5370 }, { "epoch": 0.2337682088271397, "grad_norm": 0.09405146539211273, "learning_rate": 1.1532225820875422e-05, "loss": 0.0127, "step": 5380 }, { "epoch": 0.23420272222644667, "grad_norm": 0.11513853073120117, "learning_rate": 1.1504404217425438e-05, "loss": 0.0127, "step": 5390 }, { "epoch": 0.23463723562575362, "grad_norm": 0.12319008260965347, "learning_rate": 1.1476570694482406e-05, "loss": 0.0104, "step": 5400 }, { "epoch": 0.23507174902506056, "grad_norm": 0.05747454985976219, "learning_rate": 1.1448725472573145e-05, "loss": 0.0125, "step": 5410 }, { "epoch": 0.2355062624243675, "grad_norm": 0.09699085354804993, "learning_rate": 1.1420868772317184e-05, "loss": 0.0117, "step": 5420 }, { "epoch": 0.23594077582367445, "grad_norm": 0.1155138611793518, "learning_rate": 1.1393000814424973e-05, "loss": 0.011, "step": 5430 }, { "epoch": 0.23637528922298143, "grad_norm": 0.07734677940607071, "learning_rate": 1.1365121819696163e-05, "loss": 0.0122, "step": 5440 }, { "epoch": 0.23680980262228837, "grad_norm": 0.13127054274082184, "learning_rate": 1.1337232009017858e-05, "loss": 0.0148, "step": 5450 }, { "epoch": 0.23724431602159532, "grad_norm": 0.11363419145345688, "learning_rate": 1.130933160336285e-05, "loss": 0.0135, "step": 5460 }, { "epoch": 0.23767882942090227, "grad_norm": 0.19144205749034882, "learning_rate": 1.1281420823787883e-05, "loss": 0.0136, "step": 5470 }, { "epoch": 0.2381133428202092, "grad_norm": 0.11351267248392105, "learning_rate": 1.1253499891431882e-05, "loss": 0.0107, "step": 5480 }, { "epoch": 0.23854785621951616, "grad_norm": 0.09887941181659698, "learning_rate": 1.1225569027514229e-05, "loss": 0.0093, "step": 5490 }, { "epoch": 0.23898236961882313, "grad_norm": 0.15717951953411102, "learning_rate": 1.1197628453332986e-05, "loss": 0.0118, "step": 5500 }, { "epoch": 0.23941688301813008, "grad_norm": 0.0757034569978714, "learning_rate": 1.1169678390263143e-05, "loss": 0.0114, "step": 5510 }, { "epoch": 0.23985139641743702, "grad_norm": 0.12099827080965042, "learning_rate": 1.1141719059754884e-05, "loss": 0.0111, "step": 5520 }, { "epoch": 0.24028590981674397, "grad_norm": 0.11477316915988922, "learning_rate": 1.1113750683331813e-05, "loss": 0.0142, "step": 5530 }, { "epoch": 0.24072042321605092, "grad_norm": 0.12390632182359695, "learning_rate": 1.1085773482589206e-05, "loss": 0.0122, "step": 5540 }, { "epoch": 0.2411549366153579, "grad_norm": 0.06999865174293518, "learning_rate": 1.1057787679192256e-05, "loss": 0.0086, "step": 5550 }, { "epoch": 0.24158945001466484, "grad_norm": 0.14412355422973633, "learning_rate": 1.1029793494874312e-05, "loss": 0.0136, "step": 5560 }, { "epoch": 0.24202396341397178, "grad_norm": 0.12041021883487701, "learning_rate": 1.1001791151435131e-05, "loss": 0.0119, "step": 5570 }, { "epoch": 0.24245847681327873, "grad_norm": 0.054349642246961594, "learning_rate": 1.0973780870739111e-05, "loss": 0.0086, "step": 5580 }, { "epoch": 0.24289299021258567, "grad_norm": 0.0854615718126297, "learning_rate": 1.0945762874713537e-05, "loss": 0.0104, "step": 5590 }, { "epoch": 0.24332750361189262, "grad_norm": 0.09958671033382416, "learning_rate": 1.0917737385346828e-05, "loss": 0.0081, "step": 5600 }, { "epoch": 0.2437620170111996, "grad_norm": 0.13889344036579132, "learning_rate": 1.0889704624686766e-05, "loss": 0.0134, "step": 5610 }, { "epoch": 0.24419653041050654, "grad_norm": 0.125096395611763, "learning_rate": 1.0861664814838747e-05, "loss": 0.0102, "step": 5620 }, { "epoch": 0.2446310438098135, "grad_norm": 0.06588041037321091, "learning_rate": 1.083361817796403e-05, "loss": 0.0122, "step": 5630 }, { "epoch": 0.24506555720912043, "grad_norm": 0.11299753189086914, "learning_rate": 1.0805564936277936e-05, "loss": 0.0105, "step": 5640 }, { "epoch": 0.24550007060842738, "grad_norm": 0.10001208633184433, "learning_rate": 1.0777505312048152e-05, "loss": 0.0128, "step": 5650 }, { "epoch": 0.24593458400773432, "grad_norm": 0.1002093255519867, "learning_rate": 1.0749439527592909e-05, "loss": 0.013, "step": 5660 }, { "epoch": 0.2463690974070413, "grad_norm": 0.10719779878854752, "learning_rate": 1.0721367805279251e-05, "loss": 0.0109, "step": 5670 }, { "epoch": 0.24680361080634824, "grad_norm": 0.09654932469129562, "learning_rate": 1.0693290367521276e-05, "loss": 0.0093, "step": 5680 }, { "epoch": 0.2472381242056552, "grad_norm": 0.06348220258951187, "learning_rate": 1.0665207436778353e-05, "loss": 0.0102, "step": 5690 }, { "epoch": 0.24767263760496214, "grad_norm": 0.13976316154003143, "learning_rate": 1.0637119235553388e-05, "loss": 0.0121, "step": 5700 }, { "epoch": 0.24810715100426908, "grad_norm": 0.10334043204784393, "learning_rate": 1.0609025986391032e-05, "loss": 0.0118, "step": 5710 }, { "epoch": 0.24854166440357606, "grad_norm": 0.09673593193292618, "learning_rate": 1.0580927911875938e-05, "loss": 0.0093, "step": 5720 }, { "epoch": 0.248976177802883, "grad_norm": 0.08155500888824463, "learning_rate": 1.055282523463099e-05, "loss": 0.011, "step": 5730 }, { "epoch": 0.24941069120218995, "grad_norm": 0.10107115656137466, "learning_rate": 1.0524718177315536e-05, "loss": 0.0077, "step": 5740 }, { "epoch": 0.2498452046014969, "grad_norm": 0.10318543016910553, "learning_rate": 1.0496606962623632e-05, "loss": 0.0113, "step": 5750 }, { "epoch": 0.25027971800080384, "grad_norm": 0.07837533950805664, "learning_rate": 1.0468491813282269e-05, "loss": 0.0096, "step": 5760 }, { "epoch": 0.2507142314001108, "grad_norm": 0.1109745129942894, "learning_rate": 1.0440372952049618e-05, "loss": 0.0104, "step": 5770 }, { "epoch": 0.25114874479941773, "grad_norm": 0.10056454688310623, "learning_rate": 1.0412250601713254e-05, "loss": 0.0095, "step": 5780 }, { "epoch": 0.2515832581987247, "grad_norm": 0.11573497951030731, "learning_rate": 1.03841249850884e-05, "loss": 0.0127, "step": 5790 }, { "epoch": 0.2520177715980317, "grad_norm": 0.09385628998279572, "learning_rate": 1.0355996325016152e-05, "loss": 0.012, "step": 5800 }, { "epoch": 0.2524522849973386, "grad_norm": 0.07893621921539307, "learning_rate": 1.0327864844361735e-05, "loss": 0.0086, "step": 5810 }, { "epoch": 0.2528867983966456, "grad_norm": 0.10404128581285477, "learning_rate": 1.02997307660127e-05, "loss": 0.0103, "step": 5820 }, { "epoch": 0.2533213117959525, "grad_norm": 0.17071014642715454, "learning_rate": 1.0271594312877196e-05, "loss": 0.0109, "step": 5830 }, { "epoch": 0.25375582519525947, "grad_norm": 0.07020419836044312, "learning_rate": 1.024345570788218e-05, "loss": 0.0101, "step": 5840 }, { "epoch": 0.2541903385945664, "grad_norm": 0.11266210675239563, "learning_rate": 1.0215315173971661e-05, "loss": 0.0122, "step": 5850 }, { "epoch": 0.25462485199387336, "grad_norm": 0.11897215247154236, "learning_rate": 1.0187172934104934e-05, "loss": 0.0107, "step": 5860 }, { "epoch": 0.2550593653931803, "grad_norm": 0.10172966867685318, "learning_rate": 1.01590292112548e-05, "loss": 0.0135, "step": 5870 }, { "epoch": 0.25549387879248725, "grad_norm": 0.09467621892690659, "learning_rate": 1.013088422840582e-05, "loss": 0.0083, "step": 5880 }, { "epoch": 0.2559283921917942, "grad_norm": 0.09752559661865234, "learning_rate": 1.0102738208552535e-05, "loss": 0.0114, "step": 5890 }, { "epoch": 0.25636290559110114, "grad_norm": 0.08257199823856354, "learning_rate": 1.0074591374697701e-05, "loss": 0.0101, "step": 5900 }, { "epoch": 0.25679741899040814, "grad_norm": 0.07855411618947983, "learning_rate": 1.0046443949850531e-05, "loss": 0.0105, "step": 5910 }, { "epoch": 0.2572319323897151, "grad_norm": 0.07632487267255783, "learning_rate": 1.00182961570249e-05, "loss": 0.0098, "step": 5920 }, { "epoch": 0.25766644578902204, "grad_norm": 0.0864594429731369, "learning_rate": 9.990148219237623e-06, "loss": 0.0099, "step": 5930 }, { "epoch": 0.258100959188329, "grad_norm": 0.07804177701473236, "learning_rate": 9.96200035950665e-06, "loss": 0.0091, "step": 5940 }, { "epoch": 0.25853547258763593, "grad_norm": 0.1131715476512909, "learning_rate": 9.933852800849311e-06, "loss": 0.0089, "step": 5950 }, { "epoch": 0.2589699859869429, "grad_norm": 0.08851603418588638, "learning_rate": 9.905705766280564e-06, "loss": 0.011, "step": 5960 }, { "epoch": 0.2594044993862498, "grad_norm": 0.08719158172607422, "learning_rate": 9.877559478811199e-06, "loss": 0.0083, "step": 5970 }, { "epoch": 0.25983901278555677, "grad_norm": 0.1157078891992569, "learning_rate": 9.849414161446093e-06, "loss": 0.0104, "step": 5980 }, { "epoch": 0.2602735261848637, "grad_norm": 0.1483270823955536, "learning_rate": 9.821270037182442e-06, "loss": 0.0114, "step": 5990 }, { "epoch": 0.26070803958417066, "grad_norm": 0.0843772441148758, "learning_rate": 9.793127329007973e-06, "loss": 0.0107, "step": 6000 }, { "epoch": 0.2611425529834776, "grad_norm": 0.06375252455472946, "learning_rate": 9.76498625989922e-06, "loss": 0.0081, "step": 6010 }, { "epoch": 0.2615770663827846, "grad_norm": 0.07981333881616592, "learning_rate": 9.736847052819704e-06, "loss": 0.012, "step": 6020 }, { "epoch": 0.26201157978209155, "grad_norm": 0.1121138408780098, "learning_rate": 9.708709930718204e-06, "loss": 0.011, "step": 6030 }, { "epoch": 0.2624460931813985, "grad_norm": 0.08210161328315735, "learning_rate": 9.680575116526982e-06, "loss": 0.0114, "step": 6040 }, { "epoch": 0.26288060658070544, "grad_norm": 0.06528054177761078, "learning_rate": 9.652442833160012e-06, "loss": 0.0099, "step": 6050 }, { "epoch": 0.2633151199800124, "grad_norm": 0.16328716278076172, "learning_rate": 9.624313303511218e-06, "loss": 0.0105, "step": 6060 }, { "epoch": 0.26374963337931934, "grad_norm": 0.08479687571525574, "learning_rate": 9.5961867504527e-06, "loss": 0.0095, "step": 6070 }, { "epoch": 0.2641841467786263, "grad_norm": 0.07424337416887283, "learning_rate": 9.568063396832979e-06, "loss": 0.0097, "step": 6080 }, { "epoch": 0.26461866017793323, "grad_norm": 0.07293923199176788, "learning_rate": 9.539943465475224e-06, "loss": 0.0118, "step": 6090 }, { "epoch": 0.2650531735772402, "grad_norm": 0.11847243458032608, "learning_rate": 9.511827179175496e-06, "loss": 0.0101, "step": 6100 }, { "epoch": 0.2654876869765471, "grad_norm": 0.08558386564254761, "learning_rate": 9.483714760700968e-06, "loss": 0.0108, "step": 6110 }, { "epoch": 0.26592220037585407, "grad_norm": 0.08817441016435623, "learning_rate": 9.455606432788172e-06, "loss": 0.0099, "step": 6120 }, { "epoch": 0.26635671377516107, "grad_norm": 0.057785723358392715, "learning_rate": 9.427502418141228e-06, "loss": 0.0096, "step": 6130 }, { "epoch": 0.266791227174468, "grad_norm": 0.12684212625026703, "learning_rate": 9.399402939430078e-06, "loss": 0.0118, "step": 6140 }, { "epoch": 0.26722574057377496, "grad_norm": 0.09561102092266083, "learning_rate": 9.371308219288739e-06, "loss": 0.0099, "step": 6150 }, { "epoch": 0.2676602539730819, "grad_norm": 0.09827898442745209, "learning_rate": 9.343218480313514e-06, "loss": 0.0097, "step": 6160 }, { "epoch": 0.26809476737238885, "grad_norm": 0.07475527375936508, "learning_rate": 9.315133945061243e-06, "loss": 0.011, "step": 6170 }, { "epoch": 0.2685292807716958, "grad_norm": 0.13806475698947906, "learning_rate": 9.287054836047532e-06, "loss": 0.0129, "step": 6180 }, { "epoch": 0.26896379417100275, "grad_norm": 0.06837272644042969, "learning_rate": 9.258981375745005e-06, "loss": 0.0121, "step": 6190 }, { "epoch": 0.2693983075703097, "grad_norm": 0.09564583748579025, "learning_rate": 9.230913786581523e-06, "loss": 0.0085, "step": 6200 }, { "epoch": 0.26983282096961664, "grad_norm": 0.09246288239955902, "learning_rate": 9.20285229093843e-06, "loss": 0.0126, "step": 6210 }, { "epoch": 0.2702673343689236, "grad_norm": 0.08352892100811005, "learning_rate": 9.174797111148792e-06, "loss": 0.0116, "step": 6220 }, { "epoch": 0.27070184776823053, "grad_norm": 0.10271226614713669, "learning_rate": 9.146748469495632e-06, "loss": 0.0101, "step": 6230 }, { "epoch": 0.2711363611675375, "grad_norm": 0.1079292818903923, "learning_rate": 9.11870658821018e-06, "loss": 0.0115, "step": 6240 }, { "epoch": 0.2715708745668445, "grad_norm": 0.10774653404951096, "learning_rate": 9.090671689470092e-06, "loss": 0.0077, "step": 6250 }, { "epoch": 0.2720053879661514, "grad_norm": 0.08636409789323807, "learning_rate": 9.062643995397705e-06, "loss": 0.009, "step": 6260 }, { "epoch": 0.27243990136545837, "grad_norm": 0.09142716228961945, "learning_rate": 9.034623728058269e-06, "loss": 0.009, "step": 6270 }, { "epoch": 0.2728744147647653, "grad_norm": 0.08086559921503067, "learning_rate": 9.006611109458201e-06, "loss": 0.0095, "step": 6280 }, { "epoch": 0.27330892816407226, "grad_norm": 0.11120078712701797, "learning_rate": 8.97860636154331e-06, "loss": 0.0109, "step": 6290 }, { "epoch": 0.2737434415633792, "grad_norm": 0.12770721316337585, "learning_rate": 8.950609706197048e-06, "loss": 0.0111, "step": 6300 }, { "epoch": 0.27417795496268615, "grad_norm": 0.08982839435338974, "learning_rate": 8.922621365238742e-06, "loss": 0.011, "step": 6310 }, { "epoch": 0.2746124683619931, "grad_norm": 0.10453902184963226, "learning_rate": 8.89464156042185e-06, "loss": 0.0113, "step": 6320 }, { "epoch": 0.27504698176130005, "grad_norm": 0.1214829757809639, "learning_rate": 8.8666705134322e-06, "loss": 0.0139, "step": 6330 }, { "epoch": 0.275481495160607, "grad_norm": 0.09030287712812424, "learning_rate": 8.838708445886223e-06, "loss": 0.0113, "step": 6340 }, { "epoch": 0.27591600855991394, "grad_norm": 0.0918763130903244, "learning_rate": 8.810755579329213e-06, "loss": 0.0092, "step": 6350 }, { "epoch": 0.27635052195922094, "grad_norm": 0.07734326273202896, "learning_rate": 8.782812135233556e-06, "loss": 0.0091, "step": 6360 }, { "epoch": 0.2767850353585279, "grad_norm": 0.09394949674606323, "learning_rate": 8.754878334996995e-06, "loss": 0.0098, "step": 6370 }, { "epoch": 0.27721954875783483, "grad_norm": 0.10168241709470749, "learning_rate": 8.726954399940855e-06, "loss": 0.0111, "step": 6380 }, { "epoch": 0.2776540621571418, "grad_norm": 0.06285534054040909, "learning_rate": 8.699040551308296e-06, "loss": 0.0121, "step": 6390 }, { "epoch": 0.2780885755564487, "grad_norm": 0.09565223753452301, "learning_rate": 8.671137010262568e-06, "loss": 0.0084, "step": 6400 }, { "epoch": 0.27852308895575567, "grad_norm": 0.08614465594291687, "learning_rate": 8.643243997885253e-06, "loss": 0.0101, "step": 6410 }, { "epoch": 0.2789576023550626, "grad_norm": 0.06294821202754974, "learning_rate": 8.615361735174517e-06, "loss": 0.0106, "step": 6420 }, { "epoch": 0.27939211575436956, "grad_norm": 0.07929501682519913, "learning_rate": 8.58749044304335e-06, "loss": 0.0106, "step": 6430 }, { "epoch": 0.2798266291536765, "grad_norm": 0.06724255532026291, "learning_rate": 8.559630342317822e-06, "loss": 0.0075, "step": 6440 }, { "epoch": 0.28026114255298346, "grad_norm": 0.10936015844345093, "learning_rate": 8.531781653735334e-06, "loss": 0.0153, "step": 6450 }, { "epoch": 0.2806956559522904, "grad_norm": 0.08154020458459854, "learning_rate": 8.503944597942865e-06, "loss": 0.0116, "step": 6460 }, { "epoch": 0.2811301693515974, "grad_norm": 0.143080934882164, "learning_rate": 8.476119395495235e-06, "loss": 0.0107, "step": 6470 }, { "epoch": 0.28156468275090435, "grad_norm": 0.06979650259017944, "learning_rate": 8.44830626685334e-06, "loss": 0.01, "step": 6480 }, { "epoch": 0.2819991961502113, "grad_norm": 0.08214490860700607, "learning_rate": 8.42050543238242e-06, "loss": 0.0103, "step": 6490 }, { "epoch": 0.28243370954951824, "grad_norm": 0.15016458928585052, "learning_rate": 8.392717112350301e-06, "loss": 0.0127, "step": 6500 }, { "epoch": 0.2828682229488252, "grad_norm": 0.1313597708940506, "learning_rate": 8.364941526925667e-06, "loss": 0.0116, "step": 6510 }, { "epoch": 0.28330273634813213, "grad_norm": 0.14201946556568146, "learning_rate": 8.337178896176295e-06, "loss": 0.0088, "step": 6520 }, { "epoch": 0.2837372497474391, "grad_norm": 0.111870177090168, "learning_rate": 8.309429440067324e-06, "loss": 0.0121, "step": 6530 }, { "epoch": 0.284171763146746, "grad_norm": 0.07782815396785736, "learning_rate": 8.281693378459516e-06, "loss": 0.0102, "step": 6540 }, { "epoch": 0.28460627654605297, "grad_norm": 0.10611778497695923, "learning_rate": 8.253970931107492e-06, "loss": 0.0124, "step": 6550 }, { "epoch": 0.2850407899453599, "grad_norm": 0.08557621389627457, "learning_rate": 8.226262317658027e-06, "loss": 0.0116, "step": 6560 }, { "epoch": 0.28547530334466686, "grad_norm": 0.09672235697507858, "learning_rate": 8.198567757648272e-06, "loss": 0.0114, "step": 6570 }, { "epoch": 0.28590981674397387, "grad_norm": 0.09065134823322296, "learning_rate": 8.170887470504038e-06, "loss": 0.0104, "step": 6580 }, { "epoch": 0.2863443301432808, "grad_norm": 0.09284477680921555, "learning_rate": 8.143221675538053e-06, "loss": 0.0115, "step": 6590 }, { "epoch": 0.28677884354258776, "grad_norm": 0.07387055456638336, "learning_rate": 8.115570591948222e-06, "loss": 0.0114, "step": 6600 }, { "epoch": 0.2872133569418947, "grad_norm": 0.09602408856153488, "learning_rate": 8.087934438815888e-06, "loss": 0.0109, "step": 6610 }, { "epoch": 0.28764787034120165, "grad_norm": 0.10952190309762955, "learning_rate": 8.0603134351041e-06, "loss": 0.0104, "step": 6620 }, { "epoch": 0.2880823837405086, "grad_norm": 0.2506903111934662, "learning_rate": 8.032707799655876e-06, "loss": 0.0103, "step": 6630 }, { "epoch": 0.28851689713981554, "grad_norm": 0.12849514186382294, "learning_rate": 8.005117751192472e-06, "loss": 0.0097, "step": 6640 }, { "epoch": 0.2889514105391225, "grad_norm": 0.06921357661485672, "learning_rate": 7.977543508311653e-06, "loss": 0.0087, "step": 6650 }, { "epoch": 0.28938592393842943, "grad_norm": 0.15184468030929565, "learning_rate": 7.949985289485945e-06, "loss": 0.0111, "step": 6660 }, { "epoch": 0.2898204373377364, "grad_norm": 0.07449796050786972, "learning_rate": 7.922443313060919e-06, "loss": 0.0093, "step": 6670 }, { "epoch": 0.2902549507370433, "grad_norm": 0.10266256332397461, "learning_rate": 7.894917797253452e-06, "loss": 0.0113, "step": 6680 }, { "epoch": 0.29068946413635033, "grad_norm": 0.10616596043109894, "learning_rate": 7.867408960150015e-06, "loss": 0.0105, "step": 6690 }, { "epoch": 0.2911239775356573, "grad_norm": 0.1440172642469406, "learning_rate": 7.839917019704921e-06, "loss": 0.0116, "step": 6700 }, { "epoch": 0.2915584909349642, "grad_norm": 0.13536323606967926, "learning_rate": 7.812442193738612e-06, "loss": 0.009, "step": 6710 }, { "epoch": 0.29199300433427117, "grad_norm": 0.09956809878349304, "learning_rate": 7.784984699935934e-06, "loss": 0.0097, "step": 6720 }, { "epoch": 0.2924275177335781, "grad_norm": 0.07595683634281158, "learning_rate": 7.7575447558444e-06, "loss": 0.0095, "step": 6730 }, { "epoch": 0.29286203113288506, "grad_norm": 0.10689617693424225, "learning_rate": 7.730122578872492e-06, "loss": 0.011, "step": 6740 }, { "epoch": 0.293296544532192, "grad_norm": 0.08313745260238647, "learning_rate": 7.702718386287904e-06, "loss": 0.0082, "step": 6750 }, { "epoch": 0.29373105793149895, "grad_norm": 0.0555373840034008, "learning_rate": 7.675332395215853e-06, "loss": 0.0105, "step": 6760 }, { "epoch": 0.2941655713308059, "grad_norm": 0.11606048047542572, "learning_rate": 7.64796482263734e-06, "loss": 0.0099, "step": 6770 }, { "epoch": 0.29460008473011284, "grad_norm": 0.08943700790405273, "learning_rate": 7.620615885387419e-06, "loss": 0.0077, "step": 6780 }, { "epoch": 0.2950345981294198, "grad_norm": 0.14774949848651886, "learning_rate": 7.593285800153527e-06, "loss": 0.008, "step": 6790 }, { "epoch": 0.2954691115287268, "grad_norm": 0.10569046437740326, "learning_rate": 7.565974783473709e-06, "loss": 0.0117, "step": 6800 }, { "epoch": 0.29590362492803374, "grad_norm": 0.0632302314043045, "learning_rate": 7.5386830517349366e-06, "loss": 0.0082, "step": 6810 }, { "epoch": 0.2963381383273407, "grad_norm": 0.1415219008922577, "learning_rate": 7.511410821171385e-06, "loss": 0.0098, "step": 6820 }, { "epoch": 0.29677265172664763, "grad_norm": 0.08672233670949936, "learning_rate": 7.484158307862726e-06, "loss": 0.0108, "step": 6830 }, { "epoch": 0.2972071651259546, "grad_norm": 0.1161867156624794, "learning_rate": 7.4569257277324035e-06, "loss": 0.0107, "step": 6840 }, { "epoch": 0.2976416785252615, "grad_norm": 0.09897766262292862, "learning_rate": 7.429713296545934e-06, "loss": 0.0095, "step": 6850 }, { "epoch": 0.29807619192456847, "grad_norm": 0.07324810326099396, "learning_rate": 7.402521229909185e-06, "loss": 0.0117, "step": 6860 }, { "epoch": 0.2985107053238754, "grad_norm": 0.06913300603628159, "learning_rate": 7.37534974326668e-06, "loss": 0.0091, "step": 6870 }, { "epoch": 0.29894521872318236, "grad_norm": 0.08431734144687653, "learning_rate": 7.3481990518998915e-06, "loss": 0.0086, "step": 6880 }, { "epoch": 0.2993797321224893, "grad_norm": 0.0898478627204895, "learning_rate": 7.321069370925519e-06, "loss": 0.0121, "step": 6890 }, { "epoch": 0.29981424552179625, "grad_norm": 0.09967532753944397, "learning_rate": 7.293960915293803e-06, "loss": 0.012, "step": 6900 }, { "epoch": 0.30024875892110325, "grad_norm": 0.07716736942529678, "learning_rate": 7.266873899786803e-06, "loss": 0.0096, "step": 6910 }, { "epoch": 0.3006832723204102, "grad_norm": 0.0543416365981102, "learning_rate": 7.2398085390167275e-06, "loss": 0.0078, "step": 6920 }, { "epoch": 0.30111778571971715, "grad_norm": 0.1596456915140152, "learning_rate": 7.212765047424191e-06, "loss": 0.0101, "step": 6930 }, { "epoch": 0.3015522991190241, "grad_norm": 0.1087801456451416, "learning_rate": 7.185743639276552e-06, "loss": 0.0107, "step": 6940 }, { "epoch": 0.30198681251833104, "grad_norm": 0.09049522876739502, "learning_rate": 7.158744528666196e-06, "loss": 0.0097, "step": 6950 }, { "epoch": 0.302421325917638, "grad_norm": 0.10051853209733963, "learning_rate": 7.131767929508833e-06, "loss": 0.0109, "step": 6960 }, { "epoch": 0.30285583931694493, "grad_norm": 0.06672215461730957, "learning_rate": 7.104814055541838e-06, "loss": 0.0095, "step": 6970 }, { "epoch": 0.3032903527162519, "grad_norm": 0.15979322791099548, "learning_rate": 7.077883120322507e-06, "loss": 0.0099, "step": 6980 }, { "epoch": 0.3037248661155588, "grad_norm": 0.07499784976243973, "learning_rate": 7.0509753372264065e-06, "loss": 0.0105, "step": 6990 }, { "epoch": 0.30415937951486577, "grad_norm": 0.09794459491968155, "learning_rate": 7.024090919445662e-06, "loss": 0.0101, "step": 7000 }, { "epoch": 0.3045938929141727, "grad_norm": 0.06780888140201569, "learning_rate": 6.997230079987272e-06, "loss": 0.0121, "step": 7010 }, { "epoch": 0.3050284063134797, "grad_norm": 0.08303526043891907, "learning_rate": 6.970393031671428e-06, "loss": 0.0098, "step": 7020 }, { "epoch": 0.30546291971278666, "grad_norm": 0.1095365509390831, "learning_rate": 6.943579987129822e-06, "loss": 0.0116, "step": 7030 }, { "epoch": 0.3058974331120936, "grad_norm": 0.11114852130413055, "learning_rate": 6.916791158803954e-06, "loss": 0.0098, "step": 7040 }, { "epoch": 0.30633194651140055, "grad_norm": 0.08195824921131134, "learning_rate": 6.890026758943464e-06, "loss": 0.0087, "step": 7050 }, { "epoch": 0.3067664599107075, "grad_norm": 0.0689440369606018, "learning_rate": 6.86328699960445e-06, "loss": 0.0095, "step": 7060 }, { "epoch": 0.30720097331001445, "grad_norm": 0.11462501436471939, "learning_rate": 6.83657209264777e-06, "loss": 0.0085, "step": 7070 }, { "epoch": 0.3076354867093214, "grad_norm": 0.06977716833353043, "learning_rate": 6.809882249737383e-06, "loss": 0.0075, "step": 7080 }, { "epoch": 0.30807000010862834, "grad_norm": 0.07778250426054001, "learning_rate": 6.783217682338655e-06, "loss": 0.0076, "step": 7090 }, { "epoch": 0.3085045135079353, "grad_norm": 0.08335845917463303, "learning_rate": 6.7565786017167004e-06, "loss": 0.0101, "step": 7100 }, { "epoch": 0.30893902690724223, "grad_norm": 0.09140942245721817, "learning_rate": 6.7299652189347e-06, "loss": 0.012, "step": 7110 }, { "epoch": 0.3093735403065492, "grad_norm": 0.10216088593006134, "learning_rate": 6.703377744852227e-06, "loss": 0.0108, "step": 7120 }, { "epoch": 0.3098080537058562, "grad_norm": 0.09677540510892868, "learning_rate": 6.6768163901235776e-06, "loss": 0.0083, "step": 7130 }, { "epoch": 0.3102425671051631, "grad_norm": 0.1337706297636032, "learning_rate": 6.650281365196096e-06, "loss": 0.0099, "step": 7140 }, { "epoch": 0.31067708050447007, "grad_norm": 0.09689532220363617, "learning_rate": 6.623772880308534e-06, "loss": 0.0096, "step": 7150 }, { "epoch": 0.311111593903777, "grad_norm": 0.08149607479572296, "learning_rate": 6.597291145489344e-06, "loss": 0.0102, "step": 7160 }, { "epoch": 0.31154610730308396, "grad_norm": 0.10443510860204697, "learning_rate": 6.570836370555045e-06, "loss": 0.009, "step": 7170 }, { "epoch": 0.3119806207023909, "grad_norm": 0.10890229046344757, "learning_rate": 6.544408765108549e-06, "loss": 0.0105, "step": 7180 }, { "epoch": 0.31241513410169786, "grad_norm": 0.04773566499352455, "learning_rate": 6.518008538537501e-06, "loss": 0.0092, "step": 7190 }, { "epoch": 0.3128496475010048, "grad_norm": 0.08288843929767609, "learning_rate": 6.4916359000126284e-06, "loss": 0.0104, "step": 7200 }, { "epoch": 0.31328416090031175, "grad_norm": 0.12421102821826935, "learning_rate": 6.465291058486072e-06, "loss": 0.0117, "step": 7210 }, { "epoch": 0.3137186742996187, "grad_norm": 0.0949474424123764, "learning_rate": 6.438974222689729e-06, "loss": 0.0104, "step": 7220 }, { "epoch": 0.31415318769892564, "grad_norm": 0.08548901975154877, "learning_rate": 6.4126856011336146e-06, "loss": 0.0124, "step": 7230 }, { "epoch": 0.31458770109823264, "grad_norm": 0.09069912880659103, "learning_rate": 6.386425402104199e-06, "loss": 0.0104, "step": 7240 }, { "epoch": 0.3150222144975396, "grad_norm": 0.1187874972820282, "learning_rate": 6.3601938336627555e-06, "loss": 0.0109, "step": 7250 }, { "epoch": 0.31545672789684653, "grad_norm": 0.07378183305263519, "learning_rate": 6.33399110364372e-06, "loss": 0.0068, "step": 7260 }, { "epoch": 0.3158912412961535, "grad_norm": 0.09195563942193985, "learning_rate": 6.307817419653031e-06, "loss": 0.01, "step": 7270 }, { "epoch": 0.3163257546954604, "grad_norm": 0.11191501468420029, "learning_rate": 6.281672989066501e-06, "loss": 0.0094, "step": 7280 }, { "epoch": 0.31676026809476737, "grad_norm": 0.08982055634260178, "learning_rate": 6.255558019028168e-06, "loss": 0.0114, "step": 7290 }, { "epoch": 0.3171947814940743, "grad_norm": 0.10709337145090103, "learning_rate": 6.229472716448647e-06, "loss": 0.0096, "step": 7300 }, { "epoch": 0.31762929489338126, "grad_norm": 0.06784507632255554, "learning_rate": 6.203417288003497e-06, "loss": 0.008, "step": 7310 }, { "epoch": 0.3180638082926882, "grad_norm": 0.08509954065084457, "learning_rate": 6.177391940131581e-06, "loss": 0.0095, "step": 7320 }, { "epoch": 0.31849832169199516, "grad_norm": 0.07616768777370453, "learning_rate": 6.15139687903343e-06, "loss": 0.0093, "step": 7330 }, { "epoch": 0.3189328350913021, "grad_norm": 0.11212021857500076, "learning_rate": 6.12543231066962e-06, "loss": 0.0101, "step": 7340 }, { "epoch": 0.3193673484906091, "grad_norm": 0.0787481889128685, "learning_rate": 6.099498440759123e-06, "loss": 0.0108, "step": 7350 }, { "epoch": 0.31980186188991605, "grad_norm": 0.10750093311071396, "learning_rate": 6.0735954747776856e-06, "loss": 0.0089, "step": 7360 }, { "epoch": 0.320236375289223, "grad_norm": 0.06456255167722702, "learning_rate": 6.047723617956201e-06, "loss": 0.0106, "step": 7370 }, { "epoch": 0.32067088868852994, "grad_norm": 0.08563439548015594, "learning_rate": 6.021883075279089e-06, "loss": 0.0095, "step": 7380 }, { "epoch": 0.3211054020878369, "grad_norm": 0.0769663155078888, "learning_rate": 5.996074051482657e-06, "loss": 0.0098, "step": 7390 }, { "epoch": 0.32153991548714383, "grad_norm": 0.08421932905912399, "learning_rate": 5.9702967510534884e-06, "loss": 0.0083, "step": 7400 }, { "epoch": 0.3219744288864508, "grad_norm": 0.07485505938529968, "learning_rate": 5.94455137822682e-06, "loss": 0.008, "step": 7410 }, { "epoch": 0.3224089422857577, "grad_norm": 0.04607333242893219, "learning_rate": 5.918838136984926e-06, "loss": 0.008, "step": 7420 }, { "epoch": 0.3228434556850647, "grad_norm": 0.10801783204078674, "learning_rate": 5.893157231055501e-06, "loss": 0.0089, "step": 7430 }, { "epoch": 0.3232779690843716, "grad_norm": 0.09994127601385117, "learning_rate": 5.867508863910043e-06, "loss": 0.0092, "step": 7440 }, { "epoch": 0.32371248248367857, "grad_norm": 0.0876353308558464, "learning_rate": 5.841893238762242e-06, "loss": 0.0094, "step": 7450 }, { "epoch": 0.32414699588298557, "grad_norm": 0.1374131441116333, "learning_rate": 5.816310558566367e-06, "loss": 0.0079, "step": 7460 }, { "epoch": 0.3245815092822925, "grad_norm": 0.14037005603313446, "learning_rate": 5.790761026015675e-06, "loss": 0.0117, "step": 7470 }, { "epoch": 0.32501602268159946, "grad_norm": 0.10385984182357788, "learning_rate": 5.765244843540783e-06, "loss": 0.0121, "step": 7480 }, { "epoch": 0.3254505360809064, "grad_norm": 0.15009550750255585, "learning_rate": 5.739762213308073e-06, "loss": 0.0105, "step": 7490 }, { "epoch": 0.32588504948021335, "grad_norm": 0.0838378369808197, "learning_rate": 5.714313337218087e-06, "loss": 0.0081, "step": 7500 }, { "epoch": 0.3263195628795203, "grad_norm": 0.1275458186864853, "learning_rate": 5.688898416903938e-06, "loss": 0.01, "step": 7510 }, { "epoch": 0.32675407627882724, "grad_norm": 0.10727114230394363, "learning_rate": 5.663517653729708e-06, "loss": 0.0079, "step": 7520 }, { "epoch": 0.3271885896781342, "grad_norm": 0.058422524482011795, "learning_rate": 5.638171248788842e-06, "loss": 0.0086, "step": 7530 }, { "epoch": 0.32762310307744114, "grad_norm": 0.06441164761781693, "learning_rate": 5.6128594029025585e-06, "loss": 0.0082, "step": 7540 }, { "epoch": 0.3280576164767481, "grad_norm": 0.17803671956062317, "learning_rate": 5.587582316618276e-06, "loss": 0.0154, "step": 7550 }, { "epoch": 0.32849212987605503, "grad_norm": 0.06189972534775734, "learning_rate": 5.562340190207995e-06, "loss": 0.0099, "step": 7560 }, { "epoch": 0.32892664327536203, "grad_norm": 0.0892493724822998, "learning_rate": 5.537133223666742e-06, "loss": 0.0117, "step": 7570 }, { "epoch": 0.329361156674669, "grad_norm": 0.06378696113824844, "learning_rate": 5.511961616710961e-06, "loss": 0.008, "step": 7580 }, { "epoch": 0.3297956700739759, "grad_norm": 0.06784821301698685, "learning_rate": 5.486825568776941e-06, "loss": 0.0086, "step": 7590 }, { "epoch": 0.33023018347328287, "grad_norm": 0.057210661470890045, "learning_rate": 5.461725279019226e-06, "loss": 0.0113, "step": 7600 }, { "epoch": 0.3306646968725898, "grad_norm": 0.10207509994506836, "learning_rate": 5.436660946309067e-06, "loss": 0.0094, "step": 7610 }, { "epoch": 0.33109921027189676, "grad_norm": 0.09658393263816833, "learning_rate": 5.411632769232808e-06, "loss": 0.0108, "step": 7620 }, { "epoch": 0.3315337236712037, "grad_norm": 0.11029795557260513, "learning_rate": 5.386640946090325e-06, "loss": 0.0109, "step": 7630 }, { "epoch": 0.33196823707051065, "grad_norm": 0.12160946428775787, "learning_rate": 5.361685674893481e-06, "loss": 0.008, "step": 7640 }, { "epoch": 0.3324027504698176, "grad_norm": 0.13957437872886658, "learning_rate": 5.3367671533645105e-06, "loss": 0.0082, "step": 7650 }, { "epoch": 0.33283726386912454, "grad_norm": 0.07055667042732239, "learning_rate": 5.3118855789345e-06, "loss": 0.0076, "step": 7660 }, { "epoch": 0.3332717772684315, "grad_norm": 0.07234865427017212, "learning_rate": 5.2870411487417825e-06, "loss": 0.0082, "step": 7670 }, { "epoch": 0.3337062906677385, "grad_norm": 0.1645868867635727, "learning_rate": 5.262234059630415e-06, "loss": 0.0094, "step": 7680 }, { "epoch": 0.33414080406704544, "grad_norm": 0.07004209607839584, "learning_rate": 5.237464508148575e-06, "loss": 0.008, "step": 7690 }, { "epoch": 0.3345753174663524, "grad_norm": 0.11307430267333984, "learning_rate": 5.212732690547047e-06, "loss": 0.01, "step": 7700 }, { "epoch": 0.33500983086565933, "grad_norm": 0.07407734543085098, "learning_rate": 5.1880388027776415e-06, "loss": 0.0099, "step": 7710 }, { "epoch": 0.3354443442649663, "grad_norm": 0.11762917041778564, "learning_rate": 5.163383040491645e-06, "loss": 0.0114, "step": 7720 }, { "epoch": 0.3358788576642732, "grad_norm": 0.08234626799821854, "learning_rate": 5.1387655990382716e-06, "loss": 0.0088, "step": 7730 }, { "epoch": 0.33631337106358017, "grad_norm": 0.07098574936389923, "learning_rate": 5.114186673463123e-06, "loss": 0.0119, "step": 7740 }, { "epoch": 0.3367478844628871, "grad_norm": 0.05303031578660011, "learning_rate": 5.089646458506639e-06, "loss": 0.0113, "step": 7750 }, { "epoch": 0.33718239786219406, "grad_norm": 0.06391644477844238, "learning_rate": 5.065145148602542e-06, "loss": 0.0094, "step": 7760 }, { "epoch": 0.337616911261501, "grad_norm": 0.1745784729719162, "learning_rate": 5.040682937876319e-06, "loss": 0.0085, "step": 7770 }, { "epoch": 0.33805142466080795, "grad_norm": 0.09049512445926666, "learning_rate": 5.016260020143659e-06, "loss": 0.0095, "step": 7780 }, { "epoch": 0.33848593806011495, "grad_norm": 0.09682343155145645, "learning_rate": 4.9918765889089475e-06, "loss": 0.0087, "step": 7790 }, { "epoch": 0.3389204514594219, "grad_norm": 0.09558317810297012, "learning_rate": 4.967532837363695e-06, "loss": 0.0087, "step": 7800 }, { "epoch": 0.33935496485872885, "grad_norm": 0.09068993479013443, "learning_rate": 4.943228958385045e-06, "loss": 0.0081, "step": 7810 }, { "epoch": 0.3397894782580358, "grad_norm": 0.1312408298254013, "learning_rate": 4.918965144534219e-06, "loss": 0.0114, "step": 7820 }, { "epoch": 0.34022399165734274, "grad_norm": 0.10443583130836487, "learning_rate": 4.894741588054993e-06, "loss": 0.0078, "step": 7830 }, { "epoch": 0.3406585050566497, "grad_norm": 0.0706024244427681, "learning_rate": 4.8705584808722065e-06, "loss": 0.0076, "step": 7840 }, { "epoch": 0.34109301845595663, "grad_norm": 0.12487440556287766, "learning_rate": 4.8464160145901894e-06, "loss": 0.0114, "step": 7850 }, { "epoch": 0.3415275318552636, "grad_norm": 0.11978522688150406, "learning_rate": 4.822314380491281e-06, "loss": 0.0102, "step": 7860 }, { "epoch": 0.3419620452545705, "grad_norm": 0.05795501545071602, "learning_rate": 4.7982537695343115e-06, "loss": 0.0098, "step": 7870 }, { "epoch": 0.34239655865387747, "grad_norm": 0.09978840500116348, "learning_rate": 4.7742343723530685e-06, "loss": 0.0101, "step": 7880 }, { "epoch": 0.3428310720531844, "grad_norm": 0.20433145761489868, "learning_rate": 4.750256379254814e-06, "loss": 0.0093, "step": 7890 }, { "epoch": 0.3432655854524914, "grad_norm": 0.13666003942489624, "learning_rate": 4.72631998021875e-06, "loss": 0.0097, "step": 7900 }, { "epoch": 0.34370009885179836, "grad_norm": 0.06936058402061462, "learning_rate": 4.70242536489454e-06, "loss": 0.0082, "step": 7910 }, { "epoch": 0.3441346122511053, "grad_norm": 0.12730272114276886, "learning_rate": 4.6785727226007746e-06, "loss": 0.0079, "step": 7920 }, { "epoch": 0.34456912565041226, "grad_norm": 0.08144715428352356, "learning_rate": 4.654762242323506e-06, "loss": 0.0083, "step": 7930 }, { "epoch": 0.3450036390497192, "grad_norm": 0.054894205182790756, "learning_rate": 4.63099411271473e-06, "loss": 0.0093, "step": 7940 }, { "epoch": 0.34543815244902615, "grad_norm": 0.08420240879058838, "learning_rate": 4.607268522090887e-06, "loss": 0.0106, "step": 7950 }, { "epoch": 0.3458726658483331, "grad_norm": 0.09710361808538437, "learning_rate": 4.583585658431383e-06, "loss": 0.0089, "step": 7960 }, { "epoch": 0.34630717924764004, "grad_norm": 0.06969739496707916, "learning_rate": 4.5599457093771e-06, "loss": 0.0115, "step": 7970 }, { "epoch": 0.346741692646947, "grad_norm": 0.10703381150960922, "learning_rate": 4.536348862228902e-06, "loss": 0.0092, "step": 7980 }, { "epoch": 0.34717620604625393, "grad_norm": 0.06389251351356506, "learning_rate": 4.512795303946148e-06, "loss": 0.007, "step": 7990 }, { "epoch": 0.3476107194455609, "grad_norm": 0.09196222573518753, "learning_rate": 4.489285221145227e-06, "loss": 0.0092, "step": 8000 }, { "epoch": 0.3480452328448679, "grad_norm": 0.0640423446893692, "learning_rate": 4.4658188000980586e-06, "loss": 0.0071, "step": 8010 }, { "epoch": 0.3484797462441748, "grad_norm": 0.08359646052122116, "learning_rate": 4.442396226730637e-06, "loss": 0.0103, "step": 8020 }, { "epoch": 0.34891425964348177, "grad_norm": 0.08465054631233215, "learning_rate": 4.419017686621536e-06, "loss": 0.0077, "step": 8030 }, { "epoch": 0.3493487730427887, "grad_norm": 0.10882612317800522, "learning_rate": 4.395683365000468e-06, "loss": 0.0087, "step": 8040 }, { "epoch": 0.34978328644209566, "grad_norm": 0.08855381608009338, "learning_rate": 4.372393446746781e-06, "loss": 0.01, "step": 8050 }, { "epoch": 0.3502177998414026, "grad_norm": 0.08646983653306961, "learning_rate": 4.349148116388026e-06, "loss": 0.0066, "step": 8060 }, { "epoch": 0.35065231324070956, "grad_norm": 0.11281073838472366, "learning_rate": 4.325947558098478e-06, "loss": 0.0099, "step": 8070 }, { "epoch": 0.3510868266400165, "grad_norm": 0.05874338001012802, "learning_rate": 4.302791955697676e-06, "loss": 0.0081, "step": 8080 }, { "epoch": 0.35152134003932345, "grad_norm": 0.07645628601312637, "learning_rate": 4.27968149264897e-06, "loss": 0.0082, "step": 8090 }, { "epoch": 0.3519558534386304, "grad_norm": 0.061246972531080246, "learning_rate": 4.256616352058073e-06, "loss": 0.0097, "step": 8100 }, { "epoch": 0.35239036683793734, "grad_norm": 0.09351754933595657, "learning_rate": 4.2335967166716064e-06, "loss": 0.0078, "step": 8110 }, { "epoch": 0.35282488023724434, "grad_norm": 0.07282616943120956, "learning_rate": 4.210622768875643e-06, "loss": 0.01, "step": 8120 }, { "epoch": 0.3532593936365513, "grad_norm": 0.11318627744913101, "learning_rate": 4.187694690694279e-06, "loss": 0.0145, "step": 8130 }, { "epoch": 0.35369390703585823, "grad_norm": 0.103764608502388, "learning_rate": 4.1648126637881745e-06, "loss": 0.01, "step": 8140 }, { "epoch": 0.3541284204351652, "grad_norm": 0.089578777551651, "learning_rate": 4.141976869453123e-06, "loss": 0.0081, "step": 8150 }, { "epoch": 0.3545629338344721, "grad_norm": 0.10986506193876266, "learning_rate": 4.119187488618621e-06, "loss": 0.0084, "step": 8160 }, { "epoch": 0.3549974472337791, "grad_norm": 0.08879537880420685, "learning_rate": 4.096444701846427e-06, "loss": 0.0091, "step": 8170 }, { "epoch": 0.355431960633086, "grad_norm": 0.11045394837856293, "learning_rate": 4.073748689329125e-06, "loss": 0.0098, "step": 8180 }, { "epoch": 0.35586647403239297, "grad_norm": 0.18344798684120178, "learning_rate": 4.051099630888704e-06, "loss": 0.0108, "step": 8190 }, { "epoch": 0.3563009874316999, "grad_norm": 0.07954873144626617, "learning_rate": 4.028497705975139e-06, "loss": 0.0081, "step": 8200 }, { "epoch": 0.35673550083100686, "grad_norm": 0.1363687366247177, "learning_rate": 4.0059430936649645e-06, "loss": 0.0082, "step": 8210 }, { "epoch": 0.3571700142303138, "grad_norm": 0.11748107522726059, "learning_rate": 3.9834359726598415e-06, "loss": 0.0112, "step": 8220 }, { "epoch": 0.3576045276296208, "grad_norm": 0.09678258746862411, "learning_rate": 3.9609765212851694e-06, "loss": 0.0075, "step": 8230 }, { "epoch": 0.35803904102892775, "grad_norm": 0.06474039703607559, "learning_rate": 3.938564917488644e-06, "loss": 0.0072, "step": 8240 }, { "epoch": 0.3584735544282347, "grad_norm": 0.1036151722073555, "learning_rate": 3.916201338838872e-06, "loss": 0.0083, "step": 8250 }, { "epoch": 0.35890806782754164, "grad_norm": 0.11131062358617783, "learning_rate": 3.893885962523954e-06, "loss": 0.0084, "step": 8260 }, { "epoch": 0.3593425812268486, "grad_norm": 0.11692364513874054, "learning_rate": 3.871618965350075e-06, "loss": 0.008, "step": 8270 }, { "epoch": 0.35977709462615554, "grad_norm": 0.07803528755903244, "learning_rate": 3.849400523740102e-06, "loss": 0.0086, "step": 8280 }, { "epoch": 0.3602116080254625, "grad_norm": 0.05871092155575752, "learning_rate": 3.82723081373221e-06, "loss": 0.0071, "step": 8290 }, { "epoch": 0.36064612142476943, "grad_norm": 0.06725364178419113, "learning_rate": 3.805110010978463e-06, "loss": 0.0099, "step": 8300 }, { "epoch": 0.3610806348240764, "grad_norm": 0.09455585479736328, "learning_rate": 3.783038290743427e-06, "loss": 0.0092, "step": 8310 }, { "epoch": 0.3615151482233833, "grad_norm": 0.07518789917230606, "learning_rate": 3.7610158279027796e-06, "loss": 0.0085, "step": 8320 }, { "epoch": 0.36194966162269027, "grad_norm": 0.07231789082288742, "learning_rate": 3.7390427969419395e-06, "loss": 0.0085, "step": 8330 }, { "epoch": 0.36238417502199727, "grad_norm": 0.07863481342792511, "learning_rate": 3.7171193719546726e-06, "loss": 0.0064, "step": 8340 }, { "epoch": 0.3628186884213042, "grad_norm": 0.09734117984771729, "learning_rate": 3.695245726641702e-06, "loss": 0.0093, "step": 8350 }, { "epoch": 0.36325320182061116, "grad_norm": 0.12269584834575653, "learning_rate": 3.6734220343093575e-06, "loss": 0.0077, "step": 8360 }, { "epoch": 0.3636877152199181, "grad_norm": 0.147287979722023, "learning_rate": 3.6516484678681783e-06, "loss": 0.0131, "step": 8370 }, { "epoch": 0.36412222861922505, "grad_norm": 0.06834188103675842, "learning_rate": 3.629925199831552e-06, "loss": 0.0077, "step": 8380 }, { "epoch": 0.364556742018532, "grad_norm": 0.07882396131753922, "learning_rate": 3.6082524023143574e-06, "loss": 0.0075, "step": 8390 }, { "epoch": 0.36499125541783894, "grad_norm": 0.08493094146251678, "learning_rate": 3.5866302470315882e-06, "loss": 0.0088, "step": 8400 }, { "epoch": 0.3654257688171459, "grad_norm": 0.12626749277114868, "learning_rate": 3.565058905296991e-06, "loss": 0.0093, "step": 8410 }, { "epoch": 0.36586028221645284, "grad_norm": 0.1473812311887741, "learning_rate": 3.543538548021723e-06, "loss": 0.0105, "step": 8420 }, { "epoch": 0.3662947956157598, "grad_norm": 0.06739123910665512, "learning_rate": 3.5220693457129775e-06, "loss": 0.0085, "step": 8430 }, { "epoch": 0.36672930901506673, "grad_norm": 0.07891149073839188, "learning_rate": 3.5006514684726545e-06, "loss": 0.0084, "step": 8440 }, { "epoch": 0.36716382241437373, "grad_norm": 0.08155108988285065, "learning_rate": 3.4792850859959903e-06, "loss": 0.0089, "step": 8450 }, { "epoch": 0.3675983358136807, "grad_norm": 0.10823169350624084, "learning_rate": 3.457970367570239e-06, "loss": 0.0096, "step": 8460 }, { "epoch": 0.3680328492129876, "grad_norm": 0.09815893322229385, "learning_rate": 3.4367074820733017e-06, "loss": 0.0067, "step": 8470 }, { "epoch": 0.36846736261229457, "grad_norm": 0.08411363512277603, "learning_rate": 3.415496597972414e-06, "loss": 0.006, "step": 8480 }, { "epoch": 0.3689018760116015, "grad_norm": 0.1024174764752388, "learning_rate": 3.394337883322805e-06, "loss": 0.0107, "step": 8490 }, { "epoch": 0.36933638941090846, "grad_norm": 0.12380263209342957, "learning_rate": 3.373231505766348e-06, "loss": 0.0092, "step": 8500 }, { "epoch": 0.3697709028102154, "grad_norm": 0.07073214650154114, "learning_rate": 3.352177632530251e-06, "loss": 0.009, "step": 8510 }, { "epoch": 0.37020541620952235, "grad_norm": 0.062046460807323456, "learning_rate": 3.3311764304257342e-06, "loss": 0.0073, "step": 8520 }, { "epoch": 0.3706399296088293, "grad_norm": 0.09883429110050201, "learning_rate": 3.3102280658466977e-06, "loss": 0.0101, "step": 8530 }, { "epoch": 0.37107444300813625, "grad_norm": 0.08380346745252609, "learning_rate": 3.2893327047684034e-06, "loss": 0.008, "step": 8540 }, { "epoch": 0.3715089564074432, "grad_norm": 0.13216561079025269, "learning_rate": 3.2684905127461573e-06, "loss": 0.0084, "step": 8550 }, { "epoch": 0.3719434698067502, "grad_norm": 0.08744744956493378, "learning_rate": 3.2477016549140173e-06, "loss": 0.0084, "step": 8560 }, { "epoch": 0.37237798320605714, "grad_norm": 0.09140066802501678, "learning_rate": 3.226966295983466e-06, "loss": 0.0089, "step": 8570 }, { "epoch": 0.3728124966053641, "grad_norm": 0.11554740369319916, "learning_rate": 3.206284600242102e-06, "loss": 0.0084, "step": 8580 }, { "epoch": 0.37324701000467103, "grad_norm": 0.07135514169931412, "learning_rate": 3.185656731552362e-06, "loss": 0.0065, "step": 8590 }, { "epoch": 0.373681523403978, "grad_norm": 0.08268517255783081, "learning_rate": 3.1650828533501943e-06, "loss": 0.009, "step": 8600 }, { "epoch": 0.3741160368032849, "grad_norm": 0.09547808766365051, "learning_rate": 3.144563128643776e-06, "loss": 0.0068, "step": 8610 }, { "epoch": 0.37455055020259187, "grad_norm": 0.10216235369443893, "learning_rate": 3.1240977200122422e-06, "loss": 0.0102, "step": 8620 }, { "epoch": 0.3749850636018988, "grad_norm": 0.08232805877923965, "learning_rate": 3.1036867896043574e-06, "loss": 0.0082, "step": 8630 }, { "epoch": 0.37541957700120576, "grad_norm": 0.08157678693532944, "learning_rate": 3.0833304991372557e-06, "loss": 0.0079, "step": 8640 }, { "epoch": 0.3758540904005127, "grad_norm": 0.09660549461841583, "learning_rate": 3.063029009895162e-06, "loss": 0.0081, "step": 8650 }, { "epoch": 0.37628860379981965, "grad_norm": 0.07229529321193695, "learning_rate": 3.0427824827281062e-06, "loss": 0.0081, "step": 8660 }, { "epoch": 0.3767231171991266, "grad_norm": 0.11833970248699188, "learning_rate": 3.022591078050644e-06, "loss": 0.0094, "step": 8670 }, { "epoch": 0.3771576305984336, "grad_norm": 0.06864393502473831, "learning_rate": 3.0024549558405945e-06, "loss": 0.0118, "step": 8680 }, { "epoch": 0.37759214399774055, "grad_norm": 0.06534848362207413, "learning_rate": 2.982374275637776e-06, "loss": 0.0091, "step": 8690 }, { "epoch": 0.3780266573970475, "grad_norm": 0.06855802237987518, "learning_rate": 2.9623491965427264e-06, "loss": 0.0052, "step": 8700 }, { "epoch": 0.37846117079635444, "grad_norm": 0.1193542554974556, "learning_rate": 2.942379877215461e-06, "loss": 0.0093, "step": 8710 }, { "epoch": 0.3788956841956614, "grad_norm": 0.09538474678993225, "learning_rate": 2.922466475874206e-06, "loss": 0.0097, "step": 8720 }, { "epoch": 0.37933019759496833, "grad_norm": 0.08435144275426865, "learning_rate": 2.90260915029414e-06, "loss": 0.0098, "step": 8730 }, { "epoch": 0.3797647109942753, "grad_norm": 0.15806706249713898, "learning_rate": 2.882808057806149e-06, "loss": 0.0126, "step": 8740 }, { "epoch": 0.3801992243935822, "grad_norm": 0.07213329523801804, "learning_rate": 2.863063355295589e-06, "loss": 0.0074, "step": 8750 }, { "epoch": 0.38063373779288917, "grad_norm": 0.09936099499464035, "learning_rate": 2.8433751992010315e-06, "loss": 0.0078, "step": 8760 }, { "epoch": 0.3810682511921961, "grad_norm": 0.08155147731304169, "learning_rate": 2.8237437455130203e-06, "loss": 0.0074, "step": 8770 }, { "epoch": 0.38150276459150306, "grad_norm": 0.1396973729133606, "learning_rate": 2.8041691497728527e-06, "loss": 0.0111, "step": 8780 }, { "epoch": 0.38193727799081006, "grad_norm": 0.045445725321769714, "learning_rate": 2.784651567071327e-06, "loss": 0.0087, "step": 8790 }, { "epoch": 0.382371791390117, "grad_norm": 0.08116365969181061, "learning_rate": 2.7651911520475316e-06, "loss": 0.0085, "step": 8800 }, { "epoch": 0.38280630478942396, "grad_norm": 0.10875967890024185, "learning_rate": 2.745788058887604e-06, "loss": 0.0082, "step": 8810 }, { "epoch": 0.3832408181887309, "grad_norm": 0.07952379435300827, "learning_rate": 2.7264424413235267e-06, "loss": 0.0069, "step": 8820 }, { "epoch": 0.38367533158803785, "grad_norm": 0.068133145570755, "learning_rate": 2.707154452631889e-06, "loss": 0.0066, "step": 8830 }, { "epoch": 0.3841098449873448, "grad_norm": 0.08167216181755066, "learning_rate": 2.6879242456326827e-06, "loss": 0.0067, "step": 8840 }, { "epoch": 0.38454435838665174, "grad_norm": 0.090968556702137, "learning_rate": 2.6687519726881063e-06, "loss": 0.0072, "step": 8850 }, { "epoch": 0.3849788717859587, "grad_norm": 0.14810791611671448, "learning_rate": 2.649637785701329e-06, "loss": 0.0088, "step": 8860 }, { "epoch": 0.38541338518526563, "grad_norm": 0.047760531306266785, "learning_rate": 2.630581836115301e-06, "loss": 0.0078, "step": 8870 }, { "epoch": 0.3858478985845726, "grad_norm": 0.11700766533613205, "learning_rate": 2.6115842749115604e-06, "loss": 0.0092, "step": 8880 }, { "epoch": 0.3862824119838795, "grad_norm": 0.2241494059562683, "learning_rate": 2.5926452526090305e-06, "loss": 0.0082, "step": 8890 }, { "epoch": 0.3867169253831865, "grad_norm": 0.04898640885949135, "learning_rate": 2.573764919262819e-06, "loss": 0.0067, "step": 8900 }, { "epoch": 0.3871514387824935, "grad_norm": 0.0778077021241188, "learning_rate": 2.5549434244630478e-06, "loss": 0.0089, "step": 8910 }, { "epoch": 0.3875859521818004, "grad_norm": 0.13349191844463348, "learning_rate": 2.536180917333648e-06, "loss": 0.0095, "step": 8920 }, { "epoch": 0.38802046558110737, "grad_norm": 0.0766734853386879, "learning_rate": 2.5174775465311897e-06, "loss": 0.0086, "step": 8930 }, { "epoch": 0.3884549789804143, "grad_norm": 0.09370673447847366, "learning_rate": 2.4988334602437057e-06, "loss": 0.0083, "step": 8940 }, { "epoch": 0.38888949237972126, "grad_norm": 0.06771666556596756, "learning_rate": 2.4802488061895137e-06, "loss": 0.0081, "step": 8950 }, { "epoch": 0.3893240057790282, "grad_norm": 0.08422492444515228, "learning_rate": 2.4617237316160427e-06, "loss": 0.0091, "step": 8960 }, { "epoch": 0.38975851917833515, "grad_norm": 0.08454887568950653, "learning_rate": 2.4432583832986633e-06, "loss": 0.0078, "step": 8970 }, { "epoch": 0.3901930325776421, "grad_norm": 0.07766442745923996, "learning_rate": 2.42485290753955e-06, "loss": 0.0099, "step": 8980 }, { "epoch": 0.39062754597694904, "grad_norm": 0.10074387490749359, "learning_rate": 2.4065074501664863e-06, "loss": 0.0083, "step": 8990 }, { "epoch": 0.391062059376256, "grad_norm": 0.11751067638397217, "learning_rate": 2.3882221565317277e-06, "loss": 0.0065, "step": 9000 }, { "epoch": 0.391496572775563, "grad_norm": 0.07051953673362732, "learning_rate": 2.3699971715108593e-06, "loss": 0.0071, "step": 9010 }, { "epoch": 0.39193108617486994, "grad_norm": 0.0636526346206665, "learning_rate": 2.3518326395016222e-06, "loss": 0.0061, "step": 9020 }, { "epoch": 0.3923655995741769, "grad_norm": 0.0790957361459732, "learning_rate": 2.3337287044227996e-06, "loss": 0.0086, "step": 9030 }, { "epoch": 0.39280011297348383, "grad_norm": 0.06394562870264053, "learning_rate": 2.315685509713046e-06, "loss": 0.0074, "step": 9040 }, { "epoch": 0.3932346263727908, "grad_norm": 0.08622386306524277, "learning_rate": 2.2977031983297817e-06, "loss": 0.0091, "step": 9050 }, { "epoch": 0.3936691397720977, "grad_norm": 0.09151028841733932, "learning_rate": 2.279781912748028e-06, "loss": 0.0076, "step": 9060 }, { "epoch": 0.39410365317140467, "grad_norm": 0.09050817787647247, "learning_rate": 2.2619217949593076e-06, "loss": 0.0096, "step": 9070 }, { "epoch": 0.3945381665707116, "grad_norm": 0.11273612827062607, "learning_rate": 2.2441229864705048e-06, "loss": 0.0098, "step": 9080 }, { "epoch": 0.39497267997001856, "grad_norm": 0.11558801680803299, "learning_rate": 2.226385628302742e-06, "loss": 0.0103, "step": 9090 }, { "epoch": 0.3954071933693255, "grad_norm": 0.0860101580619812, "learning_rate": 2.2087098609902636e-06, "loss": 0.0112, "step": 9100 }, { "epoch": 0.39584170676863245, "grad_norm": 0.07617778331041336, "learning_rate": 2.1910958245793347e-06, "loss": 0.0105, "step": 9110 }, { "epoch": 0.39627622016793945, "grad_norm": 0.09402777254581451, "learning_rate": 2.173543658627121e-06, "loss": 0.0079, "step": 9120 }, { "epoch": 0.3967107335672464, "grad_norm": 0.060731224715709686, "learning_rate": 2.1560535022005766e-06, "loss": 0.0093, "step": 9130 }, { "epoch": 0.39714524696655334, "grad_norm": 0.050082068890333176, "learning_rate": 2.138625493875359e-06, "loss": 0.0078, "step": 9140 }, { "epoch": 0.3975797603658603, "grad_norm": 0.07878375798463821, "learning_rate": 2.1212597717347183e-06, "loss": 0.0094, "step": 9150 }, { "epoch": 0.39801427376516724, "grad_norm": 0.13812611997127533, "learning_rate": 2.1039564733684014e-06, "loss": 0.0091, "step": 9160 }, { "epoch": 0.3984487871644742, "grad_norm": 0.07215043902397156, "learning_rate": 2.0867157358715794e-06, "loss": 0.0117, "step": 9170 }, { "epoch": 0.39888330056378113, "grad_norm": 0.0668293833732605, "learning_rate": 2.0695376958437442e-06, "loss": 0.0084, "step": 9180 }, { "epoch": 0.3993178139630881, "grad_norm": 0.05682065710425377, "learning_rate": 2.0524224893876253e-06, "loss": 0.0091, "step": 9190 }, { "epoch": 0.399752327362395, "grad_norm": 0.08696268498897552, "learning_rate": 2.0353702521081277e-06, "loss": 0.007, "step": 9200 }, { "epoch": 0.40018684076170197, "grad_norm": 0.10888504981994629, "learning_rate": 2.0183811191112436e-06, "loss": 0.006, "step": 9210 }, { "epoch": 0.4006213541610089, "grad_norm": 0.08665753901004791, "learning_rate": 2.001455225002984e-06, "loss": 0.0079, "step": 9220 }, { "epoch": 0.4010558675603159, "grad_norm": 0.11043235659599304, "learning_rate": 1.984592703888313e-06, "loss": 0.0078, "step": 9230 }, { "epoch": 0.40149038095962286, "grad_norm": 0.06967038661241531, "learning_rate": 1.967793689370093e-06, "loss": 0.0065, "step": 9240 }, { "epoch": 0.4019248943589298, "grad_norm": 0.08630409091711044, "learning_rate": 1.95105831454801e-06, "loss": 0.0102, "step": 9250 }, { "epoch": 0.40235940775823675, "grad_norm": 0.07549581676721573, "learning_rate": 1.9343867120175375e-06, "loss": 0.0093, "step": 9260 }, { "epoch": 0.4027939211575437, "grad_norm": 0.10131137073040009, "learning_rate": 1.9177790138688746e-06, "loss": 0.0092, "step": 9270 }, { "epoch": 0.40322843455685065, "grad_norm": 0.071138396859169, "learning_rate": 1.9012353516858984e-06, "loss": 0.0073, "step": 9280 }, { "epoch": 0.4036629479561576, "grad_norm": 0.11135800927877426, "learning_rate": 1.884755856545123e-06, "loss": 0.0074, "step": 9290 }, { "epoch": 0.40409746135546454, "grad_norm": 0.06701090931892395, "learning_rate": 1.8683406590146714e-06, "loss": 0.0075, "step": 9300 }, { "epoch": 0.4045319747547715, "grad_norm": 0.10666152089834213, "learning_rate": 1.8519898891532273e-06, "loss": 0.0071, "step": 9310 }, { "epoch": 0.40496648815407843, "grad_norm": 0.07806232571601868, "learning_rate": 1.8357036765090107e-06, "loss": 0.0069, "step": 9320 }, { "epoch": 0.4054010015533854, "grad_norm": 0.09611749649047852, "learning_rate": 1.8194821501187455e-06, "loss": 0.0081, "step": 9330 }, { "epoch": 0.4058355149526924, "grad_norm": 0.08786460012197495, "learning_rate": 1.8033254385066501e-06, "loss": 0.0056, "step": 9340 }, { "epoch": 0.4062700283519993, "grad_norm": 0.05407165735960007, "learning_rate": 1.7872336696834091e-06, "loss": 0.0088, "step": 9350 }, { "epoch": 0.40670454175130627, "grad_norm": 0.09034055471420288, "learning_rate": 1.7712069711451553e-06, "loss": 0.0082, "step": 9360 }, { "epoch": 0.4071390551506132, "grad_norm": 0.11984706670045853, "learning_rate": 1.7552454698724753e-06, "loss": 0.0089, "step": 9370 }, { "epoch": 0.40757356854992016, "grad_norm": 0.07762929052114487, "learning_rate": 1.7393492923293854e-06, "loss": 0.007, "step": 9380 }, { "epoch": 0.4080080819492271, "grad_norm": 0.08071978390216827, "learning_rate": 1.7235185644623352e-06, "loss": 0.0089, "step": 9390 }, { "epoch": 0.40844259534853405, "grad_norm": 0.07938024401664734, "learning_rate": 1.7077534116992266e-06, "loss": 0.0078, "step": 9400 }, { "epoch": 0.408877108747841, "grad_norm": 0.07711252570152283, "learning_rate": 1.692053958948393e-06, "loss": 0.0091, "step": 9410 }, { "epoch": 0.40931162214714795, "grad_norm": 0.08081989735364914, "learning_rate": 1.6764203305976224e-06, "loss": 0.0083, "step": 9420 }, { "epoch": 0.4097461355464549, "grad_norm": 0.04149989038705826, "learning_rate": 1.6608526505131773e-06, "loss": 0.007, "step": 9430 }, { "epoch": 0.41018064894576184, "grad_norm": 0.08724639564752579, "learning_rate": 1.6453510420388085e-06, "loss": 0.0067, "step": 9440 }, { "epoch": 0.41061516234506884, "grad_norm": 0.07853716611862183, "learning_rate": 1.6299156279947725e-06, "loss": 0.0069, "step": 9450 }, { "epoch": 0.4110496757443758, "grad_norm": 0.08704400062561035, "learning_rate": 1.6145465306768604e-06, "loss": 0.0081, "step": 9460 }, { "epoch": 0.41148418914368273, "grad_norm": 0.0688142478466034, "learning_rate": 1.5992438718554415e-06, "loss": 0.0081, "step": 9470 }, { "epoch": 0.4119187025429897, "grad_norm": 0.08707388490438461, "learning_rate": 1.5840077727744785e-06, "loss": 0.0072, "step": 9480 }, { "epoch": 0.4123532159422966, "grad_norm": 0.07289273291826248, "learning_rate": 1.5688383541505835e-06, "loss": 0.0071, "step": 9490 }, { "epoch": 0.41278772934160357, "grad_norm": 0.08625438064336777, "learning_rate": 1.5537357361720551e-06, "loss": 0.0086, "step": 9500 }, { "epoch": 0.4132222427409105, "grad_norm": 0.07625974714756012, "learning_rate": 1.5387000384979223e-06, "loss": 0.0066, "step": 9510 }, { "epoch": 0.41365675614021746, "grad_norm": 0.06150345876812935, "learning_rate": 1.5237313802569974e-06, "loss": 0.0074, "step": 9520 }, { "epoch": 0.4140912695395244, "grad_norm": 0.049613695591688156, "learning_rate": 1.5088298800469413e-06, "loss": 0.0097, "step": 9530 }, { "epoch": 0.41452578293883136, "grad_norm": 0.12269088625907898, "learning_rate": 1.4939956559333202e-06, "loss": 0.0089, "step": 9540 }, { "epoch": 0.4149602963381383, "grad_norm": 0.12324448674917221, "learning_rate": 1.479228825448654e-06, "loss": 0.0081, "step": 9550 }, { "epoch": 0.4153948097374453, "grad_norm": 0.09652116894721985, "learning_rate": 1.4645295055915154e-06, "loss": 0.0076, "step": 9560 }, { "epoch": 0.41582932313675225, "grad_norm": 0.08091606199741364, "learning_rate": 1.4498978128255691e-06, "loss": 0.0088, "step": 9570 }, { "epoch": 0.4162638365360592, "grad_norm": 0.06057927384972572, "learning_rate": 1.4353338630786817e-06, "loss": 0.0071, "step": 9580 }, { "epoch": 0.41669834993536614, "grad_norm": 0.10036496818065643, "learning_rate": 1.420837771741973e-06, "loss": 0.0091, "step": 9590 }, { "epoch": 0.4171328633346731, "grad_norm": 0.08344124257564545, "learning_rate": 1.4064096536689298e-06, "loss": 0.0088, "step": 9600 }, { "epoch": 0.41756737673398003, "grad_norm": 0.0929148942232132, "learning_rate": 1.3920496231744717e-06, "loss": 0.0065, "step": 9610 }, { "epoch": 0.418001890133287, "grad_norm": 0.14110197126865387, "learning_rate": 1.3777577940340558e-06, "loss": 0.0091, "step": 9620 }, { "epoch": 0.4184364035325939, "grad_norm": 0.08782806992530823, "learning_rate": 1.3635342794827888e-06, "loss": 0.0081, "step": 9630 }, { "epoch": 0.41887091693190087, "grad_norm": 0.09160462021827698, "learning_rate": 1.3493791922145027e-06, "loss": 0.0084, "step": 9640 }, { "epoch": 0.4193054303312078, "grad_norm": 0.09273369610309601, "learning_rate": 1.3352926443808778e-06, "loss": 0.0079, "step": 9650 }, { "epoch": 0.41973994373051476, "grad_norm": 0.09354612231254578, "learning_rate": 1.3212747475905564e-06, "loss": 0.0101, "step": 9660 }, { "epoch": 0.42017445712982177, "grad_norm": 0.13009706139564514, "learning_rate": 1.3073256129082534e-06, "loss": 0.0083, "step": 9670 }, { "epoch": 0.4206089705291287, "grad_norm": 0.07470608502626419, "learning_rate": 1.2934453508538746e-06, "loss": 0.0062, "step": 9680 }, { "epoch": 0.42104348392843566, "grad_norm": 0.07935813069343567, "learning_rate": 1.2796340714016419e-06, "loss": 0.0062, "step": 9690 }, { "epoch": 0.4214779973277426, "grad_norm": 0.1074632778763771, "learning_rate": 1.26589188397923e-06, "loss": 0.0088, "step": 9700 }, { "epoch": 0.42191251072704955, "grad_norm": 0.09066551923751831, "learning_rate": 1.2522188974668847e-06, "loss": 0.0083, "step": 9710 }, { "epoch": 0.4223470241263565, "grad_norm": 0.09772542119026184, "learning_rate": 1.2386152201965763e-06, "loss": 0.0086, "step": 9720 }, { "epoch": 0.42278153752566344, "grad_norm": 0.10772576928138733, "learning_rate": 1.2250809599511293e-06, "loss": 0.0075, "step": 9730 }, { "epoch": 0.4232160509249704, "grad_norm": 0.07655716687440872, "learning_rate": 1.2116162239633734e-06, "loss": 0.0079, "step": 9740 }, { "epoch": 0.42365056432427733, "grad_norm": 0.11319755017757416, "learning_rate": 1.198221118915287e-06, "loss": 0.0095, "step": 9750 }, { "epoch": 0.4240850777235843, "grad_norm": 0.1037856787443161, "learning_rate": 1.1848957509371739e-06, "loss": 0.0101, "step": 9760 }, { "epoch": 0.4245195911228912, "grad_norm": 0.09453088045120239, "learning_rate": 1.1716402256067905e-06, "loss": 0.0085, "step": 9770 }, { "epoch": 0.42495410452219823, "grad_norm": 0.12351549416780472, "learning_rate": 1.1584546479485316e-06, "loss": 0.0087, "step": 9780 }, { "epoch": 0.4253886179215052, "grad_norm": 0.10265389829874039, "learning_rate": 1.1453391224325928e-06, "loss": 0.0079, "step": 9790 }, { "epoch": 0.4258231313208121, "grad_norm": 0.14251692593097687, "learning_rate": 1.1322937529741384e-06, "loss": 0.0078, "step": 9800 }, { "epoch": 0.42625764472011907, "grad_norm": 0.1251758188009262, "learning_rate": 1.1193186429324887e-06, "loss": 0.0096, "step": 9810 }, { "epoch": 0.426692158119426, "grad_norm": 0.17213517427444458, "learning_rate": 1.1064138951102843e-06, "loss": 0.0116, "step": 9820 }, { "epoch": 0.42712667151873296, "grad_norm": 0.10503554344177246, "learning_rate": 1.093579611752692e-06, "loss": 0.0084, "step": 9830 }, { "epoch": 0.4275611849180399, "grad_norm": 0.11157539486885071, "learning_rate": 1.080815894546574e-06, "loss": 0.0063, "step": 9840 }, { "epoch": 0.42799569831734685, "grad_norm": 0.10868073999881744, "learning_rate": 1.0681228446196978e-06, "loss": 0.0072, "step": 9850 }, { "epoch": 0.4284302117166538, "grad_norm": 0.08796529471874237, "learning_rate": 1.0555005625399316e-06, "loss": 0.0092, "step": 9860 }, { "epoch": 0.42886472511596074, "grad_norm": 0.0735132247209549, "learning_rate": 1.0429491483144394e-06, "loss": 0.0074, "step": 9870 }, { "epoch": 0.4292992385152677, "grad_norm": 0.07387060672044754, "learning_rate": 1.030468701388896e-06, "loss": 0.0061, "step": 9880 }, { "epoch": 0.4297337519145747, "grad_norm": 0.07616886496543884, "learning_rate": 1.0180593206467015e-06, "loss": 0.0081, "step": 9890 }, { "epoch": 0.43016826531388164, "grad_norm": 0.09123454242944717, "learning_rate": 1.0057211044081916e-06, "loss": 0.0068, "step": 9900 }, { "epoch": 0.4306027787131886, "grad_norm": 0.059252478182315826, "learning_rate": 9.934541504298589e-07, "loss": 0.0098, "step": 9910 }, { "epoch": 0.43103729211249553, "grad_norm": 0.08855234086513519, "learning_rate": 9.812585559035848e-07, "loss": 0.0094, "step": 9920 }, { "epoch": 0.4314718055118025, "grad_norm": 0.060283273458480835, "learning_rate": 9.691344174558615e-07, "loss": 0.009, "step": 9930 }, { "epoch": 0.4319063189111094, "grad_norm": 0.07556108385324478, "learning_rate": 9.570818311470298e-07, "loss": 0.0085, "step": 9940 }, { "epoch": 0.43234083231041637, "grad_norm": 0.11125895380973816, "learning_rate": 9.451008924705196e-07, "loss": 0.0076, "step": 9950 }, { "epoch": 0.4327753457097233, "grad_norm": 0.09853236377239227, "learning_rate": 9.331916963520959e-07, "loss": 0.0092, "step": 9960 }, { "epoch": 0.43320985910903026, "grad_norm": 0.06747264415025711, "learning_rate": 9.213543371490963e-07, "loss": 0.0085, "step": 9970 }, { "epoch": 0.4336443725083372, "grad_norm": 0.07546751201152802, "learning_rate": 9.095889086496867e-07, "loss": 0.0094, "step": 9980 }, { "epoch": 0.43407888590764415, "grad_norm": 0.11386007070541382, "learning_rate": 8.978955040721371e-07, "loss": 0.0083, "step": 9990 }, { "epoch": 0.43451339930695115, "grad_norm": 0.08858129382133484, "learning_rate": 8.862742160640525e-07, "loss": 0.0092, "step": 10000 }, { "epoch": 0.4349479127062581, "grad_norm": 0.05357801169157028, "learning_rate": 8.747251367016552e-07, "loss": 0.0093, "step": 10010 }, { "epoch": 0.43538242610556505, "grad_norm": 0.0597519688308239, "learning_rate": 8.632483574890615e-07, "loss": 0.0074, "step": 10020 }, { "epoch": 0.435816939504872, "grad_norm": 0.043685659766197205, "learning_rate": 8.518439693575408e-07, "loss": 0.0073, "step": 10030 }, { "epoch": 0.43625145290417894, "grad_norm": 0.09410804510116577, "learning_rate": 8.405120626648067e-07, "loss": 0.0115, "step": 10040 }, { "epoch": 0.4366859663034859, "grad_norm": 0.07961921393871307, "learning_rate": 8.292527271942996e-07, "loss": 0.0067, "step": 10050 }, { "epoch": 0.43712047970279283, "grad_norm": 0.08737465739250183, "learning_rate": 8.180660521544692e-07, "loss": 0.0102, "step": 10060 }, { "epoch": 0.4375549931020998, "grad_norm": 0.0933104082942009, "learning_rate": 8.069521261780733e-07, "loss": 0.008, "step": 10070 }, { "epoch": 0.4379895065014067, "grad_norm": 0.09345684945583344, "learning_rate": 7.959110373214751e-07, "loss": 0.0092, "step": 10080 }, { "epoch": 0.43842401990071367, "grad_norm": 0.057416338473558426, "learning_rate": 7.849428730639463e-07, "loss": 0.0063, "step": 10090 }, { "epoch": 0.4388585333000206, "grad_norm": 0.10068265348672867, "learning_rate": 7.740477203069674e-07, "loss": 0.007, "step": 10100 }, { "epoch": 0.4392930466993276, "grad_norm": 0.05323368310928345, "learning_rate": 7.63225665373546e-07, "loss": 0.0093, "step": 10110 }, { "epoch": 0.43972756009863456, "grad_norm": 0.06257743388414383, "learning_rate": 7.524767940075329e-07, "loss": 0.0086, "step": 10120 }, { "epoch": 0.4401620734979415, "grad_norm": 0.06518563628196716, "learning_rate": 7.418011913729406e-07, "loss": 0.0069, "step": 10130 }, { "epoch": 0.44059658689724845, "grad_norm": 0.10690577328205109, "learning_rate": 7.311989420532639e-07, "loss": 0.0084, "step": 10140 }, { "epoch": 0.4410311002965554, "grad_norm": 0.09482588618993759, "learning_rate": 7.206701300508212e-07, "loss": 0.0087, "step": 10150 }, { "epoch": 0.44146561369586235, "grad_norm": 0.0618489608168602, "learning_rate": 7.102148387860764e-07, "loss": 0.0068, "step": 10160 }, { "epoch": 0.4419001270951693, "grad_norm": 0.07976125925779343, "learning_rate": 6.998331510969869e-07, "loss": 0.0102, "step": 10170 }, { "epoch": 0.44233464049447624, "grad_norm": 0.07916069775819778, "learning_rate": 6.895251492383426e-07, "loss": 0.0082, "step": 10180 }, { "epoch": 0.4427691538937832, "grad_norm": 0.08093802630901337, "learning_rate": 6.7929091488112e-07, "loss": 0.0071, "step": 10190 }, { "epoch": 0.44320366729309013, "grad_norm": 0.0789642184972763, "learning_rate": 6.691305291118234e-07, "loss": 0.0068, "step": 10200 }, { "epoch": 0.4436381806923971, "grad_norm": 0.10139094293117523, "learning_rate": 6.59044072431857e-07, "loss": 0.0074, "step": 10210 }, { "epoch": 0.4440726940917041, "grad_norm": 0.06098568066954613, "learning_rate": 6.490316247568762e-07, "loss": 0.0071, "step": 10220 }, { "epoch": 0.444507207491011, "grad_norm": 0.1078798696398735, "learning_rate": 6.390932654161596e-07, "loss": 0.0093, "step": 10230 }, { "epoch": 0.44494172089031797, "grad_norm": 0.072936050593853, "learning_rate": 6.292290731519757e-07, "loss": 0.0067, "step": 10240 }, { "epoch": 0.4453762342896249, "grad_norm": 0.10374324023723602, "learning_rate": 6.194391261189703e-07, "loss": 0.006, "step": 10250 }, { "epoch": 0.44581074768893186, "grad_norm": 0.13051003217697144, "learning_rate": 6.097235018835279e-07, "loss": 0.0073, "step": 10260 }, { "epoch": 0.4462452610882388, "grad_norm": 0.12584161758422852, "learning_rate": 6.000822774231796e-07, "loss": 0.0105, "step": 10270 }, { "epoch": 0.44667977448754576, "grad_norm": 0.09230499714612961, "learning_rate": 5.905155291259768e-07, "loss": 0.008, "step": 10280 }, { "epoch": 0.4471142878868527, "grad_norm": 0.07139464467763901, "learning_rate": 5.810233327898929e-07, "loss": 0.0079, "step": 10290 }, { "epoch": 0.44754880128615965, "grad_norm": 0.09752029180526733, "learning_rate": 5.716057636222172e-07, "loss": 0.0075, "step": 10300 }, { "epoch": 0.4479833146854666, "grad_norm": 0.0810999721288681, "learning_rate": 5.622628962389687e-07, "loss": 0.0072, "step": 10310 }, { "epoch": 0.44841782808477354, "grad_norm": 0.10258316993713379, "learning_rate": 5.529948046642985e-07, "loss": 0.0063, "step": 10320 }, { "epoch": 0.44885234148408054, "grad_norm": 0.08662743121385574, "learning_rate": 5.43801562329902e-07, "loss": 0.007, "step": 10330 }, { "epoch": 0.4492868548833875, "grad_norm": 0.0792480930685997, "learning_rate": 5.346832420744363e-07, "loss": 0.0069, "step": 10340 }, { "epoch": 0.44972136828269443, "grad_norm": 0.0747712031006813, "learning_rate": 5.256399161429515e-07, "loss": 0.0088, "step": 10350 }, { "epoch": 0.4501558816820014, "grad_norm": 0.06478843092918396, "learning_rate": 5.166716561863128e-07, "loss": 0.0093, "step": 10360 }, { "epoch": 0.4505903950813083, "grad_norm": 0.09518998861312866, "learning_rate": 5.077785332606266e-07, "loss": 0.0061, "step": 10370 }, { "epoch": 0.45102490848061527, "grad_norm": 0.12032055854797363, "learning_rate": 4.989606178266914e-07, "loss": 0.0076, "step": 10380 }, { "epoch": 0.4514594218799222, "grad_norm": 0.10467323660850525, "learning_rate": 4.902179797494255e-07, "loss": 0.0076, "step": 10390 }, { "epoch": 0.45189393527922916, "grad_norm": 0.11998245865106583, "learning_rate": 4.815506882973242e-07, "loss": 0.0099, "step": 10400 }, { "epoch": 0.4523284486785361, "grad_norm": 0.06274235248565674, "learning_rate": 4.7295881214190486e-07, "loss": 0.0101, "step": 10410 }, { "epoch": 0.45276296207784306, "grad_norm": 0.07720432430505753, "learning_rate": 4.644424193571628e-07, "loss": 0.007, "step": 10420 }, { "epoch": 0.45319747547715, "grad_norm": 0.07087533175945282, "learning_rate": 4.5600157741903626e-07, "loss": 0.0072, "step": 10430 }, { "epoch": 0.453631988876457, "grad_norm": 0.1298869401216507, "learning_rate": 4.4763635320486663e-07, "loss": 0.0111, "step": 10440 }, { "epoch": 0.45406650227576395, "grad_norm": 0.09260991215705872, "learning_rate": 4.3934681299287683e-07, "loss": 0.0086, "step": 10450 }, { "epoch": 0.4545010156750709, "grad_norm": 0.1423112154006958, "learning_rate": 4.311330224616328e-07, "loss": 0.0094, "step": 10460 }, { "epoch": 0.45493552907437784, "grad_norm": 0.09400004893541336, "learning_rate": 4.2299504668953383e-07, "loss": 0.0073, "step": 10470 }, { "epoch": 0.4553700424736848, "grad_norm": 0.06269126385450363, "learning_rate": 4.1493295015429645e-07, "loss": 0.0083, "step": 10480 }, { "epoch": 0.45580455587299173, "grad_norm": 0.07868027687072754, "learning_rate": 4.0694679673243807e-07, "loss": 0.0069, "step": 10490 }, { "epoch": 0.4562390692722987, "grad_norm": 0.06967227905988693, "learning_rate": 3.990366496987741e-07, "loss": 0.0078, "step": 10500 }, { "epoch": 0.4566735826716056, "grad_norm": 0.0847291499376297, "learning_rate": 3.912025717259194e-07, "loss": 0.0079, "step": 10510 }, { "epoch": 0.4571080960709126, "grad_norm": 0.0945800244808197, "learning_rate": 3.834446248837853e-07, "loss": 0.0076, "step": 10520 }, { "epoch": 0.4575426094702195, "grad_norm": 0.06890492141246796, "learning_rate": 3.7576287063909034e-07, "loss": 0.0113, "step": 10530 }, { "epoch": 0.45797712286952647, "grad_norm": 0.08804440498352051, "learning_rate": 3.681573698548779e-07, "loss": 0.0075, "step": 10540 }, { "epoch": 0.45841163626883347, "grad_norm": 0.07646997272968292, "learning_rate": 3.606281827900282e-07, "loss": 0.0104, "step": 10550 }, { "epoch": 0.4588461496681404, "grad_norm": 0.061834532767534256, "learning_rate": 3.531753690987816e-07, "loss": 0.008, "step": 10560 }, { "epoch": 0.45928066306744736, "grad_norm": 0.07643786072731018, "learning_rate": 3.4579898783027145e-07, "loss": 0.0087, "step": 10570 }, { "epoch": 0.4597151764667543, "grad_norm": 0.09748928993940353, "learning_rate": 3.3849909742804553e-07, "loss": 0.0064, "step": 10580 }, { "epoch": 0.46014968986606125, "grad_norm": 0.09467586874961853, "learning_rate": 3.3127575572961755e-07, "loss": 0.0088, "step": 10590 }, { "epoch": 0.4605842032653682, "grad_norm": 0.10658750683069229, "learning_rate": 3.2412901996599075e-07, "loss": 0.0082, "step": 10600 }, { "epoch": 0.46101871666467514, "grad_norm": 0.08129631727933884, "learning_rate": 3.170589467612262e-07, "loss": 0.0072, "step": 10610 }, { "epoch": 0.4614532300639821, "grad_norm": 0.09332036226987839, "learning_rate": 3.100655921319706e-07, "loss": 0.0094, "step": 10620 }, { "epoch": 0.46188774346328904, "grad_norm": 0.09338422864675522, "learning_rate": 3.03149011487035e-07, "loss": 0.0066, "step": 10630 }, { "epoch": 0.462322256862596, "grad_norm": 0.06272900104522705, "learning_rate": 2.96309259626939e-07, "loss": 0.0114, "step": 10640 }, { "epoch": 0.4627567702619029, "grad_norm": 0.1304650604724884, "learning_rate": 2.895463907434837e-07, "loss": 0.0089, "step": 10650 }, { "epoch": 0.46319128366120993, "grad_norm": 0.06198961287736893, "learning_rate": 2.8286045841932064e-07, "loss": 0.0079, "step": 10660 }, { "epoch": 0.4636257970605169, "grad_norm": 0.0889345034956932, "learning_rate": 2.762515156275303e-07, "loss": 0.0072, "step": 10670 }, { "epoch": 0.4640603104598238, "grad_norm": 0.07997926324605942, "learning_rate": 2.697196147311987e-07, "loss": 0.009, "step": 10680 }, { "epoch": 0.46449482385913077, "grad_norm": 0.09037624299526215, "learning_rate": 2.6326480748300467e-07, "loss": 0.0084, "step": 10690 }, { "epoch": 0.4649293372584377, "grad_norm": 0.08070208877325058, "learning_rate": 2.5688714502480783e-07, "loss": 0.0098, "step": 10700 }, { "epoch": 0.46536385065774466, "grad_norm": 0.06974229961633682, "learning_rate": 2.5058667788724566e-07, "loss": 0.0076, "step": 10710 }, { "epoch": 0.4657983640570516, "grad_norm": 0.0585511177778244, "learning_rate": 2.4436345598932933e-07, "loss": 0.0091, "step": 10720 }, { "epoch": 0.46623287745635855, "grad_norm": 0.04711228981614113, "learning_rate": 2.3821752863805502e-07, "loss": 0.0074, "step": 10730 }, { "epoch": 0.4666673908556655, "grad_norm": 0.10080069303512573, "learning_rate": 2.3214894452800784e-07, "loss": 0.0065, "step": 10740 }, { "epoch": 0.46710190425497244, "grad_norm": 0.0889425128698349, "learning_rate": 2.2615775174097633e-07, "loss": 0.0083, "step": 10750 }, { "epoch": 0.4675364176542794, "grad_norm": 0.05146535485982895, "learning_rate": 2.2024399774556948e-07, "loss": 0.0055, "step": 10760 }, { "epoch": 0.4679709310535864, "grad_norm": 0.10434900969266891, "learning_rate": 2.1440772939685272e-07, "loss": 0.0071, "step": 10770 }, { "epoch": 0.46840544445289334, "grad_norm": 0.1268317848443985, "learning_rate": 2.086489929359603e-07, "loss": 0.0065, "step": 10780 }, { "epoch": 0.4688399578522003, "grad_norm": 0.1149526908993721, "learning_rate": 2.0296783398973452e-07, "loss": 0.0075, "step": 10790 }, { "epoch": 0.46927447125150723, "grad_norm": 0.07514665275812149, "learning_rate": 1.973642975703738e-07, "loss": 0.0074, "step": 10800 }, { "epoch": 0.4697089846508142, "grad_norm": 0.0836552307009697, "learning_rate": 1.918384280750618e-07, "loss": 0.0055, "step": 10810 }, { "epoch": 0.4701434980501211, "grad_norm": 0.14539428055286407, "learning_rate": 1.8639026928562453e-07, "loss": 0.0099, "step": 10820 }, { "epoch": 0.47057801144942807, "grad_norm": 0.10615331679582596, "learning_rate": 1.810198643681793e-07, "loss": 0.0062, "step": 10830 }, { "epoch": 0.471012524848735, "grad_norm": 0.0767487958073616, "learning_rate": 1.7572725587279738e-07, "loss": 0.0084, "step": 10840 }, { "epoch": 0.47144703824804196, "grad_norm": 0.11197537183761597, "learning_rate": 1.7051248573316083e-07, "loss": 0.0091, "step": 10850 }, { "epoch": 0.4718815516473489, "grad_norm": 0.08262715488672256, "learning_rate": 1.6537559526623614e-07, "loss": 0.0077, "step": 10860 }, { "epoch": 0.47231606504665585, "grad_norm": 0.0603024885058403, "learning_rate": 1.6031662517194235e-07, "loss": 0.0086, "step": 10870 }, { "epoch": 0.47275057844596285, "grad_norm": 0.02666429989039898, "learning_rate": 1.5533561553282895e-07, "loss": 0.0054, "step": 10880 }, { "epoch": 0.4731850918452698, "grad_norm": 0.10526155680418015, "learning_rate": 1.5043260581376285e-07, "loss": 0.0078, "step": 10890 }, { "epoch": 0.47361960524457675, "grad_norm": 0.0845399722456932, "learning_rate": 1.4560763486160868e-07, "loss": 0.0076, "step": 10900 }, { "epoch": 0.4740541186438837, "grad_norm": 0.09399423003196716, "learning_rate": 1.4086074090493007e-07, "loss": 0.007, "step": 10910 }, { "epoch": 0.47448863204319064, "grad_norm": 0.07824098318815231, "learning_rate": 1.3619196155367664e-07, "loss": 0.0076, "step": 10920 }, { "epoch": 0.4749231454424976, "grad_norm": 0.0887395516037941, "learning_rate": 1.3160133379889305e-07, "loss": 0.0077, "step": 10930 }, { "epoch": 0.47535765884180453, "grad_norm": 0.05440656468272209, "learning_rate": 1.2708889401242263e-07, "loss": 0.0081, "step": 10940 }, { "epoch": 0.4757921722411115, "grad_norm": 0.11987116187810898, "learning_rate": 1.22654677946622e-07, "loss": 0.0094, "step": 10950 }, { "epoch": 0.4762266856404184, "grad_norm": 0.09870567172765732, "learning_rate": 1.1829872073407467e-07, "loss": 0.0064, "step": 10960 }, { "epoch": 0.47666119903972537, "grad_norm": 0.07806263118982315, "learning_rate": 1.1402105688731568e-07, "loss": 0.0075, "step": 10970 }, { "epoch": 0.4770957124390323, "grad_norm": 0.0932944044470787, "learning_rate": 1.0982172029855409e-07, "loss": 0.0098, "step": 10980 }, { "epoch": 0.4775302258383393, "grad_norm": 0.09490207582712173, "learning_rate": 1.0570074423940758e-07, "loss": 0.0069, "step": 10990 }, { "epoch": 0.47796473923764626, "grad_norm": 0.10747203975915909, "learning_rate": 1.0165816136064266e-07, "loss": 0.0095, "step": 11000 }, { "epoch": 0.4783992526369532, "grad_norm": 0.10749585181474686, "learning_rate": 9.769400369190496e-08, "loss": 0.0083, "step": 11010 }, { "epoch": 0.47883376603626016, "grad_norm": 0.09229978919029236, "learning_rate": 9.3808302641476e-08, "loss": 0.0082, "step": 11020 }, { "epoch": 0.4792682794355671, "grad_norm": 0.05347978696227074, "learning_rate": 9.000108899602011e-08, "loss": 0.0076, "step": 11030 }, { "epoch": 0.47970279283487405, "grad_norm": 0.08263083547353745, "learning_rate": 8.627239292033907e-08, "loss": 0.0067, "step": 11040 }, { "epoch": 0.480137306234181, "grad_norm": 0.12910178303718567, "learning_rate": 8.262224395713559e-08, "loss": 0.0099, "step": 11050 }, { "epoch": 0.48057181963348794, "grad_norm": 0.0963844284415245, "learning_rate": 7.905067102678021e-08, "loss": 0.0067, "step": 11060 }, { "epoch": 0.4810063330327949, "grad_norm": 0.07919830083847046, "learning_rate": 7.555770242707705e-08, "loss": 0.0068, "step": 11070 }, { "epoch": 0.48144084643210183, "grad_norm": 0.12025429308414459, "learning_rate": 7.214336583304616e-08, "loss": 0.0086, "step": 11080 }, { "epoch": 0.4818753598314088, "grad_norm": 0.08870609104633331, "learning_rate": 6.880768829670036e-08, "loss": 0.0087, "step": 11090 }, { "epoch": 0.4823098732307158, "grad_norm": 0.08957424014806747, "learning_rate": 6.555069624682997e-08, "loss": 0.0079, "step": 11100 }, { "epoch": 0.4827443866300227, "grad_norm": 0.0865994244813919, "learning_rate": 6.237241548879613e-08, "loss": 0.0073, "step": 11110 }, { "epoch": 0.48317890002932967, "grad_norm": 0.10729099810123444, "learning_rate": 5.9272871204324457e-08, "loss": 0.0089, "step": 11120 }, { "epoch": 0.4836134134286366, "grad_norm": 0.12580542266368866, "learning_rate": 5.625208795130954e-08, "loss": 0.0079, "step": 11130 }, { "epoch": 0.48404792682794356, "grad_norm": 0.09476730972528458, "learning_rate": 5.3310089663611844e-08, "loss": 0.0071, "step": 11140 }, { "epoch": 0.4844824402272505, "grad_norm": 0.051090896129608154, "learning_rate": 5.04468996508789e-08, "loss": 0.0121, "step": 11150 }, { "epoch": 0.48491695362655746, "grad_norm": 0.09190193563699722, "learning_rate": 4.766254059835107e-08, "loss": 0.0066, "step": 11160 }, { "epoch": 0.4853514670258644, "grad_norm": 0.08857838064432144, "learning_rate": 4.4957034566687205e-08, "loss": 0.0091, "step": 11170 }, { "epoch": 0.48578598042517135, "grad_norm": 0.0357205905020237, "learning_rate": 4.2330402991789255e-08, "loss": 0.0064, "step": 11180 }, { "epoch": 0.4862204938244783, "grad_norm": 0.10596976429224014, "learning_rate": 3.9782666684631266e-08, "loss": 0.0096, "step": 11190 }, { "epoch": 0.48665500722378524, "grad_norm": 0.0852462649345398, "learning_rate": 3.7313845831093984e-08, "loss": 0.0096, "step": 11200 }, { "epoch": 0.4870895206230922, "grad_norm": 0.07427710294723511, "learning_rate": 3.492395999180609e-08, "loss": 0.008, "step": 11210 }, { "epoch": 0.4875240340223992, "grad_norm": 0.10507290810346603, "learning_rate": 3.261302810198985e-08, "loss": 0.0095, "step": 11220 }, { "epoch": 0.48795854742170613, "grad_norm": 0.0772646963596344, "learning_rate": 3.038106847131128e-08, "loss": 0.0066, "step": 11230 }, { "epoch": 0.4883930608210131, "grad_norm": 0.07355330139398575, "learning_rate": 2.822809878373134e-08, "loss": 0.0091, "step": 11240 }, { "epoch": 0.48882757422032, "grad_norm": 0.06781157851219177, "learning_rate": 2.6154136097369386e-08, "loss": 0.0061, "step": 11250 }, { "epoch": 0.489262087619627, "grad_norm": 0.11774353682994843, "learning_rate": 2.415919684436774e-08, "loss": 0.0084, "step": 11260 }, { "epoch": 0.4896966010189339, "grad_norm": 0.10667645186185837, "learning_rate": 2.224329683076065e-08, "loss": 0.0094, "step": 11270 }, { "epoch": 0.49013111441824087, "grad_norm": 0.09955430775880814, "learning_rate": 2.0406451236349988e-08, "loss": 0.007, "step": 11280 }, { "epoch": 0.4905656278175478, "grad_norm": 0.07685834914445877, "learning_rate": 1.8648674614583084e-08, "loss": 0.0078, "step": 11290 }, { "epoch": 0.49100014121685476, "grad_norm": 0.08372903615236282, "learning_rate": 1.6969980892439508e-08, "loss": 0.0076, "step": 11300 }, { "epoch": 0.4914346546161617, "grad_norm": 0.07744579017162323, "learning_rate": 1.537038337031782e-08, "loss": 0.0085, "step": 11310 }, { "epoch": 0.49186916801546865, "grad_norm": 0.13699276745319366, "learning_rate": 1.3849894721935653e-08, "loss": 0.0084, "step": 11320 }, { "epoch": 0.49230368141477565, "grad_norm": 0.10157019644975662, "learning_rate": 1.2408526994223125e-08, "loss": 0.0073, "step": 11330 }, { "epoch": 0.4927381948140826, "grad_norm": 0.09220637381076813, "learning_rate": 1.1046291607231807e-08, "loss": 0.008, "step": 11340 }, { "epoch": 0.49317270821338954, "grad_norm": 0.06238904967904091, "learning_rate": 9.763199354041463e-09, "loss": 0.0083, "step": 11350 }, { "epoch": 0.4936072216126965, "grad_norm": 0.069469153881073, "learning_rate": 8.559260400674564e-09, "loss": 0.008, "step": 11360 }, { "epoch": 0.49404173501200344, "grad_norm": 0.08541562408208847, "learning_rate": 7.434484286020782e-09, "loss": 0.0078, "step": 11370 }, { "epoch": 0.4944762484113104, "grad_norm": 0.05740821361541748, "learning_rate": 6.388879921753743e-09, "loss": 0.0076, "step": 11380 }, { "epoch": 0.49491076181061733, "grad_norm": 0.20525012910366058, "learning_rate": 5.422455592264397e-09, "loss": 0.008, "step": 11390 }, { "epoch": 0.4953452752099243, "grad_norm": 0.09683717787265778, "learning_rate": 4.535218954596632e-09, "loss": 0.0081, "step": 11400 }, { "epoch": 0.4957797886092312, "grad_norm": 0.12019814550876617, "learning_rate": 3.727177038385099e-09, "loss": 0.0078, "step": 11410 }, { "epoch": 0.49621430200853817, "grad_norm": 0.12986373901367188, "learning_rate": 2.998336245797484e-09, "loss": 0.007, "step": 11420 }, { "epoch": 0.4966488154078451, "grad_norm": 0.08595575392246246, "learning_rate": 2.348702351487875e-09, "loss": 0.0074, "step": 11430 }, { "epoch": 0.4970833288071521, "grad_norm": 0.06685249507427216, "learning_rate": 1.778280502546803e-09, "loss": 0.0087, "step": 11440 }, { "epoch": 0.49751784220645906, "grad_norm": 0.0984988659620285, "learning_rate": 1.2870752184657165e-09, "loss": 0.0066, "step": 11450 }, { "epoch": 0.497952355605766, "grad_norm": 0.08850094676017761, "learning_rate": 8.750903910959008e-10, "loss": 0.0093, "step": 11460 }, { "epoch": 0.49838686900507295, "grad_norm": 0.08885602653026581, "learning_rate": 5.423292846196138e-10, "loss": 0.0084, "step": 11470 }, { "epoch": 0.4988213824043799, "grad_norm": 0.08707667142152786, "learning_rate": 2.8879453552455026e-10, "loss": 0.009, "step": 11480 }, { "epoch": 0.49925589580368684, "grad_norm": 0.07700314372777939, "learning_rate": 1.1448815258385815e-10, "loss": 0.0065, "step": 11490 }, { "epoch": 0.4996904092029938, "grad_norm": 0.08865564316511154, "learning_rate": 1.9411516838374612e-11, "loss": 0.0067, "step": 11500 } ], "logging_steps": 10, "max_steps": 11507, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.2555120144411525e+19, "train_batch_size": 1, "trial_name": null, "trial_params": null }