{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.4999858203495, "eval_steps": 500, "global_step": 11019, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00045374881599918323, "grad_norm": 7.252313137054443, "learning_rate": 6.042296072507553e-07, "loss": 0.6, "step": 10 }, { "epoch": 0.0009074976319983665, "grad_norm": 5.196468830108643, "learning_rate": 1.2084592145015106e-06, "loss": 0.5496, "step": 20 }, { "epoch": 0.0013612464479975498, "grad_norm": 2.6206696033477783, "learning_rate": 1.8126888217522659e-06, "loss": 0.4653, "step": 30 }, { "epoch": 0.001814995263996733, "grad_norm": 1.8120253086090088, "learning_rate": 2.4169184290030213e-06, "loss": 0.3081, "step": 40 }, { "epoch": 0.002268744079995916, "grad_norm": 1.6768836975097656, "learning_rate": 3.0211480362537765e-06, "loss": 0.1997, "step": 50 }, { "epoch": 0.0027224928959950997, "grad_norm": 1.3571159839630127, "learning_rate": 3.6253776435045317e-06, "loss": 0.1518, "step": 60 }, { "epoch": 0.0031762417119942828, "grad_norm": 1.4011030197143555, "learning_rate": 4.229607250755287e-06, "loss": 0.1352, "step": 70 }, { "epoch": 0.003629990527993466, "grad_norm": 1.4001766443252563, "learning_rate": 4.833836858006043e-06, "loss": 0.1122, "step": 80 }, { "epoch": 0.004083739343992649, "grad_norm": 1.524217128753662, "learning_rate": 5.438066465256799e-06, "loss": 0.1008, "step": 90 }, { "epoch": 0.004537488159991832, "grad_norm": 0.9944186806678772, "learning_rate": 6.042296072507553e-06, "loss": 0.0858, "step": 100 }, { "epoch": 0.004991236975991016, "grad_norm": 1.0692752599716187, "learning_rate": 6.646525679758309e-06, "loss": 0.104, "step": 110 }, { "epoch": 0.005444985791990199, "grad_norm": 0.9209011793136597, "learning_rate": 7.2507552870090635e-06, "loss": 0.0802, "step": 120 }, { "epoch": 0.005898734607989382, "grad_norm": 1.146437168121338, "learning_rate": 7.85498489425982e-06, "loss": 0.0775, "step": 130 }, { "epoch": 0.0063524834239885655, "grad_norm": 0.8163508176803589, "learning_rate": 8.459214501510575e-06, "loss": 0.0863, "step": 140 }, { "epoch": 0.006806232239987749, "grad_norm": 0.8391405344009399, "learning_rate": 9.06344410876133e-06, "loss": 0.0724, "step": 150 }, { "epoch": 0.007259981055986932, "grad_norm": 0.908923864364624, "learning_rate": 9.667673716012085e-06, "loss": 0.0718, "step": 160 }, { "epoch": 0.007713729871986115, "grad_norm": 0.852513313293457, "learning_rate": 1.0271903323262842e-05, "loss": 0.0579, "step": 170 }, { "epoch": 0.008167478687985298, "grad_norm": 0.7151938676834106, "learning_rate": 1.0876132930513597e-05, "loss": 0.0628, "step": 180 }, { "epoch": 0.008621227503984481, "grad_norm": 0.763804018497467, "learning_rate": 1.1480362537764351e-05, "loss": 0.0727, "step": 190 }, { "epoch": 0.009074976319983665, "grad_norm": 0.74290931224823, "learning_rate": 1.2084592145015106e-05, "loss": 0.0553, "step": 200 }, { "epoch": 0.009528725135982848, "grad_norm": 0.9412379264831543, "learning_rate": 1.2688821752265863e-05, "loss": 0.0564, "step": 210 }, { "epoch": 0.009982473951982032, "grad_norm": 0.9401483535766602, "learning_rate": 1.3293051359516618e-05, "loss": 0.0519, "step": 220 }, { "epoch": 0.010436222767981215, "grad_norm": 0.7330880165100098, "learning_rate": 1.3897280966767372e-05, "loss": 0.0524, "step": 230 }, { "epoch": 0.010889971583980399, "grad_norm": 0.6178134679794312, "learning_rate": 1.4501510574018127e-05, "loss": 0.0558, "step": 240 }, { "epoch": 0.01134372039997958, "grad_norm": 0.630576491355896, "learning_rate": 1.5105740181268884e-05, "loss": 0.0499, "step": 250 }, { "epoch": 0.011797469215978764, "grad_norm": 0.6925823092460632, "learning_rate": 1.570996978851964e-05, "loss": 0.0613, "step": 260 }, { "epoch": 0.012251218031977948, "grad_norm": 0.7759488224983215, "learning_rate": 1.6314199395770393e-05, "loss": 0.0604, "step": 270 }, { "epoch": 0.012704966847977131, "grad_norm": 0.7457413673400879, "learning_rate": 1.691842900302115e-05, "loss": 0.0582, "step": 280 }, { "epoch": 0.013158715663976315, "grad_norm": 0.8155379891395569, "learning_rate": 1.7522658610271906e-05, "loss": 0.0574, "step": 290 }, { "epoch": 0.013612464479975498, "grad_norm": 0.763914167881012, "learning_rate": 1.812688821752266e-05, "loss": 0.0484, "step": 300 }, { "epoch": 0.014066213295974682, "grad_norm": 0.6495068669319153, "learning_rate": 1.8731117824773413e-05, "loss": 0.0553, "step": 310 }, { "epoch": 0.014519962111973863, "grad_norm": 0.6732931733131409, "learning_rate": 1.933534743202417e-05, "loss": 0.0511, "step": 320 }, { "epoch": 0.014973710927973047, "grad_norm": 0.5877078175544739, "learning_rate": 1.9939577039274927e-05, "loss": 0.0478, "step": 330 }, { "epoch": 0.01542745974397223, "grad_norm": 0.6282989382743835, "learning_rate": 1.9999965008575334e-05, "loss": 0.051, "step": 340 }, { "epoch": 0.015881208559971414, "grad_norm": 0.6462369561195374, "learning_rate": 1.999984405087852e-05, "loss": 0.0477, "step": 350 }, { "epoch": 0.016334957375970596, "grad_norm": 0.5832022428512573, "learning_rate": 1.999963669596147e-05, "loss": 0.0412, "step": 360 }, { "epoch": 0.01678870619196978, "grad_norm": 0.6947014927864075, "learning_rate": 1.9999342945615705e-05, "loss": 0.0492, "step": 370 }, { "epoch": 0.017242455007968963, "grad_norm": 0.5751299858093262, "learning_rate": 1.9998962802379185e-05, "loss": 0.042, "step": 380 }, { "epoch": 0.017696203823968148, "grad_norm": 0.5907843112945557, "learning_rate": 1.9998496269536293e-05, "loss": 0.0451, "step": 390 }, { "epoch": 0.01814995263996733, "grad_norm": 0.5383661985397339, "learning_rate": 1.9997943351117804e-05, "loss": 0.0459, "step": 400 }, { "epoch": 0.018603701455966515, "grad_norm": 0.43585777282714844, "learning_rate": 1.9997304051900853e-05, "loss": 0.0425, "step": 410 }, { "epoch": 0.019057450271965697, "grad_norm": 0.5105756521224976, "learning_rate": 1.9996578377408897e-05, "loss": 0.0422, "step": 420 }, { "epoch": 0.01951119908796488, "grad_norm": 0.6340963244438171, "learning_rate": 1.9995766333911663e-05, "loss": 0.0376, "step": 430 }, { "epoch": 0.019964947903964064, "grad_norm": 0.5040590763092041, "learning_rate": 1.999486792842508e-05, "loss": 0.0484, "step": 440 }, { "epoch": 0.020418696719963245, "grad_norm": 0.6529092192649841, "learning_rate": 1.999388316871125e-05, "loss": 0.0464, "step": 450 }, { "epoch": 0.02087244553596243, "grad_norm": 0.6630532145500183, "learning_rate": 1.9992812063278354e-05, "loss": 0.0418, "step": 460 }, { "epoch": 0.021326194351961612, "grad_norm": 0.42047423124313354, "learning_rate": 1.9991654621380593e-05, "loss": 0.0317, "step": 470 }, { "epoch": 0.021779943167960798, "grad_norm": 0.5003728270530701, "learning_rate": 1.9990410853018094e-05, "loss": 0.0448, "step": 480 }, { "epoch": 0.02223369198395998, "grad_norm": 0.5685811638832092, "learning_rate": 1.998908076893684e-05, "loss": 0.0367, "step": 490 }, { "epoch": 0.02268744079995916, "grad_norm": 0.586402952671051, "learning_rate": 1.9987664380628566e-05, "loss": 0.0453, "step": 500 }, { "epoch": 0.023141189615958346, "grad_norm": 0.42892974615097046, "learning_rate": 1.9986161700330668e-05, "loss": 0.0376, "step": 510 }, { "epoch": 0.023594938431957528, "grad_norm": 0.367193341255188, "learning_rate": 1.998457274102608e-05, "loss": 0.0418, "step": 520 }, { "epoch": 0.024048687247956713, "grad_norm": 0.42002618312835693, "learning_rate": 1.9982897516443194e-05, "loss": 0.0423, "step": 530 }, { "epoch": 0.024502436063955895, "grad_norm": 0.5938308835029602, "learning_rate": 1.9981136041055703e-05, "loss": 0.0442, "step": 540 }, { "epoch": 0.02495618487995508, "grad_norm": 0.5407899022102356, "learning_rate": 1.99792883300825e-05, "loss": 0.0443, "step": 550 }, { "epoch": 0.025409933695954262, "grad_norm": 0.6072173714637756, "learning_rate": 1.997735439948755e-05, "loss": 0.0413, "step": 560 }, { "epoch": 0.025863682511953444, "grad_norm": 0.42686253786087036, "learning_rate": 1.997533426597973e-05, "loss": 0.0402, "step": 570 }, { "epoch": 0.02631743132795263, "grad_norm": 0.6148861646652222, "learning_rate": 1.9973227947012713e-05, "loss": 0.0355, "step": 580 }, { "epoch": 0.02677118014395181, "grad_norm": 0.5279844999313354, "learning_rate": 1.9971035460784783e-05, "loss": 0.0428, "step": 590 }, { "epoch": 0.027224928959950996, "grad_norm": 0.4733355939388275, "learning_rate": 1.9968756826238713e-05, "loss": 0.0422, "step": 600 }, { "epoch": 0.027678677775950178, "grad_norm": 0.5398796796798706, "learning_rate": 1.9966392063061573e-05, "loss": 0.0433, "step": 610 }, { "epoch": 0.028132426591949363, "grad_norm": 0.318294882774353, "learning_rate": 1.9963941191684585e-05, "loss": 0.0358, "step": 620 }, { "epoch": 0.028586175407948545, "grad_norm": 0.42730024456977844, "learning_rate": 1.9961404233282926e-05, "loss": 0.0353, "step": 630 }, { "epoch": 0.029039924223947727, "grad_norm": 0.659148097038269, "learning_rate": 1.995878120977555e-05, "loss": 0.0394, "step": 640 }, { "epoch": 0.029493673039946912, "grad_norm": 0.45361000299453735, "learning_rate": 1.9956072143825006e-05, "loss": 0.0375, "step": 650 }, { "epoch": 0.029947421855946094, "grad_norm": 0.47503188252449036, "learning_rate": 1.9953277058837237e-05, "loss": 0.0379, "step": 660 }, { "epoch": 0.03040117067194528, "grad_norm": 0.5961551070213318, "learning_rate": 1.9950395978961376e-05, "loss": 0.0439, "step": 670 }, { "epoch": 0.03085491948794446, "grad_norm": 0.4613669812679291, "learning_rate": 1.9947428929089536e-05, "loss": 0.0368, "step": 680 }, { "epoch": 0.031308668303943646, "grad_norm": 0.3827897012233734, "learning_rate": 1.9944375934856606e-05, "loss": 0.0343, "step": 690 }, { "epoch": 0.03176241711994283, "grad_norm": 0.32436254620552063, "learning_rate": 1.9941237022640024e-05, "loss": 0.0336, "step": 700 }, { "epoch": 0.03221616593594201, "grad_norm": 0.46000194549560547, "learning_rate": 1.9938012219559536e-05, "loss": 0.0326, "step": 710 }, { "epoch": 0.03266991475194119, "grad_norm": 0.3377334177494049, "learning_rate": 1.9934701553476983e-05, "loss": 0.0402, "step": 720 }, { "epoch": 0.03312366356794038, "grad_norm": 0.4465102553367615, "learning_rate": 1.993130505299604e-05, "loss": 0.0372, "step": 730 }, { "epoch": 0.03357741238393956, "grad_norm": 0.48582109808921814, "learning_rate": 1.9927822747461987e-05, "loss": 0.0336, "step": 740 }, { "epoch": 0.03403116119993874, "grad_norm": 0.45290496945381165, "learning_rate": 1.9924254666961446e-05, "loss": 0.0307, "step": 750 }, { "epoch": 0.034484910015937925, "grad_norm": 0.433858722448349, "learning_rate": 1.9920600842322123e-05, "loss": 0.0268, "step": 760 }, { "epoch": 0.034938658831937114, "grad_norm": 0.3633202016353607, "learning_rate": 1.9916861305112536e-05, "loss": 0.0348, "step": 770 }, { "epoch": 0.035392407647936296, "grad_norm": 0.5342480540275574, "learning_rate": 1.9913036087641756e-05, "loss": 0.035, "step": 780 }, { "epoch": 0.03584615646393548, "grad_norm": 0.444950670003891, "learning_rate": 1.9909125222959106e-05, "loss": 0.0319, "step": 790 }, { "epoch": 0.03629990527993466, "grad_norm": 0.5999451875686646, "learning_rate": 1.9905128744853903e-05, "loss": 0.0336, "step": 800 }, { "epoch": 0.03675365409593384, "grad_norm": 0.5298904180526733, "learning_rate": 1.9901046687855142e-05, "loss": 0.0415, "step": 810 }, { "epoch": 0.03720740291193303, "grad_norm": 0.4022998511791229, "learning_rate": 1.9896879087231212e-05, "loss": 0.0395, "step": 820 }, { "epoch": 0.03766115172793221, "grad_norm": 0.32381728291511536, "learning_rate": 1.989262597898959e-05, "loss": 0.0311, "step": 830 }, { "epoch": 0.03811490054393139, "grad_norm": 0.362618625164032, "learning_rate": 1.9888287399876514e-05, "loss": 0.0333, "step": 840 }, { "epoch": 0.038568649359930575, "grad_norm": 0.43890586495399475, "learning_rate": 1.9883863387376688e-05, "loss": 0.0323, "step": 850 }, { "epoch": 0.03902239817592976, "grad_norm": 0.4210602939128876, "learning_rate": 1.9879353979712953e-05, "loss": 0.0364, "step": 860 }, { "epoch": 0.039476146991928945, "grad_norm": 0.3277105987071991, "learning_rate": 1.987475921584594e-05, "loss": 0.0279, "step": 870 }, { "epoch": 0.03992989580792813, "grad_norm": 0.3682249188423157, "learning_rate": 1.987007913547375e-05, "loss": 0.0351, "step": 880 }, { "epoch": 0.04038364462392731, "grad_norm": 0.44841283559799194, "learning_rate": 1.9865313779031607e-05, "loss": 0.0365, "step": 890 }, { "epoch": 0.04083739343992649, "grad_norm": 0.4340668022632599, "learning_rate": 1.986046318769151e-05, "loss": 0.0387, "step": 900 }, { "epoch": 0.04129114225592568, "grad_norm": 0.38138914108276367, "learning_rate": 1.9855527403361874e-05, "loss": 0.0365, "step": 910 }, { "epoch": 0.04174489107192486, "grad_norm": 0.4621522128582001, "learning_rate": 1.9850506468687164e-05, "loss": 0.0339, "step": 920 }, { "epoch": 0.04219863988792404, "grad_norm": 0.3258332312107086, "learning_rate": 1.9845400427047542e-05, "loss": 0.0284, "step": 930 }, { "epoch": 0.042652388703923225, "grad_norm": 0.4138632118701935, "learning_rate": 1.9840209322558476e-05, "loss": 0.0363, "step": 940 }, { "epoch": 0.043106137519922406, "grad_norm": 0.34551170468330383, "learning_rate": 1.983493320007036e-05, "loss": 0.0297, "step": 950 }, { "epoch": 0.043559886335921595, "grad_norm": 0.502480149269104, "learning_rate": 1.9829572105168137e-05, "loss": 0.039, "step": 960 }, { "epoch": 0.04401363515192078, "grad_norm": 0.5837337970733643, "learning_rate": 1.9824126084170907e-05, "loss": 0.035, "step": 970 }, { "epoch": 0.04446738396791996, "grad_norm": 0.329112708568573, "learning_rate": 1.9818595184131505e-05, "loss": 0.0375, "step": 980 }, { "epoch": 0.04492113278391914, "grad_norm": 0.4185082018375397, "learning_rate": 1.9812979452836117e-05, "loss": 0.0355, "step": 990 }, { "epoch": 0.04537488159991832, "grad_norm": 0.393245667219162, "learning_rate": 1.9807278938803853e-05, "loss": 0.0352, "step": 1000 }, { "epoch": 0.04582863041591751, "grad_norm": 0.3117782771587372, "learning_rate": 1.980149369128634e-05, "loss": 0.0293, "step": 1010 }, { "epoch": 0.04628237923191669, "grad_norm": 0.46574193239212036, "learning_rate": 1.9795623760267294e-05, "loss": 0.0298, "step": 1020 }, { "epoch": 0.046736128047915874, "grad_norm": 0.2960023581981659, "learning_rate": 1.9789669196462072e-05, "loss": 0.0327, "step": 1030 }, { "epoch": 0.047189876863915056, "grad_norm": 0.3810667395591736, "learning_rate": 1.978363005131725e-05, "loss": 0.0326, "step": 1040 }, { "epoch": 0.047643625679914245, "grad_norm": 0.3071083724498749, "learning_rate": 1.9777506377010182e-05, "loss": 0.031, "step": 1050 }, { "epoch": 0.04809737449591343, "grad_norm": 0.3337906301021576, "learning_rate": 1.9771298226448535e-05, "loss": 0.0282, "step": 1060 }, { "epoch": 0.04855112331191261, "grad_norm": 0.3475215435028076, "learning_rate": 1.9765005653269842e-05, "loss": 0.0263, "step": 1070 }, { "epoch": 0.04900487212791179, "grad_norm": 0.43681520223617554, "learning_rate": 1.9758628711841035e-05, "loss": 0.0356, "step": 1080 }, { "epoch": 0.04945862094391097, "grad_norm": 0.35723158717155457, "learning_rate": 1.975216745725797e-05, "loss": 0.0338, "step": 1090 }, { "epoch": 0.04991236975991016, "grad_norm": 0.47841957211494446, "learning_rate": 1.974562194534496e-05, "loss": 0.03, "step": 1100 }, { "epoch": 0.05036611857590934, "grad_norm": 0.5104230046272278, "learning_rate": 1.9738992232654296e-05, "loss": 0.0302, "step": 1110 }, { "epoch": 0.050819867391908524, "grad_norm": 0.36496758460998535, "learning_rate": 1.9732278376465746e-05, "loss": 0.0333, "step": 1120 }, { "epoch": 0.051273616207907706, "grad_norm": 0.47600677609443665, "learning_rate": 1.9725480434786065e-05, "loss": 0.0247, "step": 1130 }, { "epoch": 0.05172736502390689, "grad_norm": 0.6296696066856384, "learning_rate": 1.971859846634849e-05, "loss": 0.0335, "step": 1140 }, { "epoch": 0.052181113839906076, "grad_norm": 0.3508012294769287, "learning_rate": 1.9711632530612247e-05, "loss": 0.034, "step": 1150 }, { "epoch": 0.05263486265590526, "grad_norm": 0.4987192749977112, "learning_rate": 1.970458268776202e-05, "loss": 0.0304, "step": 1160 }, { "epoch": 0.05308861147190444, "grad_norm": 0.30687594413757324, "learning_rate": 1.9697448998707448e-05, "loss": 0.0293, "step": 1170 }, { "epoch": 0.05354236028790362, "grad_norm": 0.3851865828037262, "learning_rate": 1.9690231525082576e-05, "loss": 0.031, "step": 1180 }, { "epoch": 0.05399610910390281, "grad_norm": 0.4362975060939789, "learning_rate": 1.968293032924535e-05, "loss": 0.0261, "step": 1190 }, { "epoch": 0.05444985791990199, "grad_norm": 0.3843149244785309, "learning_rate": 1.9675545474277045e-05, "loss": 0.0321, "step": 1200 }, { "epoch": 0.054903606735901174, "grad_norm": 0.39889925718307495, "learning_rate": 1.966807702398176e-05, "loss": 0.0278, "step": 1210 }, { "epoch": 0.055357355551900356, "grad_norm": 0.4780742824077606, "learning_rate": 1.9660525042885828e-05, "loss": 0.029, "step": 1220 }, { "epoch": 0.05581110436789954, "grad_norm": 0.32199135422706604, "learning_rate": 1.965288959623729e-05, "loss": 0.0307, "step": 1230 }, { "epoch": 0.056264853183898726, "grad_norm": 0.34257176518440247, "learning_rate": 1.964517075000531e-05, "loss": 0.0323, "step": 1240 }, { "epoch": 0.05671860199989791, "grad_norm": 0.38171863555908203, "learning_rate": 1.9637368570879612e-05, "loss": 0.0288, "step": 1250 }, { "epoch": 0.05717235081589709, "grad_norm": 0.4814264178276062, "learning_rate": 1.9629483126269904e-05, "loss": 0.038, "step": 1260 }, { "epoch": 0.05762609963189627, "grad_norm": 0.3798184096813202, "learning_rate": 1.9621514484305308e-05, "loss": 0.0325, "step": 1270 }, { "epoch": 0.05807984844789545, "grad_norm": 0.45723605155944824, "learning_rate": 1.9613462713833734e-05, "loss": 0.0318, "step": 1280 }, { "epoch": 0.05853359726389464, "grad_norm": 0.32979801297187805, "learning_rate": 1.9605327884421338e-05, "loss": 0.0324, "step": 1290 }, { "epoch": 0.058987346079893824, "grad_norm": 0.4058956801891327, "learning_rate": 1.9597110066351875e-05, "loss": 0.0298, "step": 1300 }, { "epoch": 0.059441094895893005, "grad_norm": 0.2746201455593109, "learning_rate": 1.958880933062612e-05, "loss": 0.0272, "step": 1310 }, { "epoch": 0.05989484371189219, "grad_norm": 0.3261467218399048, "learning_rate": 1.958042574896124e-05, "loss": 0.0263, "step": 1320 }, { "epoch": 0.060348592527891376, "grad_norm": 0.36839559674263, "learning_rate": 1.9571959393790174e-05, "loss": 0.0286, "step": 1330 }, { "epoch": 0.06080234134389056, "grad_norm": 0.3967912793159485, "learning_rate": 1.9563410338261022e-05, "loss": 0.0309, "step": 1340 }, { "epoch": 0.06125609015988974, "grad_norm": 0.31618067622184753, "learning_rate": 1.9554778656236402e-05, "loss": 0.03, "step": 1350 }, { "epoch": 0.06170983897588892, "grad_norm": 0.2765614986419678, "learning_rate": 1.9546064422292806e-05, "loss": 0.0272, "step": 1360 }, { "epoch": 0.0621635877918881, "grad_norm": 0.656044602394104, "learning_rate": 1.9537267711719966e-05, "loss": 0.0349, "step": 1370 }, { "epoch": 0.06261733660788729, "grad_norm": 0.34816327691078186, "learning_rate": 1.9528388600520208e-05, "loss": 0.0287, "step": 1380 }, { "epoch": 0.06307108542388647, "grad_norm": 0.37753716111183167, "learning_rate": 1.9519427165407773e-05, "loss": 0.0273, "step": 1390 }, { "epoch": 0.06352483423988566, "grad_norm": 0.3496685326099396, "learning_rate": 1.9510383483808183e-05, "loss": 0.0293, "step": 1400 }, { "epoch": 0.06397858305588484, "grad_norm": 0.32307639718055725, "learning_rate": 1.950125763385755e-05, "loss": 0.023, "step": 1410 }, { "epoch": 0.06443233187188402, "grad_norm": 0.21378570795059204, "learning_rate": 1.949204969440191e-05, "loss": 0.0301, "step": 1420 }, { "epoch": 0.06488608068788321, "grad_norm": 0.39907675981521606, "learning_rate": 1.9482759744996537e-05, "loss": 0.0321, "step": 1430 }, { "epoch": 0.06533982950388238, "grad_norm": 0.24587471783161163, "learning_rate": 1.9473387865905268e-05, "loss": 0.0331, "step": 1440 }, { "epoch": 0.06579357831988157, "grad_norm": 0.4524153172969818, "learning_rate": 1.9463934138099796e-05, "loss": 0.0304, "step": 1450 }, { "epoch": 0.06624732713588076, "grad_norm": 0.2183820903301239, "learning_rate": 1.945439864325897e-05, "loss": 0.0264, "step": 1460 }, { "epoch": 0.06670107595187993, "grad_norm": 0.347924143075943, "learning_rate": 1.944478146376811e-05, "loss": 0.0306, "step": 1470 }, { "epoch": 0.06715482476787912, "grad_norm": 0.38224905729293823, "learning_rate": 1.943508268271826e-05, "loss": 0.0328, "step": 1480 }, { "epoch": 0.0676085735838783, "grad_norm": 0.3916206359863281, "learning_rate": 1.9425302383905497e-05, "loss": 0.0308, "step": 1490 }, { "epoch": 0.06806232239987749, "grad_norm": 0.29595285654067993, "learning_rate": 1.941544065183021e-05, "loss": 0.0281, "step": 1500 }, { "epoch": 0.06851607121587668, "grad_norm": 0.3547518849372864, "learning_rate": 1.9405497571696347e-05, "loss": 0.0273, "step": 1510 }, { "epoch": 0.06896982003187585, "grad_norm": 0.3417685329914093, "learning_rate": 1.93954732294107e-05, "loss": 0.0298, "step": 1520 }, { "epoch": 0.06942356884787504, "grad_norm": 0.3414689004421234, "learning_rate": 1.9385367711582142e-05, "loss": 0.0261, "step": 1530 }, { "epoch": 0.06987731766387423, "grad_norm": 0.3908116817474365, "learning_rate": 1.9375181105520907e-05, "loss": 0.0237, "step": 1540 }, { "epoch": 0.0703310664798734, "grad_norm": 0.3118087351322174, "learning_rate": 1.9364913499237814e-05, "loss": 0.0216, "step": 1550 }, { "epoch": 0.07078481529587259, "grad_norm": 0.3672642111778259, "learning_rate": 1.93545649814435e-05, "loss": 0.0244, "step": 1560 }, { "epoch": 0.07123856411187177, "grad_norm": 0.3155844211578369, "learning_rate": 1.934413564154769e-05, "loss": 0.0234, "step": 1570 }, { "epoch": 0.07169231292787095, "grad_norm": 0.49833133816719055, "learning_rate": 1.9333625569658377e-05, "loss": 0.0367, "step": 1580 }, { "epoch": 0.07214606174387014, "grad_norm": 0.3114766478538513, "learning_rate": 1.9323034856581083e-05, "loss": 0.0253, "step": 1590 }, { "epoch": 0.07259981055986932, "grad_norm": 0.34141266345977783, "learning_rate": 1.9312363593818045e-05, "loss": 0.0304, "step": 1600 }, { "epoch": 0.07305355937586851, "grad_norm": 0.43003976345062256, "learning_rate": 1.930161187356745e-05, "loss": 0.026, "step": 1610 }, { "epoch": 0.07350730819186768, "grad_norm": 0.4651493430137634, "learning_rate": 1.929077978872262e-05, "loss": 0.0281, "step": 1620 }, { "epoch": 0.07396105700786687, "grad_norm": 0.4466242492198944, "learning_rate": 1.9279867432871215e-05, "loss": 0.0261, "step": 1630 }, { "epoch": 0.07441480582386606, "grad_norm": 0.4062393605709076, "learning_rate": 1.9268874900294426e-05, "loss": 0.0301, "step": 1640 }, { "epoch": 0.07486855463986523, "grad_norm": 0.2957588732242584, "learning_rate": 1.9257802285966166e-05, "loss": 0.0274, "step": 1650 }, { "epoch": 0.07532230345586442, "grad_norm": 0.40534257888793945, "learning_rate": 1.924664968555223e-05, "loss": 0.0297, "step": 1660 }, { "epoch": 0.0757760522718636, "grad_norm": 0.37658095359802246, "learning_rate": 1.9235417195409487e-05, "loss": 0.0255, "step": 1670 }, { "epoch": 0.07622980108786279, "grad_norm": 0.4333280622959137, "learning_rate": 1.922410491258505e-05, "loss": 0.0287, "step": 1680 }, { "epoch": 0.07668354990386198, "grad_norm": 0.5800235867500305, "learning_rate": 1.9212712934815413e-05, "loss": 0.0322, "step": 1690 }, { "epoch": 0.07713729871986115, "grad_norm": 0.2994270920753479, "learning_rate": 1.9201241360525643e-05, "loss": 0.0259, "step": 1700 }, { "epoch": 0.07759104753586034, "grad_norm": 0.32620176672935486, "learning_rate": 1.9189690288828487e-05, "loss": 0.0302, "step": 1710 }, { "epoch": 0.07804479635185951, "grad_norm": 0.41358160972595215, "learning_rate": 1.9178059819523563e-05, "loss": 0.026, "step": 1720 }, { "epoch": 0.0784985451678587, "grad_norm": 0.4136590361595154, "learning_rate": 1.9166350053096453e-05, "loss": 0.0268, "step": 1730 }, { "epoch": 0.07895229398385789, "grad_norm": 0.2818216383457184, "learning_rate": 1.9154561090717857e-05, "loss": 0.0266, "step": 1740 }, { "epoch": 0.07940604279985707, "grad_norm": 0.30833351612091064, "learning_rate": 1.9142693034242726e-05, "loss": 0.0225, "step": 1750 }, { "epoch": 0.07985979161585625, "grad_norm": 0.46563273668289185, "learning_rate": 1.913074598620937e-05, "loss": 0.0283, "step": 1760 }, { "epoch": 0.08031354043185543, "grad_norm": 0.27527183294296265, "learning_rate": 1.9118720049838567e-05, "loss": 0.0261, "step": 1770 }, { "epoch": 0.08076728924785462, "grad_norm": 0.2639108896255493, "learning_rate": 1.9106615329032695e-05, "loss": 0.0287, "step": 1780 }, { "epoch": 0.0812210380638538, "grad_norm": 0.3344709277153015, "learning_rate": 1.9094431928374798e-05, "loss": 0.0243, "step": 1790 }, { "epoch": 0.08167478687985298, "grad_norm": 0.3626515865325928, "learning_rate": 1.9082169953127714e-05, "loss": 0.0264, "step": 1800 }, { "epoch": 0.08212853569585217, "grad_norm": 0.2846546173095703, "learning_rate": 1.9069829509233156e-05, "loss": 0.0234, "step": 1810 }, { "epoch": 0.08258228451185136, "grad_norm": 0.28296372294425964, "learning_rate": 1.9057410703310788e-05, "loss": 0.0236, "step": 1820 }, { "epoch": 0.08303603332785053, "grad_norm": 0.3109569251537323, "learning_rate": 1.9044913642657318e-05, "loss": 0.0266, "step": 1830 }, { "epoch": 0.08348978214384972, "grad_norm": 0.35602954030036926, "learning_rate": 1.9032338435245557e-05, "loss": 0.0262, "step": 1840 }, { "epoch": 0.0839435309598489, "grad_norm": 0.36112645268440247, "learning_rate": 1.9019685189723497e-05, "loss": 0.0288, "step": 1850 }, { "epoch": 0.08439727977584809, "grad_norm": 0.364214688539505, "learning_rate": 1.900695401541337e-05, "loss": 0.0248, "step": 1860 }, { "epoch": 0.08485102859184727, "grad_norm": 0.4925951659679413, "learning_rate": 1.8994145022310693e-05, "loss": 0.0231, "step": 1870 }, { "epoch": 0.08530477740784645, "grad_norm": 0.32205620408058167, "learning_rate": 1.8981258321083335e-05, "loss": 0.0252, "step": 1880 }, { "epoch": 0.08575852622384564, "grad_norm": 0.366228848695755, "learning_rate": 1.8968294023070548e-05, "loss": 0.0249, "step": 1890 }, { "epoch": 0.08621227503984481, "grad_norm": 0.3457716107368469, "learning_rate": 1.895525224028201e-05, "loss": 0.0265, "step": 1900 }, { "epoch": 0.086666023855844, "grad_norm": 0.3465504050254822, "learning_rate": 1.8942133085396855e-05, "loss": 0.0225, "step": 1910 }, { "epoch": 0.08711977267184319, "grad_norm": 0.3065875172615051, "learning_rate": 1.8928936671762704e-05, "loss": 0.0264, "step": 1920 }, { "epoch": 0.08757352148784237, "grad_norm": 0.3289739191532135, "learning_rate": 1.8915663113394677e-05, "loss": 0.0304, "step": 1930 }, { "epoch": 0.08802727030384155, "grad_norm": 0.37356194853782654, "learning_rate": 1.890231252497442e-05, "loss": 0.0245, "step": 1940 }, { "epoch": 0.08848101911984073, "grad_norm": 0.3293350040912628, "learning_rate": 1.8888885021849103e-05, "loss": 0.0255, "step": 1950 }, { "epoch": 0.08893476793583992, "grad_norm": 0.3437206745147705, "learning_rate": 1.8875380720030434e-05, "loss": 0.026, "step": 1960 }, { "epoch": 0.0893885167518391, "grad_norm": 0.37463971972465515, "learning_rate": 1.886179973619364e-05, "loss": 0.0282, "step": 1970 }, { "epoch": 0.08984226556783828, "grad_norm": 0.42150387167930603, "learning_rate": 1.8848142187676485e-05, "loss": 0.0255, "step": 1980 }, { "epoch": 0.09029601438383747, "grad_norm": 0.2749548852443695, "learning_rate": 1.883440819247822e-05, "loss": 0.0278, "step": 1990 }, { "epoch": 0.09074976319983664, "grad_norm": 0.15577726066112518, "learning_rate": 1.8820597869258606e-05, "loss": 0.0251, "step": 2000 }, { "epoch": 0.09120351201583583, "grad_norm": 0.26942554116249084, "learning_rate": 1.8806711337336852e-05, "loss": 0.0208, "step": 2010 }, { "epoch": 0.09165726083183502, "grad_norm": 0.3115650415420532, "learning_rate": 1.8792748716690608e-05, "loss": 0.0299, "step": 2020 }, { "epoch": 0.0921110096478342, "grad_norm": 0.3561892807483673, "learning_rate": 1.8778710127954912e-05, "loss": 0.0285, "step": 2030 }, { "epoch": 0.09256475846383339, "grad_norm": 0.32506442070007324, "learning_rate": 1.8764595692421163e-05, "loss": 0.0244, "step": 2040 }, { "epoch": 0.09301850727983256, "grad_norm": 0.339581698179245, "learning_rate": 1.8750405532036064e-05, "loss": 0.0265, "step": 2050 }, { "epoch": 0.09347225609583175, "grad_norm": 0.30347487330436707, "learning_rate": 1.8736139769400567e-05, "loss": 0.0263, "step": 2060 }, { "epoch": 0.09392600491183094, "grad_norm": 0.3692399263381958, "learning_rate": 1.8721798527768813e-05, "loss": 0.0248, "step": 2070 }, { "epoch": 0.09437975372783011, "grad_norm": 0.258657306432724, "learning_rate": 1.870738193104708e-05, "loss": 0.0243, "step": 2080 }, { "epoch": 0.0948335025438293, "grad_norm": 0.2374759018421173, "learning_rate": 1.86928901037927e-05, "loss": 0.0238, "step": 2090 }, { "epoch": 0.09528725135982849, "grad_norm": 0.3503143787384033, "learning_rate": 1.8678323171212982e-05, "loss": 0.0321, "step": 2100 }, { "epoch": 0.09574100017582766, "grad_norm": 0.42297273874282837, "learning_rate": 1.866368125916414e-05, "loss": 0.0238, "step": 2110 }, { "epoch": 0.09619474899182685, "grad_norm": 0.28227752447128296, "learning_rate": 1.864896449415019e-05, "loss": 0.022, "step": 2120 }, { "epoch": 0.09664849780782603, "grad_norm": 0.32826894521713257, "learning_rate": 1.863417300332188e-05, "loss": 0.0244, "step": 2130 }, { "epoch": 0.09710224662382522, "grad_norm": 0.33181822299957275, "learning_rate": 1.8619306914475573e-05, "loss": 0.0253, "step": 2140 }, { "epoch": 0.0975559954398244, "grad_norm": 0.23897556960582733, "learning_rate": 1.860436635605214e-05, "loss": 0.0278, "step": 2150 }, { "epoch": 0.09800974425582358, "grad_norm": 0.3242506682872772, "learning_rate": 1.8589351457135873e-05, "loss": 0.0215, "step": 2160 }, { "epoch": 0.09846349307182277, "grad_norm": 0.31502264738082886, "learning_rate": 1.8574262347453344e-05, "loss": 0.0295, "step": 2170 }, { "epoch": 0.09891724188782194, "grad_norm": 0.3040638267993927, "learning_rate": 1.85590991573723e-05, "loss": 0.0255, "step": 2180 }, { "epoch": 0.09937099070382113, "grad_norm": 0.28746503591537476, "learning_rate": 1.854386201790053e-05, "loss": 0.0216, "step": 2190 }, { "epoch": 0.09982473951982032, "grad_norm": 0.3485678434371948, "learning_rate": 1.8528551060684744e-05, "loss": 0.0255, "step": 2200 }, { "epoch": 0.1002784883358195, "grad_norm": 0.3102795481681824, "learning_rate": 1.851316641800941e-05, "loss": 0.0246, "step": 2210 }, { "epoch": 0.10073223715181868, "grad_norm": 0.24928243458271027, "learning_rate": 1.8497708222795638e-05, "loss": 0.0244, "step": 2220 }, { "epoch": 0.10118598596781786, "grad_norm": 0.26787352561950684, "learning_rate": 1.8482176608600025e-05, "loss": 0.0252, "step": 2230 }, { "epoch": 0.10163973478381705, "grad_norm": 0.30643823742866516, "learning_rate": 1.846657170961349e-05, "loss": 0.0294, "step": 2240 }, { "epoch": 0.10209348359981624, "grad_norm": 0.4242754578590393, "learning_rate": 1.8450893660660126e-05, "loss": 0.027, "step": 2250 }, { "epoch": 0.10254723241581541, "grad_norm": 0.21771681308746338, "learning_rate": 1.8435142597196033e-05, "loss": 0.0246, "step": 2260 }, { "epoch": 0.1030009812318146, "grad_norm": 0.4637884795665741, "learning_rate": 1.8419318655308135e-05, "loss": 0.0266, "step": 2270 }, { "epoch": 0.10345473004781378, "grad_norm": 0.35978934168815613, "learning_rate": 1.8403421971713034e-05, "loss": 0.0303, "step": 2280 }, { "epoch": 0.10390847886381296, "grad_norm": 0.31819626688957214, "learning_rate": 1.838745268375579e-05, "loss": 0.0225, "step": 2290 }, { "epoch": 0.10436222767981215, "grad_norm": 0.35580259561538696, "learning_rate": 1.8371410929408767e-05, "loss": 0.026, "step": 2300 }, { "epoch": 0.10481597649581133, "grad_norm": 0.3587094247341156, "learning_rate": 1.835529684727043e-05, "loss": 0.0211, "step": 2310 }, { "epoch": 0.10526972531181052, "grad_norm": 0.3146100640296936, "learning_rate": 1.8339110576564132e-05, "loss": 0.0193, "step": 2320 }, { "epoch": 0.10572347412780969, "grad_norm": 0.3359289765357971, "learning_rate": 1.8322852257136935e-05, "loss": 0.0215, "step": 2330 }, { "epoch": 0.10617722294380888, "grad_norm": 0.33030635118484497, "learning_rate": 1.8306522029458395e-05, "loss": 0.022, "step": 2340 }, { "epoch": 0.10663097175980807, "grad_norm": 0.3473166823387146, "learning_rate": 1.8290120034619335e-05, "loss": 0.0249, "step": 2350 }, { "epoch": 0.10708472057580724, "grad_norm": 0.3232431411743164, "learning_rate": 1.8273646414330645e-05, "loss": 0.0259, "step": 2360 }, { "epoch": 0.10753846939180643, "grad_norm": 0.40749892592430115, "learning_rate": 1.8257101310922042e-05, "loss": 0.0285, "step": 2370 }, { "epoch": 0.10799221820780562, "grad_norm": 0.29123133420944214, "learning_rate": 1.8240484867340852e-05, "loss": 0.02, "step": 2380 }, { "epoch": 0.1084459670238048, "grad_norm": 0.248729407787323, "learning_rate": 1.8223797227150762e-05, "loss": 0.024, "step": 2390 }, { "epoch": 0.10889971583980398, "grad_norm": 0.23471763730049133, "learning_rate": 1.8207038534530598e-05, "loss": 0.0233, "step": 2400 }, { "epoch": 0.10935346465580316, "grad_norm": 0.33546414971351624, "learning_rate": 1.819020893427306e-05, "loss": 0.019, "step": 2410 }, { "epoch": 0.10980721347180235, "grad_norm": 0.3919527530670166, "learning_rate": 1.817330857178349e-05, "loss": 0.0268, "step": 2420 }, { "epoch": 0.11026096228780154, "grad_norm": 0.2865142226219177, "learning_rate": 1.8156337593078594e-05, "loss": 0.0223, "step": 2430 }, { "epoch": 0.11071471110380071, "grad_norm": 0.39257389307022095, "learning_rate": 1.81392961447852e-05, "loss": 0.0229, "step": 2440 }, { "epoch": 0.1111684599197999, "grad_norm": 0.3611172139644623, "learning_rate": 1.8122184374138973e-05, "loss": 0.0237, "step": 2450 }, { "epoch": 0.11162220873579907, "grad_norm": 0.2578670382499695, "learning_rate": 1.810500242898317e-05, "loss": 0.0244, "step": 2460 }, { "epoch": 0.11207595755179826, "grad_norm": 0.24695947766304016, "learning_rate": 1.808775045776733e-05, "loss": 0.0255, "step": 2470 }, { "epoch": 0.11252970636779745, "grad_norm": 0.38776108622550964, "learning_rate": 1.8070428609546012e-05, "loss": 0.026, "step": 2480 }, { "epoch": 0.11298345518379663, "grad_norm": 0.33222416043281555, "learning_rate": 1.8053037033977513e-05, "loss": 0.0203, "step": 2490 }, { "epoch": 0.11343720399979582, "grad_norm": 0.34614139795303345, "learning_rate": 1.803557588132254e-05, "loss": 0.0217, "step": 2500 }, { "epoch": 0.11389095281579499, "grad_norm": 0.3952532708644867, "learning_rate": 1.8018045302442966e-05, "loss": 0.0274, "step": 2510 }, { "epoch": 0.11434470163179418, "grad_norm": 0.30400317907333374, "learning_rate": 1.8000445448800473e-05, "loss": 0.0207, "step": 2520 }, { "epoch": 0.11479845044779337, "grad_norm": 0.30075228214263916, "learning_rate": 1.7982776472455274e-05, "loss": 0.0223, "step": 2530 }, { "epoch": 0.11525219926379254, "grad_norm": 0.3695099651813507, "learning_rate": 1.7965038526064796e-05, "loss": 0.0318, "step": 2540 }, { "epoch": 0.11570594807979173, "grad_norm": 0.26347431540489197, "learning_rate": 1.794723176288236e-05, "loss": 0.0244, "step": 2550 }, { "epoch": 0.1161596968957909, "grad_norm": 0.3378712832927704, "learning_rate": 1.7929356336755842e-05, "loss": 0.0234, "step": 2560 }, { "epoch": 0.1166134457117901, "grad_norm": 0.3646625280380249, "learning_rate": 1.7911412402126366e-05, "loss": 0.0251, "step": 2570 }, { "epoch": 0.11706719452778928, "grad_norm": 0.26392313838005066, "learning_rate": 1.789340011402696e-05, "loss": 0.0251, "step": 2580 }, { "epoch": 0.11752094334378846, "grad_norm": 0.30855876207351685, "learning_rate": 1.7875319628081205e-05, "loss": 0.023, "step": 2590 }, { "epoch": 0.11797469215978765, "grad_norm": 0.4409993290901184, "learning_rate": 1.785717110050192e-05, "loss": 0.0246, "step": 2600 }, { "epoch": 0.11842844097578682, "grad_norm": 0.38101503252983093, "learning_rate": 1.7838954688089777e-05, "loss": 0.0251, "step": 2610 }, { "epoch": 0.11888218979178601, "grad_norm": 0.35045790672302246, "learning_rate": 1.782067054823197e-05, "loss": 0.0247, "step": 2620 }, { "epoch": 0.1193359386077852, "grad_norm": 0.35324978828430176, "learning_rate": 1.7802318838900855e-05, "loss": 0.0244, "step": 2630 }, { "epoch": 0.11978968742378437, "grad_norm": 0.389617919921875, "learning_rate": 1.7783899718652563e-05, "loss": 0.0225, "step": 2640 }, { "epoch": 0.12024343623978356, "grad_norm": 0.30963388085365295, "learning_rate": 1.776541334662566e-05, "loss": 0.0254, "step": 2650 }, { "epoch": 0.12069718505578275, "grad_norm": 0.3309627175331116, "learning_rate": 1.7746859882539747e-05, "loss": 0.0255, "step": 2660 }, { "epoch": 0.12115093387178193, "grad_norm": 0.36231184005737305, "learning_rate": 1.7728239486694104e-05, "loss": 0.0264, "step": 2670 }, { "epoch": 0.12160468268778112, "grad_norm": 0.24148836731910706, "learning_rate": 1.7709552319966275e-05, "loss": 0.0251, "step": 2680 }, { "epoch": 0.12205843150378029, "grad_norm": 0.30826303362846375, "learning_rate": 1.7690798543810715e-05, "loss": 0.0253, "step": 2690 }, { "epoch": 0.12251218031977948, "grad_norm": 0.36621302366256714, "learning_rate": 1.7671978320257356e-05, "loss": 0.0243, "step": 2700 }, { "epoch": 0.12296592913577867, "grad_norm": 0.30000630021095276, "learning_rate": 1.7653091811910236e-05, "loss": 0.0227, "step": 2710 }, { "epoch": 0.12341967795177784, "grad_norm": 0.3522748351097107, "learning_rate": 1.763413918194608e-05, "loss": 0.0249, "step": 2720 }, { "epoch": 0.12387342676777703, "grad_norm": 0.23331865668296814, "learning_rate": 1.7615120594112895e-05, "loss": 0.017, "step": 2730 }, { "epoch": 0.1243271755837762, "grad_norm": 0.35786885023117065, "learning_rate": 1.7596036212728558e-05, "loss": 0.0186, "step": 2740 }, { "epoch": 0.1247809243997754, "grad_norm": 0.2073945850133896, "learning_rate": 1.757688620267939e-05, "loss": 0.0234, "step": 2750 }, { "epoch": 0.12523467321577458, "grad_norm": 0.22009114921092987, "learning_rate": 1.755767072941874e-05, "loss": 0.0235, "step": 2760 }, { "epoch": 0.12568842203177377, "grad_norm": 0.2753148674964905, "learning_rate": 1.7538389958965537e-05, "loss": 0.0265, "step": 2770 }, { "epoch": 0.12614217084777293, "grad_norm": 0.24320273101329803, "learning_rate": 1.7519044057902877e-05, "loss": 0.0248, "step": 2780 }, { "epoch": 0.12659591966377212, "grad_norm": 0.2807812988758087, "learning_rate": 1.749963319337658e-05, "loss": 0.0266, "step": 2790 }, { "epoch": 0.1270496684797713, "grad_norm": 0.33782607316970825, "learning_rate": 1.748015753309373e-05, "loss": 0.0187, "step": 2800 }, { "epoch": 0.1275034172957705, "grad_norm": 0.28411775827407837, "learning_rate": 1.746061724532124e-05, "loss": 0.0242, "step": 2810 }, { "epoch": 0.1279571661117697, "grad_norm": 0.22015537321567535, "learning_rate": 1.7441012498884402e-05, "loss": 0.0204, "step": 2820 }, { "epoch": 0.12841091492776885, "grad_norm": 0.25588226318359375, "learning_rate": 1.7421343463165415e-05, "loss": 0.0246, "step": 2830 }, { "epoch": 0.12886466374376804, "grad_norm": 0.3184933364391327, "learning_rate": 1.7401610308101933e-05, "loss": 0.0263, "step": 2840 }, { "epoch": 0.12931841255976723, "grad_norm": 0.20176275074481964, "learning_rate": 1.7381813204185585e-05, "loss": 0.0185, "step": 2850 }, { "epoch": 0.12977216137576641, "grad_norm": 0.21839043498039246, "learning_rate": 1.7361952322460513e-05, "loss": 0.0215, "step": 2860 }, { "epoch": 0.1302259101917656, "grad_norm": 0.2835388481616974, "learning_rate": 1.7342027834521896e-05, "loss": 0.0221, "step": 2870 }, { "epoch": 0.13067965900776476, "grad_norm": 0.33354708552360535, "learning_rate": 1.7322039912514453e-05, "loss": 0.0182, "step": 2880 }, { "epoch": 0.13113340782376395, "grad_norm": 0.2000766396522522, "learning_rate": 1.7301988729130964e-05, "loss": 0.0226, "step": 2890 }, { "epoch": 0.13158715663976314, "grad_norm": 0.28084298968315125, "learning_rate": 1.7281874457610787e-05, "loss": 0.0214, "step": 2900 }, { "epoch": 0.13204090545576233, "grad_norm": 0.2850341796875, "learning_rate": 1.7261697271738337e-05, "loss": 0.0252, "step": 2910 }, { "epoch": 0.13249465427176152, "grad_norm": 0.3214298486709595, "learning_rate": 1.724145734584162e-05, "loss": 0.0207, "step": 2920 }, { "epoch": 0.13294840308776068, "grad_norm": 0.24159666895866394, "learning_rate": 1.7221154854790696e-05, "loss": 0.0211, "step": 2930 }, { "epoch": 0.13340215190375987, "grad_norm": 0.3537675142288208, "learning_rate": 1.7200789973996172e-05, "loss": 0.0221, "step": 2940 }, { "epoch": 0.13385590071975906, "grad_norm": 0.24425993859767914, "learning_rate": 1.7180362879407707e-05, "loss": 0.0215, "step": 2950 }, { "epoch": 0.13430964953575825, "grad_norm": 0.25267666578292847, "learning_rate": 1.7159873747512472e-05, "loss": 0.0231, "step": 2960 }, { "epoch": 0.13476339835175744, "grad_norm": 0.2916654348373413, "learning_rate": 1.713932275533363e-05, "loss": 0.0243, "step": 2970 }, { "epoch": 0.1352171471677566, "grad_norm": 0.3018513321876526, "learning_rate": 1.7118710080428807e-05, "loss": 0.0235, "step": 2980 }, { "epoch": 0.13567089598375578, "grad_norm": 0.33731094002723694, "learning_rate": 1.7098035900888566e-05, "loss": 0.0216, "step": 2990 }, { "epoch": 0.13612464479975497, "grad_norm": 0.24875997006893158, "learning_rate": 1.7077300395334857e-05, "loss": 0.0277, "step": 3000 }, { "epoch": 0.13657839361575416, "grad_norm": 0.44436484575271606, "learning_rate": 1.7056503742919476e-05, "loss": 0.0231, "step": 3010 }, { "epoch": 0.13703214243175335, "grad_norm": 0.32984521985054016, "learning_rate": 1.703564612332252e-05, "loss": 0.0213, "step": 3020 }, { "epoch": 0.1374858912477525, "grad_norm": 0.22811533510684967, "learning_rate": 1.7014727716750842e-05, "loss": 0.0223, "step": 3030 }, { "epoch": 0.1379396400637517, "grad_norm": 0.2657774090766907, "learning_rate": 1.699374870393647e-05, "loss": 0.0221, "step": 3040 }, { "epoch": 0.1383933888797509, "grad_norm": 0.2331739366054535, "learning_rate": 1.697270926613507e-05, "loss": 0.022, "step": 3050 }, { "epoch": 0.13884713769575008, "grad_norm": 0.40839359164237976, "learning_rate": 1.6951609585124377e-05, "loss": 0.026, "step": 3060 }, { "epoch": 0.13930088651174927, "grad_norm": 0.27357104420661926, "learning_rate": 1.6930449843202607e-05, "loss": 0.0229, "step": 3070 }, { "epoch": 0.13975463532774846, "grad_norm": 0.21143865585327148, "learning_rate": 1.69092302231869e-05, "loss": 0.0248, "step": 3080 }, { "epoch": 0.14020838414374762, "grad_norm": 0.2651277780532837, "learning_rate": 1.688795090841173e-05, "loss": 0.0206, "step": 3090 }, { "epoch": 0.1406621329597468, "grad_norm": 0.2734297811985016, "learning_rate": 1.686661208272734e-05, "loss": 0.0211, "step": 3100 }, { "epoch": 0.141115881775746, "grad_norm": 0.2321801632642746, "learning_rate": 1.6845213930498122e-05, "loss": 0.0229, "step": 3110 }, { "epoch": 0.14156963059174518, "grad_norm": 0.19926059246063232, "learning_rate": 1.682375663660104e-05, "loss": 0.019, "step": 3120 }, { "epoch": 0.14202337940774437, "grad_norm": 0.29449644684791565, "learning_rate": 1.680224038642405e-05, "loss": 0.0189, "step": 3130 }, { "epoch": 0.14247712822374353, "grad_norm": 0.39612048864364624, "learning_rate": 1.6780665365864465e-05, "loss": 0.0253, "step": 3140 }, { "epoch": 0.14293087703974272, "grad_norm": 0.3045453727245331, "learning_rate": 1.675903176132737e-05, "loss": 0.0208, "step": 3150 }, { "epoch": 0.1433846258557419, "grad_norm": 0.26917892694473267, "learning_rate": 1.6737339759724016e-05, "loss": 0.023, "step": 3160 }, { "epoch": 0.1438383746717411, "grad_norm": 0.2601112723350525, "learning_rate": 1.6715589548470187e-05, "loss": 0.0213, "step": 3170 }, { "epoch": 0.1442921234877403, "grad_norm": 0.36996549367904663, "learning_rate": 1.669378131548459e-05, "loss": 0.0211, "step": 3180 }, { "epoch": 0.14474587230373945, "grad_norm": 0.21878410875797272, "learning_rate": 1.6671915249187237e-05, "loss": 0.019, "step": 3190 }, { "epoch": 0.14519962111973864, "grad_norm": 0.2494911253452301, "learning_rate": 1.6649991538497808e-05, "loss": 0.0229, "step": 3200 }, { "epoch": 0.14565336993573783, "grad_norm": 0.3173881471157074, "learning_rate": 1.6628010372834028e-05, "loss": 0.0251, "step": 3210 }, { "epoch": 0.14610711875173701, "grad_norm": 0.2641706168651581, "learning_rate": 1.660597194211001e-05, "loss": 0.0226, "step": 3220 }, { "epoch": 0.1465608675677362, "grad_norm": 0.2626473307609558, "learning_rate": 1.6583876436734646e-05, "loss": 0.0222, "step": 3230 }, { "epoch": 0.14701461638373536, "grad_norm": 0.30592644214630127, "learning_rate": 1.6561724047609936e-05, "loss": 0.0214, "step": 3240 }, { "epoch": 0.14746836519973455, "grad_norm": 0.32091420888900757, "learning_rate": 1.653951496612935e-05, "loss": 0.0192, "step": 3250 }, { "epoch": 0.14792211401573374, "grad_norm": 0.27296578884124756, "learning_rate": 1.6517249384176163e-05, "loss": 0.0177, "step": 3260 }, { "epoch": 0.14837586283173293, "grad_norm": 0.28070753812789917, "learning_rate": 1.6494927494121827e-05, "loss": 0.0192, "step": 3270 }, { "epoch": 0.14882961164773212, "grad_norm": 0.33768898248672485, "learning_rate": 1.647254948882426e-05, "loss": 0.0198, "step": 3280 }, { "epoch": 0.14928336046373128, "grad_norm": 0.2648533284664154, "learning_rate": 1.6450115561626237e-05, "loss": 0.0254, "step": 3290 }, { "epoch": 0.14973710927973047, "grad_norm": 0.2503820061683655, "learning_rate": 1.6427625906353667e-05, "loss": 0.0215, "step": 3300 }, { "epoch": 0.15019085809572966, "grad_norm": 0.22484207153320312, "learning_rate": 1.640508071731395e-05, "loss": 0.0242, "step": 3310 }, { "epoch": 0.15064460691172885, "grad_norm": 0.2587619125843048, "learning_rate": 1.6382480189294293e-05, "loss": 0.0162, "step": 3320 }, { "epoch": 0.15109835572772803, "grad_norm": 0.3761514127254486, "learning_rate": 1.635982451756002e-05, "loss": 0.0228, "step": 3330 }, { "epoch": 0.1515521045437272, "grad_norm": 0.27407318353652954, "learning_rate": 1.6337113897852887e-05, "loss": 0.0185, "step": 3340 }, { "epoch": 0.15200585335972638, "grad_norm": 0.33319827914237976, "learning_rate": 1.6314348526389396e-05, "loss": 0.0196, "step": 3350 }, { "epoch": 0.15245960217572557, "grad_norm": 0.26822468638420105, "learning_rate": 1.6291528599859102e-05, "loss": 0.0203, "step": 3360 }, { "epoch": 0.15291335099172476, "grad_norm": 0.41813936829566956, "learning_rate": 1.6268654315422892e-05, "loss": 0.024, "step": 3370 }, { "epoch": 0.15336709980772395, "grad_norm": 0.34442031383514404, "learning_rate": 1.6245725870711314e-05, "loss": 0.0228, "step": 3380 }, { "epoch": 0.1538208486237231, "grad_norm": 0.2739299535751343, "learning_rate": 1.6222743463822842e-05, "loss": 0.0209, "step": 3390 }, { "epoch": 0.1542745974397223, "grad_norm": 0.31265953183174133, "learning_rate": 1.6199707293322183e-05, "loss": 0.021, "step": 3400 }, { "epoch": 0.1547283462557215, "grad_norm": 0.23598606884479523, "learning_rate": 1.6176617558238548e-05, "loss": 0.0252, "step": 3410 }, { "epoch": 0.15518209507172068, "grad_norm": 0.28678372502326965, "learning_rate": 1.615347445806394e-05, "loss": 0.0253, "step": 3420 }, { "epoch": 0.15563584388771987, "grad_norm": 0.2389034628868103, "learning_rate": 1.613027819275143e-05, "loss": 0.0234, "step": 3430 }, { "epoch": 0.15608959270371903, "grad_norm": 0.23054935038089752, "learning_rate": 1.6107028962713433e-05, "loss": 0.0212, "step": 3440 }, { "epoch": 0.15654334151971822, "grad_norm": 0.2669808566570282, "learning_rate": 1.608372696881996e-05, "loss": 0.0211, "step": 3450 }, { "epoch": 0.1569970903357174, "grad_norm": 0.3297053277492523, "learning_rate": 1.60603724123969e-05, "loss": 0.0207, "step": 3460 }, { "epoch": 0.1574508391517166, "grad_norm": 0.21432587504386902, "learning_rate": 1.603696549522428e-05, "loss": 0.0206, "step": 3470 }, { "epoch": 0.15790458796771578, "grad_norm": 0.271321177482605, "learning_rate": 1.6013506419534505e-05, "loss": 0.0254, "step": 3480 }, { "epoch": 0.15835833678371494, "grad_norm": 0.27586275339126587, "learning_rate": 1.598999538801064e-05, "loss": 0.0193, "step": 3490 }, { "epoch": 0.15881208559971413, "grad_norm": 0.26843059062957764, "learning_rate": 1.5966432603784615e-05, "loss": 0.0185, "step": 3500 }, { "epoch": 0.15926583441571332, "grad_norm": 0.21643030643463135, "learning_rate": 1.594281827043552e-05, "loss": 0.0191, "step": 3510 }, { "epoch": 0.1597195832317125, "grad_norm": 0.3599730432033539, "learning_rate": 1.5919152591987814e-05, "loss": 0.0186, "step": 3520 }, { "epoch": 0.1601733320477117, "grad_norm": 0.31179705262184143, "learning_rate": 1.5895435772909564e-05, "loss": 0.0201, "step": 3530 }, { "epoch": 0.16062708086371086, "grad_norm": 0.2292439043521881, "learning_rate": 1.5871668018110694e-05, "loss": 0.0251, "step": 3540 }, { "epoch": 0.16108082967971005, "grad_norm": 0.26696154475212097, "learning_rate": 1.5847849532941196e-05, "loss": 0.0205, "step": 3550 }, { "epoch": 0.16153457849570924, "grad_norm": 0.33195433020591736, "learning_rate": 1.5823980523189373e-05, "loss": 0.0173, "step": 3560 }, { "epoch": 0.16198832731170842, "grad_norm": 0.27310413122177124, "learning_rate": 1.580006119508005e-05, "loss": 0.0203, "step": 3570 }, { "epoch": 0.1624420761277076, "grad_norm": 0.3618914783000946, "learning_rate": 1.5776091755272792e-05, "loss": 0.0181, "step": 3580 }, { "epoch": 0.16289582494370677, "grad_norm": 0.3037472069263458, "learning_rate": 1.5752072410860132e-05, "loss": 0.0256, "step": 3590 }, { "epoch": 0.16334957375970596, "grad_norm": 0.2834845781326294, "learning_rate": 1.5728003369365763e-05, "loss": 0.0211, "step": 3600 }, { "epoch": 0.16380332257570515, "grad_norm": 0.24537642300128937, "learning_rate": 1.5703884838742755e-05, "loss": 0.021, "step": 3610 }, { "epoch": 0.16425707139170434, "grad_norm": 0.31755295395851135, "learning_rate": 1.5679717027371756e-05, "loss": 0.0228, "step": 3620 }, { "epoch": 0.16471082020770353, "grad_norm": 0.23259779810905457, "learning_rate": 1.5655500144059202e-05, "loss": 0.0181, "step": 3630 }, { "epoch": 0.16516456902370272, "grad_norm": 0.27074870467185974, "learning_rate": 1.5631234398035483e-05, "loss": 0.0209, "step": 3640 }, { "epoch": 0.16561831783970188, "grad_norm": 0.32329607009887695, "learning_rate": 1.5606919998953182e-05, "loss": 0.0216, "step": 3650 }, { "epoch": 0.16607206665570107, "grad_norm": 0.34350404143333435, "learning_rate": 1.5582557156885218e-05, "loss": 0.0215, "step": 3660 }, { "epoch": 0.16652581547170026, "grad_norm": 0.21537230908870697, "learning_rate": 1.5558146082323056e-05, "loss": 0.0223, "step": 3670 }, { "epoch": 0.16697956428769944, "grad_norm": 0.35468804836273193, "learning_rate": 1.5533686986174885e-05, "loss": 0.0191, "step": 3680 }, { "epoch": 0.16743331310369863, "grad_norm": 0.49466362595558167, "learning_rate": 1.5509180079763794e-05, "loss": 0.025, "step": 3690 }, { "epoch": 0.1678870619196978, "grad_norm": 0.2885371446609497, "learning_rate": 1.548462557482594e-05, "loss": 0.021, "step": 3700 }, { "epoch": 0.16834081073569698, "grad_norm": 0.19030006229877472, "learning_rate": 1.546002368350873e-05, "loss": 0.0185, "step": 3710 }, { "epoch": 0.16879455955169617, "grad_norm": 0.33008700609207153, "learning_rate": 1.5435374618368987e-05, "loss": 0.0219, "step": 3720 }, { "epoch": 0.16924830836769536, "grad_norm": 0.4167426526546478, "learning_rate": 1.5410678592371097e-05, "loss": 0.0236, "step": 3730 }, { "epoch": 0.16970205718369455, "grad_norm": 0.25845128297805786, "learning_rate": 1.5385935818885185e-05, "loss": 0.0186, "step": 3740 }, { "epoch": 0.1701558059996937, "grad_norm": 0.19601842761039734, "learning_rate": 1.5361146511685275e-05, "loss": 0.0174, "step": 3750 }, { "epoch": 0.1706095548156929, "grad_norm": 0.24205590784549713, "learning_rate": 1.5336310884947424e-05, "loss": 0.0182, "step": 3760 }, { "epoch": 0.1710633036316921, "grad_norm": 0.2610029876232147, "learning_rate": 1.5311429153247898e-05, "loss": 0.0169, "step": 3770 }, { "epoch": 0.17151705244769128, "grad_norm": 0.2502942681312561, "learning_rate": 1.5286501531561292e-05, "loss": 0.019, "step": 3780 }, { "epoch": 0.17197080126369046, "grad_norm": 0.29031598567962646, "learning_rate": 1.526152823525868e-05, "loss": 0.0203, "step": 3790 }, { "epoch": 0.17242455007968963, "grad_norm": 0.2875536382198334, "learning_rate": 1.5236509480105781e-05, "loss": 0.0237, "step": 3800 }, { "epoch": 0.17287829889568881, "grad_norm": 0.31084775924682617, "learning_rate": 1.5211445482261039e-05, "loss": 0.0247, "step": 3810 }, { "epoch": 0.173332047711688, "grad_norm": 0.22767387330532074, "learning_rate": 1.5186336458273809e-05, "loss": 0.0223, "step": 3820 }, { "epoch": 0.1737857965276872, "grad_norm": 0.3203928768634796, "learning_rate": 1.5161182625082469e-05, "loss": 0.0204, "step": 3830 }, { "epoch": 0.17423954534368638, "grad_norm": 0.277687132358551, "learning_rate": 1.5135984200012526e-05, "loss": 0.0238, "step": 3840 }, { "epoch": 0.17469329415968554, "grad_norm": 0.23926757276058197, "learning_rate": 1.511074140077477e-05, "loss": 0.0195, "step": 3850 }, { "epoch": 0.17514704297568473, "grad_norm": 0.278305321931839, "learning_rate": 1.5085454445463367e-05, "loss": 0.0221, "step": 3860 }, { "epoch": 0.17560079179168392, "grad_norm": 0.22894488275051117, "learning_rate": 1.506012355255399e-05, "loss": 0.0173, "step": 3870 }, { "epoch": 0.1760545406076831, "grad_norm": 0.315358966588974, "learning_rate": 1.503474894090193e-05, "loss": 0.0193, "step": 3880 }, { "epoch": 0.1765082894236823, "grad_norm": 0.2551231384277344, "learning_rate": 1.5009330829740183e-05, "loss": 0.0173, "step": 3890 }, { "epoch": 0.17696203823968146, "grad_norm": 0.198433056473732, "learning_rate": 1.4983869438677605e-05, "loss": 0.0213, "step": 3900 }, { "epoch": 0.17741578705568065, "grad_norm": 0.3573533892631531, "learning_rate": 1.4958364987696956e-05, "loss": 0.0159, "step": 3910 }, { "epoch": 0.17786953587167983, "grad_norm": 0.24327196180820465, "learning_rate": 1.4932817697153046e-05, "loss": 0.019, "step": 3920 }, { "epoch": 0.17832328468767902, "grad_norm": 0.2960565984249115, "learning_rate": 1.4907227787770805e-05, "loss": 0.0214, "step": 3930 }, { "epoch": 0.1787770335036782, "grad_norm": 0.26914340257644653, "learning_rate": 1.4881595480643379e-05, "loss": 0.021, "step": 3940 }, { "epoch": 0.17923078231967737, "grad_norm": 0.20682498812675476, "learning_rate": 1.4855920997230238e-05, "loss": 0.0199, "step": 3950 }, { "epoch": 0.17968453113567656, "grad_norm": 0.3470107913017273, "learning_rate": 1.4830204559355234e-05, "loss": 0.0214, "step": 3960 }, { "epoch": 0.18013827995167575, "grad_norm": 0.2254122793674469, "learning_rate": 1.4804446389204715e-05, "loss": 0.0182, "step": 3970 }, { "epoch": 0.18059202876767494, "grad_norm": 0.29120850563049316, "learning_rate": 1.4778646709325573e-05, "loss": 0.0196, "step": 3980 }, { "epoch": 0.18104577758367413, "grad_norm": 0.23350928723812103, "learning_rate": 1.4752805742623349e-05, "loss": 0.0204, "step": 3990 }, { "epoch": 0.1814995263996733, "grad_norm": 0.34425532817840576, "learning_rate": 1.47269237123603e-05, "loss": 0.0239, "step": 4000 }, { "epoch": 0.18195327521567248, "grad_norm": 0.15927566587924957, "learning_rate": 1.470100084215345e-05, "loss": 0.0181, "step": 4010 }, { "epoch": 0.18240702403167167, "grad_norm": 0.2823963761329651, "learning_rate": 1.4675037355972693e-05, "loss": 0.0247, "step": 4020 }, { "epoch": 0.18286077284767085, "grad_norm": 0.3391494154930115, "learning_rate": 1.4649033478138825e-05, "loss": 0.02, "step": 4030 }, { "epoch": 0.18331452166367004, "grad_norm": 0.21237792074680328, "learning_rate": 1.4622989433321627e-05, "loss": 0.0268, "step": 4040 }, { "epoch": 0.1837682704796692, "grad_norm": 0.1844082921743393, "learning_rate": 1.459690544653792e-05, "loss": 0.0217, "step": 4050 }, { "epoch": 0.1842220192956684, "grad_norm": 0.2141391634941101, "learning_rate": 1.457078174314961e-05, "loss": 0.0195, "step": 4060 }, { "epoch": 0.18467576811166758, "grad_norm": 0.27008184790611267, "learning_rate": 1.4544618548861753e-05, "loss": 0.0198, "step": 4070 }, { "epoch": 0.18512951692766677, "grad_norm": 0.31953588128089905, "learning_rate": 1.45184160897206e-05, "loss": 0.0164, "step": 4080 }, { "epoch": 0.18558326574366596, "grad_norm": 0.3222372531890869, "learning_rate": 1.4492174592111642e-05, "loss": 0.0227, "step": 4090 }, { "epoch": 0.18603701455966512, "grad_norm": 0.40079936385154724, "learning_rate": 1.4465894282757662e-05, "loss": 0.0198, "step": 4100 }, { "epoch": 0.1864907633756643, "grad_norm": 0.26317480206489563, "learning_rate": 1.4439575388716768e-05, "loss": 0.0204, "step": 4110 }, { "epoch": 0.1869445121916635, "grad_norm": 0.24995321035385132, "learning_rate": 1.441321813738044e-05, "loss": 0.0187, "step": 4120 }, { "epoch": 0.1873982610076627, "grad_norm": 0.29002171754837036, "learning_rate": 1.4386822756471545e-05, "loss": 0.0211, "step": 4130 }, { "epoch": 0.18785200982366188, "grad_norm": 0.13846643269062042, "learning_rate": 1.43603894740424e-05, "loss": 0.0198, "step": 4140 }, { "epoch": 0.18830575863966104, "grad_norm": 0.29654228687286377, "learning_rate": 1.4333918518472773e-05, "loss": 0.0212, "step": 4150 }, { "epoch": 0.18875950745566022, "grad_norm": 0.24401293694972992, "learning_rate": 1.4307410118467932e-05, "loss": 0.0168, "step": 4160 }, { "epoch": 0.1892132562716594, "grad_norm": 0.19131697714328766, "learning_rate": 1.428086450305666e-05, "loss": 0.0207, "step": 4170 }, { "epoch": 0.1896670050876586, "grad_norm": 0.2668836712837219, "learning_rate": 1.4254281901589263e-05, "loss": 0.0191, "step": 4180 }, { "epoch": 0.1901207539036578, "grad_norm": 0.20712126791477203, "learning_rate": 1.4227662543735618e-05, "loss": 0.0192, "step": 4190 }, { "epoch": 0.19057450271965698, "grad_norm": 0.27224496006965637, "learning_rate": 1.4201006659483156e-05, "loss": 0.0187, "step": 4200 }, { "epoch": 0.19102825153565614, "grad_norm": 0.3294546902179718, "learning_rate": 1.4174314479134909e-05, "loss": 0.0174, "step": 4210 }, { "epoch": 0.19148200035165533, "grad_norm": 0.291116327047348, "learning_rate": 1.4147586233307485e-05, "loss": 0.022, "step": 4220 }, { "epoch": 0.19193574916765452, "grad_norm": 0.30992236733436584, "learning_rate": 1.4120822152929099e-05, "loss": 0.0201, "step": 4230 }, { "epoch": 0.1923894979836537, "grad_norm": 0.304377943277359, "learning_rate": 1.4094022469237577e-05, "loss": 0.0218, "step": 4240 }, { "epoch": 0.1928432467996529, "grad_norm": 0.20545247197151184, "learning_rate": 1.4067187413778338e-05, "loss": 0.0166, "step": 4250 }, { "epoch": 0.19329699561565206, "grad_norm": 0.24775473773479462, "learning_rate": 1.4040317218402426e-05, "loss": 0.0183, "step": 4260 }, { "epoch": 0.19375074443165125, "grad_norm": 0.28842994570732117, "learning_rate": 1.4013412115264477e-05, "loss": 0.0229, "step": 4270 }, { "epoch": 0.19420449324765043, "grad_norm": 0.23698964715003967, "learning_rate": 1.398647233682073e-05, "loss": 0.0198, "step": 4280 }, { "epoch": 0.19465824206364962, "grad_norm": 0.2762935161590576, "learning_rate": 1.3959498115827007e-05, "loss": 0.0154, "step": 4290 }, { "epoch": 0.1951119908796488, "grad_norm": 0.23715965449810028, "learning_rate": 1.3932489685336722e-05, "loss": 0.0197, "step": 4300 }, { "epoch": 0.19556573969564797, "grad_norm": 0.28485918045043945, "learning_rate": 1.3905447278698838e-05, "loss": 0.0202, "step": 4310 }, { "epoch": 0.19601948851164716, "grad_norm": 0.29975974559783936, "learning_rate": 1.3878371129555874e-05, "loss": 0.018, "step": 4320 }, { "epoch": 0.19647323732764635, "grad_norm": 0.22805023193359375, "learning_rate": 1.3851261471841891e-05, "loss": 0.018, "step": 4330 }, { "epoch": 0.19692698614364554, "grad_norm": 0.21557988226413727, "learning_rate": 1.382411853978044e-05, "loss": 0.017, "step": 4340 }, { "epoch": 0.19738073495964473, "grad_norm": 0.16113589704036713, "learning_rate": 1.3796942567882565e-05, "loss": 0.0162, "step": 4350 }, { "epoch": 0.1978344837756439, "grad_norm": 0.2514406740665436, "learning_rate": 1.3769733790944777e-05, "loss": 0.0179, "step": 4360 }, { "epoch": 0.19828823259164308, "grad_norm": 0.24677151441574097, "learning_rate": 1.3742492444047e-05, "loss": 0.0191, "step": 4370 }, { "epoch": 0.19874198140764227, "grad_norm": 0.30861198902130127, "learning_rate": 1.3715218762550584e-05, "loss": 0.017, "step": 4380 }, { "epoch": 0.19919573022364145, "grad_norm": 0.24955937266349792, "learning_rate": 1.368791298209622e-05, "loss": 0.0201, "step": 4390 }, { "epoch": 0.19964947903964064, "grad_norm": 0.2943950295448303, "learning_rate": 1.3660575338601945e-05, "loss": 0.0174, "step": 4400 }, { "epoch": 0.2001032278556398, "grad_norm": 0.24923928081989288, "learning_rate": 1.363320606826108e-05, "loss": 0.0185, "step": 4410 }, { "epoch": 0.200556976671639, "grad_norm": 0.25516021251678467, "learning_rate": 1.36058054075402e-05, "loss": 0.017, "step": 4420 }, { "epoch": 0.20101072548763818, "grad_norm": 0.25542205572128296, "learning_rate": 1.3578373593177091e-05, "loss": 0.0162, "step": 4430 }, { "epoch": 0.20146447430363737, "grad_norm": 0.29285094141960144, "learning_rate": 1.35509108621787e-05, "loss": 0.0193, "step": 4440 }, { "epoch": 0.20191822311963656, "grad_norm": 0.33193743228912354, "learning_rate": 1.3523417451819087e-05, "loss": 0.0182, "step": 4450 }, { "epoch": 0.20237197193563572, "grad_norm": 0.23243366181850433, "learning_rate": 1.3495893599637385e-05, "loss": 0.0186, "step": 4460 }, { "epoch": 0.2028257207516349, "grad_norm": 0.23335903882980347, "learning_rate": 1.3468339543435725e-05, "loss": 0.0187, "step": 4470 }, { "epoch": 0.2032794695676341, "grad_norm": 0.2702314257621765, "learning_rate": 1.3440755521277209e-05, "loss": 0.0208, "step": 4480 }, { "epoch": 0.20373321838363329, "grad_norm": 0.23103106021881104, "learning_rate": 1.3413141771483842e-05, "loss": 0.0185, "step": 4490 }, { "epoch": 0.20418696719963247, "grad_norm": 0.26019635796546936, "learning_rate": 1.3385498532634465e-05, "loss": 0.0183, "step": 4500 }, { "epoch": 0.20464071601563164, "grad_norm": 0.3081931173801422, "learning_rate": 1.3357826043562698e-05, "loss": 0.021, "step": 4510 }, { "epoch": 0.20509446483163082, "grad_norm": 0.19445614516735077, "learning_rate": 1.3330124543354888e-05, "loss": 0.02, "step": 4520 }, { "epoch": 0.20554821364763, "grad_norm": 0.3190820813179016, "learning_rate": 1.3302394271348026e-05, "loss": 0.0205, "step": 4530 }, { "epoch": 0.2060019624636292, "grad_norm": 0.37715262174606323, "learning_rate": 1.3274635467127688e-05, "loss": 0.0164, "step": 4540 }, { "epoch": 0.2064557112796284, "grad_norm": 0.21620811522006989, "learning_rate": 1.3246848370525973e-05, "loss": 0.0195, "step": 4550 }, { "epoch": 0.20690946009562755, "grad_norm": 0.21234837174415588, "learning_rate": 1.3219033221619408e-05, "loss": 0.0178, "step": 4560 }, { "epoch": 0.20736320891162674, "grad_norm": 0.2603495717048645, "learning_rate": 1.3191190260726903e-05, "loss": 0.018, "step": 4570 }, { "epoch": 0.20781695772762593, "grad_norm": 0.30615153908729553, "learning_rate": 1.3163319728407645e-05, "loss": 0.0201, "step": 4580 }, { "epoch": 0.20827070654362512, "grad_norm": 0.41749992966651917, "learning_rate": 1.3135421865459042e-05, "loss": 0.0219, "step": 4590 }, { "epoch": 0.2087244553596243, "grad_norm": 0.2161131054162979, "learning_rate": 1.3107496912914636e-05, "loss": 0.0158, "step": 4600 }, { "epoch": 0.20917820417562347, "grad_norm": 0.2533343732357025, "learning_rate": 1.307954511204202e-05, "loss": 0.0207, "step": 4610 }, { "epoch": 0.20963195299162266, "grad_norm": 0.23658685386180878, "learning_rate": 1.3051566704340746e-05, "loss": 0.0173, "step": 4620 }, { "epoch": 0.21008570180762184, "grad_norm": 0.25153669714927673, "learning_rate": 1.3023561931540247e-05, "loss": 0.0181, "step": 4630 }, { "epoch": 0.21053945062362103, "grad_norm": 0.23631227016448975, "learning_rate": 1.2995531035597753e-05, "loss": 0.0175, "step": 4640 }, { "epoch": 0.21099319943962022, "grad_norm": 0.26333701610565186, "learning_rate": 1.2967474258696186e-05, "loss": 0.017, "step": 4650 }, { "epoch": 0.21144694825561938, "grad_norm": 0.21821460127830505, "learning_rate": 1.2939391843242082e-05, "loss": 0.0225, "step": 4660 }, { "epoch": 0.21190069707161857, "grad_norm": 0.15162289142608643, "learning_rate": 1.291128403186349e-05, "loss": 0.0163, "step": 4670 }, { "epoch": 0.21235444588761776, "grad_norm": 0.2898188531398773, "learning_rate": 1.2883151067407866e-05, "loss": 0.0189, "step": 4680 }, { "epoch": 0.21280819470361695, "grad_norm": 0.12821677327156067, "learning_rate": 1.2854993192940005e-05, "loss": 0.0191, "step": 4690 }, { "epoch": 0.21326194351961614, "grad_norm": 0.25664690136909485, "learning_rate": 1.2826810651739899e-05, "loss": 0.0206, "step": 4700 }, { "epoch": 0.2137156923356153, "grad_norm": 0.18185020983219147, "learning_rate": 1.279860368730067e-05, "loss": 0.0179, "step": 4710 }, { "epoch": 0.2141694411516145, "grad_norm": 0.20872211456298828, "learning_rate": 1.2770372543326454e-05, "loss": 0.0218, "step": 4720 }, { "epoch": 0.21462318996761368, "grad_norm": 0.2035290002822876, "learning_rate": 1.2742117463730289e-05, "loss": 0.014, "step": 4730 }, { "epoch": 0.21507693878361286, "grad_norm": 0.20565688610076904, "learning_rate": 1.2713838692632015e-05, "loss": 0.02, "step": 4740 }, { "epoch": 0.21553068759961205, "grad_norm": 0.2465037703514099, "learning_rate": 1.2685536474356161e-05, "loss": 0.0204, "step": 4750 }, { "epoch": 0.21598443641561124, "grad_norm": 0.19388899207115173, "learning_rate": 1.2657211053429844e-05, "loss": 0.018, "step": 4760 }, { "epoch": 0.2164381852316104, "grad_norm": 0.29507505893707275, "learning_rate": 1.2628862674580642e-05, "loss": 0.0191, "step": 4770 }, { "epoch": 0.2168919340476096, "grad_norm": 0.2713572382926941, "learning_rate": 1.2600491582734484e-05, "loss": 0.0196, "step": 4780 }, { "epoch": 0.21734568286360878, "grad_norm": 0.2892291843891144, "learning_rate": 1.2572098023013544e-05, "loss": 0.0214, "step": 4790 }, { "epoch": 0.21779943167960797, "grad_norm": 0.14440372586250305, "learning_rate": 1.254368224073411e-05, "loss": 0.0192, "step": 4800 }, { "epoch": 0.21825318049560716, "grad_norm": 0.25437942147254944, "learning_rate": 1.251524448140447e-05, "loss": 0.0217, "step": 4810 }, { "epoch": 0.21870692931160632, "grad_norm": 0.280804306268692, "learning_rate": 1.2486784990722791e-05, "loss": 0.0212, "step": 4820 }, { "epoch": 0.2191606781276055, "grad_norm": 0.16871598362922668, "learning_rate": 1.2458304014574996e-05, "loss": 0.017, "step": 4830 }, { "epoch": 0.2196144269436047, "grad_norm": 0.34904944896698, "learning_rate": 1.242980179903264e-05, "loss": 0.022, "step": 4840 }, { "epoch": 0.22006817575960388, "grad_norm": 0.2750530540943146, "learning_rate": 1.2401278590350782e-05, "loss": 0.0217, "step": 4850 }, { "epoch": 0.22052192457560307, "grad_norm": 0.2973173260688782, "learning_rate": 1.2372734634965861e-05, "loss": 0.0202, "step": 4860 }, { "epoch": 0.22097567339160223, "grad_norm": 0.2599181830883026, "learning_rate": 1.234417017949356e-05, "loss": 0.0184, "step": 4870 }, { "epoch": 0.22142942220760142, "grad_norm": 0.24286292493343353, "learning_rate": 1.2315585470726685e-05, "loss": 0.0217, "step": 4880 }, { "epoch": 0.2218831710236006, "grad_norm": 0.2696918547153473, "learning_rate": 1.2286980755633027e-05, "loss": 0.0186, "step": 4890 }, { "epoch": 0.2223369198395998, "grad_norm": 0.27403658628463745, "learning_rate": 1.225835628135322e-05, "loss": 0.0183, "step": 4900 }, { "epoch": 0.222790668655599, "grad_norm": 0.29792413115501404, "learning_rate": 1.2229712295198633e-05, "loss": 0.019, "step": 4910 }, { "epoch": 0.22324441747159815, "grad_norm": 0.2840261161327362, "learning_rate": 1.2201049044649192e-05, "loss": 0.0215, "step": 4920 }, { "epoch": 0.22369816628759734, "grad_norm": 0.2787289023399353, "learning_rate": 1.217236677735128e-05, "loss": 0.0163, "step": 4930 }, { "epoch": 0.22415191510359653, "grad_norm": 0.23643659055233002, "learning_rate": 1.2143665741115581e-05, "loss": 0.0157, "step": 4940 }, { "epoch": 0.22460566391959572, "grad_norm": 0.22223883867263794, "learning_rate": 1.2114946183914935e-05, "loss": 0.0167, "step": 4950 }, { "epoch": 0.2250594127355949, "grad_norm": 0.3151393234729767, "learning_rate": 1.2086208353882203e-05, "loss": 0.0226, "step": 4960 }, { "epoch": 0.22551316155159407, "grad_norm": 0.3878695070743561, "learning_rate": 1.2057452499308117e-05, "loss": 0.0203, "step": 4970 }, { "epoch": 0.22596691036759325, "grad_norm": 0.25745895504951477, "learning_rate": 1.2028678868639147e-05, "loss": 0.016, "step": 4980 }, { "epoch": 0.22642065918359244, "grad_norm": 0.33859357237815857, "learning_rate": 1.1999887710475337e-05, "loss": 0.0186, "step": 4990 }, { "epoch": 0.22687440799959163, "grad_norm": 0.24744485318660736, "learning_rate": 1.197107927356817e-05, "loss": 0.0168, "step": 5000 }, { "epoch": 0.22732815681559082, "grad_norm": 0.28199446201324463, "learning_rate": 1.1942253806818414e-05, "loss": 0.0207, "step": 5010 }, { "epoch": 0.22778190563158998, "grad_norm": 0.20874083042144775, "learning_rate": 1.1913411559273973e-05, "loss": 0.0167, "step": 5020 }, { "epoch": 0.22823565444758917, "grad_norm": 0.2683298587799072, "learning_rate": 1.1884552780127736e-05, "loss": 0.0194, "step": 5030 }, { "epoch": 0.22868940326358836, "grad_norm": 0.20313489437103271, "learning_rate": 1.1855677718715417e-05, "loss": 0.0162, "step": 5040 }, { "epoch": 0.22914315207958755, "grad_norm": 0.2615211308002472, "learning_rate": 1.1826786624513416e-05, "loss": 0.0199, "step": 5050 }, { "epoch": 0.22959690089558674, "grad_norm": 0.3032364845275879, "learning_rate": 1.1797879747136645e-05, "loss": 0.0155, "step": 5060 }, { "epoch": 0.2300506497115859, "grad_norm": 0.23648901283740997, "learning_rate": 1.1768957336336384e-05, "loss": 0.0161, "step": 5070 }, { "epoch": 0.23050439852758509, "grad_norm": 0.2589399814605713, "learning_rate": 1.1740019641998124e-05, "loss": 0.0188, "step": 5080 }, { "epoch": 0.23095814734358427, "grad_norm": 0.2104191780090332, "learning_rate": 1.171106691413939e-05, "loss": 0.017, "step": 5090 }, { "epoch": 0.23141189615958346, "grad_norm": 0.19657613337039948, "learning_rate": 1.1682099402907612e-05, "loss": 0.0195, "step": 5100 }, { "epoch": 0.23186564497558265, "grad_norm": 0.3007795512676239, "learning_rate": 1.1653117358577937e-05, "loss": 0.0186, "step": 5110 }, { "epoch": 0.2323193937915818, "grad_norm": 0.2928745150566101, "learning_rate": 1.1624121031551073e-05, "loss": 0.0187, "step": 5120 }, { "epoch": 0.232773142607581, "grad_norm": 0.26405069231987, "learning_rate": 1.1595110672351132e-05, "loss": 0.0129, "step": 5130 }, { "epoch": 0.2332268914235802, "grad_norm": 0.27418017387390137, "learning_rate": 1.1566086531623464e-05, "loss": 0.0192, "step": 5140 }, { "epoch": 0.23368064023957938, "grad_norm": 0.2746230363845825, "learning_rate": 1.1537048860132487e-05, "loss": 0.0184, "step": 5150 }, { "epoch": 0.23413438905557857, "grad_norm": 0.2148381769657135, "learning_rate": 1.1507997908759525e-05, "loss": 0.0174, "step": 5160 }, { "epoch": 0.23458813787157773, "grad_norm": 0.2621228098869324, "learning_rate": 1.1478933928500635e-05, "loss": 0.0159, "step": 5170 }, { "epoch": 0.23504188668757692, "grad_norm": 0.23674488067626953, "learning_rate": 1.1449857170464445e-05, "loss": 0.0163, "step": 5180 }, { "epoch": 0.2354956355035761, "grad_norm": 0.32664570212364197, "learning_rate": 1.1420767885869974e-05, "loss": 0.0209, "step": 5190 }, { "epoch": 0.2359493843195753, "grad_norm": 0.18606340885162354, "learning_rate": 1.1391666326044484e-05, "loss": 0.0161, "step": 5200 }, { "epoch": 0.23640313313557448, "grad_norm": 0.13740357756614685, "learning_rate": 1.1362552742421269e-05, "loss": 0.0195, "step": 5210 }, { "epoch": 0.23685688195157364, "grad_norm": 0.2785436809062958, "learning_rate": 1.1333427386537537e-05, "loss": 0.0194, "step": 5220 }, { "epoch": 0.23731063076757283, "grad_norm": 0.24752356112003326, "learning_rate": 1.1304290510032184e-05, "loss": 0.0162, "step": 5230 }, { "epoch": 0.23776437958357202, "grad_norm": 0.1751922219991684, "learning_rate": 1.1275142364643645e-05, "loss": 0.0169, "step": 5240 }, { "epoch": 0.2382181283995712, "grad_norm": 0.2900957465171814, "learning_rate": 1.1245983202207729e-05, "loss": 0.0185, "step": 5250 }, { "epoch": 0.2386718772155704, "grad_norm": 0.19540879130363464, "learning_rate": 1.1216813274655417e-05, "loss": 0.0149, "step": 5260 }, { "epoch": 0.23912562603156956, "grad_norm": 0.2967893183231354, "learning_rate": 1.1187632834010707e-05, "loss": 0.0148, "step": 5270 }, { "epoch": 0.23957937484756875, "grad_norm": 0.2377578765153885, "learning_rate": 1.1158442132388427e-05, "loss": 0.0179, "step": 5280 }, { "epoch": 0.24003312366356794, "grad_norm": 0.3314591944217682, "learning_rate": 1.1129241421992059e-05, "loss": 0.0204, "step": 5290 }, { "epoch": 0.24048687247956713, "grad_norm": 0.20294828712940216, "learning_rate": 1.1100030955111554e-05, "loss": 0.0146, "step": 5300 }, { "epoch": 0.24094062129556632, "grad_norm": 0.2913452684879303, "learning_rate": 1.1070810984121164e-05, "loss": 0.0171, "step": 5310 }, { "epoch": 0.2413943701115655, "grad_norm": 0.28436318039894104, "learning_rate": 1.1041581761477252e-05, "loss": 0.0208, "step": 5320 }, { "epoch": 0.24184811892756466, "grad_norm": 0.29696106910705566, "learning_rate": 1.1012343539716115e-05, "loss": 0.0232, "step": 5330 }, { "epoch": 0.24230186774356385, "grad_norm": 0.34529179334640503, "learning_rate": 1.0983096571451805e-05, "loss": 0.0199, "step": 5340 }, { "epoch": 0.24275561655956304, "grad_norm": 0.2736857235431671, "learning_rate": 1.0953841109373935e-05, "loss": 0.0214, "step": 5350 }, { "epoch": 0.24320936537556223, "grad_norm": 0.21796494722366333, "learning_rate": 1.0924577406245507e-05, "loss": 0.0151, "step": 5360 }, { "epoch": 0.24366311419156142, "grad_norm": 0.3323252201080322, "learning_rate": 1.0895305714900721e-05, "loss": 0.0133, "step": 5370 }, { "epoch": 0.24411686300756058, "grad_norm": 0.19576877355575562, "learning_rate": 1.0866026288242803e-05, "loss": 0.0156, "step": 5380 }, { "epoch": 0.24457061182355977, "grad_norm": 0.2512732148170471, "learning_rate": 1.0836739379241805e-05, "loss": 0.0196, "step": 5390 }, { "epoch": 0.24502436063955896, "grad_norm": 0.299798846244812, "learning_rate": 1.0807445240932422e-05, "loss": 0.0235, "step": 5400 }, { "epoch": 0.24547810945555815, "grad_norm": 0.18500925600528717, "learning_rate": 1.0778144126411815e-05, "loss": 0.015, "step": 5410 }, { "epoch": 0.24593185827155734, "grad_norm": 0.25614041090011597, "learning_rate": 1.0748836288837418e-05, "loss": 0.0198, "step": 5420 }, { "epoch": 0.2463856070875565, "grad_norm": 0.238459050655365, "learning_rate": 1.0719521981424745e-05, "loss": 0.0171, "step": 5430 }, { "epoch": 0.24683935590355569, "grad_norm": 0.30754324793815613, "learning_rate": 1.0690201457445218e-05, "loss": 0.0155, "step": 5440 }, { "epoch": 0.24729310471955487, "grad_norm": 0.24890351295471191, "learning_rate": 1.0660874970223963e-05, "loss": 0.0176, "step": 5450 }, { "epoch": 0.24774685353555406, "grad_norm": 0.2697153687477112, "learning_rate": 1.0631542773137627e-05, "loss": 0.0148, "step": 5460 }, { "epoch": 0.24820060235155325, "grad_norm": 0.3703860640525818, "learning_rate": 1.060220511961219e-05, "loss": 0.0163, "step": 5470 }, { "epoch": 0.2486543511675524, "grad_norm": 0.24785463511943817, "learning_rate": 1.0572862263120784e-05, "loss": 0.0176, "step": 5480 }, { "epoch": 0.2491080999835516, "grad_norm": 0.13200345635414124, "learning_rate": 1.0543514457181476e-05, "loss": 0.0166, "step": 5490 }, { "epoch": 0.2495618487995508, "grad_norm": 0.1360347867012024, "learning_rate": 1.051416195535511e-05, "loss": 0.0169, "step": 5500 }, { "epoch": 0.25001559761555, "grad_norm": 0.1942114382982254, "learning_rate": 1.0484805011243102e-05, "loss": 0.016, "step": 5510 }, { "epoch": 0.25046934643154917, "grad_norm": 0.20423969626426697, "learning_rate": 1.0455443878485238e-05, "loss": 0.0165, "step": 5520 }, { "epoch": 0.25092309524754836, "grad_norm": 0.24992233514785767, "learning_rate": 1.0426078810757502e-05, "loss": 0.018, "step": 5530 }, { "epoch": 0.25137684406354754, "grad_norm": 0.23850075900554657, "learning_rate": 1.039671006176987e-05, "loss": 0.0171, "step": 5540 }, { "epoch": 0.2518305928795467, "grad_norm": 0.2388443797826767, "learning_rate": 1.0367337885264128e-05, "loss": 0.018, "step": 5550 }, { "epoch": 0.25228434169554587, "grad_norm": 0.16273514926433563, "learning_rate": 1.0337962535011679e-05, "loss": 0.0154, "step": 5560 }, { "epoch": 0.25273809051154505, "grad_norm": 0.16232016682624817, "learning_rate": 1.0308584264811332e-05, "loss": 0.0142, "step": 5570 }, { "epoch": 0.25319183932754424, "grad_norm": 0.22517164051532745, "learning_rate": 1.0279203328487142e-05, "loss": 0.0133, "step": 5580 }, { "epoch": 0.25364558814354343, "grad_norm": 0.19299018383026123, "learning_rate": 1.0249819979886184e-05, "loss": 0.0134, "step": 5590 }, { "epoch": 0.2540993369595426, "grad_norm": 0.16733065247535706, "learning_rate": 1.0220434472876384e-05, "loss": 0.0113, "step": 5600 }, { "epoch": 0.2545530857755418, "grad_norm": 0.17303834855556488, "learning_rate": 1.0191047061344315e-05, "loss": 0.0157, "step": 5610 }, { "epoch": 0.255006834591541, "grad_norm": 0.23059505224227905, "learning_rate": 1.0161657999192998e-05, "loss": 0.0156, "step": 5620 }, { "epoch": 0.2554605834075402, "grad_norm": 0.20609940588474274, "learning_rate": 1.0132267540339726e-05, "loss": 0.0141, "step": 5630 }, { "epoch": 0.2559143322235394, "grad_norm": 0.21203909814357758, "learning_rate": 1.010287593871385e-05, "loss": 0.0149, "step": 5640 }, { "epoch": 0.25636808103953856, "grad_norm": 0.1916971057653427, "learning_rate": 1.0073483448254599e-05, "loss": 0.0155, "step": 5650 }, { "epoch": 0.2568218298555377, "grad_norm": 0.2466076910495758, "learning_rate": 1.0044090322908884e-05, "loss": 0.0137, "step": 5660 }, { "epoch": 0.2572755786715369, "grad_norm": 0.24274955689907074, "learning_rate": 1.0014696816629093e-05, "loss": 0.016, "step": 5670 }, { "epoch": 0.2577293274875361, "grad_norm": 0.23252969980239868, "learning_rate": 9.985303183370909e-06, "loss": 0.0193, "step": 5680 }, { "epoch": 0.25818307630353526, "grad_norm": 0.1784472018480301, "learning_rate": 9.95590967709112e-06, "loss": 0.0152, "step": 5690 }, { "epoch": 0.25863682511953445, "grad_norm": 0.22088126838207245, "learning_rate": 9.926516551745401e-06, "loss": 0.0149, "step": 5700 }, { "epoch": 0.25909057393553364, "grad_norm": 0.2285860925912857, "learning_rate": 9.897124061286152e-06, "loss": 0.013, "step": 5710 }, { "epoch": 0.25954432275153283, "grad_norm": 0.22873656451702118, "learning_rate": 9.867732459660277e-06, "loss": 0.0145, "step": 5720 }, { "epoch": 0.259998071567532, "grad_norm": 0.30698075890541077, "learning_rate": 9.838342000807006e-06, "loss": 0.017, "step": 5730 }, { "epoch": 0.2604518203835312, "grad_norm": 0.221647709608078, "learning_rate": 9.808952938655689e-06, "loss": 0.0196, "step": 5740 }, { "epoch": 0.2609055691995304, "grad_norm": 0.2609390318393707, "learning_rate": 9.77956552712362e-06, "loss": 0.0196, "step": 5750 }, { "epoch": 0.26135931801552953, "grad_norm": 0.15778374671936035, "learning_rate": 9.75018002011382e-06, "loss": 0.0116, "step": 5760 }, { "epoch": 0.2618130668315287, "grad_norm": 0.2896614968776703, "learning_rate": 9.720796671512863e-06, "loss": 0.0178, "step": 5770 }, { "epoch": 0.2622668156475279, "grad_norm": 0.19365887343883514, "learning_rate": 9.69141573518867e-06, "loss": 0.0175, "step": 5780 }, { "epoch": 0.2627205644635271, "grad_norm": 0.20279602706432343, "learning_rate": 9.662037464988323e-06, "loss": 0.0151, "step": 5790 }, { "epoch": 0.2631743132795263, "grad_norm": 0.2189822942018509, "learning_rate": 9.63266211473587e-06, "loss": 0.0162, "step": 5800 }, { "epoch": 0.2636280620955255, "grad_norm": 0.2945132553577423, "learning_rate": 9.603289938230132e-06, "loss": 0.0166, "step": 5810 }, { "epoch": 0.26408181091152466, "grad_norm": 0.16155044734477997, "learning_rate": 9.573921189242501e-06, "loss": 0.0166, "step": 5820 }, { "epoch": 0.26453555972752385, "grad_norm": 0.2859227657318115, "learning_rate": 9.544556121514765e-06, "loss": 0.0173, "step": 5830 }, { "epoch": 0.26498930854352304, "grad_norm": 0.2795960009098053, "learning_rate": 9.5151949887569e-06, "loss": 0.0176, "step": 5840 }, { "epoch": 0.2654430573595222, "grad_norm": 0.22469447553157806, "learning_rate": 9.485838044644891e-06, "loss": 0.0167, "step": 5850 }, { "epoch": 0.26589680617552136, "grad_norm": 0.22965514659881592, "learning_rate": 9.456485542818527e-06, "loss": 0.0181, "step": 5860 }, { "epoch": 0.26635055499152055, "grad_norm": 0.2655508518218994, "learning_rate": 9.427137736879222e-06, "loss": 0.0145, "step": 5870 }, { "epoch": 0.26680430380751974, "grad_norm": 0.21690212190151215, "learning_rate": 9.397794880387812e-06, "loss": 0.0166, "step": 5880 }, { "epoch": 0.2672580526235189, "grad_norm": 0.23310965299606323, "learning_rate": 9.368457226862378e-06, "loss": 0.0145, "step": 5890 }, { "epoch": 0.2677118014395181, "grad_norm": 0.27904966473579407, "learning_rate": 9.339125029776039e-06, "loss": 0.0198, "step": 5900 }, { "epoch": 0.2681655502555173, "grad_norm": 0.2676159739494324, "learning_rate": 9.309798542554782e-06, "loss": 0.0141, "step": 5910 }, { "epoch": 0.2686192990715165, "grad_norm": 0.21334831416606903, "learning_rate": 9.280478018575257e-06, "loss": 0.016, "step": 5920 }, { "epoch": 0.2690730478875157, "grad_norm": 0.21897172927856445, "learning_rate": 9.251163711162584e-06, "loss": 0.0166, "step": 5930 }, { "epoch": 0.26952679670351487, "grad_norm": 0.21741193532943726, "learning_rate": 9.221855873588187e-06, "loss": 0.0157, "step": 5940 }, { "epoch": 0.26998054551951406, "grad_norm": 0.21360056102275848, "learning_rate": 9.192554759067581e-06, "loss": 0.0195, "step": 5950 }, { "epoch": 0.2704342943355132, "grad_norm": 0.16907712817192078, "learning_rate": 9.163260620758197e-06, "loss": 0.0122, "step": 5960 }, { "epoch": 0.2708880431515124, "grad_norm": 0.1720825433731079, "learning_rate": 9.133973711757198e-06, "loss": 0.0154, "step": 5970 }, { "epoch": 0.27134179196751157, "grad_norm": 0.27795663475990295, "learning_rate": 9.10469428509928e-06, "loss": 0.0172, "step": 5980 }, { "epoch": 0.27179554078351076, "grad_norm": 0.27780142426490784, "learning_rate": 9.075422593754498e-06, "loss": 0.0142, "step": 5990 }, { "epoch": 0.27224928959950995, "grad_norm": 0.33743828535079956, "learning_rate": 9.046158890626069e-06, "loss": 0.0202, "step": 6000 }, { "epoch": 0.27270303841550914, "grad_norm": 0.22341972589492798, "learning_rate": 9.016903428548195e-06, "loss": 0.0175, "step": 6010 }, { "epoch": 0.2731567872315083, "grad_norm": 0.2931184470653534, "learning_rate": 8.987656460283885e-06, "loss": 0.0153, "step": 6020 }, { "epoch": 0.2736105360475075, "grad_norm": 0.25371456146240234, "learning_rate": 8.958418238522748e-06, "loss": 0.0181, "step": 6030 }, { "epoch": 0.2740642848635067, "grad_norm": 0.21415118873119354, "learning_rate": 8.929189015878838e-06, "loss": 0.0164, "step": 6040 }, { "epoch": 0.2745180336795059, "grad_norm": 0.30469053983688354, "learning_rate": 8.899969044888448e-06, "loss": 0.015, "step": 6050 }, { "epoch": 0.274971782495505, "grad_norm": 0.2558583915233612, "learning_rate": 8.870758578007944e-06, "loss": 0.0152, "step": 6060 }, { "epoch": 0.2754255313115042, "grad_norm": 0.21895679831504822, "learning_rate": 8.841557867611576e-06, "loss": 0.0171, "step": 6070 }, { "epoch": 0.2758792801275034, "grad_norm": 0.29539406299591064, "learning_rate": 8.812367165989295e-06, "loss": 0.0166, "step": 6080 }, { "epoch": 0.2763330289435026, "grad_norm": 0.20228557288646698, "learning_rate": 8.783186725344588e-06, "loss": 0.015, "step": 6090 }, { "epoch": 0.2767867777595018, "grad_norm": 0.30251553654670715, "learning_rate": 8.754016797792276e-06, "loss": 0.019, "step": 6100 }, { "epoch": 0.27724052657550097, "grad_norm": 0.2147679477930069, "learning_rate": 8.72485763535636e-06, "loss": 0.0152, "step": 6110 }, { "epoch": 0.27769427539150016, "grad_norm": 0.26095765829086304, "learning_rate": 8.695709489967821e-06, "loss": 0.016, "step": 6120 }, { "epoch": 0.27814802420749934, "grad_norm": 0.2686784565448761, "learning_rate": 8.666572613462465e-06, "loss": 0.0185, "step": 6130 }, { "epoch": 0.27860177302349853, "grad_norm": 0.2595486342906952, "learning_rate": 8.63744725757873e-06, "loss": 0.0148, "step": 6140 }, { "epoch": 0.2790555218394977, "grad_norm": 0.1412426382303238, "learning_rate": 8.60833367395552e-06, "loss": 0.0152, "step": 6150 }, { "epoch": 0.2795092706554969, "grad_norm": 0.18741200864315033, "learning_rate": 8.579232114130027e-06, "loss": 0.0182, "step": 6160 }, { "epoch": 0.27996301947149604, "grad_norm": 0.22104179859161377, "learning_rate": 8.550142829535559e-06, "loss": 0.0113, "step": 6170 }, { "epoch": 0.28041676828749523, "grad_norm": 0.31477832794189453, "learning_rate": 8.521066071499368e-06, "loss": 0.0154, "step": 6180 }, { "epoch": 0.2808705171034944, "grad_norm": 0.21463708579540253, "learning_rate": 8.492002091240478e-06, "loss": 0.0165, "step": 6190 }, { "epoch": 0.2813242659194936, "grad_norm": 0.19469910860061646, "learning_rate": 8.462951139867514e-06, "loss": 0.0165, "step": 6200 }, { "epoch": 0.2817780147354928, "grad_norm": 0.21120582520961761, "learning_rate": 8.43391346837654e-06, "loss": 0.0155, "step": 6210 }, { "epoch": 0.282231763551492, "grad_norm": 0.252131849527359, "learning_rate": 8.404889327648873e-06, "loss": 0.0159, "step": 6220 }, { "epoch": 0.2826855123674912, "grad_norm": 0.30879899859428406, "learning_rate": 8.375878968448934e-06, "loss": 0.0157, "step": 6230 }, { "epoch": 0.28313926118349036, "grad_norm": 0.181107297539711, "learning_rate": 8.346882641422066e-06, "loss": 0.0151, "step": 6240 }, { "epoch": 0.28359300999948955, "grad_norm": 0.3074702322483063, "learning_rate": 8.317900597092388e-06, "loss": 0.0173, "step": 6250 }, { "epoch": 0.28404675881548874, "grad_norm": 0.22269389033317566, "learning_rate": 8.288933085860611e-06, "loss": 0.0153, "step": 6260 }, { "epoch": 0.2845005076314879, "grad_norm": 0.17028804123401642, "learning_rate": 8.25998035800188e-06, "loss": 0.0157, "step": 6270 }, { "epoch": 0.28495425644748706, "grad_norm": 0.17685209214687347, "learning_rate": 8.231042663663619e-06, "loss": 0.0176, "step": 6280 }, { "epoch": 0.28540800526348625, "grad_norm": 0.24024584889411926, "learning_rate": 8.202120252863359e-06, "loss": 0.0145, "step": 6290 }, { "epoch": 0.28586175407948544, "grad_norm": 0.1644550859928131, "learning_rate": 8.173213375486589e-06, "loss": 0.0173, "step": 6300 }, { "epoch": 0.28631550289548463, "grad_norm": 0.2395004779100418, "learning_rate": 8.144322281284586e-06, "loss": 0.019, "step": 6310 }, { "epoch": 0.2867692517114838, "grad_norm": 0.23114000260829926, "learning_rate": 8.11544721987227e-06, "loss": 0.0171, "step": 6320 }, { "epoch": 0.287223000527483, "grad_norm": 0.2958763539791107, "learning_rate": 8.086588440726034e-06, "loss": 0.0154, "step": 6330 }, { "epoch": 0.2876767493434822, "grad_norm": 0.22971633076667786, "learning_rate": 8.057746193181591e-06, "loss": 0.015, "step": 6340 }, { "epoch": 0.2881304981594814, "grad_norm": 0.37281668186187744, "learning_rate": 8.028920726431832e-06, "loss": 0.0153, "step": 6350 }, { "epoch": 0.2885842469754806, "grad_norm": 0.1991814225912094, "learning_rate": 8.000112289524666e-06, "loss": 0.0146, "step": 6360 }, { "epoch": 0.2890379957914797, "grad_norm": 0.23134256899356842, "learning_rate": 7.971321131360855e-06, "loss": 0.0155, "step": 6370 }, { "epoch": 0.2894917446074789, "grad_norm": 0.18794924020767212, "learning_rate": 7.942547500691884e-06, "loss": 0.0175, "step": 6380 }, { "epoch": 0.2899454934234781, "grad_norm": 0.201078400015831, "learning_rate": 7.913791646117798e-06, "loss": 0.0165, "step": 6390 }, { "epoch": 0.2903992422394773, "grad_norm": 0.18238510191440582, "learning_rate": 7.885053816085067e-06, "loss": 0.0191, "step": 6400 }, { "epoch": 0.29085299105547646, "grad_norm": 0.1990356296300888, "learning_rate": 7.85633425888442e-06, "loss": 0.0151, "step": 6410 }, { "epoch": 0.29130673987147565, "grad_norm": 0.2516949772834778, "learning_rate": 7.827633222648722e-06, "loss": 0.0168, "step": 6420 }, { "epoch": 0.29176048868747484, "grad_norm": 0.2401595413684845, "learning_rate": 7.798950955350812e-06, "loss": 0.0169, "step": 6430 }, { "epoch": 0.29221423750347403, "grad_norm": 0.22385676205158234, "learning_rate": 7.770287704801374e-06, "loss": 0.017, "step": 6440 }, { "epoch": 0.2926679863194732, "grad_norm": 0.2761000990867615, "learning_rate": 7.741643718646783e-06, "loss": 0.0157, "step": 6450 }, { "epoch": 0.2931217351354724, "grad_norm": 0.20440326631069183, "learning_rate": 7.713019244366977e-06, "loss": 0.016, "step": 6460 }, { "epoch": 0.29357548395147154, "grad_norm": 0.222296804189682, "learning_rate": 7.684414529273315e-06, "loss": 0.015, "step": 6470 }, { "epoch": 0.2940292327674707, "grad_norm": 0.17870034277439117, "learning_rate": 7.655829820506442e-06, "loss": 0.0143, "step": 6480 }, { "epoch": 0.2944829815834699, "grad_norm": 0.19074006378650665, "learning_rate": 7.627265365034141e-06, "loss": 0.0123, "step": 6490 }, { "epoch": 0.2949367303994691, "grad_norm": 0.1940397173166275, "learning_rate": 7.59872140964922e-06, "loss": 0.0173, "step": 6500 }, { "epoch": 0.2953904792154683, "grad_norm": 0.2498466670513153, "learning_rate": 7.570198200967363e-06, "loss": 0.0138, "step": 6510 }, { "epoch": 0.2958442280314675, "grad_norm": 0.14221906661987305, "learning_rate": 7.5416959854250076e-06, "loss": 0.0132, "step": 6520 }, { "epoch": 0.29629797684746667, "grad_norm": 0.2832084596157074, "learning_rate": 7.513215009277212e-06, "loss": 0.0169, "step": 6530 }, { "epoch": 0.29675172566346586, "grad_norm": 0.32706505060195923, "learning_rate": 7.484755518595534e-06, "loss": 0.0184, "step": 6540 }, { "epoch": 0.29720547447946505, "grad_norm": 0.23653177917003632, "learning_rate": 7.456317759265893e-06, "loss": 0.0141, "step": 6550 }, { "epoch": 0.29765922329546424, "grad_norm": 0.2779649794101715, "learning_rate": 7.4279019769864605e-06, "loss": 0.0165, "step": 6560 }, { "epoch": 0.29811297211146337, "grad_norm": 0.2597646117210388, "learning_rate": 7.399508417265517e-06, "loss": 0.0175, "step": 6570 }, { "epoch": 0.29856672092746256, "grad_norm": 0.2561059892177582, "learning_rate": 7.3711373254193595e-06, "loss": 0.015, "step": 6580 }, { "epoch": 0.29902046974346175, "grad_norm": 0.20948241651058197, "learning_rate": 7.342788946570159e-06, "loss": 0.0153, "step": 6590 }, { "epoch": 0.29947421855946094, "grad_norm": 0.2807292938232422, "learning_rate": 7.314463525643842e-06, "loss": 0.0153, "step": 6600 }, { "epoch": 0.2999279673754601, "grad_norm": 0.2393791526556015, "learning_rate": 7.286161307367989e-06, "loss": 0.0161, "step": 6610 }, { "epoch": 0.3003817161914593, "grad_norm": 0.23379753530025482, "learning_rate": 7.257882536269716e-06, "loss": 0.0137, "step": 6620 }, { "epoch": 0.3008354650074585, "grad_norm": 0.30089071393013, "learning_rate": 7.2296274566735494e-06, "loss": 0.015, "step": 6630 }, { "epoch": 0.3012892138234577, "grad_norm": 0.21323876082897186, "learning_rate": 7.201396312699334e-06, "loss": 0.0116, "step": 6640 }, { "epoch": 0.3017429626394569, "grad_norm": 0.20455586910247803, "learning_rate": 7.173189348260105e-06, "loss": 0.0141, "step": 6650 }, { "epoch": 0.30219671145545607, "grad_norm": 0.20841535925865173, "learning_rate": 7.145006807060002e-06, "loss": 0.0146, "step": 6660 }, { "epoch": 0.3026504602714552, "grad_norm": 0.2604532539844513, "learning_rate": 7.116848932592136e-06, "loss": 0.0174, "step": 6670 }, { "epoch": 0.3031042090874544, "grad_norm": 0.17063336074352264, "learning_rate": 7.088715968136513e-06, "loss": 0.018, "step": 6680 }, { "epoch": 0.3035579579034536, "grad_norm": 0.307041734457016, "learning_rate": 7.06060815675792e-06, "loss": 0.0148, "step": 6690 }, { "epoch": 0.30401170671945277, "grad_norm": 0.21992157399654388, "learning_rate": 7.032525741303815e-06, "loss": 0.0173, "step": 6700 }, { "epoch": 0.30446545553545196, "grad_norm": 0.184308722615242, "learning_rate": 7.00446896440225e-06, "loss": 0.0161, "step": 6710 }, { "epoch": 0.30491920435145115, "grad_norm": 0.21939603984355927, "learning_rate": 6.976438068459756e-06, "loss": 0.0131, "step": 6720 }, { "epoch": 0.30537295316745033, "grad_norm": 0.22889043390750885, "learning_rate": 6.948433295659258e-06, "loss": 0.0163, "step": 6730 }, { "epoch": 0.3058267019834495, "grad_norm": 0.2898446321487427, "learning_rate": 6.920454887957984e-06, "loss": 0.0183, "step": 6740 }, { "epoch": 0.3062804507994487, "grad_norm": 0.3563949763774872, "learning_rate": 6.892503087085365e-06, "loss": 0.0169, "step": 6750 }, { "epoch": 0.3067341996154479, "grad_norm": 0.192538321018219, "learning_rate": 6.864578134540961e-06, "loss": 0.0158, "step": 6760 }, { "epoch": 0.3071879484314471, "grad_norm": 0.23178447782993317, "learning_rate": 6.83668027159236e-06, "loss": 0.0154, "step": 6770 }, { "epoch": 0.3076416972474462, "grad_norm": 0.2624194920063019, "learning_rate": 6.8088097392731035e-06, "loss": 0.0172, "step": 6780 }, { "epoch": 0.3080954460634454, "grad_norm": 0.37407219409942627, "learning_rate": 6.7809667783805934e-06, "loss": 0.0165, "step": 6790 }, { "epoch": 0.3085491948794446, "grad_norm": 0.17963409423828125, "learning_rate": 6.753151629474028e-06, "loss": 0.0145, "step": 6800 }, { "epoch": 0.3090029436954438, "grad_norm": 0.2914386987686157, "learning_rate": 6.725364532872312e-06, "loss": 0.0157, "step": 6810 }, { "epoch": 0.309456692511443, "grad_norm": 0.2792305648326874, "learning_rate": 6.697605728651977e-06, "loss": 0.0154, "step": 6820 }, { "epoch": 0.30991044132744217, "grad_norm": 0.2489727884531021, "learning_rate": 6.669875456645115e-06, "loss": 0.0163, "step": 6830 }, { "epoch": 0.31036419014344135, "grad_norm": 0.25871676206588745, "learning_rate": 6.642173956437306e-06, "loss": 0.0133, "step": 6840 }, { "epoch": 0.31081793895944054, "grad_norm": 0.2216576337814331, "learning_rate": 6.614501467365539e-06, "loss": 0.0145, "step": 6850 }, { "epoch": 0.31127168777543973, "grad_norm": 0.20063291490077972, "learning_rate": 6.586858228516162e-06, "loss": 0.0142, "step": 6860 }, { "epoch": 0.3117254365914389, "grad_norm": 0.3031257390975952, "learning_rate": 6.559244478722792e-06, "loss": 0.0155, "step": 6870 }, { "epoch": 0.31217918540743805, "grad_norm": 0.2504667043685913, "learning_rate": 6.531660456564282e-06, "loss": 0.0222, "step": 6880 }, { "epoch": 0.31263293422343724, "grad_norm": 0.22241756319999695, "learning_rate": 6.504106400362621e-06, "loss": 0.014, "step": 6890 }, { "epoch": 0.31308668303943643, "grad_norm": 0.32420772314071655, "learning_rate": 6.476582548180912e-06, "loss": 0.0143, "step": 6900 }, { "epoch": 0.3135404318554356, "grad_norm": 0.1600920557975769, "learning_rate": 6.449089137821301e-06, "loss": 0.0182, "step": 6910 }, { "epoch": 0.3139941806714348, "grad_norm": 0.2813020348548889, "learning_rate": 6.421626406822909e-06, "loss": 0.0141, "step": 6920 }, { "epoch": 0.314447929487434, "grad_norm": 0.28015148639678955, "learning_rate": 6.394194592459801e-06, "loss": 0.0188, "step": 6930 }, { "epoch": 0.3149016783034332, "grad_norm": 0.2934596538543701, "learning_rate": 6.366793931738922e-06, "loss": 0.0179, "step": 6940 }, { "epoch": 0.3153554271194324, "grad_norm": 0.1725728064775467, "learning_rate": 6.339424661398058e-06, "loss": 0.0143, "step": 6950 }, { "epoch": 0.31580917593543156, "grad_norm": 0.25477278232574463, "learning_rate": 6.312087017903783e-06, "loss": 0.0189, "step": 6960 }, { "epoch": 0.31626292475143075, "grad_norm": 0.3287123441696167, "learning_rate": 6.284781237449419e-06, "loss": 0.0163, "step": 6970 }, { "epoch": 0.3167166735674299, "grad_norm": 0.22698882222175598, "learning_rate": 6.257507555953002e-06, "loss": 0.0152, "step": 6980 }, { "epoch": 0.3171704223834291, "grad_norm": 0.3387869596481323, "learning_rate": 6.230266209055229e-06, "loss": 0.0126, "step": 6990 }, { "epoch": 0.31762417119942826, "grad_norm": 0.25053390860557556, "learning_rate": 6.20305743211744e-06, "loss": 0.0163, "step": 7000 }, { "epoch": 0.31807792001542745, "grad_norm": 0.2294333428144455, "learning_rate": 6.175881460219565e-06, "loss": 0.0131, "step": 7010 }, { "epoch": 0.31853166883142664, "grad_norm": 0.20624062418937683, "learning_rate": 6.148738528158109e-06, "loss": 0.0136, "step": 7020 }, { "epoch": 0.31898541764742583, "grad_norm": 0.18476051092147827, "learning_rate": 6.1216288704441255e-06, "loss": 0.0132, "step": 7030 }, { "epoch": 0.319439166463425, "grad_norm": 0.18245407938957214, "learning_rate": 6.094552721301164e-06, "loss": 0.0135, "step": 7040 }, { "epoch": 0.3198929152794242, "grad_norm": 0.20179583132266998, "learning_rate": 6.067510314663283e-06, "loss": 0.0122, "step": 7050 }, { "epoch": 0.3203466640954234, "grad_norm": 0.1874779313802719, "learning_rate": 6.0405018841729934e-06, "loss": 0.0134, "step": 7060 }, { "epoch": 0.3208004129114226, "grad_norm": 0.2021348625421524, "learning_rate": 6.013527663179275e-06, "loss": 0.016, "step": 7070 }, { "epoch": 0.3212541617274217, "grad_norm": 0.23553486168384552, "learning_rate": 5.986587884735526e-06, "loss": 0.0189, "step": 7080 }, { "epoch": 0.3217079105434209, "grad_norm": 0.26657748222351074, "learning_rate": 5.9596827815975775e-06, "loss": 0.0168, "step": 7090 }, { "epoch": 0.3221616593594201, "grad_norm": 0.27437448501586914, "learning_rate": 5.9328125862216676e-06, "loss": 0.016, "step": 7100 }, { "epoch": 0.3226154081754193, "grad_norm": 0.29050520062446594, "learning_rate": 5.90597753076243e-06, "loss": 0.0149, "step": 7110 }, { "epoch": 0.32306915699141847, "grad_norm": 0.2895548343658447, "learning_rate": 5.879177847070906e-06, "loss": 0.0148, "step": 7120 }, { "epoch": 0.32352290580741766, "grad_norm": 0.24653907120227814, "learning_rate": 5.8524137666925174e-06, "loss": 0.0133, "step": 7130 }, { "epoch": 0.32397665462341685, "grad_norm": 0.19593603909015656, "learning_rate": 5.825685520865092e-06, "loss": 0.015, "step": 7140 }, { "epoch": 0.32443040343941604, "grad_norm": 0.23352883756160736, "learning_rate": 5.798993340516843e-06, "loss": 0.0147, "step": 7150 }, { "epoch": 0.3248841522554152, "grad_norm": 0.24445559084415436, "learning_rate": 5.772337456264386e-06, "loss": 0.0161, "step": 7160 }, { "epoch": 0.3253379010714144, "grad_norm": 0.2534903585910797, "learning_rate": 5.745718098410737e-06, "loss": 0.0169, "step": 7170 }, { "epoch": 0.32579164988741355, "grad_norm": 0.17404842376708984, "learning_rate": 5.719135496943343e-06, "loss": 0.013, "step": 7180 }, { "epoch": 0.32624539870341274, "grad_norm": 0.20368589460849762, "learning_rate": 5.69258988153207e-06, "loss": 0.015, "step": 7190 }, { "epoch": 0.3266991475194119, "grad_norm": 0.25374406576156616, "learning_rate": 5.666081481527232e-06, "loss": 0.0151, "step": 7200 }, { "epoch": 0.3271528963354111, "grad_norm": 0.2245100438594818, "learning_rate": 5.639610525957604e-06, "loss": 0.0143, "step": 7210 }, { "epoch": 0.3276066451514103, "grad_norm": 0.15795458853244781, "learning_rate": 5.613177243528458e-06, "loss": 0.0151, "step": 7220 }, { "epoch": 0.3280603939674095, "grad_norm": 0.24801495671272278, "learning_rate": 5.586781862619566e-06, "loss": 0.0145, "step": 7230 }, { "epoch": 0.3285141427834087, "grad_norm": 0.2641449570655823, "learning_rate": 5.560424611283231e-06, "loss": 0.015, "step": 7240 }, { "epoch": 0.32896789159940787, "grad_norm": 0.2431277483701706, "learning_rate": 5.53410571724234e-06, "loss": 0.0144, "step": 7250 }, { "epoch": 0.32942164041540706, "grad_norm": 0.26529666781425476, "learning_rate": 5.507825407888362e-06, "loss": 0.016, "step": 7260 }, { "epoch": 0.32987538923140625, "grad_norm": 0.25077733397483826, "learning_rate": 5.481583910279402e-06, "loss": 0.0168, "step": 7270 }, { "epoch": 0.33032913804740544, "grad_norm": 0.2647542655467987, "learning_rate": 5.4553814511382485e-06, "loss": 0.0137, "step": 7280 }, { "epoch": 0.33078288686340457, "grad_norm": 0.29781800508499146, "learning_rate": 5.429218256850393e-06, "loss": 0.0145, "step": 7290 }, { "epoch": 0.33123663567940376, "grad_norm": 0.23706211149692535, "learning_rate": 5.403094553462083e-06, "loss": 0.0164, "step": 7300 }, { "epoch": 0.33169038449540295, "grad_norm": 0.15716779232025146, "learning_rate": 5.377010566678371e-06, "loss": 0.0171, "step": 7310 }, { "epoch": 0.33214413331140213, "grad_norm": 0.1803821474313736, "learning_rate": 5.350966521861178e-06, "loss": 0.013, "step": 7320 }, { "epoch": 0.3325978821274013, "grad_norm": 0.14119935035705566, "learning_rate": 5.324962644027312e-06, "loss": 0.0192, "step": 7330 }, { "epoch": 0.3330516309434005, "grad_norm": 0.14944466948509216, "learning_rate": 5.298999157846555e-06, "loss": 0.0147, "step": 7340 }, { "epoch": 0.3335053797593997, "grad_norm": 0.19122423231601715, "learning_rate": 5.273076287639704e-06, "loss": 0.0158, "step": 7350 }, { "epoch": 0.3339591285753989, "grad_norm": 0.19128480553627014, "learning_rate": 5.247194257376653e-06, "loss": 0.0153, "step": 7360 }, { "epoch": 0.3344128773913981, "grad_norm": 0.2281884253025055, "learning_rate": 5.221353290674429e-06, "loss": 0.0148, "step": 7370 }, { "epoch": 0.33486662620739727, "grad_norm": 0.15246489644050598, "learning_rate": 5.1955536107952885e-06, "loss": 0.0123, "step": 7380 }, { "epoch": 0.3353203750233964, "grad_norm": 0.18526007235050201, "learning_rate": 5.169795440644767e-06, "loss": 0.0143, "step": 7390 }, { "epoch": 0.3357741238393956, "grad_norm": 0.24510568380355835, "learning_rate": 5.144079002769766e-06, "loss": 0.0134, "step": 7400 }, { "epoch": 0.3362278726553948, "grad_norm": 0.20868492126464844, "learning_rate": 5.118404519356621e-06, "loss": 0.0137, "step": 7410 }, { "epoch": 0.33668162147139397, "grad_norm": 0.2769564688205719, "learning_rate": 5.0927722122292e-06, "loss": 0.0143, "step": 7420 }, { "epoch": 0.33713537028739315, "grad_norm": 0.23524247109889984, "learning_rate": 5.067182302846958e-06, "loss": 0.0146, "step": 7430 }, { "epoch": 0.33758911910339234, "grad_norm": 0.2885001301765442, "learning_rate": 5.041635012303048e-06, "loss": 0.0159, "step": 7440 }, { "epoch": 0.33804286791939153, "grad_norm": 0.1964728832244873, "learning_rate": 5.016130561322399e-06, "loss": 0.0121, "step": 7450 }, { "epoch": 0.3384966167353907, "grad_norm": 0.2259715050458908, "learning_rate": 4.990669170259816e-06, "loss": 0.015, "step": 7460 }, { "epoch": 0.3389503655513899, "grad_norm": 0.18058039247989655, "learning_rate": 4.965251059098074e-06, "loss": 0.0127, "step": 7470 }, { "epoch": 0.3394041143673891, "grad_norm": 0.22099602222442627, "learning_rate": 4.93987644744601e-06, "loss": 0.015, "step": 7480 }, { "epoch": 0.33985786318338823, "grad_norm": 0.2702184021472931, "learning_rate": 4.9145455545366335e-06, "loss": 0.0128, "step": 7490 }, { "epoch": 0.3403116119993874, "grad_norm": 0.3326244354248047, "learning_rate": 4.889258599225233e-06, "loss": 0.0176, "step": 7500 }, { "epoch": 0.3407653608153866, "grad_norm": 0.23411507904529572, "learning_rate": 4.864015799987474e-06, "loss": 0.0141, "step": 7510 }, { "epoch": 0.3412191096313858, "grad_norm": 0.2044791877269745, "learning_rate": 4.838817374917534e-06, "loss": 0.0192, "step": 7520 }, { "epoch": 0.341672858447385, "grad_norm": 0.27798885107040405, "learning_rate": 4.8136635417261935e-06, "loss": 0.0152, "step": 7530 }, { "epoch": 0.3421266072633842, "grad_norm": 0.1772766411304474, "learning_rate": 4.788554517738967e-06, "loss": 0.0147, "step": 7540 }, { "epoch": 0.34258035607938336, "grad_norm": 0.24441121518611908, "learning_rate": 4.763490519894223e-06, "loss": 0.013, "step": 7550 }, { "epoch": 0.34303410489538255, "grad_norm": 0.23297441005706787, "learning_rate": 4.738471764741319e-06, "loss": 0.0141, "step": 7560 }, { "epoch": 0.34348785371138174, "grad_norm": 0.1813233494758606, "learning_rate": 4.713498468438709e-06, "loss": 0.0167, "step": 7570 }, { "epoch": 0.34394160252738093, "grad_norm": 0.3871300220489502, "learning_rate": 4.6885708467521015e-06, "loss": 0.0186, "step": 7580 }, { "epoch": 0.34439535134338006, "grad_norm": 0.45125827193260193, "learning_rate": 4.6636891150525765e-06, "loss": 0.0144, "step": 7590 }, { "epoch": 0.34484910015937925, "grad_norm": 0.17681972682476044, "learning_rate": 4.638853488314727e-06, "loss": 0.014, "step": 7600 }, { "epoch": 0.34530284897537844, "grad_norm": 0.2789676785469055, "learning_rate": 4.614064181114817e-06, "loss": 0.0131, "step": 7610 }, { "epoch": 0.34575659779137763, "grad_norm": 0.2384275496006012, "learning_rate": 4.589321407628907e-06, "loss": 0.0125, "step": 7620 }, { "epoch": 0.3462103466073768, "grad_norm": 0.19603055715560913, "learning_rate": 4.5646253816310175e-06, "loss": 0.0143, "step": 7630 }, { "epoch": 0.346664095423376, "grad_norm": 0.24392522871494293, "learning_rate": 4.539976316491272e-06, "loss": 0.0152, "step": 7640 }, { "epoch": 0.3471178442393752, "grad_norm": 0.21089184284210205, "learning_rate": 4.515374425174062e-06, "loss": 0.0133, "step": 7650 }, { "epoch": 0.3475715930553744, "grad_norm": 0.2559266984462738, "learning_rate": 4.49081992023621e-06, "loss": 0.0114, "step": 7660 }, { "epoch": 0.3480253418713736, "grad_norm": 0.22928175330162048, "learning_rate": 4.466313013825119e-06, "loss": 0.013, "step": 7670 }, { "epoch": 0.34847909068737276, "grad_norm": 0.26739931106567383, "learning_rate": 4.4418539176769456e-06, "loss": 0.0152, "step": 7680 }, { "epoch": 0.3489328395033719, "grad_norm": 0.2298872470855713, "learning_rate": 4.417442843114786e-06, "loss": 0.0134, "step": 7690 }, { "epoch": 0.3493865883193711, "grad_norm": 0.20221474766731262, "learning_rate": 4.393080001046818e-06, "loss": 0.0147, "step": 7700 }, { "epoch": 0.34984033713537027, "grad_norm": 0.2766014039516449, "learning_rate": 4.368765601964516e-06, "loss": 0.0171, "step": 7710 }, { "epoch": 0.35029408595136946, "grad_norm": 0.20398274064064026, "learning_rate": 4.3444998559408025e-06, "loss": 0.0127, "step": 7720 }, { "epoch": 0.35074783476736865, "grad_norm": 0.2094474583864212, "learning_rate": 4.320282972628246e-06, "loss": 0.0144, "step": 7730 }, { "epoch": 0.35120158358336784, "grad_norm": 0.25641191005706787, "learning_rate": 4.2961151612572495e-06, "loss": 0.0161, "step": 7740 }, { "epoch": 0.351655332399367, "grad_norm": 0.18166835606098175, "learning_rate": 4.2719966306342386e-06, "loss": 0.016, "step": 7750 }, { "epoch": 0.3521090812153662, "grad_norm": 0.2359856367111206, "learning_rate": 4.247927589139869e-06, "loss": 0.0132, "step": 7760 }, { "epoch": 0.3525628300313654, "grad_norm": 0.1438111662864685, "learning_rate": 4.223908244727211e-06, "loss": 0.0134, "step": 7770 }, { "epoch": 0.3530165788473646, "grad_norm": 0.207376629114151, "learning_rate": 4.199938804919957e-06, "loss": 0.0123, "step": 7780 }, { "epoch": 0.3534703276633637, "grad_norm": 0.3160220980644226, "learning_rate": 4.176019476810631e-06, "loss": 0.0182, "step": 7790 }, { "epoch": 0.3539240764793629, "grad_norm": 0.21872855722904205, "learning_rate": 4.152150467058805e-06, "loss": 0.0168, "step": 7800 }, { "epoch": 0.3543778252953621, "grad_norm": 0.22001872956752777, "learning_rate": 4.128331981889309e-06, "loss": 0.0145, "step": 7810 }, { "epoch": 0.3548315741113613, "grad_norm": 0.19883836805820465, "learning_rate": 4.104564227090437e-06, "loss": 0.0165, "step": 7820 }, { "epoch": 0.3552853229273605, "grad_norm": 0.24806258082389832, "learning_rate": 4.080847408012189e-06, "loss": 0.0147, "step": 7830 }, { "epoch": 0.35573907174335967, "grad_norm": 0.21885566413402557, "learning_rate": 4.057181729564478e-06, "loss": 0.0127, "step": 7840 }, { "epoch": 0.35619282055935886, "grad_norm": 0.22148142755031586, "learning_rate": 4.033567396215387e-06, "loss": 0.0121, "step": 7850 }, { "epoch": 0.35664656937535805, "grad_norm": 0.2564483880996704, "learning_rate": 4.0100046119893654e-06, "loss": 0.0182, "step": 7860 }, { "epoch": 0.35710031819135724, "grad_norm": 0.20372381806373596, "learning_rate": 3.986493580465498e-06, "loss": 0.0129, "step": 7870 }, { "epoch": 0.3575540670073564, "grad_norm": 0.249146968126297, "learning_rate": 3.963034504775727e-06, "loss": 0.0153, "step": 7880 }, { "epoch": 0.3580078158233556, "grad_norm": 0.27544742822647095, "learning_rate": 3.939627587603103e-06, "loss": 0.0187, "step": 7890 }, { "epoch": 0.35846156463935475, "grad_norm": 0.298480361700058, "learning_rate": 3.9162730311800455e-06, "loss": 0.0154, "step": 7900 }, { "epoch": 0.35891531345535393, "grad_norm": 0.1499815136194229, "learning_rate": 3.8929710372865696e-06, "loss": 0.0143, "step": 7910 }, { "epoch": 0.3593690622713531, "grad_norm": 0.34332308173179626, "learning_rate": 3.869721807248571e-06, "loss": 0.0142, "step": 7920 }, { "epoch": 0.3598228110873523, "grad_norm": 0.2051619291305542, "learning_rate": 3.8465255419360635e-06, "loss": 0.0146, "step": 7930 }, { "epoch": 0.3602765599033515, "grad_norm": 0.1794043481349945, "learning_rate": 3.823382441761454e-06, "loss": 0.0124, "step": 7940 }, { "epoch": 0.3607303087193507, "grad_norm": 0.15501756966114044, "learning_rate": 3.8002927066778193e-06, "loss": 0.0163, "step": 7950 }, { "epoch": 0.3611840575353499, "grad_norm": 0.2080763280391693, "learning_rate": 3.7772565361771596e-06, "loss": 0.0171, "step": 7960 }, { "epoch": 0.36163780635134907, "grad_norm": 0.23499339818954468, "learning_rate": 3.75427412928869e-06, "loss": 0.0182, "step": 7970 }, { "epoch": 0.36209155516734826, "grad_norm": 0.2578664720058441, "learning_rate": 3.731345684577109e-06, "loss": 0.0161, "step": 7980 }, { "epoch": 0.36254530398334744, "grad_norm": 0.2592340409755707, "learning_rate": 3.7084714001409016e-06, "loss": 0.0158, "step": 7990 }, { "epoch": 0.3629990527993466, "grad_norm": 0.18188521265983582, "learning_rate": 3.6856514736106063e-06, "loss": 0.0135, "step": 8000 }, { "epoch": 0.36345280161534577, "grad_norm": 0.19300994277000427, "learning_rate": 3.6628861021471185e-06, "loss": 0.0162, "step": 8010 }, { "epoch": 0.36390655043134496, "grad_norm": 0.1781187355518341, "learning_rate": 3.6401754824399837e-06, "loss": 0.0146, "step": 8020 }, { "epoch": 0.36436029924734414, "grad_norm": 0.17654544115066528, "learning_rate": 3.6175198107057107e-06, "loss": 0.0137, "step": 8030 }, { "epoch": 0.36481404806334333, "grad_norm": 0.25945717096328735, "learning_rate": 3.5949192826860513e-06, "loss": 0.0112, "step": 8040 }, { "epoch": 0.3652677968793425, "grad_norm": 0.17957012355327606, "learning_rate": 3.572374093646336e-06, "loss": 0.0142, "step": 8050 }, { "epoch": 0.3657215456953417, "grad_norm": 0.2526027262210846, "learning_rate": 3.5498844383737653e-06, "loss": 0.0142, "step": 8060 }, { "epoch": 0.3661752945113409, "grad_norm": 0.24309176206588745, "learning_rate": 3.5274505111757405e-06, "loss": 0.0129, "step": 8070 }, { "epoch": 0.3666290433273401, "grad_norm": 0.24594396352767944, "learning_rate": 3.5050725058781765e-06, "loss": 0.017, "step": 8080 }, { "epoch": 0.3670827921433393, "grad_norm": 0.23228099942207336, "learning_rate": 3.482750615823838e-06, "loss": 0.0131, "step": 8090 }, { "epoch": 0.3675365409593384, "grad_norm": 0.22823822498321533, "learning_rate": 3.4604850338706554e-06, "loss": 0.0137, "step": 8100 }, { "epoch": 0.3679902897753376, "grad_norm": 0.16873668134212494, "learning_rate": 3.4382759523900678e-06, "loss": 0.0138, "step": 8110 }, { "epoch": 0.3684440385913368, "grad_norm": 0.257123202085495, "learning_rate": 3.4161235632653587e-06, "loss": 0.0141, "step": 8120 }, { "epoch": 0.368897787407336, "grad_norm": 0.207536518573761, "learning_rate": 3.394028057889992e-06, "loss": 0.0137, "step": 8130 }, { "epoch": 0.36935153622333516, "grad_norm": 0.24851061403751373, "learning_rate": 3.3719896271659734e-06, "loss": 0.0144, "step": 8140 }, { "epoch": 0.36980528503933435, "grad_norm": 0.29697203636169434, "learning_rate": 3.3500084615021912e-06, "loss": 0.0142, "step": 8150 }, { "epoch": 0.37025903385533354, "grad_norm": 0.19616173207759857, "learning_rate": 3.3280847508127644e-06, "loss": 0.0123, "step": 8160 }, { "epoch": 0.37071278267133273, "grad_norm": 0.21613579988479614, "learning_rate": 3.306218684515413e-06, "loss": 0.0115, "step": 8170 }, { "epoch": 0.3711665314873319, "grad_norm": 0.23955759406089783, "learning_rate": 3.284410451529816e-06, "loss": 0.0119, "step": 8180 }, { "epoch": 0.3716202803033311, "grad_norm": 0.1896020472049713, "learning_rate": 3.2626602402759865e-06, "loss": 0.012, "step": 8190 }, { "epoch": 0.37207402911933024, "grad_norm": 0.32029375433921814, "learning_rate": 3.240968238672633e-06, "loss": 0.0127, "step": 8200 }, { "epoch": 0.37252777793532943, "grad_norm": 0.19627264142036438, "learning_rate": 3.2193346341355413e-06, "loss": 0.0154, "step": 8210 }, { "epoch": 0.3729815267513286, "grad_norm": 0.21380770206451416, "learning_rate": 3.1977596135759524e-06, "loss": 0.0124, "step": 8220 }, { "epoch": 0.3734352755673278, "grad_norm": 0.30287957191467285, "learning_rate": 3.176243363398961e-06, "loss": 0.0166, "step": 8230 }, { "epoch": 0.373889024383327, "grad_norm": 0.4695558547973633, "learning_rate": 3.1547860695018793e-06, "loss": 0.0163, "step": 8240 }, { "epoch": 0.3743427731993262, "grad_norm": 0.17148882150650024, "learning_rate": 3.13338791727266e-06, "loss": 0.0155, "step": 8250 }, { "epoch": 0.3747965220153254, "grad_norm": 0.22904188930988312, "learning_rate": 3.1120490915882694e-06, "loss": 0.0164, "step": 8260 }, { "epoch": 0.37525027083132456, "grad_norm": 0.1836533397436142, "learning_rate": 3.090769776813106e-06, "loss": 0.0123, "step": 8270 }, { "epoch": 0.37570401964732375, "grad_norm": 0.1699976921081543, "learning_rate": 3.0695501567973983e-06, "loss": 0.0115, "step": 8280 }, { "epoch": 0.37615776846332294, "grad_norm": 0.2852734923362732, "learning_rate": 3.0483904148756284e-06, "loss": 0.0165, "step": 8290 }, { "epoch": 0.37661151727932207, "grad_norm": 0.24860599637031555, "learning_rate": 3.0272907338649337e-06, "loss": 0.0172, "step": 8300 }, { "epoch": 0.37706526609532126, "grad_norm": 0.2308913916349411, "learning_rate": 3.006251296063536e-06, "loss": 0.0134, "step": 8310 }, { "epoch": 0.37751901491132045, "grad_norm": 0.17909853160381317, "learning_rate": 2.985272283249161e-06, "loss": 0.0147, "step": 8320 }, { "epoch": 0.37797276372731964, "grad_norm": 0.2275843322277069, "learning_rate": 2.9643538766774793e-06, "loss": 0.0134, "step": 8330 }, { "epoch": 0.3784265125433188, "grad_norm": 0.1711130440235138, "learning_rate": 2.943496257080527e-06, "loss": 0.0121, "step": 8340 }, { "epoch": 0.378880261359318, "grad_norm": 0.17838282883167267, "learning_rate": 2.9226996046651435e-06, "loss": 0.0161, "step": 8350 }, { "epoch": 0.3793340101753172, "grad_norm": 0.21856985986232758, "learning_rate": 2.901964099111435e-06, "loss": 0.0138, "step": 8360 }, { "epoch": 0.3797877589913164, "grad_norm": 0.2315887212753296, "learning_rate": 2.881289919571193e-06, "loss": 0.0157, "step": 8370 }, { "epoch": 0.3802415078073156, "grad_norm": 0.21517451107501984, "learning_rate": 2.860677244666373e-06, "loss": 0.0127, "step": 8380 }, { "epoch": 0.38069525662331477, "grad_norm": 0.1922135204076767, "learning_rate": 2.840126252487532e-06, "loss": 0.0165, "step": 8390 }, { "epoch": 0.38114900543931396, "grad_norm": 0.2624659538269043, "learning_rate": 2.8196371205922955e-06, "loss": 0.0131, "step": 8400 }, { "epoch": 0.3816027542553131, "grad_norm": 0.16270387172698975, "learning_rate": 2.799210026003831e-06, "loss": 0.0133, "step": 8410 }, { "epoch": 0.3820565030713123, "grad_norm": 0.279345840215683, "learning_rate": 2.7788451452093067e-06, "loss": 0.0173, "step": 8420 }, { "epoch": 0.38251025188731147, "grad_norm": 0.2302316129207611, "learning_rate": 2.75854265415838e-06, "loss": 0.0121, "step": 8430 }, { "epoch": 0.38296400070331066, "grad_norm": 0.2500201463699341, "learning_rate": 2.738302728261665e-06, "loss": 0.0165, "step": 8440 }, { "epoch": 0.38341774951930985, "grad_norm": 0.16474679112434387, "learning_rate": 2.7181255423892192e-06, "loss": 0.0136, "step": 8450 }, { "epoch": 0.38387149833530904, "grad_norm": 0.27152135968208313, "learning_rate": 2.6980112708690374e-06, "loss": 0.0144, "step": 8460 }, { "epoch": 0.3843252471513082, "grad_norm": 0.2450963407754898, "learning_rate": 2.677960087485547e-06, "loss": 0.0132, "step": 8470 }, { "epoch": 0.3847789959673074, "grad_norm": 0.21445010602474213, "learning_rate": 2.657972165478103e-06, "loss": 0.0124, "step": 8480 }, { "epoch": 0.3852327447833066, "grad_norm": 0.2566172480583191, "learning_rate": 2.638047677539487e-06, "loss": 0.0144, "step": 8490 }, { "epoch": 0.3856864935993058, "grad_norm": 0.16092081367969513, "learning_rate": 2.618186795814418e-06, "loss": 0.0111, "step": 8500 }, { "epoch": 0.3861402424153049, "grad_norm": 0.19989892840385437, "learning_rate": 2.598389691898072e-06, "loss": 0.0169, "step": 8510 }, { "epoch": 0.3865939912313041, "grad_norm": 0.23404096066951752, "learning_rate": 2.578656536834586e-06, "loss": 0.0174, "step": 8520 }, { "epoch": 0.3870477400473033, "grad_norm": 0.3670184910297394, "learning_rate": 2.5589875011156008e-06, "loss": 0.0178, "step": 8530 }, { "epoch": 0.3875014888633025, "grad_norm": 0.20950834453105927, "learning_rate": 2.539382754678764e-06, "loss": 0.0154, "step": 8540 }, { "epoch": 0.3879552376793017, "grad_norm": 0.20955273509025574, "learning_rate": 2.519842466906276e-06, "loss": 0.011, "step": 8550 }, { "epoch": 0.38840898649530087, "grad_norm": 0.26075512170791626, "learning_rate": 2.5003668066234233e-06, "loss": 0.0122, "step": 8560 }, { "epoch": 0.38886273531130006, "grad_norm": 0.14794066548347473, "learning_rate": 2.480955942097121e-06, "loss": 0.0144, "step": 8570 }, { "epoch": 0.38931648412729924, "grad_norm": 0.1937081217765808, "learning_rate": 2.4616100410344634e-06, "loss": 0.014, "step": 8580 }, { "epoch": 0.38977023294329843, "grad_norm": 0.22162729501724243, "learning_rate": 2.442329270581262e-06, "loss": 0.0176, "step": 8590 }, { "epoch": 0.3902239817592976, "grad_norm": 0.18681414425373077, "learning_rate": 2.4231137973206097e-06, "loss": 0.018, "step": 8600 }, { "epoch": 0.39067773057529676, "grad_norm": 0.25027239322662354, "learning_rate": 2.4039637872714417e-06, "loss": 0.0121, "step": 8610 }, { "epoch": 0.39113147939129594, "grad_norm": 0.2140311449766159, "learning_rate": 2.3848794058871073e-06, "loss": 0.0124, "step": 8620 }, { "epoch": 0.39158522820729513, "grad_norm": 0.1816917210817337, "learning_rate": 2.3658608180539243e-06, "loss": 0.0153, "step": 8630 }, { "epoch": 0.3920389770232943, "grad_norm": 0.26411861181259155, "learning_rate": 2.3469081880897694e-06, "loss": 0.0147, "step": 8640 }, { "epoch": 0.3924927258392935, "grad_norm": 0.18821869790554047, "learning_rate": 2.328021679742648e-06, "loss": 0.0113, "step": 8650 }, { "epoch": 0.3929464746552927, "grad_norm": 0.1908893585205078, "learning_rate": 2.309201456189286e-06, "loss": 0.0137, "step": 8660 }, { "epoch": 0.3934002234712919, "grad_norm": 0.2324998527765274, "learning_rate": 2.290447680033725e-06, "loss": 0.0143, "step": 8670 }, { "epoch": 0.3938539722872911, "grad_norm": 0.18674464523792267, "learning_rate": 2.2717605133059007e-06, "loss": 0.0153, "step": 8680 }, { "epoch": 0.39430772110329027, "grad_norm": 0.20163287222385406, "learning_rate": 2.253140117460255e-06, "loss": 0.013, "step": 8690 }, { "epoch": 0.39476146991928945, "grad_norm": 0.1990893930196762, "learning_rate": 2.2345866533743453e-06, "loss": 0.0133, "step": 8700 }, { "epoch": 0.3952152187352886, "grad_norm": 0.18103700876235962, "learning_rate": 2.2161002813474397e-06, "loss": 0.0121, "step": 8710 }, { "epoch": 0.3956689675512878, "grad_norm": 0.21486005187034607, "learning_rate": 2.197681161099149e-06, "loss": 0.0099, "step": 8720 }, { "epoch": 0.39612271636728696, "grad_norm": 0.2406061887741089, "learning_rate": 2.179329451768031e-06, "loss": 0.013, "step": 8730 }, { "epoch": 0.39657646518328615, "grad_norm": 0.2234313189983368, "learning_rate": 2.161045311910227e-06, "loss": 0.0155, "step": 8740 }, { "epoch": 0.39703021399928534, "grad_norm": 0.2159639447927475, "learning_rate": 2.1428288994980816e-06, "loss": 0.0138, "step": 8750 }, { "epoch": 0.39748396281528453, "grad_norm": 0.2941778302192688, "learning_rate": 2.124680371918796e-06, "loss": 0.0157, "step": 8760 }, { "epoch": 0.3979377116312837, "grad_norm": 0.19672468304634094, "learning_rate": 2.106599885973044e-06, "loss": 0.015, "step": 8770 }, { "epoch": 0.3983914604472829, "grad_norm": 0.24345293641090393, "learning_rate": 2.088587597873637e-06, "loss": 0.0126, "step": 8780 }, { "epoch": 0.3988452092632821, "grad_norm": 0.2621181905269623, "learning_rate": 2.070643663244163e-06, "loss": 0.0149, "step": 8790 }, { "epoch": 0.3992989580792813, "grad_norm": 0.19737562537193298, "learning_rate": 2.052768237117644e-06, "loss": 0.0162, "step": 8800 }, { "epoch": 0.3997527068952804, "grad_norm": 0.26204782724380493, "learning_rate": 2.034961473935203e-06, "loss": 0.0114, "step": 8810 }, { "epoch": 0.4002064557112796, "grad_norm": 0.17524027824401855, "learning_rate": 2.0172235275447284e-06, "loss": 0.012, "step": 8820 }, { "epoch": 0.4006602045272788, "grad_norm": 0.2259146273136139, "learning_rate": 1.9995545511995316e-06, "loss": 0.0132, "step": 8830 }, { "epoch": 0.401113953343278, "grad_norm": 0.19948813319206238, "learning_rate": 1.9819546975570382e-06, "loss": 0.0118, "step": 8840 }, { "epoch": 0.4015677021592772, "grad_norm": 0.2756507992744446, "learning_rate": 1.9644241186774593e-06, "loss": 0.0138, "step": 8850 }, { "epoch": 0.40202145097527636, "grad_norm": 0.14642247557640076, "learning_rate": 1.9469629660224907e-06, "loss": 0.014, "step": 8860 }, { "epoch": 0.40247519979127555, "grad_norm": 0.23485340178012848, "learning_rate": 1.9295713904539892e-06, "loss": 0.0101, "step": 8870 }, { "epoch": 0.40292894860727474, "grad_norm": 0.15757770836353302, "learning_rate": 1.912249542232675e-06, "loss": 0.0149, "step": 8880 }, { "epoch": 0.40338269742327393, "grad_norm": 0.21072518825531006, "learning_rate": 1.8949975710168357e-06, "loss": 0.0144, "step": 8890 }, { "epoch": 0.4038364462392731, "grad_norm": 0.20553289353847504, "learning_rate": 1.8778156258610292e-06, "loss": 0.0166, "step": 8900 }, { "epoch": 0.40429019505527225, "grad_norm": 0.2357052117586136, "learning_rate": 1.8607038552148039e-06, "loss": 0.0135, "step": 8910 }, { "epoch": 0.40474394387127144, "grad_norm": 0.18113180994987488, "learning_rate": 1.8436624069214071e-06, "loss": 0.0107, "step": 8920 }, { "epoch": 0.4051976926872706, "grad_norm": 0.18645435571670532, "learning_rate": 1.8266914282165116e-06, "loss": 0.0129, "step": 8930 }, { "epoch": 0.4056514415032698, "grad_norm": 0.15311095118522644, "learning_rate": 1.80979106572694e-06, "loss": 0.012, "step": 8940 }, { "epoch": 0.406105190319269, "grad_norm": 0.23073312640190125, "learning_rate": 1.792961465469404e-06, "loss": 0.0137, "step": 8950 }, { "epoch": 0.4065589391352682, "grad_norm": 0.30698448419570923, "learning_rate": 1.7762027728492405e-06, "loss": 0.0169, "step": 8960 }, { "epoch": 0.4070126879512674, "grad_norm": 0.234145849943161, "learning_rate": 1.759515132659153e-06, "loss": 0.0144, "step": 8970 }, { "epoch": 0.40746643676726657, "grad_norm": 0.22161205112934113, "learning_rate": 1.742898689077961e-06, "loss": 0.014, "step": 8980 }, { "epoch": 0.40792018558326576, "grad_norm": 0.12403864413499832, "learning_rate": 1.726353585669356e-06, "loss": 0.0119, "step": 8990 }, { "epoch": 0.40837393439926495, "grad_norm": 0.22862762212753296, "learning_rate": 1.7098799653806663e-06, "loss": 0.0127, "step": 9000 }, { "epoch": 0.40882768321526414, "grad_norm": 0.23607058823108673, "learning_rate": 1.6934779705416082e-06, "loss": 0.0124, "step": 9010 }, { "epoch": 0.40928143203126327, "grad_norm": 0.23919253051280975, "learning_rate": 1.6771477428630656e-06, "loss": 0.0146, "step": 9020 }, { "epoch": 0.40973518084726246, "grad_norm": 0.27512481808662415, "learning_rate": 1.6608894234358708e-06, "loss": 0.0136, "step": 9030 }, { "epoch": 0.41018892966326165, "grad_norm": 0.20255787670612335, "learning_rate": 1.6447031527295744e-06, "loss": 0.0157, "step": 9040 }, { "epoch": 0.41064267847926084, "grad_norm": 0.3193971812725067, "learning_rate": 1.628589070591232e-06, "loss": 0.0151, "step": 9050 }, { "epoch": 0.41109642729526, "grad_norm": 0.3560577929019928, "learning_rate": 1.6125473162442107e-06, "loss": 0.0132, "step": 9060 }, { "epoch": 0.4115501761112592, "grad_norm": 0.28087684512138367, "learning_rate": 1.5965780282869693e-06, "loss": 0.0129, "step": 9070 }, { "epoch": 0.4120039249272584, "grad_norm": 0.13372847437858582, "learning_rate": 1.5806813446918657e-06, "loss": 0.0106, "step": 9080 }, { "epoch": 0.4124576737432576, "grad_norm": 0.18017086386680603, "learning_rate": 1.56485740280397e-06, "loss": 0.0136, "step": 9090 }, { "epoch": 0.4129114225592568, "grad_norm": 0.19132131338119507, "learning_rate": 1.5491063393398742e-06, "loss": 0.0154, "step": 9100 }, { "epoch": 0.41336517137525597, "grad_norm": 0.26309046149253845, "learning_rate": 1.5334282903865116e-06, "loss": 0.0143, "step": 9110 }, { "epoch": 0.4138189201912551, "grad_norm": 0.16610760986804962, "learning_rate": 1.5178233913999784e-06, "loss": 0.0129, "step": 9120 }, { "epoch": 0.4142726690072543, "grad_norm": 0.22090893983840942, "learning_rate": 1.5022917772043633e-06, "loss": 0.0125, "step": 9130 }, { "epoch": 0.4147264178232535, "grad_norm": 0.18332765996456146, "learning_rate": 1.4868335819905922e-06, "loss": 0.0125, "step": 9140 }, { "epoch": 0.41518016663925267, "grad_norm": 0.1964850276708603, "learning_rate": 1.4714489393152586e-06, "loss": 0.0137, "step": 9150 }, { "epoch": 0.41563391545525186, "grad_norm": 0.20184849202632904, "learning_rate": 1.4561379820994692e-06, "loss": 0.0121, "step": 9160 }, { "epoch": 0.41608766427125105, "grad_norm": 0.11549197137355804, "learning_rate": 1.4409008426277028e-06, "loss": 0.0129, "step": 9170 }, { "epoch": 0.41654141308725023, "grad_norm": 0.23759661614894867, "learning_rate": 1.4257376525466594e-06, "loss": 0.0151, "step": 9180 }, { "epoch": 0.4169951619032494, "grad_norm": 0.15113626420497894, "learning_rate": 1.4106485428641292e-06, "loss": 0.0114, "step": 9190 }, { "epoch": 0.4174489107192486, "grad_norm": 0.18651457130908966, "learning_rate": 1.3956336439478612e-06, "loss": 0.0127, "step": 9200 }, { "epoch": 0.4179026595352478, "grad_norm": 0.19801680743694305, "learning_rate": 1.3806930855244315e-06, "loss": 0.013, "step": 9210 }, { "epoch": 0.41835640835124693, "grad_norm": 0.2270527333021164, "learning_rate": 1.3658269966781223e-06, "loss": 0.0133, "step": 9220 }, { "epoch": 0.4188101571672461, "grad_norm": 0.18981532752513885, "learning_rate": 1.3510355058498114e-06, "loss": 0.011, "step": 9230 }, { "epoch": 0.4192639059832453, "grad_norm": 0.18792828917503357, "learning_rate": 1.3363187408358612e-06, "loss": 0.0129, "step": 9240 }, { "epoch": 0.4197176547992445, "grad_norm": 0.22293440997600555, "learning_rate": 1.3216768287870185e-06, "loss": 0.0127, "step": 9250 }, { "epoch": 0.4201714036152437, "grad_norm": 0.2767641544342041, "learning_rate": 1.3071098962073004e-06, "loss": 0.0108, "step": 9260 }, { "epoch": 0.4206251524312429, "grad_norm": 0.16410525143146515, "learning_rate": 1.292618068952921e-06, "loss": 0.0135, "step": 9270 }, { "epoch": 0.42107890124724207, "grad_norm": 0.2391669601202011, "learning_rate": 1.2782014722311897e-06, "loss": 0.0118, "step": 9280 }, { "epoch": 0.42153265006324125, "grad_norm": 0.1578000783920288, "learning_rate": 1.2638602305994364e-06, "loss": 0.014, "step": 9290 }, { "epoch": 0.42198639887924044, "grad_norm": 0.1698511242866516, "learning_rate": 1.2495944679639383e-06, "loss": 0.0142, "step": 9300 }, { "epoch": 0.42244014769523963, "grad_norm": 0.17831853032112122, "learning_rate": 1.2354043075788391e-06, "loss": 0.0136, "step": 9310 }, { "epoch": 0.42289389651123876, "grad_norm": 0.2128513604402542, "learning_rate": 1.2212898720450915e-06, "loss": 0.0151, "step": 9320 }, { "epoch": 0.42334764532723795, "grad_norm": 0.183706134557724, "learning_rate": 1.2072512833093964e-06, "loss": 0.0137, "step": 9330 }, { "epoch": 0.42380139414323714, "grad_norm": 0.2649199962615967, "learning_rate": 1.1932886626631512e-06, "loss": 0.0163, "step": 9340 }, { "epoch": 0.42425514295923633, "grad_norm": 0.26215529441833496, "learning_rate": 1.179402130741396e-06, "loss": 0.0112, "step": 9350 }, { "epoch": 0.4247088917752355, "grad_norm": 0.2475711703300476, "learning_rate": 1.165591807521781e-06, "loss": 0.0159, "step": 9360 }, { "epoch": 0.4251626405912347, "grad_norm": 0.21362560987472534, "learning_rate": 1.1518578123235191e-06, "loss": 0.0113, "step": 9370 }, { "epoch": 0.4256163894072339, "grad_norm": 0.272306352853775, "learning_rate": 1.1382002638063584e-06, "loss": 0.0131, "step": 9380 }, { "epoch": 0.4260701382232331, "grad_norm": 0.23435984551906586, "learning_rate": 1.1246192799695666e-06, "loss": 0.0133, "step": 9390 }, { "epoch": 0.4265238870392323, "grad_norm": 0.2903789281845093, "learning_rate": 1.1111149781508968e-06, "loss": 0.014, "step": 9400 }, { "epoch": 0.42697763585523146, "grad_norm": 0.22060996294021606, "learning_rate": 1.0976874750255828e-06, "loss": 0.0126, "step": 9410 }, { "epoch": 0.4274313846712306, "grad_norm": 0.19333960115909576, "learning_rate": 1.0843368866053271e-06, "loss": 0.0132, "step": 9420 }, { "epoch": 0.4278851334872298, "grad_norm": 0.2444356083869934, "learning_rate": 1.0710633282372996e-06, "loss": 0.014, "step": 9430 }, { "epoch": 0.428338882303229, "grad_norm": 0.19958622753620148, "learning_rate": 1.0578669146031484e-06, "loss": 0.0141, "step": 9440 }, { "epoch": 0.42879263111922816, "grad_norm": 0.2752648890018463, "learning_rate": 1.0447477597179945e-06, "loss": 0.0146, "step": 9450 }, { "epoch": 0.42924637993522735, "grad_norm": 0.19226497411727905, "learning_rate": 1.0317059769294557e-06, "loss": 0.0132, "step": 9460 }, { "epoch": 0.42970012875122654, "grad_norm": 0.2440602332353592, "learning_rate": 1.0187416789166672e-06, "loss": 0.0128, "step": 9470 }, { "epoch": 0.43015387756722573, "grad_norm": 0.18320639431476593, "learning_rate": 1.0058549776893068e-06, "loss": 0.0163, "step": 9480 }, { "epoch": 0.4306076263832249, "grad_norm": 0.26742392778396606, "learning_rate": 9.930459845866313e-07, "loss": 0.0128, "step": 9490 }, { "epoch": 0.4310613751992241, "grad_norm": 0.22233040630817413, "learning_rate": 9.803148102765026e-07, "loss": 0.0124, "step": 9500 }, { "epoch": 0.4315151240152233, "grad_norm": 0.2635855972766876, "learning_rate": 9.676615647544452e-07, "loss": 0.0119, "step": 9510 }, { "epoch": 0.4319688728312225, "grad_norm": 0.27369099855422974, "learning_rate": 9.550863573426838e-07, "loss": 0.0141, "step": 9520 }, { "epoch": 0.4324226216472216, "grad_norm": 0.2566049098968506, "learning_rate": 9.425892966892136e-07, "loss": 0.0153, "step": 9530 }, { "epoch": 0.4328763704632208, "grad_norm": 0.25569114089012146, "learning_rate": 9.301704907668474e-07, "loss": 0.0137, "step": 9540 }, { "epoch": 0.43333011927922, "grad_norm": 0.1821911633014679, "learning_rate": 9.178300468722901e-07, "loss": 0.0132, "step": 9550 }, { "epoch": 0.4337838680952192, "grad_norm": 0.20974910259246826, "learning_rate": 9.055680716252068e-07, "loss": 0.0129, "step": 9560 }, { "epoch": 0.43423761691121837, "grad_norm": 0.18692532181739807, "learning_rate": 8.933846709673078e-07, "loss": 0.0126, "step": 9570 }, { "epoch": 0.43469136572721756, "grad_norm": 0.20537251234054565, "learning_rate": 8.812799501614311e-07, "loss": 0.0117, "step": 9580 }, { "epoch": 0.43514511454321675, "grad_norm": 0.14430077373981476, "learning_rate": 8.692540137906314e-07, "loss": 0.0126, "step": 9590 }, { "epoch": 0.43559886335921594, "grad_norm": 0.21880139410495758, "learning_rate": 8.573069657572752e-07, "loss": 0.0131, "step": 9600 }, { "epoch": 0.4360526121752151, "grad_norm": 0.17590667307376862, "learning_rate": 8.454389092821458e-07, "loss": 0.0134, "step": 9610 }, { "epoch": 0.4365063609912143, "grad_norm": 0.21016256511211395, "learning_rate": 8.336499469035509e-07, "loss": 0.0152, "step": 9620 }, { "epoch": 0.43696010980721345, "grad_norm": 0.18677109479904175, "learning_rate": 8.219401804764382e-07, "loss": 0.014, "step": 9630 }, { "epoch": 0.43741385862321264, "grad_norm": 0.2555142641067505, "learning_rate": 8.10309711171512e-07, "loss": 0.0147, "step": 9640 }, { "epoch": 0.4378676074392118, "grad_norm": 0.2963770031929016, "learning_rate": 7.987586394743608e-07, "loss": 0.0134, "step": 9650 }, { "epoch": 0.438321356255211, "grad_norm": 0.25530311465263367, "learning_rate": 7.872870651845888e-07, "loss": 0.0149, "step": 9660 }, { "epoch": 0.4387751050712102, "grad_norm": 0.30015891790390015, "learning_rate": 7.758950874149541e-07, "loss": 0.0155, "step": 9670 }, { "epoch": 0.4392288538872094, "grad_norm": 0.24320900440216064, "learning_rate": 7.645828045905157e-07, "loss": 0.0149, "step": 9680 }, { "epoch": 0.4396826027032086, "grad_norm": 0.20217321813106537, "learning_rate": 7.533503144477738e-07, "loss": 0.0128, "step": 9690 }, { "epoch": 0.44013635151920777, "grad_norm": 0.27635639905929565, "learning_rate": 7.421977140338376e-07, "loss": 0.015, "step": 9700 }, { "epoch": 0.44059010033520696, "grad_norm": 0.2953373193740845, "learning_rate": 7.311250997055752e-07, "loss": 0.0143, "step": 9710 }, { "epoch": 0.44104384915120615, "grad_norm": 0.17709863185882568, "learning_rate": 7.201325671287862e-07, "loss": 0.0145, "step": 9720 }, { "epoch": 0.4414975979672053, "grad_norm": 0.1878160983324051, "learning_rate": 7.092202112773817e-07, "loss": 0.0136, "step": 9730 }, { "epoch": 0.44195134678320447, "grad_norm": 0.12095965445041656, "learning_rate": 6.983881264325521e-07, "loss": 0.0101, "step": 9740 }, { "epoch": 0.44240509559920366, "grad_norm": 0.15436555445194244, "learning_rate": 6.876364061819574e-07, "loss": 0.0135, "step": 9750 }, { "epoch": 0.44285884441520285, "grad_norm": 0.23995015025138855, "learning_rate": 6.769651434189195e-07, "loss": 0.0114, "step": 9760 }, { "epoch": 0.44331259323120203, "grad_norm": 0.24015134572982788, "learning_rate": 6.663744303416231e-07, "loss": 0.0121, "step": 9770 }, { "epoch": 0.4437663420472012, "grad_norm": 0.16917678713798523, "learning_rate": 6.558643584523117e-07, "loss": 0.0125, "step": 9780 }, { "epoch": 0.4442200908632004, "grad_norm": 0.21945761144161224, "learning_rate": 6.454350185564994e-07, "loss": 0.0117, "step": 9790 }, { "epoch": 0.4446738396791996, "grad_norm": 0.1128990575671196, "learning_rate": 6.350865007621887e-07, "loss": 0.0136, "step": 9800 }, { "epoch": 0.4451275884951988, "grad_norm": 0.18295995891094208, "learning_rate": 6.248188944790933e-07, "loss": 0.01, "step": 9810 }, { "epoch": 0.445581337311198, "grad_norm": 0.10374835878610611, "learning_rate": 6.146322884178591e-07, "loss": 0.0111, "step": 9820 }, { "epoch": 0.4460350861271971, "grad_norm": 0.25979602336883545, "learning_rate": 6.045267705893043e-07, "loss": 0.0128, "step": 9830 }, { "epoch": 0.4464888349431963, "grad_norm": 0.2592591941356659, "learning_rate": 5.945024283036549e-07, "loss": 0.0119, "step": 9840 }, { "epoch": 0.4469425837591955, "grad_norm": 0.1858862340450287, "learning_rate": 5.845593481697931e-07, "loss": 0.0122, "step": 9850 }, { "epoch": 0.4473963325751947, "grad_norm": 0.2113071084022522, "learning_rate": 5.746976160945051e-07, "loss": 0.0125, "step": 9860 }, { "epoch": 0.44785008139119387, "grad_norm": 0.3666979670524597, "learning_rate": 5.649173172817457e-07, "loss": 0.0151, "step": 9870 }, { "epoch": 0.44830383020719305, "grad_norm": 0.2563125491142273, "learning_rate": 5.55218536231894e-07, "loss": 0.0163, "step": 9880 }, { "epoch": 0.44875757902319224, "grad_norm": 0.1554328203201294, "learning_rate": 5.456013567410312e-07, "loss": 0.0126, "step": 9890 }, { "epoch": 0.44921132783919143, "grad_norm": 0.2281852513551712, "learning_rate": 5.360658619002068e-07, "loss": 0.0155, "step": 9900 }, { "epoch": 0.4496650766551906, "grad_norm": 0.17750845849514008, "learning_rate": 5.266121340947327e-07, "loss": 0.0119, "step": 9910 }, { "epoch": 0.4501188254711898, "grad_norm": 0.1800116002559662, "learning_rate": 5.172402550034639e-07, "loss": 0.0149, "step": 9920 }, { "epoch": 0.45057257428718894, "grad_norm": 0.11780201643705368, "learning_rate": 5.079503055980939e-07, "loss": 0.0115, "step": 9930 }, { "epoch": 0.45102632310318813, "grad_norm": 0.17817994952201843, "learning_rate": 4.987423661424517e-07, "loss": 0.014, "step": 9940 }, { "epoch": 0.4514800719191873, "grad_norm": 0.1950632631778717, "learning_rate": 4.896165161918176e-07, "loss": 0.0132, "step": 9950 }, { "epoch": 0.4519338207351865, "grad_norm": 0.28178641200065613, "learning_rate": 4.805728345922267e-07, "loss": 0.0121, "step": 9960 }, { "epoch": 0.4523875695511857, "grad_norm": 0.24776586890220642, "learning_rate": 4.716113994797944e-07, "loss": 0.0175, "step": 9970 }, { "epoch": 0.4528413183671849, "grad_norm": 0.18266946077346802, "learning_rate": 4.627322882800345e-07, "loss": 0.0153, "step": 9980 }, { "epoch": 0.4532950671831841, "grad_norm": 0.2108931541442871, "learning_rate": 4.5393557770719744e-07, "loss": 0.0139, "step": 9990 }, { "epoch": 0.45374881599918326, "grad_norm": 0.3489595353603363, "learning_rate": 4.4522134376359995e-07, "loss": 0.0133, "step": 10000 }, { "epoch": 0.45420256481518245, "grad_norm": 0.2508578598499298, "learning_rate": 4.3658966173897866e-07, "loss": 0.013, "step": 10010 }, { "epoch": 0.45465631363118164, "grad_norm": 0.18659383058547974, "learning_rate": 4.2804060620982747e-07, "loss": 0.0111, "step": 10020 }, { "epoch": 0.4551100624471808, "grad_norm": 0.22624759376049042, "learning_rate": 4.1957425103876235e-07, "loss": 0.0141, "step": 10030 }, { "epoch": 0.45556381126317996, "grad_norm": 0.20770379900932312, "learning_rate": 4.111906693738799e-07, "loss": 0.0104, "step": 10040 }, { "epoch": 0.45601756007917915, "grad_norm": 0.24289782345294952, "learning_rate": 4.02889933648124e-07, "loss": 0.0131, "step": 10050 }, { "epoch": 0.45647130889517834, "grad_norm": 0.158809095621109, "learning_rate": 3.946721155786615e-07, "loss": 0.0096, "step": 10060 }, { "epoch": 0.45692505771117753, "grad_norm": 0.21078507602214813, "learning_rate": 3.865372861662664e-07, "loss": 0.0128, "step": 10070 }, { "epoch": 0.4573788065271767, "grad_norm": 0.2302614003419876, "learning_rate": 3.784855156946965e-07, "loss": 0.0125, "step": 10080 }, { "epoch": 0.4578325553431759, "grad_norm": 0.2259799987077713, "learning_rate": 3.705168737300968e-07, "loss": 0.0173, "step": 10090 }, { "epoch": 0.4582863041591751, "grad_norm": 0.19881856441497803, "learning_rate": 3.626314291203914e-07, "loss": 0.0142, "step": 10100 }, { "epoch": 0.4587400529751743, "grad_norm": 0.4168280363082886, "learning_rate": 3.548292499946937e-07, "loss": 0.0141, "step": 10110 }, { "epoch": 0.4591938017911735, "grad_norm": 0.22501815855503082, "learning_rate": 3.4711040376271264e-07, "loss": 0.0128, "step": 10120 }, { "epoch": 0.45964755060717266, "grad_norm": 0.22636446356773376, "learning_rate": 3.394749571141731e-07, "loss": 0.0137, "step": 10130 }, { "epoch": 0.4601012994231718, "grad_norm": 0.30836841464042664, "learning_rate": 3.319229760182441e-07, "loss": 0.0148, "step": 10140 }, { "epoch": 0.460555048239171, "grad_norm": 0.29831039905548096, "learning_rate": 3.244545257229559e-07, "loss": 0.017, "step": 10150 }, { "epoch": 0.46100879705517017, "grad_norm": 0.24868373572826385, "learning_rate": 3.170696707546539e-07, "loss": 0.0118, "step": 10160 }, { "epoch": 0.46146254587116936, "grad_norm": 0.265507310628891, "learning_rate": 3.0976847491742347e-07, "loss": 0.0144, "step": 10170 }, { "epoch": 0.46191629468716855, "grad_norm": 0.2085072547197342, "learning_rate": 3.0255100129255364e-07, "loss": 0.0126, "step": 10180 }, { "epoch": 0.46237004350316774, "grad_norm": 0.2751619815826416, "learning_rate": 2.9541731223797997e-07, "loss": 0.0138, "step": 10190 }, { "epoch": 0.4628237923191669, "grad_norm": 0.22353744506835938, "learning_rate": 2.883674693877558e-07, "loss": 0.0126, "step": 10200 }, { "epoch": 0.4632775411351661, "grad_norm": 0.14840485155582428, "learning_rate": 2.8140153365151304e-07, "loss": 0.015, "step": 10210 }, { "epoch": 0.4637312899511653, "grad_norm": 0.1195407435297966, "learning_rate": 2.7451956521393983e-07, "loss": 0.013, "step": 10220 }, { "epoch": 0.4641850387671645, "grad_norm": 0.1975809931755066, "learning_rate": 2.677216235342561e-07, "loss": 0.0126, "step": 10230 }, { "epoch": 0.4646387875831636, "grad_norm": 0.25036758184432983, "learning_rate": 2.6100776734570345e-07, "loss": 0.0149, "step": 10240 }, { "epoch": 0.4650925363991628, "grad_norm": 0.20223237574100494, "learning_rate": 2.543780546550401e-07, "loss": 0.0141, "step": 10250 }, { "epoch": 0.465546285215162, "grad_norm": 0.30273565649986267, "learning_rate": 2.478325427420336e-07, "loss": 0.0136, "step": 10260 }, { "epoch": 0.4660000340311612, "grad_norm": 0.17049254477024078, "learning_rate": 2.4137128815896803e-07, "loss": 0.0141, "step": 10270 }, { "epoch": 0.4664537828471604, "grad_norm": 0.20468038320541382, "learning_rate": 2.3499434673015852e-07, "loss": 0.0129, "step": 10280 }, { "epoch": 0.46690753166315957, "grad_norm": 0.1765865534543991, "learning_rate": 2.2870177355146406e-07, "loss": 0.0133, "step": 10290 }, { "epoch": 0.46736128047915876, "grad_norm": 0.21547533571720123, "learning_rate": 2.2249362298981892e-07, "loss": 0.0166, "step": 10300 }, { "epoch": 0.46781502929515795, "grad_norm": 0.20720413327217102, "learning_rate": 2.1636994868275085e-07, "loss": 0.0108, "step": 10310 }, { "epoch": 0.46826877811115714, "grad_norm": 0.20710434019565582, "learning_rate": 2.1033080353793144e-07, "loss": 0.0124, "step": 10320 }, { "epoch": 0.4687225269271563, "grad_norm": 0.20307783782482147, "learning_rate": 2.043762397327087e-07, "loss": 0.0133, "step": 10330 }, { "epoch": 0.46917627574315546, "grad_norm": 0.27375757694244385, "learning_rate": 1.985063087136596e-07, "loss": 0.0118, "step": 10340 }, { "epoch": 0.46963002455915465, "grad_norm": 0.19632002711296082, "learning_rate": 1.927210611961494e-07, "loss": 0.013, "step": 10350 }, { "epoch": 0.47008377337515383, "grad_norm": 0.23758046329021454, "learning_rate": 1.870205471638864e-07, "loss": 0.0115, "step": 10360 }, { "epoch": 0.470537522191153, "grad_norm": 0.2348666489124298, "learning_rate": 1.814048158684978e-07, "loss": 0.0122, "step": 10370 }, { "epoch": 0.4709912710071522, "grad_norm": 0.24661144614219666, "learning_rate": 1.7587391582909452e-07, "loss": 0.0131, "step": 10380 }, { "epoch": 0.4714450198231514, "grad_norm": 0.22455532848834991, "learning_rate": 1.7042789483186273e-07, "loss": 0.0136, "step": 10390 }, { "epoch": 0.4718987686391506, "grad_norm": 0.20333920419216156, "learning_rate": 1.6506679992964292e-07, "loss": 0.012, "step": 10400 }, { "epoch": 0.4723525174551498, "grad_norm": 0.2547316551208496, "learning_rate": 1.597906774415281e-07, "loss": 0.0123, "step": 10410 }, { "epoch": 0.47280626627114897, "grad_norm": 0.22607842087745667, "learning_rate": 1.5459957295245965e-07, "loss": 0.0135, "step": 10420 }, { "epoch": 0.47326001508714816, "grad_norm": 0.24883075058460236, "learning_rate": 1.494935313128376e-07, "loss": 0.0123, "step": 10430 }, { "epoch": 0.4737137639031473, "grad_norm": 0.24136239290237427, "learning_rate": 1.4447259663812886e-07, "loss": 0.0133, "step": 10440 }, { "epoch": 0.4741675127191465, "grad_norm": 0.19822607934474945, "learning_rate": 1.395368123084917e-07, "loss": 0.0105, "step": 10450 }, { "epoch": 0.47462126153514567, "grad_norm": 0.2522118091583252, "learning_rate": 1.3468622096839524e-07, "loss": 0.0148, "step": 10460 }, { "epoch": 0.47507501035114486, "grad_norm": 0.18099448084831238, "learning_rate": 1.2992086452625175e-07, "loss": 0.0117, "step": 10470 }, { "epoch": 0.47552875916714404, "grad_norm": 0.2114916890859604, "learning_rate": 1.252407841540626e-07, "loss": 0.0134, "step": 10480 }, { "epoch": 0.47598250798314323, "grad_norm": 0.21255327761173248, "learning_rate": 1.2064602028704742e-07, "loss": 0.0131, "step": 10490 }, { "epoch": 0.4764362567991424, "grad_norm": 0.20135633647441864, "learning_rate": 1.1613661262331099e-07, "loss": 0.0131, "step": 10500 }, { "epoch": 0.4768900056151416, "grad_norm": 0.2436068207025528, "learning_rate": 1.1171260012348805e-07, "loss": 0.0132, "step": 10510 }, { "epoch": 0.4773437544311408, "grad_norm": 0.18859992921352386, "learning_rate": 1.0737402101041349e-07, "loss": 0.0128, "step": 10520 }, { "epoch": 0.47779750324714, "grad_norm": 0.19239741563796997, "learning_rate": 1.0312091276878821e-07, "loss": 0.0133, "step": 10530 }, { "epoch": 0.4782512520631391, "grad_norm": 0.2157398760318756, "learning_rate": 9.895331214485937e-08, "loss": 0.0127, "step": 10540 }, { "epoch": 0.4787050008791383, "grad_norm": 0.2037370353937149, "learning_rate": 9.487125514610063e-08, "loss": 0.0123, "step": 10550 }, { "epoch": 0.4791587496951375, "grad_norm": 0.23567423224449158, "learning_rate": 9.087477704089686e-08, "loss": 0.0127, "step": 10560 }, { "epoch": 0.4796124985111367, "grad_norm": 0.2106132209300995, "learning_rate": 8.696391235824886e-08, "loss": 0.0137, "step": 10570 }, { "epoch": 0.4800662473271359, "grad_norm": 0.27171772718429565, "learning_rate": 8.313869488746574e-08, "loss": 0.012, "step": 10580 }, { "epoch": 0.48051999614313506, "grad_norm": 0.22261816263198853, "learning_rate": 7.939915767787853e-08, "loss": 0.0098, "step": 10590 }, { "epoch": 0.48097374495913425, "grad_norm": 0.21753747761249542, "learning_rate": 7.574533303855491e-08, "loss": 0.0147, "step": 10600 }, { "epoch": 0.48142749377513344, "grad_norm": 0.19983717799186707, "learning_rate": 7.217725253801488e-08, "loss": 0.0141, "step": 10610 }, { "epoch": 0.48188124259113263, "grad_norm": 0.17265300452709198, "learning_rate": 6.869494700396328e-08, "loss": 0.013, "step": 10620 }, { "epoch": 0.4823349914071318, "grad_norm": 0.21228156983852386, "learning_rate": 6.529844652301997e-08, "loss": 0.0148, "step": 10630 }, { "epoch": 0.482788740223131, "grad_norm": 0.2246147096157074, "learning_rate": 6.19877804404645e-08, "loss": 0.0104, "step": 10640 }, { "epoch": 0.48324248903913014, "grad_norm": 0.22203919291496277, "learning_rate": 5.876297735997738e-08, "loss": 0.0139, "step": 10650 }, { "epoch": 0.48369623785512933, "grad_norm": 0.18882927298545837, "learning_rate": 5.562406514339369e-08, "loss": 0.0127, "step": 10660 }, { "epoch": 0.4841499866711285, "grad_norm": 0.1806451976299286, "learning_rate": 5.257107091046654e-08, "loss": 0.0112, "step": 10670 }, { "epoch": 0.4846037354871277, "grad_norm": 0.15896078944206238, "learning_rate": 4.9604021038628384e-08, "loss": 0.012, "step": 10680 }, { "epoch": 0.4850574843031269, "grad_norm": 0.2942860424518585, "learning_rate": 4.6722941162764546e-08, "loss": 0.0125, "step": 10690 }, { "epoch": 0.4855112331191261, "grad_norm": 0.3003031015396118, "learning_rate": 4.392785617499451e-08, "loss": 0.0133, "step": 10700 }, { "epoch": 0.4859649819351253, "grad_norm": 0.20426614582538605, "learning_rate": 4.1218790224450965e-08, "loss": 0.0146, "step": 10710 }, { "epoch": 0.48641873075112446, "grad_norm": 0.1857881397008896, "learning_rate": 3.859576671707554e-08, "loss": 0.0123, "step": 10720 }, { "epoch": 0.48687247956712365, "grad_norm": 0.26162266731262207, "learning_rate": 3.605880831541564e-08, "loss": 0.0162, "step": 10730 }, { "epoch": 0.48732622838312284, "grad_norm": 0.21630750596523285, "learning_rate": 3.36079369384279e-08, "loss": 0.0156, "step": 10740 }, { "epoch": 0.487779977199122, "grad_norm": 0.15894387662410736, "learning_rate": 3.124317376129171e-08, "loss": 0.009, "step": 10750 }, { "epoch": 0.48823372601512116, "grad_norm": 0.2581477761268616, "learning_rate": 2.8964539215220468e-08, "loss": 0.0134, "step": 10760 }, { "epoch": 0.48868747483112035, "grad_norm": 0.17207610607147217, "learning_rate": 2.6772052987290575e-08, "loss": 0.0117, "step": 10770 }, { "epoch": 0.48914122364711954, "grad_norm": 0.12550193071365356, "learning_rate": 2.4665734020270503e-08, "loss": 0.0118, "step": 10780 }, { "epoch": 0.4895949724631187, "grad_norm": 0.15162256360054016, "learning_rate": 2.2645600512452016e-08, "loss": 0.0141, "step": 10790 }, { "epoch": 0.4900487212791179, "grad_norm": 0.21558327972888947, "learning_rate": 2.0711669917501398e-08, "loss": 0.0108, "step": 10800 }, { "epoch": 0.4905024700951171, "grad_norm": 0.2502230703830719, "learning_rate": 1.8863958944300708e-08, "loss": 0.0144, "step": 10810 }, { "epoch": 0.4909562189111163, "grad_norm": 0.2717040479183197, "learning_rate": 1.710248355680788e-08, "loss": 0.0147, "step": 10820 }, { "epoch": 0.4914099677271155, "grad_norm": 0.20592103898525238, "learning_rate": 1.5427258973919058e-08, "loss": 0.0118, "step": 10830 }, { "epoch": 0.49186371654311467, "grad_norm": 0.24586917459964752, "learning_rate": 1.3838299669334255e-08, "loss": 0.0121, "step": 10840 }, { "epoch": 0.4923174653591138, "grad_norm": 0.14670969545841217, "learning_rate": 1.2335619371434126e-08, "loss": 0.0114, "step": 10850 }, { "epoch": 0.492771214175113, "grad_norm": 0.18548691272735596, "learning_rate": 1.0919231063161173e-08, "loss": 0.0127, "step": 10860 }, { "epoch": 0.4932249629911122, "grad_norm": 0.20851780474185944, "learning_rate": 9.589146981907604e-09, "loss": 0.0114, "step": 10870 }, { "epoch": 0.49367871180711137, "grad_norm": 0.21012237668037415, "learning_rate": 8.345378619408762e-09, "loss": 0.0097, "step": 10880 }, { "epoch": 0.49413246062311056, "grad_norm": 0.22415730357170105, "learning_rate": 7.187936721646527e-09, "loss": 0.0137, "step": 10890 }, { "epoch": 0.49458620943910975, "grad_norm": 0.17579254508018494, "learning_rate": 6.116831288751624e-09, "loss": 0.0133, "step": 10900 }, { "epoch": 0.49503995825510894, "grad_norm": 0.16525723040103912, "learning_rate": 5.13207157492257e-09, "loss": 0.0133, "step": 10910 }, { "epoch": 0.4954937070711081, "grad_norm": 0.23372505605220795, "learning_rate": 4.233666088341304e-09, "loss": 0.0135, "step": 10920 }, { "epoch": 0.4959474558871073, "grad_norm": 0.2231581211090088, "learning_rate": 3.4216225911032354e-09, "loss": 0.0128, "step": 10930 }, { "epoch": 0.4964012047031065, "grad_norm": 0.21696442365646362, "learning_rate": 2.6959480991484157e-09, "loss": 0.0111, "step": 10940 }, { "epoch": 0.49685495351910564, "grad_norm": 0.2921532690525055, "learning_rate": 2.0566488821993635e-09, "loss": 0.0161, "step": 10950 }, { "epoch": 0.4973087023351048, "grad_norm": 0.25637537240982056, "learning_rate": 1.503730463709996e-09, "loss": 0.0107, "step": 10960 }, { "epoch": 0.497762451151104, "grad_norm": 0.2902769148349762, "learning_rate": 1.0371976208167766e-09, "loss": 0.0156, "step": 10970 }, { "epoch": 0.4982161999671032, "grad_norm": 0.18658849596977234, "learning_rate": 6.570543842965293e-10, "loss": 0.014, "step": 10980 }, { "epoch": 0.4986699487831024, "grad_norm": 0.1382964849472046, "learning_rate": 3.6330403853201966e-10, "loss": 0.0103, "step": 10990 }, { "epoch": 0.4991236975991016, "grad_norm": 0.19721582531929016, "learning_rate": 1.5594912148420017e-10, "loss": 0.0142, "step": 11000 }, { "epoch": 0.49957744641510077, "grad_norm": 0.24301065504550934, "learning_rate": 3.49914246700056e-11, "loss": 0.0171, "step": 11010 }, { "epoch": 0.4999858203495, "step": 11019, "total_flos": 1.5507177612131697e+18, "train_loss": 0.022521909138005034, "train_runtime": 48168.4507, "train_samples_per_second": 1.83, "train_steps_per_second": 0.229 } ], "logging_steps": 10, "max_steps": 11019, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.5507177612131697e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }