[ { "loss": 1.8996, "grad_norm": 5.035277843475342, "learning_rate": 2.278481012658228e-05, "epoch": 0.012690355329949238, "step": 10 }, { "loss": 0.5315, "grad_norm": 1.102072834968567, "learning_rate": 4.810126582278481e-05, "epoch": 0.025380710659898477, "step": 20 }, { "loss": 0.3353, "grad_norm": 0.7798988819122314, "learning_rate": 7.341772151898734e-05, "epoch": 0.03807106598984772, "step": 30 }, { "loss": 0.2226, "grad_norm": 0.8653473854064941, "learning_rate": 9.873417721518988e-05, "epoch": 0.050761421319796954, "step": 40 }, { "loss": 0.164, "grad_norm": 0.7569780349731445, "learning_rate": 0.0001240506329113924, "epoch": 0.06345177664974619, "step": 50 }, { "loss": 0.1394, "grad_norm": 1.0211968421936035, "learning_rate": 0.00014936708860759494, "epoch": 0.07614213197969544, "step": 60 }, { "loss": 0.1201, "grad_norm": 0.5370887517929077, "learning_rate": 0.00017468354430379748, "epoch": 0.08883248730964467, "step": 70 }, { "loss": 0.122, "grad_norm": 0.49917498230934143, "learning_rate": 0.0002, "epoch": 0.10152284263959391, "step": 80 }, { "loss": 0.1217, "grad_norm": 0.4577413499355316, "learning_rate": 0.0001999779803602204, "epoch": 0.11421319796954314, "step": 90 }, { "loss": 0.0965, "grad_norm": 0.48522070050239563, "learning_rate": 0.00019991193113817244, "epoch": 0.12690355329949238, "step": 100 }, { "loss": 0.11, "grad_norm": 0.41902250051498413, "learning_rate": 0.00019980188142145754, "epoch": 0.13959390862944163, "step": 110 }, { "loss": 0.0823, "grad_norm": 0.5561641454696655, "learning_rate": 0.00019964787967517817, "epoch": 0.15228426395939088, "step": 120 }, { "loss": 0.0856, "grad_norm": 0.3316971957683563, "learning_rate": 0.00019944999372059388, "epoch": 0.1649746192893401, "step": 130 }, { "loss": 0.0849, "grad_norm": 0.372153639793396, "learning_rate": 0.00019920831070525342, "epoch": 0.17766497461928935, "step": 140 }, { "loss": 0.0929, "grad_norm": 0.33250877261161804, "learning_rate": 0.00019892293706461555, "epoch": 0.19035532994923857, "step": 150 }, { "eval_loss": 0.08791538327932358, "eval_runtime": 29.62, "eval_samples_per_second": 44.801, "eval_steps_per_second": 11.209, "epoch": 0.19923857868020303, "step": 157 }, { "loss": 0.0824, "grad_norm": 0.4130192995071411, "learning_rate": 0.00019859399847517567, "epoch": 0.20304568527918782, "step": 160 }, { "loss": 0.0902, "grad_norm": 0.3217241168022156, "learning_rate": 0.0001982216397991188, "epoch": 0.21573604060913706, "step": 170 }, { "loss": 0.0766, "grad_norm": 0.4728490710258484, "learning_rate": 0.0001978060250205232, "epoch": 0.22842639593908629, "step": 180 }, { "loss": 0.0844, "grad_norm": 0.5730077028274536, "learning_rate": 0.0001973473371731431, "epoch": 0.24111675126903553, "step": 190 }, { "loss": 0.0841, "grad_norm": 0.5745298862457275, "learning_rate": 0.00019684577825980192, "epoch": 0.25380710659898476, "step": 200 }, { "loss": 0.0797, "grad_norm": 0.3141058683395386, "learning_rate": 0.0001963015691634317, "epoch": 0.26649746192893403, "step": 210 }, { "loss": 0.0822, "grad_norm": 0.3730680048465729, "learning_rate": 0.00019571494954979775, "epoch": 0.27918781725888325, "step": 220 }, { "loss": 0.0677, "grad_norm": 0.3915182650089264, "learning_rate": 0.00019508617776195167, "epoch": 0.2918781725888325, "step": 230 }, { "loss": 0.08, "grad_norm": 0.3052193820476532, "learning_rate": 0.00019441553070645887, "epoch": 0.30456852791878175, "step": 240 }, { "loss": 0.0744, "grad_norm": 0.3673352003097534, "learning_rate": 0.000193703303731451, "epoch": 0.31725888324873097, "step": 250 }, { "loss": 0.0821, "grad_norm": 0.39443644881248474, "learning_rate": 0.00019294981049655668, "epoch": 0.3299492385786802, "step": 260 }, { "loss": 0.073, "grad_norm": 0.44178199768066406, "learning_rate": 0.0001921553828347681, "epoch": 0.3426395939086294, "step": 270 }, { "loss": 0.0784, "grad_norm": 0.4202715754508972, "learning_rate": 0.00019132037060630409, "epoch": 0.3553299492385787, "step": 280 }, { "loss": 0.0646, "grad_norm": 0.23640507459640503, "learning_rate": 0.00019044514154453434, "epoch": 0.3680203045685279, "step": 290 }, { "loss": 0.0785, "grad_norm": 0.4354120194911957, "learning_rate": 0.0001895300810940321, "epoch": 0.38071065989847713, "step": 300 }, { "loss": 0.0656, "grad_norm": 0.2467317283153534, "learning_rate": 0.00018857559224082736, "epoch": 0.3934010152284264, "step": 310 }, { "eval_loss": 0.0728072002530098, "eval_runtime": 19.9827, "eval_samples_per_second": 66.407, "eval_steps_per_second": 16.614, "epoch": 0.39847715736040606, "step": 314 }, { "loss": 0.0738, "grad_norm": 0.2969267666339874, "learning_rate": 0.00018758209533493444, "epoch": 0.40609137055837563, "step": 320 }, { "loss": 0.067, "grad_norm": 0.3527528643608093, "learning_rate": 0.00018655002790523328, "epoch": 0.41878172588832485, "step": 330 }, { "loss": 0.0714, "grad_norm": 0.2732889950275421, "learning_rate": 0.00018547984446678437, "epoch": 0.43147208121827413, "step": 340 }, { "loss": 0.0602, "grad_norm": 0.25770312547683716, "learning_rate": 0.000184372016320664, "epoch": 0.44416243654822335, "step": 350 }, { "loss": 0.0624, "grad_norm": 0.22473905980587006, "learning_rate": 0.00018322703134640654, "epoch": 0.45685279187817257, "step": 360 }, { "loss": 0.0709, "grad_norm": 0.3180300295352936, "learning_rate": 0.00018204539378714561, "epoch": 0.46954314720812185, "step": 370 }, { "loss": 0.0698, "grad_norm": 0.2796868085861206, "learning_rate": 0.00018082762402754936, "epoch": 0.48223350253807107, "step": 380 }, { "loss": 0.0658, "grad_norm": 0.3655967712402344, "learning_rate": 0.0001795742583646466, "epoch": 0.4949238578680203, "step": 390 }, { "loss": 0.0682, "grad_norm": 0.2886195182800293, "learning_rate": 0.0001782858487716455, "epoch": 0.5076142131979695, "step": 400 }, { "loss": 0.071, "grad_norm": 0.27021610736846924, "learning_rate": 0.00017696296265484862, "epoch": 0.5203045685279187, "step": 410 }, { "loss": 0.0636, "grad_norm": 0.28307008743286133, "learning_rate": 0.00017560618260377116, "epoch": 0.5329949238578681, "step": 420 }, { "loss": 0.0546, "grad_norm": 0.28294482827186584, "learning_rate": 0.00017421610613457282, "epoch": 0.5456852791878173, "step": 430 }, { "loss": 0.0612, "grad_norm": 0.2255251258611679, "learning_rate": 0.00017279334542691596, "epoch": 0.5583756345177665, "step": 440 }, { "loss": 0.0629, "grad_norm": 0.22404751181602478, "learning_rate": 0.0001713385270543661, "epoch": 0.5710659898477157, "step": 450 }, { "loss": 0.0596, "grad_norm": 0.2632795572280884, "learning_rate": 0.00016985229170845339, "epoch": 0.583756345177665, "step": 460 }, { "loss": 0.0717, "grad_norm": 0.3002878427505493, "learning_rate": 0.0001683352939165167, "epoch": 0.5964467005076142, "step": 470 }, { "eval_loss": 0.06722872704267502, "eval_runtime": 20.1214, "eval_samples_per_second": 65.95, "eval_steps_per_second": 16.5, "epoch": 0.5977157360406091, "step": 471 }, { "loss": 0.0618, "grad_norm": 0.15326248109340668, "learning_rate": 0.00016678820175345454, "epoch": 0.6091370558375635, "step": 480 }, { "loss": 0.0718, "grad_norm": 0.27122628688812256, "learning_rate": 0.00016521169654750968, "epoch": 0.6218274111675127, "step": 490 }, { "loss": 0.0636, "grad_norm": 0.29509711265563965, "learning_rate": 0.00016360647258021696, "epoch": 0.6345177664974619, "step": 500 }, { "loss": 0.0655, "grad_norm": 0.4090014100074768, "learning_rate": 0.00016197323678064697, "epoch": 0.6472081218274112, "step": 510 }, { "loss": 0.0606, "grad_norm": 0.2687474191188812, "learning_rate": 0.00016031270841407926, "epoch": 0.6598984771573604, "step": 520 }, { "loss": 0.0519, "grad_norm": 0.25125357508659363, "learning_rate": 0.00015862561876524338, "epoch": 0.6725888324873096, "step": 530 }, { "loss": 0.0623, "grad_norm": 0.21579739451408386, "learning_rate": 0.0001569127108162662, "epoch": 0.6852791878172588, "step": 540 }, { "loss": 0.0612, "grad_norm": 0.24012021720409393, "learning_rate": 0.000155174738919468, "epoch": 0.6979695431472082, "step": 550 }, { "loss": 0.0617, "grad_norm": 0.22273781895637512, "learning_rate": 0.00015341246846515096, "epoch": 0.7106598984771574, "step": 560 }, { "loss": 0.0627, "grad_norm": 0.29965269565582275, "learning_rate": 0.0001516266755445271, "epoch": 0.7233502538071066, "step": 570 }, { "loss": 0.0649, "grad_norm": 0.2375640720129013, "learning_rate": 0.00014981814660793314, "epoch": 0.7360406091370558, "step": 580 }, { "loss": 0.0653, "grad_norm": 0.2595769166946411, "learning_rate": 0.0001479876781184833, "epoch": 0.748730964467005, "step": 590 }, { "loss": 0.0634, "grad_norm": 0.28185659646987915, "learning_rate": 0.00014613607620131294, "epoch": 0.7614213197969543, "step": 600 }, { "loss": 0.0601, "grad_norm": 0.20655085146427155, "learning_rate": 0.00014426415628856663, "epoch": 0.7741116751269036, "step": 610 }, { "loss": 0.0632, "grad_norm": 0.4992614686489105, "learning_rate": 0.0001423727427602879, "epoch": 0.7868020304568528, "step": 620 }, { "eval_loss": 0.05841095373034477, "eval_runtime": 20.0018, "eval_samples_per_second": 66.344, "eval_steps_per_second": 16.599, "epoch": 0.7969543147208121, "step": 628 }, { "loss": 0.0522, "grad_norm": 0.2023015171289444, "learning_rate": 0.0001404626685813681, "epoch": 0.799492385786802, "step": 630 }, { "loss": 0.0567, "grad_norm": 0.20891991257667542, "learning_rate": 0.00013853477493471468, "epoch": 0.8121827411167513, "step": 640 }, { "loss": 0.0555, "grad_norm": 0.27132412791252136, "learning_rate": 0.00013658991085080025, "epoch": 0.8248730964467005, "step": 650 }, { "loss": 0.0594, "grad_norm": 0.22256866097450256, "learning_rate": 0.0001346289328337558, "epoch": 0.8375634517766497, "step": 660 }, { "loss": 0.0556, "grad_norm": 0.20859505236148834, "learning_rate": 0.00013265270448417234, "epoch": 0.850253807106599, "step": 670 }, { "loss": 0.0557, "grad_norm": 0.2204328030347824, "learning_rate": 0.00013066209611877746, "epoch": 0.8629441624365483, "step": 680 }, { "loss": 0.059, "grad_norm": 0.2515346109867096, "learning_rate": 0.00012865798438715413, "epoch": 0.8756345177664975, "step": 690 }, { "loss": 0.0546, "grad_norm": 0.3130325376987457, "learning_rate": 0.00012664125188567056, "epoch": 0.8883248730964467, "step": 700 }, { "loss": 0.0475, "grad_norm": 0.2509436011314392, "learning_rate": 0.00012461278676879098, "epoch": 0.9010152284263959, "step": 710 }, { "loss": 0.0561, "grad_norm": 0.23676852881908417, "learning_rate": 0.00012257348235793897, "epoch": 0.9137055837563451, "step": 720 }, { "loss": 0.0536, "grad_norm": 0.20894668996334076, "learning_rate": 0.00012052423674808513, "epoch": 0.9263959390862944, "step": 730 }, { "loss": 0.0517, "grad_norm": 0.18107716739177704, "learning_rate": 0.00011846595241223247, "epoch": 0.9390862944162437, "step": 740 }, { "loss": 0.0623, "grad_norm": 0.3013327717781067, "learning_rate": 0.00011639953580397367, "epoch": 0.9517766497461929, "step": 750 }, { "loss": 0.0579, "grad_norm": 0.19317802786827087, "learning_rate": 0.00011432589695829576, "epoch": 0.9644670050761421, "step": 760 }, { "loss": 0.0559, "grad_norm": 0.26291170716285706, "learning_rate": 0.00011224594909080704, "epoch": 0.9771573604060914, "step": 770 }, { "loss": 0.0537, "grad_norm": 0.28403881192207336, "learning_rate": 0.00011016060819556353, "epoch": 0.9898477157360406, "step": 780 }, { "eval_loss": 0.05360769107937813, "eval_runtime": 20.0465, "eval_samples_per_second": 66.196, "eval_steps_per_second": 16.562, "epoch": 0.9961928934010152, "step": 785 }, { "loss": 0.0502, "grad_norm": 0.1471383273601532, "learning_rate": 0.0001080707926416719, "epoch": 1.00253807106599, "step": 790 }, { "loss": 0.038, "grad_norm": 0.17716127634048462, "learning_rate": 0.00010597742276884614, "epoch": 1.015228426395939, "step": 800 }, { "loss": 0.0351, "grad_norm": 0.2006382942199707, "learning_rate": 0.00010388142048209676, "epoch": 1.0279187817258884, "step": 810 }, { "loss": 0.0375, "grad_norm": 0.2539692521095276, "learning_rate": 0.00010178370884573046, "epoch": 1.0406091370558375, "step": 820 }, { "loss": 0.0422, "grad_norm": 0.2615308165550232, "learning_rate": 9.968521167683905e-05, "epoch": 1.0532994923857868, "step": 830 }, { "loss": 0.0406, "grad_norm": 0.23757147789001465, "learning_rate": 9.758685313845727e-05, "epoch": 1.0659898477157361, "step": 840 }, { "loss": 0.0387, "grad_norm": 0.16979315876960754, "learning_rate": 9.548955733256803e-05, "epoch": 1.0786802030456852, "step": 850 }, { "loss": 0.0352, "grad_norm": 0.1853126734495163, "learning_rate": 9.339424789313445e-05, "epoch": 1.0913705583756346, "step": 860 }, { "loss": 0.0356, "grad_norm": 0.15106192231178284, "learning_rate": 9.13018475793382e-05, "epoch": 1.1040609137055837, "step": 870 }, { "loss": 0.037, "grad_norm": 0.20427311956882477, "learning_rate": 8.921327786920294e-05, "epoch": 1.116751269035533, "step": 880 }, { "loss": 0.0324, "grad_norm": 0.1580514758825302, "learning_rate": 8.712945855378218e-05, "epoch": 1.1294416243654823, "step": 890 }, { "loss": 0.0301, "grad_norm": 0.2191898375749588, "learning_rate": 8.505130733208968e-05, "epoch": 1.1421319796954315, "step": 900 }, { "loss": 0.0355, "grad_norm": 0.16614247858524323, "learning_rate": 8.297973940695163e-05, "epoch": 1.1548223350253808, "step": 910 }, { "loss": 0.0349, "grad_norm": 0.18907427787780762, "learning_rate": 8.091566708195786e-05, "epoch": 1.16751269035533, "step": 920 }, { "loss": 0.0336, "grad_norm": 0.24296258389949799, "learning_rate": 7.885999935968982e-05, "epoch": 1.1802030456852792, "step": 930 }, { "loss": 0.0372, "grad_norm": 0.1817648708820343, "learning_rate": 7.681364154140264e-05, "epoch": 1.1928934010152283, "step": 940 }, { "eval_loss": 0.057017017155885696, "eval_runtime": 19.9628, "eval_samples_per_second": 66.474, "eval_steps_per_second": 16.631, "epoch": 1.1954314720812182, "step": 942 }, { "loss": 0.03, "grad_norm": 0.19095705449581146, "learning_rate": 7.47774948283366e-05, "epoch": 1.2055837563451777, "step": 950 }, { "loss": 0.035, "grad_norm": 0.33682745695114136, "learning_rate": 7.275245592483492e-05, "epoch": 1.218274111675127, "step": 960 }, { "loss": 0.0384, "grad_norm": 0.2646084427833557, "learning_rate": 7.073941664344152e-05, "epoch": 1.2309644670050761, "step": 970 }, { "loss": 0.0287, "grad_norm": 0.1980791836977005, "learning_rate": 6.873926351215312e-05, "epoch": 1.2436548223350254, "step": 980 }, { "loss": 0.0342, "grad_norm": 0.18797655403614044, "learning_rate": 6.67528773839989e-05, "epoch": 1.2563451776649746, "step": 990 }, { "loss": 0.0337, "grad_norm": 0.24009937047958374, "learning_rate": 6.478113304911886e-05, "epoch": 1.2690355329949239, "step": 1000 }, { "loss": 0.0272, "grad_norm": 0.29159170389175415, "learning_rate": 6.282489884951295e-05, "epoch": 1.281725888324873, "step": 1010 }, { "loss": 0.036, "grad_norm": 0.16352516412734985, "learning_rate": 6.0885036296629064e-05, "epoch": 1.2944162436548223, "step": 1020 }, { "loss": 0.0292, "grad_norm": 0.17807820439338684, "learning_rate": 5.896239969195994e-05, "epoch": 1.3071065989847717, "step": 1030 }, { "loss": 0.0332, "grad_norm": 0.2500491738319397, "learning_rate": 5.7057835750814867e-05, "epoch": 1.3197969543147208, "step": 1040 }, { "loss": 0.0294, "grad_norm": 0.2208271473646164, "learning_rate": 5.517218322943224e-05, "epoch": 1.33248730964467, "step": 1050 }, { "loss": 0.0342, "grad_norm": 0.23927471041679382, "learning_rate": 5.3306272555597504e-05, "epoch": 1.3451776649746192, "step": 1060 }, { "loss": 0.0307, "grad_norm": 0.20309758186340332, "learning_rate": 5.1460925462928546e-05, "epoch": 1.3578680203045685, "step": 1070 }, { "loss": 0.0314, "grad_norm": 0.23275193572044373, "learning_rate": 4.96369546289904e-05, "epoch": 1.3705583756345177, "step": 1080 }, { "loss": 0.0333, "grad_norm": 0.2078331708908081, "learning_rate": 4.783516331739769e-05, "epoch": 1.383248730964467, "step": 1090 }, { "eval_loss": 0.05335332825779915, "eval_runtime": 19.9859, "eval_samples_per_second": 66.397, "eval_steps_per_second": 16.612, "epoch": 1.3946700507614214, "step": 1099 }, { "loss": 0.0309, "grad_norm": 0.18032079935073853, "learning_rate": 4.605634502406321e-05, "epoch": 1.3959390862944163, "step": 1100 }, { "loss": 0.0328, "grad_norm": 0.20803005993366241, "learning_rate": 4.430128312774804e-05, "epoch": 1.4086294416243654, "step": 1110 }, { "loss": 0.027, "grad_norm": 0.1680465191602707, "learning_rate": 4.2570750545067076e-05, "epoch": 1.4213197969543148, "step": 1120 }, { "loss": 0.0317, "grad_norm": 0.2528463900089264, "learning_rate": 4.086550939010227e-05, "epoch": 1.434010152284264, "step": 1130 }, { "loss": 0.0313, "grad_norm": 0.19024434685707092, "learning_rate": 3.9186310638773047e-05, "epoch": 1.4467005076142132, "step": 1140 }, { "loss": 0.0287, "grad_norm": 0.20934472978115082, "learning_rate": 3.753389379811185e-05, "epoch": 1.4593908629441623, "step": 1150 }, { "loss": 0.0265, "grad_norm": 0.29412180185317993, "learning_rate": 3.590898658059062e-05, "epoch": 1.4720812182741116, "step": 1160 }, { "loss": 0.0298, "grad_norm": 0.3268195390701294, "learning_rate": 3.4312304583641484e-05, "epoch": 1.484771573604061, "step": 1170 }, { "loss": 0.0251, "grad_norm": 0.17332251369953156, "learning_rate": 3.274455097451269e-05, "epoch": 1.49746192893401, "step": 1180 }, { "loss": 0.0318, "grad_norm": 0.3481772541999817, "learning_rate": 3.1206416180598995e-05, "epoch": 1.5101522842639594, "step": 1190 }, { "loss": 0.0335, "grad_norm": 0.24047453701496124, "learning_rate": 2.9698577585382282e-05, "epoch": 1.5228426395939088, "step": 1200 }, { "loss": 0.0339, "grad_norm": 0.21146714687347412, "learning_rate": 2.8221699230116793e-05, "epoch": 1.5355329949238579, "step": 1210 }, { "loss": 0.0308, "grad_norm": 0.140832781791687, "learning_rate": 2.67764315213902e-05, "epoch": 1.548223350253807, "step": 1220 }, { "loss": 0.026, "grad_norm": 0.1721792370080948, "learning_rate": 2.536341094468906e-05, "epoch": 1.5609137055837563, "step": 1230 }, { "loss": 0.0277, "grad_norm": 0.14980490505695343, "learning_rate": 2.398325978409539e-05, "epoch": 1.5736040609137056, "step": 1240 }, { "loss": 0.028, "grad_norm": 0.18908673524856567, "learning_rate": 2.263658584823717e-05, "epoch": 1.5862944162436547, "step": 1250 }, { "eval_loss": 0.052472274750471115, "eval_runtime": 19.9786, "eval_samples_per_second": 66.421, "eval_steps_per_second": 16.618, "epoch": 1.5939086294416245, "step": 1256 }, { "loss": 0.0272, "grad_norm": 0.12164825201034546, "learning_rate": 2.1323982202613735e-05, "epoch": 1.598984771573604, "step": 1260 }, { "loss": 0.0245, "grad_norm": 0.2658851146697998, "learning_rate": 2.004602690841414e-05, "epoch": 1.6116751269035534, "step": 1270 }, { "loss": 0.0304, "grad_norm": 0.2891974151134491, "learning_rate": 1.8803282767942954e-05, "epoch": 1.6243654822335025, "step": 1280 }, { "loss": 0.0292, "grad_norm": 0.2979351580142975, "learning_rate": 1.7596297076766455e-05, "epoch": 1.6370558375634516, "step": 1290 }, { "loss": 0.0284, "grad_norm": 0.20141719281673431, "learning_rate": 1.6425601382687405e-05, "epoch": 1.649746192893401, "step": 1300 }, { "loss": 0.0254, "grad_norm": 0.1950131356716156, "learning_rate": 1.5291711251655316e-05, "epoch": 1.6624365482233503, "step": 1310 }, { "loss": 0.0282, "grad_norm": 0.21205022931098938, "learning_rate": 1.41951260407149e-05, "epoch": 1.6751269035532994, "step": 1320 }, { "loss": 0.0247, "grad_norm": 0.2470894753932953, "learning_rate": 1.3136328678092746e-05, "epoch": 1.6878172588832487, "step": 1330 }, { "loss": 0.0257, "grad_norm": 0.26378998160362244, "learning_rate": 1.2115785450519434e-05, "epoch": 1.700507614213198, "step": 1340 }, { "loss": 0.0282, "grad_norm": 0.12680888175964355, "learning_rate": 1.1133945797879908e-05, "epoch": 1.7131979695431472, "step": 1350 }, { "loss": 0.0251, "grad_norm": 0.19744935631752014, "learning_rate": 1.019124211528365e-05, "epoch": 1.7258883248730963, "step": 1360 }, { "loss": 0.0327, "grad_norm": 0.18419434130191803, "learning_rate": 9.288089562640844e-06, "epoch": 1.7385786802030458, "step": 1370 }, { "loss": 0.0282, "grad_norm": 0.19115136563777924, "learning_rate": 8.42488588182897e-06, "epoch": 1.751269035532995, "step": 1380 }, { "loss": 0.0245, "grad_norm": 0.17252641916275024, "learning_rate": 7.602011221530236e-06, "epoch": 1.763959390862944, "step": 1390 }, { "loss": 0.029, "grad_norm": 0.22253695130348206, "learning_rate": 6.819827969816661e-06, "epoch": 1.7766497461928934, "step": 1400 }, { "loss": 0.0269, "grad_norm": 0.21938475966453552, "learning_rate": 6.078680594557163e-06, "epoch": 1.7893401015228427, "step": 1410 }, { "eval_loss": 0.05091211572289467, "eval_runtime": 20.0145, "eval_samples_per_second": 66.302, "eval_steps_per_second": 16.588, "epoch": 1.7931472081218274, "step": 1413 }, { "loss": 0.0305, "grad_norm": 0.2024271935224533, "learning_rate": 5.378895491716285e-06, "epoch": 1.8020304568527918, "step": 1420 }, { "loss": 0.029, "grad_norm": 0.22723488509655, "learning_rate": 4.720780841611738e-06, "epoch": 1.8147208121827412, "step": 1430 }, { "loss": 0.0266, "grad_norm": 0.2747625410556793, "learning_rate": 4.104626473194151e-06, "epoch": 1.8274111675126905, "step": 1440 }, { "loss": 0.0262, "grad_norm": 0.18593831360340118, "learning_rate": 3.5307037364083253e-06, "epoch": 1.8401015228426396, "step": 1450 }, { "loss": 0.0291, "grad_norm": 0.2651998996734619, "learning_rate": 2.9992653826927508e-06, "epoch": 1.8527918781725887, "step": 1460 }, { "loss": 0.026, "grad_norm": 0.19439752399921417, "learning_rate": 2.510545453669744e-06, "epoch": 1.865482233502538, "step": 1470 }, { "loss": 0.03, "grad_norm": 0.17483021318912506, "learning_rate": 2.06475917807506e-06, "epoch": 1.8781725888324874, "step": 1480 }, { "loss": 0.029, "grad_norm": 0.22444817423820496, "learning_rate": 1.662102876972882e-06, "epoch": 1.8908629441624365, "step": 1490 }, { "loss": 0.0243, "grad_norm": 0.17885605990886688, "learning_rate": 1.3027538772973026e-06, "epoch": 1.9035532994923858, "step": 1500 }, { "loss": 0.0272, "grad_norm": 0.19312232732772827, "learning_rate": 9.868704337588797e-07, "epoch": 1.9162436548223352, "step": 1510 }, { "loss": 0.0254, "grad_norm": 0.1709776520729065, "learning_rate": 7.145916591504098e-07, "epoch": 1.9289340101522843, "step": 1520 }, { "loss": 0.0252, "grad_norm": 0.18656505644321442, "learning_rate": 4.860374630826004e-07, "epoch": 1.9416243654822334, "step": 1530 }, { "loss": 0.0267, "grad_norm": 0.11956395953893661, "learning_rate": 3.0130849917681114e-07, "epoch": 1.9543147208121827, "step": 1540 }, { "loss": 0.0305, "grad_norm": 0.25038954615592957, "learning_rate": 1.604861207378794e-07, "epoch": 1.967005076142132, "step": 1550 }, { "loss": 0.025, "grad_norm": 0.19318363070487976, "learning_rate": 6.363234492674507e-08, "epoch": 1.9796954314720812, "step": 1560 }, { "loss": 0.0276, "grad_norm": 0.26012641191482544, "learning_rate": 1.0789825448476177e-08, "epoch": 1.9923857868020305, "step": 1570 }, { "eval_loss": 0.0504293330013752, "eval_runtime": 19.8337, "eval_samples_per_second": 66.906, "eval_steps_per_second": 16.739, "epoch": 1.9923857868020305, "step": 1570 }, { "train_runtime": 2681.8444, "train_samples_per_second": 18.795, "train_steps_per_second": 0.588, "total_flos": 6.800278675429786e+17, "train_loss": 0.06838441643920647, "epoch": 2.0, "step": 1576 } ]