[ { "loss": 1.5751, "grad_norm": 5.8206610679626465, "learning_rate": 4.556962025316456e-05, "epoch": 0.012690355329949238, "step": 10 }, { "loss": 0.6817, "grad_norm": 2.4909415245056152, "learning_rate": 9.620253164556962e-05, "epoch": 0.025380710659898477, "step": 20 }, { "loss": 0.5317, "grad_norm": 2.0207533836364746, "learning_rate": 0.0001468354430379747, "epoch": 0.03807106598984772, "step": 30 }, { "loss": 0.4042, "grad_norm": 2.2393276691436768, "learning_rate": 0.00019746835443037975, "epoch": 0.050761421319796954, "step": 40 }, { "loss": 0.3339, "grad_norm": 1.3759368658065796, "learning_rate": 0.0002481012658227848, "epoch": 0.06345177664974619, "step": 50 }, { "loss": 0.2921, "grad_norm": 1.4795056581497192, "learning_rate": 0.0002987341772151899, "epoch": 0.07614213197969544, "step": 60 }, { "loss": 0.2594, "grad_norm": 1.5126503705978394, "learning_rate": 0.00034936708860759495, "epoch": 0.08883248730964467, "step": 70 }, { "loss": 0.2418, "grad_norm": 1.2799267768859863, "learning_rate": 0.0004, "epoch": 0.10152284263959391, "step": 80 }, { "loss": 0.2121, "grad_norm": 1.081551194190979, "learning_rate": 0.0003999559607204408, "epoch": 0.11421319796954314, "step": 90 }, { "loss": 0.1958, "grad_norm": 1.5507794618606567, "learning_rate": 0.0003998238622763449, "epoch": 0.12690355329949238, "step": 100 }, { "loss": 0.21, "grad_norm": 0.9705930352210999, "learning_rate": 0.0003996037628429151, "epoch": 0.13959390862944163, "step": 110 }, { "loss": 0.173, "grad_norm": 0.9663600325584412, "learning_rate": 0.00039929575935035633, "epoch": 0.15228426395939088, "step": 120 }, { "loss": 0.1662, "grad_norm": 1.1074914932250977, "learning_rate": 0.00039889998744118777, "epoch": 0.1649746192893401, "step": 130 }, { "loss": 0.1733, "grad_norm": 0.9759091734886169, "learning_rate": 0.00039841662141050683, "epoch": 0.17766497461928935, "step": 140 }, { "loss": 0.1646, "grad_norm": 0.8136119842529297, "learning_rate": 0.0003978458741292311, "epoch": 0.19035532994923857, "step": 150 }, { "eval_loss": 0.15786977112293243, "eval_runtime": 24.4689, "eval_samples_per_second": 54.232, "eval_steps_per_second": 13.568, "epoch": 0.19923857868020303, "step": 157 }, { "loss": 0.1468, "grad_norm": 0.8433499336242676, "learning_rate": 0.00039718799695035134, "epoch": 0.20304568527918782, "step": 160 }, { "loss": 0.1321, "grad_norm": 0.5848754644393921, "learning_rate": 0.0003964432795982376, "epoch": 0.21573604060913706, "step": 170 }, { "loss": 0.1427, "grad_norm": 0.7149267792701721, "learning_rate": 0.0003956120500410464, "epoch": 0.22842639593908629, "step": 180 }, { "loss": 0.1277, "grad_norm": 0.6142985224723816, "learning_rate": 0.0003946946743462862, "epoch": 0.24111675126903553, "step": 190 }, { "loss": 0.1355, "grad_norm": 0.5795847773551941, "learning_rate": 0.00039369155651960383, "epoch": 0.25380710659898476, "step": 200 }, { "loss": 0.1229, "grad_norm": 0.6075690984725952, "learning_rate": 0.0003926031383268634, "epoch": 0.26649746192893403, "step": 210 }, { "loss": 0.1307, "grad_norm": 0.5007538199424744, "learning_rate": 0.0003914298990995955, "epoch": 0.27918781725888325, "step": 220 }, { "loss": 0.1179, "grad_norm": 0.7290500402450562, "learning_rate": 0.00039017235552390333, "epoch": 0.2918781725888325, "step": 230 }, { "loss": 0.1097, "grad_norm": 0.5982555747032166, "learning_rate": 0.00038883106141291774, "epoch": 0.30456852791878175, "step": 240 }, { "loss": 0.1304, "grad_norm": 0.5894684195518494, "learning_rate": 0.000387406607462902, "epoch": 0.31725888324873097, "step": 250 }, { "loss": 0.1253, "grad_norm": 0.4741029441356659, "learning_rate": 0.00038589962099311336, "epoch": 0.3299492385786802, "step": 260 }, { "loss": 0.1252, "grad_norm": 0.5968972444534302, "learning_rate": 0.0003843107656695362, "epoch": 0.3426395939086294, "step": 270 }, { "loss": 0.1179, "grad_norm": 0.6765711307525635, "learning_rate": 0.00038264074121260817, "epoch": 0.3553299492385787, "step": 280 }, { "loss": 0.1245, "grad_norm": 0.531008243560791, "learning_rate": 0.0003808902830890687, "epoch": 0.3680203045685279, "step": 290 }, { "loss": 0.1166, "grad_norm": 0.6060107350349426, "learning_rate": 0.0003790601621880642, "epoch": 0.38071065989847713, "step": 300 }, { "loss": 0.1004, "grad_norm": 0.6947969198226929, "learning_rate": 0.0003771511844816547, "epoch": 0.3934010152284264, "step": 310 }, { "eval_loss": 0.1160828173160553, "eval_runtime": 9.5412, "eval_samples_per_second": 139.081, "eval_steps_per_second": 34.796, "epoch": 0.39847715736040606, "step": 314 }, { "loss": 0.1162, "grad_norm": 0.6814019083976746, "learning_rate": 0.0003751641906698689, "epoch": 0.40609137055837563, "step": 320 }, { "loss": 0.1209, "grad_norm": 0.6832170486450195, "learning_rate": 0.00037310005581046656, "epoch": 0.41878172588832485, "step": 330 }, { "loss": 0.1132, "grad_norm": 0.5093628764152527, "learning_rate": 0.00037095968893356875, "epoch": 0.43147208121827413, "step": 340 }, { "loss": 0.1072, "grad_norm": 0.6062504053115845, "learning_rate": 0.000368744032641328, "epoch": 0.44416243654822335, "step": 350 }, { "loss": 0.1056, "grad_norm": 0.4274779260158539, "learning_rate": 0.00036645406269281307, "epoch": 0.45685279187817257, "step": 360 }, { "loss": 0.1178, "grad_norm": 0.469860315322876, "learning_rate": 0.00036409078757429123, "epoch": 0.46954314720812185, "step": 370 }, { "loss": 0.1157, "grad_norm": 0.5052468776702881, "learning_rate": 0.0003616552480550987, "epoch": 0.48223350253807107, "step": 380 }, { "loss": 0.1052, "grad_norm": 0.6406036019325256, "learning_rate": 0.0003591485167292932, "epoch": 0.4949238578680203, "step": 390 }, { "loss": 0.1099, "grad_norm": 0.4725392162799835, "learning_rate": 0.000356571697543291, "epoch": 0.5076142131979695, "step": 400 }, { "loss": 0.0995, "grad_norm": 0.566696286201477, "learning_rate": 0.00035392592530969724, "epoch": 0.5203045685279187, "step": 410 }, { "loss": 0.1005, "grad_norm": 0.5996329188346863, "learning_rate": 0.0003512123652075423, "epoch": 0.5329949238578681, "step": 420 }, { "loss": 0.0991, "grad_norm": 0.529081404209137, "learning_rate": 0.00034843221226914565, "epoch": 0.5456852791878173, "step": 430 }, { "loss": 0.0992, "grad_norm": 0.49170249700546265, "learning_rate": 0.0003455866908538319, "epoch": 0.5583756345177665, "step": 440 }, { "loss": 0.1054, "grad_norm": 0.4307686388492584, "learning_rate": 0.0003426770541087322, "epoch": 0.5710659898477157, "step": 450 }, { "loss": 0.0999, "grad_norm": 0.42143514752388, "learning_rate": 0.00033970458341690677, "epoch": 0.583756345177665, "step": 460 }, { "loss": 0.091, "grad_norm": 0.49249398708343506, "learning_rate": 0.0003366705878330334, "epoch": 0.5964467005076142, "step": 470 }, { "eval_loss": 0.09394794702529907, "eval_runtime": 9.5472, "eval_samples_per_second": 138.993, "eval_steps_per_second": 34.774, "epoch": 0.5977157360406091, "step": 471 }, { "loss": 0.0934, "grad_norm": 0.5115765333175659, "learning_rate": 0.00033357640350690907, "epoch": 0.6091370558375635, "step": 480 }, { "loss": 0.0869, "grad_norm": 0.43772196769714355, "learning_rate": 0.00033042339309501936, "epoch": 0.6218274111675127, "step": 490 }, { "loss": 0.0887, "grad_norm": 0.4491313397884369, "learning_rate": 0.0003272129451604339, "epoch": 0.6345177664974619, "step": 500 }, { "loss": 0.0912, "grad_norm": 0.3995817303657532, "learning_rate": 0.00032394647356129394, "epoch": 0.6472081218274112, "step": 510 }, { "loss": 0.0923, "grad_norm": 0.3744134306907654, "learning_rate": 0.0003206254168281585, "epoch": 0.6598984771573604, "step": 520 }, { "loss": 0.1008, "grad_norm": 0.47935950756073, "learning_rate": 0.00031725123753048676, "epoch": 0.6725888324873096, "step": 530 }, { "loss": 0.0976, "grad_norm": 0.5493370294570923, "learning_rate": 0.0003138254216325324, "epoch": 0.6852791878172588, "step": 540 }, { "loss": 0.0972, "grad_norm": 0.5088523030281067, "learning_rate": 0.000310349477838936, "epoch": 0.6979695431472082, "step": 550 }, { "loss": 0.0974, "grad_norm": 0.4719524681568146, "learning_rate": 0.0003068249369303019, "epoch": 0.7106598984771574, "step": 560 }, { "loss": 0.0871, "grad_norm": 0.5160706043243408, "learning_rate": 0.0003032533510890542, "epoch": 0.7233502538071066, "step": 570 }, { "loss": 0.0883, "grad_norm": 0.42795512080192566, "learning_rate": 0.0002996362932158663, "epoch": 0.7360406091370558, "step": 580 }, { "loss": 0.0914, "grad_norm": 0.5042217969894409, "learning_rate": 0.0002959753562369666, "epoch": 0.748730964467005, "step": 590 }, { "loss": 0.0858, "grad_norm": 0.41944852471351624, "learning_rate": 0.0002922721524026259, "epoch": 0.7614213197969543, "step": 600 }, { "loss": 0.0784, "grad_norm": 0.3173629343509674, "learning_rate": 0.00028852831257713326, "epoch": 0.7741116751269036, "step": 610 }, { "loss": 0.0899, "grad_norm": 0.5276319980621338, "learning_rate": 0.0002847454855205758, "epoch": 0.7868020304568528, "step": 620 }, { "eval_loss": 0.09238167852163315, "eval_runtime": 9.5377, "eval_samples_per_second": 139.132, "eval_steps_per_second": 34.809, "epoch": 0.7969543147208121, "step": 628 }, { "loss": 0.0868, "grad_norm": 0.5028842091560364, "learning_rate": 0.0002809253371627362, "epoch": 0.799492385786802, "step": 630 }, { "loss": 0.0834, "grad_norm": 0.40975555777549744, "learning_rate": 0.00027706954986942935, "epoch": 0.8121827411167513, "step": 640 }, { "loss": 0.0927, "grad_norm": 0.5146291851997375, "learning_rate": 0.0002731798217016005, "epoch": 0.8248730964467005, "step": 650 }, { "loss": 0.0813, "grad_norm": 0.3960341811180115, "learning_rate": 0.0002692578656675116, "epoch": 0.8375634517766497, "step": 660 }, { "loss": 0.0934, "grad_norm": 0.4789310693740845, "learning_rate": 0.00026530540896834467, "epoch": 0.850253807106599, "step": 670 }, { "loss": 0.0917, "grad_norm": 0.43943315744400024, "learning_rate": 0.00026132419223755493, "epoch": 0.8629441624365483, "step": 680 }, { "loss": 0.0862, "grad_norm": 0.5376535654067993, "learning_rate": 0.00025731596877430826, "epoch": 0.8756345177664975, "step": 690 }, { "loss": 0.088, "grad_norm": 0.452738493680954, "learning_rate": 0.0002532825037713411, "epoch": 0.8883248730964467, "step": 700 }, { "loss": 0.0819, "grad_norm": 0.4121873080730438, "learning_rate": 0.00024922557353758196, "epoch": 0.9010152284263959, "step": 710 }, { "loss": 0.0917, "grad_norm": 0.39808085560798645, "learning_rate": 0.00024514696471587794, "epoch": 0.9137055837563451, "step": 720 }, { "loss": 0.0802, "grad_norm": 0.43925586342811584, "learning_rate": 0.00024104847349617025, "epoch": 0.9263959390862944, "step": 730 }, { "loss": 0.0867, "grad_norm": 0.42966383695602417, "learning_rate": 0.00023693190482446493, "epoch": 0.9390862944162437, "step": 740 }, { "loss": 0.081, "grad_norm": 0.4360509514808655, "learning_rate": 0.00023279907160794733, "epoch": 0.9517766497461929, "step": 750 }, { "loss": 0.0943, "grad_norm": 0.4277612268924713, "learning_rate": 0.00022865179391659153, "epoch": 0.9644670050761421, "step": 760 }, { "loss": 0.0847, "grad_norm": 0.44837474822998047, "learning_rate": 0.00022449189818161407, "epoch": 0.9771573604060914, "step": 770 }, { "loss": 0.0857, "grad_norm": 0.4456328749656677, "learning_rate": 0.00022032121639112707, "epoch": 0.9898477157360406, "step": 780 }, { "eval_loss": 0.08206839114427567, "eval_runtime": 9.5084, "eval_samples_per_second": 139.561, "eval_steps_per_second": 34.917, "epoch": 0.9961928934010152, "step": 785 }, { "loss": 0.0885, "grad_norm": 0.3194786608219147, "learning_rate": 0.0002161415852833438, "epoch": 1.00253807106599, "step": 790 }, { "loss": 0.0613, "grad_norm": 0.3748820722103119, "learning_rate": 0.00021195484553769228, "epoch": 1.015228426395939, "step": 800 }, { "loss": 0.0608, "grad_norm": 0.4187680780887604, "learning_rate": 0.00020776284096419353, "epoch": 1.0279187817258884, "step": 810 }, { "loss": 0.0511, "grad_norm": 0.29912981390953064, "learning_rate": 0.0002035674176914609, "epoch": 1.0406091370558375, "step": 820 }, { "loss": 0.052, "grad_norm": 0.538901150226593, "learning_rate": 0.0001993704233536781, "epoch": 1.0532994923857868, "step": 830 }, { "loss": 0.0583, "grad_norm": 0.33813655376434326, "learning_rate": 0.00019517370627691454, "epoch": 1.0659898477157361, "step": 840 }, { "loss": 0.064, "grad_norm": 0.43808960914611816, "learning_rate": 0.00019097911466513606, "epoch": 1.0786802030456852, "step": 850 }, { "loss": 0.0644, "grad_norm": 0.34947964549064636, "learning_rate": 0.0001867884957862689, "epoch": 1.0913705583756346, "step": 860 }, { "loss": 0.0541, "grad_norm": 0.3377130627632141, "learning_rate": 0.0001826036951586764, "epoch": 1.1040609137055837, "step": 870 }, { "loss": 0.0585, "grad_norm": 0.5515534281730652, "learning_rate": 0.00017842655573840587, "epoch": 1.116751269035533, "step": 880 }, { "loss": 0.0499, "grad_norm": 0.37032851576805115, "learning_rate": 0.00017425891710756437, "epoch": 1.1294416243654823, "step": 890 }, { "loss": 0.0537, "grad_norm": 0.439820796251297, "learning_rate": 0.00017010261466417936, "epoch": 1.1421319796954315, "step": 900 }, { "loss": 0.053, "grad_norm": 0.47381657361984253, "learning_rate": 0.00016595947881390327, "epoch": 1.1548223350253808, "step": 910 }, { "loss": 0.0576, "grad_norm": 0.33624354004859924, "learning_rate": 0.00016183133416391573, "epoch": 1.16751269035533, "step": 920 }, { "loss": 0.0544, "grad_norm": 0.4324477016925812, "learning_rate": 0.00015771999871937964, "epoch": 1.1802030456852792, "step": 930 }, { "loss": 0.0562, "grad_norm": 0.48575344681739807, "learning_rate": 0.00015362728308280528, "epoch": 1.1928934010152283, "step": 940 }, { "eval_loss": 0.07984930276870728, "eval_runtime": 9.6222, "eval_samples_per_second": 137.91, "eval_steps_per_second": 34.504, "epoch": 1.1954314720812182, "step": 942 }, { "loss": 0.0556, "grad_norm": 0.3980204463005066, "learning_rate": 0.0001495549896566732, "epoch": 1.2055837563451777, "step": 950 }, { "loss": 0.0525, "grad_norm": 0.40248578786849976, "learning_rate": 0.00014550491184966985, "epoch": 1.218274111675127, "step": 960 }, { "loss": 0.0601, "grad_norm": 0.4088118076324463, "learning_rate": 0.00014147883328688305, "epoch": 1.2309644670050761, "step": 970 }, { "loss": 0.0561, "grad_norm": 0.49921339750289917, "learning_rate": 0.00013747852702430624, "epoch": 1.2436548223350254, "step": 980 }, { "loss": 0.0584, "grad_norm": 0.404526948928833, "learning_rate": 0.0001335057547679978, "epoch": 1.2563451776649746, "step": 990 }, { "loss": 0.0575, "grad_norm": 0.4947160482406616, "learning_rate": 0.00012956226609823771, "epoch": 1.2690355329949239, "step": 1000 }, { "loss": 0.0539, "grad_norm": 0.40267854928970337, "learning_rate": 0.0001256497976990259, "epoch": 1.281725888324873, "step": 1010 }, { "loss": 0.0571, "grad_norm": 0.4298776686191559, "learning_rate": 0.00012177007259325813, "epoch": 1.2944162436548223, "step": 1020 }, { "loss": 0.0472, "grad_norm": 0.4440214931964874, "learning_rate": 0.00011792479938391988, "epoch": 1.3071065989847717, "step": 1030 }, { "loss": 0.0557, "grad_norm": 0.4760426878929138, "learning_rate": 0.00011411567150162973, "epoch": 1.3197969543147208, "step": 1040 }, { "loss": 0.0525, "grad_norm": 0.38695594668388367, "learning_rate": 0.00011034436645886447, "epoch": 1.33248730964467, "step": 1050 }, { "loss": 0.0521, "grad_norm": 0.4604351222515106, "learning_rate": 0.00010661254511119501, "epoch": 1.3451776649746192, "step": 1060 }, { "loss": 0.0514, "grad_norm": 0.4119158089160919, "learning_rate": 0.00010292185092585709, "epoch": 1.3578680203045685, "step": 1070 }, { "loss": 0.0551, "grad_norm": 0.48942986130714417, "learning_rate": 9.92739092579808e-05, "epoch": 1.3705583756345177, "step": 1080 }, { "loss": 0.0507, "grad_norm": 0.34124505519866943, "learning_rate": 9.567032663479538e-05, "epoch": 1.383248730964467, "step": 1090 }, { "eval_loss": 0.07261822372674942, "eval_runtime": 9.4618, "eval_samples_per_second": 140.248, "eval_steps_per_second": 35.089, "epoch": 1.3946700507614214, "step": 1099 }, { "loss": 0.0521, "grad_norm": 0.37472787499427795, "learning_rate": 9.211269004812642e-05, "epoch": 1.3959390862944163, "step": 1100 }, { "loss": 0.0521, "grad_norm": 0.41362205147743225, "learning_rate": 8.860256625549608e-05, "epoch": 1.4086294416243654, "step": 1110 }, { "loss": 0.0502, "grad_norm": 0.41950875520706177, "learning_rate": 8.514150109013415e-05, "epoch": 1.4213197969543148, "step": 1120 }, { "loss": 0.0496, "grad_norm": 0.42962411046028137, "learning_rate": 8.173101878020454e-05, "epoch": 1.434010152284264, "step": 1130 }, { "loss": 0.0531, "grad_norm": 0.44318121671676636, "learning_rate": 7.837262127754609e-05, "epoch": 1.4467005076142132, "step": 1140 }, { "loss": 0.0521, "grad_norm": 0.4739588797092438, "learning_rate": 7.50677875962237e-05, "epoch": 1.4593908629441623, "step": 1150 }, { "loss": 0.0496, "grad_norm": 0.47502049803733826, "learning_rate": 7.181797316118124e-05, "epoch": 1.4720812182741116, "step": 1160 }, { "loss": 0.0489, "grad_norm": 0.41291266679763794, "learning_rate": 6.862460916728297e-05, "epoch": 1.484771573604061, "step": 1170 }, { "loss": 0.0477, "grad_norm": 0.3678615689277649, "learning_rate": 6.548910194902538e-05, "epoch": 1.49746192893401, "step": 1180 }, { "loss": 0.0539, "grad_norm": 0.4368578791618347, "learning_rate": 6.241283236119799e-05, "epoch": 1.5101522842639594, "step": 1190 }, { "loss": 0.0471, "grad_norm": 0.3706371784210205, "learning_rate": 5.9397155170764564e-05, "epoch": 1.5228426395939088, "step": 1200 }, { "loss": 0.0546, "grad_norm": 0.48822730779647827, "learning_rate": 5.644339846023359e-05, "epoch": 1.5355329949238579, "step": 1210 }, { "loss": 0.0458, "grad_norm": 0.3171629011631012, "learning_rate": 5.35528630427804e-05, "epoch": 1.548223350253807, "step": 1220 }, { "loss": 0.0524, "grad_norm": 0.5258194208145142, "learning_rate": 5.072682188937812e-05, "epoch": 1.5609137055837563, "step": 1230 }, { "loss": 0.0439, "grad_norm": 0.40187448263168335, "learning_rate": 4.796651956819078e-05, "epoch": 1.5736040609137056, "step": 1240 }, { "loss": 0.0488, "grad_norm": 0.5225071310997009, "learning_rate": 4.527317169647434e-05, "epoch": 1.5862944162436547, "step": 1250 }, { "eval_loss": 0.06903357803821564, "eval_runtime": 9.4791, "eval_samples_per_second": 139.992, "eval_steps_per_second": 35.024, "epoch": 1.5939086294416245, "step": 1256 }, { "loss": 0.0469, "grad_norm": 0.37204620242118835, "learning_rate": 4.264796440522747e-05, "epoch": 1.598984771573604, "step": 1260 }, { "loss": 0.045, "grad_norm": 0.40722259879112244, "learning_rate": 4.009205381682828e-05, "epoch": 1.6116751269035534, "step": 1270 }, { "loss": 0.05, "grad_norm": 0.4333907663822174, "learning_rate": 3.760656553588591e-05, "epoch": 1.6243654822335025, "step": 1280 }, { "loss": 0.0408, "grad_norm": 0.3389703631401062, "learning_rate": 3.519259415353291e-05, "epoch": 1.6370558375634516, "step": 1290 }, { "loss": 0.0478, "grad_norm": 0.43472781777381897, "learning_rate": 3.285120276537481e-05, "epoch": 1.649746192893401, "step": 1300 }, { "loss": 0.0449, "grad_norm": 0.35716742277145386, "learning_rate": 3.058342250331063e-05, "epoch": 1.6624365482233503, "step": 1310 }, { "loss": 0.0455, "grad_norm": 0.419575959444046, "learning_rate": 2.83902520814298e-05, "epoch": 1.6751269035532994, "step": 1320 }, { "loss": 0.044, "grad_norm": 0.29724448919296265, "learning_rate": 2.627265735618549e-05, "epoch": 1.6878172588832487, "step": 1330 }, { "loss": 0.0469, "grad_norm": 0.4269305169582367, "learning_rate": 2.4231570901038868e-05, "epoch": 1.700507614213198, "step": 1340 }, { "loss": 0.0459, "grad_norm": 0.5313772559165955, "learning_rate": 2.2267891595759816e-05, "epoch": 1.7131979695431472, "step": 1350 }, { "loss": 0.0418, "grad_norm": 0.4228779077529907, "learning_rate": 2.03824842305673e-05, "epoch": 1.7258883248730963, "step": 1360 }, { "loss": 0.0476, "grad_norm": 0.42005908489227295, "learning_rate": 1.8576179125281688e-05, "epoch": 1.7385786802030458, "step": 1370 }, { "loss": 0.0477, "grad_norm": 0.40062543749809265, "learning_rate": 1.684977176365794e-05, "epoch": 1.751269035532995, "step": 1380 }, { "loss": 0.0475, "grad_norm": 0.4797188639640808, "learning_rate": 1.5204022443060472e-05, "epoch": 1.763959390862944, "step": 1390 }, { "loss": 0.0468, "grad_norm": 0.391634076833725, "learning_rate": 1.3639655939633323e-05, "epoch": 1.7766497461928934, "step": 1400 }, { "loss": 0.0489, "grad_norm": 0.42081767320632935, "learning_rate": 1.2157361189114325e-05, "epoch": 1.7893401015228427, "step": 1410 }, { "eval_loss": 0.0661860853433609, "eval_runtime": 9.5602, "eval_samples_per_second": 138.805, "eval_steps_per_second": 34.727, "epoch": 1.7931472081218274, "step": 1413 }, { "loss": 0.0454, "grad_norm": 0.46962904930114746, "learning_rate": 1.075779098343257e-05, "epoch": 1.8020304568527918, "step": 1420 }, { "loss": 0.0441, "grad_norm": 0.4605846405029297, "learning_rate": 9.441561683223476e-06, "epoch": 1.8147208121827412, "step": 1430 }, { "loss": 0.0414, "grad_norm": 0.41056081652641296, "learning_rate": 8.209252946388302e-06, "epoch": 1.8274111675126905, "step": 1440 }, { "loss": 0.0376, "grad_norm": 0.4158293902873993, "learning_rate": 7.0614074728166506e-06, "epoch": 1.8401015228426396, "step": 1450 }, { "loss": 0.0466, "grad_norm": 0.5301814675331116, "learning_rate": 5.9985307653855016e-06, "epoch": 1.8527918781725887, "step": 1460 }, { "loss": 0.0458, "grad_norm": 0.44248396158218384, "learning_rate": 5.021090907339488e-06, "epoch": 1.865482233502538, "step": 1470 }, { "loss": 0.0454, "grad_norm": 0.32068517804145813, "learning_rate": 4.12951835615012e-06, "epoch": 1.8781725888324874, "step": 1480 }, { "loss": 0.0453, "grad_norm": 0.5575969815254211, "learning_rate": 3.324205753945764e-06, "epoch": 1.8908629441624365, "step": 1490 }, { "loss": 0.0404, "grad_norm": 0.3342937231063843, "learning_rate": 2.605507754594605e-06, "epoch": 1.9035532994923858, "step": 1500 }, { "loss": 0.0452, "grad_norm": 0.516743004322052, "learning_rate": 1.9737408675177594e-06, "epoch": 1.9162436548223352, "step": 1510 }, { "loss": 0.0438, "grad_norm": 0.4447910487651825, "learning_rate": 1.4291833183008196e-06, "epoch": 1.9289340101522843, "step": 1520 }, { "loss": 0.0523, "grad_norm": 0.46259304881095886, "learning_rate": 9.720749261652007e-07, "epoch": 1.9416243654822334, "step": 1530 }, { "loss": 0.0463, "grad_norm": 0.4858075678348541, "learning_rate": 6.026169983536223e-07, "epoch": 1.9543147208121827, "step": 1540 }, { "loss": 0.0486, "grad_norm": 0.43060243129730225, "learning_rate": 3.209722414757588e-07, "epoch": 1.967005076142132, "step": 1550 }, { "loss": 0.0416, "grad_norm": 0.4320192337036133, "learning_rate": 1.2726468985349015e-07, "epoch": 1.9796954314720812, "step": 1560 }, { "loss": 0.0368, "grad_norm": 0.38553470373153687, "learning_rate": 2.1579650896952354e-08, "epoch": 1.9923857868020305, "step": 1570 }, { "eval_loss": 0.06596987694501877, "eval_runtime": 9.5315, "eval_samples_per_second": 139.222, "eval_steps_per_second": 34.832, "epoch": 1.9923857868020305, "step": 1570 }, { "train_runtime": 366.7473, "train_samples_per_second": 137.435, "train_steps_per_second": 4.297, "total_flos": 9515820530880000.0, "train_loss": 0.10226353834652659, "epoch": 2.0, "step": 1576 } ]