{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 600, "global_step": 4677, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0021381227282446015, "grad_norm": 13.399383544921875, "learning_rate": 3.846153846153847e-07, "loss": 3.2323, "step": 10 }, { "epoch": 0.004276245456489203, "grad_norm": 12.956534385681152, "learning_rate": 8.11965811965812e-07, "loss": 3.2089, "step": 20 }, { "epoch": 0.006414368184733804, "grad_norm": 6.201784610748291, "learning_rate": 1.2393162393162394e-06, "loss": 2.8859, "step": 30 }, { "epoch": 0.008552490912978406, "grad_norm": 5.795752048492432, "learning_rate": 1.6666666666666667e-06, "loss": 2.469, "step": 40 }, { "epoch": 0.010690613641223007, "grad_norm": 4.1530351638793945, "learning_rate": 2.094017094017094e-06, "loss": 2.0068, "step": 50 }, { "epoch": 0.012828736369467608, "grad_norm": 4.0573530197143555, "learning_rate": 2.5213675213675216e-06, "loss": 1.5318, "step": 60 }, { "epoch": 0.014966859097712209, "grad_norm": 9.95679759979248, "learning_rate": 2.948717948717949e-06, "loss": 1.073, "step": 70 }, { "epoch": 0.01710498182595681, "grad_norm": 2.138951539993286, "learning_rate": 3.3760683760683765e-06, "loss": 0.7936, "step": 80 }, { "epoch": 0.01924310455420141, "grad_norm": 1.6260210275650024, "learning_rate": 3.8034188034188036e-06, "loss": 0.6121, "step": 90 }, { "epoch": 0.021381227282446014, "grad_norm": 3.553804636001587, "learning_rate": 4.230769230769231e-06, "loss": 0.4587, "step": 100 }, { "epoch": 0.023519350010690613, "grad_norm": 4.238368988037109, "learning_rate": 4.658119658119659e-06, "loss": 0.3611, "step": 110 }, { "epoch": 0.025657472738935216, "grad_norm": 3.7698726654052734, "learning_rate": 5.085470085470086e-06, "loss": 0.3285, "step": 120 }, { "epoch": 0.027795595467179815, "grad_norm": 6.797550201416016, "learning_rate": 5.512820512820514e-06, "loss": 0.2798, "step": 130 }, { "epoch": 0.029933718195424418, "grad_norm": 4.188291549682617, "learning_rate": 5.940170940170941e-06, "loss": 0.2401, "step": 140 }, { "epoch": 0.03207184092366902, "grad_norm": 0.8605281114578247, "learning_rate": 6.367521367521368e-06, "loss": 0.2354, "step": 150 }, { "epoch": 0.03420996365191362, "grad_norm": 1.5260062217712402, "learning_rate": 6.794871794871796e-06, "loss": 0.1918, "step": 160 }, { "epoch": 0.03634808638015822, "grad_norm": 1.3726520538330078, "learning_rate": 7.222222222222223e-06, "loss": 0.1818, "step": 170 }, { "epoch": 0.03848620910840282, "grad_norm": 1.8705629110336304, "learning_rate": 7.649572649572649e-06, "loss": 0.153, "step": 180 }, { "epoch": 0.040624331836647425, "grad_norm": 2.4513299465179443, "learning_rate": 8.076923076923077e-06, "loss": 0.154, "step": 190 }, { "epoch": 0.04276245456489203, "grad_norm": 0.2777731418609619, "learning_rate": 8.504273504273505e-06, "loss": 0.1415, "step": 200 }, { "epoch": 0.04490057729313662, "grad_norm": 0.23002663254737854, "learning_rate": 8.931623931623933e-06, "loss": 0.1326, "step": 210 }, { "epoch": 0.047038700021381226, "grad_norm": 1.1118742227554321, "learning_rate": 9.358974358974359e-06, "loss": 0.126, "step": 220 }, { "epoch": 0.04917682274962583, "grad_norm": 1.368764042854309, "learning_rate": 9.786324786324787e-06, "loss": 0.1286, "step": 230 }, { "epoch": 0.05131494547787043, "grad_norm": 0.519333004951477, "learning_rate": 9.999968751679245e-06, "loss": 0.1195, "step": 240 }, { "epoch": 0.053453068206115034, "grad_norm": 0.5977477431297302, "learning_rate": 9.999718767456692e-06, "loss": 0.1206, "step": 250 }, { "epoch": 0.05559119093435963, "grad_norm": 0.4855315089225769, "learning_rate": 9.999218811510088e-06, "loss": 0.1103, "step": 260 }, { "epoch": 0.05772931366260423, "grad_norm": 0.3158447742462158, "learning_rate": 9.998468908835808e-06, "loss": 0.1089, "step": 270 }, { "epoch": 0.059867436390848836, "grad_norm": 0.25278440117836, "learning_rate": 9.997469096926852e-06, "loss": 0.1089, "step": 280 }, { "epoch": 0.06200555911909344, "grad_norm": 0.25526583194732666, "learning_rate": 9.996219425770975e-06, "loss": 0.1024, "step": 290 }, { "epoch": 0.06414368184733804, "grad_norm": 0.7181093692779541, "learning_rate": 9.994719957848182e-06, "loss": 0.1004, "step": 300 }, { "epoch": 0.06628180457558264, "grad_norm": 0.5958463549613953, "learning_rate": 9.992970768127605e-06, "loss": 0.0957, "step": 310 }, { "epoch": 0.06841992730382725, "grad_norm": 0.8600453734397888, "learning_rate": 9.990971944063758e-06, "loss": 0.1029, "step": 320 }, { "epoch": 0.07055805003207184, "grad_norm": 0.267674058675766, "learning_rate": 9.98872358559216e-06, "loss": 0.0985, "step": 330 }, { "epoch": 0.07269617276031644, "grad_norm": 2.607964038848877, "learning_rate": 9.986225805124345e-06, "loss": 0.0921, "step": 340 }, { "epoch": 0.07483429548856105, "grad_norm": 0.9426595568656921, "learning_rate": 9.983478727542233e-06, "loss": 0.089, "step": 350 }, { "epoch": 0.07697241821680564, "grad_norm": 0.5454517602920532, "learning_rate": 9.980482490191895e-06, "loss": 0.0898, "step": 360 }, { "epoch": 0.07911054094505024, "grad_norm": 1.0879571437835693, "learning_rate": 9.977237242876677e-06, "loss": 0.0932, "step": 370 }, { "epoch": 0.08124866367329485, "grad_norm": 0.4590705335140228, "learning_rate": 9.973743147849721e-06, "loss": 0.0884, "step": 380 }, { "epoch": 0.08338678640153944, "grad_norm": 0.5434951186180115, "learning_rate": 9.970000379805843e-06, "loss": 0.0936, "step": 390 }, { "epoch": 0.08552490912978405, "grad_norm": 5.244622707366943, "learning_rate": 9.966009125872806e-06, "loss": 0.0892, "step": 400 }, { "epoch": 0.08766303185802865, "grad_norm": 0.16936945915222168, "learning_rate": 9.96176958560196e-06, "loss": 0.0872, "step": 410 }, { "epoch": 0.08980115458627325, "grad_norm": 0.4548771381378174, "learning_rate": 9.957281970958264e-06, "loss": 0.0879, "step": 420 }, { "epoch": 0.09193927731451786, "grad_norm": 1.1364250183105469, "learning_rate": 9.952546506309691e-06, "loss": 0.0869, "step": 430 }, { "epoch": 0.09407740004276245, "grad_norm": 0.7596004605293274, "learning_rate": 9.94756342841601e-06, "loss": 0.0873, "step": 440 }, { "epoch": 0.09621552277100706, "grad_norm": 0.2505151331424713, "learning_rate": 9.94233298641695e-06, "loss": 0.088, "step": 450 }, { "epoch": 0.09835364549925166, "grad_norm": 0.16528713703155518, "learning_rate": 9.936855441819744e-06, "loss": 0.0834, "step": 460 }, { "epoch": 0.10049176822749625, "grad_norm": 0.43566420674324036, "learning_rate": 9.931131068486045e-06, "loss": 0.0808, "step": 470 }, { "epoch": 0.10262989095574086, "grad_norm": 0.24824728071689606, "learning_rate": 9.925160152618246e-06, "loss": 0.0871, "step": 480 }, { "epoch": 0.10476801368398546, "grad_norm": 0.47349920868873596, "learning_rate": 9.918942992745161e-06, "loss": 0.0827, "step": 490 }, { "epoch": 0.10690613641223007, "grad_norm": 0.30691832304000854, "learning_rate": 9.912479899707117e-06, "loss": 0.0834, "step": 500 }, { "epoch": 0.10904425914047466, "grad_norm": 0.1434836983680725, "learning_rate": 9.905771196640384e-06, "loss": 0.0811, "step": 510 }, { "epoch": 0.11118238186871926, "grad_norm": 0.38805854320526123, "learning_rate": 9.898817218961043e-06, "loss": 0.0805, "step": 520 }, { "epoch": 0.11332050459696387, "grad_norm": 0.9806022644042969, "learning_rate": 9.89161831434821e-06, "loss": 0.0832, "step": 530 }, { "epoch": 0.11545862732520847, "grad_norm": 0.4095519185066223, "learning_rate": 9.88417484272665e-06, "loss": 0.0821, "step": 540 }, { "epoch": 0.11759675005345306, "grad_norm": 0.14556068181991577, "learning_rate": 9.87648717624878e-06, "loss": 0.0813, "step": 550 }, { "epoch": 0.11973487278169767, "grad_norm": 0.1319616138935089, "learning_rate": 9.868555699276065e-06, "loss": 0.0783, "step": 560 }, { "epoch": 0.12187299550994227, "grad_norm": 0.34334319829940796, "learning_rate": 9.860380808359808e-06, "loss": 0.0812, "step": 570 }, { "epoch": 0.12401111823818688, "grad_norm": 0.28321507573127747, "learning_rate": 9.851962912221315e-06, "loss": 0.0833, "step": 580 }, { "epoch": 0.1261492409664315, "grad_norm": 0.28973206877708435, "learning_rate": 9.843302431731456e-06, "loss": 0.0781, "step": 590 }, { "epoch": 0.12828736369467608, "grad_norm": 1.1749194860458374, "learning_rate": 9.834399799889637e-06, "loss": 0.0843, "step": 600 }, { "epoch": 0.12828736369467608, "eval_loss": 0.08129256218671799, "eval_runtime": 471.6671, "eval_samples_per_second": 4.906, "eval_steps_per_second": 0.307, "step": 600 }, { "epoch": 0.13042548642292068, "grad_norm": 0.261525422334671, "learning_rate": 9.825255461802137e-06, "loss": 0.0819, "step": 610 }, { "epoch": 0.13256360915116527, "grad_norm": 0.3438051640987396, "learning_rate": 9.815869874659866e-06, "loss": 0.0823, "step": 620 }, { "epoch": 0.13470173187940987, "grad_norm": 0.34199830889701843, "learning_rate": 9.806243507715494e-06, "loss": 0.0808, "step": 630 }, { "epoch": 0.1368398546076545, "grad_norm": 0.22008945047855377, "learning_rate": 9.796376842260004e-06, "loss": 0.0774, "step": 640 }, { "epoch": 0.1389779773358991, "grad_norm": 0.1826457530260086, "learning_rate": 9.786270371598613e-06, "loss": 0.0748, "step": 650 }, { "epoch": 0.14111610006414368, "grad_norm": 0.15165981650352478, "learning_rate": 9.775924601026127e-06, "loss": 0.0782, "step": 660 }, { "epoch": 0.14325422279238828, "grad_norm": 0.3881567120552063, "learning_rate": 9.765340047801656e-06, "loss": 0.0764, "step": 670 }, { "epoch": 0.14539234552063288, "grad_norm": 0.5628389120101929, "learning_rate": 9.754517241122771e-06, "loss": 0.0774, "step": 680 }, { "epoch": 0.14753046824887747, "grad_norm": 0.18720681965351105, "learning_rate": 9.743456722099039e-06, "loss": 0.0779, "step": 690 }, { "epoch": 0.1496685909771221, "grad_norm": 0.21351012587547302, "learning_rate": 9.732159043724963e-06, "loss": 0.0782, "step": 700 }, { "epoch": 0.1518067137053667, "grad_norm": 0.31227338314056396, "learning_rate": 9.720624770852341e-06, "loss": 0.077, "step": 710 }, { "epoch": 0.1539448364336113, "grad_norm": 0.18778669834136963, "learning_rate": 9.70885448016203e-06, "loss": 0.0751, "step": 720 }, { "epoch": 0.15608295916185588, "grad_norm": 0.2396603375673294, "learning_rate": 9.696848760135093e-06, "loss": 0.0776, "step": 730 }, { "epoch": 0.15822108189010048, "grad_norm": 0.24791832268238068, "learning_rate": 9.684608211023406e-06, "loss": 0.0729, "step": 740 }, { "epoch": 0.1603592046183451, "grad_norm": 0.5819320678710938, "learning_rate": 9.672133444819619e-06, "loss": 0.0738, "step": 750 }, { "epoch": 0.1624973273465897, "grad_norm": 0.2177390158176422, "learning_rate": 9.659425085226581e-06, "loss": 0.0789, "step": 760 }, { "epoch": 0.1646354500748343, "grad_norm": 0.14574196934700012, "learning_rate": 9.646483767626138e-06, "loss": 0.0755, "step": 770 }, { "epoch": 0.1667735728030789, "grad_norm": 0.16599872708320618, "learning_rate": 9.63331013904738e-06, "loss": 0.0758, "step": 780 }, { "epoch": 0.16891169553132349, "grad_norm": 0.21911288797855377, "learning_rate": 9.619904858134281e-06, "loss": 0.0763, "step": 790 }, { "epoch": 0.1710498182595681, "grad_norm": 0.12567879259586334, "learning_rate": 9.606268595112776e-06, "loss": 0.0752, "step": 800 }, { "epoch": 0.1731879409878127, "grad_norm": 0.13315202295780182, "learning_rate": 9.59240203175725e-06, "loss": 0.0727, "step": 810 }, { "epoch": 0.1753260637160573, "grad_norm": 0.4594784379005432, "learning_rate": 9.57830586135644e-06, "loss": 0.076, "step": 820 }, { "epoch": 0.1774641864443019, "grad_norm": 0.15148596465587616, "learning_rate": 9.5639807886788e-06, "loss": 0.0766, "step": 830 }, { "epoch": 0.1796023091725465, "grad_norm": 0.5493207573890686, "learning_rate": 9.549427529937233e-06, "loss": 0.0769, "step": 840 }, { "epoch": 0.18174043190079112, "grad_norm": 0.1707722693681717, "learning_rate": 9.534646812753301e-06, "loss": 0.0733, "step": 850 }, { "epoch": 0.1838785546290357, "grad_norm": 0.15772108733654022, "learning_rate": 9.519639376120841e-06, "loss": 0.0767, "step": 860 }, { "epoch": 0.1860166773572803, "grad_norm": 0.36901381611824036, "learning_rate": 9.504405970369017e-06, "loss": 0.0767, "step": 870 }, { "epoch": 0.1881548000855249, "grad_norm": 0.21110327541828156, "learning_rate": 9.488947357124812e-06, "loss": 0.0749, "step": 880 }, { "epoch": 0.1902929228137695, "grad_norm": 0.1756543517112732, "learning_rate": 9.473264309274934e-06, "loss": 0.0747, "step": 890 }, { "epoch": 0.19243104554201412, "grad_norm": 0.19304226338863373, "learning_rate": 9.45735761092719e-06, "loss": 0.0722, "step": 900 }, { "epoch": 0.19456916827025872, "grad_norm": 0.14899924397468567, "learning_rate": 9.441228057371275e-06, "loss": 0.0722, "step": 910 }, { "epoch": 0.19670729099850331, "grad_norm": 0.24590201675891876, "learning_rate": 9.42487645503901e-06, "loss": 0.0753, "step": 920 }, { "epoch": 0.1988454137267479, "grad_norm": 0.2768633961677551, "learning_rate": 9.408303621464024e-06, "loss": 0.0738, "step": 930 }, { "epoch": 0.2009835364549925, "grad_norm": 0.44510579109191895, "learning_rate": 9.391510385240876e-06, "loss": 0.0725, "step": 940 }, { "epoch": 0.20312165918323713, "grad_norm": 0.15240268409252167, "learning_rate": 9.374497585983635e-06, "loss": 0.0748, "step": 950 }, { "epoch": 0.20525978191148173, "grad_norm": 0.307160884141922, "learning_rate": 9.3572660742839e-06, "loss": 0.0734, "step": 960 }, { "epoch": 0.20739790463972632, "grad_norm": 0.19927459955215454, "learning_rate": 9.339816711668262e-06, "loss": 0.0723, "step": 970 }, { "epoch": 0.20953602736797092, "grad_norm": 0.12397543340921402, "learning_rate": 9.322150370555242e-06, "loss": 0.0728, "step": 980 }, { "epoch": 0.2116741500962155, "grad_norm": 0.20738820731639862, "learning_rate": 9.304267934211672e-06, "loss": 0.0749, "step": 990 }, { "epoch": 0.21381227282446014, "grad_norm": 0.14943927526474, "learning_rate": 9.28617029670853e-06, "loss": 0.0717, "step": 1000 }, { "epoch": 0.21595039555270473, "grad_norm": 0.16114865243434906, "learning_rate": 9.267858362876238e-06, "loss": 0.0714, "step": 1010 }, { "epoch": 0.21808851828094933, "grad_norm": 0.14569199085235596, "learning_rate": 9.249333048259426e-06, "loss": 0.0751, "step": 1020 }, { "epoch": 0.22022664100919392, "grad_norm": 0.46952638030052185, "learning_rate": 9.230595279071156e-06, "loss": 0.0712, "step": 1030 }, { "epoch": 0.22236476373743852, "grad_norm": 0.1461559236049652, "learning_rate": 9.211645992146618e-06, "loss": 0.0716, "step": 1040 }, { "epoch": 0.22450288646568314, "grad_norm": 0.13023297488689423, "learning_rate": 9.192486134896282e-06, "loss": 0.0696, "step": 1050 }, { "epoch": 0.22664100919392774, "grad_norm": 0.3277701437473297, "learning_rate": 9.17311666525854e-06, "loss": 0.0694, "step": 1060 }, { "epoch": 0.22877913192217234, "grad_norm": 0.11651341617107391, "learning_rate": 9.153538551651808e-06, "loss": 0.0681, "step": 1070 }, { "epoch": 0.23091725465041693, "grad_norm": 0.14407022297382355, "learning_rate": 9.133752772926102e-06, "loss": 0.0717, "step": 1080 }, { "epoch": 0.23305537737866153, "grad_norm": 0.14212338626384735, "learning_rate": 9.113760318314109e-06, "loss": 0.0701, "step": 1090 }, { "epoch": 0.23519350010690612, "grad_norm": 0.1330636888742447, "learning_rate": 9.09356218738172e-06, "loss": 0.0714, "step": 1100 }, { "epoch": 0.23733162283515075, "grad_norm": 0.19741296768188477, "learning_rate": 9.073159389978056e-06, "loss": 0.0704, "step": 1110 }, { "epoch": 0.23946974556339534, "grad_norm": 0.1623307764530182, "learning_rate": 9.052552946184985e-06, "loss": 0.0682, "step": 1120 }, { "epoch": 0.24160786829163994, "grad_norm": 0.11294026672840118, "learning_rate": 9.031743886266109e-06, "loss": 0.0686, "step": 1130 }, { "epoch": 0.24374599101988453, "grad_norm": 0.2032402753829956, "learning_rate": 9.010733250615264e-06, "loss": 0.0685, "step": 1140 }, { "epoch": 0.24588411374812913, "grad_norm": 0.14151214063167572, "learning_rate": 8.989522089704502e-06, "loss": 0.0665, "step": 1150 }, { "epoch": 0.24802223647637375, "grad_norm": 0.16347168385982513, "learning_rate": 8.96811146403156e-06, "loss": 0.0683, "step": 1160 }, { "epoch": 0.2501603592046183, "grad_norm": 0.2190970480442047, "learning_rate": 8.946502444066854e-06, "loss": 0.0702, "step": 1170 }, { "epoch": 0.252298481932863, "grad_norm": 0.1208793967962265, "learning_rate": 8.924696110199944e-06, "loss": 0.0687, "step": 1180 }, { "epoch": 0.25443660466110757, "grad_norm": 0.16621464490890503, "learning_rate": 8.902693552685532e-06, "loss": 0.0705, "step": 1190 }, { "epoch": 0.25657472738935216, "grad_norm": 0.468980997800827, "learning_rate": 8.880495871588934e-06, "loss": 0.0712, "step": 1200 }, { "epoch": 0.25657472738935216, "eval_loss": 0.0711250901222229, "eval_runtime": 472.4294, "eval_samples_per_second": 4.898, "eval_steps_per_second": 0.307, "step": 1200 }, { "epoch": 0.25871285011759676, "grad_norm": 0.145741268992424, "learning_rate": 8.858104176731102e-06, "loss": 0.0663, "step": 1210 }, { "epoch": 0.26085097284584136, "grad_norm": 0.26027005910873413, "learning_rate": 8.835519587633116e-06, "loss": 0.0683, "step": 1220 }, { "epoch": 0.26298909557408595, "grad_norm": 0.1468413919210434, "learning_rate": 8.812743233460224e-06, "loss": 0.07, "step": 1230 }, { "epoch": 0.26512721830233055, "grad_norm": 0.12431179732084274, "learning_rate": 8.789776252965378e-06, "loss": 0.0712, "step": 1240 }, { "epoch": 0.26726534103057514, "grad_norm": 0.10897620022296906, "learning_rate": 8.76661979443231e-06, "loss": 0.0706, "step": 1250 }, { "epoch": 0.26940346375881974, "grad_norm": 0.4356347620487213, "learning_rate": 8.74327501561811e-06, "loss": 0.071, "step": 1260 }, { "epoch": 0.27154158648706433, "grad_norm": 0.13456492125988007, "learning_rate": 8.71974308369535e-06, "loss": 0.075, "step": 1270 }, { "epoch": 0.273679709215309, "grad_norm": 0.15002375841140747, "learning_rate": 8.696025175193725e-06, "loss": 0.0683, "step": 1280 }, { "epoch": 0.2758178319435536, "grad_norm": 0.25080016255378723, "learning_rate": 8.672122475941228e-06, "loss": 0.0669, "step": 1290 }, { "epoch": 0.2779559546717982, "grad_norm": 0.10547586530447006, "learning_rate": 8.648036181004867e-06, "loss": 0.0681, "step": 1300 }, { "epoch": 0.2800940774000428, "grad_norm": 0.3222697079181671, "learning_rate": 8.62376749463091e-06, "loss": 0.0689, "step": 1310 }, { "epoch": 0.28223220012828737, "grad_norm": 0.16391637921333313, "learning_rate": 8.59931763018468e-06, "loss": 0.0658, "step": 1320 }, { "epoch": 0.28437032285653197, "grad_norm": 0.11527778953313828, "learning_rate": 8.574687810089887e-06, "loss": 0.0682, "step": 1330 }, { "epoch": 0.28650844558477656, "grad_norm": 0.21346218883991241, "learning_rate": 8.549879265767514e-06, "loss": 0.0695, "step": 1340 }, { "epoch": 0.28864656831302116, "grad_norm": 0.2827841341495514, "learning_rate": 8.524893237574244e-06, "loss": 0.0683, "step": 1350 }, { "epoch": 0.29078469104126575, "grad_norm": 0.21771486103534698, "learning_rate": 8.499730974740452e-06, "loss": 0.0679, "step": 1360 }, { "epoch": 0.29292281376951035, "grad_norm": 0.10718706995248795, "learning_rate": 8.47439373530774e-06, "loss": 0.0696, "step": 1370 }, { "epoch": 0.29506093649775494, "grad_norm": 0.1284645050764084, "learning_rate": 8.44888278606605e-06, "loss": 0.069, "step": 1380 }, { "epoch": 0.2971990592259996, "grad_norm": 0.09697470813989639, "learning_rate": 8.423199402490314e-06, "loss": 0.067, "step": 1390 }, { "epoch": 0.2993371819542442, "grad_norm": 0.16259442269802094, "learning_rate": 8.39734486867669e-06, "loss": 0.0676, "step": 1400 }, { "epoch": 0.3014753046824888, "grad_norm": 0.12434723228216171, "learning_rate": 8.371320477278363e-06, "loss": 0.0682, "step": 1410 }, { "epoch": 0.3036134274107334, "grad_norm": 0.11663969606161118, "learning_rate": 8.345127529440921e-06, "loss": 0.0685, "step": 1420 }, { "epoch": 0.305751550138978, "grad_norm": 0.15766265988349915, "learning_rate": 8.318767334737286e-06, "loss": 0.067, "step": 1430 }, { "epoch": 0.3078896728672226, "grad_norm": 0.1343747079372406, "learning_rate": 8.292241211102246e-06, "loss": 0.0685, "step": 1440 }, { "epoch": 0.31002779559546717, "grad_norm": 0.801880419254303, "learning_rate": 8.265550484766574e-06, "loss": 0.0721, "step": 1450 }, { "epoch": 0.31216591832371177, "grad_norm": 0.40033191442489624, "learning_rate": 8.238696490190701e-06, "loss": 0.0668, "step": 1460 }, { "epoch": 0.31430404105195636, "grad_norm": 0.10282248258590698, "learning_rate": 8.211680569998011e-06, "loss": 0.0699, "step": 1470 }, { "epoch": 0.31644216378020096, "grad_norm": 0.128205344080925, "learning_rate": 8.184504074907706e-06, "loss": 0.0666, "step": 1480 }, { "epoch": 0.3185802865084456, "grad_norm": 0.2041744887828827, "learning_rate": 8.157168363667278e-06, "loss": 0.0652, "step": 1490 }, { "epoch": 0.3207184092366902, "grad_norm": 0.13964834809303284, "learning_rate": 8.129674802984573e-06, "loss": 0.0676, "step": 1500 }, { "epoch": 0.3228565319649348, "grad_norm": 0.2170010209083557, "learning_rate": 8.102024767459457e-06, "loss": 0.0663, "step": 1510 }, { "epoch": 0.3249946546931794, "grad_norm": 0.1943911761045456, "learning_rate": 8.074219639515101e-06, "loss": 0.0692, "step": 1520 }, { "epoch": 0.327132777421424, "grad_norm": 0.16795021295547485, "learning_rate": 8.046260809328848e-06, "loss": 0.0675, "step": 1530 }, { "epoch": 0.3292709001496686, "grad_norm": 0.11824577301740646, "learning_rate": 8.018149674762723e-06, "loss": 0.066, "step": 1540 }, { "epoch": 0.3314090228779132, "grad_norm": 0.11008622497320175, "learning_rate": 7.98988764129353e-06, "loss": 0.0687, "step": 1550 }, { "epoch": 0.3335471456061578, "grad_norm": 0.1290241926908493, "learning_rate": 7.961476121942598e-06, "loss": 0.0655, "step": 1560 }, { "epoch": 0.3356852683344024, "grad_norm": 0.10323217511177063, "learning_rate": 7.932916537205112e-06, "loss": 0.0662, "step": 1570 }, { "epoch": 0.33782339106264697, "grad_norm": 0.14995551109313965, "learning_rate": 7.904210314979122e-06, "loss": 0.0687, "step": 1580 }, { "epoch": 0.3399615137908916, "grad_norm": 0.6756893992424011, "learning_rate": 7.875358890494122e-06, "loss": 0.0674, "step": 1590 }, { "epoch": 0.3420996365191362, "grad_norm": 0.10807085037231445, "learning_rate": 7.846363706239312e-06, "loss": 0.0686, "step": 1600 }, { "epoch": 0.3442377592473808, "grad_norm": 0.1266680657863617, "learning_rate": 7.817226211891468e-06, "loss": 0.0684, "step": 1610 }, { "epoch": 0.3463758819756254, "grad_norm": 0.17101961374282837, "learning_rate": 7.787947864242474e-06, "loss": 0.0658, "step": 1620 }, { "epoch": 0.34851400470387, "grad_norm": 0.10010931640863419, "learning_rate": 7.75853012712647e-06, "loss": 0.0687, "step": 1630 }, { "epoch": 0.3506521274321146, "grad_norm": 0.0984787791967392, "learning_rate": 7.728974471346678e-06, "loss": 0.0682, "step": 1640 }, { "epoch": 0.3527902501603592, "grad_norm": 0.15190352499485016, "learning_rate": 7.699282374601857e-06, "loss": 0.0665, "step": 1650 }, { "epoch": 0.3549283728886038, "grad_norm": 0.10777094215154648, "learning_rate": 7.66945532141243e-06, "loss": 0.0671, "step": 1660 }, { "epoch": 0.3570664956168484, "grad_norm": 0.11840742081403732, "learning_rate": 7.639494803046261e-06, "loss": 0.0642, "step": 1670 }, { "epoch": 0.359204618345093, "grad_norm": 0.12402522563934326, "learning_rate": 7.609402317444086e-06, "loss": 0.0652, "step": 1680 }, { "epoch": 0.36134274107333764, "grad_norm": 0.1083909198641777, "learning_rate": 7.579179369144631e-06, "loss": 0.0654, "step": 1690 }, { "epoch": 0.36348086380158223, "grad_norm": 0.11909038573503494, "learning_rate": 7.5488274692093874e-06, "loss": 0.0657, "step": 1700 }, { "epoch": 0.36561898652982683, "grad_norm": 0.12170397490262985, "learning_rate": 7.518348135147063e-06, "loss": 0.0677, "step": 1710 }, { "epoch": 0.3677571092580714, "grad_norm": 0.0956568717956543, "learning_rate": 7.487742890837704e-06, "loss": 0.0666, "step": 1720 }, { "epoch": 0.369895231986316, "grad_norm": 0.09268027544021606, "learning_rate": 7.457013266456517e-06, "loss": 0.065, "step": 1730 }, { "epoch": 0.3720333547145606, "grad_norm": 0.09884276241064072, "learning_rate": 7.426160798397355e-06, "loss": 0.0655, "step": 1740 }, { "epoch": 0.3741714774428052, "grad_norm": 0.13707584142684937, "learning_rate": 7.395187029195906e-06, "loss": 0.0633, "step": 1750 }, { "epoch": 0.3763096001710498, "grad_norm": 0.18375861644744873, "learning_rate": 7.364093507452572e-06, "loss": 0.0666, "step": 1760 }, { "epoch": 0.3784477228992944, "grad_norm": 0.09120248258113861, "learning_rate": 7.33288178775504e-06, "loss": 0.0663, "step": 1770 }, { "epoch": 0.380585845627539, "grad_norm": 0.1667109578847885, "learning_rate": 7.301553430600559e-06, "loss": 0.0647, "step": 1780 }, { "epoch": 0.3827239683557836, "grad_norm": 0.17563393712043762, "learning_rate": 7.270110002317921e-06, "loss": 0.0646, "step": 1790 }, { "epoch": 0.38486209108402825, "grad_norm": 0.16917386651039124, "learning_rate": 7.238553074989143e-06, "loss": 0.0654, "step": 1800 }, { "epoch": 0.38486209108402825, "eval_loss": 0.0673459991812706, "eval_runtime": 472.3199, "eval_samples_per_second": 4.899, "eval_steps_per_second": 0.307, "step": 1800 }, { "epoch": 0.38700021381227284, "grad_norm": 0.11774443835020065, "learning_rate": 7.206884226370875e-06, "loss": 0.0655, "step": 1810 }, { "epoch": 0.38913833654051744, "grad_norm": 0.119617760181427, "learning_rate": 7.175105039815515e-06, "loss": 0.0639, "step": 1820 }, { "epoch": 0.39127645926876203, "grad_norm": 0.12271067500114441, "learning_rate": 7.143217104192041e-06, "loss": 0.0682, "step": 1830 }, { "epoch": 0.39341458199700663, "grad_norm": 0.19674961268901825, "learning_rate": 7.111222013806573e-06, "loss": 0.0628, "step": 1840 }, { "epoch": 0.3955527047252512, "grad_norm": 0.23792926967144012, "learning_rate": 7.07912136832267e-06, "loss": 0.0654, "step": 1850 }, { "epoch": 0.3976908274534958, "grad_norm": 0.1446692794561386, "learning_rate": 7.0469167726813445e-06, "loss": 0.067, "step": 1860 }, { "epoch": 0.3998289501817404, "grad_norm": 0.10155676305294037, "learning_rate": 7.014609837020817e-06, "loss": 0.0654, "step": 1870 }, { "epoch": 0.401967072909985, "grad_norm": 0.11214682459831238, "learning_rate": 6.9822021765960225e-06, "loss": 0.065, "step": 1880 }, { "epoch": 0.4041051956382296, "grad_norm": 0.09929853677749634, "learning_rate": 6.949695411697848e-06, "loss": 0.0656, "step": 1890 }, { "epoch": 0.40624331836647426, "grad_norm": 0.10509800910949707, "learning_rate": 6.9170911675721175e-06, "loss": 0.0668, "step": 1900 }, { "epoch": 0.40838144109471886, "grad_norm": 0.11406348645687103, "learning_rate": 6.884391074338348e-06, "loss": 0.0651, "step": 1910 }, { "epoch": 0.41051956382296345, "grad_norm": 0.13327282667160034, "learning_rate": 6.851596766908229e-06, "loss": 0.0681, "step": 1920 }, { "epoch": 0.41265768655120805, "grad_norm": 0.26162999868392944, "learning_rate": 6.818709884903897e-06, "loss": 0.0638, "step": 1930 }, { "epoch": 0.41479580927945264, "grad_norm": 0.10129717737436295, "learning_rate": 6.785732072575958e-06, "loss": 0.0648, "step": 1940 }, { "epoch": 0.41693393200769724, "grad_norm": 0.1475946605205536, "learning_rate": 6.752664978721269e-06, "loss": 0.0643, "step": 1950 }, { "epoch": 0.41907205473594183, "grad_norm": 0.14560334384441376, "learning_rate": 6.719510256600512e-06, "loss": 0.0657, "step": 1960 }, { "epoch": 0.42121017746418643, "grad_norm": 0.14020417630672455, "learning_rate": 6.686269563855534e-06, "loss": 0.0658, "step": 1970 }, { "epoch": 0.423348300192431, "grad_norm": 0.1445358246564865, "learning_rate": 6.652944562426469e-06, "loss": 0.0654, "step": 1980 }, { "epoch": 0.4254864229206756, "grad_norm": 0.1675933301448822, "learning_rate": 6.619536918468643e-06, "loss": 0.0621, "step": 1990 }, { "epoch": 0.4276245456489203, "grad_norm": 0.12988397479057312, "learning_rate": 6.586048302269277e-06, "loss": 0.0637, "step": 2000 }, { "epoch": 0.42976266837716487, "grad_norm": 0.10703526437282562, "learning_rate": 6.5524803881639694e-06, "loss": 0.0639, "step": 2010 }, { "epoch": 0.43190079110540947, "grad_norm": 0.10743958503007889, "learning_rate": 6.518834854452993e-06, "loss": 0.0647, "step": 2020 }, { "epoch": 0.43403891383365406, "grad_norm": 0.09658580273389816, "learning_rate": 6.485113383317378e-06, "loss": 0.0616, "step": 2030 }, { "epoch": 0.43617703656189866, "grad_norm": 0.12495086342096329, "learning_rate": 6.451317660734812e-06, "loss": 0.0657, "step": 2040 }, { "epoch": 0.43831515929014325, "grad_norm": 0.14097630977630615, "learning_rate": 6.417449376395339e-06, "loss": 0.0651, "step": 2050 }, { "epoch": 0.44045328201838785, "grad_norm": 0.09859970957040787, "learning_rate": 6.3835102236168885e-06, "loss": 0.0634, "step": 2060 }, { "epoch": 0.44259140474663244, "grad_norm": 0.27701902389526367, "learning_rate": 6.34950189926061e-06, "loss": 0.0653, "step": 2070 }, { "epoch": 0.44472952747487704, "grad_norm": 0.10874440521001816, "learning_rate": 6.315426103646036e-06, "loss": 0.0654, "step": 2080 }, { "epoch": 0.44686765020312164, "grad_norm": 0.14138060808181763, "learning_rate": 6.281284540466067e-06, "loss": 0.0645, "step": 2090 }, { "epoch": 0.4490057729313663, "grad_norm": 0.09226495772600174, "learning_rate": 6.247078916701797e-06, "loss": 0.0635, "step": 2100 }, { "epoch": 0.4511438956596109, "grad_norm": 0.10510533303022385, "learning_rate": 6.212810942537167e-06, "loss": 0.0609, "step": 2110 }, { "epoch": 0.4532820183878555, "grad_norm": 0.13226036727428436, "learning_rate": 6.178482331273462e-06, "loss": 0.0631, "step": 2120 }, { "epoch": 0.4554201411161001, "grad_norm": 0.09952764213085175, "learning_rate": 6.144094799243647e-06, "loss": 0.0664, "step": 2130 }, { "epoch": 0.45755826384434467, "grad_norm": 0.25290119647979736, "learning_rate": 6.1096500657265575e-06, "loss": 0.0638, "step": 2140 }, { "epoch": 0.45969638657258927, "grad_norm": 0.25737249851226807, "learning_rate": 6.075149852860945e-06, "loss": 0.0636, "step": 2150 }, { "epoch": 0.46183450930083386, "grad_norm": 0.10331868380308151, "learning_rate": 6.040595885559366e-06, "loss": 0.0646, "step": 2160 }, { "epoch": 0.46397263202907846, "grad_norm": 0.10511161386966705, "learning_rate": 6.005989891421948e-06, "loss": 0.0662, "step": 2170 }, { "epoch": 0.46611075475732305, "grad_norm": 0.1610974222421646, "learning_rate": 5.971333600650012e-06, "loss": 0.0621, "step": 2180 }, { "epoch": 0.46824887748556765, "grad_norm": 0.13542431592941284, "learning_rate": 5.936628745959568e-06, "loss": 0.0648, "step": 2190 }, { "epoch": 0.47038700021381225, "grad_norm": 0.11100970953702927, "learning_rate": 5.901877062494684e-06, "loss": 0.0616, "step": 2200 }, { "epoch": 0.4725251229420569, "grad_norm": 0.09847405552864075, "learning_rate": 5.867080287740735e-06, "loss": 0.0622, "step": 2210 }, { "epoch": 0.4746632456703015, "grad_norm": 0.11069463193416595, "learning_rate": 5.832240161437528e-06, "loss": 0.0658, "step": 2220 }, { "epoch": 0.4768013683985461, "grad_norm": 0.14424288272857666, "learning_rate": 5.797358425492328e-06, "loss": 0.0627, "step": 2230 }, { "epoch": 0.4789394911267907, "grad_norm": 0.11425557732582092, "learning_rate": 5.762436823892763e-06, "loss": 0.0645, "step": 2240 }, { "epoch": 0.4810776138550353, "grad_norm": 0.12212098389863968, "learning_rate": 5.727477102619628e-06, "loss": 0.0661, "step": 2250 }, { "epoch": 0.4832157365832799, "grad_norm": 0.11695980280637741, "learning_rate": 5.692481009559598e-06, "loss": 0.0633, "step": 2260 }, { "epoch": 0.48535385931152447, "grad_norm": 0.16033753752708435, "learning_rate": 5.657450294417831e-06, "loss": 0.068, "step": 2270 }, { "epoch": 0.48749198203976907, "grad_norm": 0.09469865262508392, "learning_rate": 5.622386708630488e-06, "loss": 0.0657, "step": 2280 }, { "epoch": 0.48963010476801366, "grad_norm": 0.10984878987073898, "learning_rate": 5.587292005277176e-06, "loss": 0.0617, "step": 2290 }, { "epoch": 0.49176822749625826, "grad_norm": 0.10703036934137344, "learning_rate": 5.552167938993286e-06, "loss": 0.0641, "step": 2300 }, { "epoch": 0.4939063502245029, "grad_norm": 0.09129951894283295, "learning_rate": 5.51701626588227e-06, "loss": 0.0648, "step": 2310 }, { "epoch": 0.4960444729527475, "grad_norm": 0.14747264981269836, "learning_rate": 5.481838743427852e-06, "loss": 0.0617, "step": 2320 }, { "epoch": 0.4981825956809921, "grad_norm": 0.11260967701673508, "learning_rate": 5.446637130406141e-06, "loss": 0.0631, "step": 2330 }, { "epoch": 0.5003207184092366, "grad_norm": 0.1024189218878746, "learning_rate": 5.411413186797709e-06, "loss": 0.064, "step": 2340 }, { "epoch": 0.5024588411374813, "grad_norm": 0.16150939464569092, "learning_rate": 5.376168673699596e-06, "loss": 0.0637, "step": 2350 }, { "epoch": 0.504596963865726, "grad_norm": 0.14528174698352814, "learning_rate": 5.340905353237254e-06, "loss": 0.0655, "step": 2360 }, { "epoch": 0.5067350865939705, "grad_norm": 0.12370527535676956, "learning_rate": 5.305624988476452e-06, "loss": 0.0635, "step": 2370 }, { "epoch": 0.5088732093222151, "grad_norm": 0.09441283345222473, "learning_rate": 5.270329343335126e-06, "loss": 0.0651, "step": 2380 }, { "epoch": 0.5110113320504597, "grad_norm": 0.09483297914266586, "learning_rate": 5.235020182495188e-06, "loss": 0.0658, "step": 2390 }, { "epoch": 0.5131494547787043, "grad_norm": 0.11624085903167725, "learning_rate": 5.199699271314289e-06, "loss": 0.0675, "step": 2400 }, { "epoch": 0.5131494547787043, "eval_loss": 0.06467495113611221, "eval_runtime": 471.8273, "eval_samples_per_second": 4.904, "eval_steps_per_second": 0.307, "step": 2400 }, { "epoch": 0.5152875775069489, "grad_norm": 0.1344379037618637, "learning_rate": 5.164368375737576e-06, "loss": 0.0619, "step": 2410 }, { "epoch": 0.5174257002351935, "grad_norm": 0.09949100762605667, "learning_rate": 5.129029262209381e-06, "loss": 0.0617, "step": 2420 }, { "epoch": 0.5195638229634381, "grad_norm": 0.11078672856092453, "learning_rate": 5.093683697584907e-06, "loss": 0.0625, "step": 2430 }, { "epoch": 0.5217019456916827, "grad_norm": 0.15946152806282043, "learning_rate": 5.058333449041899e-06, "loss": 0.0608, "step": 2440 }, { "epoch": 0.5238400684199273, "grad_norm": 0.09759578853845596, "learning_rate": 5.022980283992283e-06, "loss": 0.0604, "step": 2450 }, { "epoch": 0.5259781911481719, "grad_norm": 0.10458780080080032, "learning_rate": 4.9876259699938e-06, "loss": 0.063, "step": 2460 }, { "epoch": 0.5281163138764166, "grad_norm": 0.1019633337855339, "learning_rate": 4.952272274661637e-06, "loss": 0.0608, "step": 2470 }, { "epoch": 0.5302544366046611, "grad_norm": 0.09208445250988007, "learning_rate": 4.916920965580052e-06, "loss": 0.0652, "step": 2480 }, { "epoch": 0.5323925593329057, "grad_norm": 0.11165319383144379, "learning_rate": 4.881573810213989e-06, "loss": 0.0615, "step": 2490 }, { "epoch": 0.5345306820611503, "grad_norm": 0.18430490791797638, "learning_rate": 4.8462325758207304e-06, "loss": 0.0657, "step": 2500 }, { "epoch": 0.5366688047893949, "grad_norm": 0.157784566283226, "learning_rate": 4.810899029361515e-06, "loss": 0.0653, "step": 2510 }, { "epoch": 0.5388069275176395, "grad_norm": 0.13744202256202698, "learning_rate": 4.775574937413211e-06, "loss": 0.0618, "step": 2520 }, { "epoch": 0.5409450502458841, "grad_norm": 0.13207334280014038, "learning_rate": 4.740262066079994e-06, "loss": 0.0644, "step": 2530 }, { "epoch": 0.5430831729741287, "grad_norm": 0.16908520460128784, "learning_rate": 4.70496218090503e-06, "loss": 0.0642, "step": 2540 }, { "epoch": 0.5452212957023733, "grad_norm": 0.13128970563411713, "learning_rate": 4.669677046782221e-06, "loss": 0.0652, "step": 2550 }, { "epoch": 0.547359418430618, "grad_norm": 0.08551183342933655, "learning_rate": 4.6344084278679574e-06, "loss": 0.065, "step": 2560 }, { "epoch": 0.5494975411588625, "grad_norm": 0.1189018115401268, "learning_rate": 4.599158087492913e-06, "loss": 0.0619, "step": 2570 }, { "epoch": 0.5516356638871072, "grad_norm": 0.24343594908714294, "learning_rate": 4.563927788073893e-06, "loss": 0.0625, "step": 2580 }, { "epoch": 0.5537737866153517, "grad_norm": 0.30038872361183167, "learning_rate": 4.528719291025706e-06, "loss": 0.062, "step": 2590 }, { "epoch": 0.5559119093435964, "grad_norm": 0.08746038377285004, "learning_rate": 4.493534356673102e-06, "loss": 0.0638, "step": 2600 }, { "epoch": 0.5580500320718409, "grad_norm": 0.09681444615125656, "learning_rate": 4.458374744162773e-06, "loss": 0.0647, "step": 2610 }, { "epoch": 0.5601881548000855, "grad_norm": 0.11077167838811874, "learning_rate": 4.423242211375381e-06, "loss": 0.0643, "step": 2620 }, { "epoch": 0.5623262775283301, "grad_norm": 0.08863001316785812, "learning_rate": 4.388138514837685e-06, "loss": 0.0627, "step": 2630 }, { "epoch": 0.5644644002565747, "grad_norm": 0.13838346302509308, "learning_rate": 4.35306540963471e-06, "loss": 0.0622, "step": 2640 }, { "epoch": 0.5666025229848193, "grad_norm": 0.09143807739019394, "learning_rate": 4.318024649322001e-06, "loss": 0.0627, "step": 2650 }, { "epoch": 0.5687406457130639, "grad_norm": 0.19630184769630432, "learning_rate": 4.283017985837955e-06, "loss": 0.0626, "step": 2660 }, { "epoch": 0.5708787684413086, "grad_norm": 0.10313283652067184, "learning_rate": 4.248047169416221e-06, "loss": 0.062, "step": 2670 }, { "epoch": 0.5730168911695531, "grad_norm": 0.08923624455928802, "learning_rate": 4.213113948498194e-06, "loss": 0.0626, "step": 2680 }, { "epoch": 0.5751550138977978, "grad_norm": 0.13680312037467957, "learning_rate": 4.178220069645608e-06, "loss": 0.0648, "step": 2690 }, { "epoch": 0.5772931366260423, "grad_norm": 0.164349764585495, "learning_rate": 4.143367277453197e-06, "loss": 0.0622, "step": 2700 }, { "epoch": 0.579431259354287, "grad_norm": 0.09581846743822098, "learning_rate": 4.10855731446149e-06, "loss": 0.0637, "step": 2710 }, { "epoch": 0.5815693820825315, "grad_norm": 0.1474573314189911, "learning_rate": 4.073791921069664e-06, "loss": 0.0611, "step": 2720 }, { "epoch": 0.5837075048107762, "grad_norm": 0.09582812339067459, "learning_rate": 4.039072835448553e-06, "loss": 0.0615, "step": 2730 }, { "epoch": 0.5858456275390207, "grad_norm": 0.101468525826931, "learning_rate": 4.004401793453731e-06, "loss": 0.061, "step": 2740 }, { "epoch": 0.5879837502672653, "grad_norm": 0.10648108273744583, "learning_rate": 3.969780528538726e-06, "loss": 0.0642, "step": 2750 }, { "epoch": 0.5901218729955099, "grad_norm": 0.09285979717969894, "learning_rate": 3.935210771668357e-06, "loss": 0.062, "step": 2760 }, { "epoch": 0.5922599957237545, "grad_norm": 0.09097687900066376, "learning_rate": 3.900694251232182e-06, "loss": 0.0608, "step": 2770 }, { "epoch": 0.5943981184519992, "grad_norm": 0.12016556411981583, "learning_rate": 3.8662326929580925e-06, "loss": 0.0644, "step": 2780 }, { "epoch": 0.5965362411802437, "grad_norm": 0.10592561960220337, "learning_rate": 3.831827819826027e-06, "loss": 0.0619, "step": 2790 }, { "epoch": 0.5986743639084884, "grad_norm": 0.09551785886287689, "learning_rate": 3.7974813519818288e-06, "loss": 0.0629, "step": 2800 }, { "epoch": 0.6008124866367329, "grad_norm": 0.13584552705287933, "learning_rate": 3.7631950066512423e-06, "loss": 0.0652, "step": 2810 }, { "epoch": 0.6029506093649776, "grad_norm": 0.10060502588748932, "learning_rate": 3.7289704980540586e-06, "loss": 0.0602, "step": 2820 }, { "epoch": 0.6050887320932221, "grad_norm": 0.10792689025402069, "learning_rate": 3.694809537318402e-06, "loss": 0.0635, "step": 2830 }, { "epoch": 0.6072268548214668, "grad_norm": 0.1079002246260643, "learning_rate": 3.660713832395193e-06, "loss": 0.0646, "step": 2840 }, { "epoch": 0.6093649775497113, "grad_norm": 0.12554524838924408, "learning_rate": 3.626685087972743e-06, "loss": 0.0607, "step": 2850 }, { "epoch": 0.611503100277956, "grad_norm": 0.09405695647001266, "learning_rate": 3.592725005391524e-06, "loss": 0.065, "step": 2860 }, { "epoch": 0.6136412230062006, "grad_norm": 0.09625021368265152, "learning_rate": 3.55883528255912e-06, "loss": 0.0621, "step": 2870 }, { "epoch": 0.6157793457344451, "grad_norm": 0.1028503030538559, "learning_rate": 3.525017613865321e-06, "loss": 0.0628, "step": 2880 }, { "epoch": 0.6179174684626898, "grad_norm": 0.08910300582647324, "learning_rate": 3.491273690097421e-06, "loss": 0.0599, "step": 2890 }, { "epoch": 0.6200555911909343, "grad_norm": 0.1026788130402565, "learning_rate": 3.45760519835567e-06, "loss": 0.0612, "step": 2900 }, { "epoch": 0.622193713919179, "grad_norm": 0.09384245425462723, "learning_rate": 3.4240138219689343e-06, "loss": 0.0625, "step": 2910 }, { "epoch": 0.6243318366474235, "grad_norm": 0.09324868768453598, "learning_rate": 3.390501240410535e-06, "loss": 0.0611, "step": 2920 }, { "epoch": 0.6264699593756682, "grad_norm": 0.1346089392900467, "learning_rate": 3.3570691292142694e-06, "loss": 0.0644, "step": 2930 }, { "epoch": 0.6286080821039127, "grad_norm": 0.13233442604541779, "learning_rate": 3.3237191598906536e-06, "loss": 0.0634, "step": 2940 }, { "epoch": 0.6307462048321574, "grad_norm": 0.11263269186019897, "learning_rate": 3.2904529998433356e-06, "loss": 0.0658, "step": 2950 }, { "epoch": 0.6328843275604019, "grad_norm": 0.32706591486930847, "learning_rate": 3.2572723122857416e-06, "loss": 0.0656, "step": 2960 }, { "epoch": 0.6350224502886466, "grad_norm": 0.2275686264038086, "learning_rate": 3.224178756157918e-06, "loss": 0.0614, "step": 2970 }, { "epoch": 0.6371605730168912, "grad_norm": 0.09637604653835297, "learning_rate": 3.191173986043583e-06, "loss": 0.0607, "step": 2980 }, { "epoch": 0.6392986957451358, "grad_norm": 0.10239467024803162, "learning_rate": 3.1582596520874096e-06, "loss": 0.0623, "step": 2990 }, { "epoch": 0.6414368184733804, "grad_norm": 0.08615203946828842, "learning_rate": 3.125437399912521e-06, "loss": 0.0613, "step": 3000 }, { "epoch": 0.6414368184733804, "eval_loss": 0.06311963498592377, "eval_runtime": 471.8096, "eval_samples_per_second": 4.905, "eval_steps_per_second": 0.307, "step": 3000 }, { "epoch": 0.643574941201625, "grad_norm": 0.08130071312189102, "learning_rate": 3.0927088705382092e-06, "loss": 0.0637, "step": 3010 }, { "epoch": 0.6457130639298696, "grad_norm": 0.09418642520904541, "learning_rate": 3.060075700297896e-06, "loss": 0.061, "step": 3020 }, { "epoch": 0.6478511866581141, "grad_norm": 0.09334340691566467, "learning_rate": 3.0275395207573178e-06, "loss": 0.0598, "step": 3030 }, { "epoch": 0.6499893093863588, "grad_norm": 0.10348668694496155, "learning_rate": 2.9951019586329467e-06, "loss": 0.0613, "step": 3040 }, { "epoch": 0.6521274321146033, "grad_norm": 0.09085489809513092, "learning_rate": 2.962764635710672e-06, "loss": 0.0619, "step": 3050 }, { "epoch": 0.654265554842848, "grad_norm": 0.09327159821987152, "learning_rate": 2.930529168764702e-06, "loss": 0.0635, "step": 3060 }, { "epoch": 0.6564036775710925, "grad_norm": 0.0930228903889656, "learning_rate": 2.89839716947674e-06, "loss": 0.0643, "step": 3070 }, { "epoch": 0.6585418002993372, "grad_norm": 0.1704426109790802, "learning_rate": 2.8663702443553967e-06, "loss": 0.0633, "step": 3080 }, { "epoch": 0.6606799230275818, "grad_norm": 0.09673333913087845, "learning_rate": 2.8344499946558714e-06, "loss": 0.0606, "step": 3090 }, { "epoch": 0.6628180457558264, "grad_norm": 0.07926999032497406, "learning_rate": 2.8026380162999055e-06, "loss": 0.0614, "step": 3100 }, { "epoch": 0.664956168484071, "grad_norm": 0.09460759162902832, "learning_rate": 2.7709358997959724e-06, "loss": 0.0622, "step": 3110 }, { "epoch": 0.6670942912123156, "grad_norm": 0.09549721330404282, "learning_rate": 2.7393452301597645e-06, "loss": 0.0618, "step": 3120 }, { "epoch": 0.6692324139405602, "grad_norm": 0.09777417033910751, "learning_rate": 2.7078675868349546e-06, "loss": 0.0602, "step": 3130 }, { "epoch": 0.6713705366688048, "grad_norm": 0.08188968896865845, "learning_rate": 2.676504543614214e-06, "loss": 0.0624, "step": 3140 }, { "epoch": 0.6735086593970494, "grad_norm": 0.10578031837940216, "learning_rate": 2.6452576685605385e-06, "loss": 0.0608, "step": 3150 }, { "epoch": 0.6756467821252939, "grad_norm": 0.09571292251348495, "learning_rate": 2.614128523928848e-06, "loss": 0.0613, "step": 3160 }, { "epoch": 0.6777849048535386, "grad_norm": 0.09241370856761932, "learning_rate": 2.583118666087869e-06, "loss": 0.0615, "step": 3170 }, { "epoch": 0.6799230275817832, "grad_norm": 0.09565988928079605, "learning_rate": 2.552229645442337e-06, "loss": 0.0605, "step": 3180 }, { "epoch": 0.6820611503100278, "grad_norm": 0.1339564025402069, "learning_rate": 2.5214630063554597e-06, "loss": 0.0614, "step": 3190 }, { "epoch": 0.6841992730382724, "grad_norm": 0.21678894758224487, "learning_rate": 2.4908202870717267e-06, "loss": 0.0631, "step": 3200 }, { "epoch": 0.686337395766517, "grad_norm": 0.1025332510471344, "learning_rate": 2.4603030196399796e-06, "loss": 0.0612, "step": 3210 }, { "epoch": 0.6884755184947616, "grad_norm": 0.09821213781833649, "learning_rate": 2.4299127298368314e-06, "loss": 0.0606, "step": 3220 }, { "epoch": 0.6906136412230062, "grad_norm": 0.11024197936058044, "learning_rate": 2.399650937090373e-06, "loss": 0.0618, "step": 3230 }, { "epoch": 0.6927517639512508, "grad_norm": 0.09742552042007446, "learning_rate": 2.369519154404205e-06, "loss": 0.0602, "step": 3240 }, { "epoch": 0.6948898866794954, "grad_norm": 0.11659736931324005, "learning_rate": 2.339518888281795e-06, "loss": 0.0599, "step": 3250 }, { "epoch": 0.69702800940774, "grad_norm": 0.12253785133361816, "learning_rate": 2.3096516386511585e-06, "loss": 0.062, "step": 3260 }, { "epoch": 0.6991661321359846, "grad_norm": 0.10601403564214706, "learning_rate": 2.279918898789865e-06, "loss": 0.0603, "step": 3270 }, { "epoch": 0.7013042548642292, "grad_norm": 0.1589890867471695, "learning_rate": 2.2503221552503777e-06, "loss": 0.0617, "step": 3280 }, { "epoch": 0.7034423775924739, "grad_norm": 0.10667730122804642, "learning_rate": 2.2208628877857276e-06, "loss": 0.0595, "step": 3290 }, { "epoch": 0.7055805003207184, "grad_norm": 0.09270080178976059, "learning_rate": 2.1915425692755325e-06, "loss": 0.0613, "step": 3300 }, { "epoch": 0.707718623048963, "grad_norm": 0.08238282054662704, "learning_rate": 2.162362665652364e-06, "loss": 0.0593, "step": 3310 }, { "epoch": 0.7098567457772076, "grad_norm": 0.08994623273611069, "learning_rate": 2.1333246358284394e-06, "loss": 0.0602, "step": 3320 }, { "epoch": 0.7119948685054522, "grad_norm": 0.08377353101968765, "learning_rate": 2.1044299316226962e-06, "loss": 0.0639, "step": 3330 }, { "epoch": 0.7141329912336968, "grad_norm": 0.11232832074165344, "learning_rate": 2.0756799976881987e-06, "loss": 0.0633, "step": 3340 }, { "epoch": 0.7162711139619414, "grad_norm": 0.10576164722442627, "learning_rate": 2.047076271439903e-06, "loss": 0.0621, "step": 3350 }, { "epoch": 0.718409236690186, "grad_norm": 0.10784109681844711, "learning_rate": 2.018620182982803e-06, "loss": 0.0633, "step": 3360 }, { "epoch": 0.7205473594184306, "grad_norm": 0.10578976571559906, "learning_rate": 1.9903131550404185e-06, "loss": 0.0619, "step": 3370 }, { "epoch": 0.7226854821466753, "grad_norm": 0.13373176753520966, "learning_rate": 1.9621566028836717e-06, "loss": 0.0589, "step": 3380 }, { "epoch": 0.7248236048749198, "grad_norm": 0.09690658748149872, "learning_rate": 1.9341519342601166e-06, "loss": 0.0606, "step": 3390 }, { "epoch": 0.7269617276031645, "grad_norm": 0.09626404196023941, "learning_rate": 1.9063005493235692e-06, "loss": 0.0597, "step": 3400 }, { "epoch": 0.729099850331409, "grad_norm": 0.08039379119873047, "learning_rate": 1.8786038405640954e-06, "loss": 0.0619, "step": 3410 }, { "epoch": 0.7312379730596537, "grad_norm": 0.08523211628198624, "learning_rate": 1.8510631927383887e-06, "loss": 0.0601, "step": 3420 }, { "epoch": 0.7333760957878982, "grad_norm": 0.10265989601612091, "learning_rate": 1.8236799828005402e-06, "loss": 0.0602, "step": 3430 }, { "epoch": 0.7355142185161428, "grad_norm": 0.0833195149898529, "learning_rate": 1.796455579833198e-06, "loss": 0.0613, "step": 3440 }, { "epoch": 0.7376523412443874, "grad_norm": 0.08251874148845673, "learning_rate": 1.7693913449791094e-06, "loss": 0.061, "step": 3450 }, { "epoch": 0.739790463972632, "grad_norm": 0.0832567885518074, "learning_rate": 1.7424886313730765e-06, "loss": 0.0607, "step": 3460 }, { "epoch": 0.7419285867008766, "grad_norm": 0.09145036339759827, "learning_rate": 1.7157487840742908e-06, "loss": 0.0625, "step": 3470 }, { "epoch": 0.7440667094291212, "grad_norm": 0.10999724268913269, "learning_rate": 1.6891731399990952e-06, "loss": 0.0618, "step": 3480 }, { "epoch": 0.7462048321573659, "grad_norm": 0.08439410477876663, "learning_rate": 1.6627630278541406e-06, "loss": 0.062, "step": 3490 }, { "epoch": 0.7483429548856104, "grad_norm": 0.09440149366855621, "learning_rate": 1.6365197680699468e-06, "loss": 0.0635, "step": 3500 }, { "epoch": 0.7504810776138551, "grad_norm": 0.07940148562192917, "learning_rate": 1.6104446727348944e-06, "loss": 0.0594, "step": 3510 }, { "epoch": 0.7526192003420996, "grad_norm": 0.174238920211792, "learning_rate": 1.5845390455296195e-06, "loss": 0.0602, "step": 3520 }, { "epoch": 0.7547573230703443, "grad_norm": 0.08501884341239929, "learning_rate": 1.5588041816618288e-06, "loss": 0.0636, "step": 3530 }, { "epoch": 0.7568954457985888, "grad_norm": 0.10274173319339752, "learning_rate": 1.533241367801554e-06, "loss": 0.0596, "step": 3540 }, { "epoch": 0.7590335685268335, "grad_norm": 0.11729396134614944, "learning_rate": 1.5078518820168097e-06, "loss": 0.0587, "step": 3550 }, { "epoch": 0.761171691255078, "grad_norm": 0.11430974304676056, "learning_rate": 1.482636993709703e-06, "loss": 0.0603, "step": 3560 }, { "epoch": 0.7633098139833226, "grad_norm": 0.08141325414180756, "learning_rate": 1.4575979635529653e-06, "loss": 0.061, "step": 3570 }, { "epoch": 0.7654479367115672, "grad_norm": 0.13267385959625244, "learning_rate": 1.4327360434269138e-06, "loss": 0.0621, "step": 3580 }, { "epoch": 0.7675860594398118, "grad_norm": 0.18512435257434845, "learning_rate": 1.4080524763568754e-06, "loss": 0.0599, "step": 3590 }, { "epoch": 0.7697241821680565, "grad_norm": 0.10558240115642548, "learning_rate": 1.383548496451026e-06, "loss": 0.0604, "step": 3600 }, { "epoch": 0.7697241821680565, "eval_loss": 0.06212155520915985, "eval_runtime": 472.0918, "eval_samples_per_second": 4.902, "eval_steps_per_second": 0.307, "step": 3600 }, { "epoch": 0.771862304896301, "grad_norm": 0.08125459402799606, "learning_rate": 1.3592253288386937e-06, "loss": 0.0569, "step": 3610 }, { "epoch": 0.7740004276245457, "grad_norm": 0.10237967222929001, "learning_rate": 1.33508418960911e-06, "loss": 0.0634, "step": 3620 }, { "epoch": 0.7761385503527902, "grad_norm": 0.13348835706710815, "learning_rate": 1.3111262857506018e-06, "loss": 0.0622, "step": 3630 }, { "epoch": 0.7782766730810349, "grad_norm": 0.0952489823102951, "learning_rate": 1.287352815090251e-06, "loss": 0.0624, "step": 3640 }, { "epoch": 0.7804147958092794, "grad_norm": 0.08191373199224472, "learning_rate": 1.263764966234e-06, "loss": 0.0608, "step": 3650 }, { "epoch": 0.7825529185375241, "grad_norm": 0.1063610091805458, "learning_rate": 1.2403639185072298e-06, "loss": 0.0606, "step": 3660 }, { "epoch": 0.7846910412657686, "grad_norm": 0.10587465018033981, "learning_rate": 1.2171508418958005e-06, "loss": 0.061, "step": 3670 }, { "epoch": 0.7868291639940133, "grad_norm": 0.08978404104709625, "learning_rate": 1.194126896987543e-06, "loss": 0.0604, "step": 3680 }, { "epoch": 0.7889672867222579, "grad_norm": 0.08431018143892288, "learning_rate": 1.1712932349142481e-06, "loss": 0.0587, "step": 3690 }, { "epoch": 0.7911054094505025, "grad_norm": 0.09140188992023468, "learning_rate": 1.1486509972941029e-06, "loss": 0.059, "step": 3700 }, { "epoch": 0.7932435321787471, "grad_norm": 0.21178825199604034, "learning_rate": 1.1262013161746144e-06, "loss": 0.0589, "step": 3710 }, { "epoch": 0.7953816549069916, "grad_norm": 0.09437291324138641, "learning_rate": 1.1039453139760154e-06, "loss": 0.059, "step": 3720 }, { "epoch": 0.7975197776352363, "grad_norm": 0.08276817947626114, "learning_rate": 1.081884103435139e-06, "loss": 0.062, "step": 3730 }, { "epoch": 0.7996579003634808, "grad_norm": 0.08296237885951996, "learning_rate": 1.060018787549793e-06, "loss": 0.0595, "step": 3740 }, { "epoch": 0.8017960230917255, "grad_norm": 0.09400928020477295, "learning_rate": 1.03835045952361e-06, "loss": 0.0586, "step": 3750 }, { "epoch": 0.80393414581997, "grad_norm": 0.09929320216178894, "learning_rate": 1.016880202711384e-06, "loss": 0.0603, "step": 3760 }, { "epoch": 0.8060722685482147, "grad_norm": 0.1045432910323143, "learning_rate": 9.956090905649184e-07, "loss": 0.0591, "step": 3770 }, { "epoch": 0.8082103912764592, "grad_norm": 0.08326222002506256, "learning_rate": 9.74538186579345e-07, "loss": 0.0596, "step": 3780 }, { "epoch": 0.8103485140047039, "grad_norm": 0.08769119530916214, "learning_rate": 9.536685442399568e-07, "loss": 0.0594, "step": 3790 }, { "epoch": 0.8124866367329485, "grad_norm": 0.09135784208774567, "learning_rate": 9.330012069695387e-07, "loss": 0.059, "step": 3800 }, { "epoch": 0.8146247594611931, "grad_norm": 0.09643464535474777, "learning_rate": 9.125372080761985e-07, "loss": 0.0584, "step": 3810 }, { "epoch": 0.8167628821894377, "grad_norm": 0.08774056285619736, "learning_rate": 8.922775707016973e-07, "loss": 0.0617, "step": 3820 }, { "epoch": 0.8189010049176823, "grad_norm": 0.1310407817363739, "learning_rate": 8.722233077703096e-07, "loss": 0.0618, "step": 3830 }, { "epoch": 0.8210391276459269, "grad_norm": 0.09406092017889023, "learning_rate": 8.523754219381631e-07, "loss": 0.0581, "step": 3840 }, { "epoch": 0.8231772503741714, "grad_norm": 0.111025370657444, "learning_rate": 8.327349055431233e-07, "loss": 0.061, "step": 3850 }, { "epoch": 0.8253153731024161, "grad_norm": 0.08444110304117203, "learning_rate": 8.13302740555173e-07, "loss": 0.0613, "step": 3860 }, { "epoch": 0.8274534958306606, "grad_norm": 0.16973550617694855, "learning_rate": 7.940798985273124e-07, "loss": 0.0622, "step": 3870 }, { "epoch": 0.8295916185589053, "grad_norm": 0.14076325297355652, "learning_rate": 7.750673405469949e-07, "loss": 0.0622, "step": 3880 }, { "epoch": 0.8317297412871499, "grad_norm": 0.12866050004959106, "learning_rate": 7.562660171880632e-07, "loss": 0.0623, "step": 3890 }, { "epoch": 0.8338678640153945, "grad_norm": 0.08956257998943329, "learning_rate": 7.376768684632357e-07, "loss": 0.0589, "step": 3900 }, { "epoch": 0.8360059867436391, "grad_norm": 0.09152022004127502, "learning_rate": 7.193008237770971e-07, "loss": 0.0615, "step": 3910 }, { "epoch": 0.8381441094718837, "grad_norm": 0.0782172754406929, "learning_rate": 7.011388018796389e-07, "loss": 0.0611, "step": 3920 }, { "epoch": 0.8402822322001283, "grad_norm": 0.0877286046743393, "learning_rate": 6.831917108203217e-07, "loss": 0.0597, "step": 3930 }, { "epoch": 0.8424203549283729, "grad_norm": 0.08773230016231537, "learning_rate": 6.654604479026728e-07, "loss": 0.0601, "step": 3940 }, { "epoch": 0.8445584776566175, "grad_norm": 0.08106860518455505, "learning_rate": 6.479458996394294e-07, "loss": 0.0633, "step": 3950 }, { "epoch": 0.846696600384862, "grad_norm": 0.09297432750463486, "learning_rate": 6.306489417082096e-07, "loss": 0.0621, "step": 3960 }, { "epoch": 0.8488347231131067, "grad_norm": 0.09097818285226822, "learning_rate": 6.135704389077335e-07, "loss": 0.0609, "step": 3970 }, { "epoch": 0.8509728458413512, "grad_norm": 0.10589548945426941, "learning_rate": 5.967112451145868e-07, "loss": 0.0605, "step": 3980 }, { "epoch": 0.8531109685695959, "grad_norm": 0.08168578892946243, "learning_rate": 5.800722032405304e-07, "loss": 0.0591, "step": 3990 }, { "epoch": 0.8552490912978405, "grad_norm": 0.09122731536626816, "learning_rate": 5.636541451903494e-07, "loss": 0.0586, "step": 4000 }, { "epoch": 0.8573872140260851, "grad_norm": 0.10044734925031662, "learning_rate": 5.474578918202717e-07, "loss": 0.0608, "step": 4010 }, { "epoch": 0.8595253367543297, "grad_norm": 0.07848547399044037, "learning_rate": 5.314842528969177e-07, "loss": 0.0609, "step": 4020 }, { "epoch": 0.8616634594825743, "grad_norm": 0.09690708667039871, "learning_rate": 5.157340270568212e-07, "loss": 0.0629, "step": 4030 }, { "epoch": 0.8638015822108189, "grad_norm": 0.09269597381353378, "learning_rate": 5.002080017664973e-07, "loss": 0.0587, "step": 4040 }, { "epoch": 0.8659397049390635, "grad_norm": 0.08430242538452148, "learning_rate": 4.849069532830669e-07, "loss": 0.0616, "step": 4050 }, { "epoch": 0.8680778276673081, "grad_norm": 0.09234491735696793, "learning_rate": 4.698316466154551e-07, "loss": 0.0613, "step": 4060 }, { "epoch": 0.8702159503955527, "grad_norm": 0.11234113574028015, "learning_rate": 4.549828354861341e-07, "loss": 0.06, "step": 4070 }, { "epoch": 0.8723540731237973, "grad_norm": 0.08569945394992828, "learning_rate": 4.4036126229344613e-07, "loss": 0.0592, "step": 4080 }, { "epoch": 0.8744921958520419, "grad_norm": 0.085124172270298, "learning_rate": 4.2596765807448037e-07, "loss": 0.0599, "step": 4090 }, { "epoch": 0.8766303185802865, "grad_norm": 0.08620309084653854, "learning_rate": 4.1180274246852724e-07, "loss": 0.0644, "step": 4100 }, { "epoch": 0.8787684413085312, "grad_norm": 0.08025231957435608, "learning_rate": 3.97867223681096e-07, "loss": 0.0607, "step": 4110 }, { "epoch": 0.8809065640367757, "grad_norm": 0.08026058226823807, "learning_rate": 3.841617984485069e-07, "loss": 0.0585, "step": 4120 }, { "epoch": 0.8830446867650203, "grad_norm": 0.07990364730358124, "learning_rate": 3.706871520030553e-07, "loss": 0.0622, "step": 4130 }, { "epoch": 0.8851828094932649, "grad_norm": 0.11747777462005615, "learning_rate": 3.574439580387562e-07, "loss": 0.0624, "step": 4140 }, { "epoch": 0.8873209322215095, "grad_norm": 0.08300217986106873, "learning_rate": 3.444328786776557e-07, "loss": 0.0617, "step": 4150 }, { "epoch": 0.8894590549497541, "grad_norm": 0.11143102496862411, "learning_rate": 3.3165456443673307e-07, "loss": 0.061, "step": 4160 }, { "epoch": 0.8915971776779987, "grad_norm": 0.0875178873538971, "learning_rate": 3.1910965419537087e-07, "loss": 0.062, "step": 4170 }, { "epoch": 0.8937353004062433, "grad_norm": 0.07958797365427017, "learning_rate": 3.0679877516341386e-07, "loss": 0.0607, "step": 4180 }, { "epoch": 0.8958734231344879, "grad_norm": 0.07875709980726242, "learning_rate": 2.947225428498152e-07, "loss": 0.0602, "step": 4190 }, { "epoch": 0.8980115458627326, "grad_norm": 0.08480419218540192, "learning_rate": 2.828815610318569e-07, "loss": 0.0625, "step": 4200 }, { "epoch": 0.8980115458627326, "eval_loss": 0.061606768518686295, "eval_runtime": 472.6015, "eval_samples_per_second": 4.896, "eval_steps_per_second": 0.307, "step": 4200 }, { "epoch": 0.9001496685909771, "grad_norm": 0.08824723958969116, "learning_rate": 2.7127642172496583e-07, "loss": 0.0595, "step": 4210 }, { "epoch": 0.9022877913192218, "grad_norm": 0.10775342583656311, "learning_rate": 2.59907705153114e-07, "loss": 0.0587, "step": 4220 }, { "epoch": 0.9044259140474663, "grad_norm": 0.08595039695501328, "learning_rate": 2.487759797198075e-07, "loss": 0.0603, "step": 4230 }, { "epoch": 0.906564036775711, "grad_norm": 0.09018037468194962, "learning_rate": 2.3788180197967193e-07, "loss": 0.061, "step": 4240 }, { "epoch": 0.9087021595039555, "grad_norm": 0.10949289798736572, "learning_rate": 2.272257166106201e-07, "loss": 0.059, "step": 4250 }, { "epoch": 0.9108402822322001, "grad_norm": 0.0874534547328949, "learning_rate": 2.1680825638662527e-07, "loss": 0.0607, "step": 4260 }, { "epoch": 0.9129784049604447, "grad_norm": 0.09561596810817719, "learning_rate": 2.06629942151082e-07, "loss": 0.0605, "step": 4270 }, { "epoch": 0.9151165276886893, "grad_norm": 0.09017440676689148, "learning_rate": 1.9669128279076522e-07, "loss": 0.0603, "step": 4280 }, { "epoch": 0.9172546504169339, "grad_norm": 0.08165230602025986, "learning_rate": 1.8699277521038672e-07, "loss": 0.0607, "step": 4290 }, { "epoch": 0.9193927731451785, "grad_norm": 0.09331604838371277, "learning_rate": 1.7753490430775288e-07, "loss": 0.0597, "step": 4300 }, { "epoch": 0.9215308958734232, "grad_norm": 0.08846830576658249, "learning_rate": 1.6831814294951843e-07, "loss": 0.0612, "step": 4310 }, { "epoch": 0.9236690186016677, "grad_norm": 0.1046655997633934, "learning_rate": 1.5934295194754924e-07, "loss": 0.0593, "step": 4320 }, { "epoch": 0.9258071413299124, "grad_norm": 0.0815853402018547, "learning_rate": 1.5060978003587745e-07, "loss": 0.0614, "step": 4330 }, { "epoch": 0.9279452640581569, "grad_norm": 0.08241681009531021, "learning_rate": 1.4211906384827223e-07, "loss": 0.0614, "step": 4340 }, { "epoch": 0.9300833867864016, "grad_norm": 0.07734426110982895, "learning_rate": 1.3387122789640163e-07, "loss": 0.0597, "step": 4350 }, { "epoch": 0.9322215095146461, "grad_norm": 0.09207015484571457, "learning_rate": 1.2586668454861505e-07, "loss": 0.0612, "step": 4360 }, { "epoch": 0.9343596322428908, "grad_norm": 0.09312504529953003, "learning_rate": 1.181058340093233e-07, "loss": 0.0602, "step": 4370 }, { "epoch": 0.9364977549711353, "grad_norm": 0.08721912652254105, "learning_rate": 1.1058906429898764e-07, "loss": 0.0627, "step": 4380 }, { "epoch": 0.93863587769938, "grad_norm": 0.08252057433128357, "learning_rate": 1.033167512347244e-07, "loss": 0.0591, "step": 4390 }, { "epoch": 0.9407740004276245, "grad_norm": 0.07943445444107056, "learning_rate": 9.62892584115116e-08, "loss": 0.0586, "step": 4400 }, { "epoch": 0.9429121231558691, "grad_norm": 0.1239377111196518, "learning_rate": 8.950693718401016e-08, "loss": 0.0596, "step": 4410 }, { "epoch": 0.9450502458841138, "grad_norm": 0.08284857869148254, "learning_rate": 8.297012664900017e-08, "loss": 0.0581, "step": 4420 }, { "epoch": 0.9471883686123583, "grad_norm": 0.11521401256322861, "learning_rate": 7.667915362842337e-08, "loss": 0.0619, "step": 4430 }, { "epoch": 0.949326491340603, "grad_norm": 0.0899544283747673, "learning_rate": 7.063433265304509e-08, "loss": 0.0645, "step": 4440 }, { "epoch": 0.9514646140688475, "grad_norm": 0.07524080574512482, "learning_rate": 6.483596594672959e-08, "loss": 0.0597, "step": 4450 }, { "epoch": 0.9536027367970922, "grad_norm": 0.10981776565313339, "learning_rate": 5.928434341132605e-08, "loss": 0.0582, "step": 4460 }, { "epoch": 0.9557408595253367, "grad_norm": 0.09796682000160217, "learning_rate": 5.397974261217909e-08, "loss": 0.0568, "step": 4470 }, { "epoch": 0.9578789822535814, "grad_norm": 0.0810341015458107, "learning_rate": 4.892242876424702e-08, "loss": 0.0612, "step": 4480 }, { "epoch": 0.9600171049818259, "grad_norm": 0.08066795766353607, "learning_rate": 4.411265471884363e-08, "loss": 0.0629, "step": 4490 }, { "epoch": 0.9621552277100706, "grad_norm": 0.08418738096952438, "learning_rate": 3.955066095099769e-08, "loss": 0.0584, "step": 4500 }, { "epoch": 0.9642933504383152, "grad_norm": 0.07691395282745361, "learning_rate": 3.523667554742704e-08, "loss": 0.06, "step": 4510 }, { "epoch": 0.9664314731665598, "grad_norm": 0.08319604396820068, "learning_rate": 3.117091419513829e-08, "loss": 0.0597, "step": 4520 }, { "epoch": 0.9685695958948044, "grad_norm": 0.09783016890287399, "learning_rate": 2.7353580170638714e-08, "loss": 0.0605, "step": 4530 }, { "epoch": 0.9707077186230489, "grad_norm": 0.10302453488111496, "learning_rate": 2.3784864329777224e-08, "loss": 0.0594, "step": 4540 }, { "epoch": 0.9728458413512936, "grad_norm": 0.0973035991191864, "learning_rate": 2.0464945098200296e-08, "loss": 0.0634, "step": 4550 }, { "epoch": 0.9749839640795381, "grad_norm": 0.07076684385538101, "learning_rate": 1.739398846242968e-08, "loss": 0.0618, "step": 4560 }, { "epoch": 0.9771220868077828, "grad_norm": 0.1335037350654602, "learning_rate": 1.4572147961567917e-08, "loss": 0.062, "step": 4570 }, { "epoch": 0.9792602095360273, "grad_norm": 0.08357635885477066, "learning_rate": 1.1999564679616715e-08, "loss": 0.0618, "step": 4580 }, { "epoch": 0.981398332264272, "grad_norm": 0.08964463323354721, "learning_rate": 9.67636723842591e-09, "loss": 0.0604, "step": 4590 }, { "epoch": 0.9835364549925165, "grad_norm": 0.08757588267326355, "learning_rate": 7.602671791263616e-09, "loss": 0.0619, "step": 4600 }, { "epoch": 0.9856745777207612, "grad_norm": 0.08129626512527466, "learning_rate": 5.778582017005874e-09, "loss": 0.0591, "step": 4610 }, { "epoch": 0.9878127004490058, "grad_norm": 0.08929581940174103, "learning_rate": 4.204189114955793e-09, "loss": 0.0581, "step": 4620 }, { "epoch": 0.9899508231772504, "grad_norm": 0.09756463766098022, "learning_rate": 2.8795718002821993e-09, "loss": 0.0603, "step": 4630 }, { "epoch": 0.992088945905495, "grad_norm": 0.09502363204956055, "learning_rate": 1.80479630008501e-09, "loss": 0.062, "step": 4640 }, { "epoch": 0.9942270686337396, "grad_norm": 0.0853857547044754, "learning_rate": 9.799163500834319e-10, "loss": 0.0616, "step": 4650 }, { "epoch": 0.9963651913619842, "grad_norm": 0.0833888053894043, "learning_rate": 4.049731919303357e-10, "loss": 0.0587, "step": 4660 }, { "epoch": 0.9985033140902287, "grad_norm": 0.089773990213871, "learning_rate": 7.999557114835022e-11, "loss": 0.0608, "step": 4670 }, { "epoch": 1.0, "step": 4677, "total_flos": 9.549834298038052e+19, "train_loss": 0.10822304976860733, "train_runtime": 69525.0456, "train_samples_per_second": 1.076, "train_steps_per_second": 0.067 } ], "logging_steps": 10, "max_steps": 4677, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.549834298038052e+19, "train_batch_size": 2, "trial_name": null, "trial_params": null }