{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 5265, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005698005698005698, "grad_norm": 1.248605852290593, "learning_rate": 5e-06, "loss": 0.7464, "step": 10 }, { "epoch": 0.011396011396011397, "grad_norm": 1.0452895054741067, "learning_rate": 5e-06, "loss": 0.7185, "step": 20 }, { "epoch": 0.017094017094017096, "grad_norm": 0.8153324886981814, "learning_rate": 5e-06, "loss": 0.6852, "step": 30 }, { "epoch": 0.022792022792022793, "grad_norm": 1.0831283785058512, "learning_rate": 5e-06, "loss": 0.6966, "step": 40 }, { "epoch": 0.02849002849002849, "grad_norm": 0.8504265623978658, "learning_rate": 5e-06, "loss": 0.6617, "step": 50 }, { "epoch": 0.03418803418803419, "grad_norm": 0.9237045733670409, "learning_rate": 5e-06, "loss": 0.6694, "step": 60 }, { "epoch": 0.039886039886039885, "grad_norm": 0.7642515560623869, "learning_rate": 5e-06, "loss": 0.6675, "step": 70 }, { "epoch": 0.045584045584045586, "grad_norm": 0.627995094432306, "learning_rate": 5e-06, "loss": 0.6698, "step": 80 }, { "epoch": 0.05128205128205128, "grad_norm": 0.5944277816882634, "learning_rate": 5e-06, "loss": 0.6493, "step": 90 }, { "epoch": 0.05698005698005698, "grad_norm": 0.545138744190239, "learning_rate": 5e-06, "loss": 0.6493, "step": 100 }, { "epoch": 0.06267806267806268, "grad_norm": 0.5758914966539062, "learning_rate": 5e-06, "loss": 0.6593, "step": 110 }, { "epoch": 0.06837606837606838, "grad_norm": 0.6137211171560547, "learning_rate": 5e-06, "loss": 0.6464, "step": 120 }, { "epoch": 0.07407407407407407, "grad_norm": 0.6226547108170967, "learning_rate": 5e-06, "loss": 0.6618, "step": 130 }, { "epoch": 0.07977207977207977, "grad_norm": 0.603249856221309, "learning_rate": 5e-06, "loss": 0.6589, "step": 140 }, { "epoch": 0.08547008547008547, "grad_norm": 0.5036390357219359, "learning_rate": 5e-06, "loss": 0.6529, "step": 150 }, { "epoch": 0.09116809116809117, "grad_norm": 0.5977465496475399, "learning_rate": 5e-06, "loss": 0.655, "step": 160 }, { "epoch": 0.09686609686609686, "grad_norm": 0.5961195181863996, "learning_rate": 5e-06, "loss": 0.6712, "step": 170 }, { "epoch": 0.10256410256410256, "grad_norm": 0.6066789044063328, "learning_rate": 5e-06, "loss": 0.653, "step": 180 }, { "epoch": 0.10826210826210826, "grad_norm": 0.5768094248224014, "learning_rate": 5e-06, "loss": 0.6571, "step": 190 }, { "epoch": 0.11396011396011396, "grad_norm": 0.5562364789120665, "learning_rate": 5e-06, "loss": 0.655, "step": 200 }, { "epoch": 0.11965811965811966, "grad_norm": 0.5491574417916478, "learning_rate": 5e-06, "loss": 0.6499, "step": 210 }, { "epoch": 0.12535612535612536, "grad_norm": 0.5975656942284756, "learning_rate": 5e-06, "loss": 0.6521, "step": 220 }, { "epoch": 0.13105413105413105, "grad_norm": 0.5783672727047425, "learning_rate": 5e-06, "loss": 0.6339, "step": 230 }, { "epoch": 0.13675213675213677, "grad_norm": 0.5949371595243242, "learning_rate": 5e-06, "loss": 0.6359, "step": 240 }, { "epoch": 0.14245014245014245, "grad_norm": 0.6034430748838411, "learning_rate": 5e-06, "loss": 0.632, "step": 250 }, { "epoch": 0.14814814814814814, "grad_norm": 0.5821788725046587, "learning_rate": 5e-06, "loss": 0.6308, "step": 260 }, { "epoch": 0.15384615384615385, "grad_norm": 0.5678471771976431, "learning_rate": 5e-06, "loss": 0.6503, "step": 270 }, { "epoch": 0.15954415954415954, "grad_norm": 0.5602675467648068, "learning_rate": 5e-06, "loss": 0.6599, "step": 280 }, { "epoch": 0.16524216524216523, "grad_norm": 0.5941388781564068, "learning_rate": 5e-06, "loss": 0.6355, "step": 290 }, { "epoch": 0.17094017094017094, "grad_norm": 0.5858722107720983, "learning_rate": 5e-06, "loss": 0.6508, "step": 300 }, { "epoch": 0.17663817663817663, "grad_norm": 0.6035463226302998, "learning_rate": 5e-06, "loss": 0.6553, "step": 310 }, { "epoch": 0.18233618233618235, "grad_norm": 0.6342249591951397, "learning_rate": 5e-06, "loss": 0.6534, "step": 320 }, { "epoch": 0.18803418803418803, "grad_norm": 0.5798733093393231, "learning_rate": 5e-06, "loss": 0.6238, "step": 330 }, { "epoch": 0.19373219373219372, "grad_norm": 0.6315524208231319, "learning_rate": 5e-06, "loss": 0.6287, "step": 340 }, { "epoch": 0.19943019943019943, "grad_norm": 0.6118349674877842, "learning_rate": 5e-06, "loss": 0.6439, "step": 350 }, { "epoch": 0.20512820512820512, "grad_norm": 0.5613968317306219, "learning_rate": 5e-06, "loss": 0.6422, "step": 360 }, { "epoch": 0.21082621082621084, "grad_norm": 0.650029101740181, "learning_rate": 5e-06, "loss": 0.6352, "step": 370 }, { "epoch": 0.21652421652421652, "grad_norm": 0.6034494108497298, "learning_rate": 5e-06, "loss": 0.6419, "step": 380 }, { "epoch": 0.2222222222222222, "grad_norm": 0.5686444051570657, "learning_rate": 5e-06, "loss": 0.6403, "step": 390 }, { "epoch": 0.22792022792022792, "grad_norm": 0.5897904881774643, "learning_rate": 5e-06, "loss": 0.6277, "step": 400 }, { "epoch": 0.2336182336182336, "grad_norm": 0.5969120088165002, "learning_rate": 5e-06, "loss": 0.6343, "step": 410 }, { "epoch": 0.23931623931623933, "grad_norm": 0.6066999477029202, "learning_rate": 5e-06, "loss": 0.6223, "step": 420 }, { "epoch": 0.245014245014245, "grad_norm": 0.6019250090914218, "learning_rate": 5e-06, "loss": 0.6227, "step": 430 }, { "epoch": 0.25071225071225073, "grad_norm": 0.635752248265818, "learning_rate": 5e-06, "loss": 0.6301, "step": 440 }, { "epoch": 0.2564102564102564, "grad_norm": 0.5334914544076793, "learning_rate": 5e-06, "loss": 0.6277, "step": 450 }, { "epoch": 0.2621082621082621, "grad_norm": 0.5614109457622548, "learning_rate": 5e-06, "loss": 0.6367, "step": 460 }, { "epoch": 0.2678062678062678, "grad_norm": 0.5716858172898615, "learning_rate": 5e-06, "loss": 0.6433, "step": 470 }, { "epoch": 0.27350427350427353, "grad_norm": 0.569385420792123, "learning_rate": 5e-06, "loss": 0.6358, "step": 480 }, { "epoch": 0.2792022792022792, "grad_norm": 0.5396490903906098, "learning_rate": 5e-06, "loss": 0.6483, "step": 490 }, { "epoch": 0.2849002849002849, "grad_norm": 0.5712372988205724, "learning_rate": 5e-06, "loss": 0.6247, "step": 500 }, { "epoch": 0.2905982905982906, "grad_norm": 0.5616938335269104, "learning_rate": 5e-06, "loss": 0.6397, "step": 510 }, { "epoch": 0.2962962962962963, "grad_norm": 0.5586395315743943, "learning_rate": 5e-06, "loss": 0.637, "step": 520 }, { "epoch": 0.301994301994302, "grad_norm": 0.593890274267505, "learning_rate": 5e-06, "loss": 0.6425, "step": 530 }, { "epoch": 0.3076923076923077, "grad_norm": 0.5691720448900264, "learning_rate": 5e-06, "loss": 0.6395, "step": 540 }, { "epoch": 0.31339031339031337, "grad_norm": 0.5825957418827326, "learning_rate": 5e-06, "loss": 0.6404, "step": 550 }, { "epoch": 0.3190883190883191, "grad_norm": 0.5712581943035149, "learning_rate": 5e-06, "loss": 0.6363, "step": 560 }, { "epoch": 0.3247863247863248, "grad_norm": 0.5611359179186056, "learning_rate": 5e-06, "loss": 0.6247, "step": 570 }, { "epoch": 0.33048433048433046, "grad_norm": 0.5263861318528564, "learning_rate": 5e-06, "loss": 0.6397, "step": 580 }, { "epoch": 0.33618233618233617, "grad_norm": 0.6008086879223974, "learning_rate": 5e-06, "loss": 0.6366, "step": 590 }, { "epoch": 0.3418803418803419, "grad_norm": 0.5359029223247334, "learning_rate": 5e-06, "loss": 0.6279, "step": 600 }, { "epoch": 0.3475783475783476, "grad_norm": 0.5476117829162411, "learning_rate": 5e-06, "loss": 0.6472, "step": 610 }, { "epoch": 0.35327635327635326, "grad_norm": 0.548345413171009, "learning_rate": 5e-06, "loss": 0.6236, "step": 620 }, { "epoch": 0.358974358974359, "grad_norm": 0.549515203524155, "learning_rate": 5e-06, "loss": 0.6399, "step": 630 }, { "epoch": 0.3646723646723647, "grad_norm": 0.6247502479073669, "learning_rate": 5e-06, "loss": 0.638, "step": 640 }, { "epoch": 0.37037037037037035, "grad_norm": 0.6367708386541642, "learning_rate": 5e-06, "loss": 0.6303, "step": 650 }, { "epoch": 0.37606837606837606, "grad_norm": 0.6035098886365409, "learning_rate": 5e-06, "loss": 0.618, "step": 660 }, { "epoch": 0.3817663817663818, "grad_norm": 0.5843073759359196, "learning_rate": 5e-06, "loss": 0.6433, "step": 670 }, { "epoch": 0.38746438746438744, "grad_norm": 0.6208173453592988, "learning_rate": 5e-06, "loss": 0.6265, "step": 680 }, { "epoch": 0.39316239316239315, "grad_norm": 0.6005049567591871, "learning_rate": 5e-06, "loss": 0.6303, "step": 690 }, { "epoch": 0.39886039886039887, "grad_norm": 0.569831828840819, "learning_rate": 5e-06, "loss": 0.6456, "step": 700 }, { "epoch": 0.4045584045584046, "grad_norm": 0.582087320229329, "learning_rate": 5e-06, "loss": 0.6388, "step": 710 }, { "epoch": 0.41025641025641024, "grad_norm": 0.5537164119876729, "learning_rate": 5e-06, "loss": 0.6265, "step": 720 }, { "epoch": 0.41595441595441596, "grad_norm": 0.5993733531397847, "learning_rate": 5e-06, "loss": 0.6431, "step": 730 }, { "epoch": 0.42165242165242167, "grad_norm": 0.5878911525467738, "learning_rate": 5e-06, "loss": 0.6231, "step": 740 }, { "epoch": 0.42735042735042733, "grad_norm": 0.5703047756344629, "learning_rate": 5e-06, "loss": 0.6221, "step": 750 }, { "epoch": 0.43304843304843305, "grad_norm": 0.577768195476661, "learning_rate": 5e-06, "loss": 0.6208, "step": 760 }, { "epoch": 0.43874643874643876, "grad_norm": 0.5854764641336874, "learning_rate": 5e-06, "loss": 0.6286, "step": 770 }, { "epoch": 0.4444444444444444, "grad_norm": 0.5387219205137932, "learning_rate": 5e-06, "loss": 0.6295, "step": 780 }, { "epoch": 0.45014245014245013, "grad_norm": 0.5918701715838999, "learning_rate": 5e-06, "loss": 0.6293, "step": 790 }, { "epoch": 0.45584045584045585, "grad_norm": 0.610827318398544, "learning_rate": 5e-06, "loss": 0.6548, "step": 800 }, { "epoch": 0.46153846153846156, "grad_norm": 0.5581493937641772, "learning_rate": 5e-06, "loss": 0.6357, "step": 810 }, { "epoch": 0.4672364672364672, "grad_norm": 0.5526185327228441, "learning_rate": 5e-06, "loss": 0.6255, "step": 820 }, { "epoch": 0.47293447293447294, "grad_norm": 0.5604472711614089, "learning_rate": 5e-06, "loss": 0.6318, "step": 830 }, { "epoch": 0.47863247863247865, "grad_norm": 0.5876335201995966, "learning_rate": 5e-06, "loss": 0.6314, "step": 840 }, { "epoch": 0.4843304843304843, "grad_norm": 0.5674159078017988, "learning_rate": 5e-06, "loss": 0.6438, "step": 850 }, { "epoch": 0.49002849002849, "grad_norm": 0.5707009756648699, "learning_rate": 5e-06, "loss": 0.6186, "step": 860 }, { "epoch": 0.49572649572649574, "grad_norm": 0.6126275501973454, "learning_rate": 5e-06, "loss": 0.6298, "step": 870 }, { "epoch": 0.5014245014245015, "grad_norm": 0.5973891881380156, "learning_rate": 5e-06, "loss": 0.6482, "step": 880 }, { "epoch": 0.5071225071225072, "grad_norm": 0.5440973024632595, "learning_rate": 5e-06, "loss": 0.6287, "step": 890 }, { "epoch": 0.5128205128205128, "grad_norm": 0.5719103793581991, "learning_rate": 5e-06, "loss": 0.63, "step": 900 }, { "epoch": 0.5185185185185185, "grad_norm": 0.5927304335774137, "learning_rate": 5e-06, "loss": 0.6364, "step": 910 }, { "epoch": 0.5242165242165242, "grad_norm": 0.5468702165759647, "learning_rate": 5e-06, "loss": 0.6297, "step": 920 }, { "epoch": 0.5299145299145299, "grad_norm": 0.5689062473394013, "learning_rate": 5e-06, "loss": 0.6256, "step": 930 }, { "epoch": 0.5356125356125356, "grad_norm": 0.580087974342758, "learning_rate": 5e-06, "loss": 0.6179, "step": 940 }, { "epoch": 0.5413105413105413, "grad_norm": 0.6278973426700435, "learning_rate": 5e-06, "loss": 0.6349, "step": 950 }, { "epoch": 0.5470085470085471, "grad_norm": 0.5749182904288472, "learning_rate": 5e-06, "loss": 0.6312, "step": 960 }, { "epoch": 0.5527065527065527, "grad_norm": 0.5755058045692314, "learning_rate": 5e-06, "loss": 0.6464, "step": 970 }, { "epoch": 0.5584045584045584, "grad_norm": 0.564209988292775, "learning_rate": 5e-06, "loss": 0.6204, "step": 980 }, { "epoch": 0.5641025641025641, "grad_norm": 0.6064650017065378, "learning_rate": 5e-06, "loss": 0.6302, "step": 990 }, { "epoch": 0.5698005698005698, "grad_norm": 0.5981562518766129, "learning_rate": 5e-06, "loss": 0.6278, "step": 1000 }, { "epoch": 0.5754985754985755, "grad_norm": 0.5985428419859516, "learning_rate": 5e-06, "loss": 0.6278, "step": 1010 }, { "epoch": 0.5811965811965812, "grad_norm": 0.613528620026823, "learning_rate": 5e-06, "loss": 0.6358, "step": 1020 }, { "epoch": 0.5868945868945868, "grad_norm": 0.5785257508799594, "learning_rate": 5e-06, "loss": 0.6367, "step": 1030 }, { "epoch": 0.5925925925925926, "grad_norm": 0.6325574889479847, "learning_rate": 5e-06, "loss": 0.6214, "step": 1040 }, { "epoch": 0.5982905982905983, "grad_norm": 0.5798171618499341, "learning_rate": 5e-06, "loss": 0.6318, "step": 1050 }, { "epoch": 0.603988603988604, "grad_norm": 0.5917058378245685, "learning_rate": 5e-06, "loss": 0.6239, "step": 1060 }, { "epoch": 0.6096866096866097, "grad_norm": 0.6462363857504108, "learning_rate": 5e-06, "loss": 0.6253, "step": 1070 }, { "epoch": 0.6153846153846154, "grad_norm": 0.5964629820058396, "learning_rate": 5e-06, "loss": 0.6301, "step": 1080 }, { "epoch": 0.6210826210826211, "grad_norm": 0.5697422137314763, "learning_rate": 5e-06, "loss": 0.6318, "step": 1090 }, { "epoch": 0.6267806267806267, "grad_norm": 0.6246288555076868, "learning_rate": 5e-06, "loss": 0.6498, "step": 1100 }, { "epoch": 0.6324786324786325, "grad_norm": 0.5736841164436457, "learning_rate": 5e-06, "loss": 0.6256, "step": 1110 }, { "epoch": 0.6381766381766382, "grad_norm": 0.6073053058703407, "learning_rate": 5e-06, "loss": 0.6435, "step": 1120 }, { "epoch": 0.6438746438746439, "grad_norm": 0.5682386523303715, "learning_rate": 5e-06, "loss": 0.6118, "step": 1130 }, { "epoch": 0.6495726495726496, "grad_norm": 0.5789250463385112, "learning_rate": 5e-06, "loss": 0.6438, "step": 1140 }, { "epoch": 0.6552706552706553, "grad_norm": 0.5677226033216014, "learning_rate": 5e-06, "loss": 0.6171, "step": 1150 }, { "epoch": 0.6609686609686609, "grad_norm": 0.570510089133293, "learning_rate": 5e-06, "loss": 0.6333, "step": 1160 }, { "epoch": 0.6666666666666666, "grad_norm": 0.5534962109817962, "learning_rate": 5e-06, "loss": 0.6311, "step": 1170 }, { "epoch": 0.6723646723646723, "grad_norm": 0.6028706947112492, "learning_rate": 5e-06, "loss": 0.6335, "step": 1180 }, { "epoch": 0.6780626780626781, "grad_norm": 0.5435510233432874, "learning_rate": 5e-06, "loss": 0.6266, "step": 1190 }, { "epoch": 0.6837606837606838, "grad_norm": 0.5781607598620716, "learning_rate": 5e-06, "loss": 0.6081, "step": 1200 }, { "epoch": 0.6894586894586895, "grad_norm": 0.5387883641010746, "learning_rate": 5e-06, "loss": 0.6312, "step": 1210 }, { "epoch": 0.6951566951566952, "grad_norm": 0.6108820344681773, "learning_rate": 5e-06, "loss": 0.6228, "step": 1220 }, { "epoch": 0.7008547008547008, "grad_norm": 0.5547515213703605, "learning_rate": 5e-06, "loss": 0.6184, "step": 1230 }, { "epoch": 0.7065527065527065, "grad_norm": 0.5703753317669427, "learning_rate": 5e-06, "loss": 0.6283, "step": 1240 }, { "epoch": 0.7122507122507122, "grad_norm": 0.5718536532526751, "learning_rate": 5e-06, "loss": 0.6331, "step": 1250 }, { "epoch": 0.717948717948718, "grad_norm": 0.5559094696648192, "learning_rate": 5e-06, "loss": 0.624, "step": 1260 }, { "epoch": 0.7236467236467237, "grad_norm": 0.55959365727665, "learning_rate": 5e-06, "loss": 0.6253, "step": 1270 }, { "epoch": 0.7293447293447294, "grad_norm": 0.6031181458566351, "learning_rate": 5e-06, "loss": 0.6192, "step": 1280 }, { "epoch": 0.7350427350427351, "grad_norm": 0.5994092568067709, "learning_rate": 5e-06, "loss": 0.6192, "step": 1290 }, { "epoch": 0.7407407407407407, "grad_norm": 0.6365099348102852, "learning_rate": 5e-06, "loss": 0.6355, "step": 1300 }, { "epoch": 0.7464387464387464, "grad_norm": 0.6054035693063426, "learning_rate": 5e-06, "loss": 0.6187, "step": 1310 }, { "epoch": 0.7521367521367521, "grad_norm": 0.5812878969763033, "learning_rate": 5e-06, "loss": 0.6134, "step": 1320 }, { "epoch": 0.7578347578347578, "grad_norm": 0.6221634431669929, "learning_rate": 5e-06, "loss": 0.6317, "step": 1330 }, { "epoch": 0.7635327635327636, "grad_norm": 0.5837064991966057, "learning_rate": 5e-06, "loss": 0.6186, "step": 1340 }, { "epoch": 0.7692307692307693, "grad_norm": 0.5519308410148134, "learning_rate": 5e-06, "loss": 0.6282, "step": 1350 }, { "epoch": 0.7749287749287749, "grad_norm": 0.5884579917268693, "learning_rate": 5e-06, "loss": 0.6301, "step": 1360 }, { "epoch": 0.7806267806267806, "grad_norm": 0.5863408060758529, "learning_rate": 5e-06, "loss": 0.6332, "step": 1370 }, { "epoch": 0.7863247863247863, "grad_norm": 0.606462849435967, "learning_rate": 5e-06, "loss": 0.6444, "step": 1380 }, { "epoch": 0.792022792022792, "grad_norm": 0.609745642222076, "learning_rate": 5e-06, "loss": 0.6188, "step": 1390 }, { "epoch": 0.7977207977207977, "grad_norm": 0.6278637624500826, "learning_rate": 5e-06, "loss": 0.6438, "step": 1400 }, { "epoch": 0.8034188034188035, "grad_norm": 0.5964004351905415, "learning_rate": 5e-06, "loss": 0.6227, "step": 1410 }, { "epoch": 0.8091168091168092, "grad_norm": 0.5695342658619863, "learning_rate": 5e-06, "loss": 0.6357, "step": 1420 }, { "epoch": 0.8148148148148148, "grad_norm": 0.5859213800511389, "learning_rate": 5e-06, "loss": 0.6193, "step": 1430 }, { "epoch": 0.8205128205128205, "grad_norm": 0.5752147052829165, "learning_rate": 5e-06, "loss": 0.627, "step": 1440 }, { "epoch": 0.8262108262108262, "grad_norm": 0.6177624749983104, "learning_rate": 5e-06, "loss": 0.624, "step": 1450 }, { "epoch": 0.8319088319088319, "grad_norm": 0.608719985454889, "learning_rate": 5e-06, "loss": 0.6191, "step": 1460 }, { "epoch": 0.8376068376068376, "grad_norm": 0.5667215459680056, "learning_rate": 5e-06, "loss": 0.6157, "step": 1470 }, { "epoch": 0.8433048433048433, "grad_norm": 0.5672924637566275, "learning_rate": 5e-06, "loss": 0.623, "step": 1480 }, { "epoch": 0.8490028490028491, "grad_norm": 0.6493622667391157, "learning_rate": 5e-06, "loss": 0.6359, "step": 1490 }, { "epoch": 0.8547008547008547, "grad_norm": 0.5623923532248208, "learning_rate": 5e-06, "loss": 0.6289, "step": 1500 }, { "epoch": 0.8603988603988604, "grad_norm": 0.5978019810160363, "learning_rate": 5e-06, "loss": 0.6286, "step": 1510 }, { "epoch": 0.8660968660968661, "grad_norm": 0.5455769144299073, "learning_rate": 5e-06, "loss": 0.6355, "step": 1520 }, { "epoch": 0.8717948717948718, "grad_norm": 0.5694355383197235, "learning_rate": 5e-06, "loss": 0.646, "step": 1530 }, { "epoch": 0.8774928774928775, "grad_norm": 0.5755078976127412, "learning_rate": 5e-06, "loss": 0.6119, "step": 1540 }, { "epoch": 0.8831908831908832, "grad_norm": 0.5678832987577136, "learning_rate": 5e-06, "loss": 0.6253, "step": 1550 }, { "epoch": 0.8888888888888888, "grad_norm": 0.6059681652536819, "learning_rate": 5e-06, "loss": 0.6423, "step": 1560 }, { "epoch": 0.8945868945868946, "grad_norm": 0.5683013444396426, "learning_rate": 5e-06, "loss": 0.6423, "step": 1570 }, { "epoch": 0.9002849002849003, "grad_norm": 0.5683186943846946, "learning_rate": 5e-06, "loss": 0.6219, "step": 1580 }, { "epoch": 0.905982905982906, "grad_norm": 0.5863192427228783, "learning_rate": 5e-06, "loss": 0.642, "step": 1590 }, { "epoch": 0.9116809116809117, "grad_norm": 0.5595388030839678, "learning_rate": 5e-06, "loss": 0.6327, "step": 1600 }, { "epoch": 0.9173789173789174, "grad_norm": 0.5676065816242905, "learning_rate": 5e-06, "loss": 0.6362, "step": 1610 }, { "epoch": 0.9230769230769231, "grad_norm": 0.5762087240408192, "learning_rate": 5e-06, "loss": 0.6226, "step": 1620 }, { "epoch": 0.9287749287749287, "grad_norm": 0.6707590903760927, "learning_rate": 5e-06, "loss": 0.6305, "step": 1630 }, { "epoch": 0.9344729344729344, "grad_norm": 0.6103119559173308, "learning_rate": 5e-06, "loss": 0.6319, "step": 1640 }, { "epoch": 0.9401709401709402, "grad_norm": 0.5633994724981068, "learning_rate": 5e-06, "loss": 0.6068, "step": 1650 }, { "epoch": 0.9458689458689459, "grad_norm": 0.6187682445922644, "learning_rate": 5e-06, "loss": 0.6131, "step": 1660 }, { "epoch": 0.9515669515669516, "grad_norm": 0.5525323634268904, "learning_rate": 5e-06, "loss": 0.632, "step": 1670 }, { "epoch": 0.9572649572649573, "grad_norm": 0.5564758529357481, "learning_rate": 5e-06, "loss": 0.6159, "step": 1680 }, { "epoch": 0.9629629629629629, "grad_norm": 0.6349335167662418, "learning_rate": 5e-06, "loss": 0.6365, "step": 1690 }, { "epoch": 0.9686609686609686, "grad_norm": 0.5650086755333475, "learning_rate": 5e-06, "loss": 0.6281, "step": 1700 }, { "epoch": 0.9743589743589743, "grad_norm": 0.5521116095578306, "learning_rate": 5e-06, "loss": 0.6344, "step": 1710 }, { "epoch": 0.98005698005698, "grad_norm": 0.6644841541531104, "learning_rate": 5e-06, "loss": 0.6234, "step": 1720 }, { "epoch": 0.9857549857549858, "grad_norm": 0.5223237563049048, "learning_rate": 5e-06, "loss": 0.6194, "step": 1730 }, { "epoch": 0.9914529914529915, "grad_norm": 0.583506791545265, "learning_rate": 5e-06, "loss": 0.6042, "step": 1740 }, { "epoch": 0.9971509971509972, "grad_norm": 0.5524358612737751, "learning_rate": 5e-06, "loss": 0.6165, "step": 1750 }, { "epoch": 1.0, "eval_loss": 0.6207965016365051, "eval_runtime": 445.7759, "eval_samples_per_second": 26.522, "eval_steps_per_second": 0.415, "step": 1755 }, { "epoch": 1.002849002849003, "grad_norm": 0.6193051669442373, "learning_rate": 5e-06, "loss": 0.5925, "step": 1760 }, { "epoch": 1.0085470085470085, "grad_norm": 0.5538188542900693, "learning_rate": 5e-06, "loss": 0.5831, "step": 1770 }, { "epoch": 1.0142450142450143, "grad_norm": 0.5437973437283771, "learning_rate": 5e-06, "loss": 0.5574, "step": 1780 }, { "epoch": 1.01994301994302, "grad_norm": 0.5402398297961842, "learning_rate": 5e-06, "loss": 0.5807, "step": 1790 }, { "epoch": 1.0256410256410255, "grad_norm": 0.5457114785479574, "learning_rate": 5e-06, "loss": 0.5786, "step": 1800 }, { "epoch": 1.0313390313390314, "grad_norm": 0.5835431121788174, "learning_rate": 5e-06, "loss": 0.5873, "step": 1810 }, { "epoch": 1.037037037037037, "grad_norm": 0.549315178206307, "learning_rate": 5e-06, "loss": 0.5637, "step": 1820 }, { "epoch": 1.0427350427350428, "grad_norm": 0.551965760141286, "learning_rate": 5e-06, "loss": 0.574, "step": 1830 }, { "epoch": 1.0484330484330484, "grad_norm": 0.5558874848824243, "learning_rate": 5e-06, "loss": 0.5641, "step": 1840 }, { "epoch": 1.0541310541310542, "grad_norm": 0.5817186570163869, "learning_rate": 5e-06, "loss": 0.5778, "step": 1850 }, { "epoch": 1.0598290598290598, "grad_norm": 0.4980497695823232, "learning_rate": 5e-06, "loss": 0.5705, "step": 1860 }, { "epoch": 1.0655270655270654, "grad_norm": 0.6034343866560189, "learning_rate": 5e-06, "loss": 0.5705, "step": 1870 }, { "epoch": 1.0712250712250713, "grad_norm": 0.5404707262066166, "learning_rate": 5e-06, "loss": 0.5642, "step": 1880 }, { "epoch": 1.0769230769230769, "grad_norm": 0.5172417418761445, "learning_rate": 5e-06, "loss": 0.5817, "step": 1890 }, { "epoch": 1.0826210826210827, "grad_norm": 0.5444976667004858, "learning_rate": 5e-06, "loss": 0.5795, "step": 1900 }, { "epoch": 1.0883190883190883, "grad_norm": 0.5808327796447509, "learning_rate": 5e-06, "loss": 0.5838, "step": 1910 }, { "epoch": 1.0940170940170941, "grad_norm": 0.5553080452734186, "learning_rate": 5e-06, "loss": 0.5729, "step": 1920 }, { "epoch": 1.0997150997150997, "grad_norm": 0.6252535797926512, "learning_rate": 5e-06, "loss": 0.5888, "step": 1930 }, { "epoch": 1.1054131054131053, "grad_norm": 0.5418701052068917, "learning_rate": 5e-06, "loss": 0.5749, "step": 1940 }, { "epoch": 1.1111111111111112, "grad_norm": 0.5427412867505934, "learning_rate": 5e-06, "loss": 0.5802, "step": 1950 }, { "epoch": 1.1168091168091168, "grad_norm": 0.5838842247306398, "learning_rate": 5e-06, "loss": 0.5919, "step": 1960 }, { "epoch": 1.1225071225071226, "grad_norm": 0.5659489766269445, "learning_rate": 5e-06, "loss": 0.5679, "step": 1970 }, { "epoch": 1.1282051282051282, "grad_norm": 0.5710950036482688, "learning_rate": 5e-06, "loss": 0.588, "step": 1980 }, { "epoch": 1.133903133903134, "grad_norm": 0.5563097510452688, "learning_rate": 5e-06, "loss": 0.5747, "step": 1990 }, { "epoch": 1.1396011396011396, "grad_norm": 0.5413759858943353, "learning_rate": 5e-06, "loss": 0.5679, "step": 2000 }, { "epoch": 1.1452991452991452, "grad_norm": 0.5610725075898626, "learning_rate": 5e-06, "loss": 0.5795, "step": 2010 }, { "epoch": 1.150997150997151, "grad_norm": 0.5317980893898213, "learning_rate": 5e-06, "loss": 0.565, "step": 2020 }, { "epoch": 1.1566951566951567, "grad_norm": 0.5402604242832053, "learning_rate": 5e-06, "loss": 0.5671, "step": 2030 }, { "epoch": 1.1623931623931625, "grad_norm": 0.5628406736489239, "learning_rate": 5e-06, "loss": 0.5785, "step": 2040 }, { "epoch": 1.168091168091168, "grad_norm": 0.5598060055556051, "learning_rate": 5e-06, "loss": 0.5687, "step": 2050 }, { "epoch": 1.173789173789174, "grad_norm": 0.5812067996328552, "learning_rate": 5e-06, "loss": 0.5739, "step": 2060 }, { "epoch": 1.1794871794871795, "grad_norm": 0.5804815720213962, "learning_rate": 5e-06, "loss": 0.58, "step": 2070 }, { "epoch": 1.1851851851851851, "grad_norm": 0.6009435615613525, "learning_rate": 5e-06, "loss": 0.5883, "step": 2080 }, { "epoch": 1.190883190883191, "grad_norm": 0.54252794895387, "learning_rate": 5e-06, "loss": 0.5777, "step": 2090 }, { "epoch": 1.1965811965811965, "grad_norm": 0.5996787413433816, "learning_rate": 5e-06, "loss": 0.5671, "step": 2100 }, { "epoch": 1.2022792022792024, "grad_norm": 0.5536047256778152, "learning_rate": 5e-06, "loss": 0.5745, "step": 2110 }, { "epoch": 1.207977207977208, "grad_norm": 0.561711893430855, "learning_rate": 5e-06, "loss": 0.5748, "step": 2120 }, { "epoch": 1.2136752136752136, "grad_norm": 0.5283929440389717, "learning_rate": 5e-06, "loss": 0.5915, "step": 2130 }, { "epoch": 1.2193732193732194, "grad_norm": 0.5301857105389954, "learning_rate": 5e-06, "loss": 0.5737, "step": 2140 }, { "epoch": 1.225071225071225, "grad_norm": 0.5563444083260252, "learning_rate": 5e-06, "loss": 0.5642, "step": 2150 }, { "epoch": 1.2307692307692308, "grad_norm": 0.5430715224319318, "learning_rate": 5e-06, "loss": 0.5895, "step": 2160 }, { "epoch": 1.2364672364672364, "grad_norm": 0.5629400205999858, "learning_rate": 5e-06, "loss": 0.5827, "step": 2170 }, { "epoch": 1.242165242165242, "grad_norm": 0.5751709064271272, "learning_rate": 5e-06, "loss": 0.5835, "step": 2180 }, { "epoch": 1.2478632478632479, "grad_norm": 0.5741003758226062, "learning_rate": 5e-06, "loss": 0.5865, "step": 2190 }, { "epoch": 1.2535612535612537, "grad_norm": 0.5791349713821747, "learning_rate": 5e-06, "loss": 0.5699, "step": 2200 }, { "epoch": 1.2592592592592593, "grad_norm": 0.5538967837131181, "learning_rate": 5e-06, "loss": 0.5888, "step": 2210 }, { "epoch": 1.264957264957265, "grad_norm": 0.6047679094918335, "learning_rate": 5e-06, "loss": 0.5852, "step": 2220 }, { "epoch": 1.2706552706552707, "grad_norm": 0.5513200031887904, "learning_rate": 5e-06, "loss": 0.581, "step": 2230 }, { "epoch": 1.2763532763532763, "grad_norm": 0.5416098244594392, "learning_rate": 5e-06, "loss": 0.5608, "step": 2240 }, { "epoch": 1.282051282051282, "grad_norm": 0.6042872751469346, "learning_rate": 5e-06, "loss": 0.5753, "step": 2250 }, { "epoch": 1.2877492877492878, "grad_norm": 0.5529496445289886, "learning_rate": 5e-06, "loss": 0.5869, "step": 2260 }, { "epoch": 1.2934472934472934, "grad_norm": 0.5061156686160359, "learning_rate": 5e-06, "loss": 0.5784, "step": 2270 }, { "epoch": 1.2991452991452992, "grad_norm": 0.5340704963602597, "learning_rate": 5e-06, "loss": 0.5591, "step": 2280 }, { "epoch": 1.3048433048433048, "grad_norm": 0.5138792740114064, "learning_rate": 5e-06, "loss": 0.5687, "step": 2290 }, { "epoch": 1.3105413105413106, "grad_norm": 0.5804911265669914, "learning_rate": 5e-06, "loss": 0.5808, "step": 2300 }, { "epoch": 1.3162393162393162, "grad_norm": 0.6117190706702494, "learning_rate": 5e-06, "loss": 0.5867, "step": 2310 }, { "epoch": 1.3219373219373218, "grad_norm": 0.5374452206535677, "learning_rate": 5e-06, "loss": 0.5674, "step": 2320 }, { "epoch": 1.3276353276353277, "grad_norm": 0.5510977367381295, "learning_rate": 5e-06, "loss": 0.5781, "step": 2330 }, { "epoch": 1.3333333333333333, "grad_norm": 0.5721844682107659, "learning_rate": 5e-06, "loss": 0.5714, "step": 2340 }, { "epoch": 1.339031339031339, "grad_norm": 0.6182161591629188, "learning_rate": 5e-06, "loss": 0.5751, "step": 2350 }, { "epoch": 1.3447293447293447, "grad_norm": 0.5662236658011203, "learning_rate": 5e-06, "loss": 0.5647, "step": 2360 }, { "epoch": 1.3504273504273505, "grad_norm": 0.5538407330840261, "learning_rate": 5e-06, "loss": 0.587, "step": 2370 }, { "epoch": 1.3561253561253561, "grad_norm": 0.5707317602292841, "learning_rate": 5e-06, "loss": 0.5922, "step": 2380 }, { "epoch": 1.3618233618233617, "grad_norm": 0.5641874789639416, "learning_rate": 5e-06, "loss": 0.5641, "step": 2390 }, { "epoch": 1.3675213675213675, "grad_norm": 0.5484949550764525, "learning_rate": 5e-06, "loss": 0.5751, "step": 2400 }, { "epoch": 1.3732193732193732, "grad_norm": 0.5681990922872379, "learning_rate": 5e-06, "loss": 0.5794, "step": 2410 }, { "epoch": 1.378917378917379, "grad_norm": 0.5369480638512492, "learning_rate": 5e-06, "loss": 0.5722, "step": 2420 }, { "epoch": 1.3846153846153846, "grad_norm": 0.5401250464395447, "learning_rate": 5e-06, "loss": 0.5786, "step": 2430 }, { "epoch": 1.3903133903133904, "grad_norm": 0.5143005599419228, "learning_rate": 5e-06, "loss": 0.5795, "step": 2440 }, { "epoch": 1.396011396011396, "grad_norm": 0.571280225689413, "learning_rate": 5e-06, "loss": 0.5836, "step": 2450 }, { "epoch": 1.4017094017094016, "grad_norm": 0.5302500110416417, "learning_rate": 5e-06, "loss": 0.5778, "step": 2460 }, { "epoch": 1.4074074074074074, "grad_norm": 0.6184863756024197, "learning_rate": 5e-06, "loss": 0.5932, "step": 2470 }, { "epoch": 1.413105413105413, "grad_norm": 0.5500463967692277, "learning_rate": 5e-06, "loss": 0.5763, "step": 2480 }, { "epoch": 1.4188034188034189, "grad_norm": 0.6265908750634392, "learning_rate": 5e-06, "loss": 0.5829, "step": 2490 }, { "epoch": 1.4245014245014245, "grad_norm": 0.5888611093710416, "learning_rate": 5e-06, "loss": 0.6017, "step": 2500 }, { "epoch": 1.4301994301994303, "grad_norm": 0.5682311211543241, "learning_rate": 5e-06, "loss": 0.5719, "step": 2510 }, { "epoch": 1.435897435897436, "grad_norm": 0.572149006000227, "learning_rate": 5e-06, "loss": 0.5712, "step": 2520 }, { "epoch": 1.4415954415954415, "grad_norm": 0.5928100499577562, "learning_rate": 5e-06, "loss": 0.5781, "step": 2530 }, { "epoch": 1.4472934472934473, "grad_norm": 0.6074436695553413, "learning_rate": 5e-06, "loss": 0.5681, "step": 2540 }, { "epoch": 1.452991452991453, "grad_norm": 0.5499482375209497, "learning_rate": 5e-06, "loss": 0.578, "step": 2550 }, { "epoch": 1.4586894586894588, "grad_norm": 0.6303664310772396, "learning_rate": 5e-06, "loss": 0.5688, "step": 2560 }, { "epoch": 1.4643874643874644, "grad_norm": 0.510009725228524, "learning_rate": 5e-06, "loss": 0.5678, "step": 2570 }, { "epoch": 1.4700854700854702, "grad_norm": 0.5748311852491685, "learning_rate": 5e-06, "loss": 0.5714, "step": 2580 }, { "epoch": 1.4757834757834758, "grad_norm": 0.6184990291175282, "learning_rate": 5e-06, "loss": 0.5743, "step": 2590 }, { "epoch": 1.4814814814814814, "grad_norm": 0.555767002073107, "learning_rate": 5e-06, "loss": 0.5817, "step": 2600 }, { "epoch": 1.4871794871794872, "grad_norm": 0.5903444647981344, "learning_rate": 5e-06, "loss": 0.5793, "step": 2610 }, { "epoch": 1.4928774928774928, "grad_norm": 0.5576429393071742, "learning_rate": 5e-06, "loss": 0.5647, "step": 2620 }, { "epoch": 1.4985754985754987, "grad_norm": 0.5520440692451319, "learning_rate": 5e-06, "loss": 0.5716, "step": 2630 }, { "epoch": 1.5042735042735043, "grad_norm": 0.5643022408516812, "learning_rate": 5e-06, "loss": 0.5786, "step": 2640 }, { "epoch": 1.50997150997151, "grad_norm": 0.6330193140871835, "learning_rate": 5e-06, "loss": 0.5836, "step": 2650 }, { "epoch": 1.5156695156695157, "grad_norm": 0.625250641713771, "learning_rate": 5e-06, "loss": 0.5718, "step": 2660 }, { "epoch": 1.5213675213675213, "grad_norm": 0.5418501880171682, "learning_rate": 5e-06, "loss": 0.5814, "step": 2670 }, { "epoch": 1.5270655270655271, "grad_norm": 0.6064856119796758, "learning_rate": 5e-06, "loss": 0.5859, "step": 2680 }, { "epoch": 1.5327635327635327, "grad_norm": 0.5672868138655305, "learning_rate": 5e-06, "loss": 0.5542, "step": 2690 }, { "epoch": 1.5384615384615383, "grad_norm": 0.5139650665849255, "learning_rate": 5e-06, "loss": 0.5765, "step": 2700 }, { "epoch": 1.5441595441595442, "grad_norm": 0.5940578181125139, "learning_rate": 5e-06, "loss": 0.5881, "step": 2710 }, { "epoch": 1.54985754985755, "grad_norm": 0.5756432474296378, "learning_rate": 5e-06, "loss": 0.5651, "step": 2720 }, { "epoch": 1.5555555555555556, "grad_norm": 0.545530119077096, "learning_rate": 5e-06, "loss": 0.5764, "step": 2730 }, { "epoch": 1.5612535612535612, "grad_norm": 0.5756357865222977, "learning_rate": 5e-06, "loss": 0.5612, "step": 2740 }, { "epoch": 1.566951566951567, "grad_norm": 0.5876180011106431, "learning_rate": 5e-06, "loss": 0.5794, "step": 2750 }, { "epoch": 1.5726495726495726, "grad_norm": 0.5147868956095404, "learning_rate": 5e-06, "loss": 0.5707, "step": 2760 }, { "epoch": 1.5783475783475782, "grad_norm": 0.5591733948940201, "learning_rate": 5e-06, "loss": 0.5884, "step": 2770 }, { "epoch": 1.584045584045584, "grad_norm": 0.5627964904713231, "learning_rate": 5e-06, "loss": 0.6041, "step": 2780 }, { "epoch": 1.5897435897435899, "grad_norm": 0.6297308754027342, "learning_rate": 5e-06, "loss": 0.5923, "step": 2790 }, { "epoch": 1.5954415954415955, "grad_norm": 0.5936598449915134, "learning_rate": 5e-06, "loss": 0.5761, "step": 2800 }, { "epoch": 1.601139601139601, "grad_norm": 0.5515458557481886, "learning_rate": 5e-06, "loss": 0.5833, "step": 2810 }, { "epoch": 1.606837606837607, "grad_norm": 0.5257725141562714, "learning_rate": 5e-06, "loss": 0.5765, "step": 2820 }, { "epoch": 1.6125356125356125, "grad_norm": 0.5730936759045179, "learning_rate": 5e-06, "loss": 0.5649, "step": 2830 }, { "epoch": 1.618233618233618, "grad_norm": 0.5354221551624224, "learning_rate": 5e-06, "loss": 0.5776, "step": 2840 }, { "epoch": 1.623931623931624, "grad_norm": 0.5824066939857122, "learning_rate": 5e-06, "loss": 0.5769, "step": 2850 }, { "epoch": 1.6296296296296298, "grad_norm": 0.5554767609605542, "learning_rate": 5e-06, "loss": 0.5964, "step": 2860 }, { "epoch": 1.6353276353276354, "grad_norm": 0.5526840997991586, "learning_rate": 5e-06, "loss": 0.5727, "step": 2870 }, { "epoch": 1.641025641025641, "grad_norm": 0.5465241482682912, "learning_rate": 5e-06, "loss": 0.5686, "step": 2880 }, { "epoch": 1.6467236467236468, "grad_norm": 0.6190071105580203, "learning_rate": 5e-06, "loss": 0.5823, "step": 2890 }, { "epoch": 1.6524216524216524, "grad_norm": 0.5356564377870386, "learning_rate": 5e-06, "loss": 0.5755, "step": 2900 }, { "epoch": 1.658119658119658, "grad_norm": 0.5979037263652105, "learning_rate": 5e-06, "loss": 0.5772, "step": 2910 }, { "epoch": 1.6638176638176638, "grad_norm": 0.5872422786253441, "learning_rate": 5e-06, "loss": 0.5761, "step": 2920 }, { "epoch": 1.6695156695156697, "grad_norm": 0.5667349984301929, "learning_rate": 5e-06, "loss": 0.565, "step": 2930 }, { "epoch": 1.6752136752136753, "grad_norm": 0.5553088977731174, "learning_rate": 5e-06, "loss": 0.5737, "step": 2940 }, { "epoch": 1.6809116809116809, "grad_norm": 0.5615666295169086, "learning_rate": 5e-06, "loss": 0.5793, "step": 2950 }, { "epoch": 1.6866096866096867, "grad_norm": 0.553145275478625, "learning_rate": 5e-06, "loss": 0.5742, "step": 2960 }, { "epoch": 1.6923076923076923, "grad_norm": 0.5341696725770836, "learning_rate": 5e-06, "loss": 0.5614, "step": 2970 }, { "epoch": 1.698005698005698, "grad_norm": 0.5463223359456406, "learning_rate": 5e-06, "loss": 0.5786, "step": 2980 }, { "epoch": 1.7037037037037037, "grad_norm": 0.5777813141939787, "learning_rate": 5e-06, "loss": 0.5841, "step": 2990 }, { "epoch": 1.7094017094017095, "grad_norm": 0.5300785322610779, "learning_rate": 5e-06, "loss": 0.5778, "step": 3000 }, { "epoch": 1.7150997150997151, "grad_norm": 0.563500423103054, "learning_rate": 5e-06, "loss": 0.5755, "step": 3010 }, { "epoch": 1.7207977207977208, "grad_norm": 0.5634105059434202, "learning_rate": 5e-06, "loss": 0.5643, "step": 3020 }, { "epoch": 1.7264957264957266, "grad_norm": 0.5570811501478863, "learning_rate": 5e-06, "loss": 0.581, "step": 3030 }, { "epoch": 1.7321937321937322, "grad_norm": 0.5739299531519032, "learning_rate": 5e-06, "loss": 0.5748, "step": 3040 }, { "epoch": 1.7378917378917378, "grad_norm": 0.5701909466155488, "learning_rate": 5e-06, "loss": 0.5708, "step": 3050 }, { "epoch": 1.7435897435897436, "grad_norm": 0.5308366598042483, "learning_rate": 5e-06, "loss": 0.5815, "step": 3060 }, { "epoch": 1.7492877492877494, "grad_norm": 0.5528144482172378, "learning_rate": 5e-06, "loss": 0.5728, "step": 3070 }, { "epoch": 1.7549857549857548, "grad_norm": 0.6087932311563443, "learning_rate": 5e-06, "loss": 0.5707, "step": 3080 }, { "epoch": 1.7606837606837606, "grad_norm": 0.5516607270748759, "learning_rate": 5e-06, "loss": 0.5896, "step": 3090 }, { "epoch": 1.7663817663817665, "grad_norm": 0.5843004310212261, "learning_rate": 5e-06, "loss": 0.5767, "step": 3100 }, { "epoch": 1.772079772079772, "grad_norm": 0.5852609740047551, "learning_rate": 5e-06, "loss": 0.5845, "step": 3110 }, { "epoch": 1.7777777777777777, "grad_norm": 0.552703665463295, "learning_rate": 5e-06, "loss": 0.5703, "step": 3120 }, { "epoch": 1.7834757834757835, "grad_norm": 0.6753220707242441, "learning_rate": 5e-06, "loss": 0.5758, "step": 3130 }, { "epoch": 1.7891737891737893, "grad_norm": 0.616525110015123, "learning_rate": 5e-06, "loss": 0.5884, "step": 3140 }, { "epoch": 1.7948717948717947, "grad_norm": 0.5654708131985335, "learning_rate": 5e-06, "loss": 0.5758, "step": 3150 }, { "epoch": 1.8005698005698005, "grad_norm": 0.5323851407450543, "learning_rate": 5e-06, "loss": 0.5686, "step": 3160 }, { "epoch": 1.8062678062678064, "grad_norm": 0.5799884631320908, "learning_rate": 5e-06, "loss": 0.5733, "step": 3170 }, { "epoch": 1.811965811965812, "grad_norm": 0.6251760605506265, "learning_rate": 5e-06, "loss": 0.5766, "step": 3180 }, { "epoch": 1.8176638176638176, "grad_norm": 0.5779038699801032, "learning_rate": 5e-06, "loss": 0.5624, "step": 3190 }, { "epoch": 1.8233618233618234, "grad_norm": 0.5088787245145318, "learning_rate": 5e-06, "loss": 0.5596, "step": 3200 }, { "epoch": 1.8290598290598292, "grad_norm": 0.4965401461152827, "learning_rate": 5e-06, "loss": 0.5698, "step": 3210 }, { "epoch": 1.8347578347578346, "grad_norm": 0.5353414236210642, "learning_rate": 5e-06, "loss": 0.5683, "step": 3220 }, { "epoch": 1.8404558404558404, "grad_norm": 0.5409699336409666, "learning_rate": 5e-06, "loss": 0.569, "step": 3230 }, { "epoch": 1.8461538461538463, "grad_norm": 0.574913493745111, "learning_rate": 5e-06, "loss": 0.5789, "step": 3240 }, { "epoch": 1.8518518518518519, "grad_norm": 0.5521006517580501, "learning_rate": 5e-06, "loss": 0.5715, "step": 3250 }, { "epoch": 1.8575498575498575, "grad_norm": 0.5805482573176093, "learning_rate": 5e-06, "loss": 0.5585, "step": 3260 }, { "epoch": 1.8632478632478633, "grad_norm": 0.573426960061509, "learning_rate": 5e-06, "loss": 0.578, "step": 3270 }, { "epoch": 1.868945868945869, "grad_norm": 0.5565980515165863, "learning_rate": 5e-06, "loss": 0.5729, "step": 3280 }, { "epoch": 1.8746438746438745, "grad_norm": 0.5586859824206695, "learning_rate": 5e-06, "loss": 0.5761, "step": 3290 }, { "epoch": 1.8803418803418803, "grad_norm": 0.5496999183024133, "learning_rate": 5e-06, "loss": 0.5561, "step": 3300 }, { "epoch": 1.8860398860398861, "grad_norm": 0.5997656668355231, "learning_rate": 5e-06, "loss": 0.5859, "step": 3310 }, { "epoch": 1.8917378917378918, "grad_norm": 0.6206812095844995, "learning_rate": 5e-06, "loss": 0.5845, "step": 3320 }, { "epoch": 1.8974358974358974, "grad_norm": 0.5963021360576573, "learning_rate": 5e-06, "loss": 0.5839, "step": 3330 }, { "epoch": 1.9031339031339032, "grad_norm": 0.5743434396116716, "learning_rate": 5e-06, "loss": 0.5816, "step": 3340 }, { "epoch": 1.9088319088319088, "grad_norm": 0.5858788059851108, "learning_rate": 5e-06, "loss": 0.5716, "step": 3350 }, { "epoch": 1.9145299145299144, "grad_norm": 0.5648325254524372, "learning_rate": 5e-06, "loss": 0.5789, "step": 3360 }, { "epoch": 1.9202279202279202, "grad_norm": 0.5444165984165006, "learning_rate": 5e-06, "loss": 0.5775, "step": 3370 }, { "epoch": 1.925925925925926, "grad_norm": 0.6059359662104461, "learning_rate": 5e-06, "loss": 0.5802, "step": 3380 }, { "epoch": 1.9316239316239316, "grad_norm": 0.5755161543958413, "learning_rate": 5e-06, "loss": 0.5784, "step": 3390 }, { "epoch": 1.9373219373219372, "grad_norm": 0.5368654626324941, "learning_rate": 5e-06, "loss": 0.5742, "step": 3400 }, { "epoch": 1.943019943019943, "grad_norm": 0.5582700416390507, "learning_rate": 5e-06, "loss": 0.5676, "step": 3410 }, { "epoch": 1.9487179487179487, "grad_norm": 0.5625374338203465, "learning_rate": 5e-06, "loss": 0.5741, "step": 3420 }, { "epoch": 1.9544159544159543, "grad_norm": 0.6326198974505973, "learning_rate": 5e-06, "loss": 0.5874, "step": 3430 }, { "epoch": 1.96011396011396, "grad_norm": 0.5826625931834889, "learning_rate": 5e-06, "loss": 0.5875, "step": 3440 }, { "epoch": 1.965811965811966, "grad_norm": 0.6063426488985952, "learning_rate": 5e-06, "loss": 0.5802, "step": 3450 }, { "epoch": 1.9715099715099715, "grad_norm": 0.5589982670093196, "learning_rate": 5e-06, "loss": 0.5813, "step": 3460 }, { "epoch": 1.9772079772079771, "grad_norm": 0.5197556180794218, "learning_rate": 5e-06, "loss": 0.5763, "step": 3470 }, { "epoch": 1.982905982905983, "grad_norm": 0.5840640270883682, "learning_rate": 5e-06, "loss": 0.5701, "step": 3480 }, { "epoch": 1.9886039886039886, "grad_norm": 0.5335458990414118, "learning_rate": 5e-06, "loss": 0.5701, "step": 3490 }, { "epoch": 1.9943019943019942, "grad_norm": 0.5707109550752067, "learning_rate": 5e-06, "loss": 0.5753, "step": 3500 }, { "epoch": 2.0, "grad_norm": 0.5883827598003003, "learning_rate": 5e-06, "loss": 0.581, "step": 3510 }, { "epoch": 2.0, "eval_loss": 0.6180241703987122, "eval_runtime": 447.562, "eval_samples_per_second": 26.416, "eval_steps_per_second": 0.413, "step": 3510 }, { "epoch": 2.005698005698006, "grad_norm": 0.5776212119618224, "learning_rate": 5e-06, "loss": 0.5254, "step": 3520 }, { "epoch": 2.011396011396011, "grad_norm": 0.5810335608051052, "learning_rate": 5e-06, "loss": 0.5094, "step": 3530 }, { "epoch": 2.017094017094017, "grad_norm": 0.5432713405979779, "learning_rate": 5e-06, "loss": 0.5032, "step": 3540 }, { "epoch": 2.022792022792023, "grad_norm": 0.5416321025053085, "learning_rate": 5e-06, "loss": 0.515, "step": 3550 }, { "epoch": 2.0284900284900287, "grad_norm": 0.5775523857428819, "learning_rate": 5e-06, "loss": 0.518, "step": 3560 }, { "epoch": 2.034188034188034, "grad_norm": 0.5592263586107213, "learning_rate": 5e-06, "loss": 0.5234, "step": 3570 }, { "epoch": 2.03988603988604, "grad_norm": 0.5476768718918923, "learning_rate": 5e-06, "loss": 0.5289, "step": 3580 }, { "epoch": 2.0455840455840457, "grad_norm": 0.6102527285044014, "learning_rate": 5e-06, "loss": 0.5208, "step": 3590 }, { "epoch": 2.051282051282051, "grad_norm": 0.589225412655405, "learning_rate": 5e-06, "loss": 0.5298, "step": 3600 }, { "epoch": 2.056980056980057, "grad_norm": 0.57808060135501, "learning_rate": 5e-06, "loss": 0.5208, "step": 3610 }, { "epoch": 2.0626780626780628, "grad_norm": 0.5634395489010126, "learning_rate": 5e-06, "loss": 0.5212, "step": 3620 }, { "epoch": 2.0683760683760686, "grad_norm": 0.5526570622014573, "learning_rate": 5e-06, "loss": 0.5297, "step": 3630 }, { "epoch": 2.074074074074074, "grad_norm": 0.5810750660810072, "learning_rate": 5e-06, "loss": 0.525, "step": 3640 }, { "epoch": 2.07977207977208, "grad_norm": 0.5614577275900066, "learning_rate": 5e-06, "loss": 0.5259, "step": 3650 }, { "epoch": 2.0854700854700856, "grad_norm": 0.5486462905219032, "learning_rate": 5e-06, "loss": 0.5213, "step": 3660 }, { "epoch": 2.091168091168091, "grad_norm": 0.5307563733817223, "learning_rate": 5e-06, "loss": 0.529, "step": 3670 }, { "epoch": 2.096866096866097, "grad_norm": 0.5389945236629596, "learning_rate": 5e-06, "loss": 0.5348, "step": 3680 }, { "epoch": 2.1025641025641026, "grad_norm": 0.5527322408012718, "learning_rate": 5e-06, "loss": 0.5116, "step": 3690 }, { "epoch": 2.1082621082621085, "grad_norm": 0.5328079584501793, "learning_rate": 5e-06, "loss": 0.5282, "step": 3700 }, { "epoch": 2.113960113960114, "grad_norm": 0.5686915040528058, "learning_rate": 5e-06, "loss": 0.5261, "step": 3710 }, { "epoch": 2.1196581196581197, "grad_norm": 0.5501606190305495, "learning_rate": 5e-06, "loss": 0.5365, "step": 3720 }, { "epoch": 2.1253561253561255, "grad_norm": 0.5536761094008102, "learning_rate": 5e-06, "loss": 0.5263, "step": 3730 }, { "epoch": 2.131054131054131, "grad_norm": 0.5345031800564628, "learning_rate": 5e-06, "loss": 0.5206, "step": 3740 }, { "epoch": 2.1367521367521367, "grad_norm": 0.6046490261900991, "learning_rate": 5e-06, "loss": 0.5275, "step": 3750 }, { "epoch": 2.1424501424501425, "grad_norm": 0.5840211791187765, "learning_rate": 5e-06, "loss": 0.5201, "step": 3760 }, { "epoch": 2.148148148148148, "grad_norm": 0.5529533135143219, "learning_rate": 5e-06, "loss": 0.5115, "step": 3770 }, { "epoch": 2.1538461538461537, "grad_norm": 0.5680751070257097, "learning_rate": 5e-06, "loss": 0.5294, "step": 3780 }, { "epoch": 2.1595441595441596, "grad_norm": 0.5245141535052799, "learning_rate": 5e-06, "loss": 0.5281, "step": 3790 }, { "epoch": 2.1652421652421654, "grad_norm": 0.5648362949355089, "learning_rate": 5e-06, "loss": 0.5147, "step": 3800 }, { "epoch": 2.1709401709401708, "grad_norm": 0.5254847337067438, "learning_rate": 5e-06, "loss": 0.5313, "step": 3810 }, { "epoch": 2.1766381766381766, "grad_norm": 0.5976261665941772, "learning_rate": 5e-06, "loss": 0.5198, "step": 3820 }, { "epoch": 2.1823361823361824, "grad_norm": 0.5864445373756276, "learning_rate": 5e-06, "loss": 0.5336, "step": 3830 }, { "epoch": 2.1880341880341883, "grad_norm": 0.5537617774511332, "learning_rate": 5e-06, "loss": 0.5239, "step": 3840 }, { "epoch": 2.1937321937321936, "grad_norm": 0.5790262967504055, "learning_rate": 5e-06, "loss": 0.5387, "step": 3850 }, { "epoch": 2.1994301994301995, "grad_norm": 0.5448893578337308, "learning_rate": 5e-06, "loss": 0.5158, "step": 3860 }, { "epoch": 2.2051282051282053, "grad_norm": 0.5224956999156651, "learning_rate": 5e-06, "loss": 0.5319, "step": 3870 }, { "epoch": 2.2108262108262107, "grad_norm": 0.5452041541066649, "learning_rate": 5e-06, "loss": 0.5283, "step": 3880 }, { "epoch": 2.2165242165242165, "grad_norm": 0.5188463908276534, "learning_rate": 5e-06, "loss": 0.5111, "step": 3890 }, { "epoch": 2.2222222222222223, "grad_norm": 0.6153310194594807, "learning_rate": 5e-06, "loss": 0.5358, "step": 3900 }, { "epoch": 2.2279202279202277, "grad_norm": 0.5926494217956065, "learning_rate": 5e-06, "loss": 0.5154, "step": 3910 }, { "epoch": 2.2336182336182335, "grad_norm": 0.5109574356125176, "learning_rate": 5e-06, "loss": 0.518, "step": 3920 }, { "epoch": 2.2393162393162394, "grad_norm": 0.5289253041831274, "learning_rate": 5e-06, "loss": 0.5246, "step": 3930 }, { "epoch": 2.245014245014245, "grad_norm": 0.5628951778576998, "learning_rate": 5e-06, "loss": 0.526, "step": 3940 }, { "epoch": 2.2507122507122506, "grad_norm": 0.551449654946418, "learning_rate": 5e-06, "loss": 0.5374, "step": 3950 }, { "epoch": 2.2564102564102564, "grad_norm": 0.5466152136858086, "learning_rate": 5e-06, "loss": 0.5169, "step": 3960 }, { "epoch": 2.262108262108262, "grad_norm": 0.5146969054690042, "learning_rate": 5e-06, "loss": 0.5281, "step": 3970 }, { "epoch": 2.267806267806268, "grad_norm": 0.5293060782125808, "learning_rate": 5e-06, "loss": 0.5191, "step": 3980 }, { "epoch": 2.2735042735042734, "grad_norm": 0.5473420088344219, "learning_rate": 5e-06, "loss": 0.531, "step": 3990 }, { "epoch": 2.2792022792022792, "grad_norm": 0.5512443710837232, "learning_rate": 5e-06, "loss": 0.5256, "step": 4000 }, { "epoch": 2.284900284900285, "grad_norm": 0.5442787627600018, "learning_rate": 5e-06, "loss": 0.5222, "step": 4010 }, { "epoch": 2.2905982905982905, "grad_norm": 0.5545916348777593, "learning_rate": 5e-06, "loss": 0.535, "step": 4020 }, { "epoch": 2.2962962962962963, "grad_norm": 0.59632132003208, "learning_rate": 5e-06, "loss": 0.5317, "step": 4030 }, { "epoch": 2.301994301994302, "grad_norm": 0.5408157566248561, "learning_rate": 5e-06, "loss": 0.5146, "step": 4040 }, { "epoch": 2.3076923076923075, "grad_norm": 0.5820724583290839, "learning_rate": 5e-06, "loss": 0.5349, "step": 4050 }, { "epoch": 2.3133903133903133, "grad_norm": 0.5687662322666911, "learning_rate": 5e-06, "loss": 0.5282, "step": 4060 }, { "epoch": 2.319088319088319, "grad_norm": 0.573552994416881, "learning_rate": 5e-06, "loss": 0.5336, "step": 4070 }, { "epoch": 2.324786324786325, "grad_norm": 0.5677912645112424, "learning_rate": 5e-06, "loss": 0.5214, "step": 4080 }, { "epoch": 2.3304843304843303, "grad_norm": 0.5274433334199329, "learning_rate": 5e-06, "loss": 0.5244, "step": 4090 }, { "epoch": 2.336182336182336, "grad_norm": 0.5658209536678374, "learning_rate": 5e-06, "loss": 0.5286, "step": 4100 }, { "epoch": 2.341880341880342, "grad_norm": 0.5780434495697487, "learning_rate": 5e-06, "loss": 0.5341, "step": 4110 }, { "epoch": 2.347578347578348, "grad_norm": 0.5818657983745251, "learning_rate": 5e-06, "loss": 0.5338, "step": 4120 }, { "epoch": 2.353276353276353, "grad_norm": 0.5389779504746351, "learning_rate": 5e-06, "loss": 0.5291, "step": 4130 }, { "epoch": 2.358974358974359, "grad_norm": 0.5610403895418081, "learning_rate": 5e-06, "loss": 0.5225, "step": 4140 }, { "epoch": 2.364672364672365, "grad_norm": 0.5209098217965255, "learning_rate": 5e-06, "loss": 0.5334, "step": 4150 }, { "epoch": 2.3703703703703702, "grad_norm": 0.5744294920867676, "learning_rate": 5e-06, "loss": 0.5204, "step": 4160 }, { "epoch": 2.376068376068376, "grad_norm": 0.598425566675419, "learning_rate": 5e-06, "loss": 0.52, "step": 4170 }, { "epoch": 2.381766381766382, "grad_norm": 0.5493923391327106, "learning_rate": 5e-06, "loss": 0.5359, "step": 4180 }, { "epoch": 2.3874643874643873, "grad_norm": 0.5533392246170049, "learning_rate": 5e-06, "loss": 0.521, "step": 4190 }, { "epoch": 2.393162393162393, "grad_norm": 0.5731160307080695, "learning_rate": 5e-06, "loss": 0.5329, "step": 4200 }, { "epoch": 2.398860398860399, "grad_norm": 0.5775023991320096, "learning_rate": 5e-06, "loss": 0.5359, "step": 4210 }, { "epoch": 2.4045584045584047, "grad_norm": 0.5901628223866878, "learning_rate": 5e-06, "loss": 0.5493, "step": 4220 }, { "epoch": 2.41025641025641, "grad_norm": 0.5542817321146499, "learning_rate": 5e-06, "loss": 0.526, "step": 4230 }, { "epoch": 2.415954415954416, "grad_norm": 0.5524566146364747, "learning_rate": 5e-06, "loss": 0.5307, "step": 4240 }, { "epoch": 2.421652421652422, "grad_norm": 0.5244228024377005, "learning_rate": 5e-06, "loss": 0.5278, "step": 4250 }, { "epoch": 2.427350427350427, "grad_norm": 0.5786633243903677, "learning_rate": 5e-06, "loss": 0.5482, "step": 4260 }, { "epoch": 2.433048433048433, "grad_norm": 0.5858650466682291, "learning_rate": 5e-06, "loss": 0.5394, "step": 4270 }, { "epoch": 2.438746438746439, "grad_norm": 0.5858885917781449, "learning_rate": 5e-06, "loss": 0.5371, "step": 4280 }, { "epoch": 2.4444444444444446, "grad_norm": 0.5339546065735147, "learning_rate": 5e-06, "loss": 0.5373, "step": 4290 }, { "epoch": 2.45014245014245, "grad_norm": 0.5984049498497251, "learning_rate": 5e-06, "loss": 0.5474, "step": 4300 }, { "epoch": 2.455840455840456, "grad_norm": 0.5807043848022856, "learning_rate": 5e-06, "loss": 0.5309, "step": 4310 }, { "epoch": 2.4615384615384617, "grad_norm": 0.5709610370467612, "learning_rate": 5e-06, "loss": 0.5246, "step": 4320 }, { "epoch": 2.467236467236467, "grad_norm": 0.5499687224770995, "learning_rate": 5e-06, "loss": 0.531, "step": 4330 }, { "epoch": 2.472934472934473, "grad_norm": 0.5722356598944494, "learning_rate": 5e-06, "loss": 0.5286, "step": 4340 }, { "epoch": 2.4786324786324787, "grad_norm": 0.5486032250328287, "learning_rate": 5e-06, "loss": 0.5358, "step": 4350 }, { "epoch": 2.484330484330484, "grad_norm": 0.5142530295671646, "learning_rate": 5e-06, "loss": 0.5324, "step": 4360 }, { "epoch": 2.49002849002849, "grad_norm": 0.6364539965127788, "learning_rate": 5e-06, "loss": 0.5325, "step": 4370 }, { "epoch": 2.4957264957264957, "grad_norm": 0.5822908149062661, "learning_rate": 5e-06, "loss": 0.5378, "step": 4380 }, { "epoch": 2.5014245014245016, "grad_norm": 0.5660579125585127, "learning_rate": 5e-06, "loss": 0.539, "step": 4390 }, { "epoch": 2.5071225071225074, "grad_norm": 0.6015416980494055, "learning_rate": 5e-06, "loss": 0.543, "step": 4400 }, { "epoch": 2.5128205128205128, "grad_norm": 0.544050303995212, "learning_rate": 5e-06, "loss": 0.5186, "step": 4410 }, { "epoch": 2.5185185185185186, "grad_norm": 0.5489445408860626, "learning_rate": 5e-06, "loss": 0.5293, "step": 4420 }, { "epoch": 2.5242165242165244, "grad_norm": 0.5804195388596164, "learning_rate": 5e-06, "loss": 0.5368, "step": 4430 }, { "epoch": 2.52991452991453, "grad_norm": 0.5465444916928103, "learning_rate": 5e-06, "loss": 0.5395, "step": 4440 }, { "epoch": 2.5356125356125356, "grad_norm": 0.5679778769321939, "learning_rate": 5e-06, "loss": 0.5358, "step": 4450 }, { "epoch": 2.5413105413105415, "grad_norm": 0.5726465912316608, "learning_rate": 5e-06, "loss": 0.5253, "step": 4460 }, { "epoch": 2.547008547008547, "grad_norm": 0.5387152868301355, "learning_rate": 5e-06, "loss": 0.5268, "step": 4470 }, { "epoch": 2.5527065527065527, "grad_norm": 0.5559047427422275, "learning_rate": 5e-06, "loss": 0.5305, "step": 4480 }, { "epoch": 2.5584045584045585, "grad_norm": 0.5428769349897132, "learning_rate": 5e-06, "loss": 0.5209, "step": 4490 }, { "epoch": 2.564102564102564, "grad_norm": 0.5407361307856526, "learning_rate": 5e-06, "loss": 0.5351, "step": 4500 }, { "epoch": 2.5698005698005697, "grad_norm": 0.5595203034409101, "learning_rate": 5e-06, "loss": 0.5312, "step": 4510 }, { "epoch": 2.5754985754985755, "grad_norm": 0.5752885902852435, "learning_rate": 5e-06, "loss": 0.5328, "step": 4520 }, { "epoch": 2.5811965811965814, "grad_norm": 0.5448007027240791, "learning_rate": 5e-06, "loss": 0.5295, "step": 4530 }, { "epoch": 2.5868945868945867, "grad_norm": 0.5494957146695392, "learning_rate": 5e-06, "loss": 0.5327, "step": 4540 }, { "epoch": 2.5925925925925926, "grad_norm": 0.5743882596085497, "learning_rate": 5e-06, "loss": 0.5295, "step": 4550 }, { "epoch": 2.5982905982905984, "grad_norm": 0.5481581540445639, "learning_rate": 5e-06, "loss": 0.5305, "step": 4560 }, { "epoch": 2.603988603988604, "grad_norm": 0.5834328837619958, "learning_rate": 5e-06, "loss": 0.5376, "step": 4570 }, { "epoch": 2.6096866096866096, "grad_norm": 0.5536117193354623, "learning_rate": 5e-06, "loss": 0.5341, "step": 4580 }, { "epoch": 2.6153846153846154, "grad_norm": 0.545383573085851, "learning_rate": 5e-06, "loss": 0.5233, "step": 4590 }, { "epoch": 2.6210826210826212, "grad_norm": 0.5204672857822074, "learning_rate": 5e-06, "loss": 0.5228, "step": 4600 }, { "epoch": 2.6267806267806266, "grad_norm": 0.5139161169258046, "learning_rate": 5e-06, "loss": 0.5328, "step": 4610 }, { "epoch": 2.6324786324786325, "grad_norm": 0.6028262892085369, "learning_rate": 5e-06, "loss": 0.5332, "step": 4620 }, { "epoch": 2.6381766381766383, "grad_norm": 0.5559617493532288, "learning_rate": 5e-06, "loss": 0.5232, "step": 4630 }, { "epoch": 2.6438746438746437, "grad_norm": 0.5435028142224008, "learning_rate": 5e-06, "loss": 0.5415, "step": 4640 }, { "epoch": 2.6495726495726495, "grad_norm": 0.604873621040108, "learning_rate": 5e-06, "loss": 0.5303, "step": 4650 }, { "epoch": 2.6552706552706553, "grad_norm": 0.5697598259817795, "learning_rate": 5e-06, "loss": 0.5373, "step": 4660 }, { "epoch": 2.6609686609686607, "grad_norm": 0.5511420813626869, "learning_rate": 5e-06, "loss": 0.5434, "step": 4670 }, { "epoch": 2.6666666666666665, "grad_norm": 0.5394695044798543, "learning_rate": 5e-06, "loss": 0.5238, "step": 4680 }, { "epoch": 2.6723646723646723, "grad_norm": 0.5330927779679859, "learning_rate": 5e-06, "loss": 0.5245, "step": 4690 }, { "epoch": 2.678062678062678, "grad_norm": 0.5736642108384618, "learning_rate": 5e-06, "loss": 0.5305, "step": 4700 }, { "epoch": 2.683760683760684, "grad_norm": 0.6197551413034075, "learning_rate": 5e-06, "loss": 0.5408, "step": 4710 }, { "epoch": 2.6894586894586894, "grad_norm": 0.5791951412024915, "learning_rate": 5e-06, "loss": 0.5306, "step": 4720 }, { "epoch": 2.695156695156695, "grad_norm": 0.5631274263966353, "learning_rate": 5e-06, "loss": 0.5228, "step": 4730 }, { "epoch": 2.700854700854701, "grad_norm": 0.5605562545980405, "learning_rate": 5e-06, "loss": 0.5266, "step": 4740 }, { "epoch": 2.7065527065527064, "grad_norm": 0.5321827825743034, "learning_rate": 5e-06, "loss": 0.5338, "step": 4750 }, { "epoch": 2.7122507122507122, "grad_norm": 0.5644337354264807, "learning_rate": 5e-06, "loss": 0.5376, "step": 4760 }, { "epoch": 2.717948717948718, "grad_norm": 0.5719762386839188, "learning_rate": 5e-06, "loss": 0.5298, "step": 4770 }, { "epoch": 2.7236467236467234, "grad_norm": 0.5870644859394915, "learning_rate": 5e-06, "loss": 0.5408, "step": 4780 }, { "epoch": 2.7293447293447293, "grad_norm": 0.5161759448699083, "learning_rate": 5e-06, "loss": 0.5385, "step": 4790 }, { "epoch": 2.735042735042735, "grad_norm": 0.5685973523356822, "learning_rate": 5e-06, "loss": 0.5295, "step": 4800 }, { "epoch": 2.7407407407407405, "grad_norm": 0.5955189388351516, "learning_rate": 5e-06, "loss": 0.5357, "step": 4810 }, { "epoch": 2.7464387464387463, "grad_norm": 0.5927243869455354, "learning_rate": 5e-06, "loss": 0.5397, "step": 4820 }, { "epoch": 2.752136752136752, "grad_norm": 0.5892611711545225, "learning_rate": 5e-06, "loss": 0.5427, "step": 4830 }, { "epoch": 2.757834757834758, "grad_norm": 0.5320349130904972, "learning_rate": 5e-06, "loss": 0.5322, "step": 4840 }, { "epoch": 2.763532763532764, "grad_norm": 0.5215197760783008, "learning_rate": 5e-06, "loss": 0.5196, "step": 4850 }, { "epoch": 2.769230769230769, "grad_norm": 0.5967746123628929, "learning_rate": 5e-06, "loss": 0.5349, "step": 4860 }, { "epoch": 2.774928774928775, "grad_norm": 0.5303530858087516, "learning_rate": 5e-06, "loss": 0.5288, "step": 4870 }, { "epoch": 2.780626780626781, "grad_norm": 0.5294938033518871, "learning_rate": 5e-06, "loss": 0.5254, "step": 4880 }, { "epoch": 2.786324786324786, "grad_norm": 0.6085557642175643, "learning_rate": 5e-06, "loss": 0.5362, "step": 4890 }, { "epoch": 2.792022792022792, "grad_norm": 0.5563638209032657, "learning_rate": 5e-06, "loss": 0.5243, "step": 4900 }, { "epoch": 2.797720797720798, "grad_norm": 0.5426535982775469, "learning_rate": 5e-06, "loss": 0.5302, "step": 4910 }, { "epoch": 2.8034188034188032, "grad_norm": 0.5606166025371381, "learning_rate": 5e-06, "loss": 0.5195, "step": 4920 }, { "epoch": 2.809116809116809, "grad_norm": 0.5600176374437925, "learning_rate": 5e-06, "loss": 0.5339, "step": 4930 }, { "epoch": 2.814814814814815, "grad_norm": 0.5735203266072578, "learning_rate": 5e-06, "loss": 0.5463, "step": 4940 }, { "epoch": 2.8205128205128203, "grad_norm": 0.5647627688966846, "learning_rate": 5e-06, "loss": 0.5342, "step": 4950 }, { "epoch": 2.826210826210826, "grad_norm": 0.6181052514822875, "learning_rate": 5e-06, "loss": 0.5502, "step": 4960 }, { "epoch": 2.831908831908832, "grad_norm": 0.5487589815910356, "learning_rate": 5e-06, "loss": 0.5332, "step": 4970 }, { "epoch": 2.8376068376068377, "grad_norm": 0.5519304274572768, "learning_rate": 5e-06, "loss": 0.5342, "step": 4980 }, { "epoch": 2.8433048433048436, "grad_norm": 0.5710774380484754, "learning_rate": 5e-06, "loss": 0.5468, "step": 4990 }, { "epoch": 2.849002849002849, "grad_norm": 0.5396253108717034, "learning_rate": 5e-06, "loss": 0.536, "step": 5000 }, { "epoch": 2.8547008547008548, "grad_norm": 0.5481621751659937, "learning_rate": 5e-06, "loss": 0.5361, "step": 5010 }, { "epoch": 2.8603988603988606, "grad_norm": 0.5815133705980525, "learning_rate": 5e-06, "loss": 0.5321, "step": 5020 }, { "epoch": 2.866096866096866, "grad_norm": 0.5408578285161547, "learning_rate": 5e-06, "loss": 0.5367, "step": 5030 }, { "epoch": 2.871794871794872, "grad_norm": 0.5405279703831611, "learning_rate": 5e-06, "loss": 0.532, "step": 5040 }, { "epoch": 2.8774928774928776, "grad_norm": 0.5566749988018465, "learning_rate": 5e-06, "loss": 0.5374, "step": 5050 }, { "epoch": 2.883190883190883, "grad_norm": 0.5806758592562609, "learning_rate": 5e-06, "loss": 0.5425, "step": 5060 }, { "epoch": 2.888888888888889, "grad_norm": 0.5820389002607862, "learning_rate": 5e-06, "loss": 0.5329, "step": 5070 }, { "epoch": 2.8945868945868947, "grad_norm": 0.5375342327708015, "learning_rate": 5e-06, "loss": 0.5404, "step": 5080 }, { "epoch": 2.9002849002849, "grad_norm": 0.5641024886824925, "learning_rate": 5e-06, "loss": 0.5206, "step": 5090 }, { "epoch": 2.905982905982906, "grad_norm": 0.5595993132067282, "learning_rate": 5e-06, "loss": 0.5456, "step": 5100 }, { "epoch": 2.9116809116809117, "grad_norm": 0.5729657514196825, "learning_rate": 5e-06, "loss": 0.5303, "step": 5110 }, { "epoch": 2.9173789173789175, "grad_norm": 0.5592258039441389, "learning_rate": 5e-06, "loss": 0.5137, "step": 5120 }, { "epoch": 2.9230769230769234, "grad_norm": 0.5482964902412071, "learning_rate": 5e-06, "loss": 0.5379, "step": 5130 }, { "epoch": 2.9287749287749287, "grad_norm": 0.5336701580376303, "learning_rate": 5e-06, "loss": 0.516, "step": 5140 }, { "epoch": 2.9344729344729346, "grad_norm": 0.573991652444628, "learning_rate": 5e-06, "loss": 0.5197, "step": 5150 }, { "epoch": 2.9401709401709404, "grad_norm": 0.5656512132917955, "learning_rate": 5e-06, "loss": 0.5299, "step": 5160 }, { "epoch": 2.9458689458689458, "grad_norm": 0.5637897139695605, "learning_rate": 5e-06, "loss": 0.5318, "step": 5170 }, { "epoch": 2.9515669515669516, "grad_norm": 0.5805647906397857, "learning_rate": 5e-06, "loss": 0.5348, "step": 5180 }, { "epoch": 2.9572649572649574, "grad_norm": 0.5629404743153653, "learning_rate": 5e-06, "loss": 0.5239, "step": 5190 }, { "epoch": 2.962962962962963, "grad_norm": 0.5482910577257104, "learning_rate": 5e-06, "loss": 0.5248, "step": 5200 }, { "epoch": 2.9686609686609686, "grad_norm": 0.5428900145420302, "learning_rate": 5e-06, "loss": 0.5286, "step": 5210 }, { "epoch": 2.9743589743589745, "grad_norm": 0.5426923796436356, "learning_rate": 5e-06, "loss": 0.5439, "step": 5220 }, { "epoch": 2.98005698005698, "grad_norm": 0.5421746187267816, "learning_rate": 5e-06, "loss": 0.5394, "step": 5230 }, { "epoch": 2.9857549857549857, "grad_norm": 0.5703778871540313, "learning_rate": 5e-06, "loss": 0.5359, "step": 5240 }, { "epoch": 2.9914529914529915, "grad_norm": 0.5488503690583575, "learning_rate": 5e-06, "loss": 0.5253, "step": 5250 }, { "epoch": 2.9971509971509973, "grad_norm": 0.6053670974984723, "learning_rate": 5e-06, "loss": 0.5255, "step": 5260 }, { "epoch": 3.0, "eval_loss": 0.6272810697555542, "eval_runtime": 446.6687, "eval_samples_per_second": 26.469, "eval_steps_per_second": 0.414, "step": 5265 }, { "epoch": 3.0, "step": 5265, "total_flos": 2759937528692736.0, "train_loss": 0.5805289232719545, "train_runtime": 71924.3439, "train_samples_per_second": 9.369, "train_steps_per_second": 0.073 } ], "logging_steps": 10, "max_steps": 5265, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2759937528692736.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }