{ "best_global_step": 14858, "best_metric": 0.3537425398826599, "best_model_checkpoint": "saves_bts_preliminary/freeze/llama-3.2-1b-instruct/train_record_42_1779354540/checkpoint-14858", "epoch": 1.0, "eval_steps": 782, "global_step": 15621, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0003200819409768901, "grad_norm": 655.673828125, "learning_rate": 5.118362124120281e-09, "loss": 2.1603, "num_input_tokens_seen": 15360, "step": 5 }, { "epoch": 0.0006401638819537802, "grad_norm": 461.5277099609375, "learning_rate": 1.1516314779270634e-08, "loss": 2.344, "num_input_tokens_seen": 31104, "step": 10 }, { "epoch": 0.0009602458229306702, "grad_norm": 540.4111938476562, "learning_rate": 1.7914267434420987e-08, "loss": 2.115, "num_input_tokens_seen": 46208, "step": 15 }, { "epoch": 0.0012803277639075604, "grad_norm": 371.52410888671875, "learning_rate": 2.431222008957134e-08, "loss": 2.741, "num_input_tokens_seen": 62464, "step": 20 }, { "epoch": 0.0016004097048844504, "grad_norm": 420.4732666015625, "learning_rate": 3.071017274472169e-08, "loss": 2.0952, "num_input_tokens_seen": 79104, "step": 25 }, { "epoch": 0.0019204916458613404, "grad_norm": 360.0107421875, "learning_rate": 3.710812539987204e-08, "loss": 2.1934, "num_input_tokens_seen": 94912, "step": 30 }, { "epoch": 0.0022405735868382304, "grad_norm": 533.6338500976562, "learning_rate": 4.350607805502239e-08, "loss": 2.3371, "num_input_tokens_seen": 110784, "step": 35 }, { "epoch": 0.002560655527815121, "grad_norm": 316.48663330078125, "learning_rate": 4.990403071017274e-08, "loss": 2.1424, "num_input_tokens_seen": 125696, "step": 40 }, { "epoch": 0.002880737468792011, "grad_norm": 393.286865234375, "learning_rate": 5.6301983365323095e-08, "loss": 2.0945, "num_input_tokens_seen": 140672, "step": 45 }, { "epoch": 0.003200819409768901, "grad_norm": 386.58819580078125, "learning_rate": 6.269993602047345e-08, "loss": 2.0027, "num_input_tokens_seen": 155456, "step": 50 }, { "epoch": 0.003520901350745791, "grad_norm": 368.06884765625, "learning_rate": 6.90978886756238e-08, "loss": 1.915, "num_input_tokens_seen": 170816, "step": 55 }, { "epoch": 0.003840983291722681, "grad_norm": 332.34002685546875, "learning_rate": 7.549584133077414e-08, "loss": 2.0244, "num_input_tokens_seen": 185088, "step": 60 }, { "epoch": 0.004161065232699571, "grad_norm": 362.953125, "learning_rate": 8.18937939859245e-08, "loss": 1.6385, "num_input_tokens_seen": 200384, "step": 65 }, { "epoch": 0.004481147173676461, "grad_norm": 266.420166015625, "learning_rate": 8.829174664107485e-08, "loss": 1.6591, "num_input_tokens_seen": 215744, "step": 70 }, { "epoch": 0.004801229114653352, "grad_norm": 168.38560485839844, "learning_rate": 9.468969929622521e-08, "loss": 1.6555, "num_input_tokens_seen": 230400, "step": 75 }, { "epoch": 0.005121311055630242, "grad_norm": 282.5287780761719, "learning_rate": 1.0108765195137556e-07, "loss": 1.3232, "num_input_tokens_seen": 246592, "step": 80 }, { "epoch": 0.005441392996607132, "grad_norm": 106.96839141845703, "learning_rate": 1.074856046065259e-07, "loss": 1.1532, "num_input_tokens_seen": 262272, "step": 85 }, { "epoch": 0.005761474937584022, "grad_norm": 119.50403594970703, "learning_rate": 1.1388355726167625e-07, "loss": 1.0452, "num_input_tokens_seen": 277760, "step": 90 }, { "epoch": 0.006081556878560912, "grad_norm": 165.79542541503906, "learning_rate": 1.202815099168266e-07, "loss": 1.2493, "num_input_tokens_seen": 292992, "step": 95 }, { "epoch": 0.006401638819537802, "grad_norm": 155.2497100830078, "learning_rate": 1.2667946257197694e-07, "loss": 1.1191, "num_input_tokens_seen": 307840, "step": 100 }, { "epoch": 0.006721720760514692, "grad_norm": 112.60347747802734, "learning_rate": 1.3307741522712732e-07, "loss": 1.0359, "num_input_tokens_seen": 323008, "step": 105 }, { "epoch": 0.007041802701491582, "grad_norm": 88.95298767089844, "learning_rate": 1.3947536788227767e-07, "loss": 1.0546, "num_input_tokens_seen": 339456, "step": 110 }, { "epoch": 0.007361884642468472, "grad_norm": 87.05043029785156, "learning_rate": 1.45873320537428e-07, "loss": 1.1286, "num_input_tokens_seen": 354816, "step": 115 }, { "epoch": 0.007681966583445362, "grad_norm": 77.32754516601562, "learning_rate": 1.5227127319257838e-07, "loss": 0.8243, "num_input_tokens_seen": 369472, "step": 120 }, { "epoch": 0.008002048524422252, "grad_norm": 69.6989974975586, "learning_rate": 1.586692258477287e-07, "loss": 0.9582, "num_input_tokens_seen": 384768, "step": 125 }, { "epoch": 0.008322130465399142, "grad_norm": 96.46429443359375, "learning_rate": 1.6506717850287908e-07, "loss": 1.0307, "num_input_tokens_seen": 400192, "step": 130 }, { "epoch": 0.008642212406376032, "grad_norm": 118.02337646484375, "learning_rate": 1.7146513115802943e-07, "loss": 0.8953, "num_input_tokens_seen": 416640, "step": 135 }, { "epoch": 0.008962294347352922, "grad_norm": 65.88743591308594, "learning_rate": 1.7786308381317976e-07, "loss": 0.8263, "num_input_tokens_seen": 432640, "step": 140 }, { "epoch": 0.009282376288329812, "grad_norm": 77.22103881835938, "learning_rate": 1.8426103646833014e-07, "loss": 0.8971, "num_input_tokens_seen": 448640, "step": 145 }, { "epoch": 0.009602458229306703, "grad_norm": 88.69629669189453, "learning_rate": 1.9065898912348046e-07, "loss": 0.9544, "num_input_tokens_seen": 464448, "step": 150 }, { "epoch": 0.009922540170283593, "grad_norm": 86.22632598876953, "learning_rate": 1.9705694177863084e-07, "loss": 0.8598, "num_input_tokens_seen": 479488, "step": 155 }, { "epoch": 0.010242622111260483, "grad_norm": 55.39344787597656, "learning_rate": 2.034548944337812e-07, "loss": 0.7343, "num_input_tokens_seen": 495296, "step": 160 }, { "epoch": 0.010562704052237373, "grad_norm": 87.78097534179688, "learning_rate": 2.0985284708893152e-07, "loss": 0.7845, "num_input_tokens_seen": 510144, "step": 165 }, { "epoch": 0.010882785993214263, "grad_norm": 80.47422790527344, "learning_rate": 2.162507997440819e-07, "loss": 0.8491, "num_input_tokens_seen": 524928, "step": 170 }, { "epoch": 0.011202867934191153, "grad_norm": 45.75130081176758, "learning_rate": 2.2264875239923222e-07, "loss": 0.7122, "num_input_tokens_seen": 541504, "step": 175 }, { "epoch": 0.011522949875168043, "grad_norm": 81.46015167236328, "learning_rate": 2.290467050543826e-07, "loss": 0.7354, "num_input_tokens_seen": 556096, "step": 180 }, { "epoch": 0.011843031816144933, "grad_norm": 77.93597412109375, "learning_rate": 2.3544465770953295e-07, "loss": 0.734, "num_input_tokens_seen": 572736, "step": 185 }, { "epoch": 0.012163113757121823, "grad_norm": 73.0274658203125, "learning_rate": 2.418426103646833e-07, "loss": 0.8565, "num_input_tokens_seen": 588352, "step": 190 }, { "epoch": 0.012483195698098713, "grad_norm": 56.91474533081055, "learning_rate": 2.4824056301983363e-07, "loss": 0.9816, "num_input_tokens_seen": 603520, "step": 195 }, { "epoch": 0.012803277639075603, "grad_norm": 66.9703369140625, "learning_rate": 2.54638515674984e-07, "loss": 0.8158, "num_input_tokens_seen": 619392, "step": 200 }, { "epoch": 0.013123359580052493, "grad_norm": 59.1487922668457, "learning_rate": 2.6103646833013433e-07, "loss": 0.8032, "num_input_tokens_seen": 635456, "step": 205 }, { "epoch": 0.013443441521029383, "grad_norm": 121.4522705078125, "learning_rate": 2.6743442098528466e-07, "loss": 0.8716, "num_input_tokens_seen": 650880, "step": 210 }, { "epoch": 0.013763523462006273, "grad_norm": 50.31541442871094, "learning_rate": 2.7383237364043504e-07, "loss": 0.8278, "num_input_tokens_seen": 666688, "step": 215 }, { "epoch": 0.014083605402983163, "grad_norm": 70.05236053466797, "learning_rate": 2.802303262955854e-07, "loss": 0.7898, "num_input_tokens_seen": 682112, "step": 220 }, { "epoch": 0.014403687343960053, "grad_norm": 64.9844741821289, "learning_rate": 2.866282789507358e-07, "loss": 0.8381, "num_input_tokens_seen": 697728, "step": 225 }, { "epoch": 0.014723769284936943, "grad_norm": 53.501747131347656, "learning_rate": 2.9302623160588607e-07, "loss": 0.6829, "num_input_tokens_seen": 712704, "step": 230 }, { "epoch": 0.015043851225913833, "grad_norm": 91.16888427734375, "learning_rate": 2.9942418426103644e-07, "loss": 0.9619, "num_input_tokens_seen": 729408, "step": 235 }, { "epoch": 0.015363933166890723, "grad_norm": 89.74860382080078, "learning_rate": 3.058221369161868e-07, "loss": 0.7854, "num_input_tokens_seen": 745344, "step": 240 }, { "epoch": 0.015684015107867613, "grad_norm": 56.365665435791016, "learning_rate": 3.1222008957133715e-07, "loss": 0.6965, "num_input_tokens_seen": 762688, "step": 245 }, { "epoch": 0.016004097048844503, "grad_norm": 62.77731704711914, "learning_rate": 3.186180422264875e-07, "loss": 0.7105, "num_input_tokens_seen": 779392, "step": 250 }, { "epoch": 0.016324178989821393, "grad_norm": 80.97101593017578, "learning_rate": 3.2501599488163785e-07, "loss": 0.7964, "num_input_tokens_seen": 794112, "step": 255 }, { "epoch": 0.016644260930798283, "grad_norm": 50.28890609741211, "learning_rate": 3.314139475367882e-07, "loss": 0.8427, "num_input_tokens_seen": 810112, "step": 260 }, { "epoch": 0.016964342871775173, "grad_norm": 79.30187225341797, "learning_rate": 3.3781190019193855e-07, "loss": 0.8614, "num_input_tokens_seen": 825472, "step": 265 }, { "epoch": 0.017284424812752063, "grad_norm": 69.35704803466797, "learning_rate": 3.4420985284708893e-07, "loss": 0.9819, "num_input_tokens_seen": 840128, "step": 270 }, { "epoch": 0.017604506753728953, "grad_norm": 70.34232330322266, "learning_rate": 3.5060780550223926e-07, "loss": 0.7825, "num_input_tokens_seen": 855104, "step": 275 }, { "epoch": 0.017924588694705843, "grad_norm": 67.7530517578125, "learning_rate": 3.570057581573896e-07, "loss": 0.8069, "num_input_tokens_seen": 870848, "step": 280 }, { "epoch": 0.018244670635682733, "grad_norm": 46.21129608154297, "learning_rate": 3.6340371081253996e-07, "loss": 0.7403, "num_input_tokens_seen": 885760, "step": 285 }, { "epoch": 0.018564752576659623, "grad_norm": 44.078643798828125, "learning_rate": 3.6980166346769034e-07, "loss": 0.7078, "num_input_tokens_seen": 900928, "step": 290 }, { "epoch": 0.018884834517636517, "grad_norm": 54.419532775878906, "learning_rate": 3.7619961612284067e-07, "loss": 0.793, "num_input_tokens_seen": 915968, "step": 295 }, { "epoch": 0.019204916458613407, "grad_norm": 107.00920867919922, "learning_rate": 3.8259756877799104e-07, "loss": 0.9919, "num_input_tokens_seen": 933056, "step": 300 }, { "epoch": 0.019524998399590297, "grad_norm": 84.30803680419922, "learning_rate": 3.889955214331414e-07, "loss": 0.7373, "num_input_tokens_seen": 948416, "step": 305 }, { "epoch": 0.019845080340567187, "grad_norm": 65.89620971679688, "learning_rate": 3.953934740882917e-07, "loss": 0.7694, "num_input_tokens_seen": 962880, "step": 310 }, { "epoch": 0.020165162281544077, "grad_norm": 58.68693923950195, "learning_rate": 4.0179142674344207e-07, "loss": 0.8088, "num_input_tokens_seen": 979904, "step": 315 }, { "epoch": 0.020485244222520967, "grad_norm": 64.4815902709961, "learning_rate": 4.0818937939859245e-07, "loss": 0.8251, "num_input_tokens_seen": 995136, "step": 320 }, { "epoch": 0.020805326163497857, "grad_norm": 59.8892707824707, "learning_rate": 4.145873320537428e-07, "loss": 0.7695, "num_input_tokens_seen": 1011008, "step": 325 }, { "epoch": 0.021125408104474747, "grad_norm": 61.05699157714844, "learning_rate": 4.2098528470889315e-07, "loss": 0.8335, "num_input_tokens_seen": 1025792, "step": 330 }, { "epoch": 0.021445490045451637, "grad_norm": 54.53645324707031, "learning_rate": 4.273832373640435e-07, "loss": 0.6901, "num_input_tokens_seen": 1042944, "step": 335 }, { "epoch": 0.021765571986428527, "grad_norm": 69.49205017089844, "learning_rate": 4.3378119001919386e-07, "loss": 0.8267, "num_input_tokens_seen": 1058688, "step": 340 }, { "epoch": 0.022085653927405417, "grad_norm": 52.010841369628906, "learning_rate": 4.401791426743442e-07, "loss": 0.7233, "num_input_tokens_seen": 1074560, "step": 345 }, { "epoch": 0.022405735868382307, "grad_norm": 76.15229034423828, "learning_rate": 4.4657709532949456e-07, "loss": 0.6991, "num_input_tokens_seen": 1089728, "step": 350 }, { "epoch": 0.022725817809359197, "grad_norm": 93.6746597290039, "learning_rate": 4.5297504798464494e-07, "loss": 0.9114, "num_input_tokens_seen": 1105024, "step": 355 }, { "epoch": 0.023045899750336087, "grad_norm": 51.18860626220703, "learning_rate": 4.593730006397952e-07, "loss": 0.7824, "num_input_tokens_seen": 1121088, "step": 360 }, { "epoch": 0.023365981691312977, "grad_norm": 51.50726318359375, "learning_rate": 4.657709532949456e-07, "loss": 0.7048, "num_input_tokens_seen": 1136896, "step": 365 }, { "epoch": 0.023686063632289867, "grad_norm": 61.41284942626953, "learning_rate": 4.7216890595009597e-07, "loss": 0.7082, "num_input_tokens_seen": 1153280, "step": 370 }, { "epoch": 0.024006145573266757, "grad_norm": 56.15473175048828, "learning_rate": 4.785668586052463e-07, "loss": 0.8338, "num_input_tokens_seen": 1169536, "step": 375 }, { "epoch": 0.024326227514243647, "grad_norm": 87.2235107421875, "learning_rate": 4.849648112603967e-07, "loss": 0.7577, "num_input_tokens_seen": 1185088, "step": 380 }, { "epoch": 0.024646309455220537, "grad_norm": 35.1290397644043, "learning_rate": 4.91362763915547e-07, "loss": 0.6664, "num_input_tokens_seen": 1200832, "step": 385 }, { "epoch": 0.024966391396197427, "grad_norm": 50.34434509277344, "learning_rate": 4.977607165706974e-07, "loss": 0.6605, "num_input_tokens_seen": 1216320, "step": 390 }, { "epoch": 0.025286473337174317, "grad_norm": 61.464664459228516, "learning_rate": 5.041586692258478e-07, "loss": 0.7361, "num_input_tokens_seen": 1232832, "step": 395 }, { "epoch": 0.025606555278151207, "grad_norm": 49.942779541015625, "learning_rate": 5.10556621880998e-07, "loss": 0.7037, "num_input_tokens_seen": 1248384, "step": 400 }, { "epoch": 0.025926637219128097, "grad_norm": 35.994441986083984, "learning_rate": 5.169545745361484e-07, "loss": 0.6727, "num_input_tokens_seen": 1263936, "step": 405 }, { "epoch": 0.026246719160104987, "grad_norm": 65.4025650024414, "learning_rate": 5.233525271912988e-07, "loss": 1.118, "num_input_tokens_seen": 1294208, "step": 410 }, { "epoch": 0.026566801101081877, "grad_norm": 63.024566650390625, "learning_rate": 5.297504798464492e-07, "loss": 0.7921, "num_input_tokens_seen": 1309120, "step": 415 }, { "epoch": 0.026886883042058767, "grad_norm": 56.8184814453125, "learning_rate": 5.361484325015994e-07, "loss": 0.8592, "num_input_tokens_seen": 1324224, "step": 420 }, { "epoch": 0.027206964983035656, "grad_norm": 74.34542846679688, "learning_rate": 5.425463851567498e-07, "loss": 0.6829, "num_input_tokens_seen": 1341056, "step": 425 }, { "epoch": 0.027527046924012546, "grad_norm": 52.57733154296875, "learning_rate": 5.489443378119002e-07, "loss": 0.7533, "num_input_tokens_seen": 1356544, "step": 430 }, { "epoch": 0.027847128864989436, "grad_norm": 62.9859733581543, "learning_rate": 5.553422904670505e-07, "loss": 0.6696, "num_input_tokens_seen": 1371840, "step": 435 }, { "epoch": 0.028167210805966326, "grad_norm": 55.78180694580078, "learning_rate": 5.61740243122201e-07, "loss": 0.6825, "num_input_tokens_seen": 1386816, "step": 440 }, { "epoch": 0.028487292746943216, "grad_norm": 64.4063720703125, "learning_rate": 5.681381957773512e-07, "loss": 0.7438, "num_input_tokens_seen": 1401792, "step": 445 }, { "epoch": 0.028807374687920106, "grad_norm": 66.20137023925781, "learning_rate": 5.745361484325015e-07, "loss": 0.6214, "num_input_tokens_seen": 1416896, "step": 450 }, { "epoch": 0.029127456628896996, "grad_norm": 82.4999008178711, "learning_rate": 5.80934101087652e-07, "loss": 0.7517, "num_input_tokens_seen": 1432704, "step": 455 }, { "epoch": 0.029447538569873886, "grad_norm": 85.98738861083984, "learning_rate": 5.873320537428022e-07, "loss": 0.7009, "num_input_tokens_seen": 1448384, "step": 460 }, { "epoch": 0.029767620510850776, "grad_norm": 60.4025764465332, "learning_rate": 5.937300063979526e-07, "loss": 0.7179, "num_input_tokens_seen": 1464832, "step": 465 }, { "epoch": 0.030087702451827666, "grad_norm": 69.6055908203125, "learning_rate": 6.00127959053103e-07, "loss": 0.6785, "num_input_tokens_seen": 1479424, "step": 470 }, { "epoch": 0.030407784392804556, "grad_norm": 40.376953125, "learning_rate": 6.065259117082533e-07, "loss": 0.7292, "num_input_tokens_seen": 1494336, "step": 475 }, { "epoch": 0.030727866333781446, "grad_norm": 53.5233154296875, "learning_rate": 6.129238643634037e-07, "loss": 0.6741, "num_input_tokens_seen": 1509184, "step": 480 }, { "epoch": 0.031047948274758336, "grad_norm": 36.17082214355469, "learning_rate": 6.19321817018554e-07, "loss": 0.8032, "num_input_tokens_seen": 1525504, "step": 485 }, { "epoch": 0.031368030215735226, "grad_norm": 39.83842468261719, "learning_rate": 6.257197696737044e-07, "loss": 0.5911, "num_input_tokens_seen": 1541504, "step": 490 }, { "epoch": 0.03168811215671212, "grad_norm": 38.20148849487305, "learning_rate": 6.321177223288548e-07, "loss": 0.6188, "num_input_tokens_seen": 1557184, "step": 495 }, { "epoch": 0.032008194097689006, "grad_norm": 58.953765869140625, "learning_rate": 6.385156749840051e-07, "loss": 0.7662, "num_input_tokens_seen": 1573440, "step": 500 }, { "epoch": 0.0323282760386659, "grad_norm": 34.08373260498047, "learning_rate": 6.449136276391554e-07, "loss": 0.8712, "num_input_tokens_seen": 1588736, "step": 505 }, { "epoch": 0.032648357979642786, "grad_norm": 42.26185989379883, "learning_rate": 6.513115802943058e-07, "loss": 0.6979, "num_input_tokens_seen": 1604352, "step": 510 }, { "epoch": 0.03296843992061968, "grad_norm": 39.405391693115234, "learning_rate": 6.577095329494562e-07, "loss": 0.6574, "num_input_tokens_seen": 1618816, "step": 515 }, { "epoch": 0.033288521861596566, "grad_norm": 68.6015853881836, "learning_rate": 6.641074856046065e-07, "loss": 0.7462, "num_input_tokens_seen": 1635648, "step": 520 }, { "epoch": 0.03360860380257346, "grad_norm": 51.983734130859375, "learning_rate": 6.705054382597568e-07, "loss": 0.719, "num_input_tokens_seen": 1651328, "step": 525 }, { "epoch": 0.033928685743550346, "grad_norm": 41.2055549621582, "learning_rate": 6.769033909149072e-07, "loss": 0.7345, "num_input_tokens_seen": 1668928, "step": 530 }, { "epoch": 0.03424876768452724, "grad_norm": 49.78337478637695, "learning_rate": 6.833013435700575e-07, "loss": 0.6202, "num_input_tokens_seen": 1685504, "step": 535 }, { "epoch": 0.034568849625504126, "grad_norm": 56.454078674316406, "learning_rate": 6.89699296225208e-07, "loss": 0.7053, "num_input_tokens_seen": 1701952, "step": 540 }, { "epoch": 0.03488893156648102, "grad_norm": 57.62031173706055, "learning_rate": 6.960972488803583e-07, "loss": 0.7308, "num_input_tokens_seen": 1716992, "step": 545 }, { "epoch": 0.035209013507457906, "grad_norm": 26.579238891601562, "learning_rate": 7.024952015355085e-07, "loss": 0.5835, "num_input_tokens_seen": 1732160, "step": 550 }, { "epoch": 0.0355290954484348, "grad_norm": 62.191402435302734, "learning_rate": 7.08893154190659e-07, "loss": 0.6553, "num_input_tokens_seen": 1748416, "step": 555 }, { "epoch": 0.035849177389411686, "grad_norm": 47.643890380859375, "learning_rate": 7.152911068458093e-07, "loss": 0.7096, "num_input_tokens_seen": 1763776, "step": 560 }, { "epoch": 0.03616925933038858, "grad_norm": 53.94837188720703, "learning_rate": 7.216890595009597e-07, "loss": 0.6985, "num_input_tokens_seen": 1780160, "step": 565 }, { "epoch": 0.036489341271365466, "grad_norm": 49.82310104370117, "learning_rate": 7.2808701215611e-07, "loss": 0.6057, "num_input_tokens_seen": 1795968, "step": 570 }, { "epoch": 0.03680942321234236, "grad_norm": 45.038936614990234, "learning_rate": 7.344849648112603e-07, "loss": 0.6327, "num_input_tokens_seen": 1815424, "step": 575 }, { "epoch": 0.037129505153319246, "grad_norm": 86.25282287597656, "learning_rate": 7.408829174664107e-07, "loss": 0.8275, "num_input_tokens_seen": 1831936, "step": 580 }, { "epoch": 0.03744958709429614, "grad_norm": 33.02293014526367, "learning_rate": 7.472808701215611e-07, "loss": 0.6155, "num_input_tokens_seen": 1847424, "step": 585 }, { "epoch": 0.03776966903527303, "grad_norm": 46.377925872802734, "learning_rate": 7.536788227767114e-07, "loss": 0.7381, "num_input_tokens_seen": 1862400, "step": 590 }, { "epoch": 0.03808975097624992, "grad_norm": 83.56999969482422, "learning_rate": 7.600767754318617e-07, "loss": 0.7694, "num_input_tokens_seen": 1876928, "step": 595 }, { "epoch": 0.03840983291722681, "grad_norm": 52.600372314453125, "learning_rate": 7.664747280870121e-07, "loss": 0.6363, "num_input_tokens_seen": 1892608, "step": 600 }, { "epoch": 0.0387299148582037, "grad_norm": 35.62962341308594, "learning_rate": 7.728726807421625e-07, "loss": 0.7292, "num_input_tokens_seen": 1909696, "step": 605 }, { "epoch": 0.03904999679918059, "grad_norm": 38.61429214477539, "learning_rate": 7.792706333973129e-07, "loss": 0.7601, "num_input_tokens_seen": 1924864, "step": 610 }, { "epoch": 0.03937007874015748, "grad_norm": 35.40009689331055, "learning_rate": 7.856685860524632e-07, "loss": 0.5592, "num_input_tokens_seen": 1939968, "step": 615 }, { "epoch": 0.03969016068113437, "grad_norm": 69.34685516357422, "learning_rate": 7.920665387076135e-07, "loss": 0.7152, "num_input_tokens_seen": 1955136, "step": 620 }, { "epoch": 0.04001024262211126, "grad_norm": 42.532005310058594, "learning_rate": 7.984644913627639e-07, "loss": 0.7036, "num_input_tokens_seen": 1970880, "step": 625 }, { "epoch": 0.04033032456308815, "grad_norm": 61.369667053222656, "learning_rate": 8.048624440179143e-07, "loss": 0.5794, "num_input_tokens_seen": 1986752, "step": 630 }, { "epoch": 0.04065040650406504, "grad_norm": 39.555450439453125, "learning_rate": 8.112603966730645e-07, "loss": 0.5994, "num_input_tokens_seen": 2001856, "step": 635 }, { "epoch": 0.04097048844504193, "grad_norm": 48.69257354736328, "learning_rate": 8.17658349328215e-07, "loss": 0.6586, "num_input_tokens_seen": 2019968, "step": 640 }, { "epoch": 0.04129057038601882, "grad_norm": 57.668907165527344, "learning_rate": 8.240563019833653e-07, "loss": 0.7047, "num_input_tokens_seen": 2035328, "step": 645 }, { "epoch": 0.04161065232699571, "grad_norm": 43.12187576293945, "learning_rate": 8.304542546385156e-07, "loss": 0.6282, "num_input_tokens_seen": 2055168, "step": 650 }, { "epoch": 0.0419307342679726, "grad_norm": 43.47643280029297, "learning_rate": 8.36852207293666e-07, "loss": 0.7521, "num_input_tokens_seen": 2071808, "step": 655 }, { "epoch": 0.04225081620894949, "grad_norm": 44.587730407714844, "learning_rate": 8.432501599488163e-07, "loss": 0.6527, "num_input_tokens_seen": 2087424, "step": 660 }, { "epoch": 0.04257089814992638, "grad_norm": 35.20018768310547, "learning_rate": 8.496481126039667e-07, "loss": 0.7682, "num_input_tokens_seen": 2102592, "step": 665 }, { "epoch": 0.04289098009090327, "grad_norm": 44.10483169555664, "learning_rate": 8.560460652591171e-07, "loss": 0.6517, "num_input_tokens_seen": 2119488, "step": 670 }, { "epoch": 0.04321106203188016, "grad_norm": 37.69010543823242, "learning_rate": 8.624440179142674e-07, "loss": 0.6454, "num_input_tokens_seen": 2136000, "step": 675 }, { "epoch": 0.04353114397285705, "grad_norm": 47.20091247558594, "learning_rate": 8.688419705694177e-07, "loss": 0.7404, "num_input_tokens_seen": 2152448, "step": 680 }, { "epoch": 0.04385122591383394, "grad_norm": 44.33426284790039, "learning_rate": 8.752399232245681e-07, "loss": 0.6177, "num_input_tokens_seen": 2168000, "step": 685 }, { "epoch": 0.04417130785481083, "grad_norm": 42.24176025390625, "learning_rate": 8.816378758797185e-07, "loss": 0.5953, "num_input_tokens_seen": 2183552, "step": 690 }, { "epoch": 0.04449138979578772, "grad_norm": 52.65016174316406, "learning_rate": 8.880358285348688e-07, "loss": 0.7135, "num_input_tokens_seen": 2199488, "step": 695 }, { "epoch": 0.04481147173676461, "grad_norm": 36.20340347290039, "learning_rate": 8.944337811900191e-07, "loss": 0.6167, "num_input_tokens_seen": 2215296, "step": 700 }, { "epoch": 0.0451315536777415, "grad_norm": 50.2882080078125, "learning_rate": 9.008317338451695e-07, "loss": 0.7051, "num_input_tokens_seen": 2230016, "step": 705 }, { "epoch": 0.04545163561871839, "grad_norm": 48.945701599121094, "learning_rate": 9.072296865003198e-07, "loss": 0.6629, "num_input_tokens_seen": 2245056, "step": 710 }, { "epoch": 0.04577171755969528, "grad_norm": 35.3903923034668, "learning_rate": 9.136276391554703e-07, "loss": 0.6166, "num_input_tokens_seen": 2261248, "step": 715 }, { "epoch": 0.04609179950067217, "grad_norm": 57.04933547973633, "learning_rate": 9.200255918106205e-07, "loss": 0.6516, "num_input_tokens_seen": 2278016, "step": 720 }, { "epoch": 0.04641188144164906, "grad_norm": 46.2874755859375, "learning_rate": 9.264235444657708e-07, "loss": 0.5696, "num_input_tokens_seen": 2292800, "step": 725 }, { "epoch": 0.04673196338262595, "grad_norm": 34.89374542236328, "learning_rate": 9.328214971209213e-07, "loss": 0.6049, "num_input_tokens_seen": 2308224, "step": 730 }, { "epoch": 0.04705204532360284, "grad_norm": 39.93567657470703, "learning_rate": 9.392194497760716e-07, "loss": 0.7005, "num_input_tokens_seen": 2325760, "step": 735 }, { "epoch": 0.04737212726457973, "grad_norm": 56.7358512878418, "learning_rate": 9.456174024312221e-07, "loss": 0.6971, "num_input_tokens_seen": 2341632, "step": 740 }, { "epoch": 0.04769220920555662, "grad_norm": 38.78962326049805, "learning_rate": 9.520153550863723e-07, "loss": 0.7066, "num_input_tokens_seen": 2357504, "step": 745 }, { "epoch": 0.04801229114653351, "grad_norm": 42.24749755859375, "learning_rate": 9.584133077415226e-07, "loss": 0.7294, "num_input_tokens_seen": 2372608, "step": 750 }, { "epoch": 0.0483323730875104, "grad_norm": 42.99443817138672, "learning_rate": 9.64811260396673e-07, "loss": 0.587, "num_input_tokens_seen": 2388352, "step": 755 }, { "epoch": 0.04865245502848729, "grad_norm": 46.318416595458984, "learning_rate": 9.712092130518234e-07, "loss": 0.6934, "num_input_tokens_seen": 2404480, "step": 760 }, { "epoch": 0.04897253696946418, "grad_norm": 32.9005126953125, "learning_rate": 9.776071657069737e-07, "loss": 0.518, "num_input_tokens_seen": 2419648, "step": 765 }, { "epoch": 0.04929261891044107, "grad_norm": 45.313751220703125, "learning_rate": 9.840051183621241e-07, "loss": 0.7121, "num_input_tokens_seen": 2435584, "step": 770 }, { "epoch": 0.04961270085141796, "grad_norm": 57.734039306640625, "learning_rate": 9.904030710172743e-07, "loss": 0.6265, "num_input_tokens_seen": 2451072, "step": 775 }, { "epoch": 0.04993278279239485, "grad_norm": 60.701107025146484, "learning_rate": 9.968010236724249e-07, "loss": 0.764, "num_input_tokens_seen": 2467968, "step": 780 }, { "epoch": 0.05006081556878561, "eval_loss": 0.6362079381942749, "eval_runtime": 49.1703, "eval_samples_per_second": 282.406, "eval_steps_per_second": 35.306, "num_input_tokens_seen": 2474432, "step": 782 }, { "epoch": 0.05025286473337175, "grad_norm": 52.689231872558594, "learning_rate": 1.0031989763275752e-06, "loss": 0.669, "num_input_tokens_seen": 2484928, "step": 785 }, { "epoch": 0.05057294667434863, "grad_norm": 42.69588851928711, "learning_rate": 1.0095969289827256e-06, "loss": 0.6777, "num_input_tokens_seen": 2501504, "step": 790 }, { "epoch": 0.050893028615325527, "grad_norm": 32.48566436767578, "learning_rate": 1.0159948816378758e-06, "loss": 0.5188, "num_input_tokens_seen": 2518848, "step": 795 }, { "epoch": 0.05121311055630241, "grad_norm": 33.27299880981445, "learning_rate": 1.0223928342930262e-06, "loss": 0.5482, "num_input_tokens_seen": 2535680, "step": 800 }, { "epoch": 0.051533192497279307, "grad_norm": 51.12800979614258, "learning_rate": 1.0287907869481766e-06, "loss": 0.676, "num_input_tokens_seen": 2550976, "step": 805 }, { "epoch": 0.05185327443825619, "grad_norm": 39.38006591796875, "learning_rate": 1.035188739603327e-06, "loss": 0.5562, "num_input_tokens_seen": 2566656, "step": 810 }, { "epoch": 0.052173356379233086, "grad_norm": 49.9570426940918, "learning_rate": 1.0415866922584773e-06, "loss": 0.6315, "num_input_tokens_seen": 2581568, "step": 815 }, { "epoch": 0.05249343832020997, "grad_norm": 51.84290313720703, "learning_rate": 1.0479846449136277e-06, "loss": 0.6426, "num_input_tokens_seen": 2596608, "step": 820 }, { "epoch": 0.052813520261186866, "grad_norm": 42.30448532104492, "learning_rate": 1.0543825975687779e-06, "loss": 0.6719, "num_input_tokens_seen": 2612032, "step": 825 }, { "epoch": 0.05313360220216375, "grad_norm": 54.049774169921875, "learning_rate": 1.0607805502239282e-06, "loss": 0.7313, "num_input_tokens_seen": 2627264, "step": 830 }, { "epoch": 0.053453684143140646, "grad_norm": 43.845027923583984, "learning_rate": 1.0671785028790788e-06, "loss": 0.548, "num_input_tokens_seen": 2643264, "step": 835 }, { "epoch": 0.05377376608411753, "grad_norm": 43.62913131713867, "learning_rate": 1.073576455534229e-06, "loss": 0.5474, "num_input_tokens_seen": 2659264, "step": 840 }, { "epoch": 0.054093848025094426, "grad_norm": 37.99971389770508, "learning_rate": 1.0799744081893794e-06, "loss": 0.5737, "num_input_tokens_seen": 2673856, "step": 845 }, { "epoch": 0.05441392996607131, "grad_norm": 35.17848587036133, "learning_rate": 1.0863723608445297e-06, "loss": 0.4779, "num_input_tokens_seen": 2688448, "step": 850 }, { "epoch": 0.054734011907048206, "grad_norm": 69.50128173828125, "learning_rate": 1.09277031349968e-06, "loss": 0.6201, "num_input_tokens_seen": 2703872, "step": 855 }, { "epoch": 0.05505409384802509, "grad_norm": 49.573143005371094, "learning_rate": 1.0991682661548305e-06, "loss": 0.6104, "num_input_tokens_seen": 2719040, "step": 860 }, { "epoch": 0.055374175789001986, "grad_norm": 35.63096618652344, "learning_rate": 1.1055662188099809e-06, "loss": 0.6205, "num_input_tokens_seen": 2735168, "step": 865 }, { "epoch": 0.05569425772997887, "grad_norm": 38.10055160522461, "learning_rate": 1.111964171465131e-06, "loss": 0.5224, "num_input_tokens_seen": 2750592, "step": 870 }, { "epoch": 0.056014339670955766, "grad_norm": 21.403268814086914, "learning_rate": 1.1183621241202814e-06, "loss": 0.6572, "num_input_tokens_seen": 2767232, "step": 875 }, { "epoch": 0.05633442161193265, "grad_norm": 55.04920959472656, "learning_rate": 1.1247600767754318e-06, "loss": 0.665, "num_input_tokens_seen": 2784768, "step": 880 }, { "epoch": 0.056654503552909546, "grad_norm": 39.130226135253906, "learning_rate": 1.1311580294305822e-06, "loss": 0.5809, "num_input_tokens_seen": 2799872, "step": 885 }, { "epoch": 0.05697458549388643, "grad_norm": 51.871341705322266, "learning_rate": 1.1375559820857326e-06, "loss": 0.6481, "num_input_tokens_seen": 2816000, "step": 890 }, { "epoch": 0.057294667434863326, "grad_norm": 46.604705810546875, "learning_rate": 1.143953934740883e-06, "loss": 0.5859, "num_input_tokens_seen": 2831744, "step": 895 }, { "epoch": 0.05761474937584021, "grad_norm": 56.78334426879883, "learning_rate": 1.150351887396033e-06, "loss": 0.6183, "num_input_tokens_seen": 2847424, "step": 900 }, { "epoch": 0.057934831316817106, "grad_norm": 51.35699462890625, "learning_rate": 1.1567498400511835e-06, "loss": 0.616, "num_input_tokens_seen": 2862272, "step": 905 }, { "epoch": 0.05825491325779399, "grad_norm": 38.57978820800781, "learning_rate": 1.163147792706334e-06, "loss": 0.4927, "num_input_tokens_seen": 2877120, "step": 910 }, { "epoch": 0.058574995198770886, "grad_norm": 41.00065612792969, "learning_rate": 1.1695457453614842e-06, "loss": 0.5249, "num_input_tokens_seen": 2894592, "step": 915 }, { "epoch": 0.05889507713974777, "grad_norm": 40.363075256347656, "learning_rate": 1.1759436980166346e-06, "loss": 0.6159, "num_input_tokens_seen": 2909888, "step": 920 }, { "epoch": 0.059215159080724666, "grad_norm": 49.1600456237793, "learning_rate": 1.182341650671785e-06, "loss": 0.6195, "num_input_tokens_seen": 2925632, "step": 925 }, { "epoch": 0.05953524102170155, "grad_norm": 47.78977966308594, "learning_rate": 1.1887396033269352e-06, "loss": 0.6153, "num_input_tokens_seen": 2941760, "step": 930 }, { "epoch": 0.059855322962678446, "grad_norm": 47.449405670166016, "learning_rate": 1.1951375559820858e-06, "loss": 0.7076, "num_input_tokens_seen": 2957376, "step": 935 }, { "epoch": 0.06017540490365533, "grad_norm": 66.98524475097656, "learning_rate": 1.2015355086372361e-06, "loss": 0.5704, "num_input_tokens_seen": 2972800, "step": 940 }, { "epoch": 0.060495486844632226, "grad_norm": 48.29072952270508, "learning_rate": 1.2079334612923863e-06, "loss": 0.7172, "num_input_tokens_seen": 2988480, "step": 945 }, { "epoch": 0.06081556878560911, "grad_norm": 43.3856086730957, "learning_rate": 1.2143314139475367e-06, "loss": 0.6613, "num_input_tokens_seen": 3004480, "step": 950 }, { "epoch": 0.061135650726586006, "grad_norm": 38.56562423706055, "learning_rate": 1.220729366602687e-06, "loss": 0.444, "num_input_tokens_seen": 3020288, "step": 955 }, { "epoch": 0.06145573266756289, "grad_norm": 60.62529373168945, "learning_rate": 1.2271273192578374e-06, "loss": 0.6011, "num_input_tokens_seen": 3035968, "step": 960 }, { "epoch": 0.061775814608539786, "grad_norm": 61.26271438598633, "learning_rate": 1.2335252719129878e-06, "loss": 0.7411, "num_input_tokens_seen": 3051776, "step": 965 }, { "epoch": 0.06209589654951667, "grad_norm": 52.55011749267578, "learning_rate": 1.2399232245681382e-06, "loss": 0.5575, "num_input_tokens_seen": 3066560, "step": 970 }, { "epoch": 0.062415978490493566, "grad_norm": 52.49790954589844, "learning_rate": 1.2463211772232884e-06, "loss": 0.6357, "num_input_tokens_seen": 3082496, "step": 975 }, { "epoch": 0.06273606043147045, "grad_norm": 43.31839370727539, "learning_rate": 1.2527191298784387e-06, "loss": 0.6233, "num_input_tokens_seen": 3097856, "step": 980 }, { "epoch": 0.06305614237244735, "grad_norm": 25.353742599487305, "learning_rate": 1.2591170825335893e-06, "loss": 0.5062, "num_input_tokens_seen": 3113664, "step": 985 }, { "epoch": 0.06337622431342424, "grad_norm": 37.9774169921875, "learning_rate": 1.2655150351887395e-06, "loss": 0.6242, "num_input_tokens_seen": 3129792, "step": 990 }, { "epoch": 0.06369630625440113, "grad_norm": 30.752185821533203, "learning_rate": 1.2719129878438899e-06, "loss": 0.5901, "num_input_tokens_seen": 3145024, "step": 995 }, { "epoch": 0.06401638819537801, "grad_norm": 41.19409942626953, "learning_rate": 1.2783109404990402e-06, "loss": 0.7747, "num_input_tokens_seen": 3161216, "step": 1000 }, { "epoch": 0.0643364701363549, "grad_norm": 27.8523006439209, "learning_rate": 1.2847088931541904e-06, "loss": 0.4118, "num_input_tokens_seen": 3176960, "step": 1005 }, { "epoch": 0.0646565520773318, "grad_norm": 39.628929138183594, "learning_rate": 1.291106845809341e-06, "loss": 0.607, "num_input_tokens_seen": 3193088, "step": 1010 }, { "epoch": 0.0649766340183087, "grad_norm": 62.03862762451172, "learning_rate": 1.2975047984644914e-06, "loss": 0.6808, "num_input_tokens_seen": 3210112, "step": 1015 }, { "epoch": 0.06529671595928557, "grad_norm": 41.16059494018555, "learning_rate": 1.3039027511196418e-06, "loss": 0.5044, "num_input_tokens_seen": 3224768, "step": 1020 }, { "epoch": 0.06561679790026247, "grad_norm": 45.047080993652344, "learning_rate": 1.310300703774792e-06, "loss": 0.6235, "num_input_tokens_seen": 3240128, "step": 1025 }, { "epoch": 0.06593687984123936, "grad_norm": 41.879398345947266, "learning_rate": 1.3166986564299423e-06, "loss": 0.5605, "num_input_tokens_seen": 3256576, "step": 1030 }, { "epoch": 0.06625696178221625, "grad_norm": 34.385223388671875, "learning_rate": 1.3230966090850929e-06, "loss": 0.5942, "num_input_tokens_seen": 3272384, "step": 1035 }, { "epoch": 0.06657704372319313, "grad_norm": 38.94369125366211, "learning_rate": 1.329494561740243e-06, "loss": 0.4108, "num_input_tokens_seen": 3288512, "step": 1040 }, { "epoch": 0.06689712566417003, "grad_norm": 40.253990173339844, "learning_rate": 1.3358925143953934e-06, "loss": 0.4897, "num_input_tokens_seen": 3306304, "step": 1045 }, { "epoch": 0.06721720760514692, "grad_norm": 42.53627395629883, "learning_rate": 1.3422904670505438e-06, "loss": 0.4785, "num_input_tokens_seen": 3321344, "step": 1050 }, { "epoch": 0.06753728954612381, "grad_norm": 38.27849197387695, "learning_rate": 1.348688419705694e-06, "loss": 0.6127, "num_input_tokens_seen": 3338560, "step": 1055 }, { "epoch": 0.06785737148710069, "grad_norm": 26.670169830322266, "learning_rate": 1.3550863723608446e-06, "loss": 0.5135, "num_input_tokens_seen": 3353152, "step": 1060 }, { "epoch": 0.06817745342807759, "grad_norm": 46.529396057128906, "learning_rate": 1.361484325015995e-06, "loss": 0.5401, "num_input_tokens_seen": 3369536, "step": 1065 }, { "epoch": 0.06849753536905448, "grad_norm": 45.95737075805664, "learning_rate": 1.3678822776711451e-06, "loss": 0.6023, "num_input_tokens_seen": 3384832, "step": 1070 }, { "epoch": 0.06881761731003137, "grad_norm": 38.86219787597656, "learning_rate": 1.3742802303262955e-06, "loss": 0.4881, "num_input_tokens_seen": 3399424, "step": 1075 }, { "epoch": 0.06913769925100825, "grad_norm": 30.497953414916992, "learning_rate": 1.3806781829814459e-06, "loss": 0.6565, "num_input_tokens_seen": 3416704, "step": 1080 }, { "epoch": 0.06945778119198515, "grad_norm": 59.77437210083008, "learning_rate": 1.3870761356365963e-06, "loss": 0.5553, "num_input_tokens_seen": 3431552, "step": 1085 }, { "epoch": 0.06977786313296204, "grad_norm": 36.94731521606445, "learning_rate": 1.3934740882917466e-06, "loss": 0.6472, "num_input_tokens_seen": 3447488, "step": 1090 }, { "epoch": 0.07009794507393893, "grad_norm": 39.8687744140625, "learning_rate": 1.399872040946897e-06, "loss": 0.5137, "num_input_tokens_seen": 3463424, "step": 1095 }, { "epoch": 0.07041802701491581, "grad_norm": 51.21504211425781, "learning_rate": 1.4062699936020472e-06, "loss": 0.6527, "num_input_tokens_seen": 3479680, "step": 1100 }, { "epoch": 0.0707381089558927, "grad_norm": 49.46668243408203, "learning_rate": 1.4126679462571976e-06, "loss": 0.5117, "num_input_tokens_seen": 3495552, "step": 1105 }, { "epoch": 0.0710581908968696, "grad_norm": 56.50544357299805, "learning_rate": 1.4190658989123481e-06, "loss": 0.4748, "num_input_tokens_seen": 3510976, "step": 1110 }, { "epoch": 0.0713782728378465, "grad_norm": 49.386070251464844, "learning_rate": 1.4254638515674983e-06, "loss": 0.6499, "num_input_tokens_seen": 3526016, "step": 1115 }, { "epoch": 0.07169835477882337, "grad_norm": 22.4860782623291, "learning_rate": 1.4318618042226487e-06, "loss": 0.5645, "num_input_tokens_seen": 3540544, "step": 1120 }, { "epoch": 0.07201843671980027, "grad_norm": 43.12958908081055, "learning_rate": 1.438259756877799e-06, "loss": 0.6069, "num_input_tokens_seen": 3556416, "step": 1125 }, { "epoch": 0.07233851866077716, "grad_norm": 43.865108489990234, "learning_rate": 1.4446577095329492e-06, "loss": 0.5077, "num_input_tokens_seen": 3572096, "step": 1130 }, { "epoch": 0.07265860060175405, "grad_norm": 41.96502685546875, "learning_rate": 1.4510556621880998e-06, "loss": 0.4993, "num_input_tokens_seen": 3587712, "step": 1135 }, { "epoch": 0.07297868254273093, "grad_norm": 30.780799865722656, "learning_rate": 1.4574536148432502e-06, "loss": 0.5417, "num_input_tokens_seen": 3605056, "step": 1140 }, { "epoch": 0.07329876448370783, "grad_norm": 42.194156646728516, "learning_rate": 1.4638515674984004e-06, "loss": 0.6805, "num_input_tokens_seen": 3621184, "step": 1145 }, { "epoch": 0.07361884642468472, "grad_norm": 25.724376678466797, "learning_rate": 1.4702495201535507e-06, "loss": 0.5834, "num_input_tokens_seen": 3635392, "step": 1150 }, { "epoch": 0.07393892836566161, "grad_norm": 32.53746795654297, "learning_rate": 1.4766474728087011e-06, "loss": 0.5049, "num_input_tokens_seen": 3649984, "step": 1155 }, { "epoch": 0.07425901030663849, "grad_norm": 34.3016471862793, "learning_rate": 1.4830454254638515e-06, "loss": 0.5276, "num_input_tokens_seen": 3665920, "step": 1160 }, { "epoch": 0.07457909224761539, "grad_norm": 32.034515380859375, "learning_rate": 1.4894433781190019e-06, "loss": 0.4587, "num_input_tokens_seen": 3680256, "step": 1165 }, { "epoch": 0.07489917418859228, "grad_norm": 49.901329040527344, "learning_rate": 1.4958413307741523e-06, "loss": 0.5255, "num_input_tokens_seen": 3697536, "step": 1170 }, { "epoch": 0.07521925612956917, "grad_norm": 35.28968048095703, "learning_rate": 1.5022392834293024e-06, "loss": 0.6111, "num_input_tokens_seen": 3713088, "step": 1175 }, { "epoch": 0.07553933807054607, "grad_norm": 56.491756439208984, "learning_rate": 1.5086372360844528e-06, "loss": 0.6712, "num_input_tokens_seen": 3729920, "step": 1180 }, { "epoch": 0.07585942001152295, "grad_norm": 45.67325210571289, "learning_rate": 1.5150351887396034e-06, "loss": 0.5489, "num_input_tokens_seen": 3745664, "step": 1185 }, { "epoch": 0.07617950195249984, "grad_norm": 35.20317840576172, "learning_rate": 1.5214331413947536e-06, "loss": 0.5258, "num_input_tokens_seen": 3760576, "step": 1190 }, { "epoch": 0.07649958389347673, "grad_norm": 29.504152297973633, "learning_rate": 1.527831094049904e-06, "loss": 0.5085, "num_input_tokens_seen": 3776576, "step": 1195 }, { "epoch": 0.07681966583445363, "grad_norm": 43.33934783935547, "learning_rate": 1.5342290467050543e-06, "loss": 0.5857, "num_input_tokens_seen": 3792384, "step": 1200 }, { "epoch": 0.0771397477754305, "grad_norm": 44.849308013916016, "learning_rate": 1.5406269993602045e-06, "loss": 0.6438, "num_input_tokens_seen": 3806592, "step": 1205 }, { "epoch": 0.0774598297164074, "grad_norm": 52.07255935668945, "learning_rate": 1.547024952015355e-06, "loss": 0.5775, "num_input_tokens_seen": 3822080, "step": 1210 }, { "epoch": 0.07777991165738429, "grad_norm": 37.863677978515625, "learning_rate": 1.5534229046705055e-06, "loss": 0.5269, "num_input_tokens_seen": 3837120, "step": 1215 }, { "epoch": 0.07809999359836119, "grad_norm": 37.92720413208008, "learning_rate": 1.5598208573256556e-06, "loss": 0.6994, "num_input_tokens_seen": 3852864, "step": 1220 }, { "epoch": 0.07842007553933807, "grad_norm": 34.05339431762695, "learning_rate": 1.566218809980806e-06, "loss": 0.515, "num_input_tokens_seen": 3869184, "step": 1225 }, { "epoch": 0.07874015748031496, "grad_norm": 31.917217254638672, "learning_rate": 1.5726167626359564e-06, "loss": 0.5388, "num_input_tokens_seen": 3885248, "step": 1230 }, { "epoch": 0.07906023942129185, "grad_norm": 32.81400680541992, "learning_rate": 1.5790147152911068e-06, "loss": 0.4306, "num_input_tokens_seen": 3900416, "step": 1235 }, { "epoch": 0.07938032136226875, "grad_norm": 38.30088806152344, "learning_rate": 1.5854126679462571e-06, "loss": 0.5503, "num_input_tokens_seen": 3916096, "step": 1240 }, { "epoch": 0.07970040330324563, "grad_norm": 50.25246810913086, "learning_rate": 1.5918106206014075e-06, "loss": 0.6993, "num_input_tokens_seen": 3933312, "step": 1245 }, { "epoch": 0.08002048524422252, "grad_norm": 74.49282836914062, "learning_rate": 1.5982085732565577e-06, "loss": 0.6197, "num_input_tokens_seen": 3949440, "step": 1250 }, { "epoch": 0.08034056718519941, "grad_norm": 36.928924560546875, "learning_rate": 1.604606525911708e-06, "loss": 0.6799, "num_input_tokens_seen": 3964992, "step": 1255 }, { "epoch": 0.0806606491261763, "grad_norm": 56.78390884399414, "learning_rate": 1.6110044785668586e-06, "loss": 0.7324, "num_input_tokens_seen": 3981696, "step": 1260 }, { "epoch": 0.08098073106715319, "grad_norm": 38.05080795288086, "learning_rate": 1.617402431222009e-06, "loss": 0.6136, "num_input_tokens_seen": 3997248, "step": 1265 }, { "epoch": 0.08130081300813008, "grad_norm": 27.51533317565918, "learning_rate": 1.6238003838771592e-06, "loss": 0.6689, "num_input_tokens_seen": 4011648, "step": 1270 }, { "epoch": 0.08162089494910697, "grad_norm": 42.947906494140625, "learning_rate": 1.6301983365323096e-06, "loss": 0.5254, "num_input_tokens_seen": 4028160, "step": 1275 }, { "epoch": 0.08194097689008387, "grad_norm": 47.13071060180664, "learning_rate": 1.63659628918746e-06, "loss": 0.5398, "num_input_tokens_seen": 4043584, "step": 1280 }, { "epoch": 0.08226105883106075, "grad_norm": 47.630218505859375, "learning_rate": 1.6429942418426103e-06, "loss": 0.7076, "num_input_tokens_seen": 4059456, "step": 1285 }, { "epoch": 0.08258114077203764, "grad_norm": 26.62889289855957, "learning_rate": 1.6493921944977607e-06, "loss": 0.6103, "num_input_tokens_seen": 4076096, "step": 1290 }, { "epoch": 0.08290122271301453, "grad_norm": 41.755088806152344, "learning_rate": 1.655790147152911e-06, "loss": 0.6111, "num_input_tokens_seen": 4093568, "step": 1295 }, { "epoch": 0.08322130465399143, "grad_norm": 36.05648422241211, "learning_rate": 1.6621880998080612e-06, "loss": 0.6676, "num_input_tokens_seen": 4108864, "step": 1300 }, { "epoch": 0.0835413865949683, "grad_norm": 37.495201110839844, "learning_rate": 1.6685860524632116e-06, "loss": 0.6425, "num_input_tokens_seen": 4124224, "step": 1305 }, { "epoch": 0.0838614685359452, "grad_norm": 40.898502349853516, "learning_rate": 1.6749840051183622e-06, "loss": 0.5516, "num_input_tokens_seen": 4139008, "step": 1310 }, { "epoch": 0.08418155047692209, "grad_norm": 24.442567825317383, "learning_rate": 1.6813819577735124e-06, "loss": 0.5551, "num_input_tokens_seen": 4155008, "step": 1315 }, { "epoch": 0.08450163241789899, "grad_norm": 26.63324737548828, "learning_rate": 1.6877799104286628e-06, "loss": 0.4792, "num_input_tokens_seen": 4172544, "step": 1320 }, { "epoch": 0.08482171435887587, "grad_norm": 48.432395935058594, "learning_rate": 1.6941778630838131e-06, "loss": 0.6306, "num_input_tokens_seen": 4188416, "step": 1325 }, { "epoch": 0.08514179629985276, "grad_norm": 19.108352661132812, "learning_rate": 1.7005758157389633e-06, "loss": 0.5031, "num_input_tokens_seen": 4202560, "step": 1330 }, { "epoch": 0.08546187824082965, "grad_norm": 35.99553680419922, "learning_rate": 1.706973768394114e-06, "loss": 0.5574, "num_input_tokens_seen": 4219392, "step": 1335 }, { "epoch": 0.08578196018180655, "grad_norm": 50.857059478759766, "learning_rate": 1.7133717210492643e-06, "loss": 0.4844, "num_input_tokens_seen": 4235328, "step": 1340 }, { "epoch": 0.08610204212278343, "grad_norm": 52.725589752197266, "learning_rate": 1.7197696737044144e-06, "loss": 0.5778, "num_input_tokens_seen": 4250368, "step": 1345 }, { "epoch": 0.08642212406376032, "grad_norm": 27.934179306030273, "learning_rate": 1.7261676263595648e-06, "loss": 0.4549, "num_input_tokens_seen": 4265856, "step": 1350 }, { "epoch": 0.08674220600473721, "grad_norm": 41.11848068237305, "learning_rate": 1.7325655790147152e-06, "loss": 0.6627, "num_input_tokens_seen": 4281792, "step": 1355 }, { "epoch": 0.0870622879457141, "grad_norm": 38.61765670776367, "learning_rate": 1.7389635316698656e-06, "loss": 0.5873, "num_input_tokens_seen": 4297088, "step": 1360 }, { "epoch": 0.087382369886691, "grad_norm": 27.60044288635254, "learning_rate": 1.745361484325016e-06, "loss": 0.5028, "num_input_tokens_seen": 4312192, "step": 1365 }, { "epoch": 0.08770245182766788, "grad_norm": 27.299213409423828, "learning_rate": 1.7517594369801663e-06, "loss": 0.4819, "num_input_tokens_seen": 4326720, "step": 1370 }, { "epoch": 0.08802253376864477, "grad_norm": 58.7935791015625, "learning_rate": 1.7581573896353165e-06, "loss": 0.7894, "num_input_tokens_seen": 4341760, "step": 1375 }, { "epoch": 0.08834261570962167, "grad_norm": 36.60477828979492, "learning_rate": 1.7645553422904669e-06, "loss": 0.6215, "num_input_tokens_seen": 4357760, "step": 1380 }, { "epoch": 0.08866269765059856, "grad_norm": 36.011505126953125, "learning_rate": 1.7709532949456175e-06, "loss": 0.6267, "num_input_tokens_seen": 4373824, "step": 1385 }, { "epoch": 0.08898277959157544, "grad_norm": 32.17240524291992, "learning_rate": 1.7773512476007676e-06, "loss": 0.4739, "num_input_tokens_seen": 4388992, "step": 1390 }, { "epoch": 0.08930286153255233, "grad_norm": 29.726274490356445, "learning_rate": 1.783749200255918e-06, "loss": 0.5295, "num_input_tokens_seen": 4404288, "step": 1395 }, { "epoch": 0.08962294347352923, "grad_norm": 49.96647644042969, "learning_rate": 1.7901471529110684e-06, "loss": 0.5366, "num_input_tokens_seen": 4419840, "step": 1400 }, { "epoch": 0.08994302541450612, "grad_norm": 55.71930694580078, "learning_rate": 1.7965451055662186e-06, "loss": 0.5109, "num_input_tokens_seen": 4435200, "step": 1405 }, { "epoch": 0.090263107355483, "grad_norm": 54.367244720458984, "learning_rate": 1.8029430582213691e-06, "loss": 0.6082, "num_input_tokens_seen": 4450368, "step": 1410 }, { "epoch": 0.09058318929645989, "grad_norm": 42.54631042480469, "learning_rate": 1.8093410108765195e-06, "loss": 0.4889, "num_input_tokens_seen": 4466048, "step": 1415 }, { "epoch": 0.09090327123743679, "grad_norm": 56.23736572265625, "learning_rate": 1.8157389635316697e-06, "loss": 0.5985, "num_input_tokens_seen": 4481920, "step": 1420 }, { "epoch": 0.09122335317841368, "grad_norm": 34.284244537353516, "learning_rate": 1.82213691618682e-06, "loss": 0.5671, "num_input_tokens_seen": 4498112, "step": 1425 }, { "epoch": 0.09154343511939056, "grad_norm": 32.31144714355469, "learning_rate": 1.8285348688419704e-06, "loss": 0.4306, "num_input_tokens_seen": 4515648, "step": 1430 }, { "epoch": 0.09186351706036745, "grad_norm": 34.67725372314453, "learning_rate": 1.8349328214971208e-06, "loss": 0.5719, "num_input_tokens_seen": 4531840, "step": 1435 }, { "epoch": 0.09218359900134435, "grad_norm": 48.15701675415039, "learning_rate": 1.8413307741522712e-06, "loss": 0.5478, "num_input_tokens_seen": 4547456, "step": 1440 }, { "epoch": 0.09250368094232124, "grad_norm": 40.82353210449219, "learning_rate": 1.8477287268074216e-06, "loss": 0.557, "num_input_tokens_seen": 4563328, "step": 1445 }, { "epoch": 0.09282376288329812, "grad_norm": 28.479816436767578, "learning_rate": 1.8541266794625718e-06, "loss": 0.5856, "num_input_tokens_seen": 4579392, "step": 1450 }, { "epoch": 0.09314384482427501, "grad_norm": 80.24234008789062, "learning_rate": 1.8605246321177221e-06, "loss": 0.6149, "num_input_tokens_seen": 4595584, "step": 1455 }, { "epoch": 0.0934639267652519, "grad_norm": 33.8602294921875, "learning_rate": 1.8669225847728727e-06, "loss": 0.5711, "num_input_tokens_seen": 4610112, "step": 1460 }, { "epoch": 0.0937840087062288, "grad_norm": 53.692935943603516, "learning_rate": 1.8733205374280229e-06, "loss": 0.6948, "num_input_tokens_seen": 4626432, "step": 1465 }, { "epoch": 0.09410409064720568, "grad_norm": 38.61556625366211, "learning_rate": 1.8797184900831733e-06, "loss": 0.5771, "num_input_tokens_seen": 4641792, "step": 1470 }, { "epoch": 0.09442417258818257, "grad_norm": 18.766170501708984, "learning_rate": 1.8861164427383236e-06, "loss": 0.4046, "num_input_tokens_seen": 4656896, "step": 1475 }, { "epoch": 0.09474425452915947, "grad_norm": 47.406803131103516, "learning_rate": 1.8925143953934738e-06, "loss": 0.605, "num_input_tokens_seen": 4673472, "step": 1480 }, { "epoch": 0.09506433647013636, "grad_norm": 26.30023765563965, "learning_rate": 1.8989123480486244e-06, "loss": 0.426, "num_input_tokens_seen": 4688896, "step": 1485 }, { "epoch": 0.09538441841111324, "grad_norm": 43.65274429321289, "learning_rate": 1.9053103007037748e-06, "loss": 0.6785, "num_input_tokens_seen": 4704576, "step": 1490 }, { "epoch": 0.09570450035209013, "grad_norm": 38.707481384277344, "learning_rate": 1.911708253358925e-06, "loss": 0.6069, "num_input_tokens_seen": 4719040, "step": 1495 }, { "epoch": 0.09602458229306703, "grad_norm": 30.288116455078125, "learning_rate": 1.9181062060140753e-06, "loss": 0.4831, "num_input_tokens_seen": 4733696, "step": 1500 }, { "epoch": 0.09634466423404392, "grad_norm": 39.540462493896484, "learning_rate": 1.9245041586692255e-06, "loss": 0.6045, "num_input_tokens_seen": 4748992, "step": 1505 }, { "epoch": 0.0966647461750208, "grad_norm": 50.21097183227539, "learning_rate": 1.930902111324376e-06, "loss": 0.6876, "num_input_tokens_seen": 4764992, "step": 1510 }, { "epoch": 0.09698482811599769, "grad_norm": 45.92460632324219, "learning_rate": 1.9373000639795267e-06, "loss": 0.6773, "num_input_tokens_seen": 4780352, "step": 1515 }, { "epoch": 0.09730491005697459, "grad_norm": 31.217529296875, "learning_rate": 1.943698016634677e-06, "loss": 0.5393, "num_input_tokens_seen": 4796224, "step": 1520 }, { "epoch": 0.09762499199795148, "grad_norm": 30.454994201660156, "learning_rate": 1.950095969289827e-06, "loss": 0.5401, "num_input_tokens_seen": 4811840, "step": 1525 }, { "epoch": 0.09794507393892836, "grad_norm": 26.992660522460938, "learning_rate": 1.9564939219449776e-06, "loss": 0.5811, "num_input_tokens_seen": 4826432, "step": 1530 }, { "epoch": 0.09826515587990525, "grad_norm": 28.93795394897461, "learning_rate": 1.9628918746001278e-06, "loss": 0.393, "num_input_tokens_seen": 4841920, "step": 1535 }, { "epoch": 0.09858523782088215, "grad_norm": 30.038558959960938, "learning_rate": 1.9692898272552783e-06, "loss": 0.5971, "num_input_tokens_seen": 4857536, "step": 1540 }, { "epoch": 0.09890531976185904, "grad_norm": 56.7470588684082, "learning_rate": 1.9756877799104285e-06, "loss": 0.6844, "num_input_tokens_seen": 4873408, "step": 1545 }, { "epoch": 0.09922540170283592, "grad_norm": 43.21520233154297, "learning_rate": 1.9820857325655787e-06, "loss": 0.5973, "num_input_tokens_seen": 4889536, "step": 1550 }, { "epoch": 0.09954548364381281, "grad_norm": 59.145320892333984, "learning_rate": 1.9884836852207293e-06, "loss": 0.627, "num_input_tokens_seen": 4904448, "step": 1555 }, { "epoch": 0.0998655655847897, "grad_norm": 25.44906997680664, "learning_rate": 1.99488163787588e-06, "loss": 0.6569, "num_input_tokens_seen": 4919616, "step": 1560 }, { "epoch": 0.10012163113757122, "eval_loss": 0.5394634008407593, "eval_runtime": 49.1959, "eval_samples_per_second": 282.259, "eval_steps_per_second": 35.288, "num_input_tokens_seen": 4931328, "step": 1564 }, { "epoch": 0.1001856475257666, "grad_norm": 30.809673309326172, "learning_rate": 1.9999999750297625e-06, "loss": 0.516, "num_input_tokens_seen": 4934144, "step": 1565 }, { "epoch": 0.1005057294667435, "grad_norm": 31.836828231811523, "learning_rate": 1.9999991010715873e-06, "loss": 0.523, "num_input_tokens_seen": 4950272, "step": 1570 }, { "epoch": 0.10082581140772037, "grad_norm": 43.53628921508789, "learning_rate": 1.999996978602793e-06, "loss": 0.5346, "num_input_tokens_seen": 4965056, "step": 1575 }, { "epoch": 0.10114589334869727, "grad_norm": 33.710304260253906, "learning_rate": 1.99999360762603e-06, "loss": 0.5489, "num_input_tokens_seen": 4980160, "step": 1580 }, { "epoch": 0.10146597528967416, "grad_norm": 36.39333724975586, "learning_rate": 1.9999889881455065e-06, "loss": 0.453, "num_input_tokens_seen": 4996992, "step": 1585 }, { "epoch": 0.10178605723065105, "grad_norm": 35.11768341064453, "learning_rate": 1.9999831201669897e-06, "loss": 0.5146, "num_input_tokens_seen": 5012608, "step": 1590 }, { "epoch": 0.10210613917162793, "grad_norm": 38.27321243286133, "learning_rate": 1.9999760036978067e-06, "loss": 0.4848, "num_input_tokens_seen": 5027840, "step": 1595 }, { "epoch": 0.10242622111260483, "grad_norm": 33.53286361694336, "learning_rate": 1.9999676387468417e-06, "loss": 0.5746, "num_input_tokens_seen": 5042752, "step": 1600 }, { "epoch": 0.10274630305358172, "grad_norm": 26.00925636291504, "learning_rate": 1.999958025324539e-06, "loss": 0.5487, "num_input_tokens_seen": 5058624, "step": 1605 }, { "epoch": 0.10306638499455861, "grad_norm": 36.0686149597168, "learning_rate": 1.999947163442901e-06, "loss": 0.6233, "num_input_tokens_seen": 5075008, "step": 1610 }, { "epoch": 0.10338646693553549, "grad_norm": 48.770294189453125, "learning_rate": 1.9999350531154884e-06, "loss": 0.5332, "num_input_tokens_seen": 5090880, "step": 1615 }, { "epoch": 0.10370654887651239, "grad_norm": 51.00628662109375, "learning_rate": 1.9999216943574223e-06, "loss": 0.5713, "num_input_tokens_seen": 5106816, "step": 1620 }, { "epoch": 0.10402663081748928, "grad_norm": 22.478723526000977, "learning_rate": 1.9999070871853796e-06, "loss": 0.4563, "num_input_tokens_seen": 5123904, "step": 1625 }, { "epoch": 0.10434671275846617, "grad_norm": 40.58604049682617, "learning_rate": 1.9998912316175986e-06, "loss": 0.4954, "num_input_tokens_seen": 5140160, "step": 1630 }, { "epoch": 0.10466679469944305, "grad_norm": 46.91875457763672, "learning_rate": 1.9998741276738752e-06, "loss": 0.5159, "num_input_tokens_seen": 5156288, "step": 1635 }, { "epoch": 0.10498687664041995, "grad_norm": 38.68816375732422, "learning_rate": 1.999855775375563e-06, "loss": 0.5823, "num_input_tokens_seen": 5171776, "step": 1640 }, { "epoch": 0.10530695858139684, "grad_norm": 40.31874465942383, "learning_rate": 1.999836174745576e-06, "loss": 0.683, "num_input_tokens_seen": 5189504, "step": 1645 }, { "epoch": 0.10562704052237373, "grad_norm": 45.76553726196289, "learning_rate": 1.9998153258083853e-06, "loss": 0.5783, "num_input_tokens_seen": 5205056, "step": 1650 }, { "epoch": 0.10594712246335061, "grad_norm": 37.635047912597656, "learning_rate": 1.9997932285900214e-06, "loss": 0.586, "num_input_tokens_seen": 5222656, "step": 1655 }, { "epoch": 0.1062672044043275, "grad_norm": 53.1572380065918, "learning_rate": 1.9997698831180726e-06, "loss": 0.6272, "num_input_tokens_seen": 5238848, "step": 1660 }, { "epoch": 0.1065872863453044, "grad_norm": 33.89291000366211, "learning_rate": 1.999745289421686e-06, "loss": 0.5203, "num_input_tokens_seen": 5255296, "step": 1665 }, { "epoch": 0.10690736828628129, "grad_norm": 58.435916900634766, "learning_rate": 1.9997194475315674e-06, "loss": 0.7716, "num_input_tokens_seen": 5270336, "step": 1670 }, { "epoch": 0.10722745022725817, "grad_norm": 36.215858459472656, "learning_rate": 1.9996923574799808e-06, "loss": 0.4842, "num_input_tokens_seen": 5286720, "step": 1675 }, { "epoch": 0.10754753216823507, "grad_norm": 32.55356979370117, "learning_rate": 1.9996640193007476e-06, "loss": 0.6428, "num_input_tokens_seen": 5301632, "step": 1680 }, { "epoch": 0.10786761410921196, "grad_norm": 49.88198471069336, "learning_rate": 1.9996344330292495e-06, "loss": 0.403, "num_input_tokens_seen": 5316544, "step": 1685 }, { "epoch": 0.10818769605018885, "grad_norm": 34.35507583618164, "learning_rate": 1.9996035987024245e-06, "loss": 0.5503, "num_input_tokens_seen": 5332544, "step": 1690 }, { "epoch": 0.10850777799116573, "grad_norm": 35.31010437011719, "learning_rate": 1.99957151635877e-06, "loss": 0.5388, "num_input_tokens_seen": 5348096, "step": 1695 }, { "epoch": 0.10882785993214263, "grad_norm": 50.9265022277832, "learning_rate": 1.999538186038341e-06, "loss": 0.6275, "num_input_tokens_seen": 5362368, "step": 1700 }, { "epoch": 0.10914794187311952, "grad_norm": 34.14656066894531, "learning_rate": 1.999503607782751e-06, "loss": 0.5426, "num_input_tokens_seen": 5378176, "step": 1705 }, { "epoch": 0.10946802381409641, "grad_norm": 42.861480712890625, "learning_rate": 1.999467781635171e-06, "loss": 0.5163, "num_input_tokens_seen": 5394752, "step": 1710 }, { "epoch": 0.10978810575507329, "grad_norm": 31.575403213500977, "learning_rate": 1.9994307076403306e-06, "loss": 0.6991, "num_input_tokens_seen": 5412160, "step": 1715 }, { "epoch": 0.11010818769605019, "grad_norm": 35.84833908081055, "learning_rate": 1.999392385844517e-06, "loss": 0.5245, "num_input_tokens_seen": 5427840, "step": 1720 }, { "epoch": 0.11042826963702708, "grad_norm": 36.32638931274414, "learning_rate": 1.9993528162955753e-06, "loss": 0.4035, "num_input_tokens_seen": 5444224, "step": 1725 }, { "epoch": 0.11074835157800397, "grad_norm": 60.70829391479492, "learning_rate": 1.9993119990429095e-06, "loss": 0.5767, "num_input_tokens_seen": 5459648, "step": 1730 }, { "epoch": 0.11106843351898085, "grad_norm": 47.9375114440918, "learning_rate": 1.9992699341374794e-06, "loss": 0.7821, "num_input_tokens_seen": 5475008, "step": 1735 }, { "epoch": 0.11138851545995775, "grad_norm": 32.798091888427734, "learning_rate": 1.9992266216318033e-06, "loss": 0.5285, "num_input_tokens_seen": 5491456, "step": 1740 }, { "epoch": 0.11170859740093464, "grad_norm": 35.342803955078125, "learning_rate": 1.9991820615799583e-06, "loss": 0.5674, "num_input_tokens_seen": 5507520, "step": 1745 }, { "epoch": 0.11202867934191153, "grad_norm": 49.72675704956055, "learning_rate": 1.999136254037578e-06, "loss": 0.6917, "num_input_tokens_seen": 5523072, "step": 1750 }, { "epoch": 0.11234876128288843, "grad_norm": 37.71804428100586, "learning_rate": 1.999089199061853e-06, "loss": 0.5094, "num_input_tokens_seen": 5538304, "step": 1755 }, { "epoch": 0.1126688432238653, "grad_norm": 35.397056579589844, "learning_rate": 1.9990408967115326e-06, "loss": 0.4612, "num_input_tokens_seen": 5553920, "step": 1760 }, { "epoch": 0.1129889251648422, "grad_norm": 26.074499130249023, "learning_rate": 1.998991347046922e-06, "loss": 0.4599, "num_input_tokens_seen": 5569344, "step": 1765 }, { "epoch": 0.11330900710581909, "grad_norm": 33.73558044433594, "learning_rate": 1.9989405501298857e-06, "loss": 0.5104, "num_input_tokens_seen": 5585856, "step": 1770 }, { "epoch": 0.11362908904679599, "grad_norm": 50.979820251464844, "learning_rate": 1.9988885060238436e-06, "loss": 0.5755, "num_input_tokens_seen": 5603840, "step": 1775 }, { "epoch": 0.11394917098777287, "grad_norm": 25.762378692626953, "learning_rate": 1.9988352147937735e-06, "loss": 0.5167, "num_input_tokens_seen": 5620352, "step": 1780 }, { "epoch": 0.11426925292874976, "grad_norm": 44.76283645629883, "learning_rate": 1.99878067650621e-06, "loss": 0.5382, "num_input_tokens_seen": 5636544, "step": 1785 }, { "epoch": 0.11458933486972665, "grad_norm": 38.91508102416992, "learning_rate": 1.998724891229245e-06, "loss": 0.5438, "num_input_tokens_seen": 5652672, "step": 1790 }, { "epoch": 0.11490941681070355, "grad_norm": 42.947147369384766, "learning_rate": 1.998667859032527e-06, "loss": 0.4956, "num_input_tokens_seen": 5668224, "step": 1795 }, { "epoch": 0.11522949875168043, "grad_norm": 21.492956161499023, "learning_rate": 1.9986095799872613e-06, "loss": 0.4506, "num_input_tokens_seen": 5684480, "step": 1800 }, { "epoch": 0.11554958069265732, "grad_norm": 65.54640197753906, "learning_rate": 1.99855005416621e-06, "loss": 0.472, "num_input_tokens_seen": 5700864, "step": 1805 }, { "epoch": 0.11586966263363421, "grad_norm": 40.5084114074707, "learning_rate": 1.998489281643692e-06, "loss": 0.5965, "num_input_tokens_seen": 5716224, "step": 1810 }, { "epoch": 0.1161897445746111, "grad_norm": 29.857545852661133, "learning_rate": 1.998427262495582e-06, "loss": 0.4977, "num_input_tokens_seen": 5733056, "step": 1815 }, { "epoch": 0.11650982651558799, "grad_norm": 36.56293487548828, "learning_rate": 1.9983639967993124e-06, "loss": 0.6683, "num_input_tokens_seen": 5749120, "step": 1820 }, { "epoch": 0.11682990845656488, "grad_norm": 30.559627532958984, "learning_rate": 1.99829948463387e-06, "loss": 0.7297, "num_input_tokens_seen": 5763968, "step": 1825 }, { "epoch": 0.11714999039754177, "grad_norm": 31.007530212402344, "learning_rate": 1.9982337260798e-06, "loss": 0.543, "num_input_tokens_seen": 5779520, "step": 1830 }, { "epoch": 0.11747007233851867, "grad_norm": 36.148040771484375, "learning_rate": 1.998166721219203e-06, "loss": 0.5856, "num_input_tokens_seen": 5798848, "step": 1835 }, { "epoch": 0.11779015427949555, "grad_norm": 31.41288185119629, "learning_rate": 1.9980984701357338e-06, "loss": 0.5155, "num_input_tokens_seen": 5813952, "step": 1840 }, { "epoch": 0.11811023622047244, "grad_norm": 36.17179489135742, "learning_rate": 1.998028972914606e-06, "loss": 0.4362, "num_input_tokens_seen": 5830016, "step": 1845 }, { "epoch": 0.11843031816144933, "grad_norm": 38.044334411621094, "learning_rate": 1.9979582296425877e-06, "loss": 0.5893, "num_input_tokens_seen": 5845312, "step": 1850 }, { "epoch": 0.11875040010242623, "grad_norm": 22.015993118286133, "learning_rate": 1.9978862404080022e-06, "loss": 0.5851, "num_input_tokens_seen": 5860672, "step": 1855 }, { "epoch": 0.1190704820434031, "grad_norm": 34.7830696105957, "learning_rate": 1.9978130053007295e-06, "loss": 0.5376, "num_input_tokens_seen": 5875776, "step": 1860 }, { "epoch": 0.11939056398438, "grad_norm": 37.767024993896484, "learning_rate": 1.9977385244122034e-06, "loss": 0.4319, "num_input_tokens_seen": 5891200, "step": 1865 }, { "epoch": 0.11971064592535689, "grad_norm": 35.13771438598633, "learning_rate": 1.997662797835415e-06, "loss": 0.4821, "num_input_tokens_seen": 5907008, "step": 1870 }, { "epoch": 0.12003072786633379, "grad_norm": 34.38051986694336, "learning_rate": 1.9975858256649097e-06, "loss": 0.4645, "num_input_tokens_seen": 5923264, "step": 1875 }, { "epoch": 0.12035080980731067, "grad_norm": 52.10721206665039, "learning_rate": 1.997507607996788e-06, "loss": 0.4911, "num_input_tokens_seen": 5939648, "step": 1880 }, { "epoch": 0.12067089174828756, "grad_norm": 25.434171676635742, "learning_rate": 1.997428144928706e-06, "loss": 0.4557, "num_input_tokens_seen": 5955520, "step": 1885 }, { "epoch": 0.12099097368926445, "grad_norm": 41.22515106201172, "learning_rate": 1.9973474365598736e-06, "loss": 0.5237, "num_input_tokens_seen": 5971072, "step": 1890 }, { "epoch": 0.12131105563024135, "grad_norm": 40.53886413574219, "learning_rate": 1.9972654829910568e-06, "loss": 0.5787, "num_input_tokens_seen": 5987264, "step": 1895 }, { "epoch": 0.12163113757121823, "grad_norm": 53.25776672363281, "learning_rate": 1.9971822843245748e-06, "loss": 0.6193, "num_input_tokens_seen": 6002880, "step": 1900 }, { "epoch": 0.12195121951219512, "grad_norm": 40.23493957519531, "learning_rate": 1.997097840664303e-06, "loss": 0.5277, "num_input_tokens_seen": 6019520, "step": 1905 }, { "epoch": 0.12227130145317201, "grad_norm": 64.40421295166016, "learning_rate": 1.99701215211567e-06, "loss": 0.5641, "num_input_tokens_seen": 6035904, "step": 1910 }, { "epoch": 0.1225913833941489, "grad_norm": 28.99512481689453, "learning_rate": 1.9969252187856587e-06, "loss": 0.6009, "num_input_tokens_seen": 6050816, "step": 1915 }, { "epoch": 0.12291146533512579, "grad_norm": 26.199125289916992, "learning_rate": 1.9968370407828065e-06, "loss": 0.4204, "num_input_tokens_seen": 6065920, "step": 1920 }, { "epoch": 0.12323154727610268, "grad_norm": 25.35918426513672, "learning_rate": 1.996747618217205e-06, "loss": 0.5962, "num_input_tokens_seen": 6081728, "step": 1925 }, { "epoch": 0.12355162921707957, "grad_norm": 27.474023818969727, "learning_rate": 1.9966569512004987e-06, "loss": 0.4945, "num_input_tokens_seen": 6097472, "step": 1930 }, { "epoch": 0.12387171115805647, "grad_norm": 32.64793395996094, "learning_rate": 1.996565039845887e-06, "loss": 0.5101, "num_input_tokens_seen": 6113152, "step": 1935 }, { "epoch": 0.12419179309903335, "grad_norm": 47.92166519165039, "learning_rate": 1.996471884268122e-06, "loss": 0.614, "num_input_tokens_seen": 6129408, "step": 1940 }, { "epoch": 0.12451187504001024, "grad_norm": 20.421428680419922, "learning_rate": 1.9963774845835097e-06, "loss": 0.545, "num_input_tokens_seen": 6144896, "step": 1945 }, { "epoch": 0.12483195698098713, "grad_norm": 50.85639572143555, "learning_rate": 1.996281840909909e-06, "loss": 0.5868, "num_input_tokens_seen": 6160256, "step": 1950 }, { "epoch": 0.12515203892196403, "grad_norm": 36.815921783447266, "learning_rate": 1.9961849533667322e-06, "loss": 0.6354, "num_input_tokens_seen": 6175104, "step": 1955 }, { "epoch": 0.1254721208629409, "grad_norm": 30.276325225830078, "learning_rate": 1.9960868220749447e-06, "loss": 0.5185, "num_input_tokens_seen": 6190272, "step": 1960 }, { "epoch": 0.1257922028039178, "grad_norm": 39.995033264160156, "learning_rate": 1.9959874471570644e-06, "loss": 0.5855, "num_input_tokens_seen": 6205952, "step": 1965 }, { "epoch": 0.1261122847448947, "grad_norm": 44.66065216064453, "learning_rate": 1.9958868287371625e-06, "loss": 0.56, "num_input_tokens_seen": 6222592, "step": 1970 }, { "epoch": 0.12643236668587157, "grad_norm": 37.10478591918945, "learning_rate": 1.9957849669408617e-06, "loss": 0.4803, "num_input_tokens_seen": 6237696, "step": 1975 }, { "epoch": 0.12675244862684848, "grad_norm": 53.23179244995117, "learning_rate": 1.995681861895338e-06, "loss": 0.4858, "num_input_tokens_seen": 6254080, "step": 1980 }, { "epoch": 0.12707253056782536, "grad_norm": 27.040605545043945, "learning_rate": 1.9955775137293187e-06, "loss": 0.5741, "num_input_tokens_seen": 6270016, "step": 1985 }, { "epoch": 0.12739261250880227, "grad_norm": 50.933433532714844, "learning_rate": 1.9954719225730845e-06, "loss": 0.6124, "num_input_tokens_seen": 6285184, "step": 1990 }, { "epoch": 0.12771269444977915, "grad_norm": 47.980018615722656, "learning_rate": 1.9953650885584666e-06, "loss": 0.4774, "num_input_tokens_seen": 6300992, "step": 1995 }, { "epoch": 0.12803277639075603, "grad_norm": 15.962865829467773, "learning_rate": 1.995257011818849e-06, "loss": 0.5445, "num_input_tokens_seen": 6315392, "step": 2000 }, { "epoch": 0.12835285833173293, "grad_norm": 38.00539779663086, "learning_rate": 1.9951476924891666e-06, "loss": 0.4739, "num_input_tokens_seen": 6331136, "step": 2005 }, { "epoch": 0.1286729402727098, "grad_norm": 33.02660369873047, "learning_rate": 1.9950371307059056e-06, "loss": 0.5553, "num_input_tokens_seen": 6347584, "step": 2010 }, { "epoch": 0.1289930222136867, "grad_norm": 55.436187744140625, "learning_rate": 1.9949253266071036e-06, "loss": 0.5728, "num_input_tokens_seen": 6362560, "step": 2015 }, { "epoch": 0.1293131041546636, "grad_norm": 28.438800811767578, "learning_rate": 1.9948122803323503e-06, "loss": 0.5075, "num_input_tokens_seen": 6378304, "step": 2020 }, { "epoch": 0.12963318609564048, "grad_norm": 39.819091796875, "learning_rate": 1.9946979920227844e-06, "loss": 0.5147, "num_input_tokens_seen": 6393280, "step": 2025 }, { "epoch": 0.1299532680366174, "grad_norm": 61.551517486572266, "learning_rate": 1.994582461821096e-06, "loss": 0.5251, "num_input_tokens_seen": 6409472, "step": 2030 }, { "epoch": 0.13027334997759427, "grad_norm": 71.56808471679688, "learning_rate": 1.9944656898715267e-06, "loss": 0.7157, "num_input_tokens_seen": 6424960, "step": 2035 }, { "epoch": 0.13059343191857115, "grad_norm": 31.08255386352539, "learning_rate": 1.994347676319867e-06, "loss": 0.6057, "num_input_tokens_seen": 6440000, "step": 2040 }, { "epoch": 0.13091351385954805, "grad_norm": 24.44256019592285, "learning_rate": 1.994228421313459e-06, "loss": 0.453, "num_input_tokens_seen": 6457600, "step": 2045 }, { "epoch": 0.13123359580052493, "grad_norm": 39.8853759765625, "learning_rate": 1.994107925001193e-06, "loss": 0.5143, "num_input_tokens_seen": 6473088, "step": 2050 }, { "epoch": 0.1315536777415018, "grad_norm": 51.965187072753906, "learning_rate": 1.9939861875335108e-06, "loss": 0.6013, "num_input_tokens_seen": 6487680, "step": 2055 }, { "epoch": 0.13187375968247872, "grad_norm": 35.12892532348633, "learning_rate": 1.9938632090624025e-06, "loss": 0.4831, "num_input_tokens_seen": 6503296, "step": 2060 }, { "epoch": 0.1321938416234556, "grad_norm": 14.63175106048584, "learning_rate": 1.9937389897414087e-06, "loss": 0.5363, "num_input_tokens_seen": 6518912, "step": 2065 }, { "epoch": 0.1325139235644325, "grad_norm": 43.0014533996582, "learning_rate": 1.993613529725618e-06, "loss": 0.5631, "num_input_tokens_seen": 6534784, "step": 2070 }, { "epoch": 0.13283400550540939, "grad_norm": 51.7596435546875, "learning_rate": 1.99348682917167e-06, "loss": 0.5248, "num_input_tokens_seen": 6550528, "step": 2075 }, { "epoch": 0.13315408744638627, "grad_norm": 34.12824630737305, "learning_rate": 1.99335888823775e-06, "loss": 0.5344, "num_input_tokens_seen": 6566144, "step": 2080 }, { "epoch": 0.13347416938736317, "grad_norm": 51.184452056884766, "learning_rate": 1.993229707083595e-06, "loss": 0.5605, "num_input_tokens_seen": 6583872, "step": 2085 }, { "epoch": 0.13379425132834005, "grad_norm": 19.79715919494629, "learning_rate": 1.993099285870489e-06, "loss": 0.4144, "num_input_tokens_seen": 6602304, "step": 2090 }, { "epoch": 0.13411433326931693, "grad_norm": 41.58517074584961, "learning_rate": 1.992967624761264e-06, "loss": 0.4607, "num_input_tokens_seen": 6618112, "step": 2095 }, { "epoch": 0.13443441521029384, "grad_norm": 47.04132080078125, "learning_rate": 1.9928347239203014e-06, "loss": 0.6174, "num_input_tokens_seen": 6635584, "step": 2100 }, { "epoch": 0.13475449715127072, "grad_norm": 34.34235763549805, "learning_rate": 1.9927005835135282e-06, "loss": 0.5339, "num_input_tokens_seen": 6653568, "step": 2105 }, { "epoch": 0.13507457909224763, "grad_norm": 30.708681106567383, "learning_rate": 1.9925652037084214e-06, "loss": 0.4604, "num_input_tokens_seen": 6668864, "step": 2110 }, { "epoch": 0.1353946610332245, "grad_norm": 25.8023738861084, "learning_rate": 1.9924285846740037e-06, "loss": 0.4852, "num_input_tokens_seen": 6684416, "step": 2115 }, { "epoch": 0.13571474297420139, "grad_norm": 49.56015396118164, "learning_rate": 1.9922907265808452e-06, "loss": 0.5927, "num_input_tokens_seen": 6699392, "step": 2120 }, { "epoch": 0.1360348249151783, "grad_norm": 43.24879455566406, "learning_rate": 1.9921516296010643e-06, "loss": 0.5477, "num_input_tokens_seen": 6714560, "step": 2125 }, { "epoch": 0.13635490685615517, "grad_norm": 50.094120025634766, "learning_rate": 1.9920112939083246e-06, "loss": 0.5584, "num_input_tokens_seen": 6729920, "step": 2130 }, { "epoch": 0.13667498879713205, "grad_norm": 27.34825897216797, "learning_rate": 1.9918697196778367e-06, "loss": 0.5555, "num_input_tokens_seen": 6744768, "step": 2135 }, { "epoch": 0.13699507073810896, "grad_norm": 26.479101181030273, "learning_rate": 1.9917269070863578e-06, "loss": 0.4607, "num_input_tokens_seen": 6759680, "step": 2140 }, { "epoch": 0.13731515267908584, "grad_norm": 35.83186340332031, "learning_rate": 1.9915828563121915e-06, "loss": 0.5094, "num_input_tokens_seen": 6775168, "step": 2145 }, { "epoch": 0.13763523462006275, "grad_norm": 43.06388473510742, "learning_rate": 1.9914375675351865e-06, "loss": 0.5364, "num_input_tokens_seen": 6791296, "step": 2150 }, { "epoch": 0.13795531656103963, "grad_norm": 18.07638168334961, "learning_rate": 1.991291040936738e-06, "loss": 0.43, "num_input_tokens_seen": 6808640, "step": 2155 }, { "epoch": 0.1382753985020165, "grad_norm": 41.67695236206055, "learning_rate": 1.9911432766997857e-06, "loss": 0.6627, "num_input_tokens_seen": 6824064, "step": 2160 }, { "epoch": 0.1385954804429934, "grad_norm": 56.66114807128906, "learning_rate": 1.990994275008815e-06, "loss": 0.4426, "num_input_tokens_seen": 6839872, "step": 2165 }, { "epoch": 0.1389155623839703, "grad_norm": 58.23060989379883, "learning_rate": 1.9908440360498565e-06, "loss": 0.5081, "num_input_tokens_seen": 6855744, "step": 2170 }, { "epoch": 0.1392356443249472, "grad_norm": 45.5991096496582, "learning_rate": 1.990692560010485e-06, "loss": 0.5566, "num_input_tokens_seen": 6869632, "step": 2175 }, { "epoch": 0.13955572626592408, "grad_norm": 27.95288848876953, "learning_rate": 1.9905398470798206e-06, "loss": 0.448, "num_input_tokens_seen": 6885696, "step": 2180 }, { "epoch": 0.13987580820690096, "grad_norm": 29.043428421020508, "learning_rate": 1.990385897448527e-06, "loss": 0.3634, "num_input_tokens_seen": 6901504, "step": 2185 }, { "epoch": 0.14019589014787787, "grad_norm": 36.92293167114258, "learning_rate": 1.9902307113088114e-06, "loss": 0.5822, "num_input_tokens_seen": 6916480, "step": 2190 }, { "epoch": 0.14051597208885475, "grad_norm": 49.32163619995117, "learning_rate": 1.9900742888544264e-06, "loss": 0.4818, "num_input_tokens_seen": 6932416, "step": 2195 }, { "epoch": 0.14083605402983163, "grad_norm": 46.43427658081055, "learning_rate": 1.989916630280667e-06, "loss": 0.534, "num_input_tokens_seen": 6948992, "step": 2200 }, { "epoch": 0.14115613597080853, "grad_norm": 57.13213348388672, "learning_rate": 1.989757735784372e-06, "loss": 0.4636, "num_input_tokens_seen": 6964416, "step": 2205 }, { "epoch": 0.1414762179117854, "grad_norm": 39.992496490478516, "learning_rate": 1.989597605563923e-06, "loss": 0.4218, "num_input_tokens_seen": 6980544, "step": 2210 }, { "epoch": 0.14179629985276232, "grad_norm": 29.56856918334961, "learning_rate": 1.9894362398192437e-06, "loss": 0.5658, "num_input_tokens_seen": 6997440, "step": 2215 }, { "epoch": 0.1421163817937392, "grad_norm": 22.893774032592773, "learning_rate": 1.9892736387518023e-06, "loss": 0.4163, "num_input_tokens_seen": 7012672, "step": 2220 }, { "epoch": 0.14243646373471608, "grad_norm": 58.755828857421875, "learning_rate": 1.9891098025646075e-06, "loss": 0.4773, "num_input_tokens_seen": 7027648, "step": 2225 }, { "epoch": 0.142756545675693, "grad_norm": 28.471839904785156, "learning_rate": 1.9889447314622105e-06, "loss": 0.5303, "num_input_tokens_seen": 7043200, "step": 2230 }, { "epoch": 0.14307662761666987, "grad_norm": 41.83107376098633, "learning_rate": 1.9887784256507046e-06, "loss": 0.7152, "num_input_tokens_seen": 7058688, "step": 2235 }, { "epoch": 0.14339670955764675, "grad_norm": 32.69862365722656, "learning_rate": 1.988610885337725e-06, "loss": 0.6679, "num_input_tokens_seen": 7074048, "step": 2240 }, { "epoch": 0.14371679149862365, "grad_norm": 32.3195686340332, "learning_rate": 1.9884421107324476e-06, "loss": 0.5261, "num_input_tokens_seen": 7089792, "step": 2245 }, { "epoch": 0.14403687343960053, "grad_norm": 39.98912811279297, "learning_rate": 1.9882721020455893e-06, "loss": 0.4755, "num_input_tokens_seen": 7104640, "step": 2250 }, { "epoch": 0.14435695538057744, "grad_norm": 31.75237464904785, "learning_rate": 1.988100859489408e-06, "loss": 0.5019, "num_input_tokens_seen": 7120064, "step": 2255 }, { "epoch": 0.14467703732155432, "grad_norm": 34.01973342895508, "learning_rate": 1.9879283832777017e-06, "loss": 0.4754, "num_input_tokens_seen": 7135232, "step": 2260 }, { "epoch": 0.1449971192625312, "grad_norm": 48.68187713623047, "learning_rate": 1.9877546736258096e-06, "loss": 0.5075, "num_input_tokens_seen": 7149632, "step": 2265 }, { "epoch": 0.1453172012035081, "grad_norm": 38.213932037353516, "learning_rate": 1.98757973075061e-06, "loss": 0.4107, "num_input_tokens_seen": 7164352, "step": 2270 }, { "epoch": 0.14563728314448499, "grad_norm": 35.81578063964844, "learning_rate": 1.987403554870521e-06, "loss": 0.5188, "num_input_tokens_seen": 7179776, "step": 2275 }, { "epoch": 0.14595736508546187, "grad_norm": 36.673587799072266, "learning_rate": 1.9872261462055003e-06, "loss": 0.4212, "num_input_tokens_seen": 7194240, "step": 2280 }, { "epoch": 0.14627744702643877, "grad_norm": 21.095298767089844, "learning_rate": 1.987047504977045e-06, "loss": 0.4335, "num_input_tokens_seen": 7209472, "step": 2285 }, { "epoch": 0.14659752896741565, "grad_norm": 50.36029052734375, "learning_rate": 1.9868676314081902e-06, "loss": 0.414, "num_input_tokens_seen": 7225088, "step": 2290 }, { "epoch": 0.14691761090839256, "grad_norm": 74.21929931640625, "learning_rate": 1.9866865257235107e-06, "loss": 0.6901, "num_input_tokens_seen": 7240704, "step": 2295 }, { "epoch": 0.14723769284936944, "grad_norm": 29.289196014404297, "learning_rate": 1.9865041881491188e-06, "loss": 0.4177, "num_input_tokens_seen": 7256000, "step": 2300 }, { "epoch": 0.14755777479034632, "grad_norm": 50.457210540771484, "learning_rate": 1.9863206189126653e-06, "loss": 0.6016, "num_input_tokens_seen": 7270336, "step": 2305 }, { "epoch": 0.14787785673132323, "grad_norm": 49.66255187988281, "learning_rate": 1.9861358182433382e-06, "loss": 0.5612, "num_input_tokens_seen": 7285440, "step": 2310 }, { "epoch": 0.1481979386723001, "grad_norm": 37.03299331665039, "learning_rate": 1.9859497863718634e-06, "loss": 0.4711, "num_input_tokens_seen": 7301120, "step": 2315 }, { "epoch": 0.14851802061327699, "grad_norm": 22.66673469543457, "learning_rate": 1.985762523530504e-06, "loss": 0.5204, "num_input_tokens_seen": 7316416, "step": 2320 }, { "epoch": 0.1488381025542539, "grad_norm": 27.409502029418945, "learning_rate": 1.98557402995306e-06, "loss": 0.5051, "num_input_tokens_seen": 7332160, "step": 2325 }, { "epoch": 0.14915818449523077, "grad_norm": 40.534942626953125, "learning_rate": 1.985384305874868e-06, "loss": 0.7069, "num_input_tokens_seen": 7347776, "step": 2330 }, { "epoch": 0.14947826643620768, "grad_norm": 33.9571647644043, "learning_rate": 1.9851933515328e-06, "loss": 0.5467, "num_input_tokens_seen": 7363200, "step": 2335 }, { "epoch": 0.14979834837718456, "grad_norm": 40.87738037109375, "learning_rate": 1.985001167165265e-06, "loss": 0.4699, "num_input_tokens_seen": 7378752, "step": 2340 }, { "epoch": 0.15011843031816144, "grad_norm": 36.36539840698242, "learning_rate": 1.984807753012208e-06, "loss": 0.5165, "num_input_tokens_seen": 7393984, "step": 2345 }, { "epoch": 0.15018244670635683, "eval_loss": 0.5076366662979126, "eval_runtime": 49.1845, "eval_samples_per_second": 282.325, "eval_steps_per_second": 35.296, "num_input_tokens_seen": 7397056, "step": 2346 }, { "epoch": 0.15043851225913835, "grad_norm": 25.733684539794922, "learning_rate": 1.9846131093151086e-06, "loss": 0.5902, "num_input_tokens_seen": 7408832, "step": 2350 }, { "epoch": 0.15075859420011523, "grad_norm": 16.574737548828125, "learning_rate": 1.9844172363169808e-06, "loss": 0.4582, "num_input_tokens_seen": 7423040, "step": 2355 }, { "epoch": 0.15107867614109213, "grad_norm": 53.731632232666016, "learning_rate": 1.9842201342623756e-06, "loss": 0.5117, "num_input_tokens_seen": 7438464, "step": 2360 }, { "epoch": 0.151398758082069, "grad_norm": 28.75635528564453, "learning_rate": 1.9840218033973766e-06, "loss": 0.5205, "num_input_tokens_seen": 7453824, "step": 2365 }, { "epoch": 0.1517188400230459, "grad_norm": 36.89908981323242, "learning_rate": 1.9838222439696027e-06, "loss": 0.5717, "num_input_tokens_seen": 7469312, "step": 2370 }, { "epoch": 0.1520389219640228, "grad_norm": 53.630462646484375, "learning_rate": 1.9836214562282058e-06, "loss": 0.7065, "num_input_tokens_seen": 7485120, "step": 2375 }, { "epoch": 0.15235900390499968, "grad_norm": 36.9291877746582, "learning_rate": 1.9834194404238715e-06, "loss": 0.4971, "num_input_tokens_seen": 7500416, "step": 2380 }, { "epoch": 0.15267908584597656, "grad_norm": 41.09784698486328, "learning_rate": 1.9832161968088193e-06, "loss": 0.4125, "num_input_tokens_seen": 7516672, "step": 2385 }, { "epoch": 0.15299916778695347, "grad_norm": 53.901432037353516, "learning_rate": 1.9830117256368015e-06, "loss": 0.4764, "num_input_tokens_seen": 7532800, "step": 2390 }, { "epoch": 0.15331924972793035, "grad_norm": 38.6842041015625, "learning_rate": 1.982806027163102e-06, "loss": 0.4924, "num_input_tokens_seen": 7547776, "step": 2395 }, { "epoch": 0.15363933166890725, "grad_norm": 28.168846130371094, "learning_rate": 1.9825991016445386e-06, "loss": 0.5579, "num_input_tokens_seen": 7562496, "step": 2400 }, { "epoch": 0.15395941360988413, "grad_norm": 41.71428298950195, "learning_rate": 1.9823909493394594e-06, "loss": 0.5286, "num_input_tokens_seen": 7577920, "step": 2405 }, { "epoch": 0.154279495550861, "grad_norm": 41.26945114135742, "learning_rate": 1.9821815705077455e-06, "loss": 0.5331, "num_input_tokens_seen": 7593216, "step": 2410 }, { "epoch": 0.15459957749183792, "grad_norm": 63.113800048828125, "learning_rate": 1.9819709654108087e-06, "loss": 0.5768, "num_input_tokens_seen": 7608192, "step": 2415 }, { "epoch": 0.1549196594328148, "grad_norm": 51.21147537231445, "learning_rate": 1.981759134311592e-06, "loss": 0.4652, "num_input_tokens_seen": 7624448, "step": 2420 }, { "epoch": 0.15523974137379168, "grad_norm": 45.952392578125, "learning_rate": 1.981546077474569e-06, "loss": 0.4847, "num_input_tokens_seen": 7640192, "step": 2425 }, { "epoch": 0.15555982331476859, "grad_norm": 33.45967483520508, "learning_rate": 1.981331795165744e-06, "loss": 0.5143, "num_input_tokens_seen": 7654848, "step": 2430 }, { "epoch": 0.15587990525574547, "grad_norm": 64.05781555175781, "learning_rate": 1.9811162876526498e-06, "loss": 0.6067, "num_input_tokens_seen": 7670848, "step": 2435 }, { "epoch": 0.15619998719672237, "grad_norm": 28.034521102905273, "learning_rate": 1.9808995552043515e-06, "loss": 0.6387, "num_input_tokens_seen": 7686016, "step": 2440 }, { "epoch": 0.15652006913769925, "grad_norm": 33.880714416503906, "learning_rate": 1.9806815980914413e-06, "loss": 0.5478, "num_input_tokens_seen": 7701760, "step": 2445 }, { "epoch": 0.15684015107867613, "grad_norm": 35.8829231262207, "learning_rate": 1.9804624165860417e-06, "loss": 0.5624, "num_input_tokens_seen": 7717760, "step": 2450 }, { "epoch": 0.15716023301965304, "grad_norm": 17.44162368774414, "learning_rate": 1.9802420109618028e-06, "loss": 0.3852, "num_input_tokens_seen": 7733376, "step": 2455 }, { "epoch": 0.15748031496062992, "grad_norm": 15.591707229614258, "learning_rate": 1.980020381493904e-06, "loss": 0.4984, "num_input_tokens_seen": 7750464, "step": 2460 }, { "epoch": 0.1578003969016068, "grad_norm": 38.971927642822266, "learning_rate": 1.979797528459052e-06, "loss": 0.4942, "num_input_tokens_seen": 7768576, "step": 2465 }, { "epoch": 0.1581204788425837, "grad_norm": 44.95268249511719, "learning_rate": 1.979573452135482e-06, "loss": 0.5334, "num_input_tokens_seen": 7784256, "step": 2470 }, { "epoch": 0.15844056078356059, "grad_norm": 33.37703323364258, "learning_rate": 1.979348152802955e-06, "loss": 0.3186, "num_input_tokens_seen": 7799232, "step": 2475 }, { "epoch": 0.1587606427245375, "grad_norm": 48.24396896362305, "learning_rate": 1.979121630742761e-06, "loss": 0.592, "num_input_tokens_seen": 7815040, "step": 2480 }, { "epoch": 0.15908072466551437, "grad_norm": 18.306211471557617, "learning_rate": 1.9788938862377146e-06, "loss": 0.4479, "num_input_tokens_seen": 7830400, "step": 2485 }, { "epoch": 0.15940080660649125, "grad_norm": 30.219003677368164, "learning_rate": 1.9786649195721577e-06, "loss": 0.4818, "num_input_tokens_seen": 7846336, "step": 2490 }, { "epoch": 0.15972088854746816, "grad_norm": 42.44570541381836, "learning_rate": 1.978434731031958e-06, "loss": 0.6323, "num_input_tokens_seen": 7862528, "step": 2495 }, { "epoch": 0.16004097048844504, "grad_norm": 36.15270233154297, "learning_rate": 1.9782033209045085e-06, "loss": 0.4541, "num_input_tokens_seen": 7880000, "step": 2500 }, { "epoch": 0.16036105242942192, "grad_norm": 18.829133987426758, "learning_rate": 1.977970689478727e-06, "loss": 0.4053, "num_input_tokens_seen": 7895296, "step": 2505 }, { "epoch": 0.16068113437039883, "grad_norm": 54.07673645019531, "learning_rate": 1.9777368370450577e-06, "loss": 0.5884, "num_input_tokens_seen": 7911104, "step": 2510 }, { "epoch": 0.1610012163113757, "grad_norm": 31.81148910522461, "learning_rate": 1.9775017638954674e-06, "loss": 0.521, "num_input_tokens_seen": 7925952, "step": 2515 }, { "epoch": 0.1613212982523526, "grad_norm": 31.94769287109375, "learning_rate": 1.9772654703234476e-06, "loss": 0.5943, "num_input_tokens_seen": 7940928, "step": 2520 }, { "epoch": 0.1616413801933295, "grad_norm": 43.36374282836914, "learning_rate": 1.977027956624014e-06, "loss": 0.5665, "num_input_tokens_seen": 7955200, "step": 2525 }, { "epoch": 0.16196146213430637, "grad_norm": 40.16360855102539, "learning_rate": 1.9767892230937046e-06, "loss": 0.5819, "num_input_tokens_seen": 7970944, "step": 2530 }, { "epoch": 0.16228154407528328, "grad_norm": 57.72364044189453, "learning_rate": 1.976549270030581e-06, "loss": 0.4311, "num_input_tokens_seen": 7985856, "step": 2535 }, { "epoch": 0.16260162601626016, "grad_norm": 38.951045989990234, "learning_rate": 1.9763080977342286e-06, "loss": 0.4678, "num_input_tokens_seen": 8001088, "step": 2540 }, { "epoch": 0.16292170795723707, "grad_norm": 41.949275970458984, "learning_rate": 1.9760657065057527e-06, "loss": 0.4965, "num_input_tokens_seen": 8017856, "step": 2545 }, { "epoch": 0.16324178989821395, "grad_norm": 40.579071044921875, "learning_rate": 1.975822096647782e-06, "loss": 0.4527, "num_input_tokens_seen": 8033792, "step": 2550 }, { "epoch": 0.16356187183919083, "grad_norm": 38.93642807006836, "learning_rate": 1.975577268464466e-06, "loss": 0.4821, "num_input_tokens_seen": 8048256, "step": 2555 }, { "epoch": 0.16388195378016773, "grad_norm": 30.569536209106445, "learning_rate": 1.9753312222614765e-06, "loss": 0.5626, "num_input_tokens_seen": 8063680, "step": 2560 }, { "epoch": 0.1642020357211446, "grad_norm": 53.63691329956055, "learning_rate": 1.9750839583460036e-06, "loss": 0.4853, "num_input_tokens_seen": 8079744, "step": 2565 }, { "epoch": 0.1645221176621215, "grad_norm": 32.5906982421875, "learning_rate": 1.9748354770267603e-06, "loss": 0.502, "num_input_tokens_seen": 8094656, "step": 2570 }, { "epoch": 0.1648421996030984, "grad_norm": 24.61626434326172, "learning_rate": 1.9745857786139777e-06, "loss": 0.5116, "num_input_tokens_seen": 8110528, "step": 2575 }, { "epoch": 0.16516228154407528, "grad_norm": 48.395931243896484, "learning_rate": 1.974334863419408e-06, "loss": 0.6028, "num_input_tokens_seen": 8126720, "step": 2580 }, { "epoch": 0.1654823634850522, "grad_norm": 34.782806396484375, "learning_rate": 1.9740827317563212e-06, "loss": 0.518, "num_input_tokens_seen": 8141312, "step": 2585 }, { "epoch": 0.16580244542602907, "grad_norm": 35.59202575683594, "learning_rate": 1.973829383939507e-06, "loss": 0.4889, "num_input_tokens_seen": 8156736, "step": 2590 }, { "epoch": 0.16612252736700595, "grad_norm": 49.05874252319336, "learning_rate": 1.973574820285273e-06, "loss": 0.4987, "num_input_tokens_seen": 8172480, "step": 2595 }, { "epoch": 0.16644260930798285, "grad_norm": 39.507137298583984, "learning_rate": 1.9733190411114443e-06, "loss": 0.5702, "num_input_tokens_seen": 8188224, "step": 2600 }, { "epoch": 0.16676269124895973, "grad_norm": 36.02799987792969, "learning_rate": 1.9730620467373654e-06, "loss": 0.438, "num_input_tokens_seen": 8204352, "step": 2605 }, { "epoch": 0.1670827731899366, "grad_norm": 44.20855712890625, "learning_rate": 1.9728038374838958e-06, "loss": 0.5744, "num_input_tokens_seen": 8219328, "step": 2610 }, { "epoch": 0.16740285513091352, "grad_norm": 20.6259822845459, "learning_rate": 1.972544413673413e-06, "loss": 0.3913, "num_input_tokens_seen": 8234560, "step": 2615 }, { "epoch": 0.1677229370718904, "grad_norm": 28.986614227294922, "learning_rate": 1.9722837756298108e-06, "loss": 0.5779, "num_input_tokens_seen": 8249344, "step": 2620 }, { "epoch": 0.1680430190128673, "grad_norm": 53.51920700073242, "learning_rate": 1.972021923678499e-06, "loss": 0.5548, "num_input_tokens_seen": 8265600, "step": 2625 }, { "epoch": 0.16836310095384419, "grad_norm": 27.421762466430664, "learning_rate": 1.971758858146403e-06, "loss": 0.4861, "num_input_tokens_seen": 8280384, "step": 2630 }, { "epoch": 0.16868318289482107, "grad_norm": 41.67002868652344, "learning_rate": 1.9714945793619626e-06, "loss": 0.4897, "num_input_tokens_seen": 8295744, "step": 2635 }, { "epoch": 0.16900326483579797, "grad_norm": 27.60586929321289, "learning_rate": 1.971229087655133e-06, "loss": 0.5052, "num_input_tokens_seen": 8311680, "step": 2640 }, { "epoch": 0.16932334677677485, "grad_norm": 29.15129280090332, "learning_rate": 1.9709623833573842e-06, "loss": 0.4678, "num_input_tokens_seen": 8326592, "step": 2645 }, { "epoch": 0.16964342871775173, "grad_norm": 54.205875396728516, "learning_rate": 1.9706944668016994e-06, "loss": 0.4588, "num_input_tokens_seen": 8341632, "step": 2650 }, { "epoch": 0.16996351065872864, "grad_norm": 38.538326263427734, "learning_rate": 1.9704253383225756e-06, "loss": 0.4627, "num_input_tokens_seen": 8358400, "step": 2655 }, { "epoch": 0.17028359259970552, "grad_norm": 33.1207275390625, "learning_rate": 1.970154998256023e-06, "loss": 0.4845, "num_input_tokens_seen": 8374144, "step": 2660 }, { "epoch": 0.17060367454068243, "grad_norm": 35.72023010253906, "learning_rate": 1.9698834469395644e-06, "loss": 0.4215, "num_input_tokens_seen": 8389440, "step": 2665 }, { "epoch": 0.1709237564816593, "grad_norm": 33.63475036621094, "learning_rate": 1.969610684712234e-06, "loss": 0.5408, "num_input_tokens_seen": 8404672, "step": 2670 }, { "epoch": 0.17124383842263619, "grad_norm": 59.44383239746094, "learning_rate": 1.9693367119145794e-06, "loss": 0.5508, "num_input_tokens_seen": 8420096, "step": 2675 }, { "epoch": 0.1715639203636131, "grad_norm": 42.37469482421875, "learning_rate": 1.969061528888659e-06, "loss": 0.6684, "num_input_tokens_seen": 8436288, "step": 2680 }, { "epoch": 0.17188400230458997, "grad_norm": 23.906444549560547, "learning_rate": 1.9687851359780415e-06, "loss": 0.5401, "num_input_tokens_seen": 8452672, "step": 2685 }, { "epoch": 0.17220408424556685, "grad_norm": 19.489620208740234, "learning_rate": 1.968507533527807e-06, "loss": 0.4867, "num_input_tokens_seen": 8469120, "step": 2690 }, { "epoch": 0.17252416618654376, "grad_norm": 46.37827682495117, "learning_rate": 1.9682287218845455e-06, "loss": 0.4748, "num_input_tokens_seen": 8484736, "step": 2695 }, { "epoch": 0.17284424812752064, "grad_norm": 38.747093200683594, "learning_rate": 1.967948701396356e-06, "loss": 0.7367, "num_input_tokens_seen": 8500480, "step": 2700 }, { "epoch": 0.17316433006849755, "grad_norm": 28.16217803955078, "learning_rate": 1.9676674724128485e-06, "loss": 0.3977, "num_input_tokens_seen": 8514624, "step": 2705 }, { "epoch": 0.17348441200947443, "grad_norm": 19.507436752319336, "learning_rate": 1.9673850352851397e-06, "loss": 0.4543, "num_input_tokens_seen": 8529664, "step": 2710 }, { "epoch": 0.1738044939504513, "grad_norm": 31.663122177124023, "learning_rate": 1.967101390365856e-06, "loss": 0.5825, "num_input_tokens_seen": 8545280, "step": 2715 }, { "epoch": 0.1741245758914282, "grad_norm": 29.334657669067383, "learning_rate": 1.966816538009131e-06, "loss": 0.492, "num_input_tokens_seen": 8560384, "step": 2720 }, { "epoch": 0.1744446578324051, "grad_norm": 41.919986724853516, "learning_rate": 1.966530478570607e-06, "loss": 0.5425, "num_input_tokens_seen": 8576960, "step": 2725 }, { "epoch": 0.174764739773382, "grad_norm": 31.315555572509766, "learning_rate": 1.9662432124074325e-06, "loss": 0.4635, "num_input_tokens_seen": 8592384, "step": 2730 }, { "epoch": 0.17508482171435888, "grad_norm": 29.594783782958984, "learning_rate": 1.965954739878262e-06, "loss": 0.4836, "num_input_tokens_seen": 8609024, "step": 2735 }, { "epoch": 0.17540490365533576, "grad_norm": 46.86975860595703, "learning_rate": 1.965665061343257e-06, "loss": 0.4283, "num_input_tokens_seen": 8624768, "step": 2740 }, { "epoch": 0.17572498559631267, "grad_norm": 25.347562789916992, "learning_rate": 1.965374177164085e-06, "loss": 0.4646, "num_input_tokens_seen": 8640448, "step": 2745 }, { "epoch": 0.17604506753728955, "grad_norm": 27.5438232421875, "learning_rate": 1.9650820877039182e-06, "loss": 0.5427, "num_input_tokens_seen": 8655296, "step": 2750 }, { "epoch": 0.17636514947826643, "grad_norm": 69.62262725830078, "learning_rate": 1.9647887933274334e-06, "loss": 0.4878, "num_input_tokens_seen": 8671872, "step": 2755 }, { "epoch": 0.17668523141924333, "grad_norm": 21.517606735229492, "learning_rate": 1.9644942944008124e-06, "loss": 0.4822, "num_input_tokens_seen": 8687680, "step": 2760 }, { "epoch": 0.1770053133602202, "grad_norm": 57.37998962402344, "learning_rate": 1.96419859129174e-06, "loss": 0.5914, "num_input_tokens_seen": 8702912, "step": 2765 }, { "epoch": 0.17732539530119712, "grad_norm": 25.293439865112305, "learning_rate": 1.963901684369406e-06, "loss": 0.4702, "num_input_tokens_seen": 8718144, "step": 2770 }, { "epoch": 0.177645477242174, "grad_norm": 36.15742874145508, "learning_rate": 1.9636035740045013e-06, "loss": 0.4989, "num_input_tokens_seen": 8732992, "step": 2775 }, { "epoch": 0.17796555918315088, "grad_norm": 26.592554092407227, "learning_rate": 1.9633042605692207e-06, "loss": 0.6024, "num_input_tokens_seen": 8749056, "step": 2780 }, { "epoch": 0.17828564112412779, "grad_norm": 22.61241912841797, "learning_rate": 1.9630037444372597e-06, "loss": 0.4879, "num_input_tokens_seen": 8765184, "step": 2785 }, { "epoch": 0.17860572306510467, "grad_norm": 43.24379348754883, "learning_rate": 1.9627020259838177e-06, "loss": 0.4133, "num_input_tokens_seen": 8780480, "step": 2790 }, { "epoch": 0.17892580500608155, "grad_norm": 33.002906799316406, "learning_rate": 1.9623991055855925e-06, "loss": 0.5539, "num_input_tokens_seen": 8796352, "step": 2795 }, { "epoch": 0.17924588694705845, "grad_norm": 27.26972770690918, "learning_rate": 1.962094983620784e-06, "loss": 0.443, "num_input_tokens_seen": 8810688, "step": 2800 }, { "epoch": 0.17956596888803533, "grad_norm": 49.42767333984375, "learning_rate": 1.9617896604690925e-06, "loss": 0.4279, "num_input_tokens_seen": 8826304, "step": 2805 }, { "epoch": 0.17988605082901224, "grad_norm": 22.84317970275879, "learning_rate": 1.961483136511717e-06, "loss": 0.4628, "num_input_tokens_seen": 8841344, "step": 2810 }, { "epoch": 0.18020613276998912, "grad_norm": 47.95643997192383, "learning_rate": 1.9611754121313567e-06, "loss": 0.6058, "num_input_tokens_seen": 8857664, "step": 2815 }, { "epoch": 0.180526214710966, "grad_norm": 52.1284294128418, "learning_rate": 1.960866487712209e-06, "loss": 0.5762, "num_input_tokens_seen": 8873408, "step": 2820 }, { "epoch": 0.1808462966519429, "grad_norm": 31.013389587402344, "learning_rate": 1.9605563636399695e-06, "loss": 0.425, "num_input_tokens_seen": 8889472, "step": 2825 }, { "epoch": 0.18116637859291979, "grad_norm": 60.00368118286133, "learning_rate": 1.9602450403018315e-06, "loss": 0.5908, "num_input_tokens_seen": 8904640, "step": 2830 }, { "epoch": 0.18148646053389667, "grad_norm": 35.06608200073242, "learning_rate": 1.9599325180864864e-06, "loss": 0.4446, "num_input_tokens_seen": 8919680, "step": 2835 }, { "epoch": 0.18180654247487357, "grad_norm": 31.069002151489258, "learning_rate": 1.9596187973841216e-06, "loss": 0.4418, "num_input_tokens_seen": 8935360, "step": 2840 }, { "epoch": 0.18212662441585045, "grad_norm": 26.10578727722168, "learning_rate": 1.959303878586421e-06, "loss": 0.4892, "num_input_tokens_seen": 8951552, "step": 2845 }, { "epoch": 0.18244670635682736, "grad_norm": 42.628684997558594, "learning_rate": 1.9589877620865647e-06, "loss": 0.5694, "num_input_tokens_seen": 8968576, "step": 2850 }, { "epoch": 0.18276678829780424, "grad_norm": 27.467554092407227, "learning_rate": 1.9586704482792277e-06, "loss": 0.4559, "num_input_tokens_seen": 8983744, "step": 2855 }, { "epoch": 0.18308687023878112, "grad_norm": 30.344791412353516, "learning_rate": 1.95835193756058e-06, "loss": 0.4376, "num_input_tokens_seen": 8999040, "step": 2860 }, { "epoch": 0.18340695217975803, "grad_norm": 37.68637466430664, "learning_rate": 1.9580322303282858e-06, "loss": 0.4186, "num_input_tokens_seen": 9015872, "step": 2865 }, { "epoch": 0.1837270341207349, "grad_norm": 26.828548431396484, "learning_rate": 1.9577113269815038e-06, "loss": 0.4001, "num_input_tokens_seen": 9031744, "step": 2870 }, { "epoch": 0.18404711606171179, "grad_norm": 34.85321807861328, "learning_rate": 1.957389227920885e-06, "loss": 0.5877, "num_input_tokens_seen": 9047872, "step": 2875 }, { "epoch": 0.1843671980026887, "grad_norm": 33.741172790527344, "learning_rate": 1.957065933548574e-06, "loss": 0.5101, "num_input_tokens_seen": 9062976, "step": 2880 }, { "epoch": 0.18468727994366557, "grad_norm": 56.83228302001953, "learning_rate": 1.956741444268208e-06, "loss": 0.5899, "num_input_tokens_seen": 9078208, "step": 2885 }, { "epoch": 0.18500736188464248, "grad_norm": 30.513900756835938, "learning_rate": 1.9564157604849154e-06, "loss": 0.4744, "num_input_tokens_seen": 9094720, "step": 2890 }, { "epoch": 0.18532744382561936, "grad_norm": 28.41360092163086, "learning_rate": 1.9560888826053163e-06, "loss": 0.5274, "num_input_tokens_seen": 9110336, "step": 2895 }, { "epoch": 0.18564752576659624, "grad_norm": 25.244827270507812, "learning_rate": 1.9557608110375212e-06, "loss": 0.5573, "num_input_tokens_seen": 9126912, "step": 2900 }, { "epoch": 0.18596760770757315, "grad_norm": 26.246530532836914, "learning_rate": 1.955431546191132e-06, "loss": 0.549, "num_input_tokens_seen": 9142400, "step": 2905 }, { "epoch": 0.18628768964855003, "grad_norm": 44.32508087158203, "learning_rate": 1.95510108847724e-06, "loss": 0.5161, "num_input_tokens_seen": 9157184, "step": 2910 }, { "epoch": 0.1866077715895269, "grad_norm": 28.210281372070312, "learning_rate": 1.954769438308424e-06, "loss": 0.5237, "num_input_tokens_seen": 9173696, "step": 2915 }, { "epoch": 0.1869278535305038, "grad_norm": 36.434974670410156, "learning_rate": 1.954436596098754e-06, "loss": 0.4992, "num_input_tokens_seen": 9190080, "step": 2920 }, { "epoch": 0.1872479354714807, "grad_norm": 59.13997268676758, "learning_rate": 1.9541025622637875e-06, "loss": 0.5761, "num_input_tokens_seen": 9204352, "step": 2925 }, { "epoch": 0.1875680174124576, "grad_norm": 50.34525680541992, "learning_rate": 1.95376733722057e-06, "loss": 0.6098, "num_input_tokens_seen": 9219200, "step": 2930 }, { "epoch": 0.18788809935343448, "grad_norm": 33.083404541015625, "learning_rate": 1.9534309213876337e-06, "loss": 0.4702, "num_input_tokens_seen": 9233600, "step": 2935 }, { "epoch": 0.18820818129441136, "grad_norm": 40.38674545288086, "learning_rate": 1.953093315184997e-06, "loss": 0.4343, "num_input_tokens_seen": 9249536, "step": 2940 }, { "epoch": 0.18852826323538827, "grad_norm": 39.487579345703125, "learning_rate": 1.952754519034166e-06, "loss": 0.6391, "num_input_tokens_seen": 9264256, "step": 2945 }, { "epoch": 0.18884834517636515, "grad_norm": 58.533199310302734, "learning_rate": 1.9524145333581313e-06, "loss": 0.4487, "num_input_tokens_seen": 9279488, "step": 2950 }, { "epoch": 0.18916842711734205, "grad_norm": 26.437389373779297, "learning_rate": 1.952073358581369e-06, "loss": 0.5122, "num_input_tokens_seen": 9294336, "step": 2955 }, { "epoch": 0.18948850905831893, "grad_norm": 34.934356689453125, "learning_rate": 1.95173099512984e-06, "loss": 0.5552, "num_input_tokens_seen": 9309376, "step": 2960 }, { "epoch": 0.1898085909992958, "grad_norm": 22.976945877075195, "learning_rate": 1.9513874434309894e-06, "loss": 0.4579, "num_input_tokens_seen": 9324224, "step": 2965 }, { "epoch": 0.19012867294027272, "grad_norm": 27.009410858154297, "learning_rate": 1.951042703913745e-06, "loss": 0.4466, "num_input_tokens_seen": 9339136, "step": 2970 }, { "epoch": 0.1904487548812496, "grad_norm": 26.152063369750977, "learning_rate": 1.950696777008518e-06, "loss": 0.4491, "num_input_tokens_seen": 9354688, "step": 2975 }, { "epoch": 0.19076883682222648, "grad_norm": 23.096553802490234, "learning_rate": 1.9503496631472025e-06, "loss": 0.4917, "num_input_tokens_seen": 9369664, "step": 2980 }, { "epoch": 0.19108891876320339, "grad_norm": 42.896331787109375, "learning_rate": 1.9500013627631746e-06, "loss": 0.6324, "num_input_tokens_seen": 9384768, "step": 2985 }, { "epoch": 0.19140900070418027, "grad_norm": 34.10990524291992, "learning_rate": 1.949651876291291e-06, "loss": 0.3728, "num_input_tokens_seen": 9400320, "step": 2990 }, { "epoch": 0.19172908264515717, "grad_norm": 56.81764221191406, "learning_rate": 1.9493012041678894e-06, "loss": 0.4739, "num_input_tokens_seen": 9415872, "step": 2995 }, { "epoch": 0.19204916458613405, "grad_norm": 31.37006187438965, "learning_rate": 1.9489493468307883e-06, "loss": 0.6013, "num_input_tokens_seen": 9432704, "step": 3000 }, { "epoch": 0.19236924652711093, "grad_norm": 52.02330017089844, "learning_rate": 1.948596304719286e-06, "loss": 0.5159, "num_input_tokens_seen": 9448192, "step": 3005 }, { "epoch": 0.19268932846808784, "grad_norm": 44.85215759277344, "learning_rate": 1.9482420782741594e-06, "loss": 0.4322, "num_input_tokens_seen": 9464576, "step": 3010 }, { "epoch": 0.19300941040906472, "grad_norm": 30.883983612060547, "learning_rate": 1.9478866679376647e-06, "loss": 0.5546, "num_input_tokens_seen": 9479936, "step": 3015 }, { "epoch": 0.1933294923500416, "grad_norm": 29.6319637298584, "learning_rate": 1.9475300741535353e-06, "loss": 0.5447, "num_input_tokens_seen": 9497280, "step": 3020 }, { "epoch": 0.1936495742910185, "grad_norm": 36.820396423339844, "learning_rate": 1.9471722973669833e-06, "loss": 0.4568, "num_input_tokens_seen": 9514496, "step": 3025 }, { "epoch": 0.19396965623199539, "grad_norm": 23.96208953857422, "learning_rate": 1.946813338024697e-06, "loss": 0.3932, "num_input_tokens_seen": 9529536, "step": 3030 }, { "epoch": 0.1942897381729723, "grad_norm": 55.99610137939453, "learning_rate": 1.9464531965748414e-06, "loss": 0.526, "num_input_tokens_seen": 9545472, "step": 3035 }, { "epoch": 0.19460982011394917, "grad_norm": 39.6732292175293, "learning_rate": 1.9460918734670573e-06, "loss": 0.585, "num_input_tokens_seen": 9560960, "step": 3040 }, { "epoch": 0.19492990205492605, "grad_norm": 29.82390022277832, "learning_rate": 1.945729369152461e-06, "loss": 0.5221, "num_input_tokens_seen": 9576320, "step": 3045 }, { "epoch": 0.19524998399590296, "grad_norm": 44.162254333496094, "learning_rate": 1.945365684083643e-06, "loss": 0.5632, "num_input_tokens_seen": 9592192, "step": 3050 }, { "epoch": 0.19557006593687984, "grad_norm": 52.55691146850586, "learning_rate": 1.945000818714668e-06, "loss": 0.6164, "num_input_tokens_seen": 9608128, "step": 3055 }, { "epoch": 0.19589014787785672, "grad_norm": 27.91643714904785, "learning_rate": 1.944634773501076e-06, "loss": 0.5338, "num_input_tokens_seen": 9623872, "step": 3060 }, { "epoch": 0.19621022981883363, "grad_norm": 51.04069900512695, "learning_rate": 1.9442675488998783e-06, "loss": 0.5496, "num_input_tokens_seen": 9639488, "step": 3065 }, { "epoch": 0.1965303117598105, "grad_norm": 28.205469131469727, "learning_rate": 1.9438991453695587e-06, "loss": 0.4913, "num_input_tokens_seen": 9655680, "step": 3070 }, { "epoch": 0.1968503937007874, "grad_norm": 36.26915740966797, "learning_rate": 1.943529563370073e-06, "loss": 0.5489, "num_input_tokens_seen": 9670400, "step": 3075 }, { "epoch": 0.1971704756417643, "grad_norm": 21.7237606048584, "learning_rate": 1.9431588033628495e-06, "loss": 0.3868, "num_input_tokens_seen": 9685504, "step": 3080 }, { "epoch": 0.19749055758274117, "grad_norm": 44.26191329956055, "learning_rate": 1.9427868658107862e-06, "loss": 0.635, "num_input_tokens_seen": 9701952, "step": 3085 }, { "epoch": 0.19781063952371808, "grad_norm": 22.945430755615234, "learning_rate": 1.942413751178251e-06, "loss": 0.4485, "num_input_tokens_seen": 9716928, "step": 3090 }, { "epoch": 0.19813072146469496, "grad_norm": 55.33934783935547, "learning_rate": 1.9420394599310826e-06, "loss": 0.6516, "num_input_tokens_seen": 9732096, "step": 3095 }, { "epoch": 0.19845080340567184, "grad_norm": 27.51698112487793, "learning_rate": 1.941663992536588e-06, "loss": 0.5307, "num_input_tokens_seen": 9747648, "step": 3100 }, { "epoch": 0.19877088534664875, "grad_norm": 14.455513954162598, "learning_rate": 1.941287349463542e-06, "loss": 0.4371, "num_input_tokens_seen": 9763072, "step": 3105 }, { "epoch": 0.19909096728762563, "grad_norm": 28.985132217407227, "learning_rate": 1.940909531182188e-06, "loss": 0.4726, "num_input_tokens_seen": 9778176, "step": 3110 }, { "epoch": 0.19941104922860253, "grad_norm": 45.77129364013672, "learning_rate": 1.9405305381642375e-06, "loss": 0.6129, "num_input_tokens_seen": 9793536, "step": 3115 }, { "epoch": 0.1997311311695794, "grad_norm": 24.09324836730957, "learning_rate": 1.9401503708828665e-06, "loss": 0.4986, "num_input_tokens_seen": 9808192, "step": 3120 }, { "epoch": 0.2000512131105563, "grad_norm": 32.09850311279297, "learning_rate": 1.939769029812719e-06, "loss": 0.5774, "num_input_tokens_seen": 9823232, "step": 3125 }, { "epoch": 0.20024326227514244, "eval_loss": 0.48840755224227905, "eval_runtime": 49.2154, "eval_samples_per_second": 282.148, "eval_steps_per_second": 35.274, "num_input_tokens_seen": 9832064, "step": 3128 }, { "epoch": 0.2003712950515332, "grad_norm": 42.523658752441406, "learning_rate": 1.939386515429904e-06, "loss": 0.5893, "num_input_tokens_seen": 9839488, "step": 3130 }, { "epoch": 0.20069137699251008, "grad_norm": 20.936914443969727, "learning_rate": 1.9390028282119942e-06, "loss": 0.421, "num_input_tokens_seen": 9856192, "step": 3135 }, { "epoch": 0.201011458933487, "grad_norm": 37.659271240234375, "learning_rate": 1.938617968638029e-06, "loss": 0.5122, "num_input_tokens_seen": 9871552, "step": 3140 }, { "epoch": 0.20133154087446387, "grad_norm": 40.56658172607422, "learning_rate": 1.938231937188509e-06, "loss": 0.5077, "num_input_tokens_seen": 9886016, "step": 3145 }, { "epoch": 0.20165162281544075, "grad_norm": 43.369693756103516, "learning_rate": 1.9378447343453995e-06, "loss": 0.6156, "num_input_tokens_seen": 9903552, "step": 3150 }, { "epoch": 0.20197170475641765, "grad_norm": 43.882118225097656, "learning_rate": 1.9374563605921275e-06, "loss": 0.3458, "num_input_tokens_seen": 9920320, "step": 3155 }, { "epoch": 0.20229178669739453, "grad_norm": 30.69708251953125, "learning_rate": 1.937066816413582e-06, "loss": 0.5926, "num_input_tokens_seen": 9935936, "step": 3160 }, { "epoch": 0.2026118686383714, "grad_norm": 30.447908401489258, "learning_rate": 1.9366761022961146e-06, "loss": 0.4757, "num_input_tokens_seen": 9950912, "step": 3165 }, { "epoch": 0.20293195057934832, "grad_norm": 40.40016174316406, "learning_rate": 1.9362842187275354e-06, "loss": 0.5615, "num_input_tokens_seen": 9966080, "step": 3170 }, { "epoch": 0.2032520325203252, "grad_norm": 29.704164505004883, "learning_rate": 1.9358911661971155e-06, "loss": 0.4789, "num_input_tokens_seen": 9982080, "step": 3175 }, { "epoch": 0.2035721144613021, "grad_norm": 28.506755828857422, "learning_rate": 1.9354969451955864e-06, "loss": 0.4647, "num_input_tokens_seen": 9996544, "step": 3180 }, { "epoch": 0.20389219640227899, "grad_norm": 27.22804832458496, "learning_rate": 1.9351015562151375e-06, "loss": 0.5497, "num_input_tokens_seen": 10011776, "step": 3185 }, { "epoch": 0.20421227834325587, "grad_norm": 25.4746150970459, "learning_rate": 1.934704999749416e-06, "loss": 0.4331, "num_input_tokens_seen": 10027264, "step": 3190 }, { "epoch": 0.20453236028423277, "grad_norm": 23.414485931396484, "learning_rate": 1.9343072762935274e-06, "loss": 0.4203, "num_input_tokens_seen": 10042432, "step": 3195 }, { "epoch": 0.20485244222520965, "grad_norm": 28.72736167907715, "learning_rate": 1.933908386344035e-06, "loss": 0.4135, "num_input_tokens_seen": 10057792, "step": 3200 }, { "epoch": 0.20517252416618653, "grad_norm": 33.854576110839844, "learning_rate": 1.9335083303989565e-06, "loss": 0.5222, "num_input_tokens_seen": 10074752, "step": 3205 }, { "epoch": 0.20549260610716344, "grad_norm": 37.276336669921875, "learning_rate": 1.9331071089577674e-06, "loss": 0.576, "num_input_tokens_seen": 10090752, "step": 3210 }, { "epoch": 0.20581268804814032, "grad_norm": 51.40751647949219, "learning_rate": 1.9327047225213963e-06, "loss": 0.4961, "num_input_tokens_seen": 10106240, "step": 3215 }, { "epoch": 0.20613276998911723, "grad_norm": 35.04685974121094, "learning_rate": 1.9323011715922283e-06, "loss": 0.4128, "num_input_tokens_seen": 10121856, "step": 3220 }, { "epoch": 0.2064528519300941, "grad_norm": 67.41058349609375, "learning_rate": 1.931896456674101e-06, "loss": 0.4764, "num_input_tokens_seen": 10137408, "step": 3225 }, { "epoch": 0.20677293387107099, "grad_norm": 32.64918899536133, "learning_rate": 1.931490578272306e-06, "loss": 0.4548, "num_input_tokens_seen": 10152640, "step": 3230 }, { "epoch": 0.2070930158120479, "grad_norm": 33.72087097167969, "learning_rate": 1.9310835368935867e-06, "loss": 0.3538, "num_input_tokens_seen": 10167936, "step": 3235 }, { "epoch": 0.20741309775302477, "grad_norm": 36.13018035888672, "learning_rate": 1.93067533304614e-06, "loss": 0.4205, "num_input_tokens_seen": 10183360, "step": 3240 }, { "epoch": 0.20773317969400165, "grad_norm": 29.964752197265625, "learning_rate": 1.9302659672396128e-06, "loss": 0.5557, "num_input_tokens_seen": 10198208, "step": 3245 }, { "epoch": 0.20805326163497856, "grad_norm": 27.227624893188477, "learning_rate": 1.9298554399851025e-06, "loss": 0.4903, "num_input_tokens_seen": 10213568, "step": 3250 }, { "epoch": 0.20837334357595544, "grad_norm": 37.30453109741211, "learning_rate": 1.929443751795158e-06, "loss": 0.4833, "num_input_tokens_seen": 10230080, "step": 3255 }, { "epoch": 0.20869342551693235, "grad_norm": 23.320819854736328, "learning_rate": 1.929030903183776e-06, "loss": 0.4759, "num_input_tokens_seen": 10246912, "step": 3260 }, { "epoch": 0.20901350745790923, "grad_norm": 42.66804885864258, "learning_rate": 1.9286168946664033e-06, "loss": 0.5368, "num_input_tokens_seen": 10262464, "step": 3265 }, { "epoch": 0.2093335893988861, "grad_norm": 60.002376556396484, "learning_rate": 1.9282017267599352e-06, "loss": 0.6679, "num_input_tokens_seen": 10278016, "step": 3270 }, { "epoch": 0.209653671339863, "grad_norm": 42.901100158691406, "learning_rate": 1.9277853999827125e-06, "loss": 0.5054, "num_input_tokens_seen": 10293824, "step": 3275 }, { "epoch": 0.2099737532808399, "grad_norm": 44.74653244018555, "learning_rate": 1.9273679148545244e-06, "loss": 0.5116, "num_input_tokens_seen": 10309568, "step": 3280 }, { "epoch": 0.21029383522181677, "grad_norm": 33.75946044921875, "learning_rate": 1.9269492718966062e-06, "loss": 0.4229, "num_input_tokens_seen": 10325696, "step": 3285 }, { "epoch": 0.21061391716279368, "grad_norm": 30.77555274963379, "learning_rate": 1.9265294716316384e-06, "loss": 0.5261, "num_input_tokens_seen": 10342016, "step": 3290 }, { "epoch": 0.21093399910377056, "grad_norm": 29.430330276489258, "learning_rate": 1.926108514583747e-06, "loss": 0.4688, "num_input_tokens_seen": 10357632, "step": 3295 }, { "epoch": 0.21125408104474747, "grad_norm": 50.258575439453125, "learning_rate": 1.925686401278501e-06, "loss": 0.4801, "num_input_tokens_seen": 10373056, "step": 3300 }, { "epoch": 0.21157416298572435, "grad_norm": 61.20192337036133, "learning_rate": 1.9252631322429143e-06, "loss": 0.6373, "num_input_tokens_seen": 10389248, "step": 3305 }, { "epoch": 0.21189424492670123, "grad_norm": 23.071653366088867, "learning_rate": 1.9248387080054435e-06, "loss": 0.439, "num_input_tokens_seen": 10404864, "step": 3310 }, { "epoch": 0.21221432686767813, "grad_norm": 18.74202537536621, "learning_rate": 1.9244131290959864e-06, "loss": 0.4878, "num_input_tokens_seen": 10420416, "step": 3315 }, { "epoch": 0.212534408808655, "grad_norm": 33.07780075073242, "learning_rate": 1.9239863960458845e-06, "loss": 0.4244, "num_input_tokens_seen": 10435456, "step": 3320 }, { "epoch": 0.21285449074963192, "grad_norm": 31.487497329711914, "learning_rate": 1.923558509387918e-06, "loss": 0.4881, "num_input_tokens_seen": 10451584, "step": 3325 }, { "epoch": 0.2131745726906088, "grad_norm": 37.91923904418945, "learning_rate": 1.9231294696563086e-06, "loss": 0.3745, "num_input_tokens_seen": 10467584, "step": 3330 }, { "epoch": 0.21349465463158568, "grad_norm": 34.82919692993164, "learning_rate": 1.922699277386718e-06, "loss": 0.4146, "num_input_tokens_seen": 10483264, "step": 3335 }, { "epoch": 0.21381473657256259, "grad_norm": 34.857810974121094, "learning_rate": 1.9222679331162454e-06, "loss": 0.5865, "num_input_tokens_seen": 10498560, "step": 3340 }, { "epoch": 0.21413481851353947, "grad_norm": 37.536800384521484, "learning_rate": 1.92183543738343e-06, "loss": 0.4515, "num_input_tokens_seen": 10514176, "step": 3345 }, { "epoch": 0.21445490045451635, "grad_norm": 25.721649169921875, "learning_rate": 1.9214017907282475e-06, "loss": 0.4363, "num_input_tokens_seen": 10529792, "step": 3350 }, { "epoch": 0.21477498239549325, "grad_norm": 37.20597457885742, "learning_rate": 1.9209669936921105e-06, "loss": 0.4809, "num_input_tokens_seen": 10545856, "step": 3355 }, { "epoch": 0.21509506433647013, "grad_norm": 43.397335052490234, "learning_rate": 1.920531046817869e-06, "loss": 0.4092, "num_input_tokens_seen": 10562368, "step": 3360 }, { "epoch": 0.21541514627744704, "grad_norm": 44.76917266845703, "learning_rate": 1.9200939506498067e-06, "loss": 0.6238, "num_input_tokens_seen": 10577280, "step": 3365 }, { "epoch": 0.21573522821842392, "grad_norm": 30.43206214904785, "learning_rate": 1.9196557057336446e-06, "loss": 0.5817, "num_input_tokens_seen": 10592384, "step": 3370 }, { "epoch": 0.2160553101594008, "grad_norm": 24.50318145751953, "learning_rate": 1.9192163126165354e-06, "loss": 0.4498, "num_input_tokens_seen": 10608704, "step": 3375 }, { "epoch": 0.2163753921003777, "grad_norm": 43.877662658691406, "learning_rate": 1.9187757718470673e-06, "loss": 0.3997, "num_input_tokens_seen": 10625280, "step": 3380 }, { "epoch": 0.21669547404135459, "grad_norm": 22.60622215270996, "learning_rate": 1.9183340839752606e-06, "loss": 0.5339, "num_input_tokens_seen": 10641152, "step": 3385 }, { "epoch": 0.21701555598233147, "grad_norm": 28.090923309326172, "learning_rate": 1.9178912495525672e-06, "loss": 0.4193, "num_input_tokens_seen": 10657472, "step": 3390 }, { "epoch": 0.21733563792330837, "grad_norm": 24.062137603759766, "learning_rate": 1.917447269131872e-06, "loss": 0.5054, "num_input_tokens_seen": 10673600, "step": 3395 }, { "epoch": 0.21765571986428525, "grad_norm": 35.8740119934082, "learning_rate": 1.917002143267489e-06, "loss": 0.5693, "num_input_tokens_seen": 10689344, "step": 3400 }, { "epoch": 0.21797580180526216, "grad_norm": 29.342618942260742, "learning_rate": 1.9165558725151633e-06, "loss": 0.4478, "num_input_tokens_seen": 10704384, "step": 3405 }, { "epoch": 0.21829588374623904, "grad_norm": 56.710784912109375, "learning_rate": 1.9161084574320692e-06, "loss": 0.5002, "num_input_tokens_seen": 10720512, "step": 3410 }, { "epoch": 0.21861596568721592, "grad_norm": 31.751296997070312, "learning_rate": 1.91565989857681e-06, "loss": 0.4727, "num_input_tokens_seen": 10735744, "step": 3415 }, { "epoch": 0.21893604762819283, "grad_norm": 31.050350189208984, "learning_rate": 1.9152101965094162e-06, "loss": 0.4573, "num_input_tokens_seen": 10750848, "step": 3420 }, { "epoch": 0.2192561295691697, "grad_norm": 42.99034881591797, "learning_rate": 1.9147593517913464e-06, "loss": 0.4878, "num_input_tokens_seen": 10765632, "step": 3425 }, { "epoch": 0.21957621151014659, "grad_norm": 17.069164276123047, "learning_rate": 1.914307364985485e-06, "loss": 0.3856, "num_input_tokens_seen": 10780928, "step": 3430 }, { "epoch": 0.2198962934511235, "grad_norm": 24.95672607421875, "learning_rate": 1.913854236656144e-06, "loss": 0.4217, "num_input_tokens_seen": 10796864, "step": 3435 }, { "epoch": 0.22021637539210037, "grad_norm": 39.41409683227539, "learning_rate": 1.9133999673690584e-06, "loss": 0.4653, "num_input_tokens_seen": 10812672, "step": 3440 }, { "epoch": 0.22053645733307728, "grad_norm": 44.56681442260742, "learning_rate": 1.9129445576913886e-06, "loss": 0.4709, "num_input_tokens_seen": 10828544, "step": 3445 }, { "epoch": 0.22085653927405416, "grad_norm": 23.38069725036621, "learning_rate": 1.91248800819172e-06, "loss": 0.5335, "num_input_tokens_seen": 10844288, "step": 3450 }, { "epoch": 0.22117662121503104, "grad_norm": 48.04047775268555, "learning_rate": 1.912030319440059e-06, "loss": 0.5192, "num_input_tokens_seen": 10860160, "step": 3455 }, { "epoch": 0.22149670315600795, "grad_norm": 36.49208068847656, "learning_rate": 1.9115714920078354e-06, "loss": 0.6043, "num_input_tokens_seen": 10875968, "step": 3460 }, { "epoch": 0.22181678509698483, "grad_norm": 25.53341293334961, "learning_rate": 1.9111115264679017e-06, "loss": 0.3252, "num_input_tokens_seen": 10892096, "step": 3465 }, { "epoch": 0.2221368670379617, "grad_norm": 45.4945068359375, "learning_rate": 1.910650423394529e-06, "loss": 0.4378, "num_input_tokens_seen": 10908544, "step": 3470 }, { "epoch": 0.2224569489789386, "grad_norm": 45.49387741088867, "learning_rate": 1.910188183363411e-06, "loss": 0.4817, "num_input_tokens_seen": 10924544, "step": 3475 }, { "epoch": 0.2227770309199155, "grad_norm": 50.44002151489258, "learning_rate": 1.909724806951659e-06, "loss": 0.4441, "num_input_tokens_seen": 10941888, "step": 3480 }, { "epoch": 0.2230971128608924, "grad_norm": 50.978633880615234, "learning_rate": 1.909260294737804e-06, "loss": 0.4669, "num_input_tokens_seen": 10958592, "step": 3485 }, { "epoch": 0.22341719480186928, "grad_norm": 80.07136535644531, "learning_rate": 1.9087946473017953e-06, "loss": 0.555, "num_input_tokens_seen": 10974208, "step": 3490 }, { "epoch": 0.22373727674284616, "grad_norm": 33.776737213134766, "learning_rate": 1.9083278652249992e-06, "loss": 0.4304, "num_input_tokens_seen": 10988928, "step": 3495 }, { "epoch": 0.22405735868382307, "grad_norm": 35.86427307128906, "learning_rate": 1.9078599490901983e-06, "loss": 0.425, "num_input_tokens_seen": 11005952, "step": 3500 }, { "epoch": 0.22437744062479995, "grad_norm": 51.98170852661133, "learning_rate": 1.9073908994815914e-06, "loss": 0.3971, "num_input_tokens_seen": 11020608, "step": 3505 }, { "epoch": 0.22469752256577685, "grad_norm": 46.34355926513672, "learning_rate": 1.9069207169847928e-06, "loss": 0.4862, "num_input_tokens_seen": 11036736, "step": 3510 }, { "epoch": 0.22501760450675373, "grad_norm": 33.5971794128418, "learning_rate": 1.9064494021868302e-06, "loss": 0.3584, "num_input_tokens_seen": 11052480, "step": 3515 }, { "epoch": 0.2253376864477306, "grad_norm": 39.64836120605469, "learning_rate": 1.9059769556761464e-06, "loss": 0.48, "num_input_tokens_seen": 11068416, "step": 3520 }, { "epoch": 0.22565776838870752, "grad_norm": 31.865467071533203, "learning_rate": 1.9055033780425962e-06, "loss": 0.4454, "num_input_tokens_seen": 11086400, "step": 3525 }, { "epoch": 0.2259778503296844, "grad_norm": 88.44284057617188, "learning_rate": 1.9050286698774464e-06, "loss": 0.562, "num_input_tokens_seen": 11102848, "step": 3530 }, { "epoch": 0.22629793227066128, "grad_norm": 41.320526123046875, "learning_rate": 1.904552831773376e-06, "loss": 0.5359, "num_input_tokens_seen": 11118080, "step": 3535 }, { "epoch": 0.22661801421163819, "grad_norm": 24.0659236907959, "learning_rate": 1.9040758643244748e-06, "loss": 0.4967, "num_input_tokens_seen": 11133120, "step": 3540 }, { "epoch": 0.22693809615261507, "grad_norm": 31.473848342895508, "learning_rate": 1.903597768126242e-06, "loss": 0.4694, "num_input_tokens_seen": 11150144, "step": 3545 }, { "epoch": 0.22725817809359197, "grad_norm": 58.51475143432617, "learning_rate": 1.9031185437755862e-06, "loss": 0.4787, "num_input_tokens_seen": 11165760, "step": 3550 }, { "epoch": 0.22757826003456885, "grad_norm": 52.226993560791016, "learning_rate": 1.9026381918708246e-06, "loss": 0.4582, "num_input_tokens_seen": 11180096, "step": 3555 }, { "epoch": 0.22789834197554573, "grad_norm": 19.623682022094727, "learning_rate": 1.9021567130116822e-06, "loss": 0.3618, "num_input_tokens_seen": 11195584, "step": 3560 }, { "epoch": 0.22821842391652264, "grad_norm": 59.500858306884766, "learning_rate": 1.9016741077992916e-06, "loss": 0.3909, "num_input_tokens_seen": 11210944, "step": 3565 }, { "epoch": 0.22853850585749952, "grad_norm": 27.949474334716797, "learning_rate": 1.90119037683619e-06, "loss": 0.4052, "num_input_tokens_seen": 11227392, "step": 3570 }, { "epoch": 0.2288585877984764, "grad_norm": 26.94727325439453, "learning_rate": 1.9007055207263223e-06, "loss": 0.6492, "num_input_tokens_seen": 11244416, "step": 3575 }, { "epoch": 0.2291786697394533, "grad_norm": 28.7558536529541, "learning_rate": 1.900219540075036e-06, "loss": 0.3588, "num_input_tokens_seen": 11260672, "step": 3580 }, { "epoch": 0.22949875168043019, "grad_norm": 55.45866775512695, "learning_rate": 1.8997324354890845e-06, "loss": 0.4749, "num_input_tokens_seen": 11277504, "step": 3585 }, { "epoch": 0.2298188336214071, "grad_norm": 110.20696258544922, "learning_rate": 1.8992442075766233e-06, "loss": 0.539, "num_input_tokens_seen": 11293184, "step": 3590 }, { "epoch": 0.23013891556238397, "grad_norm": 29.66388702392578, "learning_rate": 1.8987548569472105e-06, "loss": 0.3191, "num_input_tokens_seen": 11308480, "step": 3595 }, { "epoch": 0.23045899750336085, "grad_norm": 31.010486602783203, "learning_rate": 1.8982643842118064e-06, "loss": 0.396, "num_input_tokens_seen": 11323840, "step": 3600 }, { "epoch": 0.23077907944433776, "grad_norm": 63.96700668334961, "learning_rate": 1.8977727899827716e-06, "loss": 0.5821, "num_input_tokens_seen": 11339456, "step": 3605 }, { "epoch": 0.23109916138531464, "grad_norm": 50.296600341796875, "learning_rate": 1.8972800748738678e-06, "loss": 0.6554, "num_input_tokens_seen": 11354880, "step": 3610 }, { "epoch": 0.23141924332629152, "grad_norm": 27.36386489868164, "learning_rate": 1.896786239500255e-06, "loss": 0.5226, "num_input_tokens_seen": 11369984, "step": 3615 }, { "epoch": 0.23173932526726843, "grad_norm": 51.205718994140625, "learning_rate": 1.8962912844784928e-06, "loss": 0.429, "num_input_tokens_seen": 11384640, "step": 3620 }, { "epoch": 0.2320594072082453, "grad_norm": 53.744346618652344, "learning_rate": 1.8957952104265384e-06, "loss": 0.4945, "num_input_tokens_seen": 11401152, "step": 3625 }, { "epoch": 0.2323794891492222, "grad_norm": 32.322486877441406, "learning_rate": 1.8952980179637458e-06, "loss": 0.4535, "num_input_tokens_seen": 11416896, "step": 3630 }, { "epoch": 0.2326995710901991, "grad_norm": 34.96129608154297, "learning_rate": 1.8947997077108662e-06, "loss": 0.4899, "num_input_tokens_seen": 11432832, "step": 3635 }, { "epoch": 0.23301965303117597, "grad_norm": 30.439565658569336, "learning_rate": 1.894300280290045e-06, "loss": 0.4807, "num_input_tokens_seen": 11448320, "step": 3640 }, { "epoch": 0.23333973497215288, "grad_norm": 23.5026912689209, "learning_rate": 1.8937997363248237e-06, "loss": 0.5674, "num_input_tokens_seen": 11463488, "step": 3645 }, { "epoch": 0.23365981691312976, "grad_norm": 20.100936889648438, "learning_rate": 1.8932980764401373e-06, "loss": 0.4527, "num_input_tokens_seen": 11478592, "step": 3650 }, { "epoch": 0.23397989885410664, "grad_norm": 24.669857025146484, "learning_rate": 1.8927953012623141e-06, "loss": 0.3564, "num_input_tokens_seen": 11494720, "step": 3655 }, { "epoch": 0.23429998079508355, "grad_norm": 56.09657287597656, "learning_rate": 1.8922914114190744e-06, "loss": 0.4846, "num_input_tokens_seen": 11511232, "step": 3660 }, { "epoch": 0.23462006273606043, "grad_norm": 31.37401008605957, "learning_rate": 1.8917864075395312e-06, "loss": 0.5093, "num_input_tokens_seen": 11527040, "step": 3665 }, { "epoch": 0.23494014467703733, "grad_norm": 18.777942657470703, "learning_rate": 1.8912802902541873e-06, "loss": 0.4461, "num_input_tokens_seen": 11542528, "step": 3670 }, { "epoch": 0.2352602266180142, "grad_norm": 37.04750442504883, "learning_rate": 1.8907730601949362e-06, "loss": 0.4974, "num_input_tokens_seen": 11557696, "step": 3675 }, { "epoch": 0.2355803085589911, "grad_norm": 50.14651870727539, "learning_rate": 1.8902647179950608e-06, "loss": 0.4648, "num_input_tokens_seen": 11574848, "step": 3680 }, { "epoch": 0.235900390499968, "grad_norm": 52.763484954833984, "learning_rate": 1.889755264289232e-06, "loss": 0.5108, "num_input_tokens_seen": 11589696, "step": 3685 }, { "epoch": 0.23622047244094488, "grad_norm": 20.895673751831055, "learning_rate": 1.8892446997135087e-06, "loss": 0.384, "num_input_tokens_seen": 11606848, "step": 3690 }, { "epoch": 0.23654055438192176, "grad_norm": 31.011825561523438, "learning_rate": 1.888733024905337e-06, "loss": 0.6707, "num_input_tokens_seen": 11623744, "step": 3695 }, { "epoch": 0.23686063632289867, "grad_norm": 34.26097106933594, "learning_rate": 1.888220240503549e-06, "loss": 0.4755, "num_input_tokens_seen": 11640256, "step": 3700 }, { "epoch": 0.23718071826387555, "grad_norm": 32.54058837890625, "learning_rate": 1.8877063471483618e-06, "loss": 0.412, "num_input_tokens_seen": 11655744, "step": 3705 }, { "epoch": 0.23750080020485245, "grad_norm": 17.025754928588867, "learning_rate": 1.8871913454813772e-06, "loss": 0.2935, "num_input_tokens_seen": 11671104, "step": 3710 }, { "epoch": 0.23782088214582933, "grad_norm": 29.473085403442383, "learning_rate": 1.886675236145581e-06, "loss": 0.3898, "num_input_tokens_seen": 11686848, "step": 3715 }, { "epoch": 0.2381409640868062, "grad_norm": 28.30191421508789, "learning_rate": 1.8861580197853422e-06, "loss": 0.5018, "num_input_tokens_seen": 11701952, "step": 3720 }, { "epoch": 0.23846104602778312, "grad_norm": 41.48347473144531, "learning_rate": 1.8856396970464105e-06, "loss": 0.4647, "num_input_tokens_seen": 11718592, "step": 3725 }, { "epoch": 0.23878112796876, "grad_norm": 40.44169235229492, "learning_rate": 1.8851202685759189e-06, "loss": 0.5143, "num_input_tokens_seen": 11734208, "step": 3730 }, { "epoch": 0.2391012099097369, "grad_norm": 11.559971809387207, "learning_rate": 1.8845997350223792e-06, "loss": 0.407, "num_input_tokens_seen": 11748992, "step": 3735 }, { "epoch": 0.23942129185071379, "grad_norm": 28.135868072509766, "learning_rate": 1.8840780970356842e-06, "loss": 0.4217, "num_input_tokens_seen": 11764608, "step": 3740 }, { "epoch": 0.23974137379169067, "grad_norm": 29.070838928222656, "learning_rate": 1.8835553552671048e-06, "loss": 0.4078, "num_input_tokens_seen": 11780800, "step": 3745 }, { "epoch": 0.24006145573266757, "grad_norm": 30.527294158935547, "learning_rate": 1.8830315103692902e-06, "loss": 0.4593, "num_input_tokens_seen": 11795776, "step": 3750 }, { "epoch": 0.24038153767364445, "grad_norm": 34.47731399536133, "learning_rate": 1.8825065629962669e-06, "loss": 0.5071, "num_input_tokens_seen": 11811776, "step": 3755 }, { "epoch": 0.24070161961462133, "grad_norm": 32.23590087890625, "learning_rate": 1.881980513803438e-06, "loss": 0.4852, "num_input_tokens_seen": 11828224, "step": 3760 }, { "epoch": 0.24102170155559824, "grad_norm": 48.78215026855469, "learning_rate": 1.881453363447582e-06, "loss": 0.5035, "num_input_tokens_seen": 11843904, "step": 3765 }, { "epoch": 0.24134178349657512, "grad_norm": 57.377567291259766, "learning_rate": 1.880925112586852e-06, "loss": 0.5574, "num_input_tokens_seen": 11859392, "step": 3770 }, { "epoch": 0.24166186543755203, "grad_norm": 48.24585723876953, "learning_rate": 1.8803957618807762e-06, "loss": 0.4427, "num_input_tokens_seen": 11875968, "step": 3775 }, { "epoch": 0.2419819473785289, "grad_norm": 72.58015441894531, "learning_rate": 1.8798653119902548e-06, "loss": 0.4404, "num_input_tokens_seen": 11891584, "step": 3780 }, { "epoch": 0.24230202931950579, "grad_norm": 26.939559936523438, "learning_rate": 1.8793337635775603e-06, "loss": 0.5029, "num_input_tokens_seen": 11906944, "step": 3785 }, { "epoch": 0.2426221112604827, "grad_norm": 44.384925842285156, "learning_rate": 1.8788011173063376e-06, "loss": 0.4729, "num_input_tokens_seen": 11922368, "step": 3790 }, { "epoch": 0.24294219320145957, "grad_norm": 45.79201126098633, "learning_rate": 1.8782673738416018e-06, "loss": 0.5181, "num_input_tokens_seen": 11938432, "step": 3795 }, { "epoch": 0.24326227514243645, "grad_norm": 43.953582763671875, "learning_rate": 1.877732533849737e-06, "loss": 0.5078, "num_input_tokens_seen": 11956608, "step": 3800 }, { "epoch": 0.24358235708341336, "grad_norm": 25.617721557617188, "learning_rate": 1.8771965979984988e-06, "loss": 0.4394, "num_input_tokens_seen": 11972480, "step": 3805 }, { "epoch": 0.24390243902439024, "grad_norm": 21.165599822998047, "learning_rate": 1.8766595669570084e-06, "loss": 0.3889, "num_input_tokens_seen": 11987072, "step": 3810 }, { "epoch": 0.24422252096536715, "grad_norm": 32.8095703125, "learning_rate": 1.8761214413957553e-06, "loss": 0.4361, "num_input_tokens_seen": 12002112, "step": 3815 }, { "epoch": 0.24454260290634403, "grad_norm": 23.940019607543945, "learning_rate": 1.8755822219865963e-06, "loss": 0.3493, "num_input_tokens_seen": 12016960, "step": 3820 }, { "epoch": 0.2448626848473209, "grad_norm": 68.5343246459961, "learning_rate": 1.875041909402752e-06, "loss": 0.4331, "num_input_tokens_seen": 12032576, "step": 3825 }, { "epoch": 0.2451827667882978, "grad_norm": 25.4498233795166, "learning_rate": 1.8745005043188102e-06, "loss": 0.3638, "num_input_tokens_seen": 12048768, "step": 3830 }, { "epoch": 0.2455028487292747, "grad_norm": 37.17061233520508, "learning_rate": 1.8739580074107208e-06, "loss": 0.395, "num_input_tokens_seen": 12065088, "step": 3835 }, { "epoch": 0.24582293067025157, "grad_norm": 38.826255798339844, "learning_rate": 1.873414419355798e-06, "loss": 0.6844, "num_input_tokens_seen": 12080704, "step": 3840 }, { "epoch": 0.24614301261122848, "grad_norm": 40.032527923583984, "learning_rate": 1.872869740832717e-06, "loss": 0.4292, "num_input_tokens_seen": 12096704, "step": 3845 }, { "epoch": 0.24646309455220536, "grad_norm": 36.49966049194336, "learning_rate": 1.8723239725215165e-06, "loss": 0.6103, "num_input_tokens_seen": 12111488, "step": 3850 }, { "epoch": 0.24678317649318227, "grad_norm": 22.378215789794922, "learning_rate": 1.871777115103594e-06, "loss": 0.4206, "num_input_tokens_seen": 12128192, "step": 3855 }, { "epoch": 0.24710325843415915, "grad_norm": 21.57525634765625, "learning_rate": 1.8712291692617074e-06, "loss": 0.4786, "num_input_tokens_seen": 12143808, "step": 3860 }, { "epoch": 0.24742334037513602, "grad_norm": 32.303707122802734, "learning_rate": 1.8706801356799735e-06, "loss": 0.4804, "num_input_tokens_seen": 12159232, "step": 3865 }, { "epoch": 0.24774342231611293, "grad_norm": 26.57257843017578, "learning_rate": 1.8701300150438674e-06, "loss": 0.4465, "num_input_tokens_seen": 12175360, "step": 3870 }, { "epoch": 0.2480635042570898, "grad_norm": 17.5268611907959, "learning_rate": 1.869578808040221e-06, "loss": 0.4191, "num_input_tokens_seen": 12190272, "step": 3875 }, { "epoch": 0.2483835861980667, "grad_norm": 48.708431243896484, "learning_rate": 1.869026515357223e-06, "loss": 0.5149, "num_input_tokens_seen": 12208448, "step": 3880 }, { "epoch": 0.2487036681390436, "grad_norm": 38.29990005493164, "learning_rate": 1.8684731376844169e-06, "loss": 0.6372, "num_input_tokens_seen": 12225984, "step": 3885 }, { "epoch": 0.24902375008002048, "grad_norm": 33.091251373291016, "learning_rate": 1.8679186757127014e-06, "loss": 0.4965, "num_input_tokens_seen": 12241408, "step": 3890 }, { "epoch": 0.24934383202099739, "grad_norm": 30.313892364501953, "learning_rate": 1.8673631301343288e-06, "loss": 0.4381, "num_input_tokens_seen": 12256064, "step": 3895 }, { "epoch": 0.24966391396197427, "grad_norm": 26.932268142700195, "learning_rate": 1.8668065016429044e-06, "loss": 0.4388, "num_input_tokens_seen": 12272832, "step": 3900 }, { "epoch": 0.24998399590295114, "grad_norm": 22.444902420043945, "learning_rate": 1.866248790933385e-06, "loss": 0.5257, "num_input_tokens_seen": 12289024, "step": 3905 }, { "epoch": 0.25030407784392805, "grad_norm": 27.67203140258789, "learning_rate": 1.8656899987020795e-06, "loss": 0.4226, "num_input_tokens_seen": 12304064, "step": 3910 }, { "epoch": 0.25030407784392805, "eval_loss": 0.4644124507904053, "eval_runtime": 49.2047, "eval_samples_per_second": 282.209, "eval_steps_per_second": 35.281, "num_input_tokens_seen": 12304064, "step": 3910 }, { "epoch": 0.25062415978490493, "grad_norm": 31.06105613708496, "learning_rate": 1.865130125646646e-06, "loss": 0.4605, "num_input_tokens_seen": 12320256, "step": 3915 }, { "epoch": 0.2509442417258818, "grad_norm": 21.309823989868164, "learning_rate": 1.8645691724660933e-06, "loss": 0.4394, "num_input_tokens_seen": 12335360, "step": 3920 }, { "epoch": 0.2512643236668587, "grad_norm": 24.060503005981445, "learning_rate": 1.8640071398607774e-06, "loss": 0.4616, "num_input_tokens_seen": 12351488, "step": 3925 }, { "epoch": 0.2515844056078356, "grad_norm": 58.631771087646484, "learning_rate": 1.8634440285324024e-06, "loss": 0.6203, "num_input_tokens_seen": 12365952, "step": 3930 }, { "epoch": 0.2519044875488125, "grad_norm": 54.601966857910156, "learning_rate": 1.8628798391840205e-06, "loss": 0.469, "num_input_tokens_seen": 12381376, "step": 3935 }, { "epoch": 0.2522245694897894, "grad_norm": 57.81584548950195, "learning_rate": 1.8623145725200277e-06, "loss": 0.4588, "num_input_tokens_seen": 12396160, "step": 3940 }, { "epoch": 0.25254465143076626, "grad_norm": 27.153488159179688, "learning_rate": 1.8617482292461664e-06, "loss": 0.4468, "num_input_tokens_seen": 12410944, "step": 3945 }, { "epoch": 0.25286473337174314, "grad_norm": 25.399364471435547, "learning_rate": 1.861180810069523e-06, "loss": 0.4172, "num_input_tokens_seen": 12426304, "step": 3950 }, { "epoch": 0.2531848153127201, "grad_norm": 41.58170700073242, "learning_rate": 1.8606123156985268e-06, "loss": 0.4599, "num_input_tokens_seen": 12442432, "step": 3955 }, { "epoch": 0.25350489725369696, "grad_norm": 19.8244686126709, "learning_rate": 1.8600427468429496e-06, "loss": 0.4617, "num_input_tokens_seen": 12458368, "step": 3960 }, { "epoch": 0.25382497919467384, "grad_norm": 30.747608184814453, "learning_rate": 1.8594721042139052e-06, "loss": 0.4302, "num_input_tokens_seen": 12474368, "step": 3965 }, { "epoch": 0.2541450611356507, "grad_norm": 18.357315063476562, "learning_rate": 1.858900388523847e-06, "loss": 0.4147, "num_input_tokens_seen": 12490176, "step": 3970 }, { "epoch": 0.2544651430766276, "grad_norm": 25.5488224029541, "learning_rate": 1.8583276004865694e-06, "loss": 0.4639, "num_input_tokens_seen": 12507840, "step": 3975 }, { "epoch": 0.25478522501760453, "grad_norm": 38.78436279296875, "learning_rate": 1.8577537408172046e-06, "loss": 0.3452, "num_input_tokens_seen": 12523520, "step": 3980 }, { "epoch": 0.2551053069585814, "grad_norm": 32.23760986328125, "learning_rate": 1.8571788102322234e-06, "loss": 0.5365, "num_input_tokens_seen": 12540736, "step": 3985 }, { "epoch": 0.2554253888995583, "grad_norm": 34.73612976074219, "learning_rate": 1.8566028094494332e-06, "loss": 0.4704, "num_input_tokens_seen": 12556352, "step": 3990 }, { "epoch": 0.25574547084053517, "grad_norm": 21.44598388671875, "learning_rate": 1.8560257391879778e-06, "loss": 0.3726, "num_input_tokens_seen": 12570688, "step": 3995 }, { "epoch": 0.25606555278151205, "grad_norm": 16.398038864135742, "learning_rate": 1.855447600168336e-06, "loss": 0.4038, "num_input_tokens_seen": 12585984, "step": 4000 }, { "epoch": 0.25638563472248893, "grad_norm": 19.45931053161621, "learning_rate": 1.8548683931123215e-06, "loss": 0.4665, "num_input_tokens_seen": 12601216, "step": 4005 }, { "epoch": 0.25670571666346587, "grad_norm": 65.39263916015625, "learning_rate": 1.8542881187430807e-06, "loss": 0.4408, "num_input_tokens_seen": 12618624, "step": 4010 }, { "epoch": 0.25702579860444275, "grad_norm": 24.916526794433594, "learning_rate": 1.8537067777850935e-06, "loss": 0.5792, "num_input_tokens_seen": 12635840, "step": 4015 }, { "epoch": 0.2573458805454196, "grad_norm": 21.44871711730957, "learning_rate": 1.8531243709641704e-06, "loss": 0.3554, "num_input_tokens_seen": 12651904, "step": 4020 }, { "epoch": 0.2576659624863965, "grad_norm": 37.30930709838867, "learning_rate": 1.8525408990074533e-06, "loss": 0.4923, "num_input_tokens_seen": 12666944, "step": 4025 }, { "epoch": 0.2579860444273734, "grad_norm": 14.11586856842041, "learning_rate": 1.851956362643414e-06, "loss": 0.4155, "num_input_tokens_seen": 12682688, "step": 4030 }, { "epoch": 0.2583061263683503, "grad_norm": 43.13747024536133, "learning_rate": 1.851370762601853e-06, "loss": 0.5472, "num_input_tokens_seen": 12698304, "step": 4035 }, { "epoch": 0.2586262083093272, "grad_norm": 41.56428527832031, "learning_rate": 1.8507840996138983e-06, "loss": 0.4995, "num_input_tokens_seen": 12712896, "step": 4040 }, { "epoch": 0.2589462902503041, "grad_norm": 61.59485626220703, "learning_rate": 1.8501963744120062e-06, "loss": 0.39, "num_input_tokens_seen": 12727488, "step": 4045 }, { "epoch": 0.25926637219128096, "grad_norm": 34.89384078979492, "learning_rate": 1.849607587729958e-06, "loss": 0.4037, "num_input_tokens_seen": 12742720, "step": 4050 }, { "epoch": 0.25958645413225784, "grad_norm": 26.042404174804688, "learning_rate": 1.8490177403028615e-06, "loss": 0.3918, "num_input_tokens_seen": 12757760, "step": 4055 }, { "epoch": 0.2599065360732348, "grad_norm": 39.44220733642578, "learning_rate": 1.8484268328671475e-06, "loss": 0.4879, "num_input_tokens_seen": 12773312, "step": 4060 }, { "epoch": 0.26022661801421165, "grad_norm": 41.2028923034668, "learning_rate": 1.847834866160571e-06, "loss": 0.553, "num_input_tokens_seen": 12790336, "step": 4065 }, { "epoch": 0.26054669995518853, "grad_norm": 26.452022552490234, "learning_rate": 1.847241840922209e-06, "loss": 0.4995, "num_input_tokens_seen": 12805632, "step": 4070 }, { "epoch": 0.2608667818961654, "grad_norm": 36.87411117553711, "learning_rate": 1.8466477578924616e-06, "loss": 0.4861, "num_input_tokens_seen": 12821184, "step": 4075 }, { "epoch": 0.2611868638371423, "grad_norm": 30.8194522857666, "learning_rate": 1.8460526178130472e-06, "loss": 0.5037, "num_input_tokens_seen": 12836544, "step": 4080 }, { "epoch": 0.26150694577811917, "grad_norm": 37.22843551635742, "learning_rate": 1.8454564214270056e-06, "loss": 0.4307, "num_input_tokens_seen": 12852032, "step": 4085 }, { "epoch": 0.2618270277190961, "grad_norm": 46.01398468017578, "learning_rate": 1.8448591694786955e-06, "loss": 0.446, "num_input_tokens_seen": 12867456, "step": 4090 }, { "epoch": 0.262147109660073, "grad_norm": 30.995271682739258, "learning_rate": 1.8442608627137925e-06, "loss": 0.3206, "num_input_tokens_seen": 12885184, "step": 4095 }, { "epoch": 0.26246719160104987, "grad_norm": 30.171613693237305, "learning_rate": 1.8436615018792897e-06, "loss": 0.3815, "num_input_tokens_seen": 12900416, "step": 4100 }, { "epoch": 0.26278727354202674, "grad_norm": 38.23905563354492, "learning_rate": 1.8430610877234957e-06, "loss": 0.5722, "num_input_tokens_seen": 12915648, "step": 4105 }, { "epoch": 0.2631073554830036, "grad_norm": 15.184795379638672, "learning_rate": 1.8424596209960356e-06, "loss": 0.4491, "num_input_tokens_seen": 12930368, "step": 4110 }, { "epoch": 0.26342743742398056, "grad_norm": 24.648910522460938, "learning_rate": 1.8418571024478466e-06, "loss": 0.5253, "num_input_tokens_seen": 12945472, "step": 4115 }, { "epoch": 0.26374751936495744, "grad_norm": 24.325111389160156, "learning_rate": 1.8412535328311812e-06, "loss": 0.4884, "num_input_tokens_seen": 12961472, "step": 4120 }, { "epoch": 0.2640676013059343, "grad_norm": 67.2924575805664, "learning_rate": 1.8406489128996023e-06, "loss": 0.5935, "num_input_tokens_seen": 12975872, "step": 4125 }, { "epoch": 0.2643876832469112, "grad_norm": 33.307865142822266, "learning_rate": 1.8400432434079853e-06, "loss": 0.5286, "num_input_tokens_seen": 12992128, "step": 4130 }, { "epoch": 0.2647077651878881, "grad_norm": 17.04827308654785, "learning_rate": 1.8394365251125162e-06, "loss": 0.4112, "num_input_tokens_seen": 13021184, "step": 4135 }, { "epoch": 0.265027847128865, "grad_norm": 31.74374771118164, "learning_rate": 1.8388287587706888e-06, "loss": 0.4385, "num_input_tokens_seen": 13037568, "step": 4140 }, { "epoch": 0.2653479290698419, "grad_norm": 35.290184020996094, "learning_rate": 1.8382199451413074e-06, "loss": 0.4655, "num_input_tokens_seen": 13053440, "step": 4145 }, { "epoch": 0.26566801101081877, "grad_norm": 35.621437072753906, "learning_rate": 1.837610084984483e-06, "loss": 0.5121, "num_input_tokens_seen": 13069440, "step": 4150 }, { "epoch": 0.26598809295179565, "grad_norm": 59.76009750366211, "learning_rate": 1.8369991790616327e-06, "loss": 0.5466, "num_input_tokens_seen": 13084224, "step": 4155 }, { "epoch": 0.26630817489277253, "grad_norm": 38.1486701965332, "learning_rate": 1.8363872281354795e-06, "loss": 0.6597, "num_input_tokens_seen": 13098688, "step": 4160 }, { "epoch": 0.26662825683374947, "grad_norm": 33.94224166870117, "learning_rate": 1.835774232970052e-06, "loss": 0.4049, "num_input_tokens_seen": 13114112, "step": 4165 }, { "epoch": 0.26694833877472635, "grad_norm": 29.897977828979492, "learning_rate": 1.8351601943306815e-06, "loss": 0.4672, "num_input_tokens_seen": 13130240, "step": 4170 }, { "epoch": 0.2672684207157032, "grad_norm": 41.0724983215332, "learning_rate": 1.8345451129840025e-06, "loss": 0.3994, "num_input_tokens_seen": 13145536, "step": 4175 }, { "epoch": 0.2675885026566801, "grad_norm": 37.96142578125, "learning_rate": 1.8339289896979515e-06, "loss": 0.552, "num_input_tokens_seen": 13160256, "step": 4180 }, { "epoch": 0.267908584597657, "grad_norm": 37.417449951171875, "learning_rate": 1.8333118252417651e-06, "loss": 0.5336, "num_input_tokens_seen": 13177088, "step": 4185 }, { "epoch": 0.26822866653863386, "grad_norm": 32.74960708618164, "learning_rate": 1.832693620385981e-06, "loss": 0.5098, "num_input_tokens_seen": 13192768, "step": 4190 }, { "epoch": 0.2685487484796108, "grad_norm": 27.491313934326172, "learning_rate": 1.8320743759024352e-06, "loss": 0.5183, "num_input_tokens_seen": 13208192, "step": 4195 }, { "epoch": 0.2688688304205877, "grad_norm": 38.285240173339844, "learning_rate": 1.831454092564261e-06, "loss": 0.5242, "num_input_tokens_seen": 13223872, "step": 4200 }, { "epoch": 0.26918891236156456, "grad_norm": 20.660884857177734, "learning_rate": 1.8308327711458899e-06, "loss": 0.4714, "num_input_tokens_seen": 13239104, "step": 4205 }, { "epoch": 0.26950899430254144, "grad_norm": 36.68329620361328, "learning_rate": 1.830210412423049e-06, "loss": 0.3844, "num_input_tokens_seen": 13254464, "step": 4210 }, { "epoch": 0.2698290762435183, "grad_norm": 22.882728576660156, "learning_rate": 1.8295870171727605e-06, "loss": 0.3647, "num_input_tokens_seen": 13269824, "step": 4215 }, { "epoch": 0.27014915818449525, "grad_norm": 20.831666946411133, "learning_rate": 1.8289625861733408e-06, "loss": 0.4194, "num_input_tokens_seen": 13288448, "step": 4220 }, { "epoch": 0.27046924012547213, "grad_norm": 34.60063171386719, "learning_rate": 1.8283371202043991e-06, "loss": 0.5194, "num_input_tokens_seen": 13304320, "step": 4225 }, { "epoch": 0.270789322066449, "grad_norm": 39.810787200927734, "learning_rate": 1.827710620046837e-06, "loss": 0.5503, "num_input_tokens_seen": 13321920, "step": 4230 }, { "epoch": 0.2711094040074259, "grad_norm": 52.01685333251953, "learning_rate": 1.8270830864828474e-06, "loss": 0.4687, "num_input_tokens_seen": 13337280, "step": 4235 }, { "epoch": 0.27142948594840277, "grad_norm": 15.508134841918945, "learning_rate": 1.8264545202959133e-06, "loss": 0.4287, "num_input_tokens_seen": 13354112, "step": 4240 }, { "epoch": 0.2717495678893797, "grad_norm": 32.78725814819336, "learning_rate": 1.8258249222708067e-06, "loss": 0.4321, "num_input_tokens_seen": 13369600, "step": 4245 }, { "epoch": 0.2720696498303566, "grad_norm": 23.458738327026367, "learning_rate": 1.8251942931935886e-06, "loss": 0.4464, "num_input_tokens_seen": 13385536, "step": 4250 }, { "epoch": 0.27238973177133347, "grad_norm": 31.733396530151367, "learning_rate": 1.8245626338516069e-06, "loss": 0.3788, "num_input_tokens_seen": 13400832, "step": 4255 }, { "epoch": 0.27270981371231034, "grad_norm": 35.16189956665039, "learning_rate": 1.823929945033495e-06, "loss": 0.3397, "num_input_tokens_seen": 13416000, "step": 4260 }, { "epoch": 0.2730298956532872, "grad_norm": 31.286619186401367, "learning_rate": 1.8232962275291728e-06, "loss": 0.5015, "num_input_tokens_seen": 13431360, "step": 4265 }, { "epoch": 0.2733499775942641, "grad_norm": 45.81655502319336, "learning_rate": 1.822661482129844e-06, "loss": 0.4342, "num_input_tokens_seen": 13446976, "step": 4270 }, { "epoch": 0.27367005953524104, "grad_norm": 21.677684783935547, "learning_rate": 1.8220257096279956e-06, "loss": 0.3796, "num_input_tokens_seen": 13463040, "step": 4275 }, { "epoch": 0.2739901414762179, "grad_norm": 35.41159439086914, "learning_rate": 1.8213889108173972e-06, "loss": 0.6798, "num_input_tokens_seen": 13478656, "step": 4280 }, { "epoch": 0.2743102234171948, "grad_norm": 20.70133399963379, "learning_rate": 1.8207510864930992e-06, "loss": 0.4843, "num_input_tokens_seen": 13495296, "step": 4285 }, { "epoch": 0.2746303053581717, "grad_norm": 18.472976684570312, "learning_rate": 1.8201122374514336e-06, "loss": 0.5024, "num_input_tokens_seen": 13510912, "step": 4290 }, { "epoch": 0.27495038729914856, "grad_norm": 22.679168701171875, "learning_rate": 1.8194723644900099e-06, "loss": 0.4465, "num_input_tokens_seen": 13525952, "step": 4295 }, { "epoch": 0.2752704692401255, "grad_norm": 25.11664390563965, "learning_rate": 1.8188314684077173e-06, "loss": 0.5334, "num_input_tokens_seen": 13546752, "step": 4300 }, { "epoch": 0.2755905511811024, "grad_norm": 37.698638916015625, "learning_rate": 1.8181895500047226e-06, "loss": 0.5659, "num_input_tokens_seen": 13561728, "step": 4305 }, { "epoch": 0.27591063312207925, "grad_norm": 21.342445373535156, "learning_rate": 1.817546610082468e-06, "loss": 0.4559, "num_input_tokens_seen": 13577344, "step": 4310 }, { "epoch": 0.27623071506305613, "grad_norm": 25.98567008972168, "learning_rate": 1.816902649443672e-06, "loss": 0.4806, "num_input_tokens_seen": 13592256, "step": 4315 }, { "epoch": 0.276550797004033, "grad_norm": 36.9737548828125, "learning_rate": 1.8162576688923262e-06, "loss": 0.5351, "num_input_tokens_seen": 13608832, "step": 4320 }, { "epoch": 0.27687087894500995, "grad_norm": 25.08713150024414, "learning_rate": 1.815611669233697e-06, "loss": 0.5544, "num_input_tokens_seen": 13624128, "step": 4325 }, { "epoch": 0.2771909608859868, "grad_norm": 25.511003494262695, "learning_rate": 1.8149646512743222e-06, "loss": 0.5301, "num_input_tokens_seen": 13640576, "step": 4330 }, { "epoch": 0.2775110428269637, "grad_norm": 22.00773048400879, "learning_rate": 1.8143166158220118e-06, "loss": 0.4513, "num_input_tokens_seen": 13655872, "step": 4335 }, { "epoch": 0.2778311247679406, "grad_norm": 41.66020584106445, "learning_rate": 1.8136675636858454e-06, "loss": 0.6679, "num_input_tokens_seen": 13672384, "step": 4340 }, { "epoch": 0.27815120670891746, "grad_norm": 20.195674896240234, "learning_rate": 1.8130174956761723e-06, "loss": 0.3988, "num_input_tokens_seen": 13687296, "step": 4345 }, { "epoch": 0.2784712886498944, "grad_norm": 25.734270095825195, "learning_rate": 1.81236641260461e-06, "loss": 0.5363, "num_input_tokens_seen": 13702528, "step": 4350 }, { "epoch": 0.2787913705908713, "grad_norm": 67.11882019042969, "learning_rate": 1.811714315284043e-06, "loss": 0.5002, "num_input_tokens_seen": 13717568, "step": 4355 }, { "epoch": 0.27911145253184816, "grad_norm": 19.78514862060547, "learning_rate": 1.8110612045286229e-06, "loss": 0.4016, "num_input_tokens_seen": 13733568, "step": 4360 }, { "epoch": 0.27943153447282504, "grad_norm": 20.73729705810547, "learning_rate": 1.8104070811537661e-06, "loss": 0.3744, "num_input_tokens_seen": 13749312, "step": 4365 }, { "epoch": 0.2797516164138019, "grad_norm": 16.582807540893555, "learning_rate": 1.8097519459761533e-06, "loss": 0.4299, "num_input_tokens_seen": 13765952, "step": 4370 }, { "epoch": 0.2800716983547788, "grad_norm": 47.0535888671875, "learning_rate": 1.8090957998137283e-06, "loss": 0.495, "num_input_tokens_seen": 13781440, "step": 4375 }, { "epoch": 0.28039178029575573, "grad_norm": 53.1851921081543, "learning_rate": 1.8084386434856978e-06, "loss": 0.4471, "num_input_tokens_seen": 13796864, "step": 4380 }, { "epoch": 0.2807118622367326, "grad_norm": 26.30471420288086, "learning_rate": 1.8077804778125283e-06, "loss": 0.4915, "num_input_tokens_seen": 13812736, "step": 4385 }, { "epoch": 0.2810319441777095, "grad_norm": 60.074981689453125, "learning_rate": 1.807121303615948e-06, "loss": 0.4966, "num_input_tokens_seen": 13828288, "step": 4390 }, { "epoch": 0.28135202611868637, "grad_norm": 40.989219665527344, "learning_rate": 1.8064611217189434e-06, "loss": 0.4125, "num_input_tokens_seen": 13845568, "step": 4395 }, { "epoch": 0.28167210805966325, "grad_norm": 25.27169418334961, "learning_rate": 1.8057999329457596e-06, "loss": 0.398, "num_input_tokens_seen": 13860608, "step": 4400 }, { "epoch": 0.2819921900006402, "grad_norm": 39.82872772216797, "learning_rate": 1.8051377381218984e-06, "loss": 0.5663, "num_input_tokens_seen": 13876608, "step": 4405 }, { "epoch": 0.28231227194161707, "grad_norm": 34.87173080444336, "learning_rate": 1.8044745380741177e-06, "loss": 0.5656, "num_input_tokens_seen": 13893632, "step": 4410 }, { "epoch": 0.28263235388259395, "grad_norm": 49.1501579284668, "learning_rate": 1.8038103336304306e-06, "loss": 0.3896, "num_input_tokens_seen": 13909312, "step": 4415 }, { "epoch": 0.2829524358235708, "grad_norm": 27.521867752075195, "learning_rate": 1.8031451256201042e-06, "loss": 0.5699, "num_input_tokens_seen": 13925824, "step": 4420 }, { "epoch": 0.2832725177645477, "grad_norm": 25.578853607177734, "learning_rate": 1.8024789148736589e-06, "loss": 0.5385, "num_input_tokens_seen": 13942336, "step": 4425 }, { "epoch": 0.28359259970552464, "grad_norm": 27.650800704956055, "learning_rate": 1.8018117022228655e-06, "loss": 0.392, "num_input_tokens_seen": 13957760, "step": 4430 }, { "epoch": 0.2839126816465015, "grad_norm": 49.428855895996094, "learning_rate": 1.8011434885007479e-06, "loss": 0.4997, "num_input_tokens_seen": 13972992, "step": 4435 }, { "epoch": 0.2842327635874784, "grad_norm": 30.81421661376953, "learning_rate": 1.8004742745415787e-06, "loss": 0.4308, "num_input_tokens_seen": 13988736, "step": 4440 }, { "epoch": 0.2845528455284553, "grad_norm": 23.36966323852539, "learning_rate": 1.799804061180879e-06, "loss": 0.5427, "num_input_tokens_seen": 14003520, "step": 4445 }, { "epoch": 0.28487292746943216, "grad_norm": 29.571027755737305, "learning_rate": 1.799132849255418e-06, "loss": 0.518, "num_input_tokens_seen": 14020608, "step": 4450 }, { "epoch": 0.28519300941040904, "grad_norm": 34.7742919921875, "learning_rate": 1.798460639603212e-06, "loss": 0.4011, "num_input_tokens_seen": 14035328, "step": 4455 }, { "epoch": 0.285513091351386, "grad_norm": 37.04494094848633, "learning_rate": 1.7977874330635224e-06, "loss": 0.4805, "num_input_tokens_seen": 14050816, "step": 4460 }, { "epoch": 0.28583317329236285, "grad_norm": 18.75509262084961, "learning_rate": 1.7971132304768555e-06, "loss": 0.3289, "num_input_tokens_seen": 14066880, "step": 4465 }, { "epoch": 0.28615325523333973, "grad_norm": 24.66355323791504, "learning_rate": 1.7964380326849612e-06, "loss": 0.4937, "num_input_tokens_seen": 14081728, "step": 4470 }, { "epoch": 0.2864733371743166, "grad_norm": 18.791399002075195, "learning_rate": 1.795761840530832e-06, "loss": 0.4941, "num_input_tokens_seen": 14097984, "step": 4475 }, { "epoch": 0.2867934191152935, "grad_norm": 27.4366455078125, "learning_rate": 1.7950846548587015e-06, "loss": 0.4208, "num_input_tokens_seen": 14115264, "step": 4480 }, { "epoch": 0.2871135010562704, "grad_norm": 17.53047752380371, "learning_rate": 1.7944064765140445e-06, "loss": 0.2799, "num_input_tokens_seen": 14129472, "step": 4485 }, { "epoch": 0.2874335829972473, "grad_norm": 34.00762939453125, "learning_rate": 1.7937273063435735e-06, "loss": 0.55, "num_input_tokens_seen": 14144896, "step": 4490 }, { "epoch": 0.2877536649382242, "grad_norm": 27.387237548828125, "learning_rate": 1.7930471451952416e-06, "loss": 0.3622, "num_input_tokens_seen": 14159744, "step": 4495 }, { "epoch": 0.28807374687920106, "grad_norm": 39.22768020629883, "learning_rate": 1.7923659939182377e-06, "loss": 0.4915, "num_input_tokens_seen": 14176384, "step": 4500 }, { "epoch": 0.28839382882017794, "grad_norm": 39.973106384277344, "learning_rate": 1.7916838533629866e-06, "loss": 0.5376, "num_input_tokens_seen": 14192320, "step": 4505 }, { "epoch": 0.2887139107611549, "grad_norm": 27.084346771240234, "learning_rate": 1.7910007243811493e-06, "loss": 0.397, "num_input_tokens_seen": 14208192, "step": 4510 }, { "epoch": 0.28903399270213176, "grad_norm": 51.122711181640625, "learning_rate": 1.7903166078256202e-06, "loss": 0.5486, "num_input_tokens_seen": 14223104, "step": 4515 }, { "epoch": 0.28935407464310864, "grad_norm": 49.78089141845703, "learning_rate": 1.789631504550527e-06, "loss": 0.4153, "num_input_tokens_seen": 14238464, "step": 4520 }, { "epoch": 0.2896741565840855, "grad_norm": 32.12791061401367, "learning_rate": 1.7889454154112288e-06, "loss": 0.384, "num_input_tokens_seen": 14254656, "step": 4525 }, { "epoch": 0.2899942385250624, "grad_norm": 43.227901458740234, "learning_rate": 1.7882583412643167e-06, "loss": 0.3983, "num_input_tokens_seen": 14268928, "step": 4530 }, { "epoch": 0.29031432046603933, "grad_norm": 31.457603454589844, "learning_rate": 1.78757028296761e-06, "loss": 0.4326, "num_input_tokens_seen": 14285952, "step": 4535 }, { "epoch": 0.2906344024070162, "grad_norm": 18.678508758544922, "learning_rate": 1.7868812413801582e-06, "loss": 0.3522, "num_input_tokens_seen": 14301760, "step": 4540 }, { "epoch": 0.2909544843479931, "grad_norm": 53.38247299194336, "learning_rate": 1.7861912173622372e-06, "loss": 0.4976, "num_input_tokens_seen": 14318208, "step": 4545 }, { "epoch": 0.29127456628896997, "grad_norm": 41.86543655395508, "learning_rate": 1.7855002117753504e-06, "loss": 0.4597, "num_input_tokens_seen": 14334144, "step": 4550 }, { "epoch": 0.29159464822994685, "grad_norm": 49.806610107421875, "learning_rate": 1.7848082254822266e-06, "loss": 0.5283, "num_input_tokens_seen": 14349120, "step": 4555 }, { "epoch": 0.29191473017092373, "grad_norm": 56.75021743774414, "learning_rate": 1.7841152593468185e-06, "loss": 0.4868, "num_input_tokens_seen": 14365376, "step": 4560 }, { "epoch": 0.29223481211190067, "grad_norm": 34.16107940673828, "learning_rate": 1.7834213142343026e-06, "loss": 0.4582, "num_input_tokens_seen": 14381568, "step": 4565 }, { "epoch": 0.29255489405287755, "grad_norm": 28.742692947387695, "learning_rate": 1.7827263910110777e-06, "loss": 0.4626, "num_input_tokens_seen": 14397312, "step": 4570 }, { "epoch": 0.2928749759938544, "grad_norm": 34.53966522216797, "learning_rate": 1.7820304905447632e-06, "loss": 0.4372, "num_input_tokens_seen": 14412928, "step": 4575 }, { "epoch": 0.2931950579348313, "grad_norm": 47.14699935913086, "learning_rate": 1.7813336137041991e-06, "loss": 0.446, "num_input_tokens_seen": 14427968, "step": 4580 }, { "epoch": 0.2935151398758082, "grad_norm": 37.16606140136719, "learning_rate": 1.7806357613594447e-06, "loss": 0.3693, "num_input_tokens_seen": 14442944, "step": 4585 }, { "epoch": 0.2938352218167851, "grad_norm": 19.43882179260254, "learning_rate": 1.7799369343817764e-06, "loss": 0.4481, "num_input_tokens_seen": 14458176, "step": 4590 }, { "epoch": 0.294155303757762, "grad_norm": 24.445056915283203, "learning_rate": 1.7792371336436883e-06, "loss": 0.3566, "num_input_tokens_seen": 14473600, "step": 4595 }, { "epoch": 0.2944753856987389, "grad_norm": 28.31954574584961, "learning_rate": 1.7785363600188892e-06, "loss": 0.6518, "num_input_tokens_seen": 14488896, "step": 4600 }, { "epoch": 0.29479546763971576, "grad_norm": 38.648948669433594, "learning_rate": 1.7778346143823038e-06, "loss": 0.5881, "num_input_tokens_seen": 14502784, "step": 4605 }, { "epoch": 0.29511554958069264, "grad_norm": 33.51401138305664, "learning_rate": 1.7771318976100696e-06, "loss": 0.4293, "num_input_tokens_seen": 14520000, "step": 4610 }, { "epoch": 0.2954356315216696, "grad_norm": 28.780546188354492, "learning_rate": 1.7764282105795364e-06, "loss": 0.3401, "num_input_tokens_seen": 14536320, "step": 4615 }, { "epoch": 0.29575571346264645, "grad_norm": 47.155277252197266, "learning_rate": 1.7757235541692663e-06, "loss": 0.4524, "num_input_tokens_seen": 14551808, "step": 4620 }, { "epoch": 0.29607579540362333, "grad_norm": 19.841266632080078, "learning_rate": 1.7750179292590306e-06, "loss": 0.3157, "num_input_tokens_seen": 14566976, "step": 4625 }, { "epoch": 0.2963958773446002, "grad_norm": 26.28995132446289, "learning_rate": 1.7743113367298107e-06, "loss": 0.3475, "num_input_tokens_seen": 14583104, "step": 4630 }, { "epoch": 0.2967159592855771, "grad_norm": 38.58869552612305, "learning_rate": 1.7736037774637955e-06, "loss": 0.4454, "num_input_tokens_seen": 14598336, "step": 4635 }, { "epoch": 0.29703604122655397, "grad_norm": 50.025482177734375, "learning_rate": 1.772895252344381e-06, "loss": 0.5142, "num_input_tokens_seen": 14615232, "step": 4640 }, { "epoch": 0.2973561231675309, "grad_norm": 19.640771865844727, "learning_rate": 1.7721857622561692e-06, "loss": 0.3932, "num_input_tokens_seen": 14630848, "step": 4645 }, { "epoch": 0.2976762051085078, "grad_norm": 31.551252365112305, "learning_rate": 1.7714753080849664e-06, "loss": 0.4601, "num_input_tokens_seen": 14647040, "step": 4650 }, { "epoch": 0.29799628704948466, "grad_norm": 22.483062744140625, "learning_rate": 1.7707638907177837e-06, "loss": 0.4116, "num_input_tokens_seen": 14661888, "step": 4655 }, { "epoch": 0.29831636899046154, "grad_norm": 143.85166931152344, "learning_rate": 1.7700515110428336e-06, "loss": 0.7093, "num_input_tokens_seen": 14677696, "step": 4660 }, { "epoch": 0.2986364509314384, "grad_norm": 26.837242126464844, "learning_rate": 1.7693381699495307e-06, "loss": 0.4799, "num_input_tokens_seen": 14693184, "step": 4665 }, { "epoch": 0.29895653287241536, "grad_norm": 30.247093200683594, "learning_rate": 1.7686238683284894e-06, "loss": 0.3643, "num_input_tokens_seen": 14707904, "step": 4670 }, { "epoch": 0.29927661481339224, "grad_norm": 24.62070083618164, "learning_rate": 1.7679086070715237e-06, "loss": 0.3608, "num_input_tokens_seen": 14724096, "step": 4675 }, { "epoch": 0.2995966967543691, "grad_norm": 36.82127380371094, "learning_rate": 1.7671923870716459e-06, "loss": 0.4544, "num_input_tokens_seen": 14738752, "step": 4680 }, { "epoch": 0.299916778695346, "grad_norm": 41.65424346923828, "learning_rate": 1.7664752092230652e-06, "loss": 0.3486, "num_input_tokens_seen": 14753664, "step": 4685 }, { "epoch": 0.3002368606363229, "grad_norm": 34.0866813659668, "learning_rate": 1.7657570744211863e-06, "loss": 0.3784, "num_input_tokens_seen": 14769152, "step": 4690 }, { "epoch": 0.30036489341271366, "eval_loss": 0.4629112482070923, "eval_runtime": 49.1915, "eval_samples_per_second": 282.284, "eval_steps_per_second": 35.291, "num_input_tokens_seen": 14775488, "step": 4692 }, { "epoch": 0.3005569425772998, "grad_norm": 48.05270004272461, "learning_rate": 1.765037983562609e-06, "loss": 0.5028, "num_input_tokens_seen": 14784128, "step": 4695 }, { "epoch": 0.3008770245182767, "grad_norm": 49.29054641723633, "learning_rate": 1.7643179375451264e-06, "loss": 0.4459, "num_input_tokens_seen": 14799936, "step": 4700 }, { "epoch": 0.30119710645925357, "grad_norm": 42.15516662597656, "learning_rate": 1.7635969372677252e-06, "loss": 0.6083, "num_input_tokens_seen": 14814208, "step": 4705 }, { "epoch": 0.30151718840023045, "grad_norm": 37.26246643066406, "learning_rate": 1.7628749836305818e-06, "loss": 0.483, "num_input_tokens_seen": 14829504, "step": 4710 }, { "epoch": 0.30183727034120733, "grad_norm": 30.036657333374023, "learning_rate": 1.7621520775350645e-06, "loss": 0.3949, "num_input_tokens_seen": 14843968, "step": 4715 }, { "epoch": 0.30215735228218427, "grad_norm": 33.79453659057617, "learning_rate": 1.7614282198837293e-06, "loss": 0.4567, "num_input_tokens_seen": 14859840, "step": 4720 }, { "epoch": 0.30247743422316115, "grad_norm": 39.85743713378906, "learning_rate": 1.7607034115803219e-06, "loss": 0.473, "num_input_tokens_seen": 14875648, "step": 4725 }, { "epoch": 0.302797516164138, "grad_norm": 27.397972106933594, "learning_rate": 1.7599776535297734e-06, "loss": 0.4192, "num_input_tokens_seen": 14890560, "step": 4730 }, { "epoch": 0.3031175981051149, "grad_norm": 40.91767501831055, "learning_rate": 1.7592509466382012e-06, "loss": 0.4702, "num_input_tokens_seen": 14906688, "step": 4735 }, { "epoch": 0.3034376800460918, "grad_norm": 54.96405029296875, "learning_rate": 1.7585232918129076e-06, "loss": 0.5561, "num_input_tokens_seen": 14922496, "step": 4740 }, { "epoch": 0.30375776198706866, "grad_norm": 36.16265869140625, "learning_rate": 1.757794689962378e-06, "loss": 0.4601, "num_input_tokens_seen": 14938880, "step": 4745 }, { "epoch": 0.3040778439280456, "grad_norm": 44.08560562133789, "learning_rate": 1.7570651419962807e-06, "loss": 0.4968, "num_input_tokens_seen": 14954112, "step": 4750 }, { "epoch": 0.3043979258690225, "grad_norm": 42.19171142578125, "learning_rate": 1.7563346488254647e-06, "loss": 0.448, "num_input_tokens_seen": 14969536, "step": 4755 }, { "epoch": 0.30471800780999936, "grad_norm": 35.03725051879883, "learning_rate": 1.755603211361959e-06, "loss": 0.3373, "num_input_tokens_seen": 14985728, "step": 4760 }, { "epoch": 0.30503808975097624, "grad_norm": 20.99566078186035, "learning_rate": 1.7548708305189722e-06, "loss": 0.452, "num_input_tokens_seen": 15003904, "step": 4765 }, { "epoch": 0.3053581716919531, "grad_norm": 59.016563415527344, "learning_rate": 1.7541375072108905e-06, "loss": 0.5662, "num_input_tokens_seen": 15019328, "step": 4770 }, { "epoch": 0.30567825363293005, "grad_norm": 45.97145462036133, "learning_rate": 1.7534032423532766e-06, "loss": 0.4597, "num_input_tokens_seen": 15033856, "step": 4775 }, { "epoch": 0.30599833557390693, "grad_norm": 22.04340362548828, "learning_rate": 1.7526680368628685e-06, "loss": 0.3603, "num_input_tokens_seen": 15051200, "step": 4780 }, { "epoch": 0.3063184175148838, "grad_norm": 32.850303649902344, "learning_rate": 1.751931891657579e-06, "loss": 0.4471, "num_input_tokens_seen": 15066368, "step": 4785 }, { "epoch": 0.3066384994558607, "grad_norm": 21.559911727905273, "learning_rate": 1.7511948076564943e-06, "loss": 0.3494, "num_input_tokens_seen": 15081600, "step": 4790 }, { "epoch": 0.30695858139683757, "grad_norm": 30.383432388305664, "learning_rate": 1.7504567857798722e-06, "loss": 0.5308, "num_input_tokens_seen": 15097536, "step": 4795 }, { "epoch": 0.3072786633378145, "grad_norm": 37.53936767578125, "learning_rate": 1.7497178269491417e-06, "loss": 0.5013, "num_input_tokens_seen": 15113728, "step": 4800 }, { "epoch": 0.3075987452787914, "grad_norm": 24.428794860839844, "learning_rate": 1.7489779320869014e-06, "loss": 0.5561, "num_input_tokens_seen": 15130048, "step": 4805 }, { "epoch": 0.30791882721976827, "grad_norm": 22.411056518554688, "learning_rate": 1.7482371021169193e-06, "loss": 0.3673, "num_input_tokens_seen": 15145600, "step": 4810 }, { "epoch": 0.30823890916074514, "grad_norm": 44.107322692871094, "learning_rate": 1.7474953379641297e-06, "loss": 0.3935, "num_input_tokens_seen": 15162368, "step": 4815 }, { "epoch": 0.308558991101722, "grad_norm": 34.96397018432617, "learning_rate": 1.746752640554634e-06, "loss": 0.4323, "num_input_tokens_seen": 15178368, "step": 4820 }, { "epoch": 0.3088790730426989, "grad_norm": 26.387361526489258, "learning_rate": 1.7460090108156988e-06, "loss": 0.5467, "num_input_tokens_seen": 15193408, "step": 4825 }, { "epoch": 0.30919915498367584, "grad_norm": 22.992677688598633, "learning_rate": 1.7452644496757548e-06, "loss": 0.3081, "num_input_tokens_seen": 15208640, "step": 4830 }, { "epoch": 0.3095192369246527, "grad_norm": 44.50247573852539, "learning_rate": 1.7445189580643946e-06, "loss": 0.4533, "num_input_tokens_seen": 15224192, "step": 4835 }, { "epoch": 0.3098393188656296, "grad_norm": 28.59990692138672, "learning_rate": 1.7437725369123737e-06, "loss": 0.5119, "num_input_tokens_seen": 15239616, "step": 4840 }, { "epoch": 0.3101594008066065, "grad_norm": 31.960166931152344, "learning_rate": 1.7430251871516077e-06, "loss": 0.4595, "num_input_tokens_seen": 15255680, "step": 4845 }, { "epoch": 0.31047948274758336, "grad_norm": 25.40645980834961, "learning_rate": 1.7422769097151715e-06, "loss": 0.4886, "num_input_tokens_seen": 15271232, "step": 4850 }, { "epoch": 0.3107995646885603, "grad_norm": 65.88490295410156, "learning_rate": 1.7415277055372982e-06, "loss": 0.4938, "num_input_tokens_seen": 15287040, "step": 4855 }, { "epoch": 0.31111964662953717, "grad_norm": 25.532987594604492, "learning_rate": 1.7407775755533778e-06, "loss": 0.5025, "num_input_tokens_seen": 15304256, "step": 4860 }, { "epoch": 0.31143972857051405, "grad_norm": 18.785158157348633, "learning_rate": 1.7400265206999568e-06, "loss": 0.3567, "num_input_tokens_seen": 15322112, "step": 4865 }, { "epoch": 0.31175981051149093, "grad_norm": 69.29310607910156, "learning_rate": 1.7392745419147362e-06, "loss": 0.5436, "num_input_tokens_seen": 15337216, "step": 4870 }, { "epoch": 0.3120798924524678, "grad_norm": 38.31575393676758, "learning_rate": 1.7385216401365693e-06, "loss": 0.4521, "num_input_tokens_seen": 15354048, "step": 4875 }, { "epoch": 0.31239997439344475, "grad_norm": 28.862852096557617, "learning_rate": 1.7377678163054638e-06, "loss": 0.4933, "num_input_tokens_seen": 15369344, "step": 4880 }, { "epoch": 0.3127200563344216, "grad_norm": 51.59070587158203, "learning_rate": 1.7370130713625775e-06, "loss": 0.4949, "num_input_tokens_seen": 15385920, "step": 4885 }, { "epoch": 0.3130401382753985, "grad_norm": 20.555160522460938, "learning_rate": 1.736257406250218e-06, "loss": 0.3867, "num_input_tokens_seen": 15401536, "step": 4890 }, { "epoch": 0.3133602202163754, "grad_norm": 28.439088821411133, "learning_rate": 1.735500821911842e-06, "loss": 0.4501, "num_input_tokens_seen": 15417152, "step": 4895 }, { "epoch": 0.31368030215735226, "grad_norm": 30.494640350341797, "learning_rate": 1.7347433192920544e-06, "loss": 0.4949, "num_input_tokens_seen": 15431872, "step": 4900 }, { "epoch": 0.3140003840983292, "grad_norm": 19.200109481811523, "learning_rate": 1.7339848993366056e-06, "loss": 0.4021, "num_input_tokens_seen": 15447552, "step": 4905 }, { "epoch": 0.3143204660393061, "grad_norm": 32.95127868652344, "learning_rate": 1.7332255629923922e-06, "loss": 0.4615, "num_input_tokens_seen": 15464384, "step": 4910 }, { "epoch": 0.31464054798028296, "grad_norm": 23.275110244750977, "learning_rate": 1.732465311207454e-06, "loss": 0.4968, "num_input_tokens_seen": 15479808, "step": 4915 }, { "epoch": 0.31496062992125984, "grad_norm": 47.221412658691406, "learning_rate": 1.731704144930975e-06, "loss": 0.4973, "num_input_tokens_seen": 15496512, "step": 4920 }, { "epoch": 0.3152807118622367, "grad_norm": 39.70328903198242, "learning_rate": 1.7309420651132797e-06, "loss": 0.4094, "num_input_tokens_seen": 15512896, "step": 4925 }, { "epoch": 0.3156007938032136, "grad_norm": 32.56901931762695, "learning_rate": 1.7301790727058343e-06, "loss": 0.3234, "num_input_tokens_seen": 15528064, "step": 4930 }, { "epoch": 0.31592087574419053, "grad_norm": 31.572166442871094, "learning_rate": 1.7294151686612431e-06, "loss": 0.3618, "num_input_tokens_seen": 15543424, "step": 4935 }, { "epoch": 0.3162409576851674, "grad_norm": 42.15610122680664, "learning_rate": 1.7286503539332495e-06, "loss": 0.5609, "num_input_tokens_seen": 15560192, "step": 4940 }, { "epoch": 0.3165610396261443, "grad_norm": 43.20957946777344, "learning_rate": 1.7278846294767337e-06, "loss": 0.3968, "num_input_tokens_seen": 15576128, "step": 4945 }, { "epoch": 0.31688112156712117, "grad_norm": 80.63443756103516, "learning_rate": 1.7271179962477118e-06, "loss": 0.7032, "num_input_tokens_seen": 15592576, "step": 4950 }, { "epoch": 0.31720120350809805, "grad_norm": 50.15550994873047, "learning_rate": 1.7263504552033341e-06, "loss": 0.4261, "num_input_tokens_seen": 15607744, "step": 4955 }, { "epoch": 0.317521285449075, "grad_norm": 22.618947982788086, "learning_rate": 1.725582007301885e-06, "loss": 0.4846, "num_input_tokens_seen": 15623360, "step": 4960 }, { "epoch": 0.31784136739005187, "grad_norm": 33.10743713378906, "learning_rate": 1.7248126535027806e-06, "loss": 0.4213, "num_input_tokens_seen": 15638656, "step": 4965 }, { "epoch": 0.31816144933102875, "grad_norm": 41.587379455566406, "learning_rate": 1.7240423947665678e-06, "loss": 0.4632, "num_input_tokens_seen": 15654400, "step": 4970 }, { "epoch": 0.3184815312720056, "grad_norm": 27.983142852783203, "learning_rate": 1.723271232054924e-06, "loss": 0.3822, "num_input_tokens_seen": 15670016, "step": 4975 }, { "epoch": 0.3188016132129825, "grad_norm": 54.66548538208008, "learning_rate": 1.722499166330655e-06, "loss": 0.4977, "num_input_tokens_seen": 15686208, "step": 4980 }, { "epoch": 0.31912169515395944, "grad_norm": 20.663721084594727, "learning_rate": 1.7217261985576936e-06, "loss": 0.44, "num_input_tokens_seen": 15702592, "step": 4985 }, { "epoch": 0.3194417770949363, "grad_norm": 73.22879791259766, "learning_rate": 1.7209523297010992e-06, "loss": 0.5176, "num_input_tokens_seen": 15717696, "step": 4990 }, { "epoch": 0.3197618590359132, "grad_norm": 36.40870666503906, "learning_rate": 1.7201775607270564e-06, "loss": 0.4644, "num_input_tokens_seen": 15733184, "step": 4995 }, { "epoch": 0.3200819409768901, "grad_norm": 30.597986221313477, "learning_rate": 1.7194018926028733e-06, "loss": 0.5267, "num_input_tokens_seen": 15749888, "step": 5000 }, { "epoch": 0.32040202291786696, "grad_norm": 35.71719741821289, "learning_rate": 1.7186253262969803e-06, "loss": 0.3621, "num_input_tokens_seen": 15768384, "step": 5005 }, { "epoch": 0.32072210485884384, "grad_norm": 24.331857681274414, "learning_rate": 1.7178478627789299e-06, "loss": 0.3269, "num_input_tokens_seen": 15784448, "step": 5010 }, { "epoch": 0.3210421867998208, "grad_norm": 25.280595779418945, "learning_rate": 1.7170695030193944e-06, "loss": 0.4088, "num_input_tokens_seen": 15800512, "step": 5015 }, { "epoch": 0.32136226874079765, "grad_norm": 29.914012908935547, "learning_rate": 1.716290247990165e-06, "loss": 0.4744, "num_input_tokens_seen": 15815680, "step": 5020 }, { "epoch": 0.32168235068177453, "grad_norm": 33.56769561767578, "learning_rate": 1.715510098664151e-06, "loss": 0.3939, "num_input_tokens_seen": 15830528, "step": 5025 }, { "epoch": 0.3220024326227514, "grad_norm": 29.231985092163086, "learning_rate": 1.7147290560153777e-06, "loss": 0.4933, "num_input_tokens_seen": 15845568, "step": 5030 }, { "epoch": 0.3223225145637283, "grad_norm": 39.174617767333984, "learning_rate": 1.7139471210189862e-06, "loss": 0.4531, "num_input_tokens_seen": 15861632, "step": 5035 }, { "epoch": 0.3226425965047052, "grad_norm": 31.1746826171875, "learning_rate": 1.7131642946512312e-06, "loss": 0.5187, "num_input_tokens_seen": 15877632, "step": 5040 }, { "epoch": 0.3229626784456821, "grad_norm": 19.761302947998047, "learning_rate": 1.712380577889481e-06, "loss": 0.37, "num_input_tokens_seen": 15893184, "step": 5045 }, { "epoch": 0.323282760386659, "grad_norm": 34.54355239868164, "learning_rate": 1.711595971712215e-06, "loss": 0.3955, "num_input_tokens_seen": 15908416, "step": 5050 }, { "epoch": 0.32360284232763586, "grad_norm": 25.96015739440918, "learning_rate": 1.7108104770990234e-06, "loss": 0.4074, "num_input_tokens_seen": 15924224, "step": 5055 }, { "epoch": 0.32392292426861274, "grad_norm": 22.604724884033203, "learning_rate": 1.7100240950306052e-06, "loss": 0.2532, "num_input_tokens_seen": 15940032, "step": 5060 }, { "epoch": 0.3242430062095897, "grad_norm": 38.15263366699219, "learning_rate": 1.7092368264887677e-06, "loss": 0.4556, "num_input_tokens_seen": 15954944, "step": 5065 }, { "epoch": 0.32456308815056656, "grad_norm": 57.1259765625, "learning_rate": 1.7084486724564252e-06, "loss": 0.4923, "num_input_tokens_seen": 15970624, "step": 5070 }, { "epoch": 0.32488317009154344, "grad_norm": 33.16521072387695, "learning_rate": 1.707659633917597e-06, "loss": 0.418, "num_input_tokens_seen": 15986688, "step": 5075 }, { "epoch": 0.3252032520325203, "grad_norm": 35.50617980957031, "learning_rate": 1.7068697118574064e-06, "loss": 0.4172, "num_input_tokens_seen": 16002752, "step": 5080 }, { "epoch": 0.3255233339734972, "grad_norm": 23.2056884765625, "learning_rate": 1.7060789072620816e-06, "loss": 0.4924, "num_input_tokens_seen": 16018112, "step": 5085 }, { "epoch": 0.32584341591447413, "grad_norm": 23.894432067871094, "learning_rate": 1.7052872211189509e-06, "loss": 0.411, "num_input_tokens_seen": 16033984, "step": 5090 }, { "epoch": 0.326163497855451, "grad_norm": 21.645387649536133, "learning_rate": 1.7044946544164431e-06, "loss": 0.3263, "num_input_tokens_seen": 16049536, "step": 5095 }, { "epoch": 0.3264835797964279, "grad_norm": 32.932411193847656, "learning_rate": 1.703701208144088e-06, "loss": 0.3722, "num_input_tokens_seen": 16066304, "step": 5100 }, { "epoch": 0.32680366173740477, "grad_norm": 42.86146926879883, "learning_rate": 1.702906883292512e-06, "loss": 0.4627, "num_input_tokens_seen": 16081536, "step": 5105 }, { "epoch": 0.32712374367838165, "grad_norm": 25.875411987304688, "learning_rate": 1.7021116808534393e-06, "loss": 0.5501, "num_input_tokens_seen": 16096896, "step": 5110 }, { "epoch": 0.32744382561935853, "grad_norm": 47.58795166015625, "learning_rate": 1.7013156018196893e-06, "loss": 0.4294, "num_input_tokens_seen": 16112960, "step": 5115 }, { "epoch": 0.32776390756033547, "grad_norm": 34.665802001953125, "learning_rate": 1.7005186471851759e-06, "loss": 0.4168, "num_input_tokens_seen": 16129344, "step": 5120 }, { "epoch": 0.32808398950131235, "grad_norm": 23.344072341918945, "learning_rate": 1.6997208179449066e-06, "loss": 0.5931, "num_input_tokens_seen": 16147776, "step": 5125 }, { "epoch": 0.3284040714422892, "grad_norm": 43.283119201660156, "learning_rate": 1.6989221150949806e-06, "loss": 0.3523, "num_input_tokens_seen": 16162880, "step": 5130 }, { "epoch": 0.3287241533832661, "grad_norm": 17.569599151611328, "learning_rate": 1.6981225396325873e-06, "loss": 0.2737, "num_input_tokens_seen": 16179392, "step": 5135 }, { "epoch": 0.329044235324243, "grad_norm": 38.69865036010742, "learning_rate": 1.6973220925560067e-06, "loss": 0.5036, "num_input_tokens_seen": 16194560, "step": 5140 }, { "epoch": 0.3293643172652199, "grad_norm": 55.1820182800293, "learning_rate": 1.696520774864606e-06, "loss": 0.4281, "num_input_tokens_seen": 16210112, "step": 5145 }, { "epoch": 0.3296843992061968, "grad_norm": 68.6947250366211, "learning_rate": 1.69571858755884e-06, "loss": 0.4646, "num_input_tokens_seen": 16225856, "step": 5150 }, { "epoch": 0.3300044811471737, "grad_norm": 25.549705505371094, "learning_rate": 1.6949155316402487e-06, "loss": 0.4177, "num_input_tokens_seen": 16241536, "step": 5155 }, { "epoch": 0.33032456308815056, "grad_norm": 31.668855667114258, "learning_rate": 1.6941116081114566e-06, "loss": 0.3777, "num_input_tokens_seen": 16256384, "step": 5160 }, { "epoch": 0.33064464502912744, "grad_norm": 34.3087158203125, "learning_rate": 1.6933068179761722e-06, "loss": 0.3937, "num_input_tokens_seen": 16271360, "step": 5165 }, { "epoch": 0.3309647269701044, "grad_norm": 26.086729049682617, "learning_rate": 1.6925011622391857e-06, "loss": 0.4118, "num_input_tokens_seen": 16286656, "step": 5170 }, { "epoch": 0.33128480891108125, "grad_norm": 18.95518684387207, "learning_rate": 1.6916946419063667e-06, "loss": 0.4038, "num_input_tokens_seen": 16302592, "step": 5175 }, { "epoch": 0.33160489085205813, "grad_norm": 25.953067779541016, "learning_rate": 1.690887257984666e-06, "loss": 0.5252, "num_input_tokens_seen": 16318656, "step": 5180 }, { "epoch": 0.331924972793035, "grad_norm": 26.030420303344727, "learning_rate": 1.690079011482112e-06, "loss": 0.4784, "num_input_tokens_seen": 16334016, "step": 5185 }, { "epoch": 0.3322450547340119, "grad_norm": 44.0208625793457, "learning_rate": 1.6892699034078096e-06, "loss": 0.5322, "num_input_tokens_seen": 16349888, "step": 5190 }, { "epoch": 0.33256513667498877, "grad_norm": 40.064537048339844, "learning_rate": 1.68845993477194e-06, "loss": 0.5017, "num_input_tokens_seen": 16365056, "step": 5195 }, { "epoch": 0.3328852186159657, "grad_norm": 27.49654197692871, "learning_rate": 1.6876491065857584e-06, "loss": 0.3857, "num_input_tokens_seen": 16380032, "step": 5200 }, { "epoch": 0.3332053005569426, "grad_norm": 31.578556060791016, "learning_rate": 1.6868374198615928e-06, "loss": 0.6437, "num_input_tokens_seen": 16394752, "step": 5205 }, { "epoch": 0.33352538249791946, "grad_norm": 19.591115951538086, "learning_rate": 1.6860248756128448e-06, "loss": 0.4782, "num_input_tokens_seen": 16410368, "step": 5210 }, { "epoch": 0.33384546443889634, "grad_norm": 22.99208641052246, "learning_rate": 1.6852114748539844e-06, "loss": 0.3992, "num_input_tokens_seen": 16425088, "step": 5215 }, { "epoch": 0.3341655463798732, "grad_norm": 22.972055435180664, "learning_rate": 1.6843972186005525e-06, "loss": 0.3352, "num_input_tokens_seen": 16441152, "step": 5220 }, { "epoch": 0.33448562832085016, "grad_norm": 34.798065185546875, "learning_rate": 1.6835821078691577e-06, "loss": 0.4641, "num_input_tokens_seen": 16458240, "step": 5225 }, { "epoch": 0.33480571026182704, "grad_norm": 35.769901275634766, "learning_rate": 1.6827661436774746e-06, "loss": 0.4142, "num_input_tokens_seen": 16474112, "step": 5230 }, { "epoch": 0.3351257922028039, "grad_norm": 43.8751335144043, "learning_rate": 1.681949327044245e-06, "loss": 0.3955, "num_input_tokens_seen": 16490560, "step": 5235 }, { "epoch": 0.3354458741437808, "grad_norm": 67.51107025146484, "learning_rate": 1.6811316589892734e-06, "loss": 0.6757, "num_input_tokens_seen": 16505728, "step": 5240 }, { "epoch": 0.3357659560847577, "grad_norm": 21.818950653076172, "learning_rate": 1.6803131405334284e-06, "loss": 0.4257, "num_input_tokens_seen": 16521856, "step": 5245 }, { "epoch": 0.3360860380257346, "grad_norm": 30.710657119750977, "learning_rate": 1.6794937726986396e-06, "loss": 0.4271, "num_input_tokens_seen": 16537792, "step": 5250 }, { "epoch": 0.3364061199667115, "grad_norm": 42.02250671386719, "learning_rate": 1.6786735565078974e-06, "loss": 0.434, "num_input_tokens_seen": 16553408, "step": 5255 }, { "epoch": 0.33672620190768837, "grad_norm": 28.501094818115234, "learning_rate": 1.677852492985251e-06, "loss": 0.4297, "num_input_tokens_seen": 16570112, "step": 5260 }, { "epoch": 0.33704628384866525, "grad_norm": 56.61883544921875, "learning_rate": 1.6770305831558086e-06, "loss": 0.4931, "num_input_tokens_seen": 16586304, "step": 5265 }, { "epoch": 0.33736636578964213, "grad_norm": 15.158733367919922, "learning_rate": 1.6762078280457342e-06, "loss": 0.3922, "num_input_tokens_seen": 16601920, "step": 5270 }, { "epoch": 0.33768644773061907, "grad_norm": 27.923097610473633, "learning_rate": 1.6753842286822465e-06, "loss": 0.4797, "num_input_tokens_seen": 16618240, "step": 5275 }, { "epoch": 0.33800652967159595, "grad_norm": 36.38385009765625, "learning_rate": 1.6745597860936199e-06, "loss": 0.59, "num_input_tokens_seen": 16633408, "step": 5280 }, { "epoch": 0.3383266116125728, "grad_norm": 38.012123107910156, "learning_rate": 1.6737345013091794e-06, "loss": 0.439, "num_input_tokens_seen": 16649664, "step": 5285 }, { "epoch": 0.3386466935535497, "grad_norm": 39.11860656738281, "learning_rate": 1.672908375359304e-06, "loss": 0.4602, "num_input_tokens_seen": 16664896, "step": 5290 }, { "epoch": 0.3389667754945266, "grad_norm": 56.845096588134766, "learning_rate": 1.6720814092754209e-06, "loss": 0.5433, "num_input_tokens_seen": 16680384, "step": 5295 }, { "epoch": 0.33928685743550346, "grad_norm": 20.308507919311523, "learning_rate": 1.6712536040900075e-06, "loss": 0.3696, "num_input_tokens_seen": 16696192, "step": 5300 }, { "epoch": 0.3396069393764804, "grad_norm": 26.112041473388672, "learning_rate": 1.6704249608365878e-06, "loss": 0.4752, "num_input_tokens_seen": 16727104, "step": 5305 }, { "epoch": 0.3399270213174573, "grad_norm": 27.13048553466797, "learning_rate": 1.669595480549733e-06, "loss": 0.4154, "num_input_tokens_seen": 16741696, "step": 5310 }, { "epoch": 0.34024710325843416, "grad_norm": 40.439273834228516, "learning_rate": 1.6687651642650587e-06, "loss": 0.432, "num_input_tokens_seen": 16757120, "step": 5315 }, { "epoch": 0.34056718519941104, "grad_norm": 27.309789657592773, "learning_rate": 1.6679340130192245e-06, "loss": 0.4471, "num_input_tokens_seen": 16772416, "step": 5320 }, { "epoch": 0.3408872671403879, "grad_norm": 24.121200561523438, "learning_rate": 1.667102027849933e-06, "loss": 0.3172, "num_input_tokens_seen": 16788352, "step": 5325 }, { "epoch": 0.34120734908136485, "grad_norm": 36.701873779296875, "learning_rate": 1.6662692097959266e-06, "loss": 0.3456, "num_input_tokens_seen": 16803648, "step": 5330 }, { "epoch": 0.34152743102234173, "grad_norm": 52.13604736328125, "learning_rate": 1.6654355598969894e-06, "loss": 0.4708, "num_input_tokens_seen": 16818944, "step": 5335 }, { "epoch": 0.3418475129633186, "grad_norm": 31.60714340209961, "learning_rate": 1.6646010791939423e-06, "loss": 0.5078, "num_input_tokens_seen": 16833984, "step": 5340 }, { "epoch": 0.3421675949042955, "grad_norm": 30.880844116210938, "learning_rate": 1.6637657687286446e-06, "loss": 0.5507, "num_input_tokens_seen": 16849280, "step": 5345 }, { "epoch": 0.34248767684527237, "grad_norm": 29.642696380615234, "learning_rate": 1.6629296295439912e-06, "loss": 0.3979, "num_input_tokens_seen": 16865664, "step": 5350 }, { "epoch": 0.3428077587862493, "grad_norm": 46.237457275390625, "learning_rate": 1.6620926626839116e-06, "loss": 0.4884, "num_input_tokens_seen": 16881536, "step": 5355 }, { "epoch": 0.3431278407272262, "grad_norm": 26.425844192504883, "learning_rate": 1.661254869193369e-06, "loss": 0.4395, "num_input_tokens_seen": 16898816, "step": 5360 }, { "epoch": 0.34344792266820307, "grad_norm": 44.35171127319336, "learning_rate": 1.6604162501183581e-06, "loss": 0.5104, "num_input_tokens_seen": 16915136, "step": 5365 }, { "epoch": 0.34376800460917994, "grad_norm": 29.71055793762207, "learning_rate": 1.6595768065059045e-06, "loss": 0.4607, "num_input_tokens_seen": 16931200, "step": 5370 }, { "epoch": 0.3440880865501568, "grad_norm": 26.671714782714844, "learning_rate": 1.6587365394040641e-06, "loss": 0.4652, "num_input_tokens_seen": 16946816, "step": 5375 }, { "epoch": 0.3444081684911337, "grad_norm": 28.532976150512695, "learning_rate": 1.6578954498619195e-06, "loss": 0.3893, "num_input_tokens_seen": 16962880, "step": 5380 }, { "epoch": 0.34472825043211064, "grad_norm": 31.44209861755371, "learning_rate": 1.6570535389295814e-06, "loss": 0.4587, "num_input_tokens_seen": 16978240, "step": 5385 }, { "epoch": 0.3450483323730875, "grad_norm": 22.520421981811523, "learning_rate": 1.6562108076581853e-06, "loss": 0.3628, "num_input_tokens_seen": 16993728, "step": 5390 }, { "epoch": 0.3453684143140644, "grad_norm": 37.299156188964844, "learning_rate": 1.6553672570998912e-06, "loss": 0.5903, "num_input_tokens_seen": 17009728, "step": 5395 }, { "epoch": 0.3456884962550413, "grad_norm": 38.635986328125, "learning_rate": 1.6545228883078815e-06, "loss": 0.4174, "num_input_tokens_seen": 17024640, "step": 5400 }, { "epoch": 0.34600857819601816, "grad_norm": 37.52071762084961, "learning_rate": 1.653677702336361e-06, "loss": 0.3541, "num_input_tokens_seen": 17040512, "step": 5405 }, { "epoch": 0.3463286601369951, "grad_norm": 19.03274917602539, "learning_rate": 1.6528317002405538e-06, "loss": 0.4657, "num_input_tokens_seen": 17056064, "step": 5410 }, { "epoch": 0.34664874207797197, "grad_norm": 28.59636878967285, "learning_rate": 1.6519848830767043e-06, "loss": 0.3692, "num_input_tokens_seen": 17072448, "step": 5415 }, { "epoch": 0.34696882401894885, "grad_norm": 38.893310546875, "learning_rate": 1.6511372519020726e-06, "loss": 0.6197, "num_input_tokens_seen": 17088320, "step": 5420 }, { "epoch": 0.34728890595992573, "grad_norm": 39.06748962402344, "learning_rate": 1.650288807774937e-06, "loss": 0.4291, "num_input_tokens_seen": 17104448, "step": 5425 }, { "epoch": 0.3476089879009026, "grad_norm": 36.80699920654297, "learning_rate": 1.6494395517545893e-06, "loss": 0.3964, "num_input_tokens_seen": 17121856, "step": 5430 }, { "epoch": 0.34792906984187955, "grad_norm": 47.49158477783203, "learning_rate": 1.6485894849013362e-06, "loss": 0.5052, "num_input_tokens_seen": 17136512, "step": 5435 }, { "epoch": 0.3482491517828564, "grad_norm": 26.2275333404541, "learning_rate": 1.6477386082764961e-06, "loss": 0.443, "num_input_tokens_seen": 17152640, "step": 5440 }, { "epoch": 0.3485692337238333, "grad_norm": 25.935453414916992, "learning_rate": 1.6468869229423983e-06, "loss": 0.362, "num_input_tokens_seen": 17167680, "step": 5445 }, { "epoch": 0.3488893156648102, "grad_norm": 57.09697341918945, "learning_rate": 1.6460344299623813e-06, "loss": 0.6295, "num_input_tokens_seen": 17183296, "step": 5450 }, { "epoch": 0.34920939760578706, "grad_norm": 62.791343688964844, "learning_rate": 1.6451811304007939e-06, "loss": 0.5424, "num_input_tokens_seen": 17198272, "step": 5455 }, { "epoch": 0.349529479546764, "grad_norm": 46.02850341796875, "learning_rate": 1.6443270253229895e-06, "loss": 0.5177, "num_input_tokens_seen": 17213376, "step": 5460 }, { "epoch": 0.3498495614877409, "grad_norm": 39.094146728515625, "learning_rate": 1.6434721157953288e-06, "loss": 0.4657, "num_input_tokens_seen": 17229632, "step": 5465 }, { "epoch": 0.35016964342871776, "grad_norm": 35.04682540893555, "learning_rate": 1.6426164028851765e-06, "loss": 0.579, "num_input_tokens_seen": 17245696, "step": 5470 }, { "epoch": 0.3504257089814993, "eval_loss": 0.43906036019325256, "eval_runtime": 49.1679, "eval_samples_per_second": 282.42, "eval_steps_per_second": 35.308, "num_input_tokens_seen": 17259840, "step": 5474 }, { "epoch": 0.35048972536969464, "grad_norm": 26.412445068359375, "learning_rate": 1.6417598876609002e-06, "loss": 0.3787, "num_input_tokens_seen": 17262976, "step": 5475 }, { "epoch": 0.3508098073106715, "grad_norm": 36.70389175415039, "learning_rate": 1.640902571191869e-06, "loss": 0.419, "num_input_tokens_seen": 17278336, "step": 5480 }, { "epoch": 0.3511298892516484, "grad_norm": 41.35291290283203, "learning_rate": 1.6400444545484524e-06, "loss": 0.3535, "num_input_tokens_seen": 17293248, "step": 5485 }, { "epoch": 0.35144997119262533, "grad_norm": 19.715316772460938, "learning_rate": 1.6391855388020193e-06, "loss": 0.4275, "num_input_tokens_seen": 17309184, "step": 5490 }, { "epoch": 0.3517700531336022, "grad_norm": 32.778873443603516, "learning_rate": 1.6383258250249363e-06, "loss": 0.4436, "num_input_tokens_seen": 17325248, "step": 5495 }, { "epoch": 0.3520901350745791, "grad_norm": 19.160093307495117, "learning_rate": 1.6374653142905661e-06, "loss": 0.4226, "num_input_tokens_seen": 17340736, "step": 5500 }, { "epoch": 0.35241021701555597, "grad_norm": 35.900447845458984, "learning_rate": 1.6366040076732662e-06, "loss": 0.4188, "num_input_tokens_seen": 17355904, "step": 5505 }, { "epoch": 0.35273029895653285, "grad_norm": 28.459196090698242, "learning_rate": 1.6357419062483882e-06, "loss": 0.4712, "num_input_tokens_seen": 17371264, "step": 5510 }, { "epoch": 0.3530503808975098, "grad_norm": 24.3746337890625, "learning_rate": 1.6348790110922758e-06, "loss": 0.4168, "num_input_tokens_seen": 17388608, "step": 5515 }, { "epoch": 0.35337046283848667, "grad_norm": 30.681352615356445, "learning_rate": 1.6340153232822635e-06, "loss": 0.4668, "num_input_tokens_seen": 17403712, "step": 5520 }, { "epoch": 0.35369054477946354, "grad_norm": 40.181785583496094, "learning_rate": 1.633150843896676e-06, "loss": 0.4809, "num_input_tokens_seen": 17421056, "step": 5525 }, { "epoch": 0.3540106267204404, "grad_norm": 58.2733154296875, "learning_rate": 1.6322855740148263e-06, "loss": 0.5588, "num_input_tokens_seen": 17436096, "step": 5530 }, { "epoch": 0.3543307086614173, "grad_norm": 24.002464294433594, "learning_rate": 1.6314195147170132e-06, "loss": 0.3701, "num_input_tokens_seen": 17452480, "step": 5535 }, { "epoch": 0.35465079060239424, "grad_norm": 28.335710525512695, "learning_rate": 1.6305526670845225e-06, "loss": 0.4038, "num_input_tokens_seen": 17467776, "step": 5540 }, { "epoch": 0.3549708725433711, "grad_norm": 46.305484771728516, "learning_rate": 1.6296850321996232e-06, "loss": 0.5081, "num_input_tokens_seen": 17482752, "step": 5545 }, { "epoch": 0.355290954484348, "grad_norm": 31.239910125732422, "learning_rate": 1.6288166111455683e-06, "loss": 0.3885, "num_input_tokens_seen": 17497792, "step": 5550 }, { "epoch": 0.3556110364253249, "grad_norm": 21.766979217529297, "learning_rate": 1.6279474050065906e-06, "loss": 0.4774, "num_input_tokens_seen": 17513024, "step": 5555 }, { "epoch": 0.35593111836630176, "grad_norm": 28.28034210205078, "learning_rate": 1.6270774148679054e-06, "loss": 0.4143, "num_input_tokens_seen": 17529024, "step": 5560 }, { "epoch": 0.35625120030727864, "grad_norm": 15.855846405029297, "learning_rate": 1.6262066418157048e-06, "loss": 0.3764, "num_input_tokens_seen": 17543936, "step": 5565 }, { "epoch": 0.35657128224825557, "grad_norm": 52.373390197753906, "learning_rate": 1.6253350869371595e-06, "loss": 0.5374, "num_input_tokens_seen": 17559168, "step": 5570 }, { "epoch": 0.35689136418923245, "grad_norm": 32.6270751953125, "learning_rate": 1.6244627513204158e-06, "loss": 0.3828, "num_input_tokens_seen": 17574912, "step": 5575 }, { "epoch": 0.35721144613020933, "grad_norm": 24.754146575927734, "learning_rate": 1.6235896360545954e-06, "loss": 0.4239, "num_input_tokens_seen": 17590272, "step": 5580 }, { "epoch": 0.3575315280711862, "grad_norm": 40.839786529541016, "learning_rate": 1.622715742229792e-06, "loss": 0.4379, "num_input_tokens_seen": 17605952, "step": 5585 }, { "epoch": 0.3578516100121631, "grad_norm": 21.1004638671875, "learning_rate": 1.6218410709370734e-06, "loss": 0.3813, "num_input_tokens_seen": 17621120, "step": 5590 }, { "epoch": 0.35817169195314, "grad_norm": 40.48637008666992, "learning_rate": 1.6209656232684768e-06, "loss": 0.5629, "num_input_tokens_seen": 17636096, "step": 5595 }, { "epoch": 0.3584917738941169, "grad_norm": 86.99573516845703, "learning_rate": 1.620089400317008e-06, "loss": 0.4427, "num_input_tokens_seen": 17652672, "step": 5600 }, { "epoch": 0.3588118558350938, "grad_norm": 33.9478645324707, "learning_rate": 1.6192124031766425e-06, "loss": 0.4875, "num_input_tokens_seen": 17668032, "step": 5605 }, { "epoch": 0.35913193777607066, "grad_norm": 28.759950637817383, "learning_rate": 1.6183346329423213e-06, "loss": 0.4474, "num_input_tokens_seen": 17683264, "step": 5610 }, { "epoch": 0.35945201971704754, "grad_norm": 49.65534210205078, "learning_rate": 1.6174560907099508e-06, "loss": 0.3642, "num_input_tokens_seen": 17699200, "step": 5615 }, { "epoch": 0.3597721016580245, "grad_norm": 21.184310913085938, "learning_rate": 1.6165767775764013e-06, "loss": 0.3489, "num_input_tokens_seen": 17714816, "step": 5620 }, { "epoch": 0.36009218359900136, "grad_norm": 36.253963470458984, "learning_rate": 1.6156966946395056e-06, "loss": 0.411, "num_input_tokens_seen": 17732352, "step": 5625 }, { "epoch": 0.36041226553997824, "grad_norm": 52.9035758972168, "learning_rate": 1.6148158429980577e-06, "loss": 0.5376, "num_input_tokens_seen": 17748288, "step": 5630 }, { "epoch": 0.3607323474809551, "grad_norm": 40.94856262207031, "learning_rate": 1.6139342237518108e-06, "loss": 0.3839, "num_input_tokens_seen": 17763520, "step": 5635 }, { "epoch": 0.361052429421932, "grad_norm": 33.37528610229492, "learning_rate": 1.6130518380014773e-06, "loss": 0.428, "num_input_tokens_seen": 17779328, "step": 5640 }, { "epoch": 0.3613725113629089, "grad_norm": 38.7974853515625, "learning_rate": 1.6121686868487259e-06, "loss": 0.4178, "num_input_tokens_seen": 17795584, "step": 5645 }, { "epoch": 0.3616925933038858, "grad_norm": 16.909976959228516, "learning_rate": 1.6112847713961815e-06, "loss": 0.44, "num_input_tokens_seen": 17810368, "step": 5650 }, { "epoch": 0.3620126752448627, "grad_norm": 27.985116958618164, "learning_rate": 1.610400092747423e-06, "loss": 0.4283, "num_input_tokens_seen": 17826496, "step": 5655 }, { "epoch": 0.36233275718583957, "grad_norm": 30.853046417236328, "learning_rate": 1.609514652006981e-06, "loss": 0.4191, "num_input_tokens_seen": 17841344, "step": 5660 }, { "epoch": 0.36265283912681645, "grad_norm": 31.243133544921875, "learning_rate": 1.60862845028034e-06, "loss": 0.5596, "num_input_tokens_seen": 17857408, "step": 5665 }, { "epoch": 0.36297292106779333, "grad_norm": 24.529314041137695, "learning_rate": 1.6077414886739327e-06, "loss": 0.4256, "num_input_tokens_seen": 17873280, "step": 5670 }, { "epoch": 0.36329300300877027, "grad_norm": 20.652950286865234, "learning_rate": 1.6068537682951412e-06, "loss": 0.4936, "num_input_tokens_seen": 17888448, "step": 5675 }, { "epoch": 0.36361308494974715, "grad_norm": 28.207895278930664, "learning_rate": 1.6059652902522947e-06, "loss": 0.4402, "num_input_tokens_seen": 17904320, "step": 5680 }, { "epoch": 0.363933166890724, "grad_norm": 51.1041145324707, "learning_rate": 1.6050760556546683e-06, "loss": 0.3667, "num_input_tokens_seen": 17919744, "step": 5685 }, { "epoch": 0.3642532488317009, "grad_norm": 26.759593963623047, "learning_rate": 1.6041860656124823e-06, "loss": 0.3814, "num_input_tokens_seen": 17934656, "step": 5690 }, { "epoch": 0.3645733307726778, "grad_norm": 39.42972946166992, "learning_rate": 1.6032953212368993e-06, "loss": 0.5375, "num_input_tokens_seen": 17950976, "step": 5695 }, { "epoch": 0.3648934127136547, "grad_norm": 22.8485164642334, "learning_rate": 1.6024038236400243e-06, "loss": 0.4688, "num_input_tokens_seen": 17966400, "step": 5700 }, { "epoch": 0.3652134946546316, "grad_norm": 97.59317016601562, "learning_rate": 1.6015115739349027e-06, "loss": 0.5649, "num_input_tokens_seen": 17983872, "step": 5705 }, { "epoch": 0.3655335765956085, "grad_norm": 33.57761764526367, "learning_rate": 1.6006185732355183e-06, "loss": 0.5461, "num_input_tokens_seen": 17999680, "step": 5710 }, { "epoch": 0.36585365853658536, "grad_norm": 21.023252487182617, "learning_rate": 1.5997248226567931e-06, "loss": 0.3802, "num_input_tokens_seen": 18014784, "step": 5715 }, { "epoch": 0.36617374047756224, "grad_norm": 22.69112205505371, "learning_rate": 1.5988303233145853e-06, "loss": 0.4997, "num_input_tokens_seen": 18029888, "step": 5720 }, { "epoch": 0.3664938224185392, "grad_norm": 29.783832550048828, "learning_rate": 1.597935076325688e-06, "loss": 0.3877, "num_input_tokens_seen": 18045632, "step": 5725 }, { "epoch": 0.36681390435951605, "grad_norm": 41.83056640625, "learning_rate": 1.5970390828078272e-06, "loss": 0.5839, "num_input_tokens_seen": 18060928, "step": 5730 }, { "epoch": 0.36713398630049293, "grad_norm": 16.932323455810547, "learning_rate": 1.5961423438796615e-06, "loss": 0.4567, "num_input_tokens_seen": 18076352, "step": 5735 }, { "epoch": 0.3674540682414698, "grad_norm": 43.994022369384766, "learning_rate": 1.59524486066078e-06, "loss": 0.4411, "num_input_tokens_seen": 18092096, "step": 5740 }, { "epoch": 0.3677741501824467, "grad_norm": 29.11937141418457, "learning_rate": 1.5943466342717012e-06, "loss": 0.5834, "num_input_tokens_seen": 18107648, "step": 5745 }, { "epoch": 0.36809423212342357, "grad_norm": 26.03652572631836, "learning_rate": 1.5934476658338708e-06, "loss": 0.4433, "num_input_tokens_seen": 18123264, "step": 5750 }, { "epoch": 0.3684143140644005, "grad_norm": 25.282079696655273, "learning_rate": 1.5925479564696619e-06, "loss": 0.5414, "num_input_tokens_seen": 18138368, "step": 5755 }, { "epoch": 0.3687343960053774, "grad_norm": 11.744181632995605, "learning_rate": 1.5916475073023721e-06, "loss": 0.3336, "num_input_tokens_seen": 18154432, "step": 5760 }, { "epoch": 0.36905447794635426, "grad_norm": 50.17704391479492, "learning_rate": 1.5907463194562226e-06, "loss": 0.3355, "num_input_tokens_seen": 18171200, "step": 5765 }, { "epoch": 0.36937455988733114, "grad_norm": 24.319721221923828, "learning_rate": 1.589844394056357e-06, "loss": 0.3807, "num_input_tokens_seen": 18187008, "step": 5770 }, { "epoch": 0.369694641828308, "grad_norm": 48.6660270690918, "learning_rate": 1.5889417322288403e-06, "loss": 0.3492, "num_input_tokens_seen": 18202944, "step": 5775 }, { "epoch": 0.37001472376928496, "grad_norm": 86.95288848876953, "learning_rate": 1.5880383351006556e-06, "loss": 0.4969, "num_input_tokens_seen": 18217984, "step": 5780 }, { "epoch": 0.37033480571026184, "grad_norm": 30.690433502197266, "learning_rate": 1.5871342037997055e-06, "loss": 0.505, "num_input_tokens_seen": 18233984, "step": 5785 }, { "epoch": 0.3706548876512387, "grad_norm": 43.78403091430664, "learning_rate": 1.5862293394548082e-06, "loss": 0.403, "num_input_tokens_seen": 18249024, "step": 5790 }, { "epoch": 0.3709749695922156, "grad_norm": 73.22137451171875, "learning_rate": 1.5853237431956972e-06, "loss": 0.3414, "num_input_tokens_seen": 18264256, "step": 5795 }, { "epoch": 0.3712950515331925, "grad_norm": 40.81637954711914, "learning_rate": 1.5844174161530206e-06, "loss": 0.5495, "num_input_tokens_seen": 18279936, "step": 5800 }, { "epoch": 0.3716151334741694, "grad_norm": 24.28744888305664, "learning_rate": 1.5835103594583382e-06, "loss": 0.4039, "num_input_tokens_seen": 18295488, "step": 5805 }, { "epoch": 0.3719352154151463, "grad_norm": 25.278915405273438, "learning_rate": 1.5826025742441207e-06, "loss": 0.5329, "num_input_tokens_seen": 18311360, "step": 5810 }, { "epoch": 0.37225529735612317, "grad_norm": 25.298076629638672, "learning_rate": 1.5816940616437486e-06, "loss": 0.4284, "num_input_tokens_seen": 18326592, "step": 5815 }, { "epoch": 0.37257537929710005, "grad_norm": 32.25617599487305, "learning_rate": 1.5807848227915108e-06, "loss": 0.3573, "num_input_tokens_seen": 18344000, "step": 5820 }, { "epoch": 0.37289546123807693, "grad_norm": 61.83903503417969, "learning_rate": 1.5798748588226028e-06, "loss": 0.4787, "num_input_tokens_seen": 18359872, "step": 5825 }, { "epoch": 0.3732155431790538, "grad_norm": 42.77378463745117, "learning_rate": 1.578964170873125e-06, "loss": 0.4776, "num_input_tokens_seen": 18374400, "step": 5830 }, { "epoch": 0.37353562512003075, "grad_norm": 19.963783264160156, "learning_rate": 1.5780527600800816e-06, "loss": 0.2927, "num_input_tokens_seen": 18390656, "step": 5835 }, { "epoch": 0.3738557070610076, "grad_norm": 63.39997100830078, "learning_rate": 1.5771406275813808e-06, "loss": 0.4476, "num_input_tokens_seen": 18406400, "step": 5840 }, { "epoch": 0.3741757890019845, "grad_norm": 51.011985778808594, "learning_rate": 1.5762277745158297e-06, "loss": 0.5497, "num_input_tokens_seen": 18422848, "step": 5845 }, { "epoch": 0.3744958709429614, "grad_norm": 70.113525390625, "learning_rate": 1.5753142020231365e-06, "loss": 0.4932, "num_input_tokens_seen": 18438912, "step": 5850 }, { "epoch": 0.37481595288393826, "grad_norm": 43.822303771972656, "learning_rate": 1.5743999112439073e-06, "loss": 0.525, "num_input_tokens_seen": 18455488, "step": 5855 }, { "epoch": 0.3751360348249152, "grad_norm": 36.65006637573242, "learning_rate": 1.5734849033196446e-06, "loss": 0.3954, "num_input_tokens_seen": 18470080, "step": 5860 }, { "epoch": 0.3754561167658921, "grad_norm": 42.59208297729492, "learning_rate": 1.5725691793927468e-06, "loss": 0.4337, "num_input_tokens_seen": 18484480, "step": 5865 }, { "epoch": 0.37577619870686896, "grad_norm": 23.022443771362305, "learning_rate": 1.5716527406065057e-06, "loss": 0.46, "num_input_tokens_seen": 18501312, "step": 5870 }, { "epoch": 0.37609628064784584, "grad_norm": 25.66585350036621, "learning_rate": 1.570735588105106e-06, "loss": 0.449, "num_input_tokens_seen": 18515968, "step": 5875 }, { "epoch": 0.3764163625888227, "grad_norm": 16.919160842895508, "learning_rate": 1.5698177230336234e-06, "loss": 0.3901, "num_input_tokens_seen": 18531200, "step": 5880 }, { "epoch": 0.37673644452979965, "grad_norm": 31.569171905517578, "learning_rate": 1.568899146538023e-06, "loss": 0.2699, "num_input_tokens_seen": 18547712, "step": 5885 }, { "epoch": 0.37705652647077653, "grad_norm": 28.067827224731445, "learning_rate": 1.5679798597651587e-06, "loss": 0.4111, "num_input_tokens_seen": 18562752, "step": 5890 }, { "epoch": 0.3773766084117534, "grad_norm": 40.843196868896484, "learning_rate": 1.5670598638627706e-06, "loss": 0.4265, "num_input_tokens_seen": 18578368, "step": 5895 }, { "epoch": 0.3776966903527303, "grad_norm": 40.625885009765625, "learning_rate": 1.5661391599794847e-06, "loss": 0.3882, "num_input_tokens_seen": 18593408, "step": 5900 }, { "epoch": 0.37801677229370717, "grad_norm": 28.81650161743164, "learning_rate": 1.56521774926481e-06, "loss": 0.4155, "num_input_tokens_seen": 18607872, "step": 5905 }, { "epoch": 0.3783368542346841, "grad_norm": 25.494752883911133, "learning_rate": 1.5642956328691393e-06, "loss": 0.359, "num_input_tokens_seen": 18624000, "step": 5910 }, { "epoch": 0.378656936175661, "grad_norm": 52.101295471191406, "learning_rate": 1.5633728119437451e-06, "loss": 0.564, "num_input_tokens_seen": 18640704, "step": 5915 }, { "epoch": 0.37897701811663786, "grad_norm": 28.191926956176758, "learning_rate": 1.5624492876407807e-06, "loss": 0.4568, "num_input_tokens_seen": 18658368, "step": 5920 }, { "epoch": 0.37929710005761474, "grad_norm": 46.10580825805664, "learning_rate": 1.5615250611132766e-06, "loss": 0.4087, "num_input_tokens_seen": 18675584, "step": 5925 }, { "epoch": 0.3796171819985916, "grad_norm": 23.61751365661621, "learning_rate": 1.5606001335151405e-06, "loss": 0.5669, "num_input_tokens_seen": 18691904, "step": 5930 }, { "epoch": 0.3799372639395685, "grad_norm": 33.682106018066406, "learning_rate": 1.5596745060011561e-06, "loss": 0.3744, "num_input_tokens_seen": 18708736, "step": 5935 }, { "epoch": 0.38025734588054544, "grad_norm": 35.933292388916016, "learning_rate": 1.5587481797269793e-06, "loss": 0.3464, "num_input_tokens_seen": 18724032, "step": 5940 }, { "epoch": 0.3805774278215223, "grad_norm": 38.045902252197266, "learning_rate": 1.5578211558491396e-06, "loss": 0.4203, "num_input_tokens_seen": 18740352, "step": 5945 }, { "epoch": 0.3808975097624992, "grad_norm": 24.26993751525879, "learning_rate": 1.5568934355250375e-06, "loss": 0.3225, "num_input_tokens_seen": 18754560, "step": 5950 }, { "epoch": 0.3812175917034761, "grad_norm": 67.30828094482422, "learning_rate": 1.5559650199129423e-06, "loss": 0.6491, "num_input_tokens_seen": 18769280, "step": 5955 }, { "epoch": 0.38153767364445296, "grad_norm": 46.19745635986328, "learning_rate": 1.5550359101719921e-06, "loss": 0.4012, "num_input_tokens_seen": 18784512, "step": 5960 }, { "epoch": 0.3818577555854299, "grad_norm": 62.694427490234375, "learning_rate": 1.554106107462191e-06, "loss": 0.3561, "num_input_tokens_seen": 18800384, "step": 5965 }, { "epoch": 0.38217783752640677, "grad_norm": 43.13536834716797, "learning_rate": 1.5531756129444092e-06, "loss": 0.4248, "num_input_tokens_seen": 18815552, "step": 5970 }, { "epoch": 0.38249791946738365, "grad_norm": 23.844327926635742, "learning_rate": 1.5522444277803796e-06, "loss": 0.3884, "num_input_tokens_seen": 18830080, "step": 5975 }, { "epoch": 0.38281800140836053, "grad_norm": 30.173629760742188, "learning_rate": 1.5513125531326976e-06, "loss": 0.4319, "num_input_tokens_seen": 18846272, "step": 5980 }, { "epoch": 0.3831380833493374, "grad_norm": 29.421924591064453, "learning_rate": 1.5503799901648198e-06, "loss": 0.3747, "num_input_tokens_seen": 18860928, "step": 5985 }, { "epoch": 0.38345816529031435, "grad_norm": 61.6126594543457, "learning_rate": 1.5494467400410625e-06, "loss": 0.4553, "num_input_tokens_seen": 18877120, "step": 5990 }, { "epoch": 0.3837782472312912, "grad_norm": 50.92166519165039, "learning_rate": 1.5485128039265986e-06, "loss": 0.6017, "num_input_tokens_seen": 18892224, "step": 5995 }, { "epoch": 0.3840983291722681, "grad_norm": 51.964595794677734, "learning_rate": 1.547578182987459e-06, "loss": 0.4408, "num_input_tokens_seen": 18907008, "step": 6000 }, { "epoch": 0.384418411113245, "grad_norm": 21.846920013427734, "learning_rate": 1.5466428783905286e-06, "loss": 0.2736, "num_input_tokens_seen": 18922368, "step": 6005 }, { "epoch": 0.38473849305422186, "grad_norm": 30.069700241088867, "learning_rate": 1.5457068913035463e-06, "loss": 0.4288, "num_input_tokens_seen": 18937536, "step": 6010 }, { "epoch": 0.38505857499519874, "grad_norm": 40.16860580444336, "learning_rate": 1.544770222895103e-06, "loss": 0.4784, "num_input_tokens_seen": 18954048, "step": 6015 }, { "epoch": 0.3853786569361757, "grad_norm": 30.41385269165039, "learning_rate": 1.5438328743346398e-06, "loss": 0.5188, "num_input_tokens_seen": 18969472, "step": 6020 }, { "epoch": 0.38569873887715256, "grad_norm": 22.75130844116211, "learning_rate": 1.5428948467924478e-06, "loss": 0.4098, "num_input_tokens_seen": 18983872, "step": 6025 }, { "epoch": 0.38601882081812944, "grad_norm": 20.55361557006836, "learning_rate": 1.5419561414396656e-06, "loss": 0.3223, "num_input_tokens_seen": 18999360, "step": 6030 }, { "epoch": 0.3863389027591063, "grad_norm": 23.010210037231445, "learning_rate": 1.541016759448277e-06, "loss": 0.4888, "num_input_tokens_seen": 19015424, "step": 6035 }, { "epoch": 0.3866589847000832, "grad_norm": 30.879016876220703, "learning_rate": 1.5400767019911124e-06, "loss": 0.3641, "num_input_tokens_seen": 19031616, "step": 6040 }, { "epoch": 0.38697906664106013, "grad_norm": 31.967321395874023, "learning_rate": 1.539135970241844e-06, "loss": 0.4821, "num_input_tokens_seen": 19047040, "step": 6045 }, { "epoch": 0.387299148582037, "grad_norm": 50.726158142089844, "learning_rate": 1.5381945653749866e-06, "loss": 0.479, "num_input_tokens_seen": 19062848, "step": 6050 }, { "epoch": 0.3876192305230139, "grad_norm": 80.43476867675781, "learning_rate": 1.5372524885658952e-06, "loss": 0.5564, "num_input_tokens_seen": 19078976, "step": 6055 }, { "epoch": 0.38793931246399077, "grad_norm": 24.717586517333984, "learning_rate": 1.5363097409907638e-06, "loss": 0.3676, "num_input_tokens_seen": 19093632, "step": 6060 }, { "epoch": 0.38825939440496765, "grad_norm": 22.33540916442871, "learning_rate": 1.535366323826624e-06, "loss": 0.3605, "num_input_tokens_seen": 19109056, "step": 6065 }, { "epoch": 0.3885794763459446, "grad_norm": 46.442413330078125, "learning_rate": 1.534422238251343e-06, "loss": 0.3699, "num_input_tokens_seen": 19124544, "step": 6070 }, { "epoch": 0.38889955828692147, "grad_norm": 33.82103729248047, "learning_rate": 1.5334774854436223e-06, "loss": 0.3834, "num_input_tokens_seen": 19140480, "step": 6075 }, { "epoch": 0.38921964022789834, "grad_norm": 41.09638214111328, "learning_rate": 1.5325320665829975e-06, "loss": 0.3776, "num_input_tokens_seen": 19156736, "step": 6080 }, { "epoch": 0.3895397221688752, "grad_norm": 31.53407096862793, "learning_rate": 1.5315859828498352e-06, "loss": 0.4455, "num_input_tokens_seen": 19171520, "step": 6085 }, { "epoch": 0.3898598041098521, "grad_norm": 31.16860580444336, "learning_rate": 1.5306392354253316e-06, "loss": 0.4921, "num_input_tokens_seen": 19187136, "step": 6090 }, { "epoch": 0.39017988605082904, "grad_norm": 23.219755172729492, "learning_rate": 1.5296918254915123e-06, "loss": 0.4377, "num_input_tokens_seen": 19201856, "step": 6095 }, { "epoch": 0.3904999679918059, "grad_norm": 26.253602981567383, "learning_rate": 1.5287437542312296e-06, "loss": 0.3869, "num_input_tokens_seen": 19216704, "step": 6100 }, { "epoch": 0.3908200499327828, "grad_norm": 61.03850173950195, "learning_rate": 1.5277950228281614e-06, "loss": 0.5316, "num_input_tokens_seen": 19233408, "step": 6105 }, { "epoch": 0.3911401318737597, "grad_norm": 26.556734085083008, "learning_rate": 1.52684563246681e-06, "loss": 0.354, "num_input_tokens_seen": 19250048, "step": 6110 }, { "epoch": 0.39146021381473656, "grad_norm": 16.79180335998535, "learning_rate": 1.5258955843325015e-06, "loss": 0.4243, "num_input_tokens_seen": 19266560, "step": 6115 }, { "epoch": 0.39178029575571344, "grad_norm": 58.60289764404297, "learning_rate": 1.5249448796113804e-06, "loss": 0.4885, "num_input_tokens_seen": 19281408, "step": 6120 }, { "epoch": 0.39210037769669037, "grad_norm": 47.47416687011719, "learning_rate": 1.5239935194904141e-06, "loss": 0.4747, "num_input_tokens_seen": 19296384, "step": 6125 }, { "epoch": 0.39242045963766725, "grad_norm": 24.381053924560547, "learning_rate": 1.523041505157386e-06, "loss": 0.3702, "num_input_tokens_seen": 19312000, "step": 6130 }, { "epoch": 0.39274054157864413, "grad_norm": 25.145042419433594, "learning_rate": 1.5220888378008977e-06, "loss": 0.3909, "num_input_tokens_seen": 19327488, "step": 6135 }, { "epoch": 0.393060623519621, "grad_norm": 22.552824020385742, "learning_rate": 1.5211355186103654e-06, "loss": 0.4661, "num_input_tokens_seen": 19342080, "step": 6140 }, { "epoch": 0.3933807054605979, "grad_norm": 50.69114303588867, "learning_rate": 1.5201815487760192e-06, "loss": 0.4126, "num_input_tokens_seen": 19358336, "step": 6145 }, { "epoch": 0.3937007874015748, "grad_norm": 92.56407165527344, "learning_rate": 1.5192269294889019e-06, "loss": 0.508, "num_input_tokens_seen": 19373376, "step": 6150 }, { "epoch": 0.3940208693425517, "grad_norm": 28.557926177978516, "learning_rate": 1.5182716619408666e-06, "loss": 0.4029, "num_input_tokens_seen": 19388608, "step": 6155 }, { "epoch": 0.3943409512835286, "grad_norm": 31.255754470825195, "learning_rate": 1.5173157473245764e-06, "loss": 0.5398, "num_input_tokens_seen": 19403264, "step": 6160 }, { "epoch": 0.39466103322450546, "grad_norm": 36.93677520751953, "learning_rate": 1.5163591868335016e-06, "loss": 0.4363, "num_input_tokens_seen": 19418816, "step": 6165 }, { "epoch": 0.39498111516548234, "grad_norm": 39.658329010009766, "learning_rate": 1.515401981661919e-06, "loss": 0.5781, "num_input_tokens_seen": 19435392, "step": 6170 }, { "epoch": 0.3953011971064593, "grad_norm": 32.506134033203125, "learning_rate": 1.514444133004911e-06, "loss": 0.4592, "num_input_tokens_seen": 19450048, "step": 6175 }, { "epoch": 0.39562127904743616, "grad_norm": 30.993446350097656, "learning_rate": 1.5134856420583631e-06, "loss": 0.4592, "num_input_tokens_seen": 19466368, "step": 6180 }, { "epoch": 0.39594136098841304, "grad_norm": 23.403287887573242, "learning_rate": 1.5125265100189614e-06, "loss": 0.3338, "num_input_tokens_seen": 19482624, "step": 6185 }, { "epoch": 0.3962614429293899, "grad_norm": 32.384483337402344, "learning_rate": 1.5115667380841948e-06, "loss": 0.5304, "num_input_tokens_seen": 19498048, "step": 6190 }, { "epoch": 0.3965815248703668, "grad_norm": 19.235095977783203, "learning_rate": 1.510606327452349e-06, "loss": 0.43, "num_input_tokens_seen": 19515264, "step": 6195 }, { "epoch": 0.3969016068113437, "grad_norm": 34.2067985534668, "learning_rate": 1.5096452793225082e-06, "loss": 0.4319, "num_input_tokens_seen": 19533056, "step": 6200 }, { "epoch": 0.3972216887523206, "grad_norm": 30.670093536376953, "learning_rate": 1.5086835948945522e-06, "loss": 0.4003, "num_input_tokens_seen": 19548480, "step": 6205 }, { "epoch": 0.3975417706932975, "grad_norm": 29.265615463256836, "learning_rate": 1.5077212753691556e-06, "loss": 0.3271, "num_input_tokens_seen": 19563712, "step": 6210 }, { "epoch": 0.39786185263427437, "grad_norm": 35.499732971191406, "learning_rate": 1.5067583219477852e-06, "loss": 0.4049, "num_input_tokens_seen": 19578624, "step": 6215 }, { "epoch": 0.39818193457525125, "grad_norm": 30.449113845825195, "learning_rate": 1.5057947358327e-06, "loss": 0.3916, "num_input_tokens_seen": 19593408, "step": 6220 }, { "epoch": 0.39850201651622813, "grad_norm": 37.85767364501953, "learning_rate": 1.504830518226948e-06, "loss": 0.4907, "num_input_tokens_seen": 19609216, "step": 6225 }, { "epoch": 0.39882209845720507, "grad_norm": 19.524030685424805, "learning_rate": 1.5038656703343672e-06, "loss": 0.449, "num_input_tokens_seen": 19624896, "step": 6230 }, { "epoch": 0.39914218039818194, "grad_norm": 76.64604949951172, "learning_rate": 1.5029001933595805e-06, "loss": 0.4925, "num_input_tokens_seen": 19640128, "step": 6235 }, { "epoch": 0.3994622623391588, "grad_norm": 32.2121696472168, "learning_rate": 1.501934088507998e-06, "loss": 0.3433, "num_input_tokens_seen": 19655680, "step": 6240 }, { "epoch": 0.3997823442801357, "grad_norm": 32.78192901611328, "learning_rate": 1.5009673569858126e-06, "loss": 0.6227, "num_input_tokens_seen": 19672192, "step": 6245 }, { "epoch": 0.4001024262211126, "grad_norm": 46.18693542480469, "learning_rate": 1.5e-06, "loss": 0.5284, "num_input_tokens_seen": 19688896, "step": 6250 }, { "epoch": 0.4004225081620895, "grad_norm": 18.203367233276367, "learning_rate": 1.4990320187583167e-06, "loss": 0.3547, "num_input_tokens_seen": 19704128, "step": 6255 }, { "epoch": 0.4004865245502849, "eval_loss": 0.42333245277404785, "eval_runtime": 49.177, "eval_samples_per_second": 282.368, "eval_steps_per_second": 35.301, "num_input_tokens_seen": 19707456, "step": 6256 }, { "epoch": 0.4007425901030664, "grad_norm": 34.608970642089844, "learning_rate": 1.4980634144692986e-06, "loss": 0.395, "num_input_tokens_seen": 19719744, "step": 6260 }, { "epoch": 0.4010626720440433, "grad_norm": 48.07910919189453, "learning_rate": 1.4970941883422599e-06, "loss": 0.3795, "num_input_tokens_seen": 19736128, "step": 6265 }, { "epoch": 0.40138275398502016, "grad_norm": 25.680130004882812, "learning_rate": 1.4961243415872901e-06, "loss": 0.4165, "num_input_tokens_seen": 19751296, "step": 6270 }, { "epoch": 0.40170283592599704, "grad_norm": 64.0484848022461, "learning_rate": 1.4951538754152551e-06, "loss": 0.4057, "num_input_tokens_seen": 19765888, "step": 6275 }, { "epoch": 0.402022917866974, "grad_norm": 29.654808044433594, "learning_rate": 1.4941827910377925e-06, "loss": 0.4205, "num_input_tokens_seen": 19780864, "step": 6280 }, { "epoch": 0.40234299980795085, "grad_norm": 23.910985946655273, "learning_rate": 1.4932110896673131e-06, "loss": 0.4014, "num_input_tokens_seen": 19796864, "step": 6285 }, { "epoch": 0.40266308174892773, "grad_norm": 29.215768814086914, "learning_rate": 1.4922387725169973e-06, "loss": 0.5395, "num_input_tokens_seen": 19811904, "step": 6290 }, { "epoch": 0.4029831636899046, "grad_norm": 33.94330596923828, "learning_rate": 1.4912658408007947e-06, "loss": 0.4049, "num_input_tokens_seen": 19827456, "step": 6295 }, { "epoch": 0.4033032456308815, "grad_norm": 33.57729721069336, "learning_rate": 1.4902922957334215e-06, "loss": 0.4269, "num_input_tokens_seen": 19842496, "step": 6300 }, { "epoch": 0.40362332757185837, "grad_norm": 43.49477005004883, "learning_rate": 1.4893181385303608e-06, "loss": 0.408, "num_input_tokens_seen": 19858240, "step": 6305 }, { "epoch": 0.4039434095128353, "grad_norm": 32.84989547729492, "learning_rate": 1.4883433704078584e-06, "loss": 0.3994, "num_input_tokens_seen": 19874368, "step": 6310 }, { "epoch": 0.4042634914538122, "grad_norm": 32.79706954956055, "learning_rate": 1.4873679925829246e-06, "loss": 0.3874, "num_input_tokens_seen": 19891904, "step": 6315 }, { "epoch": 0.40458357339478906, "grad_norm": 21.430252075195312, "learning_rate": 1.4863920062733298e-06, "loss": 0.4077, "num_input_tokens_seen": 19907392, "step": 6320 }, { "epoch": 0.40490365533576594, "grad_norm": 46.1721305847168, "learning_rate": 1.485415412697604e-06, "loss": 0.3779, "num_input_tokens_seen": 19922624, "step": 6325 }, { "epoch": 0.4052237372767428, "grad_norm": 36.21952438354492, "learning_rate": 1.484438213075036e-06, "loss": 0.4348, "num_input_tokens_seen": 19939328, "step": 6330 }, { "epoch": 0.40554381921771976, "grad_norm": 42.817806243896484, "learning_rate": 1.4834604086256713e-06, "loss": 0.4465, "num_input_tokens_seen": 19955392, "step": 6335 }, { "epoch": 0.40586390115869664, "grad_norm": 35.81399154663086, "learning_rate": 1.4824820005703097e-06, "loss": 0.3818, "num_input_tokens_seen": 19971520, "step": 6340 }, { "epoch": 0.4061839830996735, "grad_norm": 22.910531997680664, "learning_rate": 1.4815029901305061e-06, "loss": 0.46, "num_input_tokens_seen": 19988352, "step": 6345 }, { "epoch": 0.4065040650406504, "grad_norm": 29.75078010559082, "learning_rate": 1.480523378528565e-06, "loss": 0.4748, "num_input_tokens_seen": 20005184, "step": 6350 }, { "epoch": 0.4068241469816273, "grad_norm": 45.245052337646484, "learning_rate": 1.4795431669875441e-06, "loss": 0.4064, "num_input_tokens_seen": 20020800, "step": 6355 }, { "epoch": 0.4071442289226042, "grad_norm": 29.282560348510742, "learning_rate": 1.478562356731249e-06, "loss": 0.472, "num_input_tokens_seen": 20036416, "step": 6360 }, { "epoch": 0.4074643108635811, "grad_norm": 37.65520095825195, "learning_rate": 1.4775809489842326e-06, "loss": 0.4525, "num_input_tokens_seen": 20053184, "step": 6365 }, { "epoch": 0.40778439280455797, "grad_norm": 28.39930534362793, "learning_rate": 1.4765989449717937e-06, "loss": 0.3987, "num_input_tokens_seen": 20069888, "step": 6370 }, { "epoch": 0.40810447474553485, "grad_norm": 64.06832885742188, "learning_rate": 1.4756163459199763e-06, "loss": 0.5504, "num_input_tokens_seen": 20085760, "step": 6375 }, { "epoch": 0.40842455668651173, "grad_norm": 34.247596740722656, "learning_rate": 1.4746331530555665e-06, "loss": 0.2742, "num_input_tokens_seen": 20101056, "step": 6380 }, { "epoch": 0.4087446386274886, "grad_norm": 41.40673828125, "learning_rate": 1.4736493676060923e-06, "loss": 0.4133, "num_input_tokens_seen": 20116352, "step": 6385 }, { "epoch": 0.40906472056846555, "grad_norm": 20.095537185668945, "learning_rate": 1.4726649907998216e-06, "loss": 0.3642, "num_input_tokens_seen": 20131712, "step": 6390 }, { "epoch": 0.4093848025094424, "grad_norm": 30.422456741333008, "learning_rate": 1.4716800238657599e-06, "loss": 0.3759, "num_input_tokens_seen": 20146880, "step": 6395 }, { "epoch": 0.4097048844504193, "grad_norm": 16.951066970825195, "learning_rate": 1.4706944680336505e-06, "loss": 0.2767, "num_input_tokens_seen": 20163520, "step": 6400 }, { "epoch": 0.4100249663913962, "grad_norm": 42.80522537231445, "learning_rate": 1.469708324533971e-06, "loss": 0.4681, "num_input_tokens_seen": 20177984, "step": 6405 }, { "epoch": 0.41034504833237306, "grad_norm": 16.141464233398438, "learning_rate": 1.4687215945979335e-06, "loss": 0.3395, "num_input_tokens_seen": 20193472, "step": 6410 }, { "epoch": 0.41066513027335, "grad_norm": 42.42402267456055, "learning_rate": 1.4677342794574815e-06, "loss": 0.4507, "num_input_tokens_seen": 20210624, "step": 6415 }, { "epoch": 0.4109852122143269, "grad_norm": 58.724464416503906, "learning_rate": 1.4667463803452902e-06, "loss": 0.4199, "num_input_tokens_seen": 20226688, "step": 6420 }, { "epoch": 0.41130529415530376, "grad_norm": 41.05823516845703, "learning_rate": 1.4657578984947627e-06, "loss": 0.4472, "num_input_tokens_seen": 20244608, "step": 6425 }, { "epoch": 0.41162537609628064, "grad_norm": 36.066612243652344, "learning_rate": 1.4647688351400303e-06, "loss": 0.3699, "num_input_tokens_seen": 20261184, "step": 6430 }, { "epoch": 0.4119454580372575, "grad_norm": 21.72051239013672, "learning_rate": 1.46377919151595e-06, "loss": 0.3348, "num_input_tokens_seen": 20276736, "step": 6435 }, { "epoch": 0.41226553997823445, "grad_norm": 29.512678146362305, "learning_rate": 1.462788968858104e-06, "loss": 0.4651, "num_input_tokens_seen": 20293888, "step": 6440 }, { "epoch": 0.41258562191921133, "grad_norm": 20.376218795776367, "learning_rate": 1.4617981684027966e-06, "loss": 0.482, "num_input_tokens_seen": 20309696, "step": 6445 }, { "epoch": 0.4129057038601882, "grad_norm": 19.641904830932617, "learning_rate": 1.4608067913870536e-06, "loss": 0.4013, "num_input_tokens_seen": 20325632, "step": 6450 }, { "epoch": 0.4132257858011651, "grad_norm": 22.1761474609375, "learning_rate": 1.4598148390486213e-06, "loss": 0.3968, "num_input_tokens_seen": 20341888, "step": 6455 }, { "epoch": 0.41354586774214197, "grad_norm": 30.586984634399414, "learning_rate": 1.4588223126259639e-06, "loss": 0.5073, "num_input_tokens_seen": 20358656, "step": 6460 }, { "epoch": 0.4138659496831189, "grad_norm": 15.245569229125977, "learning_rate": 1.4578292133582615e-06, "loss": 0.3245, "num_input_tokens_seen": 20372864, "step": 6465 }, { "epoch": 0.4141860316240958, "grad_norm": 27.139429092407227, "learning_rate": 1.456835542485411e-06, "loss": 0.3954, "num_input_tokens_seen": 20387840, "step": 6470 }, { "epoch": 0.41450611356507266, "grad_norm": 32.64242172241211, "learning_rate": 1.4558413012480215e-06, "loss": 0.4092, "num_input_tokens_seen": 20404736, "step": 6475 }, { "epoch": 0.41482619550604954, "grad_norm": 37.946998596191406, "learning_rate": 1.4548464908874156e-06, "loss": 0.5673, "num_input_tokens_seen": 20422848, "step": 6480 }, { "epoch": 0.4151462774470264, "grad_norm": 31.876144409179688, "learning_rate": 1.4538511126456255e-06, "loss": 0.3996, "num_input_tokens_seen": 20438016, "step": 6485 }, { "epoch": 0.4154663593880033, "grad_norm": 54.237831115722656, "learning_rate": 1.452855167765392e-06, "loss": 0.5913, "num_input_tokens_seen": 20454464, "step": 6490 }, { "epoch": 0.41578644132898024, "grad_norm": 24.1745548248291, "learning_rate": 1.4518586574901647e-06, "loss": 0.4487, "num_input_tokens_seen": 20470464, "step": 6495 }, { "epoch": 0.4161065232699571, "grad_norm": 27.391712188720703, "learning_rate": 1.450861583064098e-06, "loss": 0.4617, "num_input_tokens_seen": 20485696, "step": 6500 }, { "epoch": 0.416426605210934, "grad_norm": 24.388179779052734, "learning_rate": 1.4498639457320515e-06, "loss": 0.3642, "num_input_tokens_seen": 20500608, "step": 6505 }, { "epoch": 0.4167466871519109, "grad_norm": 34.60757827758789, "learning_rate": 1.4488657467395865e-06, "loss": 0.4686, "num_input_tokens_seen": 20515776, "step": 6510 }, { "epoch": 0.41706676909288776, "grad_norm": 38.96852493286133, "learning_rate": 1.4478669873329663e-06, "loss": 0.5078, "num_input_tokens_seen": 20531456, "step": 6515 }, { "epoch": 0.4173868510338647, "grad_norm": 32.143882751464844, "learning_rate": 1.4468676687591536e-06, "loss": 0.386, "num_input_tokens_seen": 20547200, "step": 6520 }, { "epoch": 0.41770693297484157, "grad_norm": 28.233505249023438, "learning_rate": 1.4458677922658104e-06, "loss": 0.4358, "num_input_tokens_seen": 20562560, "step": 6525 }, { "epoch": 0.41802701491581845, "grad_norm": 18.132049560546875, "learning_rate": 1.444867359101293e-06, "loss": 0.2798, "num_input_tokens_seen": 20577344, "step": 6530 }, { "epoch": 0.41834709685679533, "grad_norm": 35.095619201660156, "learning_rate": 1.4438663705146545e-06, "loss": 0.3529, "num_input_tokens_seen": 20593088, "step": 6535 }, { "epoch": 0.4186671787977722, "grad_norm": 29.12217903137207, "learning_rate": 1.442864827755641e-06, "loss": 0.3589, "num_input_tokens_seen": 20609792, "step": 6540 }, { "epoch": 0.41898726073874915, "grad_norm": 18.185195922851562, "learning_rate": 1.4418627320746901e-06, "loss": 0.4407, "num_input_tokens_seen": 20625280, "step": 6545 }, { "epoch": 0.419307342679726, "grad_norm": 31.992891311645508, "learning_rate": 1.4408600847229304e-06, "loss": 0.3854, "num_input_tokens_seen": 20641984, "step": 6550 }, { "epoch": 0.4196274246207029, "grad_norm": 31.761362075805664, "learning_rate": 1.4398568869521782e-06, "loss": 0.5281, "num_input_tokens_seen": 20658240, "step": 6555 }, { "epoch": 0.4199475065616798, "grad_norm": 31.19809341430664, "learning_rate": 1.4388531400149384e-06, "loss": 0.3645, "num_input_tokens_seen": 20673408, "step": 6560 }, { "epoch": 0.42026758850265666, "grad_norm": 46.35468673706055, "learning_rate": 1.4378488451644007e-06, "loss": 0.3866, "num_input_tokens_seen": 20688960, "step": 6565 }, { "epoch": 0.42058767044363354, "grad_norm": 25.512950897216797, "learning_rate": 1.4368440036544386e-06, "loss": 0.4049, "num_input_tokens_seen": 20704768, "step": 6570 }, { "epoch": 0.4209077523846105, "grad_norm": 43.326324462890625, "learning_rate": 1.435838616739609e-06, "loss": 0.4199, "num_input_tokens_seen": 20719808, "step": 6575 }, { "epoch": 0.42122783432558736, "grad_norm": 35.062923431396484, "learning_rate": 1.4348326856751493e-06, "loss": 0.5392, "num_input_tokens_seen": 20735680, "step": 6580 }, { "epoch": 0.42154791626656424, "grad_norm": 27.509485244750977, "learning_rate": 1.433826211716976e-06, "loss": 0.3422, "num_input_tokens_seen": 20750144, "step": 6585 }, { "epoch": 0.4218679982075411, "grad_norm": 33.31727981567383, "learning_rate": 1.4328191961216835e-06, "loss": 0.3966, "num_input_tokens_seen": 20766016, "step": 6590 }, { "epoch": 0.422188080148518, "grad_norm": 54.75680923461914, "learning_rate": 1.4318116401465427e-06, "loss": 0.4812, "num_input_tokens_seen": 20782720, "step": 6595 }, { "epoch": 0.42250816208949493, "grad_norm": 29.398454666137695, "learning_rate": 1.430803545049499e-06, "loss": 0.388, "num_input_tokens_seen": 20798208, "step": 6600 }, { "epoch": 0.4228282440304718, "grad_norm": 16.673908233642578, "learning_rate": 1.4297949120891716e-06, "loss": 0.5652, "num_input_tokens_seen": 20813056, "step": 6605 }, { "epoch": 0.4231483259714487, "grad_norm": 35.20106506347656, "learning_rate": 1.4287857425248497e-06, "loss": 0.4121, "num_input_tokens_seen": 20828800, "step": 6610 }, { "epoch": 0.42346840791242557, "grad_norm": 23.11936378479004, "learning_rate": 1.427776037616494e-06, "loss": 0.4974, "num_input_tokens_seen": 20844736, "step": 6615 }, { "epoch": 0.42378848985340245, "grad_norm": 30.515439987182617, "learning_rate": 1.4267657986247326e-06, "loss": 0.3527, "num_input_tokens_seen": 20860672, "step": 6620 }, { "epoch": 0.4241085717943794, "grad_norm": 33.26582717895508, "learning_rate": 1.425755026810861e-06, "loss": 0.3746, "num_input_tokens_seen": 20877184, "step": 6625 }, { "epoch": 0.42442865373535626, "grad_norm": 55.12078857421875, "learning_rate": 1.4247437234368394e-06, "loss": 0.4095, "num_input_tokens_seen": 20894208, "step": 6630 }, { "epoch": 0.42474873567633314, "grad_norm": 36.948524475097656, "learning_rate": 1.423731889765292e-06, "loss": 0.4001, "num_input_tokens_seen": 20909696, "step": 6635 }, { "epoch": 0.42506881761731, "grad_norm": 15.861876487731934, "learning_rate": 1.422719527059505e-06, "loss": 0.3504, "num_input_tokens_seen": 20926016, "step": 6640 }, { "epoch": 0.4253888995582869, "grad_norm": 20.37615394592285, "learning_rate": 1.4217066365834253e-06, "loss": 0.3636, "num_input_tokens_seen": 20941440, "step": 6645 }, { "epoch": 0.42570898149926384, "grad_norm": 34.968894958496094, "learning_rate": 1.4206932196016586e-06, "loss": 0.4406, "num_input_tokens_seen": 20956352, "step": 6650 }, { "epoch": 0.4260290634402407, "grad_norm": 54.73747634887695, "learning_rate": 1.4196792773794672e-06, "loss": 0.3928, "num_input_tokens_seen": 20973056, "step": 6655 }, { "epoch": 0.4263491453812176, "grad_norm": 38.245426177978516, "learning_rate": 1.418664811182771e-06, "loss": 0.438, "num_input_tokens_seen": 20989248, "step": 6660 }, { "epoch": 0.4266692273221945, "grad_norm": 41.432498931884766, "learning_rate": 1.417649822278142e-06, "loss": 0.4836, "num_input_tokens_seen": 21004096, "step": 6665 }, { "epoch": 0.42698930926317136, "grad_norm": 24.442115783691406, "learning_rate": 1.4166343119328064e-06, "loss": 0.4722, "num_input_tokens_seen": 21020224, "step": 6670 }, { "epoch": 0.42730939120414824, "grad_norm": 30.54802703857422, "learning_rate": 1.4156182814146404e-06, "loss": 0.4616, "num_input_tokens_seen": 21035264, "step": 6675 }, { "epoch": 0.42762947314512517, "grad_norm": 19.643733978271484, "learning_rate": 1.4146017319921701e-06, "loss": 0.3497, "num_input_tokens_seen": 21051904, "step": 6680 }, { "epoch": 0.42794955508610205, "grad_norm": 31.077213287353516, "learning_rate": 1.4135846649345695e-06, "loss": 0.4215, "num_input_tokens_seen": 21069504, "step": 6685 }, { "epoch": 0.42826963702707893, "grad_norm": 30.736148834228516, "learning_rate": 1.4125670815116589e-06, "loss": 0.427, "num_input_tokens_seen": 21084288, "step": 6690 }, { "epoch": 0.4285897189680558, "grad_norm": 28.045896530151367, "learning_rate": 1.4115489829939025e-06, "loss": 0.2926, "num_input_tokens_seen": 21100544, "step": 6695 }, { "epoch": 0.4289098009090327, "grad_norm": 28.585994720458984, "learning_rate": 1.4105303706524093e-06, "loss": 0.4407, "num_input_tokens_seen": 21116608, "step": 6700 }, { "epoch": 0.4292298828500096, "grad_norm": 48.415164947509766, "learning_rate": 1.4095112457589276e-06, "loss": 0.5926, "num_input_tokens_seen": 21131776, "step": 6705 }, { "epoch": 0.4295499647909865, "grad_norm": 28.813779830932617, "learning_rate": 1.4084916095858477e-06, "loss": 0.3962, "num_input_tokens_seen": 21146368, "step": 6710 }, { "epoch": 0.4298700467319634, "grad_norm": 30.74667739868164, "learning_rate": 1.407471463406197e-06, "loss": 0.4951, "num_input_tokens_seen": 21162368, "step": 6715 }, { "epoch": 0.43019012867294026, "grad_norm": 28.847599029541016, "learning_rate": 1.4064508084936399e-06, "loss": 0.4329, "num_input_tokens_seen": 21179008, "step": 6720 }, { "epoch": 0.43051021061391714, "grad_norm": 30.80069351196289, "learning_rate": 1.405429646122476e-06, "loss": 0.5761, "num_input_tokens_seen": 21196160, "step": 6725 }, { "epoch": 0.4308302925548941, "grad_norm": 20.700214385986328, "learning_rate": 1.4044079775676392e-06, "loss": 0.5175, "num_input_tokens_seen": 21212032, "step": 6730 }, { "epoch": 0.43115037449587096, "grad_norm": 22.982175827026367, "learning_rate": 1.4033858041046936e-06, "loss": 0.3659, "num_input_tokens_seen": 21230272, "step": 6735 }, { "epoch": 0.43147045643684784, "grad_norm": 23.89682388305664, "learning_rate": 1.4023631270098352e-06, "loss": 0.3926, "num_input_tokens_seen": 21245760, "step": 6740 }, { "epoch": 0.4317905383778247, "grad_norm": 28.520267486572266, "learning_rate": 1.4013399475598888e-06, "loss": 0.3411, "num_input_tokens_seen": 21260992, "step": 6745 }, { "epoch": 0.4321106203188016, "grad_norm": 22.041383743286133, "learning_rate": 1.4003162670323056e-06, "loss": 0.2807, "num_input_tokens_seen": 21275136, "step": 6750 }, { "epoch": 0.4324307022597785, "grad_norm": 72.86239624023438, "learning_rate": 1.3992920867051627e-06, "loss": 0.5292, "num_input_tokens_seen": 21290560, "step": 6755 }, { "epoch": 0.4327507842007554, "grad_norm": 43.2622184753418, "learning_rate": 1.3982674078571614e-06, "loss": 0.3525, "num_input_tokens_seen": 21305536, "step": 6760 }, { "epoch": 0.4330708661417323, "grad_norm": 25.79481315612793, "learning_rate": 1.3972422317676252e-06, "loss": 0.3785, "num_input_tokens_seen": 21320576, "step": 6765 }, { "epoch": 0.43339094808270917, "grad_norm": 17.41854476928711, "learning_rate": 1.3962165597164985e-06, "loss": 0.367, "num_input_tokens_seen": 21335680, "step": 6770 }, { "epoch": 0.43371103002368605, "grad_norm": 30.709115982055664, "learning_rate": 1.395190392984345e-06, "loss": 0.3496, "num_input_tokens_seen": 21351808, "step": 6775 }, { "epoch": 0.43403111196466293, "grad_norm": 26.75821304321289, "learning_rate": 1.3941637328523452e-06, "loss": 0.4482, "num_input_tokens_seen": 21366464, "step": 6780 }, { "epoch": 0.43435119390563987, "grad_norm": 36.62665939331055, "learning_rate": 1.3931365806022978e-06, "loss": 0.3094, "num_input_tokens_seen": 21383296, "step": 6785 }, { "epoch": 0.43467127584661674, "grad_norm": 38.72547912597656, "learning_rate": 1.3921089375166131e-06, "loss": 0.3178, "num_input_tokens_seen": 21399616, "step": 6790 }, { "epoch": 0.4349913577875936, "grad_norm": 21.409557342529297, "learning_rate": 1.391080804878316e-06, "loss": 0.4475, "num_input_tokens_seen": 21414848, "step": 6795 }, { "epoch": 0.4353114397285705, "grad_norm": 60.08560562133789, "learning_rate": 1.3900521839710427e-06, "loss": 0.3748, "num_input_tokens_seen": 21430144, "step": 6800 }, { "epoch": 0.4356315216695474, "grad_norm": 23.838533401489258, "learning_rate": 1.3890230760790373e-06, "loss": 0.3516, "num_input_tokens_seen": 21445248, "step": 6805 }, { "epoch": 0.4359516036105243, "grad_norm": 95.79853820800781, "learning_rate": 1.3879934824871544e-06, "loss": 0.5972, "num_input_tokens_seen": 21460544, "step": 6810 }, { "epoch": 0.4362716855515012, "grad_norm": 28.7491512298584, "learning_rate": 1.3869634044808526e-06, "loss": 0.4871, "num_input_tokens_seen": 21476224, "step": 6815 }, { "epoch": 0.4365917674924781, "grad_norm": 38.301719665527344, "learning_rate": 1.3859328433461971e-06, "loss": 0.5996, "num_input_tokens_seen": 21491712, "step": 6820 }, { "epoch": 0.43691184943345496, "grad_norm": 67.00725555419922, "learning_rate": 1.3849018003698553e-06, "loss": 0.5784, "num_input_tokens_seen": 21508928, "step": 6825 }, { "epoch": 0.43723193137443184, "grad_norm": 36.97137451171875, "learning_rate": 1.3838702768390964e-06, "loss": 0.415, "num_input_tokens_seen": 21523648, "step": 6830 }, { "epoch": 0.43755201331540877, "grad_norm": 31.718050003051758, "learning_rate": 1.38283827404179e-06, "loss": 0.4777, "num_input_tokens_seen": 21539264, "step": 6835 }, { "epoch": 0.43787209525638565, "grad_norm": 50.313236236572266, "learning_rate": 1.381805793266403e-06, "loss": 0.3776, "num_input_tokens_seen": 21555520, "step": 6840 }, { "epoch": 0.43819217719736253, "grad_norm": 35.556846618652344, "learning_rate": 1.3807728358020009e-06, "loss": 0.4517, "num_input_tokens_seen": 21570112, "step": 6845 }, { "epoch": 0.4385122591383394, "grad_norm": 45.01139450073242, "learning_rate": 1.3797394029382416e-06, "loss": 0.3386, "num_input_tokens_seen": 21584768, "step": 6850 }, { "epoch": 0.4388323410793163, "grad_norm": 22.66309928894043, "learning_rate": 1.37870549596538e-06, "loss": 0.2963, "num_input_tokens_seen": 21599872, "step": 6855 }, { "epoch": 0.43915242302029317, "grad_norm": 19.721696853637695, "learning_rate": 1.3776711161742595e-06, "loss": 0.5262, "num_input_tokens_seen": 21615808, "step": 6860 }, { "epoch": 0.4394725049612701, "grad_norm": 27.445302963256836, "learning_rate": 1.3766362648563166e-06, "loss": 0.4639, "num_input_tokens_seen": 21630656, "step": 6865 }, { "epoch": 0.439792586902247, "grad_norm": 62.023433685302734, "learning_rate": 1.3756009433035744e-06, "loss": 0.4073, "num_input_tokens_seen": 21646976, "step": 6870 }, { "epoch": 0.44011266884322386, "grad_norm": 27.705705642700195, "learning_rate": 1.3745651528086447e-06, "loss": 0.5615, "num_input_tokens_seen": 21665024, "step": 6875 }, { "epoch": 0.44043275078420074, "grad_norm": 15.181832313537598, "learning_rate": 1.373528894664724e-06, "loss": 0.4486, "num_input_tokens_seen": 21680128, "step": 6880 }, { "epoch": 0.4407528327251776, "grad_norm": 23.56239128112793, "learning_rate": 1.3724921701655924e-06, "loss": 0.3509, "num_input_tokens_seen": 21695808, "step": 6885 }, { "epoch": 0.44107291466615456, "grad_norm": 15.783341407775879, "learning_rate": 1.3714549806056125e-06, "loss": 0.3155, "num_input_tokens_seen": 21711936, "step": 6890 }, { "epoch": 0.44139299660713144, "grad_norm": 45.81801986694336, "learning_rate": 1.3704173272797283e-06, "loss": 0.4241, "num_input_tokens_seen": 21727488, "step": 6895 }, { "epoch": 0.4417130785481083, "grad_norm": 39.76817321777344, "learning_rate": 1.3693792114834619e-06, "loss": 0.4386, "num_input_tokens_seen": 21745280, "step": 6900 }, { "epoch": 0.4420331604890852, "grad_norm": 26.435964584350586, "learning_rate": 1.3683406345129129e-06, "loss": 0.4684, "num_input_tokens_seen": 21760000, "step": 6905 }, { "epoch": 0.4423532424300621, "grad_norm": 25.047027587890625, "learning_rate": 1.3673015976647567e-06, "loss": 0.4025, "num_input_tokens_seen": 21775232, "step": 6910 }, { "epoch": 0.442673324371039, "grad_norm": 36.185760498046875, "learning_rate": 1.3662621022362435e-06, "loss": 0.3967, "num_input_tokens_seen": 21790656, "step": 6915 }, { "epoch": 0.4429934063120159, "grad_norm": 53.022464752197266, "learning_rate": 1.3652221495251952e-06, "loss": 0.4654, "num_input_tokens_seen": 21806336, "step": 6920 }, { "epoch": 0.44331348825299277, "grad_norm": 26.99211883544922, "learning_rate": 1.3641817408300049e-06, "loss": 0.3204, "num_input_tokens_seen": 21823744, "step": 6925 }, { "epoch": 0.44363357019396965, "grad_norm": 30.070894241333008, "learning_rate": 1.3631408774496352e-06, "loss": 0.5579, "num_input_tokens_seen": 21839104, "step": 6930 }, { "epoch": 0.44395365213494653, "grad_norm": 26.091249465942383, "learning_rate": 1.3620995606836165e-06, "loss": 0.3566, "num_input_tokens_seen": 21854528, "step": 6935 }, { "epoch": 0.4442737340759234, "grad_norm": 58.88991165161133, "learning_rate": 1.3610577918320446e-06, "loss": 0.6023, "num_input_tokens_seen": 21870592, "step": 6940 }, { "epoch": 0.44459381601690035, "grad_norm": 44.893310546875, "learning_rate": 1.3600155721955802e-06, "loss": 0.3743, "num_input_tokens_seen": 21885696, "step": 6945 }, { "epoch": 0.4449138979578772, "grad_norm": 24.15410614013672, "learning_rate": 1.3589729030754468e-06, "loss": 0.3819, "num_input_tokens_seen": 21901248, "step": 6950 }, { "epoch": 0.4452339798988541, "grad_norm": 28.12432861328125, "learning_rate": 1.3579297857734293e-06, "loss": 0.4341, "num_input_tokens_seen": 21916352, "step": 6955 }, { "epoch": 0.445554061839831, "grad_norm": 17.1772518157959, "learning_rate": 1.3568862215918717e-06, "loss": 0.3365, "num_input_tokens_seen": 21931072, "step": 6960 }, { "epoch": 0.44587414378080786, "grad_norm": 32.58141326904297, "learning_rate": 1.3558422118336762e-06, "loss": 0.4944, "num_input_tokens_seen": 21946752, "step": 6965 }, { "epoch": 0.4461942257217848, "grad_norm": 37.54017639160156, "learning_rate": 1.354797757802301e-06, "loss": 0.4804, "num_input_tokens_seen": 21962176, "step": 6970 }, { "epoch": 0.4465143076627617, "grad_norm": 17.05492401123047, "learning_rate": 1.3537528608017596e-06, "loss": 0.392, "num_input_tokens_seen": 21978496, "step": 6975 }, { "epoch": 0.44683438960373856, "grad_norm": 23.01466941833496, "learning_rate": 1.352707522136618e-06, "loss": 0.3973, "num_input_tokens_seen": 21992576, "step": 6980 }, { "epoch": 0.44715447154471544, "grad_norm": 17.1395206451416, "learning_rate": 1.3516617431119934e-06, "loss": 0.3998, "num_input_tokens_seen": 22008000, "step": 6985 }, { "epoch": 0.4474745534856923, "grad_norm": 32.3569450378418, "learning_rate": 1.350615525033554e-06, "loss": 0.53, "num_input_tokens_seen": 22022976, "step": 6990 }, { "epoch": 0.44779463542666925, "grad_norm": 25.158411026000977, "learning_rate": 1.3495688692075144e-06, "loss": 0.4027, "num_input_tokens_seen": 22038144, "step": 6995 }, { "epoch": 0.44811471736764613, "grad_norm": 31.82624053955078, "learning_rate": 1.3485217769406376e-06, "loss": 0.3435, "num_input_tokens_seen": 22054016, "step": 7000 }, { "epoch": 0.448434799308623, "grad_norm": 30.720848083496094, "learning_rate": 1.3474742495402303e-06, "loss": 0.3605, "num_input_tokens_seen": 22073920, "step": 7005 }, { "epoch": 0.4487548812495999, "grad_norm": 50.93308639526367, "learning_rate": 1.3464262883141425e-06, "loss": 0.4297, "num_input_tokens_seen": 22089728, "step": 7010 }, { "epoch": 0.44907496319057677, "grad_norm": 36.83964538574219, "learning_rate": 1.3453778945707663e-06, "loss": 0.5687, "num_input_tokens_seen": 22105344, "step": 7015 }, { "epoch": 0.4493950451315537, "grad_norm": 53.62667465209961, "learning_rate": 1.3443290696190332e-06, "loss": 0.4471, "num_input_tokens_seen": 22121792, "step": 7020 }, { "epoch": 0.4497151270725306, "grad_norm": 23.14280128479004, "learning_rate": 1.343279814768414e-06, "loss": 0.4034, "num_input_tokens_seen": 22136128, "step": 7025 }, { "epoch": 0.45003520901350746, "grad_norm": 22.742084503173828, "learning_rate": 1.3422301313289156e-06, "loss": 0.38, "num_input_tokens_seen": 22151936, "step": 7030 }, { "epoch": 0.45035529095448434, "grad_norm": 21.072940826416016, "learning_rate": 1.34118002061108e-06, "loss": 0.3794, "num_input_tokens_seen": 22168128, "step": 7035 }, { "epoch": 0.4505473401190705, "eval_loss": 0.43158382177352905, "eval_runtime": 49.1758, "eval_samples_per_second": 282.375, "eval_steps_per_second": 35.302, "num_input_tokens_seen": 22178432, "step": 7038 }, { "epoch": 0.4506753728954612, "grad_norm": 38.73175048828125, "learning_rate": 1.3401294839259828e-06, "loss": 0.4309, "num_input_tokens_seen": 22184512, "step": 7040 }, { "epoch": 0.4509954548364381, "grad_norm": 33.52423095703125, "learning_rate": 1.3390785225852312e-06, "loss": 0.54, "num_input_tokens_seen": 22199872, "step": 7045 }, { "epoch": 0.45131553677741504, "grad_norm": 19.460634231567383, "learning_rate": 1.3380271379009631e-06, "loss": 0.4411, "num_input_tokens_seen": 22216960, "step": 7050 }, { "epoch": 0.4516356187183919, "grad_norm": 19.555931091308594, "learning_rate": 1.3369753311858442e-06, "loss": 0.2615, "num_input_tokens_seen": 22231488, "step": 7055 }, { "epoch": 0.4519557006593688, "grad_norm": 28.813966751098633, "learning_rate": 1.3359231037530682e-06, "loss": 0.4584, "num_input_tokens_seen": 22246976, "step": 7060 }, { "epoch": 0.4522757826003457, "grad_norm": 16.085895538330078, "learning_rate": 1.3348704569163527e-06, "loss": 0.4139, "num_input_tokens_seen": 22263680, "step": 7065 }, { "epoch": 0.45259586454132256, "grad_norm": 18.5650691986084, "learning_rate": 1.33381739198994e-06, "loss": 0.3347, "num_input_tokens_seen": 22279552, "step": 7070 }, { "epoch": 0.4529159464822995, "grad_norm": 19.012405395507812, "learning_rate": 1.3327639102885938e-06, "loss": 0.4436, "num_input_tokens_seen": 22295296, "step": 7075 }, { "epoch": 0.45323602842327637, "grad_norm": 34.81302261352539, "learning_rate": 1.3317100131275986e-06, "loss": 0.3973, "num_input_tokens_seen": 22310400, "step": 7080 }, { "epoch": 0.45355611036425325, "grad_norm": 60.76240921020508, "learning_rate": 1.3306557018227576e-06, "loss": 0.492, "num_input_tokens_seen": 22326848, "step": 7085 }, { "epoch": 0.45387619230523013, "grad_norm": 30.761585235595703, "learning_rate": 1.3296009776903903e-06, "loss": 0.47, "num_input_tokens_seen": 22342592, "step": 7090 }, { "epoch": 0.454196274246207, "grad_norm": 29.366207122802734, "learning_rate": 1.3285458420473323e-06, "loss": 0.4386, "num_input_tokens_seen": 22358912, "step": 7095 }, { "epoch": 0.45451635618718395, "grad_norm": 30.328184127807617, "learning_rate": 1.3274902962109332e-06, "loss": 0.3744, "num_input_tokens_seen": 22374528, "step": 7100 }, { "epoch": 0.4548364381281608, "grad_norm": 17.943153381347656, "learning_rate": 1.3264343414990539e-06, "loss": 0.3686, "num_input_tokens_seen": 22389824, "step": 7105 }, { "epoch": 0.4551565200691377, "grad_norm": 35.101932525634766, "learning_rate": 1.3253779792300663e-06, "loss": 0.4148, "num_input_tokens_seen": 22405376, "step": 7110 }, { "epoch": 0.4554766020101146, "grad_norm": 14.828371047973633, "learning_rate": 1.3243212107228518e-06, "loss": 0.3551, "num_input_tokens_seen": 22420032, "step": 7115 }, { "epoch": 0.45579668395109146, "grad_norm": 15.68032169342041, "learning_rate": 1.3232640372967974e-06, "loss": 0.3909, "num_input_tokens_seen": 22434688, "step": 7120 }, { "epoch": 0.45611676589206834, "grad_norm": 51.65379333496094, "learning_rate": 1.3222064602717974e-06, "loss": 0.4645, "num_input_tokens_seen": 22451072, "step": 7125 }, { "epoch": 0.4564368478330453, "grad_norm": 30.610668182373047, "learning_rate": 1.321148480968248e-06, "loss": 0.3488, "num_input_tokens_seen": 22466688, "step": 7130 }, { "epoch": 0.45675692977402216, "grad_norm": 38.32967758178711, "learning_rate": 1.3200901007070495e-06, "loss": 0.4609, "num_input_tokens_seen": 22482432, "step": 7135 }, { "epoch": 0.45707701171499904, "grad_norm": 42.44841003417969, "learning_rate": 1.3190313208096022e-06, "loss": 0.4616, "num_input_tokens_seen": 22496960, "step": 7140 }, { "epoch": 0.4573970936559759, "grad_norm": 62.05764389038086, "learning_rate": 1.3179721425978048e-06, "loss": 0.3617, "num_input_tokens_seen": 22512256, "step": 7145 }, { "epoch": 0.4577171755969528, "grad_norm": 27.489582061767578, "learning_rate": 1.3169125673940541e-06, "loss": 0.4002, "num_input_tokens_seen": 22528192, "step": 7150 }, { "epoch": 0.45803725753792973, "grad_norm": 23.193330764770508, "learning_rate": 1.3158525965212422e-06, "loss": 0.4126, "num_input_tokens_seen": 22545408, "step": 7155 }, { "epoch": 0.4583573394789066, "grad_norm": 44.60530090332031, "learning_rate": 1.3147922313027548e-06, "loss": 0.5063, "num_input_tokens_seen": 22560832, "step": 7160 }, { "epoch": 0.4586774214198835, "grad_norm": 34.29766845703125, "learning_rate": 1.3137314730624707e-06, "loss": 0.3456, "num_input_tokens_seen": 22577728, "step": 7165 }, { "epoch": 0.45899750336086037, "grad_norm": 59.20881652832031, "learning_rate": 1.3126703231247588e-06, "loss": 0.4722, "num_input_tokens_seen": 22594112, "step": 7170 }, { "epoch": 0.45931758530183725, "grad_norm": 57.1280632019043, "learning_rate": 1.3116087828144772e-06, "loss": 0.3917, "num_input_tokens_seen": 22609728, "step": 7175 }, { "epoch": 0.4596376672428142, "grad_norm": 24.825468063354492, "learning_rate": 1.310546853456972e-06, "loss": 0.4692, "num_input_tokens_seen": 22624704, "step": 7180 }, { "epoch": 0.45995774918379106, "grad_norm": 27.96169662475586, "learning_rate": 1.3094845363780737e-06, "loss": 0.3145, "num_input_tokens_seen": 22640448, "step": 7185 }, { "epoch": 0.46027783112476794, "grad_norm": 26.550325393676758, "learning_rate": 1.3084218329040976e-06, "loss": 0.2277, "num_input_tokens_seen": 22655680, "step": 7190 }, { "epoch": 0.4605979130657448, "grad_norm": 17.48622703552246, "learning_rate": 1.3073587443618425e-06, "loss": 0.3769, "num_input_tokens_seen": 22672128, "step": 7195 }, { "epoch": 0.4609179950067217, "grad_norm": 60.23152542114258, "learning_rate": 1.3062952720785861e-06, "loss": 0.5418, "num_input_tokens_seen": 22687104, "step": 7200 }, { "epoch": 0.4612380769476986, "grad_norm": 48.24466323852539, "learning_rate": 1.305231417382086e-06, "loss": 0.3724, "num_input_tokens_seen": 22702976, "step": 7205 }, { "epoch": 0.4615581588886755, "grad_norm": 34.0355224609375, "learning_rate": 1.3041671816005777e-06, "loss": 0.3522, "num_input_tokens_seen": 22718464, "step": 7210 }, { "epoch": 0.4618782408296524, "grad_norm": 30.36563491821289, "learning_rate": 1.3031025660627718e-06, "loss": 0.3783, "num_input_tokens_seen": 22734656, "step": 7215 }, { "epoch": 0.4621983227706293, "grad_norm": 38.3671989440918, "learning_rate": 1.3020375720978534e-06, "loss": 0.4376, "num_input_tokens_seen": 22750016, "step": 7220 }, { "epoch": 0.46251840471160616, "grad_norm": 32.97966003417969, "learning_rate": 1.3009722010354799e-06, "loss": 0.3855, "num_input_tokens_seen": 22765632, "step": 7225 }, { "epoch": 0.46283848665258304, "grad_norm": 39.90695571899414, "learning_rate": 1.2999064542057794e-06, "loss": 0.4528, "num_input_tokens_seen": 22781184, "step": 7230 }, { "epoch": 0.46315856859355997, "grad_norm": 31.27988624572754, "learning_rate": 1.2988403329393495e-06, "loss": 0.4842, "num_input_tokens_seen": 22797248, "step": 7235 }, { "epoch": 0.46347865053453685, "grad_norm": 29.927885055541992, "learning_rate": 1.2977738385672557e-06, "loss": 0.4177, "num_input_tokens_seen": 22812800, "step": 7240 }, { "epoch": 0.46379873247551373, "grad_norm": 21.404644012451172, "learning_rate": 1.2967069724210278e-06, "loss": 0.4087, "num_input_tokens_seen": 22827200, "step": 7245 }, { "epoch": 0.4641188144164906, "grad_norm": 31.973535537719727, "learning_rate": 1.2956397358326609e-06, "loss": 0.5265, "num_input_tokens_seen": 22843264, "step": 7250 }, { "epoch": 0.4644388963574675, "grad_norm": 39.217674255371094, "learning_rate": 1.294572130134613e-06, "loss": 0.3799, "num_input_tokens_seen": 22858624, "step": 7255 }, { "epoch": 0.4647589782984444, "grad_norm": 36.54713821411133, "learning_rate": 1.2935041566598016e-06, "loss": 0.5557, "num_input_tokens_seen": 22873856, "step": 7260 }, { "epoch": 0.4650790602394213, "grad_norm": 32.417545318603516, "learning_rate": 1.2924358167416049e-06, "loss": 0.356, "num_input_tokens_seen": 22889600, "step": 7265 }, { "epoch": 0.4653991421803982, "grad_norm": 24.408979415893555, "learning_rate": 1.2913671117138572e-06, "loss": 0.4007, "num_input_tokens_seen": 22904704, "step": 7270 }, { "epoch": 0.46571922412137506, "grad_norm": 22.64531898498535, "learning_rate": 1.29029804291085e-06, "loss": 0.3471, "num_input_tokens_seen": 22920384, "step": 7275 }, { "epoch": 0.46603930606235194, "grad_norm": 44.77216339111328, "learning_rate": 1.2892286116673269e-06, "loss": 0.3475, "num_input_tokens_seen": 22937024, "step": 7280 }, { "epoch": 0.4663593880033289, "grad_norm": 26.58623695373535, "learning_rate": 1.2881588193184865e-06, "loss": 0.4934, "num_input_tokens_seen": 22954816, "step": 7285 }, { "epoch": 0.46667946994430576, "grad_norm": 22.52194595336914, "learning_rate": 1.287088667199977e-06, "loss": 0.2918, "num_input_tokens_seen": 22969472, "step": 7290 }, { "epoch": 0.46699955188528264, "grad_norm": 22.330564498901367, "learning_rate": 1.2860181566478956e-06, "loss": 0.4681, "num_input_tokens_seen": 22984192, "step": 7295 }, { "epoch": 0.4673196338262595, "grad_norm": 13.149898529052734, "learning_rate": 1.2849472889987874e-06, "loss": 0.3868, "num_input_tokens_seen": 22999680, "step": 7300 }, { "epoch": 0.4676397157672364, "grad_norm": 27.509746551513672, "learning_rate": 1.2838760655896431e-06, "loss": 0.3784, "num_input_tokens_seen": 23014720, "step": 7305 }, { "epoch": 0.4679597977082133, "grad_norm": 35.98652648925781, "learning_rate": 1.2828044877578983e-06, "loss": 0.4544, "num_input_tokens_seen": 23030528, "step": 7310 }, { "epoch": 0.4682798796491902, "grad_norm": 26.335607528686523, "learning_rate": 1.2817325568414297e-06, "loss": 0.5205, "num_input_tokens_seen": 23046784, "step": 7315 }, { "epoch": 0.4685999615901671, "grad_norm": 26.756956100463867, "learning_rate": 1.2806602741785562e-06, "loss": 0.3379, "num_input_tokens_seen": 23061632, "step": 7320 }, { "epoch": 0.46892004353114397, "grad_norm": 17.465469360351562, "learning_rate": 1.2795876411080346e-06, "loss": 0.3202, "num_input_tokens_seen": 23077888, "step": 7325 }, { "epoch": 0.46924012547212085, "grad_norm": 24.94025993347168, "learning_rate": 1.278514658969061e-06, "loss": 0.3308, "num_input_tokens_seen": 23093568, "step": 7330 }, { "epoch": 0.46956020741309773, "grad_norm": 29.178998947143555, "learning_rate": 1.2774413291012648e-06, "loss": 0.5047, "num_input_tokens_seen": 23108992, "step": 7335 }, { "epoch": 0.46988028935407467, "grad_norm": 25.278213500976562, "learning_rate": 1.2763676528447122e-06, "loss": 0.4191, "num_input_tokens_seen": 23124992, "step": 7340 }, { "epoch": 0.47020037129505154, "grad_norm": 31.44306755065918, "learning_rate": 1.2752936315399003e-06, "loss": 0.3417, "num_input_tokens_seen": 23141888, "step": 7345 }, { "epoch": 0.4705204532360284, "grad_norm": 27.29042625427246, "learning_rate": 1.2742192665277566e-06, "loss": 0.3346, "num_input_tokens_seen": 23157888, "step": 7350 }, { "epoch": 0.4708405351770053, "grad_norm": 25.130107879638672, "learning_rate": 1.2731445591496393e-06, "loss": 0.2813, "num_input_tokens_seen": 23172864, "step": 7355 }, { "epoch": 0.4711606171179822, "grad_norm": 45.540672302246094, "learning_rate": 1.2720695107473325e-06, "loss": 0.4622, "num_input_tokens_seen": 23188352, "step": 7360 }, { "epoch": 0.4714806990589591, "grad_norm": 38.563602447509766, "learning_rate": 1.2709941226630475e-06, "loss": 0.3897, "num_input_tokens_seen": 23204096, "step": 7365 }, { "epoch": 0.471800780999936, "grad_norm": 27.982297897338867, "learning_rate": 1.2699183962394182e-06, "loss": 0.3513, "num_input_tokens_seen": 23219072, "step": 7370 }, { "epoch": 0.4721208629409129, "grad_norm": 15.643006324768066, "learning_rate": 1.2688423328195021e-06, "loss": 0.4198, "num_input_tokens_seen": 23234560, "step": 7375 }, { "epoch": 0.47244094488188976, "grad_norm": 62.19183349609375, "learning_rate": 1.267765933746777e-06, "loss": 0.3426, "num_input_tokens_seen": 23250304, "step": 7380 }, { "epoch": 0.47276102682286664, "grad_norm": 51.6485710144043, "learning_rate": 1.2666892003651397e-06, "loss": 0.6245, "num_input_tokens_seen": 23265664, "step": 7385 }, { "epoch": 0.4730811087638435, "grad_norm": 28.73395538330078, "learning_rate": 1.2656121340189043e-06, "loss": 0.442, "num_input_tokens_seen": 23281472, "step": 7390 }, { "epoch": 0.47340119070482045, "grad_norm": 28.408031463623047, "learning_rate": 1.264534736052801e-06, "loss": 0.411, "num_input_tokens_seen": 23297024, "step": 7395 }, { "epoch": 0.47372127264579733, "grad_norm": 41.88270950317383, "learning_rate": 1.2634570078119739e-06, "loss": 0.4385, "num_input_tokens_seen": 23313344, "step": 7400 }, { "epoch": 0.4740413545867742, "grad_norm": 27.301424026489258, "learning_rate": 1.262378950641979e-06, "loss": 0.5213, "num_input_tokens_seen": 23328512, "step": 7405 }, { "epoch": 0.4743614365277511, "grad_norm": 23.59923553466797, "learning_rate": 1.2613005658887836e-06, "loss": 0.4465, "num_input_tokens_seen": 23342400, "step": 7410 }, { "epoch": 0.47468151846872797, "grad_norm": 34.58885192871094, "learning_rate": 1.2602218548987637e-06, "loss": 0.4134, "num_input_tokens_seen": 23358400, "step": 7415 }, { "epoch": 0.4750016004097049, "grad_norm": 32.09384536743164, "learning_rate": 1.2591428190187029e-06, "loss": 0.4102, "num_input_tokens_seen": 23373376, "step": 7420 }, { "epoch": 0.4753216823506818, "grad_norm": 57.16767501831055, "learning_rate": 1.2580634595957898e-06, "loss": 0.5013, "num_input_tokens_seen": 23390400, "step": 7425 }, { "epoch": 0.47564176429165866, "grad_norm": 27.278974533081055, "learning_rate": 1.2569837779776172e-06, "loss": 0.3705, "num_input_tokens_seen": 23406400, "step": 7430 }, { "epoch": 0.47596184623263554, "grad_norm": 27.228130340576172, "learning_rate": 1.2559037755121804e-06, "loss": 0.3131, "num_input_tokens_seen": 23421824, "step": 7435 }, { "epoch": 0.4762819281736124, "grad_norm": 51.93519592285156, "learning_rate": 1.2548234535478754e-06, "loss": 0.4512, "num_input_tokens_seen": 23438272, "step": 7440 }, { "epoch": 0.47660201011458936, "grad_norm": 17.943632125854492, "learning_rate": 1.2537428134334968e-06, "loss": 0.4216, "num_input_tokens_seen": 23454976, "step": 7445 }, { "epoch": 0.47692209205556624, "grad_norm": 98.46037292480469, "learning_rate": 1.252661856518236e-06, "loss": 0.5189, "num_input_tokens_seen": 23471168, "step": 7450 }, { "epoch": 0.4772421739965431, "grad_norm": 28.342315673828125, "learning_rate": 1.251580584151681e-06, "loss": 0.3564, "num_input_tokens_seen": 23486720, "step": 7455 }, { "epoch": 0.47756225593752, "grad_norm": 21.639692306518555, "learning_rate": 1.2504989976838129e-06, "loss": 0.3059, "num_input_tokens_seen": 23502912, "step": 7460 }, { "epoch": 0.4778823378784969, "grad_norm": 26.391496658325195, "learning_rate": 1.2494170984650048e-06, "loss": 0.3667, "num_input_tokens_seen": 23519552, "step": 7465 }, { "epoch": 0.4782024198194738, "grad_norm": 31.00334930419922, "learning_rate": 1.248334887846021e-06, "loss": 0.4019, "num_input_tokens_seen": 23535936, "step": 7470 }, { "epoch": 0.4785225017604507, "grad_norm": 29.97296142578125, "learning_rate": 1.2472523671780135e-06, "loss": 0.4373, "num_input_tokens_seen": 23551040, "step": 7475 }, { "epoch": 0.47884258370142757, "grad_norm": 35.39260482788086, "learning_rate": 1.2461695378125233e-06, "loss": 0.3115, "num_input_tokens_seen": 23566208, "step": 7480 }, { "epoch": 0.47916266564240445, "grad_norm": 20.799793243408203, "learning_rate": 1.245086401101474e-06, "loss": 0.4197, "num_input_tokens_seen": 23581696, "step": 7485 }, { "epoch": 0.47948274758338133, "grad_norm": 69.36449432373047, "learning_rate": 1.2440029583971757e-06, "loss": 0.4454, "num_input_tokens_seen": 23597248, "step": 7490 }, { "epoch": 0.4798028295243582, "grad_norm": 16.190322875976562, "learning_rate": 1.2429192110523188e-06, "loss": 0.4913, "num_input_tokens_seen": 23612800, "step": 7495 }, { "epoch": 0.48012291146533514, "grad_norm": 28.28662109375, "learning_rate": 1.2418351604199746e-06, "loss": 0.3338, "num_input_tokens_seen": 23629056, "step": 7500 }, { "epoch": 0.480442993406312, "grad_norm": 39.906612396240234, "learning_rate": 1.2407508078535934e-06, "loss": 0.4447, "num_input_tokens_seen": 23644352, "step": 7505 }, { "epoch": 0.4807630753472889, "grad_norm": 25.87689208984375, "learning_rate": 1.2396661547070017e-06, "loss": 0.2785, "num_input_tokens_seen": 23661120, "step": 7510 }, { "epoch": 0.4810831572882658, "grad_norm": 18.180044174194336, "learning_rate": 1.238581202334402e-06, "loss": 0.3347, "num_input_tokens_seen": 23677632, "step": 7515 }, { "epoch": 0.48140323922924266, "grad_norm": 26.29235076904297, "learning_rate": 1.2374959520903699e-06, "loss": 0.3673, "num_input_tokens_seen": 23693952, "step": 7520 }, { "epoch": 0.4817233211702196, "grad_norm": 17.1253662109375, "learning_rate": 1.2364104053298531e-06, "loss": 0.3341, "num_input_tokens_seen": 23708736, "step": 7525 }, { "epoch": 0.4820434031111965, "grad_norm": 30.4875431060791, "learning_rate": 1.2353245634081692e-06, "loss": 0.3913, "num_input_tokens_seen": 23724864, "step": 7530 }, { "epoch": 0.48236348505217336, "grad_norm": 23.729246139526367, "learning_rate": 1.2342384276810053e-06, "loss": 0.4148, "num_input_tokens_seen": 23740160, "step": 7535 }, { "epoch": 0.48268356699315024, "grad_norm": 70.08629608154297, "learning_rate": 1.233151999504414e-06, "loss": 0.423, "num_input_tokens_seen": 23755264, "step": 7540 }, { "epoch": 0.4830036489341271, "grad_norm": 46.91286849975586, "learning_rate": 1.232065280234814e-06, "loss": 0.3317, "num_input_tokens_seen": 23770112, "step": 7545 }, { "epoch": 0.48332373087510405, "grad_norm": 24.17731285095215, "learning_rate": 1.2309782712289867e-06, "loss": 0.4189, "num_input_tokens_seen": 23785536, "step": 7550 }, { "epoch": 0.48364381281608093, "grad_norm": 50.58120346069336, "learning_rate": 1.2298909738440758e-06, "loss": 0.4307, "num_input_tokens_seen": 23801280, "step": 7555 }, { "epoch": 0.4839638947570578, "grad_norm": 39.50659942626953, "learning_rate": 1.2288033894375847e-06, "loss": 0.371, "num_input_tokens_seen": 23816448, "step": 7560 }, { "epoch": 0.4842839766980347, "grad_norm": 31.22879409790039, "learning_rate": 1.2277155193673755e-06, "loss": 0.5539, "num_input_tokens_seen": 23832512, "step": 7565 }, { "epoch": 0.48460405863901157, "grad_norm": 14.704495429992676, "learning_rate": 1.2266273649916668e-06, "loss": 0.3968, "num_input_tokens_seen": 23848192, "step": 7570 }, { "epoch": 0.48492414057998845, "grad_norm": 18.676654815673828, "learning_rate": 1.2255389276690318e-06, "loss": 0.4249, "num_input_tokens_seen": 23863808, "step": 7575 }, { "epoch": 0.4852442225209654, "grad_norm": 32.08503341674805, "learning_rate": 1.2244502087583978e-06, "loss": 0.2927, "num_input_tokens_seen": 23880960, "step": 7580 }, { "epoch": 0.48556430446194226, "grad_norm": 46.882720947265625, "learning_rate": 1.2233612096190426e-06, "loss": 0.3969, "num_input_tokens_seen": 23896256, "step": 7585 }, { "epoch": 0.48588438640291914, "grad_norm": 36.5152473449707, "learning_rate": 1.222271931610595e-06, "loss": 0.5189, "num_input_tokens_seen": 23912832, "step": 7590 }, { "epoch": 0.486204468343896, "grad_norm": 26.63950538635254, "learning_rate": 1.2211823760930306e-06, "loss": 0.4929, "num_input_tokens_seen": 23928768, "step": 7595 }, { "epoch": 0.4865245502848729, "grad_norm": 18.74747657775879, "learning_rate": 1.2200925444266726e-06, "loss": 0.4206, "num_input_tokens_seen": 23945088, "step": 7600 }, { "epoch": 0.48684463222584984, "grad_norm": 39.23282241821289, "learning_rate": 1.219002437972189e-06, "loss": 0.5087, "num_input_tokens_seen": 23960192, "step": 7605 }, { "epoch": 0.4871647141668267, "grad_norm": 31.527008056640625, "learning_rate": 1.21791205809059e-06, "loss": 0.4208, "num_input_tokens_seen": 23977152, "step": 7610 }, { "epoch": 0.4874847961078036, "grad_norm": 30.472713470458984, "learning_rate": 1.2168214061432283e-06, "loss": 0.3611, "num_input_tokens_seen": 23992448, "step": 7615 }, { "epoch": 0.4878048780487805, "grad_norm": 24.9169864654541, "learning_rate": 1.2157304834917947e-06, "loss": 0.4276, "num_input_tokens_seen": 24008384, "step": 7620 }, { "epoch": 0.48812495998975736, "grad_norm": 28.272476196289062, "learning_rate": 1.2146392914983202e-06, "loss": 0.6241, "num_input_tokens_seen": 24025728, "step": 7625 }, { "epoch": 0.4884450419307343, "grad_norm": 44.216453552246094, "learning_rate": 1.2135478315251694e-06, "loss": 0.5169, "num_input_tokens_seen": 24040448, "step": 7630 }, { "epoch": 0.48876512387171117, "grad_norm": 26.274669647216797, "learning_rate": 1.2124561049350442e-06, "loss": 0.3428, "num_input_tokens_seen": 24055168, "step": 7635 }, { "epoch": 0.48908520581268805, "grad_norm": 41.2357292175293, "learning_rate": 1.2113641130909772e-06, "loss": 0.453, "num_input_tokens_seen": 24070016, "step": 7640 }, { "epoch": 0.48940528775366493, "grad_norm": 58.80428695678711, "learning_rate": 1.2102718573563334e-06, "loss": 0.3108, "num_input_tokens_seen": 24084800, "step": 7645 }, { "epoch": 0.4897253696946418, "grad_norm": 53.14729309082031, "learning_rate": 1.2091793390948066e-06, "loss": 0.4842, "num_input_tokens_seen": 24100416, "step": 7650 }, { "epoch": 0.49004545163561875, "grad_norm": 17.676326751708984, "learning_rate": 1.2080865596704191e-06, "loss": 0.2906, "num_input_tokens_seen": 24117120, "step": 7655 }, { "epoch": 0.4903655335765956, "grad_norm": 30.914222717285156, "learning_rate": 1.2069935204475187e-06, "loss": 0.4391, "num_input_tokens_seen": 24132224, "step": 7660 }, { "epoch": 0.4906856155175725, "grad_norm": 23.044315338134766, "learning_rate": 1.2059002227907776e-06, "loss": 0.3992, "num_input_tokens_seen": 24147712, "step": 7665 }, { "epoch": 0.4910056974585494, "grad_norm": 37.006168365478516, "learning_rate": 1.2048066680651908e-06, "loss": 0.4121, "num_input_tokens_seen": 24164288, "step": 7670 }, { "epoch": 0.49132577939952626, "grad_norm": 37.811988830566406, "learning_rate": 1.2037128576360743e-06, "loss": 0.5577, "num_input_tokens_seen": 24193728, "step": 7675 }, { "epoch": 0.49164586134050314, "grad_norm": 36.05268478393555, "learning_rate": 1.2026187928690627e-06, "loss": 0.4148, "num_input_tokens_seen": 24208832, "step": 7680 }, { "epoch": 0.4919659432814801, "grad_norm": 34.80404281616211, "learning_rate": 1.2015244751301098e-06, "loss": 0.5085, "num_input_tokens_seen": 24223424, "step": 7685 }, { "epoch": 0.49228602522245696, "grad_norm": 47.47758865356445, "learning_rate": 1.2004299057854832e-06, "loss": 0.43, "num_input_tokens_seen": 24238976, "step": 7690 }, { "epoch": 0.49260610716343384, "grad_norm": 22.682682037353516, "learning_rate": 1.1993350862017661e-06, "loss": 0.3893, "num_input_tokens_seen": 24253632, "step": 7695 }, { "epoch": 0.4929261891044107, "grad_norm": 35.10201644897461, "learning_rate": 1.1982400177458534e-06, "loss": 0.3968, "num_input_tokens_seen": 24270720, "step": 7700 }, { "epoch": 0.4932462710453876, "grad_norm": 34.98603820800781, "learning_rate": 1.197144701784951e-06, "loss": 0.4284, "num_input_tokens_seen": 24285312, "step": 7705 }, { "epoch": 0.49356635298636453, "grad_norm": 32.93339157104492, "learning_rate": 1.1960491396865735e-06, "loss": 0.3926, "num_input_tokens_seen": 24300352, "step": 7710 }, { "epoch": 0.4938864349273414, "grad_norm": 27.799358367919922, "learning_rate": 1.1949533328185435e-06, "loss": 0.3458, "num_input_tokens_seen": 24317056, "step": 7715 }, { "epoch": 0.4942065168683183, "grad_norm": 25.46038818359375, "learning_rate": 1.1938572825489883e-06, "loss": 0.3741, "num_input_tokens_seen": 24333184, "step": 7720 }, { "epoch": 0.49452659880929517, "grad_norm": 29.320058822631836, "learning_rate": 1.1927609902463394e-06, "loss": 0.409, "num_input_tokens_seen": 24348672, "step": 7725 }, { "epoch": 0.49484668075027205, "grad_norm": 44.419612884521484, "learning_rate": 1.1916644572793314e-06, "loss": 0.4346, "num_input_tokens_seen": 24363648, "step": 7730 }, { "epoch": 0.495166762691249, "grad_norm": 74.09778594970703, "learning_rate": 1.190567685016998e-06, "loss": 0.4964, "num_input_tokens_seen": 24380992, "step": 7735 }, { "epoch": 0.49548684463222586, "grad_norm": 27.674976348876953, "learning_rate": 1.189470674828672e-06, "loss": 0.4107, "num_input_tokens_seen": 24395776, "step": 7740 }, { "epoch": 0.49580692657320274, "grad_norm": 25.768115997314453, "learning_rate": 1.188373428083984e-06, "loss": 0.3878, "num_input_tokens_seen": 24411584, "step": 7745 }, { "epoch": 0.4961270085141796, "grad_norm": 44.345550537109375, "learning_rate": 1.1872759461528596e-06, "loss": 0.5219, "num_input_tokens_seen": 24426560, "step": 7750 }, { "epoch": 0.4964470904551565, "grad_norm": 13.35042667388916, "learning_rate": 1.1861782304055174e-06, "loss": 0.39, "num_input_tokens_seen": 24441856, "step": 7755 }, { "epoch": 0.4967671723961334, "grad_norm": 18.407421112060547, "learning_rate": 1.1850802822124686e-06, "loss": 0.3345, "num_input_tokens_seen": 24457472, "step": 7760 }, { "epoch": 0.4970872543371103, "grad_norm": 57.33185577392578, "learning_rate": 1.1839821029445143e-06, "loss": 0.5005, "num_input_tokens_seen": 24471936, "step": 7765 }, { "epoch": 0.4974073362780872, "grad_norm": 35.684871673583984, "learning_rate": 1.1828836939727442e-06, "loss": 0.3195, "num_input_tokens_seen": 24487616, "step": 7770 }, { "epoch": 0.4977274182190641, "grad_norm": 39.44476318359375, "learning_rate": 1.181785056668535e-06, "loss": 0.433, "num_input_tokens_seen": 24503936, "step": 7775 }, { "epoch": 0.49804750016004096, "grad_norm": 31.5116024017334, "learning_rate": 1.180686192403548e-06, "loss": 0.4212, "num_input_tokens_seen": 24518464, "step": 7780 }, { "epoch": 0.49836758210101784, "grad_norm": 69.69412231445312, "learning_rate": 1.1795871025497285e-06, "loss": 0.3439, "num_input_tokens_seen": 24533184, "step": 7785 }, { "epoch": 0.49868766404199477, "grad_norm": 33.76158905029297, "learning_rate": 1.1784877884793029e-06, "loss": 0.4122, "num_input_tokens_seen": 24548992, "step": 7790 }, { "epoch": 0.49900774598297165, "grad_norm": 32.13736343383789, "learning_rate": 1.1773882515647776e-06, "loss": 0.3627, "num_input_tokens_seen": 24566592, "step": 7795 }, { "epoch": 0.49932782792394853, "grad_norm": 26.241132736206055, "learning_rate": 1.1762884931789376e-06, "loss": 0.4811, "num_input_tokens_seen": 24583552, "step": 7800 }, { "epoch": 0.4996479098649254, "grad_norm": 15.578927040100098, "learning_rate": 1.1751885146948436e-06, "loss": 0.4548, "num_input_tokens_seen": 24599552, "step": 7805 }, { "epoch": 0.4999679918059023, "grad_norm": 34.21600341796875, "learning_rate": 1.1740883174858327e-06, "loss": 0.3633, "num_input_tokens_seen": 24614912, "step": 7810 }, { "epoch": 0.5002880737468792, "grad_norm": 33.92721939086914, "learning_rate": 1.1729879029255127e-06, "loss": 0.3649, "num_input_tokens_seen": 24629696, "step": 7815 }, { "epoch": 0.5006081556878561, "grad_norm": 32.14542007446289, "learning_rate": 1.171887272387765e-06, "loss": 0.3939, "num_input_tokens_seen": 24646208, "step": 7820 }, { "epoch": 0.5006081556878561, "eval_loss": 0.4134162962436676, "eval_runtime": 49.1457, "eval_samples_per_second": 282.548, "eval_steps_per_second": 35.324, "num_input_tokens_seen": 24646208, "step": 7820 }, { "epoch": 0.500928237628833, "grad_norm": 79.38529205322266, "learning_rate": 1.1707864272467397e-06, "loss": 0.4985, "num_input_tokens_seen": 24661120, "step": 7825 }, { "epoch": 0.5012483195698099, "grad_norm": 39.66872024536133, "learning_rate": 1.169685368876855e-06, "loss": 0.423, "num_input_tokens_seen": 24678336, "step": 7830 }, { "epoch": 0.5015684015107867, "grad_norm": 61.929866790771484, "learning_rate": 1.1685840986527946e-06, "loss": 0.5534, "num_input_tokens_seen": 24694336, "step": 7835 }, { "epoch": 0.5018884834517636, "grad_norm": 36.273685455322266, "learning_rate": 1.1674826179495076e-06, "loss": 0.4044, "num_input_tokens_seen": 24708608, "step": 7840 }, { "epoch": 0.5022085653927405, "grad_norm": 33.48814010620117, "learning_rate": 1.1663809281422056e-06, "loss": 0.415, "num_input_tokens_seen": 24724672, "step": 7845 }, { "epoch": 0.5025286473337174, "grad_norm": 42.979496002197266, "learning_rate": 1.1652790306063615e-06, "loss": 0.4562, "num_input_tokens_seen": 24740608, "step": 7850 }, { "epoch": 0.5028487292746944, "grad_norm": 37.959041595458984, "learning_rate": 1.164176926717707e-06, "loss": 0.416, "num_input_tokens_seen": 24758528, "step": 7855 }, { "epoch": 0.5031688112156713, "grad_norm": 23.2774658203125, "learning_rate": 1.1630746178522315e-06, "loss": 0.3702, "num_input_tokens_seen": 24772992, "step": 7860 }, { "epoch": 0.5034888931566481, "grad_norm": 27.682905197143555, "learning_rate": 1.1619721053861816e-06, "loss": 0.4398, "num_input_tokens_seen": 24788160, "step": 7865 }, { "epoch": 0.503808975097625, "grad_norm": 19.770153045654297, "learning_rate": 1.1608693906960558e-06, "loss": 0.4093, "num_input_tokens_seen": 24804224, "step": 7870 }, { "epoch": 0.5041290570386019, "grad_norm": 30.391685485839844, "learning_rate": 1.1597664751586069e-06, "loss": 0.4426, "num_input_tokens_seen": 24820928, "step": 7875 }, { "epoch": 0.5044491389795788, "grad_norm": 49.482810974121094, "learning_rate": 1.1586633601508382e-06, "loss": 0.3837, "num_input_tokens_seen": 24835776, "step": 7880 }, { "epoch": 0.5047692209205557, "grad_norm": 46.44161605834961, "learning_rate": 1.1575600470500014e-06, "loss": 0.3858, "num_input_tokens_seen": 24851648, "step": 7885 }, { "epoch": 0.5050893028615325, "grad_norm": 59.1083869934082, "learning_rate": 1.1564565372335957e-06, "loss": 0.42, "num_input_tokens_seen": 24866880, "step": 7890 }, { "epoch": 0.5054093848025094, "grad_norm": 41.57418441772461, "learning_rate": 1.1553528320793663e-06, "loss": 0.3162, "num_input_tokens_seen": 24881856, "step": 7895 }, { "epoch": 0.5057294667434863, "grad_norm": 23.643510818481445, "learning_rate": 1.1542489329653022e-06, "loss": 0.4364, "num_input_tokens_seen": 24898560, "step": 7900 }, { "epoch": 0.5060495486844632, "grad_norm": 25.241592407226562, "learning_rate": 1.1531448412696343e-06, "loss": 0.3754, "num_input_tokens_seen": 24913216, "step": 7905 }, { "epoch": 0.5063696306254402, "grad_norm": 21.214923858642578, "learning_rate": 1.1520405583708337e-06, "loss": 0.4913, "num_input_tokens_seen": 24928832, "step": 7910 }, { "epoch": 0.506689712566417, "grad_norm": 33.57106018066406, "learning_rate": 1.1509360856476109e-06, "loss": 0.4917, "num_input_tokens_seen": 24944512, "step": 7915 }, { "epoch": 0.5070097945073939, "grad_norm": 37.114646911621094, "learning_rate": 1.149831424478913e-06, "loss": 0.4612, "num_input_tokens_seen": 24959744, "step": 7920 }, { "epoch": 0.5073298764483708, "grad_norm": 62.12904357910156, "learning_rate": 1.1487265762439224e-06, "loss": 0.3948, "num_input_tokens_seen": 24975488, "step": 7925 }, { "epoch": 0.5076499583893477, "grad_norm": 40.3009033203125, "learning_rate": 1.1476215423220547e-06, "loss": 0.362, "num_input_tokens_seen": 24990272, "step": 7930 }, { "epoch": 0.5079700403303246, "grad_norm": 39.82942199707031, "learning_rate": 1.146516324092959e-06, "loss": 0.3761, "num_input_tokens_seen": 25006272, "step": 7935 }, { "epoch": 0.5082901222713014, "grad_norm": 23.33016014099121, "learning_rate": 1.1454109229365117e-06, "loss": 0.2954, "num_input_tokens_seen": 25022464, "step": 7940 }, { "epoch": 0.5086102042122783, "grad_norm": 27.223312377929688, "learning_rate": 1.14430534023282e-06, "loss": 0.3132, "num_input_tokens_seen": 25037376, "step": 7945 }, { "epoch": 0.5089302861532552, "grad_norm": 36.93307876586914, "learning_rate": 1.1431995773622167e-06, "loss": 0.4736, "num_input_tokens_seen": 25053440, "step": 7950 }, { "epoch": 0.5092503680942321, "grad_norm": 21.982830047607422, "learning_rate": 1.1420936357052597e-06, "loss": 0.4369, "num_input_tokens_seen": 25069120, "step": 7955 }, { "epoch": 0.5095704500352091, "grad_norm": 22.12405014038086, "learning_rate": 1.1409875166427303e-06, "loss": 0.3078, "num_input_tokens_seen": 25084224, "step": 7960 }, { "epoch": 0.509890531976186, "grad_norm": 37.66783142089844, "learning_rate": 1.1398812215556308e-06, "loss": 0.4996, "num_input_tokens_seen": 25099520, "step": 7965 }, { "epoch": 0.5102106139171628, "grad_norm": 28.573827743530273, "learning_rate": 1.1387747518251837e-06, "loss": 0.362, "num_input_tokens_seen": 25115200, "step": 7970 }, { "epoch": 0.5105306958581397, "grad_norm": 20.292476654052734, "learning_rate": 1.13766810883283e-06, "loss": 0.3266, "num_input_tokens_seen": 25131520, "step": 7975 }, { "epoch": 0.5108507777991166, "grad_norm": 36.63866424560547, "learning_rate": 1.1365612939602255e-06, "loss": 0.5172, "num_input_tokens_seen": 25147776, "step": 7980 }, { "epoch": 0.5111708597400935, "grad_norm": 22.338659286499023, "learning_rate": 1.1354543085892423e-06, "loss": 0.3683, "num_input_tokens_seen": 25162816, "step": 7985 }, { "epoch": 0.5114909416810703, "grad_norm": 34.683868408203125, "learning_rate": 1.1343471541019646e-06, "loss": 0.3333, "num_input_tokens_seen": 25178752, "step": 7990 }, { "epoch": 0.5118110236220472, "grad_norm": 57.14018249511719, "learning_rate": 1.1332398318806872e-06, "loss": 0.3719, "num_input_tokens_seen": 25194048, "step": 7995 }, { "epoch": 0.5121311055630241, "grad_norm": 32.1242561340332, "learning_rate": 1.1321323433079158e-06, "loss": 0.3796, "num_input_tokens_seen": 25209216, "step": 8000 }, { "epoch": 0.512451187504001, "grad_norm": 28.248655319213867, "learning_rate": 1.1310246897663623e-06, "loss": 0.379, "num_input_tokens_seen": 25224640, "step": 8005 }, { "epoch": 0.5127712694449779, "grad_norm": 19.069774627685547, "learning_rate": 1.1299168726389447e-06, "loss": 0.408, "num_input_tokens_seen": 25239808, "step": 8010 }, { "epoch": 0.5130913513859549, "grad_norm": 42.42983627319336, "learning_rate": 1.1288088933087868e-06, "loss": 0.3354, "num_input_tokens_seen": 25257344, "step": 8015 }, { "epoch": 0.5134114333269317, "grad_norm": 22.4074764251709, "learning_rate": 1.1277007531592127e-06, "loss": 0.3365, "num_input_tokens_seen": 25272064, "step": 8020 }, { "epoch": 0.5137315152679086, "grad_norm": 28.663759231567383, "learning_rate": 1.1265924535737492e-06, "loss": 0.3619, "num_input_tokens_seen": 25287936, "step": 8025 }, { "epoch": 0.5140515972088855, "grad_norm": 39.256492614746094, "learning_rate": 1.125483995936121e-06, "loss": 0.3007, "num_input_tokens_seen": 25303232, "step": 8030 }, { "epoch": 0.5143716791498624, "grad_norm": 20.142274856567383, "learning_rate": 1.1243753816302507e-06, "loss": 0.376, "num_input_tokens_seen": 25318656, "step": 8035 }, { "epoch": 0.5146917610908393, "grad_norm": 46.976951599121094, "learning_rate": 1.1232666120402558e-06, "loss": 0.417, "num_input_tokens_seen": 25333760, "step": 8040 }, { "epoch": 0.5150118430318161, "grad_norm": 35.951576232910156, "learning_rate": 1.1221576885504487e-06, "loss": 0.3827, "num_input_tokens_seen": 25349824, "step": 8045 }, { "epoch": 0.515331924972793, "grad_norm": 19.6291561126709, "learning_rate": 1.121048612545333e-06, "loss": 0.4027, "num_input_tokens_seen": 25365376, "step": 8050 }, { "epoch": 0.5156520069137699, "grad_norm": 44.66822052001953, "learning_rate": 1.1199393854096034e-06, "loss": 0.4599, "num_input_tokens_seen": 25380928, "step": 8055 }, { "epoch": 0.5159720888547468, "grad_norm": 79.27295684814453, "learning_rate": 1.118830008528143e-06, "loss": 0.3487, "num_input_tokens_seen": 25396352, "step": 8060 }, { "epoch": 0.5162921707957238, "grad_norm": 21.75312042236328, "learning_rate": 1.1177204832860212e-06, "loss": 0.3159, "num_input_tokens_seen": 25411456, "step": 8065 }, { "epoch": 0.5166122527367006, "grad_norm": 19.3381290435791, "learning_rate": 1.1166108110684947e-06, "loss": 0.4322, "num_input_tokens_seen": 25428544, "step": 8070 }, { "epoch": 0.5169323346776775, "grad_norm": 37.30630111694336, "learning_rate": 1.1155009932610003e-06, "loss": 0.3988, "num_input_tokens_seen": 25443968, "step": 8075 }, { "epoch": 0.5172524166186544, "grad_norm": 45.22068786621094, "learning_rate": 1.1143910312491605e-06, "loss": 0.3273, "num_input_tokens_seen": 25458880, "step": 8080 }, { "epoch": 0.5175724985596313, "grad_norm": 53.44335174560547, "learning_rate": 1.1132809264187748e-06, "loss": 0.3196, "num_input_tokens_seen": 25474304, "step": 8085 }, { "epoch": 0.5178925805006082, "grad_norm": 59.70965576171875, "learning_rate": 1.1121706801558226e-06, "loss": 0.3884, "num_input_tokens_seen": 25489472, "step": 8090 }, { "epoch": 0.518212662441585, "grad_norm": 44.1774787902832, "learning_rate": 1.111060293846459e-06, "loss": 0.3827, "num_input_tokens_seen": 25504896, "step": 8095 }, { "epoch": 0.5185327443825619, "grad_norm": 79.03081512451172, "learning_rate": 1.1099497688770148e-06, "loss": 0.4807, "num_input_tokens_seen": 25519360, "step": 8100 }, { "epoch": 0.5188528263235388, "grad_norm": 35.3879280090332, "learning_rate": 1.1088391066339928e-06, "loss": 0.4418, "num_input_tokens_seen": 25535680, "step": 8105 }, { "epoch": 0.5191729082645157, "grad_norm": 43.35395050048828, "learning_rate": 1.1077283085040684e-06, "loss": 0.5327, "num_input_tokens_seen": 25550592, "step": 8110 }, { "epoch": 0.5194929902054926, "grad_norm": 39.26498031616211, "learning_rate": 1.1066173758740863e-06, "loss": 0.4083, "num_input_tokens_seen": 25565696, "step": 8115 }, { "epoch": 0.5198130721464695, "grad_norm": 17.995386123657227, "learning_rate": 1.105506310131058e-06, "loss": 0.3485, "num_input_tokens_seen": 25581568, "step": 8120 }, { "epoch": 0.5201331540874464, "grad_norm": 56.82388687133789, "learning_rate": 1.1043951126621634e-06, "loss": 0.466, "num_input_tokens_seen": 25597760, "step": 8125 }, { "epoch": 0.5204532360284233, "grad_norm": 31.271780014038086, "learning_rate": 1.1032837848547445e-06, "loss": 0.4111, "num_input_tokens_seen": 25615424, "step": 8130 }, { "epoch": 0.5207733179694002, "grad_norm": 33.19522476196289, "learning_rate": 1.1021723280963074e-06, "loss": 0.4094, "num_input_tokens_seen": 25630720, "step": 8135 }, { "epoch": 0.5210933999103771, "grad_norm": 40.24439239501953, "learning_rate": 1.1010607437745194e-06, "loss": 0.4886, "num_input_tokens_seen": 25649280, "step": 8140 }, { "epoch": 0.5214134818513539, "grad_norm": 49.17844009399414, "learning_rate": 1.0999490332772057e-06, "loss": 0.5002, "num_input_tokens_seen": 25664576, "step": 8145 }, { "epoch": 0.5217335637923308, "grad_norm": 26.123889923095703, "learning_rate": 1.0988371979923507e-06, "loss": 0.4193, "num_input_tokens_seen": 25680384, "step": 8150 }, { "epoch": 0.5220536457333077, "grad_norm": 26.953947067260742, "learning_rate": 1.097725239308094e-06, "loss": 0.4017, "num_input_tokens_seen": 25696128, "step": 8155 }, { "epoch": 0.5223737276742846, "grad_norm": 15.423673629760742, "learning_rate": 1.0966131586127278e-06, "loss": 0.2794, "num_input_tokens_seen": 25712768, "step": 8160 }, { "epoch": 0.5226938096152615, "grad_norm": 25.20142936706543, "learning_rate": 1.0955009572946992e-06, "loss": 0.4033, "num_input_tokens_seen": 25727616, "step": 8165 }, { "epoch": 0.5230138915562383, "grad_norm": 22.9870548248291, "learning_rate": 1.094388636742604e-06, "loss": 0.4149, "num_input_tokens_seen": 25744384, "step": 8170 }, { "epoch": 0.5233339734972153, "grad_norm": 31.26616859436035, "learning_rate": 1.0932761983451878e-06, "loss": 0.3376, "num_input_tokens_seen": 25760640, "step": 8175 }, { "epoch": 0.5236540554381922, "grad_norm": 32.35393142700195, "learning_rate": 1.0921636434913425e-06, "loss": 0.3116, "num_input_tokens_seen": 25776640, "step": 8180 }, { "epoch": 0.5239741373791691, "grad_norm": 26.09176254272461, "learning_rate": 1.091050973570106e-06, "loss": 0.2977, "num_input_tokens_seen": 25791744, "step": 8185 }, { "epoch": 0.524294219320146, "grad_norm": 49.68628692626953, "learning_rate": 1.08993818997066e-06, "loss": 0.5531, "num_input_tokens_seen": 25808256, "step": 8190 }, { "epoch": 0.5246143012611229, "grad_norm": 36.49836730957031, "learning_rate": 1.0888252940823283e-06, "loss": 0.4378, "num_input_tokens_seen": 25824128, "step": 8195 }, { "epoch": 0.5249343832020997, "grad_norm": 39.86119842529297, "learning_rate": 1.0877122872945737e-06, "loss": 0.4676, "num_input_tokens_seen": 25840576, "step": 8200 }, { "epoch": 0.5252544651430766, "grad_norm": 32.07432556152344, "learning_rate": 1.0865991709969983e-06, "loss": 0.317, "num_input_tokens_seen": 25856256, "step": 8205 }, { "epoch": 0.5255745470840535, "grad_norm": 20.993459701538086, "learning_rate": 1.0854859465793416e-06, "loss": 0.4482, "num_input_tokens_seen": 25871424, "step": 8210 }, { "epoch": 0.5258946290250304, "grad_norm": 33.609657287597656, "learning_rate": 1.0843726154314767e-06, "loss": 0.4974, "num_input_tokens_seen": 25886272, "step": 8215 }, { "epoch": 0.5262147109660072, "grad_norm": 30.594623565673828, "learning_rate": 1.083259178943411e-06, "loss": 0.4376, "num_input_tokens_seen": 25901952, "step": 8220 }, { "epoch": 0.5265347929069842, "grad_norm": 20.63231086730957, "learning_rate": 1.0821456385052822e-06, "loss": 0.3694, "num_input_tokens_seen": 25917888, "step": 8225 }, { "epoch": 0.5268548748479611, "grad_norm": 46.33021545410156, "learning_rate": 1.0810319955073598e-06, "loss": 0.4199, "num_input_tokens_seen": 25933824, "step": 8230 }, { "epoch": 0.527174956788938, "grad_norm": 36.321929931640625, "learning_rate": 1.0799182513400393e-06, "loss": 0.3888, "num_input_tokens_seen": 25951360, "step": 8235 }, { "epoch": 0.5274950387299149, "grad_norm": 37.35638427734375, "learning_rate": 1.0788044073938438e-06, "loss": 0.3594, "num_input_tokens_seen": 25967232, "step": 8240 }, { "epoch": 0.5278151206708918, "grad_norm": 37.84722900390625, "learning_rate": 1.0776904650594205e-06, "loss": 0.4146, "num_input_tokens_seen": 25982592, "step": 8245 }, { "epoch": 0.5281352026118686, "grad_norm": 67.66139221191406, "learning_rate": 1.0765764257275394e-06, "loss": 0.4094, "num_input_tokens_seen": 25997824, "step": 8250 }, { "epoch": 0.5284552845528455, "grad_norm": 32.80574035644531, "learning_rate": 1.0754622907890914e-06, "loss": 0.4292, "num_input_tokens_seen": 26013632, "step": 8255 }, { "epoch": 0.5287753664938224, "grad_norm": 28.530445098876953, "learning_rate": 1.0743480616350873e-06, "loss": 0.3249, "num_input_tokens_seen": 26028800, "step": 8260 }, { "epoch": 0.5290954484347993, "grad_norm": 30.938467025756836, "learning_rate": 1.0732337396566558e-06, "loss": 0.339, "num_input_tokens_seen": 26044672, "step": 8265 }, { "epoch": 0.5294155303757762, "grad_norm": 20.649280548095703, "learning_rate": 1.07211932624504e-06, "loss": 0.396, "num_input_tokens_seen": 26060544, "step": 8270 }, { "epoch": 0.529735612316753, "grad_norm": 18.15691566467285, "learning_rate": 1.0710048227915988e-06, "loss": 0.3786, "num_input_tokens_seen": 26076160, "step": 8275 }, { "epoch": 0.53005569425773, "grad_norm": 24.960102081298828, "learning_rate": 1.0698902306878024e-06, "loss": 0.4186, "num_input_tokens_seen": 26092352, "step": 8280 }, { "epoch": 0.5303757761987069, "grad_norm": 25.81612205505371, "learning_rate": 1.0687755513252325e-06, "loss": 0.3024, "num_input_tokens_seen": 26107776, "step": 8285 }, { "epoch": 0.5306958581396838, "grad_norm": 11.139862060546875, "learning_rate": 1.0676607860955794e-06, "loss": 0.31, "num_input_tokens_seen": 26123712, "step": 8290 }, { "epoch": 0.5310159400806607, "grad_norm": 42.41699981689453, "learning_rate": 1.0665459363906404e-06, "loss": 0.386, "num_input_tokens_seen": 26139200, "step": 8295 }, { "epoch": 0.5313360220216375, "grad_norm": 23.389768600463867, "learning_rate": 1.0654310036023185e-06, "loss": 0.4355, "num_input_tokens_seen": 26153600, "step": 8300 }, { "epoch": 0.5316561039626144, "grad_norm": 19.833234786987305, "learning_rate": 1.0643159891226203e-06, "loss": 0.4206, "num_input_tokens_seen": 26169600, "step": 8305 }, { "epoch": 0.5319761859035913, "grad_norm": 33.841224670410156, "learning_rate": 1.0632008943436545e-06, "loss": 0.3398, "num_input_tokens_seen": 26185536, "step": 8310 }, { "epoch": 0.5322962678445682, "grad_norm": 17.150596618652344, "learning_rate": 1.0620857206576299e-06, "loss": 0.453, "num_input_tokens_seen": 26201536, "step": 8315 }, { "epoch": 0.5326163497855451, "grad_norm": 14.26513957977295, "learning_rate": 1.0609704694568546e-06, "loss": 0.2888, "num_input_tokens_seen": 26216576, "step": 8320 }, { "epoch": 0.5329364317265219, "grad_norm": 23.111820220947266, "learning_rate": 1.0598551421337318e-06, "loss": 0.2904, "num_input_tokens_seen": 26232640, "step": 8325 }, { "epoch": 0.5332565136674989, "grad_norm": 20.46584701538086, "learning_rate": 1.0587397400807617e-06, "loss": 0.5146, "num_input_tokens_seen": 26248448, "step": 8330 }, { "epoch": 0.5335765956084758, "grad_norm": 36.023284912109375, "learning_rate": 1.057624264690536e-06, "loss": 0.519, "num_input_tokens_seen": 26263872, "step": 8335 }, { "epoch": 0.5338966775494527, "grad_norm": 36.1595573425293, "learning_rate": 1.0565087173557394e-06, "loss": 0.4598, "num_input_tokens_seen": 26279872, "step": 8340 }, { "epoch": 0.5342167594904296, "grad_norm": 24.1319580078125, "learning_rate": 1.055393099469146e-06, "loss": 0.3428, "num_input_tokens_seen": 26295680, "step": 8345 }, { "epoch": 0.5345368414314065, "grad_norm": 34.465797424316406, "learning_rate": 1.054277412423617e-06, "loss": 0.4057, "num_input_tokens_seen": 26311040, "step": 8350 }, { "epoch": 0.5348569233723833, "grad_norm": 24.986618041992188, "learning_rate": 1.0531616576121017e-06, "loss": 0.4603, "num_input_tokens_seen": 26326144, "step": 8355 }, { "epoch": 0.5351770053133602, "grad_norm": 25.222026824951172, "learning_rate": 1.0520458364276325e-06, "loss": 0.3347, "num_input_tokens_seen": 26341952, "step": 8360 }, { "epoch": 0.5354970872543371, "grad_norm": 37.025054931640625, "learning_rate": 1.0509299502633256e-06, "loss": 0.3565, "num_input_tokens_seen": 26356672, "step": 8365 }, { "epoch": 0.535817169195314, "grad_norm": 20.434568405151367, "learning_rate": 1.0498140005123777e-06, "loss": 0.4493, "num_input_tokens_seen": 26373056, "step": 8370 }, { "epoch": 0.5361372511362908, "grad_norm": 12.433558464050293, "learning_rate": 1.0486979885680653e-06, "loss": 0.426, "num_input_tokens_seen": 26388032, "step": 8375 }, { "epoch": 0.5364573330772677, "grad_norm": 54.505035400390625, "learning_rate": 1.0475819158237424e-06, "loss": 0.4115, "num_input_tokens_seen": 26402880, "step": 8380 }, { "epoch": 0.5367774150182447, "grad_norm": 22.174421310424805, "learning_rate": 1.0464657836728389e-06, "loss": 0.4713, "num_input_tokens_seen": 26419328, "step": 8385 }, { "epoch": 0.5370974969592216, "grad_norm": 33.491397857666016, "learning_rate": 1.045349593508859e-06, "loss": 0.3981, "num_input_tokens_seen": 26434112, "step": 8390 }, { "epoch": 0.5374175789001985, "grad_norm": 22.270578384399414, "learning_rate": 1.0442333467253788e-06, "loss": 0.297, "num_input_tokens_seen": 26450688, "step": 8395 }, { "epoch": 0.5377376608411754, "grad_norm": 32.83494186401367, "learning_rate": 1.0431170447160463e-06, "loss": 0.3602, "num_input_tokens_seen": 26466368, "step": 8400 }, { "epoch": 0.5380577427821522, "grad_norm": 21.519004821777344, "learning_rate": 1.0420006888745767e-06, "loss": 0.3495, "num_input_tokens_seen": 26482624, "step": 8405 }, { "epoch": 0.5383778247231291, "grad_norm": 22.21971321105957, "learning_rate": 1.0408842805947543e-06, "loss": 0.3668, "num_input_tokens_seen": 26499200, "step": 8410 }, { "epoch": 0.538697906664106, "grad_norm": 32.608150482177734, "learning_rate": 1.0397678212704276e-06, "loss": 0.5119, "num_input_tokens_seen": 26514048, "step": 8415 }, { "epoch": 0.5390179886050829, "grad_norm": 32.62826919555664, "learning_rate": 1.038651312295509e-06, "loss": 0.4034, "num_input_tokens_seen": 26529216, "step": 8420 }, { "epoch": 0.5393380705460598, "grad_norm": 24.986495971679688, "learning_rate": 1.037534755063973e-06, "loss": 0.4192, "num_input_tokens_seen": 26545152, "step": 8425 }, { "epoch": 0.5396581524870366, "grad_norm": 44.1995964050293, "learning_rate": 1.0364181509698548e-06, "loss": 0.4147, "num_input_tokens_seen": 26560512, "step": 8430 }, { "epoch": 0.5399782344280136, "grad_norm": 29.369369506835938, "learning_rate": 1.0353015014072476e-06, "loss": 0.35, "num_input_tokens_seen": 26575488, "step": 8435 }, { "epoch": 0.5402983163689905, "grad_norm": 50.47454071044922, "learning_rate": 1.0341848077703013e-06, "loss": 0.405, "num_input_tokens_seen": 26591040, "step": 8440 }, { "epoch": 0.5406183983099674, "grad_norm": 26.94370460510254, "learning_rate": 1.033068071453221e-06, "loss": 0.3229, "num_input_tokens_seen": 26606976, "step": 8445 }, { "epoch": 0.5409384802509443, "grad_norm": 35.96391677856445, "learning_rate": 1.0319512938502653e-06, "loss": 0.3623, "num_input_tokens_seen": 26623296, "step": 8450 }, { "epoch": 0.5412585621919211, "grad_norm": 32.38021469116211, "learning_rate": 1.0308344763557444e-06, "loss": 0.3123, "num_input_tokens_seen": 26638336, "step": 8455 }, { "epoch": 0.541578644132898, "grad_norm": 15.892178535461426, "learning_rate": 1.0297176203640175e-06, "loss": 0.2841, "num_input_tokens_seen": 26654400, "step": 8460 }, { "epoch": 0.5418987260738749, "grad_norm": 54.3671760559082, "learning_rate": 1.0286007272694924e-06, "loss": 0.3482, "num_input_tokens_seen": 26669568, "step": 8465 }, { "epoch": 0.5422188080148518, "grad_norm": 27.727298736572266, "learning_rate": 1.0274837984666239e-06, "loss": 0.4695, "num_input_tokens_seen": 26686016, "step": 8470 }, { "epoch": 0.5425388899558287, "grad_norm": 31.10105323791504, "learning_rate": 1.02636683534991e-06, "loss": 0.4184, "num_input_tokens_seen": 26701504, "step": 8475 }, { "epoch": 0.5428589718968055, "grad_norm": 62.131317138671875, "learning_rate": 1.0252498393138928e-06, "loss": 0.5884, "num_input_tokens_seen": 26717120, "step": 8480 }, { "epoch": 0.5431790538377824, "grad_norm": 70.49308776855469, "learning_rate": 1.0241328117531546e-06, "loss": 0.4193, "num_input_tokens_seen": 26732736, "step": 8485 }, { "epoch": 0.5434991357787594, "grad_norm": 30.73244285583496, "learning_rate": 1.0230157540623174e-06, "loss": 0.4126, "num_input_tokens_seen": 26747392, "step": 8490 }, { "epoch": 0.5438192177197363, "grad_norm": 22.281478881835938, "learning_rate": 1.0218986676360415e-06, "loss": 0.4462, "num_input_tokens_seen": 26762112, "step": 8495 }, { "epoch": 0.5441392996607132, "grad_norm": 22.789291381835938, "learning_rate": 1.0207815538690216e-06, "loss": 0.3709, "num_input_tokens_seen": 26777856, "step": 8500 }, { "epoch": 0.54445938160169, "grad_norm": 51.15581512451172, "learning_rate": 1.0196644141559877e-06, "loss": 0.3055, "num_input_tokens_seen": 26794048, "step": 8505 }, { "epoch": 0.5447794635426669, "grad_norm": 42.44687271118164, "learning_rate": 1.0185472498917021e-06, "loss": 0.3509, "num_input_tokens_seen": 26809792, "step": 8510 }, { "epoch": 0.5450995454836438, "grad_norm": 53.03976058959961, "learning_rate": 1.017430062470957e-06, "loss": 0.4421, "num_input_tokens_seen": 26825024, "step": 8515 }, { "epoch": 0.5454196274246207, "grad_norm": 29.0567569732666, "learning_rate": 1.016312853288574e-06, "loss": 0.3472, "num_input_tokens_seen": 26841536, "step": 8520 }, { "epoch": 0.5457397093655976, "grad_norm": 21.90899085998535, "learning_rate": 1.0151956237394027e-06, "loss": 0.395, "num_input_tokens_seen": 26857600, "step": 8525 }, { "epoch": 0.5460597913065744, "grad_norm": 27.42255210876465, "learning_rate": 1.0140783752183164e-06, "loss": 0.3942, "num_input_tokens_seen": 26874176, "step": 8530 }, { "epoch": 0.5463798732475513, "grad_norm": 26.120128631591797, "learning_rate": 1.0129611091202138e-06, "loss": 0.4162, "num_input_tokens_seen": 26890176, "step": 8535 }, { "epoch": 0.5466999551885282, "grad_norm": 25.828702926635742, "learning_rate": 1.0118438268400135e-06, "loss": 0.2897, "num_input_tokens_seen": 26905728, "step": 8540 }, { "epoch": 0.5470200371295052, "grad_norm": 46.99468994140625, "learning_rate": 1.0107265297726568e-06, "loss": 0.4655, "num_input_tokens_seen": 26921280, "step": 8545 }, { "epoch": 0.5473401190704821, "grad_norm": 34.46550369262695, "learning_rate": 1.009609219313102e-06, "loss": 0.4065, "num_input_tokens_seen": 26936704, "step": 8550 }, { "epoch": 0.547660201011459, "grad_norm": 16.38555145263672, "learning_rate": 1.0084918968563236e-06, "loss": 0.4008, "num_input_tokens_seen": 26952448, "step": 8555 }, { "epoch": 0.5479802829524358, "grad_norm": 30.922161102294922, "learning_rate": 1.0073745637973124e-06, "loss": 0.3928, "num_input_tokens_seen": 26967680, "step": 8560 }, { "epoch": 0.5483003648934127, "grad_norm": 17.125778198242188, "learning_rate": 1.0062572215310718e-06, "loss": 0.3489, "num_input_tokens_seen": 26982400, "step": 8565 }, { "epoch": 0.5486204468343896, "grad_norm": 45.65067672729492, "learning_rate": 1.0051398714526165e-06, "loss": 0.313, "num_input_tokens_seen": 26998400, "step": 8570 }, { "epoch": 0.5489405287753665, "grad_norm": 45.19715118408203, "learning_rate": 1.0040225149569712e-06, "loss": 0.3506, "num_input_tokens_seen": 27015936, "step": 8575 }, { "epoch": 0.5492606107163434, "grad_norm": 36.80413055419922, "learning_rate": 1.0029051534391693e-06, "loss": 0.3263, "num_input_tokens_seen": 27030528, "step": 8580 }, { "epoch": 0.5495806926573202, "grad_norm": 21.942888259887695, "learning_rate": 1.001787788294249e-06, "loss": 0.3621, "num_input_tokens_seen": 27046080, "step": 8585 }, { "epoch": 0.5499007745982971, "grad_norm": 22.532997131347656, "learning_rate": 1.0006704209172537e-06, "loss": 0.4206, "num_input_tokens_seen": 27061504, "step": 8590 }, { "epoch": 0.5502208565392741, "grad_norm": 47.835289001464844, "learning_rate": 9.995530527032301e-07, "loss": 0.4297, "num_input_tokens_seen": 27077056, "step": 8595 }, { "epoch": 0.550540938480251, "grad_norm": 27.61309051513672, "learning_rate": 9.984356850472257e-07, "loss": 0.3382, "num_input_tokens_seen": 27095168, "step": 8600 }, { "epoch": 0.5506689712566417, "eval_loss": 0.3985471725463867, "eval_runtime": 49.1827, "eval_samples_per_second": 282.335, "eval_steps_per_second": 35.297, "num_input_tokens_seen": 27101056, "step": 8602 }, { "epoch": 0.5508610204212279, "grad_norm": 21.579906463623047, "learning_rate": 9.97318319344287e-07, "loss": 0.3698, "num_input_tokens_seen": 27110144, "step": 8605 }, { "epoch": 0.5511811023622047, "grad_norm": 25.68075180053711, "learning_rate": 9.962009569894577e-07, "loss": 0.5311, "num_input_tokens_seen": 27124864, "step": 8610 }, { "epoch": 0.5515011843031816, "grad_norm": 29.338640213012695, "learning_rate": 9.95083599377778e-07, "loss": 0.3769, "num_input_tokens_seen": 27140160, "step": 8615 }, { "epoch": 0.5518212662441585, "grad_norm": 28.375497817993164, "learning_rate": 9.939662479042828e-07, "loss": 0.374, "num_input_tokens_seen": 27155712, "step": 8620 }, { "epoch": 0.5521413481851354, "grad_norm": 62.96663284301758, "learning_rate": 9.92848903963998e-07, "loss": 0.4573, "num_input_tokens_seen": 27171520, "step": 8625 }, { "epoch": 0.5524614301261123, "grad_norm": 33.80502700805664, "learning_rate": 9.9173156895194e-07, "loss": 0.4487, "num_input_tokens_seen": 27186752, "step": 8630 }, { "epoch": 0.5527815120670891, "grad_norm": 25.58247184753418, "learning_rate": 9.906142442631154e-07, "loss": 0.3823, "num_input_tokens_seen": 27201664, "step": 8635 }, { "epoch": 0.553101594008066, "grad_norm": 16.91172218322754, "learning_rate": 9.894969312925171e-07, "loss": 0.3804, "num_input_tokens_seen": 27218880, "step": 8640 }, { "epoch": 0.5534216759490429, "grad_norm": 51.58964157104492, "learning_rate": 9.883796314351234e-07, "loss": 0.3448, "num_input_tokens_seen": 27235648, "step": 8645 }, { "epoch": 0.5537417578900199, "grad_norm": 13.989603996276855, "learning_rate": 9.872623460858966e-07, "loss": 0.3997, "num_input_tokens_seen": 27250880, "step": 8650 }, { "epoch": 0.5540618398309968, "grad_norm": 12.994277954101562, "learning_rate": 9.861450766397799e-07, "loss": 0.3163, "num_input_tokens_seen": 27266880, "step": 8655 }, { "epoch": 0.5543819217719737, "grad_norm": 24.744857788085938, "learning_rate": 9.850278244916976e-07, "loss": 0.411, "num_input_tokens_seen": 27282816, "step": 8660 }, { "epoch": 0.5547020037129505, "grad_norm": 19.459922790527344, "learning_rate": 9.839105910365524e-07, "loss": 0.4309, "num_input_tokens_seen": 27298496, "step": 8665 }, { "epoch": 0.5550220856539274, "grad_norm": 38.75739288330078, "learning_rate": 9.827933776692235e-07, "loss": 0.331, "num_input_tokens_seen": 27313856, "step": 8670 }, { "epoch": 0.5553421675949043, "grad_norm": 30.53208351135254, "learning_rate": 9.81676185784564e-07, "loss": 0.34, "num_input_tokens_seen": 27328448, "step": 8675 }, { "epoch": 0.5556622495358812, "grad_norm": 15.985432624816895, "learning_rate": 9.805590167774021e-07, "loss": 0.3916, "num_input_tokens_seen": 27343872, "step": 8680 }, { "epoch": 0.555982331476858, "grad_norm": 75.76026153564453, "learning_rate": 9.79441872042536e-07, "loss": 0.5632, "num_input_tokens_seen": 27358720, "step": 8685 }, { "epoch": 0.5563024134178349, "grad_norm": 32.97372817993164, "learning_rate": 9.783247529747338e-07, "loss": 0.3856, "num_input_tokens_seen": 27373312, "step": 8690 }, { "epoch": 0.5566224953588118, "grad_norm": 24.052457809448242, "learning_rate": 9.772076609687323e-07, "loss": 0.3571, "num_input_tokens_seen": 27388544, "step": 8695 }, { "epoch": 0.5569425772997888, "grad_norm": 35.491371154785156, "learning_rate": 9.760905974192334e-07, "loss": 0.3259, "num_input_tokens_seen": 27405120, "step": 8700 }, { "epoch": 0.5572626592407657, "grad_norm": 22.80748748779297, "learning_rate": 9.749735637209044e-07, "loss": 0.4078, "num_input_tokens_seen": 27420544, "step": 8705 }, { "epoch": 0.5575827411817426, "grad_norm": 16.720609664916992, "learning_rate": 9.738565612683754e-07, "loss": 0.3137, "num_input_tokens_seen": 27435456, "step": 8710 }, { "epoch": 0.5579028231227194, "grad_norm": 28.667015075683594, "learning_rate": 9.727395914562363e-07, "loss": 0.3477, "num_input_tokens_seen": 27452032, "step": 8715 }, { "epoch": 0.5582229050636963, "grad_norm": 25.73943328857422, "learning_rate": 9.716226556790372e-07, "loss": 0.4159, "num_input_tokens_seen": 27467520, "step": 8720 }, { "epoch": 0.5585429870046732, "grad_norm": 29.19492530822754, "learning_rate": 9.705057553312855e-07, "loss": 0.312, "num_input_tokens_seen": 27482816, "step": 8725 }, { "epoch": 0.5588630689456501, "grad_norm": 24.96323013305664, "learning_rate": 9.693888918074452e-07, "loss": 0.374, "num_input_tokens_seen": 27497600, "step": 8730 }, { "epoch": 0.559183150886627, "grad_norm": 40.72119140625, "learning_rate": 9.682720665019325e-07, "loss": 0.4861, "num_input_tokens_seen": 27513344, "step": 8735 }, { "epoch": 0.5595032328276038, "grad_norm": 20.98204231262207, "learning_rate": 9.671552808091172e-07, "loss": 0.4204, "num_input_tokens_seen": 27530304, "step": 8740 }, { "epoch": 0.5598233147685807, "grad_norm": 21.96649932861328, "learning_rate": 9.660385361233195e-07, "loss": 0.3409, "num_input_tokens_seen": 27545664, "step": 8745 }, { "epoch": 0.5601433967095576, "grad_norm": 20.051984786987305, "learning_rate": 9.649218338388084e-07, "loss": 0.2987, "num_input_tokens_seen": 27560704, "step": 8750 }, { "epoch": 0.5604634786505346, "grad_norm": 21.695167541503906, "learning_rate": 9.638051753497994e-07, "loss": 0.4353, "num_input_tokens_seen": 27577472, "step": 8755 }, { "epoch": 0.5607835605915115, "grad_norm": 20.893781661987305, "learning_rate": 9.62688562050454e-07, "loss": 0.3597, "num_input_tokens_seen": 27592960, "step": 8760 }, { "epoch": 0.5611036425324883, "grad_norm": 17.552732467651367, "learning_rate": 9.615719953348772e-07, "loss": 0.4033, "num_input_tokens_seen": 27610304, "step": 8765 }, { "epoch": 0.5614237244734652, "grad_norm": 37.0562629699707, "learning_rate": 9.604554765971148e-07, "loss": 0.5574, "num_input_tokens_seen": 27628288, "step": 8770 }, { "epoch": 0.5617438064144421, "grad_norm": 20.61250114440918, "learning_rate": 9.593390072311549e-07, "loss": 0.4069, "num_input_tokens_seen": 27643904, "step": 8775 }, { "epoch": 0.562063888355419, "grad_norm": 20.135055541992188, "learning_rate": 9.582225886309216e-07, "loss": 0.3576, "num_input_tokens_seen": 27660224, "step": 8780 }, { "epoch": 0.5623839702963959, "grad_norm": 18.054454803466797, "learning_rate": 9.571062221902767e-07, "loss": 0.3015, "num_input_tokens_seen": 27675136, "step": 8785 }, { "epoch": 0.5627040522373727, "grad_norm": 56.18107223510742, "learning_rate": 9.559899093030175e-07, "loss": 0.3485, "num_input_tokens_seen": 27690176, "step": 8790 }, { "epoch": 0.5630241341783496, "grad_norm": 34.77610778808594, "learning_rate": 9.54873651362873e-07, "loss": 0.3061, "num_input_tokens_seen": 27704512, "step": 8795 }, { "epoch": 0.5633442161193265, "grad_norm": 49.7370491027832, "learning_rate": 9.537574497635043e-07, "loss": 0.46, "num_input_tokens_seen": 27720448, "step": 8800 }, { "epoch": 0.5636642980603035, "grad_norm": 27.712543487548828, "learning_rate": 9.52641305898503e-07, "loss": 0.4966, "num_input_tokens_seen": 27735808, "step": 8805 }, { "epoch": 0.5639843800012804, "grad_norm": 27.37342643737793, "learning_rate": 9.515252211613873e-07, "loss": 0.3122, "num_input_tokens_seen": 27750464, "step": 8810 }, { "epoch": 0.5643044619422573, "grad_norm": 35.751590728759766, "learning_rate": 9.504091969456021e-07, "loss": 0.4586, "num_input_tokens_seen": 27764352, "step": 8815 }, { "epoch": 0.5646245438832341, "grad_norm": 20.632070541381836, "learning_rate": 9.492932346445165e-07, "loss": 0.338, "num_input_tokens_seen": 27779840, "step": 8820 }, { "epoch": 0.564944625824211, "grad_norm": 23.484317779541016, "learning_rate": 9.48177335651423e-07, "loss": 0.27, "num_input_tokens_seen": 27796352, "step": 8825 }, { "epoch": 0.5652647077651879, "grad_norm": 33.279884338378906, "learning_rate": 9.470615013595346e-07, "loss": 0.3325, "num_input_tokens_seen": 27810624, "step": 8830 }, { "epoch": 0.5655847897061648, "grad_norm": 42.17190933227539, "learning_rate": 9.459457331619829e-07, "loss": 0.4447, "num_input_tokens_seen": 27825152, "step": 8835 }, { "epoch": 0.5659048716471416, "grad_norm": 33.045230865478516, "learning_rate": 9.448300324518182e-07, "loss": 0.4076, "num_input_tokens_seen": 27840384, "step": 8840 }, { "epoch": 0.5662249535881185, "grad_norm": 32.98795700073242, "learning_rate": 9.437144006220058e-07, "loss": 0.3017, "num_input_tokens_seen": 27856640, "step": 8845 }, { "epoch": 0.5665450355290954, "grad_norm": 9.297707557678223, "learning_rate": 9.425988390654249e-07, "loss": 0.2027, "num_input_tokens_seen": 27872768, "step": 8850 }, { "epoch": 0.5668651174700723, "grad_norm": 40.04125213623047, "learning_rate": 9.414833491748677e-07, "loss": 0.4955, "num_input_tokens_seen": 27887488, "step": 8855 }, { "epoch": 0.5671851994110493, "grad_norm": 45.78459167480469, "learning_rate": 9.40367932343036e-07, "loss": 0.3024, "num_input_tokens_seen": 27902720, "step": 8860 }, { "epoch": 0.5675052813520262, "grad_norm": 28.001405715942383, "learning_rate": 9.392525899625407e-07, "loss": 0.374, "num_input_tokens_seen": 27918080, "step": 8865 }, { "epoch": 0.567825363293003, "grad_norm": 48.28670120239258, "learning_rate": 9.381373234259004e-07, "loss": 0.4011, "num_input_tokens_seen": 27933760, "step": 8870 }, { "epoch": 0.5681454452339799, "grad_norm": 42.333187103271484, "learning_rate": 9.370221341255382e-07, "loss": 0.375, "num_input_tokens_seen": 27948992, "step": 8875 }, { "epoch": 0.5684655271749568, "grad_norm": 28.905458450317383, "learning_rate": 9.359070234537807e-07, "loss": 0.3382, "num_input_tokens_seen": 27966848, "step": 8880 }, { "epoch": 0.5687856091159337, "grad_norm": 27.128929138183594, "learning_rate": 9.34791992802857e-07, "loss": 0.3803, "num_input_tokens_seen": 27981696, "step": 8885 }, { "epoch": 0.5691056910569106, "grad_norm": 29.13878631591797, "learning_rate": 9.336770435648963e-07, "loss": 0.2607, "num_input_tokens_seen": 27997376, "step": 8890 }, { "epoch": 0.5694257729978874, "grad_norm": 25.84345054626465, "learning_rate": 9.325621771319246e-07, "loss": 0.4075, "num_input_tokens_seen": 28014016, "step": 8895 }, { "epoch": 0.5697458549388643, "grad_norm": 21.55052947998047, "learning_rate": 9.314473948958673e-07, "loss": 0.4178, "num_input_tokens_seen": 28030400, "step": 8900 }, { "epoch": 0.5700659368798412, "grad_norm": 25.94553565979004, "learning_rate": 9.303326982485422e-07, "loss": 0.3456, "num_input_tokens_seen": 28047104, "step": 8905 }, { "epoch": 0.5703860188208181, "grad_norm": 49.04792785644531, "learning_rate": 9.29218088581661e-07, "loss": 0.3546, "num_input_tokens_seen": 28063168, "step": 8910 }, { "epoch": 0.5707061007617951, "grad_norm": 28.955217361450195, "learning_rate": 9.281035672868278e-07, "loss": 0.3462, "num_input_tokens_seen": 28079104, "step": 8915 }, { "epoch": 0.571026182702772, "grad_norm": 27.242048263549805, "learning_rate": 9.269891357555348e-07, "loss": 0.3912, "num_input_tokens_seen": 28094720, "step": 8920 }, { "epoch": 0.5713462646437488, "grad_norm": 39.87770462036133, "learning_rate": 9.25874795379163e-07, "loss": 0.2754, "num_input_tokens_seen": 28110848, "step": 8925 }, { "epoch": 0.5716663465847257, "grad_norm": 22.331693649291992, "learning_rate": 9.247605475489793e-07, "loss": 0.4172, "num_input_tokens_seen": 28127040, "step": 8930 }, { "epoch": 0.5719864285257026, "grad_norm": 33.441993713378906, "learning_rate": 9.236463936561358e-07, "loss": 0.3062, "num_input_tokens_seen": 28143424, "step": 8935 }, { "epoch": 0.5723065104666795, "grad_norm": 48.873287200927734, "learning_rate": 9.225323350916661e-07, "loss": 0.5365, "num_input_tokens_seen": 28158528, "step": 8940 }, { "epoch": 0.5726265924076563, "grad_norm": 35.569923400878906, "learning_rate": 9.214183732464855e-07, "loss": 0.3948, "num_input_tokens_seen": 28173888, "step": 8945 }, { "epoch": 0.5729466743486332, "grad_norm": 20.366697311401367, "learning_rate": 9.203045095113886e-07, "loss": 0.3671, "num_input_tokens_seen": 28191872, "step": 8950 }, { "epoch": 0.5732667562896101, "grad_norm": 45.24616622924805, "learning_rate": 9.191907452770476e-07, "loss": 0.4305, "num_input_tokens_seen": 28206912, "step": 8955 }, { "epoch": 0.573586838230587, "grad_norm": 29.864273071289062, "learning_rate": 9.180770819340095e-07, "loss": 0.4233, "num_input_tokens_seen": 28222336, "step": 8960 }, { "epoch": 0.573906920171564, "grad_norm": 14.063233375549316, "learning_rate": 9.169635208726967e-07, "loss": 0.376, "num_input_tokens_seen": 28238144, "step": 8965 }, { "epoch": 0.5742270021125409, "grad_norm": 62.739784240722656, "learning_rate": 9.15850063483403e-07, "loss": 0.3787, "num_input_tokens_seen": 28253376, "step": 8970 }, { "epoch": 0.5745470840535177, "grad_norm": 28.41097068786621, "learning_rate": 9.147367111562928e-07, "loss": 0.3493, "num_input_tokens_seen": 28269248, "step": 8975 }, { "epoch": 0.5748671659944946, "grad_norm": 35.87826919555664, "learning_rate": 9.136234652814005e-07, "loss": 0.4094, "num_input_tokens_seen": 28285440, "step": 8980 }, { "epoch": 0.5751872479354715, "grad_norm": 27.88485336303711, "learning_rate": 9.125103272486255e-07, "loss": 0.2965, "num_input_tokens_seen": 28300736, "step": 8985 }, { "epoch": 0.5755073298764484, "grad_norm": 30.880252838134766, "learning_rate": 9.11397298447734e-07, "loss": 0.361, "num_input_tokens_seen": 28315712, "step": 8990 }, { "epoch": 0.5758274118174252, "grad_norm": 30.014013290405273, "learning_rate": 9.10284380268356e-07, "loss": 0.3287, "num_input_tokens_seen": 28332032, "step": 8995 }, { "epoch": 0.5761474937584021, "grad_norm": 26.396350860595703, "learning_rate": 9.091715740999828e-07, "loss": 0.4476, "num_input_tokens_seen": 28347968, "step": 9000 }, { "epoch": 0.576467575699379, "grad_norm": 23.355926513671875, "learning_rate": 9.080588813319654e-07, "loss": 0.3849, "num_input_tokens_seen": 28362944, "step": 9005 }, { "epoch": 0.5767876576403559, "grad_norm": 42.71702194213867, "learning_rate": 9.069463033535143e-07, "loss": 0.3032, "num_input_tokens_seen": 28378624, "step": 9010 }, { "epoch": 0.5771077395813328, "grad_norm": 62.55430603027344, "learning_rate": 9.058338415536962e-07, "loss": 0.3865, "num_input_tokens_seen": 28394048, "step": 9015 }, { "epoch": 0.5774278215223098, "grad_norm": 38.583648681640625, "learning_rate": 9.04721497321432e-07, "loss": 0.3808, "num_input_tokens_seen": 28409664, "step": 9020 }, { "epoch": 0.5777479034632866, "grad_norm": 31.30422592163086, "learning_rate": 9.036092720454977e-07, "loss": 0.3744, "num_input_tokens_seen": 28424768, "step": 9025 }, { "epoch": 0.5780679854042635, "grad_norm": 29.469755172729492, "learning_rate": 9.024971671145189e-07, "loss": 0.3387, "num_input_tokens_seen": 28439424, "step": 9030 }, { "epoch": 0.5783880673452404, "grad_norm": 41.49711608886719, "learning_rate": 9.013851839169718e-07, "loss": 0.4406, "num_input_tokens_seen": 28456064, "step": 9035 }, { "epoch": 0.5787081492862173, "grad_norm": 42.17570495605469, "learning_rate": 9.002733238411801e-07, "loss": 0.3388, "num_input_tokens_seen": 28472768, "step": 9040 }, { "epoch": 0.5790282312271942, "grad_norm": 31.11846160888672, "learning_rate": 8.991615882753147e-07, "loss": 0.3489, "num_input_tokens_seen": 28488704, "step": 9045 }, { "epoch": 0.579348313168171, "grad_norm": 55.96306610107422, "learning_rate": 8.980499786073904e-07, "loss": 0.4431, "num_input_tokens_seen": 28503808, "step": 9050 }, { "epoch": 0.5796683951091479, "grad_norm": 54.62471008300781, "learning_rate": 8.969384962252645e-07, "loss": 0.4759, "num_input_tokens_seen": 28520320, "step": 9055 }, { "epoch": 0.5799884770501248, "grad_norm": 47.783241271972656, "learning_rate": 8.958271425166366e-07, "loss": 0.4431, "num_input_tokens_seen": 28535680, "step": 9060 }, { "epoch": 0.5803085589911017, "grad_norm": 22.617599487304688, "learning_rate": 8.947159188690442e-07, "loss": 0.396, "num_input_tokens_seen": 28551488, "step": 9065 }, { "epoch": 0.5806286409320787, "grad_norm": 67.4439697265625, "learning_rate": 8.93604826669863e-07, "loss": 0.4786, "num_input_tokens_seen": 28567040, "step": 9070 }, { "epoch": 0.5809487228730555, "grad_norm": 26.622365951538086, "learning_rate": 8.924938673063052e-07, "loss": 0.3986, "num_input_tokens_seen": 28581568, "step": 9075 }, { "epoch": 0.5812688048140324, "grad_norm": 15.871992111206055, "learning_rate": 8.913830421654166e-07, "loss": 0.3559, "num_input_tokens_seen": 28596992, "step": 9080 }, { "epoch": 0.5815888867550093, "grad_norm": 22.36756134033203, "learning_rate": 8.902723526340746e-07, "loss": 0.4757, "num_input_tokens_seen": 28613952, "step": 9085 }, { "epoch": 0.5819089686959862, "grad_norm": 26.785381317138672, "learning_rate": 8.89161800098989e-07, "loss": 0.4202, "num_input_tokens_seen": 28628736, "step": 9090 }, { "epoch": 0.5822290506369631, "grad_norm": 54.52938461303711, "learning_rate": 8.880513859466974e-07, "loss": 0.3704, "num_input_tokens_seen": 28644928, "step": 9095 }, { "epoch": 0.5825491325779399, "grad_norm": 17.885007858276367, "learning_rate": 8.869411115635645e-07, "loss": 0.278, "num_input_tokens_seen": 28661184, "step": 9100 }, { "epoch": 0.5828692145189168, "grad_norm": 17.88958740234375, "learning_rate": 8.858309783357816e-07, "loss": 0.2772, "num_input_tokens_seen": 28675776, "step": 9105 }, { "epoch": 0.5831892964598937, "grad_norm": 53.37077713012695, "learning_rate": 8.847209876493629e-07, "loss": 0.4318, "num_input_tokens_seen": 28692160, "step": 9110 }, { "epoch": 0.5835093784008706, "grad_norm": 30.646394729614258, "learning_rate": 8.836111408901441e-07, "loss": 0.2576, "num_input_tokens_seen": 28707328, "step": 9115 }, { "epoch": 0.5838294603418475, "grad_norm": 43.16847610473633, "learning_rate": 8.825014394437828e-07, "loss": 0.4235, "num_input_tokens_seen": 28722624, "step": 9120 }, { "epoch": 0.5841495422828245, "grad_norm": 14.40605640411377, "learning_rate": 8.813918846957542e-07, "loss": 0.3748, "num_input_tokens_seen": 28737856, "step": 9125 }, { "epoch": 0.5844696242238013, "grad_norm": 20.49512481689453, "learning_rate": 8.802824780313499e-07, "loss": 0.4501, "num_input_tokens_seen": 28752448, "step": 9130 }, { "epoch": 0.5847897061647782, "grad_norm": 22.4967098236084, "learning_rate": 8.791732208356771e-07, "loss": 0.3958, "num_input_tokens_seen": 28767616, "step": 9135 }, { "epoch": 0.5851097881057551, "grad_norm": 15.978533744812012, "learning_rate": 8.780641144936573e-07, "loss": 0.4649, "num_input_tokens_seen": 28782400, "step": 9140 }, { "epoch": 0.585429870046732, "grad_norm": 48.71504211425781, "learning_rate": 8.76955160390022e-07, "loss": 0.4457, "num_input_tokens_seen": 28798336, "step": 9145 }, { "epoch": 0.5857499519877089, "grad_norm": 16.611661911010742, "learning_rate": 8.758463599093136e-07, "loss": 0.2868, "num_input_tokens_seen": 28814336, "step": 9150 }, { "epoch": 0.5860700339286857, "grad_norm": 39.33195495605469, "learning_rate": 8.747377144358825e-07, "loss": 0.5273, "num_input_tokens_seen": 28830656, "step": 9155 }, { "epoch": 0.5863901158696626, "grad_norm": 42.789817810058594, "learning_rate": 8.736292253538861e-07, "loss": 0.418, "num_input_tokens_seen": 28846656, "step": 9160 }, { "epoch": 0.5867101978106395, "grad_norm": 33.47774887084961, "learning_rate": 8.725208940472851e-07, "loss": 0.309, "num_input_tokens_seen": 28862848, "step": 9165 }, { "epoch": 0.5870302797516164, "grad_norm": 14.912242889404297, "learning_rate": 8.714127218998448e-07, "loss": 0.4083, "num_input_tokens_seen": 28878400, "step": 9170 }, { "epoch": 0.5873503616925934, "grad_norm": 67.51158905029297, "learning_rate": 8.70304710295131e-07, "loss": 0.5084, "num_input_tokens_seen": 28893568, "step": 9175 }, { "epoch": 0.5876704436335702, "grad_norm": 29.94365692138672, "learning_rate": 8.691968606165092e-07, "loss": 0.367, "num_input_tokens_seen": 28909824, "step": 9180 }, { "epoch": 0.5879905255745471, "grad_norm": 30.510108947753906, "learning_rate": 8.680891742471429e-07, "loss": 0.3078, "num_input_tokens_seen": 28925568, "step": 9185 }, { "epoch": 0.588310607515524, "grad_norm": 27.14842987060547, "learning_rate": 8.669816525699912e-07, "loss": 0.3272, "num_input_tokens_seen": 28941056, "step": 9190 }, { "epoch": 0.5886306894565009, "grad_norm": 36.03899002075195, "learning_rate": 8.658742969678079e-07, "loss": 0.4143, "num_input_tokens_seen": 28955456, "step": 9195 }, { "epoch": 0.5889507713974778, "grad_norm": 33.955684661865234, "learning_rate": 8.647671088231398e-07, "loss": 0.2927, "num_input_tokens_seen": 28971136, "step": 9200 }, { "epoch": 0.5892708533384546, "grad_norm": 57.654293060302734, "learning_rate": 8.636600895183245e-07, "loss": 0.4087, "num_input_tokens_seen": 28988480, "step": 9205 }, { "epoch": 0.5895909352794315, "grad_norm": 45.632225036621094, "learning_rate": 8.625532404354877e-07, "loss": 0.3669, "num_input_tokens_seen": 29004544, "step": 9210 }, { "epoch": 0.5899110172204084, "grad_norm": 14.44135570526123, "learning_rate": 8.614465629565443e-07, "loss": 0.3809, "num_input_tokens_seen": 29019328, "step": 9215 }, { "epoch": 0.5902310991613853, "grad_norm": 24.873798370361328, "learning_rate": 8.603400584631939e-07, "loss": 0.3336, "num_input_tokens_seen": 29034752, "step": 9220 }, { "epoch": 0.5905511811023622, "grad_norm": 34.6170654296875, "learning_rate": 8.592337283369198e-07, "loss": 0.4422, "num_input_tokens_seen": 29050816, "step": 9225 }, { "epoch": 0.5908712630433391, "grad_norm": 26.38481903076172, "learning_rate": 8.581275739589893e-07, "loss": 0.2752, "num_input_tokens_seen": 29065920, "step": 9230 }, { "epoch": 0.591191344984316, "grad_norm": 36.17750549316406, "learning_rate": 8.570215967104481e-07, "loss": 0.483, "num_input_tokens_seen": 29080960, "step": 9235 }, { "epoch": 0.5915114269252929, "grad_norm": 24.824047088623047, "learning_rate": 8.559157979721225e-07, "loss": 0.4786, "num_input_tokens_seen": 29096768, "step": 9240 }, { "epoch": 0.5918315088662698, "grad_norm": 35.19805908203125, "learning_rate": 8.548101791246145e-07, "loss": 0.5513, "num_input_tokens_seen": 29112448, "step": 9245 }, { "epoch": 0.5921515908072467, "grad_norm": 30.23106575012207, "learning_rate": 8.537047415483028e-07, "loss": 0.3392, "num_input_tokens_seen": 29127808, "step": 9250 }, { "epoch": 0.5924716727482235, "grad_norm": 13.602792739868164, "learning_rate": 8.525994866233388e-07, "loss": 0.2774, "num_input_tokens_seen": 29142912, "step": 9255 }, { "epoch": 0.5927917546892004, "grad_norm": 45.087398529052734, "learning_rate": 8.514944157296464e-07, "loss": 0.3847, "num_input_tokens_seen": 29159168, "step": 9260 }, { "epoch": 0.5931118366301773, "grad_norm": 38.43781280517578, "learning_rate": 8.503895302469199e-07, "loss": 0.3826, "num_input_tokens_seen": 29175488, "step": 9265 }, { "epoch": 0.5934319185711542, "grad_norm": 33.70762634277344, "learning_rate": 8.492848315546214e-07, "loss": 0.4143, "num_input_tokens_seen": 29191104, "step": 9270 }, { "epoch": 0.5937520005121311, "grad_norm": 17.961454391479492, "learning_rate": 8.4818032103198e-07, "loss": 0.4172, "num_input_tokens_seen": 29206208, "step": 9275 }, { "epoch": 0.5940720824531079, "grad_norm": 42.23419189453125, "learning_rate": 8.470760000579906e-07, "loss": 0.4169, "num_input_tokens_seen": 29221312, "step": 9280 }, { "epoch": 0.5943921643940849, "grad_norm": 46.78962707519531, "learning_rate": 8.459718700114108e-07, "loss": 0.4932, "num_input_tokens_seen": 29236800, "step": 9285 }, { "epoch": 0.5947122463350618, "grad_norm": 26.358369827270508, "learning_rate": 8.448679322707595e-07, "loss": 0.4521, "num_input_tokens_seen": 29252480, "step": 9290 }, { "epoch": 0.5950323282760387, "grad_norm": 41.36620330810547, "learning_rate": 8.437641882143163e-07, "loss": 0.5845, "num_input_tokens_seen": 29266944, "step": 9295 }, { "epoch": 0.5953524102170156, "grad_norm": 17.812028884887695, "learning_rate": 8.426606392201185e-07, "loss": 0.319, "num_input_tokens_seen": 29282816, "step": 9300 }, { "epoch": 0.5956724921579925, "grad_norm": 22.074562072753906, "learning_rate": 8.415572866659599e-07, "loss": 0.3009, "num_input_tokens_seen": 29297984, "step": 9305 }, { "epoch": 0.5959925740989693, "grad_norm": 24.042194366455078, "learning_rate": 8.404541319293896e-07, "loss": 0.376, "num_input_tokens_seen": 29313664, "step": 9310 }, { "epoch": 0.5963126560399462, "grad_norm": 20.160175323486328, "learning_rate": 8.393511763877086e-07, "loss": 0.5842, "num_input_tokens_seen": 29329472, "step": 9315 }, { "epoch": 0.5966327379809231, "grad_norm": 33.067359924316406, "learning_rate": 8.3824842141797e-07, "loss": 0.4463, "num_input_tokens_seen": 29346048, "step": 9320 }, { "epoch": 0.5969528199219, "grad_norm": 27.763477325439453, "learning_rate": 8.371458683969765e-07, "loss": 0.3801, "num_input_tokens_seen": 29361664, "step": 9325 }, { "epoch": 0.5972729018628768, "grad_norm": 23.89577865600586, "learning_rate": 8.360435187012787e-07, "loss": 0.3887, "num_input_tokens_seen": 29376896, "step": 9330 }, { "epoch": 0.5975929838038538, "grad_norm": 36.93418502807617, "learning_rate": 8.349413737071725e-07, "loss": 0.3767, "num_input_tokens_seen": 29392640, "step": 9335 }, { "epoch": 0.5979130657448307, "grad_norm": 29.668235778808594, "learning_rate": 8.338394347906994e-07, "loss": 0.4399, "num_input_tokens_seen": 29407808, "step": 9340 }, { "epoch": 0.5982331476858076, "grad_norm": 36.61244201660156, "learning_rate": 8.327377033276431e-07, "loss": 0.2995, "num_input_tokens_seen": 29422528, "step": 9345 }, { "epoch": 0.5985532296267845, "grad_norm": 25.591800689697266, "learning_rate": 8.316361806935279e-07, "loss": 0.3481, "num_input_tokens_seen": 29438272, "step": 9350 }, { "epoch": 0.5988733115677614, "grad_norm": 30.289875030517578, "learning_rate": 8.305348682636177e-07, "loss": 0.4557, "num_input_tokens_seen": 29453376, "step": 9355 }, { "epoch": 0.5991933935087382, "grad_norm": 33.169734954833984, "learning_rate": 8.294337674129144e-07, "loss": 0.4204, "num_input_tokens_seen": 29469248, "step": 9360 }, { "epoch": 0.5995134754497151, "grad_norm": 35.08827209472656, "learning_rate": 8.283328795161554e-07, "loss": 0.2783, "num_input_tokens_seen": 29485888, "step": 9365 }, { "epoch": 0.599833557390692, "grad_norm": 28.095083236694336, "learning_rate": 8.272322059478114e-07, "loss": 0.3194, "num_input_tokens_seen": 29500864, "step": 9370 }, { "epoch": 0.6001536393316689, "grad_norm": 18.85226821899414, "learning_rate": 8.261317480820871e-07, "loss": 0.2312, "num_input_tokens_seen": 29516288, "step": 9375 }, { "epoch": 0.6004737212726458, "grad_norm": 34.60100173950195, "learning_rate": 8.250315072929168e-07, "loss": 0.4, "num_input_tokens_seen": 29530880, "step": 9380 }, { "epoch": 0.6007297868254273, "eval_loss": 0.3916759490966797, "eval_runtime": 49.1281, "eval_samples_per_second": 282.649, "eval_steps_per_second": 35.336, "num_input_tokens_seen": 29544576, "step": 9384 }, { "epoch": 0.6007938032136226, "grad_norm": 20.751314163208008, "learning_rate": 8.239314849539637e-07, "loss": 0.3513, "num_input_tokens_seen": 29547840, "step": 9385 }, { "epoch": 0.6011138851545996, "grad_norm": 31.6501522064209, "learning_rate": 8.228316824386193e-07, "loss": 0.4204, "num_input_tokens_seen": 29564096, "step": 9390 }, { "epoch": 0.6014339670955765, "grad_norm": 33.23552322387695, "learning_rate": 8.217321011199995e-07, "loss": 0.3633, "num_input_tokens_seen": 29579520, "step": 9395 }, { "epoch": 0.6017540490365534, "grad_norm": 49.13716125488281, "learning_rate": 8.206327423709441e-07, "loss": 0.4256, "num_input_tokens_seen": 29594048, "step": 9400 }, { "epoch": 0.6020741309775303, "grad_norm": 23.02613067626953, "learning_rate": 8.195336075640163e-07, "loss": 0.3871, "num_input_tokens_seen": 29610368, "step": 9405 }, { "epoch": 0.6023942129185071, "grad_norm": 32.443267822265625, "learning_rate": 8.184346980714984e-07, "loss": 0.4232, "num_input_tokens_seen": 29625792, "step": 9410 }, { "epoch": 0.602714294859484, "grad_norm": 40.73899459838867, "learning_rate": 8.173360152653914e-07, "loss": 0.3399, "num_input_tokens_seen": 29642240, "step": 9415 }, { "epoch": 0.6030343768004609, "grad_norm": 28.00251007080078, "learning_rate": 8.162375605174143e-07, "loss": 0.293, "num_input_tokens_seen": 29658176, "step": 9420 }, { "epoch": 0.6033544587414378, "grad_norm": 26.76416778564453, "learning_rate": 8.151393351990005e-07, "loss": 0.3118, "num_input_tokens_seen": 29675392, "step": 9425 }, { "epoch": 0.6036745406824147, "grad_norm": 29.030107498168945, "learning_rate": 8.140413406812971e-07, "loss": 0.4241, "num_input_tokens_seen": 29690048, "step": 9430 }, { "epoch": 0.6039946226233915, "grad_norm": 33.374656677246094, "learning_rate": 8.129435783351635e-07, "loss": 0.3052, "num_input_tokens_seen": 29705088, "step": 9435 }, { "epoch": 0.6043147045643685, "grad_norm": 29.674457550048828, "learning_rate": 8.118460495311685e-07, "loss": 0.4482, "num_input_tokens_seen": 29720576, "step": 9440 }, { "epoch": 0.6046347865053454, "grad_norm": 30.353450775146484, "learning_rate": 8.107487556395901e-07, "loss": 0.4204, "num_input_tokens_seen": 29736896, "step": 9445 }, { "epoch": 0.6049548684463223, "grad_norm": 29.06775665283203, "learning_rate": 8.096516980304115e-07, "loss": 0.3567, "num_input_tokens_seen": 29752768, "step": 9450 }, { "epoch": 0.6052749503872992, "grad_norm": 50.72957229614258, "learning_rate": 8.085548780733238e-07, "loss": 0.3355, "num_input_tokens_seen": 29768640, "step": 9455 }, { "epoch": 0.605595032328276, "grad_norm": 32.87676239013672, "learning_rate": 8.074582971377182e-07, "loss": 0.338, "num_input_tokens_seen": 29786240, "step": 9460 }, { "epoch": 0.6059151142692529, "grad_norm": 40.09199142456055, "learning_rate": 8.063619565926892e-07, "loss": 0.4356, "num_input_tokens_seen": 29802176, "step": 9465 }, { "epoch": 0.6062351962102298, "grad_norm": 16.3148250579834, "learning_rate": 8.052658578070313e-07, "loss": 0.3912, "num_input_tokens_seen": 29817600, "step": 9470 }, { "epoch": 0.6065552781512067, "grad_norm": 13.280025482177734, "learning_rate": 8.041700021492362e-07, "loss": 0.3313, "num_input_tokens_seen": 29832960, "step": 9475 }, { "epoch": 0.6068753600921836, "grad_norm": 23.65538215637207, "learning_rate": 8.030743909874924e-07, "loss": 0.2888, "num_input_tokens_seen": 29848448, "step": 9480 }, { "epoch": 0.6071954420331604, "grad_norm": 16.695858001708984, "learning_rate": 8.019790256896839e-07, "loss": 0.3247, "num_input_tokens_seen": 29863296, "step": 9485 }, { "epoch": 0.6075155239741373, "grad_norm": 45.717647552490234, "learning_rate": 8.008839076233871e-07, "loss": 0.3806, "num_input_tokens_seen": 29880128, "step": 9490 }, { "epoch": 0.6078356059151143, "grad_norm": 24.243160247802734, "learning_rate": 7.997890381558691e-07, "loss": 0.3618, "num_input_tokens_seen": 29895296, "step": 9495 }, { "epoch": 0.6081556878560912, "grad_norm": 33.516685485839844, "learning_rate": 7.986944186540878e-07, "loss": 0.4291, "num_input_tokens_seen": 29911296, "step": 9500 }, { "epoch": 0.6084757697970681, "grad_norm": 45.87578582763672, "learning_rate": 7.976000504846885e-07, "loss": 0.4594, "num_input_tokens_seen": 29926912, "step": 9505 }, { "epoch": 0.608795851738045, "grad_norm": 104.76370239257812, "learning_rate": 7.965059350140024e-07, "loss": 0.4726, "num_input_tokens_seen": 29942272, "step": 9510 }, { "epoch": 0.6091159336790218, "grad_norm": 38.258480072021484, "learning_rate": 7.954120736080461e-07, "loss": 0.4037, "num_input_tokens_seen": 29958016, "step": 9515 }, { "epoch": 0.6094360156199987, "grad_norm": 24.145002365112305, "learning_rate": 7.943184676325178e-07, "loss": 0.5797, "num_input_tokens_seen": 29974720, "step": 9520 }, { "epoch": 0.6097560975609756, "grad_norm": 27.14354133605957, "learning_rate": 7.932251184527974e-07, "loss": 0.4342, "num_input_tokens_seen": 29991680, "step": 9525 }, { "epoch": 0.6100761795019525, "grad_norm": 27.287010192871094, "learning_rate": 7.921320274339446e-07, "loss": 0.2753, "num_input_tokens_seen": 30007168, "step": 9530 }, { "epoch": 0.6103962614429294, "grad_norm": 39.53981018066406, "learning_rate": 7.910391959406966e-07, "loss": 0.3337, "num_input_tokens_seen": 30022656, "step": 9535 }, { "epoch": 0.6107163433839062, "grad_norm": 33.61812210083008, "learning_rate": 7.899466253374653e-07, "loss": 0.3943, "num_input_tokens_seen": 30038144, "step": 9540 }, { "epoch": 0.6110364253248832, "grad_norm": 34.27006149291992, "learning_rate": 7.88854316988339e-07, "loss": 0.3347, "num_input_tokens_seen": 30055488, "step": 9545 }, { "epoch": 0.6113565072658601, "grad_norm": 39.317073822021484, "learning_rate": 7.877622722570771e-07, "loss": 0.3016, "num_input_tokens_seen": 30071040, "step": 9550 }, { "epoch": 0.611676589206837, "grad_norm": 23.81880760192871, "learning_rate": 7.866704925071101e-07, "loss": 0.4185, "num_input_tokens_seen": 30088000, "step": 9555 }, { "epoch": 0.6119966711478139, "grad_norm": 24.980806350708008, "learning_rate": 7.855789791015377e-07, "loss": 0.422, "num_input_tokens_seen": 30103040, "step": 9560 }, { "epoch": 0.6123167530887907, "grad_norm": 42.49583053588867, "learning_rate": 7.844877334031277e-07, "loss": 0.3946, "num_input_tokens_seen": 30117760, "step": 9565 }, { "epoch": 0.6126368350297676, "grad_norm": 32.370361328125, "learning_rate": 7.833967567743131e-07, "loss": 0.4797, "num_input_tokens_seen": 30133888, "step": 9570 }, { "epoch": 0.6129569169707445, "grad_norm": 30.043428421020508, "learning_rate": 7.823060505771903e-07, "loss": 0.3747, "num_input_tokens_seen": 30149312, "step": 9575 }, { "epoch": 0.6132769989117214, "grad_norm": 39.43803787231445, "learning_rate": 7.812156161735199e-07, "loss": 0.3944, "num_input_tokens_seen": 30163840, "step": 9580 }, { "epoch": 0.6135970808526983, "grad_norm": 69.42517852783203, "learning_rate": 7.801254549247215e-07, "loss": 0.5462, "num_input_tokens_seen": 30180544, "step": 9585 }, { "epoch": 0.6139171627936751, "grad_norm": 18.023378372192383, "learning_rate": 7.790355681918739e-07, "loss": 0.3212, "num_input_tokens_seen": 30197120, "step": 9590 }, { "epoch": 0.614237244734652, "grad_norm": 52.89658737182617, "learning_rate": 7.779459573357144e-07, "loss": 0.421, "num_input_tokens_seen": 30213376, "step": 9595 }, { "epoch": 0.614557326675629, "grad_norm": 20.749906539916992, "learning_rate": 7.768566237166338e-07, "loss": 0.4225, "num_input_tokens_seen": 30229120, "step": 9600 }, { "epoch": 0.6148774086166059, "grad_norm": 45.14435958862305, "learning_rate": 7.757675686946786e-07, "loss": 0.5064, "num_input_tokens_seen": 30244544, "step": 9605 }, { "epoch": 0.6151974905575828, "grad_norm": 31.990671157836914, "learning_rate": 7.746787936295468e-07, "loss": 0.4207, "num_input_tokens_seen": 30260864, "step": 9610 }, { "epoch": 0.6155175724985597, "grad_norm": 42.7758674621582, "learning_rate": 7.735902998805868e-07, "loss": 0.3739, "num_input_tokens_seen": 30275456, "step": 9615 }, { "epoch": 0.6158376544395365, "grad_norm": 42.92548751831055, "learning_rate": 7.725020888067955e-07, "loss": 0.4195, "num_input_tokens_seen": 30291008, "step": 9620 }, { "epoch": 0.6161577363805134, "grad_norm": 18.282148361206055, "learning_rate": 7.714141617668176e-07, "loss": 0.4814, "num_input_tokens_seen": 30306816, "step": 9625 }, { "epoch": 0.6164778183214903, "grad_norm": 25.10959815979004, "learning_rate": 7.703265201189426e-07, "loss": 0.3298, "num_input_tokens_seen": 30322240, "step": 9630 }, { "epoch": 0.6167979002624672, "grad_norm": 17.638351440429688, "learning_rate": 7.692391652211036e-07, "loss": 0.3357, "num_input_tokens_seen": 30338048, "step": 9635 }, { "epoch": 0.617117982203444, "grad_norm": 40.34111404418945, "learning_rate": 7.681520984308769e-07, "loss": 0.3313, "num_input_tokens_seen": 30353984, "step": 9640 }, { "epoch": 0.6174380641444209, "grad_norm": 39.976497650146484, "learning_rate": 7.670653211054772e-07, "loss": 0.4902, "num_input_tokens_seen": 30370048, "step": 9645 }, { "epoch": 0.6177581460853978, "grad_norm": 35.88365936279297, "learning_rate": 7.659788346017591e-07, "loss": 0.413, "num_input_tokens_seen": 30385344, "step": 9650 }, { "epoch": 0.6180782280263748, "grad_norm": 35.93766784667969, "learning_rate": 7.648926402762133e-07, "loss": 0.3813, "num_input_tokens_seen": 30400576, "step": 9655 }, { "epoch": 0.6183983099673517, "grad_norm": 38.066795349121094, "learning_rate": 7.638067394849671e-07, "loss": 0.3867, "num_input_tokens_seen": 30415424, "step": 9660 }, { "epoch": 0.6187183919083286, "grad_norm": 44.817840576171875, "learning_rate": 7.627211335837797e-07, "loss": 0.4056, "num_input_tokens_seen": 30430592, "step": 9665 }, { "epoch": 0.6190384738493054, "grad_norm": 21.97688865661621, "learning_rate": 7.616358239280427e-07, "loss": 0.4352, "num_input_tokens_seen": 30445952, "step": 9670 }, { "epoch": 0.6193585557902823, "grad_norm": 30.693403244018555, "learning_rate": 7.605508118727787e-07, "loss": 0.3274, "num_input_tokens_seen": 30461568, "step": 9675 }, { "epoch": 0.6196786377312592, "grad_norm": 25.588163375854492, "learning_rate": 7.594660987726373e-07, "loss": 0.3611, "num_input_tokens_seen": 30476672, "step": 9680 }, { "epoch": 0.6199987196722361, "grad_norm": 42.19605255126953, "learning_rate": 7.583816859818956e-07, "loss": 0.4013, "num_input_tokens_seen": 30492672, "step": 9685 }, { "epoch": 0.620318801613213, "grad_norm": 23.23065948486328, "learning_rate": 7.57297574854456e-07, "loss": 0.3785, "num_input_tokens_seen": 30507712, "step": 9690 }, { "epoch": 0.6206388835541898, "grad_norm": 72.38654327392578, "learning_rate": 7.56213766743844e-07, "loss": 0.4395, "num_input_tokens_seen": 30524032, "step": 9695 }, { "epoch": 0.6209589654951667, "grad_norm": 16.888713836669922, "learning_rate": 7.551302630032064e-07, "loss": 0.333, "num_input_tokens_seen": 30539776, "step": 9700 }, { "epoch": 0.6212790474361437, "grad_norm": 18.87851905822754, "learning_rate": 7.540470649853106e-07, "loss": 0.3693, "num_input_tokens_seen": 30554752, "step": 9705 }, { "epoch": 0.6215991293771206, "grad_norm": 25.751543045043945, "learning_rate": 7.529641740425419e-07, "loss": 0.4034, "num_input_tokens_seen": 30571968, "step": 9710 }, { "epoch": 0.6219192113180975, "grad_norm": 30.196582794189453, "learning_rate": 7.518815915269023e-07, "loss": 0.4351, "num_input_tokens_seen": 30587264, "step": 9715 }, { "epoch": 0.6222392932590743, "grad_norm": 18.58189582824707, "learning_rate": 7.507993187900092e-07, "loss": 0.3948, "num_input_tokens_seen": 30603200, "step": 9720 }, { "epoch": 0.6225593752000512, "grad_norm": 29.565282821655273, "learning_rate": 7.497173571830926e-07, "loss": 0.4253, "num_input_tokens_seen": 30617856, "step": 9725 }, { "epoch": 0.6228794571410281, "grad_norm": 43.09429168701172, "learning_rate": 7.486357080569938e-07, "loss": 0.4732, "num_input_tokens_seen": 30632448, "step": 9730 }, { "epoch": 0.623199539082005, "grad_norm": 23.361135482788086, "learning_rate": 7.47554372762165e-07, "loss": 0.3747, "num_input_tokens_seen": 30647680, "step": 9735 }, { "epoch": 0.6235196210229819, "grad_norm": 59.268245697021484, "learning_rate": 7.464733526486662e-07, "loss": 0.4905, "num_input_tokens_seen": 30663616, "step": 9740 }, { "epoch": 0.6238397029639587, "grad_norm": 40.878173828125, "learning_rate": 7.453926490661628e-07, "loss": 0.3424, "num_input_tokens_seen": 30682496, "step": 9745 }, { "epoch": 0.6241597849049356, "grad_norm": 51.57231521606445, "learning_rate": 7.443122633639267e-07, "loss": 0.3639, "num_input_tokens_seen": 30697664, "step": 9750 }, { "epoch": 0.6244798668459125, "grad_norm": 61.236114501953125, "learning_rate": 7.432321968908319e-07, "loss": 0.3835, "num_input_tokens_seen": 30713408, "step": 9755 }, { "epoch": 0.6247999487868895, "grad_norm": 22.234743118286133, "learning_rate": 7.421524509953543e-07, "loss": 0.3173, "num_input_tokens_seen": 30730496, "step": 9760 }, { "epoch": 0.6251200307278664, "grad_norm": 29.365135192871094, "learning_rate": 7.410730270255687e-07, "loss": 0.4158, "num_input_tokens_seen": 30745664, "step": 9765 }, { "epoch": 0.6254401126688433, "grad_norm": 32.928707122802734, "learning_rate": 7.399939263291493e-07, "loss": 0.3655, "num_input_tokens_seen": 30760960, "step": 9770 }, { "epoch": 0.6257601946098201, "grad_norm": 33.555416107177734, "learning_rate": 7.389151502533657e-07, "loss": 0.4854, "num_input_tokens_seen": 30775872, "step": 9775 }, { "epoch": 0.626080276550797, "grad_norm": 17.007144927978516, "learning_rate": 7.378367001450819e-07, "loss": 0.3683, "num_input_tokens_seen": 30791424, "step": 9780 }, { "epoch": 0.6264003584917739, "grad_norm": 55.41214370727539, "learning_rate": 7.367585773507567e-07, "loss": 0.4317, "num_input_tokens_seen": 30807680, "step": 9785 }, { "epoch": 0.6267204404327508, "grad_norm": 41.18684387207031, "learning_rate": 7.356807832164385e-07, "loss": 0.4428, "num_input_tokens_seen": 30823680, "step": 9790 }, { "epoch": 0.6270405223737276, "grad_norm": 18.37259292602539, "learning_rate": 7.346033190877654e-07, "loss": 0.4404, "num_input_tokens_seen": 30839360, "step": 9795 }, { "epoch": 0.6273606043147045, "grad_norm": 36.56877136230469, "learning_rate": 7.335261863099651e-07, "loss": 0.3596, "num_input_tokens_seen": 30854784, "step": 9800 }, { "epoch": 0.6276806862556814, "grad_norm": 31.16109275817871, "learning_rate": 7.324493862278498e-07, "loss": 0.3969, "num_input_tokens_seen": 30870592, "step": 9805 }, { "epoch": 0.6280007681966584, "grad_norm": 39.3582649230957, "learning_rate": 7.313729201858167e-07, "loss": 0.4546, "num_input_tokens_seen": 30885952, "step": 9810 }, { "epoch": 0.6283208501376353, "grad_norm": 21.64111328125, "learning_rate": 7.302967895278473e-07, "loss": 0.3285, "num_input_tokens_seen": 30902080, "step": 9815 }, { "epoch": 0.6286409320786122, "grad_norm": 29.953590393066406, "learning_rate": 7.292209955975028e-07, "loss": 0.4045, "num_input_tokens_seen": 30919232, "step": 9820 }, { "epoch": 0.628961014019589, "grad_norm": 37.44114685058594, "learning_rate": 7.281455397379244e-07, "loss": 0.4068, "num_input_tokens_seen": 30936448, "step": 9825 }, { "epoch": 0.6292810959605659, "grad_norm": 37.291465759277344, "learning_rate": 7.270704232918316e-07, "loss": 0.3249, "num_input_tokens_seen": 30952256, "step": 9830 }, { "epoch": 0.6296011779015428, "grad_norm": 53.09471130371094, "learning_rate": 7.2599564760152e-07, "loss": 0.401, "num_input_tokens_seen": 30967360, "step": 9835 }, { "epoch": 0.6299212598425197, "grad_norm": 31.090974807739258, "learning_rate": 7.249212140088592e-07, "loss": 0.3851, "num_input_tokens_seen": 30982016, "step": 9840 }, { "epoch": 0.6302413417834966, "grad_norm": 19.057065963745117, "learning_rate": 7.23847123855293e-07, "loss": 0.3347, "num_input_tokens_seen": 30998080, "step": 9845 }, { "epoch": 0.6305614237244734, "grad_norm": 22.847869873046875, "learning_rate": 7.227733784818349e-07, "loss": 0.274, "num_input_tokens_seen": 31013184, "step": 9850 }, { "epoch": 0.6308815056654503, "grad_norm": 10.46581745147705, "learning_rate": 7.216999792290683e-07, "loss": 0.3758, "num_input_tokens_seen": 31028800, "step": 9855 }, { "epoch": 0.6312015876064272, "grad_norm": 32.733524322509766, "learning_rate": 7.206269274371457e-07, "loss": 0.4837, "num_input_tokens_seen": 31044736, "step": 9860 }, { "epoch": 0.6315216695474042, "grad_norm": 16.939966201782227, "learning_rate": 7.195542244457845e-07, "loss": 0.3489, "num_input_tokens_seen": 31059968, "step": 9865 }, { "epoch": 0.6318417514883811, "grad_norm": 21.10120391845703, "learning_rate": 7.184818715942666e-07, "loss": 0.3215, "num_input_tokens_seen": 31074880, "step": 9870 }, { "epoch": 0.6321618334293579, "grad_norm": 32.21525573730469, "learning_rate": 7.174098702214374e-07, "loss": 0.3499, "num_input_tokens_seen": 31090432, "step": 9875 }, { "epoch": 0.6324819153703348, "grad_norm": 27.200403213500977, "learning_rate": 7.163382216657033e-07, "loss": 0.372, "num_input_tokens_seen": 31107264, "step": 9880 }, { "epoch": 0.6328019973113117, "grad_norm": 50.323707580566406, "learning_rate": 7.152669272650302e-07, "loss": 0.3531, "num_input_tokens_seen": 31124096, "step": 9885 }, { "epoch": 0.6331220792522886, "grad_norm": 51.96805191040039, "learning_rate": 7.141959883569411e-07, "loss": 0.3881, "num_input_tokens_seen": 31138752, "step": 9890 }, { "epoch": 0.6334421611932655, "grad_norm": 28.28093147277832, "learning_rate": 7.131254062785165e-07, "loss": 0.4624, "num_input_tokens_seen": 31154048, "step": 9895 }, { "epoch": 0.6337622431342423, "grad_norm": 26.130292892456055, "learning_rate": 7.120551823663907e-07, "loss": 0.5159, "num_input_tokens_seen": 31170304, "step": 9900 }, { "epoch": 0.6340823250752192, "grad_norm": 13.736687660217285, "learning_rate": 7.109853179567499e-07, "loss": 0.2778, "num_input_tokens_seen": 31186368, "step": 9905 }, { "epoch": 0.6344024070161961, "grad_norm": 22.524595260620117, "learning_rate": 7.099158143853337e-07, "loss": 0.4266, "num_input_tokens_seen": 31201664, "step": 9910 }, { "epoch": 0.634722488957173, "grad_norm": 48.70823287963867, "learning_rate": 7.088466729874289e-07, "loss": 0.396, "num_input_tokens_seen": 31217216, "step": 9915 }, { "epoch": 0.63504257089815, "grad_norm": 29.29494285583496, "learning_rate": 7.077778950978713e-07, "loss": 0.3762, "num_input_tokens_seen": 31233728, "step": 9920 }, { "epoch": 0.6353626528391269, "grad_norm": 17.022003173828125, "learning_rate": 7.06709482051043e-07, "loss": 0.4657, "num_input_tokens_seen": 31249664, "step": 9925 }, { "epoch": 0.6356827347801037, "grad_norm": 18.935352325439453, "learning_rate": 7.056414351808698e-07, "loss": 0.2958, "num_input_tokens_seen": 31265408, "step": 9930 }, { "epoch": 0.6360028167210806, "grad_norm": 25.703018188476562, "learning_rate": 7.045737558208206e-07, "loss": 0.3557, "num_input_tokens_seen": 31281088, "step": 9935 }, { "epoch": 0.6363228986620575, "grad_norm": 28.873281478881836, "learning_rate": 7.035064453039064e-07, "loss": 0.4025, "num_input_tokens_seen": 31296512, "step": 9940 }, { "epoch": 0.6366429806030344, "grad_norm": 14.413522720336914, "learning_rate": 7.024395049626766e-07, "loss": 0.3796, "num_input_tokens_seen": 31312000, "step": 9945 }, { "epoch": 0.6369630625440112, "grad_norm": 42.59180450439453, "learning_rate": 7.013729361292182e-07, "loss": 0.3378, "num_input_tokens_seen": 31327488, "step": 9950 }, { "epoch": 0.6372831444849881, "grad_norm": 37.88176727294922, "learning_rate": 7.003067401351554e-07, "loss": 0.2992, "num_input_tokens_seen": 31343936, "step": 9955 }, { "epoch": 0.637603226425965, "grad_norm": 69.40776062011719, "learning_rate": 6.992409183116465e-07, "loss": 0.3971, "num_input_tokens_seen": 31359232, "step": 9960 }, { "epoch": 0.6379233083669419, "grad_norm": 18.821264266967773, "learning_rate": 6.981754719893826e-07, "loss": 0.3715, "num_input_tokens_seen": 31375616, "step": 9965 }, { "epoch": 0.6382433903079189, "grad_norm": 49.44694137573242, "learning_rate": 6.971104024985852e-07, "loss": 0.4687, "num_input_tokens_seen": 31391680, "step": 9970 }, { "epoch": 0.6385634722488958, "grad_norm": 28.005033493041992, "learning_rate": 6.960457111690068e-07, "loss": 0.3829, "num_input_tokens_seen": 31407424, "step": 9975 }, { "epoch": 0.6388835541898726, "grad_norm": 18.54348373413086, "learning_rate": 6.94981399329927e-07, "loss": 0.3854, "num_input_tokens_seen": 31422912, "step": 9980 }, { "epoch": 0.6392036361308495, "grad_norm": 53.93000030517578, "learning_rate": 6.939174683101509e-07, "loss": 0.3806, "num_input_tokens_seen": 31438912, "step": 9985 }, { "epoch": 0.6395237180718264, "grad_norm": 25.696611404418945, "learning_rate": 6.9285391943801e-07, "loss": 0.2888, "num_input_tokens_seen": 31455168, "step": 9990 }, { "epoch": 0.6398438000128033, "grad_norm": 32.05419158935547, "learning_rate": 6.917907540413569e-07, "loss": 0.32, "num_input_tokens_seen": 31470592, "step": 9995 }, { "epoch": 0.6401638819537802, "grad_norm": 31.22829818725586, "learning_rate": 6.907279734475659e-07, "loss": 0.3466, "num_input_tokens_seen": 31485632, "step": 10000 }, { "epoch": 0.640483963894757, "grad_norm": 47.022422790527344, "learning_rate": 6.896655789835317e-07, "loss": 0.353, "num_input_tokens_seen": 31500352, "step": 10005 }, { "epoch": 0.6408040458357339, "grad_norm": 38.05258560180664, "learning_rate": 6.886035719756656e-07, "loss": 0.365, "num_input_tokens_seen": 31516928, "step": 10010 }, { "epoch": 0.6411241277767108, "grad_norm": 19.052812576293945, "learning_rate": 6.875419537498959e-07, "loss": 0.272, "num_input_tokens_seen": 31532608, "step": 10015 }, { "epoch": 0.6414442097176877, "grad_norm": 54.83672332763672, "learning_rate": 6.864807256316658e-07, "loss": 0.5903, "num_input_tokens_seen": 31548608, "step": 10020 }, { "epoch": 0.6417642916586647, "grad_norm": 21.3775691986084, "learning_rate": 6.854198889459311e-07, "loss": 0.4124, "num_input_tokens_seen": 31564224, "step": 10025 }, { "epoch": 0.6420843735996415, "grad_norm": 8.760631561279297, "learning_rate": 6.84359445017158e-07, "loss": 0.2575, "num_input_tokens_seen": 31579200, "step": 10030 }, { "epoch": 0.6424044555406184, "grad_norm": 46.2386589050293, "learning_rate": 6.832993951693244e-07, "loss": 0.4146, "num_input_tokens_seen": 31594816, "step": 10035 }, { "epoch": 0.6427245374815953, "grad_norm": 14.027458190917969, "learning_rate": 6.822397407259144e-07, "loss": 0.3439, "num_input_tokens_seen": 31610432, "step": 10040 }, { "epoch": 0.6430446194225722, "grad_norm": 40.68043899536133, "learning_rate": 6.811804830099186e-07, "loss": 0.3688, "num_input_tokens_seen": 31627520, "step": 10045 }, { "epoch": 0.6433647013635491, "grad_norm": 46.507389068603516, "learning_rate": 6.801216233438336e-07, "loss": 0.3446, "num_input_tokens_seen": 31644352, "step": 10050 }, { "epoch": 0.6436847833045259, "grad_norm": 30.864545822143555, "learning_rate": 6.790631630496575e-07, "loss": 0.3831, "num_input_tokens_seen": 31660160, "step": 10055 }, { "epoch": 0.6440048652455028, "grad_norm": 34.4333610534668, "learning_rate": 6.780051034488903e-07, "loss": 0.4395, "num_input_tokens_seen": 31676352, "step": 10060 }, { "epoch": 0.6443249471864797, "grad_norm": 80.57857513427734, "learning_rate": 6.769474458625323e-07, "loss": 0.3439, "num_input_tokens_seen": 31692160, "step": 10065 }, { "epoch": 0.6446450291274566, "grad_norm": 17.465198516845703, "learning_rate": 6.758901916110813e-07, "loss": 0.3099, "num_input_tokens_seen": 31707712, "step": 10070 }, { "epoch": 0.6449651110684336, "grad_norm": 16.61797523498535, "learning_rate": 6.748333420145315e-07, "loss": 0.3246, "num_input_tokens_seen": 31723776, "step": 10075 }, { "epoch": 0.6452851930094105, "grad_norm": 21.947399139404297, "learning_rate": 6.737768983923718e-07, "loss": 0.3972, "num_input_tokens_seen": 31740672, "step": 10080 }, { "epoch": 0.6456052749503873, "grad_norm": 32.63840103149414, "learning_rate": 6.727208620635849e-07, "loss": 0.2989, "num_input_tokens_seen": 31755648, "step": 10085 }, { "epoch": 0.6459253568913642, "grad_norm": 37.15324783325195, "learning_rate": 6.716652343466446e-07, "loss": 0.4543, "num_input_tokens_seen": 31770624, "step": 10090 }, { "epoch": 0.6462454388323411, "grad_norm": 40.501869201660156, "learning_rate": 6.706100165595139e-07, "loss": 0.3094, "num_input_tokens_seen": 31786816, "step": 10095 }, { "epoch": 0.646565520773318, "grad_norm": 33.34444808959961, "learning_rate": 6.695552100196452e-07, "loss": 0.396, "num_input_tokens_seen": 31801792, "step": 10100 }, { "epoch": 0.6468856027142948, "grad_norm": 66.4857177734375, "learning_rate": 6.685008160439769e-07, "loss": 0.5142, "num_input_tokens_seen": 31818944, "step": 10105 }, { "epoch": 0.6472056846552717, "grad_norm": 35.997623443603516, "learning_rate": 6.674468359489313e-07, "loss": 0.4128, "num_input_tokens_seen": 31834176, "step": 10110 }, { "epoch": 0.6475257665962486, "grad_norm": 34.614864349365234, "learning_rate": 6.663932710504163e-07, "loss": 0.3496, "num_input_tokens_seen": 31850176, "step": 10115 }, { "epoch": 0.6478458485372255, "grad_norm": 44.908058166503906, "learning_rate": 6.653401226638192e-07, "loss": 0.3894, "num_input_tokens_seen": 31865600, "step": 10120 }, { "epoch": 0.6481659304782024, "grad_norm": 23.878267288208008, "learning_rate": 6.64287392104008e-07, "loss": 0.3921, "num_input_tokens_seen": 31880512, "step": 10125 }, { "epoch": 0.6484860124191794, "grad_norm": 23.294103622436523, "learning_rate": 6.632350806853299e-07, "loss": 0.4388, "num_input_tokens_seen": 31896512, "step": 10130 }, { "epoch": 0.6488060943601562, "grad_norm": 41.51081848144531, "learning_rate": 6.621831897216074e-07, "loss": 0.4029, "num_input_tokens_seen": 31912768, "step": 10135 }, { "epoch": 0.6491261763011331, "grad_norm": 166.3177032470703, "learning_rate": 6.611317205261387e-07, "loss": 0.4345, "num_input_tokens_seen": 31927488, "step": 10140 }, { "epoch": 0.64944625824211, "grad_norm": 28.243562698364258, "learning_rate": 6.60080674411696e-07, "loss": 0.3416, "num_input_tokens_seen": 31942784, "step": 10145 }, { "epoch": 0.6497663401830869, "grad_norm": 15.446354866027832, "learning_rate": 6.590300526905225e-07, "loss": 0.3172, "num_input_tokens_seen": 31958528, "step": 10150 }, { "epoch": 0.6500864221240638, "grad_norm": 35.624691009521484, "learning_rate": 6.579798566743313e-07, "loss": 0.4676, "num_input_tokens_seen": 31974016, "step": 10155 }, { "epoch": 0.6504065040650406, "grad_norm": 41.85055160522461, "learning_rate": 6.569300876743049e-07, "loss": 0.3143, "num_input_tokens_seen": 31990720, "step": 10160 }, { "epoch": 0.6507265860060175, "grad_norm": 31.454090118408203, "learning_rate": 6.558807470010923e-07, "loss": 0.3188, "num_input_tokens_seen": 32007168, "step": 10165 }, { "epoch": 0.6507906023942129, "eval_loss": 0.37842774391174316, "eval_runtime": 49.1741, "eval_samples_per_second": 282.384, "eval_steps_per_second": 35.303, "num_input_tokens_seen": 32010176, "step": 10166 }, { "epoch": 0.6510466679469944, "grad_norm": 29.121973037719727, "learning_rate": 6.548318359648071e-07, "loss": 0.3642, "num_input_tokens_seen": 32022208, "step": 10170 }, { "epoch": 0.6513667498879713, "grad_norm": 41.79141616821289, "learning_rate": 6.537833558750279e-07, "loss": 0.3967, "num_input_tokens_seen": 32037760, "step": 10175 }, { "epoch": 0.6516868318289483, "grad_norm": 48.895450592041016, "learning_rate": 6.527353080407938e-07, "loss": 0.3055, "num_input_tokens_seen": 32052800, "step": 10180 }, { "epoch": 0.6520069137699251, "grad_norm": 25.074914932250977, "learning_rate": 6.516876937706048e-07, "loss": 0.3366, "num_input_tokens_seen": 32068288, "step": 10185 }, { "epoch": 0.652326995710902, "grad_norm": 24.659767150878906, "learning_rate": 6.506405143724196e-07, "loss": 0.3758, "num_input_tokens_seen": 32083200, "step": 10190 }, { "epoch": 0.6526470776518789, "grad_norm": 51.243431091308594, "learning_rate": 6.495937711536546e-07, "loss": 0.4635, "num_input_tokens_seen": 32098432, "step": 10195 }, { "epoch": 0.6529671595928558, "grad_norm": 37.73176574707031, "learning_rate": 6.485474654211803e-07, "loss": 0.4226, "num_input_tokens_seen": 32114944, "step": 10200 }, { "epoch": 0.6532872415338327, "grad_norm": 38.4589958190918, "learning_rate": 6.475015984813217e-07, "loss": 0.3044, "num_input_tokens_seen": 32131520, "step": 10205 }, { "epoch": 0.6536073234748095, "grad_norm": 12.53635311126709, "learning_rate": 6.464561716398564e-07, "loss": 0.3158, "num_input_tokens_seen": 32147008, "step": 10210 }, { "epoch": 0.6539274054157864, "grad_norm": 31.261737823486328, "learning_rate": 6.454111862020122e-07, "loss": 0.3734, "num_input_tokens_seen": 32162560, "step": 10215 }, { "epoch": 0.6542474873567633, "grad_norm": 27.614994049072266, "learning_rate": 6.443666434724649e-07, "loss": 0.3636, "num_input_tokens_seen": 32177024, "step": 10220 }, { "epoch": 0.6545675692977402, "grad_norm": 25.943843841552734, "learning_rate": 6.43322544755339e-07, "loss": 0.5155, "num_input_tokens_seen": 32193024, "step": 10225 }, { "epoch": 0.6548876512387171, "grad_norm": 29.868574142456055, "learning_rate": 6.422788913542038e-07, "loss": 0.3365, "num_input_tokens_seen": 32208896, "step": 10230 }, { "epoch": 0.655207733179694, "grad_norm": 15.805740356445312, "learning_rate": 6.412356845720726e-07, "loss": 0.3296, "num_input_tokens_seen": 32225280, "step": 10235 }, { "epoch": 0.6555278151206709, "grad_norm": 16.956401824951172, "learning_rate": 6.40192925711402e-07, "loss": 0.3605, "num_input_tokens_seen": 32240768, "step": 10240 }, { "epoch": 0.6558478970616478, "grad_norm": 27.354637145996094, "learning_rate": 6.39150616074088e-07, "loss": 0.3264, "num_input_tokens_seen": 32255872, "step": 10245 }, { "epoch": 0.6561679790026247, "grad_norm": 30.604806900024414, "learning_rate": 6.381087569614668e-07, "loss": 0.4193, "num_input_tokens_seen": 32272512, "step": 10250 }, { "epoch": 0.6564880609436016, "grad_norm": 13.617461204528809, "learning_rate": 6.370673496743116e-07, "loss": 0.3828, "num_input_tokens_seen": 32286272, "step": 10255 }, { "epoch": 0.6568081428845784, "grad_norm": 25.074222564697266, "learning_rate": 6.360263955128315e-07, "loss": 0.4331, "num_input_tokens_seen": 32301952, "step": 10260 }, { "epoch": 0.6571282248255553, "grad_norm": 16.422725677490234, "learning_rate": 6.349858957766701e-07, "loss": 0.3602, "num_input_tokens_seen": 32318208, "step": 10265 }, { "epoch": 0.6574483067665322, "grad_norm": 24.204320907592773, "learning_rate": 6.339458517649036e-07, "loss": 0.336, "num_input_tokens_seen": 32333504, "step": 10270 }, { "epoch": 0.6577683887075091, "grad_norm": 32.412906646728516, "learning_rate": 6.329062647760395e-07, "loss": 0.3626, "num_input_tokens_seen": 32350208, "step": 10275 }, { "epoch": 0.658088470648486, "grad_norm": 35.1417121887207, "learning_rate": 6.318671361080137e-07, "loss": 0.3351, "num_input_tokens_seen": 32365376, "step": 10280 }, { "epoch": 0.6584085525894628, "grad_norm": 16.784576416015625, "learning_rate": 6.308284670581906e-07, "loss": 0.3306, "num_input_tokens_seen": 32381248, "step": 10285 }, { "epoch": 0.6587286345304398, "grad_norm": 27.784454345703125, "learning_rate": 6.297902589233612e-07, "loss": 0.4558, "num_input_tokens_seen": 32395968, "step": 10290 }, { "epoch": 0.6590487164714167, "grad_norm": 32.86067581176758, "learning_rate": 6.287525129997404e-07, "loss": 0.3737, "num_input_tokens_seen": 32411456, "step": 10295 }, { "epoch": 0.6593687984123936, "grad_norm": 24.458457946777344, "learning_rate": 6.277152305829656e-07, "loss": 0.3865, "num_input_tokens_seen": 32426880, "step": 10300 }, { "epoch": 0.6596888803533705, "grad_norm": 30.00528907775879, "learning_rate": 6.266784129680968e-07, "loss": 0.3281, "num_input_tokens_seen": 32442368, "step": 10305 }, { "epoch": 0.6600089622943474, "grad_norm": 37.038761138916016, "learning_rate": 6.256420614496129e-07, "loss": 0.3781, "num_input_tokens_seen": 32457920, "step": 10310 }, { "epoch": 0.6603290442353242, "grad_norm": 34.41950988769531, "learning_rate": 6.246061773214102e-07, "loss": 0.4085, "num_input_tokens_seen": 32473536, "step": 10315 }, { "epoch": 0.6606491261763011, "grad_norm": 33.36504364013672, "learning_rate": 6.235707618768032e-07, "loss": 0.3956, "num_input_tokens_seen": 32490240, "step": 10320 }, { "epoch": 0.660969208117278, "grad_norm": 63.69960021972656, "learning_rate": 6.225358164085196e-07, "loss": 0.3506, "num_input_tokens_seen": 32505728, "step": 10325 }, { "epoch": 0.6612892900582549, "grad_norm": 47.17720031738281, "learning_rate": 6.21501342208701e-07, "loss": 0.3521, "num_input_tokens_seen": 32520960, "step": 10330 }, { "epoch": 0.6616093719992318, "grad_norm": 22.675373077392578, "learning_rate": 6.204673405689007e-07, "loss": 0.4036, "num_input_tokens_seen": 32535872, "step": 10335 }, { "epoch": 0.6619294539402087, "grad_norm": 21.25689697265625, "learning_rate": 6.194338127800823e-07, "loss": 0.3158, "num_input_tokens_seen": 32552448, "step": 10340 }, { "epoch": 0.6622495358811856, "grad_norm": 35.447052001953125, "learning_rate": 6.184007601326165e-07, "loss": 0.3866, "num_input_tokens_seen": 32567232, "step": 10345 }, { "epoch": 0.6625696178221625, "grad_norm": 30.18397331237793, "learning_rate": 6.173681839162824e-07, "loss": 0.3515, "num_input_tokens_seen": 32583360, "step": 10350 }, { "epoch": 0.6628896997631394, "grad_norm": 30.015911102294922, "learning_rate": 6.163360854202635e-07, "loss": 0.3336, "num_input_tokens_seen": 32598656, "step": 10355 }, { "epoch": 0.6632097817041163, "grad_norm": 19.04948616027832, "learning_rate": 6.153044659331461e-07, "loss": 0.306, "num_input_tokens_seen": 32614144, "step": 10360 }, { "epoch": 0.6635298636450931, "grad_norm": 30.63086700439453, "learning_rate": 6.142733267429203e-07, "loss": 0.3687, "num_input_tokens_seen": 32629120, "step": 10365 }, { "epoch": 0.66384994558607, "grad_norm": 25.801145553588867, "learning_rate": 6.132426691369748e-07, "loss": 0.4287, "num_input_tokens_seen": 32645952, "step": 10370 }, { "epoch": 0.6641700275270469, "grad_norm": 12.77051067352295, "learning_rate": 6.122124944020977e-07, "loss": 0.3988, "num_input_tokens_seen": 32661696, "step": 10375 }, { "epoch": 0.6644901094680238, "grad_norm": 23.705963134765625, "learning_rate": 6.111828038244749e-07, "loss": 0.3753, "num_input_tokens_seen": 32677760, "step": 10380 }, { "epoch": 0.6648101914090007, "grad_norm": 14.284012794494629, "learning_rate": 6.101535986896866e-07, "loss": 0.2948, "num_input_tokens_seen": 32693568, "step": 10385 }, { "epoch": 0.6651302733499775, "grad_norm": 15.676067352294922, "learning_rate": 6.091248802827076e-07, "loss": 0.2899, "num_input_tokens_seen": 32708736, "step": 10390 }, { "epoch": 0.6654503552909545, "grad_norm": 20.25788688659668, "learning_rate": 6.080966498879048e-07, "loss": 0.3218, "num_input_tokens_seen": 32725440, "step": 10395 }, { "epoch": 0.6657704372319314, "grad_norm": 40.33934020996094, "learning_rate": 6.070689087890363e-07, "loss": 0.2962, "num_input_tokens_seen": 32740608, "step": 10400 }, { "epoch": 0.6660905191729083, "grad_norm": 20.199983596801758, "learning_rate": 6.060416582692487e-07, "loss": 0.3974, "num_input_tokens_seen": 32756416, "step": 10405 }, { "epoch": 0.6664106011138852, "grad_norm": 28.62371253967285, "learning_rate": 6.05014899611076e-07, "loss": 0.3358, "num_input_tokens_seen": 32771904, "step": 10410 }, { "epoch": 0.666730683054862, "grad_norm": 53.33070755004883, "learning_rate": 6.039886340964391e-07, "loss": 0.3724, "num_input_tokens_seen": 32787392, "step": 10415 }, { "epoch": 0.6670507649958389, "grad_norm": 19.005868911743164, "learning_rate": 6.029628630066423e-07, "loss": 0.334, "num_input_tokens_seen": 32803136, "step": 10420 }, { "epoch": 0.6673708469368158, "grad_norm": 30.18621063232422, "learning_rate": 6.019375876223724e-07, "loss": 0.4173, "num_input_tokens_seen": 32818624, "step": 10425 }, { "epoch": 0.6676909288777927, "grad_norm": 26.04371452331543, "learning_rate": 6.009128092236982e-07, "loss": 0.4672, "num_input_tokens_seen": 32833920, "step": 10430 }, { "epoch": 0.6680110108187696, "grad_norm": 19.67214584350586, "learning_rate": 5.998885290900679e-07, "loss": 0.3859, "num_input_tokens_seen": 32848512, "step": 10435 }, { "epoch": 0.6683310927597464, "grad_norm": 26.48846435546875, "learning_rate": 5.988647485003061e-07, "loss": 0.3391, "num_input_tokens_seen": 32865088, "step": 10440 }, { "epoch": 0.6686511747007234, "grad_norm": 61.240257263183594, "learning_rate": 5.978414687326164e-07, "loss": 0.4559, "num_input_tokens_seen": 32882048, "step": 10445 }, { "epoch": 0.6689712566417003, "grad_norm": 28.02547836303711, "learning_rate": 5.968186910645745e-07, "loss": 0.365, "num_input_tokens_seen": 32898624, "step": 10450 }, { "epoch": 0.6692913385826772, "grad_norm": 33.85887145996094, "learning_rate": 5.957964167731305e-07, "loss": 0.505, "num_input_tokens_seen": 32914176, "step": 10455 }, { "epoch": 0.6696114205236541, "grad_norm": 40.76100158691406, "learning_rate": 5.947746471346065e-07, "loss": 0.4068, "num_input_tokens_seen": 32931136, "step": 10460 }, { "epoch": 0.669931502464631, "grad_norm": 47.47494888305664, "learning_rate": 5.937533834246932e-07, "loss": 0.3349, "num_input_tokens_seen": 32947648, "step": 10465 }, { "epoch": 0.6702515844056078, "grad_norm": 24.66529083251953, "learning_rate": 5.927326269184504e-07, "loss": 0.3745, "num_input_tokens_seen": 32964224, "step": 10470 }, { "epoch": 0.6705716663465847, "grad_norm": 45.57734680175781, "learning_rate": 5.917123788903049e-07, "loss": 0.4498, "num_input_tokens_seen": 32982080, "step": 10475 }, { "epoch": 0.6708917482875616, "grad_norm": 37.37847137451172, "learning_rate": 5.906926406140484e-07, "loss": 0.4642, "num_input_tokens_seen": 32997440, "step": 10480 }, { "epoch": 0.6712118302285385, "grad_norm": 37.47283935546875, "learning_rate": 5.896734133628354e-07, "loss": 0.4298, "num_input_tokens_seen": 33013056, "step": 10485 }, { "epoch": 0.6715319121695154, "grad_norm": 24.019437789916992, "learning_rate": 5.886546984091838e-07, "loss": 0.3736, "num_input_tokens_seen": 33028416, "step": 10490 }, { "epoch": 0.6718519941104922, "grad_norm": 29.65847396850586, "learning_rate": 5.876364970249711e-07, "loss": 0.3415, "num_input_tokens_seen": 33042880, "step": 10495 }, { "epoch": 0.6721720760514692, "grad_norm": 33.120933532714844, "learning_rate": 5.866188104814336e-07, "loss": 0.2735, "num_input_tokens_seen": 33058240, "step": 10500 }, { "epoch": 0.6724921579924461, "grad_norm": 16.05461883544922, "learning_rate": 5.856016400491646e-07, "loss": 0.3792, "num_input_tokens_seen": 33073920, "step": 10505 }, { "epoch": 0.672812239933423, "grad_norm": 8.638588905334473, "learning_rate": 5.845849869981136e-07, "loss": 0.3192, "num_input_tokens_seen": 33089344, "step": 10510 }, { "epoch": 0.6731323218743999, "grad_norm": 19.994060516357422, "learning_rate": 5.835688525975842e-07, "loss": 0.3458, "num_input_tokens_seen": 33104384, "step": 10515 }, { "epoch": 0.6734524038153767, "grad_norm": 22.6699275970459, "learning_rate": 5.825532381162311e-07, "loss": 0.3931, "num_input_tokens_seen": 33120064, "step": 10520 }, { "epoch": 0.6737724857563536, "grad_norm": 22.203550338745117, "learning_rate": 5.815381448220619e-07, "loss": 0.3866, "num_input_tokens_seen": 33136128, "step": 10525 }, { "epoch": 0.6740925676973305, "grad_norm": 22.488792419433594, "learning_rate": 5.805235739824327e-07, "loss": 0.3452, "num_input_tokens_seen": 33154816, "step": 10530 }, { "epoch": 0.6744126496383074, "grad_norm": 37.78539276123047, "learning_rate": 5.795095268640458e-07, "loss": 0.5023, "num_input_tokens_seen": 33169920, "step": 10535 }, { "epoch": 0.6747327315792843, "grad_norm": 35.91427230834961, "learning_rate": 5.784960047329519e-07, "loss": 0.541, "num_input_tokens_seen": 33187712, "step": 10540 }, { "epoch": 0.6750528135202611, "grad_norm": 14.399126052856445, "learning_rate": 5.774830088545452e-07, "loss": 0.3866, "num_input_tokens_seen": 33202880, "step": 10545 }, { "epoch": 0.6753728954612381, "grad_norm": 15.00992202758789, "learning_rate": 5.76470540493563e-07, "loss": 0.2997, "num_input_tokens_seen": 33218944, "step": 10550 }, { "epoch": 0.675692977402215, "grad_norm": 27.697614669799805, "learning_rate": 5.754586009140836e-07, "loss": 0.4652, "num_input_tokens_seen": 33234688, "step": 10555 }, { "epoch": 0.6760130593431919, "grad_norm": 48.48150634765625, "learning_rate": 5.744471913795256e-07, "loss": 0.3679, "num_input_tokens_seen": 33249920, "step": 10560 }, { "epoch": 0.6763331412841688, "grad_norm": 34.35981369018555, "learning_rate": 5.734363131526459e-07, "loss": 0.3365, "num_input_tokens_seen": 33265792, "step": 10565 }, { "epoch": 0.6766532232251457, "grad_norm": 37.82774353027344, "learning_rate": 5.724259674955377e-07, "loss": 0.3742, "num_input_tokens_seen": 33280832, "step": 10570 }, { "epoch": 0.6769733051661225, "grad_norm": 28.590476989746094, "learning_rate": 5.714161556696291e-07, "loss": 0.3888, "num_input_tokens_seen": 33296576, "step": 10575 }, { "epoch": 0.6772933871070994, "grad_norm": 42.37991714477539, "learning_rate": 5.704068789356824e-07, "loss": 0.3388, "num_input_tokens_seen": 33316672, "step": 10580 }, { "epoch": 0.6776134690480763, "grad_norm": 28.075489044189453, "learning_rate": 5.693981385537912e-07, "loss": 0.3496, "num_input_tokens_seen": 33331456, "step": 10585 }, { "epoch": 0.6779335509890532, "grad_norm": 24.502607345581055, "learning_rate": 5.683899357833801e-07, "loss": 0.3447, "num_input_tokens_seen": 33346752, "step": 10590 }, { "epoch": 0.67825363293003, "grad_norm": 34.75849151611328, "learning_rate": 5.673822718832015e-07, "loss": 0.455, "num_input_tokens_seen": 33362688, "step": 10595 }, { "epoch": 0.6785737148710069, "grad_norm": 40.48807144165039, "learning_rate": 5.663751481113362e-07, "loss": 0.3697, "num_input_tokens_seen": 33377600, "step": 10600 }, { "epoch": 0.6788937968119839, "grad_norm": 24.50969696044922, "learning_rate": 5.653685657251896e-07, "loss": 0.4282, "num_input_tokens_seen": 33393280, "step": 10605 }, { "epoch": 0.6792138787529608, "grad_norm": 41.67803192138672, "learning_rate": 5.643625259814922e-07, "loss": 0.3746, "num_input_tokens_seen": 33410112, "step": 10610 }, { "epoch": 0.6795339606939377, "grad_norm": 17.74659538269043, "learning_rate": 5.633570301362953e-07, "loss": 0.3664, "num_input_tokens_seen": 33426624, "step": 10615 }, { "epoch": 0.6798540426349146, "grad_norm": 37.33218002319336, "learning_rate": 5.623520794449739e-07, "loss": 0.36, "num_input_tokens_seen": 33442240, "step": 10620 }, { "epoch": 0.6801741245758914, "grad_norm": 36.54777908325195, "learning_rate": 5.613476751622195e-07, "loss": 0.4713, "num_input_tokens_seen": 33458432, "step": 10625 }, { "epoch": 0.6804942065168683, "grad_norm": 26.362565994262695, "learning_rate": 5.603438185420426e-07, "loss": 0.4368, "num_input_tokens_seen": 33473856, "step": 10630 }, { "epoch": 0.6808142884578452, "grad_norm": 58.49364471435547, "learning_rate": 5.593405108377714e-07, "loss": 0.4714, "num_input_tokens_seen": 33489216, "step": 10635 }, { "epoch": 0.6811343703988221, "grad_norm": 23.39803695678711, "learning_rate": 5.583377533020457e-07, "loss": 0.4586, "num_input_tokens_seen": 33505280, "step": 10640 }, { "epoch": 0.681454452339799, "grad_norm": 40.31536865234375, "learning_rate": 5.573355471868201e-07, "loss": 0.2834, "num_input_tokens_seen": 33520512, "step": 10645 }, { "epoch": 0.6817745342807758, "grad_norm": 24.481168746948242, "learning_rate": 5.563338937433621e-07, "loss": 0.3532, "num_input_tokens_seen": 33537344, "step": 10650 }, { "epoch": 0.6820946162217527, "grad_norm": 15.533621788024902, "learning_rate": 5.553327942222472e-07, "loss": 0.2438, "num_input_tokens_seen": 33552128, "step": 10655 }, { "epoch": 0.6824146981627297, "grad_norm": 26.63052749633789, "learning_rate": 5.54332249873359e-07, "loss": 0.3547, "num_input_tokens_seen": 33566784, "step": 10660 }, { "epoch": 0.6827347801037066, "grad_norm": 21.95829963684082, "learning_rate": 5.533322619458896e-07, "loss": 0.3052, "num_input_tokens_seen": 33582080, "step": 10665 }, { "epoch": 0.6830548620446835, "grad_norm": 45.98701477050781, "learning_rate": 5.52332831688336e-07, "loss": 0.4079, "num_input_tokens_seen": 33596864, "step": 10670 }, { "epoch": 0.6833749439856603, "grad_norm": 79.64530944824219, "learning_rate": 5.513339603484981e-07, "loss": 0.3454, "num_input_tokens_seen": 33613056, "step": 10675 }, { "epoch": 0.6836950259266372, "grad_norm": 69.9050064086914, "learning_rate": 5.503356491734785e-07, "loss": 0.5049, "num_input_tokens_seen": 33628160, "step": 10680 }, { "epoch": 0.6840151078676141, "grad_norm": 18.264413833618164, "learning_rate": 5.493378994096806e-07, "loss": 0.4346, "num_input_tokens_seen": 33645184, "step": 10685 }, { "epoch": 0.684335189808591, "grad_norm": 18.561819076538086, "learning_rate": 5.483407123028067e-07, "loss": 0.3909, "num_input_tokens_seen": 33660800, "step": 10690 }, { "epoch": 0.6846552717495679, "grad_norm": 38.80720138549805, "learning_rate": 5.473440890978566e-07, "loss": 0.4766, "num_input_tokens_seen": 33676736, "step": 10695 }, { "epoch": 0.6849753536905447, "grad_norm": 25.19498634338379, "learning_rate": 5.463480310391261e-07, "loss": 0.4079, "num_input_tokens_seen": 33692928, "step": 10700 }, { "epoch": 0.6852954356315216, "grad_norm": 23.25238800048828, "learning_rate": 5.453525393702052e-07, "loss": 0.3839, "num_input_tokens_seen": 33708352, "step": 10705 }, { "epoch": 0.6856155175724986, "grad_norm": 32.19915771484375, "learning_rate": 5.443576153339771e-07, "loss": 0.3644, "num_input_tokens_seen": 33723968, "step": 10710 }, { "epoch": 0.6859355995134755, "grad_norm": 46.10927963256836, "learning_rate": 5.433632601726159e-07, "loss": 0.3272, "num_input_tokens_seen": 33739200, "step": 10715 }, { "epoch": 0.6862556814544524, "grad_norm": 33.03512191772461, "learning_rate": 5.42369475127586e-07, "loss": 0.3404, "num_input_tokens_seen": 33754944, "step": 10720 }, { "epoch": 0.6865757633954293, "grad_norm": 60.13679504394531, "learning_rate": 5.413762614396396e-07, "loss": 0.4709, "num_input_tokens_seen": 33769472, "step": 10725 }, { "epoch": 0.6868958453364061, "grad_norm": 33.97296142578125, "learning_rate": 5.403836203488157e-07, "loss": 0.4262, "num_input_tokens_seen": 33784896, "step": 10730 }, { "epoch": 0.687215927277383, "grad_norm": 18.200382232666016, "learning_rate": 5.393915530944382e-07, "loss": 0.3638, "num_input_tokens_seen": 33800320, "step": 10735 }, { "epoch": 0.6875360092183599, "grad_norm": 24.23163414001465, "learning_rate": 5.384000609151145e-07, "loss": 0.3765, "num_input_tokens_seen": 33816896, "step": 10740 }, { "epoch": 0.6878560911593368, "grad_norm": 21.162240982055664, "learning_rate": 5.374091450487353e-07, "loss": 0.3763, "num_input_tokens_seen": 33833344, "step": 10745 }, { "epoch": 0.6881761731003136, "grad_norm": 29.74762535095215, "learning_rate": 5.364188067324693e-07, "loss": 0.3352, "num_input_tokens_seen": 33849856, "step": 10750 }, { "epoch": 0.6884962550412905, "grad_norm": 13.640717506408691, "learning_rate": 5.354290472027659e-07, "loss": 0.3441, "num_input_tokens_seen": 33865344, "step": 10755 }, { "epoch": 0.6888163369822674, "grad_norm": 71.6620864868164, "learning_rate": 5.344398676953525e-07, "loss": 0.4955, "num_input_tokens_seen": 33881792, "step": 10760 }, { "epoch": 0.6891364189232444, "grad_norm": 31.854103088378906, "learning_rate": 5.334512694452303e-07, "loss": 0.4902, "num_input_tokens_seen": 33898368, "step": 10765 }, { "epoch": 0.6894565008642213, "grad_norm": 22.185178756713867, "learning_rate": 5.324632536866755e-07, "loss": 0.3489, "num_input_tokens_seen": 33914368, "step": 10770 }, { "epoch": 0.6897765828051982, "grad_norm": 40.81916046142578, "learning_rate": 5.314758216532386e-07, "loss": 0.3526, "num_input_tokens_seen": 33929728, "step": 10775 }, { "epoch": 0.690096664746175, "grad_norm": 20.197229385375977, "learning_rate": 5.304889745777396e-07, "loss": 0.3743, "num_input_tokens_seen": 33944704, "step": 10780 }, { "epoch": 0.6904167466871519, "grad_norm": 31.70199203491211, "learning_rate": 5.295027136922678e-07, "loss": 0.6418, "num_input_tokens_seen": 33960128, "step": 10785 }, { "epoch": 0.6907368286281288, "grad_norm": 22.89275360107422, "learning_rate": 5.285170402281827e-07, "loss": 0.4207, "num_input_tokens_seen": 33975104, "step": 10790 }, { "epoch": 0.6910569105691057, "grad_norm": 33.831241607666016, "learning_rate": 5.275319554161087e-07, "loss": 0.4588, "num_input_tokens_seen": 33990720, "step": 10795 }, { "epoch": 0.6913769925100826, "grad_norm": 31.06147575378418, "learning_rate": 5.265474604859356e-07, "loss": 0.4123, "num_input_tokens_seen": 34006272, "step": 10800 }, { "epoch": 0.6916970744510594, "grad_norm": 26.169334411621094, "learning_rate": 5.255635566668171e-07, "loss": 0.3902, "num_input_tokens_seen": 34022400, "step": 10805 }, { "epoch": 0.6920171563920363, "grad_norm": 22.71941566467285, "learning_rate": 5.245802451871686e-07, "loss": 0.3704, "num_input_tokens_seen": 34038720, "step": 10810 }, { "epoch": 0.6923372383330133, "grad_norm": 23.15312957763672, "learning_rate": 5.235975272746663e-07, "loss": 0.4316, "num_input_tokens_seen": 34053760, "step": 10815 }, { "epoch": 0.6926573202739902, "grad_norm": 22.503173828125, "learning_rate": 5.226154041562442e-07, "loss": 0.3024, "num_input_tokens_seen": 34069568, "step": 10820 }, { "epoch": 0.6929774022149671, "grad_norm": 23.336326599121094, "learning_rate": 5.216338770580953e-07, "loss": 0.406, "num_input_tokens_seen": 34086912, "step": 10825 }, { "epoch": 0.6932974841559439, "grad_norm": 22.208585739135742, "learning_rate": 5.206529472056678e-07, "loss": 0.3649, "num_input_tokens_seen": 34101696, "step": 10830 }, { "epoch": 0.6936175660969208, "grad_norm": 15.775872230529785, "learning_rate": 5.196726158236637e-07, "loss": 0.3168, "num_input_tokens_seen": 34115904, "step": 10835 }, { "epoch": 0.6939376480378977, "grad_norm": 23.13541603088379, "learning_rate": 5.186928841360384e-07, "loss": 0.3372, "num_input_tokens_seen": 34131328, "step": 10840 }, { "epoch": 0.6942577299788746, "grad_norm": 29.86430549621582, "learning_rate": 5.177137533659985e-07, "loss": 0.4395, "num_input_tokens_seen": 34148544, "step": 10845 }, { "epoch": 0.6945778119198515, "grad_norm": 20.5509033203125, "learning_rate": 5.167352247360002e-07, "loss": 0.4564, "num_input_tokens_seen": 34163520, "step": 10850 }, { "epoch": 0.6948978938608283, "grad_norm": 27.466720581054688, "learning_rate": 5.157572994677479e-07, "loss": 0.3993, "num_input_tokens_seen": 34178368, "step": 10855 }, { "epoch": 0.6952179758018052, "grad_norm": 32.89216232299805, "learning_rate": 5.147799787821929e-07, "loss": 0.4055, "num_input_tokens_seen": 34193920, "step": 10860 }, { "epoch": 0.6955380577427821, "grad_norm": 37.79446792602539, "learning_rate": 5.138032638995315e-07, "loss": 0.485, "num_input_tokens_seen": 34210176, "step": 10865 }, { "epoch": 0.6958581396837591, "grad_norm": 53.44511032104492, "learning_rate": 5.128271560392037e-07, "loss": 0.3575, "num_input_tokens_seen": 34227328, "step": 10870 }, { "epoch": 0.696178221624736, "grad_norm": 32.73928451538086, "learning_rate": 5.118516564198916e-07, "loss": 0.3901, "num_input_tokens_seen": 34241984, "step": 10875 }, { "epoch": 0.6964983035657129, "grad_norm": 23.722578048706055, "learning_rate": 5.108767662595175e-07, "loss": 0.3371, "num_input_tokens_seen": 34256896, "step": 10880 }, { "epoch": 0.6968183855066897, "grad_norm": 20.10529899597168, "learning_rate": 5.099024867752446e-07, "loss": 0.3824, "num_input_tokens_seen": 34273792, "step": 10885 }, { "epoch": 0.6971384674476666, "grad_norm": 33.20995330810547, "learning_rate": 5.089288191834709e-07, "loss": 0.3219, "num_input_tokens_seen": 34290752, "step": 10890 }, { "epoch": 0.6974585493886435, "grad_norm": 32.262474060058594, "learning_rate": 5.079557646998318e-07, "loss": 0.3367, "num_input_tokens_seen": 34308416, "step": 10895 }, { "epoch": 0.6977786313296204, "grad_norm": 13.212915420532227, "learning_rate": 5.069833245391981e-07, "loss": 0.403, "num_input_tokens_seen": 34323776, "step": 10900 }, { "epoch": 0.6980987132705972, "grad_norm": 24.54563331604004, "learning_rate": 5.060114999156728e-07, "loss": 0.322, "num_input_tokens_seen": 34338944, "step": 10905 }, { "epoch": 0.6984187952115741, "grad_norm": 37.85472869873047, "learning_rate": 5.050402920425895e-07, "loss": 0.3462, "num_input_tokens_seen": 34354432, "step": 10910 }, { "epoch": 0.698738877152551, "grad_norm": 17.395889282226562, "learning_rate": 5.040697021325128e-07, "loss": 0.2526, "num_input_tokens_seen": 34370432, "step": 10915 }, { "epoch": 0.699058959093528, "grad_norm": 32.64187240600586, "learning_rate": 5.030997313972361e-07, "loss": 0.437, "num_input_tokens_seen": 34386496, "step": 10920 }, { "epoch": 0.6993790410345049, "grad_norm": 19.760494232177734, "learning_rate": 5.021303810477795e-07, "loss": 0.368, "num_input_tokens_seen": 34402560, "step": 10925 }, { "epoch": 0.6996991229754818, "grad_norm": 16.46942710876465, "learning_rate": 5.011616522943869e-07, "loss": 0.2859, "num_input_tokens_seen": 34418496, "step": 10930 }, { "epoch": 0.7000192049164586, "grad_norm": 50.63234329223633, "learning_rate": 5.001935463465289e-07, "loss": 0.2731, "num_input_tokens_seen": 34434752, "step": 10935 }, { "epoch": 0.7003392868574355, "grad_norm": 22.748510360717773, "learning_rate": 4.99226064412897e-07, "loss": 0.3965, "num_input_tokens_seen": 34450176, "step": 10940 }, { "epoch": 0.7006593687984124, "grad_norm": 18.267223358154297, "learning_rate": 4.982592077014026e-07, "loss": 0.4233, "num_input_tokens_seen": 34465600, "step": 10945 }, { "epoch": 0.7008514179629985, "eval_loss": 0.37222641706466675, "eval_runtime": 49.2115, "eval_samples_per_second": 282.17, "eval_steps_per_second": 35.276, "num_input_tokens_seen": 34475136, "step": 10948 }, { "epoch": 0.7009794507393893, "grad_norm": 38.11653518676758, "learning_rate": 4.97292977419179e-07, "loss": 0.3026, "num_input_tokens_seen": 34481600, "step": 10950 }, { "epoch": 0.7012995326803662, "grad_norm": 19.48086166381836, "learning_rate": 4.963273747725755e-07, "loss": 0.2954, "num_input_tokens_seen": 34498752, "step": 10955 }, { "epoch": 0.701619614621343, "grad_norm": 26.763914108276367, "learning_rate": 4.953624009671582e-07, "loss": 0.4061, "num_input_tokens_seen": 34514240, "step": 10960 }, { "epoch": 0.7019396965623199, "grad_norm": 44.18442153930664, "learning_rate": 4.943980572077086e-07, "loss": 0.4161, "num_input_tokens_seen": 34528704, "step": 10965 }, { "epoch": 0.7022597785032968, "grad_norm": 38.56117630004883, "learning_rate": 4.934343446982209e-07, "loss": 0.3243, "num_input_tokens_seen": 34544704, "step": 10970 }, { "epoch": 0.7025798604442738, "grad_norm": 13.776517868041992, "learning_rate": 4.924712646419016e-07, "loss": 0.3698, "num_input_tokens_seen": 34560000, "step": 10975 }, { "epoch": 0.7028999423852507, "grad_norm": 70.76254272460938, "learning_rate": 4.915088182411674e-07, "loss": 0.3211, "num_input_tokens_seen": 34575296, "step": 10980 }, { "epoch": 0.7032200243262275, "grad_norm": 33.83591842651367, "learning_rate": 4.905470066976439e-07, "loss": 0.3715, "num_input_tokens_seen": 34590528, "step": 10985 }, { "epoch": 0.7035401062672044, "grad_norm": 37.384647369384766, "learning_rate": 4.895858312121644e-07, "loss": 0.4187, "num_input_tokens_seen": 34605312, "step": 10990 }, { "epoch": 0.7038601882081813, "grad_norm": 24.66256332397461, "learning_rate": 4.886252929847674e-07, "loss": 0.4337, "num_input_tokens_seen": 34620736, "step": 10995 }, { "epoch": 0.7041802701491582, "grad_norm": 42.17767333984375, "learning_rate": 4.876653932146963e-07, "loss": 0.4578, "num_input_tokens_seen": 34636736, "step": 11000 }, { "epoch": 0.7045003520901351, "grad_norm": 31.28046417236328, "learning_rate": 4.86706133100397e-07, "loss": 0.3782, "num_input_tokens_seen": 34651776, "step": 11005 }, { "epoch": 0.7048204340311119, "grad_norm": 52.68522644042969, "learning_rate": 4.857475138395178e-07, "loss": 0.2923, "num_input_tokens_seen": 34666176, "step": 11010 }, { "epoch": 0.7051405159720888, "grad_norm": 15.296350479125977, "learning_rate": 4.847895366289054e-07, "loss": 0.2529, "num_input_tokens_seen": 34682112, "step": 11015 }, { "epoch": 0.7054605979130657, "grad_norm": 32.735904693603516, "learning_rate": 4.838322026646057e-07, "loss": 0.3828, "num_input_tokens_seen": 34697024, "step": 11020 }, { "epoch": 0.7057806798540426, "grad_norm": 20.01278305053711, "learning_rate": 4.82875513141861e-07, "loss": 0.3577, "num_input_tokens_seen": 34712704, "step": 11025 }, { "epoch": 0.7061007617950196, "grad_norm": 29.205598831176758, "learning_rate": 4.819194692551106e-07, "loss": 0.3791, "num_input_tokens_seen": 34728256, "step": 11030 }, { "epoch": 0.7064208437359965, "grad_norm": 16.80168914794922, "learning_rate": 4.809640721979855e-07, "loss": 0.4268, "num_input_tokens_seen": 34744512, "step": 11035 }, { "epoch": 0.7067409256769733, "grad_norm": 47.780738830566406, "learning_rate": 4.8000932316331e-07, "loss": 0.4158, "num_input_tokens_seen": 34758912, "step": 11040 }, { "epoch": 0.7070610076179502, "grad_norm": 29.31734848022461, "learning_rate": 4.790552233431002e-07, "loss": 0.4037, "num_input_tokens_seen": 34774848, "step": 11045 }, { "epoch": 0.7073810895589271, "grad_norm": 34.01865005493164, "learning_rate": 4.781017739285611e-07, "loss": 0.4168, "num_input_tokens_seen": 34790016, "step": 11050 }, { "epoch": 0.707701171499904, "grad_norm": 13.347481727600098, "learning_rate": 4.771489761100842e-07, "loss": 0.3453, "num_input_tokens_seen": 34804992, "step": 11055 }, { "epoch": 0.7080212534408808, "grad_norm": 40.918357849121094, "learning_rate": 4.761968310772501e-07, "loss": 0.2687, "num_input_tokens_seen": 34820288, "step": 11060 }, { "epoch": 0.7083413353818577, "grad_norm": 40.15391540527344, "learning_rate": 4.7524534001882267e-07, "loss": 0.2718, "num_input_tokens_seen": 34836096, "step": 11065 }, { "epoch": 0.7086614173228346, "grad_norm": 28.034465789794922, "learning_rate": 4.7429450412274897e-07, "loss": 0.394, "num_input_tokens_seen": 34851584, "step": 11070 }, { "epoch": 0.7089814992638115, "grad_norm": 23.965686798095703, "learning_rate": 4.733443245761596e-07, "loss": 0.3458, "num_input_tokens_seen": 34868032, "step": 11075 }, { "epoch": 0.7093015812047885, "grad_norm": 25.30048179626465, "learning_rate": 4.723948025653646e-07, "loss": 0.3821, "num_input_tokens_seen": 34884032, "step": 11080 }, { "epoch": 0.7096216631457654, "grad_norm": 29.63812828063965, "learning_rate": 4.714459392758534e-07, "loss": 0.3254, "num_input_tokens_seen": 34899456, "step": 11085 }, { "epoch": 0.7099417450867422, "grad_norm": 51.972572326660156, "learning_rate": 4.70497735892293e-07, "loss": 0.3735, "num_input_tokens_seen": 34915456, "step": 11090 }, { "epoch": 0.7102618270277191, "grad_norm": 16.07594871520996, "learning_rate": 4.695501935985263e-07, "loss": 0.3331, "num_input_tokens_seen": 34931328, "step": 11095 }, { "epoch": 0.710581908968696, "grad_norm": 34.51850128173828, "learning_rate": 4.686033135775711e-07, "loss": 0.3999, "num_input_tokens_seen": 34946816, "step": 11100 }, { "epoch": 0.7109019909096729, "grad_norm": 25.590112686157227, "learning_rate": 4.6765709701161817e-07, "loss": 0.3245, "num_input_tokens_seen": 34964544, "step": 11105 }, { "epoch": 0.7112220728506498, "grad_norm": 66.03004455566406, "learning_rate": 4.6671154508203003e-07, "loss": 0.3996, "num_input_tokens_seen": 34982208, "step": 11110 }, { "epoch": 0.7115421547916266, "grad_norm": 42.14921188354492, "learning_rate": 4.657666589693393e-07, "loss": 0.3439, "num_input_tokens_seen": 35000576, "step": 11115 }, { "epoch": 0.7118622367326035, "grad_norm": 26.2552433013916, "learning_rate": 4.6482243985324753e-07, "loss": 0.3145, "num_input_tokens_seen": 35014912, "step": 11120 }, { "epoch": 0.7121823186735804, "grad_norm": 28.899272918701172, "learning_rate": 4.638788889126232e-07, "loss": 0.2914, "num_input_tokens_seen": 35029632, "step": 11125 }, { "epoch": 0.7125024006145573, "grad_norm": 27.084138870239258, "learning_rate": 4.6293600732550085e-07, "loss": 0.3239, "num_input_tokens_seen": 35044992, "step": 11130 }, { "epoch": 0.7128224825555343, "grad_norm": 16.42285919189453, "learning_rate": 4.619937962690792e-07, "loss": 0.4686, "num_input_tokens_seen": 35060544, "step": 11135 }, { "epoch": 0.7131425644965111, "grad_norm": 57.51594924926758, "learning_rate": 4.610522569197197e-07, "loss": 0.5105, "num_input_tokens_seen": 35075648, "step": 11140 }, { "epoch": 0.713462646437488, "grad_norm": 20.691587448120117, "learning_rate": 4.6011139045294554e-07, "loss": 0.3294, "num_input_tokens_seen": 35090880, "step": 11145 }, { "epoch": 0.7137827283784649, "grad_norm": 99.84747314453125, "learning_rate": 4.59171198043439e-07, "loss": 0.3904, "num_input_tokens_seen": 35106432, "step": 11150 }, { "epoch": 0.7141028103194418, "grad_norm": 28.633445739746094, "learning_rate": 4.582316808650424e-07, "loss": 0.4349, "num_input_tokens_seen": 35121664, "step": 11155 }, { "epoch": 0.7144228922604187, "grad_norm": 42.922950744628906, "learning_rate": 4.572928400907529e-07, "loss": 0.491, "num_input_tokens_seen": 35137152, "step": 11160 }, { "epoch": 0.7147429742013955, "grad_norm": 38.647911071777344, "learning_rate": 4.5635467689272434e-07, "loss": 0.3682, "num_input_tokens_seen": 35153088, "step": 11165 }, { "epoch": 0.7150630561423724, "grad_norm": 22.412986755371094, "learning_rate": 4.554171924422655e-07, "loss": 0.3654, "num_input_tokens_seen": 35168192, "step": 11170 }, { "epoch": 0.7153831380833493, "grad_norm": 20.65825653076172, "learning_rate": 4.544803879098356e-07, "loss": 0.3242, "num_input_tokens_seen": 35184192, "step": 11175 }, { "epoch": 0.7157032200243262, "grad_norm": 23.79654884338379, "learning_rate": 4.535442644650462e-07, "loss": 0.3848, "num_input_tokens_seen": 35200256, "step": 11180 }, { "epoch": 0.7160233019653032, "grad_norm": 24.546035766601562, "learning_rate": 4.5260882327665906e-07, "loss": 0.4889, "num_input_tokens_seen": 35214720, "step": 11185 }, { "epoch": 0.71634338390628, "grad_norm": 38.554954528808594, "learning_rate": 4.5167406551258347e-07, "loss": 0.5077, "num_input_tokens_seen": 35230720, "step": 11190 }, { "epoch": 0.7166634658472569, "grad_norm": 29.644372940063477, "learning_rate": 4.5073999233987445e-07, "loss": 0.3948, "num_input_tokens_seen": 35246400, "step": 11195 }, { "epoch": 0.7169835477882338, "grad_norm": 30.197397232055664, "learning_rate": 4.4980660492473434e-07, "loss": 0.4854, "num_input_tokens_seen": 35262784, "step": 11200 }, { "epoch": 0.7173036297292107, "grad_norm": 14.077301025390625, "learning_rate": 4.4887390443250804e-07, "loss": 0.2735, "num_input_tokens_seen": 35277632, "step": 11205 }, { "epoch": 0.7176237116701876, "grad_norm": 18.285058975219727, "learning_rate": 4.4794189202768295e-07, "loss": 0.2981, "num_input_tokens_seen": 35292544, "step": 11210 }, { "epoch": 0.7179437936111644, "grad_norm": 32.815086364746094, "learning_rate": 4.4701056887388757e-07, "loss": 0.3816, "num_input_tokens_seen": 35308352, "step": 11215 }, { "epoch": 0.7182638755521413, "grad_norm": 31.119327545166016, "learning_rate": 4.460799361338897e-07, "loss": 0.3307, "num_input_tokens_seen": 35323904, "step": 11220 }, { "epoch": 0.7185839574931182, "grad_norm": 18.72206687927246, "learning_rate": 4.451499949695954e-07, "loss": 0.4203, "num_input_tokens_seen": 35340224, "step": 11225 }, { "epoch": 0.7189040394340951, "grad_norm": 17.376712799072266, "learning_rate": 4.44220746542047e-07, "loss": 0.375, "num_input_tokens_seen": 35355776, "step": 11230 }, { "epoch": 0.719224121375072, "grad_norm": 38.786521911621094, "learning_rate": 4.432921920114221e-07, "loss": 0.474, "num_input_tokens_seen": 35371072, "step": 11235 }, { "epoch": 0.719544203316049, "grad_norm": 36.586570739746094, "learning_rate": 4.4236433253703185e-07, "loss": 0.3144, "num_input_tokens_seen": 35387520, "step": 11240 }, { "epoch": 0.7198642852570258, "grad_norm": 36.61032485961914, "learning_rate": 4.4143716927732e-07, "loss": 0.4042, "num_input_tokens_seen": 35403840, "step": 11245 }, { "epoch": 0.7201843671980027, "grad_norm": 26.34575843811035, "learning_rate": 4.405107033898604e-07, "loss": 0.3767, "num_input_tokens_seen": 35420032, "step": 11250 }, { "epoch": 0.7205044491389796, "grad_norm": 33.59138107299805, "learning_rate": 4.395849360313568e-07, "loss": 0.2887, "num_input_tokens_seen": 35436032, "step": 11255 }, { "epoch": 0.7208245310799565, "grad_norm": 44.58377456665039, "learning_rate": 4.386598683576406e-07, "loss": 0.3505, "num_input_tokens_seen": 35451136, "step": 11260 }, { "epoch": 0.7211446130209334, "grad_norm": 17.373126983642578, "learning_rate": 4.377355015236696e-07, "loss": 0.4744, "num_input_tokens_seen": 35466816, "step": 11265 }, { "epoch": 0.7214646949619102, "grad_norm": 33.182308197021484, "learning_rate": 4.368118366835266e-07, "loss": 0.3588, "num_input_tokens_seen": 35483456, "step": 11270 }, { "epoch": 0.7217847769028871, "grad_norm": 40.823421478271484, "learning_rate": 4.358888749904177e-07, "loss": 0.4691, "num_input_tokens_seen": 35499584, "step": 11275 }, { "epoch": 0.722104858843864, "grad_norm": 24.432401657104492, "learning_rate": 4.349666175966725e-07, "loss": 0.3521, "num_input_tokens_seen": 35515328, "step": 11280 }, { "epoch": 0.7224249407848409, "grad_norm": 18.420427322387695, "learning_rate": 4.340450656537392e-07, "loss": 0.4721, "num_input_tokens_seen": 35530048, "step": 11285 }, { "epoch": 0.7227450227258178, "grad_norm": 31.080825805664062, "learning_rate": 4.331242203121861e-07, "loss": 0.2995, "num_input_tokens_seen": 35545792, "step": 11290 }, { "epoch": 0.7230651046667947, "grad_norm": 43.900115966796875, "learning_rate": 4.322040827217004e-07, "loss": 0.3775, "num_input_tokens_seen": 35561344, "step": 11295 }, { "epoch": 0.7233851866077716, "grad_norm": 42.54143142700195, "learning_rate": 4.312846540310838e-07, "loss": 0.4064, "num_input_tokens_seen": 35577024, "step": 11300 }, { "epoch": 0.7237052685487485, "grad_norm": 25.552127838134766, "learning_rate": 4.3036593538825373e-07, "loss": 0.3527, "num_input_tokens_seen": 35592192, "step": 11305 }, { "epoch": 0.7240253504897254, "grad_norm": 15.031996726989746, "learning_rate": 4.2944792794024196e-07, "loss": 0.3375, "num_input_tokens_seen": 35607872, "step": 11310 }, { "epoch": 0.7243454324307023, "grad_norm": 23.10059928894043, "learning_rate": 4.285306328331915e-07, "loss": 0.3015, "num_input_tokens_seen": 35623872, "step": 11315 }, { "epoch": 0.7246655143716791, "grad_norm": 27.68567657470703, "learning_rate": 4.2761405121235506e-07, "loss": 0.3168, "num_input_tokens_seen": 35638720, "step": 11320 }, { "epoch": 0.724985596312656, "grad_norm": 21.363649368286133, "learning_rate": 4.266981842220965e-07, "loss": 0.538, "num_input_tokens_seen": 35655680, "step": 11325 }, { "epoch": 0.7253056782536329, "grad_norm": 25.60169219970703, "learning_rate": 4.257830330058864e-07, "loss": 0.2663, "num_input_tokens_seen": 35671168, "step": 11330 }, { "epoch": 0.7256257601946098, "grad_norm": 28.766132354736328, "learning_rate": 4.248685987063019e-07, "loss": 0.4085, "num_input_tokens_seen": 35686848, "step": 11335 }, { "epoch": 0.7259458421355867, "grad_norm": 25.852869033813477, "learning_rate": 4.2395488246502396e-07, "loss": 0.3486, "num_input_tokens_seen": 35702720, "step": 11340 }, { "epoch": 0.7262659240765637, "grad_norm": 35.1387939453125, "learning_rate": 4.2304188542283913e-07, "loss": 0.4532, "num_input_tokens_seen": 35720640, "step": 11345 }, { "epoch": 0.7265860060175405, "grad_norm": 63.59513854980469, "learning_rate": 4.221296087196347e-07, "loss": 0.3855, "num_input_tokens_seen": 35735424, "step": 11350 }, { "epoch": 0.7269060879585174, "grad_norm": 22.047700881958008, "learning_rate": 4.2121805349439867e-07, "loss": 0.46, "num_input_tokens_seen": 35751168, "step": 11355 }, { "epoch": 0.7272261698994943, "grad_norm": 29.550992965698242, "learning_rate": 4.203072208852184e-07, "loss": 0.3829, "num_input_tokens_seen": 35767168, "step": 11360 }, { "epoch": 0.7275462518404712, "grad_norm": 47.271080017089844, "learning_rate": 4.193971120292793e-07, "loss": 0.447, "num_input_tokens_seen": 35782464, "step": 11365 }, { "epoch": 0.727866333781448, "grad_norm": 18.53926658630371, "learning_rate": 4.184877280628629e-07, "loss": 0.4004, "num_input_tokens_seen": 35798592, "step": 11370 }, { "epoch": 0.7281864157224249, "grad_norm": 35.71843719482422, "learning_rate": 4.1757907012134565e-07, "loss": 0.3955, "num_input_tokens_seen": 35814720, "step": 11375 }, { "epoch": 0.7285064976634018, "grad_norm": 32.7597770690918, "learning_rate": 4.166711393391978e-07, "loss": 0.2807, "num_input_tokens_seen": 35830016, "step": 11380 }, { "epoch": 0.7288265796043787, "grad_norm": 17.185914993286133, "learning_rate": 4.1576393684998146e-07, "loss": 0.3365, "num_input_tokens_seen": 35845632, "step": 11385 }, { "epoch": 0.7291466615453556, "grad_norm": 23.883012771606445, "learning_rate": 4.1485746378634966e-07, "loss": 0.3505, "num_input_tokens_seen": 35861184, "step": 11390 }, { "epoch": 0.7294667434863324, "grad_norm": 36.17485046386719, "learning_rate": 4.1395172128004473e-07, "loss": 0.4186, "num_input_tokens_seen": 35876864, "step": 11395 }, { "epoch": 0.7297868254273094, "grad_norm": 23.241865158081055, "learning_rate": 4.130467104618963e-07, "loss": 0.3272, "num_input_tokens_seen": 35893568, "step": 11400 }, { "epoch": 0.7301069073682863, "grad_norm": 27.966672897338867, "learning_rate": 4.1214243246182223e-07, "loss": 0.3336, "num_input_tokens_seen": 35909696, "step": 11405 }, { "epoch": 0.7304269893092632, "grad_norm": 27.110546112060547, "learning_rate": 4.1123888840882306e-07, "loss": 0.465, "num_input_tokens_seen": 35925120, "step": 11410 }, { "epoch": 0.7307470712502401, "grad_norm": 27.762094497680664, "learning_rate": 4.1033607943098415e-07, "loss": 0.3184, "num_input_tokens_seen": 35940800, "step": 11415 }, { "epoch": 0.731067153191217, "grad_norm": 13.206759452819824, "learning_rate": 4.0943400665547423e-07, "loss": 0.3461, "num_input_tokens_seen": 35955968, "step": 11420 }, { "epoch": 0.7313872351321938, "grad_norm": 45.63411331176758, "learning_rate": 4.0853267120854064e-07, "loss": 0.3261, "num_input_tokens_seen": 35972096, "step": 11425 }, { "epoch": 0.7317073170731707, "grad_norm": 22.173538208007812, "learning_rate": 4.076320742155117e-07, "loss": 0.3358, "num_input_tokens_seen": 35986624, "step": 11430 }, { "epoch": 0.7320273990141476, "grad_norm": 12.858855247497559, "learning_rate": 4.067322168007928e-07, "loss": 0.3546, "num_input_tokens_seen": 36003008, "step": 11435 }, { "epoch": 0.7323474809551245, "grad_norm": 24.93453598022461, "learning_rate": 4.0583310008786775e-07, "loss": 0.3539, "num_input_tokens_seen": 36017152, "step": 11440 }, { "epoch": 0.7326675628961014, "grad_norm": 42.7269287109375, "learning_rate": 4.049347251992932e-07, "loss": 0.2777, "num_input_tokens_seen": 36031936, "step": 11445 }, { "epoch": 0.7329876448370783, "grad_norm": 27.098237991333008, "learning_rate": 4.0403709325670064e-07, "loss": 0.3461, "num_input_tokens_seen": 36048064, "step": 11450 }, { "epoch": 0.7333077267780552, "grad_norm": 55.99066162109375, "learning_rate": 4.03140205380795e-07, "loss": 0.4433, "num_input_tokens_seen": 36064256, "step": 11455 }, { "epoch": 0.7336278087190321, "grad_norm": 65.33406066894531, "learning_rate": 4.0224406269135115e-07, "loss": 0.6545, "num_input_tokens_seen": 36079424, "step": 11460 }, { "epoch": 0.733947890660009, "grad_norm": 45.920005798339844, "learning_rate": 4.0134866630721266e-07, "loss": 0.3062, "num_input_tokens_seen": 36095424, "step": 11465 }, { "epoch": 0.7342679726009859, "grad_norm": 24.08492660522461, "learning_rate": 4.0045401734629367e-07, "loss": 0.3666, "num_input_tokens_seen": 36111360, "step": 11470 }, { "epoch": 0.7345880545419627, "grad_norm": 25.49542236328125, "learning_rate": 3.9956011692557377e-07, "loss": 0.3819, "num_input_tokens_seen": 36127232, "step": 11475 }, { "epoch": 0.7349081364829396, "grad_norm": 50.120731353759766, "learning_rate": 3.986669661610972e-07, "loss": 0.3447, "num_input_tokens_seen": 36143168, "step": 11480 }, { "epoch": 0.7352282184239165, "grad_norm": 34.99326705932617, "learning_rate": 3.9777456616797414e-07, "loss": 0.329, "num_input_tokens_seen": 36158272, "step": 11485 }, { "epoch": 0.7355483003648934, "grad_norm": 53.85727310180664, "learning_rate": 3.968829180603761e-07, "loss": 0.3544, "num_input_tokens_seen": 36173056, "step": 11490 }, { "epoch": 0.7358683823058703, "grad_norm": 48.56296920776367, "learning_rate": 3.9599202295153624e-07, "loss": 0.4025, "num_input_tokens_seen": 36187904, "step": 11495 }, { "epoch": 0.7361884642468471, "grad_norm": 70.58976745605469, "learning_rate": 3.951018819537476e-07, "loss": 0.3587, "num_input_tokens_seen": 36205632, "step": 11500 }, { "epoch": 0.7365085461878241, "grad_norm": 33.671356201171875, "learning_rate": 3.942124961783616e-07, "loss": 0.3492, "num_input_tokens_seen": 36220160, "step": 11505 }, { "epoch": 0.736828628128801, "grad_norm": 27.56850242614746, "learning_rate": 3.933238667357869e-07, "loss": 0.3096, "num_input_tokens_seen": 36236416, "step": 11510 }, { "epoch": 0.7371487100697779, "grad_norm": 41.809757232666016, "learning_rate": 3.924359947354876e-07, "loss": 0.3546, "num_input_tokens_seen": 36251584, "step": 11515 }, { "epoch": 0.7374687920107548, "grad_norm": 13.83644962310791, "learning_rate": 3.915488812859826e-07, "loss": 0.3261, "num_input_tokens_seen": 36265856, "step": 11520 }, { "epoch": 0.7377888739517316, "grad_norm": 58.69389724731445, "learning_rate": 3.90662527494843e-07, "loss": 0.3797, "num_input_tokens_seen": 36283904, "step": 11525 }, { "epoch": 0.7381089558927085, "grad_norm": 34.365379333496094, "learning_rate": 3.8977693446869285e-07, "loss": 0.3638, "num_input_tokens_seen": 36298432, "step": 11530 }, { "epoch": 0.7384290378336854, "grad_norm": 28.92525291442871, "learning_rate": 3.8889210331320445e-07, "loss": 0.3298, "num_input_tokens_seen": 36313728, "step": 11535 }, { "epoch": 0.7387491197746623, "grad_norm": 20.923290252685547, "learning_rate": 3.8800803513310033e-07, "loss": 0.3795, "num_input_tokens_seen": 36329088, "step": 11540 }, { "epoch": 0.7390692017156392, "grad_norm": 37.18941116333008, "learning_rate": 3.8712473103214993e-07, "loss": 0.4125, "num_input_tokens_seen": 36345024, "step": 11545 }, { "epoch": 0.739389283656616, "grad_norm": 21.793880462646484, "learning_rate": 3.862421921131688e-07, "loss": 0.3077, "num_input_tokens_seen": 36361792, "step": 11550 }, { "epoch": 0.739709365597593, "grad_norm": 39.02511978149414, "learning_rate": 3.85360419478017e-07, "loss": 0.2844, "num_input_tokens_seen": 36377152, "step": 11555 }, { "epoch": 0.7400294475385699, "grad_norm": 22.608049392700195, "learning_rate": 3.8447941422759786e-07, "loss": 0.346, "num_input_tokens_seen": 36394048, "step": 11560 }, { "epoch": 0.7403495294795468, "grad_norm": 31.372352600097656, "learning_rate": 3.835991774618579e-07, "loss": 0.3546, "num_input_tokens_seen": 36409152, "step": 11565 }, { "epoch": 0.7406696114205237, "grad_norm": 89.33686065673828, "learning_rate": 3.827197102797818e-07, "loss": 0.3882, "num_input_tokens_seen": 36427072, "step": 11570 }, { "epoch": 0.7409896933615006, "grad_norm": 60.6555290222168, "learning_rate": 3.818410137793947e-07, "loss": 0.4667, "num_input_tokens_seen": 36444288, "step": 11575 }, { "epoch": 0.7413097753024774, "grad_norm": 17.558565139770508, "learning_rate": 3.809630890577602e-07, "loss": 0.4323, "num_input_tokens_seen": 36460096, "step": 11580 }, { "epoch": 0.7416298572434543, "grad_norm": 123.81878662109375, "learning_rate": 3.800859372109777e-07, "loss": 0.3414, "num_input_tokens_seen": 36475264, "step": 11585 }, { "epoch": 0.7419499391844312, "grad_norm": 19.393999099731445, "learning_rate": 3.7920955933418055e-07, "loss": 0.3205, "num_input_tokens_seen": 36491264, "step": 11590 }, { "epoch": 0.7422700211254081, "grad_norm": 45.717002868652344, "learning_rate": 3.7833395652153775e-07, "loss": 0.3158, "num_input_tokens_seen": 36506368, "step": 11595 }, { "epoch": 0.742590103066385, "grad_norm": 36.06786346435547, "learning_rate": 3.774591298662497e-07, "loss": 0.2953, "num_input_tokens_seen": 36522432, "step": 11600 }, { "epoch": 0.7429101850073618, "grad_norm": 68.29784393310547, "learning_rate": 3.765850804605468e-07, "loss": 0.4255, "num_input_tokens_seen": 36539008, "step": 11605 }, { "epoch": 0.7432302669483388, "grad_norm": 23.243270874023438, "learning_rate": 3.7571180939569104e-07, "loss": 0.2863, "num_input_tokens_seen": 36554240, "step": 11610 }, { "epoch": 0.7435503488893157, "grad_norm": 33.98516082763672, "learning_rate": 3.748393177619711e-07, "loss": 0.308, "num_input_tokens_seen": 36569920, "step": 11615 }, { "epoch": 0.7438704308302926, "grad_norm": 32.500240325927734, "learning_rate": 3.739676066487032e-07, "loss": 0.3273, "num_input_tokens_seen": 36585792, "step": 11620 }, { "epoch": 0.7441905127712695, "grad_norm": 21.474756240844727, "learning_rate": 3.730966771442289e-07, "loss": 0.2906, "num_input_tokens_seen": 36601280, "step": 11625 }, { "epoch": 0.7445105947122463, "grad_norm": 34.38766860961914, "learning_rate": 3.722265303359137e-07, "loss": 0.5193, "num_input_tokens_seen": 36617152, "step": 11630 }, { "epoch": 0.7448306766532232, "grad_norm": 57.639156341552734, "learning_rate": 3.713571673101463e-07, "loss": 0.4, "num_input_tokens_seen": 36632512, "step": 11635 }, { "epoch": 0.7451507585942001, "grad_norm": 13.589853286743164, "learning_rate": 3.704885891523366e-07, "loss": 0.3338, "num_input_tokens_seen": 36647744, "step": 11640 }, { "epoch": 0.745470840535177, "grad_norm": 36.2945442199707, "learning_rate": 3.696207969469146e-07, "loss": 0.3878, "num_input_tokens_seen": 36663360, "step": 11645 }, { "epoch": 0.7457909224761539, "grad_norm": 28.11053466796875, "learning_rate": 3.6875379177732913e-07, "loss": 0.3571, "num_input_tokens_seen": 36678656, "step": 11650 }, { "epoch": 0.7461110044171307, "grad_norm": 71.32048034667969, "learning_rate": 3.6788757472604634e-07, "loss": 0.4971, "num_input_tokens_seen": 36693952, "step": 11655 }, { "epoch": 0.7464310863581076, "grad_norm": 34.13432312011719, "learning_rate": 3.6702214687454825e-07, "loss": 0.3139, "num_input_tokens_seen": 36709888, "step": 11660 }, { "epoch": 0.7467511682990846, "grad_norm": 26.685256958007812, "learning_rate": 3.6615750930333177e-07, "loss": 0.3103, "num_input_tokens_seen": 36725504, "step": 11665 }, { "epoch": 0.7470712502400615, "grad_norm": 11.093647956848145, "learning_rate": 3.65293663091907e-07, "loss": 0.3055, "num_input_tokens_seen": 36741376, "step": 11670 }, { "epoch": 0.7473913321810384, "grad_norm": 34.28535461425781, "learning_rate": 3.6443060931879623e-07, "loss": 0.4277, "num_input_tokens_seen": 36756864, "step": 11675 }, { "epoch": 0.7477114141220152, "grad_norm": 28.244558334350586, "learning_rate": 3.635683490615321e-07, "loss": 0.4503, "num_input_tokens_seen": 36772608, "step": 11680 }, { "epoch": 0.7480314960629921, "grad_norm": 76.50003051757812, "learning_rate": 3.6270688339665634e-07, "loss": 0.2975, "num_input_tokens_seen": 36788352, "step": 11685 }, { "epoch": 0.748351578003969, "grad_norm": 38.5800666809082, "learning_rate": 3.6184621339972e-07, "loss": 0.3444, "num_input_tokens_seen": 36804096, "step": 11690 }, { "epoch": 0.7486716599449459, "grad_norm": 42.00413131713867, "learning_rate": 3.609863401452786e-07, "loss": 0.3568, "num_input_tokens_seen": 36819776, "step": 11695 }, { "epoch": 0.7489917418859228, "grad_norm": 37.22871017456055, "learning_rate": 3.6012726470689416e-07, "loss": 0.4084, "num_input_tokens_seen": 36835072, "step": 11700 }, { "epoch": 0.7493118238268996, "grad_norm": 25.7962646484375, "learning_rate": 3.592689881571329e-07, "loss": 0.3318, "num_input_tokens_seen": 36850816, "step": 11705 }, { "epoch": 0.7496319057678765, "grad_norm": 36.8912467956543, "learning_rate": 3.5841151156756334e-07, "loss": 0.4348, "num_input_tokens_seen": 36866368, "step": 11710 }, { "epoch": 0.7499519877088535, "grad_norm": 39.81080627441406, "learning_rate": 3.575548360087539e-07, "loss": 0.3994, "num_input_tokens_seen": 36885376, "step": 11715 }, { "epoch": 0.7502720696498304, "grad_norm": 17.17061996459961, "learning_rate": 3.5669896255027533e-07, "loss": 0.3173, "num_input_tokens_seen": 36900288, "step": 11720 }, { "epoch": 0.7505921515908073, "grad_norm": 16.3179988861084, "learning_rate": 3.5584389226069543e-07, "loss": 0.4035, "num_input_tokens_seen": 36916224, "step": 11725 }, { "epoch": 0.7509122335317842, "grad_norm": 18.672239303588867, "learning_rate": 3.5498962620757866e-07, "loss": 0.2995, "num_input_tokens_seen": 36931648, "step": 11730 }, { "epoch": 0.7509122335317842, "eval_loss": 0.3647865653038025, "eval_runtime": 49.1603, "eval_samples_per_second": 282.464, "eval_steps_per_second": 35.313, "num_input_tokens_seen": 36931648, "step": 11730 }, { "epoch": 0.751232315472761, "grad_norm": 94.76543426513672, "learning_rate": 3.5413616545748713e-07, "loss": 0.4327, "num_input_tokens_seen": 36945856, "step": 11735 }, { "epoch": 0.7515523974137379, "grad_norm": 24.62285804748535, "learning_rate": 3.532835110759763e-07, "loss": 0.5026, "num_input_tokens_seen": 36961792, "step": 11740 }, { "epoch": 0.7518724793547148, "grad_norm": 24.09138298034668, "learning_rate": 3.524316641275955e-07, "loss": 0.3038, "num_input_tokens_seen": 36977152, "step": 11745 }, { "epoch": 0.7521925612956917, "grad_norm": 19.018442153930664, "learning_rate": 3.5158062567588467e-07, "loss": 0.4152, "num_input_tokens_seen": 36991936, "step": 11750 }, { "epoch": 0.7525126432366686, "grad_norm": 64.08114624023438, "learning_rate": 3.5073039678337633e-07, "loss": 0.3924, "num_input_tokens_seen": 37006784, "step": 11755 }, { "epoch": 0.7528327251776454, "grad_norm": 36.50153732299805, "learning_rate": 3.498809785115908e-07, "loss": 0.348, "num_input_tokens_seen": 37022208, "step": 11760 }, { "epoch": 0.7531528071186223, "grad_norm": 12.052895545959473, "learning_rate": 3.4903237192103697e-07, "loss": 0.3504, "num_input_tokens_seen": 37039488, "step": 11765 }, { "epoch": 0.7534728890595993, "grad_norm": 34.9785270690918, "learning_rate": 3.481845780712099e-07, "loss": 0.3372, "num_input_tokens_seen": 37056064, "step": 11770 }, { "epoch": 0.7537929710005762, "grad_norm": 47.886329650878906, "learning_rate": 3.4733759802059037e-07, "loss": 0.3354, "num_input_tokens_seen": 37072256, "step": 11775 }, { "epoch": 0.7541130529415531, "grad_norm": 68.2834701538086, "learning_rate": 3.4649143282664273e-07, "loss": 0.4239, "num_input_tokens_seen": 37087360, "step": 11780 }, { "epoch": 0.7544331348825299, "grad_norm": 27.56783676147461, "learning_rate": 3.456460835458143e-07, "loss": 0.2992, "num_input_tokens_seen": 37102144, "step": 11785 }, { "epoch": 0.7547532168235068, "grad_norm": 42.50265121459961, "learning_rate": 3.4480155123353337e-07, "loss": 0.3172, "num_input_tokens_seen": 37117568, "step": 11790 }, { "epoch": 0.7550732987644837, "grad_norm": 35.93981170654297, "learning_rate": 3.4395783694420875e-07, "loss": 0.4541, "num_input_tokens_seen": 37132800, "step": 11795 }, { "epoch": 0.7553933807054606, "grad_norm": 20.164365768432617, "learning_rate": 3.4311494173122743e-07, "loss": 0.4009, "num_input_tokens_seen": 37147776, "step": 11800 }, { "epoch": 0.7557134626464375, "grad_norm": 26.284648895263672, "learning_rate": 3.422728666469541e-07, "loss": 0.387, "num_input_tokens_seen": 37163904, "step": 11805 }, { "epoch": 0.7560335445874143, "grad_norm": 42.91219711303711, "learning_rate": 3.41431612742729e-07, "loss": 0.4272, "num_input_tokens_seen": 37180416, "step": 11810 }, { "epoch": 0.7563536265283912, "grad_norm": 23.454986572265625, "learning_rate": 3.4059118106886855e-07, "loss": 0.4243, "num_input_tokens_seen": 37196480, "step": 11815 }, { "epoch": 0.7566737084693682, "grad_norm": 74.44619750976562, "learning_rate": 3.3975157267466036e-07, "loss": 0.5118, "num_input_tokens_seen": 37211648, "step": 11820 }, { "epoch": 0.7569937904103451, "grad_norm": 25.939687728881836, "learning_rate": 3.389127886083656e-07, "loss": 0.29, "num_input_tokens_seen": 37227072, "step": 11825 }, { "epoch": 0.757313872351322, "grad_norm": 24.652931213378906, "learning_rate": 3.3807482991721667e-07, "loss": 0.3415, "num_input_tokens_seen": 37243968, "step": 11830 }, { "epoch": 0.7576339542922989, "grad_norm": 20.54140853881836, "learning_rate": 3.3723769764741474e-07, "loss": 0.3219, "num_input_tokens_seen": 37259200, "step": 11835 }, { "epoch": 0.7579540362332757, "grad_norm": 15.427878379821777, "learning_rate": 3.3640139284412825e-07, "loss": 0.2948, "num_input_tokens_seen": 37275072, "step": 11840 }, { "epoch": 0.7582741181742526, "grad_norm": 42.64249038696289, "learning_rate": 3.355659165514948e-07, "loss": 0.399, "num_input_tokens_seen": 37291392, "step": 11845 }, { "epoch": 0.7585942001152295, "grad_norm": 16.320554733276367, "learning_rate": 3.347312698126161e-07, "loss": 0.2714, "num_input_tokens_seen": 37307648, "step": 11850 }, { "epoch": 0.7589142820562064, "grad_norm": 13.9678316116333, "learning_rate": 3.338974536695578e-07, "loss": 0.2191, "num_input_tokens_seen": 37323136, "step": 11855 }, { "epoch": 0.7592343639971832, "grad_norm": 21.917150497436523, "learning_rate": 3.330644691633492e-07, "loss": 0.3183, "num_input_tokens_seen": 37338496, "step": 11860 }, { "epoch": 0.7595544459381601, "grad_norm": 10.6149320602417, "learning_rate": 3.322323173339818e-07, "loss": 0.2783, "num_input_tokens_seen": 37356800, "step": 11865 }, { "epoch": 0.759874527879137, "grad_norm": 25.766250610351562, "learning_rate": 3.314009992204071e-07, "loss": 0.4264, "num_input_tokens_seen": 37372800, "step": 11870 }, { "epoch": 0.760194609820114, "grad_norm": 66.81485748291016, "learning_rate": 3.3057051586053443e-07, "loss": 0.3269, "num_input_tokens_seen": 37388608, "step": 11875 }, { "epoch": 0.7605146917610909, "grad_norm": 35.36101150512695, "learning_rate": 3.297408682912329e-07, "loss": 0.4584, "num_input_tokens_seen": 37405184, "step": 11880 }, { "epoch": 0.7608347737020678, "grad_norm": 21.154664993286133, "learning_rate": 3.289120575483271e-07, "loss": 0.2741, "num_input_tokens_seen": 37420096, "step": 11885 }, { "epoch": 0.7611548556430446, "grad_norm": 31.978300094604492, "learning_rate": 3.280840846665969e-07, "loss": 0.4214, "num_input_tokens_seen": 37434368, "step": 11890 }, { "epoch": 0.7614749375840215, "grad_norm": 29.54779052734375, "learning_rate": 3.272569506797761e-07, "loss": 0.3005, "num_input_tokens_seen": 37449344, "step": 11895 }, { "epoch": 0.7617950195249984, "grad_norm": 28.143238067626953, "learning_rate": 3.2643065662055136e-07, "loss": 0.3314, "num_input_tokens_seen": 37464448, "step": 11900 }, { "epoch": 0.7621151014659753, "grad_norm": 69.54246520996094, "learning_rate": 3.2560520352056033e-07, "loss": 0.2837, "num_input_tokens_seen": 37481856, "step": 11905 }, { "epoch": 0.7624351834069522, "grad_norm": 18.128210067749023, "learning_rate": 3.24780592410391e-07, "loss": 0.3985, "num_input_tokens_seen": 37497856, "step": 11910 }, { "epoch": 0.762755265347929, "grad_norm": 39.83074188232422, "learning_rate": 3.2395682431957994e-07, "loss": 0.4494, "num_input_tokens_seen": 37513600, "step": 11915 }, { "epoch": 0.7630753472889059, "grad_norm": 32.585750579833984, "learning_rate": 3.231339002766115e-07, "loss": 0.324, "num_input_tokens_seen": 37529408, "step": 11920 }, { "epoch": 0.7633954292298829, "grad_norm": 30.76116371154785, "learning_rate": 3.2231182130891564e-07, "loss": 0.3296, "num_input_tokens_seen": 37545984, "step": 11925 }, { "epoch": 0.7637155111708598, "grad_norm": 59.110801696777344, "learning_rate": 3.214905884428679e-07, "loss": 0.3405, "num_input_tokens_seen": 37561856, "step": 11930 }, { "epoch": 0.7640355931118367, "grad_norm": 29.65723991394043, "learning_rate": 3.206702027037868e-07, "loss": 0.3253, "num_input_tokens_seen": 37578624, "step": 11935 }, { "epoch": 0.7643556750528135, "grad_norm": 43.48826599121094, "learning_rate": 3.198506651159344e-07, "loss": 0.3882, "num_input_tokens_seen": 37593920, "step": 11940 }, { "epoch": 0.7646757569937904, "grad_norm": 23.43718147277832, "learning_rate": 3.190319767025121e-07, "loss": 0.38, "num_input_tokens_seen": 37609664, "step": 11945 }, { "epoch": 0.7649958389347673, "grad_norm": 59.76777267456055, "learning_rate": 3.1821413848566213e-07, "loss": 0.4989, "num_input_tokens_seen": 37626048, "step": 11950 }, { "epoch": 0.7653159208757442, "grad_norm": 17.83317756652832, "learning_rate": 3.1739715148646564e-07, "loss": 0.3798, "num_input_tokens_seen": 37641792, "step": 11955 }, { "epoch": 0.7656360028167211, "grad_norm": 51.09782409667969, "learning_rate": 3.1658101672494043e-07, "loss": 0.4583, "num_input_tokens_seen": 37656512, "step": 11960 }, { "epoch": 0.7659560847576979, "grad_norm": 46.76288604736328, "learning_rate": 3.157657352200397e-07, "loss": 0.3527, "num_input_tokens_seen": 37672000, "step": 11965 }, { "epoch": 0.7662761666986748, "grad_norm": 41.273860931396484, "learning_rate": 3.149513079896521e-07, "loss": 0.3362, "num_input_tokens_seen": 37687232, "step": 11970 }, { "epoch": 0.7665962486396517, "grad_norm": 19.319063186645508, "learning_rate": 3.1413773605060034e-07, "loss": 0.3244, "num_input_tokens_seen": 37702656, "step": 11975 }, { "epoch": 0.7669163305806287, "grad_norm": 65.8237533569336, "learning_rate": 3.1332502041863783e-07, "loss": 0.4343, "num_input_tokens_seen": 37718080, "step": 11980 }, { "epoch": 0.7672364125216056, "grad_norm": 29.25933837890625, "learning_rate": 3.1251316210844946e-07, "loss": 0.3141, "num_input_tokens_seen": 37735680, "step": 11985 }, { "epoch": 0.7675564944625825, "grad_norm": 51.57158660888672, "learning_rate": 3.1170216213365055e-07, "loss": 0.2871, "num_input_tokens_seen": 37749952, "step": 11990 }, { "epoch": 0.7678765764035593, "grad_norm": 34.71276092529297, "learning_rate": 3.1089202150678397e-07, "loss": 0.4607, "num_input_tokens_seen": 37765312, "step": 11995 }, { "epoch": 0.7681966583445362, "grad_norm": 49.59117126464844, "learning_rate": 3.1008274123931886e-07, "loss": 0.4695, "num_input_tokens_seen": 37780160, "step": 12000 }, { "epoch": 0.7685167402855131, "grad_norm": 25.49561309814453, "learning_rate": 3.092743223416523e-07, "loss": 0.2672, "num_input_tokens_seen": 37796352, "step": 12005 }, { "epoch": 0.76883682222649, "grad_norm": 59.26298522949219, "learning_rate": 3.0846676582310413e-07, "loss": 0.3499, "num_input_tokens_seen": 37812864, "step": 12010 }, { "epoch": 0.7691569041674668, "grad_norm": 43.80664825439453, "learning_rate": 3.076600726919185e-07, "loss": 0.3824, "num_input_tokens_seen": 37827840, "step": 12015 }, { "epoch": 0.7694769861084437, "grad_norm": 31.392080307006836, "learning_rate": 3.0685424395526106e-07, "loss": 0.3579, "num_input_tokens_seen": 37847040, "step": 12020 }, { "epoch": 0.7697970680494206, "grad_norm": 37.12458419799805, "learning_rate": 3.060492806192184e-07, "loss": 0.2819, "num_input_tokens_seen": 37862464, "step": 12025 }, { "epoch": 0.7701171499903975, "grad_norm": 36.16139221191406, "learning_rate": 3.052451836887968e-07, "loss": 0.377, "num_input_tokens_seen": 37877760, "step": 12030 }, { "epoch": 0.7704372319313745, "grad_norm": 28.426408767700195, "learning_rate": 3.044419541679207e-07, "loss": 0.2861, "num_input_tokens_seen": 37892800, "step": 12035 }, { "epoch": 0.7707573138723514, "grad_norm": 62.23591232299805, "learning_rate": 3.0363959305943153e-07, "loss": 0.4239, "num_input_tokens_seen": 37909056, "step": 12040 }, { "epoch": 0.7710773958133282, "grad_norm": 42.053489685058594, "learning_rate": 3.028381013650867e-07, "loss": 0.348, "num_input_tokens_seen": 37925376, "step": 12045 }, { "epoch": 0.7713974777543051, "grad_norm": 37.59280014038086, "learning_rate": 3.0203748008555783e-07, "loss": 0.3716, "num_input_tokens_seen": 37941632, "step": 12050 }, { "epoch": 0.771717559695282, "grad_norm": 41.64907455444336, "learning_rate": 3.012377302204301e-07, "loss": 0.3805, "num_input_tokens_seen": 37957056, "step": 12055 }, { "epoch": 0.7720376416362589, "grad_norm": 46.065406799316406, "learning_rate": 3.0043885276820046e-07, "loss": 0.3916, "num_input_tokens_seen": 37973184, "step": 12060 }, { "epoch": 0.7723577235772358, "grad_norm": 24.32598304748535, "learning_rate": 2.99640848726277e-07, "loss": 0.3087, "num_input_tokens_seen": 37988288, "step": 12065 }, { "epoch": 0.7726778055182126, "grad_norm": 23.855104446411133, "learning_rate": 2.9884371909097704e-07, "loss": 0.3812, "num_input_tokens_seen": 38004224, "step": 12070 }, { "epoch": 0.7729978874591895, "grad_norm": 22.65608787536621, "learning_rate": 2.9804746485752616e-07, "loss": 0.3711, "num_input_tokens_seen": 38019456, "step": 12075 }, { "epoch": 0.7733179694001664, "grad_norm": 25.479469299316406, "learning_rate": 2.972520870200573e-07, "loss": 0.4058, "num_input_tokens_seen": 38035264, "step": 12080 }, { "epoch": 0.7736380513411434, "grad_norm": 31.957597732543945, "learning_rate": 2.9645758657160904e-07, "loss": 0.4045, "num_input_tokens_seen": 38051072, "step": 12085 }, { "epoch": 0.7739581332821203, "grad_norm": 16.966663360595703, "learning_rate": 2.9566396450412444e-07, "loss": 0.3538, "num_input_tokens_seen": 38066688, "step": 12090 }, { "epoch": 0.7742782152230971, "grad_norm": 22.1097354888916, "learning_rate": 2.9487122180844957e-07, "loss": 0.3193, "num_input_tokens_seen": 38082048, "step": 12095 }, { "epoch": 0.774598297164074, "grad_norm": 58.488800048828125, "learning_rate": 2.9407935947433406e-07, "loss": 0.2996, "num_input_tokens_seen": 38097344, "step": 12100 }, { "epoch": 0.7749183791050509, "grad_norm": 48.314144134521484, "learning_rate": 2.932883784904264e-07, "loss": 0.446, "num_input_tokens_seen": 38112320, "step": 12105 }, { "epoch": 0.7752384610460278, "grad_norm": 19.5347843170166, "learning_rate": 2.9249827984427555e-07, "loss": 0.2475, "num_input_tokens_seen": 38128000, "step": 12110 }, { "epoch": 0.7755585429870047, "grad_norm": 29.27086639404297, "learning_rate": 2.917090645223297e-07, "loss": 0.3015, "num_input_tokens_seen": 38143168, "step": 12115 }, { "epoch": 0.7758786249279815, "grad_norm": 27.007768630981445, "learning_rate": 2.909207335099332e-07, "loss": 0.2912, "num_input_tokens_seen": 38157824, "step": 12120 }, { "epoch": 0.7761987068689584, "grad_norm": 31.75836181640625, "learning_rate": 2.9013328779132595e-07, "loss": 0.3332, "num_input_tokens_seen": 38172864, "step": 12125 }, { "epoch": 0.7765187888099353, "grad_norm": 115.25257110595703, "learning_rate": 2.893467283496439e-07, "loss": 0.425, "num_input_tokens_seen": 38187264, "step": 12130 }, { "epoch": 0.7768388707509122, "grad_norm": 19.299240112304688, "learning_rate": 2.885610561669155e-07, "loss": 0.3551, "num_input_tokens_seen": 38204288, "step": 12135 }, { "epoch": 0.7771589526918892, "grad_norm": 28.34507179260254, "learning_rate": 2.8777627222406163e-07, "loss": 0.3462, "num_input_tokens_seen": 38219264, "step": 12140 }, { "epoch": 0.777479034632866, "grad_norm": 40.6217041015625, "learning_rate": 2.869923775008943e-07, "loss": 0.3863, "num_input_tokens_seen": 38234496, "step": 12145 }, { "epoch": 0.7777991165738429, "grad_norm": 37.57053756713867, "learning_rate": 2.862093729761155e-07, "loss": 0.2559, "num_input_tokens_seen": 38251072, "step": 12150 }, { "epoch": 0.7781191985148198, "grad_norm": 28.284217834472656, "learning_rate": 2.854272596273152e-07, "loss": 0.4049, "num_input_tokens_seen": 38266560, "step": 12155 }, { "epoch": 0.7784392804557967, "grad_norm": 43.39320373535156, "learning_rate": 2.8464603843097134e-07, "loss": 0.3287, "num_input_tokens_seen": 38282944, "step": 12160 }, { "epoch": 0.7787593623967736, "grad_norm": 32.42449951171875, "learning_rate": 2.8386571036244764e-07, "loss": 0.3291, "num_input_tokens_seen": 38299264, "step": 12165 }, { "epoch": 0.7790794443377504, "grad_norm": 51.791812896728516, "learning_rate": 2.830862763959929e-07, "loss": 0.39, "num_input_tokens_seen": 38314368, "step": 12170 }, { "epoch": 0.7793995262787273, "grad_norm": 10.4609956741333, "learning_rate": 2.8230773750473956e-07, "loss": 0.3154, "num_input_tokens_seen": 38329664, "step": 12175 }, { "epoch": 0.7797196082197042, "grad_norm": 27.046852111816406, "learning_rate": 2.8153009466070267e-07, "loss": 0.3072, "num_input_tokens_seen": 38345408, "step": 12180 }, { "epoch": 0.7800396901606811, "grad_norm": 32.581607818603516, "learning_rate": 2.807533488347783e-07, "loss": 0.2878, "num_input_tokens_seen": 38362688, "step": 12185 }, { "epoch": 0.7803597721016581, "grad_norm": 23.63336944580078, "learning_rate": 2.7997750099674277e-07, "loss": 0.2548, "num_input_tokens_seen": 38377600, "step": 12190 }, { "epoch": 0.780679854042635, "grad_norm": 36.57121276855469, "learning_rate": 2.792025521152512e-07, "loss": 0.5286, "num_input_tokens_seen": 38392640, "step": 12195 }, { "epoch": 0.7809999359836118, "grad_norm": 34.91606521606445, "learning_rate": 2.784285031578365e-07, "loss": 0.4496, "num_input_tokens_seen": 38408448, "step": 12200 }, { "epoch": 0.7813200179245887, "grad_norm": 26.795875549316406, "learning_rate": 2.7765535509090786e-07, "loss": 0.3629, "num_input_tokens_seen": 38424512, "step": 12205 }, { "epoch": 0.7816400998655656, "grad_norm": 29.603397369384766, "learning_rate": 2.768831088797495e-07, "loss": 0.4739, "num_input_tokens_seen": 38439296, "step": 12210 }, { "epoch": 0.7819601818065425, "grad_norm": 15.58344554901123, "learning_rate": 2.761117654885201e-07, "loss": 0.2482, "num_input_tokens_seen": 38455424, "step": 12215 }, { "epoch": 0.7822802637475194, "grad_norm": 24.491289138793945, "learning_rate": 2.7534132588025063e-07, "loss": 0.3265, "num_input_tokens_seen": 38470976, "step": 12220 }, { "epoch": 0.7826003456884962, "grad_norm": 27.425262451171875, "learning_rate": 2.7457179101684483e-07, "loss": 0.5075, "num_input_tokens_seen": 38486016, "step": 12225 }, { "epoch": 0.7829204276294731, "grad_norm": 22.376157760620117, "learning_rate": 2.7380316185907506e-07, "loss": 0.298, "num_input_tokens_seen": 38501248, "step": 12230 }, { "epoch": 0.78324050957045, "grad_norm": 19.046939849853516, "learning_rate": 2.730354393665839e-07, "loss": 0.3503, "num_input_tokens_seen": 38516992, "step": 12235 }, { "epoch": 0.7835605915114269, "grad_norm": 27.88618278503418, "learning_rate": 2.7226862449788245e-07, "loss": 0.3702, "num_input_tokens_seen": 38531456, "step": 12240 }, { "epoch": 0.7838806734524039, "grad_norm": 34.346378326416016, "learning_rate": 2.715027182103482e-07, "loss": 0.3264, "num_input_tokens_seen": 38546880, "step": 12245 }, { "epoch": 0.7842007553933807, "grad_norm": 20.54593276977539, "learning_rate": 2.707377214602232e-07, "loss": 0.3039, "num_input_tokens_seen": 38562176, "step": 12250 }, { "epoch": 0.7845208373343576, "grad_norm": 37.601043701171875, "learning_rate": 2.699736352026157e-07, "loss": 0.4366, "num_input_tokens_seen": 38577472, "step": 12255 }, { "epoch": 0.7848409192753345, "grad_norm": 22.17053985595703, "learning_rate": 2.6921046039149645e-07, "loss": 0.3297, "num_input_tokens_seen": 38593088, "step": 12260 }, { "epoch": 0.7851610012163114, "grad_norm": 31.56439208984375, "learning_rate": 2.6844819797969744e-07, "loss": 0.3408, "num_input_tokens_seen": 38607936, "step": 12265 }, { "epoch": 0.7854810831572883, "grad_norm": 40.473628997802734, "learning_rate": 2.6768684891891236e-07, "loss": 0.2481, "num_input_tokens_seen": 38625024, "step": 12270 }, { "epoch": 0.7858011650982651, "grad_norm": 30.89264678955078, "learning_rate": 2.6692641415969497e-07, "loss": 0.3321, "num_input_tokens_seen": 38641792, "step": 12275 }, { "epoch": 0.786121247039242, "grad_norm": 47.64722442626953, "learning_rate": 2.66166894651457e-07, "loss": 0.395, "num_input_tokens_seen": 38656896, "step": 12280 }, { "epoch": 0.7864413289802189, "grad_norm": 43.44092559814453, "learning_rate": 2.654082913424668e-07, "loss": 0.3426, "num_input_tokens_seen": 38672448, "step": 12285 }, { "epoch": 0.7867614109211958, "grad_norm": 27.422563552856445, "learning_rate": 2.6465060517985003e-07, "loss": 0.3016, "num_input_tokens_seen": 38688576, "step": 12290 }, { "epoch": 0.7870814928621728, "grad_norm": 44.733848571777344, "learning_rate": 2.638938371095867e-07, "loss": 0.5123, "num_input_tokens_seen": 38704064, "step": 12295 }, { "epoch": 0.7874015748031497, "grad_norm": 20.204547882080078, "learning_rate": 2.6313798807651065e-07, "loss": 0.381, "num_input_tokens_seen": 38718976, "step": 12300 }, { "epoch": 0.7877216567441265, "grad_norm": 17.738218307495117, "learning_rate": 2.6238305902430813e-07, "loss": 0.3529, "num_input_tokens_seen": 38734272, "step": 12305 }, { "epoch": 0.7880417386851034, "grad_norm": 14.163119316101074, "learning_rate": 2.61629050895517e-07, "loss": 0.307, "num_input_tokens_seen": 38749504, "step": 12310 }, { "epoch": 0.7883618206260803, "grad_norm": 27.0414981842041, "learning_rate": 2.608759646315253e-07, "loss": 0.3171, "num_input_tokens_seen": 38764352, "step": 12315 }, { "epoch": 0.7886819025670572, "grad_norm": 18.21839714050293, "learning_rate": 2.6012380117257005e-07, "loss": 0.3637, "num_input_tokens_seen": 38780096, "step": 12320 }, { "epoch": 0.789001984508034, "grad_norm": 33.14684295654297, "learning_rate": 2.5937256145773613e-07, "loss": 0.3902, "num_input_tokens_seen": 38795712, "step": 12325 }, { "epoch": 0.7893220664490109, "grad_norm": 39.35667037963867, "learning_rate": 2.586222464249551e-07, "loss": 0.3264, "num_input_tokens_seen": 38811328, "step": 12330 }, { "epoch": 0.7896421483899878, "grad_norm": 27.116695404052734, "learning_rate": 2.5787285701100413e-07, "loss": 0.2022, "num_input_tokens_seen": 38826240, "step": 12335 }, { "epoch": 0.7899622303309647, "grad_norm": 37.62165832519531, "learning_rate": 2.571243941515048e-07, "loss": 0.3672, "num_input_tokens_seen": 38842624, "step": 12340 }, { "epoch": 0.7902823122719416, "grad_norm": 22.701847076416016, "learning_rate": 2.563768587809213e-07, "loss": 0.2672, "num_input_tokens_seen": 38857472, "step": 12345 }, { "epoch": 0.7906023942129186, "grad_norm": 60.98664855957031, "learning_rate": 2.5563025183256137e-07, "loss": 0.4118, "num_input_tokens_seen": 38872256, "step": 12350 }, { "epoch": 0.7909224761538954, "grad_norm": 38.52484893798828, "learning_rate": 2.548845742385717e-07, "loss": 0.552, "num_input_tokens_seen": 38890048, "step": 12355 }, { "epoch": 0.7912425580948723, "grad_norm": 38.10274887084961, "learning_rate": 2.541398269299393e-07, "loss": 0.2356, "num_input_tokens_seen": 38905664, "step": 12360 }, { "epoch": 0.7915626400358492, "grad_norm": 12.663208961486816, "learning_rate": 2.5339601083649063e-07, "loss": 0.2978, "num_input_tokens_seen": 38926144, "step": 12365 }, { "epoch": 0.7918827219768261, "grad_norm": 34.63762283325195, "learning_rate": 2.526531268868889e-07, "loss": 0.4751, "num_input_tokens_seen": 38942720, "step": 12370 }, { "epoch": 0.792202803917803, "grad_norm": 25.847164154052734, "learning_rate": 2.5191117600863266e-07, "loss": 0.3397, "num_input_tokens_seen": 38958144, "step": 12375 }, { "epoch": 0.7925228858587798, "grad_norm": 20.030961990356445, "learning_rate": 2.511701591280565e-07, "loss": 0.2568, "num_input_tokens_seen": 38973376, "step": 12380 }, { "epoch": 0.7928429677997567, "grad_norm": 37.385189056396484, "learning_rate": 2.504300771703295e-07, "loss": 0.346, "num_input_tokens_seen": 38989504, "step": 12385 }, { "epoch": 0.7931630497407336, "grad_norm": 72.55767822265625, "learning_rate": 2.496909310594517e-07, "loss": 0.3626, "num_input_tokens_seen": 39005056, "step": 12390 }, { "epoch": 0.7934831316817105, "grad_norm": 40.421688079833984, "learning_rate": 2.4895272171825587e-07, "loss": 0.4459, "num_input_tokens_seen": 39020608, "step": 12395 }, { "epoch": 0.7938032136226874, "grad_norm": 32.116249084472656, "learning_rate": 2.482154500684055e-07, "loss": 0.443, "num_input_tokens_seen": 39035712, "step": 12400 }, { "epoch": 0.7941232955636643, "grad_norm": 25.23982048034668, "learning_rate": 2.4747911703039293e-07, "loss": 0.3361, "num_input_tokens_seen": 39050880, "step": 12405 }, { "epoch": 0.7944433775046412, "grad_norm": 35.13556671142578, "learning_rate": 2.467437235235378e-07, "loss": 0.3689, "num_input_tokens_seen": 39065792, "step": 12410 }, { "epoch": 0.7947634594456181, "grad_norm": 31.368885040283203, "learning_rate": 2.460092704659883e-07, "loss": 0.3418, "num_input_tokens_seen": 39080960, "step": 12415 }, { "epoch": 0.795083541386595, "grad_norm": 16.847009658813477, "learning_rate": 2.452757587747174e-07, "loss": 0.2604, "num_input_tokens_seen": 39097216, "step": 12420 }, { "epoch": 0.7954036233275719, "grad_norm": 23.280132293701172, "learning_rate": 2.445431893655232e-07, "loss": 0.1771, "num_input_tokens_seen": 39113152, "step": 12425 }, { "epoch": 0.7957237052685487, "grad_norm": 35.485782623291016, "learning_rate": 2.438115631530271e-07, "loss": 0.3722, "num_input_tokens_seen": 39130176, "step": 12430 }, { "epoch": 0.7960437872095256, "grad_norm": 28.096521377563477, "learning_rate": 2.4308088105067305e-07, "loss": 0.2283, "num_input_tokens_seen": 39145792, "step": 12435 }, { "epoch": 0.7963638691505025, "grad_norm": 67.06790924072266, "learning_rate": 2.423511439707262e-07, "loss": 0.4201, "num_input_tokens_seen": 39161280, "step": 12440 }, { "epoch": 0.7966839510914794, "grad_norm": 23.704147338867188, "learning_rate": 2.4162235282427177e-07, "loss": 0.2784, "num_input_tokens_seen": 39176512, "step": 12445 }, { "epoch": 0.7970040330324563, "grad_norm": 42.61015319824219, "learning_rate": 2.408945085212144e-07, "loss": 0.3621, "num_input_tokens_seen": 39191808, "step": 12450 }, { "epoch": 0.7973241149734333, "grad_norm": 33.03046417236328, "learning_rate": 2.401676119702759e-07, "loss": 0.2479, "num_input_tokens_seen": 39208640, "step": 12455 }, { "epoch": 0.7976441969144101, "grad_norm": 19.37267303466797, "learning_rate": 2.394416640789952e-07, "loss": 0.3438, "num_input_tokens_seen": 39223232, "step": 12460 }, { "epoch": 0.797964278855387, "grad_norm": 40.43623352050781, "learning_rate": 2.3871666575372696e-07, "loss": 0.3098, "num_input_tokens_seen": 39238656, "step": 12465 }, { "epoch": 0.7982843607963639, "grad_norm": 54.1468505859375, "learning_rate": 2.3799261789963964e-07, "loss": 0.532, "num_input_tokens_seen": 39255872, "step": 12470 }, { "epoch": 0.7986044427373408, "grad_norm": 21.15880584716797, "learning_rate": 2.3726952142071644e-07, "loss": 0.2708, "num_input_tokens_seen": 39270784, "step": 12475 }, { "epoch": 0.7989245246783176, "grad_norm": 41.602508544921875, "learning_rate": 2.365473772197508e-07, "loss": 0.3462, "num_input_tokens_seen": 39286080, "step": 12480 }, { "epoch": 0.7992446066192945, "grad_norm": 33.63953399658203, "learning_rate": 2.3582618619834883e-07, "loss": 0.356, "num_input_tokens_seen": 39301312, "step": 12485 }, { "epoch": 0.7995646885602714, "grad_norm": 16.34864616394043, "learning_rate": 2.3510594925692528e-07, "loss": 0.2216, "num_input_tokens_seen": 39316736, "step": 12490 }, { "epoch": 0.7998847705012483, "grad_norm": 28.48493194580078, "learning_rate": 2.343866672947057e-07, "loss": 0.3493, "num_input_tokens_seen": 39331264, "step": 12495 }, { "epoch": 0.8002048524422252, "grad_norm": 26.142616271972656, "learning_rate": 2.336683412097209e-07, "loss": 0.2587, "num_input_tokens_seen": 39345856, "step": 12500 }, { "epoch": 0.800524934383202, "grad_norm": 23.281526565551758, "learning_rate": 2.329509718988095e-07, "loss": 0.3645, "num_input_tokens_seen": 39361280, "step": 12505 }, { "epoch": 0.800845016324179, "grad_norm": 34.25197982788086, "learning_rate": 2.3223456025761645e-07, "loss": 0.3367, "num_input_tokens_seen": 39375872, "step": 12510 }, { "epoch": 0.8009730491005698, "eval_loss": 0.36358681321144104, "eval_runtime": 49.1621, "eval_samples_per_second": 282.453, "eval_steps_per_second": 35.312, "num_input_tokens_seen": 39382144, "step": 12512 }, { "epoch": 0.8011650982651559, "grad_norm": 20.024723052978516, "learning_rate": 2.315191071805892e-07, "loss": 0.2866, "num_input_tokens_seen": 39392320, "step": 12515 }, { "epoch": 0.8014851802061328, "grad_norm": 63.86294937133789, "learning_rate": 2.3080461356097937e-07, "loss": 0.3619, "num_input_tokens_seen": 39407680, "step": 12520 }, { "epoch": 0.8018052621471097, "grad_norm": 18.46623992919922, "learning_rate": 2.30091080290841e-07, "loss": 0.288, "num_input_tokens_seen": 39424512, "step": 12525 }, { "epoch": 0.8021253440880866, "grad_norm": 45.297523498535156, "learning_rate": 2.29378508261029e-07, "loss": 0.3463, "num_input_tokens_seen": 39439296, "step": 12530 }, { "epoch": 0.8024454260290634, "grad_norm": 53.35750198364258, "learning_rate": 2.2866689836119702e-07, "loss": 0.3707, "num_input_tokens_seen": 39456576, "step": 12535 }, { "epoch": 0.8027655079700403, "grad_norm": 62.54146957397461, "learning_rate": 2.2795625147979913e-07, "loss": 0.3536, "num_input_tokens_seen": 39472512, "step": 12540 }, { "epoch": 0.8030855899110172, "grad_norm": 22.177854537963867, "learning_rate": 2.2724656850408597e-07, "loss": 0.2332, "num_input_tokens_seen": 39488192, "step": 12545 }, { "epoch": 0.8034056718519941, "grad_norm": 42.50724411010742, "learning_rate": 2.2653785032010532e-07, "loss": 0.3855, "num_input_tokens_seen": 39503552, "step": 12550 }, { "epoch": 0.803725753792971, "grad_norm": 38.946964263916016, "learning_rate": 2.258300978126999e-07, "loss": 0.3363, "num_input_tokens_seen": 39519744, "step": 12555 }, { "epoch": 0.804045835733948, "grad_norm": 22.364994049072266, "learning_rate": 2.2512331186550715e-07, "loss": 0.4753, "num_input_tokens_seen": 39535232, "step": 12560 }, { "epoch": 0.8043659176749248, "grad_norm": 44.744346618652344, "learning_rate": 2.244174933609575e-07, "loss": 0.3878, "num_input_tokens_seen": 39549568, "step": 12565 }, { "epoch": 0.8046859996159017, "grad_norm": 27.26950454711914, "learning_rate": 2.2371264318027383e-07, "loss": 0.2764, "num_input_tokens_seen": 39566016, "step": 12570 }, { "epoch": 0.8050060815568786, "grad_norm": 31.31670570373535, "learning_rate": 2.2300876220346975e-07, "loss": 0.2308, "num_input_tokens_seen": 39581760, "step": 12575 }, { "epoch": 0.8053261634978555, "grad_norm": 39.95564651489258, "learning_rate": 2.2230585130934897e-07, "loss": 0.2785, "num_input_tokens_seen": 39597888, "step": 12580 }, { "epoch": 0.8056462454388323, "grad_norm": 23.922866821289062, "learning_rate": 2.2160391137550394e-07, "loss": 0.4454, "num_input_tokens_seen": 39613568, "step": 12585 }, { "epoch": 0.8059663273798092, "grad_norm": 60.24818420410156, "learning_rate": 2.2090294327831494e-07, "loss": 0.4314, "num_input_tokens_seen": 39628096, "step": 12590 }, { "epoch": 0.8062864093207861, "grad_norm": 40.70429992675781, "learning_rate": 2.202029478929488e-07, "loss": 0.2695, "num_input_tokens_seen": 39642560, "step": 12595 }, { "epoch": 0.806606491261763, "grad_norm": 24.328882217407227, "learning_rate": 2.195039260933581e-07, "loss": 0.2967, "num_input_tokens_seen": 39658112, "step": 12600 }, { "epoch": 0.8069265732027399, "grad_norm": 33.61399841308594, "learning_rate": 2.1880587875227973e-07, "loss": 0.2657, "num_input_tokens_seen": 39674112, "step": 12605 }, { "epoch": 0.8072466551437167, "grad_norm": 27.520858764648438, "learning_rate": 2.18108806741234e-07, "loss": 0.3313, "num_input_tokens_seen": 39690432, "step": 12610 }, { "epoch": 0.8075667370846937, "grad_norm": 21.497695922851562, "learning_rate": 2.1741271093052315e-07, "loss": 0.3512, "num_input_tokens_seen": 39705792, "step": 12615 }, { "epoch": 0.8078868190256706, "grad_norm": 50.78917694091797, "learning_rate": 2.167175921892318e-07, "loss": 0.4692, "num_input_tokens_seen": 39722048, "step": 12620 }, { "epoch": 0.8082069009666475, "grad_norm": 26.748119354248047, "learning_rate": 2.1602345138522314e-07, "loss": 0.4239, "num_input_tokens_seen": 39738304, "step": 12625 }, { "epoch": 0.8085269829076244, "grad_norm": 31.953128814697266, "learning_rate": 2.1533028938514008e-07, "loss": 0.3468, "num_input_tokens_seen": 39753728, "step": 12630 }, { "epoch": 0.8088470648486012, "grad_norm": 41.40265655517578, "learning_rate": 2.1463810705440433e-07, "loss": 0.3435, "num_input_tokens_seen": 39769600, "step": 12635 }, { "epoch": 0.8091671467895781, "grad_norm": 33.059566497802734, "learning_rate": 2.139469052572127e-07, "loss": 0.3519, "num_input_tokens_seen": 39784000, "step": 12640 }, { "epoch": 0.809487228730555, "grad_norm": 46.353363037109375, "learning_rate": 2.1325668485653891e-07, "loss": 0.344, "num_input_tokens_seen": 39800320, "step": 12645 }, { "epoch": 0.8098073106715319, "grad_norm": 27.811872482299805, "learning_rate": 2.1256744671413173e-07, "loss": 0.457, "num_input_tokens_seen": 39815360, "step": 12650 }, { "epoch": 0.8101273926125088, "grad_norm": 31.568683624267578, "learning_rate": 2.1187919169051316e-07, "loss": 0.3821, "num_input_tokens_seen": 39829952, "step": 12655 }, { "epoch": 0.8104474745534856, "grad_norm": 33.802940368652344, "learning_rate": 2.111919206449767e-07, "loss": 0.3528, "num_input_tokens_seen": 39845376, "step": 12660 }, { "epoch": 0.8107675564944626, "grad_norm": 27.218812942504883, "learning_rate": 2.1050563443558922e-07, "loss": 0.4858, "num_input_tokens_seen": 39861696, "step": 12665 }, { "epoch": 0.8110876384354395, "grad_norm": 37.33356475830078, "learning_rate": 2.0982033391918697e-07, "loss": 0.297, "num_input_tokens_seen": 39877440, "step": 12670 }, { "epoch": 0.8114077203764164, "grad_norm": 58.22770309448242, "learning_rate": 2.0913601995137543e-07, "loss": 0.334, "num_input_tokens_seen": 39893760, "step": 12675 }, { "epoch": 0.8117278023173933, "grad_norm": 15.805877685546875, "learning_rate": 2.084526933865287e-07, "loss": 0.2943, "num_input_tokens_seen": 39909568, "step": 12680 }, { "epoch": 0.8120478842583702, "grad_norm": 30.60896873474121, "learning_rate": 2.0777035507778817e-07, "loss": 0.4543, "num_input_tokens_seen": 39923648, "step": 12685 }, { "epoch": 0.812367966199347, "grad_norm": 17.86086654663086, "learning_rate": 2.0708900587706135e-07, "loss": 0.4299, "num_input_tokens_seen": 39939008, "step": 12690 }, { "epoch": 0.8126880481403239, "grad_norm": 45.35393142700195, "learning_rate": 2.0640864663502e-07, "loss": 0.3374, "num_input_tokens_seen": 39955072, "step": 12695 }, { "epoch": 0.8130081300813008, "grad_norm": 31.832155227661133, "learning_rate": 2.057292782011013e-07, "loss": 0.4545, "num_input_tokens_seen": 39970880, "step": 12700 }, { "epoch": 0.8133282120222777, "grad_norm": 22.989181518554688, "learning_rate": 2.0505090142350468e-07, "loss": 0.2967, "num_input_tokens_seen": 39986240, "step": 12705 }, { "epoch": 0.8136482939632546, "grad_norm": 31.20648765563965, "learning_rate": 2.0437351714919127e-07, "loss": 0.3427, "num_input_tokens_seen": 40001856, "step": 12710 }, { "epoch": 0.8139683759042314, "grad_norm": 18.44768714904785, "learning_rate": 2.0369712622388336e-07, "loss": 0.309, "num_input_tokens_seen": 40018112, "step": 12715 }, { "epoch": 0.8142884578452084, "grad_norm": 37.2120475769043, "learning_rate": 2.0302172949206298e-07, "loss": 0.2879, "num_input_tokens_seen": 40033664, "step": 12720 }, { "epoch": 0.8146085397861853, "grad_norm": 54.152069091796875, "learning_rate": 2.0234732779697094e-07, "loss": 0.2967, "num_input_tokens_seen": 40048768, "step": 12725 }, { "epoch": 0.8149286217271622, "grad_norm": 42.13416290283203, "learning_rate": 2.016739219806056e-07, "loss": 0.3229, "num_input_tokens_seen": 40063232, "step": 12730 }, { "epoch": 0.8152487036681391, "grad_norm": 19.65249252319336, "learning_rate": 2.0100151288372215e-07, "loss": 0.3904, "num_input_tokens_seen": 40079296, "step": 12735 }, { "epoch": 0.8155687856091159, "grad_norm": 59.13142013549805, "learning_rate": 2.0033010134583084e-07, "loss": 0.5554, "num_input_tokens_seen": 40094976, "step": 12740 }, { "epoch": 0.8158888675500928, "grad_norm": 32.4484977722168, "learning_rate": 1.9965968820519763e-07, "loss": 0.3218, "num_input_tokens_seen": 40110464, "step": 12745 }, { "epoch": 0.8162089494910697, "grad_norm": 48.04807662963867, "learning_rate": 1.9899027429884042e-07, "loss": 0.3981, "num_input_tokens_seen": 40125568, "step": 12750 }, { "epoch": 0.8165290314320466, "grad_norm": 37.24668502807617, "learning_rate": 1.983218604625305e-07, "loss": 0.4142, "num_input_tokens_seen": 40141440, "step": 12755 }, { "epoch": 0.8168491133730235, "grad_norm": 14.393180847167969, "learning_rate": 1.9765444753079096e-07, "loss": 0.3275, "num_input_tokens_seen": 40156416, "step": 12760 }, { "epoch": 0.8171691953140003, "grad_norm": 29.691728591918945, "learning_rate": 1.9698803633689408e-07, "loss": 0.3998, "num_input_tokens_seen": 40172928, "step": 12765 }, { "epoch": 0.8174892772549772, "grad_norm": 21.646751403808594, "learning_rate": 1.963226277128619e-07, "loss": 0.2336, "num_input_tokens_seen": 40188096, "step": 12770 }, { "epoch": 0.8178093591959542, "grad_norm": 29.038705825805664, "learning_rate": 1.956582224894655e-07, "loss": 0.3593, "num_input_tokens_seen": 40204032, "step": 12775 }, { "epoch": 0.8181294411369311, "grad_norm": 46.25074768066406, "learning_rate": 1.949948214962227e-07, "loss": 0.3646, "num_input_tokens_seen": 40218944, "step": 12780 }, { "epoch": 0.818449523077908, "grad_norm": 54.344844818115234, "learning_rate": 1.943324255613964e-07, "loss": 0.3731, "num_input_tokens_seen": 40235456, "step": 12785 }, { "epoch": 0.8187696050188848, "grad_norm": 24.159887313842773, "learning_rate": 1.936710355119967e-07, "loss": 0.4505, "num_input_tokens_seen": 40250176, "step": 12790 }, { "epoch": 0.8190896869598617, "grad_norm": 33.41341018676758, "learning_rate": 1.9301065217377655e-07, "loss": 0.3157, "num_input_tokens_seen": 40265472, "step": 12795 }, { "epoch": 0.8194097689008386, "grad_norm": 25.555482864379883, "learning_rate": 1.9235127637123249e-07, "loss": 0.3992, "num_input_tokens_seen": 40281728, "step": 12800 }, { "epoch": 0.8197298508418155, "grad_norm": 52.75870132446289, "learning_rate": 1.9169290892760225e-07, "loss": 0.3282, "num_input_tokens_seen": 40296768, "step": 12805 }, { "epoch": 0.8200499327827924, "grad_norm": 44.361934661865234, "learning_rate": 1.91035550664866e-07, "loss": 0.3201, "num_input_tokens_seen": 40311488, "step": 12810 }, { "epoch": 0.8203700147237692, "grad_norm": 54.147613525390625, "learning_rate": 1.903792024037433e-07, "loss": 0.314, "num_input_tokens_seen": 40327232, "step": 12815 }, { "epoch": 0.8206900966647461, "grad_norm": 33.24623489379883, "learning_rate": 1.8972386496369185e-07, "loss": 0.4472, "num_input_tokens_seen": 40344064, "step": 12820 }, { "epoch": 0.8210101786057231, "grad_norm": 41.800315856933594, "learning_rate": 1.89069539162909e-07, "loss": 0.3976, "num_input_tokens_seen": 40359040, "step": 12825 }, { "epoch": 0.8213302605467, "grad_norm": 19.14189338684082, "learning_rate": 1.8841622581832783e-07, "loss": 0.4066, "num_input_tokens_seen": 40376384, "step": 12830 }, { "epoch": 0.8216503424876769, "grad_norm": 28.32308578491211, "learning_rate": 1.8776392574561783e-07, "loss": 0.5901, "num_input_tokens_seen": 40391936, "step": 12835 }, { "epoch": 0.8219704244286538, "grad_norm": 23.97947883605957, "learning_rate": 1.8711263975918322e-07, "loss": 0.4831, "num_input_tokens_seen": 40408832, "step": 12840 }, { "epoch": 0.8222905063696306, "grad_norm": 35.37938690185547, "learning_rate": 1.8646236867216215e-07, "loss": 0.4603, "num_input_tokens_seen": 40425280, "step": 12845 }, { "epoch": 0.8226105883106075, "grad_norm": 34.26011657714844, "learning_rate": 1.8581311329642591e-07, "loss": 0.338, "num_input_tokens_seen": 40440832, "step": 12850 }, { "epoch": 0.8229306702515844, "grad_norm": 29.206497192382812, "learning_rate": 1.8516487444257723e-07, "loss": 0.2651, "num_input_tokens_seen": 40458624, "step": 12855 }, { "epoch": 0.8232507521925613, "grad_norm": 33.5301399230957, "learning_rate": 1.8451765291995004e-07, "loss": 0.4093, "num_input_tokens_seen": 40474688, "step": 12860 }, { "epoch": 0.8235708341335382, "grad_norm": 35.508880615234375, "learning_rate": 1.8387144953660806e-07, "loss": 0.3554, "num_input_tokens_seen": 40490816, "step": 12865 }, { "epoch": 0.823890916074515, "grad_norm": 39.21906280517578, "learning_rate": 1.832262650993437e-07, "loss": 0.4472, "num_input_tokens_seen": 40506112, "step": 12870 }, { "epoch": 0.8242109980154919, "grad_norm": 20.77424430847168, "learning_rate": 1.825821004136774e-07, "loss": 0.2954, "num_input_tokens_seen": 40521344, "step": 12875 }, { "epoch": 0.8245310799564689, "grad_norm": 29.856380462646484, "learning_rate": 1.819389562838559e-07, "loss": 0.2698, "num_input_tokens_seen": 40537024, "step": 12880 }, { "epoch": 0.8248511618974458, "grad_norm": 47.23398208618164, "learning_rate": 1.8129683351285319e-07, "loss": 0.3136, "num_input_tokens_seen": 40552640, "step": 12885 }, { "epoch": 0.8251712438384227, "grad_norm": 35.031856536865234, "learning_rate": 1.8065573290236626e-07, "loss": 0.3186, "num_input_tokens_seen": 40568000, "step": 12890 }, { "epoch": 0.8254913257793995, "grad_norm": 22.70587730407715, "learning_rate": 1.8001565525281682e-07, "loss": 0.3809, "num_input_tokens_seen": 40584960, "step": 12895 }, { "epoch": 0.8258114077203764, "grad_norm": 25.041950225830078, "learning_rate": 1.793766013633493e-07, "loss": 0.3665, "num_input_tokens_seen": 40600704, "step": 12900 }, { "epoch": 0.8261314896613533, "grad_norm": 27.236404418945312, "learning_rate": 1.7873857203183074e-07, "loss": 0.3693, "num_input_tokens_seen": 40615872, "step": 12905 }, { "epoch": 0.8264515716023302, "grad_norm": 54.097450256347656, "learning_rate": 1.7810156805484733e-07, "loss": 0.4563, "num_input_tokens_seen": 40632640, "step": 12910 }, { "epoch": 0.8267716535433071, "grad_norm": 25.137113571166992, "learning_rate": 1.7746559022770612e-07, "loss": 0.2995, "num_input_tokens_seen": 40648064, "step": 12915 }, { "epoch": 0.8270917354842839, "grad_norm": 29.874134063720703, "learning_rate": 1.7683063934443342e-07, "loss": 0.3663, "num_input_tokens_seen": 40664704, "step": 12920 }, { "epoch": 0.8274118174252608, "grad_norm": 40.31401824951172, "learning_rate": 1.7619671619777277e-07, "loss": 0.4004, "num_input_tokens_seen": 40681024, "step": 12925 }, { "epoch": 0.8277318993662378, "grad_norm": 31.526283264160156, "learning_rate": 1.7556382157918404e-07, "loss": 0.4101, "num_input_tokens_seen": 40695936, "step": 12930 }, { "epoch": 0.8280519813072147, "grad_norm": 27.806535720825195, "learning_rate": 1.7493195627884427e-07, "loss": 0.3185, "num_input_tokens_seen": 40713472, "step": 12935 }, { "epoch": 0.8283720632481916, "grad_norm": 42.26551055908203, "learning_rate": 1.7430112108564465e-07, "loss": 0.3141, "num_input_tokens_seen": 40729344, "step": 12940 }, { "epoch": 0.8286921451891684, "grad_norm": 35.58454895019531, "learning_rate": 1.736713167871896e-07, "loss": 0.3861, "num_input_tokens_seen": 40745856, "step": 12945 }, { "epoch": 0.8290122271301453, "grad_norm": 19.220375061035156, "learning_rate": 1.7304254416979803e-07, "loss": 0.2993, "num_input_tokens_seen": 40761920, "step": 12950 }, { "epoch": 0.8293323090711222, "grad_norm": 17.930898666381836, "learning_rate": 1.7241480401849963e-07, "loss": 0.2488, "num_input_tokens_seen": 40776960, "step": 12955 }, { "epoch": 0.8296523910120991, "grad_norm": 21.81646156311035, "learning_rate": 1.7178809711703524e-07, "loss": 0.3455, "num_input_tokens_seen": 40792192, "step": 12960 }, { "epoch": 0.829972472953076, "grad_norm": 34.8779296875, "learning_rate": 1.7116242424785599e-07, "loss": 0.3612, "num_input_tokens_seen": 40808256, "step": 12965 }, { "epoch": 0.8302925548940528, "grad_norm": 40.2933464050293, "learning_rate": 1.7053778619212166e-07, "loss": 0.4288, "num_input_tokens_seen": 40823424, "step": 12970 }, { "epoch": 0.8306126368350297, "grad_norm": 39.040504455566406, "learning_rate": 1.6991418372970022e-07, "loss": 0.4221, "num_input_tokens_seen": 40840960, "step": 12975 }, { "epoch": 0.8309327187760066, "grad_norm": 26.533519744873047, "learning_rate": 1.6929161763916666e-07, "loss": 0.3775, "num_input_tokens_seen": 40857536, "step": 12980 }, { "epoch": 0.8312528007169836, "grad_norm": 25.883270263671875, "learning_rate": 1.686700886978021e-07, "loss": 0.3597, "num_input_tokens_seen": 40874240, "step": 12985 }, { "epoch": 0.8315728826579605, "grad_norm": 37.27665710449219, "learning_rate": 1.6804959768159266e-07, "loss": 0.3573, "num_input_tokens_seen": 40888960, "step": 12990 }, { "epoch": 0.8318929645989374, "grad_norm": 53.164058685302734, "learning_rate": 1.674301453652287e-07, "loss": 0.5238, "num_input_tokens_seen": 40904512, "step": 12995 }, { "epoch": 0.8322130465399142, "grad_norm": 37.5425910949707, "learning_rate": 1.6681173252210378e-07, "loss": 0.2903, "num_input_tokens_seen": 40921856, "step": 13000 }, { "epoch": 0.8325331284808911, "grad_norm": 49.16252517700195, "learning_rate": 1.6619435992431342e-07, "loss": 0.3741, "num_input_tokens_seen": 40938752, "step": 13005 }, { "epoch": 0.832853210421868, "grad_norm": 43.46717071533203, "learning_rate": 1.6557802834265466e-07, "loss": 0.3033, "num_input_tokens_seen": 40954048, "step": 13010 }, { "epoch": 0.8331732923628449, "grad_norm": 24.154077529907227, "learning_rate": 1.649627385466248e-07, "loss": 0.3593, "num_input_tokens_seen": 40972672, "step": 13015 }, { "epoch": 0.8334933743038218, "grad_norm": 19.601119995117188, "learning_rate": 1.643484913044202e-07, "loss": 0.242, "num_input_tokens_seen": 40987648, "step": 13020 }, { "epoch": 0.8338134562447986, "grad_norm": 13.510409355163574, "learning_rate": 1.6373528738293564e-07, "loss": 0.3147, "num_input_tokens_seen": 41003328, "step": 13025 }, { "epoch": 0.8341335381857755, "grad_norm": 31.341299057006836, "learning_rate": 1.6312312754776404e-07, "loss": 0.2875, "num_input_tokens_seen": 41018624, "step": 13030 }, { "epoch": 0.8344536201267524, "grad_norm": 16.611501693725586, "learning_rate": 1.6251201256319357e-07, "loss": 0.3321, "num_input_tokens_seen": 41034624, "step": 13035 }, { "epoch": 0.8347737020677294, "grad_norm": 26.413938522338867, "learning_rate": 1.619019431922083e-07, "loss": 0.3821, "num_input_tokens_seen": 41049664, "step": 13040 }, { "epoch": 0.8350937840087063, "grad_norm": 33.03317642211914, "learning_rate": 1.6129292019648754e-07, "loss": 0.3454, "num_input_tokens_seen": 41066368, "step": 13045 }, { "epoch": 0.8354138659496831, "grad_norm": 25.02870750427246, "learning_rate": 1.606849443364038e-07, "loss": 0.2916, "num_input_tokens_seen": 41082048, "step": 13050 }, { "epoch": 0.83573394789066, "grad_norm": 16.02092170715332, "learning_rate": 1.6007801637102104e-07, "loss": 0.3422, "num_input_tokens_seen": 41098048, "step": 13055 }, { "epoch": 0.8360540298316369, "grad_norm": 20.10306167602539, "learning_rate": 1.594721370580969e-07, "loss": 0.3826, "num_input_tokens_seen": 41112768, "step": 13060 }, { "epoch": 0.8363741117726138, "grad_norm": 20.185379028320312, "learning_rate": 1.588673071540788e-07, "loss": 0.4512, "num_input_tokens_seen": 41127488, "step": 13065 }, { "epoch": 0.8366941937135907, "grad_norm": 37.06159591674805, "learning_rate": 1.5826352741410332e-07, "loss": 0.3295, "num_input_tokens_seen": 41142272, "step": 13070 }, { "epoch": 0.8370142756545675, "grad_norm": 52.25266647338867, "learning_rate": 1.576607985919971e-07, "loss": 0.2947, "num_input_tokens_seen": 41157952, "step": 13075 }, { "epoch": 0.8373343575955444, "grad_norm": 38.03484344482422, "learning_rate": 1.57059121440274e-07, "loss": 0.3595, "num_input_tokens_seen": 41172992, "step": 13080 }, { "epoch": 0.8376544395365213, "grad_norm": 47.07827377319336, "learning_rate": 1.56458496710135e-07, "loss": 0.3642, "num_input_tokens_seen": 41187776, "step": 13085 }, { "epoch": 0.8379745214774983, "grad_norm": 36.153099060058594, "learning_rate": 1.5585892515146716e-07, "loss": 0.3461, "num_input_tokens_seen": 41204416, "step": 13090 }, { "epoch": 0.8382946034184752, "grad_norm": 22.711284637451172, "learning_rate": 1.5526040751284253e-07, "loss": 0.4195, "num_input_tokens_seen": 41220032, "step": 13095 }, { "epoch": 0.838614685359452, "grad_norm": 35.58867263793945, "learning_rate": 1.546629445415174e-07, "loss": 0.3118, "num_input_tokens_seen": 41235776, "step": 13100 }, { "epoch": 0.8389347673004289, "grad_norm": 41.773040771484375, "learning_rate": 1.5406653698343141e-07, "loss": 0.3725, "num_input_tokens_seen": 41252160, "step": 13105 }, { "epoch": 0.8392548492414058, "grad_norm": 33.417354583740234, "learning_rate": 1.5347118558320637e-07, "loss": 0.3539, "num_input_tokens_seen": 41269056, "step": 13110 }, { "epoch": 0.8395749311823827, "grad_norm": 24.998620986938477, "learning_rate": 1.5287689108414558e-07, "loss": 0.3562, "num_input_tokens_seen": 41285312, "step": 13115 }, { "epoch": 0.8398950131233596, "grad_norm": 39.11224365234375, "learning_rate": 1.5228365422823242e-07, "loss": 0.3246, "num_input_tokens_seen": 41300992, "step": 13120 }, { "epoch": 0.8402150950643364, "grad_norm": 28.325523376464844, "learning_rate": 1.5169147575613038e-07, "loss": 0.2623, "num_input_tokens_seen": 41317952, "step": 13125 }, { "epoch": 0.8405351770053133, "grad_norm": 12.87824821472168, "learning_rate": 1.5110035640718098e-07, "loss": 0.2941, "num_input_tokens_seen": 41333440, "step": 13130 }, { "epoch": 0.8408552589462902, "grad_norm": 31.341796875, "learning_rate": 1.5051029691940387e-07, "loss": 0.3725, "num_input_tokens_seen": 41349312, "step": 13135 }, { "epoch": 0.8411753408872671, "grad_norm": 33.42830276489258, "learning_rate": 1.4992129802949515e-07, "loss": 0.3449, "num_input_tokens_seen": 41364288, "step": 13140 }, { "epoch": 0.8414954228282441, "grad_norm": 24.27691078186035, "learning_rate": 1.4933336047282696e-07, "loss": 0.2836, "num_input_tokens_seen": 41379904, "step": 13145 }, { "epoch": 0.841815504769221, "grad_norm": 34.65740203857422, "learning_rate": 1.4874648498344579e-07, "loss": 0.3199, "num_input_tokens_seen": 41394432, "step": 13150 }, { "epoch": 0.8421355867101978, "grad_norm": 53.11001205444336, "learning_rate": 1.4816067229407348e-07, "loss": 0.3419, "num_input_tokens_seen": 41409984, "step": 13155 }, { "epoch": 0.8424556686511747, "grad_norm": 18.456310272216797, "learning_rate": 1.4757592313610322e-07, "loss": 0.3038, "num_input_tokens_seen": 41425984, "step": 13160 }, { "epoch": 0.8427757505921516, "grad_norm": 17.635456085205078, "learning_rate": 1.4699223823960128e-07, "loss": 0.3293, "num_input_tokens_seen": 41441920, "step": 13165 }, { "epoch": 0.8430958325331285, "grad_norm": 38.752742767333984, "learning_rate": 1.4640961833330579e-07, "loss": 0.3392, "num_input_tokens_seen": 41457664, "step": 13170 }, { "epoch": 0.8434159144741054, "grad_norm": 16.197906494140625, "learning_rate": 1.4582806414462378e-07, "loss": 0.2544, "num_input_tokens_seen": 41472832, "step": 13175 }, { "epoch": 0.8437359964150822, "grad_norm": 24.1660213470459, "learning_rate": 1.4524757639963258e-07, "loss": 0.3411, "num_input_tokens_seen": 41490368, "step": 13180 }, { "epoch": 0.8440560783560591, "grad_norm": 44.753700256347656, "learning_rate": 1.4466815582307845e-07, "loss": 0.4458, "num_input_tokens_seen": 41506624, "step": 13185 }, { "epoch": 0.844376160297036, "grad_norm": 9.318314552307129, "learning_rate": 1.440898031383746e-07, "loss": 0.2433, "num_input_tokens_seen": 41523264, "step": 13190 }, { "epoch": 0.844696242238013, "grad_norm": 42.493797302246094, "learning_rate": 1.4351251906760064e-07, "loss": 0.3678, "num_input_tokens_seen": 41538944, "step": 13195 }, { "epoch": 0.8450163241789899, "grad_norm": 40.14229202270508, "learning_rate": 1.4293630433150317e-07, "loss": 0.3919, "num_input_tokens_seen": 41554880, "step": 13200 }, { "epoch": 0.8453364061199667, "grad_norm": 47.614463806152344, "learning_rate": 1.423611596494927e-07, "loss": 0.4473, "num_input_tokens_seen": 41569280, "step": 13205 }, { "epoch": 0.8456564880609436, "grad_norm": 18.392112731933594, "learning_rate": 1.4178708573964438e-07, "loss": 0.3541, "num_input_tokens_seen": 41584576, "step": 13210 }, { "epoch": 0.8459765700019205, "grad_norm": 19.08127212524414, "learning_rate": 1.4121408331869566e-07, "loss": 0.3483, "num_input_tokens_seen": 41600000, "step": 13215 }, { "epoch": 0.8462966519428974, "grad_norm": 37.911075592041016, "learning_rate": 1.406421531020474e-07, "loss": 0.3539, "num_input_tokens_seen": 41615040, "step": 13220 }, { "epoch": 0.8466167338838743, "grad_norm": 69.3670883178711, "learning_rate": 1.4007129580376097e-07, "loss": 0.3418, "num_input_tokens_seen": 41630208, "step": 13225 }, { "epoch": 0.8469368158248511, "grad_norm": 36.10555648803711, "learning_rate": 1.3950151213655847e-07, "loss": 0.354, "num_input_tokens_seen": 41645440, "step": 13230 }, { "epoch": 0.847256897765828, "grad_norm": 42.61678695678711, "learning_rate": 1.389328028118214e-07, "loss": 0.3286, "num_input_tokens_seen": 41661184, "step": 13235 }, { "epoch": 0.8475769797068049, "grad_norm": 27.363248825073242, "learning_rate": 1.3836516853959e-07, "loss": 0.3546, "num_input_tokens_seen": 41676224, "step": 13240 }, { "epoch": 0.8478970616477818, "grad_norm": 18.371397018432617, "learning_rate": 1.3779861002856242e-07, "loss": 0.3031, "num_input_tokens_seen": 41690816, "step": 13245 }, { "epoch": 0.8482171435887588, "grad_norm": 17.178085327148438, "learning_rate": 1.3723312798609366e-07, "loss": 0.3261, "num_input_tokens_seen": 41706688, "step": 13250 }, { "epoch": 0.8485372255297357, "grad_norm": 26.48369789123535, "learning_rate": 1.3666872311819455e-07, "loss": 0.3518, "num_input_tokens_seen": 41721920, "step": 13255 }, { "epoch": 0.8488573074707125, "grad_norm": 21.16022300720215, "learning_rate": 1.361053961295312e-07, "loss": 0.2742, "num_input_tokens_seen": 41738112, "step": 13260 }, { "epoch": 0.8491773894116894, "grad_norm": 50.990020751953125, "learning_rate": 1.3554314772342412e-07, "loss": 0.3463, "num_input_tokens_seen": 41753792, "step": 13265 }, { "epoch": 0.8494974713526663, "grad_norm": 20.54403305053711, "learning_rate": 1.349819786018469e-07, "loss": 0.3268, "num_input_tokens_seen": 41771328, "step": 13270 }, { "epoch": 0.8498175532936432, "grad_norm": 37.34607696533203, "learning_rate": 1.3442188946542566e-07, "loss": 0.375, "num_input_tokens_seen": 41787712, "step": 13275 }, { "epoch": 0.85013763523462, "grad_norm": 24.755434036254883, "learning_rate": 1.338628810134388e-07, "loss": 0.2995, "num_input_tokens_seen": 41803072, "step": 13280 }, { "epoch": 0.8504577171755969, "grad_norm": 36.77594757080078, "learning_rate": 1.3330495394381435e-07, "loss": 0.3636, "num_input_tokens_seen": 41818688, "step": 13285 }, { "epoch": 0.8507777991165738, "grad_norm": 15.947341918945312, "learning_rate": 1.3274810895313083e-07, "loss": 0.272, "num_input_tokens_seen": 41833792, "step": 13290 }, { "epoch": 0.8510338646693554, "eval_loss": 0.3570670485496521, "eval_runtime": 49.1744, "eval_samples_per_second": 282.383, "eval_steps_per_second": 35.303, "num_input_tokens_seen": 41847872, "step": 13294 }, { "epoch": 0.8510978810575507, "grad_norm": 25.20223617553711, "learning_rate": 1.321923467366164e-07, "loss": 0.3708, "num_input_tokens_seen": 41850880, "step": 13295 }, { "epoch": 0.8514179629985277, "grad_norm": 14.625531196594238, "learning_rate": 1.3163766798814603e-07, "loss": 0.1815, "num_input_tokens_seen": 41866560, "step": 13300 }, { "epoch": 0.8517380449395046, "grad_norm": 49.65571594238281, "learning_rate": 1.3108407340024264e-07, "loss": 0.2872, "num_input_tokens_seen": 41882240, "step": 13305 }, { "epoch": 0.8520581268804814, "grad_norm": 37.89714813232422, "learning_rate": 1.3053156366407613e-07, "loss": 0.332, "num_input_tokens_seen": 41898880, "step": 13310 }, { "epoch": 0.8523782088214583, "grad_norm": 19.63136100769043, "learning_rate": 1.2998013946946119e-07, "loss": 0.2398, "num_input_tokens_seen": 41915968, "step": 13315 }, { "epoch": 0.8526982907624352, "grad_norm": 36.910030364990234, "learning_rate": 1.2942980150485706e-07, "loss": 0.3556, "num_input_tokens_seen": 41930816, "step": 13320 }, { "epoch": 0.8530183727034121, "grad_norm": 49.309322357177734, "learning_rate": 1.2888055045736723e-07, "loss": 0.3098, "num_input_tokens_seen": 41947200, "step": 13325 }, { "epoch": 0.853338454644389, "grad_norm": 19.818714141845703, "learning_rate": 1.283323870127384e-07, "loss": 0.3021, "num_input_tokens_seen": 41962240, "step": 13330 }, { "epoch": 0.8536585365853658, "grad_norm": 28.360517501831055, "learning_rate": 1.2778531185535911e-07, "loss": 0.3063, "num_input_tokens_seen": 41978752, "step": 13335 }, { "epoch": 0.8539786185263427, "grad_norm": 19.08763313293457, "learning_rate": 1.2723932566825844e-07, "loss": 0.324, "num_input_tokens_seen": 41994112, "step": 13340 }, { "epoch": 0.8542987004673196, "grad_norm": 16.557178497314453, "learning_rate": 1.2669442913310723e-07, "loss": 0.2986, "num_input_tokens_seen": 42010432, "step": 13345 }, { "epoch": 0.8546187824082965, "grad_norm": 27.915157318115234, "learning_rate": 1.2615062293021506e-07, "loss": 0.2722, "num_input_tokens_seen": 42025984, "step": 13350 }, { "epoch": 0.8549388643492735, "grad_norm": 43.59603500366211, "learning_rate": 1.2560790773853025e-07, "loss": 0.3185, "num_input_tokens_seen": 42040832, "step": 13355 }, { "epoch": 0.8552589462902503, "grad_norm": 25.36774253845215, "learning_rate": 1.2506628423563915e-07, "loss": 0.4035, "num_input_tokens_seen": 42057536, "step": 13360 }, { "epoch": 0.8555790282312272, "grad_norm": 31.750885009765625, "learning_rate": 1.2452575309776493e-07, "loss": 0.2863, "num_input_tokens_seen": 42073152, "step": 13365 }, { "epoch": 0.8558991101722041, "grad_norm": 45.091915130615234, "learning_rate": 1.2398631499976732e-07, "loss": 0.304, "num_input_tokens_seen": 42088512, "step": 13370 }, { "epoch": 0.856219192113181, "grad_norm": 22.48138999938965, "learning_rate": 1.234479706151409e-07, "loss": 0.4208, "num_input_tokens_seen": 42103552, "step": 13375 }, { "epoch": 0.8565392740541579, "grad_norm": 22.086090087890625, "learning_rate": 1.2291072061601503e-07, "loss": 0.3608, "num_input_tokens_seen": 42119872, "step": 13380 }, { "epoch": 0.8568593559951347, "grad_norm": 34.048282623291016, "learning_rate": 1.2237456567315264e-07, "loss": 0.4351, "num_input_tokens_seen": 42136832, "step": 13385 }, { "epoch": 0.8571794379361116, "grad_norm": 23.326128005981445, "learning_rate": 1.2183950645594944e-07, "loss": 0.2975, "num_input_tokens_seen": 42152896, "step": 13390 }, { "epoch": 0.8574995198770885, "grad_norm": 52.200294494628906, "learning_rate": 1.2130554363243318e-07, "loss": 0.3421, "num_input_tokens_seen": 42168064, "step": 13395 }, { "epoch": 0.8578196018180654, "grad_norm": 20.56406593322754, "learning_rate": 1.207726778692625e-07, "loss": 0.3703, "num_input_tokens_seen": 42182784, "step": 13400 }, { "epoch": 0.8581396837590423, "grad_norm": 23.129608154296875, "learning_rate": 1.2024090983172718e-07, "loss": 0.3271, "num_input_tokens_seen": 42199744, "step": 13405 }, { "epoch": 0.8584597657000193, "grad_norm": 40.9952507019043, "learning_rate": 1.1971024018374532e-07, "loss": 0.3625, "num_input_tokens_seen": 42215040, "step": 13410 }, { "epoch": 0.8587798476409961, "grad_norm": 35.23881149291992, "learning_rate": 1.1918066958786432e-07, "loss": 0.3091, "num_input_tokens_seen": 42230144, "step": 13415 }, { "epoch": 0.859099929581973, "grad_norm": 59.670223236083984, "learning_rate": 1.1865219870525922e-07, "loss": 0.3553, "num_input_tokens_seen": 42246528, "step": 13420 }, { "epoch": 0.8594200115229499, "grad_norm": 20.215394973754883, "learning_rate": 1.1812482819573222e-07, "loss": 0.4317, "num_input_tokens_seen": 42263168, "step": 13425 }, { "epoch": 0.8597400934639268, "grad_norm": 32.689353942871094, "learning_rate": 1.1759855871771163e-07, "loss": 0.3905, "num_input_tokens_seen": 42278912, "step": 13430 }, { "epoch": 0.8600601754049036, "grad_norm": 45.541587829589844, "learning_rate": 1.1707339092825075e-07, "loss": 0.3824, "num_input_tokens_seen": 42294656, "step": 13435 }, { "epoch": 0.8603802573458805, "grad_norm": 45.382381439208984, "learning_rate": 1.1654932548302842e-07, "loss": 0.3909, "num_input_tokens_seen": 42311552, "step": 13440 }, { "epoch": 0.8607003392868574, "grad_norm": 48.50038528442383, "learning_rate": 1.1602636303634595e-07, "loss": 0.3635, "num_input_tokens_seen": 42327552, "step": 13445 }, { "epoch": 0.8610204212278343, "grad_norm": 18.829587936401367, "learning_rate": 1.1550450424112801e-07, "loss": 0.3583, "num_input_tokens_seen": 42343360, "step": 13450 }, { "epoch": 0.8613405031688112, "grad_norm": 22.35457992553711, "learning_rate": 1.1498374974892178e-07, "loss": 0.3341, "num_input_tokens_seen": 42360064, "step": 13455 }, { "epoch": 0.8616605851097882, "grad_norm": 23.769941329956055, "learning_rate": 1.144641002098955e-07, "loss": 0.4371, "num_input_tokens_seen": 42374976, "step": 13460 }, { "epoch": 0.861980667050765, "grad_norm": 44.195152282714844, "learning_rate": 1.1394555627283697e-07, "loss": 0.3524, "num_input_tokens_seen": 42391616, "step": 13465 }, { "epoch": 0.8623007489917419, "grad_norm": 58.780975341796875, "learning_rate": 1.134281185851551e-07, "loss": 0.3095, "num_input_tokens_seen": 42406528, "step": 13470 }, { "epoch": 0.8626208309327188, "grad_norm": 29.023456573486328, "learning_rate": 1.1291178779287691e-07, "loss": 0.288, "num_input_tokens_seen": 42424320, "step": 13475 }, { "epoch": 0.8629409128736957, "grad_norm": 41.91423034667969, "learning_rate": 1.1239656454064683e-07, "loss": 0.3654, "num_input_tokens_seen": 42440960, "step": 13480 }, { "epoch": 0.8632609948146726, "grad_norm": 16.42652130126953, "learning_rate": 1.1188244947172776e-07, "loss": 0.2474, "num_input_tokens_seen": 42456448, "step": 13485 }, { "epoch": 0.8635810767556494, "grad_norm": 20.765544891357422, "learning_rate": 1.1136944322799812e-07, "loss": 0.3165, "num_input_tokens_seen": 42472448, "step": 13490 }, { "epoch": 0.8639011586966263, "grad_norm": 51.0446662902832, "learning_rate": 1.1085754644995227e-07, "loss": 0.3147, "num_input_tokens_seen": 42487808, "step": 13495 }, { "epoch": 0.8642212406376032, "grad_norm": 34.88838195800781, "learning_rate": 1.1034675977669938e-07, "loss": 0.3516, "num_input_tokens_seen": 42503744, "step": 13500 }, { "epoch": 0.8645413225785801, "grad_norm": 50.67732238769531, "learning_rate": 1.0983708384596258e-07, "loss": 0.5636, "num_input_tokens_seen": 42520768, "step": 13505 }, { "epoch": 0.864861404519557, "grad_norm": 17.03850555419922, "learning_rate": 1.0932851929407827e-07, "loss": 0.3664, "num_input_tokens_seen": 42537408, "step": 13510 }, { "epoch": 0.8651814864605339, "grad_norm": 45.833168029785156, "learning_rate": 1.0882106675599534e-07, "loss": 0.36, "num_input_tokens_seen": 42553728, "step": 13515 }, { "epoch": 0.8655015684015108, "grad_norm": 14.135661125183105, "learning_rate": 1.0831472686527409e-07, "loss": 0.3304, "num_input_tokens_seen": 42568896, "step": 13520 }, { "epoch": 0.8658216503424877, "grad_norm": 13.662610054016113, "learning_rate": 1.0780950025408586e-07, "loss": 0.2939, "num_input_tokens_seen": 42584000, "step": 13525 }, { "epoch": 0.8661417322834646, "grad_norm": 62.21460723876953, "learning_rate": 1.0730538755321217e-07, "loss": 0.3824, "num_input_tokens_seen": 42600192, "step": 13530 }, { "epoch": 0.8664618142244415, "grad_norm": 20.335872650146484, "learning_rate": 1.0680238939204334e-07, "loss": 0.304, "num_input_tokens_seen": 42614656, "step": 13535 }, { "epoch": 0.8667818961654183, "grad_norm": 42.727237701416016, "learning_rate": 1.0630050639857879e-07, "loss": 0.3989, "num_input_tokens_seen": 42629504, "step": 13540 }, { "epoch": 0.8671019781063952, "grad_norm": 20.651216506958008, "learning_rate": 1.0579973919942508e-07, "loss": 0.3036, "num_input_tokens_seen": 42644224, "step": 13545 }, { "epoch": 0.8674220600473721, "grad_norm": 21.302921295166016, "learning_rate": 1.0530008841979621e-07, "loss": 0.2417, "num_input_tokens_seen": 42659584, "step": 13550 }, { "epoch": 0.867742141988349, "grad_norm": 36.984397888183594, "learning_rate": 1.048015546835117e-07, "loss": 0.2756, "num_input_tokens_seen": 42675776, "step": 13555 }, { "epoch": 0.8680622239293259, "grad_norm": 23.602458953857422, "learning_rate": 1.0430413861299691e-07, "loss": 0.3976, "num_input_tokens_seen": 42693184, "step": 13560 }, { "epoch": 0.8683823058703029, "grad_norm": 45.383060455322266, "learning_rate": 1.0380784082928196e-07, "loss": 0.4533, "num_input_tokens_seen": 42710784, "step": 13565 }, { "epoch": 0.8687023878112797, "grad_norm": 40.113624572753906, "learning_rate": 1.0331266195200006e-07, "loss": 0.3903, "num_input_tokens_seen": 42727040, "step": 13570 }, { "epoch": 0.8690224697522566, "grad_norm": 18.091224670410156, "learning_rate": 1.0281860259938779e-07, "loss": 0.3126, "num_input_tokens_seen": 42742208, "step": 13575 }, { "epoch": 0.8693425516932335, "grad_norm": 19.732269287109375, "learning_rate": 1.0232566338828452e-07, "loss": 0.3673, "num_input_tokens_seen": 42758464, "step": 13580 }, { "epoch": 0.8696626336342104, "grad_norm": 47.176029205322266, "learning_rate": 1.018338449341305e-07, "loss": 0.4102, "num_input_tokens_seen": 42774016, "step": 13585 }, { "epoch": 0.8699827155751872, "grad_norm": 19.62028694152832, "learning_rate": 1.0134314785096632e-07, "loss": 0.3942, "num_input_tokens_seen": 42789248, "step": 13590 }, { "epoch": 0.8703027975161641, "grad_norm": 17.851299285888672, "learning_rate": 1.0085357275143359e-07, "loss": 0.342, "num_input_tokens_seen": 42804608, "step": 13595 }, { "epoch": 0.870622879457141, "grad_norm": 32.63302230834961, "learning_rate": 1.0036512024677268e-07, "loss": 0.4964, "num_input_tokens_seen": 42819584, "step": 13600 }, { "epoch": 0.8709429613981179, "grad_norm": 9.898176193237305, "learning_rate": 9.98777909468217e-08, "loss": 0.2733, "num_input_tokens_seen": 42835200, "step": 13605 }, { "epoch": 0.8712630433390948, "grad_norm": 48.42760467529297, "learning_rate": 9.939158546001736e-08, "loss": 0.406, "num_input_tokens_seen": 42852672, "step": 13610 }, { "epoch": 0.8715831252800716, "grad_norm": 19.67852020263672, "learning_rate": 9.890650439339299e-08, "loss": 0.3322, "num_input_tokens_seen": 42868672, "step": 13615 }, { "epoch": 0.8719032072210486, "grad_norm": 55.09160232543945, "learning_rate": 9.842254835257791e-08, "loss": 0.416, "num_input_tokens_seen": 42884096, "step": 13620 }, { "epoch": 0.8722232891620255, "grad_norm": 32.343929290771484, "learning_rate": 9.793971794179679e-08, "loss": 0.3767, "num_input_tokens_seen": 42898752, "step": 13625 }, { "epoch": 0.8725433711030024, "grad_norm": 27.15031623840332, "learning_rate": 9.745801376386931e-08, "loss": 0.3417, "num_input_tokens_seen": 42914688, "step": 13630 }, { "epoch": 0.8728634530439793, "grad_norm": 42.770503997802734, "learning_rate": 9.697743642020861e-08, "loss": 0.3211, "num_input_tokens_seen": 42930688, "step": 13635 }, { "epoch": 0.8731835349849562, "grad_norm": 37.78193664550781, "learning_rate": 9.649798651082119e-08, "loss": 0.3372, "num_input_tokens_seen": 42947008, "step": 13640 }, { "epoch": 0.873503616925933, "grad_norm": 17.573001861572266, "learning_rate": 9.601966463430588e-08, "loss": 0.3946, "num_input_tokens_seen": 42962816, "step": 13645 }, { "epoch": 0.8738236988669099, "grad_norm": 15.034274101257324, "learning_rate": 9.554247138785321e-08, "loss": 0.3405, "num_input_tokens_seen": 42977664, "step": 13650 }, { "epoch": 0.8741437808078868, "grad_norm": 74.6231460571289, "learning_rate": 9.506640736724447e-08, "loss": 0.4684, "num_input_tokens_seen": 42993472, "step": 13655 }, { "epoch": 0.8744638627488637, "grad_norm": 31.8859920501709, "learning_rate": 9.459147316685123e-08, "loss": 0.3895, "num_input_tokens_seen": 43010688, "step": 13660 }, { "epoch": 0.8747839446898406, "grad_norm": 41.20021438598633, "learning_rate": 9.41176693796345e-08, "loss": 0.3357, "num_input_tokens_seen": 43027392, "step": 13665 }, { "epoch": 0.8751040266308175, "grad_norm": 39.77818298339844, "learning_rate": 9.364499659714364e-08, "loss": 0.4172, "num_input_tokens_seen": 43043008, "step": 13670 }, { "epoch": 0.8754241085717944, "grad_norm": 36.9276123046875, "learning_rate": 9.31734554095165e-08, "loss": 0.342, "num_input_tokens_seen": 43059072, "step": 13675 }, { "epoch": 0.8757441905127713, "grad_norm": 35.170780181884766, "learning_rate": 9.270304640547744e-08, "loss": 0.3481, "num_input_tokens_seen": 43074624, "step": 13680 }, { "epoch": 0.8760642724537482, "grad_norm": 30.96558380126953, "learning_rate": 9.223377017233768e-08, "loss": 0.3952, "num_input_tokens_seen": 43089536, "step": 13685 }, { "epoch": 0.8763843543947251, "grad_norm": 28.36827850341797, "learning_rate": 9.176562729599458e-08, "loss": 0.3535, "num_input_tokens_seen": 43104512, "step": 13690 }, { "epoch": 0.8767044363357019, "grad_norm": 49.10908508300781, "learning_rate": 9.129861836092944e-08, "loss": 0.3463, "num_input_tokens_seen": 43120640, "step": 13695 }, { "epoch": 0.8770245182766788, "grad_norm": 21.713356018066406, "learning_rate": 9.083274395020845e-08, "loss": 0.4422, "num_input_tokens_seen": 43136384, "step": 13700 }, { "epoch": 0.8773446002176557, "grad_norm": 23.583024978637695, "learning_rate": 9.036800464548156e-08, "loss": 0.4045, "num_input_tokens_seen": 43153216, "step": 13705 }, { "epoch": 0.8776646821586326, "grad_norm": 22.666852951049805, "learning_rate": 8.990440102698138e-08, "loss": 0.3473, "num_input_tokens_seen": 43167936, "step": 13710 }, { "epoch": 0.8779847640996095, "grad_norm": 42.15274429321289, "learning_rate": 8.944193367352182e-08, "loss": 0.2767, "num_input_tokens_seen": 43183872, "step": 13715 }, { "epoch": 0.8783048460405863, "grad_norm": 28.620649337768555, "learning_rate": 8.898060316249944e-08, "loss": 0.4057, "num_input_tokens_seen": 43200256, "step": 13720 }, { "epoch": 0.8786249279815633, "grad_norm": 46.91181182861328, "learning_rate": 8.852041006989064e-08, "loss": 0.3563, "num_input_tokens_seen": 43217600, "step": 13725 }, { "epoch": 0.8789450099225402, "grad_norm": 48.15342712402344, "learning_rate": 8.80613549702518e-08, "loss": 0.3785, "num_input_tokens_seen": 43233344, "step": 13730 }, { "epoch": 0.8792650918635171, "grad_norm": 48.054359436035156, "learning_rate": 8.760343843671824e-08, "loss": 0.5423, "num_input_tokens_seen": 43249280, "step": 13735 }, { "epoch": 0.879585173804494, "grad_norm": 74.3794937133789, "learning_rate": 8.714666104100487e-08, "loss": 0.4461, "num_input_tokens_seen": 43265024, "step": 13740 }, { "epoch": 0.8799052557454708, "grad_norm": 75.1503677368164, "learning_rate": 8.66910233534034e-08, "loss": 0.3544, "num_input_tokens_seen": 43280576, "step": 13745 }, { "epoch": 0.8802253376864477, "grad_norm": 32.35490798950195, "learning_rate": 8.62365259427823e-08, "loss": 0.3156, "num_input_tokens_seen": 43296064, "step": 13750 }, { "epoch": 0.8805454196274246, "grad_norm": 29.028377532958984, "learning_rate": 8.578316937658758e-08, "loss": 0.2899, "num_input_tokens_seen": 43311552, "step": 13755 }, { "epoch": 0.8808655015684015, "grad_norm": 18.780216217041016, "learning_rate": 8.533095422083992e-08, "loss": 0.3116, "num_input_tokens_seen": 43326272, "step": 13760 }, { "epoch": 0.8811855835093784, "grad_norm": 26.572908401489258, "learning_rate": 8.487988104013533e-08, "loss": 0.2906, "num_input_tokens_seen": 43342592, "step": 13765 }, { "epoch": 0.8815056654503552, "grad_norm": 24.25293731689453, "learning_rate": 8.4429950397644e-08, "loss": 0.3188, "num_input_tokens_seen": 43357888, "step": 13770 }, { "epoch": 0.8818257473913321, "grad_norm": 20.96013832092285, "learning_rate": 8.398116285510948e-08, "loss": 0.2679, "num_input_tokens_seen": 43374272, "step": 13775 }, { "epoch": 0.8821458293323091, "grad_norm": 47.135711669921875, "learning_rate": 8.353351897284844e-08, "loss": 0.2698, "num_input_tokens_seen": 43393280, "step": 13780 }, { "epoch": 0.882465911273286, "grad_norm": 10.159743309020996, "learning_rate": 8.308701930974949e-08, "loss": 0.4762, "num_input_tokens_seen": 43409600, "step": 13785 }, { "epoch": 0.8827859932142629, "grad_norm": 27.35509490966797, "learning_rate": 8.264166442327269e-08, "loss": 0.4038, "num_input_tokens_seen": 43424384, "step": 13790 }, { "epoch": 0.8831060751552398, "grad_norm": 41.762332916259766, "learning_rate": 8.219745486944885e-08, "loss": 0.2533, "num_input_tokens_seen": 43440128, "step": 13795 }, { "epoch": 0.8834261570962166, "grad_norm": 78.77603912353516, "learning_rate": 8.175439120287875e-08, "loss": 0.4597, "num_input_tokens_seen": 43455168, "step": 13800 }, { "epoch": 0.8837462390371935, "grad_norm": 49.571353912353516, "learning_rate": 8.131247397673269e-08, "loss": 0.3494, "num_input_tokens_seen": 43472064, "step": 13805 }, { "epoch": 0.8840663209781704, "grad_norm": 118.99240112304688, "learning_rate": 8.087170374274921e-08, "loss": 0.4333, "num_input_tokens_seen": 43488000, "step": 13810 }, { "epoch": 0.8843864029191473, "grad_norm": 27.12523078918457, "learning_rate": 8.043208105123578e-08, "loss": 0.2981, "num_input_tokens_seen": 43503488, "step": 13815 }, { "epoch": 0.8847064848601242, "grad_norm": 42.8975830078125, "learning_rate": 7.999360645106579e-08, "loss": 0.335, "num_input_tokens_seen": 43518336, "step": 13820 }, { "epoch": 0.885026566801101, "grad_norm": 17.23529052734375, "learning_rate": 7.955628048968011e-08, "loss": 0.2651, "num_input_tokens_seen": 43532800, "step": 13825 }, { "epoch": 0.885346648742078, "grad_norm": 29.590059280395508, "learning_rate": 7.912010371308564e-08, "loss": 0.2627, "num_input_tokens_seen": 43547648, "step": 13830 }, { "epoch": 0.8856667306830549, "grad_norm": 27.454540252685547, "learning_rate": 7.868507666585422e-08, "loss": 0.2935, "num_input_tokens_seen": 43562688, "step": 13835 }, { "epoch": 0.8859868126240318, "grad_norm": 45.65460968017578, "learning_rate": 7.825119989112172e-08, "loss": 0.4137, "num_input_tokens_seen": 43578176, "step": 13840 }, { "epoch": 0.8863068945650087, "grad_norm": 30.539806365966797, "learning_rate": 7.78184739305886e-08, "loss": 0.2938, "num_input_tokens_seen": 43593920, "step": 13845 }, { "epoch": 0.8866269765059855, "grad_norm": 20.917694091796875, "learning_rate": 7.73868993245187e-08, "loss": 0.3491, "num_input_tokens_seen": 43610944, "step": 13850 }, { "epoch": 0.8869470584469624, "grad_norm": 18.05341911315918, "learning_rate": 7.695647661173754e-08, "loss": 0.3412, "num_input_tokens_seen": 43627008, "step": 13855 }, { "epoch": 0.8872671403879393, "grad_norm": 44.19736862182617, "learning_rate": 7.652720632963284e-08, "loss": 0.3785, "num_input_tokens_seen": 43642752, "step": 13860 }, { "epoch": 0.8875872223289162, "grad_norm": 49.171730041503906, "learning_rate": 7.609908901415396e-08, "loss": 0.3396, "num_input_tokens_seen": 43658496, "step": 13865 }, { "epoch": 0.8879073042698931, "grad_norm": 53.71741485595703, "learning_rate": 7.567212519981047e-08, "loss": 0.4018, "num_input_tokens_seen": 43674304, "step": 13870 }, { "epoch": 0.8882273862108699, "grad_norm": 18.578672409057617, "learning_rate": 7.524631541967108e-08, "loss": 0.3382, "num_input_tokens_seen": 43689536, "step": 13875 }, { "epoch": 0.8885474681518468, "grad_norm": 72.489501953125, "learning_rate": 7.482166020536485e-08, "loss": 0.2903, "num_input_tokens_seen": 43706496, "step": 13880 }, { "epoch": 0.8888675500928238, "grad_norm": 17.48689079284668, "learning_rate": 7.439816008707877e-08, "loss": 0.3108, "num_input_tokens_seen": 43721408, "step": 13885 }, { "epoch": 0.8891876320338007, "grad_norm": 17.783830642700195, "learning_rate": 7.397581559355748e-08, "loss": 0.3216, "num_input_tokens_seen": 43737536, "step": 13890 }, { "epoch": 0.8895077139747776, "grad_norm": 33.39737319946289, "learning_rate": 7.355462725210315e-08, "loss": 0.4116, "num_input_tokens_seen": 43752640, "step": 13895 }, { "epoch": 0.8898277959157544, "grad_norm": 30.600183486938477, "learning_rate": 7.313459558857438e-08, "loss": 0.4081, "num_input_tokens_seen": 43768384, "step": 13900 }, { "epoch": 0.8901478778567313, "grad_norm": 26.679346084594727, "learning_rate": 7.271572112738566e-08, "loss": 0.3108, "num_input_tokens_seen": 43784320, "step": 13905 }, { "epoch": 0.8904679597977082, "grad_norm": 32.508792877197266, "learning_rate": 7.229800439150657e-08, "loss": 0.3582, "num_input_tokens_seen": 43799232, "step": 13910 }, { "epoch": 0.8907880417386851, "grad_norm": 64.69635009765625, "learning_rate": 7.188144590246148e-08, "loss": 0.3721, "num_input_tokens_seen": 43815360, "step": 13915 }, { "epoch": 0.891108123679662, "grad_norm": 24.958736419677734, "learning_rate": 7.146604618032848e-08, "loss": 0.339, "num_input_tokens_seen": 43830336, "step": 13920 }, { "epoch": 0.8914282056206388, "grad_norm": 36.58753967285156, "learning_rate": 7.105180574373904e-08, "loss": 0.4065, "num_input_tokens_seen": 43846656, "step": 13925 }, { "epoch": 0.8917482875616157, "grad_norm": 19.49739646911621, "learning_rate": 7.063872510987712e-08, "loss": 0.3231, "num_input_tokens_seen": 43862720, "step": 13930 }, { "epoch": 0.8920683695025927, "grad_norm": 32.121185302734375, "learning_rate": 7.022680479447874e-08, "loss": 0.3558, "num_input_tokens_seen": 43876800, "step": 13935 }, { "epoch": 0.8923884514435696, "grad_norm": 22.010385513305664, "learning_rate": 6.98160453118316e-08, "loss": 0.2952, "num_input_tokens_seen": 43892160, "step": 13940 }, { "epoch": 0.8927085333845465, "grad_norm": 38.97593688964844, "learning_rate": 6.940644717477328e-08, "loss": 0.333, "num_input_tokens_seen": 43908416, "step": 13945 }, { "epoch": 0.8930286153255234, "grad_norm": 31.57818031311035, "learning_rate": 6.899801089469204e-08, "loss": 0.4213, "num_input_tokens_seen": 43923712, "step": 13950 }, { "epoch": 0.8933486972665002, "grad_norm": 20.735111236572266, "learning_rate": 6.85907369815254e-08, "loss": 0.3555, "num_input_tokens_seen": 43939520, "step": 13955 }, { "epoch": 0.8936687792074771, "grad_norm": 51.4113883972168, "learning_rate": 6.81846259437595e-08, "loss": 0.3895, "num_input_tokens_seen": 43954688, "step": 13960 }, { "epoch": 0.893988861148454, "grad_norm": 53.543155670166016, "learning_rate": 6.77796782884289e-08, "loss": 0.3146, "num_input_tokens_seen": 43969600, "step": 13965 }, { "epoch": 0.8943089430894309, "grad_norm": 46.502647399902344, "learning_rate": 6.737589452111526e-08, "loss": 0.3824, "num_input_tokens_seen": 43985472, "step": 13970 }, { "epoch": 0.8946290250304078, "grad_norm": 39.93029022216797, "learning_rate": 6.697327514594786e-08, "loss": 0.3916, "num_input_tokens_seen": 44000768, "step": 13975 }, { "epoch": 0.8949491069713846, "grad_norm": 41.46504592895508, "learning_rate": 6.657182066560118e-08, "loss": 0.4586, "num_input_tokens_seen": 44017088, "step": 13980 }, { "epoch": 0.8952691889123615, "grad_norm": 26.99639892578125, "learning_rate": 6.617153158129596e-08, "loss": 0.37, "num_input_tokens_seen": 44031488, "step": 13985 }, { "epoch": 0.8955892708533385, "grad_norm": 37.02708435058594, "learning_rate": 6.577240839279807e-08, "loss": 0.337, "num_input_tokens_seen": 44047296, "step": 13990 }, { "epoch": 0.8959093527943154, "grad_norm": 31.63517189025879, "learning_rate": 6.537445159841748e-08, "loss": 0.3143, "num_input_tokens_seen": 44063744, "step": 13995 }, { "epoch": 0.8962294347352923, "grad_norm": 34.43181610107422, "learning_rate": 6.497766169500752e-08, "loss": 0.3936, "num_input_tokens_seen": 44079168, "step": 14000 }, { "epoch": 0.8965495166762691, "grad_norm": 13.677638053894043, "learning_rate": 6.458203917796546e-08, "loss": 0.2643, "num_input_tokens_seen": 44093824, "step": 14005 }, { "epoch": 0.896869598617246, "grad_norm": 19.27773666381836, "learning_rate": 6.418758454123041e-08, "loss": 0.455, "num_input_tokens_seen": 44111296, "step": 14010 }, { "epoch": 0.8971896805582229, "grad_norm": 18.031564712524414, "learning_rate": 6.379429827728377e-08, "loss": 0.3905, "num_input_tokens_seen": 44128000, "step": 14015 }, { "epoch": 0.8975097624991998, "grad_norm": 17.980560302734375, "learning_rate": 6.340218087714799e-08, "loss": 0.3833, "num_input_tokens_seen": 44143488, "step": 14020 }, { "epoch": 0.8978298444401767, "grad_norm": 84.56553649902344, "learning_rate": 6.301123283038634e-08, "loss": 0.3567, "num_input_tokens_seen": 44158976, "step": 14025 }, { "epoch": 0.8981499263811535, "grad_norm": 20.843826293945312, "learning_rate": 6.262145462510193e-08, "loss": 0.319, "num_input_tokens_seen": 44175808, "step": 14030 }, { "epoch": 0.8984700083221304, "grad_norm": 44.17280578613281, "learning_rate": 6.223284674793738e-08, "loss": 0.2817, "num_input_tokens_seen": 44190336, "step": 14035 }, { "epoch": 0.8987900902631074, "grad_norm": 35.57537078857422, "learning_rate": 6.184540968407437e-08, "loss": 0.3835, "num_input_tokens_seen": 44205696, "step": 14040 }, { "epoch": 0.8991101722040843, "grad_norm": 26.58342742919922, "learning_rate": 6.145914391723239e-08, "loss": 0.3546, "num_input_tokens_seen": 44222016, "step": 14045 }, { "epoch": 0.8994302541450612, "grad_norm": 25.470823287963867, "learning_rate": 6.107404992966902e-08, "loss": 0.3285, "num_input_tokens_seen": 44238592, "step": 14050 }, { "epoch": 0.899750336086038, "grad_norm": 23.68887710571289, "learning_rate": 6.069012820217856e-08, "loss": 0.2517, "num_input_tokens_seen": 44254016, "step": 14055 }, { "epoch": 0.9000704180270149, "grad_norm": 28.1870059967041, "learning_rate": 6.030737921409168e-08, "loss": 0.3757, "num_input_tokens_seen": 44269376, "step": 14060 }, { "epoch": 0.9003904999679918, "grad_norm": 53.616127014160156, "learning_rate": 5.992580344327503e-08, "loss": 0.4646, "num_input_tokens_seen": 44284672, "step": 14065 }, { "epoch": 0.9007105819089687, "grad_norm": 33.5253791809082, "learning_rate": 5.954540136613051e-08, "loss": 0.352, "num_input_tokens_seen": 44300224, "step": 14070 }, { "epoch": 0.9010306638499456, "grad_norm": 24.468204498291016, "learning_rate": 5.916617345759456e-08, "loss": 0.3451, "num_input_tokens_seen": 44315264, "step": 14075 }, { "epoch": 0.901094680238141, "eval_loss": 0.3543796241283417, "eval_runtime": 49.176, "eval_samples_per_second": 282.373, "eval_steps_per_second": 35.302, "num_input_tokens_seen": 44318848, "step": 14076 }, { "epoch": 0.9013507457909224, "grad_norm": 45.981563568115234, "learning_rate": 5.878812019113766e-08, "loss": 0.4234, "num_input_tokens_seen": 44330176, "step": 14080 }, { "epoch": 0.9016708277318993, "grad_norm": 22.737422943115234, "learning_rate": 5.84112420387638e-08, "loss": 0.2892, "num_input_tokens_seen": 44345152, "step": 14085 }, { "epoch": 0.9019909096728762, "grad_norm": 31.271459579467773, "learning_rate": 5.8035539471009697e-08, "loss": 0.3656, "num_input_tokens_seen": 44361152, "step": 14090 }, { "epoch": 0.9023109916138532, "grad_norm": 33.406707763671875, "learning_rate": 5.7661012956944253e-08, "loss": 0.4078, "num_input_tokens_seen": 44376128, "step": 14095 }, { "epoch": 0.9026310735548301, "grad_norm": 17.146968841552734, "learning_rate": 5.728766296416876e-08, "loss": 0.2842, "num_input_tokens_seen": 44392192, "step": 14100 }, { "epoch": 0.902951155495807, "grad_norm": 34.22679901123047, "learning_rate": 5.6915489958814453e-08, "loss": 0.4079, "num_input_tokens_seen": 44407680, "step": 14105 }, { "epoch": 0.9032712374367838, "grad_norm": 53.51115798950195, "learning_rate": 5.654449440554399e-08, "loss": 0.4093, "num_input_tokens_seen": 44424384, "step": 14110 }, { "epoch": 0.9035913193777607, "grad_norm": 21.632587432861328, "learning_rate": 5.617467676754972e-08, "loss": 0.3752, "num_input_tokens_seen": 44439744, "step": 14115 }, { "epoch": 0.9039114013187376, "grad_norm": 57.51222610473633, "learning_rate": 5.580603750655344e-08, "loss": 0.3012, "num_input_tokens_seen": 44454272, "step": 14120 }, { "epoch": 0.9042314832597145, "grad_norm": 33.1247444152832, "learning_rate": 5.543857708280497e-08, "loss": 0.3578, "num_input_tokens_seen": 44468992, "step": 14125 }, { "epoch": 0.9045515652006914, "grad_norm": 41.706947326660156, "learning_rate": 5.507229595508367e-08, "loss": 0.4819, "num_input_tokens_seen": 44484864, "step": 14130 }, { "epoch": 0.9048716471416682, "grad_norm": 14.103269577026367, "learning_rate": 5.4707194580695504e-08, "loss": 0.289, "num_input_tokens_seen": 44499968, "step": 14135 }, { "epoch": 0.9051917290826451, "grad_norm": 35.217655181884766, "learning_rate": 5.4343273415473846e-08, "loss": 0.4239, "num_input_tokens_seen": 44517952, "step": 14140 }, { "epoch": 0.905511811023622, "grad_norm": 24.536203384399414, "learning_rate": 5.3980532913778576e-08, "loss": 0.3421, "num_input_tokens_seen": 44532928, "step": 14145 }, { "epoch": 0.905831892964599, "grad_norm": 32.02094650268555, "learning_rate": 5.361897352849554e-08, "loss": 0.3955, "num_input_tokens_seen": 44548288, "step": 14150 }, { "epoch": 0.9061519749055759, "grad_norm": 20.607261657714844, "learning_rate": 5.325859571103586e-08, "loss": 0.3331, "num_input_tokens_seen": 44563712, "step": 14155 }, { "epoch": 0.9064720568465527, "grad_norm": 21.235889434814453, "learning_rate": 5.289939991133508e-08, "loss": 0.3333, "num_input_tokens_seen": 44579264, "step": 14160 }, { "epoch": 0.9067921387875296, "grad_norm": 12.65000057220459, "learning_rate": 5.2541386577853895e-08, "loss": 0.2384, "num_input_tokens_seen": 44594176, "step": 14165 }, { "epoch": 0.9071122207285065, "grad_norm": 16.73200225830078, "learning_rate": 5.2184556157576e-08, "loss": 0.2502, "num_input_tokens_seen": 44609664, "step": 14170 }, { "epoch": 0.9074323026694834, "grad_norm": 52.27291488647461, "learning_rate": 5.1828909096008234e-08, "loss": 0.3649, "num_input_tokens_seen": 44626944, "step": 14175 }, { "epoch": 0.9077523846104603, "grad_norm": 18.205657958984375, "learning_rate": 5.14744458371803e-08, "loss": 0.2331, "num_input_tokens_seen": 44643520, "step": 14180 }, { "epoch": 0.9080724665514371, "grad_norm": 87.80847930908203, "learning_rate": 5.1121166823643646e-08, "loss": 0.5075, "num_input_tokens_seen": 44657984, "step": 14185 }, { "epoch": 0.908392548492414, "grad_norm": 28.186279296875, "learning_rate": 5.076907249647122e-08, "loss": 0.376, "num_input_tokens_seen": 44673024, "step": 14190 }, { "epoch": 0.9087126304333909, "grad_norm": 25.26058578491211, "learning_rate": 5.0418163295257055e-08, "loss": 0.412, "num_input_tokens_seen": 44687424, "step": 14195 }, { "epoch": 0.9090327123743679, "grad_norm": 40.44475555419922, "learning_rate": 5.006843965811536e-08, "loss": 0.2867, "num_input_tokens_seen": 44702976, "step": 14200 }, { "epoch": 0.9093527943153448, "grad_norm": 46.02883529663086, "learning_rate": 4.971990202168008e-08, "loss": 0.482, "num_input_tokens_seen": 44718144, "step": 14205 }, { "epoch": 0.9096728762563216, "grad_norm": 26.443368911743164, "learning_rate": 4.9372550821104697e-08, "loss": 0.3277, "num_input_tokens_seen": 44734912, "step": 14210 }, { "epoch": 0.9099929581972985, "grad_norm": 20.41611671447754, "learning_rate": 4.902638649006119e-08, "loss": 0.311, "num_input_tokens_seen": 44749888, "step": 14215 }, { "epoch": 0.9103130401382754, "grad_norm": 19.726547241210938, "learning_rate": 4.868140946073973e-08, "loss": 0.3201, "num_input_tokens_seen": 44764544, "step": 14220 }, { "epoch": 0.9106331220792523, "grad_norm": 32.19831848144531, "learning_rate": 4.833762016384857e-08, "loss": 0.2995, "num_input_tokens_seen": 44780992, "step": 14225 }, { "epoch": 0.9109532040202292, "grad_norm": 50.0634880065918, "learning_rate": 4.799501902861214e-08, "loss": 0.3879, "num_input_tokens_seen": 44796672, "step": 14230 }, { "epoch": 0.911273285961206, "grad_norm": 44.15312957763672, "learning_rate": 4.765360648277217e-08, "loss": 0.4313, "num_input_tokens_seen": 44812224, "step": 14235 }, { "epoch": 0.9115933679021829, "grad_norm": 38.931339263916016, "learning_rate": 4.7313382952586465e-08, "loss": 0.4254, "num_input_tokens_seen": 44827136, "step": 14240 }, { "epoch": 0.9119134498431598, "grad_norm": 16.312923431396484, "learning_rate": 4.6974348862828027e-08, "loss": 0.3787, "num_input_tokens_seen": 44842176, "step": 14245 }, { "epoch": 0.9122335317841367, "grad_norm": 47.28225326538086, "learning_rate": 4.663650463678448e-08, "loss": 0.4211, "num_input_tokens_seen": 44858880, "step": 14250 }, { "epoch": 0.9125536137251137, "grad_norm": 21.42548942565918, "learning_rate": 4.629985069625875e-08, "loss": 0.4399, "num_input_tokens_seen": 44875328, "step": 14255 }, { "epoch": 0.9128736956660906, "grad_norm": 41.41118240356445, "learning_rate": 4.596438746156728e-08, "loss": 0.3625, "num_input_tokens_seen": 44892032, "step": 14260 }, { "epoch": 0.9131937776070674, "grad_norm": 35.68510818481445, "learning_rate": 4.563011535153949e-08, "loss": 0.3618, "num_input_tokens_seen": 44907328, "step": 14265 }, { "epoch": 0.9135138595480443, "grad_norm": 26.231754302978516, "learning_rate": 4.52970347835181e-08, "loss": 0.2686, "num_input_tokens_seen": 44922560, "step": 14270 }, { "epoch": 0.9138339414890212, "grad_norm": 34.4133186340332, "learning_rate": 4.496514617335845e-08, "loss": 0.3256, "num_input_tokens_seen": 44937728, "step": 14275 }, { "epoch": 0.9141540234299981, "grad_norm": 42.511531829833984, "learning_rate": 4.4634449935427197e-08, "loss": 0.3568, "num_input_tokens_seen": 44954560, "step": 14280 }, { "epoch": 0.914474105370975, "grad_norm": 28.035154342651367, "learning_rate": 4.430494648260219e-08, "loss": 0.3032, "num_input_tokens_seen": 44971520, "step": 14285 }, { "epoch": 0.9147941873119518, "grad_norm": 35.39820098876953, "learning_rate": 4.397663622627279e-08, "loss": 0.4391, "num_input_tokens_seen": 44987392, "step": 14290 }, { "epoch": 0.9151142692529287, "grad_norm": 25.651020050048828, "learning_rate": 4.364951957633789e-08, "loss": 0.3116, "num_input_tokens_seen": 45002688, "step": 14295 }, { "epoch": 0.9154343511939056, "grad_norm": 29.278078079223633, "learning_rate": 4.332359694120669e-08, "loss": 0.2874, "num_input_tokens_seen": 45017792, "step": 14300 }, { "epoch": 0.9157544331348826, "grad_norm": 33.1219482421875, "learning_rate": 4.299886872779734e-08, "loss": 0.3561, "num_input_tokens_seen": 45032640, "step": 14305 }, { "epoch": 0.9160745150758595, "grad_norm": 29.479825973510742, "learning_rate": 4.267533534153678e-08, "loss": 0.2945, "num_input_tokens_seen": 45048256, "step": 14310 }, { "epoch": 0.9163945970168363, "grad_norm": 26.894004821777344, "learning_rate": 4.2352997186360316e-08, "loss": 0.3251, "num_input_tokens_seen": 45064192, "step": 14315 }, { "epoch": 0.9167146789578132, "grad_norm": 19.898136138916016, "learning_rate": 4.203185466471082e-08, "loss": 0.321, "num_input_tokens_seen": 45079488, "step": 14320 }, { "epoch": 0.9170347608987901, "grad_norm": 20.337265014648438, "learning_rate": 4.1711908177538556e-08, "loss": 0.3791, "num_input_tokens_seen": 45095616, "step": 14325 }, { "epoch": 0.917354842839767, "grad_norm": 45.242088317871094, "learning_rate": 4.139315812430055e-08, "loss": 0.3797, "num_input_tokens_seen": 45110592, "step": 14330 }, { "epoch": 0.9176749247807439, "grad_norm": 29.204076766967773, "learning_rate": 4.1075604902959915e-08, "loss": 0.3756, "num_input_tokens_seen": 45127168, "step": 14335 }, { "epoch": 0.9179950067217207, "grad_norm": 31.663959503173828, "learning_rate": 4.07592489099855e-08, "loss": 0.3157, "num_input_tokens_seen": 45142208, "step": 14340 }, { "epoch": 0.9183150886626976, "grad_norm": 38.191898345947266, "learning_rate": 4.044409054035147e-08, "loss": 0.3917, "num_input_tokens_seen": 45157184, "step": 14345 }, { "epoch": 0.9186351706036745, "grad_norm": 15.774205207824707, "learning_rate": 4.0130130187537195e-08, "loss": 0.3891, "num_input_tokens_seen": 45174464, "step": 14350 }, { "epoch": 0.9189552525446514, "grad_norm": 36.91510772705078, "learning_rate": 3.981736824352522e-08, "loss": 0.3157, "num_input_tokens_seen": 45188992, "step": 14355 }, { "epoch": 0.9192753344856284, "grad_norm": 32.23750305175781, "learning_rate": 3.950580509880286e-08, "loss": 0.4661, "num_input_tokens_seen": 45204032, "step": 14360 }, { "epoch": 0.9195954164266052, "grad_norm": 46.32685089111328, "learning_rate": 3.9195441142360066e-08, "loss": 0.4012, "num_input_tokens_seen": 45219328, "step": 14365 }, { "epoch": 0.9199154983675821, "grad_norm": 23.546079635620117, "learning_rate": 3.888627676169043e-08, "loss": 0.3271, "num_input_tokens_seen": 45235584, "step": 14370 }, { "epoch": 0.920235580308559, "grad_norm": 39.16623306274414, "learning_rate": 3.857831234278886e-08, "loss": 0.3709, "num_input_tokens_seen": 45250880, "step": 14375 }, { "epoch": 0.9205556622495359, "grad_norm": 31.843650817871094, "learning_rate": 3.827154827015255e-08, "loss": 0.4085, "num_input_tokens_seen": 45266752, "step": 14380 }, { "epoch": 0.9208757441905128, "grad_norm": 12.346802711486816, "learning_rate": 3.7965984926780383e-08, "loss": 0.2914, "num_input_tokens_seen": 45282496, "step": 14385 }, { "epoch": 0.9211958261314896, "grad_norm": 41.83573532104492, "learning_rate": 3.766162269417139e-08, "loss": 0.3577, "num_input_tokens_seen": 45297024, "step": 14390 }, { "epoch": 0.9215159080724665, "grad_norm": 45.033992767333984, "learning_rate": 3.73584619523255e-08, "loss": 0.3693, "num_input_tokens_seen": 45314176, "step": 14395 }, { "epoch": 0.9218359900134434, "grad_norm": 21.012765884399414, "learning_rate": 3.7056503079742616e-08, "loss": 0.3557, "num_input_tokens_seen": 45329344, "step": 14400 }, { "epoch": 0.9221560719544203, "grad_norm": 29.65179443359375, "learning_rate": 3.6755746453421945e-08, "loss": 0.3428, "num_input_tokens_seen": 45344384, "step": 14405 }, { "epoch": 0.9224761538953972, "grad_norm": 13.857353210449219, "learning_rate": 3.645619244886145e-08, "loss": 0.2869, "num_input_tokens_seen": 45360192, "step": 14410 }, { "epoch": 0.9227962358363742, "grad_norm": 14.174830436706543, "learning_rate": 3.615784144005796e-08, "loss": 0.3103, "num_input_tokens_seen": 45376000, "step": 14415 }, { "epoch": 0.923116317777351, "grad_norm": 30.094505310058594, "learning_rate": 3.5860693799506184e-08, "loss": 0.4093, "num_input_tokens_seen": 45390400, "step": 14420 }, { "epoch": 0.9234363997183279, "grad_norm": 29.435256958007812, "learning_rate": 3.5564749898198466e-08, "loss": 0.4518, "num_input_tokens_seen": 45406976, "step": 14425 }, { "epoch": 0.9237564816593048, "grad_norm": 33.67948913574219, "learning_rate": 3.527001010562425e-08, "loss": 0.3481, "num_input_tokens_seen": 45422080, "step": 14430 }, { "epoch": 0.9240765636002817, "grad_norm": 52.893489837646484, "learning_rate": 3.4976474789769504e-08, "loss": 0.3429, "num_input_tokens_seen": 45439296, "step": 14435 }, { "epoch": 0.9243966455412586, "grad_norm": 34.073848724365234, "learning_rate": 3.4684144317116636e-08, "loss": 0.2983, "num_input_tokens_seen": 45454208, "step": 14440 }, { "epoch": 0.9247167274822354, "grad_norm": 18.271291732788086, "learning_rate": 3.439301905264369e-08, "loss": 0.3001, "num_input_tokens_seen": 45470400, "step": 14445 }, { "epoch": 0.9250368094232123, "grad_norm": 46.16067123413086, "learning_rate": 3.410309935982403e-08, "loss": 0.3212, "num_input_tokens_seen": 45486528, "step": 14450 }, { "epoch": 0.9253568913641892, "grad_norm": 17.307554244995117, "learning_rate": 3.381438560062555e-08, "loss": 0.3429, "num_input_tokens_seen": 45501440, "step": 14455 }, { "epoch": 0.9256769733051661, "grad_norm": 38.451210021972656, "learning_rate": 3.3526878135511025e-08, "loss": 0.3181, "num_input_tokens_seen": 45517760, "step": 14460 }, { "epoch": 0.9259970552461431, "grad_norm": 48.87675094604492, "learning_rate": 3.324057732343666e-08, "loss": 0.3642, "num_input_tokens_seen": 45533056, "step": 14465 }, { "epoch": 0.9263171371871199, "grad_norm": 24.82399559020996, "learning_rate": 3.295548352185262e-08, "loss": 0.4131, "num_input_tokens_seen": 45549248, "step": 14470 }, { "epoch": 0.9266372191280968, "grad_norm": 36.503944396972656, "learning_rate": 3.2671597086701753e-08, "loss": 0.3477, "num_input_tokens_seen": 45565760, "step": 14475 }, { "epoch": 0.9269573010690737, "grad_norm": 23.015771865844727, "learning_rate": 3.238891837241964e-08, "loss": 0.3246, "num_input_tokens_seen": 45581568, "step": 14480 }, { "epoch": 0.9272773830100506, "grad_norm": 43.855220794677734, "learning_rate": 3.210744773193386e-08, "loss": 0.4038, "num_input_tokens_seen": 45596928, "step": 14485 }, { "epoch": 0.9275974649510275, "grad_norm": 45.25807189941406, "learning_rate": 3.182718551666386e-08, "loss": 0.2948, "num_input_tokens_seen": 45612800, "step": 14490 }, { "epoch": 0.9279175468920043, "grad_norm": 79.2214584350586, "learning_rate": 3.154813207652063e-08, "loss": 0.4114, "num_input_tokens_seen": 45627584, "step": 14495 }, { "epoch": 0.9282376288329812, "grad_norm": 48.060794830322266, "learning_rate": 3.1270287759905143e-08, "loss": 0.3379, "num_input_tokens_seen": 45643840, "step": 14500 }, { "epoch": 0.9285577107739581, "grad_norm": 15.610395431518555, "learning_rate": 3.0993652913709476e-08, "loss": 0.2884, "num_input_tokens_seen": 45659072, "step": 14505 }, { "epoch": 0.928877792714935, "grad_norm": 27.879131317138672, "learning_rate": 3.0718227883315796e-08, "loss": 0.482, "num_input_tokens_seen": 45675328, "step": 14510 }, { "epoch": 0.9291978746559119, "grad_norm": 39.35497283935547, "learning_rate": 3.044401301259503e-08, "loss": 0.368, "num_input_tokens_seen": 45690816, "step": 14515 }, { "epoch": 0.9295179565968889, "grad_norm": 15.0499267578125, "learning_rate": 3.017100864390787e-08, "loss": 0.3333, "num_input_tokens_seen": 45706432, "step": 14520 }, { "epoch": 0.9298380385378657, "grad_norm": 51.364315032958984, "learning_rate": 2.9899215118103446e-08, "loss": 0.3446, "num_input_tokens_seen": 45721920, "step": 14525 }, { "epoch": 0.9301581204788426, "grad_norm": 15.155922889709473, "learning_rate": 2.9628632774519435e-08, "loss": 0.3433, "num_input_tokens_seen": 45738048, "step": 14530 }, { "epoch": 0.9304782024198195, "grad_norm": 24.992616653442383, "learning_rate": 2.9359261950980485e-08, "loss": 0.3308, "num_input_tokens_seen": 45753856, "step": 14535 }, { "epoch": 0.9307982843607964, "grad_norm": 22.78838539123535, "learning_rate": 2.90911029837998e-08, "loss": 0.3015, "num_input_tokens_seen": 45768704, "step": 14540 }, { "epoch": 0.9311183663017732, "grad_norm": 28.63710594177246, "learning_rate": 2.8824156207776673e-08, "loss": 0.2789, "num_input_tokens_seen": 45783936, "step": 14545 }, { "epoch": 0.9314384482427501, "grad_norm": 115.81269836425781, "learning_rate": 2.8558421956197397e-08, "loss": 0.4514, "num_input_tokens_seen": 45800320, "step": 14550 }, { "epoch": 0.931758530183727, "grad_norm": 36.78664779663086, "learning_rate": 2.829390056083436e-08, "loss": 0.3864, "num_input_tokens_seen": 45816512, "step": 14555 }, { "epoch": 0.9320786121247039, "grad_norm": 21.332889556884766, "learning_rate": 2.8030592351945492e-08, "loss": 0.3037, "num_input_tokens_seen": 45831936, "step": 14560 }, { "epoch": 0.9323986940656808, "grad_norm": 20.547264099121094, "learning_rate": 2.776849765827427e-08, "loss": 0.2968, "num_input_tokens_seen": 45846784, "step": 14565 }, { "epoch": 0.9327187760066578, "grad_norm": 39.512290954589844, "learning_rate": 2.750761680704905e-08, "loss": 0.4282, "num_input_tokens_seen": 45862080, "step": 14570 }, { "epoch": 0.9330388579476346, "grad_norm": 40.28529357910156, "learning_rate": 2.724795012398251e-08, "loss": 0.3937, "num_input_tokens_seen": 45878528, "step": 14575 }, { "epoch": 0.9333589398886115, "grad_norm": 36.721534729003906, "learning_rate": 2.6989497933271543e-08, "loss": 0.3737, "num_input_tokens_seen": 45894016, "step": 14580 }, { "epoch": 0.9336790218295884, "grad_norm": 18.749881744384766, "learning_rate": 2.673226055759692e-08, "loss": 0.3295, "num_input_tokens_seen": 45909504, "step": 14585 }, { "epoch": 0.9339991037705653, "grad_norm": 31.62596321105957, "learning_rate": 2.6476238318122402e-08, "loss": 0.338, "num_input_tokens_seen": 45925376, "step": 14590 }, { "epoch": 0.9343191857115422, "grad_norm": 33.345306396484375, "learning_rate": 2.6221431534494742e-08, "loss": 0.3956, "num_input_tokens_seen": 45940224, "step": 14595 }, { "epoch": 0.934639267652519, "grad_norm": 57.66178894042969, "learning_rate": 2.5967840524843243e-08, "loss": 0.3521, "num_input_tokens_seen": 45955072, "step": 14600 }, { "epoch": 0.9349593495934959, "grad_norm": 33.97639846801758, "learning_rate": 2.5715465605779195e-08, "loss": 0.4287, "num_input_tokens_seen": 45970240, "step": 14605 }, { "epoch": 0.9352794315344728, "grad_norm": 83.71870422363281, "learning_rate": 2.5464307092395777e-08, "loss": 0.406, "num_input_tokens_seen": 45985856, "step": 14610 }, { "epoch": 0.9355995134754497, "grad_norm": 20.36864471435547, "learning_rate": 2.5214365298267148e-08, "loss": 0.3398, "num_input_tokens_seen": 46000256, "step": 14615 }, { "epoch": 0.9359195954164266, "grad_norm": 26.265127182006836, "learning_rate": 2.4965640535448917e-08, "loss": 0.32, "num_input_tokens_seen": 46015616, "step": 14620 }, { "epoch": 0.9362396773574035, "grad_norm": 32.42552185058594, "learning_rate": 2.471813311447657e-08, "loss": 0.3741, "num_input_tokens_seen": 46031040, "step": 14625 }, { "epoch": 0.9365597592983804, "grad_norm": 37.86249542236328, "learning_rate": 2.4471843344365915e-08, "loss": 0.3304, "num_input_tokens_seen": 46046016, "step": 14630 }, { "epoch": 0.9368798412393573, "grad_norm": 17.967323303222656, "learning_rate": 2.42267715326131e-08, "loss": 0.2715, "num_input_tokens_seen": 46062528, "step": 14635 }, { "epoch": 0.9371999231803342, "grad_norm": 31.25685691833496, "learning_rate": 2.3982917985192697e-08, "loss": 0.3426, "num_input_tokens_seen": 46078144, "step": 14640 }, { "epoch": 0.9375200051213111, "grad_norm": 53.25637435913086, "learning_rate": 2.3740283006558838e-08, "loss": 0.3748, "num_input_tokens_seen": 46096896, "step": 14645 }, { "epoch": 0.9378400870622879, "grad_norm": 47.64904022216797, "learning_rate": 2.349886689964431e-08, "loss": 0.3715, "num_input_tokens_seen": 46111808, "step": 14650 }, { "epoch": 0.9381601690032648, "grad_norm": 36.294498443603516, "learning_rate": 2.32586699658599e-08, "loss": 0.2804, "num_input_tokens_seen": 46127936, "step": 14655 }, { "epoch": 0.9384802509442417, "grad_norm": 22.60685920715332, "learning_rate": 2.3019692505094056e-08, "loss": 0.3522, "num_input_tokens_seen": 46142848, "step": 14660 }, { "epoch": 0.9388003328852186, "grad_norm": 51.22877502441406, "learning_rate": 2.2781934815713223e-08, "loss": 0.5364, "num_input_tokens_seen": 46158848, "step": 14665 }, { "epoch": 0.9391204148261955, "grad_norm": 28.425065994262695, "learning_rate": 2.254539719456061e-08, "loss": 0.3566, "num_input_tokens_seen": 46174912, "step": 14670 }, { "epoch": 0.9394404967671725, "grad_norm": 19.683509826660156, "learning_rate": 2.231007993695633e-08, "loss": 0.2587, "num_input_tokens_seen": 46189248, "step": 14675 }, { "epoch": 0.9397605787081493, "grad_norm": 19.60419273376465, "learning_rate": 2.2075983336696357e-08, "loss": 0.314, "num_input_tokens_seen": 46204928, "step": 14680 }, { "epoch": 0.9400806606491262, "grad_norm": 40.57781982421875, "learning_rate": 2.1843107686053353e-08, "loss": 0.3916, "num_input_tokens_seen": 46220160, "step": 14685 }, { "epoch": 0.9404007425901031, "grad_norm": 24.233959197998047, "learning_rate": 2.1611453275775405e-08, "loss": 0.4249, "num_input_tokens_seen": 46235584, "step": 14690 }, { "epoch": 0.94072082453108, "grad_norm": 20.722745895385742, "learning_rate": 2.138102039508538e-08, "loss": 0.2691, "num_input_tokens_seen": 46251904, "step": 14695 }, { "epoch": 0.9410409064720568, "grad_norm": 43.360191345214844, "learning_rate": 2.1151809331681703e-08, "loss": 0.3948, "num_input_tokens_seen": 46268032, "step": 14700 }, { "epoch": 0.9413609884130337, "grad_norm": 54.16123962402344, "learning_rate": 2.092382037173701e-08, "loss": 0.3362, "num_input_tokens_seen": 46283392, "step": 14705 }, { "epoch": 0.9416810703540106, "grad_norm": 26.91010856628418, "learning_rate": 2.0697053799898277e-08, "loss": 0.2966, "num_input_tokens_seen": 46298752, "step": 14710 }, { "epoch": 0.9420011522949875, "grad_norm": 29.30316734313965, "learning_rate": 2.0471509899286144e-08, "loss": 0.3392, "num_input_tokens_seen": 46314624, "step": 14715 }, { "epoch": 0.9423212342359644, "grad_norm": 25.85833740234375, "learning_rate": 2.0247188951494797e-08, "loss": 0.3403, "num_input_tokens_seen": 46331712, "step": 14720 }, { "epoch": 0.9426413161769412, "grad_norm": 49.84812927246094, "learning_rate": 2.0024091236591655e-08, "loss": 0.5398, "num_input_tokens_seen": 46347200, "step": 14725 }, { "epoch": 0.9429613981179182, "grad_norm": 17.558185577392578, "learning_rate": 1.98022170331168e-08, "loss": 0.3166, "num_input_tokens_seen": 46363008, "step": 14730 }, { "epoch": 0.9432814800588951, "grad_norm": 32.16617202758789, "learning_rate": 1.9581566618082744e-08, "loss": 0.3797, "num_input_tokens_seen": 46378816, "step": 14735 }, { "epoch": 0.943601561999872, "grad_norm": 57.684410095214844, "learning_rate": 1.9362140266974025e-08, "loss": 0.3915, "num_input_tokens_seen": 46395200, "step": 14740 }, { "epoch": 0.9439216439408489, "grad_norm": 53.940555572509766, "learning_rate": 1.9143938253747383e-08, "loss": 0.3198, "num_input_tokens_seen": 46411840, "step": 14745 }, { "epoch": 0.9442417258818258, "grad_norm": 25.7904109954834, "learning_rate": 1.892696085083023e-08, "loss": 0.4515, "num_input_tokens_seen": 46427776, "step": 14750 }, { "epoch": 0.9445618078228026, "grad_norm": 36.919376373291016, "learning_rate": 1.8711208329121542e-08, "loss": 0.3118, "num_input_tokens_seen": 46444736, "step": 14755 }, { "epoch": 0.9448818897637795, "grad_norm": 26.23403549194336, "learning_rate": 1.849668095799084e-08, "loss": 0.3325, "num_input_tokens_seen": 46460672, "step": 14760 }, { "epoch": 0.9452019717047564, "grad_norm": 24.87689781188965, "learning_rate": 1.8283379005278098e-08, "loss": 0.3344, "num_input_tokens_seen": 46476736, "step": 14765 }, { "epoch": 0.9455220536457333, "grad_norm": 13.15492057800293, "learning_rate": 1.807130273729329e-08, "loss": 0.3231, "num_input_tokens_seen": 46492416, "step": 14770 }, { "epoch": 0.9458421355867102, "grad_norm": 36.111331939697266, "learning_rate": 1.7860452418816173e-08, "loss": 0.3349, "num_input_tokens_seen": 46507264, "step": 14775 }, { "epoch": 0.946162217527687, "grad_norm": 28.380617141723633, "learning_rate": 1.7650828313095834e-08, "loss": 0.3288, "num_input_tokens_seen": 46524224, "step": 14780 }, { "epoch": 0.946482299468664, "grad_norm": 14.132955551147461, "learning_rate": 1.7442430681850362e-08, "loss": 0.3101, "num_input_tokens_seen": 46539456, "step": 14785 }, { "epoch": 0.9468023814096409, "grad_norm": 38.144737243652344, "learning_rate": 1.723525978526652e-08, "loss": 0.4302, "num_input_tokens_seen": 46555136, "step": 14790 }, { "epoch": 0.9471224633506178, "grad_norm": 27.17024040222168, "learning_rate": 1.702931588199996e-08, "loss": 0.3501, "num_input_tokens_seen": 46570432, "step": 14795 }, { "epoch": 0.9474425452915947, "grad_norm": 30.944738388061523, "learning_rate": 1.6824599229173897e-08, "loss": 0.3115, "num_input_tokens_seen": 46586304, "step": 14800 }, { "epoch": 0.9477626272325715, "grad_norm": 33.253997802734375, "learning_rate": 1.662111008237932e-08, "loss": 0.2909, "num_input_tokens_seen": 46602432, "step": 14805 }, { "epoch": 0.9480827091735484, "grad_norm": 33.023921966552734, "learning_rate": 1.6418848695675003e-08, "loss": 0.3218, "num_input_tokens_seen": 46617472, "step": 14810 }, { "epoch": 0.9484027911145253, "grad_norm": 35.12213897705078, "learning_rate": 1.6217815321586614e-08, "loss": 0.372, "num_input_tokens_seen": 46632896, "step": 14815 }, { "epoch": 0.9487228730555022, "grad_norm": 18.142263412475586, "learning_rate": 1.6018010211106602e-08, "loss": 0.355, "num_input_tokens_seen": 46649408, "step": 14820 }, { "epoch": 0.9490429549964791, "grad_norm": 16.464832305908203, "learning_rate": 1.58194336136942e-08, "loss": 0.2816, "num_input_tokens_seen": 46665344, "step": 14825 }, { "epoch": 0.9493630369374559, "grad_norm": 36.46229934692383, "learning_rate": 1.5622085777274417e-08, "loss": 0.4274, "num_input_tokens_seen": 46680704, "step": 14830 }, { "epoch": 0.9496831188784329, "grad_norm": 39.555789947509766, "learning_rate": 1.542596694823839e-08, "loss": 0.3333, "num_input_tokens_seen": 46695936, "step": 14835 }, { "epoch": 0.9500032008194098, "grad_norm": 54.21735382080078, "learning_rate": 1.5231077371442914e-08, "loss": 0.4259, "num_input_tokens_seen": 46711680, "step": 14840 }, { "epoch": 0.9503232827603867, "grad_norm": 24.265138626098633, "learning_rate": 1.5037417290209685e-08, "loss": 0.2888, "num_input_tokens_seen": 46727040, "step": 14845 }, { "epoch": 0.9506433647013636, "grad_norm": 37.78664779663086, "learning_rate": 1.4844986946325743e-08, "loss": 0.393, "num_input_tokens_seen": 46742720, "step": 14850 }, { "epoch": 0.9509634466423404, "grad_norm": 23.887489318847656, "learning_rate": 1.4653786580042681e-08, "loss": 0.2502, "num_input_tokens_seen": 46758336, "step": 14855 }, { "epoch": 0.9511554958069266, "eval_loss": 0.3537425398826599, "eval_runtime": 49.1421, "eval_samples_per_second": 282.568, "eval_steps_per_second": 35.326, "num_input_tokens_seen": 46767552, "step": 14858 }, { "epoch": 0.9512835285833173, "grad_norm": 22.978870391845703, "learning_rate": 1.4463816430076215e-08, "loss": 0.3108, "num_input_tokens_seen": 46773312, "step": 14860 }, { "epoch": 0.9516036105242942, "grad_norm": 39.241058349609375, "learning_rate": 1.4275076733606395e-08, "loss": 0.3685, "num_input_tokens_seen": 46787968, "step": 14865 }, { "epoch": 0.9519236924652711, "grad_norm": 24.853103637695312, "learning_rate": 1.4087567726277061e-08, "loss": 0.2913, "num_input_tokens_seen": 46803712, "step": 14870 }, { "epoch": 0.952243774406248, "grad_norm": 28.337535858154297, "learning_rate": 1.390128964219528e-08, "loss": 0.2789, "num_input_tokens_seen": 46820288, "step": 14875 }, { "epoch": 0.9525638563472248, "grad_norm": 45.00613784790039, "learning_rate": 1.3716242713931348e-08, "loss": 0.3819, "num_input_tokens_seen": 46835904, "step": 14880 }, { "epoch": 0.9528839382882017, "grad_norm": 27.987937927246094, "learning_rate": 1.3532427172518789e-08, "loss": 0.3714, "num_input_tokens_seen": 46851136, "step": 14885 }, { "epoch": 0.9532040202291787, "grad_norm": 34.979331970214844, "learning_rate": 1.3349843247453252e-08, "loss": 0.3343, "num_input_tokens_seen": 46867456, "step": 14890 }, { "epoch": 0.9535241021701556, "grad_norm": 26.81144905090332, "learning_rate": 1.3168491166692941e-08, "loss": 0.2772, "num_input_tokens_seen": 46882816, "step": 14895 }, { "epoch": 0.9538441841111325, "grad_norm": 40.77924728393555, "learning_rate": 1.2988371156658073e-08, "loss": 0.4506, "num_input_tokens_seen": 46898624, "step": 14900 }, { "epoch": 0.9541642660521094, "grad_norm": 28.05156135559082, "learning_rate": 1.2809483442230763e-08, "loss": 0.282, "num_input_tokens_seen": 46914304, "step": 14905 }, { "epoch": 0.9544843479930862, "grad_norm": 21.98477554321289, "learning_rate": 1.2631828246754128e-08, "loss": 0.3705, "num_input_tokens_seen": 46930368, "step": 14910 }, { "epoch": 0.9548044299340631, "grad_norm": 38.7076301574707, "learning_rate": 1.2455405792032969e-08, "loss": 0.364, "num_input_tokens_seen": 46945792, "step": 14915 }, { "epoch": 0.95512451187504, "grad_norm": 32.54359817504883, "learning_rate": 1.2280216298332646e-08, "loss": 0.342, "num_input_tokens_seen": 46962048, "step": 14920 }, { "epoch": 0.9554445938160169, "grad_norm": 53.13780212402344, "learning_rate": 1.2106259984379642e-08, "loss": 0.4603, "num_input_tokens_seen": 46976768, "step": 14925 }, { "epoch": 0.9557646757569938, "grad_norm": 45.00946807861328, "learning_rate": 1.1933537067359889e-08, "loss": 0.4141, "num_input_tokens_seen": 46991424, "step": 14930 }, { "epoch": 0.9560847576979706, "grad_norm": 24.874343872070312, "learning_rate": 1.1762047762920446e-08, "loss": 0.3607, "num_input_tokens_seen": 47006656, "step": 14935 }, { "epoch": 0.9564048396389476, "grad_norm": 51.970680236816406, "learning_rate": 1.1591792285167602e-08, "loss": 0.3576, "num_input_tokens_seen": 47021824, "step": 14940 }, { "epoch": 0.9567249215799245, "grad_norm": 29.96383285522461, "learning_rate": 1.1422770846667206e-08, "loss": 0.3907, "num_input_tokens_seen": 47037440, "step": 14945 }, { "epoch": 0.9570450035209014, "grad_norm": 19.72380256652832, "learning_rate": 1.1254983658444572e-08, "loss": 0.307, "num_input_tokens_seen": 47053760, "step": 14950 }, { "epoch": 0.9573650854618783, "grad_norm": 46.794639587402344, "learning_rate": 1.1088430929984017e-08, "loss": 0.3148, "num_input_tokens_seen": 47068928, "step": 14955 }, { "epoch": 0.9576851674028551, "grad_norm": 37.3883056640625, "learning_rate": 1.0923112869228645e-08, "loss": 0.383, "num_input_tokens_seen": 47084672, "step": 14960 }, { "epoch": 0.958005249343832, "grad_norm": 41.08680725097656, "learning_rate": 1.0759029682579801e-08, "loss": 0.3613, "num_input_tokens_seen": 47101632, "step": 14965 }, { "epoch": 0.9583253312848089, "grad_norm": 24.6757755279541, "learning_rate": 1.0596181574897389e-08, "loss": 0.306, "num_input_tokens_seen": 47116480, "step": 14970 }, { "epoch": 0.9586454132257858, "grad_norm": 29.715951919555664, "learning_rate": 1.0434568749499107e-08, "loss": 0.3155, "num_input_tokens_seen": 47132992, "step": 14975 }, { "epoch": 0.9589654951667627, "grad_norm": 26.07288932800293, "learning_rate": 1.027419140816066e-08, "loss": 0.3061, "num_input_tokens_seen": 47149056, "step": 14980 }, { "epoch": 0.9592855771077395, "grad_norm": 23.639156341552734, "learning_rate": 1.0115049751114768e-08, "loss": 0.2984, "num_input_tokens_seen": 47164864, "step": 14985 }, { "epoch": 0.9596056590487164, "grad_norm": 18.913105010986328, "learning_rate": 9.957143977051941e-09, "loss": 0.3481, "num_input_tokens_seen": 47180544, "step": 14990 }, { "epoch": 0.9599257409896934, "grad_norm": 29.4930362701416, "learning_rate": 9.800474283119142e-09, "loss": 0.3836, "num_input_tokens_seen": 47196608, "step": 14995 }, { "epoch": 0.9602458229306703, "grad_norm": 26.606163024902344, "learning_rate": 9.645040864920462e-09, "loss": 0.3701, "num_input_tokens_seen": 47213504, "step": 15000 }, { "epoch": 0.9605659048716472, "grad_norm": 32.366455078125, "learning_rate": 9.490843916516334e-09, "loss": 0.4056, "num_input_tokens_seen": 47228288, "step": 15005 }, { "epoch": 0.960885986812624, "grad_norm": 25.494123458862305, "learning_rate": 9.337883630423316e-09, "loss": 0.4448, "num_input_tokens_seen": 47243712, "step": 15010 }, { "epoch": 0.9612060687536009, "grad_norm": 50.839359283447266, "learning_rate": 9.186160197614423e-09, "loss": 0.4909, "num_input_tokens_seen": 47259904, "step": 15015 }, { "epoch": 0.9615261506945778, "grad_norm": 33.710933685302734, "learning_rate": 9.035673807517795e-09, "loss": 0.4837, "num_input_tokens_seen": 47275072, "step": 15020 }, { "epoch": 0.9618462326355547, "grad_norm": 42.61496353149414, "learning_rate": 8.886424648017698e-09, "loss": 0.27, "num_input_tokens_seen": 47290688, "step": 15025 }, { "epoch": 0.9621663145765316, "grad_norm": 18.92186737060547, "learning_rate": 8.738412905453408e-09, "loss": 0.3408, "num_input_tokens_seen": 47306496, "step": 15030 }, { "epoch": 0.9624863965175084, "grad_norm": 29.760217666625977, "learning_rate": 8.591638764619324e-09, "loss": 0.3575, "num_input_tokens_seen": 47321280, "step": 15035 }, { "epoch": 0.9628064784584853, "grad_norm": 45.232330322265625, "learning_rate": 8.446102408764643e-09, "loss": 0.3623, "num_input_tokens_seen": 47337536, "step": 15040 }, { "epoch": 0.9631265603994623, "grad_norm": 38.70942687988281, "learning_rate": 8.301804019593129e-09, "loss": 0.273, "num_input_tokens_seen": 47353024, "step": 15045 }, { "epoch": 0.9634466423404392, "grad_norm": 31.57654571533203, "learning_rate": 8.158743777263333e-09, "loss": 0.3535, "num_input_tokens_seen": 47369088, "step": 15050 }, { "epoch": 0.9637667242814161, "grad_norm": 26.071718215942383, "learning_rate": 8.016921860387272e-09, "loss": 0.3678, "num_input_tokens_seen": 47384320, "step": 15055 }, { "epoch": 0.964086806222393, "grad_norm": 28.67797088623047, "learning_rate": 7.876338446031416e-09, "loss": 0.3908, "num_input_tokens_seen": 47400896, "step": 15060 }, { "epoch": 0.9644068881633698, "grad_norm": 44.70686340332031, "learning_rate": 7.736993709716033e-09, "loss": 0.3169, "num_input_tokens_seen": 47416896, "step": 15065 }, { "epoch": 0.9647269701043467, "grad_norm": 49.233890533447266, "learning_rate": 7.59888782541418e-09, "loss": 0.4783, "num_input_tokens_seen": 47432320, "step": 15070 }, { "epoch": 0.9650470520453236, "grad_norm": 16.93093490600586, "learning_rate": 7.462020965553151e-09, "loss": 0.2656, "num_input_tokens_seen": 47448320, "step": 15075 }, { "epoch": 0.9653671339863005, "grad_norm": 17.901084899902344, "learning_rate": 7.32639330101259e-09, "loss": 0.49, "num_input_tokens_seen": 47463488, "step": 15080 }, { "epoch": 0.9656872159272774, "grad_norm": 51.073936462402344, "learning_rate": 7.1920050011252675e-09, "loss": 0.3886, "num_input_tokens_seen": 47479104, "step": 15085 }, { "epoch": 0.9660072978682542, "grad_norm": 37.59046173095703, "learning_rate": 7.058856233676525e-09, "loss": 0.391, "num_input_tokens_seen": 47496448, "step": 15090 }, { "epoch": 0.9663273798092311, "grad_norm": 86.30872344970703, "learning_rate": 6.926947164904162e-09, "loss": 0.3733, "num_input_tokens_seen": 47511936, "step": 15095 }, { "epoch": 0.9666474617502081, "grad_norm": 26.688161849975586, "learning_rate": 6.796277959498331e-09, "loss": 0.3984, "num_input_tokens_seen": 47528320, "step": 15100 }, { "epoch": 0.966967543691185, "grad_norm": 26.294218063354492, "learning_rate": 6.666848780600864e-09, "loss": 0.2793, "num_input_tokens_seen": 47543296, "step": 15105 }, { "epoch": 0.9672876256321619, "grad_norm": 10.18204116821289, "learning_rate": 6.538659789805834e-09, "loss": 0.2751, "num_input_tokens_seen": 47558656, "step": 15110 }, { "epoch": 0.9676077075731387, "grad_norm": 34.290340423583984, "learning_rate": 6.411711147158438e-09, "loss": 0.3498, "num_input_tokens_seen": 47574720, "step": 15115 }, { "epoch": 0.9679277895141156, "grad_norm": 52.94532012939453, "learning_rate": 6.286003011155783e-09, "loss": 0.3107, "num_input_tokens_seen": 47590272, "step": 15120 }, { "epoch": 0.9682478714550925, "grad_norm": 32.81538772583008, "learning_rate": 6.161535538745877e-09, "loss": 0.4098, "num_input_tokens_seen": 47605696, "step": 15125 }, { "epoch": 0.9685679533960694, "grad_norm": 32.042781829833984, "learning_rate": 6.0383088853277475e-09, "loss": 0.3975, "num_input_tokens_seen": 47621760, "step": 15130 }, { "epoch": 0.9688880353370463, "grad_norm": 24.502296447753906, "learning_rate": 5.916323204751439e-09, "loss": 0.3081, "num_input_tokens_seen": 47639296, "step": 15135 }, { "epoch": 0.9692081172780231, "grad_norm": 27.488826751708984, "learning_rate": 5.795578649317345e-09, "loss": 0.2648, "num_input_tokens_seen": 47654656, "step": 15140 }, { "epoch": 0.969528199219, "grad_norm": 44.00014877319336, "learning_rate": 5.676075369776656e-09, "loss": 0.3157, "num_input_tokens_seen": 47671168, "step": 15145 }, { "epoch": 0.9698482811599769, "grad_norm": 23.902742385864258, "learning_rate": 5.557813515330468e-09, "loss": 0.3348, "num_input_tokens_seen": 47686400, "step": 15150 }, { "epoch": 0.9701683631009539, "grad_norm": 28.53948211669922, "learning_rate": 5.440793233630115e-09, "loss": 0.3439, "num_input_tokens_seen": 47701760, "step": 15155 }, { "epoch": 0.9704884450419308, "grad_norm": 40.30237579345703, "learning_rate": 5.325014670776951e-09, "loss": 0.3063, "num_input_tokens_seen": 47717248, "step": 15160 }, { "epoch": 0.9708085269829076, "grad_norm": 60.948604583740234, "learning_rate": 5.21047797132157e-09, "loss": 0.3599, "num_input_tokens_seen": 47734336, "step": 15165 }, { "epoch": 0.9711286089238845, "grad_norm": 25.381938934326172, "learning_rate": 5.097183278264694e-09, "loss": 0.3417, "num_input_tokens_seen": 47750464, "step": 15170 }, { "epoch": 0.9714486908648614, "grad_norm": 25.686281204223633, "learning_rate": 4.985130733055954e-09, "loss": 0.4364, "num_input_tokens_seen": 47765824, "step": 15175 }, { "epoch": 0.9717687728058383, "grad_norm": 27.45149803161621, "learning_rate": 4.874320475594107e-09, "loss": 0.3893, "num_input_tokens_seen": 47781760, "step": 15180 }, { "epoch": 0.9720888547468152, "grad_norm": 17.62384605407715, "learning_rate": 4.764752644227377e-09, "loss": 0.2832, "num_input_tokens_seen": 47797312, "step": 15185 }, { "epoch": 0.972408936687792, "grad_norm": 29.088834762573242, "learning_rate": 4.656427375752336e-09, "loss": 0.3392, "num_input_tokens_seen": 47813440, "step": 15190 }, { "epoch": 0.9727290186287689, "grad_norm": 33.35861587524414, "learning_rate": 4.549344805414246e-09, "loss": 0.34, "num_input_tokens_seen": 47829440, "step": 15195 }, { "epoch": 0.9730491005697458, "grad_norm": 32.597530364990234, "learning_rate": 4.443505066907049e-09, "loss": 0.4139, "num_input_tokens_seen": 47844608, "step": 15200 }, { "epoch": 0.9733691825107228, "grad_norm": 28.545236587524414, "learning_rate": 4.338908292372934e-09, "loss": 0.2823, "num_input_tokens_seen": 47860160, "step": 15205 }, { "epoch": 0.9736892644516997, "grad_norm": 42.930023193359375, "learning_rate": 4.235554612402214e-09, "loss": 0.3864, "num_input_tokens_seen": 47875648, "step": 15210 }, { "epoch": 0.9740093463926766, "grad_norm": 48.120704650878906, "learning_rate": 4.133444156033006e-09, "loss": 0.381, "num_input_tokens_seen": 47892736, "step": 15215 }, { "epoch": 0.9743294283336534, "grad_norm": 37.2425422668457, "learning_rate": 4.032577050751551e-09, "loss": 0.3145, "num_input_tokens_seen": 47908992, "step": 15220 }, { "epoch": 0.9746495102746303, "grad_norm": 23.053668975830078, "learning_rate": 3.932953422491669e-09, "loss": 0.3428, "num_input_tokens_seen": 47924736, "step": 15225 }, { "epoch": 0.9749695922156072, "grad_norm": 52.20282745361328, "learning_rate": 3.8345733956345326e-09, "loss": 0.284, "num_input_tokens_seen": 47941056, "step": 15230 }, { "epoch": 0.9752896741565841, "grad_norm": 29.915189743041992, "learning_rate": 3.737437093008777e-09, "loss": 0.3619, "num_input_tokens_seen": 47957824, "step": 15235 }, { "epoch": 0.975609756097561, "grad_norm": 42.0181770324707, "learning_rate": 3.641544635890281e-09, "loss": 0.4107, "num_input_tokens_seen": 47973056, "step": 15240 }, { "epoch": 0.9759298380385378, "grad_norm": 18.199411392211914, "learning_rate": 3.546896144001832e-09, "loss": 0.3896, "num_input_tokens_seen": 47988928, "step": 15245 }, { "epoch": 0.9762499199795147, "grad_norm": 47.75886917114258, "learning_rate": 3.4534917355132364e-09, "loss": 0.3926, "num_input_tokens_seen": 48004032, "step": 15250 }, { "epoch": 0.9765700019204916, "grad_norm": 35.261905670166016, "learning_rate": 3.361331527040878e-09, "loss": 0.4376, "num_input_tokens_seen": 48020800, "step": 15255 }, { "epoch": 0.9768900838614686, "grad_norm": 31.275798797607422, "learning_rate": 3.270415633647938e-09, "loss": 0.3935, "num_input_tokens_seen": 48036800, "step": 15260 }, { "epoch": 0.9772101658024455, "grad_norm": 22.784738540649414, "learning_rate": 3.180744168843952e-09, "loss": 0.2847, "num_input_tokens_seen": 48051264, "step": 15265 }, { "epoch": 0.9775302477434223, "grad_norm": 27.314804077148438, "learning_rate": 3.0923172445849187e-09, "loss": 0.2318, "num_input_tokens_seen": 48066176, "step": 15270 }, { "epoch": 0.9778503296843992, "grad_norm": 34.85258865356445, "learning_rate": 3.0051349712727493e-09, "loss": 0.3178, "num_input_tokens_seen": 48081984, "step": 15275 }, { "epoch": 0.9781704116253761, "grad_norm": 27.141429901123047, "learning_rate": 2.9191974577555954e-09, "loss": 0.4072, "num_input_tokens_seen": 48096896, "step": 15280 }, { "epoch": 0.978490493566353, "grad_norm": 18.883970260620117, "learning_rate": 2.8345048113274096e-09, "loss": 0.2334, "num_input_tokens_seen": 48112128, "step": 15285 }, { "epoch": 0.9788105755073299, "grad_norm": 32.112449645996094, "learning_rate": 2.751057137727941e-09, "loss": 0.3388, "num_input_tokens_seen": 48127616, "step": 15290 }, { "epoch": 0.9791306574483067, "grad_norm": 59.22599411010742, "learning_rate": 2.66885454114274e-09, "loss": 0.384, "num_input_tokens_seen": 48142144, "step": 15295 }, { "epoch": 0.9794507393892836, "grad_norm": 60.90025329589844, "learning_rate": 2.5878971242025983e-09, "loss": 0.3776, "num_input_tokens_seen": 48158272, "step": 15300 }, { "epoch": 0.9797708213302605, "grad_norm": 23.69969940185547, "learning_rate": 2.5081849879837746e-09, "loss": 0.3239, "num_input_tokens_seen": 48173120, "step": 15305 }, { "epoch": 0.9800909032712375, "grad_norm": 19.513404846191406, "learning_rate": 2.429718232007771e-09, "loss": 0.3428, "num_input_tokens_seen": 48188672, "step": 15310 }, { "epoch": 0.9804109852122144, "grad_norm": 25.234663009643555, "learning_rate": 2.3524969542414453e-09, "loss": 0.2688, "num_input_tokens_seen": 48204480, "step": 15315 }, { "epoch": 0.9807310671531912, "grad_norm": 14.73193359375, "learning_rate": 2.2765212510963418e-09, "loss": 0.3525, "num_input_tokens_seen": 48219584, "step": 15320 }, { "epoch": 0.9810511490941681, "grad_norm": 33.33141326904297, "learning_rate": 2.2017912174289164e-09, "loss": 0.2847, "num_input_tokens_seen": 48235904, "step": 15325 }, { "epoch": 0.981371231035145, "grad_norm": 34.248878479003906, "learning_rate": 2.128306946540648e-09, "loss": 0.4052, "num_input_tokens_seen": 48252992, "step": 15330 }, { "epoch": 0.9816913129761219, "grad_norm": 28.99315071105957, "learning_rate": 2.0560685301774792e-09, "loss": 0.3316, "num_input_tokens_seen": 48267840, "step": 15335 }, { "epoch": 0.9820113949170988, "grad_norm": 21.494754791259766, "learning_rate": 1.985076058529933e-09, "loss": 0.3781, "num_input_tokens_seen": 48282688, "step": 15340 }, { "epoch": 0.9823314768580756, "grad_norm": 38.192710876464844, "learning_rate": 1.9153296202328863e-09, "loss": 0.4768, "num_input_tokens_seen": 48300096, "step": 15345 }, { "epoch": 0.9826515587990525, "grad_norm": 32.44169998168945, "learning_rate": 1.8468293023656823e-09, "loss": 0.3929, "num_input_tokens_seen": 48315136, "step": 15350 }, { "epoch": 0.9829716407400294, "grad_norm": 17.585954666137695, "learning_rate": 1.7795751904515766e-09, "loss": 0.4052, "num_input_tokens_seen": 48330240, "step": 15355 }, { "epoch": 0.9832917226810063, "grad_norm": 56.64820098876953, "learning_rate": 1.7135673684584019e-09, "loss": 0.318, "num_input_tokens_seen": 48345280, "step": 15360 }, { "epoch": 0.9836118046219833, "grad_norm": 30.882753372192383, "learning_rate": 1.6488059187974579e-09, "loss": 0.3972, "num_input_tokens_seen": 48361792, "step": 15365 }, { "epoch": 0.9839318865629602, "grad_norm": 32.313411712646484, "learning_rate": 1.5852909223242894e-09, "loss": 0.4099, "num_input_tokens_seen": 48377408, "step": 15370 }, { "epoch": 0.984251968503937, "grad_norm": 16.098203659057617, "learning_rate": 1.5230224583380192e-09, "loss": 0.3759, "num_input_tokens_seen": 48392896, "step": 15375 }, { "epoch": 0.9845720504449139, "grad_norm": 39.47123336791992, "learning_rate": 1.4620006045816813e-09, "loss": 0.4663, "num_input_tokens_seen": 48407552, "step": 15380 }, { "epoch": 0.9848921323858908, "grad_norm": 15.717222213745117, "learning_rate": 1.4022254372417774e-09, "loss": 0.2785, "num_input_tokens_seen": 48424320, "step": 15385 }, { "epoch": 0.9852122143268677, "grad_norm": 35.01372146606445, "learning_rate": 1.3436970309481655e-09, "loss": 0.5093, "num_input_tokens_seen": 48441984, "step": 15390 }, { "epoch": 0.9855322962678446, "grad_norm": 15.031546592712402, "learning_rate": 1.2864154587742815e-09, "loss": 0.3442, "num_input_tokens_seen": 48456832, "step": 15395 }, { "epoch": 0.9858523782088214, "grad_norm": 32.367923736572266, "learning_rate": 1.2303807922370292e-09, "loss": 0.3608, "num_input_tokens_seen": 48472512, "step": 15400 }, { "epoch": 0.9861724601497983, "grad_norm": 53.186859130859375, "learning_rate": 1.1755931012961128e-09, "loss": 0.3122, "num_input_tokens_seen": 48488832, "step": 15405 }, { "epoch": 0.9864925420907752, "grad_norm": 17.48390007019043, "learning_rate": 1.122052454354705e-09, "loss": 0.3491, "num_input_tokens_seen": 48503936, "step": 15410 }, { "epoch": 0.9868126240317522, "grad_norm": 20.294185638427734, "learning_rate": 1.0697589182590005e-09, "loss": 0.4398, "num_input_tokens_seen": 48519040, "step": 15415 }, { "epoch": 0.9871327059727291, "grad_norm": 28.50274085998535, "learning_rate": 1.018712558297996e-09, "loss": 0.5967, "num_input_tokens_seen": 48535040, "step": 15420 }, { "epoch": 0.9874527879137059, "grad_norm": 36.501163482666016, "learning_rate": 9.689134382037113e-10, "loss": 0.4383, "num_input_tokens_seen": 48551808, "step": 15425 }, { "epoch": 0.9877728698546828, "grad_norm": 35.623992919921875, "learning_rate": 9.203616201508557e-10, "loss": 0.3967, "num_input_tokens_seen": 48566592, "step": 15430 }, { "epoch": 0.9880929517956597, "grad_norm": 46.61222457885742, "learning_rate": 8.730571647570517e-10, "loss": 0.3159, "num_input_tokens_seen": 48582720, "step": 15435 }, { "epoch": 0.9884130337366366, "grad_norm": 46.78093338012695, "learning_rate": 8.270001310825003e-10, "loss": 0.4878, "num_input_tokens_seen": 48599104, "step": 15440 }, { "epoch": 0.9887331156776135, "grad_norm": 12.824591636657715, "learning_rate": 7.821905766297599e-10, "loss": 0.3118, "num_input_tokens_seen": 48615040, "step": 15445 }, { "epoch": 0.9890531976185903, "grad_norm": 28.26544952392578, "learning_rate": 7.386285573441897e-10, "loss": 0.3926, "num_input_tokens_seen": 48630976, "step": 15450 }, { "epoch": 0.9893732795595672, "grad_norm": 25.03919792175293, "learning_rate": 6.963141276136175e-10, "loss": 0.2862, "num_input_tokens_seen": 48646080, "step": 15455 }, { "epoch": 0.9896933615005441, "grad_norm": 26.057968139648438, "learning_rate": 6.552473402678949e-10, "loss": 0.2525, "num_input_tokens_seen": 48662528, "step": 15460 }, { "epoch": 0.990013443441521, "grad_norm": 49.04160690307617, "learning_rate": 6.154282465794524e-10, "loss": 0.3301, "num_input_tokens_seen": 48680000, "step": 15465 }, { "epoch": 0.990333525382498, "grad_norm": 30.749189376831055, "learning_rate": 5.768568962629672e-10, "loss": 0.424, "num_input_tokens_seen": 48696256, "step": 15470 }, { "epoch": 0.9906536073234748, "grad_norm": 41.51435470581055, "learning_rate": 5.395333374751398e-10, "loss": 0.3065, "num_input_tokens_seen": 48711168, "step": 15475 }, { "epoch": 0.9909736892644517, "grad_norm": 45.217079162597656, "learning_rate": 5.034576168149174e-10, "loss": 0.5309, "num_input_tokens_seen": 48726848, "step": 15480 }, { "epoch": 0.9912937712054286, "grad_norm": 48.17198181152344, "learning_rate": 4.686297793231597e-10, "loss": 0.4868, "num_input_tokens_seen": 48743232, "step": 15485 }, { "epoch": 0.9916138531464055, "grad_norm": 24.643993377685547, "learning_rate": 4.350498684829729e-10, "loss": 0.456, "num_input_tokens_seen": 48758080, "step": 15490 }, { "epoch": 0.9919339350873824, "grad_norm": 38.15465545654297, "learning_rate": 4.0271792621926483e-10, "loss": 0.3105, "num_input_tokens_seen": 48773120, "step": 15495 }, { "epoch": 0.9922540170283592, "grad_norm": 14.166491508483887, "learning_rate": 3.716339928987455e-10, "loss": 0.3815, "num_input_tokens_seen": 48789056, "step": 15500 }, { "epoch": 0.9925740989693361, "grad_norm": 64.28377532958984, "learning_rate": 3.41798107330149e-10, "loss": 0.4142, "num_input_tokens_seen": 48804288, "step": 15505 }, { "epoch": 0.992894180910313, "grad_norm": 34.623619079589844, "learning_rate": 3.1321030676390027e-10, "loss": 0.3715, "num_input_tokens_seen": 48818816, "step": 15510 }, { "epoch": 0.9932142628512899, "grad_norm": 22.467647552490234, "learning_rate": 2.8587062689222617e-10, "loss": 0.2872, "num_input_tokens_seen": 48835520, "step": 15515 }, { "epoch": 0.9935343447922668, "grad_norm": 30.136613845825195, "learning_rate": 2.5977910184904473e-10, "loss": 0.3221, "num_input_tokens_seen": 48851328, "step": 15520 }, { "epoch": 0.9938544267332438, "grad_norm": 32.950374603271484, "learning_rate": 2.3493576420985373e-10, "loss": 0.3354, "num_input_tokens_seen": 48866304, "step": 15525 }, { "epoch": 0.9941745086742206, "grad_norm": 15.965251922607422, "learning_rate": 2.11340644991842e-10, "loss": 0.3174, "num_input_tokens_seen": 48882752, "step": 15530 }, { "epoch": 0.9944945906151975, "grad_norm": 37.14493942260742, "learning_rate": 1.8899377365388936e-10, "loss": 0.3041, "num_input_tokens_seen": 48898304, "step": 15535 }, { "epoch": 0.9948146725561744, "grad_norm": 16.286380767822266, "learning_rate": 1.6789517809634447e-10, "loss": 0.4202, "num_input_tokens_seen": 48914048, "step": 15540 }, { "epoch": 0.9951347544971513, "grad_norm": 61.637794494628906, "learning_rate": 1.480448846609139e-10, "loss": 0.3127, "num_input_tokens_seen": 48930176, "step": 15545 }, { "epoch": 0.9954548364381282, "grad_norm": 24.89733123779297, "learning_rate": 1.294429181311063e-10, "loss": 0.3505, "num_input_tokens_seen": 48945920, "step": 15550 }, { "epoch": 0.995774918379105, "grad_norm": 23.30603790283203, "learning_rate": 1.1208930173145503e-10, "loss": 0.4079, "num_input_tokens_seen": 48960832, "step": 15555 }, { "epoch": 0.9960950003200819, "grad_norm": 21.470914840698242, "learning_rate": 9.598405712840651e-11, "loss": 0.3213, "num_input_tokens_seen": 48977280, "step": 15560 }, { "epoch": 0.9964150822610588, "grad_norm": 19.718584060668945, "learning_rate": 8.1127204429432e-11, "loss": 0.347, "num_input_tokens_seen": 48992512, "step": 15565 }, { "epoch": 0.9967351642020357, "grad_norm": 25.985633850097656, "learning_rate": 6.751876218336061e-11, "loss": 0.3524, "num_input_tokens_seen": 49008128, "step": 15570 }, { "epoch": 0.9970552461430127, "grad_norm": 22.135334014892578, "learning_rate": 5.515874738071247e-11, "loss": 0.3376, "num_input_tokens_seen": 49024512, "step": 15575 }, { "epoch": 0.9973753280839895, "grad_norm": 44.398292541503906, "learning_rate": 4.404717545303249e-11, "loss": 0.308, "num_input_tokens_seen": 49040128, "step": 15580 }, { "epoch": 0.9976954100249664, "grad_norm": 14.405759811401367, "learning_rate": 3.418406027322352e-11, "loss": 0.3099, "num_input_tokens_seen": 49055360, "step": 15585 }, { "epoch": 0.9980154919659433, "grad_norm": 33.78312683105469, "learning_rate": 2.5569414155546254e-11, "loss": 0.3518, "num_input_tokens_seen": 49071360, "step": 15590 }, { "epoch": 0.9983355739069202, "grad_norm": 50.76702117919922, "learning_rate": 1.8203247855397287e-11, "loss": 0.2734, "num_input_tokens_seen": 49086144, "step": 15595 }, { "epoch": 0.9986556558478971, "grad_norm": 33.41775131225586, "learning_rate": 1.2085570569642101e-11, "loss": 0.395, "num_input_tokens_seen": 49101312, "step": 15600 }, { "epoch": 0.9989757377888739, "grad_norm": 56.984737396240234, "learning_rate": 7.216389936171019e-12, "loss": 0.3097, "num_input_tokens_seen": 49116672, "step": 15605 }, { "epoch": 0.9992958197298508, "grad_norm": 16.939533233642578, "learning_rate": 3.5957120342322567e-12, "loss": 0.1772, "num_input_tokens_seen": 49132288, "step": 15610 }, { "epoch": 0.9996159016708277, "grad_norm": 15.791190147399902, "learning_rate": 1.2235413842098807e-12, "loss": 0.3934, "num_input_tokens_seen": 49148096, "step": 15615 }, { "epoch": 0.9999359836118046, "grad_norm": 20.582731246948242, "learning_rate": 9.98809480678986e-14, "loss": 0.2515, "num_input_tokens_seen": 49163840, "step": 15620 }, { "epoch": 1.0, "num_input_tokens_seen": 49166912, "step": 15621, "total_flos": 2.8707953551107686e+17, "train_loss": 0.44386771268258823, "train_runtime": 3548.0201, "train_samples_per_second": 35.222, "train_steps_per_second": 4.403 } ], "logging_steps": 5, "max_steps": 15621, "num_input_tokens_seen": 49166912, "num_train_epochs": 1, "save_steps": 782, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.8707953551107686e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }