{ "best_global_step": 5306, "best_metric": 0.09084735810756683, "best_model_checkpoint": "saves_bts_preliminary/freeze/llama-3.2-1b-instruct/train_sst2_42_1779354537/checkpoint-5306", "epoch": 1.0, "eval_steps": 379, "global_step": 7577, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006598917777484492, "grad_norm": 442.1559753417969, "learning_rate": 1.0554089709762531e-08, "loss": 1.413, "num_input_tokens_seen": 2240, "step": 5 }, { "epoch": 0.0013197835554968984, "grad_norm": 437.0361328125, "learning_rate": 2.3746701846965696e-08, "loss": 1.5134, "num_input_tokens_seen": 4672, "step": 10 }, { "epoch": 0.0019796753332453477, "grad_norm": 441.0426330566406, "learning_rate": 3.6939313984168866e-08, "loss": 1.3995, "num_input_tokens_seen": 7040, "step": 15 }, { "epoch": 0.002639567110993797, "grad_norm": 481.140380859375, "learning_rate": 5.013192612137203e-08, "loss": 1.422, "num_input_tokens_seen": 9600, "step": 20 }, { "epoch": 0.0032994588887422464, "grad_norm": 432.22509765625, "learning_rate": 6.33245382585752e-08, "loss": 1.2976, "num_input_tokens_seen": 12160, "step": 25 }, { "epoch": 0.0039593506664906955, "grad_norm": 370.72210693359375, "learning_rate": 7.651715039577835e-08, "loss": 1.3834, "num_input_tokens_seen": 14528, "step": 30 }, { "epoch": 0.004619242444239145, "grad_norm": 356.2071533203125, "learning_rate": 8.970976253298153e-08, "loss": 1.1343, "num_input_tokens_seen": 16768, "step": 35 }, { "epoch": 0.005279134221987594, "grad_norm": 318.7434997558594, "learning_rate": 1.0290237467018468e-07, "loss": 1.1513, "num_input_tokens_seen": 19264, "step": 40 }, { "epoch": 0.005939025999736044, "grad_norm": 264.9164733886719, "learning_rate": 1.1609498680738786e-07, "loss": 0.8287, "num_input_tokens_seen": 21632, "step": 45 }, { "epoch": 0.006598917777484493, "grad_norm": 186.7604217529297, "learning_rate": 1.29287598944591e-07, "loss": 0.7425, "num_input_tokens_seen": 24000, "step": 50 }, { "epoch": 0.007258809555232942, "grad_norm": 162.783203125, "learning_rate": 1.424802110817942e-07, "loss": 0.7064, "num_input_tokens_seen": 26496, "step": 55 }, { "epoch": 0.007918701332981391, "grad_norm": 45.531829833984375, "learning_rate": 1.5567282321899736e-07, "loss": 0.3853, "num_input_tokens_seen": 29120, "step": 60 }, { "epoch": 0.008578593110729841, "grad_norm": 28.7725887298584, "learning_rate": 1.688654353562005e-07, "loss": 0.3076, "num_input_tokens_seen": 31744, "step": 65 }, { "epoch": 0.00923848488847829, "grad_norm": 51.50053024291992, "learning_rate": 1.820580474934037e-07, "loss": 0.2971, "num_input_tokens_seen": 34176, "step": 70 }, { "epoch": 0.009898376666226739, "grad_norm": 48.889060974121094, "learning_rate": 1.9525065963060686e-07, "loss": 0.3004, "num_input_tokens_seen": 36864, "step": 75 }, { "epoch": 0.010558268443975187, "grad_norm": 26.879812240600586, "learning_rate": 2.0844327176781002e-07, "loss": 0.2532, "num_input_tokens_seen": 39424, "step": 80 }, { "epoch": 0.011218160221723637, "grad_norm": 41.192012786865234, "learning_rate": 2.2163588390501316e-07, "loss": 0.2616, "num_input_tokens_seen": 42112, "step": 85 }, { "epoch": 0.011878051999472087, "grad_norm": 29.90664291381836, "learning_rate": 2.3482849604221635e-07, "loss": 0.2528, "num_input_tokens_seen": 44544, "step": 90 }, { "epoch": 0.012537943777220536, "grad_norm": 42.19837951660156, "learning_rate": 2.480211081794195e-07, "loss": 0.1904, "num_input_tokens_seen": 47104, "step": 95 }, { "epoch": 0.013197835554968985, "grad_norm": 23.84099769592285, "learning_rate": 2.612137203166227e-07, "loss": 0.1653, "num_input_tokens_seen": 49664, "step": 100 }, { "epoch": 0.013857727332717434, "grad_norm": 22.509811401367188, "learning_rate": 2.744063324538258e-07, "loss": 0.137, "num_input_tokens_seen": 52352, "step": 105 }, { "epoch": 0.014517619110465884, "grad_norm": 86.81343078613281, "learning_rate": 2.8759894459102903e-07, "loss": 0.102, "num_input_tokens_seen": 54720, "step": 110 }, { "epoch": 0.015177510888214334, "grad_norm": 43.25410461425781, "learning_rate": 3.007915567282322e-07, "loss": 0.1477, "num_input_tokens_seen": 57152, "step": 115 }, { "epoch": 0.015837402665962782, "grad_norm": 98.41868591308594, "learning_rate": 3.139841688654353e-07, "loss": 0.2005, "num_input_tokens_seen": 59776, "step": 120 }, { "epoch": 0.01649729444371123, "grad_norm": 23.211767196655273, "learning_rate": 3.271767810026385e-07, "loss": 0.1417, "num_input_tokens_seen": 62464, "step": 125 }, { "epoch": 0.017157186221459682, "grad_norm": 80.11978912353516, "learning_rate": 3.403693931398417e-07, "loss": 0.1226, "num_input_tokens_seen": 65088, "step": 130 }, { "epoch": 0.01781707799920813, "grad_norm": 74.83419036865234, "learning_rate": 3.5356200527704485e-07, "loss": 0.2123, "num_input_tokens_seen": 67776, "step": 135 }, { "epoch": 0.01847696977695658, "grad_norm": 67.47618865966797, "learning_rate": 3.66754617414248e-07, "loss": 0.2606, "num_input_tokens_seen": 70400, "step": 140 }, { "epoch": 0.01913686155470503, "grad_norm": 51.34063720703125, "learning_rate": 3.7994722955145113e-07, "loss": 0.1463, "num_input_tokens_seen": 72704, "step": 145 }, { "epoch": 0.019796753332453478, "grad_norm": 63.031131744384766, "learning_rate": 3.9313984168865435e-07, "loss": 0.346, "num_input_tokens_seen": 75136, "step": 150 }, { "epoch": 0.020456645110201926, "grad_norm": 25.44994354248047, "learning_rate": 4.063324538258575e-07, "loss": 0.0609, "num_input_tokens_seen": 77632, "step": 155 }, { "epoch": 0.021116536887950375, "grad_norm": 72.61922454833984, "learning_rate": 4.195250659630606e-07, "loss": 0.2492, "num_input_tokens_seen": 80320, "step": 160 }, { "epoch": 0.021776428665698826, "grad_norm": 107.4610824584961, "learning_rate": 4.3271767810026384e-07, "loss": 0.1542, "num_input_tokens_seen": 82752, "step": 165 }, { "epoch": 0.022436320443447275, "grad_norm": 148.21913146972656, "learning_rate": 4.45910290237467e-07, "loss": 0.3095, "num_input_tokens_seen": 85248, "step": 170 }, { "epoch": 0.023096212221195723, "grad_norm": 166.9718475341797, "learning_rate": 4.5910290237467017e-07, "loss": 0.2917, "num_input_tokens_seen": 87872, "step": 175 }, { "epoch": 0.023756103998944175, "grad_norm": 49.3593864440918, "learning_rate": 4.7229551451187333e-07, "loss": 0.1369, "num_input_tokens_seen": 90368, "step": 180 }, { "epoch": 0.024415995776692623, "grad_norm": 39.84811019897461, "learning_rate": 4.854881266490765e-07, "loss": 0.0624, "num_input_tokens_seen": 92928, "step": 185 }, { "epoch": 0.02507588755444107, "grad_norm": 61.82024383544922, "learning_rate": 4.986807387862796e-07, "loss": 0.1617, "num_input_tokens_seen": 95296, "step": 190 }, { "epoch": 0.02573577933218952, "grad_norm": 56.04179763793945, "learning_rate": 5.118733509234829e-07, "loss": 0.1049, "num_input_tokens_seen": 97984, "step": 195 }, { "epoch": 0.02639567110993797, "grad_norm": 102.43315124511719, "learning_rate": 5.250659630606859e-07, "loss": 0.1992, "num_input_tokens_seen": 100352, "step": 200 }, { "epoch": 0.02705556288768642, "grad_norm": 17.690580368041992, "learning_rate": 5.382585751978892e-07, "loss": 0.1434, "num_input_tokens_seen": 102464, "step": 205 }, { "epoch": 0.027715454665434867, "grad_norm": 14.600909233093262, "learning_rate": 5.514511873350924e-07, "loss": 0.1237, "num_input_tokens_seen": 105088, "step": 210 }, { "epoch": 0.02837534644318332, "grad_norm": 16.153587341308594, "learning_rate": 5.646437994722954e-07, "loss": 0.3024, "num_input_tokens_seen": 107648, "step": 215 }, { "epoch": 0.029035238220931767, "grad_norm": 108.86598205566406, "learning_rate": 5.778364116094987e-07, "loss": 0.2383, "num_input_tokens_seen": 110144, "step": 220 }, { "epoch": 0.029695129998680216, "grad_norm": 30.326353073120117, "learning_rate": 5.910290237467019e-07, "loss": 0.1367, "num_input_tokens_seen": 112576, "step": 225 }, { "epoch": 0.030355021776428667, "grad_norm": 56.80265808105469, "learning_rate": 6.042216358839049e-07, "loss": 0.2351, "num_input_tokens_seen": 115264, "step": 230 }, { "epoch": 0.031014913554177116, "grad_norm": 17.4112606048584, "learning_rate": 6.174142480211082e-07, "loss": 0.0839, "num_input_tokens_seen": 117888, "step": 235 }, { "epoch": 0.031674805331925564, "grad_norm": 34.93491744995117, "learning_rate": 6.306068601583114e-07, "loss": 0.0189, "num_input_tokens_seen": 120320, "step": 240 }, { "epoch": 0.032334697109674015, "grad_norm": 89.36637115478516, "learning_rate": 6.437994722955144e-07, "loss": 0.1832, "num_input_tokens_seen": 122688, "step": 245 }, { "epoch": 0.03299458888742246, "grad_norm": 80.80409240722656, "learning_rate": 6.569920844327177e-07, "loss": 0.2718, "num_input_tokens_seen": 125248, "step": 250 }, { "epoch": 0.03365448066517091, "grad_norm": 89.6211166381836, "learning_rate": 6.701846965699208e-07, "loss": 0.2037, "num_input_tokens_seen": 127680, "step": 255 }, { "epoch": 0.034314372442919364, "grad_norm": 17.70661735534668, "learning_rate": 6.833773087071239e-07, "loss": 0.0833, "num_input_tokens_seen": 130496, "step": 260 }, { "epoch": 0.03497426422066781, "grad_norm": 31.946949005126953, "learning_rate": 6.965699208443272e-07, "loss": 0.19, "num_input_tokens_seen": 132992, "step": 265 }, { "epoch": 0.03563415599841626, "grad_norm": 58.67094802856445, "learning_rate": 7.097625329815303e-07, "loss": 0.1429, "num_input_tokens_seen": 135040, "step": 270 }, { "epoch": 0.03629404777616471, "grad_norm": 67.81129455566406, "learning_rate": 7.229551451187335e-07, "loss": 0.2488, "num_input_tokens_seen": 137600, "step": 275 }, { "epoch": 0.03695393955391316, "grad_norm": 61.778663635253906, "learning_rate": 7.361477572559367e-07, "loss": 0.1125, "num_input_tokens_seen": 139904, "step": 280 }, { "epoch": 0.03761383133166161, "grad_norm": 15.433332443237305, "learning_rate": 7.493403693931398e-07, "loss": 0.1465, "num_input_tokens_seen": 142016, "step": 285 }, { "epoch": 0.03827372310941006, "grad_norm": 8.442534446716309, "learning_rate": 7.62532981530343e-07, "loss": 0.0092, "num_input_tokens_seen": 144576, "step": 290 }, { "epoch": 0.038933614887158505, "grad_norm": 112.9028091430664, "learning_rate": 7.757255936675461e-07, "loss": 0.0894, "num_input_tokens_seen": 146880, "step": 295 }, { "epoch": 0.039593506664906956, "grad_norm": 95.21463775634766, "learning_rate": 7.889182058047493e-07, "loss": 0.3917, "num_input_tokens_seen": 149184, "step": 300 }, { "epoch": 0.0402533984426554, "grad_norm": 0.18586313724517822, "learning_rate": 8.021108179419525e-07, "loss": 0.2216, "num_input_tokens_seen": 151296, "step": 305 }, { "epoch": 0.04091329022040385, "grad_norm": 79.36148071289062, "learning_rate": 8.153034300791555e-07, "loss": 0.2024, "num_input_tokens_seen": 153664, "step": 310 }, { "epoch": 0.041573181998152305, "grad_norm": 55.61602020263672, "learning_rate": 8.284960422163588e-07, "loss": 0.2624, "num_input_tokens_seen": 156032, "step": 315 }, { "epoch": 0.04223307377590075, "grad_norm": 0.2602592408657074, "learning_rate": 8.41688654353562e-07, "loss": 0.2276, "num_input_tokens_seen": 158528, "step": 320 }, { "epoch": 0.0428929655536492, "grad_norm": 18.09028434753418, "learning_rate": 8.54881266490765e-07, "loss": 0.1227, "num_input_tokens_seen": 160704, "step": 325 }, { "epoch": 0.04355285733139765, "grad_norm": 9.590105056762695, "learning_rate": 8.680738786279683e-07, "loss": 0.1889, "num_input_tokens_seen": 163072, "step": 330 }, { "epoch": 0.0442127491091461, "grad_norm": 17.7542724609375, "learning_rate": 8.812664907651715e-07, "loss": 0.1251, "num_input_tokens_seen": 165568, "step": 335 }, { "epoch": 0.04487264088689455, "grad_norm": 0.6858607530593872, "learning_rate": 8.944591029023745e-07, "loss": 0.2737, "num_input_tokens_seen": 167936, "step": 340 }, { "epoch": 0.045532532664643, "grad_norm": 168.79275512695312, "learning_rate": 9.076517150395778e-07, "loss": 0.0704, "num_input_tokens_seen": 170176, "step": 345 }, { "epoch": 0.046192424442391446, "grad_norm": 137.41461181640625, "learning_rate": 9.20844327176781e-07, "loss": 0.1521, "num_input_tokens_seen": 172352, "step": 350 }, { "epoch": 0.0468523162201399, "grad_norm": 42.20560073852539, "learning_rate": 9.340369393139841e-07, "loss": 0.2593, "num_input_tokens_seen": 175168, "step": 355 }, { "epoch": 0.04751220799788835, "grad_norm": 1.3534170389175415, "learning_rate": 9.472295514511873e-07, "loss": 0.1364, "num_input_tokens_seen": 177856, "step": 360 }, { "epoch": 0.048172099775636794, "grad_norm": 38.15507125854492, "learning_rate": 9.604221635883904e-07, "loss": 0.3046, "num_input_tokens_seen": 180160, "step": 365 }, { "epoch": 0.048831991553385246, "grad_norm": 143.81930541992188, "learning_rate": 9.736147757255936e-07, "loss": 0.1873, "num_input_tokens_seen": 182784, "step": 370 }, { "epoch": 0.0494918833311337, "grad_norm": 0.975854754447937, "learning_rate": 9.86807387862797e-07, "loss": 0.0584, "num_input_tokens_seen": 185024, "step": 375 }, { "epoch": 0.05001979675333245, "eval_loss": 0.17531457543373108, "eval_runtime": 7.4576, "eval_samples_per_second": 903.11, "eval_steps_per_second": 112.905, "num_input_tokens_seen": 187072, "step": 379 }, { "epoch": 0.05015177510888214, "grad_norm": 59.641075134277344, "learning_rate": 1e-06, "loss": 0.1671, "num_input_tokens_seen": 187712, "step": 380 }, { "epoch": 0.050811666886630594, "grad_norm": 49.6882209777832, "learning_rate": 1.0131926121372032e-06, "loss": 0.2583, "num_input_tokens_seen": 190400, "step": 385 }, { "epoch": 0.05147155866437904, "grad_norm": 53.95302963256836, "learning_rate": 1.0263852242744063e-06, "loss": 0.1182, "num_input_tokens_seen": 193280, "step": 390 }, { "epoch": 0.05213145044212749, "grad_norm": 30.752676010131836, "learning_rate": 1.0395778364116096e-06, "loss": 0.1517, "num_input_tokens_seen": 195584, "step": 395 }, { "epoch": 0.05279134221987594, "grad_norm": 14.899801254272461, "learning_rate": 1.0527704485488126e-06, "loss": 0.0928, "num_input_tokens_seen": 198208, "step": 400 }, { "epoch": 0.05345123399762439, "grad_norm": 85.27983093261719, "learning_rate": 1.0659630606860157e-06, "loss": 0.2377, "num_input_tokens_seen": 200704, "step": 405 }, { "epoch": 0.05411112577537284, "grad_norm": 81.29493713378906, "learning_rate": 1.079155672823219e-06, "loss": 0.2115, "num_input_tokens_seen": 203136, "step": 410 }, { "epoch": 0.05477101755312129, "grad_norm": 64.4339370727539, "learning_rate": 1.0923482849604222e-06, "loss": 0.1501, "num_input_tokens_seen": 205504, "step": 415 }, { "epoch": 0.055430909330869735, "grad_norm": 108.44839477539062, "learning_rate": 1.1055408970976253e-06, "loss": 0.2442, "num_input_tokens_seen": 208000, "step": 420 }, { "epoch": 0.056090801108618187, "grad_norm": 229.58607482910156, "learning_rate": 1.1187335092348285e-06, "loss": 0.0951, "num_input_tokens_seen": 210560, "step": 425 }, { "epoch": 0.05675069288636664, "grad_norm": 34.30244827270508, "learning_rate": 1.1319261213720316e-06, "loss": 0.1749, "num_input_tokens_seen": 213056, "step": 430 }, { "epoch": 0.05741058466411508, "grad_norm": 48.8880500793457, "learning_rate": 1.1451187335092347e-06, "loss": 0.1071, "num_input_tokens_seen": 215296, "step": 435 }, { "epoch": 0.058070476441863535, "grad_norm": 0.0803152322769165, "learning_rate": 1.158311345646438e-06, "loss": 0.0089, "num_input_tokens_seen": 217472, "step": 440 }, { "epoch": 0.058730368219611986, "grad_norm": 173.73971557617188, "learning_rate": 1.1715039577836412e-06, "loss": 0.0408, "num_input_tokens_seen": 219968, "step": 445 }, { "epoch": 0.05939025999736043, "grad_norm": 0.06401122361421585, "learning_rate": 1.1846965699208443e-06, "loss": 0.0381, "num_input_tokens_seen": 222592, "step": 450 }, { "epoch": 0.06005015177510888, "grad_norm": 118.57979583740234, "learning_rate": 1.1978891820580475e-06, "loss": 0.5112, "num_input_tokens_seen": 224768, "step": 455 }, { "epoch": 0.060710043552857335, "grad_norm": 3.2848386764526367, "learning_rate": 1.2110817941952508e-06, "loss": 0.2367, "num_input_tokens_seen": 227264, "step": 460 }, { "epoch": 0.06136993533060578, "grad_norm": 39.09904861450195, "learning_rate": 1.2242744063324536e-06, "loss": 0.2476, "num_input_tokens_seen": 229760, "step": 465 }, { "epoch": 0.06202982710835423, "grad_norm": 75.33295440673828, "learning_rate": 1.237467018469657e-06, "loss": 0.1846, "num_input_tokens_seen": 232000, "step": 470 }, { "epoch": 0.06268971888610268, "grad_norm": 1.241437315940857, "learning_rate": 1.2506596306068602e-06, "loss": 0.1636, "num_input_tokens_seen": 234176, "step": 475 }, { "epoch": 0.06334961066385113, "grad_norm": 0.22346267104148865, "learning_rate": 1.2638522427440632e-06, "loss": 0.1056, "num_input_tokens_seen": 236736, "step": 480 }, { "epoch": 0.06400950244159957, "grad_norm": 0.07724567502737045, "learning_rate": 1.2770448548812665e-06, "loss": 0.0032, "num_input_tokens_seen": 239104, "step": 485 }, { "epoch": 0.06466939421934803, "grad_norm": 162.94638061523438, "learning_rate": 1.2902374670184698e-06, "loss": 0.4157, "num_input_tokens_seen": 241600, "step": 490 }, { "epoch": 0.06532928599709648, "grad_norm": 143.5639190673828, "learning_rate": 1.3034300791556726e-06, "loss": 0.3641, "num_input_tokens_seen": 244160, "step": 495 }, { "epoch": 0.06598917777484492, "grad_norm": 0.8768963813781738, "learning_rate": 1.316622691292876e-06, "loss": 0.175, "num_input_tokens_seen": 246464, "step": 500 }, { "epoch": 0.06664906955259338, "grad_norm": 1.546777606010437, "learning_rate": 1.3298153034300792e-06, "loss": 0.0893, "num_input_tokens_seen": 248960, "step": 505 }, { "epoch": 0.06730896133034182, "grad_norm": 82.6065902709961, "learning_rate": 1.3430079155672822e-06, "loss": 0.1824, "num_input_tokens_seen": 251392, "step": 510 }, { "epoch": 0.06796885310809027, "grad_norm": 29.751699447631836, "learning_rate": 1.3562005277044855e-06, "loss": 0.2085, "num_input_tokens_seen": 253888, "step": 515 }, { "epoch": 0.06862874488583873, "grad_norm": 26.04964828491211, "learning_rate": 1.3693931398416888e-06, "loss": 0.1502, "num_input_tokens_seen": 256384, "step": 520 }, { "epoch": 0.06928863666358717, "grad_norm": 60.13207244873047, "learning_rate": 1.3825857519788916e-06, "loss": 0.1721, "num_input_tokens_seen": 258496, "step": 525 }, { "epoch": 0.06994852844133562, "grad_norm": 0.15003204345703125, "learning_rate": 1.3957783641160949e-06, "loss": 0.0959, "num_input_tokens_seen": 260864, "step": 530 }, { "epoch": 0.07060842021908408, "grad_norm": 19.173173904418945, "learning_rate": 1.4089709762532982e-06, "loss": 0.0484, "num_input_tokens_seen": 263360, "step": 535 }, { "epoch": 0.07126831199683252, "grad_norm": 233.016357421875, "learning_rate": 1.4221635883905012e-06, "loss": 0.306, "num_input_tokens_seen": 266112, "step": 540 }, { "epoch": 0.07192820377458096, "grad_norm": 39.65471267700195, "learning_rate": 1.4353562005277045e-06, "loss": 0.2425, "num_input_tokens_seen": 268544, "step": 545 }, { "epoch": 0.07258809555232942, "grad_norm": 0.22139348089694977, "learning_rate": 1.4485488126649078e-06, "loss": 0.4216, "num_input_tokens_seen": 270912, "step": 550 }, { "epoch": 0.07324798733007787, "grad_norm": 0.2836262285709381, "learning_rate": 1.4617414248021108e-06, "loss": 0.0023, "num_input_tokens_seen": 273664, "step": 555 }, { "epoch": 0.07390787910782631, "grad_norm": 91.80794525146484, "learning_rate": 1.4749340369393139e-06, "loss": 0.1737, "num_input_tokens_seen": 276160, "step": 560 }, { "epoch": 0.07456777088557477, "grad_norm": 0.3804304897785187, "learning_rate": 1.4881266490765171e-06, "loss": 0.0057, "num_input_tokens_seen": 278784, "step": 565 }, { "epoch": 0.07522766266332322, "grad_norm": 25.789026260375977, "learning_rate": 1.5013192612137202e-06, "loss": 0.2339, "num_input_tokens_seen": 281152, "step": 570 }, { "epoch": 0.07588755444107166, "grad_norm": 56.50286865234375, "learning_rate": 1.5145118733509235e-06, "loss": 0.2223, "num_input_tokens_seen": 283456, "step": 575 }, { "epoch": 0.07654744621882012, "grad_norm": 3.2999558448791504, "learning_rate": 1.5277044854881265e-06, "loss": 0.0743, "num_input_tokens_seen": 286016, "step": 580 }, { "epoch": 0.07720733799656856, "grad_norm": 0.23049865663051605, "learning_rate": 1.5408970976253298e-06, "loss": 0.1493, "num_input_tokens_seen": 288448, "step": 585 }, { "epoch": 0.07786722977431701, "grad_norm": 0.022031376138329506, "learning_rate": 1.5540897097625329e-06, "loss": 0.1879, "num_input_tokens_seen": 290816, "step": 590 }, { "epoch": 0.07852712155206547, "grad_norm": 71.99144744873047, "learning_rate": 1.567282321899736e-06, "loss": 0.2187, "num_input_tokens_seen": 293504, "step": 595 }, { "epoch": 0.07918701332981391, "grad_norm": 0.02517612837255001, "learning_rate": 1.5804749340369392e-06, "loss": 0.1335, "num_input_tokens_seen": 295744, "step": 600 }, { "epoch": 0.07984690510756236, "grad_norm": 0.04881107434630394, "learning_rate": 1.5936675461741425e-06, "loss": 0.0816, "num_input_tokens_seen": 298112, "step": 605 }, { "epoch": 0.0805067968853108, "grad_norm": 197.50973510742188, "learning_rate": 1.6068601583113455e-06, "loss": 0.1134, "num_input_tokens_seen": 300608, "step": 610 }, { "epoch": 0.08116668866305926, "grad_norm": 0.06382615864276886, "learning_rate": 1.6200527704485488e-06, "loss": 0.0047, "num_input_tokens_seen": 303360, "step": 615 }, { "epoch": 0.0818265804408077, "grad_norm": 9.255777359008789, "learning_rate": 1.633245382585752e-06, "loss": 0.0712, "num_input_tokens_seen": 305920, "step": 620 }, { "epoch": 0.08248647221855615, "grad_norm": 11.119955062866211, "learning_rate": 1.646437994722955e-06, "loss": 0.0795, "num_input_tokens_seen": 308416, "step": 625 }, { "epoch": 0.08314636399630461, "grad_norm": 0.05398223549127579, "learning_rate": 1.6596306068601582e-06, "loss": 0.1324, "num_input_tokens_seen": 310848, "step": 630 }, { "epoch": 0.08380625577405305, "grad_norm": 55.00618362426758, "learning_rate": 1.6728232189973614e-06, "loss": 0.2123, "num_input_tokens_seen": 313408, "step": 635 }, { "epoch": 0.0844661475518015, "grad_norm": 111.69770050048828, "learning_rate": 1.6860158311345645e-06, "loss": 0.1099, "num_input_tokens_seen": 315904, "step": 640 }, { "epoch": 0.08512603932954996, "grad_norm": 48.804962158203125, "learning_rate": 1.6992084432717678e-06, "loss": 0.2301, "num_input_tokens_seen": 318080, "step": 645 }, { "epoch": 0.0857859311072984, "grad_norm": 6.783302307128906, "learning_rate": 1.712401055408971e-06, "loss": 0.0621, "num_input_tokens_seen": 320256, "step": 650 }, { "epoch": 0.08644582288504685, "grad_norm": 0.6253184676170349, "learning_rate": 1.7255936675461739e-06, "loss": 0.0199, "num_input_tokens_seen": 322496, "step": 655 }, { "epoch": 0.0871057146627953, "grad_norm": 257.97125244140625, "learning_rate": 1.7387862796833772e-06, "loss": 0.1723, "num_input_tokens_seen": 325120, "step": 660 }, { "epoch": 0.08776560644054375, "grad_norm": 29.855276107788086, "learning_rate": 1.7519788918205804e-06, "loss": 0.0485, "num_input_tokens_seen": 327296, "step": 665 }, { "epoch": 0.0884254982182922, "grad_norm": 42.55568313598633, "learning_rate": 1.7651715039577835e-06, "loss": 0.4327, "num_input_tokens_seen": 329664, "step": 670 }, { "epoch": 0.08908538999604065, "grad_norm": 23.53718376159668, "learning_rate": 1.7783641160949868e-06, "loss": 0.0918, "num_input_tokens_seen": 332416, "step": 675 }, { "epoch": 0.0897452817737891, "grad_norm": 0.3492559790611267, "learning_rate": 1.79155672823219e-06, "loss": 0.0255, "num_input_tokens_seen": 334976, "step": 680 }, { "epoch": 0.09040517355153754, "grad_norm": 0.0223238542675972, "learning_rate": 1.8047493403693929e-06, "loss": 0.0856, "num_input_tokens_seen": 337472, "step": 685 }, { "epoch": 0.091065065329286, "grad_norm": 8.461647033691406, "learning_rate": 1.8179419525065961e-06, "loss": 0.1861, "num_input_tokens_seen": 339904, "step": 690 }, { "epoch": 0.09172495710703445, "grad_norm": 131.66806030273438, "learning_rate": 1.8311345646437994e-06, "loss": 0.1639, "num_input_tokens_seen": 342272, "step": 695 }, { "epoch": 0.09238484888478289, "grad_norm": 0.0658893883228302, "learning_rate": 1.8443271767810025e-06, "loss": 0.1908, "num_input_tokens_seen": 344640, "step": 700 }, { "epoch": 0.09304474066253135, "grad_norm": 0.02002323418855667, "learning_rate": 1.8575197889182057e-06, "loss": 0.0427, "num_input_tokens_seen": 347072, "step": 705 }, { "epoch": 0.0937046324402798, "grad_norm": 0.023283669725060463, "learning_rate": 1.870712401055409e-06, "loss": 0.194, "num_input_tokens_seen": 349696, "step": 710 }, { "epoch": 0.09436452421802824, "grad_norm": 12.705941200256348, "learning_rate": 1.883905013192612e-06, "loss": 0.0821, "num_input_tokens_seen": 352256, "step": 715 }, { "epoch": 0.0950244159957767, "grad_norm": 0.16988928616046906, "learning_rate": 1.8970976253298151e-06, "loss": 0.1312, "num_input_tokens_seen": 355008, "step": 720 }, { "epoch": 0.09568430777352514, "grad_norm": 82.4719009399414, "learning_rate": 1.9102902374670186e-06, "loss": 0.2885, "num_input_tokens_seen": 357376, "step": 725 }, { "epoch": 0.09634419955127359, "grad_norm": 6.462850093841553, "learning_rate": 1.9234828496042215e-06, "loss": 0.1712, "num_input_tokens_seen": 359680, "step": 730 }, { "epoch": 0.09700409132902205, "grad_norm": 2.923388957977295, "learning_rate": 1.9366754617414247e-06, "loss": 0.1537, "num_input_tokens_seen": 362176, "step": 735 }, { "epoch": 0.09766398310677049, "grad_norm": 104.2777328491211, "learning_rate": 1.949868073878628e-06, "loss": 0.2028, "num_input_tokens_seen": 365056, "step": 740 }, { "epoch": 0.09832387488451894, "grad_norm": 2.3851282596588135, "learning_rate": 1.963060686015831e-06, "loss": 0.2106, "num_input_tokens_seen": 367488, "step": 745 }, { "epoch": 0.0989837666622674, "grad_norm": 1.8862087726593018, "learning_rate": 1.976253298153034e-06, "loss": 0.2852, "num_input_tokens_seen": 369792, "step": 750 }, { "epoch": 0.09964365844001584, "grad_norm": 102.61363220214844, "learning_rate": 1.9894459102902374e-06, "loss": 0.1154, "num_input_tokens_seen": 372160, "step": 755 }, { "epoch": 0.1000395935066649, "eval_loss": 0.129482701420784, "eval_runtime": 7.7189, "eval_samples_per_second": 872.532, "eval_steps_per_second": 109.083, "num_input_tokens_seen": 373504, "step": 758 }, { "epoch": 0.10030355021776428, "grad_norm": 4.566295146942139, "learning_rate": 1.9999998938723955e-06, "loss": 0.0874, "num_input_tokens_seen": 374272, "step": 760 }, { "epoch": 0.10096344199551274, "grad_norm": 25.750286102294922, "learning_rate": 1.9999961794086063e-06, "loss": 0.0774, "num_input_tokens_seen": 376704, "step": 765 }, { "epoch": 0.10162333377326119, "grad_norm": 149.0970458984375, "learning_rate": 1.999987158587122e-06, "loss": 0.2165, "num_input_tokens_seen": 379136, "step": 770 }, { "epoch": 0.10228322555100963, "grad_norm": 47.778255462646484, "learning_rate": 1.9999728314558114e-06, "loss": 0.1505, "num_input_tokens_seen": 381568, "step": 775 }, { "epoch": 0.10294311732875808, "grad_norm": 0.1281862109899521, "learning_rate": 1.9999531980906988e-06, "loss": 0.2297, "num_input_tokens_seen": 384128, "step": 780 }, { "epoch": 0.10360300910650654, "grad_norm": 105.48400115966797, "learning_rate": 1.999928258595967e-06, "loss": 0.0893, "num_input_tokens_seen": 386304, "step": 785 }, { "epoch": 0.10426290088425498, "grad_norm": 16.267196655273438, "learning_rate": 1.9998980131039534e-06, "loss": 0.2538, "num_input_tokens_seen": 388864, "step": 790 }, { "epoch": 0.10492279266200343, "grad_norm": 34.18339920043945, "learning_rate": 1.999862461775153e-06, "loss": 0.0914, "num_input_tokens_seen": 391104, "step": 795 }, { "epoch": 0.10558268443975188, "grad_norm": 14.670069694519043, "learning_rate": 1.999821604798214e-06, "loss": 0.1431, "num_input_tokens_seen": 393856, "step": 800 }, { "epoch": 0.10624257621750033, "grad_norm": 32.27194595336914, "learning_rate": 1.999775442389939e-06, "loss": 0.3214, "num_input_tokens_seen": 396352, "step": 805 }, { "epoch": 0.10690246799524877, "grad_norm": 1.3998618125915527, "learning_rate": 1.9997239747952843e-06, "loss": 0.1422, "num_input_tokens_seen": 398592, "step": 810 }, { "epoch": 0.10756235977299723, "grad_norm": 177.2610321044922, "learning_rate": 1.9996672022873546e-06, "loss": 0.0609, "num_input_tokens_seen": 401088, "step": 815 }, { "epoch": 0.10822225155074568, "grad_norm": 182.47579956054688, "learning_rate": 1.9996051251674073e-06, "loss": 0.0726, "num_input_tokens_seen": 403456, "step": 820 }, { "epoch": 0.10888214332849412, "grad_norm": 31.635814666748047, "learning_rate": 1.999537743764847e-06, "loss": 0.1602, "num_input_tokens_seen": 405696, "step": 825 }, { "epoch": 0.10954203510624258, "grad_norm": 20.697343826293945, "learning_rate": 1.999465058437225e-06, "loss": 0.4649, "num_input_tokens_seen": 408128, "step": 830 }, { "epoch": 0.11020192688399102, "grad_norm": 0.6629725694656372, "learning_rate": 1.9993870695702364e-06, "loss": 0.0112, "num_input_tokens_seen": 411008, "step": 835 }, { "epoch": 0.11086181866173947, "grad_norm": 247.9231719970703, "learning_rate": 1.9993037775777206e-06, "loss": 0.3035, "num_input_tokens_seen": 413312, "step": 840 }, { "epoch": 0.11152171043948793, "grad_norm": 0.9605908989906311, "learning_rate": 1.999215182901656e-06, "loss": 0.1141, "num_input_tokens_seen": 415616, "step": 845 }, { "epoch": 0.11218160221723637, "grad_norm": 75.42913055419922, "learning_rate": 1.9991212860121587e-06, "loss": 0.1391, "num_input_tokens_seen": 418368, "step": 850 }, { "epoch": 0.11284149399498482, "grad_norm": 0.684021532535553, "learning_rate": 1.999022087407482e-06, "loss": 0.0502, "num_input_tokens_seen": 420864, "step": 855 }, { "epoch": 0.11350138577273328, "grad_norm": 61.68302536010742, "learning_rate": 1.998917587614011e-06, "loss": 0.3102, "num_input_tokens_seen": 423040, "step": 860 }, { "epoch": 0.11416127755048172, "grad_norm": 26.822439193725586, "learning_rate": 1.9988077871862615e-06, "loss": 0.3563, "num_input_tokens_seen": 425344, "step": 865 }, { "epoch": 0.11482116932823017, "grad_norm": 1.1649112701416016, "learning_rate": 1.9986926867068752e-06, "loss": 0.0052, "num_input_tokens_seen": 427968, "step": 870 }, { "epoch": 0.11548106110597862, "grad_norm": 0.3206559121608734, "learning_rate": 1.998572286786619e-06, "loss": 0.2265, "num_input_tokens_seen": 430592, "step": 875 }, { "epoch": 0.11614095288372707, "grad_norm": 51.00387954711914, "learning_rate": 1.9984465880643807e-06, "loss": 0.295, "num_input_tokens_seen": 433152, "step": 880 }, { "epoch": 0.11680084466147551, "grad_norm": 94.77568817138672, "learning_rate": 1.998315591207165e-06, "loss": 0.0961, "num_input_tokens_seen": 435456, "step": 885 }, { "epoch": 0.11746073643922397, "grad_norm": 14.036933898925781, "learning_rate": 1.9981792969100912e-06, "loss": 0.1703, "num_input_tokens_seen": 438080, "step": 890 }, { "epoch": 0.11812062821697242, "grad_norm": 8.309388160705566, "learning_rate": 1.9980377058963875e-06, "loss": 0.2036, "num_input_tokens_seen": 440640, "step": 895 }, { "epoch": 0.11878051999472086, "grad_norm": 2.4462878704071045, "learning_rate": 1.99789081891739e-06, "loss": 0.0225, "num_input_tokens_seen": 443008, "step": 900 }, { "epoch": 0.11944041177246932, "grad_norm": 77.5080337524414, "learning_rate": 1.997738636752536e-06, "loss": 0.2203, "num_input_tokens_seen": 445312, "step": 905 }, { "epoch": 0.12010030355021777, "grad_norm": 0.06471412628889084, "learning_rate": 1.9975811602093624e-06, "loss": 0.0016, "num_input_tokens_seen": 447680, "step": 910 }, { "epoch": 0.12076019532796621, "grad_norm": 123.2421875, "learning_rate": 1.9974183901234984e-06, "loss": 0.3289, "num_input_tokens_seen": 450368, "step": 915 }, { "epoch": 0.12142008710571467, "grad_norm": 5.337376594543457, "learning_rate": 1.997250327358664e-06, "loss": 0.3008, "num_input_tokens_seen": 452800, "step": 920 }, { "epoch": 0.12207997888346311, "grad_norm": 15.385493278503418, "learning_rate": 1.997076972806664e-06, "loss": 0.1352, "num_input_tokens_seen": 455744, "step": 925 }, { "epoch": 0.12273987066121156, "grad_norm": 60.07589340209961, "learning_rate": 1.9968983273873827e-06, "loss": 0.2869, "num_input_tokens_seen": 458176, "step": 930 }, { "epoch": 0.12339976243896002, "grad_norm": 1.007535457611084, "learning_rate": 1.99671439204878e-06, "loss": 0.1694, "num_input_tokens_seen": 460480, "step": 935 }, { "epoch": 0.12405965421670846, "grad_norm": 0.9021179676055908, "learning_rate": 1.9965251677668873e-06, "loss": 0.1448, "num_input_tokens_seen": 462656, "step": 940 }, { "epoch": 0.1247195459944569, "grad_norm": 130.83981323242188, "learning_rate": 1.9963306555458e-06, "loss": 0.2976, "num_input_tokens_seen": 465344, "step": 945 }, { "epoch": 0.12537943777220537, "grad_norm": 55.70017623901367, "learning_rate": 1.9961308564176723e-06, "loss": 0.3325, "num_input_tokens_seen": 467712, "step": 950 }, { "epoch": 0.1260393295499538, "grad_norm": 21.9194278717041, "learning_rate": 1.9959257714427147e-06, "loss": 0.1471, "num_input_tokens_seen": 470080, "step": 955 }, { "epoch": 0.12669922132770225, "grad_norm": 9.642888069152832, "learning_rate": 1.995715401709186e-06, "loss": 0.1476, "num_input_tokens_seen": 472512, "step": 960 }, { "epoch": 0.1273591131054507, "grad_norm": 22.075639724731445, "learning_rate": 1.995499748333387e-06, "loss": 0.1639, "num_input_tokens_seen": 474752, "step": 965 }, { "epoch": 0.12801900488319914, "grad_norm": 34.80426025390625, "learning_rate": 1.9952788124596555e-06, "loss": 0.1539, "num_input_tokens_seen": 477440, "step": 970 }, { "epoch": 0.12867889666094762, "grad_norm": 16.518495559692383, "learning_rate": 1.9950525952603617e-06, "loss": 0.1507, "num_input_tokens_seen": 480000, "step": 975 }, { "epoch": 0.12933878843869606, "grad_norm": 63.373817443847656, "learning_rate": 1.994821097935899e-06, "loss": 0.1434, "num_input_tokens_seen": 482368, "step": 980 }, { "epoch": 0.1299986802164445, "grad_norm": 18.813962936401367, "learning_rate": 1.9945843217146804e-06, "loss": 0.0706, "num_input_tokens_seen": 484544, "step": 985 }, { "epoch": 0.13065857199419295, "grad_norm": 0.16079047322273254, "learning_rate": 1.9943422678531293e-06, "loss": 0.1142, "num_input_tokens_seen": 486720, "step": 990 }, { "epoch": 0.1313184637719414, "grad_norm": 0.1510768085718155, "learning_rate": 1.994094937635675e-06, "loss": 0.0692, "num_input_tokens_seen": 489344, "step": 995 }, { "epoch": 0.13197835554968984, "grad_norm": 19.409828186035156, "learning_rate": 1.9938423323747457e-06, "loss": 0.0421, "num_input_tokens_seen": 491776, "step": 1000 }, { "epoch": 0.1326382473274383, "grad_norm": 170.2490997314453, "learning_rate": 1.99358445341076e-06, "loss": 0.1827, "num_input_tokens_seen": 493952, "step": 1005 }, { "epoch": 0.13329813910518676, "grad_norm": 15.425086975097656, "learning_rate": 1.993321302112121e-06, "loss": 0.2152, "num_input_tokens_seen": 496320, "step": 1010 }, { "epoch": 0.1339580308829352, "grad_norm": 117.909423828125, "learning_rate": 1.993052879875209e-06, "loss": 0.0299, "num_input_tokens_seen": 498496, "step": 1015 }, { "epoch": 0.13461792266068365, "grad_norm": 47.48206329345703, "learning_rate": 1.992779188124374e-06, "loss": 0.1351, "num_input_tokens_seen": 501056, "step": 1020 }, { "epoch": 0.1352778144384321, "grad_norm": 1.8825244903564453, "learning_rate": 1.992500228311928e-06, "loss": 0.0501, "num_input_tokens_seen": 503296, "step": 1025 }, { "epoch": 0.13593770621618054, "grad_norm": 12.106839179992676, "learning_rate": 1.9922160019181372e-06, "loss": 0.3259, "num_input_tokens_seen": 505856, "step": 1030 }, { "epoch": 0.13659759799392898, "grad_norm": 0.0899241715669632, "learning_rate": 1.9919265104512138e-06, "loss": 0.1532, "num_input_tokens_seen": 508416, "step": 1035 }, { "epoch": 0.13725748977167745, "grad_norm": 2.1223573684692383, "learning_rate": 1.9916317554473094e-06, "loss": 0.2708, "num_input_tokens_seen": 511040, "step": 1040 }, { "epoch": 0.1379173815494259, "grad_norm": 57.11883544921875, "learning_rate": 1.9913317384705052e-06, "loss": 0.188, "num_input_tokens_seen": 513216, "step": 1045 }, { "epoch": 0.13857727332717434, "grad_norm": 12.335477828979492, "learning_rate": 1.991026461112805e-06, "loss": 0.1146, "num_input_tokens_seen": 515456, "step": 1050 }, { "epoch": 0.1392371651049228, "grad_norm": 0.4140935242176056, "learning_rate": 1.9907159249941257e-06, "loss": 0.1353, "num_input_tokens_seen": 517824, "step": 1055 }, { "epoch": 0.13989705688267123, "grad_norm": 101.3670425415039, "learning_rate": 1.990400131762289e-06, "loss": 0.112, "num_input_tokens_seen": 520320, "step": 1060 }, { "epoch": 0.14055694866041968, "grad_norm": 0.5620743632316589, "learning_rate": 1.9900790830930134e-06, "loss": 0.0702, "num_input_tokens_seen": 522752, "step": 1065 }, { "epoch": 0.14121684043816815, "grad_norm": 2.9457738399505615, "learning_rate": 1.9897527806899047e-06, "loss": 0.1085, "num_input_tokens_seen": 525376, "step": 1070 }, { "epoch": 0.1418767322159166, "grad_norm": 628.5321655273438, "learning_rate": 1.9894212262844465e-06, "loss": 0.2922, "num_input_tokens_seen": 527808, "step": 1075 }, { "epoch": 0.14253662399366504, "grad_norm": 0.10223134607076645, "learning_rate": 1.989084421635992e-06, "loss": 0.1607, "num_input_tokens_seen": 530304, "step": 1080 }, { "epoch": 0.14319651577141349, "grad_norm": 9.022106170654297, "learning_rate": 1.988742368531754e-06, "loss": 0.2576, "num_input_tokens_seen": 532480, "step": 1085 }, { "epoch": 0.14385640754916193, "grad_norm": 3.4002270698547363, "learning_rate": 1.9883950687867947e-06, "loss": 0.0676, "num_input_tokens_seen": 535168, "step": 1090 }, { "epoch": 0.14451629932691037, "grad_norm": 19.492107391357422, "learning_rate": 1.9880425242440187e-06, "loss": 0.1067, "num_input_tokens_seen": 537600, "step": 1095 }, { "epoch": 0.14517619110465885, "grad_norm": 77.36679077148438, "learning_rate": 1.9876847367741607e-06, "loss": 0.1435, "num_input_tokens_seen": 540096, "step": 1100 }, { "epoch": 0.1458360828824073, "grad_norm": 0.1341482549905777, "learning_rate": 1.987321708275776e-06, "loss": 0.1568, "num_input_tokens_seen": 542592, "step": 1105 }, { "epoch": 0.14649597466015574, "grad_norm": 0.15566033124923706, "learning_rate": 1.986953440675231e-06, "loss": 0.0017, "num_input_tokens_seen": 544960, "step": 1110 }, { "epoch": 0.14715586643790418, "grad_norm": 0.3482903242111206, "learning_rate": 1.9865799359266925e-06, "loss": 0.0812, "num_input_tokens_seen": 547136, "step": 1115 }, { "epoch": 0.14781575821565263, "grad_norm": 0.914465606212616, "learning_rate": 1.986201196012118e-06, "loss": 0.0878, "num_input_tokens_seen": 549440, "step": 1120 }, { "epoch": 0.14847564999340107, "grad_norm": 57.043827056884766, "learning_rate": 1.985817222941245e-06, "loss": 0.2476, "num_input_tokens_seen": 552064, "step": 1125 }, { "epoch": 0.14913554177114954, "grad_norm": 10.63588809967041, "learning_rate": 1.9854280187515794e-06, "loss": 0.082, "num_input_tokens_seen": 554432, "step": 1130 }, { "epoch": 0.149795433548898, "grad_norm": 0.7898812890052795, "learning_rate": 1.985033585508386e-06, "loss": 0.0745, "num_input_tokens_seen": 556800, "step": 1135 }, { "epoch": 0.15005939025999737, "eval_loss": 0.19488762319087982, "eval_runtime": 7.5589, "eval_samples_per_second": 890.998, "eval_steps_per_second": 111.391, "num_input_tokens_seen": 557824, "step": 1137 }, { "epoch": 0.15045532532664643, "grad_norm": 250.70848083496094, "learning_rate": 1.9846339253046766e-06, "loss": 0.5451, "num_input_tokens_seen": 559296, "step": 1140 }, { "epoch": 0.15111521710439488, "grad_norm": 27.038022994995117, "learning_rate": 1.984229040261199e-06, "loss": 0.1735, "num_input_tokens_seen": 562112, "step": 1145 }, { "epoch": 0.15177510888214332, "grad_norm": 0.19544407725334167, "learning_rate": 1.9838189325264263e-06, "loss": 0.2349, "num_input_tokens_seen": 564288, "step": 1150 }, { "epoch": 0.15243500065989177, "grad_norm": 199.0702667236328, "learning_rate": 1.983403604276546e-06, "loss": 0.0845, "num_input_tokens_seen": 566848, "step": 1155 }, { "epoch": 0.15309489243764024, "grad_norm": 44.44175720214844, "learning_rate": 1.9829830577154457e-06, "loss": 0.394, "num_input_tokens_seen": 569152, "step": 1160 }, { "epoch": 0.15375478421538868, "grad_norm": 19.46393585205078, "learning_rate": 1.982557295074705e-06, "loss": 0.0604, "num_input_tokens_seen": 571456, "step": 1165 }, { "epoch": 0.15441467599313713, "grad_norm": 0.14685490727424622, "learning_rate": 1.982126318613581e-06, "loss": 0.1545, "num_input_tokens_seen": 573824, "step": 1170 }, { "epoch": 0.15507456777088557, "grad_norm": 0.41161906719207764, "learning_rate": 1.9816901306189977e-06, "loss": 0.0016, "num_input_tokens_seen": 576128, "step": 1175 }, { "epoch": 0.15573445954863402, "grad_norm": 2.800428867340088, "learning_rate": 1.9812487334055342e-06, "loss": 0.139, "num_input_tokens_seen": 578432, "step": 1180 }, { "epoch": 0.15639435132638246, "grad_norm": 0.6683080196380615, "learning_rate": 1.98080212931541e-06, "loss": 0.1618, "num_input_tokens_seen": 580736, "step": 1185 }, { "epoch": 0.15705424310413094, "grad_norm": 0.13595707714557648, "learning_rate": 1.980350320718476e-06, "loss": 0.0846, "num_input_tokens_seen": 583040, "step": 1190 }, { "epoch": 0.15771413488187938, "grad_norm": 0.4379376471042633, "learning_rate": 1.9798933100121985e-06, "loss": 0.0073, "num_input_tokens_seen": 585344, "step": 1195 }, { "epoch": 0.15837402665962783, "grad_norm": 4.045234203338623, "learning_rate": 1.97943109962165e-06, "loss": 0.0793, "num_input_tokens_seen": 587904, "step": 1200 }, { "epoch": 0.15903391843737627, "grad_norm": 0.751695990562439, "learning_rate": 1.978963691999493e-06, "loss": 0.1511, "num_input_tokens_seen": 590208, "step": 1205 }, { "epoch": 0.15969381021512472, "grad_norm": 21.781272888183594, "learning_rate": 1.978491089625969e-06, "loss": 0.0853, "num_input_tokens_seen": 592512, "step": 1210 }, { "epoch": 0.16035370199287316, "grad_norm": 0.15117277204990387, "learning_rate": 1.9780132950088854e-06, "loss": 0.1785, "num_input_tokens_seen": 595072, "step": 1215 }, { "epoch": 0.1610135937706216, "grad_norm": 23.114465713500977, "learning_rate": 1.9775303106836e-06, "loss": 0.2842, "num_input_tokens_seen": 597632, "step": 1220 }, { "epoch": 0.16167348554837008, "grad_norm": 0.19639664888381958, "learning_rate": 1.977042139213011e-06, "loss": 0.0847, "num_input_tokens_seen": 600192, "step": 1225 }, { "epoch": 0.16233337732611852, "grad_norm": 0.22633503377437592, "learning_rate": 1.9765487831875404e-06, "loss": 0.0931, "num_input_tokens_seen": 602304, "step": 1230 }, { "epoch": 0.16299326910386697, "grad_norm": 0.8158997297286987, "learning_rate": 1.9760502452251217e-06, "loss": 0.1418, "num_input_tokens_seen": 604608, "step": 1235 }, { "epoch": 0.1636531608816154, "grad_norm": 9.417763710021973, "learning_rate": 1.975546527971186e-06, "loss": 0.1102, "num_input_tokens_seen": 606976, "step": 1240 }, { "epoch": 0.16431305265936386, "grad_norm": 0.3996043801307678, "learning_rate": 1.9750376340986472e-06, "loss": 0.0447, "num_input_tokens_seen": 609600, "step": 1245 }, { "epoch": 0.1649729444371123, "grad_norm": 17.319820404052734, "learning_rate": 1.974523566307889e-06, "loss": 0.1681, "num_input_tokens_seen": 611840, "step": 1250 }, { "epoch": 0.16563283621486077, "grad_norm": 17.101892471313477, "learning_rate": 1.9740043273267487e-06, "loss": 0.1085, "num_input_tokens_seen": 614528, "step": 1255 }, { "epoch": 0.16629272799260922, "grad_norm": 0.14512968063354492, "learning_rate": 1.973479919910505e-06, "loss": 0.0217, "num_input_tokens_seen": 617024, "step": 1260 }, { "epoch": 0.16695261977035766, "grad_norm": 26.45575523376465, "learning_rate": 1.972950346841862e-06, "loss": 0.1141, "num_input_tokens_seen": 619392, "step": 1265 }, { "epoch": 0.1676125115481061, "grad_norm": 46.64674758911133, "learning_rate": 1.972415610930934e-06, "loss": 0.0049, "num_input_tokens_seen": 621888, "step": 1270 }, { "epoch": 0.16827240332585455, "grad_norm": 48.495487213134766, "learning_rate": 1.9718757150152324e-06, "loss": 0.2469, "num_input_tokens_seen": 624192, "step": 1275 }, { "epoch": 0.168932295103603, "grad_norm": 0.485227108001709, "learning_rate": 1.9713306619596488e-06, "loss": 0.0511, "num_input_tokens_seen": 626624, "step": 1280 }, { "epoch": 0.16959218688135147, "grad_norm": 34.0601692199707, "learning_rate": 1.9707804546564407e-06, "loss": 0.0686, "num_input_tokens_seen": 628928, "step": 1285 }, { "epoch": 0.17025207865909991, "grad_norm": 2.3748066425323486, "learning_rate": 1.9702250960252164e-06, "loss": 0.0234, "num_input_tokens_seen": 631616, "step": 1290 }, { "epoch": 0.17091197043684836, "grad_norm": 0.013682112097740173, "learning_rate": 1.969664589012918e-06, "loss": 0.0015, "num_input_tokens_seen": 634112, "step": 1295 }, { "epoch": 0.1715718622145968, "grad_norm": 314.3330078125, "learning_rate": 1.9690989365938077e-06, "loss": 0.3855, "num_input_tokens_seen": 636416, "step": 1300 }, { "epoch": 0.17223175399234525, "grad_norm": 0.05270430073142052, "learning_rate": 1.9685281417694513e-06, "loss": 0.0051, "num_input_tokens_seen": 638848, "step": 1305 }, { "epoch": 0.1728916457700937, "grad_norm": 0.23324760794639587, "learning_rate": 1.967952207568702e-06, "loss": 0.1125, "num_input_tokens_seen": 641216, "step": 1310 }, { "epoch": 0.17355153754784217, "grad_norm": 0.26865366101264954, "learning_rate": 1.967371137047685e-06, "loss": 0.0011, "num_input_tokens_seen": 644032, "step": 1315 }, { "epoch": 0.1742114293255906, "grad_norm": 0.24145404994487762, "learning_rate": 1.966784933289778e-06, "loss": 0.1494, "num_input_tokens_seen": 646528, "step": 1320 }, { "epoch": 0.17487132110333906, "grad_norm": 0.08738990128040314, "learning_rate": 1.9661935994056014e-06, "loss": 0.1951, "num_input_tokens_seen": 649088, "step": 1325 }, { "epoch": 0.1755312128810875, "grad_norm": 0.9014714956283569, "learning_rate": 1.965597138532996e-06, "loss": 0.0093, "num_input_tokens_seen": 651520, "step": 1330 }, { "epoch": 0.17619110465883595, "grad_norm": 0.6617699265480042, "learning_rate": 1.964995553837009e-06, "loss": 0.0409, "num_input_tokens_seen": 654016, "step": 1335 }, { "epoch": 0.1768509964365844, "grad_norm": 0.015406480059027672, "learning_rate": 1.964388848509875e-06, "loss": 0.1143, "num_input_tokens_seen": 656320, "step": 1340 }, { "epoch": 0.17751088821433286, "grad_norm": 0.025758925825357437, "learning_rate": 1.9637770257710026e-06, "loss": 0.1683, "num_input_tokens_seen": 658880, "step": 1345 }, { "epoch": 0.1781707799920813, "grad_norm": 0.08237680792808533, "learning_rate": 1.9631600888669545e-06, "loss": 0.0205, "num_input_tokens_seen": 661184, "step": 1350 }, { "epoch": 0.17883067176982975, "grad_norm": 0.07278398424386978, "learning_rate": 1.962538041071431e-06, "loss": 0.0664, "num_input_tokens_seen": 663680, "step": 1355 }, { "epoch": 0.1794905635475782, "grad_norm": 25.769346237182617, "learning_rate": 1.961910885685253e-06, "loss": 0.0688, "num_input_tokens_seen": 666048, "step": 1360 }, { "epoch": 0.18015045532532664, "grad_norm": 164.78553771972656, "learning_rate": 1.9612786260363436e-06, "loss": 0.2636, "num_input_tokens_seen": 668480, "step": 1365 }, { "epoch": 0.1808103471030751, "grad_norm": 0.17773790657520294, "learning_rate": 1.9606412654797116e-06, "loss": 0.1108, "num_input_tokens_seen": 671488, "step": 1370 }, { "epoch": 0.18147023888082353, "grad_norm": 66.89860534667969, "learning_rate": 1.9599988073974332e-06, "loss": 0.1088, "num_input_tokens_seen": 673920, "step": 1375 }, { "epoch": 0.182130130658572, "grad_norm": 187.47903442382812, "learning_rate": 1.959351255198634e-06, "loss": 0.1413, "num_input_tokens_seen": 676416, "step": 1380 }, { "epoch": 0.18279002243632045, "grad_norm": 2.0588765144348145, "learning_rate": 1.9586986123194704e-06, "loss": 0.0008, "num_input_tokens_seen": 679040, "step": 1385 }, { "epoch": 0.1834499142140689, "grad_norm": 0.09783805161714554, "learning_rate": 1.958040882223112e-06, "loss": 0.1041, "num_input_tokens_seen": 681920, "step": 1390 }, { "epoch": 0.18410980599181734, "grad_norm": 0.06469712406396866, "learning_rate": 1.9573780683997235e-06, "loss": 0.04, "num_input_tokens_seen": 684416, "step": 1395 }, { "epoch": 0.18476969776956578, "grad_norm": 222.29971313476562, "learning_rate": 1.956710174366445e-06, "loss": 0.3574, "num_input_tokens_seen": 686976, "step": 1400 }, { "epoch": 0.18542958954731423, "grad_norm": 0.0895252674818039, "learning_rate": 1.9560372036673764e-06, "loss": 0.2731, "num_input_tokens_seen": 689408, "step": 1405 }, { "epoch": 0.1860894813250627, "grad_norm": 0.06232970580458641, "learning_rate": 1.955359159873553e-06, "loss": 0.0238, "num_input_tokens_seen": 691712, "step": 1410 }, { "epoch": 0.18674937310281114, "grad_norm": 0.0344870463013649, "learning_rate": 1.954676046582932e-06, "loss": 0.1341, "num_input_tokens_seen": 694080, "step": 1415 }, { "epoch": 0.1874092648805596, "grad_norm": 38.71489334106445, "learning_rate": 1.9539878674203706e-06, "loss": 0.2135, "num_input_tokens_seen": 696640, "step": 1420 }, { "epoch": 0.18806915665830803, "grad_norm": 0.06116657704114914, "learning_rate": 1.9532946260376076e-06, "loss": 0.0011, "num_input_tokens_seen": 699136, "step": 1425 }, { "epoch": 0.18872904843605648, "grad_norm": 53.41019821166992, "learning_rate": 1.952596326113244e-06, "loss": 0.3109, "num_input_tokens_seen": 701696, "step": 1430 }, { "epoch": 0.18938894021380492, "grad_norm": 1.2848087549209595, "learning_rate": 1.9518929713527226e-06, "loss": 0.1812, "num_input_tokens_seen": 704384, "step": 1435 }, { "epoch": 0.1900488319915534, "grad_norm": 2.0177323818206787, "learning_rate": 1.9511845654883097e-06, "loss": 0.0066, "num_input_tokens_seen": 706560, "step": 1440 }, { "epoch": 0.19070872376930184, "grad_norm": 0.4617765247821808, "learning_rate": 1.9504711122790754e-06, "loss": 0.0755, "num_input_tokens_seen": 709248, "step": 1445 }, { "epoch": 0.19136861554705029, "grad_norm": 45.93152618408203, "learning_rate": 1.949752615510871e-06, "loss": 0.2258, "num_input_tokens_seen": 711296, "step": 1450 }, { "epoch": 0.19202850732479873, "grad_norm": 0.20753158628940582, "learning_rate": 1.949029078996313e-06, "loss": 0.0457, "num_input_tokens_seen": 713728, "step": 1455 }, { "epoch": 0.19268839910254718, "grad_norm": 13.657062530517578, "learning_rate": 1.9483005065747584e-06, "loss": 0.1224, "num_input_tokens_seen": 716224, "step": 1460 }, { "epoch": 0.19334829088029562, "grad_norm": 13.369616508483887, "learning_rate": 1.947566902112289e-06, "loss": 0.3816, "num_input_tokens_seen": 718528, "step": 1465 }, { "epoch": 0.1940081826580441, "grad_norm": 60.77271270751953, "learning_rate": 1.9468282695016863e-06, "loss": 0.1841, "num_input_tokens_seen": 720960, "step": 1470 }, { "epoch": 0.19466807443579254, "grad_norm": 1.3714667558670044, "learning_rate": 1.946084612662415e-06, "loss": 0.1318, "num_input_tokens_seen": 723200, "step": 1475 }, { "epoch": 0.19532796621354098, "grad_norm": 114.1025619506836, "learning_rate": 1.9453359355405987e-06, "loss": 0.1708, "num_input_tokens_seen": 725888, "step": 1480 }, { "epoch": 0.19598785799128943, "grad_norm": 0.23408390581607819, "learning_rate": 1.944582242109002e-06, "loss": 0.0194, "num_input_tokens_seen": 728256, "step": 1485 }, { "epoch": 0.19664774976903787, "grad_norm": 0.22887404263019562, "learning_rate": 1.943823536367006e-06, "loss": 0.1454, "num_input_tokens_seen": 730688, "step": 1490 }, { "epoch": 0.19730764154678632, "grad_norm": 2.37292742729187, "learning_rate": 1.9430598223405913e-06, "loss": 0.1624, "num_input_tokens_seen": 732992, "step": 1495 }, { "epoch": 0.1979675333245348, "grad_norm": 0.2745613157749176, "learning_rate": 1.9422911040823125e-06, "loss": 0.1476, "num_input_tokens_seen": 735424, "step": 1500 }, { "epoch": 0.19862742510228323, "grad_norm": 132.48385620117188, "learning_rate": 1.941517385671279e-06, "loss": 0.3263, "num_input_tokens_seen": 737664, "step": 1505 }, { "epoch": 0.19928731688003168, "grad_norm": 0.1534176468849182, "learning_rate": 1.940738671213134e-06, "loss": 0.0942, "num_input_tokens_seen": 740096, "step": 1510 }, { "epoch": 0.19994720865778012, "grad_norm": 46.99830627441406, "learning_rate": 1.93995496484003e-06, "loss": 0.1712, "num_input_tokens_seen": 742912, "step": 1515 }, { "epoch": 0.2000791870133298, "eval_loss": 0.1068890318274498, "eval_runtime": 7.6888, "eval_samples_per_second": 875.951, "eval_steps_per_second": 109.51, "num_input_tokens_seen": 743424, "step": 1516 }, { "epoch": 0.20060710043552857, "grad_norm": 160.11495971679688, "learning_rate": 1.9391662707106092e-06, "loss": 0.1021, "num_input_tokens_seen": 745536, "step": 1520 }, { "epoch": 0.201266992213277, "grad_norm": 0.16469451785087585, "learning_rate": 1.9383725930099814e-06, "loss": 0.0031, "num_input_tokens_seen": 747968, "step": 1525 }, { "epoch": 0.20192688399102549, "grad_norm": 0.772555947303772, "learning_rate": 1.9375739359497e-06, "loss": 0.1222, "num_input_tokens_seen": 750464, "step": 1530 }, { "epoch": 0.20258677576877393, "grad_norm": 0.41962626576423645, "learning_rate": 1.936770303767741e-06, "loss": 0.2416, "num_input_tokens_seen": 752896, "step": 1535 }, { "epoch": 0.20324666754652237, "grad_norm": 11.837217330932617, "learning_rate": 1.9359617007284815e-06, "loss": 0.1974, "num_input_tokens_seen": 755648, "step": 1540 }, { "epoch": 0.20390655932427082, "grad_norm": 9.827956199645996, "learning_rate": 1.9351481311226738e-06, "loss": 0.2312, "num_input_tokens_seen": 758144, "step": 1545 }, { "epoch": 0.20456645110201926, "grad_norm": 0.8918312788009644, "learning_rate": 1.934329599267426e-06, "loss": 0.1313, "num_input_tokens_seen": 760704, "step": 1550 }, { "epoch": 0.2052263428797677, "grad_norm": 43.78156280517578, "learning_rate": 1.933506109506178e-06, "loss": 0.0468, "num_input_tokens_seen": 763136, "step": 1555 }, { "epoch": 0.20588623465751615, "grad_norm": 1.698026418685913, "learning_rate": 1.9326776662086765e-06, "loss": 0.1132, "num_input_tokens_seen": 766016, "step": 1560 }, { "epoch": 0.20654612643526463, "grad_norm": 59.669952392578125, "learning_rate": 1.9318442737709565e-06, "loss": 0.3367, "num_input_tokens_seen": 768512, "step": 1565 }, { "epoch": 0.20720601821301307, "grad_norm": 0.267106831073761, "learning_rate": 1.9310059366153116e-06, "loss": 0.2047, "num_input_tokens_seen": 770816, "step": 1570 }, { "epoch": 0.20786590999076152, "grad_norm": 0.7591071724891663, "learning_rate": 1.930162659190277e-06, "loss": 0.2302, "num_input_tokens_seen": 773312, "step": 1575 }, { "epoch": 0.20852580176850996, "grad_norm": 1.2925443649291992, "learning_rate": 1.9293144459706007e-06, "loss": 0.0029, "num_input_tokens_seen": 775680, "step": 1580 }, { "epoch": 0.2091856935462584, "grad_norm": 17.853742599487305, "learning_rate": 1.928461301457223e-06, "loss": 0.1877, "num_input_tokens_seen": 778048, "step": 1585 }, { "epoch": 0.20984558532400685, "grad_norm": 0.08952134847640991, "learning_rate": 1.92760323017725e-06, "loss": 0.3027, "num_input_tokens_seen": 780672, "step": 1590 }, { "epoch": 0.21050547710175532, "grad_norm": 0.1787254512310028, "learning_rate": 1.9267402366839338e-06, "loss": 0.216, "num_input_tokens_seen": 783360, "step": 1595 }, { "epoch": 0.21116536887950377, "grad_norm": 9.013484954833984, "learning_rate": 1.9258723255566433e-06, "loss": 0.1268, "num_input_tokens_seen": 785856, "step": 1600 }, { "epoch": 0.2118252606572522, "grad_norm": 1.6822223663330078, "learning_rate": 1.924999501400843e-06, "loss": 0.1832, "num_input_tokens_seen": 788480, "step": 1605 }, { "epoch": 0.21248515243500066, "grad_norm": 0.5006535649299622, "learning_rate": 1.924121768848068e-06, "loss": 0.1511, "num_input_tokens_seen": 791040, "step": 1610 }, { "epoch": 0.2131450442127491, "grad_norm": 0.24185070395469666, "learning_rate": 1.923239132555899e-06, "loss": 0.1088, "num_input_tokens_seen": 793600, "step": 1615 }, { "epoch": 0.21380493599049755, "grad_norm": 1.2802025079727173, "learning_rate": 1.9223515972079378e-06, "loss": 0.1302, "num_input_tokens_seen": 795968, "step": 1620 }, { "epoch": 0.21446482776824602, "grad_norm": 6.617660999298096, "learning_rate": 1.9214591675137813e-06, "loss": 0.049, "num_input_tokens_seen": 798272, "step": 1625 }, { "epoch": 0.21512471954599446, "grad_norm": 111.40785217285156, "learning_rate": 1.9205618482090003e-06, "loss": 0.144, "num_input_tokens_seen": 801024, "step": 1630 }, { "epoch": 0.2157846113237429, "grad_norm": 22.670175552368164, "learning_rate": 1.91965964405511e-06, "loss": 0.1374, "num_input_tokens_seen": 803584, "step": 1635 }, { "epoch": 0.21644450310149135, "grad_norm": 56.14551544189453, "learning_rate": 1.9187525598395457e-06, "loss": 0.0117, "num_input_tokens_seen": 805952, "step": 1640 }, { "epoch": 0.2171043948792398, "grad_norm": 35.73996353149414, "learning_rate": 1.9178406003756396e-06, "loss": 0.1249, "num_input_tokens_seen": 808512, "step": 1645 }, { "epoch": 0.21776428665698824, "grad_norm": 68.0622787475586, "learning_rate": 1.9169237705025936e-06, "loss": 0.0819, "num_input_tokens_seen": 811136, "step": 1650 }, { "epoch": 0.21842417843473672, "grad_norm": 0.02525966428220272, "learning_rate": 1.9160020750854533e-06, "loss": 0.0183, "num_input_tokens_seen": 813376, "step": 1655 }, { "epoch": 0.21908407021248516, "grad_norm": 180.0795135498047, "learning_rate": 1.915075519015083e-06, "loss": 0.199, "num_input_tokens_seen": 815872, "step": 1660 }, { "epoch": 0.2197439619902336, "grad_norm": 1.1811161041259766, "learning_rate": 1.914144107208139e-06, "loss": 0.0725, "num_input_tokens_seen": 818240, "step": 1665 }, { "epoch": 0.22040385376798205, "grad_norm": 0.16843393445014954, "learning_rate": 1.913207844607045e-06, "loss": 0.0539, "num_input_tokens_seen": 820736, "step": 1670 }, { "epoch": 0.2210637455457305, "grad_norm": 21.720033645629883, "learning_rate": 1.912266736179964e-06, "loss": 0.2528, "num_input_tokens_seen": 823616, "step": 1675 }, { "epoch": 0.22172363732347894, "grad_norm": 8.987836837768555, "learning_rate": 1.9113207869207727e-06, "loss": 0.1707, "num_input_tokens_seen": 826112, "step": 1680 }, { "epoch": 0.2223835291012274, "grad_norm": 0.8188716769218445, "learning_rate": 1.9103700018490365e-06, "loss": 0.1356, "num_input_tokens_seen": 828672, "step": 1685 }, { "epoch": 0.22304342087897586, "grad_norm": 1.7912400960922241, "learning_rate": 1.9094143860099787e-06, "loss": 0.1711, "num_input_tokens_seen": 831296, "step": 1690 }, { "epoch": 0.2237033126567243, "grad_norm": 108.46529388427734, "learning_rate": 1.9084539444744594e-06, "loss": 0.0895, "num_input_tokens_seen": 833856, "step": 1695 }, { "epoch": 0.22436320443447275, "grad_norm": 0.1527111977338791, "learning_rate": 1.907488682338944e-06, "loss": 0.1324, "num_input_tokens_seen": 836480, "step": 1700 }, { "epoch": 0.2250230962122212, "grad_norm": 63.81155776977539, "learning_rate": 1.9065186047254782e-06, "loss": 0.0553, "num_input_tokens_seen": 838976, "step": 1705 }, { "epoch": 0.22568298798996964, "grad_norm": 0.6900471448898315, "learning_rate": 1.9055437167816604e-06, "loss": 0.2205, "num_input_tokens_seen": 841728, "step": 1710 }, { "epoch": 0.22634287976771808, "grad_norm": 0.05360851809382439, "learning_rate": 1.9045640236806149e-06, "loss": 0.0143, "num_input_tokens_seen": 843968, "step": 1715 }, { "epoch": 0.22700277154546655, "grad_norm": 0.35860204696655273, "learning_rate": 1.903579530620963e-06, "loss": 0.3401, "num_input_tokens_seen": 846464, "step": 1720 }, { "epoch": 0.227662663323215, "grad_norm": 0.1349165141582489, "learning_rate": 1.9025902428267975e-06, "loss": 0.0967, "num_input_tokens_seen": 849088, "step": 1725 }, { "epoch": 0.22832255510096344, "grad_norm": 0.09187756478786469, "learning_rate": 1.901596165547653e-06, "loss": 0.2082, "num_input_tokens_seen": 851712, "step": 1730 }, { "epoch": 0.2289824468787119, "grad_norm": 0.5252279043197632, "learning_rate": 1.9005973040584796e-06, "loss": 0.102, "num_input_tokens_seen": 854208, "step": 1735 }, { "epoch": 0.22964233865646033, "grad_norm": 0.3394613564014435, "learning_rate": 1.8995936636596138e-06, "loss": 0.088, "num_input_tokens_seen": 856576, "step": 1740 }, { "epoch": 0.23030223043420878, "grad_norm": 0.196676105260849, "learning_rate": 1.8985852496767504e-06, "loss": 0.1348, "num_input_tokens_seen": 859008, "step": 1745 }, { "epoch": 0.23096212221195725, "grad_norm": 83.27044677734375, "learning_rate": 1.897572067460916e-06, "loss": 0.1643, "num_input_tokens_seen": 861440, "step": 1750 }, { "epoch": 0.2316220139897057, "grad_norm": 1.6085481643676758, "learning_rate": 1.8965541223884377e-06, "loss": 0.0848, "num_input_tokens_seen": 863936, "step": 1755 }, { "epoch": 0.23228190576745414, "grad_norm": 23.993480682373047, "learning_rate": 1.8955314198609171e-06, "loss": 0.1238, "num_input_tokens_seen": 866176, "step": 1760 }, { "epoch": 0.23294179754520258, "grad_norm": 0.1617557555437088, "learning_rate": 1.8945039653052005e-06, "loss": 0.0977, "num_input_tokens_seen": 868480, "step": 1765 }, { "epoch": 0.23360168932295103, "grad_norm": 0.15750542283058167, "learning_rate": 1.8934717641733498e-06, "loss": 0.0877, "num_input_tokens_seen": 870976, "step": 1770 }, { "epoch": 0.23426158110069947, "grad_norm": 38.80494689941406, "learning_rate": 1.8924348219426143e-06, "loss": 0.2471, "num_input_tokens_seen": 873088, "step": 1775 }, { "epoch": 0.23492147287844795, "grad_norm": 11.684532165527344, "learning_rate": 1.8913931441154016e-06, "loss": 0.2694, "num_input_tokens_seen": 875520, "step": 1780 }, { "epoch": 0.2355813646561964, "grad_norm": 32.83953857421875, "learning_rate": 1.8903467362192482e-06, "loss": 0.0401, "num_input_tokens_seen": 877632, "step": 1785 }, { "epoch": 0.23624125643394484, "grad_norm": 97.62303161621094, "learning_rate": 1.8892956038067895e-06, "loss": 0.0696, "num_input_tokens_seen": 880000, "step": 1790 }, { "epoch": 0.23690114821169328, "grad_norm": 97.3688735961914, "learning_rate": 1.8882397524557317e-06, "loss": 0.0238, "num_input_tokens_seen": 882176, "step": 1795 }, { "epoch": 0.23756103998944172, "grad_norm": 0.09657946974039078, "learning_rate": 1.8871791877688208e-06, "loss": 0.0642, "num_input_tokens_seen": 884800, "step": 1800 }, { "epoch": 0.23822093176719017, "grad_norm": 0.5457859635353088, "learning_rate": 1.8861139153738143e-06, "loss": 0.0068, "num_input_tokens_seen": 887104, "step": 1805 }, { "epoch": 0.23888082354493864, "grad_norm": 2.4806833267211914, "learning_rate": 1.8850439409234498e-06, "loss": 0.0012, "num_input_tokens_seen": 889408, "step": 1810 }, { "epoch": 0.2395407153226871, "grad_norm": 156.25328063964844, "learning_rate": 1.8839692700954161e-06, "loss": 0.1943, "num_input_tokens_seen": 891648, "step": 1815 }, { "epoch": 0.24020060710043553, "grad_norm": 184.66175842285156, "learning_rate": 1.8828899085923234e-06, "loss": 0.3211, "num_input_tokens_seen": 894208, "step": 1820 }, { "epoch": 0.24086049887818398, "grad_norm": 0.037798941135406494, "learning_rate": 1.881805862141671e-06, "loss": 0.2085, "num_input_tokens_seen": 896704, "step": 1825 }, { "epoch": 0.24152039065593242, "grad_norm": 0.04176515340805054, "learning_rate": 1.8807171364958196e-06, "loss": 0.082, "num_input_tokens_seen": 899264, "step": 1830 }, { "epoch": 0.24218028243368087, "grad_norm": 0.2008335441350937, "learning_rate": 1.879623737431959e-06, "loss": 0.0206, "num_input_tokens_seen": 901760, "step": 1835 }, { "epoch": 0.24284017421142934, "grad_norm": 62.211387634277344, "learning_rate": 1.8785256707520778e-06, "loss": 0.3077, "num_input_tokens_seen": 903872, "step": 1840 }, { "epoch": 0.24350006598917778, "grad_norm": 0.0855240523815155, "learning_rate": 1.8774229422829325e-06, "loss": 0.0012, "num_input_tokens_seen": 906368, "step": 1845 }, { "epoch": 0.24415995776692623, "grad_norm": 0.062163472175598145, "learning_rate": 1.8763155578760181e-06, "loss": 0.0491, "num_input_tokens_seen": 908864, "step": 1850 }, { "epoch": 0.24481984954467467, "grad_norm": 0.138889878988266, "learning_rate": 1.8752035234075336e-06, "loss": 0.0892, "num_input_tokens_seen": 911040, "step": 1855 }, { "epoch": 0.24547974132242312, "grad_norm": 21.301368713378906, "learning_rate": 1.8740868447783554e-06, "loss": 0.1932, "num_input_tokens_seen": 913408, "step": 1860 }, { "epoch": 0.24613963310017156, "grad_norm": 77.18330383300781, "learning_rate": 1.8729655279140012e-06, "loss": 0.2285, "num_input_tokens_seen": 915968, "step": 1865 }, { "epoch": 0.24679952487792003, "grad_norm": 28.638566970825195, "learning_rate": 1.8718395787646029e-06, "loss": 0.1745, "num_input_tokens_seen": 918528, "step": 1870 }, { "epoch": 0.24745941665566848, "grad_norm": 0.12477682530879974, "learning_rate": 1.870709003304872e-06, "loss": 0.0009, "num_input_tokens_seen": 921152, "step": 1875 }, { "epoch": 0.24811930843341692, "grad_norm": 0.03698311001062393, "learning_rate": 1.8695738075340693e-06, "loss": 0.0005, "num_input_tokens_seen": 923520, "step": 1880 }, { "epoch": 0.24877920021116537, "grad_norm": 0.9180229902267456, "learning_rate": 1.8684339974759723e-06, "loss": 0.1696, "num_input_tokens_seen": 925888, "step": 1885 }, { "epoch": 0.2494390919889138, "grad_norm": 14.323315620422363, "learning_rate": 1.8672895791788445e-06, "loss": 0.0881, "num_input_tokens_seen": 928704, "step": 1890 }, { "epoch": 0.2500989837666623, "grad_norm": 118.73922729492188, "learning_rate": 1.8661405587154017e-06, "loss": 0.2865, "num_input_tokens_seen": 930944, "step": 1895 }, { "epoch": 0.2500989837666623, "eval_loss": 0.12773367762565613, "eval_runtime": 7.6378, "eval_samples_per_second": 881.797, "eval_steps_per_second": 110.241, "num_input_tokens_seen": 930944, "step": 1895 }, { "epoch": 0.25075887554441073, "grad_norm": 12.755705833435059, "learning_rate": 1.8649869421827808e-06, "loss": 0.2389, "num_input_tokens_seen": 933376, "step": 1900 }, { "epoch": 0.2514187673221592, "grad_norm": 0.946739137172699, "learning_rate": 1.863828735702507e-06, "loss": 0.0517, "num_input_tokens_seen": 936000, "step": 1905 }, { "epoch": 0.2520786590999076, "grad_norm": 19.37730598449707, "learning_rate": 1.862665945420462e-06, "loss": 0.0611, "num_input_tokens_seen": 938432, "step": 1910 }, { "epoch": 0.25273855087765607, "grad_norm": 4.929696559906006, "learning_rate": 1.8614985775068498e-06, "loss": 0.0838, "num_input_tokens_seen": 941312, "step": 1915 }, { "epoch": 0.2533984426554045, "grad_norm": 59.3293342590332, "learning_rate": 1.860326638156167e-06, "loss": 0.0099, "num_input_tokens_seen": 943488, "step": 1920 }, { "epoch": 0.25405833443315295, "grad_norm": 32.63521194458008, "learning_rate": 1.8591501335871653e-06, "loss": 0.1064, "num_input_tokens_seen": 945856, "step": 1925 }, { "epoch": 0.2547182262109014, "grad_norm": 163.6297149658203, "learning_rate": 1.857969070042824e-06, "loss": 0.2861, "num_input_tokens_seen": 948352, "step": 1930 }, { "epoch": 0.25537811798864984, "grad_norm": 0.6843308210372925, "learning_rate": 1.8567834537903116e-06, "loss": 0.0541, "num_input_tokens_seen": 950976, "step": 1935 }, { "epoch": 0.2560380097663983, "grad_norm": 15.151936531066895, "learning_rate": 1.8555932911209565e-06, "loss": 0.1499, "num_input_tokens_seen": 953216, "step": 1940 }, { "epoch": 0.25669790154414673, "grad_norm": 1.8535500764846802, "learning_rate": 1.8543985883502119e-06, "loss": 0.0338, "num_input_tokens_seen": 955648, "step": 1945 }, { "epoch": 0.25735779332189523, "grad_norm": 0.5665189623832703, "learning_rate": 1.8531993518176216e-06, "loss": 0.0462, "num_input_tokens_seen": 957888, "step": 1950 }, { "epoch": 0.2580176850996437, "grad_norm": 91.90030670166016, "learning_rate": 1.8519955878867889e-06, "loss": 0.1275, "num_input_tokens_seen": 960128, "step": 1955 }, { "epoch": 0.2586775768773921, "grad_norm": 0.0542287677526474, "learning_rate": 1.8507873029453392e-06, "loss": 0.1778, "num_input_tokens_seen": 962496, "step": 1960 }, { "epoch": 0.25933746865514057, "grad_norm": 9.215625762939453, "learning_rate": 1.8495745034048896e-06, "loss": 0.2342, "num_input_tokens_seen": 965120, "step": 1965 }, { "epoch": 0.259997360432889, "grad_norm": 0.16024070978164673, "learning_rate": 1.8483571957010127e-06, "loss": 0.0074, "num_input_tokens_seen": 967616, "step": 1970 }, { "epoch": 0.26065725221063746, "grad_norm": 0.10168848931789398, "learning_rate": 1.8471353862932035e-06, "loss": 0.0688, "num_input_tokens_seen": 970240, "step": 1975 }, { "epoch": 0.2613171439883859, "grad_norm": 71.9769515991211, "learning_rate": 1.8459090816648444e-06, "loss": 0.1752, "num_input_tokens_seen": 972544, "step": 1980 }, { "epoch": 0.26197703576613435, "grad_norm": 0.1407454013824463, "learning_rate": 1.8446782883231713e-06, "loss": 0.2913, "num_input_tokens_seen": 974912, "step": 1985 }, { "epoch": 0.2626369275438828, "grad_norm": 17.948266983032227, "learning_rate": 1.8434430127992387e-06, "loss": 0.3162, "num_input_tokens_seen": 977088, "step": 1990 }, { "epoch": 0.26329681932163124, "grad_norm": 32.353912353515625, "learning_rate": 1.8422032616478857e-06, "loss": 0.1709, "num_input_tokens_seen": 979648, "step": 1995 }, { "epoch": 0.2639567110993797, "grad_norm": 1.2442036867141724, "learning_rate": 1.8409590414477001e-06, "loss": 0.1159, "num_input_tokens_seen": 982336, "step": 2000 }, { "epoch": 0.2646166028771281, "grad_norm": 3.403188705444336, "learning_rate": 1.839710358800985e-06, "loss": 0.0056, "num_input_tokens_seen": 984768, "step": 2005 }, { "epoch": 0.2652764946548766, "grad_norm": 5.241207599639893, "learning_rate": 1.8384572203337224e-06, "loss": 0.0349, "num_input_tokens_seen": 987136, "step": 2010 }, { "epoch": 0.26593638643262507, "grad_norm": 0.26890337467193604, "learning_rate": 1.837199632695538e-06, "loss": 0.1309, "num_input_tokens_seen": 989824, "step": 2015 }, { "epoch": 0.2665962782103735, "grad_norm": 53.87063217163086, "learning_rate": 1.8359376025596682e-06, "loss": 0.3374, "num_input_tokens_seen": 992064, "step": 2020 }, { "epoch": 0.26725616998812196, "grad_norm": 17.814453125, "learning_rate": 1.8346711366229215e-06, "loss": 0.1366, "num_input_tokens_seen": 994368, "step": 2025 }, { "epoch": 0.2679160617658704, "grad_norm": 18.101577758789062, "learning_rate": 1.8334002416056442e-06, "loss": 0.215, "num_input_tokens_seen": 996864, "step": 2030 }, { "epoch": 0.26857595354361885, "grad_norm": 0.25549983978271484, "learning_rate": 1.8321249242516865e-06, "loss": 0.2084, "num_input_tokens_seen": 999360, "step": 2035 }, { "epoch": 0.2692358453213673, "grad_norm": 0.35009151697158813, "learning_rate": 1.8308451913283638e-06, "loss": 0.0868, "num_input_tokens_seen": 1001920, "step": 2040 }, { "epoch": 0.26989573709911574, "grad_norm": 0.3472491502761841, "learning_rate": 1.8295610496264229e-06, "loss": 0.0602, "num_input_tokens_seen": 1004224, "step": 2045 }, { "epoch": 0.2705556288768642, "grad_norm": 0.34922727942466736, "learning_rate": 1.828272505960005e-06, "loss": 0.0027, "num_input_tokens_seen": 1006528, "step": 2050 }, { "epoch": 0.27121552065461263, "grad_norm": 0.13754220306873322, "learning_rate": 1.8269795671666098e-06, "loss": 0.1856, "num_input_tokens_seen": 1008896, "step": 2055 }, { "epoch": 0.2718754124323611, "grad_norm": 2.371704339981079, "learning_rate": 1.8256822401070591e-06, "loss": 0.1347, "num_input_tokens_seen": 1011648, "step": 2060 }, { "epoch": 0.2725353042101095, "grad_norm": 125.92493438720703, "learning_rate": 1.8243805316654611e-06, "loss": 0.0254, "num_input_tokens_seen": 1014208, "step": 2065 }, { "epoch": 0.27319519598785796, "grad_norm": 6.873655796051025, "learning_rate": 1.823074448749172e-06, "loss": 0.2187, "num_input_tokens_seen": 1016640, "step": 2070 }, { "epoch": 0.27385508776560646, "grad_norm": 0.0956018716096878, "learning_rate": 1.8217639982887623e-06, "loss": 0.0403, "num_input_tokens_seen": 1019328, "step": 2075 }, { "epoch": 0.2745149795433549, "grad_norm": 0.05648243427276611, "learning_rate": 1.8204491872379769e-06, "loss": 0.0603, "num_input_tokens_seen": 1021696, "step": 2080 }, { "epoch": 0.27517487132110335, "grad_norm": 148.0998077392578, "learning_rate": 1.8191300225737e-06, "loss": 0.0996, "num_input_tokens_seen": 1024256, "step": 2085 }, { "epoch": 0.2758347630988518, "grad_norm": 23.68135643005371, "learning_rate": 1.8178065112959184e-06, "loss": 0.2261, "num_input_tokens_seen": 1026560, "step": 2090 }, { "epoch": 0.27649465487660024, "grad_norm": 23.90264129638672, "learning_rate": 1.8164786604276832e-06, "loss": 0.3078, "num_input_tokens_seen": 1029184, "step": 2095 }, { "epoch": 0.2771545466543487, "grad_norm": 1.5052696466445923, "learning_rate": 1.8151464770150727e-06, "loss": 0.1119, "num_input_tokens_seen": 1031744, "step": 2100 }, { "epoch": 0.27781443843209713, "grad_norm": 0.3358094096183777, "learning_rate": 1.8138099681271558e-06, "loss": 0.2357, "num_input_tokens_seen": 1034048, "step": 2105 }, { "epoch": 0.2784743302098456, "grad_norm": 0.09559042006731033, "learning_rate": 1.8124691408559536e-06, "loss": 0.1489, "num_input_tokens_seen": 1036544, "step": 2110 }, { "epoch": 0.279134221987594, "grad_norm": 0.12438903003931046, "learning_rate": 1.8111240023164023e-06, "loss": 0.1057, "num_input_tokens_seen": 1038848, "step": 2115 }, { "epoch": 0.27979411376534247, "grad_norm": 0.17464138567447662, "learning_rate": 1.809774559646316e-06, "loss": 0.0049, "num_input_tokens_seen": 1041152, "step": 2120 }, { "epoch": 0.2804540055430909, "grad_norm": 15.5991792678833, "learning_rate": 1.8084208200063469e-06, "loss": 0.1192, "num_input_tokens_seen": 1043968, "step": 2125 }, { "epoch": 0.28111389732083936, "grad_norm": 16.062332153320312, "learning_rate": 1.8070627905799496e-06, "loss": 0.2678, "num_input_tokens_seen": 1046272, "step": 2130 }, { "epoch": 0.28177378909858786, "grad_norm": 38.3685302734375, "learning_rate": 1.8057004785733413e-06, "loss": 0.0892, "num_input_tokens_seen": 1048448, "step": 2135 }, { "epoch": 0.2824336808763363, "grad_norm": 1.2147469520568848, "learning_rate": 1.8043338912154647e-06, "loss": 0.171, "num_input_tokens_seen": 1051072, "step": 2140 }, { "epoch": 0.28309357265408475, "grad_norm": 2.57453989982605, "learning_rate": 1.8029630357579486e-06, "loss": 0.0537, "num_input_tokens_seen": 1053312, "step": 2145 }, { "epoch": 0.2837534644318332, "grad_norm": 0.08693689852952957, "learning_rate": 1.8015879194750702e-06, "loss": 0.0727, "num_input_tokens_seen": 1055680, "step": 2150 }, { "epoch": 0.28441335620958164, "grad_norm": 0.2852920889854431, "learning_rate": 1.8002085496637165e-06, "loss": 0.1279, "num_input_tokens_seen": 1057984, "step": 2155 }, { "epoch": 0.2850732479873301, "grad_norm": 0.3056880831718445, "learning_rate": 1.7988249336433448e-06, "loss": 0.1195, "num_input_tokens_seen": 1060736, "step": 2160 }, { "epoch": 0.2857331397650785, "grad_norm": 40.0366096496582, "learning_rate": 1.7974370787559447e-06, "loss": 0.1191, "num_input_tokens_seen": 1063424, "step": 2165 }, { "epoch": 0.28639303154282697, "grad_norm": 0.06313258409500122, "learning_rate": 1.796044992365999e-06, "loss": 0.0407, "num_input_tokens_seen": 1065728, "step": 2170 }, { "epoch": 0.2870529233205754, "grad_norm": 0.0497964546084404, "learning_rate": 1.794648681860444e-06, "loss": 0.0343, "num_input_tokens_seen": 1068160, "step": 2175 }, { "epoch": 0.28771281509832386, "grad_norm": 0.049737598747015, "learning_rate": 1.7932481546486312e-06, "loss": 0.2582, "num_input_tokens_seen": 1070592, "step": 2180 }, { "epoch": 0.2883727068760723, "grad_norm": 15.30761432647705, "learning_rate": 1.791843418162287e-06, "loss": 0.161, "num_input_tokens_seen": 1073280, "step": 2185 }, { "epoch": 0.28903259865382075, "grad_norm": 2.0795376300811768, "learning_rate": 1.7904344798554748e-06, "loss": 0.0127, "num_input_tokens_seen": 1075584, "step": 2190 }, { "epoch": 0.28969249043156925, "grad_norm": 2.7730255126953125, "learning_rate": 1.789021347204553e-06, "loss": 0.0962, "num_input_tokens_seen": 1078016, "step": 2195 }, { "epoch": 0.2903523822093177, "grad_norm": 42.26045608520508, "learning_rate": 1.7876040277081381e-06, "loss": 0.1665, "num_input_tokens_seen": 1080512, "step": 2200 }, { "epoch": 0.29101227398706614, "grad_norm": 49.438175201416016, "learning_rate": 1.7861825288870632e-06, "loss": 0.1979, "num_input_tokens_seen": 1082752, "step": 2205 }, { "epoch": 0.2916721657648146, "grad_norm": 19.054122924804688, "learning_rate": 1.7847568582843376e-06, "loss": 0.3436, "num_input_tokens_seen": 1085184, "step": 2210 }, { "epoch": 0.29233205754256303, "grad_norm": 13.935524940490723, "learning_rate": 1.7833270234651088e-06, "loss": 0.1458, "num_input_tokens_seen": 1087360, "step": 2215 }, { "epoch": 0.2929919493203115, "grad_norm": 12.456001281738281, "learning_rate": 1.781893032016621e-06, "loss": 0.0619, "num_input_tokens_seen": 1089984, "step": 2220 }, { "epoch": 0.2936518410980599, "grad_norm": 0.7875421643257141, "learning_rate": 1.7804548915481746e-06, "loss": 0.0185, "num_input_tokens_seen": 1092608, "step": 2225 }, { "epoch": 0.29431173287580836, "grad_norm": 0.38268232345581055, "learning_rate": 1.7790126096910865e-06, "loss": 0.1235, "num_input_tokens_seen": 1095040, "step": 2230 }, { "epoch": 0.2949716246535568, "grad_norm": 8.145883560180664, "learning_rate": 1.7775661940986492e-06, "loss": 0.064, "num_input_tokens_seen": 1097728, "step": 2235 }, { "epoch": 0.29563151643130525, "grad_norm": 4.164917469024658, "learning_rate": 1.776115652446091e-06, "loss": 0.2202, "num_input_tokens_seen": 1100096, "step": 2240 }, { "epoch": 0.2962914082090537, "grad_norm": 51.62826919555664, "learning_rate": 1.7746609924305336e-06, "loss": 0.1252, "num_input_tokens_seen": 1102400, "step": 2245 }, { "epoch": 0.29695129998680214, "grad_norm": 0.07670289278030396, "learning_rate": 1.7732022217709534e-06, "loss": 0.1016, "num_input_tokens_seen": 1104960, "step": 2250 }, { "epoch": 0.2976111917645506, "grad_norm": 9.224966049194336, "learning_rate": 1.7717393482081384e-06, "loss": 0.0905, "num_input_tokens_seen": 1107520, "step": 2255 }, { "epoch": 0.2982710835422991, "grad_norm": 26.702306747436523, "learning_rate": 1.7702723795046492e-06, "loss": 0.1454, "num_input_tokens_seen": 1109952, "step": 2260 }, { "epoch": 0.29893097532004753, "grad_norm": 0.32974258065223694, "learning_rate": 1.7688013234447757e-06, "loss": 0.0226, "num_input_tokens_seen": 1112128, "step": 2265 }, { "epoch": 0.299590867097796, "grad_norm": 0.09137266874313354, "learning_rate": 1.7673261878344973e-06, "loss": 0.1225, "num_input_tokens_seen": 1114688, "step": 2270 }, { "epoch": 0.30011878051999474, "eval_loss": 0.10979828983545303, "eval_runtime": 7.5343, "eval_samples_per_second": 893.91, "eval_steps_per_second": 111.755, "num_input_tokens_seen": 1116800, "step": 2274 }, { "epoch": 0.3002507588755444, "grad_norm": 0.13611248135566711, "learning_rate": 1.7658469805014414e-06, "loss": 0.1963, "num_input_tokens_seen": 1117248, "step": 2275 }, { "epoch": 0.30091065065329287, "grad_norm": 11.078180313110352, "learning_rate": 1.7643637092948415e-06, "loss": 0.1312, "num_input_tokens_seen": 1119808, "step": 2280 }, { "epoch": 0.3015705424310413, "grad_norm": 13.604643821716309, "learning_rate": 1.7628763820854948e-06, "loss": 0.2181, "num_input_tokens_seen": 1122112, "step": 2285 }, { "epoch": 0.30223043420878976, "grad_norm": 0.21895840764045715, "learning_rate": 1.7613850067657216e-06, "loss": 0.0905, "num_input_tokens_seen": 1124544, "step": 2290 }, { "epoch": 0.3028903259865382, "grad_norm": 0.23491473495960236, "learning_rate": 1.7598895912493232e-06, "loss": 0.0688, "num_input_tokens_seen": 1127104, "step": 2295 }, { "epoch": 0.30355021776428665, "grad_norm": 3.027573823928833, "learning_rate": 1.7583901434715397e-06, "loss": 0.0735, "num_input_tokens_seen": 1129536, "step": 2300 }, { "epoch": 0.3042101095420351, "grad_norm": 0.24895241856575012, "learning_rate": 1.7568866713890074e-06, "loss": 0.0694, "num_input_tokens_seen": 1131840, "step": 2305 }, { "epoch": 0.30487000131978353, "grad_norm": 0.47908198833465576, "learning_rate": 1.7553791829797175e-06, "loss": 0.1669, "num_input_tokens_seen": 1134336, "step": 2310 }, { "epoch": 0.305529893097532, "grad_norm": 8.449284553527832, "learning_rate": 1.7538676862429737e-06, "loss": 0.2863, "num_input_tokens_seen": 1136640, "step": 2315 }, { "epoch": 0.3061897848752805, "grad_norm": 20.579036712646484, "learning_rate": 1.7523521891993486e-06, "loss": 0.1177, "num_input_tokens_seen": 1139136, "step": 2320 }, { "epoch": 0.3068496766530289, "grad_norm": 0.1681635081768036, "learning_rate": 1.7508326998906422e-06, "loss": 0.0919, "num_input_tokens_seen": 1141568, "step": 2325 }, { "epoch": 0.30750956843077737, "grad_norm": 0.1681900918483734, "learning_rate": 1.7493092263798394e-06, "loss": 0.004, "num_input_tokens_seen": 1143936, "step": 2330 }, { "epoch": 0.3081694602085258, "grad_norm": 96.35763549804688, "learning_rate": 1.7477817767510664e-06, "loss": 0.037, "num_input_tokens_seen": 1146624, "step": 2335 }, { "epoch": 0.30882935198627426, "grad_norm": 0.5398063063621521, "learning_rate": 1.7462503591095484e-06, "loss": 0.0055, "num_input_tokens_seen": 1149120, "step": 2340 }, { "epoch": 0.3094892437640227, "grad_norm": 0.031823668628931046, "learning_rate": 1.7447149815815659e-06, "loss": 0.0421, "num_input_tokens_seen": 1151488, "step": 2345 }, { "epoch": 0.31014913554177115, "grad_norm": 0.048917245119810104, "learning_rate": 1.7431756523144126e-06, "loss": 0.1083, "num_input_tokens_seen": 1153600, "step": 2350 }, { "epoch": 0.3108090273195196, "grad_norm": 0.025062644854187965, "learning_rate": 1.7416323794763512e-06, "loss": 0.0665, "num_input_tokens_seen": 1156224, "step": 2355 }, { "epoch": 0.31146891909726804, "grad_norm": 0.01710972562432289, "learning_rate": 1.7400851712565707e-06, "loss": 0.2148, "num_input_tokens_seen": 1158656, "step": 2360 }, { "epoch": 0.3121288108750165, "grad_norm": 0.0433725044131279, "learning_rate": 1.7385340358651432e-06, "loss": 0.2065, "num_input_tokens_seen": 1161408, "step": 2365 }, { "epoch": 0.3127887026527649, "grad_norm": 119.09558868408203, "learning_rate": 1.736978981532979e-06, "loss": 0.0283, "num_input_tokens_seen": 1163904, "step": 2370 }, { "epoch": 0.31344859443051337, "grad_norm": 0.16062359511852264, "learning_rate": 1.7354200165117838e-06, "loss": 0.2238, "num_input_tokens_seen": 1166208, "step": 2375 }, { "epoch": 0.3141084862082619, "grad_norm": 226.05735778808594, "learning_rate": 1.733857149074016e-06, "loss": 0.2442, "num_input_tokens_seen": 1168512, "step": 2380 }, { "epoch": 0.3147683779860103, "grad_norm": 0.14660975337028503, "learning_rate": 1.7322903875128402e-06, "loss": 0.1859, "num_input_tokens_seen": 1171072, "step": 2385 }, { "epoch": 0.31542826976375876, "grad_norm": 0.15770386159420013, "learning_rate": 1.7307197401420858e-06, "loss": 0.0042, "num_input_tokens_seen": 1173312, "step": 2390 }, { "epoch": 0.3160881615415072, "grad_norm": 0.41774991154670715, "learning_rate": 1.7291452152962018e-06, "loss": 0.0649, "num_input_tokens_seen": 1175744, "step": 2395 }, { "epoch": 0.31674805331925565, "grad_norm": 0.3258494734764099, "learning_rate": 1.7275668213302116e-06, "loss": 0.1831, "num_input_tokens_seen": 1178112, "step": 2400 }, { "epoch": 0.3174079450970041, "grad_norm": 57.20950698852539, "learning_rate": 1.72598456661967e-06, "loss": 0.0443, "num_input_tokens_seen": 1180352, "step": 2405 }, { "epoch": 0.31806783687475254, "grad_norm": 0.16907945275306702, "learning_rate": 1.7243984595606191e-06, "loss": 0.1393, "num_input_tokens_seen": 1182528, "step": 2410 }, { "epoch": 0.318727728652501, "grad_norm": 0.5881856679916382, "learning_rate": 1.722808508569542e-06, "loss": 0.0891, "num_input_tokens_seen": 1185280, "step": 2415 }, { "epoch": 0.31938762043024943, "grad_norm": 14.51777458190918, "learning_rate": 1.72121472208332e-06, "loss": 0.0768, "num_input_tokens_seen": 1188032, "step": 2420 }, { "epoch": 0.3200475122079979, "grad_norm": 83.25537872314453, "learning_rate": 1.7196171085591864e-06, "loss": 0.2321, "num_input_tokens_seen": 1190464, "step": 2425 }, { "epoch": 0.3207074039857463, "grad_norm": 222.58181762695312, "learning_rate": 1.7180156764746824e-06, "loss": 0.2085, "num_input_tokens_seen": 1192960, "step": 2430 }, { "epoch": 0.32136729576349476, "grad_norm": 0.1936943382024765, "learning_rate": 1.7164104343276113e-06, "loss": 0.0272, "num_input_tokens_seen": 1195072, "step": 2435 }, { "epoch": 0.3220271875412432, "grad_norm": 0.019381973892450333, "learning_rate": 1.714801390635996e-06, "loss": 0.0063, "num_input_tokens_seen": 1197376, "step": 2440 }, { "epoch": 0.3226870793189917, "grad_norm": 0.04609265550971031, "learning_rate": 1.7131885539380297e-06, "loss": 0.038, "num_input_tokens_seen": 1199936, "step": 2445 }, { "epoch": 0.32334697109674015, "grad_norm": 31.58684539794922, "learning_rate": 1.7115719327920335e-06, "loss": 0.1487, "num_input_tokens_seen": 1202368, "step": 2450 }, { "epoch": 0.3240068628744886, "grad_norm": 0.015219883061945438, "learning_rate": 1.70995153577641e-06, "loss": 0.0011, "num_input_tokens_seen": 1204800, "step": 2455 }, { "epoch": 0.32466675465223704, "grad_norm": 27.071508407592773, "learning_rate": 1.7083273714895991e-06, "loss": 0.0641, "num_input_tokens_seen": 1207552, "step": 2460 }, { "epoch": 0.3253266464299855, "grad_norm": 0.03585462644696236, "learning_rate": 1.7066994485500298e-06, "loss": 0.2123, "num_input_tokens_seen": 1209856, "step": 2465 }, { "epoch": 0.32598653820773393, "grad_norm": 0.20441071689128876, "learning_rate": 1.7050677755960762e-06, "loss": 0.0982, "num_input_tokens_seen": 1212352, "step": 2470 }, { "epoch": 0.3266464299854824, "grad_norm": 99.99305725097656, "learning_rate": 1.7034323612860124e-06, "loss": 0.1048, "num_input_tokens_seen": 1214912, "step": 2475 }, { "epoch": 0.3273063217632308, "grad_norm": 0.028969811275601387, "learning_rate": 1.7017932142979645e-06, "loss": 0.0354, "num_input_tokens_seen": 1217088, "step": 2480 }, { "epoch": 0.32796621354097927, "grad_norm": 4.167091369628906, "learning_rate": 1.700150343329866e-06, "loss": 0.2006, "num_input_tokens_seen": 1219584, "step": 2485 }, { "epoch": 0.3286261053187277, "grad_norm": 30.39582633972168, "learning_rate": 1.6985037570994113e-06, "loss": 0.1335, "num_input_tokens_seen": 1222336, "step": 2490 }, { "epoch": 0.32928599709647616, "grad_norm": 0.7080073952674866, "learning_rate": 1.6968534643440088e-06, "loss": 0.0688, "num_input_tokens_seen": 1224832, "step": 2495 }, { "epoch": 0.3299458888742246, "grad_norm": 36.44700241088867, "learning_rate": 1.6951994738207364e-06, "loss": 0.1821, "num_input_tokens_seen": 1227392, "step": 2500 }, { "epoch": 0.3306057806519731, "grad_norm": 12.277377128601074, "learning_rate": 1.6935417943062928e-06, "loss": 0.2034, "num_input_tokens_seen": 1229952, "step": 2505 }, { "epoch": 0.33126567242972155, "grad_norm": 0.4052470028400421, "learning_rate": 1.6918804345969516e-06, "loss": 0.0106, "num_input_tokens_seen": 1232640, "step": 2510 }, { "epoch": 0.33192556420747, "grad_norm": 26.028976440429688, "learning_rate": 1.6902154035085156e-06, "loss": 0.0161, "num_input_tokens_seen": 1235200, "step": 2515 }, { "epoch": 0.33258545598521844, "grad_norm": 0.1962713748216629, "learning_rate": 1.688546709876269e-06, "loss": 0.0893, "num_input_tokens_seen": 1237632, "step": 2520 }, { "epoch": 0.3332453477629669, "grad_norm": 1.0130256414413452, "learning_rate": 1.6868743625549314e-06, "loss": 0.0905, "num_input_tokens_seen": 1239936, "step": 2525 }, { "epoch": 0.3339052395407153, "grad_norm": 0.6345663666725159, "learning_rate": 1.6851983704186092e-06, "loss": 0.0392, "num_input_tokens_seen": 1242304, "step": 2530 }, { "epoch": 0.33456513131846377, "grad_norm": 0.025052571669220924, "learning_rate": 1.6835187423607503e-06, "loss": 0.0036, "num_input_tokens_seen": 1244736, "step": 2535 }, { "epoch": 0.3352250230962122, "grad_norm": 0.6511954665184021, "learning_rate": 1.681835487294096e-06, "loss": 0.2003, "num_input_tokens_seen": 1247488, "step": 2540 }, { "epoch": 0.33588491487396066, "grad_norm": 0.022017456591129303, "learning_rate": 1.6801486141506342e-06, "loss": 0.2557, "num_input_tokens_seen": 1250048, "step": 2545 }, { "epoch": 0.3365448066517091, "grad_norm": 160.31907653808594, "learning_rate": 1.6784581318815514e-06, "loss": 0.3749, "num_input_tokens_seen": 1252928, "step": 2550 }, { "epoch": 0.33720469842945755, "grad_norm": 0.035775672644376755, "learning_rate": 1.6767640494571849e-06, "loss": 0.146, "num_input_tokens_seen": 1255488, "step": 2555 }, { "epoch": 0.337864590207206, "grad_norm": 25.89168357849121, "learning_rate": 1.6750663758669767e-06, "loss": 0.3346, "num_input_tokens_seen": 1257984, "step": 2560 }, { "epoch": 0.3385244819849545, "grad_norm": 0.10507479310035706, "learning_rate": 1.6733651201194245e-06, "loss": 0.1044, "num_input_tokens_seen": 1260416, "step": 2565 }, { "epoch": 0.33918437376270294, "grad_norm": 36.37501907348633, "learning_rate": 1.6716602912420342e-06, "loss": 0.0797, "num_input_tokens_seen": 1263168, "step": 2570 }, { "epoch": 0.3398442655404514, "grad_norm": 0.687891960144043, "learning_rate": 1.6699518982812726e-06, "loss": 0.1608, "num_input_tokens_seen": 1265600, "step": 2575 }, { "epoch": 0.34050415731819983, "grad_norm": 0.11499731987714767, "learning_rate": 1.6682399503025183e-06, "loss": 0.0033, "num_input_tokens_seen": 1268032, "step": 2580 }, { "epoch": 0.3411640490959483, "grad_norm": 42.86396408081055, "learning_rate": 1.666524456390014e-06, "loss": 0.1571, "num_input_tokens_seen": 1270336, "step": 2585 }, { "epoch": 0.3418239408736967, "grad_norm": 30.411161422729492, "learning_rate": 1.664805425646819e-06, "loss": 0.0566, "num_input_tokens_seen": 1273088, "step": 2590 }, { "epoch": 0.34248383265144516, "grad_norm": 0.1486613005399704, "learning_rate": 1.6630828671947606e-06, "loss": 0.2203, "num_input_tokens_seen": 1275456, "step": 2595 }, { "epoch": 0.3431437244291936, "grad_norm": 0.21017670631408691, "learning_rate": 1.6613567901743842e-06, "loss": 0.0365, "num_input_tokens_seen": 1277888, "step": 2600 }, { "epoch": 0.34380361620694205, "grad_norm": 0.2567872107028961, "learning_rate": 1.6596272037449075e-06, "loss": 0.0013, "num_input_tokens_seen": 1280384, "step": 2605 }, { "epoch": 0.3444635079846905, "grad_norm": 35.565086364746094, "learning_rate": 1.6578941170841696e-06, "loss": 0.064, "num_input_tokens_seen": 1282944, "step": 2610 }, { "epoch": 0.34512339976243894, "grad_norm": 0.281055212020874, "learning_rate": 1.6561575393885833e-06, "loss": 0.0664, "num_input_tokens_seen": 1285184, "step": 2615 }, { "epoch": 0.3457832915401874, "grad_norm": 0.0915956199169159, "learning_rate": 1.6544174798730864e-06, "loss": 0.1976, "num_input_tokens_seen": 1287808, "step": 2620 }, { "epoch": 0.34644318331793583, "grad_norm": 0.18124467134475708, "learning_rate": 1.6526739477710923e-06, "loss": 0.1552, "num_input_tokens_seen": 1290432, "step": 2625 }, { "epoch": 0.34710307509568433, "grad_norm": 0.16865764558315277, "learning_rate": 1.650926952334441e-06, "loss": 0.2257, "num_input_tokens_seen": 1292736, "step": 2630 }, { "epoch": 0.3477629668734328, "grad_norm": 0.4664243757724762, "learning_rate": 1.6491765028333516e-06, "loss": 0.2674, "num_input_tokens_seen": 1295104, "step": 2635 }, { "epoch": 0.3484228586511812, "grad_norm": 0.6427319645881653, "learning_rate": 1.6474226085563693e-06, "loss": 0.0204, "num_input_tokens_seen": 1297600, "step": 2640 }, { "epoch": 0.34908275042892967, "grad_norm": 0.10163812339305878, "learning_rate": 1.6456652788103215e-06, "loss": 0.0496, "num_input_tokens_seen": 1300224, "step": 2645 }, { "epoch": 0.3497426422066781, "grad_norm": 2.703385829925537, "learning_rate": 1.6439045229202631e-06, "loss": 0.1152, "num_input_tokens_seen": 1302528, "step": 2650 }, { "epoch": 0.3501385772733272, "eval_loss": 0.12348020076751709, "eval_runtime": 7.625, "eval_samples_per_second": 883.275, "eval_steps_per_second": 110.426, "num_input_tokens_seen": 1303872, "step": 2653 }, { "epoch": 0.35040253398442656, "grad_norm": 0.24891549348831177, "learning_rate": 1.6421403502294307e-06, "loss": 0.159, "num_input_tokens_seen": 1305024, "step": 2655 }, { "epoch": 0.351062425762175, "grad_norm": 0.2551489472389221, "learning_rate": 1.6403727700991915e-06, "loss": 0.1813, "num_input_tokens_seen": 1307392, "step": 2660 }, { "epoch": 0.35172231753992345, "grad_norm": 0.29944464564323425, "learning_rate": 1.6386017919089933e-06, "loss": 0.1581, "num_input_tokens_seen": 1310016, "step": 2665 }, { "epoch": 0.3523822093176719, "grad_norm": 0.0917045846581459, "learning_rate": 1.636827425056316e-06, "loss": 0.0066, "num_input_tokens_seen": 1312576, "step": 2670 }, { "epoch": 0.35304210109542034, "grad_norm": 0.10297297686338425, "learning_rate": 1.635049678956621e-06, "loss": 0.1432, "num_input_tokens_seen": 1315072, "step": 2675 }, { "epoch": 0.3537019928731688, "grad_norm": 32.947994232177734, "learning_rate": 1.633268563043301e-06, "loss": 0.1222, "num_input_tokens_seen": 1317504, "step": 2680 }, { "epoch": 0.3543618846509172, "grad_norm": 0.27496451139450073, "learning_rate": 1.63148408676763e-06, "loss": 0.0023, "num_input_tokens_seen": 1319680, "step": 2685 }, { "epoch": 0.3550217764286657, "grad_norm": 0.06333144754171371, "learning_rate": 1.6296962595987141e-06, "loss": 0.0014, "num_input_tokens_seen": 1322240, "step": 2690 }, { "epoch": 0.35568166820641417, "grad_norm": 1.364142894744873, "learning_rate": 1.6279050910234392e-06, "loss": 0.1142, "num_input_tokens_seen": 1324736, "step": 2695 }, { "epoch": 0.3563415599841626, "grad_norm": 0.07366377115249634, "learning_rate": 1.626110590546423e-06, "loss": 0.0407, "num_input_tokens_seen": 1327104, "step": 2700 }, { "epoch": 0.35700145176191106, "grad_norm": 134.90122985839844, "learning_rate": 1.6243127676899635e-06, "loss": 0.248, "num_input_tokens_seen": 1329920, "step": 2705 }, { "epoch": 0.3576613435396595, "grad_norm": 0.04035777971148491, "learning_rate": 1.6225116319939884e-06, "loss": 0.2153, "num_input_tokens_seen": 1332352, "step": 2710 }, { "epoch": 0.35832123531740795, "grad_norm": 75.87095642089844, "learning_rate": 1.6207071930160044e-06, "loss": 0.1084, "num_input_tokens_seen": 1335040, "step": 2715 }, { "epoch": 0.3589811270951564, "grad_norm": 0.1767151951789856, "learning_rate": 1.6188994603310468e-06, "loss": 0.0054, "num_input_tokens_seen": 1337472, "step": 2720 }, { "epoch": 0.35964101887290484, "grad_norm": 3.7952630519866943, "learning_rate": 1.617088443531628e-06, "loss": 0.1694, "num_input_tokens_seen": 1339712, "step": 2725 }, { "epoch": 0.3603009106506533, "grad_norm": 0.17187942564487457, "learning_rate": 1.6152741522276882e-06, "loss": 0.0016, "num_input_tokens_seen": 1342144, "step": 2730 }, { "epoch": 0.36096080242840173, "grad_norm": 0.7987899780273438, "learning_rate": 1.6134565960465425e-06, "loss": 0.108, "num_input_tokens_seen": 1344512, "step": 2735 }, { "epoch": 0.3616206942061502, "grad_norm": 0.12640990316867828, "learning_rate": 1.6116357846328312e-06, "loss": 0.242, "num_input_tokens_seen": 1346880, "step": 2740 }, { "epoch": 0.3622805859838986, "grad_norm": 0.04579659551382065, "learning_rate": 1.609811727648468e-06, "loss": 0.1324, "num_input_tokens_seen": 1349056, "step": 2745 }, { "epoch": 0.36294047776164706, "grad_norm": 0.21617701649665833, "learning_rate": 1.6079844347725882e-06, "loss": 0.0724, "num_input_tokens_seen": 1351488, "step": 2750 }, { "epoch": 0.36360036953939556, "grad_norm": 0.17689555883407593, "learning_rate": 1.6061539157014987e-06, "loss": 0.0532, "num_input_tokens_seen": 1353920, "step": 2755 }, { "epoch": 0.364260261317144, "grad_norm": 0.18878047168254852, "learning_rate": 1.6043201801486257e-06, "loss": 0.2916, "num_input_tokens_seen": 1356352, "step": 2760 }, { "epoch": 0.36492015309489245, "grad_norm": 1.1614915132522583, "learning_rate": 1.6024832378444628e-06, "loss": 0.2542, "num_input_tokens_seen": 1359104, "step": 2765 }, { "epoch": 0.3655800448726409, "grad_norm": 26.108612060546875, "learning_rate": 1.6006430985365204e-06, "loss": 0.2718, "num_input_tokens_seen": 1361536, "step": 2770 }, { "epoch": 0.36623993665038934, "grad_norm": 113.12171936035156, "learning_rate": 1.5987997719892735e-06, "loss": 0.2648, "num_input_tokens_seen": 1364160, "step": 2775 }, { "epoch": 0.3668998284281378, "grad_norm": 0.58730149269104, "learning_rate": 1.5969532679841088e-06, "loss": 0.0465, "num_input_tokens_seen": 1366656, "step": 2780 }, { "epoch": 0.36755972020588623, "grad_norm": 32.10945510864258, "learning_rate": 1.5951035963192752e-06, "loss": 0.0486, "num_input_tokens_seen": 1369216, "step": 2785 }, { "epoch": 0.3682196119836347, "grad_norm": 1.1091487407684326, "learning_rate": 1.593250766809829e-06, "loss": 0.2435, "num_input_tokens_seen": 1371712, "step": 2790 }, { "epoch": 0.3688795037613831, "grad_norm": 61.50751495361328, "learning_rate": 1.5913947892875842e-06, "loss": 0.1572, "num_input_tokens_seen": 1374080, "step": 2795 }, { "epoch": 0.36953939553913157, "grad_norm": 0.4279724657535553, "learning_rate": 1.589535673601059e-06, "loss": 0.1055, "num_input_tokens_seen": 1377024, "step": 2800 }, { "epoch": 0.37019928731688, "grad_norm": 42.588748931884766, "learning_rate": 1.587673429615424e-06, "loss": 0.0806, "num_input_tokens_seen": 1379392, "step": 2805 }, { "epoch": 0.37085917909462845, "grad_norm": 0.18637718260288239, "learning_rate": 1.5858080672124495e-06, "loss": 0.1468, "num_input_tokens_seen": 1381760, "step": 2810 }, { "epoch": 0.37151907087237696, "grad_norm": 0.43665367364883423, "learning_rate": 1.5839395962904536e-06, "loss": 0.0923, "num_input_tokens_seen": 1384128, "step": 2815 }, { "epoch": 0.3721789626501254, "grad_norm": 0.0831814855337143, "learning_rate": 1.5820680267642494e-06, "loss": 0.0594, "num_input_tokens_seen": 1386496, "step": 2820 }, { "epoch": 0.37283885442787384, "grad_norm": 0.25996115803718567, "learning_rate": 1.5801933685650917e-06, "loss": 0.0668, "num_input_tokens_seen": 1388736, "step": 2825 }, { "epoch": 0.3734987462056223, "grad_norm": 2.1776347160339355, "learning_rate": 1.5783156316406259e-06, "loss": 0.002, "num_input_tokens_seen": 1391040, "step": 2830 }, { "epoch": 0.37415863798337073, "grad_norm": 66.52011108398438, "learning_rate": 1.5764348259548334e-06, "loss": 0.218, "num_input_tokens_seen": 1393344, "step": 2835 }, { "epoch": 0.3748185297611192, "grad_norm": 234.61207580566406, "learning_rate": 1.5745509614879806e-06, "loss": 0.056, "num_input_tokens_seen": 1395648, "step": 2840 }, { "epoch": 0.3754784215388676, "grad_norm": 0.03497995808720589, "learning_rate": 1.572664048236564e-06, "loss": 0.2865, "num_input_tokens_seen": 1398272, "step": 2845 }, { "epoch": 0.37613831331661607, "grad_norm": 0.07777401059865952, "learning_rate": 1.570774096213259e-06, "loss": 0.0507, "num_input_tokens_seen": 1400576, "step": 2850 }, { "epoch": 0.3767982050943645, "grad_norm": 0.07564707100391388, "learning_rate": 1.5688811154468649e-06, "loss": 0.0513, "num_input_tokens_seen": 1403136, "step": 2855 }, { "epoch": 0.37745809687211296, "grad_norm": 0.08237399160861969, "learning_rate": 1.5669851159822532e-06, "loss": 0.1228, "num_input_tokens_seen": 1405504, "step": 2860 }, { "epoch": 0.3781179886498614, "grad_norm": 42.22079086303711, "learning_rate": 1.5650861078803137e-06, "loss": 0.1389, "num_input_tokens_seen": 1407808, "step": 2865 }, { "epoch": 0.37877788042760985, "grad_norm": 6.883021831512451, "learning_rate": 1.5631841012179013e-06, "loss": 0.0692, "num_input_tokens_seen": 1410304, "step": 2870 }, { "epoch": 0.37943777220535835, "grad_norm": 0.3424462676048279, "learning_rate": 1.5612791060877818e-06, "loss": 0.004, "num_input_tokens_seen": 1412736, "step": 2875 }, { "epoch": 0.3800976639831068, "grad_norm": 75.88460540771484, "learning_rate": 1.5593711325985801e-06, "loss": 0.0961, "num_input_tokens_seen": 1415488, "step": 2880 }, { "epoch": 0.38075755576085524, "grad_norm": 0.043806418776512146, "learning_rate": 1.5574601908747245e-06, "loss": 0.21, "num_input_tokens_seen": 1417856, "step": 2885 }, { "epoch": 0.3814174475386037, "grad_norm": 0.06361314654350281, "learning_rate": 1.5555462910563936e-06, "loss": 0.0664, "num_input_tokens_seen": 1420096, "step": 2890 }, { "epoch": 0.3820773393163521, "grad_norm": 25.98211097717285, "learning_rate": 1.5536294432994636e-06, "loss": 0.2344, "num_input_tokens_seen": 1422656, "step": 2895 }, { "epoch": 0.38273723109410057, "grad_norm": 92.6849594116211, "learning_rate": 1.5517096577754528e-06, "loss": 0.0884, "num_input_tokens_seen": 1425152, "step": 2900 }, { "epoch": 0.383397122871849, "grad_norm": 0.08511543273925781, "learning_rate": 1.5497869446714695e-06, "loss": 0.0623, "num_input_tokens_seen": 1427840, "step": 2905 }, { "epoch": 0.38405701464959746, "grad_norm": 0.13399949669837952, "learning_rate": 1.5478613141901558e-06, "loss": 0.0019, "num_input_tokens_seen": 1430144, "step": 2910 }, { "epoch": 0.3847169064273459, "grad_norm": 0.18390312790870667, "learning_rate": 1.5459327765496348e-06, "loss": 0.1492, "num_input_tokens_seen": 1432448, "step": 2915 }, { "epoch": 0.38537679820509435, "grad_norm": 1.747375726699829, "learning_rate": 1.5440013419834563e-06, "loss": 0.0071, "num_input_tokens_seen": 1434752, "step": 2920 }, { "epoch": 0.3860366899828428, "grad_norm": 0.4480796158313751, "learning_rate": 1.5420670207405419e-06, "loss": 0.0011, "num_input_tokens_seen": 1437184, "step": 2925 }, { "epoch": 0.38669658176059124, "grad_norm": 7.325652122497559, "learning_rate": 1.5401298230851314e-06, "loss": 0.1098, "num_input_tokens_seen": 1440000, "step": 2930 }, { "epoch": 0.3873564735383397, "grad_norm": 5.879019737243652, "learning_rate": 1.5381897592967275e-06, "loss": 0.0072, "num_input_tokens_seen": 1442624, "step": 2935 }, { "epoch": 0.3880163653160882, "grad_norm": 0.20650486648082733, "learning_rate": 1.5362468396700426e-06, "loss": 0.0702, "num_input_tokens_seen": 1445184, "step": 2940 }, { "epoch": 0.38867625709383663, "grad_norm": 22.289382934570312, "learning_rate": 1.5343010745149418e-06, "loss": 0.322, "num_input_tokens_seen": 1447616, "step": 2945 }, { "epoch": 0.3893361488715851, "grad_norm": 0.035571977496147156, "learning_rate": 1.532352474156391e-06, "loss": 0.0715, "num_input_tokens_seen": 1450176, "step": 2950 }, { "epoch": 0.3899960406493335, "grad_norm": 3.2316651344299316, "learning_rate": 1.5304010489343995e-06, "loss": 0.4706, "num_input_tokens_seen": 1452672, "step": 2955 }, { "epoch": 0.39065593242708196, "grad_norm": 0.06907609850168228, "learning_rate": 1.528446809203968e-06, "loss": 0.2238, "num_input_tokens_seen": 1455232, "step": 2960 }, { "epoch": 0.3913158242048304, "grad_norm": 82.65614318847656, "learning_rate": 1.526489765335031e-06, "loss": 0.1729, "num_input_tokens_seen": 1457792, "step": 2965 }, { "epoch": 0.39197571598257885, "grad_norm": 0.3325257897377014, "learning_rate": 1.5245299277124026e-06, "loss": 0.1528, "num_input_tokens_seen": 1460160, "step": 2970 }, { "epoch": 0.3926356077603273, "grad_norm": 0.9707848429679871, "learning_rate": 1.5225673067357218e-06, "loss": 0.1434, "num_input_tokens_seen": 1462400, "step": 2975 }, { "epoch": 0.39329549953807574, "grad_norm": 22.089210510253906, "learning_rate": 1.5206019128193981e-06, "loss": 0.1209, "num_input_tokens_seen": 1465088, "step": 2980 }, { "epoch": 0.3939553913158242, "grad_norm": 1.0957697629928589, "learning_rate": 1.5186337563925538e-06, "loss": 0.1168, "num_input_tokens_seen": 1467456, "step": 2985 }, { "epoch": 0.39461528309357263, "grad_norm": 0.22268956899642944, "learning_rate": 1.516662847898971e-06, "loss": 0.0016, "num_input_tokens_seen": 1470016, "step": 2990 }, { "epoch": 0.3952751748713211, "grad_norm": 0.2794409990310669, "learning_rate": 1.5146891977970349e-06, "loss": 0.1024, "num_input_tokens_seen": 1472448, "step": 2995 }, { "epoch": 0.3959350666490696, "grad_norm": 55.23267364501953, "learning_rate": 1.5127128165596794e-06, "loss": 0.1009, "num_input_tokens_seen": 1475072, "step": 3000 }, { "epoch": 0.396594958426818, "grad_norm": 0.32357192039489746, "learning_rate": 1.51073371467433e-06, "loss": 0.0499, "num_input_tokens_seen": 1477440, "step": 3005 }, { "epoch": 0.39725485020456647, "grad_norm": 2.3438990116119385, "learning_rate": 1.5087519026428498e-06, "loss": 0.0043, "num_input_tokens_seen": 1479872, "step": 3010 }, { "epoch": 0.3979147419823149, "grad_norm": 214.1775665283203, "learning_rate": 1.5067673909814818e-06, "loss": 0.1242, "num_input_tokens_seen": 1481920, "step": 3015 }, { "epoch": 0.39857463376006336, "grad_norm": 0.06694573163986206, "learning_rate": 1.5047801902207953e-06, "loss": 0.1901, "num_input_tokens_seen": 1484992, "step": 3020 }, { "epoch": 0.3992345255378118, "grad_norm": 37.85984802246094, "learning_rate": 1.5027903109056288e-06, "loss": 0.1508, "num_input_tokens_seen": 1487232, "step": 3025 }, { "epoch": 0.39989441731556025, "grad_norm": 22.730335235595703, "learning_rate": 1.5007977635950336e-06, "loss": 0.1615, "num_input_tokens_seen": 1489728, "step": 3030 }, { "epoch": 0.4001583740266596, "eval_loss": 0.13228875398635864, "eval_runtime": 7.7073, "eval_samples_per_second": 873.842, "eval_steps_per_second": 109.246, "num_input_tokens_seen": 1490688, "step": 3032 }, { "epoch": 0.4005543090933087, "grad_norm": 96.17181396484375, "learning_rate": 1.498802558862219e-06, "loss": 0.154, "num_input_tokens_seen": 1491968, "step": 3035 }, { "epoch": 0.40121420087105714, "grad_norm": 0.3932342231273651, "learning_rate": 1.496804707294496e-06, "loss": 0.1078, "num_input_tokens_seen": 1494336, "step": 3040 }, { "epoch": 0.4018740926488056, "grad_norm": 0.33634519577026367, "learning_rate": 1.4948042194932195e-06, "loss": 0.0599, "num_input_tokens_seen": 1497472, "step": 3045 }, { "epoch": 0.402533984426554, "grad_norm": 0.19691598415374756, "learning_rate": 1.4928011060737341e-06, "loss": 0.0399, "num_input_tokens_seen": 1499968, "step": 3050 }, { "epoch": 0.40319387620430247, "grad_norm": 0.058707304298877716, "learning_rate": 1.4907953776653171e-06, "loss": 0.0741, "num_input_tokens_seen": 1502336, "step": 3055 }, { "epoch": 0.40385376798205097, "grad_norm": 17.177833557128906, "learning_rate": 1.4887870449111206e-06, "loss": 0.1581, "num_input_tokens_seen": 1504576, "step": 3060 }, { "epoch": 0.4045136597597994, "grad_norm": 0.7955127954483032, "learning_rate": 1.486776118468118e-06, "loss": 0.1605, "num_input_tokens_seen": 1507136, "step": 3065 }, { "epoch": 0.40517355153754786, "grad_norm": 0.5847259163856506, "learning_rate": 1.4847626090070451e-06, "loss": 0.0716, "num_input_tokens_seen": 1509696, "step": 3070 }, { "epoch": 0.4058334433152963, "grad_norm": 0.25745320320129395, "learning_rate": 1.4827465272123439e-06, "loss": 0.299, "num_input_tokens_seen": 1512192, "step": 3075 }, { "epoch": 0.40649333509304475, "grad_norm": 0.3554550111293793, "learning_rate": 1.4807278837821063e-06, "loss": 0.0453, "num_input_tokens_seen": 1514752, "step": 3080 }, { "epoch": 0.4071532268707932, "grad_norm": 12.156785011291504, "learning_rate": 1.4787066894280178e-06, "loss": 0.2992, "num_input_tokens_seen": 1517440, "step": 3085 }, { "epoch": 0.40781311864854164, "grad_norm": 0.10129724442958832, "learning_rate": 1.476682954875299e-06, "loss": 0.0637, "num_input_tokens_seen": 1519744, "step": 3090 }, { "epoch": 0.4084730104262901, "grad_norm": 84.23600769042969, "learning_rate": 1.4746566908626506e-06, "loss": 0.0773, "num_input_tokens_seen": 1522176, "step": 3095 }, { "epoch": 0.40913290220403853, "grad_norm": 1.9050307273864746, "learning_rate": 1.4726279081421956e-06, "loss": 0.0516, "num_input_tokens_seen": 1524352, "step": 3100 }, { "epoch": 0.409792793981787, "grad_norm": 35.056800842285156, "learning_rate": 1.4705966174794216e-06, "loss": 0.2317, "num_input_tokens_seen": 1526976, "step": 3105 }, { "epoch": 0.4104526857595354, "grad_norm": 0.22622281312942505, "learning_rate": 1.4685628296531248e-06, "loss": 0.1563, "num_input_tokens_seen": 1529152, "step": 3110 }, { "epoch": 0.41111257753728386, "grad_norm": 1.48894202709198, "learning_rate": 1.466526555455352e-06, "loss": 0.051, "num_input_tokens_seen": 1531648, "step": 3115 }, { "epoch": 0.4117724693150323, "grad_norm": 0.444116473197937, "learning_rate": 1.4644878056913432e-06, "loss": 0.0057, "num_input_tokens_seen": 1533952, "step": 3120 }, { "epoch": 0.4124323610927808, "grad_norm": 48.74332046508789, "learning_rate": 1.4624465911794764e-06, "loss": 0.1887, "num_input_tokens_seen": 1536640, "step": 3125 }, { "epoch": 0.41309225287052925, "grad_norm": 0.06482608616352081, "learning_rate": 1.4604029227512062e-06, "loss": 0.0053, "num_input_tokens_seen": 1539200, "step": 3130 }, { "epoch": 0.4137521446482777, "grad_norm": 81.11097717285156, "learning_rate": 1.4583568112510108e-06, "loss": 0.1908, "num_input_tokens_seen": 1541632, "step": 3135 }, { "epoch": 0.41441203642602614, "grad_norm": 12.146714210510254, "learning_rate": 1.4563082675363302e-06, "loss": 0.0965, "num_input_tokens_seen": 1544128, "step": 3140 }, { "epoch": 0.4150719282037746, "grad_norm": 0.2594153583049774, "learning_rate": 1.4542573024775122e-06, "loss": 0.0228, "num_input_tokens_seen": 1546368, "step": 3145 }, { "epoch": 0.41573181998152303, "grad_norm": 4.159293174743652, "learning_rate": 1.4522039269577521e-06, "loss": 0.2984, "num_input_tokens_seen": 1548736, "step": 3150 }, { "epoch": 0.4163917117592715, "grad_norm": 0.10340887308120728, "learning_rate": 1.4501481518730372e-06, "loss": 0.2461, "num_input_tokens_seen": 1551168, "step": 3155 }, { "epoch": 0.4170516035370199, "grad_norm": 0.2676301598548889, "learning_rate": 1.4480899881320868e-06, "loss": 0.0719, "num_input_tokens_seen": 1553664, "step": 3160 }, { "epoch": 0.41771149531476837, "grad_norm": 25.496265411376953, "learning_rate": 1.4460294466562956e-06, "loss": 0.1771, "num_input_tokens_seen": 1555968, "step": 3165 }, { "epoch": 0.4183713870925168, "grad_norm": 0.47720712423324585, "learning_rate": 1.4439665383796756e-06, "loss": 0.0399, "num_input_tokens_seen": 1558208, "step": 3170 }, { "epoch": 0.41903127887026526, "grad_norm": 2.1485588550567627, "learning_rate": 1.4419012742487972e-06, "loss": 0.0054, "num_input_tokens_seen": 1560640, "step": 3175 }, { "epoch": 0.4196911706480137, "grad_norm": 5.430055618286133, "learning_rate": 1.4398336652227335e-06, "loss": 0.095, "num_input_tokens_seen": 1563328, "step": 3180 }, { "epoch": 0.4203510624257622, "grad_norm": 0.05566899850964546, "learning_rate": 1.4377637222729986e-06, "loss": 0.1201, "num_input_tokens_seen": 1565696, "step": 3185 }, { "epoch": 0.42101095420351065, "grad_norm": 0.08947694301605225, "learning_rate": 1.435691456383493e-06, "loss": 0.1675, "num_input_tokens_seen": 1568640, "step": 3190 }, { "epoch": 0.4216708459812591, "grad_norm": 2.342318058013916, "learning_rate": 1.433616878550442e-06, "loss": 0.1212, "num_input_tokens_seen": 1571328, "step": 3195 }, { "epoch": 0.42233073775900754, "grad_norm": 18.465282440185547, "learning_rate": 1.4315399997823403e-06, "loss": 0.3175, "num_input_tokens_seen": 1574016, "step": 3200 }, { "epoch": 0.422990629536756, "grad_norm": 12.997380256652832, "learning_rate": 1.429460831099891e-06, "loss": 0.2534, "num_input_tokens_seen": 1576384, "step": 3205 }, { "epoch": 0.4236505213145044, "grad_norm": 0.08205987513065338, "learning_rate": 1.4273793835359492e-06, "loss": 0.2136, "num_input_tokens_seen": 1579200, "step": 3210 }, { "epoch": 0.42431041309225287, "grad_norm": 66.97320556640625, "learning_rate": 1.4252956681354631e-06, "loss": 0.0964, "num_input_tokens_seen": 1581632, "step": 3215 }, { "epoch": 0.4249703048700013, "grad_norm": 0.7273184657096863, "learning_rate": 1.4232096959554135e-06, "loss": 0.0035, "num_input_tokens_seen": 1584064, "step": 3220 }, { "epoch": 0.42563019664774976, "grad_norm": 65.00259399414062, "learning_rate": 1.4211214780647572e-06, "loss": 0.0297, "num_input_tokens_seen": 1586752, "step": 3225 }, { "epoch": 0.4262900884254982, "grad_norm": 9.714056968688965, "learning_rate": 1.4190310255443676e-06, "loss": 0.0918, "num_input_tokens_seen": 1589248, "step": 3230 }, { "epoch": 0.42694998020324665, "grad_norm": 0.03953593969345093, "learning_rate": 1.4169383494869764e-06, "loss": 0.0286, "num_input_tokens_seen": 1591552, "step": 3235 }, { "epoch": 0.4276098719809951, "grad_norm": 117.95477294921875, "learning_rate": 1.414843460997113e-06, "loss": 0.0616, "num_input_tokens_seen": 1594048, "step": 3240 }, { "epoch": 0.4282697637587436, "grad_norm": 17.138263702392578, "learning_rate": 1.4127463711910483e-06, "loss": 0.1517, "num_input_tokens_seen": 1596544, "step": 3245 }, { "epoch": 0.42892965553649204, "grad_norm": 5.194220542907715, "learning_rate": 1.410647091196733e-06, "loss": 0.1214, "num_input_tokens_seen": 1599104, "step": 3250 }, { "epoch": 0.4295895473142405, "grad_norm": 0.02321782521903515, "learning_rate": 1.4085456321537402e-06, "loss": 0.124, "num_input_tokens_seen": 1601344, "step": 3255 }, { "epoch": 0.43024943909198893, "grad_norm": 10.903656005859375, "learning_rate": 1.4064420052132056e-06, "loss": 0.1022, "num_input_tokens_seen": 1603968, "step": 3260 }, { "epoch": 0.4309093308697374, "grad_norm": 75.95123291015625, "learning_rate": 1.4043362215377696e-06, "loss": 0.078, "num_input_tokens_seen": 1606400, "step": 3265 }, { "epoch": 0.4315692226474858, "grad_norm": 0.12190647423267365, "learning_rate": 1.4022282923015158e-06, "loss": 0.1095, "num_input_tokens_seen": 1608960, "step": 3270 }, { "epoch": 0.43222911442523426, "grad_norm": 0.8287085294723511, "learning_rate": 1.4001182286899136e-06, "loss": 0.0042, "num_input_tokens_seen": 1611456, "step": 3275 }, { "epoch": 0.4328890062029827, "grad_norm": 0.0886739045381546, "learning_rate": 1.398006041899758e-06, "loss": 0.0458, "num_input_tokens_seen": 1613952, "step": 3280 }, { "epoch": 0.43354889798073115, "grad_norm": 27.18416404724121, "learning_rate": 1.3958917431391102e-06, "loss": 0.1192, "num_input_tokens_seen": 1616320, "step": 3285 }, { "epoch": 0.4342087897584796, "grad_norm": 0.13577166199684143, "learning_rate": 1.3937753436272388e-06, "loss": 0.1763, "num_input_tokens_seen": 1619136, "step": 3290 }, { "epoch": 0.43486868153622804, "grad_norm": 431.9822082519531, "learning_rate": 1.3916568545945597e-06, "loss": 0.0483, "num_input_tokens_seen": 1621632, "step": 3295 }, { "epoch": 0.4355285733139765, "grad_norm": 0.2625204920768738, "learning_rate": 1.3895362872825764e-06, "loss": 0.1352, "num_input_tokens_seen": 1624064, "step": 3300 }, { "epoch": 0.43618846509172493, "grad_norm": 0.5975183844566345, "learning_rate": 1.3874136529438205e-06, "loss": 0.1454, "num_input_tokens_seen": 1626496, "step": 3305 }, { "epoch": 0.43684835686947343, "grad_norm": 9.573996543884277, "learning_rate": 1.3852889628417918e-06, "loss": 0.0691, "num_input_tokens_seen": 1628800, "step": 3310 }, { "epoch": 0.4375082486472219, "grad_norm": 2.738884925842285, "learning_rate": 1.3831622282508994e-06, "loss": 0.0967, "num_input_tokens_seen": 1631232, "step": 3315 }, { "epoch": 0.4381681404249703, "grad_norm": 0.1655990183353424, "learning_rate": 1.3810334604564007e-06, "loss": 0.0018, "num_input_tokens_seen": 1633728, "step": 3320 }, { "epoch": 0.43882803220271877, "grad_norm": 0.21200844645500183, "learning_rate": 1.3789026707543423e-06, "loss": 0.0695, "num_input_tokens_seen": 1636224, "step": 3325 }, { "epoch": 0.4394879239804672, "grad_norm": 0.12617841362953186, "learning_rate": 1.3767698704514998e-06, "loss": 0.0631, "num_input_tokens_seen": 1638272, "step": 3330 }, { "epoch": 0.44014781575821565, "grad_norm": 0.025392625480890274, "learning_rate": 1.3746350708653175e-06, "loss": 0.1898, "num_input_tokens_seen": 1640512, "step": 3335 }, { "epoch": 0.4408077075359641, "grad_norm": 51.78602981567383, "learning_rate": 1.3724982833238495e-06, "loss": 0.1903, "num_input_tokens_seen": 1642944, "step": 3340 }, { "epoch": 0.44146759931371254, "grad_norm": 0.11096933484077454, "learning_rate": 1.370359519165697e-06, "loss": 0.0559, "num_input_tokens_seen": 1645376, "step": 3345 }, { "epoch": 0.442127491091461, "grad_norm": 259.23699951171875, "learning_rate": 1.368218789739952e-06, "loss": 0.0108, "num_input_tokens_seen": 1647936, "step": 3350 }, { "epoch": 0.44278738286920943, "grad_norm": 0.37444016337394714, "learning_rate": 1.3660761064061337e-06, "loss": 0.065, "num_input_tokens_seen": 1650496, "step": 3355 }, { "epoch": 0.4434472746469579, "grad_norm": 0.05476607382297516, "learning_rate": 1.3639314805341297e-06, "loss": 0.0935, "num_input_tokens_seen": 1652992, "step": 3360 }, { "epoch": 0.4441071664247063, "grad_norm": 0.11798688024282455, "learning_rate": 1.3617849235041355e-06, "loss": 0.0665, "num_input_tokens_seen": 1655488, "step": 3365 }, { "epoch": 0.4447670582024548, "grad_norm": 0.04145582392811775, "learning_rate": 1.3596364467065938e-06, "loss": 0.1599, "num_input_tokens_seen": 1657984, "step": 3370 }, { "epoch": 0.44542694998020327, "grad_norm": 90.30973052978516, "learning_rate": 1.3574860615421346e-06, "loss": 0.229, "num_input_tokens_seen": 1660736, "step": 3375 }, { "epoch": 0.4460868417579517, "grad_norm": 12.61612319946289, "learning_rate": 1.3553337794215147e-06, "loss": 0.192, "num_input_tokens_seen": 1663104, "step": 3380 }, { "epoch": 0.44674673353570016, "grad_norm": 75.10413360595703, "learning_rate": 1.3531796117655565e-06, "loss": 0.0766, "num_input_tokens_seen": 1665344, "step": 3385 }, { "epoch": 0.4474066253134486, "grad_norm": 30.948253631591797, "learning_rate": 1.3510235700050873e-06, "loss": 0.1651, "num_input_tokens_seen": 1668096, "step": 3390 }, { "epoch": 0.44806651709119705, "grad_norm": 22.553556442260742, "learning_rate": 1.3488656655808801e-06, "loss": 0.0679, "num_input_tokens_seen": 1670272, "step": 3395 }, { "epoch": 0.4487264088689455, "grad_norm": 1.1050207614898682, "learning_rate": 1.3467059099435912e-06, "loss": 0.0905, "num_input_tokens_seen": 1672448, "step": 3400 }, { "epoch": 0.44938630064669394, "grad_norm": 0.16898778080940247, "learning_rate": 1.3445443145537002e-06, "loss": 0.0608, "num_input_tokens_seen": 1675200, "step": 3405 }, { "epoch": 0.4500461924244424, "grad_norm": 1.0715267658233643, "learning_rate": 1.3423808908814494e-06, "loss": 0.0698, "num_input_tokens_seen": 1677696, "step": 3410 }, { "epoch": 0.45017817077999206, "eval_loss": 0.1182408258318901, "eval_runtime": 7.6199, "eval_samples_per_second": 883.874, "eval_steps_per_second": 110.501, "num_input_tokens_seen": 1678208, "step": 3411 }, { "epoch": 0.4507060842021908, "grad_norm": 14.29131031036377, "learning_rate": 1.3402156504067826e-06, "loss": 0.0969, "num_input_tokens_seen": 1680256, "step": 3415 }, { "epoch": 0.45136597597993927, "grad_norm": 0.1442999541759491, "learning_rate": 1.338048604619284e-06, "loss": 0.1191, "num_input_tokens_seen": 1682624, "step": 3420 }, { "epoch": 0.4520258677576877, "grad_norm": 33.37054443359375, "learning_rate": 1.3358797650181178e-06, "loss": 0.0365, "num_input_tokens_seen": 1685056, "step": 3425 }, { "epoch": 0.45268575953543616, "grad_norm": 132.64529418945312, "learning_rate": 1.3337091431119662e-06, "loss": 0.1349, "num_input_tokens_seen": 1687168, "step": 3430 }, { "epoch": 0.45334565131318466, "grad_norm": 168.06629943847656, "learning_rate": 1.3315367504189698e-06, "loss": 0.3197, "num_input_tokens_seen": 1689216, "step": 3435 }, { "epoch": 0.4540055430909331, "grad_norm": 86.57543182373047, "learning_rate": 1.3293625984666656e-06, "loss": 0.0946, "num_input_tokens_seen": 1691776, "step": 3440 }, { "epoch": 0.45466543486868155, "grad_norm": 0.10748296976089478, "learning_rate": 1.3271866987919254e-06, "loss": 0.0012, "num_input_tokens_seen": 1694336, "step": 3445 }, { "epoch": 0.45532532664643, "grad_norm": 0.3375436067581177, "learning_rate": 1.325009062940895e-06, "loss": 0.2113, "num_input_tokens_seen": 1696640, "step": 3450 }, { "epoch": 0.45598521842417844, "grad_norm": 15.320273399353027, "learning_rate": 1.3228297024689336e-06, "loss": 0.0765, "num_input_tokens_seen": 1698880, "step": 3455 }, { "epoch": 0.4566451102019269, "grad_norm": 23.91095733642578, "learning_rate": 1.3206486289405519e-06, "loss": 0.1025, "num_input_tokens_seen": 1701312, "step": 3460 }, { "epoch": 0.45730500197967533, "grad_norm": 44.923030853271484, "learning_rate": 1.3184658539293496e-06, "loss": 0.1407, "num_input_tokens_seen": 1703808, "step": 3465 }, { "epoch": 0.4579648937574238, "grad_norm": 65.6329116821289, "learning_rate": 1.3162813890179564e-06, "loss": 0.125, "num_input_tokens_seen": 1706304, "step": 3470 }, { "epoch": 0.4586247855351722, "grad_norm": 12.479512214660645, "learning_rate": 1.314095245797969e-06, "loss": 0.3138, "num_input_tokens_seen": 1708736, "step": 3475 }, { "epoch": 0.45928467731292066, "grad_norm": 0.6768988370895386, "learning_rate": 1.3119074358698891e-06, "loss": 0.1379, "num_input_tokens_seen": 1711232, "step": 3480 }, { "epoch": 0.4599445690906691, "grad_norm": 0.6303845047950745, "learning_rate": 1.3097179708430634e-06, "loss": 0.0039, "num_input_tokens_seen": 1713600, "step": 3485 }, { "epoch": 0.46060446086841755, "grad_norm": 0.1511518806219101, "learning_rate": 1.3075268623356214e-06, "loss": 0.2013, "num_input_tokens_seen": 1716224, "step": 3490 }, { "epoch": 0.46126435264616605, "grad_norm": 34.9669189453125, "learning_rate": 1.305334121974412e-06, "loss": 0.1515, "num_input_tokens_seen": 1718720, "step": 3495 }, { "epoch": 0.4619242444239145, "grad_norm": 46.562442779541016, "learning_rate": 1.3031397613949448e-06, "loss": 0.1062, "num_input_tokens_seen": 1721280, "step": 3500 }, { "epoch": 0.46258413620166294, "grad_norm": 93.35523986816406, "learning_rate": 1.3009437922413266e-06, "loss": 0.0727, "num_input_tokens_seen": 1723712, "step": 3505 }, { "epoch": 0.4632440279794114, "grad_norm": 87.05264282226562, "learning_rate": 1.2987462261661994e-06, "loss": 0.0932, "num_input_tokens_seen": 1725952, "step": 3510 }, { "epoch": 0.46390391975715983, "grad_norm": 58.2432975769043, "learning_rate": 1.2965470748306798e-06, "loss": 0.0048, "num_input_tokens_seen": 1728512, "step": 3515 }, { "epoch": 0.4645638115349083, "grad_norm": 9.179746627807617, "learning_rate": 1.2943463499042957e-06, "loss": 0.094, "num_input_tokens_seen": 1731008, "step": 3520 }, { "epoch": 0.4652237033126567, "grad_norm": 0.5701031684875488, "learning_rate": 1.2921440630649257e-06, "loss": 0.1567, "num_input_tokens_seen": 1733696, "step": 3525 }, { "epoch": 0.46588359509040517, "grad_norm": 245.243408203125, "learning_rate": 1.2899402259987355e-06, "loss": 0.0778, "num_input_tokens_seen": 1736256, "step": 3530 }, { "epoch": 0.4665434868681536, "grad_norm": 0.34011900424957275, "learning_rate": 1.287734850400118e-06, "loss": 0.2758, "num_input_tokens_seen": 1738944, "step": 3535 }, { "epoch": 0.46720337864590206, "grad_norm": 19.37761116027832, "learning_rate": 1.2855279479716297e-06, "loss": 0.1846, "num_input_tokens_seen": 1741568, "step": 3540 }, { "epoch": 0.4678632704236505, "grad_norm": 0.1848049759864807, "learning_rate": 1.283319530423929e-06, "loss": 0.0017, "num_input_tokens_seen": 1743808, "step": 3545 }, { "epoch": 0.46852316220139895, "grad_norm": 0.10032381117343903, "learning_rate": 1.2811096094757144e-06, "loss": 0.0026, "num_input_tokens_seen": 1746176, "step": 3550 }, { "epoch": 0.46918305397914745, "grad_norm": 0.09643909335136414, "learning_rate": 1.2788981968536612e-06, "loss": 0.1779, "num_input_tokens_seen": 1748608, "step": 3555 }, { "epoch": 0.4698429457568959, "grad_norm": 0.24367760121822357, "learning_rate": 1.2766853042923607e-06, "loss": 0.1046, "num_input_tokens_seen": 1751040, "step": 3560 }, { "epoch": 0.47050283753464434, "grad_norm": 1.557897686958313, "learning_rate": 1.2744709435342573e-06, "loss": 0.0626, "num_input_tokens_seen": 1753280, "step": 3565 }, { "epoch": 0.4711627293123928, "grad_norm": 13.281846046447754, "learning_rate": 1.2722551263295864e-06, "loss": 0.2856, "num_input_tokens_seen": 1755712, "step": 3570 }, { "epoch": 0.4718226210901412, "grad_norm": 53.76845169067383, "learning_rate": 1.2700378644363114e-06, "loss": 0.1173, "num_input_tokens_seen": 1757952, "step": 3575 }, { "epoch": 0.47248251286788967, "grad_norm": 23.442663192749023, "learning_rate": 1.2678191696200621e-06, "loss": 0.0951, "num_input_tokens_seen": 1760384, "step": 3580 }, { "epoch": 0.4731424046456381, "grad_norm": 0.13637100160121918, "learning_rate": 1.2655990536540717e-06, "loss": 0.0029, "num_input_tokens_seen": 1762944, "step": 3585 }, { "epoch": 0.47380229642338656, "grad_norm": 36.00935363769531, "learning_rate": 1.2633775283191144e-06, "loss": 0.275, "num_input_tokens_seen": 1765504, "step": 3590 }, { "epoch": 0.474462188201135, "grad_norm": 0.4418662190437317, "learning_rate": 1.2611546054034436e-06, "loss": 0.0527, "num_input_tokens_seen": 1768128, "step": 3595 }, { "epoch": 0.47512207997888345, "grad_norm": 0.2341255098581314, "learning_rate": 1.2589302967027285e-06, "loss": 0.1554, "num_input_tokens_seen": 1770624, "step": 3600 }, { "epoch": 0.4757819717566319, "grad_norm": 23.149660110473633, "learning_rate": 1.2567046140199914e-06, "loss": 0.2221, "num_input_tokens_seen": 1773248, "step": 3605 }, { "epoch": 0.47644186353438034, "grad_norm": 1.1026215553283691, "learning_rate": 1.2544775691655463e-06, "loss": 0.0267, "num_input_tokens_seen": 1775488, "step": 3610 }, { "epoch": 0.4771017553121288, "grad_norm": 0.24849441647529602, "learning_rate": 1.2522491739569346e-06, "loss": 0.1329, "num_input_tokens_seen": 1777792, "step": 3615 }, { "epoch": 0.4777616470898773, "grad_norm": 1.301603078842163, "learning_rate": 1.250019440218864e-06, "loss": 0.0942, "num_input_tokens_seen": 1780352, "step": 3620 }, { "epoch": 0.47842153886762573, "grad_norm": 0.6911696195602417, "learning_rate": 1.247788379783144e-06, "loss": 0.1692, "num_input_tokens_seen": 1783168, "step": 3625 }, { "epoch": 0.4790814306453742, "grad_norm": 97.18595123291016, "learning_rate": 1.2455560044886248e-06, "loss": 0.0503, "num_input_tokens_seen": 1785920, "step": 3630 }, { "epoch": 0.4797413224231226, "grad_norm": 0.041064053773880005, "learning_rate": 1.2433223261811337e-06, "loss": 0.1104, "num_input_tokens_seen": 1788416, "step": 3635 }, { "epoch": 0.48040121420087106, "grad_norm": 0.06536306440830231, "learning_rate": 1.2410873567134115e-06, "loss": 0.0317, "num_input_tokens_seen": 1790848, "step": 3640 }, { "epoch": 0.4810611059786195, "grad_norm": 2.3887031078338623, "learning_rate": 1.238851107945051e-06, "loss": 0.0394, "num_input_tokens_seen": 1793280, "step": 3645 }, { "epoch": 0.48172099775636795, "grad_norm": 0.03385510668158531, "learning_rate": 1.2366135917424341e-06, "loss": 0.1043, "num_input_tokens_seen": 1795648, "step": 3650 }, { "epoch": 0.4823808895341164, "grad_norm": 23.26211929321289, "learning_rate": 1.2343748199786665e-06, "loss": 0.183, "num_input_tokens_seen": 1797952, "step": 3655 }, { "epoch": 0.48304078131186484, "grad_norm": 0.2056346982717514, "learning_rate": 1.2321348045335182e-06, "loss": 0.0865, "num_input_tokens_seen": 1800192, "step": 3660 }, { "epoch": 0.4837006730896133, "grad_norm": 0.4568115174770355, "learning_rate": 1.2298935572933575e-06, "loss": 0.1479, "num_input_tokens_seen": 1802560, "step": 3665 }, { "epoch": 0.48436056486736173, "grad_norm": 23.873966217041016, "learning_rate": 1.2276510901510892e-06, "loss": 0.1646, "num_input_tokens_seen": 1805056, "step": 3670 }, { "epoch": 0.4850204566451102, "grad_norm": 2.0380196571350098, "learning_rate": 1.2254074150060915e-06, "loss": 0.1443, "num_input_tokens_seen": 1807744, "step": 3675 }, { "epoch": 0.4856803484228587, "grad_norm": 56.635318756103516, "learning_rate": 1.2231625437641535e-06, "loss": 0.0999, "num_input_tokens_seen": 1810368, "step": 3680 }, { "epoch": 0.4863402402006071, "grad_norm": 0.2982792258262634, "learning_rate": 1.2209164883374096e-06, "loss": 0.0791, "num_input_tokens_seen": 1813056, "step": 3685 }, { "epoch": 0.48700013197835557, "grad_norm": 0.19904585182666779, "learning_rate": 1.2186692606442793e-06, "loss": 0.2265, "num_input_tokens_seen": 1815360, "step": 3690 }, { "epoch": 0.487660023756104, "grad_norm": 144.61109924316406, "learning_rate": 1.216420872609402e-06, "loss": 0.1958, "num_input_tokens_seen": 1817920, "step": 3695 }, { "epoch": 0.48831991553385246, "grad_norm": 12.121625900268555, "learning_rate": 1.2141713361635739e-06, "loss": 0.0936, "num_input_tokens_seen": 1820288, "step": 3700 }, { "epoch": 0.4889798073116009, "grad_norm": 0.04935774579644203, "learning_rate": 1.2119206632436864e-06, "loss": 0.157, "num_input_tokens_seen": 1822656, "step": 3705 }, { "epoch": 0.48963969908934935, "grad_norm": 0.5263445973396301, "learning_rate": 1.209668865792661e-06, "loss": 0.116, "num_input_tokens_seen": 1824832, "step": 3710 }, { "epoch": 0.4902995908670978, "grad_norm": 35.05288314819336, "learning_rate": 1.207415955759385e-06, "loss": 0.0906, "num_input_tokens_seen": 1827200, "step": 3715 }, { "epoch": 0.49095948264484623, "grad_norm": 10.884110450744629, "learning_rate": 1.2051619450986514e-06, "loss": 0.1443, "num_input_tokens_seen": 1829632, "step": 3720 }, { "epoch": 0.4916193744225947, "grad_norm": 1.7360846996307373, "learning_rate": 1.2029068457710923e-06, "loss": 0.076, "num_input_tokens_seen": 1832192, "step": 3725 }, { "epoch": 0.4922792662003431, "grad_norm": 3.593554973602295, "learning_rate": 1.200650669743117e-06, "loss": 0.1089, "num_input_tokens_seen": 1834752, "step": 3730 }, { "epoch": 0.49293915797809157, "grad_norm": 24.667346954345703, "learning_rate": 1.1983934289868488e-06, "loss": 0.0533, "num_input_tokens_seen": 1837248, "step": 3735 }, { "epoch": 0.49359904975584007, "grad_norm": 40.43445587158203, "learning_rate": 1.1961351354800595e-06, "loss": 0.2063, "num_input_tokens_seen": 1839680, "step": 3740 }, { "epoch": 0.4942589415335885, "grad_norm": 0.25334975123405457, "learning_rate": 1.193875801206109e-06, "loss": 0.1478, "num_input_tokens_seen": 1842304, "step": 3745 }, { "epoch": 0.49491883331133696, "grad_norm": 0.46043312549591064, "learning_rate": 1.1916154381538786e-06, "loss": 0.0398, "num_input_tokens_seen": 1844480, "step": 3750 }, { "epoch": 0.4955787250890854, "grad_norm": 0.318348228931427, "learning_rate": 1.1893540583177083e-06, "loss": 0.1799, "num_input_tokens_seen": 1846912, "step": 3755 }, { "epoch": 0.49623861686683385, "grad_norm": 13.051739692687988, "learning_rate": 1.187091673697335e-06, "loss": 0.0861, "num_input_tokens_seen": 1849024, "step": 3760 }, { "epoch": 0.4968985086445823, "grad_norm": 0.8000279068946838, "learning_rate": 1.184828296297826e-06, "loss": 0.0693, "num_input_tokens_seen": 1851712, "step": 3765 }, { "epoch": 0.49755840042233074, "grad_norm": 26.590360641479492, "learning_rate": 1.182563938129518e-06, "loss": 0.074, "num_input_tokens_seen": 1854208, "step": 3770 }, { "epoch": 0.4982182922000792, "grad_norm": 0.07655533403158188, "learning_rate": 1.1802986112079507e-06, "loss": 0.0972, "num_input_tokens_seen": 1856704, "step": 3775 }, { "epoch": 0.4988781839778276, "grad_norm": 2.7111520767211914, "learning_rate": 1.1780323275538056e-06, "loss": 0.0812, "num_input_tokens_seen": 1858944, "step": 3780 }, { "epoch": 0.49953807575557607, "grad_norm": 2.1287126541137695, "learning_rate": 1.1757650991928393e-06, "loss": 0.2014, "num_input_tokens_seen": 1861696, "step": 3785 }, { "epoch": 0.5001979675333246, "grad_norm": 0.28718459606170654, "learning_rate": 1.1734969381558235e-06, "loss": 0.3465, "num_input_tokens_seen": 1864128, "step": 3790 }, { "epoch": 0.5001979675333246, "eval_loss": 0.13253989815711975, "eval_runtime": 7.6606, "eval_samples_per_second": 879.171, "eval_steps_per_second": 109.913, "num_input_tokens_seen": 1864128, "step": 3790 }, { "epoch": 0.500857859311073, "grad_norm": 0.05410047248005867, "learning_rate": 1.1712278564784774e-06, "loss": 0.0012, "num_input_tokens_seen": 1866432, "step": 3795 }, { "epoch": 0.5015177510888215, "grad_norm": 50.43254089355469, "learning_rate": 1.1689578662014064e-06, "loss": 0.071, "num_input_tokens_seen": 1868736, "step": 3800 }, { "epoch": 0.5021776428665699, "grad_norm": 10.290699005126953, "learning_rate": 1.1666869793700362e-06, "loss": 0.2416, "num_input_tokens_seen": 1871360, "step": 3805 }, { "epoch": 0.5028375346443184, "grad_norm": 0.025802727788686752, "learning_rate": 1.1644152080345515e-06, "loss": 0.0019, "num_input_tokens_seen": 1873536, "step": 3810 }, { "epoch": 0.5034974264220667, "grad_norm": 32.99125289916992, "learning_rate": 1.1621425642498289e-06, "loss": 0.2788, "num_input_tokens_seen": 1875904, "step": 3815 }, { "epoch": 0.5041573181998152, "grad_norm": 88.0829849243164, "learning_rate": 1.1598690600753759e-06, "loss": 0.2056, "num_input_tokens_seen": 1878464, "step": 3820 }, { "epoch": 0.5048172099775636, "grad_norm": 69.9671630859375, "learning_rate": 1.1575947075752644e-06, "loss": 0.2253, "num_input_tokens_seen": 1880640, "step": 3825 }, { "epoch": 0.5054771017553121, "grad_norm": 16.678607940673828, "learning_rate": 1.1553195188180691e-06, "loss": 0.1243, "num_input_tokens_seen": 1882944, "step": 3830 }, { "epoch": 0.5061369935330606, "grad_norm": 0.3082711398601532, "learning_rate": 1.1530435058768008e-06, "loss": 0.0629, "num_input_tokens_seen": 1885248, "step": 3835 }, { "epoch": 0.506796885310809, "grad_norm": 16.876184463500977, "learning_rate": 1.150766680828845e-06, "loss": 0.0576, "num_input_tokens_seen": 1887872, "step": 3840 }, { "epoch": 0.5074567770885575, "grad_norm": 11.138367652893066, "learning_rate": 1.1484890557558955e-06, "loss": 0.004, "num_input_tokens_seen": 1890560, "step": 3845 }, { "epoch": 0.5081166688663059, "grad_norm": 11.504974365234375, "learning_rate": 1.146210642743892e-06, "loss": 0.0781, "num_input_tokens_seen": 1893056, "step": 3850 }, { "epoch": 0.5087765606440544, "grad_norm": 0.10916353017091751, "learning_rate": 1.1439314538829554e-06, "loss": 0.0498, "num_input_tokens_seen": 1895360, "step": 3855 }, { "epoch": 0.5094364524218028, "grad_norm": 0.09748303145170212, "learning_rate": 1.141651501267323e-06, "loss": 0.0617, "num_input_tokens_seen": 1897664, "step": 3860 }, { "epoch": 0.5100963441995513, "grad_norm": 126.38017272949219, "learning_rate": 1.1393707969952847e-06, "loss": 0.1711, "num_input_tokens_seen": 1900288, "step": 3865 }, { "epoch": 0.5107562359772997, "grad_norm": 139.21932983398438, "learning_rate": 1.13708935316912e-06, "loss": 0.1191, "num_input_tokens_seen": 1903040, "step": 3870 }, { "epoch": 0.5114161277550482, "grad_norm": 2.1678948402404785, "learning_rate": 1.134807181895032e-06, "loss": 0.0025, "num_input_tokens_seen": 1905472, "step": 3875 }, { "epoch": 0.5120760195327966, "grad_norm": 75.74095916748047, "learning_rate": 1.132524295283084e-06, "loss": 0.1253, "num_input_tokens_seen": 1907712, "step": 3880 }, { "epoch": 0.5127359113105451, "grad_norm": 0.061001695692539215, "learning_rate": 1.1302407054471355e-06, "loss": 0.0096, "num_input_tokens_seen": 1910080, "step": 3885 }, { "epoch": 0.5133958030882935, "grad_norm": 64.87725067138672, "learning_rate": 1.1279564245047767e-06, "loss": 0.2717, "num_input_tokens_seen": 1912512, "step": 3890 }, { "epoch": 0.514055694866042, "grad_norm": 0.10021132230758667, "learning_rate": 1.1256714645772662e-06, "loss": 0.0696, "num_input_tokens_seen": 1914752, "step": 3895 }, { "epoch": 0.5147155866437905, "grad_norm": 0.13533316552639008, "learning_rate": 1.1233858377894647e-06, "loss": 0.0073, "num_input_tokens_seen": 1917120, "step": 3900 }, { "epoch": 0.5153754784215389, "grad_norm": 72.85858917236328, "learning_rate": 1.1210995562697722e-06, "loss": 0.0094, "num_input_tokens_seen": 1919232, "step": 3905 }, { "epoch": 0.5160353701992874, "grad_norm": 20.26717758178711, "learning_rate": 1.1188126321500621e-06, "loss": 0.0061, "num_input_tokens_seen": 1921856, "step": 3910 }, { "epoch": 0.5166952619770357, "grad_norm": 105.2000732421875, "learning_rate": 1.1165250775656188e-06, "loss": 0.1091, "num_input_tokens_seen": 1924224, "step": 3915 }, { "epoch": 0.5173551537547842, "grad_norm": 0.042006537318229675, "learning_rate": 1.1142369046550708e-06, "loss": 0.0258, "num_input_tokens_seen": 1926464, "step": 3920 }, { "epoch": 0.5180150455325326, "grad_norm": 0.038001008331775665, "learning_rate": 1.1119481255603289e-06, "loss": 0.253, "num_input_tokens_seen": 1928896, "step": 3925 }, { "epoch": 0.5186749373102811, "grad_norm": 3.7172634601593018, "learning_rate": 1.1096587524265197e-06, "loss": 0.0598, "num_input_tokens_seen": 1931200, "step": 3930 }, { "epoch": 0.5193348290880295, "grad_norm": 0.01534217782318592, "learning_rate": 1.107368797401923e-06, "loss": 0.1918, "num_input_tokens_seen": 1933632, "step": 3935 }, { "epoch": 0.519994720865778, "grad_norm": 0.24886855483055115, "learning_rate": 1.1050782726379054e-06, "loss": 0.0022, "num_input_tokens_seen": 1935872, "step": 3940 }, { "epoch": 0.5206546126435264, "grad_norm": 0.19433605670928955, "learning_rate": 1.1027871902888566e-06, "loss": 0.104, "num_input_tokens_seen": 1938048, "step": 3945 }, { "epoch": 0.5213145044212749, "grad_norm": 46.62074661254883, "learning_rate": 1.1004955625121257e-06, "loss": 0.059, "num_input_tokens_seen": 1940608, "step": 3950 }, { "epoch": 0.5219743961990233, "grad_norm": 4.506015777587891, "learning_rate": 1.0982034014679561e-06, "loss": 0.2127, "num_input_tokens_seen": 1943040, "step": 3955 }, { "epoch": 0.5226342879767718, "grad_norm": 1.7702916860580444, "learning_rate": 1.0959107193194206e-06, "loss": 0.279, "num_input_tokens_seen": 1945664, "step": 3960 }, { "epoch": 0.5232941797545203, "grad_norm": 0.04471131041646004, "learning_rate": 1.0936175282323575e-06, "loss": 0.0022, "num_input_tokens_seen": 1948032, "step": 3965 }, { "epoch": 0.5239540715322687, "grad_norm": 96.3348617553711, "learning_rate": 1.091323840375305e-06, "loss": 0.0235, "num_input_tokens_seen": 1950208, "step": 3970 }, { "epoch": 0.5246139633100172, "grad_norm": 46.00945281982422, "learning_rate": 1.0890296679194378e-06, "loss": 0.2217, "num_input_tokens_seen": 1952896, "step": 3975 }, { "epoch": 0.5252738550877656, "grad_norm": 0.07118234783411026, "learning_rate": 1.086735023038502e-06, "loss": 0.0466, "num_input_tokens_seen": 1955200, "step": 3980 }, { "epoch": 0.5259337468655141, "grad_norm": 0.24527551233768463, "learning_rate": 1.0844399179087512e-06, "loss": 0.0765, "num_input_tokens_seen": 1957376, "step": 3985 }, { "epoch": 0.5265936386432625, "grad_norm": 0.29095086455345154, "learning_rate": 1.0821443647088802e-06, "loss": 0.2646, "num_input_tokens_seen": 1960064, "step": 3990 }, { "epoch": 0.527253530421011, "grad_norm": 0.09518618881702423, "learning_rate": 1.0798483756199623e-06, "loss": 0.1166, "num_input_tokens_seen": 1962624, "step": 3995 }, { "epoch": 0.5279134221987594, "grad_norm": 0.0448361411690712, "learning_rate": 1.0775519628253833e-06, "loss": 0.0901, "num_input_tokens_seen": 1965056, "step": 4000 }, { "epoch": 0.5285733139765079, "grad_norm": 0.339200496673584, "learning_rate": 1.0752551385107772e-06, "loss": 0.1363, "num_input_tokens_seen": 1967424, "step": 4005 }, { "epoch": 0.5292332057542563, "grad_norm": 12.845752716064453, "learning_rate": 1.0729579148639621e-06, "loss": 0.1608, "num_input_tokens_seen": 1969856, "step": 4010 }, { "epoch": 0.5298930975320048, "grad_norm": 0.18415102362632751, "learning_rate": 1.0706603040748747e-06, "loss": 0.0527, "num_input_tokens_seen": 1972544, "step": 4015 }, { "epoch": 0.5305529893097533, "grad_norm": 0.05650022253394127, "learning_rate": 1.0683623183355071e-06, "loss": 0.0851, "num_input_tokens_seen": 1974912, "step": 4020 }, { "epoch": 0.5312128810875016, "grad_norm": 13.724897384643555, "learning_rate": 1.0660639698398392e-06, "loss": 0.0918, "num_input_tokens_seen": 1977216, "step": 4025 }, { "epoch": 0.5318727728652501, "grad_norm": 4.000504970550537, "learning_rate": 1.0637652707837773e-06, "loss": 0.069, "num_input_tokens_seen": 1979648, "step": 4030 }, { "epoch": 0.5325326646429985, "grad_norm": 63.9135627746582, "learning_rate": 1.0614662333650876e-06, "loss": 0.0788, "num_input_tokens_seen": 1981888, "step": 4035 }, { "epoch": 0.533192556420747, "grad_norm": 15.316259384155273, "learning_rate": 1.0591668697833311e-06, "loss": 0.199, "num_input_tokens_seen": 1984448, "step": 4040 }, { "epoch": 0.5338524481984954, "grad_norm": 31.211254119873047, "learning_rate": 1.0568671922398005e-06, "loss": 0.1948, "num_input_tokens_seen": 1987072, "step": 4045 }, { "epoch": 0.5345123399762439, "grad_norm": 0.47070229053497314, "learning_rate": 1.054567212937454e-06, "loss": 0.1732, "num_input_tokens_seen": 1989632, "step": 4050 }, { "epoch": 0.5351722317539923, "grad_norm": 0.44888266921043396, "learning_rate": 1.0522669440808508e-06, "loss": 0.0482, "num_input_tokens_seen": 1992192, "step": 4055 }, { "epoch": 0.5358321235317408, "grad_norm": 1.2094718217849731, "learning_rate": 1.0499663978760871e-06, "loss": 0.2351, "num_input_tokens_seen": 1994624, "step": 4060 }, { "epoch": 0.5364920153094892, "grad_norm": 9.957518577575684, "learning_rate": 1.0476655865307308e-06, "loss": 0.0567, "num_input_tokens_seen": 1997056, "step": 4065 }, { "epoch": 0.5371519070872377, "grad_norm": 0.34155920147895813, "learning_rate": 1.0453645222537556e-06, "loss": 0.0665, "num_input_tokens_seen": 1999360, "step": 4070 }, { "epoch": 0.5378117988649861, "grad_norm": 111.1448974609375, "learning_rate": 1.0430632172554796e-06, "loss": 0.0719, "num_input_tokens_seen": 2001856, "step": 4075 }, { "epoch": 0.5384716906427346, "grad_norm": 36.95001220703125, "learning_rate": 1.0407616837474963e-06, "loss": 0.1029, "num_input_tokens_seen": 2004288, "step": 4080 }, { "epoch": 0.5391315824204831, "grad_norm": 1.12558114528656, "learning_rate": 1.038459933942612e-06, "loss": 0.0145, "num_input_tokens_seen": 2006976, "step": 4085 }, { "epoch": 0.5397914741982315, "grad_norm": 11.313764572143555, "learning_rate": 1.036157980054782e-06, "loss": 0.0129, "num_input_tokens_seen": 2009280, "step": 4090 }, { "epoch": 0.54045136597598, "grad_norm": 168.36546325683594, "learning_rate": 1.0338558342990431e-06, "loss": 0.0985, "num_input_tokens_seen": 2011776, "step": 4095 }, { "epoch": 0.5411112577537284, "grad_norm": 0.4781351089477539, "learning_rate": 1.0315535088914508e-06, "loss": 0.2285, "num_input_tokens_seen": 2014336, "step": 4100 }, { "epoch": 0.5417711495314769, "grad_norm": 33.78492736816406, "learning_rate": 1.0292510160490146e-06, "loss": 0.1558, "num_input_tokens_seen": 2017152, "step": 4105 }, { "epoch": 0.5424310413092253, "grad_norm": 17.072744369506836, "learning_rate": 1.0269483679896308e-06, "loss": 0.1097, "num_input_tokens_seen": 2019520, "step": 4110 }, { "epoch": 0.5430909330869738, "grad_norm": 32.48529052734375, "learning_rate": 1.0246455769320211e-06, "loss": 0.164, "num_input_tokens_seen": 2021632, "step": 4115 }, { "epoch": 0.5437508248647221, "grad_norm": 1.9809108972549438, "learning_rate": 1.0223426550956647e-06, "loss": 0.1157, "num_input_tokens_seen": 2023744, "step": 4120 }, { "epoch": 0.5444107166424706, "grad_norm": 2.384786367416382, "learning_rate": 1.0200396147007354e-06, "loss": 0.06, "num_input_tokens_seen": 2026048, "step": 4125 }, { "epoch": 0.545070608420219, "grad_norm": 0.03345398232340813, "learning_rate": 1.0177364679680367e-06, "loss": 0.1203, "num_input_tokens_seen": 2028352, "step": 4130 }, { "epoch": 0.5457305001979675, "grad_norm": 0.08933035284280777, "learning_rate": 1.015433227118935e-06, "loss": 0.0494, "num_input_tokens_seen": 2030848, "step": 4135 }, { "epoch": 0.5463903919757159, "grad_norm": 0.14635981619358063, "learning_rate": 1.0131299043752967e-06, "loss": 0.1369, "num_input_tokens_seen": 2033344, "step": 4140 }, { "epoch": 0.5470502837534644, "grad_norm": 0.22250190377235413, "learning_rate": 1.0108265119594233e-06, "loss": 0.0777, "num_input_tokens_seen": 2035584, "step": 4145 }, { "epoch": 0.5477101755312129, "grad_norm": 13.305469512939453, "learning_rate": 1.0085230620939853e-06, "loss": 0.0407, "num_input_tokens_seen": 2038272, "step": 4150 }, { "epoch": 0.5483700673089613, "grad_norm": 11.508169174194336, "learning_rate": 1.0062195670019583e-06, "loss": 0.0956, "num_input_tokens_seen": 2040768, "step": 4155 }, { "epoch": 0.5490299590867098, "grad_norm": 114.46903991699219, "learning_rate": 1.0039160389065582e-06, "loss": 0.1461, "num_input_tokens_seen": 2043072, "step": 4160 }, { "epoch": 0.5496898508644582, "grad_norm": 9.968348503112793, "learning_rate": 1.0016124900311755e-06, "loss": 0.1538, "num_input_tokens_seen": 2045824, "step": 4165 }, { "epoch": 0.550217764286657, "eval_loss": 0.0976191833615303, "eval_runtime": 7.5976, "eval_samples_per_second": 886.459, "eval_steps_per_second": 110.824, "num_input_tokens_seen": 2047552, "step": 4169 }, { "epoch": 0.5503497426422067, "grad_norm": 24.443077087402344, "learning_rate": 9.99308932599311e-07, "loss": 0.233, "num_input_tokens_seen": 2048064, "step": 4170 }, { "epoch": 0.5510096344199551, "grad_norm": 0.5319744944572449, "learning_rate": 9.970053788345112e-07, "loss": 0.0557, "num_input_tokens_seen": 2050432, "step": 4175 }, { "epoch": 0.5516695261977036, "grad_norm": 0.8921132683753967, "learning_rate": 9.947018409603036e-07, "loss": 0.0547, "num_input_tokens_seen": 2052928, "step": 4180 }, { "epoch": 0.552329417975452, "grad_norm": 0.3344038724899292, "learning_rate": 9.923983312001304e-07, "loss": 0.0658, "num_input_tokens_seen": 2055424, "step": 4185 }, { "epoch": 0.5529893097532005, "grad_norm": 0.5421162843704224, "learning_rate": 9.900948617772846e-07, "loss": 0.1874, "num_input_tokens_seen": 2057536, "step": 4190 }, { "epoch": 0.5536492015309489, "grad_norm": 43.32229995727539, "learning_rate": 9.877914449148462e-07, "loss": 0.1518, "num_input_tokens_seen": 2059840, "step": 4195 }, { "epoch": 0.5543090933086974, "grad_norm": 87.34823608398438, "learning_rate": 9.854880928356157e-07, "loss": 0.2201, "num_input_tokens_seen": 2062656, "step": 4200 }, { "epoch": 0.5549689850864459, "grad_norm": 0.3885681629180908, "learning_rate": 9.831848177620493e-07, "loss": 0.22, "num_input_tokens_seen": 2064960, "step": 4205 }, { "epoch": 0.5556288768641943, "grad_norm": 18.198888778686523, "learning_rate": 9.808816319161961e-07, "loss": 0.2685, "num_input_tokens_seen": 2067008, "step": 4210 }, { "epoch": 0.5562887686419428, "grad_norm": 0.18500889837741852, "learning_rate": 9.785785475196298e-07, "loss": 0.0021, "num_input_tokens_seen": 2069696, "step": 4215 }, { "epoch": 0.5569486604196912, "grad_norm": 1.4052083492279053, "learning_rate": 9.76275576793387e-07, "loss": 0.0054, "num_input_tokens_seen": 2072320, "step": 4220 }, { "epoch": 0.5576085521974397, "grad_norm": 1.9056949615478516, "learning_rate": 9.739727319579007e-07, "loss": 0.0023, "num_input_tokens_seen": 2074752, "step": 4225 }, { "epoch": 0.558268443975188, "grad_norm": 1.0958954095840454, "learning_rate": 9.716700252329361e-07, "loss": 0.0678, "num_input_tokens_seen": 2077440, "step": 4230 }, { "epoch": 0.5589283357529365, "grad_norm": 20.575729370117188, "learning_rate": 9.693674688375254e-07, "loss": 0.2046, "num_input_tokens_seen": 2080000, "step": 4235 }, { "epoch": 0.5595882275306849, "grad_norm": 0.2594149708747864, "learning_rate": 9.67065074989903e-07, "loss": 0.1257, "num_input_tokens_seen": 2082560, "step": 4240 }, { "epoch": 0.5602481193084334, "grad_norm": 36.21245193481445, "learning_rate": 9.647628559074415e-07, "loss": 0.0827, "num_input_tokens_seen": 2084864, "step": 4245 }, { "epoch": 0.5609080110861818, "grad_norm": 0.03890296071767807, "learning_rate": 9.62460823806585e-07, "loss": 0.1167, "num_input_tokens_seen": 2087424, "step": 4250 }, { "epoch": 0.5615679028639303, "grad_norm": 4.345874786376953, "learning_rate": 9.601589909027857e-07, "loss": 0.2136, "num_input_tokens_seen": 2090048, "step": 4255 }, { "epoch": 0.5622277946416787, "grad_norm": 0.06426483392715454, "learning_rate": 9.578573694104394e-07, "loss": 0.0795, "num_input_tokens_seen": 2092416, "step": 4260 }, { "epoch": 0.5628876864194272, "grad_norm": 5.784552097320557, "learning_rate": 9.555559715428199e-07, "loss": 0.0455, "num_input_tokens_seen": 2094656, "step": 4265 }, { "epoch": 0.5635475781971757, "grad_norm": 0.20891836285591125, "learning_rate": 9.532548095120134e-07, "loss": 0.0031, "num_input_tokens_seen": 2097024, "step": 4270 }, { "epoch": 0.5642074699749241, "grad_norm": 0.08341825008392334, "learning_rate": 9.509538955288564e-07, "loss": 0.0884, "num_input_tokens_seen": 2099392, "step": 4275 }, { "epoch": 0.5648673617526726, "grad_norm": 0.749411940574646, "learning_rate": 9.486532418028672e-07, "loss": 0.0815, "num_input_tokens_seen": 2102016, "step": 4280 }, { "epoch": 0.565527253530421, "grad_norm": 25.93520164489746, "learning_rate": 9.463528605421844e-07, "loss": 0.117, "num_input_tokens_seen": 2104320, "step": 4285 }, { "epoch": 0.5661871453081695, "grad_norm": 45.35911178588867, "learning_rate": 9.440527639535004e-07, "loss": 0.0795, "num_input_tokens_seen": 2107136, "step": 4290 }, { "epoch": 0.5668470370859179, "grad_norm": 0.20163391530513763, "learning_rate": 9.417529642419971e-07, "loss": 0.0935, "num_input_tokens_seen": 2109888, "step": 4295 }, { "epoch": 0.5675069288636664, "grad_norm": 24.672039031982422, "learning_rate": 9.394534736112815e-07, "loss": 0.1225, "num_input_tokens_seen": 2112192, "step": 4300 }, { "epoch": 0.5681668206414148, "grad_norm": 0.07875992357730865, "learning_rate": 9.371543042633192e-07, "loss": 0.1277, "num_input_tokens_seen": 2114752, "step": 4305 }, { "epoch": 0.5688267124191633, "grad_norm": 0.11948826909065247, "learning_rate": 9.348554683983722e-07, "loss": 0.1616, "num_input_tokens_seen": 2117184, "step": 4310 }, { "epoch": 0.5694866041969117, "grad_norm": 0.17669005692005157, "learning_rate": 9.325569782149323e-07, "loss": 0.0485, "num_input_tokens_seen": 2119552, "step": 4315 }, { "epoch": 0.5701464959746602, "grad_norm": 18.713947296142578, "learning_rate": 9.302588459096574e-07, "loss": 0.0897, "num_input_tokens_seen": 2121920, "step": 4320 }, { "epoch": 0.5708063877524086, "grad_norm": 8.844649314880371, "learning_rate": 9.279610836773064e-07, "loss": 0.1948, "num_input_tokens_seen": 2124096, "step": 4325 }, { "epoch": 0.571466279530157, "grad_norm": 62.913169860839844, "learning_rate": 9.256637037106735e-07, "loss": 0.0979, "num_input_tokens_seen": 2126528, "step": 4330 }, { "epoch": 0.5721261713079056, "grad_norm": 35.835323333740234, "learning_rate": 9.233667182005259e-07, "loss": 0.0585, "num_input_tokens_seen": 2128576, "step": 4335 }, { "epoch": 0.5727860630856539, "grad_norm": 236.8058319091797, "learning_rate": 9.210701393355361e-07, "loss": 0.1142, "num_input_tokens_seen": 2130688, "step": 4340 }, { "epoch": 0.5734459548634024, "grad_norm": 0.6673513650894165, "learning_rate": 9.187739793022198e-07, "loss": 0.1147, "num_input_tokens_seen": 2133312, "step": 4345 }, { "epoch": 0.5741058466411508, "grad_norm": 0.05369502305984497, "learning_rate": 9.164782502848702e-07, "loss": 0.0315, "num_input_tokens_seen": 2135680, "step": 4350 }, { "epoch": 0.5747657384188993, "grad_norm": 0.035501688718795776, "learning_rate": 9.141829644654936e-07, "loss": 0.2153, "num_input_tokens_seen": 2138112, "step": 4355 }, { "epoch": 0.5754256301966477, "grad_norm": 7.459763526916504, "learning_rate": 9.118881340237432e-07, "loss": 0.3872, "num_input_tokens_seen": 2140352, "step": 4360 }, { "epoch": 0.5760855219743962, "grad_norm": 0.08102209866046906, "learning_rate": 9.095937711368573e-07, "loss": 0.0637, "num_input_tokens_seen": 2143040, "step": 4365 }, { "epoch": 0.5767454137521446, "grad_norm": 0.06749647855758667, "learning_rate": 9.072998879795923e-07, "loss": 0.1285, "num_input_tokens_seen": 2145280, "step": 4370 }, { "epoch": 0.5774053055298931, "grad_norm": 51.86709976196289, "learning_rate": 9.050064967241596e-07, "loss": 0.0807, "num_input_tokens_seen": 2147904, "step": 4375 }, { "epoch": 0.5780651973076415, "grad_norm": 0.10375242680311203, "learning_rate": 9.027136095401598e-07, "loss": 0.0728, "num_input_tokens_seen": 2150400, "step": 4380 }, { "epoch": 0.57872508908539, "grad_norm": 0.2877858281135559, "learning_rate": 9.004212385945187e-07, "loss": 0.1274, "num_input_tokens_seen": 2153088, "step": 4385 }, { "epoch": 0.5793849808631385, "grad_norm": 0.05926657095551491, "learning_rate": 8.981293960514233e-07, "loss": 0.0495, "num_input_tokens_seen": 2155776, "step": 4390 }, { "epoch": 0.5800448726408869, "grad_norm": 1.292005181312561, "learning_rate": 8.958380940722564e-07, "loss": 0.1366, "num_input_tokens_seen": 2158400, "step": 4395 }, { "epoch": 0.5807047644186354, "grad_norm": 0.3705070912837982, "learning_rate": 8.935473448155326e-07, "loss": 0.0731, "num_input_tokens_seen": 2160704, "step": 4400 }, { "epoch": 0.5813646561963838, "grad_norm": 26.712739944458008, "learning_rate": 8.912571604368324e-07, "loss": 0.0423, "num_input_tokens_seen": 2163200, "step": 4405 }, { "epoch": 0.5820245479741323, "grad_norm": 68.38367462158203, "learning_rate": 8.889675530887404e-07, "loss": 0.1252, "num_input_tokens_seen": 2165376, "step": 4410 }, { "epoch": 0.5826844397518807, "grad_norm": 0.06487785279750824, "learning_rate": 8.866785349207786e-07, "loss": 0.131, "num_input_tokens_seen": 2167808, "step": 4415 }, { "epoch": 0.5833443315296292, "grad_norm": 15.265974044799805, "learning_rate": 8.843901180793423e-07, "loss": 0.1223, "num_input_tokens_seen": 2170112, "step": 4420 }, { "epoch": 0.5840042233073776, "grad_norm": 1.6116943359375, "learning_rate": 8.821023147076362e-07, "loss": 0.001, "num_input_tokens_seen": 2172480, "step": 4425 }, { "epoch": 0.5846641150851261, "grad_norm": 4.275770664215088, "learning_rate": 8.798151369456098e-07, "loss": 0.0822, "num_input_tokens_seen": 2175104, "step": 4430 }, { "epoch": 0.5853240068628744, "grad_norm": 12.192449569702148, "learning_rate": 8.775285969298931e-07, "loss": 0.0803, "num_input_tokens_seen": 2177280, "step": 4435 }, { "epoch": 0.585983898640623, "grad_norm": 0.0718933716416359, "learning_rate": 8.752427067937312e-07, "loss": 0.0628, "num_input_tokens_seen": 2179776, "step": 4440 }, { "epoch": 0.5866437904183713, "grad_norm": 0.020002318546175957, "learning_rate": 8.729574786669214e-07, "loss": 0.0845, "num_input_tokens_seen": 2182400, "step": 4445 }, { "epoch": 0.5873036821961198, "grad_norm": 0.39394357800483704, "learning_rate": 8.706729246757477e-07, "loss": 0.06, "num_input_tokens_seen": 2185088, "step": 4450 }, { "epoch": 0.5879635739738683, "grad_norm": 1.8858518600463867, "learning_rate": 8.683890569429173e-07, "loss": 0.0725, "num_input_tokens_seen": 2187776, "step": 4455 }, { "epoch": 0.5886234657516167, "grad_norm": 0.07854912430047989, "learning_rate": 8.661058875874956e-07, "loss": 0.0027, "num_input_tokens_seen": 2190016, "step": 4460 }, { "epoch": 0.5892833575293652, "grad_norm": 0.09435324370861053, "learning_rate": 8.638234287248423e-07, "loss": 0.0013, "num_input_tokens_seen": 2192320, "step": 4465 }, { "epoch": 0.5899432493071136, "grad_norm": 44.07099533081055, "learning_rate": 8.615416924665464e-07, "loss": 0.0578, "num_input_tokens_seen": 2194752, "step": 4470 }, { "epoch": 0.5906031410848621, "grad_norm": 0.29922375082969666, "learning_rate": 8.592606909203629e-07, "loss": 0.0962, "num_input_tokens_seen": 2197056, "step": 4475 }, { "epoch": 0.5912630328626105, "grad_norm": 0.052084218710660934, "learning_rate": 8.569804361901485e-07, "loss": 0.0401, "num_input_tokens_seen": 2199296, "step": 4480 }, { "epoch": 0.591922924640359, "grad_norm": 59.697113037109375, "learning_rate": 8.547009403757963e-07, "loss": 0.4233, "num_input_tokens_seen": 2201664, "step": 4485 }, { "epoch": 0.5925828164181074, "grad_norm": 16.623720169067383, "learning_rate": 8.524222155731731e-07, "loss": 0.1601, "num_input_tokens_seen": 2204288, "step": 4490 }, { "epoch": 0.5932427081958559, "grad_norm": 82.14921569824219, "learning_rate": 8.501442738740538e-07, "loss": 0.1259, "num_input_tokens_seen": 2206528, "step": 4495 }, { "epoch": 0.5939025999736043, "grad_norm": 0.7616731524467468, "learning_rate": 8.47867127366058e-07, "loss": 0.0636, "num_input_tokens_seen": 2209024, "step": 4500 }, { "epoch": 0.5945624917513528, "grad_norm": 0.1041426807641983, "learning_rate": 8.455907881325858e-07, "loss": 0.0027, "num_input_tokens_seen": 2211584, "step": 4505 }, { "epoch": 0.5952223835291012, "grad_norm": 1.8390711545944214, "learning_rate": 8.433152682527533e-07, "loss": 0.1052, "num_input_tokens_seen": 2213952, "step": 4510 }, { "epoch": 0.5958822753068497, "grad_norm": 0.08113599568605423, "learning_rate": 8.410405798013298e-07, "loss": 0.0747, "num_input_tokens_seen": 2216192, "step": 4515 }, { "epoch": 0.5965421670845982, "grad_norm": 16.143348693847656, "learning_rate": 8.387667348486712e-07, "loss": 0.0035, "num_input_tokens_seen": 2218688, "step": 4520 }, { "epoch": 0.5972020588623466, "grad_norm": 135.14500427246094, "learning_rate": 8.364937454606585e-07, "loss": 0.1296, "num_input_tokens_seen": 2220928, "step": 4525 }, { "epoch": 0.5978619506400951, "grad_norm": 12.444659233093262, "learning_rate": 8.342216236986329e-07, "loss": 0.0014, "num_input_tokens_seen": 2223360, "step": 4530 }, { "epoch": 0.5985218424178435, "grad_norm": 0.052838534116744995, "learning_rate": 8.319503816193305e-07, "loss": 0.1463, "num_input_tokens_seen": 2225792, "step": 4535 }, { "epoch": 0.599181734195592, "grad_norm": 29.65154457092285, "learning_rate": 8.296800312748206e-07, "loss": 0.1496, "num_input_tokens_seen": 2228288, "step": 4540 }, { "epoch": 0.5998416259733403, "grad_norm": 1.4917051792144775, "learning_rate": 8.274105847124404e-07, "loss": 0.1911, "num_input_tokens_seen": 2230848, "step": 4545 }, { "epoch": 0.6002375610399895, "eval_loss": 0.11496574431657791, "eval_runtime": 7.6571, "eval_samples_per_second": 879.582, "eval_steps_per_second": 109.964, "num_input_tokens_seen": 2232448, "step": 4548 }, { "epoch": 0.6005015177510888, "grad_norm": 35.08987808227539, "learning_rate": 8.251420539747311e-07, "loss": 0.1187, "num_input_tokens_seen": 2233472, "step": 4550 }, { "epoch": 0.6011614095288372, "grad_norm": 0.22071610391139984, "learning_rate": 8.228744510993742e-07, "loss": 0.1799, "num_input_tokens_seen": 2236096, "step": 4555 }, { "epoch": 0.6018213013065857, "grad_norm": 0.21558649837970734, "learning_rate": 8.206077881191274e-07, "loss": 0.0908, "num_input_tokens_seen": 2238720, "step": 4560 }, { "epoch": 0.6024811930843341, "grad_norm": 24.909807205200195, "learning_rate": 8.183420770617614e-07, "loss": 0.1394, "num_input_tokens_seen": 2241216, "step": 4565 }, { "epoch": 0.6031410848620826, "grad_norm": 2.2823469638824463, "learning_rate": 8.160773299499955e-07, "loss": 0.0631, "num_input_tokens_seen": 2243648, "step": 4570 }, { "epoch": 0.6038009766398311, "grad_norm": 1.838703989982605, "learning_rate": 8.138135588014339e-07, "loss": 0.0464, "num_input_tokens_seen": 2246080, "step": 4575 }, { "epoch": 0.6044608684175795, "grad_norm": 22.0809268951416, "learning_rate": 8.115507756285017e-07, "loss": 0.0632, "num_input_tokens_seen": 2248256, "step": 4580 }, { "epoch": 0.605120760195328, "grad_norm": 0.09841513633728027, "learning_rate": 8.092889924383819e-07, "loss": 0.1037, "num_input_tokens_seen": 2250688, "step": 4585 }, { "epoch": 0.6057806519730764, "grad_norm": 3.10756516456604, "learning_rate": 8.070282212329508e-07, "loss": 0.0775, "num_input_tokens_seen": 2253120, "step": 4590 }, { "epoch": 0.6064405437508249, "grad_norm": 170.31297302246094, "learning_rate": 8.047684740087156e-07, "loss": 0.22, "num_input_tokens_seen": 2255360, "step": 4595 }, { "epoch": 0.6071004355285733, "grad_norm": 46.11749267578125, "learning_rate": 8.025097627567481e-07, "loss": 0.1834, "num_input_tokens_seen": 2257728, "step": 4600 }, { "epoch": 0.6077603273063218, "grad_norm": 0.045084141194820404, "learning_rate": 8.002520994626247e-07, "loss": 0.0712, "num_input_tokens_seen": 2260224, "step": 4605 }, { "epoch": 0.6084202190840702, "grad_norm": 0.1346772313117981, "learning_rate": 7.979954961063596e-07, "loss": 0.0733, "num_input_tokens_seen": 2262912, "step": 4610 }, { "epoch": 0.6090801108618187, "grad_norm": 18.890954971313477, "learning_rate": 7.957399646623436e-07, "loss": 0.3433, "num_input_tokens_seen": 2265152, "step": 4615 }, { "epoch": 0.6097400026395671, "grad_norm": 0.26090413331985474, "learning_rate": 7.934855170992788e-07, "loss": 0.042, "num_input_tokens_seen": 2267968, "step": 4620 }, { "epoch": 0.6103998944173156, "grad_norm": 0.09057987481355667, "learning_rate": 7.912321653801161e-07, "loss": 0.0468, "num_input_tokens_seen": 2270336, "step": 4625 }, { "epoch": 0.611059786195064, "grad_norm": 19.550853729248047, "learning_rate": 7.889799214619919e-07, "loss": 0.1865, "num_input_tokens_seen": 2273024, "step": 4630 }, { "epoch": 0.6117196779728125, "grad_norm": 0.048422493040561676, "learning_rate": 7.867287972961629e-07, "loss": 0.0821, "num_input_tokens_seen": 2275264, "step": 4635 }, { "epoch": 0.612379569750561, "grad_norm": 0.2724073529243469, "learning_rate": 7.844788048279453e-07, "loss": 0.0704, "num_input_tokens_seen": 2277888, "step": 4640 }, { "epoch": 0.6130394615283093, "grad_norm": 0.041433185338974, "learning_rate": 7.822299559966494e-07, "loss": 0.0007, "num_input_tokens_seen": 2280320, "step": 4645 }, { "epoch": 0.6136993533060578, "grad_norm": 0.03420973941683769, "learning_rate": 7.799822627355171e-07, "loss": 0.0591, "num_input_tokens_seen": 2282560, "step": 4650 }, { "epoch": 0.6143592450838062, "grad_norm": 0.13017447292804718, "learning_rate": 7.77735736971659e-07, "loss": 0.0842, "num_input_tokens_seen": 2284864, "step": 4655 }, { "epoch": 0.6150191368615547, "grad_norm": 0.07092246413230896, "learning_rate": 7.754903906259889e-07, "loss": 0.1524, "num_input_tokens_seen": 2287168, "step": 4660 }, { "epoch": 0.6156790286393031, "grad_norm": 166.20501708984375, "learning_rate": 7.732462356131637e-07, "loss": 0.059, "num_input_tokens_seen": 2289600, "step": 4665 }, { "epoch": 0.6163389204170516, "grad_norm": 0.29940545558929443, "learning_rate": 7.710032838415179e-07, "loss": 0.0896, "num_input_tokens_seen": 2292160, "step": 4670 }, { "epoch": 0.6169988121948, "grad_norm": 0.06602998822927475, "learning_rate": 7.687615472130016e-07, "loss": 0.155, "num_input_tokens_seen": 2294912, "step": 4675 }, { "epoch": 0.6176587039725485, "grad_norm": 0.13479486107826233, "learning_rate": 7.665210376231165e-07, "loss": 0.1138, "num_input_tokens_seen": 2297024, "step": 4680 }, { "epoch": 0.6183185957502969, "grad_norm": 7.841771125793457, "learning_rate": 7.642817669608536e-07, "loss": 0.1342, "num_input_tokens_seen": 2299456, "step": 4685 }, { "epoch": 0.6189784875280454, "grad_norm": 0.10149969160556793, "learning_rate": 7.62043747108629e-07, "loss": 0.0194, "num_input_tokens_seen": 2301568, "step": 4690 }, { "epoch": 0.6196383793057938, "grad_norm": 122.03047943115234, "learning_rate": 7.598069899422221e-07, "loss": 0.1988, "num_input_tokens_seen": 2303936, "step": 4695 }, { "epoch": 0.6202982710835423, "grad_norm": 8.390487670898438, "learning_rate": 7.575715073307119e-07, "loss": 0.2107, "num_input_tokens_seen": 2306176, "step": 4700 }, { "epoch": 0.6209581628612908, "grad_norm": 0.41683492064476013, "learning_rate": 7.55337311136414e-07, "loss": 0.0995, "num_input_tokens_seen": 2308736, "step": 4705 }, { "epoch": 0.6216180546390392, "grad_norm": 14.42542839050293, "learning_rate": 7.531044132148183e-07, "loss": 0.1775, "num_input_tokens_seen": 2311104, "step": 4710 }, { "epoch": 0.6222779464167877, "grad_norm": 16.42903709411621, "learning_rate": 7.508728254145245e-07, "loss": 0.0493, "num_input_tokens_seen": 2313536, "step": 4715 }, { "epoch": 0.6229378381945361, "grad_norm": 26.883657455444336, "learning_rate": 7.486425595771817e-07, "loss": 0.117, "num_input_tokens_seen": 2316032, "step": 4720 }, { "epoch": 0.6235977299722846, "grad_norm": 8.713482856750488, "learning_rate": 7.464136275374223e-07, "loss": 0.1853, "num_input_tokens_seen": 2318656, "step": 4725 }, { "epoch": 0.624257621750033, "grad_norm": 10.561690330505371, "learning_rate": 7.441860411228029e-07, "loss": 0.1311, "num_input_tokens_seen": 2321216, "step": 4730 }, { "epoch": 0.6249175135277815, "grad_norm": 39.474449157714844, "learning_rate": 7.419598121537387e-07, "loss": 0.1273, "num_input_tokens_seen": 2323648, "step": 4735 }, { "epoch": 0.6255774053055299, "grad_norm": 18.26643943786621, "learning_rate": 7.397349524434424e-07, "loss": 0.1446, "num_input_tokens_seen": 2326080, "step": 4740 }, { "epoch": 0.6262372970832784, "grad_norm": 8.37359619140625, "learning_rate": 7.375114737978605e-07, "loss": 0.0544, "num_input_tokens_seen": 2328512, "step": 4745 }, { "epoch": 0.6268971888610267, "grad_norm": 4.634432315826416, "learning_rate": 7.352893880156106e-07, "loss": 0.1048, "num_input_tokens_seen": 2331008, "step": 4750 }, { "epoch": 0.6275570806387752, "grad_norm": 1.4395649433135986, "learning_rate": 7.330687068879202e-07, "loss": 0.0516, "num_input_tokens_seen": 2333376, "step": 4755 }, { "epoch": 0.6282169724165237, "grad_norm": 135.29498291015625, "learning_rate": 7.308494421985626e-07, "loss": 0.1411, "num_input_tokens_seen": 2335872, "step": 4760 }, { "epoch": 0.6288768641942721, "grad_norm": 0.25262773036956787, "learning_rate": 7.286316057237951e-07, "loss": 0.0029, "num_input_tokens_seen": 2338432, "step": 4765 }, { "epoch": 0.6295367559720206, "grad_norm": 20.409406661987305, "learning_rate": 7.264152092322963e-07, "loss": 0.1567, "num_input_tokens_seen": 2340928, "step": 4770 }, { "epoch": 0.630196647749769, "grad_norm": 0.3037130832672119, "learning_rate": 7.242002644851035e-07, "loss": 0.0441, "num_input_tokens_seen": 2343680, "step": 4775 }, { "epoch": 0.6308565395275175, "grad_norm": 0.19966571033000946, "learning_rate": 7.219867832355508e-07, "loss": 0.0673, "num_input_tokens_seen": 2346240, "step": 4780 }, { "epoch": 0.6315164313052659, "grad_norm": 0.16028675436973572, "learning_rate": 7.197747772292071e-07, "loss": 0.0718, "num_input_tokens_seen": 2348544, "step": 4785 }, { "epoch": 0.6321763230830144, "grad_norm": 0.05191419646143913, "learning_rate": 7.17564258203811e-07, "loss": 0.2532, "num_input_tokens_seen": 2350976, "step": 4790 }, { "epoch": 0.6328362148607628, "grad_norm": 21.26822280883789, "learning_rate": 7.153552378892128e-07, "loss": 0.1214, "num_input_tokens_seen": 2353216, "step": 4795 }, { "epoch": 0.6334961066385113, "grad_norm": 0.49603065848350525, "learning_rate": 7.131477280073091e-07, "loss": 0.1191, "num_input_tokens_seen": 2355584, "step": 4800 }, { "epoch": 0.6341559984162597, "grad_norm": 0.12939685583114624, "learning_rate": 7.109417402719813e-07, "loss": 0.1127, "num_input_tokens_seen": 2358144, "step": 4805 }, { "epoch": 0.6348158901940082, "grad_norm": 14.447181701660156, "learning_rate": 7.087372863890346e-07, "loss": 0.0543, "num_input_tokens_seen": 2360896, "step": 4810 }, { "epoch": 0.6354757819717566, "grad_norm": 25.439424514770508, "learning_rate": 7.065343780561344e-07, "loss": 0.2546, "num_input_tokens_seen": 2363264, "step": 4815 }, { "epoch": 0.6361356737495051, "grad_norm": 10.288759231567383, "learning_rate": 7.043330269627448e-07, "loss": 0.0676, "num_input_tokens_seen": 2365632, "step": 4820 }, { "epoch": 0.6367955655272536, "grad_norm": 0.07639932632446289, "learning_rate": 7.021332447900671e-07, "loss": 0.0018, "num_input_tokens_seen": 2368000, "step": 4825 }, { "epoch": 0.637455457305002, "grad_norm": 63.019187927246094, "learning_rate": 6.999350432109766e-07, "loss": 0.1462, "num_input_tokens_seen": 2370560, "step": 4830 }, { "epoch": 0.6381153490827505, "grad_norm": 0.08439631760120392, "learning_rate": 6.977384338899617e-07, "loss": 0.001, "num_input_tokens_seen": 2373120, "step": 4835 }, { "epoch": 0.6387752408604989, "grad_norm": 0.06181376054883003, "learning_rate": 6.955434284830619e-07, "loss": 0.0052, "num_input_tokens_seen": 2375872, "step": 4840 }, { "epoch": 0.6394351326382474, "grad_norm": 0.05570792779326439, "learning_rate": 6.933500386378056e-07, "loss": 0.2037, "num_input_tokens_seen": 2378432, "step": 4845 }, { "epoch": 0.6400950244159958, "grad_norm": 50.27269744873047, "learning_rate": 6.911582759931482e-07, "loss": 0.1581, "num_input_tokens_seen": 2380800, "step": 4850 }, { "epoch": 0.6407549161937443, "grad_norm": 0.039350103586912155, "learning_rate": 6.889681521794109e-07, "loss": 0.2158, "num_input_tokens_seen": 2383744, "step": 4855 }, { "epoch": 0.6414148079714926, "grad_norm": 11.155346870422363, "learning_rate": 6.867796788182181e-07, "loss": 0.0894, "num_input_tokens_seen": 2386112, "step": 4860 }, { "epoch": 0.6420746997492411, "grad_norm": 8.643911361694336, "learning_rate": 6.845928675224366e-07, "loss": 0.1499, "num_input_tokens_seen": 2388736, "step": 4865 }, { "epoch": 0.6427345915269895, "grad_norm": 0.24349497258663177, "learning_rate": 6.82407729896114e-07, "loss": 0.0662, "num_input_tokens_seen": 2391104, "step": 4870 }, { "epoch": 0.643394483304738, "grad_norm": 15.820056915283203, "learning_rate": 6.802242775344163e-07, "loss": 0.0747, "num_input_tokens_seen": 2393728, "step": 4875 }, { "epoch": 0.6440543750824864, "grad_norm": 0.12001825124025345, "learning_rate": 6.780425220235674e-07, "loss": 0.1309, "num_input_tokens_seen": 2396480, "step": 4880 }, { "epoch": 0.6447142668602349, "grad_norm": 0.08038333803415298, "learning_rate": 6.758624749407859e-07, "loss": 0.008, "num_input_tokens_seen": 2399104, "step": 4885 }, { "epoch": 0.6453741586379834, "grad_norm": 15.686113357543945, "learning_rate": 6.736841478542264e-07, "loss": 0.0813, "num_input_tokens_seen": 2401664, "step": 4890 }, { "epoch": 0.6460340504157318, "grad_norm": 0.3630061447620392, "learning_rate": 6.715075523229151e-07, "loss": 0.0084, "num_input_tokens_seen": 2404160, "step": 4895 }, { "epoch": 0.6466939421934803, "grad_norm": 29.911376953125, "learning_rate": 6.693326998966909e-07, "loss": 0.129, "num_input_tokens_seen": 2406592, "step": 4900 }, { "epoch": 0.6473538339712287, "grad_norm": 0.05508751794695854, "learning_rate": 6.671596021161431e-07, "loss": 0.0684, "num_input_tokens_seen": 2409088, "step": 4905 }, { "epoch": 0.6480137257489772, "grad_norm": 0.06392798572778702, "learning_rate": 6.649882705125494e-07, "loss": 0.0965, "num_input_tokens_seen": 2411584, "step": 4910 }, { "epoch": 0.6486736175267256, "grad_norm": 0.36957481503486633, "learning_rate": 6.628187166078163e-07, "loss": 0.4483, "num_input_tokens_seen": 2414400, "step": 4915 }, { "epoch": 0.6493335093044741, "grad_norm": 18.36041259765625, "learning_rate": 6.606509519144166e-07, "loss": 0.0583, "num_input_tokens_seen": 2416640, "step": 4920 }, { "epoch": 0.6499934010822225, "grad_norm": 61.96574783325195, "learning_rate": 6.584849879353289e-07, "loss": 0.1499, "num_input_tokens_seen": 2419136, "step": 4925 }, { "epoch": 0.6502573577933219, "eval_loss": 0.09844312816858292, "eval_runtime": 7.5167, "eval_samples_per_second": 896.002, "eval_steps_per_second": 112.017, "num_input_tokens_seen": 2420096, "step": 4927 }, { "epoch": 0.650653292859971, "grad_norm": 0.3677075207233429, "learning_rate": 6.563208361639772e-07, "loss": 0.0307, "num_input_tokens_seen": 2421440, "step": 4930 }, { "epoch": 0.6513131846377194, "grad_norm": 0.08293258398771286, "learning_rate": 6.541585080841687e-07, "loss": 0.0015, "num_input_tokens_seen": 2424000, "step": 4935 }, { "epoch": 0.6519730764154679, "grad_norm": 88.83380126953125, "learning_rate": 6.519980151700332e-07, "loss": 0.0999, "num_input_tokens_seen": 2426240, "step": 4940 }, { "epoch": 0.6526329681932164, "grad_norm": 0.1625138372182846, "learning_rate": 6.498393688859629e-07, "loss": 0.0789, "num_input_tokens_seen": 2428864, "step": 4945 }, { "epoch": 0.6532928599709648, "grad_norm": 0.05900685489177704, "learning_rate": 6.47682580686551e-07, "loss": 0.0011, "num_input_tokens_seen": 2431296, "step": 4950 }, { "epoch": 0.6539527517487133, "grad_norm": 0.054225701838731766, "learning_rate": 6.455276620165307e-07, "loss": 0.002, "num_input_tokens_seen": 2433984, "step": 4955 }, { "epoch": 0.6546126435264616, "grad_norm": 0.02803809382021427, "learning_rate": 6.433746243107152e-07, "loss": 0.4195, "num_input_tokens_seen": 2436224, "step": 4960 }, { "epoch": 0.6552725353042101, "grad_norm": 0.09517721086740494, "learning_rate": 6.412234789939359e-07, "loss": 0.229, "num_input_tokens_seen": 2438720, "step": 4965 }, { "epoch": 0.6559324270819585, "grad_norm": 0.13722281157970428, "learning_rate": 6.390742374809832e-07, "loss": 0.0818, "num_input_tokens_seen": 2440960, "step": 4970 }, { "epoch": 0.656592318859707, "grad_norm": 0.6646612286567688, "learning_rate": 6.369269111765454e-07, "loss": 0.0417, "num_input_tokens_seen": 2443328, "step": 4975 }, { "epoch": 0.6572522106374554, "grad_norm": 0.05688225477933884, "learning_rate": 6.347815114751465e-07, "loss": 0.1413, "num_input_tokens_seen": 2445952, "step": 4980 }, { "epoch": 0.6579121024152039, "grad_norm": 0.027482135221362114, "learning_rate": 6.326380497610886e-07, "loss": 0.1102, "num_input_tokens_seen": 2448576, "step": 4985 }, { "epoch": 0.6585719941929523, "grad_norm": 52.316715240478516, "learning_rate": 6.304965374083899e-07, "loss": 0.323, "num_input_tokens_seen": 2451136, "step": 4990 }, { "epoch": 0.6592318859707008, "grad_norm": 0.18591034412384033, "learning_rate": 6.283569857807245e-07, "loss": 0.0022, "num_input_tokens_seen": 2453632, "step": 4995 }, { "epoch": 0.6598917777484492, "grad_norm": 0.1707799881696701, "learning_rate": 6.262194062313615e-07, "loss": 0.0082, "num_input_tokens_seen": 2456192, "step": 5000 }, { "epoch": 0.6605516695261977, "grad_norm": 0.05098792165517807, "learning_rate": 6.240838101031063e-07, "loss": 0.0012, "num_input_tokens_seen": 2458624, "step": 5005 }, { "epoch": 0.6612115613039462, "grad_norm": 0.10480757057666779, "learning_rate": 6.21950208728239e-07, "loss": 0.134, "num_input_tokens_seen": 2460928, "step": 5010 }, { "epoch": 0.6618714530816946, "grad_norm": 0.10895920544862747, "learning_rate": 6.198186134284554e-07, "loss": 0.1085, "num_input_tokens_seen": 2463552, "step": 5015 }, { "epoch": 0.6625313448594431, "grad_norm": 25.51168441772461, "learning_rate": 6.176890355148049e-07, "loss": 0.0561, "num_input_tokens_seen": 2465856, "step": 5020 }, { "epoch": 0.6631912366371915, "grad_norm": 3.873609781265259, "learning_rate": 6.155614862876335e-07, "loss": 0.0902, "num_input_tokens_seen": 2468288, "step": 5025 }, { "epoch": 0.66385112841494, "grad_norm": 75.29798889160156, "learning_rate": 6.134359770365214e-07, "loss": 0.1482, "num_input_tokens_seen": 2470912, "step": 5030 }, { "epoch": 0.6645110201926884, "grad_norm": 0.2568621039390564, "learning_rate": 6.11312519040224e-07, "loss": 0.109, "num_input_tokens_seen": 2473536, "step": 5035 }, { "epoch": 0.6651709119704369, "grad_norm": 0.05576321855187416, "learning_rate": 6.091911235666125e-07, "loss": 0.0013, "num_input_tokens_seen": 2476032, "step": 5040 }, { "epoch": 0.6658308037481853, "grad_norm": 0.13206513226032257, "learning_rate": 6.070718018726124e-07, "loss": 0.1091, "num_input_tokens_seen": 2478208, "step": 5045 }, { "epoch": 0.6664906955259338, "grad_norm": 0.10654900968074799, "learning_rate": 6.049545652041459e-07, "loss": 0.1482, "num_input_tokens_seen": 2480512, "step": 5050 }, { "epoch": 0.6671505873036822, "grad_norm": 0.07339984178543091, "learning_rate": 6.028394247960709e-07, "loss": 0.1775, "num_input_tokens_seen": 2483008, "step": 5055 }, { "epoch": 0.6678104790814307, "grad_norm": 0.04593325033783913, "learning_rate": 6.007263918721221e-07, "loss": 0.1572, "num_input_tokens_seen": 2485376, "step": 5060 }, { "epoch": 0.668470370859179, "grad_norm": 0.19269201159477234, "learning_rate": 5.986154776448507e-07, "loss": 0.0559, "num_input_tokens_seen": 2488064, "step": 5065 }, { "epoch": 0.6691302626369275, "grad_norm": 13.757147789001465, "learning_rate": 5.965066933155656e-07, "loss": 0.0578, "num_input_tokens_seen": 2490624, "step": 5070 }, { "epoch": 0.669790154414676, "grad_norm": 20.430967330932617, "learning_rate": 5.944000500742735e-07, "loss": 0.2826, "num_input_tokens_seen": 2493248, "step": 5075 }, { "epoch": 0.6704500461924244, "grad_norm": 51.80553436279297, "learning_rate": 5.922955590996195e-07, "loss": 0.201, "num_input_tokens_seen": 2495744, "step": 5080 }, { "epoch": 0.6711099379701729, "grad_norm": 0.12118737399578094, "learning_rate": 5.901932315588281e-07, "loss": 0.0019, "num_input_tokens_seen": 2498176, "step": 5085 }, { "epoch": 0.6717698297479213, "grad_norm": 20.142244338989258, "learning_rate": 5.880930786076441e-07, "loss": 0.1805, "num_input_tokens_seen": 2500416, "step": 5090 }, { "epoch": 0.6724297215256698, "grad_norm": 0.4407406747341156, "learning_rate": 5.859951113902728e-07, "loss": 0.06, "num_input_tokens_seen": 2502848, "step": 5095 }, { "epoch": 0.6730896133034182, "grad_norm": 32.401554107666016, "learning_rate": 5.83899341039321e-07, "loss": 0.1099, "num_input_tokens_seen": 2505152, "step": 5100 }, { "epoch": 0.6737495050811667, "grad_norm": 34.423946380615234, "learning_rate": 5.818057786757386e-07, "loss": 0.1247, "num_input_tokens_seen": 2507648, "step": 5105 }, { "epoch": 0.6744093968589151, "grad_norm": 0.2243095338344574, "learning_rate": 5.797144354087588e-07, "loss": 0.0989, "num_input_tokens_seen": 2510144, "step": 5110 }, { "epoch": 0.6750692886366636, "grad_norm": 0.06958218663930893, "learning_rate": 5.77625322335839e-07, "loss": 0.076, "num_input_tokens_seen": 2513024, "step": 5115 }, { "epoch": 0.675729180414412, "grad_norm": 0.2868078649044037, "learning_rate": 5.755384505426032e-07, "loss": 0.0721, "num_input_tokens_seen": 2515072, "step": 5120 }, { "epoch": 0.6763890721921605, "grad_norm": 0.19552133977413177, "learning_rate": 5.734538311027819e-07, "loss": 0.0018, "num_input_tokens_seen": 2517376, "step": 5125 }, { "epoch": 0.677048963969909, "grad_norm": 0.6387649178504944, "learning_rate": 5.713714750781533e-07, "loss": 0.0036, "num_input_tokens_seen": 2520064, "step": 5130 }, { "epoch": 0.6777088557476574, "grad_norm": 0.640417218208313, "learning_rate": 5.692913935184862e-07, "loss": 0.0685, "num_input_tokens_seen": 2522688, "step": 5135 }, { "epoch": 0.6783687475254059, "grad_norm": 0.32035917043685913, "learning_rate": 5.672135974614794e-07, "loss": 0.0071, "num_input_tokens_seen": 2525184, "step": 5140 }, { "epoch": 0.6790286393031543, "grad_norm": 0.08546182513237, "learning_rate": 5.651380979327034e-07, "loss": 0.0014, "num_input_tokens_seen": 2527552, "step": 5145 }, { "epoch": 0.6796885310809028, "grad_norm": 1.3679804801940918, "learning_rate": 5.630649059455444e-07, "loss": 0.0442, "num_input_tokens_seen": 2530240, "step": 5150 }, { "epoch": 0.6803484228586512, "grad_norm": 0.5069653391838074, "learning_rate": 5.609940325011413e-07, "loss": 0.0023, "num_input_tokens_seen": 2532480, "step": 5155 }, { "epoch": 0.6810083146363997, "grad_norm": 0.1547362059354782, "learning_rate": 5.589254885883325e-07, "loss": 0.0007, "num_input_tokens_seen": 2534912, "step": 5160 }, { "epoch": 0.681668206414148, "grad_norm": 0.09271689504384995, "learning_rate": 5.568592851835936e-07, "loss": 0.0598, "num_input_tokens_seen": 2537408, "step": 5165 }, { "epoch": 0.6823280981918965, "grad_norm": 0.12092125415802002, "learning_rate": 5.547954332509805e-07, "loss": 0.3023, "num_input_tokens_seen": 2539776, "step": 5170 }, { "epoch": 0.6829879899696449, "grad_norm": 0.06238294392824173, "learning_rate": 5.527339437420717e-07, "loss": 0.0009, "num_input_tokens_seen": 2542208, "step": 5175 }, { "epoch": 0.6836478817473934, "grad_norm": 109.36412811279297, "learning_rate": 5.506748275959094e-07, "loss": 0.1061, "num_input_tokens_seen": 2544704, "step": 5180 }, { "epoch": 0.6843077735251418, "grad_norm": 0.061365850269794464, "learning_rate": 5.48618095738943e-07, "loss": 0.0535, "num_input_tokens_seen": 2547072, "step": 5185 }, { "epoch": 0.6849676653028903, "grad_norm": 0.15806028246879578, "learning_rate": 5.465637590849681e-07, "loss": 0.1301, "num_input_tokens_seen": 2549440, "step": 5190 }, { "epoch": 0.6856275570806388, "grad_norm": 21.357271194458008, "learning_rate": 5.445118285350723e-07, "loss": 0.2169, "num_input_tokens_seen": 2552128, "step": 5195 }, { "epoch": 0.6862874488583872, "grad_norm": 0.09460903704166412, "learning_rate": 5.424623149775745e-07, "loss": 0.0681, "num_input_tokens_seen": 2554368, "step": 5200 }, { "epoch": 0.6869473406361357, "grad_norm": 0.0203552208840847, "learning_rate": 5.404152292879676e-07, "loss": 0.1175, "num_input_tokens_seen": 2556928, "step": 5205 }, { "epoch": 0.6876072324138841, "grad_norm": 16.716796875, "learning_rate": 5.38370582328863e-07, "loss": 0.1624, "num_input_tokens_seen": 2559360, "step": 5210 }, { "epoch": 0.6882671241916326, "grad_norm": 0.22735337913036346, "learning_rate": 5.363283849499293e-07, "loss": 0.1578, "num_input_tokens_seen": 2561856, "step": 5215 }, { "epoch": 0.688927015969381, "grad_norm": 33.698936462402344, "learning_rate": 5.342886479878387e-07, "loss": 0.1794, "num_input_tokens_seen": 2564352, "step": 5220 }, { "epoch": 0.6895869077471295, "grad_norm": 0.5594123601913452, "learning_rate": 5.32251382266206e-07, "loss": 0.0437, "num_input_tokens_seen": 2566784, "step": 5225 }, { "epoch": 0.6902467995248779, "grad_norm": 0.27059707045555115, "learning_rate": 5.302165985955327e-07, "loss": 0.0593, "num_input_tokens_seen": 2569152, "step": 5230 }, { "epoch": 0.6909066913026264, "grad_norm": 0.09355846047401428, "learning_rate": 5.281843077731511e-07, "loss": 0.067, "num_input_tokens_seen": 2571520, "step": 5235 }, { "epoch": 0.6915665830803748, "grad_norm": 121.53573608398438, "learning_rate": 5.26154520583163e-07, "loss": 0.141, "num_input_tokens_seen": 2574080, "step": 5240 }, { "epoch": 0.6922264748581233, "grad_norm": 0.16486892104148865, "learning_rate": 5.241272477963877e-07, "loss": 0.0595, "num_input_tokens_seen": 2576320, "step": 5245 }, { "epoch": 0.6928863666358717, "grad_norm": 1.9759496450424194, "learning_rate": 5.221025001703e-07, "loss": 0.0576, "num_input_tokens_seen": 2578752, "step": 5250 }, { "epoch": 0.6935462584136202, "grad_norm": 17.89307403564453, "learning_rate": 5.200802884489768e-07, "loss": 0.1368, "num_input_tokens_seen": 2581184, "step": 5255 }, { "epoch": 0.6942061501913687, "grad_norm": 0.03805484250187874, "learning_rate": 5.180606233630374e-07, "loss": 0.1654, "num_input_tokens_seen": 2583872, "step": 5260 }, { "epoch": 0.694866041969117, "grad_norm": 0.12207946926355362, "learning_rate": 5.160435156295879e-07, "loss": 0.1912, "num_input_tokens_seen": 2586304, "step": 5265 }, { "epoch": 0.6955259337468656, "grad_norm": 0.035935211926698685, "learning_rate": 5.14028975952165e-07, "loss": 0.0201, "num_input_tokens_seen": 2589056, "step": 5270 }, { "epoch": 0.6961858255246139, "grad_norm": 9.020354270935059, "learning_rate": 5.120170150206768e-07, "loss": 0.14, "num_input_tokens_seen": 2591488, "step": 5275 }, { "epoch": 0.6968457173023624, "grad_norm": 18.322715759277344, "learning_rate": 5.100076435113496e-07, "loss": 0.0542, "num_input_tokens_seen": 2593792, "step": 5280 }, { "epoch": 0.6975056090801108, "grad_norm": 55.9955940246582, "learning_rate": 5.080008720866673e-07, "loss": 0.1538, "num_input_tokens_seen": 2595968, "step": 5285 }, { "epoch": 0.6981655008578593, "grad_norm": 11.932297706604004, "learning_rate": 5.059967113953173e-07, "loss": 0.2123, "num_input_tokens_seen": 2598144, "step": 5290 }, { "epoch": 0.6988253926356077, "grad_norm": 0.08165155351161957, "learning_rate": 5.039951720721349e-07, "loss": 0.0838, "num_input_tokens_seen": 2600448, "step": 5295 }, { "epoch": 0.6994852844133562, "grad_norm": 0.32456350326538086, "learning_rate": 5.019962647380429e-07, "loss": 0.0167, "num_input_tokens_seen": 2602944, "step": 5300 }, { "epoch": 0.7001451761911046, "grad_norm": 20.51830291748047, "learning_rate": 5.000000000000002e-07, "loss": 0.2014, "num_input_tokens_seen": 2605120, "step": 5305 }, { "epoch": 0.7002771545466544, "eval_loss": 0.09084735810756683, "eval_runtime": 7.6666, "eval_samples_per_second": 878.487, "eval_steps_per_second": 109.827, "num_input_tokens_seen": 2605504, "step": 5306 }, { "epoch": 0.7008050679688531, "grad_norm": 1.9377256631851196, "learning_rate": 4.980063884509414e-07, "loss": 0.0377, "num_input_tokens_seen": 2607296, "step": 5310 }, { "epoch": 0.7014649597466015, "grad_norm": 0.11374177783727646, "learning_rate": 4.960154406697229e-07, "loss": 0.0463, "num_input_tokens_seen": 2609728, "step": 5315 }, { "epoch": 0.70212485152435, "grad_norm": 11.871938705444336, "learning_rate": 4.940271672210667e-07, "loss": 0.2924, "num_input_tokens_seen": 2612224, "step": 5320 }, { "epoch": 0.7027847433020985, "grad_norm": 0.26750093698501587, "learning_rate": 4.920415786555025e-07, "loss": 0.0513, "num_input_tokens_seen": 2614720, "step": 5325 }, { "epoch": 0.7034446350798469, "grad_norm": 0.12440818548202515, "learning_rate": 4.900586855093144e-07, "loss": 0.3194, "num_input_tokens_seen": 2617344, "step": 5330 }, { "epoch": 0.7041045268575954, "grad_norm": 23.306577682495117, "learning_rate": 4.880784983044827e-07, "loss": 0.1166, "num_input_tokens_seen": 2619584, "step": 5335 }, { "epoch": 0.7047644186353438, "grad_norm": 0.1234973892569542, "learning_rate": 4.861010275486284e-07, "loss": 0.0176, "num_input_tokens_seen": 2621888, "step": 5340 }, { "epoch": 0.7054243104130923, "grad_norm": 0.14019837975502014, "learning_rate": 4.8412628373496e-07, "loss": 0.0731, "num_input_tokens_seen": 2624512, "step": 5345 }, { "epoch": 0.7060842021908407, "grad_norm": 0.18232476711273193, "learning_rate": 4.821542773422136e-07, "loss": 0.0024, "num_input_tokens_seen": 2627008, "step": 5350 }, { "epoch": 0.7067440939685892, "grad_norm": 0.28430455923080444, "learning_rate": 4.801850188346012e-07, "loss": 0.0019, "num_input_tokens_seen": 2629440, "step": 5355 }, { "epoch": 0.7074039857463376, "grad_norm": 0.19436050951480865, "learning_rate": 4.782185186617523e-07, "loss": 0.1034, "num_input_tokens_seen": 2631872, "step": 5360 }, { "epoch": 0.7080638775240861, "grad_norm": 0.2109547257423401, "learning_rate": 4.762547872586603e-07, "loss": 0.0814, "num_input_tokens_seen": 2634560, "step": 5365 }, { "epoch": 0.7087237693018344, "grad_norm": 0.2513101100921631, "learning_rate": 4.7429383504562605e-07, "loss": 0.1396, "num_input_tokens_seen": 2637120, "step": 5370 }, { "epoch": 0.709383661079583, "grad_norm": 0.30243685841560364, "learning_rate": 4.723356724282029e-07, "loss": 0.0019, "num_input_tokens_seen": 2639552, "step": 5375 }, { "epoch": 0.7100435528573315, "grad_norm": 24.248998641967773, "learning_rate": 4.703803097971426e-07, "loss": 0.1315, "num_input_tokens_seen": 2641984, "step": 5380 }, { "epoch": 0.7107034446350798, "grad_norm": 8.986465454101562, "learning_rate": 4.6842775752833763e-07, "loss": 0.0708, "num_input_tokens_seen": 2644352, "step": 5385 }, { "epoch": 0.7113633364128283, "grad_norm": 0.1666085124015808, "learning_rate": 4.664780259827689e-07, "loss": 0.02, "num_input_tokens_seen": 2647040, "step": 5390 }, { "epoch": 0.7120232281905767, "grad_norm": 0.05778901278972626, "learning_rate": 4.6453112550644857e-07, "loss": 0.0013, "num_input_tokens_seen": 2649472, "step": 5395 }, { "epoch": 0.7126831199683252, "grad_norm": 0.1988663524389267, "learning_rate": 4.625870664303663e-07, "loss": 0.0411, "num_input_tokens_seen": 2651840, "step": 5400 }, { "epoch": 0.7133430117460736, "grad_norm": 0.19517682492733002, "learning_rate": 4.6064585907043486e-07, "loss": 0.0056, "num_input_tokens_seen": 2654464, "step": 5405 }, { "epoch": 0.7140029035238221, "grad_norm": 0.02337566576898098, "learning_rate": 4.587075137274334e-07, "loss": 0.0537, "num_input_tokens_seen": 2656576, "step": 5410 }, { "epoch": 0.7146627953015705, "grad_norm": 1.0309412479400635, "learning_rate": 4.5677204068695597e-07, "loss": 0.0546, "num_input_tokens_seen": 2659008, "step": 5415 }, { "epoch": 0.715322687079319, "grad_norm": 0.022054580971598625, "learning_rate": 4.5483945021935356e-07, "loss": 0.0401, "num_input_tokens_seen": 2661632, "step": 5420 }, { "epoch": 0.7159825788570674, "grad_norm": 0.02314288541674614, "learning_rate": 4.5290975257968155e-07, "loss": 0.0963, "num_input_tokens_seen": 2664192, "step": 5425 }, { "epoch": 0.7166424706348159, "grad_norm": 22.84745216369629, "learning_rate": 4.509829580076452e-07, "loss": 0.1819, "num_input_tokens_seen": 2666624, "step": 5430 }, { "epoch": 0.7173023624125643, "grad_norm": 0.063370481133461, "learning_rate": 4.490590767275442e-07, "loss": 0.1842, "num_input_tokens_seen": 2669120, "step": 5435 }, { "epoch": 0.7179622541903128, "grad_norm": 0.49410998821258545, "learning_rate": 4.4713811894822064e-07, "loss": 0.102, "num_input_tokens_seen": 2671552, "step": 5440 }, { "epoch": 0.7186221459680613, "grad_norm": 0.3350347578525543, "learning_rate": 4.4522009486300204e-07, "loss": 0.071, "num_input_tokens_seen": 2674240, "step": 5445 }, { "epoch": 0.7192820377458097, "grad_norm": 0.07053118199110031, "learning_rate": 4.43305014649649e-07, "loss": 0.1247, "num_input_tokens_seen": 2676544, "step": 5450 }, { "epoch": 0.7199419295235582, "grad_norm": 0.14452704787254333, "learning_rate": 4.4139288847030155e-07, "loss": 0.0005, "num_input_tokens_seen": 2678912, "step": 5455 }, { "epoch": 0.7206018213013066, "grad_norm": 0.08119305223226547, "learning_rate": 4.394837264714233e-07, "loss": 0.0554, "num_input_tokens_seen": 2681344, "step": 5460 }, { "epoch": 0.7212617130790551, "grad_norm": 0.15848740935325623, "learning_rate": 4.3757753878375005e-07, "loss": 0.0013, "num_input_tokens_seen": 2683776, "step": 5465 }, { "epoch": 0.7219216048568035, "grad_norm": 0.031311068683862686, "learning_rate": 4.3567433552223375e-07, "loss": 0.0567, "num_input_tokens_seen": 2686016, "step": 5470 }, { "epoch": 0.722581496634552, "grad_norm": 0.839226484298706, "learning_rate": 4.3377412678599e-07, "loss": 0.1963, "num_input_tokens_seen": 2688128, "step": 5475 }, { "epoch": 0.7232413884123003, "grad_norm": 0.17472581565380096, "learning_rate": 4.318769226582454e-07, "loss": 0.1399, "num_input_tokens_seen": 2690368, "step": 5480 }, { "epoch": 0.7239012801900488, "grad_norm": 0.12172765284776688, "learning_rate": 4.299827332062811e-07, "loss": 0.0348, "num_input_tokens_seen": 2692992, "step": 5485 }, { "epoch": 0.7245611719677972, "grad_norm": 75.4613037109375, "learning_rate": 4.2809156848138363e-07, "loss": 0.0968, "num_input_tokens_seen": 2695424, "step": 5490 }, { "epoch": 0.7252210637455457, "grad_norm": 71.69564056396484, "learning_rate": 4.2620343851878616e-07, "loss": 0.1639, "num_input_tokens_seen": 2697856, "step": 5495 }, { "epoch": 0.7258809555232941, "grad_norm": 0.060778968036174774, "learning_rate": 4.2431835333762123e-07, "loss": 0.0446, "num_input_tokens_seen": 2700608, "step": 5500 }, { "epoch": 0.7265408473010426, "grad_norm": 0.6623153686523438, "learning_rate": 4.224363229408628e-07, "loss": 0.0005, "num_input_tokens_seen": 2703104, "step": 5505 }, { "epoch": 0.7272007390787911, "grad_norm": 0.34537097811698914, "learning_rate": 4.205573573152753e-07, "loss": 0.1834, "num_input_tokens_seen": 2705344, "step": 5510 }, { "epoch": 0.7278606308565395, "grad_norm": 0.14280956983566284, "learning_rate": 4.18681466431361e-07, "loss": 0.0728, "num_input_tokens_seen": 2707520, "step": 5515 }, { "epoch": 0.728520522634288, "grad_norm": 1.0312310457229614, "learning_rate": 4.168086602433055e-07, "loss": 0.105, "num_input_tokens_seen": 2709888, "step": 5520 }, { "epoch": 0.7291804144120364, "grad_norm": 0.46186262369155884, "learning_rate": 4.1493894868892676e-07, "loss": 0.1888, "num_input_tokens_seen": 2712192, "step": 5525 }, { "epoch": 0.7298403061897849, "grad_norm": 0.5339822769165039, "learning_rate": 4.1307234168962093e-07, "loss": 0.0838, "num_input_tokens_seen": 2714368, "step": 5530 }, { "epoch": 0.7305001979675333, "grad_norm": 0.03589556738734245, "learning_rate": 4.112088491503095e-07, "loss": 0.0014, "num_input_tokens_seen": 2716608, "step": 5535 }, { "epoch": 0.7311600897452818, "grad_norm": 0.08683586120605469, "learning_rate": 4.0934848095938937e-07, "loss": 0.001, "num_input_tokens_seen": 2718656, "step": 5540 }, { "epoch": 0.7318199815230302, "grad_norm": 0.04359391704201698, "learning_rate": 4.074912469886763e-07, "loss": 0.098, "num_input_tokens_seen": 2721152, "step": 5545 }, { "epoch": 0.7324798733007787, "grad_norm": 0.47505855560302734, "learning_rate": 4.0563715709335657e-07, "loss": 0.0009, "num_input_tokens_seen": 2723264, "step": 5550 }, { "epoch": 0.7331397650785271, "grad_norm": 40.96418380737305, "learning_rate": 4.037862211119315e-07, "loss": 0.2022, "num_input_tokens_seen": 2725568, "step": 5555 }, { "epoch": 0.7337996568562756, "grad_norm": 0.221147358417511, "learning_rate": 4.0193844886616715e-07, "loss": 0.0389, "num_input_tokens_seen": 2728192, "step": 5560 }, { "epoch": 0.7344595486340241, "grad_norm": 28.70302391052246, "learning_rate": 4.0009385016104137e-07, "loss": 0.1632, "num_input_tokens_seen": 2731072, "step": 5565 }, { "epoch": 0.7351194404117725, "grad_norm": 0.036642443388700485, "learning_rate": 3.9825243478469164e-07, "loss": 0.1455, "num_input_tokens_seen": 2733440, "step": 5570 }, { "epoch": 0.735779332189521, "grad_norm": 8.163640022277832, "learning_rate": 3.9641421250836484e-07, "loss": 0.1211, "num_input_tokens_seen": 2736064, "step": 5575 }, { "epoch": 0.7364392239672694, "grad_norm": 36.30949401855469, "learning_rate": 3.945791930863622e-07, "loss": 0.0356, "num_input_tokens_seen": 2738496, "step": 5580 }, { "epoch": 0.7370991157450179, "grad_norm": 0.6677089333534241, "learning_rate": 3.9274738625599137e-07, "loss": 0.002, "num_input_tokens_seen": 2740800, "step": 5585 }, { "epoch": 0.7377590075227662, "grad_norm": 0.42139413952827454, "learning_rate": 3.909188017375112e-07, "loss": 0.0746, "num_input_tokens_seen": 2743104, "step": 5590 }, { "epoch": 0.7384188993005147, "grad_norm": 0.15833111107349396, "learning_rate": 3.890934492340819e-07, "loss": 0.1553, "num_input_tokens_seen": 2745344, "step": 5595 }, { "epoch": 0.7390787910782631, "grad_norm": 19.225963592529297, "learning_rate": 3.872713384317147e-07, "loss": 0.062, "num_input_tokens_seen": 2747520, "step": 5600 }, { "epoch": 0.7397386828560116, "grad_norm": 0.0576261468231678, "learning_rate": 3.8545247899921776e-07, "loss": 0.1382, "num_input_tokens_seen": 2750016, "step": 5605 }, { "epoch": 0.74039857463376, "grad_norm": 0.09810295701026917, "learning_rate": 3.8363688058814614e-07, "loss": 0.1139, "num_input_tokens_seen": 2752704, "step": 5610 }, { "epoch": 0.7410584664115085, "grad_norm": 37.168209075927734, "learning_rate": 3.818245528327526e-07, "loss": 0.1544, "num_input_tokens_seen": 2755328, "step": 5615 }, { "epoch": 0.7417183581892569, "grad_norm": 12.660454750061035, "learning_rate": 3.8001550534993164e-07, "loss": 0.0911, "num_input_tokens_seen": 2757632, "step": 5620 }, { "epoch": 0.7423782499670054, "grad_norm": 0.21876884996891022, "learning_rate": 3.7820974773917413e-07, "loss": 0.0665, "num_input_tokens_seen": 2760192, "step": 5625 }, { "epoch": 0.7430381417447539, "grad_norm": 0.09194961190223694, "learning_rate": 3.764072895825117e-07, "loss": 0.001, "num_input_tokens_seen": 2762816, "step": 5630 }, { "epoch": 0.7436980335225023, "grad_norm": 11.263919830322266, "learning_rate": 3.7460814044446934e-07, "loss": 0.0625, "num_input_tokens_seen": 2765120, "step": 5635 }, { "epoch": 0.7443579253002508, "grad_norm": 0.1615023910999298, "learning_rate": 3.72812309872012e-07, "loss": 0.0989, "num_input_tokens_seen": 2767808, "step": 5640 }, { "epoch": 0.7450178170779992, "grad_norm": 3.175365447998047, "learning_rate": 3.71019807394495e-07, "loss": 0.0518, "num_input_tokens_seen": 2770176, "step": 5645 }, { "epoch": 0.7456777088557477, "grad_norm": 0.27845051884651184, "learning_rate": 3.6923064252361505e-07, "loss": 0.0983, "num_input_tokens_seen": 2772672, "step": 5650 }, { "epoch": 0.7463376006334961, "grad_norm": 0.15922772884368896, "learning_rate": 3.674448247533561e-07, "loss": 0.1089, "num_input_tokens_seen": 2775104, "step": 5655 }, { "epoch": 0.7469974924112446, "grad_norm": 0.11377550661563873, "learning_rate": 3.656623635599432e-07, "loss": 0.2327, "num_input_tokens_seen": 2777792, "step": 5660 }, { "epoch": 0.747657384188993, "grad_norm": 55.64900207519531, "learning_rate": 3.6388326840178865e-07, "loss": 0.1313, "num_input_tokens_seen": 2780416, "step": 5665 }, { "epoch": 0.7483172759667415, "grad_norm": 0.06009421497583389, "learning_rate": 3.621075487194435e-07, "loss": 0.0056, "num_input_tokens_seen": 2783232, "step": 5670 }, { "epoch": 0.7489771677444899, "grad_norm": 11.293038368225098, "learning_rate": 3.603352139355483e-07, "loss": 0.1054, "num_input_tokens_seen": 2785664, "step": 5675 }, { "epoch": 0.7496370595222384, "grad_norm": 8.784896850585938, "learning_rate": 3.58566273454781e-07, "loss": 0.0984, "num_input_tokens_seen": 2788224, "step": 5680 }, { "epoch": 0.7502969512999867, "grad_norm": 0.07352028787136078, "learning_rate": 3.5680073666380817e-07, "loss": 0.0014, "num_input_tokens_seen": 2790656, "step": 5685 }, { "epoch": 0.7502969512999867, "eval_loss": 0.0956902727484703, "eval_runtime": 7.594, "eval_samples_per_second": 886.88, "eval_steps_per_second": 110.876, "num_input_tokens_seen": 2790656, "step": 5685 }, { "epoch": 0.7509568430777352, "grad_norm": 0.028005223721265793, "learning_rate": 3.5503861293123514e-07, "loss": 0.1594, "num_input_tokens_seen": 2792960, "step": 5690 }, { "epoch": 0.7516167348554837, "grad_norm": 0.16075754165649414, "learning_rate": 3.532799116075571e-07, "loss": 0.0789, "num_input_tokens_seen": 2795648, "step": 5695 }, { "epoch": 0.7522766266332321, "grad_norm": 53.65882873535156, "learning_rate": 3.5152464202510777e-07, "loss": 0.098, "num_input_tokens_seen": 2797696, "step": 5700 }, { "epoch": 0.7529365184109806, "grad_norm": 15.231353759765625, "learning_rate": 3.4977281349801056e-07, "loss": 0.1334, "num_input_tokens_seen": 2800192, "step": 5705 }, { "epoch": 0.753596410188729, "grad_norm": 0.14780941605567932, "learning_rate": 3.4802443532213056e-07, "loss": 0.0133, "num_input_tokens_seen": 2802560, "step": 5710 }, { "epoch": 0.7542563019664775, "grad_norm": 0.030415428802371025, "learning_rate": 3.4627951677502233e-07, "loss": 0.2453, "num_input_tokens_seen": 2804992, "step": 5715 }, { "epoch": 0.7549161937442259, "grad_norm": 0.889872133731842, "learning_rate": 3.4453806711588397e-07, "loss": 0.0492, "num_input_tokens_seen": 2807296, "step": 5720 }, { "epoch": 0.7555760855219744, "grad_norm": 0.08172111958265305, "learning_rate": 3.428000955855054e-07, "loss": 0.0303, "num_input_tokens_seen": 2809984, "step": 5725 }, { "epoch": 0.7562359772997228, "grad_norm": 0.09841586649417877, "learning_rate": 3.4106561140621983e-07, "loss": 0.0023, "num_input_tokens_seen": 2812736, "step": 5730 }, { "epoch": 0.7568958690774713, "grad_norm": 0.37726613879203796, "learning_rate": 3.393346237818567e-07, "loss": 0.1465, "num_input_tokens_seen": 2815040, "step": 5735 }, { "epoch": 0.7575557608552197, "grad_norm": 0.7347794771194458, "learning_rate": 3.3760714189769015e-07, "loss": 0.1114, "num_input_tokens_seen": 2817344, "step": 5740 }, { "epoch": 0.7582156526329682, "grad_norm": 0.10844270884990692, "learning_rate": 3.3588317492039266e-07, "loss": 0.0357, "num_input_tokens_seen": 2819648, "step": 5745 }, { "epoch": 0.7588755444107167, "grad_norm": 46.741573333740234, "learning_rate": 3.341627319979834e-07, "loss": 0.1254, "num_input_tokens_seen": 2822464, "step": 5750 }, { "epoch": 0.7595354361884651, "grad_norm": 150.2995147705078, "learning_rate": 3.324458222597839e-07, "loss": 0.1943, "num_input_tokens_seen": 2824896, "step": 5755 }, { "epoch": 0.7601953279662136, "grad_norm": 0.15239302814006805, "learning_rate": 3.307324548163657e-07, "loss": 0.0749, "num_input_tokens_seen": 2827648, "step": 5760 }, { "epoch": 0.760855219743962, "grad_norm": 0.6753157377243042, "learning_rate": 3.2902263875950374e-07, "loss": 0.114, "num_input_tokens_seen": 2830336, "step": 5765 }, { "epoch": 0.7615151115217105, "grad_norm": 16.683015823364258, "learning_rate": 3.2731638316212894e-07, "loss": 0.0462, "num_input_tokens_seen": 2832640, "step": 5770 }, { "epoch": 0.7621750032994589, "grad_norm": 1.5054552555084229, "learning_rate": 3.256136970782782e-07, "loss": 0.049, "num_input_tokens_seen": 2834880, "step": 5775 }, { "epoch": 0.7628348950772074, "grad_norm": 0.033258408308029175, "learning_rate": 3.23914589543047e-07, "loss": 0.1447, "num_input_tokens_seen": 2837440, "step": 5780 }, { "epoch": 0.7634947868549558, "grad_norm": 103.95304107666016, "learning_rate": 3.2221906957254276e-07, "loss": 0.0424, "num_input_tokens_seen": 2839808, "step": 5785 }, { "epoch": 0.7641546786327043, "grad_norm": 5.235893726348877, "learning_rate": 3.205271461638346e-07, "loss": 0.1412, "num_input_tokens_seen": 2842432, "step": 5790 }, { "epoch": 0.7648145704104526, "grad_norm": 0.035734184086322784, "learning_rate": 3.188388282949085e-07, "loss": 0.1313, "num_input_tokens_seen": 2845120, "step": 5795 }, { "epoch": 0.7654744621882011, "grad_norm": 61.18632125854492, "learning_rate": 3.171541249246166e-07, "loss": 0.1633, "num_input_tokens_seen": 2848000, "step": 5800 }, { "epoch": 0.7661343539659495, "grad_norm": 14.463330268859863, "learning_rate": 3.154730449926316e-07, "loss": 0.161, "num_input_tokens_seen": 2850624, "step": 5805 }, { "epoch": 0.766794245743698, "grad_norm": 0.18341617286205292, "learning_rate": 3.137955974194e-07, "loss": 0.121, "num_input_tokens_seen": 2852992, "step": 5810 }, { "epoch": 0.7674541375214465, "grad_norm": 28.731979370117188, "learning_rate": 3.1212179110609125e-07, "loss": 0.1251, "num_input_tokens_seen": 2855424, "step": 5815 }, { "epoch": 0.7681140292991949, "grad_norm": 0.0925399586558342, "learning_rate": 3.104516349345553e-07, "loss": 0.137, "num_input_tokens_seen": 2857984, "step": 5820 }, { "epoch": 0.7687739210769434, "grad_norm": 0.09687471389770508, "learning_rate": 3.0878513776727144e-07, "loss": 0.0643, "num_input_tokens_seen": 2860672, "step": 5825 }, { "epoch": 0.7694338128546918, "grad_norm": 10.534875869750977, "learning_rate": 3.0712230844730414e-07, "loss": 0.1726, "num_input_tokens_seen": 2863040, "step": 5830 }, { "epoch": 0.7700937046324403, "grad_norm": 0.4192121624946594, "learning_rate": 3.054631557982539e-07, "loss": 0.0704, "num_input_tokens_seen": 2865856, "step": 5835 }, { "epoch": 0.7707535964101887, "grad_norm": 0.11545547842979431, "learning_rate": 3.0380768862421156e-07, "loss": 0.1005, "num_input_tokens_seen": 2868096, "step": 5840 }, { "epoch": 0.7714134881879372, "grad_norm": 0.13741333782672882, "learning_rate": 3.0215591570971234e-07, "loss": 0.0013, "num_input_tokens_seen": 2870784, "step": 5845 }, { "epoch": 0.7720733799656856, "grad_norm": 55.587005615234375, "learning_rate": 3.005078458196868e-07, "loss": 0.0712, "num_input_tokens_seen": 2873216, "step": 5850 }, { "epoch": 0.7727332717434341, "grad_norm": 0.19076700508594513, "learning_rate": 2.988634876994175e-07, "loss": 0.0011, "num_input_tokens_seen": 2875776, "step": 5855 }, { "epoch": 0.7733931635211825, "grad_norm": 0.12881390750408173, "learning_rate": 2.972228500744898e-07, "loss": 0.0336, "num_input_tokens_seen": 2878336, "step": 5860 }, { "epoch": 0.774053055298931, "grad_norm": 22.819622039794922, "learning_rate": 2.955859416507467e-07, "loss": 0.1431, "num_input_tokens_seen": 2880896, "step": 5865 }, { "epoch": 0.7747129470766794, "grad_norm": 0.040956467390060425, "learning_rate": 2.9395277111424357e-07, "loss": 0.0684, "num_input_tokens_seen": 2883648, "step": 5870 }, { "epoch": 0.7753728388544279, "grad_norm": 0.0455995537340641, "learning_rate": 2.9232334713120035e-07, "loss": 0.0016, "num_input_tokens_seen": 2885952, "step": 5875 }, { "epoch": 0.7760327306321764, "grad_norm": 0.3208160996437073, "learning_rate": 2.9069767834795655e-07, "loss": 0.0614, "num_input_tokens_seen": 2888576, "step": 5880 }, { "epoch": 0.7766926224099248, "grad_norm": 3.3780038356781006, "learning_rate": 2.8907577339092483e-07, "loss": 0.1331, "num_input_tokens_seen": 2891136, "step": 5885 }, { "epoch": 0.7773525141876733, "grad_norm": 0.030515162274241447, "learning_rate": 2.8745764086654654e-07, "loss": 0.0711, "num_input_tokens_seen": 2893696, "step": 5890 }, { "epoch": 0.7780124059654216, "grad_norm": 37.801578521728516, "learning_rate": 2.8584328936124424e-07, "loss": 0.0499, "num_input_tokens_seen": 2896512, "step": 5895 }, { "epoch": 0.7786722977431701, "grad_norm": 13.73175048828125, "learning_rate": 2.8423272744137674e-07, "loss": 0.1805, "num_input_tokens_seen": 2899008, "step": 5900 }, { "epoch": 0.7793321895209185, "grad_norm": 1.2914345264434814, "learning_rate": 2.82625963653195e-07, "loss": 0.0781, "num_input_tokens_seen": 2901376, "step": 5905 }, { "epoch": 0.779992081298667, "grad_norm": 8.905738830566406, "learning_rate": 2.810230065227944e-07, "loss": 0.1989, "num_input_tokens_seen": 2903872, "step": 5910 }, { "epoch": 0.7806519730764154, "grad_norm": 0.1153329461812973, "learning_rate": 2.7942386455607203e-07, "loss": 0.0016, "num_input_tokens_seen": 2906240, "step": 5915 }, { "epoch": 0.7813118648541639, "grad_norm": 0.40870046615600586, "learning_rate": 2.77828546238679e-07, "loss": 0.0695, "num_input_tokens_seen": 2908736, "step": 5920 }, { "epoch": 0.7819717566319123, "grad_norm": 50.935813903808594, "learning_rate": 2.762370600359774e-07, "loss": 0.1347, "num_input_tokens_seen": 2911104, "step": 5925 }, { "epoch": 0.7826316484096608, "grad_norm": 0.06911960244178772, "learning_rate": 2.7464941439299484e-07, "loss": 0.0614, "num_input_tokens_seen": 2913472, "step": 5930 }, { "epoch": 0.7832915401874093, "grad_norm": 12.452083587646484, "learning_rate": 2.7306561773437887e-07, "loss": 0.1583, "num_input_tokens_seen": 2915840, "step": 5935 }, { "epoch": 0.7839514319651577, "grad_norm": 0.09292475879192352, "learning_rate": 2.714856784643533e-07, "loss": 0.0047, "num_input_tokens_seen": 2918144, "step": 5940 }, { "epoch": 0.7846113237429062, "grad_norm": 0.06648958474397659, "learning_rate": 2.6990960496667313e-07, "loss": 0.1479, "num_input_tokens_seen": 2920768, "step": 5945 }, { "epoch": 0.7852712155206546, "grad_norm": 0.07045161724090576, "learning_rate": 2.6833740560457976e-07, "loss": 0.067, "num_input_tokens_seen": 2923136, "step": 5950 }, { "epoch": 0.7859311072984031, "grad_norm": 0.12214231491088867, "learning_rate": 2.6676908872075757e-07, "loss": 0.0702, "num_input_tokens_seen": 2925568, "step": 5955 }, { "epoch": 0.7865909990761515, "grad_norm": 0.0641525536775589, "learning_rate": 2.6520466263728836e-07, "loss": 0.0576, "num_input_tokens_seen": 2928064, "step": 5960 }, { "epoch": 0.7872508908539, "grad_norm": 38.231407165527344, "learning_rate": 2.636441356556087e-07, "loss": 0.2178, "num_input_tokens_seen": 2930368, "step": 5965 }, { "epoch": 0.7879107826316484, "grad_norm": 13.16163158416748, "learning_rate": 2.620875160564645e-07, "loss": 0.1005, "num_input_tokens_seen": 2932928, "step": 5970 }, { "epoch": 0.7885706744093969, "grad_norm": 208.31663513183594, "learning_rate": 2.6053481209986715e-07, "loss": 0.418, "num_input_tokens_seen": 2935360, "step": 5975 }, { "epoch": 0.7892305661871453, "grad_norm": 0.08345562219619751, "learning_rate": 2.5898603202505155e-07, "loss": 0.059, "num_input_tokens_seen": 2937920, "step": 5980 }, { "epoch": 0.7898904579648938, "grad_norm": 0.3885025382041931, "learning_rate": 2.5744118405042923e-07, "loss": 0.0502, "num_input_tokens_seen": 2940224, "step": 5985 }, { "epoch": 0.7905503497426422, "grad_norm": 0.12759974598884583, "learning_rate": 2.559002763735485e-07, "loss": 0.0017, "num_input_tokens_seen": 2942848, "step": 5990 }, { "epoch": 0.7912102415203907, "grad_norm": 0.2042687088251114, "learning_rate": 2.543633171710472e-07, "loss": 0.0591, "num_input_tokens_seen": 2945344, "step": 5995 }, { "epoch": 0.7918701332981392, "grad_norm": 2.7166707515716553, "learning_rate": 2.5283031459861205e-07, "loss": 0.0162, "num_input_tokens_seen": 2947840, "step": 6000 }, { "epoch": 0.7925300250758875, "grad_norm": 0.08386794477701187, "learning_rate": 2.5130127679093396e-07, "loss": 0.0344, "num_input_tokens_seen": 2950144, "step": 6005 }, { "epoch": 0.793189916853636, "grad_norm": 59.384368896484375, "learning_rate": 2.497762118616652e-07, "loss": 0.0428, "num_input_tokens_seen": 2952384, "step": 6010 }, { "epoch": 0.7938498086313844, "grad_norm": 0.061096593737602234, "learning_rate": 2.4825512790337745e-07, "loss": 0.0788, "num_input_tokens_seen": 2955136, "step": 6015 }, { "epoch": 0.7945097004091329, "grad_norm": 38.1906623840332, "learning_rate": 2.467380329875163e-07, "loss": 0.0441, "num_input_tokens_seen": 2957824, "step": 6020 }, { "epoch": 0.7951695921868813, "grad_norm": 0.729996919631958, "learning_rate": 2.452249351643615e-07, "loss": 0.0038, "num_input_tokens_seen": 2960256, "step": 6025 }, { "epoch": 0.7958294839646298, "grad_norm": 0.31032249331474304, "learning_rate": 2.437158424629817e-07, "loss": 0.0672, "num_input_tokens_seen": 2962944, "step": 6030 }, { "epoch": 0.7964893757423782, "grad_norm": 0.5417336821556091, "learning_rate": 2.422107628911929e-07, "loss": 0.2047, "num_input_tokens_seen": 2965504, "step": 6035 }, { "epoch": 0.7971492675201267, "grad_norm": 0.07609419524669647, "learning_rate": 2.4070970443551673e-07, "loss": 0.2335, "num_input_tokens_seen": 2967744, "step": 6040 }, { "epoch": 0.7978091592978751, "grad_norm": 0.5796427726745605, "learning_rate": 2.392126750611362e-07, "loss": 0.0017, "num_input_tokens_seen": 2970240, "step": 6045 }, { "epoch": 0.7984690510756236, "grad_norm": 19.56314468383789, "learning_rate": 2.3771968271185538e-07, "loss": 0.1777, "num_input_tokens_seen": 2972928, "step": 6050 }, { "epoch": 0.799128942853372, "grad_norm": 0.015146835707128048, "learning_rate": 2.3623073531005579e-07, "loss": 0.1485, "num_input_tokens_seen": 2975168, "step": 6055 }, { "epoch": 0.7997888346311205, "grad_norm": 0.11885405331850052, "learning_rate": 2.3474584075665493e-07, "loss": 0.1294, "num_input_tokens_seen": 2977408, "step": 6060 }, { "epoch": 0.8003167480533192, "eval_loss": 0.0954766720533371, "eval_runtime": 7.5442, "eval_samples_per_second": 892.739, "eval_steps_per_second": 111.609, "num_input_tokens_seen": 2979456, "step": 6064 }, { "epoch": 0.800448726408869, "grad_norm": 0.06911212205886841, "learning_rate": 2.3326500693106533e-07, "loss": 0.0013, "num_input_tokens_seen": 2979968, "step": 6065 }, { "epoch": 0.8011086181866174, "grad_norm": 82.93882751464844, "learning_rate": 2.3178824169114975e-07, "loss": 0.209, "num_input_tokens_seen": 2982528, "step": 6070 }, { "epoch": 0.8017685099643659, "grad_norm": 0.18306070566177368, "learning_rate": 2.303155528731837e-07, "loss": 0.0494, "num_input_tokens_seen": 2984832, "step": 6075 }, { "epoch": 0.8024284017421143, "grad_norm": 1.8438490629196167, "learning_rate": 2.2884694829181016e-07, "loss": 0.0014, "num_input_tokens_seen": 2987328, "step": 6080 }, { "epoch": 0.8030882935198628, "grad_norm": 0.09328246116638184, "learning_rate": 2.273824357400005e-07, "loss": 0.0083, "num_input_tokens_seen": 2989760, "step": 6085 }, { "epoch": 0.8037481852976112, "grad_norm": 0.17672888934612274, "learning_rate": 2.2592202298901174e-07, "loss": 0.0188, "num_input_tokens_seen": 2992320, "step": 6090 }, { "epoch": 0.8044080770753597, "grad_norm": 0.12940169870853424, "learning_rate": 2.2446571778834555e-07, "loss": 0.0014, "num_input_tokens_seen": 2995136, "step": 6095 }, { "epoch": 0.805067968853108, "grad_norm": 0.075173020362854, "learning_rate": 2.2301352786570827e-07, "loss": 0.0009, "num_input_tokens_seen": 2998016, "step": 6100 }, { "epoch": 0.8057278606308566, "grad_norm": 0.03360762447118759, "learning_rate": 2.215654609269685e-07, "loss": 0.1425, "num_input_tokens_seen": 3000832, "step": 6105 }, { "epoch": 0.8063877524086049, "grad_norm": 15.617521286010742, "learning_rate": 2.201215246561161e-07, "loss": 0.1461, "num_input_tokens_seen": 3003584, "step": 6110 }, { "epoch": 0.8070476441863534, "grad_norm": 62.255313873291016, "learning_rate": 2.1868172671522357e-07, "loss": 0.0738, "num_input_tokens_seen": 3006464, "step": 6115 }, { "epoch": 0.8077075359641019, "grad_norm": 0.16907618939876556, "learning_rate": 2.1724607474440216e-07, "loss": 0.0824, "num_input_tokens_seen": 3008896, "step": 6120 }, { "epoch": 0.8083674277418503, "grad_norm": 21.065229415893555, "learning_rate": 2.158145763617646e-07, "loss": 0.1463, "num_input_tokens_seen": 3011392, "step": 6125 }, { "epoch": 0.8090273195195988, "grad_norm": 0.23459585011005402, "learning_rate": 2.1438723916338198e-07, "loss": 0.2764, "num_input_tokens_seen": 3014016, "step": 6130 }, { "epoch": 0.8096872112973472, "grad_norm": 0.41196635365486145, "learning_rate": 2.1296407072324495e-07, "loss": 0.1715, "num_input_tokens_seen": 3016576, "step": 6135 }, { "epoch": 0.8103471030750957, "grad_norm": 163.61370849609375, "learning_rate": 2.1154507859322336e-07, "loss": 0.0432, "num_input_tokens_seen": 3019008, "step": 6140 }, { "epoch": 0.8110069948528441, "grad_norm": 36.33530044555664, "learning_rate": 2.101302703030252e-07, "loss": 0.1229, "num_input_tokens_seen": 3021504, "step": 6145 }, { "epoch": 0.8116668866305926, "grad_norm": 10.142012596130371, "learning_rate": 2.0871965336015885e-07, "loss": 0.0575, "num_input_tokens_seen": 3023552, "step": 6150 }, { "epoch": 0.812326778408341, "grad_norm": 0.6284022331237793, "learning_rate": 2.0731323524989031e-07, "loss": 0.0704, "num_input_tokens_seen": 3025856, "step": 6155 }, { "epoch": 0.8129866701860895, "grad_norm": 1.1452326774597168, "learning_rate": 2.0591102343520616e-07, "loss": 0.2049, "num_input_tokens_seen": 3028096, "step": 6160 }, { "epoch": 0.8136465619638379, "grad_norm": 44.43306350708008, "learning_rate": 2.0451302535677206e-07, "loss": 0.159, "num_input_tokens_seen": 3030528, "step": 6165 }, { "epoch": 0.8143064537415864, "grad_norm": 113.5491943359375, "learning_rate": 2.0311924843289396e-07, "loss": 0.227, "num_input_tokens_seen": 3033088, "step": 6170 }, { "epoch": 0.8149663455193348, "grad_norm": 0.11563657224178314, "learning_rate": 2.017297000594794e-07, "loss": 0.0642, "num_input_tokens_seen": 3035200, "step": 6175 }, { "epoch": 0.8156262372970833, "grad_norm": 0.15644113719463348, "learning_rate": 2.0034438760999696e-07, "loss": 0.0604, "num_input_tokens_seen": 3037696, "step": 6180 }, { "epoch": 0.8162861290748318, "grad_norm": 14.904664039611816, "learning_rate": 1.9896331843543856e-07, "loss": 0.1423, "num_input_tokens_seen": 3040128, "step": 6185 }, { "epoch": 0.8169460208525802, "grad_norm": 0.2976359724998474, "learning_rate": 1.975864998642789e-07, "loss": 0.1184, "num_input_tokens_seen": 3042560, "step": 6190 }, { "epoch": 0.8176059126303287, "grad_norm": 37.44635772705078, "learning_rate": 1.9621393920243767e-07, "loss": 0.2826, "num_input_tokens_seen": 3044800, "step": 6195 }, { "epoch": 0.8182658044080771, "grad_norm": 105.21582794189453, "learning_rate": 1.9484564373324074e-07, "loss": 0.1028, "num_input_tokens_seen": 3047040, "step": 6200 }, { "epoch": 0.8189256961858256, "grad_norm": 0.048264820128679276, "learning_rate": 1.934816207173805e-07, "loss": 0.0495, "num_input_tokens_seen": 3049600, "step": 6205 }, { "epoch": 0.819585587963574, "grad_norm": 0.17185015976428986, "learning_rate": 1.9212187739287943e-07, "loss": 0.158, "num_input_tokens_seen": 3052416, "step": 6210 }, { "epoch": 0.8202454797413224, "grad_norm": 0.28126591444015503, "learning_rate": 1.907664209750488e-07, "loss": 0.0135, "num_input_tokens_seen": 3055040, "step": 6215 }, { "epoch": 0.8209053715190708, "grad_norm": 0.4188820719718933, "learning_rate": 1.8941525865645336e-07, "loss": 0.0446, "num_input_tokens_seen": 3057856, "step": 6220 }, { "epoch": 0.8215652632968193, "grad_norm": 49.17670440673828, "learning_rate": 1.8806839760687076e-07, "loss": 0.2045, "num_input_tokens_seen": 3060160, "step": 6225 }, { "epoch": 0.8222251550745677, "grad_norm": 0.09683864563703537, "learning_rate": 1.867258449732545e-07, "loss": 0.1205, "num_input_tokens_seen": 3062592, "step": 6230 }, { "epoch": 0.8228850468523162, "grad_norm": 15.184978485107422, "learning_rate": 1.8538760787969676e-07, "loss": 0.0502, "num_input_tokens_seen": 3065088, "step": 6235 }, { "epoch": 0.8235449386300646, "grad_norm": 1.2835743427276611, "learning_rate": 1.8405369342738907e-07, "loss": 0.0019, "num_input_tokens_seen": 3067712, "step": 6240 }, { "epoch": 0.8242048304078131, "grad_norm": 122.01753997802734, "learning_rate": 1.8272410869458598e-07, "loss": 0.0876, "num_input_tokens_seen": 3070144, "step": 6245 }, { "epoch": 0.8248647221855616, "grad_norm": 0.3181722164154053, "learning_rate": 1.8139886073656653e-07, "loss": 0.2369, "num_input_tokens_seen": 3072448, "step": 6250 }, { "epoch": 0.82552461396331, "grad_norm": 0.28298959136009216, "learning_rate": 1.800779565855971e-07, "loss": 0.2066, "num_input_tokens_seen": 3075072, "step": 6255 }, { "epoch": 0.8261845057410585, "grad_norm": 0.08039449155330658, "learning_rate": 1.7876140325089463e-07, "loss": 0.0029, "num_input_tokens_seen": 3077376, "step": 6260 }, { "epoch": 0.8268443975188069, "grad_norm": 0.7492879629135132, "learning_rate": 1.774492077185883e-07, "loss": 0.1344, "num_input_tokens_seen": 3079808, "step": 6265 }, { "epoch": 0.8275042892965554, "grad_norm": 0.0355597622692585, "learning_rate": 1.7614137695168408e-07, "loss": 0.0009, "num_input_tokens_seen": 3082560, "step": 6270 }, { "epoch": 0.8281641810743038, "grad_norm": 0.3212199807167053, "learning_rate": 1.748379178900261e-07, "loss": 0.0705, "num_input_tokens_seen": 3084608, "step": 6275 }, { "epoch": 0.8288240728520523, "grad_norm": 17.22373390197754, "learning_rate": 1.7353883745026055e-07, "loss": 0.228, "num_input_tokens_seen": 3087104, "step": 6280 }, { "epoch": 0.8294839646298007, "grad_norm": 1.737422227859497, "learning_rate": 1.722441425257999e-07, "loss": 0.1102, "num_input_tokens_seen": 3089408, "step": 6285 }, { "epoch": 0.8301438564075492, "grad_norm": 0.23113372921943665, "learning_rate": 1.7095383998678402e-07, "loss": 0.0552, "num_input_tokens_seen": 3091776, "step": 6290 }, { "epoch": 0.8308037481852976, "grad_norm": 0.0835813581943512, "learning_rate": 1.6966793668004653e-07, "loss": 0.1083, "num_input_tokens_seen": 3094208, "step": 6295 }, { "epoch": 0.8314636399630461, "grad_norm": 0.1578727513551712, "learning_rate": 1.6838643942907625e-07, "loss": 0.0801, "num_input_tokens_seen": 3096768, "step": 6300 }, { "epoch": 0.8321235317407946, "grad_norm": 72.27742004394531, "learning_rate": 1.671093550339815e-07, "loss": 0.06, "num_input_tokens_seen": 3099456, "step": 6305 }, { "epoch": 0.832783423518543, "grad_norm": 0.853486180305481, "learning_rate": 1.6583669027145542e-07, "loss": 0.0046, "num_input_tokens_seen": 3102208, "step": 6310 }, { "epoch": 0.8334433152962915, "grad_norm": 0.05901863053441048, "learning_rate": 1.6456845189473767e-07, "loss": 0.0014, "num_input_tokens_seen": 3104896, "step": 6315 }, { "epoch": 0.8341032070740398, "grad_norm": 0.11249249428510666, "learning_rate": 1.6330464663358123e-07, "loss": 0.1178, "num_input_tokens_seen": 3107520, "step": 6320 }, { "epoch": 0.8347630988517883, "grad_norm": 0.057853005826473236, "learning_rate": 1.6204528119421346e-07, "loss": 0.0014, "num_input_tokens_seen": 3110144, "step": 6325 }, { "epoch": 0.8354229906295367, "grad_norm": 0.1531873196363449, "learning_rate": 1.607903622593042e-07, "loss": 0.0501, "num_input_tokens_seen": 3112768, "step": 6330 }, { "epoch": 0.8360828824072852, "grad_norm": 0.04447514936327934, "learning_rate": 1.5953989648792743e-07, "loss": 0.0007, "num_input_tokens_seen": 3115328, "step": 6335 }, { "epoch": 0.8367427741850336, "grad_norm": 0.14438007771968842, "learning_rate": 1.5829389051552678e-07, "loss": 0.0323, "num_input_tokens_seen": 3117888, "step": 6340 }, { "epoch": 0.8374026659627821, "grad_norm": 101.97913360595703, "learning_rate": 1.5705235095388136e-07, "loss": 0.038, "num_input_tokens_seen": 3120384, "step": 6345 }, { "epoch": 0.8380625577405305, "grad_norm": 0.08870197832584381, "learning_rate": 1.5581528439106907e-07, "loss": 0.0436, "num_input_tokens_seen": 3123008, "step": 6350 }, { "epoch": 0.838722449518279, "grad_norm": 0.36987948417663574, "learning_rate": 1.5458269739143292e-07, "loss": 0.0796, "num_input_tokens_seen": 3125504, "step": 6355 }, { "epoch": 0.8393823412960274, "grad_norm": 2.8769209384918213, "learning_rate": 1.5335459649554538e-07, "loss": 0.0025, "num_input_tokens_seen": 3127744, "step": 6360 }, { "epoch": 0.8400422330737759, "grad_norm": 0.1269061416387558, "learning_rate": 1.5213098822017357e-07, "loss": 0.1043, "num_input_tokens_seen": 3130048, "step": 6365 }, { "epoch": 0.8407021248515244, "grad_norm": 0.26083889603614807, "learning_rate": 1.50911879058246e-07, "loss": 0.0469, "num_input_tokens_seen": 3132480, "step": 6370 }, { "epoch": 0.8413620166292728, "grad_norm": 107.53121948242188, "learning_rate": 1.4969727547881628e-07, "loss": 0.1012, "num_input_tokens_seen": 3135104, "step": 6375 }, { "epoch": 0.8420219084070213, "grad_norm": 14.834338188171387, "learning_rate": 1.4848718392703052e-07, "loss": 0.1743, "num_input_tokens_seen": 3137344, "step": 6380 }, { "epoch": 0.8426818001847697, "grad_norm": 16.3470401763916, "learning_rate": 1.472816108240915e-07, "loss": 0.1728, "num_input_tokens_seen": 3140096, "step": 6385 }, { "epoch": 0.8433416919625182, "grad_norm": 0.3881288170814514, "learning_rate": 1.46080562567226e-07, "loss": 0.0782, "num_input_tokens_seen": 3142400, "step": 6390 }, { "epoch": 0.8440015837402666, "grad_norm": 0.1693449318408966, "learning_rate": 1.4488404552964993e-07, "loss": 0.0657, "num_input_tokens_seen": 3144512, "step": 6395 }, { "epoch": 0.8446614755180151, "grad_norm": 68.23955535888672, "learning_rate": 1.4369206606053463e-07, "loss": 0.0303, "num_input_tokens_seen": 3146944, "step": 6400 }, { "epoch": 0.8453213672957635, "grad_norm": 0.20148129761219025, "learning_rate": 1.425046304849742e-07, "loss": 0.0816, "num_input_tokens_seen": 3149376, "step": 6405 }, { "epoch": 0.845981259073512, "grad_norm": 0.5425065755844116, "learning_rate": 1.4132174510395024e-07, "loss": 0.1094, "num_input_tokens_seen": 3151744, "step": 6410 }, { "epoch": 0.8466411508512603, "grad_norm": 0.209197536110878, "learning_rate": 1.4014341619430003e-07, "loss": 0.0082, "num_input_tokens_seen": 3154112, "step": 6415 }, { "epoch": 0.8473010426290088, "grad_norm": 0.13178904354572296, "learning_rate": 1.3896965000868188e-07, "loss": 0.0082, "num_input_tokens_seen": 3156480, "step": 6420 }, { "epoch": 0.8479609344067572, "grad_norm": 33.16168975830078, "learning_rate": 1.3780045277554276e-07, "loss": 0.138, "num_input_tokens_seen": 3158784, "step": 6425 }, { "epoch": 0.8486208261845057, "grad_norm": 0.1764160841703415, "learning_rate": 1.3663583069908535e-07, "loss": 0.1674, "num_input_tokens_seen": 3161152, "step": 6430 }, { "epoch": 0.8492807179622542, "grad_norm": 0.02807113528251648, "learning_rate": 1.3547578995923447e-07, "loss": 0.0385, "num_input_tokens_seen": 3163776, "step": 6435 }, { "epoch": 0.8499406097400026, "grad_norm": 34.52450180053711, "learning_rate": 1.3432033671160458e-07, "loss": 0.1202, "num_input_tokens_seen": 3166272, "step": 6440 }, { "epoch": 0.8503365448066517, "eval_loss": 0.09701072424650192, "eval_runtime": 7.7873, "eval_samples_per_second": 864.874, "eval_steps_per_second": 108.125, "num_input_tokens_seen": 3167488, "step": 6443 }, { "epoch": 0.8506005015177511, "grad_norm": 1.1372543573379517, "learning_rate": 1.3316947708746762e-07, "loss": 0.0653, "num_input_tokens_seen": 3168640, "step": 6445 }, { "epoch": 0.8512603932954995, "grad_norm": 0.052345190197229385, "learning_rate": 1.3202321719371967e-07, "loss": 0.1256, "num_input_tokens_seen": 3171008, "step": 6450 }, { "epoch": 0.851920285073248, "grad_norm": 0.021850943565368652, "learning_rate": 1.3088156311284893e-07, "loss": 0.1099, "num_input_tokens_seen": 3173312, "step": 6455 }, { "epoch": 0.8525801768509964, "grad_norm": 0.08195928484201431, "learning_rate": 1.2974452090290322e-07, "loss": 0.2267, "num_input_tokens_seen": 3175808, "step": 6460 }, { "epoch": 0.8532400686287449, "grad_norm": 0.05846588686108589, "learning_rate": 1.2861209659745865e-07, "loss": 0.0888, "num_input_tokens_seen": 3178048, "step": 6465 }, { "epoch": 0.8538999604064933, "grad_norm": 0.08579205721616745, "learning_rate": 1.2748429620558654e-07, "loss": 0.0148, "num_input_tokens_seen": 3180544, "step": 6470 }, { "epoch": 0.8545598521842418, "grad_norm": 9.61294937133789, "learning_rate": 1.2636112571182167e-07, "loss": 0.1561, "num_input_tokens_seen": 3183040, "step": 6475 }, { "epoch": 0.8552197439619902, "grad_norm": 0.1357005089521408, "learning_rate": 1.2524259107613178e-07, "loss": 0.1766, "num_input_tokens_seen": 3185664, "step": 6480 }, { "epoch": 0.8558796357397387, "grad_norm": 15.378425598144531, "learning_rate": 1.2412869823388382e-07, "loss": 0.146, "num_input_tokens_seen": 3188672, "step": 6485 }, { "epoch": 0.8565395275174872, "grad_norm": 0.08732222765684128, "learning_rate": 1.2301945309581486e-07, "loss": 0.0385, "num_input_tokens_seen": 3191168, "step": 6490 }, { "epoch": 0.8571994192952356, "grad_norm": 1.1724306344985962, "learning_rate": 1.2191486154799846e-07, "loss": 0.0822, "num_input_tokens_seen": 3193664, "step": 6495 }, { "epoch": 0.8578593110729841, "grad_norm": 0.05099056288599968, "learning_rate": 1.208149294518147e-07, "loss": 0.001, "num_input_tokens_seen": 3196224, "step": 6500 }, { "epoch": 0.8585192028507325, "grad_norm": 0.06140226498246193, "learning_rate": 1.1971966264391954e-07, "loss": 0.1988, "num_input_tokens_seen": 3198784, "step": 6505 }, { "epoch": 0.859179094628481, "grad_norm": 0.07323023676872253, "learning_rate": 1.1862906693621233e-07, "loss": 0.1104, "num_input_tokens_seen": 3201472, "step": 6510 }, { "epoch": 0.8598389864062294, "grad_norm": 0.11436515301465988, "learning_rate": 1.1754314811580623e-07, "loss": 0.1169, "num_input_tokens_seen": 3203584, "step": 6515 }, { "epoch": 0.8604988781839779, "grad_norm": 0.09268505871295929, "learning_rate": 1.1646191194499655e-07, "loss": 0.0712, "num_input_tokens_seen": 3205888, "step": 6520 }, { "epoch": 0.8611587699617262, "grad_norm": 1.127016544342041, "learning_rate": 1.1538536416123168e-07, "loss": 0.1908, "num_input_tokens_seen": 3208000, "step": 6525 }, { "epoch": 0.8618186617394747, "grad_norm": 0.36033815145492554, "learning_rate": 1.1431351047708072e-07, "loss": 0.0208, "num_input_tokens_seen": 3210240, "step": 6530 }, { "epoch": 0.8624785535172231, "grad_norm": 46.454463958740234, "learning_rate": 1.1324635658020432e-07, "loss": 0.1363, "num_input_tokens_seen": 3212672, "step": 6535 }, { "epoch": 0.8631384452949716, "grad_norm": 9.648067474365234, "learning_rate": 1.1218390813332479e-07, "loss": 0.1361, "num_input_tokens_seen": 3215360, "step": 6540 }, { "epoch": 0.86379833707272, "grad_norm": 0.07266692072153091, "learning_rate": 1.1112617077419472e-07, "loss": 0.1234, "num_input_tokens_seen": 3218112, "step": 6545 }, { "epoch": 0.8644582288504685, "grad_norm": 0.30243945121765137, "learning_rate": 1.1007315011556884e-07, "loss": 0.0346, "num_input_tokens_seen": 3220288, "step": 6550 }, { "epoch": 0.865118120628217, "grad_norm": 0.1822911947965622, "learning_rate": 1.0902485174517251e-07, "loss": 0.0015, "num_input_tokens_seen": 3222976, "step": 6555 }, { "epoch": 0.8657780124059654, "grad_norm": 0.06488798558712006, "learning_rate": 1.0798128122567285e-07, "loss": 0.0725, "num_input_tokens_seen": 3225472, "step": 6560 }, { "epoch": 0.8664379041837139, "grad_norm": 0.4437669813632965, "learning_rate": 1.0694244409464992e-07, "loss": 0.1631, "num_input_tokens_seen": 3228096, "step": 6565 }, { "epoch": 0.8670977959614623, "grad_norm": 0.030641254037618637, "learning_rate": 1.0590834586456577e-07, "loss": 0.1158, "num_input_tokens_seen": 3230720, "step": 6570 }, { "epoch": 0.8677576877392108, "grad_norm": 149.95916748046875, "learning_rate": 1.0487899202273708e-07, "loss": 0.1239, "num_input_tokens_seen": 3233088, "step": 6575 }, { "epoch": 0.8684175795169592, "grad_norm": 26.74003791809082, "learning_rate": 1.0385438803130364e-07, "loss": 0.1255, "num_input_tokens_seen": 3235712, "step": 6580 }, { "epoch": 0.8690774712947077, "grad_norm": 0.10586915165185928, "learning_rate": 1.0283453932720199e-07, "loss": 0.1423, "num_input_tokens_seen": 3238528, "step": 6585 }, { "epoch": 0.8697373630724561, "grad_norm": 0.10404416173696518, "learning_rate": 1.0181945132213476e-07, "loss": 0.0738, "num_input_tokens_seen": 3240896, "step": 6590 }, { "epoch": 0.8703972548502046, "grad_norm": 0.10048986971378326, "learning_rate": 1.0080912940254227e-07, "loss": 0.0016, "num_input_tokens_seen": 3243392, "step": 6595 }, { "epoch": 0.871057146627953, "grad_norm": 0.277065247297287, "learning_rate": 9.980357892957492e-08, "loss": 0.0041, "num_input_tokens_seen": 3245824, "step": 6600 }, { "epoch": 0.8717170384057015, "grad_norm": 7.954074859619141, "learning_rate": 9.880280523906337e-08, "loss": 0.0031, "num_input_tokens_seen": 3248128, "step": 6605 }, { "epoch": 0.8723769301834499, "grad_norm": 13.635552406311035, "learning_rate": 9.780681364149091e-08, "loss": 0.1351, "num_input_tokens_seen": 3250624, "step": 6610 }, { "epoch": 0.8730368219611984, "grad_norm": 0.3135831356048584, "learning_rate": 9.681560942196587e-08, "loss": 0.1127, "num_input_tokens_seen": 3253312, "step": 6615 }, { "epoch": 0.8736967137389469, "grad_norm": 0.04287222400307655, "learning_rate": 9.582919784019194e-08, "loss": 0.1168, "num_input_tokens_seen": 3255488, "step": 6620 }, { "epoch": 0.8743566055166953, "grad_norm": 0.04105079546570778, "learning_rate": 9.484758413044236e-08, "loss": 0.0668, "num_input_tokens_seen": 3257664, "step": 6625 }, { "epoch": 0.8750164972944438, "grad_norm": 0.12905164062976837, "learning_rate": 9.387077350153017e-08, "loss": 0.0542, "num_input_tokens_seen": 3260160, "step": 6630 }, { "epoch": 0.8756763890721921, "grad_norm": 13.714322090148926, "learning_rate": 9.289877113678168e-08, "loss": 0.0616, "num_input_tokens_seen": 3262528, "step": 6635 }, { "epoch": 0.8763362808499406, "grad_norm": 0.020841121673583984, "learning_rate": 9.19315821940092e-08, "loss": 0.0576, "num_input_tokens_seen": 3265024, "step": 6640 }, { "epoch": 0.876996172627689, "grad_norm": 0.17767778038978577, "learning_rate": 9.096921180548234e-08, "loss": 0.1659, "num_input_tokens_seen": 3267456, "step": 6645 }, { "epoch": 0.8776560644054375, "grad_norm": 0.2553451657295227, "learning_rate": 9.001166507790259e-08, "loss": 0.0915, "num_input_tokens_seen": 3270208, "step": 6650 }, { "epoch": 0.8783159561831859, "grad_norm": 12.365303993225098, "learning_rate": 8.905894709237427e-08, "loss": 0.1045, "num_input_tokens_seen": 3272960, "step": 6655 }, { "epoch": 0.8789758479609344, "grad_norm": 0.07707412540912628, "learning_rate": 8.811106290437975e-08, "loss": 0.0736, "num_input_tokens_seen": 3275136, "step": 6660 }, { "epoch": 0.8796357397386828, "grad_norm": 169.89840698242188, "learning_rate": 8.716801754375036e-08, "loss": 0.1122, "num_input_tokens_seen": 3277696, "step": 6665 }, { "epoch": 0.8802956315164313, "grad_norm": 12.09985065460205, "learning_rate": 8.62298160146413e-08, "loss": 0.1268, "num_input_tokens_seen": 3280064, "step": 6670 }, { "epoch": 0.8809555232941798, "grad_norm": 0.17147305607795715, "learning_rate": 8.529646329550466e-08, "loss": 0.002, "num_input_tokens_seen": 3282304, "step": 6675 }, { "epoch": 0.8816154150719282, "grad_norm": 0.1868370920419693, "learning_rate": 8.436796433906235e-08, "loss": 0.0268, "num_input_tokens_seen": 3284736, "step": 6680 }, { "epoch": 0.8822753068496767, "grad_norm": 16.801742553710938, "learning_rate": 8.344432407228141e-08, "loss": 0.0431, "num_input_tokens_seen": 3287168, "step": 6685 }, { "epoch": 0.8829351986274251, "grad_norm": 1.1702982187271118, "learning_rate": 8.252554739634577e-08, "loss": 0.0486, "num_input_tokens_seen": 3289600, "step": 6690 }, { "epoch": 0.8835950904051736, "grad_norm": 0.043459370732307434, "learning_rate": 8.16116391866316e-08, "loss": 0.0731, "num_input_tokens_seen": 3292160, "step": 6695 }, { "epoch": 0.884254982182922, "grad_norm": 17.922903060913086, "learning_rate": 8.070260429268172e-08, "loss": 0.1312, "num_input_tokens_seen": 3294592, "step": 6700 }, { "epoch": 0.8849148739606705, "grad_norm": 129.87510681152344, "learning_rate": 7.979844753817855e-08, "loss": 0.0078, "num_input_tokens_seen": 3296960, "step": 6705 }, { "epoch": 0.8855747657384189, "grad_norm": 29.814653396606445, "learning_rate": 7.889917372091982e-08, "loss": 0.0772, "num_input_tokens_seen": 3299200, "step": 6710 }, { "epoch": 0.8862346575161674, "grad_norm": 8.637088775634766, "learning_rate": 7.800478761279183e-08, "loss": 0.2034, "num_input_tokens_seen": 3301568, "step": 6715 }, { "epoch": 0.8868945492939158, "grad_norm": 11.602696418762207, "learning_rate": 7.711529395974592e-08, "loss": 0.1794, "num_input_tokens_seen": 3304064, "step": 6720 }, { "epoch": 0.8875544410716643, "grad_norm": 0.04998312518000603, "learning_rate": 7.623069748177135e-08, "loss": 0.1778, "num_input_tokens_seen": 3306432, "step": 6725 }, { "epoch": 0.8882143328494126, "grad_norm": 0.4295664131641388, "learning_rate": 7.535100287287111e-08, "loss": 0.1002, "num_input_tokens_seen": 3308736, "step": 6730 }, { "epoch": 0.8888742246271611, "grad_norm": 0.11967656761407852, "learning_rate": 7.447621480103783e-08, "loss": 0.0022, "num_input_tokens_seen": 3311168, "step": 6735 }, { "epoch": 0.8895341164049096, "grad_norm": 17.428560256958008, "learning_rate": 7.360633790822713e-08, "loss": 0.2822, "num_input_tokens_seen": 3313664, "step": 6740 }, { "epoch": 0.890194008182658, "grad_norm": 0.2180459350347519, "learning_rate": 7.274137681033498e-08, "loss": 0.022, "num_input_tokens_seen": 3316224, "step": 6745 }, { "epoch": 0.8908538999604065, "grad_norm": 0.13484865427017212, "learning_rate": 7.188133609717184e-08, "loss": 0.0855, "num_input_tokens_seen": 3318464, "step": 6750 }, { "epoch": 0.8915137917381549, "grad_norm": 0.0493309311568737, "learning_rate": 7.102622033243843e-08, "loss": 0.0011, "num_input_tokens_seen": 3320896, "step": 6755 }, { "epoch": 0.8921736835159034, "grad_norm": 0.22488893568515778, "learning_rate": 7.017603405370276e-08, "loss": 0.1368, "num_input_tokens_seen": 3323648, "step": 6760 }, { "epoch": 0.8928335752936518, "grad_norm": 0.15953336656093597, "learning_rate": 6.933078177237429e-08, "loss": 0.1476, "num_input_tokens_seen": 3326208, "step": 6765 }, { "epoch": 0.8934934670714003, "grad_norm": 0.4283379912376404, "learning_rate": 6.849046797368108e-08, "loss": 0.0651, "num_input_tokens_seen": 3328576, "step": 6770 }, { "epoch": 0.8941533588491487, "grad_norm": 28.798320770263672, "learning_rate": 6.765509711664574e-08, "loss": 0.003, "num_input_tokens_seen": 3331520, "step": 6775 }, { "epoch": 0.8948132506268972, "grad_norm": 0.33185452222824097, "learning_rate": 6.682467363406174e-08, "loss": 0.0235, "num_input_tokens_seen": 3334336, "step": 6780 }, { "epoch": 0.8954731424046456, "grad_norm": 0.24480366706848145, "learning_rate": 6.59992019324701e-08, "loss": 0.0671, "num_input_tokens_seen": 3336896, "step": 6785 }, { "epoch": 0.8961330341823941, "grad_norm": 9.714395523071289, "learning_rate": 6.517868639213553e-08, "loss": 0.1574, "num_input_tokens_seen": 3339328, "step": 6790 }, { "epoch": 0.8967929259601425, "grad_norm": 0.48568111658096313, "learning_rate": 6.436313136702387e-08, "loss": 0.0331, "num_input_tokens_seen": 3341760, "step": 6795 }, { "epoch": 0.897452817737891, "grad_norm": 0.3631482720375061, "learning_rate": 6.355254118477815e-08, "loss": 0.0527, "num_input_tokens_seen": 3344448, "step": 6800 }, { "epoch": 0.8981127095156395, "grad_norm": 0.10991880297660828, "learning_rate": 6.274692014669602e-08, "loss": 0.0009, "num_input_tokens_seen": 3347008, "step": 6805 }, { "epoch": 0.8987726012933879, "grad_norm": 0.15773239731788635, "learning_rate": 6.194627252770768e-08, "loss": 0.0008, "num_input_tokens_seen": 3349824, "step": 6810 }, { "epoch": 0.8994324930711364, "grad_norm": 0.0758163183927536, "learning_rate": 6.115060257635174e-08, "loss": 0.0687, "num_input_tokens_seen": 3352320, "step": 6815 }, { "epoch": 0.9000923848488848, "grad_norm": 0.21164242923259735, "learning_rate": 6.035991451475375e-08, "loss": 0.0013, "num_input_tokens_seen": 3354688, "step": 6820 }, { "epoch": 0.9003563415599841, "eval_loss": 0.09568765014410019, "eval_runtime": 7.581, "eval_samples_per_second": 888.409, "eval_steps_per_second": 111.068, "num_input_tokens_seen": 3355520, "step": 6822 }, { "epoch": 0.9007522766266333, "grad_norm": 0.030890563502907753, "learning_rate": 5.9574212538603505e-08, "loss": 0.0891, "num_input_tokens_seen": 3357056, "step": 6825 }, { "epoch": 0.9014121684043817, "grad_norm": 0.39177563786506653, "learning_rate": 5.879350081713252e-08, "loss": 0.0683, "num_input_tokens_seen": 3359488, "step": 6830 }, { "epoch": 0.9020720601821302, "grad_norm": 0.23050019145011902, "learning_rate": 5.8017783493092386e-08, "loss": 0.2249, "num_input_tokens_seen": 3361920, "step": 6835 }, { "epoch": 0.9027319519598785, "grad_norm": 0.1468856930732727, "learning_rate": 5.7247064682732104e-08, "loss": 0.0018, "num_input_tokens_seen": 3364416, "step": 6840 }, { "epoch": 0.903391843737627, "grad_norm": 0.22081144154071808, "learning_rate": 5.6481348475777566e-08, "loss": 0.0617, "num_input_tokens_seen": 3366912, "step": 6845 }, { "epoch": 0.9040517355153754, "grad_norm": 0.021701961755752563, "learning_rate": 5.5720638935407796e-08, "loss": 0.0014, "num_input_tokens_seen": 3369088, "step": 6850 }, { "epoch": 0.9047116272931239, "grad_norm": 0.013656373135745525, "learning_rate": 5.49649400982356e-08, "loss": 0.1392, "num_input_tokens_seen": 3371520, "step": 6855 }, { "epoch": 0.9053715190708723, "grad_norm": 0.04417372867465019, "learning_rate": 5.421425597428442e-08, "loss": 0.0007, "num_input_tokens_seen": 3374080, "step": 6860 }, { "epoch": 0.9060314108486208, "grad_norm": 126.34750366210938, "learning_rate": 5.346859054696784e-08, "loss": 0.0786, "num_input_tokens_seen": 3376640, "step": 6865 }, { "epoch": 0.9066913026263693, "grad_norm": 0.02389339543879032, "learning_rate": 5.2727947773068773e-08, "loss": 0.0794, "num_input_tokens_seen": 3379072, "step": 6870 }, { "epoch": 0.9073511944041177, "grad_norm": 0.42352914810180664, "learning_rate": 5.199233158271732e-08, "loss": 0.0732, "num_input_tokens_seen": 3381696, "step": 6875 }, { "epoch": 0.9080110861818662, "grad_norm": 11.932153701782227, "learning_rate": 5.126174587937149e-08, "loss": 0.2058, "num_input_tokens_seen": 3384064, "step": 6880 }, { "epoch": 0.9086709779596146, "grad_norm": 0.0787430927157402, "learning_rate": 5.053619453979485e-08, "loss": 0.0036, "num_input_tokens_seen": 3386304, "step": 6885 }, { "epoch": 0.9093308697373631, "grad_norm": 0.03228071704506874, "learning_rate": 4.9815681414037025e-08, "loss": 0.1486, "num_input_tokens_seen": 3388800, "step": 6890 }, { "epoch": 0.9099907615151115, "grad_norm": 0.38972869515419006, "learning_rate": 4.910021032541334e-08, "loss": 0.0886, "num_input_tokens_seen": 3391232, "step": 6895 }, { "epoch": 0.91065065329286, "grad_norm": 29.313077926635742, "learning_rate": 4.838978507048319e-08, "loss": 0.0815, "num_input_tokens_seen": 3393664, "step": 6900 }, { "epoch": 0.9113105450706084, "grad_norm": 2.1044397354125977, "learning_rate": 4.768440941903207e-08, "loss": 0.0055, "num_input_tokens_seen": 3395968, "step": 6905 }, { "epoch": 0.9119704368483569, "grad_norm": 0.08754704892635345, "learning_rate": 4.698408711404944e-08, "loss": 0.0122, "num_input_tokens_seen": 3398272, "step": 6910 }, { "epoch": 0.9126303286261053, "grad_norm": 19.10022735595703, "learning_rate": 4.628882187171046e-08, "loss": 0.0763, "num_input_tokens_seen": 3400960, "step": 6915 }, { "epoch": 0.9132902204038538, "grad_norm": 20.788782119750977, "learning_rate": 4.559861738135506e-08, "loss": 0.155, "num_input_tokens_seen": 3403520, "step": 6920 }, { "epoch": 0.9139501121816023, "grad_norm": 1.3679563999176025, "learning_rate": 4.491347730546913e-08, "loss": 0.229, "num_input_tokens_seen": 3405952, "step": 6925 }, { "epoch": 0.9146100039593507, "grad_norm": 18.096542358398438, "learning_rate": 4.423340527966512e-08, "loss": 0.128, "num_input_tokens_seen": 3408320, "step": 6930 }, { "epoch": 0.9152698957370992, "grad_norm": 0.17555084824562073, "learning_rate": 4.355840491266205e-08, "loss": 0.0052, "num_input_tokens_seen": 3410880, "step": 6935 }, { "epoch": 0.9159297875148475, "grad_norm": 0.056320879608392715, "learning_rate": 4.288847978626686e-08, "loss": 0.0576, "num_input_tokens_seen": 3413440, "step": 6940 }, { "epoch": 0.916589679292596, "grad_norm": 26.998863220214844, "learning_rate": 4.222363345535585e-08, "loss": 0.1275, "num_input_tokens_seen": 3416000, "step": 6945 }, { "epoch": 0.9172495710703444, "grad_norm": 12.58722972869873, "learning_rate": 4.1563869447854505e-08, "loss": 0.1253, "num_input_tokens_seen": 3418240, "step": 6950 }, { "epoch": 0.9179094628480929, "grad_norm": 0.30387794971466064, "learning_rate": 4.090919126472048e-08, "loss": 0.1407, "num_input_tokens_seen": 3420672, "step": 6955 }, { "epoch": 0.9185693546258413, "grad_norm": 65.01815795898438, "learning_rate": 4.025960237992332e-08, "loss": 0.0538, "num_input_tokens_seen": 3422912, "step": 6960 }, { "epoch": 0.9192292464035898, "grad_norm": 7.150808334350586, "learning_rate": 3.961510624042741e-08, "loss": 0.0027, "num_input_tokens_seen": 3425408, "step": 6965 }, { "epoch": 0.9198891381813382, "grad_norm": 14.636774063110352, "learning_rate": 3.8975706266172636e-08, "loss": 0.1111, "num_input_tokens_seen": 3427776, "step": 6970 }, { "epoch": 0.9205490299590867, "grad_norm": 41.149513244628906, "learning_rate": 3.834140585005696e-08, "loss": 0.0538, "num_input_tokens_seen": 3430336, "step": 6975 }, { "epoch": 0.9212089217368351, "grad_norm": 37.5268669128418, "learning_rate": 3.771220835791844e-08, "loss": 0.2688, "num_input_tokens_seen": 3432896, "step": 6980 }, { "epoch": 0.9218688135145836, "grad_norm": 0.18734599649906158, "learning_rate": 3.708811712851634e-08, "loss": 0.0703, "num_input_tokens_seen": 3435136, "step": 6985 }, { "epoch": 0.9225287052923321, "grad_norm": 0.09961698204278946, "learning_rate": 3.6469135473514936e-08, "loss": 0.0604, "num_input_tokens_seen": 3437824, "step": 6990 }, { "epoch": 0.9231885970700805, "grad_norm": 0.04659373685717583, "learning_rate": 3.5855266677464744e-08, "loss": 0.0066, "num_input_tokens_seen": 3440320, "step": 6995 }, { "epoch": 0.923848488847829, "grad_norm": 0.21239009499549866, "learning_rate": 3.524651399778555e-08, "loss": 0.0499, "num_input_tokens_seen": 3442880, "step": 7000 }, { "epoch": 0.9245083806255774, "grad_norm": 0.08486049622297287, "learning_rate": 3.4642880664749296e-08, "loss": 0.0009, "num_input_tokens_seen": 3445120, "step": 7005 }, { "epoch": 0.9251682724033259, "grad_norm": 0.2830374538898468, "learning_rate": 3.404436988146242e-08, "loss": 0.1758, "num_input_tokens_seen": 3447424, "step": 7010 }, { "epoch": 0.9258281641810743, "grad_norm": 0.012739721685647964, "learning_rate": 3.345098482384956e-08, "loss": 0.0461, "num_input_tokens_seen": 3449920, "step": 7015 }, { "epoch": 0.9264880559588228, "grad_norm": 0.5981858968734741, "learning_rate": 3.2862728640636105e-08, "loss": 0.0499, "num_input_tokens_seen": 3452416, "step": 7020 }, { "epoch": 0.9271479477365712, "grad_norm": 16.553138732910156, "learning_rate": 3.227960445333155e-08, "loss": 0.1119, "num_input_tokens_seen": 3454912, "step": 7025 }, { "epoch": 0.9278078395143197, "grad_norm": 0.03474080190062523, "learning_rate": 3.1701615356213295e-08, "loss": 0.0654, "num_input_tokens_seen": 3457472, "step": 7030 }, { "epoch": 0.928467731292068, "grad_norm": 0.11611025035381317, "learning_rate": 3.112876441630985e-08, "loss": 0.0654, "num_input_tokens_seen": 3459712, "step": 7035 }, { "epoch": 0.9291276230698166, "grad_norm": 0.19927047193050385, "learning_rate": 3.05610546733851e-08, "loss": 0.0532, "num_input_tokens_seen": 3462144, "step": 7040 }, { "epoch": 0.9297875148475649, "grad_norm": 13.10682201385498, "learning_rate": 2.99984891399212e-08, "loss": 0.2881, "num_input_tokens_seen": 3464512, "step": 7045 }, { "epoch": 0.9304474066253134, "grad_norm": 0.17246191203594208, "learning_rate": 2.9441070801103808e-08, "loss": 0.0061, "num_input_tokens_seen": 3466880, "step": 7050 }, { "epoch": 0.931107298403062, "grad_norm": 0.28195682168006897, "learning_rate": 2.8888802614805085e-08, "loss": 0.1035, "num_input_tokens_seen": 3469248, "step": 7055 }, { "epoch": 0.9317671901808103, "grad_norm": 41.38626480102539, "learning_rate": 2.8341687511568734e-08, "loss": 0.2707, "num_input_tokens_seen": 3471616, "step": 7060 }, { "epoch": 0.9324270819585588, "grad_norm": 0.20374363660812378, "learning_rate": 2.7799728394594547e-08, "loss": 0.0773, "num_input_tokens_seen": 3474240, "step": 7065 }, { "epoch": 0.9330869737363072, "grad_norm": 0.10206926614046097, "learning_rate": 2.7262928139722198e-08, "loss": 0.0759, "num_input_tokens_seen": 3476800, "step": 7070 }, { "epoch": 0.9337468655140557, "grad_norm": 0.04854326695203781, "learning_rate": 2.673128959541693e-08, "loss": 0.0879, "num_input_tokens_seen": 3479488, "step": 7075 }, { "epoch": 0.9344067572918041, "grad_norm": 0.021472515538334846, "learning_rate": 2.620481558275367e-08, "loss": 0.0007, "num_input_tokens_seen": 3482176, "step": 7080 }, { "epoch": 0.9350666490695526, "grad_norm": 69.08782958984375, "learning_rate": 2.5683508895402382e-08, "loss": 0.0318, "num_input_tokens_seen": 3484800, "step": 7085 }, { "epoch": 0.935726540847301, "grad_norm": 0.1581341028213501, "learning_rate": 2.5167372299613853e-08, "loss": 0.1076, "num_input_tokens_seen": 3487488, "step": 7090 }, { "epoch": 0.9363864326250495, "grad_norm": 11.627638816833496, "learning_rate": 2.4656408534203365e-08, "loss": 0.238, "num_input_tokens_seen": 3489728, "step": 7095 }, { "epoch": 0.9370463244027979, "grad_norm": 0.025092612951993942, "learning_rate": 2.4150620310538273e-08, "loss": 0.2424, "num_input_tokens_seen": 3491904, "step": 7100 }, { "epoch": 0.9377062161805464, "grad_norm": 12.157607078552246, "learning_rate": 2.3650010312521673e-08, "loss": 0.0751, "num_input_tokens_seen": 3494592, "step": 7105 }, { "epoch": 0.9383661079582949, "grad_norm": 0.0817142128944397, "learning_rate": 2.3154581196579648e-08, "loss": 0.1782, "num_input_tokens_seen": 3497088, "step": 7110 }, { "epoch": 0.9390259997360433, "grad_norm": 0.06925242394208908, "learning_rate": 2.2664335591646377e-08, "loss": 0.0552, "num_input_tokens_seen": 3499520, "step": 7115 }, { "epoch": 0.9396858915137918, "grad_norm": 0.029523100703954697, "learning_rate": 2.2179276099150158e-08, "loss": 0.1962, "num_input_tokens_seen": 3502208, "step": 7120 }, { "epoch": 0.9403457832915402, "grad_norm": 121.08486938476562, "learning_rate": 2.1699405293000182e-08, "loss": 0.1811, "num_input_tokens_seen": 3504640, "step": 7125 }, { "epoch": 0.9410056750692887, "grad_norm": 0.1253107488155365, "learning_rate": 2.1224725719572235e-08, "loss": 0.0653, "num_input_tokens_seen": 3506944, "step": 7130 }, { "epoch": 0.9416655668470371, "grad_norm": 46.052162170410156, "learning_rate": 2.0755239897695453e-08, "loss": 0.1533, "num_input_tokens_seen": 3509376, "step": 7135 }, { "epoch": 0.9423254586247856, "grad_norm": 0.4726586639881134, "learning_rate": 2.0290950318639256e-08, "loss": 0.1645, "num_input_tokens_seen": 3511680, "step": 7140 }, { "epoch": 0.942985350402534, "grad_norm": 3.1492843627929688, "learning_rate": 1.983185944609944e-08, "loss": 0.0611, "num_input_tokens_seen": 3514112, "step": 7145 }, { "epoch": 0.9436452421802825, "grad_norm": 0.20620296895503998, "learning_rate": 1.9377969716185994e-08, "loss": 0.0665, "num_input_tokens_seen": 3516480, "step": 7150 }, { "epoch": 0.9443051339580308, "grad_norm": 0.07421538978815079, "learning_rate": 1.8929283537408968e-08, "loss": 0.1162, "num_input_tokens_seen": 3518720, "step": 7155 }, { "epoch": 0.9449650257357793, "grad_norm": 0.12716051936149597, "learning_rate": 1.848580329066718e-08, "loss": 0.0086, "num_input_tokens_seen": 3521216, "step": 7160 }, { "epoch": 0.9456249175135277, "grad_norm": 18.4110164642334, "learning_rate": 1.804753132923431e-08, "loss": 0.3859, "num_input_tokens_seen": 3523776, "step": 7165 }, { "epoch": 0.9462848092912762, "grad_norm": 0.3863736093044281, "learning_rate": 1.7614469978746827e-08, "loss": 0.0012, "num_input_tokens_seen": 3526272, "step": 7170 }, { "epoch": 0.9469447010690247, "grad_norm": 67.91719818115234, "learning_rate": 1.7186621537192304e-08, "loss": 0.0324, "num_input_tokens_seen": 3528576, "step": 7175 }, { "epoch": 0.9476045928467731, "grad_norm": 0.12099135667085648, "learning_rate": 1.6763988274896003e-08, "loss": 0.0012, "num_input_tokens_seen": 3531136, "step": 7180 }, { "epoch": 0.9482644846245216, "grad_norm": 14.467368125915527, "learning_rate": 1.6346572434509876e-08, "loss": 0.1503, "num_input_tokens_seen": 3533696, "step": 7185 }, { "epoch": 0.94892437640227, "grad_norm": 0.32109934091567993, "learning_rate": 1.5934376231000248e-08, "loss": 0.1569, "num_input_tokens_seen": 3536064, "step": 7190 }, { "epoch": 0.9495842681800185, "grad_norm": 0.11669757217168808, "learning_rate": 1.55274018516357e-08, "loss": 0.0044, "num_input_tokens_seen": 3538432, "step": 7195 }, { "epoch": 0.9502441599577669, "grad_norm": 0.06187443807721138, "learning_rate": 1.512565145597633e-08, "loss": 0.05, "num_input_tokens_seen": 3541120, "step": 7200 }, { "epoch": 0.9503761383133166, "eval_loss": 0.09555233269929886, "eval_runtime": 7.635, "eval_samples_per_second": 882.126, "eval_steps_per_second": 110.282, "num_input_tokens_seen": 3541632, "step": 7201 }, { "epoch": 0.9509040517355154, "grad_norm": 72.59228515625, "learning_rate": 1.47291271758615e-08, "loss": 0.0498, "num_input_tokens_seen": 3543680, "step": 7205 }, { "epoch": 0.9515639435132638, "grad_norm": 107.97425079345703, "learning_rate": 1.4337831115398991e-08, "loss": 0.1477, "num_input_tokens_seen": 3545984, "step": 7210 }, { "epoch": 0.9522238352910123, "grad_norm": 17.91878318786621, "learning_rate": 1.3951765350953548e-08, "loss": 0.1276, "num_input_tokens_seen": 3548544, "step": 7215 }, { "epoch": 0.9528837270687607, "grad_norm": 0.03271764516830444, "learning_rate": 1.3570931931136009e-08, "loss": 0.1596, "num_input_tokens_seen": 3551040, "step": 7220 }, { "epoch": 0.9535436188465092, "grad_norm": 0.11517995595932007, "learning_rate": 1.3195332876792532e-08, "loss": 0.0839, "num_input_tokens_seen": 3553536, "step": 7225 }, { "epoch": 0.9542035106242576, "grad_norm": 0.08386459946632385, "learning_rate": 1.2824970180993488e-08, "loss": 0.1149, "num_input_tokens_seen": 3555712, "step": 7230 }, { "epoch": 0.9548634024020061, "grad_norm": 0.11899381130933762, "learning_rate": 1.2459845809023484e-08, "loss": 0.1233, "num_input_tokens_seen": 3558080, "step": 7235 }, { "epoch": 0.9555232941797546, "grad_norm": 25.366180419921875, "learning_rate": 1.2099961698370353e-08, "loss": 0.3036, "num_input_tokens_seen": 3560640, "step": 7240 }, { "epoch": 0.956183185957503, "grad_norm": 11.879151344299316, "learning_rate": 1.1745319758715288e-08, "loss": 0.0906, "num_input_tokens_seen": 3563392, "step": 7245 }, { "epoch": 0.9568430777352515, "grad_norm": 15.595983505249023, "learning_rate": 1.1395921871922509e-08, "loss": 0.1414, "num_input_tokens_seen": 3565824, "step": 7250 }, { "epoch": 0.9575029695129998, "grad_norm": 0.1044035479426384, "learning_rate": 1.105176989202905e-08, "loss": 0.0009, "num_input_tokens_seen": 3568256, "step": 7255 }, { "epoch": 0.9581628612907483, "grad_norm": 194.96958923339844, "learning_rate": 1.0712865645235659e-08, "loss": 0.0157, "num_input_tokens_seen": 3570752, "step": 7260 }, { "epoch": 0.9588227530684967, "grad_norm": 0.13970717787742615, "learning_rate": 1.0379210929896131e-08, "loss": 0.0805, "num_input_tokens_seen": 3572928, "step": 7265 }, { "epoch": 0.9594826448462452, "grad_norm": 26.312641143798828, "learning_rate": 1.0050807516508553e-08, "loss": 0.2674, "num_input_tokens_seen": 3575296, "step": 7270 }, { "epoch": 0.9601425366239936, "grad_norm": 0.14852392673492432, "learning_rate": 9.727657147705737e-09, "loss": 0.0011, "num_input_tokens_seen": 3577664, "step": 7275 }, { "epoch": 0.9608024284017421, "grad_norm": 0.4168717563152313, "learning_rate": 9.409761538245575e-09, "loss": 0.1992, "num_input_tokens_seen": 3580160, "step": 7280 }, { "epoch": 0.9614623201794905, "grad_norm": 0.20540349185466766, "learning_rate": 9.097122375002264e-09, "loss": 0.0761, "num_input_tokens_seen": 3582464, "step": 7285 }, { "epoch": 0.962122211957239, "grad_norm": 0.28160926699638367, "learning_rate": 8.789741316957312e-09, "loss": 0.1308, "num_input_tokens_seen": 3584896, "step": 7290 }, { "epoch": 0.9627821037349875, "grad_norm": 35.051509857177734, "learning_rate": 8.487619995190986e-09, "loss": 0.005, "num_input_tokens_seen": 3587584, "step": 7295 }, { "epoch": 0.9634419955127359, "grad_norm": 19.22887420654297, "learning_rate": 8.19076001287311e-09, "loss": 0.1393, "num_input_tokens_seen": 3590144, "step": 7300 }, { "epoch": 0.9641018872904844, "grad_norm": 0.0941128209233284, "learning_rate": 7.899162945254945e-09, "loss": 0.0012, "num_input_tokens_seen": 3592832, "step": 7305 }, { "epoch": 0.9647617790682328, "grad_norm": 35.726531982421875, "learning_rate": 7.612830339660758e-09, "loss": 0.0509, "num_input_tokens_seen": 3595456, "step": 7310 }, { "epoch": 0.9654216708459813, "grad_norm": 20.33081817626953, "learning_rate": 7.3317637154796105e-09, "loss": 0.1043, "num_input_tokens_seen": 3597888, "step": 7315 }, { "epoch": 0.9660815626237297, "grad_norm": 19.57093048095703, "learning_rate": 7.0559645641572465e-09, "loss": 0.0687, "num_input_tokens_seen": 3600384, "step": 7320 }, { "epoch": 0.9667414544014782, "grad_norm": 19.5794677734375, "learning_rate": 6.785434349188102e-09, "loss": 0.1628, "num_input_tokens_seen": 3602880, "step": 7325 }, { "epoch": 0.9674013461792266, "grad_norm": 0.20137454569339752, "learning_rate": 6.520174506107867e-09, "loss": 0.0423, "num_input_tokens_seen": 3605248, "step": 7330 }, { "epoch": 0.9680612379569751, "grad_norm": 0.1589362919330597, "learning_rate": 6.260186442485494e-09, "loss": 0.0011, "num_input_tokens_seen": 3607808, "step": 7335 }, { "epoch": 0.9687211297347235, "grad_norm": 6.267104148864746, "learning_rate": 6.005471537915863e-09, "loss": 0.1108, "num_input_tokens_seen": 3610112, "step": 7340 }, { "epoch": 0.969381021512472, "grad_norm": 6.449219226837158, "learning_rate": 5.756031144012685e-09, "loss": 0.0454, "num_input_tokens_seen": 3612352, "step": 7345 }, { "epoch": 0.9700409132902204, "grad_norm": 17.85236930847168, "learning_rate": 5.511866584400837e-09, "loss": 0.1715, "num_input_tokens_seen": 3614848, "step": 7350 }, { "epoch": 0.9707008050679689, "grad_norm": 0.49963685870170593, "learning_rate": 5.2729791547097e-09, "loss": 0.0017, "num_input_tokens_seen": 3617408, "step": 7355 }, { "epoch": 0.9713606968457174, "grad_norm": 20.02973175048828, "learning_rate": 5.039370122566389e-09, "loss": 0.0783, "num_input_tokens_seen": 3619968, "step": 7360 }, { "epoch": 0.9720205886234657, "grad_norm": 0.46782761812210083, "learning_rate": 4.811040727588755e-09, "loss": 0.0965, "num_input_tokens_seen": 3622016, "step": 7365 }, { "epoch": 0.9726804804012142, "grad_norm": 14.681106567382812, "learning_rate": 4.58799218137873e-09, "loss": 0.1156, "num_input_tokens_seen": 3624192, "step": 7370 }, { "epoch": 0.9733403721789626, "grad_norm": 0.10944530367851257, "learning_rate": 4.370225667516325e-09, "loss": 0.0009, "num_input_tokens_seen": 3626624, "step": 7375 }, { "epoch": 0.9740002639567111, "grad_norm": 10.72696304321289, "learning_rate": 4.157742341552861e-09, "loss": 0.1827, "num_input_tokens_seen": 3628928, "step": 7380 }, { "epoch": 0.9746601557344595, "grad_norm": 0.05703306198120117, "learning_rate": 3.950543331005307e-09, "loss": 0.0786, "num_input_tokens_seen": 3631552, "step": 7385 }, { "epoch": 0.975320047512208, "grad_norm": 0.22080153226852417, "learning_rate": 3.748629735349839e-09, "loss": 0.0009, "num_input_tokens_seen": 3633984, "step": 7390 }, { "epoch": 0.9759799392899564, "grad_norm": 0.05749522149562836, "learning_rate": 3.552002626016293e-09, "loss": 0.1332, "num_input_tokens_seen": 3636224, "step": 7395 }, { "epoch": 0.9766398310677049, "grad_norm": 0.09394296258687973, "learning_rate": 3.3606630463824947e-09, "loss": 0.2453, "num_input_tokens_seen": 3638656, "step": 7400 }, { "epoch": 0.9772997228454533, "grad_norm": 0.21530580520629883, "learning_rate": 3.174612011768607e-09, "loss": 0.0011, "num_input_tokens_seen": 3641408, "step": 7405 }, { "epoch": 0.9779596146232018, "grad_norm": 11.811311721801758, "learning_rate": 2.9938505094316834e-09, "loss": 0.1615, "num_input_tokens_seen": 3643840, "step": 7410 }, { "epoch": 0.9786195064009502, "grad_norm": 0.10265132784843445, "learning_rate": 2.8183794985605637e-09, "loss": 0.0006, "num_input_tokens_seen": 3646336, "step": 7415 }, { "epoch": 0.9792793981786987, "grad_norm": 0.0780910775065422, "learning_rate": 2.6481999102707654e-09, "loss": 0.0664, "num_input_tokens_seen": 3648960, "step": 7420 }, { "epoch": 0.9799392899564472, "grad_norm": 0.4066920280456543, "learning_rate": 2.4833126475994894e-09, "loss": 0.0011, "num_input_tokens_seen": 3651200, "step": 7425 }, { "epoch": 0.9805991817341956, "grad_norm": 7.960684776306152, "learning_rate": 2.3237185855008443e-09, "loss": 0.0056, "num_input_tokens_seen": 3653504, "step": 7430 }, { "epoch": 0.9812590735119441, "grad_norm": 62.566429138183594, "learning_rate": 2.1694185708414083e-09, "loss": 0.2456, "num_input_tokens_seen": 3656064, "step": 7435 }, { "epoch": 0.9819189652896925, "grad_norm": 156.15281677246094, "learning_rate": 2.0204134223952284e-09, "loss": 0.2749, "num_input_tokens_seen": 3658112, "step": 7440 }, { "epoch": 0.982578857067441, "grad_norm": 16.413667678833008, "learning_rate": 1.87670393083994e-09, "loss": 0.1727, "num_input_tokens_seen": 3660928, "step": 7445 }, { "epoch": 0.9832387488451894, "grad_norm": 23.082048416137695, "learning_rate": 1.7382908587525447e-09, "loss": 0.0298, "num_input_tokens_seen": 3663232, "step": 7450 }, { "epoch": 0.9838986406229379, "grad_norm": 0.05155172944068909, "learning_rate": 1.6051749406049697e-09, "loss": 0.0013, "num_input_tokens_seen": 3665600, "step": 7455 }, { "epoch": 0.9845585324006862, "grad_norm": 0.047289684414863586, "learning_rate": 1.4773568827607386e-09, "loss": 0.0008, "num_input_tokens_seen": 3668096, "step": 7460 }, { "epoch": 0.9852184241784347, "grad_norm": 0.13699422776699066, "learning_rate": 1.354837363470529e-09, "loss": 0.0016, "num_input_tokens_seen": 3670656, "step": 7465 }, { "epoch": 0.9858783159561831, "grad_norm": 117.30753326416016, "learning_rate": 1.23761703286962e-09, "loss": 0.1466, "num_input_tokens_seen": 3673024, "step": 7470 }, { "epoch": 0.9865382077339316, "grad_norm": 0.04031828045845032, "learning_rate": 1.1256965129730068e-09, "loss": 0.0012, "num_input_tokens_seen": 3675712, "step": 7475 }, { "epoch": 0.9871980995116801, "grad_norm": 0.14629097282886505, "learning_rate": 1.0190763976734018e-09, "loss": 0.1029, "num_input_tokens_seen": 3678080, "step": 7480 }, { "epoch": 0.9878579912894285, "grad_norm": 59.77119064331055, "learning_rate": 9.177572527375721e-10, "loss": 0.1536, "num_input_tokens_seen": 3680448, "step": 7485 }, { "epoch": 0.988517883067177, "grad_norm": 9.770659446716309, "learning_rate": 8.217396158030076e-10, "loss": 0.0019, "num_input_tokens_seen": 3682752, "step": 7490 }, { "epoch": 0.9891777748449254, "grad_norm": 77.23456573486328, "learning_rate": 7.310239963755904e-10, "loss": 0.1517, "num_input_tokens_seen": 3685376, "step": 7495 }, { "epoch": 0.9898376666226739, "grad_norm": 0.24194952845573425, "learning_rate": 6.456108758268186e-10, "loss": 0.0016, "num_input_tokens_seen": 3687744, "step": 7500 }, { "epoch": 0.9904975584004223, "grad_norm": 49.376041412353516, "learning_rate": 5.655007073909202e-10, "loss": 0.1517, "num_input_tokens_seen": 3690240, "step": 7505 }, { "epoch": 0.9911574501781708, "grad_norm": 0.04725305363535881, "learning_rate": 4.906939161627432e-10, "loss": 0.0507, "num_input_tokens_seen": 3692736, "step": 7510 }, { "epoch": 0.9918173419559192, "grad_norm": 14.56191635131836, "learning_rate": 4.2119089909542495e-10, "loss": 0.201, "num_input_tokens_seen": 3695360, "step": 7515 }, { "epoch": 0.9924772337336677, "grad_norm": 0.656075119972229, "learning_rate": 3.569920249981706e-10, "loss": 0.0593, "num_input_tokens_seen": 3697856, "step": 7520 }, { "epoch": 0.9931371255114161, "grad_norm": 1.89599609375, "learning_rate": 2.980976345344777e-10, "loss": 0.027, "num_input_tokens_seen": 3700224, "step": 7525 }, { "epoch": 0.9937970172891646, "grad_norm": 0.08695349097251892, "learning_rate": 2.445080402202482e-10, "loss": 0.0772, "num_input_tokens_seen": 3702592, "step": 7530 }, { "epoch": 0.994456909066913, "grad_norm": 0.4214935898780823, "learning_rate": 1.962235264222345e-10, "loss": 0.1564, "num_input_tokens_seen": 3704896, "step": 7535 }, { "epoch": 0.9951168008446615, "grad_norm": 4.679486274719238, "learning_rate": 1.5324434935615195e-10, "loss": 0.0446, "num_input_tokens_seen": 3707264, "step": 7540 }, { "epoch": 0.99577669262241, "grad_norm": 11.486593246459961, "learning_rate": 1.1557073708579057e-10, "loss": 0.1154, "num_input_tokens_seen": 3709824, "step": 7545 }, { "epoch": 0.9964365844001584, "grad_norm": 1.4587411880493164, "learning_rate": 8.320288952168297e-11, "loss": 0.1286, "num_input_tokens_seen": 3712192, "step": 7550 }, { "epoch": 0.9970964761779069, "grad_norm": 97.42974090576172, "learning_rate": 5.614097841988297e-11, "loss": 0.0832, "num_input_tokens_seen": 3714880, "step": 7555 }, { "epoch": 0.9977563679556553, "grad_norm": 0.4481765627861023, "learning_rate": 3.43851473808554e-11, "loss": 0.0311, "num_input_tokens_seen": 3717184, "step": 7560 }, { "epoch": 0.9984162597334038, "grad_norm": 0.09104231745004654, "learning_rate": 1.7935511849587192e-11, "loss": 0.0522, "num_input_tokens_seen": 3719424, "step": 7565 }, { "epoch": 0.9990761515111521, "grad_norm": 0.0702981948852539, "learning_rate": 6.792159113921947e-12, "loss": 0.1208, "num_input_tokens_seen": 3721920, "step": 7570 }, { "epoch": 0.9997360432889006, "grad_norm": 39.189273834228516, "learning_rate": 9.55148304560005e-13, "loss": 0.1169, "num_input_tokens_seen": 3724288, "step": 7575 }, { "epoch": 1.0, "num_input_tokens_seen": 3725120, "step": 7577, "total_flos": 2.175051626840064e+16, "train_loss": 0.12470523826255549, "train_runtime": 1215.5483, "train_samples_per_second": 49.866, "train_steps_per_second": 6.233 } ], "logging_steps": 5, "max_steps": 7577, "num_input_tokens_seen": 3725120, "num_train_epochs": 1, "save_steps": 379, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.175051626840064e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }