{ "best_global_step": 11210, "best_metric": 0.05228454992175102, "best_model_checkpoint": "saves_bts_preliminary/freeze/llama-3.2-1b-instruct/train_qnli_42_1779286680/checkpoint-11210", "epoch": 1.0, "eval_steps": 590, "global_step": 11784, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0004243041412084182, "grad_norm": 246.15960693359375, "learning_rate": 6.785411365564037e-09, "loss": 0.8772, "num_input_tokens_seen": 4992, "step": 5 }, { "epoch": 0.0008486082824168364, "grad_norm": 284.08428955078125, "learning_rate": 1.526717557251908e-08, "loss": 0.958, "num_input_tokens_seen": 9536, "step": 10 }, { "epoch": 0.0012729124236252546, "grad_norm": 248.28021240234375, "learning_rate": 2.374893977947413e-08, "loss": 0.8691, "num_input_tokens_seen": 14016, "step": 15 }, { "epoch": 0.0016972165648336728, "grad_norm": 297.8770446777344, "learning_rate": 3.223070398642917e-08, "loss": 0.8723, "num_input_tokens_seen": 19648, "step": 20 }, { "epoch": 0.002121520706042091, "grad_norm": 267.6341552734375, "learning_rate": 4.0712468193384224e-08, "loss": 0.8359, "num_input_tokens_seen": 24768, "step": 25 }, { "epoch": 0.0025458248472505093, "grad_norm": 284.44561767578125, "learning_rate": 4.919423240033927e-08, "loss": 0.8734, "num_input_tokens_seen": 29952, "step": 30 }, { "epoch": 0.0029701289884589274, "grad_norm": 267.7585144042969, "learning_rate": 5.767599660729432e-08, "loss": 0.8972, "num_input_tokens_seen": 34304, "step": 35 }, { "epoch": 0.0033944331296673455, "grad_norm": 300.0269470214844, "learning_rate": 6.615776081424935e-08, "loss": 0.8139, "num_input_tokens_seen": 39360, "step": 40 }, { "epoch": 0.0038187372708757637, "grad_norm": 247.20384216308594, "learning_rate": 7.463952502120441e-08, "loss": 0.8369, "num_input_tokens_seen": 44480, "step": 45 }, { "epoch": 0.004243041412084182, "grad_norm": 194.6239013671875, "learning_rate": 8.312128922815945e-08, "loss": 0.6877, "num_input_tokens_seen": 49152, "step": 50 }, { "epoch": 0.0046673455532926, "grad_norm": 150.263427734375, "learning_rate": 9.16030534351145e-08, "loss": 0.5185, "num_input_tokens_seen": 53696, "step": 55 }, { "epoch": 0.0050916496945010185, "grad_norm": 151.96859741210938, "learning_rate": 1.0008481764206955e-07, "loss": 0.4927, "num_input_tokens_seen": 58560, "step": 60 }, { "epoch": 0.005515953835709436, "grad_norm": 108.88196563720703, "learning_rate": 1.085665818490246e-07, "loss": 0.4352, "num_input_tokens_seen": 63808, "step": 65 }, { "epoch": 0.005940257976917855, "grad_norm": 31.13915252685547, "learning_rate": 1.1704834605597964e-07, "loss": 0.3313, "num_input_tokens_seen": 68096, "step": 70 }, { "epoch": 0.006364562118126273, "grad_norm": 19.339509963989258, "learning_rate": 1.2553011026293469e-07, "loss": 0.2376, "num_input_tokens_seen": 72448, "step": 75 }, { "epoch": 0.006788866259334691, "grad_norm": 21.34341812133789, "learning_rate": 1.3401187446988974e-07, "loss": 0.2135, "num_input_tokens_seen": 78336, "step": 80 }, { "epoch": 0.00721317040054311, "grad_norm": 28.049270629882812, "learning_rate": 1.4249363867684477e-07, "loss": 0.165, "num_input_tokens_seen": 83072, "step": 85 }, { "epoch": 0.007637474541751527, "grad_norm": 16.34543228149414, "learning_rate": 1.509754028837998e-07, "loss": 0.1788, "num_input_tokens_seen": 88128, "step": 90 }, { "epoch": 0.008061778682959946, "grad_norm": 58.355735778808594, "learning_rate": 1.594571670907549e-07, "loss": 0.1682, "num_input_tokens_seen": 92992, "step": 95 }, { "epoch": 0.008486082824168364, "grad_norm": 32.8367805480957, "learning_rate": 1.6793893129770992e-07, "loss": 0.16, "num_input_tokens_seen": 98112, "step": 100 }, { "epoch": 0.008910386965376781, "grad_norm": 23.30171775817871, "learning_rate": 1.7642069550466495e-07, "loss": 0.1406, "num_input_tokens_seen": 102720, "step": 105 }, { "epoch": 0.0093346911065852, "grad_norm": 10.516228675842285, "learning_rate": 1.8490245971162e-07, "loss": 0.156, "num_input_tokens_seen": 107520, "step": 110 }, { "epoch": 0.009758995247793618, "grad_norm": 16.174560546875, "learning_rate": 1.9338422391857507e-07, "loss": 0.17, "num_input_tokens_seen": 112064, "step": 115 }, { "epoch": 0.010183299389002037, "grad_norm": 11.447249412536621, "learning_rate": 2.018659881255301e-07, "loss": 0.1468, "num_input_tokens_seen": 117184, "step": 120 }, { "epoch": 0.010607603530210456, "grad_norm": 21.983373641967773, "learning_rate": 2.1034775233248513e-07, "loss": 0.1524, "num_input_tokens_seen": 121792, "step": 125 }, { "epoch": 0.011031907671418872, "grad_norm": 19.451534271240234, "learning_rate": 2.188295165394402e-07, "loss": 0.1493, "num_input_tokens_seen": 126272, "step": 130 }, { "epoch": 0.011456211812627291, "grad_norm": 44.81078338623047, "learning_rate": 2.2731128074639524e-07, "loss": 0.1492, "num_input_tokens_seen": 130880, "step": 135 }, { "epoch": 0.01188051595383571, "grad_norm": 17.800430297851562, "learning_rate": 2.3579304495335027e-07, "loss": 0.152, "num_input_tokens_seen": 135552, "step": 140 }, { "epoch": 0.012304820095044128, "grad_norm": 19.99517250061035, "learning_rate": 2.442748091603053e-07, "loss": 0.1422, "num_input_tokens_seen": 140928, "step": 145 }, { "epoch": 0.012729124236252547, "grad_norm": 43.71803665161133, "learning_rate": 2.5275657336726036e-07, "loss": 0.1357, "num_input_tokens_seen": 146176, "step": 150 }, { "epoch": 0.013153428377460964, "grad_norm": 21.6224365234375, "learning_rate": 2.612383375742154e-07, "loss": 0.1692, "num_input_tokens_seen": 151680, "step": 155 }, { "epoch": 0.013577732518669382, "grad_norm": 54.11295700073242, "learning_rate": 2.697201017811705e-07, "loss": 0.1577, "num_input_tokens_seen": 156480, "step": 160 }, { "epoch": 0.0140020366598778, "grad_norm": 92.01202392578125, "learning_rate": 2.782018659881255e-07, "loss": 0.1464, "num_input_tokens_seen": 161024, "step": 165 }, { "epoch": 0.01442634080108622, "grad_norm": 38.31109619140625, "learning_rate": 2.866836301950806e-07, "loss": 0.1223, "num_input_tokens_seen": 165760, "step": 170 }, { "epoch": 0.014850644942294636, "grad_norm": 16.718843460083008, "learning_rate": 2.951653944020356e-07, "loss": 0.1156, "num_input_tokens_seen": 169984, "step": 175 }, { "epoch": 0.015274949083503055, "grad_norm": 15.30700397491455, "learning_rate": 3.0364715860899065e-07, "loss": 0.1584, "num_input_tokens_seen": 174528, "step": 180 }, { "epoch": 0.01569925322471147, "grad_norm": 77.4272689819336, "learning_rate": 3.121289228159457e-07, "loss": 0.1117, "num_input_tokens_seen": 179136, "step": 185 }, { "epoch": 0.016123557365919892, "grad_norm": 33.88642883300781, "learning_rate": 3.206106870229007e-07, "loss": 0.1283, "num_input_tokens_seen": 183424, "step": 190 }, { "epoch": 0.01654786150712831, "grad_norm": 12.441794395446777, "learning_rate": 3.2909245122985577e-07, "loss": 0.0957, "num_input_tokens_seen": 187968, "step": 195 }, { "epoch": 0.01697216564833673, "grad_norm": 23.63111686706543, "learning_rate": 3.375742154368109e-07, "loss": 0.1221, "num_input_tokens_seen": 193152, "step": 200 }, { "epoch": 0.017396469789545146, "grad_norm": 48.66728210449219, "learning_rate": 3.460559796437659e-07, "loss": 0.1305, "num_input_tokens_seen": 197632, "step": 205 }, { "epoch": 0.017820773930753563, "grad_norm": 22.497852325439453, "learning_rate": 3.5453774385072094e-07, "loss": 0.0971, "num_input_tokens_seen": 202304, "step": 210 }, { "epoch": 0.018245078071961983, "grad_norm": 47.47602844238281, "learning_rate": 3.63019508057676e-07, "loss": 0.1392, "num_input_tokens_seen": 207040, "step": 215 }, { "epoch": 0.0186693822131704, "grad_norm": 27.37762451171875, "learning_rate": 3.71501272264631e-07, "loss": 0.1071, "num_input_tokens_seen": 212480, "step": 220 }, { "epoch": 0.01909368635437882, "grad_norm": 73.55374908447266, "learning_rate": 3.7998303647158606e-07, "loss": 0.1398, "num_input_tokens_seen": 217728, "step": 225 }, { "epoch": 0.019517990495587237, "grad_norm": 31.551063537597656, "learning_rate": 3.8846480067854107e-07, "loss": 0.0853, "num_input_tokens_seen": 221888, "step": 230 }, { "epoch": 0.019942294636795654, "grad_norm": 9.147037506103516, "learning_rate": 3.969465648854962e-07, "loss": 0.0764, "num_input_tokens_seen": 226496, "step": 235 }, { "epoch": 0.020366598778004074, "grad_norm": 24.393016815185547, "learning_rate": 4.0542832909245124e-07, "loss": 0.1296, "num_input_tokens_seen": 230720, "step": 240 }, { "epoch": 0.02079090291921249, "grad_norm": 126.87380981445312, "learning_rate": 4.1391009329940624e-07, "loss": 0.2537, "num_input_tokens_seen": 235584, "step": 245 }, { "epoch": 0.02121520706042091, "grad_norm": 12.414925575256348, "learning_rate": 4.223918575063613e-07, "loss": 0.0764, "num_input_tokens_seen": 241088, "step": 250 }, { "epoch": 0.021639511201629328, "grad_norm": 18.61178970336914, "learning_rate": 4.3087362171331635e-07, "loss": 0.0627, "num_input_tokens_seen": 245568, "step": 255 }, { "epoch": 0.022063815342837745, "grad_norm": 52.55159378051758, "learning_rate": 4.3935538592027136e-07, "loss": 0.0585, "num_input_tokens_seen": 250304, "step": 260 }, { "epoch": 0.022488119484046165, "grad_norm": 39.259368896484375, "learning_rate": 4.4783715012722647e-07, "loss": 0.0933, "num_input_tokens_seen": 255232, "step": 265 }, { "epoch": 0.022912423625254582, "grad_norm": 5.972058296203613, "learning_rate": 4.5631891433418153e-07, "loss": 0.0946, "num_input_tokens_seen": 259840, "step": 270 }, { "epoch": 0.023336727766463002, "grad_norm": 27.744028091430664, "learning_rate": 4.6480067854113653e-07, "loss": 0.081, "num_input_tokens_seen": 264768, "step": 275 }, { "epoch": 0.02376103190767142, "grad_norm": 48.95866394042969, "learning_rate": 4.732824427480916e-07, "loss": 0.1051, "num_input_tokens_seen": 270016, "step": 280 }, { "epoch": 0.024185336048879836, "grad_norm": 29.231962203979492, "learning_rate": 4.817642069550466e-07, "loss": 0.0954, "num_input_tokens_seen": 274496, "step": 285 }, { "epoch": 0.024609640190088256, "grad_norm": 21.682893753051758, "learning_rate": 4.902459711620017e-07, "loss": 0.0791, "num_input_tokens_seen": 279296, "step": 290 }, { "epoch": 0.025033944331296673, "grad_norm": 66.02873229980469, "learning_rate": 4.987277353689568e-07, "loss": 0.077, "num_input_tokens_seen": 284288, "step": 295 }, { "epoch": 0.025458248472505093, "grad_norm": 30.446861267089844, "learning_rate": 5.072094995759117e-07, "loss": 0.1016, "num_input_tokens_seen": 289088, "step": 300 }, { "epoch": 0.02588255261371351, "grad_norm": 34.33888244628906, "learning_rate": 5.156912637828668e-07, "loss": 0.1201, "num_input_tokens_seen": 293632, "step": 305 }, { "epoch": 0.026306856754921927, "grad_norm": 15.951445579528809, "learning_rate": 5.241730279898219e-07, "loss": 0.0812, "num_input_tokens_seen": 298176, "step": 310 }, { "epoch": 0.026731160896130347, "grad_norm": 34.837608337402344, "learning_rate": 5.326547921967769e-07, "loss": 0.1464, "num_input_tokens_seen": 302720, "step": 315 }, { "epoch": 0.027155465037338764, "grad_norm": 66.30300903320312, "learning_rate": 5.411365564037319e-07, "loss": 0.1395, "num_input_tokens_seen": 307648, "step": 320 }, { "epoch": 0.02757976917854718, "grad_norm": 10.806644439697266, "learning_rate": 5.49618320610687e-07, "loss": 0.0854, "num_input_tokens_seen": 312832, "step": 325 }, { "epoch": 0.0280040733197556, "grad_norm": 45.027854919433594, "learning_rate": 5.581000848176421e-07, "loss": 0.0932, "num_input_tokens_seen": 317376, "step": 330 }, { "epoch": 0.028428377460964018, "grad_norm": 16.889318466186523, "learning_rate": 5.66581849024597e-07, "loss": 0.0609, "num_input_tokens_seen": 322560, "step": 335 }, { "epoch": 0.02885268160217244, "grad_norm": 28.992074966430664, "learning_rate": 5.750636132315522e-07, "loss": 0.0641, "num_input_tokens_seen": 327104, "step": 340 }, { "epoch": 0.029276985743380855, "grad_norm": 32.16653060913086, "learning_rate": 5.835453774385072e-07, "loss": 0.0712, "num_input_tokens_seen": 332160, "step": 345 }, { "epoch": 0.029701289884589272, "grad_norm": 64.0754623413086, "learning_rate": 5.920271416454622e-07, "loss": 0.0986, "num_input_tokens_seen": 336960, "step": 350 }, { "epoch": 0.030125594025797692, "grad_norm": 11.283415794372559, "learning_rate": 6.005089058524173e-07, "loss": 0.1444, "num_input_tokens_seen": 341696, "step": 355 }, { "epoch": 0.03054989816700611, "grad_norm": 47.3290901184082, "learning_rate": 6.089906700593723e-07, "loss": 0.1736, "num_input_tokens_seen": 347008, "step": 360 }, { "epoch": 0.03097420230821453, "grad_norm": 44.4918098449707, "learning_rate": 6.174724342663274e-07, "loss": 0.1123, "num_input_tokens_seen": 352256, "step": 365 }, { "epoch": 0.03139850644942294, "grad_norm": 13.7459135055542, "learning_rate": 6.259541984732824e-07, "loss": 0.0541, "num_input_tokens_seen": 357312, "step": 370 }, { "epoch": 0.03182281059063136, "grad_norm": 44.1005973815918, "learning_rate": 6.344359626802375e-07, "loss": 0.151, "num_input_tokens_seen": 361728, "step": 375 }, { "epoch": 0.032247114731839784, "grad_norm": 34.00252914428711, "learning_rate": 6.429177268871925e-07, "loss": 0.1249, "num_input_tokens_seen": 366592, "step": 380 }, { "epoch": 0.032671418873048204, "grad_norm": 41.16194152832031, "learning_rate": 6.513994910941476e-07, "loss": 0.2083, "num_input_tokens_seen": 371392, "step": 385 }, { "epoch": 0.03309572301425662, "grad_norm": 16.21058464050293, "learning_rate": 6.598812553011026e-07, "loss": 0.0916, "num_input_tokens_seen": 376640, "step": 390 }, { "epoch": 0.03352002715546504, "grad_norm": 33.83322525024414, "learning_rate": 6.683630195080576e-07, "loss": 0.1429, "num_input_tokens_seen": 381504, "step": 395 }, { "epoch": 0.03394433129667346, "grad_norm": 39.9620475769043, "learning_rate": 6.768447837150128e-07, "loss": 0.1211, "num_input_tokens_seen": 385920, "step": 400 }, { "epoch": 0.03436863543788187, "grad_norm": 47.01149368286133, "learning_rate": 6.853265479219677e-07, "loss": 0.1086, "num_input_tokens_seen": 390272, "step": 405 }, { "epoch": 0.03479293957909029, "grad_norm": 10.420038223266602, "learning_rate": 6.938083121289228e-07, "loss": 0.1208, "num_input_tokens_seen": 396160, "step": 410 }, { "epoch": 0.03521724372029871, "grad_norm": 10.787165641784668, "learning_rate": 7.022900763358778e-07, "loss": 0.0872, "num_input_tokens_seen": 400768, "step": 415 }, { "epoch": 0.035641547861507125, "grad_norm": 12.446135520935059, "learning_rate": 7.107718405428329e-07, "loss": 0.0613, "num_input_tokens_seen": 405504, "step": 420 }, { "epoch": 0.036065852002715545, "grad_norm": 24.936521530151367, "learning_rate": 7.192536047497879e-07, "loss": 0.0373, "num_input_tokens_seen": 410176, "step": 425 }, { "epoch": 0.036490156143923966, "grad_norm": 6.019582748413086, "learning_rate": 7.277353689567429e-07, "loss": 0.0669, "num_input_tokens_seen": 415040, "step": 430 }, { "epoch": 0.036914460285132386, "grad_norm": 63.409305572509766, "learning_rate": 7.36217133163698e-07, "loss": 0.1154, "num_input_tokens_seen": 419968, "step": 435 }, { "epoch": 0.0373387644263408, "grad_norm": 75.0130615234375, "learning_rate": 7.446988973706531e-07, "loss": 0.2025, "num_input_tokens_seen": 424832, "step": 440 }, { "epoch": 0.03776306856754922, "grad_norm": 48.14998245239258, "learning_rate": 7.531806615776081e-07, "loss": 0.3266, "num_input_tokens_seen": 432064, "step": 445 }, { "epoch": 0.03818737270875764, "grad_norm": 28.366230010986328, "learning_rate": 7.616624257845632e-07, "loss": 0.0826, "num_input_tokens_seen": 437184, "step": 450 }, { "epoch": 0.03861167684996605, "grad_norm": 21.336973190307617, "learning_rate": 7.701441899915182e-07, "loss": 0.102, "num_input_tokens_seen": 442432, "step": 455 }, { "epoch": 0.039035980991174474, "grad_norm": 26.27196502685547, "learning_rate": 7.786259541984732e-07, "loss": 0.0454, "num_input_tokens_seen": 447040, "step": 460 }, { "epoch": 0.039460285132382894, "grad_norm": 53.3596305847168, "learning_rate": 7.871077184054283e-07, "loss": 0.1319, "num_input_tokens_seen": 452032, "step": 465 }, { "epoch": 0.03988458927359131, "grad_norm": 22.736515045166016, "learning_rate": 7.955894826123833e-07, "loss": 0.0636, "num_input_tokens_seen": 457344, "step": 470 }, { "epoch": 0.04030889341479973, "grad_norm": 26.367496490478516, "learning_rate": 8.040712468193384e-07, "loss": 0.1349, "num_input_tokens_seen": 462336, "step": 475 }, { "epoch": 0.04073319755600815, "grad_norm": 22.535175323486328, "learning_rate": 8.125530110262935e-07, "loss": 0.0904, "num_input_tokens_seen": 467072, "step": 480 }, { "epoch": 0.04115750169721656, "grad_norm": 23.10480308532715, "learning_rate": 8.210347752332485e-07, "loss": 0.0983, "num_input_tokens_seen": 471616, "step": 485 }, { "epoch": 0.04158180583842498, "grad_norm": 26.855924606323242, "learning_rate": 8.295165394402035e-07, "loss": 0.0815, "num_input_tokens_seen": 476480, "step": 490 }, { "epoch": 0.0420061099796334, "grad_norm": 27.23138999938965, "learning_rate": 8.379983036471586e-07, "loss": 0.0929, "num_input_tokens_seen": 481536, "step": 495 }, { "epoch": 0.04243041412084182, "grad_norm": 14.400825500488281, "learning_rate": 8.464800678541136e-07, "loss": 0.1216, "num_input_tokens_seen": 486336, "step": 500 }, { "epoch": 0.042854718262050236, "grad_norm": 36.2694091796875, "learning_rate": 8.549618320610686e-07, "loss": 0.1143, "num_input_tokens_seen": 490944, "step": 505 }, { "epoch": 0.043279022403258656, "grad_norm": 7.930865287780762, "learning_rate": 8.634435962680237e-07, "loss": 0.0585, "num_input_tokens_seen": 495488, "step": 510 }, { "epoch": 0.043703326544467076, "grad_norm": 17.148012161254883, "learning_rate": 8.719253604749788e-07, "loss": 0.1, "num_input_tokens_seen": 499840, "step": 515 }, { "epoch": 0.04412763068567549, "grad_norm": 24.2634334564209, "learning_rate": 8.804071246819338e-07, "loss": 0.0818, "num_input_tokens_seen": 504512, "step": 520 }, { "epoch": 0.04455193482688391, "grad_norm": 27.504880905151367, "learning_rate": 8.888888888888888e-07, "loss": 0.0705, "num_input_tokens_seen": 509376, "step": 525 }, { "epoch": 0.04497623896809233, "grad_norm": 29.78805923461914, "learning_rate": 8.973706530958439e-07, "loss": 0.07, "num_input_tokens_seen": 513856, "step": 530 }, { "epoch": 0.045400543109300744, "grad_norm": 31.55817413330078, "learning_rate": 9.058524173027989e-07, "loss": 0.107, "num_input_tokens_seen": 518976, "step": 535 }, { "epoch": 0.045824847250509164, "grad_norm": 68.85687255859375, "learning_rate": 9.143341815097539e-07, "loss": 0.0958, "num_input_tokens_seen": 524160, "step": 540 }, { "epoch": 0.046249151391717584, "grad_norm": 32.40176773071289, "learning_rate": 9.228159457167091e-07, "loss": 0.1709, "num_input_tokens_seen": 529152, "step": 545 }, { "epoch": 0.046673455532926005, "grad_norm": 67.73282623291016, "learning_rate": 9.312977099236641e-07, "loss": 0.1825, "num_input_tokens_seen": 533824, "step": 550 }, { "epoch": 0.04709775967413442, "grad_norm": 13.536025047302246, "learning_rate": 9.397794741306191e-07, "loss": 0.0982, "num_input_tokens_seen": 538560, "step": 555 }, { "epoch": 0.04752206381534284, "grad_norm": 20.389511108398438, "learning_rate": 9.482612383375742e-07, "loss": 0.1049, "num_input_tokens_seen": 542784, "step": 560 }, { "epoch": 0.04794636795655126, "grad_norm": 13.790285110473633, "learning_rate": 9.567430025445291e-07, "loss": 0.1076, "num_input_tokens_seen": 547840, "step": 565 }, { "epoch": 0.04837067209775967, "grad_norm": 9.404131889343262, "learning_rate": 9.652247667514842e-07, "loss": 0.0785, "num_input_tokens_seen": 552448, "step": 570 }, { "epoch": 0.04879497623896809, "grad_norm": 11.799363136291504, "learning_rate": 9.737065309584394e-07, "loss": 0.1165, "num_input_tokens_seen": 557120, "step": 575 }, { "epoch": 0.04921928038017651, "grad_norm": 12.816581726074219, "learning_rate": 9.821882951653943e-07, "loss": 0.1119, "num_input_tokens_seen": 561536, "step": 580 }, { "epoch": 0.049643584521384926, "grad_norm": 16.07106590270996, "learning_rate": 9.906700593723493e-07, "loss": 0.0704, "num_input_tokens_seen": 566336, "step": 585 }, { "epoch": 0.050067888662593346, "grad_norm": 11.079269409179688, "learning_rate": 9.991518235793044e-07, "loss": 0.0929, "num_input_tokens_seen": 571072, "step": 590 }, { "epoch": 0.050067888662593346, "eval_loss": 0.08071617037057877, "eval_runtime": 15.6888, "eval_samples_per_second": 667.674, "eval_steps_per_second": 83.499, "num_input_tokens_seen": 571072, "step": 590 }, { "epoch": 0.050492192803801766, "grad_norm": 21.24359893798828, "learning_rate": 1.0076335877862595e-06, "loss": 0.0317, "num_input_tokens_seen": 576192, "step": 595 }, { "epoch": 0.05091649694501019, "grad_norm": 38.84645462036133, "learning_rate": 1.0161153519932147e-06, "loss": 0.1199, "num_input_tokens_seen": 580928, "step": 600 }, { "epoch": 0.0513408010862186, "grad_norm": 16.443923950195312, "learning_rate": 1.0245971162001696e-06, "loss": 0.1215, "num_input_tokens_seen": 585728, "step": 605 }, { "epoch": 0.05176510522742702, "grad_norm": 51.92510223388672, "learning_rate": 1.0330788804071246e-06, "loss": 0.0926, "num_input_tokens_seen": 591040, "step": 610 }, { "epoch": 0.05218940936863544, "grad_norm": 34.422847747802734, "learning_rate": 1.0415606446140797e-06, "loss": 0.1686, "num_input_tokens_seen": 595584, "step": 615 }, { "epoch": 0.052613713509843854, "grad_norm": 28.12080192565918, "learning_rate": 1.0500424088210348e-06, "loss": 0.1433, "num_input_tokens_seen": 600384, "step": 620 }, { "epoch": 0.053038017651052274, "grad_norm": 17.771556854248047, "learning_rate": 1.0585241730279896e-06, "loss": 0.058, "num_input_tokens_seen": 605248, "step": 625 }, { "epoch": 0.053462321792260695, "grad_norm": 7.092148303985596, "learning_rate": 1.0670059372349449e-06, "loss": 0.0805, "num_input_tokens_seen": 609856, "step": 630 }, { "epoch": 0.05388662593346911, "grad_norm": 11.3302001953125, "learning_rate": 1.0754877014419e-06, "loss": 0.0443, "num_input_tokens_seen": 614976, "step": 635 }, { "epoch": 0.05431093007467753, "grad_norm": 28.04367446899414, "learning_rate": 1.083969465648855e-06, "loss": 0.0862, "num_input_tokens_seen": 619648, "step": 640 }, { "epoch": 0.05473523421588595, "grad_norm": 25.472623825073242, "learning_rate": 1.09245122985581e-06, "loss": 0.0478, "num_input_tokens_seen": 624896, "step": 645 }, { "epoch": 0.05515953835709436, "grad_norm": 16.80830192565918, "learning_rate": 1.1009329940627649e-06, "loss": 0.1452, "num_input_tokens_seen": 629632, "step": 650 }, { "epoch": 0.05558384249830278, "grad_norm": 21.809473037719727, "learning_rate": 1.10941475826972e-06, "loss": 0.0974, "num_input_tokens_seen": 634624, "step": 655 }, { "epoch": 0.0560081466395112, "grad_norm": 25.004167556762695, "learning_rate": 1.1178965224766752e-06, "loss": 0.0974, "num_input_tokens_seen": 639360, "step": 660 }, { "epoch": 0.05643245078071962, "grad_norm": 22.970441818237305, "learning_rate": 1.1263782866836303e-06, "loss": 0.038, "num_input_tokens_seen": 644032, "step": 665 }, { "epoch": 0.056856754921928036, "grad_norm": 17.413970947265625, "learning_rate": 1.1348600508905853e-06, "loss": 0.0969, "num_input_tokens_seen": 648256, "step": 670 }, { "epoch": 0.05728105906313646, "grad_norm": 28.037620544433594, "learning_rate": 1.1433418150975402e-06, "loss": 0.1091, "num_input_tokens_seen": 653056, "step": 675 }, { "epoch": 0.05770536320434488, "grad_norm": 36.98174285888672, "learning_rate": 1.1518235793044952e-06, "loss": 0.0743, "num_input_tokens_seen": 657664, "step": 680 }, { "epoch": 0.05812966734555329, "grad_norm": 27.782451629638672, "learning_rate": 1.1603053435114503e-06, "loss": 0.0379, "num_input_tokens_seen": 662016, "step": 685 }, { "epoch": 0.05855397148676171, "grad_norm": 6.267802715301514, "learning_rate": 1.1687871077184053e-06, "loss": 0.1067, "num_input_tokens_seen": 666368, "step": 690 }, { "epoch": 0.05897827562797013, "grad_norm": 27.76127815246582, "learning_rate": 1.1772688719253606e-06, "loss": 0.1044, "num_input_tokens_seen": 671616, "step": 695 }, { "epoch": 0.059402579769178544, "grad_norm": 1.9939544200897217, "learning_rate": 1.1857506361323155e-06, "loss": 0.0405, "num_input_tokens_seen": 676288, "step": 700 }, { "epoch": 0.059826883910386965, "grad_norm": 5.708540916442871, "learning_rate": 1.1942324003392705e-06, "loss": 0.0789, "num_input_tokens_seen": 680960, "step": 705 }, { "epoch": 0.060251188051595385, "grad_norm": 18.987085342407227, "learning_rate": 1.2027141645462256e-06, "loss": 0.0686, "num_input_tokens_seen": 685440, "step": 710 }, { "epoch": 0.060675492192803805, "grad_norm": 29.056886672973633, "learning_rate": 1.2111959287531806e-06, "loss": 0.1027, "num_input_tokens_seen": 690304, "step": 715 }, { "epoch": 0.06109979633401222, "grad_norm": 24.722980499267578, "learning_rate": 1.2196776929601355e-06, "loss": 0.0782, "num_input_tokens_seen": 695040, "step": 720 }, { "epoch": 0.06152410047522064, "grad_norm": 12.598089218139648, "learning_rate": 1.2281594571670907e-06, "loss": 0.0291, "num_input_tokens_seen": 699456, "step": 725 }, { "epoch": 0.06194840461642906, "grad_norm": 36.17939758300781, "learning_rate": 1.2366412213740458e-06, "loss": 0.1563, "num_input_tokens_seen": 704064, "step": 730 }, { "epoch": 0.06237270875763747, "grad_norm": 3.6635186672210693, "learning_rate": 1.2451229855810009e-06, "loss": 0.0311, "num_input_tokens_seen": 708544, "step": 735 }, { "epoch": 0.06279701289884589, "grad_norm": 36.30998611450195, "learning_rate": 1.253604749787956e-06, "loss": 0.2106, "num_input_tokens_seen": 713088, "step": 740 }, { "epoch": 0.0632213170400543, "grad_norm": 19.025728225708008, "learning_rate": 1.2620865139949108e-06, "loss": 0.1061, "num_input_tokens_seen": 718016, "step": 745 }, { "epoch": 0.06364562118126273, "grad_norm": 19.909170150756836, "learning_rate": 1.2705682782018658e-06, "loss": 0.042, "num_input_tokens_seen": 722752, "step": 750 }, { "epoch": 0.06406992532247115, "grad_norm": 19.725980758666992, "learning_rate": 1.279050042408821e-06, "loss": 0.0744, "num_input_tokens_seen": 727872, "step": 755 }, { "epoch": 0.06449422946367957, "grad_norm": 21.742664337158203, "learning_rate": 1.2875318066157761e-06, "loss": 0.1042, "num_input_tokens_seen": 732480, "step": 760 }, { "epoch": 0.06491853360488799, "grad_norm": 4.472809791564941, "learning_rate": 1.2960135708227312e-06, "loss": 0.0475, "num_input_tokens_seen": 736704, "step": 765 }, { "epoch": 0.06534283774609641, "grad_norm": 8.560938835144043, "learning_rate": 1.304495335029686e-06, "loss": 0.0575, "num_input_tokens_seen": 742848, "step": 770 }, { "epoch": 0.06576714188730481, "grad_norm": 4.725236892700195, "learning_rate": 1.3129770992366411e-06, "loss": 0.1154, "num_input_tokens_seen": 747072, "step": 775 }, { "epoch": 0.06619144602851323, "grad_norm": 1.102369785308838, "learning_rate": 1.3214588634435962e-06, "loss": 0.0311, "num_input_tokens_seen": 751808, "step": 780 }, { "epoch": 0.06661575016972165, "grad_norm": 34.201229095458984, "learning_rate": 1.3299406276505512e-06, "loss": 0.0878, "num_input_tokens_seen": 756800, "step": 785 }, { "epoch": 0.06704005431093008, "grad_norm": 27.980588912963867, "learning_rate": 1.3384223918575063e-06, "loss": 0.1431, "num_input_tokens_seen": 761536, "step": 790 }, { "epoch": 0.0674643584521385, "grad_norm": 28.48628807067871, "learning_rate": 1.3469041560644613e-06, "loss": 0.1062, "num_input_tokens_seen": 765824, "step": 795 }, { "epoch": 0.06788866259334692, "grad_norm": 2.4105472564697266, "learning_rate": 1.3553859202714164e-06, "loss": 0.3519, "num_input_tokens_seen": 770240, "step": 800 }, { "epoch": 0.06831296673455534, "grad_norm": 2.4612503051757812, "learning_rate": 1.3638676844783715e-06, "loss": 0.1262, "num_input_tokens_seen": 775424, "step": 805 }, { "epoch": 0.06873727087576374, "grad_norm": 11.974044799804688, "learning_rate": 1.3723494486853265e-06, "loss": 0.061, "num_input_tokens_seen": 779904, "step": 810 }, { "epoch": 0.06916157501697216, "grad_norm": 11.464366912841797, "learning_rate": 1.3808312128922814e-06, "loss": 0.0747, "num_input_tokens_seen": 784512, "step": 815 }, { "epoch": 0.06958587915818058, "grad_norm": 20.74694061279297, "learning_rate": 1.3893129770992366e-06, "loss": 0.078, "num_input_tokens_seen": 789440, "step": 820 }, { "epoch": 0.070010183299389, "grad_norm": 20.158458709716797, "learning_rate": 1.3977947413061917e-06, "loss": 0.1098, "num_input_tokens_seen": 794240, "step": 825 }, { "epoch": 0.07043448744059742, "grad_norm": 8.966534614562988, "learning_rate": 1.4062765055131467e-06, "loss": 0.139, "num_input_tokens_seen": 798592, "step": 830 }, { "epoch": 0.07085879158180584, "grad_norm": 31.50881004333496, "learning_rate": 1.4147582697201018e-06, "loss": 0.1213, "num_input_tokens_seen": 803776, "step": 835 }, { "epoch": 0.07128309572301425, "grad_norm": 6.87647008895874, "learning_rate": 1.4232400339270566e-06, "loss": 0.0552, "num_input_tokens_seen": 809088, "step": 840 }, { "epoch": 0.07170739986422267, "grad_norm": 15.782602310180664, "learning_rate": 1.4317217981340117e-06, "loss": 0.0523, "num_input_tokens_seen": 813824, "step": 845 }, { "epoch": 0.07213170400543109, "grad_norm": 5.775845527648926, "learning_rate": 1.440203562340967e-06, "loss": 0.0617, "num_input_tokens_seen": 818240, "step": 850 }, { "epoch": 0.07255600814663951, "grad_norm": 14.254950523376465, "learning_rate": 1.448685326547922e-06, "loss": 0.1225, "num_input_tokens_seen": 822400, "step": 855 }, { "epoch": 0.07298031228784793, "grad_norm": 38.01359558105469, "learning_rate": 1.457167090754877e-06, "loss": 0.1247, "num_input_tokens_seen": 827328, "step": 860 }, { "epoch": 0.07340461642905635, "grad_norm": 15.854536056518555, "learning_rate": 1.465648854961832e-06, "loss": 0.1375, "num_input_tokens_seen": 832320, "step": 865 }, { "epoch": 0.07382892057026477, "grad_norm": 21.019590377807617, "learning_rate": 1.474130619168787e-06, "loss": 0.0976, "num_input_tokens_seen": 837376, "step": 870 }, { "epoch": 0.07425322471147318, "grad_norm": 15.762535095214844, "learning_rate": 1.482612383375742e-06, "loss": 0.0646, "num_input_tokens_seen": 843008, "step": 875 }, { "epoch": 0.0746775288526816, "grad_norm": 11.819944381713867, "learning_rate": 1.491094147582697e-06, "loss": 0.1057, "num_input_tokens_seen": 847680, "step": 880 }, { "epoch": 0.07510183299389002, "grad_norm": 11.814828872680664, "learning_rate": 1.4995759117896522e-06, "loss": 0.0703, "num_input_tokens_seen": 852288, "step": 885 }, { "epoch": 0.07552613713509844, "grad_norm": 19.450414657592773, "learning_rate": 1.5080576759966072e-06, "loss": 0.0832, "num_input_tokens_seen": 857280, "step": 890 }, { "epoch": 0.07595044127630686, "grad_norm": 15.626432418823242, "learning_rate": 1.5165394402035623e-06, "loss": 0.0415, "num_input_tokens_seen": 862080, "step": 895 }, { "epoch": 0.07637474541751528, "grad_norm": 20.097511291503906, "learning_rate": 1.5250212044105173e-06, "loss": 0.1667, "num_input_tokens_seen": 866624, "step": 900 }, { "epoch": 0.07679904955872369, "grad_norm": 18.306550979614258, "learning_rate": 1.5335029686174724e-06, "loss": 0.0798, "num_input_tokens_seen": 871360, "step": 905 }, { "epoch": 0.0772233536999321, "grad_norm": 20.8496150970459, "learning_rate": 1.5419847328244272e-06, "loss": 0.1418, "num_input_tokens_seen": 877632, "step": 910 }, { "epoch": 0.07764765784114053, "grad_norm": 8.582545280456543, "learning_rate": 1.5504664970313825e-06, "loss": 0.0659, "num_input_tokens_seen": 881600, "step": 915 }, { "epoch": 0.07807196198234895, "grad_norm": 14.078996658325195, "learning_rate": 1.5589482612383376e-06, "loss": 0.0591, "num_input_tokens_seen": 886400, "step": 920 }, { "epoch": 0.07849626612355737, "grad_norm": 16.29892349243164, "learning_rate": 1.5674300254452926e-06, "loss": 0.0906, "num_input_tokens_seen": 890880, "step": 925 }, { "epoch": 0.07892057026476579, "grad_norm": 20.222871780395508, "learning_rate": 1.5759117896522477e-06, "loss": 0.076, "num_input_tokens_seen": 895744, "step": 930 }, { "epoch": 0.07934487440597421, "grad_norm": 35.49565505981445, "learning_rate": 1.5843935538592025e-06, "loss": 0.1215, "num_input_tokens_seen": 900224, "step": 935 }, { "epoch": 0.07976917854718261, "grad_norm": 9.415274620056152, "learning_rate": 1.5928753180661576e-06, "loss": 0.0687, "num_input_tokens_seen": 905280, "step": 940 }, { "epoch": 0.08019348268839104, "grad_norm": 4.259697914123535, "learning_rate": 1.6013570822731128e-06, "loss": 0.0859, "num_input_tokens_seen": 910336, "step": 945 }, { "epoch": 0.08061778682959946, "grad_norm": 15.21580696105957, "learning_rate": 1.609838846480068e-06, "loss": 0.0632, "num_input_tokens_seen": 915776, "step": 950 }, { "epoch": 0.08104209097080788, "grad_norm": 12.9989595413208, "learning_rate": 1.618320610687023e-06, "loss": 0.0857, "num_input_tokens_seen": 920512, "step": 955 }, { "epoch": 0.0814663951120163, "grad_norm": 33.018089294433594, "learning_rate": 1.6268023748939778e-06, "loss": 0.1331, "num_input_tokens_seen": 924992, "step": 960 }, { "epoch": 0.08189069925322472, "grad_norm": 24.30968475341797, "learning_rate": 1.6352841391009329e-06, "loss": 0.1138, "num_input_tokens_seen": 929792, "step": 965 }, { "epoch": 0.08231500339443312, "grad_norm": 6.3038010597229, "learning_rate": 1.643765903307888e-06, "loss": 0.1123, "num_input_tokens_seen": 934208, "step": 970 }, { "epoch": 0.08273930753564154, "grad_norm": 4.268393039703369, "learning_rate": 1.652247667514843e-06, "loss": 0.0767, "num_input_tokens_seen": 938624, "step": 975 }, { "epoch": 0.08316361167684996, "grad_norm": 9.543072700500488, "learning_rate": 1.660729431721798e-06, "loss": 0.0286, "num_input_tokens_seen": 943168, "step": 980 }, { "epoch": 0.08358791581805838, "grad_norm": 35.978233337402344, "learning_rate": 1.669211195928753e-06, "loss": 0.0942, "num_input_tokens_seen": 948032, "step": 985 }, { "epoch": 0.0840122199592668, "grad_norm": 27.4283504486084, "learning_rate": 1.6776929601357082e-06, "loss": 0.0874, "num_input_tokens_seen": 954176, "step": 990 }, { "epoch": 0.08443652410047522, "grad_norm": 16.803491592407227, "learning_rate": 1.6861747243426632e-06, "loss": 0.0855, "num_input_tokens_seen": 958912, "step": 995 }, { "epoch": 0.08486082824168364, "grad_norm": 11.425243377685547, "learning_rate": 1.6946564885496183e-06, "loss": 0.1609, "num_input_tokens_seen": 963264, "step": 1000 }, { "epoch": 0.08528513238289205, "grad_norm": 13.470209121704102, "learning_rate": 1.7031382527565731e-06, "loss": 0.0747, "num_input_tokens_seen": 968256, "step": 1005 }, { "epoch": 0.08570943652410047, "grad_norm": 7.800799369812012, "learning_rate": 1.7116200169635284e-06, "loss": 0.1322, "num_input_tokens_seen": 972608, "step": 1010 }, { "epoch": 0.08613374066530889, "grad_norm": 16.069305419921875, "learning_rate": 1.7201017811704834e-06, "loss": 0.0859, "num_input_tokens_seen": 977856, "step": 1015 }, { "epoch": 0.08655804480651731, "grad_norm": 31.697803497314453, "learning_rate": 1.7285835453774385e-06, "loss": 0.0987, "num_input_tokens_seen": 982720, "step": 1020 }, { "epoch": 0.08698234894772573, "grad_norm": 32.31370544433594, "learning_rate": 1.7370653095843936e-06, "loss": 0.1228, "num_input_tokens_seen": 987584, "step": 1025 }, { "epoch": 0.08740665308893415, "grad_norm": 19.247591018676758, "learning_rate": 1.7455470737913484e-06, "loss": 0.082, "num_input_tokens_seen": 992448, "step": 1030 }, { "epoch": 0.08783095723014257, "grad_norm": 24.267414093017578, "learning_rate": 1.7540288379983035e-06, "loss": 0.1068, "num_input_tokens_seen": 997184, "step": 1035 }, { "epoch": 0.08825526137135098, "grad_norm": 11.689900398254395, "learning_rate": 1.7625106022052587e-06, "loss": 0.0649, "num_input_tokens_seen": 1002432, "step": 1040 }, { "epoch": 0.0886795655125594, "grad_norm": 29.932924270629883, "learning_rate": 1.7709923664122138e-06, "loss": 0.0748, "num_input_tokens_seen": 1007360, "step": 1045 }, { "epoch": 0.08910386965376782, "grad_norm": 0.5252959728240967, "learning_rate": 1.7794741306191686e-06, "loss": 0.0598, "num_input_tokens_seen": 1011968, "step": 1050 }, { "epoch": 0.08952817379497624, "grad_norm": 39.90684127807617, "learning_rate": 1.7879558948261237e-06, "loss": 0.0976, "num_input_tokens_seen": 1016896, "step": 1055 }, { "epoch": 0.08995247793618466, "grad_norm": 2.350248336791992, "learning_rate": 1.7964376590330787e-06, "loss": 0.1005, "num_input_tokens_seen": 1021952, "step": 1060 }, { "epoch": 0.09037678207739308, "grad_norm": 27.664554595947266, "learning_rate": 1.8049194232400338e-06, "loss": 0.1977, "num_input_tokens_seen": 1026560, "step": 1065 }, { "epoch": 0.09080108621860149, "grad_norm": 1.7253851890563965, "learning_rate": 1.813401187446989e-06, "loss": 0.0821, "num_input_tokens_seen": 1031360, "step": 1070 }, { "epoch": 0.09122539035980991, "grad_norm": 32.1817512512207, "learning_rate": 1.821882951653944e-06, "loss": 0.1014, "num_input_tokens_seen": 1036480, "step": 1075 }, { "epoch": 0.09164969450101833, "grad_norm": 30.749300003051758, "learning_rate": 1.830364715860899e-06, "loss": 0.0861, "num_input_tokens_seen": 1041024, "step": 1080 }, { "epoch": 0.09207399864222675, "grad_norm": 10.305405616760254, "learning_rate": 1.838846480067854e-06, "loss": 0.0588, "num_input_tokens_seen": 1045312, "step": 1085 }, { "epoch": 0.09249830278343517, "grad_norm": 13.730518341064453, "learning_rate": 1.847328244274809e-06, "loss": 0.0759, "num_input_tokens_seen": 1050240, "step": 1090 }, { "epoch": 0.09292260692464359, "grad_norm": 11.231359481811523, "learning_rate": 1.8558100084817641e-06, "loss": 0.09, "num_input_tokens_seen": 1055744, "step": 1095 }, { "epoch": 0.09334691106585201, "grad_norm": 25.91359519958496, "learning_rate": 1.864291772688719e-06, "loss": 0.073, "num_input_tokens_seen": 1060352, "step": 1100 }, { "epoch": 0.09377121520706042, "grad_norm": 28.921405792236328, "learning_rate": 1.8727735368956743e-06, "loss": 0.1046, "num_input_tokens_seen": 1065472, "step": 1105 }, { "epoch": 0.09419551934826884, "grad_norm": 0.6421509385108948, "learning_rate": 1.8812553011026293e-06, "loss": 0.0767, "num_input_tokens_seen": 1070144, "step": 1110 }, { "epoch": 0.09461982348947726, "grad_norm": 0.3113643527030945, "learning_rate": 1.8897370653095844e-06, "loss": 0.0741, "num_input_tokens_seen": 1074688, "step": 1115 }, { "epoch": 0.09504412763068568, "grad_norm": 0.44791167974472046, "learning_rate": 1.8982188295165394e-06, "loss": 0.0479, "num_input_tokens_seen": 1079040, "step": 1120 }, { "epoch": 0.0954684317718941, "grad_norm": 49.530181884765625, "learning_rate": 1.9067005937234943e-06, "loss": 0.0952, "num_input_tokens_seen": 1083456, "step": 1125 }, { "epoch": 0.09589273591310252, "grad_norm": 4.848371505737305, "learning_rate": 1.9151823579304493e-06, "loss": 0.1757, "num_input_tokens_seen": 1088064, "step": 1130 }, { "epoch": 0.09631704005431092, "grad_norm": 11.97738265991211, "learning_rate": 1.9236641221374044e-06, "loss": 0.0502, "num_input_tokens_seen": 1092544, "step": 1135 }, { "epoch": 0.09674134419551934, "grad_norm": 52.587013244628906, "learning_rate": 1.9321458863443595e-06, "loss": 0.0995, "num_input_tokens_seen": 1097792, "step": 1140 }, { "epoch": 0.09716564833672776, "grad_norm": 13.555150032043457, "learning_rate": 1.9406276505513145e-06, "loss": 0.1066, "num_input_tokens_seen": 1102912, "step": 1145 }, { "epoch": 0.09758995247793618, "grad_norm": 3.3610050678253174, "learning_rate": 1.9491094147582696e-06, "loss": 0.0254, "num_input_tokens_seen": 1107840, "step": 1150 }, { "epoch": 0.0980142566191446, "grad_norm": 38.56612777709961, "learning_rate": 1.9575911789652246e-06, "loss": 0.1516, "num_input_tokens_seen": 1112448, "step": 1155 }, { "epoch": 0.09843856076035302, "grad_norm": 24.816673278808594, "learning_rate": 1.9660729431721797e-06, "loss": 0.1223, "num_input_tokens_seen": 1117248, "step": 1160 }, { "epoch": 0.09886286490156145, "grad_norm": 25.663911819458008, "learning_rate": 1.9745547073791347e-06, "loss": 0.089, "num_input_tokens_seen": 1121984, "step": 1165 }, { "epoch": 0.09928716904276985, "grad_norm": 8.376113891601562, "learning_rate": 1.98303647158609e-06, "loss": 0.0988, "num_input_tokens_seen": 1127040, "step": 1170 }, { "epoch": 0.09971147318397827, "grad_norm": 12.205850601196289, "learning_rate": 1.991518235793045e-06, "loss": 0.0578, "num_input_tokens_seen": 1131904, "step": 1175 }, { "epoch": 0.10013577732518669, "grad_norm": 19.723358154296875, "learning_rate": 2e-06, "loss": 0.1054, "num_input_tokens_seen": 1136384, "step": 1180 }, { "epoch": 0.10013577732518669, "eval_loss": 0.07076410949230194, "eval_runtime": 15.763, "eval_samples_per_second": 664.533, "eval_steps_per_second": 83.106, "num_input_tokens_seen": 1136384, "step": 1180 }, { "epoch": 0.10056008146639511, "grad_norm": 8.96194076538086, "learning_rate": 1.999998903046209e-06, "loss": 0.0543, "num_input_tokens_seen": 1140864, "step": 1185 }, { "epoch": 0.10098438560760353, "grad_norm": 24.896997451782227, "learning_rate": 1.999995612187243e-06, "loss": 0.1416, "num_input_tokens_seen": 1145408, "step": 1190 }, { "epoch": 0.10140868974881195, "grad_norm": 21.160457611083984, "learning_rate": 1.9999901274303226e-06, "loss": 0.1497, "num_input_tokens_seen": 1150400, "step": 1195 }, { "epoch": 0.10183299389002037, "grad_norm": 68.9247817993164, "learning_rate": 1.9999824487874795e-06, "loss": 0.1094, "num_input_tokens_seen": 1154880, "step": 1200 }, { "epoch": 0.10225729803122878, "grad_norm": 0.9308235049247742, "learning_rate": 1.999972576275561e-06, "loss": 0.1046, "num_input_tokens_seen": 1159552, "step": 1205 }, { "epoch": 0.1026816021724372, "grad_norm": 14.106361389160156, "learning_rate": 1.999960509916226e-06, "loss": 0.0262, "num_input_tokens_seen": 1164800, "step": 1210 }, { "epoch": 0.10310590631364562, "grad_norm": 31.319644927978516, "learning_rate": 1.9999462497359463e-06, "loss": 0.0621, "num_input_tokens_seen": 1170304, "step": 1215 }, { "epoch": 0.10353021045485404, "grad_norm": 26.950510025024414, "learning_rate": 1.999929795766009e-06, "loss": 0.0834, "num_input_tokens_seen": 1175040, "step": 1220 }, { "epoch": 0.10395451459606246, "grad_norm": 0.2634492814540863, "learning_rate": 1.999911148042511e-06, "loss": 0.0045, "num_input_tokens_seen": 1180288, "step": 1225 }, { "epoch": 0.10437881873727088, "grad_norm": 36.4853630065918, "learning_rate": 1.999890306606365e-06, "loss": 0.097, "num_input_tokens_seen": 1185088, "step": 1230 }, { "epoch": 0.10480312287847929, "grad_norm": 0.16783910989761353, "learning_rate": 1.9998672715032944e-06, "loss": 0.0987, "num_input_tokens_seen": 1189504, "step": 1235 }, { "epoch": 0.10522742701968771, "grad_norm": 0.6057221293449402, "learning_rate": 1.999842042783836e-06, "loss": 0.1065, "num_input_tokens_seen": 1194304, "step": 1240 }, { "epoch": 0.10565173116089613, "grad_norm": 6.111150741577148, "learning_rate": 1.99981462050334e-06, "loss": 0.0156, "num_input_tokens_seen": 1198976, "step": 1245 }, { "epoch": 0.10607603530210455, "grad_norm": 16.78822898864746, "learning_rate": 1.999785004721968e-06, "loss": 0.0797, "num_input_tokens_seen": 1203520, "step": 1250 }, { "epoch": 0.10650033944331297, "grad_norm": 7.630631923675537, "learning_rate": 1.9997531955046936e-06, "loss": 0.0947, "num_input_tokens_seen": 1207808, "step": 1255 }, { "epoch": 0.10692464358452139, "grad_norm": 2.147573947906494, "learning_rate": 1.9997191929213044e-06, "loss": 0.0938, "num_input_tokens_seen": 1212992, "step": 1260 }, { "epoch": 0.10734894772572981, "grad_norm": 33.501197814941406, "learning_rate": 1.999682997046398e-06, "loss": 0.1488, "num_input_tokens_seen": 1217344, "step": 1265 }, { "epoch": 0.10777325186693822, "grad_norm": 34.510108947753906, "learning_rate": 1.9996446079593855e-06, "loss": 0.0821, "num_input_tokens_seen": 1222080, "step": 1270 }, { "epoch": 0.10819755600814664, "grad_norm": 4.3939208984375, "learning_rate": 1.999604025744489e-06, "loss": 0.1393, "num_input_tokens_seen": 1226752, "step": 1275 }, { "epoch": 0.10862186014935506, "grad_norm": 34.965816497802734, "learning_rate": 1.9995612504907414e-06, "loss": 0.1702, "num_input_tokens_seen": 1231808, "step": 1280 }, { "epoch": 0.10904616429056348, "grad_norm": 17.587800979614258, "learning_rate": 1.999516282291988e-06, "loss": 0.1218, "num_input_tokens_seen": 1236352, "step": 1285 }, { "epoch": 0.1094704684317719, "grad_norm": 18.25070571899414, "learning_rate": 1.9994691212468853e-06, "loss": 0.0983, "num_input_tokens_seen": 1241088, "step": 1290 }, { "epoch": 0.10989477257298032, "grad_norm": 2.138901472091675, "learning_rate": 1.9994197674588997e-06, "loss": 0.0506, "num_input_tokens_seen": 1246336, "step": 1295 }, { "epoch": 0.11031907671418872, "grad_norm": 7.946754455566406, "learning_rate": 1.999368221036309e-06, "loss": 0.1021, "num_input_tokens_seen": 1251648, "step": 1300 }, { "epoch": 0.11074338085539714, "grad_norm": 18.631914138793945, "learning_rate": 1.9993144820922015e-06, "loss": 0.0848, "num_input_tokens_seen": 1256448, "step": 1305 }, { "epoch": 0.11116768499660556, "grad_norm": 1.045030117034912, "learning_rate": 1.9992585507444757e-06, "loss": 0.096, "num_input_tokens_seen": 1261184, "step": 1310 }, { "epoch": 0.11159198913781398, "grad_norm": 38.57451629638672, "learning_rate": 1.999200427115839e-06, "loss": 0.1, "num_input_tokens_seen": 1266304, "step": 1315 }, { "epoch": 0.1120162932790224, "grad_norm": 4.0364556312561035, "learning_rate": 1.99914011133381e-06, "loss": 0.0415, "num_input_tokens_seen": 1270848, "step": 1320 }, { "epoch": 0.11244059742023083, "grad_norm": 7.261585712432861, "learning_rate": 1.999077603530716e-06, "loss": 0.0318, "num_input_tokens_seen": 1275712, "step": 1325 }, { "epoch": 0.11286490156143925, "grad_norm": 18.819059371948242, "learning_rate": 1.999012903843693e-06, "loss": 0.0425, "num_input_tokens_seen": 1280000, "step": 1330 }, { "epoch": 0.11328920570264765, "grad_norm": 17.924287796020508, "learning_rate": 1.9989460124146854e-06, "loss": 0.0826, "num_input_tokens_seen": 1285440, "step": 1335 }, { "epoch": 0.11371350984385607, "grad_norm": 0.6249921321868896, "learning_rate": 1.998876929390448e-06, "loss": 0.0835, "num_input_tokens_seen": 1290176, "step": 1340 }, { "epoch": 0.11413781398506449, "grad_norm": 54.781005859375, "learning_rate": 1.9988056549225423e-06, "loss": 0.108, "num_input_tokens_seen": 1294912, "step": 1345 }, { "epoch": 0.11456211812627291, "grad_norm": 1.0688238143920898, "learning_rate": 1.9987321891673375e-06, "loss": 0.0703, "num_input_tokens_seen": 1299136, "step": 1350 }, { "epoch": 0.11498642226748133, "grad_norm": 8.30221939086914, "learning_rate": 1.9986565322860116e-06, "loss": 0.1112, "num_input_tokens_seen": 1303936, "step": 1355 }, { "epoch": 0.11541072640868975, "grad_norm": 21.210979461669922, "learning_rate": 1.9985786844445474e-06, "loss": 0.045, "num_input_tokens_seen": 1308928, "step": 1360 }, { "epoch": 0.11583503054989816, "grad_norm": 16.939516067504883, "learning_rate": 1.9984986458137366e-06, "loss": 0.0518, "num_input_tokens_seen": 1313728, "step": 1365 }, { "epoch": 0.11625933469110658, "grad_norm": 2.8378939628601074, "learning_rate": 1.998416416569177e-06, "loss": 0.062, "num_input_tokens_seen": 1318400, "step": 1370 }, { "epoch": 0.116683638832315, "grad_norm": 15.487943649291992, "learning_rate": 1.9983319968912714e-06, "loss": 0.0946, "num_input_tokens_seen": 1322752, "step": 1375 }, { "epoch": 0.11710794297352342, "grad_norm": 6.950421333312988, "learning_rate": 1.9982453869652286e-06, "loss": 0.0354, "num_input_tokens_seen": 1327552, "step": 1380 }, { "epoch": 0.11753224711473184, "grad_norm": 45.381065368652344, "learning_rate": 1.9981565869810637e-06, "loss": 0.084, "num_input_tokens_seen": 1331776, "step": 1385 }, { "epoch": 0.11795655125594026, "grad_norm": 7.881857872009277, "learning_rate": 1.998065597133594e-06, "loss": 0.0591, "num_input_tokens_seen": 1336128, "step": 1390 }, { "epoch": 0.11838085539714868, "grad_norm": 14.927377700805664, "learning_rate": 1.9979724176224447e-06, "loss": 0.1068, "num_input_tokens_seen": 1340800, "step": 1395 }, { "epoch": 0.11880515953835709, "grad_norm": 7.404366970062256, "learning_rate": 1.997877048652042e-06, "loss": 0.0982, "num_input_tokens_seen": 1345408, "step": 1400 }, { "epoch": 0.11922946367956551, "grad_norm": 37.19292068481445, "learning_rate": 1.9977794904316163e-06, "loss": 0.12, "num_input_tokens_seen": 1350208, "step": 1405 }, { "epoch": 0.11965376782077393, "grad_norm": 4.006378650665283, "learning_rate": 1.9976797431752023e-06, "loss": 0.0689, "num_input_tokens_seen": 1354624, "step": 1410 }, { "epoch": 0.12007807196198235, "grad_norm": 1.479600429534912, "learning_rate": 1.9975778071016357e-06, "loss": 0.0574, "num_input_tokens_seen": 1359232, "step": 1415 }, { "epoch": 0.12050237610319077, "grad_norm": 8.415390014648438, "learning_rate": 1.997473682434555e-06, "loss": 0.0794, "num_input_tokens_seen": 1363904, "step": 1420 }, { "epoch": 0.12092668024439919, "grad_norm": 0.7171556949615479, "learning_rate": 1.9973673694023998e-06, "loss": 0.0577, "num_input_tokens_seen": 1368448, "step": 1425 }, { "epoch": 0.12135098438560761, "grad_norm": 19.485977172851562, "learning_rate": 1.997258868238411e-06, "loss": 0.099, "num_input_tokens_seen": 1372864, "step": 1430 }, { "epoch": 0.12177528852681602, "grad_norm": 26.69516944885254, "learning_rate": 1.997148179180631e-06, "loss": 0.0979, "num_input_tokens_seen": 1377920, "step": 1435 }, { "epoch": 0.12219959266802444, "grad_norm": 15.690411567687988, "learning_rate": 1.9970353024719003e-06, "loss": 0.0951, "num_input_tokens_seen": 1382464, "step": 1440 }, { "epoch": 0.12262389680923286, "grad_norm": 15.733072280883789, "learning_rate": 1.9969202383598605e-06, "loss": 0.065, "num_input_tokens_seen": 1387072, "step": 1445 }, { "epoch": 0.12304820095044128, "grad_norm": 13.7681245803833, "learning_rate": 1.996802987096952e-06, "loss": 0.0363, "num_input_tokens_seen": 1391488, "step": 1450 }, { "epoch": 0.1234725050916497, "grad_norm": 22.600217819213867, "learning_rate": 1.9966835489404123e-06, "loss": 0.1148, "num_input_tokens_seen": 1397440, "step": 1455 }, { "epoch": 0.12389680923285812, "grad_norm": 2.1628708839416504, "learning_rate": 1.996561924152278e-06, "loss": 0.0559, "num_input_tokens_seen": 1402048, "step": 1460 }, { "epoch": 0.12432111337406652, "grad_norm": 51.919342041015625, "learning_rate": 1.996438112999383e-06, "loss": 0.0275, "num_input_tokens_seen": 1406784, "step": 1465 }, { "epoch": 0.12474541751527495, "grad_norm": 58.69674301147461, "learning_rate": 1.9963121157533573e-06, "loss": 0.1324, "num_input_tokens_seen": 1411328, "step": 1470 }, { "epoch": 0.12516972165648338, "grad_norm": 1.9336317777633667, "learning_rate": 1.9961839326906272e-06, "loss": 0.1638, "num_input_tokens_seen": 1415936, "step": 1475 }, { "epoch": 0.12559402579769177, "grad_norm": 41.08507537841797, "learning_rate": 1.9960535640924146e-06, "loss": 0.1479, "num_input_tokens_seen": 1421248, "step": 1480 }, { "epoch": 0.1260183299389002, "grad_norm": 7.536561965942383, "learning_rate": 1.995921010244736e-06, "loss": 0.0392, "num_input_tokens_seen": 1425728, "step": 1485 }, { "epoch": 0.1264426340801086, "grad_norm": 14.329437255859375, "learning_rate": 1.9957862714384025e-06, "loss": 0.0857, "num_input_tokens_seen": 1431296, "step": 1490 }, { "epoch": 0.12686693822131703, "grad_norm": 10.391690254211426, "learning_rate": 1.9956493479690188e-06, "loss": 0.0819, "num_input_tokens_seen": 1436160, "step": 1495 }, { "epoch": 0.12729124236252545, "grad_norm": 2.942070722579956, "learning_rate": 1.9955102401369814e-06, "loss": 0.1003, "num_input_tokens_seen": 1440960, "step": 1500 }, { "epoch": 0.12771554650373387, "grad_norm": 14.67211627960205, "learning_rate": 1.9953689482474806e-06, "loss": 0.0611, "num_input_tokens_seen": 1445760, "step": 1505 }, { "epoch": 0.1281398506449423, "grad_norm": 0.5613120198249817, "learning_rate": 1.995225472610498e-06, "loss": 0.0144, "num_input_tokens_seen": 1450688, "step": 1510 }, { "epoch": 0.12856415478615071, "grad_norm": 40.097408294677734, "learning_rate": 1.9950798135408057e-06, "loss": 0.1675, "num_input_tokens_seen": 1455552, "step": 1515 }, { "epoch": 0.12898845892735913, "grad_norm": 22.93716812133789, "learning_rate": 1.994931971357966e-06, "loss": 0.0808, "num_input_tokens_seen": 1460416, "step": 1520 }, { "epoch": 0.12941276306856755, "grad_norm": 1.170467495918274, "learning_rate": 1.9947819463863316e-06, "loss": 0.0717, "num_input_tokens_seen": 1466432, "step": 1525 }, { "epoch": 0.12983706720977597, "grad_norm": 18.326679229736328, "learning_rate": 1.9946297389550432e-06, "loss": 0.0989, "num_input_tokens_seen": 1471232, "step": 1530 }, { "epoch": 0.1302613713509844, "grad_norm": 13.557463645935059, "learning_rate": 1.9944753493980292e-06, "loss": 0.0587, "num_input_tokens_seen": 1476160, "step": 1535 }, { "epoch": 0.13068567549219282, "grad_norm": 19.043073654174805, "learning_rate": 1.9943187780540062e-06, "loss": 0.0755, "num_input_tokens_seen": 1481152, "step": 1540 }, { "epoch": 0.13110997963340124, "grad_norm": 6.632198333740234, "learning_rate": 1.994160025266478e-06, "loss": 0.1668, "num_input_tokens_seen": 1486336, "step": 1545 }, { "epoch": 0.13153428377460963, "grad_norm": 11.749693870544434, "learning_rate": 1.9939990913837327e-06, "loss": 0.0588, "num_input_tokens_seen": 1491264, "step": 1550 }, { "epoch": 0.13195858791581805, "grad_norm": 5.760255813598633, "learning_rate": 1.993835976758845e-06, "loss": 0.0672, "num_input_tokens_seen": 1495680, "step": 1555 }, { "epoch": 0.13238289205702647, "grad_norm": 41.815704345703125, "learning_rate": 1.993670681749673e-06, "loss": 0.1687, "num_input_tokens_seen": 1501376, "step": 1560 }, { "epoch": 0.1328071961982349, "grad_norm": 4.122589588165283, "learning_rate": 1.9935032067188587e-06, "loss": 0.1089, "num_input_tokens_seen": 1506176, "step": 1565 }, { "epoch": 0.1332315003394433, "grad_norm": 25.289459228515625, "learning_rate": 1.993333552033827e-06, "loss": 0.0749, "num_input_tokens_seen": 1511808, "step": 1570 }, { "epoch": 0.13365580448065173, "grad_norm": 20.910133361816406, "learning_rate": 1.9931617180667844e-06, "loss": 0.0406, "num_input_tokens_seen": 1516608, "step": 1575 }, { "epoch": 0.13408010862186015, "grad_norm": 15.184967041015625, "learning_rate": 1.992987705194719e-06, "loss": 0.0988, "num_input_tokens_seen": 1521280, "step": 1580 }, { "epoch": 0.13450441276306857, "grad_norm": 1.4249966144561768, "learning_rate": 1.9928115137993983e-06, "loss": 0.0683, "num_input_tokens_seen": 1526080, "step": 1585 }, { "epoch": 0.134928716904277, "grad_norm": 21.653785705566406, "learning_rate": 1.9926331442673703e-06, "loss": 0.0429, "num_input_tokens_seen": 1530944, "step": 1590 }, { "epoch": 0.1353530210454854, "grad_norm": 37.388206481933594, "learning_rate": 1.992452596989962e-06, "loss": 0.1098, "num_input_tokens_seen": 1536256, "step": 1595 }, { "epoch": 0.13577732518669383, "grad_norm": 36.764583587646484, "learning_rate": 1.9922698723632763e-06, "loss": 0.0842, "num_input_tokens_seen": 1540864, "step": 1600 }, { "epoch": 0.13620162932790225, "grad_norm": 9.037910461425781, "learning_rate": 1.992084970788195e-06, "loss": 0.077, "num_input_tokens_seen": 1545536, "step": 1605 }, { "epoch": 0.13662593346911067, "grad_norm": 30.793230056762695, "learning_rate": 1.991897892670375e-06, "loss": 0.1246, "num_input_tokens_seen": 1550144, "step": 1610 }, { "epoch": 0.13705023761031906, "grad_norm": 8.332464218139648, "learning_rate": 1.9917086384202475e-06, "loss": 0.0509, "num_input_tokens_seen": 1554624, "step": 1615 }, { "epoch": 0.13747454175152748, "grad_norm": 16.857133865356445, "learning_rate": 1.9915172084530195e-06, "loss": 0.1169, "num_input_tokens_seen": 1559168, "step": 1620 }, { "epoch": 0.1378988458927359, "grad_norm": 9.418706893920898, "learning_rate": 1.9913236031886707e-06, "loss": 0.0867, "num_input_tokens_seen": 1564032, "step": 1625 }, { "epoch": 0.13832315003394433, "grad_norm": 11.382521629333496, "learning_rate": 1.9911278230519533e-06, "loss": 0.0813, "num_input_tokens_seen": 1568896, "step": 1630 }, { "epoch": 0.13874745417515275, "grad_norm": 21.073949813842773, "learning_rate": 1.9909298684723905e-06, "loss": 0.0779, "num_input_tokens_seen": 1573888, "step": 1635 }, { "epoch": 0.13917175831636117, "grad_norm": 0.7896543741226196, "learning_rate": 1.9907297398842764e-06, "loss": 0.0649, "num_input_tokens_seen": 1578496, "step": 1640 }, { "epoch": 0.1395960624575696, "grad_norm": 20.59737777709961, "learning_rate": 1.9905274377266744e-06, "loss": 0.0418, "num_input_tokens_seen": 1583104, "step": 1645 }, { "epoch": 0.140020366598778, "grad_norm": 53.22269821166992, "learning_rate": 1.9903229624434174e-06, "loss": 0.1031, "num_input_tokens_seen": 1587648, "step": 1650 }, { "epoch": 0.14044467073998643, "grad_norm": 17.283193588256836, "learning_rate": 1.9901163144831047e-06, "loss": 0.1513, "num_input_tokens_seen": 1593216, "step": 1655 }, { "epoch": 0.14086897488119485, "grad_norm": 17.767263412475586, "learning_rate": 1.989907494299103e-06, "loss": 0.0057, "num_input_tokens_seen": 1598208, "step": 1660 }, { "epoch": 0.14129327902240327, "grad_norm": 1.1825686693191528, "learning_rate": 1.989696502349545e-06, "loss": 0.0057, "num_input_tokens_seen": 1602688, "step": 1665 }, { "epoch": 0.1417175831636117, "grad_norm": 17.303054809570312, "learning_rate": 1.9894833390973266e-06, "loss": 0.1691, "num_input_tokens_seen": 1606784, "step": 1670 }, { "epoch": 0.1421418873048201, "grad_norm": 27.718305587768555, "learning_rate": 1.9892680050101085e-06, "loss": 0.1757, "num_input_tokens_seen": 1611584, "step": 1675 }, { "epoch": 0.1425661914460285, "grad_norm": 13.038661003112793, "learning_rate": 1.9890505005603146e-06, "loss": 0.094, "num_input_tokens_seen": 1616576, "step": 1680 }, { "epoch": 0.14299049558723692, "grad_norm": 9.758828163146973, "learning_rate": 1.9888308262251284e-06, "loss": 0.0994, "num_input_tokens_seen": 1621440, "step": 1685 }, { "epoch": 0.14341479972844534, "grad_norm": 21.367116928100586, "learning_rate": 1.9886089824864956e-06, "loss": 0.071, "num_input_tokens_seen": 1626368, "step": 1690 }, { "epoch": 0.14383910386965376, "grad_norm": 16.830913543701172, "learning_rate": 1.9883849698311213e-06, "loss": 0.0566, "num_input_tokens_seen": 1630784, "step": 1695 }, { "epoch": 0.14426340801086218, "grad_norm": 13.30070686340332, "learning_rate": 1.988158788750468e-06, "loss": 0.0815, "num_input_tokens_seen": 1635776, "step": 1700 }, { "epoch": 0.1446877121520706, "grad_norm": 16.3173828125, "learning_rate": 1.9879304397407566e-06, "loss": 0.0967, "num_input_tokens_seen": 1640448, "step": 1705 }, { "epoch": 0.14511201629327902, "grad_norm": 7.858912467956543, "learning_rate": 1.987699923302963e-06, "loss": 0.0206, "num_input_tokens_seen": 1645440, "step": 1710 }, { "epoch": 0.14553632043448744, "grad_norm": 35.57898712158203, "learning_rate": 1.9874672399428195e-06, "loss": 0.0811, "num_input_tokens_seen": 1649984, "step": 1715 }, { "epoch": 0.14596062457569586, "grad_norm": 39.42796325683594, "learning_rate": 1.9872323901708116e-06, "loss": 0.1235, "num_input_tokens_seen": 1654720, "step": 1720 }, { "epoch": 0.14638492871690428, "grad_norm": 19.698476791381836, "learning_rate": 1.9869953745021785e-06, "loss": 0.1061, "num_input_tokens_seen": 1659648, "step": 1725 }, { "epoch": 0.1468092328581127, "grad_norm": 0.6171606779098511, "learning_rate": 1.9867561934569103e-06, "loss": 0.0567, "num_input_tokens_seen": 1664896, "step": 1730 }, { "epoch": 0.14723353699932112, "grad_norm": 3.3654897212982178, "learning_rate": 1.9865148475597475e-06, "loss": 0.0482, "num_input_tokens_seen": 1669568, "step": 1735 }, { "epoch": 0.14765784114052954, "grad_norm": 19.97379493713379, "learning_rate": 1.986271337340182e-06, "loss": 0.0588, "num_input_tokens_seen": 1674432, "step": 1740 }, { "epoch": 0.14808214528173794, "grad_norm": 3.5285873413085938, "learning_rate": 1.9860256633324513e-06, "loss": 0.0265, "num_input_tokens_seen": 1678720, "step": 1745 }, { "epoch": 0.14850644942294636, "grad_norm": 23.46439552307129, "learning_rate": 1.9857778260755426e-06, "loss": 0.0692, "num_input_tokens_seen": 1683904, "step": 1750 }, { "epoch": 0.14893075356415478, "grad_norm": 28.765380859375, "learning_rate": 1.9855278261131876e-06, "loss": 0.0717, "num_input_tokens_seen": 1689024, "step": 1755 }, { "epoch": 0.1493550577053632, "grad_norm": 2.764413833618164, "learning_rate": 1.985275663993863e-06, "loss": 0.0851, "num_input_tokens_seen": 1693632, "step": 1760 }, { "epoch": 0.14977936184657162, "grad_norm": 13.944731712341309, "learning_rate": 1.9850213402707888e-06, "loss": 0.0532, "num_input_tokens_seen": 1698304, "step": 1765 }, { "epoch": 0.15020366598778004, "grad_norm": 40.670230865478516, "learning_rate": 1.9847648555019286e-06, "loss": 0.1201, "num_input_tokens_seen": 1703808, "step": 1770 }, { "epoch": 0.15020366598778004, "eval_loss": 0.0835869163274765, "eval_runtime": 16.0609, "eval_samples_per_second": 652.205, "eval_steps_per_second": 81.565, "num_input_tokens_seen": 1703808, "step": 1770 }, { "epoch": 0.15062797012898846, "grad_norm": 4.531874656677246, "learning_rate": 1.9845062102499858e-06, "loss": 0.0634, "num_input_tokens_seen": 1708992, "step": 1775 }, { "epoch": 0.15105227427019688, "grad_norm": 1.780526876449585, "learning_rate": 1.9842454050824043e-06, "loss": 0.0769, "num_input_tokens_seen": 1713600, "step": 1780 }, { "epoch": 0.1514765784114053, "grad_norm": 42.02983856201172, "learning_rate": 1.9839824405713663e-06, "loss": 0.0963, "num_input_tokens_seen": 1718208, "step": 1785 }, { "epoch": 0.15190088255261372, "grad_norm": 32.276702880859375, "learning_rate": 1.983717317293792e-06, "loss": 0.1394, "num_input_tokens_seen": 1722560, "step": 1790 }, { "epoch": 0.15232518669382214, "grad_norm": 1.8770980834960938, "learning_rate": 1.983450035831337e-06, "loss": 0.0263, "num_input_tokens_seen": 1727296, "step": 1795 }, { "epoch": 0.15274949083503056, "grad_norm": 13.28686809539795, "learning_rate": 1.983180596770392e-06, "loss": 0.0809, "num_input_tokens_seen": 1732608, "step": 1800 }, { "epoch": 0.15317379497623898, "grad_norm": 0.3505326211452484, "learning_rate": 1.982909000702082e-06, "loss": 0.0569, "num_input_tokens_seen": 1737280, "step": 1805 }, { "epoch": 0.15359809911744737, "grad_norm": 5.0527262687683105, "learning_rate": 1.982635248222264e-06, "loss": 0.0948, "num_input_tokens_seen": 1741440, "step": 1810 }, { "epoch": 0.1540224032586558, "grad_norm": 10.845234870910645, "learning_rate": 1.982359339931524e-06, "loss": 0.0485, "num_input_tokens_seen": 1746176, "step": 1815 }, { "epoch": 0.1544467073998642, "grad_norm": 21.369482040405273, "learning_rate": 1.9820812764351804e-06, "loss": 0.1267, "num_input_tokens_seen": 1751680, "step": 1820 }, { "epoch": 0.15487101154107263, "grad_norm": 0.3655910789966583, "learning_rate": 1.981801058343279e-06, "loss": 0.0524, "num_input_tokens_seen": 1756416, "step": 1825 }, { "epoch": 0.15529531568228105, "grad_norm": 1.76851487159729, "learning_rate": 1.981518686270592e-06, "loss": 0.0919, "num_input_tokens_seen": 1760960, "step": 1830 }, { "epoch": 0.15571961982348947, "grad_norm": 33.344947814941406, "learning_rate": 1.9812341608366183e-06, "loss": 0.0884, "num_input_tokens_seen": 1766208, "step": 1835 }, { "epoch": 0.1561439239646979, "grad_norm": 48.86477279663086, "learning_rate": 1.980947482665579e-06, "loss": 0.0528, "num_input_tokens_seen": 1771264, "step": 1840 }, { "epoch": 0.15656822810590632, "grad_norm": 10.009577751159668, "learning_rate": 1.980658652386421e-06, "loss": 0.15, "num_input_tokens_seen": 1776192, "step": 1845 }, { "epoch": 0.15699253224711474, "grad_norm": 1.3521720170974731, "learning_rate": 1.9803676706328102e-06, "loss": 0.0842, "num_input_tokens_seen": 1780992, "step": 1850 }, { "epoch": 0.15741683638832316, "grad_norm": 12.654996871948242, "learning_rate": 1.980074538043134e-06, "loss": 0.0473, "num_input_tokens_seen": 1785408, "step": 1855 }, { "epoch": 0.15784114052953158, "grad_norm": 24.43471908569336, "learning_rate": 1.9797792552604985e-06, "loss": 0.1532, "num_input_tokens_seen": 1790208, "step": 1860 }, { "epoch": 0.15826544467074, "grad_norm": 0.8395228385925293, "learning_rate": 1.9794818229327266e-06, "loss": 0.0137, "num_input_tokens_seen": 1795264, "step": 1865 }, { "epoch": 0.15868974881194842, "grad_norm": 28.747392654418945, "learning_rate": 1.9791822417123576e-06, "loss": 0.0572, "num_input_tokens_seen": 1800064, "step": 1870 }, { "epoch": 0.1591140529531568, "grad_norm": 6.029873847961426, "learning_rate": 1.9788805122566445e-06, "loss": 0.05, "num_input_tokens_seen": 1804672, "step": 1875 }, { "epoch": 0.15953835709436523, "grad_norm": 28.338607788085938, "learning_rate": 1.9785766352275538e-06, "loss": 0.1075, "num_input_tokens_seen": 1809408, "step": 1880 }, { "epoch": 0.15996266123557365, "grad_norm": 22.816967010498047, "learning_rate": 1.9782706112917643e-06, "loss": 0.1561, "num_input_tokens_seen": 1813824, "step": 1885 }, { "epoch": 0.16038696537678207, "grad_norm": 28.80828857421875, "learning_rate": 1.977962441120664e-06, "loss": 0.0392, "num_input_tokens_seen": 1818176, "step": 1890 }, { "epoch": 0.1608112695179905, "grad_norm": 1.2556086778640747, "learning_rate": 1.9776521253903492e-06, "loss": 0.0622, "num_input_tokens_seen": 1822784, "step": 1895 }, { "epoch": 0.1612355736591989, "grad_norm": 14.4972562789917, "learning_rate": 1.9773396647816246e-06, "loss": 0.1414, "num_input_tokens_seen": 1827520, "step": 1900 }, { "epoch": 0.16165987780040733, "grad_norm": 19.10732650756836, "learning_rate": 1.97702505998e-06, "loss": 0.0911, "num_input_tokens_seen": 1832256, "step": 1905 }, { "epoch": 0.16208418194161575, "grad_norm": 14.778487205505371, "learning_rate": 1.976708311675688e-06, "loss": 0.0821, "num_input_tokens_seen": 1836864, "step": 1910 }, { "epoch": 0.16250848608282417, "grad_norm": 7.831988334655762, "learning_rate": 1.976389420563607e-06, "loss": 0.0317, "num_input_tokens_seen": 1841280, "step": 1915 }, { "epoch": 0.1629327902240326, "grad_norm": 1.758711338043213, "learning_rate": 1.9760683873433734e-06, "loss": 0.0848, "num_input_tokens_seen": 1846080, "step": 1920 }, { "epoch": 0.163357094365241, "grad_norm": 0.39283105731010437, "learning_rate": 1.9757452127193043e-06, "loss": 0.0373, "num_input_tokens_seen": 1850816, "step": 1925 }, { "epoch": 0.16378139850644943, "grad_norm": 20.11293601989746, "learning_rate": 1.9754198974004156e-06, "loss": 0.0922, "num_input_tokens_seen": 1855232, "step": 1930 }, { "epoch": 0.16420570264765785, "grad_norm": 2.879047393798828, "learning_rate": 1.975092442100419e-06, "loss": 0.0689, "num_input_tokens_seen": 1860160, "step": 1935 }, { "epoch": 0.16463000678886625, "grad_norm": 4.302145957946777, "learning_rate": 1.9747628475377204e-06, "loss": 0.0229, "num_input_tokens_seen": 1865024, "step": 1940 }, { "epoch": 0.16505431093007467, "grad_norm": 48.407005310058594, "learning_rate": 1.9744311144354208e-06, "loss": 0.0846, "num_input_tokens_seen": 1869888, "step": 1945 }, { "epoch": 0.16547861507128309, "grad_norm": 0.2003674954175949, "learning_rate": 1.9740972435213112e-06, "loss": 0.1164, "num_input_tokens_seen": 1874624, "step": 1950 }, { "epoch": 0.1659029192124915, "grad_norm": 33.10806655883789, "learning_rate": 1.973761235527874e-06, "loss": 0.066, "num_input_tokens_seen": 1879168, "step": 1955 }, { "epoch": 0.16632722335369993, "grad_norm": 51.882904052734375, "learning_rate": 1.9734230911922795e-06, "loss": 0.1811, "num_input_tokens_seen": 1884096, "step": 1960 }, { "epoch": 0.16675152749490835, "grad_norm": 63.34104919433594, "learning_rate": 1.9730828112563852e-06, "loss": 0.0921, "num_input_tokens_seen": 1888832, "step": 1965 }, { "epoch": 0.16717583163611677, "grad_norm": 32.06380081176758, "learning_rate": 1.972740396466734e-06, "loss": 0.0428, "num_input_tokens_seen": 1893696, "step": 1970 }, { "epoch": 0.1676001357773252, "grad_norm": 2.3903558254241943, "learning_rate": 1.972395847574552e-06, "loss": 0.0128, "num_input_tokens_seen": 1898176, "step": 1975 }, { "epoch": 0.1680244399185336, "grad_norm": 6.827738285064697, "learning_rate": 1.972049165335747e-06, "loss": 0.092, "num_input_tokens_seen": 1902720, "step": 1980 }, { "epoch": 0.16844874405974203, "grad_norm": 25.457487106323242, "learning_rate": 1.9717003505109094e-06, "loss": 0.0494, "num_input_tokens_seen": 1907520, "step": 1985 }, { "epoch": 0.16887304820095045, "grad_norm": 13.417096138000488, "learning_rate": 1.9713494038653054e-06, "loss": 0.0955, "num_input_tokens_seen": 1912000, "step": 1990 }, { "epoch": 0.16929735234215887, "grad_norm": 5.452141761779785, "learning_rate": 1.97099632616888e-06, "loss": 0.0437, "num_input_tokens_seen": 1916224, "step": 1995 }, { "epoch": 0.1697216564833673, "grad_norm": 0.8709999918937683, "learning_rate": 1.9706411181962534e-06, "loss": 0.0532, "num_input_tokens_seen": 1920896, "step": 2000 }, { "epoch": 0.1701459606245757, "grad_norm": 17.97975730895996, "learning_rate": 1.970283780726718e-06, "loss": 0.0502, "num_input_tokens_seen": 1925312, "step": 2005 }, { "epoch": 0.1705702647657841, "grad_norm": 0.3186114728450775, "learning_rate": 1.9699243145442397e-06, "loss": 0.095, "num_input_tokens_seen": 1929920, "step": 2010 }, { "epoch": 0.17099456890699252, "grad_norm": 25.567066192626953, "learning_rate": 1.9695627204374544e-06, "loss": 0.0817, "num_input_tokens_seen": 1934720, "step": 2015 }, { "epoch": 0.17141887304820094, "grad_norm": 21.674535751342773, "learning_rate": 1.969198999199666e-06, "loss": 0.0332, "num_input_tokens_seen": 1939584, "step": 2020 }, { "epoch": 0.17184317718940936, "grad_norm": 8.817938804626465, "learning_rate": 1.968833151628845e-06, "loss": 0.125, "num_input_tokens_seen": 1944576, "step": 2025 }, { "epoch": 0.17226748133061778, "grad_norm": 28.406312942504883, "learning_rate": 1.968465178527628e-06, "loss": 0.1032, "num_input_tokens_seen": 1948928, "step": 2030 }, { "epoch": 0.1726917854718262, "grad_norm": 27.26176643371582, "learning_rate": 1.9680950807033124e-06, "loss": 0.0727, "num_input_tokens_seen": 1953600, "step": 2035 }, { "epoch": 0.17311608961303462, "grad_norm": 9.266279220581055, "learning_rate": 1.96772285896786e-06, "loss": 0.0892, "num_input_tokens_seen": 1958592, "step": 2040 }, { "epoch": 0.17354039375424304, "grad_norm": 2.9285852909088135, "learning_rate": 1.9673485141378904e-06, "loss": 0.042, "num_input_tokens_seen": 1962752, "step": 2045 }, { "epoch": 0.17396469789545146, "grad_norm": 23.246116638183594, "learning_rate": 1.9669720470346817e-06, "loss": 0.1337, "num_input_tokens_seen": 1967424, "step": 2050 }, { "epoch": 0.17438900203665988, "grad_norm": 5.3057541847229, "learning_rate": 1.966593458484168e-06, "loss": 0.0393, "num_input_tokens_seen": 1972736, "step": 2055 }, { "epoch": 0.1748133061778683, "grad_norm": 13.531951904296875, "learning_rate": 1.9662127493169367e-06, "loss": 0.0351, "num_input_tokens_seen": 1977408, "step": 2060 }, { "epoch": 0.17523761031907673, "grad_norm": 34.17113494873047, "learning_rate": 1.96582992036823e-06, "loss": 0.044, "num_input_tokens_seen": 1982016, "step": 2065 }, { "epoch": 0.17566191446028515, "grad_norm": 13.662484169006348, "learning_rate": 1.9654449724779387e-06, "loss": 0.114, "num_input_tokens_seen": 1987392, "step": 2070 }, { "epoch": 0.17608621860149354, "grad_norm": 12.098200798034668, "learning_rate": 1.965057906490602e-06, "loss": 0.0802, "num_input_tokens_seen": 1992064, "step": 2075 }, { "epoch": 0.17651052274270196, "grad_norm": 20.902666091918945, "learning_rate": 1.964668723255408e-06, "loss": 0.0644, "num_input_tokens_seen": 1997120, "step": 2080 }, { "epoch": 0.17693482688391038, "grad_norm": 21.382699966430664, "learning_rate": 1.964277423626188e-06, "loss": 0.0501, "num_input_tokens_seen": 2001664, "step": 2085 }, { "epoch": 0.1773591310251188, "grad_norm": 22.832244873046875, "learning_rate": 1.9638840084614178e-06, "loss": 0.0941, "num_input_tokens_seen": 2006336, "step": 2090 }, { "epoch": 0.17778343516632722, "grad_norm": 20.812602996826172, "learning_rate": 1.963488478624214e-06, "loss": 0.0483, "num_input_tokens_seen": 2011264, "step": 2095 }, { "epoch": 0.17820773930753564, "grad_norm": 20.94266128540039, "learning_rate": 1.9630908349823315e-06, "loss": 0.0896, "num_input_tokens_seen": 2015680, "step": 2100 }, { "epoch": 0.17863204344874406, "grad_norm": 22.355464935302734, "learning_rate": 1.9626910784081647e-06, "loss": 0.1457, "num_input_tokens_seen": 2020352, "step": 2105 }, { "epoch": 0.17905634758995248, "grad_norm": 8.69415283203125, "learning_rate": 1.9622892097787426e-06, "loss": 0.0603, "num_input_tokens_seen": 2024832, "step": 2110 }, { "epoch": 0.1794806517311609, "grad_norm": 12.219429016113281, "learning_rate": 1.961885229975727e-06, "loss": 0.0261, "num_input_tokens_seen": 2029184, "step": 2115 }, { "epoch": 0.17990495587236932, "grad_norm": 16.649051666259766, "learning_rate": 1.9614791398854133e-06, "loss": 0.0787, "num_input_tokens_seen": 2034048, "step": 2120 }, { "epoch": 0.18032926001357774, "grad_norm": 7.8996357917785645, "learning_rate": 1.9610709403987244e-06, "loss": 0.0453, "num_input_tokens_seen": 2039232, "step": 2125 }, { "epoch": 0.18075356415478616, "grad_norm": 13.233209609985352, "learning_rate": 1.9606606324112134e-06, "loss": 0.0774, "num_input_tokens_seen": 2043712, "step": 2130 }, { "epoch": 0.18117786829599458, "grad_norm": 17.83286476135254, "learning_rate": 1.9602482168230576e-06, "loss": 0.1347, "num_input_tokens_seen": 2048576, "step": 2135 }, { "epoch": 0.18160217243720297, "grad_norm": 16.00604820251465, "learning_rate": 1.9598336945390584e-06, "loss": 0.071, "num_input_tokens_seen": 2053440, "step": 2140 }, { "epoch": 0.1820264765784114, "grad_norm": 3.1249139308929443, "learning_rate": 1.95941706646864e-06, "loss": 0.0639, "num_input_tokens_seen": 2058304, "step": 2145 }, { "epoch": 0.18245078071961982, "grad_norm": 6.381864070892334, "learning_rate": 1.9589983335258457e-06, "loss": 0.0581, "num_input_tokens_seen": 2062720, "step": 2150 }, { "epoch": 0.18287508486082824, "grad_norm": 3.6606667041778564, "learning_rate": 1.9585774966293365e-06, "loss": 0.0886, "num_input_tokens_seen": 2067264, "step": 2155 }, { "epoch": 0.18329938900203666, "grad_norm": 11.423752784729004, "learning_rate": 1.95815455670239e-06, "loss": 0.0804, "num_input_tokens_seen": 2071744, "step": 2160 }, { "epoch": 0.18372369314324508, "grad_norm": 16.14927101135254, "learning_rate": 1.957729514672897e-06, "loss": 0.0776, "num_input_tokens_seen": 2076352, "step": 2165 }, { "epoch": 0.1841479972844535, "grad_norm": 20.34837532043457, "learning_rate": 1.957302371473361e-06, "loss": 0.0654, "num_input_tokens_seen": 2081088, "step": 2170 }, { "epoch": 0.18457230142566192, "grad_norm": 21.14691162109375, "learning_rate": 1.9568731280408945e-06, "loss": 0.0651, "num_input_tokens_seen": 2085760, "step": 2175 }, { "epoch": 0.18499660556687034, "grad_norm": 8.296751976013184, "learning_rate": 1.956441785317217e-06, "loss": 0.0981, "num_input_tokens_seen": 2090624, "step": 2180 }, { "epoch": 0.18542090970807876, "grad_norm": 8.130701065063477, "learning_rate": 1.9560083442486565e-06, "loss": 0.0606, "num_input_tokens_seen": 2095936, "step": 2185 }, { "epoch": 0.18584521384928718, "grad_norm": 8.967162132263184, "learning_rate": 1.955572805786141e-06, "loss": 0.081, "num_input_tokens_seen": 2100608, "step": 2190 }, { "epoch": 0.1862695179904956, "grad_norm": 20.573461532592773, "learning_rate": 1.9551351708852015e-06, "loss": 0.0897, "num_input_tokens_seen": 2105856, "step": 2195 }, { "epoch": 0.18669382213170402, "grad_norm": 7.168735027313232, "learning_rate": 1.9546954405059697e-06, "loss": 0.0448, "num_input_tokens_seen": 2110464, "step": 2200 }, { "epoch": 0.1871181262729124, "grad_norm": 13.694960594177246, "learning_rate": 1.954253615613173e-06, "loss": 0.1086, "num_input_tokens_seen": 2115648, "step": 2205 }, { "epoch": 0.18754243041412083, "grad_norm": 3.1330201625823975, "learning_rate": 1.9538096971761343e-06, "loss": 0.043, "num_input_tokens_seen": 2120256, "step": 2210 }, { "epoch": 0.18796673455532925, "grad_norm": 4.10610294342041, "learning_rate": 1.9533636861687696e-06, "loss": 0.1077, "num_input_tokens_seen": 2124672, "step": 2215 }, { "epoch": 0.18839103869653767, "grad_norm": 43.373619079589844, "learning_rate": 1.9529155835695855e-06, "loss": 0.0675, "num_input_tokens_seen": 2129344, "step": 2220 }, { "epoch": 0.1888153428377461, "grad_norm": 23.113445281982422, "learning_rate": 1.952465390361678e-06, "loss": 0.0677, "num_input_tokens_seen": 2134336, "step": 2225 }, { "epoch": 0.1892396469789545, "grad_norm": 8.734219551086426, "learning_rate": 1.95201310753273e-06, "loss": 0.0558, "num_input_tokens_seen": 2139456, "step": 2230 }, { "epoch": 0.18966395112016293, "grad_norm": 4.216504096984863, "learning_rate": 1.9515587360750068e-06, "loss": 0.1059, "num_input_tokens_seen": 2144640, "step": 2235 }, { "epoch": 0.19008825526137135, "grad_norm": 11.680764198303223, "learning_rate": 1.9511022769853586e-06, "loss": 0.1091, "num_input_tokens_seen": 2149760, "step": 2240 }, { "epoch": 0.19051255940257977, "grad_norm": 22.223356246948242, "learning_rate": 1.9506437312652144e-06, "loss": 0.1657, "num_input_tokens_seen": 2155200, "step": 2245 }, { "epoch": 0.1909368635437882, "grad_norm": 2.622177839279175, "learning_rate": 1.9501830999205806e-06, "loss": 0.1252, "num_input_tokens_seen": 2159872, "step": 2250 }, { "epoch": 0.1913611676849966, "grad_norm": 17.01951789855957, "learning_rate": 1.9497203839620398e-06, "loss": 0.0864, "num_input_tokens_seen": 2164544, "step": 2255 }, { "epoch": 0.19178547182620503, "grad_norm": 4.09209680557251, "learning_rate": 1.9492555844047483e-06, "loss": 0.0384, "num_input_tokens_seen": 2169856, "step": 2260 }, { "epoch": 0.19220977596741345, "grad_norm": 0.8658658266067505, "learning_rate": 1.9487887022684334e-06, "loss": 0.0385, "num_input_tokens_seen": 2174400, "step": 2265 }, { "epoch": 0.19263408010862185, "grad_norm": 7.3754730224609375, "learning_rate": 1.9483197385773913e-06, "loss": 0.0915, "num_input_tokens_seen": 2179200, "step": 2270 }, { "epoch": 0.19305838424983027, "grad_norm": 7.779440402984619, "learning_rate": 1.947848694360485e-06, "loss": 0.0774, "num_input_tokens_seen": 2184768, "step": 2275 }, { "epoch": 0.1934826883910387, "grad_norm": 50.24574279785156, "learning_rate": 1.947375570651142e-06, "loss": 0.0947, "num_input_tokens_seen": 2189824, "step": 2280 }, { "epoch": 0.1939069925322471, "grad_norm": 1.2048569917678833, "learning_rate": 1.9469003684873514e-06, "loss": 0.0549, "num_input_tokens_seen": 2194752, "step": 2285 }, { "epoch": 0.19433129667345553, "grad_norm": 26.04237174987793, "learning_rate": 1.946423088911664e-06, "loss": 0.1116, "num_input_tokens_seen": 2199552, "step": 2290 }, { "epoch": 0.19475560081466395, "grad_norm": 17.550045013427734, "learning_rate": 1.9459437329711865e-06, "loss": 0.1976, "num_input_tokens_seen": 2204288, "step": 2295 }, { "epoch": 0.19517990495587237, "grad_norm": 24.590728759765625, "learning_rate": 1.945462301717581e-06, "loss": 0.0683, "num_input_tokens_seen": 2209344, "step": 2300 }, { "epoch": 0.1956042090970808, "grad_norm": 7.054686546325684, "learning_rate": 1.944978796207064e-06, "loss": 0.0837, "num_input_tokens_seen": 2214208, "step": 2305 }, { "epoch": 0.1960285132382892, "grad_norm": 22.503530502319336, "learning_rate": 1.9444932175004017e-06, "loss": 0.0974, "num_input_tokens_seen": 2218624, "step": 2310 }, { "epoch": 0.19645281737949763, "grad_norm": 20.48867416381836, "learning_rate": 1.9440055666629087e-06, "loss": 0.0717, "num_input_tokens_seen": 2223872, "step": 2315 }, { "epoch": 0.19687712152070605, "grad_norm": 1.9906337261199951, "learning_rate": 1.943515844764446e-06, "loss": 0.0729, "num_input_tokens_seen": 2228096, "step": 2320 }, { "epoch": 0.19730142566191447, "grad_norm": 25.787109375, "learning_rate": 1.943024052879418e-06, "loss": 0.1151, "num_input_tokens_seen": 2232384, "step": 2325 }, { "epoch": 0.1977257298031229, "grad_norm": 4.576552867889404, "learning_rate": 1.9425301920867703e-06, "loss": 0.0336, "num_input_tokens_seen": 2237184, "step": 2330 }, { "epoch": 0.19815003394433128, "grad_norm": 18.02105140686035, "learning_rate": 1.942034263469989e-06, "loss": 0.1116, "num_input_tokens_seen": 2242048, "step": 2335 }, { "epoch": 0.1985743380855397, "grad_norm": 3.6412253379821777, "learning_rate": 1.941536268117095e-06, "loss": 0.0395, "num_input_tokens_seen": 2247104, "step": 2340 }, { "epoch": 0.19899864222674812, "grad_norm": 9.237351417541504, "learning_rate": 1.9410362071206436e-06, "loss": 0.089, "num_input_tokens_seen": 2251840, "step": 2345 }, { "epoch": 0.19942294636795654, "grad_norm": 29.391401290893555, "learning_rate": 1.9405340815777232e-06, "loss": 0.1879, "num_input_tokens_seen": 2256832, "step": 2350 }, { "epoch": 0.19984725050916496, "grad_norm": 29.297670364379883, "learning_rate": 1.9400298925899505e-06, "loss": 0.0277, "num_input_tokens_seen": 2261120, "step": 2355 }, { "epoch": 0.20027155465037338, "grad_norm": 8.756644248962402, "learning_rate": 1.939523641263469e-06, "loss": 0.1436, "num_input_tokens_seen": 2266496, "step": 2360 }, { "epoch": 0.20027155465037338, "eval_loss": 0.0887659341096878, "eval_runtime": 15.9713, "eval_samples_per_second": 655.866, "eval_steps_per_second": 82.022, "num_input_tokens_seen": 2266496, "step": 2360 }, { "epoch": 0.2006958587915818, "grad_norm": 5.741673469543457, "learning_rate": 1.9390153287089485e-06, "loss": 0.0249, "num_input_tokens_seen": 2271040, "step": 2365 }, { "epoch": 0.20112016293279023, "grad_norm": 30.594064712524414, "learning_rate": 1.938504956041579e-06, "loss": 0.1026, "num_input_tokens_seen": 2276096, "step": 2370 }, { "epoch": 0.20154446707399865, "grad_norm": 20.060314178466797, "learning_rate": 1.937992524381071e-06, "loss": 0.0498, "num_input_tokens_seen": 2280192, "step": 2375 }, { "epoch": 0.20196877121520707, "grad_norm": 17.804256439208984, "learning_rate": 1.9374780348516525e-06, "loss": 0.0528, "num_input_tokens_seen": 2284672, "step": 2380 }, { "epoch": 0.20239307535641549, "grad_norm": 6.91187047958374, "learning_rate": 1.9369614885820657e-06, "loss": 0.1108, "num_input_tokens_seen": 2289664, "step": 2385 }, { "epoch": 0.2028173794976239, "grad_norm": 25.068439483642578, "learning_rate": 1.9364428867055655e-06, "loss": 0.0978, "num_input_tokens_seen": 2294976, "step": 2390 }, { "epoch": 0.20324168363883233, "grad_norm": 0.30525487661361694, "learning_rate": 1.935922230359916e-06, "loss": 0.0427, "num_input_tokens_seen": 2299584, "step": 2395 }, { "epoch": 0.20366598778004075, "grad_norm": 19.536846160888672, "learning_rate": 1.9353995206873898e-06, "loss": 0.067, "num_input_tokens_seen": 2304320, "step": 2400 }, { "epoch": 0.20409029192124914, "grad_norm": 18.92000961303711, "learning_rate": 1.9348747588347637e-06, "loss": 0.1673, "num_input_tokens_seen": 2308928, "step": 2405 }, { "epoch": 0.20451459606245756, "grad_norm": 2.1541614532470703, "learning_rate": 1.9343479459533157e-06, "loss": 0.0387, "num_input_tokens_seen": 2313280, "step": 2410 }, { "epoch": 0.20493890020366598, "grad_norm": 15.7988920211792, "learning_rate": 1.933819083198826e-06, "loss": 0.1093, "num_input_tokens_seen": 2318400, "step": 2415 }, { "epoch": 0.2053632043448744, "grad_norm": 13.255326271057129, "learning_rate": 1.9332881717315694e-06, "loss": 0.0704, "num_input_tokens_seen": 2323712, "step": 2420 }, { "epoch": 0.20578750848608282, "grad_norm": 7.066593647003174, "learning_rate": 1.9327552127163172e-06, "loss": 0.0282, "num_input_tokens_seen": 2327936, "step": 2425 }, { "epoch": 0.20621181262729124, "grad_norm": 0.11941832304000854, "learning_rate": 1.932220207322332e-06, "loss": 0.0612, "num_input_tokens_seen": 2332224, "step": 2430 }, { "epoch": 0.20663611676849966, "grad_norm": 20.786226272583008, "learning_rate": 1.931683156723366e-06, "loss": 0.0718, "num_input_tokens_seen": 2336704, "step": 2435 }, { "epoch": 0.20706042090970808, "grad_norm": 0.507086455821991, "learning_rate": 1.9311440620976595e-06, "loss": 0.0876, "num_input_tokens_seen": 2341888, "step": 2440 }, { "epoch": 0.2074847250509165, "grad_norm": 2.0232927799224854, "learning_rate": 1.930602924627935e-06, "loss": 0.0747, "num_input_tokens_seen": 2346432, "step": 2445 }, { "epoch": 0.20790902919212492, "grad_norm": 0.4495334327220917, "learning_rate": 1.930059745501399e-06, "loss": 0.0499, "num_input_tokens_seen": 2350656, "step": 2450 }, { "epoch": 0.20833333333333334, "grad_norm": 16.482301712036133, "learning_rate": 1.9295145259097362e-06, "loss": 0.0438, "num_input_tokens_seen": 2355392, "step": 2455 }, { "epoch": 0.20875763747454176, "grad_norm": 0.18059544265270233, "learning_rate": 1.9289672670491076e-06, "loss": 0.0241, "num_input_tokens_seen": 2360320, "step": 2460 }, { "epoch": 0.20918194161575018, "grad_norm": 2.5006070137023926, "learning_rate": 1.928417970120149e-06, "loss": 0.0112, "num_input_tokens_seen": 2365120, "step": 2465 }, { "epoch": 0.20960624575695858, "grad_norm": 4.708000659942627, "learning_rate": 1.9278666363279664e-06, "loss": 0.1338, "num_input_tokens_seen": 2369920, "step": 2470 }, { "epoch": 0.210030549898167, "grad_norm": 9.103507041931152, "learning_rate": 1.9273132668821363e-06, "loss": 0.0943, "num_input_tokens_seen": 2374848, "step": 2475 }, { "epoch": 0.21045485403937542, "grad_norm": 0.2682408392429352, "learning_rate": 1.926757862996699e-06, "loss": 0.0583, "num_input_tokens_seen": 2380032, "step": 2480 }, { "epoch": 0.21087915818058384, "grad_norm": 35.878334045410156, "learning_rate": 1.92620042589016e-06, "loss": 0.1324, "num_input_tokens_seen": 2384704, "step": 2485 }, { "epoch": 0.21130346232179226, "grad_norm": 51.964813232421875, "learning_rate": 1.9256409567854847e-06, "loss": 0.0661, "num_input_tokens_seen": 2389568, "step": 2490 }, { "epoch": 0.21172776646300068, "grad_norm": 23.085674285888672, "learning_rate": 1.9250794569100963e-06, "loss": 0.1469, "num_input_tokens_seen": 2394560, "step": 2495 }, { "epoch": 0.2121520706042091, "grad_norm": 2.490314245223999, "learning_rate": 1.9245159274958737e-06, "loss": 0.0676, "num_input_tokens_seen": 2399232, "step": 2500 }, { "epoch": 0.21257637474541752, "grad_norm": 14.327190399169922, "learning_rate": 1.9239503697791487e-06, "loss": 0.1362, "num_input_tokens_seen": 2404032, "step": 2505 }, { "epoch": 0.21300067888662594, "grad_norm": 10.138527870178223, "learning_rate": 1.9233827850007024e-06, "loss": 0.0744, "num_input_tokens_seen": 2408576, "step": 2510 }, { "epoch": 0.21342498302783436, "grad_norm": 10.438551902770996, "learning_rate": 1.9228131744057633e-06, "loss": 0.0727, "num_input_tokens_seen": 2413440, "step": 2515 }, { "epoch": 0.21384928716904278, "grad_norm": 10.9727144241333, "learning_rate": 1.922241539244005e-06, "loss": 0.0503, "num_input_tokens_seen": 2417664, "step": 2520 }, { "epoch": 0.2142735913102512, "grad_norm": 4.456893444061279, "learning_rate": 1.921667880769541e-06, "loss": 0.0807, "num_input_tokens_seen": 2422464, "step": 2525 }, { "epoch": 0.21469789545145962, "grad_norm": 9.519829750061035, "learning_rate": 1.921092200240926e-06, "loss": 0.039, "num_input_tokens_seen": 2427200, "step": 2530 }, { "epoch": 0.215122199592668, "grad_norm": 0.6565377712249756, "learning_rate": 1.9205144989211495e-06, "loss": 0.0421, "num_input_tokens_seen": 2431616, "step": 2535 }, { "epoch": 0.21554650373387643, "grad_norm": 8.525800704956055, "learning_rate": 1.919934778077635e-06, "loss": 0.0555, "num_input_tokens_seen": 2436352, "step": 2540 }, { "epoch": 0.21597080787508485, "grad_norm": 8.354979515075684, "learning_rate": 1.9193530389822362e-06, "loss": 0.0429, "num_input_tokens_seen": 2440704, "step": 2545 }, { "epoch": 0.21639511201629327, "grad_norm": 8.60445785522461, "learning_rate": 1.918769282911235e-06, "loss": 0.0246, "num_input_tokens_seen": 2446272, "step": 2550 }, { "epoch": 0.2168194161575017, "grad_norm": 30.532001495361328, "learning_rate": 1.9181835111453383e-06, "loss": 0.0731, "num_input_tokens_seen": 2450496, "step": 2555 }, { "epoch": 0.2172437202987101, "grad_norm": 49.82064437866211, "learning_rate": 1.9175957249696755e-06, "loss": 0.0483, "num_input_tokens_seen": 2455424, "step": 2560 }, { "epoch": 0.21766802443991853, "grad_norm": 38.16300964355469, "learning_rate": 1.9170059256737946e-06, "loss": 0.0535, "num_input_tokens_seen": 2460352, "step": 2565 }, { "epoch": 0.21809232858112695, "grad_norm": 14.842656135559082, "learning_rate": 1.9164141145516613e-06, "loss": 0.1311, "num_input_tokens_seen": 2464896, "step": 2570 }, { "epoch": 0.21851663272233537, "grad_norm": 0.09945914894342422, "learning_rate": 1.915820292901654e-06, "loss": 0.1178, "num_input_tokens_seen": 2469824, "step": 2575 }, { "epoch": 0.2189409368635438, "grad_norm": 12.554542541503906, "learning_rate": 1.915224462026563e-06, "loss": 0.1612, "num_input_tokens_seen": 2474176, "step": 2580 }, { "epoch": 0.21936524100475221, "grad_norm": 16.17306900024414, "learning_rate": 1.9146266232335854e-06, "loss": 0.1505, "num_input_tokens_seen": 2479232, "step": 2585 }, { "epoch": 0.21978954514596064, "grad_norm": 18.73937225341797, "learning_rate": 1.914026777834325e-06, "loss": 0.1221, "num_input_tokens_seen": 2483648, "step": 2590 }, { "epoch": 0.22021384928716906, "grad_norm": 9.550419807434082, "learning_rate": 1.9134249271447872e-06, "loss": 0.0517, "num_input_tokens_seen": 2488064, "step": 2595 }, { "epoch": 0.22063815342837745, "grad_norm": 4.426822185516357, "learning_rate": 1.9128210724853765e-06, "loss": 0.0604, "num_input_tokens_seen": 2493184, "step": 2600 }, { "epoch": 0.22106245756958587, "grad_norm": 8.261371612548828, "learning_rate": 1.912215215180894e-06, "loss": 0.094, "num_input_tokens_seen": 2498048, "step": 2605 }, { "epoch": 0.2214867617107943, "grad_norm": 17.701379776000977, "learning_rate": 1.9116073565605347e-06, "loss": 0.1097, "num_input_tokens_seen": 2504064, "step": 2610 }, { "epoch": 0.2219110658520027, "grad_norm": 6.866028785705566, "learning_rate": 1.9109974979578847e-06, "loss": 0.0698, "num_input_tokens_seen": 2508800, "step": 2615 }, { "epoch": 0.22233536999321113, "grad_norm": 3.621572732925415, "learning_rate": 1.9103856407109172e-06, "loss": 0.0328, "num_input_tokens_seen": 2513280, "step": 2620 }, { "epoch": 0.22275967413441955, "grad_norm": 8.503085136413574, "learning_rate": 1.9097717861619907e-06, "loss": 0.0484, "num_input_tokens_seen": 2518080, "step": 2625 }, { "epoch": 0.22318397827562797, "grad_norm": 4.364653587341309, "learning_rate": 1.9091559356578445e-06, "loss": 0.0655, "num_input_tokens_seen": 2522688, "step": 2630 }, { "epoch": 0.2236082824168364, "grad_norm": 0.2347300499677658, "learning_rate": 1.9085380905495985e-06, "loss": 0.0933, "num_input_tokens_seen": 2527040, "step": 2635 }, { "epoch": 0.2240325865580448, "grad_norm": 3.144343852996826, "learning_rate": 1.9079182521927475e-06, "loss": 0.0743, "num_input_tokens_seen": 2531584, "step": 2640 }, { "epoch": 0.22445689069925323, "grad_norm": 0.9802240133285522, "learning_rate": 1.9072964219471594e-06, "loss": 0.1144, "num_input_tokens_seen": 2536384, "step": 2645 }, { "epoch": 0.22488119484046165, "grad_norm": 21.657424926757812, "learning_rate": 1.9066726011770724e-06, "loss": 0.0821, "num_input_tokens_seen": 2540800, "step": 2650 }, { "epoch": 0.22530549898167007, "grad_norm": 1.9864215850830078, "learning_rate": 1.906046791251092e-06, "loss": 0.02, "num_input_tokens_seen": 2545600, "step": 2655 }, { "epoch": 0.2257298031228785, "grad_norm": 36.35599899291992, "learning_rate": 1.9054189935421868e-06, "loss": 0.1331, "num_input_tokens_seen": 2550528, "step": 2660 }, { "epoch": 0.22615410726408688, "grad_norm": 9.535066604614258, "learning_rate": 1.9047892094276871e-06, "loss": 0.0412, "num_input_tokens_seen": 2555328, "step": 2665 }, { "epoch": 0.2265784114052953, "grad_norm": 13.776278495788574, "learning_rate": 1.9041574402892813e-06, "loss": 0.0844, "num_input_tokens_seen": 2560320, "step": 2670 }, { "epoch": 0.22700271554650372, "grad_norm": 6.368226051330566, "learning_rate": 1.903523687513012e-06, "loss": 0.0467, "num_input_tokens_seen": 2564800, "step": 2675 }, { "epoch": 0.22742701968771215, "grad_norm": 20.76290512084961, "learning_rate": 1.902887952489275e-06, "loss": 0.0676, "num_input_tokens_seen": 2569664, "step": 2680 }, { "epoch": 0.22785132382892057, "grad_norm": 0.438536137342453, "learning_rate": 1.9022502366128132e-06, "loss": 0.0301, "num_input_tokens_seen": 2574592, "step": 2685 }, { "epoch": 0.22827562797012899, "grad_norm": 8.510451316833496, "learning_rate": 1.9016105412827173e-06, "loss": 0.1049, "num_input_tokens_seen": 2579328, "step": 2690 }, { "epoch": 0.2286999321113374, "grad_norm": 5.517171859741211, "learning_rate": 1.9009688679024189e-06, "loss": 0.0181, "num_input_tokens_seen": 2584896, "step": 2695 }, { "epoch": 0.22912423625254583, "grad_norm": 15.41623592376709, "learning_rate": 1.9003252178796907e-06, "loss": 0.1308, "num_input_tokens_seen": 2589504, "step": 2700 }, { "epoch": 0.22954854039375425, "grad_norm": 24.895021438598633, "learning_rate": 1.8996795926266412e-06, "loss": 0.0851, "num_input_tokens_seen": 2594304, "step": 2705 }, { "epoch": 0.22997284453496267, "grad_norm": 17.133085250854492, "learning_rate": 1.899031993559712e-06, "loss": 0.0292, "num_input_tokens_seen": 2598784, "step": 2710 }, { "epoch": 0.2303971486761711, "grad_norm": 25.906352996826172, "learning_rate": 1.8983824220996764e-06, "loss": 0.0709, "num_input_tokens_seen": 2603712, "step": 2715 }, { "epoch": 0.2308214528173795, "grad_norm": 21.616334915161133, "learning_rate": 1.8977308796716338e-06, "loss": 0.0578, "num_input_tokens_seen": 2608320, "step": 2720 }, { "epoch": 0.23124575695858793, "grad_norm": 1.3934400081634521, "learning_rate": 1.897077367705008e-06, "loss": 0.0302, "num_input_tokens_seen": 2613248, "step": 2725 }, { "epoch": 0.23167006109979632, "grad_norm": 24.00430679321289, "learning_rate": 1.896421887633544e-06, "loss": 0.0637, "num_input_tokens_seen": 2617664, "step": 2730 }, { "epoch": 0.23209436524100474, "grad_norm": 15.404330253601074, "learning_rate": 1.8957644408953044e-06, "loss": 0.0716, "num_input_tokens_seen": 2622016, "step": 2735 }, { "epoch": 0.23251866938221316, "grad_norm": 23.977699279785156, "learning_rate": 1.8951050289326664e-06, "loss": 0.0438, "num_input_tokens_seen": 2626368, "step": 2740 }, { "epoch": 0.23294297352342158, "grad_norm": 6.890242099761963, "learning_rate": 1.8944436531923193e-06, "loss": 0.032, "num_input_tokens_seen": 2630720, "step": 2745 }, { "epoch": 0.23336727766463, "grad_norm": 39.53364181518555, "learning_rate": 1.8937803151252603e-06, "loss": 0.1554, "num_input_tokens_seen": 2635456, "step": 2750 }, { "epoch": 0.23379158180583842, "grad_norm": 24.008249282836914, "learning_rate": 1.8931150161867915e-06, "loss": 0.1055, "num_input_tokens_seen": 2639744, "step": 2755 }, { "epoch": 0.23421588594704684, "grad_norm": 34.880165100097656, "learning_rate": 1.8924477578365177e-06, "loss": 0.0627, "num_input_tokens_seen": 2643904, "step": 2760 }, { "epoch": 0.23464019008825526, "grad_norm": 9.298295021057129, "learning_rate": 1.8917785415383415e-06, "loss": 0.0403, "num_input_tokens_seen": 2648768, "step": 2765 }, { "epoch": 0.23506449422946368, "grad_norm": 7.759624004364014, "learning_rate": 1.8911073687604622e-06, "loss": 0.0271, "num_input_tokens_seen": 2653952, "step": 2770 }, { "epoch": 0.2354887983706721, "grad_norm": 0.04529079049825668, "learning_rate": 1.8904342409753703e-06, "loss": 0.0031, "num_input_tokens_seen": 2658432, "step": 2775 }, { "epoch": 0.23591310251188052, "grad_norm": 1.6445941925048828, "learning_rate": 1.8897591596598464e-06, "loss": 0.054, "num_input_tokens_seen": 2663168, "step": 2780 }, { "epoch": 0.23633740665308894, "grad_norm": 27.741294860839844, "learning_rate": 1.8890821262949564e-06, "loss": 0.1288, "num_input_tokens_seen": 2668096, "step": 2785 }, { "epoch": 0.23676171079429736, "grad_norm": 0.5501040816307068, "learning_rate": 1.8884031423660488e-06, "loss": 0.1241, "num_input_tokens_seen": 2672576, "step": 2790 }, { "epoch": 0.23718601493550576, "grad_norm": 25.323272705078125, "learning_rate": 1.8877222093627517e-06, "loss": 0.0832, "num_input_tokens_seen": 2678336, "step": 2795 }, { "epoch": 0.23761031907671418, "grad_norm": 8.880626678466797, "learning_rate": 1.8870393287789694e-06, "loss": 0.1352, "num_input_tokens_seen": 2684288, "step": 2800 }, { "epoch": 0.2380346232179226, "grad_norm": 25.005939483642578, "learning_rate": 1.8863545021128781e-06, "loss": 0.0542, "num_input_tokens_seen": 2688704, "step": 2805 }, { "epoch": 0.23845892735913102, "grad_norm": 15.758960723876953, "learning_rate": 1.885667730866925e-06, "loss": 0.0608, "num_input_tokens_seen": 2693056, "step": 2810 }, { "epoch": 0.23888323150033944, "grad_norm": 12.390170097351074, "learning_rate": 1.884979016547822e-06, "loss": 0.0844, "num_input_tokens_seen": 2697280, "step": 2815 }, { "epoch": 0.23930753564154786, "grad_norm": 8.885245323181152, "learning_rate": 1.8842883606665457e-06, "loss": 0.0769, "num_input_tokens_seen": 2701440, "step": 2820 }, { "epoch": 0.23973183978275628, "grad_norm": 2.2123122215270996, "learning_rate": 1.88359576473833e-06, "loss": 0.0935, "num_input_tokens_seen": 2707264, "step": 2825 }, { "epoch": 0.2401561439239647, "grad_norm": 16.648462295532227, "learning_rate": 1.8829012302826674e-06, "loss": 0.1271, "num_input_tokens_seen": 2711680, "step": 2830 }, { "epoch": 0.24058044806517312, "grad_norm": 0.9231572151184082, "learning_rate": 1.8822047588233017e-06, "loss": 0.0232, "num_input_tokens_seen": 2716352, "step": 2835 }, { "epoch": 0.24100475220638154, "grad_norm": 2.5872323513031006, "learning_rate": 1.881506351888227e-06, "loss": 0.0673, "num_input_tokens_seen": 2721024, "step": 2840 }, { "epoch": 0.24142905634758996, "grad_norm": 21.090572357177734, "learning_rate": 1.8808060110096839e-06, "loss": 0.0675, "num_input_tokens_seen": 2725696, "step": 2845 }, { "epoch": 0.24185336048879838, "grad_norm": 21.68260383605957, "learning_rate": 1.8801037377241553e-06, "loss": 0.0625, "num_input_tokens_seen": 2731328, "step": 2850 }, { "epoch": 0.2422776646300068, "grad_norm": 5.974689483642578, "learning_rate": 1.879399533572364e-06, "loss": 0.115, "num_input_tokens_seen": 2736192, "step": 2855 }, { "epoch": 0.24270196877121522, "grad_norm": 19.78734588623047, "learning_rate": 1.8786934000992688e-06, "loss": 0.0828, "num_input_tokens_seen": 2740544, "step": 2860 }, { "epoch": 0.2431262729124236, "grad_norm": 12.459096908569336, "learning_rate": 1.877985338854061e-06, "loss": 0.0668, "num_input_tokens_seen": 2745472, "step": 2865 }, { "epoch": 0.24355057705363203, "grad_norm": 1.3515022993087769, "learning_rate": 1.877275351390162e-06, "loss": 0.0126, "num_input_tokens_seen": 2750528, "step": 2870 }, { "epoch": 0.24397488119484045, "grad_norm": 0.6821182370185852, "learning_rate": 1.8765634392652183e-06, "loss": 0.0341, "num_input_tokens_seen": 2755328, "step": 2875 }, { "epoch": 0.24439918533604887, "grad_norm": 21.59737777709961, "learning_rate": 1.8758496040410998e-06, "loss": 0.0878, "num_input_tokens_seen": 2760128, "step": 2880 }, { "epoch": 0.2448234894772573, "grad_norm": 1.4333782196044922, "learning_rate": 1.8751338472838942e-06, "loss": 0.0316, "num_input_tokens_seen": 2764992, "step": 2885 }, { "epoch": 0.24524779361846571, "grad_norm": 18.27245330810547, "learning_rate": 1.8744161705639065e-06, "loss": 0.0563, "num_input_tokens_seen": 2769408, "step": 2890 }, { "epoch": 0.24567209775967414, "grad_norm": 16.279781341552734, "learning_rate": 1.8736965754556526e-06, "loss": 0.0999, "num_input_tokens_seen": 2774336, "step": 2895 }, { "epoch": 0.24609640190088256, "grad_norm": 25.436418533325195, "learning_rate": 1.8729750635378578e-06, "loss": 0.1851, "num_input_tokens_seen": 2779136, "step": 2900 }, { "epoch": 0.24652070604209098, "grad_norm": 25.405763626098633, "learning_rate": 1.872251636393453e-06, "loss": 0.0546, "num_input_tokens_seen": 2784256, "step": 2905 }, { "epoch": 0.2469450101832994, "grad_norm": 19.037561416625977, "learning_rate": 1.8715262956095694e-06, "loss": 0.1097, "num_input_tokens_seen": 2788864, "step": 2910 }, { "epoch": 0.24736931432450782, "grad_norm": 18.466859817504883, "learning_rate": 1.8707990427775386e-06, "loss": 0.0549, "num_input_tokens_seen": 2793152, "step": 2915 }, { "epoch": 0.24779361846571624, "grad_norm": 7.623482704162598, "learning_rate": 1.870069879492886e-06, "loss": 0.0782, "num_input_tokens_seen": 2798144, "step": 2920 }, { "epoch": 0.24821792260692466, "grad_norm": 0.18756820261478424, "learning_rate": 1.869338807355328e-06, "loss": 0.0769, "num_input_tokens_seen": 2803136, "step": 2925 }, { "epoch": 0.24864222674813305, "grad_norm": 18.723121643066406, "learning_rate": 1.8686058279687699e-06, "loss": 0.0514, "num_input_tokens_seen": 2807744, "step": 2930 }, { "epoch": 0.24906653088934147, "grad_norm": 1.2718958854675293, "learning_rate": 1.8678709429413e-06, "loss": 0.038, "num_input_tokens_seen": 2812928, "step": 2935 }, { "epoch": 0.2494908350305499, "grad_norm": 12.092676162719727, "learning_rate": 1.867134153885189e-06, "loss": 0.0578, "num_input_tokens_seen": 2817536, "step": 2940 }, { "epoch": 0.2499151391717583, "grad_norm": 2.50976300239563, "learning_rate": 1.8663954624168832e-06, "loss": 0.0818, "num_input_tokens_seen": 2822784, "step": 2945 }, { "epoch": 0.25033944331296676, "grad_norm": 0.1713555008172989, "learning_rate": 1.8656548701570039e-06, "loss": 0.0749, "num_input_tokens_seen": 2827328, "step": 2950 }, { "epoch": 0.25033944331296676, "eval_loss": 0.07611989974975586, "eval_runtime": 15.9014, "eval_samples_per_second": 658.746, "eval_steps_per_second": 82.383, "num_input_tokens_seen": 2827328, "step": 2950 }, { "epoch": 0.2507637474541752, "grad_norm": 7.6525068283081055, "learning_rate": 1.864912378730342e-06, "loss": 0.0738, "num_input_tokens_seen": 2832128, "step": 2955 }, { "epoch": 0.25118805159538354, "grad_norm": 24.655349731445312, "learning_rate": 1.8641679897658551e-06, "loss": 0.149, "num_input_tokens_seen": 2837824, "step": 2960 }, { "epoch": 0.25161235573659196, "grad_norm": 3.085174798965454, "learning_rate": 1.8634217048966633e-06, "loss": 0.063, "num_input_tokens_seen": 2842240, "step": 2965 }, { "epoch": 0.2520366598778004, "grad_norm": 24.828632354736328, "learning_rate": 1.8626735257600475e-06, "loss": 0.0512, "num_input_tokens_seen": 2846848, "step": 2970 }, { "epoch": 0.2524609640190088, "grad_norm": 12.527300834655762, "learning_rate": 1.8619234539974429e-06, "loss": 0.0923, "num_input_tokens_seen": 2851392, "step": 2975 }, { "epoch": 0.2528852681602172, "grad_norm": 5.862428665161133, "learning_rate": 1.8611714912544376e-06, "loss": 0.0373, "num_input_tokens_seen": 2855680, "step": 2980 }, { "epoch": 0.25330957230142565, "grad_norm": 12.940213203430176, "learning_rate": 1.860417639180769e-06, "loss": 0.0624, "num_input_tokens_seen": 2860544, "step": 2985 }, { "epoch": 0.25373387644263407, "grad_norm": 8.12386417388916, "learning_rate": 1.8596618994303183e-06, "loss": 0.0577, "num_input_tokens_seen": 2865152, "step": 2990 }, { "epoch": 0.2541581805838425, "grad_norm": 0.8909376263618469, "learning_rate": 1.858904273661109e-06, "loss": 0.066, "num_input_tokens_seen": 2870144, "step": 2995 }, { "epoch": 0.2545824847250509, "grad_norm": 26.337459564208984, "learning_rate": 1.8581447635353019e-06, "loss": 0.0492, "num_input_tokens_seen": 2875200, "step": 3000 }, { "epoch": 0.2550067888662593, "grad_norm": 11.455595016479492, "learning_rate": 1.8573833707191918e-06, "loss": 0.1037, "num_input_tokens_seen": 2880192, "step": 3005 }, { "epoch": 0.25543109300746775, "grad_norm": 2.723041296005249, "learning_rate": 1.8566200968832044e-06, "loss": 0.1198, "num_input_tokens_seen": 2884800, "step": 3010 }, { "epoch": 0.25585539714867617, "grad_norm": 25.147613525390625, "learning_rate": 1.855854943701892e-06, "loss": 0.0739, "num_input_tokens_seen": 2890176, "step": 3015 }, { "epoch": 0.2562797012898846, "grad_norm": 23.688777923583984, "learning_rate": 1.85508791285393e-06, "loss": 0.1248, "num_input_tokens_seen": 2895040, "step": 3020 }, { "epoch": 0.256704005431093, "grad_norm": 5.152799606323242, "learning_rate": 1.8543190060221125e-06, "loss": 0.0693, "num_input_tokens_seen": 2899776, "step": 3025 }, { "epoch": 0.25712830957230143, "grad_norm": 6.354460716247559, "learning_rate": 1.853548224893351e-06, "loss": 0.0601, "num_input_tokens_seen": 2904064, "step": 3030 }, { "epoch": 0.25755261371350985, "grad_norm": 10.099621772766113, "learning_rate": 1.8527755711586678e-06, "loss": 0.0395, "num_input_tokens_seen": 2908800, "step": 3035 }, { "epoch": 0.25797691785471827, "grad_norm": 20.724409103393555, "learning_rate": 1.8520010465131935e-06, "loss": 0.0432, "num_input_tokens_seen": 2913216, "step": 3040 }, { "epoch": 0.2584012219959267, "grad_norm": 12.310056686401367, "learning_rate": 1.8512246526561636e-06, "loss": 0.0755, "num_input_tokens_seen": 2917504, "step": 3045 }, { "epoch": 0.2588255261371351, "grad_norm": 21.255277633666992, "learning_rate": 1.8504463912909149e-06, "loss": 0.1289, "num_input_tokens_seen": 2922752, "step": 3050 }, { "epoch": 0.25924983027834353, "grad_norm": 16.850648880004883, "learning_rate": 1.8496662641248807e-06, "loss": 0.0354, "num_input_tokens_seen": 2928000, "step": 3055 }, { "epoch": 0.25967413441955195, "grad_norm": 26.931428909301758, "learning_rate": 1.8488842728695874e-06, "loss": 0.0658, "num_input_tokens_seen": 2932736, "step": 3060 }, { "epoch": 0.26009843856076037, "grad_norm": 22.07209014892578, "learning_rate": 1.8481004192406525e-06, "loss": 0.1191, "num_input_tokens_seen": 2937664, "step": 3065 }, { "epoch": 0.2605227427019688, "grad_norm": 43.74690628051758, "learning_rate": 1.8473147049577773e-06, "loss": 0.1139, "num_input_tokens_seen": 2942784, "step": 3070 }, { "epoch": 0.2609470468431772, "grad_norm": 18.454248428344727, "learning_rate": 1.8465271317447474e-06, "loss": 0.1196, "num_input_tokens_seen": 2947840, "step": 3075 }, { "epoch": 0.26137135098438563, "grad_norm": 33.043155670166016, "learning_rate": 1.845737701329425e-06, "loss": 0.0741, "num_input_tokens_seen": 2952448, "step": 3080 }, { "epoch": 0.26179565512559405, "grad_norm": 2.562422752380371, "learning_rate": 1.8449464154437475e-06, "loss": 0.0241, "num_input_tokens_seen": 2957120, "step": 3085 }, { "epoch": 0.26221995926680247, "grad_norm": 12.61534309387207, "learning_rate": 1.8441532758237233e-06, "loss": 0.1096, "num_input_tokens_seen": 2961728, "step": 3090 }, { "epoch": 0.26264426340801084, "grad_norm": 6.174483776092529, "learning_rate": 1.8433582842094273e-06, "loss": 0.0875, "num_input_tokens_seen": 2966208, "step": 3095 }, { "epoch": 0.26306856754921926, "grad_norm": 30.785140991210938, "learning_rate": 1.8425614423449974e-06, "loss": 0.0347, "num_input_tokens_seen": 2972288, "step": 3100 }, { "epoch": 0.2634928716904277, "grad_norm": 11.550189971923828, "learning_rate": 1.8417627519786313e-06, "loss": 0.0902, "num_input_tokens_seen": 2976512, "step": 3105 }, { "epoch": 0.2639171758316361, "grad_norm": 3.7005128860473633, "learning_rate": 1.840962214862582e-06, "loss": 0.0626, "num_input_tokens_seen": 2981248, "step": 3110 }, { "epoch": 0.2643414799728445, "grad_norm": 19.367431640625, "learning_rate": 1.8401598327531533e-06, "loss": 0.0862, "num_input_tokens_seen": 2985728, "step": 3115 }, { "epoch": 0.26476578411405294, "grad_norm": 24.754112243652344, "learning_rate": 1.839355607410698e-06, "loss": 0.0586, "num_input_tokens_seen": 2990144, "step": 3120 }, { "epoch": 0.26519008825526136, "grad_norm": 17.672439575195312, "learning_rate": 1.8385495405996119e-06, "loss": 0.0669, "num_input_tokens_seen": 2994560, "step": 3125 }, { "epoch": 0.2656143923964698, "grad_norm": 33.57306671142578, "learning_rate": 1.8377416340883312e-06, "loss": 0.0625, "num_input_tokens_seen": 2999488, "step": 3130 }, { "epoch": 0.2660386965376782, "grad_norm": 5.751327991485596, "learning_rate": 1.836931889649328e-06, "loss": 0.0634, "num_input_tokens_seen": 3004096, "step": 3135 }, { "epoch": 0.2664630006788866, "grad_norm": 36.13847732543945, "learning_rate": 1.8361203090591068e-06, "loss": 0.1258, "num_input_tokens_seen": 3008512, "step": 3140 }, { "epoch": 0.26688730482009504, "grad_norm": 16.493362426757812, "learning_rate": 1.8353068940982006e-06, "loss": 0.0708, "num_input_tokens_seen": 3013504, "step": 3145 }, { "epoch": 0.26731160896130346, "grad_norm": 0.15408827364444733, "learning_rate": 1.8344916465511664e-06, "loss": 0.0164, "num_input_tokens_seen": 3018112, "step": 3150 }, { "epoch": 0.2677359131025119, "grad_norm": 14.348245620727539, "learning_rate": 1.833674568206582e-06, "loss": 0.1132, "num_input_tokens_seen": 3023168, "step": 3155 }, { "epoch": 0.2681602172437203, "grad_norm": 10.401468276977539, "learning_rate": 1.832855660857042e-06, "loss": 0.0723, "num_input_tokens_seen": 3027840, "step": 3160 }, { "epoch": 0.2685845213849287, "grad_norm": 30.41805648803711, "learning_rate": 1.8320349262991532e-06, "loss": 0.1293, "num_input_tokens_seen": 3034176, "step": 3165 }, { "epoch": 0.26900882552613714, "grad_norm": 15.705052375793457, "learning_rate": 1.8312123663335316e-06, "loss": 0.063, "num_input_tokens_seen": 3038464, "step": 3170 }, { "epoch": 0.26943312966734556, "grad_norm": 13.934436798095703, "learning_rate": 1.8303879827647974e-06, "loss": 0.0748, "num_input_tokens_seen": 3042944, "step": 3175 }, { "epoch": 0.269857433808554, "grad_norm": 2.7212295532226562, "learning_rate": 1.8295617774015724e-06, "loss": 0.0582, "num_input_tokens_seen": 3048000, "step": 3180 }, { "epoch": 0.2702817379497624, "grad_norm": 12.423938751220703, "learning_rate": 1.8287337520564744e-06, "loss": 0.0863, "num_input_tokens_seen": 3053056, "step": 3185 }, { "epoch": 0.2707060420909708, "grad_norm": 20.587114334106445, "learning_rate": 1.8279039085461148e-06, "loss": 0.1082, "num_input_tokens_seen": 3057792, "step": 3190 }, { "epoch": 0.27113034623217924, "grad_norm": 1.5044630765914917, "learning_rate": 1.8270722486910933e-06, "loss": 0.1442, "num_input_tokens_seen": 3062784, "step": 3195 }, { "epoch": 0.27155465037338766, "grad_norm": 21.748659133911133, "learning_rate": 1.8262387743159948e-06, "loss": 0.1048, "num_input_tokens_seen": 3067712, "step": 3200 }, { "epoch": 0.2719789545145961, "grad_norm": 18.255653381347656, "learning_rate": 1.8254034872493853e-06, "loss": 0.0471, "num_input_tokens_seen": 3072000, "step": 3205 }, { "epoch": 0.2724032586558045, "grad_norm": 0.24077212810516357, "learning_rate": 1.8245663893238072e-06, "loss": 0.0572, "num_input_tokens_seen": 3076416, "step": 3210 }, { "epoch": 0.2728275627970129, "grad_norm": 18.335908889770508, "learning_rate": 1.823727482375776e-06, "loss": 0.0936, "num_input_tokens_seen": 3081792, "step": 3215 }, { "epoch": 0.27325186693822134, "grad_norm": 5.29511833190918, "learning_rate": 1.8228867682457762e-06, "loss": 0.1179, "num_input_tokens_seen": 3086656, "step": 3220 }, { "epoch": 0.2736761710794297, "grad_norm": 8.807744979858398, "learning_rate": 1.8220442487782565e-06, "loss": 0.0912, "num_input_tokens_seen": 3091328, "step": 3225 }, { "epoch": 0.27410047522063813, "grad_norm": 1.064199686050415, "learning_rate": 1.8211999258216273e-06, "loss": 0.0176, "num_input_tokens_seen": 3096448, "step": 3230 }, { "epoch": 0.27452477936184655, "grad_norm": 5.58245325088501, "learning_rate": 1.8203538012282548e-06, "loss": 0.0774, "num_input_tokens_seen": 3102400, "step": 3235 }, { "epoch": 0.27494908350305497, "grad_norm": 3.3104500770568848, "learning_rate": 1.8195058768544583e-06, "loss": 0.0535, "num_input_tokens_seen": 3107008, "step": 3240 }, { "epoch": 0.2753733876442634, "grad_norm": 0.2806529104709625, "learning_rate": 1.8186561545605052e-06, "loss": 0.0291, "num_input_tokens_seen": 3111872, "step": 3245 }, { "epoch": 0.2757976917854718, "grad_norm": 23.801342010498047, "learning_rate": 1.8178046362106083e-06, "loss": 0.0274, "num_input_tokens_seen": 3116544, "step": 3250 }, { "epoch": 0.27622199592668023, "grad_norm": 9.284271240234375, "learning_rate": 1.8169513236729195e-06, "loss": 0.1035, "num_input_tokens_seen": 3121024, "step": 3255 }, { "epoch": 0.27664630006788865, "grad_norm": 24.860389709472656, "learning_rate": 1.8160962188195278e-06, "loss": 0.0561, "num_input_tokens_seen": 3125696, "step": 3260 }, { "epoch": 0.27707060420909707, "grad_norm": 7.064362049102783, "learning_rate": 1.8152393235264545e-06, "loss": 0.0989, "num_input_tokens_seen": 3130752, "step": 3265 }, { "epoch": 0.2774949083503055, "grad_norm": 21.392122268676758, "learning_rate": 1.8143806396736486e-06, "loss": 0.0991, "num_input_tokens_seen": 3135360, "step": 3270 }, { "epoch": 0.2779192124915139, "grad_norm": 26.925817489624023, "learning_rate": 1.813520169144983e-06, "loss": 0.1135, "num_input_tokens_seen": 3140032, "step": 3275 }, { "epoch": 0.27834351663272233, "grad_norm": 0.9765627980232239, "learning_rate": 1.8126579138282501e-06, "loss": 0.0532, "num_input_tokens_seen": 3144960, "step": 3280 }, { "epoch": 0.27876782077393075, "grad_norm": 1.4220398664474487, "learning_rate": 1.8117938756151592e-06, "loss": 0.065, "num_input_tokens_seen": 3150016, "step": 3285 }, { "epoch": 0.2791921249151392, "grad_norm": 12.359588623046875, "learning_rate": 1.8109280564013297e-06, "loss": 0.1384, "num_input_tokens_seen": 3155200, "step": 3290 }, { "epoch": 0.2796164290563476, "grad_norm": 22.804471969604492, "learning_rate": 1.8100604580862898e-06, "loss": 0.0494, "num_input_tokens_seen": 3160000, "step": 3295 }, { "epoch": 0.280040733197556, "grad_norm": 0.858443558216095, "learning_rate": 1.8091910825734686e-06, "loss": 0.0524, "num_input_tokens_seen": 3164672, "step": 3300 }, { "epoch": 0.28046503733876443, "grad_norm": 10.366905212402344, "learning_rate": 1.808319931770197e-06, "loss": 0.052, "num_input_tokens_seen": 3169152, "step": 3305 }, { "epoch": 0.28088934147997285, "grad_norm": 38.595821380615234, "learning_rate": 1.8074470075876983e-06, "loss": 0.0446, "num_input_tokens_seen": 3173888, "step": 3310 }, { "epoch": 0.2813136456211813, "grad_norm": 0.30433109402656555, "learning_rate": 1.8065723119410884e-06, "loss": 0.0326, "num_input_tokens_seen": 3179072, "step": 3315 }, { "epoch": 0.2817379497623897, "grad_norm": 18.26148223876953, "learning_rate": 1.8056958467493678e-06, "loss": 0.0651, "num_input_tokens_seen": 3183552, "step": 3320 }, { "epoch": 0.2821622539035981, "grad_norm": 7.407904148101807, "learning_rate": 1.8048176139354207e-06, "loss": 0.0082, "num_input_tokens_seen": 3187968, "step": 3325 }, { "epoch": 0.28258655804480654, "grad_norm": 0.13984189927577972, "learning_rate": 1.8039376154260086e-06, "loss": 0.1037, "num_input_tokens_seen": 3192704, "step": 3330 }, { "epoch": 0.28301086218601496, "grad_norm": 14.583263397216797, "learning_rate": 1.803055853151767e-06, "loss": 0.0991, "num_input_tokens_seen": 3197760, "step": 3335 }, { "epoch": 0.2834351663272234, "grad_norm": 32.27197265625, "learning_rate": 1.8021723290472007e-06, "loss": 0.0358, "num_input_tokens_seen": 3202368, "step": 3340 }, { "epoch": 0.2838594704684318, "grad_norm": 10.743328094482422, "learning_rate": 1.8012870450506798e-06, "loss": 0.1846, "num_input_tokens_seen": 3207360, "step": 3345 }, { "epoch": 0.2842837746096402, "grad_norm": 10.31937313079834, "learning_rate": 1.800400003104436e-06, "loss": 0.0893, "num_input_tokens_seen": 3213632, "step": 3350 }, { "epoch": 0.2847080787508486, "grad_norm": 8.19094181060791, "learning_rate": 1.799511205154557e-06, "loss": 0.0597, "num_input_tokens_seen": 3219136, "step": 3355 }, { "epoch": 0.285132382892057, "grad_norm": 1.475780725479126, "learning_rate": 1.7986206531509835e-06, "loss": 0.0728, "num_input_tokens_seen": 3225088, "step": 3360 }, { "epoch": 0.2855566870332654, "grad_norm": 6.953364372253418, "learning_rate": 1.7977283490475043e-06, "loss": 0.0789, "num_input_tokens_seen": 3229504, "step": 3365 }, { "epoch": 0.28598099117447384, "grad_norm": 7.935585975646973, "learning_rate": 1.796834294801752e-06, "loss": 0.0618, "num_input_tokens_seen": 3234368, "step": 3370 }, { "epoch": 0.28640529531568226, "grad_norm": 7.753322124481201, "learning_rate": 1.7959384923751993e-06, "loss": 0.082, "num_input_tokens_seen": 3239232, "step": 3375 }, { "epoch": 0.2868295994568907, "grad_norm": 28.726940155029297, "learning_rate": 1.7950409437331535e-06, "loss": 0.064, "num_input_tokens_seen": 3244288, "step": 3380 }, { "epoch": 0.2872539035980991, "grad_norm": 14.288199424743652, "learning_rate": 1.7941416508447534e-06, "loss": 0.1214, "num_input_tokens_seen": 3248640, "step": 3385 }, { "epoch": 0.2876782077393075, "grad_norm": 0.3123435378074646, "learning_rate": 1.7932406156829649e-06, "loss": 0.0186, "num_input_tokens_seen": 3253440, "step": 3390 }, { "epoch": 0.28810251188051594, "grad_norm": 12.358621597290039, "learning_rate": 1.7923378402245756e-06, "loss": 0.0676, "num_input_tokens_seen": 3258048, "step": 3395 }, { "epoch": 0.28852681602172436, "grad_norm": 8.790231704711914, "learning_rate": 1.7914333264501913e-06, "loss": 0.0332, "num_input_tokens_seen": 3262912, "step": 3400 }, { "epoch": 0.2889511201629328, "grad_norm": 20.035654067993164, "learning_rate": 1.790527076344232e-06, "loss": 0.0887, "num_input_tokens_seen": 3267776, "step": 3405 }, { "epoch": 0.2893754243041412, "grad_norm": 20.470781326293945, "learning_rate": 1.7896190918949266e-06, "loss": 0.0464, "num_input_tokens_seen": 3272448, "step": 3410 }, { "epoch": 0.2897997284453496, "grad_norm": 14.918984413146973, "learning_rate": 1.7887093750943088e-06, "loss": 0.07, "num_input_tokens_seen": 3277056, "step": 3415 }, { "epoch": 0.29022403258655805, "grad_norm": 9.069342613220215, "learning_rate": 1.7877979279382131e-06, "loss": 0.0767, "num_input_tokens_seen": 3282048, "step": 3420 }, { "epoch": 0.29064833672776647, "grad_norm": 11.887508392333984, "learning_rate": 1.7868847524262708e-06, "loss": 0.0967, "num_input_tokens_seen": 3286336, "step": 3425 }, { "epoch": 0.2910726408689749, "grad_norm": 1.5049360990524292, "learning_rate": 1.7859698505619043e-06, "loss": 0.0277, "num_input_tokens_seen": 3290880, "step": 3430 }, { "epoch": 0.2914969450101833, "grad_norm": 0.22411267459392548, "learning_rate": 1.7850532243523238e-06, "loss": 0.0378, "num_input_tokens_seen": 3295360, "step": 3435 }, { "epoch": 0.2919212491513917, "grad_norm": 0.26940274238586426, "learning_rate": 1.7841348758085224e-06, "loss": 0.0274, "num_input_tokens_seen": 3299840, "step": 3440 }, { "epoch": 0.29234555329260015, "grad_norm": 1.3497130870819092, "learning_rate": 1.7832148069452719e-06, "loss": 0.0298, "num_input_tokens_seen": 3304448, "step": 3445 }, { "epoch": 0.29276985743380857, "grad_norm": 0.14612747728824615, "learning_rate": 1.7822930197811186e-06, "loss": 0.1044, "num_input_tokens_seen": 3308928, "step": 3450 }, { "epoch": 0.293194161575017, "grad_norm": 6.200453758239746, "learning_rate": 1.781369516338378e-06, "loss": 0.1, "num_input_tokens_seen": 3313408, "step": 3455 }, { "epoch": 0.2936184657162254, "grad_norm": 8.402563095092773, "learning_rate": 1.7804442986431317e-06, "loss": 0.0865, "num_input_tokens_seen": 3318080, "step": 3460 }, { "epoch": 0.29404276985743383, "grad_norm": 18.582571029663086, "learning_rate": 1.7795173687252213e-06, "loss": 0.0947, "num_input_tokens_seen": 3323136, "step": 3465 }, { "epoch": 0.29446707399864225, "grad_norm": 19.33099937438965, "learning_rate": 1.778588728618246e-06, "loss": 0.0454, "num_input_tokens_seen": 3327936, "step": 3470 }, { "epoch": 0.29489137813985067, "grad_norm": 19.658811569213867, "learning_rate": 1.777658380359556e-06, "loss": 0.0718, "num_input_tokens_seen": 3332864, "step": 3475 }, { "epoch": 0.2953156822810591, "grad_norm": 0.3303367495536804, "learning_rate": 1.7767263259902494e-06, "loss": 0.0759, "num_input_tokens_seen": 3338048, "step": 3480 }, { "epoch": 0.2957399864222675, "grad_norm": 14.395822525024414, "learning_rate": 1.7757925675551672e-06, "loss": 0.1356, "num_input_tokens_seen": 3343104, "step": 3485 }, { "epoch": 0.2961642905634759, "grad_norm": 11.50323486328125, "learning_rate": 1.7748571071028898e-06, "loss": 0.0701, "num_input_tokens_seen": 3347712, "step": 3490 }, { "epoch": 0.2965885947046843, "grad_norm": 10.81116771697998, "learning_rate": 1.7739199466857301e-06, "loss": 0.0489, "num_input_tokens_seen": 3352000, "step": 3495 }, { "epoch": 0.2970128988458927, "grad_norm": 6.09246301651001, "learning_rate": 1.772981088359732e-06, "loss": 0.0291, "num_input_tokens_seen": 3356480, "step": 3500 }, { "epoch": 0.29743720298710113, "grad_norm": 13.909074783325195, "learning_rate": 1.7720405341846636e-06, "loss": 0.0997, "num_input_tokens_seen": 3361536, "step": 3505 }, { "epoch": 0.29786150712830956, "grad_norm": 10.681073188781738, "learning_rate": 1.771098286224014e-06, "loss": 0.0796, "num_input_tokens_seen": 3367296, "step": 3510 }, { "epoch": 0.298285811269518, "grad_norm": 0.8761059641838074, "learning_rate": 1.7701543465449884e-06, "loss": 0.0678, "num_input_tokens_seen": 3372096, "step": 3515 }, { "epoch": 0.2987101154107264, "grad_norm": 0.13927964866161346, "learning_rate": 1.7692087172185026e-06, "loss": 0.0632, "num_input_tokens_seen": 3376384, "step": 3520 }, { "epoch": 0.2991344195519348, "grad_norm": 8.352365493774414, "learning_rate": 1.7682614003191805e-06, "loss": 0.0409, "num_input_tokens_seen": 3381504, "step": 3525 }, { "epoch": 0.29955872369314324, "grad_norm": 20.81534767150879, "learning_rate": 1.7673123979253475e-06, "loss": 0.0469, "num_input_tokens_seen": 3386112, "step": 3530 }, { "epoch": 0.29998302783435166, "grad_norm": 1.3498769998550415, "learning_rate": 1.7663617121190271e-06, "loss": 0.05, "num_input_tokens_seen": 3395072, "step": 3535 }, { "epoch": 0.3004073319755601, "grad_norm": 0.6542552709579468, "learning_rate": 1.7654093449859367e-06, "loss": 0.0141, "num_input_tokens_seen": 3399808, "step": 3540 }, { "epoch": 0.3004073319755601, "eval_loss": 0.08620841801166534, "eval_runtime": 15.9677, "eval_samples_per_second": 656.01, "eval_steps_per_second": 82.04, "num_input_tokens_seen": 3399808, "step": 3540 }, { "epoch": 0.3008316361167685, "grad_norm": 32.648826599121094, "learning_rate": 1.764455298615481e-06, "loss": 0.0416, "num_input_tokens_seen": 3404544, "step": 3545 }, { "epoch": 0.3012559402579769, "grad_norm": 11.315652847290039, "learning_rate": 1.7634995751007499e-06, "loss": 0.1003, "num_input_tokens_seen": 3408896, "step": 3550 }, { "epoch": 0.30168024439918534, "grad_norm": 26.142065048217773, "learning_rate": 1.7625421765385124e-06, "loss": 0.0709, "num_input_tokens_seen": 3413824, "step": 3555 }, { "epoch": 0.30210454854039376, "grad_norm": 21.637981414794922, "learning_rate": 1.7615831050292127e-06, "loss": 0.097, "num_input_tokens_seen": 3418240, "step": 3560 }, { "epoch": 0.3025288526816022, "grad_norm": 6.655580997467041, "learning_rate": 1.760622362676965e-06, "loss": 0.1312, "num_input_tokens_seen": 3423168, "step": 3565 }, { "epoch": 0.3029531568228106, "grad_norm": 23.639205932617188, "learning_rate": 1.7596599515895486e-06, "loss": 0.0642, "num_input_tokens_seen": 3428224, "step": 3570 }, { "epoch": 0.303377460964019, "grad_norm": 0.2874506413936615, "learning_rate": 1.7586958738784055e-06, "loss": 0.0485, "num_input_tokens_seen": 3432896, "step": 3575 }, { "epoch": 0.30380176510522744, "grad_norm": 48.77882766723633, "learning_rate": 1.7577301316586323e-06, "loss": 0.1002, "num_input_tokens_seen": 3437632, "step": 3580 }, { "epoch": 0.30422606924643586, "grad_norm": 33.99219512939453, "learning_rate": 1.7567627270489787e-06, "loss": 0.061, "num_input_tokens_seen": 3442112, "step": 3585 }, { "epoch": 0.3046503733876443, "grad_norm": 8.705355644226074, "learning_rate": 1.7557936621718406e-06, "loss": 0.0435, "num_input_tokens_seen": 3448064, "step": 3590 }, { "epoch": 0.3050746775288527, "grad_norm": 49.27440643310547, "learning_rate": 1.754822939153257e-06, "loss": 0.1354, "num_input_tokens_seen": 3452800, "step": 3595 }, { "epoch": 0.3054989816700611, "grad_norm": 31.726367950439453, "learning_rate": 1.7538505601229043e-06, "loss": 0.1403, "num_input_tokens_seen": 3457856, "step": 3600 }, { "epoch": 0.30592328581126954, "grad_norm": 1.0165332555770874, "learning_rate": 1.7528765272140927e-06, "loss": 0.036, "num_input_tokens_seen": 3462720, "step": 3605 }, { "epoch": 0.30634758995247796, "grad_norm": 15.145169258117676, "learning_rate": 1.7519008425637597e-06, "loss": 0.1049, "num_input_tokens_seen": 3467264, "step": 3610 }, { "epoch": 0.3067718940936864, "grad_norm": 17.913949966430664, "learning_rate": 1.7509235083124679e-06, "loss": 0.0713, "num_input_tokens_seen": 3472832, "step": 3615 }, { "epoch": 0.30719619823489475, "grad_norm": 8.478503227233887, "learning_rate": 1.749944526604398e-06, "loss": 0.061, "num_input_tokens_seen": 3478016, "step": 3620 }, { "epoch": 0.30762050237610317, "grad_norm": 0.49673381447792053, "learning_rate": 1.7489638995873453e-06, "loss": 0.0438, "num_input_tokens_seen": 3482688, "step": 3625 }, { "epoch": 0.3080448065173116, "grad_norm": 19.62775230407715, "learning_rate": 1.7479816294127149e-06, "loss": 0.0193, "num_input_tokens_seen": 3487296, "step": 3630 }, { "epoch": 0.30846911065852, "grad_norm": 0.37304261326789856, "learning_rate": 1.746997718235517e-06, "loss": 0.0555, "num_input_tokens_seen": 3491712, "step": 3635 }, { "epoch": 0.3088934147997284, "grad_norm": 4.202676296234131, "learning_rate": 1.7460121682143616e-06, "loss": 0.0876, "num_input_tokens_seen": 3496256, "step": 3640 }, { "epoch": 0.30931771894093685, "grad_norm": 22.060909271240234, "learning_rate": 1.7450249815114545e-06, "loss": 0.1017, "num_input_tokens_seen": 3500672, "step": 3645 }, { "epoch": 0.30974202308214527, "grad_norm": 0.10195865482091904, "learning_rate": 1.744036160292592e-06, "loss": 0.0117, "num_input_tokens_seen": 3505536, "step": 3650 }, { "epoch": 0.3101663272233537, "grad_norm": 0.29044589400291443, "learning_rate": 1.7430457067271563e-06, "loss": 0.0618, "num_input_tokens_seen": 3510400, "step": 3655 }, { "epoch": 0.3105906313645621, "grad_norm": 0.09018034487962723, "learning_rate": 1.742053622988111e-06, "loss": 0.03, "num_input_tokens_seen": 3514880, "step": 3660 }, { "epoch": 0.31101493550577053, "grad_norm": 33.10944747924805, "learning_rate": 1.7410599112519969e-06, "loss": 0.1128, "num_input_tokens_seen": 3520192, "step": 3665 }, { "epoch": 0.31143923964697895, "grad_norm": 5.516417026519775, "learning_rate": 1.7400645736989246e-06, "loss": 0.0743, "num_input_tokens_seen": 3524544, "step": 3670 }, { "epoch": 0.31186354378818737, "grad_norm": 1.1226091384887695, "learning_rate": 1.7390676125125733e-06, "loss": 0.0105, "num_input_tokens_seen": 3528896, "step": 3675 }, { "epoch": 0.3122878479293958, "grad_norm": 0.11550328880548477, "learning_rate": 1.7380690298801836e-06, "loss": 0.0761, "num_input_tokens_seen": 3533568, "step": 3680 }, { "epoch": 0.3127121520706042, "grad_norm": 24.203899383544922, "learning_rate": 1.7370688279925538e-06, "loss": 0.0437, "num_input_tokens_seen": 3539008, "step": 3685 }, { "epoch": 0.31313645621181263, "grad_norm": 0.20447641611099243, "learning_rate": 1.736067009044034e-06, "loss": 0.0234, "num_input_tokens_seen": 3544448, "step": 3690 }, { "epoch": 0.31356076035302105, "grad_norm": 22.803014755249023, "learning_rate": 1.7350635752325222e-06, "loss": 0.1307, "num_input_tokens_seen": 3549184, "step": 3695 }, { "epoch": 0.31398506449422947, "grad_norm": 0.733262836933136, "learning_rate": 1.7340585287594603e-06, "loss": 0.0346, "num_input_tokens_seen": 3554176, "step": 3700 }, { "epoch": 0.3144093686354379, "grad_norm": 25.992023468017578, "learning_rate": 1.733051871829826e-06, "loss": 0.0626, "num_input_tokens_seen": 3558720, "step": 3705 }, { "epoch": 0.3148336727766463, "grad_norm": 13.120671272277832, "learning_rate": 1.7320436066521333e-06, "loss": 0.0735, "num_input_tokens_seen": 3563648, "step": 3710 }, { "epoch": 0.31525797691785473, "grad_norm": 8.363951683044434, "learning_rate": 1.7310337354384214e-06, "loss": 0.0464, "num_input_tokens_seen": 3568704, "step": 3715 }, { "epoch": 0.31568228105906315, "grad_norm": 5.674199104309082, "learning_rate": 1.7300222604042552e-06, "loss": 0.0382, "num_input_tokens_seen": 3573184, "step": 3720 }, { "epoch": 0.3161065852002716, "grad_norm": 31.29104995727539, "learning_rate": 1.7290091837687172e-06, "loss": 0.0764, "num_input_tokens_seen": 3578432, "step": 3725 }, { "epoch": 0.31653088934148, "grad_norm": 2.064466953277588, "learning_rate": 1.7279945077544036e-06, "loss": 0.0116, "num_input_tokens_seen": 3582848, "step": 3730 }, { "epoch": 0.3169551934826884, "grad_norm": 0.26752981543540955, "learning_rate": 1.7269782345874203e-06, "loss": 0.0319, "num_input_tokens_seen": 3587840, "step": 3735 }, { "epoch": 0.31737949762389683, "grad_norm": 0.43561768531799316, "learning_rate": 1.7259603664973766e-06, "loss": 0.1165, "num_input_tokens_seen": 3592576, "step": 3740 }, { "epoch": 0.31780380176510525, "grad_norm": 14.585902214050293, "learning_rate": 1.7249409057173806e-06, "loss": 0.1014, "num_input_tokens_seen": 3597376, "step": 3745 }, { "epoch": 0.3182281059063136, "grad_norm": 1.0415359735488892, "learning_rate": 1.7239198544840354e-06, "loss": 0.0935, "num_input_tokens_seen": 3601728, "step": 3750 }, { "epoch": 0.31865241004752204, "grad_norm": 11.442963600158691, "learning_rate": 1.7228972150374332e-06, "loss": 0.1359, "num_input_tokens_seen": 3606592, "step": 3755 }, { "epoch": 0.31907671418873046, "grad_norm": 6.534305095672607, "learning_rate": 1.7218729896211504e-06, "loss": 0.0588, "num_input_tokens_seen": 3611328, "step": 3760 }, { "epoch": 0.3195010183299389, "grad_norm": 0.5454856157302856, "learning_rate": 1.7208471804822425e-06, "loss": 0.0536, "num_input_tokens_seen": 3616000, "step": 3765 }, { "epoch": 0.3199253224711473, "grad_norm": 0.16123202443122864, "learning_rate": 1.71981978987124e-06, "loss": 0.0302, "num_input_tokens_seen": 3620288, "step": 3770 }, { "epoch": 0.3203496266123557, "grad_norm": 0.17035053670406342, "learning_rate": 1.7187908200421432e-06, "loss": 0.0396, "num_input_tokens_seen": 3624704, "step": 3775 }, { "epoch": 0.32077393075356414, "grad_norm": 0.20689719915390015, "learning_rate": 1.717760273252417e-06, "loss": 0.0919, "num_input_tokens_seen": 3628736, "step": 3780 }, { "epoch": 0.32119823489477256, "grad_norm": 20.81116485595703, "learning_rate": 1.7167281517629854e-06, "loss": 0.0607, "num_input_tokens_seen": 3633664, "step": 3785 }, { "epoch": 0.321622539035981, "grad_norm": 18.75973129272461, "learning_rate": 1.7156944578382277e-06, "loss": 0.0758, "num_input_tokens_seen": 3638400, "step": 3790 }, { "epoch": 0.3220468431771894, "grad_norm": 22.641748428344727, "learning_rate": 1.7146591937459732e-06, "loss": 0.0976, "num_input_tokens_seen": 3643200, "step": 3795 }, { "epoch": 0.3224711473183978, "grad_norm": 29.938251495361328, "learning_rate": 1.713622361757495e-06, "loss": 0.1564, "num_input_tokens_seen": 3647680, "step": 3800 }, { "epoch": 0.32289545145960624, "grad_norm": 0.21171480417251587, "learning_rate": 1.712583964147507e-06, "loss": 0.0291, "num_input_tokens_seen": 3653120, "step": 3805 }, { "epoch": 0.32331975560081466, "grad_norm": 0.950505256652832, "learning_rate": 1.7115440031941572e-06, "loss": 0.1444, "num_input_tokens_seen": 3657856, "step": 3810 }, { "epoch": 0.3237440597420231, "grad_norm": 34.6823844909668, "learning_rate": 1.7105024811790248e-06, "loss": 0.0922, "num_input_tokens_seen": 3662656, "step": 3815 }, { "epoch": 0.3241683638832315, "grad_norm": 28.47976303100586, "learning_rate": 1.7094594003871116e-06, "loss": 0.0862, "num_input_tokens_seen": 3668096, "step": 3820 }, { "epoch": 0.3245926680244399, "grad_norm": 14.752837181091309, "learning_rate": 1.7084147631068415e-06, "loss": 0.0375, "num_input_tokens_seen": 3673280, "step": 3825 }, { "epoch": 0.32501697216564834, "grad_norm": 22.75851821899414, "learning_rate": 1.7073685716300517e-06, "loss": 0.084, "num_input_tokens_seen": 3677824, "step": 3830 }, { "epoch": 0.32544127630685676, "grad_norm": 2.2120866775512695, "learning_rate": 1.7063208282519894e-06, "loss": 0.0866, "num_input_tokens_seen": 3682624, "step": 3835 }, { "epoch": 0.3258655804480652, "grad_norm": 18.671844482421875, "learning_rate": 1.7052715352713074e-06, "loss": 0.0973, "num_input_tokens_seen": 3687296, "step": 3840 }, { "epoch": 0.3262898845892736, "grad_norm": 27.980823516845703, "learning_rate": 1.7042206949900568e-06, "loss": 0.0854, "num_input_tokens_seen": 3692352, "step": 3845 }, { "epoch": 0.326714188730482, "grad_norm": 23.52008628845215, "learning_rate": 1.703168309713684e-06, "loss": 0.0305, "num_input_tokens_seen": 3697472, "step": 3850 }, { "epoch": 0.32713849287169044, "grad_norm": 10.77466869354248, "learning_rate": 1.7021143817510262e-06, "loss": 0.0659, "num_input_tokens_seen": 3702528, "step": 3855 }, { "epoch": 0.32756279701289887, "grad_norm": 21.211490631103516, "learning_rate": 1.7010589134143025e-06, "loss": 0.1094, "num_input_tokens_seen": 3706560, "step": 3860 }, { "epoch": 0.3279871011541073, "grad_norm": 14.143378257751465, "learning_rate": 1.7000019070191138e-06, "loss": 0.067, "num_input_tokens_seen": 3711104, "step": 3865 }, { "epoch": 0.3284114052953157, "grad_norm": 7.531778335571289, "learning_rate": 1.698943364884434e-06, "loss": 0.0876, "num_input_tokens_seen": 3715712, "step": 3870 }, { "epoch": 0.3288357094365241, "grad_norm": 0.37810540199279785, "learning_rate": 1.697883289332607e-06, "loss": 0.0276, "num_input_tokens_seen": 3720960, "step": 3875 }, { "epoch": 0.3292600135777325, "grad_norm": 8.426651000976562, "learning_rate": 1.6968216826893405e-06, "loss": 0.0555, "num_input_tokens_seen": 3725504, "step": 3880 }, { "epoch": 0.3296843177189409, "grad_norm": 23.238388061523438, "learning_rate": 1.6957585472837014e-06, "loss": 0.0693, "num_input_tokens_seen": 3729856, "step": 3885 }, { "epoch": 0.33010862186014933, "grad_norm": 9.211931228637695, "learning_rate": 1.6946938854481103e-06, "loss": 0.1188, "num_input_tokens_seen": 3735040, "step": 3890 }, { "epoch": 0.33053292600135775, "grad_norm": 34.77646255493164, "learning_rate": 1.6936276995183371e-06, "loss": 0.0129, "num_input_tokens_seen": 3740288, "step": 3895 }, { "epoch": 0.33095723014256617, "grad_norm": 1.0880327224731445, "learning_rate": 1.6925599918334954e-06, "loss": 0.0761, "num_input_tokens_seen": 3744960, "step": 3900 }, { "epoch": 0.3313815342837746, "grad_norm": 0.20256610214710236, "learning_rate": 1.6914907647360367e-06, "loss": 0.0783, "num_input_tokens_seen": 3749120, "step": 3905 }, { "epoch": 0.331805838424983, "grad_norm": 24.49281883239746, "learning_rate": 1.6904200205717467e-06, "loss": 0.0857, "num_input_tokens_seen": 3753984, "step": 3910 }, { "epoch": 0.33223014256619143, "grad_norm": 0.2078077346086502, "learning_rate": 1.689347761689739e-06, "loss": 0.0364, "num_input_tokens_seen": 3758464, "step": 3915 }, { "epoch": 0.33265444670739985, "grad_norm": 39.527366638183594, "learning_rate": 1.6882739904424507e-06, "loss": 0.0956, "num_input_tokens_seen": 3762624, "step": 3920 }, { "epoch": 0.3330787508486083, "grad_norm": 23.886940002441406, "learning_rate": 1.6871987091856366e-06, "loss": 0.1428, "num_input_tokens_seen": 3767616, "step": 3925 }, { "epoch": 0.3335030549898167, "grad_norm": 3.738818407058716, "learning_rate": 1.6861219202783644e-06, "loss": 0.0047, "num_input_tokens_seen": 3772864, "step": 3930 }, { "epoch": 0.3339273591310251, "grad_norm": 14.304019927978516, "learning_rate": 1.6850436260830093e-06, "loss": 0.0992, "num_input_tokens_seen": 3777728, "step": 3935 }, { "epoch": 0.33435166327223353, "grad_norm": 19.410703659057617, "learning_rate": 1.683963828965249e-06, "loss": 0.0653, "num_input_tokens_seen": 3782912, "step": 3940 }, { "epoch": 0.33477596741344195, "grad_norm": 25.328245162963867, "learning_rate": 1.6828825312940592e-06, "loss": 0.0656, "num_input_tokens_seen": 3788160, "step": 3945 }, { "epoch": 0.3352002715546504, "grad_norm": 29.521923065185547, "learning_rate": 1.6817997354417066e-06, "loss": 0.0815, "num_input_tokens_seen": 3792448, "step": 3950 }, { "epoch": 0.3356245756958588, "grad_norm": 0.7535111308097839, "learning_rate": 1.6807154437837453e-06, "loss": 0.0654, "num_input_tokens_seen": 3797376, "step": 3955 }, { "epoch": 0.3360488798370672, "grad_norm": 9.811915397644043, "learning_rate": 1.6796296586990108e-06, "loss": 0.0956, "num_input_tokens_seen": 3802496, "step": 3960 }, { "epoch": 0.33647318397827564, "grad_norm": 1.2682443857192993, "learning_rate": 1.6785423825696156e-06, "loss": 0.0604, "num_input_tokens_seen": 3806912, "step": 3965 }, { "epoch": 0.33689748811948406, "grad_norm": 0.8005596995353699, "learning_rate": 1.6774536177809426e-06, "loss": 0.042, "num_input_tokens_seen": 3811648, "step": 3970 }, { "epoch": 0.3373217922606925, "grad_norm": 0.733005166053772, "learning_rate": 1.6763633667216416e-06, "loss": 0.0873, "num_input_tokens_seen": 3817024, "step": 3975 }, { "epoch": 0.3377460964019009, "grad_norm": 1.7879672050476074, "learning_rate": 1.6752716317836226e-06, "loss": 0.0224, "num_input_tokens_seen": 3821440, "step": 3980 }, { "epoch": 0.3381704005431093, "grad_norm": 0.19908781349658966, "learning_rate": 1.6741784153620508e-06, "loss": 0.0261, "num_input_tokens_seen": 3825984, "step": 3985 }, { "epoch": 0.33859470468431774, "grad_norm": 23.788095474243164, "learning_rate": 1.6730837198553422e-06, "loss": 0.057, "num_input_tokens_seen": 3831104, "step": 3990 }, { "epoch": 0.33901900882552616, "grad_norm": 6.659988880157471, "learning_rate": 1.6719875476651577e-06, "loss": 0.083, "num_input_tokens_seen": 3836160, "step": 3995 }, { "epoch": 0.3394433129667346, "grad_norm": 15.46704387664795, "learning_rate": 1.6708899011963978e-06, "loss": 0.099, "num_input_tokens_seen": 3840640, "step": 4000 }, { "epoch": 0.339867617107943, "grad_norm": 0.8073393106460571, "learning_rate": 1.6697907828571966e-06, "loss": 0.0456, "num_input_tokens_seen": 3845440, "step": 4005 }, { "epoch": 0.3402919212491514, "grad_norm": 17.0804386138916, "learning_rate": 1.6686901950589193e-06, "loss": 0.1105, "num_input_tokens_seen": 3850368, "step": 4010 }, { "epoch": 0.3407162253903598, "grad_norm": 7.3930182456970215, "learning_rate": 1.6675881402161536e-06, "loss": 0.0449, "num_input_tokens_seen": 3855296, "step": 4015 }, { "epoch": 0.3411405295315682, "grad_norm": 2.1560728549957275, "learning_rate": 1.6664846207467054e-06, "loss": 0.073, "num_input_tokens_seen": 3859648, "step": 4020 }, { "epoch": 0.3415648336727766, "grad_norm": 29.044185638427734, "learning_rate": 1.665379639071595e-06, "loss": 0.0364, "num_input_tokens_seen": 3864512, "step": 4025 }, { "epoch": 0.34198913781398504, "grad_norm": 10.980119705200195, "learning_rate": 1.6642731976150492e-06, "loss": 0.0528, "num_input_tokens_seen": 3868800, "step": 4030 }, { "epoch": 0.34241344195519346, "grad_norm": 0.7191017866134644, "learning_rate": 1.6631652988044995e-06, "loss": 0.059, "num_input_tokens_seen": 3873664, "step": 4035 }, { "epoch": 0.3428377460964019, "grad_norm": 24.94672966003418, "learning_rate": 1.6620559450705728e-06, "loss": 0.0991, "num_input_tokens_seen": 3878528, "step": 4040 }, { "epoch": 0.3432620502376103, "grad_norm": 59.45679473876953, "learning_rate": 1.6609451388470885e-06, "loss": 0.0806, "num_input_tokens_seen": 3883136, "step": 4045 }, { "epoch": 0.3436863543788187, "grad_norm": 7.993133544921875, "learning_rate": 1.6598328825710533e-06, "loss": 0.095, "num_input_tokens_seen": 3888384, "step": 4050 }, { "epoch": 0.34411065852002715, "grad_norm": 1.8137032985687256, "learning_rate": 1.6587191786826543e-06, "loss": 0.0117, "num_input_tokens_seen": 3893056, "step": 4055 }, { "epoch": 0.34453496266123557, "grad_norm": 0.13727472722530365, "learning_rate": 1.6576040296252553e-06, "loss": 0.0724, "num_input_tokens_seen": 3897600, "step": 4060 }, { "epoch": 0.344959266802444, "grad_norm": 25.691621780395508, "learning_rate": 1.65648743784539e-06, "loss": 0.1141, "num_input_tokens_seen": 3902080, "step": 4065 }, { "epoch": 0.3453835709436524, "grad_norm": 6.447222709655762, "learning_rate": 1.6553694057927573e-06, "loss": 0.061, "num_input_tokens_seen": 3906880, "step": 4070 }, { "epoch": 0.3458078750848608, "grad_norm": 19.07137680053711, "learning_rate": 1.654249935920217e-06, "loss": 0.1306, "num_input_tokens_seen": 3911040, "step": 4075 }, { "epoch": 0.34623217922606925, "grad_norm": 5.483658313751221, "learning_rate": 1.6531290306837817e-06, "loss": 0.0689, "num_input_tokens_seen": 3915712, "step": 4080 }, { "epoch": 0.34665648336727767, "grad_norm": 4.867100238800049, "learning_rate": 1.6520066925426143e-06, "loss": 0.0916, "num_input_tokens_seen": 3919936, "step": 4085 }, { "epoch": 0.3470807875084861, "grad_norm": 2.243724822998047, "learning_rate": 1.650882923959021e-06, "loss": 0.1051, "num_input_tokens_seen": 3924480, "step": 4090 }, { "epoch": 0.3475050916496945, "grad_norm": 5.008316516876221, "learning_rate": 1.649757727398446e-06, "loss": 0.1395, "num_input_tokens_seen": 3929728, "step": 4095 }, { "epoch": 0.34792939579090293, "grad_norm": 6.448604583740234, "learning_rate": 1.6486311053294669e-06, "loss": 0.0495, "num_input_tokens_seen": 3934080, "step": 4100 }, { "epoch": 0.34835369993211135, "grad_norm": 8.177726745605469, "learning_rate": 1.6475030602237876e-06, "loss": 0.0759, "num_input_tokens_seen": 3938624, "step": 4105 }, { "epoch": 0.34877800407331977, "grad_norm": 12.904138565063477, "learning_rate": 1.646373594556236e-06, "loss": 0.0924, "num_input_tokens_seen": 3944448, "step": 4110 }, { "epoch": 0.3492023082145282, "grad_norm": 8.9033203125, "learning_rate": 1.6452427108047542e-06, "loss": 0.0257, "num_input_tokens_seen": 3949184, "step": 4115 }, { "epoch": 0.3496266123557366, "grad_norm": 9.575577735900879, "learning_rate": 1.6441104114503977e-06, "loss": 0.0649, "num_input_tokens_seen": 3953664, "step": 4120 }, { "epoch": 0.35005091649694503, "grad_norm": 12.641286849975586, "learning_rate": 1.642976698977326e-06, "loss": 0.0585, "num_input_tokens_seen": 3958336, "step": 4125 }, { "epoch": 0.35047522063815345, "grad_norm": 3.237588882446289, "learning_rate": 1.6418415758727995e-06, "loss": 0.0051, "num_input_tokens_seen": 3963584, "step": 4130 }, { "epoch": 0.35047522063815345, "eval_loss": 0.07100464403629303, "eval_runtime": 15.9339, "eval_samples_per_second": 657.403, "eval_steps_per_second": 82.215, "num_input_tokens_seen": 3963584, "step": 4130 }, { "epoch": 0.35089952477936187, "grad_norm": 9.41185474395752, "learning_rate": 1.6407050446271738e-06, "loss": 0.0893, "num_input_tokens_seen": 3968896, "step": 4135 }, { "epoch": 0.3513238289205703, "grad_norm": 9.226503372192383, "learning_rate": 1.6395671077338928e-06, "loss": 0.1264, "num_input_tokens_seen": 3973440, "step": 4140 }, { "epoch": 0.35174813306177866, "grad_norm": 20.192955017089844, "learning_rate": 1.6384277676894855e-06, "loss": 0.0501, "num_input_tokens_seen": 3978176, "step": 4145 }, { "epoch": 0.3521724372029871, "grad_norm": 19.124189376831055, "learning_rate": 1.6372870269935583e-06, "loss": 0.0844, "num_input_tokens_seen": 3983360, "step": 4150 }, { "epoch": 0.3525967413441955, "grad_norm": 51.20574951171875, "learning_rate": 1.6361448881487912e-06, "loss": 0.0303, "num_input_tokens_seen": 3987584, "step": 4155 }, { "epoch": 0.3530210454854039, "grad_norm": 0.0988692194223404, "learning_rate": 1.6350013536609307e-06, "loss": 0.0368, "num_input_tokens_seen": 3992576, "step": 4160 }, { "epoch": 0.35344534962661234, "grad_norm": 0.7540062665939331, "learning_rate": 1.6338564260387861e-06, "loss": 0.0488, "num_input_tokens_seen": 3997824, "step": 4165 }, { "epoch": 0.35386965376782076, "grad_norm": 11.16374683380127, "learning_rate": 1.6327101077942228e-06, "loss": 0.1147, "num_input_tokens_seen": 4002048, "step": 4170 }, { "epoch": 0.3542939579090292, "grad_norm": 37.31918716430664, "learning_rate": 1.631562401442157e-06, "loss": 0.0454, "num_input_tokens_seen": 4006656, "step": 4175 }, { "epoch": 0.3547182620502376, "grad_norm": 34.546321868896484, "learning_rate": 1.6304133095005505e-06, "loss": 0.0978, "num_input_tokens_seen": 4011136, "step": 4180 }, { "epoch": 0.355142566191446, "grad_norm": 62.22642517089844, "learning_rate": 1.6292628344904048e-06, "loss": 0.0658, "num_input_tokens_seen": 4015808, "step": 4185 }, { "epoch": 0.35556687033265444, "grad_norm": 17.028743743896484, "learning_rate": 1.628110978935756e-06, "loss": 0.0771, "num_input_tokens_seen": 4020480, "step": 4190 }, { "epoch": 0.35599117447386286, "grad_norm": 0.28512778878211975, "learning_rate": 1.626957745363668e-06, "loss": 0.0722, "num_input_tokens_seen": 4025088, "step": 4195 }, { "epoch": 0.3564154786150713, "grad_norm": 1.972703456878662, "learning_rate": 1.6258031363042291e-06, "loss": 0.0513, "num_input_tokens_seen": 4030272, "step": 4200 }, { "epoch": 0.3568397827562797, "grad_norm": 23.032922744750977, "learning_rate": 1.624647154290545e-06, "loss": 0.0864, "num_input_tokens_seen": 4035072, "step": 4205 }, { "epoch": 0.3572640868974881, "grad_norm": 10.1510009765625, "learning_rate": 1.6234898018587336e-06, "loss": 0.0615, "num_input_tokens_seen": 4039488, "step": 4210 }, { "epoch": 0.35768839103869654, "grad_norm": 16.575807571411133, "learning_rate": 1.6223310815479186e-06, "loss": 0.1352, "num_input_tokens_seen": 4044480, "step": 4215 }, { "epoch": 0.35811269517990496, "grad_norm": 0.11152282357215881, "learning_rate": 1.6211709959002255e-06, "loss": 0.0714, "num_input_tokens_seen": 4048768, "step": 4220 }, { "epoch": 0.3585369993211134, "grad_norm": 1.276023030281067, "learning_rate": 1.620009547460775e-06, "loss": 0.0638, "num_input_tokens_seen": 4053504, "step": 4225 }, { "epoch": 0.3589613034623218, "grad_norm": 2.9088714122772217, "learning_rate": 1.6188467387776779e-06, "loss": 0.0221, "num_input_tokens_seen": 4057856, "step": 4230 }, { "epoch": 0.3593856076035302, "grad_norm": 14.828264236450195, "learning_rate": 1.6176825724020286e-06, "loss": 0.0434, "num_input_tokens_seen": 4062784, "step": 4235 }, { "epoch": 0.35980991174473864, "grad_norm": 0.575377881526947, "learning_rate": 1.6165170508879007e-06, "loss": 0.0282, "num_input_tokens_seen": 4067328, "step": 4240 }, { "epoch": 0.36023421588594706, "grad_norm": 70.52108764648438, "learning_rate": 1.6153501767923408e-06, "loss": 0.0755, "num_input_tokens_seen": 4072704, "step": 4245 }, { "epoch": 0.3606585200271555, "grad_norm": 11.265511512756348, "learning_rate": 1.6141819526753626e-06, "loss": 0.1098, "num_input_tokens_seen": 4077504, "step": 4250 }, { "epoch": 0.3610828241683639, "grad_norm": 0.06153067946434021, "learning_rate": 1.613012381099942e-06, "loss": 0.1086, "num_input_tokens_seen": 4082240, "step": 4255 }, { "epoch": 0.3615071283095723, "grad_norm": 6.542662620544434, "learning_rate": 1.6118414646320111e-06, "loss": 0.0658, "num_input_tokens_seen": 4086272, "step": 4260 }, { "epoch": 0.36193143245078074, "grad_norm": 58.30453872680664, "learning_rate": 1.6106692058404518e-06, "loss": 0.0744, "num_input_tokens_seen": 4090880, "step": 4265 }, { "epoch": 0.36235573659198916, "grad_norm": 14.486126899719238, "learning_rate": 1.6094956072970924e-06, "loss": 0.0715, "num_input_tokens_seen": 4095552, "step": 4270 }, { "epoch": 0.36278004073319753, "grad_norm": 0.5152646899223328, "learning_rate": 1.608320671576699e-06, "loss": 0.0807, "num_input_tokens_seen": 4100352, "step": 4275 }, { "epoch": 0.36320434487440595, "grad_norm": 1.4671602249145508, "learning_rate": 1.6071444012569723e-06, "loss": 0.052, "num_input_tokens_seen": 4104704, "step": 4280 }, { "epoch": 0.36362864901561437, "grad_norm": 14.76534366607666, "learning_rate": 1.6059667989185405e-06, "loss": 0.1166, "num_input_tokens_seen": 4109376, "step": 4285 }, { "epoch": 0.3640529531568228, "grad_norm": 8.199237823486328, "learning_rate": 1.6047878671449544e-06, "loss": 0.0883, "num_input_tokens_seen": 4114496, "step": 4290 }, { "epoch": 0.3644772572980312, "grad_norm": 21.092506408691406, "learning_rate": 1.6036076085226812e-06, "loss": 0.0369, "num_input_tokens_seen": 4119552, "step": 4295 }, { "epoch": 0.36490156143923963, "grad_norm": 10.888265609741211, "learning_rate": 1.6024260256410995e-06, "loss": 0.0301, "num_input_tokens_seen": 4124352, "step": 4300 }, { "epoch": 0.36532586558044805, "grad_norm": 48.6574592590332, "learning_rate": 1.601243121092493e-06, "loss": 0.0234, "num_input_tokens_seen": 4129152, "step": 4305 }, { "epoch": 0.36575016972165647, "grad_norm": 8.204540252685547, "learning_rate": 1.6000588974720443e-06, "loss": 0.0369, "num_input_tokens_seen": 4134144, "step": 4310 }, { "epoch": 0.3661744738628649, "grad_norm": 9.99583625793457, "learning_rate": 1.5988733573778314e-06, "loss": 0.1336, "num_input_tokens_seen": 4138816, "step": 4315 }, { "epoch": 0.3665987780040733, "grad_norm": 22.33359146118164, "learning_rate": 1.597686503410819e-06, "loss": 0.1023, "num_input_tokens_seen": 4143680, "step": 4320 }, { "epoch": 0.36702308214528173, "grad_norm": 13.462114334106445, "learning_rate": 1.596498338174856e-06, "loss": 0.0953, "num_input_tokens_seen": 4149120, "step": 4325 }, { "epoch": 0.36744738628649015, "grad_norm": 3.512903928756714, "learning_rate": 1.595308864276666e-06, "loss": 0.0331, "num_input_tokens_seen": 4154432, "step": 4330 }, { "epoch": 0.36787169042769857, "grad_norm": 0.18234136700630188, "learning_rate": 1.5941180843258452e-06, "loss": 0.0485, "num_input_tokens_seen": 4159360, "step": 4335 }, { "epoch": 0.368295994568907, "grad_norm": 1.4561611413955688, "learning_rate": 1.5929260009348551e-06, "loss": 0.0196, "num_input_tokens_seen": 4163520, "step": 4340 }, { "epoch": 0.3687202987101154, "grad_norm": 37.51924133300781, "learning_rate": 1.5917326167190163e-06, "loss": 0.1079, "num_input_tokens_seen": 4168640, "step": 4345 }, { "epoch": 0.36914460285132383, "grad_norm": 33.85417556762695, "learning_rate": 1.5905379342965033e-06, "loss": 0.1585, "num_input_tokens_seen": 4173312, "step": 4350 }, { "epoch": 0.36956890699253225, "grad_norm": 6.61181640625, "learning_rate": 1.589341956288339e-06, "loss": 0.1323, "num_input_tokens_seen": 4177664, "step": 4355 }, { "epoch": 0.3699932111337407, "grad_norm": 2.2639856338500977, "learning_rate": 1.5881446853183888e-06, "loss": 0.0261, "num_input_tokens_seen": 4182016, "step": 4360 }, { "epoch": 0.3704175152749491, "grad_norm": 1.7123826742172241, "learning_rate": 1.586946124013354e-06, "loss": 0.0039, "num_input_tokens_seen": 4186624, "step": 4365 }, { "epoch": 0.3708418194161575, "grad_norm": 6.56783390045166, "learning_rate": 1.585746275002768e-06, "loss": 0.0708, "num_input_tokens_seen": 4191936, "step": 4370 }, { "epoch": 0.37126612355736593, "grad_norm": 7.986996173858643, "learning_rate": 1.5845451409189887e-06, "loss": 0.087, "num_input_tokens_seen": 4196352, "step": 4375 }, { "epoch": 0.37169042769857435, "grad_norm": 23.848651885986328, "learning_rate": 1.5833427243971927e-06, "loss": 0.0648, "num_input_tokens_seen": 4200960, "step": 4380 }, { "epoch": 0.3721147318397828, "grad_norm": 0.21783022582530975, "learning_rate": 1.582139028075371e-06, "loss": 0.0723, "num_input_tokens_seen": 4206208, "step": 4385 }, { "epoch": 0.3725390359809912, "grad_norm": 11.511324882507324, "learning_rate": 1.580934054594322e-06, "loss": 0.0497, "num_input_tokens_seen": 4210688, "step": 4390 }, { "epoch": 0.3729633401221996, "grad_norm": 5.814783096313477, "learning_rate": 1.5797278065976463e-06, "loss": 0.0667, "num_input_tokens_seen": 4215424, "step": 4395 }, { "epoch": 0.37338764426340804, "grad_norm": 8.122594833374023, "learning_rate": 1.5785202867317407e-06, "loss": 0.0108, "num_input_tokens_seen": 4220160, "step": 4400 }, { "epoch": 0.37381194840461646, "grad_norm": 4.401556968688965, "learning_rate": 1.5773114976457915e-06, "loss": 0.1083, "num_input_tokens_seen": 4224832, "step": 4405 }, { "epoch": 0.3742362525458248, "grad_norm": 21.742496490478516, "learning_rate": 1.576101441991771e-06, "loss": 0.0833, "num_input_tokens_seen": 4230464, "step": 4410 }, { "epoch": 0.37466055668703324, "grad_norm": 1.350370168685913, "learning_rate": 1.574890122424429e-06, "loss": 0.0739, "num_input_tokens_seen": 4234816, "step": 4415 }, { "epoch": 0.37508486082824166, "grad_norm": 3.1516149044036865, "learning_rate": 1.573677541601289e-06, "loss": 0.1099, "num_input_tokens_seen": 4239424, "step": 4420 }, { "epoch": 0.3755091649694501, "grad_norm": 22.65239715576172, "learning_rate": 1.5724637021826409e-06, "loss": 0.0576, "num_input_tokens_seen": 4244160, "step": 4425 }, { "epoch": 0.3759334691106585, "grad_norm": 4.091480255126953, "learning_rate": 1.5712486068315367e-06, "loss": 0.0613, "num_input_tokens_seen": 4248320, "step": 4430 }, { "epoch": 0.3763577732518669, "grad_norm": 26.138118743896484, "learning_rate": 1.5700322582137826e-06, "loss": 0.0184, "num_input_tokens_seen": 4252672, "step": 4435 }, { "epoch": 0.37678207739307534, "grad_norm": 0.11472512781620026, "learning_rate": 1.5688146589979358e-06, "loss": 0.0768, "num_input_tokens_seen": 4257280, "step": 4440 }, { "epoch": 0.37720638153428376, "grad_norm": 4.047384262084961, "learning_rate": 1.5675958118552962e-06, "loss": 0.0366, "num_input_tokens_seen": 4262592, "step": 4445 }, { "epoch": 0.3776306856754922, "grad_norm": 6.992406845092773, "learning_rate": 1.5663757194599013e-06, "loss": 0.0796, "num_input_tokens_seen": 4267840, "step": 4450 }, { "epoch": 0.3780549898167006, "grad_norm": 0.738090991973877, "learning_rate": 1.5651543844885216e-06, "loss": 0.0535, "num_input_tokens_seen": 4272576, "step": 4455 }, { "epoch": 0.378479293957909, "grad_norm": 42.81803512573242, "learning_rate": 1.5639318096206533e-06, "loss": 0.0441, "num_input_tokens_seen": 4277568, "step": 4460 }, { "epoch": 0.37890359809911744, "grad_norm": 11.038164138793945, "learning_rate": 1.562707997538512e-06, "loss": 0.0889, "num_input_tokens_seen": 4282880, "step": 4465 }, { "epoch": 0.37932790224032586, "grad_norm": 16.20423126220703, "learning_rate": 1.5614829509270288e-06, "loss": 0.087, "num_input_tokens_seen": 4287296, "step": 4470 }, { "epoch": 0.3797522063815343, "grad_norm": 0.10790825635194778, "learning_rate": 1.5602566724738426e-06, "loss": 0.0615, "num_input_tokens_seen": 4291712, "step": 4475 }, { "epoch": 0.3801765105227427, "grad_norm": 20.115352630615234, "learning_rate": 1.5590291648692952e-06, "loss": 0.0883, "num_input_tokens_seen": 4297088, "step": 4480 }, { "epoch": 0.3806008146639511, "grad_norm": 16.331439971923828, "learning_rate": 1.5578004308064245e-06, "loss": 0.0393, "num_input_tokens_seen": 4302784, "step": 4485 }, { "epoch": 0.38102511880515955, "grad_norm": 18.88719367980957, "learning_rate": 1.55657047298096e-06, "loss": 0.0433, "num_input_tokens_seen": 4307584, "step": 4490 }, { "epoch": 0.38144942294636797, "grad_norm": 0.11818954348564148, "learning_rate": 1.5553392940913148e-06, "loss": 0.0148, "num_input_tokens_seen": 4312064, "step": 4495 }, { "epoch": 0.3818737270875764, "grad_norm": 0.27669757604599, "learning_rate": 1.554106896838582e-06, "loss": 0.0059, "num_input_tokens_seen": 4316672, "step": 4500 }, { "epoch": 0.3822980312287848, "grad_norm": 5.000036716461182, "learning_rate": 1.5528732839265272e-06, "loss": 0.0332, "num_input_tokens_seen": 4321088, "step": 4505 }, { "epoch": 0.3827223353699932, "grad_norm": 41.28630065917969, "learning_rate": 1.5516384580615832e-06, "loss": 0.0341, "num_input_tokens_seen": 4326208, "step": 4510 }, { "epoch": 0.38314663951120165, "grad_norm": 0.11097334325313568, "learning_rate": 1.5504024219528437e-06, "loss": 0.0553, "num_input_tokens_seen": 4331008, "step": 4515 }, { "epoch": 0.38357094365241007, "grad_norm": 7.243147373199463, "learning_rate": 1.5491651783120578e-06, "loss": 0.019, "num_input_tokens_seen": 4335936, "step": 4520 }, { "epoch": 0.3839952477936185, "grad_norm": 0.0719473585486412, "learning_rate": 1.5479267298536238e-06, "loss": 0.0442, "num_input_tokens_seen": 4340608, "step": 4525 }, { "epoch": 0.3844195519348269, "grad_norm": 23.928747177124023, "learning_rate": 1.5466870792945828e-06, "loss": 0.0841, "num_input_tokens_seen": 4345344, "step": 4530 }, { "epoch": 0.38484385607603533, "grad_norm": 13.379002571105957, "learning_rate": 1.545446229354614e-06, "loss": 0.1063, "num_input_tokens_seen": 4351680, "step": 4535 }, { "epoch": 0.3852681602172437, "grad_norm": 26.05900764465332, "learning_rate": 1.5442041827560272e-06, "loss": 0.12, "num_input_tokens_seen": 4356032, "step": 4540 }, { "epoch": 0.3856924643584521, "grad_norm": 41.182865142822266, "learning_rate": 1.542960942223758e-06, "loss": 0.0567, "num_input_tokens_seen": 4360576, "step": 4545 }, { "epoch": 0.38611676849966053, "grad_norm": 22.600698471069336, "learning_rate": 1.541716510485361e-06, "loss": 0.1224, "num_input_tokens_seen": 4365376, "step": 4550 }, { "epoch": 0.38654107264086895, "grad_norm": 0.17062042653560638, "learning_rate": 1.5404708902710048e-06, "loss": 0.015, "num_input_tokens_seen": 4370368, "step": 4555 }, { "epoch": 0.3869653767820774, "grad_norm": 8.25340747833252, "learning_rate": 1.5392240843134648e-06, "loss": 0.0918, "num_input_tokens_seen": 4375488, "step": 4560 }, { "epoch": 0.3873896809232858, "grad_norm": 23.542274475097656, "learning_rate": 1.5379760953481178e-06, "loss": 0.1303, "num_input_tokens_seen": 4380352, "step": 4565 }, { "epoch": 0.3878139850644942, "grad_norm": 21.426984786987305, "learning_rate": 1.5367269261129367e-06, "loss": 0.0735, "num_input_tokens_seen": 4385664, "step": 4570 }, { "epoch": 0.38823828920570264, "grad_norm": 15.733993530273438, "learning_rate": 1.5354765793484831e-06, "loss": 0.1257, "num_input_tokens_seen": 4390528, "step": 4575 }, { "epoch": 0.38866259334691106, "grad_norm": 0.2743380069732666, "learning_rate": 1.5342250577979023e-06, "loss": 0.0082, "num_input_tokens_seen": 4395136, "step": 4580 }, { "epoch": 0.3890868974881195, "grad_norm": 6.8629279136657715, "learning_rate": 1.532972364206917e-06, "loss": 0.0835, "num_input_tokens_seen": 4400320, "step": 4585 }, { "epoch": 0.3895112016293279, "grad_norm": 0.2883586883544922, "learning_rate": 1.5317185013238209e-06, "loss": 0.0612, "num_input_tokens_seen": 4405056, "step": 4590 }, { "epoch": 0.3899355057705363, "grad_norm": 17.891014099121094, "learning_rate": 1.5304634718994738e-06, "loss": 0.0862, "num_input_tokens_seen": 4410624, "step": 4595 }, { "epoch": 0.39035980991174474, "grad_norm": 6.251929759979248, "learning_rate": 1.5292072786872938e-06, "loss": 0.0614, "num_input_tokens_seen": 4415424, "step": 4600 }, { "epoch": 0.39078411405295316, "grad_norm": 9.538578033447266, "learning_rate": 1.527949924443253e-06, "loss": 0.0369, "num_input_tokens_seen": 4420224, "step": 4605 }, { "epoch": 0.3912084181941616, "grad_norm": 27.938451766967773, "learning_rate": 1.52669141192587e-06, "loss": 0.0958, "num_input_tokens_seen": 4424896, "step": 4610 }, { "epoch": 0.39163272233537, "grad_norm": 29.00919532775879, "learning_rate": 1.5254317438962052e-06, "loss": 0.0946, "num_input_tokens_seen": 4429312, "step": 4615 }, { "epoch": 0.3920570264765784, "grad_norm": 8.168607711791992, "learning_rate": 1.5241709231178539e-06, "loss": 0.1145, "num_input_tokens_seen": 4433920, "step": 4620 }, { "epoch": 0.39248133061778684, "grad_norm": 14.62310791015625, "learning_rate": 1.5229089523569405e-06, "loss": 0.0557, "num_input_tokens_seen": 4438464, "step": 4625 }, { "epoch": 0.39290563475899526, "grad_norm": 5.12725830078125, "learning_rate": 1.5216458343821122e-06, "loss": 0.0481, "num_input_tokens_seen": 4443584, "step": 4630 }, { "epoch": 0.3933299389002037, "grad_norm": 1.6804157495498657, "learning_rate": 1.5203815719645328e-06, "loss": 0.0451, "num_input_tokens_seen": 4448960, "step": 4635 }, { "epoch": 0.3937542430414121, "grad_norm": 0.8322094678878784, "learning_rate": 1.5191161678778773e-06, "loss": 0.0307, "num_input_tokens_seen": 4453504, "step": 4640 }, { "epoch": 0.3941785471826205, "grad_norm": 32.25953674316406, "learning_rate": 1.5178496248983251e-06, "loss": 0.108, "num_input_tokens_seen": 4458048, "step": 4645 }, { "epoch": 0.39460285132382894, "grad_norm": 13.34233283996582, "learning_rate": 1.5165819458045554e-06, "loss": 0.0384, "num_input_tokens_seen": 4463168, "step": 4650 }, { "epoch": 0.39502715546503736, "grad_norm": 33.31865692138672, "learning_rate": 1.5153131333777377e-06, "loss": 0.1386, "num_input_tokens_seen": 4467520, "step": 4655 }, { "epoch": 0.3954514596062458, "grad_norm": 13.36380386352539, "learning_rate": 1.51404319040153e-06, "loss": 0.0864, "num_input_tokens_seen": 4472960, "step": 4660 }, { "epoch": 0.3958757637474542, "grad_norm": 13.95639705657959, "learning_rate": 1.5127721196620697e-06, "loss": 0.058, "num_input_tokens_seen": 4477312, "step": 4665 }, { "epoch": 0.39630006788866257, "grad_norm": 2.9383718967437744, "learning_rate": 1.5114999239479685e-06, "loss": 0.0255, "num_input_tokens_seen": 4482432, "step": 4670 }, { "epoch": 0.396724372029871, "grad_norm": 15.002151489257812, "learning_rate": 1.5102266060503063e-06, "loss": 0.0754, "num_input_tokens_seen": 4486464, "step": 4675 }, { "epoch": 0.3971486761710794, "grad_norm": 5.086097717285156, "learning_rate": 1.508952168762624e-06, "loss": 0.104, "num_input_tokens_seen": 4491520, "step": 4680 }, { "epoch": 0.3975729803122878, "grad_norm": 3.3495185375213623, "learning_rate": 1.5076766148809209e-06, "loss": 0.0473, "num_input_tokens_seen": 4496256, "step": 4685 }, { "epoch": 0.39799728445349625, "grad_norm": 11.387035369873047, "learning_rate": 1.506399947203643e-06, "loss": 0.0532, "num_input_tokens_seen": 4500416, "step": 4690 }, { "epoch": 0.39842158859470467, "grad_norm": 20.36817169189453, "learning_rate": 1.5051221685316815e-06, "loss": 0.052, "num_input_tokens_seen": 4505536, "step": 4695 }, { "epoch": 0.3988458927359131, "grad_norm": 5.638185977935791, "learning_rate": 1.5038432816683652e-06, "loss": 0.1351, "num_input_tokens_seen": 4510400, "step": 4700 }, { "epoch": 0.3992701968771215, "grad_norm": 16.21558380126953, "learning_rate": 1.5025632894194532e-06, "loss": 0.0894, "num_input_tokens_seen": 4515904, "step": 4705 }, { "epoch": 0.39969450101832993, "grad_norm": 21.98189926147461, "learning_rate": 1.5012821945931303e-06, "loss": 0.0939, "num_input_tokens_seen": 4520448, "step": 4710 }, { "epoch": 0.40011880515953835, "grad_norm": 18.89621353149414, "learning_rate": 1.5e-06, "loss": 0.0654, "num_input_tokens_seen": 4525824, "step": 4715 }, { "epoch": 0.40054310930074677, "grad_norm": 18.766685485839844, "learning_rate": 1.498716708453079e-06, "loss": 0.0782, "num_input_tokens_seen": 4530304, "step": 4720 }, { "epoch": 0.40054310930074677, "eval_loss": 0.055105432868003845, "eval_runtime": 15.7954, "eval_samples_per_second": 663.169, "eval_steps_per_second": 82.936, "num_input_tokens_seen": 4530304, "step": 4720 }, { "epoch": 0.4009674134419552, "grad_norm": 7.12246561050415, "learning_rate": 1.4974323227677903e-06, "loss": 0.1067, "num_input_tokens_seen": 4534720, "step": 4725 }, { "epoch": 0.4013917175831636, "grad_norm": 12.245247840881348, "learning_rate": 1.4961468457619575e-06, "loss": 0.1018, "num_input_tokens_seen": 4539520, "step": 4730 }, { "epoch": 0.40181602172437203, "grad_norm": 0.7408866882324219, "learning_rate": 1.4948602802557982e-06, "loss": 0.0083, "num_input_tokens_seen": 4544448, "step": 4735 }, { "epoch": 0.40224032586558045, "grad_norm": 17.369991302490234, "learning_rate": 1.4935726290719177e-06, "loss": 0.0448, "num_input_tokens_seen": 4549632, "step": 4740 }, { "epoch": 0.40266463000678887, "grad_norm": 1.5301878452301025, "learning_rate": 1.492283895035305e-06, "loss": 0.0697, "num_input_tokens_seen": 4554560, "step": 4745 }, { "epoch": 0.4030889341479973, "grad_norm": 7.232668876647949, "learning_rate": 1.490994080973322e-06, "loss": 0.0605, "num_input_tokens_seen": 4559168, "step": 4750 }, { "epoch": 0.4035132382892057, "grad_norm": 11.598247528076172, "learning_rate": 1.4897031897157025e-06, "loss": 0.0556, "num_input_tokens_seen": 4563968, "step": 4755 }, { "epoch": 0.40393754243041413, "grad_norm": 1.8283789157867432, "learning_rate": 1.4884112240945425e-06, "loss": 0.0479, "num_input_tokens_seen": 4569408, "step": 4760 }, { "epoch": 0.40436184657162255, "grad_norm": 0.4203529357910156, "learning_rate": 1.4871181869442952e-06, "loss": 0.0856, "num_input_tokens_seen": 4573824, "step": 4765 }, { "epoch": 0.40478615071283097, "grad_norm": 18.766796112060547, "learning_rate": 1.485824081101764e-06, "loss": 0.126, "num_input_tokens_seen": 4578368, "step": 4770 }, { "epoch": 0.4052104548540394, "grad_norm": 31.071496963500977, "learning_rate": 1.4845289094060984e-06, "loss": 0.0938, "num_input_tokens_seen": 4583040, "step": 4775 }, { "epoch": 0.4056347589952478, "grad_norm": 11.54023265838623, "learning_rate": 1.4832326746987846e-06, "loss": 0.1106, "num_input_tokens_seen": 4587968, "step": 4780 }, { "epoch": 0.40605906313645623, "grad_norm": 8.134360313415527, "learning_rate": 1.4819353798236424e-06, "loss": 0.1431, "num_input_tokens_seen": 4593216, "step": 4785 }, { "epoch": 0.40648336727766465, "grad_norm": 15.816326141357422, "learning_rate": 1.4806370276268163e-06, "loss": 0.0959, "num_input_tokens_seen": 4597824, "step": 4790 }, { "epoch": 0.4069076714188731, "grad_norm": 8.704207420349121, "learning_rate": 1.4793376209567714e-06, "loss": 0.0695, "num_input_tokens_seen": 4602880, "step": 4795 }, { "epoch": 0.4073319755600815, "grad_norm": 20.96257209777832, "learning_rate": 1.4780371626642858e-06, "loss": 0.0287, "num_input_tokens_seen": 4607744, "step": 4800 }, { "epoch": 0.40775627970128986, "grad_norm": 6.630837917327881, "learning_rate": 1.4767356556024448e-06, "loss": 0.1188, "num_input_tokens_seen": 4612224, "step": 4805 }, { "epoch": 0.4081805838424983, "grad_norm": 18.014816284179688, "learning_rate": 1.4754331026266344e-06, "loss": 0.0525, "num_input_tokens_seen": 4616704, "step": 4810 }, { "epoch": 0.4086048879837067, "grad_norm": 10.613683700561523, "learning_rate": 1.474129506594536e-06, "loss": 0.0684, "num_input_tokens_seen": 4621568, "step": 4815 }, { "epoch": 0.4090291921249151, "grad_norm": 7.092163562774658, "learning_rate": 1.472824870366118e-06, "loss": 0.0427, "num_input_tokens_seen": 4626176, "step": 4820 }, { "epoch": 0.40945349626612354, "grad_norm": 8.511113166809082, "learning_rate": 1.4715191968036324e-06, "loss": 0.0541, "num_input_tokens_seen": 4630400, "step": 4825 }, { "epoch": 0.40987780040733196, "grad_norm": 1.382973074913025, "learning_rate": 1.4702124887716058e-06, "loss": 0.0376, "num_input_tokens_seen": 4634688, "step": 4830 }, { "epoch": 0.4103021045485404, "grad_norm": 2.407348871231079, "learning_rate": 1.4689047491368354e-06, "loss": 0.0444, "num_input_tokens_seen": 4639104, "step": 4835 }, { "epoch": 0.4107264086897488, "grad_norm": 33.6815185546875, "learning_rate": 1.4675959807683808e-06, "loss": 0.0833, "num_input_tokens_seen": 4643328, "step": 4840 }, { "epoch": 0.4111507128309572, "grad_norm": 2.1301493644714355, "learning_rate": 1.4662861865375588e-06, "loss": 0.0481, "num_input_tokens_seen": 4648320, "step": 4845 }, { "epoch": 0.41157501697216564, "grad_norm": 0.48759591579437256, "learning_rate": 1.4649753693179373e-06, "loss": 0.0984, "num_input_tokens_seen": 4653120, "step": 4850 }, { "epoch": 0.41199932111337406, "grad_norm": 20.999130249023438, "learning_rate": 1.4636635319853272e-06, "loss": 0.073, "num_input_tokens_seen": 4658048, "step": 4855 }, { "epoch": 0.4124236252545825, "grad_norm": 0.22578765451908112, "learning_rate": 1.4623506774177796e-06, "loss": 0.0463, "num_input_tokens_seen": 4662976, "step": 4860 }, { "epoch": 0.4128479293957909, "grad_norm": 8.64342975616455, "learning_rate": 1.4610368084955748e-06, "loss": 0.0902, "num_input_tokens_seen": 4667840, "step": 4865 }, { "epoch": 0.4132722335369993, "grad_norm": 48.03071212768555, "learning_rate": 1.4597219281012208e-06, "loss": 0.096, "num_input_tokens_seen": 4673408, "step": 4870 }, { "epoch": 0.41369653767820774, "grad_norm": 26.673458099365234, "learning_rate": 1.4584060391194436e-06, "loss": 0.1332, "num_input_tokens_seen": 4679552, "step": 4875 }, { "epoch": 0.41412084181941616, "grad_norm": 0.5152348279953003, "learning_rate": 1.4570891444371814e-06, "loss": 0.0965, "num_input_tokens_seen": 4684352, "step": 4880 }, { "epoch": 0.4145451459606246, "grad_norm": 6.625322341918945, "learning_rate": 1.4557712469435797e-06, "loss": 0.0692, "num_input_tokens_seen": 4688704, "step": 4885 }, { "epoch": 0.414969450101833, "grad_norm": 22.45545196533203, "learning_rate": 1.4544523495299841e-06, "loss": 0.0413, "num_input_tokens_seen": 4693440, "step": 4890 }, { "epoch": 0.4153937542430414, "grad_norm": 1.495713710784912, "learning_rate": 1.4531324550899333e-06, "loss": 0.0806, "num_input_tokens_seen": 4698496, "step": 4895 }, { "epoch": 0.41581805838424984, "grad_norm": 12.096963882446289, "learning_rate": 1.451811566519154e-06, "loss": 0.0962, "num_input_tokens_seen": 4703616, "step": 4900 }, { "epoch": 0.41624236252545826, "grad_norm": 5.132894992828369, "learning_rate": 1.450489686715553e-06, "loss": 0.0827, "num_input_tokens_seen": 4707584, "step": 4905 }, { "epoch": 0.4166666666666667, "grad_norm": 41.053489685058594, "learning_rate": 1.4491668185792131e-06, "loss": 0.1271, "num_input_tokens_seen": 4711936, "step": 4910 }, { "epoch": 0.4170909708078751, "grad_norm": 1.5487451553344727, "learning_rate": 1.4478429650123851e-06, "loss": 0.0545, "num_input_tokens_seen": 4716992, "step": 4915 }, { "epoch": 0.4175152749490835, "grad_norm": 24.532102584838867, "learning_rate": 1.44651812891948e-06, "loss": 0.0595, "num_input_tokens_seen": 4721792, "step": 4920 }, { "epoch": 0.41793957909029195, "grad_norm": 14.233129501342773, "learning_rate": 1.4451923132070669e-06, "loss": 0.0644, "num_input_tokens_seen": 4726208, "step": 4925 }, { "epoch": 0.41836388323150037, "grad_norm": 24.64492416381836, "learning_rate": 1.4438655207838628e-06, "loss": 0.0712, "num_input_tokens_seen": 4730880, "step": 4930 }, { "epoch": 0.41878818737270873, "grad_norm": 2.684027910232544, "learning_rate": 1.4425377545607275e-06, "loss": 0.0898, "num_input_tokens_seen": 4736128, "step": 4935 }, { "epoch": 0.41921249151391715, "grad_norm": 0.3984468877315521, "learning_rate": 1.4412090174506567e-06, "loss": 0.0292, "num_input_tokens_seen": 4740608, "step": 4940 }, { "epoch": 0.41963679565512557, "grad_norm": 1.1649671792984009, "learning_rate": 1.4398793123687777e-06, "loss": 0.0739, "num_input_tokens_seen": 4745408, "step": 4945 }, { "epoch": 0.420061099796334, "grad_norm": 0.20689576864242554, "learning_rate": 1.4385486422323404e-06, "loss": 0.0038, "num_input_tokens_seen": 4750400, "step": 4950 }, { "epoch": 0.4204854039375424, "grad_norm": 0.23738016188144684, "learning_rate": 1.4372170099607123e-06, "loss": 0.0556, "num_input_tokens_seen": 4755008, "step": 4955 }, { "epoch": 0.42090970807875083, "grad_norm": 6.202879428863525, "learning_rate": 1.435884418475371e-06, "loss": 0.1251, "num_input_tokens_seen": 4759424, "step": 4960 }, { "epoch": 0.42133401221995925, "grad_norm": 3.472230911254883, "learning_rate": 1.4345508706998994e-06, "loss": 0.0476, "num_input_tokens_seen": 4763968, "step": 4965 }, { "epoch": 0.4217583163611677, "grad_norm": 13.630236625671387, "learning_rate": 1.433216369559978e-06, "loss": 0.065, "num_input_tokens_seen": 4768960, "step": 4970 }, { "epoch": 0.4221826205023761, "grad_norm": 1.2870839834213257, "learning_rate": 1.4318809179833791e-06, "loss": 0.0698, "num_input_tokens_seen": 4774592, "step": 4975 }, { "epoch": 0.4226069246435845, "grad_norm": 13.314414024353027, "learning_rate": 1.4305445188999596e-06, "loss": 0.0981, "num_input_tokens_seen": 4778944, "step": 4980 }, { "epoch": 0.42303122878479293, "grad_norm": 1.0573667287826538, "learning_rate": 1.4292071752416558e-06, "loss": 0.1095, "num_input_tokens_seen": 4783488, "step": 4985 }, { "epoch": 0.42345553292600135, "grad_norm": 1.0888034105300903, "learning_rate": 1.4278688899424764e-06, "loss": 0.0264, "num_input_tokens_seen": 4788288, "step": 4990 }, { "epoch": 0.4238798370672098, "grad_norm": 0.6745188236236572, "learning_rate": 1.4265296659384953e-06, "loss": 0.0264, "num_input_tokens_seen": 4792896, "step": 4995 }, { "epoch": 0.4243041412084182, "grad_norm": 11.085637092590332, "learning_rate": 1.4251895061678463e-06, "loss": 0.1121, "num_input_tokens_seen": 4797056, "step": 5000 }, { "epoch": 0.4247284453496266, "grad_norm": 0.7506033182144165, "learning_rate": 1.4238484135707162e-06, "loss": 0.0697, "num_input_tokens_seen": 4802304, "step": 5005 }, { "epoch": 0.42515274949083504, "grad_norm": 11.137918472290039, "learning_rate": 1.4225063910893384e-06, "loss": 0.0716, "num_input_tokens_seen": 4807424, "step": 5010 }, { "epoch": 0.42557705363204346, "grad_norm": 51.12891387939453, "learning_rate": 1.4211634416679855e-06, "loss": 0.0602, "num_input_tokens_seen": 4812224, "step": 5015 }, { "epoch": 0.4260013577732519, "grad_norm": 28.7672061920166, "learning_rate": 1.419819568252965e-06, "loss": 0.1474, "num_input_tokens_seen": 4817216, "step": 5020 }, { "epoch": 0.4264256619144603, "grad_norm": 0.18512046337127686, "learning_rate": 1.418474773792611e-06, "loss": 0.0352, "num_input_tokens_seen": 4822336, "step": 5025 }, { "epoch": 0.4268499660556687, "grad_norm": 4.583760738372803, "learning_rate": 1.4171290612372779e-06, "loss": 0.0397, "num_input_tokens_seen": 4827328, "step": 5030 }, { "epoch": 0.42727427019687714, "grad_norm": 9.943023681640625, "learning_rate": 1.4157824335393349e-06, "loss": 0.0677, "num_input_tokens_seen": 4831808, "step": 5035 }, { "epoch": 0.42769857433808556, "grad_norm": 0.5764977335929871, "learning_rate": 1.4144348936531588e-06, "loss": 0.0246, "num_input_tokens_seen": 4836096, "step": 5040 }, { "epoch": 0.428122878479294, "grad_norm": 40.916561126708984, "learning_rate": 1.413086444535127e-06, "loss": 0.0595, "num_input_tokens_seen": 4840448, "step": 5045 }, { "epoch": 0.4285471826205024, "grad_norm": 0.2913835048675537, "learning_rate": 1.4117370891436133e-06, "loss": 0.0571, "num_input_tokens_seen": 4845184, "step": 5050 }, { "epoch": 0.4289714867617108, "grad_norm": 6.826591491699219, "learning_rate": 1.410386830438978e-06, "loss": 0.1268, "num_input_tokens_seen": 4849920, "step": 5055 }, { "epoch": 0.42939579090291924, "grad_norm": 28.76835823059082, "learning_rate": 1.4090356713835635e-06, "loss": 0.0976, "num_input_tokens_seen": 4854400, "step": 5060 }, { "epoch": 0.4298200950441276, "grad_norm": 4.77614688873291, "learning_rate": 1.4076836149416886e-06, "loss": 0.085, "num_input_tokens_seen": 4859392, "step": 5065 }, { "epoch": 0.430244399185336, "grad_norm": 1.4293068647384644, "learning_rate": 1.4063306640796404e-06, "loss": 0.1246, "num_input_tokens_seen": 4864512, "step": 5070 }, { "epoch": 0.43066870332654444, "grad_norm": 8.199060440063477, "learning_rate": 1.4049768217656674e-06, "loss": 0.0893, "num_input_tokens_seen": 4869888, "step": 5075 }, { "epoch": 0.43109300746775286, "grad_norm": 7.790666580200195, "learning_rate": 1.4036220909699748e-06, "loss": 0.0676, "num_input_tokens_seen": 4874304, "step": 5080 }, { "epoch": 0.4315173116089613, "grad_norm": 35.040306091308594, "learning_rate": 1.4022664746647168e-06, "loss": 0.0927, "num_input_tokens_seen": 4879360, "step": 5085 }, { "epoch": 0.4319416157501697, "grad_norm": 9.675673484802246, "learning_rate": 1.40090997582399e-06, "loss": 0.0828, "num_input_tokens_seen": 4883520, "step": 5090 }, { "epoch": 0.4323659198913781, "grad_norm": 3.159926652908325, "learning_rate": 1.3995525974238278e-06, "loss": 0.039, "num_input_tokens_seen": 4888320, "step": 5095 }, { "epoch": 0.43279022403258655, "grad_norm": 2.8504841327667236, "learning_rate": 1.398194342442193e-06, "loss": 0.0103, "num_input_tokens_seen": 4893824, "step": 5100 }, { "epoch": 0.43321452817379497, "grad_norm": 9.892780303955078, "learning_rate": 1.396835213858971e-06, "loss": 0.0693, "num_input_tokens_seen": 4898432, "step": 5105 }, { "epoch": 0.4336388323150034, "grad_norm": 25.05731964111328, "learning_rate": 1.395475214655965e-06, "loss": 0.106, "num_input_tokens_seen": 4903040, "step": 5110 }, { "epoch": 0.4340631364562118, "grad_norm": 11.220687866210938, "learning_rate": 1.394114347816887e-06, "loss": 0.0744, "num_input_tokens_seen": 4908096, "step": 5115 }, { "epoch": 0.4344874405974202, "grad_norm": 0.7513478994369507, "learning_rate": 1.3927526163273538e-06, "loss": 0.1002, "num_input_tokens_seen": 4912640, "step": 5120 }, { "epoch": 0.43491174473862865, "grad_norm": 17.629304885864258, "learning_rate": 1.3913900231748776e-06, "loss": 0.0305, "num_input_tokens_seen": 4917504, "step": 5125 }, { "epoch": 0.43533604887983707, "grad_norm": 0.5880158543586731, "learning_rate": 1.3900265713488623e-06, "loss": 0.0281, "num_input_tokens_seen": 4922688, "step": 5130 }, { "epoch": 0.4357603530210455, "grad_norm": 2.0686724185943604, "learning_rate": 1.3886622638405952e-06, "loss": 0.0589, "num_input_tokens_seen": 4926976, "step": 5135 }, { "epoch": 0.4361846571622539, "grad_norm": 21.587642669677734, "learning_rate": 1.3872971036432406e-06, "loss": 0.052, "num_input_tokens_seen": 4931456, "step": 5140 }, { "epoch": 0.43660896130346233, "grad_norm": 2.3363358974456787, "learning_rate": 1.385931093751834e-06, "loss": 0.0948, "num_input_tokens_seen": 4936000, "step": 5145 }, { "epoch": 0.43703326544467075, "grad_norm": 14.495381355285645, "learning_rate": 1.384564237163275e-06, "loss": 0.0645, "num_input_tokens_seen": 4940288, "step": 5150 }, { "epoch": 0.43745756958587917, "grad_norm": 9.281749725341797, "learning_rate": 1.3831965368763203e-06, "loss": 0.0545, "num_input_tokens_seen": 4944576, "step": 5155 }, { "epoch": 0.4378818737270876, "grad_norm": 14.984248161315918, "learning_rate": 1.3818279958915785e-06, "loss": 0.0682, "num_input_tokens_seen": 4948992, "step": 5160 }, { "epoch": 0.438306177868296, "grad_norm": 10.550505638122559, "learning_rate": 1.3804586172115015e-06, "loss": 0.0657, "num_input_tokens_seen": 4953728, "step": 5165 }, { "epoch": 0.43873048200950443, "grad_norm": 12.228930473327637, "learning_rate": 1.3790884038403793e-06, "loss": 0.055, "num_input_tokens_seen": 4958720, "step": 5170 }, { "epoch": 0.43915478615071285, "grad_norm": 16.7993221282959, "learning_rate": 1.3777173587843341e-06, "loss": 0.0761, "num_input_tokens_seen": 4963840, "step": 5175 }, { "epoch": 0.43957909029192127, "grad_norm": 19.838136672973633, "learning_rate": 1.3763454850513122e-06, "loss": 0.0099, "num_input_tokens_seen": 4968512, "step": 5180 }, { "epoch": 0.4400033944331297, "grad_norm": 16.498245239257812, "learning_rate": 1.3749727856510766e-06, "loss": 0.079, "num_input_tokens_seen": 4972928, "step": 5185 }, { "epoch": 0.4404276985743381, "grad_norm": 14.861262321472168, "learning_rate": 1.373599263595204e-06, "loss": 0.1034, "num_input_tokens_seen": 4977664, "step": 5190 }, { "epoch": 0.4408520027155465, "grad_norm": 1.0039514303207397, "learning_rate": 1.3722249218970744e-06, "loss": 0.1265, "num_input_tokens_seen": 4982912, "step": 5195 }, { "epoch": 0.4412763068567549, "grad_norm": 21.250131607055664, "learning_rate": 1.3708497635718672e-06, "loss": 0.0489, "num_input_tokens_seen": 4988416, "step": 5200 }, { "epoch": 0.4417006109979633, "grad_norm": 8.660135269165039, "learning_rate": 1.3694737916365515e-06, "loss": 0.0982, "num_input_tokens_seen": 4993472, "step": 5205 }, { "epoch": 0.44212491513917174, "grad_norm": 0.22283487021923065, "learning_rate": 1.3680970091098832e-06, "loss": 0.037, "num_input_tokens_seen": 4998208, "step": 5210 }, { "epoch": 0.44254921928038016, "grad_norm": 15.840060234069824, "learning_rate": 1.366719419012396e-06, "loss": 0.0563, "num_input_tokens_seen": 5003008, "step": 5215 }, { "epoch": 0.4429735234215886, "grad_norm": 14.518122673034668, "learning_rate": 1.3653410243663951e-06, "loss": 0.0438, "num_input_tokens_seen": 5008704, "step": 5220 }, { "epoch": 0.443397827562797, "grad_norm": 27.201988220214844, "learning_rate": 1.363961828195951e-06, "loss": 0.1245, "num_input_tokens_seen": 5013120, "step": 5225 }, { "epoch": 0.4438221317040054, "grad_norm": 0.40873754024505615, "learning_rate": 1.3625818335268923e-06, "loss": 0.0402, "num_input_tokens_seen": 5017664, "step": 5230 }, { "epoch": 0.44424643584521384, "grad_norm": 11.026297569274902, "learning_rate": 1.3612010433868004e-06, "loss": 0.058, "num_input_tokens_seen": 5022528, "step": 5235 }, { "epoch": 0.44467073998642226, "grad_norm": 6.463628768920898, "learning_rate": 1.3598194608050008e-06, "loss": 0.0781, "num_input_tokens_seen": 5027072, "step": 5240 }, { "epoch": 0.4450950441276307, "grad_norm": 0.26125529408454895, "learning_rate": 1.3584370888125583e-06, "loss": 0.0093, "num_input_tokens_seen": 5031424, "step": 5245 }, { "epoch": 0.4455193482688391, "grad_norm": 25.507278442382812, "learning_rate": 1.357053930442269e-06, "loss": 0.056, "num_input_tokens_seen": 5036480, "step": 5250 }, { "epoch": 0.4459436524100475, "grad_norm": 12.934303283691406, "learning_rate": 1.355669988728655e-06, "loss": 0.0619, "num_input_tokens_seen": 5041792, "step": 5255 }, { "epoch": 0.44636795655125594, "grad_norm": 0.4678743779659271, "learning_rate": 1.3542852667079557e-06, "loss": 0.0033, "num_input_tokens_seen": 5046592, "step": 5260 }, { "epoch": 0.44679226069246436, "grad_norm": 20.0618953704834, "learning_rate": 1.352899767418124e-06, "loss": 0.1114, "num_input_tokens_seen": 5051456, "step": 5265 }, { "epoch": 0.4472165648336728, "grad_norm": 21.787246704101562, "learning_rate": 1.3515134938988168e-06, "loss": 0.0761, "num_input_tokens_seen": 5056320, "step": 5270 }, { "epoch": 0.4476408689748812, "grad_norm": 0.42069345712661743, "learning_rate": 1.3501264491913906e-06, "loss": 0.0149, "num_input_tokens_seen": 5061248, "step": 5275 }, { "epoch": 0.4480651731160896, "grad_norm": 16.042268753051758, "learning_rate": 1.348738636338893e-06, "loss": 0.0755, "num_input_tokens_seen": 5065664, "step": 5280 }, { "epoch": 0.44848947725729804, "grad_norm": 11.962218284606934, "learning_rate": 1.3473500583860568e-06, "loss": 0.0789, "num_input_tokens_seen": 5071552, "step": 5285 }, { "epoch": 0.44891378139850646, "grad_norm": 0.37122446298599243, "learning_rate": 1.3459607183792945e-06, "loss": 0.0564, "num_input_tokens_seen": 5076032, "step": 5290 }, { "epoch": 0.4493380855397149, "grad_norm": 15.164850234985352, "learning_rate": 1.344570619366689e-06, "loss": 0.0543, "num_input_tokens_seen": 5080384, "step": 5295 }, { "epoch": 0.4497623896809233, "grad_norm": 17.177448272705078, "learning_rate": 1.3431797643979894e-06, "loss": 0.0623, "num_input_tokens_seen": 5085376, "step": 5300 }, { "epoch": 0.4501866938221317, "grad_norm": 7.689847469329834, "learning_rate": 1.3417881565246027e-06, "loss": 0.0338, "num_input_tokens_seen": 5090112, "step": 5305 }, { "epoch": 0.45061099796334014, "grad_norm": 56.23309326171875, "learning_rate": 1.3403957987995882e-06, "loss": 0.05, "num_input_tokens_seen": 5095424, "step": 5310 }, { "epoch": 0.45061099796334014, "eval_loss": 0.06341014802455902, "eval_runtime": 15.8212, "eval_samples_per_second": 662.086, "eval_steps_per_second": 82.8, "num_input_tokens_seen": 5095424, "step": 5310 }, { "epoch": 0.45103530210454856, "grad_norm": 2.7271251678466797, "learning_rate": 1.33900269427765e-06, "loss": 0.0584, "num_input_tokens_seen": 5100864, "step": 5315 }, { "epoch": 0.451459606245757, "grad_norm": 9.763409614562988, "learning_rate": 1.3376088460151306e-06, "loss": 0.0825, "num_input_tokens_seen": 5105088, "step": 5320 }, { "epoch": 0.4518839103869654, "grad_norm": 26.676908493041992, "learning_rate": 1.336214257070004e-06, "loss": 0.044, "num_input_tokens_seen": 5109760, "step": 5325 }, { "epoch": 0.45230821452817377, "grad_norm": 25.818172454833984, "learning_rate": 1.3348189305018702e-06, "loss": 0.0885, "num_input_tokens_seen": 5114176, "step": 5330 }, { "epoch": 0.4527325186693822, "grad_norm": 9.313101768493652, "learning_rate": 1.3334228693719464e-06, "loss": 0.0254, "num_input_tokens_seen": 5118592, "step": 5335 }, { "epoch": 0.4531568228105906, "grad_norm": 0.3694465160369873, "learning_rate": 1.3320260767430614e-06, "loss": 0.1096, "num_input_tokens_seen": 5123584, "step": 5340 }, { "epoch": 0.45358112695179903, "grad_norm": 14.618754386901855, "learning_rate": 1.3306285556796492e-06, "loss": 0.0228, "num_input_tokens_seen": 5128192, "step": 5345 }, { "epoch": 0.45400543109300745, "grad_norm": 0.516826868057251, "learning_rate": 1.3292303092477424e-06, "loss": 0.0764, "num_input_tokens_seen": 5132864, "step": 5350 }, { "epoch": 0.45442973523421587, "grad_norm": 1.1318422555923462, "learning_rate": 1.3278313405149638e-06, "loss": 0.0411, "num_input_tokens_seen": 5137216, "step": 5355 }, { "epoch": 0.4548540393754243, "grad_norm": 8.509772300720215, "learning_rate": 1.3264316525505216e-06, "loss": 0.0462, "num_input_tokens_seen": 5142528, "step": 5360 }, { "epoch": 0.4552783435166327, "grad_norm": 15.627358436584473, "learning_rate": 1.3250312484252021e-06, "loss": 0.0102, "num_input_tokens_seen": 5147968, "step": 5365 }, { "epoch": 0.45570264765784113, "grad_norm": 0.08779313415288925, "learning_rate": 1.3236301312113627e-06, "loss": 0.0413, "num_input_tokens_seen": 5152384, "step": 5370 }, { "epoch": 0.45612695179904955, "grad_norm": 36.04643630981445, "learning_rate": 1.3222283039829247e-06, "loss": 0.1069, "num_input_tokens_seen": 5156992, "step": 5375 }, { "epoch": 0.45655125594025797, "grad_norm": 0.39356672763824463, "learning_rate": 1.3208257698153676e-06, "loss": 0.0965, "num_input_tokens_seen": 5161984, "step": 5380 }, { "epoch": 0.4569755600814664, "grad_norm": 16.60633659362793, "learning_rate": 1.3194225317857216e-06, "loss": 0.0604, "num_input_tokens_seen": 5167040, "step": 5385 }, { "epoch": 0.4573998642226748, "grad_norm": 25.81131362915039, "learning_rate": 1.3180185929725616e-06, "loss": 0.0475, "num_input_tokens_seen": 5171776, "step": 5390 }, { "epoch": 0.45782416836388323, "grad_norm": 14.490070343017578, "learning_rate": 1.3166139564559992e-06, "loss": 0.117, "num_input_tokens_seen": 5176896, "step": 5395 }, { "epoch": 0.45824847250509165, "grad_norm": 17.152679443359375, "learning_rate": 1.3152086253176773e-06, "loss": 0.0416, "num_input_tokens_seen": 5181312, "step": 5400 }, { "epoch": 0.4586727766463001, "grad_norm": 6.349189758300781, "learning_rate": 1.313802602640763e-06, "loss": 0.008, "num_input_tokens_seen": 5186112, "step": 5405 }, { "epoch": 0.4590970807875085, "grad_norm": 5.189371109008789, "learning_rate": 1.3123958915099392e-06, "loss": 0.0387, "num_input_tokens_seen": 5191040, "step": 5410 }, { "epoch": 0.4595213849287169, "grad_norm": 0.14324896037578583, "learning_rate": 1.3109884950114005e-06, "loss": 0.0441, "num_input_tokens_seen": 5195584, "step": 5415 }, { "epoch": 0.45994568906992533, "grad_norm": 22.456459045410156, "learning_rate": 1.309580416232845e-06, "loss": 0.0799, "num_input_tokens_seen": 5200256, "step": 5420 }, { "epoch": 0.46036999321113375, "grad_norm": 14.055853843688965, "learning_rate": 1.3081716582634672e-06, "loss": 0.0576, "num_input_tokens_seen": 5205376, "step": 5425 }, { "epoch": 0.4607942973523422, "grad_norm": 14.267642974853516, "learning_rate": 1.3067622241939518e-06, "loss": 0.0254, "num_input_tokens_seen": 5211392, "step": 5430 }, { "epoch": 0.4612186014935506, "grad_norm": 10.525510787963867, "learning_rate": 1.305352117116467e-06, "loss": 0.0247, "num_input_tokens_seen": 5215616, "step": 5435 }, { "epoch": 0.461642905634759, "grad_norm": 7.825259685516357, "learning_rate": 1.3039413401246576e-06, "loss": 0.0862, "num_input_tokens_seen": 5220608, "step": 5440 }, { "epoch": 0.46206720977596744, "grad_norm": 6.458859443664551, "learning_rate": 1.3025298963136377e-06, "loss": 0.1111, "num_input_tokens_seen": 5225344, "step": 5445 }, { "epoch": 0.46249151391717586, "grad_norm": 8.985774040222168, "learning_rate": 1.3011177887799844e-06, "loss": 0.0417, "num_input_tokens_seen": 5230464, "step": 5450 }, { "epoch": 0.4629158180583843, "grad_norm": 6.6957597732543945, "learning_rate": 1.2997050206217315e-06, "loss": 0.0548, "num_input_tokens_seen": 5235264, "step": 5455 }, { "epoch": 0.46334012219959264, "grad_norm": 7.47517728805542, "learning_rate": 1.2982915949383614e-06, "loss": 0.0883, "num_input_tokens_seen": 5239808, "step": 5460 }, { "epoch": 0.46376442634080106, "grad_norm": 3.9635698795318604, "learning_rate": 1.2968775148308002e-06, "loss": 0.0629, "num_input_tokens_seen": 5244416, "step": 5465 }, { "epoch": 0.4641887304820095, "grad_norm": 15.994041442871094, "learning_rate": 1.295462783401408e-06, "loss": 0.0882, "num_input_tokens_seen": 5249280, "step": 5470 }, { "epoch": 0.4646130346232179, "grad_norm": 10.046189308166504, "learning_rate": 1.2940474037539755e-06, "loss": 0.1072, "num_input_tokens_seen": 5254080, "step": 5475 }, { "epoch": 0.4650373387644263, "grad_norm": 1.4697424173355103, "learning_rate": 1.2926313789937143e-06, "loss": 0.0185, "num_input_tokens_seen": 5259136, "step": 5480 }, { "epoch": 0.46546164290563474, "grad_norm": 18.949148178100586, "learning_rate": 1.2912147122272522e-06, "loss": 0.0443, "num_input_tokens_seen": 5263744, "step": 5485 }, { "epoch": 0.46588594704684316, "grad_norm": 1.0807805061340332, "learning_rate": 1.289797406562625e-06, "loss": 0.0569, "num_input_tokens_seen": 5268544, "step": 5490 }, { "epoch": 0.4663102511880516, "grad_norm": 17.27862548828125, "learning_rate": 1.2883794651092704e-06, "loss": 0.0288, "num_input_tokens_seen": 5273280, "step": 5495 }, { "epoch": 0.46673455532926, "grad_norm": 7.288158416748047, "learning_rate": 1.2869608909780212e-06, "loss": 0.0231, "num_input_tokens_seen": 5277888, "step": 5500 }, { "epoch": 0.4671588594704684, "grad_norm": 14.408581733703613, "learning_rate": 1.2855416872810973e-06, "loss": 0.0518, "num_input_tokens_seen": 5282432, "step": 5505 }, { "epoch": 0.46758316361167684, "grad_norm": 5.324777126312256, "learning_rate": 1.284121857132101e-06, "loss": 0.0088, "num_input_tokens_seen": 5288512, "step": 5510 }, { "epoch": 0.46800746775288526, "grad_norm": 9.321609497070312, "learning_rate": 1.2827014036460082e-06, "loss": 0.0204, "num_input_tokens_seen": 5292800, "step": 5515 }, { "epoch": 0.4684317718940937, "grad_norm": 9.393115043640137, "learning_rate": 1.2812803299391628e-06, "loss": 0.0274, "num_input_tokens_seen": 5297856, "step": 5520 }, { "epoch": 0.4688560760353021, "grad_norm": 6.718729019165039, "learning_rate": 1.2798586391292689e-06, "loss": 0.0859, "num_input_tokens_seen": 5302784, "step": 5525 }, { "epoch": 0.4692803801765105, "grad_norm": 43.81071472167969, "learning_rate": 1.2784363343353848e-06, "loss": 0.0555, "num_input_tokens_seen": 5307648, "step": 5530 }, { "epoch": 0.46970468431771895, "grad_norm": 6.245013236999512, "learning_rate": 1.2770134186779158e-06, "loss": 0.0496, "num_input_tokens_seen": 5311680, "step": 5535 }, { "epoch": 0.47012898845892737, "grad_norm": 7.27108907699585, "learning_rate": 1.2755898952786076e-06, "loss": 0.053, "num_input_tokens_seen": 5316288, "step": 5540 }, { "epoch": 0.4705532926001358, "grad_norm": 17.36294937133789, "learning_rate": 1.2741657672605385e-06, "loss": 0.0291, "num_input_tokens_seen": 5320448, "step": 5545 }, { "epoch": 0.4709775967413442, "grad_norm": 1.1560553312301636, "learning_rate": 1.272741037748114e-06, "loss": 0.0483, "num_input_tokens_seen": 5324928, "step": 5550 }, { "epoch": 0.4714019008825526, "grad_norm": 24.80785369873047, "learning_rate": 1.2713157098670588e-06, "loss": 0.0429, "num_input_tokens_seen": 5329792, "step": 5555 }, { "epoch": 0.47182620502376105, "grad_norm": 13.043895721435547, "learning_rate": 1.2698897867444112e-06, "loss": 0.0697, "num_input_tokens_seen": 5334720, "step": 5560 }, { "epoch": 0.47225050916496947, "grad_norm": 17.10675621032715, "learning_rate": 1.268463271508514e-06, "loss": 0.0769, "num_input_tokens_seen": 5339968, "step": 5565 }, { "epoch": 0.4726748133061779, "grad_norm": 0.16105744242668152, "learning_rate": 1.2670361672890099e-06, "loss": 0.0083, "num_input_tokens_seen": 5345216, "step": 5570 }, { "epoch": 0.4730991174473863, "grad_norm": 15.116951942443848, "learning_rate": 1.265608477216834e-06, "loss": 0.116, "num_input_tokens_seen": 5350400, "step": 5575 }, { "epoch": 0.47352342158859473, "grad_norm": 4.960959434509277, "learning_rate": 1.2641802044242065e-06, "loss": 0.0868, "num_input_tokens_seen": 5354944, "step": 5580 }, { "epoch": 0.47394772572980315, "grad_norm": 0.19331686198711395, "learning_rate": 1.2627513520446252e-06, "loss": 0.1187, "num_input_tokens_seen": 5359040, "step": 5585 }, { "epoch": 0.4743720298710115, "grad_norm": 4.621866226196289, "learning_rate": 1.2613219232128608e-06, "loss": 0.0866, "num_input_tokens_seen": 5363584, "step": 5590 }, { "epoch": 0.47479633401221993, "grad_norm": 12.511336326599121, "learning_rate": 1.2598919210649475e-06, "loss": 0.0184, "num_input_tokens_seen": 5368256, "step": 5595 }, { "epoch": 0.47522063815342835, "grad_norm": 0.6315953731536865, "learning_rate": 1.2584613487381787e-06, "loss": 0.0563, "num_input_tokens_seen": 5372800, "step": 5600 }, { "epoch": 0.4756449422946368, "grad_norm": 6.170360088348389, "learning_rate": 1.257030209371097e-06, "loss": 0.0413, "num_input_tokens_seen": 5377280, "step": 5605 }, { "epoch": 0.4760692464358452, "grad_norm": 6.3370466232299805, "learning_rate": 1.2555985061034902e-06, "loss": 0.1219, "num_input_tokens_seen": 5382208, "step": 5610 }, { "epoch": 0.4764935505770536, "grad_norm": 11.882719993591309, "learning_rate": 1.2541662420763832e-06, "loss": 0.0935, "num_input_tokens_seen": 5386816, "step": 5615 }, { "epoch": 0.47691785471826204, "grad_norm": 11.76760482788086, "learning_rate": 1.2527334204320306e-06, "loss": 0.0457, "num_input_tokens_seen": 5391360, "step": 5620 }, { "epoch": 0.47734215885947046, "grad_norm": 14.905533790588379, "learning_rate": 1.251300044313911e-06, "loss": 0.0584, "num_input_tokens_seen": 5395904, "step": 5625 }, { "epoch": 0.4777664630006789, "grad_norm": 0.44021788239479065, "learning_rate": 1.2498661168667188e-06, "loss": 0.0555, "num_input_tokens_seen": 5400448, "step": 5630 }, { "epoch": 0.4781907671418873, "grad_norm": 9.252647399902344, "learning_rate": 1.2484316412363585e-06, "loss": 0.0431, "num_input_tokens_seen": 5405824, "step": 5635 }, { "epoch": 0.4786150712830957, "grad_norm": 0.9856930375099182, "learning_rate": 1.246996620569937e-06, "loss": 0.0407, "num_input_tokens_seen": 5410688, "step": 5640 }, { "epoch": 0.47903937542430414, "grad_norm": 6.884036064147949, "learning_rate": 1.245561058015757e-06, "loss": 0.0646, "num_input_tokens_seen": 5415296, "step": 5645 }, { "epoch": 0.47946367956551256, "grad_norm": 7.797816276550293, "learning_rate": 1.2441249567233098e-06, "loss": 0.1001, "num_input_tokens_seen": 5419648, "step": 5650 }, { "epoch": 0.479887983706721, "grad_norm": 5.869114875793457, "learning_rate": 1.2426883198432696e-06, "loss": 0.0398, "num_input_tokens_seen": 5424576, "step": 5655 }, { "epoch": 0.4803122878479294, "grad_norm": 4.114724636077881, "learning_rate": 1.2412511505274844e-06, "loss": 0.033, "num_input_tokens_seen": 5429184, "step": 5660 }, { "epoch": 0.4807365919891378, "grad_norm": 1.3931119441986084, "learning_rate": 1.2398134519289708e-06, "loss": 0.0589, "num_input_tokens_seen": 5433536, "step": 5665 }, { "epoch": 0.48116089613034624, "grad_norm": 11.368043899536133, "learning_rate": 1.2383752272019071e-06, "loss": 0.1094, "num_input_tokens_seen": 5438464, "step": 5670 }, { "epoch": 0.48158520027155466, "grad_norm": 16.947954177856445, "learning_rate": 1.2369364795016252e-06, "loss": 0.0679, "num_input_tokens_seen": 5443136, "step": 5675 }, { "epoch": 0.4820095044127631, "grad_norm": 2.892996072769165, "learning_rate": 1.2354972119846045e-06, "loss": 0.0332, "num_input_tokens_seen": 5447744, "step": 5680 }, { "epoch": 0.4824338085539715, "grad_norm": 1.6063815355300903, "learning_rate": 1.2340574278084648e-06, "loss": 0.0926, "num_input_tokens_seen": 5452800, "step": 5685 }, { "epoch": 0.4828581126951799, "grad_norm": 8.03439998626709, "learning_rate": 1.23261713013196e-06, "loss": 0.0565, "num_input_tokens_seen": 5457472, "step": 5690 }, { "epoch": 0.48328241683638834, "grad_norm": 10.29983901977539, "learning_rate": 1.2311763221149697e-06, "loss": 0.0754, "num_input_tokens_seen": 5462272, "step": 5695 }, { "epoch": 0.48370672097759676, "grad_norm": 14.466094017028809, "learning_rate": 1.2297350069184935e-06, "loss": 0.0176, "num_input_tokens_seen": 5466880, "step": 5700 }, { "epoch": 0.4841310251188052, "grad_norm": 7.963089942932129, "learning_rate": 1.228293187704644e-06, "loss": 0.0447, "num_input_tokens_seen": 5471616, "step": 5705 }, { "epoch": 0.4845553292600136, "grad_norm": 6.54095458984375, "learning_rate": 1.2268508676366393e-06, "loss": 0.0748, "num_input_tokens_seen": 5476160, "step": 5710 }, { "epoch": 0.484979633401222, "grad_norm": 19.244571685791016, "learning_rate": 1.225408049878796e-06, "loss": 0.1343, "num_input_tokens_seen": 5480960, "step": 5715 }, { "epoch": 0.48540393754243044, "grad_norm": 13.259824752807617, "learning_rate": 1.223964737596523e-06, "loss": 0.062, "num_input_tokens_seen": 5486528, "step": 5720 }, { "epoch": 0.4858282416836388, "grad_norm": 9.365538597106934, "learning_rate": 1.2225209339563143e-06, "loss": 0.0411, "num_input_tokens_seen": 5491456, "step": 5725 }, { "epoch": 0.4862525458248472, "grad_norm": 1.487982153892517, "learning_rate": 1.2210766421257419e-06, "loss": 0.0602, "num_input_tokens_seen": 5496640, "step": 5730 }, { "epoch": 0.48667684996605565, "grad_norm": 0.29796916246414185, "learning_rate": 1.2196318652734477e-06, "loss": 0.0293, "num_input_tokens_seen": 5501376, "step": 5735 }, { "epoch": 0.48710115410726407, "grad_norm": 11.434643745422363, "learning_rate": 1.2181866065691392e-06, "loss": 0.0521, "num_input_tokens_seen": 5505856, "step": 5740 }, { "epoch": 0.4875254582484725, "grad_norm": 14.557947158813477, "learning_rate": 1.2167408691835807e-06, "loss": 0.0564, "num_input_tokens_seen": 5510720, "step": 5745 }, { "epoch": 0.4879497623896809, "grad_norm": 2.0139198303222656, "learning_rate": 1.2152946562885857e-06, "loss": 0.0466, "num_input_tokens_seen": 5514880, "step": 5750 }, { "epoch": 0.48837406653088933, "grad_norm": 0.48408669233322144, "learning_rate": 1.2138479710570123e-06, "loss": 0.0491, "num_input_tokens_seen": 5519616, "step": 5755 }, { "epoch": 0.48879837067209775, "grad_norm": 17.641517639160156, "learning_rate": 1.2124008166627535e-06, "loss": 0.0656, "num_input_tokens_seen": 5523968, "step": 5760 }, { "epoch": 0.48922267481330617, "grad_norm": 0.3832457661628723, "learning_rate": 1.2109531962807332e-06, "loss": 0.0673, "num_input_tokens_seen": 5528960, "step": 5765 }, { "epoch": 0.4896469789545146, "grad_norm": 0.15046310424804688, "learning_rate": 1.2095051130868959e-06, "loss": 0.0817, "num_input_tokens_seen": 5535488, "step": 5770 }, { "epoch": 0.490071283095723, "grad_norm": 1.9801909923553467, "learning_rate": 1.2080565702582027e-06, "loss": 0.1018, "num_input_tokens_seen": 5540288, "step": 5775 }, { "epoch": 0.49049558723693143, "grad_norm": 39.119529724121094, "learning_rate": 1.2066075709726225e-06, "loss": 0.0485, "num_input_tokens_seen": 5545792, "step": 5780 }, { "epoch": 0.49091989137813985, "grad_norm": 13.806150436401367, "learning_rate": 1.2051581184091263e-06, "loss": 0.0674, "num_input_tokens_seen": 5550336, "step": 5785 }, { "epoch": 0.49134419551934827, "grad_norm": 10.708805084228516, "learning_rate": 1.2037082157476782e-06, "loss": 0.0727, "num_input_tokens_seen": 5555328, "step": 5790 }, { "epoch": 0.4917684996605567, "grad_norm": 30.937210083007812, "learning_rate": 1.2022578661692312e-06, "loss": 0.0649, "num_input_tokens_seen": 5560896, "step": 5795 }, { "epoch": 0.4921928038017651, "grad_norm": 12.959268569946289, "learning_rate": 1.2008070728557185e-06, "loss": 0.0633, "num_input_tokens_seen": 5565824, "step": 5800 }, { "epoch": 0.49261710794297353, "grad_norm": 4.291776180267334, "learning_rate": 1.1993558389900462e-06, "loss": 0.0425, "num_input_tokens_seen": 5570368, "step": 5805 }, { "epoch": 0.49304141208418195, "grad_norm": 13.795291900634766, "learning_rate": 1.197904167756087e-06, "loss": 0.0633, "num_input_tokens_seen": 5574848, "step": 5810 }, { "epoch": 0.49346571622539037, "grad_norm": 2.7180373668670654, "learning_rate": 1.1964520623386741e-06, "loss": 0.0689, "num_input_tokens_seen": 5579456, "step": 5815 }, { "epoch": 0.4938900203665988, "grad_norm": 1.1406170129776, "learning_rate": 1.1949995259235919e-06, "loss": 0.0423, "num_input_tokens_seen": 5584384, "step": 5820 }, { "epoch": 0.4943143245078072, "grad_norm": 28.64975357055664, "learning_rate": 1.1935465616975716e-06, "loss": 0.123, "num_input_tokens_seen": 5589632, "step": 5825 }, { "epoch": 0.49473862864901563, "grad_norm": 23.525083541870117, "learning_rate": 1.192093172848282e-06, "loss": 0.0281, "num_input_tokens_seen": 5594048, "step": 5830 }, { "epoch": 0.49516293279022405, "grad_norm": 27.362192153930664, "learning_rate": 1.1906393625643242e-06, "loss": 0.0515, "num_input_tokens_seen": 5598720, "step": 5835 }, { "epoch": 0.4955872369314325, "grad_norm": 10.059873580932617, "learning_rate": 1.1891851340352235e-06, "loss": 0.1113, "num_input_tokens_seen": 5603136, "step": 5840 }, { "epoch": 0.4960115410726409, "grad_norm": 0.4605356454849243, "learning_rate": 1.1877304904514232e-06, "loss": 0.0388, "num_input_tokens_seen": 5607872, "step": 5845 }, { "epoch": 0.4964358452138493, "grad_norm": 2.450542688369751, "learning_rate": 1.1862754350042764e-06, "loss": 0.039, "num_input_tokens_seen": 5612352, "step": 5850 }, { "epoch": 0.4968601493550577, "grad_norm": 0.1743314415216446, "learning_rate": 1.1848199708860404e-06, "loss": 0.077, "num_input_tokens_seen": 5617472, "step": 5855 }, { "epoch": 0.4972844534962661, "grad_norm": 10.397873878479004, "learning_rate": 1.183364101289869e-06, "loss": 0.0934, "num_input_tokens_seen": 5621824, "step": 5860 }, { "epoch": 0.4977087576374745, "grad_norm": 0.9554150104522705, "learning_rate": 1.1819078294098057e-06, "loss": 0.0631, "num_input_tokens_seen": 5626304, "step": 5865 }, { "epoch": 0.49813306177868294, "grad_norm": 18.06666374206543, "learning_rate": 1.180451158440776e-06, "loss": 0.0413, "num_input_tokens_seen": 5631680, "step": 5870 }, { "epoch": 0.49855736591989136, "grad_norm": 7.613068580627441, "learning_rate": 1.1789940915785823e-06, "loss": 0.115, "num_input_tokens_seen": 5635904, "step": 5875 }, { "epoch": 0.4989816700610998, "grad_norm": 0.9774286150932312, "learning_rate": 1.177536632019894e-06, "loss": 0.0422, "num_input_tokens_seen": 5640512, "step": 5880 }, { "epoch": 0.4994059742023082, "grad_norm": 1.4857728481292725, "learning_rate": 1.1760787829622423e-06, "loss": 0.0691, "num_input_tokens_seen": 5646464, "step": 5885 }, { "epoch": 0.4998302783435166, "grad_norm": 0.16748501360416412, "learning_rate": 1.1746205476040137e-06, "loss": 0.0249, "num_input_tokens_seen": 5651008, "step": 5890 }, { "epoch": 0.5002545824847251, "grad_norm": 17.213905334472656, "learning_rate": 1.173161929144442e-06, "loss": 0.0536, "num_input_tokens_seen": 5655616, "step": 5895 }, { "epoch": 0.5006788866259335, "grad_norm": 0.5089601278305054, "learning_rate": 1.171702930783601e-06, "loss": 0.0293, "num_input_tokens_seen": 5660352, "step": 5900 }, { "epoch": 0.5006788866259335, "eval_loss": 0.05498848110437393, "eval_runtime": 15.8725, "eval_samples_per_second": 659.946, "eval_steps_per_second": 82.533, "num_input_tokens_seen": 5660352, "step": 5900 }, { "epoch": 0.5011031907671419, "grad_norm": 0.7019211649894714, "learning_rate": 1.1702435557223986e-06, "loss": 0.0627, "num_input_tokens_seen": 5664832, "step": 5905 }, { "epoch": 0.5015274949083504, "grad_norm": 1.023059368133545, "learning_rate": 1.1687838071625684e-06, "loss": 0.0832, "num_input_tokens_seen": 5669824, "step": 5910 }, { "epoch": 0.5019517990495588, "grad_norm": 0.6381323337554932, "learning_rate": 1.167323688306664e-06, "loss": 0.0744, "num_input_tokens_seen": 5674240, "step": 5915 }, { "epoch": 0.5023761031907671, "grad_norm": 12.75153636932373, "learning_rate": 1.1658632023580515e-06, "loss": 0.0557, "num_input_tokens_seen": 5679296, "step": 5920 }, { "epoch": 0.5028004073319755, "grad_norm": 25.601726531982422, "learning_rate": 1.1644023525209014e-06, "loss": 0.0411, "num_input_tokens_seen": 5683840, "step": 5925 }, { "epoch": 0.5032247114731839, "grad_norm": 15.45758056640625, "learning_rate": 1.162941142000184e-06, "loss": 0.0754, "num_input_tokens_seen": 5688896, "step": 5930 }, { "epoch": 0.5036490156143923, "grad_norm": 2.823531150817871, "learning_rate": 1.1614795740016598e-06, "loss": 0.0428, "num_input_tokens_seen": 5693440, "step": 5935 }, { "epoch": 0.5040733197556008, "grad_norm": 0.22975222766399384, "learning_rate": 1.160017651731874e-06, "loss": 0.073, "num_input_tokens_seen": 5697920, "step": 5940 }, { "epoch": 0.5044976238968092, "grad_norm": 0.3254455327987671, "learning_rate": 1.1585553783981486e-06, "loss": 0.0417, "num_input_tokens_seen": 5702528, "step": 5945 }, { "epoch": 0.5049219280380176, "grad_norm": 18.13775062561035, "learning_rate": 1.1570927572085766e-06, "loss": 0.0924, "num_input_tokens_seen": 5707584, "step": 5950 }, { "epoch": 0.505346232179226, "grad_norm": 9.583961486816406, "learning_rate": 1.1556297913720137e-06, "loss": 0.046, "num_input_tokens_seen": 5712192, "step": 5955 }, { "epoch": 0.5057705363204344, "grad_norm": 32.913612365722656, "learning_rate": 1.1541664840980715e-06, "loss": 0.1587, "num_input_tokens_seen": 5717632, "step": 5960 }, { "epoch": 0.5061948404616429, "grad_norm": 4.711894989013672, "learning_rate": 1.1527028385971107e-06, "loss": 0.0854, "num_input_tokens_seen": 5722176, "step": 5965 }, { "epoch": 0.5066191446028513, "grad_norm": 12.714658737182617, "learning_rate": 1.1512388580802348e-06, "loss": 0.0459, "num_input_tokens_seen": 5726720, "step": 5970 }, { "epoch": 0.5070434487440597, "grad_norm": 10.083311080932617, "learning_rate": 1.1497745457592815e-06, "loss": 0.0672, "num_input_tokens_seen": 5731328, "step": 5975 }, { "epoch": 0.5074677528852681, "grad_norm": 1.8064274787902832, "learning_rate": 1.1483099048468168e-06, "loss": 0.0614, "num_input_tokens_seen": 5736256, "step": 5980 }, { "epoch": 0.5078920570264766, "grad_norm": 12.606705665588379, "learning_rate": 1.1468449385561272e-06, "loss": 0.0764, "num_input_tokens_seen": 5741248, "step": 5985 }, { "epoch": 0.508316361167685, "grad_norm": 5.615732669830322, "learning_rate": 1.145379650101214e-06, "loss": 0.0344, "num_input_tokens_seen": 5746304, "step": 5990 }, { "epoch": 0.5087406653088934, "grad_norm": 0.6430853605270386, "learning_rate": 1.143914042696784e-06, "loss": 0.097, "num_input_tokens_seen": 5751552, "step": 5995 }, { "epoch": 0.5091649694501018, "grad_norm": 2.67842173576355, "learning_rate": 1.1424481195582445e-06, "loss": 0.0568, "num_input_tokens_seen": 5756032, "step": 6000 }, { "epoch": 0.5095892735913102, "grad_norm": 17.653491973876953, "learning_rate": 1.1409818839016958e-06, "loss": 0.0818, "num_input_tokens_seen": 5761600, "step": 6005 }, { "epoch": 0.5100135777325187, "grad_norm": 1.1732429265975952, "learning_rate": 1.1395153389439231e-06, "loss": 0.0286, "num_input_tokens_seen": 5766336, "step": 6010 }, { "epoch": 0.5104378818737271, "grad_norm": 17.17027473449707, "learning_rate": 1.1380484879023903e-06, "loss": 0.0889, "num_input_tokens_seen": 5771392, "step": 6015 }, { "epoch": 0.5108621860149355, "grad_norm": 25.292057037353516, "learning_rate": 1.1365813339952334e-06, "loss": 0.0375, "num_input_tokens_seen": 5775808, "step": 6020 }, { "epoch": 0.5112864901561439, "grad_norm": 1.390062928199768, "learning_rate": 1.1351138804412524e-06, "loss": 0.0905, "num_input_tokens_seen": 5780800, "step": 6025 }, { "epoch": 0.5117107942973523, "grad_norm": 26.388877868652344, "learning_rate": 1.1336461304599047e-06, "loss": 0.031, "num_input_tokens_seen": 5786304, "step": 6030 }, { "epoch": 0.5121350984385608, "grad_norm": 0.2944334149360657, "learning_rate": 1.1321780872712983e-06, "loss": 0.0295, "num_input_tokens_seen": 5791360, "step": 6035 }, { "epoch": 0.5125594025797692, "grad_norm": 1.198797583580017, "learning_rate": 1.1307097540961838e-06, "loss": 0.019, "num_input_tokens_seen": 5795840, "step": 6040 }, { "epoch": 0.5129837067209776, "grad_norm": 16.838085174560547, "learning_rate": 1.129241134155949e-06, "loss": 0.0386, "num_input_tokens_seen": 5800576, "step": 6045 }, { "epoch": 0.513408010862186, "grad_norm": 0.14390717446804047, "learning_rate": 1.1277722306726103e-06, "loss": 0.0621, "num_input_tokens_seen": 5805632, "step": 6050 }, { "epoch": 0.5138323150033944, "grad_norm": 0.09065728634595871, "learning_rate": 1.1263030468688057e-06, "loss": 0.0254, "num_input_tokens_seen": 5810688, "step": 6055 }, { "epoch": 0.5142566191446029, "grad_norm": 0.3492352366447449, "learning_rate": 1.1248335859677891e-06, "loss": 0.0513, "num_input_tokens_seen": 5815616, "step": 6060 }, { "epoch": 0.5146809232858113, "grad_norm": 16.324581146240234, "learning_rate": 1.1233638511934218e-06, "loss": 0.0772, "num_input_tokens_seen": 5820672, "step": 6065 }, { "epoch": 0.5151052274270197, "grad_norm": 6.548031806945801, "learning_rate": 1.121893845770166e-06, "loss": 0.0595, "num_input_tokens_seen": 5824896, "step": 6070 }, { "epoch": 0.5155295315682281, "grad_norm": 1.5588712692260742, "learning_rate": 1.120423572923078e-06, "loss": 0.0458, "num_input_tokens_seen": 5829632, "step": 6075 }, { "epoch": 0.5159538357094365, "grad_norm": 8.870634078979492, "learning_rate": 1.1189530358778004e-06, "loss": 0.0292, "num_input_tokens_seen": 5834240, "step": 6080 }, { "epoch": 0.516378139850645, "grad_norm": 0.9702350497245789, "learning_rate": 1.1174822378605551e-06, "loss": 0.0951, "num_input_tokens_seen": 5838784, "step": 6085 }, { "epoch": 0.5168024439918534, "grad_norm": 19.710203170776367, "learning_rate": 1.116011182098138e-06, "loss": 0.0588, "num_input_tokens_seen": 5843072, "step": 6090 }, { "epoch": 0.5172267481330618, "grad_norm": 15.01016902923584, "learning_rate": 1.1145398718179085e-06, "loss": 0.0476, "num_input_tokens_seen": 5847360, "step": 6095 }, { "epoch": 0.5176510522742702, "grad_norm": 0.10758156329393387, "learning_rate": 1.1130683102477862e-06, "loss": 0.019, "num_input_tokens_seen": 5852224, "step": 6100 }, { "epoch": 0.5180753564154786, "grad_norm": 5.596349239349365, "learning_rate": 1.1115965006162405e-06, "loss": 0.0241, "num_input_tokens_seen": 5857152, "step": 6105 }, { "epoch": 0.5184996605566871, "grad_norm": 0.04970073699951172, "learning_rate": 1.110124446152286e-06, "loss": 0.1214, "num_input_tokens_seen": 5861888, "step": 6110 }, { "epoch": 0.5189239646978955, "grad_norm": 10.609586715698242, "learning_rate": 1.1086521500854744e-06, "loss": 0.1294, "num_input_tokens_seen": 5866496, "step": 6115 }, { "epoch": 0.5193482688391039, "grad_norm": 13.855015754699707, "learning_rate": 1.1071796156458868e-06, "loss": 0.0606, "num_input_tokens_seen": 5870912, "step": 6120 }, { "epoch": 0.5197725729803123, "grad_norm": 0.7808138728141785, "learning_rate": 1.1057068460641281e-06, "loss": 0.0881, "num_input_tokens_seen": 5876672, "step": 6125 }, { "epoch": 0.5201968771215207, "grad_norm": 16.916934967041016, "learning_rate": 1.1042338445713183e-06, "loss": 0.0278, "num_input_tokens_seen": 5881024, "step": 6130 }, { "epoch": 0.5206211812627292, "grad_norm": 8.180154800415039, "learning_rate": 1.1027606143990867e-06, "loss": 0.1703, "num_input_tokens_seen": 5886080, "step": 6135 }, { "epoch": 0.5210454854039376, "grad_norm": 15.013484954833984, "learning_rate": 1.1012871587795638e-06, "loss": 0.0505, "num_input_tokens_seen": 5890880, "step": 6140 }, { "epoch": 0.521469789545146, "grad_norm": 1.3397859334945679, "learning_rate": 1.0998134809453756e-06, "loss": 0.0369, "num_input_tokens_seen": 5895424, "step": 6145 }, { "epoch": 0.5218940936863544, "grad_norm": 0.7837854623794556, "learning_rate": 1.0983395841296347e-06, "loss": 0.0887, "num_input_tokens_seen": 5900352, "step": 6150 }, { "epoch": 0.5223183978275628, "grad_norm": 22.695222854614258, "learning_rate": 1.0968654715659347e-06, "loss": 0.0695, "num_input_tokens_seen": 5904960, "step": 6155 }, { "epoch": 0.5227427019687713, "grad_norm": 8.58603572845459, "learning_rate": 1.095391146488342e-06, "loss": 0.0687, "num_input_tokens_seen": 5910016, "step": 6160 }, { "epoch": 0.5231670061099797, "grad_norm": 6.1412200927734375, "learning_rate": 1.09391661213139e-06, "loss": 0.0779, "num_input_tokens_seen": 5914944, "step": 6165 }, { "epoch": 0.5235913102511881, "grad_norm": 8.769179344177246, "learning_rate": 1.0924418717300707e-06, "loss": 0.0389, "num_input_tokens_seen": 5920448, "step": 6170 }, { "epoch": 0.5240156143923965, "grad_norm": 20.312427520751953, "learning_rate": 1.090966928519828e-06, "loss": 0.0357, "num_input_tokens_seen": 5925696, "step": 6175 }, { "epoch": 0.5244399185336049, "grad_norm": 5.0691938400268555, "learning_rate": 1.0894917857365511e-06, "loss": 0.0356, "num_input_tokens_seen": 5930624, "step": 6180 }, { "epoch": 0.5248642226748133, "grad_norm": 10.31592082977295, "learning_rate": 1.0880164466165673e-06, "loss": 0.0751, "num_input_tokens_seen": 5935168, "step": 6185 }, { "epoch": 0.5252885268160217, "grad_norm": 0.4381231665611267, "learning_rate": 1.0865409143966338e-06, "loss": 0.0332, "num_input_tokens_seen": 5939712, "step": 6190 }, { "epoch": 0.5257128309572301, "grad_norm": 3.2070534229278564, "learning_rate": 1.0850651923139317e-06, "loss": 0.0108, "num_input_tokens_seen": 5944576, "step": 6195 }, { "epoch": 0.5261371350984385, "grad_norm": 27.044889450073242, "learning_rate": 1.0835892836060598e-06, "loss": 0.0517, "num_input_tokens_seen": 5949184, "step": 6200 }, { "epoch": 0.5265614392396469, "grad_norm": 1.5127665996551514, "learning_rate": 1.0821131915110246e-06, "loss": 0.0215, "num_input_tokens_seen": 5954176, "step": 6205 }, { "epoch": 0.5269857433808554, "grad_norm": 13.82748794555664, "learning_rate": 1.080636919267236e-06, "loss": 0.0753, "num_input_tokens_seen": 5958656, "step": 6210 }, { "epoch": 0.5274100475220638, "grad_norm": 13.576017379760742, "learning_rate": 1.079160470113499e-06, "loss": 0.0789, "num_input_tokens_seen": 5963264, "step": 6215 }, { "epoch": 0.5278343516632722, "grad_norm": 11.991615295410156, "learning_rate": 1.0776838472890064e-06, "loss": 0.0936, "num_input_tokens_seen": 5968576, "step": 6220 }, { "epoch": 0.5282586558044806, "grad_norm": 0.9006063342094421, "learning_rate": 1.0762070540333322e-06, "loss": 0.0034, "num_input_tokens_seen": 5973248, "step": 6225 }, { "epoch": 0.528682959945689, "grad_norm": 0.429584264755249, "learning_rate": 1.0747300935864243e-06, "loss": 0.0488, "num_input_tokens_seen": 5977920, "step": 6230 }, { "epoch": 0.5291072640868975, "grad_norm": 12.29738998413086, "learning_rate": 1.0732529691885977e-06, "loss": 0.0588, "num_input_tokens_seen": 5982656, "step": 6235 }, { "epoch": 0.5295315682281059, "grad_norm": 6.842536926269531, "learning_rate": 1.0717756840805263e-06, "loss": 0.0431, "num_input_tokens_seen": 5987392, "step": 6240 }, { "epoch": 0.5299558723693143, "grad_norm": 0.5230674743652344, "learning_rate": 1.0702982415032378e-06, "loss": 0.0505, "num_input_tokens_seen": 5993280, "step": 6245 }, { "epoch": 0.5303801765105227, "grad_norm": 23.495798110961914, "learning_rate": 1.068820644698104e-06, "loss": 0.0423, "num_input_tokens_seen": 5998272, "step": 6250 }, { "epoch": 0.5308044806517311, "grad_norm": 11.772712707519531, "learning_rate": 1.0673428969068363e-06, "loss": 0.023, "num_input_tokens_seen": 6002816, "step": 6255 }, { "epoch": 0.5312287847929396, "grad_norm": 23.144899368286133, "learning_rate": 1.0658650013714765e-06, "loss": 0.028, "num_input_tokens_seen": 6007744, "step": 6260 }, { "epoch": 0.531653088934148, "grad_norm": 0.028380636125802994, "learning_rate": 1.0643869613343906e-06, "loss": 0.099, "num_input_tokens_seen": 6011776, "step": 6265 }, { "epoch": 0.5320773930753564, "grad_norm": 0.04158276692032814, "learning_rate": 1.062908780038262e-06, "loss": 0.0939, "num_input_tokens_seen": 6017344, "step": 6270 }, { "epoch": 0.5325016972165648, "grad_norm": 9.953067779541016, "learning_rate": 1.0614304607260843e-06, "loss": 0.0401, "num_input_tokens_seen": 6022144, "step": 6275 }, { "epoch": 0.5329260013577732, "grad_norm": 22.902023315429688, "learning_rate": 1.0599520066411529e-06, "loss": 0.0351, "num_input_tokens_seen": 6027712, "step": 6280 }, { "epoch": 0.5333503054989817, "grad_norm": 13.597489356994629, "learning_rate": 1.0584734210270597e-06, "loss": 0.0586, "num_input_tokens_seen": 6032064, "step": 6285 }, { "epoch": 0.5337746096401901, "grad_norm": 0.03597741574048996, "learning_rate": 1.0569947071276845e-06, "loss": 0.0382, "num_input_tokens_seen": 6036288, "step": 6290 }, { "epoch": 0.5341989137813985, "grad_norm": 17.50570297241211, "learning_rate": 1.0555158681871897e-06, "loss": 0.0912, "num_input_tokens_seen": 6040960, "step": 6295 }, { "epoch": 0.5346232179226069, "grad_norm": 15.3289155960083, "learning_rate": 1.0540369074500103e-06, "loss": 0.0313, "num_input_tokens_seen": 6045376, "step": 6300 }, { "epoch": 0.5350475220638153, "grad_norm": 10.554159164428711, "learning_rate": 1.0525578281608503e-06, "loss": 0.0809, "num_input_tokens_seen": 6049856, "step": 6305 }, { "epoch": 0.5354718262050238, "grad_norm": 0.8359688520431519, "learning_rate": 1.0510786335646725e-06, "loss": 0.0089, "num_input_tokens_seen": 6054144, "step": 6310 }, { "epoch": 0.5358961303462322, "grad_norm": 6.878500938415527, "learning_rate": 1.0495993269066935e-06, "loss": 0.0477, "num_input_tokens_seen": 6060032, "step": 6315 }, { "epoch": 0.5363204344874406, "grad_norm": 15.124363899230957, "learning_rate": 1.0481199114323746e-06, "loss": 0.132, "num_input_tokens_seen": 6065280, "step": 6320 }, { "epoch": 0.536744738628649, "grad_norm": 0.35588526725769043, "learning_rate": 1.0466403903874175e-06, "loss": 0.0846, "num_input_tokens_seen": 6070080, "step": 6325 }, { "epoch": 0.5371690427698574, "grad_norm": 6.994991779327393, "learning_rate": 1.0451607670177543e-06, "loss": 0.0642, "num_input_tokens_seen": 6076032, "step": 6330 }, { "epoch": 0.5375933469110659, "grad_norm": 11.849706649780273, "learning_rate": 1.0436810445695421e-06, "loss": 0.0577, "num_input_tokens_seen": 6080768, "step": 6335 }, { "epoch": 0.5380176510522743, "grad_norm": 10.08267879486084, "learning_rate": 1.0422012262891548e-06, "loss": 0.0294, "num_input_tokens_seen": 6085312, "step": 6340 }, { "epoch": 0.5384419551934827, "grad_norm": 19.3542537689209, "learning_rate": 1.0407213154231774e-06, "loss": 0.0297, "num_input_tokens_seen": 6090048, "step": 6345 }, { "epoch": 0.5388662593346911, "grad_norm": 56.21127700805664, "learning_rate": 1.0392413152183973e-06, "loss": 0.0992, "num_input_tokens_seen": 6094720, "step": 6350 }, { "epoch": 0.5392905634758995, "grad_norm": 18.282859802246094, "learning_rate": 1.0377612289217982e-06, "loss": 0.1116, "num_input_tokens_seen": 6099456, "step": 6355 }, { "epoch": 0.539714867617108, "grad_norm": 0.08463834971189499, "learning_rate": 1.0362810597805524e-06, "loss": 0.033, "num_input_tokens_seen": 6104448, "step": 6360 }, { "epoch": 0.5401391717583164, "grad_norm": 0.2075161337852478, "learning_rate": 1.0348008110420149e-06, "loss": 0.0312, "num_input_tokens_seen": 6109056, "step": 6365 }, { "epoch": 0.5405634758995248, "grad_norm": 26.991762161254883, "learning_rate": 1.0333204859537142e-06, "loss": 0.088, "num_input_tokens_seen": 6114496, "step": 6370 }, { "epoch": 0.5409877800407332, "grad_norm": 0.06256894022226334, "learning_rate": 1.0318400877633466e-06, "loss": 0.0882, "num_input_tokens_seen": 6119360, "step": 6375 }, { "epoch": 0.5414120841819416, "grad_norm": 9.642011642456055, "learning_rate": 1.030359619718769e-06, "loss": 0.0926, "num_input_tokens_seen": 6124352, "step": 6380 }, { "epoch": 0.5418363883231501, "grad_norm": 1.3404682874679565, "learning_rate": 1.0288790850679916e-06, "loss": 0.0509, "num_input_tokens_seen": 6128832, "step": 6385 }, { "epoch": 0.5422606924643585, "grad_norm": 21.819734573364258, "learning_rate": 1.0273984870591706e-06, "loss": 0.1019, "num_input_tokens_seen": 6133312, "step": 6390 }, { "epoch": 0.5426849966055669, "grad_norm": 28.43328094482422, "learning_rate": 1.025917828940601e-06, "loss": 0.0668, "num_input_tokens_seen": 6137600, "step": 6395 }, { "epoch": 0.5431093007467753, "grad_norm": 6.486978054046631, "learning_rate": 1.02443711396071e-06, "loss": 0.0544, "num_input_tokens_seen": 6142464, "step": 6400 }, { "epoch": 0.5435336048879837, "grad_norm": 0.3852335512638092, "learning_rate": 1.0229563453680495e-06, "loss": 0.0586, "num_input_tokens_seen": 6147072, "step": 6405 }, { "epoch": 0.5439579090291922, "grad_norm": 25.111413955688477, "learning_rate": 1.021475526411289e-06, "loss": 0.045, "num_input_tokens_seen": 6151744, "step": 6410 }, { "epoch": 0.5443822131704006, "grad_norm": 9.803653717041016, "learning_rate": 1.0199946603392078e-06, "loss": 0.085, "num_input_tokens_seen": 6156672, "step": 6415 }, { "epoch": 0.544806517311609, "grad_norm": 37.152095794677734, "learning_rate": 1.01851375040069e-06, "loss": 0.0563, "num_input_tokens_seen": 6161600, "step": 6420 }, { "epoch": 0.5452308214528174, "grad_norm": 0.2186758667230606, "learning_rate": 1.0170327998447149e-06, "loss": 0.0312, "num_input_tokens_seen": 6165760, "step": 6425 }, { "epoch": 0.5456551255940258, "grad_norm": 1.1985291242599487, "learning_rate": 1.015551811920351e-06, "loss": 0.0114, "num_input_tokens_seen": 6174912, "step": 6430 }, { "epoch": 0.5460794297352343, "grad_norm": 10.91264533996582, "learning_rate": 1.014070789876749e-06, "loss": 0.0856, "num_input_tokens_seen": 6179136, "step": 6435 }, { "epoch": 0.5465037338764427, "grad_norm": 1.4603297710418701, "learning_rate": 1.0125897369631342e-06, "loss": 0.1228, "num_input_tokens_seen": 6183680, "step": 6440 }, { "epoch": 0.546928038017651, "grad_norm": 32.919151306152344, "learning_rate": 1.0111086564288003e-06, "loss": 0.0587, "num_input_tokens_seen": 6188608, "step": 6445 }, { "epoch": 0.5473523421588594, "grad_norm": 2.691481828689575, "learning_rate": 1.009627551523101e-06, "loss": 0.0592, "num_input_tokens_seen": 6193600, "step": 6450 }, { "epoch": 0.5477766463000678, "grad_norm": 0.18778395652770996, "learning_rate": 1.008146425495443e-06, "loss": 0.0367, "num_input_tokens_seen": 6198528, "step": 6455 }, { "epoch": 0.5482009504412763, "grad_norm": 0.21035797894001007, "learning_rate": 1.0066652815952805e-06, "loss": 0.0359, "num_input_tokens_seen": 6204096, "step": 6460 }, { "epoch": 0.5486252545824847, "grad_norm": 20.11526107788086, "learning_rate": 1.0051841230721063e-06, "loss": 0.0465, "num_input_tokens_seen": 6208704, "step": 6465 }, { "epoch": 0.5490495587236931, "grad_norm": 7.261089324951172, "learning_rate": 1.0037029531754453e-06, "loss": 0.1061, "num_input_tokens_seen": 6213440, "step": 6470 }, { "epoch": 0.5494738628649015, "grad_norm": 0.9366330504417419, "learning_rate": 1.002221775154847e-06, "loss": 0.0551, "num_input_tokens_seen": 6218368, "step": 6475 }, { "epoch": 0.5498981670061099, "grad_norm": 2.7935664653778076, "learning_rate": 1.0007405922598793e-06, "loss": 0.0618, "num_input_tokens_seen": 6223616, "step": 6480 }, { "epoch": 0.5503224711473184, "grad_norm": 0.42181113362312317, "learning_rate": 9.992594077401208e-07, "loss": 0.0938, "num_input_tokens_seen": 6228480, "step": 6485 }, { "epoch": 0.5507467752885268, "grad_norm": 5.8830180168151855, "learning_rate": 9.977782248451534e-07, "loss": 0.0534, "num_input_tokens_seen": 6232896, "step": 6490 }, { "epoch": 0.5507467752885268, "eval_loss": 0.05577274411916733, "eval_runtime": 15.823, "eval_samples_per_second": 662.009, "eval_steps_per_second": 82.791, "num_input_tokens_seen": 6232896, "step": 6490 }, { "epoch": 0.5511710794297352, "grad_norm": 37.08064651489258, "learning_rate": 9.962970468245548e-07, "loss": 0.0959, "num_input_tokens_seen": 6237696, "step": 6495 }, { "epoch": 0.5515953835709436, "grad_norm": 1.1415743827819824, "learning_rate": 9.948158769278939e-07, "loss": 0.0324, "num_input_tokens_seen": 6242304, "step": 6500 }, { "epoch": 0.552019687712152, "grad_norm": 7.484726905822754, "learning_rate": 9.933347184047194e-07, "loss": 0.0393, "num_input_tokens_seen": 6246976, "step": 6505 }, { "epoch": 0.5524439918533605, "grad_norm": 18.209239959716797, "learning_rate": 9.918535745045571e-07, "loss": 0.0629, "num_input_tokens_seen": 6251264, "step": 6510 }, { "epoch": 0.5528682959945689, "grad_norm": 14.765807151794434, "learning_rate": 9.903724484768991e-07, "loss": 0.0104, "num_input_tokens_seen": 6255872, "step": 6515 }, { "epoch": 0.5532926001357773, "grad_norm": 0.719184935092926, "learning_rate": 9.888913435711996e-07, "loss": 0.0381, "num_input_tokens_seen": 6260928, "step": 6520 }, { "epoch": 0.5537169042769857, "grad_norm": 17.972461700439453, "learning_rate": 9.874102630368658e-07, "loss": 0.04, "num_input_tokens_seen": 6265600, "step": 6525 }, { "epoch": 0.5541412084181941, "grad_norm": 0.6293085813522339, "learning_rate": 9.859292101232514e-07, "loss": 0.113, "num_input_tokens_seen": 6270464, "step": 6530 }, { "epoch": 0.5545655125594026, "grad_norm": 24.948253631591797, "learning_rate": 9.84448188079649e-07, "loss": 0.0957, "num_input_tokens_seen": 6274944, "step": 6535 }, { "epoch": 0.554989816700611, "grad_norm": 6.423430442810059, "learning_rate": 9.829672001552853e-07, "loss": 0.0578, "num_input_tokens_seen": 6279424, "step": 6540 }, { "epoch": 0.5554141208418194, "grad_norm": 4.687989711761475, "learning_rate": 9.8148624959931e-07, "loss": 0.1313, "num_input_tokens_seen": 6284096, "step": 6545 }, { "epoch": 0.5558384249830278, "grad_norm": 5.982985973358154, "learning_rate": 9.80005339660792e-07, "loss": 0.03, "num_input_tokens_seen": 6289728, "step": 6550 }, { "epoch": 0.5562627291242362, "grad_norm": 43.957584381103516, "learning_rate": 9.785244735887112e-07, "loss": 0.0289, "num_input_tokens_seen": 6294400, "step": 6555 }, { "epoch": 0.5566870332654447, "grad_norm": 7.876846790313721, "learning_rate": 9.770436546319504e-07, "loss": 0.0557, "num_input_tokens_seen": 6298880, "step": 6560 }, { "epoch": 0.5571113374066531, "grad_norm": 26.09071922302246, "learning_rate": 9.755628860392901e-07, "loss": 0.0636, "num_input_tokens_seen": 6303424, "step": 6565 }, { "epoch": 0.5575356415478615, "grad_norm": 32.834678649902344, "learning_rate": 9.740821710593988e-07, "loss": 0.026, "num_input_tokens_seen": 6308032, "step": 6570 }, { "epoch": 0.5579599456890699, "grad_norm": 0.24545079469680786, "learning_rate": 9.726015129408296e-07, "loss": 0.048, "num_input_tokens_seen": 6312832, "step": 6575 }, { "epoch": 0.5583842498302783, "grad_norm": 6.026395797729492, "learning_rate": 9.711209149320083e-07, "loss": 0.035, "num_input_tokens_seen": 6317312, "step": 6580 }, { "epoch": 0.5588085539714868, "grad_norm": 7.616768836975098, "learning_rate": 9.69640380281231e-07, "loss": 0.0341, "num_input_tokens_seen": 6321920, "step": 6585 }, { "epoch": 0.5592328581126952, "grad_norm": 1.705358624458313, "learning_rate": 9.681599122366533e-07, "loss": 0.0732, "num_input_tokens_seen": 6326336, "step": 6590 }, { "epoch": 0.5596571622539036, "grad_norm": 0.5510636568069458, "learning_rate": 9.66679514046286e-07, "loss": 0.0904, "num_input_tokens_seen": 6331008, "step": 6595 }, { "epoch": 0.560081466395112, "grad_norm": 12.033029556274414, "learning_rate": 9.65199188957985e-07, "loss": 0.0866, "num_input_tokens_seen": 6335744, "step": 6600 }, { "epoch": 0.5605057705363204, "grad_norm": 25.936338424682617, "learning_rate": 9.637189402194475e-07, "loss": 0.0558, "num_input_tokens_seen": 6340736, "step": 6605 }, { "epoch": 0.5609300746775289, "grad_norm": 14.279778480529785, "learning_rate": 9.622387710782017e-07, "loss": 0.0955, "num_input_tokens_seen": 6345216, "step": 6610 }, { "epoch": 0.5613543788187373, "grad_norm": 9.077555656433105, "learning_rate": 9.607586847816029e-07, "loss": 0.0383, "num_input_tokens_seen": 6350080, "step": 6615 }, { "epoch": 0.5617786829599457, "grad_norm": 38.76018524169922, "learning_rate": 9.592786845768225e-07, "loss": 0.0689, "num_input_tokens_seen": 6354816, "step": 6620 }, { "epoch": 0.5622029871011541, "grad_norm": 6.977408409118652, "learning_rate": 9.577987737108454e-07, "loss": 0.0538, "num_input_tokens_seen": 6360000, "step": 6625 }, { "epoch": 0.5626272912423625, "grad_norm": 7.492920875549316, "learning_rate": 9.563189554304578e-07, "loss": 0.1221, "num_input_tokens_seen": 6364672, "step": 6630 }, { "epoch": 0.563051595383571, "grad_norm": 0.4446965456008911, "learning_rate": 9.548392329822456e-07, "loss": 0.0828, "num_input_tokens_seen": 6369408, "step": 6635 }, { "epoch": 0.5634758995247794, "grad_norm": 10.316352844238281, "learning_rate": 9.533596096125825e-07, "loss": 0.0482, "num_input_tokens_seen": 6374080, "step": 6640 }, { "epoch": 0.5639002036659878, "grad_norm": 1.064753532409668, "learning_rate": 9.518800885676256e-07, "loss": 0.051, "num_input_tokens_seen": 6379200, "step": 6645 }, { "epoch": 0.5643245078071962, "grad_norm": 0.490021675825119, "learning_rate": 9.504006730933068e-07, "loss": 0.0577, "num_input_tokens_seen": 6384576, "step": 6650 }, { "epoch": 0.5647488119484046, "grad_norm": 8.95898151397705, "learning_rate": 9.489213664353276e-07, "loss": 0.0797, "num_input_tokens_seen": 6389760, "step": 6655 }, { "epoch": 0.5651731160896131, "grad_norm": 27.54326820373535, "learning_rate": 9.474421718391497e-07, "loss": 0.1317, "num_input_tokens_seen": 6394176, "step": 6660 }, { "epoch": 0.5655974202308215, "grad_norm": 17.441795349121094, "learning_rate": 9.459630925499897e-07, "loss": 0.0693, "num_input_tokens_seen": 6398976, "step": 6665 }, { "epoch": 0.5660217243720299, "grad_norm": 12.825746536254883, "learning_rate": 9.444841318128103e-07, "loss": 0.0414, "num_input_tokens_seen": 6403264, "step": 6670 }, { "epoch": 0.5664460285132383, "grad_norm": 11.503122329711914, "learning_rate": 9.430052928723152e-07, "loss": 0.0771, "num_input_tokens_seen": 6408128, "step": 6675 }, { "epoch": 0.5668703326544468, "grad_norm": 14.999883651733398, "learning_rate": 9.415265789729403e-07, "loss": 0.1131, "num_input_tokens_seen": 6412672, "step": 6680 }, { "epoch": 0.5672946367956552, "grad_norm": 0.7394747734069824, "learning_rate": 9.400479933588468e-07, "loss": 0.0724, "num_input_tokens_seen": 6417088, "step": 6685 }, { "epoch": 0.5677189409368636, "grad_norm": 0.6897430419921875, "learning_rate": 9.385695392739156e-07, "loss": 0.0707, "num_input_tokens_seen": 6421824, "step": 6690 }, { "epoch": 0.568143245078072, "grad_norm": 2.056105852127075, "learning_rate": 9.370912199617376e-07, "loss": 0.0411, "num_input_tokens_seen": 6426560, "step": 6695 }, { "epoch": 0.5685675492192804, "grad_norm": 27.487417221069336, "learning_rate": 9.356130386656093e-07, "loss": 0.0867, "num_input_tokens_seen": 6431040, "step": 6700 }, { "epoch": 0.5689918533604889, "grad_norm": 24.241193771362305, "learning_rate": 9.341349986285234e-07, "loss": 0.0488, "num_input_tokens_seen": 6435968, "step": 6705 }, { "epoch": 0.5694161575016972, "grad_norm": 9.322391510009766, "learning_rate": 9.326571030931636e-07, "loss": 0.1258, "num_input_tokens_seen": 6440640, "step": 6710 }, { "epoch": 0.5698404616429056, "grad_norm": 0.4335402250289917, "learning_rate": 9.311793553018958e-07, "loss": 0.0646, "num_input_tokens_seen": 6445504, "step": 6715 }, { "epoch": 0.570264765784114, "grad_norm": 20.35150909423828, "learning_rate": 9.297017584967624e-07, "loss": 0.0453, "num_input_tokens_seen": 6449600, "step": 6720 }, { "epoch": 0.5706890699253224, "grad_norm": 2.6463088989257812, "learning_rate": 9.282243159194734e-07, "loss": 0.0386, "num_input_tokens_seen": 6454528, "step": 6725 }, { "epoch": 0.5711133740665308, "grad_norm": 0.11555840075016022, "learning_rate": 9.267470308114025e-07, "loss": 0.0768, "num_input_tokens_seen": 6459264, "step": 6730 }, { "epoch": 0.5715376782077393, "grad_norm": 20.841169357299805, "learning_rate": 9.252699064135758e-07, "loss": 0.078, "num_input_tokens_seen": 6463552, "step": 6735 }, { "epoch": 0.5719619823489477, "grad_norm": 0.19547468423843384, "learning_rate": 9.23792945966668e-07, "loss": 0.0072, "num_input_tokens_seen": 6468608, "step": 6740 }, { "epoch": 0.5723862864901561, "grad_norm": 11.617748260498047, "learning_rate": 9.223161527109936e-07, "loss": 0.0303, "num_input_tokens_seen": 6473408, "step": 6745 }, { "epoch": 0.5728105906313645, "grad_norm": 0.14943847060203552, "learning_rate": 9.208395298865014e-07, "loss": 0.0353, "num_input_tokens_seen": 6478656, "step": 6750 }, { "epoch": 0.573234894772573, "grad_norm": 6.509980201721191, "learning_rate": 9.19363080732764e-07, "loss": 0.0986, "num_input_tokens_seen": 6483328, "step": 6755 }, { "epoch": 0.5736591989137814, "grad_norm": 0.04856886342167854, "learning_rate": 9.178868084889756e-07, "loss": 0.0211, "num_input_tokens_seen": 6488064, "step": 6760 }, { "epoch": 0.5740835030549898, "grad_norm": 49.04362869262695, "learning_rate": 9.164107163939401e-07, "loss": 0.1111, "num_input_tokens_seen": 6492864, "step": 6765 }, { "epoch": 0.5745078071961982, "grad_norm": 0.417856901884079, "learning_rate": 9.149348076860685e-07, "loss": 0.038, "num_input_tokens_seen": 6497216, "step": 6770 }, { "epoch": 0.5749321113374066, "grad_norm": 8.41581916809082, "learning_rate": 9.134590856033664e-07, "loss": 0.0373, "num_input_tokens_seen": 6501888, "step": 6775 }, { "epoch": 0.575356415478615, "grad_norm": 21.148208618164062, "learning_rate": 9.11983553383433e-07, "loss": 0.0708, "num_input_tokens_seen": 6507200, "step": 6780 }, { "epoch": 0.5757807196198235, "grad_norm": 0.28288573026657104, "learning_rate": 9.105082142634489e-07, "loss": 0.0143, "num_input_tokens_seen": 6515840, "step": 6785 }, { "epoch": 0.5762050237610319, "grad_norm": 11.684349060058594, "learning_rate": 9.090330714801723e-07, "loss": 0.1098, "num_input_tokens_seen": 6520384, "step": 6790 }, { "epoch": 0.5766293279022403, "grad_norm": 4.981570243835449, "learning_rate": 9.075581282699294e-07, "loss": 0.0894, "num_input_tokens_seen": 6524992, "step": 6795 }, { "epoch": 0.5770536320434487, "grad_norm": 16.109561920166016, "learning_rate": 9.060833878686098e-07, "loss": 0.1289, "num_input_tokens_seen": 6532160, "step": 6800 }, { "epoch": 0.5774779361846571, "grad_norm": 1.6210087537765503, "learning_rate": 9.046088535116581e-07, "loss": 0.0264, "num_input_tokens_seen": 6536384, "step": 6805 }, { "epoch": 0.5779022403258656, "grad_norm": 1.7853858470916748, "learning_rate": 9.031345284340652e-07, "loss": 0.0253, "num_input_tokens_seen": 6540800, "step": 6810 }, { "epoch": 0.578326544467074, "grad_norm": 15.93824291229248, "learning_rate": 9.016604158703654e-07, "loss": 0.1609, "num_input_tokens_seen": 6545216, "step": 6815 }, { "epoch": 0.5787508486082824, "grad_norm": 0.15775707364082336, "learning_rate": 9.001865190546244e-07, "loss": 0.0496, "num_input_tokens_seen": 6550400, "step": 6820 }, { "epoch": 0.5791751527494908, "grad_norm": 2.840294599533081, "learning_rate": 8.987128412204363e-07, "loss": 0.0241, "num_input_tokens_seen": 6554752, "step": 6825 }, { "epoch": 0.5795994568906992, "grad_norm": 12.399237632751465, "learning_rate": 8.972393856009132e-07, "loss": 0.0436, "num_input_tokens_seen": 6559616, "step": 6830 }, { "epoch": 0.5800237610319077, "grad_norm": 18.532922744750977, "learning_rate": 8.957661554286817e-07, "loss": 0.0387, "num_input_tokens_seen": 6564608, "step": 6835 }, { "epoch": 0.5804480651731161, "grad_norm": 10.585027694702148, "learning_rate": 8.942931539358718e-07, "loss": 0.0477, "num_input_tokens_seen": 6569024, "step": 6840 }, { "epoch": 0.5808723693143245, "grad_norm": 0.14699456095695496, "learning_rate": 8.928203843541131e-07, "loss": 0.056, "num_input_tokens_seen": 6574016, "step": 6845 }, { "epoch": 0.5812966734555329, "grad_norm": 3.8574206829071045, "learning_rate": 8.913478499145254e-07, "loss": 0.0153, "num_input_tokens_seen": 6578944, "step": 6850 }, { "epoch": 0.5817209775967414, "grad_norm": 17.667245864868164, "learning_rate": 8.898755538477138e-07, "loss": 0.0494, "num_input_tokens_seen": 6584192, "step": 6855 }, { "epoch": 0.5821452817379498, "grad_norm": 13.899635314941406, "learning_rate": 8.884034993837594e-07, "loss": 0.1016, "num_input_tokens_seen": 6589056, "step": 6860 }, { "epoch": 0.5825695858791582, "grad_norm": 23.75398826599121, "learning_rate": 8.869316897522141e-07, "loss": 0.0777, "num_input_tokens_seen": 6593536, "step": 6865 }, { "epoch": 0.5829938900203666, "grad_norm": 17.726505279541016, "learning_rate": 8.854601281820914e-07, "loss": 0.0843, "num_input_tokens_seen": 6600128, "step": 6870 }, { "epoch": 0.583418194161575, "grad_norm": 9.943798065185547, "learning_rate": 8.839888179018621e-07, "loss": 0.0393, "num_input_tokens_seen": 6604864, "step": 6875 }, { "epoch": 0.5838424983027835, "grad_norm": 17.00997543334961, "learning_rate": 8.825177621394449e-07, "loss": 0.0228, "num_input_tokens_seen": 6609728, "step": 6880 }, { "epoch": 0.5842668024439919, "grad_norm": 0.7779857516288757, "learning_rate": 8.810469641222001e-07, "loss": 0.0307, "num_input_tokens_seen": 6615104, "step": 6885 }, { "epoch": 0.5846911065852003, "grad_norm": 0.525295078754425, "learning_rate": 8.795764270769221e-07, "loss": 0.0439, "num_input_tokens_seen": 6620096, "step": 6890 }, { "epoch": 0.5851154107264087, "grad_norm": 22.693933486938477, "learning_rate": 8.781061542298341e-07, "loss": 0.0911, "num_input_tokens_seen": 6624448, "step": 6895 }, { "epoch": 0.5855397148676171, "grad_norm": 26.4716854095459, "learning_rate": 8.766361488065783e-07, "loss": 0.1139, "num_input_tokens_seen": 6628800, "step": 6900 }, { "epoch": 0.5859640190088256, "grad_norm": 20.0168399810791, "learning_rate": 8.751664140322112e-07, "loss": 0.046, "num_input_tokens_seen": 6633664, "step": 6905 }, { "epoch": 0.586388323150034, "grad_norm": 19.89805030822754, "learning_rate": 8.736969531311942e-07, "loss": 0.0924, "num_input_tokens_seen": 6638720, "step": 6910 }, { "epoch": 0.5868126272912424, "grad_norm": 0.6657816171646118, "learning_rate": 8.7222776932739e-07, "loss": 0.0468, "num_input_tokens_seen": 6643008, "step": 6915 }, { "epoch": 0.5872369314324508, "grad_norm": 2.8613314628601074, "learning_rate": 8.70758865844051e-07, "loss": 0.0501, "num_input_tokens_seen": 6647360, "step": 6920 }, { "epoch": 0.5876612355736592, "grad_norm": 0.22243715822696686, "learning_rate": 8.69290245903816e-07, "loss": 0.0371, "num_input_tokens_seen": 6652032, "step": 6925 }, { "epoch": 0.5880855397148677, "grad_norm": 33.91285705566406, "learning_rate": 8.678219127287018e-07, "loss": 0.0833, "num_input_tokens_seen": 6656320, "step": 6930 }, { "epoch": 0.5885098438560761, "grad_norm": 2.0737743377685547, "learning_rate": 8.663538695400951e-07, "loss": 0.0731, "num_input_tokens_seen": 6660928, "step": 6935 }, { "epoch": 0.5889341479972845, "grad_norm": 0.5914128422737122, "learning_rate": 8.648861195587475e-07, "loss": 0.027, "num_input_tokens_seen": 6665856, "step": 6940 }, { "epoch": 0.5893584521384929, "grad_norm": 0.4644359350204468, "learning_rate": 8.634186660047663e-07, "loss": 0.0478, "num_input_tokens_seen": 6670144, "step": 6945 }, { "epoch": 0.5897827562797013, "grad_norm": 23.87814712524414, "learning_rate": 8.619515120976097e-07, "loss": 0.0928, "num_input_tokens_seen": 6675264, "step": 6950 }, { "epoch": 0.5902070604209098, "grad_norm": 8.000229835510254, "learning_rate": 8.60484661056077e-07, "loss": 0.13, "num_input_tokens_seen": 6679552, "step": 6955 }, { "epoch": 0.5906313645621182, "grad_norm": 17.253625869750977, "learning_rate": 8.590181160983043e-07, "loss": 0.0307, "num_input_tokens_seen": 6683904, "step": 6960 }, { "epoch": 0.5910556687033266, "grad_norm": 13.027657508850098, "learning_rate": 8.575518804417552e-07, "loss": 0.0527, "num_input_tokens_seen": 6688320, "step": 6965 }, { "epoch": 0.591479972844535, "grad_norm": 1.04340398311615, "learning_rate": 8.560859573032161e-07, "loss": 0.0419, "num_input_tokens_seen": 6693696, "step": 6970 }, { "epoch": 0.5919042769857433, "grad_norm": 2.2827279567718506, "learning_rate": 8.546203498987861e-07, "loss": 0.0631, "num_input_tokens_seen": 6698496, "step": 6975 }, { "epoch": 0.5923285811269517, "grad_norm": 8.269172668457031, "learning_rate": 8.531550614438729e-07, "loss": 0.0257, "num_input_tokens_seen": 6704192, "step": 6980 }, { "epoch": 0.5927528852681602, "grad_norm": 7.1176042556762695, "learning_rate": 8.516900951531832e-07, "loss": 0.0148, "num_input_tokens_seen": 6708480, "step": 6985 }, { "epoch": 0.5931771894093686, "grad_norm": 0.349401593208313, "learning_rate": 8.502254542407185e-07, "loss": 0.0556, "num_input_tokens_seen": 6713856, "step": 6990 }, { "epoch": 0.593601493550577, "grad_norm": 6.542207717895508, "learning_rate": 8.487611419197653e-07, "loss": 0.1009, "num_input_tokens_seen": 6719104, "step": 6995 }, { "epoch": 0.5940257976917854, "grad_norm": 23.163578033447266, "learning_rate": 8.472971614028895e-07, "loss": 0.0662, "num_input_tokens_seen": 6723328, "step": 7000 }, { "epoch": 0.5944501018329938, "grad_norm": 0.11186230182647705, "learning_rate": 8.458335159019288e-07, "loss": 0.0326, "num_input_tokens_seen": 6728064, "step": 7005 }, { "epoch": 0.5948744059742023, "grad_norm": 0.16995789110660553, "learning_rate": 8.443702086279866e-07, "loss": 0.0827, "num_input_tokens_seen": 6732864, "step": 7010 }, { "epoch": 0.5952987101154107, "grad_norm": 0.8537437915802002, "learning_rate": 8.429072427914235e-07, "loss": 0.0157, "num_input_tokens_seen": 6737792, "step": 7015 }, { "epoch": 0.5957230142566191, "grad_norm": 7.274645805358887, "learning_rate": 8.414446216018516e-07, "loss": 0.0443, "num_input_tokens_seen": 6742848, "step": 7020 }, { "epoch": 0.5961473183978275, "grad_norm": 0.6011242866516113, "learning_rate": 8.399823482681261e-07, "loss": 0.0561, "num_input_tokens_seen": 6748160, "step": 7025 }, { "epoch": 0.596571622539036, "grad_norm": 0.3317387104034424, "learning_rate": 8.385204259983403e-07, "loss": 0.0759, "num_input_tokens_seen": 6752960, "step": 7030 }, { "epoch": 0.5969959266802444, "grad_norm": 7.0838727951049805, "learning_rate": 8.37058857999816e-07, "loss": 0.0535, "num_input_tokens_seen": 6758016, "step": 7035 }, { "epoch": 0.5974202308214528, "grad_norm": 13.913290023803711, "learning_rate": 8.355976474790987e-07, "loss": 0.104, "num_input_tokens_seen": 6762688, "step": 7040 }, { "epoch": 0.5978445349626612, "grad_norm": 13.374975204467773, "learning_rate": 8.341367976419485e-07, "loss": 0.0365, "num_input_tokens_seen": 6767424, "step": 7045 }, { "epoch": 0.5982688391038696, "grad_norm": 8.739448547363281, "learning_rate": 8.326763116933359e-07, "loss": 0.0831, "num_input_tokens_seen": 6771648, "step": 7050 }, { "epoch": 0.598693143245078, "grad_norm": 8.212983131408691, "learning_rate": 8.312161928374317e-07, "loss": 0.0632, "num_input_tokens_seen": 6776832, "step": 7055 }, { "epoch": 0.5991174473862865, "grad_norm": 0.2802315950393677, "learning_rate": 8.297564442776012e-07, "loss": 0.0284, "num_input_tokens_seen": 6781120, "step": 7060 }, { "epoch": 0.5995417515274949, "grad_norm": 0.3204219937324524, "learning_rate": 8.282970692163988e-07, "loss": 0.0353, "num_input_tokens_seen": 6785472, "step": 7065 }, { "epoch": 0.5999660556687033, "grad_norm": 13.889245986938477, "learning_rate": 8.268380708555579e-07, "loss": 0.0856, "num_input_tokens_seen": 6791488, "step": 7070 }, { "epoch": 0.6003903598099117, "grad_norm": 0.5243129134178162, "learning_rate": 8.253794523959863e-07, "loss": 0.081, "num_input_tokens_seen": 6797120, "step": 7075 }, { "epoch": 0.6008146639511202, "grad_norm": 5.982780456542969, "learning_rate": 8.239212170377576e-07, "loss": 0.0467, "num_input_tokens_seen": 6801984, "step": 7080 }, { "epoch": 0.6008146639511202, "eval_loss": 0.05977928265929222, "eval_runtime": 15.8463, "eval_samples_per_second": 661.039, "eval_steps_per_second": 82.669, "num_input_tokens_seen": 6801984, "step": 7080 }, { "epoch": 0.6012389680923286, "grad_norm": 7.727343559265137, "learning_rate": 8.224633679801062e-07, "loss": 0.0763, "num_input_tokens_seen": 6806528, "step": 7085 }, { "epoch": 0.601663272233537, "grad_norm": 13.581296920776367, "learning_rate": 8.210059084214176e-07, "loss": 0.1491, "num_input_tokens_seen": 6811456, "step": 7090 }, { "epoch": 0.6020875763747454, "grad_norm": 0.4599580764770508, "learning_rate": 8.195488415592237e-07, "loss": 0.027, "num_input_tokens_seen": 6815872, "step": 7095 }, { "epoch": 0.6025118805159538, "grad_norm": 0.5491631627082825, "learning_rate": 8.180921705901941e-07, "loss": 0.0177, "num_input_tokens_seen": 6821376, "step": 7100 }, { "epoch": 0.6029361846571623, "grad_norm": 7.0324506759643555, "learning_rate": 8.16635898710131e-07, "loss": 0.0499, "num_input_tokens_seen": 6826688, "step": 7105 }, { "epoch": 0.6033604887983707, "grad_norm": 0.3288812041282654, "learning_rate": 8.151800291139596e-07, "loss": 0.0411, "num_input_tokens_seen": 6831680, "step": 7110 }, { "epoch": 0.6037847929395791, "grad_norm": 1.8278636932373047, "learning_rate": 8.137245649957239e-07, "loss": 0.0234, "num_input_tokens_seen": 6836032, "step": 7115 }, { "epoch": 0.6042090970807875, "grad_norm": 23.321144104003906, "learning_rate": 8.122695095485767e-07, "loss": 0.0511, "num_input_tokens_seen": 6840576, "step": 7120 }, { "epoch": 0.6046334012219959, "grad_norm": 0.11708024889230728, "learning_rate": 8.108148659647764e-07, "loss": 0.0629, "num_input_tokens_seen": 6845696, "step": 7125 }, { "epoch": 0.6050577053632044, "grad_norm": 0.16852536797523499, "learning_rate": 8.093606374356758e-07, "loss": 0.0581, "num_input_tokens_seen": 6849984, "step": 7130 }, { "epoch": 0.6054820095044128, "grad_norm": 0.33735671639442444, "learning_rate": 8.079068271517182e-07, "loss": 0.0429, "num_input_tokens_seen": 6854592, "step": 7135 }, { "epoch": 0.6059063136456212, "grad_norm": 30.76317024230957, "learning_rate": 8.064534383024284e-07, "loss": 0.1295, "num_input_tokens_seen": 6859584, "step": 7140 }, { "epoch": 0.6063306177868296, "grad_norm": 0.9654586911201477, "learning_rate": 8.050004740764082e-07, "loss": 0.062, "num_input_tokens_seen": 6864320, "step": 7145 }, { "epoch": 0.606754921928038, "grad_norm": 6.901752948760986, "learning_rate": 8.035479376613261e-07, "loss": 0.038, "num_input_tokens_seen": 6868672, "step": 7150 }, { "epoch": 0.6071792260692465, "grad_norm": 2.7611167430877686, "learning_rate": 8.020958322439132e-07, "loss": 0.0651, "num_input_tokens_seen": 6873088, "step": 7155 }, { "epoch": 0.6076035302104549, "grad_norm": 29.122224807739258, "learning_rate": 8.006441610099539e-07, "loss": 0.0519, "num_input_tokens_seen": 6877568, "step": 7160 }, { "epoch": 0.6080278343516633, "grad_norm": 13.105555534362793, "learning_rate": 7.991929271442817e-07, "loss": 0.1173, "num_input_tokens_seen": 6882112, "step": 7165 }, { "epoch": 0.6084521384928717, "grad_norm": 14.10412311553955, "learning_rate": 7.977421338307687e-07, "loss": 0.1062, "num_input_tokens_seen": 6886720, "step": 7170 }, { "epoch": 0.6088764426340801, "grad_norm": 0.5990397334098816, "learning_rate": 7.962917842523215e-07, "loss": 0.0485, "num_input_tokens_seen": 6891584, "step": 7175 }, { "epoch": 0.6093007467752886, "grad_norm": 34.55642318725586, "learning_rate": 7.94841881590874e-07, "loss": 0.0222, "num_input_tokens_seen": 6896320, "step": 7180 }, { "epoch": 0.609725050916497, "grad_norm": 11.046833992004395, "learning_rate": 7.933924290273774e-07, "loss": 0.1228, "num_input_tokens_seen": 6900992, "step": 7185 }, { "epoch": 0.6101493550577054, "grad_norm": 4.509375095367432, "learning_rate": 7.919434297417976e-07, "loss": 0.0742, "num_input_tokens_seen": 6905600, "step": 7190 }, { "epoch": 0.6105736591989138, "grad_norm": 13.198589324951172, "learning_rate": 7.904948869131039e-07, "loss": 0.103, "num_input_tokens_seen": 6910208, "step": 7195 }, { "epoch": 0.6109979633401222, "grad_norm": 40.57453918457031, "learning_rate": 7.89046803719267e-07, "loss": 0.0421, "num_input_tokens_seen": 6915264, "step": 7200 }, { "epoch": 0.6114222674813307, "grad_norm": 2.951673984527588, "learning_rate": 7.875991833372463e-07, "loss": 0.0266, "num_input_tokens_seen": 6919808, "step": 7205 }, { "epoch": 0.6118465716225391, "grad_norm": 5.2823872566223145, "learning_rate": 7.861520289429879e-07, "loss": 0.0321, "num_input_tokens_seen": 6924608, "step": 7210 }, { "epoch": 0.6122708757637475, "grad_norm": 7.0157623291015625, "learning_rate": 7.847053437114141e-07, "loss": 0.0338, "num_input_tokens_seen": 6929344, "step": 7215 }, { "epoch": 0.6126951799049559, "grad_norm": 10.4129638671875, "learning_rate": 7.832591308164193e-07, "loss": 0.1193, "num_input_tokens_seen": 6934464, "step": 7220 }, { "epoch": 0.6131194840461643, "grad_norm": 39.73077392578125, "learning_rate": 7.818133934308606e-07, "loss": 0.0717, "num_input_tokens_seen": 6939456, "step": 7225 }, { "epoch": 0.6135437881873728, "grad_norm": 0.44385433197021484, "learning_rate": 7.803681347265524e-07, "loss": 0.0072, "num_input_tokens_seen": 6943808, "step": 7230 }, { "epoch": 0.6139680923285811, "grad_norm": 0.2843058407306671, "learning_rate": 7.789233578742583e-07, "loss": 0.0254, "num_input_tokens_seen": 6948736, "step": 7235 }, { "epoch": 0.6143923964697895, "grad_norm": 12.6770658493042, "learning_rate": 7.774790660436857e-07, "loss": 0.0926, "num_input_tokens_seen": 6953792, "step": 7240 }, { "epoch": 0.6148167006109979, "grad_norm": 0.1394738405942917, "learning_rate": 7.760352624034769e-07, "loss": 0.0226, "num_input_tokens_seen": 6958656, "step": 7245 }, { "epoch": 0.6152410047522063, "grad_norm": 39.0544319152832, "learning_rate": 7.745919501212043e-07, "loss": 0.0326, "num_input_tokens_seen": 6963200, "step": 7250 }, { "epoch": 0.6156653088934148, "grad_norm": 9.968228340148926, "learning_rate": 7.731491323633608e-07, "loss": 0.0307, "num_input_tokens_seen": 6968448, "step": 7255 }, { "epoch": 0.6160896130346232, "grad_norm": 12.081189155578613, "learning_rate": 7.71706812295356e-07, "loss": 0.0835, "num_input_tokens_seen": 6973888, "step": 7260 }, { "epoch": 0.6165139171758316, "grad_norm": 9.74222183227539, "learning_rate": 7.702649930815065e-07, "loss": 0.0817, "num_input_tokens_seen": 6978304, "step": 7265 }, { "epoch": 0.61693822131704, "grad_norm": 12.646188735961914, "learning_rate": 7.688236778850306e-07, "loss": 0.0605, "num_input_tokens_seen": 6983168, "step": 7270 }, { "epoch": 0.6173625254582484, "grad_norm": 1.385225534439087, "learning_rate": 7.6738286986804e-07, "loss": 0.0463, "num_input_tokens_seen": 6988224, "step": 7275 }, { "epoch": 0.6177868295994569, "grad_norm": 11.31658935546875, "learning_rate": 7.659425721915351e-07, "loss": 0.0742, "num_input_tokens_seen": 6992448, "step": 7280 }, { "epoch": 0.6182111337406653, "grad_norm": 11.948744773864746, "learning_rate": 7.645027880153956e-07, "loss": 0.0507, "num_input_tokens_seen": 6996864, "step": 7285 }, { "epoch": 0.6186354378818737, "grad_norm": 6.7950544357299805, "learning_rate": 7.63063520498375e-07, "loss": 0.0316, "num_input_tokens_seen": 7001984, "step": 7290 }, { "epoch": 0.6190597420230821, "grad_norm": 16.258878707885742, "learning_rate": 7.616247727980927e-07, "loss": 0.0762, "num_input_tokens_seen": 7007168, "step": 7295 }, { "epoch": 0.6194840461642905, "grad_norm": 0.2103135585784912, "learning_rate": 7.601865480710289e-07, "loss": 0.0582, "num_input_tokens_seen": 7012416, "step": 7300 }, { "epoch": 0.619908350305499, "grad_norm": 7.516461372375488, "learning_rate": 7.587488494725156e-07, "loss": 0.0792, "num_input_tokens_seen": 7016576, "step": 7305 }, { "epoch": 0.6203326544467074, "grad_norm": 12.709039688110352, "learning_rate": 7.573116801567301e-07, "loss": 0.0391, "num_input_tokens_seen": 7021056, "step": 7310 }, { "epoch": 0.6207569585879158, "grad_norm": 0.8361039757728577, "learning_rate": 7.558750432766901e-07, "loss": 0.091, "num_input_tokens_seen": 7025984, "step": 7315 }, { "epoch": 0.6211812627291242, "grad_norm": 15.576173782348633, "learning_rate": 7.544389419842429e-07, "loss": 0.0191, "num_input_tokens_seen": 7031104, "step": 7320 }, { "epoch": 0.6216055668703326, "grad_norm": 1.3385177850723267, "learning_rate": 7.530033794300631e-07, "loss": 0.0467, "num_input_tokens_seen": 7035904, "step": 7325 }, { "epoch": 0.6220298710115411, "grad_norm": 8.493648529052734, "learning_rate": 7.515683587636412e-07, "loss": 0.1059, "num_input_tokens_seen": 7040384, "step": 7330 }, { "epoch": 0.6224541751527495, "grad_norm": 0.186950221657753, "learning_rate": 7.501338831332813e-07, "loss": 0.0149, "num_input_tokens_seen": 7045312, "step": 7335 }, { "epoch": 0.6228784792939579, "grad_norm": 20.064208984375, "learning_rate": 7.486999556860889e-07, "loss": 0.034, "num_input_tokens_seen": 7050048, "step": 7340 }, { "epoch": 0.6233027834351663, "grad_norm": 0.10197906941175461, "learning_rate": 7.472665795679694e-07, "loss": 0.0281, "num_input_tokens_seen": 7054592, "step": 7345 }, { "epoch": 0.6237270875763747, "grad_norm": 50.49172592163086, "learning_rate": 7.458337579236168e-07, "loss": 0.0953, "num_input_tokens_seen": 7059392, "step": 7350 }, { "epoch": 0.6241513917175832, "grad_norm": 10.861714363098145, "learning_rate": 7.4440149389651e-07, "loss": 0.113, "num_input_tokens_seen": 7063552, "step": 7355 }, { "epoch": 0.6245756958587916, "grad_norm": 17.287086486816406, "learning_rate": 7.429697906289029e-07, "loss": 0.0476, "num_input_tokens_seen": 7068288, "step": 7360 }, { "epoch": 0.625, "grad_norm": 1.1666802167892456, "learning_rate": 7.415386512618216e-07, "loss": 0.005, "num_input_tokens_seen": 7073216, "step": 7365 }, { "epoch": 0.6254243041412084, "grad_norm": 11.889900207519531, "learning_rate": 7.401080789350525e-07, "loss": 0.0727, "num_input_tokens_seen": 7077824, "step": 7370 }, { "epoch": 0.6258486082824168, "grad_norm": 1.242414116859436, "learning_rate": 7.386780767871396e-07, "loss": 0.0238, "num_input_tokens_seen": 7082240, "step": 7375 }, { "epoch": 0.6262729124236253, "grad_norm": 9.294206619262695, "learning_rate": 7.372486479553748e-07, "loss": 0.078, "num_input_tokens_seen": 7087360, "step": 7380 }, { "epoch": 0.6266972165648337, "grad_norm": 0.3037230670452118, "learning_rate": 7.358197955757939e-07, "loss": 0.0387, "num_input_tokens_seen": 7092288, "step": 7385 }, { "epoch": 0.6271215207060421, "grad_norm": 18.94266700744629, "learning_rate": 7.343915227831661e-07, "loss": 0.0334, "num_input_tokens_seen": 7096768, "step": 7390 }, { "epoch": 0.6275458248472505, "grad_norm": 34.89910125732422, "learning_rate": 7.329638327109902e-07, "loss": 0.0722, "num_input_tokens_seen": 7101312, "step": 7395 }, { "epoch": 0.6279701289884589, "grad_norm": 6.9105224609375, "learning_rate": 7.315367284914861e-07, "loss": 0.0878, "num_input_tokens_seen": 7105664, "step": 7400 }, { "epoch": 0.6283944331296674, "grad_norm": 0.7883709669113159, "learning_rate": 7.301102132555891e-07, "loss": 0.0652, "num_input_tokens_seen": 7110208, "step": 7405 }, { "epoch": 0.6288187372708758, "grad_norm": 0.3730391263961792, "learning_rate": 7.286842901329412e-07, "loss": 0.004, "num_input_tokens_seen": 7115136, "step": 7410 }, { "epoch": 0.6292430414120842, "grad_norm": 0.42069530487060547, "learning_rate": 7.272589622518863e-07, "loss": 0.0337, "num_input_tokens_seen": 7119552, "step": 7415 }, { "epoch": 0.6296673455532926, "grad_norm": 8.277861595153809, "learning_rate": 7.258342327394616e-07, "loss": 0.0522, "num_input_tokens_seen": 7124352, "step": 7420 }, { "epoch": 0.630091649694501, "grad_norm": 0.07428093254566193, "learning_rate": 7.244101047213927e-07, "loss": 0.0316, "num_input_tokens_seen": 7128768, "step": 7425 }, { "epoch": 0.6305159538357095, "grad_norm": 36.01335144042969, "learning_rate": 7.229865813220843e-07, "loss": 0.0971, "num_input_tokens_seen": 7133568, "step": 7430 }, { "epoch": 0.6309402579769179, "grad_norm": 8.02556324005127, "learning_rate": 7.215636656646151e-07, "loss": 0.0625, "num_input_tokens_seen": 7138112, "step": 7435 }, { "epoch": 0.6313645621181263, "grad_norm": 8.330338478088379, "learning_rate": 7.201413608707312e-07, "loss": 0.0557, "num_input_tokens_seen": 7142848, "step": 7440 }, { "epoch": 0.6317888662593347, "grad_norm": 0.15843556821346283, "learning_rate": 7.187196700608372e-07, "loss": 0.0781, "num_input_tokens_seen": 7147584, "step": 7445 }, { "epoch": 0.6322131704005431, "grad_norm": 0.48546895384788513, "learning_rate": 7.172985963539919e-07, "loss": 0.0768, "num_input_tokens_seen": 7152192, "step": 7450 }, { "epoch": 0.6326374745417516, "grad_norm": 7.328261375427246, "learning_rate": 7.158781428678989e-07, "loss": 0.0604, "num_input_tokens_seen": 7156672, "step": 7455 }, { "epoch": 0.63306177868296, "grad_norm": 2.0213396549224854, "learning_rate": 7.144583127189028e-07, "loss": 0.0545, "num_input_tokens_seen": 7161664, "step": 7460 }, { "epoch": 0.6334860828241684, "grad_norm": 19.81895637512207, "learning_rate": 7.130391090219789e-07, "loss": 0.0352, "num_input_tokens_seen": 7166016, "step": 7465 }, { "epoch": 0.6339103869653768, "grad_norm": 21.48404312133789, "learning_rate": 7.116205348907298e-07, "loss": 0.0671, "num_input_tokens_seen": 7170752, "step": 7470 }, { "epoch": 0.6343346911065852, "grad_norm": 5.94006872177124, "learning_rate": 7.10202593437375e-07, "loss": 0.0546, "num_input_tokens_seen": 7176064, "step": 7475 }, { "epoch": 0.6347589952477937, "grad_norm": 29.022289276123047, "learning_rate": 7.08785287772748e-07, "loss": 0.0549, "num_input_tokens_seen": 7181312, "step": 7480 }, { "epoch": 0.6351832993890021, "grad_norm": 11.498795509338379, "learning_rate": 7.073686210062859e-07, "loss": 0.095, "num_input_tokens_seen": 7186176, "step": 7485 }, { "epoch": 0.6356076035302105, "grad_norm": 1.1727149486541748, "learning_rate": 7.059525962460248e-07, "loss": 0.0358, "num_input_tokens_seen": 7190784, "step": 7490 }, { "epoch": 0.6360319076714189, "grad_norm": 6.915524959564209, "learning_rate": 7.045372165985919e-07, "loss": 0.0687, "num_input_tokens_seen": 7195840, "step": 7495 }, { "epoch": 0.6364562118126272, "grad_norm": 6.936726093292236, "learning_rate": 7.031224851691999e-07, "loss": 0.0347, "num_input_tokens_seen": 7201024, "step": 7500 }, { "epoch": 0.6368805159538357, "grad_norm": 0.39795514941215515, "learning_rate": 7.017084050616385e-07, "loss": 0.0709, "num_input_tokens_seen": 7205760, "step": 7505 }, { "epoch": 0.6373048200950441, "grad_norm": 6.167526721954346, "learning_rate": 7.002949793782686e-07, "loss": 0.0482, "num_input_tokens_seen": 7210560, "step": 7510 }, { "epoch": 0.6377291242362525, "grad_norm": 27.449459075927734, "learning_rate": 6.988822112200156e-07, "loss": 0.0555, "num_input_tokens_seen": 7215488, "step": 7515 }, { "epoch": 0.6381534283774609, "grad_norm": 8.422005653381348, "learning_rate": 6.974701036863626e-07, "loss": 0.0843, "num_input_tokens_seen": 7220608, "step": 7520 }, { "epoch": 0.6385777325186693, "grad_norm": 0.35018572211265564, "learning_rate": 6.960586598753426e-07, "loss": 0.029, "num_input_tokens_seen": 7225280, "step": 7525 }, { "epoch": 0.6390020366598778, "grad_norm": 0.1585758924484253, "learning_rate": 6.946478828835331e-07, "loss": 0.0439, "num_input_tokens_seen": 7229696, "step": 7530 }, { "epoch": 0.6394263408010862, "grad_norm": 14.599209785461426, "learning_rate": 6.932377758060481e-07, "loss": 0.0425, "num_input_tokens_seen": 7233984, "step": 7535 }, { "epoch": 0.6398506449422946, "grad_norm": 22.134122848510742, "learning_rate": 6.91828341736533e-07, "loss": 0.0495, "num_input_tokens_seen": 7239936, "step": 7540 }, { "epoch": 0.640274949083503, "grad_norm": 45.42800521850586, "learning_rate": 6.904195837671552e-07, "loss": 0.0334, "num_input_tokens_seen": 7244480, "step": 7545 }, { "epoch": 0.6406992532247114, "grad_norm": 0.15206590294837952, "learning_rate": 6.890115049885994e-07, "loss": 0.0664, "num_input_tokens_seen": 7248960, "step": 7550 }, { "epoch": 0.6411235573659199, "grad_norm": 31.500267028808594, "learning_rate": 6.87604108490061e-07, "loss": 0.0579, "num_input_tokens_seen": 7253888, "step": 7555 }, { "epoch": 0.6415478615071283, "grad_norm": 0.42176005244255066, "learning_rate": 6.861973973592372e-07, "loss": 0.0378, "num_input_tokens_seen": 7259200, "step": 7560 }, { "epoch": 0.6419721656483367, "grad_norm": 1.1130379438400269, "learning_rate": 6.847913746823227e-07, "loss": 0.0178, "num_input_tokens_seen": 7263808, "step": 7565 }, { "epoch": 0.6423964697895451, "grad_norm": 17.046188354492188, "learning_rate": 6.833860435440006e-07, "loss": 0.0296, "num_input_tokens_seen": 7269248, "step": 7570 }, { "epoch": 0.6428207739307535, "grad_norm": 0.6816079020500183, "learning_rate": 6.819814070274384e-07, "loss": 0.0162, "num_input_tokens_seen": 7274496, "step": 7575 }, { "epoch": 0.643245078071962, "grad_norm": 6.718956470489502, "learning_rate": 6.805774682142782e-07, "loss": 0.1676, "num_input_tokens_seen": 7279552, "step": 7580 }, { "epoch": 0.6436693822131704, "grad_norm": 10.924981117248535, "learning_rate": 6.791742301846325e-07, "loss": 0.0954, "num_input_tokens_seen": 7284096, "step": 7585 }, { "epoch": 0.6440936863543788, "grad_norm": 0.19647859036922455, "learning_rate": 6.777716960170752e-07, "loss": 0.039, "num_input_tokens_seen": 7289088, "step": 7590 }, { "epoch": 0.6445179904955872, "grad_norm": 35.239501953125, "learning_rate": 6.763698687886372e-07, "loss": 0.1273, "num_input_tokens_seen": 7293696, "step": 7595 }, { "epoch": 0.6449422946367956, "grad_norm": 0.21344631910324097, "learning_rate": 6.749687515747977e-07, "loss": 0.047, "num_input_tokens_seen": 7298816, "step": 7600 }, { "epoch": 0.6453665987780041, "grad_norm": 5.354994297027588, "learning_rate": 6.735683474494784e-07, "loss": 0.0821, "num_input_tokens_seen": 7303232, "step": 7605 }, { "epoch": 0.6457909029192125, "grad_norm": 14.325811386108398, "learning_rate": 6.721686594850362e-07, "loss": 0.0391, "num_input_tokens_seen": 7308416, "step": 7610 }, { "epoch": 0.6462152070604209, "grad_norm": 0.12330670654773712, "learning_rate": 6.707696907522577e-07, "loss": 0.0789, "num_input_tokens_seen": 7313024, "step": 7615 }, { "epoch": 0.6466395112016293, "grad_norm": 12.076754570007324, "learning_rate": 6.693714443203507e-07, "loss": 0.0668, "num_input_tokens_seen": 7317760, "step": 7620 }, { "epoch": 0.6470638153428377, "grad_norm": 20.513025283813477, "learning_rate": 6.679739232569388e-07, "loss": 0.0315, "num_input_tokens_seen": 7322624, "step": 7625 }, { "epoch": 0.6474881194840462, "grad_norm": 12.568245887756348, "learning_rate": 6.665771306280537e-07, "loss": 0.0434, "num_input_tokens_seen": 7327104, "step": 7630 }, { "epoch": 0.6479124236252546, "grad_norm": 15.543357849121094, "learning_rate": 6.651810694981299e-07, "loss": 0.0735, "num_input_tokens_seen": 7331520, "step": 7635 }, { "epoch": 0.648336727766463, "grad_norm": 14.047017097473145, "learning_rate": 6.637857429299958e-07, "loss": 0.0712, "num_input_tokens_seen": 7336448, "step": 7640 }, { "epoch": 0.6487610319076714, "grad_norm": 0.43313679099082947, "learning_rate": 6.623911539848697e-07, "loss": 0.0669, "num_input_tokens_seen": 7341248, "step": 7645 }, { "epoch": 0.6491853360488798, "grad_norm": 13.36621379852295, "learning_rate": 6.6099730572235e-07, "loss": 0.0657, "num_input_tokens_seen": 7345920, "step": 7650 }, { "epoch": 0.6496096401900883, "grad_norm": 0.5109625458717346, "learning_rate": 6.596042012004119e-07, "loss": 0.0214, "num_input_tokens_seen": 7350464, "step": 7655 }, { "epoch": 0.6500339443312967, "grad_norm": 14.361283302307129, "learning_rate": 6.582118434753973e-07, "loss": 0.0931, "num_input_tokens_seen": 7355008, "step": 7660 }, { "epoch": 0.6504582484725051, "grad_norm": 32.11049270629883, "learning_rate": 6.568202356020108e-07, "loss": 0.0362, "num_input_tokens_seen": 7359680, "step": 7665 }, { "epoch": 0.6508825526137135, "grad_norm": 2.051922082901001, "learning_rate": 6.554293806333109e-07, "loss": 0.0404, "num_input_tokens_seen": 7363968, "step": 7670 }, { "epoch": 0.6508825526137135, "eval_loss": 0.05561085045337677, "eval_runtime": 15.8384, "eval_samples_per_second": 661.368, "eval_steps_per_second": 82.71, "num_input_tokens_seen": 7363968, "step": 7670 }, { "epoch": 0.651306856754922, "grad_norm": 14.956515312194824, "learning_rate": 6.540392816207054e-07, "loss": 0.108, "num_input_tokens_seen": 7368064, "step": 7675 }, { "epoch": 0.6517311608961304, "grad_norm": 14.99658489227295, "learning_rate": 6.52649941613943e-07, "loss": 0.0423, "num_input_tokens_seen": 7372736, "step": 7680 }, { "epoch": 0.6521554650373388, "grad_norm": 1.0164530277252197, "learning_rate": 6.512613636611068e-07, "loss": 0.0361, "num_input_tokens_seen": 7377600, "step": 7685 }, { "epoch": 0.6525797691785472, "grad_norm": 17.327880859375, "learning_rate": 6.498735508086093e-07, "loss": 0.0238, "num_input_tokens_seen": 7381952, "step": 7690 }, { "epoch": 0.6530040733197556, "grad_norm": 2.355736494064331, "learning_rate": 6.484865061011829e-07, "loss": 0.0258, "num_input_tokens_seen": 7387264, "step": 7695 }, { "epoch": 0.653428377460964, "grad_norm": 1.3785549402236938, "learning_rate": 6.471002325818761e-07, "loss": 0.0708, "num_input_tokens_seen": 7391936, "step": 7700 }, { "epoch": 0.6538526816021725, "grad_norm": 3.1388180255889893, "learning_rate": 6.45714733292044e-07, "loss": 0.0622, "num_input_tokens_seen": 7397248, "step": 7705 }, { "epoch": 0.6542769857433809, "grad_norm": 12.08487319946289, "learning_rate": 6.443300112713452e-07, "loss": 0.0741, "num_input_tokens_seen": 7401920, "step": 7710 }, { "epoch": 0.6547012898845893, "grad_norm": 27.47235107421875, "learning_rate": 6.429460695577309e-07, "loss": 0.0523, "num_input_tokens_seen": 7406912, "step": 7715 }, { "epoch": 0.6551255940257977, "grad_norm": 8.73314094543457, "learning_rate": 6.415629111874418e-07, "loss": 0.0758, "num_input_tokens_seen": 7411776, "step": 7720 }, { "epoch": 0.6555498981670062, "grad_norm": 0.9363436698913574, "learning_rate": 6.401805391949989e-07, "loss": 0.0585, "num_input_tokens_seen": 7416128, "step": 7725 }, { "epoch": 0.6559742023082146, "grad_norm": 0.05604305863380432, "learning_rate": 6.387989566131996e-07, "loss": 0.0381, "num_input_tokens_seen": 7422208, "step": 7730 }, { "epoch": 0.656398506449423, "grad_norm": 8.00350570678711, "learning_rate": 6.374181664731076e-07, "loss": 0.0647, "num_input_tokens_seen": 7427008, "step": 7735 }, { "epoch": 0.6568228105906314, "grad_norm": 1.7786318063735962, "learning_rate": 6.360381718040493e-07, "loss": 0.053, "num_input_tokens_seen": 7431488, "step": 7740 }, { "epoch": 0.6572471147318398, "grad_norm": 1.2591062784194946, "learning_rate": 6.34658975633605e-07, "loss": 0.0645, "num_input_tokens_seen": 7436544, "step": 7745 }, { "epoch": 0.6576714188730483, "grad_norm": 0.7253627777099609, "learning_rate": 6.332805809876041e-07, "loss": 0.0631, "num_input_tokens_seen": 7440896, "step": 7750 }, { "epoch": 0.6580957230142567, "grad_norm": 30.2974796295166, "learning_rate": 6.319029908901168e-07, "loss": 0.1176, "num_input_tokens_seen": 7445824, "step": 7755 }, { "epoch": 0.658520027155465, "grad_norm": 22.773164749145508, "learning_rate": 6.305262083634487e-07, "loss": 0.0954, "num_input_tokens_seen": 7450368, "step": 7760 }, { "epoch": 0.6589443312966734, "grad_norm": 31.29222869873047, "learning_rate": 6.29150236428133e-07, "loss": 0.0724, "num_input_tokens_seen": 7454720, "step": 7765 }, { "epoch": 0.6593686354378818, "grad_norm": 9.666147232055664, "learning_rate": 6.277750781029254e-07, "loss": 0.0929, "num_input_tokens_seen": 7459264, "step": 7770 }, { "epoch": 0.6597929395790902, "grad_norm": 5.074236869812012, "learning_rate": 6.26400736404796e-07, "loss": 0.0311, "num_input_tokens_seen": 7463744, "step": 7775 }, { "epoch": 0.6602172437202987, "grad_norm": 27.29247283935547, "learning_rate": 6.250272143489236e-07, "loss": 0.0185, "num_input_tokens_seen": 7468800, "step": 7780 }, { "epoch": 0.6606415478615071, "grad_norm": 0.3091581165790558, "learning_rate": 6.23654514948688e-07, "loss": 0.0711, "num_input_tokens_seen": 7473856, "step": 7785 }, { "epoch": 0.6610658520027155, "grad_norm": 14.052967071533203, "learning_rate": 6.222826412156659e-07, "loss": 0.0637, "num_input_tokens_seen": 7478144, "step": 7790 }, { "epoch": 0.6614901561439239, "grad_norm": 5.32991361618042, "learning_rate": 6.209115961596207e-07, "loss": 0.0699, "num_input_tokens_seen": 7482432, "step": 7795 }, { "epoch": 0.6619144602851323, "grad_norm": 7.41123628616333, "learning_rate": 6.195413827884986e-07, "loss": 0.0804, "num_input_tokens_seen": 7487488, "step": 7800 }, { "epoch": 0.6623387644263408, "grad_norm": 8.139752388000488, "learning_rate": 6.181720041084216e-07, "loss": 0.0639, "num_input_tokens_seen": 7492032, "step": 7805 }, { "epoch": 0.6627630685675492, "grad_norm": 2.0337157249450684, "learning_rate": 6.168034631236794e-07, "loss": 0.0315, "num_input_tokens_seen": 7496576, "step": 7810 }, { "epoch": 0.6631873727087576, "grad_norm": 0.41162967681884766, "learning_rate": 6.154357628367251e-07, "loss": 0.0285, "num_input_tokens_seen": 7501056, "step": 7815 }, { "epoch": 0.663611676849966, "grad_norm": 25.204240798950195, "learning_rate": 6.140689062481657e-07, "loss": 0.0801, "num_input_tokens_seen": 7505792, "step": 7820 }, { "epoch": 0.6640359809911744, "grad_norm": 22.751937866210938, "learning_rate": 6.127028963567593e-07, "loss": 0.0378, "num_input_tokens_seen": 7510528, "step": 7825 }, { "epoch": 0.6644602851323829, "grad_norm": 14.025527954101562, "learning_rate": 6.113377361594048e-07, "loss": 0.0707, "num_input_tokens_seen": 7515200, "step": 7830 }, { "epoch": 0.6648845892735913, "grad_norm": 13.164021492004395, "learning_rate": 6.099734286511378e-07, "loss": 0.0271, "num_input_tokens_seen": 7520128, "step": 7835 }, { "epoch": 0.6653088934147997, "grad_norm": 5.781976699829102, "learning_rate": 6.086099768251222e-07, "loss": 0.0496, "num_input_tokens_seen": 7524544, "step": 7840 }, { "epoch": 0.6657331975560081, "grad_norm": 23.99297523498535, "learning_rate": 6.072473836726461e-07, "loss": 0.123, "num_input_tokens_seen": 7529664, "step": 7845 }, { "epoch": 0.6661575016972165, "grad_norm": 9.406707763671875, "learning_rate": 6.058856521831126e-07, "loss": 0.0509, "num_input_tokens_seen": 7533760, "step": 7850 }, { "epoch": 0.666581805838425, "grad_norm": 0.1467132270336151, "learning_rate": 6.045247853440349e-07, "loss": 0.0302, "num_input_tokens_seen": 7538432, "step": 7855 }, { "epoch": 0.6670061099796334, "grad_norm": 9.463153839111328, "learning_rate": 6.031647861410287e-07, "loss": 0.0664, "num_input_tokens_seen": 7543168, "step": 7860 }, { "epoch": 0.6674304141208418, "grad_norm": 73.3876724243164, "learning_rate": 6.018056575578074e-07, "loss": 0.0448, "num_input_tokens_seen": 7547840, "step": 7865 }, { "epoch": 0.6678547182620502, "grad_norm": 19.8172550201416, "learning_rate": 6.004474025761723e-07, "loss": 0.0146, "num_input_tokens_seen": 7552768, "step": 7870 }, { "epoch": 0.6682790224032586, "grad_norm": 12.499360084533691, "learning_rate": 5.990900241760102e-07, "loss": 0.1316, "num_input_tokens_seen": 7557312, "step": 7875 }, { "epoch": 0.6687033265444671, "grad_norm": 15.117225646972656, "learning_rate": 5.977335253352833e-07, "loss": 0.0859, "num_input_tokens_seen": 7561920, "step": 7880 }, { "epoch": 0.6691276306856755, "grad_norm": 22.873123168945312, "learning_rate": 5.963779090300254e-07, "loss": 0.0652, "num_input_tokens_seen": 7567104, "step": 7885 }, { "epoch": 0.6695519348268839, "grad_norm": 6.8520731925964355, "learning_rate": 5.950231782343326e-07, "loss": 0.0364, "num_input_tokens_seen": 7571584, "step": 7890 }, { "epoch": 0.6699762389680923, "grad_norm": 23.435100555419922, "learning_rate": 5.936693359203597e-07, "loss": 0.0182, "num_input_tokens_seen": 7576256, "step": 7895 }, { "epoch": 0.6704005431093008, "grad_norm": 10.050165176391602, "learning_rate": 5.923163850583113e-07, "loss": 0.0678, "num_input_tokens_seen": 7581248, "step": 7900 }, { "epoch": 0.6708248472505092, "grad_norm": 8.643121719360352, "learning_rate": 5.909643286164367e-07, "loss": 0.0382, "num_input_tokens_seen": 7585984, "step": 7905 }, { "epoch": 0.6712491513917176, "grad_norm": 15.745407104492188, "learning_rate": 5.896131695610223e-07, "loss": 0.0831, "num_input_tokens_seen": 7590528, "step": 7910 }, { "epoch": 0.671673455532926, "grad_norm": 8.164334297180176, "learning_rate": 5.88262910856387e-07, "loss": 0.0263, "num_input_tokens_seen": 7595328, "step": 7915 }, { "epoch": 0.6720977596741344, "grad_norm": 6.022217750549316, "learning_rate": 5.869135554648728e-07, "loss": 0.0661, "num_input_tokens_seen": 7600128, "step": 7920 }, { "epoch": 0.6725220638153429, "grad_norm": 0.2256646305322647, "learning_rate": 5.855651063468411e-07, "loss": 0.0523, "num_input_tokens_seen": 7604288, "step": 7925 }, { "epoch": 0.6729463679565513, "grad_norm": 15.741569519042969, "learning_rate": 5.84217566460665e-07, "loss": 0.0387, "num_input_tokens_seen": 7609280, "step": 7930 }, { "epoch": 0.6733706720977597, "grad_norm": 11.693473815917969, "learning_rate": 5.828709387627217e-07, "loss": 0.0498, "num_input_tokens_seen": 7614528, "step": 7935 }, { "epoch": 0.6737949762389681, "grad_norm": 17.00261116027832, "learning_rate": 5.815252262073891e-07, "loss": 0.112, "num_input_tokens_seen": 7619264, "step": 7940 }, { "epoch": 0.6742192803801765, "grad_norm": 0.4120815396308899, "learning_rate": 5.801804317470349e-07, "loss": 0.0041, "num_input_tokens_seen": 7624192, "step": 7945 }, { "epoch": 0.674643584521385, "grad_norm": 9.579654693603516, "learning_rate": 5.788365583320144e-07, "loss": 0.0446, "num_input_tokens_seen": 7629120, "step": 7950 }, { "epoch": 0.6750678886625934, "grad_norm": 19.68268394470215, "learning_rate": 5.774936089106617e-07, "loss": 0.0664, "num_input_tokens_seen": 7633984, "step": 7955 }, { "epoch": 0.6754921928038018, "grad_norm": 0.8648895621299744, "learning_rate": 5.761515864292835e-07, "loss": 0.0222, "num_input_tokens_seen": 7638976, "step": 7960 }, { "epoch": 0.6759164969450102, "grad_norm": 0.464873343706131, "learning_rate": 5.748104938321534e-07, "loss": 0.001, "num_input_tokens_seen": 7643520, "step": 7965 }, { "epoch": 0.6763408010862186, "grad_norm": 0.21355387568473816, "learning_rate": 5.734703340615049e-07, "loss": 0.0733, "num_input_tokens_seen": 7648576, "step": 7970 }, { "epoch": 0.676765105227427, "grad_norm": 0.31152021884918213, "learning_rate": 5.721311100575235e-07, "loss": 0.0419, "num_input_tokens_seen": 7653632, "step": 7975 }, { "epoch": 0.6771894093686355, "grad_norm": 0.17043381929397583, "learning_rate": 5.707928247583444e-07, "loss": 0.0034, "num_input_tokens_seen": 7658176, "step": 7980 }, { "epoch": 0.6776137135098439, "grad_norm": 19.964506149291992, "learning_rate": 5.694554811000407e-07, "loss": 0.1114, "num_input_tokens_seen": 7663040, "step": 7985 }, { "epoch": 0.6780380176510523, "grad_norm": 21.162376403808594, "learning_rate": 5.681190820166213e-07, "loss": 0.0707, "num_input_tokens_seen": 7668160, "step": 7990 }, { "epoch": 0.6784623217922607, "grad_norm": 3.3429012298583984, "learning_rate": 5.667836304400221e-07, "loss": 0.0181, "num_input_tokens_seen": 7673024, "step": 7995 }, { "epoch": 0.6788866259334692, "grad_norm": 0.6459212899208069, "learning_rate": 5.654491293001005e-07, "loss": 0.0702, "num_input_tokens_seen": 7677696, "step": 8000 }, { "epoch": 0.6793109300746776, "grad_norm": 16.935989379882812, "learning_rate": 5.641155815246289e-07, "loss": 0.046, "num_input_tokens_seen": 7682752, "step": 8005 }, { "epoch": 0.679735234215886, "grad_norm": 1.244130253791809, "learning_rate": 5.62782990039288e-07, "loss": 0.0835, "num_input_tokens_seen": 7687872, "step": 8010 }, { "epoch": 0.6801595383570944, "grad_norm": 22.5478515625, "learning_rate": 5.614513577676592e-07, "loss": 0.0774, "num_input_tokens_seen": 7692800, "step": 8015 }, { "epoch": 0.6805838424983028, "grad_norm": 0.9386072754859924, "learning_rate": 5.601206876312223e-07, "loss": 0.0339, "num_input_tokens_seen": 7697472, "step": 8020 }, { "epoch": 0.6810081466395111, "grad_norm": 0.1652708798646927, "learning_rate": 5.587909825493433e-07, "loss": 0.0687, "num_input_tokens_seen": 7702336, "step": 8025 }, { "epoch": 0.6814324507807196, "grad_norm": 13.297231674194336, "learning_rate": 5.57462245439273e-07, "loss": 0.0828, "num_input_tokens_seen": 7707136, "step": 8030 }, { "epoch": 0.681856754921928, "grad_norm": 19.77155876159668, "learning_rate": 5.561344792161373e-07, "loss": 0.1116, "num_input_tokens_seen": 7711808, "step": 8035 }, { "epoch": 0.6822810590631364, "grad_norm": 0.917893648147583, "learning_rate": 5.54807686792933e-07, "loss": 0.0646, "num_input_tokens_seen": 7716352, "step": 8040 }, { "epoch": 0.6827053632043448, "grad_norm": 1.264143705368042, "learning_rate": 5.534818710805198e-07, "loss": 0.1109, "num_input_tokens_seen": 7721024, "step": 8045 }, { "epoch": 0.6831296673455532, "grad_norm": 5.795388698577881, "learning_rate": 5.52157034987615e-07, "loss": 0.0793, "num_input_tokens_seen": 7726208, "step": 8050 }, { "epoch": 0.6835539714867617, "grad_norm": 0.15009626746177673, "learning_rate": 5.508331814207864e-07, "loss": 0.0826, "num_input_tokens_seen": 7731136, "step": 8055 }, { "epoch": 0.6839782756279701, "grad_norm": 11.75623607635498, "learning_rate": 5.495103132844466e-07, "loss": 0.1033, "num_input_tokens_seen": 7735552, "step": 8060 }, { "epoch": 0.6844025797691785, "grad_norm": 0.1650766134262085, "learning_rate": 5.481884334808463e-07, "loss": 0.0839, "num_input_tokens_seen": 7740032, "step": 8065 }, { "epoch": 0.6848268839103869, "grad_norm": 14.68458080291748, "learning_rate": 5.468675449100664e-07, "loss": 0.0441, "num_input_tokens_seen": 7744960, "step": 8070 }, { "epoch": 0.6852511880515954, "grad_norm": 5.773752212524414, "learning_rate": 5.455476504700161e-07, "loss": 0.1324, "num_input_tokens_seen": 7749632, "step": 8075 }, { "epoch": 0.6856754921928038, "grad_norm": 3.588259220123291, "learning_rate": 5.442287530564203e-07, "loss": 0.0765, "num_input_tokens_seen": 7754176, "step": 8080 }, { "epoch": 0.6860997963340122, "grad_norm": 13.416653633117676, "learning_rate": 5.429108555628186e-07, "loss": 0.1193, "num_input_tokens_seen": 7759360, "step": 8085 }, { "epoch": 0.6865241004752206, "grad_norm": 3.5806448459625244, "learning_rate": 5.415939608805564e-07, "loss": 0.0654, "num_input_tokens_seen": 7763904, "step": 8090 }, { "epoch": 0.686948404616429, "grad_norm": 0.8055393695831299, "learning_rate": 5.402780718987789e-07, "loss": 0.0189, "num_input_tokens_seen": 7768320, "step": 8095 }, { "epoch": 0.6873727087576375, "grad_norm": 15.830171585083008, "learning_rate": 5.389631915044249e-07, "loss": 0.0697, "num_input_tokens_seen": 7772928, "step": 8100 }, { "epoch": 0.6877970128988459, "grad_norm": 0.9971351027488708, "learning_rate": 5.376493225822208e-07, "loss": 0.0421, "num_input_tokens_seen": 7777280, "step": 8105 }, { "epoch": 0.6882213170400543, "grad_norm": 8.671359062194824, "learning_rate": 5.363364680146725e-07, "loss": 0.0558, "num_input_tokens_seen": 7782080, "step": 8110 }, { "epoch": 0.6886456211812627, "grad_norm": 17.439279556274414, "learning_rate": 5.350246306820632e-07, "loss": 0.0597, "num_input_tokens_seen": 7786880, "step": 8115 }, { "epoch": 0.6890699253224711, "grad_norm": 2.295281410217285, "learning_rate": 5.337138134624412e-07, "loss": 0.0316, "num_input_tokens_seen": 7791232, "step": 8120 }, { "epoch": 0.6894942294636796, "grad_norm": 1.799139142036438, "learning_rate": 5.324040192316193e-07, "loss": 0.0082, "num_input_tokens_seen": 7796160, "step": 8125 }, { "epoch": 0.689918533604888, "grad_norm": 9.208296775817871, "learning_rate": 5.310952508631645e-07, "loss": 0.0278, "num_input_tokens_seen": 7801024, "step": 8130 }, { "epoch": 0.6903428377460964, "grad_norm": 12.797538757324219, "learning_rate": 5.297875112283941e-07, "loss": 0.0599, "num_input_tokens_seen": 7805952, "step": 8135 }, { "epoch": 0.6907671418873048, "grad_norm": 13.396888732910156, "learning_rate": 5.284808031963676e-07, "loss": 0.0434, "num_input_tokens_seen": 7811008, "step": 8140 }, { "epoch": 0.6911914460285132, "grad_norm": 0.3513202369213104, "learning_rate": 5.271751296338822e-07, "loss": 0.0109, "num_input_tokens_seen": 7815296, "step": 8145 }, { "epoch": 0.6916157501697217, "grad_norm": 2.3773586750030518, "learning_rate": 5.25870493405464e-07, "loss": 0.0378, "num_input_tokens_seen": 7820864, "step": 8150 }, { "epoch": 0.6920400543109301, "grad_norm": 1.3169046640396118, "learning_rate": 5.245668973733657e-07, "loss": 0.0569, "num_input_tokens_seen": 7825856, "step": 8155 }, { "epoch": 0.6924643584521385, "grad_norm": 15.97273063659668, "learning_rate": 5.232643443975553e-07, "loss": 0.0184, "num_input_tokens_seen": 7830400, "step": 8160 }, { "epoch": 0.6928886625933469, "grad_norm": 1.616037130355835, "learning_rate": 5.219628373357142e-07, "loss": 0.0032, "num_input_tokens_seen": 7835776, "step": 8165 }, { "epoch": 0.6933129667345553, "grad_norm": 0.09990768134593964, "learning_rate": 5.206623790432285e-07, "loss": 0.0689, "num_input_tokens_seen": 7840640, "step": 8170 }, { "epoch": 0.6937372708757638, "grad_norm": 31.12531852722168, "learning_rate": 5.193629723731837e-07, "loss": 0.0707, "num_input_tokens_seen": 7845120, "step": 8175 }, { "epoch": 0.6941615750169722, "grad_norm": 14.439279556274414, "learning_rate": 5.180646201763577e-07, "loss": 0.0528, "num_input_tokens_seen": 7849664, "step": 8180 }, { "epoch": 0.6945858791581806, "grad_norm": 17.159196853637695, "learning_rate": 5.167673253012152e-07, "loss": 0.008, "num_input_tokens_seen": 7854080, "step": 8185 }, { "epoch": 0.695010183299389, "grad_norm": 19.91121482849121, "learning_rate": 5.154710905939015e-07, "loss": 0.0412, "num_input_tokens_seen": 7858176, "step": 8190 }, { "epoch": 0.6954344874405974, "grad_norm": 3.647684335708618, "learning_rate": 5.141759188982356e-07, "loss": 0.0572, "num_input_tokens_seen": 7862912, "step": 8195 }, { "epoch": 0.6958587915818059, "grad_norm": 14.494319915771484, "learning_rate": 5.12881813055705e-07, "loss": 0.0708, "num_input_tokens_seen": 7867648, "step": 8200 }, { "epoch": 0.6962830957230143, "grad_norm": 0.490307092666626, "learning_rate": 5.115887759054571e-07, "loss": 0.1211, "num_input_tokens_seen": 7873152, "step": 8205 }, { "epoch": 0.6967073998642227, "grad_norm": 25.91111946105957, "learning_rate": 5.102968102842973e-07, "loss": 0.0675, "num_input_tokens_seen": 7877824, "step": 8210 }, { "epoch": 0.6971317040054311, "grad_norm": 9.230366706848145, "learning_rate": 5.090059190266779e-07, "loss": 0.0948, "num_input_tokens_seen": 7882688, "step": 8215 }, { "epoch": 0.6975560081466395, "grad_norm": 3.7622454166412354, "learning_rate": 5.077161049646951e-07, "loss": 0.0272, "num_input_tokens_seen": 7887424, "step": 8220 }, { "epoch": 0.697980312287848, "grad_norm": 0.6465440392494202, "learning_rate": 5.06427370928082e-07, "loss": 0.0328, "num_input_tokens_seen": 7891584, "step": 8225 }, { "epoch": 0.6984046164290564, "grad_norm": 35.424503326416016, "learning_rate": 5.05139719744202e-07, "loss": 0.1217, "num_input_tokens_seen": 7896576, "step": 8230 }, { "epoch": 0.6988289205702648, "grad_norm": 0.2946743071079254, "learning_rate": 5.038531542380425e-07, "loss": 0.0685, "num_input_tokens_seen": 7901376, "step": 8235 }, { "epoch": 0.6992532247114732, "grad_norm": 0.7577657699584961, "learning_rate": 5.025676772322099e-07, "loss": 0.0305, "num_input_tokens_seen": 7906368, "step": 8240 }, { "epoch": 0.6996775288526816, "grad_norm": 12.771099090576172, "learning_rate": 5.012832915469207e-07, "loss": 0.0503, "num_input_tokens_seen": 7911232, "step": 8245 }, { "epoch": 0.7001018329938901, "grad_norm": 15.256807327270508, "learning_rate": 5.000000000000002e-07, "loss": 0.0654, "num_input_tokens_seen": 7915776, "step": 8250 }, { "epoch": 0.7005261371350985, "grad_norm": 18.504268646240234, "learning_rate": 4.987178054068699e-07, "loss": 0.0226, "num_input_tokens_seen": 7920192, "step": 8255 }, { "epoch": 0.7009504412763069, "grad_norm": 0.9049757719039917, "learning_rate": 4.97436710580547e-07, "loss": 0.0633, "num_input_tokens_seen": 7924800, "step": 8260 }, { "epoch": 0.7009504412763069, "eval_loss": 0.0545908585190773, "eval_runtime": 15.9241, "eval_samples_per_second": 657.806, "eval_steps_per_second": 82.265, "num_input_tokens_seen": 7924800, "step": 8260 }, { "epoch": 0.7013747454175153, "grad_norm": 0.3657372295856476, "learning_rate": 4.961567183316348e-07, "loss": 0.0334, "num_input_tokens_seen": 7929664, "step": 8265 }, { "epoch": 0.7017990495587237, "grad_norm": 19.398540496826172, "learning_rate": 4.948778314683183e-07, "loss": 0.1168, "num_input_tokens_seen": 7934464, "step": 8270 }, { "epoch": 0.7022233536999322, "grad_norm": 44.40122604370117, "learning_rate": 4.93600052796357e-07, "loss": 0.0608, "num_input_tokens_seen": 7939264, "step": 8275 }, { "epoch": 0.7026476578411406, "grad_norm": 28.30301856994629, "learning_rate": 4.923233851190794e-07, "loss": 0.0751, "num_input_tokens_seen": 7943552, "step": 8280 }, { "epoch": 0.703071961982349, "grad_norm": 2.000850200653076, "learning_rate": 4.910478312373756e-07, "loss": 0.0151, "num_input_tokens_seen": 7948032, "step": 8285 }, { "epoch": 0.7034962661235573, "grad_norm": 0.9129754900932312, "learning_rate": 4.897733939496942e-07, "loss": 0.0642, "num_input_tokens_seen": 7953408, "step": 8290 }, { "epoch": 0.7039205702647657, "grad_norm": 0.4381353557109833, "learning_rate": 4.885000760520317e-07, "loss": 0.0675, "num_input_tokens_seen": 7957760, "step": 8295 }, { "epoch": 0.7043448744059742, "grad_norm": 15.017487525939941, "learning_rate": 4.872278803379299e-07, "loss": 0.035, "num_input_tokens_seen": 7963712, "step": 8300 }, { "epoch": 0.7047691785471826, "grad_norm": 1.7986425161361694, "learning_rate": 4.8595680959847e-07, "loss": 0.0596, "num_input_tokens_seen": 7968576, "step": 8305 }, { "epoch": 0.705193482688391, "grad_norm": 8.05062484741211, "learning_rate": 4.846868666222622e-07, "loss": 0.0847, "num_input_tokens_seen": 7973184, "step": 8310 }, { "epoch": 0.7056177868295994, "grad_norm": 0.7912288308143616, "learning_rate": 4.834180541954447e-07, "loss": 0.0375, "num_input_tokens_seen": 7977792, "step": 8315 }, { "epoch": 0.7060420909708078, "grad_norm": 0.472023606300354, "learning_rate": 4.821503751016745e-07, "loss": 0.0239, "num_input_tokens_seen": 7981824, "step": 8320 }, { "epoch": 0.7064663951120163, "grad_norm": 2.4885144233703613, "learning_rate": 4.808838321221226e-07, "loss": 0.0527, "num_input_tokens_seen": 7986112, "step": 8325 }, { "epoch": 0.7068906992532247, "grad_norm": 7.7319536209106445, "learning_rate": 4.79618428035467e-07, "loss": 0.1099, "num_input_tokens_seen": 7990592, "step": 8330 }, { "epoch": 0.7073150033944331, "grad_norm": 1.5436960458755493, "learning_rate": 4.78354165617888e-07, "loss": 0.1138, "num_input_tokens_seen": 7995136, "step": 8335 }, { "epoch": 0.7077393075356415, "grad_norm": 10.343153953552246, "learning_rate": 4.77091047643059e-07, "loss": 0.0542, "num_input_tokens_seen": 7999872, "step": 8340 }, { "epoch": 0.7081636116768499, "grad_norm": 40.822261810302734, "learning_rate": 4.7582907688214593e-07, "loss": 0.0385, "num_input_tokens_seen": 8004608, "step": 8345 }, { "epoch": 0.7085879158180584, "grad_norm": 28.6870174407959, "learning_rate": 4.745682561037947e-07, "loss": 0.0206, "num_input_tokens_seen": 8009024, "step": 8350 }, { "epoch": 0.7090122199592668, "grad_norm": 14.473899841308594, "learning_rate": 4.733085880741301e-07, "loss": 0.0727, "num_input_tokens_seen": 8013504, "step": 8355 }, { "epoch": 0.7094365241004752, "grad_norm": 7.182537078857422, "learning_rate": 4.7205007555674714e-07, "loss": 0.0952, "num_input_tokens_seen": 8018432, "step": 8360 }, { "epoch": 0.7098608282416836, "grad_norm": 0.05755231902003288, "learning_rate": 4.707927213127062e-07, "loss": 0.0025, "num_input_tokens_seen": 8022848, "step": 8365 }, { "epoch": 0.710285132382892, "grad_norm": 6.39005184173584, "learning_rate": 4.6953652810052615e-07, "loss": 0.0645, "num_input_tokens_seen": 8027392, "step": 8370 }, { "epoch": 0.7107094365241005, "grad_norm": 12.84408187866211, "learning_rate": 4.682814986761792e-07, "loss": 0.154, "num_input_tokens_seen": 8032640, "step": 8375 }, { "epoch": 0.7111337406653089, "grad_norm": 0.5624406337738037, "learning_rate": 4.670276357930828e-07, "loss": 0.0511, "num_input_tokens_seen": 8037312, "step": 8380 }, { "epoch": 0.7115580448065173, "grad_norm": 8.612873077392578, "learning_rate": 4.657749422020979e-07, "loss": 0.1108, "num_input_tokens_seen": 8042240, "step": 8385 }, { "epoch": 0.7119823489477257, "grad_norm": 0.5993130207061768, "learning_rate": 4.6452342065151704e-07, "loss": 0.0449, "num_input_tokens_seen": 8047360, "step": 8390 }, { "epoch": 0.7124066530889341, "grad_norm": 16.80901527404785, "learning_rate": 4.632730738870634e-07, "loss": 0.0763, "num_input_tokens_seen": 8052096, "step": 8395 }, { "epoch": 0.7128309572301426, "grad_norm": 20.762821197509766, "learning_rate": 4.6202390465188225e-07, "loss": 0.0305, "num_input_tokens_seen": 8056960, "step": 8400 }, { "epoch": 0.713255261371351, "grad_norm": 37.5723876953125, "learning_rate": 4.6077591568653575e-07, "loss": 0.0582, "num_input_tokens_seen": 8061568, "step": 8405 }, { "epoch": 0.7136795655125594, "grad_norm": 21.374719619750977, "learning_rate": 4.595291097289952e-07, "loss": 0.0432, "num_input_tokens_seen": 8066624, "step": 8410 }, { "epoch": 0.7141038696537678, "grad_norm": 16.342044830322266, "learning_rate": 4.582834895146391e-07, "loss": 0.0634, "num_input_tokens_seen": 8071680, "step": 8415 }, { "epoch": 0.7145281737949762, "grad_norm": 7.064672470092773, "learning_rate": 4.5703905777624184e-07, "loss": 0.061, "num_input_tokens_seen": 8076480, "step": 8420 }, { "epoch": 0.7149524779361847, "grad_norm": 0.5178303718566895, "learning_rate": 4.5579581724397255e-07, "loss": 0.0268, "num_input_tokens_seen": 8080768, "step": 8425 }, { "epoch": 0.7153767820773931, "grad_norm": 1.711916446685791, "learning_rate": 4.5455377064538603e-07, "loss": 0.0509, "num_input_tokens_seen": 8085248, "step": 8430 }, { "epoch": 0.7158010862186015, "grad_norm": 17.0489444732666, "learning_rate": 4.533129207054167e-07, "loss": 0.0485, "num_input_tokens_seen": 8090176, "step": 8435 }, { "epoch": 0.7162253903598099, "grad_norm": 4.791530132293701, "learning_rate": 4.520732701463762e-07, "loss": 0.0764, "num_input_tokens_seen": 8094464, "step": 8440 }, { "epoch": 0.7166496945010183, "grad_norm": 0.5642265677452087, "learning_rate": 4.508348216879421e-07, "loss": 0.1021, "num_input_tokens_seen": 8099008, "step": 8445 }, { "epoch": 0.7170739986422268, "grad_norm": 6.994995594024658, "learning_rate": 4.4959757804715613e-07, "loss": 0.0745, "num_input_tokens_seen": 8104256, "step": 8450 }, { "epoch": 0.7174983027834352, "grad_norm": 19.228673934936523, "learning_rate": 4.483615419384167e-07, "loss": 0.0703, "num_input_tokens_seen": 8109056, "step": 8455 }, { "epoch": 0.7179226069246436, "grad_norm": 0.37509685754776, "learning_rate": 4.4712671607347307e-07, "loss": 0.1058, "num_input_tokens_seen": 8114176, "step": 8460 }, { "epoch": 0.718346911065852, "grad_norm": 5.339738845825195, "learning_rate": 4.458931031614179e-07, "loss": 0.0812, "num_input_tokens_seen": 8119232, "step": 8465 }, { "epoch": 0.7187712152070604, "grad_norm": 1.2560439109802246, "learning_rate": 4.4466070590868543e-07, "loss": 0.0612, "num_input_tokens_seen": 8123840, "step": 8470 }, { "epoch": 0.7191955193482689, "grad_norm": 5.405581474304199, "learning_rate": 4.434295270190402e-07, "loss": 0.1036, "num_input_tokens_seen": 8128256, "step": 8475 }, { "epoch": 0.7196198234894773, "grad_norm": 16.936054229736328, "learning_rate": 4.4219956919357546e-07, "loss": 0.0321, "num_input_tokens_seen": 8133504, "step": 8480 }, { "epoch": 0.7200441276306857, "grad_norm": 0.13875967264175415, "learning_rate": 4.409708351307049e-07, "loss": 0.0123, "num_input_tokens_seen": 8138176, "step": 8485 }, { "epoch": 0.7204684317718941, "grad_norm": 14.88166332244873, "learning_rate": 4.3974332752615727e-07, "loss": 0.0502, "num_input_tokens_seen": 8143104, "step": 8490 }, { "epoch": 0.7208927359131025, "grad_norm": 4.445859909057617, "learning_rate": 4.385170490729712e-07, "loss": 0.028, "num_input_tokens_seen": 8147584, "step": 8495 }, { "epoch": 0.721317040054311, "grad_norm": 24.310640335083008, "learning_rate": 4.3729200246148835e-07, "loss": 0.1012, "num_input_tokens_seen": 8152896, "step": 8500 }, { "epoch": 0.7217413441955194, "grad_norm": 3.4741063117980957, "learning_rate": 4.3606819037934673e-07, "loss": 0.0263, "num_input_tokens_seen": 8158400, "step": 8505 }, { "epoch": 0.7221656483367278, "grad_norm": 21.058059692382812, "learning_rate": 4.348456155114786e-07, "loss": 0.0805, "num_input_tokens_seen": 8163520, "step": 8510 }, { "epoch": 0.7225899524779362, "grad_norm": 29.004924774169922, "learning_rate": 4.336242805400989e-07, "loss": 0.0636, "num_input_tokens_seen": 8168256, "step": 8515 }, { "epoch": 0.7230142566191446, "grad_norm": 1.2382115125656128, "learning_rate": 4.324041881447041e-07, "loss": 0.0225, "num_input_tokens_seen": 8172928, "step": 8520 }, { "epoch": 0.7234385607603531, "grad_norm": 14.450663566589355, "learning_rate": 4.311853410020643e-07, "loss": 0.033, "num_input_tokens_seen": 8177728, "step": 8525 }, { "epoch": 0.7238628649015615, "grad_norm": 10.671835899353027, "learning_rate": 4.299677417862173e-07, "loss": 0.0376, "num_input_tokens_seen": 8182272, "step": 8530 }, { "epoch": 0.7242871690427699, "grad_norm": 26.833974838256836, "learning_rate": 4.287513931684634e-07, "loss": 0.142, "num_input_tokens_seen": 8187776, "step": 8535 }, { "epoch": 0.7247114731839783, "grad_norm": 0.49240443110466003, "learning_rate": 4.2753629781735936e-07, "loss": 0.0742, "num_input_tokens_seen": 8192960, "step": 8540 }, { "epoch": 0.7251357773251867, "grad_norm": 29.694900512695312, "learning_rate": 4.2632245839871095e-07, "loss": 0.1659, "num_input_tokens_seen": 8197824, "step": 8545 }, { "epoch": 0.7255600814663951, "grad_norm": 42.07789611816406, "learning_rate": 4.251098775755708e-07, "loss": 0.0614, "num_input_tokens_seen": 8202880, "step": 8550 }, { "epoch": 0.7259843856076035, "grad_norm": 14.277400970458984, "learning_rate": 4.238985580082293e-07, "loss": 0.1057, "num_input_tokens_seen": 8207552, "step": 8555 }, { "epoch": 0.7264086897488119, "grad_norm": 7.6538543701171875, "learning_rate": 4.2268850235420827e-07, "loss": 0.0801, "num_input_tokens_seen": 8212672, "step": 8560 }, { "epoch": 0.7268329938900203, "grad_norm": 25.851024627685547, "learning_rate": 4.214797132682596e-07, "loss": 0.0461, "num_input_tokens_seen": 8219392, "step": 8565 }, { "epoch": 0.7272572980312287, "grad_norm": 13.57695198059082, "learning_rate": 4.202721934023536e-07, "loss": 0.0472, "num_input_tokens_seen": 8224064, "step": 8570 }, { "epoch": 0.7276816021724372, "grad_norm": 8.460223197937012, "learning_rate": 4.19065945405678e-07, "loss": 0.0636, "num_input_tokens_seen": 8229376, "step": 8575 }, { "epoch": 0.7281059063136456, "grad_norm": 0.40096619725227356, "learning_rate": 4.17860971924629e-07, "loss": 0.0224, "num_input_tokens_seen": 8234816, "step": 8580 }, { "epoch": 0.728530210454854, "grad_norm": 13.379546165466309, "learning_rate": 4.166572756028073e-07, "loss": 0.0674, "num_input_tokens_seen": 8239552, "step": 8585 }, { "epoch": 0.7289545145960624, "grad_norm": 10.900382995605469, "learning_rate": 4.154548590810113e-07, "loss": 0.0569, "num_input_tokens_seen": 8243840, "step": 8590 }, { "epoch": 0.7293788187372708, "grad_norm": 33.06897735595703, "learning_rate": 4.14253724997232e-07, "loss": 0.1121, "num_input_tokens_seen": 8248960, "step": 8595 }, { "epoch": 0.7298031228784793, "grad_norm": 5.541492938995361, "learning_rate": 4.1305387598664567e-07, "loss": 0.0483, "num_input_tokens_seen": 8253888, "step": 8600 }, { "epoch": 0.7302274270196877, "grad_norm": 24.621667861938477, "learning_rate": 4.118553146816115e-07, "loss": 0.0628, "num_input_tokens_seen": 8258816, "step": 8605 }, { "epoch": 0.7306517311608961, "grad_norm": 2.005286931991577, "learning_rate": 4.1065804371166114e-07, "loss": 0.0508, "num_input_tokens_seen": 8263424, "step": 8610 }, { "epoch": 0.7310760353021045, "grad_norm": 0.4178374111652374, "learning_rate": 4.0946206570349685e-07, "loss": 0.0554, "num_input_tokens_seen": 8268288, "step": 8615 }, { "epoch": 0.7315003394433129, "grad_norm": 19.408565521240234, "learning_rate": 4.082673832809838e-07, "loss": 0.0945, "num_input_tokens_seen": 8273152, "step": 8620 }, { "epoch": 0.7319246435845214, "grad_norm": 0.30382806062698364, "learning_rate": 4.0707399906514483e-07, "loss": 0.0166, "num_input_tokens_seen": 8278016, "step": 8625 }, { "epoch": 0.7323489477257298, "grad_norm": 6.678869724273682, "learning_rate": 4.058819156741545e-07, "loss": 0.0345, "num_input_tokens_seen": 8283392, "step": 8630 }, { "epoch": 0.7327732518669382, "grad_norm": 11.891843795776367, "learning_rate": 4.0469113572333426e-07, "loss": 0.0519, "num_input_tokens_seen": 8288064, "step": 8635 }, { "epoch": 0.7331975560081466, "grad_norm": 10.481492042541504, "learning_rate": 4.03501661825144e-07, "loss": 0.0699, "num_input_tokens_seen": 8292672, "step": 8640 }, { "epoch": 0.733621860149355, "grad_norm": 7.467172622680664, "learning_rate": 4.023134965891809e-07, "loss": 0.09, "num_input_tokens_seen": 8297344, "step": 8645 }, { "epoch": 0.7340461642905635, "grad_norm": 9.949556350708008, "learning_rate": 4.0112664262216866e-07, "loss": 0.0495, "num_input_tokens_seen": 8302080, "step": 8650 }, { "epoch": 0.7344704684317719, "grad_norm": 24.923635482788086, "learning_rate": 3.9994110252795563e-07, "loss": 0.1092, "num_input_tokens_seen": 8307008, "step": 8655 }, { "epoch": 0.7348947725729803, "grad_norm": 23.27560806274414, "learning_rate": 3.987568789075072e-07, "loss": 0.0748, "num_input_tokens_seen": 8312064, "step": 8660 }, { "epoch": 0.7353190767141887, "grad_norm": 0.12548640370368958, "learning_rate": 3.975739743589004e-07, "loss": 0.0264, "num_input_tokens_seen": 8316480, "step": 8665 }, { "epoch": 0.7357433808553971, "grad_norm": 8.573442459106445, "learning_rate": 3.9639239147731864e-07, "loss": 0.024, "num_input_tokens_seen": 8321664, "step": 8670 }, { "epoch": 0.7361676849966056, "grad_norm": 16.72869300842285, "learning_rate": 3.952121328550455e-07, "loss": 0.0706, "num_input_tokens_seen": 8326016, "step": 8675 }, { "epoch": 0.736591989137814, "grad_norm": 0.3787572383880615, "learning_rate": 3.9403320108145943e-07, "loss": 0.01, "num_input_tokens_seen": 8330688, "step": 8680 }, { "epoch": 0.7370162932790224, "grad_norm": 0.19815906882286072, "learning_rate": 3.928555987430275e-07, "loss": 0.0251, "num_input_tokens_seen": 8335552, "step": 8685 }, { "epoch": 0.7374405974202308, "grad_norm": 1.1737080812454224, "learning_rate": 3.916793284233011e-07, "loss": 0.0294, "num_input_tokens_seen": 8340224, "step": 8690 }, { "epoch": 0.7378649015614392, "grad_norm": 6.083102226257324, "learning_rate": 3.9050439270290733e-07, "loss": 0.0556, "num_input_tokens_seen": 8345024, "step": 8695 }, { "epoch": 0.7382892057026477, "grad_norm": 0.30222249031066895, "learning_rate": 3.8933079415954805e-07, "loss": 0.0438, "num_input_tokens_seen": 8349632, "step": 8700 }, { "epoch": 0.7387135098438561, "grad_norm": 0.32194453477859497, "learning_rate": 3.8815853536798905e-07, "loss": 0.0502, "num_input_tokens_seen": 8354624, "step": 8705 }, { "epoch": 0.7391378139850645, "grad_norm": 23.009239196777344, "learning_rate": 3.8698761890005794e-07, "loss": 0.0513, "num_input_tokens_seen": 8359168, "step": 8710 }, { "epoch": 0.7395621181262729, "grad_norm": 0.40400999784469604, "learning_rate": 3.858180473246373e-07, "loss": 0.0154, "num_input_tokens_seen": 8363968, "step": 8715 }, { "epoch": 0.7399864222674813, "grad_norm": 8.802319526672363, "learning_rate": 3.8464982320765906e-07, "loss": 0.0438, "num_input_tokens_seen": 8369152, "step": 8720 }, { "epoch": 0.7404107264086898, "grad_norm": 39.71036148071289, "learning_rate": 3.834829491120991e-07, "loss": 0.0391, "num_input_tokens_seen": 8373952, "step": 8725 }, { "epoch": 0.7408350305498982, "grad_norm": 20.570491790771484, "learning_rate": 3.8231742759797157e-07, "loss": 0.0883, "num_input_tokens_seen": 8379456, "step": 8730 }, { "epoch": 0.7412593346911066, "grad_norm": 16.419105529785156, "learning_rate": 3.811532612223219e-07, "loss": 0.0386, "num_input_tokens_seen": 8383936, "step": 8735 }, { "epoch": 0.741683638832315, "grad_norm": 14.511885643005371, "learning_rate": 3.7999045253922504e-07, "loss": 0.0699, "num_input_tokens_seen": 8388864, "step": 8740 }, { "epoch": 0.7421079429735234, "grad_norm": 30.785167694091797, "learning_rate": 3.788290040997746e-07, "loss": 0.0511, "num_input_tokens_seen": 8393920, "step": 8745 }, { "epoch": 0.7425322471147319, "grad_norm": 1.0883046388626099, "learning_rate": 3.776689184520815e-07, "loss": 0.0927, "num_input_tokens_seen": 8398848, "step": 8750 }, { "epoch": 0.7429565512559403, "grad_norm": 26.308425903320312, "learning_rate": 3.765101981412665e-07, "loss": 0.1256, "num_input_tokens_seen": 8404160, "step": 8755 }, { "epoch": 0.7433808553971487, "grad_norm": 10.920520782470703, "learning_rate": 3.753528457094548e-07, "loss": 0.0433, "num_input_tokens_seen": 8408896, "step": 8760 }, { "epoch": 0.7438051595383571, "grad_norm": 14.321868896484375, "learning_rate": 3.7419686369577076e-07, "loss": 0.1051, "num_input_tokens_seen": 8413312, "step": 8765 }, { "epoch": 0.7442294636795656, "grad_norm": 0.10328897833824158, "learning_rate": 3.730422546363323e-07, "loss": 0.0251, "num_input_tokens_seen": 8417920, "step": 8770 }, { "epoch": 0.744653767820774, "grad_norm": 22.444488525390625, "learning_rate": 3.7188902106424414e-07, "loss": 0.068, "num_input_tokens_seen": 8422720, "step": 8775 }, { "epoch": 0.7450780719619824, "grad_norm": 0.36431556940078735, "learning_rate": 3.7073716550959533e-07, "loss": 0.0595, "num_input_tokens_seen": 8426944, "step": 8780 }, { "epoch": 0.7455023761031908, "grad_norm": 16.112642288208008, "learning_rate": 3.6958669049944956e-07, "loss": 0.0491, "num_input_tokens_seen": 8431104, "step": 8785 }, { "epoch": 0.7459266802443992, "grad_norm": 15.997936248779297, "learning_rate": 3.684375985578431e-07, "loss": 0.0572, "num_input_tokens_seen": 8435520, "step": 8790 }, { "epoch": 0.7463509843856077, "grad_norm": 2.1923623085021973, "learning_rate": 3.672898922057773e-07, "loss": 0.0451, "num_input_tokens_seen": 8441152, "step": 8795 }, { "epoch": 0.7467752885268161, "grad_norm": 21.68796157836914, "learning_rate": 3.66143573961214e-07, "loss": 0.0423, "num_input_tokens_seen": 8445824, "step": 8800 }, { "epoch": 0.7471995926680245, "grad_norm": 0.1858411580324173, "learning_rate": 3.649986463390694e-07, "loss": 0.0749, "num_input_tokens_seen": 8450752, "step": 8805 }, { "epoch": 0.7476238968092329, "grad_norm": 13.372088432312012, "learning_rate": 3.6385511185120885e-07, "loss": 0.0542, "num_input_tokens_seen": 8455296, "step": 8810 }, { "epoch": 0.7480482009504412, "grad_norm": 5.704926013946533, "learning_rate": 3.6271297300644156e-07, "loss": 0.0453, "num_input_tokens_seen": 8460480, "step": 8815 }, { "epoch": 0.7484725050916496, "grad_norm": 0.219235360622406, "learning_rate": 3.6157223231051426e-07, "loss": 0.0876, "num_input_tokens_seen": 8465152, "step": 8820 }, { "epoch": 0.7488968092328581, "grad_norm": 0.9952836632728577, "learning_rate": 3.6043289226610717e-07, "loss": 0.0494, "num_input_tokens_seen": 8470144, "step": 8825 }, { "epoch": 0.7493211133740665, "grad_norm": 21.607990264892578, "learning_rate": 3.5929495537282596e-07, "loss": 0.0376, "num_input_tokens_seen": 8475840, "step": 8830 }, { "epoch": 0.7497454175152749, "grad_norm": 0.37881892919540405, "learning_rate": 3.5815842412720045e-07, "loss": 0.0778, "num_input_tokens_seen": 8480256, "step": 8835 }, { "epoch": 0.7501697216564833, "grad_norm": 1.4716641902923584, "learning_rate": 3.57023301022674e-07, "loss": 0.0507, "num_input_tokens_seen": 8485568, "step": 8840 }, { "epoch": 0.7505940257976917, "grad_norm": 16.42884063720703, "learning_rate": 3.558895885496023e-07, "loss": 0.0751, "num_input_tokens_seen": 8490048, "step": 8845 }, { "epoch": 0.7510183299389002, "grad_norm": 0.9709210991859436, "learning_rate": 3.547572891952456e-07, "loss": 0.0632, "num_input_tokens_seen": 8494720, "step": 8850 }, { "epoch": 0.7510183299389002, "eval_loss": 0.05398377776145935, "eval_runtime": 15.752, "eval_samples_per_second": 664.994, "eval_steps_per_second": 83.164, "num_input_tokens_seen": 8494720, "step": 8850 }, { "epoch": 0.7514426340801086, "grad_norm": 7.805334091186523, "learning_rate": 3.536264054437641e-07, "loss": 0.0387, "num_input_tokens_seen": 8499392, "step": 8855 }, { "epoch": 0.751866938221317, "grad_norm": 20.84987449645996, "learning_rate": 3.524969397762122e-07, "loss": 0.0444, "num_input_tokens_seen": 8504256, "step": 8860 }, { "epoch": 0.7522912423625254, "grad_norm": 25.035362243652344, "learning_rate": 3.5136889467053353e-07, "loss": 0.074, "num_input_tokens_seen": 8508864, "step": 8865 }, { "epoch": 0.7527155465037338, "grad_norm": 13.157938957214355, "learning_rate": 3.5024227260155383e-07, "loss": 0.0805, "num_input_tokens_seen": 8514048, "step": 8870 }, { "epoch": 0.7531398506449423, "grad_norm": 10.362030982971191, "learning_rate": 3.4911707604097916e-07, "loss": 0.0212, "num_input_tokens_seen": 8518656, "step": 8875 }, { "epoch": 0.7535641547861507, "grad_norm": 28.8228816986084, "learning_rate": 3.4799330745738573e-07, "loss": 0.0455, "num_input_tokens_seen": 8523520, "step": 8880 }, { "epoch": 0.7539884589273591, "grad_norm": 12.480382919311523, "learning_rate": 3.468709693162183e-07, "loss": 0.1758, "num_input_tokens_seen": 8528320, "step": 8885 }, { "epoch": 0.7544127630685675, "grad_norm": 15.432955741882324, "learning_rate": 3.4575006407978304e-07, "loss": 0.0379, "num_input_tokens_seen": 8533184, "step": 8890 }, { "epoch": 0.754837067209776, "grad_norm": 21.28765106201172, "learning_rate": 3.446305942072425e-07, "loss": 0.0292, "num_input_tokens_seen": 8537536, "step": 8895 }, { "epoch": 0.7552613713509844, "grad_norm": 23.919931411743164, "learning_rate": 3.4351256215461e-07, "loss": 0.0689, "num_input_tokens_seen": 8541696, "step": 8900 }, { "epoch": 0.7556856754921928, "grad_norm": 10.663202285766602, "learning_rate": 3.423959703747449e-07, "loss": 0.0226, "num_input_tokens_seen": 8546176, "step": 8905 }, { "epoch": 0.7561099796334012, "grad_norm": 0.2008209079504013, "learning_rate": 3.4128082131734535e-07, "loss": 0.0418, "num_input_tokens_seen": 8550592, "step": 8910 }, { "epoch": 0.7565342837746096, "grad_norm": 0.5570741891860962, "learning_rate": 3.401671174289469e-07, "loss": 0.0512, "num_input_tokens_seen": 8555648, "step": 8915 }, { "epoch": 0.756958587915818, "grad_norm": 15.728958129882812, "learning_rate": 3.390548611529116e-07, "loss": 0.0859, "num_input_tokens_seen": 8560640, "step": 8920 }, { "epoch": 0.7573828920570265, "grad_norm": 7.811544418334961, "learning_rate": 3.3794405492942713e-07, "loss": 0.0721, "num_input_tokens_seen": 8565376, "step": 8925 }, { "epoch": 0.7578071961982349, "grad_norm": 15.441573143005371, "learning_rate": 3.368347011955006e-07, "loss": 0.0562, "num_input_tokens_seen": 8569728, "step": 8930 }, { "epoch": 0.7582315003394433, "grad_norm": 42.045528411865234, "learning_rate": 3.3572680238495064e-07, "loss": 0.0676, "num_input_tokens_seen": 8574656, "step": 8935 }, { "epoch": 0.7586558044806517, "grad_norm": 0.640474796295166, "learning_rate": 3.346203609284053e-07, "loss": 0.0675, "num_input_tokens_seen": 8579072, "step": 8940 }, { "epoch": 0.7590801086218602, "grad_norm": 13.068296432495117, "learning_rate": 3.335153792532945e-07, "loss": 0.0307, "num_input_tokens_seen": 8584000, "step": 8945 }, { "epoch": 0.7595044127630686, "grad_norm": 9.666223526000977, "learning_rate": 3.324118597838463e-07, "loss": 0.0874, "num_input_tokens_seen": 8589248, "step": 8950 }, { "epoch": 0.759928716904277, "grad_norm": 0.12405683100223541, "learning_rate": 3.313098049410803e-07, "loss": 0.0565, "num_input_tokens_seen": 8593472, "step": 8955 }, { "epoch": 0.7603530210454854, "grad_norm": 0.11512839794158936, "learning_rate": 3.3020921714280325e-07, "loss": 0.0134, "num_input_tokens_seen": 8597952, "step": 8960 }, { "epoch": 0.7607773251866938, "grad_norm": 2.1277525424957275, "learning_rate": 3.291100988036022e-07, "loss": 0.0331, "num_input_tokens_seen": 8602816, "step": 8965 }, { "epoch": 0.7612016293279023, "grad_norm": 0.37769949436187744, "learning_rate": 3.280124523348423e-07, "loss": 0.0146, "num_input_tokens_seen": 8608256, "step": 8970 }, { "epoch": 0.7616259334691107, "grad_norm": 31.548748016357422, "learning_rate": 3.269162801446578e-07, "loss": 0.0495, "num_input_tokens_seen": 8612928, "step": 8975 }, { "epoch": 0.7620502376103191, "grad_norm": 0.18137019872665405, "learning_rate": 3.258215846379492e-07, "loss": 0.1099, "num_input_tokens_seen": 8617280, "step": 8980 }, { "epoch": 0.7624745417515275, "grad_norm": 0.3657650649547577, "learning_rate": 3.247283682163774e-07, "loss": 0.0041, "num_input_tokens_seen": 8621952, "step": 8985 }, { "epoch": 0.7628988458927359, "grad_norm": 1.3342311382293701, "learning_rate": 3.2363663327835855e-07, "loss": 0.0261, "num_input_tokens_seen": 8627136, "step": 8990 }, { "epoch": 0.7633231500339444, "grad_norm": 0.06417883932590485, "learning_rate": 3.2254638221905716e-07, "loss": 0.0975, "num_input_tokens_seen": 8631872, "step": 8995 }, { "epoch": 0.7637474541751528, "grad_norm": 25.41999626159668, "learning_rate": 3.214576174303846e-07, "loss": 0.0871, "num_input_tokens_seen": 8636864, "step": 9000 }, { "epoch": 0.7641717583163612, "grad_norm": 1.317375898361206, "learning_rate": 3.2037034130098905e-07, "loss": 0.033, "num_input_tokens_seen": 8641792, "step": 9005 }, { "epoch": 0.7645960624575696, "grad_norm": 6.363920211791992, "learning_rate": 3.192845562162549e-07, "loss": 0.0412, "num_input_tokens_seen": 8646400, "step": 9010 }, { "epoch": 0.765020366598778, "grad_norm": 6.3293867111206055, "learning_rate": 3.1820026455829353e-07, "loss": 0.0528, "num_input_tokens_seen": 8650944, "step": 9015 }, { "epoch": 0.7654446707399865, "grad_norm": 0.2312636822462082, "learning_rate": 3.171174687059408e-07, "loss": 0.0714, "num_input_tokens_seen": 8656704, "step": 9020 }, { "epoch": 0.7658689748811949, "grad_norm": 14.677282333374023, "learning_rate": 3.160361710347508e-07, "loss": 0.0878, "num_input_tokens_seen": 8661120, "step": 9025 }, { "epoch": 0.7662932790224033, "grad_norm": 0.961157500743866, "learning_rate": 3.14956373916991e-07, "loss": 0.0299, "num_input_tokens_seen": 8665664, "step": 9030 }, { "epoch": 0.7667175831636117, "grad_norm": 22.225244522094727, "learning_rate": 3.138780797216356e-07, "loss": 0.0576, "num_input_tokens_seen": 8670592, "step": 9035 }, { "epoch": 0.7671418873048201, "grad_norm": 7.560973644256592, "learning_rate": 3.128012908143636e-07, "loss": 0.0568, "num_input_tokens_seen": 8677120, "step": 9040 }, { "epoch": 0.7675661914460286, "grad_norm": 1.7268146276474, "learning_rate": 3.1172600955754935e-07, "loss": 0.0727, "num_input_tokens_seen": 8682176, "step": 9045 }, { "epoch": 0.767990495587237, "grad_norm": 5.31557559967041, "learning_rate": 3.1065223831026066e-07, "loss": 0.0849, "num_input_tokens_seen": 8686976, "step": 9050 }, { "epoch": 0.7684147997284454, "grad_norm": 51.82563781738281, "learning_rate": 3.095799794282533e-07, "loss": 0.0669, "num_input_tokens_seen": 8691904, "step": 9055 }, { "epoch": 0.7688391038696538, "grad_norm": 6.468626022338867, "learning_rate": 3.0850923526396334e-07, "loss": 0.0465, "num_input_tokens_seen": 8700928, "step": 9060 }, { "epoch": 0.7692634080108622, "grad_norm": 26.497516632080078, "learning_rate": 3.0744000816650464e-07, "loss": 0.0227, "num_input_tokens_seen": 8705344, "step": 9065 }, { "epoch": 0.7696877121520707, "grad_norm": 3.8520402908325195, "learning_rate": 3.0637230048166263e-07, "loss": 0.0562, "num_input_tokens_seen": 8710784, "step": 9070 }, { "epoch": 0.770112016293279, "grad_norm": 6.1381731033325195, "learning_rate": 3.0530611455188946e-07, "loss": 0.0205, "num_input_tokens_seen": 8716032, "step": 9075 }, { "epoch": 0.7705363204344874, "grad_norm": 11.487967491149902, "learning_rate": 3.0424145271629844e-07, "loss": 0.0332, "num_input_tokens_seen": 8721088, "step": 9080 }, { "epoch": 0.7709606245756958, "grad_norm": 0.8325733542442322, "learning_rate": 3.031783173106596e-07, "loss": 0.0212, "num_input_tokens_seen": 8725632, "step": 9085 }, { "epoch": 0.7713849287169042, "grad_norm": 22.9674072265625, "learning_rate": 3.0211671066739276e-07, "loss": 0.0318, "num_input_tokens_seen": 8730560, "step": 9090 }, { "epoch": 0.7718092328581126, "grad_norm": 4.438326835632324, "learning_rate": 3.01056635115566e-07, "loss": 0.0695, "num_input_tokens_seen": 8736640, "step": 9095 }, { "epoch": 0.7722335369993211, "grad_norm": 0.9272420406341553, "learning_rate": 2.999980929808863e-07, "loss": 0.0152, "num_input_tokens_seen": 8741248, "step": 9100 }, { "epoch": 0.7726578411405295, "grad_norm": 8.21893310546875, "learning_rate": 2.989410865856975e-07, "loss": 0.0392, "num_input_tokens_seen": 8745856, "step": 9105 }, { "epoch": 0.7730821452817379, "grad_norm": 18.906034469604492, "learning_rate": 2.9788561824897397e-07, "loss": 0.1469, "num_input_tokens_seen": 8750016, "step": 9110 }, { "epoch": 0.7735064494229463, "grad_norm": 31.49846839904785, "learning_rate": 2.968316902863157e-07, "loss": 0.0212, "num_input_tokens_seen": 8754944, "step": 9115 }, { "epoch": 0.7739307535641547, "grad_norm": 15.8748779296875, "learning_rate": 2.957793050099433e-07, "loss": 0.0967, "num_input_tokens_seen": 8759488, "step": 9120 }, { "epoch": 0.7743550577053632, "grad_norm": 3.842491388320923, "learning_rate": 2.9472846472869295e-07, "loss": 0.0815, "num_input_tokens_seen": 8763840, "step": 9125 }, { "epoch": 0.7747793618465716, "grad_norm": 0.13100187480449677, "learning_rate": 2.936791717480104e-07, "loss": 0.0108, "num_input_tokens_seen": 8768320, "step": 9130 }, { "epoch": 0.77520366598778, "grad_norm": 8.87035083770752, "learning_rate": 2.9263142836994845e-07, "loss": 0.0252, "num_input_tokens_seen": 8773568, "step": 9135 }, { "epoch": 0.7756279701289884, "grad_norm": 7.818394660949707, "learning_rate": 2.915852368931585e-07, "loss": 0.0947, "num_input_tokens_seen": 8778176, "step": 9140 }, { "epoch": 0.7760522742701969, "grad_norm": 50.66056823730469, "learning_rate": 2.905405996128882e-07, "loss": 0.111, "num_input_tokens_seen": 8782784, "step": 9145 }, { "epoch": 0.7764765784114053, "grad_norm": 0.7679608464241028, "learning_rate": 2.894975188209754e-07, "loss": 0.0308, "num_input_tokens_seen": 8787456, "step": 9150 }, { "epoch": 0.7769008825526137, "grad_norm": 18.441892623901367, "learning_rate": 2.8845599680584265e-07, "loss": 0.0682, "num_input_tokens_seen": 8792256, "step": 9155 }, { "epoch": 0.7773251866938221, "grad_norm": 15.792590141296387, "learning_rate": 2.8741603585249306e-07, "loss": 0.1251, "num_input_tokens_seen": 8797568, "step": 9160 }, { "epoch": 0.7777494908350305, "grad_norm": 0.1300697922706604, "learning_rate": 2.8637763824250507e-07, "loss": 0.0504, "num_input_tokens_seen": 8802112, "step": 9165 }, { "epoch": 0.778173794976239, "grad_norm": 0.6240066289901733, "learning_rate": 2.8534080625402677e-07, "loss": 0.0153, "num_input_tokens_seen": 8806784, "step": 9170 }, { "epoch": 0.7785980991174474, "grad_norm": 26.65509033203125, "learning_rate": 2.8430554216177203e-07, "loss": 0.0108, "num_input_tokens_seen": 8812096, "step": 9175 }, { "epoch": 0.7790224032586558, "grad_norm": 21.93799591064453, "learning_rate": 2.8327184823701464e-07, "loss": 0.11, "num_input_tokens_seen": 8817024, "step": 9180 }, { "epoch": 0.7794467073998642, "grad_norm": 35.96677780151367, "learning_rate": 2.822397267475827e-07, "loss": 0.1664, "num_input_tokens_seen": 8821696, "step": 9185 }, { "epoch": 0.7798710115410726, "grad_norm": 28.536968231201172, "learning_rate": 2.812091799578566e-07, "loss": 0.1155, "num_input_tokens_seen": 8826560, "step": 9190 }, { "epoch": 0.780295315682281, "grad_norm": 0.38851362466812134, "learning_rate": 2.8018021012875995e-07, "loss": 0.0301, "num_input_tokens_seen": 8831552, "step": 9195 }, { "epoch": 0.7807196198234895, "grad_norm": 17.1531925201416, "learning_rate": 2.791528195177576e-07, "loss": 0.0673, "num_input_tokens_seen": 8836480, "step": 9200 }, { "epoch": 0.7811439239646979, "grad_norm": 13.413601875305176, "learning_rate": 2.7812701037884964e-07, "loss": 0.0681, "num_input_tokens_seen": 8840832, "step": 9205 }, { "epoch": 0.7815682281059063, "grad_norm": 24.447235107421875, "learning_rate": 2.7710278496256665e-07, "loss": 0.0717, "num_input_tokens_seen": 8845376, "step": 9210 }, { "epoch": 0.7819925322471147, "grad_norm": 0.10010475665330887, "learning_rate": 2.7608014551596437e-07, "loss": 0.0424, "num_input_tokens_seen": 8850176, "step": 9215 }, { "epoch": 0.7824168363883232, "grad_norm": 2.201000452041626, "learning_rate": 2.7505909428261956e-07, "loss": 0.0697, "num_input_tokens_seen": 8854976, "step": 9220 }, { "epoch": 0.7828411405295316, "grad_norm": 19.673593521118164, "learning_rate": 2.740396335026234e-07, "loss": 0.0415, "num_input_tokens_seen": 8859392, "step": 9225 }, { "epoch": 0.78326544467074, "grad_norm": 0.5144276022911072, "learning_rate": 2.7302176541257984e-07, "loss": 0.0155, "num_input_tokens_seen": 8863936, "step": 9230 }, { "epoch": 0.7836897488119484, "grad_norm": 0.5424436330795288, "learning_rate": 2.720054922455964e-07, "loss": 0.1092, "num_input_tokens_seen": 8868480, "step": 9235 }, { "epoch": 0.7841140529531568, "grad_norm": 53.60285949707031, "learning_rate": 2.7099081623128294e-07, "loss": 0.0854, "num_input_tokens_seen": 8873344, "step": 9240 }, { "epoch": 0.7845383570943653, "grad_norm": 30.46038818359375, "learning_rate": 2.699777395957449e-07, "loss": 0.0661, "num_input_tokens_seen": 8878016, "step": 9245 }, { "epoch": 0.7849626612355737, "grad_norm": 9.4970703125, "learning_rate": 2.6896626456157846e-07, "loss": 0.1429, "num_input_tokens_seen": 8882816, "step": 9250 }, { "epoch": 0.7853869653767821, "grad_norm": 0.3060969412326813, "learning_rate": 2.679563933478667e-07, "loss": 0.0025, "num_input_tokens_seen": 8887936, "step": 9255 }, { "epoch": 0.7858112695179905, "grad_norm": 8.793341636657715, "learning_rate": 2.6694812817017387e-07, "loss": 0.0219, "num_input_tokens_seen": 8892672, "step": 9260 }, { "epoch": 0.7862355736591989, "grad_norm": 17.539247512817383, "learning_rate": 2.659414712405398e-07, "loss": 0.053, "num_input_tokens_seen": 8897152, "step": 9265 }, { "epoch": 0.7866598778004074, "grad_norm": 0.20830750465393066, "learning_rate": 2.649364247674779e-07, "loss": 0.0312, "num_input_tokens_seen": 8901504, "step": 9270 }, { "epoch": 0.7870841819416158, "grad_norm": 12.591196060180664, "learning_rate": 2.639329909559662e-07, "loss": 0.073, "num_input_tokens_seen": 8906176, "step": 9275 }, { "epoch": 0.7875084860828242, "grad_norm": 0.34583133459091187, "learning_rate": 2.6293117200744643e-07, "loss": 0.0315, "num_input_tokens_seen": 8910720, "step": 9280 }, { "epoch": 0.7879327902240326, "grad_norm": 5.635802268981934, "learning_rate": 2.6193097011981635e-07, "loss": 0.0515, "num_input_tokens_seen": 8915200, "step": 9285 }, { "epoch": 0.788357094365241, "grad_norm": 15.817290306091309, "learning_rate": 2.609323874874266e-07, "loss": 0.0686, "num_input_tokens_seen": 8920256, "step": 9290 }, { "epoch": 0.7887813985064495, "grad_norm": 2.470198392868042, "learning_rate": 2.5993542630107533e-07, "loss": 0.0547, "num_input_tokens_seen": 8924864, "step": 9295 }, { "epoch": 0.7892057026476579, "grad_norm": 0.5217587351799011, "learning_rate": 2.589400887480032e-07, "loss": 0.0615, "num_input_tokens_seen": 8929856, "step": 9300 }, { "epoch": 0.7896300067888663, "grad_norm": 16.123640060424805, "learning_rate": 2.579463770118887e-07, "loss": 0.0191, "num_input_tokens_seen": 8936640, "step": 9305 }, { "epoch": 0.7900543109300747, "grad_norm": 0.8148235082626343, "learning_rate": 2.569542932728436e-07, "loss": 0.0344, "num_input_tokens_seen": 8940992, "step": 9310 }, { "epoch": 0.7904786150712831, "grad_norm": 2.461164712905884, "learning_rate": 2.5596383970740833e-07, "loss": 0.0789, "num_input_tokens_seen": 8945664, "step": 9315 }, { "epoch": 0.7909029192124916, "grad_norm": 10.289015769958496, "learning_rate": 2.549750184885454e-07, "loss": 0.1219, "num_input_tokens_seen": 8950016, "step": 9320 }, { "epoch": 0.7913272233537, "grad_norm": 14.911494255065918, "learning_rate": 2.5398783178563844e-07, "loss": 0.0785, "num_input_tokens_seen": 8954880, "step": 9325 }, { "epoch": 0.7917515274949084, "grad_norm": 7.448769569396973, "learning_rate": 2.5300228176448304e-07, "loss": 0.058, "num_input_tokens_seen": 8960128, "step": 9330 }, { "epoch": 0.7921758316361168, "grad_norm": 2.861565113067627, "learning_rate": 2.52018370587285e-07, "loss": 0.0552, "num_input_tokens_seen": 8964864, "step": 9335 }, { "epoch": 0.7926001357773251, "grad_norm": 6.1687822341918945, "learning_rate": 2.5103610041265475e-07, "loss": 0.0933, "num_input_tokens_seen": 8970112, "step": 9340 }, { "epoch": 0.7930244399185336, "grad_norm": 5.1669416427612305, "learning_rate": 2.5005547339560207e-07, "loss": 0.0475, "num_input_tokens_seen": 8975104, "step": 9345 }, { "epoch": 0.793448744059742, "grad_norm": 5.684637546539307, "learning_rate": 2.4907649168753197e-07, "loss": 0.0478, "num_input_tokens_seen": 8980160, "step": 9350 }, { "epoch": 0.7938730482009504, "grad_norm": 39.38056182861328, "learning_rate": 2.480991574362403e-07, "loss": 0.1039, "num_input_tokens_seen": 8984320, "step": 9355 }, { "epoch": 0.7942973523421588, "grad_norm": 14.252079963684082, "learning_rate": 2.471234727859072e-07, "loss": 0.0495, "num_input_tokens_seen": 8989376, "step": 9360 }, { "epoch": 0.7947216564833672, "grad_norm": 2.3673818111419678, "learning_rate": 2.461494398770957e-07, "loss": 0.026, "num_input_tokens_seen": 8993920, "step": 9365 }, { "epoch": 0.7951459606245757, "grad_norm": 0.3640649914741516, "learning_rate": 2.4517706084674316e-07, "loss": 0.0662, "num_input_tokens_seen": 8998848, "step": 9370 }, { "epoch": 0.7955702647657841, "grad_norm": 0.05636943131685257, "learning_rate": 2.4420633782815945e-07, "loss": 0.0314, "num_input_tokens_seen": 9003712, "step": 9375 }, { "epoch": 0.7959945689069925, "grad_norm": 0.3024899363517761, "learning_rate": 2.432372729510214e-07, "loss": 0.0552, "num_input_tokens_seen": 9009472, "step": 9380 }, { "epoch": 0.7964188730482009, "grad_norm": 4.740152359008789, "learning_rate": 2.4226986834136763e-07, "loss": 0.0265, "num_input_tokens_seen": 9014144, "step": 9385 }, { "epoch": 0.7968431771894093, "grad_norm": 18.9387264251709, "learning_rate": 2.4130412612159445e-07, "loss": 0.0193, "num_input_tokens_seen": 9018688, "step": 9390 }, { "epoch": 0.7972674813306178, "grad_norm": 9.759135246276855, "learning_rate": 2.403400484104514e-07, "loss": 0.0334, "num_input_tokens_seen": 9023040, "step": 9395 }, { "epoch": 0.7976917854718262, "grad_norm": 0.15711554884910583, "learning_rate": 2.3937763732303504e-07, "loss": 0.0262, "num_input_tokens_seen": 9027328, "step": 9400 }, { "epoch": 0.7981160896130346, "grad_norm": 1.643560528755188, "learning_rate": 2.3841689497078742e-07, "loss": 0.0333, "num_input_tokens_seen": 9031552, "step": 9405 }, { "epoch": 0.798540393754243, "grad_norm": 0.7876635193824768, "learning_rate": 2.3745782346148756e-07, "loss": 0.0224, "num_input_tokens_seen": 9036672, "step": 9410 }, { "epoch": 0.7989646978954514, "grad_norm": 4.152390956878662, "learning_rate": 2.3650042489924992e-07, "loss": 0.0491, "num_input_tokens_seen": 9041472, "step": 9415 }, { "epoch": 0.7993890020366599, "grad_norm": 31.914854049682617, "learning_rate": 2.3554470138451909e-07, "loss": 0.0686, "num_input_tokens_seen": 9046912, "step": 9420 }, { "epoch": 0.7998133061778683, "grad_norm": 8.399618148803711, "learning_rate": 2.345906550140634e-07, "loss": 0.0481, "num_input_tokens_seen": 9051712, "step": 9425 }, { "epoch": 0.8002376103190767, "grad_norm": 10.311676025390625, "learning_rate": 2.3363828788097274e-07, "loss": 0.0909, "num_input_tokens_seen": 9056256, "step": 9430 }, { "epoch": 0.8006619144602851, "grad_norm": 8.353757858276367, "learning_rate": 2.3268760207465244e-07, "loss": 0.0345, "num_input_tokens_seen": 9061376, "step": 9435 }, { "epoch": 0.8010862186014935, "grad_norm": 13.43514633178711, "learning_rate": 2.3173859968081944e-07, "loss": 0.1023, "num_input_tokens_seen": 9066048, "step": 9440 }, { "epoch": 0.8010862186014935, "eval_loss": 0.05470386520028114, "eval_runtime": 15.9466, "eval_samples_per_second": 656.88, "eval_steps_per_second": 82.149, "num_input_tokens_seen": 9066048, "step": 9440 }, { "epoch": 0.801510522742702, "grad_norm": 14.77238941192627, "learning_rate": 2.3079128278149717e-07, "loss": 0.0286, "num_input_tokens_seen": 9071232, "step": 9445 }, { "epoch": 0.8019348268839104, "grad_norm": 24.130496978759766, "learning_rate": 2.2984565345501172e-07, "loss": 0.0474, "num_input_tokens_seen": 9075520, "step": 9450 }, { "epoch": 0.8023591310251188, "grad_norm": 15.134228706359863, "learning_rate": 2.2890171377598556e-07, "loss": 0.0605, "num_input_tokens_seen": 9080192, "step": 9455 }, { "epoch": 0.8027834351663272, "grad_norm": 7.502737998962402, "learning_rate": 2.2795946581533632e-07, "loss": 0.0041, "num_input_tokens_seen": 9085696, "step": 9460 }, { "epoch": 0.8032077393075356, "grad_norm": 11.876471519470215, "learning_rate": 2.27018911640268e-07, "loss": 0.0716, "num_input_tokens_seen": 9090432, "step": 9465 }, { "epoch": 0.8036320434487441, "grad_norm": 7.7270731925964355, "learning_rate": 2.2608005331426982e-07, "loss": 0.0832, "num_input_tokens_seen": 9094976, "step": 9470 }, { "epoch": 0.8040563475899525, "grad_norm": 7.613325595855713, "learning_rate": 2.251428928971102e-07, "loss": 0.0863, "num_input_tokens_seen": 9100096, "step": 9475 }, { "epoch": 0.8044806517311609, "grad_norm": 6.049130439758301, "learning_rate": 2.2420743244483253e-07, "loss": 0.0752, "num_input_tokens_seen": 9105408, "step": 9480 }, { "epoch": 0.8049049558723693, "grad_norm": 6.191554546356201, "learning_rate": 2.2327367400975051e-07, "loss": 0.0346, "num_input_tokens_seen": 9110144, "step": 9485 }, { "epoch": 0.8053292600135777, "grad_norm": 0.3262642025947571, "learning_rate": 2.2234161964044417e-07, "loss": 0.0509, "num_input_tokens_seen": 9115520, "step": 9490 }, { "epoch": 0.8057535641547862, "grad_norm": 0.36145687103271484, "learning_rate": 2.2141127138175386e-07, "loss": 0.0249, "num_input_tokens_seen": 9119808, "step": 9495 }, { "epoch": 0.8061778682959946, "grad_norm": 11.450455665588379, "learning_rate": 2.2048263127477861e-07, "loss": 0.0707, "num_input_tokens_seen": 9124672, "step": 9500 }, { "epoch": 0.806602172437203, "grad_norm": 9.32411003112793, "learning_rate": 2.195557013568684e-07, "loss": 0.0903, "num_input_tokens_seen": 9129216, "step": 9505 }, { "epoch": 0.8070264765784114, "grad_norm": 3.2268972396850586, "learning_rate": 2.1863048366162207e-07, "loss": 0.0704, "num_input_tokens_seen": 9133952, "step": 9510 }, { "epoch": 0.8074507807196198, "grad_norm": 26.277555465698242, "learning_rate": 2.1770698021888145e-07, "loss": 0.0464, "num_input_tokens_seen": 9138240, "step": 9515 }, { "epoch": 0.8078750848608283, "grad_norm": 25.081214904785156, "learning_rate": 2.167851930547283e-07, "loss": 0.1019, "num_input_tokens_seen": 9142656, "step": 9520 }, { "epoch": 0.8082993890020367, "grad_norm": 0.8892016410827637, "learning_rate": 2.1586512419147763e-07, "loss": 0.0595, "num_input_tokens_seen": 9147456, "step": 9525 }, { "epoch": 0.8087236931432451, "grad_norm": 16.272262573242188, "learning_rate": 2.149467756476765e-07, "loss": 0.0314, "num_input_tokens_seen": 9152064, "step": 9530 }, { "epoch": 0.8091479972844535, "grad_norm": 12.82519245147705, "learning_rate": 2.140301494380956e-07, "loss": 0.0694, "num_input_tokens_seen": 9156544, "step": 9535 }, { "epoch": 0.8095723014256619, "grad_norm": 17.81451988220215, "learning_rate": 2.1311524757372901e-07, "loss": 0.0385, "num_input_tokens_seen": 9161088, "step": 9540 }, { "epoch": 0.8099966055668704, "grad_norm": 21.11798667907715, "learning_rate": 2.1220207206178685e-07, "loss": 0.05, "num_input_tokens_seen": 9165440, "step": 9545 }, { "epoch": 0.8104209097080788, "grad_norm": 2.2403523921966553, "learning_rate": 2.1129062490569106e-07, "loss": 0.0143, "num_input_tokens_seen": 9170496, "step": 9550 }, { "epoch": 0.8108452138492872, "grad_norm": 1.4986127614974976, "learning_rate": 2.1038090810507348e-07, "loss": 0.034, "num_input_tokens_seen": 9175360, "step": 9555 }, { "epoch": 0.8112695179904956, "grad_norm": 16.13140296936035, "learning_rate": 2.0947292365576785e-07, "loss": 0.057, "num_input_tokens_seen": 9179776, "step": 9560 }, { "epoch": 0.811693822131704, "grad_norm": 0.2588607966899872, "learning_rate": 2.085666735498085e-07, "loss": 0.0305, "num_input_tokens_seen": 9185536, "step": 9565 }, { "epoch": 0.8121181262729125, "grad_norm": 11.537827491760254, "learning_rate": 2.0766215977542435e-07, "loss": 0.0328, "num_input_tokens_seen": 9190528, "step": 9570 }, { "epoch": 0.8125424304141209, "grad_norm": 29.518110275268555, "learning_rate": 2.0675938431703532e-07, "loss": 0.0937, "num_input_tokens_seen": 9195264, "step": 9575 }, { "epoch": 0.8129667345553293, "grad_norm": 16.484352111816406, "learning_rate": 2.0585834915524646e-07, "loss": 0.065, "num_input_tokens_seen": 9200192, "step": 9580 }, { "epoch": 0.8133910386965377, "grad_norm": 0.5188624858856201, "learning_rate": 2.0495905626684674e-07, "loss": 0.0184, "num_input_tokens_seen": 9205056, "step": 9585 }, { "epoch": 0.8138153428377461, "grad_norm": 15.126639366149902, "learning_rate": 2.0406150762480089e-07, "loss": 0.0328, "num_input_tokens_seen": 9209856, "step": 9590 }, { "epoch": 0.8142396469789546, "grad_norm": 14.457038879394531, "learning_rate": 2.0316570519824806e-07, "loss": 0.033, "num_input_tokens_seen": 9214464, "step": 9595 }, { "epoch": 0.814663951120163, "grad_norm": 7.134429931640625, "learning_rate": 2.0227165095249564e-07, "loss": 0.0432, "num_input_tokens_seen": 9219072, "step": 9600 }, { "epoch": 0.8150882552613713, "grad_norm": 0.702487587928772, "learning_rate": 2.0137934684901636e-07, "loss": 0.0675, "num_input_tokens_seen": 9224768, "step": 9605 }, { "epoch": 0.8155125594025797, "grad_norm": 41.96034240722656, "learning_rate": 2.0048879484544279e-07, "loss": 0.0464, "num_input_tokens_seen": 9229696, "step": 9610 }, { "epoch": 0.8159368635437881, "grad_norm": 0.04394965618848801, "learning_rate": 1.9959999689556407e-07, "loss": 0.0971, "num_input_tokens_seen": 9235072, "step": 9615 }, { "epoch": 0.8163611676849966, "grad_norm": 0.4195549488067627, "learning_rate": 1.9871295494931994e-07, "loss": 0.0947, "num_input_tokens_seen": 9240320, "step": 9620 }, { "epoch": 0.816785471826205, "grad_norm": 0.4872162640094757, "learning_rate": 1.978276709527994e-07, "loss": 0.0287, "num_input_tokens_seen": 9244928, "step": 9625 }, { "epoch": 0.8172097759674134, "grad_norm": 12.054028511047363, "learning_rate": 1.9694414684823313e-07, "loss": 0.1166, "num_input_tokens_seen": 9249792, "step": 9630 }, { "epoch": 0.8176340801086218, "grad_norm": 0.684785008430481, "learning_rate": 1.960623845739914e-07, "loss": 0.0569, "num_input_tokens_seen": 9254848, "step": 9635 }, { "epoch": 0.8180583842498302, "grad_norm": 27.895057678222656, "learning_rate": 1.9518238606457925e-07, "loss": 0.0463, "num_input_tokens_seen": 9259392, "step": 9640 }, { "epoch": 0.8184826883910387, "grad_norm": 0.3559216260910034, "learning_rate": 1.943041532506322e-07, "loss": 0.066, "num_input_tokens_seen": 9263872, "step": 9645 }, { "epoch": 0.8189069925322471, "grad_norm": 19.72517204284668, "learning_rate": 1.9342768805891173e-07, "loss": 0.0389, "num_input_tokens_seen": 9268800, "step": 9650 }, { "epoch": 0.8193312966734555, "grad_norm": 6.721203804016113, "learning_rate": 1.9255299241230182e-07, "loss": 0.1183, "num_input_tokens_seen": 9273408, "step": 9655 }, { "epoch": 0.8197556008146639, "grad_norm": 22.650775909423828, "learning_rate": 1.91680068229803e-07, "loss": 0.0679, "num_input_tokens_seen": 9278208, "step": 9660 }, { "epoch": 0.8201799049558723, "grad_norm": 0.2406800389289856, "learning_rate": 1.9080891742653105e-07, "loss": 0.0643, "num_input_tokens_seen": 9282944, "step": 9665 }, { "epoch": 0.8206042090970808, "grad_norm": 0.293648898601532, "learning_rate": 1.8993954191371042e-07, "loss": 0.0074, "num_input_tokens_seen": 9288064, "step": 9670 }, { "epoch": 0.8210285132382892, "grad_norm": 0.11446045339107513, "learning_rate": 1.8907194359866986e-07, "loss": 0.0289, "num_input_tokens_seen": 9293120, "step": 9675 }, { "epoch": 0.8214528173794976, "grad_norm": 19.012449264526367, "learning_rate": 1.8820612438484075e-07, "loss": 0.0717, "num_input_tokens_seen": 9297472, "step": 9680 }, { "epoch": 0.821877121520706, "grad_norm": 7.200514793395996, "learning_rate": 1.8734208617174986e-07, "loss": 0.0743, "num_input_tokens_seen": 9302144, "step": 9685 }, { "epoch": 0.8223014256619144, "grad_norm": 0.48357534408569336, "learning_rate": 1.864798308550173e-07, "loss": 0.0671, "num_input_tokens_seen": 9307200, "step": 9690 }, { "epoch": 0.8227257298031229, "grad_norm": 15.533746719360352, "learning_rate": 1.856193603263515e-07, "loss": 0.0212, "num_input_tokens_seen": 9312384, "step": 9695 }, { "epoch": 0.8231500339443313, "grad_norm": 7.348632335662842, "learning_rate": 1.8476067647354553e-07, "loss": 0.0079, "num_input_tokens_seen": 9317120, "step": 9700 }, { "epoch": 0.8235743380855397, "grad_norm": 26.3117733001709, "learning_rate": 1.8390378118047213e-07, "loss": 0.0715, "num_input_tokens_seen": 9321664, "step": 9705 }, { "epoch": 0.8239986422267481, "grad_norm": 20.507158279418945, "learning_rate": 1.8304867632708077e-07, "loss": 0.0367, "num_input_tokens_seen": 9326208, "step": 9710 }, { "epoch": 0.8244229463679565, "grad_norm": 0.20351463556289673, "learning_rate": 1.821953637893917e-07, "loss": 0.0819, "num_input_tokens_seen": 9331264, "step": 9715 }, { "epoch": 0.824847250509165, "grad_norm": 9.972869873046875, "learning_rate": 1.8134384543949478e-07, "loss": 0.0681, "num_input_tokens_seen": 9336704, "step": 9720 }, { "epoch": 0.8252715546503734, "grad_norm": 8.309649467468262, "learning_rate": 1.804941231455417e-07, "loss": 0.0809, "num_input_tokens_seen": 9342016, "step": 9725 }, { "epoch": 0.8256958587915818, "grad_norm": 10.31495475769043, "learning_rate": 1.7964619877174513e-07, "loss": 0.0543, "num_input_tokens_seen": 9346752, "step": 9730 }, { "epoch": 0.8261201629327902, "grad_norm": 0.1515226662158966, "learning_rate": 1.788000741783725e-07, "loss": 0.045, "num_input_tokens_seen": 9351296, "step": 9735 }, { "epoch": 0.8265444670739986, "grad_norm": 8.2435302734375, "learning_rate": 1.7795575122174323e-07, "loss": 0.078, "num_input_tokens_seen": 9355712, "step": 9740 }, { "epoch": 0.8269687712152071, "grad_norm": 9.670989990234375, "learning_rate": 1.7711323175422376e-07, "loss": 0.0929, "num_input_tokens_seen": 9360384, "step": 9745 }, { "epoch": 0.8273930753564155, "grad_norm": 6.574410915374756, "learning_rate": 1.7627251762422413e-07, "loss": 0.0472, "num_input_tokens_seen": 9364608, "step": 9750 }, { "epoch": 0.8278173794976239, "grad_norm": 16.814891815185547, "learning_rate": 1.7543361067619267e-07, "loss": 0.0203, "num_input_tokens_seen": 9369728, "step": 9755 }, { "epoch": 0.8282416836388323, "grad_norm": 6.871399402618408, "learning_rate": 1.7459651275061483e-07, "loss": 0.0865, "num_input_tokens_seen": 9374592, "step": 9760 }, { "epoch": 0.8286659877800407, "grad_norm": 8.160877227783203, "learning_rate": 1.737612256840053e-07, "loss": 0.0433, "num_input_tokens_seen": 9380160, "step": 9765 }, { "epoch": 0.8290902919212492, "grad_norm": 2.8912947177886963, "learning_rate": 1.729277513089068e-07, "loss": 0.0528, "num_input_tokens_seen": 9385216, "step": 9770 }, { "epoch": 0.8295145960624576, "grad_norm": 8.160943984985352, "learning_rate": 1.7209609145388538e-07, "loss": 0.0437, "num_input_tokens_seen": 9390080, "step": 9775 }, { "epoch": 0.829938900203666, "grad_norm": 18.256690979003906, "learning_rate": 1.7126624794352563e-07, "loss": 0.0108, "num_input_tokens_seen": 9394304, "step": 9780 }, { "epoch": 0.8303632043448744, "grad_norm": 0.5559505224227905, "learning_rate": 1.7043822259842766e-07, "loss": 0.0646, "num_input_tokens_seen": 9399360, "step": 9785 }, { "epoch": 0.8307875084860828, "grad_norm": 0.38654825091362, "learning_rate": 1.6961201723520247e-07, "loss": 0.0338, "num_input_tokens_seen": 9404352, "step": 9790 }, { "epoch": 0.8312118126272913, "grad_norm": 12.815828323364258, "learning_rate": 1.6878763366646832e-07, "loss": 0.0434, "num_input_tokens_seen": 9409152, "step": 9795 }, { "epoch": 0.8316361167684997, "grad_norm": 8.283032417297363, "learning_rate": 1.6796507370084656e-07, "loss": 0.0449, "num_input_tokens_seen": 9414208, "step": 9800 }, { "epoch": 0.8320604209097081, "grad_norm": 0.2858330309391022, "learning_rate": 1.671443391429581e-07, "loss": 0.0646, "num_input_tokens_seen": 9419008, "step": 9805 }, { "epoch": 0.8324847250509165, "grad_norm": 0.6068195104598999, "learning_rate": 1.6632543179341772e-07, "loss": 0.0368, "num_input_tokens_seen": 9424192, "step": 9810 }, { "epoch": 0.832909029192125, "grad_norm": 21.136734008789062, "learning_rate": 1.6550835344883364e-07, "loss": 0.0235, "num_input_tokens_seen": 9429056, "step": 9815 }, { "epoch": 0.8333333333333334, "grad_norm": 11.528197288513184, "learning_rate": 1.646931059017994e-07, "loss": 0.053, "num_input_tokens_seen": 9433024, "step": 9820 }, { "epoch": 0.8337576374745418, "grad_norm": 21.11509895324707, "learning_rate": 1.6387969094089317e-07, "loss": 0.0435, "num_input_tokens_seen": 9437696, "step": 9825 }, { "epoch": 0.8341819416157502, "grad_norm": 1.8037995100021362, "learning_rate": 1.6306811035067203e-07, "loss": 0.0472, "num_input_tokens_seen": 9442944, "step": 9830 }, { "epoch": 0.8346062457569586, "grad_norm": 0.501308798789978, "learning_rate": 1.6225836591166886e-07, "loss": 0.0394, "num_input_tokens_seen": 9447680, "step": 9835 }, { "epoch": 0.835030549898167, "grad_norm": 27.36667823791504, "learning_rate": 1.6145045940038803e-07, "loss": 0.0774, "num_input_tokens_seen": 9452288, "step": 9840 }, { "epoch": 0.8354548540393755, "grad_norm": 18.382902145385742, "learning_rate": 1.6064439258930217e-07, "loss": 0.063, "num_input_tokens_seen": 9456896, "step": 9845 }, { "epoch": 0.8358791581805839, "grad_norm": 19.6127872467041, "learning_rate": 1.5984016724684658e-07, "loss": 0.0847, "num_input_tokens_seen": 9461632, "step": 9850 }, { "epoch": 0.8363034623217923, "grad_norm": 0.09241819381713867, "learning_rate": 1.5903778513741816e-07, "loss": 0.077, "num_input_tokens_seen": 9466560, "step": 9855 }, { "epoch": 0.8367277664630007, "grad_norm": 0.4436960816383362, "learning_rate": 1.5823724802136862e-07, "loss": 0.0369, "num_input_tokens_seen": 9471168, "step": 9860 }, { "epoch": 0.837152070604209, "grad_norm": 11.254528999328613, "learning_rate": 1.5743855765500258e-07, "loss": 0.105, "num_input_tokens_seen": 9475968, "step": 9865 }, { "epoch": 0.8375763747454175, "grad_norm": 18.34065818786621, "learning_rate": 1.5664171579057273e-07, "loss": 0.0345, "num_input_tokens_seen": 9481280, "step": 9870 }, { "epoch": 0.8380006788866259, "grad_norm": 0.9362542033195496, "learning_rate": 1.5584672417627665e-07, "loss": 0.0235, "num_input_tokens_seen": 9485952, "step": 9875 }, { "epoch": 0.8384249830278343, "grad_norm": 1.7877192497253418, "learning_rate": 1.5505358455625229e-07, "loss": 0.0437, "num_input_tokens_seen": 9490048, "step": 9880 }, { "epoch": 0.8388492871690427, "grad_norm": 22.90734100341797, "learning_rate": 1.5426229867057516e-07, "loss": 0.0855, "num_input_tokens_seen": 9495360, "step": 9885 }, { "epoch": 0.8392735913102511, "grad_norm": 10.30651569366455, "learning_rate": 1.5347286825525252e-07, "loss": 0.0929, "num_input_tokens_seen": 9499968, "step": 9890 }, { "epoch": 0.8396978954514596, "grad_norm": 12.345735549926758, "learning_rate": 1.526852950422226e-07, "loss": 0.0571, "num_input_tokens_seen": 9504704, "step": 9895 }, { "epoch": 0.840122199592668, "grad_norm": 0.13042160868644714, "learning_rate": 1.5189958075934771e-07, "loss": 0.0243, "num_input_tokens_seen": 9509184, "step": 9900 }, { "epoch": 0.8405465037338764, "grad_norm": 16.40341567993164, "learning_rate": 1.5111572713041253e-07, "loss": 0.047, "num_input_tokens_seen": 9514048, "step": 9905 }, { "epoch": 0.8409708078750848, "grad_norm": 6.82795524597168, "learning_rate": 1.5033373587511944e-07, "loss": 0.0214, "num_input_tokens_seen": 9519104, "step": 9910 }, { "epoch": 0.8413951120162932, "grad_norm": 0.75281822681427, "learning_rate": 1.4955360870908505e-07, "loss": 0.0717, "num_input_tokens_seen": 9523840, "step": 9915 }, { "epoch": 0.8418194161575017, "grad_norm": 7.721748352050781, "learning_rate": 1.4877534734383624e-07, "loss": 0.0594, "num_input_tokens_seen": 9528384, "step": 9920 }, { "epoch": 0.8422437202987101, "grad_norm": 17.48251724243164, "learning_rate": 1.4799895348680647e-07, "loss": 0.0608, "num_input_tokens_seen": 9533184, "step": 9925 }, { "epoch": 0.8426680244399185, "grad_norm": 0.6833990812301636, "learning_rate": 1.4722442884133214e-07, "loss": 0.0261, "num_input_tokens_seen": 9538944, "step": 9930 }, { "epoch": 0.8430923285811269, "grad_norm": 1.3696354627609253, "learning_rate": 1.4645177510664886e-07, "loss": 0.0125, "num_input_tokens_seen": 9543296, "step": 9935 }, { "epoch": 0.8435166327223353, "grad_norm": 0.21253393590450287, "learning_rate": 1.4568099397788746e-07, "loss": 0.0706, "num_input_tokens_seen": 9547840, "step": 9940 }, { "epoch": 0.8439409368635438, "grad_norm": 1.1190301179885864, "learning_rate": 1.4491208714607016e-07, "loss": 0.0964, "num_input_tokens_seen": 9552704, "step": 9945 }, { "epoch": 0.8443652410047522, "grad_norm": 6.390957355499268, "learning_rate": 1.4414505629810813e-07, "loss": 0.0629, "num_input_tokens_seen": 9558272, "step": 9950 }, { "epoch": 0.8447895451459606, "grad_norm": 19.948999404907227, "learning_rate": 1.433799031167957e-07, "loss": 0.0944, "num_input_tokens_seen": 9563264, "step": 9955 }, { "epoch": 0.845213849287169, "grad_norm": 0.25374409556388855, "learning_rate": 1.426166292808083e-07, "loss": 0.0298, "num_input_tokens_seen": 9567680, "step": 9960 }, { "epoch": 0.8456381534283774, "grad_norm": 0.17964182794094086, "learning_rate": 1.4185523646469821e-07, "loss": 0.0373, "num_input_tokens_seen": 9572608, "step": 9965 }, { "epoch": 0.8460624575695859, "grad_norm": 13.58020305633545, "learning_rate": 1.410957263388909e-07, "loss": 0.0676, "num_input_tokens_seen": 9577088, "step": 9970 }, { "epoch": 0.8464867617107943, "grad_norm": 0.4978608191013336, "learning_rate": 1.4033810056968155e-07, "loss": 0.0421, "num_input_tokens_seen": 9581952, "step": 9975 }, { "epoch": 0.8469110658520027, "grad_norm": 0.18817739188671112, "learning_rate": 1.3958236081923102e-07, "loss": 0.096, "num_input_tokens_seen": 9586496, "step": 9980 }, { "epoch": 0.8473353699932111, "grad_norm": 0.07502438873052597, "learning_rate": 1.3882850874556207e-07, "loss": 0.0315, "num_input_tokens_seen": 9591296, "step": 9985 }, { "epoch": 0.8477596741344195, "grad_norm": 0.8373299241065979, "learning_rate": 1.3807654600255713e-07, "loss": 0.0639, "num_input_tokens_seen": 9595904, "step": 9990 }, { "epoch": 0.848183978275628, "grad_norm": 42.67544174194336, "learning_rate": 1.373264742399526e-07, "loss": 0.1177, "num_input_tokens_seen": 9600640, "step": 9995 }, { "epoch": 0.8486082824168364, "grad_norm": 2.967900514602661, "learning_rate": 1.3657829510333652e-07, "loss": 0.0569, "num_input_tokens_seen": 9605056, "step": 10000 }, { "epoch": 0.8490325865580448, "grad_norm": 9.756741523742676, "learning_rate": 1.3583201023414493e-07, "loss": 0.0102, "num_input_tokens_seen": 9610112, "step": 10005 }, { "epoch": 0.8494568906992532, "grad_norm": 0.3621211349964142, "learning_rate": 1.350876212696579e-07, "loss": 0.0276, "num_input_tokens_seen": 9615744, "step": 10010 }, { "epoch": 0.8498811948404617, "grad_norm": 12.596657752990723, "learning_rate": 1.3434512984299596e-07, "loss": 0.0089, "num_input_tokens_seen": 9620288, "step": 10015 }, { "epoch": 0.8503054989816701, "grad_norm": 0.3857429325580597, "learning_rate": 1.3360453758311686e-07, "loss": 0.0775, "num_input_tokens_seen": 9625024, "step": 10020 }, { "epoch": 0.8507298031228785, "grad_norm": 0.5006208419799805, "learning_rate": 1.32865846114811e-07, "loss": 0.0471, "num_input_tokens_seen": 9630144, "step": 10025 }, { "epoch": 0.8511541072640869, "grad_norm": 13.02332878112793, "learning_rate": 1.321290570586999e-07, "loss": 0.0665, "num_input_tokens_seen": 9634624, "step": 10030 }, { "epoch": 0.8511541072640869, "eval_loss": 0.05264058709144592, "eval_runtime": 15.9998, "eval_samples_per_second": 654.694, "eval_steps_per_second": 81.876, "num_input_tokens_seen": 9634624, "step": 10030 }, { "epoch": 0.8515784114052953, "grad_norm": 0.15222354233264923, "learning_rate": 1.3139417203123027e-07, "loss": 0.0181, "num_input_tokens_seen": 9639744, "step": 10035 }, { "epoch": 0.8520027155465038, "grad_norm": 0.2849067747592926, "learning_rate": 1.306611926446718e-07, "loss": 0.009, "num_input_tokens_seen": 9644480, "step": 10040 }, { "epoch": 0.8524270196877122, "grad_norm": 0.13256412744522095, "learning_rate": 1.2993012050711406e-07, "loss": 0.0478, "num_input_tokens_seen": 9649408, "step": 10045 }, { "epoch": 0.8528513238289206, "grad_norm": 7.352311134338379, "learning_rate": 1.292009572224614e-07, "loss": 0.0887, "num_input_tokens_seen": 9653440, "step": 10050 }, { "epoch": 0.853275627970129, "grad_norm": 1.3892513513565063, "learning_rate": 1.284737043904306e-07, "loss": 0.0318, "num_input_tokens_seen": 9658176, "step": 10055 }, { "epoch": 0.8536999321113374, "grad_norm": 0.09574834257364273, "learning_rate": 1.2774836360654717e-07, "loss": 0.0806, "num_input_tokens_seen": 9662848, "step": 10060 }, { "epoch": 0.8541242362525459, "grad_norm": 17.671186447143555, "learning_rate": 1.2702493646214207e-07, "loss": 0.0707, "num_input_tokens_seen": 9667392, "step": 10065 }, { "epoch": 0.8545485403937543, "grad_norm": 12.697011947631836, "learning_rate": 1.2630342454434728e-07, "loss": 0.1539, "num_input_tokens_seen": 9672384, "step": 10070 }, { "epoch": 0.8549728445349627, "grad_norm": 10.890848159790039, "learning_rate": 1.2558382943609357e-07, "loss": 0.0521, "num_input_tokens_seen": 9677248, "step": 10075 }, { "epoch": 0.8553971486761711, "grad_norm": 12.55119514465332, "learning_rate": 1.2486615271610558e-07, "loss": 0.0558, "num_input_tokens_seen": 9681536, "step": 10080 }, { "epoch": 0.8558214528173795, "grad_norm": 35.36617660522461, "learning_rate": 1.241503959589003e-07, "loss": 0.081, "num_input_tokens_seen": 9686592, "step": 10085 }, { "epoch": 0.856245756958588, "grad_norm": 25.30384635925293, "learning_rate": 1.234365607347816e-07, "loss": 0.0423, "num_input_tokens_seen": 9691648, "step": 10090 }, { "epoch": 0.8566700610997964, "grad_norm": 26.083160400390625, "learning_rate": 1.22724648609838e-07, "loss": 0.0433, "num_input_tokens_seen": 9696768, "step": 10095 }, { "epoch": 0.8570943652410048, "grad_norm": 11.980801582336426, "learning_rate": 1.2201466114593884e-07, "loss": 0.0689, "num_input_tokens_seen": 9701376, "step": 10100 }, { "epoch": 0.8575186693822132, "grad_norm": 0.25428134202957153, "learning_rate": 1.2130659990073144e-07, "loss": 0.0659, "num_input_tokens_seen": 9705408, "step": 10105 }, { "epoch": 0.8579429735234216, "grad_norm": 25.351072311401367, "learning_rate": 1.206004664276359e-07, "loss": 0.064, "num_input_tokens_seen": 9709824, "step": 10110 }, { "epoch": 0.8583672776646301, "grad_norm": 0.5601566433906555, "learning_rate": 1.198962622758447e-07, "loss": 0.0826, "num_input_tokens_seen": 9715072, "step": 10115 }, { "epoch": 0.8587915818058385, "grad_norm": 2.15639591217041, "learning_rate": 1.1919398899031585e-07, "loss": 0.0706, "num_input_tokens_seen": 9720000, "step": 10120 }, { "epoch": 0.8592158859470469, "grad_norm": 0.6312253475189209, "learning_rate": 1.1849364811177288e-07, "loss": 0.0041, "num_input_tokens_seen": 9724288, "step": 10125 }, { "epoch": 0.8596401900882552, "grad_norm": 0.6518107056617737, "learning_rate": 1.1779524117669837e-07, "loss": 0.0681, "num_input_tokens_seen": 9729280, "step": 10130 }, { "epoch": 0.8600644942294636, "grad_norm": 8.462985038757324, "learning_rate": 1.1709876971733269e-07, "loss": 0.0764, "num_input_tokens_seen": 9733696, "step": 10135 }, { "epoch": 0.860488798370672, "grad_norm": 0.15076375007629395, "learning_rate": 1.1640423526166987e-07, "loss": 0.016, "num_input_tokens_seen": 9738624, "step": 10140 }, { "epoch": 0.8609131025118805, "grad_norm": 0.44147834181785583, "learning_rate": 1.1571163933345462e-07, "loss": 0.0557, "num_input_tokens_seen": 9743488, "step": 10145 }, { "epoch": 0.8613374066530889, "grad_norm": 2.310833215713501, "learning_rate": 1.150209834521777e-07, "loss": 0.0133, "num_input_tokens_seen": 9749632, "step": 10150 }, { "epoch": 0.8617617107942973, "grad_norm": 11.502927780151367, "learning_rate": 1.1433226913307514e-07, "loss": 0.0291, "num_input_tokens_seen": 9754432, "step": 10155 }, { "epoch": 0.8621860149355057, "grad_norm": 1.099219560623169, "learning_rate": 1.1364549788712185e-07, "loss": 0.0293, "num_input_tokens_seen": 9759168, "step": 10160 }, { "epoch": 0.8626103190767141, "grad_norm": 0.827223002910614, "learning_rate": 1.1296067122103059e-07, "loss": 0.012, "num_input_tokens_seen": 9764096, "step": 10165 }, { "epoch": 0.8630346232179226, "grad_norm": 0.387584924697876, "learning_rate": 1.1227779063724818e-07, "loss": 0.0581, "num_input_tokens_seen": 9768768, "step": 10170 }, { "epoch": 0.863458927359131, "grad_norm": 15.812118530273438, "learning_rate": 1.115968576339511e-07, "loss": 0.0188, "num_input_tokens_seen": 9773184, "step": 10175 }, { "epoch": 0.8638832315003394, "grad_norm": 7.606304168701172, "learning_rate": 1.1091787370504347e-07, "loss": 0.0713, "num_input_tokens_seen": 9778688, "step": 10180 }, { "epoch": 0.8643075356415478, "grad_norm": 20.49475860595703, "learning_rate": 1.1024084034015347e-07, "loss": 0.0388, "num_input_tokens_seen": 9783168, "step": 10185 }, { "epoch": 0.8647318397827563, "grad_norm": 14.119841575622559, "learning_rate": 1.095657590246295e-07, "loss": 0.0622, "num_input_tokens_seen": 9787712, "step": 10190 }, { "epoch": 0.8651561439239647, "grad_norm": 17.043668746948242, "learning_rate": 1.0889263123953773e-07, "loss": 0.0461, "num_input_tokens_seen": 9792384, "step": 10195 }, { "epoch": 0.8655804480651731, "grad_norm": 6.3596625328063965, "learning_rate": 1.0822145846165853e-07, "loss": 0.1011, "num_input_tokens_seen": 9797824, "step": 10200 }, { "epoch": 0.8660047522063815, "grad_norm": 2.849475145339966, "learning_rate": 1.0755224216348235e-07, "loss": 0.1421, "num_input_tokens_seen": 9802880, "step": 10205 }, { "epoch": 0.8664290563475899, "grad_norm": 10.186467170715332, "learning_rate": 1.0688498381320854e-07, "loss": 0.088, "num_input_tokens_seen": 9807424, "step": 10210 }, { "epoch": 0.8668533604887984, "grad_norm": 15.472275733947754, "learning_rate": 1.0621968487473975e-07, "loss": 0.0545, "num_input_tokens_seen": 9812480, "step": 10215 }, { "epoch": 0.8672776646300068, "grad_norm": 72.3904037475586, "learning_rate": 1.0555634680768066e-07, "loss": 0.0356, "num_input_tokens_seen": 9816960, "step": 10220 }, { "epoch": 0.8677019687712152, "grad_norm": 0.23482580482959747, "learning_rate": 1.0489497106733347e-07, "loss": 0.0259, "num_input_tokens_seen": 9821568, "step": 10225 }, { "epoch": 0.8681262729124236, "grad_norm": 2.643237829208374, "learning_rate": 1.0423555910469561e-07, "loss": 0.0805, "num_input_tokens_seen": 9826048, "step": 10230 }, { "epoch": 0.868550577053632, "grad_norm": 7.882426738739014, "learning_rate": 1.0357811236645597e-07, "loss": 0.0412, "num_input_tokens_seen": 9830720, "step": 10235 }, { "epoch": 0.8689748811948405, "grad_norm": 7.657166481018066, "learning_rate": 1.0292263229499209e-07, "loss": 0.0653, "num_input_tokens_seen": 9835648, "step": 10240 }, { "epoch": 0.8693991853360489, "grad_norm": 9.319342613220215, "learning_rate": 1.022691203283661e-07, "loss": 0.1175, "num_input_tokens_seen": 9839936, "step": 10245 }, { "epoch": 0.8698234894772573, "grad_norm": 0.1383010298013687, "learning_rate": 1.0161757790032355e-07, "loss": 0.0568, "num_input_tokens_seen": 9844608, "step": 10250 }, { "epoch": 0.8702477936184657, "grad_norm": 1.5263642072677612, "learning_rate": 1.0096800644028791e-07, "loss": 0.0191, "num_input_tokens_seen": 9848896, "step": 10255 }, { "epoch": 0.8706720977596741, "grad_norm": 6.711588382720947, "learning_rate": 1.003204073733589e-07, "loss": 0.0822, "num_input_tokens_seen": 9853184, "step": 10260 }, { "epoch": 0.8710964019008826, "grad_norm": 0.34313127398490906, "learning_rate": 9.967478212030923e-08, "loss": 0.0627, "num_input_tokens_seen": 9857856, "step": 10265 }, { "epoch": 0.871520706042091, "grad_norm": 37.75361251831055, "learning_rate": 9.903113209758096e-08, "loss": 0.0556, "num_input_tokens_seen": 9862592, "step": 10270 }, { "epoch": 0.8719450101832994, "grad_norm": 12.846491813659668, "learning_rate": 9.838945871728266e-08, "loss": 0.0582, "num_input_tokens_seen": 9867584, "step": 10275 }, { "epoch": 0.8723693143245078, "grad_norm": 4.416686534881592, "learning_rate": 9.774976338718677e-08, "loss": 0.0089, "num_input_tokens_seen": 9872384, "step": 10280 }, { "epoch": 0.8727936184657162, "grad_norm": 27.68250846862793, "learning_rate": 9.711204751072499e-08, "loss": 0.0969, "num_input_tokens_seen": 9876672, "step": 10285 }, { "epoch": 0.8732179226069247, "grad_norm": 31.03038215637207, "learning_rate": 9.647631248698773e-08, "loss": 0.0259, "num_input_tokens_seen": 9881792, "step": 10290 }, { "epoch": 0.8736422267481331, "grad_norm": 13.698047637939453, "learning_rate": 9.584255971071886e-08, "loss": 0.1201, "num_input_tokens_seen": 9886464, "step": 10295 }, { "epoch": 0.8740665308893415, "grad_norm": 19.63283348083496, "learning_rate": 9.521079057231274e-08, "loss": 0.0121, "num_input_tokens_seen": 9891264, "step": 10300 }, { "epoch": 0.8744908350305499, "grad_norm": 5.031111240386963, "learning_rate": 9.45810064578133e-08, "loss": 0.0298, "num_input_tokens_seen": 9896320, "step": 10305 }, { "epoch": 0.8749151391717583, "grad_norm": 16.013206481933594, "learning_rate": 9.39532087489081e-08, "loss": 0.0864, "num_input_tokens_seen": 9901504, "step": 10310 }, { "epoch": 0.8753394433129668, "grad_norm": 10.649767875671387, "learning_rate": 9.33273988229275e-08, "loss": 0.0504, "num_input_tokens_seen": 9907008, "step": 10315 }, { "epoch": 0.8757637474541752, "grad_norm": 7.6332197189331055, "learning_rate": 9.270357805284057e-08, "loss": 0.0989, "num_input_tokens_seen": 9911744, "step": 10320 }, { "epoch": 0.8761880515953836, "grad_norm": 12.951970100402832, "learning_rate": 9.208174780725253e-08, "loss": 0.1047, "num_input_tokens_seen": 9916096, "step": 10325 }, { "epoch": 0.876612355736592, "grad_norm": 1.121921181678772, "learning_rate": 9.146190945040145e-08, "loss": 0.0295, "num_input_tokens_seen": 9920448, "step": 10330 }, { "epoch": 0.8770366598778004, "grad_norm": 16.190587997436523, "learning_rate": 9.084406434215553e-08, "loss": 0.0886, "num_input_tokens_seen": 9925312, "step": 10335 }, { "epoch": 0.8774609640190089, "grad_norm": 18.056102752685547, "learning_rate": 9.022821383800926e-08, "loss": 0.0846, "num_input_tokens_seen": 9929920, "step": 10340 }, { "epoch": 0.8778852681602173, "grad_norm": 1.8114804029464722, "learning_rate": 8.961435928908267e-08, "loss": 0.0035, "num_input_tokens_seen": 9934912, "step": 10345 }, { "epoch": 0.8783095723014257, "grad_norm": 19.24297332763672, "learning_rate": 8.900250204211513e-08, "loss": 0.0809, "num_input_tokens_seen": 9939520, "step": 10350 }, { "epoch": 0.8787338764426341, "grad_norm": 28.829084396362305, "learning_rate": 8.839264343946506e-08, "loss": 0.0441, "num_input_tokens_seen": 9944384, "step": 10355 }, { "epoch": 0.8791581805838425, "grad_norm": 9.343367576599121, "learning_rate": 8.778478481910611e-08, "loss": 0.0557, "num_input_tokens_seen": 9949056, "step": 10360 }, { "epoch": 0.879582484725051, "grad_norm": 15.344569206237793, "learning_rate": 8.717892751462363e-08, "loss": 0.0435, "num_input_tokens_seen": 9954176, "step": 10365 }, { "epoch": 0.8800067888662594, "grad_norm": 15.17747974395752, "learning_rate": 8.657507285521281e-08, "loss": 0.0449, "num_input_tokens_seen": 9958912, "step": 10370 }, { "epoch": 0.8804310930074678, "grad_norm": 12.388559341430664, "learning_rate": 8.597322216567493e-08, "loss": 0.0793, "num_input_tokens_seen": 9963648, "step": 10375 }, { "epoch": 0.8808553971486762, "grad_norm": 13.636301040649414, "learning_rate": 8.537337676641442e-08, "loss": 0.0145, "num_input_tokens_seen": 9968256, "step": 10380 }, { "epoch": 0.8812797012898846, "grad_norm": 0.26373496651649475, "learning_rate": 8.477553797343728e-08, "loss": 0.0439, "num_input_tokens_seen": 9973376, "step": 10385 }, { "epoch": 0.881704005431093, "grad_norm": 20.62845230102539, "learning_rate": 8.41797070983461e-08, "loss": 0.0804, "num_input_tokens_seen": 9978240, "step": 10390 }, { "epoch": 0.8821283095723014, "grad_norm": 0.17017631232738495, "learning_rate": 8.358588544833877e-08, "loss": 0.003, "num_input_tokens_seen": 9982784, "step": 10395 }, { "epoch": 0.8825526137135098, "grad_norm": 5.08088493347168, "learning_rate": 8.29940743262052e-08, "loss": 0.0894, "num_input_tokens_seen": 9987008, "step": 10400 }, { "epoch": 0.8829769178547182, "grad_norm": 11.488354682922363, "learning_rate": 8.240427503032443e-08, "loss": 0.0803, "num_input_tokens_seen": 9992640, "step": 10405 }, { "epoch": 0.8834012219959266, "grad_norm": 0.7701570987701416, "learning_rate": 8.181648885466141e-08, "loss": 0.0592, "num_input_tokens_seen": 9996672, "step": 10410 }, { "epoch": 0.883825526137135, "grad_norm": 5.4983601570129395, "learning_rate": 8.123071708876473e-08, "loss": 0.07, "num_input_tokens_seen": 10001216, "step": 10415 }, { "epoch": 0.8842498302783435, "grad_norm": 2.3859031200408936, "learning_rate": 8.064696101776358e-08, "loss": 0.0299, "num_input_tokens_seen": 10006144, "step": 10420 }, { "epoch": 0.8846741344195519, "grad_norm": 23.999298095703125, "learning_rate": 8.006522192236487e-08, "loss": 0.0395, "num_input_tokens_seen": 10011520, "step": 10425 }, { "epoch": 0.8850984385607603, "grad_norm": 1.3761314153671265, "learning_rate": 7.948550107885043e-08, "loss": 0.0071, "num_input_tokens_seen": 10016512, "step": 10430 }, { "epoch": 0.8855227427019687, "grad_norm": 33.45118713378906, "learning_rate": 7.89077997590738e-08, "loss": 0.0866, "num_input_tokens_seen": 10020928, "step": 10435 }, { "epoch": 0.8859470468431772, "grad_norm": 1.5158158540725708, "learning_rate": 7.833211923045891e-08, "loss": 0.0255, "num_input_tokens_seen": 10025920, "step": 10440 }, { "epoch": 0.8863713509843856, "grad_norm": 14.920109748840332, "learning_rate": 7.775846075599524e-08, "loss": 0.0434, "num_input_tokens_seen": 10030464, "step": 10445 }, { "epoch": 0.886795655125594, "grad_norm": 4.014338970184326, "learning_rate": 7.718682559423651e-08, "loss": 0.0498, "num_input_tokens_seen": 10035328, "step": 10450 }, { "epoch": 0.8872199592668024, "grad_norm": 30.575336456298828, "learning_rate": 7.661721499929752e-08, "loss": 0.0692, "num_input_tokens_seen": 10040384, "step": 10455 }, { "epoch": 0.8876442634080108, "grad_norm": 0.6011759042739868, "learning_rate": 7.60496302208512e-08, "loss": 0.0334, "num_input_tokens_seen": 10045440, "step": 10460 }, { "epoch": 0.8880685675492193, "grad_norm": 5.954895973205566, "learning_rate": 7.548407250412614e-08, "loss": 0.0272, "num_input_tokens_seen": 10050432, "step": 10465 }, { "epoch": 0.8884928716904277, "grad_norm": 23.370336532592773, "learning_rate": 7.492054308990381e-08, "loss": 0.0403, "num_input_tokens_seen": 10055296, "step": 10470 }, { "epoch": 0.8889171758316361, "grad_norm": 15.24754810333252, "learning_rate": 7.435904321451524e-08, "loss": 0.0513, "num_input_tokens_seen": 10060416, "step": 10475 }, { "epoch": 0.8893414799728445, "grad_norm": 0.8229270577430725, "learning_rate": 7.379957410983995e-08, "loss": 0.0397, "num_input_tokens_seen": 10065472, "step": 10480 }, { "epoch": 0.8897657841140529, "grad_norm": 9.5431489944458, "learning_rate": 7.324213700330095e-08, "loss": 0.0267, "num_input_tokens_seen": 10070784, "step": 10485 }, { "epoch": 0.8901900882552614, "grad_norm": 0.07072459161281586, "learning_rate": 7.268673311786378e-08, "loss": 0.0202, "num_input_tokens_seen": 10075904, "step": 10490 }, { "epoch": 0.8906143923964698, "grad_norm": 10.91126823425293, "learning_rate": 7.213336367203338e-08, "loss": 0.0229, "num_input_tokens_seen": 10080768, "step": 10495 }, { "epoch": 0.8910386965376782, "grad_norm": 0.6212610006332397, "learning_rate": 7.158202987985106e-08, "loss": 0.0579, "num_input_tokens_seen": 10085312, "step": 10500 }, { "epoch": 0.8914630006788866, "grad_norm": 0.43236419558525085, "learning_rate": 7.10327329508923e-08, "loss": 0.1098, "num_input_tokens_seen": 10089792, "step": 10505 }, { "epoch": 0.891887304820095, "grad_norm": 7.02418851852417, "learning_rate": 7.048547409026384e-08, "loss": 0.0712, "num_input_tokens_seen": 10094976, "step": 10510 }, { "epoch": 0.8923116089613035, "grad_norm": 0.24239858984947205, "learning_rate": 6.994025449860064e-08, "loss": 0.0331, "num_input_tokens_seen": 10099200, "step": 10515 }, { "epoch": 0.8927359131025119, "grad_norm": 23.110071182250977, "learning_rate": 6.939707537206485e-08, "loss": 0.075, "num_input_tokens_seen": 10104320, "step": 10520 }, { "epoch": 0.8931602172437203, "grad_norm": 0.2738831043243408, "learning_rate": 6.885593790234056e-08, "loss": 0.0464, "num_input_tokens_seen": 10109312, "step": 10525 }, { "epoch": 0.8935845213849287, "grad_norm": 0.8342077136039734, "learning_rate": 6.831684327663367e-08, "loss": 0.0293, "num_input_tokens_seen": 10113600, "step": 10530 }, { "epoch": 0.8940088255261371, "grad_norm": 6.28220272064209, "learning_rate": 6.777979267766786e-08, "loss": 0.0048, "num_input_tokens_seen": 10118272, "step": 10535 }, { "epoch": 0.8944331296673456, "grad_norm": 28.221162796020508, "learning_rate": 6.724478728368277e-08, "loss": 0.059, "num_input_tokens_seen": 10122688, "step": 10540 }, { "epoch": 0.894857433808554, "grad_norm": 5.227849960327148, "learning_rate": 6.671182826843047e-08, "loss": 0.1179, "num_input_tokens_seen": 10128000, "step": 10545 }, { "epoch": 0.8952817379497624, "grad_norm": 6.202547073364258, "learning_rate": 6.618091680117399e-08, "loss": 0.0508, "num_input_tokens_seen": 10132544, "step": 10550 }, { "epoch": 0.8957060420909708, "grad_norm": 7.8940629959106445, "learning_rate": 6.565205404668395e-08, "loss": 0.0739, "num_input_tokens_seen": 10138496, "step": 10555 }, { "epoch": 0.8961303462321792, "grad_norm": 29.633386611938477, "learning_rate": 6.512524116523633e-08, "loss": 0.0383, "num_input_tokens_seen": 10143424, "step": 10560 }, { "epoch": 0.8965546503733877, "grad_norm": 23.16342544555664, "learning_rate": 6.460047931261003e-08, "loss": 0.0861, "num_input_tokens_seen": 10148672, "step": 10565 }, { "epoch": 0.8969789545145961, "grad_norm": 7.808042526245117, "learning_rate": 6.407776964008383e-08, "loss": 0.0124, "num_input_tokens_seen": 10153408, "step": 10570 }, { "epoch": 0.8974032586558045, "grad_norm": 0.7497434020042419, "learning_rate": 6.355711329443481e-08, "loss": 0.055, "num_input_tokens_seen": 10157952, "step": 10575 }, { "epoch": 0.8978275627970129, "grad_norm": 0.13198748230934143, "learning_rate": 6.303851141793437e-08, "loss": 0.0355, "num_input_tokens_seen": 10163712, "step": 10580 }, { "epoch": 0.8982518669382213, "grad_norm": 18.96082305908203, "learning_rate": 6.252196514834751e-08, "loss": 0.0312, "num_input_tokens_seen": 10168512, "step": 10585 }, { "epoch": 0.8986761710794298, "grad_norm": 33.99904251098633, "learning_rate": 6.200747561892882e-08, "loss": 0.0404, "num_input_tokens_seen": 10173120, "step": 10590 }, { "epoch": 0.8991004752206382, "grad_norm": 5.6009368896484375, "learning_rate": 6.149504395842087e-08, "loss": 0.0617, "num_input_tokens_seen": 10177856, "step": 10595 }, { "epoch": 0.8995247793618466, "grad_norm": 21.8309383392334, "learning_rate": 6.098467129105123e-08, "loss": 0.0328, "num_input_tokens_seen": 10182080, "step": 10600 }, { "epoch": 0.899949083503055, "grad_norm": 27.717575073242188, "learning_rate": 6.047635873653068e-08, "loss": 0.0675, "num_input_tokens_seen": 10186496, "step": 10605 }, { "epoch": 0.9003733876442634, "grad_norm": 20.762332916259766, "learning_rate": 5.997010741004949e-08, "loss": 0.0368, "num_input_tokens_seen": 10190912, "step": 10610 }, { "epoch": 0.9007976917854719, "grad_norm": 27.49617576599121, "learning_rate": 5.946591842227677e-08, "loss": 0.0525, "num_input_tokens_seen": 10195008, "step": 10615 }, { "epoch": 0.9012219959266803, "grad_norm": 9.339949607849121, "learning_rate": 5.8963792879356265e-08, "loss": 0.0855, "num_input_tokens_seen": 10199424, "step": 10620 }, { "epoch": 0.9012219959266803, "eval_loss": 0.05229973420500755, "eval_runtime": 15.7294, "eval_samples_per_second": 665.95, "eval_steps_per_second": 83.284, "num_input_tokens_seen": 10199424, "step": 10620 }, { "epoch": 0.9016463000678887, "grad_norm": 14.488176345825195, "learning_rate": 5.84637318829051e-08, "loss": 0.1033, "num_input_tokens_seen": 10203968, "step": 10625 }, { "epoch": 0.9020706042090971, "grad_norm": 37.42259216308594, "learning_rate": 5.796573653001091e-08, "loss": 0.0844, "num_input_tokens_seen": 10208640, "step": 10630 }, { "epoch": 0.9024949083503055, "grad_norm": 0.3053247332572937, "learning_rate": 5.746980791322942e-08, "loss": 0.0136, "num_input_tokens_seen": 10213504, "step": 10635 }, { "epoch": 0.902919212491514, "grad_norm": 9.757519721984863, "learning_rate": 5.697594712058218e-08, "loss": 0.0535, "num_input_tokens_seen": 10218432, "step": 10640 }, { "epoch": 0.9033435166327224, "grad_norm": 0.3384649157524109, "learning_rate": 5.6484155235554275e-08, "loss": 0.09, "num_input_tokens_seen": 10223296, "step": 10645 }, { "epoch": 0.9037678207739308, "grad_norm": 9.248594284057617, "learning_rate": 5.599443333709131e-08, "loss": 0.1227, "num_input_tokens_seen": 10227904, "step": 10650 }, { "epoch": 0.9041921249151391, "grad_norm": 0.7518671154975891, "learning_rate": 5.5506782499598394e-08, "loss": 0.0481, "num_input_tokens_seen": 10232640, "step": 10655 }, { "epoch": 0.9046164290563475, "grad_norm": 22.829261779785156, "learning_rate": 5.502120379293585e-08, "loss": 0.0891, "num_input_tokens_seen": 10236864, "step": 10660 }, { "epoch": 0.905040733197556, "grad_norm": 0.5002491474151611, "learning_rate": 5.453769828241872e-08, "loss": 0.0525, "num_input_tokens_seen": 10241216, "step": 10665 }, { "epoch": 0.9054650373387644, "grad_norm": 13.650254249572754, "learning_rate": 5.4056267028813606e-08, "loss": 0.0544, "num_input_tokens_seen": 10245952, "step": 10670 }, { "epoch": 0.9058893414799728, "grad_norm": 41.74579620361328, "learning_rate": 5.357691108833584e-08, "loss": 0.0321, "num_input_tokens_seen": 10251392, "step": 10675 }, { "epoch": 0.9063136456211812, "grad_norm": 5.183800220489502, "learning_rate": 5.309963151264829e-08, "loss": 0.1047, "num_input_tokens_seen": 10256128, "step": 10680 }, { "epoch": 0.9067379497623896, "grad_norm": 3.764021396636963, "learning_rate": 5.262442934885813e-08, "loss": 0.043, "num_input_tokens_seen": 10260352, "step": 10685 }, { "epoch": 0.9071622539035981, "grad_norm": 0.09938930720090866, "learning_rate": 5.21513056395152e-08, "loss": 0.0157, "num_input_tokens_seen": 10265344, "step": 10690 }, { "epoch": 0.9075865580448065, "grad_norm": 2.3056552410125732, "learning_rate": 5.168026142260862e-08, "loss": 0.0575, "num_input_tokens_seen": 10269632, "step": 10695 }, { "epoch": 0.9080108621860149, "grad_norm": 13.903623580932617, "learning_rate": 5.121129773156663e-08, "loss": 0.0206, "num_input_tokens_seen": 10274560, "step": 10700 }, { "epoch": 0.9084351663272233, "grad_norm": 0.24791185557842255, "learning_rate": 5.074441559525167e-08, "loss": 0.0327, "num_input_tokens_seen": 10279552, "step": 10705 }, { "epoch": 0.9088594704684317, "grad_norm": 0.47743096947669983, "learning_rate": 5.027961603796027e-08, "loss": 0.0499, "num_input_tokens_seen": 10284288, "step": 10710 }, { "epoch": 0.9092837746096402, "grad_norm": 15.714171409606934, "learning_rate": 4.981690007941952e-08, "loss": 0.0414, "num_input_tokens_seen": 10289600, "step": 10715 }, { "epoch": 0.9097080787508486, "grad_norm": 4.764758586883545, "learning_rate": 4.93562687347856e-08, "loss": 0.0168, "num_input_tokens_seen": 10294144, "step": 10720 }, { "epoch": 0.910132382892057, "grad_norm": 8.239587783813477, "learning_rate": 4.889772301464112e-08, "loss": 0.0495, "num_input_tokens_seen": 10298752, "step": 10725 }, { "epoch": 0.9105566870332654, "grad_norm": 13.610047340393066, "learning_rate": 4.844126392499304e-08, "loss": 0.0437, "num_input_tokens_seen": 10303424, "step": 10730 }, { "epoch": 0.9109809911744738, "grad_norm": 23.149311065673828, "learning_rate": 4.7986892467270057e-08, "loss": 0.0589, "num_input_tokens_seen": 10308096, "step": 10735 }, { "epoch": 0.9114052953156823, "grad_norm": 6.693690299987793, "learning_rate": 4.7534609638321785e-08, "loss": 0.0508, "num_input_tokens_seen": 10313152, "step": 10740 }, { "epoch": 0.9118295994568907, "grad_norm": 11.246514320373535, "learning_rate": 4.70844164304145e-08, "loss": 0.0348, "num_input_tokens_seen": 10318016, "step": 10745 }, { "epoch": 0.9122539035980991, "grad_norm": 0.46989282965660095, "learning_rate": 4.663631383123057e-08, "loss": 0.0304, "num_input_tokens_seen": 10322432, "step": 10750 }, { "epoch": 0.9126782077393075, "grad_norm": 0.2042643278837204, "learning_rate": 4.61903028238656e-08, "loss": 0.0115, "num_input_tokens_seen": 10327168, "step": 10755 }, { "epoch": 0.9131025118805159, "grad_norm": 1.085499882698059, "learning_rate": 4.5746384386826767e-08, "loss": 0.0781, "num_input_tokens_seen": 10331584, "step": 10760 }, { "epoch": 0.9135268160217244, "grad_norm": 0.2616358697414398, "learning_rate": 4.5304559494030004e-08, "loss": 0.0269, "num_input_tokens_seen": 10336640, "step": 10765 }, { "epoch": 0.9139511201629328, "grad_norm": 9.98695182800293, "learning_rate": 4.486482911479839e-08, "loss": 0.0617, "num_input_tokens_seen": 10341440, "step": 10770 }, { "epoch": 0.9143754243041412, "grad_norm": 2.2652738094329834, "learning_rate": 4.442719421385921e-08, "loss": 0.0104, "num_input_tokens_seen": 10346624, "step": 10775 }, { "epoch": 0.9147997284453496, "grad_norm": 0.2195603996515274, "learning_rate": 4.399165575134378e-08, "loss": 0.0092, "num_input_tokens_seen": 10351552, "step": 10780 }, { "epoch": 0.915224032586558, "grad_norm": 9.171103477478027, "learning_rate": 4.3558214682782645e-08, "loss": 0.0601, "num_input_tokens_seen": 10356352, "step": 10785 }, { "epoch": 0.9156483367277665, "grad_norm": 0.11924657225608826, "learning_rate": 4.312687195910558e-08, "loss": 0.0717, "num_input_tokens_seen": 10361920, "step": 10790 }, { "epoch": 0.9160726408689749, "grad_norm": 5.790630340576172, "learning_rate": 4.269762852663894e-08, "loss": 0.0724, "num_input_tokens_seen": 10366272, "step": 10795 }, { "epoch": 0.9164969450101833, "grad_norm": 18.95665168762207, "learning_rate": 4.227048532710287e-08, "loss": 0.0167, "num_input_tokens_seen": 10371328, "step": 10800 }, { "epoch": 0.9169212491513917, "grad_norm": 24.164165496826172, "learning_rate": 4.184544329761008e-08, "loss": 0.0604, "num_input_tokens_seen": 10376384, "step": 10805 }, { "epoch": 0.9173455532926001, "grad_norm": 6.414123058319092, "learning_rate": 4.1422503370663553e-08, "loss": 0.0687, "num_input_tokens_seen": 10381120, "step": 10810 }, { "epoch": 0.9177698574338086, "grad_norm": 0.15381702780723572, "learning_rate": 4.100166647415437e-08, "loss": 0.0225, "num_input_tokens_seen": 10385536, "step": 10815 }, { "epoch": 0.918194161575017, "grad_norm": 1.0370328426361084, "learning_rate": 4.058293353135988e-08, "loss": 0.0188, "num_input_tokens_seen": 10390208, "step": 10820 }, { "epoch": 0.9186184657162254, "grad_norm": 0.18410193920135498, "learning_rate": 4.016630546094158e-08, "loss": 0.051, "num_input_tokens_seen": 10394560, "step": 10825 }, { "epoch": 0.9190427698574338, "grad_norm": 1.0141817331314087, "learning_rate": 3.975178317694239e-08, "loss": 0.0707, "num_input_tokens_seen": 10398848, "step": 10830 }, { "epoch": 0.9194670739986422, "grad_norm": 11.05111026763916, "learning_rate": 3.9339367588786644e-08, "loss": 0.036, "num_input_tokens_seen": 10404160, "step": 10835 }, { "epoch": 0.9198913781398507, "grad_norm": 6.632215976715088, "learning_rate": 3.892905960127546e-08, "loss": 0.051, "num_input_tokens_seen": 10408704, "step": 10840 }, { "epoch": 0.9203156822810591, "grad_norm": 11.019655227661133, "learning_rate": 3.852086011458688e-08, "loss": 0.0547, "num_input_tokens_seen": 10413312, "step": 10845 }, { "epoch": 0.9207399864222675, "grad_norm": 26.46409797668457, "learning_rate": 3.811477002427288e-08, "loss": 0.048, "num_input_tokens_seen": 10418048, "step": 10850 }, { "epoch": 0.9211642905634759, "grad_norm": 0.4152030348777771, "learning_rate": 3.771079022125745e-08, "loss": 0.0239, "num_input_tokens_seen": 10422464, "step": 10855 }, { "epoch": 0.9215885947046843, "grad_norm": 33.51541519165039, "learning_rate": 3.7308921591835074e-08, "loss": 0.1148, "num_input_tokens_seen": 10426880, "step": 10860 }, { "epoch": 0.9220128988458928, "grad_norm": 16.225305557250977, "learning_rate": 3.6909165017668385e-08, "loss": 0.1021, "num_input_tokens_seen": 10431232, "step": 10865 }, { "epoch": 0.9224372029871012, "grad_norm": 26.616647720336914, "learning_rate": 3.651152137578617e-08, "loss": 0.1128, "num_input_tokens_seen": 10437120, "step": 10870 }, { "epoch": 0.9228615071283096, "grad_norm": 12.144709587097168, "learning_rate": 3.611599153858214e-08, "loss": 0.1289, "num_input_tokens_seen": 10443456, "step": 10875 }, { "epoch": 0.923285811269518, "grad_norm": 6.237645149230957, "learning_rate": 3.572257637381182e-08, "loss": 0.0611, "num_input_tokens_seen": 10448576, "step": 10880 }, { "epoch": 0.9237101154107265, "grad_norm": 2.817307472229004, "learning_rate": 3.533127674459202e-08, "loss": 0.0389, "num_input_tokens_seen": 10453440, "step": 10885 }, { "epoch": 0.9241344195519349, "grad_norm": 0.3340953290462494, "learning_rate": 3.494209350939792e-08, "loss": 0.0268, "num_input_tokens_seen": 10458176, "step": 10890 }, { "epoch": 0.9245587236931433, "grad_norm": 10.947976112365723, "learning_rate": 3.455502752206152e-08, "loss": 0.0594, "num_input_tokens_seen": 10463232, "step": 10895 }, { "epoch": 0.9249830278343517, "grad_norm": 0.28482815623283386, "learning_rate": 3.4170079631769764e-08, "loss": 0.0892, "num_input_tokens_seen": 10468032, "step": 10900 }, { "epoch": 0.9254073319755601, "grad_norm": 4.82904052734375, "learning_rate": 3.378725068306298e-08, "loss": 0.0135, "num_input_tokens_seen": 10472832, "step": 10905 }, { "epoch": 0.9258316361167686, "grad_norm": 0.40435436367988586, "learning_rate": 3.3406541515832e-08, "loss": 0.0199, "num_input_tokens_seen": 10477248, "step": 10910 }, { "epoch": 0.926255940257977, "grad_norm": 19.899160385131836, "learning_rate": 3.302795296531813e-08, "loss": 0.057, "num_input_tokens_seen": 10481920, "step": 10915 }, { "epoch": 0.9266802443991853, "grad_norm": 6.992005825042725, "learning_rate": 3.265148586210942e-08, "loss": 0.0741, "num_input_tokens_seen": 10486976, "step": 10920 }, { "epoch": 0.9271045485403937, "grad_norm": 31.746776580810547, "learning_rate": 3.2277141032139746e-08, "loss": 0.0575, "num_input_tokens_seen": 10491712, "step": 10925 }, { "epoch": 0.9275288526816021, "grad_norm": 35.81395721435547, "learning_rate": 3.190491929668748e-08, "loss": 0.0889, "num_input_tokens_seen": 10496960, "step": 10930 }, { "epoch": 0.9279531568228105, "grad_norm": 13.375768661499023, "learning_rate": 3.15348214723723e-08, "loss": 0.0602, "num_input_tokens_seen": 10501120, "step": 10935 }, { "epoch": 0.928377460964019, "grad_norm": 0.4103608727455139, "learning_rate": 3.11668483711548e-08, "loss": 0.0228, "num_input_tokens_seen": 10507008, "step": 10940 }, { "epoch": 0.9288017651052274, "grad_norm": 18.951581954956055, "learning_rate": 3.0801000800333876e-08, "loss": 0.0284, "num_input_tokens_seen": 10511424, "step": 10945 }, { "epoch": 0.9292260692464358, "grad_norm": 38.00094985961914, "learning_rate": 3.043727956254538e-08, "loss": 0.0394, "num_input_tokens_seen": 10515968, "step": 10950 }, { "epoch": 0.9296503733876442, "grad_norm": 37.419803619384766, "learning_rate": 3.007568545576011e-08, "loss": 0.0882, "num_input_tokens_seen": 10520576, "step": 10955 }, { "epoch": 0.9300746775288526, "grad_norm": 1.1467417478561401, "learning_rate": 2.971621927328216e-08, "loss": 0.0491, "num_input_tokens_seen": 10525504, "step": 10960 }, { "epoch": 0.9304989816700611, "grad_norm": 0.40472060441970825, "learning_rate": 2.9358881803746794e-08, "loss": 0.1112, "num_input_tokens_seen": 10530112, "step": 10965 }, { "epoch": 0.9309232858112695, "grad_norm": 0.4380423426628113, "learning_rate": 2.900367383111979e-08, "loss": 0.0864, "num_input_tokens_seen": 10535232, "step": 10970 }, { "epoch": 0.9313475899524779, "grad_norm": 18.734893798828125, "learning_rate": 2.865059613469434e-08, "loss": 0.0258, "num_input_tokens_seen": 10539712, "step": 10975 }, { "epoch": 0.9317718940936863, "grad_norm": 2.202915906906128, "learning_rate": 2.829964948909047e-08, "loss": 0.057, "num_input_tokens_seen": 10544128, "step": 10980 }, { "epoch": 0.9321961982348947, "grad_norm": 9.770487785339355, "learning_rate": 2.795083466425252e-08, "loss": 0.0174, "num_input_tokens_seen": 10548288, "step": 10985 }, { "epoch": 0.9326205023761032, "grad_norm": 29.964080810546875, "learning_rate": 2.760415242544811e-08, "loss": 0.0176, "num_input_tokens_seen": 10552512, "step": 10990 }, { "epoch": 0.9330448065173116, "grad_norm": 1.10525381565094, "learning_rate": 2.7259603533266063e-08, "loss": 0.0617, "num_input_tokens_seen": 10557952, "step": 10995 }, { "epoch": 0.93346911065852, "grad_norm": 0.8512893319129944, "learning_rate": 2.6917188743614704e-08, "loss": 0.0529, "num_input_tokens_seen": 10562240, "step": 11000 }, { "epoch": 0.9338934147997284, "grad_norm": 3.778372049331665, "learning_rate": 2.6576908807720233e-08, "loss": 0.0832, "num_input_tokens_seen": 10566976, "step": 11005 }, { "epoch": 0.9343177189409368, "grad_norm": 22.075092315673828, "learning_rate": 2.623876447212592e-08, "loss": 0.0393, "num_input_tokens_seen": 10571584, "step": 11010 }, { "epoch": 0.9347420230821453, "grad_norm": 16.559036254882812, "learning_rate": 2.590275647868867e-08, "loss": 0.082, "num_input_tokens_seen": 10576832, "step": 11015 }, { "epoch": 0.9351663272233537, "grad_norm": 30.172012329101562, "learning_rate": 2.5568885564579258e-08, "loss": 0.0305, "num_input_tokens_seen": 10581184, "step": 11020 }, { "epoch": 0.9355906313645621, "grad_norm": 3.05375337600708, "learning_rate": 2.5237152462279532e-08, "loss": 0.0419, "num_input_tokens_seen": 10585792, "step": 11025 }, { "epoch": 0.9360149355057705, "grad_norm": 0.48022231459617615, "learning_rate": 2.4907557899581212e-08, "loss": 0.0216, "num_input_tokens_seen": 10590016, "step": 11030 }, { "epoch": 0.936439239646979, "grad_norm": 1.483380913734436, "learning_rate": 2.4580102599584317e-08, "loss": 0.0183, "num_input_tokens_seen": 10594624, "step": 11035 }, { "epoch": 0.9368635437881874, "grad_norm": 13.737578392028809, "learning_rate": 2.425478728069552e-08, "loss": 0.1089, "num_input_tokens_seen": 10598912, "step": 11040 }, { "epoch": 0.9372878479293958, "grad_norm": 1.424195647239685, "learning_rate": 2.3931612656626688e-08, "loss": 0.0255, "num_input_tokens_seen": 10603648, "step": 11045 }, { "epoch": 0.9377121520706042, "grad_norm": 4.536886692047119, "learning_rate": 2.3610579436392996e-08, "loss": 0.0221, "num_input_tokens_seen": 10608640, "step": 11050 }, { "epoch": 0.9381364562118126, "grad_norm": 18.450544357299805, "learning_rate": 2.329168832431161e-08, "loss": 0.1013, "num_input_tokens_seen": 10613312, "step": 11055 }, { "epoch": 0.938560760353021, "grad_norm": 0.3562639057636261, "learning_rate": 2.2974940020000112e-08, "loss": 0.0113, "num_input_tokens_seen": 10617600, "step": 11060 }, { "epoch": 0.9389850644942295, "grad_norm": 27.98299789428711, "learning_rate": 2.266033521837529e-08, "loss": 0.109, "num_input_tokens_seen": 10622144, "step": 11065 }, { "epoch": 0.9394093686354379, "grad_norm": 1.067746877670288, "learning_rate": 2.2347874609650596e-08, "loss": 0.0026, "num_input_tokens_seen": 10626880, "step": 11070 }, { "epoch": 0.9398336727766463, "grad_norm": 27.640390396118164, "learning_rate": 2.2037558879336004e-08, "loss": 0.0656, "num_input_tokens_seen": 10632128, "step": 11075 }, { "epoch": 0.9402579769178547, "grad_norm": 4.924507141113281, "learning_rate": 2.1729388708235485e-08, "loss": 0.0192, "num_input_tokens_seen": 10636800, "step": 11080 }, { "epoch": 0.9406822810590632, "grad_norm": 0.15182463824748993, "learning_rate": 2.1423364772445886e-08, "loss": 0.0352, "num_input_tokens_seen": 10641408, "step": 11085 }, { "epoch": 0.9411065852002716, "grad_norm": 5.914322853088379, "learning_rate": 2.111948774335548e-08, "loss": 0.0058, "num_input_tokens_seen": 10646400, "step": 11090 }, { "epoch": 0.94153088934148, "grad_norm": 31.913299560546875, "learning_rate": 2.081775828764254e-08, "loss": 0.1124, "num_input_tokens_seen": 10650816, "step": 11095 }, { "epoch": 0.9419551934826884, "grad_norm": 0.4387105703353882, "learning_rate": 2.0518177067273103e-08, "loss": 0.0133, "num_input_tokens_seen": 10655424, "step": 11100 }, { "epoch": 0.9423794976238968, "grad_norm": 1.6567003726959229, "learning_rate": 2.0220744739501305e-08, "loss": 0.0908, "num_input_tokens_seen": 10660416, "step": 11105 }, { "epoch": 0.9428038017651053, "grad_norm": 6.256274700164795, "learning_rate": 1.992546195686573e-08, "loss": 0.0695, "num_input_tokens_seen": 10665088, "step": 11110 }, { "epoch": 0.9432281059063137, "grad_norm": 0.11509720981121063, "learning_rate": 1.9632329367189725e-08, "loss": 0.0392, "num_input_tokens_seen": 10669632, "step": 11115 }, { "epoch": 0.9436524100475221, "grad_norm": 14.866227149963379, "learning_rate": 1.9341347613579086e-08, "loss": 0.0275, "num_input_tokens_seen": 10674752, "step": 11120 }, { "epoch": 0.9440767141887305, "grad_norm": 0.6725902557373047, "learning_rate": 1.9052517334420704e-08, "loss": 0.0512, "num_input_tokens_seen": 10679296, "step": 11125 }, { "epoch": 0.9445010183299389, "grad_norm": 25.810312271118164, "learning_rate": 1.8765839163381815e-08, "loss": 0.0959, "num_input_tokens_seen": 10683968, "step": 11130 }, { "epoch": 0.9449253224711474, "grad_norm": 0.7116135954856873, "learning_rate": 1.8481313729407645e-08, "loss": 0.0589, "num_input_tokens_seen": 10688512, "step": 11135 }, { "epoch": 0.9453496266123558, "grad_norm": 7.129089832305908, "learning_rate": 1.8198941656720646e-08, "loss": 0.1141, "num_input_tokens_seen": 10693312, "step": 11140 }, { "epoch": 0.9457739307535642, "grad_norm": 28.044822692871094, "learning_rate": 1.7918723564819272e-08, "loss": 0.105, "num_input_tokens_seen": 10698688, "step": 11145 }, { "epoch": 0.9461982348947726, "grad_norm": 14.497418403625488, "learning_rate": 1.7640660068475976e-08, "loss": 0.1199, "num_input_tokens_seen": 10704448, "step": 11150 }, { "epoch": 0.946622539035981, "grad_norm": 0.1703585833311081, "learning_rate": 1.7364751777736332e-08, "loss": 0.0696, "num_input_tokens_seen": 10709312, "step": 11155 }, { "epoch": 0.9470468431771895, "grad_norm": 10.870919227600098, "learning_rate": 1.7090999297917684e-08, "loss": 0.0331, "num_input_tokens_seen": 10714368, "step": 11160 }, { "epoch": 0.9474711473183979, "grad_norm": 61.19886779785156, "learning_rate": 1.6819403229607732e-08, "loss": 0.0659, "num_input_tokens_seen": 10718848, "step": 11165 }, { "epoch": 0.9478954514596063, "grad_norm": 0.1849125176668167, "learning_rate": 1.6549964168663054e-08, "loss": 0.0355, "num_input_tokens_seen": 10723712, "step": 11170 }, { "epoch": 0.9483197556008147, "grad_norm": 7.6935601234436035, "learning_rate": 1.6282682706208028e-08, "loss": 0.0551, "num_input_tokens_seen": 10728384, "step": 11175 }, { "epoch": 0.948744059742023, "grad_norm": 0.25422102212905884, "learning_rate": 1.6017559428633588e-08, "loss": 0.0233, "num_input_tokens_seen": 10733632, "step": 11180 }, { "epoch": 0.9491683638832314, "grad_norm": 0.24908825755119324, "learning_rate": 1.5754594917595564e-08, "loss": 0.0556, "num_input_tokens_seen": 10738880, "step": 11185 }, { "epoch": 0.9495926680244399, "grad_norm": 19.16463851928711, "learning_rate": 1.549378975001403e-08, "loss": 0.0574, "num_input_tokens_seen": 10744384, "step": 11190 }, { "epoch": 0.9500169721656483, "grad_norm": 16.46047592163086, "learning_rate": 1.5235144498071172e-08, "loss": 0.0319, "num_input_tokens_seen": 10749632, "step": 11195 }, { "epoch": 0.9504412763068567, "grad_norm": 17.636735916137695, "learning_rate": 1.4978659729210974e-08, "loss": 0.0301, "num_input_tokens_seen": 10754624, "step": 11200 }, { "epoch": 0.9508655804480651, "grad_norm": 27.047237396240234, "learning_rate": 1.4724336006137094e-08, "loss": 0.0761, "num_input_tokens_seen": 10759488, "step": 11205 }, { "epoch": 0.9512898845892735, "grad_norm": 0.19785869121551514, "learning_rate": 1.4472173886812433e-08, "loss": 0.004, "num_input_tokens_seen": 10764096, "step": 11210 }, { "epoch": 0.9512898845892735, "eval_loss": 0.05228454992175102, "eval_runtime": 15.8433, "eval_samples_per_second": 661.164, "eval_steps_per_second": 82.685, "num_input_tokens_seen": 10764096, "step": 11210 }, { "epoch": 0.951714188730482, "grad_norm": 10.602087020874023, "learning_rate": 1.4222173924457348e-08, "loss": 0.0577, "num_input_tokens_seen": 10768640, "step": 11215 }, { "epoch": 0.9521384928716904, "grad_norm": 0.10275991261005402, "learning_rate": 1.3974336667548659e-08, "loss": 0.0544, "num_input_tokens_seen": 10774016, "step": 11220 }, { "epoch": 0.9525627970128988, "grad_norm": 1.062309741973877, "learning_rate": 1.3728662659818201e-08, "loss": 0.0036, "num_input_tokens_seen": 10779072, "step": 11225 }, { "epoch": 0.9529871011541072, "grad_norm": 18.538475036621094, "learning_rate": 1.3485152440252389e-08, "loss": 0.0487, "num_input_tokens_seen": 10784000, "step": 11230 }, { "epoch": 0.9534114052953157, "grad_norm": 7.267714500427246, "learning_rate": 1.3243806543089875e-08, "loss": 0.0644, "num_input_tokens_seen": 10788928, "step": 11235 }, { "epoch": 0.9538357094365241, "grad_norm": 7.134186267852783, "learning_rate": 1.3004625497821553e-08, "loss": 0.0275, "num_input_tokens_seen": 10794304, "step": 11240 }, { "epoch": 0.9542600135777325, "grad_norm": 9.111855506896973, "learning_rate": 1.276760982918812e-08, "loss": 0.0298, "num_input_tokens_seen": 10798528, "step": 11245 }, { "epoch": 0.9546843177189409, "grad_norm": 0.16917872428894043, "learning_rate": 1.2532760057180291e-08, "loss": 0.0547, "num_input_tokens_seen": 10803200, "step": 11250 }, { "epoch": 0.9551086218601493, "grad_norm": 0.395969033241272, "learning_rate": 1.230007669703681e-08, "loss": 0.0283, "num_input_tokens_seen": 10807680, "step": 11255 }, { "epoch": 0.9555329260013578, "grad_norm": 0.17550839483737946, "learning_rate": 1.2069560259243328e-08, "loss": 0.0351, "num_input_tokens_seen": 10812224, "step": 11260 }, { "epoch": 0.9559572301425662, "grad_norm": 7.8040385246276855, "learning_rate": 1.1841211249531636e-08, "loss": 0.045, "num_input_tokens_seen": 10816640, "step": 11265 }, { "epoch": 0.9563815342837746, "grad_norm": 0.16808491945266724, "learning_rate": 1.1615030168878438e-08, "loss": 0.0182, "num_input_tokens_seen": 10821440, "step": 11270 }, { "epoch": 0.956805838424983, "grad_norm": 30.00806999206543, "learning_rate": 1.139101751350402e-08, "loss": 0.0752, "num_input_tokens_seen": 10826560, "step": 11275 }, { "epoch": 0.9572301425661914, "grad_norm": 21.37195587158203, "learning_rate": 1.1169173774871477e-08, "loss": 0.0729, "num_input_tokens_seen": 10831552, "step": 11280 }, { "epoch": 0.9576544467073999, "grad_norm": 0.5756484270095825, "learning_rate": 1.0949499439685483e-08, "loss": 0.027, "num_input_tokens_seen": 10835968, "step": 11285 }, { "epoch": 0.9580787508486083, "grad_norm": 11.867304801940918, "learning_rate": 1.0731994989891302e-08, "loss": 0.0548, "num_input_tokens_seen": 10840064, "step": 11290 }, { "epoch": 0.9585030549898167, "grad_norm": 7.94635534286499, "learning_rate": 1.0516660902673446e-08, "loss": 0.0353, "num_input_tokens_seen": 10844672, "step": 11295 }, { "epoch": 0.9589273591310251, "grad_norm": 0.13016831874847412, "learning_rate": 1.0303497650455128e-08, "loss": 0.0655, "num_input_tokens_seen": 10850304, "step": 11300 }, { "epoch": 0.9593516632722335, "grad_norm": 0.8015629649162292, "learning_rate": 1.0092505700896703e-08, "loss": 0.0381, "num_input_tokens_seen": 10854912, "step": 11305 }, { "epoch": 0.959775967413442, "grad_norm": 20.08193588256836, "learning_rate": 9.883685516895113e-09, "loss": 0.0385, "num_input_tokens_seen": 10859904, "step": 11310 }, { "epoch": 0.9602002715546504, "grad_norm": 9.255372047424316, "learning_rate": 9.677037556582557e-09, "loss": 0.0542, "num_input_tokens_seen": 10864704, "step": 11315 }, { "epoch": 0.9606245756958588, "grad_norm": 0.4058516025543213, "learning_rate": 9.47256227332538e-09, "loss": 0.0082, "num_input_tokens_seen": 10869248, "step": 11320 }, { "epoch": 0.9610488798370672, "grad_norm": 0.5226470828056335, "learning_rate": 9.270260115723739e-09, "loss": 0.0511, "num_input_tokens_seen": 10873984, "step": 11325 }, { "epoch": 0.9614731839782756, "grad_norm": 0.08098774403333664, "learning_rate": 9.070131527609603e-09, "loss": 0.0736, "num_input_tokens_seen": 10878592, "step": 11330 }, { "epoch": 0.9618974881194841, "grad_norm": 9.366537094116211, "learning_rate": 8.872176948046761e-09, "loss": 0.0627, "num_input_tokens_seen": 10883264, "step": 11335 }, { "epoch": 0.9623217922606925, "grad_norm": 0.4009857475757599, "learning_rate": 8.676396811329145e-09, "loss": 0.0327, "num_input_tokens_seen": 10887552, "step": 11340 }, { "epoch": 0.9627460964019009, "grad_norm": 7.252389907836914, "learning_rate": 8.482791546980506e-09, "loss": 0.0727, "num_input_tokens_seen": 10892800, "step": 11345 }, { "epoch": 0.9631704005431093, "grad_norm": 0.7880826592445374, "learning_rate": 8.291361579752631e-09, "loss": 0.0624, "num_input_tokens_seen": 10897280, "step": 11350 }, { "epoch": 0.9635947046843177, "grad_norm": 9.33914566040039, "learning_rate": 8.102107329625351e-09, "loss": 0.0544, "num_input_tokens_seen": 10902144, "step": 11355 }, { "epoch": 0.9640190088255262, "grad_norm": 15.225980758666992, "learning_rate": 7.91502921180487e-09, "loss": 0.0805, "num_input_tokens_seen": 10907264, "step": 11360 }, { "epoch": 0.9644433129667346, "grad_norm": 0.42005324363708496, "learning_rate": 7.730127636723538e-09, "loss": 0.0146, "num_input_tokens_seen": 10911808, "step": 11365 }, { "epoch": 0.964867617107943, "grad_norm": 18.510848999023438, "learning_rate": 7.547403010037978e-09, "loss": 0.0853, "num_input_tokens_seen": 10916544, "step": 11370 }, { "epoch": 0.9652919212491514, "grad_norm": 23.744672775268555, "learning_rate": 7.366855732629407e-09, "loss": 0.0814, "num_input_tokens_seen": 10921024, "step": 11375 }, { "epoch": 0.9657162253903598, "grad_norm": 7.666995525360107, "learning_rate": 7.1884862006017514e-09, "loss": 0.0551, "num_input_tokens_seen": 10926016, "step": 11380 }, { "epoch": 0.9661405295315683, "grad_norm": 6.401066780090332, "learning_rate": 7.012294805281205e-09, "loss": 0.0426, "num_input_tokens_seen": 10931200, "step": 11385 }, { "epoch": 0.9665648336727767, "grad_norm": 8.19086742401123, "learning_rate": 6.838281933215562e-09, "loss": 0.0681, "num_input_tokens_seen": 10936384, "step": 11390 }, { "epoch": 0.9669891378139851, "grad_norm": 27.355649948120117, "learning_rate": 6.6664479661729944e-09, "loss": 0.0278, "num_input_tokens_seen": 10941184, "step": 11395 }, { "epoch": 0.9674134419551935, "grad_norm": 8.20887565612793, "learning_rate": 6.496793281141055e-09, "loss": 0.0654, "num_input_tokens_seen": 10945408, "step": 11400 }, { "epoch": 0.9678377460964019, "grad_norm": 22.68846893310547, "learning_rate": 6.329318250326898e-09, "loss": 0.0877, "num_input_tokens_seen": 10950144, "step": 11405 }, { "epoch": 0.9682620502376104, "grad_norm": 5.500332832336426, "learning_rate": 6.164023241154837e-09, "loss": 0.1094, "num_input_tokens_seen": 10955264, "step": 11410 }, { "epoch": 0.9686863543788188, "grad_norm": 4.054877758026123, "learning_rate": 6.000908616267009e-09, "loss": 0.0078, "num_input_tokens_seen": 10960256, "step": 11415 }, { "epoch": 0.9691106585200272, "grad_norm": 24.178909301757812, "learning_rate": 5.839974733522046e-09, "loss": 0.0628, "num_input_tokens_seen": 10964672, "step": 11420 }, { "epoch": 0.9695349626612356, "grad_norm": 0.5533793568611145, "learning_rate": 5.68122194599363e-09, "loss": 0.0369, "num_input_tokens_seen": 10969728, "step": 11425 }, { "epoch": 0.969959266802444, "grad_norm": 0.2897357940673828, "learning_rate": 5.5246506019709374e-09, "loss": 0.0914, "num_input_tokens_seen": 10974720, "step": 11430 }, { "epoch": 0.9703835709436525, "grad_norm": 4.2502264976501465, "learning_rate": 5.370261044956969e-09, "loss": 0.0407, "num_input_tokens_seen": 10979008, "step": 11435 }, { "epoch": 0.9708078750848609, "grad_norm": 4.7640604972839355, "learning_rate": 5.218053613668116e-09, "loss": 0.0368, "num_input_tokens_seen": 10984128, "step": 11440 }, { "epoch": 0.9712321792260692, "grad_norm": 1.1905510425567627, "learning_rate": 5.068028642033595e-09, "loss": 0.0629, "num_input_tokens_seen": 10988480, "step": 11445 }, { "epoch": 0.9716564833672776, "grad_norm": 37.74256134033203, "learning_rate": 4.92018645919412e-09, "loss": 0.0341, "num_input_tokens_seen": 10992896, "step": 11450 }, { "epoch": 0.972080787508486, "grad_norm": 24.807628631591797, "learning_rate": 4.774527389501681e-09, "loss": 0.078, "num_input_tokens_seen": 10997696, "step": 11455 }, { "epoch": 0.9725050916496945, "grad_norm": 1.42875337600708, "learning_rate": 4.63105175251921e-09, "loss": 0.0699, "num_input_tokens_seen": 11002944, "step": 11460 }, { "epoch": 0.9729293957909029, "grad_norm": 20.29813003540039, "learning_rate": 4.489759863018583e-09, "loss": 0.085, "num_input_tokens_seen": 11008384, "step": 11465 }, { "epoch": 0.9733536999321113, "grad_norm": 37.211143493652344, "learning_rate": 4.350652030981394e-09, "loss": 0.0997, "num_input_tokens_seen": 11012800, "step": 11470 }, { "epoch": 0.9737780040733197, "grad_norm": 14.482527732849121, "learning_rate": 4.213728561597296e-09, "loss": 0.1097, "num_input_tokens_seen": 11017216, "step": 11475 }, { "epoch": 0.9742023082145281, "grad_norm": 5.938981533050537, "learning_rate": 4.0789897552637735e-09, "loss": 0.0678, "num_input_tokens_seen": 11021504, "step": 11480 }, { "epoch": 0.9746266123557366, "grad_norm": 12.1581392288208, "learning_rate": 3.946435907585255e-09, "loss": 0.0978, "num_input_tokens_seen": 11026816, "step": 11485 }, { "epoch": 0.975050916496945, "grad_norm": 9.135724067687988, "learning_rate": 3.816067309372673e-09, "loss": 0.0679, "num_input_tokens_seen": 11032448, "step": 11490 }, { "epoch": 0.9754752206381534, "grad_norm": 13.876908302307129, "learning_rate": 3.68788424664257e-09, "loss": 0.1102, "num_input_tokens_seen": 11037056, "step": 11495 }, { "epoch": 0.9758995247793618, "grad_norm": 0.29706308245658875, "learning_rate": 3.561887000616881e-09, "loss": 0.0901, "num_input_tokens_seen": 11041152, "step": 11500 }, { "epoch": 0.9763238289205702, "grad_norm": 19.376083374023438, "learning_rate": 3.438075847721933e-09, "loss": 0.0358, "num_input_tokens_seen": 11045504, "step": 11505 }, { "epoch": 0.9767481330617787, "grad_norm": 7.894840240478516, "learning_rate": 3.316451059587777e-09, "loss": 0.0784, "num_input_tokens_seen": 11049792, "step": 11510 }, { "epoch": 0.9771724372029871, "grad_norm": 0.4556736946105957, "learning_rate": 3.1970129030481907e-09, "loss": 0.0353, "num_input_tokens_seen": 11053952, "step": 11515 }, { "epoch": 0.9775967413441955, "grad_norm": 0.27915769815444946, "learning_rate": 3.0797616401392335e-09, "loss": 0.0642, "num_input_tokens_seen": 11058240, "step": 11520 }, { "epoch": 0.9780210454854039, "grad_norm": 1.1327648162841797, "learning_rate": 2.964697528099469e-09, "loss": 0.0538, "num_input_tokens_seen": 11063488, "step": 11525 }, { "epoch": 0.9784453496266123, "grad_norm": 0.9393741488456726, "learning_rate": 2.8518208193689664e-09, "loss": 0.0462, "num_input_tokens_seen": 11068096, "step": 11530 }, { "epoch": 0.9788696537678208, "grad_norm": 16.2606258392334, "learning_rate": 2.741131761588522e-09, "loss": 0.1083, "num_input_tokens_seen": 11073728, "step": 11535 }, { "epoch": 0.9792939579090292, "grad_norm": 8.848075866699219, "learning_rate": 2.632630597600105e-09, "loss": 0.0648, "num_input_tokens_seen": 11078144, "step": 11540 }, { "epoch": 0.9797182620502376, "grad_norm": 5.798458576202393, "learning_rate": 2.526317565444969e-09, "loss": 0.0129, "num_input_tokens_seen": 11083328, "step": 11545 }, { "epoch": 0.980142566191446, "grad_norm": 29.144174575805664, "learning_rate": 2.422192898364095e-09, "loss": 0.089, "num_input_tokens_seen": 11088064, "step": 11550 }, { "epoch": 0.9805668703326544, "grad_norm": 11.775348663330078, "learning_rate": 2.3202568247974175e-09, "loss": 0.0186, "num_input_tokens_seen": 11092736, "step": 11555 }, { "epoch": 0.9809911744738629, "grad_norm": 18.84742546081543, "learning_rate": 2.2205095683833774e-09, "loss": 0.0534, "num_input_tokens_seen": 11098176, "step": 11560 }, { "epoch": 0.9814154786150713, "grad_norm": 35.98866653442383, "learning_rate": 2.122951347958035e-09, "loss": 0.0566, "num_input_tokens_seen": 11102912, "step": 11565 }, { "epoch": 0.9818397827562797, "grad_norm": 7.811520099639893, "learning_rate": 2.0275823775551817e-09, "loss": 0.0898, "num_input_tokens_seen": 11107392, "step": 11570 }, { "epoch": 0.9822640868974881, "grad_norm": 44.668968200683594, "learning_rate": 1.934402866405671e-09, "loss": 0.0508, "num_input_tokens_seen": 11112256, "step": 11575 }, { "epoch": 0.9826883910386965, "grad_norm": 11.887144088745117, "learning_rate": 1.843413018936535e-09, "loss": 0.0302, "num_input_tokens_seen": 11117376, "step": 11580 }, { "epoch": 0.983112695179905, "grad_norm": 16.22102165222168, "learning_rate": 1.7546130347712018e-09, "loss": 0.105, "num_input_tokens_seen": 11122624, "step": 11585 }, { "epoch": 0.9835369993211134, "grad_norm": 1.5749400854110718, "learning_rate": 1.6680031087286106e-09, "loss": 0.0155, "num_input_tokens_seen": 11127680, "step": 11590 }, { "epoch": 0.9839613034623218, "grad_norm": 16.012849807739258, "learning_rate": 1.5835834308228768e-09, "loss": 0.0984, "num_input_tokens_seen": 11132672, "step": 11595 }, { "epoch": 0.9843856076035302, "grad_norm": 23.44938850402832, "learning_rate": 1.5013541862630708e-09, "loss": 0.091, "num_input_tokens_seen": 11137408, "step": 11600 }, { "epoch": 0.9848099117447386, "grad_norm": 28.59117889404297, "learning_rate": 1.4213155554525513e-09, "loss": 0.0188, "num_input_tokens_seen": 11142016, "step": 11605 }, { "epoch": 0.9852342158859471, "grad_norm": 23.170242309570312, "learning_rate": 1.343467713988522e-09, "loss": 0.0406, "num_input_tokens_seen": 11146624, "step": 11610 }, { "epoch": 0.9856585200271555, "grad_norm": 12.793461799621582, "learning_rate": 1.2678108326621418e-09, "loss": 0.135, "num_input_tokens_seen": 11151680, "step": 11615 }, { "epoch": 0.9860828241683639, "grad_norm": 0.7046257257461548, "learning_rate": 1.1943450774574148e-09, "loss": 0.0567, "num_input_tokens_seen": 11156096, "step": 11620 }, { "epoch": 0.9865071283095723, "grad_norm": 10.034231185913086, "learning_rate": 1.1230706095516352e-09, "loss": 0.0577, "num_input_tokens_seen": 11160256, "step": 11625 }, { "epoch": 0.9869314324507807, "grad_norm": 0.14725586771965027, "learning_rate": 1.0539875853142754e-09, "loss": 0.0315, "num_input_tokens_seen": 11165568, "step": 11630 }, { "epoch": 0.9873557365919892, "grad_norm": 6.678691864013672, "learning_rate": 9.8709615630721e-10, "loss": 0.071, "num_input_tokens_seen": 11170432, "step": 11635 }, { "epoch": 0.9877800407331976, "grad_norm": 0.4217716157436371, "learning_rate": 9.22396469283937e-10, "loss": 0.0726, "num_input_tokens_seen": 11175104, "step": 11640 }, { "epoch": 0.988204344874406, "grad_norm": 0.1474541574716568, "learning_rate": 8.598886661895787e-10, "loss": 0.0355, "num_input_tokens_seen": 11179584, "step": 11645 }, { "epoch": 0.9886286490156144, "grad_norm": 22.87831687927246, "learning_rate": 7.995728841605487e-10, "loss": 0.0425, "num_input_tokens_seen": 11184960, "step": 11650 }, { "epoch": 0.9890529531568228, "grad_norm": 3.589052438735962, "learning_rate": 7.41449255524107e-10, "loss": 0.0954, "num_input_tokens_seen": 11190464, "step": 11655 }, { "epoch": 0.9894772572980313, "grad_norm": 9.759099006652832, "learning_rate": 6.855179077981388e-10, "loss": 0.0764, "num_input_tokens_seen": 11195392, "step": 11660 }, { "epoch": 0.9899015614392397, "grad_norm": 0.7287328839302063, "learning_rate": 6.3177896369071e-10, "loss": 0.0392, "num_input_tokens_seen": 11200128, "step": 11665 }, { "epoch": 0.9903258655804481, "grad_norm": 0.5317105054855347, "learning_rate": 5.802325411001785e-10, "loss": 0.0484, "num_input_tokens_seen": 11204800, "step": 11670 }, { "epoch": 0.9907501697216565, "grad_norm": 12.197322845458984, "learning_rate": 5.308787531147496e-10, "loss": 0.0924, "num_input_tokens_seen": 11209472, "step": 11675 }, { "epoch": 0.991174473862865, "grad_norm": 1.3452421426773071, "learning_rate": 4.837177080119214e-10, "loss": 0.0679, "num_input_tokens_seen": 11214016, "step": 11680 }, { "epoch": 0.9915987780040734, "grad_norm": 10.827523231506348, "learning_rate": 4.387495092587068e-10, "loss": 0.072, "num_input_tokens_seen": 11218752, "step": 11685 }, { "epoch": 0.9920230821452818, "grad_norm": 14.037049293518066, "learning_rate": 3.959742555111889e-10, "loss": 0.1148, "num_input_tokens_seen": 11223552, "step": 11690 }, { "epoch": 0.9924473862864902, "grad_norm": 49.733394622802734, "learning_rate": 3.553920406144106e-10, "loss": 0.0671, "num_input_tokens_seen": 11228032, "step": 11695 }, { "epoch": 0.9928716904276986, "grad_norm": 1.3481730222702026, "learning_rate": 3.1700295360181927e-10, "loss": 0.0562, "num_input_tokens_seen": 11232448, "step": 11700 }, { "epoch": 0.993295994568907, "grad_norm": 27.198644638061523, "learning_rate": 2.808070786955996e-10, "loss": 0.0858, "num_input_tokens_seen": 11237440, "step": 11705 }, { "epoch": 0.9937202987101154, "grad_norm": 1.2682602405548096, "learning_rate": 2.4680449530622984e-10, "loss": 0.0601, "num_input_tokens_seen": 11242048, "step": 11710 }, { "epoch": 0.9941446028513238, "grad_norm": 7.826528072357178, "learning_rate": 2.1499527803214846e-10, "loss": 0.086, "num_input_tokens_seen": 11246400, "step": 11715 }, { "epoch": 0.9945689069925322, "grad_norm": 9.909138679504395, "learning_rate": 1.8537949665997642e-10, "loss": 0.0707, "num_input_tokens_seen": 11251456, "step": 11720 }, { "epoch": 0.9949932111337406, "grad_norm": 0.5570287704467773, "learning_rate": 1.5795721616373992e-10, "loss": 0.0287, "num_input_tokens_seen": 11256128, "step": 11725 }, { "epoch": 0.995417515274949, "grad_norm": 13.057730674743652, "learning_rate": 1.3272849670564746e-10, "loss": 0.0433, "num_input_tokens_seen": 11260736, "step": 11730 }, { "epoch": 0.9958418194161575, "grad_norm": 9.216782569885254, "learning_rate": 1.0969339363497975e-10, "loss": 0.0669, "num_input_tokens_seen": 11264896, "step": 11735 }, { "epoch": 0.9962661235573659, "grad_norm": 8.39694595336914, "learning_rate": 8.885195748875584e-11, "loss": 0.0291, "num_input_tokens_seen": 11270400, "step": 11740 }, { "epoch": 0.9966904276985743, "grad_norm": 0.7769232392311096, "learning_rate": 7.020423399117791e-11, "loss": 0.0136, "num_input_tokens_seen": 11275008, "step": 11745 }, { "epoch": 0.9971147318397827, "grad_norm": 0.41244271397590637, "learning_rate": 5.375026405352034e-11, "loss": 0.0776, "num_input_tokens_seen": 11279808, "step": 11750 }, { "epoch": 0.9975390359809911, "grad_norm": 9.232144355773926, "learning_rate": 3.949008377424068e-11, "loss": 0.0341, "num_input_tokens_seen": 11283904, "step": 11755 }, { "epoch": 0.9979633401221996, "grad_norm": 8.324690818786621, "learning_rate": 2.742372443909069e-11, "loss": 0.0601, "num_input_tokens_seen": 11288512, "step": 11760 }, { "epoch": 0.998387644263408, "grad_norm": 14.6475191116333, "learning_rate": 1.7551212520339197e-11, "loss": 0.1003, "num_input_tokens_seen": 11293568, "step": 11765 }, { "epoch": 0.9988119484046164, "grad_norm": 0.8326796889305115, "learning_rate": 9.872569677438213e-12, "loss": 0.1294, "num_input_tokens_seen": 11298752, "step": 11770 }, { "epoch": 0.9992362525458248, "grad_norm": 24.4580020904541, "learning_rate": 4.387812756578846e-12, "loss": 0.04, "num_input_tokens_seen": 11303680, "step": 11775 }, { "epoch": 0.9996605566870332, "grad_norm": 0.4780389368534088, "learning_rate": 1.0969537908023242e-12, "loss": 0.0059, "num_input_tokens_seen": 11308288, "step": 11780 }, { "epoch": 1.0, "num_input_tokens_seen": 11312256, "step": 11784, "total_flos": 6.605086766609203e+16, "train_loss": 0.0722960903008882, "train_runtime": 1744.9763, "train_samples_per_second": 54.023, "train_steps_per_second": 6.753 } ], "logging_steps": 5, "max_steps": 11784, "num_input_tokens_seen": 11312256, "num_train_epochs": 1, "save_steps": 590, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 6.605086766609203e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }