{ "best_global_step": 5685, "best_metric": 0.09698151051998138, "best_model_checkpoint": "saves_bts_preliminary/base/llama-3.2-1b-instruct/train_sst2_42_1779207274/checkpoint-5685", "epoch": 5.0, "eval_steps": 1895, "global_step": 37885, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006598917777484492, "grad_norm": 442.6911926269531, "learning_rate": 2.111375032990235e-09, "loss": 1.4356, "num_input_tokens_seen": 2240, "step": 5 }, { "epoch": 0.0013197835554968984, "grad_norm": 450.9195861816406, "learning_rate": 4.7505938242280285e-09, "loss": 1.5327, "num_input_tokens_seen": 4672, "step": 10 }, { "epoch": 0.0019796753332453477, "grad_norm": 461.6313171386719, "learning_rate": 7.389812615465822e-09, "loss": 1.4513, "num_input_tokens_seen": 7040, "step": 15 }, { "epoch": 0.002639567110993797, "grad_norm": 494.7020568847656, "learning_rate": 1.0029031406703616e-08, "loss": 1.4726, "num_input_tokens_seen": 9600, "step": 20 }, { "epoch": 0.0032994588887422464, "grad_norm": 465.0445251464844, "learning_rate": 1.2668250197941409e-08, "loss": 1.4022, "num_input_tokens_seen": 12160, "step": 25 }, { "epoch": 0.0039593506664906955, "grad_norm": 437.6158447265625, "learning_rate": 1.5307468989179204e-08, "loss": 1.5128, "num_input_tokens_seen": 14528, "step": 30 }, { "epoch": 0.004619242444239145, "grad_norm": 426.5767517089844, "learning_rate": 1.7946687780416997e-08, "loss": 1.3482, "num_input_tokens_seen": 16768, "step": 35 }, { "epoch": 0.005279134221987594, "grad_norm": 475.7981262207031, "learning_rate": 2.058590657165479e-08, "loss": 1.4398, "num_input_tokens_seen": 19264, "step": 40 }, { "epoch": 0.005939025999736044, "grad_norm": 467.2344970703125, "learning_rate": 2.3225125362892583e-08, "loss": 1.4043, "num_input_tokens_seen": 21632, "step": 45 }, { "epoch": 0.006598917777484493, "grad_norm": 387.37567138671875, "learning_rate": 2.5864344154130376e-08, "loss": 1.3473, "num_input_tokens_seen": 24000, "step": 50 }, { "epoch": 0.007258809555232942, "grad_norm": 442.187255859375, "learning_rate": 2.850356294536817e-08, "loss": 1.427, "num_input_tokens_seen": 26496, "step": 55 }, { "epoch": 0.007918701332981391, "grad_norm": 441.4548645019531, "learning_rate": 3.1142781736605966e-08, "loss": 1.4604, "num_input_tokens_seen": 29120, "step": 60 }, { "epoch": 0.008578593110729841, "grad_norm": 411.3653259277344, "learning_rate": 3.378200052784376e-08, "loss": 1.2511, "num_input_tokens_seen": 31744, "step": 65 }, { "epoch": 0.00923848488847829, "grad_norm": 390.2351989746094, "learning_rate": 3.6421219319081546e-08, "loss": 1.2233, "num_input_tokens_seen": 34176, "step": 70 }, { "epoch": 0.009898376666226739, "grad_norm": 400.3909912109375, "learning_rate": 3.9060438110319346e-08, "loss": 1.2987, "num_input_tokens_seen": 36864, "step": 75 }, { "epoch": 0.010558268443975187, "grad_norm": 388.7707824707031, "learning_rate": 4.169965690155713e-08, "loss": 1.1757, "num_input_tokens_seen": 39424, "step": 80 }, { "epoch": 0.011218160221723637, "grad_norm": 396.9801025390625, "learning_rate": 4.433887569279493e-08, "loss": 1.1551, "num_input_tokens_seen": 42112, "step": 85 }, { "epoch": 0.011878051999472087, "grad_norm": 278.70367431640625, "learning_rate": 4.6978094484032725e-08, "loss": 1.0816, "num_input_tokens_seen": 44544, "step": 90 }, { "epoch": 0.012537943777220536, "grad_norm": 273.1611022949219, "learning_rate": 4.961731327527052e-08, "loss": 0.8812, "num_input_tokens_seen": 47104, "step": 95 }, { "epoch": 0.013197835554968985, "grad_norm": 244.677978515625, "learning_rate": 5.225653206650831e-08, "loss": 0.8313, "num_input_tokens_seen": 49664, "step": 100 }, { "epoch": 0.013857727332717434, "grad_norm": 235.11949157714844, "learning_rate": 5.4895750857746105e-08, "loss": 0.7819, "num_input_tokens_seen": 52352, "step": 105 }, { "epoch": 0.014517619110465884, "grad_norm": 250.9341278076172, "learning_rate": 5.75349696489839e-08, "loss": 0.746, "num_input_tokens_seen": 54720, "step": 110 }, { "epoch": 0.015177510888214334, "grad_norm": 217.72210693359375, "learning_rate": 6.01741884402217e-08, "loss": 0.7562, "num_input_tokens_seen": 57152, "step": 115 }, { "epoch": 0.015837402665962782, "grad_norm": 203.7813720703125, "learning_rate": 6.281340723145948e-08, "loss": 0.6604, "num_input_tokens_seen": 59776, "step": 120 }, { "epoch": 0.01649729444371123, "grad_norm": 61.57086181640625, "learning_rate": 6.545262602269728e-08, "loss": 0.4326, "num_input_tokens_seen": 62464, "step": 125 }, { "epoch": 0.017157186221459682, "grad_norm": 43.6219367980957, "learning_rate": 6.809184481393507e-08, "loss": 0.3566, "num_input_tokens_seen": 65088, "step": 130 }, { "epoch": 0.01781707799920813, "grad_norm": 30.025165557861328, "learning_rate": 7.073106360517287e-08, "loss": 0.3399, "num_input_tokens_seen": 67776, "step": 135 }, { "epoch": 0.01847696977695658, "grad_norm": 34.46451950073242, "learning_rate": 7.337028239641066e-08, "loss": 0.3479, "num_input_tokens_seen": 70400, "step": 140 }, { "epoch": 0.01913686155470503, "grad_norm": 75.49683380126953, "learning_rate": 7.600950118764846e-08, "loss": 0.3225, "num_input_tokens_seen": 72704, "step": 145 }, { "epoch": 0.019796753332453478, "grad_norm": 35.20500183105469, "learning_rate": 7.864871997888626e-08, "loss": 0.3282, "num_input_tokens_seen": 75136, "step": 150 }, { "epoch": 0.020456645110201926, "grad_norm": 26.75568962097168, "learning_rate": 8.128793877012403e-08, "loss": 0.3013, "num_input_tokens_seen": 77632, "step": 155 }, { "epoch": 0.021116536887950375, "grad_norm": 19.496416091918945, "learning_rate": 8.392715756136183e-08, "loss": 0.3084, "num_input_tokens_seen": 80320, "step": 160 }, { "epoch": 0.021776428665698826, "grad_norm": 66.57316589355469, "learning_rate": 8.656637635259963e-08, "loss": 0.3253, "num_input_tokens_seen": 82752, "step": 165 }, { "epoch": 0.022436320443447275, "grad_norm": 51.820770263671875, "learning_rate": 8.920559514383743e-08, "loss": 0.2936, "num_input_tokens_seen": 85248, "step": 170 }, { "epoch": 0.023096212221195723, "grad_norm": 64.85391235351562, "learning_rate": 9.184481393507522e-08, "loss": 0.3018, "num_input_tokens_seen": 87872, "step": 175 }, { "epoch": 0.023756103998944175, "grad_norm": 29.103076934814453, "learning_rate": 9.4484032726313e-08, "loss": 0.2907, "num_input_tokens_seen": 90368, "step": 180 }, { "epoch": 0.024415995776692623, "grad_norm": 62.926822662353516, "learning_rate": 9.71232515175508e-08, "loss": 0.2362, "num_input_tokens_seen": 92928, "step": 185 }, { "epoch": 0.02507588755444107, "grad_norm": 40.194679260253906, "learning_rate": 9.976247030878859e-08, "loss": 0.2169, "num_input_tokens_seen": 95296, "step": 190 }, { "epoch": 0.02573577933218952, "grad_norm": 55.39727783203125, "learning_rate": 1.0240168910002639e-07, "loss": 0.204, "num_input_tokens_seen": 97984, "step": 195 }, { "epoch": 0.02639567110993797, "grad_norm": 92.27530670166016, "learning_rate": 1.0504090789126419e-07, "loss": 0.2406, "num_input_tokens_seen": 100352, "step": 200 }, { "epoch": 0.02705556288768642, "grad_norm": 25.1845703125, "learning_rate": 1.0768012668250196e-07, "loss": 0.2504, "num_input_tokens_seen": 102464, "step": 205 }, { "epoch": 0.027715454665434867, "grad_norm": 27.743425369262695, "learning_rate": 1.1031934547373976e-07, "loss": 0.1825, "num_input_tokens_seen": 105088, "step": 210 }, { "epoch": 0.02837534644318332, "grad_norm": 78.66401672363281, "learning_rate": 1.1295856426497756e-07, "loss": 0.203, "num_input_tokens_seen": 107648, "step": 215 }, { "epoch": 0.029035238220931767, "grad_norm": 70.38174438476562, "learning_rate": 1.1559778305621536e-07, "loss": 0.1882, "num_input_tokens_seen": 110144, "step": 220 }, { "epoch": 0.029695129998680216, "grad_norm": 83.74911499023438, "learning_rate": 1.1823700184745315e-07, "loss": 0.202, "num_input_tokens_seen": 112576, "step": 225 }, { "epoch": 0.030355021776428667, "grad_norm": 83.70970153808594, "learning_rate": 1.2087622063869096e-07, "loss": 0.2255, "num_input_tokens_seen": 115264, "step": 230 }, { "epoch": 0.031014913554177116, "grad_norm": 26.91559410095215, "learning_rate": 1.2351543942992873e-07, "loss": 0.1494, "num_input_tokens_seen": 117888, "step": 235 }, { "epoch": 0.031674805331925564, "grad_norm": 57.33741760253906, "learning_rate": 1.2615465822116653e-07, "loss": 0.0743, "num_input_tokens_seen": 120320, "step": 240 }, { "epoch": 0.032334697109674015, "grad_norm": 154.66212463378906, "learning_rate": 1.2879387701240433e-07, "loss": 0.1968, "num_input_tokens_seen": 122688, "step": 245 }, { "epoch": 0.03299458888742246, "grad_norm": 89.07099914550781, "learning_rate": 1.314330958036421e-07, "loss": 0.1802, "num_input_tokens_seen": 125248, "step": 250 }, { "epoch": 0.03365448066517091, "grad_norm": 78.18814849853516, "learning_rate": 1.340723145948799e-07, "loss": 0.149, "num_input_tokens_seen": 127680, "step": 255 }, { "epoch": 0.034314372442919364, "grad_norm": 48.04738235473633, "learning_rate": 1.367115333861177e-07, "loss": 0.0907, "num_input_tokens_seen": 130496, "step": 260 }, { "epoch": 0.03497426422066781, "grad_norm": 81.23184967041016, "learning_rate": 1.393507521773555e-07, "loss": 0.1709, "num_input_tokens_seen": 132992, "step": 265 }, { "epoch": 0.03563415599841626, "grad_norm": 57.03975296020508, "learning_rate": 1.419899709685933e-07, "loss": 0.1453, "num_input_tokens_seen": 135040, "step": 270 }, { "epoch": 0.03629404777616471, "grad_norm": 125.083251953125, "learning_rate": 1.4462918975983108e-07, "loss": 0.1749, "num_input_tokens_seen": 137600, "step": 275 }, { "epoch": 0.03695393955391316, "grad_norm": 95.15032958984375, "learning_rate": 1.4726840855106888e-07, "loss": 0.1575, "num_input_tokens_seen": 139904, "step": 280 }, { "epoch": 0.03761383133166161, "grad_norm": 44.138343811035156, "learning_rate": 1.4990762734230665e-07, "loss": 0.1957, "num_input_tokens_seen": 142016, "step": 285 }, { "epoch": 0.03827372310941006, "grad_norm": 17.540449142456055, "learning_rate": 1.5254684613354445e-07, "loss": 0.0195, "num_input_tokens_seen": 144576, "step": 290 }, { "epoch": 0.038933614887158505, "grad_norm": 85.709228515625, "learning_rate": 1.5518606492478225e-07, "loss": 0.086, "num_input_tokens_seen": 146880, "step": 295 }, { "epoch": 0.039593506664906956, "grad_norm": 155.11170959472656, "learning_rate": 1.5782528371602005e-07, "loss": 0.2891, "num_input_tokens_seen": 149184, "step": 300 }, { "epoch": 0.0402533984426554, "grad_norm": 5.010693073272705, "learning_rate": 1.6046450250725785e-07, "loss": 0.1615, "num_input_tokens_seen": 151296, "step": 305 }, { "epoch": 0.04091329022040385, "grad_norm": 55.62345504760742, "learning_rate": 1.6310372129849565e-07, "loss": 0.2055, "num_input_tokens_seen": 153664, "step": 310 }, { "epoch": 0.041573181998152305, "grad_norm": 31.857683181762695, "learning_rate": 1.6574294008973345e-07, "loss": 0.2078, "num_input_tokens_seen": 156032, "step": 315 }, { "epoch": 0.04223307377590075, "grad_norm": 7.426979064941406, "learning_rate": 1.6838215888097122e-07, "loss": 0.1392, "num_input_tokens_seen": 158528, "step": 320 }, { "epoch": 0.0428929655536492, "grad_norm": 22.063268661499023, "learning_rate": 1.71021377672209e-07, "loss": 0.1113, "num_input_tokens_seen": 160704, "step": 325 }, { "epoch": 0.04355285733139765, "grad_norm": 40.419403076171875, "learning_rate": 1.736605964634468e-07, "loss": 0.178, "num_input_tokens_seen": 163072, "step": 330 }, { "epoch": 0.0442127491091461, "grad_norm": 111.098876953125, "learning_rate": 1.762998152546846e-07, "loss": 0.1961, "num_input_tokens_seen": 165568, "step": 335 }, { "epoch": 0.04487264088689455, "grad_norm": 5.24355936050415, "learning_rate": 1.789390340459224e-07, "loss": 0.2253, "num_input_tokens_seen": 167936, "step": 340 }, { "epoch": 0.045532532664643, "grad_norm": 191.69830322265625, "learning_rate": 1.815782528371602e-07, "loss": 0.174, "num_input_tokens_seen": 170176, "step": 345 }, { "epoch": 0.046192424442391446, "grad_norm": 140.1460418701172, "learning_rate": 1.84217471628398e-07, "loss": 0.1717, "num_input_tokens_seen": 172352, "step": 350 }, { "epoch": 0.0468523162201399, "grad_norm": 72.49575805664062, "learning_rate": 1.8685669041963577e-07, "loss": 0.1506, "num_input_tokens_seen": 175168, "step": 355 }, { "epoch": 0.04751220799788835, "grad_norm": 37.27299499511719, "learning_rate": 1.8949590921087357e-07, "loss": 0.0852, "num_input_tokens_seen": 177856, "step": 360 }, { "epoch": 0.048172099775636794, "grad_norm": 95.40577697753906, "learning_rate": 1.9213512800211137e-07, "loss": 0.2049, "num_input_tokens_seen": 180160, "step": 365 }, { "epoch": 0.048831991553385246, "grad_norm": 67.52294921875, "learning_rate": 1.9477434679334917e-07, "loss": 0.1772, "num_input_tokens_seen": 182784, "step": 370 }, { "epoch": 0.0494918833311337, "grad_norm": 12.928277969360352, "learning_rate": 1.9741356558458697e-07, "loss": 0.0558, "num_input_tokens_seen": 185024, "step": 375 }, { "epoch": 0.05015177510888214, "grad_norm": 96.16371154785156, "learning_rate": 2.0005278437582474e-07, "loss": 0.2301, "num_input_tokens_seen": 187712, "step": 380 }, { "epoch": 0.050811666886630594, "grad_norm": 131.8448028564453, "learning_rate": 2.0269200316706254e-07, "loss": 0.179, "num_input_tokens_seen": 190400, "step": 385 }, { "epoch": 0.05147155866437904, "grad_norm": 83.01555633544922, "learning_rate": 2.0533122195830032e-07, "loss": 0.1703, "num_input_tokens_seen": 193280, "step": 390 }, { "epoch": 0.05213145044212749, "grad_norm": 68.2806625366211, "learning_rate": 2.0797044074953812e-07, "loss": 0.238, "num_input_tokens_seen": 195584, "step": 395 }, { "epoch": 0.05279134221987594, "grad_norm": 8.89856243133545, "learning_rate": 2.1060965954077591e-07, "loss": 0.0807, "num_input_tokens_seen": 198208, "step": 400 }, { "epoch": 0.05345123399762439, "grad_norm": 6.929687023162842, "learning_rate": 2.1324887833201371e-07, "loss": 0.1271, "num_input_tokens_seen": 200704, "step": 405 }, { "epoch": 0.05411112577537284, "grad_norm": 110.80956268310547, "learning_rate": 2.1588809712325151e-07, "loss": 0.1709, "num_input_tokens_seen": 203136, "step": 410 }, { "epoch": 0.05477101755312129, "grad_norm": 12.411187171936035, "learning_rate": 2.1852731591448931e-07, "loss": 0.1371, "num_input_tokens_seen": 205504, "step": 415 }, { "epoch": 0.055430909330869735, "grad_norm": 39.06804656982422, "learning_rate": 2.2116653470572711e-07, "loss": 0.2497, "num_input_tokens_seen": 208000, "step": 420 }, { "epoch": 0.056090801108618187, "grad_norm": 95.63874816894531, "learning_rate": 2.238057534969649e-07, "loss": 0.1593, "num_input_tokens_seen": 210560, "step": 425 }, { "epoch": 0.05675069288636664, "grad_norm": 116.45384216308594, "learning_rate": 2.2644497228820266e-07, "loss": 0.2053, "num_input_tokens_seen": 213056, "step": 430 }, { "epoch": 0.05741058466411508, "grad_norm": 88.33666229248047, "learning_rate": 2.2908419107944046e-07, "loss": 0.1709, "num_input_tokens_seen": 215296, "step": 435 }, { "epoch": 0.058070476441863535, "grad_norm": 1.3807058334350586, "learning_rate": 2.3172340987067826e-07, "loss": 0.0204, "num_input_tokens_seen": 217472, "step": 440 }, { "epoch": 0.058730368219611986, "grad_norm": 77.15796661376953, "learning_rate": 2.3436262866191606e-07, "loss": 0.0669, "num_input_tokens_seen": 219968, "step": 445 }, { "epoch": 0.05939025999736043, "grad_norm": 1.4172821044921875, "learning_rate": 2.3700184745315386e-07, "loss": 0.1046, "num_input_tokens_seen": 222592, "step": 450 }, { "epoch": 0.06005015177510888, "grad_norm": 3.821490526199341, "learning_rate": 2.3964106624439166e-07, "loss": 0.1579, "num_input_tokens_seen": 224768, "step": 455 }, { "epoch": 0.060710043552857335, "grad_norm": 3.3072474002838135, "learning_rate": 2.4228028503562943e-07, "loss": 0.238, "num_input_tokens_seen": 227264, "step": 460 }, { "epoch": 0.06136993533060578, "grad_norm": 169.94236755371094, "learning_rate": 2.4491950382686726e-07, "loss": 0.0656, "num_input_tokens_seen": 229760, "step": 465 }, { "epoch": 0.06202982710835423, "grad_norm": 45.810054779052734, "learning_rate": 2.4755872261810503e-07, "loss": 0.2998, "num_input_tokens_seen": 232000, "step": 470 }, { "epoch": 0.06268971888610268, "grad_norm": 2.529078245162964, "learning_rate": 2.501979414093428e-07, "loss": 0.2337, "num_input_tokens_seen": 234176, "step": 475 }, { "epoch": 0.06334961066385113, "grad_norm": 1.3976417779922485, "learning_rate": 2.528371602005806e-07, "loss": 0.1524, "num_input_tokens_seen": 236736, "step": 480 }, { "epoch": 0.06400950244159957, "grad_norm": 0.11078206449747086, "learning_rate": 2.554763789918184e-07, "loss": 0.0146, "num_input_tokens_seen": 239104, "step": 485 }, { "epoch": 0.06466939421934803, "grad_norm": 178.26333618164062, "learning_rate": 2.581155977830562e-07, "loss": 0.4542, "num_input_tokens_seen": 241600, "step": 490 }, { "epoch": 0.06532928599709648, "grad_norm": 228.8087158203125, "learning_rate": 2.60754816574294e-07, "loss": 0.5945, "num_input_tokens_seen": 244160, "step": 495 }, { "epoch": 0.06598917777484492, "grad_norm": 163.3988494873047, "learning_rate": 2.633940353655318e-07, "loss": 0.3712, "num_input_tokens_seen": 246464, "step": 500 }, { "epoch": 0.06664906955259338, "grad_norm": 1.6546058654785156, "learning_rate": 2.660332541567696e-07, "loss": 0.1548, "num_input_tokens_seen": 248960, "step": 505 }, { "epoch": 0.06730896133034182, "grad_norm": 94.28485870361328, "learning_rate": 2.686724729480074e-07, "loss": 0.1575, "num_input_tokens_seen": 251392, "step": 510 }, { "epoch": 0.06796885310809027, "grad_norm": 78.42328643798828, "learning_rate": 2.7131169173924515e-07, "loss": 0.1289, "num_input_tokens_seen": 253888, "step": 515 }, { "epoch": 0.06862874488583873, "grad_norm": 104.21388244628906, "learning_rate": 2.73950910530483e-07, "loss": 0.1912, "num_input_tokens_seen": 256384, "step": 520 }, { "epoch": 0.06928863666358717, "grad_norm": 99.578369140625, "learning_rate": 2.7659012932172075e-07, "loss": 0.2614, "num_input_tokens_seen": 258496, "step": 525 }, { "epoch": 0.06994852844133562, "grad_norm": 0.4172161817550659, "learning_rate": 2.792293481129586e-07, "loss": 0.1036, "num_input_tokens_seen": 260864, "step": 530 }, { "epoch": 0.07060842021908408, "grad_norm": 98.31641387939453, "learning_rate": 2.8186856690419635e-07, "loss": 0.0465, "num_input_tokens_seen": 263360, "step": 535 }, { "epoch": 0.07126831199683252, "grad_norm": 12.86385440826416, "learning_rate": 2.845077856954342e-07, "loss": 0.097, "num_input_tokens_seen": 266112, "step": 540 }, { "epoch": 0.07192820377458096, "grad_norm": 103.72950744628906, "learning_rate": 2.8714700448667195e-07, "loss": 0.1353, "num_input_tokens_seen": 268544, "step": 545 }, { "epoch": 0.07258809555232942, "grad_norm": 1.9934414625167847, "learning_rate": 2.897862232779097e-07, "loss": 0.2927, "num_input_tokens_seen": 270912, "step": 550 }, { "epoch": 0.07324798733007787, "grad_norm": 89.35478210449219, "learning_rate": 2.9242544206914755e-07, "loss": 0.042, "num_input_tokens_seen": 273664, "step": 555 }, { "epoch": 0.07390787910782631, "grad_norm": 51.353797912597656, "learning_rate": 2.950646608603853e-07, "loss": 0.1713, "num_input_tokens_seen": 276160, "step": 560 }, { "epoch": 0.07456777088557477, "grad_norm": 67.62535095214844, "learning_rate": 2.977038796516231e-07, "loss": 0.1153, "num_input_tokens_seen": 278784, "step": 565 }, { "epoch": 0.07522766266332322, "grad_norm": 61.236839294433594, "learning_rate": 3.0034309844286087e-07, "loss": 0.1525, "num_input_tokens_seen": 281152, "step": 570 }, { "epoch": 0.07588755444107166, "grad_norm": 91.33195495605469, "learning_rate": 3.029823172340987e-07, "loss": 0.2213, "num_input_tokens_seen": 283456, "step": 575 }, { "epoch": 0.07654744621882012, "grad_norm": 77.89016723632812, "learning_rate": 3.0562153602533647e-07, "loss": 0.1509, "num_input_tokens_seen": 286016, "step": 580 }, { "epoch": 0.07720733799656856, "grad_norm": 36.07801818847656, "learning_rate": 3.0826075481657424e-07, "loss": 0.1542, "num_input_tokens_seen": 288448, "step": 585 }, { "epoch": 0.07786722977431701, "grad_norm": 8.650485038757324, "learning_rate": 3.1089997360781207e-07, "loss": 0.136, "num_input_tokens_seen": 290816, "step": 590 }, { "epoch": 0.07852712155206547, "grad_norm": 87.19075775146484, "learning_rate": 3.1353919239904984e-07, "loss": 0.0768, "num_input_tokens_seen": 293504, "step": 595 }, { "epoch": 0.07918701332981391, "grad_norm": 87.9682388305664, "learning_rate": 3.1617841119028767e-07, "loss": 0.1251, "num_input_tokens_seen": 295744, "step": 600 }, { "epoch": 0.07984690510756236, "grad_norm": 22.17746925354004, "learning_rate": 3.1881762998152544e-07, "loss": 0.1154, "num_input_tokens_seen": 298112, "step": 605 }, { "epoch": 0.0805067968853108, "grad_norm": 110.7480239868164, "learning_rate": 3.2145684877276327e-07, "loss": 0.1348, "num_input_tokens_seen": 300608, "step": 610 }, { "epoch": 0.08116668866305926, "grad_norm": 0.6900279521942139, "learning_rate": 3.2409606756400104e-07, "loss": 0.0238, "num_input_tokens_seen": 303360, "step": 615 }, { "epoch": 0.0818265804408077, "grad_norm": 5.3087968826293945, "learning_rate": 3.267352863552388e-07, "loss": 0.1818, "num_input_tokens_seen": 305920, "step": 620 }, { "epoch": 0.08248647221855615, "grad_norm": 3.0559468269348145, "learning_rate": 3.2937450514647664e-07, "loss": 0.1448, "num_input_tokens_seen": 308416, "step": 625 }, { "epoch": 0.08314636399630461, "grad_norm": 2.740339756011963, "learning_rate": 3.320137239377144e-07, "loss": 0.0859, "num_input_tokens_seen": 310848, "step": 630 }, { "epoch": 0.08380625577405305, "grad_norm": 76.87493133544922, "learning_rate": 3.3465294272895224e-07, "loss": 0.1894, "num_input_tokens_seen": 313408, "step": 635 }, { "epoch": 0.0844661475518015, "grad_norm": 5.294236183166504, "learning_rate": 3.3729216152019e-07, "loss": 0.2527, "num_input_tokens_seen": 315904, "step": 640 }, { "epoch": 0.08512603932954996, "grad_norm": 150.8271026611328, "learning_rate": 3.3993138031142784e-07, "loss": 0.2999, "num_input_tokens_seen": 318080, "step": 645 }, { "epoch": 0.0857859311072984, "grad_norm": 68.92874908447266, "learning_rate": 3.425705991026656e-07, "loss": 0.1142, "num_input_tokens_seen": 320256, "step": 650 }, { "epoch": 0.08644582288504685, "grad_norm": 1.1438566446304321, "learning_rate": 3.452098178939034e-07, "loss": 0.0054, "num_input_tokens_seen": 322496, "step": 655 }, { "epoch": 0.0871057146627953, "grad_norm": 144.06700134277344, "learning_rate": 3.478490366851412e-07, "loss": 0.133, "num_input_tokens_seen": 325120, "step": 660 }, { "epoch": 0.08776560644054375, "grad_norm": 199.6498565673828, "learning_rate": 3.50488255476379e-07, "loss": 0.0938, "num_input_tokens_seen": 327296, "step": 665 }, { "epoch": 0.0884254982182922, "grad_norm": 106.60006713867188, "learning_rate": 3.5312747426761676e-07, "loss": 0.492, "num_input_tokens_seen": 329664, "step": 670 }, { "epoch": 0.08908538999604065, "grad_norm": 58.48418426513672, "learning_rate": 3.5576669305885453e-07, "loss": 0.2004, "num_input_tokens_seen": 332416, "step": 675 }, { "epoch": 0.0897452817737891, "grad_norm": 12.844401359558105, "learning_rate": 3.5840591185009236e-07, "loss": 0.0255, "num_input_tokens_seen": 334976, "step": 680 }, { "epoch": 0.09040517355153754, "grad_norm": 13.640639305114746, "learning_rate": 3.6104513064133013e-07, "loss": 0.1744, "num_input_tokens_seen": 337472, "step": 685 }, { "epoch": 0.091065065329286, "grad_norm": 23.306495666503906, "learning_rate": 3.636843494325679e-07, "loss": 0.0702, "num_input_tokens_seen": 339904, "step": 690 }, { "epoch": 0.09172495710703445, "grad_norm": 0.2791350781917572, "learning_rate": 3.6632356822380573e-07, "loss": 0.1571, "num_input_tokens_seen": 342272, "step": 695 }, { "epoch": 0.09238484888478289, "grad_norm": 47.02802276611328, "learning_rate": 3.689627870150435e-07, "loss": 0.0945, "num_input_tokens_seen": 344640, "step": 700 }, { "epoch": 0.09304474066253135, "grad_norm": 0.08177275210618973, "learning_rate": 3.7160200580628133e-07, "loss": 0.1118, "num_input_tokens_seen": 347072, "step": 705 }, { "epoch": 0.0937046324402798, "grad_norm": 0.10263694822788239, "learning_rate": 3.742412245975191e-07, "loss": 0.1896, "num_input_tokens_seen": 349696, "step": 710 }, { "epoch": 0.09436452421802824, "grad_norm": 167.6782684326172, "learning_rate": 3.7688044338875693e-07, "loss": 0.1602, "num_input_tokens_seen": 352256, "step": 715 }, { "epoch": 0.0950244159957767, "grad_norm": 0.09177152812480927, "learning_rate": 3.795196621799947e-07, "loss": 0.2044, "num_input_tokens_seen": 355008, "step": 720 }, { "epoch": 0.09568430777352514, "grad_norm": 79.8562240600586, "learning_rate": 3.821588809712325e-07, "loss": 0.4465, "num_input_tokens_seen": 357376, "step": 725 }, { "epoch": 0.09634419955127359, "grad_norm": 131.0872344970703, "learning_rate": 3.847980997624703e-07, "loss": 0.2807, "num_input_tokens_seen": 359680, "step": 730 }, { "epoch": 0.09700409132902205, "grad_norm": 64.67192840576172, "learning_rate": 3.874373185537081e-07, "loss": 0.0482, "num_input_tokens_seen": 362176, "step": 735 }, { "epoch": 0.09766398310677049, "grad_norm": 1.6336522102355957, "learning_rate": 3.900765373449459e-07, "loss": 0.0361, "num_input_tokens_seen": 365056, "step": 740 }, { "epoch": 0.09832387488451894, "grad_norm": 3.4322988986968994, "learning_rate": 3.927157561361837e-07, "loss": 0.1913, "num_input_tokens_seen": 367488, "step": 745 }, { "epoch": 0.0989837666622674, "grad_norm": 156.1293487548828, "learning_rate": 3.953549749274215e-07, "loss": 0.2768, "num_input_tokens_seen": 369792, "step": 750 }, { "epoch": 0.09964365844001584, "grad_norm": 12.225781440734863, "learning_rate": 3.979941937186593e-07, "loss": 0.0962, "num_input_tokens_seen": 372160, "step": 755 }, { "epoch": 0.10030355021776428, "grad_norm": 66.30107116699219, "learning_rate": 4.0063341250989705e-07, "loss": 0.1613, "num_input_tokens_seen": 374272, "step": 760 }, { "epoch": 0.10096344199551274, "grad_norm": 156.31951904296875, "learning_rate": 4.032726313011349e-07, "loss": 0.1893, "num_input_tokens_seen": 376704, "step": 765 }, { "epoch": 0.10162333377326119, "grad_norm": 221.8079071044922, "learning_rate": 4.0591185009237265e-07, "loss": 0.321, "num_input_tokens_seen": 379136, "step": 770 }, { "epoch": 0.10228322555100963, "grad_norm": 53.741859436035156, "learning_rate": 4.085510688836104e-07, "loss": 0.0868, "num_input_tokens_seen": 381568, "step": 775 }, { "epoch": 0.10294311732875808, "grad_norm": 2.2501022815704346, "learning_rate": 4.111902876748482e-07, "loss": 0.2393, "num_input_tokens_seen": 384128, "step": 780 }, { "epoch": 0.10360300910650654, "grad_norm": 64.6779556274414, "learning_rate": 4.13829506466086e-07, "loss": 0.3035, "num_input_tokens_seen": 386304, "step": 785 }, { "epoch": 0.10426290088425498, "grad_norm": 149.44509887695312, "learning_rate": 4.164687252573238e-07, "loss": 0.2872, "num_input_tokens_seen": 388864, "step": 790 }, { "epoch": 0.10492279266200343, "grad_norm": 40.67619323730469, "learning_rate": 4.1910794404856157e-07, "loss": 0.0753, "num_input_tokens_seen": 391104, "step": 795 }, { "epoch": 0.10558268443975188, "grad_norm": 43.18121337890625, "learning_rate": 4.217471628397994e-07, "loss": 0.1116, "num_input_tokens_seen": 393856, "step": 800 }, { "epoch": 0.10624257621750033, "grad_norm": 51.915523529052734, "learning_rate": 4.2438638163103717e-07, "loss": 0.1934, "num_input_tokens_seen": 396352, "step": 805 }, { "epoch": 0.10690246799524877, "grad_norm": 2.390080451965332, "learning_rate": 4.27025600422275e-07, "loss": 0.1459, "num_input_tokens_seen": 398592, "step": 810 }, { "epoch": 0.10756235977299723, "grad_norm": 87.79183197021484, "learning_rate": 4.2966481921351277e-07, "loss": 0.0887, "num_input_tokens_seen": 401088, "step": 815 }, { "epoch": 0.10822225155074568, "grad_norm": 17.97756004333496, "learning_rate": 4.323040380047506e-07, "loss": 0.0521, "num_input_tokens_seen": 403456, "step": 820 }, { "epoch": 0.10888214332849412, "grad_norm": 30.060583114624023, "learning_rate": 4.3494325679598837e-07, "loss": 0.2846, "num_input_tokens_seen": 405696, "step": 825 }, { "epoch": 0.10954203510624258, "grad_norm": 58.16769790649414, "learning_rate": 4.3758247558722614e-07, "loss": 0.2486, "num_input_tokens_seen": 408128, "step": 830 }, { "epoch": 0.11020192688399102, "grad_norm": 2.298959493637085, "learning_rate": 4.4022169437846397e-07, "loss": 0.0355, "num_input_tokens_seen": 411008, "step": 835 }, { "epoch": 0.11086181866173947, "grad_norm": 4.175866603851318, "learning_rate": 4.4286091316970174e-07, "loss": 0.3422, "num_input_tokens_seen": 413312, "step": 840 }, { "epoch": 0.11152171043948793, "grad_norm": 0.37670063972473145, "learning_rate": 4.4550013196093957e-07, "loss": 0.124, "num_input_tokens_seen": 415616, "step": 845 }, { "epoch": 0.11218160221723637, "grad_norm": 105.30368041992188, "learning_rate": 4.4813935075217734e-07, "loss": 0.1853, "num_input_tokens_seen": 418368, "step": 850 }, { "epoch": 0.11284149399498482, "grad_norm": 3.088155508041382, "learning_rate": 4.5077856954341517e-07, "loss": 0.0833, "num_input_tokens_seen": 420864, "step": 855 }, { "epoch": 0.11350138577273328, "grad_norm": 196.9340057373047, "learning_rate": 4.5341778833465294e-07, "loss": 0.222, "num_input_tokens_seen": 423040, "step": 860 }, { "epoch": 0.11416127755048172, "grad_norm": 122.30973052978516, "learning_rate": 4.560570071258907e-07, "loss": 0.3516, "num_input_tokens_seen": 425344, "step": 865 }, { "epoch": 0.11482116932823017, "grad_norm": 5.073932647705078, "learning_rate": 4.5869622591712854e-07, "loss": 0.1296, "num_input_tokens_seen": 427968, "step": 870 }, { "epoch": 0.11548106110597862, "grad_norm": 181.66859436035156, "learning_rate": 4.613354447083663e-07, "loss": 0.232, "num_input_tokens_seen": 430592, "step": 875 }, { "epoch": 0.11614095288372707, "grad_norm": 82.50604248046875, "learning_rate": 4.639746634996041e-07, "loss": 0.4212, "num_input_tokens_seen": 433152, "step": 880 }, { "epoch": 0.11680084466147551, "grad_norm": 102.42996215820312, "learning_rate": 4.6661388229084186e-07, "loss": 0.0185, "num_input_tokens_seen": 435456, "step": 885 }, { "epoch": 0.11746073643922397, "grad_norm": 161.98501586914062, "learning_rate": 4.692531010820797e-07, "loss": 0.2316, "num_input_tokens_seen": 438080, "step": 890 }, { "epoch": 0.11812062821697242, "grad_norm": 184.88845825195312, "learning_rate": 4.7189231987331746e-07, "loss": 0.2135, "num_input_tokens_seen": 440640, "step": 895 }, { "epoch": 0.11878051999472086, "grad_norm": 3.3446881771087646, "learning_rate": 4.7453153866455523e-07, "loss": 0.0402, "num_input_tokens_seen": 443008, "step": 900 }, { "epoch": 0.11944041177246932, "grad_norm": 59.780128479003906, "learning_rate": 4.771707574557931e-07, "loss": 0.1973, "num_input_tokens_seen": 445312, "step": 905 }, { "epoch": 0.12010030355021777, "grad_norm": 0.7624008059501648, "learning_rate": 4.798099762470308e-07, "loss": 0.105, "num_input_tokens_seen": 447680, "step": 910 }, { "epoch": 0.12076019532796621, "grad_norm": 3.7067677974700928, "learning_rate": 4.824491950382686e-07, "loss": 0.1353, "num_input_tokens_seen": 450368, "step": 915 }, { "epoch": 0.12142008710571467, "grad_norm": 5.35090446472168, "learning_rate": 4.850884138295065e-07, "loss": 0.0864, "num_input_tokens_seen": 452800, "step": 920 }, { "epoch": 0.12207997888346311, "grad_norm": 1.3662978410720825, "learning_rate": 4.877276326207443e-07, "loss": 0.1168, "num_input_tokens_seen": 455744, "step": 925 }, { "epoch": 0.12273987066121156, "grad_norm": 17.259654998779297, "learning_rate": 4.90366851411982e-07, "loss": 0.1899, "num_input_tokens_seen": 458176, "step": 930 }, { "epoch": 0.12339976243896002, "grad_norm": 0.9647508859634399, "learning_rate": 4.930060702032198e-07, "loss": 0.1902, "num_input_tokens_seen": 460480, "step": 935 }, { "epoch": 0.12405965421670846, "grad_norm": 6.764614105224609, "learning_rate": 4.956452889944576e-07, "loss": 0.169, "num_input_tokens_seen": 462656, "step": 940 }, { "epoch": 0.1247195459944569, "grad_norm": 391.0478210449219, "learning_rate": 4.982845077856955e-07, "loss": 0.2648, "num_input_tokens_seen": 465344, "step": 945 }, { "epoch": 0.12537943777220537, "grad_norm": 194.13442993164062, "learning_rate": 5.009237265769331e-07, "loss": 0.3667, "num_input_tokens_seen": 467712, "step": 950 }, { "epoch": 0.1260393295499538, "grad_norm": 101.93359375, "learning_rate": 5.03562945368171e-07, "loss": 0.2885, "num_input_tokens_seen": 470080, "step": 955 }, { "epoch": 0.12669922132770225, "grad_norm": 319.10693359375, "learning_rate": 5.062021641594088e-07, "loss": 0.1084, "num_input_tokens_seen": 472512, "step": 960 }, { "epoch": 0.1273591131054507, "grad_norm": 136.6356658935547, "learning_rate": 5.088413829506465e-07, "loss": 0.1628, "num_input_tokens_seen": 474752, "step": 965 }, { "epoch": 0.12801900488319914, "grad_norm": 256.36505126953125, "learning_rate": 5.114806017418843e-07, "loss": 0.0777, "num_input_tokens_seen": 477440, "step": 970 }, { "epoch": 0.12867889666094762, "grad_norm": 38.02722930908203, "learning_rate": 5.141198205331222e-07, "loss": 0.1169, "num_input_tokens_seen": 480000, "step": 975 }, { "epoch": 0.12933878843869606, "grad_norm": 97.68553924560547, "learning_rate": 5.1675903932436e-07, "loss": 0.2261, "num_input_tokens_seen": 482368, "step": 980 }, { "epoch": 0.1299986802164445, "grad_norm": 0.6980463862419128, "learning_rate": 5.193982581155977e-07, "loss": 0.0063, "num_input_tokens_seen": 484544, "step": 985 }, { "epoch": 0.13065857199419295, "grad_norm": 2.7004430294036865, "learning_rate": 5.220374769068355e-07, "loss": 0.1087, "num_input_tokens_seen": 486720, "step": 990 }, { "epoch": 0.1313184637719414, "grad_norm": 0.10351748764514923, "learning_rate": 5.246766956980734e-07, "loss": 0.0935, "num_input_tokens_seen": 489344, "step": 995 }, { "epoch": 0.13197835554968984, "grad_norm": 3.9759552478790283, "learning_rate": 5.273159144893111e-07, "loss": 0.0785, "num_input_tokens_seen": 491776, "step": 1000 }, { "epoch": 0.1326382473274383, "grad_norm": 405.497802734375, "learning_rate": 5.29955133280549e-07, "loss": 0.1646, "num_input_tokens_seen": 493952, "step": 1005 }, { "epoch": 0.13329813910518676, "grad_norm": 48.91209411621094, "learning_rate": 5.325943520717867e-07, "loss": 0.2601, "num_input_tokens_seen": 496320, "step": 1010 }, { "epoch": 0.1339580308829352, "grad_norm": 435.2791748046875, "learning_rate": 5.352335708630246e-07, "loss": 0.075, "num_input_tokens_seen": 498496, "step": 1015 }, { "epoch": 0.13461792266068365, "grad_norm": 58.02470397949219, "learning_rate": 5.378727896542623e-07, "loss": 0.3792, "num_input_tokens_seen": 501056, "step": 1020 }, { "epoch": 0.1352778144384321, "grad_norm": 19.976274490356445, "learning_rate": 5.405120084455001e-07, "loss": 0.0633, "num_input_tokens_seen": 503296, "step": 1025 }, { "epoch": 0.13593770621618054, "grad_norm": 36.125999450683594, "learning_rate": 5.431512272367379e-07, "loss": 0.2425, "num_input_tokens_seen": 505856, "step": 1030 }, { "epoch": 0.13659759799392898, "grad_norm": 4.3010334968566895, "learning_rate": 5.457904460279758e-07, "loss": 0.2077, "num_input_tokens_seen": 508416, "step": 1035 }, { "epoch": 0.13725748977167745, "grad_norm": 1.1001554727554321, "learning_rate": 5.484296648192135e-07, "loss": 0.1821, "num_input_tokens_seen": 511040, "step": 1040 }, { "epoch": 0.1379173815494259, "grad_norm": 1.0026289224624634, "learning_rate": 5.510688836104512e-07, "loss": 0.0126, "num_input_tokens_seen": 513216, "step": 1045 }, { "epoch": 0.13857727332717434, "grad_norm": 310.48272705078125, "learning_rate": 5.537081024016891e-07, "loss": 0.2747, "num_input_tokens_seen": 515456, "step": 1050 }, { "epoch": 0.1392371651049228, "grad_norm": 0.2722547948360443, "learning_rate": 5.563473211929268e-07, "loss": 0.2289, "num_input_tokens_seen": 517824, "step": 1055 }, { "epoch": 0.13989705688267123, "grad_norm": 9.799527168273926, "learning_rate": 5.589865399841647e-07, "loss": 0.1583, "num_input_tokens_seen": 520320, "step": 1060 }, { "epoch": 0.14055694866041968, "grad_norm": 153.0984344482422, "learning_rate": 5.616257587754024e-07, "loss": 0.0417, "num_input_tokens_seen": 522752, "step": 1065 }, { "epoch": 0.14121684043816815, "grad_norm": 58.452980041503906, "learning_rate": 5.642649775666402e-07, "loss": 0.0679, "num_input_tokens_seen": 525376, "step": 1070 }, { "epoch": 0.1418767322159166, "grad_norm": 1.3527008295059204, "learning_rate": 5.66904196357878e-07, "loss": 0.2223, "num_input_tokens_seen": 527808, "step": 1075 }, { "epoch": 0.14253662399366504, "grad_norm": 0.029463879764080048, "learning_rate": 5.695434151491159e-07, "loss": 0.1876, "num_input_tokens_seen": 530304, "step": 1080 }, { "epoch": 0.14319651577141349, "grad_norm": 30.274728775024414, "learning_rate": 5.721826339403536e-07, "loss": 0.375, "num_input_tokens_seen": 532480, "step": 1085 }, { "epoch": 0.14385640754916193, "grad_norm": 57.1826171875, "learning_rate": 5.748218527315914e-07, "loss": 0.2028, "num_input_tokens_seen": 535168, "step": 1090 }, { "epoch": 0.14451629932691037, "grad_norm": 52.93345260620117, "learning_rate": 5.774610715228292e-07, "loss": 0.3046, "num_input_tokens_seen": 537600, "step": 1095 }, { "epoch": 0.14517619110465885, "grad_norm": 131.98606872558594, "learning_rate": 5.801002903140671e-07, "loss": 0.043, "num_input_tokens_seen": 540096, "step": 1100 }, { "epoch": 0.1458360828824073, "grad_norm": 1.4051055908203125, "learning_rate": 5.827395091053047e-07, "loss": 0.0119, "num_input_tokens_seen": 542592, "step": 1105 }, { "epoch": 0.14649597466015574, "grad_norm": 0.07283133268356323, "learning_rate": 5.853787278965426e-07, "loss": 0.0392, "num_input_tokens_seen": 544960, "step": 1110 }, { "epoch": 0.14715586643790418, "grad_norm": 121.37798309326172, "learning_rate": 5.880179466877804e-07, "loss": 0.0851, "num_input_tokens_seen": 547136, "step": 1115 }, { "epoch": 0.14781575821565263, "grad_norm": 138.19085693359375, "learning_rate": 5.906571654790183e-07, "loss": 0.1365, "num_input_tokens_seen": 549440, "step": 1120 }, { "epoch": 0.14847564999340107, "grad_norm": 78.11005401611328, "learning_rate": 5.932963842702559e-07, "loss": 0.2707, "num_input_tokens_seen": 552064, "step": 1125 }, { "epoch": 0.14913554177114954, "grad_norm": 31.630468368530273, "learning_rate": 5.959356030614938e-07, "loss": 0.0997, "num_input_tokens_seen": 554432, "step": 1130 }, { "epoch": 0.149795433548898, "grad_norm": 3.8265812397003174, "learning_rate": 5.985748218527316e-07, "loss": 0.0028, "num_input_tokens_seen": 556800, "step": 1135 }, { "epoch": 0.15045532532664643, "grad_norm": 1.7065273523330688, "learning_rate": 6.012140406439695e-07, "loss": 0.2855, "num_input_tokens_seen": 559296, "step": 1140 }, { "epoch": 0.15111521710439488, "grad_norm": 39.60039138793945, "learning_rate": 6.038532594352071e-07, "loss": 0.1093, "num_input_tokens_seen": 562112, "step": 1145 }, { "epoch": 0.15177510888214332, "grad_norm": 0.6826571822166443, "learning_rate": 6.064924782264449e-07, "loss": 0.0592, "num_input_tokens_seen": 564288, "step": 1150 }, { "epoch": 0.15243500065989177, "grad_norm": 120.91127014160156, "learning_rate": 6.091316970176828e-07, "loss": 0.188, "num_input_tokens_seen": 566848, "step": 1155 }, { "epoch": 0.15309489243764024, "grad_norm": 98.7718505859375, "learning_rate": 6.117709158089205e-07, "loss": 0.4772, "num_input_tokens_seen": 569152, "step": 1160 }, { "epoch": 0.15375478421538868, "grad_norm": 94.5262222290039, "learning_rate": 6.144101346001583e-07, "loss": 0.0848, "num_input_tokens_seen": 571456, "step": 1165 }, { "epoch": 0.15441467599313713, "grad_norm": 0.14129430055618286, "learning_rate": 6.170493533913961e-07, "loss": 0.2235, "num_input_tokens_seen": 573824, "step": 1170 }, { "epoch": 0.15507456777088557, "grad_norm": 206.35865783691406, "learning_rate": 6.196885721826339e-07, "loss": 0.0517, "num_input_tokens_seen": 576128, "step": 1175 }, { "epoch": 0.15573445954863402, "grad_norm": 0.3648551106452942, "learning_rate": 6.223277909738716e-07, "loss": 0.1252, "num_input_tokens_seen": 578432, "step": 1180 }, { "epoch": 0.15639435132638246, "grad_norm": 0.18024662137031555, "learning_rate": 6.249670097651095e-07, "loss": 0.2063, "num_input_tokens_seen": 580736, "step": 1185 }, { "epoch": 0.15705424310413094, "grad_norm": 9.043709754943848, "learning_rate": 6.276062285563473e-07, "loss": 0.0011, "num_input_tokens_seen": 583040, "step": 1190 }, { "epoch": 0.15771413488187938, "grad_norm": 0.12227041274309158, "learning_rate": 6.302454473475851e-07, "loss": 0.0065, "num_input_tokens_seen": 585344, "step": 1195 }, { "epoch": 0.15837402665962783, "grad_norm": 76.58000183105469, "learning_rate": 6.328846661388228e-07, "loss": 0.2037, "num_input_tokens_seen": 587904, "step": 1200 }, { "epoch": 0.15903391843737627, "grad_norm": 0.9080504179000854, "learning_rate": 6.355238849300607e-07, "loss": 0.2007, "num_input_tokens_seen": 590208, "step": 1205 }, { "epoch": 0.15969381021512472, "grad_norm": 152.57972717285156, "learning_rate": 6.381631037212984e-07, "loss": 0.1552, "num_input_tokens_seen": 592512, "step": 1210 }, { "epoch": 0.16035370199287316, "grad_norm": 0.8822894096374512, "learning_rate": 6.408023225125363e-07, "loss": 0.2204, "num_input_tokens_seen": 595072, "step": 1215 }, { "epoch": 0.1610135937706216, "grad_norm": 294.2984924316406, "learning_rate": 6.43441541303774e-07, "loss": 0.2003, "num_input_tokens_seen": 597632, "step": 1220 }, { "epoch": 0.16167348554837008, "grad_norm": 0.054259005934000015, "learning_rate": 6.460807600950119e-07, "loss": 0.0941, "num_input_tokens_seen": 600192, "step": 1225 }, { "epoch": 0.16233337732611852, "grad_norm": 0.06157355755567551, "learning_rate": 6.487199788862496e-07, "loss": 0.2021, "num_input_tokens_seen": 602304, "step": 1230 }, { "epoch": 0.16299326910386697, "grad_norm": 391.3377990722656, "learning_rate": 6.513591976774875e-07, "loss": 0.2325, "num_input_tokens_seen": 604608, "step": 1235 }, { "epoch": 0.1636531608816154, "grad_norm": 27.769588470458984, "learning_rate": 6.539984164687252e-07, "loss": 0.1003, "num_input_tokens_seen": 606976, "step": 1240 }, { "epoch": 0.16431305265936386, "grad_norm": 0.0755537897348404, "learning_rate": 6.566376352599631e-07, "loss": 0.0153, "num_input_tokens_seen": 609600, "step": 1245 }, { "epoch": 0.1649729444371123, "grad_norm": 177.3814697265625, "learning_rate": 6.592768540512008e-07, "loss": 0.2758, "num_input_tokens_seen": 611840, "step": 1250 }, { "epoch": 0.16563283621486077, "grad_norm": 12.570916175842285, "learning_rate": 6.619160728424386e-07, "loss": 0.1107, "num_input_tokens_seen": 614528, "step": 1255 }, { "epoch": 0.16629272799260922, "grad_norm": 0.11696495860815048, "learning_rate": 6.645552916336764e-07, "loss": 0.0118, "num_input_tokens_seen": 617024, "step": 1260 }, { "epoch": 0.16695261977035766, "grad_norm": 40.369667053222656, "learning_rate": 6.671945104249141e-07, "loss": 0.156, "num_input_tokens_seen": 619392, "step": 1265 }, { "epoch": 0.1676125115481061, "grad_norm": 67.77987670898438, "learning_rate": 6.69833729216152e-07, "loss": 0.179, "num_input_tokens_seen": 621888, "step": 1270 }, { "epoch": 0.16827240332585455, "grad_norm": 166.91651916503906, "learning_rate": 6.724729480073898e-07, "loss": 0.1028, "num_input_tokens_seen": 624192, "step": 1275 }, { "epoch": 0.168932295103603, "grad_norm": 0.7046058177947998, "learning_rate": 6.751121667986275e-07, "loss": 0.0813, "num_input_tokens_seen": 626624, "step": 1280 }, { "epoch": 0.16959218688135147, "grad_norm": 60.36655044555664, "learning_rate": 6.777513855898653e-07, "loss": 0.0655, "num_input_tokens_seen": 628928, "step": 1285 }, { "epoch": 0.17025207865909991, "grad_norm": 0.3693448007106781, "learning_rate": 6.803906043811032e-07, "loss": 0.1142, "num_input_tokens_seen": 631616, "step": 1290 }, { "epoch": 0.17091197043684836, "grad_norm": 0.0590805858373642, "learning_rate": 6.83029823172341e-07, "loss": 0.045, "num_input_tokens_seen": 634112, "step": 1295 }, { "epoch": 0.1715718622145968, "grad_norm": 261.4384460449219, "learning_rate": 6.856690419635787e-07, "loss": 0.2547, "num_input_tokens_seen": 636416, "step": 1300 }, { "epoch": 0.17223175399234525, "grad_norm": 0.21970777213573456, "learning_rate": 6.883082607548165e-07, "loss": 0.0028, "num_input_tokens_seen": 638848, "step": 1305 }, { "epoch": 0.1728916457700937, "grad_norm": 170.5442352294922, "learning_rate": 6.909474795460544e-07, "loss": 0.0734, "num_input_tokens_seen": 641216, "step": 1310 }, { "epoch": 0.17355153754784217, "grad_norm": 0.15614676475524902, "learning_rate": 6.935866983372921e-07, "loss": 0.0042, "num_input_tokens_seen": 644032, "step": 1315 }, { "epoch": 0.1742114293255906, "grad_norm": 0.9323055744171143, "learning_rate": 6.962259171285299e-07, "loss": 0.1632, "num_input_tokens_seen": 646528, "step": 1320 }, { "epoch": 0.17487132110333906, "grad_norm": 0.03342974931001663, "learning_rate": 6.988651359197677e-07, "loss": 0.1357, "num_input_tokens_seen": 649088, "step": 1325 }, { "epoch": 0.1755312128810875, "grad_norm": 0.4350726008415222, "learning_rate": 7.015043547110056e-07, "loss": 0.0358, "num_input_tokens_seen": 651520, "step": 1330 }, { "epoch": 0.17619110465883595, "grad_norm": 151.69903564453125, "learning_rate": 7.041435735022433e-07, "loss": 0.1195, "num_input_tokens_seen": 654016, "step": 1335 }, { "epoch": 0.1768509964365844, "grad_norm": 0.02422684244811535, "learning_rate": 7.067827922934811e-07, "loss": 0.0002, "num_input_tokens_seen": 656320, "step": 1340 }, { "epoch": 0.17751088821433286, "grad_norm": 186.25314331054688, "learning_rate": 7.094220110847189e-07, "loss": 0.2259, "num_input_tokens_seen": 658880, "step": 1345 }, { "epoch": 0.1781707799920813, "grad_norm": 0.054249465465545654, "learning_rate": 7.120612298759568e-07, "loss": 0.0003, "num_input_tokens_seen": 661184, "step": 1350 }, { "epoch": 0.17883067176982975, "grad_norm": 0.8555107116699219, "learning_rate": 7.147004486671945e-07, "loss": 0.0801, "num_input_tokens_seen": 663680, "step": 1355 }, { "epoch": 0.1794905635475782, "grad_norm": 40.881561279296875, "learning_rate": 7.173396674584322e-07, "loss": 0.0336, "num_input_tokens_seen": 666048, "step": 1360 }, { "epoch": 0.18015045532532664, "grad_norm": 196.4837188720703, "learning_rate": 7.199788862496701e-07, "loss": 0.1074, "num_input_tokens_seen": 668480, "step": 1365 }, { "epoch": 0.1808103471030751, "grad_norm": 0.763932466506958, "learning_rate": 7.226181050409078e-07, "loss": 0.2056, "num_input_tokens_seen": 671488, "step": 1370 }, { "epoch": 0.18147023888082353, "grad_norm": 0.08814774453639984, "learning_rate": 7.252573238321457e-07, "loss": 0.1112, "num_input_tokens_seen": 673920, "step": 1375 }, { "epoch": 0.182130130658572, "grad_norm": 116.7082290649414, "learning_rate": 7.278965426233834e-07, "loss": 0.2556, "num_input_tokens_seen": 676416, "step": 1380 }, { "epoch": 0.18279002243632045, "grad_norm": 230.38856506347656, "learning_rate": 7.305357614146212e-07, "loss": 0.1058, "num_input_tokens_seen": 679040, "step": 1385 }, { "epoch": 0.1834499142140689, "grad_norm": 18.40300178527832, "learning_rate": 7.33174980205859e-07, "loss": 0.188, "num_input_tokens_seen": 681920, "step": 1390 }, { "epoch": 0.18410980599181734, "grad_norm": 0.1622646301984787, "learning_rate": 7.358141989970969e-07, "loss": 0.0583, "num_input_tokens_seen": 684416, "step": 1395 }, { "epoch": 0.18476969776956578, "grad_norm": 0.5164310336112976, "learning_rate": 7.384534177883346e-07, "loss": 0.0136, "num_input_tokens_seen": 686976, "step": 1400 }, { "epoch": 0.18542958954731423, "grad_norm": 0.15728312730789185, "learning_rate": 7.410926365795724e-07, "loss": 0.1873, "num_input_tokens_seen": 689408, "step": 1405 }, { "epoch": 0.1860894813250627, "grad_norm": 0.05055977404117584, "learning_rate": 7.437318553708102e-07, "loss": 0.0987, "num_input_tokens_seen": 691712, "step": 1410 }, { "epoch": 0.18674937310281114, "grad_norm": 0.0684177577495575, "learning_rate": 7.463710741620481e-07, "loss": 0.1017, "num_input_tokens_seen": 694080, "step": 1415 }, { "epoch": 0.1874092648805596, "grad_norm": 102.82398986816406, "learning_rate": 7.490102929532857e-07, "loss": 0.1519, "num_input_tokens_seen": 696640, "step": 1420 }, { "epoch": 0.18806915665830803, "grad_norm": 0.09111729264259338, "learning_rate": 7.516495117445236e-07, "loss": 0.0091, "num_input_tokens_seen": 699136, "step": 1425 }, { "epoch": 0.18872904843605648, "grad_norm": 26.446638107299805, "learning_rate": 7.542887305357614e-07, "loss": 0.3917, "num_input_tokens_seen": 701696, "step": 1430 }, { "epoch": 0.18938894021380492, "grad_norm": 66.99994659423828, "learning_rate": 7.569279493269993e-07, "loss": 0.206, "num_input_tokens_seen": 704384, "step": 1435 }, { "epoch": 0.1900488319915534, "grad_norm": 2.427685022354126, "learning_rate": 7.595671681182369e-07, "loss": 0.0719, "num_input_tokens_seen": 706560, "step": 1440 }, { "epoch": 0.19070872376930184, "grad_norm": 1.8941874504089355, "learning_rate": 7.622063869094748e-07, "loss": 0.0804, "num_input_tokens_seen": 709248, "step": 1445 }, { "epoch": 0.19136861554705029, "grad_norm": 45.50901794433594, "learning_rate": 7.648456057007126e-07, "loss": 0.1375, "num_input_tokens_seen": 711296, "step": 1450 }, { "epoch": 0.19202850732479873, "grad_norm": 0.24863462150096893, "learning_rate": 7.674848244919505e-07, "loss": 0.0051, "num_input_tokens_seen": 713728, "step": 1455 }, { "epoch": 0.19268839910254718, "grad_norm": 2.862823009490967, "learning_rate": 7.701240432831881e-07, "loss": 0.0025, "num_input_tokens_seen": 716224, "step": 1460 }, { "epoch": 0.19334829088029562, "grad_norm": 52.276092529296875, "learning_rate": 7.727632620744259e-07, "loss": 0.3884, "num_input_tokens_seen": 718528, "step": 1465 }, { "epoch": 0.1940081826580441, "grad_norm": 0.3433469533920288, "learning_rate": 7.754024808656638e-07, "loss": 0.2433, "num_input_tokens_seen": 720960, "step": 1470 }, { "epoch": 0.19466807443579254, "grad_norm": 0.39280804991722107, "learning_rate": 7.780416996569014e-07, "loss": 0.196, "num_input_tokens_seen": 723200, "step": 1475 }, { "epoch": 0.19532796621354098, "grad_norm": 87.35384368896484, "learning_rate": 7.806809184481393e-07, "loss": 0.1681, "num_input_tokens_seen": 725888, "step": 1480 }, { "epoch": 0.19598785799128943, "grad_norm": 0.5325558185577393, "learning_rate": 7.833201372393771e-07, "loss": 0.0609, "num_input_tokens_seen": 728256, "step": 1485 }, { "epoch": 0.19664774976903787, "grad_norm": 145.96885681152344, "learning_rate": 7.859593560306149e-07, "loss": 0.1787, "num_input_tokens_seen": 730688, "step": 1490 }, { "epoch": 0.19730764154678632, "grad_norm": 2.3841898441314697, "learning_rate": 7.885985748218526e-07, "loss": 0.2156, "num_input_tokens_seen": 732992, "step": 1495 }, { "epoch": 0.1979675333245348, "grad_norm": 0.35310041904449463, "learning_rate": 7.912377936130905e-07, "loss": 0.0585, "num_input_tokens_seen": 735424, "step": 1500 }, { "epoch": 0.19862742510228323, "grad_norm": 125.40451049804688, "learning_rate": 7.938770124043283e-07, "loss": 0.1832, "num_input_tokens_seen": 737664, "step": 1505 }, { "epoch": 0.19928731688003168, "grad_norm": 130.02059936523438, "learning_rate": 7.965162311955661e-07, "loss": 0.2169, "num_input_tokens_seen": 740096, "step": 1510 }, { "epoch": 0.19994720865778012, "grad_norm": 12.761601448059082, "learning_rate": 7.991554499868038e-07, "loss": 0.1121, "num_input_tokens_seen": 742912, "step": 1515 }, { "epoch": 0.20060710043552857, "grad_norm": 1.22785222530365, "learning_rate": 8.017946687780417e-07, "loss": 0.1211, "num_input_tokens_seen": 745536, "step": 1520 }, { "epoch": 0.201266992213277, "grad_norm": 0.4311539828777313, "learning_rate": 8.044338875692794e-07, "loss": 0.0182, "num_input_tokens_seen": 747968, "step": 1525 }, { "epoch": 0.20192688399102549, "grad_norm": 13.155280113220215, "learning_rate": 8.070731063605173e-07, "loss": 0.1159, "num_input_tokens_seen": 750464, "step": 1530 }, { "epoch": 0.20258677576877393, "grad_norm": 0.16525061428546906, "learning_rate": 8.09712325151755e-07, "loss": 0.1005, "num_input_tokens_seen": 752896, "step": 1535 }, { "epoch": 0.20324666754652237, "grad_norm": 116.2685317993164, "learning_rate": 8.123515439429929e-07, "loss": 0.2272, "num_input_tokens_seen": 755648, "step": 1540 }, { "epoch": 0.20390655932427082, "grad_norm": 25.550724029541016, "learning_rate": 8.149907627342306e-07, "loss": 0.2213, "num_input_tokens_seen": 758144, "step": 1545 }, { "epoch": 0.20456645110201926, "grad_norm": 4.4988884925842285, "learning_rate": 8.176299815254685e-07, "loss": 0.0083, "num_input_tokens_seen": 760704, "step": 1550 }, { "epoch": 0.2052263428797677, "grad_norm": 0.9621325135231018, "learning_rate": 8.202692003167062e-07, "loss": 0.0023, "num_input_tokens_seen": 763136, "step": 1555 }, { "epoch": 0.20588623465751615, "grad_norm": 1.8329963684082031, "learning_rate": 8.229084191079441e-07, "loss": 0.1036, "num_input_tokens_seen": 766016, "step": 1560 }, { "epoch": 0.20654612643526463, "grad_norm": 56.67019271850586, "learning_rate": 8.255476378991818e-07, "loss": 0.3823, "num_input_tokens_seen": 768512, "step": 1565 }, { "epoch": 0.20720601821301307, "grad_norm": 0.3594275712966919, "learning_rate": 8.281868566904196e-07, "loss": 0.2189, "num_input_tokens_seen": 770816, "step": 1570 }, { "epoch": 0.20786590999076152, "grad_norm": 361.115966796875, "learning_rate": 8.308260754816574e-07, "loss": 0.2545, "num_input_tokens_seen": 773312, "step": 1575 }, { "epoch": 0.20852580176850996, "grad_norm": 65.93611145019531, "learning_rate": 8.334652942728951e-07, "loss": 0.043, "num_input_tokens_seen": 775680, "step": 1580 }, { "epoch": 0.2091856935462584, "grad_norm": 20.88169288635254, "learning_rate": 8.36104513064133e-07, "loss": 0.1056, "num_input_tokens_seen": 778048, "step": 1585 }, { "epoch": 0.20984558532400685, "grad_norm": 0.9779016971588135, "learning_rate": 8.387437318553708e-07, "loss": 0.2807, "num_input_tokens_seen": 780672, "step": 1590 }, { "epoch": 0.21050547710175532, "grad_norm": 9.102095603942871, "learning_rate": 8.413829506466085e-07, "loss": 0.4118, "num_input_tokens_seen": 783360, "step": 1595 }, { "epoch": 0.21116536887950377, "grad_norm": 83.82833099365234, "learning_rate": 8.440221694378463e-07, "loss": 0.1426, "num_input_tokens_seen": 785856, "step": 1600 }, { "epoch": 0.2118252606572522, "grad_norm": 2.142838716506958, "learning_rate": 8.466613882290842e-07, "loss": 0.1497, "num_input_tokens_seen": 788480, "step": 1605 }, { "epoch": 0.21248515243500066, "grad_norm": 0.20364223420619965, "learning_rate": 8.49300607020322e-07, "loss": 0.1096, "num_input_tokens_seen": 791040, "step": 1610 }, { "epoch": 0.2131450442127491, "grad_norm": 0.12941564619541168, "learning_rate": 8.519398258115597e-07, "loss": 0.0508, "num_input_tokens_seen": 793600, "step": 1615 }, { "epoch": 0.21380493599049755, "grad_norm": 0.15976838767528534, "learning_rate": 8.545790446027975e-07, "loss": 0.2171, "num_input_tokens_seen": 795968, "step": 1620 }, { "epoch": 0.21446482776824602, "grad_norm": 0.6009201407432556, "learning_rate": 8.572182633940354e-07, "loss": 0.0251, "num_input_tokens_seen": 798272, "step": 1625 }, { "epoch": 0.21512471954599446, "grad_norm": 0.09053964912891388, "learning_rate": 8.59857482185273e-07, "loss": 0.1544, "num_input_tokens_seen": 801024, "step": 1630 }, { "epoch": 0.2157846113237429, "grad_norm": 39.99749755859375, "learning_rate": 8.624967009765109e-07, "loss": 0.1345, "num_input_tokens_seen": 803584, "step": 1635 }, { "epoch": 0.21644450310149135, "grad_norm": 3.849316120147705, "learning_rate": 8.651359197677487e-07, "loss": 0.0544, "num_input_tokens_seen": 805952, "step": 1640 }, { "epoch": 0.2171043948792398, "grad_norm": 0.24410617351531982, "learning_rate": 8.677751385589866e-07, "loss": 0.0881, "num_input_tokens_seen": 808512, "step": 1645 }, { "epoch": 0.21776428665698824, "grad_norm": 5.100990295410156, "learning_rate": 8.704143573502242e-07, "loss": 0.0636, "num_input_tokens_seen": 811136, "step": 1650 }, { "epoch": 0.21842417843473672, "grad_norm": 0.057112302631139755, "learning_rate": 8.730535761414621e-07, "loss": 0.0526, "num_input_tokens_seen": 813376, "step": 1655 }, { "epoch": 0.21908407021248516, "grad_norm": 130.31956481933594, "learning_rate": 8.756927949326999e-07, "loss": 0.2304, "num_input_tokens_seen": 815872, "step": 1660 }, { "epoch": 0.2197439619902336, "grad_norm": 0.1521269828081131, "learning_rate": 8.783320137239377e-07, "loss": 0.0414, "num_input_tokens_seen": 818240, "step": 1665 }, { "epoch": 0.22040385376798205, "grad_norm": 0.08848614990711212, "learning_rate": 8.809712325151754e-07, "loss": 0.108, "num_input_tokens_seen": 820736, "step": 1670 }, { "epoch": 0.2210637455457305, "grad_norm": 177.2266082763672, "learning_rate": 8.836104513064132e-07, "loss": 0.3837, "num_input_tokens_seen": 823616, "step": 1675 }, { "epoch": 0.22172363732347894, "grad_norm": 20.548864364624023, "learning_rate": 8.862496700976511e-07, "loss": 0.1827, "num_input_tokens_seen": 826112, "step": 1680 }, { "epoch": 0.2223835291012274, "grad_norm": 0.21072465181350708, "learning_rate": 8.888888888888888e-07, "loss": 0.1026, "num_input_tokens_seen": 828672, "step": 1685 }, { "epoch": 0.22304342087897586, "grad_norm": 0.858121931552887, "learning_rate": 8.915281076801266e-07, "loss": 0.1787, "num_input_tokens_seen": 831296, "step": 1690 }, { "epoch": 0.2237033126567243, "grad_norm": 50.27134704589844, "learning_rate": 8.941673264713644e-07, "loss": 0.0912, "num_input_tokens_seen": 833856, "step": 1695 }, { "epoch": 0.22436320443447275, "grad_norm": 0.13557344675064087, "learning_rate": 8.968065452626022e-07, "loss": 0.2237, "num_input_tokens_seen": 836480, "step": 1700 }, { "epoch": 0.2250230962122212, "grad_norm": 90.39064025878906, "learning_rate": 8.9944576405384e-07, "loss": 0.0627, "num_input_tokens_seen": 838976, "step": 1705 }, { "epoch": 0.22568298798996964, "grad_norm": 0.13472281396389008, "learning_rate": 9.020849828450778e-07, "loss": 0.1252, "num_input_tokens_seen": 841728, "step": 1710 }, { "epoch": 0.22634287976771808, "grad_norm": 0.06779789924621582, "learning_rate": 9.047242016363156e-07, "loss": 0.0499, "num_input_tokens_seen": 843968, "step": 1715 }, { "epoch": 0.22700277154546655, "grad_norm": 0.08471981436014175, "learning_rate": 9.073634204275534e-07, "loss": 0.4636, "num_input_tokens_seen": 846464, "step": 1720 }, { "epoch": 0.227662663323215, "grad_norm": 0.1624394655227661, "learning_rate": 9.100026392187912e-07, "loss": 0.2002, "num_input_tokens_seen": 849088, "step": 1725 }, { "epoch": 0.22832255510096344, "grad_norm": 0.18447798490524292, "learning_rate": 9.12641858010029e-07, "loss": 0.0052, "num_input_tokens_seen": 851712, "step": 1730 }, { "epoch": 0.2289824468787119, "grad_norm": 0.32411372661590576, "learning_rate": 9.152810768012667e-07, "loss": 0.156, "num_input_tokens_seen": 854208, "step": 1735 }, { "epoch": 0.22964233865646033, "grad_norm": 0.3888205885887146, "learning_rate": 9.179202955925046e-07, "loss": 0.0557, "num_input_tokens_seen": 856576, "step": 1740 }, { "epoch": 0.23030223043420878, "grad_norm": 0.337443083524704, "learning_rate": 9.205595143837424e-07, "loss": 0.2487, "num_input_tokens_seen": 859008, "step": 1745 }, { "epoch": 0.23096212221195725, "grad_norm": 20.267051696777344, "learning_rate": 9.231987331749802e-07, "loss": 0.1679, "num_input_tokens_seen": 861440, "step": 1750 }, { "epoch": 0.2316220139897057, "grad_norm": 86.99280548095703, "learning_rate": 9.258379519662179e-07, "loss": 0.1502, "num_input_tokens_seen": 863936, "step": 1755 }, { "epoch": 0.23228190576745414, "grad_norm": 79.0941390991211, "learning_rate": 9.284771707574558e-07, "loss": 0.1127, "num_input_tokens_seen": 866176, "step": 1760 }, { "epoch": 0.23294179754520258, "grad_norm": 0.8542360067367554, "learning_rate": 9.311163895486936e-07, "loss": 0.1574, "num_input_tokens_seen": 868480, "step": 1765 }, { "epoch": 0.23360168932295103, "grad_norm": 0.31946781277656555, "learning_rate": 9.337556083399313e-07, "loss": 0.1025, "num_input_tokens_seen": 870976, "step": 1770 }, { "epoch": 0.23426158110069947, "grad_norm": 2.6998465061187744, "learning_rate": 9.363948271311691e-07, "loss": 0.2237, "num_input_tokens_seen": 873088, "step": 1775 }, { "epoch": 0.23492147287844795, "grad_norm": 25.20713996887207, "learning_rate": 9.390340459224069e-07, "loss": 0.2408, "num_input_tokens_seen": 875520, "step": 1780 }, { "epoch": 0.2355813646561964, "grad_norm": 16.12818145751953, "learning_rate": 9.416732647136448e-07, "loss": 0.0166, "num_input_tokens_seen": 877632, "step": 1785 }, { "epoch": 0.23624125643394484, "grad_norm": 27.490631103515625, "learning_rate": 9.443124835048824e-07, "loss": 0.0702, "num_input_tokens_seen": 880000, "step": 1790 }, { "epoch": 0.23690114821169328, "grad_norm": 236.32696533203125, "learning_rate": 9.469517022961203e-07, "loss": 0.0669, "num_input_tokens_seen": 882176, "step": 1795 }, { "epoch": 0.23756103998944172, "grad_norm": 0.07463784515857697, "learning_rate": 9.495909210873581e-07, "loss": 0.0867, "num_input_tokens_seen": 884800, "step": 1800 }, { "epoch": 0.23822093176719017, "grad_norm": 58.357872009277344, "learning_rate": 9.522301398785959e-07, "loss": 0.0113, "num_input_tokens_seen": 887104, "step": 1805 }, { "epoch": 0.23888082354493864, "grad_norm": 154.2604217529297, "learning_rate": 9.548693586698336e-07, "loss": 0.043, "num_input_tokens_seen": 889408, "step": 1810 }, { "epoch": 0.2395407153226871, "grad_norm": 452.2360534667969, "learning_rate": 9.575085774610714e-07, "loss": 0.2031, "num_input_tokens_seen": 891648, "step": 1815 }, { "epoch": 0.24020060710043553, "grad_norm": 23.80782127380371, "learning_rate": 9.601477962523092e-07, "loss": 0.2171, "num_input_tokens_seen": 894208, "step": 1820 }, { "epoch": 0.24086049887818398, "grad_norm": 23.87192726135254, "learning_rate": 9.627870150435472e-07, "loss": 0.1157, "num_input_tokens_seen": 896704, "step": 1825 }, { "epoch": 0.24152039065593242, "grad_norm": 0.8979841470718384, "learning_rate": 9.65426233834785e-07, "loss": 0.045, "num_input_tokens_seen": 899264, "step": 1830 }, { "epoch": 0.24218028243368087, "grad_norm": 0.11097682267427444, "learning_rate": 9.680654526260227e-07, "loss": 0.0719, "num_input_tokens_seen": 901760, "step": 1835 }, { "epoch": 0.24284017421142934, "grad_norm": 313.09429931640625, "learning_rate": 9.707046714172605e-07, "loss": 0.1597, "num_input_tokens_seen": 903872, "step": 1840 }, { "epoch": 0.24350006598917778, "grad_norm": 0.03155740723013878, "learning_rate": 9.733438902084983e-07, "loss": 0.0005, "num_input_tokens_seen": 906368, "step": 1845 }, { "epoch": 0.24415995776692623, "grad_norm": 0.024505728855729103, "learning_rate": 9.75983108999736e-07, "loss": 0.0012, "num_input_tokens_seen": 908864, "step": 1850 }, { "epoch": 0.24481984954467467, "grad_norm": 0.07788019627332687, "learning_rate": 9.786223277909738e-07, "loss": 0.169, "num_input_tokens_seen": 911040, "step": 1855 }, { "epoch": 0.24547974132242312, "grad_norm": 32.930335998535156, "learning_rate": 9.812615465822116e-07, "loss": 0.2041, "num_input_tokens_seen": 913408, "step": 1860 }, { "epoch": 0.24613963310017156, "grad_norm": 24.17268180847168, "learning_rate": 9.839007653734496e-07, "loss": 0.4034, "num_input_tokens_seen": 915968, "step": 1865 }, { "epoch": 0.24679952487792003, "grad_norm": 106.91326904296875, "learning_rate": 9.865399841646871e-07, "loss": 0.1269, "num_input_tokens_seen": 918528, "step": 1870 }, { "epoch": 0.24745941665566848, "grad_norm": 0.11772750318050385, "learning_rate": 9.89179202955925e-07, "loss": 0.0006, "num_input_tokens_seen": 921152, "step": 1875 }, { "epoch": 0.24811930843341692, "grad_norm": 0.17723694443702698, "learning_rate": 9.918184217471629e-07, "loss": 0.0005, "num_input_tokens_seen": 923520, "step": 1880 }, { "epoch": 0.24877920021116537, "grad_norm": 0.6994357109069824, "learning_rate": 9.944576405384004e-07, "loss": 0.1003, "num_input_tokens_seen": 925888, "step": 1885 }, { "epoch": 0.2494390919889138, "grad_norm": 21.57866096496582, "learning_rate": 9.970968593296384e-07, "loss": 0.1144, "num_input_tokens_seen": 928704, "step": 1890 }, { "epoch": 0.2500989837666623, "grad_norm": 130.3249053955078, "learning_rate": 9.997360781208762e-07, "loss": 0.4074, "num_input_tokens_seen": 930944, "step": 1895 }, { "epoch": 0.2500989837666623, "eval_loss": 0.15521390736103058, "eval_runtime": 7.8585, "eval_samples_per_second": 857.037, "eval_steps_per_second": 107.145, "num_input_tokens_seen": 930944, "step": 1895 }, { "epoch": 0.25075887554441073, "grad_norm": 397.0398254394531, "learning_rate": 1.002375296912114e-06, "loss": 0.2799, "num_input_tokens_seen": 933376, "step": 1900 }, { "epoch": 0.2514187673221592, "grad_norm": 0.684500515460968, "learning_rate": 1.0050145157033517e-06, "loss": 0.151, "num_input_tokens_seen": 936000, "step": 1905 }, { "epoch": 0.2520786590999076, "grad_norm": 65.4966049194336, "learning_rate": 1.0076537344945895e-06, "loss": 0.226, "num_input_tokens_seen": 938432, "step": 1910 }, { "epoch": 0.25273855087765607, "grad_norm": 58.49400329589844, "learning_rate": 1.0102929532858273e-06, "loss": 0.1408, "num_input_tokens_seen": 941312, "step": 1915 }, { "epoch": 0.2533984426554045, "grad_norm": 59.23992919921875, "learning_rate": 1.012932172077065e-06, "loss": 0.0428, "num_input_tokens_seen": 943488, "step": 1920 }, { "epoch": 0.25405833443315295, "grad_norm": 57.335479736328125, "learning_rate": 1.015571390868303e-06, "loss": 0.1021, "num_input_tokens_seen": 945856, "step": 1925 }, { "epoch": 0.2547182262109014, "grad_norm": 10.350142478942871, "learning_rate": 1.0182106096595406e-06, "loss": 0.1363, "num_input_tokens_seen": 948352, "step": 1930 }, { "epoch": 0.25537811798864984, "grad_norm": 0.7066463828086853, "learning_rate": 1.0208498284507786e-06, "loss": 0.0973, "num_input_tokens_seen": 950976, "step": 1935 }, { "epoch": 0.2560380097663983, "grad_norm": 316.34130859375, "learning_rate": 1.0234890472420164e-06, "loss": 0.1607, "num_input_tokens_seen": 953216, "step": 1940 }, { "epoch": 0.25669790154414673, "grad_norm": 2.6355180740356445, "learning_rate": 1.0261282660332541e-06, "loss": 0.015, "num_input_tokens_seen": 955648, "step": 1945 }, { "epoch": 0.25735779332189523, "grad_norm": 0.5929775238037109, "learning_rate": 1.028767484824492e-06, "loss": 0.0037, "num_input_tokens_seen": 957888, "step": 1950 }, { "epoch": 0.2580176850996437, "grad_norm": 29.690797805786133, "learning_rate": 1.0314067036157297e-06, "loss": 0.1865, "num_input_tokens_seen": 960128, "step": 1955 }, { "epoch": 0.2586775768773921, "grad_norm": 0.16472673416137695, "learning_rate": 1.0340459224069675e-06, "loss": 0.0348, "num_input_tokens_seen": 962496, "step": 1960 }, { "epoch": 0.25933746865514057, "grad_norm": 22.16752052307129, "learning_rate": 1.0366851411982054e-06, "loss": 0.3992, "num_input_tokens_seen": 965120, "step": 1965 }, { "epoch": 0.259997360432889, "grad_norm": 0.6555209159851074, "learning_rate": 1.039324359989443e-06, "loss": 0.007, "num_input_tokens_seen": 967616, "step": 1970 }, { "epoch": 0.26065725221063746, "grad_norm": 0.059777699410915375, "learning_rate": 1.0419635787806808e-06, "loss": 0.1056, "num_input_tokens_seen": 970240, "step": 1975 }, { "epoch": 0.2613171439883859, "grad_norm": 20.016475677490234, "learning_rate": 1.0446027975719188e-06, "loss": 0.2734, "num_input_tokens_seen": 972544, "step": 1980 }, { "epoch": 0.26197703576613435, "grad_norm": 0.36860647797584534, "learning_rate": 1.0472420163631565e-06, "loss": 0.2381, "num_input_tokens_seen": 974912, "step": 1985 }, { "epoch": 0.2626369275438828, "grad_norm": 20.66069793701172, "learning_rate": 1.049881235154394e-06, "loss": 0.4659, "num_input_tokens_seen": 977088, "step": 1990 }, { "epoch": 0.26329681932163124, "grad_norm": 27.569839477539062, "learning_rate": 1.052520453945632e-06, "loss": 0.241, "num_input_tokens_seen": 979648, "step": 1995 }, { "epoch": 0.2639567110993797, "grad_norm": 25.123470306396484, "learning_rate": 1.0551596727368699e-06, "loss": 0.0961, "num_input_tokens_seen": 982336, "step": 2000 }, { "epoch": 0.2646166028771281, "grad_norm": 10.748541831970215, "learning_rate": 1.0577988915281074e-06, "loss": 0.0121, "num_input_tokens_seen": 984768, "step": 2005 }, { "epoch": 0.2652764946548766, "grad_norm": 0.4387301206588745, "learning_rate": 1.0604381103193454e-06, "loss": 0.0467, "num_input_tokens_seen": 987136, "step": 2010 }, { "epoch": 0.26593638643262507, "grad_norm": 0.04643954709172249, "learning_rate": 1.0630773291105832e-06, "loss": 0.142, "num_input_tokens_seen": 989824, "step": 2015 }, { "epoch": 0.2665962782103735, "grad_norm": 24.511646270751953, "learning_rate": 1.0657165479018212e-06, "loss": 0.2781, "num_input_tokens_seen": 992064, "step": 2020 }, { "epoch": 0.26725616998812196, "grad_norm": 35.56803512573242, "learning_rate": 1.0683557666930587e-06, "loss": 0.2715, "num_input_tokens_seen": 994368, "step": 2025 }, { "epoch": 0.2679160617658704, "grad_norm": 19.118850708007812, "learning_rate": 1.0709949854842965e-06, "loss": 0.1645, "num_input_tokens_seen": 996864, "step": 2030 }, { "epoch": 0.26857595354361885, "grad_norm": 0.19591161608695984, "learning_rate": 1.0736342042755345e-06, "loss": 0.1714, "num_input_tokens_seen": 999360, "step": 2035 }, { "epoch": 0.2692358453213673, "grad_norm": 0.8063420653343201, "learning_rate": 1.0762734230667723e-06, "loss": 0.0919, "num_input_tokens_seen": 1001920, "step": 2040 }, { "epoch": 0.26989573709911574, "grad_norm": 0.3764183819293976, "learning_rate": 1.0789126418580098e-06, "loss": 0.0063, "num_input_tokens_seen": 1004224, "step": 2045 }, { "epoch": 0.2705556288768642, "grad_norm": 0.18820199370384216, "learning_rate": 1.0815518606492478e-06, "loss": 0.0012, "num_input_tokens_seen": 1006528, "step": 2050 }, { "epoch": 0.27121552065461263, "grad_norm": 1.6984174251556396, "learning_rate": 1.0841910794404856e-06, "loss": 0.2114, "num_input_tokens_seen": 1008896, "step": 2055 }, { "epoch": 0.2718754124323611, "grad_norm": 0.23580117523670197, "learning_rate": 1.0868302982317234e-06, "loss": 0.1478, "num_input_tokens_seen": 1011648, "step": 2060 }, { "epoch": 0.2725353042101095, "grad_norm": 33.00039291381836, "learning_rate": 1.0894695170229611e-06, "loss": 0.0509, "num_input_tokens_seen": 1014208, "step": 2065 }, { "epoch": 0.27319519598785796, "grad_norm": 0.04923771321773529, "learning_rate": 1.092108735814199e-06, "loss": 0.105, "num_input_tokens_seen": 1016640, "step": 2070 }, { "epoch": 0.27385508776560646, "grad_norm": 0.052733778953552246, "learning_rate": 1.0947479546054369e-06, "loss": 0.0663, "num_input_tokens_seen": 1019328, "step": 2075 }, { "epoch": 0.2745149795433549, "grad_norm": 0.047354552894830704, "learning_rate": 1.0973871733966747e-06, "loss": 0.0865, "num_input_tokens_seen": 1021696, "step": 2080 }, { "epoch": 0.27517487132110335, "grad_norm": 97.71429443359375, "learning_rate": 1.1000263921879122e-06, "loss": 0.2014, "num_input_tokens_seen": 1024256, "step": 2085 }, { "epoch": 0.2758347630988518, "grad_norm": 33.12277603149414, "learning_rate": 1.1026656109791502e-06, "loss": 0.1935, "num_input_tokens_seen": 1026560, "step": 2090 }, { "epoch": 0.27649465487660024, "grad_norm": 29.502172470092773, "learning_rate": 1.105304829770388e-06, "loss": 0.2346, "num_input_tokens_seen": 1029184, "step": 2095 }, { "epoch": 0.2771545466543487, "grad_norm": 10.247140884399414, "learning_rate": 1.1079440485616255e-06, "loss": 0.0947, "num_input_tokens_seen": 1031744, "step": 2100 }, { "epoch": 0.27781443843209713, "grad_norm": 1.695241093635559, "learning_rate": 1.1105832673528635e-06, "loss": 0.1724, "num_input_tokens_seen": 1034048, "step": 2105 }, { "epoch": 0.2784743302098456, "grad_norm": 0.18251433968544006, "learning_rate": 1.1132224861441013e-06, "loss": 0.0564, "num_input_tokens_seen": 1036544, "step": 2110 }, { "epoch": 0.279134221987594, "grad_norm": 0.03173065558075905, "learning_rate": 1.115861704935339e-06, "loss": 0.2777, "num_input_tokens_seen": 1038848, "step": 2115 }, { "epoch": 0.27979411376534247, "grad_norm": 0.03673817217350006, "learning_rate": 1.1185009237265768e-06, "loss": 0.1166, "num_input_tokens_seen": 1041152, "step": 2120 }, { "epoch": 0.2804540055430909, "grad_norm": 18.983776092529297, "learning_rate": 1.1211401425178146e-06, "loss": 0.1476, "num_input_tokens_seen": 1043968, "step": 2125 }, { "epoch": 0.28111389732083936, "grad_norm": 107.69248962402344, "learning_rate": 1.1237793613090524e-06, "loss": 0.2938, "num_input_tokens_seen": 1046272, "step": 2130 }, { "epoch": 0.28177378909858786, "grad_norm": 0.15629757940769196, "learning_rate": 1.1264185801002904e-06, "loss": 0.1254, "num_input_tokens_seen": 1048448, "step": 2135 }, { "epoch": 0.2824336808763363, "grad_norm": 36.44291687011719, "learning_rate": 1.129057798891528e-06, "loss": 0.2135, "num_input_tokens_seen": 1051072, "step": 2140 }, { "epoch": 0.28309357265408475, "grad_norm": 0.4880758225917816, "learning_rate": 1.131697017682766e-06, "loss": 0.0968, "num_input_tokens_seen": 1053312, "step": 2145 }, { "epoch": 0.2837534644318332, "grad_norm": 0.0785878598690033, "learning_rate": 1.1343362364740037e-06, "loss": 0.1169, "num_input_tokens_seen": 1055680, "step": 2150 }, { "epoch": 0.28441335620958164, "grad_norm": 0.1919853389263153, "learning_rate": 1.1369754552652415e-06, "loss": 0.2016, "num_input_tokens_seen": 1057984, "step": 2155 }, { "epoch": 0.2850732479873301, "grad_norm": 0.431120365858078, "learning_rate": 1.1396146740564792e-06, "loss": 0.1111, "num_input_tokens_seen": 1060736, "step": 2160 }, { "epoch": 0.2857331397650785, "grad_norm": 1.5402590036392212, "learning_rate": 1.142253892847717e-06, "loss": 0.1279, "num_input_tokens_seen": 1063424, "step": 2165 }, { "epoch": 0.28639303154282697, "grad_norm": 0.02886631153523922, "learning_rate": 1.1448931116389548e-06, "loss": 0.0038, "num_input_tokens_seen": 1065728, "step": 2170 }, { "epoch": 0.2870529233205754, "grad_norm": 0.049261245876550674, "learning_rate": 1.1475323304301928e-06, "loss": 0.0007, "num_input_tokens_seen": 1068160, "step": 2175 }, { "epoch": 0.28771281509832386, "grad_norm": 0.04944710433483124, "learning_rate": 1.1501715492214303e-06, "loss": 0.2872, "num_input_tokens_seen": 1070592, "step": 2180 }, { "epoch": 0.2883727068760723, "grad_norm": 19.942960739135742, "learning_rate": 1.1528107680126681e-06, "loss": 0.0978, "num_input_tokens_seen": 1073280, "step": 2185 }, { "epoch": 0.28903259865382075, "grad_norm": 1.5230175256729126, "learning_rate": 1.155449986803906e-06, "loss": 0.1237, "num_input_tokens_seen": 1075584, "step": 2190 }, { "epoch": 0.28969249043156925, "grad_norm": 115.89716339111328, "learning_rate": 1.1580892055951439e-06, "loss": 0.1684, "num_input_tokens_seen": 1078016, "step": 2195 }, { "epoch": 0.2903523822093177, "grad_norm": 1.9137988090515137, "learning_rate": 1.1607284243863814e-06, "loss": 0.1865, "num_input_tokens_seen": 1080512, "step": 2200 }, { "epoch": 0.29101227398706614, "grad_norm": 18.28475570678711, "learning_rate": 1.1633676431776194e-06, "loss": 0.169, "num_input_tokens_seen": 1082752, "step": 2205 }, { "epoch": 0.2916721657648146, "grad_norm": 27.2545166015625, "learning_rate": 1.1660068619688572e-06, "loss": 0.3035, "num_input_tokens_seen": 1085184, "step": 2210 }, { "epoch": 0.29233205754256303, "grad_norm": 49.74034118652344, "learning_rate": 1.1686460807600947e-06, "loss": 0.212, "num_input_tokens_seen": 1087360, "step": 2215 }, { "epoch": 0.2929919493203115, "grad_norm": 35.755130767822266, "learning_rate": 1.1712852995513327e-06, "loss": 0.0484, "num_input_tokens_seen": 1089984, "step": 2220 }, { "epoch": 0.2936518410980599, "grad_norm": 0.13340047001838684, "learning_rate": 1.1739245183425705e-06, "loss": 0.0607, "num_input_tokens_seen": 1092608, "step": 2225 }, { "epoch": 0.29431173287580836, "grad_norm": 0.050269801169633865, "learning_rate": 1.1765637371338085e-06, "loss": 0.101, "num_input_tokens_seen": 1095040, "step": 2230 }, { "epoch": 0.2949716246535568, "grad_norm": 16.144840240478516, "learning_rate": 1.179202955925046e-06, "loss": 0.1061, "num_input_tokens_seen": 1097728, "step": 2235 }, { "epoch": 0.29563151643130525, "grad_norm": 300.6488952636719, "learning_rate": 1.1818421747162838e-06, "loss": 0.2662, "num_input_tokens_seen": 1100096, "step": 2240 }, { "epoch": 0.2962914082090537, "grad_norm": 173.1255340576172, "learning_rate": 1.1844813935075218e-06, "loss": 0.1855, "num_input_tokens_seen": 1102400, "step": 2245 }, { "epoch": 0.29695129998680214, "grad_norm": 0.08108215034008026, "learning_rate": 1.1871206122987596e-06, "loss": 0.0741, "num_input_tokens_seen": 1104960, "step": 2250 }, { "epoch": 0.2976111917645506, "grad_norm": 15.926095008850098, "learning_rate": 1.1897598310899971e-06, "loss": 0.2804, "num_input_tokens_seen": 1107520, "step": 2255 }, { "epoch": 0.2982710835422991, "grad_norm": 36.089237213134766, "learning_rate": 1.1923990498812351e-06, "loss": 0.2522, "num_input_tokens_seen": 1109952, "step": 2260 }, { "epoch": 0.29893097532004753, "grad_norm": 0.7445940971374512, "learning_rate": 1.195038268672473e-06, "loss": 0.0116, "num_input_tokens_seen": 1112128, "step": 2265 }, { "epoch": 0.299590867097796, "grad_norm": 0.1918533444404602, "learning_rate": 1.1976774874637107e-06, "loss": 0.0716, "num_input_tokens_seen": 1114688, "step": 2270 }, { "epoch": 0.3002507588755444, "grad_norm": 0.18674559891223907, "learning_rate": 1.2003167062549485e-06, "loss": 0.2035, "num_input_tokens_seen": 1117248, "step": 2275 }, { "epoch": 0.30091065065329287, "grad_norm": 101.24190521240234, "learning_rate": 1.2029559250461862e-06, "loss": 0.1032, "num_input_tokens_seen": 1119808, "step": 2280 }, { "epoch": 0.3015705424310413, "grad_norm": 376.71929931640625, "learning_rate": 1.2055951438374242e-06, "loss": 0.4005, "num_input_tokens_seen": 1122112, "step": 2285 }, { "epoch": 0.30223043420878976, "grad_norm": 1.0818003416061401, "learning_rate": 1.208234362628662e-06, "loss": 0.1585, "num_input_tokens_seen": 1124544, "step": 2290 }, { "epoch": 0.3028903259865382, "grad_norm": 0.4159611165523529, "learning_rate": 1.2108735814198995e-06, "loss": 0.1725, "num_input_tokens_seen": 1127104, "step": 2295 }, { "epoch": 0.30355021776428665, "grad_norm": 13.03470230102539, "learning_rate": 1.2135128002111375e-06, "loss": 0.1268, "num_input_tokens_seen": 1129536, "step": 2300 }, { "epoch": 0.3042101095420351, "grad_norm": 0.2850092053413391, "learning_rate": 1.2161520190023753e-06, "loss": 0.0557, "num_input_tokens_seen": 1131840, "step": 2305 }, { "epoch": 0.30487000131978353, "grad_norm": 2.2885732650756836, "learning_rate": 1.2187912377936129e-06, "loss": 0.128, "num_input_tokens_seen": 1134336, "step": 2310 }, { "epoch": 0.305529893097532, "grad_norm": 54.30757141113281, "learning_rate": 1.2214304565848509e-06, "loss": 0.3577, "num_input_tokens_seen": 1136640, "step": 2315 }, { "epoch": 0.3061897848752805, "grad_norm": 1.3606059551239014, "learning_rate": 1.2240696753760886e-06, "loss": 0.1164, "num_input_tokens_seen": 1139136, "step": 2320 }, { "epoch": 0.3068496766530289, "grad_norm": 0.1324460804462433, "learning_rate": 1.2267088941673264e-06, "loss": 0.0429, "num_input_tokens_seen": 1141568, "step": 2325 }, { "epoch": 0.30750956843077737, "grad_norm": 0.09382675588130951, "learning_rate": 1.2293481129585642e-06, "loss": 0.0024, "num_input_tokens_seen": 1143936, "step": 2330 }, { "epoch": 0.3081694602085258, "grad_norm": 76.82371520996094, "learning_rate": 1.231987331749802e-06, "loss": 0.0776, "num_input_tokens_seen": 1146624, "step": 2335 }, { "epoch": 0.30882935198627426, "grad_norm": 0.43480339646339417, "learning_rate": 1.2346265505410397e-06, "loss": 0.0591, "num_input_tokens_seen": 1149120, "step": 2340 }, { "epoch": 0.3094892437640227, "grad_norm": 0.028940560296177864, "learning_rate": 1.2372657693322777e-06, "loss": 0.1412, "num_input_tokens_seen": 1151488, "step": 2345 }, { "epoch": 0.31014913554177115, "grad_norm": 0.2386150062084198, "learning_rate": 1.2399049881235153e-06, "loss": 0.0975, "num_input_tokens_seen": 1153600, "step": 2350 }, { "epoch": 0.3108090273195196, "grad_norm": 0.01438100729137659, "learning_rate": 1.2425442069147532e-06, "loss": 0.1037, "num_input_tokens_seen": 1156224, "step": 2355 }, { "epoch": 0.31146891909726804, "grad_norm": 0.034385617822408676, "learning_rate": 1.245183425705991e-06, "loss": 0.1141, "num_input_tokens_seen": 1158656, "step": 2360 }, { "epoch": 0.3121288108750165, "grad_norm": 0.020433984696865082, "learning_rate": 1.2478226444972288e-06, "loss": 0.1503, "num_input_tokens_seen": 1161408, "step": 2365 }, { "epoch": 0.3127887026527649, "grad_norm": 0.3348312973976135, "learning_rate": 1.2504618632884666e-06, "loss": 0.0871, "num_input_tokens_seen": 1163904, "step": 2370 }, { "epoch": 0.31344859443051337, "grad_norm": 0.22757618129253387, "learning_rate": 1.2531010820797043e-06, "loss": 0.3747, "num_input_tokens_seen": 1166208, "step": 2375 }, { "epoch": 0.3141084862082619, "grad_norm": 1.5419764518737793, "learning_rate": 1.2557403008709421e-06, "loss": 0.1877, "num_input_tokens_seen": 1168512, "step": 2380 }, { "epoch": 0.3147683779860103, "grad_norm": 10.151047706604004, "learning_rate": 1.25837951966218e-06, "loss": 0.2214, "num_input_tokens_seen": 1171072, "step": 2385 }, { "epoch": 0.31542826976375876, "grad_norm": 0.7378384470939636, "learning_rate": 1.2610187384534177e-06, "loss": 0.0104, "num_input_tokens_seen": 1173312, "step": 2390 }, { "epoch": 0.3160881615415072, "grad_norm": 0.357771098613739, "learning_rate": 1.2636579572446554e-06, "loss": 0.1493, "num_input_tokens_seen": 1175744, "step": 2395 }, { "epoch": 0.31674805331925565, "grad_norm": 27.58684730529785, "learning_rate": 1.2662971760358934e-06, "loss": 0.2014, "num_input_tokens_seen": 1178112, "step": 2400 }, { "epoch": 0.3174079450970041, "grad_norm": 0.15001633763313293, "learning_rate": 1.2689363948271312e-06, "loss": 0.0012, "num_input_tokens_seen": 1180352, "step": 2405 }, { "epoch": 0.31806783687475254, "grad_norm": 0.09786776453256607, "learning_rate": 1.2715756136183688e-06, "loss": 0.1906, "num_input_tokens_seen": 1182528, "step": 2410 }, { "epoch": 0.318727728652501, "grad_norm": 0.4362722337245941, "learning_rate": 1.2742148324096067e-06, "loss": 0.018, "num_input_tokens_seen": 1185280, "step": 2415 }, { "epoch": 0.31938762043024943, "grad_norm": 27.211410522460938, "learning_rate": 1.2768540512008445e-06, "loss": 0.0975, "num_input_tokens_seen": 1188032, "step": 2420 }, { "epoch": 0.3200475122079979, "grad_norm": 20.800615310668945, "learning_rate": 1.279493269992082e-06, "loss": 0.2283, "num_input_tokens_seen": 1190464, "step": 2425 }, { "epoch": 0.3207074039857463, "grad_norm": 42.08029556274414, "learning_rate": 1.28213248878332e-06, "loss": 0.192, "num_input_tokens_seen": 1192960, "step": 2430 }, { "epoch": 0.32136729576349476, "grad_norm": 0.1869146227836609, "learning_rate": 1.2847717075745578e-06, "loss": 0.0019, "num_input_tokens_seen": 1195072, "step": 2435 }, { "epoch": 0.3220271875412432, "grad_norm": 0.02612818218767643, "learning_rate": 1.2874109263657958e-06, "loss": 0.0131, "num_input_tokens_seen": 1197376, "step": 2440 }, { "epoch": 0.3226870793189917, "grad_norm": 0.04093582555651665, "learning_rate": 1.2900501451570334e-06, "loss": 0.0088, "num_input_tokens_seen": 1199936, "step": 2445 }, { "epoch": 0.32334697109674015, "grad_norm": 187.89329528808594, "learning_rate": 1.2926893639482712e-06, "loss": 0.0404, "num_input_tokens_seen": 1202368, "step": 2450 }, { "epoch": 0.3240068628744886, "grad_norm": 0.03035602904856205, "learning_rate": 1.2953285827395091e-06, "loss": 0.0721, "num_input_tokens_seen": 1204800, "step": 2455 }, { "epoch": 0.32466675465223704, "grad_norm": 29.28488540649414, "learning_rate": 1.297967801530747e-06, "loss": 0.1065, "num_input_tokens_seen": 1207552, "step": 2460 }, { "epoch": 0.3253266464299855, "grad_norm": 0.03032883256673813, "learning_rate": 1.3006070203219845e-06, "loss": 0.2085, "num_input_tokens_seen": 1209856, "step": 2465 }, { "epoch": 0.32598653820773393, "grad_norm": 0.10402119904756546, "learning_rate": 1.3032462391132225e-06, "loss": 0.1298, "num_input_tokens_seen": 1212352, "step": 2470 }, { "epoch": 0.3266464299854824, "grad_norm": 0.1559586375951767, "learning_rate": 1.3058854579044602e-06, "loss": 0.0053, "num_input_tokens_seen": 1214912, "step": 2475 }, { "epoch": 0.3273063217632308, "grad_norm": 0.02763325348496437, "learning_rate": 1.308524676695698e-06, "loss": 0.0804, "num_input_tokens_seen": 1217088, "step": 2480 }, { "epoch": 0.32796621354097927, "grad_norm": 1.776210904121399, "learning_rate": 1.3111638954869358e-06, "loss": 0.1441, "num_input_tokens_seen": 1219584, "step": 2485 }, { "epoch": 0.3286261053187277, "grad_norm": 67.13372802734375, "learning_rate": 1.3138031142781736e-06, "loss": 0.1647, "num_input_tokens_seen": 1222336, "step": 2490 }, { "epoch": 0.32928599709647616, "grad_norm": 2.0714035034179688, "learning_rate": 1.3164423330694115e-06, "loss": 0.16, "num_input_tokens_seen": 1224832, "step": 2495 }, { "epoch": 0.3299458888742246, "grad_norm": 21.270225524902344, "learning_rate": 1.3190815518606493e-06, "loss": 0.3234, "num_input_tokens_seen": 1227392, "step": 2500 }, { "epoch": 0.3306057806519731, "grad_norm": 16.29952621459961, "learning_rate": 1.3217207706518869e-06, "loss": 0.1718, "num_input_tokens_seen": 1229952, "step": 2505 }, { "epoch": 0.33126567242972155, "grad_norm": 1.428592324256897, "learning_rate": 1.3243599894431249e-06, "loss": 0.0817, "num_input_tokens_seen": 1232640, "step": 2510 }, { "epoch": 0.33192556420747, "grad_norm": 163.8802490234375, "learning_rate": 1.3269992082343626e-06, "loss": 0.0409, "num_input_tokens_seen": 1235200, "step": 2515 }, { "epoch": 0.33258545598521844, "grad_norm": 0.16034527122974396, "learning_rate": 1.3296384270256002e-06, "loss": 0.0289, "num_input_tokens_seen": 1237632, "step": 2520 }, { "epoch": 0.3332453477629669, "grad_norm": 0.22546841204166412, "learning_rate": 1.3322776458168382e-06, "loss": 0.1123, "num_input_tokens_seen": 1239936, "step": 2525 }, { "epoch": 0.3339052395407153, "grad_norm": 0.36573493480682373, "learning_rate": 1.334916864608076e-06, "loss": 0.2255, "num_input_tokens_seen": 1242304, "step": 2530 }, { "epoch": 0.33456513131846377, "grad_norm": 0.039298903197050095, "learning_rate": 1.3375560833993137e-06, "loss": 0.0004, "num_input_tokens_seen": 1244736, "step": 2535 }, { "epoch": 0.3352250230962122, "grad_norm": 1.0564583539962769, "learning_rate": 1.3401953021905515e-06, "loss": 0.2463, "num_input_tokens_seen": 1247488, "step": 2540 }, { "epoch": 0.33588491487396066, "grad_norm": 0.1800013780593872, "learning_rate": 1.3428345209817893e-06, "loss": 0.1187, "num_input_tokens_seen": 1250048, "step": 2545 }, { "epoch": 0.3365448066517091, "grad_norm": 191.12794494628906, "learning_rate": 1.345473739773027e-06, "loss": 0.2226, "num_input_tokens_seen": 1252928, "step": 2550 }, { "epoch": 0.33720469842945755, "grad_norm": 0.26806724071502686, "learning_rate": 1.348112958564265e-06, "loss": 0.2176, "num_input_tokens_seen": 1255488, "step": 2555 }, { "epoch": 0.337864590207206, "grad_norm": 52.88833236694336, "learning_rate": 1.3507521773555026e-06, "loss": 0.4771, "num_input_tokens_seen": 1257984, "step": 2560 }, { "epoch": 0.3385244819849545, "grad_norm": 0.3574354648590088, "learning_rate": 1.3533913961467406e-06, "loss": 0.1681, "num_input_tokens_seen": 1260416, "step": 2565 }, { "epoch": 0.33918437376270294, "grad_norm": 48.450592041015625, "learning_rate": 1.3560306149379783e-06, "loss": 0.1162, "num_input_tokens_seen": 1263168, "step": 2570 }, { "epoch": 0.3398442655404514, "grad_norm": 0.33409392833709717, "learning_rate": 1.3586698337292161e-06, "loss": 0.1092, "num_input_tokens_seen": 1265600, "step": 2575 }, { "epoch": 0.34050415731819983, "grad_norm": 0.06134882941842079, "learning_rate": 1.361309052520454e-06, "loss": 0.0017, "num_input_tokens_seen": 1268032, "step": 2580 }, { "epoch": 0.3411640490959483, "grad_norm": 205.01483154296875, "learning_rate": 1.3639482713116917e-06, "loss": 0.124, "num_input_tokens_seen": 1270336, "step": 2585 }, { "epoch": 0.3418239408736967, "grad_norm": 27.653396606445312, "learning_rate": 1.3665874901029294e-06, "loss": 0.0837, "num_input_tokens_seen": 1273088, "step": 2590 }, { "epoch": 0.34248383265144516, "grad_norm": 0.3572693467140198, "learning_rate": 1.3692267088941674e-06, "loss": 0.2435, "num_input_tokens_seen": 1275456, "step": 2595 }, { "epoch": 0.3431437244291936, "grad_norm": 0.15710730850696564, "learning_rate": 1.371865927685405e-06, "loss": 0.0428, "num_input_tokens_seen": 1277888, "step": 2600 }, { "epoch": 0.34380361620694205, "grad_norm": 2.2360987663269043, "learning_rate": 1.3745051464766428e-06, "loss": 0.0011, "num_input_tokens_seen": 1280384, "step": 2605 }, { "epoch": 0.3444635079846905, "grad_norm": 0.7117704749107361, "learning_rate": 1.3771443652678807e-06, "loss": 0.0331, "num_input_tokens_seen": 1282944, "step": 2610 }, { "epoch": 0.34512339976243894, "grad_norm": 147.02932739257812, "learning_rate": 1.3797835840591185e-06, "loss": 0.1367, "num_input_tokens_seen": 1285184, "step": 2615 }, { "epoch": 0.3457832915401874, "grad_norm": 0.08217495679855347, "learning_rate": 1.382422802850356e-06, "loss": 0.202, "num_input_tokens_seen": 1287808, "step": 2620 }, { "epoch": 0.34644318331793583, "grad_norm": 1.0039873123168945, "learning_rate": 1.385062021641594e-06, "loss": 0.1634, "num_input_tokens_seen": 1290432, "step": 2625 }, { "epoch": 0.34710307509568433, "grad_norm": 0.36813926696777344, "learning_rate": 1.3877012404328318e-06, "loss": 0.1988, "num_input_tokens_seen": 1292736, "step": 2630 }, { "epoch": 0.3477629668734328, "grad_norm": 0.5801262259483337, "learning_rate": 1.3903404592240694e-06, "loss": 0.1405, "num_input_tokens_seen": 1295104, "step": 2635 }, { "epoch": 0.3484228586511812, "grad_norm": 0.835560142993927, "learning_rate": 1.3929796780153074e-06, "loss": 0.0939, "num_input_tokens_seen": 1297600, "step": 2640 }, { "epoch": 0.34908275042892967, "grad_norm": 0.08413344621658325, "learning_rate": 1.3956188968065452e-06, "loss": 0.0013, "num_input_tokens_seen": 1300224, "step": 2645 }, { "epoch": 0.3497426422066781, "grad_norm": 0.3069940209388733, "learning_rate": 1.3982581155977831e-06, "loss": 0.1809, "num_input_tokens_seen": 1302528, "step": 2650 }, { "epoch": 0.35040253398442656, "grad_norm": 0.15991085767745972, "learning_rate": 1.4008973343890207e-06, "loss": 0.2074, "num_input_tokens_seen": 1305024, "step": 2655 }, { "epoch": 0.351062425762175, "grad_norm": 0.2440817505121231, "learning_rate": 1.4035365531802585e-06, "loss": 0.2161, "num_input_tokens_seen": 1307392, "step": 2660 }, { "epoch": 0.35172231753992345, "grad_norm": 0.7089338302612305, "learning_rate": 1.4061757719714965e-06, "loss": 0.2352, "num_input_tokens_seen": 1310016, "step": 2665 }, { "epoch": 0.3523822093176719, "grad_norm": 0.12224601209163666, "learning_rate": 1.4088149907627342e-06, "loss": 0.0256, "num_input_tokens_seen": 1312576, "step": 2670 }, { "epoch": 0.35304210109542034, "grad_norm": 0.06766802072525024, "learning_rate": 1.4114542095539718e-06, "loss": 0.1432, "num_input_tokens_seen": 1315072, "step": 2675 }, { "epoch": 0.3537019928731688, "grad_norm": 47.305416107177734, "learning_rate": 1.4140934283452098e-06, "loss": 0.1524, "num_input_tokens_seen": 1317504, "step": 2680 }, { "epoch": 0.3543618846509172, "grad_norm": 0.1390659511089325, "learning_rate": 1.4167326471364476e-06, "loss": 0.001, "num_input_tokens_seen": 1319680, "step": 2685 }, { "epoch": 0.3550217764286657, "grad_norm": 0.08814946562051773, "learning_rate": 1.4193718659276853e-06, "loss": 0.0008, "num_input_tokens_seen": 1322240, "step": 2690 }, { "epoch": 0.35568166820641417, "grad_norm": 18.944007873535156, "learning_rate": 1.4220110847189231e-06, "loss": 0.1094, "num_input_tokens_seen": 1324736, "step": 2695 }, { "epoch": 0.3563415599841626, "grad_norm": 0.02471146546304226, "learning_rate": 1.4246503035101609e-06, "loss": 0.1183, "num_input_tokens_seen": 1327104, "step": 2700 }, { "epoch": 0.35700145176191106, "grad_norm": 28.54195213317871, "learning_rate": 1.4272895223013989e-06, "loss": 0.2524, "num_input_tokens_seen": 1329920, "step": 2705 }, { "epoch": 0.3576613435396595, "grad_norm": 0.09029418230056763, "learning_rate": 1.4299287410926366e-06, "loss": 0.1706, "num_input_tokens_seen": 1332352, "step": 2710 }, { "epoch": 0.35832123531740795, "grad_norm": 32.86302185058594, "learning_rate": 1.4325679598838742e-06, "loss": 0.0205, "num_input_tokens_seen": 1335040, "step": 2715 }, { "epoch": 0.3589811270951564, "grad_norm": 0.2393476665019989, "learning_rate": 1.4352071786751122e-06, "loss": 0.003, "num_input_tokens_seen": 1337472, "step": 2720 }, { "epoch": 0.35964101887290484, "grad_norm": 77.94792938232422, "learning_rate": 1.43784639746635e-06, "loss": 0.0809, "num_input_tokens_seen": 1339712, "step": 2725 }, { "epoch": 0.3603009106506533, "grad_norm": 0.08134565502405167, "learning_rate": 1.4404856162575877e-06, "loss": 0.0011, "num_input_tokens_seen": 1342144, "step": 2730 }, { "epoch": 0.36096080242840173, "grad_norm": 0.4735080897808075, "learning_rate": 1.4431248350488255e-06, "loss": 0.0817, "num_input_tokens_seen": 1344512, "step": 2735 }, { "epoch": 0.3616206942061502, "grad_norm": 395.6458435058594, "learning_rate": 1.4457640538400633e-06, "loss": 0.4208, "num_input_tokens_seen": 1346880, "step": 2740 }, { "epoch": 0.3622805859838986, "grad_norm": 11.863261222839355, "learning_rate": 1.448403272631301e-06, "loss": 0.0683, "num_input_tokens_seen": 1349056, "step": 2745 }, { "epoch": 0.36294047776164706, "grad_norm": 166.0413360595703, "learning_rate": 1.4510424914225388e-06, "loss": 0.1806, "num_input_tokens_seen": 1351488, "step": 2750 }, { "epoch": 0.36360036953939556, "grad_norm": 0.06676194816827774, "learning_rate": 1.4536817102137766e-06, "loss": 0.0011, "num_input_tokens_seen": 1353920, "step": 2755 }, { "epoch": 0.364260261317144, "grad_norm": 0.4566335678100586, "learning_rate": 1.4563209290050144e-06, "loss": 0.3443, "num_input_tokens_seen": 1356352, "step": 2760 }, { "epoch": 0.36492015309489245, "grad_norm": 0.3084438443183899, "learning_rate": 1.4589601477962524e-06, "loss": 0.2605, "num_input_tokens_seen": 1359104, "step": 2765 }, { "epoch": 0.3655800448726409, "grad_norm": 38.383583068847656, "learning_rate": 1.46159936658749e-06, "loss": 0.2077, "num_input_tokens_seen": 1361536, "step": 2770 }, { "epoch": 0.36623993665038934, "grad_norm": 0.5150312781333923, "learning_rate": 1.464238585378728e-06, "loss": 0.3646, "num_input_tokens_seen": 1364160, "step": 2775 }, { "epoch": 0.3668998284281378, "grad_norm": 6.813530445098877, "learning_rate": 1.4668778041699657e-06, "loss": 0.0023, "num_input_tokens_seen": 1366656, "step": 2780 }, { "epoch": 0.36755972020588623, "grad_norm": 40.210357666015625, "learning_rate": 1.4695170229612034e-06, "loss": 0.142, "num_input_tokens_seen": 1369216, "step": 2785 }, { "epoch": 0.3682196119836347, "grad_norm": 0.21548590064048767, "learning_rate": 1.4721562417524412e-06, "loss": 0.2354, "num_input_tokens_seen": 1371712, "step": 2790 }, { "epoch": 0.3688795037613831, "grad_norm": 0.28355929255485535, "learning_rate": 1.474795460543679e-06, "loss": 0.1816, "num_input_tokens_seen": 1374080, "step": 2795 }, { "epoch": 0.36953939553913157, "grad_norm": 1.1086498498916626, "learning_rate": 1.4774346793349168e-06, "loss": 0.1143, "num_input_tokens_seen": 1377024, "step": 2800 }, { "epoch": 0.37019928731688, "grad_norm": 0.7446885704994202, "learning_rate": 1.4800738981261548e-06, "loss": 0.122, "num_input_tokens_seen": 1379392, "step": 2805 }, { "epoch": 0.37085917909462845, "grad_norm": 0.14264525473117828, "learning_rate": 1.4827131169173923e-06, "loss": 0.1612, "num_input_tokens_seen": 1381760, "step": 2810 }, { "epoch": 0.37151907087237696, "grad_norm": 0.19203917682170868, "learning_rate": 1.48535233570863e-06, "loss": 0.0652, "num_input_tokens_seen": 1384128, "step": 2815 }, { "epoch": 0.3721789626501254, "grad_norm": 0.036427125334739685, "learning_rate": 1.487991554499868e-06, "loss": 0.053, "num_input_tokens_seen": 1386496, "step": 2820 }, { "epoch": 0.37283885442787384, "grad_norm": 273.5498046875, "learning_rate": 1.4906307732911058e-06, "loss": 0.1133, "num_input_tokens_seen": 1388736, "step": 2825 }, { "epoch": 0.3734987462056223, "grad_norm": 95.69959259033203, "learning_rate": 1.4932699920823434e-06, "loss": 0.0446, "num_input_tokens_seen": 1391040, "step": 2830 }, { "epoch": 0.37415863798337073, "grad_norm": 93.49053955078125, "learning_rate": 1.4959092108735814e-06, "loss": 0.093, "num_input_tokens_seen": 1393344, "step": 2835 }, { "epoch": 0.3748185297611192, "grad_norm": 1.0124017000198364, "learning_rate": 1.4985484296648192e-06, "loss": 0.074, "num_input_tokens_seen": 1395648, "step": 2840 }, { "epoch": 0.3754784215388676, "grad_norm": 0.015571318566799164, "learning_rate": 1.5011876484560567e-06, "loss": 0.2042, "num_input_tokens_seen": 1398272, "step": 2845 }, { "epoch": 0.37613831331661607, "grad_norm": 0.0427069365978241, "learning_rate": 1.5038268672472947e-06, "loss": 0.2285, "num_input_tokens_seen": 1400576, "step": 2850 }, { "epoch": 0.3767982050943645, "grad_norm": 0.03314981982111931, "learning_rate": 1.5064660860385325e-06, "loss": 0.049, "num_input_tokens_seen": 1403136, "step": 2855 }, { "epoch": 0.37745809687211296, "grad_norm": 0.028006955981254578, "learning_rate": 1.5091053048297705e-06, "loss": 0.0759, "num_input_tokens_seen": 1405504, "step": 2860 }, { "epoch": 0.3781179886498614, "grad_norm": 31.784631729125977, "learning_rate": 1.511744523621008e-06, "loss": 0.2051, "num_input_tokens_seen": 1407808, "step": 2865 }, { "epoch": 0.37877788042760985, "grad_norm": 79.9329605102539, "learning_rate": 1.5143837424122458e-06, "loss": 0.1983, "num_input_tokens_seen": 1410304, "step": 2870 }, { "epoch": 0.37943777220535835, "grad_norm": 0.17618303000926971, "learning_rate": 1.5170229612034838e-06, "loss": 0.0052, "num_input_tokens_seen": 1412736, "step": 2875 }, { "epoch": 0.3800976639831068, "grad_norm": 70.05154418945312, "learning_rate": 1.5196621799947216e-06, "loss": 0.1043, "num_input_tokens_seen": 1415488, "step": 2880 }, { "epoch": 0.38075755576085524, "grad_norm": 0.05183727666735649, "learning_rate": 1.5223013987859591e-06, "loss": 0.148, "num_input_tokens_seen": 1417856, "step": 2885 }, { "epoch": 0.3814174475386037, "grad_norm": 161.5440673828125, "learning_rate": 1.5249406175771971e-06, "loss": 0.1376, "num_input_tokens_seen": 1420096, "step": 2890 }, { "epoch": 0.3820773393163521, "grad_norm": 38.89706039428711, "learning_rate": 1.5275798363684349e-06, "loss": 0.1461, "num_input_tokens_seen": 1422656, "step": 2895 }, { "epoch": 0.38273723109410057, "grad_norm": 55.588035583496094, "learning_rate": 1.5302190551596727e-06, "loss": 0.1672, "num_input_tokens_seen": 1425152, "step": 2900 }, { "epoch": 0.383397122871849, "grad_norm": 0.15920549631118774, "learning_rate": 1.5328582739509104e-06, "loss": 0.0423, "num_input_tokens_seen": 1427840, "step": 2905 }, { "epoch": 0.38405701464959746, "grad_norm": 0.49687254428863525, "learning_rate": 1.5354974927421482e-06, "loss": 0.0218, "num_input_tokens_seen": 1430144, "step": 2910 }, { "epoch": 0.3847169064273459, "grad_norm": 0.13978274166584015, "learning_rate": 1.5381367115333862e-06, "loss": 0.1745, "num_input_tokens_seen": 1432448, "step": 2915 }, { "epoch": 0.38537679820509435, "grad_norm": 0.2328559309244156, "learning_rate": 1.540775930324624e-06, "loss": 0.0026, "num_input_tokens_seen": 1434752, "step": 2920 }, { "epoch": 0.3860366899828428, "grad_norm": 1.778712511062622, "learning_rate": 1.5434151491158615e-06, "loss": 0.001, "num_input_tokens_seen": 1437184, "step": 2925 }, { "epoch": 0.38669658176059124, "grad_norm": 0.156612828373909, "learning_rate": 1.5460543679070995e-06, "loss": 0.051, "num_input_tokens_seen": 1440000, "step": 2930 }, { "epoch": 0.3873564735383397, "grad_norm": 0.6022273898124695, "learning_rate": 1.5486935866983373e-06, "loss": 0.0864, "num_input_tokens_seen": 1442624, "step": 2935 }, { "epoch": 0.3880163653160882, "grad_norm": 0.10904964804649353, "learning_rate": 1.551332805489575e-06, "loss": 0.291, "num_input_tokens_seen": 1445184, "step": 2940 }, { "epoch": 0.38867625709383663, "grad_norm": 0.8670811653137207, "learning_rate": 1.5539720242808128e-06, "loss": 0.2756, "num_input_tokens_seen": 1447616, "step": 2945 }, { "epoch": 0.3893361488715851, "grad_norm": 1.617418646812439, "learning_rate": 1.5566112430720506e-06, "loss": 0.0006, "num_input_tokens_seen": 1450176, "step": 2950 }, { "epoch": 0.3899960406493335, "grad_norm": 101.99866485595703, "learning_rate": 1.5592504618632884e-06, "loss": 0.4024, "num_input_tokens_seen": 1452672, "step": 2955 }, { "epoch": 0.39065593242708196, "grad_norm": 116.28324890136719, "learning_rate": 1.5618896806545262e-06, "loss": 0.2948, "num_input_tokens_seen": 1455232, "step": 2960 }, { "epoch": 0.3913158242048304, "grad_norm": 67.55413818359375, "learning_rate": 1.564528899445764e-06, "loss": 0.2825, "num_input_tokens_seen": 1457792, "step": 2965 }, { "epoch": 0.39197571598257885, "grad_norm": 1.1112959384918213, "learning_rate": 1.5671681182370017e-06, "loss": 0.0889, "num_input_tokens_seen": 1460160, "step": 2970 }, { "epoch": 0.3926356077603273, "grad_norm": 4.917082786560059, "learning_rate": 1.5698073370282397e-06, "loss": 0.074, "num_input_tokens_seen": 1462400, "step": 2975 }, { "epoch": 0.39329549953807574, "grad_norm": 1.6990487575531006, "learning_rate": 1.5724465558194772e-06, "loss": 0.128, "num_input_tokens_seen": 1465088, "step": 2980 }, { "epoch": 0.3939553913158242, "grad_norm": 2.347425699234009, "learning_rate": 1.5750857746107152e-06, "loss": 0.1378, "num_input_tokens_seen": 1467456, "step": 2985 }, { "epoch": 0.39461528309357263, "grad_norm": 0.054718267172575, "learning_rate": 1.577724993401953e-06, "loss": 0.0013, "num_input_tokens_seen": 1470016, "step": 2990 }, { "epoch": 0.3952751748713211, "grad_norm": 0.06302514672279358, "learning_rate": 1.5803642121931908e-06, "loss": 0.0779, "num_input_tokens_seen": 1472448, "step": 2995 }, { "epoch": 0.3959350666490696, "grad_norm": 0.8244885802268982, "learning_rate": 1.5830034309844285e-06, "loss": 0.0693, "num_input_tokens_seen": 1475072, "step": 3000 }, { "epoch": 0.396594958426818, "grad_norm": 0.21904513239860535, "learning_rate": 1.5856426497756663e-06, "loss": 0.063, "num_input_tokens_seen": 1477440, "step": 3005 }, { "epoch": 0.39725485020456647, "grad_norm": 0.9207690358161926, "learning_rate": 1.588281868566904e-06, "loss": 0.0011, "num_input_tokens_seen": 1479872, "step": 3010 }, { "epoch": 0.3979147419823149, "grad_norm": 125.98383331298828, "learning_rate": 1.590921087358142e-06, "loss": 0.2275, "num_input_tokens_seen": 1481920, "step": 3015 }, { "epoch": 0.39857463376006336, "grad_norm": 0.06044726446270943, "learning_rate": 1.5935603061493796e-06, "loss": 0.2769, "num_input_tokens_seen": 1484992, "step": 3020 }, { "epoch": 0.3992345255378118, "grad_norm": 0.15632975101470947, "learning_rate": 1.5961995249406174e-06, "loss": 0.3505, "num_input_tokens_seen": 1487232, "step": 3025 }, { "epoch": 0.39989441731556025, "grad_norm": 217.93968200683594, "learning_rate": 1.5988387437318554e-06, "loss": 0.19, "num_input_tokens_seen": 1489728, "step": 3030 }, { "epoch": 0.4005543090933087, "grad_norm": 6.839831352233887, "learning_rate": 1.6014779625230932e-06, "loss": 0.2316, "num_input_tokens_seen": 1491968, "step": 3035 }, { "epoch": 0.40121420087105714, "grad_norm": 93.5851821899414, "learning_rate": 1.6041171813143307e-06, "loss": 0.1632, "num_input_tokens_seen": 1494336, "step": 3040 }, { "epoch": 0.4018740926488056, "grad_norm": 0.35064733028411865, "learning_rate": 1.6067564001055687e-06, "loss": 0.1404, "num_input_tokens_seen": 1497472, "step": 3045 }, { "epoch": 0.402533984426554, "grad_norm": 0.19418714940547943, "learning_rate": 1.6093956188968065e-06, "loss": 0.085, "num_input_tokens_seen": 1499968, "step": 3050 }, { "epoch": 0.40319387620430247, "grad_norm": 0.04916776716709137, "learning_rate": 1.612034837688044e-06, "loss": 0.0344, "num_input_tokens_seen": 1502336, "step": 3055 }, { "epoch": 0.40385376798205097, "grad_norm": 35.16215896606445, "learning_rate": 1.614674056479282e-06, "loss": 0.2477, "num_input_tokens_seen": 1504576, "step": 3060 }, { "epoch": 0.4045136597597994, "grad_norm": 78.75347900390625, "learning_rate": 1.6173132752705198e-06, "loss": 0.2589, "num_input_tokens_seen": 1507136, "step": 3065 }, { "epoch": 0.40517355153754786, "grad_norm": 1.299813985824585, "learning_rate": 1.6199524940617578e-06, "loss": 0.0691, "num_input_tokens_seen": 1509696, "step": 3070 }, { "epoch": 0.4058334433152963, "grad_norm": 0.5724449157714844, "learning_rate": 1.6225917128529954e-06, "loss": 0.2528, "num_input_tokens_seen": 1512192, "step": 3075 }, { "epoch": 0.40649333509304475, "grad_norm": 0.24407175183296204, "learning_rate": 1.6252309316442331e-06, "loss": 0.0691, "num_input_tokens_seen": 1514752, "step": 3080 }, { "epoch": 0.4071532268707932, "grad_norm": 34.682369232177734, "learning_rate": 1.6278701504354711e-06, "loss": 0.3173, "num_input_tokens_seen": 1517440, "step": 3085 }, { "epoch": 0.40781311864854164, "grad_norm": 0.07574780285358429, "learning_rate": 1.6305093692267089e-06, "loss": 0.0856, "num_input_tokens_seen": 1519744, "step": 3090 }, { "epoch": 0.4084730104262901, "grad_norm": 338.62677001953125, "learning_rate": 1.6331485880179465e-06, "loss": 0.1111, "num_input_tokens_seen": 1522176, "step": 3095 }, { "epoch": 0.40913290220403853, "grad_norm": 0.1823023557662964, "learning_rate": 1.6357878068091844e-06, "loss": 0.0694, "num_input_tokens_seen": 1524352, "step": 3100 }, { "epoch": 0.409792793981787, "grad_norm": 67.96218872070312, "learning_rate": 1.6384270256004222e-06, "loss": 0.2666, "num_input_tokens_seen": 1526976, "step": 3105 }, { "epoch": 0.4104526857595354, "grad_norm": 0.4283585548400879, "learning_rate": 1.64106624439166e-06, "loss": 0.1743, "num_input_tokens_seen": 1529152, "step": 3110 }, { "epoch": 0.41111257753728386, "grad_norm": 0.08526215702295303, "learning_rate": 1.6437054631828978e-06, "loss": 0.0033, "num_input_tokens_seen": 1531648, "step": 3115 }, { "epoch": 0.4117724693150323, "grad_norm": 0.20778554677963257, "learning_rate": 1.6463446819741355e-06, "loss": 0.1017, "num_input_tokens_seen": 1533952, "step": 3120 }, { "epoch": 0.4124323610927808, "grad_norm": 92.7779769897461, "learning_rate": 1.6489839007653735e-06, "loss": 0.294, "num_input_tokens_seen": 1536640, "step": 3125 }, { "epoch": 0.41309225287052925, "grad_norm": 0.7637858986854553, "learning_rate": 1.6516231195566113e-06, "loss": 0.0033, "num_input_tokens_seen": 1539200, "step": 3130 }, { "epoch": 0.4137521446482777, "grad_norm": 7.557998180389404, "learning_rate": 1.6542623383478489e-06, "loss": 0.1664, "num_input_tokens_seen": 1541632, "step": 3135 }, { "epoch": 0.41441203642602614, "grad_norm": 36.74544906616211, "learning_rate": 1.6569015571390868e-06, "loss": 0.0619, "num_input_tokens_seen": 1544128, "step": 3140 }, { "epoch": 0.4150719282037746, "grad_norm": 0.06126031652092934, "learning_rate": 1.6595407759303246e-06, "loss": 0.0231, "num_input_tokens_seen": 1546368, "step": 3145 }, { "epoch": 0.41573181998152303, "grad_norm": 1.9612305164337158, "learning_rate": 1.6621799947215624e-06, "loss": 0.4345, "num_input_tokens_seen": 1548736, "step": 3150 }, { "epoch": 0.4163917117592715, "grad_norm": 0.30381351709365845, "learning_rate": 1.6648192135128002e-06, "loss": 0.2605, "num_input_tokens_seen": 1551168, "step": 3155 }, { "epoch": 0.4170516035370199, "grad_norm": 0.25879448652267456, "learning_rate": 1.667458432304038e-06, "loss": 0.1204, "num_input_tokens_seen": 1553664, "step": 3160 }, { "epoch": 0.41771149531476837, "grad_norm": 109.06922149658203, "learning_rate": 1.6700976510952757e-06, "loss": 0.1409, "num_input_tokens_seen": 1555968, "step": 3165 }, { "epoch": 0.4183713870925168, "grad_norm": 45.09346389770508, "learning_rate": 1.6727368698865135e-06, "loss": 0.064, "num_input_tokens_seen": 1558208, "step": 3170 }, { "epoch": 0.41903127887026526, "grad_norm": 1.3239551782608032, "learning_rate": 1.6753760886777513e-06, "loss": 0.0094, "num_input_tokens_seen": 1560640, "step": 3175 }, { "epoch": 0.4196911706480137, "grad_norm": 2.044196128845215, "learning_rate": 1.678015307468989e-06, "loss": 0.0375, "num_input_tokens_seen": 1563328, "step": 3180 }, { "epoch": 0.4203510624257622, "grad_norm": 0.24516402184963226, "learning_rate": 1.680654526260227e-06, "loss": 0.1175, "num_input_tokens_seen": 1565696, "step": 3185 }, { "epoch": 0.42101095420351065, "grad_norm": 0.20431405305862427, "learning_rate": 1.6832937450514646e-06, "loss": 0.4013, "num_input_tokens_seen": 1568640, "step": 3190 }, { "epoch": 0.4216708459812591, "grad_norm": 9.456520080566406, "learning_rate": 1.6859329638427023e-06, "loss": 0.0542, "num_input_tokens_seen": 1571328, "step": 3195 }, { "epoch": 0.42233073775900754, "grad_norm": 26.781938552856445, "learning_rate": 1.6885721826339403e-06, "loss": 0.5376, "num_input_tokens_seen": 1574016, "step": 3200 }, { "epoch": 0.422990629536756, "grad_norm": 37.978271484375, "learning_rate": 1.691211401425178e-06, "loss": 0.2516, "num_input_tokens_seen": 1576384, "step": 3205 }, { "epoch": 0.4236505213145044, "grad_norm": 0.1549421101808548, "learning_rate": 1.6938506202164159e-06, "loss": 0.217, "num_input_tokens_seen": 1579200, "step": 3210 }, { "epoch": 0.42431041309225287, "grad_norm": 54.167076110839844, "learning_rate": 1.6964898390076536e-06, "loss": 0.1522, "num_input_tokens_seen": 1581632, "step": 3215 }, { "epoch": 0.4249703048700013, "grad_norm": 1.0169860124588013, "learning_rate": 1.6991290577988914e-06, "loss": 0.0283, "num_input_tokens_seen": 1584064, "step": 3220 }, { "epoch": 0.42563019664774976, "grad_norm": 52.26798629760742, "learning_rate": 1.7017682765901294e-06, "loss": 0.0425, "num_input_tokens_seen": 1586752, "step": 3225 }, { "epoch": 0.4262900884254982, "grad_norm": 19.18907356262207, "learning_rate": 1.704407495381367e-06, "loss": 0.1471, "num_input_tokens_seen": 1589248, "step": 3230 }, { "epoch": 0.42694998020324665, "grad_norm": 0.09750813245773315, "learning_rate": 1.7070467141726047e-06, "loss": 0.0052, "num_input_tokens_seen": 1591552, "step": 3235 }, { "epoch": 0.4276098719809951, "grad_norm": 1.1829841136932373, "learning_rate": 1.7096859329638427e-06, "loss": 0.0685, "num_input_tokens_seen": 1594048, "step": 3240 }, { "epoch": 0.4282697637587436, "grad_norm": 74.45574188232422, "learning_rate": 1.7123251517550805e-06, "loss": 0.2108, "num_input_tokens_seen": 1596544, "step": 3245 }, { "epoch": 0.42892965553649204, "grad_norm": 41.79922866821289, "learning_rate": 1.714964370546318e-06, "loss": 0.2884, "num_input_tokens_seen": 1599104, "step": 3250 }, { "epoch": 0.4295895473142405, "grad_norm": 0.04245501011610031, "learning_rate": 1.717603589337556e-06, "loss": 0.1717, "num_input_tokens_seen": 1601344, "step": 3255 }, { "epoch": 0.43024943909198893, "grad_norm": 22.026790618896484, "learning_rate": 1.7202428081287938e-06, "loss": 0.0818, "num_input_tokens_seen": 1603968, "step": 3260 }, { "epoch": 0.4309093308697374, "grad_norm": 61.95907211303711, "learning_rate": 1.7228820269200314e-06, "loss": 0.0261, "num_input_tokens_seen": 1606400, "step": 3265 }, { "epoch": 0.4315692226474858, "grad_norm": 258.6390380859375, "learning_rate": 1.7255212457112694e-06, "loss": 0.2814, "num_input_tokens_seen": 1608960, "step": 3270 }, { "epoch": 0.43222911442523426, "grad_norm": 0.6005437970161438, "learning_rate": 1.7281604645025071e-06, "loss": 0.0021, "num_input_tokens_seen": 1611456, "step": 3275 }, { "epoch": 0.4328890062029827, "grad_norm": 0.05120214447379112, "learning_rate": 1.7307996832937451e-06, "loss": 0.0595, "num_input_tokens_seen": 1613952, "step": 3280 }, { "epoch": 0.43354889798073115, "grad_norm": 30.487438201904297, "learning_rate": 1.7334389020849827e-06, "loss": 0.1862, "num_input_tokens_seen": 1616320, "step": 3285 }, { "epoch": 0.4342087897584796, "grad_norm": 0.9038780927658081, "learning_rate": 1.7360781208762205e-06, "loss": 0.3119, "num_input_tokens_seen": 1619136, "step": 3290 }, { "epoch": 0.43486868153622804, "grad_norm": 5.6396050453186035, "learning_rate": 1.7387173396674584e-06, "loss": 0.0999, "num_input_tokens_seen": 1621632, "step": 3295 }, { "epoch": 0.4355285733139765, "grad_norm": 1.9114316701889038, "learning_rate": 1.7413565584586962e-06, "loss": 0.1682, "num_input_tokens_seen": 1624064, "step": 3300 }, { "epoch": 0.43618846509172493, "grad_norm": 46.63256072998047, "learning_rate": 1.7439957772499338e-06, "loss": 0.1445, "num_input_tokens_seen": 1626496, "step": 3305 }, { "epoch": 0.43684835686947343, "grad_norm": 20.691329956054688, "learning_rate": 1.7466349960411718e-06, "loss": 0.1393, "num_input_tokens_seen": 1628800, "step": 3310 }, { "epoch": 0.4375082486472219, "grad_norm": 100.15694427490234, "learning_rate": 1.7492742148324095e-06, "loss": 0.1456, "num_input_tokens_seen": 1631232, "step": 3315 }, { "epoch": 0.4381681404249703, "grad_norm": 0.0552111379802227, "learning_rate": 1.7519134336236473e-06, "loss": 0.0401, "num_input_tokens_seen": 1633728, "step": 3320 }, { "epoch": 0.43882803220271877, "grad_norm": 0.3552817702293396, "learning_rate": 1.754552652414885e-06, "loss": 0.1055, "num_input_tokens_seen": 1636224, "step": 3325 }, { "epoch": 0.4394879239804672, "grad_norm": 7.17879581451416, "learning_rate": 1.7571918712061229e-06, "loss": 0.0655, "num_input_tokens_seen": 1638272, "step": 3330 }, { "epoch": 0.44014781575821565, "grad_norm": 0.05285099148750305, "learning_rate": 1.7598310899973608e-06, "loss": 0.0947, "num_input_tokens_seen": 1640512, "step": 3335 }, { "epoch": 0.4408077075359641, "grad_norm": 0.15522630512714386, "learning_rate": 1.7624703087885986e-06, "loss": 0.0818, "num_input_tokens_seen": 1642944, "step": 3340 }, { "epoch": 0.44146759931371254, "grad_norm": 220.8441925048828, "learning_rate": 1.7651095275798362e-06, "loss": 0.0882, "num_input_tokens_seen": 1645376, "step": 3345 }, { "epoch": 0.442127491091461, "grad_norm": 0.0852712094783783, "learning_rate": 1.7677487463710742e-06, "loss": 0.0007, "num_input_tokens_seen": 1647936, "step": 3350 }, { "epoch": 0.44278738286920943, "grad_norm": 0.47491714358329773, "learning_rate": 1.770387965162312e-06, "loss": 0.1307, "num_input_tokens_seen": 1650496, "step": 3355 }, { "epoch": 0.4434472746469579, "grad_norm": 0.023417294025421143, "learning_rate": 1.7730271839535497e-06, "loss": 0.1083, "num_input_tokens_seen": 1652992, "step": 3360 }, { "epoch": 0.4441071664247063, "grad_norm": 0.10983436554670334, "learning_rate": 1.7756664027447875e-06, "loss": 0.0738, "num_input_tokens_seen": 1655488, "step": 3365 }, { "epoch": 0.4447670582024548, "grad_norm": 0.035877879709005356, "learning_rate": 1.7783056215360253e-06, "loss": 0.1521, "num_input_tokens_seen": 1657984, "step": 3370 }, { "epoch": 0.44542694998020327, "grad_norm": 0.28190505504608154, "learning_rate": 1.780944840327263e-06, "loss": 0.2652, "num_input_tokens_seen": 1660736, "step": 3375 }, { "epoch": 0.4460868417579517, "grad_norm": 17.0957088470459, "learning_rate": 1.7835840591185008e-06, "loss": 0.1661, "num_input_tokens_seen": 1663104, "step": 3380 }, { "epoch": 0.44674673353570016, "grad_norm": 95.27015686035156, "learning_rate": 1.7862232779097386e-06, "loss": 0.0768, "num_input_tokens_seen": 1665344, "step": 3385 }, { "epoch": 0.4474066253134486, "grad_norm": 41.594940185546875, "learning_rate": 1.7888624967009763e-06, "loss": 0.1197, "num_input_tokens_seen": 1668096, "step": 3390 }, { "epoch": 0.44806651709119705, "grad_norm": 24.987401962280273, "learning_rate": 1.7915017154922143e-06, "loss": 0.0718, "num_input_tokens_seen": 1670272, "step": 3395 }, { "epoch": 0.4487264088689455, "grad_norm": 1.6394327878952026, "learning_rate": 1.794140934283452e-06, "loss": 0.1287, "num_input_tokens_seen": 1672448, "step": 3400 }, { "epoch": 0.44938630064669394, "grad_norm": 165.666259765625, "learning_rate": 1.7967801530746897e-06, "loss": 0.0269, "num_input_tokens_seen": 1675200, "step": 3405 }, { "epoch": 0.4500461924244424, "grad_norm": 9.98177433013916, "learning_rate": 1.7994193718659277e-06, "loss": 0.1333, "num_input_tokens_seen": 1677696, "step": 3410 }, { "epoch": 0.4507060842021908, "grad_norm": 18.331186294555664, "learning_rate": 1.8020585906571654e-06, "loss": 0.0596, "num_input_tokens_seen": 1680256, "step": 3415 }, { "epoch": 0.45136597597993927, "grad_norm": 0.17333543300628662, "learning_rate": 1.8046978094484032e-06, "loss": 0.085, "num_input_tokens_seen": 1682624, "step": 3420 }, { "epoch": 0.4520258677576877, "grad_norm": 49.40113830566406, "learning_rate": 1.807337028239641e-06, "loss": 0.0495, "num_input_tokens_seen": 1685056, "step": 3425 }, { "epoch": 0.45268575953543616, "grad_norm": 242.10452270507812, "learning_rate": 1.8099762470308787e-06, "loss": 0.1929, "num_input_tokens_seen": 1687168, "step": 3430 }, { "epoch": 0.45334565131318466, "grad_norm": 2.2440037727355957, "learning_rate": 1.8126154658221167e-06, "loss": 0.277, "num_input_tokens_seen": 1689216, "step": 3435 }, { "epoch": 0.4540055430909331, "grad_norm": 0.2605953812599182, "learning_rate": 1.8152546846133543e-06, "loss": 0.0305, "num_input_tokens_seen": 1691776, "step": 3440 }, { "epoch": 0.45466543486868155, "grad_norm": 0.7526996731758118, "learning_rate": 1.817893903404592e-06, "loss": 0.0776, "num_input_tokens_seen": 1694336, "step": 3445 }, { "epoch": 0.45532532664643, "grad_norm": 34.56523132324219, "learning_rate": 1.82053312219583e-06, "loss": 0.2252, "num_input_tokens_seen": 1696640, "step": 3450 }, { "epoch": 0.45598521842417844, "grad_norm": 16.095619201660156, "learning_rate": 1.8231723409870678e-06, "loss": 0.0788, "num_input_tokens_seen": 1698880, "step": 3455 }, { "epoch": 0.4566451102019269, "grad_norm": 74.69313049316406, "learning_rate": 1.8258115597783054e-06, "loss": 0.2611, "num_input_tokens_seen": 1701312, "step": 3460 }, { "epoch": 0.45730500197967533, "grad_norm": 50.04304885864258, "learning_rate": 1.8284507785695434e-06, "loss": 0.2096, "num_input_tokens_seen": 1703808, "step": 3465 }, { "epoch": 0.4579648937574238, "grad_norm": 82.02398681640625, "learning_rate": 1.8310899973607811e-06, "loss": 0.0747, "num_input_tokens_seen": 1706304, "step": 3470 }, { "epoch": 0.4586247855351722, "grad_norm": 15.55357837677002, "learning_rate": 1.8337292161520187e-06, "loss": 0.3095, "num_input_tokens_seen": 1708736, "step": 3475 }, { "epoch": 0.45928467731292066, "grad_norm": 0.8830556273460388, "learning_rate": 1.8363684349432567e-06, "loss": 0.1241, "num_input_tokens_seen": 1711232, "step": 3480 }, { "epoch": 0.4599445690906691, "grad_norm": 0.3285176157951355, "learning_rate": 1.8390076537344945e-06, "loss": 0.0022, "num_input_tokens_seen": 1713600, "step": 3485 }, { "epoch": 0.46060446086841755, "grad_norm": 0.16952642798423767, "learning_rate": 1.8416468725257325e-06, "loss": 0.1762, "num_input_tokens_seen": 1716224, "step": 3490 }, { "epoch": 0.46126435264616605, "grad_norm": 62.19145965576172, "learning_rate": 1.84428609131697e-06, "loss": 0.2366, "num_input_tokens_seen": 1718720, "step": 3495 }, { "epoch": 0.4619242444239145, "grad_norm": 19.505321502685547, "learning_rate": 1.8469253101082078e-06, "loss": 0.2068, "num_input_tokens_seen": 1721280, "step": 3500 }, { "epoch": 0.46258413620166294, "grad_norm": 57.49911880493164, "learning_rate": 1.8495645288994458e-06, "loss": 0.0785, "num_input_tokens_seen": 1723712, "step": 3505 }, { "epoch": 0.4632440279794114, "grad_norm": 36.87958526611328, "learning_rate": 1.8522037476906835e-06, "loss": 0.1624, "num_input_tokens_seen": 1725952, "step": 3510 }, { "epoch": 0.46390391975715983, "grad_norm": 6.948802947998047, "learning_rate": 1.8548429664819211e-06, "loss": 0.0022, "num_input_tokens_seen": 1728512, "step": 3515 }, { "epoch": 0.4645638115349083, "grad_norm": 116.72127532958984, "learning_rate": 1.857482185273159e-06, "loss": 0.1997, "num_input_tokens_seen": 1731008, "step": 3520 }, { "epoch": 0.4652237033126567, "grad_norm": 36.398902893066406, "learning_rate": 1.8601214040643969e-06, "loss": 0.1817, "num_input_tokens_seen": 1733696, "step": 3525 }, { "epoch": 0.46588359509040517, "grad_norm": 158.40782165527344, "learning_rate": 1.8627606228556346e-06, "loss": 0.0807, "num_input_tokens_seen": 1736256, "step": 3530 }, { "epoch": 0.4665434868681536, "grad_norm": 56.720367431640625, "learning_rate": 1.8653998416468724e-06, "loss": 0.2405, "num_input_tokens_seen": 1738944, "step": 3535 }, { "epoch": 0.46720337864590206, "grad_norm": 19.342018127441406, "learning_rate": 1.8680390604381102e-06, "loss": 0.207, "num_input_tokens_seen": 1741568, "step": 3540 }, { "epoch": 0.4678632704236505, "grad_norm": 0.12003885954618454, "learning_rate": 1.8706782792293482e-06, "loss": 0.0024, "num_input_tokens_seen": 1743808, "step": 3545 }, { "epoch": 0.46852316220139895, "grad_norm": 0.5443273782730103, "learning_rate": 1.873317498020586e-06, "loss": 0.035, "num_input_tokens_seen": 1746176, "step": 3550 }, { "epoch": 0.46918305397914745, "grad_norm": 0.12307876348495483, "learning_rate": 1.8759567168118235e-06, "loss": 0.214, "num_input_tokens_seen": 1748608, "step": 3555 }, { "epoch": 0.4698429457568959, "grad_norm": 0.24467946588993073, "learning_rate": 1.8785959356030615e-06, "loss": 0.0983, "num_input_tokens_seen": 1751040, "step": 3560 }, { "epoch": 0.47050283753464434, "grad_norm": 2.263025999069214, "learning_rate": 1.8812351543942993e-06, "loss": 0.1472, "num_input_tokens_seen": 1753280, "step": 3565 }, { "epoch": 0.4711627293123928, "grad_norm": 14.852109909057617, "learning_rate": 1.883874373185537e-06, "loss": 0.4087, "num_input_tokens_seen": 1755712, "step": 3570 }, { "epoch": 0.4718226210901412, "grad_norm": 53.581016540527344, "learning_rate": 1.8865135919767748e-06, "loss": 0.1291, "num_input_tokens_seen": 1757952, "step": 3575 }, { "epoch": 0.47248251286788967, "grad_norm": 116.46224212646484, "learning_rate": 1.8891528107680126e-06, "loss": 0.0719, "num_input_tokens_seen": 1760384, "step": 3580 }, { "epoch": 0.4731424046456381, "grad_norm": 0.08298421651124954, "learning_rate": 1.8917920295592504e-06, "loss": 0.0011, "num_input_tokens_seen": 1762944, "step": 3585 }, { "epoch": 0.47380229642338656, "grad_norm": 29.53057861328125, "learning_rate": 1.8944312483504881e-06, "loss": 0.2399, "num_input_tokens_seen": 1765504, "step": 3590 }, { "epoch": 0.474462188201135, "grad_norm": 0.1717340499162674, "learning_rate": 1.897070467141726e-06, "loss": 0.0452, "num_input_tokens_seen": 1768128, "step": 3595 }, { "epoch": 0.47512207997888345, "grad_norm": 0.200901597738266, "learning_rate": 1.8997096859329637e-06, "loss": 0.1898, "num_input_tokens_seen": 1770624, "step": 3600 }, { "epoch": 0.4757819717566319, "grad_norm": 24.74864387512207, "learning_rate": 1.9023489047242017e-06, "loss": 0.329, "num_input_tokens_seen": 1773248, "step": 3605 }, { "epoch": 0.47644186353438034, "grad_norm": 22.671142578125, "learning_rate": 1.9049881235154392e-06, "loss": 0.0657, "num_input_tokens_seen": 1775488, "step": 3610 }, { "epoch": 0.4771017553121288, "grad_norm": 1.239869475364685, "learning_rate": 1.907627342306677e-06, "loss": 0.1438, "num_input_tokens_seen": 1777792, "step": 3615 }, { "epoch": 0.4777616470898773, "grad_norm": 1.57161283493042, "learning_rate": 1.9102665610979148e-06, "loss": 0.0654, "num_input_tokens_seen": 1780352, "step": 3620 }, { "epoch": 0.47842153886762573, "grad_norm": 0.7097579836845398, "learning_rate": 1.9129057798891528e-06, "loss": 0.2003, "num_input_tokens_seen": 1783168, "step": 3625 }, { "epoch": 0.4790814306453742, "grad_norm": 23.16379737854004, "learning_rate": 1.9155449986803903e-06, "loss": 0.1582, "num_input_tokens_seen": 1785920, "step": 3630 }, { "epoch": 0.4797413224231226, "grad_norm": 0.09995155036449432, "learning_rate": 1.9181842174716283e-06, "loss": 0.1929, "num_input_tokens_seen": 1788416, "step": 3635 }, { "epoch": 0.48040121420087106, "grad_norm": 0.15477493405342102, "learning_rate": 1.9208234362628663e-06, "loss": 0.0412, "num_input_tokens_seen": 1790848, "step": 3640 }, { "epoch": 0.4810611059786195, "grad_norm": 73.67361450195312, "learning_rate": 1.923462655054104e-06, "loss": 0.0349, "num_input_tokens_seen": 1793280, "step": 3645 }, { "epoch": 0.48172099775636795, "grad_norm": 0.04567524790763855, "learning_rate": 1.9261018738453414e-06, "loss": 0.209, "num_input_tokens_seen": 1795648, "step": 3650 }, { "epoch": 0.4823808895341164, "grad_norm": 30.12908363342285, "learning_rate": 1.9287410926365794e-06, "loss": 0.1709, "num_input_tokens_seen": 1797952, "step": 3655 }, { "epoch": 0.48304078131186484, "grad_norm": 53.03123092651367, "learning_rate": 1.9313803114278174e-06, "loss": 0.099, "num_input_tokens_seen": 1800192, "step": 3660 }, { "epoch": 0.4837006730896133, "grad_norm": 0.17931802570819855, "learning_rate": 1.9340195302190554e-06, "loss": 0.1852, "num_input_tokens_seen": 1802560, "step": 3665 }, { "epoch": 0.48436056486736173, "grad_norm": 77.04347229003906, "learning_rate": 1.936658749010293e-06, "loss": 0.2034, "num_input_tokens_seen": 1805056, "step": 3670 }, { "epoch": 0.4850204566451102, "grad_norm": 16.242393493652344, "learning_rate": 1.9392979678015305e-06, "loss": 0.2424, "num_input_tokens_seen": 1807744, "step": 3675 }, { "epoch": 0.4856803484228587, "grad_norm": 39.54520797729492, "learning_rate": 1.9419371865927685e-06, "loss": 0.1529, "num_input_tokens_seen": 1810368, "step": 3680 }, { "epoch": 0.4863402402006071, "grad_norm": 0.6363928914070129, "learning_rate": 1.944576405384006e-06, "loss": 0.0411, "num_input_tokens_seen": 1813056, "step": 3685 }, { "epoch": 0.48700013197835557, "grad_norm": 0.6807913184165955, "learning_rate": 1.947215624175244e-06, "loss": 0.2287, "num_input_tokens_seen": 1815360, "step": 3690 }, { "epoch": 0.487660023756104, "grad_norm": 16.44377326965332, "learning_rate": 1.949854842966482e-06, "loss": 0.1827, "num_input_tokens_seen": 1817920, "step": 3695 }, { "epoch": 0.48831991553385246, "grad_norm": 15.65255355834961, "learning_rate": 1.9524940617577196e-06, "loss": 0.0576, "num_input_tokens_seen": 1820288, "step": 3700 }, { "epoch": 0.4889798073116009, "grad_norm": 0.26828962564468384, "learning_rate": 1.955133280548957e-06, "loss": 0.1, "num_input_tokens_seen": 1822656, "step": 3705 }, { "epoch": 0.48963969908934935, "grad_norm": 18.101781845092773, "learning_rate": 1.957772499340195e-06, "loss": 0.1911, "num_input_tokens_seen": 1824832, "step": 3710 }, { "epoch": 0.4902995908670978, "grad_norm": 138.92214965820312, "learning_rate": 1.960411718131433e-06, "loss": 0.1059, "num_input_tokens_seen": 1827200, "step": 3715 }, { "epoch": 0.49095948264484623, "grad_norm": 17.669477462768555, "learning_rate": 1.963050936922671e-06, "loss": 0.1355, "num_input_tokens_seen": 1829632, "step": 3720 }, { "epoch": 0.4916193744225947, "grad_norm": 35.92660140991211, "learning_rate": 1.9656901557139086e-06, "loss": 0.0601, "num_input_tokens_seen": 1832192, "step": 3725 }, { "epoch": 0.4922792662003431, "grad_norm": 90.45658111572266, "learning_rate": 1.968329374505146e-06, "loss": 0.1779, "num_input_tokens_seen": 1834752, "step": 3730 }, { "epoch": 0.49293915797809157, "grad_norm": 7.398950099945068, "learning_rate": 1.970968593296384e-06, "loss": 0.0968, "num_input_tokens_seen": 1837248, "step": 3735 }, { "epoch": 0.49359904975584007, "grad_norm": 62.705711364746094, "learning_rate": 1.973607812087622e-06, "loss": 0.2923, "num_input_tokens_seen": 1839680, "step": 3740 }, { "epoch": 0.4942589415335885, "grad_norm": 0.30728596448898315, "learning_rate": 1.9762470308788597e-06, "loss": 0.115, "num_input_tokens_seen": 1842304, "step": 3745 }, { "epoch": 0.49491883331133696, "grad_norm": 1.9036290645599365, "learning_rate": 1.9788862496700977e-06, "loss": 0.0656, "num_input_tokens_seen": 1844480, "step": 3750 }, { "epoch": 0.4955787250890854, "grad_norm": 0.364557683467865, "learning_rate": 1.9815254684613353e-06, "loss": 0.0883, "num_input_tokens_seen": 1846912, "step": 3755 }, { "epoch": 0.49623861686683385, "grad_norm": 14.800479888916016, "learning_rate": 1.9841646872525733e-06, "loss": 0.1242, "num_input_tokens_seen": 1849024, "step": 3760 }, { "epoch": 0.4968985086445823, "grad_norm": 0.4379720389842987, "learning_rate": 1.986803906043811e-06, "loss": 0.0576, "num_input_tokens_seen": 1851712, "step": 3765 }, { "epoch": 0.49755840042233074, "grad_norm": 74.74554443359375, "learning_rate": 1.989443124835049e-06, "loss": 0.1076, "num_input_tokens_seen": 1854208, "step": 3770 }, { "epoch": 0.4982182922000792, "grad_norm": 0.05924411118030548, "learning_rate": 1.9920823436262864e-06, "loss": 0.0448, "num_input_tokens_seen": 1856704, "step": 3775 }, { "epoch": 0.4988781839778276, "grad_norm": 200.05575561523438, "learning_rate": 1.9947215624175244e-06, "loss": 0.1233, "num_input_tokens_seen": 1858944, "step": 3780 }, { "epoch": 0.49953807575557607, "grad_norm": 0.9478186964988708, "learning_rate": 1.997360781208762e-06, "loss": 0.1662, "num_input_tokens_seen": 1861696, "step": 3785 }, { "epoch": 0.5001979675333246, "grad_norm": 0.37777480483055115, "learning_rate": 2e-06, "loss": 0.3196, "num_input_tokens_seen": 1864128, "step": 3790 }, { "epoch": 0.5001979675333246, "eval_loss": 0.15765729546546936, "eval_runtime": 7.8748, "eval_samples_per_second": 855.255, "eval_steps_per_second": 106.923, "num_input_tokens_seen": 1864128, "step": 3790 }, { "epoch": 0.500857859311073, "grad_norm": 0.1402450054883957, "learning_rate": 1.9999998938786208e-06, "loss": 0.0058, "num_input_tokens_seen": 1866432, "step": 3795 }, { "epoch": 0.5015177510888215, "grad_norm": 16.13665008544922, "learning_rate": 1.9999995755145053e-06, "loss": 0.0722, "num_input_tokens_seen": 1868736, "step": 3800 }, { "epoch": 0.5021776428665699, "grad_norm": 14.748276710510254, "learning_rate": 1.9999990449077214e-06, "loss": 0.1512, "num_input_tokens_seen": 1871360, "step": 3805 }, { "epoch": 0.5028375346443184, "grad_norm": 0.05216236785054207, "learning_rate": 1.999998302058382e-06, "loss": 0.0042, "num_input_tokens_seen": 1873536, "step": 3810 }, { "epoch": 0.5034974264220667, "grad_norm": 123.01942443847656, "learning_rate": 1.999997346966644e-06, "loss": 0.2111, "num_input_tokens_seen": 1875904, "step": 3815 }, { "epoch": 0.5041573181998152, "grad_norm": 34.44160461425781, "learning_rate": 1.999996179632711e-06, "loss": 0.2222, "num_input_tokens_seen": 1878464, "step": 3820 }, { "epoch": 0.5048172099775636, "grad_norm": 115.07276153564453, "learning_rate": 1.9999948000568297e-06, "loss": 0.1967, "num_input_tokens_seen": 1880640, "step": 3825 }, { "epoch": 0.5054771017553121, "grad_norm": 106.39981079101562, "learning_rate": 1.9999932082392934e-06, "loss": 0.1649, "num_input_tokens_seen": 1882944, "step": 3830 }, { "epoch": 0.5061369935330606, "grad_norm": 1.3459759950637817, "learning_rate": 1.9999914041804405e-06, "loss": 0.064, "num_input_tokens_seen": 1885248, "step": 3835 }, { "epoch": 0.506796885310809, "grad_norm": 21.34935188293457, "learning_rate": 1.9999893878806534e-06, "loss": 0.1077, "num_input_tokens_seen": 1887872, "step": 3840 }, { "epoch": 0.5074567770885575, "grad_norm": 185.3477020263672, "learning_rate": 1.99998715934036e-06, "loss": 0.0414, "num_input_tokens_seen": 1890560, "step": 3845 }, { "epoch": 0.5081166688663059, "grad_norm": 231.92300415039062, "learning_rate": 1.999984718560033e-06, "loss": 0.1999, "num_input_tokens_seen": 1893056, "step": 3850 }, { "epoch": 0.5087765606440544, "grad_norm": 0.08792197704315186, "learning_rate": 1.9999820655401914e-06, "loss": 0.1416, "num_input_tokens_seen": 1895360, "step": 3855 }, { "epoch": 0.5094364524218028, "grad_norm": 0.09239798784255981, "learning_rate": 1.9999792002813973e-06, "loss": 0.0954, "num_input_tokens_seen": 1897664, "step": 3860 }, { "epoch": 0.5100963441995513, "grad_norm": 50.75433349609375, "learning_rate": 1.9999761227842592e-06, "loss": 0.27, "num_input_tokens_seen": 1900288, "step": 3865 }, { "epoch": 0.5107562359772997, "grad_norm": 0.144536554813385, "learning_rate": 1.9999728330494307e-06, "loss": 0.0797, "num_input_tokens_seen": 1903040, "step": 3870 }, { "epoch": 0.5114161277550482, "grad_norm": 18.808856964111328, "learning_rate": 1.9999693310776095e-06, "loss": 0.1566, "num_input_tokens_seen": 1905472, "step": 3875 }, { "epoch": 0.5120760195327966, "grad_norm": 0.6288896203041077, "learning_rate": 1.9999656168695387e-06, "loss": 0.0922, "num_input_tokens_seen": 1907712, "step": 3880 }, { "epoch": 0.5127359113105451, "grad_norm": 0.032050516456365585, "learning_rate": 1.9999616904260072e-06, "loss": 0.001, "num_input_tokens_seen": 1910080, "step": 3885 }, { "epoch": 0.5133958030882935, "grad_norm": 38.96467971801758, "learning_rate": 1.9999575517478477e-06, "loss": 0.2619, "num_input_tokens_seen": 1912512, "step": 3890 }, { "epoch": 0.514055694866042, "grad_norm": 0.0541483499109745, "learning_rate": 1.9999532008359393e-06, "loss": 0.1074, "num_input_tokens_seen": 1914752, "step": 3895 }, { "epoch": 0.5147155866437905, "grad_norm": 0.31871920824050903, "learning_rate": 1.999948637691205e-06, "loss": 0.002, "num_input_tokens_seen": 1917120, "step": 3900 }, { "epoch": 0.5153754784215389, "grad_norm": 118.97882080078125, "learning_rate": 1.9999438623146132e-06, "loss": 0.0127, "num_input_tokens_seen": 1919232, "step": 3905 }, { "epoch": 0.5160353701992874, "grad_norm": 0.19287319481372833, "learning_rate": 1.999938874707178e-06, "loss": 0.0941, "num_input_tokens_seen": 1921856, "step": 3910 }, { "epoch": 0.5166952619770357, "grad_norm": 40.96875762939453, "learning_rate": 1.9999336748699576e-06, "loss": 0.1961, "num_input_tokens_seen": 1924224, "step": 3915 }, { "epoch": 0.5173551537547842, "grad_norm": 0.026069777086377144, "learning_rate": 1.9999282628040553e-06, "loss": 0.0313, "num_input_tokens_seen": 1926464, "step": 3920 }, { "epoch": 0.5180150455325326, "grad_norm": 0.15689440071582794, "learning_rate": 1.9999226385106205e-06, "loss": 0.3644, "num_input_tokens_seen": 1928896, "step": 3925 }, { "epoch": 0.5186749373102811, "grad_norm": 70.96661376953125, "learning_rate": 1.9999168019908464e-06, "loss": 0.1085, "num_input_tokens_seen": 1931200, "step": 3930 }, { "epoch": 0.5193348290880295, "grad_norm": 0.10924938321113586, "learning_rate": 1.9999107532459716e-06, "loss": 0.1389, "num_input_tokens_seen": 1933632, "step": 3935 }, { "epoch": 0.519994720865778, "grad_norm": 0.22806203365325928, "learning_rate": 1.9999044922772808e-06, "loss": 0.0024, "num_input_tokens_seen": 1935872, "step": 3940 }, { "epoch": 0.5206546126435264, "grad_norm": 0.1909378170967102, "learning_rate": 1.999898019086102e-06, "loss": 0.0619, "num_input_tokens_seen": 1938048, "step": 3945 }, { "epoch": 0.5213145044212749, "grad_norm": 97.45793151855469, "learning_rate": 1.999891333673809e-06, "loss": 0.0606, "num_input_tokens_seen": 1940608, "step": 3950 }, { "epoch": 0.5219743961990233, "grad_norm": 131.9565887451172, "learning_rate": 1.999884436041822e-06, "loss": 0.3859, "num_input_tokens_seen": 1943040, "step": 3955 }, { "epoch": 0.5226342879767718, "grad_norm": 0.3030645251274109, "learning_rate": 1.999877326191603e-06, "loss": 0.1834, "num_input_tokens_seen": 1945664, "step": 3960 }, { "epoch": 0.5232941797545203, "grad_norm": 0.06720297783613205, "learning_rate": 1.9998700041246626e-06, "loss": 0.0053, "num_input_tokens_seen": 1948032, "step": 3965 }, { "epoch": 0.5239540715322687, "grad_norm": 0.10676462203264236, "learning_rate": 1.9998624698425545e-06, "loss": 0.0008, "num_input_tokens_seen": 1950208, "step": 3970 }, { "epoch": 0.5246139633100172, "grad_norm": 0.34879347681999207, "learning_rate": 1.999854723346877e-06, "loss": 0.1693, "num_input_tokens_seen": 1952896, "step": 3975 }, { "epoch": 0.5252738550877656, "grad_norm": 0.12043995410203934, "learning_rate": 1.999846764639275e-06, "loss": 0.0881, "num_input_tokens_seen": 1955200, "step": 3980 }, { "epoch": 0.5259337468655141, "grad_norm": 0.03420906886458397, "learning_rate": 1.999838593721438e-06, "loss": 0.0753, "num_input_tokens_seen": 1957376, "step": 3985 }, { "epoch": 0.5265936386432625, "grad_norm": 0.09042178839445114, "learning_rate": 1.999830210595099e-06, "loss": 0.2321, "num_input_tokens_seen": 1960064, "step": 3990 }, { "epoch": 0.527253530421011, "grad_norm": 0.13190309703350067, "learning_rate": 1.999821615262039e-06, "loss": 0.1365, "num_input_tokens_seen": 1962624, "step": 3995 }, { "epoch": 0.5279134221987594, "grad_norm": 0.33515259623527527, "learning_rate": 1.9998128077240805e-06, "loss": 0.0799, "num_input_tokens_seen": 1965056, "step": 4000 }, { "epoch": 0.5285733139765079, "grad_norm": 1.0748093128204346, "learning_rate": 1.9998037879830937e-06, "loss": 0.137, "num_input_tokens_seen": 1967424, "step": 4005 }, { "epoch": 0.5292332057542563, "grad_norm": 89.96631622314453, "learning_rate": 1.999794556040993e-06, "loss": 0.2433, "num_input_tokens_seen": 1969856, "step": 4010 }, { "epoch": 0.5298930975320048, "grad_norm": 0.29487359523773193, "learning_rate": 1.999785111899738e-06, "loss": 0.0609, "num_input_tokens_seen": 1972544, "step": 4015 }, { "epoch": 0.5305529893097533, "grad_norm": 0.09903884679079056, "learning_rate": 1.9997754555613324e-06, "loss": 0.0675, "num_input_tokens_seen": 1974912, "step": 4020 }, { "epoch": 0.5312128810875016, "grad_norm": 21.907075881958008, "learning_rate": 1.999765587027827e-06, "loss": 0.2871, "num_input_tokens_seen": 1977216, "step": 4025 }, { "epoch": 0.5318727728652501, "grad_norm": 30.0252742767334, "learning_rate": 1.9997555063013147e-06, "loss": 0.2346, "num_input_tokens_seen": 1979648, "step": 4030 }, { "epoch": 0.5325326646429985, "grad_norm": 84.9073257446289, "learning_rate": 1.999745213383936e-06, "loss": 0.013, "num_input_tokens_seen": 1981888, "step": 4035 }, { "epoch": 0.533192556420747, "grad_norm": 86.11760711669922, "learning_rate": 1.9997347082778753e-06, "loss": 0.1859, "num_input_tokens_seen": 1984448, "step": 4040 }, { "epoch": 0.5338524481984954, "grad_norm": 45.54768753051758, "learning_rate": 1.999723990985363e-06, "loss": 0.3186, "num_input_tokens_seen": 1987072, "step": 4045 }, { "epoch": 0.5345123399762439, "grad_norm": 0.8239102363586426, "learning_rate": 1.999713061508672e-06, "loss": 0.1181, "num_input_tokens_seen": 1989632, "step": 4050 }, { "epoch": 0.5351722317539923, "grad_norm": 0.6783844232559204, "learning_rate": 1.9997019198501233e-06, "loss": 0.0538, "num_input_tokens_seen": 1992192, "step": 4055 }, { "epoch": 0.5358321235317408, "grad_norm": 33.974369049072266, "learning_rate": 1.999690566012082e-06, "loss": 0.265, "num_input_tokens_seen": 1994624, "step": 4060 }, { "epoch": 0.5364920153094892, "grad_norm": 17.405261993408203, "learning_rate": 1.9996789999969568e-06, "loss": 0.0607, "num_input_tokens_seen": 1997056, "step": 4065 }, { "epoch": 0.5371519070872377, "grad_norm": 2.6141507625579834, "learning_rate": 1.999667221807203e-06, "loss": 0.134, "num_input_tokens_seen": 1999360, "step": 4070 }, { "epoch": 0.5378117988649861, "grad_norm": 58.6915397644043, "learning_rate": 1.9996552314453204e-06, "loss": 0.082, "num_input_tokens_seen": 2001856, "step": 4075 }, { "epoch": 0.5384716906427346, "grad_norm": 51.96027755737305, "learning_rate": 1.999643028913854e-06, "loss": 0.1956, "num_input_tokens_seen": 2004288, "step": 4080 }, { "epoch": 0.5391315824204831, "grad_norm": 80.61639404296875, "learning_rate": 1.9996306142153935e-06, "loss": 0.1135, "num_input_tokens_seen": 2006976, "step": 4085 }, { "epoch": 0.5397914741982315, "grad_norm": 4.0380539894104, "learning_rate": 1.9996179873525737e-06, "loss": 0.0367, "num_input_tokens_seen": 2009280, "step": 4090 }, { "epoch": 0.54045136597598, "grad_norm": 189.69859313964844, "learning_rate": 1.9996051483280744e-06, "loss": 0.2253, "num_input_tokens_seen": 2011776, "step": 4095 }, { "epoch": 0.5411112577537284, "grad_norm": 47.396934509277344, "learning_rate": 1.9995920971446215e-06, "loss": 0.4021, "num_input_tokens_seen": 2014336, "step": 4100 }, { "epoch": 0.5417711495314769, "grad_norm": 31.230287551879883, "learning_rate": 1.9995788338049846e-06, "loss": 0.1495, "num_input_tokens_seen": 2017152, "step": 4105 }, { "epoch": 0.5424310413092253, "grad_norm": 26.340003967285156, "learning_rate": 1.999565358311978e-06, "loss": 0.1161, "num_input_tokens_seen": 2019520, "step": 4110 }, { "epoch": 0.5430909330869738, "grad_norm": 0.42596912384033203, "learning_rate": 1.999551670668463e-06, "loss": 0.1655, "num_input_tokens_seen": 2021632, "step": 4115 }, { "epoch": 0.5437508248647221, "grad_norm": 41.248680114746094, "learning_rate": 1.9995377708773437e-06, "loss": 0.1365, "num_input_tokens_seen": 2023744, "step": 4120 }, { "epoch": 0.5444107166424706, "grad_norm": 0.6861774921417236, "learning_rate": 1.999523658941571e-06, "loss": 0.0072, "num_input_tokens_seen": 2026048, "step": 4125 }, { "epoch": 0.545070608420219, "grad_norm": 0.05741658806800842, "learning_rate": 1.999509334864139e-06, "loss": 0.0905, "num_input_tokens_seen": 2028352, "step": 4130 }, { "epoch": 0.5457305001979675, "grad_norm": 0.12205217778682709, "learning_rate": 1.999494798648089e-06, "loss": 0.0448, "num_input_tokens_seen": 2030848, "step": 4135 }, { "epoch": 0.5463903919757159, "grad_norm": 0.046685557812452316, "learning_rate": 1.9994800502965055e-06, "loss": 0.1427, "num_input_tokens_seen": 2033344, "step": 4140 }, { "epoch": 0.5470502837534644, "grad_norm": 1.0870046615600586, "learning_rate": 1.9994650898125193e-06, "loss": 0.0661, "num_input_tokens_seen": 2035584, "step": 4145 }, { "epoch": 0.5477101755312129, "grad_norm": 23.627378463745117, "learning_rate": 1.9994499171993056e-06, "loss": 0.0643, "num_input_tokens_seen": 2038272, "step": 4150 }, { "epoch": 0.5483700673089613, "grad_norm": 19.256277084350586, "learning_rate": 1.999434532460084e-06, "loss": 0.1879, "num_input_tokens_seen": 2040768, "step": 4155 }, { "epoch": 0.5490299590867098, "grad_norm": 27.206546783447266, "learning_rate": 1.99941893559812e-06, "loss": 0.1577, "num_input_tokens_seen": 2043072, "step": 4160 }, { "epoch": 0.5496898508644582, "grad_norm": 14.51369571685791, "learning_rate": 1.9994031266167247e-06, "loss": 0.1324, "num_input_tokens_seen": 2045824, "step": 4165 }, { "epoch": 0.5503497426422067, "grad_norm": 26.757734298706055, "learning_rate": 1.999387105519253e-06, "loss": 0.366, "num_input_tokens_seen": 2048064, "step": 4170 }, { "epoch": 0.5510096344199551, "grad_norm": 0.8686099648475647, "learning_rate": 1.9993708723091044e-06, "loss": 0.0539, "num_input_tokens_seen": 2050432, "step": 4175 }, { "epoch": 0.5516695261977036, "grad_norm": 1.9059375524520874, "learning_rate": 1.9993544269897253e-06, "loss": 0.0967, "num_input_tokens_seen": 2052928, "step": 4180 }, { "epoch": 0.552329417975452, "grad_norm": 1.612260341644287, "learning_rate": 1.999337769564606e-06, "loss": 0.1474, "num_input_tokens_seen": 2055424, "step": 4185 }, { "epoch": 0.5529893097532005, "grad_norm": 1.3080824613571167, "learning_rate": 1.9993209000372814e-06, "loss": 0.121, "num_input_tokens_seen": 2057536, "step": 4190 }, { "epoch": 0.5536492015309489, "grad_norm": 45.95222854614258, "learning_rate": 1.9993038184113325e-06, "loss": 0.2545, "num_input_tokens_seen": 2059840, "step": 4195 }, { "epoch": 0.5543090933086974, "grad_norm": 35.62221908569336, "learning_rate": 1.999286524690385e-06, "loss": 0.2414, "num_input_tokens_seen": 2062656, "step": 4200 }, { "epoch": 0.5549689850864459, "grad_norm": 1.8063488006591797, "learning_rate": 1.999269018878108e-06, "loss": 0.2045, "num_input_tokens_seen": 2064960, "step": 4205 }, { "epoch": 0.5556288768641943, "grad_norm": 18.830642700195312, "learning_rate": 1.999251300978219e-06, "loss": 0.233, "num_input_tokens_seen": 2067008, "step": 4210 }, { "epoch": 0.5562887686419428, "grad_norm": 0.42927706241607666, "learning_rate": 1.9992333709944764e-06, "loss": 0.0333, "num_input_tokens_seen": 2069696, "step": 4215 }, { "epoch": 0.5569486604196912, "grad_norm": 10.923612594604492, "learning_rate": 1.9992152289306872e-06, "loss": 0.0229, "num_input_tokens_seen": 2072320, "step": 4220 }, { "epoch": 0.5576085521974397, "grad_norm": 0.6108276844024658, "learning_rate": 1.999196874790701e-06, "loss": 0.005, "num_input_tokens_seen": 2074752, "step": 4225 }, { "epoch": 0.558268443975188, "grad_norm": 13.711141586303711, "learning_rate": 1.999178308578414e-06, "loss": 0.1422, "num_input_tokens_seen": 2077440, "step": 4230 }, { "epoch": 0.5589283357529365, "grad_norm": 70.51332092285156, "learning_rate": 1.9991595302977666e-06, "loss": 0.2342, "num_input_tokens_seen": 2080000, "step": 4235 }, { "epoch": 0.5595882275306849, "grad_norm": 30.383102416992188, "learning_rate": 1.9991405399527438e-06, "loss": 0.196, "num_input_tokens_seen": 2082560, "step": 4240 }, { "epoch": 0.5602481193084334, "grad_norm": 18.101728439331055, "learning_rate": 1.999121337547377e-06, "loss": 0.1375, "num_input_tokens_seen": 2084864, "step": 4245 }, { "epoch": 0.5609080110861818, "grad_norm": 0.16135449707508087, "learning_rate": 1.9991019230857413e-06, "loss": 0.097, "num_input_tokens_seen": 2087424, "step": 4250 }, { "epoch": 0.5615679028639303, "grad_norm": 6.275109767913818, "learning_rate": 1.999082296571957e-06, "loss": 0.1572, "num_input_tokens_seen": 2090048, "step": 4255 }, { "epoch": 0.5622277946416787, "grad_norm": 0.1460818648338318, "learning_rate": 1.9990624580101907e-06, "loss": 0.0845, "num_input_tokens_seen": 2092416, "step": 4260 }, { "epoch": 0.5628876864194272, "grad_norm": 2.7357776165008545, "learning_rate": 1.999042407404652e-06, "loss": 0.0568, "num_input_tokens_seen": 2094656, "step": 4265 }, { "epoch": 0.5635475781971757, "grad_norm": 322.82025146484375, "learning_rate": 1.999022144759597e-06, "loss": 0.0462, "num_input_tokens_seen": 2097024, "step": 4270 }, { "epoch": 0.5642074699749241, "grad_norm": 3.075206995010376, "learning_rate": 1.9990016700793257e-06, "loss": 0.0643, "num_input_tokens_seen": 2099392, "step": 4275 }, { "epoch": 0.5648673617526726, "grad_norm": 1.3678243160247803, "learning_rate": 1.9989809833681845e-06, "loss": 0.0911, "num_input_tokens_seen": 2102016, "step": 4280 }, { "epoch": 0.565527253530421, "grad_norm": 34.296119689941406, "learning_rate": 1.9989600846305634e-06, "loss": 0.2019, "num_input_tokens_seen": 2104320, "step": 4285 }, { "epoch": 0.5661871453081695, "grad_norm": 43.207645416259766, "learning_rate": 1.9989389738708984e-06, "loss": 0.1077, "num_input_tokens_seen": 2107136, "step": 4290 }, { "epoch": 0.5668470370859179, "grad_norm": 1.0292057991027832, "learning_rate": 1.9989176510936698e-06, "loss": 0.0653, "num_input_tokens_seen": 2109888, "step": 4295 }, { "epoch": 0.5675069288636664, "grad_norm": 17.092741012573242, "learning_rate": 1.9988961163034033e-06, "loss": 0.1031, "num_input_tokens_seen": 2112192, "step": 4300 }, { "epoch": 0.5681668206414148, "grad_norm": 0.08173481374979019, "learning_rate": 1.9988743695046696e-06, "loss": 0.1154, "num_input_tokens_seen": 2114752, "step": 4305 }, { "epoch": 0.5688267124191633, "grad_norm": 0.012852237559854984, "learning_rate": 1.9988524107020844e-06, "loss": 0.0766, "num_input_tokens_seen": 2117184, "step": 4310 }, { "epoch": 0.5694866041969117, "grad_norm": 0.25723448395729065, "learning_rate": 1.9988302399003083e-06, "loss": 0.1522, "num_input_tokens_seen": 2119552, "step": 4315 }, { "epoch": 0.5701464959746602, "grad_norm": 138.39862060546875, "learning_rate": 1.9988078571040464e-06, "loss": 0.1321, "num_input_tokens_seen": 2121920, "step": 4320 }, { "epoch": 0.5708063877524086, "grad_norm": 12.506170272827148, "learning_rate": 1.99878526231805e-06, "loss": 0.2248, "num_input_tokens_seen": 2124096, "step": 4325 }, { "epoch": 0.571466279530157, "grad_norm": 27.999643325805664, "learning_rate": 1.998762455547114e-06, "loss": 0.1275, "num_input_tokens_seen": 2126528, "step": 4330 }, { "epoch": 0.5721261713079056, "grad_norm": 13.254135131835938, "learning_rate": 1.998739436796079e-06, "loss": 0.0773, "num_input_tokens_seen": 2128576, "step": 4335 }, { "epoch": 0.5727860630856539, "grad_norm": 190.8338165283203, "learning_rate": 1.9987162060698312e-06, "loss": 0.1947, "num_input_tokens_seen": 2130688, "step": 4340 }, { "epoch": 0.5734459548634024, "grad_norm": 15.912854194641113, "learning_rate": 1.9986927633733007e-06, "loss": 0.2038, "num_input_tokens_seen": 2133312, "step": 4345 }, { "epoch": 0.5741058466411508, "grad_norm": 0.5080829858779907, "learning_rate": 1.9986691087114634e-06, "loss": 0.1005, "num_input_tokens_seen": 2135680, "step": 4350 }, { "epoch": 0.5747657384188993, "grad_norm": 0.27579280734062195, "learning_rate": 1.9986452420893393e-06, "loss": 0.1931, "num_input_tokens_seen": 2138112, "step": 4355 }, { "epoch": 0.5754256301966477, "grad_norm": 25.686216354370117, "learning_rate": 1.998621163511994e-06, "loss": 0.3394, "num_input_tokens_seen": 2140352, "step": 4360 }, { "epoch": 0.5760855219743962, "grad_norm": 0.8148085474967957, "learning_rate": 1.998596872984539e-06, "loss": 0.1082, "num_input_tokens_seen": 2143040, "step": 4365 }, { "epoch": 0.5767454137521446, "grad_norm": 0.7620784044265747, "learning_rate": 1.998572370512128e-06, "loss": 0.0335, "num_input_tokens_seen": 2145280, "step": 4370 }, { "epoch": 0.5774053055298931, "grad_norm": 1.3707529306411743, "learning_rate": 1.998547656099963e-06, "loss": 0.0719, "num_input_tokens_seen": 2147904, "step": 4375 }, { "epoch": 0.5780651973076415, "grad_norm": 0.11669503152370453, "learning_rate": 1.9985227297532886e-06, "loss": 0.0904, "num_input_tokens_seen": 2150400, "step": 4380 }, { "epoch": 0.57872508908539, "grad_norm": 1.8291724920272827, "learning_rate": 1.9984975914773957e-06, "loss": 0.1622, "num_input_tokens_seen": 2153088, "step": 4385 }, { "epoch": 0.5793849808631385, "grad_norm": 0.11948619037866592, "learning_rate": 1.9984722412776197e-06, "loss": 0.0055, "num_input_tokens_seen": 2155776, "step": 4390 }, { "epoch": 0.5800448726408869, "grad_norm": 58.45433044433594, "learning_rate": 1.9984466791593407e-06, "loss": 0.2532, "num_input_tokens_seen": 2158400, "step": 4395 }, { "epoch": 0.5807047644186354, "grad_norm": 0.1013028621673584, "learning_rate": 1.9984209051279843e-06, "loss": 0.0378, "num_input_tokens_seen": 2160704, "step": 4400 }, { "epoch": 0.5813646561963838, "grad_norm": 53.731590270996094, "learning_rate": 1.998394919189021e-06, "loss": 0.0141, "num_input_tokens_seen": 2163200, "step": 4405 }, { "epoch": 0.5820245479741323, "grad_norm": 87.15843200683594, "learning_rate": 1.9983687213479655e-06, "loss": 0.137, "num_input_tokens_seen": 2165376, "step": 4410 }, { "epoch": 0.5826844397518807, "grad_norm": 33.592220306396484, "learning_rate": 1.998342311610379e-06, "loss": 0.2062, "num_input_tokens_seen": 2167808, "step": 4415 }, { "epoch": 0.5833443315296292, "grad_norm": 50.75172424316406, "learning_rate": 1.998315689981866e-06, "loss": 0.225, "num_input_tokens_seen": 2170112, "step": 4420 }, { "epoch": 0.5840042233073776, "grad_norm": 0.11312510073184967, "learning_rate": 1.998288856468077e-06, "loss": 0.0063, "num_input_tokens_seen": 2172480, "step": 4425 }, { "epoch": 0.5846641150851261, "grad_norm": 41.83127975463867, "learning_rate": 1.998261811074707e-06, "loss": 0.1578, "num_input_tokens_seen": 2175104, "step": 4430 }, { "epoch": 0.5853240068628744, "grad_norm": 16.17295265197754, "learning_rate": 1.998234553807497e-06, "loss": 0.0695, "num_input_tokens_seen": 2177280, "step": 4435 }, { "epoch": 0.585983898640623, "grad_norm": 0.10577072203159332, "learning_rate": 1.9982070846722312e-06, "loss": 0.0882, "num_input_tokens_seen": 2179776, "step": 4440 }, { "epoch": 0.5866437904183713, "grad_norm": 1.0365639925003052, "learning_rate": 1.9981794036747402e-06, "loss": 0.1574, "num_input_tokens_seen": 2182400, "step": 4445 }, { "epoch": 0.5873036821961198, "grad_norm": 0.619874894618988, "learning_rate": 1.998151510820899e-06, "loss": 0.0313, "num_input_tokens_seen": 2185088, "step": 4450 }, { "epoch": 0.5879635739738683, "grad_norm": 1.185616135597229, "learning_rate": 1.9981234061166275e-06, "loss": 0.0876, "num_input_tokens_seen": 2187776, "step": 4455 }, { "epoch": 0.5886234657516167, "grad_norm": 225.3323211669922, "learning_rate": 1.9980950895678914e-06, "loss": 0.0183, "num_input_tokens_seen": 2190016, "step": 4460 }, { "epoch": 0.5892833575293652, "grad_norm": 91.41350555419922, "learning_rate": 1.9980665611806998e-06, "loss": 0.0676, "num_input_tokens_seen": 2192320, "step": 4465 }, { "epoch": 0.5899432493071136, "grad_norm": 31.638723373413086, "learning_rate": 1.998037820961108e-06, "loss": 0.1909, "num_input_tokens_seen": 2194752, "step": 4470 }, { "epoch": 0.5906031410848621, "grad_norm": 23.39573097229004, "learning_rate": 1.9980088689152163e-06, "loss": 0.1777, "num_input_tokens_seen": 2197056, "step": 4475 }, { "epoch": 0.5912630328626105, "grad_norm": 0.038047995418310165, "learning_rate": 1.9979797050491687e-06, "loss": 0.092, "num_input_tokens_seen": 2199296, "step": 4480 }, { "epoch": 0.591922924640359, "grad_norm": 47.337886810302734, "learning_rate": 1.997950329369156e-06, "loss": 0.3187, "num_input_tokens_seen": 2201664, "step": 4485 }, { "epoch": 0.5925828164181074, "grad_norm": 13.546995162963867, "learning_rate": 1.997920741881412e-06, "loss": 0.1067, "num_input_tokens_seen": 2204288, "step": 4490 }, { "epoch": 0.5932427081958559, "grad_norm": 15.969120979309082, "learning_rate": 1.997890942592217e-06, "loss": 0.1064, "num_input_tokens_seen": 2206528, "step": 4495 }, { "epoch": 0.5939025999736043, "grad_norm": 0.7014201283454895, "learning_rate": 1.997860931507896e-06, "loss": 0.1168, "num_input_tokens_seen": 2209024, "step": 4500 }, { "epoch": 0.5945624917513528, "grad_norm": 0.4199674129486084, "learning_rate": 1.997830708634818e-06, "loss": 0.0681, "num_input_tokens_seen": 2211584, "step": 4505 }, { "epoch": 0.5952223835291012, "grad_norm": 19.820323944091797, "learning_rate": 1.9978002739793977e-06, "loss": 0.1479, "num_input_tokens_seen": 2213952, "step": 4510 }, { "epoch": 0.5958822753068497, "grad_norm": 15.060903549194336, "learning_rate": 1.9977696275480945e-06, "loss": 0.1002, "num_input_tokens_seen": 2216192, "step": 4515 }, { "epoch": 0.5965421670845982, "grad_norm": 5.9868268966674805, "learning_rate": 1.9977387693474134e-06, "loss": 0.0057, "num_input_tokens_seen": 2218688, "step": 4520 }, { "epoch": 0.5972020588623466, "grad_norm": 0.29611310362815857, "learning_rate": 1.9977076993839037e-06, "loss": 0.0011, "num_input_tokens_seen": 2220928, "step": 4525 }, { "epoch": 0.5978619506400951, "grad_norm": 0.06432250887155533, "learning_rate": 1.9976764176641592e-06, "loss": 0.001, "num_input_tokens_seen": 2223360, "step": 4530 }, { "epoch": 0.5985218424178435, "grad_norm": 0.038721442222595215, "learning_rate": 1.99764492419482e-06, "loss": 0.1205, "num_input_tokens_seen": 2225792, "step": 4535 }, { "epoch": 0.599181734195592, "grad_norm": 65.97834014892578, "learning_rate": 1.99761321898257e-06, "loss": 0.2427, "num_input_tokens_seen": 2228288, "step": 4540 }, { "epoch": 0.5998416259733403, "grad_norm": 33.381248474121094, "learning_rate": 1.9975813020341387e-06, "loss": 0.2698, "num_input_tokens_seen": 2230848, "step": 4545 }, { "epoch": 0.6005015177510888, "grad_norm": 254.90545654296875, "learning_rate": 1.9975491733562997e-06, "loss": 0.2384, "num_input_tokens_seen": 2233472, "step": 4550 }, { "epoch": 0.6011614095288372, "grad_norm": 0.6209867000579834, "learning_rate": 1.9975168329558725e-06, "loss": 0.2191, "num_input_tokens_seen": 2236096, "step": 4555 }, { "epoch": 0.6018213013065857, "grad_norm": 0.45464786887168884, "learning_rate": 1.9974842808397206e-06, "loss": 0.1075, "num_input_tokens_seen": 2238720, "step": 4560 }, { "epoch": 0.6024811930843341, "grad_norm": 101.83075714111328, "learning_rate": 1.9974515170147533e-06, "loss": 0.1344, "num_input_tokens_seen": 2241216, "step": 4565 }, { "epoch": 0.6031410848620826, "grad_norm": 9.599241256713867, "learning_rate": 1.997418541487925e-06, "loss": 0.07, "num_input_tokens_seen": 2243648, "step": 4570 }, { "epoch": 0.6038009766398311, "grad_norm": 6.536127090454102, "learning_rate": 1.9973853542662336e-06, "loss": 0.1225, "num_input_tokens_seen": 2246080, "step": 4575 }, { "epoch": 0.6044608684175795, "grad_norm": 0.32332703471183777, "learning_rate": 1.9973519553567233e-06, "loss": 0.0581, "num_input_tokens_seen": 2248256, "step": 4580 }, { "epoch": 0.605120760195328, "grad_norm": 0.3811197280883789, "learning_rate": 1.9973183447664826e-06, "loss": 0.0503, "num_input_tokens_seen": 2250688, "step": 4585 }, { "epoch": 0.6057806519730764, "grad_norm": 15.983846664428711, "learning_rate": 1.9972845225026458e-06, "loss": 0.2459, "num_input_tokens_seen": 2253120, "step": 4590 }, { "epoch": 0.6064405437508249, "grad_norm": 15.699974060058594, "learning_rate": 1.99725048857239e-06, "loss": 0.2109, "num_input_tokens_seen": 2255360, "step": 4595 }, { "epoch": 0.6071004355285733, "grad_norm": 0.4910595118999481, "learning_rate": 1.99721624298294e-06, "loss": 0.1823, "num_input_tokens_seen": 2257728, "step": 4600 }, { "epoch": 0.6077603273063218, "grad_norm": 0.5166888236999512, "learning_rate": 1.997181785741564e-06, "loss": 0.0686, "num_input_tokens_seen": 2260224, "step": 4605 }, { "epoch": 0.6084202190840702, "grad_norm": 0.6000028848648071, "learning_rate": 1.9971471168555746e-06, "loss": 0.0076, "num_input_tokens_seen": 2262912, "step": 4610 }, { "epoch": 0.6090801108618187, "grad_norm": 15.50414752960205, "learning_rate": 1.9971122363323307e-06, "loss": 0.2299, "num_input_tokens_seen": 2265152, "step": 4615 }, { "epoch": 0.6097400026395671, "grad_norm": 0.13759158551692963, "learning_rate": 1.9970771441792347e-06, "loss": 0.0823, "num_input_tokens_seen": 2267968, "step": 4620 }, { "epoch": 0.6103998944173156, "grad_norm": 0.09345412999391556, "learning_rate": 1.997041840403735e-06, "loss": 0.1992, "num_input_tokens_seen": 2270336, "step": 4625 }, { "epoch": 0.611059786195064, "grad_norm": 12.803251266479492, "learning_rate": 1.997006325013325e-06, "loss": 0.2115, "num_input_tokens_seen": 2273024, "step": 4630 }, { "epoch": 0.6117196779728125, "grad_norm": 0.13515806198120117, "learning_rate": 1.9969705980155426e-06, "loss": 0.0794, "num_input_tokens_seen": 2275264, "step": 4635 }, { "epoch": 0.612379569750561, "grad_norm": 20.334821701049805, "learning_rate": 1.99693465941797e-06, "loss": 0.0776, "num_input_tokens_seen": 2277888, "step": 4640 }, { "epoch": 0.6130394615283093, "grad_norm": 0.2252114862203598, "learning_rate": 1.9968985092282354e-06, "loss": 0.002, "num_input_tokens_seen": 2280320, "step": 4645 }, { "epoch": 0.6136993533060578, "grad_norm": 132.18417358398438, "learning_rate": 1.996862147454011e-06, "loss": 0.0637, "num_input_tokens_seen": 2282560, "step": 4650 }, { "epoch": 0.6143592450838062, "grad_norm": 0.08274441957473755, "learning_rate": 1.9968255741030144e-06, "loss": 0.121, "num_input_tokens_seen": 2284864, "step": 4655 }, { "epoch": 0.6150191368615547, "grad_norm": 0.13016772270202637, "learning_rate": 1.9967887891830082e-06, "loss": 0.1595, "num_input_tokens_seen": 2287168, "step": 4660 }, { "epoch": 0.6156790286393031, "grad_norm": 1.3486446142196655, "learning_rate": 1.9967517927017995e-06, "loss": 0.0561, "num_input_tokens_seen": 2289600, "step": 4665 }, { "epoch": 0.6163389204170516, "grad_norm": 142.54844665527344, "learning_rate": 1.996714584667241e-06, "loss": 0.1015, "num_input_tokens_seen": 2292160, "step": 4670 }, { "epoch": 0.6169988121948, "grad_norm": 0.17757734656333923, "learning_rate": 1.9966771650872295e-06, "loss": 0.0621, "num_input_tokens_seen": 2294912, "step": 4675 }, { "epoch": 0.6176587039725485, "grad_norm": 0.3973993957042694, "learning_rate": 1.996639533969707e-06, "loss": 0.0647, "num_input_tokens_seen": 2297024, "step": 4680 }, { "epoch": 0.6183185957502969, "grad_norm": 12.3432035446167, "learning_rate": 1.9966016913226602e-06, "loss": 0.2015, "num_input_tokens_seen": 2299456, "step": 4685 }, { "epoch": 0.6189784875280454, "grad_norm": 0.01895112358033657, "learning_rate": 1.9965636371541217e-06, "loss": 0.062, "num_input_tokens_seen": 2301568, "step": 4690 }, { "epoch": 0.6196383793057938, "grad_norm": 119.7965316772461, "learning_rate": 1.9965253714721676e-06, "loss": 0.1759, "num_input_tokens_seen": 2303936, "step": 4695 }, { "epoch": 0.6202982710835423, "grad_norm": 12.308302879333496, "learning_rate": 1.99648689428492e-06, "loss": 0.1793, "num_input_tokens_seen": 2306176, "step": 4700 }, { "epoch": 0.6209581628612908, "grad_norm": 0.34794431924819946, "learning_rate": 1.9964482056005446e-06, "loss": 0.046, "num_input_tokens_seen": 2308736, "step": 4705 }, { "epoch": 0.6216180546390392, "grad_norm": 14.829720497131348, "learning_rate": 1.9964093054272534e-06, "loss": 0.2129, "num_input_tokens_seen": 2311104, "step": 4710 }, { "epoch": 0.6222779464167877, "grad_norm": 123.25777435302734, "learning_rate": 1.9963701937733024e-06, "loss": 0.1223, "num_input_tokens_seen": 2313536, "step": 4715 }, { "epoch": 0.6229378381945361, "grad_norm": 43.15277862548828, "learning_rate": 1.9963308706469932e-06, "loss": 0.1751, "num_input_tokens_seen": 2316032, "step": 4720 }, { "epoch": 0.6235977299722846, "grad_norm": 11.635313987731934, "learning_rate": 1.9962913360566713e-06, "loss": 0.2008, "num_input_tokens_seen": 2318656, "step": 4725 }, { "epoch": 0.624257621750033, "grad_norm": 14.797162055969238, "learning_rate": 1.9962515900107283e-06, "loss": 0.1295, "num_input_tokens_seen": 2321216, "step": 4730 }, { "epoch": 0.6249175135277815, "grad_norm": 1.9820233583450317, "learning_rate": 1.9962116325175993e-06, "loss": 0.1381, "num_input_tokens_seen": 2323648, "step": 4735 }, { "epoch": 0.6255774053055299, "grad_norm": 61.69966125488281, "learning_rate": 1.996171463585765e-06, "loss": 0.1807, "num_input_tokens_seen": 2326080, "step": 4740 }, { "epoch": 0.6262372970832784, "grad_norm": 14.098520278930664, "learning_rate": 1.996131083223752e-06, "loss": 0.097, "num_input_tokens_seen": 2328512, "step": 4745 }, { "epoch": 0.6268971888610267, "grad_norm": 2.896374464035034, "learning_rate": 1.9960904914401295e-06, "loss": 0.1358, "num_input_tokens_seen": 2331008, "step": 4750 }, { "epoch": 0.6275570806387752, "grad_norm": 67.75291442871094, "learning_rate": 1.9960496882435138e-06, "loss": 0.0575, "num_input_tokens_seen": 2333376, "step": 4755 }, { "epoch": 0.6282169724165237, "grad_norm": 37.26298141479492, "learning_rate": 1.996008673642564e-06, "loss": 0.2401, "num_input_tokens_seen": 2335872, "step": 4760 }, { "epoch": 0.6288768641942721, "grad_norm": 0.15088030695915222, "learning_rate": 1.995967447645986e-06, "loss": 0.0035, "num_input_tokens_seen": 2338432, "step": 4765 }, { "epoch": 0.6295367559720206, "grad_norm": 181.27249145507812, "learning_rate": 1.9959260102625293e-06, "loss": 0.2603, "num_input_tokens_seen": 2340928, "step": 4770 }, { "epoch": 0.630196647749769, "grad_norm": 0.40237560868263245, "learning_rate": 1.9958843615009892e-06, "loss": 0.1541, "num_input_tokens_seen": 2343680, "step": 4775 }, { "epoch": 0.6308565395275175, "grad_norm": 0.08449136465787888, "learning_rate": 1.995842501370205e-06, "loss": 0.062, "num_input_tokens_seen": 2346240, "step": 4780 }, { "epoch": 0.6315164313052659, "grad_norm": 0.08802822232246399, "learning_rate": 1.9958004298790607e-06, "loss": 0.1132, "num_input_tokens_seen": 2348544, "step": 4785 }, { "epoch": 0.6321763230830144, "grad_norm": 0.11865068972110748, "learning_rate": 1.9957581470364867e-06, "loss": 0.3771, "num_input_tokens_seen": 2350976, "step": 4790 }, { "epoch": 0.6328362148607628, "grad_norm": 40.807823181152344, "learning_rate": 1.9957156528514564e-06, "loss": 0.1463, "num_input_tokens_seen": 2353216, "step": 4795 }, { "epoch": 0.6334961066385113, "grad_norm": 0.09082300215959549, "learning_rate": 1.995672947332989e-06, "loss": 0.1893, "num_input_tokens_seen": 2355584, "step": 4800 }, { "epoch": 0.6341559984162597, "grad_norm": 0.1805894523859024, "learning_rate": 1.995630030490149e-06, "loss": 0.0834, "num_input_tokens_seen": 2358144, "step": 4805 }, { "epoch": 0.6348158901940082, "grad_norm": 47.16820526123047, "learning_rate": 1.9955869023320447e-06, "loss": 0.0498, "num_input_tokens_seen": 2360896, "step": 4810 }, { "epoch": 0.6354757819717566, "grad_norm": 0.7773832082748413, "learning_rate": 1.99554356286783e-06, "loss": 0.2227, "num_input_tokens_seen": 2363264, "step": 4815 }, { "epoch": 0.6361356737495051, "grad_norm": 13.574986457824707, "learning_rate": 1.9955000121067035e-06, "loss": 0.0659, "num_input_tokens_seen": 2365632, "step": 4820 }, { "epoch": 0.6367955655272536, "grad_norm": 0.08415788412094116, "learning_rate": 1.9954562500579075e-06, "loss": 0.0047, "num_input_tokens_seen": 2368000, "step": 4825 }, { "epoch": 0.637455457305002, "grad_norm": 119.38396453857422, "learning_rate": 1.9954122767307316e-06, "loss": 0.122, "num_input_tokens_seen": 2370560, "step": 4830 }, { "epoch": 0.6381153490827505, "grad_norm": 0.06524139642715454, "learning_rate": 1.995368092134508e-06, "loss": 0.0527, "num_input_tokens_seen": 2373120, "step": 4835 }, { "epoch": 0.6387752408604989, "grad_norm": 1.1225311756134033, "learning_rate": 1.9953236962786143e-06, "loss": 0.004, "num_input_tokens_seen": 2375872, "step": 4840 }, { "epoch": 0.6394351326382474, "grad_norm": 0.058993130922317505, "learning_rate": 1.995279089172474e-06, "loss": 0.2211, "num_input_tokens_seen": 2378432, "step": 4845 }, { "epoch": 0.6400950244159958, "grad_norm": 30.01612091064453, "learning_rate": 1.9952342708255543e-06, "loss": 0.1565, "num_input_tokens_seen": 2380800, "step": 4850 }, { "epoch": 0.6407549161937443, "grad_norm": 0.034984275698661804, "learning_rate": 1.9951892412473677e-06, "loss": 0.1074, "num_input_tokens_seen": 2383744, "step": 4855 }, { "epoch": 0.6414148079714926, "grad_norm": 13.939770698547363, "learning_rate": 1.9951440004474707e-06, "loss": 0.124, "num_input_tokens_seen": 2386112, "step": 4860 }, { "epoch": 0.6420746997492411, "grad_norm": 62.33798599243164, "learning_rate": 1.9950985484354664e-06, "loss": 0.1265, "num_input_tokens_seen": 2388736, "step": 4865 }, { "epoch": 0.6427345915269895, "grad_norm": 0.21689827740192413, "learning_rate": 1.9950528852210013e-06, "loss": 0.0334, "num_input_tokens_seen": 2391104, "step": 4870 }, { "epoch": 0.643394483304738, "grad_norm": 24.510175704956055, "learning_rate": 1.9950070108137663e-06, "loss": 0.1468, "num_input_tokens_seen": 2393728, "step": 4875 }, { "epoch": 0.6440543750824864, "grad_norm": 0.9391570091247559, "learning_rate": 1.9949609252234985e-06, "loss": 0.1562, "num_input_tokens_seen": 2396480, "step": 4880 }, { "epoch": 0.6447142668602349, "grad_norm": 0.11992018669843674, "learning_rate": 1.9949146284599794e-06, "loss": 0.1169, "num_input_tokens_seen": 2399104, "step": 4885 }, { "epoch": 0.6453741586379834, "grad_norm": 54.8850212097168, "learning_rate": 1.9948681205330354e-06, "loss": 0.096, "num_input_tokens_seen": 2401664, "step": 4890 }, { "epoch": 0.6460340504157318, "grad_norm": 27.9152889251709, "learning_rate": 1.994821401452537e-06, "loss": 0.1063, "num_input_tokens_seen": 2404160, "step": 4895 }, { "epoch": 0.6466939421934803, "grad_norm": 0.24877403676509857, "learning_rate": 1.9947744712283997e-06, "loss": 0.0837, "num_input_tokens_seen": 2406592, "step": 4900 }, { "epoch": 0.6473538339712287, "grad_norm": 0.04175091162323952, "learning_rate": 1.9947273298705848e-06, "loss": 0.0537, "num_input_tokens_seen": 2409088, "step": 4905 }, { "epoch": 0.6480137257489772, "grad_norm": 0.4560162425041199, "learning_rate": 1.994679977389097e-06, "loss": 0.173, "num_input_tokens_seen": 2411584, "step": 4910 }, { "epoch": 0.6486736175267256, "grad_norm": 0.3465970754623413, "learning_rate": 1.9946324137939876e-06, "loss": 0.3713, "num_input_tokens_seen": 2414400, "step": 4915 }, { "epoch": 0.6493335093044741, "grad_norm": 35.83467102050781, "learning_rate": 1.9945846390953503e-06, "loss": 0.22, "num_input_tokens_seen": 2416640, "step": 4920 }, { "epoch": 0.6499934010822225, "grad_norm": 18.50018310546875, "learning_rate": 1.994536653303326e-06, "loss": 0.1468, "num_input_tokens_seen": 2419136, "step": 4925 }, { "epoch": 0.650653292859971, "grad_norm": 0.6801294088363647, "learning_rate": 1.9944884564280987e-06, "loss": 0.0354, "num_input_tokens_seen": 2421440, "step": 4930 }, { "epoch": 0.6513131846377194, "grad_norm": 0.1738162487745285, "learning_rate": 1.994440048479898e-06, "loss": 0.0854, "num_input_tokens_seen": 2424000, "step": 4935 }, { "epoch": 0.6519730764154679, "grad_norm": 57.80926513671875, "learning_rate": 1.9943914294689984e-06, "loss": 0.0808, "num_input_tokens_seen": 2426240, "step": 4940 }, { "epoch": 0.6526329681932164, "grad_norm": 0.08340390771627426, "learning_rate": 1.9943425994057184e-06, "loss": 0.0641, "num_input_tokens_seen": 2428864, "step": 4945 }, { "epoch": 0.6532928599709648, "grad_norm": 0.4628424346446991, "learning_rate": 1.994293558300422e-06, "loss": 0.0188, "num_input_tokens_seen": 2431296, "step": 4950 }, { "epoch": 0.6539527517487133, "grad_norm": 0.0313633531332016, "learning_rate": 1.9942443061635183e-06, "loss": 0.0037, "num_input_tokens_seen": 2433984, "step": 4955 }, { "epoch": 0.6546126435264616, "grad_norm": 0.10239587724208832, "learning_rate": 1.9941948430054603e-06, "loss": 0.5298, "num_input_tokens_seen": 2436224, "step": 4960 }, { "epoch": 0.6552725353042101, "grad_norm": 0.33065155148506165, "learning_rate": 1.994145168836746e-06, "loss": 0.1602, "num_input_tokens_seen": 2438720, "step": 4965 }, { "epoch": 0.6559324270819585, "grad_norm": 0.1416037231683731, "learning_rate": 1.994095283667919e-06, "loss": 0.1132, "num_input_tokens_seen": 2440960, "step": 4970 }, { "epoch": 0.656592318859707, "grad_norm": 26.69266128540039, "learning_rate": 1.9940451875095666e-06, "loss": 0.0129, "num_input_tokens_seen": 2443328, "step": 4975 }, { "epoch": 0.6572522106374554, "grad_norm": 0.09884212166070938, "learning_rate": 1.9939948803723217e-06, "loss": 0.1357, "num_input_tokens_seen": 2445952, "step": 4980 }, { "epoch": 0.6579121024152039, "grad_norm": 0.06190844997763634, "learning_rate": 1.9939443622668614e-06, "loss": 0.0527, "num_input_tokens_seen": 2448576, "step": 4985 }, { "epoch": 0.6585719941929523, "grad_norm": 13.247187614440918, "learning_rate": 1.9938936332039073e-06, "loss": 0.3274, "num_input_tokens_seen": 2451136, "step": 4990 }, { "epoch": 0.6592318859707008, "grad_norm": 0.5872359871864319, "learning_rate": 1.993842693194227e-06, "loss": 0.0122, "num_input_tokens_seen": 2453632, "step": 4995 }, { "epoch": 0.6598917777484492, "grad_norm": 6.910613536834717, "learning_rate": 1.993791542248632e-06, "loss": 0.1135, "num_input_tokens_seen": 2456192, "step": 5000 }, { "epoch": 0.6605516695261977, "grad_norm": 0.10543259233236313, "learning_rate": 1.9937401803779784e-06, "loss": 0.0259, "num_input_tokens_seen": 2458624, "step": 5005 }, { "epoch": 0.6612115613039462, "grad_norm": 0.3452533185482025, "learning_rate": 1.9936886075931678e-06, "loss": 0.0829, "num_input_tokens_seen": 2460928, "step": 5010 }, { "epoch": 0.6618714530816946, "grad_norm": 3.1825904846191406, "learning_rate": 1.993636823905146e-06, "loss": 0.0726, "num_input_tokens_seen": 2463552, "step": 5015 }, { "epoch": 0.6625313448594431, "grad_norm": 75.93983459472656, "learning_rate": 1.9935848293249034e-06, "loss": 0.0881, "num_input_tokens_seen": 2465856, "step": 5020 }, { "epoch": 0.6631912366371915, "grad_norm": 0.03516853600740433, "learning_rate": 1.9935326238634763e-06, "loss": 0.044, "num_input_tokens_seen": 2468288, "step": 5025 }, { "epoch": 0.66385112841494, "grad_norm": 52.587249755859375, "learning_rate": 1.993480207531944e-06, "loss": 0.3045, "num_input_tokens_seen": 2470912, "step": 5030 }, { "epoch": 0.6645110201926884, "grad_norm": 0.8322742581367493, "learning_rate": 1.9934275803414317e-06, "loss": 0.1027, "num_input_tokens_seen": 2473536, "step": 5035 }, { "epoch": 0.6651709119704369, "grad_norm": 1.7283278703689575, "learning_rate": 1.99337474230311e-06, "loss": 0.0028, "num_input_tokens_seen": 2476032, "step": 5040 }, { "epoch": 0.6658308037481853, "grad_norm": 0.27299973368644714, "learning_rate": 1.993321693428192e-06, "loss": 0.2344, "num_input_tokens_seen": 2478208, "step": 5045 }, { "epoch": 0.6664906955259338, "grad_norm": 0.16945065557956696, "learning_rate": 1.9932684337279378e-06, "loss": 0.1178, "num_input_tokens_seen": 2480512, "step": 5050 }, { "epoch": 0.6671505873036822, "grad_norm": 0.08358097821474075, "learning_rate": 1.9932149632136514e-06, "loss": 0.2015, "num_input_tokens_seen": 2483008, "step": 5055 }, { "epoch": 0.6678104790814307, "grad_norm": 0.04497074335813522, "learning_rate": 1.9931612818966812e-06, "loss": 0.2345, "num_input_tokens_seen": 2485376, "step": 5060 }, { "epoch": 0.668470370859179, "grad_norm": 0.6962704658508301, "learning_rate": 1.993107389788421e-06, "loss": 0.0378, "num_input_tokens_seen": 2488064, "step": 5065 }, { "epoch": 0.6691302626369275, "grad_norm": 21.029521942138672, "learning_rate": 1.9930532869003085e-06, "loss": 0.0923, "num_input_tokens_seen": 2490624, "step": 5070 }, { "epoch": 0.669790154414676, "grad_norm": 24.72555923461914, "learning_rate": 1.992998973243827e-06, "loss": 0.4066, "num_input_tokens_seen": 2493248, "step": 5075 }, { "epoch": 0.6704500461924244, "grad_norm": 32.65169906616211, "learning_rate": 1.9929444488305047e-06, "loss": 0.1969, "num_input_tokens_seen": 2495744, "step": 5080 }, { "epoch": 0.6711099379701729, "grad_norm": 0.13829834759235382, "learning_rate": 1.992889713671913e-06, "loss": 0.0028, "num_input_tokens_seen": 2498176, "step": 5085 }, { "epoch": 0.6717698297479213, "grad_norm": 0.2096666544675827, "learning_rate": 1.99283476777967e-06, "loss": 0.0702, "num_input_tokens_seen": 2500416, "step": 5090 }, { "epoch": 0.6724297215256698, "grad_norm": 21.304018020629883, "learning_rate": 1.9927796111654366e-06, "loss": 0.1533, "num_input_tokens_seen": 2502848, "step": 5095 }, { "epoch": 0.6730896133034182, "grad_norm": 65.4073257446289, "learning_rate": 1.99272424384092e-06, "loss": 0.1575, "num_input_tokens_seen": 2505152, "step": 5100 }, { "epoch": 0.6737495050811667, "grad_norm": 51.36351776123047, "learning_rate": 1.992668665817871e-06, "loss": 0.1766, "num_input_tokens_seen": 2507648, "step": 5105 }, { "epoch": 0.6744093968589151, "grad_norm": 0.4049692153930664, "learning_rate": 1.9926128771080867e-06, "loss": 0.0742, "num_input_tokens_seen": 2510144, "step": 5110 }, { "epoch": 0.6750692886366636, "grad_norm": 0.48001331090927124, "learning_rate": 1.9925568777234067e-06, "loss": 0.2246, "num_input_tokens_seen": 2513024, "step": 5115 }, { "epoch": 0.675729180414412, "grad_norm": 0.321492999792099, "learning_rate": 1.992500667675717e-06, "loss": 0.0633, "num_input_tokens_seen": 2515072, "step": 5120 }, { "epoch": 0.6763890721921605, "grad_norm": 5.6695990562438965, "learning_rate": 1.992444246976948e-06, "loss": 0.0047, "num_input_tokens_seen": 2517376, "step": 5125 }, { "epoch": 0.677048963969909, "grad_norm": 0.6470169425010681, "learning_rate": 1.9923876156390743e-06, "loss": 0.0688, "num_input_tokens_seen": 2520064, "step": 5130 }, { "epoch": 0.6777088557476574, "grad_norm": 2.185170888900757, "learning_rate": 1.992330773674115e-06, "loss": 0.0592, "num_input_tokens_seen": 2522688, "step": 5135 }, { "epoch": 0.6783687475254059, "grad_norm": 0.08854498714208603, "learning_rate": 1.9922737210941353e-06, "loss": 0.0356, "num_input_tokens_seen": 2525184, "step": 5140 }, { "epoch": 0.6790286393031543, "grad_norm": 0.012564142234623432, "learning_rate": 1.9922164579112436e-06, "loss": 0.0004, "num_input_tokens_seen": 2527552, "step": 5145 }, { "epoch": 0.6796885310809028, "grad_norm": 68.29488372802734, "learning_rate": 1.9921589841375938e-06, "loss": 0.0067, "num_input_tokens_seen": 2530240, "step": 5150 }, { "epoch": 0.6803484228586512, "grad_norm": 199.47763061523438, "learning_rate": 1.9921012997853843e-06, "loss": 0.0581, "num_input_tokens_seen": 2532480, "step": 5155 }, { "epoch": 0.6810083146363997, "grad_norm": 0.01096346229314804, "learning_rate": 1.9920434048668582e-06, "loss": 0.0488, "num_input_tokens_seen": 2534912, "step": 5160 }, { "epoch": 0.681668206414148, "grad_norm": 0.222233846783638, "learning_rate": 1.9919852993943035e-06, "loss": 0.1032, "num_input_tokens_seen": 2537408, "step": 5165 }, { "epoch": 0.6823280981918965, "grad_norm": 0.04326486587524414, "learning_rate": 1.991926983380052e-06, "loss": 0.3101, "num_input_tokens_seen": 2539776, "step": 5170 }, { "epoch": 0.6829879899696449, "grad_norm": 0.02621627412736416, "learning_rate": 1.9918684568364813e-06, "loss": 0.0739, "num_input_tokens_seen": 2542208, "step": 5175 }, { "epoch": 0.6836478817473934, "grad_norm": 0.6199452877044678, "learning_rate": 1.9918097197760134e-06, "loss": 0.0663, "num_input_tokens_seen": 2544704, "step": 5180 }, { "epoch": 0.6843077735251418, "grad_norm": 0.917776882648468, "learning_rate": 1.9917507722111144e-06, "loss": 0.1005, "num_input_tokens_seen": 2547072, "step": 5185 }, { "epoch": 0.6849676653028903, "grad_norm": 0.5460265874862671, "learning_rate": 1.9916916141542957e-06, "loss": 0.1848, "num_input_tokens_seen": 2549440, "step": 5190 }, { "epoch": 0.6856275570806388, "grad_norm": 11.587053298950195, "learning_rate": 1.991632245618113e-06, "loss": 0.4134, "num_input_tokens_seen": 2552128, "step": 5195 }, { "epoch": 0.6862874488583872, "grad_norm": 0.13508452475070953, "learning_rate": 1.9915726666151673e-06, "loss": 0.0699, "num_input_tokens_seen": 2554368, "step": 5200 }, { "epoch": 0.6869473406361357, "grad_norm": 0.03892385959625244, "learning_rate": 1.9915128771581033e-06, "loss": 0.1567, "num_input_tokens_seen": 2556928, "step": 5205 }, { "epoch": 0.6876072324138841, "grad_norm": 12.882155418395996, "learning_rate": 1.9914528772596113e-06, "loss": 0.17, "num_input_tokens_seen": 2559360, "step": 5210 }, { "epoch": 0.6882671241916326, "grad_norm": 0.14622516930103302, "learning_rate": 1.9913926669324253e-06, "loss": 0.2589, "num_input_tokens_seen": 2561856, "step": 5215 }, { "epoch": 0.688927015969381, "grad_norm": 26.672409057617188, "learning_rate": 1.991332246189325e-06, "loss": 0.2197, "num_input_tokens_seen": 2564352, "step": 5220 }, { "epoch": 0.6895869077471295, "grad_norm": 0.45475801825523376, "learning_rate": 1.9912716150431343e-06, "loss": 0.0444, "num_input_tokens_seen": 2566784, "step": 5225 }, { "epoch": 0.6902467995248779, "grad_norm": 32.12901306152344, "learning_rate": 1.9912107735067215e-06, "loss": 0.1357, "num_input_tokens_seen": 2569152, "step": 5230 }, { "epoch": 0.6909066913026264, "grad_norm": 0.20969435572624207, "learning_rate": 1.991149721593e-06, "loss": 0.0881, "num_input_tokens_seen": 2571520, "step": 5235 }, { "epoch": 0.6915665830803748, "grad_norm": 1.3807674646377563, "learning_rate": 1.991088459314927e-06, "loss": 0.0346, "num_input_tokens_seen": 2574080, "step": 5240 }, { "epoch": 0.6922264748581233, "grad_norm": 0.21307876706123352, "learning_rate": 1.991026986685506e-06, "loss": 0.0699, "num_input_tokens_seen": 2576320, "step": 5245 }, { "epoch": 0.6928863666358717, "grad_norm": 0.09592917561531067, "learning_rate": 1.9909653037177826e-06, "loss": 0.073, "num_input_tokens_seen": 2578752, "step": 5250 }, { "epoch": 0.6935462584136202, "grad_norm": 24.82294464111328, "learning_rate": 1.9909034104248503e-06, "loss": 0.0697, "num_input_tokens_seen": 2581184, "step": 5255 }, { "epoch": 0.6942061501913687, "grad_norm": 2.019487142562866, "learning_rate": 1.9908413068198442e-06, "loss": 0.1952, "num_input_tokens_seen": 2583872, "step": 5260 }, { "epoch": 0.694866041969117, "grad_norm": 0.2514612078666687, "learning_rate": 1.990778992915946e-06, "loss": 0.2296, "num_input_tokens_seen": 2586304, "step": 5265 }, { "epoch": 0.6955259337468656, "grad_norm": 0.08101149648427963, "learning_rate": 1.990716468726381e-06, "loss": 0.1202, "num_input_tokens_seen": 2589056, "step": 5270 }, { "epoch": 0.6961858255246139, "grad_norm": 11.269828796386719, "learning_rate": 1.9906537342644203e-06, "loss": 0.1517, "num_input_tokens_seen": 2591488, "step": 5275 }, { "epoch": 0.6968457173023624, "grad_norm": 1.0551068782806396, "learning_rate": 1.990590789543378e-06, "loss": 0.0183, "num_input_tokens_seen": 2593792, "step": 5280 }, { "epoch": 0.6975056090801108, "grad_norm": 0.9069265127182007, "learning_rate": 1.9905276345766134e-06, "loss": 0.2654, "num_input_tokens_seen": 2595968, "step": 5285 }, { "epoch": 0.6981655008578593, "grad_norm": 14.611425399780273, "learning_rate": 1.990464269377532e-06, "loss": 0.2556, "num_input_tokens_seen": 2598144, "step": 5290 }, { "epoch": 0.6988253926356077, "grad_norm": 0.7366898655891418, "learning_rate": 1.9904006939595815e-06, "loss": 0.0923, "num_input_tokens_seen": 2600448, "step": 5295 }, { "epoch": 0.6994852844133562, "grad_norm": 0.2763810157775879, "learning_rate": 1.9903369083362554e-06, "loss": 0.0051, "num_input_tokens_seen": 2602944, "step": 5300 }, { "epoch": 0.7001451761911046, "grad_norm": 82.81452178955078, "learning_rate": 1.990272912521092e-06, "loss": 0.1914, "num_input_tokens_seen": 2605120, "step": 5305 }, { "epoch": 0.7008050679688531, "grad_norm": 27.29536247253418, "learning_rate": 1.990208706527674e-06, "loss": 0.0594, "num_input_tokens_seen": 2607296, "step": 5310 }, { "epoch": 0.7014649597466015, "grad_norm": 0.053431663662195206, "learning_rate": 1.9901442903696284e-06, "loss": 0.0578, "num_input_tokens_seen": 2609728, "step": 5315 }, { "epoch": 0.70212485152435, "grad_norm": 14.237296104431152, "learning_rate": 1.990079664060628e-06, "loss": 0.2463, "num_input_tokens_seen": 2612224, "step": 5320 }, { "epoch": 0.7027847433020985, "grad_norm": 48.734046936035156, "learning_rate": 1.9900148276143874e-06, "loss": 0.0071, "num_input_tokens_seen": 2614720, "step": 5325 }, { "epoch": 0.7034446350798469, "grad_norm": 0.6822793483734131, "learning_rate": 1.9899497810446694e-06, "loss": 0.3149, "num_input_tokens_seen": 2617344, "step": 5330 }, { "epoch": 0.7041045268575954, "grad_norm": 17.468055725097656, "learning_rate": 1.989884524365279e-06, "loss": 0.1851, "num_input_tokens_seen": 2619584, "step": 5335 }, { "epoch": 0.7047644186353438, "grad_norm": 0.7687005400657654, "learning_rate": 1.9898190575900664e-06, "loss": 0.0551, "num_input_tokens_seen": 2621888, "step": 5340 }, { "epoch": 0.7054243104130923, "grad_norm": 0.17701426148414612, "learning_rate": 1.9897533807329265e-06, "loss": 0.1479, "num_input_tokens_seen": 2624512, "step": 5345 }, { "epoch": 0.7060842021908407, "grad_norm": 0.30073249340057373, "learning_rate": 1.989687493807799e-06, "loss": 0.0035, "num_input_tokens_seen": 2627008, "step": 5350 }, { "epoch": 0.7067440939685892, "grad_norm": 0.2257729321718216, "learning_rate": 1.9896213968286672e-06, "loss": 0.0406, "num_input_tokens_seen": 2629440, "step": 5355 }, { "epoch": 0.7074039857463376, "grad_norm": 0.09596077352762222, "learning_rate": 1.9895550898095606e-06, "loss": 0.1103, "num_input_tokens_seen": 2631872, "step": 5360 }, { "epoch": 0.7080638775240861, "grad_norm": 0.2598581910133362, "learning_rate": 1.9894885727645516e-06, "loss": 0.1771, "num_input_tokens_seen": 2634560, "step": 5365 }, { "epoch": 0.7087237693018344, "grad_norm": 16.17540168762207, "learning_rate": 1.989421845707759e-06, "loss": 0.2135, "num_input_tokens_seen": 2637120, "step": 5370 }, { "epoch": 0.709383661079583, "grad_norm": 0.3493006229400635, "learning_rate": 1.989354908653344e-06, "loss": 0.0021, "num_input_tokens_seen": 2639552, "step": 5375 }, { "epoch": 0.7100435528573315, "grad_norm": 92.4169692993164, "learning_rate": 1.989287761615514e-06, "loss": 0.1498, "num_input_tokens_seen": 2641984, "step": 5380 }, { "epoch": 0.7107034446350798, "grad_norm": 12.568245887756348, "learning_rate": 1.9892204046085206e-06, "loss": 0.0816, "num_input_tokens_seen": 2644352, "step": 5385 }, { "epoch": 0.7113633364128283, "grad_norm": 0.05369238555431366, "learning_rate": 1.98915283764666e-06, "loss": 0.0261, "num_input_tokens_seen": 2647040, "step": 5390 }, { "epoch": 0.7120232281905767, "grad_norm": 0.1250723898410797, "learning_rate": 1.989085060744272e-06, "loss": 0.0705, "num_input_tokens_seen": 2649472, "step": 5395 }, { "epoch": 0.7126831199683252, "grad_norm": 1.4033637046813965, "learning_rate": 1.989017073915742e-06, "loss": 0.0431, "num_input_tokens_seen": 2651840, "step": 5400 }, { "epoch": 0.7133430117460736, "grad_norm": 61.13285446166992, "learning_rate": 1.9889488771755004e-06, "loss": 0.0093, "num_input_tokens_seen": 2654464, "step": 5405 }, { "epoch": 0.7140029035238221, "grad_norm": 0.004062811844050884, "learning_rate": 1.9888804705380207e-06, "loss": 0.1071, "num_input_tokens_seen": 2656576, "step": 5410 }, { "epoch": 0.7146627953015705, "grad_norm": 0.12063659727573395, "learning_rate": 1.9888118540178228e-06, "loss": 0.0828, "num_input_tokens_seen": 2659008, "step": 5415 }, { "epoch": 0.715322687079319, "grad_norm": 0.0075147757306694984, "learning_rate": 1.9887430276294688e-06, "loss": 0.0466, "num_input_tokens_seen": 2661632, "step": 5420 }, { "epoch": 0.7159825788570674, "grad_norm": 0.21719199419021606, "learning_rate": 1.9886739913875666e-06, "loss": 0.1611, "num_input_tokens_seen": 2664192, "step": 5425 }, { "epoch": 0.7166424706348159, "grad_norm": 130.8598175048828, "learning_rate": 1.98860474530677e-06, "loss": 0.0963, "num_input_tokens_seen": 2666624, "step": 5430 }, { "epoch": 0.7173023624125643, "grad_norm": 0.03334322199225426, "learning_rate": 1.9885352894017745e-06, "loss": 0.1402, "num_input_tokens_seen": 2669120, "step": 5435 }, { "epoch": 0.7179622541903128, "grad_norm": 4.289888381958008, "learning_rate": 1.9884656236873224e-06, "loss": 0.2358, "num_input_tokens_seen": 2671552, "step": 5440 }, { "epoch": 0.7186221459680613, "grad_norm": 13.685027122497559, "learning_rate": 1.9883957481781998e-06, "loss": 0.1333, "num_input_tokens_seen": 2674240, "step": 5445 }, { "epoch": 0.7192820377458097, "grad_norm": 0.42409083247184753, "learning_rate": 1.988325662889237e-06, "loss": 0.1131, "num_input_tokens_seen": 2676544, "step": 5450 }, { "epoch": 0.7199419295235582, "grad_norm": 0.199946328997612, "learning_rate": 1.988255367835309e-06, "loss": 0.0009, "num_input_tokens_seen": 2678912, "step": 5455 }, { "epoch": 0.7206018213013066, "grad_norm": 0.4616469442844391, "learning_rate": 1.9881848630313357e-06, "loss": 0.0309, "num_input_tokens_seen": 2681344, "step": 5460 }, { "epoch": 0.7212617130790551, "grad_norm": 130.4469757080078, "learning_rate": 1.988114148492281e-06, "loss": 0.0208, "num_input_tokens_seen": 2683776, "step": 5465 }, { "epoch": 0.7219216048568035, "grad_norm": 0.03411302715539932, "learning_rate": 1.9880432242331534e-06, "loss": 0.1115, "num_input_tokens_seen": 2686016, "step": 5470 }, { "epoch": 0.722581496634552, "grad_norm": 67.0157699584961, "learning_rate": 1.9879720902690067e-06, "loss": 0.1267, "num_input_tokens_seen": 2688128, "step": 5475 }, { "epoch": 0.7232413884123003, "grad_norm": 0.22230832278728485, "learning_rate": 1.987900746614938e-06, "loss": 0.2031, "num_input_tokens_seen": 2690368, "step": 5480 }, { "epoch": 0.7239012801900488, "grad_norm": 1.3174525499343872, "learning_rate": 1.98782919328609e-06, "loss": 0.0919, "num_input_tokens_seen": 2692992, "step": 5485 }, { "epoch": 0.7245611719677972, "grad_norm": 2.8719289302825928, "learning_rate": 1.9877574302976484e-06, "loss": 0.0433, "num_input_tokens_seen": 2695424, "step": 5490 }, { "epoch": 0.7252210637455457, "grad_norm": 12.99232292175293, "learning_rate": 1.987685457664845e-06, "loss": 0.1607, "num_input_tokens_seen": 2697856, "step": 5495 }, { "epoch": 0.7258809555232941, "grad_norm": 0.09089522063732147, "learning_rate": 1.987613275402956e-06, "loss": 0.0006, "num_input_tokens_seen": 2700608, "step": 5500 }, { "epoch": 0.7265408473010426, "grad_norm": 0.18377459049224854, "learning_rate": 1.9875408835273007e-06, "loss": 0.023, "num_input_tokens_seen": 2703104, "step": 5505 }, { "epoch": 0.7272007390787911, "grad_norm": 0.02759479358792305, "learning_rate": 1.9874682820532444e-06, "loss": 0.1917, "num_input_tokens_seen": 2705344, "step": 5510 }, { "epoch": 0.7278606308565395, "grad_norm": 1.4615646600723267, "learning_rate": 1.9873954709961956e-06, "loss": 0.0201, "num_input_tokens_seen": 2707520, "step": 5515 }, { "epoch": 0.728520522634288, "grad_norm": 13.904537200927734, "learning_rate": 1.987322450371608e-06, "loss": 0.1724, "num_input_tokens_seen": 2709888, "step": 5520 }, { "epoch": 0.7291804144120364, "grad_norm": 0.05545727536082268, "learning_rate": 1.9872492201949807e-06, "loss": 0.2705, "num_input_tokens_seen": 2712192, "step": 5525 }, { "epoch": 0.7298403061897849, "grad_norm": 0.2653372585773468, "learning_rate": 1.9871757804818546e-06, "loss": 0.0019, "num_input_tokens_seen": 2714368, "step": 5530 }, { "epoch": 0.7305001979675333, "grad_norm": 0.0897333025932312, "learning_rate": 1.9871021312478183e-06, "loss": 0.1082, "num_input_tokens_seen": 2716608, "step": 5535 }, { "epoch": 0.7311600897452818, "grad_norm": 0.06179777905344963, "learning_rate": 1.9870282725085025e-06, "loss": 0.0082, "num_input_tokens_seen": 2718656, "step": 5540 }, { "epoch": 0.7318199815230302, "grad_norm": 0.04015703871846199, "learning_rate": 1.9869542042795832e-06, "loss": 0.104, "num_input_tokens_seen": 2721152, "step": 5545 }, { "epoch": 0.7324798733007787, "grad_norm": 37.48463439941406, "learning_rate": 1.9868799265767814e-06, "loss": 0.0037, "num_input_tokens_seen": 2723264, "step": 5550 }, { "epoch": 0.7331397650785271, "grad_norm": 14.808982849121094, "learning_rate": 1.986805439415861e-06, "loss": 0.268, "num_input_tokens_seen": 2725568, "step": 5555 }, { "epoch": 0.7337996568562756, "grad_norm": 0.08154232054948807, "learning_rate": 1.9867307428126327e-06, "loss": 0.1503, "num_input_tokens_seen": 2728192, "step": 5560 }, { "epoch": 0.7344595486340241, "grad_norm": 18.187698364257812, "learning_rate": 1.9866558367829493e-06, "loss": 0.2448, "num_input_tokens_seen": 2731072, "step": 5565 }, { "epoch": 0.7351194404117725, "grad_norm": 0.0977545753121376, "learning_rate": 1.986580721342709e-06, "loss": 0.123, "num_input_tokens_seen": 2733440, "step": 5570 }, { "epoch": 0.735779332189521, "grad_norm": 10.494301795959473, "learning_rate": 1.986505396507855e-06, "loss": 0.1279, "num_input_tokens_seen": 2736064, "step": 5575 }, { "epoch": 0.7364392239672694, "grad_norm": 41.61827087402344, "learning_rate": 1.9864298622943747e-06, "loss": 0.0323, "num_input_tokens_seen": 2738496, "step": 5580 }, { "epoch": 0.7370991157450179, "grad_norm": 1.2770682573318481, "learning_rate": 1.986354118718299e-06, "loss": 0.0531, "num_input_tokens_seen": 2740800, "step": 5585 }, { "epoch": 0.7377590075227662, "grad_norm": 0.4450221359729767, "learning_rate": 1.9862781657957043e-06, "loss": 0.0734, "num_input_tokens_seen": 2743104, "step": 5590 }, { "epoch": 0.7384188993005147, "grad_norm": 3.060678482055664, "learning_rate": 1.986202003542711e-06, "loss": 0.164, "num_input_tokens_seen": 2745344, "step": 5595 }, { "epoch": 0.7390787910782631, "grad_norm": 38.93354034423828, "learning_rate": 1.9861256319754836e-06, "loss": 0.0798, "num_input_tokens_seen": 2747520, "step": 5600 }, { "epoch": 0.7397386828560116, "grad_norm": 0.26881110668182373, "learning_rate": 1.986049051110232e-06, "loss": 0.0556, "num_input_tokens_seen": 2750016, "step": 5605 }, { "epoch": 0.74039857463376, "grad_norm": 0.3444020748138428, "learning_rate": 1.9859722609632097e-06, "loss": 0.165, "num_input_tokens_seen": 2752704, "step": 5610 }, { "epoch": 0.7410584664115085, "grad_norm": 17.72441291809082, "learning_rate": 1.985895261550715e-06, "loss": 0.1732, "num_input_tokens_seen": 2755328, "step": 5615 }, { "epoch": 0.7417183581892569, "grad_norm": 17.03912925720215, "learning_rate": 1.9858180528890898e-06, "loss": 0.1728, "num_input_tokens_seen": 2757632, "step": 5620 }, { "epoch": 0.7423782499670054, "grad_norm": 0.28609731793403625, "learning_rate": 1.985740634994722e-06, "loss": 0.0655, "num_input_tokens_seen": 2760192, "step": 5625 }, { "epoch": 0.7430381417447539, "grad_norm": 0.18844915926456451, "learning_rate": 1.985663007884043e-06, "loss": 0.0018, "num_input_tokens_seen": 2762816, "step": 5630 }, { "epoch": 0.7436980335225023, "grad_norm": 13.197314262390137, "learning_rate": 1.9855851715735275e-06, "loss": 0.0711, "num_input_tokens_seen": 2765120, "step": 5635 }, { "epoch": 0.7443579253002508, "grad_norm": 0.40209174156188965, "learning_rate": 1.985507126079697e-06, "loss": 0.0933, "num_input_tokens_seen": 2767808, "step": 5640 }, { "epoch": 0.7450178170779992, "grad_norm": 0.0401005819439888, "learning_rate": 1.985428871419115e-06, "loss": 0.0009, "num_input_tokens_seen": 2770176, "step": 5645 }, { "epoch": 0.7456777088557477, "grad_norm": 32.41221237182617, "learning_rate": 1.9853504076083914e-06, "loss": 0.1552, "num_input_tokens_seen": 2772672, "step": 5650 }, { "epoch": 0.7463376006334961, "grad_norm": 0.06597806513309479, "learning_rate": 1.985271734664179e-06, "loss": 0.1258, "num_input_tokens_seen": 2775104, "step": 5655 }, { "epoch": 0.7469974924112446, "grad_norm": 85.4188232421875, "learning_rate": 1.985192852603175e-06, "loss": 0.3175, "num_input_tokens_seen": 2777792, "step": 5660 }, { "epoch": 0.747657384188993, "grad_norm": 36.49203109741211, "learning_rate": 1.9851137614421234e-06, "loss": 0.2089, "num_input_tokens_seen": 2780416, "step": 5665 }, { "epoch": 0.7483172759667415, "grad_norm": 0.09846457839012146, "learning_rate": 1.9850344611978085e-06, "loss": 0.0021, "num_input_tokens_seen": 2783232, "step": 5670 }, { "epoch": 0.7489771677444899, "grad_norm": 2.6694984436035156, "learning_rate": 1.984954951887063e-06, "loss": 0.1406, "num_input_tokens_seen": 2785664, "step": 5675 }, { "epoch": 0.7496370595222384, "grad_norm": 35.99433517456055, "learning_rate": 1.984875233526761e-06, "loss": 0.0632, "num_input_tokens_seen": 2788224, "step": 5680 }, { "epoch": 0.7502969512999867, "grad_norm": 0.10405652970075607, "learning_rate": 1.984795306133823e-06, "loss": 0.0028, "num_input_tokens_seen": 2790656, "step": 5685 }, { "epoch": 0.7502969512999867, "eval_loss": 0.09698151051998138, "eval_runtime": 7.858, "eval_samples_per_second": 857.089, "eval_steps_per_second": 107.152, "num_input_tokens_seen": 2790656, "step": 5685 }, { "epoch": 0.7509568430777352, "grad_norm": 0.023251961916685104, "learning_rate": 1.984715169725212e-06, "loss": 0.0287, "num_input_tokens_seen": 2792960, "step": 5690 }, { "epoch": 0.7516167348554837, "grad_norm": 124.132568359375, "learning_rate": 1.9846348243179373e-06, "loss": 0.0862, "num_input_tokens_seen": 2795648, "step": 5695 }, { "epoch": 0.7522766266332321, "grad_norm": 22.3035888671875, "learning_rate": 1.9845542699290516e-06, "loss": 0.0883, "num_input_tokens_seen": 2797696, "step": 5700 }, { "epoch": 0.7529365184109806, "grad_norm": 16.672033309936523, "learning_rate": 1.9844735065756513e-06, "loss": 0.1298, "num_input_tokens_seen": 2800192, "step": 5705 }, { "epoch": 0.753596410188729, "grad_norm": 0.1386905014514923, "learning_rate": 1.984392534274878e-06, "loss": 0.0658, "num_input_tokens_seen": 2802560, "step": 5710 }, { "epoch": 0.7542563019664775, "grad_norm": 0.1372869610786438, "learning_rate": 1.9843113530439184e-06, "loss": 0.2382, "num_input_tokens_seen": 2804992, "step": 5715 }, { "epoch": 0.7549161937442259, "grad_norm": 0.30530741810798645, "learning_rate": 1.9842299629000014e-06, "loss": 0.2144, "num_input_tokens_seen": 2807296, "step": 5720 }, { "epoch": 0.7555760855219744, "grad_norm": 55.86363220214844, "learning_rate": 1.9841483638604025e-06, "loss": 0.1445, "num_input_tokens_seen": 2809984, "step": 5725 }, { "epoch": 0.7562359772997228, "grad_norm": 0.07656501233577728, "learning_rate": 1.9840665559424395e-06, "loss": 0.0021, "num_input_tokens_seen": 2812736, "step": 5730 }, { "epoch": 0.7568958690774713, "grad_norm": 0.051343828439712524, "learning_rate": 1.9839845391634764e-06, "loss": 0.1602, "num_input_tokens_seen": 2815040, "step": 5735 }, { "epoch": 0.7575557608552197, "grad_norm": 0.09367392212152481, "learning_rate": 1.9839023135409203e-06, "loss": 0.1313, "num_input_tokens_seen": 2817344, "step": 5740 }, { "epoch": 0.7582156526329682, "grad_norm": 0.057923562824726105, "learning_rate": 1.983819879092223e-06, "loss": 0.0919, "num_input_tokens_seen": 2819648, "step": 5745 }, { "epoch": 0.7588755444107167, "grad_norm": 0.2078000158071518, "learning_rate": 1.9837372358348804e-06, "loss": 0.2254, "num_input_tokens_seen": 2822464, "step": 5750 }, { "epoch": 0.7595354361884651, "grad_norm": 6.0667009353637695, "learning_rate": 1.9836543837864332e-06, "loss": 0.1121, "num_input_tokens_seen": 2824896, "step": 5755 }, { "epoch": 0.7601953279662136, "grad_norm": 3.583667278289795, "learning_rate": 1.9835713229644663e-06, "loss": 0.1378, "num_input_tokens_seen": 2827648, "step": 5760 }, { "epoch": 0.760855219743962, "grad_norm": 13.19372272491455, "learning_rate": 1.983488053386608e-06, "loss": 0.1264, "num_input_tokens_seen": 2830336, "step": 5765 }, { "epoch": 0.7615151115217105, "grad_norm": 16.9455509185791, "learning_rate": 1.983404575070533e-06, "loss": 0.039, "num_input_tokens_seen": 2832640, "step": 5770 }, { "epoch": 0.7621750032994589, "grad_norm": 0.371663898229599, "learning_rate": 1.9833208880339576e-06, "loss": 0.0268, "num_input_tokens_seen": 2834880, "step": 5775 }, { "epoch": 0.7628348950772074, "grad_norm": 0.05054211616516113, "learning_rate": 1.983236992294645e-06, "loss": 0.1555, "num_input_tokens_seen": 2837440, "step": 5780 }, { "epoch": 0.7634947868549558, "grad_norm": 14.877882957458496, "learning_rate": 1.9831528878704003e-06, "loss": 0.1095, "num_input_tokens_seen": 2839808, "step": 5785 }, { "epoch": 0.7641546786327043, "grad_norm": 0.0845484808087349, "learning_rate": 1.983068574779075e-06, "loss": 0.1398, "num_input_tokens_seen": 2842432, "step": 5790 }, { "epoch": 0.7648145704104526, "grad_norm": 0.18073433637619019, "learning_rate": 1.9829840530385633e-06, "loss": 0.1598, "num_input_tokens_seen": 2845120, "step": 5795 }, { "epoch": 0.7654744621882011, "grad_norm": 170.3042755126953, "learning_rate": 1.9828993226668046e-06, "loss": 0.0721, "num_input_tokens_seen": 2848000, "step": 5800 }, { "epoch": 0.7661343539659495, "grad_norm": 43.15339660644531, "learning_rate": 1.982814383681782e-06, "loss": 0.1805, "num_input_tokens_seen": 2850624, "step": 5805 }, { "epoch": 0.766794245743698, "grad_norm": 0.265320360660553, "learning_rate": 1.9827292361015235e-06, "loss": 0.1815, "num_input_tokens_seen": 2852992, "step": 5810 }, { "epoch": 0.7674541375214465, "grad_norm": 140.9707489013672, "learning_rate": 1.9826438799441016e-06, "loss": 0.0437, "num_input_tokens_seen": 2855424, "step": 5815 }, { "epoch": 0.7681140292991949, "grad_norm": 0.3222537040710449, "learning_rate": 1.982558315227631e-06, "loss": 0.147, "num_input_tokens_seen": 2857984, "step": 5820 }, { "epoch": 0.7687739210769434, "grad_norm": 0.1431400626897812, "learning_rate": 1.982472541970274e-06, "loss": 0.0712, "num_input_tokens_seen": 2860672, "step": 5825 }, { "epoch": 0.7694338128546918, "grad_norm": 12.281307220458984, "learning_rate": 1.9823865601902337e-06, "loss": 0.21, "num_input_tokens_seen": 2863040, "step": 5830 }, { "epoch": 0.7700937046324403, "grad_norm": 40.90239715576172, "learning_rate": 1.9823003699057607e-06, "loss": 0.1239, "num_input_tokens_seen": 2865856, "step": 5835 }, { "epoch": 0.7707535964101887, "grad_norm": 0.3467998206615448, "learning_rate": 1.9822139711351465e-06, "loss": 0.1, "num_input_tokens_seen": 2868096, "step": 5840 }, { "epoch": 0.7714134881879372, "grad_norm": 0.14333048462867737, "learning_rate": 1.9821273638967304e-06, "loss": 0.0024, "num_input_tokens_seen": 2870784, "step": 5845 }, { "epoch": 0.7720733799656856, "grad_norm": 65.02461242675781, "learning_rate": 1.9820405482088927e-06, "loss": 0.0828, "num_input_tokens_seen": 2873216, "step": 5850 }, { "epoch": 0.7727332717434341, "grad_norm": 0.33633705973625183, "learning_rate": 1.9819535240900606e-06, "loss": 0.001, "num_input_tokens_seen": 2875776, "step": 5855 }, { "epoch": 0.7733931635211825, "grad_norm": 0.0250334981828928, "learning_rate": 1.9818662915587036e-06, "loss": 0.0624, "num_input_tokens_seen": 2878336, "step": 5860 }, { "epoch": 0.774053055298931, "grad_norm": 16.731359481811523, "learning_rate": 1.981778850633336e-06, "loss": 0.2229, "num_input_tokens_seen": 2880896, "step": 5865 }, { "epoch": 0.7747129470766794, "grad_norm": 0.014623081311583519, "learning_rate": 1.981691201332517e-06, "loss": 0.0652, "num_input_tokens_seen": 2883648, "step": 5870 }, { "epoch": 0.7753728388544279, "grad_norm": 0.05482471361756325, "learning_rate": 1.9816033436748495e-06, "loss": 0.0585, "num_input_tokens_seen": 2885952, "step": 5875 }, { "epoch": 0.7760327306321764, "grad_norm": 0.3382435739040375, "learning_rate": 1.98151527767898e-06, "loss": 0.079, "num_input_tokens_seen": 2888576, "step": 5880 }, { "epoch": 0.7766926224099248, "grad_norm": 0.5653005242347717, "learning_rate": 1.981427003363601e-06, "loss": 0.1387, "num_input_tokens_seen": 2891136, "step": 5885 }, { "epoch": 0.7773525141876733, "grad_norm": 39.504642486572266, "learning_rate": 1.9813385207474472e-06, "loss": 0.1429, "num_input_tokens_seen": 2893696, "step": 5890 }, { "epoch": 0.7780124059654216, "grad_norm": 1.0022053718566895, "learning_rate": 1.981249829849299e-06, "loss": 0.0546, "num_input_tokens_seen": 2896512, "step": 5895 }, { "epoch": 0.7786722977431701, "grad_norm": 28.317174911499023, "learning_rate": 1.9811609306879798e-06, "loss": 0.1847, "num_input_tokens_seen": 2899008, "step": 5900 }, { "epoch": 0.7793321895209185, "grad_norm": 0.7019644975662231, "learning_rate": 1.9810718232823584e-06, "loss": 0.0416, "num_input_tokens_seen": 2901376, "step": 5905 }, { "epoch": 0.779992081298667, "grad_norm": 13.494510650634766, "learning_rate": 1.9809825076513462e-06, "loss": 0.2391, "num_input_tokens_seen": 2903872, "step": 5910 }, { "epoch": 0.7806519730764154, "grad_norm": 0.11542707681655884, "learning_rate": 1.980892983813901e-06, "loss": 0.0021, "num_input_tokens_seen": 2906240, "step": 5915 }, { "epoch": 0.7813118648541639, "grad_norm": 44.78446578979492, "learning_rate": 1.980803251789023e-06, "loss": 0.1206, "num_input_tokens_seen": 2908736, "step": 5920 }, { "epoch": 0.7819717566319123, "grad_norm": 42.42718505859375, "learning_rate": 1.980713311595757e-06, "loss": 0.1592, "num_input_tokens_seen": 2911104, "step": 5925 }, { "epoch": 0.7826316484096608, "grad_norm": 0.17716114223003387, "learning_rate": 1.980623163253192e-06, "loss": 0.0998, "num_input_tokens_seen": 2913472, "step": 5930 }, { "epoch": 0.7832915401874093, "grad_norm": 67.53726196289062, "learning_rate": 1.9805328067804626e-06, "loss": 0.1875, "num_input_tokens_seen": 2915840, "step": 5935 }, { "epoch": 0.7839514319651577, "grad_norm": 0.056678541004657745, "learning_rate": 1.980442242196745e-06, "loss": 0.0014, "num_input_tokens_seen": 2918144, "step": 5940 }, { "epoch": 0.7846113237429062, "grad_norm": 0.0669153556227684, "learning_rate": 1.9803514695212613e-06, "loss": 0.1515, "num_input_tokens_seen": 2920768, "step": 5945 }, { "epoch": 0.7852712155206546, "grad_norm": 0.037776295095682144, "learning_rate": 1.9802604887732773e-06, "loss": 0.093, "num_input_tokens_seen": 2923136, "step": 5950 }, { "epoch": 0.7859311072984031, "grad_norm": 0.15245310962200165, "learning_rate": 1.980169299972103e-06, "loss": 0.1338, "num_input_tokens_seen": 2925568, "step": 5955 }, { "epoch": 0.7865909990761515, "grad_norm": 0.14270982146263123, "learning_rate": 1.980077903137093e-06, "loss": 0.0132, "num_input_tokens_seen": 2928064, "step": 5960 }, { "epoch": 0.7872508908539, "grad_norm": 13.677779197692871, "learning_rate": 1.979986298287645e-06, "loss": 0.1477, "num_input_tokens_seen": 2930368, "step": 5965 }, { "epoch": 0.7879107826316484, "grad_norm": 15.470702171325684, "learning_rate": 1.979894485443201e-06, "loss": 0.0939, "num_input_tokens_seen": 2932928, "step": 5970 }, { "epoch": 0.7885706744093969, "grad_norm": 191.1402130126953, "learning_rate": 1.9798024646232495e-06, "loss": 0.2729, "num_input_tokens_seen": 2935360, "step": 5975 }, { "epoch": 0.7892305661871453, "grad_norm": 148.3045196533203, "learning_rate": 1.9797102358473195e-06, "loss": 0.0693, "num_input_tokens_seen": 2937920, "step": 5980 }, { "epoch": 0.7898904579648938, "grad_norm": 18.475128173828125, "learning_rate": 1.979617799134986e-06, "loss": 0.2579, "num_input_tokens_seen": 2940224, "step": 5985 }, { "epoch": 0.7905503497426422, "grad_norm": 57.13227462768555, "learning_rate": 1.979525154505869e-06, "loss": 0.0073, "num_input_tokens_seen": 2942848, "step": 5990 }, { "epoch": 0.7912102415203907, "grad_norm": 0.4409717917442322, "learning_rate": 1.979432301979631e-06, "loss": 0.1256, "num_input_tokens_seen": 2945344, "step": 5995 }, { "epoch": 0.7918701332981392, "grad_norm": 15.219269752502441, "learning_rate": 1.9793392415759796e-06, "loss": 0.0084, "num_input_tokens_seen": 2947840, "step": 6000 }, { "epoch": 0.7925300250758875, "grad_norm": 0.10808387398719788, "learning_rate": 1.979245973314666e-06, "loss": 0.1543, "num_input_tokens_seen": 2950144, "step": 6005 }, { "epoch": 0.793189916853636, "grad_norm": 306.9264221191406, "learning_rate": 1.9791524972154856e-06, "loss": 0.0351, "num_input_tokens_seen": 2952384, "step": 6010 }, { "epoch": 0.7938498086313844, "grad_norm": 0.06602656096220016, "learning_rate": 1.979058813298278e-06, "loss": 0.2565, "num_input_tokens_seen": 2955136, "step": 6015 }, { "epoch": 0.7945097004091329, "grad_norm": 1.304447889328003, "learning_rate": 1.978964921582927e-06, "loss": 0.0011, "num_input_tokens_seen": 2957824, "step": 6020 }, { "epoch": 0.7951695921868813, "grad_norm": 0.026650432497262955, "learning_rate": 1.9788708220893608e-06, "loss": 0.063, "num_input_tokens_seen": 2960256, "step": 6025 }, { "epoch": 0.7958294839646298, "grad_norm": 0.8960063457489014, "learning_rate": 1.9787765148375506e-06, "loss": 0.19, "num_input_tokens_seen": 2962944, "step": 6030 }, { "epoch": 0.7964893757423782, "grad_norm": 126.94390869140625, "learning_rate": 1.978681999847513e-06, "loss": 0.2955, "num_input_tokens_seen": 2965504, "step": 6035 }, { "epoch": 0.7971492675201267, "grad_norm": 0.08152367174625397, "learning_rate": 1.9785872771393084e-06, "loss": 0.3805, "num_input_tokens_seen": 2967744, "step": 6040 }, { "epoch": 0.7978091592978751, "grad_norm": 34.75372314453125, "learning_rate": 1.9784923467330403e-06, "loss": 0.0549, "num_input_tokens_seen": 2970240, "step": 6045 }, { "epoch": 0.7984690510756236, "grad_norm": 0.5853905081748962, "learning_rate": 1.9783972086488573e-06, "loss": 0.2836, "num_input_tokens_seen": 2972928, "step": 6050 }, { "epoch": 0.799128942853372, "grad_norm": 0.08218041062355042, "learning_rate": 1.9783018629069516e-06, "loss": 0.0879, "num_input_tokens_seen": 2975168, "step": 6055 }, { "epoch": 0.7997888346311205, "grad_norm": 1.292216420173645, "learning_rate": 1.97820630952756e-06, "loss": 0.1121, "num_input_tokens_seen": 2977408, "step": 6060 }, { "epoch": 0.800448726408869, "grad_norm": 0.052405282855033875, "learning_rate": 1.978110548530963e-06, "loss": 0.0025, "num_input_tokens_seen": 2979968, "step": 6065 }, { "epoch": 0.8011086181866174, "grad_norm": 39.24075698852539, "learning_rate": 1.9780145799374846e-06, "loss": 0.2776, "num_input_tokens_seen": 2982528, "step": 6070 }, { "epoch": 0.8017685099643659, "grad_norm": 0.5942057967185974, "learning_rate": 1.977918403767494e-06, "loss": 0.0743, "num_input_tokens_seen": 2984832, "step": 6075 }, { "epoch": 0.8024284017421143, "grad_norm": 24.79922866821289, "learning_rate": 1.9778220200414036e-06, "loss": 0.0383, "num_input_tokens_seen": 2987328, "step": 6080 }, { "epoch": 0.8030882935198628, "grad_norm": 0.13059140741825104, "learning_rate": 1.9777254287796706e-06, "loss": 0.0037, "num_input_tokens_seen": 2989760, "step": 6085 }, { "epoch": 0.8037481852976112, "grad_norm": 0.6647312641143799, "learning_rate": 1.9776286300027954e-06, "loss": 0.0012, "num_input_tokens_seen": 2992320, "step": 6090 }, { "epoch": 0.8044080770753597, "grad_norm": 0.0258713997900486, "learning_rate": 1.9775316237313225e-06, "loss": 0.1335, "num_input_tokens_seen": 2995136, "step": 6095 }, { "epoch": 0.805067968853108, "grad_norm": 0.06406942009925842, "learning_rate": 1.977434409985842e-06, "loss": 0.0003, "num_input_tokens_seen": 2998016, "step": 6100 }, { "epoch": 0.8057278606308566, "grad_norm": 0.02696152776479721, "learning_rate": 1.977336988786985e-06, "loss": 0.2727, "num_input_tokens_seen": 3000832, "step": 6105 }, { "epoch": 0.8063877524086049, "grad_norm": 14.614540100097656, "learning_rate": 1.97723936015543e-06, "loss": 0.2474, "num_input_tokens_seen": 3003584, "step": 6110 }, { "epoch": 0.8070476441863534, "grad_norm": 91.05704498291016, "learning_rate": 1.9771415241118972e-06, "loss": 0.0878, "num_input_tokens_seen": 3006464, "step": 6115 }, { "epoch": 0.8077075359641019, "grad_norm": 0.38378047943115234, "learning_rate": 1.9770434806771525e-06, "loss": 0.1026, "num_input_tokens_seen": 3008896, "step": 6120 }, { "epoch": 0.8083674277418503, "grad_norm": 4.753599643707275, "learning_rate": 1.976945229872003e-06, "loss": 0.0518, "num_input_tokens_seen": 3011392, "step": 6125 }, { "epoch": 0.8090273195195988, "grad_norm": 23.890127182006836, "learning_rate": 1.976846771717304e-06, "loss": 0.2063, "num_input_tokens_seen": 3014016, "step": 6130 }, { "epoch": 0.8096872112973472, "grad_norm": 0.4210752546787262, "learning_rate": 1.9767481062339512e-06, "loss": 0.1909, "num_input_tokens_seen": 3016576, "step": 6135 }, { "epoch": 0.8103471030750957, "grad_norm": 67.47174072265625, "learning_rate": 1.976649233442886e-06, "loss": 0.0153, "num_input_tokens_seen": 3019008, "step": 6140 }, { "epoch": 0.8110069948528441, "grad_norm": 1.1742627620697021, "learning_rate": 1.976550153365093e-06, "loss": 0.0678, "num_input_tokens_seen": 3021504, "step": 6145 }, { "epoch": 0.8116668866305926, "grad_norm": 2.150078773498535, "learning_rate": 1.9764508660216018e-06, "loss": 0.0594, "num_input_tokens_seen": 3023552, "step": 6150 }, { "epoch": 0.812326778408341, "grad_norm": 56.31321716308594, "learning_rate": 1.976351371433485e-06, "loss": 0.1778, "num_input_tokens_seen": 3025856, "step": 6155 }, { "epoch": 0.8129866701860895, "grad_norm": 44.1425895690918, "learning_rate": 1.9762516696218598e-06, "loss": 0.2057, "num_input_tokens_seen": 3028096, "step": 6160 }, { "epoch": 0.8136465619638379, "grad_norm": 25.15863800048828, "learning_rate": 1.9761517606078873e-06, "loss": 0.3517, "num_input_tokens_seen": 3030528, "step": 6165 }, { "epoch": 0.8143064537415864, "grad_norm": 36.221038818359375, "learning_rate": 1.9760516444127722e-06, "loss": 0.2465, "num_input_tokens_seen": 3033088, "step": 6170 }, { "epoch": 0.8149663455193348, "grad_norm": 0.5490625500679016, "learning_rate": 1.975951321057764e-06, "loss": 0.0653, "num_input_tokens_seen": 3035200, "step": 6175 }, { "epoch": 0.8156262372970833, "grad_norm": 0.20859502255916595, "learning_rate": 1.975850790564155e-06, "loss": 0.0755, "num_input_tokens_seen": 3037696, "step": 6180 }, { "epoch": 0.8162861290748318, "grad_norm": 11.634245872497559, "learning_rate": 1.9757500529532817e-06, "loss": 0.1064, "num_input_tokens_seen": 3040128, "step": 6185 }, { "epoch": 0.8169460208525802, "grad_norm": 0.895517110824585, "learning_rate": 1.975649108246526e-06, "loss": 0.1667, "num_input_tokens_seen": 3042560, "step": 6190 }, { "epoch": 0.8176059126303287, "grad_norm": 33.11262512207031, "learning_rate": 1.9755479564653123e-06, "loss": 0.2541, "num_input_tokens_seen": 3044800, "step": 6195 }, { "epoch": 0.8182658044080771, "grad_norm": 16.164127349853516, "learning_rate": 1.975446597631109e-06, "loss": 0.124, "num_input_tokens_seen": 3047040, "step": 6200 }, { "epoch": 0.8189256961858256, "grad_norm": 0.22055456042289734, "learning_rate": 1.975345031765429e-06, "loss": 0.064, "num_input_tokens_seen": 3049600, "step": 6205 }, { "epoch": 0.819585587963574, "grad_norm": 1.1324610710144043, "learning_rate": 1.975243258889829e-06, "loss": 0.1276, "num_input_tokens_seen": 3052416, "step": 6210 }, { "epoch": 0.8202454797413224, "grad_norm": 0.5565292835235596, "learning_rate": 1.9751412790259093e-06, "loss": 0.0928, "num_input_tokens_seen": 3055040, "step": 6215 }, { "epoch": 0.8209053715190708, "grad_norm": 32.02667999267578, "learning_rate": 1.9750390921953144e-06, "loss": 0.0983, "num_input_tokens_seen": 3057856, "step": 6220 }, { "epoch": 0.8215652632968193, "grad_norm": 1.3813210725784302, "learning_rate": 1.9749366984197335e-06, "loss": 0.2008, "num_input_tokens_seen": 3060160, "step": 6225 }, { "epoch": 0.8222251550745677, "grad_norm": 2.9544014930725098, "learning_rate": 1.9748340977208975e-06, "loss": 0.1972, "num_input_tokens_seen": 3062592, "step": 6230 }, { "epoch": 0.8228850468523162, "grad_norm": 11.683035850524902, "learning_rate": 1.9747312901205837e-06, "loss": 0.0591, "num_input_tokens_seen": 3065088, "step": 6235 }, { "epoch": 0.8235449386300646, "grad_norm": 0.14474692940711975, "learning_rate": 1.9746282756406126e-06, "loss": 0.0013, "num_input_tokens_seen": 3067712, "step": 6240 }, { "epoch": 0.8242048304078131, "grad_norm": 30.500009536743164, "learning_rate": 1.974525054302847e-06, "loss": 0.1508, "num_input_tokens_seen": 3070144, "step": 6245 }, { "epoch": 0.8248647221855616, "grad_norm": 0.25611788034439087, "learning_rate": 1.974421626129196e-06, "loss": 0.2101, "num_input_tokens_seen": 3072448, "step": 6250 }, { "epoch": 0.82552461396331, "grad_norm": 0.5064347982406616, "learning_rate": 1.9743179911416104e-06, "loss": 0.1979, "num_input_tokens_seen": 3075072, "step": 6255 }, { "epoch": 0.8261845057410585, "grad_norm": 0.08567559719085693, "learning_rate": 1.9742141493620876e-06, "loss": 0.1248, "num_input_tokens_seen": 3077376, "step": 6260 }, { "epoch": 0.8268443975188069, "grad_norm": 39.92914581298828, "learning_rate": 1.9741101008126655e-06, "loss": 0.2122, "num_input_tokens_seen": 3079808, "step": 6265 }, { "epoch": 0.8275042892965554, "grad_norm": 1.009863018989563, "learning_rate": 1.974005845515429e-06, "loss": 0.0026, "num_input_tokens_seen": 3082560, "step": 6270 }, { "epoch": 0.8281641810743038, "grad_norm": 26.14215087890625, "learning_rate": 1.9739013834925047e-06, "loss": 0.1156, "num_input_tokens_seen": 3084608, "step": 6275 }, { "epoch": 0.8288240728520523, "grad_norm": 18.71134376525879, "learning_rate": 1.973796714766064e-06, "loss": 0.3242, "num_input_tokens_seen": 3087104, "step": 6280 }, { "epoch": 0.8294839646298007, "grad_norm": 0.44969964027404785, "learning_rate": 1.973691839358323e-06, "loss": 0.0915, "num_input_tokens_seen": 3089408, "step": 6285 }, { "epoch": 0.8301438564075492, "grad_norm": 110.7673110961914, "learning_rate": 1.973586757291539e-06, "loss": 0.054, "num_input_tokens_seen": 3091776, "step": 6290 }, { "epoch": 0.8308037481852976, "grad_norm": 0.1554386168718338, "learning_rate": 1.973481468588017e-06, "loss": 0.1558, "num_input_tokens_seen": 3094208, "step": 6295 }, { "epoch": 0.8314636399630461, "grad_norm": 0.08468946814537048, "learning_rate": 1.973375973270102e-06, "loss": 0.0414, "num_input_tokens_seen": 3096768, "step": 6300 }, { "epoch": 0.8321235317407946, "grad_norm": 0.7825853824615479, "learning_rate": 1.973270271360185e-06, "loss": 0.0018, "num_input_tokens_seen": 3099456, "step": 6305 }, { "epoch": 0.832783423518543, "grad_norm": 0.31845423579216003, "learning_rate": 1.9731643628807014e-06, "loss": 0.1685, "num_input_tokens_seen": 3102208, "step": 6310 }, { "epoch": 0.8334433152962915, "grad_norm": 0.06811577826738358, "learning_rate": 1.973058247854129e-06, "loss": 0.0576, "num_input_tokens_seen": 3104896, "step": 6315 }, { "epoch": 0.8341032070740398, "grad_norm": 0.04747435823082924, "learning_rate": 1.9729519263029895e-06, "loss": 0.1591, "num_input_tokens_seen": 3107520, "step": 6320 }, { "epoch": 0.8347630988517883, "grad_norm": 0.05607318878173828, "learning_rate": 1.972845398249849e-06, "loss": 0.0462, "num_input_tokens_seen": 3110144, "step": 6325 }, { "epoch": 0.8354229906295367, "grad_norm": 0.04064086452126503, "learning_rate": 1.972738663717318e-06, "loss": 0.0463, "num_input_tokens_seen": 3112768, "step": 6330 }, { "epoch": 0.8360828824072852, "grad_norm": 0.023201411589980125, "learning_rate": 1.9726317227280494e-06, "loss": 0.0003, "num_input_tokens_seen": 3115328, "step": 6335 }, { "epoch": 0.8367427741850336, "grad_norm": 0.04389675334095955, "learning_rate": 1.972524575304741e-06, "loss": 0.0213, "num_input_tokens_seen": 3117888, "step": 6340 }, { "epoch": 0.8374026659627821, "grad_norm": 0.3593812584877014, "learning_rate": 1.972417221470134e-06, "loss": 0.0118, "num_input_tokens_seen": 3120384, "step": 6345 }, { "epoch": 0.8380625577405305, "grad_norm": 0.34914690256118774, "learning_rate": 1.972309661247013e-06, "loss": 0.1584, "num_input_tokens_seen": 3123008, "step": 6350 }, { "epoch": 0.838722449518279, "grad_norm": 0.03417549654841423, "learning_rate": 1.9722018946582075e-06, "loss": 0.0573, "num_input_tokens_seen": 3125504, "step": 6355 }, { "epoch": 0.8393823412960274, "grad_norm": 0.46451956033706665, "learning_rate": 1.9720939217265904e-06, "loss": 0.0681, "num_input_tokens_seen": 3127744, "step": 6360 }, { "epoch": 0.8400422330737759, "grad_norm": 0.0629674419760704, "learning_rate": 1.9719857424750776e-06, "loss": 0.1754, "num_input_tokens_seen": 3130048, "step": 6365 }, { "epoch": 0.8407021248515244, "grad_norm": 0.19810578227043152, "learning_rate": 1.971877356926629e-06, "loss": 0.0619, "num_input_tokens_seen": 3132480, "step": 6370 }, { "epoch": 0.8413620166292728, "grad_norm": 100.77786254882812, "learning_rate": 1.9717687651042494e-06, "loss": 0.136, "num_input_tokens_seen": 3135104, "step": 6375 }, { "epoch": 0.8420219084070213, "grad_norm": 48.25716781616211, "learning_rate": 1.971659967030987e-06, "loss": 0.2398, "num_input_tokens_seen": 3137344, "step": 6380 }, { "epoch": 0.8426818001847697, "grad_norm": 23.40747833251953, "learning_rate": 1.9715509627299324e-06, "loss": 0.2223, "num_input_tokens_seen": 3140096, "step": 6385 }, { "epoch": 0.8433416919625182, "grad_norm": 22.579530715942383, "learning_rate": 1.971441752224221e-06, "loss": 0.1451, "num_input_tokens_seen": 3142400, "step": 6390 }, { "epoch": 0.8440015837402666, "grad_norm": 1.4132654666900635, "learning_rate": 1.971332335537033e-06, "loss": 0.0571, "num_input_tokens_seen": 3144512, "step": 6395 }, { "epoch": 0.8446614755180151, "grad_norm": 16.25748634338379, "learning_rate": 1.97122271269159e-06, "loss": 0.1166, "num_input_tokens_seen": 3146944, "step": 6400 }, { "epoch": 0.8453213672957635, "grad_norm": 0.35819733142852783, "learning_rate": 1.97111288371116e-06, "loss": 0.1062, "num_input_tokens_seen": 3149376, "step": 6405 }, { "epoch": 0.845981259073512, "grad_norm": 0.4410839378833771, "learning_rate": 1.9710028486190524e-06, "loss": 0.1249, "num_input_tokens_seen": 3151744, "step": 6410 }, { "epoch": 0.8466411508512603, "grad_norm": 0.2627358138561249, "learning_rate": 1.970892607438621e-06, "loss": 0.039, "num_input_tokens_seen": 3154112, "step": 6415 }, { "epoch": 0.8473010426290088, "grad_norm": 0.37668725848197937, "learning_rate": 1.970782160193265e-06, "loss": 0.0129, "num_input_tokens_seen": 3156480, "step": 6420 }, { "epoch": 0.8479609344067572, "grad_norm": 17.34520721435547, "learning_rate": 1.970671506906425e-06, "loss": 0.2154, "num_input_tokens_seen": 3158784, "step": 6425 }, { "epoch": 0.8486208261845057, "grad_norm": 0.09452015161514282, "learning_rate": 1.970560647601587e-06, "loss": 0.1681, "num_input_tokens_seen": 3161152, "step": 6430 }, { "epoch": 0.8492807179622542, "grad_norm": 0.035893987864255905, "learning_rate": 1.9704495823022797e-06, "loss": 0.0015, "num_input_tokens_seen": 3163776, "step": 6435 }, { "epoch": 0.8499406097400026, "grad_norm": 35.052188873291016, "learning_rate": 1.970338311032076e-06, "loss": 0.1335, "num_input_tokens_seen": 3166272, "step": 6440 }, { "epoch": 0.8506005015177511, "grad_norm": 20.927108764648438, "learning_rate": 1.970226833814592e-06, "loss": 0.1466, "num_input_tokens_seen": 3168640, "step": 6445 }, { "epoch": 0.8512603932954995, "grad_norm": 0.10324529558420181, "learning_rate": 1.970115150673489e-06, "loss": 0.072, "num_input_tokens_seen": 3171008, "step": 6450 }, { "epoch": 0.851920285073248, "grad_norm": 0.2769117057323456, "learning_rate": 1.97000326163247e-06, "loss": 0.0785, "num_input_tokens_seen": 3173312, "step": 6455 }, { "epoch": 0.8525801768509964, "grad_norm": 0.13265232741832733, "learning_rate": 1.969891166715283e-06, "loss": 0.1788, "num_input_tokens_seen": 3175808, "step": 6460 }, { "epoch": 0.8532400686287449, "grad_norm": 0.49003279209136963, "learning_rate": 1.969778865945719e-06, "loss": 0.1182, "num_input_tokens_seen": 3178048, "step": 6465 }, { "epoch": 0.8538999604064933, "grad_norm": 0.14793264865875244, "learning_rate": 1.969666359347614e-06, "loss": 0.0031, "num_input_tokens_seen": 3180544, "step": 6470 }, { "epoch": 0.8545598521842418, "grad_norm": 11.453808784484863, "learning_rate": 1.969553646944845e-06, "loss": 0.268, "num_input_tokens_seen": 3183040, "step": 6475 }, { "epoch": 0.8552197439619902, "grad_norm": 0.1205844134092331, "learning_rate": 1.969440728761336e-06, "loss": 0.0905, "num_input_tokens_seen": 3185664, "step": 6480 }, { "epoch": 0.8558796357397387, "grad_norm": 17.57941436767578, "learning_rate": 1.9693276048210524e-06, "loss": 0.1175, "num_input_tokens_seen": 3188672, "step": 6485 }, { "epoch": 0.8565395275174872, "grad_norm": 42.01498031616211, "learning_rate": 1.969214275148004e-06, "loss": 0.0078, "num_input_tokens_seen": 3191168, "step": 6490 }, { "epoch": 0.8571994192952356, "grad_norm": 84.84487915039062, "learning_rate": 1.9691007397662444e-06, "loss": 0.2481, "num_input_tokens_seen": 3193664, "step": 6495 }, { "epoch": 0.8578593110729841, "grad_norm": 0.0309496708214283, "learning_rate": 1.96898699869987e-06, "loss": 0.0524, "num_input_tokens_seen": 3196224, "step": 6500 }, { "epoch": 0.8585192028507325, "grad_norm": 0.08695626258850098, "learning_rate": 1.968873051973022e-06, "loss": 0.1735, "num_input_tokens_seen": 3198784, "step": 6505 }, { "epoch": 0.859179094628481, "grad_norm": 0.0463121235370636, "learning_rate": 1.968758899609885e-06, "loss": 0.1, "num_input_tokens_seen": 3201472, "step": 6510 }, { "epoch": 0.8598389864062294, "grad_norm": 0.3663501739501953, "learning_rate": 1.9686445416346866e-06, "loss": 0.0387, "num_input_tokens_seen": 3203584, "step": 6515 }, { "epoch": 0.8604988781839779, "grad_norm": 44.38224411010742, "learning_rate": 1.9685299780716988e-06, "loss": 0.16, "num_input_tokens_seen": 3205888, "step": 6520 }, { "epoch": 0.8611587699617262, "grad_norm": 0.530706524848938, "learning_rate": 1.968415208945237e-06, "loss": 0.2118, "num_input_tokens_seen": 3208000, "step": 6525 }, { "epoch": 0.8618186617394747, "grad_norm": 0.151189386844635, "learning_rate": 1.9683002342796594e-06, "loss": 0.0009, "num_input_tokens_seen": 3210240, "step": 6530 }, { "epoch": 0.8624785535172231, "grad_norm": 54.15868377685547, "learning_rate": 1.9681850540993687e-06, "loss": 0.0847, "num_input_tokens_seen": 3212672, "step": 6535 }, { "epoch": 0.8631384452949716, "grad_norm": 11.121855735778809, "learning_rate": 1.9680696684288116e-06, "loss": 0.2278, "num_input_tokens_seen": 3215360, "step": 6540 }, { "epoch": 0.86379833707272, "grad_norm": 1.1879770755767822, "learning_rate": 1.9679540772924773e-06, "loss": 0.1291, "num_input_tokens_seen": 3218112, "step": 6545 }, { "epoch": 0.8644582288504685, "grad_norm": 0.7121301293373108, "learning_rate": 1.9678382807149e-06, "loss": 0.0677, "num_input_tokens_seen": 3220288, "step": 6550 }, { "epoch": 0.865118120628217, "grad_norm": 0.16042585670948029, "learning_rate": 1.967722278720656e-06, "loss": 0.0061, "num_input_tokens_seen": 3222976, "step": 6555 }, { "epoch": 0.8657780124059654, "grad_norm": 4.094472408294678, "learning_rate": 1.967606071334366e-06, "loss": 0.087, "num_input_tokens_seen": 3225472, "step": 6560 }, { "epoch": 0.8664379041837139, "grad_norm": 2.223787307739258, "learning_rate": 1.9674896585806938e-06, "loss": 0.2098, "num_input_tokens_seen": 3228096, "step": 6565 }, { "epoch": 0.8670977959614623, "grad_norm": 0.06424208730459213, "learning_rate": 1.967373040484348e-06, "loss": 0.1914, "num_input_tokens_seen": 3230720, "step": 6570 }, { "epoch": 0.8677576877392108, "grad_norm": 13.304986953735352, "learning_rate": 1.9672562170700794e-06, "loss": 0.1312, "num_input_tokens_seen": 3233088, "step": 6575 }, { "epoch": 0.8684175795169592, "grad_norm": 25.26539421081543, "learning_rate": 1.967139188362683e-06, "loss": 0.139, "num_input_tokens_seen": 3235712, "step": 6580 }, { "epoch": 0.8690774712947077, "grad_norm": 0.19319690763950348, "learning_rate": 1.9670219543869977e-06, "loss": 0.1531, "num_input_tokens_seen": 3238528, "step": 6585 }, { "epoch": 0.8697373630724561, "grad_norm": 18.5180606842041, "learning_rate": 1.9669045151679045e-06, "loss": 0.1389, "num_input_tokens_seen": 3240896, "step": 6590 }, { "epoch": 0.8703972548502046, "grad_norm": 0.2979770004749298, "learning_rate": 1.9667868707303304e-06, "loss": 0.0033, "num_input_tokens_seen": 3243392, "step": 6595 }, { "epoch": 0.871057146627953, "grad_norm": 0.1635628491640091, "learning_rate": 1.966669021099244e-06, "loss": 0.0405, "num_input_tokens_seen": 3245824, "step": 6600 }, { "epoch": 0.8717170384057015, "grad_norm": 2.8577051162719727, "learning_rate": 1.966550966299657e-06, "loss": 0.002, "num_input_tokens_seen": 3248128, "step": 6605 }, { "epoch": 0.8723769301834499, "grad_norm": 52.59244155883789, "learning_rate": 1.9664327063566273e-06, "loss": 0.2562, "num_input_tokens_seen": 3250624, "step": 6610 }, { "epoch": 0.8730368219611984, "grad_norm": 0.6827111840248108, "learning_rate": 1.966314241295254e-06, "loss": 0.1405, "num_input_tokens_seen": 3253312, "step": 6615 }, { "epoch": 0.8736967137389469, "grad_norm": 0.03325214982032776, "learning_rate": 1.9661955711406808e-06, "loss": 0.1581, "num_input_tokens_seen": 3255488, "step": 6620 }, { "epoch": 0.8743566055166953, "grad_norm": 0.028142258524894714, "learning_rate": 1.966076695918094e-06, "loss": 0.0712, "num_input_tokens_seen": 3257664, "step": 6625 }, { "epoch": 0.8750164972944438, "grad_norm": 0.07260460406541824, "learning_rate": 1.9659576156527236e-06, "loss": 0.0422, "num_input_tokens_seen": 3260160, "step": 6630 }, { "epoch": 0.8756763890721921, "grad_norm": 21.091014862060547, "learning_rate": 1.965838330369845e-06, "loss": 0.15, "num_input_tokens_seen": 3262528, "step": 6635 }, { "epoch": 0.8763362808499406, "grad_norm": 0.024985164403915405, "learning_rate": 1.9657188400947748e-06, "loss": 0.099, "num_input_tokens_seen": 3265024, "step": 6640 }, { "epoch": 0.876996172627689, "grad_norm": 0.18783406913280487, "learning_rate": 1.965599144852874e-06, "loss": 0.2838, "num_input_tokens_seen": 3267456, "step": 6645 }, { "epoch": 0.8776560644054375, "grad_norm": 0.43716302514076233, "learning_rate": 1.9654792446695467e-06, "loss": 0.0717, "num_input_tokens_seen": 3270208, "step": 6650 }, { "epoch": 0.8783159561831859, "grad_norm": 35.399932861328125, "learning_rate": 1.9653591395702408e-06, "loss": 0.1191, "num_input_tokens_seen": 3272960, "step": 6655 }, { "epoch": 0.8789758479609344, "grad_norm": 0.5945919752120972, "learning_rate": 1.9652388295804484e-06, "loss": 0.1331, "num_input_tokens_seen": 3275136, "step": 6660 }, { "epoch": 0.8796357397386828, "grad_norm": 29.954280853271484, "learning_rate": 1.9651183147257046e-06, "loss": 0.2028, "num_input_tokens_seen": 3277696, "step": 6665 }, { "epoch": 0.8802956315164313, "grad_norm": 12.914910316467285, "learning_rate": 1.964997595031587e-06, "loss": 0.1612, "num_input_tokens_seen": 3280064, "step": 6670 }, { "epoch": 0.8809555232941798, "grad_norm": 0.618593156337738, "learning_rate": 1.964876670523718e-06, "loss": 0.185, "num_input_tokens_seen": 3282304, "step": 6675 }, { "epoch": 0.8816154150719282, "grad_norm": 0.3972916007041931, "learning_rate": 1.9647555412277623e-06, "loss": 0.102, "num_input_tokens_seen": 3284736, "step": 6680 }, { "epoch": 0.8822753068496767, "grad_norm": 56.937686920166016, "learning_rate": 1.9646342071694298e-06, "loss": 0.0322, "num_input_tokens_seen": 3287168, "step": 6685 }, { "epoch": 0.8829351986274251, "grad_norm": 32.125244140625, "learning_rate": 1.9645126683744718e-06, "loss": 0.1026, "num_input_tokens_seen": 3289600, "step": 6690 }, { "epoch": 0.8835950904051736, "grad_norm": 0.03807664290070534, "learning_rate": 1.9643909248686847e-06, "loss": 0.0023, "num_input_tokens_seen": 3292160, "step": 6695 }, { "epoch": 0.884254982182922, "grad_norm": 12.510610580444336, "learning_rate": 1.964268976677907e-06, "loss": 0.1903, "num_input_tokens_seen": 3294592, "step": 6700 }, { "epoch": 0.8849148739606705, "grad_norm": 39.68159103393555, "learning_rate": 1.964146823828022e-06, "loss": 0.0425, "num_input_tokens_seen": 3296960, "step": 6705 }, { "epoch": 0.8855747657384189, "grad_norm": 0.1999286711215973, "learning_rate": 1.9640244663449548e-06, "loss": 0.035, "num_input_tokens_seen": 3299200, "step": 6710 }, { "epoch": 0.8862346575161674, "grad_norm": 10.900226593017578, "learning_rate": 1.963901904254676e-06, "loss": 0.2501, "num_input_tokens_seen": 3301568, "step": 6715 }, { "epoch": 0.8868945492939158, "grad_norm": 17.2080135345459, "learning_rate": 1.963779137583198e-06, "loss": 0.1129, "num_input_tokens_seen": 3304064, "step": 6720 }, { "epoch": 0.8875544410716643, "grad_norm": 0.17219261825084686, "learning_rate": 1.963656166356577e-06, "loss": 0.1272, "num_input_tokens_seen": 3306432, "step": 6725 }, { "epoch": 0.8882143328494126, "grad_norm": 1.6587921380996704, "learning_rate": 1.9635329906009135e-06, "loss": 0.1033, "num_input_tokens_seen": 3308736, "step": 6730 }, { "epoch": 0.8888742246271611, "grad_norm": 0.29581812024116516, "learning_rate": 1.96340961034235e-06, "loss": 0.0311, "num_input_tokens_seen": 3311168, "step": 6735 }, { "epoch": 0.8895341164049096, "grad_norm": 34.09101867675781, "learning_rate": 1.9632860256070727e-06, "loss": 0.1654, "num_input_tokens_seen": 3313664, "step": 6740 }, { "epoch": 0.890194008182658, "grad_norm": 20.558460235595703, "learning_rate": 1.9631622364213124e-06, "loss": 0.1481, "num_input_tokens_seen": 3316224, "step": 6745 }, { "epoch": 0.8908538999604065, "grad_norm": 108.66929626464844, "learning_rate": 1.9630382428113416e-06, "loss": 0.0998, "num_input_tokens_seen": 3318464, "step": 6750 }, { "epoch": 0.8915137917381549, "grad_norm": 0.1910923272371292, "learning_rate": 1.962914044803478e-06, "loss": 0.0018, "num_input_tokens_seen": 3320896, "step": 6755 }, { "epoch": 0.8921736835159034, "grad_norm": 0.1292416900396347, "learning_rate": 1.9627896424240814e-06, "loss": 0.1516, "num_input_tokens_seen": 3323648, "step": 6760 }, { "epoch": 0.8928335752936518, "grad_norm": 21.599706649780273, "learning_rate": 1.9626650356995545e-06, "loss": 0.2309, "num_input_tokens_seen": 3326208, "step": 6765 }, { "epoch": 0.8934934670714003, "grad_norm": 0.48037469387054443, "learning_rate": 1.9625402246563456e-06, "loss": 0.1373, "num_input_tokens_seen": 3328576, "step": 6770 }, { "epoch": 0.8941533588491487, "grad_norm": 0.28065234422683716, "learning_rate": 1.962415209320944e-06, "loss": 0.0354, "num_input_tokens_seen": 3331520, "step": 6775 }, { "epoch": 0.8948132506268972, "grad_norm": 0.31453850865364075, "learning_rate": 1.9622899897198834e-06, "loss": 0.0489, "num_input_tokens_seen": 3334336, "step": 6780 }, { "epoch": 0.8954731424046456, "grad_norm": 69.05518341064453, "learning_rate": 1.962164565879741e-06, "loss": 0.1136, "num_input_tokens_seen": 3336896, "step": 6785 }, { "epoch": 0.8961330341823941, "grad_norm": 16.333518981933594, "learning_rate": 1.9620389378271363e-06, "loss": 0.1573, "num_input_tokens_seen": 3339328, "step": 6790 }, { "epoch": 0.8967929259601425, "grad_norm": 0.5106508135795593, "learning_rate": 1.9619131055887343e-06, "loss": 0.0079, "num_input_tokens_seen": 3341760, "step": 6795 }, { "epoch": 0.897452817737891, "grad_norm": 0.23962059617042542, "learning_rate": 1.961787069191241e-06, "loss": 0.1041, "num_input_tokens_seen": 3344448, "step": 6800 }, { "epoch": 0.8981127095156395, "grad_norm": 0.08097890764474869, "learning_rate": 1.9616608286614065e-06, "loss": 0.0233, "num_input_tokens_seen": 3347008, "step": 6805 }, { "epoch": 0.8987726012933879, "grad_norm": 0.9426233768463135, "learning_rate": 1.9615343840260255e-06, "loss": 0.0408, "num_input_tokens_seen": 3349824, "step": 6810 }, { "epoch": 0.8994324930711364, "grad_norm": 0.28335756063461304, "learning_rate": 1.9614077353119345e-06, "loss": 0.0705, "num_input_tokens_seen": 3352320, "step": 6815 }, { "epoch": 0.9000923848488848, "grad_norm": 0.1531563252210617, "learning_rate": 1.961280882546013e-06, "loss": 0.0009, "num_input_tokens_seen": 3354688, "step": 6820 }, { "epoch": 0.9007522766266333, "grad_norm": 0.014378640800714493, "learning_rate": 1.961153825755186e-06, "loss": 0.0692, "num_input_tokens_seen": 3357056, "step": 6825 }, { "epoch": 0.9014121684043817, "grad_norm": 0.24212691187858582, "learning_rate": 1.961026564966419e-06, "loss": 0.0761, "num_input_tokens_seen": 3359488, "step": 6830 }, { "epoch": 0.9020720601821302, "grad_norm": 0.04449746385216713, "learning_rate": 1.9608991002067233e-06, "loss": 0.3297, "num_input_tokens_seen": 3361920, "step": 6835 }, { "epoch": 0.9027319519598785, "grad_norm": 0.47794926166534424, "learning_rate": 1.9607714315031513e-06, "loss": 0.0016, "num_input_tokens_seen": 3364416, "step": 6840 }, { "epoch": 0.903391843737627, "grad_norm": 0.08138076961040497, "learning_rate": 1.9606435588828008e-06, "loss": 0.1103, "num_input_tokens_seen": 3366912, "step": 6845 }, { "epoch": 0.9040517355153754, "grad_norm": 0.005542756523936987, "learning_rate": 1.960515482372811e-06, "loss": 0.0008, "num_input_tokens_seen": 3369088, "step": 6850 }, { "epoch": 0.9047116272931239, "grad_norm": 0.005087740253657103, "learning_rate": 1.960387202000366e-06, "loss": 0.2938, "num_input_tokens_seen": 3371520, "step": 6855 }, { "epoch": 0.9053715190708723, "grad_norm": 0.011929133906960487, "learning_rate": 1.9602587177926913e-06, "loss": 0.0004, "num_input_tokens_seen": 3374080, "step": 6860 }, { "epoch": 0.9060314108486208, "grad_norm": 16.993408203125, "learning_rate": 1.960130029777058e-06, "loss": 0.0758, "num_input_tokens_seen": 3376640, "step": 6865 }, { "epoch": 0.9066913026263693, "grad_norm": 0.02640739642083645, "learning_rate": 1.9600011379807783e-06, "loss": 0.0005, "num_input_tokens_seen": 3379072, "step": 6870 }, { "epoch": 0.9073511944041177, "grad_norm": 5.115023612976074, "learning_rate": 1.9598720424312093e-06, "loss": 0.05, "num_input_tokens_seen": 3381696, "step": 6875 }, { "epoch": 0.9080110861818662, "grad_norm": 11.82519817352295, "learning_rate": 1.9597427431557497e-06, "loss": 0.317, "num_input_tokens_seen": 3384064, "step": 6880 }, { "epoch": 0.9086709779596146, "grad_norm": 0.007930797524750233, "learning_rate": 1.9596132401818427e-06, "loss": 0.1413, "num_input_tokens_seen": 3386304, "step": 6885 }, { "epoch": 0.9093308697373631, "grad_norm": 0.23658303916454315, "learning_rate": 1.9594835335369748e-06, "loss": 0.078, "num_input_tokens_seen": 3388800, "step": 6890 }, { "epoch": 0.9099907615151115, "grad_norm": 0.07035399973392487, "learning_rate": 1.9593536232486747e-06, "loss": 0.1664, "num_input_tokens_seen": 3391232, "step": 6895 }, { "epoch": 0.91065065329286, "grad_norm": 20.818607330322266, "learning_rate": 1.9592235093445153e-06, "loss": 0.0852, "num_input_tokens_seen": 3393664, "step": 6900 }, { "epoch": 0.9113105450706084, "grad_norm": 57.55811309814453, "learning_rate": 1.959093191852112e-06, "loss": 0.1319, "num_input_tokens_seen": 3395968, "step": 6905 }, { "epoch": 0.9119704368483569, "grad_norm": 0.08466242253780365, "learning_rate": 1.958962670799124e-06, "loss": 0.1763, "num_input_tokens_seen": 3398272, "step": 6910 }, { "epoch": 0.9126303286261053, "grad_norm": 12.08685302734375, "learning_rate": 1.9588319462132535e-06, "loss": 0.2054, "num_input_tokens_seen": 3400960, "step": 6915 }, { "epoch": 0.9132902204038538, "grad_norm": 77.93342590332031, "learning_rate": 1.9587010181222456e-06, "loss": 0.2306, "num_input_tokens_seen": 3403520, "step": 6920 }, { "epoch": 0.9139501121816023, "grad_norm": 480.17840576171875, "learning_rate": 1.9585698865538892e-06, "loss": 0.2867, "num_input_tokens_seen": 3405952, "step": 6925 }, { "epoch": 0.9146100039593507, "grad_norm": 68.2589111328125, "learning_rate": 1.9584385515360155e-06, "loss": 0.1133, "num_input_tokens_seen": 3408320, "step": 6930 }, { "epoch": 0.9152698957370992, "grad_norm": 0.36150798201560974, "learning_rate": 1.9583070130965e-06, "loss": 0.0866, "num_input_tokens_seen": 3410880, "step": 6935 }, { "epoch": 0.9159297875148475, "grad_norm": 0.438012033700943, "learning_rate": 1.95817527126326e-06, "loss": 0.0082, "num_input_tokens_seen": 3413440, "step": 6940 }, { "epoch": 0.916589679292596, "grad_norm": 2.9928126335144043, "learning_rate": 1.9580433260642576e-06, "loss": 0.1116, "num_input_tokens_seen": 3416000, "step": 6945 }, { "epoch": 0.9172495710703444, "grad_norm": 13.603625297546387, "learning_rate": 1.9579111775274967e-06, "loss": 0.1138, "num_input_tokens_seen": 3418240, "step": 6950 }, { "epoch": 0.9179094628480929, "grad_norm": 0.3827645778656006, "learning_rate": 1.957778825681025e-06, "loss": 0.1691, "num_input_tokens_seen": 3420672, "step": 6955 }, { "epoch": 0.9185693546258413, "grad_norm": 7.339386463165283, "learning_rate": 1.9576462705529334e-06, "loss": 0.0336, "num_input_tokens_seen": 3422912, "step": 6960 }, { "epoch": 0.9192292464035898, "grad_norm": 16.184080123901367, "learning_rate": 1.9575135121713554e-06, "loss": 0.0039, "num_input_tokens_seen": 3425408, "step": 6965 }, { "epoch": 0.9198891381813382, "grad_norm": 18.327194213867188, "learning_rate": 1.9573805505644687e-06, "loss": 0.0885, "num_input_tokens_seen": 3427776, "step": 6970 }, { "epoch": 0.9205490299590867, "grad_norm": 15.888535499572754, "learning_rate": 1.9572473857604924e-06, "loss": 0.1885, "num_input_tokens_seen": 3430336, "step": 6975 }, { "epoch": 0.9212089217368351, "grad_norm": 52.742218017578125, "learning_rate": 1.9571140177876904e-06, "loss": 0.2446, "num_input_tokens_seen": 3432896, "step": 6980 }, { "epoch": 0.9218688135145836, "grad_norm": 0.11979825794696808, "learning_rate": 1.956980446674369e-06, "loss": 0.0608, "num_input_tokens_seen": 3435136, "step": 6985 }, { "epoch": 0.9225287052923321, "grad_norm": 1.5458872318267822, "learning_rate": 1.9568466724488783e-06, "loss": 0.0706, "num_input_tokens_seen": 3437824, "step": 6990 }, { "epoch": 0.9231885970700805, "grad_norm": 0.07339257746934891, "learning_rate": 1.95671269513961e-06, "loss": 0.0195, "num_input_tokens_seen": 3440320, "step": 6995 }, { "epoch": 0.923848488847829, "grad_norm": 0.3749296963214874, "learning_rate": 1.9565785147749994e-06, "loss": 0.1083, "num_input_tokens_seen": 3442880, "step": 7000 }, { "epoch": 0.9245083806255774, "grad_norm": 0.0999370887875557, "learning_rate": 1.956444131383527e-06, "loss": 0.0438, "num_input_tokens_seen": 3445120, "step": 7005 }, { "epoch": 0.9251682724033259, "grad_norm": 0.2530590891838074, "learning_rate": 1.9563095449937133e-06, "loss": 0.1449, "num_input_tokens_seen": 3447424, "step": 7010 }, { "epoch": 0.9258281641810743, "grad_norm": 0.015032319352030754, "learning_rate": 1.9561747556341236e-06, "loss": 0.0746, "num_input_tokens_seen": 3449920, "step": 7015 }, { "epoch": 0.9264880559588228, "grad_norm": 1.8766849040985107, "learning_rate": 1.9560397633333663e-06, "loss": 0.0844, "num_input_tokens_seen": 3452416, "step": 7020 }, { "epoch": 0.9271479477365712, "grad_norm": 15.20991039276123, "learning_rate": 1.955904568120092e-06, "loss": 0.1329, "num_input_tokens_seen": 3454912, "step": 7025 }, { "epoch": 0.9278078395143197, "grad_norm": 0.08727604895830154, "learning_rate": 1.955769170022996e-06, "loss": 0.0823, "num_input_tokens_seen": 3457472, "step": 7030 }, { "epoch": 0.928467731292068, "grad_norm": 0.3139757215976715, "learning_rate": 1.955633569070814e-06, "loss": 0.0728, "num_input_tokens_seen": 3459712, "step": 7035 }, { "epoch": 0.9291276230698166, "grad_norm": 0.4277566373348236, "learning_rate": 1.9554977652923276e-06, "loss": 0.1126, "num_input_tokens_seen": 3462144, "step": 7040 }, { "epoch": 0.9297875148475649, "grad_norm": 12.390519142150879, "learning_rate": 1.9553617587163594e-06, "loss": 0.2659, "num_input_tokens_seen": 3464512, "step": 7045 }, { "epoch": 0.9304474066253134, "grad_norm": 0.03325313329696655, "learning_rate": 1.955225549371776e-06, "loss": 0.0937, "num_input_tokens_seen": 3466880, "step": 7050 }, { "epoch": 0.931107298403062, "grad_norm": 0.5559648871421814, "learning_rate": 1.9550891372874872e-06, "loss": 0.1008, "num_input_tokens_seen": 3469248, "step": 7055 }, { "epoch": 0.9317671901808103, "grad_norm": 38.42670440673828, "learning_rate": 1.9549525224924453e-06, "loss": 0.2362, "num_input_tokens_seen": 3471616, "step": 7060 }, { "epoch": 0.9324270819585588, "grad_norm": 1.161107063293457, "learning_rate": 1.9548157050156456e-06, "loss": 0.0761, "num_input_tokens_seen": 3474240, "step": 7065 }, { "epoch": 0.9330869737363072, "grad_norm": 0.5504103899002075, "learning_rate": 1.9546786848861268e-06, "loss": 0.0566, "num_input_tokens_seen": 3476800, "step": 7070 }, { "epoch": 0.9337468655140557, "grad_norm": 0.2676885724067688, "learning_rate": 1.95454146213297e-06, "loss": 0.091, "num_input_tokens_seen": 3479488, "step": 7075 }, { "epoch": 0.9344067572918041, "grad_norm": 0.022085441276431084, "learning_rate": 1.954404036785301e-06, "loss": 0.0015, "num_input_tokens_seen": 3482176, "step": 7080 }, { "epoch": 0.9350666490695526, "grad_norm": 0.4007412791252136, "learning_rate": 1.9542664088722857e-06, "loss": 0.078, "num_input_tokens_seen": 3484800, "step": 7085 }, { "epoch": 0.935726540847301, "grad_norm": 0.11671361327171326, "learning_rate": 1.9541285784231355e-06, "loss": 0.1244, "num_input_tokens_seen": 3487488, "step": 7090 }, { "epoch": 0.9363864326250495, "grad_norm": 12.453141212463379, "learning_rate": 1.9539905454671037e-06, "loss": 0.3198, "num_input_tokens_seen": 3489728, "step": 7095 }, { "epoch": 0.9370463244027979, "grad_norm": 0.04366743937134743, "learning_rate": 1.953852310033487e-06, "loss": 0.2493, "num_input_tokens_seen": 3491904, "step": 7100 }, { "epoch": 0.9377062161805464, "grad_norm": 51.18413162231445, "learning_rate": 1.9537138721516248e-06, "loss": 0.059, "num_input_tokens_seen": 3494592, "step": 7105 }, { "epoch": 0.9383661079582949, "grad_norm": 0.18797965347766876, "learning_rate": 1.9535752318508995e-06, "loss": 0.1345, "num_input_tokens_seen": 3497088, "step": 7110 }, { "epoch": 0.9390259997360433, "grad_norm": 0.30258113145828247, "learning_rate": 1.9534363891607363e-06, "loss": 0.0865, "num_input_tokens_seen": 3499520, "step": 7115 }, { "epoch": 0.9396858915137918, "grad_norm": 0.14273835718631744, "learning_rate": 1.953297344110604e-06, "loss": 0.1349, "num_input_tokens_seen": 3502208, "step": 7120 }, { "epoch": 0.9403457832915402, "grad_norm": 11.008658409118652, "learning_rate": 1.9531580967300135e-06, "loss": 0.1946, "num_input_tokens_seen": 3504640, "step": 7125 }, { "epoch": 0.9410056750692887, "grad_norm": 2.5081124305725098, "learning_rate": 1.953018647048519e-06, "loss": 0.0946, "num_input_tokens_seen": 3506944, "step": 7130 }, { "epoch": 0.9416655668470371, "grad_norm": 20.469833374023438, "learning_rate": 1.9528789950957182e-06, "loss": 0.2065, "num_input_tokens_seen": 3509376, "step": 7135 }, { "epoch": 0.9423254586247856, "grad_norm": 23.552812576293945, "learning_rate": 1.9527391409012507e-06, "loss": 0.096, "num_input_tokens_seen": 3511680, "step": 7140 }, { "epoch": 0.942985350402534, "grad_norm": 251.3920440673828, "learning_rate": 1.9525990844948e-06, "loss": 0.0617, "num_input_tokens_seen": 3514112, "step": 7145 }, { "epoch": 0.9436452421802825, "grad_norm": 0.5825870633125305, "learning_rate": 1.952458825906092e-06, "loss": 0.0848, "num_input_tokens_seen": 3516480, "step": 7150 }, { "epoch": 0.9443051339580308, "grad_norm": 0.27023938298225403, "learning_rate": 1.952318365164895e-06, "loss": 0.1279, "num_input_tokens_seen": 3518720, "step": 7155 }, { "epoch": 0.9449650257357793, "grad_norm": 0.17718826234340668, "learning_rate": 1.952177702301021e-06, "loss": 0.0011, "num_input_tokens_seen": 3521216, "step": 7160 }, { "epoch": 0.9456249175135277, "grad_norm": 13.897564888000488, "learning_rate": 1.9520368373443246e-06, "loss": 0.2427, "num_input_tokens_seen": 3523776, "step": 7165 }, { "epoch": 0.9462848092912762, "grad_norm": 117.28556823730469, "learning_rate": 1.951895770324704e-06, "loss": 0.0515, "num_input_tokens_seen": 3526272, "step": 7170 }, { "epoch": 0.9469447010690247, "grad_norm": 1.6187975406646729, "learning_rate": 1.9517545012720993e-06, "loss": 0.1211, "num_input_tokens_seen": 3528576, "step": 7175 }, { "epoch": 0.9476045928467731, "grad_norm": 1.0781744718551636, "learning_rate": 1.9516130302164937e-06, "loss": 0.0018, "num_input_tokens_seen": 3531136, "step": 7180 }, { "epoch": 0.9482644846245216, "grad_norm": 13.055898666381836, "learning_rate": 1.9514713571879135e-06, "loss": 0.2951, "num_input_tokens_seen": 3533696, "step": 7185 }, { "epoch": 0.94892437640227, "grad_norm": 0.1939694732427597, "learning_rate": 1.9513294822164274e-06, "loss": 0.0036, "num_input_tokens_seen": 3536064, "step": 7190 }, { "epoch": 0.9495842681800185, "grad_norm": 1.0631992816925049, "learning_rate": 1.9511874053321483e-06, "loss": 0.0063, "num_input_tokens_seen": 3538432, "step": 7195 }, { "epoch": 0.9502441599577669, "grad_norm": 0.25243115425109863, "learning_rate": 1.95104512656523e-06, "loss": 0.0012, "num_input_tokens_seen": 3541120, "step": 7200 }, { "epoch": 0.9509040517355154, "grad_norm": 60.04290771484375, "learning_rate": 1.9509026459458702e-06, "loss": 0.0973, "num_input_tokens_seen": 3543680, "step": 7205 }, { "epoch": 0.9515639435132638, "grad_norm": 40.85661697387695, "learning_rate": 1.95075996350431e-06, "loss": 0.2608, "num_input_tokens_seen": 3545984, "step": 7210 }, { "epoch": 0.9522238352910123, "grad_norm": 14.948753356933594, "learning_rate": 1.9506170792708327e-06, "loss": 0.0943, "num_input_tokens_seen": 3548544, "step": 7215 }, { "epoch": 0.9528837270687607, "grad_norm": 0.013944868929684162, "learning_rate": 1.950473993275764e-06, "loss": 0.0524, "num_input_tokens_seen": 3551040, "step": 7220 }, { "epoch": 0.9535436188465092, "grad_norm": 0.06707829982042313, "learning_rate": 1.950330705549473e-06, "loss": 0.1268, "num_input_tokens_seen": 3553536, "step": 7225 }, { "epoch": 0.9542035106242576, "grad_norm": 0.03719603642821312, "learning_rate": 1.950187216122371e-06, "loss": 0.1104, "num_input_tokens_seen": 3555712, "step": 7230 }, { "epoch": 0.9548634024020061, "grad_norm": 0.2717171609401703, "learning_rate": 1.9500435250249136e-06, "loss": 0.1443, "num_input_tokens_seen": 3558080, "step": 7235 }, { "epoch": 0.9555232941797546, "grad_norm": 133.826171875, "learning_rate": 1.949899632287598e-06, "loss": 0.1269, "num_input_tokens_seen": 3560640, "step": 7240 }, { "epoch": 0.956183185957503, "grad_norm": 38.52214431762695, "learning_rate": 1.9497555379409633e-06, "loss": 0.0389, "num_input_tokens_seen": 3563392, "step": 7245 }, { "epoch": 0.9568430777352515, "grad_norm": 14.870156288146973, "learning_rate": 1.9496112420155937e-06, "loss": 0.309, "num_input_tokens_seen": 3565824, "step": 7250 }, { "epoch": 0.9575029695129998, "grad_norm": 0.18774457275867462, "learning_rate": 1.949466744542115e-06, "loss": 0.0023, "num_input_tokens_seen": 3568256, "step": 7255 }, { "epoch": 0.9581628612907483, "grad_norm": 19.96299171447754, "learning_rate": 1.9493220455511943e-06, "loss": 0.0659, "num_input_tokens_seen": 3570752, "step": 7260 }, { "epoch": 0.9588227530684967, "grad_norm": 7.811553955078125, "learning_rate": 1.9491771450735444e-06, "loss": 0.1136, "num_input_tokens_seen": 3572928, "step": 7265 }, { "epoch": 0.9594826448462452, "grad_norm": 25.050922393798828, "learning_rate": 1.9490320431399186e-06, "loss": 0.2459, "num_input_tokens_seen": 3575296, "step": 7270 }, { "epoch": 0.9601425366239936, "grad_norm": 0.148186594247818, "learning_rate": 1.9488867397811143e-06, "loss": 0.0014, "num_input_tokens_seen": 3577664, "step": 7275 }, { "epoch": 0.9608024284017421, "grad_norm": 0.40033382177352905, "learning_rate": 1.948741235027971e-06, "loss": 0.1765, "num_input_tokens_seen": 3580160, "step": 7280 }, { "epoch": 0.9614623201794905, "grad_norm": 87.95939636230469, "learning_rate": 1.9485955289113703e-06, "loss": 0.1181, "num_input_tokens_seen": 3582464, "step": 7285 }, { "epoch": 0.962122211957239, "grad_norm": 1.9860565662384033, "learning_rate": 1.9484496214622375e-06, "loss": 0.1001, "num_input_tokens_seen": 3584896, "step": 7290 }, { "epoch": 0.9627821037349875, "grad_norm": 0.29375576972961426, "learning_rate": 1.9483035127115416e-06, "loss": 0.0035, "num_input_tokens_seen": 3587584, "step": 7295 }, { "epoch": 0.9634419955127359, "grad_norm": 14.525373458862305, "learning_rate": 1.948157202690292e-06, "loss": 0.149, "num_input_tokens_seen": 3590144, "step": 7300 }, { "epoch": 0.9641018872904844, "grad_norm": 0.25842082500457764, "learning_rate": 1.9480106914295416e-06, "loss": 0.0409, "num_input_tokens_seen": 3592832, "step": 7305 }, { "epoch": 0.9647617790682328, "grad_norm": 0.6491772532463074, "learning_rate": 1.947863978960387e-06, "loss": 0.1264, "num_input_tokens_seen": 3595456, "step": 7310 }, { "epoch": 0.9654216708459813, "grad_norm": 0.36299997568130493, "learning_rate": 1.947717065313967e-06, "loss": 0.1478, "num_input_tokens_seen": 3597888, "step": 7315 }, { "epoch": 0.9660815626237297, "grad_norm": 18.739669799804688, "learning_rate": 1.9475699505214625e-06, "loss": 0.0841, "num_input_tokens_seen": 3600384, "step": 7320 }, { "epoch": 0.9667414544014782, "grad_norm": 13.522602081298828, "learning_rate": 1.947422634614098e-06, "loss": 0.183, "num_input_tokens_seen": 3602880, "step": 7325 }, { "epoch": 0.9674013461792266, "grad_norm": 0.14512999355793, "learning_rate": 1.94727511762314e-06, "loss": 0.0148, "num_input_tokens_seen": 3605248, "step": 7330 }, { "epoch": 0.9680612379569751, "grad_norm": 0.26263144612312317, "learning_rate": 1.9471273995798977e-06, "loss": 0.0009, "num_input_tokens_seen": 3607808, "step": 7335 }, { "epoch": 0.9687211297347235, "grad_norm": 12.533109664916992, "learning_rate": 1.9469794805157235e-06, "loss": 0.2154, "num_input_tokens_seen": 3610112, "step": 7340 }, { "epoch": 0.969381021512472, "grad_norm": 24.23501968383789, "learning_rate": 1.946831360462012e-06, "loss": 0.1076, "num_input_tokens_seen": 3612352, "step": 7345 }, { "epoch": 0.9700409132902204, "grad_norm": 15.151259422302246, "learning_rate": 1.946683039450201e-06, "loss": 0.184, "num_input_tokens_seen": 3614848, "step": 7350 }, { "epoch": 0.9707008050679689, "grad_norm": 0.8318037390708923, "learning_rate": 1.9465345175117698e-06, "loss": 0.0021, "num_input_tokens_seen": 3617408, "step": 7355 }, { "epoch": 0.9713606968457174, "grad_norm": 223.77159118652344, "learning_rate": 1.9463857946782418e-06, "loss": 0.1402, "num_input_tokens_seen": 3619968, "step": 7360 }, { "epoch": 0.9720205886234657, "grad_norm": 2.1519546508789062, "learning_rate": 1.9462368709811816e-06, "loss": 0.1302, "num_input_tokens_seen": 3622016, "step": 7365 }, { "epoch": 0.9726804804012142, "grad_norm": 13.423410415649414, "learning_rate": 1.946087746452198e-06, "loss": 0.115, "num_input_tokens_seen": 3624192, "step": 7370 }, { "epoch": 0.9733403721789626, "grad_norm": 0.11041421443223953, "learning_rate": 1.945938421122941e-06, "loss": 0.0111, "num_input_tokens_seen": 3626624, "step": 7375 }, { "epoch": 0.9740002639567111, "grad_norm": 14.675050735473633, "learning_rate": 1.9457888950251045e-06, "loss": 0.1937, "num_input_tokens_seen": 3628928, "step": 7380 }, { "epoch": 0.9746601557344595, "grad_norm": 0.20445093512535095, "learning_rate": 1.9456391681904234e-06, "loss": 0.1085, "num_input_tokens_seen": 3631552, "step": 7385 }, { "epoch": 0.975320047512208, "grad_norm": 15.725769996643066, "learning_rate": 1.9454892406506774e-06, "loss": 0.078, "num_input_tokens_seen": 3633984, "step": 7390 }, { "epoch": 0.9759799392899564, "grad_norm": 1.7748395204544067, "learning_rate": 1.945339112437686e-06, "loss": 0.2813, "num_input_tokens_seen": 3636224, "step": 7395 }, { "epoch": 0.9766398310677049, "grad_norm": 0.23947864770889282, "learning_rate": 1.945188783583314e-06, "loss": 0.1583, "num_input_tokens_seen": 3638656, "step": 7400 }, { "epoch": 0.9772997228454533, "grad_norm": 0.18196658790111542, "learning_rate": 1.945038254119467e-06, "loss": 0.0369, "num_input_tokens_seen": 3641408, "step": 7405 }, { "epoch": 0.9779596146232018, "grad_norm": 13.365228652954102, "learning_rate": 1.944887524078094e-06, "loss": 0.133, "num_input_tokens_seen": 3643840, "step": 7410 }, { "epoch": 0.9786195064009502, "grad_norm": 0.32755666971206665, "learning_rate": 1.9447365934911862e-06, "loss": 0.058, "num_input_tokens_seen": 3646336, "step": 7415 }, { "epoch": 0.9792793981786987, "grad_norm": 0.40165284276008606, "learning_rate": 1.944585462390778e-06, "loss": 0.086, "num_input_tokens_seen": 3648960, "step": 7420 }, { "epoch": 0.9799392899564472, "grad_norm": 24.77637481689453, "learning_rate": 1.9444341308089456e-06, "loss": 0.0681, "num_input_tokens_seen": 3651200, "step": 7425 }, { "epoch": 0.9805991817341956, "grad_norm": 0.17811863124370575, "learning_rate": 1.944282598777808e-06, "loss": 0.0279, "num_input_tokens_seen": 3653504, "step": 7430 }, { "epoch": 0.9812590735119441, "grad_norm": 24.5549373626709, "learning_rate": 1.9441308663295264e-06, "loss": 0.3589, "num_input_tokens_seen": 3656064, "step": 7435 }, { "epoch": 0.9819189652896925, "grad_norm": 44.16228485107422, "learning_rate": 1.9439789334963055e-06, "loss": 0.4366, "num_input_tokens_seen": 3658112, "step": 7440 }, { "epoch": 0.982578857067441, "grad_norm": 30.133174896240234, "learning_rate": 1.9438268003103916e-06, "loss": 0.1661, "num_input_tokens_seen": 3660928, "step": 7445 }, { "epoch": 0.9832387488451894, "grad_norm": 0.4015233814716339, "learning_rate": 1.943674466804074e-06, "loss": 0.1425, "num_input_tokens_seen": 3663232, "step": 7450 }, { "epoch": 0.9838986406229379, "grad_norm": 0.3251698315143585, "learning_rate": 1.9435219330096845e-06, "loss": 0.0064, "num_input_tokens_seen": 3665600, "step": 7455 }, { "epoch": 0.9845585324006862, "grad_norm": 0.10743151605129242, "learning_rate": 1.9433691989595975e-06, "loss": 0.0393, "num_input_tokens_seen": 3668096, "step": 7460 }, { "epoch": 0.9852184241784347, "grad_norm": 0.22793497145175934, "learning_rate": 1.943216264686229e-06, "loss": 0.0312, "num_input_tokens_seen": 3670656, "step": 7465 }, { "epoch": 0.9858783159561831, "grad_norm": 0.06277221441268921, "learning_rate": 1.943063130222038e-06, "loss": 0.157, "num_input_tokens_seen": 3673024, "step": 7470 }, { "epoch": 0.9865382077339316, "grad_norm": 0.04624286666512489, "learning_rate": 1.9429097955995275e-06, "loss": 0.0569, "num_input_tokens_seen": 3675712, "step": 7475 }, { "epoch": 0.9871980995116801, "grad_norm": 0.07568157464265823, "learning_rate": 1.9427562608512406e-06, "loss": 0.1377, "num_input_tokens_seen": 3678080, "step": 7480 }, { "epoch": 0.9878579912894285, "grad_norm": 28.12094497680664, "learning_rate": 1.9426025260097645e-06, "loss": 0.2043, "num_input_tokens_seen": 3680448, "step": 7485 }, { "epoch": 0.988517883067177, "grad_norm": 24.91111946105957, "learning_rate": 1.9424485911077278e-06, "loss": 0.0405, "num_input_tokens_seen": 3682752, "step": 7490 }, { "epoch": 0.9891777748449254, "grad_norm": 13.07931137084961, "learning_rate": 1.9422944561778026e-06, "loss": 0.1633, "num_input_tokens_seen": 3685376, "step": 7495 }, { "epoch": 0.9898376666226739, "grad_norm": 0.2622494697570801, "learning_rate": 1.9421401212527023e-06, "loss": 0.0496, "num_input_tokens_seen": 3687744, "step": 7500 }, { "epoch": 0.9904975584004223, "grad_norm": 33.244964599609375, "learning_rate": 1.9419855863651837e-06, "loss": 0.1828, "num_input_tokens_seen": 3690240, "step": 7505 }, { "epoch": 0.9911574501781708, "grad_norm": 0.11383026838302612, "learning_rate": 1.941830851548046e-06, "loss": 0.1345, "num_input_tokens_seen": 3692736, "step": 7510 }, { "epoch": 0.9918173419559192, "grad_norm": 10.269845008850098, "learning_rate": 1.94167591683413e-06, "loss": 0.1963, "num_input_tokens_seen": 3695360, "step": 7515 }, { "epoch": 0.9924772337336677, "grad_norm": 0.6313570737838745, "learning_rate": 1.94152078225632e-06, "loss": 0.0683, "num_input_tokens_seen": 3697856, "step": 7520 }, { "epoch": 0.9931371255114161, "grad_norm": 9.661954879760742, "learning_rate": 1.9413654478475415e-06, "loss": 0.0825, "num_input_tokens_seen": 3700224, "step": 7525 }, { "epoch": 0.9937970172891646, "grad_norm": 15.635941505432129, "learning_rate": 1.941209913640764e-06, "loss": 0.1488, "num_input_tokens_seen": 3702592, "step": 7530 }, { "epoch": 0.994456909066913, "grad_norm": 0.36205700039863586, "learning_rate": 1.9410541796689975e-06, "loss": 0.1055, "num_input_tokens_seen": 3704896, "step": 7535 }, { "epoch": 0.9951168008446615, "grad_norm": 1.9162219762802124, "learning_rate": 1.9408982459652963e-06, "loss": 0.0121, "num_input_tokens_seen": 3707264, "step": 7540 }, { "epoch": 0.99577669262241, "grad_norm": 13.23646354675293, "learning_rate": 1.940742112562756e-06, "loss": 0.0858, "num_input_tokens_seen": 3709824, "step": 7545 }, { "epoch": 0.9964365844001584, "grad_norm": 0.05299551039934158, "learning_rate": 1.9405857794945142e-06, "loss": 0.1293, "num_input_tokens_seen": 3712192, "step": 7550 }, { "epoch": 0.9970964761779069, "grad_norm": 118.45269775390625, "learning_rate": 1.9404292467937525e-06, "loss": 0.0699, "num_input_tokens_seen": 3714880, "step": 7555 }, { "epoch": 0.9977563679556553, "grad_norm": 4.233432769775391, "learning_rate": 1.9402725144936926e-06, "loss": 0.0584, "num_input_tokens_seen": 3717184, "step": 7560 }, { "epoch": 0.9984162597334038, "grad_norm": 0.11062260717153549, "learning_rate": 1.940115582627601e-06, "loss": 0.0402, "num_input_tokens_seen": 3719424, "step": 7565 }, { "epoch": 0.9990761515111521, "grad_norm": 0.0450112447142601, "learning_rate": 1.9399584512287842e-06, "loss": 0.0668, "num_input_tokens_seen": 3721920, "step": 7570 }, { "epoch": 0.9997360432889006, "grad_norm": 0.37227654457092285, "learning_rate": 1.939801120330593e-06, "loss": 0.123, "num_input_tokens_seen": 3724288, "step": 7575 }, { "epoch": 1.0003959350666491, "grad_norm": 0.26726233959198, "learning_rate": 1.9396435899664198e-06, "loss": 0.0006, "num_input_tokens_seen": 3726464, "step": 7580 }, { "epoch": 1.0003959350666491, "eval_loss": 0.11427787691354752, "eval_runtime": 7.7746, "eval_samples_per_second": 866.287, "eval_steps_per_second": 108.302, "num_input_tokens_seen": 3726464, "step": 7580 }, { "epoch": 1.0010558268443974, "grad_norm": 0.03733626753091812, "learning_rate": 1.9394858601696986e-06, "loss": 0.0614, "num_input_tokens_seen": 3728960, "step": 7585 }, { "epoch": 1.001715718622146, "grad_norm": 4.338964462280273, "learning_rate": 1.9393279309739067e-06, "loss": 0.0011, "num_input_tokens_seen": 3731648, "step": 7590 }, { "epoch": 1.0023756103998944, "grad_norm": 0.05182173103094101, "learning_rate": 1.939169802412564e-06, "loss": 0.0326, "num_input_tokens_seen": 3734144, "step": 7595 }, { "epoch": 1.003035502177643, "grad_norm": 0.3849523663520813, "learning_rate": 1.939011474519231e-06, "loss": 0.1584, "num_input_tokens_seen": 3736704, "step": 7600 }, { "epoch": 1.0036953939553914, "grad_norm": 0.08171097934246063, "learning_rate": 1.938852947327513e-06, "loss": 0.1347, "num_input_tokens_seen": 3739328, "step": 7605 }, { "epoch": 1.0043552857331397, "grad_norm": 14.762300491333008, "learning_rate": 1.938694220871055e-06, "loss": 0.1089, "num_input_tokens_seen": 3741760, "step": 7610 }, { "epoch": 1.0050151775108882, "grad_norm": 0.04276171326637268, "learning_rate": 1.938535295183547e-06, "loss": 0.0551, "num_input_tokens_seen": 3744384, "step": 7615 }, { "epoch": 1.0056750692886367, "grad_norm": 0.042462147772312164, "learning_rate": 1.938376170298718e-06, "loss": 0.1567, "num_input_tokens_seen": 3746816, "step": 7620 }, { "epoch": 1.0063349610663852, "grad_norm": 0.3363673686981201, "learning_rate": 1.9382168462503425e-06, "loss": 0.0361, "num_input_tokens_seen": 3749504, "step": 7625 }, { "epoch": 1.0069948528441335, "grad_norm": 0.15293100476264954, "learning_rate": 1.9380573230722354e-06, "loss": 0.052, "num_input_tokens_seen": 3751936, "step": 7630 }, { "epoch": 1.007654744621882, "grad_norm": 0.10599560290575027, "learning_rate": 1.9378976007982543e-06, "loss": 0.0017, "num_input_tokens_seen": 3754368, "step": 7635 }, { "epoch": 1.0083146363996305, "grad_norm": 29.61864471435547, "learning_rate": 1.9377376794622992e-06, "loss": 0.0029, "num_input_tokens_seen": 3756736, "step": 7640 }, { "epoch": 1.008974528177379, "grad_norm": 2.77350115776062, "learning_rate": 1.937577559098312e-06, "loss": 0.0945, "num_input_tokens_seen": 3759360, "step": 7645 }, { "epoch": 1.0096344199551273, "grad_norm": 0.023661043494939804, "learning_rate": 1.9374172397402774e-06, "loss": 0.0614, "num_input_tokens_seen": 3761536, "step": 7650 }, { "epoch": 1.0102943117328758, "grad_norm": 21.96059799194336, "learning_rate": 1.937256721422222e-06, "loss": 0.1333, "num_input_tokens_seen": 3763968, "step": 7655 }, { "epoch": 1.0109542035106243, "grad_norm": 16.02443504333496, "learning_rate": 1.9370960041782144e-06, "loss": 0.1473, "num_input_tokens_seen": 3766336, "step": 7660 }, { "epoch": 1.0116140952883728, "grad_norm": 0.16665181517601013, "learning_rate": 1.936935088042366e-06, "loss": 0.1232, "num_input_tokens_seen": 3768832, "step": 7665 }, { "epoch": 1.0122739870661213, "grad_norm": 0.06876052170991898, "learning_rate": 1.9367739730488295e-06, "loss": 0.066, "num_input_tokens_seen": 3771264, "step": 7670 }, { "epoch": 1.0129338788438695, "grad_norm": 0.15175847709178925, "learning_rate": 1.9366126592318012e-06, "loss": 0.0628, "num_input_tokens_seen": 3773760, "step": 7675 }, { "epoch": 1.013593770621618, "grad_norm": 1.0382298231124878, "learning_rate": 1.936451146625518e-06, "loss": 0.069, "num_input_tokens_seen": 3776064, "step": 7680 }, { "epoch": 1.0142536623993665, "grad_norm": 0.12675048410892487, "learning_rate": 1.9362894352642606e-06, "loss": 0.0009, "num_input_tokens_seen": 3778496, "step": 7685 }, { "epoch": 1.014913554177115, "grad_norm": 0.25615447759628296, "learning_rate": 1.9361275251823507e-06, "loss": 0.0102, "num_input_tokens_seen": 3780928, "step": 7690 }, { "epoch": 1.0155734459548633, "grad_norm": 0.049605101346969604, "learning_rate": 1.935965416414152e-06, "loss": 0.1457, "num_input_tokens_seen": 3783360, "step": 7695 }, { "epoch": 1.0162333377326118, "grad_norm": 0.04638204351067543, "learning_rate": 1.935803108994072e-06, "loss": 0.0004, "num_input_tokens_seen": 3785664, "step": 7700 }, { "epoch": 1.0168932295103603, "grad_norm": 0.0021269829012453556, "learning_rate": 1.9356406029565584e-06, "loss": 0.0002, "num_input_tokens_seen": 3788288, "step": 7705 }, { "epoch": 1.0175531212881088, "grad_norm": 63.27873229980469, "learning_rate": 1.935477898336102e-06, "loss": 0.2777, "num_input_tokens_seen": 3790784, "step": 7710 }, { "epoch": 1.018213013065857, "grad_norm": 0.013677487149834633, "learning_rate": 1.935314995167236e-06, "loss": 0.0015, "num_input_tokens_seen": 3793152, "step": 7715 }, { "epoch": 1.0188729048436056, "grad_norm": 0.01026434451341629, "learning_rate": 1.9351518934845355e-06, "loss": 0.0003, "num_input_tokens_seen": 3795712, "step": 7720 }, { "epoch": 1.019532796621354, "grad_norm": 18.70940589904785, "learning_rate": 1.934988593322617e-06, "loss": 0.0478, "num_input_tokens_seen": 3798080, "step": 7725 }, { "epoch": 1.0201926883991026, "grad_norm": 17.24424934387207, "learning_rate": 1.934825094716141e-06, "loss": 0.1037, "num_input_tokens_seen": 3800640, "step": 7730 }, { "epoch": 1.020852580176851, "grad_norm": 26.107290267944336, "learning_rate": 1.9346613976998075e-06, "loss": 0.1827, "num_input_tokens_seen": 3802944, "step": 7735 }, { "epoch": 1.0215124719545994, "grad_norm": 16.36334991455078, "learning_rate": 1.9344975023083606e-06, "loss": 0.0798, "num_input_tokens_seen": 3805504, "step": 7740 }, { "epoch": 1.0221723637323479, "grad_norm": 0.3354093134403229, "learning_rate": 1.9343334085765862e-06, "loss": 0.1328, "num_input_tokens_seen": 3807744, "step": 7745 }, { "epoch": 1.0228322555100964, "grad_norm": 0.12368268519639969, "learning_rate": 1.9341691165393116e-06, "loss": 0.028, "num_input_tokens_seen": 3810112, "step": 7750 }, { "epoch": 1.0234921472878449, "grad_norm": 0.2610388994216919, "learning_rate": 1.9340046262314065e-06, "loss": 0.1645, "num_input_tokens_seen": 3812608, "step": 7755 }, { "epoch": 1.0241520390655932, "grad_norm": 0.12910595536231995, "learning_rate": 1.9338399376877835e-06, "loss": 0.0013, "num_input_tokens_seen": 3814912, "step": 7760 }, { "epoch": 1.0248119308433417, "grad_norm": 0.11450402438640594, "learning_rate": 1.9336750509433958e-06, "loss": 0.0006, "num_input_tokens_seen": 3817600, "step": 7765 }, { "epoch": 1.0254718226210902, "grad_norm": 0.05036715790629387, "learning_rate": 1.93350996603324e-06, "loss": 0.1336, "num_input_tokens_seen": 3819904, "step": 7770 }, { "epoch": 1.0261317143988387, "grad_norm": 4.654677391052246, "learning_rate": 1.933344682992353e-06, "loss": 0.0452, "num_input_tokens_seen": 3822272, "step": 7775 }, { "epoch": 1.026791606176587, "grad_norm": 117.1086654663086, "learning_rate": 1.9331792018558165e-06, "loss": 0.1192, "num_input_tokens_seen": 3824512, "step": 7780 }, { "epoch": 1.0274514979543354, "grad_norm": 0.021433783695101738, "learning_rate": 1.933013522658752e-06, "loss": 0.0616, "num_input_tokens_seen": 3827072, "step": 7785 }, { "epoch": 1.028111389732084, "grad_norm": 0.1878979206085205, "learning_rate": 1.9328476454363235e-06, "loss": 0.1669, "num_input_tokens_seen": 3829632, "step": 7790 }, { "epoch": 1.0287712815098324, "grad_norm": 0.5877869725227356, "learning_rate": 1.932681570223737e-06, "loss": 0.0014, "num_input_tokens_seen": 3832000, "step": 7795 }, { "epoch": 1.029431173287581, "grad_norm": 0.03438083082437515, "learning_rate": 1.9325152970562418e-06, "loss": 0.0993, "num_input_tokens_seen": 3834624, "step": 7800 }, { "epoch": 1.0300910650653292, "grad_norm": 50.06906509399414, "learning_rate": 1.9323488259691273e-06, "loss": 0.1106, "num_input_tokens_seen": 3836992, "step": 7805 }, { "epoch": 1.0307509568430777, "grad_norm": 0.03151680901646614, "learning_rate": 1.932182156997726e-06, "loss": 0.1535, "num_input_tokens_seen": 3839488, "step": 7810 }, { "epoch": 1.0314108486208262, "grad_norm": 4.190992832183838, "learning_rate": 1.9320152901774124e-06, "loss": 0.0557, "num_input_tokens_seen": 3842112, "step": 7815 }, { "epoch": 1.0320707403985747, "grad_norm": 0.07871054112911224, "learning_rate": 1.9318482255436022e-06, "loss": 0.0745, "num_input_tokens_seen": 3844288, "step": 7820 }, { "epoch": 1.032730632176323, "grad_norm": 0.023968158289790154, "learning_rate": 1.9316809631317544e-06, "loss": 0.1813, "num_input_tokens_seen": 3846656, "step": 7825 }, { "epoch": 1.0333905239540715, "grad_norm": 0.8618749380111694, "learning_rate": 1.931513502977369e-06, "loss": 0.0014, "num_input_tokens_seen": 3849344, "step": 7830 }, { "epoch": 1.03405041573182, "grad_norm": 0.11035836488008499, "learning_rate": 1.931345845115988e-06, "loss": 0.0637, "num_input_tokens_seen": 3851712, "step": 7835 }, { "epoch": 1.0347103075095685, "grad_norm": 163.74595642089844, "learning_rate": 1.931177989583195e-06, "loss": 0.0103, "num_input_tokens_seen": 3854080, "step": 7840 }, { "epoch": 1.0353701992873168, "grad_norm": 52.531402587890625, "learning_rate": 1.9310099364146174e-06, "loss": 0.063, "num_input_tokens_seen": 3856576, "step": 7845 }, { "epoch": 1.0360300910650653, "grad_norm": 28.098182678222656, "learning_rate": 1.930841685645922e-06, "loss": 0.1425, "num_input_tokens_seen": 3859008, "step": 7850 }, { "epoch": 1.0366899828428138, "grad_norm": 0.5196312069892883, "learning_rate": 1.93067323731282e-06, "loss": 0.0006, "num_input_tokens_seen": 3861376, "step": 7855 }, { "epoch": 1.0373498746205623, "grad_norm": 0.149097740650177, "learning_rate": 1.930504591451063e-06, "loss": 0.0849, "num_input_tokens_seen": 3863872, "step": 7860 }, { "epoch": 1.0380097663983108, "grad_norm": 0.03719232231378555, "learning_rate": 1.9303357480964445e-06, "loss": 0.073, "num_input_tokens_seen": 3866304, "step": 7865 }, { "epoch": 1.038669658176059, "grad_norm": 0.17920592427253723, "learning_rate": 1.9301667072848002e-06, "loss": 0.0006, "num_input_tokens_seen": 3868864, "step": 7870 }, { "epoch": 1.0393295499538076, "grad_norm": 0.6391183137893677, "learning_rate": 1.929997469052008e-06, "loss": 0.0896, "num_input_tokens_seen": 3870976, "step": 7875 }, { "epoch": 1.039989441731556, "grad_norm": 0.045991070568561554, "learning_rate": 1.929828033433988e-06, "loss": 0.132, "num_input_tokens_seen": 3873408, "step": 7880 }, { "epoch": 1.0406493335093046, "grad_norm": 19.03218650817871, "learning_rate": 1.9296584004667005e-06, "loss": 0.2669, "num_input_tokens_seen": 3875776, "step": 7885 }, { "epoch": 1.0413092252870528, "grad_norm": 0.6183151602745056, "learning_rate": 1.92948857018615e-06, "loss": 0.0612, "num_input_tokens_seen": 3878144, "step": 7890 }, { "epoch": 1.0419691170648013, "grad_norm": 0.5006767511367798, "learning_rate": 1.929318542628381e-06, "loss": 0.1703, "num_input_tokens_seen": 3880512, "step": 7895 }, { "epoch": 1.0426290088425498, "grad_norm": 0.08102352917194366, "learning_rate": 1.9291483178294813e-06, "loss": 0.0021, "num_input_tokens_seen": 3882880, "step": 7900 }, { "epoch": 1.0432889006202983, "grad_norm": 0.03578070178627968, "learning_rate": 1.928977895825579e-06, "loss": 0.0301, "num_input_tokens_seen": 3885312, "step": 7905 }, { "epoch": 1.0439487923980466, "grad_norm": 0.05146459862589836, "learning_rate": 1.928807276652846e-06, "loss": 0.0624, "num_input_tokens_seen": 3887744, "step": 7910 }, { "epoch": 1.044608684175795, "grad_norm": 0.05097321793437004, "learning_rate": 1.928636460347494e-06, "loss": 0.0541, "num_input_tokens_seen": 3890048, "step": 7915 }, { "epoch": 1.0452685759535436, "grad_norm": 101.96385955810547, "learning_rate": 1.928465446945778e-06, "loss": 0.0159, "num_input_tokens_seen": 3892480, "step": 7920 }, { "epoch": 1.045928467731292, "grad_norm": 0.19685277342796326, "learning_rate": 1.9282942364839947e-06, "loss": 0.0013, "num_input_tokens_seen": 3894784, "step": 7925 }, { "epoch": 1.0465883595090406, "grad_norm": 0.08247081935405731, "learning_rate": 1.9281228289984816e-06, "loss": 0.0775, "num_input_tokens_seen": 3897472, "step": 7930 }, { "epoch": 1.047248251286789, "grad_norm": 25.526233673095703, "learning_rate": 1.927951224525619e-06, "loss": 0.078, "num_input_tokens_seen": 3900032, "step": 7935 }, { "epoch": 1.0479081430645374, "grad_norm": 0.10590098053216934, "learning_rate": 1.9277794231018286e-06, "loss": 0.0335, "num_input_tokens_seen": 3902592, "step": 7940 }, { "epoch": 1.048568034842286, "grad_norm": 13.906495094299316, "learning_rate": 1.927607424763574e-06, "loss": 0.0915, "num_input_tokens_seen": 3904896, "step": 7945 }, { "epoch": 1.0492279266200344, "grad_norm": 0.04824037477374077, "learning_rate": 1.927435229547361e-06, "loss": 0.0017, "num_input_tokens_seen": 3907200, "step": 7950 }, { "epoch": 1.0498878183977827, "grad_norm": 0.19578570127487183, "learning_rate": 1.9272628374897366e-06, "loss": 0.0768, "num_input_tokens_seen": 3909888, "step": 7955 }, { "epoch": 1.0505477101755312, "grad_norm": 0.061658911406993866, "learning_rate": 1.9270902486272892e-06, "loss": 0.0445, "num_input_tokens_seen": 3912320, "step": 7960 }, { "epoch": 1.0512076019532797, "grad_norm": 0.052046775817871094, "learning_rate": 1.92691746299665e-06, "loss": 0.1083, "num_input_tokens_seen": 3914944, "step": 7965 }, { "epoch": 1.0518674937310282, "grad_norm": 0.020027488470077515, "learning_rate": 1.9267444806344917e-06, "loss": 0.0833, "num_input_tokens_seen": 3917376, "step": 7970 }, { "epoch": 1.0525273855087764, "grad_norm": 0.0871131494641304, "learning_rate": 1.9265713015775285e-06, "loss": 0.0865, "num_input_tokens_seen": 3919872, "step": 7975 }, { "epoch": 1.053187277286525, "grad_norm": 0.11457386612892151, "learning_rate": 1.926397925862516e-06, "loss": 0.1148, "num_input_tokens_seen": 3922368, "step": 7980 }, { "epoch": 1.0538471690642734, "grad_norm": 464.58880615234375, "learning_rate": 1.9262243535262527e-06, "loss": 0.0591, "num_input_tokens_seen": 3925056, "step": 7985 }, { "epoch": 1.054507060842022, "grad_norm": 91.03417205810547, "learning_rate": 1.926050584605577e-06, "loss": 0.2027, "num_input_tokens_seen": 3927552, "step": 7990 }, { "epoch": 1.0551669526197704, "grad_norm": 0.5855996012687683, "learning_rate": 1.9258766191373706e-06, "loss": 0.0009, "num_input_tokens_seen": 3930176, "step": 7995 }, { "epoch": 1.0558268443975187, "grad_norm": 0.06435656547546387, "learning_rate": 1.9257024571585565e-06, "loss": 0.0005, "num_input_tokens_seen": 3932672, "step": 8000 }, { "epoch": 1.0564867361752672, "grad_norm": 0.12676815688610077, "learning_rate": 1.9255280987060995e-06, "loss": 0.2338, "num_input_tokens_seen": 3934912, "step": 8005 }, { "epoch": 1.0571466279530157, "grad_norm": 0.19433751702308655, "learning_rate": 1.9253535438170056e-06, "loss": 0.0681, "num_input_tokens_seen": 3937216, "step": 8010 }, { "epoch": 1.0578065197307642, "grad_norm": 18.061725616455078, "learning_rate": 1.9251787925283228e-06, "loss": 0.145, "num_input_tokens_seen": 3939776, "step": 8015 }, { "epoch": 1.0584664115085125, "grad_norm": 0.5926772952079773, "learning_rate": 1.925003844877141e-06, "loss": 0.1157, "num_input_tokens_seen": 3941888, "step": 8020 }, { "epoch": 1.059126303286261, "grad_norm": 0.10081900656223297, "learning_rate": 1.9248287009005914e-06, "loss": 0.0852, "num_input_tokens_seen": 3944192, "step": 8025 }, { "epoch": 1.0597861950640095, "grad_norm": 0.25350770354270935, "learning_rate": 1.9246533606358475e-06, "loss": 0.0706, "num_input_tokens_seen": 3946816, "step": 8030 }, { "epoch": 1.060446086841758, "grad_norm": 57.27534484863281, "learning_rate": 1.9244778241201232e-06, "loss": 0.1036, "num_input_tokens_seen": 3949440, "step": 8035 }, { "epoch": 1.0611059786195065, "grad_norm": 0.20645537972450256, "learning_rate": 1.9243020913906753e-06, "loss": 0.0058, "num_input_tokens_seen": 3952000, "step": 8040 }, { "epoch": 1.0617658703972548, "grad_norm": 0.2704315483570099, "learning_rate": 1.924126162484802e-06, "loss": 0.0008, "num_input_tokens_seen": 3954240, "step": 8045 }, { "epoch": 1.0624257621750033, "grad_norm": 0.09804032742977142, "learning_rate": 1.9239500374398427e-06, "loss": 0.1045, "num_input_tokens_seen": 3956608, "step": 8050 }, { "epoch": 1.0630856539527518, "grad_norm": 0.4295539855957031, "learning_rate": 1.9237737162931785e-06, "loss": 0.0012, "num_input_tokens_seen": 3959104, "step": 8055 }, { "epoch": 1.0637455457305003, "grad_norm": 0.13368825614452362, "learning_rate": 1.9235971990822323e-06, "loss": 0.1183, "num_input_tokens_seen": 3961664, "step": 8060 }, { "epoch": 1.0644054375082486, "grad_norm": 38.22637176513672, "learning_rate": 1.923420485844469e-06, "loss": 0.0608, "num_input_tokens_seen": 3964352, "step": 8065 }, { "epoch": 1.065065329285997, "grad_norm": 6.1040754318237305, "learning_rate": 1.9232435766173944e-06, "loss": 0.0015, "num_input_tokens_seen": 3966656, "step": 8070 }, { "epoch": 1.0657252210637456, "grad_norm": 0.0031956711318343878, "learning_rate": 1.9230664714385567e-06, "loss": 0.1006, "num_input_tokens_seen": 3968896, "step": 8075 }, { "epoch": 1.066385112841494, "grad_norm": 0.07407024502754211, "learning_rate": 1.922889170345544e-06, "loss": 0.028, "num_input_tokens_seen": 3971328, "step": 8080 }, { "epoch": 1.0670450046192423, "grad_norm": 0.1356084942817688, "learning_rate": 1.9227116733759883e-06, "loss": 0.1022, "num_input_tokens_seen": 3973696, "step": 8085 }, { "epoch": 1.0677048963969908, "grad_norm": 56.98714828491211, "learning_rate": 1.922533980567562e-06, "loss": 0.1266, "num_input_tokens_seen": 3976192, "step": 8090 }, { "epoch": 1.0683647881747393, "grad_norm": 0.14554879069328308, "learning_rate": 1.9223560919579782e-06, "loss": 0.0802, "num_input_tokens_seen": 3978944, "step": 8095 }, { "epoch": 1.0690246799524878, "grad_norm": 0.040467824786901474, "learning_rate": 1.922178007584993e-06, "loss": 0.0701, "num_input_tokens_seen": 3981376, "step": 8100 }, { "epoch": 1.0696845717302363, "grad_norm": 0.20177550613880157, "learning_rate": 1.921999727486404e-06, "loss": 0.0006, "num_input_tokens_seen": 3983744, "step": 8105 }, { "epoch": 1.0703444635079846, "grad_norm": 0.04099415987730026, "learning_rate": 1.9218212517000495e-06, "loss": 0.0604, "num_input_tokens_seen": 3986048, "step": 8110 }, { "epoch": 1.0710043552857331, "grad_norm": 1.1667630672454834, "learning_rate": 1.9216425802638095e-06, "loss": 0.0162, "num_input_tokens_seen": 3988736, "step": 8115 }, { "epoch": 1.0716642470634816, "grad_norm": 0.041468545794487, "learning_rate": 1.9214637132156056e-06, "loss": 0.046, "num_input_tokens_seen": 3991360, "step": 8120 }, { "epoch": 1.0723241388412301, "grad_norm": 0.10066083073616028, "learning_rate": 1.9212846505934018e-06, "loss": 0.0479, "num_input_tokens_seen": 3994176, "step": 8125 }, { "epoch": 1.0729840306189784, "grad_norm": 39.72118377685547, "learning_rate": 1.921105392435202e-06, "loss": 0.0754, "num_input_tokens_seen": 3996416, "step": 8130 }, { "epoch": 1.073643922396727, "grad_norm": 0.09259176254272461, "learning_rate": 1.9209259387790526e-06, "loss": 0.0023, "num_input_tokens_seen": 3998976, "step": 8135 }, { "epoch": 1.0743038141744754, "grad_norm": 0.008366269990801811, "learning_rate": 1.920746289663042e-06, "loss": 0.1178, "num_input_tokens_seen": 4001344, "step": 8140 }, { "epoch": 1.074963705952224, "grad_norm": 0.046617474406957626, "learning_rate": 1.9205664451252986e-06, "loss": 0.0384, "num_input_tokens_seen": 4003712, "step": 8145 }, { "epoch": 1.0756235977299722, "grad_norm": 0.5906067490577698, "learning_rate": 1.9203864052039935e-06, "loss": 0.0743, "num_input_tokens_seen": 4006144, "step": 8150 }, { "epoch": 1.0762834895077207, "grad_norm": 0.007874682545661926, "learning_rate": 1.9202061699373386e-06, "loss": 0.0612, "num_input_tokens_seen": 4008640, "step": 8155 }, { "epoch": 1.0769433812854692, "grad_norm": 0.027073025703430176, "learning_rate": 1.9200257393635878e-06, "loss": 0.0445, "num_input_tokens_seen": 4011456, "step": 8160 }, { "epoch": 1.0776032730632177, "grad_norm": 0.06536184996366501, "learning_rate": 1.9198451135210365e-06, "loss": 0.0008, "num_input_tokens_seen": 4014144, "step": 8165 }, { "epoch": 1.0782631648409662, "grad_norm": 125.64025115966797, "learning_rate": 1.919664292448021e-06, "loss": 0.0969, "num_input_tokens_seen": 4016320, "step": 8170 }, { "epoch": 1.0789230566187145, "grad_norm": 0.006089900620281696, "learning_rate": 1.9194832761829184e-06, "loss": 0.0018, "num_input_tokens_seen": 4019008, "step": 8175 }, { "epoch": 1.079582948396463, "grad_norm": 27.950912475585938, "learning_rate": 1.919302064764149e-06, "loss": 0.1604, "num_input_tokens_seen": 4021568, "step": 8180 }, { "epoch": 1.0802428401742115, "grad_norm": 0.3938080370426178, "learning_rate": 1.9191206582301737e-06, "loss": 0.0614, "num_input_tokens_seen": 4024064, "step": 8185 }, { "epoch": 1.08090273195196, "grad_norm": 0.023270845413208008, "learning_rate": 1.9189390566194942e-06, "loss": 0.2029, "num_input_tokens_seen": 4026496, "step": 8190 }, { "epoch": 1.0815626237297082, "grad_norm": 0.251174658536911, "learning_rate": 1.9187572599706547e-06, "loss": 0.0022, "num_input_tokens_seen": 4028992, "step": 8195 }, { "epoch": 1.0822225155074567, "grad_norm": 0.6823098063468933, "learning_rate": 1.9185752683222395e-06, "loss": 0.2369, "num_input_tokens_seen": 4031296, "step": 8200 }, { "epoch": 1.0828824072852052, "grad_norm": 43.15085983276367, "learning_rate": 1.9183930817128755e-06, "loss": 0.0804, "num_input_tokens_seen": 4033664, "step": 8205 }, { "epoch": 1.0835422990629537, "grad_norm": 28.672992706298828, "learning_rate": 1.9182107001812303e-06, "loss": 0.0454, "num_input_tokens_seen": 4036160, "step": 8210 }, { "epoch": 1.0842021908407022, "grad_norm": 59.211666107177734, "learning_rate": 1.9180281237660136e-06, "loss": 0.0698, "num_input_tokens_seen": 4038784, "step": 8215 }, { "epoch": 1.0848620826184505, "grad_norm": 0.054303571581840515, "learning_rate": 1.917845352505975e-06, "loss": 0.0012, "num_input_tokens_seen": 4041280, "step": 8220 }, { "epoch": 1.085521974396199, "grad_norm": 0.20329880714416504, "learning_rate": 1.917662386439907e-06, "loss": 0.0258, "num_input_tokens_seen": 4043712, "step": 8225 }, { "epoch": 1.0861818661739475, "grad_norm": 1.4719992876052856, "learning_rate": 1.9174792256066427e-06, "loss": 0.0015, "num_input_tokens_seen": 4045824, "step": 8230 }, { "epoch": 1.086841757951696, "grad_norm": 0.0649583712220192, "learning_rate": 1.9172958700450565e-06, "loss": 0.0688, "num_input_tokens_seen": 4048320, "step": 8235 }, { "epoch": 1.0875016497294443, "grad_norm": 1.4787904024124146, "learning_rate": 1.9171123197940647e-06, "loss": 0.0083, "num_input_tokens_seen": 4050688, "step": 8240 }, { "epoch": 1.0881615415071928, "grad_norm": 0.20488137006759644, "learning_rate": 1.916928574892624e-06, "loss": 0.0006, "num_input_tokens_seen": 4053312, "step": 8245 }, { "epoch": 1.0888214332849413, "grad_norm": 0.019268441945314407, "learning_rate": 1.9167446353797334e-06, "loss": 0.0955, "num_input_tokens_seen": 4055872, "step": 8250 }, { "epoch": 1.0894813250626898, "grad_norm": 0.0886877030134201, "learning_rate": 1.9165605012944322e-06, "loss": 0.0006, "num_input_tokens_seen": 4058304, "step": 8255 }, { "epoch": 1.090141216840438, "grad_norm": 16.01343536376953, "learning_rate": 1.916376172675802e-06, "loss": 0.1845, "num_input_tokens_seen": 4060800, "step": 8260 }, { "epoch": 1.0908011086181866, "grad_norm": 0.229745015501976, "learning_rate": 1.916191649562965e-06, "loss": 0.1351, "num_input_tokens_seen": 4063488, "step": 8265 }, { "epoch": 1.091461000395935, "grad_norm": 0.015055251307785511, "learning_rate": 1.9160069319950844e-06, "loss": 0.0002, "num_input_tokens_seen": 4066176, "step": 8270 }, { "epoch": 1.0921208921736836, "grad_norm": 0.02369670942425728, "learning_rate": 1.915822020011366e-06, "loss": 0.0815, "num_input_tokens_seen": 4068864, "step": 8275 }, { "epoch": 1.092780783951432, "grad_norm": 0.34016939997673035, "learning_rate": 1.915636913651056e-06, "loss": 0.0007, "num_input_tokens_seen": 4071360, "step": 8280 }, { "epoch": 1.0934406757291804, "grad_norm": 0.006058728788048029, "learning_rate": 1.9154516129534414e-06, "loss": 0.0924, "num_input_tokens_seen": 4074112, "step": 8285 }, { "epoch": 1.0941005675069289, "grad_norm": 0.054655253887176514, "learning_rate": 1.915266117957851e-06, "loss": 0.0658, "num_input_tokens_seen": 4076480, "step": 8290 }, { "epoch": 1.0947604592846774, "grad_norm": 0.012565754354000092, "learning_rate": 1.915080428703655e-06, "loss": 0.0017, "num_input_tokens_seen": 4078976, "step": 8295 }, { "epoch": 1.0954203510624259, "grad_norm": 0.027407482266426086, "learning_rate": 1.9148945452302647e-06, "loss": 0.1423, "num_input_tokens_seen": 4081664, "step": 8300 }, { "epoch": 1.0960802428401741, "grad_norm": 0.10399171710014343, "learning_rate": 1.9147084675771322e-06, "loss": 0.1065, "num_input_tokens_seen": 4084288, "step": 8305 }, { "epoch": 1.0967401346179226, "grad_norm": 0.014594976790249348, "learning_rate": 1.9145221957837513e-06, "loss": 0.0763, "num_input_tokens_seen": 4086656, "step": 8310 }, { "epoch": 1.0974000263956711, "grad_norm": 0.027905648574233055, "learning_rate": 1.9143357298896564e-06, "loss": 0.066, "num_input_tokens_seen": 4089152, "step": 8315 }, { "epoch": 1.0980599181734196, "grad_norm": 0.026191938668489456, "learning_rate": 1.9141490699344243e-06, "loss": 0.0539, "num_input_tokens_seen": 4091584, "step": 8320 }, { "epoch": 1.098719809951168, "grad_norm": 31.117815017700195, "learning_rate": 1.913962215957672e-06, "loss": 0.0029, "num_input_tokens_seen": 4093888, "step": 8325 }, { "epoch": 1.0993797017289164, "grad_norm": 20.938081741333008, "learning_rate": 1.9137751679990576e-06, "loss": 0.0022, "num_input_tokens_seen": 4096256, "step": 8330 }, { "epoch": 1.100039593506665, "grad_norm": 15.29064655303955, "learning_rate": 1.9135879260982806e-06, "loss": 0.1069, "num_input_tokens_seen": 4098816, "step": 8335 }, { "epoch": 1.1006994852844134, "grad_norm": 0.03777789697051048, "learning_rate": 1.9134004902950826e-06, "loss": 0.1271, "num_input_tokens_seen": 4101184, "step": 8340 }, { "epoch": 1.101359377062162, "grad_norm": 0.26297852396965027, "learning_rate": 1.913212860629244e-06, "loss": 0.1017, "num_input_tokens_seen": 4103488, "step": 8345 }, { "epoch": 1.1020192688399102, "grad_norm": 1.434247374534607, "learning_rate": 1.9130250371405895e-06, "loss": 0.1092, "num_input_tokens_seen": 4105792, "step": 8350 }, { "epoch": 1.1026791606176587, "grad_norm": 0.41510826349258423, "learning_rate": 1.912837019868982e-06, "loss": 0.0289, "num_input_tokens_seen": 4108416, "step": 8355 }, { "epoch": 1.1033390523954072, "grad_norm": 0.07832839339971542, "learning_rate": 1.9126488088543273e-06, "loss": 0.0011, "num_input_tokens_seen": 4110976, "step": 8360 }, { "epoch": 1.1039989441731557, "grad_norm": 24.100696563720703, "learning_rate": 1.912460404136572e-06, "loss": 0.1147, "num_input_tokens_seen": 4113536, "step": 8365 }, { "epoch": 1.104658835950904, "grad_norm": 137.2346954345703, "learning_rate": 1.912271805755703e-06, "loss": 0.1414, "num_input_tokens_seen": 4115840, "step": 8370 }, { "epoch": 1.1053187277286525, "grad_norm": 0.1459578573703766, "learning_rate": 1.9120830137517498e-06, "loss": 0.0527, "num_input_tokens_seen": 4118208, "step": 8375 }, { "epoch": 1.105978619506401, "grad_norm": 0.1875588446855545, "learning_rate": 1.9118940281647816e-06, "loss": 0.0102, "num_input_tokens_seen": 4120448, "step": 8380 }, { "epoch": 1.1066385112841495, "grad_norm": 65.89623260498047, "learning_rate": 1.9117048490349096e-06, "loss": 0.1928, "num_input_tokens_seen": 4122560, "step": 8385 }, { "epoch": 1.1072984030618978, "grad_norm": 0.024722224101424217, "learning_rate": 1.9115154764022852e-06, "loss": 0.0306, "num_input_tokens_seen": 4125120, "step": 8390 }, { "epoch": 1.1079582948396463, "grad_norm": 0.13537687063217163, "learning_rate": 1.9113259103071015e-06, "loss": 0.0867, "num_input_tokens_seen": 4127360, "step": 8395 }, { "epoch": 1.1086181866173948, "grad_norm": 0.12146171927452087, "learning_rate": 1.9111361507895925e-06, "loss": 0.1242, "num_input_tokens_seen": 4129664, "step": 8400 }, { "epoch": 1.1092780783951433, "grad_norm": 139.49085998535156, "learning_rate": 1.9109461978900342e-06, "loss": 0.082, "num_input_tokens_seen": 4132032, "step": 8405 }, { "epoch": 1.1099379701728918, "grad_norm": 0.11550889164209366, "learning_rate": 1.910756051648741e-06, "loss": 0.0745, "num_input_tokens_seen": 4134528, "step": 8410 }, { "epoch": 1.11059786195064, "grad_norm": 0.04432675242424011, "learning_rate": 1.9105657121060715e-06, "loss": 0.001, "num_input_tokens_seen": 4137280, "step": 8415 }, { "epoch": 1.1112577537283885, "grad_norm": 0.056893620640039444, "learning_rate": 1.9103751793024236e-06, "loss": 0.0026, "num_input_tokens_seen": 4139776, "step": 8420 }, { "epoch": 1.111917645506137, "grad_norm": 0.08899568021297455, "learning_rate": 1.9101844532782357e-06, "loss": 0.0961, "num_input_tokens_seen": 4142144, "step": 8425 }, { "epoch": 1.1125775372838855, "grad_norm": 0.10331233590841293, "learning_rate": 1.909993534073989e-06, "loss": 0.0057, "num_input_tokens_seen": 4144768, "step": 8430 }, { "epoch": 1.1132374290616338, "grad_norm": 0.07069068402051926, "learning_rate": 1.9098024217302043e-06, "loss": 0.0725, "num_input_tokens_seen": 4147008, "step": 8435 }, { "epoch": 1.1138973208393823, "grad_norm": 0.03294684365391731, "learning_rate": 1.909611116287444e-06, "loss": 0.0589, "num_input_tokens_seen": 4148992, "step": 8440 }, { "epoch": 1.1145572126171308, "grad_norm": 0.02010115422308445, "learning_rate": 1.909419617786311e-06, "loss": 0.0708, "num_input_tokens_seen": 4151552, "step": 8445 }, { "epoch": 1.1152171043948793, "grad_norm": 0.12367430329322815, "learning_rate": 1.90922792626745e-06, "loss": 0.1095, "num_input_tokens_seen": 4154176, "step": 8450 }, { "epoch": 1.1158769961726276, "grad_norm": 0.40327343344688416, "learning_rate": 1.9090360417715454e-06, "loss": 0.1106, "num_input_tokens_seen": 4156736, "step": 8455 }, { "epoch": 1.116536887950376, "grad_norm": 0.05257925018668175, "learning_rate": 1.9088439643393236e-06, "loss": 0.0834, "num_input_tokens_seen": 4158976, "step": 8460 }, { "epoch": 1.1171967797281246, "grad_norm": 0.18939876556396484, "learning_rate": 1.9086516940115518e-06, "loss": 0.001, "num_input_tokens_seen": 4161280, "step": 8465 }, { "epoch": 1.117856671505873, "grad_norm": 0.05031445994973183, "learning_rate": 1.908459230829038e-06, "loss": 0.0794, "num_input_tokens_seen": 4163776, "step": 8470 }, { "epoch": 1.1185165632836216, "grad_norm": 13.425889015197754, "learning_rate": 1.908266574832631e-06, "loss": 0.1537, "num_input_tokens_seen": 4166336, "step": 8475 }, { "epoch": 1.1191764550613699, "grad_norm": 0.010042755864560604, "learning_rate": 1.90807372606322e-06, "loss": 0.1069, "num_input_tokens_seen": 4168832, "step": 8480 }, { "epoch": 1.1198363468391184, "grad_norm": 0.17297986149787903, "learning_rate": 1.9078806845617372e-06, "loss": 0.0022, "num_input_tokens_seen": 4171520, "step": 8485 }, { "epoch": 1.1204962386168669, "grad_norm": 30.04994010925293, "learning_rate": 1.907687450369153e-06, "loss": 0.0587, "num_input_tokens_seen": 4174208, "step": 8490 }, { "epoch": 1.1211561303946154, "grad_norm": 2.6952767372131348, "learning_rate": 1.9074940235264805e-06, "loss": 0.1138, "num_input_tokens_seen": 4176512, "step": 8495 }, { "epoch": 1.1218160221723636, "grad_norm": 0.12238600105047226, "learning_rate": 1.9073004040747732e-06, "loss": 0.0606, "num_input_tokens_seen": 4179072, "step": 8500 }, { "epoch": 1.1224759139501121, "grad_norm": 0.33392786979675293, "learning_rate": 1.9071065920551254e-06, "loss": 0.0689, "num_input_tokens_seen": 4181568, "step": 8505 }, { "epoch": 1.1231358057278606, "grad_norm": 33.69342041015625, "learning_rate": 1.906912587508672e-06, "loss": 0.0378, "num_input_tokens_seen": 4184000, "step": 8510 }, { "epoch": 1.1237956975056091, "grad_norm": 0.23329313099384308, "learning_rate": 1.9067183904765893e-06, "loss": 0.0591, "num_input_tokens_seen": 4186240, "step": 8515 }, { "epoch": 1.1244555892833574, "grad_norm": 0.11479724198579788, "learning_rate": 1.9065240010000942e-06, "loss": 0.0016, "num_input_tokens_seen": 4188544, "step": 8520 }, { "epoch": 1.125115481061106, "grad_norm": 0.02210923284292221, "learning_rate": 1.9063294191204442e-06, "loss": 0.1241, "num_input_tokens_seen": 4191168, "step": 8525 }, { "epoch": 1.1257753728388544, "grad_norm": 0.056093212217092514, "learning_rate": 1.9061346448789383e-06, "loss": 0.2255, "num_input_tokens_seen": 4193984, "step": 8530 }, { "epoch": 1.126435264616603, "grad_norm": 0.026619018986821175, "learning_rate": 1.9059396783169157e-06, "loss": 0.001, "num_input_tokens_seen": 4196928, "step": 8535 }, { "epoch": 1.1270951563943514, "grad_norm": 0.1786504089832306, "learning_rate": 1.9057445194757566e-06, "loss": 0.0009, "num_input_tokens_seen": 4199424, "step": 8540 }, { "epoch": 1.1277550481720997, "grad_norm": 0.024085398763418198, "learning_rate": 1.9055491683968822e-06, "loss": 0.1215, "num_input_tokens_seen": 4201600, "step": 8545 }, { "epoch": 1.1284149399498482, "grad_norm": 0.051518410444259644, "learning_rate": 1.9053536251217544e-06, "loss": 0.0429, "num_input_tokens_seen": 4203968, "step": 8550 }, { "epoch": 1.1290748317275967, "grad_norm": 0.07784697413444519, "learning_rate": 1.9051578896918756e-06, "loss": 0.0003, "num_input_tokens_seen": 4206336, "step": 8555 }, { "epoch": 1.1297347235053452, "grad_norm": 0.02852809801697731, "learning_rate": 1.9049619621487894e-06, "loss": 0.0588, "num_input_tokens_seen": 4208960, "step": 8560 }, { "epoch": 1.1303946152830935, "grad_norm": 58.111297607421875, "learning_rate": 1.9047658425340798e-06, "loss": 0.0102, "num_input_tokens_seen": 4211200, "step": 8565 }, { "epoch": 1.131054507060842, "grad_norm": 0.13125121593475342, "learning_rate": 1.904569530889372e-06, "loss": 0.0002, "num_input_tokens_seen": 4213824, "step": 8570 }, { "epoch": 1.1317143988385905, "grad_norm": 0.06383427232503891, "learning_rate": 1.9043730272563319e-06, "loss": 0.0023, "num_input_tokens_seen": 4216192, "step": 8575 }, { "epoch": 1.132374290616339, "grad_norm": 0.1832188367843628, "learning_rate": 1.9041763316766653e-06, "loss": 0.0643, "num_input_tokens_seen": 4218304, "step": 8580 }, { "epoch": 1.1330341823940873, "grad_norm": 0.47467440366744995, "learning_rate": 1.90397944419212e-06, "loss": 0.0723, "num_input_tokens_seen": 4220608, "step": 8585 }, { "epoch": 1.1336940741718358, "grad_norm": 46.288822174072266, "learning_rate": 1.9037823648444839e-06, "loss": 0.1629, "num_input_tokens_seen": 4223040, "step": 8590 }, { "epoch": 1.1343539659495843, "grad_norm": 0.02950097993016243, "learning_rate": 1.9035850936755855e-06, "loss": 0.0002, "num_input_tokens_seen": 4225664, "step": 8595 }, { "epoch": 1.1350138577273328, "grad_norm": 0.01285611279308796, "learning_rate": 1.9033876307272941e-06, "loss": 0.1971, "num_input_tokens_seen": 4228224, "step": 8600 }, { "epoch": 1.1356737495050813, "grad_norm": 0.030775396153330803, "learning_rate": 1.9031899760415198e-06, "loss": 0.0001, "num_input_tokens_seen": 4230784, "step": 8605 }, { "epoch": 1.1363336412828295, "grad_norm": 13.63304615020752, "learning_rate": 1.9029921296602139e-06, "loss": 0.1488, "num_input_tokens_seen": 4233216, "step": 8610 }, { "epoch": 1.136993533060578, "grad_norm": 0.507597804069519, "learning_rate": 1.9027940916253668e-06, "loss": 0.0005, "num_input_tokens_seen": 4235584, "step": 8615 }, { "epoch": 1.1376534248383265, "grad_norm": 0.1052989810705185, "learning_rate": 1.9025958619790118e-06, "loss": 0.112, "num_input_tokens_seen": 4237952, "step": 8620 }, { "epoch": 1.138313316616075, "grad_norm": 0.05876855179667473, "learning_rate": 1.902397440763221e-06, "loss": 0.0003, "num_input_tokens_seen": 4240768, "step": 8625 }, { "epoch": 1.1389732083938233, "grad_norm": 0.09319160133600235, "learning_rate": 1.9021988280201083e-06, "loss": 0.2625, "num_input_tokens_seen": 4243072, "step": 8630 }, { "epoch": 1.1396331001715718, "grad_norm": 0.09952009469270706, "learning_rate": 1.9020000237918273e-06, "loss": 0.003, "num_input_tokens_seen": 4245632, "step": 8635 }, { "epoch": 1.1402929919493203, "grad_norm": 0.1723238229751587, "learning_rate": 1.9018010281205727e-06, "loss": 0.0461, "num_input_tokens_seen": 4248064, "step": 8640 }, { "epoch": 1.1409528837270688, "grad_norm": 0.16310635209083557, "learning_rate": 1.9016018410485809e-06, "loss": 0.0676, "num_input_tokens_seen": 4250496, "step": 8645 }, { "epoch": 1.141612775504817, "grad_norm": 17.923866271972656, "learning_rate": 1.901402462618127e-06, "loss": 0.3759, "num_input_tokens_seen": 4253120, "step": 8650 }, { "epoch": 1.1422726672825656, "grad_norm": 13.28976821899414, "learning_rate": 1.9012028928715272e-06, "loss": 0.2103, "num_input_tokens_seen": 4255680, "step": 8655 }, { "epoch": 1.142932559060314, "grad_norm": 0.06356354802846909, "learning_rate": 1.9010031318511401e-06, "loss": 0.0168, "num_input_tokens_seen": 4257984, "step": 8660 }, { "epoch": 1.1435924508380626, "grad_norm": 0.1584162563085556, "learning_rate": 1.9008031795993627e-06, "loss": 0.0013, "num_input_tokens_seen": 4260224, "step": 8665 }, { "epoch": 1.144252342615811, "grad_norm": 0.317395955324173, "learning_rate": 1.9006030361586337e-06, "loss": 0.0495, "num_input_tokens_seen": 4262656, "step": 8670 }, { "epoch": 1.1449122343935594, "grad_norm": 12.65967082977295, "learning_rate": 1.9004027015714315e-06, "loss": 0.0547, "num_input_tokens_seen": 4264960, "step": 8675 }, { "epoch": 1.1455721261713079, "grad_norm": 0.35858747363090515, "learning_rate": 1.9002021758802762e-06, "loss": 0.0832, "num_input_tokens_seen": 4267456, "step": 8680 }, { "epoch": 1.1462320179490564, "grad_norm": 0.145728200674057, "learning_rate": 1.900001459127728e-06, "loss": 0.0731, "num_input_tokens_seen": 4269760, "step": 8685 }, { "epoch": 1.1468919097268049, "grad_norm": 78.33462524414062, "learning_rate": 1.8998005513563872e-06, "loss": 0.031, "num_input_tokens_seen": 4272384, "step": 8690 }, { "epoch": 1.1475518015045532, "grad_norm": 0.28123587369918823, "learning_rate": 1.8995994526088955e-06, "loss": 0.0028, "num_input_tokens_seen": 4275008, "step": 8695 }, { "epoch": 1.1482116932823017, "grad_norm": 0.04340027645230293, "learning_rate": 1.8993981629279342e-06, "loss": 0.014, "num_input_tokens_seen": 4277440, "step": 8700 }, { "epoch": 1.1488715850600502, "grad_norm": 0.034418463706970215, "learning_rate": 1.8991966823562258e-06, "loss": 0.0003, "num_input_tokens_seen": 4279744, "step": 8705 }, { "epoch": 1.1495314768377987, "grad_norm": 162.60955810546875, "learning_rate": 1.8989950109365328e-06, "loss": 0.1334, "num_input_tokens_seen": 4282048, "step": 8710 }, { "epoch": 1.150191368615547, "grad_norm": 0.05149710550904274, "learning_rate": 1.8987931487116591e-06, "loss": 0.0581, "num_input_tokens_seen": 4284288, "step": 8715 }, { "epoch": 1.1508512603932954, "grad_norm": 0.03445148840546608, "learning_rate": 1.898591095724448e-06, "loss": 0.0003, "num_input_tokens_seen": 4286976, "step": 8720 }, { "epoch": 1.151511152171044, "grad_norm": 0.014576703310012817, "learning_rate": 1.898388852017784e-06, "loss": 0.0615, "num_input_tokens_seen": 4289472, "step": 8725 }, { "epoch": 1.1521710439487924, "grad_norm": 0.027082012966275215, "learning_rate": 1.8981864176345914e-06, "loss": 0.0752, "num_input_tokens_seen": 4292160, "step": 8730 }, { "epoch": 1.152830935726541, "grad_norm": 18.13522720336914, "learning_rate": 1.8979837926178362e-06, "loss": 0.1534, "num_input_tokens_seen": 4294528, "step": 8735 }, { "epoch": 1.1534908275042892, "grad_norm": 0.006116439588367939, "learning_rate": 1.8977809770105235e-06, "loss": 0.0736, "num_input_tokens_seen": 4297152, "step": 8740 }, { "epoch": 1.1541507192820377, "grad_norm": 8.04682731628418, "learning_rate": 1.8975779708556998e-06, "loss": 0.0011, "num_input_tokens_seen": 4299968, "step": 8745 }, { "epoch": 1.1548106110597862, "grad_norm": 102.7171401977539, "learning_rate": 1.8973747741964515e-06, "loss": 0.0063, "num_input_tokens_seen": 4302144, "step": 8750 }, { "epoch": 1.1554705028375347, "grad_norm": 0.03771144151687622, "learning_rate": 1.8971713870759057e-06, "loss": 0.0892, "num_input_tokens_seen": 4304576, "step": 8755 }, { "epoch": 1.156130394615283, "grad_norm": 37.372928619384766, "learning_rate": 1.8969678095372296e-06, "loss": 0.0785, "num_input_tokens_seen": 4306816, "step": 8760 }, { "epoch": 1.1567902863930315, "grad_norm": 24.924997329711914, "learning_rate": 1.8967640416236313e-06, "loss": 0.1083, "num_input_tokens_seen": 4309440, "step": 8765 }, { "epoch": 1.15745017817078, "grad_norm": 0.10889468342065811, "learning_rate": 1.8965600833783594e-06, "loss": 0.121, "num_input_tokens_seen": 4312000, "step": 8770 }, { "epoch": 1.1581100699485285, "grad_norm": 36.86738586425781, "learning_rate": 1.8963559348447015e-06, "loss": 0.1331, "num_input_tokens_seen": 4314432, "step": 8775 }, { "epoch": 1.1587699617262768, "grad_norm": 6.531122207641602, "learning_rate": 1.8961515960659878e-06, "loss": 0.0048, "num_input_tokens_seen": 4316928, "step": 8780 }, { "epoch": 1.1594298535040253, "grad_norm": 122.35628509521484, "learning_rate": 1.8959470670855873e-06, "loss": 0.1, "num_input_tokens_seen": 4319168, "step": 8785 }, { "epoch": 1.1600897452817738, "grad_norm": 15.543755531311035, "learning_rate": 1.8957423479469095e-06, "loss": 0.1709, "num_input_tokens_seen": 4321344, "step": 8790 }, { "epoch": 1.1607496370595223, "grad_norm": 116.81075286865234, "learning_rate": 1.8955374386934049e-06, "loss": 0.0571, "num_input_tokens_seen": 4323904, "step": 8795 }, { "epoch": 1.1614095288372708, "grad_norm": 0.28201350569725037, "learning_rate": 1.895332339368564e-06, "loss": 0.0995, "num_input_tokens_seen": 4326272, "step": 8800 }, { "epoch": 1.162069420615019, "grad_norm": 0.05971250683069229, "learning_rate": 1.8951270500159176e-06, "loss": 0.0573, "num_input_tokens_seen": 4329024, "step": 8805 }, { "epoch": 1.1627293123927676, "grad_norm": 15.558816909790039, "learning_rate": 1.8949215706790364e-06, "loss": 0.0971, "num_input_tokens_seen": 4331328, "step": 8810 }, { "epoch": 1.163389204170516, "grad_norm": 109.42797088623047, "learning_rate": 1.8947159014015326e-06, "loss": 0.1817, "num_input_tokens_seen": 4333696, "step": 8815 }, { "epoch": 1.1640490959482646, "grad_norm": 0.2136552333831787, "learning_rate": 1.8945100422270578e-06, "loss": 0.1063, "num_input_tokens_seen": 4336320, "step": 8820 }, { "epoch": 1.164708987726013, "grad_norm": 2.6217916011810303, "learning_rate": 1.8943039931993043e-06, "loss": 0.0024, "num_input_tokens_seen": 4338688, "step": 8825 }, { "epoch": 1.1653688795037613, "grad_norm": 13.55349063873291, "learning_rate": 1.8940977543620038e-06, "loss": 0.078, "num_input_tokens_seen": 4341312, "step": 8830 }, { "epoch": 1.1660287712815098, "grad_norm": 0.13286477327346802, "learning_rate": 1.89389132575893e-06, "loss": 0.0008, "num_input_tokens_seen": 4343808, "step": 8835 }, { "epoch": 1.1666886630592583, "grad_norm": 31.39531707763672, "learning_rate": 1.8936847074338948e-06, "loss": 0.0904, "num_input_tokens_seen": 4346176, "step": 8840 }, { "epoch": 1.1673485548370066, "grad_norm": 0.0839921236038208, "learning_rate": 1.8934778994307526e-06, "loss": 0.0682, "num_input_tokens_seen": 4348672, "step": 8845 }, { "epoch": 1.1680084466147551, "grad_norm": 0.012714563868939877, "learning_rate": 1.8932709017933958e-06, "loss": 0.1466, "num_input_tokens_seen": 4350976, "step": 8850 }, { "epoch": 1.1686683383925036, "grad_norm": 0.12429999560117722, "learning_rate": 1.8930637145657592e-06, "loss": 0.0004, "num_input_tokens_seen": 4353536, "step": 8855 }, { "epoch": 1.1693282301702521, "grad_norm": 0.08413698524236679, "learning_rate": 1.8928563377918157e-06, "loss": 0.1143, "num_input_tokens_seen": 4355712, "step": 8860 }, { "epoch": 1.1699881219480006, "grad_norm": 0.050988439470529556, "learning_rate": 1.8926487715155802e-06, "loss": 0.0635, "num_input_tokens_seen": 4358336, "step": 8865 }, { "epoch": 1.170648013725749, "grad_norm": 1.0589361190795898, "learning_rate": 1.892441015781107e-06, "loss": 0.0941, "num_input_tokens_seen": 4360896, "step": 8870 }, { "epoch": 1.1713079055034974, "grad_norm": 0.09044911712408066, "learning_rate": 1.892233070632491e-06, "loss": 0.0008, "num_input_tokens_seen": 4363456, "step": 8875 }, { "epoch": 1.171967797281246, "grad_norm": 14.525789260864258, "learning_rate": 1.8920249361138665e-06, "loss": 0.1365, "num_input_tokens_seen": 4365760, "step": 8880 }, { "epoch": 1.1726276890589944, "grad_norm": 0.1123279482126236, "learning_rate": 1.891816612269409e-06, "loss": 0.0024, "num_input_tokens_seen": 4368192, "step": 8885 }, { "epoch": 1.173287580836743, "grad_norm": 0.02701779454946518, "learning_rate": 1.8916080991433337e-06, "loss": 0.0928, "num_input_tokens_seen": 4370752, "step": 8890 }, { "epoch": 1.1739474726144912, "grad_norm": 0.06008841097354889, "learning_rate": 1.8913993967798956e-06, "loss": 0.0007, "num_input_tokens_seen": 4373376, "step": 8895 }, { "epoch": 1.1746073643922397, "grad_norm": 0.14545901119709015, "learning_rate": 1.8911905052233905e-06, "loss": 0.0492, "num_input_tokens_seen": 4376000, "step": 8900 }, { "epoch": 1.1752672561699882, "grad_norm": 31.391544342041016, "learning_rate": 1.8909814245181543e-06, "loss": 0.1225, "num_input_tokens_seen": 4378496, "step": 8905 }, { "epoch": 1.1759271479477365, "grad_norm": 11.671570777893066, "learning_rate": 1.890772154708563e-06, "loss": 0.0383, "num_input_tokens_seen": 4380672, "step": 8910 }, { "epoch": 1.176587039725485, "grad_norm": 18.109888076782227, "learning_rate": 1.8905626958390317e-06, "loss": 0.1929, "num_input_tokens_seen": 4383168, "step": 8915 }, { "epoch": 1.1772469315032335, "grad_norm": 0.048839978873729706, "learning_rate": 1.8903530479540176e-06, "loss": 0.0006, "num_input_tokens_seen": 4385472, "step": 8920 }, { "epoch": 1.177906823280982, "grad_norm": 0.05667317286133766, "learning_rate": 1.8901432110980164e-06, "loss": 0.0004, "num_input_tokens_seen": 4387840, "step": 8925 }, { "epoch": 1.1785667150587305, "grad_norm": 12.719619750976562, "learning_rate": 1.8899331853155648e-06, "loss": 0.1419, "num_input_tokens_seen": 4390336, "step": 8930 }, { "epoch": 1.1792266068364787, "grad_norm": 11.942977905273438, "learning_rate": 1.8897229706512387e-06, "loss": 0.099, "num_input_tokens_seen": 4392640, "step": 8935 }, { "epoch": 1.1798864986142272, "grad_norm": 0.24283498525619507, "learning_rate": 1.889512567149655e-06, "loss": 0.1339, "num_input_tokens_seen": 4395136, "step": 8940 }, { "epoch": 1.1805463903919757, "grad_norm": 0.12784871459007263, "learning_rate": 1.88930197485547e-06, "loss": 0.0047, "num_input_tokens_seen": 4397504, "step": 8945 }, { "epoch": 1.1812062821697242, "grad_norm": 4.140740871429443, "learning_rate": 1.8890911938133814e-06, "loss": 0.0047, "num_input_tokens_seen": 4399872, "step": 8950 }, { "epoch": 1.1818661739474727, "grad_norm": 0.3176402151584625, "learning_rate": 1.8888802240681248e-06, "loss": 0.0867, "num_input_tokens_seen": 4402048, "step": 8955 }, { "epoch": 1.182526065725221, "grad_norm": 47.93214797973633, "learning_rate": 1.888669065664477e-06, "loss": 0.003, "num_input_tokens_seen": 4404416, "step": 8960 }, { "epoch": 1.1831859575029695, "grad_norm": 0.04862954467535019, "learning_rate": 1.8884577186472557e-06, "loss": 0.0207, "num_input_tokens_seen": 4406720, "step": 8965 }, { "epoch": 1.183845849280718, "grad_norm": 0.016081402078270912, "learning_rate": 1.8882461830613173e-06, "loss": 0.1395, "num_input_tokens_seen": 4408896, "step": 8970 }, { "epoch": 1.1845057410584663, "grad_norm": 0.04223987087607384, "learning_rate": 1.8880344589515587e-06, "loss": 0.0004, "num_input_tokens_seen": 4411392, "step": 8975 }, { "epoch": 1.1851656328362148, "grad_norm": 0.0231131874024868, "learning_rate": 1.887822546362917e-06, "loss": 0.0003, "num_input_tokens_seen": 4413888, "step": 8980 }, { "epoch": 1.1858255246139633, "grad_norm": 0.03339609131217003, "learning_rate": 1.8876104453403686e-06, "loss": 0.0006, "num_input_tokens_seen": 4416384, "step": 8985 }, { "epoch": 1.1864854163917118, "grad_norm": 0.1902741938829422, "learning_rate": 1.8873981559289308e-06, "loss": 0.21, "num_input_tokens_seen": 4419136, "step": 8990 }, { "epoch": 1.1871453081694603, "grad_norm": 0.2967088520526886, "learning_rate": 1.8871856781736604e-06, "loss": 0.0786, "num_input_tokens_seen": 4421632, "step": 8995 }, { "epoch": 1.1878051999472086, "grad_norm": 0.09986329078674316, "learning_rate": 1.8869730121196542e-06, "loss": 0.0006, "num_input_tokens_seen": 4424320, "step": 9000 }, { "epoch": 1.188465091724957, "grad_norm": 17.65159034729004, "learning_rate": 1.8867601578120495e-06, "loss": 0.1348, "num_input_tokens_seen": 4426880, "step": 9005 }, { "epoch": 1.1891249835027056, "grad_norm": 0.12928028404712677, "learning_rate": 1.8865471152960225e-06, "loss": 0.0006, "num_input_tokens_seen": 4429376, "step": 9010 }, { "epoch": 1.189784875280454, "grad_norm": 0.02794799394905567, "learning_rate": 1.8863338846167905e-06, "loss": 0.1167, "num_input_tokens_seen": 4432064, "step": 9015 }, { "epoch": 1.1904447670582026, "grad_norm": 0.15142269432544708, "learning_rate": 1.8861204658196095e-06, "loss": 0.0008, "num_input_tokens_seen": 4434304, "step": 9020 }, { "epoch": 1.1911046588359508, "grad_norm": 0.18032433092594147, "learning_rate": 1.8859068589497765e-06, "loss": 0.0008, "num_input_tokens_seen": 4436544, "step": 9025 }, { "epoch": 1.1917645506136993, "grad_norm": 0.06488180160522461, "learning_rate": 1.8856930640526277e-06, "loss": 0.0947, "num_input_tokens_seen": 4439168, "step": 9030 }, { "epoch": 1.1924244423914478, "grad_norm": 0.04269060865044594, "learning_rate": 1.88547908117354e-06, "loss": 0.0814, "num_input_tokens_seen": 4441472, "step": 9035 }, { "epoch": 1.1930843341691963, "grad_norm": 0.12657848000526428, "learning_rate": 1.8852649103579292e-06, "loss": 0.2709, "num_input_tokens_seen": 4443840, "step": 9040 }, { "epoch": 1.1937442259469446, "grad_norm": 0.024857914075255394, "learning_rate": 1.885050551651252e-06, "loss": 0.0662, "num_input_tokens_seen": 4446016, "step": 9045 }, { "epoch": 1.1944041177246931, "grad_norm": 2.7682478427886963, "learning_rate": 1.8848360050990042e-06, "loss": 0.2496, "num_input_tokens_seen": 4448320, "step": 9050 }, { "epoch": 1.1950640095024416, "grad_norm": 0.15781430900096893, "learning_rate": 1.8846212707467216e-06, "loss": 0.0971, "num_input_tokens_seen": 4450880, "step": 9055 }, { "epoch": 1.1957239012801901, "grad_norm": 0.2587769329547882, "learning_rate": 1.8844063486399805e-06, "loss": 0.002, "num_input_tokens_seen": 4453824, "step": 9060 }, { "epoch": 1.1963837930579384, "grad_norm": 29.7452392578125, "learning_rate": 1.884191238824396e-06, "loss": 0.1786, "num_input_tokens_seen": 4456448, "step": 9065 }, { "epoch": 1.197043684835687, "grad_norm": 0.1104031577706337, "learning_rate": 1.883975941345624e-06, "loss": 0.1482, "num_input_tokens_seen": 4458880, "step": 9070 }, { "epoch": 1.1977035766134354, "grad_norm": 11.88070011138916, "learning_rate": 1.8837604562493597e-06, "loss": 0.1391, "num_input_tokens_seen": 4461504, "step": 9075 }, { "epoch": 1.198363468391184, "grad_norm": 35.3869514465332, "learning_rate": 1.883544783581338e-06, "loss": 0.1583, "num_input_tokens_seen": 4464000, "step": 9080 }, { "epoch": 1.1990233601689324, "grad_norm": 0.23158396780490875, "learning_rate": 1.8833289233873346e-06, "loss": 0.0736, "num_input_tokens_seen": 4466368, "step": 9085 }, { "epoch": 1.1996832519466807, "grad_norm": 159.66079711914062, "learning_rate": 1.8831128757131634e-06, "loss": 0.1445, "num_input_tokens_seen": 4468800, "step": 9090 }, { "epoch": 1.2003431437244292, "grad_norm": 0.3382391035556793, "learning_rate": 1.8828966406046796e-06, "loss": 0.1592, "num_input_tokens_seen": 4471296, "step": 9095 }, { "epoch": 1.2010030355021777, "grad_norm": 0.09924346208572388, "learning_rate": 1.8826802181077771e-06, "loss": 0.0024, "num_input_tokens_seen": 4473856, "step": 9100 }, { "epoch": 1.2016629272799262, "grad_norm": 0.3284415304660797, "learning_rate": 1.8824636082683903e-06, "loss": 0.0028, "num_input_tokens_seen": 4476416, "step": 9105 }, { "epoch": 1.2023228190576745, "grad_norm": 0.0883711501955986, "learning_rate": 1.8822468111324927e-06, "loss": 0.044, "num_input_tokens_seen": 4478848, "step": 9110 }, { "epoch": 1.202982710835423, "grad_norm": 13.112217903137207, "learning_rate": 1.8820298267460983e-06, "loss": 0.1416, "num_input_tokens_seen": 4481088, "step": 9115 }, { "epoch": 1.2036426026131715, "grad_norm": 13.190703392028809, "learning_rate": 1.8818126551552605e-06, "loss": 0.0922, "num_input_tokens_seen": 4483392, "step": 9120 }, { "epoch": 1.20430249439092, "grad_norm": 12.654837608337402, "learning_rate": 1.881595296406072e-06, "loss": 0.1309, "num_input_tokens_seen": 4485760, "step": 9125 }, { "epoch": 1.2049623861686682, "grad_norm": 0.05298277735710144, "learning_rate": 1.881377750544666e-06, "loss": 0.0023, "num_input_tokens_seen": 4488064, "step": 9130 }, { "epoch": 1.2056222779464167, "grad_norm": 91.60076904296875, "learning_rate": 1.8811600176172147e-06, "loss": 0.0897, "num_input_tokens_seen": 4490048, "step": 9135 }, { "epoch": 1.2062821697241652, "grad_norm": 0.06908053904771805, "learning_rate": 1.8809420976699308e-06, "loss": 0.0015, "num_input_tokens_seen": 4492480, "step": 9140 }, { "epoch": 1.2069420615019137, "grad_norm": 0.49360230565071106, "learning_rate": 1.8807239907490656e-06, "loss": 0.0265, "num_input_tokens_seen": 4495104, "step": 9145 }, { "epoch": 1.2076019532796622, "grad_norm": 0.07297612726688385, "learning_rate": 1.8805056969009114e-06, "loss": 0.2775, "num_input_tokens_seen": 4497408, "step": 9150 }, { "epoch": 1.2082618450574105, "grad_norm": 0.08615783601999283, "learning_rate": 1.8802872161717988e-06, "loss": 0.0605, "num_input_tokens_seen": 4499584, "step": 9155 }, { "epoch": 1.208921736835159, "grad_norm": 0.13035784661769867, "learning_rate": 1.8800685486080994e-06, "loss": 0.0287, "num_input_tokens_seen": 4502336, "step": 9160 }, { "epoch": 1.2095816286129075, "grad_norm": 0.030498240143060684, "learning_rate": 1.8798496942562235e-06, "loss": 0.0638, "num_input_tokens_seen": 4505024, "step": 9165 }, { "epoch": 1.210241520390656, "grad_norm": 14.509490013122559, "learning_rate": 1.879630653162621e-06, "loss": 0.0649, "num_input_tokens_seen": 4507776, "step": 9170 }, { "epoch": 1.2109014121684043, "grad_norm": 16.562963485717773, "learning_rate": 1.8794114253737825e-06, "loss": 0.0745, "num_input_tokens_seen": 4510144, "step": 9175 }, { "epoch": 1.2115613039461528, "grad_norm": 0.2812765836715698, "learning_rate": 1.8791920109362373e-06, "loss": 0.0387, "num_input_tokens_seen": 4512704, "step": 9180 }, { "epoch": 1.2122211957239013, "grad_norm": 0.5217291712760925, "learning_rate": 1.878972409896554e-06, "loss": 0.0823, "num_input_tokens_seen": 4515264, "step": 9185 }, { "epoch": 1.2128810875016498, "grad_norm": 0.06650111079216003, "learning_rate": 1.878752622301342e-06, "loss": 0.0804, "num_input_tokens_seen": 4518016, "step": 9190 }, { "epoch": 1.213540979279398, "grad_norm": 0.028405936434864998, "learning_rate": 1.8785326481972491e-06, "loss": 0.0554, "num_input_tokens_seen": 4520320, "step": 9195 }, { "epoch": 1.2142008710571466, "grad_norm": 15.38084602355957, "learning_rate": 1.8783124876309637e-06, "loss": 0.0832, "num_input_tokens_seen": 4522432, "step": 9200 }, { "epoch": 1.214860762834895, "grad_norm": 0.029807792976498604, "learning_rate": 1.878092140649213e-06, "loss": 0.0009, "num_input_tokens_seen": 4524864, "step": 9205 }, { "epoch": 1.2155206546126436, "grad_norm": 0.27122005820274353, "learning_rate": 1.8778716072987638e-06, "loss": 0.0041, "num_input_tokens_seen": 4527296, "step": 9210 }, { "epoch": 1.216180546390392, "grad_norm": 0.07015712559223175, "learning_rate": 1.8776508876264235e-06, "loss": 0.1339, "num_input_tokens_seen": 4529792, "step": 9215 }, { "epoch": 1.2168404381681404, "grad_norm": 0.8073084354400635, "learning_rate": 1.8774299816790373e-06, "loss": 0.0356, "num_input_tokens_seen": 4532288, "step": 9220 }, { "epoch": 1.2175003299458889, "grad_norm": 18.83184051513672, "learning_rate": 1.8772088895034916e-06, "loss": 0.0769, "num_input_tokens_seen": 4534848, "step": 9225 }, { "epoch": 1.2181602217236374, "grad_norm": 0.01886264607310295, "learning_rate": 1.876987611146711e-06, "loss": 0.0675, "num_input_tokens_seen": 4537280, "step": 9230 }, { "epoch": 1.2188201135013859, "grad_norm": 0.6087026000022888, "learning_rate": 1.876766146655661e-06, "loss": 0.158, "num_input_tokens_seen": 4539776, "step": 9235 }, { "epoch": 1.2194800052791341, "grad_norm": 0.8865079879760742, "learning_rate": 1.8765444960773453e-06, "loss": 0.0008, "num_input_tokens_seen": 4542144, "step": 9240 }, { "epoch": 1.2201398970568826, "grad_norm": 0.08013620227575302, "learning_rate": 1.8763226594588078e-06, "loss": 0.0557, "num_input_tokens_seen": 4544576, "step": 9245 }, { "epoch": 1.2207997888346311, "grad_norm": 0.12226373702287674, "learning_rate": 1.8761006368471315e-06, "loss": 0.0475, "num_input_tokens_seen": 4547264, "step": 9250 }, { "epoch": 1.2214596806123796, "grad_norm": 0.0244253259152174, "learning_rate": 1.8758784282894394e-06, "loss": 0.0003, "num_input_tokens_seen": 4549696, "step": 9255 }, { "epoch": 1.222119572390128, "grad_norm": 0.04465080052614212, "learning_rate": 1.8756560338328934e-06, "loss": 0.0956, "num_input_tokens_seen": 4552000, "step": 9260 }, { "epoch": 1.2227794641678764, "grad_norm": 0.03824853524565697, "learning_rate": 1.8754334535246952e-06, "loss": 0.0492, "num_input_tokens_seen": 4554624, "step": 9265 }, { "epoch": 1.223439355945625, "grad_norm": 11.81065559387207, "learning_rate": 1.875210687412086e-06, "loss": 0.0658, "num_input_tokens_seen": 4557056, "step": 9270 }, { "epoch": 1.2240992477233734, "grad_norm": 21.275474548339844, "learning_rate": 1.874987735542346e-06, "loss": 0.0784, "num_input_tokens_seen": 4559488, "step": 9275 }, { "epoch": 1.224759139501122, "grad_norm": 0.07805287837982178, "learning_rate": 1.8747645979627955e-06, "loss": 0.1279, "num_input_tokens_seen": 4561984, "step": 9280 }, { "epoch": 1.2254190312788702, "grad_norm": 0.07274620980024338, "learning_rate": 1.8745412747207933e-06, "loss": 0.0544, "num_input_tokens_seen": 4564736, "step": 9285 }, { "epoch": 1.2260789230566187, "grad_norm": 0.25491011142730713, "learning_rate": 1.8743177658637387e-06, "loss": 0.0699, "num_input_tokens_seen": 4567232, "step": 9290 }, { "epoch": 1.2267388148343672, "grad_norm": 69.74526977539062, "learning_rate": 1.8740940714390697e-06, "loss": 0.1415, "num_input_tokens_seen": 4569664, "step": 9295 }, { "epoch": 1.2273987066121157, "grad_norm": 0.05679222196340561, "learning_rate": 1.8738701914942636e-06, "loss": 0.0011, "num_input_tokens_seen": 4572096, "step": 9300 }, { "epoch": 1.228058598389864, "grad_norm": 0.05214114487171173, "learning_rate": 1.8736461260768375e-06, "loss": 0.0006, "num_input_tokens_seen": 4574528, "step": 9305 }, { "epoch": 1.2287184901676125, "grad_norm": 0.25871649384498596, "learning_rate": 1.8734218752343475e-06, "loss": 0.094, "num_input_tokens_seen": 4577088, "step": 9310 }, { "epoch": 1.229378381945361, "grad_norm": 0.01494324766099453, "learning_rate": 1.8731974390143894e-06, "loss": 0.0003, "num_input_tokens_seen": 4579456, "step": 9315 }, { "epoch": 1.2300382737231095, "grad_norm": 49.50048828125, "learning_rate": 1.872972817464598e-06, "loss": 0.3725, "num_input_tokens_seen": 4581824, "step": 9320 }, { "epoch": 1.2306981655008578, "grad_norm": 0.17533209919929504, "learning_rate": 1.8727480106326476e-06, "loss": 0.0324, "num_input_tokens_seen": 4584256, "step": 9325 }, { "epoch": 1.2313580572786063, "grad_norm": 32.87931823730469, "learning_rate": 1.872523018566252e-06, "loss": 0.1018, "num_input_tokens_seen": 4587008, "step": 9330 }, { "epoch": 1.2320179490563548, "grad_norm": 0.40437552332878113, "learning_rate": 1.8722978413131641e-06, "loss": 0.1404, "num_input_tokens_seen": 4589824, "step": 9335 }, { "epoch": 1.2326778408341033, "grad_norm": 53.637935638427734, "learning_rate": 1.8720724789211758e-06, "loss": 0.1816, "num_input_tokens_seen": 4592448, "step": 9340 }, { "epoch": 1.2333377326118518, "grad_norm": 0.4547289311885834, "learning_rate": 1.871846931438119e-06, "loss": 0.0031, "num_input_tokens_seen": 4594880, "step": 9345 }, { "epoch": 1.2339976243896, "grad_norm": 59.51481246948242, "learning_rate": 1.8716211989118645e-06, "loss": 0.4028, "num_input_tokens_seen": 4597120, "step": 9350 }, { "epoch": 1.2346575161673485, "grad_norm": 16.716562271118164, "learning_rate": 1.8713952813903222e-06, "loss": 0.1788, "num_input_tokens_seen": 4599552, "step": 9355 }, { "epoch": 1.235317407945097, "grad_norm": 0.6430609822273254, "learning_rate": 1.8711691789214416e-06, "loss": 0.0456, "num_input_tokens_seen": 4602048, "step": 9360 }, { "epoch": 1.2359772997228455, "grad_norm": 0.11246831715106964, "learning_rate": 1.8709428915532114e-06, "loss": 0.0675, "num_input_tokens_seen": 4604352, "step": 9365 }, { "epoch": 1.2366371915005938, "grad_norm": 0.9561209678649902, "learning_rate": 1.8707164193336595e-06, "loss": 0.1284, "num_input_tokens_seen": 4606720, "step": 9370 }, { "epoch": 1.2372970832783423, "grad_norm": 12.950846672058105, "learning_rate": 1.8704897623108527e-06, "loss": 0.1365, "num_input_tokens_seen": 4609088, "step": 9375 }, { "epoch": 1.2379569750560908, "grad_norm": 34.06569290161133, "learning_rate": 1.8702629205328973e-06, "loss": 0.0896, "num_input_tokens_seen": 4611712, "step": 9380 }, { "epoch": 1.2386168668338393, "grad_norm": 0.2726440131664276, "learning_rate": 1.8700358940479387e-06, "loss": 0.1082, "num_input_tokens_seen": 4614080, "step": 9385 }, { "epoch": 1.2392767586115876, "grad_norm": 0.19074247777462006, "learning_rate": 1.8698086829041624e-06, "loss": 0.0484, "num_input_tokens_seen": 4616256, "step": 9390 }, { "epoch": 1.239936650389336, "grad_norm": 6.18501615524292, "learning_rate": 1.8695812871497915e-06, "loss": 0.0028, "num_input_tokens_seen": 4618688, "step": 9395 }, { "epoch": 1.2405965421670846, "grad_norm": 18.376789093017578, "learning_rate": 1.8693537068330898e-06, "loss": 0.1368, "num_input_tokens_seen": 4621184, "step": 9400 }, { "epoch": 1.241256433944833, "grad_norm": 67.07162475585938, "learning_rate": 1.8691259420023589e-06, "loss": 0.189, "num_input_tokens_seen": 4623616, "step": 9405 }, { "epoch": 1.2419163257225816, "grad_norm": 0.06652729958295822, "learning_rate": 1.8688979927059405e-06, "loss": 0.0022, "num_input_tokens_seen": 4626048, "step": 9410 }, { "epoch": 1.2425762175003299, "grad_norm": 20.42545509338379, "learning_rate": 1.8686698589922154e-06, "loss": 0.2181, "num_input_tokens_seen": 4628544, "step": 9415 }, { "epoch": 1.2432361092780784, "grad_norm": 0.010046327486634254, "learning_rate": 1.868441540909603e-06, "loss": 0.0009, "num_input_tokens_seen": 4631296, "step": 9420 }, { "epoch": 1.2438960010558269, "grad_norm": 0.02093764953315258, "learning_rate": 1.8682130385065622e-06, "loss": 0.0018, "num_input_tokens_seen": 4633664, "step": 9425 }, { "epoch": 1.2445558928335754, "grad_norm": 0.21397200226783752, "learning_rate": 1.8679843518315911e-06, "loss": 0.001, "num_input_tokens_seen": 4636224, "step": 9430 }, { "epoch": 1.2452157846113237, "grad_norm": 0.056055840104818344, "learning_rate": 1.8677554809332272e-06, "loss": 0.0792, "num_input_tokens_seen": 4638720, "step": 9435 }, { "epoch": 1.2458756763890722, "grad_norm": 0.04379117116332054, "learning_rate": 1.8675264258600459e-06, "loss": 0.0399, "num_input_tokens_seen": 4641280, "step": 9440 }, { "epoch": 1.2465355681668207, "grad_norm": 0.05579795688390732, "learning_rate": 1.8672971866606627e-06, "loss": 0.0117, "num_input_tokens_seen": 4643648, "step": 9445 }, { "epoch": 1.2471954599445692, "grad_norm": 0.11813732236623764, "learning_rate": 1.8670677633837321e-06, "loss": 0.0879, "num_input_tokens_seen": 4646016, "step": 9450 }, { "epoch": 1.2478553517223174, "grad_norm": 19.978891372680664, "learning_rate": 1.8668381560779478e-06, "loss": 0.2114, "num_input_tokens_seen": 4648320, "step": 9455 }, { "epoch": 1.248515243500066, "grad_norm": 2.6825110912323, "learning_rate": 1.866608364792042e-06, "loss": 0.0495, "num_input_tokens_seen": 4650944, "step": 9460 }, { "epoch": 1.2491751352778144, "grad_norm": 0.8223278522491455, "learning_rate": 1.8663783895747863e-06, "loss": 0.0025, "num_input_tokens_seen": 4653440, "step": 9465 }, { "epoch": 1.249835027055563, "grad_norm": 22.685306549072266, "learning_rate": 1.8661482304749911e-06, "loss": 0.1382, "num_input_tokens_seen": 4656064, "step": 9470 }, { "epoch": 1.2504949188333114, "grad_norm": 0.029461894184350967, "learning_rate": 1.8659178875415062e-06, "loss": 0.1179, "num_input_tokens_seen": 4658240, "step": 9475 }, { "epoch": 1.2504949188333114, "eval_loss": 0.11660958081483841, "eval_runtime": 7.8307, "eval_samples_per_second": 860.075, "eval_steps_per_second": 107.525, "num_input_tokens_seen": 4658240, "step": 9475 }, { "epoch": 1.2511548106110597, "grad_norm": 0.46132540702819824, "learning_rate": 1.86568736082322e-06, "loss": 0.001, "num_input_tokens_seen": 4660992, "step": 9480 }, { "epoch": 1.2518147023888082, "grad_norm": 0.01598983258008957, "learning_rate": 1.8654566503690606e-06, "loss": 0.0584, "num_input_tokens_seen": 4663488, "step": 9485 }, { "epoch": 1.2524745941665567, "grad_norm": 0.13625356554985046, "learning_rate": 1.8652257562279942e-06, "loss": 0.0698, "num_input_tokens_seen": 4666112, "step": 9490 }, { "epoch": 1.2531344859443052, "grad_norm": 325.95355224609375, "learning_rate": 1.864994678449026e-06, "loss": 0.1508, "num_input_tokens_seen": 4668992, "step": 9495 }, { "epoch": 1.2537943777220537, "grad_norm": 42.81782150268555, "learning_rate": 1.864763417081202e-06, "loss": 0.1196, "num_input_tokens_seen": 4671168, "step": 9500 }, { "epoch": 1.254454269499802, "grad_norm": 0.0359887033700943, "learning_rate": 1.864531972173604e-06, "loss": 0.0589, "num_input_tokens_seen": 4673600, "step": 9505 }, { "epoch": 1.2551141612775505, "grad_norm": 0.061382975429296494, "learning_rate": 1.8643003437753557e-06, "loss": 0.0006, "num_input_tokens_seen": 4676224, "step": 9510 }, { "epoch": 1.255774053055299, "grad_norm": 18.239227294921875, "learning_rate": 1.8640685319356181e-06, "loss": 0.1122, "num_input_tokens_seen": 4678592, "step": 9515 }, { "epoch": 1.2564339448330473, "grad_norm": 0.4076708257198334, "learning_rate": 1.8638365367035922e-06, "loss": 0.001, "num_input_tokens_seen": 4680960, "step": 9520 }, { "epoch": 1.2570938366107958, "grad_norm": 22.06498146057129, "learning_rate": 1.863604358128516e-06, "loss": 0.0433, "num_input_tokens_seen": 4683264, "step": 9525 }, { "epoch": 1.2577537283885443, "grad_norm": 0.36363160610198975, "learning_rate": 1.8633719962596693e-06, "loss": 0.0433, "num_input_tokens_seen": 4685760, "step": 9530 }, { "epoch": 1.2584136201662928, "grad_norm": 36.61283493041992, "learning_rate": 1.863139451146368e-06, "loss": 0.0812, "num_input_tokens_seen": 4688256, "step": 9535 }, { "epoch": 1.2590735119440413, "grad_norm": 0.03277817741036415, "learning_rate": 1.8629067228379687e-06, "loss": 0.0911, "num_input_tokens_seen": 4690624, "step": 9540 }, { "epoch": 1.2597334037217895, "grad_norm": 0.06794434040784836, "learning_rate": 1.8626738113838657e-06, "loss": 0.0416, "num_input_tokens_seen": 4693376, "step": 9545 }, { "epoch": 1.260393295499538, "grad_norm": 0.5603092312812805, "learning_rate": 1.8624407168334938e-06, "loss": 0.0004, "num_input_tokens_seen": 4695936, "step": 9550 }, { "epoch": 1.2610531872772865, "grad_norm": 18.329586029052734, "learning_rate": 1.8622074392363249e-06, "loss": 0.0927, "num_input_tokens_seen": 4698304, "step": 9555 }, { "epoch": 1.261713079055035, "grad_norm": 8.598894119262695, "learning_rate": 1.8619739786418707e-06, "loss": 0.0017, "num_input_tokens_seen": 4700608, "step": 9560 }, { "epoch": 1.2623729708327835, "grad_norm": 0.1886700987815857, "learning_rate": 1.8617403350996814e-06, "loss": 0.0491, "num_input_tokens_seen": 4702976, "step": 9565 }, { "epoch": 1.2630328626105318, "grad_norm": 19.052263259887695, "learning_rate": 1.861506508659346e-06, "loss": 0.1877, "num_input_tokens_seen": 4705408, "step": 9570 }, { "epoch": 1.2636927543882803, "grad_norm": 0.01895013637840748, "learning_rate": 1.861272499370493e-06, "loss": 0.1504, "num_input_tokens_seen": 4708032, "step": 9575 }, { "epoch": 1.2643526461660288, "grad_norm": 19.410493850708008, "learning_rate": 1.8610383072827887e-06, "loss": 0.073, "num_input_tokens_seen": 4710400, "step": 9580 }, { "epoch": 1.265012537943777, "grad_norm": 15.133025169372559, "learning_rate": 1.8608039324459388e-06, "loss": 0.0615, "num_input_tokens_seen": 4712960, "step": 9585 }, { "epoch": 1.2656724297215256, "grad_norm": 0.11856388300657272, "learning_rate": 1.8605693749096877e-06, "loss": 0.0543, "num_input_tokens_seen": 4715200, "step": 9590 }, { "epoch": 1.266332321499274, "grad_norm": 17.171472549438477, "learning_rate": 1.8603346347238185e-06, "loss": 0.1053, "num_input_tokens_seen": 4717568, "step": 9595 }, { "epoch": 1.2669922132770226, "grad_norm": 0.1262669414281845, "learning_rate": 1.8600997119381533e-06, "loss": 0.1185, "num_input_tokens_seen": 4719936, "step": 9600 }, { "epoch": 1.267652105054771, "grad_norm": 0.5233826637268066, "learning_rate": 1.8598646066025523e-06, "loss": 0.092, "num_input_tokens_seen": 4722368, "step": 9605 }, { "epoch": 1.2683119968325194, "grad_norm": 0.26132288575172424, "learning_rate": 1.8596293187669155e-06, "loss": 0.0026, "num_input_tokens_seen": 4724864, "step": 9610 }, { "epoch": 1.2689718886102679, "grad_norm": 0.10552530735731125, "learning_rate": 1.8593938484811806e-06, "loss": 0.0039, "num_input_tokens_seen": 4727424, "step": 9615 }, { "epoch": 1.2696317803880164, "grad_norm": 21.18337059020996, "learning_rate": 1.8591581957953245e-06, "loss": 0.0911, "num_input_tokens_seen": 4729600, "step": 9620 }, { "epoch": 1.2702916721657649, "grad_norm": 0.06999889761209488, "learning_rate": 1.8589223607593628e-06, "loss": 0.0008, "num_input_tokens_seen": 4732352, "step": 9625 }, { "epoch": 1.2709515639435134, "grad_norm": 0.03068052977323532, "learning_rate": 1.8586863434233502e-06, "loss": 0.0029, "num_input_tokens_seen": 4734848, "step": 9630 }, { "epoch": 1.2716114557212617, "grad_norm": 1.2916810512542725, "learning_rate": 1.8584501438373793e-06, "loss": 0.0696, "num_input_tokens_seen": 4737216, "step": 9635 }, { "epoch": 1.2722713474990102, "grad_norm": 17.11394691467285, "learning_rate": 1.8582137620515816e-06, "loss": 0.0958, "num_input_tokens_seen": 4739712, "step": 9640 }, { "epoch": 1.2729312392767587, "grad_norm": 0.03202284872531891, "learning_rate": 1.8579771981161277e-06, "loss": 0.2084, "num_input_tokens_seen": 4742144, "step": 9645 }, { "epoch": 1.273591131054507, "grad_norm": 0.006628343369811773, "learning_rate": 1.8577404520812262e-06, "loss": 0.0001, "num_input_tokens_seen": 4744832, "step": 9650 }, { "epoch": 1.2742510228322554, "grad_norm": 65.34660339355469, "learning_rate": 1.8575035239971255e-06, "loss": 0.0457, "num_input_tokens_seen": 4747584, "step": 9655 }, { "epoch": 1.274910914610004, "grad_norm": 0.11509735137224197, "learning_rate": 1.857266413914111e-06, "loss": 0.0005, "num_input_tokens_seen": 4750016, "step": 9660 }, { "epoch": 1.2755708063877524, "grad_norm": 0.03837760165333748, "learning_rate": 1.8570291218825082e-06, "loss": 0.0567, "num_input_tokens_seen": 4752576, "step": 9665 }, { "epoch": 1.276230698165501, "grad_norm": 0.053258758038282394, "learning_rate": 1.8567916479526802e-06, "loss": 0.0774, "num_input_tokens_seen": 4754752, "step": 9670 }, { "epoch": 1.2768905899432492, "grad_norm": 2.510385036468506, "learning_rate": 1.8565539921750295e-06, "loss": 0.0006, "num_input_tokens_seen": 4756864, "step": 9675 }, { "epoch": 1.2775504817209977, "grad_norm": 88.27272033691406, "learning_rate": 1.8563161545999965e-06, "loss": 0.0764, "num_input_tokens_seen": 4759424, "step": 9680 }, { "epoch": 1.2782103734987462, "grad_norm": 24.547399520874023, "learning_rate": 1.8560781352780607e-06, "loss": 0.2287, "num_input_tokens_seen": 4761792, "step": 9685 }, { "epoch": 1.2788702652764947, "grad_norm": 0.13503701984882355, "learning_rate": 1.8558399342597402e-06, "loss": 0.0725, "num_input_tokens_seen": 4764544, "step": 9690 }, { "epoch": 1.2795301570542432, "grad_norm": 0.033040594309568405, "learning_rate": 1.8556015515955907e-06, "loss": 0.0003, "num_input_tokens_seen": 4766912, "step": 9695 }, { "epoch": 1.2801900488319915, "grad_norm": 0.05509025976061821, "learning_rate": 1.8553629873362079e-06, "loss": 0.063, "num_input_tokens_seen": 4769280, "step": 9700 }, { "epoch": 1.28084994060974, "grad_norm": 0.16460861265659332, "learning_rate": 1.855124241532225e-06, "loss": 0.0511, "num_input_tokens_seen": 4772032, "step": 9705 }, { "epoch": 1.2815098323874885, "grad_norm": 0.01625911518931389, "learning_rate": 1.8548853142343142e-06, "loss": 0.0003, "num_input_tokens_seen": 4774400, "step": 9710 }, { "epoch": 1.2821697241652368, "grad_norm": 0.3575328290462494, "learning_rate": 1.854646205493186e-06, "loss": 0.2591, "num_input_tokens_seen": 4776640, "step": 9715 }, { "epoch": 1.2828296159429853, "grad_norm": 39.71146774291992, "learning_rate": 1.8544069153595896e-06, "loss": 0.0848, "num_input_tokens_seen": 4779008, "step": 9720 }, { "epoch": 1.2834895077207338, "grad_norm": 0.4594075679779053, "learning_rate": 1.8541674438843125e-06, "loss": 0.2256, "num_input_tokens_seen": 4781696, "step": 9725 }, { "epoch": 1.2841493994984823, "grad_norm": 0.4085526764392853, "learning_rate": 1.8539277911181809e-06, "loss": 0.0343, "num_input_tokens_seen": 4784192, "step": 9730 }, { "epoch": 1.2848092912762308, "grad_norm": 0.5087748765945435, "learning_rate": 1.8536879571120593e-06, "loss": 0.0027, "num_input_tokens_seen": 4786880, "step": 9735 }, { "epoch": 1.285469183053979, "grad_norm": 0.07352367043495178, "learning_rate": 1.8534479419168508e-06, "loss": 0.1961, "num_input_tokens_seen": 4789696, "step": 9740 }, { "epoch": 1.2861290748317276, "grad_norm": 0.043694186955690384, "learning_rate": 1.8532077455834964e-06, "loss": 0.0241, "num_input_tokens_seen": 4792384, "step": 9745 }, { "epoch": 1.286788966609476, "grad_norm": 37.857357025146484, "learning_rate": 1.8529673681629766e-06, "loss": 0.1954, "num_input_tokens_seen": 4794944, "step": 9750 }, { "epoch": 1.2874488583872246, "grad_norm": 1.7321579456329346, "learning_rate": 1.85272680970631e-06, "loss": 0.0025, "num_input_tokens_seen": 4797376, "step": 9755 }, { "epoch": 1.288108750164973, "grad_norm": 0.08831868320703506, "learning_rate": 1.8524860702645527e-06, "loss": 0.0051, "num_input_tokens_seen": 4799808, "step": 9760 }, { "epoch": 1.2887686419427213, "grad_norm": 0.02845185063779354, "learning_rate": 1.8522451498888004e-06, "loss": 0.0732, "num_input_tokens_seen": 4802560, "step": 9765 }, { "epoch": 1.2894285337204698, "grad_norm": 0.15058395266532898, "learning_rate": 1.8520040486301862e-06, "loss": 0.0006, "num_input_tokens_seen": 4804736, "step": 9770 }, { "epoch": 1.2900884254982183, "grad_norm": 19.97967529296875, "learning_rate": 1.8517627665398825e-06, "loss": 0.1509, "num_input_tokens_seen": 4807040, "step": 9775 }, { "epoch": 1.2907483172759666, "grad_norm": 0.08964036405086517, "learning_rate": 1.8515213036690996e-06, "loss": 0.0015, "num_input_tokens_seen": 4809152, "step": 9780 }, { "epoch": 1.2914082090537151, "grad_norm": 0.08650282025337219, "learning_rate": 1.8512796600690864e-06, "loss": 0.0002, "num_input_tokens_seen": 4811776, "step": 9785 }, { "epoch": 1.2920681008314636, "grad_norm": 0.010320809669792652, "learning_rate": 1.8510378357911297e-06, "loss": 0.0003, "num_input_tokens_seen": 4814272, "step": 9790 }, { "epoch": 1.2927279926092121, "grad_norm": 0.026830295100808144, "learning_rate": 1.8507958308865551e-06, "loss": 0.0535, "num_input_tokens_seen": 4816576, "step": 9795 }, { "epoch": 1.2933878843869606, "grad_norm": 0.5749787092208862, "learning_rate": 1.8505536454067264e-06, "loss": 0.0654, "num_input_tokens_seen": 4819200, "step": 9800 }, { "epoch": 1.294047776164709, "grad_norm": 0.1049313172698021, "learning_rate": 1.8503112794030456e-06, "loss": 0.134, "num_input_tokens_seen": 4821824, "step": 9805 }, { "epoch": 1.2947076679424574, "grad_norm": 0.16344867646694183, "learning_rate": 1.8500687329269532e-06, "loss": 0.0005, "num_input_tokens_seen": 4824576, "step": 9810 }, { "epoch": 1.295367559720206, "grad_norm": 0.026725659146904945, "learning_rate": 1.8498260060299282e-06, "loss": 0.0988, "num_input_tokens_seen": 4827136, "step": 9815 }, { "epoch": 1.2960274514979544, "grad_norm": 17.736068725585938, "learning_rate": 1.849583098763487e-06, "loss": 0.0933, "num_input_tokens_seen": 4829312, "step": 9820 }, { "epoch": 1.296687343275703, "grad_norm": 0.22278468310832977, "learning_rate": 1.8493400111791858e-06, "loss": 0.0507, "num_input_tokens_seen": 4831808, "step": 9825 }, { "epoch": 1.2973472350534512, "grad_norm": 0.05429788678884506, "learning_rate": 1.8490967433286172e-06, "loss": 0.1303, "num_input_tokens_seen": 4834048, "step": 9830 }, { "epoch": 1.2980071268311997, "grad_norm": 0.018094699829816818, "learning_rate": 1.8488532952634138e-06, "loss": 0.0016, "num_input_tokens_seen": 4836416, "step": 9835 }, { "epoch": 1.2986670186089482, "grad_norm": 3.234799385070801, "learning_rate": 1.8486096670352448e-06, "loss": 0.0792, "num_input_tokens_seen": 4838656, "step": 9840 }, { "epoch": 1.2993269103866965, "grad_norm": 118.24174499511719, "learning_rate": 1.8483658586958198e-06, "loss": 0.1515, "num_input_tokens_seen": 4841024, "step": 9845 }, { "epoch": 1.299986802164445, "grad_norm": 372.8785705566406, "learning_rate": 1.8481218702968845e-06, "loss": 0.0899, "num_input_tokens_seen": 4843520, "step": 9850 }, { "epoch": 1.3006466939421935, "grad_norm": 0.0978814959526062, "learning_rate": 1.8478777018902236e-06, "loss": 0.0714, "num_input_tokens_seen": 4846208, "step": 9855 }, { "epoch": 1.301306585719942, "grad_norm": 0.22887535393238068, "learning_rate": 1.8476333535276605e-06, "loss": 0.1439, "num_input_tokens_seen": 4848768, "step": 9860 }, { "epoch": 1.3019664774976905, "grad_norm": 0.10580222308635712, "learning_rate": 1.8473888252610563e-06, "loss": 0.0974, "num_input_tokens_seen": 4851264, "step": 9865 }, { "epoch": 1.3026263692754387, "grad_norm": 0.34106671810150146, "learning_rate": 1.8471441171423101e-06, "loss": 0.1057, "num_input_tokens_seen": 4853632, "step": 9870 }, { "epoch": 1.3032862610531872, "grad_norm": 0.170933797955513, "learning_rate": 1.8468992292233595e-06, "loss": 0.0011, "num_input_tokens_seen": 4856256, "step": 9875 }, { "epoch": 1.3039461528309357, "grad_norm": 0.6921259760856628, "learning_rate": 1.8466541615561804e-06, "loss": 0.076, "num_input_tokens_seen": 4858752, "step": 9880 }, { "epoch": 1.3046060446086842, "grad_norm": 0.012155439704656601, "learning_rate": 1.8464089141927866e-06, "loss": 0.0014, "num_input_tokens_seen": 4861248, "step": 9885 }, { "epoch": 1.3052659363864327, "grad_norm": 212.3350830078125, "learning_rate": 1.8461634871852298e-06, "loss": 0.2671, "num_input_tokens_seen": 4863744, "step": 9890 }, { "epoch": 1.305925828164181, "grad_norm": 0.13975022733211517, "learning_rate": 1.8459178805856003e-06, "loss": 0.0681, "num_input_tokens_seen": 4865984, "step": 9895 }, { "epoch": 1.3065857199419295, "grad_norm": 0.8244561553001404, "learning_rate": 1.8456720944460265e-06, "loss": 0.1544, "num_input_tokens_seen": 4868480, "step": 9900 }, { "epoch": 1.307245611719678, "grad_norm": 19.768590927124023, "learning_rate": 1.8454261288186741e-06, "loss": 0.1641, "num_input_tokens_seen": 4870976, "step": 9905 }, { "epoch": 1.3079055034974263, "grad_norm": 0.03448856994509697, "learning_rate": 1.8451799837557483e-06, "loss": 0.0584, "num_input_tokens_seen": 4873472, "step": 9910 }, { "epoch": 1.3085653952751748, "grad_norm": 0.015968943014740944, "learning_rate": 1.8449336593094914e-06, "loss": 0.0582, "num_input_tokens_seen": 4876160, "step": 9915 }, { "epoch": 1.3092252870529233, "grad_norm": 0.04296119883656502, "learning_rate": 1.8446871555321834e-06, "loss": 0.0677, "num_input_tokens_seen": 4878400, "step": 9920 }, { "epoch": 1.3098851788306718, "grad_norm": 2.142721652984619, "learning_rate": 1.8444404724761436e-06, "loss": 0.0026, "num_input_tokens_seen": 4881152, "step": 9925 }, { "epoch": 1.3105450706084203, "grad_norm": 0.272866427898407, "learning_rate": 1.8441936101937285e-06, "loss": 0.0665, "num_input_tokens_seen": 4883648, "step": 9930 }, { "epoch": 1.3112049623861686, "grad_norm": 0.026395170018076897, "learning_rate": 1.8439465687373328e-06, "loss": 0.1721, "num_input_tokens_seen": 4885760, "step": 9935 }, { "epoch": 1.311864854163917, "grad_norm": 0.05674809217453003, "learning_rate": 1.8436993481593891e-06, "loss": 0.0015, "num_input_tokens_seen": 4888384, "step": 9940 }, { "epoch": 1.3125247459416656, "grad_norm": 33.2212028503418, "learning_rate": 1.8434519485123685e-06, "loss": 0.084, "num_input_tokens_seen": 4890880, "step": 9945 }, { "epoch": 1.313184637719414, "grad_norm": 0.12552928924560547, "learning_rate": 1.8432043698487796e-06, "loss": 0.0374, "num_input_tokens_seen": 4893184, "step": 9950 }, { "epoch": 1.3138445294971626, "grad_norm": 0.028963031247258186, "learning_rate": 1.8429566122211693e-06, "loss": 0.0611, "num_input_tokens_seen": 4895552, "step": 9955 }, { "epoch": 1.3145044212749109, "grad_norm": 0.035612285137176514, "learning_rate": 1.8427086756821222e-06, "loss": 0.1239, "num_input_tokens_seen": 4897856, "step": 9960 }, { "epoch": 1.3151643130526594, "grad_norm": 0.13290604948997498, "learning_rate": 1.842460560284261e-06, "loss": 0.1224, "num_input_tokens_seen": 4900352, "step": 9965 }, { "epoch": 1.3158242048304079, "grad_norm": 0.07261032611131668, "learning_rate": 1.8422122660802466e-06, "loss": 0.0006, "num_input_tokens_seen": 4903040, "step": 9970 }, { "epoch": 1.3164840966081561, "grad_norm": 0.05626000091433525, "learning_rate": 1.8419637931227776e-06, "loss": 0.0633, "num_input_tokens_seen": 4905664, "step": 9975 }, { "epoch": 1.3171439883859046, "grad_norm": 1.0715250968933105, "learning_rate": 1.8417151414645904e-06, "loss": 0.0512, "num_input_tokens_seen": 4907840, "step": 9980 }, { "epoch": 1.3178038801636531, "grad_norm": 0.10115410387516022, "learning_rate": 1.84146631115846e-06, "loss": 0.0012, "num_input_tokens_seen": 4910144, "step": 9985 }, { "epoch": 1.3184637719414016, "grad_norm": 23.29815101623535, "learning_rate": 1.8412173022571979e-06, "loss": 0.1102, "num_input_tokens_seen": 4912640, "step": 9990 }, { "epoch": 1.3191236637191501, "grad_norm": 0.16725671291351318, "learning_rate": 1.8409681148136556e-06, "loss": 0.0006, "num_input_tokens_seen": 4914944, "step": 9995 }, { "epoch": 1.3197835554968984, "grad_norm": 0.17661815881729126, "learning_rate": 1.8407187488807203e-06, "loss": 0.0516, "num_input_tokens_seen": 4917568, "step": 10000 }, { "epoch": 1.320443447274647, "grad_norm": 0.01880503073334694, "learning_rate": 1.8404692045113185e-06, "loss": 0.0525, "num_input_tokens_seen": 4919680, "step": 10005 }, { "epoch": 1.3211033390523954, "grad_norm": 229.0442657470703, "learning_rate": 1.8402194817584147e-06, "loss": 0.0183, "num_input_tokens_seen": 4921856, "step": 10010 }, { "epoch": 1.321763230830144, "grad_norm": 0.14362648129463196, "learning_rate": 1.8399695806750098e-06, "loss": 0.0421, "num_input_tokens_seen": 4924288, "step": 10015 }, { "epoch": 1.3224231226078924, "grad_norm": 0.1044907420873642, "learning_rate": 1.8397195013141445e-06, "loss": 0.1288, "num_input_tokens_seen": 4926528, "step": 10020 }, { "epoch": 1.3230830143856407, "grad_norm": 1.369716763496399, "learning_rate": 1.8394692437288954e-06, "loss": 0.004, "num_input_tokens_seen": 4929344, "step": 10025 }, { "epoch": 1.3237429061633892, "grad_norm": 26.978471755981445, "learning_rate": 1.8392188079723784e-06, "loss": 0.0934, "num_input_tokens_seen": 4931776, "step": 10030 }, { "epoch": 1.3244027979411377, "grad_norm": 0.09075580537319183, "learning_rate": 1.8389681940977467e-06, "loss": 0.0003, "num_input_tokens_seen": 4934272, "step": 10035 }, { "epoch": 1.325062689718886, "grad_norm": 0.08495357632637024, "learning_rate": 1.838717402158191e-06, "loss": 0.4409, "num_input_tokens_seen": 4936960, "step": 10040 }, { "epoch": 1.3257225814966347, "grad_norm": 0.0169760100543499, "learning_rate": 1.83846643220694e-06, "loss": 0.2134, "num_input_tokens_seen": 4939776, "step": 10045 }, { "epoch": 1.326382473274383, "grad_norm": 0.16505898535251617, "learning_rate": 1.8382152842972607e-06, "loss": 0.0947, "num_input_tokens_seen": 4942208, "step": 10050 }, { "epoch": 1.3270423650521315, "grad_norm": 0.145982563495636, "learning_rate": 1.8379639584824572e-06, "loss": 0.0015, "num_input_tokens_seen": 4944448, "step": 10055 }, { "epoch": 1.32770225682988, "grad_norm": 25.01861572265625, "learning_rate": 1.8377124548158713e-06, "loss": 0.177, "num_input_tokens_seen": 4946816, "step": 10060 }, { "epoch": 1.3283621486076282, "grad_norm": 0.07205390930175781, "learning_rate": 1.8374607733508833e-06, "loss": 0.0229, "num_input_tokens_seen": 4949184, "step": 10065 }, { "epoch": 1.3290220403853767, "grad_norm": 0.14163658022880554, "learning_rate": 1.8372089141409108e-06, "loss": 0.1654, "num_input_tokens_seen": 4951616, "step": 10070 }, { "epoch": 1.3296819321631252, "grad_norm": 0.1708272397518158, "learning_rate": 1.8369568772394087e-06, "loss": 0.1656, "num_input_tokens_seen": 4954048, "step": 10075 }, { "epoch": 1.3303418239408737, "grad_norm": 0.29565057158470154, "learning_rate": 1.8367046626998702e-06, "loss": 0.1187, "num_input_tokens_seen": 4956160, "step": 10080 }, { "epoch": 1.3310017157186222, "grad_norm": 12.916511535644531, "learning_rate": 1.8364522705758257e-06, "loss": 0.1228, "num_input_tokens_seen": 4958528, "step": 10085 }, { "epoch": 1.3316616074963705, "grad_norm": 69.16001892089844, "learning_rate": 1.836199700920844e-06, "loss": 0.1143, "num_input_tokens_seen": 4960896, "step": 10090 }, { "epoch": 1.332321499274119, "grad_norm": 0.2891543209552765, "learning_rate": 1.8359469537885312e-06, "loss": 0.0022, "num_input_tokens_seen": 4963456, "step": 10095 }, { "epoch": 1.3329813910518675, "grad_norm": 0.31188878417015076, "learning_rate": 1.835694029232531e-06, "loss": 0.0887, "num_input_tokens_seen": 4965632, "step": 10100 }, { "epoch": 1.333641282829616, "grad_norm": 0.077885203063488, "learning_rate": 1.8354409273065247e-06, "loss": 0.1001, "num_input_tokens_seen": 4967936, "step": 10105 }, { "epoch": 1.3343011746073645, "grad_norm": 2.893744468688965, "learning_rate": 1.835187648064231e-06, "loss": 0.0025, "num_input_tokens_seen": 4970240, "step": 10110 }, { "epoch": 1.3349610663851128, "grad_norm": 0.04158975929021835, "learning_rate": 1.8349341915594073e-06, "loss": 0.001, "num_input_tokens_seen": 4972992, "step": 10115 }, { "epoch": 1.3356209581628613, "grad_norm": 124.41069793701172, "learning_rate": 1.8346805578458474e-06, "loss": 0.1337, "num_input_tokens_seen": 4975616, "step": 10120 }, { "epoch": 1.3362808499406098, "grad_norm": 0.10024034976959229, "learning_rate": 1.8344267469773835e-06, "loss": 0.0462, "num_input_tokens_seen": 4978112, "step": 10125 }, { "epoch": 1.336940741718358, "grad_norm": 0.6105455756187439, "learning_rate": 1.8341727590078847e-06, "loss": 0.0005, "num_input_tokens_seen": 4980352, "step": 10130 }, { "epoch": 1.3376006334961066, "grad_norm": 339.4839782714844, "learning_rate": 1.8339185939912589e-06, "loss": 0.0783, "num_input_tokens_seen": 4982656, "step": 10135 }, { "epoch": 1.338260525273855, "grad_norm": 0.013025536201894283, "learning_rate": 1.83366425198145e-06, "loss": 0.0077, "num_input_tokens_seen": 4985216, "step": 10140 }, { "epoch": 1.3389204170516036, "grad_norm": 0.046981412917375565, "learning_rate": 1.8334097330324405e-06, "loss": 0.0159, "num_input_tokens_seen": 4987904, "step": 10145 }, { "epoch": 1.339580308829352, "grad_norm": 0.009956144727766514, "learning_rate": 1.8331550371982503e-06, "loss": 0.0203, "num_input_tokens_seen": 4990400, "step": 10150 }, { "epoch": 1.3402402006071004, "grad_norm": 0.6339499354362488, "learning_rate": 1.8329001645329364e-06, "loss": 0.071, "num_input_tokens_seen": 4992960, "step": 10155 }, { "epoch": 1.3409000923848489, "grad_norm": 2.1077048778533936, "learning_rate": 1.8326451150905945e-06, "loss": 0.0032, "num_input_tokens_seen": 4995584, "step": 10160 }, { "epoch": 1.3415599841625974, "grad_norm": 0.00569473672658205, "learning_rate": 1.8323898889253562e-06, "loss": 0.1142, "num_input_tokens_seen": 4997952, "step": 10165 }, { "epoch": 1.3422198759403459, "grad_norm": 0.04009333997964859, "learning_rate": 1.8321344860913918e-06, "loss": 0.1238, "num_input_tokens_seen": 5000000, "step": 10170 }, { "epoch": 1.3428797677180944, "grad_norm": 0.04034736752510071, "learning_rate": 1.8318789066429083e-06, "loss": 0.0662, "num_input_tokens_seen": 5002688, "step": 10175 }, { "epoch": 1.3435396594958426, "grad_norm": 0.04818149283528328, "learning_rate": 1.831623150634151e-06, "loss": 0.0005, "num_input_tokens_seen": 5005184, "step": 10180 }, { "epoch": 1.3441995512735911, "grad_norm": 0.040782514959573746, "learning_rate": 1.8313672181194023e-06, "loss": 0.1373, "num_input_tokens_seen": 5007424, "step": 10185 }, { "epoch": 1.3448594430513396, "grad_norm": 22.4898681640625, "learning_rate": 1.8311111091529817e-06, "loss": 0.0557, "num_input_tokens_seen": 5010112, "step": 10190 }, { "epoch": 1.345519334829088, "grad_norm": 0.39273348450660706, "learning_rate": 1.8308548237892465e-06, "loss": 0.0594, "num_input_tokens_seen": 5012736, "step": 10195 }, { "epoch": 1.3461792266068364, "grad_norm": 19.32087516784668, "learning_rate": 1.8305983620825915e-06, "loss": 0.0539, "num_input_tokens_seen": 5015040, "step": 10200 }, { "epoch": 1.346839118384585, "grad_norm": 0.05848585441708565, "learning_rate": 1.8303417240874492e-06, "loss": 0.0573, "num_input_tokens_seen": 5017344, "step": 10205 }, { "epoch": 1.3474990101623334, "grad_norm": 0.03271415829658508, "learning_rate": 1.8300849098582886e-06, "loss": 0.0528, "num_input_tokens_seen": 5019776, "step": 10210 }, { "epoch": 1.348158901940082, "grad_norm": 0.1817331165075302, "learning_rate": 1.829827919449617e-06, "loss": 0.0395, "num_input_tokens_seen": 5022272, "step": 10215 }, { "epoch": 1.3488187937178302, "grad_norm": 31.19110679626465, "learning_rate": 1.8295707529159783e-06, "loss": 0.1797, "num_input_tokens_seen": 5024768, "step": 10220 }, { "epoch": 1.3494786854955787, "grad_norm": 0.15519972145557404, "learning_rate": 1.829313410311955e-06, "loss": 0.1089, "num_input_tokens_seen": 5027072, "step": 10225 }, { "epoch": 1.3501385772733272, "grad_norm": 57.0513801574707, "learning_rate": 1.8290558916921656e-06, "loss": 0.1722, "num_input_tokens_seen": 5029568, "step": 10230 }, { "epoch": 1.3507984690510757, "grad_norm": 0.490141361951828, "learning_rate": 1.8287981971112668e-06, "loss": 0.0379, "num_input_tokens_seen": 5032256, "step": 10235 }, { "epoch": 1.3514583608288242, "grad_norm": 0.43738701939582825, "learning_rate": 1.8285403266239521e-06, "loss": 0.0258, "num_input_tokens_seen": 5034944, "step": 10240 }, { "epoch": 1.3521182526065725, "grad_norm": 18.965534210205078, "learning_rate": 1.8282822802849531e-06, "loss": 0.2289, "num_input_tokens_seen": 5037440, "step": 10245 }, { "epoch": 1.352778144384321, "grad_norm": 0.20755289494991302, "learning_rate": 1.8280240581490381e-06, "loss": 0.0005, "num_input_tokens_seen": 5040128, "step": 10250 }, { "epoch": 1.3534380361620695, "grad_norm": 0.1373784840106964, "learning_rate": 1.8277656602710127e-06, "loss": 0.0257, "num_input_tokens_seen": 5042624, "step": 10255 }, { "epoch": 1.3540979279398178, "grad_norm": 0.5778612494468689, "learning_rate": 1.8275070867057203e-06, "loss": 0.0817, "num_input_tokens_seen": 5044928, "step": 10260 }, { "epoch": 1.3547578197175663, "grad_norm": 0.06566134095191956, "learning_rate": 1.827248337508041e-06, "loss": 0.0005, "num_input_tokens_seen": 5047488, "step": 10265 }, { "epoch": 1.3554177114953148, "grad_norm": 0.06912878155708313, "learning_rate": 1.8269894127328925e-06, "loss": 0.0283, "num_input_tokens_seen": 5050368, "step": 10270 }, { "epoch": 1.3560776032730633, "grad_norm": 0.05937294289469719, "learning_rate": 1.8267303124352295e-06, "loss": 0.0452, "num_input_tokens_seen": 5052736, "step": 10275 }, { "epoch": 1.3567374950508118, "grad_norm": 16.943443298339844, "learning_rate": 1.826471036670045e-06, "loss": 0.1482, "num_input_tokens_seen": 5055168, "step": 10280 }, { "epoch": 1.35739738682856, "grad_norm": 0.060737937688827515, "learning_rate": 1.8262115854923673e-06, "loss": 0.0281, "num_input_tokens_seen": 5057664, "step": 10285 }, { "epoch": 1.3580572786063085, "grad_norm": 0.23668773472309113, "learning_rate": 1.8259519589572637e-06, "loss": 0.1029, "num_input_tokens_seen": 5060160, "step": 10290 }, { "epoch": 1.358717170384057, "grad_norm": 0.045837774872779846, "learning_rate": 1.8256921571198376e-06, "loss": 0.0132, "num_input_tokens_seen": 5062912, "step": 10295 }, { "epoch": 1.3593770621618055, "grad_norm": 0.024858810007572174, "learning_rate": 1.8254321800352308e-06, "loss": 0.0083, "num_input_tokens_seen": 5065216, "step": 10300 }, { "epoch": 1.360036953939554, "grad_norm": 116.51515197753906, "learning_rate": 1.8251720277586209e-06, "loss": 0.0474, "num_input_tokens_seen": 5067456, "step": 10305 }, { "epoch": 1.3606968457173023, "grad_norm": 71.86284637451172, "learning_rate": 1.8249117003452233e-06, "loss": 0.2756, "num_input_tokens_seen": 5069760, "step": 10310 }, { "epoch": 1.3613567374950508, "grad_norm": 0.07100139558315277, "learning_rate": 1.8246511978502912e-06, "loss": 0.1271, "num_input_tokens_seen": 5072320, "step": 10315 }, { "epoch": 1.3620166292727993, "grad_norm": 0.04098708555102348, "learning_rate": 1.8243905203291136e-06, "loss": 0.0017, "num_input_tokens_seen": 5074816, "step": 10320 }, { "epoch": 1.3626765210505476, "grad_norm": 0.02675493434071541, "learning_rate": 1.8241296678370184e-06, "loss": 0.0858, "num_input_tokens_seen": 5077312, "step": 10325 }, { "epoch": 1.363336412828296, "grad_norm": 0.03876377269625664, "learning_rate": 1.8238686404293686e-06, "loss": 0.0011, "num_input_tokens_seen": 5079616, "step": 10330 }, { "epoch": 1.3639963046060446, "grad_norm": 0.04434245079755783, "learning_rate": 1.8236074381615661e-06, "loss": 0.3048, "num_input_tokens_seen": 5081664, "step": 10335 }, { "epoch": 1.364656196383793, "grad_norm": 0.2147568315267563, "learning_rate": 1.823346061089049e-06, "loss": 0.0004, "num_input_tokens_seen": 5084224, "step": 10340 }, { "epoch": 1.3653160881615416, "grad_norm": 6.88644552230835, "learning_rate": 1.8230845092672925e-06, "loss": 0.0951, "num_input_tokens_seen": 5086528, "step": 10345 }, { "epoch": 1.3659759799392899, "grad_norm": 17.738170623779297, "learning_rate": 1.8228227827518093e-06, "loss": 0.167, "num_input_tokens_seen": 5088960, "step": 10350 }, { "epoch": 1.3666358717170384, "grad_norm": 85.15935516357422, "learning_rate": 1.8225608815981488e-06, "loss": 0.0787, "num_input_tokens_seen": 5091392, "step": 10355 }, { "epoch": 1.3672957634947869, "grad_norm": 0.06308019161224365, "learning_rate": 1.8222988058618976e-06, "loss": 0.0948, "num_input_tokens_seen": 5093888, "step": 10360 }, { "epoch": 1.3679556552725354, "grad_norm": 17.740474700927734, "learning_rate": 1.8220365555986797e-06, "loss": 0.15, "num_input_tokens_seen": 5096256, "step": 10365 }, { "epoch": 1.3686155470502839, "grad_norm": 0.6935084462165833, "learning_rate": 1.8217741308641553e-06, "loss": 0.0489, "num_input_tokens_seen": 5098816, "step": 10370 }, { "epoch": 1.3692754388280322, "grad_norm": 0.2454441636800766, "learning_rate": 1.8215115317140226e-06, "loss": 0.0487, "num_input_tokens_seen": 5101248, "step": 10375 }, { "epoch": 1.3699353306057807, "grad_norm": 1.8006916046142578, "learning_rate": 1.8212487582040164e-06, "loss": 0.0838, "num_input_tokens_seen": 5103488, "step": 10380 }, { "epoch": 1.3705952223835292, "grad_norm": 0.09306250512599945, "learning_rate": 1.8209858103899081e-06, "loss": 0.2107, "num_input_tokens_seen": 5105920, "step": 10385 }, { "epoch": 1.3712551141612774, "grad_norm": 0.3448830842971802, "learning_rate": 1.8207226883275067e-06, "loss": 0.001, "num_input_tokens_seen": 5108352, "step": 10390 }, { "epoch": 1.371915005939026, "grad_norm": 0.08386199176311493, "learning_rate": 1.820459392072658e-06, "loss": 0.1689, "num_input_tokens_seen": 5110784, "step": 10395 }, { "epoch": 1.3725748977167744, "grad_norm": 0.41205623745918274, "learning_rate": 1.8201959216812443e-06, "loss": 0.121, "num_input_tokens_seen": 5113344, "step": 10400 }, { "epoch": 1.373234789494523, "grad_norm": 0.1537424772977829, "learning_rate": 1.8199322772091858e-06, "loss": 0.0541, "num_input_tokens_seen": 5115712, "step": 10405 }, { "epoch": 1.3738946812722714, "grad_norm": 15.446474075317383, "learning_rate": 1.819668458712439e-06, "loss": 0.0519, "num_input_tokens_seen": 5117952, "step": 10410 }, { "epoch": 1.3745545730500197, "grad_norm": 0.19189509749412537, "learning_rate": 1.8194044662469973e-06, "loss": 0.0012, "num_input_tokens_seen": 5120128, "step": 10415 }, { "epoch": 1.3752144648277682, "grad_norm": 1.3778020143508911, "learning_rate": 1.8191402998688913e-06, "loss": 0.0045, "num_input_tokens_seen": 5122432, "step": 10420 }, { "epoch": 1.3758743566055167, "grad_norm": 0.09211282432079315, "learning_rate": 1.8188759596341888e-06, "loss": 0.0804, "num_input_tokens_seen": 5125056, "step": 10425 }, { "epoch": 1.3765342483832652, "grad_norm": 0.12644384801387787, "learning_rate": 1.8186114455989933e-06, "loss": 0.0818, "num_input_tokens_seen": 5127424, "step": 10430 }, { "epoch": 1.3771941401610137, "grad_norm": 0.07302141934633255, "learning_rate": 1.8183467578194467e-06, "loss": 0.0692, "num_input_tokens_seen": 5129792, "step": 10435 }, { "epoch": 1.377854031938762, "grad_norm": 0.0963432714343071, "learning_rate": 1.8180818963517264e-06, "loss": 0.1073, "num_input_tokens_seen": 5132032, "step": 10440 }, { "epoch": 1.3785139237165105, "grad_norm": 0.046357661485672, "learning_rate": 1.8178168612520478e-06, "loss": 0.0095, "num_input_tokens_seen": 5134400, "step": 10445 }, { "epoch": 1.379173815494259, "grad_norm": 0.05055604875087738, "learning_rate": 1.8175516525766627e-06, "loss": 0.0715, "num_input_tokens_seen": 5136640, "step": 10450 }, { "epoch": 1.3798337072720073, "grad_norm": 17.73884391784668, "learning_rate": 1.8172862703818593e-06, "loss": 0.1421, "num_input_tokens_seen": 5139136, "step": 10455 }, { "epoch": 1.3804935990497558, "grad_norm": 0.03119855374097824, "learning_rate": 1.8170207147239636e-06, "loss": 0.0011, "num_input_tokens_seen": 5141632, "step": 10460 }, { "epoch": 1.3811534908275043, "grad_norm": 16.85359764099121, "learning_rate": 1.8167549856593374e-06, "loss": 0.0696, "num_input_tokens_seen": 5144320, "step": 10465 }, { "epoch": 1.3818133826052528, "grad_norm": 50.230125427246094, "learning_rate": 1.81648908324438e-06, "loss": 0.2092, "num_input_tokens_seen": 5146880, "step": 10470 }, { "epoch": 1.3824732743830013, "grad_norm": 0.6590256690979004, "learning_rate": 1.8162230075355277e-06, "loss": 0.0023, "num_input_tokens_seen": 5149632, "step": 10475 }, { "epoch": 1.3831331661607495, "grad_norm": 0.7845008969306946, "learning_rate": 1.8159567585892521e-06, "loss": 0.0611, "num_input_tokens_seen": 5151936, "step": 10480 }, { "epoch": 1.383793057938498, "grad_norm": 18.889646530151367, "learning_rate": 1.8156903364620632e-06, "loss": 0.2547, "num_input_tokens_seen": 5154368, "step": 10485 }, { "epoch": 1.3844529497162466, "grad_norm": 2.00492525100708, "learning_rate": 1.8154237412105074e-06, "loss": 0.0018, "num_input_tokens_seen": 5156736, "step": 10490 }, { "epoch": 1.385112841493995, "grad_norm": 24.936811447143555, "learning_rate": 1.8151569728911672e-06, "loss": 0.203, "num_input_tokens_seen": 5159104, "step": 10495 }, { "epoch": 1.3857727332717436, "grad_norm": 0.39635398983955383, "learning_rate": 1.8148900315606625e-06, "loss": 0.1597, "num_input_tokens_seen": 5161472, "step": 10500 }, { "epoch": 1.3864326250494918, "grad_norm": 0.1377970278263092, "learning_rate": 1.8146229172756495e-06, "loss": 0.0015, "num_input_tokens_seen": 5163904, "step": 10505 }, { "epoch": 1.3870925168272403, "grad_norm": 0.060544900596141815, "learning_rate": 1.8143556300928214e-06, "loss": 0.0844, "num_input_tokens_seen": 5166464, "step": 10510 }, { "epoch": 1.3877524086049888, "grad_norm": 0.048183392733335495, "learning_rate": 1.814088170068908e-06, "loss": 0.0516, "num_input_tokens_seen": 5168704, "step": 10515 }, { "epoch": 1.388412300382737, "grad_norm": 0.13653618097305298, "learning_rate": 1.8138205372606756e-06, "loss": 0.0833, "num_input_tokens_seen": 5171200, "step": 10520 }, { "epoch": 1.3890721921604856, "grad_norm": 0.6845338344573975, "learning_rate": 1.8135527317249273e-06, "loss": 0.0015, "num_input_tokens_seen": 5173504, "step": 10525 }, { "epoch": 1.389732083938234, "grad_norm": 32.81275177001953, "learning_rate": 1.8132847535185029e-06, "loss": 0.0479, "num_input_tokens_seen": 5176064, "step": 10530 }, { "epoch": 1.3903919757159826, "grad_norm": 0.12121167778968811, "learning_rate": 1.8130166026982795e-06, "loss": 0.0021, "num_input_tokens_seen": 5178816, "step": 10535 }, { "epoch": 1.391051867493731, "grad_norm": 0.03380153700709343, "learning_rate": 1.8127482793211688e-06, "loss": 0.0802, "num_input_tokens_seen": 5181248, "step": 10540 }, { "epoch": 1.3917117592714794, "grad_norm": 0.0383787527680397, "learning_rate": 1.8124797834441217e-06, "loss": 0.0009, "num_input_tokens_seen": 5183552, "step": 10545 }, { "epoch": 1.3923716510492279, "grad_norm": 0.15044409036636353, "learning_rate": 1.812211115124124e-06, "loss": 0.0769, "num_input_tokens_seen": 5185728, "step": 10550 }, { "epoch": 1.3930315428269764, "grad_norm": 0.009135999716818333, "learning_rate": 1.8119422744181984e-06, "loss": 0.0521, "num_input_tokens_seen": 5188224, "step": 10555 }, { "epoch": 1.3936914346047249, "grad_norm": 23.88149070739746, "learning_rate": 1.8116732613834053e-06, "loss": 0.1086, "num_input_tokens_seen": 5191104, "step": 10560 }, { "epoch": 1.3943513263824734, "grad_norm": 0.011752346530556679, "learning_rate": 1.81140407607684e-06, "loss": 0.1069, "num_input_tokens_seen": 5193600, "step": 10565 }, { "epoch": 1.3950112181602217, "grad_norm": 0.059345416724681854, "learning_rate": 1.8111347185556348e-06, "loss": 0.0569, "num_input_tokens_seen": 5196032, "step": 10570 }, { "epoch": 1.3956711099379702, "grad_norm": 0.07716865092515945, "learning_rate": 1.8108651888769595e-06, "loss": 0.0003, "num_input_tokens_seen": 5198656, "step": 10575 }, { "epoch": 1.3963310017157187, "grad_norm": 0.009854006581008434, "learning_rate": 1.8105954870980198e-06, "loss": 0.146, "num_input_tokens_seen": 5200960, "step": 10580 }, { "epoch": 1.396990893493467, "grad_norm": 0.023813385516405106, "learning_rate": 1.810325613276058e-06, "loss": 0.0001, "num_input_tokens_seen": 5203520, "step": 10585 }, { "epoch": 1.3976507852712154, "grad_norm": 0.16299042105674744, "learning_rate": 1.8100555674683524e-06, "loss": 0.0006, "num_input_tokens_seen": 5206144, "step": 10590 }, { "epoch": 1.398310677048964, "grad_norm": 0.0172832403331995, "learning_rate": 1.8097853497322188e-06, "loss": 0.0002, "num_input_tokens_seen": 5208768, "step": 10595 }, { "epoch": 1.3989705688267124, "grad_norm": 0.24835462868213654, "learning_rate": 1.8095149601250088e-06, "loss": 0.0942, "num_input_tokens_seen": 5211136, "step": 10600 }, { "epoch": 1.399630460604461, "grad_norm": 0.06975623965263367, "learning_rate": 1.8092443987041104e-06, "loss": 0.066, "num_input_tokens_seen": 5213504, "step": 10605 }, { "epoch": 1.4002903523822092, "grad_norm": 0.04890378192067146, "learning_rate": 1.8089736655269486e-06, "loss": 0.0834, "num_input_tokens_seen": 5216000, "step": 10610 }, { "epoch": 1.4009502441599577, "grad_norm": 0.687415599822998, "learning_rate": 1.8087027606509842e-06, "loss": 0.14, "num_input_tokens_seen": 5218688, "step": 10615 }, { "epoch": 1.4016101359377062, "grad_norm": 0.25652775168418884, "learning_rate": 1.808431684133715e-06, "loss": 0.0022, "num_input_tokens_seen": 5221440, "step": 10620 }, { "epoch": 1.4022700277154547, "grad_norm": 1.2399516105651855, "learning_rate": 1.8081604360326753e-06, "loss": 0.1496, "num_input_tokens_seen": 5223616, "step": 10625 }, { "epoch": 1.4029299194932032, "grad_norm": 0.3370029330253601, "learning_rate": 1.807889016405435e-06, "loss": 0.0769, "num_input_tokens_seen": 5226176, "step": 10630 }, { "epoch": 1.4035898112709515, "grad_norm": 0.04719403013586998, "learning_rate": 1.8076174253096014e-06, "loss": 0.0013, "num_input_tokens_seen": 5228480, "step": 10635 }, { "epoch": 1.4042497030487, "grad_norm": 0.015282729640603065, "learning_rate": 1.8073456628028177e-06, "loss": 0.1115, "num_input_tokens_seen": 5230912, "step": 10640 }, { "epoch": 1.4049095948264485, "grad_norm": 21.17472267150879, "learning_rate": 1.8070737289427631e-06, "loss": 0.0883, "num_input_tokens_seen": 5233536, "step": 10645 }, { "epoch": 1.4055694866041968, "grad_norm": 0.05179164931178093, "learning_rate": 1.8068016237871541e-06, "loss": 0.0003, "num_input_tokens_seen": 5236096, "step": 10650 }, { "epoch": 1.4062293783819453, "grad_norm": 0.01727340929210186, "learning_rate": 1.8065293473937429e-06, "loss": 0.1611, "num_input_tokens_seen": 5238464, "step": 10655 }, { "epoch": 1.4068892701596938, "grad_norm": 87.70179748535156, "learning_rate": 1.806256899820318e-06, "loss": 0.3064, "num_input_tokens_seen": 5241088, "step": 10660 }, { "epoch": 1.4075491619374423, "grad_norm": 0.25517573952674866, "learning_rate": 1.8059842811247048e-06, "loss": 0.2462, "num_input_tokens_seen": 5243584, "step": 10665 }, { "epoch": 1.4082090537151908, "grad_norm": 0.09056201577186584, "learning_rate": 1.805711491364764e-06, "loss": 0.0303, "num_input_tokens_seen": 5246016, "step": 10670 }, { "epoch": 1.408868945492939, "grad_norm": 0.1641373485326767, "learning_rate": 1.8054385305983942e-06, "loss": 0.0026, "num_input_tokens_seen": 5248192, "step": 10675 }, { "epoch": 1.4095288372706876, "grad_norm": 107.500732421875, "learning_rate": 1.8051653988835284e-06, "loss": 0.1616, "num_input_tokens_seen": 5250752, "step": 10680 }, { "epoch": 1.410188729048436, "grad_norm": 0.016604578122496605, "learning_rate": 1.8048920962781372e-06, "loss": 0.1854, "num_input_tokens_seen": 5253120, "step": 10685 }, { "epoch": 1.4108486208261846, "grad_norm": 0.2926078736782074, "learning_rate": 1.8046186228402273e-06, "loss": 0.0559, "num_input_tokens_seen": 5255808, "step": 10690 }, { "epoch": 1.411508512603933, "grad_norm": 0.06410674005746841, "learning_rate": 1.8043449786278413e-06, "loss": 0.0009, "num_input_tokens_seen": 5258112, "step": 10695 }, { "epoch": 1.4121684043816813, "grad_norm": 20.56272315979004, "learning_rate": 1.8040711636990581e-06, "loss": 0.0902, "num_input_tokens_seen": 5260800, "step": 10700 }, { "epoch": 1.4128282961594298, "grad_norm": 0.3491075336933136, "learning_rate": 1.8037971781119931e-06, "loss": 0.0008, "num_input_tokens_seen": 5263104, "step": 10705 }, { "epoch": 1.4134881879371783, "grad_norm": 0.20114921033382416, "learning_rate": 1.8035230219247977e-06, "loss": 0.204, "num_input_tokens_seen": 5265472, "step": 10710 }, { "epoch": 1.4141480797149266, "grad_norm": 14.344511032104492, "learning_rate": 1.8032486951956596e-06, "loss": 0.0555, "num_input_tokens_seen": 5268160, "step": 10715 }, { "epoch": 1.4148079714926751, "grad_norm": 25.318134307861328, "learning_rate": 1.8029741979828026e-06, "loss": 0.115, "num_input_tokens_seen": 5270400, "step": 10720 }, { "epoch": 1.4154678632704236, "grad_norm": 0.02228490076959133, "learning_rate": 1.8026995303444867e-06, "loss": 0.0623, "num_input_tokens_seen": 5272768, "step": 10725 }, { "epoch": 1.4161277550481721, "grad_norm": 0.05160384625196457, "learning_rate": 1.802424692339008e-06, "loss": 0.0005, "num_input_tokens_seen": 5275584, "step": 10730 }, { "epoch": 1.4167876468259206, "grad_norm": 0.05498448386788368, "learning_rate": 1.8021496840246994e-06, "loss": 0.0371, "num_input_tokens_seen": 5277824, "step": 10735 }, { "epoch": 1.417447538603669, "grad_norm": 0.020905766636133194, "learning_rate": 1.8018745054599292e-06, "loss": 0.0004, "num_input_tokens_seen": 5280512, "step": 10740 }, { "epoch": 1.4181074303814174, "grad_norm": 0.14113740622997284, "learning_rate": 1.8015991567031015e-06, "loss": 0.0006, "num_input_tokens_seen": 5283136, "step": 10745 }, { "epoch": 1.418767322159166, "grad_norm": 0.9651300311088562, "learning_rate": 1.8013236378126577e-06, "loss": 0.0802, "num_input_tokens_seen": 5285568, "step": 10750 }, { "epoch": 1.4194272139369144, "grad_norm": 0.0940876454114914, "learning_rate": 1.8010479488470743e-06, "loss": 0.0573, "num_input_tokens_seen": 5287936, "step": 10755 }, { "epoch": 1.420087105714663, "grad_norm": 0.052880752831697464, "learning_rate": 1.8007720898648645e-06, "loss": 0.0006, "num_input_tokens_seen": 5289984, "step": 10760 }, { "epoch": 1.4207469974924112, "grad_norm": 13.744636535644531, "learning_rate": 1.8004960609245778e-06, "loss": 0.1727, "num_input_tokens_seen": 5292352, "step": 10765 }, { "epoch": 1.4214068892701597, "grad_norm": 0.0709216445684433, "learning_rate": 1.8002198620847988e-06, "loss": 0.0008, "num_input_tokens_seen": 5294720, "step": 10770 }, { "epoch": 1.4220667810479082, "grad_norm": 0.01790332980453968, "learning_rate": 1.7999434934041485e-06, "loss": 0.0007, "num_input_tokens_seen": 5297024, "step": 10775 }, { "epoch": 1.4227266728256565, "grad_norm": 0.0645064190030098, "learning_rate": 1.7996669549412847e-06, "loss": 0.0005, "num_input_tokens_seen": 5299584, "step": 10780 }, { "epoch": 1.4233865646034052, "grad_norm": 0.03778925910592079, "learning_rate": 1.7993902467549002e-06, "loss": 0.1082, "num_input_tokens_seen": 5301888, "step": 10785 }, { "epoch": 1.4240464563811535, "grad_norm": 0.034117791801691055, "learning_rate": 1.7991133689037247e-06, "loss": 0.1271, "num_input_tokens_seen": 5304256, "step": 10790 }, { "epoch": 1.424706348158902, "grad_norm": 23.118024826049805, "learning_rate": 1.7988363214465233e-06, "loss": 0.1573, "num_input_tokens_seen": 5306688, "step": 10795 }, { "epoch": 1.4253662399366505, "grad_norm": 0.07332354784011841, "learning_rate": 1.7985591044420975e-06, "loss": 0.0624, "num_input_tokens_seen": 5309248, "step": 10800 }, { "epoch": 1.4260261317143987, "grad_norm": 0.08144375681877136, "learning_rate": 1.7982817179492847e-06, "loss": 0.001, "num_input_tokens_seen": 5311552, "step": 10805 }, { "epoch": 1.4266860234921472, "grad_norm": 0.09295511990785599, "learning_rate": 1.7980041620269577e-06, "loss": 0.0589, "num_input_tokens_seen": 5314048, "step": 10810 }, { "epoch": 1.4273459152698957, "grad_norm": 0.024405941367149353, "learning_rate": 1.7977264367340262e-06, "loss": 0.0665, "num_input_tokens_seen": 5316480, "step": 10815 }, { "epoch": 1.4280058070476442, "grad_norm": 0.023482978343963623, "learning_rate": 1.7974485421294347e-06, "loss": 0.0012, "num_input_tokens_seen": 5318720, "step": 10820 }, { "epoch": 1.4286656988253927, "grad_norm": 0.24007222056388855, "learning_rate": 1.7971704782721652e-06, "loss": 0.0552, "num_input_tokens_seen": 5321344, "step": 10825 }, { "epoch": 1.429325590603141, "grad_norm": 0.02005026862025261, "learning_rate": 1.7968922452212342e-06, "loss": 0.1752, "num_input_tokens_seen": 5323584, "step": 10830 }, { "epoch": 1.4299854823808895, "grad_norm": 53.96958923339844, "learning_rate": 1.796613843035695e-06, "loss": 0.2167, "num_input_tokens_seen": 5326208, "step": 10835 }, { "epoch": 1.430645374158638, "grad_norm": 28.23914909362793, "learning_rate": 1.796335271774636e-06, "loss": 0.0647, "num_input_tokens_seen": 5328768, "step": 10840 }, { "epoch": 1.4313052659363863, "grad_norm": 13.797289848327637, "learning_rate": 1.7960565314971823e-06, "loss": 0.1761, "num_input_tokens_seen": 5331264, "step": 10845 }, { "epoch": 1.431965157714135, "grad_norm": 0.43690750002861023, "learning_rate": 1.7957776222624946e-06, "loss": 0.0024, "num_input_tokens_seen": 5333632, "step": 10850 }, { "epoch": 1.4326250494918833, "grad_norm": 0.10744563490152359, "learning_rate": 1.7954985441297684e-06, "loss": 0.001, "num_input_tokens_seen": 5336192, "step": 10855 }, { "epoch": 1.4332849412696318, "grad_norm": 0.3047581613063812, "learning_rate": 1.7952192971582374e-06, "loss": 0.0111, "num_input_tokens_seen": 5338496, "step": 10860 }, { "epoch": 1.4339448330473803, "grad_norm": 26.489316940307617, "learning_rate": 1.794939881407169e-06, "loss": 0.0531, "num_input_tokens_seen": 5340992, "step": 10865 }, { "epoch": 1.4346047248251286, "grad_norm": 0.04672485962510109, "learning_rate": 1.7946602969358673e-06, "loss": 0.0015, "num_input_tokens_seen": 5343552, "step": 10870 }, { "epoch": 1.435264616602877, "grad_norm": 0.5452134609222412, "learning_rate": 1.7943805438036718e-06, "loss": 0.0022, "num_input_tokens_seen": 5346176, "step": 10875 }, { "epoch": 1.4359245083806256, "grad_norm": 0.16802646219730377, "learning_rate": 1.7941006220699588e-06, "loss": 0.0216, "num_input_tokens_seen": 5348800, "step": 10880 }, { "epoch": 1.436584400158374, "grad_norm": 0.010055284947156906, "learning_rate": 1.7938205317941386e-06, "loss": 0.0736, "num_input_tokens_seen": 5351424, "step": 10885 }, { "epoch": 1.4372442919361226, "grad_norm": 0.03935292735695839, "learning_rate": 1.7935402730356594e-06, "loss": 0.0003, "num_input_tokens_seen": 5354048, "step": 10890 }, { "epoch": 1.4379041837138709, "grad_norm": 0.11049910634756088, "learning_rate": 1.7932598458540036e-06, "loss": 0.0753, "num_input_tokens_seen": 5356416, "step": 10895 }, { "epoch": 1.4385640754916194, "grad_norm": 0.021031470969319344, "learning_rate": 1.7929792503086897e-06, "loss": 0.03, "num_input_tokens_seen": 5358848, "step": 10900 }, { "epoch": 1.4392239672693679, "grad_norm": 0.01888999529182911, "learning_rate": 1.792698486459272e-06, "loss": 0.0655, "num_input_tokens_seen": 5361344, "step": 10905 }, { "epoch": 1.4398838590471164, "grad_norm": 0.010403584688901901, "learning_rate": 1.7924175543653411e-06, "loss": 0.0519, "num_input_tokens_seen": 5363904, "step": 10910 }, { "epoch": 1.4405437508248649, "grad_norm": 11.720383644104004, "learning_rate": 1.7921364540865224e-06, "loss": 0.0833, "num_input_tokens_seen": 5366144, "step": 10915 }, { "epoch": 1.4412036426026131, "grad_norm": 0.011344925500452518, "learning_rate": 1.7918551856824776e-06, "loss": 0.0007, "num_input_tokens_seen": 5368448, "step": 10920 }, { "epoch": 1.4418635343803616, "grad_norm": 0.036906708031892776, "learning_rate": 1.7915737492129037e-06, "loss": 0.1, "num_input_tokens_seen": 5370624, "step": 10925 }, { "epoch": 1.4425234261581101, "grad_norm": 198.46531677246094, "learning_rate": 1.7912921447375338e-06, "loss": 0.0986, "num_input_tokens_seen": 5373376, "step": 10930 }, { "epoch": 1.4431833179358584, "grad_norm": 48.49536895751953, "learning_rate": 1.7910103723161362e-06, "loss": 0.1395, "num_input_tokens_seen": 5376064, "step": 10935 }, { "epoch": 1.443843209713607, "grad_norm": 0.03592463955283165, "learning_rate": 1.7907284320085153e-06, "loss": 0.0167, "num_input_tokens_seen": 5378624, "step": 10940 }, { "epoch": 1.4445031014913554, "grad_norm": 19.094186782836914, "learning_rate": 1.7904463238745105e-06, "loss": 0.1895, "num_input_tokens_seen": 5381312, "step": 10945 }, { "epoch": 1.445162993269104, "grad_norm": 0.653519332408905, "learning_rate": 1.7901640479739974e-06, "loss": 0.1904, "num_input_tokens_seen": 5383616, "step": 10950 }, { "epoch": 1.4458228850468524, "grad_norm": 1.8942596912384033, "learning_rate": 1.789881604366887e-06, "loss": 0.1212, "num_input_tokens_seen": 5386368, "step": 10955 }, { "epoch": 1.4464827768246007, "grad_norm": 0.04864390566945076, "learning_rate": 1.7895989931131262e-06, "loss": 0.1056, "num_input_tokens_seen": 5388736, "step": 10960 }, { "epoch": 1.4471426686023492, "grad_norm": 0.1541789174079895, "learning_rate": 1.7893162142726967e-06, "loss": 0.0437, "num_input_tokens_seen": 5391232, "step": 10965 }, { "epoch": 1.4478025603800977, "grad_norm": 0.09908580034971237, "learning_rate": 1.7890332679056165e-06, "loss": 0.0009, "num_input_tokens_seen": 5393792, "step": 10970 }, { "epoch": 1.4484624521578462, "grad_norm": 31.06180763244629, "learning_rate": 1.7887501540719389e-06, "loss": 0.1911, "num_input_tokens_seen": 5396416, "step": 10975 }, { "epoch": 1.4491223439355947, "grad_norm": 0.2622423768043518, "learning_rate": 1.7884668728317531e-06, "loss": 0.0005, "num_input_tokens_seen": 5399232, "step": 10980 }, { "epoch": 1.449782235713343, "grad_norm": 0.028448861092329025, "learning_rate": 1.7881834242451829e-06, "loss": 0.0615, "num_input_tokens_seen": 5401664, "step": 10985 }, { "epoch": 1.4504421274910915, "grad_norm": 0.1454077512025833, "learning_rate": 1.7878998083723883e-06, "loss": 0.001, "num_input_tokens_seen": 5404224, "step": 10990 }, { "epoch": 1.45110201926884, "grad_norm": 26.754520416259766, "learning_rate": 1.7876160252735652e-06, "loss": 0.1332, "num_input_tokens_seen": 5406336, "step": 10995 }, { "epoch": 1.4517619110465882, "grad_norm": 0.21431031823158264, "learning_rate": 1.7873320750089443e-06, "loss": 0.0553, "num_input_tokens_seen": 5408832, "step": 11000 }, { "epoch": 1.4524218028243367, "grad_norm": 0.18584635853767395, "learning_rate": 1.7870479576387916e-06, "loss": 0.0357, "num_input_tokens_seen": 5411136, "step": 11005 }, { "epoch": 1.4530816946020852, "grad_norm": 0.03457614406943321, "learning_rate": 1.7867636732234094e-06, "loss": 0.1594, "num_input_tokens_seen": 5413376, "step": 11010 }, { "epoch": 1.4537415863798338, "grad_norm": 0.2158297598361969, "learning_rate": 1.7864792218231348e-06, "loss": 0.0837, "num_input_tokens_seen": 5415680, "step": 11015 }, { "epoch": 1.4544014781575823, "grad_norm": 0.02059786207973957, "learning_rate": 1.7861946034983406e-06, "loss": 0.066, "num_input_tokens_seen": 5418112, "step": 11020 }, { "epoch": 1.4550613699353305, "grad_norm": 1.214483618736267, "learning_rate": 1.785909818309435e-06, "loss": 0.0883, "num_input_tokens_seen": 5420352, "step": 11025 }, { "epoch": 1.455721261713079, "grad_norm": 40.515533447265625, "learning_rate": 1.7856248663168616e-06, "loss": 0.1528, "num_input_tokens_seen": 5422720, "step": 11030 }, { "epoch": 1.4563811534908275, "grad_norm": 0.14102889597415924, "learning_rate": 1.7853397475810995e-06, "loss": 0.1301, "num_input_tokens_seen": 5425024, "step": 11035 }, { "epoch": 1.457041045268576, "grad_norm": 0.02465779520571232, "learning_rate": 1.7850544621626626e-06, "loss": 0.0646, "num_input_tokens_seen": 5427584, "step": 11040 }, { "epoch": 1.4577009370463245, "grad_norm": 67.14161682128906, "learning_rate": 1.7847690101221011e-06, "loss": 0.2122, "num_input_tokens_seen": 5430400, "step": 11045 }, { "epoch": 1.4583608288240728, "grad_norm": 0.14253701269626617, "learning_rate": 1.7844833915200001e-06, "loss": 0.0005, "num_input_tokens_seen": 5432960, "step": 11050 }, { "epoch": 1.4590207206018213, "grad_norm": 14.538985252380371, "learning_rate": 1.7841976064169803e-06, "loss": 0.0658, "num_input_tokens_seen": 5435520, "step": 11055 }, { "epoch": 1.4596806123795698, "grad_norm": 21.02281951904297, "learning_rate": 1.7839116548736972e-06, "loss": 0.1915, "num_input_tokens_seen": 5438016, "step": 11060 }, { "epoch": 1.460340504157318, "grad_norm": 0.13544169068336487, "learning_rate": 1.7836255369508418e-06, "loss": 0.0019, "num_input_tokens_seen": 5440384, "step": 11065 }, { "epoch": 1.4610003959350666, "grad_norm": 0.4773879945278168, "learning_rate": 1.7833392527091409e-06, "loss": 0.0514, "num_input_tokens_seen": 5443072, "step": 11070 }, { "epoch": 1.461660287712815, "grad_norm": 0.03239217400550842, "learning_rate": 1.7830528022093559e-06, "loss": 0.1296, "num_input_tokens_seen": 5445760, "step": 11075 }, { "epoch": 1.4623201794905636, "grad_norm": 0.09148470312356949, "learning_rate": 1.7827661855122842e-06, "loss": 0.0006, "num_input_tokens_seen": 5448192, "step": 11080 }, { "epoch": 1.462980071268312, "grad_norm": 19.332427978515625, "learning_rate": 1.7824794026787577e-06, "loss": 0.0479, "num_input_tokens_seen": 5450752, "step": 11085 }, { "epoch": 1.4636399630460604, "grad_norm": 31.496519088745117, "learning_rate": 1.7821924537696447e-06, "loss": 0.0038, "num_input_tokens_seen": 5453056, "step": 11090 }, { "epoch": 1.4642998548238089, "grad_norm": 24.9755859375, "learning_rate": 1.7819053388458474e-06, "loss": 0.0636, "num_input_tokens_seen": 5455808, "step": 11095 }, { "epoch": 1.4649597466015574, "grad_norm": 0.020973796024918556, "learning_rate": 1.781618057968304e-06, "loss": 0.1838, "num_input_tokens_seen": 5458240, "step": 11100 }, { "epoch": 1.4656196383793059, "grad_norm": 0.009609062224626541, "learning_rate": 1.7813306111979878e-06, "loss": 0.0005, "num_input_tokens_seen": 5460480, "step": 11105 }, { "epoch": 1.4662795301570544, "grad_norm": 34.70246124267578, "learning_rate": 1.7810429985959075e-06, "loss": 0.1695, "num_input_tokens_seen": 5462592, "step": 11110 }, { "epoch": 1.4669394219348026, "grad_norm": 0.018167927861213684, "learning_rate": 1.7807552202231065e-06, "loss": 0.3442, "num_input_tokens_seen": 5464896, "step": 11115 }, { "epoch": 1.4675993137125511, "grad_norm": 0.22668059170246124, "learning_rate": 1.7804672761406636e-06, "loss": 0.1338, "num_input_tokens_seen": 5467264, "step": 11120 }, { "epoch": 1.4682592054902996, "grad_norm": 0.05633705109357834, "learning_rate": 1.7801791664096933e-06, "loss": 0.0805, "num_input_tokens_seen": 5469696, "step": 11125 }, { "epoch": 1.468919097268048, "grad_norm": 0.2396361529827118, "learning_rate": 1.7798908910913444e-06, "loss": 0.0636, "num_input_tokens_seen": 5472512, "step": 11130 }, { "epoch": 1.4695789890457964, "grad_norm": 14.083260536193848, "learning_rate": 1.7796024502468015e-06, "loss": 0.0954, "num_input_tokens_seen": 5475200, "step": 11135 }, { "epoch": 1.470238880823545, "grad_norm": 15.633596420288086, "learning_rate": 1.7793138439372839e-06, "loss": 0.1199, "num_input_tokens_seen": 5477568, "step": 11140 }, { "epoch": 1.4708987726012934, "grad_norm": 0.11257128417491913, "learning_rate": 1.7790250722240463e-06, "loss": 0.0413, "num_input_tokens_seen": 5480000, "step": 11145 }, { "epoch": 1.471558664379042, "grad_norm": 1.0883387327194214, "learning_rate": 1.7787361351683784e-06, "loss": 0.0579, "num_input_tokens_seen": 5482496, "step": 11150 }, { "epoch": 1.4722185561567902, "grad_norm": 14.952766418457031, "learning_rate": 1.7784470328316048e-06, "loss": 0.1602, "num_input_tokens_seen": 5484928, "step": 11155 }, { "epoch": 1.4728784479345387, "grad_norm": 0.10626795142889023, "learning_rate": 1.7781577652750858e-06, "loss": 0.0452, "num_input_tokens_seen": 5487296, "step": 11160 }, { "epoch": 1.4735383397122872, "grad_norm": 0.1313232183456421, "learning_rate": 1.777868332560216e-06, "loss": 0.0005, "num_input_tokens_seen": 5489856, "step": 11165 }, { "epoch": 1.4741982314900357, "grad_norm": 15.769591331481934, "learning_rate": 1.7775787347484255e-06, "loss": 0.0447, "num_input_tokens_seen": 5492352, "step": 11170 }, { "epoch": 1.4748581232677842, "grad_norm": 0.09785334020853043, "learning_rate": 1.7772889719011793e-06, "loss": 0.0842, "num_input_tokens_seen": 5494912, "step": 11175 }, { "epoch": 1.4755180150455325, "grad_norm": 0.1397729367017746, "learning_rate": 1.7769990440799775e-06, "loss": 0.0015, "num_input_tokens_seen": 5497600, "step": 11180 }, { "epoch": 1.476177906823281, "grad_norm": 30.313491821289062, "learning_rate": 1.7767089513463552e-06, "loss": 0.0355, "num_input_tokens_seen": 5500352, "step": 11185 }, { "epoch": 1.4768377986010295, "grad_norm": 17.191814422607422, "learning_rate": 1.7764186937618826e-06, "loss": 0.1102, "num_input_tokens_seen": 5502592, "step": 11190 }, { "epoch": 1.4774976903787778, "grad_norm": 15.721531867980957, "learning_rate": 1.7761282713881645e-06, "loss": 0.2163, "num_input_tokens_seen": 5505280, "step": 11195 }, { "epoch": 1.4781575821565263, "grad_norm": 0.026539819315075874, "learning_rate": 1.775837684286841e-06, "loss": 0.0617, "num_input_tokens_seen": 5507968, "step": 11200 }, { "epoch": 1.4788174739342748, "grad_norm": 0.09990093111991882, "learning_rate": 1.7755469325195871e-06, "loss": 0.2514, "num_input_tokens_seen": 5510592, "step": 11205 }, { "epoch": 1.4794773657120233, "grad_norm": 0.5558841824531555, "learning_rate": 1.7752560161481131e-06, "loss": 0.054, "num_input_tokens_seen": 5512896, "step": 11210 }, { "epoch": 1.4801372574897718, "grad_norm": 2.259077787399292, "learning_rate": 1.7749649352341636e-06, "loss": 0.0291, "num_input_tokens_seen": 5515584, "step": 11215 }, { "epoch": 1.48079714926752, "grad_norm": 0.06075366958975792, "learning_rate": 1.7746736898395182e-06, "loss": 0.1157, "num_input_tokens_seen": 5517888, "step": 11220 }, { "epoch": 1.4814570410452685, "grad_norm": 1.9688565731048584, "learning_rate": 1.7743822800259923e-06, "loss": 0.1393, "num_input_tokens_seen": 5520320, "step": 11225 }, { "epoch": 1.482116932823017, "grad_norm": 10.639870643615723, "learning_rate": 1.7740907058554348e-06, "loss": 0.1536, "num_input_tokens_seen": 5522624, "step": 11230 }, { "epoch": 1.4827768246007655, "grad_norm": 12.147008895874023, "learning_rate": 1.7737989673897307e-06, "loss": 0.2235, "num_input_tokens_seen": 5524864, "step": 11235 }, { "epoch": 1.483436716378514, "grad_norm": 14.560885429382324, "learning_rate": 1.7735070646907988e-06, "loss": 0.1008, "num_input_tokens_seen": 5527488, "step": 11240 }, { "epoch": 1.4840966081562623, "grad_norm": 14.701653480529785, "learning_rate": 1.773214997820594e-06, "loss": 0.0975, "num_input_tokens_seen": 5529856, "step": 11245 }, { "epoch": 1.4847564999340108, "grad_norm": 3.8046138286590576, "learning_rate": 1.772922766841105e-06, "loss": 0.0849, "num_input_tokens_seen": 5532352, "step": 11250 }, { "epoch": 1.4854163917117593, "grad_norm": 68.81758117675781, "learning_rate": 1.772630371814356e-06, "loss": 0.0448, "num_input_tokens_seen": 5534976, "step": 11255 }, { "epoch": 1.4860762834895076, "grad_norm": 220.65342712402344, "learning_rate": 1.7723378128024056e-06, "loss": 0.0766, "num_input_tokens_seen": 5537408, "step": 11260 }, { "epoch": 1.486736175267256, "grad_norm": 0.06636308878660202, "learning_rate": 1.7720450898673468e-06, "loss": 0.0378, "num_input_tokens_seen": 5540224, "step": 11265 }, { "epoch": 1.4873960670450046, "grad_norm": 0.06799861788749695, "learning_rate": 1.7717522030713088e-06, "loss": 0.2048, "num_input_tokens_seen": 5542784, "step": 11270 }, { "epoch": 1.488055958822753, "grad_norm": 0.1660117208957672, "learning_rate": 1.771459152476454e-06, "loss": 0.0836, "num_input_tokens_seen": 5544896, "step": 11275 }, { "epoch": 1.4887158506005016, "grad_norm": 1.1547398567199707, "learning_rate": 1.7711659381449807e-06, "loss": 0.0574, "num_input_tokens_seen": 5547520, "step": 11280 }, { "epoch": 1.4893757423782499, "grad_norm": 12.23236083984375, "learning_rate": 1.7708725601391214e-06, "loss": 0.1081, "num_input_tokens_seen": 5549952, "step": 11285 }, { "epoch": 1.4900356341559984, "grad_norm": 1.404454231262207, "learning_rate": 1.7705790185211433e-06, "loss": 0.0337, "num_input_tokens_seen": 5552768, "step": 11290 }, { "epoch": 1.4906955259337469, "grad_norm": 0.15022261440753937, "learning_rate": 1.770285313353349e-06, "loss": 0.0916, "num_input_tokens_seen": 5555392, "step": 11295 }, { "epoch": 1.4913554177114954, "grad_norm": 17.854000091552734, "learning_rate": 1.7699914446980745e-06, "loss": 0.1468, "num_input_tokens_seen": 5557760, "step": 11300 }, { "epoch": 1.4920153094892439, "grad_norm": 21.836078643798828, "learning_rate": 1.7696974126176917e-06, "loss": 0.0169, "num_input_tokens_seen": 5560192, "step": 11305 }, { "epoch": 1.4926752012669922, "grad_norm": 0.2405860722064972, "learning_rate": 1.769403217174607e-06, "loss": 0.042, "num_input_tokens_seen": 5562496, "step": 11310 }, { "epoch": 1.4933350930447407, "grad_norm": 0.07342782616615295, "learning_rate": 1.7691088584312608e-06, "loss": 0.128, "num_input_tokens_seen": 5564992, "step": 11315 }, { "epoch": 1.4939949848224892, "grad_norm": 0.11119920760393143, "learning_rate": 1.7688143364501292e-06, "loss": 0.0005, "num_input_tokens_seen": 5567616, "step": 11320 }, { "epoch": 1.4946548766002374, "grad_norm": 0.9778904318809509, "learning_rate": 1.7685196512937217e-06, "loss": 0.001, "num_input_tokens_seen": 5569984, "step": 11325 }, { "epoch": 1.495314768377986, "grad_norm": 0.11220138520002365, "learning_rate": 1.7682248030245836e-06, "loss": 0.1185, "num_input_tokens_seen": 5572160, "step": 11330 }, { "epoch": 1.4959746601557344, "grad_norm": 0.01077987626194954, "learning_rate": 1.7679297917052939e-06, "loss": 0.0006, "num_input_tokens_seen": 5574400, "step": 11335 }, { "epoch": 1.496634551933483, "grad_norm": 0.051964689046144485, "learning_rate": 1.7676346173984669e-06, "loss": 0.0005, "num_input_tokens_seen": 5577088, "step": 11340 }, { "epoch": 1.4972944437112314, "grad_norm": 0.009416039101779461, "learning_rate": 1.7673392801667513e-06, "loss": 0.0004, "num_input_tokens_seen": 5579584, "step": 11345 }, { "epoch": 1.4979543354889797, "grad_norm": 19.469966888427734, "learning_rate": 1.7670437800728298e-06, "loss": 0.2305, "num_input_tokens_seen": 5581952, "step": 11350 }, { "epoch": 1.4986142272667282, "grad_norm": 0.007671706844121218, "learning_rate": 1.7667481171794205e-06, "loss": 0.0002, "num_input_tokens_seen": 5584448, "step": 11355 }, { "epoch": 1.4992741190444767, "grad_norm": 0.07305540889501572, "learning_rate": 1.7664522915492759e-06, "loss": 0.0414, "num_input_tokens_seen": 5587008, "step": 11360 }, { "epoch": 1.4999340108222252, "grad_norm": 0.051261018961668015, "learning_rate": 1.7661563032451827e-06, "loss": 0.0593, "num_input_tokens_seen": 5589312, "step": 11365 }, { "epoch": 1.5005939025999737, "grad_norm": 28.68985366821289, "learning_rate": 1.7658601523299619e-06, "loss": 0.1073, "num_input_tokens_seen": 5591680, "step": 11370 }, { "epoch": 1.5005939025999737, "eval_loss": 0.12565796077251434, "eval_runtime": 7.7594, "eval_samples_per_second": 867.976, "eval_steps_per_second": 108.513, "num_input_tokens_seen": 5591680, "step": 11370 }, { "epoch": 1.501253794377722, "grad_norm": 1.1727688312530518, "learning_rate": 1.7655638388664698e-06, "loss": 0.0011, "num_input_tokens_seen": 5594176, "step": 11375 }, { "epoch": 1.5019136861554705, "grad_norm": 0.05231983959674835, "learning_rate": 1.765267362917597e-06, "loss": 0.0044, "num_input_tokens_seen": 5597056, "step": 11380 }, { "epoch": 1.502573577933219, "grad_norm": 1.1232495307922363, "learning_rate": 1.7649707245462678e-06, "loss": 0.0555, "num_input_tokens_seen": 5599488, "step": 11385 }, { "epoch": 1.5032334697109673, "grad_norm": 0.18011930584907532, "learning_rate": 1.7646739238154416e-06, "loss": 0.0716, "num_input_tokens_seen": 5601856, "step": 11390 }, { "epoch": 1.503893361488716, "grad_norm": 0.16445372998714447, "learning_rate": 1.7643769607881126e-06, "loss": 0.1088, "num_input_tokens_seen": 5604736, "step": 11395 }, { "epoch": 1.5045532532664643, "grad_norm": 0.0443299375474453, "learning_rate": 1.7640798355273087e-06, "loss": 0.0004, "num_input_tokens_seen": 5607168, "step": 11400 }, { "epoch": 1.5052131450442128, "grad_norm": 0.009100513532757759, "learning_rate": 1.7637825480960929e-06, "loss": 0.0018, "num_input_tokens_seen": 5609600, "step": 11405 }, { "epoch": 1.5058730368219613, "grad_norm": 0.5021543502807617, "learning_rate": 1.7634850985575623e-06, "loss": 0.2186, "num_input_tokens_seen": 5612032, "step": 11410 }, { "epoch": 1.5065329285997096, "grad_norm": 0.1777099221944809, "learning_rate": 1.7631874869748477e-06, "loss": 0.0788, "num_input_tokens_seen": 5614656, "step": 11415 }, { "epoch": 1.507192820377458, "grad_norm": 14.347442626953125, "learning_rate": 1.7628897134111163e-06, "loss": 0.128, "num_input_tokens_seen": 5616768, "step": 11420 }, { "epoch": 1.5078527121552066, "grad_norm": 13.774393081665039, "learning_rate": 1.762591777929567e-06, "loss": 0.1947, "num_input_tokens_seen": 5619008, "step": 11425 }, { "epoch": 1.5085126039329548, "grad_norm": 0.05142497643828392, "learning_rate": 1.7622936805934355e-06, "loss": 0.1306, "num_input_tokens_seen": 5621440, "step": 11430 }, { "epoch": 1.5091724957107036, "grad_norm": 0.14459626376628876, "learning_rate": 1.7619954214659901e-06, "loss": 0.0579, "num_input_tokens_seen": 5623872, "step": 11435 }, { "epoch": 1.5098323874884518, "grad_norm": 0.2629461884498596, "learning_rate": 1.7616970006105347e-06, "loss": 0.0584, "num_input_tokens_seen": 5626240, "step": 11440 }, { "epoch": 1.5104922792662003, "grad_norm": 0.13801227509975433, "learning_rate": 1.7613984180904065e-06, "loss": 0.1526, "num_input_tokens_seen": 5628544, "step": 11445 }, { "epoch": 1.5111521710439488, "grad_norm": 0.15773025155067444, "learning_rate": 1.7610996739689779e-06, "loss": 0.116, "num_input_tokens_seen": 5630912, "step": 11450 }, { "epoch": 1.5118120628216971, "grad_norm": 0.0603664331138134, "learning_rate": 1.7608007683096547e-06, "loss": 0.1417, "num_input_tokens_seen": 5633472, "step": 11455 }, { "epoch": 1.5124719545994458, "grad_norm": 0.05194404348731041, "learning_rate": 1.7605017011758778e-06, "loss": 0.1697, "num_input_tokens_seen": 5635712, "step": 11460 }, { "epoch": 1.5131318463771941, "grad_norm": 0.4702714681625366, "learning_rate": 1.7602024726311219e-06, "loss": 0.0017, "num_input_tokens_seen": 5638208, "step": 11465 }, { "epoch": 1.5137917381549426, "grad_norm": 0.3294268250465393, "learning_rate": 1.7599030827388963e-06, "loss": 0.0026, "num_input_tokens_seen": 5640832, "step": 11470 }, { "epoch": 1.5144516299326911, "grad_norm": 17.308738708496094, "learning_rate": 1.7596035315627442e-06, "loss": 0.0513, "num_input_tokens_seen": 5643200, "step": 11475 }, { "epoch": 1.5151115217104394, "grad_norm": 0.08752676844596863, "learning_rate": 1.7593038191662427e-06, "loss": 0.0494, "num_input_tokens_seen": 5645888, "step": 11480 }, { "epoch": 1.515771413488188, "grad_norm": 0.21542641520500183, "learning_rate": 1.7590039456130046e-06, "loss": 0.0006, "num_input_tokens_seen": 5648128, "step": 11485 }, { "epoch": 1.5164313052659364, "grad_norm": 0.0905633419752121, "learning_rate": 1.758703910966675e-06, "loss": 0.0801, "num_input_tokens_seen": 5650624, "step": 11490 }, { "epoch": 1.5170911970436847, "grad_norm": 1.459328532218933, "learning_rate": 1.7584037152909344e-06, "loss": 0.1932, "num_input_tokens_seen": 5653056, "step": 11495 }, { "epoch": 1.5177510888214334, "grad_norm": 0.011277738027274609, "learning_rate": 1.7581033586494973e-06, "loss": 0.0716, "num_input_tokens_seen": 5655552, "step": 11500 }, { "epoch": 1.5184109805991817, "grad_norm": 141.13853454589844, "learning_rate": 1.757802841106112e-06, "loss": 0.1644, "num_input_tokens_seen": 5658112, "step": 11505 }, { "epoch": 1.5190708723769302, "grad_norm": 0.05243814364075661, "learning_rate": 1.7575021627245612e-06, "loss": 0.0823, "num_input_tokens_seen": 5660480, "step": 11510 }, { "epoch": 1.5197307641546787, "grad_norm": 0.09922140091657639, "learning_rate": 1.7572013235686618e-06, "loss": 0.1695, "num_input_tokens_seen": 5662848, "step": 11515 }, { "epoch": 1.520390655932427, "grad_norm": 0.6299600601196289, "learning_rate": 1.7569003237022647e-06, "loss": 0.0332, "num_input_tokens_seen": 5665600, "step": 11520 }, { "epoch": 1.5210505477101757, "grad_norm": 92.7926254272461, "learning_rate": 1.756599163189255e-06, "loss": 0.0245, "num_input_tokens_seen": 5667776, "step": 11525 }, { "epoch": 1.521710439487924, "grad_norm": 11.94028377532959, "learning_rate": 1.7562978420935516e-06, "loss": 0.0693, "num_input_tokens_seen": 5670400, "step": 11530 }, { "epoch": 1.5223703312656724, "grad_norm": 0.1259056031703949, "learning_rate": 1.755996360479108e-06, "loss": 0.0025, "num_input_tokens_seen": 5673152, "step": 11535 }, { "epoch": 1.523030223043421, "grad_norm": 0.020230580121278763, "learning_rate": 1.7556947184099115e-06, "loss": 0.0894, "num_input_tokens_seen": 5675648, "step": 11540 }, { "epoch": 1.5236901148211692, "grad_norm": 0.01828363724052906, "learning_rate": 1.7553929159499832e-06, "loss": 0.0006, "num_input_tokens_seen": 5678144, "step": 11545 }, { "epoch": 1.5243500065989177, "grad_norm": 0.06999734044075012, "learning_rate": 1.755090953163379e-06, "loss": 0.0004, "num_input_tokens_seen": 5680448, "step": 11550 }, { "epoch": 1.5250098983766662, "grad_norm": 1.2670248746871948, "learning_rate": 1.754788830114187e-06, "loss": 0.0007, "num_input_tokens_seen": 5683008, "step": 11555 }, { "epoch": 1.5256697901544147, "grad_norm": 0.021599799394607544, "learning_rate": 1.7544865468665325e-06, "loss": 0.1216, "num_input_tokens_seen": 5685632, "step": 11560 }, { "epoch": 1.5263296819321632, "grad_norm": 27.43296241760254, "learning_rate": 1.7541841034845714e-06, "loss": 0.0914, "num_input_tokens_seen": 5687936, "step": 11565 }, { "epoch": 1.5269895737099115, "grad_norm": 0.059593282639980316, "learning_rate": 1.753881500032496e-06, "loss": 0.0879, "num_input_tokens_seen": 5690560, "step": 11570 }, { "epoch": 1.52764946548766, "grad_norm": 0.017369644716382027, "learning_rate": 1.7535787365745314e-06, "loss": 0.0659, "num_input_tokens_seen": 5692928, "step": 11575 }, { "epoch": 1.5283093572654085, "grad_norm": 137.04595947265625, "learning_rate": 1.7532758131749367e-06, "loss": 0.1278, "num_input_tokens_seen": 5695232, "step": 11580 }, { "epoch": 1.5289692490431568, "grad_norm": 0.39762821793556213, "learning_rate": 1.7529727298980058e-06, "loss": 0.0222, "num_input_tokens_seen": 5697856, "step": 11585 }, { "epoch": 1.5296291408209055, "grad_norm": 0.1500694751739502, "learning_rate": 1.7526694868080654e-06, "loss": 0.1518, "num_input_tokens_seen": 5700544, "step": 11590 }, { "epoch": 1.5302890325986538, "grad_norm": 0.22773054242134094, "learning_rate": 1.752366083969477e-06, "loss": 0.0011, "num_input_tokens_seen": 5702976, "step": 11595 }, { "epoch": 1.5309489243764023, "grad_norm": 0.05045131593942642, "learning_rate": 1.7520625214466352e-06, "loss": 0.0432, "num_input_tokens_seen": 5705600, "step": 11600 }, { "epoch": 1.5316088161541508, "grad_norm": 0.05065304785966873, "learning_rate": 1.7517587993039693e-06, "loss": 0.157, "num_input_tokens_seen": 5707968, "step": 11605 }, { "epoch": 1.532268707931899, "grad_norm": 0.052952587604522705, "learning_rate": 1.751454917605942e-06, "loss": 0.1714, "num_input_tokens_seen": 5710656, "step": 11610 }, { "epoch": 1.5329285997096476, "grad_norm": 15.411102294921875, "learning_rate": 1.7511508764170502e-06, "loss": 0.1965, "num_input_tokens_seen": 5712960, "step": 11615 }, { "epoch": 1.533588491487396, "grad_norm": 0.49912622570991516, "learning_rate": 1.7508466758018243e-06, "loss": 0.1463, "num_input_tokens_seen": 5715456, "step": 11620 }, { "epoch": 1.5342483832651446, "grad_norm": 0.6242772936820984, "learning_rate": 1.7505423158248285e-06, "loss": 0.1403, "num_input_tokens_seen": 5718080, "step": 11625 }, { "epoch": 1.534908275042893, "grad_norm": 11.48921012878418, "learning_rate": 1.750237796550661e-06, "loss": 0.1222, "num_input_tokens_seen": 5720448, "step": 11630 }, { "epoch": 1.5355681668206413, "grad_norm": 0.16093257069587708, "learning_rate": 1.7499331180439545e-06, "loss": 0.004, "num_input_tokens_seen": 5722816, "step": 11635 }, { "epoch": 1.5362280585983898, "grad_norm": 0.8627403974533081, "learning_rate": 1.749628280369374e-06, "loss": 0.0761, "num_input_tokens_seen": 5725184, "step": 11640 }, { "epoch": 1.5368879503761383, "grad_norm": 2.9328296184539795, "learning_rate": 1.7493232835916195e-06, "loss": 0.0645, "num_input_tokens_seen": 5727872, "step": 11645 }, { "epoch": 1.5375478421538866, "grad_norm": 0.4395294785499573, "learning_rate": 1.7490181277754238e-06, "loss": 0.0513, "num_input_tokens_seen": 5730560, "step": 11650 }, { "epoch": 1.5382077339316353, "grad_norm": 0.03519681468605995, "learning_rate": 1.748712812985555e-06, "loss": 0.0013, "num_input_tokens_seen": 5733056, "step": 11655 }, { "epoch": 1.5388676257093836, "grad_norm": 0.04068261757493019, "learning_rate": 1.7484073392868133e-06, "loss": 0.0553, "num_input_tokens_seen": 5735744, "step": 11660 }, { "epoch": 1.5395275174871321, "grad_norm": 117.09519958496094, "learning_rate": 1.7481017067440332e-06, "loss": 0.2724, "num_input_tokens_seen": 5738112, "step": 11665 }, { "epoch": 1.5401874092648806, "grad_norm": 16.728363037109375, "learning_rate": 1.7477959154220834e-06, "loss": 0.0778, "num_input_tokens_seen": 5740480, "step": 11670 }, { "epoch": 1.540847301042629, "grad_norm": 0.05176861584186554, "learning_rate": 1.7474899653858651e-06, "loss": 0.1881, "num_input_tokens_seen": 5742720, "step": 11675 }, { "epoch": 1.5415071928203774, "grad_norm": 0.1164284497499466, "learning_rate": 1.7471838567003153e-06, "loss": 0.0014, "num_input_tokens_seen": 5745088, "step": 11680 }, { "epoch": 1.542167084598126, "grad_norm": 4.228572368621826, "learning_rate": 1.746877589430402e-06, "loss": 0.0763, "num_input_tokens_seen": 5747328, "step": 11685 }, { "epoch": 1.5428269763758744, "grad_norm": 23.04313850402832, "learning_rate": 1.7465711636411288e-06, "loss": 0.1275, "num_input_tokens_seen": 5749952, "step": 11690 }, { "epoch": 1.543486868153623, "grad_norm": 12.308265686035156, "learning_rate": 1.746264579397533e-06, "loss": 0.2444, "num_input_tokens_seen": 5752512, "step": 11695 }, { "epoch": 1.5441467599313712, "grad_norm": 0.12739183008670807, "learning_rate": 1.7459578367646836e-06, "loss": 0.0617, "num_input_tokens_seen": 5755136, "step": 11700 }, { "epoch": 1.5448066517091197, "grad_norm": 26.933069229125977, "learning_rate": 1.7456509358076854e-06, "loss": 0.0409, "num_input_tokens_seen": 5757568, "step": 11705 }, { "epoch": 1.5454665434868682, "grad_norm": 0.07969870418310165, "learning_rate": 1.7453438765916758e-06, "loss": 0.0776, "num_input_tokens_seen": 5760000, "step": 11710 }, { "epoch": 1.5461264352646165, "grad_norm": 0.0692417323589325, "learning_rate": 1.7450366591818255e-06, "loss": 0.0011, "num_input_tokens_seen": 5762496, "step": 11715 }, { "epoch": 1.5467863270423652, "grad_norm": 13.611120223999023, "learning_rate": 1.7447292836433393e-06, "loss": 0.0578, "num_input_tokens_seen": 5764992, "step": 11720 }, { "epoch": 1.5474462188201135, "grad_norm": 0.0573265440762043, "learning_rate": 1.744421750041456e-06, "loss": 0.2353, "num_input_tokens_seen": 5767552, "step": 11725 }, { "epoch": 1.548106110597862, "grad_norm": 0.03308899700641632, "learning_rate": 1.7441140584414466e-06, "loss": 0.0704, "num_input_tokens_seen": 5770496, "step": 11730 }, { "epoch": 1.5487660023756105, "grad_norm": 0.15724174678325653, "learning_rate": 1.7438062089086167e-06, "loss": 0.0033, "num_input_tokens_seen": 5772864, "step": 11735 }, { "epoch": 1.5494258941533587, "grad_norm": 167.92340087890625, "learning_rate": 1.7434982015083056e-06, "loss": 0.13, "num_input_tokens_seen": 5775360, "step": 11740 }, { "epoch": 1.5500857859311075, "grad_norm": 0.12244913727045059, "learning_rate": 1.743190036305885e-06, "loss": 0.0007, "num_input_tokens_seen": 5777728, "step": 11745 }, { "epoch": 1.5507456777088557, "grad_norm": 10.200172424316406, "learning_rate": 1.7428817133667607e-06, "loss": 0.138, "num_input_tokens_seen": 5780160, "step": 11750 }, { "epoch": 1.5514055694866042, "grad_norm": 0.5580622553825378, "learning_rate": 1.7425732327563724e-06, "loss": 0.0649, "num_input_tokens_seen": 5782656, "step": 11755 }, { "epoch": 1.5520654612643527, "grad_norm": 0.1402365267276764, "learning_rate": 1.742264594540193e-06, "loss": 0.0009, "num_input_tokens_seen": 5784832, "step": 11760 }, { "epoch": 1.552725353042101, "grad_norm": 0.8441329002380371, "learning_rate": 1.7419557987837282e-06, "loss": 0.0474, "num_input_tokens_seen": 5787392, "step": 11765 }, { "epoch": 1.5533852448198495, "grad_norm": 0.062263913452625275, "learning_rate": 1.7416468455525179e-06, "loss": 0.0006, "num_input_tokens_seen": 5790080, "step": 11770 }, { "epoch": 1.554045136597598, "grad_norm": 0.06264446675777435, "learning_rate": 1.7413377349121353e-06, "loss": 0.1212, "num_input_tokens_seen": 5792576, "step": 11775 }, { "epoch": 1.5547050283753463, "grad_norm": 37.131507873535156, "learning_rate": 1.7410284669281868e-06, "loss": 0.0603, "num_input_tokens_seen": 5795136, "step": 11780 }, { "epoch": 1.555364920153095, "grad_norm": 23.096406936645508, "learning_rate": 1.7407190416663124e-06, "loss": 0.1844, "num_input_tokens_seen": 5797568, "step": 11785 }, { "epoch": 1.5560248119308433, "grad_norm": 0.10751156508922577, "learning_rate": 1.7404094591921852e-06, "loss": 0.068, "num_input_tokens_seen": 5799808, "step": 11790 }, { "epoch": 1.5566847037085918, "grad_norm": 20.53606605529785, "learning_rate": 1.740099719571512e-06, "loss": 0.0521, "num_input_tokens_seen": 5802240, "step": 11795 }, { "epoch": 1.5573445954863403, "grad_norm": 0.07454346865415573, "learning_rate": 1.7397898228700324e-06, "loss": 0.0007, "num_input_tokens_seen": 5804480, "step": 11800 }, { "epoch": 1.5580044872640886, "grad_norm": 0.377360075712204, "learning_rate": 1.7394797691535203e-06, "loss": 0.1066, "num_input_tokens_seen": 5806912, "step": 11805 }, { "epoch": 1.5586643790418373, "grad_norm": 18.144367218017578, "learning_rate": 1.739169558487782e-06, "loss": 0.0599, "num_input_tokens_seen": 5809152, "step": 11810 }, { "epoch": 1.5593242708195856, "grad_norm": 229.2332000732422, "learning_rate": 1.7388591909386575e-06, "loss": 0.033, "num_input_tokens_seen": 5811712, "step": 11815 }, { "epoch": 1.559984162597334, "grad_norm": 0.1694110631942749, "learning_rate": 1.7385486665720203e-06, "loss": 0.1292, "num_input_tokens_seen": 5814144, "step": 11820 }, { "epoch": 1.5606440543750826, "grad_norm": 0.32974788546562195, "learning_rate": 1.7382379854537767e-06, "loss": 0.0006, "num_input_tokens_seen": 5816512, "step": 11825 }, { "epoch": 1.5613039461528309, "grad_norm": 0.3102016746997833, "learning_rate": 1.7379271476498666e-06, "loss": 0.17, "num_input_tokens_seen": 5819072, "step": 11830 }, { "epoch": 1.5619638379305794, "grad_norm": 16.128957748413086, "learning_rate": 1.737616153226263e-06, "loss": 0.3636, "num_input_tokens_seen": 5821632, "step": 11835 }, { "epoch": 1.5626237297083279, "grad_norm": 0.35809439420700073, "learning_rate": 1.7373050022489722e-06, "loss": 0.0012, "num_input_tokens_seen": 5824000, "step": 11840 }, { "epoch": 1.5632836214860761, "grad_norm": 305.5896301269531, "learning_rate": 1.736993694784034e-06, "loss": 0.1184, "num_input_tokens_seen": 5826304, "step": 11845 }, { "epoch": 1.5639435132638249, "grad_norm": 55.598350524902344, "learning_rate": 1.736682230897521e-06, "loss": 0.0028, "num_input_tokens_seen": 5828608, "step": 11850 }, { "epoch": 1.5646034050415731, "grad_norm": 0.09117922931909561, "learning_rate": 1.7363706106555388e-06, "loss": 0.0607, "num_input_tokens_seen": 5831232, "step": 11855 }, { "epoch": 1.5652632968193216, "grad_norm": 0.2058306783437729, "learning_rate": 1.7360588341242273e-06, "loss": 0.1293, "num_input_tokens_seen": 5833664, "step": 11860 }, { "epoch": 1.5659231885970701, "grad_norm": 20.225988388061523, "learning_rate": 1.7357469013697582e-06, "loss": 0.047, "num_input_tokens_seen": 5835968, "step": 11865 }, { "epoch": 1.5665830803748184, "grad_norm": 0.11420862376689911, "learning_rate": 1.735434812458337e-06, "loss": 0.05, "num_input_tokens_seen": 5838464, "step": 11870 }, { "epoch": 1.5672429721525671, "grad_norm": 0.06262335926294327, "learning_rate": 1.7351225674562023e-06, "loss": 0.0951, "num_input_tokens_seen": 5840768, "step": 11875 }, { "epoch": 1.5679028639303154, "grad_norm": 0.4882711172103882, "learning_rate": 1.7348101664296265e-06, "loss": 0.0417, "num_input_tokens_seen": 5843328, "step": 11880 }, { "epoch": 1.568562755708064, "grad_norm": 25.356002807617188, "learning_rate": 1.7344976094449138e-06, "loss": 0.0526, "num_input_tokens_seen": 5845824, "step": 11885 }, { "epoch": 1.5692226474858124, "grad_norm": 0.4536149799823761, "learning_rate": 1.734184896568402e-06, "loss": 0.0011, "num_input_tokens_seen": 5848000, "step": 11890 }, { "epoch": 1.5698825392635607, "grad_norm": 16.68556785583496, "learning_rate": 1.7338720278664627e-06, "loss": 0.1243, "num_input_tokens_seen": 5850432, "step": 11895 }, { "epoch": 1.5705424310413092, "grad_norm": 0.03905687853693962, "learning_rate": 1.7335590034054997e-06, "loss": 0.0003, "num_input_tokens_seen": 5852800, "step": 11900 }, { "epoch": 1.5712023228190577, "grad_norm": 32.980892181396484, "learning_rate": 1.7332458232519502e-06, "loss": 0.0568, "num_input_tokens_seen": 5855104, "step": 11905 }, { "epoch": 1.571862214596806, "grad_norm": 0.020980341359972954, "learning_rate": 1.7329324874722846e-06, "loss": 0.0006, "num_input_tokens_seen": 5857664, "step": 11910 }, { "epoch": 1.5725221063745547, "grad_norm": 0.1890995353460312, "learning_rate": 1.7326189961330058e-06, "loss": 0.0007, "num_input_tokens_seen": 5859904, "step": 11915 }, { "epoch": 1.573181998152303, "grad_norm": 0.3043166995048523, "learning_rate": 1.7323053493006505e-06, "loss": 0.0809, "num_input_tokens_seen": 5862080, "step": 11920 }, { "epoch": 1.5738418899300515, "grad_norm": 0.31387296319007874, "learning_rate": 1.7319915470417876e-06, "loss": 0.0004, "num_input_tokens_seen": 5864384, "step": 11925 }, { "epoch": 1.5745017817078, "grad_norm": 0.07397419214248657, "learning_rate": 1.7316775894230197e-06, "loss": 0.191, "num_input_tokens_seen": 5866752, "step": 11930 }, { "epoch": 1.5751616734855483, "grad_norm": 0.2072152942419052, "learning_rate": 1.7313634765109816e-06, "loss": 0.0737, "num_input_tokens_seen": 5869248, "step": 11935 }, { "epoch": 1.575821565263297, "grad_norm": 0.016625676304101944, "learning_rate": 1.731049208372342e-06, "loss": 0.0554, "num_input_tokens_seen": 5871872, "step": 11940 }, { "epoch": 1.5764814570410453, "grad_norm": 0.14236846566200256, "learning_rate": 1.7307347850738014e-06, "loss": 0.1984, "num_input_tokens_seen": 5874176, "step": 11945 }, { "epoch": 1.5771413488187938, "grad_norm": 0.07960677891969681, "learning_rate": 1.7304202066820945e-06, "loss": 0.0005, "num_input_tokens_seen": 5876480, "step": 11950 }, { "epoch": 1.5778012405965423, "grad_norm": 0.4623545706272125, "learning_rate": 1.7301054732639882e-06, "loss": 0.0588, "num_input_tokens_seen": 5879104, "step": 11955 }, { "epoch": 1.5784611323742905, "grad_norm": 182.28273010253906, "learning_rate": 1.729790584886282e-06, "loss": 0.1775, "num_input_tokens_seen": 5881856, "step": 11960 }, { "epoch": 1.579121024152039, "grad_norm": 0.05107644945383072, "learning_rate": 1.7294755416158089e-06, "loss": 0.0426, "num_input_tokens_seen": 5884416, "step": 11965 }, { "epoch": 1.5797809159297875, "grad_norm": 0.07986666262149811, "learning_rate": 1.7291603435194344e-06, "loss": 0.1012, "num_input_tokens_seen": 5887040, "step": 11970 }, { "epoch": 1.5804408077075358, "grad_norm": 1.2375479936599731, "learning_rate": 1.7288449906640571e-06, "loss": 0.001, "num_input_tokens_seen": 5889536, "step": 11975 }, { "epoch": 1.5811006994852845, "grad_norm": 0.03987383469939232, "learning_rate": 1.7285294831166087e-06, "loss": 0.1094, "num_input_tokens_seen": 5891712, "step": 11980 }, { "epoch": 1.5817605912630328, "grad_norm": 0.13992907106876373, "learning_rate": 1.728213820944053e-06, "loss": 0.0588, "num_input_tokens_seen": 5894016, "step": 11985 }, { "epoch": 1.5824204830407813, "grad_norm": 0.28587380051612854, "learning_rate": 1.727898004213387e-06, "loss": 0.0014, "num_input_tokens_seen": 5896320, "step": 11990 }, { "epoch": 1.5830803748185298, "grad_norm": 40.24662399291992, "learning_rate": 1.7275820329916408e-06, "loss": 0.1045, "num_input_tokens_seen": 5898880, "step": 11995 }, { "epoch": 1.583740266596278, "grad_norm": 11.798562049865723, "learning_rate": 1.7272659073458766e-06, "loss": 0.1356, "num_input_tokens_seen": 5901632, "step": 12000 }, { "epoch": 1.5844001583740268, "grad_norm": 0.6922625303268433, "learning_rate": 1.7269496273431903e-06, "loss": 0.0449, "num_input_tokens_seen": 5903936, "step": 12005 }, { "epoch": 1.585060050151775, "grad_norm": 0.17535802721977234, "learning_rate": 1.7266331930507097e-06, "loss": 0.0552, "num_input_tokens_seen": 5906176, "step": 12010 }, { "epoch": 1.5857199419295236, "grad_norm": 0.09550785273313522, "learning_rate": 1.7263166045355954e-06, "loss": 0.0456, "num_input_tokens_seen": 5908864, "step": 12015 }, { "epoch": 1.586379833707272, "grad_norm": 0.042160920798778534, "learning_rate": 1.7259998618650418e-06, "loss": 0.0356, "num_input_tokens_seen": 5911424, "step": 12020 }, { "epoch": 1.5870397254850204, "grad_norm": 0.1057129055261612, "learning_rate": 1.7256829651062745e-06, "loss": 0.0003, "num_input_tokens_seen": 5913920, "step": 12025 }, { "epoch": 1.5876996172627689, "grad_norm": 15.842555046081543, "learning_rate": 1.725365914326553e-06, "loss": 0.2519, "num_input_tokens_seen": 5916160, "step": 12030 }, { "epoch": 1.5883595090405174, "grad_norm": 0.04782838001847267, "learning_rate": 1.7250487095931687e-06, "loss": 0.0035, "num_input_tokens_seen": 5918656, "step": 12035 }, { "epoch": 1.5890194008182656, "grad_norm": 0.644459068775177, "learning_rate": 1.7247313509734465e-06, "loss": 0.0008, "num_input_tokens_seen": 5921088, "step": 12040 }, { "epoch": 1.5896792925960144, "grad_norm": 0.4349988102912903, "learning_rate": 1.7244138385347429e-06, "loss": 0.0678, "num_input_tokens_seen": 5923456, "step": 12045 }, { "epoch": 1.5903391843737626, "grad_norm": 0.07780405879020691, "learning_rate": 1.7240961723444479e-06, "loss": 0.0005, "num_input_tokens_seen": 5925888, "step": 12050 }, { "epoch": 1.5909990761515111, "grad_norm": 0.47382500767707825, "learning_rate": 1.7237783524699836e-06, "loss": 0.0975, "num_input_tokens_seen": 5928384, "step": 12055 }, { "epoch": 1.5916589679292596, "grad_norm": 0.043678492307662964, "learning_rate": 1.7234603789788054e-06, "loss": 0.0006, "num_input_tokens_seen": 5930880, "step": 12060 }, { "epoch": 1.592318859707008, "grad_norm": 0.39742064476013184, "learning_rate": 1.7231422519384008e-06, "loss": 0.0004, "num_input_tokens_seen": 5933184, "step": 12065 }, { "epoch": 1.5929787514847566, "grad_norm": 0.02984798699617386, "learning_rate": 1.7228239714162895e-06, "loss": 0.2659, "num_input_tokens_seen": 5935744, "step": 12070 }, { "epoch": 1.593638643262505, "grad_norm": 0.26426124572753906, "learning_rate": 1.7225055374800249e-06, "loss": 0.0666, "num_input_tokens_seen": 5938304, "step": 12075 }, { "epoch": 1.5942985350402534, "grad_norm": 0.2540627717971802, "learning_rate": 1.7221869501971917e-06, "loss": 0.0895, "num_input_tokens_seen": 5940992, "step": 12080 }, { "epoch": 1.594958426818002, "grad_norm": 0.1360429972410202, "learning_rate": 1.721868209635408e-06, "loss": 0.0722, "num_input_tokens_seen": 5943552, "step": 12085 }, { "epoch": 1.5956183185957502, "grad_norm": 0.08838418126106262, "learning_rate": 1.7215493158623242e-06, "loss": 0.0022, "num_input_tokens_seen": 5946176, "step": 12090 }, { "epoch": 1.5962782103734987, "grad_norm": 0.19462290406227112, "learning_rate": 1.7212302689456234e-06, "loss": 0.0013, "num_input_tokens_seen": 5948800, "step": 12095 }, { "epoch": 1.5969381021512472, "grad_norm": 51.29019546508789, "learning_rate": 1.72091106895302e-06, "loss": 0.0489, "num_input_tokens_seen": 5951296, "step": 12100 }, { "epoch": 1.5975979939289955, "grad_norm": 14.03069019317627, "learning_rate": 1.7205917159522635e-06, "loss": 0.0734, "num_input_tokens_seen": 5953600, "step": 12105 }, { "epoch": 1.5982578857067442, "grad_norm": 0.015758154913783073, "learning_rate": 1.7202722100111328e-06, "loss": 0.0671, "num_input_tokens_seen": 5956224, "step": 12110 }, { "epoch": 1.5989177774844925, "grad_norm": 14.346113204956055, "learning_rate": 1.7199525511974417e-06, "loss": 0.1586, "num_input_tokens_seen": 5958656, "step": 12115 }, { "epoch": 1.599577669262241, "grad_norm": 0.01834205538034439, "learning_rate": 1.7196327395790352e-06, "loss": 0.0006, "num_input_tokens_seen": 5961088, "step": 12120 }, { "epoch": 1.6002375610399895, "grad_norm": 13.146003723144531, "learning_rate": 1.7193127752237906e-06, "loss": 0.1002, "num_input_tokens_seen": 5963520, "step": 12125 }, { "epoch": 1.6008974528177378, "grad_norm": 0.1535753607749939, "learning_rate": 1.7189926581996184e-06, "loss": 0.0006, "num_input_tokens_seen": 5965888, "step": 12130 }, { "epoch": 1.6015573445954865, "grad_norm": 2.0366768836975098, "learning_rate": 1.7186723885744609e-06, "loss": 0.0669, "num_input_tokens_seen": 5968064, "step": 12135 }, { "epoch": 1.6022172363732348, "grad_norm": 1.0037025213241577, "learning_rate": 1.7183519664162934e-06, "loss": 0.138, "num_input_tokens_seen": 5970560, "step": 12140 }, { "epoch": 1.6028771281509833, "grad_norm": 0.06006058678030968, "learning_rate": 1.7180313917931226e-06, "loss": 0.1324, "num_input_tokens_seen": 5973248, "step": 12145 }, { "epoch": 1.6035370199287318, "grad_norm": 0.02915399707853794, "learning_rate": 1.717710664772989e-06, "loss": 0.0511, "num_input_tokens_seen": 5975808, "step": 12150 }, { "epoch": 1.60419691170648, "grad_norm": 0.09983984380960464, "learning_rate": 1.7173897854239635e-06, "loss": 0.0011, "num_input_tokens_seen": 5978176, "step": 12155 }, { "epoch": 1.6048568034842285, "grad_norm": 0.39569634199142456, "learning_rate": 1.7170687538141512e-06, "loss": 0.0115, "num_input_tokens_seen": 5980608, "step": 12160 }, { "epoch": 1.605516695261977, "grad_norm": 0.05424243211746216, "learning_rate": 1.7167475700116882e-06, "loss": 0.0712, "num_input_tokens_seen": 5983424, "step": 12165 }, { "epoch": 1.6061765870397253, "grad_norm": 0.008268141187727451, "learning_rate": 1.7164262340847442e-06, "loss": 0.1335, "num_input_tokens_seen": 5985728, "step": 12170 }, { "epoch": 1.606836478817474, "grad_norm": 0.09651245921850204, "learning_rate": 1.71610474610152e-06, "loss": 0.0008, "num_input_tokens_seen": 5988288, "step": 12175 }, { "epoch": 1.6074963705952223, "grad_norm": 0.19663313031196594, "learning_rate": 1.7157831061302485e-06, "loss": 0.0775, "num_input_tokens_seen": 5991232, "step": 12180 }, { "epoch": 1.6081562623729708, "grad_norm": 0.14674024283885956, "learning_rate": 1.7154613142391968e-06, "loss": 0.0946, "num_input_tokens_seen": 5993728, "step": 12185 }, { "epoch": 1.6088161541507193, "grad_norm": 0.055165428668260574, "learning_rate": 1.7151393704966616e-06, "loss": 0.0603, "num_input_tokens_seen": 5996288, "step": 12190 }, { "epoch": 1.6094760459284676, "grad_norm": 0.08553388714790344, "learning_rate": 1.7148172749709736e-06, "loss": 0.1577, "num_input_tokens_seen": 5998848, "step": 12195 }, { "epoch": 1.6101359377062163, "grad_norm": 0.19394369423389435, "learning_rate": 1.7144950277304955e-06, "loss": 0.0006, "num_input_tokens_seen": 6001728, "step": 12200 }, { "epoch": 1.6107958294839646, "grad_norm": 0.3518393337726593, "learning_rate": 1.7141726288436216e-06, "loss": 0.0347, "num_input_tokens_seen": 6004480, "step": 12205 }, { "epoch": 1.611455721261713, "grad_norm": 0.23870500922203064, "learning_rate": 1.713850078378779e-06, "loss": 0.0302, "num_input_tokens_seen": 6006912, "step": 12210 }, { "epoch": 1.6121156130394616, "grad_norm": 0.05858458951115608, "learning_rate": 1.7135273764044262e-06, "loss": 0.1457, "num_input_tokens_seen": 6009344, "step": 12215 }, { "epoch": 1.6127755048172099, "grad_norm": 0.08137746900320053, "learning_rate": 1.7132045229890552e-06, "loss": 0.0659, "num_input_tokens_seen": 6011776, "step": 12220 }, { "epoch": 1.6134353965949584, "grad_norm": 0.02841508761048317, "learning_rate": 1.7128815182011886e-06, "loss": 0.1002, "num_input_tokens_seen": 6014080, "step": 12225 }, { "epoch": 1.6140952883727069, "grad_norm": 20.118602752685547, "learning_rate": 1.7125583621093819e-06, "loss": 0.1825, "num_input_tokens_seen": 6016640, "step": 12230 }, { "epoch": 1.6147551801504552, "grad_norm": 0.10016603022813797, "learning_rate": 1.712235054782223e-06, "loss": 0.0623, "num_input_tokens_seen": 6018816, "step": 12235 }, { "epoch": 1.6154150719282039, "grad_norm": 6.060418128967285, "learning_rate": 1.7119115962883313e-06, "loss": 0.001, "num_input_tokens_seen": 6021312, "step": 12240 }, { "epoch": 1.6160749637059522, "grad_norm": 0.0981973260641098, "learning_rate": 1.7115879866963586e-06, "loss": 0.0006, "num_input_tokens_seen": 6023616, "step": 12245 }, { "epoch": 1.6167348554837007, "grad_norm": 0.24712832272052765, "learning_rate": 1.7112642260749885e-06, "loss": 0.1983, "num_input_tokens_seen": 6026112, "step": 12250 }, { "epoch": 1.6173947472614492, "grad_norm": 0.10543973743915558, "learning_rate": 1.7109403144929369e-06, "loss": 0.0004, "num_input_tokens_seen": 6028544, "step": 12255 }, { "epoch": 1.6180546390391974, "grad_norm": 0.06185751035809517, "learning_rate": 1.7106162520189522e-06, "loss": 0.0435, "num_input_tokens_seen": 6031104, "step": 12260 }, { "epoch": 1.6187145308169462, "grad_norm": 0.056501973420381546, "learning_rate": 1.7102920387218136e-06, "loss": 0.1353, "num_input_tokens_seen": 6033728, "step": 12265 }, { "epoch": 1.6193744225946944, "grad_norm": 11.367016792297363, "learning_rate": 1.7099676746703332e-06, "loss": 0.2136, "num_input_tokens_seen": 6036352, "step": 12270 }, { "epoch": 1.620034314372443, "grad_norm": 12.27001667022705, "learning_rate": 1.7096431599333552e-06, "loss": 0.1958, "num_input_tokens_seen": 6038912, "step": 12275 }, { "epoch": 1.6206942061501914, "grad_norm": 0.5031221508979797, "learning_rate": 1.709318494579755e-06, "loss": 0.0934, "num_input_tokens_seen": 6041472, "step": 12280 }, { "epoch": 1.6213540979279397, "grad_norm": 0.1668480783700943, "learning_rate": 1.7089936786784414e-06, "loss": 0.0024, "num_input_tokens_seen": 6043584, "step": 12285 }, { "epoch": 1.6220139897056882, "grad_norm": 0.12070733308792114, "learning_rate": 1.708668712298353e-06, "loss": 0.059, "num_input_tokens_seen": 6046144, "step": 12290 }, { "epoch": 1.6226738814834367, "grad_norm": 0.07423515617847443, "learning_rate": 1.7083435955084627e-06, "loss": 0.0014, "num_input_tokens_seen": 6048512, "step": 12295 }, { "epoch": 1.6233337732611852, "grad_norm": 0.13812686502933502, "learning_rate": 1.7080183283777733e-06, "loss": 0.0526, "num_input_tokens_seen": 6050560, "step": 12300 }, { "epoch": 1.6239936650389337, "grad_norm": 0.06522795557975769, "learning_rate": 1.707692910975321e-06, "loss": 0.0014, "num_input_tokens_seen": 6053056, "step": 12305 }, { "epoch": 1.624653556816682, "grad_norm": 0.1586804836988449, "learning_rate": 1.7073673433701733e-06, "loss": 0.0247, "num_input_tokens_seen": 6055616, "step": 12310 }, { "epoch": 1.6253134485944305, "grad_norm": 33.24040603637695, "learning_rate": 1.7070416256314286e-06, "loss": 0.0355, "num_input_tokens_seen": 6057728, "step": 12315 }, { "epoch": 1.625973340372179, "grad_norm": 0.3471306264400482, "learning_rate": 1.7067157578282195e-06, "loss": 0.0661, "num_input_tokens_seen": 6060288, "step": 12320 }, { "epoch": 1.6266332321499273, "grad_norm": 31.32708740234375, "learning_rate": 1.7063897400297083e-06, "loss": 0.0912, "num_input_tokens_seen": 6062656, "step": 12325 }, { "epoch": 1.627293123927676, "grad_norm": 0.024469556286931038, "learning_rate": 1.7060635723050899e-06, "loss": 0.0712, "num_input_tokens_seen": 6065088, "step": 12330 }, { "epoch": 1.6279530157054243, "grad_norm": 0.04165755584836006, "learning_rate": 1.705737254723591e-06, "loss": 0.0004, "num_input_tokens_seen": 6067392, "step": 12335 }, { "epoch": 1.6286129074831728, "grad_norm": 12.173465728759766, "learning_rate": 1.7054107873544704e-06, "loss": 0.1991, "num_input_tokens_seen": 6069760, "step": 12340 }, { "epoch": 1.6292727992609213, "grad_norm": 0.069599948823452, "learning_rate": 1.7050841702670188e-06, "loss": 0.0003, "num_input_tokens_seen": 6072256, "step": 12345 }, { "epoch": 1.6299326910386696, "grad_norm": 0.08773674815893173, "learning_rate": 1.7047574035305576e-06, "loss": 0.0482, "num_input_tokens_seen": 6074688, "step": 12350 }, { "epoch": 1.630592582816418, "grad_norm": 0.009780521504580975, "learning_rate": 1.704430487214441e-06, "loss": 0.0002, "num_input_tokens_seen": 6077184, "step": 12355 }, { "epoch": 1.6312524745941666, "grad_norm": 1.1269195079803467, "learning_rate": 1.7041034213880545e-06, "loss": 0.001, "num_input_tokens_seen": 6079424, "step": 12360 }, { "epoch": 1.631912366371915, "grad_norm": 0.014932355843484402, "learning_rate": 1.7037762061208157e-06, "loss": 0.0385, "num_input_tokens_seen": 6081920, "step": 12365 }, { "epoch": 1.6325722581496636, "grad_norm": 0.028511904180049896, "learning_rate": 1.7034488414821734e-06, "loss": 0.115, "num_input_tokens_seen": 6084352, "step": 12370 }, { "epoch": 1.6332321499274118, "grad_norm": 0.017234528437256813, "learning_rate": 1.7031213275416083e-06, "loss": 0.0007, "num_input_tokens_seen": 6086848, "step": 12375 }, { "epoch": 1.6338920417051603, "grad_norm": 37.32754898071289, "learning_rate": 1.702793664368633e-06, "loss": 0.1038, "num_input_tokens_seen": 6089344, "step": 12380 }, { "epoch": 1.6345519334829088, "grad_norm": 0.03338843956589699, "learning_rate": 1.702465852032792e-06, "loss": 0.0722, "num_input_tokens_seen": 6091840, "step": 12385 }, { "epoch": 1.6352118252606571, "grad_norm": 0.08052528649568558, "learning_rate": 1.7021378906036607e-06, "loss": 0.0003, "num_input_tokens_seen": 6094144, "step": 12390 }, { "epoch": 1.6358717170384058, "grad_norm": 0.13955703377723694, "learning_rate": 1.7018097801508467e-06, "loss": 0.0003, "num_input_tokens_seen": 6096448, "step": 12395 }, { "epoch": 1.6365316088161541, "grad_norm": 0.022091159597039223, "learning_rate": 1.7014815207439884e-06, "loss": 0.1521, "num_input_tokens_seen": 6098816, "step": 12400 }, { "epoch": 1.6371915005939026, "grad_norm": 0.06733556091785431, "learning_rate": 1.7011531124527578e-06, "loss": 0.1457, "num_input_tokens_seen": 6101312, "step": 12405 }, { "epoch": 1.6378513923716511, "grad_norm": 0.060561731457710266, "learning_rate": 1.7008245553468559e-06, "loss": 0.0004, "num_input_tokens_seen": 6103936, "step": 12410 }, { "epoch": 1.6385112841493994, "grad_norm": 0.8273510932922363, "learning_rate": 1.7004958494960173e-06, "loss": 0.0008, "num_input_tokens_seen": 6106624, "step": 12415 }, { "epoch": 1.639171175927148, "grad_norm": 13.756531715393066, "learning_rate": 1.7001669949700073e-06, "loss": 0.0675, "num_input_tokens_seen": 6109440, "step": 12420 }, { "epoch": 1.6398310677048964, "grad_norm": 0.07977482676506042, "learning_rate": 1.6998379918386228e-06, "loss": 0.0088, "num_input_tokens_seen": 6111680, "step": 12425 }, { "epoch": 1.640490959482645, "grad_norm": 20.452110290527344, "learning_rate": 1.6995088401716922e-06, "loss": 0.1804, "num_input_tokens_seen": 6114112, "step": 12430 }, { "epoch": 1.6411508512603934, "grad_norm": 19.552038192749023, "learning_rate": 1.6991795400390756e-06, "loss": 0.1808, "num_input_tokens_seen": 6116544, "step": 12435 }, { "epoch": 1.6418107430381417, "grad_norm": 0.010304759256541729, "learning_rate": 1.698850091510665e-06, "loss": 0.001, "num_input_tokens_seen": 6119104, "step": 12440 }, { "epoch": 1.6424706348158902, "grad_norm": 20.534870147705078, "learning_rate": 1.6985204946563831e-06, "loss": 0.1505, "num_input_tokens_seen": 6121408, "step": 12445 }, { "epoch": 1.6431305265936387, "grad_norm": 11.33122444152832, "learning_rate": 1.6981907495461845e-06, "loss": 0.0506, "num_input_tokens_seen": 6124096, "step": 12450 }, { "epoch": 1.643790418371387, "grad_norm": 0.34447240829467773, "learning_rate": 1.697860856250055e-06, "loss": 0.001, "num_input_tokens_seen": 6126208, "step": 12455 }, { "epoch": 1.6444503101491357, "grad_norm": 25.78012466430664, "learning_rate": 1.6975308148380125e-06, "loss": 0.1091, "num_input_tokens_seen": 6128704, "step": 12460 }, { "epoch": 1.645110201926884, "grad_norm": 1.1186546087265015, "learning_rate": 1.6972006253801055e-06, "loss": 0.0021, "num_input_tokens_seen": 6130944, "step": 12465 }, { "epoch": 1.6457700937046325, "grad_norm": 391.4671630859375, "learning_rate": 1.6968702879464148e-06, "loss": 0.1985, "num_input_tokens_seen": 6133248, "step": 12470 }, { "epoch": 1.646429985482381, "grad_norm": 20.856117248535156, "learning_rate": 1.6965398026070517e-06, "loss": 0.2054, "num_input_tokens_seen": 6135744, "step": 12475 }, { "epoch": 1.6470898772601292, "grad_norm": 16.320039749145508, "learning_rate": 1.6962091694321595e-06, "loss": 0.0839, "num_input_tokens_seen": 6138112, "step": 12480 }, { "epoch": 1.647749769037878, "grad_norm": 0.07969659566879272, "learning_rate": 1.6958783884919124e-06, "loss": 0.1114, "num_input_tokens_seen": 6140608, "step": 12485 }, { "epoch": 1.6484096608156262, "grad_norm": 0.07572974264621735, "learning_rate": 1.6955474598565163e-06, "loss": 0.0034, "num_input_tokens_seen": 6143104, "step": 12490 }, { "epoch": 1.6490695525933747, "grad_norm": 0.6827272176742554, "learning_rate": 1.695216383596209e-06, "loss": 0.0456, "num_input_tokens_seen": 6145536, "step": 12495 }, { "epoch": 1.6497294443711232, "grad_norm": 0.06942526996135712, "learning_rate": 1.6948851597812586e-06, "loss": 0.0014, "num_input_tokens_seen": 6148096, "step": 12500 }, { "epoch": 1.6503893361488715, "grad_norm": 0.541210949420929, "learning_rate": 1.694553788481965e-06, "loss": 0.1411, "num_input_tokens_seen": 6150976, "step": 12505 }, { "epoch": 1.65104922792662, "grad_norm": 103.4432601928711, "learning_rate": 1.6942222697686593e-06, "loss": 0.1109, "num_input_tokens_seen": 6153408, "step": 12510 }, { "epoch": 1.6517091197043685, "grad_norm": 0.04951288178563118, "learning_rate": 1.6938906037117039e-06, "loss": 0.0007, "num_input_tokens_seen": 6156032, "step": 12515 }, { "epoch": 1.6523690114821168, "grad_norm": 0.344783216714859, "learning_rate": 1.6935587903814926e-06, "loss": 0.003, "num_input_tokens_seen": 6158784, "step": 12520 }, { "epoch": 1.6530289032598655, "grad_norm": 0.008561319671571255, "learning_rate": 1.6932268298484508e-06, "loss": 0.0005, "num_input_tokens_seen": 6161408, "step": 12525 }, { "epoch": 1.6536887950376138, "grad_norm": 16.31315803527832, "learning_rate": 1.692894722183034e-06, "loss": 0.0756, "num_input_tokens_seen": 6163712, "step": 12530 }, { "epoch": 1.6543486868153623, "grad_norm": 0.023284809663891792, "learning_rate": 1.6925624674557298e-06, "loss": 0.0003, "num_input_tokens_seen": 6165952, "step": 12535 }, { "epoch": 1.6550085785931108, "grad_norm": 0.14583520591259003, "learning_rate": 1.6922300657370573e-06, "loss": 0.0918, "num_input_tokens_seen": 6168192, "step": 12540 }, { "epoch": 1.655668470370859, "grad_norm": 0.03759438171982765, "learning_rate": 1.691897517097566e-06, "loss": 0.0002, "num_input_tokens_seen": 6170496, "step": 12545 }, { "epoch": 1.6563283621486078, "grad_norm": 0.370606392621994, "learning_rate": 1.6915648216078374e-06, "loss": 0.0005, "num_input_tokens_seen": 6173184, "step": 12550 }, { "epoch": 1.656988253926356, "grad_norm": 0.037230778485536575, "learning_rate": 1.691231979338483e-06, "loss": 0.0884, "num_input_tokens_seen": 6175552, "step": 12555 }, { "epoch": 1.6576481457041046, "grad_norm": 0.02874135971069336, "learning_rate": 1.690898990360146e-06, "loss": 0.1239, "num_input_tokens_seen": 6178048, "step": 12560 }, { "epoch": 1.658308037481853, "grad_norm": 0.015778981149196625, "learning_rate": 1.690565854743502e-06, "loss": 0.1286, "num_input_tokens_seen": 6180544, "step": 12565 }, { "epoch": 1.6589679292596013, "grad_norm": 0.028208622708916664, "learning_rate": 1.690232572559256e-06, "loss": 0.0003, "num_input_tokens_seen": 6183040, "step": 12570 }, { "epoch": 1.6596278210373498, "grad_norm": 0.011002824641764164, "learning_rate": 1.6898991438781445e-06, "loss": 0.1679, "num_input_tokens_seen": 6185984, "step": 12575 }, { "epoch": 1.6602877128150983, "grad_norm": 0.16241048276424408, "learning_rate": 1.6895655687709356e-06, "loss": 0.0464, "num_input_tokens_seen": 6188480, "step": 12580 }, { "epoch": 1.6609476045928466, "grad_norm": 97.199951171875, "learning_rate": 1.6892318473084283e-06, "loss": 0.071, "num_input_tokens_seen": 6191104, "step": 12585 }, { "epoch": 1.6616074963705953, "grad_norm": 0.041437387466430664, "learning_rate": 1.6888979795614524e-06, "loss": 0.404, "num_input_tokens_seen": 6193664, "step": 12590 }, { "epoch": 1.6622673881483436, "grad_norm": 0.7268674373626709, "learning_rate": 1.688563965600869e-06, "loss": 0.1, "num_input_tokens_seen": 6196480, "step": 12595 }, { "epoch": 1.6629272799260921, "grad_norm": 11.610241889953613, "learning_rate": 1.68822980549757e-06, "loss": 0.1222, "num_input_tokens_seen": 6199104, "step": 12600 }, { "epoch": 1.6635871717038406, "grad_norm": 0.09228473156690598, "learning_rate": 1.6878954993224786e-06, "loss": 0.0012, "num_input_tokens_seen": 6201280, "step": 12605 }, { "epoch": 1.664247063481589, "grad_norm": 0.27008455991744995, "learning_rate": 1.687561047146549e-06, "loss": 0.0029, "num_input_tokens_seen": 6203776, "step": 12610 }, { "epoch": 1.6649069552593376, "grad_norm": 0.06659281998872757, "learning_rate": 1.6872264490407656e-06, "loss": 0.1658, "num_input_tokens_seen": 6206464, "step": 12615 }, { "epoch": 1.665566847037086, "grad_norm": 0.6453077793121338, "learning_rate": 1.686891705076145e-06, "loss": 0.003, "num_input_tokens_seen": 6208896, "step": 12620 }, { "epoch": 1.6662267388148344, "grad_norm": 0.07216670364141464, "learning_rate": 1.6865568153237343e-06, "loss": 0.061, "num_input_tokens_seen": 6211136, "step": 12625 }, { "epoch": 1.666886630592583, "grad_norm": 30.78990936279297, "learning_rate": 1.6862217798546115e-06, "loss": 0.0895, "num_input_tokens_seen": 6213568, "step": 12630 }, { "epoch": 1.6675465223703312, "grad_norm": 18.963054656982422, "learning_rate": 1.6858865987398847e-06, "loss": 0.2668, "num_input_tokens_seen": 6215936, "step": 12635 }, { "epoch": 1.6682064141480797, "grad_norm": 0.05866992846131325, "learning_rate": 1.6855512720506941e-06, "loss": 0.0006, "num_input_tokens_seen": 6218560, "step": 12640 }, { "epoch": 1.6688663059258282, "grad_norm": 0.7241128087043762, "learning_rate": 1.6852157998582106e-06, "loss": 0.1121, "num_input_tokens_seen": 6221184, "step": 12645 }, { "epoch": 1.6695261977035765, "grad_norm": 0.14978912472724915, "learning_rate": 1.6848801822336355e-06, "loss": 0.0836, "num_input_tokens_seen": 6223552, "step": 12650 }, { "epoch": 1.6701860894813252, "grad_norm": 0.06732896715402603, "learning_rate": 1.684544419248201e-06, "loss": 0.0497, "num_input_tokens_seen": 6226304, "step": 12655 }, { "epoch": 1.6708459812590735, "grad_norm": 0.030299250036478043, "learning_rate": 1.6842085109731708e-06, "loss": 0.1245, "num_input_tokens_seen": 6228864, "step": 12660 }, { "epoch": 1.671505873036822, "grad_norm": 0.1021413654088974, "learning_rate": 1.6838724574798387e-06, "loss": 0.0763, "num_input_tokens_seen": 6231552, "step": 12665 }, { "epoch": 1.6721657648145705, "grad_norm": 0.10425157845020294, "learning_rate": 1.6835362588395298e-06, "loss": 0.0017, "num_input_tokens_seen": 6233856, "step": 12670 }, { "epoch": 1.6728256565923187, "grad_norm": 0.1030372902750969, "learning_rate": 1.6831999151235995e-06, "loss": 0.0827, "num_input_tokens_seen": 6236928, "step": 12675 }, { "epoch": 1.6734855483700675, "grad_norm": 19.882171630859375, "learning_rate": 1.682863426403435e-06, "loss": 0.0704, "num_input_tokens_seen": 6239552, "step": 12680 }, { "epoch": 1.6741454401478157, "grad_norm": 108.14405822753906, "learning_rate": 1.682526792750453e-06, "loss": 0.0288, "num_input_tokens_seen": 6241920, "step": 12685 }, { "epoch": 1.6748053319255642, "grad_norm": 0.06011539697647095, "learning_rate": 1.6821900142361015e-06, "loss": 0.2077, "num_input_tokens_seen": 6244160, "step": 12690 }, { "epoch": 1.6754652237033127, "grad_norm": 13.853163719177246, "learning_rate": 1.6818530909318595e-06, "loss": 0.2284, "num_input_tokens_seen": 6246720, "step": 12695 }, { "epoch": 1.676125115481061, "grad_norm": 14.427210807800293, "learning_rate": 1.6815160229092367e-06, "loss": 0.0043, "num_input_tokens_seen": 6249088, "step": 12700 }, { "epoch": 1.6767850072588095, "grad_norm": 13.329300880432129, "learning_rate": 1.6811788102397733e-06, "loss": 0.2497, "num_input_tokens_seen": 6251456, "step": 12705 }, { "epoch": 1.677444899036558, "grad_norm": 0.5812211036682129, "learning_rate": 1.68084145299504e-06, "loss": 0.0581, "num_input_tokens_seen": 6254272, "step": 12710 }, { "epoch": 1.6781047908143063, "grad_norm": 0.2836822271347046, "learning_rate": 1.6805039512466385e-06, "loss": 0.0853, "num_input_tokens_seen": 6256704, "step": 12715 }, { "epoch": 1.678764682592055, "grad_norm": 0.2207585871219635, "learning_rate": 1.6801663050662012e-06, "loss": 0.0476, "num_input_tokens_seen": 6259072, "step": 12720 }, { "epoch": 1.6794245743698033, "grad_norm": 0.47076278924942017, "learning_rate": 1.6798285145253907e-06, "loss": 0.0021, "num_input_tokens_seen": 6261632, "step": 12725 }, { "epoch": 1.6800844661475518, "grad_norm": 0.46993520855903625, "learning_rate": 1.6794905796959017e-06, "loss": 0.047, "num_input_tokens_seen": 6264192, "step": 12730 }, { "epoch": 1.6807443579253003, "grad_norm": 0.058821726590394974, "learning_rate": 1.6791525006494572e-06, "loss": 0.0419, "num_input_tokens_seen": 6266624, "step": 12735 }, { "epoch": 1.6814042497030486, "grad_norm": 0.04777168855071068, "learning_rate": 1.6788142774578126e-06, "loss": 0.3103, "num_input_tokens_seen": 6269056, "step": 12740 }, { "epoch": 1.6820641414807973, "grad_norm": 0.05101204663515091, "learning_rate": 1.678475910192753e-06, "loss": 0.0926, "num_input_tokens_seen": 6271296, "step": 12745 }, { "epoch": 1.6827240332585456, "grad_norm": 11.261943817138672, "learning_rate": 1.6781373989260948e-06, "loss": 0.2126, "num_input_tokens_seen": 6273600, "step": 12750 }, { "epoch": 1.683383925036294, "grad_norm": 23.232006072998047, "learning_rate": 1.6777987437296842e-06, "loss": 0.0928, "num_input_tokens_seen": 6276160, "step": 12755 }, { "epoch": 1.6840438168140426, "grad_norm": 0.16996802389621735, "learning_rate": 1.6774599446753984e-06, "loss": 0.0657, "num_input_tokens_seen": 6278720, "step": 12760 }, { "epoch": 1.6847037085917909, "grad_norm": 0.07030691206455231, "learning_rate": 1.6771210018351453e-06, "loss": 0.2448, "num_input_tokens_seen": 6281152, "step": 12765 }, { "epoch": 1.6853636003695394, "grad_norm": 0.1533779799938202, "learning_rate": 1.6767819152808627e-06, "loss": 0.0394, "num_input_tokens_seen": 6283392, "step": 12770 }, { "epoch": 1.6860234921472879, "grad_norm": 16.754724502563477, "learning_rate": 1.6764426850845194e-06, "loss": 0.064, "num_input_tokens_seen": 6285504, "step": 12775 }, { "epoch": 1.6866833839250361, "grad_norm": 5.0514678955078125, "learning_rate": 1.676103311318115e-06, "loss": 0.306, "num_input_tokens_seen": 6287936, "step": 12780 }, { "epoch": 1.6873432757027849, "grad_norm": 1.2993375062942505, "learning_rate": 1.6757637940536787e-06, "loss": 0.0701, "num_input_tokens_seen": 6290496, "step": 12785 }, { "epoch": 1.6880031674805331, "grad_norm": 0.2400226593017578, "learning_rate": 1.6754241333632705e-06, "loss": 0.0025, "num_input_tokens_seen": 6293056, "step": 12790 }, { "epoch": 1.6886630592582816, "grad_norm": 0.1520133912563324, "learning_rate": 1.6750843293189806e-06, "loss": 0.0408, "num_input_tokens_seen": 6295488, "step": 12795 }, { "epoch": 1.6893229510360301, "grad_norm": 0.32761409878730774, "learning_rate": 1.674744381992931e-06, "loss": 0.0907, "num_input_tokens_seen": 6297856, "step": 12800 }, { "epoch": 1.6899828428137784, "grad_norm": 1.6226130723953247, "learning_rate": 1.674404291457272e-06, "loss": 0.18, "num_input_tokens_seen": 6300160, "step": 12805 }, { "epoch": 1.6906427345915271, "grad_norm": 13.813915252685547, "learning_rate": 1.6740640577841862e-06, "loss": 0.1037, "num_input_tokens_seen": 6302976, "step": 12810 }, { "epoch": 1.6913026263692754, "grad_norm": 0.11801814287900925, "learning_rate": 1.673723681045885e-06, "loss": 0.003, "num_input_tokens_seen": 6305408, "step": 12815 }, { "epoch": 1.691962518147024, "grad_norm": 0.07843796163797379, "learning_rate": 1.6733831613146113e-06, "loss": 0.001, "num_input_tokens_seen": 6308160, "step": 12820 }, { "epoch": 1.6926224099247724, "grad_norm": 45.090911865234375, "learning_rate": 1.673042498662638e-06, "loss": 0.101, "num_input_tokens_seen": 6310656, "step": 12825 }, { "epoch": 1.6932823017025207, "grad_norm": 0.052833192050457, "learning_rate": 1.672701693162268e-06, "loss": 0.0167, "num_input_tokens_seen": 6313024, "step": 12830 }, { "epoch": 1.6939421934802692, "grad_norm": 0.3140457570552826, "learning_rate": 1.672360744885835e-06, "loss": 0.0606, "num_input_tokens_seen": 6315584, "step": 12835 }, { "epoch": 1.6946020852580177, "grad_norm": 0.043698202818632126, "learning_rate": 1.6720196539057025e-06, "loss": 0.0008, "num_input_tokens_seen": 6318016, "step": 12840 }, { "epoch": 1.695261977035766, "grad_norm": 24.684247970581055, "learning_rate": 1.671678420294265e-06, "loss": 0.2314, "num_input_tokens_seen": 6320896, "step": 12845 }, { "epoch": 1.6959218688135147, "grad_norm": 17.751066207885742, "learning_rate": 1.6713370441239469e-06, "loss": 0.0503, "num_input_tokens_seen": 6323328, "step": 12850 }, { "epoch": 1.696581760591263, "grad_norm": 0.4494774043560028, "learning_rate": 1.6709955254672026e-06, "loss": 0.1382, "num_input_tokens_seen": 6325760, "step": 12855 }, { "epoch": 1.6972416523690115, "grad_norm": 0.04193054139614105, "learning_rate": 1.670653864396517e-06, "loss": 0.066, "num_input_tokens_seen": 6328256, "step": 12860 }, { "epoch": 1.69790154414676, "grad_norm": 0.06418413668870926, "learning_rate": 1.670312060984405e-06, "loss": 0.1217, "num_input_tokens_seen": 6330816, "step": 12865 }, { "epoch": 1.6985614359245083, "grad_norm": 14.279733657836914, "learning_rate": 1.669970115303412e-06, "loss": 0.1156, "num_input_tokens_seen": 6333184, "step": 12870 }, { "epoch": 1.699221327702257, "grad_norm": 0.7062100768089294, "learning_rate": 1.6696280274261137e-06, "loss": 0.0019, "num_input_tokens_seen": 6335744, "step": 12875 }, { "epoch": 1.6998812194800053, "grad_norm": 15.306600570678711, "learning_rate": 1.6692857974251156e-06, "loss": 0.1754, "num_input_tokens_seen": 6338240, "step": 12880 }, { "epoch": 1.7005411112577538, "grad_norm": 0.228725865483284, "learning_rate": 1.668943425373054e-06, "loss": 0.0014, "num_input_tokens_seen": 6340672, "step": 12885 }, { "epoch": 1.7012010030355023, "grad_norm": 0.060659151524305344, "learning_rate": 1.668600911342594e-06, "loss": 0.0742, "num_input_tokens_seen": 6343104, "step": 12890 }, { "epoch": 1.7018608948132505, "grad_norm": 0.029327716678380966, "learning_rate": 1.668258255406432e-06, "loss": 0.0573, "num_input_tokens_seen": 6345856, "step": 12895 }, { "epoch": 1.702520786590999, "grad_norm": 433.2555236816406, "learning_rate": 1.6679154576372949e-06, "loss": 0.1826, "num_input_tokens_seen": 6348224, "step": 12900 }, { "epoch": 1.7031806783687475, "grad_norm": 3.740283489227295, "learning_rate": 1.6675725181079384e-06, "loss": 0.1554, "num_input_tokens_seen": 6350784, "step": 12905 }, { "epoch": 1.7038405701464958, "grad_norm": 0.37366318702697754, "learning_rate": 1.6672294368911493e-06, "loss": 0.0764, "num_input_tokens_seen": 6353344, "step": 12910 }, { "epoch": 1.7045004619242445, "grad_norm": 23.09748077392578, "learning_rate": 1.6668862140597434e-06, "loss": 0.0498, "num_input_tokens_seen": 6355584, "step": 12915 }, { "epoch": 1.7051603537019928, "grad_norm": 0.9731279611587524, "learning_rate": 1.6665428496865684e-06, "loss": 0.0021, "num_input_tokens_seen": 6358208, "step": 12920 }, { "epoch": 1.7058202454797413, "grad_norm": 0.08989608287811279, "learning_rate": 1.6661993438445e-06, "loss": 0.1555, "num_input_tokens_seen": 6360576, "step": 12925 }, { "epoch": 1.7064801372574898, "grad_norm": 11.549619674682617, "learning_rate": 1.665855696606445e-06, "loss": 0.0649, "num_input_tokens_seen": 6362944, "step": 12930 }, { "epoch": 1.707140029035238, "grad_norm": 14.804794311523438, "learning_rate": 1.6655119080453402e-06, "loss": 0.1484, "num_input_tokens_seen": 6365440, "step": 12935 }, { "epoch": 1.7077999208129868, "grad_norm": 0.05886728689074516, "learning_rate": 1.6651679782341524e-06, "loss": 0.1357, "num_input_tokens_seen": 6367808, "step": 12940 }, { "epoch": 1.708459812590735, "grad_norm": 8.854753494262695, "learning_rate": 1.6648239072458777e-06, "loss": 0.1377, "num_input_tokens_seen": 6370304, "step": 12945 }, { "epoch": 1.7091197043684836, "grad_norm": 1.3267241716384888, "learning_rate": 1.6644796951535432e-06, "loss": 0.0023, "num_input_tokens_seen": 6372544, "step": 12950 }, { "epoch": 1.709779596146232, "grad_norm": 0.04999571666121483, "learning_rate": 1.664135342030205e-06, "loss": 0.0224, "num_input_tokens_seen": 6375232, "step": 12955 }, { "epoch": 1.7104394879239804, "grad_norm": 0.20558416843414307, "learning_rate": 1.6637908479489496e-06, "loss": 0.0619, "num_input_tokens_seen": 6377664, "step": 12960 }, { "epoch": 1.7110993797017289, "grad_norm": 14.066550254821777, "learning_rate": 1.6634462129828938e-06, "loss": 0.1494, "num_input_tokens_seen": 6380032, "step": 12965 }, { "epoch": 1.7117592714794774, "grad_norm": 0.4319327771663666, "learning_rate": 1.6631014372051836e-06, "loss": 0.0607, "num_input_tokens_seen": 6382464, "step": 12970 }, { "epoch": 1.7124191632572257, "grad_norm": 0.3865269124507904, "learning_rate": 1.6627565206889953e-06, "loss": 0.1611, "num_input_tokens_seen": 6384512, "step": 12975 }, { "epoch": 1.7130790550349744, "grad_norm": 0.09151875227689743, "learning_rate": 1.6624114635075344e-06, "loss": 0.0027, "num_input_tokens_seen": 6387072, "step": 12980 }, { "epoch": 1.7137389468127227, "grad_norm": 0.26752570271492004, "learning_rate": 1.6620662657340371e-06, "loss": 0.0497, "num_input_tokens_seen": 6389696, "step": 12985 }, { "epoch": 1.7143988385904712, "grad_norm": 0.33701422810554504, "learning_rate": 1.66172092744177e-06, "loss": 0.0436, "num_input_tokens_seen": 6392064, "step": 12990 }, { "epoch": 1.7150587303682197, "grad_norm": 0.3485437035560608, "learning_rate": 1.661375448704027e-06, "loss": 0.0485, "num_input_tokens_seen": 6394816, "step": 12995 }, { "epoch": 1.715718622145968, "grad_norm": 13.659415245056152, "learning_rate": 1.6610298295941347e-06, "loss": 0.1353, "num_input_tokens_seen": 6397376, "step": 13000 }, { "epoch": 1.7163785139237167, "grad_norm": 71.0516128540039, "learning_rate": 1.6606840701854476e-06, "loss": 0.1308, "num_input_tokens_seen": 6399936, "step": 13005 }, { "epoch": 1.717038405701465, "grad_norm": 15.572142601013184, "learning_rate": 1.660338170551351e-06, "loss": 0.1169, "num_input_tokens_seen": 6402560, "step": 13010 }, { "epoch": 1.7176982974792134, "grad_norm": 39.39515686035156, "learning_rate": 1.6599921307652598e-06, "loss": 0.081, "num_input_tokens_seen": 6404928, "step": 13015 }, { "epoch": 1.718358189256962, "grad_norm": 0.8864211440086365, "learning_rate": 1.659645950900618e-06, "loss": 0.0065, "num_input_tokens_seen": 6407552, "step": 13020 }, { "epoch": 1.7190180810347102, "grad_norm": 0.14865431189537048, "learning_rate": 1.6592996310308997e-06, "loss": 0.1286, "num_input_tokens_seen": 6410240, "step": 13025 }, { "epoch": 1.7196779728124587, "grad_norm": 10.887186050415039, "learning_rate": 1.658953171229609e-06, "loss": 0.1372, "num_input_tokens_seen": 6412480, "step": 13030 }, { "epoch": 1.7203378645902072, "grad_norm": 31.245393753051758, "learning_rate": 1.6586065715702797e-06, "loss": 0.0805, "num_input_tokens_seen": 6415104, "step": 13035 }, { "epoch": 1.7209977563679555, "grad_norm": 0.3583381175994873, "learning_rate": 1.658259832126475e-06, "loss": 0.04, "num_input_tokens_seen": 6417728, "step": 13040 }, { "epoch": 1.7216576481457042, "grad_norm": 14.247611999511719, "learning_rate": 1.6579129529717872e-06, "loss": 0.1079, "num_input_tokens_seen": 6420544, "step": 13045 }, { "epoch": 1.7223175399234525, "grad_norm": 0.24372056126594543, "learning_rate": 1.6575659341798396e-06, "loss": 0.0819, "num_input_tokens_seen": 6422976, "step": 13050 }, { "epoch": 1.722977431701201, "grad_norm": 0.5238776206970215, "learning_rate": 1.6572187758242842e-06, "loss": 0.1217, "num_input_tokens_seen": 6425216, "step": 13055 }, { "epoch": 1.7236373234789495, "grad_norm": 0.1704397052526474, "learning_rate": 1.6568714779788024e-06, "loss": 0.3001, "num_input_tokens_seen": 6427456, "step": 13060 }, { "epoch": 1.7242972152566978, "grad_norm": 0.049737852066755295, "learning_rate": 1.6565240407171067e-06, "loss": 0.0018, "num_input_tokens_seen": 6429824, "step": 13065 }, { "epoch": 1.7249571070344465, "grad_norm": 14.743538856506348, "learning_rate": 1.6561764641129371e-06, "loss": 0.1615, "num_input_tokens_seen": 6432128, "step": 13070 }, { "epoch": 1.7256169988121948, "grad_norm": 0.7518868446350098, "learning_rate": 1.655828748240065e-06, "loss": 0.1621, "num_input_tokens_seen": 6434560, "step": 13075 }, { "epoch": 1.7262768905899433, "grad_norm": 0.31233370304107666, "learning_rate": 1.6554808931722902e-06, "loss": 0.0018, "num_input_tokens_seen": 6437120, "step": 13080 }, { "epoch": 1.7269367823676918, "grad_norm": 28.600543975830078, "learning_rate": 1.6551328989834423e-06, "loss": 0.113, "num_input_tokens_seen": 6439616, "step": 13085 }, { "epoch": 1.72759667414544, "grad_norm": 0.4299144744873047, "learning_rate": 1.6547847657473805e-06, "loss": 0.0015, "num_input_tokens_seen": 6442240, "step": 13090 }, { "epoch": 1.7282565659231885, "grad_norm": 0.061676811426877975, "learning_rate": 1.654436493537994e-06, "loss": 0.0015, "num_input_tokens_seen": 6444864, "step": 13095 }, { "epoch": 1.728916457700937, "grad_norm": 0.16284187138080597, "learning_rate": 1.6540880824292008e-06, "loss": 0.0301, "num_input_tokens_seen": 6447296, "step": 13100 }, { "epoch": 1.7295763494786855, "grad_norm": 0.05911894142627716, "learning_rate": 1.6537395324949489e-06, "loss": 0.1013, "num_input_tokens_seen": 6449408, "step": 13105 }, { "epoch": 1.730236241256434, "grad_norm": 0.01173966471105814, "learning_rate": 1.6533908438092149e-06, "loss": 0.0005, "num_input_tokens_seen": 6451968, "step": 13110 }, { "epoch": 1.7308961330341823, "grad_norm": 0.08755994588136673, "learning_rate": 1.6530420164460055e-06, "loss": 0.0571, "num_input_tokens_seen": 6454272, "step": 13115 }, { "epoch": 1.7315560248119308, "grad_norm": 27.81377410888672, "learning_rate": 1.6526930504793576e-06, "loss": 0.0762, "num_input_tokens_seen": 6456640, "step": 13120 }, { "epoch": 1.7322159165896793, "grad_norm": 0.029269201681017876, "learning_rate": 1.6523439459833357e-06, "loss": 0.001, "num_input_tokens_seen": 6459136, "step": 13125 }, { "epoch": 1.7328758083674276, "grad_norm": 41.80912780761719, "learning_rate": 1.6519947030320356e-06, "loss": 0.0997, "num_input_tokens_seen": 6461376, "step": 13130 }, { "epoch": 1.7335357001451763, "grad_norm": 0.032881107181310654, "learning_rate": 1.651645321699581e-06, "loss": 0.0785, "num_input_tokens_seen": 6463936, "step": 13135 }, { "epoch": 1.7341955919229246, "grad_norm": 17.294940948486328, "learning_rate": 1.6512958020601256e-06, "loss": 0.0607, "num_input_tokens_seen": 6466432, "step": 13140 }, { "epoch": 1.734855483700673, "grad_norm": 4.170657634735107, "learning_rate": 1.6509461441878527e-06, "loss": 0.0482, "num_input_tokens_seen": 6468800, "step": 13145 }, { "epoch": 1.7355153754784216, "grad_norm": 4.368320941925049, "learning_rate": 1.6505963481569745e-06, "loss": 0.0163, "num_input_tokens_seen": 6471360, "step": 13150 }, { "epoch": 1.7361752672561699, "grad_norm": 0.008847353979945183, "learning_rate": 1.6502464140417326e-06, "loss": 0.0005, "num_input_tokens_seen": 6473856, "step": 13155 }, { "epoch": 1.7368351590339184, "grad_norm": 204.90426635742188, "learning_rate": 1.6498963419163978e-06, "loss": 0.2147, "num_input_tokens_seen": 6476288, "step": 13160 }, { "epoch": 1.7374950508116669, "grad_norm": 0.08478450030088425, "learning_rate": 1.6495461318552708e-06, "loss": 0.1285, "num_input_tokens_seen": 6478528, "step": 13165 }, { "epoch": 1.7381549425894154, "grad_norm": 27.09694480895996, "learning_rate": 1.6491957839326812e-06, "loss": 0.0915, "num_input_tokens_seen": 6481024, "step": 13170 }, { "epoch": 1.7388148343671639, "grad_norm": 201.52137756347656, "learning_rate": 1.6488452982229873e-06, "loss": 0.0941, "num_input_tokens_seen": 6483136, "step": 13175 }, { "epoch": 1.7394747261449122, "grad_norm": 0.0733003094792366, "learning_rate": 1.6484946748005773e-06, "loss": 0.127, "num_input_tokens_seen": 6485824, "step": 13180 }, { "epoch": 1.7401346179226607, "grad_norm": 0.0202048197388649, "learning_rate": 1.6481439137398688e-06, "loss": 0.0833, "num_input_tokens_seen": 6488384, "step": 13185 }, { "epoch": 1.7407945097004092, "grad_norm": 0.060185838490724564, "learning_rate": 1.6477930151153078e-06, "loss": 0.0005, "num_input_tokens_seen": 6491072, "step": 13190 }, { "epoch": 1.7414544014781574, "grad_norm": 11.041423797607422, "learning_rate": 1.6474419790013707e-06, "loss": 0.0576, "num_input_tokens_seen": 6493568, "step": 13195 }, { "epoch": 1.7421142932559062, "grad_norm": 23.28339385986328, "learning_rate": 1.6470908054725617e-06, "loss": 0.0509, "num_input_tokens_seen": 6496320, "step": 13200 }, { "epoch": 1.7427741850336544, "grad_norm": 0.029013272374868393, "learning_rate": 1.6467394946034152e-06, "loss": 0.0011, "num_input_tokens_seen": 6498560, "step": 13205 }, { "epoch": 1.743434076811403, "grad_norm": 0.33198022842407227, "learning_rate": 1.6463880464684942e-06, "loss": 0.0763, "num_input_tokens_seen": 6500928, "step": 13210 }, { "epoch": 1.7440939685891514, "grad_norm": 15.056183815002441, "learning_rate": 1.6460364611423911e-06, "loss": 0.069, "num_input_tokens_seen": 6503424, "step": 13215 }, { "epoch": 1.7447538603668997, "grad_norm": 10.846746444702148, "learning_rate": 1.6456847386997277e-06, "loss": 0.1996, "num_input_tokens_seen": 6505792, "step": 13220 }, { "epoch": 1.7454137521446482, "grad_norm": 0.034079235047101974, "learning_rate": 1.6453328792151537e-06, "loss": 0.0445, "num_input_tokens_seen": 6508160, "step": 13225 }, { "epoch": 1.7460736439223967, "grad_norm": 0.034832458943128586, "learning_rate": 1.6449808827633497e-06, "loss": 0.0011, "num_input_tokens_seen": 6510976, "step": 13230 }, { "epoch": 1.7467335357001452, "grad_norm": 0.38657528162002563, "learning_rate": 1.6446287494190237e-06, "loss": 0.0272, "num_input_tokens_seen": 6513216, "step": 13235 }, { "epoch": 1.7473934274778937, "grad_norm": 58.905696868896484, "learning_rate": 1.6442764792569136e-06, "loss": 0.0498, "num_input_tokens_seen": 6515904, "step": 13240 }, { "epoch": 1.748053319255642, "grad_norm": 13.200113296508789, "learning_rate": 1.6439240723517862e-06, "loss": 0.1935, "num_input_tokens_seen": 6518528, "step": 13245 }, { "epoch": 1.7487132110333905, "grad_norm": 0.960753321647644, "learning_rate": 1.6435715287784375e-06, "loss": 0.0038, "num_input_tokens_seen": 6520960, "step": 13250 }, { "epoch": 1.749373102811139, "grad_norm": 0.26324328780174255, "learning_rate": 1.643218848611692e-06, "loss": 0.0558, "num_input_tokens_seen": 6523520, "step": 13255 }, { "epoch": 1.7500329945888873, "grad_norm": 38.99361038208008, "learning_rate": 1.642866031926404e-06, "loss": 0.2054, "num_input_tokens_seen": 6526080, "step": 13260 }, { "epoch": 1.750692886366636, "grad_norm": 0.039592694491147995, "learning_rate": 1.6425130787974558e-06, "loss": 0.342, "num_input_tokens_seen": 6528448, "step": 13265 }, { "epoch": 1.750692886366636, "eval_loss": 0.11515690386295319, "eval_runtime": 7.8387, "eval_samples_per_second": 859.203, "eval_steps_per_second": 107.416, "num_input_tokens_seen": 6528448, "step": 13265 }, { "epoch": 1.7513527781443843, "grad_norm": 0.28292742371559143, "learning_rate": 1.6421599892997596e-06, "loss": 0.0822, "num_input_tokens_seen": 6531136, "step": 13270 }, { "epoch": 1.7520126699221328, "grad_norm": 0.0518592968583107, "learning_rate": 1.6418067635082555e-06, "loss": 0.2525, "num_input_tokens_seen": 6533824, "step": 13275 }, { "epoch": 1.7526725616998813, "grad_norm": 22.125919342041016, "learning_rate": 1.6414534014979138e-06, "loss": 0.1643, "num_input_tokens_seen": 6536256, "step": 13280 }, { "epoch": 1.7533324534776296, "grad_norm": 0.2832372188568115, "learning_rate": 1.6410999033437323e-06, "loss": 0.0185, "num_input_tokens_seen": 6538688, "step": 13285 }, { "epoch": 1.7539923452553783, "grad_norm": 0.6796748042106628, "learning_rate": 1.640746269120739e-06, "loss": 0.0024, "num_input_tokens_seen": 6541376, "step": 13290 }, { "epoch": 1.7546522370331266, "grad_norm": 0.2318210005760193, "learning_rate": 1.6403924989039899e-06, "loss": 0.001, "num_input_tokens_seen": 6543744, "step": 13295 }, { "epoch": 1.755312128810875, "grad_norm": 15.547673225402832, "learning_rate": 1.6400385927685706e-06, "loss": 0.0786, "num_input_tokens_seen": 6546112, "step": 13300 }, { "epoch": 1.7559720205886236, "grad_norm": 0.2586086392402649, "learning_rate": 1.6396845507895942e-06, "loss": 0.0008, "num_input_tokens_seen": 6548544, "step": 13305 }, { "epoch": 1.7566319123663718, "grad_norm": 0.3337395191192627, "learning_rate": 1.6393303730422046e-06, "loss": 0.0397, "num_input_tokens_seen": 6550976, "step": 13310 }, { "epoch": 1.7572918041441203, "grad_norm": 0.03228778764605522, "learning_rate": 1.6389760596015727e-06, "loss": 0.027, "num_input_tokens_seen": 6553536, "step": 13315 }, { "epoch": 1.7579516959218688, "grad_norm": 15.461897850036621, "learning_rate": 1.6386216105428993e-06, "loss": 0.0348, "num_input_tokens_seen": 6556160, "step": 13320 }, { "epoch": 1.7586115876996171, "grad_norm": 0.05416465923190117, "learning_rate": 1.6382670259414138e-06, "loss": 0.0428, "num_input_tokens_seen": 6558336, "step": 13325 }, { "epoch": 1.7592714794773658, "grad_norm": 3.9760372638702393, "learning_rate": 1.637912305872374e-06, "loss": 0.1086, "num_input_tokens_seen": 6560960, "step": 13330 }, { "epoch": 1.7599313712551141, "grad_norm": 1.265533208847046, "learning_rate": 1.6375574504110664e-06, "loss": 0.0529, "num_input_tokens_seen": 6563200, "step": 13335 }, { "epoch": 1.7605912630328626, "grad_norm": 0.09937909245491028, "learning_rate": 1.637202459632807e-06, "loss": 0.0012, "num_input_tokens_seen": 6566144, "step": 13340 }, { "epoch": 1.7612511548106111, "grad_norm": 0.016885558143258095, "learning_rate": 1.6368473336129395e-06, "loss": 0.1122, "num_input_tokens_seen": 6568512, "step": 13345 }, { "epoch": 1.7619110465883594, "grad_norm": 30.21272087097168, "learning_rate": 1.6364920724268374e-06, "loss": 0.1335, "num_input_tokens_seen": 6570688, "step": 13350 }, { "epoch": 1.7625709383661081, "grad_norm": 0.06372368335723877, "learning_rate": 1.6361366761499023e-06, "loss": 0.2444, "num_input_tokens_seen": 6573376, "step": 13355 }, { "epoch": 1.7632308301438564, "grad_norm": 0.15929684042930603, "learning_rate": 1.6357811448575638e-06, "loss": 0.0877, "num_input_tokens_seen": 6575488, "step": 13360 }, { "epoch": 1.763890721921605, "grad_norm": 0.1368577927350998, "learning_rate": 1.6354254786252813e-06, "loss": 0.0697, "num_input_tokens_seen": 6577856, "step": 13365 }, { "epoch": 1.7645506136993534, "grad_norm": 0.12535516917705536, "learning_rate": 1.6350696775285425e-06, "loss": 0.0228, "num_input_tokens_seen": 6580096, "step": 13370 }, { "epoch": 1.7652105054771017, "grad_norm": 7.772195339202881, "learning_rate": 1.6347137416428637e-06, "loss": 0.0148, "num_input_tokens_seen": 6582592, "step": 13375 }, { "epoch": 1.7658703972548502, "grad_norm": 0.09801344573497772, "learning_rate": 1.634357671043789e-06, "loss": 0.1398, "num_input_tokens_seen": 6585216, "step": 13380 }, { "epoch": 1.7665302890325987, "grad_norm": 0.25059768557548523, "learning_rate": 1.6340014658068923e-06, "loss": 0.0012, "num_input_tokens_seen": 6588032, "step": 13385 }, { "epoch": 1.767190180810347, "grad_norm": 0.02534446120262146, "learning_rate": 1.6336451260077757e-06, "loss": 0.0724, "num_input_tokens_seen": 6590528, "step": 13390 }, { "epoch": 1.7678500725880957, "grad_norm": 0.018427465111017227, "learning_rate": 1.6332886517220694e-06, "loss": 0.1463, "num_input_tokens_seen": 6592960, "step": 13395 }, { "epoch": 1.768509964365844, "grad_norm": 0.08824898302555084, "learning_rate": 1.632932043025433e-06, "loss": 0.0916, "num_input_tokens_seen": 6595136, "step": 13400 }, { "epoch": 1.7691698561435925, "grad_norm": 47.151695251464844, "learning_rate": 1.6325752999935539e-06, "loss": 0.2149, "num_input_tokens_seen": 6597824, "step": 13405 }, { "epoch": 1.769829747921341, "grad_norm": 2.347273349761963, "learning_rate": 1.6322184227021479e-06, "loss": 0.0036, "num_input_tokens_seen": 6600128, "step": 13410 }, { "epoch": 1.7704896396990892, "grad_norm": 0.21026629209518433, "learning_rate": 1.6318614112269598e-06, "loss": 0.0014, "num_input_tokens_seen": 6602944, "step": 13415 }, { "epoch": 1.771149531476838, "grad_norm": 0.20188181102275848, "learning_rate": 1.631504265643763e-06, "loss": 0.1428, "num_input_tokens_seen": 6605184, "step": 13420 }, { "epoch": 1.7718094232545862, "grad_norm": 0.3236222267150879, "learning_rate": 1.6311469860283584e-06, "loss": 0.0775, "num_input_tokens_seen": 6607680, "step": 13425 }, { "epoch": 1.7724693150323347, "grad_norm": 0.04968617111444473, "learning_rate": 1.6307895724565768e-06, "loss": 0.0653, "num_input_tokens_seen": 6610048, "step": 13430 }, { "epoch": 1.7731292068100832, "grad_norm": 17.058731079101562, "learning_rate": 1.6304320250042761e-06, "loss": 0.0665, "num_input_tokens_seen": 6612480, "step": 13435 }, { "epoch": 1.7737890985878315, "grad_norm": 0.0495927557349205, "learning_rate": 1.6300743437473434e-06, "loss": 0.0553, "num_input_tokens_seen": 6614784, "step": 13440 }, { "epoch": 1.77444899036558, "grad_norm": 0.4401050806045532, "learning_rate": 1.6297165287616936e-06, "loss": 0.0466, "num_input_tokens_seen": 6617152, "step": 13445 }, { "epoch": 1.7751088821433285, "grad_norm": 18.053531646728516, "learning_rate": 1.629358580123271e-06, "loss": 0.2752, "num_input_tokens_seen": 6619648, "step": 13450 }, { "epoch": 1.7757687739210768, "grad_norm": 4.440788745880127, "learning_rate": 1.6290004979080473e-06, "loss": 0.0518, "num_input_tokens_seen": 6622016, "step": 13455 }, { "epoch": 1.7764286656988255, "grad_norm": 0.09185050427913666, "learning_rate": 1.6286422821920222e-06, "loss": 0.1611, "num_input_tokens_seen": 6624384, "step": 13460 }, { "epoch": 1.7770885574765738, "grad_norm": 0.1410074383020401, "learning_rate": 1.6282839330512252e-06, "loss": 0.1648, "num_input_tokens_seen": 6626880, "step": 13465 }, { "epoch": 1.7777484492543223, "grad_norm": 118.95601654052734, "learning_rate": 1.6279254505617134e-06, "loss": 0.0707, "num_input_tokens_seen": 6629760, "step": 13470 }, { "epoch": 1.7784083410320708, "grad_norm": 0.0755147859454155, "learning_rate": 1.6275668347995714e-06, "loss": 0.0006, "num_input_tokens_seen": 6632576, "step": 13475 }, { "epoch": 1.779068232809819, "grad_norm": 0.015661204233765602, "learning_rate": 1.6272080858409138e-06, "loss": 0.0429, "num_input_tokens_seen": 6635136, "step": 13480 }, { "epoch": 1.7797281245875678, "grad_norm": 0.10322346538305283, "learning_rate": 1.6268492037618815e-06, "loss": 0.0012, "num_input_tokens_seen": 6637440, "step": 13485 }, { "epoch": 1.780388016365316, "grad_norm": 0.12164479494094849, "learning_rate": 1.6264901886386448e-06, "loss": 0.0592, "num_input_tokens_seen": 6640320, "step": 13490 }, { "epoch": 1.7810479081430646, "grad_norm": 0.055566031485795975, "learning_rate": 1.6261310405474022e-06, "loss": 0.1364, "num_input_tokens_seen": 6643136, "step": 13495 }, { "epoch": 1.781707799920813, "grad_norm": 80.7416763305664, "learning_rate": 1.6257717595643807e-06, "loss": 0.0181, "num_input_tokens_seen": 6645568, "step": 13500 }, { "epoch": 1.7823676916985614, "grad_norm": 84.49398803710938, "learning_rate": 1.6254123457658346e-06, "loss": 0.1195, "num_input_tokens_seen": 6648128, "step": 13505 }, { "epoch": 1.7830275834763099, "grad_norm": 0.12812452018260956, "learning_rate": 1.625052799228047e-06, "loss": 0.0976, "num_input_tokens_seen": 6650880, "step": 13510 }, { "epoch": 1.7836874752540584, "grad_norm": 24.594783782958984, "learning_rate": 1.624693120027329e-06, "loss": 0.0987, "num_input_tokens_seen": 6653504, "step": 13515 }, { "epoch": 1.7843473670318066, "grad_norm": 0.302664190530777, "learning_rate": 1.6243333082400197e-06, "loss": 0.0598, "num_input_tokens_seen": 6655936, "step": 13520 }, { "epoch": 1.7850072588095554, "grad_norm": 0.029307713732123375, "learning_rate": 1.623973363942487e-06, "loss": 0.0493, "num_input_tokens_seen": 6658432, "step": 13525 }, { "epoch": 1.7856671505873036, "grad_norm": 0.23820511996746063, "learning_rate": 1.6236132872111266e-06, "loss": 0.0007, "num_input_tokens_seen": 6660800, "step": 13530 }, { "epoch": 1.7863270423650521, "grad_norm": 0.1212558001279831, "learning_rate": 1.6232530781223613e-06, "loss": 0.0938, "num_input_tokens_seen": 6663232, "step": 13535 }, { "epoch": 1.7869869341428006, "grad_norm": 21.21772575378418, "learning_rate": 1.6228927367526437e-06, "loss": 0.1012, "num_input_tokens_seen": 6665792, "step": 13540 }, { "epoch": 1.787646825920549, "grad_norm": 0.670257568359375, "learning_rate": 1.6225322631784533e-06, "loss": 0.0525, "num_input_tokens_seen": 6668352, "step": 13545 }, { "epoch": 1.7883067176982976, "grad_norm": 0.9238318204879761, "learning_rate": 1.622171657476298e-06, "loss": 0.0015, "num_input_tokens_seen": 6670976, "step": 13550 }, { "epoch": 1.788966609476046, "grad_norm": 0.06575516611337662, "learning_rate": 1.621810919722714e-06, "loss": 0.0724, "num_input_tokens_seen": 6673472, "step": 13555 }, { "epoch": 1.7896265012537944, "grad_norm": 0.31715166568756104, "learning_rate": 1.6214500499942649e-06, "loss": 0.0431, "num_input_tokens_seen": 6675904, "step": 13560 }, { "epoch": 1.790286393031543, "grad_norm": 0.0651044175028801, "learning_rate": 1.6210890483675427e-06, "loss": 0.1878, "num_input_tokens_seen": 6678208, "step": 13565 }, { "epoch": 1.7909462848092912, "grad_norm": 0.027278663590550423, "learning_rate": 1.620727914919168e-06, "loss": 0.0459, "num_input_tokens_seen": 6680704, "step": 13570 }, { "epoch": 1.7916061765870397, "grad_norm": 0.015751399099826813, "learning_rate": 1.620366649725788e-06, "loss": 0.0964, "num_input_tokens_seen": 6683136, "step": 13575 }, { "epoch": 1.7922660683647882, "grad_norm": 22.279233932495117, "learning_rate": 1.6200052528640792e-06, "loss": 0.0529, "num_input_tokens_seen": 6685632, "step": 13580 }, { "epoch": 1.7929259601425365, "grad_norm": 12.878225326538086, "learning_rate": 1.619643724410745e-06, "loss": 0.1235, "num_input_tokens_seen": 6687872, "step": 13585 }, { "epoch": 1.7935858519202852, "grad_norm": 0.34992167353630066, "learning_rate": 1.6192820644425176e-06, "loss": 0.002, "num_input_tokens_seen": 6690368, "step": 13590 }, { "epoch": 1.7942457436980335, "grad_norm": 17.81177520751953, "learning_rate": 1.6189202730361563e-06, "loss": 0.0692, "num_input_tokens_seen": 6692992, "step": 13595 }, { "epoch": 1.794905635475782, "grad_norm": 135.81553649902344, "learning_rate": 1.618558350268449e-06, "loss": 0.0568, "num_input_tokens_seen": 6695168, "step": 13600 }, { "epoch": 1.7955655272535305, "grad_norm": 0.016782505437731743, "learning_rate": 1.618196296216211e-06, "loss": 0.1914, "num_input_tokens_seen": 6697536, "step": 13605 }, { "epoch": 1.7962254190312787, "grad_norm": 0.269696980714798, "learning_rate": 1.6178341109562859e-06, "loss": 0.073, "num_input_tokens_seen": 6700288, "step": 13610 }, { "epoch": 1.7968853108090275, "grad_norm": 3.4058849811553955, "learning_rate": 1.6174717945655446e-06, "loss": 0.0581, "num_input_tokens_seen": 6702912, "step": 13615 }, { "epoch": 1.7975452025867757, "grad_norm": 0.19131074845790863, "learning_rate": 1.6171093471208863e-06, "loss": 0.083, "num_input_tokens_seen": 6705408, "step": 13620 }, { "epoch": 1.7982050943645242, "grad_norm": 0.05940356105566025, "learning_rate": 1.616746768699238e-06, "loss": 0.0008, "num_input_tokens_seen": 6707904, "step": 13625 }, { "epoch": 1.7988649861422727, "grad_norm": 18.854990005493164, "learning_rate": 1.616384059377554e-06, "loss": 0.0738, "num_input_tokens_seen": 6710400, "step": 13630 }, { "epoch": 1.799524877920021, "grad_norm": 0.020753202959895134, "learning_rate": 1.616021219232817e-06, "loss": 0.0009, "num_input_tokens_seen": 6712896, "step": 13635 }, { "epoch": 1.8001847696977695, "grad_norm": 0.0038060767110437155, "learning_rate": 1.6156582483420374e-06, "loss": 0.0371, "num_input_tokens_seen": 6715520, "step": 13640 }, { "epoch": 1.800844661475518, "grad_norm": 0.3243776857852936, "learning_rate": 1.6152951467822523e-06, "loss": 0.1065, "num_input_tokens_seen": 6718080, "step": 13645 }, { "epoch": 1.8015045532532663, "grad_norm": 54.605873107910156, "learning_rate": 1.614931914630528e-06, "loss": 0.1254, "num_input_tokens_seen": 6720320, "step": 13650 }, { "epoch": 1.802164445031015, "grad_norm": 66.58609771728516, "learning_rate": 1.6145685519639577e-06, "loss": 0.0896, "num_input_tokens_seen": 6722816, "step": 13655 }, { "epoch": 1.8028243368087633, "grad_norm": 0.15989159047603607, "learning_rate": 1.6142050588596631e-06, "loss": 0.0551, "num_input_tokens_seen": 6725120, "step": 13660 }, { "epoch": 1.8034842285865118, "grad_norm": 8.184621810913086, "learning_rate": 1.6138414353947923e-06, "loss": 0.0597, "num_input_tokens_seen": 6727616, "step": 13665 }, { "epoch": 1.8041441203642603, "grad_norm": 0.01618696190416813, "learning_rate": 1.613477681646522e-06, "loss": 0.0805, "num_input_tokens_seen": 6730240, "step": 13670 }, { "epoch": 1.8048040121420086, "grad_norm": 17.110807418823242, "learning_rate": 1.6131137976920556e-06, "loss": 0.0583, "num_input_tokens_seen": 6732608, "step": 13675 }, { "epoch": 1.8054639039197573, "grad_norm": 15.315784454345703, "learning_rate": 1.612749783608626e-06, "loss": 0.1257, "num_input_tokens_seen": 6735168, "step": 13680 }, { "epoch": 1.8061237956975056, "grad_norm": 0.023321373388171196, "learning_rate": 1.612385639473492e-06, "loss": 0.1853, "num_input_tokens_seen": 6737664, "step": 13685 }, { "epoch": 1.806783687475254, "grad_norm": 0.15307167172431946, "learning_rate": 1.6120213653639407e-06, "loss": 0.0448, "num_input_tokens_seen": 6740224, "step": 13690 }, { "epoch": 1.8074435792530026, "grad_norm": 5.365473747253418, "learning_rate": 1.6116569613572861e-06, "loss": 0.0295, "num_input_tokens_seen": 6742848, "step": 13695 }, { "epoch": 1.8081034710307509, "grad_norm": 0.040078677237033844, "learning_rate": 1.611292427530871e-06, "loss": 0.1216, "num_input_tokens_seen": 6745472, "step": 13700 }, { "epoch": 1.8087633628084994, "grad_norm": 11.159900665283203, "learning_rate": 1.6109277639620648e-06, "loss": 0.0853, "num_input_tokens_seen": 6747904, "step": 13705 }, { "epoch": 1.8094232545862479, "grad_norm": 0.08733681589365005, "learning_rate": 1.6105629707282645e-06, "loss": 0.1297, "num_input_tokens_seen": 6750592, "step": 13710 }, { "epoch": 1.8100831463639961, "grad_norm": 0.5698712468147278, "learning_rate": 1.6101980479068954e-06, "loss": 0.0997, "num_input_tokens_seen": 6753024, "step": 13715 }, { "epoch": 1.8107430381417449, "grad_norm": 14.043471336364746, "learning_rate": 1.609832995575409e-06, "loss": 0.0846, "num_input_tokens_seen": 6755392, "step": 13720 }, { "epoch": 1.8114029299194931, "grad_norm": 0.5709534287452698, "learning_rate": 1.6094678138112854e-06, "loss": 0.0217, "num_input_tokens_seen": 6757696, "step": 13725 }, { "epoch": 1.8120628216972416, "grad_norm": 0.6098952293395996, "learning_rate": 1.6091025026920316e-06, "loss": 0.0975, "num_input_tokens_seen": 6760128, "step": 13730 }, { "epoch": 1.8127227134749901, "grad_norm": 0.5241377353668213, "learning_rate": 1.6087370622951824e-06, "loss": 0.122, "num_input_tokens_seen": 6762496, "step": 13735 }, { "epoch": 1.8133826052527384, "grad_norm": 0.034056950360536575, "learning_rate": 1.6083714926983004e-06, "loss": 0.0623, "num_input_tokens_seen": 6765120, "step": 13740 }, { "epoch": 1.8140424970304871, "grad_norm": 0.13510484993457794, "learning_rate": 1.608005793978974e-06, "loss": 0.0799, "num_input_tokens_seen": 6767808, "step": 13745 }, { "epoch": 1.8147023888082354, "grad_norm": 29.59540557861328, "learning_rate": 1.6076399662148207e-06, "loss": 0.1322, "num_input_tokens_seen": 6770240, "step": 13750 }, { "epoch": 1.815362280585984, "grad_norm": 0.1105113998055458, "learning_rate": 1.6072740094834848e-06, "loss": 0.0825, "num_input_tokens_seen": 6772608, "step": 13755 }, { "epoch": 1.8160221723637324, "grad_norm": 46.684200286865234, "learning_rate": 1.606907923862638e-06, "loss": 0.1762, "num_input_tokens_seen": 6774976, "step": 13760 }, { "epoch": 1.8166820641414807, "grad_norm": 0.05612744390964508, "learning_rate": 1.6065417094299793e-06, "loss": 0.0425, "num_input_tokens_seen": 6777408, "step": 13765 }, { "epoch": 1.8173419559192292, "grad_norm": 0.03116508573293686, "learning_rate": 1.6061753662632352e-06, "loss": 0.0008, "num_input_tokens_seen": 6779584, "step": 13770 }, { "epoch": 1.8180018476969777, "grad_norm": 0.39724618196487427, "learning_rate": 1.6058088944401586e-06, "loss": 0.0566, "num_input_tokens_seen": 6782016, "step": 13775 }, { "epoch": 1.818661739474726, "grad_norm": 0.18111243844032288, "learning_rate": 1.6054422940385315e-06, "loss": 0.07, "num_input_tokens_seen": 6784576, "step": 13780 }, { "epoch": 1.8193216312524747, "grad_norm": 0.08874604851007462, "learning_rate": 1.6050755651361617e-06, "loss": 0.0423, "num_input_tokens_seen": 6787072, "step": 13785 }, { "epoch": 1.819981523030223, "grad_norm": 0.08461711555719376, "learning_rate": 1.6047087078108848e-06, "loss": 0.1433, "num_input_tokens_seen": 6789376, "step": 13790 }, { "epoch": 1.8206414148079715, "grad_norm": 0.2662123441696167, "learning_rate": 1.6043417221405636e-06, "loss": 0.0027, "num_input_tokens_seen": 6792000, "step": 13795 }, { "epoch": 1.82130130658572, "grad_norm": 15.94675064086914, "learning_rate": 1.6039746082030878e-06, "loss": 0.0624, "num_input_tokens_seen": 6794368, "step": 13800 }, { "epoch": 1.8219611983634683, "grad_norm": 0.13098464906215668, "learning_rate": 1.6036073660763755e-06, "loss": 0.0008, "num_input_tokens_seen": 6796800, "step": 13805 }, { "epoch": 1.822621090141217, "grad_norm": 51.529579162597656, "learning_rate": 1.6032399958383706e-06, "loss": 0.0646, "num_input_tokens_seen": 6799232, "step": 13810 }, { "epoch": 1.8232809819189653, "grad_norm": 13.87507152557373, "learning_rate": 1.6028724975670454e-06, "loss": 0.1131, "num_input_tokens_seen": 6801792, "step": 13815 }, { "epoch": 1.8239408736967138, "grad_norm": 0.11702122539281845, "learning_rate": 1.6025048713403977e-06, "loss": 0.1451, "num_input_tokens_seen": 6804416, "step": 13820 }, { "epoch": 1.8246007654744623, "grad_norm": 12.326350212097168, "learning_rate": 1.6021371172364543e-06, "loss": 0.0421, "num_input_tokens_seen": 6806912, "step": 13825 }, { "epoch": 1.8252606572522105, "grad_norm": 0.25664016604423523, "learning_rate": 1.6017692353332676e-06, "loss": 0.0012, "num_input_tokens_seen": 6809280, "step": 13830 }, { "epoch": 1.825920549029959, "grad_norm": 0.08013727515935898, "learning_rate": 1.6014012257089186e-06, "loss": 0.0015, "num_input_tokens_seen": 6811776, "step": 13835 }, { "epoch": 1.8265804408077075, "grad_norm": 0.02246575430035591, "learning_rate": 1.6010330884415146e-06, "loss": 0.0004, "num_input_tokens_seen": 6814080, "step": 13840 }, { "epoch": 1.827240332585456, "grad_norm": 0.10979142040014267, "learning_rate": 1.6006648236091903e-06, "loss": 0.088, "num_input_tokens_seen": 6816512, "step": 13845 }, { "epoch": 1.8279002243632045, "grad_norm": 0.033827897161245346, "learning_rate": 1.600296431290106e-06, "loss": 0.0002, "num_input_tokens_seen": 6818752, "step": 13850 }, { "epoch": 1.8285601161409528, "grad_norm": 0.05608005449175835, "learning_rate": 1.5999279115624517e-06, "loss": 0.1294, "num_input_tokens_seen": 6821248, "step": 13855 }, { "epoch": 1.8292200079187013, "grad_norm": 0.1876734495162964, "learning_rate": 1.5995592645044424e-06, "loss": 0.1328, "num_input_tokens_seen": 6823808, "step": 13860 }, { "epoch": 1.8298798996964498, "grad_norm": 0.1528148502111435, "learning_rate": 1.599190490194321e-06, "loss": 0.1448, "num_input_tokens_seen": 6826432, "step": 13865 }, { "epoch": 1.830539791474198, "grad_norm": 0.06932291388511658, "learning_rate": 1.5988215887103568e-06, "loss": 0.0097, "num_input_tokens_seen": 6828800, "step": 13870 }, { "epoch": 1.8311996832519468, "grad_norm": 0.004690032918006182, "learning_rate": 1.598452560130847e-06, "loss": 0.1828, "num_input_tokens_seen": 6831360, "step": 13875 }, { "epoch": 1.831859575029695, "grad_norm": 28.09312629699707, "learning_rate": 1.598083404534115e-06, "loss": 0.1433, "num_input_tokens_seen": 6833664, "step": 13880 }, { "epoch": 1.8325194668074436, "grad_norm": 0.13816773891448975, "learning_rate": 1.597714121998511e-06, "loss": 0.065, "num_input_tokens_seen": 6835904, "step": 13885 }, { "epoch": 1.833179358585192, "grad_norm": 0.2143886536359787, "learning_rate": 1.5973447126024131e-06, "loss": 0.1494, "num_input_tokens_seen": 6838336, "step": 13890 }, { "epoch": 1.8338392503629404, "grad_norm": 0.11392121762037277, "learning_rate": 1.596975176424226e-06, "loss": 0.2852, "num_input_tokens_seen": 6841024, "step": 13895 }, { "epoch": 1.8344991421406889, "grad_norm": 0.045107193291187286, "learning_rate": 1.5966055135423798e-06, "loss": 0.1151, "num_input_tokens_seen": 6843200, "step": 13900 }, { "epoch": 1.8351590339184374, "grad_norm": 0.13529819250106812, "learning_rate": 1.5962357240353342e-06, "loss": 0.004, "num_input_tokens_seen": 6845568, "step": 13905 }, { "epoch": 1.8358189256961859, "grad_norm": 0.3247222900390625, "learning_rate": 1.5958658079815737e-06, "loss": 0.0585, "num_input_tokens_seen": 6848000, "step": 13910 }, { "epoch": 1.8364788174739344, "grad_norm": 20.190378189086914, "learning_rate": 1.5954957654596102e-06, "loss": 0.0423, "num_input_tokens_seen": 6850688, "step": 13915 }, { "epoch": 1.8371387092516827, "grad_norm": 0.09919846802949905, "learning_rate": 1.595125596547983e-06, "loss": 0.0121, "num_input_tokens_seen": 6852928, "step": 13920 }, { "epoch": 1.8377986010294312, "grad_norm": 0.05398841202259064, "learning_rate": 1.5947553013252572e-06, "loss": 0.0006, "num_input_tokens_seen": 6855424, "step": 13925 }, { "epoch": 1.8384584928071797, "grad_norm": 0.525188684463501, "learning_rate": 1.594384879870026e-06, "loss": 0.0626, "num_input_tokens_seen": 6858048, "step": 13930 }, { "epoch": 1.839118384584928, "grad_norm": 65.14164733886719, "learning_rate": 1.594014332260908e-06, "loss": 0.0881, "num_input_tokens_seen": 6860672, "step": 13935 }, { "epoch": 1.8397782763626767, "grad_norm": 45.95109939575195, "learning_rate": 1.5936436585765493e-06, "loss": 0.0854, "num_input_tokens_seen": 6863104, "step": 13940 }, { "epoch": 1.840438168140425, "grad_norm": 0.05436606705188751, "learning_rate": 1.5932728588956233e-06, "loss": 0.1164, "num_input_tokens_seen": 6865472, "step": 13945 }, { "epoch": 1.8410980599181734, "grad_norm": 0.17238880693912506, "learning_rate": 1.5929019332968285e-06, "loss": 0.0017, "num_input_tokens_seen": 6867840, "step": 13950 }, { "epoch": 1.841757951695922, "grad_norm": 0.043068185448646545, "learning_rate": 1.5925308818588926e-06, "loss": 0.0018, "num_input_tokens_seen": 6870208, "step": 13955 }, { "epoch": 1.8424178434736702, "grad_norm": 0.03494265675544739, "learning_rate": 1.5921597046605672e-06, "loss": 0.055, "num_input_tokens_seen": 6872768, "step": 13960 }, { "epoch": 1.8430777352514187, "grad_norm": 0.020545775070786476, "learning_rate": 1.5917884017806327e-06, "loss": 0.046, "num_input_tokens_seen": 6875328, "step": 13965 }, { "epoch": 1.8437376270291672, "grad_norm": 0.12847785651683807, "learning_rate": 1.5914169732978957e-06, "loss": 0.1396, "num_input_tokens_seen": 6877632, "step": 13970 }, { "epoch": 1.8443975188069157, "grad_norm": 0.058383919298648834, "learning_rate": 1.5910454192911883e-06, "loss": 0.1319, "num_input_tokens_seen": 6879872, "step": 13975 }, { "epoch": 1.8450574105846642, "grad_norm": 0.02930704690515995, "learning_rate": 1.590673739839371e-06, "loss": 0.092, "num_input_tokens_seen": 6882688, "step": 13980 }, { "epoch": 1.8457173023624125, "grad_norm": 0.02760966680943966, "learning_rate": 1.5903019350213293e-06, "loss": 0.0631, "num_input_tokens_seen": 6885248, "step": 13985 }, { "epoch": 1.846377194140161, "grad_norm": 0.5269352197647095, "learning_rate": 1.589930004915977e-06, "loss": 0.0017, "num_input_tokens_seen": 6887552, "step": 13990 }, { "epoch": 1.8470370859179095, "grad_norm": 0.006367943715304136, "learning_rate": 1.5895579496022532e-06, "loss": 0.0009, "num_input_tokens_seen": 6889984, "step": 13995 }, { "epoch": 1.8476969776956578, "grad_norm": 10.508081436157227, "learning_rate": 1.5891857691591235e-06, "loss": 0.3151, "num_input_tokens_seen": 6892480, "step": 14000 }, { "epoch": 1.8483568694734065, "grad_norm": 0.14122456312179565, "learning_rate": 1.588813463665581e-06, "loss": 0.0002, "num_input_tokens_seen": 6894912, "step": 14005 }, { "epoch": 1.8490167612511548, "grad_norm": 0.08993390202522278, "learning_rate": 1.5884410332006443e-06, "loss": 0.1177, "num_input_tokens_seen": 6897408, "step": 14010 }, { "epoch": 1.8496766530289033, "grad_norm": 0.21386578679084778, "learning_rate": 1.58806847784336e-06, "loss": 0.0403, "num_input_tokens_seen": 6899712, "step": 14015 }, { "epoch": 1.8503365448066518, "grad_norm": 81.02230834960938, "learning_rate": 1.5876957976727993e-06, "loss": 0.1096, "num_input_tokens_seen": 6902016, "step": 14020 }, { "epoch": 1.8509964365844, "grad_norm": 17.306926727294922, "learning_rate": 1.5873229927680617e-06, "loss": 0.1389, "num_input_tokens_seen": 6904384, "step": 14025 }, { "epoch": 1.8516563283621488, "grad_norm": 0.20489084720611572, "learning_rate": 1.5869500632082717e-06, "loss": 0.0433, "num_input_tokens_seen": 6906816, "step": 14030 }, { "epoch": 1.852316220139897, "grad_norm": 0.8502187132835388, "learning_rate": 1.586577009072581e-06, "loss": 0.0012, "num_input_tokens_seen": 6909376, "step": 14035 }, { "epoch": 1.8529761119176456, "grad_norm": 0.016757525503635406, "learning_rate": 1.5862038304401682e-06, "loss": 0.0512, "num_input_tokens_seen": 6911680, "step": 14040 }, { "epoch": 1.853636003695394, "grad_norm": 0.11623700708150864, "learning_rate": 1.585830527390237e-06, "loss": 0.0724, "num_input_tokens_seen": 6913920, "step": 14045 }, { "epoch": 1.8542958954731423, "grad_norm": 58.12702560424805, "learning_rate": 1.585457100002019e-06, "loss": 0.189, "num_input_tokens_seen": 6916416, "step": 14050 }, { "epoch": 1.8549557872508908, "grad_norm": 0.018467986956238747, "learning_rate": 1.5850835483547705e-06, "loss": 0.0629, "num_input_tokens_seen": 6918848, "step": 14055 }, { "epoch": 1.8556156790286393, "grad_norm": 66.30414581298828, "learning_rate": 1.5847098725277763e-06, "loss": 0.0666, "num_input_tokens_seen": 6921536, "step": 14060 }, { "epoch": 1.8562755708063876, "grad_norm": 0.01666739024221897, "learning_rate": 1.5843360726003454e-06, "loss": 0.0795, "num_input_tokens_seen": 6924096, "step": 14065 }, { "epoch": 1.8569354625841363, "grad_norm": 0.11734547466039658, "learning_rate": 1.5839621486518144e-06, "loss": 0.0008, "num_input_tokens_seen": 6926400, "step": 14070 }, { "epoch": 1.8575953543618846, "grad_norm": 21.82460594177246, "learning_rate": 1.5835881007615466e-06, "loss": 0.2018, "num_input_tokens_seen": 6928896, "step": 14075 }, { "epoch": 1.858255246139633, "grad_norm": 0.7360246777534485, "learning_rate": 1.5832139290089302e-06, "loss": 0.184, "num_input_tokens_seen": 6931392, "step": 14080 }, { "epoch": 1.8589151379173816, "grad_norm": 2.7356605529785156, "learning_rate": 1.5828396334733807e-06, "loss": 0.0733, "num_input_tokens_seen": 6934080, "step": 14085 }, { "epoch": 1.8595750296951299, "grad_norm": 50.23606491088867, "learning_rate": 1.5824652142343394e-06, "loss": 0.1321, "num_input_tokens_seen": 6936576, "step": 14090 }, { "epoch": 1.8602349214728786, "grad_norm": 21.126819610595703, "learning_rate": 1.582090671371274e-06, "loss": 0.1625, "num_input_tokens_seen": 6938880, "step": 14095 }, { "epoch": 1.860894813250627, "grad_norm": 12.098738670349121, "learning_rate": 1.5817160049636792e-06, "loss": 0.1393, "num_input_tokens_seen": 6941504, "step": 14100 }, { "epoch": 1.8615547050283754, "grad_norm": 67.69693756103516, "learning_rate": 1.5813412150910748e-06, "loss": 0.0703, "num_input_tokens_seen": 6944000, "step": 14105 }, { "epoch": 1.862214596806124, "grad_norm": 0.11709312349557877, "learning_rate": 1.580966301833007e-06, "loss": 0.1415, "num_input_tokens_seen": 6946496, "step": 14110 }, { "epoch": 1.8628744885838722, "grad_norm": 0.06242687627673149, "learning_rate": 1.580591265269049e-06, "loss": 0.0015, "num_input_tokens_seen": 6948864, "step": 14115 }, { "epoch": 1.8635343803616207, "grad_norm": 0.1491411030292511, "learning_rate": 1.580216105478799e-06, "loss": 0.0012, "num_input_tokens_seen": 6951360, "step": 14120 }, { "epoch": 1.8641942721393692, "grad_norm": 0.5130274295806885, "learning_rate": 1.5798408225418825e-06, "loss": 0.0014, "num_input_tokens_seen": 6953856, "step": 14125 }, { "epoch": 1.8648541639171174, "grad_norm": 0.14763778448104858, "learning_rate": 1.57946541653795e-06, "loss": 0.0719, "num_input_tokens_seen": 6956096, "step": 14130 }, { "epoch": 1.8655140556948662, "grad_norm": 0.16149531304836273, "learning_rate": 1.579089887546679e-06, "loss": 0.0571, "num_input_tokens_seen": 6958528, "step": 14135 }, { "epoch": 1.8661739474726144, "grad_norm": 0.08772195875644684, "learning_rate": 1.578714235647773e-06, "loss": 0.1446, "num_input_tokens_seen": 6960704, "step": 14140 }, { "epoch": 1.866833839250363, "grad_norm": 2.0068814754486084, "learning_rate": 1.5783384609209609e-06, "loss": 0.1826, "num_input_tokens_seen": 6963072, "step": 14145 }, { "epoch": 1.8674937310281114, "grad_norm": 0.1501556634902954, "learning_rate": 1.577962563445999e-06, "loss": 0.0619, "num_input_tokens_seen": 6965568, "step": 14150 }, { "epoch": 1.8681536228058597, "grad_norm": 0.060296908020973206, "learning_rate": 1.5775865433026679e-06, "loss": 0.0009, "num_input_tokens_seen": 6967936, "step": 14155 }, { "epoch": 1.8688135145836084, "grad_norm": 0.20123106241226196, "learning_rate": 1.5772104005707756e-06, "loss": 0.0008, "num_input_tokens_seen": 6970432, "step": 14160 }, { "epoch": 1.8694734063613567, "grad_norm": 0.2809836268424988, "learning_rate": 1.5768341353301554e-06, "loss": 0.1136, "num_input_tokens_seen": 6973120, "step": 14165 }, { "epoch": 1.8701332981391052, "grad_norm": 0.12703171372413635, "learning_rate": 1.5764577476606673e-06, "loss": 0.0006, "num_input_tokens_seen": 6975296, "step": 14170 }, { "epoch": 1.8707931899168537, "grad_norm": 29.460742950439453, "learning_rate": 1.5760812376421965e-06, "loss": 0.0891, "num_input_tokens_seen": 6977728, "step": 14175 }, { "epoch": 1.871453081694602, "grad_norm": 0.19353777170181274, "learning_rate": 1.5757046053546547e-06, "loss": 0.1602, "num_input_tokens_seen": 6980160, "step": 14180 }, { "epoch": 1.8721129734723505, "grad_norm": 2.112881660461426, "learning_rate": 1.5753278508779797e-06, "loss": 0.081, "num_input_tokens_seen": 6982720, "step": 14185 }, { "epoch": 1.872772865250099, "grad_norm": 0.19052846729755402, "learning_rate": 1.574950974292134e-06, "loss": 0.2397, "num_input_tokens_seen": 6985344, "step": 14190 }, { "epoch": 1.8734327570278473, "grad_norm": 0.056706760078668594, "learning_rate": 1.5745739756771078e-06, "loss": 0.0029, "num_input_tokens_seen": 6987776, "step": 14195 }, { "epoch": 1.874092648805596, "grad_norm": 0.618067741394043, "learning_rate": 1.574196855112916e-06, "loss": 0.0016, "num_input_tokens_seen": 6990144, "step": 14200 }, { "epoch": 1.8747525405833443, "grad_norm": 0.07428532093763351, "learning_rate": 1.5738196126795998e-06, "loss": 0.0731, "num_input_tokens_seen": 6992832, "step": 14205 }, { "epoch": 1.8754124323610928, "grad_norm": 0.030017530545592308, "learning_rate": 1.5734422484572258e-06, "loss": 0.0601, "num_input_tokens_seen": 6995264, "step": 14210 }, { "epoch": 1.8760723241388413, "grad_norm": 0.0610010139644146, "learning_rate": 1.573064762525887e-06, "loss": 0.0583, "num_input_tokens_seen": 6997504, "step": 14215 }, { "epoch": 1.8767322159165896, "grad_norm": 0.06411426514387131, "learning_rate": 1.5726871549657027e-06, "loss": 0.0743, "num_input_tokens_seen": 6999872, "step": 14220 }, { "epoch": 1.8773921076943383, "grad_norm": 0.03602256253361702, "learning_rate": 1.5723094258568161e-06, "loss": 0.0004, "num_input_tokens_seen": 7002432, "step": 14225 }, { "epoch": 1.8780519994720866, "grad_norm": 13.705681800842285, "learning_rate": 1.571931575279399e-06, "loss": 0.087, "num_input_tokens_seen": 7004544, "step": 14230 }, { "epoch": 1.878711891249835, "grad_norm": 0.06149383634328842, "learning_rate": 1.5715536033136462e-06, "loss": 0.0305, "num_input_tokens_seen": 7006912, "step": 14235 }, { "epoch": 1.8793717830275836, "grad_norm": 33.77265930175781, "learning_rate": 1.5711755100397798e-06, "loss": 0.1398, "num_input_tokens_seen": 7009408, "step": 14240 }, { "epoch": 1.8800316748053318, "grad_norm": 0.04353427141904831, "learning_rate": 1.570797295538048e-06, "loss": 0.088, "num_input_tokens_seen": 7011904, "step": 14245 }, { "epoch": 1.8806915665830803, "grad_norm": 0.11581678688526154, "learning_rate": 1.5704189598887232e-06, "loss": 0.0514, "num_input_tokens_seen": 7014080, "step": 14250 }, { "epoch": 1.8813514583608288, "grad_norm": 0.18477141857147217, "learning_rate": 1.570040503172105e-06, "loss": 0.0358, "num_input_tokens_seen": 7016384, "step": 14255 }, { "epoch": 1.8820113501385771, "grad_norm": 18.699533462524414, "learning_rate": 1.569661925468518e-06, "loss": 0.246, "num_input_tokens_seen": 7018560, "step": 14260 }, { "epoch": 1.8826712419163258, "grad_norm": 1.352349877357483, "learning_rate": 1.5692832268583126e-06, "loss": 0.0017, "num_input_tokens_seen": 7020928, "step": 14265 }, { "epoch": 1.8833311336940741, "grad_norm": 1.0423433780670166, "learning_rate": 1.5689044074218643e-06, "loss": 0.2468, "num_input_tokens_seen": 7023744, "step": 14270 }, { "epoch": 1.8839910254718226, "grad_norm": 0.1379714459180832, "learning_rate": 1.5685254672395753e-06, "loss": 0.0838, "num_input_tokens_seen": 7026240, "step": 14275 }, { "epoch": 1.8846509172495711, "grad_norm": 0.3078247606754303, "learning_rate": 1.568146406391873e-06, "loss": 0.0016, "num_input_tokens_seen": 7028544, "step": 14280 }, { "epoch": 1.8853108090273194, "grad_norm": 0.07992412149906158, "learning_rate": 1.5677672249592101e-06, "loss": 0.0511, "num_input_tokens_seen": 7031040, "step": 14285 }, { "epoch": 1.8859707008050681, "grad_norm": 26.23770523071289, "learning_rate": 1.567387923022065e-06, "loss": 0.1894, "num_input_tokens_seen": 7033216, "step": 14290 }, { "epoch": 1.8866305925828164, "grad_norm": 0.21263808012008667, "learning_rate": 1.567008500660942e-06, "loss": 0.0037, "num_input_tokens_seen": 7035712, "step": 14295 }, { "epoch": 1.887290484360565, "grad_norm": 0.27672702074050903, "learning_rate": 1.5666289579563708e-06, "loss": 0.0039, "num_input_tokens_seen": 7038464, "step": 14300 }, { "epoch": 1.8879503761383134, "grad_norm": 15.613324165344238, "learning_rate": 1.5662492949889065e-06, "loss": 0.142, "num_input_tokens_seen": 7040832, "step": 14305 }, { "epoch": 1.8886102679160617, "grad_norm": 0.06228020787239075, "learning_rate": 1.5658695118391299e-06, "loss": 0.002, "num_input_tokens_seen": 7043456, "step": 14310 }, { "epoch": 1.8892701596938102, "grad_norm": 0.09050249308347702, "learning_rate": 1.5654896085876468e-06, "loss": 0.0934, "num_input_tokens_seen": 7046016, "step": 14315 }, { "epoch": 1.8899300514715587, "grad_norm": 1.3224503993988037, "learning_rate": 1.5651095853150893e-06, "loss": 0.0622, "num_input_tokens_seen": 7048704, "step": 14320 }, { "epoch": 1.890589943249307, "grad_norm": 74.3990478515625, "learning_rate": 1.5647294421021144e-06, "loss": 0.1235, "num_input_tokens_seen": 7050944, "step": 14325 }, { "epoch": 1.8912498350270557, "grad_norm": 0.6878701448440552, "learning_rate": 1.5643491790294054e-06, "loss": 0.0469, "num_input_tokens_seen": 7053312, "step": 14330 }, { "epoch": 1.891909726804804, "grad_norm": 0.16525298357009888, "learning_rate": 1.5639687961776695e-06, "loss": 0.0867, "num_input_tokens_seen": 7055936, "step": 14335 }, { "epoch": 1.8925696185825525, "grad_norm": 0.5637364983558655, "learning_rate": 1.5635882936276403e-06, "loss": 0.0129, "num_input_tokens_seen": 7058688, "step": 14340 }, { "epoch": 1.893229510360301, "grad_norm": 0.1050250306725502, "learning_rate": 1.5632076714600773e-06, "loss": 0.0382, "num_input_tokens_seen": 7061312, "step": 14345 }, { "epoch": 1.8938894021380492, "grad_norm": 0.06865517795085907, "learning_rate": 1.5628269297557644e-06, "loss": 0.0011, "num_input_tokens_seen": 7063808, "step": 14350 }, { "epoch": 1.894549293915798, "grad_norm": 0.3285222053527832, "learning_rate": 1.5624460685955115e-06, "loss": 0.1335, "num_input_tokens_seen": 7066368, "step": 14355 }, { "epoch": 1.8952091856935462, "grad_norm": 1.9790964126586914, "learning_rate": 1.562065088060153e-06, "loss": 0.1012, "num_input_tokens_seen": 7068992, "step": 14360 }, { "epoch": 1.8958690774712947, "grad_norm": 0.08310531079769135, "learning_rate": 1.5616839882305498e-06, "loss": 0.0007, "num_input_tokens_seen": 7071296, "step": 14365 }, { "epoch": 1.8965289692490432, "grad_norm": 0.013942413963377476, "learning_rate": 1.5613027691875877e-06, "loss": 0.0008, "num_input_tokens_seen": 7073536, "step": 14370 }, { "epoch": 1.8971888610267915, "grad_norm": 0.014673193916678429, "learning_rate": 1.5609214310121775e-06, "loss": 0.1195, "num_input_tokens_seen": 7076224, "step": 14375 }, { "epoch": 1.89784875280454, "grad_norm": 0.06911315023899078, "learning_rate": 1.5605399737852554e-06, "loss": 0.0009, "num_input_tokens_seen": 7078464, "step": 14380 }, { "epoch": 1.8985086445822885, "grad_norm": 0.017595898360013962, "learning_rate": 1.560158397587783e-06, "loss": 0.0817, "num_input_tokens_seen": 7081216, "step": 14385 }, { "epoch": 1.8991685363600368, "grad_norm": 0.0426434762775898, "learning_rate": 1.559776702500747e-06, "loss": 0.1318, "num_input_tokens_seen": 7083648, "step": 14390 }, { "epoch": 1.8998284281377855, "grad_norm": 0.020100057125091553, "learning_rate": 1.5593948886051592e-06, "loss": 0.0004, "num_input_tokens_seen": 7085952, "step": 14395 }, { "epoch": 1.9004883199155338, "grad_norm": 0.6564512848854065, "learning_rate": 1.5590129559820575e-06, "loss": 0.1321, "num_input_tokens_seen": 7088640, "step": 14400 }, { "epoch": 1.9011482116932823, "grad_norm": 0.35198983550071716, "learning_rate": 1.5586309047125039e-06, "loss": 0.0028, "num_input_tokens_seen": 7091456, "step": 14405 }, { "epoch": 1.9018081034710308, "grad_norm": 0.14709027111530304, "learning_rate": 1.5582487348775862e-06, "loss": 0.0625, "num_input_tokens_seen": 7093952, "step": 14410 }, { "epoch": 1.902467995248779, "grad_norm": 0.6149783730506897, "learning_rate": 1.5578664465584168e-06, "loss": 0.0348, "num_input_tokens_seen": 7096256, "step": 14415 }, { "epoch": 1.9031278870265278, "grad_norm": 0.035323966294527054, "learning_rate": 1.5574840398361339e-06, "loss": 0.1055, "num_input_tokens_seen": 7098816, "step": 14420 }, { "epoch": 1.903787778804276, "grad_norm": 0.07863358408212662, "learning_rate": 1.5571015147919005e-06, "loss": 0.0709, "num_input_tokens_seen": 7101056, "step": 14425 }, { "epoch": 1.9044476705820246, "grad_norm": 0.08664238452911377, "learning_rate": 1.5567188715069048e-06, "loss": 0.1621, "num_input_tokens_seen": 7103488, "step": 14430 }, { "epoch": 1.905107562359773, "grad_norm": 0.02262006513774395, "learning_rate": 1.5563361100623604e-06, "loss": 0.0179, "num_input_tokens_seen": 7105792, "step": 14435 }, { "epoch": 1.9057674541375214, "grad_norm": 0.024045400321483612, "learning_rate": 1.555953230539505e-06, "loss": 0.0417, "num_input_tokens_seen": 7107968, "step": 14440 }, { "epoch": 1.9064273459152699, "grad_norm": 0.13778825104236603, "learning_rate": 1.5555702330196021e-06, "loss": 0.0521, "num_input_tokens_seen": 7110592, "step": 14445 }, { "epoch": 1.9070872376930184, "grad_norm": 63.8028564453125, "learning_rate": 1.5551871175839406e-06, "loss": 0.2691, "num_input_tokens_seen": 7112704, "step": 14450 }, { "epoch": 1.9077471294707666, "grad_norm": 0.0490015372633934, "learning_rate": 1.5548038843138338e-06, "loss": 0.0585, "num_input_tokens_seen": 7115200, "step": 14455 }, { "epoch": 1.9084070212485154, "grad_norm": 0.03240058943629265, "learning_rate": 1.5544205332906201e-06, "loss": 0.1164, "num_input_tokens_seen": 7117632, "step": 14460 }, { "epoch": 1.9090669130262636, "grad_norm": 0.39769986271858215, "learning_rate": 1.554037064595663e-06, "loss": 0.221, "num_input_tokens_seen": 7120064, "step": 14465 }, { "epoch": 1.9097268048040121, "grad_norm": 0.09407158941030502, "learning_rate": 1.553653478310351e-06, "loss": 0.1352, "num_input_tokens_seen": 7122624, "step": 14470 }, { "epoch": 1.9103866965817606, "grad_norm": 0.3653717041015625, "learning_rate": 1.5532697745160972e-06, "loss": 0.0808, "num_input_tokens_seen": 7124992, "step": 14475 }, { "epoch": 1.911046588359509, "grad_norm": 0.887218713760376, "learning_rate": 1.5528859532943405e-06, "loss": 0.0018, "num_input_tokens_seen": 7127424, "step": 14480 }, { "epoch": 1.9117064801372576, "grad_norm": 0.211788609623909, "learning_rate": 1.552502014726544e-06, "loss": 0.0209, "num_input_tokens_seen": 7129920, "step": 14485 }, { "epoch": 1.912366371915006, "grad_norm": 0.03963814303278923, "learning_rate": 1.5521179588941956e-06, "loss": 0.0735, "num_input_tokens_seen": 7132352, "step": 14490 }, { "epoch": 1.9130262636927544, "grad_norm": 0.1944819688796997, "learning_rate": 1.5517337858788087e-06, "loss": 0.0481, "num_input_tokens_seen": 7134848, "step": 14495 }, { "epoch": 1.913686155470503, "grad_norm": 0.17637357115745544, "learning_rate": 1.551349495761921e-06, "loss": 0.0019, "num_input_tokens_seen": 7137344, "step": 14500 }, { "epoch": 1.9143460472482512, "grad_norm": 0.063203364610672, "learning_rate": 1.550965088625095e-06, "loss": 0.1998, "num_input_tokens_seen": 7139968, "step": 14505 }, { "epoch": 1.9150059390259997, "grad_norm": 0.14308017492294312, "learning_rate": 1.5505805645499192e-06, "loss": 0.0009, "num_input_tokens_seen": 7142464, "step": 14510 }, { "epoch": 1.9156658308037482, "grad_norm": 0.38713952898979187, "learning_rate": 1.5501959236180053e-06, "loss": 0.0447, "num_input_tokens_seen": 7145088, "step": 14515 }, { "epoch": 1.9163257225814965, "grad_norm": 0.03757447376847267, "learning_rate": 1.5498111659109908e-06, "loss": 0.0005, "num_input_tokens_seen": 7147712, "step": 14520 }, { "epoch": 1.9169856143592452, "grad_norm": 14.420360565185547, "learning_rate": 1.549426291510538e-06, "loss": 0.1215, "num_input_tokens_seen": 7150144, "step": 14525 }, { "epoch": 1.9176455061369935, "grad_norm": 15.488569259643555, "learning_rate": 1.5490413004983334e-06, "loss": 0.215, "num_input_tokens_seen": 7152640, "step": 14530 }, { "epoch": 1.918305397914742, "grad_norm": 0.0733942836523056, "learning_rate": 1.5486561929560887e-06, "loss": 0.1271, "num_input_tokens_seen": 7155200, "step": 14535 }, { "epoch": 1.9189652896924905, "grad_norm": 0.07937513291835785, "learning_rate": 1.5482709689655398e-06, "loss": 0.0219, "num_input_tokens_seen": 7157376, "step": 14540 }, { "epoch": 1.9196251814702388, "grad_norm": 0.6346595287322998, "learning_rate": 1.5478856286084483e-06, "loss": 0.0957, "num_input_tokens_seen": 7160000, "step": 14545 }, { "epoch": 1.9202850732479875, "grad_norm": 0.18721289932727814, "learning_rate": 1.5475001719665994e-06, "loss": 0.1187, "num_input_tokens_seen": 7162432, "step": 14550 }, { "epoch": 1.9209449650257358, "grad_norm": 0.06002054363489151, "learning_rate": 1.5471145991218037e-06, "loss": 0.256, "num_input_tokens_seen": 7164736, "step": 14555 }, { "epoch": 1.9216048568034843, "grad_norm": 0.2037287801504135, "learning_rate": 1.5467289101558962e-06, "loss": 0.0013, "num_input_tokens_seen": 7167104, "step": 14560 }, { "epoch": 1.9222647485812328, "grad_norm": 0.06793426722288132, "learning_rate": 1.5463431051507368e-06, "loss": 0.0016, "num_input_tokens_seen": 7169920, "step": 14565 }, { "epoch": 1.922924640358981, "grad_norm": 0.43444526195526123, "learning_rate": 1.5459571841882095e-06, "loss": 0.0322, "num_input_tokens_seen": 7172224, "step": 14570 }, { "epoch": 1.9235845321367295, "grad_norm": 0.02774728089570999, "learning_rate": 1.5455711473502233e-06, "loss": 0.0529, "num_input_tokens_seen": 7174848, "step": 14575 }, { "epoch": 1.924244423914478, "grad_norm": 0.1229424998164177, "learning_rate": 1.5451849947187121e-06, "loss": 0.1527, "num_input_tokens_seen": 7177536, "step": 14580 }, { "epoch": 1.9249043156922263, "grad_norm": 17.570878982543945, "learning_rate": 1.5447987263756335e-06, "loss": 0.1494, "num_input_tokens_seen": 7179712, "step": 14585 }, { "epoch": 1.925564207469975, "grad_norm": 5.2115302085876465, "learning_rate": 1.5444123424029703e-06, "loss": 0.0758, "num_input_tokens_seen": 7182016, "step": 14590 }, { "epoch": 1.9262240992477233, "grad_norm": 0.03287997841835022, "learning_rate": 1.5440258428827298e-06, "loss": 0.0535, "num_input_tokens_seen": 7184640, "step": 14595 }, { "epoch": 1.9268839910254718, "grad_norm": 0.06192353367805481, "learning_rate": 1.5436392278969438e-06, "loss": 0.0964, "num_input_tokens_seen": 7187072, "step": 14600 }, { "epoch": 1.9275438828032203, "grad_norm": 0.10755826532840729, "learning_rate": 1.5432524975276681e-06, "loss": 0.0184, "num_input_tokens_seen": 7189312, "step": 14605 }, { "epoch": 1.9282037745809686, "grad_norm": 0.039796650409698486, "learning_rate": 1.5428656518569838e-06, "loss": 0.0008, "num_input_tokens_seen": 7191680, "step": 14610 }, { "epoch": 1.9288636663587173, "grad_norm": 0.018832655623555183, "learning_rate": 1.5424786909669962e-06, "loss": 0.2284, "num_input_tokens_seen": 7194048, "step": 14615 }, { "epoch": 1.9295235581364656, "grad_norm": 0.12818144261837006, "learning_rate": 1.5420916149398346e-06, "loss": 0.0006, "num_input_tokens_seen": 7196800, "step": 14620 }, { "epoch": 1.930183449914214, "grad_norm": 0.10920975357294083, "learning_rate": 1.5417044238576533e-06, "loss": 0.0009, "num_input_tokens_seen": 7199232, "step": 14625 }, { "epoch": 1.9308433416919626, "grad_norm": 106.05785369873047, "learning_rate": 1.5413171178026308e-06, "loss": 0.1908, "num_input_tokens_seen": 7201792, "step": 14630 }, { "epoch": 1.9315032334697109, "grad_norm": 0.03340896964073181, "learning_rate": 1.5409296968569698e-06, "loss": 0.073, "num_input_tokens_seen": 7204352, "step": 14635 }, { "epoch": 1.9321631252474594, "grad_norm": 0.07122701406478882, "learning_rate": 1.540542161102898e-06, "loss": 0.0874, "num_input_tokens_seen": 7206912, "step": 14640 }, { "epoch": 1.9328230170252079, "grad_norm": 0.20408612489700317, "learning_rate": 1.5401545106226665e-06, "loss": 0.0365, "num_input_tokens_seen": 7209152, "step": 14645 }, { "epoch": 1.9334829088029564, "grad_norm": 17.965118408203125, "learning_rate": 1.539766745498552e-06, "loss": 0.1238, "num_input_tokens_seen": 7211712, "step": 14650 }, { "epoch": 1.9341428005807049, "grad_norm": 89.8632583618164, "learning_rate": 1.5393788658128542e-06, "loss": 0.0643, "num_input_tokens_seen": 7214144, "step": 14655 }, { "epoch": 1.9348026923584531, "grad_norm": 0.12914709746837616, "learning_rate": 1.538990871647898e-06, "loss": 0.0293, "num_input_tokens_seen": 7216768, "step": 14660 }, { "epoch": 1.9354625841362016, "grad_norm": 0.4041579067707062, "learning_rate": 1.5386027630860324e-06, "loss": 0.05, "num_input_tokens_seen": 7219200, "step": 14665 }, { "epoch": 1.9361224759139501, "grad_norm": 0.2001969814300537, "learning_rate": 1.5382145402096307e-06, "loss": 0.05, "num_input_tokens_seen": 7221760, "step": 14670 }, { "epoch": 1.9367823676916984, "grad_norm": 0.018148990347981453, "learning_rate": 1.53782620310109e-06, "loss": 0.0418, "num_input_tokens_seen": 7224192, "step": 14675 }, { "epoch": 1.9374422594694471, "grad_norm": 0.01664876751601696, "learning_rate": 1.5374377518428324e-06, "loss": 0.0005, "num_input_tokens_seen": 7226560, "step": 14680 }, { "epoch": 1.9381021512471954, "grad_norm": 0.11727305501699448, "learning_rate": 1.5370491865173042e-06, "loss": 0.0937, "num_input_tokens_seen": 7229056, "step": 14685 }, { "epoch": 1.938762043024944, "grad_norm": 0.06682264059782028, "learning_rate": 1.5366605072069747e-06, "loss": 0.0438, "num_input_tokens_seen": 7231872, "step": 14690 }, { "epoch": 1.9394219348026924, "grad_norm": 0.08827029913663864, "learning_rate": 1.5362717139943392e-06, "loss": 0.0753, "num_input_tokens_seen": 7234240, "step": 14695 }, { "epoch": 1.9400818265804407, "grad_norm": 0.015324097126722336, "learning_rate": 1.5358828069619155e-06, "loss": 0.2984, "num_input_tokens_seen": 7236800, "step": 14700 }, { "epoch": 1.9407417183581892, "grad_norm": 0.029757771641016006, "learning_rate": 1.5354937861922463e-06, "loss": 0.0489, "num_input_tokens_seen": 7239424, "step": 14705 }, { "epoch": 1.9414016101359377, "grad_norm": 14.723073959350586, "learning_rate": 1.5351046517678989e-06, "loss": 0.2381, "num_input_tokens_seen": 7241920, "step": 14710 }, { "epoch": 1.9420615019136862, "grad_norm": 0.08645767718553543, "learning_rate": 1.534715403771464e-06, "loss": 0.1093, "num_input_tokens_seen": 7244160, "step": 14715 }, { "epoch": 1.9427213936914347, "grad_norm": 0.06573551148176193, "learning_rate": 1.5343260422855573e-06, "loss": 0.0006, "num_input_tokens_seen": 7246592, "step": 14720 }, { "epoch": 1.943381285469183, "grad_norm": 2.026156187057495, "learning_rate": 1.5339365673928168e-06, "loss": 0.0014, "num_input_tokens_seen": 7249280, "step": 14725 }, { "epoch": 1.9440411772469315, "grad_norm": 0.019643815234303474, "learning_rate": 1.5335469791759068e-06, "loss": 0.0014, "num_input_tokens_seen": 7251712, "step": 14730 }, { "epoch": 1.94470106902468, "grad_norm": 0.04115848243236542, "learning_rate": 1.5331572777175137e-06, "loss": 0.0009, "num_input_tokens_seen": 7254144, "step": 14735 }, { "epoch": 1.9453609608024283, "grad_norm": 0.032892487943172455, "learning_rate": 1.5327674631003493e-06, "loss": 0.0006, "num_input_tokens_seen": 7256640, "step": 14740 }, { "epoch": 1.946020852580177, "grad_norm": 29.96086311340332, "learning_rate": 1.5323775354071491e-06, "loss": 0.2108, "num_input_tokens_seen": 7259264, "step": 14745 }, { "epoch": 1.9466807443579253, "grad_norm": 9.626848220825195, "learning_rate": 1.531987494720672e-06, "loss": 0.0767, "num_input_tokens_seen": 7261760, "step": 14750 }, { "epoch": 1.9473406361356738, "grad_norm": 0.31695500016212463, "learning_rate": 1.5315973411237016e-06, "loss": 0.1239, "num_input_tokens_seen": 7264128, "step": 14755 }, { "epoch": 1.9480005279134223, "grad_norm": 0.07309925556182861, "learning_rate": 1.531207074699045e-06, "loss": 0.0343, "num_input_tokens_seen": 7266432, "step": 14760 }, { "epoch": 1.9486604196911705, "grad_norm": 0.44029781222343445, "learning_rate": 1.5308166955295334e-06, "loss": 0.2684, "num_input_tokens_seen": 7268800, "step": 14765 }, { "epoch": 1.949320311468919, "grad_norm": 0.13368448615074158, "learning_rate": 1.5304262036980221e-06, "loss": 0.0569, "num_input_tokens_seen": 7271488, "step": 14770 }, { "epoch": 1.9499802032466675, "grad_norm": 0.3928241431713104, "learning_rate": 1.5300355992873903e-06, "loss": 0.0474, "num_input_tokens_seen": 7273984, "step": 14775 }, { "epoch": 1.950640095024416, "grad_norm": 13.45598316192627, "learning_rate": 1.5296448823805407e-06, "loss": 0.0888, "num_input_tokens_seen": 7276800, "step": 14780 }, { "epoch": 1.9512999868021645, "grad_norm": 0.303527295589447, "learning_rate": 1.5292540530603998e-06, "loss": 0.001, "num_input_tokens_seen": 7279616, "step": 14785 }, { "epoch": 1.9519598785799128, "grad_norm": 0.24898892641067505, "learning_rate": 1.5288631114099193e-06, "loss": 0.0512, "num_input_tokens_seen": 7281856, "step": 14790 }, { "epoch": 1.9526197703576613, "grad_norm": 0.27193912863731384, "learning_rate": 1.528472057512073e-06, "loss": 0.0006, "num_input_tokens_seen": 7284544, "step": 14795 }, { "epoch": 1.9532796621354098, "grad_norm": 0.5412809252738953, "learning_rate": 1.5280808914498593e-06, "loss": 0.0009, "num_input_tokens_seen": 7287040, "step": 14800 }, { "epoch": 1.953939553913158, "grad_norm": 0.12621335685253143, "learning_rate": 1.5276896133063e-06, "loss": 0.0468, "num_input_tokens_seen": 7289856, "step": 14805 }, { "epoch": 1.9545994456909068, "grad_norm": 0.014875116758048534, "learning_rate": 1.5272982231644421e-06, "loss": 0.1041, "num_input_tokens_seen": 7292096, "step": 14810 }, { "epoch": 1.955259337468655, "grad_norm": 164.4103240966797, "learning_rate": 1.5269067211073545e-06, "loss": 0.1564, "num_input_tokens_seen": 7294720, "step": 14815 }, { "epoch": 1.9559192292464036, "grad_norm": 0.0532931424677372, "learning_rate": 1.5265151072181309e-06, "loss": 0.2679, "num_input_tokens_seen": 7297344, "step": 14820 }, { "epoch": 1.956579121024152, "grad_norm": 0.0477750189602375, "learning_rate": 1.5261233815798886e-06, "loss": 0.1056, "num_input_tokens_seen": 7299776, "step": 14825 }, { "epoch": 1.9572390128019004, "grad_norm": 0.3876939117908478, "learning_rate": 1.5257315442757682e-06, "loss": 0.0649, "num_input_tokens_seen": 7302272, "step": 14830 }, { "epoch": 1.957898904579649, "grad_norm": 0.052762262523174286, "learning_rate": 1.5253395953889349e-06, "loss": 0.0355, "num_input_tokens_seen": 7304576, "step": 14835 }, { "epoch": 1.9585587963573974, "grad_norm": 1.0515028238296509, "learning_rate": 1.5249475350025764e-06, "loss": 0.0764, "num_input_tokens_seen": 7307456, "step": 14840 }, { "epoch": 1.9592186881351459, "grad_norm": 0.6155139803886414, "learning_rate": 1.5245553631999054e-06, "loss": 0.0557, "num_input_tokens_seen": 7309824, "step": 14845 }, { "epoch": 1.9598785799128944, "grad_norm": 0.06223485246300697, "learning_rate": 1.5241630800641567e-06, "loss": 0.0007, "num_input_tokens_seen": 7312192, "step": 14850 }, { "epoch": 1.9605384716906427, "grad_norm": 0.1414095163345337, "learning_rate": 1.5237706856785898e-06, "loss": 0.0836, "num_input_tokens_seen": 7314432, "step": 14855 }, { "epoch": 1.9611983634683912, "grad_norm": 0.06543510407209396, "learning_rate": 1.523378180126488e-06, "loss": 0.0521, "num_input_tokens_seen": 7316928, "step": 14860 }, { "epoch": 1.9618582552461397, "grad_norm": 0.09722074866294861, "learning_rate": 1.5229855634911575e-06, "loss": 0.0006, "num_input_tokens_seen": 7319616, "step": 14865 }, { "epoch": 1.962518147023888, "grad_norm": 0.11939533799886703, "learning_rate": 1.5225928358559283e-06, "loss": 0.002, "num_input_tokens_seen": 7321856, "step": 14870 }, { "epoch": 1.9631780388016367, "grad_norm": 0.03377383202314377, "learning_rate": 1.522199997304154e-06, "loss": 0.083, "num_input_tokens_seen": 7324224, "step": 14875 }, { "epoch": 1.963837930579385, "grad_norm": 7.487933158874512, "learning_rate": 1.5218070479192118e-06, "loss": 0.047, "num_input_tokens_seen": 7326656, "step": 14880 }, { "epoch": 1.9644978223571334, "grad_norm": 53.403724670410156, "learning_rate": 1.521413987784502e-06, "loss": 0.3865, "num_input_tokens_seen": 7329280, "step": 14885 }, { "epoch": 1.965157714134882, "grad_norm": 0.09187313914299011, "learning_rate": 1.5210208169834496e-06, "loss": 0.1041, "num_input_tokens_seen": 7331648, "step": 14890 }, { "epoch": 1.9658176059126302, "grad_norm": 0.21420453488826752, "learning_rate": 1.5206275355995013e-06, "loss": 0.0013, "num_input_tokens_seen": 7334144, "step": 14895 }, { "epoch": 1.966477497690379, "grad_norm": 0.47146058082580566, "learning_rate": 1.5202341437161288e-06, "loss": 0.0005, "num_input_tokens_seen": 7336640, "step": 14900 }, { "epoch": 1.9671373894681272, "grad_norm": 0.05956108495593071, "learning_rate": 1.5198406414168266e-06, "loss": 0.0005, "num_input_tokens_seen": 7339072, "step": 14905 }, { "epoch": 1.9677972812458757, "grad_norm": 0.03283373638987541, "learning_rate": 1.5194470287851124e-06, "loss": 0.0003, "num_input_tokens_seen": 7341504, "step": 14910 }, { "epoch": 1.9684571730236242, "grad_norm": 0.0706038549542427, "learning_rate": 1.5190533059045284e-06, "loss": 0.149, "num_input_tokens_seen": 7343936, "step": 14915 }, { "epoch": 1.9691170648013725, "grad_norm": 0.029689691960811615, "learning_rate": 1.5186594728586383e-06, "loss": 0.1659, "num_input_tokens_seen": 7346368, "step": 14920 }, { "epoch": 1.969776956579121, "grad_norm": 3.8806729316711426, "learning_rate": 1.518265529731031e-06, "loss": 0.0452, "num_input_tokens_seen": 7348800, "step": 14925 }, { "epoch": 1.9704368483568695, "grad_norm": 0.1849229484796524, "learning_rate": 1.5178714766053185e-06, "loss": 0.1882, "num_input_tokens_seen": 7350848, "step": 14930 }, { "epoch": 1.9710967401346178, "grad_norm": 41.28623580932617, "learning_rate": 1.5174773135651347e-06, "loss": 0.1838, "num_input_tokens_seen": 7353344, "step": 14935 }, { "epoch": 1.9717566319123665, "grad_norm": 0.12453329563140869, "learning_rate": 1.5170830406941386e-06, "loss": 0.0747, "num_input_tokens_seen": 7355776, "step": 14940 }, { "epoch": 1.9724165236901148, "grad_norm": 0.02459733560681343, "learning_rate": 1.5166886580760114e-06, "loss": 0.003, "num_input_tokens_seen": 7358720, "step": 14945 }, { "epoch": 1.9730764154678633, "grad_norm": 0.03575903922319412, "learning_rate": 1.5162941657944584e-06, "loss": 0.0494, "num_input_tokens_seen": 7361216, "step": 14950 }, { "epoch": 1.9737363072456118, "grad_norm": 10.720014572143555, "learning_rate": 1.5158995639332073e-06, "loss": 0.0556, "num_input_tokens_seen": 7363904, "step": 14955 }, { "epoch": 1.97439619902336, "grad_norm": 0.049669649451971054, "learning_rate": 1.5155048525760095e-06, "loss": 0.1777, "num_input_tokens_seen": 7366400, "step": 14960 }, { "epoch": 1.9750560908011088, "grad_norm": 11.927507400512695, "learning_rate": 1.5151100318066396e-06, "loss": 0.0443, "num_input_tokens_seen": 7368896, "step": 14965 }, { "epoch": 1.975715982578857, "grad_norm": 14.424137115478516, "learning_rate": 1.5147151017088958e-06, "loss": 0.1217, "num_input_tokens_seen": 7371392, "step": 14970 }, { "epoch": 1.9763758743566056, "grad_norm": 0.06754660606384277, "learning_rate": 1.514320062366599e-06, "loss": 0.0033, "num_input_tokens_seen": 7373888, "step": 14975 }, { "epoch": 1.977035766134354, "grad_norm": 0.05533408746123314, "learning_rate": 1.513924913863593e-06, "loss": 0.0015, "num_input_tokens_seen": 7376448, "step": 14980 }, { "epoch": 1.9776956579121023, "grad_norm": 69.94572448730469, "learning_rate": 1.513529656283746e-06, "loss": 0.1575, "num_input_tokens_seen": 7378624, "step": 14985 }, { "epoch": 1.9783555496898508, "grad_norm": 0.45651572942733765, "learning_rate": 1.513134289710948e-06, "loss": 0.0303, "num_input_tokens_seen": 7381184, "step": 14990 }, { "epoch": 1.9790154414675993, "grad_norm": 0.38808608055114746, "learning_rate": 1.5127388142291126e-06, "loss": 0.0846, "num_input_tokens_seen": 7383488, "step": 14995 }, { "epoch": 1.9796753332453476, "grad_norm": 0.08070173114538193, "learning_rate": 1.5123432299221772e-06, "loss": 0.0485, "num_input_tokens_seen": 7385792, "step": 15000 }, { "epoch": 1.9803352250230963, "grad_norm": 0.26309722661972046, "learning_rate": 1.5119475368741013e-06, "loss": 0.0013, "num_input_tokens_seen": 7388416, "step": 15005 }, { "epoch": 1.9809951168008446, "grad_norm": 48.13701629638672, "learning_rate": 1.5115517351688679e-06, "loss": 0.0646, "num_input_tokens_seen": 7390720, "step": 15010 }, { "epoch": 1.9816550085785931, "grad_norm": 0.3674798905849457, "learning_rate": 1.5111558248904829e-06, "loss": 0.0839, "num_input_tokens_seen": 7393152, "step": 15015 }, { "epoch": 1.9823149003563416, "grad_norm": 24.177644729614258, "learning_rate": 1.5107598061229755e-06, "loss": 0.1332, "num_input_tokens_seen": 7395328, "step": 15020 }, { "epoch": 1.98297479213409, "grad_norm": 10.63642406463623, "learning_rate": 1.510363678950398e-06, "loss": 0.1803, "num_input_tokens_seen": 7397696, "step": 15025 }, { "epoch": 1.9836346839118386, "grad_norm": 0.4671352505683899, "learning_rate": 1.509967443456826e-06, "loss": 0.0009, "num_input_tokens_seen": 7399936, "step": 15030 }, { "epoch": 1.984294575689587, "grad_norm": 0.18537390232086182, "learning_rate": 1.5095710997263562e-06, "loss": 0.0032, "num_input_tokens_seen": 7402048, "step": 15035 }, { "epoch": 1.9849544674673354, "grad_norm": 21.04948616027832, "learning_rate": 1.509174647843111e-06, "loss": 0.0739, "num_input_tokens_seen": 7404288, "step": 15040 }, { "epoch": 1.985614359245084, "grad_norm": 0.08610837906599045, "learning_rate": 1.5087780878912335e-06, "loss": 0.0578, "num_input_tokens_seen": 7406528, "step": 15045 }, { "epoch": 1.9862742510228322, "grad_norm": 0.06923052668571472, "learning_rate": 1.5083814199548912e-06, "loss": 0.0584, "num_input_tokens_seen": 7408832, "step": 15050 }, { "epoch": 1.9869341428005807, "grad_norm": 0.07516025751829147, "learning_rate": 1.5079846441182744e-06, "loss": 0.144, "num_input_tokens_seen": 7411584, "step": 15055 }, { "epoch": 1.9875940345783292, "grad_norm": 0.07054906338453293, "learning_rate": 1.5075877604655948e-06, "loss": 0.0525, "num_input_tokens_seen": 7414080, "step": 15060 }, { "epoch": 1.9882539263560775, "grad_norm": 0.025323480367660522, "learning_rate": 1.5071907690810892e-06, "loss": 0.0009, "num_input_tokens_seen": 7416384, "step": 15065 }, { "epoch": 1.9889138181338262, "grad_norm": 0.31888553500175476, "learning_rate": 1.5067936700490153e-06, "loss": 0.1545, "num_input_tokens_seen": 7418752, "step": 15070 }, { "epoch": 1.9895737099115745, "grad_norm": 0.3737231492996216, "learning_rate": 1.5063964634536553e-06, "loss": 0.0006, "num_input_tokens_seen": 7420992, "step": 15075 }, { "epoch": 1.990233601689323, "grad_norm": 17.460901260375977, "learning_rate": 1.5059991493793124e-06, "loss": 0.2178, "num_input_tokens_seen": 7423488, "step": 15080 }, { "epoch": 1.9908934934670715, "grad_norm": 44.136966705322266, "learning_rate": 1.5056017279103146e-06, "loss": 0.095, "num_input_tokens_seen": 7425920, "step": 15085 }, { "epoch": 1.9915533852448197, "grad_norm": 0.05283686891198158, "learning_rate": 1.505204199131011e-06, "loss": 0.0005, "num_input_tokens_seen": 7428608, "step": 15090 }, { "epoch": 1.9922132770225685, "grad_norm": 0.030043108388781548, "learning_rate": 1.5048065631257748e-06, "loss": 0.045, "num_input_tokens_seen": 7431168, "step": 15095 }, { "epoch": 1.9928731688003167, "grad_norm": 0.2686857283115387, "learning_rate": 1.5044088199790012e-06, "loss": 0.0011, "num_input_tokens_seen": 7433728, "step": 15100 }, { "epoch": 1.9935330605780652, "grad_norm": 38.80851745605469, "learning_rate": 1.5040109697751082e-06, "loss": 0.1167, "num_input_tokens_seen": 7436352, "step": 15105 }, { "epoch": 1.9941929523558137, "grad_norm": 0.10195082426071167, "learning_rate": 1.5036130125985364e-06, "loss": 0.1322, "num_input_tokens_seen": 7438784, "step": 15110 }, { "epoch": 1.994852844133562, "grad_norm": 0.02739228866994381, "learning_rate": 1.5032149485337494e-06, "loss": 0.0004, "num_input_tokens_seen": 7441152, "step": 15115 }, { "epoch": 1.9955127359113105, "grad_norm": 0.0074937110766768456, "learning_rate": 1.5028167776652339e-06, "loss": 0.1143, "num_input_tokens_seen": 7443456, "step": 15120 }, { "epoch": 1.996172627689059, "grad_norm": 0.4618469476699829, "learning_rate": 1.5024185000774984e-06, "loss": 0.0071, "num_input_tokens_seen": 7446016, "step": 15125 }, { "epoch": 1.9968325194668073, "grad_norm": 0.3459224998950958, "learning_rate": 1.5020201158550745e-06, "loss": 0.172, "num_input_tokens_seen": 7448448, "step": 15130 }, { "epoch": 1.997492411244556, "grad_norm": 0.6738158464431763, "learning_rate": 1.5016216250825164e-06, "loss": 0.1166, "num_input_tokens_seen": 7451008, "step": 15135 }, { "epoch": 1.9981523030223043, "grad_norm": 0.19503681361675262, "learning_rate": 1.5012230278444005e-06, "loss": 0.0687, "num_input_tokens_seen": 7453504, "step": 15140 }, { "epoch": 1.9988121948000528, "grad_norm": 0.1917676031589508, "learning_rate": 1.5008243242253269e-06, "loss": 0.0024, "num_input_tokens_seen": 7456000, "step": 15145 }, { "epoch": 1.9994720865778013, "grad_norm": 0.06810544431209564, "learning_rate": 1.5004255143099167e-06, "loss": 0.0027, "num_input_tokens_seen": 7458496, "step": 15150 }, { "epoch": 2.0001319783555496, "grad_norm": 0.10668352246284485, "learning_rate": 1.5000265981828153e-06, "loss": 0.0349, "num_input_tokens_seen": 7460784, "step": 15155 }, { "epoch": 2.0007918701332983, "grad_norm": 0.04501849785447121, "learning_rate": 1.4996275759286894e-06, "loss": 0.0004, "num_input_tokens_seen": 7463024, "step": 15160 }, { "epoch": 2.0007918701332983, "eval_loss": 0.1182011216878891, "eval_runtime": 7.8443, "eval_samples_per_second": 858.587, "eval_steps_per_second": 107.339, "num_input_tokens_seen": 7463024, "step": 15160 }, { "epoch": 2.0014517619110466, "grad_norm": 0.7536020278930664, "learning_rate": 1.4992284476322283e-06, "loss": 0.0012, "num_input_tokens_seen": 7465456, "step": 15165 }, { "epoch": 2.002111653688795, "grad_norm": 0.7476214170455933, "learning_rate": 1.4988292133781445e-06, "loss": 0.0007, "num_input_tokens_seen": 7467632, "step": 15170 }, { "epoch": 2.0027715454665436, "grad_norm": 0.02698984183371067, "learning_rate": 1.498429873251172e-06, "loss": 0.0406, "num_input_tokens_seen": 7470192, "step": 15175 }, { "epoch": 2.003431437244292, "grad_norm": 27.516366958618164, "learning_rate": 1.4980304273360686e-06, "loss": 0.1068, "num_input_tokens_seen": 7472432, "step": 15180 }, { "epoch": 2.0040913290220406, "grad_norm": 0.051883816719055176, "learning_rate": 1.4976308757176135e-06, "loss": 0.0002, "num_input_tokens_seen": 7474928, "step": 15185 }, { "epoch": 2.004751220799789, "grad_norm": 0.022894321009516716, "learning_rate": 1.4972312184806084e-06, "loss": 0.0004, "num_input_tokens_seen": 7477296, "step": 15190 }, { "epoch": 2.005411112577537, "grad_norm": 43.75878143310547, "learning_rate": 1.496831455709878e-06, "loss": 0.2538, "num_input_tokens_seen": 7479728, "step": 15195 }, { "epoch": 2.006071004355286, "grad_norm": 0.021939776837825775, "learning_rate": 1.4964315874902687e-06, "loss": 0.0003, "num_input_tokens_seen": 7482096, "step": 15200 }, { "epoch": 2.006730896133034, "grad_norm": 0.03236817568540573, "learning_rate": 1.49603161390665e-06, "loss": 0.0712, "num_input_tokens_seen": 7484400, "step": 15205 }, { "epoch": 2.007390787910783, "grad_norm": 0.0764591321349144, "learning_rate": 1.495631535043913e-06, "loss": 0.0005, "num_input_tokens_seen": 7486704, "step": 15210 }, { "epoch": 2.008050679688531, "grad_norm": 0.0899280309677124, "learning_rate": 1.4952313509869722e-06, "loss": 0.0001, "num_input_tokens_seen": 7489200, "step": 15215 }, { "epoch": 2.0087105714662794, "grad_norm": 0.0050004273653030396, "learning_rate": 1.4948310618207628e-06, "loss": 0.0876, "num_input_tokens_seen": 7491504, "step": 15220 }, { "epoch": 2.009370463244028, "grad_norm": 0.04471347853541374, "learning_rate": 1.4944306676302442e-06, "loss": 0.0002, "num_input_tokens_seen": 7493872, "step": 15225 }, { "epoch": 2.0100303550217764, "grad_norm": 0.5096481442451477, "learning_rate": 1.4940301685003967e-06, "loss": 0.0005, "num_input_tokens_seen": 7496176, "step": 15230 }, { "epoch": 2.0106902467995247, "grad_norm": 15.463436126708984, "learning_rate": 1.4936295645162232e-06, "loss": 0.0935, "num_input_tokens_seen": 7498864, "step": 15235 }, { "epoch": 2.0113501385772734, "grad_norm": 0.29590997099876404, "learning_rate": 1.4932288557627497e-06, "loss": 0.0003, "num_input_tokens_seen": 7501296, "step": 15240 }, { "epoch": 2.0120100303550217, "grad_norm": 0.08997692912817001, "learning_rate": 1.4928280423250228e-06, "loss": 0.0768, "num_input_tokens_seen": 7503856, "step": 15245 }, { "epoch": 2.0126699221327704, "grad_norm": 0.03307807072997093, "learning_rate": 1.4924271242881128e-06, "loss": 0.0711, "num_input_tokens_seen": 7506352, "step": 15250 }, { "epoch": 2.0133298139105187, "grad_norm": 0.025954531505703926, "learning_rate": 1.4920261017371116e-06, "loss": 0.0002, "num_input_tokens_seen": 7508912, "step": 15255 }, { "epoch": 2.013989705688267, "grad_norm": 0.005450637079775333, "learning_rate": 1.4916249747571333e-06, "loss": 0.0002, "num_input_tokens_seen": 7511280, "step": 15260 }, { "epoch": 2.0146495974660157, "grad_norm": 23.70391082763672, "learning_rate": 1.4912237434333142e-06, "loss": 0.0789, "num_input_tokens_seen": 7513904, "step": 15265 }, { "epoch": 2.015309489243764, "grad_norm": 2.1816422939300537, "learning_rate": 1.4908224078508125e-06, "loss": 0.0778, "num_input_tokens_seen": 7516720, "step": 15270 }, { "epoch": 2.0159693810215127, "grad_norm": 18.413654327392578, "learning_rate": 1.4904209680948092e-06, "loss": 0.0883, "num_input_tokens_seen": 7519152, "step": 15275 }, { "epoch": 2.016629272799261, "grad_norm": 0.026565132662653923, "learning_rate": 1.4900194242505067e-06, "loss": 0.0014, "num_input_tokens_seen": 7521648, "step": 15280 }, { "epoch": 2.0172891645770092, "grad_norm": 0.35007259249687195, "learning_rate": 1.48961777640313e-06, "loss": 0.0005, "num_input_tokens_seen": 7524144, "step": 15285 }, { "epoch": 2.017949056354758, "grad_norm": 0.0682162195444107, "learning_rate": 1.4892160246379257e-06, "loss": 0.0538, "num_input_tokens_seen": 7526704, "step": 15290 }, { "epoch": 2.0186089481325062, "grad_norm": 0.1767728328704834, "learning_rate": 1.4888141690401628e-06, "loss": 0.0006, "num_input_tokens_seen": 7529264, "step": 15295 }, { "epoch": 2.0192688399102545, "grad_norm": 0.0036514129023998976, "learning_rate": 1.488412209695132e-06, "loss": 0.0493, "num_input_tokens_seen": 7531696, "step": 15300 }, { "epoch": 2.0199287316880032, "grad_norm": 0.406444251537323, "learning_rate": 1.4880101466881464e-06, "loss": 0.0108, "num_input_tokens_seen": 7533872, "step": 15305 }, { "epoch": 2.0205886234657515, "grad_norm": 0.08499430119991302, "learning_rate": 1.4876079801045418e-06, "loss": 0.0706, "num_input_tokens_seen": 7535920, "step": 15310 }, { "epoch": 2.0212485152435002, "grad_norm": 0.881211519241333, "learning_rate": 1.4872057100296738e-06, "loss": 0.0015, "num_input_tokens_seen": 7538288, "step": 15315 }, { "epoch": 2.0219084070212485, "grad_norm": 0.021755851805210114, "learning_rate": 1.4868033365489222e-06, "loss": 0.0009, "num_input_tokens_seen": 7540528, "step": 15320 }, { "epoch": 2.022568298798997, "grad_norm": 0.00886019691824913, "learning_rate": 1.4864008597476873e-06, "loss": 0.0521, "num_input_tokens_seen": 7543024, "step": 15325 }, { "epoch": 2.0232281905767455, "grad_norm": 0.017470117658376694, "learning_rate": 1.4859982797113923e-06, "loss": 0.0005, "num_input_tokens_seen": 7545392, "step": 15330 }, { "epoch": 2.023888082354494, "grad_norm": 0.0028445960488170385, "learning_rate": 1.4855955965254816e-06, "loss": 0.0533, "num_input_tokens_seen": 7547632, "step": 15335 }, { "epoch": 2.0245479741322425, "grad_norm": 0.0514727346599102, "learning_rate": 1.485192810275422e-06, "loss": 0.0006, "num_input_tokens_seen": 7550192, "step": 15340 }, { "epoch": 2.025207865909991, "grad_norm": 0.002774176187813282, "learning_rate": 1.4847899210467021e-06, "loss": 0.096, "num_input_tokens_seen": 7552688, "step": 15345 }, { "epoch": 2.025867757687739, "grad_norm": 0.005656952038407326, "learning_rate": 1.4843869289248318e-06, "loss": 0.1187, "num_input_tokens_seen": 7555312, "step": 15350 }, { "epoch": 2.026527649465488, "grad_norm": 0.0056389025412499905, "learning_rate": 1.483983833995344e-06, "loss": 0.0001, "num_input_tokens_seen": 7558128, "step": 15355 }, { "epoch": 2.027187541243236, "grad_norm": 0.0857715830206871, "learning_rate": 1.4835806363437915e-06, "loss": 0.0011, "num_input_tokens_seen": 7560368, "step": 15360 }, { "epoch": 2.0278474330209844, "grad_norm": 0.013230415992438793, "learning_rate": 1.4831773360557513e-06, "loss": 0.0002, "num_input_tokens_seen": 7562736, "step": 15365 }, { "epoch": 2.028507324798733, "grad_norm": 0.6576191186904907, "learning_rate": 1.4827739332168201e-06, "loss": 0.0009, "num_input_tokens_seen": 7565424, "step": 15370 }, { "epoch": 2.0291672165764814, "grad_norm": 0.29561471939086914, "learning_rate": 1.4823704279126172e-06, "loss": 0.0006, "num_input_tokens_seen": 7567856, "step": 15375 }, { "epoch": 2.02982710835423, "grad_norm": 17.580387115478516, "learning_rate": 1.4819668202287847e-06, "loss": 0.0433, "num_input_tokens_seen": 7570352, "step": 15380 }, { "epoch": 2.0304870001319784, "grad_norm": 2.213040590286255, "learning_rate": 1.4815631102509843e-06, "loss": 0.0006, "num_input_tokens_seen": 7572720, "step": 15385 }, { "epoch": 2.0311468919097266, "grad_norm": 0.0046477969735860825, "learning_rate": 1.4811592980649014e-06, "loss": 0.0006, "num_input_tokens_seen": 7574960, "step": 15390 }, { "epoch": 2.0318067836874754, "grad_norm": 0.029864639043807983, "learning_rate": 1.4807553837562415e-06, "loss": 0.0002, "num_input_tokens_seen": 7577584, "step": 15395 }, { "epoch": 2.0324666754652236, "grad_norm": 0.004414161667227745, "learning_rate": 1.4803513674107325e-06, "loss": 0.0, "num_input_tokens_seen": 7580016, "step": 15400 }, { "epoch": 2.0331265672429724, "grad_norm": 0.002472514286637306, "learning_rate": 1.4799472491141245e-06, "loss": 0.0846, "num_input_tokens_seen": 7582320, "step": 15405 }, { "epoch": 2.0337864590207206, "grad_norm": 23.93593978881836, "learning_rate": 1.4795430289521885e-06, "loss": 0.0432, "num_input_tokens_seen": 7585008, "step": 15410 }, { "epoch": 2.034446350798469, "grad_norm": 0.0009444769821129739, "learning_rate": 1.479138707010717e-06, "loss": 0.0021, "num_input_tokens_seen": 7587760, "step": 15415 }, { "epoch": 2.0351062425762176, "grad_norm": 0.605518639087677, "learning_rate": 1.4787342833755245e-06, "loss": 0.0004, "num_input_tokens_seen": 7590256, "step": 15420 }, { "epoch": 2.035766134353966, "grad_norm": 0.004898503888398409, "learning_rate": 1.4783297581324472e-06, "loss": 0.0977, "num_input_tokens_seen": 7592688, "step": 15425 }, { "epoch": 2.036426026131714, "grad_norm": 0.0124160535633564, "learning_rate": 1.4779251313673422e-06, "loss": 0.0257, "num_input_tokens_seen": 7595120, "step": 15430 }, { "epoch": 2.037085917909463, "grad_norm": 0.2338225543498993, "learning_rate": 1.4775204031660894e-06, "loss": 0.0004, "num_input_tokens_seen": 7597680, "step": 15435 }, { "epoch": 2.037745809687211, "grad_norm": 0.0083228200674057, "learning_rate": 1.4771155736145888e-06, "loss": 0.0001, "num_input_tokens_seen": 7600304, "step": 15440 }, { "epoch": 2.03840570146496, "grad_norm": 0.0006205007084645331, "learning_rate": 1.4767106427987625e-06, "loss": 0.001, "num_input_tokens_seen": 7602800, "step": 15445 }, { "epoch": 2.039065593242708, "grad_norm": 0.013593539595603943, "learning_rate": 1.4763056108045549e-06, "loss": 0.0798, "num_input_tokens_seen": 7605040, "step": 15450 }, { "epoch": 2.0397254850204565, "grad_norm": 0.0008863316616043448, "learning_rate": 1.4759004777179297e-06, "loss": 0.0007, "num_input_tokens_seen": 7607408, "step": 15455 }, { "epoch": 2.040385376798205, "grad_norm": 0.010385457426309586, "learning_rate": 1.475495243624875e-06, "loss": 0.0612, "num_input_tokens_seen": 7609904, "step": 15460 }, { "epoch": 2.0410452685759535, "grad_norm": 0.3944193124771118, "learning_rate": 1.475089908611398e-06, "loss": 0.0879, "num_input_tokens_seen": 7612592, "step": 15465 }, { "epoch": 2.041705160353702, "grad_norm": 0.006912830751389265, "learning_rate": 1.4746844727635282e-06, "loss": 0.0002, "num_input_tokens_seen": 7615152, "step": 15470 }, { "epoch": 2.0423650521314505, "grad_norm": 15.692147254943848, "learning_rate": 1.474278936167316e-06, "loss": 0.1432, "num_input_tokens_seen": 7617584, "step": 15475 }, { "epoch": 2.0430249439091988, "grad_norm": 0.014512135647237301, "learning_rate": 1.4738732989088347e-06, "loss": 0.0, "num_input_tokens_seen": 7620208, "step": 15480 }, { "epoch": 2.0436848356869475, "grad_norm": 0.029907351359725, "learning_rate": 1.4734675610741767e-06, "loss": 0.0001, "num_input_tokens_seen": 7622896, "step": 15485 }, { "epoch": 2.0443447274646958, "grad_norm": 0.0069085415452718735, "learning_rate": 1.4730617227494577e-06, "loss": 0.0506, "num_input_tokens_seen": 7625328, "step": 15490 }, { "epoch": 2.045004619242444, "grad_norm": 0.008318849839270115, "learning_rate": 1.4726557840208137e-06, "loss": 0.0549, "num_input_tokens_seen": 7628208, "step": 15495 }, { "epoch": 2.0456645110201928, "grad_norm": 0.05175945535302162, "learning_rate": 1.4722497449744022e-06, "loss": 0.0008, "num_input_tokens_seen": 7630640, "step": 15500 }, { "epoch": 2.046324402797941, "grad_norm": 0.2361360341310501, "learning_rate": 1.471843605696402e-06, "loss": 0.0009, "num_input_tokens_seen": 7633072, "step": 15505 }, { "epoch": 2.0469842945756898, "grad_norm": 0.014276132918894291, "learning_rate": 1.4714373662730136e-06, "loss": 0.0002, "num_input_tokens_seen": 7635568, "step": 15510 }, { "epoch": 2.047644186353438, "grad_norm": 0.016513869166374207, "learning_rate": 1.4710310267904578e-06, "loss": 0.0002, "num_input_tokens_seen": 7638320, "step": 15515 }, { "epoch": 2.0483040781311863, "grad_norm": 0.007878591306507587, "learning_rate": 1.4706245873349777e-06, "loss": 0.0627, "num_input_tokens_seen": 7640496, "step": 15520 }, { "epoch": 2.048963969908935, "grad_norm": 0.029861019924283028, "learning_rate": 1.4702180479928368e-06, "loss": 0.0002, "num_input_tokens_seen": 7642736, "step": 15525 }, { "epoch": 2.0496238616866833, "grad_norm": 0.049566447734832764, "learning_rate": 1.4698114088503203e-06, "loss": 0.0519, "num_input_tokens_seen": 7645104, "step": 15530 }, { "epoch": 2.050283753464432, "grad_norm": 0.00440286984667182, "learning_rate": 1.4694046699937341e-06, "loss": 0.0002, "num_input_tokens_seen": 7647472, "step": 15535 }, { "epoch": 2.0509436452421803, "grad_norm": 3.612928628921509, "learning_rate": 1.4689978315094066e-06, "loss": 0.0015, "num_input_tokens_seen": 7649968, "step": 15540 }, { "epoch": 2.0516035370199286, "grad_norm": 10.197099685668945, "learning_rate": 1.468590893483685e-06, "loss": 0.0712, "num_input_tokens_seen": 7652592, "step": 15545 }, { "epoch": 2.0522634287976773, "grad_norm": 0.053775690495967865, "learning_rate": 1.4681838560029395e-06, "loss": 0.0006, "num_input_tokens_seen": 7655216, "step": 15550 }, { "epoch": 2.0529233205754256, "grad_norm": 0.4073657691478729, "learning_rate": 1.467776719153561e-06, "loss": 0.0006, "num_input_tokens_seen": 7657840, "step": 15555 }, { "epoch": 2.053583212353174, "grad_norm": 15.94839096069336, "learning_rate": 1.4673694830219613e-06, "loss": 0.0458, "num_input_tokens_seen": 7660144, "step": 15560 }, { "epoch": 2.0542431041309226, "grad_norm": 0.15345242619514465, "learning_rate": 1.466962147694573e-06, "loss": 0.0001, "num_input_tokens_seen": 7662704, "step": 15565 }, { "epoch": 2.054902995908671, "grad_norm": 16.386497497558594, "learning_rate": 1.4665547132578508e-06, "loss": 0.093, "num_input_tokens_seen": 7665200, "step": 15570 }, { "epoch": 2.0555628876864196, "grad_norm": 0.06717591732740402, "learning_rate": 1.466147179798269e-06, "loss": 0.0002, "num_input_tokens_seen": 7667376, "step": 15575 }, { "epoch": 2.056222779464168, "grad_norm": 0.16586610674858093, "learning_rate": 1.4657395474023237e-06, "loss": 0.0003, "num_input_tokens_seen": 7669680, "step": 15580 }, { "epoch": 2.056882671241916, "grad_norm": 0.011295896023511887, "learning_rate": 1.4653318161565325e-06, "loss": 0.0004, "num_input_tokens_seen": 7672176, "step": 15585 }, { "epoch": 2.057542563019665, "grad_norm": 0.001465719542466104, "learning_rate": 1.4649239861474324e-06, "loss": 0.0523, "num_input_tokens_seen": 7674800, "step": 15590 }, { "epoch": 2.058202454797413, "grad_norm": 0.019439391791820526, "learning_rate": 1.4645160574615834e-06, "loss": 0.1052, "num_input_tokens_seen": 7677232, "step": 15595 }, { "epoch": 2.058862346575162, "grad_norm": 0.21349266171455383, "learning_rate": 1.4641080301855648e-06, "loss": 0.0003, "num_input_tokens_seen": 7679536, "step": 15600 }, { "epoch": 2.05952223835291, "grad_norm": 0.012627423740923405, "learning_rate": 1.4636999044059777e-06, "loss": 0.0002, "num_input_tokens_seen": 7681840, "step": 15605 }, { "epoch": 2.0601821301306584, "grad_norm": 31.638559341430664, "learning_rate": 1.4632916802094436e-06, "loss": 0.1445, "num_input_tokens_seen": 7684528, "step": 15610 }, { "epoch": 2.060842021908407, "grad_norm": 0.046404916793107986, "learning_rate": 1.462883357682605e-06, "loss": 0.0006, "num_input_tokens_seen": 7687152, "step": 15615 }, { "epoch": 2.0615019136861554, "grad_norm": 0.07641780376434326, "learning_rate": 1.4624749369121265e-06, "loss": 0.1174, "num_input_tokens_seen": 7689712, "step": 15620 }, { "epoch": 2.0621618054639037, "grad_norm": 0.0200422964990139, "learning_rate": 1.4620664179846908e-06, "loss": 0.0001, "num_input_tokens_seen": 7692144, "step": 15625 }, { "epoch": 2.0628216972416524, "grad_norm": 0.03198925033211708, "learning_rate": 1.4616578009870044e-06, "loss": 0.0006, "num_input_tokens_seen": 7694320, "step": 15630 }, { "epoch": 2.0634815890194007, "grad_norm": 6.981573104858398, "learning_rate": 1.4612490860057927e-06, "loss": 0.0008, "num_input_tokens_seen": 7696752, "step": 15635 }, { "epoch": 2.0641414807971494, "grad_norm": 0.24552814662456512, "learning_rate": 1.4608402731278022e-06, "loss": 0.0002, "num_input_tokens_seen": 7699184, "step": 15640 }, { "epoch": 2.0648013725748977, "grad_norm": 0.013794873841106892, "learning_rate": 1.4604313624398014e-06, "loss": 0.1332, "num_input_tokens_seen": 7701424, "step": 15645 }, { "epoch": 2.065461264352646, "grad_norm": 0.0617479644715786, "learning_rate": 1.4600223540285778e-06, "loss": 0.0006, "num_input_tokens_seen": 7703856, "step": 15650 }, { "epoch": 2.0661211561303947, "grad_norm": 0.01708795502781868, "learning_rate": 1.459613247980941e-06, "loss": 0.0737, "num_input_tokens_seen": 7706224, "step": 15655 }, { "epoch": 2.066781047908143, "grad_norm": 0.03478756919503212, "learning_rate": 1.4592040443837203e-06, "loss": 0.0006, "num_input_tokens_seen": 7708400, "step": 15660 }, { "epoch": 2.0674409396858917, "grad_norm": 0.016062593087553978, "learning_rate": 1.458794743323767e-06, "loss": 0.0015, "num_input_tokens_seen": 7710960, "step": 15665 }, { "epoch": 2.06810083146364, "grad_norm": 0.12953194975852966, "learning_rate": 1.4583853448879513e-06, "loss": 0.0003, "num_input_tokens_seen": 7713520, "step": 15670 }, { "epoch": 2.0687607232413883, "grad_norm": 0.007021949626505375, "learning_rate": 1.4579758491631655e-06, "loss": 0.0001, "num_input_tokens_seen": 7715824, "step": 15675 }, { "epoch": 2.069420615019137, "grad_norm": 0.01294905785471201, "learning_rate": 1.4575662562363222e-06, "loss": 0.0443, "num_input_tokens_seen": 7718448, "step": 15680 }, { "epoch": 2.0700805067968853, "grad_norm": 0.23416569828987122, "learning_rate": 1.4571565661943542e-06, "loss": 0.0005, "num_input_tokens_seen": 7720944, "step": 15685 }, { "epoch": 2.0707403985746335, "grad_norm": 0.13015711307525635, "learning_rate": 1.456746779124216e-06, "loss": 0.0007, "num_input_tokens_seen": 7723760, "step": 15690 }, { "epoch": 2.0714002903523823, "grad_norm": 0.0016105296090245247, "learning_rate": 1.4563368951128812e-06, "loss": 0.0004, "num_input_tokens_seen": 7726256, "step": 15695 }, { "epoch": 2.0720601821301305, "grad_norm": 0.2471880316734314, "learning_rate": 1.4559269142473452e-06, "loss": 0.1174, "num_input_tokens_seen": 7729136, "step": 15700 }, { "epoch": 2.0727200739078793, "grad_norm": 0.050808388739824295, "learning_rate": 1.455516836614623e-06, "loss": 0.0002, "num_input_tokens_seen": 7731824, "step": 15705 }, { "epoch": 2.0733799656856275, "grad_norm": 0.004038075916469097, "learning_rate": 1.4551066623017505e-06, "loss": 0.1329, "num_input_tokens_seen": 7734128, "step": 15710 }, { "epoch": 2.074039857463376, "grad_norm": 0.01952297054231167, "learning_rate": 1.4546963913957848e-06, "loss": 0.072, "num_input_tokens_seen": 7736112, "step": 15715 }, { "epoch": 2.0746997492411245, "grad_norm": 0.01668599434196949, "learning_rate": 1.4542860239838025e-06, "loss": 0.0002, "num_input_tokens_seen": 7738736, "step": 15720 }, { "epoch": 2.075359641018873, "grad_norm": 0.05960208922624588, "learning_rate": 1.4538755601529018e-06, "loss": 0.0002, "num_input_tokens_seen": 7741424, "step": 15725 }, { "epoch": 2.0760195327966215, "grad_norm": 0.05067654699087143, "learning_rate": 1.4534649999901999e-06, "loss": 0.0003, "num_input_tokens_seen": 7743664, "step": 15730 }, { "epoch": 2.07667942457437, "grad_norm": 0.029946548864245415, "learning_rate": 1.4530543435828355e-06, "loss": 0.0002, "num_input_tokens_seen": 7746032, "step": 15735 }, { "epoch": 2.077339316352118, "grad_norm": 0.03288107365369797, "learning_rate": 1.4526435910179674e-06, "loss": 0.0567, "num_input_tokens_seen": 7748336, "step": 15740 }, { "epoch": 2.077999208129867, "grad_norm": 0.006778739392757416, "learning_rate": 1.4522327423827746e-06, "loss": 0.1586, "num_input_tokens_seen": 7750960, "step": 15745 }, { "epoch": 2.078659099907615, "grad_norm": 0.030091719701886177, "learning_rate": 1.4518217977644576e-06, "loss": 0.0005, "num_input_tokens_seen": 7753776, "step": 15750 }, { "epoch": 2.079318991685364, "grad_norm": 0.13405410945415497, "learning_rate": 1.4514107572502355e-06, "loss": 0.0007, "num_input_tokens_seen": 7756400, "step": 15755 }, { "epoch": 2.079978883463112, "grad_norm": 15.214388847351074, "learning_rate": 1.450999620927349e-06, "loss": 0.0781, "num_input_tokens_seen": 7758640, "step": 15760 }, { "epoch": 2.0806387752408604, "grad_norm": 0.05948585271835327, "learning_rate": 1.4505883888830591e-06, "loss": 0.0003, "num_input_tokens_seen": 7761072, "step": 15765 }, { "epoch": 2.081298667018609, "grad_norm": 0.054697006940841675, "learning_rate": 1.4501770612046461e-06, "loss": 0.001, "num_input_tokens_seen": 7763696, "step": 15770 }, { "epoch": 2.0819585587963574, "grad_norm": 0.06462718546390533, "learning_rate": 1.4497656379794126e-06, "loss": 0.0007, "num_input_tokens_seen": 7766192, "step": 15775 }, { "epoch": 2.0826184505741057, "grad_norm": 0.007994703017175198, "learning_rate": 1.4493541192946785e-06, "loss": 0.0007, "num_input_tokens_seen": 7768752, "step": 15780 }, { "epoch": 2.0832783423518544, "grad_norm": 0.03308379277586937, "learning_rate": 1.448942505237787e-06, "loss": 0.0007, "num_input_tokens_seen": 7771248, "step": 15785 }, { "epoch": 2.0839382341296027, "grad_norm": 0.005272372625768185, "learning_rate": 1.4485307958960996e-06, "loss": 0.0, "num_input_tokens_seen": 7774064, "step": 15790 }, { "epoch": 2.0845981259073514, "grad_norm": 0.01093069277703762, "learning_rate": 1.448118991356999e-06, "loss": 0.0001, "num_input_tokens_seen": 7776496, "step": 15795 }, { "epoch": 2.0852580176850997, "grad_norm": 0.01305568777024746, "learning_rate": 1.4477070917078876e-06, "loss": 0.1766, "num_input_tokens_seen": 7778800, "step": 15800 }, { "epoch": 2.085917909462848, "grad_norm": 0.0439567007124424, "learning_rate": 1.4472950970361878e-06, "loss": 0.0004, "num_input_tokens_seen": 7781168, "step": 15805 }, { "epoch": 2.0865778012405967, "grad_norm": 0.179931640625, "learning_rate": 1.4468830074293425e-06, "loss": 0.0002, "num_input_tokens_seen": 7783792, "step": 15810 }, { "epoch": 2.087237693018345, "grad_norm": 0.006239602342247963, "learning_rate": 1.4464708229748154e-06, "loss": 0.0104, "num_input_tokens_seen": 7786288, "step": 15815 }, { "epoch": 2.087897584796093, "grad_norm": 0.00521710142493248, "learning_rate": 1.4460585437600887e-06, "loss": 0.0001, "num_input_tokens_seen": 7788848, "step": 15820 }, { "epoch": 2.088557476573842, "grad_norm": 0.02085597813129425, "learning_rate": 1.4456461698726666e-06, "loss": 0.0001, "num_input_tokens_seen": 7791472, "step": 15825 }, { "epoch": 2.08921736835159, "grad_norm": 0.0025343787856400013, "learning_rate": 1.445233701400072e-06, "loss": 0.0002, "num_input_tokens_seen": 7794096, "step": 15830 }, { "epoch": 2.089877260129339, "grad_norm": 0.0019459739560261369, "learning_rate": 1.4448211384298482e-06, "loss": 0.0923, "num_input_tokens_seen": 7796464, "step": 15835 }, { "epoch": 2.090537151907087, "grad_norm": 0.0659346804022789, "learning_rate": 1.4444084810495589e-06, "loss": 0.0005, "num_input_tokens_seen": 7798832, "step": 15840 }, { "epoch": 2.0911970436848355, "grad_norm": 22.533714294433594, "learning_rate": 1.4439957293467877e-06, "loss": 0.2503, "num_input_tokens_seen": 7801456, "step": 15845 }, { "epoch": 2.091856935462584, "grad_norm": 4.6341047286987305, "learning_rate": 1.4435828834091384e-06, "loss": 0.0013, "num_input_tokens_seen": 7804016, "step": 15850 }, { "epoch": 2.0925168272403325, "grad_norm": 0.018850073218345642, "learning_rate": 1.443169943324234e-06, "loss": 0.0461, "num_input_tokens_seen": 7806384, "step": 15855 }, { "epoch": 2.0931767190180812, "grad_norm": 0.011577551253139973, "learning_rate": 1.4427569091797182e-06, "loss": 0.0001, "num_input_tokens_seen": 7808496, "step": 15860 }, { "epoch": 2.0938366107958295, "grad_norm": 0.012284616008400917, "learning_rate": 1.442343781063255e-06, "loss": 0.0003, "num_input_tokens_seen": 7810992, "step": 15865 }, { "epoch": 2.094496502573578, "grad_norm": 0.03962375968694687, "learning_rate": 1.441930559062527e-06, "loss": 0.0001, "num_input_tokens_seen": 7813680, "step": 15870 }, { "epoch": 2.0951563943513265, "grad_norm": 14.506804466247559, "learning_rate": 1.4415172432652385e-06, "loss": 0.0673, "num_input_tokens_seen": 7816304, "step": 15875 }, { "epoch": 2.095816286129075, "grad_norm": 0.02910180203616619, "learning_rate": 1.441103833759112e-06, "loss": 0.0001, "num_input_tokens_seen": 7818864, "step": 15880 }, { "epoch": 2.0964761779068235, "grad_norm": 0.022448379546403885, "learning_rate": 1.4406903306318913e-06, "loss": 0.0004, "num_input_tokens_seen": 7821168, "step": 15885 }, { "epoch": 2.097136069684572, "grad_norm": 0.026671426370739937, "learning_rate": 1.440276733971339e-06, "loss": 0.0002, "num_input_tokens_seen": 7823920, "step": 15890 }, { "epoch": 2.09779596146232, "grad_norm": 0.017572835087776184, "learning_rate": 1.439863043865238e-06, "loss": 0.0001, "num_input_tokens_seen": 7826224, "step": 15895 }, { "epoch": 2.098455853240069, "grad_norm": 0.0058591896668076515, "learning_rate": 1.4394492604013914e-06, "loss": 0.0, "num_input_tokens_seen": 7828720, "step": 15900 }, { "epoch": 2.099115745017817, "grad_norm": 0.003320800606161356, "learning_rate": 1.4390353836676217e-06, "loss": 0.0554, "num_input_tokens_seen": 7831344, "step": 15905 }, { "epoch": 2.0997756367955653, "grad_norm": 0.019994355738162994, "learning_rate": 1.4386214137517707e-06, "loss": 0.0002, "num_input_tokens_seen": 7833840, "step": 15910 }, { "epoch": 2.100435528573314, "grad_norm": 0.07288461178541183, "learning_rate": 1.438207350741701e-06, "loss": 0.0659, "num_input_tokens_seen": 7836016, "step": 15915 }, { "epoch": 2.1010954203510623, "grad_norm": 0.009932564571499825, "learning_rate": 1.4377931947252943e-06, "loss": 0.0295, "num_input_tokens_seen": 7838768, "step": 15920 }, { "epoch": 2.101755312128811, "grad_norm": 0.009432883001863956, "learning_rate": 1.4373789457904522e-06, "loss": 0.0581, "num_input_tokens_seen": 7841328, "step": 15925 }, { "epoch": 2.1024152039065593, "grad_norm": 0.013211063109338284, "learning_rate": 1.4369646040250962e-06, "loss": 0.0002, "num_input_tokens_seen": 7843760, "step": 15930 }, { "epoch": 2.1030750956843076, "grad_norm": 0.018002096563577652, "learning_rate": 1.4365501695171673e-06, "loss": 0.0908, "num_input_tokens_seen": 7846512, "step": 15935 }, { "epoch": 2.1037349874620563, "grad_norm": 0.10871543735265732, "learning_rate": 1.436135642354626e-06, "loss": 0.0002, "num_input_tokens_seen": 7849072, "step": 15940 }, { "epoch": 2.1043948792398046, "grad_norm": 0.051549457013607025, "learning_rate": 1.4357210226254533e-06, "loss": 0.088, "num_input_tokens_seen": 7851632, "step": 15945 }, { "epoch": 2.105054771017553, "grad_norm": 0.09863109886646271, "learning_rate": 1.435306310417648e-06, "loss": 0.0004, "num_input_tokens_seen": 7854064, "step": 15950 }, { "epoch": 2.1057146627953016, "grad_norm": 0.09837587922811508, "learning_rate": 1.4348915058192316e-06, "loss": 0.0478, "num_input_tokens_seen": 7856752, "step": 15955 }, { "epoch": 2.10637455457305, "grad_norm": 23.637500762939453, "learning_rate": 1.4344766089182416e-06, "loss": 0.0468, "num_input_tokens_seen": 7859376, "step": 15960 }, { "epoch": 2.1070344463507986, "grad_norm": 0.008645118214190006, "learning_rate": 1.4340616198027377e-06, "loss": 0.0001, "num_input_tokens_seen": 7862000, "step": 15965 }, { "epoch": 2.107694338128547, "grad_norm": 25.188690185546875, "learning_rate": 1.4336465385607982e-06, "loss": 0.0612, "num_input_tokens_seen": 7864304, "step": 15970 }, { "epoch": 2.108354229906295, "grad_norm": 0.2091568261384964, "learning_rate": 1.433231365280521e-06, "loss": 0.1067, "num_input_tokens_seen": 7866608, "step": 15975 }, { "epoch": 2.109014121684044, "grad_norm": 0.03102351725101471, "learning_rate": 1.432816100050024e-06, "loss": 0.0596, "num_input_tokens_seen": 7869040, "step": 15980 }, { "epoch": 2.109674013461792, "grad_norm": 0.0035080285742878914, "learning_rate": 1.432400742957444e-06, "loss": 0.0001, "num_input_tokens_seen": 7871792, "step": 15985 }, { "epoch": 2.110333905239541, "grad_norm": 0.003127987729385495, "learning_rate": 1.4319852940909377e-06, "loss": 0.001, "num_input_tokens_seen": 7874160, "step": 15990 }, { "epoch": 2.110993797017289, "grad_norm": 0.023723382502794266, "learning_rate": 1.4315697535386804e-06, "loss": 0.0029, "num_input_tokens_seen": 7876656, "step": 15995 }, { "epoch": 2.1116536887950375, "grad_norm": 35.08774948120117, "learning_rate": 1.4311541213888682e-06, "loss": 0.0551, "num_input_tokens_seen": 7879280, "step": 16000 }, { "epoch": 2.112313580572786, "grad_norm": 0.024285173043608665, "learning_rate": 1.430738397729716e-06, "loss": 0.0002, "num_input_tokens_seen": 7881712, "step": 16005 }, { "epoch": 2.1129734723505345, "grad_norm": 0.08313702791929245, "learning_rate": 1.4303225826494583e-06, "loss": 0.0004, "num_input_tokens_seen": 7883952, "step": 16010 }, { "epoch": 2.113633364128283, "grad_norm": 0.058101836591959, "learning_rate": 1.4299066762363484e-06, "loss": 0.0488, "num_input_tokens_seen": 7886320, "step": 16015 }, { "epoch": 2.1142932559060315, "grad_norm": 0.0922577753663063, "learning_rate": 1.4294906785786593e-06, "loss": 0.0273, "num_input_tokens_seen": 7889008, "step": 16020 }, { "epoch": 2.1149531476837797, "grad_norm": 18.65223503112793, "learning_rate": 1.429074589764684e-06, "loss": 0.0751, "num_input_tokens_seen": 7891376, "step": 16025 }, { "epoch": 2.1156130394615285, "grad_norm": 0.26984599232673645, "learning_rate": 1.4286584098827343e-06, "loss": 0.0536, "num_input_tokens_seen": 7893616, "step": 16030 }, { "epoch": 2.1162729312392767, "grad_norm": 0.010782040655612946, "learning_rate": 1.4282421390211411e-06, "loss": 0.0007, "num_input_tokens_seen": 7895984, "step": 16035 }, { "epoch": 2.116932823017025, "grad_norm": 0.0064732348546385765, "learning_rate": 1.4278257772682548e-06, "loss": 0.0001, "num_input_tokens_seen": 7898224, "step": 16040 }, { "epoch": 2.1175927147947737, "grad_norm": 0.007527779787778854, "learning_rate": 1.4274093247124456e-06, "loss": 0.0001, "num_input_tokens_seen": 7900656, "step": 16045 }, { "epoch": 2.118252606572522, "grad_norm": 0.0363912433385849, "learning_rate": 1.4269927814421023e-06, "loss": 0.0001, "num_input_tokens_seen": 7903152, "step": 16050 }, { "epoch": 2.1189124983502707, "grad_norm": 0.004254709463566542, "learning_rate": 1.426576147545633e-06, "loss": 0.0001, "num_input_tokens_seen": 7905520, "step": 16055 }, { "epoch": 2.119572390128019, "grad_norm": 0.03672404587268829, "learning_rate": 1.4261594231114658e-06, "loss": 0.0009, "num_input_tokens_seen": 7907952, "step": 16060 }, { "epoch": 2.1202322819057673, "grad_norm": 0.0029204622842371464, "learning_rate": 1.4257426082280466e-06, "loss": 0.1114, "num_input_tokens_seen": 7910384, "step": 16065 }, { "epoch": 2.120892173683516, "grad_norm": 0.42951253056526184, "learning_rate": 1.4253257029838419e-06, "loss": 0.009, "num_input_tokens_seen": 7912880, "step": 16070 }, { "epoch": 2.1215520654612643, "grad_norm": 101.55767822265625, "learning_rate": 1.4249087074673367e-06, "loss": 0.1324, "num_input_tokens_seen": 7915184, "step": 16075 }, { "epoch": 2.122211957239013, "grad_norm": 0.0719396322965622, "learning_rate": 1.4244916217670352e-06, "loss": 0.0003, "num_input_tokens_seen": 7917424, "step": 16080 }, { "epoch": 2.1228718490167613, "grad_norm": 0.010440871119499207, "learning_rate": 1.4240744459714612e-06, "loss": 0.0015, "num_input_tokens_seen": 7919600, "step": 16085 }, { "epoch": 2.1235317407945096, "grad_norm": 0.004497972317039967, "learning_rate": 1.4236571801691568e-06, "loss": 0.0001, "num_input_tokens_seen": 7922224, "step": 16090 }, { "epoch": 2.1241916325722583, "grad_norm": 37.28203582763672, "learning_rate": 1.4232398244486835e-06, "loss": 0.0682, "num_input_tokens_seen": 7924464, "step": 16095 }, { "epoch": 2.1248515243500066, "grad_norm": 0.0008809241116978228, "learning_rate": 1.4228223788986226e-06, "loss": 0.0001, "num_input_tokens_seen": 7927088, "step": 16100 }, { "epoch": 2.125511416127755, "grad_norm": 0.21736837923526764, "learning_rate": 1.4224048436075738e-06, "loss": 0.1128, "num_input_tokens_seen": 7929648, "step": 16105 }, { "epoch": 2.1261713079055036, "grad_norm": 0.008032063953578472, "learning_rate": 1.4219872186641557e-06, "loss": 0.0001, "num_input_tokens_seen": 7932016, "step": 16110 }, { "epoch": 2.126831199683252, "grad_norm": 0.014576790854334831, "learning_rate": 1.421569504157006e-06, "loss": 0.0535, "num_input_tokens_seen": 7934576, "step": 16115 }, { "epoch": 2.1274910914610006, "grad_norm": 0.037164539098739624, "learning_rate": 1.4211517001747818e-06, "loss": 0.0491, "num_input_tokens_seen": 7936880, "step": 16120 }, { "epoch": 2.128150983238749, "grad_norm": 0.09125718474388123, "learning_rate": 1.420733806806159e-06, "loss": 0.0007, "num_input_tokens_seen": 7939248, "step": 16125 }, { "epoch": 2.128810875016497, "grad_norm": 0.02172080986201763, "learning_rate": 1.4203158241398329e-06, "loss": 0.1307, "num_input_tokens_seen": 7941936, "step": 16130 }, { "epoch": 2.129470766794246, "grad_norm": 0.02687636949121952, "learning_rate": 1.4198977522645162e-06, "loss": 0.1063, "num_input_tokens_seen": 7944304, "step": 16135 }, { "epoch": 2.130130658571994, "grad_norm": 19.126834869384766, "learning_rate": 1.4194795912689426e-06, "loss": 0.1061, "num_input_tokens_seen": 7946544, "step": 16140 }, { "epoch": 2.130790550349743, "grad_norm": 0.009625761769711971, "learning_rate": 1.419061341241863e-06, "loss": 0.0001, "num_input_tokens_seen": 7949424, "step": 16145 }, { "epoch": 2.131450442127491, "grad_norm": 0.004309684503823519, "learning_rate": 1.4186430022720488e-06, "loss": 0.0001, "num_input_tokens_seen": 7951856, "step": 16150 }, { "epoch": 2.1321103339052394, "grad_norm": 0.05872798711061478, "learning_rate": 1.4182245744482886e-06, "loss": 0.0002, "num_input_tokens_seen": 7953968, "step": 16155 }, { "epoch": 2.132770225682988, "grad_norm": 0.02588357776403427, "learning_rate": 1.4178060578593912e-06, "loss": 0.1079, "num_input_tokens_seen": 7956464, "step": 16160 }, { "epoch": 2.1334301174607364, "grad_norm": 0.019361227750778198, "learning_rate": 1.4173874525941836e-06, "loss": 0.0001, "num_input_tokens_seen": 7958896, "step": 16165 }, { "epoch": 2.1340900092384847, "grad_norm": 0.11392463743686676, "learning_rate": 1.4169687587415114e-06, "loss": 0.0126, "num_input_tokens_seen": 7961328, "step": 16170 }, { "epoch": 2.1347499010162334, "grad_norm": 0.0023702646140009165, "learning_rate": 1.4165499763902399e-06, "loss": 0.0691, "num_input_tokens_seen": 7963888, "step": 16175 }, { "epoch": 2.1354097927939817, "grad_norm": 0.014895014464855194, "learning_rate": 1.416131105629252e-06, "loss": 0.0001, "num_input_tokens_seen": 7966512, "step": 16180 }, { "epoch": 2.1360696845717304, "grad_norm": 0.05524078384041786, "learning_rate": 1.4157121465474504e-06, "loss": 0.0002, "num_input_tokens_seen": 7968944, "step": 16185 }, { "epoch": 2.1367295763494787, "grad_norm": 0.05119001865386963, "learning_rate": 1.4152930992337562e-06, "loss": 0.0018, "num_input_tokens_seen": 7971440, "step": 16190 }, { "epoch": 2.137389468127227, "grad_norm": 0.04204750433564186, "learning_rate": 1.4148739637771088e-06, "loss": 0.0001, "num_input_tokens_seen": 7973744, "step": 16195 }, { "epoch": 2.1380493599049757, "grad_norm": 0.5487294793128967, "learning_rate": 1.4144547402664674e-06, "loss": 0.0523, "num_input_tokens_seen": 7975920, "step": 16200 }, { "epoch": 2.138709251682724, "grad_norm": 0.0028204945847392082, "learning_rate": 1.4140354287908079e-06, "loss": 0.0004, "num_input_tokens_seen": 7978224, "step": 16205 }, { "epoch": 2.1393691434604727, "grad_norm": 0.18289142847061157, "learning_rate": 1.4136160294391272e-06, "loss": 0.0001, "num_input_tokens_seen": 7980592, "step": 16210 }, { "epoch": 2.140029035238221, "grad_norm": 0.016940122470259666, "learning_rate": 1.4131965423004394e-06, "loss": 0.0001, "num_input_tokens_seen": 7982896, "step": 16215 }, { "epoch": 2.1406889270159692, "grad_norm": 0.5715939402580261, "learning_rate": 1.4127769674637777e-06, "loss": 0.0297, "num_input_tokens_seen": 7985072, "step": 16220 }, { "epoch": 2.141348818793718, "grad_norm": 0.0023537829983979464, "learning_rate": 1.4123573050181937e-06, "loss": 0.1126, "num_input_tokens_seen": 7987824, "step": 16225 }, { "epoch": 2.1420087105714662, "grad_norm": 1.0796823501586914, "learning_rate": 1.4119375550527578e-06, "loss": 0.0004, "num_input_tokens_seen": 7990256, "step": 16230 }, { "epoch": 2.1426686023492145, "grad_norm": 0.010677073150873184, "learning_rate": 1.4115177176565587e-06, "loss": 0.0001, "num_input_tokens_seen": 7992944, "step": 16235 }, { "epoch": 2.1433284941269632, "grad_norm": 0.002611901145428419, "learning_rate": 1.4110977929187042e-06, "loss": 0.0803, "num_input_tokens_seen": 7995440, "step": 16240 }, { "epoch": 2.1439883859047115, "grad_norm": 0.21536973118782043, "learning_rate": 1.41067778092832e-06, "loss": 0.0956, "num_input_tokens_seen": 7997808, "step": 16245 }, { "epoch": 2.1446482776824602, "grad_norm": 0.017482072114944458, "learning_rate": 1.4102576817745506e-06, "loss": 0.0798, "num_input_tokens_seen": 8000304, "step": 16250 }, { "epoch": 2.1453081694602085, "grad_norm": 0.03651322424411774, "learning_rate": 1.4098374955465592e-06, "loss": 0.1273, "num_input_tokens_seen": 8002992, "step": 16255 }, { "epoch": 2.145968061237957, "grad_norm": 0.023659037426114082, "learning_rate": 1.409417222333527e-06, "loss": 0.0005, "num_input_tokens_seen": 8005488, "step": 16260 }, { "epoch": 2.1466279530157055, "grad_norm": 0.009940829128026962, "learning_rate": 1.4089968622246543e-06, "loss": 0.0005, "num_input_tokens_seen": 8008240, "step": 16265 }, { "epoch": 2.147287844793454, "grad_norm": 0.062029287219047546, "learning_rate": 1.4085764153091595e-06, "loss": 0.1245, "num_input_tokens_seen": 8010864, "step": 16270 }, { "epoch": 2.1479477365712025, "grad_norm": 0.09726342558860779, "learning_rate": 1.4081558816762788e-06, "loss": 0.0894, "num_input_tokens_seen": 8013296, "step": 16275 }, { "epoch": 2.148607628348951, "grad_norm": 0.04445560276508331, "learning_rate": 1.4077352614152683e-06, "loss": 0.0005, "num_input_tokens_seen": 8016048, "step": 16280 }, { "epoch": 2.149267520126699, "grad_norm": 0.030557597056031227, "learning_rate": 1.407314554615401e-06, "loss": 0.0012, "num_input_tokens_seen": 8018480, "step": 16285 }, { "epoch": 2.149927411904448, "grad_norm": 0.13699224591255188, "learning_rate": 1.406893761365969e-06, "loss": 0.0413, "num_input_tokens_seen": 8021040, "step": 16290 }, { "epoch": 2.150587303682196, "grad_norm": 0.1511172652244568, "learning_rate": 1.4064728817562825e-06, "loss": 0.0005, "num_input_tokens_seen": 8023664, "step": 16295 }, { "epoch": 2.1512471954599444, "grad_norm": 0.06514198333024979, "learning_rate": 1.4060519158756702e-06, "loss": 0.0003, "num_input_tokens_seen": 8025712, "step": 16300 }, { "epoch": 2.151907087237693, "grad_norm": 0.018014973029494286, "learning_rate": 1.4056308638134794e-06, "loss": 0.0002, "num_input_tokens_seen": 8028208, "step": 16305 }, { "epoch": 2.1525669790154414, "grad_norm": 0.19428548216819763, "learning_rate": 1.4052097256590752e-06, "loss": 0.0002, "num_input_tokens_seen": 8030640, "step": 16310 }, { "epoch": 2.15322687079319, "grad_norm": 0.05569561943411827, "learning_rate": 1.4047885015018407e-06, "loss": 0.0613, "num_input_tokens_seen": 8033136, "step": 16315 }, { "epoch": 2.1538867625709384, "grad_norm": 0.44556447863578796, "learning_rate": 1.4043671914311785e-06, "loss": 0.054, "num_input_tokens_seen": 8035696, "step": 16320 }, { "epoch": 2.1545466543486866, "grad_norm": 0.011591610498726368, "learning_rate": 1.4039457955365077e-06, "loss": 0.0348, "num_input_tokens_seen": 8038448, "step": 16325 }, { "epoch": 2.1552065461264354, "grad_norm": 0.05333820357918739, "learning_rate": 1.403524313907267e-06, "loss": 0.1972, "num_input_tokens_seen": 8040944, "step": 16330 }, { "epoch": 2.1558664379041836, "grad_norm": 0.01803092285990715, "learning_rate": 1.403102746632913e-06, "loss": 0.0004, "num_input_tokens_seen": 8043312, "step": 16335 }, { "epoch": 2.1565263296819324, "grad_norm": 0.07055540382862091, "learning_rate": 1.4026810938029197e-06, "loss": 0.0002, "num_input_tokens_seen": 8045872, "step": 16340 }, { "epoch": 2.1571862214596806, "grad_norm": 22.134244918823242, "learning_rate": 1.4022593555067804e-06, "loss": 0.0644, "num_input_tokens_seen": 8048624, "step": 16345 }, { "epoch": 2.157846113237429, "grad_norm": 0.012910672463476658, "learning_rate": 1.401837531834006e-06, "loss": 0.1225, "num_input_tokens_seen": 8050864, "step": 16350 }, { "epoch": 2.1585060050151776, "grad_norm": 0.1114889532327652, "learning_rate": 1.401415622874125e-06, "loss": 0.0007, "num_input_tokens_seen": 8053616, "step": 16355 }, { "epoch": 2.159165896792926, "grad_norm": 0.013317722827196121, "learning_rate": 1.400993628716685e-06, "loss": 0.0002, "num_input_tokens_seen": 8056048, "step": 16360 }, { "epoch": 2.159825788570674, "grad_norm": 0.010530170984566212, "learning_rate": 1.400571549451251e-06, "loss": 0.0399, "num_input_tokens_seen": 8058288, "step": 16365 }, { "epoch": 2.160485680348423, "grad_norm": 0.020479438826441765, "learning_rate": 1.4001493851674066e-06, "loss": 0.0002, "num_input_tokens_seen": 8060592, "step": 16370 }, { "epoch": 2.161145572126171, "grad_norm": 0.02857316844165325, "learning_rate": 1.3997271359547529e-06, "loss": 0.0229, "num_input_tokens_seen": 8062960, "step": 16375 }, { "epoch": 2.16180546390392, "grad_norm": 0.04585658758878708, "learning_rate": 1.3993048019029088e-06, "loss": 0.0017, "num_input_tokens_seen": 8065584, "step": 16380 }, { "epoch": 2.162465355681668, "grad_norm": 0.01887671649456024, "learning_rate": 1.3988823831015125e-06, "loss": 0.0704, "num_input_tokens_seen": 8068144, "step": 16385 }, { "epoch": 2.1631252474594165, "grad_norm": 0.04641466215252876, "learning_rate": 1.3984598796402183e-06, "loss": 0.0004, "num_input_tokens_seen": 8070384, "step": 16390 }, { "epoch": 2.163785139237165, "grad_norm": 0.010708236135542393, "learning_rate": 1.3980372916087006e-06, "loss": 0.0002, "num_input_tokens_seen": 8072816, "step": 16395 }, { "epoch": 2.1644450310149135, "grad_norm": 0.1834602952003479, "learning_rate": 1.3976146190966498e-06, "loss": 0.0014, "num_input_tokens_seen": 8075184, "step": 16400 }, { "epoch": 2.165104922792662, "grad_norm": 0.07067617774009705, "learning_rate": 1.3971918621937756e-06, "loss": 0.0581, "num_input_tokens_seen": 8077424, "step": 16405 }, { "epoch": 2.1657648145704105, "grad_norm": 0.02065001055598259, "learning_rate": 1.3967690209898046e-06, "loss": 0.0001, "num_input_tokens_seen": 8080048, "step": 16410 }, { "epoch": 2.1664247063481588, "grad_norm": 0.2887703776359558, "learning_rate": 1.3963460955744824e-06, "loss": 0.0478, "num_input_tokens_seen": 8082416, "step": 16415 }, { "epoch": 2.1670845981259075, "grad_norm": 0.024521052837371826, "learning_rate": 1.3959230860375716e-06, "loss": 0.0001, "num_input_tokens_seen": 8085104, "step": 16420 }, { "epoch": 2.1677444899036558, "grad_norm": 0.3461296856403351, "learning_rate": 1.3954999924688522e-06, "loss": 0.1084, "num_input_tokens_seen": 8087408, "step": 16425 }, { "epoch": 2.1684043816814045, "grad_norm": 0.10695258527994156, "learning_rate": 1.395076814958124e-06, "loss": 0.0489, "num_input_tokens_seen": 8089456, "step": 16430 }, { "epoch": 2.1690642734591528, "grad_norm": 0.026660706847906113, "learning_rate": 1.3946535535952024e-06, "loss": 0.0002, "num_input_tokens_seen": 8092336, "step": 16435 }, { "epoch": 2.169724165236901, "grad_norm": 14.826727867126465, "learning_rate": 1.394230208469922e-06, "loss": 0.0369, "num_input_tokens_seen": 8094640, "step": 16440 }, { "epoch": 2.1703840570146498, "grad_norm": 24.21205711364746, "learning_rate": 1.3938067796721349e-06, "loss": 0.0711, "num_input_tokens_seen": 8097072, "step": 16445 }, { "epoch": 2.171043948792398, "grad_norm": 0.30090898275375366, "learning_rate": 1.3933832672917101e-06, "loss": 0.0007, "num_input_tokens_seen": 8099504, "step": 16450 }, { "epoch": 2.1717038405701463, "grad_norm": 0.12428780645132065, "learning_rate": 1.3929596714185357e-06, "loss": 0.0004, "num_input_tokens_seen": 8101680, "step": 16455 }, { "epoch": 2.172363732347895, "grad_norm": 1.8694030046463013, "learning_rate": 1.3925359921425166e-06, "loss": 0.1802, "num_input_tokens_seen": 8104304, "step": 16460 }, { "epoch": 2.1730236241256433, "grad_norm": 0.07084174454212189, "learning_rate": 1.3921122295535756e-06, "loss": 0.0001, "num_input_tokens_seen": 8106672, "step": 16465 }, { "epoch": 2.173683515903392, "grad_norm": 0.5130443572998047, "learning_rate": 1.3916883837416536e-06, "loss": 0.091, "num_input_tokens_seen": 8108976, "step": 16470 }, { "epoch": 2.1743434076811403, "grad_norm": 0.029886798933148384, "learning_rate": 1.3912644547967085e-06, "loss": 0.004, "num_input_tokens_seen": 8111408, "step": 16475 }, { "epoch": 2.1750032994588886, "grad_norm": 0.023892467841506004, "learning_rate": 1.390840442808716e-06, "loss": 0.0458, "num_input_tokens_seen": 8113904, "step": 16480 }, { "epoch": 2.1756631912366373, "grad_norm": 0.04723944514989853, "learning_rate": 1.3904163478676698e-06, "loss": 0.1022, "num_input_tokens_seen": 8116336, "step": 16485 }, { "epoch": 2.1763230830143856, "grad_norm": 0.05369102954864502, "learning_rate": 1.3899921700635808e-06, "loss": 0.0318, "num_input_tokens_seen": 8118640, "step": 16490 }, { "epoch": 2.176982974792134, "grad_norm": 84.5366439819336, "learning_rate": 1.389567909486478e-06, "loss": 0.0517, "num_input_tokens_seen": 8121008, "step": 16495 }, { "epoch": 2.1776428665698826, "grad_norm": 15.742450714111328, "learning_rate": 1.3891435662264077e-06, "loss": 0.065, "num_input_tokens_seen": 8123632, "step": 16500 }, { "epoch": 2.178302758347631, "grad_norm": 0.06588709354400635, "learning_rate": 1.3887191403734328e-06, "loss": 0.0667, "num_input_tokens_seen": 8126256, "step": 16505 }, { "epoch": 2.1789626501253796, "grad_norm": 0.011470166966319084, "learning_rate": 1.3882946320176358e-06, "loss": 0.0536, "num_input_tokens_seen": 8129072, "step": 16510 }, { "epoch": 2.179622541903128, "grad_norm": 12.762123107910156, "learning_rate": 1.3878700412491147e-06, "loss": 0.0269, "num_input_tokens_seen": 8131632, "step": 16515 }, { "epoch": 2.180282433680876, "grad_norm": 0.002766258781775832, "learning_rate": 1.3874453681579861e-06, "loss": 0.0001, "num_input_tokens_seen": 8134192, "step": 16520 }, { "epoch": 2.180942325458625, "grad_norm": 0.00879225879907608, "learning_rate": 1.3870206128343838e-06, "loss": 0.0014, "num_input_tokens_seen": 8136432, "step": 16525 }, { "epoch": 2.181602217236373, "grad_norm": 0.005440095905214548, "learning_rate": 1.386595775368459e-06, "loss": 0.0003, "num_input_tokens_seen": 8138544, "step": 16530 }, { "epoch": 2.182262109014122, "grad_norm": 12.968379020690918, "learning_rate": 1.3861708558503804e-06, "loss": 0.0551, "num_input_tokens_seen": 8140976, "step": 16535 }, { "epoch": 2.18292200079187, "grad_norm": 0.006056667771190405, "learning_rate": 1.385745854370334e-06, "loss": 0.0008, "num_input_tokens_seen": 8143344, "step": 16540 }, { "epoch": 2.1835818925696184, "grad_norm": 0.05108082666993141, "learning_rate": 1.3853207710185233e-06, "loss": 0.0352, "num_input_tokens_seen": 8145392, "step": 16545 }, { "epoch": 2.184241784347367, "grad_norm": 0.06454239040613174, "learning_rate": 1.3848956058851695e-06, "loss": 0.0001, "num_input_tokens_seen": 8148080, "step": 16550 }, { "epoch": 2.1849016761251154, "grad_norm": 24.318828582763672, "learning_rate": 1.3844703590605105e-06, "loss": 0.0695, "num_input_tokens_seen": 8150448, "step": 16555 }, { "epoch": 2.185561567902864, "grad_norm": 0.02337341383099556, "learning_rate": 1.3840450306348017e-06, "loss": 0.109, "num_input_tokens_seen": 8152880, "step": 16560 }, { "epoch": 2.1862214596806124, "grad_norm": 0.4075833261013031, "learning_rate": 1.3836196206983162e-06, "loss": 0.0972, "num_input_tokens_seen": 8155248, "step": 16565 }, { "epoch": 2.1868813514583607, "grad_norm": 0.1106967106461525, "learning_rate": 1.3831941293413443e-06, "loss": 0.0592, "num_input_tokens_seen": 8157808, "step": 16570 }, { "epoch": 2.1875412432361094, "grad_norm": 1.1730084419250488, "learning_rate": 1.3827685566541934e-06, "loss": 0.0009, "num_input_tokens_seen": 8160368, "step": 16575 }, { "epoch": 2.1882011350138577, "grad_norm": 0.01301812008023262, "learning_rate": 1.382342902727188e-06, "loss": 0.0492, "num_input_tokens_seen": 8162544, "step": 16580 }, { "epoch": 2.188861026791606, "grad_norm": 0.295091837644577, "learning_rate": 1.38191716765067e-06, "loss": 0.0004, "num_input_tokens_seen": 8165168, "step": 16585 }, { "epoch": 2.1895209185693547, "grad_norm": 0.033473141491413116, "learning_rate": 1.381491351514999e-06, "loss": 0.0538, "num_input_tokens_seen": 8167472, "step": 16590 }, { "epoch": 2.190180810347103, "grad_norm": 0.18719351291656494, "learning_rate": 1.3810654544105512e-06, "loss": 0.0013, "num_input_tokens_seen": 8169840, "step": 16595 }, { "epoch": 2.1908407021248517, "grad_norm": 0.01855543442070484, "learning_rate": 1.38063947642772e-06, "loss": 0.0007, "num_input_tokens_seen": 8172272, "step": 16600 }, { "epoch": 2.1915005939026, "grad_norm": 0.2745734453201294, "learning_rate": 1.3802134176569166e-06, "loss": 0.0001, "num_input_tokens_seen": 8174640, "step": 16605 }, { "epoch": 2.1921604856803483, "grad_norm": 0.03164679929614067, "learning_rate": 1.3797872781885685e-06, "loss": 0.0298, "num_input_tokens_seen": 8177264, "step": 16610 }, { "epoch": 2.192820377458097, "grad_norm": 0.028849711641669273, "learning_rate": 1.3793610581131207e-06, "loss": 0.0001, "num_input_tokens_seen": 8179504, "step": 16615 }, { "epoch": 2.1934802692358453, "grad_norm": 0.004540000110864639, "learning_rate": 1.3789347575210352e-06, "loss": 0.0782, "num_input_tokens_seen": 8182192, "step": 16620 }, { "epoch": 2.1941401610135935, "grad_norm": 0.004751342348754406, "learning_rate": 1.3785083765027919e-06, "loss": 0.0031, "num_input_tokens_seen": 8184496, "step": 16625 }, { "epoch": 2.1948000527913423, "grad_norm": 0.06377559155225754, "learning_rate": 1.3780819151488865e-06, "loss": 0.0002, "num_input_tokens_seen": 8186864, "step": 16630 }, { "epoch": 2.1954599445690905, "grad_norm": 0.013530077412724495, "learning_rate": 1.3776553735498321e-06, "loss": 0.2403, "num_input_tokens_seen": 8189168, "step": 16635 }, { "epoch": 2.1961198363468393, "grad_norm": 12.607081413269043, "learning_rate": 1.37722875179616e-06, "loss": 0.0315, "num_input_tokens_seen": 8191536, "step": 16640 }, { "epoch": 2.1967797281245875, "grad_norm": 0.0664307028055191, "learning_rate": 1.3768020499784165e-06, "loss": 0.0002, "num_input_tokens_seen": 8194352, "step": 16645 }, { "epoch": 2.197439619902336, "grad_norm": 0.12778986990451813, "learning_rate": 1.3763752681871669e-06, "loss": 0.0444, "num_input_tokens_seen": 8196784, "step": 16650 }, { "epoch": 2.1980995116800846, "grad_norm": 0.8269379734992981, "learning_rate": 1.375948406512992e-06, "loss": 0.0195, "num_input_tokens_seen": 8199216, "step": 16655 }, { "epoch": 2.198759403457833, "grad_norm": 0.008877968415617943, "learning_rate": 1.3755214650464903e-06, "loss": 0.071, "num_input_tokens_seen": 8201456, "step": 16660 }, { "epoch": 2.1994192952355816, "grad_norm": 0.0728282779455185, "learning_rate": 1.3750944438782769e-06, "loss": 0.0002, "num_input_tokens_seen": 8203568, "step": 16665 }, { "epoch": 2.20007918701333, "grad_norm": 0.18286637961864471, "learning_rate": 1.374667343098984e-06, "loss": 0.0002, "num_input_tokens_seen": 8205872, "step": 16670 }, { "epoch": 2.200739078791078, "grad_norm": 0.011909585446119308, "learning_rate": 1.3742401627992604e-06, "loss": 0.0201, "num_input_tokens_seen": 8208432, "step": 16675 }, { "epoch": 2.201398970568827, "grad_norm": 15.967900276184082, "learning_rate": 1.3738129030697724e-06, "loss": 0.2684, "num_input_tokens_seen": 8210928, "step": 16680 }, { "epoch": 2.202058862346575, "grad_norm": 0.08457144349813461, "learning_rate": 1.3733855640012028e-06, "loss": 0.0003, "num_input_tokens_seen": 8213168, "step": 16685 }, { "epoch": 2.202718754124324, "grad_norm": 0.2429049015045166, "learning_rate": 1.372958145684251e-06, "loss": 0.1604, "num_input_tokens_seen": 8215536, "step": 16690 }, { "epoch": 2.203378645902072, "grad_norm": 2.9228296279907227, "learning_rate": 1.3725306482096337e-06, "loss": 0.0022, "num_input_tokens_seen": 8217904, "step": 16695 }, { "epoch": 2.2040385376798204, "grad_norm": 0.39163509011268616, "learning_rate": 1.3721030716680835e-06, "loss": 0.0005, "num_input_tokens_seen": 8220208, "step": 16700 }, { "epoch": 2.204698429457569, "grad_norm": 0.05875394865870476, "learning_rate": 1.3716754161503514e-06, "loss": 0.0322, "num_input_tokens_seen": 8222832, "step": 16705 }, { "epoch": 2.2053583212353174, "grad_norm": 0.012448856607079506, "learning_rate": 1.3712476817472037e-06, "loss": 0.0001, "num_input_tokens_seen": 8225264, "step": 16710 }, { "epoch": 2.2060182130130657, "grad_norm": 0.04935499653220177, "learning_rate": 1.3708198685494234e-06, "loss": 0.1097, "num_input_tokens_seen": 8227632, "step": 16715 }, { "epoch": 2.2066781047908144, "grad_norm": 0.1539801061153412, "learning_rate": 1.3703919766478116e-06, "loss": 0.0618, "num_input_tokens_seen": 8230448, "step": 16720 }, { "epoch": 2.2073379965685627, "grad_norm": 0.03219306468963623, "learning_rate": 1.369964006133185e-06, "loss": 0.1022, "num_input_tokens_seen": 8233008, "step": 16725 }, { "epoch": 2.2079978883463114, "grad_norm": 1.6212204694747925, "learning_rate": 1.3695359570963772e-06, "loss": 0.0281, "num_input_tokens_seen": 8235568, "step": 16730 }, { "epoch": 2.2086577801240597, "grad_norm": 0.15697598457336426, "learning_rate": 1.3691078296282383e-06, "loss": 0.0354, "num_input_tokens_seen": 8237744, "step": 16735 }, { "epoch": 2.209317671901808, "grad_norm": 0.5345177054405212, "learning_rate": 1.3686796238196357e-06, "loss": 0.0026, "num_input_tokens_seen": 8240368, "step": 16740 }, { "epoch": 2.2099775636795567, "grad_norm": 34.04423904418945, "learning_rate": 1.3682513397614522e-06, "loss": 0.127, "num_input_tokens_seen": 8242800, "step": 16745 }, { "epoch": 2.210637455457305, "grad_norm": 0.002000207779929042, "learning_rate": 1.367822977544589e-06, "loss": 0.0009, "num_input_tokens_seen": 8245232, "step": 16750 }, { "epoch": 2.2112973472350532, "grad_norm": 0.17773684859275818, "learning_rate": 1.3673945372599623e-06, "loss": 0.0009, "num_input_tokens_seen": 8247856, "step": 16755 }, { "epoch": 2.211957239012802, "grad_norm": 20.52808380126953, "learning_rate": 1.366966018998505e-06, "loss": 0.0835, "num_input_tokens_seen": 8250352, "step": 16760 }, { "epoch": 2.2126171307905502, "grad_norm": 0.12653538584709167, "learning_rate": 1.3665374228511681e-06, "loss": 0.0053, "num_input_tokens_seen": 8252720, "step": 16765 }, { "epoch": 2.213277022568299, "grad_norm": 0.15404640138149261, "learning_rate": 1.366108748908917e-06, "loss": 0.0008, "num_input_tokens_seen": 8255344, "step": 16770 }, { "epoch": 2.2139369143460472, "grad_norm": 0.014432862401008606, "learning_rate": 1.3656799972627355e-06, "loss": 0.0782, "num_input_tokens_seen": 8257648, "step": 16775 }, { "epoch": 2.2145968061237955, "grad_norm": 0.0697983056306839, "learning_rate": 1.3652511680036227e-06, "loss": 0.0472, "num_input_tokens_seen": 8260336, "step": 16780 }, { "epoch": 2.2152566979015442, "grad_norm": 0.05055699869990349, "learning_rate": 1.3648222612225941e-06, "loss": 0.0985, "num_input_tokens_seen": 8263152, "step": 16785 }, { "epoch": 2.2159165896792925, "grad_norm": 0.0167390163987875, "learning_rate": 1.3643932770106824e-06, "loss": 0.0595, "num_input_tokens_seen": 8265584, "step": 16790 }, { "epoch": 2.2165764814570412, "grad_norm": 0.0399298258125782, "learning_rate": 1.3639642154589365e-06, "loss": 0.0005, "num_input_tokens_seen": 8267760, "step": 16795 }, { "epoch": 2.2172363732347895, "grad_norm": 0.00978358555585146, "learning_rate": 1.3635350766584217e-06, "loss": 0.0002, "num_input_tokens_seen": 8270256, "step": 16800 }, { "epoch": 2.217896265012538, "grad_norm": 0.005867898464202881, "learning_rate": 1.363105860700219e-06, "loss": 0.0003, "num_input_tokens_seen": 8273072, "step": 16805 }, { "epoch": 2.2185561567902865, "grad_norm": 0.6499746441841125, "learning_rate": 1.3626765676754274e-06, "loss": 0.194, "num_input_tokens_seen": 8275376, "step": 16810 }, { "epoch": 2.219216048568035, "grad_norm": 0.03754541650414467, "learning_rate": 1.3622471976751599e-06, "loss": 0.0007, "num_input_tokens_seen": 8277872, "step": 16815 }, { "epoch": 2.2198759403457835, "grad_norm": 0.4709387719631195, "learning_rate": 1.3618177507905484e-06, "loss": 0.0477, "num_input_tokens_seen": 8280432, "step": 16820 }, { "epoch": 2.220535832123532, "grad_norm": 0.026431599631905556, "learning_rate": 1.361388227112739e-06, "loss": 0.0001, "num_input_tokens_seen": 8282864, "step": 16825 }, { "epoch": 2.22119572390128, "grad_norm": 0.0019147369312122464, "learning_rate": 1.3609586267328955e-06, "loss": 0.0597, "num_input_tokens_seen": 8285360, "step": 16830 }, { "epoch": 2.221855615679029, "grad_norm": 0.01568089984357357, "learning_rate": 1.3605289497421974e-06, "loss": 0.0002, "num_input_tokens_seen": 8287728, "step": 16835 }, { "epoch": 2.222515507456777, "grad_norm": 23.796926498413086, "learning_rate": 1.3600991962318403e-06, "loss": 0.1485, "num_input_tokens_seen": 8290288, "step": 16840 }, { "epoch": 2.2231753992345253, "grad_norm": 0.047032494097948074, "learning_rate": 1.3596693662930365e-06, "loss": 0.0736, "num_input_tokens_seen": 8292720, "step": 16845 }, { "epoch": 2.223835291012274, "grad_norm": 0.1415061354637146, "learning_rate": 1.3592394600170142e-06, "loss": 0.0003, "num_input_tokens_seen": 8295280, "step": 16850 }, { "epoch": 2.2244951827900223, "grad_norm": 0.022507498040795326, "learning_rate": 1.3588094774950181e-06, "loss": 0.0475, "num_input_tokens_seen": 8297648, "step": 16855 }, { "epoch": 2.225155074567771, "grad_norm": 0.02734128013253212, "learning_rate": 1.3583794188183087e-06, "loss": 0.0073, "num_input_tokens_seen": 8300016, "step": 16860 }, { "epoch": 2.2258149663455193, "grad_norm": 1.2373629808425903, "learning_rate": 1.3579492840781625e-06, "loss": 0.0011, "num_input_tokens_seen": 8302512, "step": 16865 }, { "epoch": 2.2264748581232676, "grad_norm": 0.1843707114458084, "learning_rate": 1.357519073365873e-06, "loss": 0.1018, "num_input_tokens_seen": 8305136, "step": 16870 }, { "epoch": 2.2271347499010163, "grad_norm": 0.016257228329777718, "learning_rate": 1.357088786772749e-06, "loss": 0.0002, "num_input_tokens_seen": 8307696, "step": 16875 }, { "epoch": 2.2277946416787646, "grad_norm": 0.0035908890422433615, "learning_rate": 1.3566584243901163e-06, "loss": 0.0004, "num_input_tokens_seen": 8310000, "step": 16880 }, { "epoch": 2.228454533456513, "grad_norm": 0.022007431834936142, "learning_rate": 1.3562279863093154e-06, "loss": 0.0002, "num_input_tokens_seen": 8312304, "step": 16885 }, { "epoch": 2.2291144252342616, "grad_norm": 0.006274157669395208, "learning_rate": 1.3557974726217041e-06, "loss": 0.0001, "num_input_tokens_seen": 8314672, "step": 16890 }, { "epoch": 2.22977431701201, "grad_norm": 0.01680285483598709, "learning_rate": 1.3553668834186556e-06, "loss": 0.0003, "num_input_tokens_seen": 8317168, "step": 16895 }, { "epoch": 2.2304342087897586, "grad_norm": 0.0044866399839520454, "learning_rate": 1.3549362187915593e-06, "loss": 0.0642, "num_input_tokens_seen": 8319792, "step": 16900 }, { "epoch": 2.231094100567507, "grad_norm": 0.11140000075101852, "learning_rate": 1.3545054788318212e-06, "loss": 0.0002, "num_input_tokens_seen": 8322352, "step": 16905 }, { "epoch": 2.231753992345255, "grad_norm": 26.079273223876953, "learning_rate": 1.3540746636308623e-06, "loss": 0.1334, "num_input_tokens_seen": 8324848, "step": 16910 }, { "epoch": 2.232413884123004, "grad_norm": 0.0065078651532530785, "learning_rate": 1.3536437732801198e-06, "loss": 0.0002, "num_input_tokens_seen": 8327088, "step": 16915 }, { "epoch": 2.233073775900752, "grad_norm": 0.6009857058525085, "learning_rate": 1.3532128078710474e-06, "loss": 0.0014, "num_input_tokens_seen": 8329712, "step": 16920 }, { "epoch": 2.233733667678501, "grad_norm": 0.03584284335374832, "learning_rate": 1.3527817674951143e-06, "loss": 0.0088, "num_input_tokens_seen": 8332336, "step": 16925 }, { "epoch": 2.234393559456249, "grad_norm": 0.07889010012149811, "learning_rate": 1.3523506522438056e-06, "loss": 0.0002, "num_input_tokens_seen": 8334640, "step": 16930 }, { "epoch": 2.2350534512339975, "grad_norm": 14.214226722717285, "learning_rate": 1.3519194622086227e-06, "loss": 0.0535, "num_input_tokens_seen": 8337072, "step": 16935 }, { "epoch": 2.235713343011746, "grad_norm": 66.53902435302734, "learning_rate": 1.3514881974810823e-06, "loss": 0.0806, "num_input_tokens_seen": 8339376, "step": 16940 }, { "epoch": 2.2363732347894945, "grad_norm": 0.24029788374900818, "learning_rate": 1.3510568581527171e-06, "loss": 0.1198, "num_input_tokens_seen": 8341616, "step": 16945 }, { "epoch": 2.237033126567243, "grad_norm": 0.02471575327217579, "learning_rate": 1.3506254443150761e-06, "loss": 0.0794, "num_input_tokens_seen": 8344176, "step": 16950 }, { "epoch": 2.2376930183449915, "grad_norm": 0.006826276425272226, "learning_rate": 1.3501939560597233e-06, "loss": 0.0985, "num_input_tokens_seen": 8346608, "step": 16955 }, { "epoch": 2.2383529101227397, "grad_norm": 0.03343689441680908, "learning_rate": 1.3497623934782397e-06, "loss": 0.0123, "num_input_tokens_seen": 8349424, "step": 16960 }, { "epoch": 2.2390128019004885, "grad_norm": 0.22107385098934174, "learning_rate": 1.3493307566622204e-06, "loss": 0.0019, "num_input_tokens_seen": 8351728, "step": 16965 }, { "epoch": 2.2396726936782367, "grad_norm": 6.898412704467773, "learning_rate": 1.3488990457032778e-06, "loss": 0.0007, "num_input_tokens_seen": 8354160, "step": 16970 }, { "epoch": 2.240332585455985, "grad_norm": 0.013253572396934032, "learning_rate": 1.3484672606930393e-06, "loss": 0.0096, "num_input_tokens_seen": 8356272, "step": 16975 }, { "epoch": 2.2409924772337337, "grad_norm": 0.018749356269836426, "learning_rate": 1.3480354017231483e-06, "loss": 0.0001, "num_input_tokens_seen": 8358576, "step": 16980 }, { "epoch": 2.241652369011482, "grad_norm": 0.25545141100883484, "learning_rate": 1.3476034688852633e-06, "loss": 0.0025, "num_input_tokens_seen": 8361008, "step": 16985 }, { "epoch": 2.2423122607892307, "grad_norm": 0.03576225787401199, "learning_rate": 1.3471714622710595e-06, "loss": 0.056, "num_input_tokens_seen": 8363504, "step": 16990 }, { "epoch": 2.242972152566979, "grad_norm": 0.005058703012764454, "learning_rate": 1.3467393819722265e-06, "loss": 0.0, "num_input_tokens_seen": 8365680, "step": 16995 }, { "epoch": 2.2436320443447273, "grad_norm": 0.02662578783929348, "learning_rate": 1.3463072280804708e-06, "loss": 0.0002, "num_input_tokens_seen": 8368432, "step": 17000 }, { "epoch": 2.244291936122476, "grad_norm": 0.00669122114777565, "learning_rate": 1.3458750006875134e-06, "loss": 0.0004, "num_input_tokens_seen": 8370736, "step": 17005 }, { "epoch": 2.2449518279002243, "grad_norm": 0.0020471608731895685, "learning_rate": 1.3454426998850919e-06, "loss": 0.0002, "num_input_tokens_seen": 8373488, "step": 17010 }, { "epoch": 2.245611719677973, "grad_norm": 0.02016337215900421, "learning_rate": 1.345010325764959e-06, "loss": 0.0001, "num_input_tokens_seen": 8375984, "step": 17015 }, { "epoch": 2.2462716114557213, "grad_norm": 0.015209107659757137, "learning_rate": 1.3445778784188828e-06, "loss": 0.0002, "num_input_tokens_seen": 8378480, "step": 17020 }, { "epoch": 2.2469315032334696, "grad_norm": 0.284445583820343, "learning_rate": 1.3441453579386468e-06, "loss": 0.0004, "num_input_tokens_seen": 8380656, "step": 17025 }, { "epoch": 2.2475913950112183, "grad_norm": 43.588706970214844, "learning_rate": 1.343712764416051e-06, "loss": 0.0447, "num_input_tokens_seen": 8383408, "step": 17030 }, { "epoch": 2.2482512867889666, "grad_norm": 0.0007529565482400358, "learning_rate": 1.3432800979429097e-06, "loss": 0.1017, "num_input_tokens_seen": 8385904, "step": 17035 }, { "epoch": 2.248911178566715, "grad_norm": 0.32451948523521423, "learning_rate": 1.3428473586110537e-06, "loss": 0.0847, "num_input_tokens_seen": 8388400, "step": 17040 }, { "epoch": 2.2495710703444636, "grad_norm": 0.013873131014406681, "learning_rate": 1.3424145465123286e-06, "loss": 0.0001, "num_input_tokens_seen": 8390640, "step": 17045 }, { "epoch": 2.250230962122212, "grad_norm": 0.0216103233397007, "learning_rate": 1.3419816617385953e-06, "loss": 0.0, "num_input_tokens_seen": 8393200, "step": 17050 }, { "epoch": 2.2508908538999606, "grad_norm": 0.006166580133140087, "learning_rate": 1.3415487043817311e-06, "loss": 0.0556, "num_input_tokens_seen": 8395632, "step": 17055 }, { "epoch": 2.2508908538999606, "eval_loss": 0.1500292271375656, "eval_runtime": 7.7652, "eval_samples_per_second": 867.336, "eval_steps_per_second": 108.433, "num_input_tokens_seen": 8395632, "step": 17055 }, { "epoch": 2.251550745677709, "grad_norm": 0.015901878476142883, "learning_rate": 1.3411156745336272e-06, "loss": 0.0, "num_input_tokens_seen": 8397872, "step": 17060 }, { "epoch": 2.252210637455457, "grad_norm": 0.4960050880908966, "learning_rate": 1.3406825722861921e-06, "loss": 0.0002, "num_input_tokens_seen": 8400432, "step": 17065 }, { "epoch": 2.252870529233206, "grad_norm": 0.009557882323861122, "learning_rate": 1.3402493977313476e-06, "loss": 0.0627, "num_input_tokens_seen": 8402608, "step": 17070 }, { "epoch": 2.253530421010954, "grad_norm": 0.17954318225383759, "learning_rate": 1.3398161509610324e-06, "loss": 0.0002, "num_input_tokens_seen": 8404848, "step": 17075 }, { "epoch": 2.254190312788703, "grad_norm": 0.003647751174867153, "learning_rate": 1.3393828320672e-06, "loss": 0.0001, "num_input_tokens_seen": 8407216, "step": 17080 }, { "epoch": 2.254850204566451, "grad_norm": 0.09699433296918869, "learning_rate": 1.3389494411418192e-06, "loss": 0.0001, "num_input_tokens_seen": 8409648, "step": 17085 }, { "epoch": 2.2555100963441994, "grad_norm": 0.026855552569031715, "learning_rate": 1.3385159782768738e-06, "loss": 0.0001, "num_input_tokens_seen": 8412016, "step": 17090 }, { "epoch": 2.256169988121948, "grad_norm": 0.015472883358597755, "learning_rate": 1.3380824435643633e-06, "loss": 0.0763, "num_input_tokens_seen": 8414448, "step": 17095 }, { "epoch": 2.2568298798996964, "grad_norm": 0.0023638952989131212, "learning_rate": 1.3376488370963027e-06, "loss": 0.1161, "num_input_tokens_seen": 8416752, "step": 17100 }, { "epoch": 2.257489771677445, "grad_norm": 0.0017636867705732584, "learning_rate": 1.3372151589647212e-06, "loss": 0.0001, "num_input_tokens_seen": 8419120, "step": 17105 }, { "epoch": 2.2581496634551934, "grad_norm": 28.924041748046875, "learning_rate": 1.3367814092616642e-06, "loss": 0.0876, "num_input_tokens_seen": 8421296, "step": 17110 }, { "epoch": 2.2588095552329417, "grad_norm": 10.672141075134277, "learning_rate": 1.336347588079192e-06, "loss": 0.0389, "num_input_tokens_seen": 8423536, "step": 17115 }, { "epoch": 2.2594694470106904, "grad_norm": 0.0031343603041023016, "learning_rate": 1.3359136955093798e-06, "loss": 0.0001, "num_input_tokens_seen": 8426096, "step": 17120 }, { "epoch": 2.2601293387884387, "grad_norm": 0.03127521276473999, "learning_rate": 1.335479731644318e-06, "loss": 0.0383, "num_input_tokens_seen": 8428464, "step": 17125 }, { "epoch": 2.260789230566187, "grad_norm": 0.004107305780053139, "learning_rate": 1.3350456965761127e-06, "loss": 0.0004, "num_input_tokens_seen": 8431088, "step": 17130 }, { "epoch": 2.2614491223439357, "grad_norm": 0.032716382294893265, "learning_rate": 1.3346115903968845e-06, "loss": 0.0001, "num_input_tokens_seen": 8433328, "step": 17135 }, { "epoch": 2.262109014121684, "grad_norm": 0.14623123407363892, "learning_rate": 1.3341774131987694e-06, "loss": 0.0002, "num_input_tokens_seen": 8435760, "step": 17140 }, { "epoch": 2.2627689058994322, "grad_norm": 2.8802661895751953, "learning_rate": 1.333743165073918e-06, "loss": 0.0319, "num_input_tokens_seen": 8437936, "step": 17145 }, { "epoch": 2.263428797677181, "grad_norm": 0.004641890060156584, "learning_rate": 1.3333088461144967e-06, "loss": 0.0011, "num_input_tokens_seen": 8440496, "step": 17150 }, { "epoch": 2.2640886894549292, "grad_norm": 14.824410438537598, "learning_rate": 1.3328744564126868e-06, "loss": 0.0517, "num_input_tokens_seen": 8442736, "step": 17155 }, { "epoch": 2.264748581232678, "grad_norm": 0.06626929342746735, "learning_rate": 1.3324399960606835e-06, "loss": 0.1567, "num_input_tokens_seen": 8445424, "step": 17160 }, { "epoch": 2.2654084730104262, "grad_norm": 0.00443276995792985, "learning_rate": 1.3320054651506985e-06, "loss": 0.0549, "num_input_tokens_seen": 8448048, "step": 17165 }, { "epoch": 2.2660683647881745, "grad_norm": 0.03811829909682274, "learning_rate": 1.331570863774958e-06, "loss": 0.0001, "num_input_tokens_seen": 8450288, "step": 17170 }, { "epoch": 2.2667282565659232, "grad_norm": 0.09436263144016266, "learning_rate": 1.3311361920257024e-06, "loss": 0.0566, "num_input_tokens_seen": 8452592, "step": 17175 }, { "epoch": 2.2673881483436715, "grad_norm": 0.13137038052082062, "learning_rate": 1.3307014499951882e-06, "loss": 0.0006, "num_input_tokens_seen": 8454960, "step": 17180 }, { "epoch": 2.2680480401214203, "grad_norm": 0.032147493213415146, "learning_rate": 1.3302666377756859e-06, "loss": 0.0003, "num_input_tokens_seen": 8457328, "step": 17185 }, { "epoch": 2.2687079318991685, "grad_norm": 0.01971900463104248, "learning_rate": 1.3298317554594813e-06, "loss": 0.083, "num_input_tokens_seen": 8459824, "step": 17190 }, { "epoch": 2.269367823676917, "grad_norm": 0.014136100187897682, "learning_rate": 1.3293968031388752e-06, "loss": 0.0001, "num_input_tokens_seen": 8462448, "step": 17195 }, { "epoch": 2.2700277154546655, "grad_norm": 30.553123474121094, "learning_rate": 1.3289617809061827e-06, "loss": 0.1421, "num_input_tokens_seen": 8464752, "step": 17200 }, { "epoch": 2.270687607232414, "grad_norm": 23.864639282226562, "learning_rate": 1.3285266888537346e-06, "loss": 0.0642, "num_input_tokens_seen": 8467184, "step": 17205 }, { "epoch": 2.2713474990101625, "grad_norm": 0.978489100933075, "learning_rate": 1.3280915270738754e-06, "loss": 0.1786, "num_input_tokens_seen": 8469680, "step": 17210 }, { "epoch": 2.272007390787911, "grad_norm": 0.007936615496873856, "learning_rate": 1.3276562956589656e-06, "loss": 0.0017, "num_input_tokens_seen": 8471920, "step": 17215 }, { "epoch": 2.272667282565659, "grad_norm": 0.06761188060045242, "learning_rate": 1.32722099470138e-06, "loss": 0.0002, "num_input_tokens_seen": 8474608, "step": 17220 }, { "epoch": 2.273327174343408, "grad_norm": 0.005021695047616959, "learning_rate": 1.3267856242935076e-06, "loss": 0.0253, "num_input_tokens_seen": 8476848, "step": 17225 }, { "epoch": 2.273987066121156, "grad_norm": 0.03890543803572655, "learning_rate": 1.3263501845277528e-06, "loss": 0.0448, "num_input_tokens_seen": 8479280, "step": 17230 }, { "epoch": 2.274646957898905, "grad_norm": 0.0053220209665596485, "learning_rate": 1.3259146754965346e-06, "loss": 0.0008, "num_input_tokens_seen": 8481776, "step": 17235 }, { "epoch": 2.275306849676653, "grad_norm": 0.011229145340621471, "learning_rate": 1.3254790972922867e-06, "loss": 0.0031, "num_input_tokens_seen": 8484208, "step": 17240 }, { "epoch": 2.2759667414544014, "grad_norm": 0.003528717439621687, "learning_rate": 1.3250434500074574e-06, "loss": 0.047, "num_input_tokens_seen": 8486832, "step": 17245 }, { "epoch": 2.27662663323215, "grad_norm": 0.02159113623201847, "learning_rate": 1.3246077337345097e-06, "loss": 0.0611, "num_input_tokens_seen": 8489328, "step": 17250 }, { "epoch": 2.2772865250098984, "grad_norm": 0.005418762564659119, "learning_rate": 1.3241719485659206e-06, "loss": 0.0427, "num_input_tokens_seen": 8491696, "step": 17255 }, { "epoch": 2.2779464167876466, "grad_norm": 0.4703400135040283, "learning_rate": 1.3237360945941834e-06, "loss": 0.0004, "num_input_tokens_seen": 8494320, "step": 17260 }, { "epoch": 2.2786063085653954, "grad_norm": 44.323387145996094, "learning_rate": 1.3233001719118043e-06, "loss": 0.0442, "num_input_tokens_seen": 8496560, "step": 17265 }, { "epoch": 2.2792662003431436, "grad_norm": 0.024648388847708702, "learning_rate": 1.3228641806113047e-06, "loss": 0.1099, "num_input_tokens_seen": 8498928, "step": 17270 }, { "epoch": 2.2799260921208924, "grad_norm": 0.035347118973731995, "learning_rate": 1.3224281207852213e-06, "loss": 0.0006, "num_input_tokens_seen": 8501552, "step": 17275 }, { "epoch": 2.2805859838986406, "grad_norm": 0.03767740726470947, "learning_rate": 1.3219919925261034e-06, "loss": 0.0854, "num_input_tokens_seen": 8503792, "step": 17280 }, { "epoch": 2.281245875676389, "grad_norm": 0.041668906807899475, "learning_rate": 1.321555795926517e-06, "loss": 0.0475, "num_input_tokens_seen": 8505776, "step": 17285 }, { "epoch": 2.2819057674541376, "grad_norm": 0.008051461540162563, "learning_rate": 1.3211195310790415e-06, "loss": 0.0001, "num_input_tokens_seen": 8508272, "step": 17290 }, { "epoch": 2.282565659231886, "grad_norm": 0.004521454218775034, "learning_rate": 1.3206831980762712e-06, "loss": 0.0002, "num_input_tokens_seen": 8510768, "step": 17295 }, { "epoch": 2.283225551009634, "grad_norm": 0.006502344273030758, "learning_rate": 1.320246797010814e-06, "loss": 0.0001, "num_input_tokens_seen": 8513520, "step": 17300 }, { "epoch": 2.283885442787383, "grad_norm": 0.025791212916374207, "learning_rate": 1.319810327975293e-06, "loss": 0.0009, "num_input_tokens_seen": 8516080, "step": 17305 }, { "epoch": 2.284545334565131, "grad_norm": 0.2241465449333191, "learning_rate": 1.3193737910623462e-06, "loss": 0.0675, "num_input_tokens_seen": 8518448, "step": 17310 }, { "epoch": 2.28520522634288, "grad_norm": 0.03452041745185852, "learning_rate": 1.3189371863646246e-06, "loss": 0.0, "num_input_tokens_seen": 8520624, "step": 17315 }, { "epoch": 2.285865118120628, "grad_norm": 73.1903305053711, "learning_rate": 1.318500513974795e-06, "loss": 0.1649, "num_input_tokens_seen": 8523248, "step": 17320 }, { "epoch": 2.2865250098983765, "grad_norm": 0.013924540020525455, "learning_rate": 1.3180637739855376e-06, "loss": 0.0001, "num_input_tokens_seen": 8525552, "step": 17325 }, { "epoch": 2.287184901676125, "grad_norm": 0.006592525169253349, "learning_rate": 1.3176269664895476e-06, "loss": 0.0008, "num_input_tokens_seen": 8528048, "step": 17330 }, { "epoch": 2.2878447934538735, "grad_norm": 0.464189738035202, "learning_rate": 1.3171900915795338e-06, "loss": 0.0557, "num_input_tokens_seen": 8530480, "step": 17335 }, { "epoch": 2.288504685231622, "grad_norm": 0.0561629980802536, "learning_rate": 1.31675314934822e-06, "loss": 0.0002, "num_input_tokens_seen": 8533040, "step": 17340 }, { "epoch": 2.2891645770093705, "grad_norm": 16.04241371154785, "learning_rate": 1.316316139888344e-06, "loss": 0.1025, "num_input_tokens_seen": 8535536, "step": 17345 }, { "epoch": 2.2898244687871188, "grad_norm": 20.588781356811523, "learning_rate": 1.3158790632926579e-06, "loss": 0.0009, "num_input_tokens_seen": 8538032, "step": 17350 }, { "epoch": 2.2904843605648675, "grad_norm": 0.01548085082322359, "learning_rate": 1.3154419196539281e-06, "loss": 0.0595, "num_input_tokens_seen": 8540528, "step": 17355 }, { "epoch": 2.2911442523426158, "grad_norm": 0.01596042700111866, "learning_rate": 1.315004709064935e-06, "loss": 0.0001, "num_input_tokens_seen": 8543024, "step": 17360 }, { "epoch": 2.2918041441203645, "grad_norm": 0.006322584114968777, "learning_rate": 1.3145674316184736e-06, "loss": 0.0565, "num_input_tokens_seen": 8545520, "step": 17365 }, { "epoch": 2.2924640358981128, "grad_norm": 0.014929535798728466, "learning_rate": 1.3141300874073524e-06, "loss": 0.0457, "num_input_tokens_seen": 8548016, "step": 17370 }, { "epoch": 2.293123927675861, "grad_norm": 0.013482524082064629, "learning_rate": 1.3136926765243955e-06, "loss": 0.0002, "num_input_tokens_seen": 8550512, "step": 17375 }, { "epoch": 2.2937838194536098, "grad_norm": 29.703046798706055, "learning_rate": 1.3132551990624392e-06, "loss": 0.055, "num_input_tokens_seen": 8552816, "step": 17380 }, { "epoch": 2.294443711231358, "grad_norm": 1.5670666694641113, "learning_rate": 1.3128176551143352e-06, "loss": 0.06, "num_input_tokens_seen": 8555312, "step": 17385 }, { "epoch": 2.2951036030091063, "grad_norm": 24.488922119140625, "learning_rate": 1.3123800447729497e-06, "loss": 0.0493, "num_input_tokens_seen": 8557552, "step": 17390 }, { "epoch": 2.295763494786855, "grad_norm": 0.008273608982563019, "learning_rate": 1.3119423681311612e-06, "loss": 0.0001, "num_input_tokens_seen": 8559920, "step": 17395 }, { "epoch": 2.2964233865646033, "grad_norm": 0.031402088701725006, "learning_rate": 1.3115046252818644e-06, "loss": 0.0001, "num_input_tokens_seen": 8562544, "step": 17400 }, { "epoch": 2.297083278342352, "grad_norm": 0.015473921783268452, "learning_rate": 1.3110668163179664e-06, "loss": 0.0001, "num_input_tokens_seen": 8565168, "step": 17405 }, { "epoch": 2.2977431701201003, "grad_norm": 0.061862409114837646, "learning_rate": 1.3106289413323891e-06, "loss": 0.0382, "num_input_tokens_seen": 8567664, "step": 17410 }, { "epoch": 2.2984030618978486, "grad_norm": 0.0838281512260437, "learning_rate": 1.3101910004180685e-06, "loss": 0.006, "num_input_tokens_seen": 8569776, "step": 17415 }, { "epoch": 2.2990629536755973, "grad_norm": 0.003964480943977833, "learning_rate": 1.3097529936679545e-06, "loss": 0.0004, "num_input_tokens_seen": 8571952, "step": 17420 }, { "epoch": 2.2997228454533456, "grad_norm": 0.006393681280314922, "learning_rate": 1.3093149211750105e-06, "loss": 0.0001, "num_input_tokens_seen": 8574384, "step": 17425 }, { "epoch": 2.300382737231094, "grad_norm": 0.003106580814346671, "learning_rate": 1.3088767830322142e-06, "loss": 0.0007, "num_input_tokens_seen": 8576816, "step": 17430 }, { "epoch": 2.3010426290088426, "grad_norm": 0.001471186289563775, "learning_rate": 1.3084385793325575e-06, "loss": 0.0389, "num_input_tokens_seen": 8579184, "step": 17435 }, { "epoch": 2.301702520786591, "grad_norm": 0.001006085192784667, "learning_rate": 1.308000310169046e-06, "loss": 0.0005, "num_input_tokens_seen": 8581616, "step": 17440 }, { "epoch": 2.3023624125643396, "grad_norm": 0.0674763098359108, "learning_rate": 1.307561975634699e-06, "loss": 0.0001, "num_input_tokens_seen": 8584048, "step": 17445 }, { "epoch": 2.303022304342088, "grad_norm": 0.01596246100962162, "learning_rate": 1.3071235758225497e-06, "loss": 0.0, "num_input_tokens_seen": 8586288, "step": 17450 }, { "epoch": 2.303682196119836, "grad_norm": 0.05637786537408829, "learning_rate": 1.3066851108256457e-06, "loss": 0.0002, "num_input_tokens_seen": 8588784, "step": 17455 }, { "epoch": 2.304342087897585, "grad_norm": 0.008062033914029598, "learning_rate": 1.3062465807370475e-06, "loss": 0.0577, "num_input_tokens_seen": 8591216, "step": 17460 }, { "epoch": 2.305001979675333, "grad_norm": 0.020235441625118256, "learning_rate": 1.3058079856498302e-06, "loss": 0.1142, "num_input_tokens_seen": 8593904, "step": 17465 }, { "epoch": 2.305661871453082, "grad_norm": 0.012712682597339153, "learning_rate": 1.3053693256570829e-06, "loss": 0.0, "num_input_tokens_seen": 8596208, "step": 17470 }, { "epoch": 2.30632176323083, "grad_norm": 0.21507523953914642, "learning_rate": 1.304930600851907e-06, "loss": 0.085, "num_input_tokens_seen": 8598768, "step": 17475 }, { "epoch": 2.3069816550085784, "grad_norm": 0.2181164175271988, "learning_rate": 1.3044918113274195e-06, "loss": 0.0002, "num_input_tokens_seen": 8601008, "step": 17480 }, { "epoch": 2.307641546786327, "grad_norm": 0.004835736472159624, "learning_rate": 1.3040529571767498e-06, "loss": 0.127, "num_input_tokens_seen": 8603632, "step": 17485 }, { "epoch": 2.3083014385640754, "grad_norm": 0.42590686678886414, "learning_rate": 1.3036140384930416e-06, "loss": 0.0004, "num_input_tokens_seen": 8605872, "step": 17490 }, { "epoch": 2.308961330341824, "grad_norm": 0.06867695599794388, "learning_rate": 1.3031750553694528e-06, "loss": 0.0001, "num_input_tokens_seen": 8608432, "step": 17495 }, { "epoch": 2.3096212221195724, "grad_norm": 0.010033776052296162, "learning_rate": 1.3027360078991535e-06, "loss": 0.0001, "num_input_tokens_seen": 8610736, "step": 17500 }, { "epoch": 2.3102811138973207, "grad_norm": 0.037731487303972244, "learning_rate": 1.302296896175329e-06, "loss": 0.0782, "num_input_tokens_seen": 8613616, "step": 17505 }, { "epoch": 2.3109410056750694, "grad_norm": 0.0016087280819192529, "learning_rate": 1.3018577202911774e-06, "loss": 0.0256, "num_input_tokens_seen": 8616048, "step": 17510 }, { "epoch": 2.3116008974528177, "grad_norm": 21.856361389160156, "learning_rate": 1.3014184803399104e-06, "loss": 0.1737, "num_input_tokens_seen": 8618224, "step": 17515 }, { "epoch": 2.312260789230566, "grad_norm": 0.1102667823433876, "learning_rate": 1.3009791764147537e-06, "loss": 0.0467, "num_input_tokens_seen": 8620784, "step": 17520 }, { "epoch": 2.3129206810083147, "grad_norm": 0.007832813076674938, "learning_rate": 1.3005398086089462e-06, "loss": 0.0, "num_input_tokens_seen": 8623152, "step": 17525 }, { "epoch": 2.313580572786063, "grad_norm": 0.10257086157798767, "learning_rate": 1.3001003770157409e-06, "loss": 0.034, "num_input_tokens_seen": 8625456, "step": 17530 }, { "epoch": 2.3142404645638117, "grad_norm": 0.22831708192825317, "learning_rate": 1.2996608817284033e-06, "loss": 0.0005, "num_input_tokens_seen": 8627952, "step": 17535 }, { "epoch": 2.31490035634156, "grad_norm": 0.02536954917013645, "learning_rate": 1.2992213228402142e-06, "loss": 0.0003, "num_input_tokens_seen": 8630640, "step": 17540 }, { "epoch": 2.3155602481193083, "grad_norm": 0.013890203088521957, "learning_rate": 1.2987817004444654e-06, "loss": 0.0008, "num_input_tokens_seen": 8633520, "step": 17545 }, { "epoch": 2.316220139897057, "grad_norm": 0.0018548115622252226, "learning_rate": 1.2983420146344648e-06, "loss": 0.0799, "num_input_tokens_seen": 8636208, "step": 17550 }, { "epoch": 2.3168800316748053, "grad_norm": 0.05500582605600357, "learning_rate": 1.297902265503532e-06, "loss": 0.0427, "num_input_tokens_seen": 8638512, "step": 17555 }, { "epoch": 2.3175399234525536, "grad_norm": 0.0030148825608193874, "learning_rate": 1.2974624531450003e-06, "loss": 0.1341, "num_input_tokens_seen": 8640944, "step": 17560 }, { "epoch": 2.3181998152303023, "grad_norm": 0.014523412100970745, "learning_rate": 1.2970225776522172e-06, "loss": 0.1493, "num_input_tokens_seen": 8643632, "step": 17565 }, { "epoch": 2.3188597070080506, "grad_norm": 0.004904015921056271, "learning_rate": 1.2965826391185425e-06, "loss": 0.018, "num_input_tokens_seen": 8646064, "step": 17570 }, { "epoch": 2.3195195987857993, "grad_norm": 0.07339364290237427, "learning_rate": 1.2961426376373507e-06, "loss": 0.0023, "num_input_tokens_seen": 8648560, "step": 17575 }, { "epoch": 2.3201794905635476, "grad_norm": 0.0362345390021801, "learning_rate": 1.2957025733020285e-06, "loss": 0.0002, "num_input_tokens_seen": 8651056, "step": 17580 }, { "epoch": 2.320839382341296, "grad_norm": 0.013479121029376984, "learning_rate": 1.2952624462059767e-06, "loss": 0.0002, "num_input_tokens_seen": 8653552, "step": 17585 }, { "epoch": 2.3214992741190446, "grad_norm": 0.06915712356567383, "learning_rate": 1.2948222564426083e-06, "loss": 0.0009, "num_input_tokens_seen": 8656048, "step": 17590 }, { "epoch": 2.322159165896793, "grad_norm": 0.023110028356313705, "learning_rate": 1.2943820041053512e-06, "loss": 0.0003, "num_input_tokens_seen": 8658352, "step": 17595 }, { "epoch": 2.3228190576745416, "grad_norm": 0.01199672743678093, "learning_rate": 1.2939416892876451e-06, "loss": 0.0001, "num_input_tokens_seen": 8660720, "step": 17600 }, { "epoch": 2.32347894945229, "grad_norm": 0.0005031914915889502, "learning_rate": 1.2935013120829443e-06, "loss": 0.0005, "num_input_tokens_seen": 8663024, "step": 17605 }, { "epoch": 2.324138841230038, "grad_norm": 16.33403205871582, "learning_rate": 1.2930608725847156e-06, "loss": 0.0411, "num_input_tokens_seen": 8665392, "step": 17610 }, { "epoch": 2.324798733007787, "grad_norm": 0.02279752679169178, "learning_rate": 1.2926203708864385e-06, "loss": 0.0001, "num_input_tokens_seen": 8667824, "step": 17615 }, { "epoch": 2.325458624785535, "grad_norm": 27.623626708984375, "learning_rate": 1.2921798070816068e-06, "loss": 0.1861, "num_input_tokens_seen": 8670448, "step": 17620 }, { "epoch": 2.326118516563284, "grad_norm": 0.006070361007004976, "learning_rate": 1.2917391812637269e-06, "loss": 0.0, "num_input_tokens_seen": 8672944, "step": 17625 }, { "epoch": 2.326778408341032, "grad_norm": 0.35379403829574585, "learning_rate": 1.2912984935263183e-06, "loss": 0.0659, "num_input_tokens_seen": 8675248, "step": 17630 }, { "epoch": 2.3274383001187804, "grad_norm": 0.1110011488199234, "learning_rate": 1.290857743962914e-06, "loss": 0.0007, "num_input_tokens_seen": 8677680, "step": 17635 }, { "epoch": 2.328098191896529, "grad_norm": 0.007278754375874996, "learning_rate": 1.2904169326670596e-06, "loss": 0.0005, "num_input_tokens_seen": 8680048, "step": 17640 }, { "epoch": 2.3287580836742774, "grad_norm": 0.11822202056646347, "learning_rate": 1.2899760597323144e-06, "loss": 0.0002, "num_input_tokens_seen": 8682224, "step": 17645 }, { "epoch": 2.329417975452026, "grad_norm": 0.015329859219491482, "learning_rate": 1.2895351252522502e-06, "loss": 0.0956, "num_input_tokens_seen": 8684784, "step": 17650 }, { "epoch": 2.3300778672297744, "grad_norm": 0.6385002136230469, "learning_rate": 1.2890941293204525e-06, "loss": 0.0431, "num_input_tokens_seen": 8687088, "step": 17655 }, { "epoch": 2.3307377590075227, "grad_norm": 0.0072060092352330685, "learning_rate": 1.2886530720305193e-06, "loss": 0.0472, "num_input_tokens_seen": 8689264, "step": 17660 }, { "epoch": 2.3313976507852714, "grad_norm": 0.007306993473321199, "learning_rate": 1.2882119534760618e-06, "loss": 0.147, "num_input_tokens_seen": 8691760, "step": 17665 }, { "epoch": 2.3320575425630197, "grad_norm": 0.555959939956665, "learning_rate": 1.2877707737507043e-06, "loss": 0.0008, "num_input_tokens_seen": 8694128, "step": 17670 }, { "epoch": 2.332717434340768, "grad_norm": 0.020105179399251938, "learning_rate": 1.2873295329480837e-06, "loss": 0.0005, "num_input_tokens_seen": 8696688, "step": 17675 }, { "epoch": 2.3333773261185167, "grad_norm": 0.019031627103686333, "learning_rate": 1.2868882311618505e-06, "loss": 0.1152, "num_input_tokens_seen": 8699120, "step": 17680 }, { "epoch": 2.334037217896265, "grad_norm": 0.3021307587623596, "learning_rate": 1.286446868485668e-06, "loss": 0.0001, "num_input_tokens_seen": 8701552, "step": 17685 }, { "epoch": 2.3346971096740132, "grad_norm": 0.04043988510966301, "learning_rate": 1.2860054450132116e-06, "loss": 0.0001, "num_input_tokens_seen": 8704048, "step": 17690 }, { "epoch": 2.335357001451762, "grad_norm": 2.304701089859009, "learning_rate": 1.2855639608381706e-06, "loss": 0.0014, "num_input_tokens_seen": 8706480, "step": 17695 }, { "epoch": 2.3360168932295102, "grad_norm": 0.0803212970495224, "learning_rate": 1.2851224160542472e-06, "loss": 0.0017, "num_input_tokens_seen": 8709040, "step": 17700 }, { "epoch": 2.336676785007259, "grad_norm": 0.03722332417964935, "learning_rate": 1.2846808107551553e-06, "loss": 0.0613, "num_input_tokens_seen": 8711472, "step": 17705 }, { "epoch": 2.3373366767850072, "grad_norm": 0.012616438791155815, "learning_rate": 1.2842391450346228e-06, "loss": 0.0097, "num_input_tokens_seen": 8713904, "step": 17710 }, { "epoch": 2.3379965685627555, "grad_norm": 49.51243591308594, "learning_rate": 1.2837974189863902e-06, "loss": 0.1691, "num_input_tokens_seen": 8716144, "step": 17715 }, { "epoch": 2.3386564603405042, "grad_norm": 0.007718805689364672, "learning_rate": 1.2833556327042105e-06, "loss": 0.0004, "num_input_tokens_seen": 8718448, "step": 17720 }, { "epoch": 2.3393163521182525, "grad_norm": 24.115741729736328, "learning_rate": 1.2829137862818496e-06, "loss": 0.114, "num_input_tokens_seen": 8720624, "step": 17725 }, { "epoch": 2.3399762438960012, "grad_norm": 0.0347772054374218, "learning_rate": 1.2824718798130862e-06, "loss": 0.0005, "num_input_tokens_seen": 8723312, "step": 17730 }, { "epoch": 2.3406361356737495, "grad_norm": 0.018872858956456184, "learning_rate": 1.2820299133917122e-06, "loss": 0.1246, "num_input_tokens_seen": 8725680, "step": 17735 }, { "epoch": 2.341296027451498, "grad_norm": 0.022448837757110596, "learning_rate": 1.281587887111531e-06, "loss": 0.0008, "num_input_tokens_seen": 8727984, "step": 17740 }, { "epoch": 2.3419559192292465, "grad_norm": 0.041912712156772614, "learning_rate": 1.28114580106636e-06, "loss": 0.0371, "num_input_tokens_seen": 8730416, "step": 17745 }, { "epoch": 2.342615811006995, "grad_norm": 0.07139486819505692, "learning_rate": 1.2807036553500286e-06, "loss": 0.0004, "num_input_tokens_seen": 8733104, "step": 17750 }, { "epoch": 2.3432757027847435, "grad_norm": 0.19774889945983887, "learning_rate": 1.280261450056379e-06, "loss": 0.0003, "num_input_tokens_seen": 8735600, "step": 17755 }, { "epoch": 2.343935594562492, "grad_norm": 0.012960204854607582, "learning_rate": 1.2798191852792662e-06, "loss": 0.0001, "num_input_tokens_seen": 8738032, "step": 17760 }, { "epoch": 2.34459548634024, "grad_norm": 0.004742769058793783, "learning_rate": 1.2793768611125576e-06, "loss": 0.0002, "num_input_tokens_seen": 8740464, "step": 17765 }, { "epoch": 2.345255378117989, "grad_norm": 0.02533571422100067, "learning_rate": 1.2789344776501333e-06, "loss": 0.0731, "num_input_tokens_seen": 8742960, "step": 17770 }, { "epoch": 2.345915269895737, "grad_norm": 0.02396445721387863, "learning_rate": 1.2784920349858858e-06, "loss": 0.0001, "num_input_tokens_seen": 8745648, "step": 17775 }, { "epoch": 2.346575161673486, "grad_norm": 0.003967063035815954, "learning_rate": 1.278049533213721e-06, "loss": 0.0027, "num_input_tokens_seen": 8748272, "step": 17780 }, { "epoch": 2.347235053451234, "grad_norm": 0.05136928707361221, "learning_rate": 1.2776069724275557e-06, "loss": 0.0004, "num_input_tokens_seen": 8750832, "step": 17785 }, { "epoch": 2.3478949452289823, "grad_norm": 21.935380935668945, "learning_rate": 1.277164352721321e-06, "loss": 0.1713, "num_input_tokens_seen": 8753200, "step": 17790 }, { "epoch": 2.348554837006731, "grad_norm": 0.004382560960948467, "learning_rate": 1.27672167418896e-06, "loss": 0.0009, "num_input_tokens_seen": 8755824, "step": 17795 }, { "epoch": 2.3492147287844793, "grad_norm": 29.610349655151367, "learning_rate": 1.276278936924427e-06, "loss": 0.0133, "num_input_tokens_seen": 8758128, "step": 17800 }, { "epoch": 2.3498746205622276, "grad_norm": 0.5538921356201172, "learning_rate": 1.2758361410216902e-06, "loss": 0.0009, "num_input_tokens_seen": 8760624, "step": 17805 }, { "epoch": 2.3505345123399763, "grad_norm": 0.014471026137471199, "learning_rate": 1.2753932865747302e-06, "loss": 0.0004, "num_input_tokens_seen": 8762864, "step": 17810 }, { "epoch": 2.3511944041177246, "grad_norm": 53.138797760009766, "learning_rate": 1.2749503736775395e-06, "loss": 0.0598, "num_input_tokens_seen": 8765424, "step": 17815 }, { "epoch": 2.351854295895473, "grad_norm": 0.002378394827246666, "learning_rate": 1.2745074024241227e-06, "loss": 0.0, "num_input_tokens_seen": 8768048, "step": 17820 }, { "epoch": 2.3525141876732216, "grad_norm": 0.09277091920375824, "learning_rate": 1.2740643729084974e-06, "loss": 0.0296, "num_input_tokens_seen": 8770672, "step": 17825 }, { "epoch": 2.35317407945097, "grad_norm": 92.40314483642578, "learning_rate": 1.273621285224694e-06, "loss": 0.0406, "num_input_tokens_seen": 8773424, "step": 17830 }, { "epoch": 2.3538339712287186, "grad_norm": 0.003913685213774443, "learning_rate": 1.2731781394667538e-06, "loss": 0.0001, "num_input_tokens_seen": 8775792, "step": 17835 }, { "epoch": 2.354493863006467, "grad_norm": 0.012034501880407333, "learning_rate": 1.2727349357287322e-06, "loss": 0.0003, "num_input_tokens_seen": 8778288, "step": 17840 }, { "epoch": 2.355153754784215, "grad_norm": 0.0050900098867714405, "learning_rate": 1.2722916741046951e-06, "loss": 0.0001, "num_input_tokens_seen": 8780848, "step": 17845 }, { "epoch": 2.355813646561964, "grad_norm": 0.005646945908665657, "learning_rate": 1.2718483546887222e-06, "loss": 0.0007, "num_input_tokens_seen": 8783344, "step": 17850 }, { "epoch": 2.356473538339712, "grad_norm": 0.021207528188824654, "learning_rate": 1.2714049775749043e-06, "loss": 0.0002, "num_input_tokens_seen": 8785776, "step": 17855 }, { "epoch": 2.357133430117461, "grad_norm": 0.0018820096738636494, "learning_rate": 1.2709615428573454e-06, "loss": 0.1, "num_input_tokens_seen": 8787952, "step": 17860 }, { "epoch": 2.357793321895209, "grad_norm": 0.023304801434278488, "learning_rate": 1.2705180506301614e-06, "loss": 0.2573, "num_input_tokens_seen": 8790512, "step": 17865 }, { "epoch": 2.3584532136729575, "grad_norm": 0.012742428109049797, "learning_rate": 1.2700745009874799e-06, "loss": 0.0, "num_input_tokens_seen": 8792816, "step": 17870 }, { "epoch": 2.359113105450706, "grad_norm": 38.9813117980957, "learning_rate": 1.2696308940234414e-06, "loss": 0.0752, "num_input_tokens_seen": 8795184, "step": 17875 }, { "epoch": 2.3597729972284545, "grad_norm": 0.10750970989465714, "learning_rate": 1.2691872298321978e-06, "loss": 0.1042, "num_input_tokens_seen": 8797808, "step": 17880 }, { "epoch": 2.360432889006203, "grad_norm": 0.09598078578710556, "learning_rate": 1.2687435085079143e-06, "loss": 0.0013, "num_input_tokens_seen": 8800368, "step": 17885 }, { "epoch": 2.3610927807839515, "grad_norm": 0.028722476214170456, "learning_rate": 1.2682997301447671e-06, "loss": 0.2316, "num_input_tokens_seen": 8802992, "step": 17890 }, { "epoch": 2.3617526725616997, "grad_norm": 0.002694531111046672, "learning_rate": 1.267855894836945e-06, "loss": 0.0001, "num_input_tokens_seen": 8805744, "step": 17895 }, { "epoch": 2.3624125643394485, "grad_norm": 0.006382739171385765, "learning_rate": 1.267412002678649e-06, "loss": 0.0021, "num_input_tokens_seen": 8807984, "step": 17900 }, { "epoch": 2.3630724561171967, "grad_norm": 0.1750914603471756, "learning_rate": 1.2669680537640916e-06, "loss": 0.0008, "num_input_tokens_seen": 8810480, "step": 17905 }, { "epoch": 2.3637323478949455, "grad_norm": 0.13401751220226288, "learning_rate": 1.2665240481874986e-06, "loss": 0.001, "num_input_tokens_seen": 8812784, "step": 17910 }, { "epoch": 2.3643922396726937, "grad_norm": 0.0007668877951800823, "learning_rate": 1.266079986043106e-06, "loss": 0.0002, "num_input_tokens_seen": 8815344, "step": 17915 }, { "epoch": 2.365052131450442, "grad_norm": 0.0035453704185783863, "learning_rate": 1.2656358674251633e-06, "loss": 0.1032, "num_input_tokens_seen": 8817776, "step": 17920 }, { "epoch": 2.3657120232281907, "grad_norm": 0.010339041240513325, "learning_rate": 1.2651916924279311e-06, "loss": 0.0633, "num_input_tokens_seen": 8820464, "step": 17925 }, { "epoch": 2.366371915005939, "grad_norm": 0.012196065858006477, "learning_rate": 1.2647474611456827e-06, "loss": 0.1189, "num_input_tokens_seen": 8823280, "step": 17930 }, { "epoch": 2.3670318067836873, "grad_norm": 160.40484619140625, "learning_rate": 1.2643031736727029e-06, "loss": 0.127, "num_input_tokens_seen": 8825776, "step": 17935 }, { "epoch": 2.367691698561436, "grad_norm": 0.037526555359363556, "learning_rate": 1.2638588301032883e-06, "loss": 0.1263, "num_input_tokens_seen": 8828016, "step": 17940 }, { "epoch": 2.3683515903391843, "grad_norm": 20.34902572631836, "learning_rate": 1.2634144305317479e-06, "loss": 0.0705, "num_input_tokens_seen": 8830192, "step": 17945 }, { "epoch": 2.3690114821169326, "grad_norm": 15.509211540222168, "learning_rate": 1.2629699750524017e-06, "loss": 0.0828, "num_input_tokens_seen": 8832624, "step": 17950 }, { "epoch": 2.3696713738946813, "grad_norm": 0.048276953399181366, "learning_rate": 1.2625254637595829e-06, "loss": 0.0549, "num_input_tokens_seen": 8835248, "step": 17955 }, { "epoch": 2.3703312656724296, "grad_norm": 0.18577997386455536, "learning_rate": 1.2620808967476352e-06, "loss": 0.0008, "num_input_tokens_seen": 8838000, "step": 17960 }, { "epoch": 2.3709911574501783, "grad_norm": 0.020469972863793373, "learning_rate": 1.2616362741109154e-06, "loss": 0.0022, "num_input_tokens_seen": 8840688, "step": 17965 }, { "epoch": 2.3716510492279266, "grad_norm": 0.005565830506384373, "learning_rate": 1.2611915959437908e-06, "loss": 0.1106, "num_input_tokens_seen": 8843120, "step": 17970 }, { "epoch": 2.372310941005675, "grad_norm": 0.043362200260162354, "learning_rate": 1.2607468623406415e-06, "loss": 0.0799, "num_input_tokens_seen": 8845616, "step": 17975 }, { "epoch": 2.3729708327834236, "grad_norm": 14.814071655273438, "learning_rate": 1.2603020733958588e-06, "loss": 0.002, "num_input_tokens_seen": 8847856, "step": 17980 }, { "epoch": 2.373630724561172, "grad_norm": 0.011253765784204006, "learning_rate": 1.2598572292038459e-06, "loss": 0.0003, "num_input_tokens_seen": 8850480, "step": 17985 }, { "epoch": 2.3742906163389206, "grad_norm": 0.09818299859762192, "learning_rate": 1.2594123298590177e-06, "loss": 0.0006, "num_input_tokens_seen": 8853232, "step": 17990 }, { "epoch": 2.374950508116669, "grad_norm": 0.07499930262565613, "learning_rate": 1.2589673754558014e-06, "loss": 0.0675, "num_input_tokens_seen": 8855664, "step": 17995 }, { "epoch": 2.375610399894417, "grad_norm": 0.005919842980802059, "learning_rate": 1.2585223660886347e-06, "loss": 0.0001, "num_input_tokens_seen": 8858160, "step": 18000 }, { "epoch": 2.376270291672166, "grad_norm": 21.43160057067871, "learning_rate": 1.258077301851968e-06, "loss": 0.0011, "num_input_tokens_seen": 8860464, "step": 18005 }, { "epoch": 2.376930183449914, "grad_norm": 0.008869780227541924, "learning_rate": 1.2576321828402627e-06, "loss": 0.0613, "num_input_tokens_seen": 8862896, "step": 18010 }, { "epoch": 2.377590075227663, "grad_norm": 0.04058117792010307, "learning_rate": 1.2571870091479921e-06, "loss": 0.0488, "num_input_tokens_seen": 8865264, "step": 18015 }, { "epoch": 2.378249967005411, "grad_norm": 0.13646909594535828, "learning_rate": 1.2567417808696416e-06, "loss": 0.0703, "num_input_tokens_seen": 8867760, "step": 18020 }, { "epoch": 2.3789098587831594, "grad_norm": 0.10359276086091995, "learning_rate": 1.2562964980997072e-06, "loss": 0.0002, "num_input_tokens_seen": 8870448, "step": 18025 }, { "epoch": 2.379569750560908, "grad_norm": 0.049668870866298676, "learning_rate": 1.2558511609326968e-06, "loss": 0.046, "num_input_tokens_seen": 8873136, "step": 18030 }, { "epoch": 2.3802296423386564, "grad_norm": 0.005334274843335152, "learning_rate": 1.2554057694631302e-06, "loss": 0.0041, "num_input_tokens_seen": 8875632, "step": 18035 }, { "epoch": 2.380889534116405, "grad_norm": 0.021098989993333817, "learning_rate": 1.2549603237855386e-06, "loss": 0.0006, "num_input_tokens_seen": 8878384, "step": 18040 }, { "epoch": 2.3815494258941534, "grad_norm": 0.12568983435630798, "learning_rate": 1.2545148239944644e-06, "loss": 0.0615, "num_input_tokens_seen": 8880944, "step": 18045 }, { "epoch": 2.3822093176719017, "grad_norm": 0.002665475942194462, "learning_rate": 1.2540692701844625e-06, "loss": 0.0002, "num_input_tokens_seen": 8883568, "step": 18050 }, { "epoch": 2.3828692094496504, "grad_norm": 0.14754042029380798, "learning_rate": 1.253623662450097e-06, "loss": 0.052, "num_input_tokens_seen": 8886064, "step": 18055 }, { "epoch": 2.3835291012273987, "grad_norm": 0.2536415755748749, "learning_rate": 1.2531780008859464e-06, "loss": 0.0003, "num_input_tokens_seen": 8888816, "step": 18060 }, { "epoch": 2.384188993005147, "grad_norm": 0.5732733607292175, "learning_rate": 1.252732285586598e-06, "loss": 0.0738, "num_input_tokens_seen": 8891248, "step": 18065 }, { "epoch": 2.3848488847828957, "grad_norm": 0.2951080799102783, "learning_rate": 1.2522865166466528e-06, "loss": 0.0487, "num_input_tokens_seen": 8893808, "step": 18070 }, { "epoch": 2.385508776560644, "grad_norm": 27.109596252441406, "learning_rate": 1.2518406941607207e-06, "loss": 0.0383, "num_input_tokens_seen": 8896304, "step": 18075 }, { "epoch": 2.3861686683383927, "grad_norm": 0.049254752695560455, "learning_rate": 1.2513948182234253e-06, "loss": 0.0004, "num_input_tokens_seen": 8898672, "step": 18080 }, { "epoch": 2.386828560116141, "grad_norm": 0.0034236342180520296, "learning_rate": 1.2509488889293998e-06, "loss": 0.0002, "num_input_tokens_seen": 8901168, "step": 18085 }, { "epoch": 2.3874884518938893, "grad_norm": 0.6597353219985962, "learning_rate": 1.2505029063732898e-06, "loss": 0.0005, "num_input_tokens_seen": 8903600, "step": 18090 }, { "epoch": 2.388148343671638, "grad_norm": 0.019330337643623352, "learning_rate": 1.2500568706497526e-06, "loss": 0.0954, "num_input_tokens_seen": 8906032, "step": 18095 }, { "epoch": 2.3888082354493863, "grad_norm": 21.249752044677734, "learning_rate": 1.2496107818534548e-06, "loss": 0.0035, "num_input_tokens_seen": 8908400, "step": 18100 }, { "epoch": 2.3894681272271345, "grad_norm": 0.006237414199858904, "learning_rate": 1.2491646400790766e-06, "loss": 0.0573, "num_input_tokens_seen": 8910832, "step": 18105 }, { "epoch": 2.3901280190048833, "grad_norm": 0.045193735510110855, "learning_rate": 1.2487184454213073e-06, "loss": 0.0613, "num_input_tokens_seen": 8913200, "step": 18110 }, { "epoch": 2.3907879107826315, "grad_norm": 0.06255914270877838, "learning_rate": 1.2482721979748494e-06, "loss": 0.0002, "num_input_tokens_seen": 8915568, "step": 18115 }, { "epoch": 2.3914478025603803, "grad_norm": 0.07931576669216156, "learning_rate": 1.2478258978344149e-06, "loss": 0.0691, "num_input_tokens_seen": 8917680, "step": 18120 }, { "epoch": 2.3921076943381285, "grad_norm": 0.2358531802892685, "learning_rate": 1.2473795450947287e-06, "loss": 0.0109, "num_input_tokens_seen": 8920112, "step": 18125 }, { "epoch": 2.392767586115877, "grad_norm": 0.04463690146803856, "learning_rate": 1.2469331398505254e-06, "loss": 0.0002, "num_input_tokens_seen": 8922544, "step": 18130 }, { "epoch": 2.3934274778936255, "grad_norm": 0.07052083313465118, "learning_rate": 1.246486682196551e-06, "loss": 0.094, "num_input_tokens_seen": 8925040, "step": 18135 }, { "epoch": 2.394087369671374, "grad_norm": 0.026072848588228226, "learning_rate": 1.2460401722275633e-06, "loss": 0.0004, "num_input_tokens_seen": 8927408, "step": 18140 }, { "epoch": 2.3947472614491225, "grad_norm": 0.030193530023097992, "learning_rate": 1.2455936100383309e-06, "loss": 0.1117, "num_input_tokens_seen": 8929840, "step": 18145 }, { "epoch": 2.395407153226871, "grad_norm": 0.018104439601302147, "learning_rate": 1.2451469957236334e-06, "loss": 0.0003, "num_input_tokens_seen": 8932272, "step": 18150 }, { "epoch": 2.396067045004619, "grad_norm": 38.890846252441406, "learning_rate": 1.2447003293782607e-06, "loss": 0.0591, "num_input_tokens_seen": 8934640, "step": 18155 }, { "epoch": 2.396726936782368, "grad_norm": 0.17797276377677917, "learning_rate": 1.2442536110970152e-06, "loss": 0.0025, "num_input_tokens_seen": 8937200, "step": 18160 }, { "epoch": 2.397386828560116, "grad_norm": 0.001440043211914599, "learning_rate": 1.2438068409747097e-06, "loss": 0.0002, "num_input_tokens_seen": 8939568, "step": 18165 }, { "epoch": 2.398046720337865, "grad_norm": 0.002936235163360834, "learning_rate": 1.2433600191061677e-06, "loss": 0.0413, "num_input_tokens_seen": 8942000, "step": 18170 }, { "epoch": 2.398706612115613, "grad_norm": 0.006453686859458685, "learning_rate": 1.242913145586224e-06, "loss": 0.0088, "num_input_tokens_seen": 8944688, "step": 18175 }, { "epoch": 2.3993665038933614, "grad_norm": 83.48731231689453, "learning_rate": 1.2424662205097241e-06, "loss": 0.0345, "num_input_tokens_seen": 8947312, "step": 18180 }, { "epoch": 2.40002639567111, "grad_norm": 0.07596401125192642, "learning_rate": 1.2420192439715247e-06, "loss": 0.0002, "num_input_tokens_seen": 8949808, "step": 18185 }, { "epoch": 2.4006862874488584, "grad_norm": 0.005037888418883085, "learning_rate": 1.2415722160664933e-06, "loss": 0.0003, "num_input_tokens_seen": 8952112, "step": 18190 }, { "epoch": 2.4013461792266066, "grad_norm": 0.09627171605825424, "learning_rate": 1.2411251368895085e-06, "loss": 0.1239, "num_input_tokens_seen": 8954800, "step": 18195 }, { "epoch": 2.4020060710043554, "grad_norm": 0.0015911199152469635, "learning_rate": 1.2406780065354592e-06, "loss": 0.0001, "num_input_tokens_seen": 8957360, "step": 18200 }, { "epoch": 2.4026659627821036, "grad_norm": 0.015304110012948513, "learning_rate": 1.240230825099246e-06, "loss": 0.0001, "num_input_tokens_seen": 8960112, "step": 18205 }, { "epoch": 2.4033258545598524, "grad_norm": 0.9241950511932373, "learning_rate": 1.2397835926757798e-06, "loss": 0.0006, "num_input_tokens_seen": 8962608, "step": 18210 }, { "epoch": 2.4039857463376006, "grad_norm": 66.03406524658203, "learning_rate": 1.2393363093599823e-06, "loss": 0.133, "num_input_tokens_seen": 8965040, "step": 18215 }, { "epoch": 2.404645638115349, "grad_norm": 0.0014983558794483542, "learning_rate": 1.2388889752467867e-06, "loss": 0.0659, "num_input_tokens_seen": 8967216, "step": 18220 }, { "epoch": 2.4053055298930976, "grad_norm": 0.010034170933067799, "learning_rate": 1.2384415904311357e-06, "loss": 0.0, "num_input_tokens_seen": 8969712, "step": 18225 }, { "epoch": 2.405965421670846, "grad_norm": 0.17632248997688293, "learning_rate": 1.2379941550079836e-06, "loss": 0.0001, "num_input_tokens_seen": 8972208, "step": 18230 }, { "epoch": 2.406625313448594, "grad_norm": 27.587594985961914, "learning_rate": 1.2375466690722957e-06, "loss": 0.105, "num_input_tokens_seen": 8974768, "step": 18235 }, { "epoch": 2.407285205226343, "grad_norm": 0.012240786105394363, "learning_rate": 1.2370991327190473e-06, "loss": 0.0736, "num_input_tokens_seen": 8977200, "step": 18240 }, { "epoch": 2.407945097004091, "grad_norm": 0.0025328362826257944, "learning_rate": 1.2366515460432255e-06, "loss": 0.0, "num_input_tokens_seen": 8979568, "step": 18245 }, { "epoch": 2.40860498878184, "grad_norm": 0.08074627071619034, "learning_rate": 1.2362039091398259e-06, "loss": 0.0907, "num_input_tokens_seen": 8981936, "step": 18250 }, { "epoch": 2.409264880559588, "grad_norm": 0.031857311725616455, "learning_rate": 1.235756222103858e-06, "loss": 0.1141, "num_input_tokens_seen": 8984368, "step": 18255 }, { "epoch": 2.4099247723373365, "grad_norm": 17.715085983276367, "learning_rate": 1.2353084850303386e-06, "loss": 0.1377, "num_input_tokens_seen": 8986736, "step": 18260 }, { "epoch": 2.410584664115085, "grad_norm": 59.32007598876953, "learning_rate": 1.2348606980142973e-06, "loss": 0.1191, "num_input_tokens_seen": 8988720, "step": 18265 }, { "epoch": 2.4112445558928335, "grad_norm": 0.045384231954813004, "learning_rate": 1.2344128611507733e-06, "loss": 0.0002, "num_input_tokens_seen": 8990960, "step": 18270 }, { "epoch": 2.411904447670582, "grad_norm": 0.16882988810539246, "learning_rate": 1.2339649745348176e-06, "loss": 0.0002, "num_input_tokens_seen": 8993328, "step": 18275 }, { "epoch": 2.4125643394483305, "grad_norm": 0.03319350630044937, "learning_rate": 1.23351703826149e-06, "loss": 0.0004, "num_input_tokens_seen": 8995632, "step": 18280 }, { "epoch": 2.4132242312260788, "grad_norm": 0.022699033841490746, "learning_rate": 1.2330690524258618e-06, "loss": 0.046, "num_input_tokens_seen": 8998256, "step": 18285 }, { "epoch": 2.4138841230038275, "grad_norm": 0.07350515574216843, "learning_rate": 1.2326210171230152e-06, "loss": 0.1114, "num_input_tokens_seen": 9000368, "step": 18290 }, { "epoch": 2.4145440147815758, "grad_norm": 0.34410330653190613, "learning_rate": 1.2321729324480422e-06, "loss": 0.0758, "num_input_tokens_seen": 9002800, "step": 18295 }, { "epoch": 2.4152039065593245, "grad_norm": 15.627949714660645, "learning_rate": 1.2317247984960455e-06, "loss": 0.0675, "num_input_tokens_seen": 9005232, "step": 18300 }, { "epoch": 2.4158637983370728, "grad_norm": 0.03578624129295349, "learning_rate": 1.2312766153621383e-06, "loss": 0.0001, "num_input_tokens_seen": 9007920, "step": 18305 }, { "epoch": 2.416523690114821, "grad_norm": 0.06478110700845718, "learning_rate": 1.2308283831414444e-06, "loss": 0.0005, "num_input_tokens_seen": 9010416, "step": 18310 }, { "epoch": 2.4171835818925698, "grad_norm": 0.13254325091838837, "learning_rate": 1.2303801019290978e-06, "loss": 0.0003, "num_input_tokens_seen": 9013168, "step": 18315 }, { "epoch": 2.417843473670318, "grad_norm": 0.39260151982307434, "learning_rate": 1.2299317718202424e-06, "loss": 0.0476, "num_input_tokens_seen": 9015728, "step": 18320 }, { "epoch": 2.4185033654480668, "grad_norm": 0.05883554369211197, "learning_rate": 1.229483392910034e-06, "loss": 0.0326, "num_input_tokens_seen": 9018416, "step": 18325 }, { "epoch": 2.419163257225815, "grad_norm": 0.02526324987411499, "learning_rate": 1.229034965293637e-06, "loss": 0.0002, "num_input_tokens_seen": 9020976, "step": 18330 }, { "epoch": 2.4198231490035633, "grad_norm": 0.006224233657121658, "learning_rate": 1.2285864890662272e-06, "loss": 0.0002, "num_input_tokens_seen": 9023728, "step": 18335 }, { "epoch": 2.420483040781312, "grad_norm": 0.03532911092042923, "learning_rate": 1.2281379643229904e-06, "loss": 0.0007, "num_input_tokens_seen": 9026096, "step": 18340 }, { "epoch": 2.4211429325590603, "grad_norm": 25.137794494628906, "learning_rate": 1.2276893911591226e-06, "loss": 0.0509, "num_input_tokens_seen": 9028656, "step": 18345 }, { "epoch": 2.4218028243368086, "grad_norm": 0.00828655157238245, "learning_rate": 1.2272407696698303e-06, "loss": 0.0002, "num_input_tokens_seen": 9031472, "step": 18350 }, { "epoch": 2.4224627161145573, "grad_norm": 0.13335035741329193, "learning_rate": 1.2267920999503302e-06, "loss": 0.0005, "num_input_tokens_seen": 9034352, "step": 18355 }, { "epoch": 2.4231226078923056, "grad_norm": 0.013290558941662312, "learning_rate": 1.2263433820958494e-06, "loss": 0.0001, "num_input_tokens_seen": 9036720, "step": 18360 }, { "epoch": 2.423782499670054, "grad_norm": 0.19421987235546112, "learning_rate": 1.2258946162016247e-06, "loss": 0.0003, "num_input_tokens_seen": 9039216, "step": 18365 }, { "epoch": 2.4244423914478026, "grad_norm": 23.98882293701172, "learning_rate": 1.2254458023629035e-06, "loss": 0.0595, "num_input_tokens_seen": 9041584, "step": 18370 }, { "epoch": 2.425102283225551, "grad_norm": 0.0030773465987294912, "learning_rate": 1.2249969406749432e-06, "loss": 0.0442, "num_input_tokens_seen": 9043888, "step": 18375 }, { "epoch": 2.4257621750032996, "grad_norm": 0.006509008351713419, "learning_rate": 1.2245480312330117e-06, "loss": 0.0003, "num_input_tokens_seen": 9046320, "step": 18380 }, { "epoch": 2.426422066781048, "grad_norm": 0.011278158985078335, "learning_rate": 1.2240990741323867e-06, "loss": 0.0675, "num_input_tokens_seen": 9048880, "step": 18385 }, { "epoch": 2.427081958558796, "grad_norm": 0.00435879360884428, "learning_rate": 1.2236500694683555e-06, "loss": 0.0, "num_input_tokens_seen": 9051312, "step": 18390 }, { "epoch": 2.427741850336545, "grad_norm": 0.002087733941152692, "learning_rate": 1.223201017336217e-06, "loss": 0.1489, "num_input_tokens_seen": 9054000, "step": 18395 }, { "epoch": 2.428401742114293, "grad_norm": 0.0005177515558898449, "learning_rate": 1.222751917831279e-06, "loss": 0.001, "num_input_tokens_seen": 9056240, "step": 18400 }, { "epoch": 2.429061633892042, "grad_norm": 0.002716792980208993, "learning_rate": 1.2223027710488591e-06, "loss": 0.1132, "num_input_tokens_seen": 9058672, "step": 18405 }, { "epoch": 2.42972152566979, "grad_norm": 0.032944511622190475, "learning_rate": 1.221853577084286e-06, "loss": 0.0001, "num_input_tokens_seen": 9061104, "step": 18410 }, { "epoch": 2.4303814174475384, "grad_norm": 0.03812925145030022, "learning_rate": 1.221404336032898e-06, "loss": 0.1807, "num_input_tokens_seen": 9063280, "step": 18415 }, { "epoch": 2.431041309225287, "grad_norm": 0.022235559299588203, "learning_rate": 1.2209550479900425e-06, "loss": 0.0442, "num_input_tokens_seen": 9065840, "step": 18420 }, { "epoch": 2.4317012010030354, "grad_norm": 0.004477641079574823, "learning_rate": 1.2205057130510783e-06, "loss": 0.0, "num_input_tokens_seen": 9068016, "step": 18425 }, { "epoch": 2.432361092780784, "grad_norm": 0.01159152202308178, "learning_rate": 1.2200563313113732e-06, "loss": 0.0813, "num_input_tokens_seen": 9070448, "step": 18430 }, { "epoch": 2.4330209845585324, "grad_norm": 0.01895478554069996, "learning_rate": 1.2196069028663057e-06, "loss": 0.0473, "num_input_tokens_seen": 9072880, "step": 18435 }, { "epoch": 2.4336808763362807, "grad_norm": 0.10171425342559814, "learning_rate": 1.219157427811263e-06, "loss": 0.0002, "num_input_tokens_seen": 9075248, "step": 18440 }, { "epoch": 2.4343407681140294, "grad_norm": 22.22634506225586, "learning_rate": 1.218707906241643e-06, "loss": 0.0627, "num_input_tokens_seen": 9077744, "step": 18445 }, { "epoch": 2.4350006598917777, "grad_norm": 0.030533935874700546, "learning_rate": 1.2182583382528543e-06, "loss": 0.0003, "num_input_tokens_seen": 9080560, "step": 18450 }, { "epoch": 2.4356605516695264, "grad_norm": 0.002572020050138235, "learning_rate": 1.2178087239403133e-06, "loss": 0.0109, "num_input_tokens_seen": 9082992, "step": 18455 }, { "epoch": 2.4363204434472747, "grad_norm": 0.0030550749506801367, "learning_rate": 1.2173590633994479e-06, "loss": 0.0844, "num_input_tokens_seen": 9085552, "step": 18460 }, { "epoch": 2.436980335225023, "grad_norm": 0.007434781640768051, "learning_rate": 1.2169093567256955e-06, "loss": 0.0001, "num_input_tokens_seen": 9087728, "step": 18465 }, { "epoch": 2.4376402270027717, "grad_norm": 0.05250757560133934, "learning_rate": 1.2164596040145028e-06, "loss": 0.0004, "num_input_tokens_seen": 9089968, "step": 18470 }, { "epoch": 2.43830011878052, "grad_norm": 0.003924153745174408, "learning_rate": 1.2160098053613267e-06, "loss": 0.0011, "num_input_tokens_seen": 9092464, "step": 18475 }, { "epoch": 2.4389600105582683, "grad_norm": 0.022841813042759895, "learning_rate": 1.2155599608616331e-06, "loss": 0.0345, "num_input_tokens_seen": 9095088, "step": 18480 }, { "epoch": 2.439619902336017, "grad_norm": 0.45985811948776245, "learning_rate": 1.2151100706108996e-06, "loss": 0.0006, "num_input_tokens_seen": 9097712, "step": 18485 }, { "epoch": 2.4402797941137653, "grad_norm": 19.873098373413086, "learning_rate": 1.2146601347046107e-06, "loss": 0.0643, "num_input_tokens_seen": 9100336, "step": 18490 }, { "epoch": 2.4409396858915136, "grad_norm": 0.10789693892002106, "learning_rate": 1.214210153238263e-06, "loss": 0.0002, "num_input_tokens_seen": 9103024, "step": 18495 }, { "epoch": 2.4415995776692623, "grad_norm": 0.06011528521776199, "learning_rate": 1.2137601263073613e-06, "loss": 0.0001, "num_input_tokens_seen": 9105520, "step": 18500 }, { "epoch": 2.4422594694470106, "grad_norm": 0.0035535397473722696, "learning_rate": 1.2133100540074206e-06, "loss": 0.0689, "num_input_tokens_seen": 9108016, "step": 18505 }, { "epoch": 2.4429193612247593, "grad_norm": 0.05881497263908386, "learning_rate": 1.2128599364339663e-06, "loss": 0.0003, "num_input_tokens_seen": 9110320, "step": 18510 }, { "epoch": 2.4435792530025076, "grad_norm": 0.047487806528806686, "learning_rate": 1.212409773682531e-06, "loss": 0.0113, "num_input_tokens_seen": 9112624, "step": 18515 }, { "epoch": 2.444239144780256, "grad_norm": 0.05220465362071991, "learning_rate": 1.2119595658486599e-06, "loss": 0.0001, "num_input_tokens_seen": 9115120, "step": 18520 }, { "epoch": 2.4448990365580046, "grad_norm": 24.626712799072266, "learning_rate": 1.2115093130279055e-06, "loss": 0.111, "num_input_tokens_seen": 9117680, "step": 18525 }, { "epoch": 2.445558928335753, "grad_norm": 0.04938157647848129, "learning_rate": 1.2110590153158313e-06, "loss": 0.1346, "num_input_tokens_seen": 9120112, "step": 18530 }, { "epoch": 2.4462188201135016, "grad_norm": 0.06872491538524628, "learning_rate": 1.2106086728080095e-06, "loss": 0.0004, "num_input_tokens_seen": 9122800, "step": 18535 }, { "epoch": 2.44687871189125, "grad_norm": 0.008710040710866451, "learning_rate": 1.2101582856000219e-06, "loss": 0.0473, "num_input_tokens_seen": 9124976, "step": 18540 }, { "epoch": 2.447538603668998, "grad_norm": 0.006348209455609322, "learning_rate": 1.20970785378746e-06, "loss": 0.0337, "num_input_tokens_seen": 9127472, "step": 18545 }, { "epoch": 2.448198495446747, "grad_norm": 0.1652994155883789, "learning_rate": 1.2092573774659247e-06, "loss": 0.0005, "num_input_tokens_seen": 9130288, "step": 18550 }, { "epoch": 2.448858387224495, "grad_norm": 0.16471163928508759, "learning_rate": 1.2088068567310266e-06, "loss": 0.0011, "num_input_tokens_seen": 9132528, "step": 18555 }, { "epoch": 2.449518279002244, "grad_norm": 0.025540076196193695, "learning_rate": 1.2083562916783852e-06, "loss": 0.197, "num_input_tokens_seen": 9135152, "step": 18560 }, { "epoch": 2.450178170779992, "grad_norm": 0.6089839935302734, "learning_rate": 1.2079056824036294e-06, "loss": 0.088, "num_input_tokens_seen": 9138032, "step": 18565 }, { "epoch": 2.4508380625577404, "grad_norm": 0.03979070857167244, "learning_rate": 1.207455029002398e-06, "loss": 0.0004, "num_input_tokens_seen": 9140528, "step": 18570 }, { "epoch": 2.451497954335489, "grad_norm": 0.30008092522621155, "learning_rate": 1.207004331570339e-06, "loss": 0.0495, "num_input_tokens_seen": 9142768, "step": 18575 }, { "epoch": 2.4521578461132374, "grad_norm": 0.15778286755084991, "learning_rate": 1.2065535902031098e-06, "loss": 0.0004, "num_input_tokens_seen": 9145392, "step": 18580 }, { "epoch": 2.452817737890986, "grad_norm": 25.180299758911133, "learning_rate": 1.206102804996377e-06, "loss": 0.1586, "num_input_tokens_seen": 9147696, "step": 18585 }, { "epoch": 2.4534776296687344, "grad_norm": 82.59857940673828, "learning_rate": 1.2056519760458162e-06, "loss": 0.0845, "num_input_tokens_seen": 9150320, "step": 18590 }, { "epoch": 2.4541375214464827, "grad_norm": 0.2946361303329468, "learning_rate": 1.2052011034471123e-06, "loss": 0.0212, "num_input_tokens_seen": 9153072, "step": 18595 }, { "epoch": 2.4547974132242314, "grad_norm": 0.18923722207546234, "learning_rate": 1.2047501872959606e-06, "loss": 0.0005, "num_input_tokens_seen": 9155504, "step": 18600 }, { "epoch": 2.4554573050019797, "grad_norm": 0.07883734256029129, "learning_rate": 1.204299227688064e-06, "loss": 0.0002, "num_input_tokens_seen": 9158064, "step": 18605 }, { "epoch": 2.456117196779728, "grad_norm": 0.01335981860756874, "learning_rate": 1.203848224719136e-06, "loss": 0.1095, "num_input_tokens_seen": 9160688, "step": 18610 }, { "epoch": 2.4567770885574767, "grad_norm": 0.035853851586580276, "learning_rate": 1.2033971784848985e-06, "loss": 0.0001, "num_input_tokens_seen": 9163056, "step": 18615 }, { "epoch": 2.457436980335225, "grad_norm": 0.4601845443248749, "learning_rate": 1.2029460890810826e-06, "loss": 0.0004, "num_input_tokens_seen": 9165680, "step": 18620 }, { "epoch": 2.4580968721129732, "grad_norm": 0.09914588928222656, "learning_rate": 1.202494956603429e-06, "loss": 0.0215, "num_input_tokens_seen": 9167984, "step": 18625 }, { "epoch": 2.458756763890722, "grad_norm": 0.016405778005719185, "learning_rate": 1.2020437811476872e-06, "loss": 0.0013, "num_input_tokens_seen": 9170608, "step": 18630 }, { "epoch": 2.4594166556684702, "grad_norm": 0.012795671820640564, "learning_rate": 1.2015925628096157e-06, "loss": 0.0002, "num_input_tokens_seen": 9172976, "step": 18635 }, { "epoch": 2.460076547446219, "grad_norm": 0.01026566606014967, "learning_rate": 1.2011413016849829e-06, "loss": 0.0003, "num_input_tokens_seen": 9175600, "step": 18640 }, { "epoch": 2.4607364392239672, "grad_norm": 0.005955888889729977, "learning_rate": 1.2006899978695653e-06, "loss": 0.0007, "num_input_tokens_seen": 9177904, "step": 18645 }, { "epoch": 2.4613963310017155, "grad_norm": 0.03424505889415741, "learning_rate": 1.200238651459149e-06, "loss": 0.0005, "num_input_tokens_seen": 9180400, "step": 18650 }, { "epoch": 2.4620562227794642, "grad_norm": 0.004490535706281662, "learning_rate": 1.1997872625495284e-06, "loss": 0.0782, "num_input_tokens_seen": 9182896, "step": 18655 }, { "epoch": 2.4627161145572125, "grad_norm": 0.003114398568868637, "learning_rate": 1.1993358312365087e-06, "loss": 0.0021, "num_input_tokens_seen": 9185328, "step": 18660 }, { "epoch": 2.4633760063349612, "grad_norm": 0.12822800874710083, "learning_rate": 1.198884357615902e-06, "loss": 0.0001, "num_input_tokens_seen": 9187760, "step": 18665 }, { "epoch": 2.4640358981127095, "grad_norm": 0.0007764756446704268, "learning_rate": 1.1984328417835307e-06, "loss": 0.0844, "num_input_tokens_seen": 9189872, "step": 18670 }, { "epoch": 2.464695789890458, "grad_norm": 0.095713309943676, "learning_rate": 1.1979812838352257e-06, "loss": 0.1752, "num_input_tokens_seen": 9192176, "step": 18675 }, { "epoch": 2.4653556816682065, "grad_norm": 0.03003990463912487, "learning_rate": 1.1975296838668266e-06, "loss": 0.0296, "num_input_tokens_seen": 9194480, "step": 18680 }, { "epoch": 2.466015573445955, "grad_norm": 0.017723990604281425, "learning_rate": 1.1970780419741828e-06, "loss": 0.0712, "num_input_tokens_seen": 9196976, "step": 18685 }, { "epoch": 2.4666754652237035, "grad_norm": 0.031016312539577484, "learning_rate": 1.1966263582531517e-06, "loss": 0.0004, "num_input_tokens_seen": 9199216, "step": 18690 }, { "epoch": 2.467335357001452, "grad_norm": 0.030972057953476906, "learning_rate": 1.1961746327996e-06, "loss": 0.0064, "num_input_tokens_seen": 9201648, "step": 18695 }, { "epoch": 2.4679952487792, "grad_norm": 13.240409851074219, "learning_rate": 1.1957228657094027e-06, "loss": 0.0253, "num_input_tokens_seen": 9203760, "step": 18700 }, { "epoch": 2.468655140556949, "grad_norm": 0.012332662008702755, "learning_rate": 1.1952710570784447e-06, "loss": 0.0004, "num_input_tokens_seen": 9206000, "step": 18705 }, { "epoch": 2.469315032334697, "grad_norm": 14.274161338806152, "learning_rate": 1.194819207002619e-06, "loss": 0.1172, "num_input_tokens_seen": 9208496, "step": 18710 }, { "epoch": 2.469974924112446, "grad_norm": 0.16916923224925995, "learning_rate": 1.194367315577827e-06, "loss": 0.1238, "num_input_tokens_seen": 9211120, "step": 18715 }, { "epoch": 2.470634815890194, "grad_norm": 18.33597755432129, "learning_rate": 1.1939153828999801e-06, "loss": 0.1403, "num_input_tokens_seen": 9213744, "step": 18720 }, { "epoch": 2.4712947076679423, "grad_norm": 0.18467681109905243, "learning_rate": 1.1934634090649973e-06, "loss": 0.0015, "num_input_tokens_seen": 9215856, "step": 18725 }, { "epoch": 2.471954599445691, "grad_norm": 0.28404808044433594, "learning_rate": 1.1930113941688072e-06, "loss": 0.1295, "num_input_tokens_seen": 9218160, "step": 18730 }, { "epoch": 2.4726144912234393, "grad_norm": 1.6605956554412842, "learning_rate": 1.1925593383073458e-06, "loss": 0.0516, "num_input_tokens_seen": 9220848, "step": 18735 }, { "epoch": 2.4732743830011876, "grad_norm": 19.45949935913086, "learning_rate": 1.1921072415765595e-06, "loss": 0.0631, "num_input_tokens_seen": 9223344, "step": 18740 }, { "epoch": 2.4739342747789363, "grad_norm": 0.016054624691605568, "learning_rate": 1.1916551040724026e-06, "loss": 0.0004, "num_input_tokens_seen": 9225648, "step": 18745 }, { "epoch": 2.4745941665566846, "grad_norm": 0.03807045519351959, "learning_rate": 1.191202925890837e-06, "loss": 0.0413, "num_input_tokens_seen": 9228016, "step": 18750 }, { "epoch": 2.475254058334433, "grad_norm": 0.28098002076148987, "learning_rate": 1.1907507071278358e-06, "loss": 0.0693, "num_input_tokens_seen": 9230192, "step": 18755 }, { "epoch": 2.4759139501121816, "grad_norm": 0.03773991018533707, "learning_rate": 1.1902984478793776e-06, "loss": 0.0002, "num_input_tokens_seen": 9232624, "step": 18760 }, { "epoch": 2.47657384188993, "grad_norm": 14.075786590576172, "learning_rate": 1.1898461482414524e-06, "loss": 0.0532, "num_input_tokens_seen": 9234992, "step": 18765 }, { "epoch": 2.4772337336676786, "grad_norm": 0.1268509477376938, "learning_rate": 1.1893938083100568e-06, "loss": 0.0014, "num_input_tokens_seen": 9237360, "step": 18770 }, { "epoch": 2.477893625445427, "grad_norm": 0.7259445190429688, "learning_rate": 1.188941428181197e-06, "loss": 0.0253, "num_input_tokens_seen": 9239664, "step": 18775 }, { "epoch": 2.478553517223175, "grad_norm": 1.0823359489440918, "learning_rate": 1.188489007950887e-06, "loss": 0.0016, "num_input_tokens_seen": 9241968, "step": 18780 }, { "epoch": 2.479213409000924, "grad_norm": 0.007959727197885513, "learning_rate": 1.1880365477151501e-06, "loss": 0.0, "num_input_tokens_seen": 9244336, "step": 18785 }, { "epoch": 2.479873300778672, "grad_norm": 0.006146569736301899, "learning_rate": 1.1875840475700175e-06, "loss": 0.0007, "num_input_tokens_seen": 9246960, "step": 18790 }, { "epoch": 2.480533192556421, "grad_norm": 0.0030527012422680855, "learning_rate": 1.1871315076115293e-06, "loss": 0.0004, "num_input_tokens_seen": 9249264, "step": 18795 }, { "epoch": 2.481193084334169, "grad_norm": 0.03815356269478798, "learning_rate": 1.186678927935734e-06, "loss": 0.0001, "num_input_tokens_seen": 9251504, "step": 18800 }, { "epoch": 2.4818529761119175, "grad_norm": 0.0036139509174972773, "learning_rate": 1.1862263086386875e-06, "loss": 0.0001, "num_input_tokens_seen": 9253552, "step": 18805 }, { "epoch": 2.482512867889666, "grad_norm": 0.17686735093593597, "learning_rate": 1.1857736498164559e-06, "loss": 0.0002, "num_input_tokens_seen": 9255984, "step": 18810 }, { "epoch": 2.4831727596674145, "grad_norm": 0.08808859437704086, "learning_rate": 1.1853209515651122e-06, "loss": 0.0003, "num_input_tokens_seen": 9258352, "step": 18815 }, { "epoch": 2.483832651445163, "grad_norm": 0.0034736869856715202, "learning_rate": 1.1848682139807387e-06, "loss": 0.0001, "num_input_tokens_seen": 9260784, "step": 18820 }, { "epoch": 2.4844925432229115, "grad_norm": 0.0006168729742057621, "learning_rate": 1.1844154371594254e-06, "loss": 0.0008, "num_input_tokens_seen": 9263408, "step": 18825 }, { "epoch": 2.4851524350006597, "grad_norm": 0.007578112650662661, "learning_rate": 1.183962621197271e-06, "loss": 0.0673, "num_input_tokens_seen": 9265584, "step": 18830 }, { "epoch": 2.4858123267784085, "grad_norm": 24.65241050720215, "learning_rate": 1.1835097661903826e-06, "loss": 0.1876, "num_input_tokens_seen": 9267888, "step": 18835 }, { "epoch": 2.4864722185561567, "grad_norm": 0.027628377079963684, "learning_rate": 1.1830568722348748e-06, "loss": 0.0001, "num_input_tokens_seen": 9270256, "step": 18840 }, { "epoch": 2.4871321103339055, "grad_norm": 0.005967118311673403, "learning_rate": 1.182603939426872e-06, "loss": 0.0006, "num_input_tokens_seen": 9272944, "step": 18845 }, { "epoch": 2.4877920021116537, "grad_norm": 0.11982940882444382, "learning_rate": 1.1821509678625048e-06, "loss": 0.0002, "num_input_tokens_seen": 9275376, "step": 18850 }, { "epoch": 2.488451893889402, "grad_norm": 11.578821182250977, "learning_rate": 1.181697957637914e-06, "loss": 0.0368, "num_input_tokens_seen": 9277680, "step": 18855 }, { "epoch": 2.4891117856671507, "grad_norm": 33.48732376098633, "learning_rate": 1.1812449088492474e-06, "loss": 0.2397, "num_input_tokens_seen": 9280112, "step": 18860 }, { "epoch": 2.489771677444899, "grad_norm": 0.14362753927707672, "learning_rate": 1.1807918215926614e-06, "loss": 0.0019, "num_input_tokens_seen": 9282544, "step": 18865 }, { "epoch": 2.4904315692226473, "grad_norm": 14.686333656311035, "learning_rate": 1.1803386959643204e-06, "loss": 0.1361, "num_input_tokens_seen": 9284976, "step": 18870 }, { "epoch": 2.491091461000396, "grad_norm": 0.007154212798923254, "learning_rate": 1.179885532060397e-06, "loss": 0.0009, "num_input_tokens_seen": 9287472, "step": 18875 }, { "epoch": 2.4917513527781443, "grad_norm": 0.23903542757034302, "learning_rate": 1.1794323299770724e-06, "loss": 0.0829, "num_input_tokens_seen": 9290096, "step": 18880 }, { "epoch": 2.492411244555893, "grad_norm": 0.03929462283849716, "learning_rate": 1.1789790898105346e-06, "loss": 0.0012, "num_input_tokens_seen": 9292464, "step": 18885 }, { "epoch": 2.4930711363336413, "grad_norm": 0.009414401836693287, "learning_rate": 1.1785258116569816e-06, "loss": 0.0004, "num_input_tokens_seen": 9294896, "step": 18890 }, { "epoch": 2.4937310281113896, "grad_norm": 0.06671901047229767, "learning_rate": 1.1780724956126173e-06, "loss": 0.0015, "num_input_tokens_seen": 9297456, "step": 18895 }, { "epoch": 2.4943909198891383, "grad_norm": 26.247026443481445, "learning_rate": 1.1776191417736558e-06, "loss": 0.1238, "num_input_tokens_seen": 9299888, "step": 18900 }, { "epoch": 2.4950508116668866, "grad_norm": 24.444671630859375, "learning_rate": 1.1771657502363175e-06, "loss": 0.0385, "num_input_tokens_seen": 9302640, "step": 18905 }, { "epoch": 2.495710703444635, "grad_norm": 0.034380197525024414, "learning_rate": 1.1767123210968315e-06, "loss": 0.0002, "num_input_tokens_seen": 9305392, "step": 18910 }, { "epoch": 2.4963705952223836, "grad_norm": 0.05007326230406761, "learning_rate": 1.1762588544514352e-06, "loss": 0.0991, "num_input_tokens_seen": 9307888, "step": 18915 }, { "epoch": 2.497030487000132, "grad_norm": 0.036398421972990036, "learning_rate": 1.1758053503963733e-06, "loss": 0.0002, "num_input_tokens_seen": 9310192, "step": 18920 }, { "epoch": 2.4976903787778806, "grad_norm": 15.678851127624512, "learning_rate": 1.1753518090278991e-06, "loss": 0.1931, "num_input_tokens_seen": 9312688, "step": 18925 }, { "epoch": 2.498350270555629, "grad_norm": 0.07849349826574326, "learning_rate": 1.1748982304422729e-06, "loss": 0.0463, "num_input_tokens_seen": 9315248, "step": 18930 }, { "epoch": 2.499010162333377, "grad_norm": 0.720598578453064, "learning_rate": 1.174444614735764e-06, "loss": 0.0443, "num_input_tokens_seen": 9318000, "step": 18935 }, { "epoch": 2.499670054111126, "grad_norm": 0.45224106311798096, "learning_rate": 1.1739909620046485e-06, "loss": 0.0663, "num_input_tokens_seen": 9320752, "step": 18940 }, { "epoch": 2.500329945888874, "grad_norm": 26.452760696411133, "learning_rate": 1.1735372723452114e-06, "loss": 0.0352, "num_input_tokens_seen": 9323632, "step": 18945 }, { "epoch": 2.500989837666623, "grad_norm": 0.135438933968544, "learning_rate": 1.1730835458537454e-06, "loss": 0.0962, "num_input_tokens_seen": 9326256, "step": 18950 }, { "epoch": 2.500989837666623, "eval_loss": 0.11418119072914124, "eval_runtime": 7.8086, "eval_samples_per_second": 862.515, "eval_steps_per_second": 107.83, "num_input_tokens_seen": 9326256, "step": 18950 }, { "epoch": 2.501649729444371, "grad_norm": 0.07885482162237167, "learning_rate": 1.1726297826265497e-06, "loss": 0.0332, "num_input_tokens_seen": 9328688, "step": 18955 }, { "epoch": 2.5023096212221194, "grad_norm": 17.58693504333496, "learning_rate": 1.1721759827599326e-06, "loss": 0.0693, "num_input_tokens_seen": 9331312, "step": 18960 }, { "epoch": 2.502969512999868, "grad_norm": 24.822282791137695, "learning_rate": 1.1717221463502102e-06, "loss": 0.0561, "num_input_tokens_seen": 9333872, "step": 18965 }, { "epoch": 2.5036294047776164, "grad_norm": 0.05071251094341278, "learning_rate": 1.1712682734937058e-06, "loss": 0.0002, "num_input_tokens_seen": 9336176, "step": 18970 }, { "epoch": 2.504289296555365, "grad_norm": 0.4167310297489166, "learning_rate": 1.1708143642867506e-06, "loss": 0.0008, "num_input_tokens_seen": 9338800, "step": 18975 }, { "epoch": 2.5049491883331134, "grad_norm": 0.021511459723114967, "learning_rate": 1.1703604188256833e-06, "loss": 0.0002, "num_input_tokens_seen": 9341232, "step": 18980 }, { "epoch": 2.5056090801108617, "grad_norm": 24.896465301513672, "learning_rate": 1.169906437206851e-06, "loss": 0.0045, "num_input_tokens_seen": 9343664, "step": 18985 }, { "epoch": 2.5062689718886104, "grad_norm": 0.0010555103654041886, "learning_rate": 1.1694524195266077e-06, "loss": 0.0005, "num_input_tokens_seen": 9346096, "step": 18990 }, { "epoch": 2.5069288636663587, "grad_norm": 24.33745765686035, "learning_rate": 1.1689983658813152e-06, "loss": 0.1795, "num_input_tokens_seen": 9348592, "step": 18995 }, { "epoch": 2.5075887554441074, "grad_norm": 0.8176755905151367, "learning_rate": 1.1685442763673436e-06, "loss": 0.0014, "num_input_tokens_seen": 9351088, "step": 19000 }, { "epoch": 2.5082486472218557, "grad_norm": 0.24280667304992676, "learning_rate": 1.16809015108107e-06, "loss": 0.0008, "num_input_tokens_seen": 9353648, "step": 19005 }, { "epoch": 2.508908538999604, "grad_norm": 0.06780319660902023, "learning_rate": 1.1676359901188785e-06, "loss": 0.0019, "num_input_tokens_seen": 9356208, "step": 19010 }, { "epoch": 2.5095684307773523, "grad_norm": 0.2860470116138458, "learning_rate": 1.1671817935771623e-06, "loss": 0.0006, "num_input_tokens_seen": 9358320, "step": 19015 }, { "epoch": 2.510228322555101, "grad_norm": 0.018246835097670555, "learning_rate": 1.166727561552321e-06, "loss": 0.046, "num_input_tokens_seen": 9360688, "step": 19020 }, { "epoch": 2.5108882143328493, "grad_norm": 0.017531786113977432, "learning_rate": 1.1662732941407625e-06, "loss": 0.0188, "num_input_tokens_seen": 9363248, "step": 19025 }, { "epoch": 2.511548106110598, "grad_norm": 0.004202735144644976, "learning_rate": 1.165818991438901e-06, "loss": 0.0165, "num_input_tokens_seen": 9365872, "step": 19030 }, { "epoch": 2.5122079978883463, "grad_norm": 0.927355170249939, "learning_rate": 1.1653646535431593e-06, "loss": 0.0002, "num_input_tokens_seen": 9368368, "step": 19035 }, { "epoch": 2.5128678896660945, "grad_norm": 0.005749446805566549, "learning_rate": 1.1649102805499676e-06, "loss": 0.0004, "num_input_tokens_seen": 9371056, "step": 19040 }, { "epoch": 2.5135277814438433, "grad_norm": 0.00678770337253809, "learning_rate": 1.1644558725557627e-06, "loss": 0.0001, "num_input_tokens_seen": 9373680, "step": 19045 }, { "epoch": 2.5141876732215915, "grad_norm": 0.010611538775265217, "learning_rate": 1.16400142965699e-06, "loss": 0.0, "num_input_tokens_seen": 9375920, "step": 19050 }, { "epoch": 2.5148475649993403, "grad_norm": 16.00409507751465, "learning_rate": 1.1635469519501015e-06, "loss": 0.1645, "num_input_tokens_seen": 9378608, "step": 19055 }, { "epoch": 2.5155074567770885, "grad_norm": 0.06570476293563843, "learning_rate": 1.1630924395315565e-06, "loss": 0.0565, "num_input_tokens_seen": 9380784, "step": 19060 }, { "epoch": 2.516167348554837, "grad_norm": 13.64858627319336, "learning_rate": 1.1626378924978223e-06, "loss": 0.0412, "num_input_tokens_seen": 9383216, "step": 19065 }, { "epoch": 2.5168272403325855, "grad_norm": 0.13996559381484985, "learning_rate": 1.1621833109453734e-06, "loss": 0.0311, "num_input_tokens_seen": 9385712, "step": 19070 }, { "epoch": 2.517487132110334, "grad_norm": 0.0008270741673186421, "learning_rate": 1.161728694970691e-06, "loss": 0.0527, "num_input_tokens_seen": 9387888, "step": 19075 }, { "epoch": 2.5181470238880825, "grad_norm": 0.006990394555032253, "learning_rate": 1.1612740446702645e-06, "loss": 0.0464, "num_input_tokens_seen": 9390192, "step": 19080 }, { "epoch": 2.518806915665831, "grad_norm": 0.022595075890421867, "learning_rate": 1.1608193601405894e-06, "loss": 0.0352, "num_input_tokens_seen": 9392816, "step": 19085 }, { "epoch": 2.519466807443579, "grad_norm": 0.01655641384422779, "learning_rate": 1.1603646414781701e-06, "loss": 0.0001, "num_input_tokens_seen": 9395184, "step": 19090 }, { "epoch": 2.520126699221328, "grad_norm": 0.007066233549267054, "learning_rate": 1.1599098887795164e-06, "loss": 0.0002, "num_input_tokens_seen": 9397232, "step": 19095 }, { "epoch": 2.520786590999076, "grad_norm": 0.280401349067688, "learning_rate": 1.1594551021411473e-06, "loss": 0.0002, "num_input_tokens_seen": 9399664, "step": 19100 }, { "epoch": 2.521446482776825, "grad_norm": 0.020977843552827835, "learning_rate": 1.1590002816595874e-06, "loss": 0.0001, "num_input_tokens_seen": 9401968, "step": 19105 }, { "epoch": 2.522106374554573, "grad_norm": 0.0056993900798261166, "learning_rate": 1.158545427431369e-06, "loss": 0.1127, "num_input_tokens_seen": 9404400, "step": 19110 }, { "epoch": 2.5227662663323214, "grad_norm": 0.005368967540562153, "learning_rate": 1.1580905395530317e-06, "loss": 0.073, "num_input_tokens_seen": 9406832, "step": 19115 }, { "epoch": 2.52342615811007, "grad_norm": 0.3067415952682495, "learning_rate": 1.1576356181211223e-06, "loss": 0.0006, "num_input_tokens_seen": 9409264, "step": 19120 }, { "epoch": 2.5240860498878184, "grad_norm": 15.489126205444336, "learning_rate": 1.1571806632321941e-06, "loss": 0.1069, "num_input_tokens_seen": 9411824, "step": 19125 }, { "epoch": 2.524745941665567, "grad_norm": 0.005901966709643602, "learning_rate": 1.1567256749828088e-06, "loss": 0.0614, "num_input_tokens_seen": 9414320, "step": 19130 }, { "epoch": 2.5254058334433154, "grad_norm": 0.425497442483902, "learning_rate": 1.1562706534695337e-06, "loss": 0.0487, "num_input_tokens_seen": 9416688, "step": 19135 }, { "epoch": 2.5260657252210637, "grad_norm": 0.0033785065170377493, "learning_rate": 1.1558155987889437e-06, "loss": 0.0504, "num_input_tokens_seen": 9419120, "step": 19140 }, { "epoch": 2.526725616998812, "grad_norm": 0.009927736595273018, "learning_rate": 1.1553605110376216e-06, "loss": 0.0001, "num_input_tokens_seen": 9421552, "step": 19145 }, { "epoch": 2.5273855087765607, "grad_norm": 0.03838609904050827, "learning_rate": 1.154905390312156e-06, "loss": 0.0725, "num_input_tokens_seen": 9424112, "step": 19150 }, { "epoch": 2.528045400554309, "grad_norm": 0.012181775644421577, "learning_rate": 1.1544502367091428e-06, "loss": 0.0382, "num_input_tokens_seen": 9426352, "step": 19155 }, { "epoch": 2.5287052923320577, "grad_norm": 0.20829293131828308, "learning_rate": 1.1539950503251858e-06, "loss": 0.0905, "num_input_tokens_seen": 9429040, "step": 19160 }, { "epoch": 2.529365184109806, "grad_norm": 0.013603360392153263, "learning_rate": 1.153539831256894e-06, "loss": 0.0015, "num_input_tokens_seen": 9431280, "step": 19165 }, { "epoch": 2.530025075887554, "grad_norm": 0.010468337684869766, "learning_rate": 1.1530845796008853e-06, "loss": 0.023, "num_input_tokens_seen": 9433648, "step": 19170 }, { "epoch": 2.530684967665303, "grad_norm": 0.08018022030591965, "learning_rate": 1.1526292954537827e-06, "loss": 0.0277, "num_input_tokens_seen": 9436400, "step": 19175 }, { "epoch": 2.531344859443051, "grad_norm": 0.005076752509921789, "learning_rate": 1.1521739789122179e-06, "loss": 0.0384, "num_input_tokens_seen": 9438896, "step": 19180 }, { "epoch": 2.5320047512208, "grad_norm": 0.0038340799510478973, "learning_rate": 1.1517186300728276e-06, "loss": 0.0001, "num_input_tokens_seen": 9441136, "step": 19185 }, { "epoch": 2.532664642998548, "grad_norm": 0.018121426925063133, "learning_rate": 1.151263249032257e-06, "loss": 0.0023, "num_input_tokens_seen": 9443376, "step": 19190 }, { "epoch": 2.5333245347762965, "grad_norm": 40.5118522644043, "learning_rate": 1.150807835887157e-06, "loss": 0.049, "num_input_tokens_seen": 9445808, "step": 19195 }, { "epoch": 2.533984426554045, "grad_norm": 8.46541976928711, "learning_rate": 1.1503523907341858e-06, "loss": 0.1468, "num_input_tokens_seen": 9447984, "step": 19200 }, { "epoch": 2.5346443183317935, "grad_norm": 1.6953380107879639, "learning_rate": 1.1498969136700087e-06, "loss": 0.0019, "num_input_tokens_seen": 9450352, "step": 19205 }, { "epoch": 2.535304210109542, "grad_norm": 1.2033361196517944, "learning_rate": 1.1494414047912967e-06, "loss": 0.0008, "num_input_tokens_seen": 9452976, "step": 19210 }, { "epoch": 2.5359641018872905, "grad_norm": 0.04662247374653816, "learning_rate": 1.1489858641947292e-06, "loss": 0.0011, "num_input_tokens_seen": 9455536, "step": 19215 }, { "epoch": 2.5366239936650388, "grad_norm": 0.015560769475996494, "learning_rate": 1.1485302919769906e-06, "loss": 0.0336, "num_input_tokens_seen": 9458224, "step": 19220 }, { "epoch": 2.5372838854427875, "grad_norm": 0.017624717205762863, "learning_rate": 1.1480746882347733e-06, "loss": 0.0001, "num_input_tokens_seen": 9460592, "step": 19225 }, { "epoch": 2.5379437772205358, "grad_norm": 0.14374710619449615, "learning_rate": 1.1476190530647754e-06, "loss": 0.0004, "num_input_tokens_seen": 9463152, "step": 19230 }, { "epoch": 2.5386036689982845, "grad_norm": 20.909866333007812, "learning_rate": 1.1471633865637027e-06, "loss": 0.077, "num_input_tokens_seen": 9465712, "step": 19235 }, { "epoch": 2.5392635607760328, "grad_norm": 0.1594572812318802, "learning_rate": 1.146707688828267e-06, "loss": 0.0003, "num_input_tokens_seen": 9468400, "step": 19240 }, { "epoch": 2.539923452553781, "grad_norm": 0.0020154337398707867, "learning_rate": 1.1462519599551864e-06, "loss": 0.0, "num_input_tokens_seen": 9470704, "step": 19245 }, { "epoch": 2.5405833443315298, "grad_norm": 23.83881950378418, "learning_rate": 1.1457962000411864e-06, "loss": 0.2087, "num_input_tokens_seen": 9472944, "step": 19250 }, { "epoch": 2.541243236109278, "grad_norm": 0.028763171285390854, "learning_rate": 1.1453404091829987e-06, "loss": 0.0457, "num_input_tokens_seen": 9475440, "step": 19255 }, { "epoch": 2.5419031278870268, "grad_norm": 0.03985166549682617, "learning_rate": 1.1448845874773623e-06, "loss": 0.0014, "num_input_tokens_seen": 9477808, "step": 19260 }, { "epoch": 2.542563019664775, "grad_norm": 0.01088575180619955, "learning_rate": 1.1444287350210208e-06, "loss": 0.0019, "num_input_tokens_seen": 9480368, "step": 19265 }, { "epoch": 2.5432229114425233, "grad_norm": 0.1729097068309784, "learning_rate": 1.143972851910726e-06, "loss": 0.0003, "num_input_tokens_seen": 9482864, "step": 19270 }, { "epoch": 2.5438828032202716, "grad_norm": 0.005866058170795441, "learning_rate": 1.143516938243236e-06, "loss": 0.0, "num_input_tokens_seen": 9485488, "step": 19275 }, { "epoch": 2.5445426949980203, "grad_norm": 0.011199753731489182, "learning_rate": 1.1430609941153154e-06, "loss": 0.0001, "num_input_tokens_seen": 9487856, "step": 19280 }, { "epoch": 2.545202586775769, "grad_norm": 0.004434535745531321, "learning_rate": 1.1426050196237347e-06, "loss": 0.0015, "num_input_tokens_seen": 9489968, "step": 19285 }, { "epoch": 2.5458624785535173, "grad_norm": 0.005931300576776266, "learning_rate": 1.142149014865271e-06, "loss": 0.0698, "num_input_tokens_seen": 9492464, "step": 19290 }, { "epoch": 2.5465223703312656, "grad_norm": 0.021038591861724854, "learning_rate": 1.1416929799367086e-06, "loss": 0.0744, "num_input_tokens_seen": 9495024, "step": 19295 }, { "epoch": 2.547182262109014, "grad_norm": 0.06645817309617996, "learning_rate": 1.141236914934837e-06, "loss": 0.0001, "num_input_tokens_seen": 9497648, "step": 19300 }, { "epoch": 2.5478421538867626, "grad_norm": 0.0028256825171411037, "learning_rate": 1.1407808199564532e-06, "loss": 0.0009, "num_input_tokens_seen": 9499824, "step": 19305 }, { "epoch": 2.548502045664511, "grad_norm": 0.021098587661981583, "learning_rate": 1.1403246950983598e-06, "loss": 0.1192, "num_input_tokens_seen": 9502064, "step": 19310 }, { "epoch": 2.5491619374422596, "grad_norm": 0.01978352852165699, "learning_rate": 1.1398685404573657e-06, "loss": 0.0003, "num_input_tokens_seen": 9504560, "step": 19315 }, { "epoch": 2.549821829220008, "grad_norm": 0.009966706857085228, "learning_rate": 1.139412356130287e-06, "loss": 0.0002, "num_input_tokens_seen": 9507120, "step": 19320 }, { "epoch": 2.550481720997756, "grad_norm": 0.09230407327413559, "learning_rate": 1.138956142213945e-06, "loss": 0.0831, "num_input_tokens_seen": 9509552, "step": 19325 }, { "epoch": 2.551141612775505, "grad_norm": 0.004419033881276846, "learning_rate": 1.1384998988051684e-06, "loss": 0.0001, "num_input_tokens_seen": 9512304, "step": 19330 }, { "epoch": 2.551801504553253, "grad_norm": 0.12761980295181274, "learning_rate": 1.1380436260007914e-06, "loss": 0.0749, "num_input_tokens_seen": 9514736, "step": 19335 }, { "epoch": 2.552461396331002, "grad_norm": 0.16481366753578186, "learning_rate": 1.1375873238976542e-06, "loss": 0.0802, "num_input_tokens_seen": 9517232, "step": 19340 }, { "epoch": 2.55312128810875, "grad_norm": 0.0165677722543478, "learning_rate": 1.1371309925926034e-06, "loss": 0.0005, "num_input_tokens_seen": 9519472, "step": 19345 }, { "epoch": 2.5537811798864984, "grad_norm": 0.014621448703110218, "learning_rate": 1.1366746321824928e-06, "loss": 0.0006, "num_input_tokens_seen": 9521776, "step": 19350 }, { "epoch": 2.554441071664247, "grad_norm": 1.5919983386993408, "learning_rate": 1.1362182427641812e-06, "loss": 0.0014, "num_input_tokens_seen": 9524208, "step": 19355 }, { "epoch": 2.5551009634419954, "grad_norm": 0.025344735011458397, "learning_rate": 1.135761824434534e-06, "loss": 0.0002, "num_input_tokens_seen": 9526768, "step": 19360 }, { "epoch": 2.555760855219744, "grad_norm": 0.0029130533803254366, "learning_rate": 1.135305377290423e-06, "loss": 0.0013, "num_input_tokens_seen": 9529200, "step": 19365 }, { "epoch": 2.5564207469974924, "grad_norm": 0.011325112544000149, "learning_rate": 1.1348489014287248e-06, "loss": 0.1098, "num_input_tokens_seen": 9531824, "step": 19370 }, { "epoch": 2.5570806387752407, "grad_norm": 0.012303993105888367, "learning_rate": 1.1343923969463243e-06, "loss": 0.0411, "num_input_tokens_seen": 9534192, "step": 19375 }, { "epoch": 2.5577405305529894, "grad_norm": 0.005361241288483143, "learning_rate": 1.1339358639401103e-06, "loss": 0.0, "num_input_tokens_seen": 9536688, "step": 19380 }, { "epoch": 2.5584004223307377, "grad_norm": 25.965045928955078, "learning_rate": 1.1334793025069794e-06, "loss": 0.0505, "num_input_tokens_seen": 9539248, "step": 19385 }, { "epoch": 2.5590603141084864, "grad_norm": 0.2334904968738556, "learning_rate": 1.1330227127438332e-06, "loss": 0.0818, "num_input_tokens_seen": 9541936, "step": 19390 }, { "epoch": 2.5597202058862347, "grad_norm": 0.0050776004791259766, "learning_rate": 1.1325660947475792e-06, "loss": 0.087, "num_input_tokens_seen": 9544240, "step": 19395 }, { "epoch": 2.560380097663983, "grad_norm": 0.06891787052154541, "learning_rate": 1.1321094486151317e-06, "loss": 0.0565, "num_input_tokens_seen": 9546608, "step": 19400 }, { "epoch": 2.5610399894417313, "grad_norm": 0.19221165776252747, "learning_rate": 1.1316527744434104e-06, "loss": 0.0001, "num_input_tokens_seen": 9549040, "step": 19405 }, { "epoch": 2.56169988121948, "grad_norm": 0.02088336832821369, "learning_rate": 1.131196072329341e-06, "loss": 0.0177, "num_input_tokens_seen": 9551792, "step": 19410 }, { "epoch": 2.5623597729972287, "grad_norm": 0.005114255007356405, "learning_rate": 1.1307393423698555e-06, "loss": 0.0007, "num_input_tokens_seen": 9554480, "step": 19415 }, { "epoch": 2.563019664774977, "grad_norm": 0.0021275868639349937, "learning_rate": 1.1302825846618912e-06, "loss": 0.0, "num_input_tokens_seen": 9557040, "step": 19420 }, { "epoch": 2.5636795565527253, "grad_norm": 0.003015077905729413, "learning_rate": 1.1298257993023917e-06, "loss": 0.0352, "num_input_tokens_seen": 9559600, "step": 19425 }, { "epoch": 2.5643394483304736, "grad_norm": 0.00647539459168911, "learning_rate": 1.1293689863883062e-06, "loss": 0.0736, "num_input_tokens_seen": 9562096, "step": 19430 }, { "epoch": 2.5649993401082223, "grad_norm": 0.004464747849851847, "learning_rate": 1.1289121460165907e-06, "loss": 0.0001, "num_input_tokens_seen": 9564400, "step": 19435 }, { "epoch": 2.5656592318859706, "grad_norm": 0.04154035821557045, "learning_rate": 1.1284552782842054e-06, "loss": 0.0911, "num_input_tokens_seen": 9566768, "step": 19440 }, { "epoch": 2.5663191236637193, "grad_norm": 0.014330783858895302, "learning_rate": 1.1279983832881174e-06, "loss": 0.0004, "num_input_tokens_seen": 9569648, "step": 19445 }, { "epoch": 2.5669790154414676, "grad_norm": 194.3843231201172, "learning_rate": 1.1275414611252996e-06, "loss": 0.0241, "num_input_tokens_seen": 9572528, "step": 19450 }, { "epoch": 2.567638907219216, "grad_norm": 0.028719462454319, "learning_rate": 1.1270845118927304e-06, "loss": 0.0002, "num_input_tokens_seen": 9575152, "step": 19455 }, { "epoch": 2.5682987989969646, "grad_norm": 0.027058910578489304, "learning_rate": 1.1266275356873933e-06, "loss": 0.0428, "num_input_tokens_seen": 9577712, "step": 19460 }, { "epoch": 2.568958690774713, "grad_norm": 0.001484107575379312, "learning_rate": 1.1261705326062792e-06, "loss": 0.001, "num_input_tokens_seen": 9579952, "step": 19465 }, { "epoch": 2.5696185825524616, "grad_norm": 0.010305678471922874, "learning_rate": 1.1257135027463831e-06, "loss": 0.1439, "num_input_tokens_seen": 9582512, "step": 19470 }, { "epoch": 2.57027847433021, "grad_norm": 0.09567421674728394, "learning_rate": 1.1252564462047063e-06, "loss": 0.0001, "num_input_tokens_seen": 9584624, "step": 19475 }, { "epoch": 2.570938366107958, "grad_norm": 0.04122396185994148, "learning_rate": 1.124799363078256e-06, "loss": 0.0707, "num_input_tokens_seen": 9587056, "step": 19480 }, { "epoch": 2.571598257885707, "grad_norm": 0.05378778278827667, "learning_rate": 1.1243422534640443e-06, "loss": 0.0002, "num_input_tokens_seen": 9589232, "step": 19485 }, { "epoch": 2.572258149663455, "grad_norm": 0.04564797505736351, "learning_rate": 1.12388511745909e-06, "loss": 0.0706, "num_input_tokens_seen": 9591792, "step": 19490 }, { "epoch": 2.572918041441204, "grad_norm": 0.018719684332609177, "learning_rate": 1.1234279551604164e-06, "loss": 0.0012, "num_input_tokens_seen": 9594352, "step": 19495 }, { "epoch": 2.573577933218952, "grad_norm": 0.017055541276931763, "learning_rate": 1.1229707666650531e-06, "loss": 0.0584, "num_input_tokens_seen": 9597168, "step": 19500 }, { "epoch": 2.5742378249967004, "grad_norm": 0.024933185428380966, "learning_rate": 1.1225135520700355e-06, "loss": 0.0311, "num_input_tokens_seen": 9599728, "step": 19505 }, { "epoch": 2.574897716774449, "grad_norm": 0.09410513937473297, "learning_rate": 1.122056311472403e-06, "loss": 0.0561, "num_input_tokens_seen": 9602096, "step": 19510 }, { "epoch": 2.5755576085521974, "grad_norm": 0.4654831886291504, "learning_rate": 1.121599044969203e-06, "loss": 0.0667, "num_input_tokens_seen": 9604464, "step": 19515 }, { "epoch": 2.576217500329946, "grad_norm": 0.1882486641407013, "learning_rate": 1.1211417526574858e-06, "loss": 0.0006, "num_input_tokens_seen": 9606896, "step": 19520 }, { "epoch": 2.5768773921076944, "grad_norm": 3.377927303314209, "learning_rate": 1.1206844346343089e-06, "loss": 0.0013, "num_input_tokens_seen": 9609520, "step": 19525 }, { "epoch": 2.5775372838854427, "grad_norm": 0.19885686039924622, "learning_rate": 1.1202270909967347e-06, "loss": 0.0006, "num_input_tokens_seen": 9612336, "step": 19530 }, { "epoch": 2.5781971756631914, "grad_norm": 0.0585622675716877, "learning_rate": 1.119769721841831e-06, "loss": 0.0861, "num_input_tokens_seen": 9614320, "step": 19535 }, { "epoch": 2.5788570674409397, "grad_norm": 0.6341502666473389, "learning_rate": 1.119312327266671e-06, "loss": 0.0004, "num_input_tokens_seen": 9617136, "step": 19540 }, { "epoch": 2.5795169592186884, "grad_norm": 0.005355259403586388, "learning_rate": 1.1188549073683338e-06, "loss": 0.0001, "num_input_tokens_seen": 9619760, "step": 19545 }, { "epoch": 2.5801768509964367, "grad_norm": 0.014456301927566528, "learning_rate": 1.1183974622439032e-06, "loss": 0.0, "num_input_tokens_seen": 9622320, "step": 19550 }, { "epoch": 2.580836742774185, "grad_norm": 0.03556323051452637, "learning_rate": 1.1179399919904683e-06, "loss": 0.0006, "num_input_tokens_seen": 9624880, "step": 19555 }, { "epoch": 2.5814966345519332, "grad_norm": 19.87205696105957, "learning_rate": 1.1174824967051244e-06, "loss": 0.0987, "num_input_tokens_seen": 9627312, "step": 19560 }, { "epoch": 2.582156526329682, "grad_norm": 0.03862188756465912, "learning_rate": 1.117024976484971e-06, "loss": 0.0, "num_input_tokens_seen": 9630000, "step": 19565 }, { "epoch": 2.5828164181074302, "grad_norm": 14.911026954650879, "learning_rate": 1.1165674314271142e-06, "loss": 0.1392, "num_input_tokens_seen": 9632432, "step": 19570 }, { "epoch": 2.583476309885179, "grad_norm": 0.04838300123810768, "learning_rate": 1.1161098616286641e-06, "loss": 0.1115, "num_input_tokens_seen": 9634992, "step": 19575 }, { "epoch": 2.5841362016629272, "grad_norm": 0.04838849604129791, "learning_rate": 1.1156522671867366e-06, "loss": 0.0023, "num_input_tokens_seen": 9637296, "step": 19580 }, { "epoch": 2.5847960934406755, "grad_norm": 0.026173245161771774, "learning_rate": 1.1151946481984528e-06, "loss": 0.0005, "num_input_tokens_seen": 9639664, "step": 19585 }, { "epoch": 2.5854559852184242, "grad_norm": 0.006514672189950943, "learning_rate": 1.1147370047609391e-06, "loss": 0.0971, "num_input_tokens_seen": 9642224, "step": 19590 }, { "epoch": 2.5861158769961725, "grad_norm": 121.60194396972656, "learning_rate": 1.1142793369713273e-06, "loss": 0.1041, "num_input_tokens_seen": 9644592, "step": 19595 }, { "epoch": 2.5867757687739212, "grad_norm": 0.08616314083337784, "learning_rate": 1.1138216449267536e-06, "loss": 0.0555, "num_input_tokens_seen": 9647152, "step": 19600 }, { "epoch": 2.5874356605516695, "grad_norm": 0.2943209707736969, "learning_rate": 1.11336392872436e-06, "loss": 0.0006, "num_input_tokens_seen": 9649584, "step": 19605 }, { "epoch": 2.588095552329418, "grad_norm": 0.03955228999257088, "learning_rate": 1.112906188461293e-06, "loss": 0.0612, "num_input_tokens_seen": 9652080, "step": 19610 }, { "epoch": 2.5887554441071665, "grad_norm": 0.028087584301829338, "learning_rate": 1.1124484242347055e-06, "loss": 0.0002, "num_input_tokens_seen": 9654448, "step": 19615 }, { "epoch": 2.589415335884915, "grad_norm": 0.26087915897369385, "learning_rate": 1.1119906361417544e-06, "loss": 0.0007, "num_input_tokens_seen": 9657008, "step": 19620 }, { "epoch": 2.5900752276626635, "grad_norm": 4.318361282348633, "learning_rate": 1.1115328242796017e-06, "loss": 0.0618, "num_input_tokens_seen": 9659376, "step": 19625 }, { "epoch": 2.590735119440412, "grad_norm": 0.008139794692397118, "learning_rate": 1.1110749887454146e-06, "loss": 0.0002, "num_input_tokens_seen": 9662064, "step": 19630 }, { "epoch": 2.59139501121816, "grad_norm": 0.005774452816694975, "learning_rate": 1.110617129636365e-06, "loss": 0.0002, "num_input_tokens_seen": 9664112, "step": 19635 }, { "epoch": 2.592054902995909, "grad_norm": 0.0302497036755085, "learning_rate": 1.1101592470496315e-06, "loss": 0.0611, "num_input_tokens_seen": 9666288, "step": 19640 }, { "epoch": 2.592714794773657, "grad_norm": 0.07261912524700165, "learning_rate": 1.1097013410823952e-06, "loss": 0.0007, "num_input_tokens_seen": 9668528, "step": 19645 }, { "epoch": 2.593374686551406, "grad_norm": 0.1376306116580963, "learning_rate": 1.1092434118318435e-06, "loss": 0.0945, "num_input_tokens_seen": 9670832, "step": 19650 }, { "epoch": 2.594034578329154, "grad_norm": 0.09144063293933868, "learning_rate": 1.1087854593951688e-06, "loss": 0.0826, "num_input_tokens_seen": 9673008, "step": 19655 }, { "epoch": 2.5946944701069024, "grad_norm": 14.743500709533691, "learning_rate": 1.108327483869568e-06, "loss": 0.2127, "num_input_tokens_seen": 9675568, "step": 19660 }, { "epoch": 2.595354361884651, "grad_norm": 0.10582589358091354, "learning_rate": 1.1078694853522435e-06, "loss": 0.0008, "num_input_tokens_seen": 9678192, "step": 19665 }, { "epoch": 2.5960142536623994, "grad_norm": 0.0031384157482534647, "learning_rate": 1.1074114639404015e-06, "loss": 0.0001, "num_input_tokens_seen": 9680624, "step": 19670 }, { "epoch": 2.596674145440148, "grad_norm": 0.17820972204208374, "learning_rate": 1.1069534197312544e-06, "loss": 0.0003, "num_input_tokens_seen": 9683056, "step": 19675 }, { "epoch": 2.5973340372178964, "grad_norm": 12.603302001953125, "learning_rate": 1.1064953528220181e-06, "loss": 0.091, "num_input_tokens_seen": 9685616, "step": 19680 }, { "epoch": 2.5979939289956446, "grad_norm": 0.018135545775294304, "learning_rate": 1.1060372633099146e-06, "loss": 0.0002, "num_input_tokens_seen": 9688368, "step": 19685 }, { "epoch": 2.598653820773393, "grad_norm": 0.07453392446041107, "learning_rate": 1.10557915129217e-06, "loss": 0.0001, "num_input_tokens_seen": 9690672, "step": 19690 }, { "epoch": 2.5993137125511416, "grad_norm": 0.027635499835014343, "learning_rate": 1.1051210168660146e-06, "loss": 0.0007, "num_input_tokens_seen": 9692784, "step": 19695 }, { "epoch": 2.59997360432889, "grad_norm": 12.794387817382812, "learning_rate": 1.1046628601286852e-06, "loss": 0.0424, "num_input_tokens_seen": 9695216, "step": 19700 }, { "epoch": 2.6006334961066386, "grad_norm": 14.300383567810059, "learning_rate": 1.1042046811774213e-06, "loss": 0.0569, "num_input_tokens_seen": 9697456, "step": 19705 }, { "epoch": 2.601293387884387, "grad_norm": 0.02011864259839058, "learning_rate": 1.1037464801094684e-06, "loss": 0.0568, "num_input_tokens_seen": 9699760, "step": 19710 }, { "epoch": 2.601953279662135, "grad_norm": 0.663529634475708, "learning_rate": 1.1032882570220764e-06, "loss": 0.0536, "num_input_tokens_seen": 9702128, "step": 19715 }, { "epoch": 2.602613171439884, "grad_norm": 0.014893081970512867, "learning_rate": 1.1028300120124997e-06, "loss": 0.0003, "num_input_tokens_seen": 9705008, "step": 19720 }, { "epoch": 2.603273063217632, "grad_norm": 0.02787351794540882, "learning_rate": 1.1023717451779977e-06, "loss": 0.0, "num_input_tokens_seen": 9707312, "step": 19725 }, { "epoch": 2.603932954995381, "grad_norm": 0.015048121102154255, "learning_rate": 1.1019134566158341e-06, "loss": 0.0002, "num_input_tokens_seen": 9709936, "step": 19730 }, { "epoch": 2.604592846773129, "grad_norm": 0.007719477638602257, "learning_rate": 1.1014551464232773e-06, "loss": 0.0001, "num_input_tokens_seen": 9712304, "step": 19735 }, { "epoch": 2.6052527385508775, "grad_norm": 21.70554542541504, "learning_rate": 1.1009968146976003e-06, "loss": 0.0626, "num_input_tokens_seen": 9714736, "step": 19740 }, { "epoch": 2.605912630328626, "grad_norm": 0.0036755255423486233, "learning_rate": 1.100538461536081e-06, "loss": 0.1001, "num_input_tokens_seen": 9717360, "step": 19745 }, { "epoch": 2.6065725221063745, "grad_norm": 81.93183898925781, "learning_rate": 1.1000800870360012e-06, "loss": 0.0181, "num_input_tokens_seen": 9719984, "step": 19750 }, { "epoch": 2.607232413884123, "grad_norm": 25.536109924316406, "learning_rate": 1.0996216912946472e-06, "loss": 0.0023, "num_input_tokens_seen": 9722352, "step": 19755 }, { "epoch": 2.6078923056618715, "grad_norm": 0.0018935500411316752, "learning_rate": 1.099163274409311e-06, "loss": 0.0001, "num_input_tokens_seen": 9724848, "step": 19760 }, { "epoch": 2.6085521974396197, "grad_norm": 42.559783935546875, "learning_rate": 1.098704836477288e-06, "loss": 0.1899, "num_input_tokens_seen": 9727472, "step": 19765 }, { "epoch": 2.6092120892173685, "grad_norm": 0.014334982261061668, "learning_rate": 1.098246377595878e-06, "loss": 0.0, "num_input_tokens_seen": 9729840, "step": 19770 }, { "epoch": 2.6098719809951167, "grad_norm": 0.6818047761917114, "learning_rate": 1.097787897862386e-06, "loss": 0.0006, "num_input_tokens_seen": 9732592, "step": 19775 }, { "epoch": 2.6105318727728655, "grad_norm": 0.013156001456081867, "learning_rate": 1.097329397374121e-06, "loss": 0.1143, "num_input_tokens_seen": 9734960, "step": 19780 }, { "epoch": 2.6111917645506137, "grad_norm": 0.11573667824268341, "learning_rate": 1.0968708762283955e-06, "loss": 0.0001, "num_input_tokens_seen": 9737328, "step": 19785 }, { "epoch": 2.611851656328362, "grad_norm": 30.16448211669922, "learning_rate": 1.0964123345225285e-06, "loss": 0.1288, "num_input_tokens_seen": 9739888, "step": 19790 }, { "epoch": 2.6125115481061107, "grad_norm": 0.019509224221110344, "learning_rate": 1.0959537723538414e-06, "loss": 0.1315, "num_input_tokens_seen": 9742576, "step": 19795 }, { "epoch": 2.613171439883859, "grad_norm": 0.02313673309981823, "learning_rate": 1.0954951898196614e-06, "loss": 0.014, "num_input_tokens_seen": 9745264, "step": 19800 }, { "epoch": 2.6138313316616077, "grad_norm": 20.099658966064453, "learning_rate": 1.0950365870173186e-06, "loss": 0.1164, "num_input_tokens_seen": 9747568, "step": 19805 }, { "epoch": 2.614491223439356, "grad_norm": 0.01467086561024189, "learning_rate": 1.0945779640441484e-06, "loss": 0.0801, "num_input_tokens_seen": 9750192, "step": 19810 }, { "epoch": 2.6151511152171043, "grad_norm": 0.027691079303622246, "learning_rate": 1.0941193209974902e-06, "loss": 0.0024, "num_input_tokens_seen": 9752752, "step": 19815 }, { "epoch": 2.6158110069948526, "grad_norm": 0.1043350100517273, "learning_rate": 1.0936606579746877e-06, "loss": 0.0005, "num_input_tokens_seen": 9755504, "step": 19820 }, { "epoch": 2.6164708987726013, "grad_norm": 39.8645133972168, "learning_rate": 1.0932019750730888e-06, "loss": 0.0253, "num_input_tokens_seen": 9757936, "step": 19825 }, { "epoch": 2.6171307905503496, "grad_norm": 0.016320737078785896, "learning_rate": 1.0927432723900455e-06, "loss": 0.0, "num_input_tokens_seen": 9760368, "step": 19830 }, { "epoch": 2.6177906823280983, "grad_norm": 0.1097755879163742, "learning_rate": 1.0922845500229143e-06, "loss": 0.0281, "num_input_tokens_seen": 9762672, "step": 19835 }, { "epoch": 2.6184505741058466, "grad_norm": 0.01819690689444542, "learning_rate": 1.0918258080690557e-06, "loss": 0.0326, "num_input_tokens_seen": 9765040, "step": 19840 }, { "epoch": 2.619110465883595, "grad_norm": 0.027937527745962143, "learning_rate": 1.0913670466258343e-06, "loss": 0.0002, "num_input_tokens_seen": 9767536, "step": 19845 }, { "epoch": 2.6197703576613436, "grad_norm": 0.02201470360159874, "learning_rate": 1.090908265790619e-06, "loss": 0.0271, "num_input_tokens_seen": 9770032, "step": 19850 }, { "epoch": 2.620430249439092, "grad_norm": 34.935428619384766, "learning_rate": 1.0904494656607824e-06, "loss": 0.0798, "num_input_tokens_seen": 9772656, "step": 19855 }, { "epoch": 2.6210901412168406, "grad_norm": 18.851734161376953, "learning_rate": 1.0899906463337016e-06, "loss": 0.0459, "num_input_tokens_seen": 9775408, "step": 19860 }, { "epoch": 2.621750032994589, "grad_norm": 0.2432224303483963, "learning_rate": 1.0895318079067576e-06, "loss": 0.0001, "num_input_tokens_seen": 9777712, "step": 19865 }, { "epoch": 2.622409924772337, "grad_norm": 0.2236015647649765, "learning_rate": 1.0890729504773359e-06, "loss": 0.1895, "num_input_tokens_seen": 9780144, "step": 19870 }, { "epoch": 2.623069816550086, "grad_norm": 0.05861254781484604, "learning_rate": 1.0886140741428257e-06, "loss": 0.0005, "num_input_tokens_seen": 9782960, "step": 19875 }, { "epoch": 2.623729708327834, "grad_norm": 0.005046785343438387, "learning_rate": 1.08815517900062e-06, "loss": 0.0887, "num_input_tokens_seen": 9785136, "step": 19880 }, { "epoch": 2.624389600105583, "grad_norm": 0.01770644262433052, "learning_rate": 1.0876962651481159e-06, "loss": 0.0035, "num_input_tokens_seen": 9787696, "step": 19885 }, { "epoch": 2.625049491883331, "grad_norm": 0.008962863124907017, "learning_rate": 1.0872373326827143e-06, "loss": 0.0009, "num_input_tokens_seen": 9790192, "step": 19890 }, { "epoch": 2.6257093836610794, "grad_norm": 0.0033157370053231716, "learning_rate": 1.0867783817018207e-06, "loss": 0.0338, "num_input_tokens_seen": 9792752, "step": 19895 }, { "epoch": 2.626369275438828, "grad_norm": 28.182462692260742, "learning_rate": 1.086319412302844e-06, "loss": 0.1192, "num_input_tokens_seen": 9795376, "step": 19900 }, { "epoch": 2.6270291672165764, "grad_norm": 0.005671774502843618, "learning_rate": 1.085860424583197e-06, "loss": 0.0018, "num_input_tokens_seen": 9797872, "step": 19905 }, { "epoch": 2.627689058994325, "grad_norm": 0.11161317676305771, "learning_rate": 1.0854014186402968e-06, "loss": 0.0488, "num_input_tokens_seen": 9800432, "step": 19910 }, { "epoch": 2.6283489507720734, "grad_norm": 0.004430611617863178, "learning_rate": 1.0849423945715637e-06, "loss": 0.0001, "num_input_tokens_seen": 9802992, "step": 19915 }, { "epoch": 2.6290088425498217, "grad_norm": 0.12796758115291595, "learning_rate": 1.0844833524744226e-06, "loss": 0.0881, "num_input_tokens_seen": 9805424, "step": 19920 }, { "epoch": 2.6296687343275704, "grad_norm": 0.013856647536158562, "learning_rate": 1.0840242924463016e-06, "loss": 0.0001, "num_input_tokens_seen": 9808048, "step": 19925 }, { "epoch": 2.6303286261053187, "grad_norm": 0.07488207519054413, "learning_rate": 1.0835652145846335e-06, "loss": 0.0007, "num_input_tokens_seen": 9810608, "step": 19930 }, { "epoch": 2.6309885178830674, "grad_norm": 24.79937744140625, "learning_rate": 1.0831061189868531e-06, "loss": 0.1537, "num_input_tokens_seen": 9812848, "step": 19935 }, { "epoch": 2.6316484096608157, "grad_norm": 20.282337188720703, "learning_rate": 1.0826470057504008e-06, "loss": 0.1011, "num_input_tokens_seen": 9815088, "step": 19940 }, { "epoch": 2.632308301438564, "grad_norm": 0.022686759009957314, "learning_rate": 1.0821878749727204e-06, "loss": 0.0767, "num_input_tokens_seen": 9817520, "step": 19945 }, { "epoch": 2.6329681932163123, "grad_norm": 0.07471811026334763, "learning_rate": 1.0817287267512583e-06, "loss": 0.0003, "num_input_tokens_seen": 9820080, "step": 19950 }, { "epoch": 2.633628084994061, "grad_norm": 0.13609452545642853, "learning_rate": 1.0812695611834664e-06, "loss": 0.1129, "num_input_tokens_seen": 9822320, "step": 19955 }, { "epoch": 2.6342879767718093, "grad_norm": 0.20238642394542694, "learning_rate": 1.0808103783667981e-06, "loss": 0.0007, "num_input_tokens_seen": 9824432, "step": 19960 }, { "epoch": 2.634947868549558, "grad_norm": 0.023685157299041748, "learning_rate": 1.0803511783987122e-06, "loss": 0.0001, "num_input_tokens_seen": 9826992, "step": 19965 }, { "epoch": 2.6356077603273063, "grad_norm": 0.08244713395833969, "learning_rate": 1.0798919613766707e-06, "loss": 0.0016, "num_input_tokens_seen": 9829424, "step": 19970 }, { "epoch": 2.6362676521050545, "grad_norm": 24.1247501373291, "learning_rate": 1.079432727398139e-06, "loss": 0.1057, "num_input_tokens_seen": 9832048, "step": 19975 }, { "epoch": 2.6369275438828033, "grad_norm": 0.03990231081843376, "learning_rate": 1.078973476560586e-06, "loss": 0.0002, "num_input_tokens_seen": 9834352, "step": 19980 }, { "epoch": 2.6375874356605515, "grad_norm": 0.023310460150241852, "learning_rate": 1.0785142089614843e-06, "loss": 0.0002, "num_input_tokens_seen": 9836784, "step": 19985 }, { "epoch": 2.6382473274383003, "grad_norm": 0.03536645323038101, "learning_rate": 1.0780549246983105e-06, "loss": 0.0017, "num_input_tokens_seen": 9839152, "step": 19990 }, { "epoch": 2.6389072192160485, "grad_norm": 0.6347410082817078, "learning_rate": 1.077595623868544e-06, "loss": 0.0005, "num_input_tokens_seen": 9841712, "step": 19995 }, { "epoch": 2.639567110993797, "grad_norm": 0.0031490796245634556, "learning_rate": 1.0771363065696684e-06, "loss": 0.1022, "num_input_tokens_seen": 9844144, "step": 20000 }, { "epoch": 2.6402270027715455, "grad_norm": 0.06542643159627914, "learning_rate": 1.0766769728991705e-06, "loss": 0.0979, "num_input_tokens_seen": 9846512, "step": 20005 }, { "epoch": 2.640886894549294, "grad_norm": 0.1036391630768776, "learning_rate": 1.0762176229545398e-06, "loss": 0.0752, "num_input_tokens_seen": 9849136, "step": 20010 }, { "epoch": 2.6415467863270425, "grad_norm": 0.3355109691619873, "learning_rate": 1.0757582568332711e-06, "loss": 0.0312, "num_input_tokens_seen": 9851504, "step": 20015 }, { "epoch": 2.642206678104791, "grad_norm": 0.05988360196352005, "learning_rate": 1.0752988746328607e-06, "loss": 0.0831, "num_input_tokens_seen": 9853872, "step": 20020 }, { "epoch": 2.642866569882539, "grad_norm": 0.08277001231908798, "learning_rate": 1.0748394764508095e-06, "loss": 0.0416, "num_input_tokens_seen": 9856240, "step": 20025 }, { "epoch": 2.643526461660288, "grad_norm": 0.2739223837852478, "learning_rate": 1.0743800623846213e-06, "loss": 0.0012, "num_input_tokens_seen": 9858480, "step": 20030 }, { "epoch": 2.644186353438036, "grad_norm": 0.025710877031087875, "learning_rate": 1.0739206325318038e-06, "loss": 0.0004, "num_input_tokens_seen": 9860784, "step": 20035 }, { "epoch": 2.644846245215785, "grad_norm": 0.0025923114735633135, "learning_rate": 1.0734611869898668e-06, "loss": 0.0001, "num_input_tokens_seen": 9863408, "step": 20040 }, { "epoch": 2.645506136993533, "grad_norm": 0.05023697018623352, "learning_rate": 1.0730017258563253e-06, "loss": 0.0003, "num_input_tokens_seen": 9865840, "step": 20045 }, { "epoch": 2.6461660287712814, "grad_norm": 0.00368337519466877, "learning_rate": 1.0725422492286957e-06, "loss": 0.0004, "num_input_tokens_seen": 9868208, "step": 20050 }, { "epoch": 2.64682592054903, "grad_norm": 0.0010652344208210707, "learning_rate": 1.0720827572044995e-06, "loss": 0.0818, "num_input_tokens_seen": 9870320, "step": 20055 }, { "epoch": 2.6474858123267784, "grad_norm": 0.0436105877161026, "learning_rate": 1.0716232498812598e-06, "loss": 0.0457, "num_input_tokens_seen": 9872752, "step": 20060 }, { "epoch": 2.648145704104527, "grad_norm": 0.10467518121004105, "learning_rate": 1.0711637273565037e-06, "loss": 0.0002, "num_input_tokens_seen": 9875376, "step": 20065 }, { "epoch": 2.6488055958822754, "grad_norm": 0.7983324527740479, "learning_rate": 1.0707041897277623e-06, "loss": 0.002, "num_input_tokens_seen": 9877680, "step": 20070 }, { "epoch": 2.6494654876600237, "grad_norm": 0.0028939221519976854, "learning_rate": 1.0702446370925682e-06, "loss": 0.1485, "num_input_tokens_seen": 9880176, "step": 20075 }, { "epoch": 2.650125379437772, "grad_norm": 0.017617272213101387, "learning_rate": 1.069785069548459e-06, "loss": 0.0, "num_input_tokens_seen": 9882672, "step": 20080 }, { "epoch": 2.6507852712155207, "grad_norm": 0.053677234798669815, "learning_rate": 1.0693254871929737e-06, "loss": 0.0884, "num_input_tokens_seen": 9885168, "step": 20085 }, { "epoch": 2.6514451629932694, "grad_norm": 14.307182312011719, "learning_rate": 1.068865890123656e-06, "loss": 0.0369, "num_input_tokens_seen": 9887728, "step": 20090 }, { "epoch": 2.6521050547710177, "grad_norm": 146.76573181152344, "learning_rate": 1.068406278438052e-06, "loss": 0.0975, "num_input_tokens_seen": 9889904, "step": 20095 }, { "epoch": 2.652764946548766, "grad_norm": 0.7040823101997375, "learning_rate": 1.0679466522337102e-06, "loss": 0.0004, "num_input_tokens_seen": 9892272, "step": 20100 }, { "epoch": 2.653424838326514, "grad_norm": 0.13565072417259216, "learning_rate": 1.0674870116081838e-06, "loss": 0.0012, "num_input_tokens_seen": 9894832, "step": 20105 }, { "epoch": 2.654084730104263, "grad_norm": 1.549350619316101, "learning_rate": 1.067027356659028e-06, "loss": 0.0145, "num_input_tokens_seen": 9897520, "step": 20110 }, { "epoch": 2.654744621882011, "grad_norm": 0.023634739220142365, "learning_rate": 1.066567687483801e-06, "loss": 0.0002, "num_input_tokens_seen": 9899696, "step": 20115 }, { "epoch": 2.65540451365976, "grad_norm": 1.0513437986373901, "learning_rate": 1.0661080041800642e-06, "loss": 0.0003, "num_input_tokens_seen": 9902448, "step": 20120 }, { "epoch": 2.656064405437508, "grad_norm": 0.01823163591325283, "learning_rate": 1.0656483068453828e-06, "loss": 0.0975, "num_input_tokens_seen": 9904880, "step": 20125 }, { "epoch": 2.6567242972152565, "grad_norm": 16.00564956665039, "learning_rate": 1.065188595577323e-06, "loss": 0.0077, "num_input_tokens_seen": 9907376, "step": 20130 }, { "epoch": 2.657384188993005, "grad_norm": 0.001400611363351345, "learning_rate": 1.0647288704734563e-06, "loss": 0.0001, "num_input_tokens_seen": 9910000, "step": 20135 }, { "epoch": 2.6580440807707535, "grad_norm": 54.8060188293457, "learning_rate": 1.0642691316313556e-06, "loss": 0.0893, "num_input_tokens_seen": 9912368, "step": 20140 }, { "epoch": 2.658703972548502, "grad_norm": 38.158226013183594, "learning_rate": 1.0638093791485964e-06, "loss": 0.057, "num_input_tokens_seen": 9914672, "step": 20145 }, { "epoch": 2.6593638643262505, "grad_norm": 0.7230840921401978, "learning_rate": 1.0633496131227593e-06, "loss": 0.0003, "num_input_tokens_seen": 9917104, "step": 20150 }, { "epoch": 2.6600237561039988, "grad_norm": 0.07111620157957077, "learning_rate": 1.0628898336514252e-06, "loss": 0.0001, "num_input_tokens_seen": 9919536, "step": 20155 }, { "epoch": 2.6606836478817475, "grad_norm": 0.0019288670737296343, "learning_rate": 1.0624300408321795e-06, "loss": 0.0844, "num_input_tokens_seen": 9921712, "step": 20160 }, { "epoch": 2.6613435396594958, "grad_norm": 0.14772047102451324, "learning_rate": 1.0619702347626098e-06, "loss": 0.0596, "num_input_tokens_seen": 9924144, "step": 20165 }, { "epoch": 2.6620034314372445, "grad_norm": 0.6238264441490173, "learning_rate": 1.0615104155403063e-06, "loss": 0.0002, "num_input_tokens_seen": 9926640, "step": 20170 }, { "epoch": 2.6626633232149928, "grad_norm": 0.013693057000637054, "learning_rate": 1.0610505832628626e-06, "loss": 0.1212, "num_input_tokens_seen": 9929072, "step": 20175 }, { "epoch": 2.663323214992741, "grad_norm": 17.43290138244629, "learning_rate": 1.0605907380278745e-06, "loss": 0.0673, "num_input_tokens_seen": 9931376, "step": 20180 }, { "epoch": 2.6639831067704898, "grad_norm": 0.027616208419203758, "learning_rate": 1.0601308799329413e-06, "loss": 0.0001, "num_input_tokens_seen": 9933552, "step": 20185 }, { "epoch": 2.664642998548238, "grad_norm": 40.22153854370117, "learning_rate": 1.0596710090756641e-06, "loss": 0.1238, "num_input_tokens_seen": 9935728, "step": 20190 }, { "epoch": 2.6653028903259868, "grad_norm": 0.2736246883869171, "learning_rate": 1.0592111255536478e-06, "loss": 0.0167, "num_input_tokens_seen": 9937968, "step": 20195 }, { "epoch": 2.665962782103735, "grad_norm": 63.869476318359375, "learning_rate": 1.0587512294644982e-06, "loss": 0.1255, "num_input_tokens_seen": 9940272, "step": 20200 }, { "epoch": 2.6666226738814833, "grad_norm": 15.267470359802246, "learning_rate": 1.0582913209058257e-06, "loss": 0.0168, "num_input_tokens_seen": 9942768, "step": 20205 }, { "epoch": 2.667282565659232, "grad_norm": 152.2574462890625, "learning_rate": 1.0578313999752427e-06, "loss": 0.2539, "num_input_tokens_seen": 9945456, "step": 20210 }, { "epoch": 2.6679424574369803, "grad_norm": 0.024640629068017006, "learning_rate": 1.0573714667703638e-06, "loss": 0.001, "num_input_tokens_seen": 9948144, "step": 20215 }, { "epoch": 2.668602349214729, "grad_norm": 0.020223217085003853, "learning_rate": 1.0569115213888067e-06, "loss": 0.0813, "num_input_tokens_seen": 9950832, "step": 20220 }, { "epoch": 2.6692622409924773, "grad_norm": 14.030696868896484, "learning_rate": 1.0564515639281911e-06, "loss": 0.1883, "num_input_tokens_seen": 9953392, "step": 20225 }, { "epoch": 2.6699221327702256, "grad_norm": 0.02479293756186962, "learning_rate": 1.0559915944861397e-06, "loss": 0.0004, "num_input_tokens_seen": 9956016, "step": 20230 }, { "epoch": 2.670582024547974, "grad_norm": 0.21013309061527252, "learning_rate": 1.0555316131602778e-06, "loss": 0.0942, "num_input_tokens_seen": 9958512, "step": 20235 }, { "epoch": 2.6712419163257226, "grad_norm": 0.3462769389152527, "learning_rate": 1.0550716200482335e-06, "loss": 0.0016, "num_input_tokens_seen": 9961008, "step": 20240 }, { "epoch": 2.671901808103471, "grad_norm": 0.026164565235376358, "learning_rate": 1.0546116152476366e-06, "loss": 0.0428, "num_input_tokens_seen": 9963568, "step": 20245 }, { "epoch": 2.6725616998812196, "grad_norm": 0.0984434112906456, "learning_rate": 1.0541515988561195e-06, "loss": 0.1266, "num_input_tokens_seen": 9965808, "step": 20250 }, { "epoch": 2.673221591658968, "grad_norm": 0.08318020403385162, "learning_rate": 1.053691570971318e-06, "loss": 0.0373, "num_input_tokens_seen": 9968304, "step": 20255 }, { "epoch": 2.673881483436716, "grad_norm": 0.2922511398792267, "learning_rate": 1.0532315316908691e-06, "loss": 0.0505, "num_input_tokens_seen": 9970608, "step": 20260 }, { "epoch": 2.674541375214465, "grad_norm": 0.10788524895906448, "learning_rate": 1.0527714811124132e-06, "loss": 0.0933, "num_input_tokens_seen": 9972976, "step": 20265 }, { "epoch": 2.675201266992213, "grad_norm": 0.3631938695907593, "learning_rate": 1.0523114193335926e-06, "loss": 0.0012, "num_input_tokens_seen": 9975472, "step": 20270 }, { "epoch": 2.675861158769962, "grad_norm": 0.060309797525405884, "learning_rate": 1.051851346452052e-06, "loss": 0.1191, "num_input_tokens_seen": 9977840, "step": 20275 }, { "epoch": 2.67652105054771, "grad_norm": 0.09282801300287247, "learning_rate": 1.0513912625654386e-06, "loss": 0.0011, "num_input_tokens_seen": 9980080, "step": 20280 }, { "epoch": 2.6771809423254584, "grad_norm": 0.009397115558385849, "learning_rate": 1.0509311677714016e-06, "loss": 0.0008, "num_input_tokens_seen": 9982384, "step": 20285 }, { "epoch": 2.677840834103207, "grad_norm": 0.13344921171665192, "learning_rate": 1.050471062167594e-06, "loss": 0.0005, "num_input_tokens_seen": 9985136, "step": 20290 }, { "epoch": 2.6785007258809554, "grad_norm": 0.11381205171346664, "learning_rate": 1.050010945851668e-06, "loss": 0.1043, "num_input_tokens_seen": 9987760, "step": 20295 }, { "epoch": 2.679160617658704, "grad_norm": 0.0090647516772151, "learning_rate": 1.049550818921281e-06, "loss": 0.0001, "num_input_tokens_seen": 9990320, "step": 20300 }, { "epoch": 2.6798205094364524, "grad_norm": 0.016323421150445938, "learning_rate": 1.0490906814740916e-06, "loss": 0.0004, "num_input_tokens_seen": 9992816, "step": 20305 }, { "epoch": 2.6804804012142007, "grad_norm": 0.1127217710018158, "learning_rate": 1.0486305336077609e-06, "loss": 0.0006, "num_input_tokens_seen": 9995120, "step": 20310 }, { "epoch": 2.6811402929919494, "grad_norm": 25.71272850036621, "learning_rate": 1.0481703754199513e-06, "loss": 0.1659, "num_input_tokens_seen": 9997488, "step": 20315 }, { "epoch": 2.6818001847696977, "grad_norm": 0.004593730438500643, "learning_rate": 1.047710207008328e-06, "loss": 0.0007, "num_input_tokens_seen": 9999920, "step": 20320 }, { "epoch": 2.6824600765474464, "grad_norm": 43.304931640625, "learning_rate": 1.0472500284705595e-06, "loss": 0.1558, "num_input_tokens_seen": 10002352, "step": 20325 }, { "epoch": 2.6831199683251947, "grad_norm": 0.028853746131062508, "learning_rate": 1.046789839904314e-06, "loss": 0.0008, "num_input_tokens_seen": 10004592, "step": 20330 }, { "epoch": 2.683779860102943, "grad_norm": 0.006348696071654558, "learning_rate": 1.0463296414072641e-06, "loss": 0.038, "num_input_tokens_seen": 10007024, "step": 20335 }, { "epoch": 2.6844397518806917, "grad_norm": 0.03202318400144577, "learning_rate": 1.0458694330770832e-06, "loss": 0.0877, "num_input_tokens_seen": 10009712, "step": 20340 }, { "epoch": 2.68509964365844, "grad_norm": 0.05653927102684975, "learning_rate": 1.0454092150114473e-06, "loss": 0.0001, "num_input_tokens_seen": 10012400, "step": 20345 }, { "epoch": 2.6857595354361887, "grad_norm": 0.02157328464090824, "learning_rate": 1.0449489873080344e-06, "loss": 0.0013, "num_input_tokens_seen": 10014640, "step": 20350 }, { "epoch": 2.686419427213937, "grad_norm": 0.020717937499284744, "learning_rate": 1.0444887500645244e-06, "loss": 0.0002, "num_input_tokens_seen": 10017200, "step": 20355 }, { "epoch": 2.6870793189916853, "grad_norm": 0.23628821969032288, "learning_rate": 1.0440285033785994e-06, "loss": 0.0006, "num_input_tokens_seen": 10019888, "step": 20360 }, { "epoch": 2.6877392107694336, "grad_norm": 0.04174189642071724, "learning_rate": 1.0435682473479433e-06, "loss": 0.0873, "num_input_tokens_seen": 10022064, "step": 20365 }, { "epoch": 2.6883991025471823, "grad_norm": 0.04619302600622177, "learning_rate": 1.0431079820702425e-06, "loss": 0.0001, "num_input_tokens_seen": 10024496, "step": 20370 }, { "epoch": 2.6890589943249306, "grad_norm": 0.17279821634292603, "learning_rate": 1.042647707643184e-06, "loss": 0.0003, "num_input_tokens_seen": 10027056, "step": 20375 }, { "epoch": 2.6897188861026793, "grad_norm": 24.22002601623535, "learning_rate": 1.0421874241644591e-06, "loss": 0.0535, "num_input_tokens_seen": 10029616, "step": 20380 }, { "epoch": 2.6903787778804276, "grad_norm": 0.02293427661061287, "learning_rate": 1.0417271317317585e-06, "loss": 0.0001, "num_input_tokens_seen": 10032304, "step": 20385 }, { "epoch": 2.691038669658176, "grad_norm": 0.008679247461259365, "learning_rate": 1.0412668304427766e-06, "loss": 0.0001, "num_input_tokens_seen": 10034800, "step": 20390 }, { "epoch": 2.6916985614359246, "grad_norm": 0.5604121685028076, "learning_rate": 1.0408065203952086e-06, "loss": 0.0005, "num_input_tokens_seen": 10037424, "step": 20395 }, { "epoch": 2.692358453213673, "grad_norm": 0.049470458179712296, "learning_rate": 1.040346201686752e-06, "loss": 0.0002, "num_input_tokens_seen": 10039984, "step": 20400 }, { "epoch": 2.6930183449914216, "grad_norm": 0.003851136425510049, "learning_rate": 1.0398858744151067e-06, "loss": 0.0596, "num_input_tokens_seen": 10042672, "step": 20405 }, { "epoch": 2.69367823676917, "grad_norm": 0.01271560974419117, "learning_rate": 1.0394255386779728e-06, "loss": 0.0004, "num_input_tokens_seen": 10044912, "step": 20410 }, { "epoch": 2.694338128546918, "grad_norm": 0.20090311765670776, "learning_rate": 1.0389651945730545e-06, "loss": 0.0016, "num_input_tokens_seen": 10047216, "step": 20415 }, { "epoch": 2.694998020324667, "grad_norm": 0.0027731643058359623, "learning_rate": 1.0385048421980554e-06, "loss": 0.1896, "num_input_tokens_seen": 10049648, "step": 20420 }, { "epoch": 2.695657912102415, "grad_norm": 0.42517948150634766, "learning_rate": 1.0380444816506822e-06, "loss": 0.0583, "num_input_tokens_seen": 10052208, "step": 20425 }, { "epoch": 2.696317803880164, "grad_norm": 0.04374484345316887, "learning_rate": 1.0375841130286436e-06, "loss": 0.0242, "num_input_tokens_seen": 10054640, "step": 20430 }, { "epoch": 2.696977695657912, "grad_norm": 0.02089790813624859, "learning_rate": 1.0371237364296491e-06, "loss": 0.1204, "num_input_tokens_seen": 10057072, "step": 20435 }, { "epoch": 2.6976375874356604, "grad_norm": 1.9048203229904175, "learning_rate": 1.0366633519514104e-06, "loss": 0.0581, "num_input_tokens_seen": 10059376, "step": 20440 }, { "epoch": 2.698297479213409, "grad_norm": 0.02070975862443447, "learning_rate": 1.0362029596916407e-06, "loss": 0.0596, "num_input_tokens_seen": 10061936, "step": 20445 }, { "epoch": 2.6989573709911574, "grad_norm": 0.12654846906661987, "learning_rate": 1.0357425597480548e-06, "loss": 0.0001, "num_input_tokens_seen": 10064240, "step": 20450 }, { "epoch": 2.699617262768906, "grad_norm": 133.1804656982422, "learning_rate": 1.0352821522183697e-06, "loss": 0.0458, "num_input_tokens_seen": 10066608, "step": 20455 }, { "epoch": 2.7002771545466544, "grad_norm": 0.2598607838153839, "learning_rate": 1.0348217372003032e-06, "loss": 0.0203, "num_input_tokens_seen": 10068848, "step": 20460 }, { "epoch": 2.7009370463244027, "grad_norm": 0.03957448527216911, "learning_rate": 1.0343613147915748e-06, "loss": 0.0227, "num_input_tokens_seen": 10071152, "step": 20465 }, { "epoch": 2.7015969381021514, "grad_norm": 0.036427486687898636, "learning_rate": 1.0339008850899067e-06, "loss": 0.0001, "num_input_tokens_seen": 10073712, "step": 20470 }, { "epoch": 2.7022568298798997, "grad_norm": 0.44466233253479004, "learning_rate": 1.033440448193021e-06, "loss": 0.0004, "num_input_tokens_seen": 10076272, "step": 20475 }, { "epoch": 2.7029167216576484, "grad_norm": 0.05536358058452606, "learning_rate": 1.0329800041986423e-06, "loss": 0.0001, "num_input_tokens_seen": 10078448, "step": 20480 }, { "epoch": 2.7035766134353967, "grad_norm": 0.2589349150657654, "learning_rate": 1.0325195532044966e-06, "loss": 0.0009, "num_input_tokens_seen": 10081008, "step": 20485 }, { "epoch": 2.704236505213145, "grad_norm": 0.014972384087741375, "learning_rate": 1.032059095308311e-06, "loss": 0.0001, "num_input_tokens_seen": 10083312, "step": 20490 }, { "epoch": 2.7048963969908932, "grad_norm": 0.010684339329600334, "learning_rate": 1.0315986306078149e-06, "loss": 0.0, "num_input_tokens_seen": 10086192, "step": 20495 }, { "epoch": 2.705556288768642, "grad_norm": 0.005539987236261368, "learning_rate": 1.031138159200738e-06, "loss": 0.097, "num_input_tokens_seen": 10088432, "step": 20500 }, { "epoch": 2.7062161805463902, "grad_norm": 0.04544892534613609, "learning_rate": 1.0306776811848124e-06, "loss": 0.0813, "num_input_tokens_seen": 10091056, "step": 20505 }, { "epoch": 2.706876072324139, "grad_norm": 0.05907962843775749, "learning_rate": 1.030217196657771e-06, "loss": 0.0367, "num_input_tokens_seen": 10093552, "step": 20510 }, { "epoch": 2.7075359641018872, "grad_norm": 82.98628997802734, "learning_rate": 1.0297567057173486e-06, "loss": 0.0882, "num_input_tokens_seen": 10096048, "step": 20515 }, { "epoch": 2.7081958558796355, "grad_norm": 0.7463860511779785, "learning_rate": 1.0292962084612808e-06, "loss": 0.0012, "num_input_tokens_seen": 10098480, "step": 20520 }, { "epoch": 2.7088557476573842, "grad_norm": 0.016940882429480553, "learning_rate": 1.0288357049873051e-06, "loss": 0.0012, "num_input_tokens_seen": 10101360, "step": 20525 }, { "epoch": 2.7095156394351325, "grad_norm": 18.495149612426758, "learning_rate": 1.0283751953931595e-06, "loss": 0.0799, "num_input_tokens_seen": 10103856, "step": 20530 }, { "epoch": 2.7101755312128812, "grad_norm": 2.1261818408966064, "learning_rate": 1.0279146797765845e-06, "loss": 0.1165, "num_input_tokens_seen": 10106032, "step": 20535 }, { "epoch": 2.7108354229906295, "grad_norm": 42.41480255126953, "learning_rate": 1.0274541582353204e-06, "loss": 0.1146, "num_input_tokens_seen": 10108336, "step": 20540 }, { "epoch": 2.711495314768378, "grad_norm": 0.36318209767341614, "learning_rate": 1.0269936308671106e-06, "loss": 0.0007, "num_input_tokens_seen": 10111088, "step": 20545 }, { "epoch": 2.7121552065461265, "grad_norm": 3.9034619331359863, "learning_rate": 1.0265330977696977e-06, "loss": 0.0595, "num_input_tokens_seen": 10113584, "step": 20550 }, { "epoch": 2.712815098323875, "grad_norm": 0.025046294555068016, "learning_rate": 1.0260725590408273e-06, "loss": 0.0, "num_input_tokens_seen": 10116016, "step": 20555 }, { "epoch": 2.7134749901016235, "grad_norm": 0.04526593163609505, "learning_rate": 1.0256120147782445e-06, "loss": 0.0612, "num_input_tokens_seen": 10118768, "step": 20560 }, { "epoch": 2.714134881879372, "grad_norm": 1.7549163103103638, "learning_rate": 1.0251514650796975e-06, "loss": 0.0013, "num_input_tokens_seen": 10121008, "step": 20565 }, { "epoch": 2.71479477365712, "grad_norm": 0.06778989732265472, "learning_rate": 1.024690910042934e-06, "loss": 0.003, "num_input_tokens_seen": 10123760, "step": 20570 }, { "epoch": 2.715454665434869, "grad_norm": 0.07780632376670837, "learning_rate": 1.0242303497657038e-06, "loss": 0.0534, "num_input_tokens_seen": 10126128, "step": 20575 }, { "epoch": 2.716114557212617, "grad_norm": 0.05785810574889183, "learning_rate": 1.023769784345757e-06, "loss": 0.0001, "num_input_tokens_seen": 10128560, "step": 20580 }, { "epoch": 2.716774448990366, "grad_norm": 0.046384867280721664, "learning_rate": 1.0233092138808457e-06, "loss": 0.0412, "num_input_tokens_seen": 10130992, "step": 20585 }, { "epoch": 2.717434340768114, "grad_norm": 0.01017968449741602, "learning_rate": 1.0228486384687226e-06, "loss": 0.0032, "num_input_tokens_seen": 10133744, "step": 20590 }, { "epoch": 2.7180942325458624, "grad_norm": 0.09077580273151398, "learning_rate": 1.0223880582071413e-06, "loss": 0.0358, "num_input_tokens_seen": 10136112, "step": 20595 }, { "epoch": 2.718754124323611, "grad_norm": 0.011207741685211658, "learning_rate": 1.0219274731938574e-06, "loss": 0.0007, "num_input_tokens_seen": 10138352, "step": 20600 }, { "epoch": 2.7194140161013594, "grad_norm": 0.7007828950881958, "learning_rate": 1.0214668835266255e-06, "loss": 0.0695, "num_input_tokens_seen": 10140720, "step": 20605 }, { "epoch": 2.720073907879108, "grad_norm": 0.050733741372823715, "learning_rate": 1.021006289303203e-06, "loss": 0.0, "num_input_tokens_seen": 10143024, "step": 20610 }, { "epoch": 2.7207337996568564, "grad_norm": 0.011527138762176037, "learning_rate": 1.020545690621348e-06, "loss": 0.0383, "num_input_tokens_seen": 10145456, "step": 20615 }, { "epoch": 2.7213936914346046, "grad_norm": 0.3785325884819031, "learning_rate": 1.0200850875788187e-06, "loss": 0.0006, "num_input_tokens_seen": 10147440, "step": 20620 }, { "epoch": 2.722053583212353, "grad_norm": 0.1293078511953354, "learning_rate": 1.0196244802733752e-06, "loss": 0.0003, "num_input_tokens_seen": 10149808, "step": 20625 }, { "epoch": 2.7227134749901016, "grad_norm": 0.05859058350324631, "learning_rate": 1.0191638688027777e-06, "loss": 0.0002, "num_input_tokens_seen": 10152240, "step": 20630 }, { "epoch": 2.72337336676785, "grad_norm": 0.00426831329241395, "learning_rate": 1.0187032532647881e-06, "loss": 0.0, "num_input_tokens_seen": 10154800, "step": 20635 }, { "epoch": 2.7240332585455986, "grad_norm": 0.01532171294093132, "learning_rate": 1.018242633757168e-06, "loss": 0.0, "num_input_tokens_seen": 10157104, "step": 20640 }, { "epoch": 2.724693150323347, "grad_norm": 22.842222213745117, "learning_rate": 1.0177820103776814e-06, "loss": 0.1595, "num_input_tokens_seen": 10159728, "step": 20645 }, { "epoch": 2.725353042101095, "grad_norm": 0.14298970997333527, "learning_rate": 1.0173213832240918e-06, "loss": 0.0002, "num_input_tokens_seen": 10162288, "step": 20650 }, { "epoch": 2.726012933878844, "grad_norm": 0.03597806766629219, "learning_rate": 1.0168607523941637e-06, "loss": 0.0004, "num_input_tokens_seen": 10164784, "step": 20655 }, { "epoch": 2.726672825656592, "grad_norm": 0.059991996735334396, "learning_rate": 1.0164001179856635e-06, "loss": 0.0767, "num_input_tokens_seen": 10167344, "step": 20660 }, { "epoch": 2.727332717434341, "grad_norm": 0.013564787805080414, "learning_rate": 1.0159394800963565e-06, "loss": 0.0, "num_input_tokens_seen": 10169968, "step": 20665 }, { "epoch": 2.727992609212089, "grad_norm": 0.10042696446180344, "learning_rate": 1.0154788388240105e-06, "loss": 0.086, "num_input_tokens_seen": 10172400, "step": 20670 }, { "epoch": 2.7286525009898375, "grad_norm": 0.03980396315455437, "learning_rate": 1.015018194266393e-06, "loss": 0.0003, "num_input_tokens_seen": 10174768, "step": 20675 }, { "epoch": 2.729312392767586, "grad_norm": 0.02035425789654255, "learning_rate": 1.0145575465212727e-06, "loss": 0.0002, "num_input_tokens_seen": 10177136, "step": 20680 }, { "epoch": 2.7299722845453345, "grad_norm": 18.365062713623047, "learning_rate": 1.0140968956864186e-06, "loss": 0.0355, "num_input_tokens_seen": 10179312, "step": 20685 }, { "epoch": 2.730632176323083, "grad_norm": 0.055683355778455734, "learning_rate": 1.0136362418596004e-06, "loss": 0.0001, "num_input_tokens_seen": 10181872, "step": 20690 }, { "epoch": 2.7312920681008315, "grad_norm": 0.029079370200634003, "learning_rate": 1.0131755851385883e-06, "loss": 0.0874, "num_input_tokens_seen": 10184240, "step": 20695 }, { "epoch": 2.7319519598785797, "grad_norm": 0.0024721245281398296, "learning_rate": 1.012714925621154e-06, "loss": 0.0445, "num_input_tokens_seen": 10186544, "step": 20700 }, { "epoch": 2.7326118516563285, "grad_norm": 0.059845685958862305, "learning_rate": 1.012254263405069e-06, "loss": 0.0517, "num_input_tokens_seen": 10189296, "step": 20705 }, { "epoch": 2.7332717434340768, "grad_norm": 31.77506446838379, "learning_rate": 1.0117935985881048e-06, "loss": 0.08, "num_input_tokens_seen": 10191984, "step": 20710 }, { "epoch": 2.7339316352118255, "grad_norm": 0.00842016376554966, "learning_rate": 1.0113329312680352e-06, "loss": 0.0001, "num_input_tokens_seen": 10194608, "step": 20715 }, { "epoch": 2.7345915269895738, "grad_norm": 0.16895176470279694, "learning_rate": 1.0108722615426326e-06, "loss": 0.0008, "num_input_tokens_seen": 10197104, "step": 20720 }, { "epoch": 2.735251418767322, "grad_norm": 0.0018568473169580102, "learning_rate": 1.0104115895096715e-06, "loss": 0.0003, "num_input_tokens_seen": 10199536, "step": 20725 }, { "epoch": 2.7359113105450708, "grad_norm": 0.16196206212043762, "learning_rate": 1.0099509152669257e-06, "loss": 0.0002, "num_input_tokens_seen": 10202096, "step": 20730 }, { "epoch": 2.736571202322819, "grad_norm": 0.007333566900342703, "learning_rate": 1.0094902389121702e-06, "loss": 0.0002, "num_input_tokens_seen": 10204464, "step": 20735 }, { "epoch": 2.7372310941005678, "grad_norm": 0.0005558169796131551, "learning_rate": 1.0090295605431805e-06, "loss": 0.0001, "num_input_tokens_seen": 10207024, "step": 20740 }, { "epoch": 2.737890985878316, "grad_norm": 0.06266650557518005, "learning_rate": 1.0085688802577315e-06, "loss": 0.0, "num_input_tokens_seen": 10209712, "step": 20745 }, { "epoch": 2.7385508776560643, "grad_norm": 0.0023195738904178143, "learning_rate": 1.0081081981536001e-06, "loss": 0.0011, "num_input_tokens_seen": 10212144, "step": 20750 }, { "epoch": 2.7392107694338126, "grad_norm": 0.010359673760831356, "learning_rate": 1.0076475143285623e-06, "loss": 0.0938, "num_input_tokens_seen": 10214832, "step": 20755 }, { "epoch": 2.7398706612115613, "grad_norm": 0.051527973264455795, "learning_rate": 1.0071868288803948e-06, "loss": 0.0229, "num_input_tokens_seen": 10217328, "step": 20760 }, { "epoch": 2.7405305529893096, "grad_norm": 0.0074024563655257225, "learning_rate": 1.006726141906875e-06, "loss": 0.0003, "num_input_tokens_seen": 10219696, "step": 20765 }, { "epoch": 2.7411904447670583, "grad_norm": 0.0018134743440896273, "learning_rate": 1.0062654535057805e-06, "loss": 0.0504, "num_input_tokens_seen": 10222064, "step": 20770 }, { "epoch": 2.7418503365448066, "grad_norm": 27.99730682373047, "learning_rate": 1.0058047637748886e-06, "loss": 0.0955, "num_input_tokens_seen": 10224752, "step": 20775 }, { "epoch": 2.742510228322555, "grad_norm": 0.023540226742625237, "learning_rate": 1.0053440728119778e-06, "loss": 0.0611, "num_input_tokens_seen": 10227248, "step": 20780 }, { "epoch": 2.7431701201003036, "grad_norm": 0.04689393192529678, "learning_rate": 1.0048833807148263e-06, "loss": 0.0001, "num_input_tokens_seen": 10229744, "step": 20785 }, { "epoch": 2.743830011878052, "grad_norm": 0.00846624094992876, "learning_rate": 1.004422687581212e-06, "loss": 0.0001, "num_input_tokens_seen": 10232176, "step": 20790 }, { "epoch": 2.7444899036558006, "grad_norm": 21.59259605407715, "learning_rate": 1.0039619935089149e-06, "loss": 0.179, "num_input_tokens_seen": 10234608, "step": 20795 }, { "epoch": 2.745149795433549, "grad_norm": 0.129641592502594, "learning_rate": 1.0035012985957132e-06, "loss": 0.0004, "num_input_tokens_seen": 10237040, "step": 20800 }, { "epoch": 2.745809687211297, "grad_norm": 0.11951775848865509, "learning_rate": 1.0030406029393863e-06, "loss": 0.0003, "num_input_tokens_seen": 10239408, "step": 20805 }, { "epoch": 2.746469578989046, "grad_norm": 0.00737581355497241, "learning_rate": 1.0025799066377134e-06, "loss": 0.1464, "num_input_tokens_seen": 10241840, "step": 20810 }, { "epoch": 2.747129470766794, "grad_norm": 0.12172198295593262, "learning_rate": 1.0021192097884738e-06, "loss": 0.0002, "num_input_tokens_seen": 10244272, "step": 20815 }, { "epoch": 2.747789362544543, "grad_norm": 0.1808333545923233, "learning_rate": 1.0016585124894478e-06, "loss": 0.0029, "num_input_tokens_seen": 10246960, "step": 20820 }, { "epoch": 2.748449254322291, "grad_norm": 1.3367594480514526, "learning_rate": 1.0011978148384137e-06, "loss": 0.0008, "num_input_tokens_seen": 10249712, "step": 20825 }, { "epoch": 2.7491091461000394, "grad_norm": 0.05913139134645462, "learning_rate": 1.0007371169331527e-06, "loss": 0.0627, "num_input_tokens_seen": 10252400, "step": 20830 }, { "epoch": 2.749769037877788, "grad_norm": 43.06571578979492, "learning_rate": 1.0002764188714438e-06, "loss": 0.0152, "num_input_tokens_seen": 10255024, "step": 20835 }, { "epoch": 2.7504289296555364, "grad_norm": 0.11080607026815414, "learning_rate": 9.99815720751067e-07, "loss": 0.0001, "num_input_tokens_seen": 10257392, "step": 20840 }, { "epoch": 2.751088821433285, "grad_norm": 0.010921070352196693, "learning_rate": 9.993550226698021e-07, "loss": 0.0429, "num_input_tokens_seen": 10259504, "step": 20845 }, { "epoch": 2.751088821433285, "eval_loss": 0.16027498245239258, "eval_runtime": 7.7641, "eval_samples_per_second": 867.451, "eval_steps_per_second": 108.447, "num_input_tokens_seen": 10259504, "step": 20845 }, { "epoch": 2.7517487132110334, "grad_norm": 0.03583168983459473, "learning_rate": 9.988943247254293e-07, "loss": 0.0397, "num_input_tokens_seen": 10261808, "step": 20850 }, { "epoch": 2.7524086049887817, "grad_norm": 0.18788190186023712, "learning_rate": 9.984336270157277e-07, "loss": 0.038, "num_input_tokens_seen": 10264240, "step": 20855 }, { "epoch": 2.7530684967665304, "grad_norm": 0.023686319589614868, "learning_rate": 9.979729296384775e-07, "loss": 0.0004, "num_input_tokens_seen": 10266736, "step": 20860 }, { "epoch": 2.7537283885442787, "grad_norm": 39.64815902709961, "learning_rate": 9.97512232691458e-07, "loss": 0.2016, "num_input_tokens_seen": 10269488, "step": 20865 }, { "epoch": 2.7543882803220274, "grad_norm": 0.0550624318420887, "learning_rate": 9.970515362724497e-07, "loss": 0.0143, "num_input_tokens_seen": 10271920, "step": 20870 }, { "epoch": 2.7550481720997757, "grad_norm": 0.1452958583831787, "learning_rate": 9.965908404792313e-07, "loss": 0.1161, "num_input_tokens_seen": 10274672, "step": 20875 }, { "epoch": 2.755708063877524, "grad_norm": 0.05162237212061882, "learning_rate": 9.96130145409582e-07, "loss": 0.0524, "num_input_tokens_seen": 10277424, "step": 20880 }, { "epoch": 2.7563679556552723, "grad_norm": 0.006389949936419725, "learning_rate": 9.956694511612817e-07, "loss": 0.0002, "num_input_tokens_seen": 10279920, "step": 20885 }, { "epoch": 2.757027847433021, "grad_norm": 5.551244735717773, "learning_rate": 9.952087578321086e-07, "loss": 0.0058, "num_input_tokens_seen": 10282480, "step": 20890 }, { "epoch": 2.7576877392107697, "grad_norm": 21.14126968383789, "learning_rate": 9.947480655198423e-07, "loss": 0.1002, "num_input_tokens_seen": 10284976, "step": 20895 }, { "epoch": 2.758347630988518, "grad_norm": 0.1591024398803711, "learning_rate": 9.94287374322261e-07, "loss": 0.0384, "num_input_tokens_seen": 10287344, "step": 20900 }, { "epoch": 2.7590075227662663, "grad_norm": 0.01715918444097042, "learning_rate": 9.93826684337143e-07, "loss": 0.0002, "num_input_tokens_seen": 10289648, "step": 20905 }, { "epoch": 2.7596674145440145, "grad_norm": 0.01877519302070141, "learning_rate": 9.933659956622668e-07, "loss": 0.0342, "num_input_tokens_seen": 10291952, "step": 20910 }, { "epoch": 2.7603273063217633, "grad_norm": 10.56521987915039, "learning_rate": 9.929053083954096e-07, "loss": 0.0798, "num_input_tokens_seen": 10294704, "step": 20915 }, { "epoch": 2.7609871980995115, "grad_norm": 0.17608705163002014, "learning_rate": 9.924446226343496e-07, "loss": 0.0007, "num_input_tokens_seen": 10297264, "step": 20920 }, { "epoch": 2.7616470898772603, "grad_norm": 0.4646472632884979, "learning_rate": 9.91983938476864e-07, "loss": 0.077, "num_input_tokens_seen": 10299312, "step": 20925 }, { "epoch": 2.7623069816550085, "grad_norm": 22.721925735473633, "learning_rate": 9.915232560207288e-07, "loss": 0.0904, "num_input_tokens_seen": 10301616, "step": 20930 }, { "epoch": 2.762966873432757, "grad_norm": 0.05354856699705124, "learning_rate": 9.910625753637215e-07, "loss": 0.0017, "num_input_tokens_seen": 10303984, "step": 20935 }, { "epoch": 2.7636267652105055, "grad_norm": 0.7742305397987366, "learning_rate": 9.906018966036177e-07, "loss": 0.075, "num_input_tokens_seen": 10306608, "step": 20940 }, { "epoch": 2.764286656988254, "grad_norm": 0.024587344378232956, "learning_rate": 9.901412198381935e-07, "loss": 0.0004, "num_input_tokens_seen": 10309040, "step": 20945 }, { "epoch": 2.7649465487660025, "grad_norm": 0.03141823783516884, "learning_rate": 9.89680545165224e-07, "loss": 0.0003, "num_input_tokens_seen": 10311280, "step": 20950 }, { "epoch": 2.765606440543751, "grad_norm": 0.05810914188623428, "learning_rate": 9.892198726824835e-07, "loss": 0.0475, "num_input_tokens_seen": 10313776, "step": 20955 }, { "epoch": 2.766266332321499, "grad_norm": 14.158202171325684, "learning_rate": 9.887592024877478e-07, "loss": 0.0412, "num_input_tokens_seen": 10316400, "step": 20960 }, { "epoch": 2.766926224099248, "grad_norm": 0.1441594511270523, "learning_rate": 9.882985346787892e-07, "loss": 0.0002, "num_input_tokens_seen": 10319024, "step": 20965 }, { "epoch": 2.767586115876996, "grad_norm": 0.030178377404808998, "learning_rate": 9.878378693533825e-07, "loss": 0.0068, "num_input_tokens_seen": 10321584, "step": 20970 }, { "epoch": 2.768246007654745, "grad_norm": 0.8287844657897949, "learning_rate": 9.873772066092998e-07, "loss": 0.0035, "num_input_tokens_seen": 10323952, "step": 20975 }, { "epoch": 2.768905899432493, "grad_norm": 0.029004383832216263, "learning_rate": 9.869165465443132e-07, "loss": 0.0556, "num_input_tokens_seen": 10326384, "step": 20980 }, { "epoch": 2.7695657912102414, "grad_norm": 0.0934871956706047, "learning_rate": 9.864558892561955e-07, "loss": 0.0006, "num_input_tokens_seen": 10328688, "step": 20985 }, { "epoch": 2.77022568298799, "grad_norm": 17.891246795654297, "learning_rate": 9.859952348427167e-07, "loss": 0.0582, "num_input_tokens_seen": 10331312, "step": 20990 }, { "epoch": 2.7708855747657384, "grad_norm": 0.029909562319517136, "learning_rate": 9.855345834016481e-07, "loss": 0.0782, "num_input_tokens_seen": 10333680, "step": 20995 }, { "epoch": 2.771545466543487, "grad_norm": 0.11856421083211899, "learning_rate": 9.850739350307595e-07, "loss": 0.0438, "num_input_tokens_seen": 10336240, "step": 21000 }, { "epoch": 2.7722053583212354, "grad_norm": 0.02081143669784069, "learning_rate": 9.846132898278198e-07, "loss": 0.0004, "num_input_tokens_seen": 10338608, "step": 21005 }, { "epoch": 2.7728652500989837, "grad_norm": 0.00258413958363235, "learning_rate": 9.84152647890598e-07, "loss": 0.1489, "num_input_tokens_seen": 10341296, "step": 21010 }, { "epoch": 2.7735251418767324, "grad_norm": 0.01568412408232689, "learning_rate": 9.83692009316862e-07, "loss": 0.0001, "num_input_tokens_seen": 10344048, "step": 21015 }, { "epoch": 2.7741850336544807, "grad_norm": 0.010204672813415527, "learning_rate": 9.832313742043792e-07, "loss": 0.0002, "num_input_tokens_seen": 10346352, "step": 21020 }, { "epoch": 2.7748449254322294, "grad_norm": 39.50893783569336, "learning_rate": 9.827707426509155e-07, "loss": 0.0472, "num_input_tokens_seen": 10348784, "step": 21025 }, { "epoch": 2.7755048172099777, "grad_norm": 0.0015126679791137576, "learning_rate": 9.823101147542368e-07, "loss": 0.0002, "num_input_tokens_seen": 10351344, "step": 21030 }, { "epoch": 2.776164708987726, "grad_norm": 0.001049877842888236, "learning_rate": 9.818494906121084e-07, "loss": 0.0003, "num_input_tokens_seen": 10354032, "step": 21035 }, { "epoch": 2.776824600765474, "grad_norm": 0.014882220886647701, "learning_rate": 9.813888703222938e-07, "loss": 0.0003, "num_input_tokens_seen": 10356656, "step": 21040 }, { "epoch": 2.777484492543223, "grad_norm": 0.5062795281410217, "learning_rate": 9.809282539825573e-07, "loss": 0.0059, "num_input_tokens_seen": 10359280, "step": 21045 }, { "epoch": 2.778144384320971, "grad_norm": 23.332178115844727, "learning_rate": 9.804676416906605e-07, "loss": 0.0612, "num_input_tokens_seen": 10361712, "step": 21050 }, { "epoch": 2.77880427609872, "grad_norm": 0.014776087366044521, "learning_rate": 9.800070335443651e-07, "loss": 0.1814, "num_input_tokens_seen": 10364400, "step": 21055 }, { "epoch": 2.779464167876468, "grad_norm": 20.17340660095215, "learning_rate": 9.795464296414323e-07, "loss": 0.024, "num_input_tokens_seen": 10367024, "step": 21060 }, { "epoch": 2.7801240596542165, "grad_norm": 0.0953618586063385, "learning_rate": 9.790858300796214e-07, "loss": 0.0338, "num_input_tokens_seen": 10369520, "step": 21065 }, { "epoch": 2.780783951431965, "grad_norm": 0.0007482774672098458, "learning_rate": 9.78625234956692e-07, "loss": 0.004, "num_input_tokens_seen": 10372336, "step": 21070 }, { "epoch": 2.7814438432097135, "grad_norm": 29.122838973999023, "learning_rate": 9.781646443704014e-07, "loss": 0.0019, "num_input_tokens_seen": 10375024, "step": 21075 }, { "epoch": 2.782103734987462, "grad_norm": 0.10940604656934738, "learning_rate": 9.777040584185072e-07, "loss": 0.0002, "num_input_tokens_seen": 10377712, "step": 21080 }, { "epoch": 2.7827636267652105, "grad_norm": 0.010563059709966183, "learning_rate": 9.772434771987652e-07, "loss": 0.0001, "num_input_tokens_seen": 10379952, "step": 21085 }, { "epoch": 2.7834235185429588, "grad_norm": 0.3350667655467987, "learning_rate": 9.7678290080893e-07, "loss": 0.112, "num_input_tokens_seen": 10382448, "step": 21090 }, { "epoch": 2.7840834103207075, "grad_norm": 0.06816844642162323, "learning_rate": 9.76322329346756e-07, "loss": 0.0793, "num_input_tokens_seen": 10384688, "step": 21095 }, { "epoch": 2.7847433020984558, "grad_norm": 12.317327499389648, "learning_rate": 9.758617629099961e-07, "loss": 0.1091, "num_input_tokens_seen": 10387120, "step": 21100 }, { "epoch": 2.7854031938762045, "grad_norm": 0.005835865158587694, "learning_rate": 9.754012015964027e-07, "loss": 0.0003, "num_input_tokens_seen": 10389488, "step": 21105 }, { "epoch": 2.7860630856539528, "grad_norm": 0.011800551787018776, "learning_rate": 9.749406455037262e-07, "loss": 0.0003, "num_input_tokens_seen": 10392048, "step": 21110 }, { "epoch": 2.786722977431701, "grad_norm": 0.028174983337521553, "learning_rate": 9.744800947297154e-07, "loss": 0.0001, "num_input_tokens_seen": 10394416, "step": 21115 }, { "epoch": 2.7873828692094498, "grad_norm": 0.036128319799900055, "learning_rate": 9.740195493721204e-07, "loss": 0.0831, "num_input_tokens_seen": 10396912, "step": 21120 }, { "epoch": 2.788042760987198, "grad_norm": 0.18786346912384033, "learning_rate": 9.735590095286874e-07, "loss": 0.0001, "num_input_tokens_seen": 10399280, "step": 21125 }, { "epoch": 2.7887026527649468, "grad_norm": 0.5376497507095337, "learning_rate": 9.730984752971634e-07, "loss": 0.0007, "num_input_tokens_seen": 10401968, "step": 21130 }, { "epoch": 2.789362544542695, "grad_norm": 0.007869304157793522, "learning_rate": 9.726379467752937e-07, "loss": 0.0, "num_input_tokens_seen": 10404912, "step": 21135 }, { "epoch": 2.7900224363204433, "grad_norm": 0.02111724764108658, "learning_rate": 9.721774240608208e-07, "loss": 0.111, "num_input_tokens_seen": 10407600, "step": 21140 }, { "epoch": 2.790682328098192, "grad_norm": 0.016869166865944862, "learning_rate": 9.71716907251489e-07, "loss": 0.0037, "num_input_tokens_seen": 10410096, "step": 21145 }, { "epoch": 2.7913422198759403, "grad_norm": 0.8264169692993164, "learning_rate": 9.712563964450378e-07, "loss": 0.0089, "num_input_tokens_seen": 10412720, "step": 21150 }, { "epoch": 2.792002111653689, "grad_norm": 0.007358442526310682, "learning_rate": 9.707958917392094e-07, "loss": 0.0001, "num_input_tokens_seen": 10415088, "step": 21155 }, { "epoch": 2.7926620034314373, "grad_norm": 15.352611541748047, "learning_rate": 9.70335393231741e-07, "loss": 0.0325, "num_input_tokens_seen": 10417648, "step": 21160 }, { "epoch": 2.7933218952091856, "grad_norm": 0.003411094658076763, "learning_rate": 9.698749010203704e-07, "loss": 0.0001, "num_input_tokens_seen": 10420016, "step": 21165 }, { "epoch": 2.793981786986934, "grad_norm": 0.004778689704835415, "learning_rate": 9.694144152028342e-07, "loss": 0.0985, "num_input_tokens_seen": 10422704, "step": 21170 }, { "epoch": 2.7946416787646826, "grad_norm": 0.09724986553192139, "learning_rate": 9.689539358768668e-07, "loss": 0.0004, "num_input_tokens_seen": 10424752, "step": 21175 }, { "epoch": 2.795301570542431, "grad_norm": 0.01130112074315548, "learning_rate": 9.684934631402016e-07, "loss": 0.0004, "num_input_tokens_seen": 10427312, "step": 21180 }, { "epoch": 2.7959614623201796, "grad_norm": 0.010012752376496792, "learning_rate": 9.68032997090571e-07, "loss": 0.0, "num_input_tokens_seen": 10429808, "step": 21185 }, { "epoch": 2.796621354097928, "grad_norm": 0.005459084175527096, "learning_rate": 9.675725378257047e-07, "loss": 0.0, "num_input_tokens_seen": 10432368, "step": 21190 }, { "epoch": 2.797281245875676, "grad_norm": 0.09919703751802444, "learning_rate": 9.67112085443333e-07, "loss": 0.0902, "num_input_tokens_seen": 10434672, "step": 21195 }, { "epoch": 2.797941137653425, "grad_norm": 22.875263214111328, "learning_rate": 9.666516400411826e-07, "loss": 0.0641, "num_input_tokens_seen": 10437168, "step": 21200 }, { "epoch": 2.798601029431173, "grad_norm": 0.0013189826859161258, "learning_rate": 9.661912017169803e-07, "loss": 0.0, "num_input_tokens_seen": 10439472, "step": 21205 }, { "epoch": 2.799260921208922, "grad_norm": 0.0005493463831953704, "learning_rate": 9.657307705684507e-07, "loss": 0.0, "num_input_tokens_seen": 10441840, "step": 21210 }, { "epoch": 2.79992081298667, "grad_norm": 0.057411447167396545, "learning_rate": 9.652703466933167e-07, "loss": 0.0975, "num_input_tokens_seen": 10444272, "step": 21215 }, { "epoch": 2.8005807047644184, "grad_norm": 0.011182377114892006, "learning_rate": 9.648099301893003e-07, "loss": 0.0003, "num_input_tokens_seen": 10446832, "step": 21220 }, { "epoch": 2.801240596542167, "grad_norm": 0.0030372650362551212, "learning_rate": 9.643495211541212e-07, "loss": 0.135, "num_input_tokens_seen": 10449136, "step": 21225 }, { "epoch": 2.8019004883199154, "grad_norm": 0.08348898589611053, "learning_rate": 9.63889119685498e-07, "loss": 0.0473, "num_input_tokens_seen": 10451760, "step": 21230 }, { "epoch": 2.802560380097664, "grad_norm": 0.0047517018392682076, "learning_rate": 9.634287258811481e-07, "loss": 0.0, "num_input_tokens_seen": 10454000, "step": 21235 }, { "epoch": 2.8032202718754125, "grad_norm": 0.04656297713518143, "learning_rate": 9.62968339838786e-07, "loss": 0.0016, "num_input_tokens_seen": 10456304, "step": 21240 }, { "epoch": 2.8038801636531607, "grad_norm": 0.08442453294992447, "learning_rate": 9.625079616561256e-07, "loss": 0.0027, "num_input_tokens_seen": 10458800, "step": 21245 }, { "epoch": 2.8045400554309095, "grad_norm": 0.0003304073470644653, "learning_rate": 9.620475914308787e-07, "loss": 0.0001, "num_input_tokens_seen": 10461232, "step": 21250 }, { "epoch": 2.8051999472086577, "grad_norm": 1.3611310720443726, "learning_rate": 9.615872292607559e-07, "loss": 0.2071, "num_input_tokens_seen": 10463536, "step": 21255 }, { "epoch": 2.8058598389864065, "grad_norm": 25.757781982421875, "learning_rate": 9.611268752434658e-07, "loss": 0.2321, "num_input_tokens_seen": 10465904, "step": 21260 }, { "epoch": 2.8065197307641547, "grad_norm": 0.04057693853974342, "learning_rate": 9.606665294767144e-07, "loss": 0.0004, "num_input_tokens_seen": 10468272, "step": 21265 }, { "epoch": 2.807179622541903, "grad_norm": 0.010599908418953419, "learning_rate": 9.602061920582076e-07, "loss": 0.0368, "num_input_tokens_seen": 10470576, "step": 21270 }, { "epoch": 2.8078395143196517, "grad_norm": 0.010033125057816505, "learning_rate": 9.59745863085648e-07, "loss": 0.0004, "num_input_tokens_seen": 10472944, "step": 21275 }, { "epoch": 2.8084994060974, "grad_norm": 0.03106347844004631, "learning_rate": 9.59285542656738e-07, "loss": 0.0004, "num_input_tokens_seen": 10475248, "step": 21280 }, { "epoch": 2.8091592978751487, "grad_norm": 13.835277557373047, "learning_rate": 9.588252308691768e-07, "loss": 0.0534, "num_input_tokens_seen": 10477808, "step": 21285 }, { "epoch": 2.809819189652897, "grad_norm": 0.7269652485847473, "learning_rate": 9.583649278206616e-07, "loss": 0.0402, "num_input_tokens_seen": 10480176, "step": 21290 }, { "epoch": 2.8104790814306453, "grad_norm": 0.0030855233781039715, "learning_rate": 9.579046336088894e-07, "loss": 0.0887, "num_input_tokens_seen": 10482352, "step": 21295 }, { "epoch": 2.8111389732083936, "grad_norm": 0.20838147401809692, "learning_rate": 9.574443483315533e-07, "loss": 0.2153, "num_input_tokens_seen": 10484912, "step": 21300 }, { "epoch": 2.8117988649861423, "grad_norm": 0.08771320432424545, "learning_rate": 9.569840720863469e-07, "loss": 0.2879, "num_input_tokens_seen": 10487216, "step": 21305 }, { "epoch": 2.8124587567638906, "grad_norm": 0.46458372473716736, "learning_rate": 9.565238049709596e-07, "loss": 0.0255, "num_input_tokens_seen": 10489840, "step": 21310 }, { "epoch": 2.8131186485416393, "grad_norm": 0.06488537788391113, "learning_rate": 9.560635470830794e-07, "loss": 0.0002, "num_input_tokens_seen": 10492272, "step": 21315 }, { "epoch": 2.8137785403193876, "grad_norm": 38.66447830200195, "learning_rate": 9.556032985203934e-07, "loss": 0.0089, "num_input_tokens_seen": 10494768, "step": 21320 }, { "epoch": 2.814438432097136, "grad_norm": 0.5035339593887329, "learning_rate": 9.551430593805854e-07, "loss": 0.0758, "num_input_tokens_seen": 10497328, "step": 21325 }, { "epoch": 2.8150983238748846, "grad_norm": 71.4471664428711, "learning_rate": 9.546828297613389e-07, "loss": 0.0628, "num_input_tokens_seen": 10500016, "step": 21330 }, { "epoch": 2.815758215652633, "grad_norm": 0.12810802459716797, "learning_rate": 9.542226097603335e-07, "loss": 0.1013, "num_input_tokens_seen": 10502448, "step": 21335 }, { "epoch": 2.8164181074303816, "grad_norm": 0.014600202441215515, "learning_rate": 9.537623994752473e-07, "loss": 0.0005, "num_input_tokens_seen": 10504944, "step": 21340 }, { "epoch": 2.81707799920813, "grad_norm": 0.04110288992524147, "learning_rate": 9.533021990037572e-07, "loss": 0.0752, "num_input_tokens_seen": 10507440, "step": 21345 }, { "epoch": 2.817737890985878, "grad_norm": 0.030138636007905006, "learning_rate": 9.52842008443537e-07, "loss": 0.0179, "num_input_tokens_seen": 10509680, "step": 21350 }, { "epoch": 2.818397782763627, "grad_norm": 0.3603948652744293, "learning_rate": 9.523818278922593e-07, "loss": 0.002, "num_input_tokens_seen": 10512112, "step": 21355 }, { "epoch": 2.819057674541375, "grad_norm": 158.56179809570312, "learning_rate": 9.519216574475937e-07, "loss": 0.0933, "num_input_tokens_seen": 10514480, "step": 21360 }, { "epoch": 2.819717566319124, "grad_norm": 0.2659551799297333, "learning_rate": 9.514614972072082e-07, "loss": 0.0934, "num_input_tokens_seen": 10517040, "step": 21365 }, { "epoch": 2.820377458096872, "grad_norm": 0.053979117423295975, "learning_rate": 9.510013472687683e-07, "loss": 0.0255, "num_input_tokens_seen": 10519600, "step": 21370 }, { "epoch": 2.8210373498746204, "grad_norm": 0.17275747656822205, "learning_rate": 9.505412077299377e-07, "loss": 0.0002, "num_input_tokens_seen": 10522288, "step": 21375 }, { "epoch": 2.821697241652369, "grad_norm": 0.17458002269268036, "learning_rate": 9.500810786883776e-07, "loss": 0.0009, "num_input_tokens_seen": 10524976, "step": 21380 }, { "epoch": 2.8223571334301174, "grad_norm": 0.04548027738928795, "learning_rate": 9.496209602417472e-07, "loss": 0.034, "num_input_tokens_seen": 10527600, "step": 21385 }, { "epoch": 2.823017025207866, "grad_norm": 0.009598941542208195, "learning_rate": 9.49160852487703e-07, "loss": 0.0767, "num_input_tokens_seen": 10529968, "step": 21390 }, { "epoch": 2.8236769169856144, "grad_norm": 0.37783247232437134, "learning_rate": 9.487007555238997e-07, "loss": 0.076, "num_input_tokens_seen": 10532272, "step": 21395 }, { "epoch": 2.8243368087633627, "grad_norm": 20.97260856628418, "learning_rate": 9.482406694479895e-07, "loss": 0.0648, "num_input_tokens_seen": 10535152, "step": 21400 }, { "epoch": 2.8249967005411114, "grad_norm": 21.591175079345703, "learning_rate": 9.477805943576226e-07, "loss": 0.1232, "num_input_tokens_seen": 10537712, "step": 21405 }, { "epoch": 2.8256565923188597, "grad_norm": 0.01995622180402279, "learning_rate": 9.473205303504463e-07, "loss": 0.0002, "num_input_tokens_seen": 10540016, "step": 21410 }, { "epoch": 2.8263164840966084, "grad_norm": 18.156414031982422, "learning_rate": 9.468604775241061e-07, "loss": 0.1321, "num_input_tokens_seen": 10542512, "step": 21415 }, { "epoch": 2.8269763758743567, "grad_norm": 0.2256481796503067, "learning_rate": 9.464004359762445e-07, "loss": 0.0401, "num_input_tokens_seen": 10545136, "step": 21420 }, { "epoch": 2.827636267652105, "grad_norm": 0.026353495195508003, "learning_rate": 9.459404058045023e-07, "loss": 0.0018, "num_input_tokens_seen": 10547760, "step": 21425 }, { "epoch": 2.8282961594298532, "grad_norm": 0.08316987007856369, "learning_rate": 9.454803871065176e-07, "loss": 0.0399, "num_input_tokens_seen": 10549936, "step": 21430 }, { "epoch": 2.828956051207602, "grad_norm": 0.13066366314888, "learning_rate": 9.450203799799258e-07, "loss": 0.0011, "num_input_tokens_seen": 10552176, "step": 21435 }, { "epoch": 2.8296159429853502, "grad_norm": 0.5951442122459412, "learning_rate": 9.445603845223603e-07, "loss": 0.0415, "num_input_tokens_seen": 10554736, "step": 21440 }, { "epoch": 2.830275834763099, "grad_norm": 95.009521484375, "learning_rate": 9.44100400831452e-07, "loss": 0.1261, "num_input_tokens_seen": 10557296, "step": 21445 }, { "epoch": 2.8309357265408472, "grad_norm": 0.011956310831010342, "learning_rate": 9.436404290048282e-07, "loss": 0.0783, "num_input_tokens_seen": 10559984, "step": 21450 }, { "epoch": 2.8315956183185955, "grad_norm": 0.029880542308092117, "learning_rate": 9.43180469140116e-07, "loss": 0.1771, "num_input_tokens_seen": 10562416, "step": 21455 }, { "epoch": 2.8322555100963442, "grad_norm": 0.8779613971710205, "learning_rate": 9.427205213349369e-07, "loss": 0.0008, "num_input_tokens_seen": 10564976, "step": 21460 }, { "epoch": 2.8329154018740925, "grad_norm": 0.3202302157878876, "learning_rate": 9.422605856869129e-07, "loss": 0.0006, "num_input_tokens_seen": 10567728, "step": 21465 }, { "epoch": 2.8335752936518412, "grad_norm": 0.02996434085071087, "learning_rate": 9.418006622936618e-07, "loss": 0.0002, "num_input_tokens_seen": 10570416, "step": 21470 }, { "epoch": 2.8342351854295895, "grad_norm": 0.04730546846985817, "learning_rate": 9.413407512527977e-07, "loss": 0.1179, "num_input_tokens_seen": 10572784, "step": 21475 }, { "epoch": 2.834895077207338, "grad_norm": 0.042985014617443085, "learning_rate": 9.408808526619352e-07, "loss": 0.0001, "num_input_tokens_seen": 10575152, "step": 21480 }, { "epoch": 2.8355549689850865, "grad_norm": 0.09430090337991714, "learning_rate": 9.404209666186831e-07, "loss": 0.0002, "num_input_tokens_seen": 10577648, "step": 21485 }, { "epoch": 2.836214860762835, "grad_norm": 0.019693154841661453, "learning_rate": 9.3996109322065e-07, "loss": 0.0001, "num_input_tokens_seen": 10580208, "step": 21490 }, { "epoch": 2.8368747525405835, "grad_norm": 0.06802091002464294, "learning_rate": 9.395012325654398e-07, "loss": 0.0355, "num_input_tokens_seen": 10582512, "step": 21495 }, { "epoch": 2.837534644318332, "grad_norm": 0.004724997561424971, "learning_rate": 9.390413847506547e-07, "loss": 0.0001, "num_input_tokens_seen": 10584944, "step": 21500 }, { "epoch": 2.83819453609608, "grad_norm": 0.02163371816277504, "learning_rate": 9.385815498738944e-07, "loss": 0.049, "num_input_tokens_seen": 10587248, "step": 21505 }, { "epoch": 2.838854427873829, "grad_norm": 33.175628662109375, "learning_rate": 9.381217280327552e-07, "loss": 0.0809, "num_input_tokens_seen": 10590000, "step": 21510 }, { "epoch": 2.839514319651577, "grad_norm": 0.0046610478311777115, "learning_rate": 9.376619193248314e-07, "loss": 0.0, "num_input_tokens_seen": 10592496, "step": 21515 }, { "epoch": 2.840174211429326, "grad_norm": 0.015222937799990177, "learning_rate": 9.372021238477138e-07, "loss": 0.0767, "num_input_tokens_seen": 10595184, "step": 21520 }, { "epoch": 2.840834103207074, "grad_norm": 0.05140548199415207, "learning_rate": 9.367423416989905e-07, "loss": 0.001, "num_input_tokens_seen": 10597552, "step": 21525 }, { "epoch": 2.8414939949848224, "grad_norm": 0.018608352169394493, "learning_rate": 9.362825729762472e-07, "loss": 0.1518, "num_input_tokens_seen": 10600240, "step": 21530 }, { "epoch": 2.842153886762571, "grad_norm": 0.1203496903181076, "learning_rate": 9.358228177770663e-07, "loss": 0.1066, "num_input_tokens_seen": 10602608, "step": 21535 }, { "epoch": 2.8428137785403194, "grad_norm": 31.338871002197266, "learning_rate": 9.353630761990276e-07, "loss": 0.0717, "num_input_tokens_seen": 10605104, "step": 21540 }, { "epoch": 2.843473670318068, "grad_norm": 0.056929174810647964, "learning_rate": 9.349033483397082e-07, "loss": 0.0023, "num_input_tokens_seen": 10607600, "step": 21545 }, { "epoch": 2.8441335620958164, "grad_norm": 0.01509710494428873, "learning_rate": 9.344436342966812e-07, "loss": 0.0004, "num_input_tokens_seen": 10610160, "step": 21550 }, { "epoch": 2.8447934538735646, "grad_norm": 0.029244285076856613, "learning_rate": 9.339839341675185e-07, "loss": 0.0421, "num_input_tokens_seen": 10612400, "step": 21555 }, { "epoch": 2.845453345651313, "grad_norm": 0.17152266204357147, "learning_rate": 9.335242480497876e-07, "loss": 0.0003, "num_input_tokens_seen": 10615088, "step": 21560 }, { "epoch": 2.8461132374290616, "grad_norm": 0.416633278131485, "learning_rate": 9.330645760410537e-07, "loss": 0.0002, "num_input_tokens_seen": 10617648, "step": 21565 }, { "epoch": 2.8467731292068104, "grad_norm": 0.05466151982545853, "learning_rate": 9.326049182388789e-07, "loss": 0.0006, "num_input_tokens_seen": 10620336, "step": 21570 }, { "epoch": 2.8474330209845586, "grad_norm": 0.0032727643847465515, "learning_rate": 9.32145274740822e-07, "loss": 0.0001, "num_input_tokens_seen": 10622704, "step": 21575 }, { "epoch": 2.848092912762307, "grad_norm": 0.0016575565095990896, "learning_rate": 9.316856456444392e-07, "loss": 0.0407, "num_input_tokens_seen": 10625264, "step": 21580 }, { "epoch": 2.848752804540055, "grad_norm": 0.077976755797863, "learning_rate": 9.312260310472833e-07, "loss": 0.0736, "num_input_tokens_seen": 10628016, "step": 21585 }, { "epoch": 2.849412696317804, "grad_norm": 0.012262091040611267, "learning_rate": 9.307664310469046e-07, "loss": 0.0009, "num_input_tokens_seen": 10630384, "step": 21590 }, { "epoch": 2.850072588095552, "grad_norm": 0.009740821085870266, "learning_rate": 9.303068457408497e-07, "loss": 0.0016, "num_input_tokens_seen": 10632688, "step": 21595 }, { "epoch": 2.850732479873301, "grad_norm": 0.004587103612720966, "learning_rate": 9.298472752266615e-07, "loss": 0.0518, "num_input_tokens_seen": 10634800, "step": 21600 }, { "epoch": 2.851392371651049, "grad_norm": 0.1813964694738388, "learning_rate": 9.293877196018816e-07, "loss": 0.0001, "num_input_tokens_seen": 10637104, "step": 21605 }, { "epoch": 2.8520522634287975, "grad_norm": 0.0014675756683573127, "learning_rate": 9.289281789640465e-07, "loss": 0.0002, "num_input_tokens_seen": 10639408, "step": 21610 }, { "epoch": 2.852712155206546, "grad_norm": 16.158531188964844, "learning_rate": 9.28468653410691e-07, "loss": 0.274, "num_input_tokens_seen": 10641584, "step": 21615 }, { "epoch": 2.8533720469842945, "grad_norm": 0.003586375620216131, "learning_rate": 9.280091430393462e-07, "loss": 0.0001, "num_input_tokens_seen": 10643888, "step": 21620 }, { "epoch": 2.854031938762043, "grad_norm": 0.005891416687518358, "learning_rate": 9.275496479475386e-07, "loss": 0.0001, "num_input_tokens_seen": 10646512, "step": 21625 }, { "epoch": 2.8546918305397915, "grad_norm": 0.7463214993476868, "learning_rate": 9.270901682327945e-07, "loss": 0.001, "num_input_tokens_seen": 10648880, "step": 21630 }, { "epoch": 2.8553517223175398, "grad_norm": 3.2659404277801514, "learning_rate": 9.266307039926333e-07, "loss": 0.0012, "num_input_tokens_seen": 10651440, "step": 21635 }, { "epoch": 2.8560116140952885, "grad_norm": 0.0117728216573596, "learning_rate": 9.261712553245747e-07, "loss": 0.0001, "num_input_tokens_seen": 10654000, "step": 21640 }, { "epoch": 2.8566715058730368, "grad_norm": 49.505550384521484, "learning_rate": 9.257118223261323e-07, "loss": 0.202, "num_input_tokens_seen": 10656560, "step": 21645 }, { "epoch": 2.8573313976507855, "grad_norm": 14.688985824584961, "learning_rate": 9.252524050948174e-07, "loss": 0.0427, "num_input_tokens_seen": 10658928, "step": 21650 }, { "epoch": 2.8579912894285338, "grad_norm": 0.007706925738602877, "learning_rate": 9.247930037281385e-07, "loss": 0.0675, "num_input_tokens_seen": 10661360, "step": 21655 }, { "epoch": 2.858651181206282, "grad_norm": 0.06840179115533829, "learning_rate": 9.243336183235995e-07, "loss": 0.0338, "num_input_tokens_seen": 10663920, "step": 21660 }, { "epoch": 2.8593110729840308, "grad_norm": 0.018307015299797058, "learning_rate": 9.238742489787027e-07, "loss": 0.1112, "num_input_tokens_seen": 10666416, "step": 21665 }, { "epoch": 2.859970964761779, "grad_norm": 0.06790940463542938, "learning_rate": 9.234148957909451e-07, "loss": 0.0335, "num_input_tokens_seen": 10668656, "step": 21670 }, { "epoch": 2.8606308565395278, "grad_norm": 1.0796475410461426, "learning_rate": 9.229555588578211e-07, "loss": 0.1067, "num_input_tokens_seen": 10671152, "step": 21675 }, { "epoch": 2.861290748317276, "grad_norm": 0.013293218798935413, "learning_rate": 9.22496238276822e-07, "loss": 0.0001, "num_input_tokens_seen": 10673392, "step": 21680 }, { "epoch": 2.8619506400950243, "grad_norm": 0.10564276576042175, "learning_rate": 9.220369341454348e-07, "loss": 0.0005, "num_input_tokens_seen": 10675696, "step": 21685 }, { "epoch": 2.8626105318727726, "grad_norm": 0.21768558025360107, "learning_rate": 9.215776465611441e-07, "loss": 0.0005, "num_input_tokens_seen": 10677936, "step": 21690 }, { "epoch": 2.8632704236505213, "grad_norm": 0.012424738146364689, "learning_rate": 9.2111837562143e-07, "loss": 0.0022, "num_input_tokens_seen": 10680880, "step": 21695 }, { "epoch": 2.86393031542827, "grad_norm": 0.011372094973921776, "learning_rate": 9.206591214237692e-07, "loss": 0.0003, "num_input_tokens_seen": 10683056, "step": 21700 }, { "epoch": 2.8645902072060183, "grad_norm": 0.023209456354379654, "learning_rate": 9.201998840656355e-07, "loss": 0.1564, "num_input_tokens_seen": 10685552, "step": 21705 }, { "epoch": 2.8652500989837666, "grad_norm": 0.3390696942806244, "learning_rate": 9.197406636444984e-07, "loss": 0.0002, "num_input_tokens_seen": 10687728, "step": 21710 }, { "epoch": 2.865909990761515, "grad_norm": 0.0022603331599384546, "learning_rate": 9.192814602578245e-07, "loss": 0.0323, "num_input_tokens_seen": 10690352, "step": 21715 }, { "epoch": 2.8665698825392636, "grad_norm": 0.12818442285060883, "learning_rate": 9.188222740030759e-07, "loss": 0.1149, "num_input_tokens_seen": 10693168, "step": 21720 }, { "epoch": 2.867229774317012, "grad_norm": 0.04778322950005531, "learning_rate": 9.18363104977712e-07, "loss": 0.0007, "num_input_tokens_seen": 10695600, "step": 21725 }, { "epoch": 2.8678896660947606, "grad_norm": 17.72599983215332, "learning_rate": 9.179039532791879e-07, "loss": 0.0816, "num_input_tokens_seen": 10698032, "step": 21730 }, { "epoch": 2.868549557872509, "grad_norm": 0.06735151261091232, "learning_rate": 9.174448190049551e-07, "loss": 0.0805, "num_input_tokens_seen": 10700272, "step": 21735 }, { "epoch": 2.869209449650257, "grad_norm": 281.5223083496094, "learning_rate": 9.169857022524616e-07, "loss": 0.0385, "num_input_tokens_seen": 10702640, "step": 21740 }, { "epoch": 2.869869341428006, "grad_norm": 17.493099212646484, "learning_rate": 9.165266031191518e-07, "loss": 0.1961, "num_input_tokens_seen": 10705136, "step": 21745 }, { "epoch": 2.870529233205754, "grad_norm": 0.27138352394104004, "learning_rate": 9.160675217024659e-07, "loss": 0.0003, "num_input_tokens_seen": 10707824, "step": 21750 }, { "epoch": 2.871189124983503, "grad_norm": 33.14533233642578, "learning_rate": 9.156084580998409e-07, "loss": 0.1185, "num_input_tokens_seen": 10710064, "step": 21755 }, { "epoch": 2.871849016761251, "grad_norm": 14.331830978393555, "learning_rate": 9.151494124087093e-07, "loss": 0.0926, "num_input_tokens_seen": 10712432, "step": 21760 }, { "epoch": 2.8725089085389994, "grad_norm": 35.13714599609375, "learning_rate": 9.146903847265008e-07, "loss": 0.0872, "num_input_tokens_seen": 10714672, "step": 21765 }, { "epoch": 2.873168800316748, "grad_norm": 0.03220439702272415, "learning_rate": 9.142313751506401e-07, "loss": 0.0009, "num_input_tokens_seen": 10716912, "step": 21770 }, { "epoch": 2.8738286920944964, "grad_norm": 0.11148615926504135, "learning_rate": 9.137723837785491e-07, "loss": 0.0009, "num_input_tokens_seen": 10719600, "step": 21775 }, { "epoch": 2.874488583872245, "grad_norm": 0.7292298674583435, "learning_rate": 9.133134107076455e-07, "loss": 0.0015, "num_input_tokens_seen": 10721904, "step": 21780 }, { "epoch": 2.8751484756499934, "grad_norm": 0.5308613181114197, "learning_rate": 9.12854456035342e-07, "loss": 0.0006, "num_input_tokens_seen": 10724720, "step": 21785 }, { "epoch": 2.8758083674277417, "grad_norm": 0.7017931938171387, "learning_rate": 9.123955198590498e-07, "loss": 0.0011, "num_input_tokens_seen": 10727216, "step": 21790 }, { "epoch": 2.8764682592054904, "grad_norm": 0.049583759158849716, "learning_rate": 9.119366022761736e-07, "loss": 0.0004, "num_input_tokens_seen": 10729648, "step": 21795 }, { "epoch": 2.8771281509832387, "grad_norm": 0.0807613804936409, "learning_rate": 9.114777033841162e-07, "loss": 0.0005, "num_input_tokens_seen": 10732016, "step": 21800 }, { "epoch": 2.8777880427609874, "grad_norm": 0.036206115037202835, "learning_rate": 9.110188232802756e-07, "loss": 0.058, "num_input_tokens_seen": 10734320, "step": 21805 }, { "epoch": 2.8784479345387357, "grad_norm": 0.020139062777161598, "learning_rate": 9.105599620620446e-07, "loss": 0.0001, "num_input_tokens_seen": 10737008, "step": 21810 }, { "epoch": 2.879107826316484, "grad_norm": 34.06182098388672, "learning_rate": 9.101011198268146e-07, "loss": 0.0938, "num_input_tokens_seen": 10739632, "step": 21815 }, { "epoch": 2.8797677180942327, "grad_norm": 0.03170971944928169, "learning_rate": 9.096422966719704e-07, "loss": 0.0004, "num_input_tokens_seen": 10742384, "step": 21820 }, { "epoch": 2.880427609871981, "grad_norm": 0.017847511917352676, "learning_rate": 9.091834926948949e-07, "loss": 0.1499, "num_input_tokens_seen": 10744880, "step": 21825 }, { "epoch": 2.8810875016497297, "grad_norm": 0.019295165315270424, "learning_rate": 9.087247079929654e-07, "loss": 0.0004, "num_input_tokens_seen": 10747632, "step": 21830 }, { "epoch": 2.881747393427478, "grad_norm": 0.028995471075177193, "learning_rate": 9.082659426635554e-07, "loss": 0.0001, "num_input_tokens_seen": 10750128, "step": 21835 }, { "epoch": 2.8824072852052263, "grad_norm": 0.011393583379685879, "learning_rate": 9.07807196804035e-07, "loss": 0.1334, "num_input_tokens_seen": 10752880, "step": 21840 }, { "epoch": 2.8830671769829745, "grad_norm": 0.006427302956581116, "learning_rate": 9.073484705117691e-07, "loss": 0.1001, "num_input_tokens_seen": 10755504, "step": 21845 }, { "epoch": 2.8837270687607233, "grad_norm": 0.007785051595419645, "learning_rate": 9.068897638841197e-07, "loss": 0.0001, "num_input_tokens_seen": 10757808, "step": 21850 }, { "epoch": 2.8843869605384715, "grad_norm": 0.005997618660330772, "learning_rate": 9.064310770184438e-07, "loss": 0.0008, "num_input_tokens_seen": 10760432, "step": 21855 }, { "epoch": 2.8850468523162203, "grad_norm": 0.1344568282365799, "learning_rate": 9.059724100120939e-07, "loss": 0.0008, "num_input_tokens_seen": 10762864, "step": 21860 }, { "epoch": 2.8857067440939685, "grad_norm": 0.0011155613465234637, "learning_rate": 9.055137629624194e-07, "loss": 0.0027, "num_input_tokens_seen": 10765232, "step": 21865 }, { "epoch": 2.886366635871717, "grad_norm": 0.02373356744647026, "learning_rate": 9.05055135966764e-07, "loss": 0.0253, "num_input_tokens_seen": 10767600, "step": 21870 }, { "epoch": 2.8870265276494655, "grad_norm": 0.0023866845294833183, "learning_rate": 9.04596529122469e-07, "loss": 0.0001, "num_input_tokens_seen": 10769712, "step": 21875 }, { "epoch": 2.887686419427214, "grad_norm": 0.04158183932304382, "learning_rate": 9.041379425268697e-07, "loss": 0.0, "num_input_tokens_seen": 10771952, "step": 21880 }, { "epoch": 2.8883463112049625, "grad_norm": 0.058639414608478546, "learning_rate": 9.036793762772977e-07, "loss": 0.0676, "num_input_tokens_seen": 10774512, "step": 21885 }, { "epoch": 2.889006202982711, "grad_norm": 0.00817857775837183, "learning_rate": 9.032208304710808e-07, "loss": 0.0001, "num_input_tokens_seen": 10776944, "step": 21890 }, { "epoch": 2.889666094760459, "grad_norm": 1.1033525466918945, "learning_rate": 9.027623052055417e-07, "loss": 0.0007, "num_input_tokens_seen": 10779568, "step": 21895 }, { "epoch": 2.890325986538208, "grad_norm": 0.013293600641191006, "learning_rate": 9.023038005779992e-07, "loss": 0.0041, "num_input_tokens_seen": 10782512, "step": 21900 }, { "epoch": 2.890985878315956, "grad_norm": 0.01595185324549675, "learning_rate": 9.018453166857677e-07, "loss": 0.0001, "num_input_tokens_seen": 10784816, "step": 21905 }, { "epoch": 2.891645770093705, "grad_norm": 0.010223422199487686, "learning_rate": 9.013868536261566e-07, "loss": 0.0001, "num_input_tokens_seen": 10787504, "step": 21910 }, { "epoch": 2.892305661871453, "grad_norm": 0.001329478225670755, "learning_rate": 9.009284114964721e-07, "loss": 0.0, "num_input_tokens_seen": 10789936, "step": 21915 }, { "epoch": 2.8929655536492014, "grad_norm": 0.10551278293132782, "learning_rate": 9.004699903940146e-07, "loss": 0.0004, "num_input_tokens_seen": 10792496, "step": 21920 }, { "epoch": 2.89362544542695, "grad_norm": 0.007360459771007299, "learning_rate": 9.000115904160811e-07, "loss": 0.1505, "num_input_tokens_seen": 10794864, "step": 21925 }, { "epoch": 2.8942853372046984, "grad_norm": 1.597662091255188, "learning_rate": 8.995532116599636e-07, "loss": 0.0036, "num_input_tokens_seen": 10797232, "step": 21930 }, { "epoch": 2.894945228982447, "grad_norm": 0.012998878955841064, "learning_rate": 8.99094854222949e-07, "loss": 0.0001, "num_input_tokens_seen": 10799728, "step": 21935 }, { "epoch": 2.8956051207601954, "grad_norm": 13.63510513305664, "learning_rate": 8.986365182023212e-07, "loss": 0.0537, "num_input_tokens_seen": 10802224, "step": 21940 }, { "epoch": 2.8962650125379437, "grad_norm": 118.9696273803711, "learning_rate": 8.981782036953583e-07, "loss": 0.0036, "num_input_tokens_seen": 10804592, "step": 21945 }, { "epoch": 2.8969249043156924, "grad_norm": 0.006649637129157782, "learning_rate": 8.977199107993345e-07, "loss": 0.0397, "num_input_tokens_seen": 10806896, "step": 21950 }, { "epoch": 2.8975847960934407, "grad_norm": 2.4541690349578857, "learning_rate": 8.972616396115194e-07, "loss": 0.0016, "num_input_tokens_seen": 10809328, "step": 21955 }, { "epoch": 2.8982446878711894, "grad_norm": 0.1047786995768547, "learning_rate": 8.968033902291764e-07, "loss": 0.0631, "num_input_tokens_seen": 10811952, "step": 21960 }, { "epoch": 2.8989045796489377, "grad_norm": 0.0030349986627697945, "learning_rate": 8.963451627495673e-07, "loss": 0.0689, "num_input_tokens_seen": 10814256, "step": 21965 }, { "epoch": 2.899564471426686, "grad_norm": 17.257110595703125, "learning_rate": 8.95886957269946e-07, "loss": 0.0617, "num_input_tokens_seen": 10816624, "step": 21970 }, { "epoch": 2.900224363204434, "grad_norm": 0.013636348769068718, "learning_rate": 8.954287738875649e-07, "loss": 0.0001, "num_input_tokens_seen": 10819184, "step": 21975 }, { "epoch": 2.900884254982183, "grad_norm": 0.005858052987605333, "learning_rate": 8.94970612699669e-07, "loss": 0.0006, "num_input_tokens_seen": 10821616, "step": 21980 }, { "epoch": 2.901544146759931, "grad_norm": 0.0023866884876042604, "learning_rate": 8.945124738034998e-07, "loss": 0.0308, "num_input_tokens_seen": 10823920, "step": 21985 }, { "epoch": 2.90220403853768, "grad_norm": 0.0054891835898160934, "learning_rate": 8.940543572962944e-07, "loss": 0.0003, "num_input_tokens_seen": 10826288, "step": 21990 }, { "epoch": 2.902863930315428, "grad_norm": 0.02786225453019142, "learning_rate": 8.93596263275284e-07, "loss": 0.0, "num_input_tokens_seen": 10828464, "step": 21995 }, { "epoch": 2.9035238220931765, "grad_norm": 38.37060546875, "learning_rate": 8.931381918376969e-07, "loss": 0.1661, "num_input_tokens_seen": 10830960, "step": 22000 }, { "epoch": 2.904183713870925, "grad_norm": 0.0023783042561262846, "learning_rate": 8.926801430807545e-07, "loss": 0.0767, "num_input_tokens_seen": 10833136, "step": 22005 }, { "epoch": 2.9048436056486735, "grad_norm": 0.024528319016098976, "learning_rate": 8.922221171016744e-07, "loss": 0.0001, "num_input_tokens_seen": 10835632, "step": 22010 }, { "epoch": 2.905503497426422, "grad_norm": 0.10498657077550888, "learning_rate": 8.917641139976697e-07, "loss": 0.0003, "num_input_tokens_seen": 10838064, "step": 22015 }, { "epoch": 2.9061633892041705, "grad_norm": 0.03159989044070244, "learning_rate": 8.913061338659478e-07, "loss": 0.0798, "num_input_tokens_seen": 10840368, "step": 22020 }, { "epoch": 2.9068232809819188, "grad_norm": 175.38629150390625, "learning_rate": 8.908481768037119e-07, "loss": 0.0282, "num_input_tokens_seen": 10842800, "step": 22025 }, { "epoch": 2.9074831727596675, "grad_norm": 0.013397633098065853, "learning_rate": 8.903902429081603e-07, "loss": 0.0005, "num_input_tokens_seen": 10845424, "step": 22030 }, { "epoch": 2.908143064537416, "grad_norm": 0.12170317769050598, "learning_rate": 8.899323322764857e-07, "loss": 0.0004, "num_input_tokens_seen": 10847920, "step": 22035 }, { "epoch": 2.9088029563151645, "grad_norm": 0.0028501616325229406, "learning_rate": 8.894744450058767e-07, "loss": 0.0001, "num_input_tokens_seen": 10850160, "step": 22040 }, { "epoch": 2.909462848092913, "grad_norm": 0.14192311465740204, "learning_rate": 8.890165811935161e-07, "loss": 0.0661, "num_input_tokens_seen": 10852400, "step": 22045 }, { "epoch": 2.910122739870661, "grad_norm": 0.051709212362766266, "learning_rate": 8.885587409365826e-07, "loss": 0.1177, "num_input_tokens_seen": 10854832, "step": 22050 }, { "epoch": 2.91078263164841, "grad_norm": 0.2706301808357239, "learning_rate": 8.881009243322493e-07, "loss": 0.0018, "num_input_tokens_seen": 10857456, "step": 22055 }, { "epoch": 2.911442523426158, "grad_norm": 1.0978302955627441, "learning_rate": 8.876431314776847e-07, "loss": 0.0011, "num_input_tokens_seen": 10860080, "step": 22060 }, { "epoch": 2.912102415203907, "grad_norm": 0.018388278782367706, "learning_rate": 8.871853624700517e-07, "loss": 0.0001, "num_input_tokens_seen": 10862640, "step": 22065 }, { "epoch": 2.912762306981655, "grad_norm": 0.24191296100616455, "learning_rate": 8.867276174065085e-07, "loss": 0.0494, "num_input_tokens_seen": 10864880, "step": 22070 }, { "epoch": 2.9134221987594033, "grad_norm": 0.009454427286982536, "learning_rate": 8.862698963842084e-07, "loss": 0.0501, "num_input_tokens_seen": 10867056, "step": 22075 }, { "epoch": 2.914082090537152, "grad_norm": 0.0265056062489748, "learning_rate": 8.85812199500299e-07, "loss": 0.0003, "num_input_tokens_seen": 10869424, "step": 22080 }, { "epoch": 2.9147419823149003, "grad_norm": 0.03988128900527954, "learning_rate": 8.853545268519235e-07, "loss": 0.0876, "num_input_tokens_seen": 10871984, "step": 22085 }, { "epoch": 2.915401874092649, "grad_norm": 0.004546268377453089, "learning_rate": 8.848968785362196e-07, "loss": 0.0002, "num_input_tokens_seen": 10874352, "step": 22090 }, { "epoch": 2.9160617658703973, "grad_norm": 0.913521945476532, "learning_rate": 8.844392546503195e-07, "loss": 0.0004, "num_input_tokens_seen": 10876592, "step": 22095 }, { "epoch": 2.9167216576481456, "grad_norm": 0.00627158721908927, "learning_rate": 8.83981655291351e-07, "loss": 0.0537, "num_input_tokens_seen": 10879024, "step": 22100 }, { "epoch": 2.917381549425894, "grad_norm": 0.2408558577299118, "learning_rate": 8.835240805564358e-07, "loss": 0.0757, "num_input_tokens_seen": 10881584, "step": 22105 }, { "epoch": 2.9180414412036426, "grad_norm": 0.002517815912142396, "learning_rate": 8.830665305426914e-07, "loss": 0.0, "num_input_tokens_seen": 10884144, "step": 22110 }, { "epoch": 2.918701332981391, "grad_norm": 0.01214515045285225, "learning_rate": 8.826090053472291e-07, "loss": 0.1362, "num_input_tokens_seen": 10886832, "step": 22115 }, { "epoch": 2.9193612247591396, "grad_norm": 0.02185744419693947, "learning_rate": 8.821515050671547e-07, "loss": 0.0011, "num_input_tokens_seen": 10888880, "step": 22120 }, { "epoch": 2.920021116536888, "grad_norm": 0.04342570900917053, "learning_rate": 8.816940297995705e-07, "loss": 0.0005, "num_input_tokens_seen": 10891248, "step": 22125 }, { "epoch": 2.920681008314636, "grad_norm": 0.005738275591284037, "learning_rate": 8.812365796415715e-07, "loss": 0.0001, "num_input_tokens_seen": 10893552, "step": 22130 }, { "epoch": 2.921340900092385, "grad_norm": 0.03579311445355415, "learning_rate": 8.807791546902488e-07, "loss": 0.0004, "num_input_tokens_seen": 10895920, "step": 22135 }, { "epoch": 2.922000791870133, "grad_norm": 0.00643147761002183, "learning_rate": 8.803217550426873e-07, "loss": 0.0001, "num_input_tokens_seen": 10898608, "step": 22140 }, { "epoch": 2.922660683647882, "grad_norm": 0.001240309327840805, "learning_rate": 8.79864380795966e-07, "loss": 0.0001, "num_input_tokens_seen": 10901040, "step": 22145 }, { "epoch": 2.92332057542563, "grad_norm": 0.5274344086647034, "learning_rate": 8.794070320471605e-07, "loss": 0.094, "num_input_tokens_seen": 10903536, "step": 22150 }, { "epoch": 2.9239804672033785, "grad_norm": 0.03138048201799393, "learning_rate": 8.789497088933386e-07, "loss": 0.1084, "num_input_tokens_seen": 10905968, "step": 22155 }, { "epoch": 2.924640358981127, "grad_norm": 0.14140108227729797, "learning_rate": 8.78492411431565e-07, "loss": 0.1881, "num_input_tokens_seen": 10908656, "step": 22160 }, { "epoch": 2.9253002507588755, "grad_norm": 0.0537065826356411, "learning_rate": 8.78035139758897e-07, "loss": 0.0003, "num_input_tokens_seen": 10910960, "step": 22165 }, { "epoch": 2.925960142536624, "grad_norm": 0.031515542417764664, "learning_rate": 8.775778939723874e-07, "loss": 0.0003, "num_input_tokens_seen": 10913584, "step": 22170 }, { "epoch": 2.9266200343143725, "grad_norm": 0.07926079630851746, "learning_rate": 8.771206741690832e-07, "loss": 0.0009, "num_input_tokens_seen": 10916208, "step": 22175 }, { "epoch": 2.9272799260921207, "grad_norm": 0.088646300137043, "learning_rate": 8.76663480446026e-07, "loss": 0.0003, "num_input_tokens_seen": 10918704, "step": 22180 }, { "epoch": 2.9279398178698695, "grad_norm": 0.018740274012088776, "learning_rate": 8.762063129002521e-07, "loss": 0.0007, "num_input_tokens_seen": 10921200, "step": 22185 }, { "epoch": 2.9285997096476177, "grad_norm": 0.17317558825016022, "learning_rate": 8.757491716287919e-07, "loss": 0.0567, "num_input_tokens_seen": 10923568, "step": 22190 }, { "epoch": 2.9292596014253665, "grad_norm": 0.17912831902503967, "learning_rate": 8.752920567286701e-07, "loss": 0.0356, "num_input_tokens_seen": 10925872, "step": 22195 }, { "epoch": 2.9299194932031147, "grad_norm": 2.7432761192321777, "learning_rate": 8.748349682969063e-07, "loss": 0.0273, "num_input_tokens_seen": 10928496, "step": 22200 }, { "epoch": 2.930579384980863, "grad_norm": 0.005701547488570213, "learning_rate": 8.743779064305139e-07, "loss": 0.0001, "num_input_tokens_seen": 10930672, "step": 22205 }, { "epoch": 2.9312392767586117, "grad_norm": 0.06518379598855972, "learning_rate": 8.739208712265015e-07, "loss": 0.0002, "num_input_tokens_seen": 10933168, "step": 22210 }, { "epoch": 2.93189916853636, "grad_norm": 0.413714736700058, "learning_rate": 8.734638627818711e-07, "loss": 0.1551, "num_input_tokens_seen": 10935472, "step": 22215 }, { "epoch": 2.9325590603141087, "grad_norm": 0.017388276755809784, "learning_rate": 8.730068811936194e-07, "loss": 0.1055, "num_input_tokens_seen": 10938288, "step": 22220 }, { "epoch": 2.933218952091857, "grad_norm": 0.05265603959560394, "learning_rate": 8.725499265587376e-07, "loss": 0.052, "num_input_tokens_seen": 10940592, "step": 22225 }, { "epoch": 2.9338788438696053, "grad_norm": 19.1118221282959, "learning_rate": 8.720929989742108e-07, "loss": 0.0675, "num_input_tokens_seen": 10942832, "step": 22230 }, { "epoch": 2.9345387356473536, "grad_norm": 0.4165602922439575, "learning_rate": 8.71636098537019e-07, "loss": 0.0006, "num_input_tokens_seen": 10945456, "step": 22235 }, { "epoch": 2.9351986274251023, "grad_norm": 0.017717348411679268, "learning_rate": 8.711792253441358e-07, "loss": 0.001, "num_input_tokens_seen": 10947952, "step": 22240 }, { "epoch": 2.9358585192028506, "grad_norm": 0.06295906007289886, "learning_rate": 8.70722379492529e-07, "loss": 0.0002, "num_input_tokens_seen": 10950512, "step": 22245 }, { "epoch": 2.9365184109805993, "grad_norm": 0.08168453723192215, "learning_rate": 8.70265561079161e-07, "loss": 0.0006, "num_input_tokens_seen": 10953136, "step": 22250 }, { "epoch": 2.9371783027583476, "grad_norm": 0.019058220088481903, "learning_rate": 8.698087702009882e-07, "loss": 0.0665, "num_input_tokens_seen": 10955440, "step": 22255 }, { "epoch": 2.937838194536096, "grad_norm": 0.013230009004473686, "learning_rate": 8.693520069549612e-07, "loss": 0.0892, "num_input_tokens_seen": 10958064, "step": 22260 }, { "epoch": 2.9384980863138446, "grad_norm": 0.07534804940223694, "learning_rate": 8.688952714380247e-07, "loss": 0.0043, "num_input_tokens_seen": 10960688, "step": 22265 }, { "epoch": 2.939157978091593, "grad_norm": 0.040853142738342285, "learning_rate": 8.684385637471173e-07, "loss": 0.0382, "num_input_tokens_seen": 10963120, "step": 22270 }, { "epoch": 2.9398178698693416, "grad_norm": 33.396121978759766, "learning_rate": 8.679818839791721e-07, "loss": 0.3035, "num_input_tokens_seen": 10965616, "step": 22275 }, { "epoch": 2.94047776164709, "grad_norm": 0.020552687346935272, "learning_rate": 8.675252322311161e-07, "loss": 0.0004, "num_input_tokens_seen": 10968112, "step": 22280 }, { "epoch": 2.941137653424838, "grad_norm": 0.04607531800866127, "learning_rate": 8.670686085998702e-07, "loss": 0.0023, "num_input_tokens_seen": 10970544, "step": 22285 }, { "epoch": 2.941797545202587, "grad_norm": 0.4696016013622284, "learning_rate": 8.666120131823499e-07, "loss": 0.0015, "num_input_tokens_seen": 10973040, "step": 22290 }, { "epoch": 2.942457436980335, "grad_norm": 0.18232782185077667, "learning_rate": 8.661554460754631e-07, "loss": 0.1724, "num_input_tokens_seen": 10975792, "step": 22295 }, { "epoch": 2.943117328758084, "grad_norm": 0.33755701780319214, "learning_rate": 8.656989073761144e-07, "loss": 0.1864, "num_input_tokens_seen": 10978416, "step": 22300 }, { "epoch": 2.943777220535832, "grad_norm": 0.031122563406825066, "learning_rate": 8.652423971811992e-07, "loss": 0.0476, "num_input_tokens_seen": 10981296, "step": 22305 }, { "epoch": 2.9444371123135804, "grad_norm": 36.23896408081055, "learning_rate": 8.647859155876103e-07, "loss": 0.1013, "num_input_tokens_seen": 10983728, "step": 22310 }, { "epoch": 2.945097004091329, "grad_norm": 0.04262983053922653, "learning_rate": 8.643294626922314e-07, "loss": 0.0019, "num_input_tokens_seen": 10986352, "step": 22315 }, { "epoch": 2.9457568958690774, "grad_norm": 0.02900772914290428, "learning_rate": 8.638730385919411e-07, "loss": 0.0014, "num_input_tokens_seen": 10989040, "step": 22320 }, { "epoch": 2.946416787646826, "grad_norm": 0.12718896567821503, "learning_rate": 8.634166433836132e-07, "loss": 0.0004, "num_input_tokens_seen": 10991344, "step": 22325 }, { "epoch": 2.9470766794245744, "grad_norm": 636.110595703125, "learning_rate": 8.629602771641131e-07, "loss": 0.0746, "num_input_tokens_seen": 10993712, "step": 22330 }, { "epoch": 2.9477365712023227, "grad_norm": 0.0386221781373024, "learning_rate": 8.625039400303025e-07, "loss": 0.1069, "num_input_tokens_seen": 10996016, "step": 22335 }, { "epoch": 2.9483964629800714, "grad_norm": 0.06237388774752617, "learning_rate": 8.620476320790346e-07, "loss": 0.1457, "num_input_tokens_seen": 10998512, "step": 22340 }, { "epoch": 2.9490563547578197, "grad_norm": 0.05534980446100235, "learning_rate": 8.615913534071577e-07, "loss": 0.0385, "num_input_tokens_seen": 11001200, "step": 22345 }, { "epoch": 2.9497162465355684, "grad_norm": 0.583026111125946, "learning_rate": 8.61135104111514e-07, "loss": 0.0011, "num_input_tokens_seen": 11003632, "step": 22350 }, { "epoch": 2.9503761383133167, "grad_norm": 0.04361052066087723, "learning_rate": 8.606788842889387e-07, "loss": 0.0005, "num_input_tokens_seen": 11005936, "step": 22355 }, { "epoch": 2.951036030091065, "grad_norm": 0.18426087498664856, "learning_rate": 8.602226940362615e-07, "loss": 0.0006, "num_input_tokens_seen": 11008624, "step": 22360 }, { "epoch": 2.9516959218688132, "grad_norm": 0.02046630159020424, "learning_rate": 8.59766533450305e-07, "loss": 0.0893, "num_input_tokens_seen": 11010992, "step": 22365 }, { "epoch": 2.952355813646562, "grad_norm": 0.22573372721672058, "learning_rate": 8.593104026278866e-07, "loss": 0.2024, "num_input_tokens_seen": 11013680, "step": 22370 }, { "epoch": 2.9530157054243107, "grad_norm": 0.07988610863685608, "learning_rate": 8.588543016658164e-07, "loss": 0.0256, "num_input_tokens_seen": 11015984, "step": 22375 }, { "epoch": 2.953675597202059, "grad_norm": 0.049433253705501556, "learning_rate": 8.583982306608984e-07, "loss": 0.0559, "num_input_tokens_seen": 11018224, "step": 22380 }, { "epoch": 2.9543354889798072, "grad_norm": 0.29679155349731445, "learning_rate": 8.579421897099307e-07, "loss": 0.0007, "num_input_tokens_seen": 11020528, "step": 22385 }, { "epoch": 2.9549953807575555, "grad_norm": 0.663558304309845, "learning_rate": 8.574861789097043e-07, "loss": 0.0417, "num_input_tokens_seen": 11023088, "step": 22390 }, { "epoch": 2.9556552725353042, "grad_norm": 1.3020586967468262, "learning_rate": 8.570301983570048e-07, "loss": 0.0681, "num_input_tokens_seen": 11025840, "step": 22395 }, { "epoch": 2.9563151643130525, "grad_norm": 19.917871475219727, "learning_rate": 8.565742481486102e-07, "loss": 0.0693, "num_input_tokens_seen": 11028016, "step": 22400 }, { "epoch": 2.9569750560908012, "grad_norm": 0.037922654300928116, "learning_rate": 8.561183283812928e-07, "loss": 0.0002, "num_input_tokens_seen": 11030192, "step": 22405 }, { "epoch": 2.9576349478685495, "grad_norm": 0.06563448905944824, "learning_rate": 8.556624391518182e-07, "loss": 0.0005, "num_input_tokens_seen": 11032688, "step": 22410 }, { "epoch": 2.958294839646298, "grad_norm": 0.048501260578632355, "learning_rate": 8.552065805569457e-07, "loss": 0.0344, "num_input_tokens_seen": 11035248, "step": 22415 }, { "epoch": 2.9589547314240465, "grad_norm": 0.046624649316072464, "learning_rate": 8.547507526934281e-07, "loss": 0.0848, "num_input_tokens_seen": 11037808, "step": 22420 }, { "epoch": 2.959614623201795, "grad_norm": 0.2368771731853485, "learning_rate": 8.542949556580114e-07, "loss": 0.0006, "num_input_tokens_seen": 11039984, "step": 22425 }, { "epoch": 2.9602745149795435, "grad_norm": 0.5295007824897766, "learning_rate": 8.538391895474353e-07, "loss": 0.0662, "num_input_tokens_seen": 11042544, "step": 22430 }, { "epoch": 2.960934406757292, "grad_norm": 0.8100051283836365, "learning_rate": 8.533834544584327e-07, "loss": 0.0357, "num_input_tokens_seen": 11045168, "step": 22435 }, { "epoch": 2.96159429853504, "grad_norm": 0.031259916722774506, "learning_rate": 8.529277504877301e-07, "loss": 0.043, "num_input_tokens_seen": 11047792, "step": 22440 }, { "epoch": 2.962254190312789, "grad_norm": 23.24228286743164, "learning_rate": 8.524720777320476e-07, "loss": 0.1004, "num_input_tokens_seen": 11050160, "step": 22445 }, { "epoch": 2.962914082090537, "grad_norm": 0.4114581346511841, "learning_rate": 8.520164362880986e-07, "loss": 0.0013, "num_input_tokens_seen": 11052720, "step": 22450 }, { "epoch": 2.963573973868286, "grad_norm": 0.01881307363510132, "learning_rate": 8.515608262525886e-07, "loss": 0.0006, "num_input_tokens_seen": 11054960, "step": 22455 }, { "epoch": 2.964233865646034, "grad_norm": 0.022714700549840927, "learning_rate": 8.511052477222189e-07, "loss": 0.0002, "num_input_tokens_seen": 11057712, "step": 22460 }, { "epoch": 2.9648937574237824, "grad_norm": 0.011891757138073444, "learning_rate": 8.50649700793682e-07, "loss": 0.0526, "num_input_tokens_seen": 11060400, "step": 22465 }, { "epoch": 2.965553649201531, "grad_norm": 0.012211556546390057, "learning_rate": 8.501941855636645e-07, "loss": 0.0001, "num_input_tokens_seen": 11063152, "step": 22470 }, { "epoch": 2.9662135409792794, "grad_norm": 0.08262446522712708, "learning_rate": 8.497387021288468e-07, "loss": 0.0368, "num_input_tokens_seen": 11065904, "step": 22475 }, { "epoch": 2.966873432757028, "grad_norm": 0.07934442162513733, "learning_rate": 8.492832505859007e-07, "loss": 0.0007, "num_input_tokens_seen": 11068272, "step": 22480 }, { "epoch": 2.9675333245347764, "grad_norm": 0.05161631479859352, "learning_rate": 8.488278310314939e-07, "loss": 0.0004, "num_input_tokens_seen": 11070640, "step": 22485 }, { "epoch": 2.9681932163125246, "grad_norm": 0.010977809317409992, "learning_rate": 8.483724435622847e-07, "loss": 0.0015, "num_input_tokens_seen": 11073136, "step": 22490 }, { "epoch": 2.968853108090273, "grad_norm": 0.007353820372372866, "learning_rate": 8.479170882749269e-07, "loss": 0.0001, "num_input_tokens_seen": 11075888, "step": 22495 }, { "epoch": 2.9695129998680216, "grad_norm": 0.6406528949737549, "learning_rate": 8.474617652660657e-07, "loss": 0.0995, "num_input_tokens_seen": 11078448, "step": 22500 }, { "epoch": 2.9701728916457704, "grad_norm": 0.007448363117873669, "learning_rate": 8.470064746323399e-07, "loss": 0.0001, "num_input_tokens_seen": 11081072, "step": 22505 }, { "epoch": 2.9708327834235186, "grad_norm": 0.01633693464100361, "learning_rate": 8.465512164703823e-07, "loss": 0.0007, "num_input_tokens_seen": 11083504, "step": 22510 }, { "epoch": 2.971492675201267, "grad_norm": 0.002468445338308811, "learning_rate": 8.460959908768173e-07, "loss": 0.0006, "num_input_tokens_seen": 11085872, "step": 22515 }, { "epoch": 2.972152566979015, "grad_norm": 0.00488969637081027, "learning_rate": 8.456407979482645e-07, "loss": 0.0751, "num_input_tokens_seen": 11088368, "step": 22520 }, { "epoch": 2.972812458756764, "grad_norm": 0.05592549219727516, "learning_rate": 8.451856377813342e-07, "loss": 0.0005, "num_input_tokens_seen": 11090864, "step": 22525 }, { "epoch": 2.973472350534512, "grad_norm": 0.007001897785812616, "learning_rate": 8.44730510472631e-07, "loss": 0.0004, "num_input_tokens_seen": 11093488, "step": 22530 }, { "epoch": 2.974132242312261, "grad_norm": 0.1100287213921547, "learning_rate": 8.442754161187528e-07, "loss": 0.0001, "num_input_tokens_seen": 11095792, "step": 22535 }, { "epoch": 2.974792134090009, "grad_norm": 0.005934002343565226, "learning_rate": 8.438203548162898e-07, "loss": 0.0001, "num_input_tokens_seen": 11098288, "step": 22540 }, { "epoch": 2.9754520258677575, "grad_norm": 0.013678577728569508, "learning_rate": 8.433653266618255e-07, "loss": 0.0257, "num_input_tokens_seen": 11100528, "step": 22545 }, { "epoch": 2.976111917645506, "grad_norm": 0.004507725592702627, "learning_rate": 8.429103317519366e-07, "loss": 0.0707, "num_input_tokens_seen": 11103152, "step": 22550 }, { "epoch": 2.9767718094232545, "grad_norm": 0.06873472779989243, "learning_rate": 8.424553701831919e-07, "loss": 0.0073, "num_input_tokens_seen": 11105840, "step": 22555 }, { "epoch": 2.977431701201003, "grad_norm": 0.006153427064418793, "learning_rate": 8.420004420521542e-07, "loss": 0.0646, "num_input_tokens_seen": 11107952, "step": 22560 }, { "epoch": 2.9780915929787515, "grad_norm": 0.004110193345695734, "learning_rate": 8.415455474553784e-07, "loss": 0.0, "num_input_tokens_seen": 11110384, "step": 22565 }, { "epoch": 2.9787514847564998, "grad_norm": 0.0027430232148617506, "learning_rate": 8.41090686489413e-07, "loss": 0.0783, "num_input_tokens_seen": 11112944, "step": 22570 }, { "epoch": 2.9794113765342485, "grad_norm": 0.03169501945376396, "learning_rate": 8.406358592507985e-07, "loss": 0.1547, "num_input_tokens_seen": 11115312, "step": 22575 }, { "epoch": 2.9800712683119968, "grad_norm": 56.20221710205078, "learning_rate": 8.401810658360686e-07, "loss": 0.1548, "num_input_tokens_seen": 11117872, "step": 22580 }, { "epoch": 2.9807311600897455, "grad_norm": 0.022906135767698288, "learning_rate": 8.397263063417506e-07, "loss": 0.0782, "num_input_tokens_seen": 11120624, "step": 22585 }, { "epoch": 2.9813910518674938, "grad_norm": 0.016489777714014053, "learning_rate": 8.39271580864363e-07, "loss": 0.0001, "num_input_tokens_seen": 11123120, "step": 22590 }, { "epoch": 2.982050943645242, "grad_norm": 25.647974014282227, "learning_rate": 8.388168895004189e-07, "loss": 0.071, "num_input_tokens_seen": 11125552, "step": 22595 }, { "epoch": 2.9827108354229908, "grad_norm": 20.744800567626953, "learning_rate": 8.383622323464226e-07, "loss": 0.1256, "num_input_tokens_seen": 11128176, "step": 22600 }, { "epoch": 2.983370727200739, "grad_norm": 11.403419494628906, "learning_rate": 8.379076094988718e-07, "loss": 0.0751, "num_input_tokens_seen": 11130480, "step": 22605 }, { "epoch": 2.9840306189784878, "grad_norm": 0.13708731532096863, "learning_rate": 8.374530210542575e-07, "loss": 0.1028, "num_input_tokens_seen": 11132848, "step": 22610 }, { "epoch": 2.984690510756236, "grad_norm": 0.019638191908597946, "learning_rate": 8.369984671090621e-07, "loss": 0.0007, "num_input_tokens_seen": 11135152, "step": 22615 }, { "epoch": 2.9853504025339843, "grad_norm": 320.8174133300781, "learning_rate": 8.365439477597619e-07, "loss": 0.019, "num_input_tokens_seen": 11137648, "step": 22620 }, { "epoch": 2.986010294311733, "grad_norm": 0.07475064694881439, "learning_rate": 8.360894631028254e-07, "loss": 0.0387, "num_input_tokens_seen": 11139888, "step": 22625 }, { "epoch": 2.9866701860894813, "grad_norm": 0.010483183898031712, "learning_rate": 8.356350132347127e-07, "loss": 0.0004, "num_input_tokens_seen": 11142512, "step": 22630 }, { "epoch": 2.98733007786723, "grad_norm": 1.2646681070327759, "learning_rate": 8.351805982518788e-07, "loss": 0.0013, "num_input_tokens_seen": 11144816, "step": 22635 }, { "epoch": 2.9879899696449783, "grad_norm": 0.13110283017158508, "learning_rate": 8.347262182507688e-07, "loss": 0.0283, "num_input_tokens_seen": 11147312, "step": 22640 }, { "epoch": 2.9886498614227266, "grad_norm": 0.06564207375049591, "learning_rate": 8.342718733278228e-07, "loss": 0.0003, "num_input_tokens_seen": 11149552, "step": 22645 }, { "epoch": 2.989309753200475, "grad_norm": 0.24977436661720276, "learning_rate": 8.338175635794713e-07, "loss": 0.0004, "num_input_tokens_seen": 11151984, "step": 22650 }, { "epoch": 2.9899696449782236, "grad_norm": 0.02693694457411766, "learning_rate": 8.333632891021383e-07, "loss": 0.1013, "num_input_tokens_seen": 11154544, "step": 22655 }, { "epoch": 2.990629536755972, "grad_norm": 0.034543056041002274, "learning_rate": 8.32909049992241e-07, "loss": 0.0648, "num_input_tokens_seen": 11157040, "step": 22660 }, { "epoch": 2.9912894285337206, "grad_norm": 0.0038696222472935915, "learning_rate": 8.324548463461871e-07, "loss": 0.1119, "num_input_tokens_seen": 11159408, "step": 22665 }, { "epoch": 2.991949320311469, "grad_norm": 0.06301067024469376, "learning_rate": 8.320006782603797e-07, "loss": 0.0001, "num_input_tokens_seen": 11161968, "step": 22670 }, { "epoch": 2.992609212089217, "grad_norm": 0.021865667775273323, "learning_rate": 8.315465458312114e-07, "loss": 0.0006, "num_input_tokens_seen": 11164336, "step": 22675 }, { "epoch": 2.993269103866966, "grad_norm": 0.0017529904143884778, "learning_rate": 8.310924491550688e-07, "loss": 0.0551, "num_input_tokens_seen": 11166832, "step": 22680 }, { "epoch": 2.993928995644714, "grad_norm": 0.01234086137264967, "learning_rate": 8.306383883283308e-07, "loss": 0.0014, "num_input_tokens_seen": 11168880, "step": 22685 }, { "epoch": 2.994588887422463, "grad_norm": 0.05340511351823807, "learning_rate": 8.301843634473683e-07, "loss": 0.0004, "num_input_tokens_seen": 11171184, "step": 22690 }, { "epoch": 2.995248779200211, "grad_norm": 0.029869886115193367, "learning_rate": 8.297303746085452e-07, "loss": 0.0201, "num_input_tokens_seen": 11173680, "step": 22695 }, { "epoch": 2.9959086709779594, "grad_norm": 171.3265380859375, "learning_rate": 8.292764219082168e-07, "loss": 0.0418, "num_input_tokens_seen": 11176240, "step": 22700 }, { "epoch": 2.996568562755708, "grad_norm": 0.5562390089035034, "learning_rate": 8.28822505442732e-07, "loss": 0.0009, "num_input_tokens_seen": 11178608, "step": 22705 }, { "epoch": 2.9972284545334564, "grad_norm": 0.10550618171691895, "learning_rate": 8.283686253084306e-07, "loss": 0.0041, "num_input_tokens_seen": 11181360, "step": 22710 }, { "epoch": 2.997888346311205, "grad_norm": 0.2801491618156433, "learning_rate": 8.279147816016455e-07, "loss": 0.0644, "num_input_tokens_seen": 11183856, "step": 22715 }, { "epoch": 2.9985482380889534, "grad_norm": 0.005332210101187229, "learning_rate": 8.274609744187021e-07, "loss": 0.0427, "num_input_tokens_seen": 11186608, "step": 22720 }, { "epoch": 2.9992081298667017, "grad_norm": 0.009198754094541073, "learning_rate": 8.270072038559172e-07, "loss": 0.0002, "num_input_tokens_seen": 11188912, "step": 22725 }, { "epoch": 2.9998680216444504, "grad_norm": 0.028327500447630882, "learning_rate": 8.265534700096008e-07, "loss": 0.0001, "num_input_tokens_seen": 11191408, "step": 22730 }, { "epoch": 3.0005279134221987, "grad_norm": 0.01000028196722269, "learning_rate": 8.260997729760544e-07, "loss": 0.0006, "num_input_tokens_seen": 11193728, "step": 22735 }, { "epoch": 3.001187805199947, "grad_norm": 0.016123950481414795, "learning_rate": 8.256461128515717e-07, "loss": 0.0352, "num_input_tokens_seen": 11196096, "step": 22740 }, { "epoch": 3.001187805199947, "eval_loss": 0.14833371341228485, "eval_runtime": 7.8562, "eval_samples_per_second": 857.28, "eval_steps_per_second": 107.176, "num_input_tokens_seen": 11196096, "step": 22740 }, { "epoch": 3.0018476969776957, "grad_norm": 0.05220545455813408, "learning_rate": 8.251924897324392e-07, "loss": 0.0003, "num_input_tokens_seen": 11198656, "step": 22745 }, { "epoch": 3.002507588755444, "grad_norm": 0.003668104764074087, "learning_rate": 8.247389037149346e-07, "loss": 0.0, "num_input_tokens_seen": 11201088, "step": 22750 }, { "epoch": 3.0031674805331927, "grad_norm": 0.039417386054992676, "learning_rate": 8.242853548953288e-07, "loss": 0.0, "num_input_tokens_seen": 11203648, "step": 22755 }, { "epoch": 3.003827372310941, "grad_norm": 0.008168110623955727, "learning_rate": 8.238318433698841e-07, "loss": 0.0, "num_input_tokens_seen": 11206400, "step": 22760 }, { "epoch": 3.0044872640886893, "grad_norm": 0.00275533483363688, "learning_rate": 8.233783692348546e-07, "loss": 0.0, "num_input_tokens_seen": 11208896, "step": 22765 }, { "epoch": 3.005147155866438, "grad_norm": 1.2400000095367432, "learning_rate": 8.229249325864874e-07, "loss": 0.0016, "num_input_tokens_seen": 11211328, "step": 22770 }, { "epoch": 3.0058070476441863, "grad_norm": 0.014781933277845383, "learning_rate": 8.224715335210208e-07, "loss": 0.0581, "num_input_tokens_seen": 11214080, "step": 22775 }, { "epoch": 3.006466939421935, "grad_norm": 0.0018302809912711382, "learning_rate": 8.22018172134686e-07, "loss": 0.0001, "num_input_tokens_seen": 11216320, "step": 22780 }, { "epoch": 3.0071268311996833, "grad_norm": 0.008619059808552265, "learning_rate": 8.215648485237054e-07, "loss": 0.0502, "num_input_tokens_seen": 11218880, "step": 22785 }, { "epoch": 3.0077867229774315, "grad_norm": 0.004694989416748285, "learning_rate": 8.211115627842931e-07, "loss": 0.0004, "num_input_tokens_seen": 11221312, "step": 22790 }, { "epoch": 3.0084466147551803, "grad_norm": 0.007136012427508831, "learning_rate": 8.206583150126564e-07, "loss": 0.0, "num_input_tokens_seen": 11223488, "step": 22795 }, { "epoch": 3.0091065065329285, "grad_norm": 0.10257059335708618, "learning_rate": 8.202051053049936e-07, "loss": 0.0001, "num_input_tokens_seen": 11225728, "step": 22800 }, { "epoch": 3.009766398310677, "grad_norm": 0.003423799527809024, "learning_rate": 8.197519337574953e-07, "loss": 0.0006, "num_input_tokens_seen": 11227904, "step": 22805 }, { "epoch": 3.0104262900884255, "grad_norm": 0.005957749206572771, "learning_rate": 8.192988004663442e-07, "loss": 0.0, "num_input_tokens_seen": 11230464, "step": 22810 }, { "epoch": 3.011086181866174, "grad_norm": 0.011167514137923717, "learning_rate": 8.188457055277133e-07, "loss": 0.0004, "num_input_tokens_seen": 11232960, "step": 22815 }, { "epoch": 3.0117460736439225, "grad_norm": 0.002534316387027502, "learning_rate": 8.183926490377703e-07, "loss": 0.0001, "num_input_tokens_seen": 11235456, "step": 22820 }, { "epoch": 3.012405965421671, "grad_norm": 0.00074008823139593, "learning_rate": 8.179396310926719e-07, "loss": 0.0007, "num_input_tokens_seen": 11237888, "step": 22825 }, { "epoch": 3.013065857199419, "grad_norm": 0.011752812191843987, "learning_rate": 8.17486651788569e-07, "loss": 0.0003, "num_input_tokens_seen": 11240576, "step": 22830 }, { "epoch": 3.013725748977168, "grad_norm": 0.022307157516479492, "learning_rate": 8.170337112216023e-07, "loss": 0.0001, "num_input_tokens_seen": 11242752, "step": 22835 }, { "epoch": 3.014385640754916, "grad_norm": 0.001109883887693286, "learning_rate": 8.165808094879054e-07, "loss": 0.0, "num_input_tokens_seen": 11245184, "step": 22840 }, { "epoch": 3.015045532532665, "grad_norm": 0.024239294230937958, "learning_rate": 8.161279466836036e-07, "loss": 0.0, "num_input_tokens_seen": 11247552, "step": 22845 }, { "epoch": 3.015705424310413, "grad_norm": 0.00042449618922546506, "learning_rate": 8.156751229048132e-07, "loss": 0.0005, "num_input_tokens_seen": 11249856, "step": 22850 }, { "epoch": 3.0163653160881614, "grad_norm": 0.004139366559684277, "learning_rate": 8.152223382476438e-07, "loss": 0.0, "num_input_tokens_seen": 11252608, "step": 22855 }, { "epoch": 3.01702520786591, "grad_norm": 0.00021089478104840964, "learning_rate": 8.14769592808195e-07, "loss": 0.0, "num_input_tokens_seen": 11255040, "step": 22860 }, { "epoch": 3.0176850996436584, "grad_norm": 11.633561134338379, "learning_rate": 8.143168866825583e-07, "loss": 0.0565, "num_input_tokens_seen": 11257600, "step": 22865 }, { "epoch": 3.018344991421407, "grad_norm": 0.006936497054994106, "learning_rate": 8.138642199668183e-07, "loss": 0.0, "num_input_tokens_seen": 11259904, "step": 22870 }, { "epoch": 3.0190048831991554, "grad_norm": 0.15562273561954498, "learning_rate": 8.134115927570493e-07, "loss": 0.1032, "num_input_tokens_seen": 11262272, "step": 22875 }, { "epoch": 3.0196647749769037, "grad_norm": 0.002753417706117034, "learning_rate": 8.129590051493189e-07, "loss": 0.0014, "num_input_tokens_seen": 11264512, "step": 22880 }, { "epoch": 3.0203246667546524, "grad_norm": 19.26506233215332, "learning_rate": 8.125064572396851e-07, "loss": 0.0036, "num_input_tokens_seen": 11267008, "step": 22885 }, { "epoch": 3.0209845585324007, "grad_norm": 0.003096244530752301, "learning_rate": 8.12053949124198e-07, "loss": 0.0001, "num_input_tokens_seen": 11269568, "step": 22890 }, { "epoch": 3.021644450310149, "grad_norm": 0.015391573309898376, "learning_rate": 8.116014808988993e-07, "loss": 0.0002, "num_input_tokens_seen": 11272192, "step": 22895 }, { "epoch": 3.0223043420878977, "grad_norm": 0.01115776039659977, "learning_rate": 8.111490526598217e-07, "loss": 0.0002, "num_input_tokens_seen": 11275008, "step": 22900 }, { "epoch": 3.022964233865646, "grad_norm": 0.005220245569944382, "learning_rate": 8.106966645029905e-07, "loss": 0.0367, "num_input_tokens_seen": 11277376, "step": 22905 }, { "epoch": 3.0236241256433947, "grad_norm": 14.08866024017334, "learning_rate": 8.102443165244213e-07, "loss": 0.0626, "num_input_tokens_seen": 11279936, "step": 22910 }, { "epoch": 3.024284017421143, "grad_norm": 0.002115392591804266, "learning_rate": 8.097920088201216e-07, "loss": 0.0, "num_input_tokens_seen": 11282432, "step": 22915 }, { "epoch": 3.0249439091988912, "grad_norm": 0.0018674953607842326, "learning_rate": 8.09339741486091e-07, "loss": 0.0323, "num_input_tokens_seen": 11285184, "step": 22920 }, { "epoch": 3.02560380097664, "grad_norm": 0.006753930356353521, "learning_rate": 8.088875146183192e-07, "loss": 0.0, "num_input_tokens_seen": 11287744, "step": 22925 }, { "epoch": 3.0262636927543882, "grad_norm": 0.0023846931289881468, "learning_rate": 8.084353283127889e-07, "loss": 0.0, "num_input_tokens_seen": 11289984, "step": 22930 }, { "epoch": 3.026923584532137, "grad_norm": 0.009844399988651276, "learning_rate": 8.079831826654729e-07, "loss": 0.0457, "num_input_tokens_seen": 11292416, "step": 22935 }, { "epoch": 3.0275834763098852, "grad_norm": 0.054123781621456146, "learning_rate": 8.075310777723357e-07, "loss": 0.0001, "num_input_tokens_seen": 11295104, "step": 22940 }, { "epoch": 3.0282433680876335, "grad_norm": 0.003461797721683979, "learning_rate": 8.070790137293338e-07, "loss": 0.0, "num_input_tokens_seen": 11297280, "step": 22945 }, { "epoch": 3.0289032598653822, "grad_norm": 0.005159073509275913, "learning_rate": 8.066269906324138e-07, "loss": 0.0783, "num_input_tokens_seen": 11299648, "step": 22950 }, { "epoch": 3.0295631516431305, "grad_norm": 0.19001533091068268, "learning_rate": 8.061750085775151e-07, "loss": 0.0002, "num_input_tokens_seen": 11302080, "step": 22955 }, { "epoch": 3.030223043420879, "grad_norm": 10.371720314025879, "learning_rate": 8.057230676605673e-07, "loss": 0.0538, "num_input_tokens_seen": 11304896, "step": 22960 }, { "epoch": 3.0308829351986275, "grad_norm": 0.008192854933440685, "learning_rate": 8.05271167977491e-07, "loss": 0.0, "num_input_tokens_seen": 11307328, "step": 22965 }, { "epoch": 3.031542826976376, "grad_norm": 0.0040962412022054195, "learning_rate": 8.048193096241999e-07, "loss": 0.0, "num_input_tokens_seen": 11309440, "step": 22970 }, { "epoch": 3.0322027187541245, "grad_norm": 0.1099163144826889, "learning_rate": 8.043674926965962e-07, "loss": 0.0001, "num_input_tokens_seen": 11311936, "step": 22975 }, { "epoch": 3.032862610531873, "grad_norm": 1.730372667312622, "learning_rate": 8.039157172905762e-07, "loss": 0.0834, "num_input_tokens_seen": 11314560, "step": 22980 }, { "epoch": 3.033522502309621, "grad_norm": 0.10862348228693008, "learning_rate": 8.034639835020251e-07, "loss": 0.0001, "num_input_tokens_seen": 11316992, "step": 22985 }, { "epoch": 3.03418239408737, "grad_norm": 0.0029926190618425608, "learning_rate": 8.030122914268198e-07, "loss": 0.0002, "num_input_tokens_seen": 11319616, "step": 22990 }, { "epoch": 3.034842285865118, "grad_norm": 0.21199586987495422, "learning_rate": 8.025606411608299e-07, "loss": 0.0005, "num_input_tokens_seen": 11322112, "step": 22995 }, { "epoch": 3.035502177642867, "grad_norm": 0.003737781662493944, "learning_rate": 8.021090327999135e-07, "loss": 0.0002, "num_input_tokens_seen": 11324480, "step": 23000 }, { "epoch": 3.036162069420615, "grad_norm": 0.0034118664916604757, "learning_rate": 8.016574664399225e-07, "loss": 0.0004, "num_input_tokens_seen": 11327040, "step": 23005 }, { "epoch": 3.0368219611983633, "grad_norm": 0.007330165710300207, "learning_rate": 8.012059421766972e-07, "loss": 0.0001, "num_input_tokens_seen": 11329408, "step": 23010 }, { "epoch": 3.037481852976112, "grad_norm": 0.025925220921635628, "learning_rate": 8.007544601060719e-07, "loss": 0.0, "num_input_tokens_seen": 11332032, "step": 23015 }, { "epoch": 3.0381417447538603, "grad_norm": 0.12799876928329468, "learning_rate": 8.003030203238694e-07, "loss": 0.0002, "num_input_tokens_seen": 11334528, "step": 23020 }, { "epoch": 3.0388016365316086, "grad_norm": 0.0025362554006278515, "learning_rate": 7.998516229259045e-07, "loss": 0.0004, "num_input_tokens_seen": 11337088, "step": 23025 }, { "epoch": 3.0394615283093573, "grad_norm": 0.008501172065734863, "learning_rate": 7.994002680079835e-07, "loss": 0.0, "num_input_tokens_seen": 11339584, "step": 23030 }, { "epoch": 3.0401214200871056, "grad_norm": 0.00011658469156827778, "learning_rate": 7.989489556659028e-07, "loss": 0.0, "num_input_tokens_seen": 11342144, "step": 23035 }, { "epoch": 3.0407813118648543, "grad_norm": 0.0857052430510521, "learning_rate": 7.984976859954506e-07, "loss": 0.0001, "num_input_tokens_seen": 11345024, "step": 23040 }, { "epoch": 3.0414412036426026, "grad_norm": 0.04220561310648918, "learning_rate": 7.980464590924054e-07, "loss": 0.0001, "num_input_tokens_seen": 11347456, "step": 23045 }, { "epoch": 3.042101095420351, "grad_norm": 0.032353613525629044, "learning_rate": 7.975952750525366e-07, "loss": 0.02, "num_input_tokens_seen": 11349760, "step": 23050 }, { "epoch": 3.0427609871980996, "grad_norm": 0.0022301196586340666, "learning_rate": 7.97144133971605e-07, "loss": 0.0, "num_input_tokens_seen": 11352512, "step": 23055 }, { "epoch": 3.043420878975848, "grad_norm": 0.0006173693109303713, "learning_rate": 7.966930359453619e-07, "loss": 0.0, "num_input_tokens_seen": 11354944, "step": 23060 }, { "epoch": 3.0440807707535966, "grad_norm": 0.005107264034450054, "learning_rate": 7.9624198106955e-07, "loss": 0.0003, "num_input_tokens_seen": 11357312, "step": 23065 }, { "epoch": 3.044740662531345, "grad_norm": 0.0043373508378863335, "learning_rate": 7.957909694399019e-07, "loss": 0.1689, "num_input_tokens_seen": 11359936, "step": 23070 }, { "epoch": 3.045400554309093, "grad_norm": 0.0003471885866019875, "learning_rate": 7.953400011521417e-07, "loss": 0.0, "num_input_tokens_seen": 11362240, "step": 23075 }, { "epoch": 3.046060446086842, "grad_norm": 0.2935470640659332, "learning_rate": 7.948890763019845e-07, "loss": 0.0002, "num_input_tokens_seen": 11364608, "step": 23080 }, { "epoch": 3.04672033786459, "grad_norm": 0.009840859100222588, "learning_rate": 7.944381949851353e-07, "loss": 0.0, "num_input_tokens_seen": 11366976, "step": 23085 }, { "epoch": 3.0473802296423385, "grad_norm": 0.023066749796271324, "learning_rate": 7.939873572972908e-07, "loss": 0.0002, "num_input_tokens_seen": 11369408, "step": 23090 }, { "epoch": 3.048040121420087, "grad_norm": 0.005535646341741085, "learning_rate": 7.93536563334138e-07, "loss": 0.0001, "num_input_tokens_seen": 11371904, "step": 23095 }, { "epoch": 3.0487000131978355, "grad_norm": 0.08973933756351471, "learning_rate": 7.930858131913541e-07, "loss": 0.0001, "num_input_tokens_seen": 11374656, "step": 23100 }, { "epoch": 3.049359904975584, "grad_norm": 0.0007211986230686307, "learning_rate": 7.926351069646084e-07, "loss": 0.0001, "num_input_tokens_seen": 11376832, "step": 23105 }, { "epoch": 3.0500197967533325, "grad_norm": 0.0014363115187734365, "learning_rate": 7.921844447495594e-07, "loss": 0.0002, "num_input_tokens_seen": 11379264, "step": 23110 }, { "epoch": 3.0506796885310807, "grad_norm": 0.018530454486608505, "learning_rate": 7.917338266418573e-07, "loss": 0.0001, "num_input_tokens_seen": 11381504, "step": 23115 }, { "epoch": 3.0513395803088295, "grad_norm": 0.0006127398228272796, "learning_rate": 7.912832527371426e-07, "loss": 0.0006, "num_input_tokens_seen": 11384320, "step": 23120 }, { "epoch": 3.0519994720865777, "grad_norm": 0.023522265255451202, "learning_rate": 7.908327231310454e-07, "loss": 0.0, "num_input_tokens_seen": 11386752, "step": 23125 }, { "epoch": 3.0526593638643265, "grad_norm": 0.15402010083198547, "learning_rate": 7.903822379191885e-07, "loss": 0.0001, "num_input_tokens_seen": 11389120, "step": 23130 }, { "epoch": 3.0533192556420747, "grad_norm": 0.04021283984184265, "learning_rate": 7.899317971971835e-07, "loss": 0.0002, "num_input_tokens_seen": 11391680, "step": 23135 }, { "epoch": 3.053979147419823, "grad_norm": 0.006357967387884855, "learning_rate": 7.894814010606336e-07, "loss": 0.0, "num_input_tokens_seen": 11394176, "step": 23140 }, { "epoch": 3.0546390391975717, "grad_norm": 0.00018174726574216038, "learning_rate": 7.890310496051319e-07, "loss": 0.0, "num_input_tokens_seen": 11396480, "step": 23145 }, { "epoch": 3.05529893097532, "grad_norm": 0.0018396085361018777, "learning_rate": 7.885807429262616e-07, "loss": 0.0, "num_input_tokens_seen": 11399104, "step": 23150 }, { "epoch": 3.0559588227530683, "grad_norm": 0.0608244314789772, "learning_rate": 7.881304811195985e-07, "loss": 0.0007, "num_input_tokens_seen": 11401664, "step": 23155 }, { "epoch": 3.056618714530817, "grad_norm": 0.0004480116767808795, "learning_rate": 7.876802642807056e-07, "loss": 0.0, "num_input_tokens_seen": 11403968, "step": 23160 }, { "epoch": 3.0572786063085653, "grad_norm": 0.024656491354107857, "learning_rate": 7.8723009250514e-07, "loss": 0.0, "num_input_tokens_seen": 11406720, "step": 23165 }, { "epoch": 3.057938498086314, "grad_norm": 0.00041712072561495006, "learning_rate": 7.867799658884462e-07, "loss": 0.0054, "num_input_tokens_seen": 11409472, "step": 23170 }, { "epoch": 3.0585983898640623, "grad_norm": 0.0006589332479052246, "learning_rate": 7.863298845261603e-07, "loss": 0.0196, "num_input_tokens_seen": 11412160, "step": 23175 }, { "epoch": 3.0592582816418106, "grad_norm": 8.614584658062086e-05, "learning_rate": 7.858798485138095e-07, "loss": 0.0, "num_input_tokens_seen": 11414528, "step": 23180 }, { "epoch": 3.0599181734195593, "grad_norm": 0.05452264845371246, "learning_rate": 7.854298579469099e-07, "loss": 0.0, "num_input_tokens_seen": 11416960, "step": 23185 }, { "epoch": 3.0605780651973076, "grad_norm": 0.0005021971301175654, "learning_rate": 7.849799129209697e-07, "loss": 0.0002, "num_input_tokens_seen": 11419584, "step": 23190 }, { "epoch": 3.0612379569750563, "grad_norm": 0.0011811050353571773, "learning_rate": 7.845300135314857e-07, "loss": 0.0, "num_input_tokens_seen": 11422016, "step": 23195 }, { "epoch": 3.0618978487528046, "grad_norm": 1.7594058513641357, "learning_rate": 7.840801598739459e-07, "loss": 0.0002, "num_input_tokens_seen": 11424512, "step": 23200 }, { "epoch": 3.062557740530553, "grad_norm": 1.5043967323435936e-05, "learning_rate": 7.836303520438288e-07, "loss": 0.0374, "num_input_tokens_seen": 11426944, "step": 23205 }, { "epoch": 3.0632176323083016, "grad_norm": 7.377984002232552e-05, "learning_rate": 7.831805901366025e-07, "loss": 0.0549, "num_input_tokens_seen": 11429248, "step": 23210 }, { "epoch": 3.06387752408605, "grad_norm": 0.00022651898325420916, "learning_rate": 7.827308742477259e-07, "loss": 0.0, "num_input_tokens_seen": 11431872, "step": 23215 }, { "epoch": 3.064537415863798, "grad_norm": 0.0007423846400342882, "learning_rate": 7.822812044726479e-07, "loss": 0.0, "num_input_tokens_seen": 11434368, "step": 23220 }, { "epoch": 3.065197307641547, "grad_norm": 3.652780287666246e-05, "learning_rate": 7.818315809068076e-07, "loss": 0.1095, "num_input_tokens_seen": 11436800, "step": 23225 }, { "epoch": 3.065857199419295, "grad_norm": 0.001282864366658032, "learning_rate": 7.813820036456344e-07, "loss": 0.0, "num_input_tokens_seen": 11439360, "step": 23230 }, { "epoch": 3.066517091197044, "grad_norm": 0.0016766481567174196, "learning_rate": 7.809324727845478e-07, "loss": 0.0001, "num_input_tokens_seen": 11441728, "step": 23235 }, { "epoch": 3.067176982974792, "grad_norm": 0.0020113263744860888, "learning_rate": 7.804829884189576e-07, "loss": 0.0, "num_input_tokens_seen": 11444480, "step": 23240 }, { "epoch": 3.0678368747525404, "grad_norm": 0.0011261154431849718, "learning_rate": 7.800335506442635e-07, "loss": 0.0, "num_input_tokens_seen": 11447168, "step": 23245 }, { "epoch": 3.068496766530289, "grad_norm": 0.0004977425560355186, "learning_rate": 7.795841595558554e-07, "loss": 0.0, "num_input_tokens_seen": 11449856, "step": 23250 }, { "epoch": 3.0691566583080374, "grad_norm": 0.0013624058337882161, "learning_rate": 7.791348152491133e-07, "loss": 0.0007, "num_input_tokens_seen": 11452224, "step": 23255 }, { "epoch": 3.069816550085786, "grad_norm": 0.0009466035990044475, "learning_rate": 7.78685517819407e-07, "loss": 0.0, "num_input_tokens_seen": 11454848, "step": 23260 }, { "epoch": 3.0704764418635344, "grad_norm": 0.0009397098328918219, "learning_rate": 7.782362673620972e-07, "loss": 0.0, "num_input_tokens_seen": 11457088, "step": 23265 }, { "epoch": 3.0711363336412827, "grad_norm": 0.10596859455108643, "learning_rate": 7.777870639725339e-07, "loss": 0.0001, "num_input_tokens_seen": 11459520, "step": 23270 }, { "epoch": 3.0717962254190314, "grad_norm": 0.00013747526099905372, "learning_rate": 7.773379077460569e-07, "loss": 0.0001, "num_input_tokens_seen": 11461952, "step": 23275 }, { "epoch": 3.0724561171967797, "grad_norm": 0.9691517949104309, "learning_rate": 7.768887987779966e-07, "loss": 0.0003, "num_input_tokens_seen": 11464512, "step": 23280 }, { "epoch": 3.073116008974528, "grad_norm": 2.7859707188326865e-05, "learning_rate": 7.764397371636731e-07, "loss": 0.0, "num_input_tokens_seen": 11467008, "step": 23285 }, { "epoch": 3.0737759007522767, "grad_norm": 0.0008346032118424773, "learning_rate": 7.759907229983967e-07, "loss": 0.0, "num_input_tokens_seen": 11469120, "step": 23290 }, { "epoch": 3.074435792530025, "grad_norm": 0.0007078782073222101, "learning_rate": 7.755417563774673e-07, "loss": 0.0, "num_input_tokens_seen": 11471744, "step": 23295 }, { "epoch": 3.0750956843077737, "grad_norm": 0.0009659160277806222, "learning_rate": 7.75092837396174e-07, "loss": 0.0756, "num_input_tokens_seen": 11474112, "step": 23300 }, { "epoch": 3.075755576085522, "grad_norm": 0.0002244488277938217, "learning_rate": 7.746439661497981e-07, "loss": 0.0, "num_input_tokens_seen": 11476736, "step": 23305 }, { "epoch": 3.0764154678632702, "grad_norm": 0.0208339411765337, "learning_rate": 7.741951427336078e-07, "loss": 0.0813, "num_input_tokens_seen": 11479168, "step": 23310 }, { "epoch": 3.077075359641019, "grad_norm": 0.0005570108769461513, "learning_rate": 7.737463672428638e-07, "loss": 0.0, "num_input_tokens_seen": 11481664, "step": 23315 }, { "epoch": 3.0777352514187672, "grad_norm": 2.2254424038692378e-05, "learning_rate": 7.732976397728151e-07, "loss": 0.0005, "num_input_tokens_seen": 11484160, "step": 23320 }, { "epoch": 3.078395143196516, "grad_norm": 0.038763657212257385, "learning_rate": 7.728489604187001e-07, "loss": 0.0001, "num_input_tokens_seen": 11486592, "step": 23325 }, { "epoch": 3.0790550349742642, "grad_norm": 0.009633745066821575, "learning_rate": 7.72400329275749e-07, "loss": 0.0, "num_input_tokens_seen": 11489088, "step": 23330 }, { "epoch": 3.0797149267520125, "grad_norm": 60.851131439208984, "learning_rate": 7.719517464391791e-07, "loss": 0.0252, "num_input_tokens_seen": 11491392, "step": 23335 }, { "epoch": 3.0803748185297612, "grad_norm": 2.9119068130967207e-05, "learning_rate": 7.715032120042004e-07, "loss": 0.0016, "num_input_tokens_seen": 11493760, "step": 23340 }, { "epoch": 3.0810347103075095, "grad_norm": 0.0005959281697869301, "learning_rate": 7.710547260660096e-07, "loss": 0.0001, "num_input_tokens_seen": 11496320, "step": 23345 }, { "epoch": 3.081694602085258, "grad_norm": 0.00015468845958821476, "learning_rate": 7.706062887197959e-07, "loss": 0.0, "num_input_tokens_seen": 11498688, "step": 23350 }, { "epoch": 3.0823544938630065, "grad_norm": 5.800633516628295e-05, "learning_rate": 7.701579000607362e-07, "loss": 0.0, "num_input_tokens_seen": 11501248, "step": 23355 }, { "epoch": 3.083014385640755, "grad_norm": 1.4387388546310831e-05, "learning_rate": 7.697095601839975e-07, "loss": 0.0, "num_input_tokens_seen": 11503680, "step": 23360 }, { "epoch": 3.0836742774185035, "grad_norm": 0.009254597127437592, "learning_rate": 7.692612691847373e-07, "loss": 0.0, "num_input_tokens_seen": 11506176, "step": 23365 }, { "epoch": 3.084334169196252, "grad_norm": 14.855461120605469, "learning_rate": 7.688130271581015e-07, "loss": 0.0673, "num_input_tokens_seen": 11509056, "step": 23370 }, { "epoch": 3.084994060974, "grad_norm": 0.004579714499413967, "learning_rate": 7.68364834199227e-07, "loss": 0.0, "num_input_tokens_seen": 11511296, "step": 23375 }, { "epoch": 3.085653952751749, "grad_norm": 0.0009248594287782907, "learning_rate": 7.679166904032389e-07, "loss": 0.0, "num_input_tokens_seen": 11513856, "step": 23380 }, { "epoch": 3.086313844529497, "grad_norm": 7.695920794503763e-05, "learning_rate": 7.674685958652525e-07, "loss": 0.0002, "num_input_tokens_seen": 11516160, "step": 23385 }, { "epoch": 3.086973736307246, "grad_norm": 0.0007654979126527905, "learning_rate": 7.67020550680373e-07, "loss": 0.0252, "num_input_tokens_seen": 11518400, "step": 23390 }, { "epoch": 3.087633628084994, "grad_norm": 0.0008947370224632323, "learning_rate": 7.665725549436942e-07, "loss": 0.1016, "num_input_tokens_seen": 11521152, "step": 23395 }, { "epoch": 3.0882935198627424, "grad_norm": 0.0005049702012911439, "learning_rate": 7.661246087503006e-07, "loss": 0.0427, "num_input_tokens_seen": 11523776, "step": 23400 }, { "epoch": 3.088953411640491, "grad_norm": 0.009187503717839718, "learning_rate": 7.656767121952651e-07, "loss": 0.0001, "num_input_tokens_seen": 11526208, "step": 23405 }, { "epoch": 3.0896133034182394, "grad_norm": 0.0003884216712322086, "learning_rate": 7.652288653736504e-07, "loss": 0.0002, "num_input_tokens_seen": 11528704, "step": 23410 }, { "epoch": 3.0902731951959876, "grad_norm": 0.001425333321094513, "learning_rate": 7.647810683805091e-07, "loss": 0.0, "num_input_tokens_seen": 11531456, "step": 23415 }, { "epoch": 3.0909330869737364, "grad_norm": 0.006533483508974314, "learning_rate": 7.643333213108827e-07, "loss": 0.117, "num_input_tokens_seen": 11533824, "step": 23420 }, { "epoch": 3.0915929787514846, "grad_norm": 0.008299448527395725, "learning_rate": 7.638856242598024e-07, "loss": 0.0, "num_input_tokens_seen": 11536384, "step": 23425 }, { "epoch": 3.0922528705292334, "grad_norm": 0.002385131549090147, "learning_rate": 7.634379773222885e-07, "loss": 0.002, "num_input_tokens_seen": 11538944, "step": 23430 }, { "epoch": 3.0929127623069816, "grad_norm": 0.006455971393734217, "learning_rate": 7.629903805933506e-07, "loss": 0.0, "num_input_tokens_seen": 11541376, "step": 23435 }, { "epoch": 3.09357265408473, "grad_norm": 0.0007826373912394047, "learning_rate": 7.625428341679885e-07, "loss": 0.0, "num_input_tokens_seen": 11543872, "step": 23440 }, { "epoch": 3.0942325458624786, "grad_norm": 0.0028275828808546066, "learning_rate": 7.6209533814119e-07, "loss": 0.0456, "num_input_tokens_seen": 11546368, "step": 23445 }, { "epoch": 3.094892437640227, "grad_norm": 0.0017768917605280876, "learning_rate": 7.616478926079335e-07, "loss": 0.0, "num_input_tokens_seen": 11548928, "step": 23450 }, { "epoch": 3.0955523294179756, "grad_norm": 0.00031777186086401343, "learning_rate": 7.612004976631857e-07, "loss": 0.0, "num_input_tokens_seen": 11551680, "step": 23455 }, { "epoch": 3.096212221195724, "grad_norm": 0.0034608703572303057, "learning_rate": 7.607531534019028e-07, "loss": 0.0, "num_input_tokens_seen": 11554048, "step": 23460 }, { "epoch": 3.096872112973472, "grad_norm": 0.004485815763473511, "learning_rate": 7.60305859919031e-07, "loss": 0.0002, "num_input_tokens_seen": 11556416, "step": 23465 }, { "epoch": 3.097532004751221, "grad_norm": 0.0004141594690736383, "learning_rate": 7.598586173095043e-07, "loss": 0.0, "num_input_tokens_seen": 11558912, "step": 23470 }, { "epoch": 3.098191896528969, "grad_norm": 0.00151504622772336, "learning_rate": 7.594114256682473e-07, "loss": 0.0, "num_input_tokens_seen": 11561472, "step": 23475 }, { "epoch": 3.0988517883067175, "grad_norm": 0.001460838713683188, "learning_rate": 7.589642850901733e-07, "loss": 0.0719, "num_input_tokens_seen": 11563840, "step": 23480 }, { "epoch": 3.099511680084466, "grad_norm": 0.580944299697876, "learning_rate": 7.585171956701837e-07, "loss": 0.001, "num_input_tokens_seen": 11566528, "step": 23485 }, { "epoch": 3.1001715718622145, "grad_norm": 0.00398764293640852, "learning_rate": 7.580701575031713e-07, "loss": 0.0, "num_input_tokens_seen": 11568640, "step": 23490 }, { "epoch": 3.100831463639963, "grad_norm": 0.0029877680353820324, "learning_rate": 7.576231706840154e-07, "loss": 0.0009, "num_input_tokens_seen": 11571136, "step": 23495 }, { "epoch": 3.1014913554177115, "grad_norm": 0.0001967934367712587, "learning_rate": 7.571762353075869e-07, "loss": 0.0, "num_input_tokens_seen": 11573568, "step": 23500 }, { "epoch": 3.1021512471954598, "grad_norm": 0.0004972056485712528, "learning_rate": 7.56729351468744e-07, "loss": 0.0, "num_input_tokens_seen": 11576064, "step": 23505 }, { "epoch": 3.1028111389732085, "grad_norm": 0.011523031629621983, "learning_rate": 7.562825192623341e-07, "loss": 0.0722, "num_input_tokens_seen": 11578496, "step": 23510 }, { "epoch": 3.1034710307509568, "grad_norm": 0.0016027435194700956, "learning_rate": 7.558357387831953e-07, "loss": 0.0, "num_input_tokens_seen": 11580992, "step": 23515 }, { "epoch": 3.1041309225287055, "grad_norm": 0.0047021047212183475, "learning_rate": 7.553890101261522e-07, "loss": 0.0164, "num_input_tokens_seen": 11583488, "step": 23520 }, { "epoch": 3.1047908143064538, "grad_norm": 0.004702253267168999, "learning_rate": 7.54942333386021e-07, "loss": 0.0, "num_input_tokens_seen": 11586240, "step": 23525 }, { "epoch": 3.105450706084202, "grad_norm": 0.0065322984009981155, "learning_rate": 7.544957086576049e-07, "loss": 0.0, "num_input_tokens_seen": 11589056, "step": 23530 }, { "epoch": 3.1061105978619508, "grad_norm": 0.016910886391997337, "learning_rate": 7.540491360356965e-07, "loss": 0.0, "num_input_tokens_seen": 11591296, "step": 23535 }, { "epoch": 3.106770489639699, "grad_norm": 0.015758175402879715, "learning_rate": 7.53602615615078e-07, "loss": 0.0001, "num_input_tokens_seen": 11593856, "step": 23540 }, { "epoch": 3.1074303814174478, "grad_norm": 0.007520774845033884, "learning_rate": 7.5315614749052e-07, "loss": 0.0, "num_input_tokens_seen": 11596096, "step": 23545 }, { "epoch": 3.108090273195196, "grad_norm": 0.0007047753897495568, "learning_rate": 7.527097317567824e-07, "loss": 0.0, "num_input_tokens_seen": 11598592, "step": 23550 }, { "epoch": 3.1087501649729443, "grad_norm": 0.000877385726198554, "learning_rate": 7.522633685086135e-07, "loss": 0.063, "num_input_tokens_seen": 11601088, "step": 23555 }, { "epoch": 3.109410056750693, "grad_norm": 0.0026626859325915575, "learning_rate": 7.518170578407505e-07, "loss": 0.0026, "num_input_tokens_seen": 11603712, "step": 23560 }, { "epoch": 3.1100699485284413, "grad_norm": 0.0020679160952568054, "learning_rate": 7.513707998479199e-07, "loss": 0.0003, "num_input_tokens_seen": 11606080, "step": 23565 }, { "epoch": 3.1107298403061896, "grad_norm": 0.0006194873712956905, "learning_rate": 7.509245946248363e-07, "loss": 0.0, "num_input_tokens_seen": 11608192, "step": 23570 }, { "epoch": 3.1113897320839383, "grad_norm": 0.0005135766696184874, "learning_rate": 7.504784422662042e-07, "loss": 0.0001, "num_input_tokens_seen": 11610496, "step": 23575 }, { "epoch": 3.1120496238616866, "grad_norm": 0.11313286423683167, "learning_rate": 7.500323428667159e-07, "loss": 0.0004, "num_input_tokens_seen": 11613120, "step": 23580 }, { "epoch": 3.1127095156394353, "grad_norm": 15.523405075073242, "learning_rate": 7.495862965210525e-07, "loss": 0.0337, "num_input_tokens_seen": 11615296, "step": 23585 }, { "epoch": 3.1133694074171836, "grad_norm": 0.0008880163659341633, "learning_rate": 7.491403033238844e-07, "loss": 0.0008, "num_input_tokens_seen": 11617600, "step": 23590 }, { "epoch": 3.114029299194932, "grad_norm": 0.0013785932678729296, "learning_rate": 7.4869436336987e-07, "loss": 0.0323, "num_input_tokens_seen": 11619968, "step": 23595 }, { "epoch": 3.1146891909726806, "grad_norm": 0.007156513165682554, "learning_rate": 7.482484767536576e-07, "loss": 0.0002, "num_input_tokens_seen": 11622336, "step": 23600 }, { "epoch": 3.115349082750429, "grad_norm": 0.03351299464702606, "learning_rate": 7.478026435698827e-07, "loss": 0.0001, "num_input_tokens_seen": 11624896, "step": 23605 }, { "epoch": 3.116008974528177, "grad_norm": 1.2239532470703125, "learning_rate": 7.473568639131706e-07, "loss": 0.0005, "num_input_tokens_seen": 11627264, "step": 23610 }, { "epoch": 3.116668866305926, "grad_norm": 0.24902966618537903, "learning_rate": 7.469111378781346e-07, "loss": 0.0001, "num_input_tokens_seen": 11630016, "step": 23615 }, { "epoch": 3.117328758083674, "grad_norm": 0.0001732937671476975, "learning_rate": 7.464654655593767e-07, "loss": 0.1031, "num_input_tokens_seen": 11632448, "step": 23620 }, { "epoch": 3.117988649861423, "grad_norm": 0.002056445460766554, "learning_rate": 7.46019847051488e-07, "loss": 0.0, "num_input_tokens_seen": 11634944, "step": 23625 }, { "epoch": 3.118648541639171, "grad_norm": 0.0006751983892172575, "learning_rate": 7.455742824490477e-07, "loss": 0.0002, "num_input_tokens_seen": 11637632, "step": 23630 }, { "epoch": 3.1193084334169194, "grad_norm": 0.0006119022727943957, "learning_rate": 7.45128771846623e-07, "loss": 0.0, "num_input_tokens_seen": 11640192, "step": 23635 }, { "epoch": 3.119968325194668, "grad_norm": 0.0014791876310482621, "learning_rate": 7.446833153387714e-07, "loss": 0.0, "num_input_tokens_seen": 11642880, "step": 23640 }, { "epoch": 3.1206282169724164, "grad_norm": 0.001501706661656499, "learning_rate": 7.442379130200369e-07, "loss": 0.0, "num_input_tokens_seen": 11645184, "step": 23645 }, { "epoch": 3.121288108750165, "grad_norm": 0.0006226670229807496, "learning_rate": 7.437925649849534e-07, "loss": 0.0, "num_input_tokens_seen": 11647552, "step": 23650 }, { "epoch": 3.1219480005279134, "grad_norm": 34.658870697021484, "learning_rate": 7.433472713280426e-07, "loss": 0.0613, "num_input_tokens_seen": 11650240, "step": 23655 }, { "epoch": 3.1226078923056617, "grad_norm": 1.7755703926086426, "learning_rate": 7.42902032143815e-07, "loss": 0.0004, "num_input_tokens_seen": 11652672, "step": 23660 }, { "epoch": 3.1232677840834104, "grad_norm": 0.0004984392435289919, "learning_rate": 7.424568475267697e-07, "loss": 0.0, "num_input_tokens_seen": 11654848, "step": 23665 }, { "epoch": 3.1239276758611587, "grad_norm": 9.023253369377926e-05, "learning_rate": 7.42011717571393e-07, "loss": 0.0, "num_input_tokens_seen": 11656896, "step": 23670 }, { "epoch": 3.1245875676389074, "grad_norm": 0.00040223190444521606, "learning_rate": 7.415666423721613e-07, "loss": 0.0, "num_input_tokens_seen": 11659264, "step": 23675 }, { "epoch": 3.1252474594166557, "grad_norm": 0.00010142037353944033, "learning_rate": 7.411216220235381e-07, "loss": 0.0, "num_input_tokens_seen": 11661760, "step": 23680 }, { "epoch": 3.125907351194404, "grad_norm": 0.0006684021791443229, "learning_rate": 7.406766566199762e-07, "loss": 0.0891, "num_input_tokens_seen": 11664000, "step": 23685 }, { "epoch": 3.1265672429721527, "grad_norm": 0.0007115312619134784, "learning_rate": 7.402317462559163e-07, "loss": 0.0001, "num_input_tokens_seen": 11666624, "step": 23690 }, { "epoch": 3.127227134749901, "grad_norm": 0.043818097561597824, "learning_rate": 7.397868910257865e-07, "loss": 0.0, "num_input_tokens_seen": 11669376, "step": 23695 }, { "epoch": 3.1278870265276493, "grad_norm": 0.0004242084105499089, "learning_rate": 7.393420910240054e-07, "loss": 0.0564, "num_input_tokens_seen": 11672128, "step": 23700 }, { "epoch": 3.128546918305398, "grad_norm": 0.0007881783531047404, "learning_rate": 7.388973463449773e-07, "loss": 0.0, "num_input_tokens_seen": 11674496, "step": 23705 }, { "epoch": 3.1292068100831463, "grad_norm": 0.00019011733820661902, "learning_rate": 7.384526570830972e-07, "loss": 0.0, "num_input_tokens_seen": 11676992, "step": 23710 }, { "epoch": 3.129866701860895, "grad_norm": 0.0006559406756423414, "learning_rate": 7.380080233327466e-07, "loss": 0.0004, "num_input_tokens_seen": 11679744, "step": 23715 }, { "epoch": 3.1305265936386433, "grad_norm": 0.003428281983360648, "learning_rate": 7.375634451882956e-07, "loss": 0.0087, "num_input_tokens_seen": 11682048, "step": 23720 }, { "epoch": 3.1311864854163916, "grad_norm": 88.51753997802734, "learning_rate": 7.371189227441031e-07, "loss": 0.0213, "num_input_tokens_seen": 11684608, "step": 23725 }, { "epoch": 3.1318463771941403, "grad_norm": 0.4643154740333557, "learning_rate": 7.366744560945155e-07, "loss": 0.0552, "num_input_tokens_seen": 11686976, "step": 23730 }, { "epoch": 3.1325062689718886, "grad_norm": 0.00041846715612336993, "learning_rate": 7.362300453338679e-07, "loss": 0.0001, "num_input_tokens_seen": 11689280, "step": 23735 }, { "epoch": 3.133166160749637, "grad_norm": 0.01871989667415619, "learning_rate": 7.357856905564832e-07, "loss": 0.0, "num_input_tokens_seen": 11691776, "step": 23740 }, { "epoch": 3.1338260525273856, "grad_norm": 0.0005692157428711653, "learning_rate": 7.353413918566721e-07, "loss": 0.0, "num_input_tokens_seen": 11694080, "step": 23745 }, { "epoch": 3.134485944305134, "grad_norm": 0.00029191409703344107, "learning_rate": 7.348971493287342e-07, "loss": 0.0, "num_input_tokens_seen": 11696640, "step": 23750 }, { "epoch": 3.1351458360828826, "grad_norm": 0.00030911393696442246, "learning_rate": 7.344529630669565e-07, "loss": 0.0239, "num_input_tokens_seen": 11699136, "step": 23755 }, { "epoch": 3.135805727860631, "grad_norm": 0.0006664734100922942, "learning_rate": 7.340088331656147e-07, "loss": 0.0005, "num_input_tokens_seen": 11701632, "step": 23760 }, { "epoch": 3.136465619638379, "grad_norm": 0.0018933032406494021, "learning_rate": 7.33564759718972e-07, "loss": 0.0322, "num_input_tokens_seen": 11704000, "step": 23765 }, { "epoch": 3.137125511416128, "grad_norm": 0.0021034155506640673, "learning_rate": 7.331207428212792e-07, "loss": 0.0472, "num_input_tokens_seen": 11706624, "step": 23770 }, { "epoch": 3.137785403193876, "grad_norm": 1.0103815793991089, "learning_rate": 7.326767825667766e-07, "loss": 0.0004, "num_input_tokens_seen": 11708864, "step": 23775 }, { "epoch": 3.138445294971625, "grad_norm": 0.0008806603727862239, "learning_rate": 7.322328790496908e-07, "loss": 0.0215, "num_input_tokens_seen": 11711296, "step": 23780 }, { "epoch": 3.139105186749373, "grad_norm": 0.13981249928474426, "learning_rate": 7.317890323642375e-07, "loss": 0.0001, "num_input_tokens_seen": 11713600, "step": 23785 }, { "epoch": 3.1397650785271214, "grad_norm": 0.00162605254445225, "learning_rate": 7.3134524260462e-07, "loss": 0.0, "num_input_tokens_seen": 11716032, "step": 23790 }, { "epoch": 3.14042497030487, "grad_norm": 0.0005970995989628136, "learning_rate": 7.30901509865029e-07, "loss": 0.0, "num_input_tokens_seen": 11718592, "step": 23795 }, { "epoch": 3.1410848620826184, "grad_norm": 0.008737782947719097, "learning_rate": 7.304578342396441e-07, "loss": 0.0001, "num_input_tokens_seen": 11721280, "step": 23800 }, { "epoch": 3.141744753860367, "grad_norm": 0.0008488676394335926, "learning_rate": 7.300142158226319e-07, "loss": 0.0014, "num_input_tokens_seen": 11723904, "step": 23805 }, { "epoch": 3.1424046456381154, "grad_norm": 0.001676439307630062, "learning_rate": 7.295706547081475e-07, "loss": 0.0, "num_input_tokens_seen": 11726336, "step": 23810 }, { "epoch": 3.1430645374158637, "grad_norm": 0.0002265098737552762, "learning_rate": 7.291271509903334e-07, "loss": 0.0, "num_input_tokens_seen": 11728640, "step": 23815 }, { "epoch": 3.1437244291936124, "grad_norm": 0.6636281609535217, "learning_rate": 7.286837047633195e-07, "loss": 0.0001, "num_input_tokens_seen": 11731072, "step": 23820 }, { "epoch": 3.1443843209713607, "grad_norm": 0.0003701484529301524, "learning_rate": 7.282403161212251e-07, "loss": 0.0001, "num_input_tokens_seen": 11733568, "step": 23825 }, { "epoch": 3.145044212749109, "grad_norm": 17.29320526123047, "learning_rate": 7.277969851581551e-07, "loss": 0.1047, "num_input_tokens_seen": 11736064, "step": 23830 }, { "epoch": 3.1457041045268577, "grad_norm": 0.0005500232218764722, "learning_rate": 7.273537119682045e-07, "loss": 0.0001, "num_input_tokens_seen": 11738432, "step": 23835 }, { "epoch": 3.146363996304606, "grad_norm": 0.0012078273575752974, "learning_rate": 7.26910496645454e-07, "loss": 0.0, "num_input_tokens_seen": 11741120, "step": 23840 }, { "epoch": 3.1470238880823547, "grad_norm": 0.00023791982675902545, "learning_rate": 7.264673392839726e-07, "loss": 0.0, "num_input_tokens_seen": 11743296, "step": 23845 }, { "epoch": 3.147683779860103, "grad_norm": 0.010573324747383595, "learning_rate": 7.260242399778183e-07, "loss": 0.0411, "num_input_tokens_seen": 11745792, "step": 23850 }, { "epoch": 3.1483436716378512, "grad_norm": 0.0007384056807495654, "learning_rate": 7.255811988210343e-07, "loss": 0.0252, "num_input_tokens_seen": 11747968, "step": 23855 }, { "epoch": 3.1490035634156, "grad_norm": 29.7474422454834, "learning_rate": 7.251382159076544e-07, "loss": 0.0896, "num_input_tokens_seen": 11750144, "step": 23860 }, { "epoch": 3.1496634551933482, "grad_norm": 0.001195336226373911, "learning_rate": 7.246952913316977e-07, "loss": 0.1151, "num_input_tokens_seen": 11752704, "step": 23865 }, { "epoch": 3.1503233469710965, "grad_norm": 0.0011512299533933401, "learning_rate": 7.242524251871714e-07, "loss": 0.0766, "num_input_tokens_seen": 11755072, "step": 23870 }, { "epoch": 3.1509832387488452, "grad_norm": 0.0014371996512636542, "learning_rate": 7.238096175680714e-07, "loss": 0.0001, "num_input_tokens_seen": 11757504, "step": 23875 }, { "epoch": 3.1516431305265935, "grad_norm": 0.010653997771441936, "learning_rate": 7.233668685683798e-07, "loss": 0.0004, "num_input_tokens_seen": 11759744, "step": 23880 }, { "epoch": 3.1523030223043422, "grad_norm": 0.00024848911562003195, "learning_rate": 7.229241782820673e-07, "loss": 0.0907, "num_input_tokens_seen": 11762176, "step": 23885 }, { "epoch": 3.1529629140820905, "grad_norm": 0.004329715855419636, "learning_rate": 7.224815468030916e-07, "loss": 0.0, "num_input_tokens_seen": 11764672, "step": 23890 }, { "epoch": 3.153622805859839, "grad_norm": 0.012807437218725681, "learning_rate": 7.220389742253978e-07, "loss": 0.0, "num_input_tokens_seen": 11767168, "step": 23895 }, { "epoch": 3.1542826976375875, "grad_norm": 13.427555084228516, "learning_rate": 7.21596460642919e-07, "loss": 0.0025, "num_input_tokens_seen": 11769664, "step": 23900 }, { "epoch": 3.154942589415336, "grad_norm": 0.00018926402844954282, "learning_rate": 7.211540061495751e-07, "loss": 0.0, "num_input_tokens_seen": 11772352, "step": 23905 }, { "epoch": 3.1556024811930845, "grad_norm": 0.017940253019332886, "learning_rate": 7.207116108392746e-07, "loss": 0.0087, "num_input_tokens_seen": 11774656, "step": 23910 }, { "epoch": 3.156262372970833, "grad_norm": 0.12874840199947357, "learning_rate": 7.202692748059121e-07, "loss": 0.0001, "num_input_tokens_seen": 11776960, "step": 23915 }, { "epoch": 3.156922264748581, "grad_norm": 0.012035480700433254, "learning_rate": 7.1982699814337e-07, "loss": 0.061, "num_input_tokens_seen": 11779456, "step": 23920 }, { "epoch": 3.15758215652633, "grad_norm": 0.020318351686000824, "learning_rate": 7.193847809455192e-07, "loss": 0.0001, "num_input_tokens_seen": 11782016, "step": 23925 }, { "epoch": 3.158242048304078, "grad_norm": 0.00042746050166897476, "learning_rate": 7.189426233062161e-07, "loss": 0.0, "num_input_tokens_seen": 11784448, "step": 23930 }, { "epoch": 3.158901940081827, "grad_norm": 0.004594277124851942, "learning_rate": 7.185005253193064e-07, "loss": 0.0, "num_input_tokens_seen": 11786816, "step": 23935 }, { "epoch": 3.159561831859575, "grad_norm": 0.016789058223366737, "learning_rate": 7.180584870786217e-07, "loss": 0.0299, "num_input_tokens_seen": 11789120, "step": 23940 }, { "epoch": 3.1602217236373233, "grad_norm": 0.004520804155617952, "learning_rate": 7.17616508677981e-07, "loss": 0.0052, "num_input_tokens_seen": 11791872, "step": 23945 }, { "epoch": 3.160881615415072, "grad_norm": 0.0033924004528671503, "learning_rate": 7.171745902111919e-07, "loss": 0.0961, "num_input_tokens_seen": 11794560, "step": 23950 }, { "epoch": 3.1615415071928203, "grad_norm": 0.004584446549415588, "learning_rate": 7.167327317720479e-07, "loss": 0.0, "num_input_tokens_seen": 11797120, "step": 23955 }, { "epoch": 3.1622013989705686, "grad_norm": 0.08727633208036423, "learning_rate": 7.162909334543303e-07, "loss": 0.0001, "num_input_tokens_seen": 11799680, "step": 23960 }, { "epoch": 3.1628612907483173, "grad_norm": 0.011404680088162422, "learning_rate": 7.158491953518079e-07, "loss": 0.0, "num_input_tokens_seen": 11802048, "step": 23965 }, { "epoch": 3.1635211825260656, "grad_norm": 72.21094512939453, "learning_rate": 7.154075175582355e-07, "loss": 0.0431, "num_input_tokens_seen": 11804544, "step": 23970 }, { "epoch": 3.1641810743038143, "grad_norm": 0.002588431118056178, "learning_rate": 7.149659001673572e-07, "loss": 0.0, "num_input_tokens_seen": 11806976, "step": 23975 }, { "epoch": 3.1648409660815626, "grad_norm": 0.0015010889619588852, "learning_rate": 7.14524343272902e-07, "loss": 0.0021, "num_input_tokens_seen": 11809344, "step": 23980 }, { "epoch": 3.165500857859311, "grad_norm": 0.0020076995715498924, "learning_rate": 7.14082846968588e-07, "loss": 0.0001, "num_input_tokens_seen": 11811840, "step": 23985 }, { "epoch": 3.1661607496370596, "grad_norm": 0.0015450094360858202, "learning_rate": 7.136414113481191e-07, "loss": 0.0001, "num_input_tokens_seen": 11814208, "step": 23990 }, { "epoch": 3.166820641414808, "grad_norm": 0.0014535001246258616, "learning_rate": 7.132000365051873e-07, "loss": 0.0, "num_input_tokens_seen": 11816768, "step": 23995 }, { "epoch": 3.1674805331925566, "grad_norm": 0.005266868509352207, "learning_rate": 7.127587225334712e-07, "loss": 0.0002, "num_input_tokens_seen": 11819456, "step": 24000 }, { "epoch": 3.168140424970305, "grad_norm": 0.007542007602751255, "learning_rate": 7.123174695266354e-07, "loss": 0.0001, "num_input_tokens_seen": 11821760, "step": 24005 }, { "epoch": 3.168800316748053, "grad_norm": 0.012107683345675468, "learning_rate": 7.11876277578334e-07, "loss": 0.0396, "num_input_tokens_seen": 11824192, "step": 24010 }, { "epoch": 3.169460208525802, "grad_norm": 0.0022753942757844925, "learning_rate": 7.114351467822058e-07, "loss": 0.0, "num_input_tokens_seen": 11826688, "step": 24015 }, { "epoch": 3.17012010030355, "grad_norm": 0.0035900247748941183, "learning_rate": 7.109940772318787e-07, "loss": 0.0, "num_input_tokens_seen": 11828864, "step": 24020 }, { "epoch": 3.1707799920812985, "grad_norm": 0.0012713761534541845, "learning_rate": 7.105530690209656e-07, "loss": 0.0001, "num_input_tokens_seen": 11831168, "step": 24025 }, { "epoch": 3.171439883859047, "grad_norm": 0.021307138726115227, "learning_rate": 7.101121222430675e-07, "loss": 0.0626, "num_input_tokens_seen": 11834176, "step": 24030 }, { "epoch": 3.1720997756367955, "grad_norm": 0.01302417553961277, "learning_rate": 7.096712369917724e-07, "loss": 0.0, "num_input_tokens_seen": 11836288, "step": 24035 }, { "epoch": 3.172759667414544, "grad_norm": 0.0022673753555864096, "learning_rate": 7.092304133606544e-07, "loss": 0.0008, "num_input_tokens_seen": 11839040, "step": 24040 }, { "epoch": 3.1734195591922925, "grad_norm": 0.0007913524750620127, "learning_rate": 7.087896514432762e-07, "loss": 0.0, "num_input_tokens_seen": 11841280, "step": 24045 }, { "epoch": 3.1740794509700407, "grad_norm": 12.834890365600586, "learning_rate": 7.083489513331855e-07, "loss": 0.0511, "num_input_tokens_seen": 11843904, "step": 24050 }, { "epoch": 3.1747393427477895, "grad_norm": 72.53605651855469, "learning_rate": 7.079083131239177e-07, "loss": 0.0128, "num_input_tokens_seen": 11846336, "step": 24055 }, { "epoch": 3.1753992345255377, "grad_norm": 0.008593321777880192, "learning_rate": 7.074677369089955e-07, "loss": 0.0008, "num_input_tokens_seen": 11848576, "step": 24060 }, { "epoch": 3.1760591263032865, "grad_norm": 0.0005958180991001427, "learning_rate": 7.070272227819276e-07, "loss": 0.0009, "num_input_tokens_seen": 11850688, "step": 24065 }, { "epoch": 3.1767190180810347, "grad_norm": 0.005373831372708082, "learning_rate": 7.065867708362103e-07, "loss": 0.0, "num_input_tokens_seen": 11852992, "step": 24070 }, { "epoch": 3.177378909858783, "grad_norm": 0.12464771419763565, "learning_rate": 7.061463811653261e-07, "loss": 0.0001, "num_input_tokens_seen": 11855744, "step": 24075 }, { "epoch": 3.1780388016365317, "grad_norm": 0.011009292677044868, "learning_rate": 7.057060538627445e-07, "loss": 0.0001, "num_input_tokens_seen": 11858112, "step": 24080 }, { "epoch": 3.17869869341428, "grad_norm": 10.30748462677002, "learning_rate": 7.05265789021922e-07, "loss": 0.0366, "num_input_tokens_seen": 11860096, "step": 24085 }, { "epoch": 3.1793585851920287, "grad_norm": 0.0007246877066791058, "learning_rate": 7.048255867363014e-07, "loss": 0.0, "num_input_tokens_seen": 11862720, "step": 24090 }, { "epoch": 3.180018476969777, "grad_norm": 0.01777108572423458, "learning_rate": 7.043854470993125e-07, "loss": 0.0682, "num_input_tokens_seen": 11865088, "step": 24095 }, { "epoch": 3.1806783687475253, "grad_norm": 0.933526337146759, "learning_rate": 7.039453702043719e-07, "loss": 0.1339, "num_input_tokens_seen": 11867712, "step": 24100 }, { "epoch": 3.181338260525274, "grad_norm": 0.1766197234392166, "learning_rate": 7.035053561448825e-07, "loss": 0.0034, "num_input_tokens_seen": 11870272, "step": 24105 }, { "epoch": 3.1819981523030223, "grad_norm": 0.001022300566546619, "learning_rate": 7.030654050142341e-07, "loss": 0.0, "num_input_tokens_seen": 11872896, "step": 24110 }, { "epoch": 3.1826580440807706, "grad_norm": 0.004060187842696905, "learning_rate": 7.026255169058035e-07, "loss": 0.0706, "num_input_tokens_seen": 11875392, "step": 24115 }, { "epoch": 3.1833179358585193, "grad_norm": 0.016995996236801147, "learning_rate": 7.021856919129534e-07, "loss": 0.0114, "num_input_tokens_seen": 11877696, "step": 24120 }, { "epoch": 3.1839778276362676, "grad_norm": 0.020419931039214134, "learning_rate": 7.017459301290337e-07, "loss": 0.0308, "num_input_tokens_seen": 11880384, "step": 24125 }, { "epoch": 3.1846377194140163, "grad_norm": 0.0050578368827700615, "learning_rate": 7.013062316473803e-07, "loss": 0.0813, "num_input_tokens_seen": 11882944, "step": 24130 }, { "epoch": 3.1852976111917646, "grad_norm": 0.0038029979914426804, "learning_rate": 7.008665965613165e-07, "loss": 0.0, "num_input_tokens_seen": 11885440, "step": 24135 }, { "epoch": 3.185957502969513, "grad_norm": 0.07404931634664536, "learning_rate": 7.004270249641513e-07, "loss": 0.0001, "num_input_tokens_seen": 11887680, "step": 24140 }, { "epoch": 3.1866173947472616, "grad_norm": 0.0009840900311246514, "learning_rate": 6.999875169491808e-07, "loss": 0.0009, "num_input_tokens_seen": 11889984, "step": 24145 }, { "epoch": 3.18727728652501, "grad_norm": 0.02250547893345356, "learning_rate": 6.995480726096875e-07, "loss": 0.0, "num_input_tokens_seen": 11892224, "step": 24150 }, { "epoch": 3.187937178302758, "grad_norm": 40.72771453857422, "learning_rate": 6.991086920389395e-07, "loss": 0.0441, "num_input_tokens_seen": 11894656, "step": 24155 }, { "epoch": 3.188597070080507, "grad_norm": 0.051156964153051376, "learning_rate": 6.986693753301934e-07, "loss": 0.1136, "num_input_tokens_seen": 11897216, "step": 24160 }, { "epoch": 3.189256961858255, "grad_norm": 0.049732062965631485, "learning_rate": 6.982301225766897e-07, "loss": 0.0016, "num_input_tokens_seen": 11899712, "step": 24165 }, { "epoch": 3.189916853636004, "grad_norm": 0.003781419014558196, "learning_rate": 6.977909338716578e-07, "loss": 0.0239, "num_input_tokens_seen": 11902144, "step": 24170 }, { "epoch": 3.190576745413752, "grad_norm": 0.00045295763993635774, "learning_rate": 6.973518093083116e-07, "loss": 0.099, "num_input_tokens_seen": 11904640, "step": 24175 }, { "epoch": 3.1912366371915004, "grad_norm": 0.0009195672464556992, "learning_rate": 6.969127489798519e-07, "loss": 0.0008, "num_input_tokens_seen": 11907136, "step": 24180 }, { "epoch": 3.191896528969249, "grad_norm": 0.0020728004164993763, "learning_rate": 6.964737529794669e-07, "loss": 0.0013, "num_input_tokens_seen": 11909696, "step": 24185 }, { "epoch": 3.1925564207469974, "grad_norm": 0.03599919006228447, "learning_rate": 6.960348214003294e-07, "loss": 0.0, "num_input_tokens_seen": 11912064, "step": 24190 }, { "epoch": 3.193216312524746, "grad_norm": 0.0029828757978975773, "learning_rate": 6.955959543356005e-07, "loss": 0.0, "num_input_tokens_seen": 11914368, "step": 24195 }, { "epoch": 3.1938762043024944, "grad_norm": 0.0022346843034029007, "learning_rate": 6.951571518784257e-07, "loss": 0.0001, "num_input_tokens_seen": 11916736, "step": 24200 }, { "epoch": 3.1945360960802427, "grad_norm": 0.010649897158145905, "learning_rate": 6.947184141219378e-07, "loss": 0.0, "num_input_tokens_seen": 11918912, "step": 24205 }, { "epoch": 3.1951959878579914, "grad_norm": 0.000551412464119494, "learning_rate": 6.94279741159256e-07, "loss": 0.0, "num_input_tokens_seen": 11921152, "step": 24210 }, { "epoch": 3.1958558796357397, "grad_norm": 0.0012428689515218139, "learning_rate": 6.93841133083485e-07, "loss": 0.0001, "num_input_tokens_seen": 11923392, "step": 24215 }, { "epoch": 3.1965157714134884, "grad_norm": 24.994014739990234, "learning_rate": 6.934025899877167e-07, "loss": 0.0738, "num_input_tokens_seen": 11925952, "step": 24220 }, { "epoch": 3.1971756631912367, "grad_norm": 0.005721225868910551, "learning_rate": 6.929641119650286e-07, "loss": 0.0, "num_input_tokens_seen": 11928576, "step": 24225 }, { "epoch": 3.197835554968985, "grad_norm": 0.07588429003953934, "learning_rate": 6.92525699108484e-07, "loss": 0.0002, "num_input_tokens_seen": 11930880, "step": 24230 }, { "epoch": 3.1984954467467337, "grad_norm": 0.21324551105499268, "learning_rate": 6.920873515111336e-07, "loss": 0.0001, "num_input_tokens_seen": 11933312, "step": 24235 }, { "epoch": 3.199155338524482, "grad_norm": 0.000538547697942704, "learning_rate": 6.916490692660127e-07, "loss": 0.0142, "num_input_tokens_seen": 11935936, "step": 24240 }, { "epoch": 3.1998152303022303, "grad_norm": 0.0019838111475110054, "learning_rate": 6.912108524661443e-07, "loss": 0.043, "num_input_tokens_seen": 11938048, "step": 24245 }, { "epoch": 3.200475122079979, "grad_norm": 0.0037101011257618666, "learning_rate": 6.907727012045363e-07, "loss": 0.0, "num_input_tokens_seen": 11940480, "step": 24250 }, { "epoch": 3.2011350138577273, "grad_norm": 0.0017553389770910144, "learning_rate": 6.903346155741831e-07, "loss": 0.0372, "num_input_tokens_seen": 11942848, "step": 24255 }, { "epoch": 3.201794905635476, "grad_norm": 0.008613619953393936, "learning_rate": 6.898965956680655e-07, "loss": 0.0, "num_input_tokens_seen": 11945280, "step": 24260 }, { "epoch": 3.2024547974132243, "grad_norm": 0.029970765113830566, "learning_rate": 6.894586415791497e-07, "loss": 0.0002, "num_input_tokens_seen": 11948032, "step": 24265 }, { "epoch": 3.2031146891909725, "grad_norm": 0.0053716376423835754, "learning_rate": 6.890207534003884e-07, "loss": 0.0007, "num_input_tokens_seen": 11950464, "step": 24270 }, { "epoch": 3.2037745809687213, "grad_norm": 0.0050010415725409985, "learning_rate": 6.885829312247207e-07, "loss": 0.0, "num_input_tokens_seen": 11953216, "step": 24275 }, { "epoch": 3.2044344727464695, "grad_norm": 0.006434239447116852, "learning_rate": 6.881451751450702e-07, "loss": 0.0, "num_input_tokens_seen": 11955520, "step": 24280 }, { "epoch": 3.205094364524218, "grad_norm": 0.0001354543346678838, "learning_rate": 6.877074852543483e-07, "loss": 0.0308, "num_input_tokens_seen": 11958016, "step": 24285 }, { "epoch": 3.2057542563019665, "grad_norm": 0.00035205119638703763, "learning_rate": 6.872698616454511e-07, "loss": 0.0, "num_input_tokens_seen": 11960512, "step": 24290 }, { "epoch": 3.206414148079715, "grad_norm": 0.23281724750995636, "learning_rate": 6.868323044112612e-07, "loss": 0.0282, "num_input_tokens_seen": 11962944, "step": 24295 }, { "epoch": 3.2070740398574635, "grad_norm": 0.0029322770424187183, "learning_rate": 6.863948136446468e-07, "loss": 0.0, "num_input_tokens_seen": 11965632, "step": 24300 }, { "epoch": 3.207733931635212, "grad_norm": 0.00021965963242109865, "learning_rate": 6.859573894384625e-07, "loss": 0.075, "num_input_tokens_seen": 11967872, "step": 24305 }, { "epoch": 3.20839382341296, "grad_norm": 0.0003221237566322088, "learning_rate": 6.855200318855483e-07, "loss": 0.0236, "num_input_tokens_seen": 11970048, "step": 24310 }, { "epoch": 3.209053715190709, "grad_norm": 0.2113918662071228, "learning_rate": 6.850827410787295e-07, "loss": 0.0001, "num_input_tokens_seen": 11972352, "step": 24315 }, { "epoch": 3.209713606968457, "grad_norm": 0.05100065469741821, "learning_rate": 6.846455171108187e-07, "loss": 0.0001, "num_input_tokens_seen": 11974784, "step": 24320 }, { "epoch": 3.210373498746206, "grad_norm": 0.011089351028203964, "learning_rate": 6.842083600746131e-07, "loss": 0.0, "num_input_tokens_seen": 11977344, "step": 24325 }, { "epoch": 3.211033390523954, "grad_norm": 0.0005579411517828703, "learning_rate": 6.837712700628967e-07, "loss": 0.0, "num_input_tokens_seen": 11979904, "step": 24330 }, { "epoch": 3.2116932823017024, "grad_norm": 0.0014182161539793015, "learning_rate": 6.833342471684383e-07, "loss": 0.0001, "num_input_tokens_seen": 11982208, "step": 24335 }, { "epoch": 3.212353174079451, "grad_norm": 0.0018914632964879274, "learning_rate": 6.828972914839924e-07, "loss": 0.0238, "num_input_tokens_seen": 11984832, "step": 24340 }, { "epoch": 3.2130130658571994, "grad_norm": 0.002741429954767227, "learning_rate": 6.824604031023005e-07, "loss": 0.0863, "num_input_tokens_seen": 11987392, "step": 24345 }, { "epoch": 3.213672957634948, "grad_norm": 0.5368107557296753, "learning_rate": 6.820235821160881e-07, "loss": 0.0002, "num_input_tokens_seen": 11989632, "step": 24350 }, { "epoch": 3.2143328494126964, "grad_norm": 0.01114723365753889, "learning_rate": 6.815868286180683e-07, "loss": 0.0001, "num_input_tokens_seen": 11992064, "step": 24355 }, { "epoch": 3.2149927411904446, "grad_norm": 0.052227143198251724, "learning_rate": 6.811501427009383e-07, "loss": 0.001, "num_input_tokens_seen": 11994688, "step": 24360 }, { "epoch": 3.2156526329681934, "grad_norm": 0.002823041984811425, "learning_rate": 6.807135244573814e-07, "loss": 0.0003, "num_input_tokens_seen": 11997120, "step": 24365 }, { "epoch": 3.2163125247459416, "grad_norm": 0.014946339651942253, "learning_rate": 6.802769739800669e-07, "loss": 0.0005, "num_input_tokens_seen": 11999616, "step": 24370 }, { "epoch": 3.21697241652369, "grad_norm": 0.008161872625350952, "learning_rate": 6.798404913616491e-07, "loss": 0.0, "num_input_tokens_seen": 12002176, "step": 24375 }, { "epoch": 3.2176323083014386, "grad_norm": 0.0014490735484287143, "learning_rate": 6.794040766947693e-07, "loss": 0.0487, "num_input_tokens_seen": 12004608, "step": 24380 }, { "epoch": 3.218292200079187, "grad_norm": 0.0007983733667060733, "learning_rate": 6.789677300720522e-07, "loss": 0.0, "num_input_tokens_seen": 12007104, "step": 24385 }, { "epoch": 3.2189520918569356, "grad_norm": 0.20858755707740784, "learning_rate": 6.785314515861096e-07, "loss": 0.0, "num_input_tokens_seen": 12009664, "step": 24390 }, { "epoch": 3.219611983634684, "grad_norm": 0.012098542414605618, "learning_rate": 6.780952413295387e-07, "loss": 0.0, "num_input_tokens_seen": 12012032, "step": 24395 }, { "epoch": 3.220271875412432, "grad_norm": 0.019903438165783882, "learning_rate": 6.776590993949217e-07, "loss": 0.0001, "num_input_tokens_seen": 12014208, "step": 24400 }, { "epoch": 3.220931767190181, "grad_norm": 0.5107465982437134, "learning_rate": 6.772230258748266e-07, "loss": 0.0004, "num_input_tokens_seen": 12016576, "step": 24405 }, { "epoch": 3.221591658967929, "grad_norm": 2.904726505279541, "learning_rate": 6.767870208618071e-07, "loss": 0.0006, "num_input_tokens_seen": 12019264, "step": 24410 }, { "epoch": 3.2222515507456775, "grad_norm": 5.2059836889384314e-05, "learning_rate": 6.763510844484015e-07, "loss": 0.0213, "num_input_tokens_seen": 12021632, "step": 24415 }, { "epoch": 3.222911442523426, "grad_norm": 0.01229359395802021, "learning_rate": 6.759152167271349e-07, "loss": 0.0283, "num_input_tokens_seen": 12024000, "step": 24420 }, { "epoch": 3.2235713343011745, "grad_norm": 0.0005444117123261094, "learning_rate": 6.754794177905165e-07, "loss": 0.0, "num_input_tokens_seen": 12026432, "step": 24425 }, { "epoch": 3.224231226078923, "grad_norm": 0.0019035936566069722, "learning_rate": 6.750436877310418e-07, "loss": 0.0, "num_input_tokens_seen": 12028672, "step": 24430 }, { "epoch": 3.2248911178566715, "grad_norm": 0.3872932195663452, "learning_rate": 6.746080266411913e-07, "loss": 0.0004, "num_input_tokens_seen": 12031488, "step": 24435 }, { "epoch": 3.2255510096344198, "grad_norm": 0.12454172223806381, "learning_rate": 6.741724346134306e-07, "loss": 0.0001, "num_input_tokens_seen": 12033920, "step": 24440 }, { "epoch": 3.2262109014121685, "grad_norm": 0.00664086127653718, "learning_rate": 6.737369117402114e-07, "loss": 0.0044, "num_input_tokens_seen": 12036224, "step": 24445 }, { "epoch": 3.2268707931899168, "grad_norm": 11.390558242797852, "learning_rate": 6.733014581139699e-07, "loss": 0.115, "num_input_tokens_seen": 12038528, "step": 24450 }, { "epoch": 3.2275306849676655, "grad_norm": 0.000371633970644325, "learning_rate": 6.728660738271283e-07, "loss": 0.0, "num_input_tokens_seen": 12040896, "step": 24455 }, { "epoch": 3.2281905767454138, "grad_norm": 0.0016459020553156734, "learning_rate": 6.724307589720936e-07, "loss": 0.0, "num_input_tokens_seen": 12043008, "step": 24460 }, { "epoch": 3.228850468523162, "grad_norm": 0.9386215806007385, "learning_rate": 6.719955136412582e-07, "loss": 0.0023, "num_input_tokens_seen": 12045504, "step": 24465 }, { "epoch": 3.2295103603009108, "grad_norm": 0.00015591199917253107, "learning_rate": 6.715603379269998e-07, "loss": 0.0, "num_input_tokens_seen": 12047808, "step": 24470 }, { "epoch": 3.230170252078659, "grad_norm": 0.0026150809135288, "learning_rate": 6.711252319216814e-07, "loss": 0.0338, "num_input_tokens_seen": 12050496, "step": 24475 }, { "epoch": 3.2308301438564078, "grad_norm": 0.04011273384094238, "learning_rate": 6.70690195717651e-07, "loss": 0.0, "num_input_tokens_seen": 12052864, "step": 24480 }, { "epoch": 3.231490035634156, "grad_norm": 0.0008673505508340895, "learning_rate": 6.70255229407242e-07, "loss": 0.0, "num_input_tokens_seen": 12055040, "step": 24485 }, { "epoch": 3.2321499274119043, "grad_norm": 0.00220724125392735, "learning_rate": 6.698203330827722e-07, "loss": 0.0, "num_input_tokens_seen": 12057664, "step": 24490 }, { "epoch": 3.232809819189653, "grad_norm": 0.0067215790040791035, "learning_rate": 6.693855068365464e-07, "loss": 0.1253, "num_input_tokens_seen": 12059776, "step": 24495 }, { "epoch": 3.2334697109674013, "grad_norm": 0.009078079834580421, "learning_rate": 6.689507507608518e-07, "loss": 0.0, "num_input_tokens_seen": 12062336, "step": 24500 }, { "epoch": 3.2341296027451496, "grad_norm": 0.0011320833582431078, "learning_rate": 6.685160649479638e-07, "loss": 0.0, "num_input_tokens_seen": 12064512, "step": 24505 }, { "epoch": 3.2347894945228983, "grad_norm": 6.545926589751616e-05, "learning_rate": 6.680814494901406e-07, "loss": 0.0, "num_input_tokens_seen": 12067072, "step": 24510 }, { "epoch": 3.2354493863006466, "grad_norm": 0.10730508714914322, "learning_rate": 6.676469044796258e-07, "loss": 0.0, "num_input_tokens_seen": 12069376, "step": 24515 }, { "epoch": 3.2361092780783953, "grad_norm": 0.005647291895002127, "learning_rate": 6.672124300086492e-07, "loss": 0.0001, "num_input_tokens_seen": 12071872, "step": 24520 }, { "epoch": 3.2367691698561436, "grad_norm": 0.0025732112117111683, "learning_rate": 6.667780261694239e-07, "loss": 0.0548, "num_input_tokens_seen": 12074432, "step": 24525 }, { "epoch": 3.237429061633892, "grad_norm": 0.00010753106471383944, "learning_rate": 6.663436930541502e-07, "loss": 0.0, "num_input_tokens_seen": 12076672, "step": 24530 }, { "epoch": 3.2380889534116406, "grad_norm": 0.012851386331021786, "learning_rate": 6.659094307550112e-07, "loss": 0.0, "num_input_tokens_seen": 12079168, "step": 24535 }, { "epoch": 3.238748845189389, "grad_norm": 0.0013802021276205778, "learning_rate": 6.654752393641763e-07, "loss": 0.0, "num_input_tokens_seen": 12081728, "step": 24540 }, { "epoch": 3.239408736967137, "grad_norm": 0.006160231772810221, "learning_rate": 6.650411189737993e-07, "loss": 0.0, "num_input_tokens_seen": 12084160, "step": 24545 }, { "epoch": 3.240068628744886, "grad_norm": 0.0014017027569934726, "learning_rate": 6.646070696760192e-07, "loss": 0.028, "num_input_tokens_seen": 12086656, "step": 24550 }, { "epoch": 3.240728520522634, "grad_norm": 0.0003748885355889797, "learning_rate": 6.6417309156296e-07, "loss": 0.0, "num_input_tokens_seen": 12089024, "step": 24555 }, { "epoch": 3.241388412300383, "grad_norm": 0.001053887652233243, "learning_rate": 6.637391847267302e-07, "loss": 0.0, "num_input_tokens_seen": 12091456, "step": 24560 }, { "epoch": 3.242048304078131, "grad_norm": 0.014502989128232002, "learning_rate": 6.633053492594232e-07, "loss": 0.0, "num_input_tokens_seen": 12094016, "step": 24565 }, { "epoch": 3.2427081958558794, "grad_norm": 0.7106849551200867, "learning_rate": 6.628715852531179e-07, "loss": 0.0008, "num_input_tokens_seen": 12096448, "step": 24570 }, { "epoch": 3.243368087633628, "grad_norm": 0.005429383832961321, "learning_rate": 6.624378927998773e-07, "loss": 0.0, "num_input_tokens_seen": 12099008, "step": 24575 }, { "epoch": 3.2440279794113764, "grad_norm": 11.243054389953613, "learning_rate": 6.620042719917495e-07, "loss": 0.0898, "num_input_tokens_seen": 12101248, "step": 24580 }, { "epoch": 3.244687871189125, "grad_norm": 0.0023134404327720404, "learning_rate": 6.615707229207674e-07, "loss": 0.0, "num_input_tokens_seen": 12103744, "step": 24585 }, { "epoch": 3.2453477629668734, "grad_norm": 0.016285087913274765, "learning_rate": 6.611372456789486e-07, "loss": 0.0, "num_input_tokens_seen": 12106496, "step": 24590 }, { "epoch": 3.2460076547446217, "grad_norm": 0.003460107371211052, "learning_rate": 6.607038403582956e-07, "loss": 0.0004, "num_input_tokens_seen": 12108928, "step": 24595 }, { "epoch": 3.2466675465223704, "grad_norm": 0.010175548493862152, "learning_rate": 6.602705070507954e-07, "loss": 0.0, "num_input_tokens_seen": 12111488, "step": 24600 }, { "epoch": 3.2473274383001187, "grad_norm": 0.013270308263599873, "learning_rate": 6.598372458484202e-07, "loss": 0.0, "num_input_tokens_seen": 12114112, "step": 24605 }, { "epoch": 3.2479873300778674, "grad_norm": 13.476341247558594, "learning_rate": 6.594040568431262e-07, "loss": 0.0266, "num_input_tokens_seen": 12116352, "step": 24610 }, { "epoch": 3.2486472218556157, "grad_norm": 0.004440042190253735, "learning_rate": 6.589709401268546e-07, "loss": 0.0909, "num_input_tokens_seen": 12118976, "step": 24615 }, { "epoch": 3.249307113633364, "grad_norm": 0.0023096001241356134, "learning_rate": 6.585378957915315e-07, "loss": 0.0822, "num_input_tokens_seen": 12121216, "step": 24620 }, { "epoch": 3.2499670054111127, "grad_norm": 0.004000347573310137, "learning_rate": 6.581049239290672e-07, "loss": 0.0, "num_input_tokens_seen": 12123712, "step": 24625 }, { "epoch": 3.250626897188861, "grad_norm": 0.0027472316287457943, "learning_rate": 6.576720246313572e-07, "loss": 0.0, "num_input_tokens_seen": 12126016, "step": 24630 }, { "epoch": 3.2512867889666097, "grad_norm": 62.97257995605469, "learning_rate": 6.57239197990281e-07, "loss": 0.0352, "num_input_tokens_seen": 12128448, "step": 24635 }, { "epoch": 3.2512867889666097, "eval_loss": 0.18092882633209229, "eval_runtime": 7.879, "eval_samples_per_second": 854.8, "eval_steps_per_second": 106.866, "num_input_tokens_seen": 12128448, "step": 24635 }, { "epoch": 3.251946680744358, "grad_norm": 0.020524688065052032, "learning_rate": 6.568064440977028e-07, "loss": 0.0434, "num_input_tokens_seen": 12130880, "step": 24640 }, { "epoch": 3.2526065725221063, "grad_norm": 0.004393074195832014, "learning_rate": 6.563737630454719e-07, "loss": 0.0001, "num_input_tokens_seen": 12133248, "step": 24645 }, { "epoch": 3.253266464299855, "grad_norm": 0.0023800248745828867, "learning_rate": 6.559411549254211e-07, "loss": 0.0, "num_input_tokens_seen": 12135488, "step": 24650 }, { "epoch": 3.2539263560776033, "grad_norm": 0.5170552730560303, "learning_rate": 6.55508619829369e-07, "loss": 0.0661, "num_input_tokens_seen": 12137920, "step": 24655 }, { "epoch": 3.2545862478553516, "grad_norm": 0.006337509024888277, "learning_rate": 6.550761578491175e-07, "loss": 0.0001, "num_input_tokens_seen": 12140416, "step": 24660 }, { "epoch": 3.2552461396331003, "grad_norm": 0.18528732657432556, "learning_rate": 6.546437690764539e-07, "loss": 0.0338, "num_input_tokens_seen": 12143040, "step": 24665 }, { "epoch": 3.2559060314108486, "grad_norm": 0.002924225991591811, "learning_rate": 6.542114536031498e-07, "loss": 0.0002, "num_input_tokens_seen": 12145280, "step": 24670 }, { "epoch": 3.256565923188597, "grad_norm": 0.019116446375846863, "learning_rate": 6.537792115209599e-07, "loss": 0.0611, "num_input_tokens_seen": 12147776, "step": 24675 }, { "epoch": 3.2572258149663456, "grad_norm": 0.001296358066610992, "learning_rate": 6.533470429216258e-07, "loss": 0.0, "num_input_tokens_seen": 12150272, "step": 24680 }, { "epoch": 3.257885706744094, "grad_norm": 0.006655433680862188, "learning_rate": 6.529149478968709e-07, "loss": 0.0004, "num_input_tokens_seen": 12152768, "step": 24685 }, { "epoch": 3.2585455985218426, "grad_norm": 0.014014780521392822, "learning_rate": 6.524829265384058e-07, "loss": 0.0018, "num_input_tokens_seen": 12155072, "step": 24690 }, { "epoch": 3.259205490299591, "grad_norm": 0.6132182478904724, "learning_rate": 6.520509789379227e-07, "loss": 0.0355, "num_input_tokens_seen": 12157376, "step": 24695 }, { "epoch": 3.259865382077339, "grad_norm": 0.013002334162592888, "learning_rate": 6.516191051870992e-07, "loss": 0.0019, "num_input_tokens_seen": 12159616, "step": 24700 }, { "epoch": 3.260525273855088, "grad_norm": 0.0011766665847972035, "learning_rate": 6.511873053775985e-07, "loss": 0.0296, "num_input_tokens_seen": 12161920, "step": 24705 }, { "epoch": 3.261185165632836, "grad_norm": 0.0003221951483283192, "learning_rate": 6.507555796010658e-07, "loss": 0.0564, "num_input_tokens_seen": 12164160, "step": 24710 }, { "epoch": 3.261845057410585, "grad_norm": 0.012531314045190811, "learning_rate": 6.503239279491328e-07, "loss": 0.0615, "num_input_tokens_seen": 12166464, "step": 24715 }, { "epoch": 3.262504949188333, "grad_norm": 0.44118887186050415, "learning_rate": 6.498923505134138e-07, "loss": 0.0311, "num_input_tokens_seen": 12168960, "step": 24720 }, { "epoch": 3.2631648409660814, "grad_norm": 0.024108566343784332, "learning_rate": 6.494608473855079e-07, "loss": 0.0202, "num_input_tokens_seen": 12171648, "step": 24725 }, { "epoch": 3.26382473274383, "grad_norm": 0.0002736593596637249, "learning_rate": 6.490294186569989e-07, "loss": 0.0, "num_input_tokens_seen": 12174400, "step": 24730 }, { "epoch": 3.2644846245215784, "grad_norm": 188.8216094970703, "learning_rate": 6.485980644194541e-07, "loss": 0.0045, "num_input_tokens_seen": 12176704, "step": 24735 }, { "epoch": 3.265144516299327, "grad_norm": 18.44172477722168, "learning_rate": 6.481667847644256e-07, "loss": 0.0608, "num_input_tokens_seen": 12179008, "step": 24740 }, { "epoch": 3.2658044080770754, "grad_norm": 0.003776587313041091, "learning_rate": 6.477355797834494e-07, "loss": 0.0, "num_input_tokens_seen": 12181632, "step": 24745 }, { "epoch": 3.2664642998548237, "grad_norm": 0.0007478050538338721, "learning_rate": 6.473044495680451e-07, "loss": 0.0, "num_input_tokens_seen": 12184000, "step": 24750 }, { "epoch": 3.2671241916325724, "grad_norm": 0.015562565997242928, "learning_rate": 6.468733942097178e-07, "loss": 0.0241, "num_input_tokens_seen": 12186368, "step": 24755 }, { "epoch": 3.2677840834103207, "grad_norm": 0.0008789593121036887, "learning_rate": 6.464424137999551e-07, "loss": 0.0, "num_input_tokens_seen": 12188672, "step": 24760 }, { "epoch": 3.2684439751880694, "grad_norm": 0.06179773807525635, "learning_rate": 6.4601150843023e-07, "loss": 0.0017, "num_input_tokens_seen": 12190784, "step": 24765 }, { "epoch": 3.2691038669658177, "grad_norm": 0.003880223724991083, "learning_rate": 6.455806781919988e-07, "loss": 0.0001, "num_input_tokens_seen": 12193216, "step": 24770 }, { "epoch": 3.269763758743566, "grad_norm": 0.0023615926038473845, "learning_rate": 6.451499231767021e-07, "loss": 0.0, "num_input_tokens_seen": 12195712, "step": 24775 }, { "epoch": 3.2704236505213147, "grad_norm": 2.087416410446167, "learning_rate": 6.447192434757647e-07, "loss": 0.0023, "num_input_tokens_seen": 12198016, "step": 24780 }, { "epoch": 3.271083542299063, "grad_norm": 0.00017697991279419512, "learning_rate": 6.442886391805948e-07, "loss": 0.0, "num_input_tokens_seen": 12200512, "step": 24785 }, { "epoch": 3.2717434340768112, "grad_norm": 0.00011539664410520345, "learning_rate": 6.438581103825858e-07, "loss": 0.0002, "num_input_tokens_seen": 12203200, "step": 24790 }, { "epoch": 3.27240332585456, "grad_norm": 0.0006121984915807843, "learning_rate": 6.434276571731139e-07, "loss": 0.0, "num_input_tokens_seen": 12205888, "step": 24795 }, { "epoch": 3.2730632176323082, "grad_norm": 0.0016998574137687683, "learning_rate": 6.429972796435392e-07, "loss": 0.0006, "num_input_tokens_seen": 12208448, "step": 24800 }, { "epoch": 3.2737231094100565, "grad_norm": 0.0017284862697124481, "learning_rate": 6.425669778852072e-07, "loss": 0.0005, "num_input_tokens_seen": 12210816, "step": 24805 }, { "epoch": 3.2743830011878052, "grad_norm": 0.0031804365571588278, "learning_rate": 6.421367519894454e-07, "loss": 0.0, "num_input_tokens_seen": 12213376, "step": 24810 }, { "epoch": 3.2750428929655535, "grad_norm": 0.01726776920258999, "learning_rate": 6.417066020475669e-07, "loss": 0.0019, "num_input_tokens_seen": 12216128, "step": 24815 }, { "epoch": 3.2757027847433022, "grad_norm": 0.00042145411134697497, "learning_rate": 6.412765281508677e-07, "loss": 0.0002, "num_input_tokens_seen": 12218432, "step": 24820 }, { "epoch": 3.2763626765210505, "grad_norm": 0.00022174572222866118, "learning_rate": 6.408465303906271e-07, "loss": 0.0, "num_input_tokens_seen": 12221312, "step": 24825 }, { "epoch": 3.277022568298799, "grad_norm": 0.009224276058375835, "learning_rate": 6.404166088581102e-07, "loss": 0.0, "num_input_tokens_seen": 12223680, "step": 24830 }, { "epoch": 3.2776824600765475, "grad_norm": 0.00777504313737154, "learning_rate": 6.399867636445637e-07, "loss": 0.0487, "num_input_tokens_seen": 12226368, "step": 24835 }, { "epoch": 3.278342351854296, "grad_norm": 3.733914491022006e-05, "learning_rate": 6.395569948412198e-07, "loss": 0.0002, "num_input_tokens_seen": 12228736, "step": 24840 }, { "epoch": 3.2790022436320445, "grad_norm": 1.3131993000570219e-05, "learning_rate": 6.39127302539294e-07, "loss": 0.0559, "num_input_tokens_seen": 12231360, "step": 24845 }, { "epoch": 3.279662135409793, "grad_norm": 69.62069702148438, "learning_rate": 6.386976868299844e-07, "loss": 0.2746, "num_input_tokens_seen": 12234176, "step": 24850 }, { "epoch": 3.280322027187541, "grad_norm": 0.00021059955179225653, "learning_rate": 6.382681478044749e-07, "loss": 0.0001, "num_input_tokens_seen": 12236544, "step": 24855 }, { "epoch": 3.28098191896529, "grad_norm": 0.0011798146879300475, "learning_rate": 6.378386855539311e-07, "loss": 0.0001, "num_input_tokens_seen": 12239040, "step": 24860 }, { "epoch": 3.281641810743038, "grad_norm": 0.05896616727113724, "learning_rate": 6.374093001695042e-07, "loss": 0.08, "num_input_tokens_seen": 12241408, "step": 24865 }, { "epoch": 3.282301702520787, "grad_norm": 0.00966053456068039, "learning_rate": 6.369799917423277e-07, "loss": 0.0337, "num_input_tokens_seen": 12243840, "step": 24870 }, { "epoch": 3.282961594298535, "grad_norm": 0.006455364637076855, "learning_rate": 6.365507603635188e-07, "loss": 0.0001, "num_input_tokens_seen": 12246016, "step": 24875 }, { "epoch": 3.2836214860762833, "grad_norm": 0.0004605850263033062, "learning_rate": 6.361216061241792e-07, "loss": 0.0006, "num_input_tokens_seen": 12248320, "step": 24880 }, { "epoch": 3.284281377854032, "grad_norm": 1.3851643800735474, "learning_rate": 6.356925291153936e-07, "loss": 0.0292, "num_input_tokens_seen": 12251072, "step": 24885 }, { "epoch": 3.2849412696317803, "grad_norm": 0.0008965849410742521, "learning_rate": 6.352635294282309e-07, "loss": 0.0011, "num_input_tokens_seen": 12253632, "step": 24890 }, { "epoch": 3.285601161409529, "grad_norm": 0.0017181680304929614, "learning_rate": 6.348346071537427e-07, "loss": 0.0001, "num_input_tokens_seen": 12256128, "step": 24895 }, { "epoch": 3.2862610531872773, "grad_norm": 0.0009574389550834894, "learning_rate": 6.344057623829648e-07, "loss": 0.0, "num_input_tokens_seen": 12258688, "step": 24900 }, { "epoch": 3.2869209449650256, "grad_norm": 0.00013130882871337235, "learning_rate": 6.339769952069165e-07, "loss": 0.0, "num_input_tokens_seen": 12261312, "step": 24905 }, { "epoch": 3.2875808367427743, "grad_norm": 0.003466361900791526, "learning_rate": 6.335483057166002e-07, "loss": 0.0, "num_input_tokens_seen": 12263616, "step": 24910 }, { "epoch": 3.2882407285205226, "grad_norm": 0.0018649350386112928, "learning_rate": 6.331196940030026e-07, "loss": 0.0, "num_input_tokens_seen": 12266304, "step": 24915 }, { "epoch": 3.288900620298271, "grad_norm": 0.0006871359655633569, "learning_rate": 6.326911601570933e-07, "loss": 0.0, "num_input_tokens_seen": 12268608, "step": 24920 }, { "epoch": 3.2895605120760196, "grad_norm": 9.006281470647082e-05, "learning_rate": 6.322627042698251e-07, "loss": 0.0, "num_input_tokens_seen": 12271296, "step": 24925 }, { "epoch": 3.290220403853768, "grad_norm": 0.019566912204027176, "learning_rate": 6.318343264321352e-07, "loss": 0.0, "num_input_tokens_seen": 12273664, "step": 24930 }, { "epoch": 3.290880295631516, "grad_norm": 0.01741475611925125, "learning_rate": 6.314060267349432e-07, "loss": 0.0, "num_input_tokens_seen": 12276224, "step": 24935 }, { "epoch": 3.291540187409265, "grad_norm": 0.0002055577642749995, "learning_rate": 6.309778052691532e-07, "loss": 0.0551, "num_input_tokens_seen": 12278656, "step": 24940 }, { "epoch": 3.292200079187013, "grad_norm": 0.00037039650487713516, "learning_rate": 6.305496621256516e-07, "loss": 0.0, "num_input_tokens_seen": 12280960, "step": 24945 }, { "epoch": 3.292859970964762, "grad_norm": 0.0003575266164261848, "learning_rate": 6.30121597395309e-07, "loss": 0.0, "num_input_tokens_seen": 12283776, "step": 24950 }, { "epoch": 3.29351986274251, "grad_norm": 0.00045907212188467383, "learning_rate": 6.296936111689789e-07, "loss": 0.0, "num_input_tokens_seen": 12285952, "step": 24955 }, { "epoch": 3.2941797545202585, "grad_norm": 0.01350698247551918, "learning_rate": 6.292657035374981e-07, "loss": 0.0, "num_input_tokens_seen": 12288576, "step": 24960 }, { "epoch": 3.294839646298007, "grad_norm": 0.0007901710923761129, "learning_rate": 6.288378745916873e-07, "loss": 0.028, "num_input_tokens_seen": 12291200, "step": 24965 }, { "epoch": 3.2954995380757555, "grad_norm": 0.000196945431525819, "learning_rate": 6.284101244223497e-07, "loss": 0.0, "num_input_tokens_seen": 12293568, "step": 24970 }, { "epoch": 3.296159429853504, "grad_norm": 0.000786519784014672, "learning_rate": 6.279824531202725e-07, "loss": 0.0001, "num_input_tokens_seen": 12296064, "step": 24975 }, { "epoch": 3.2968193216312525, "grad_norm": 0.0034475887659937143, "learning_rate": 6.275548607762255e-07, "loss": 0.0266, "num_input_tokens_seen": 12298688, "step": 24980 }, { "epoch": 3.2974792134090007, "grad_norm": 0.004421170800924301, "learning_rate": 6.271273474809624e-07, "loss": 0.0001, "num_input_tokens_seen": 12300992, "step": 24985 }, { "epoch": 3.2981391051867495, "grad_norm": 0.031272463500499725, "learning_rate": 6.266999133252196e-07, "loss": 0.0, "num_input_tokens_seen": 12303680, "step": 24990 }, { "epoch": 3.2987989969644977, "grad_norm": 6.787193706259131e-05, "learning_rate": 6.262725583997169e-07, "loss": 0.0, "num_input_tokens_seen": 12305728, "step": 24995 }, { "epoch": 3.2994588887422465, "grad_norm": 4.479756535147317e-05, "learning_rate": 6.258452827951576e-07, "loss": 0.0, "num_input_tokens_seen": 12308096, "step": 25000 }, { "epoch": 3.3001187805199947, "grad_norm": 0.001354401232674718, "learning_rate": 6.254180866022278e-07, "loss": 0.0004, "num_input_tokens_seen": 12310656, "step": 25005 }, { "epoch": 3.300778672297743, "grad_norm": 8.70009753271006e-05, "learning_rate": 6.249909699115958e-07, "loss": 0.0009, "num_input_tokens_seen": 12313600, "step": 25010 }, { "epoch": 3.3014385640754917, "grad_norm": 0.003904482815414667, "learning_rate": 6.245639328139156e-07, "loss": 0.0266, "num_input_tokens_seen": 12315840, "step": 25015 }, { "epoch": 3.30209845585324, "grad_norm": 0.0015482102753594518, "learning_rate": 6.241369753998213e-07, "loss": 0.0, "num_input_tokens_seen": 12318784, "step": 25020 }, { "epoch": 3.3027583476309887, "grad_norm": 0.00014941871631890535, "learning_rate": 6.23710097759933e-07, "loss": 0.0, "num_input_tokens_seen": 12321152, "step": 25025 }, { "epoch": 3.303418239408737, "grad_norm": 0.0046899099834263325, "learning_rate": 6.232832999848511e-07, "loss": 0.0045, "num_input_tokens_seen": 12323712, "step": 25030 }, { "epoch": 3.3040781311864853, "grad_norm": 0.011636439710855484, "learning_rate": 6.228565821651606e-07, "loss": 0.0, "num_input_tokens_seen": 12326272, "step": 25035 }, { "epoch": 3.304738022964234, "grad_norm": 0.00013077407493256032, "learning_rate": 6.224299443914301e-07, "loss": 0.0, "num_input_tokens_seen": 12328896, "step": 25040 }, { "epoch": 3.3053979147419823, "grad_norm": 0.000527672003954649, "learning_rate": 6.22003386754209e-07, "loss": 0.0, "num_input_tokens_seen": 12331328, "step": 25045 }, { "epoch": 3.3060578065197306, "grad_norm": 36.416690826416016, "learning_rate": 6.215769093440325e-07, "loss": 0.0323, "num_input_tokens_seen": 12333568, "step": 25050 }, { "epoch": 3.3067176982974793, "grad_norm": 0.0005653111729770899, "learning_rate": 6.211505122514165e-07, "loss": 0.0003, "num_input_tokens_seen": 12336064, "step": 25055 }, { "epoch": 3.3073775900752276, "grad_norm": 0.001593749038875103, "learning_rate": 6.207241955668605e-07, "loss": 0.0002, "num_input_tokens_seen": 12338752, "step": 25060 }, { "epoch": 3.3080374818529763, "grad_norm": 0.007038143463432789, "learning_rate": 6.202979593808478e-07, "loss": 0.0, "num_input_tokens_seen": 12341184, "step": 25065 }, { "epoch": 3.3086973736307246, "grad_norm": 0.015235554426908493, "learning_rate": 6.198718037838435e-07, "loss": 0.0533, "num_input_tokens_seen": 12343488, "step": 25070 }, { "epoch": 3.309357265408473, "grad_norm": 0.0020212512463331223, "learning_rate": 6.194457288662963e-07, "loss": 0.0465, "num_input_tokens_seen": 12346176, "step": 25075 }, { "epoch": 3.3100171571862216, "grad_norm": 0.0019470115657895803, "learning_rate": 6.190197347186374e-07, "loss": 0.0, "num_input_tokens_seen": 12348480, "step": 25080 }, { "epoch": 3.31067704896397, "grad_norm": 0.03345693275332451, "learning_rate": 6.185938214312808e-07, "loss": 0.0001, "num_input_tokens_seen": 12350848, "step": 25085 }, { "epoch": 3.311336940741718, "grad_norm": 0.0005512295756489038, "learning_rate": 6.181679890946238e-07, "loss": 0.0, "num_input_tokens_seen": 12353472, "step": 25090 }, { "epoch": 3.311996832519467, "grad_norm": 0.0008134461240842938, "learning_rate": 6.17742237799046e-07, "loss": 0.0, "num_input_tokens_seen": 12356224, "step": 25095 }, { "epoch": 3.312656724297215, "grad_norm": 0.0038204581942409277, "learning_rate": 6.173165676349102e-07, "loss": 0.0049, "num_input_tokens_seen": 12358784, "step": 25100 }, { "epoch": 3.313316616074964, "grad_norm": 0.00022062953212298453, "learning_rate": 6.168909786925619e-07, "loss": 0.0, "num_input_tokens_seen": 12361088, "step": 25105 }, { "epoch": 3.313976507852712, "grad_norm": 0.00046019876026548445, "learning_rate": 6.164654710623289e-07, "loss": 0.0, "num_input_tokens_seen": 12363456, "step": 25110 }, { "epoch": 3.3146363996304604, "grad_norm": 0.3447696566581726, "learning_rate": 6.160400448345224e-07, "loss": 0.0001, "num_input_tokens_seen": 12366016, "step": 25115 }, { "epoch": 3.315296291408209, "grad_norm": 0.008802136406302452, "learning_rate": 6.156147000994358e-07, "loss": 0.0005, "num_input_tokens_seen": 12368576, "step": 25120 }, { "epoch": 3.3159561831859574, "grad_norm": 0.46979010105133057, "learning_rate": 6.151894369473459e-07, "loss": 0.0002, "num_input_tokens_seen": 12371008, "step": 25125 }, { "epoch": 3.316616074963706, "grad_norm": 0.0001193025746033527, "learning_rate": 6.147642554685112e-07, "loss": 0.0, "num_input_tokens_seen": 12373376, "step": 25130 }, { "epoch": 3.3172759667414544, "grad_norm": 0.006421966478228569, "learning_rate": 6.143391557531738e-07, "loss": 0.0, "num_input_tokens_seen": 12376064, "step": 25135 }, { "epoch": 3.3179358585192027, "grad_norm": 0.0010537905618548393, "learning_rate": 6.139141378915578e-07, "loss": 0.0061, "num_input_tokens_seen": 12378560, "step": 25140 }, { "epoch": 3.3185957502969514, "grad_norm": 0.01963932067155838, "learning_rate": 6.1348920197387e-07, "loss": 0.0, "num_input_tokens_seen": 12380928, "step": 25145 }, { "epoch": 3.3192556420746997, "grad_norm": 0.292169988155365, "learning_rate": 6.130643480903005e-07, "loss": 0.0002, "num_input_tokens_seen": 12383360, "step": 25150 }, { "epoch": 3.3199155338524484, "grad_norm": 0.0011927977902814746, "learning_rate": 6.126395763310213e-07, "loss": 0.0082, "num_input_tokens_seen": 12385920, "step": 25155 }, { "epoch": 3.3205754256301967, "grad_norm": 0.004244993906468153, "learning_rate": 6.122148867861864e-07, "loss": 0.0308, "num_input_tokens_seen": 12388416, "step": 25160 }, { "epoch": 3.321235317407945, "grad_norm": 11.786145210266113, "learning_rate": 6.117902795459342e-07, "loss": 0.02, "num_input_tokens_seen": 12390976, "step": 25165 }, { "epoch": 3.3218952091856937, "grad_norm": 0.06375247985124588, "learning_rate": 6.113657547003834e-07, "loss": 0.0, "num_input_tokens_seen": 12393472, "step": 25170 }, { "epoch": 3.322555100963442, "grad_norm": 0.0009331585606560111, "learning_rate": 6.109413123396374e-07, "loss": 0.0, "num_input_tokens_seen": 12396224, "step": 25175 }, { "epoch": 3.3232149927411903, "grad_norm": 0.00104428268969059, "learning_rate": 6.105169525537805e-07, "loss": 0.0266, "num_input_tokens_seen": 12398656, "step": 25180 }, { "epoch": 3.323874884518939, "grad_norm": 5.317957766237669e-05, "learning_rate": 6.100926754328797e-07, "loss": 0.0, "num_input_tokens_seen": 12400960, "step": 25185 }, { "epoch": 3.3245347762966873, "grad_norm": 0.010817881673574448, "learning_rate": 6.096684810669855e-07, "loss": 0.0, "num_input_tokens_seen": 12403584, "step": 25190 }, { "epoch": 3.325194668074436, "grad_norm": 0.00012285925913602114, "learning_rate": 6.092443695461289e-07, "loss": 0.0352, "num_input_tokens_seen": 12406144, "step": 25195 }, { "epoch": 3.3258545598521843, "grad_norm": 0.0024948231875896454, "learning_rate": 6.08820340960326e-07, "loss": 0.0, "num_input_tokens_seen": 12408512, "step": 25200 }, { "epoch": 3.3265144516299325, "grad_norm": 7.421601912938058e-05, "learning_rate": 6.083963953995728e-07, "loss": 0.0, "num_input_tokens_seen": 12411136, "step": 25205 }, { "epoch": 3.3271743434076813, "grad_norm": 1.3887579441070557, "learning_rate": 6.079725329538486e-07, "loss": 0.0005, "num_input_tokens_seen": 12413312, "step": 25210 }, { "epoch": 3.3278342351854295, "grad_norm": 15.47343921661377, "learning_rate": 6.075487537131158e-07, "loss": 0.028, "num_input_tokens_seen": 12415744, "step": 25215 }, { "epoch": 3.328494126963178, "grad_norm": 0.0005510664777830243, "learning_rate": 6.071250577673179e-07, "loss": 0.0006, "num_input_tokens_seen": 12418112, "step": 25220 }, { "epoch": 3.3291540187409265, "grad_norm": 0.0006473141256719828, "learning_rate": 6.067014452063816e-07, "loss": 0.0, "num_input_tokens_seen": 12420672, "step": 25225 }, { "epoch": 3.329813910518675, "grad_norm": 0.00029872983577661216, "learning_rate": 6.062779161202156e-07, "loss": 0.0, "num_input_tokens_seen": 12422848, "step": 25230 }, { "epoch": 3.3304738022964235, "grad_norm": 0.0003775710938498378, "learning_rate": 6.058544705987105e-07, "loss": 0.0, "num_input_tokens_seen": 12425280, "step": 25235 }, { "epoch": 3.331133694074172, "grad_norm": 0.019592612981796265, "learning_rate": 6.0543110873174e-07, "loss": 0.0366, "num_input_tokens_seen": 12427712, "step": 25240 }, { "epoch": 3.33179358585192, "grad_norm": 0.0002695015864446759, "learning_rate": 6.050078306091595e-07, "loss": 0.0, "num_input_tokens_seen": 12430528, "step": 25245 }, { "epoch": 3.332453477629669, "grad_norm": 0.0001931964507093653, "learning_rate": 6.045846363208066e-07, "loss": 0.0001, "num_input_tokens_seen": 12432768, "step": 25250 }, { "epoch": 3.333113369407417, "grad_norm": 0.0038819850888103247, "learning_rate": 6.041615259565014e-07, "loss": 0.0, "num_input_tokens_seen": 12435392, "step": 25255 }, { "epoch": 3.333773261185166, "grad_norm": 0.000421057891799137, "learning_rate": 6.037384996060455e-07, "loss": 0.0, "num_input_tokens_seen": 12437568, "step": 25260 }, { "epoch": 3.334433152962914, "grad_norm": 10.183027267456055, "learning_rate": 6.033155573592239e-07, "loss": 0.0266, "num_input_tokens_seen": 12439744, "step": 25265 }, { "epoch": 3.3350930447406624, "grad_norm": 0.00021547931828536093, "learning_rate": 6.028926993058026e-07, "loss": 0.0252, "num_input_tokens_seen": 12442048, "step": 25270 }, { "epoch": 3.335752936518411, "grad_norm": 0.00020331527048256248, "learning_rate": 6.024699255355302e-07, "loss": 0.0001, "num_input_tokens_seen": 12444992, "step": 25275 }, { "epoch": 3.3364128282961594, "grad_norm": 0.34224241971969604, "learning_rate": 6.020472361381374e-07, "loss": 0.0002, "num_input_tokens_seen": 12447296, "step": 25280 }, { "epoch": 3.337072720073908, "grad_norm": 0.00011249903764110059, "learning_rate": 6.016246312033371e-07, "loss": 0.0, "num_input_tokens_seen": 12449920, "step": 25285 }, { "epoch": 3.3377326118516564, "grad_norm": 8.940664883994032e-06, "learning_rate": 6.01202110820824e-07, "loss": 0.0294, "num_input_tokens_seen": 12452416, "step": 25290 }, { "epoch": 3.3383925036294047, "grad_norm": 0.012581578455865383, "learning_rate": 6.007796750802748e-07, "loss": 0.0736, "num_input_tokens_seen": 12454784, "step": 25295 }, { "epoch": 3.3390523954071534, "grad_norm": 0.0010578975779935718, "learning_rate": 6.003573240713489e-07, "loss": 0.0, "num_input_tokens_seen": 12457536, "step": 25300 }, { "epoch": 3.3397122871849017, "grad_norm": 0.17746460437774658, "learning_rate": 5.999350578836868e-07, "loss": 0.0004, "num_input_tokens_seen": 12460032, "step": 25305 }, { "epoch": 3.3403721789626504, "grad_norm": 7.115570042515174e-05, "learning_rate": 5.995128766069118e-07, "loss": 0.0, "num_input_tokens_seen": 12462336, "step": 25310 }, { "epoch": 3.3410320707403987, "grad_norm": 0.006361126434057951, "learning_rate": 5.990907803306286e-07, "loss": 0.0813, "num_input_tokens_seen": 12464960, "step": 25315 }, { "epoch": 3.341691962518147, "grad_norm": 0.0019897071179002523, "learning_rate": 5.986687691444239e-07, "loss": 0.0001, "num_input_tokens_seen": 12467648, "step": 25320 }, { "epoch": 3.3423518542958957, "grad_norm": 5.641576717607677e-05, "learning_rate": 5.98246843137867e-07, "loss": 0.0, "num_input_tokens_seen": 12470144, "step": 25325 }, { "epoch": 3.343011746073644, "grad_norm": 0.00012555062130559236, "learning_rate": 5.978250024005082e-07, "loss": 0.0001, "num_input_tokens_seen": 12472512, "step": 25330 }, { "epoch": 3.343671637851392, "grad_norm": 0.0028107953257858753, "learning_rate": 5.974032470218804e-07, "loss": 0.0, "num_input_tokens_seen": 12474880, "step": 25335 }, { "epoch": 3.344331529629141, "grad_norm": 0.00012251742009539157, "learning_rate": 5.969815770914983e-07, "loss": 0.0502, "num_input_tokens_seen": 12477632, "step": 25340 }, { "epoch": 3.344991421406889, "grad_norm": 0.029161326587200165, "learning_rate": 5.965599926988575e-07, "loss": 0.0, "num_input_tokens_seen": 12480704, "step": 25345 }, { "epoch": 3.3456513131846375, "grad_norm": 8.900999091565609e-06, "learning_rate": 5.961384939334373e-07, "loss": 0.0003, "num_input_tokens_seen": 12483200, "step": 25350 }, { "epoch": 3.346311204962386, "grad_norm": 0.010479514487087727, "learning_rate": 5.957170808846968e-07, "loss": 0.0011, "num_input_tokens_seen": 12485504, "step": 25355 }, { "epoch": 3.3469710967401345, "grad_norm": 0.014084945432841778, "learning_rate": 5.952957536420786e-07, "loss": 0.0, "num_input_tokens_seen": 12487872, "step": 25360 }, { "epoch": 3.347630988517883, "grad_norm": 0.002309830393642187, "learning_rate": 5.948745122950061e-07, "loss": 0.0, "num_input_tokens_seen": 12489984, "step": 25365 }, { "epoch": 3.3482908802956315, "grad_norm": 0.050204720348119736, "learning_rate": 5.944533569328841e-07, "loss": 0.0, "num_input_tokens_seen": 12492544, "step": 25370 }, { "epoch": 3.3489507720733798, "grad_norm": 0.0001361641043331474, "learning_rate": 5.940322876451009e-07, "loss": 0.0267, "num_input_tokens_seen": 12494592, "step": 25375 }, { "epoch": 3.3496106638511285, "grad_norm": 1.233882903761696e-05, "learning_rate": 5.936113045210245e-07, "loss": 0.0, "num_input_tokens_seen": 12496896, "step": 25380 }, { "epoch": 3.3502705556288768, "grad_norm": 53.146240234375, "learning_rate": 5.931904076500062e-07, "loss": 0.1251, "num_input_tokens_seen": 12499648, "step": 25385 }, { "epoch": 3.3509304474066255, "grad_norm": 3.949514575651847e-05, "learning_rate": 5.927695971213781e-07, "loss": 0.002, "num_input_tokens_seen": 12502272, "step": 25390 }, { "epoch": 3.3515903391843738, "grad_norm": 3.287645085947588e-05, "learning_rate": 5.923488730244537e-07, "loss": 0.0, "num_input_tokens_seen": 12504960, "step": 25395 }, { "epoch": 3.352250230962122, "grad_norm": 0.45993924140930176, "learning_rate": 5.919282354485293e-07, "loss": 0.0413, "num_input_tokens_seen": 12507456, "step": 25400 }, { "epoch": 3.3529101227398708, "grad_norm": 1.4746529814146925e-05, "learning_rate": 5.915076844828817e-07, "loss": 0.0002, "num_input_tokens_seen": 12509824, "step": 25405 }, { "epoch": 3.353570014517619, "grad_norm": 0.00013451321865431964, "learning_rate": 5.910872202167701e-07, "loss": 0.0, "num_input_tokens_seen": 12512576, "step": 25410 }, { "epoch": 3.3542299062953678, "grad_norm": 0.0022569603752344847, "learning_rate": 5.90666842739435e-07, "loss": 0.0, "num_input_tokens_seen": 12515072, "step": 25415 }, { "epoch": 3.354889798073116, "grad_norm": 0.030115384608507156, "learning_rate": 5.902465521400982e-07, "loss": 0.0025, "num_input_tokens_seen": 12517632, "step": 25420 }, { "epoch": 3.3555496898508643, "grad_norm": 0.0004955868935212493, "learning_rate": 5.898263485079636e-07, "loss": 0.0032, "num_input_tokens_seen": 12519872, "step": 25425 }, { "epoch": 3.356209581628613, "grad_norm": 0.03164421021938324, "learning_rate": 5.89406231932216e-07, "loss": 0.0, "num_input_tokens_seen": 12522304, "step": 25430 }, { "epoch": 3.3568694734063613, "grad_norm": 2.380097794230096e-05, "learning_rate": 5.889862025020227e-07, "loss": 0.0922, "num_input_tokens_seen": 12524480, "step": 25435 }, { "epoch": 3.35752936518411, "grad_norm": 0.113060362637043, "learning_rate": 5.885662603065316e-07, "loss": 0.0001, "num_input_tokens_seen": 12527040, "step": 25440 }, { "epoch": 3.3581892569618583, "grad_norm": 0.00010437117452966049, "learning_rate": 5.881464054348721e-07, "loss": 0.0, "num_input_tokens_seen": 12529216, "step": 25445 }, { "epoch": 3.3588491487396066, "grad_norm": 0.002310203155502677, "learning_rate": 5.877266379761561e-07, "loss": 0.0, "num_input_tokens_seen": 12531584, "step": 25450 }, { "epoch": 3.3595090405173553, "grad_norm": 0.010437843389809132, "learning_rate": 5.873069580194753e-07, "loss": 0.0, "num_input_tokens_seen": 12534144, "step": 25455 }, { "epoch": 3.3601689322951036, "grad_norm": 1.0495466085558292e-05, "learning_rate": 5.868873656539044e-07, "loss": 0.0337, "num_input_tokens_seen": 12536320, "step": 25460 }, { "epoch": 3.360828824072852, "grad_norm": 0.010058234445750713, "learning_rate": 5.864678609684986e-07, "loss": 0.0002, "num_input_tokens_seen": 12538624, "step": 25465 }, { "epoch": 3.3614887158506006, "grad_norm": 5.52868950762786e-05, "learning_rate": 5.860484440522946e-07, "loss": 0.0007, "num_input_tokens_seen": 12541120, "step": 25470 }, { "epoch": 3.362148607628349, "grad_norm": 8.224558769143187e-06, "learning_rate": 5.856291149943109e-07, "loss": 0.1057, "num_input_tokens_seen": 12543424, "step": 25475 }, { "epoch": 3.362808499406097, "grad_norm": 20.38540267944336, "learning_rate": 5.852098738835467e-07, "loss": 0.0891, "num_input_tokens_seen": 12545984, "step": 25480 }, { "epoch": 3.363468391183846, "grad_norm": 740.0801391601562, "learning_rate": 5.847907208089834e-07, "loss": 0.0404, "num_input_tokens_seen": 12548608, "step": 25485 }, { "epoch": 3.364128282961594, "grad_norm": 0.0001841172925196588, "learning_rate": 5.843716558595831e-07, "loss": 0.0, "num_input_tokens_seen": 12551104, "step": 25490 }, { "epoch": 3.364788174739343, "grad_norm": 13.210966110229492, "learning_rate": 5.839526791242883e-07, "loss": 0.0276, "num_input_tokens_seen": 12553600, "step": 25495 }, { "epoch": 3.365448066517091, "grad_norm": 0.026582859456539154, "learning_rate": 5.835337906920253e-07, "loss": 0.0009, "num_input_tokens_seen": 12556032, "step": 25500 }, { "epoch": 3.3661079582948394, "grad_norm": 0.0003508688823785633, "learning_rate": 5.831149906516989e-07, "loss": 0.0394, "num_input_tokens_seen": 12558464, "step": 25505 }, { "epoch": 3.366767850072588, "grad_norm": 0.006343053188174963, "learning_rate": 5.826962790921974e-07, "loss": 0.0, "num_input_tokens_seen": 12560704, "step": 25510 }, { "epoch": 3.3674277418503364, "grad_norm": 9.642515215091407e-05, "learning_rate": 5.822776561023885e-07, "loss": 0.0337, "num_input_tokens_seen": 12563136, "step": 25515 }, { "epoch": 3.368087633628085, "grad_norm": 0.12306500971317291, "learning_rate": 5.81859121771122e-07, "loss": 0.0001, "num_input_tokens_seen": 12565376, "step": 25520 }, { "epoch": 3.3687475254058334, "grad_norm": 0.0005888367886655033, "learning_rate": 5.814406761872294e-07, "loss": 0.0, "num_input_tokens_seen": 12567680, "step": 25525 }, { "epoch": 3.3694074171835817, "grad_norm": 2.025174617767334, "learning_rate": 5.810223194395221e-07, "loss": 0.0018, "num_input_tokens_seen": 12570176, "step": 25530 }, { "epoch": 3.3700673089613304, "grad_norm": 0.0001719648134894669, "learning_rate": 5.806040516167933e-07, "loss": 0.0631, "num_input_tokens_seen": 12572672, "step": 25535 }, { "epoch": 3.3707272007390787, "grad_norm": 0.03616896644234657, "learning_rate": 5.801858728078179e-07, "loss": 0.0364, "num_input_tokens_seen": 12575296, "step": 25540 }, { "epoch": 3.3713870925168274, "grad_norm": 0.00024308938009198755, "learning_rate": 5.797677831013506e-07, "loss": 0.076, "num_input_tokens_seen": 12577856, "step": 25545 }, { "epoch": 3.3720469842945757, "grad_norm": 1.1081480979919434, "learning_rate": 5.793497825861283e-07, "loss": 0.0607, "num_input_tokens_seen": 12580224, "step": 25550 }, { "epoch": 3.372706876072324, "grad_norm": 0.004675113596022129, "learning_rate": 5.789318713508686e-07, "loss": 0.055, "num_input_tokens_seen": 12582464, "step": 25555 }, { "epoch": 3.3733667678500727, "grad_norm": 0.1336895078420639, "learning_rate": 5.785140494842704e-07, "loss": 0.0001, "num_input_tokens_seen": 12584768, "step": 25560 }, { "epoch": 3.374026659627821, "grad_norm": 0.20448976755142212, "learning_rate": 5.780963170750129e-07, "loss": 0.0001, "num_input_tokens_seen": 12587264, "step": 25565 }, { "epoch": 3.3746865514055697, "grad_norm": 0.13979199528694153, "learning_rate": 5.776786742117564e-07, "loss": 0.0001, "num_input_tokens_seen": 12589760, "step": 25570 }, { "epoch": 3.375346443183318, "grad_norm": 25.667259216308594, "learning_rate": 5.772611209831436e-07, "loss": 0.0678, "num_input_tokens_seen": 12592320, "step": 25575 }, { "epoch": 3.3760063349610663, "grad_norm": 0.2753572165966034, "learning_rate": 5.768436574777964e-07, "loss": 0.0014, "num_input_tokens_seen": 12594944, "step": 25580 }, { "epoch": 3.376666226738815, "grad_norm": 0.08741661161184311, "learning_rate": 5.764262837843186e-07, "loss": 0.0001, "num_input_tokens_seen": 12597312, "step": 25585 }, { "epoch": 3.3773261185165633, "grad_norm": 1.0904316902160645, "learning_rate": 5.760089999912947e-07, "loss": 0.0004, "num_input_tokens_seen": 12599744, "step": 25590 }, { "epoch": 3.3779860102943116, "grad_norm": 43.92424392700195, "learning_rate": 5.755918061872907e-07, "loss": 0.0473, "num_input_tokens_seen": 12602496, "step": 25595 }, { "epoch": 3.3786459020720603, "grad_norm": 0.0960235595703125, "learning_rate": 5.751747024608527e-07, "loss": 0.0001, "num_input_tokens_seen": 12604736, "step": 25600 }, { "epoch": 3.3793057938498086, "grad_norm": 8.825310214888304e-05, "learning_rate": 5.747576889005068e-07, "loss": 0.0474, "num_input_tokens_seen": 12607168, "step": 25605 }, { "epoch": 3.379965685627557, "grad_norm": 0.486053466796875, "learning_rate": 5.743407655947627e-07, "loss": 0.0322, "num_input_tokens_seen": 12609920, "step": 25610 }, { "epoch": 3.3806255774053056, "grad_norm": 26.345102310180664, "learning_rate": 5.739239326321086e-07, "loss": 0.0329, "num_input_tokens_seen": 12612160, "step": 25615 }, { "epoch": 3.381285469183054, "grad_norm": 0.013855023309588432, "learning_rate": 5.735071901010146e-07, "loss": 0.0, "num_input_tokens_seen": 12614720, "step": 25620 }, { "epoch": 3.3819453609608026, "grad_norm": 0.0022012635599821806, "learning_rate": 5.730905380899309e-07, "loss": 0.0001, "num_input_tokens_seen": 12617472, "step": 25625 }, { "epoch": 3.382605252738551, "grad_norm": 0.006016657687723637, "learning_rate": 5.72673976687289e-07, "loss": 0.0, "num_input_tokens_seen": 12619968, "step": 25630 }, { "epoch": 3.383265144516299, "grad_norm": 0.0054673487320542336, "learning_rate": 5.722575059815014e-07, "loss": 0.0022, "num_input_tokens_seen": 12622272, "step": 25635 }, { "epoch": 3.383925036294048, "grad_norm": 0.5703974366188049, "learning_rate": 5.718411260609599e-07, "loss": 0.0005, "num_input_tokens_seen": 12624576, "step": 25640 }, { "epoch": 3.384584928071796, "grad_norm": 2.5996956825256348, "learning_rate": 5.714248370140397e-07, "loss": 0.0251, "num_input_tokens_seen": 12627264, "step": 25645 }, { "epoch": 3.385244819849545, "grad_norm": 0.038143787533044815, "learning_rate": 5.710086389290945e-07, "loss": 0.004, "num_input_tokens_seen": 12629952, "step": 25650 }, { "epoch": 3.385904711627293, "grad_norm": 0.0001890839048428461, "learning_rate": 5.705925318944585e-07, "loss": 0.0, "num_input_tokens_seen": 12632768, "step": 25655 }, { "epoch": 3.3865646034050414, "grad_norm": 0.0002839722437784076, "learning_rate": 5.701765159984483e-07, "loss": 0.0, "num_input_tokens_seen": 12635008, "step": 25660 }, { "epoch": 3.38722449518279, "grad_norm": 0.00851992517709732, "learning_rate": 5.6976059132936e-07, "loss": 0.0, "num_input_tokens_seen": 12637568, "step": 25665 }, { "epoch": 3.3878843869605384, "grad_norm": 0.0002972839865833521, "learning_rate": 5.69344757975471e-07, "loss": 0.0675, "num_input_tokens_seen": 12640192, "step": 25670 }, { "epoch": 3.388544278738287, "grad_norm": 336.3095703125, "learning_rate": 5.689290160250382e-07, "loss": 0.1155, "num_input_tokens_seen": 12642688, "step": 25675 }, { "epoch": 3.3892041705160354, "grad_norm": 4.639300346374512, "learning_rate": 5.685133655663001e-07, "loss": 0.0025, "num_input_tokens_seen": 12645376, "step": 25680 }, { "epoch": 3.3898640622937837, "grad_norm": 0.0036630574613809586, "learning_rate": 5.68097806687476e-07, "loss": 0.1427, "num_input_tokens_seen": 12648000, "step": 25685 }, { "epoch": 3.3905239540715324, "grad_norm": 0.00019933377916458994, "learning_rate": 5.676823394767644e-07, "loss": 0.0001, "num_input_tokens_seen": 12650816, "step": 25690 }, { "epoch": 3.3911838458492807, "grad_norm": 0.002877293387427926, "learning_rate": 5.672669640223458e-07, "loss": 0.0044, "num_input_tokens_seen": 12653376, "step": 25695 }, { "epoch": 3.3918437376270294, "grad_norm": 1.301897646044381e-05, "learning_rate": 5.668516804123808e-07, "loss": 0.0, "num_input_tokens_seen": 12656000, "step": 25700 }, { "epoch": 3.3925036294047777, "grad_norm": 0.9059159159660339, "learning_rate": 5.664364887350097e-07, "loss": 0.0018, "num_input_tokens_seen": 12658496, "step": 25705 }, { "epoch": 3.393163521182526, "grad_norm": 0.0015077658463269472, "learning_rate": 5.660213890783541e-07, "loss": 0.0, "num_input_tokens_seen": 12660864, "step": 25710 }, { "epoch": 3.3938234129602747, "grad_norm": 0.00017341156490147114, "learning_rate": 5.656063815305161e-07, "loss": 0.0, "num_input_tokens_seen": 12663424, "step": 25715 }, { "epoch": 3.394483304738023, "grad_norm": 0.00016228918684646487, "learning_rate": 5.651914661795785e-07, "loss": 0.0, "num_input_tokens_seen": 12665856, "step": 25720 }, { "epoch": 3.3951431965157712, "grad_norm": 0.08427057415246964, "learning_rate": 5.64776643113603e-07, "loss": 0.0001, "num_input_tokens_seen": 12668352, "step": 25725 }, { "epoch": 3.39580308829352, "grad_norm": 0.00015899655409157276, "learning_rate": 5.643619124206333e-07, "loss": 0.0, "num_input_tokens_seen": 12670592, "step": 25730 }, { "epoch": 3.3964629800712682, "grad_norm": 1.2886687727586832e-05, "learning_rate": 5.639472741886937e-07, "loss": 0.0518, "num_input_tokens_seen": 12673024, "step": 25735 }, { "epoch": 3.3971228718490165, "grad_norm": 0.14700756967067719, "learning_rate": 5.635327285057869e-07, "loss": 0.0003, "num_input_tokens_seen": 12675456, "step": 25740 }, { "epoch": 3.3977827636267652, "grad_norm": 0.002936385106295347, "learning_rate": 5.63118275459898e-07, "loss": 0.0, "num_input_tokens_seen": 12678080, "step": 25745 }, { "epoch": 3.3984426554045135, "grad_norm": 0.00019914221775252372, "learning_rate": 5.627039151389917e-07, "loss": 0.0595, "num_input_tokens_seen": 12680320, "step": 25750 }, { "epoch": 3.3991025471822622, "grad_norm": 1.2330317076703068e-05, "learning_rate": 5.622896476310125e-07, "loss": 0.0, "num_input_tokens_seen": 12682624, "step": 25755 }, { "epoch": 3.3997624389600105, "grad_norm": 0.000984366051852703, "learning_rate": 5.618754730238863e-07, "loss": 0.0, "num_input_tokens_seen": 12685120, "step": 25760 }, { "epoch": 3.400422330737759, "grad_norm": 1.5101558346941601e-05, "learning_rate": 5.614613914055175e-07, "loss": 0.2906, "num_input_tokens_seen": 12687808, "step": 25765 }, { "epoch": 3.4010822225155075, "grad_norm": 0.0074266353622078896, "learning_rate": 5.610474028637935e-07, "loss": 0.0, "num_input_tokens_seen": 12690240, "step": 25770 }, { "epoch": 3.401742114293256, "grad_norm": 3.550009205355309e-05, "learning_rate": 5.606335074865795e-07, "loss": 0.0, "num_input_tokens_seen": 12692544, "step": 25775 }, { "epoch": 3.4024020060710045, "grad_norm": 0.0015909220091998577, "learning_rate": 5.602197053617214e-07, "loss": 0.0, "num_input_tokens_seen": 12695104, "step": 25780 }, { "epoch": 3.403061897848753, "grad_norm": 0.0009785228176042438, "learning_rate": 5.598059965770468e-07, "loss": 0.0, "num_input_tokens_seen": 12697536, "step": 25785 }, { "epoch": 3.403721789626501, "grad_norm": 0.0001278437121072784, "learning_rate": 5.593923812203613e-07, "loss": 0.0002, "num_input_tokens_seen": 12699840, "step": 25790 }, { "epoch": 3.40438168140425, "grad_norm": 0.0002581992303021252, "learning_rate": 5.589788593794529e-07, "loss": 0.0, "num_input_tokens_seen": 12702336, "step": 25795 }, { "epoch": 3.405041573181998, "grad_norm": 0.00014046661090105772, "learning_rate": 5.585654311420873e-07, "loss": 0.0, "num_input_tokens_seen": 12704640, "step": 25800 }, { "epoch": 3.405701464959747, "grad_norm": 0.049336936324834824, "learning_rate": 5.581520965960125e-07, "loss": 0.0001, "num_input_tokens_seen": 12707264, "step": 25805 }, { "epoch": 3.406361356737495, "grad_norm": 4.411644840729423e-05, "learning_rate": 5.57738855828956e-07, "loss": 0.028, "num_input_tokens_seen": 12709888, "step": 25810 }, { "epoch": 3.4070212485152434, "grad_norm": 0.0019217153312638402, "learning_rate": 5.573257089286243e-07, "loss": 0.0, "num_input_tokens_seen": 12712192, "step": 25815 }, { "epoch": 3.407681140292992, "grad_norm": 16.16039276123047, "learning_rate": 5.569126559827053e-07, "loss": 0.0454, "num_input_tokens_seen": 12714560, "step": 25820 }, { "epoch": 3.4083410320707404, "grad_norm": 0.0001751862291712314, "learning_rate": 5.564996970788667e-07, "loss": 0.0, "num_input_tokens_seen": 12717056, "step": 25825 }, { "epoch": 3.409000923848489, "grad_norm": 0.0007003069040365517, "learning_rate": 5.560868323047556e-07, "loss": 0.0059, "num_input_tokens_seen": 12719552, "step": 25830 }, { "epoch": 3.4096608156262374, "grad_norm": 1.921590774145443e-05, "learning_rate": 5.556740617479998e-07, "loss": 0.0, "num_input_tokens_seen": 12721920, "step": 25835 }, { "epoch": 3.4103207074039856, "grad_norm": 7.84834410296753e-05, "learning_rate": 5.552613854962067e-07, "loss": 0.0352, "num_input_tokens_seen": 12724608, "step": 25840 }, { "epoch": 3.4109805991817344, "grad_norm": 19.140037536621094, "learning_rate": 5.548488036369645e-07, "loss": 0.0472, "num_input_tokens_seen": 12727168, "step": 25845 }, { "epoch": 3.4116404909594826, "grad_norm": 0.00015301394159905612, "learning_rate": 5.544363162578399e-07, "loss": 0.0, "num_input_tokens_seen": 12729408, "step": 25850 }, { "epoch": 3.412300382737231, "grad_norm": 0.0004984589177183807, "learning_rate": 5.540239234463804e-07, "loss": 0.0, "num_input_tokens_seen": 12732160, "step": 25855 }, { "epoch": 3.4129602745149796, "grad_norm": 0.0006072504911571741, "learning_rate": 5.536116252901142e-07, "loss": 0.0, "num_input_tokens_seen": 12734656, "step": 25860 }, { "epoch": 3.413620166292728, "grad_norm": 0.004299015738070011, "learning_rate": 5.531994218765477e-07, "loss": 0.0213, "num_input_tokens_seen": 12737216, "step": 25865 }, { "epoch": 3.4142800580704766, "grad_norm": 0.01571951061487198, "learning_rate": 5.527873132931682e-07, "loss": 0.0, "num_input_tokens_seen": 12739968, "step": 25870 }, { "epoch": 3.414939949848225, "grad_norm": 12.407855987548828, "learning_rate": 5.523752996274435e-07, "loss": 0.0366, "num_input_tokens_seen": 12742208, "step": 25875 }, { "epoch": 3.415599841625973, "grad_norm": 0.0006227616686373949, "learning_rate": 5.519633809668197e-07, "loss": 0.0003, "num_input_tokens_seen": 12744640, "step": 25880 }, { "epoch": 3.416259733403722, "grad_norm": 1.4746785163879395, "learning_rate": 5.515515573987238e-07, "loss": 0.0007, "num_input_tokens_seen": 12747008, "step": 25885 }, { "epoch": 3.41691962518147, "grad_norm": 0.004045584239065647, "learning_rate": 5.511398290105625e-07, "loss": 0.1469, "num_input_tokens_seen": 12749568, "step": 25890 }, { "epoch": 3.4175795169592185, "grad_norm": 0.7868015170097351, "learning_rate": 5.507281958897224e-07, "loss": 0.0004, "num_input_tokens_seen": 12752128, "step": 25895 }, { "epoch": 3.418239408736967, "grad_norm": 0.31684139370918274, "learning_rate": 5.503166581235694e-07, "loss": 0.0002, "num_input_tokens_seen": 12754560, "step": 25900 }, { "epoch": 3.4188993005147155, "grad_norm": 0.01085609383881092, "learning_rate": 5.499052157994486e-07, "loss": 0.0938, "num_input_tokens_seen": 12757248, "step": 25905 }, { "epoch": 3.419559192292464, "grad_norm": 0.06237511336803436, "learning_rate": 5.49493869004687e-07, "loss": 0.0352, "num_input_tokens_seen": 12759872, "step": 25910 }, { "epoch": 3.4202190840702125, "grad_norm": 0.016344398260116577, "learning_rate": 5.490826178265893e-07, "loss": 0.0001, "num_input_tokens_seen": 12762240, "step": 25915 }, { "epoch": 3.4208789758479607, "grad_norm": 0.3351937532424927, "learning_rate": 5.486714623524405e-07, "loss": 0.0026, "num_input_tokens_seen": 12764544, "step": 25920 }, { "epoch": 3.4215388676257095, "grad_norm": 0.01966346614062786, "learning_rate": 5.482604026695057e-07, "loss": 0.0, "num_input_tokens_seen": 12767296, "step": 25925 }, { "epoch": 3.4221987594034577, "grad_norm": 45.29475402832031, "learning_rate": 5.478494388650295e-07, "loss": 0.1487, "num_input_tokens_seen": 12769920, "step": 25930 }, { "epoch": 3.4228586511812065, "grad_norm": 19.655010223388672, "learning_rate": 5.474385710262357e-07, "loss": 0.0041, "num_input_tokens_seen": 12772416, "step": 25935 }, { "epoch": 3.4235185429589547, "grad_norm": 0.4863883852958679, "learning_rate": 5.470277992403271e-07, "loss": 0.0268, "num_input_tokens_seen": 12774720, "step": 25940 }, { "epoch": 3.424178434736703, "grad_norm": 0.00197918270714581, "learning_rate": 5.466171235944889e-07, "loss": 0.0657, "num_input_tokens_seen": 12777152, "step": 25945 }, { "epoch": 3.4248383265144517, "grad_norm": 0.003174730809405446, "learning_rate": 5.462065441758826e-07, "loss": 0.0, "num_input_tokens_seen": 12779776, "step": 25950 }, { "epoch": 3.4254982182922, "grad_norm": 0.00010079597268486395, "learning_rate": 5.457960610716515e-07, "loss": 0.0984, "num_input_tokens_seen": 12782272, "step": 25955 }, { "epoch": 3.4261581100699487, "grad_norm": 0.05801895633339882, "learning_rate": 5.453856743689172e-07, "loss": 0.0387, "num_input_tokens_seen": 12785088, "step": 25960 }, { "epoch": 3.426818001847697, "grad_norm": 0.014650064520537853, "learning_rate": 5.449753841547811e-07, "loss": 0.0009, "num_input_tokens_seen": 12787392, "step": 25965 }, { "epoch": 3.4274778936254453, "grad_norm": 1.5185226202011108, "learning_rate": 5.445651905163253e-07, "loss": 0.0017, "num_input_tokens_seen": 12789952, "step": 25970 }, { "epoch": 3.428137785403194, "grad_norm": 0.00041124902782030404, "learning_rate": 5.441550935406091e-07, "loss": 0.02, "num_input_tokens_seen": 12792640, "step": 25975 }, { "epoch": 3.4287976771809423, "grad_norm": 0.22763076424598694, "learning_rate": 5.43745093314674e-07, "loss": 0.0001, "num_input_tokens_seen": 12795136, "step": 25980 }, { "epoch": 3.4294575689586906, "grad_norm": 0.0003356730449013412, "learning_rate": 5.433351899255389e-07, "loss": 0.0, "num_input_tokens_seen": 12797824, "step": 25985 }, { "epoch": 3.4301174607364393, "grad_norm": 0.04222612828016281, "learning_rate": 5.429253834602025e-07, "loss": 0.0013, "num_input_tokens_seen": 12800640, "step": 25990 }, { "epoch": 3.4307773525141876, "grad_norm": 0.05344577878713608, "learning_rate": 5.425156740056436e-07, "loss": 0.0, "num_input_tokens_seen": 12803200, "step": 25995 }, { "epoch": 3.4314372442919363, "grad_norm": 0.001554910559207201, "learning_rate": 5.4210606164882e-07, "loss": 0.0001, "num_input_tokens_seen": 12805696, "step": 26000 }, { "epoch": 3.4320971360696846, "grad_norm": 33.12980651855469, "learning_rate": 5.416965464766694e-07, "loss": 0.1064, "num_input_tokens_seen": 12808640, "step": 26005 }, { "epoch": 3.432757027847433, "grad_norm": 0.00654615368694067, "learning_rate": 5.412871285761076e-07, "loss": 0.0001, "num_input_tokens_seen": 12811328, "step": 26010 }, { "epoch": 3.4334169196251816, "grad_norm": 8.831528248265386e-05, "learning_rate": 5.408778080340311e-07, "loss": 0.0701, "num_input_tokens_seen": 12814144, "step": 26015 }, { "epoch": 3.43407681140293, "grad_norm": 0.002978324657306075, "learning_rate": 5.404685849373154e-07, "loss": 0.0001, "num_input_tokens_seen": 12816512, "step": 26020 }, { "epoch": 3.434736703180678, "grad_norm": 0.00023777736350893974, "learning_rate": 5.400594593728146e-07, "loss": 0.0001, "num_input_tokens_seen": 12818752, "step": 26025 }, { "epoch": 3.435396594958427, "grad_norm": 0.0007493056473322213, "learning_rate": 5.396504314273629e-07, "loss": 0.0, "num_input_tokens_seen": 12821312, "step": 26030 }, { "epoch": 3.436056486736175, "grad_norm": 0.018175829201936722, "learning_rate": 5.39241501187774e-07, "loss": 0.0, "num_input_tokens_seen": 12823872, "step": 26035 }, { "epoch": 3.436716378513924, "grad_norm": 0.0037966063246130943, "learning_rate": 5.388326687408395e-07, "loss": 0.0004, "num_input_tokens_seen": 12826560, "step": 26040 }, { "epoch": 3.437376270291672, "grad_norm": 0.0003152771678287536, "learning_rate": 5.384239341733314e-07, "loss": 0.0, "num_input_tokens_seen": 12829248, "step": 26045 }, { "epoch": 3.4380361620694204, "grad_norm": 0.001747465692460537, "learning_rate": 5.38015297572001e-07, "loss": 0.0, "num_input_tokens_seen": 12831616, "step": 26050 }, { "epoch": 3.438696053847169, "grad_norm": 0.0026879829820245504, "learning_rate": 5.376067590235786e-07, "loss": 0.0, "num_input_tokens_seen": 12834240, "step": 26055 }, { "epoch": 3.4393559456249174, "grad_norm": 0.0006490828818641603, "learning_rate": 5.371983186147729e-07, "loss": 0.0, "num_input_tokens_seen": 12836928, "step": 26060 }, { "epoch": 3.440015837402666, "grad_norm": 0.0013973484747111797, "learning_rate": 5.367899764322725e-07, "loss": 0.0016, "num_input_tokens_seen": 12839808, "step": 26065 }, { "epoch": 3.4406757291804144, "grad_norm": 0.20865289866924286, "learning_rate": 5.363817325627458e-07, "loss": 0.0673, "num_input_tokens_seen": 12842176, "step": 26070 }, { "epoch": 3.4413356209581627, "grad_norm": 0.08312004804611206, "learning_rate": 5.359735870928388e-07, "loss": 0.0, "num_input_tokens_seen": 12844672, "step": 26075 }, { "epoch": 3.4419955127359114, "grad_norm": 0.0006424880702979863, "learning_rate": 5.355655401091776e-07, "loss": 0.0938, "num_input_tokens_seen": 12846976, "step": 26080 }, { "epoch": 3.4426554045136597, "grad_norm": 0.04234812408685684, "learning_rate": 5.351575916983677e-07, "loss": 0.0001, "num_input_tokens_seen": 12849600, "step": 26085 }, { "epoch": 3.4433152962914084, "grad_norm": 0.0009398151305504143, "learning_rate": 5.347497419469926e-07, "loss": 0.0457, "num_input_tokens_seen": 12851776, "step": 26090 }, { "epoch": 3.4439751880691567, "grad_norm": 0.034936707466840744, "learning_rate": 5.34341990941616e-07, "loss": 0.0, "num_input_tokens_seen": 12854144, "step": 26095 }, { "epoch": 3.444635079846905, "grad_norm": 0.0014790110290050507, "learning_rate": 5.33934338768779e-07, "loss": 0.0001, "num_input_tokens_seen": 12856640, "step": 26100 }, { "epoch": 3.4452949716246537, "grad_norm": 0.0009260703809559345, "learning_rate": 5.335267855150045e-07, "loss": 0.0, "num_input_tokens_seen": 12859072, "step": 26105 }, { "epoch": 3.445954863402402, "grad_norm": 0.0021803437266498804, "learning_rate": 5.331193312667916e-07, "loss": 0.0, "num_input_tokens_seen": 12861824, "step": 26110 }, { "epoch": 3.4466147551801507, "grad_norm": 0.0007027118699625134, "learning_rate": 5.327119761106193e-07, "loss": 0.0298, "num_input_tokens_seen": 12864064, "step": 26115 }, { "epoch": 3.447274646957899, "grad_norm": 0.0019124702084809542, "learning_rate": 5.323047201329468e-07, "loss": 0.0036, "num_input_tokens_seen": 12866688, "step": 26120 }, { "epoch": 3.4479345387356473, "grad_norm": 0.001602896605618298, "learning_rate": 5.318975634202103e-07, "loss": 0.0441, "num_input_tokens_seen": 12869056, "step": 26125 }, { "epoch": 3.448594430513396, "grad_norm": 0.006494051311165094, "learning_rate": 5.314905060588266e-07, "loss": 0.0, "num_input_tokens_seen": 12871744, "step": 26130 }, { "epoch": 3.4492543222911443, "grad_norm": 0.0012551875552162528, "learning_rate": 5.310835481351901e-07, "loss": 0.0, "num_input_tokens_seen": 12874176, "step": 26135 }, { "epoch": 3.4499142140688925, "grad_norm": 0.0018485430628061295, "learning_rate": 5.306766897356747e-07, "loss": 0.0, "num_input_tokens_seen": 12876672, "step": 26140 }, { "epoch": 3.4505741058466413, "grad_norm": 0.04104364663362503, "learning_rate": 5.302699309466338e-07, "loss": 0.0013, "num_input_tokens_seen": 12879232, "step": 26145 }, { "epoch": 3.4512339976243895, "grad_norm": 0.1153852790594101, "learning_rate": 5.298632718543981e-07, "loss": 0.072, "num_input_tokens_seen": 12881856, "step": 26150 }, { "epoch": 3.451893889402138, "grad_norm": 0.00025399410515092313, "learning_rate": 5.294567125452785e-07, "loss": 0.0009, "num_input_tokens_seen": 12884160, "step": 26155 }, { "epoch": 3.4525537811798865, "grad_norm": 0.0006861420115455985, "learning_rate": 5.290502531055648e-07, "loss": 0.0, "num_input_tokens_seen": 12886592, "step": 26160 }, { "epoch": 3.453213672957635, "grad_norm": 0.002263479633256793, "learning_rate": 5.286438936215239e-07, "loss": 0.0308, "num_input_tokens_seen": 12888832, "step": 26165 }, { "epoch": 3.4538735647353835, "grad_norm": 0.007285143714398146, "learning_rate": 5.282376341794033e-07, "loss": 0.0875, "num_input_tokens_seen": 12891200, "step": 26170 }, { "epoch": 3.454533456513132, "grad_norm": 0.027898499742150307, "learning_rate": 5.278314748654287e-07, "loss": 0.0, "num_input_tokens_seen": 12893952, "step": 26175 }, { "epoch": 3.45519334829088, "grad_norm": 75.30522918701172, "learning_rate": 5.274254157658048e-07, "loss": 0.0025, "num_input_tokens_seen": 12896512, "step": 26180 }, { "epoch": 3.455853240068629, "grad_norm": 0.00036869599716737866, "learning_rate": 5.270194569667139e-07, "loss": 0.0, "num_input_tokens_seen": 12898880, "step": 26185 }, { "epoch": 3.456513131846377, "grad_norm": 2.335038423538208, "learning_rate": 5.266135985543181e-07, "loss": 0.0949, "num_input_tokens_seen": 12901632, "step": 26190 }, { "epoch": 3.457173023624126, "grad_norm": 0.0013648836174979806, "learning_rate": 5.262078406147585e-07, "loss": 0.0, "num_input_tokens_seen": 12903808, "step": 26195 }, { "epoch": 3.457832915401874, "grad_norm": 0.0006372526404447854, "learning_rate": 5.258021832341534e-07, "loss": 0.0, "num_input_tokens_seen": 12906112, "step": 26200 }, { "epoch": 3.4584928071796224, "grad_norm": 0.0013917350443080068, "learning_rate": 5.25396626498601e-07, "loss": 0.0007, "num_input_tokens_seen": 12908416, "step": 26205 }, { "epoch": 3.459152698957371, "grad_norm": 0.007651422638446093, "learning_rate": 5.249911704941782e-07, "loss": 0.0, "num_input_tokens_seen": 12910848, "step": 26210 }, { "epoch": 3.4598125907351194, "grad_norm": 0.003433382837101817, "learning_rate": 5.245858153069394e-07, "loss": 0.0, "num_input_tokens_seen": 12913408, "step": 26215 }, { "epoch": 3.460472482512868, "grad_norm": 0.0034286656882613897, "learning_rate": 5.241805610229185e-07, "loss": 0.0016, "num_input_tokens_seen": 12916032, "step": 26220 }, { "epoch": 3.4611323742906164, "grad_norm": 11.557177543640137, "learning_rate": 5.23775407728128e-07, "loss": 0.0252, "num_input_tokens_seen": 12918528, "step": 26225 }, { "epoch": 3.4617922660683647, "grad_norm": 0.00010427210509078577, "learning_rate": 5.23370355508559e-07, "loss": 0.0, "num_input_tokens_seen": 12920960, "step": 26230 }, { "epoch": 3.4624521578461134, "grad_norm": 0.006644572596997023, "learning_rate": 5.229654044501802e-07, "loss": 0.0, "num_input_tokens_seen": 12923456, "step": 26235 }, { "epoch": 3.4631120496238617, "grad_norm": 0.004640599247068167, "learning_rate": 5.2256055463894e-07, "loss": 0.0, "num_input_tokens_seen": 12925760, "step": 26240 }, { "epoch": 3.4637719414016104, "grad_norm": 0.01460598036646843, "learning_rate": 5.221558061607649e-07, "loss": 0.0, "num_input_tokens_seen": 12928384, "step": 26245 }, { "epoch": 3.4644318331793587, "grad_norm": 0.018127327784895897, "learning_rate": 5.217511591015595e-07, "loss": 0.0, "num_input_tokens_seen": 12930816, "step": 26250 }, { "epoch": 3.465091724957107, "grad_norm": 0.000998675706796348, "learning_rate": 5.213466135472072e-07, "loss": 0.0066, "num_input_tokens_seen": 12933568, "step": 26255 }, { "epoch": 3.4657516167348557, "grad_norm": 0.009710119105875492, "learning_rate": 5.209421695835701e-07, "loss": 0.0005, "num_input_tokens_seen": 12935872, "step": 26260 }, { "epoch": 3.466411508512604, "grad_norm": 0.001298540853895247, "learning_rate": 5.205378272964889e-07, "loss": 0.0, "num_input_tokens_seen": 12938176, "step": 26265 }, { "epoch": 3.467071400290352, "grad_norm": 0.08667539060115814, "learning_rate": 5.201335867717818e-07, "loss": 0.0396, "num_input_tokens_seen": 12940416, "step": 26270 }, { "epoch": 3.467731292068101, "grad_norm": 0.022302474826574326, "learning_rate": 5.197294480952452e-07, "loss": 0.0072, "num_input_tokens_seen": 12943040, "step": 26275 }, { "epoch": 3.468391183845849, "grad_norm": 0.008083767257630825, "learning_rate": 5.193254113526561e-07, "loss": 0.0001, "num_input_tokens_seen": 12945408, "step": 26280 }, { "epoch": 3.4690510756235975, "grad_norm": 0.0010398940648883581, "learning_rate": 5.189214766297675e-07, "loss": 0.0396, "num_input_tokens_seen": 12947840, "step": 26285 }, { "epoch": 3.469710967401346, "grad_norm": 0.000684830010868609, "learning_rate": 5.18517644012312e-07, "loss": 0.0, "num_input_tokens_seen": 12950016, "step": 26290 }, { "epoch": 3.4703708591790945, "grad_norm": 1.880915641784668, "learning_rate": 5.181139135859996e-07, "loss": 0.0017, "num_input_tokens_seen": 12952832, "step": 26295 }, { "epoch": 3.471030750956843, "grad_norm": 0.0005736892344430089, "learning_rate": 5.177102854365196e-07, "loss": 0.0648, "num_input_tokens_seen": 12955200, "step": 26300 }, { "epoch": 3.4716906427345915, "grad_norm": 0.0024660734925419092, "learning_rate": 5.173067596495393e-07, "loss": 0.0, "num_input_tokens_seen": 12957568, "step": 26305 }, { "epoch": 3.4723505345123398, "grad_norm": 0.0006041477899998426, "learning_rate": 5.16903336310703e-07, "loss": 0.0, "num_input_tokens_seen": 12960000, "step": 26310 }, { "epoch": 3.4730104262900885, "grad_norm": 0.0010401011677458882, "learning_rate": 5.165000155056363e-07, "loss": 0.0, "num_input_tokens_seen": 12962368, "step": 26315 }, { "epoch": 3.4736703180678368, "grad_norm": 0.00012037860869895667, "learning_rate": 5.1609679731994e-07, "loss": 0.0007, "num_input_tokens_seen": 12964864, "step": 26320 }, { "epoch": 3.4743302098455855, "grad_norm": 30.517963409423828, "learning_rate": 5.156936818391937e-07, "loss": 0.147, "num_input_tokens_seen": 12967488, "step": 26325 }, { "epoch": 3.4749901016233338, "grad_norm": 0.0019168228609487414, "learning_rate": 5.152906691489566e-07, "loss": 0.0, "num_input_tokens_seen": 12970112, "step": 26330 }, { "epoch": 3.475649993401082, "grad_norm": 0.007496009115129709, "learning_rate": 5.148877593347649e-07, "loss": 0.0147, "num_input_tokens_seen": 12972736, "step": 26335 }, { "epoch": 3.4763098851788308, "grad_norm": 0.03964198753237724, "learning_rate": 5.144849524821337e-07, "loss": 0.0, "num_input_tokens_seen": 12975296, "step": 26340 }, { "epoch": 3.476969776956579, "grad_norm": 0.002376055810600519, "learning_rate": 5.140822486765552e-07, "loss": 0.0002, "num_input_tokens_seen": 12977920, "step": 26345 }, { "epoch": 3.4776296687343278, "grad_norm": 0.0003713663318194449, "learning_rate": 5.136796480035007e-07, "loss": 0.0, "num_input_tokens_seen": 12980480, "step": 26350 }, { "epoch": 3.478289560512076, "grad_norm": 0.004427397157996893, "learning_rate": 5.132771505484197e-07, "loss": 0.0, "num_input_tokens_seen": 12983168, "step": 26355 }, { "epoch": 3.4789494522898243, "grad_norm": 0.0713246613740921, "learning_rate": 5.128747563967384e-07, "loss": 0.0004, "num_input_tokens_seen": 12985600, "step": 26360 }, { "epoch": 3.479609344067573, "grad_norm": 0.0026552770286798477, "learning_rate": 5.124724656338626e-07, "loss": 0.0, "num_input_tokens_seen": 12988416, "step": 26365 }, { "epoch": 3.4802692358453213, "grad_norm": 0.012636465951800346, "learning_rate": 5.12070278345176e-07, "loss": 0.0, "num_input_tokens_seen": 12990656, "step": 26370 }, { "epoch": 3.48092912762307, "grad_norm": 0.0008887459989637136, "learning_rate": 5.116681946160391e-07, "loss": 0.0, "num_input_tokens_seen": 12993408, "step": 26375 }, { "epoch": 3.4815890194008183, "grad_norm": 0.0003574863658286631, "learning_rate": 5.112662145317917e-07, "loss": 0.0323, "num_input_tokens_seen": 12995520, "step": 26380 }, { "epoch": 3.4822489111785666, "grad_norm": 0.0011668046936392784, "learning_rate": 5.108643381777511e-07, "loss": 0.1026, "num_input_tokens_seen": 12997824, "step": 26385 }, { "epoch": 3.4829088029563153, "grad_norm": 0.0015160164330154657, "learning_rate": 5.104625656392132e-07, "loss": 0.0431, "num_input_tokens_seen": 13000128, "step": 26390 }, { "epoch": 3.4835686947340636, "grad_norm": 0.001212544390000403, "learning_rate": 5.100608970014501e-07, "loss": 0.0, "num_input_tokens_seen": 13002624, "step": 26395 }, { "epoch": 3.484228586511812, "grad_norm": 0.0007896866300143301, "learning_rate": 5.09659332349714e-07, "loss": 0.0002, "num_input_tokens_seen": 13005120, "step": 26400 }, { "epoch": 3.4848884782895606, "grad_norm": 0.0667625218629837, "learning_rate": 5.092578717692341e-07, "loss": 0.0, "num_input_tokens_seen": 13007808, "step": 26405 }, { "epoch": 3.485548370067309, "grad_norm": 0.0005229983362369239, "learning_rate": 5.088565153452171e-07, "loss": 0.0523, "num_input_tokens_seen": 13010048, "step": 26410 }, { "epoch": 3.486208261845057, "grad_norm": 0.0036990130320191383, "learning_rate": 5.084552631628479e-07, "loss": 0.0, "num_input_tokens_seen": 13012608, "step": 26415 }, { "epoch": 3.486868153622806, "grad_norm": 0.004398142918944359, "learning_rate": 5.080541153072902e-07, "loss": 0.0472, "num_input_tokens_seen": 13015040, "step": 26420 }, { "epoch": 3.487528045400554, "grad_norm": 0.0012921980815008283, "learning_rate": 5.076530718636834e-07, "loss": 0.0, "num_input_tokens_seen": 13017216, "step": 26425 }, { "epoch": 3.488187937178303, "grad_norm": 0.14894424378871918, "learning_rate": 5.07252132917147e-07, "loss": 0.0001, "num_input_tokens_seen": 13019648, "step": 26430 }, { "epoch": 3.488847828956051, "grad_norm": 15.87000560760498, "learning_rate": 5.068512985527773e-07, "loss": 0.0309, "num_input_tokens_seen": 13021824, "step": 26435 }, { "epoch": 3.4895077207337994, "grad_norm": 14.084748268127441, "learning_rate": 5.064505688556486e-07, "loss": 0.0554, "num_input_tokens_seen": 13024192, "step": 26440 }, { "epoch": 3.490167612511548, "grad_norm": 0.20227785408496857, "learning_rate": 5.060499439108127e-07, "loss": 0.0002, "num_input_tokens_seen": 13026624, "step": 26445 }, { "epoch": 3.4908275042892964, "grad_norm": 0.004290015436708927, "learning_rate": 5.056494238032985e-07, "loss": 0.0, "num_input_tokens_seen": 13029184, "step": 26450 }, { "epoch": 3.491487396067045, "grad_norm": 0.12511903047561646, "learning_rate": 5.052490086181151e-07, "loss": 0.0001, "num_input_tokens_seen": 13031616, "step": 26455 }, { "epoch": 3.4921472878447934, "grad_norm": 0.0507790669798851, "learning_rate": 5.048486984402467e-07, "loss": 0.0, "num_input_tokens_seen": 13034048, "step": 26460 }, { "epoch": 3.4928071796225417, "grad_norm": 0.10282183438539505, "learning_rate": 5.044484933546565e-07, "loss": 0.0032, "num_input_tokens_seen": 13036480, "step": 26465 }, { "epoch": 3.4934670714002904, "grad_norm": 0.010798219591379166, "learning_rate": 5.040483934462849e-07, "loss": 0.0, "num_input_tokens_seen": 13039104, "step": 26470 }, { "epoch": 3.4941269631780387, "grad_norm": 2.097705602645874, "learning_rate": 5.036483988000504e-07, "loss": 0.0004, "num_input_tokens_seen": 13041728, "step": 26475 }, { "epoch": 3.4947868549557874, "grad_norm": 0.008413027971982956, "learning_rate": 5.032485095008494e-07, "loss": 0.0001, "num_input_tokens_seen": 13044288, "step": 26480 }, { "epoch": 3.4954467467335357, "grad_norm": 0.0005098399124108255, "learning_rate": 5.028487256335541e-07, "loss": 0.0, "num_input_tokens_seen": 13046784, "step": 26485 }, { "epoch": 3.496106638511284, "grad_norm": 0.0010063229128718376, "learning_rate": 5.024490472830176e-07, "loss": 0.0, "num_input_tokens_seen": 13049536, "step": 26490 }, { "epoch": 3.4967665302890327, "grad_norm": 0.000572339806240052, "learning_rate": 5.020494745340677e-07, "loss": 0.0004, "num_input_tokens_seen": 13051968, "step": 26495 }, { "epoch": 3.497426422066781, "grad_norm": 23.79427146911621, "learning_rate": 5.016500074715108e-07, "loss": 0.0899, "num_input_tokens_seen": 13054592, "step": 26500 }, { "epoch": 3.4980863138445297, "grad_norm": 0.001726845744997263, "learning_rate": 5.01250646180131e-07, "loss": 0.0003, "num_input_tokens_seen": 13057088, "step": 26505 }, { "epoch": 3.498746205622278, "grad_norm": 0.0033422312699258327, "learning_rate": 5.008513907446898e-07, "loss": 0.0, "num_input_tokens_seen": 13059712, "step": 26510 }, { "epoch": 3.4994060974000263, "grad_norm": 0.0009347685845568776, "learning_rate": 5.004522412499267e-07, "loss": 0.0688, "num_input_tokens_seen": 13062336, "step": 26515 }, { "epoch": 3.500065989177775, "grad_norm": 0.03671286627650261, "learning_rate": 5.000531977805575e-07, "loss": 0.0, "num_input_tokens_seen": 13064640, "step": 26520 }, { "epoch": 3.5007258809555233, "grad_norm": 0.0007085995166562498, "learning_rate": 4.99654260421277e-07, "loss": 0.0441, "num_input_tokens_seen": 13067328, "step": 26525 }, { "epoch": 3.501385772733272, "grad_norm": 0.004408668261021376, "learning_rate": 4.992554292567568e-07, "loss": 0.0, "num_input_tokens_seen": 13069824, "step": 26530 }, { "epoch": 3.501385772733272, "eval_loss": 0.18093986809253693, "eval_runtime": 7.8753, "eval_samples_per_second": 855.204, "eval_steps_per_second": 106.916, "num_input_tokens_seen": 13069824, "step": 26530 }, { "epoch": 3.5020456645110203, "grad_norm": 0.05824043229222298, "learning_rate": 4.988567043716452e-07, "loss": 0.0002, "num_input_tokens_seen": 13072000, "step": 26535 }, { "epoch": 3.5027055562887686, "grad_norm": 0.4921933114528656, "learning_rate": 4.984580858505691e-07, "loss": 0.0002, "num_input_tokens_seen": 13074304, "step": 26540 }, { "epoch": 3.503365448066517, "grad_norm": 0.008208317682147026, "learning_rate": 4.980595737781328e-07, "loss": 0.0001, "num_input_tokens_seen": 13076928, "step": 26545 }, { "epoch": 3.5040253398442656, "grad_norm": 0.1486758440732956, "learning_rate": 4.976611682389168e-07, "loss": 0.0012, "num_input_tokens_seen": 13079360, "step": 26550 }, { "epoch": 3.504685231622014, "grad_norm": 0.0012614666484296322, "learning_rate": 4.972628693174802e-07, "loss": 0.0032, "num_input_tokens_seen": 13081792, "step": 26555 }, { "epoch": 3.5053451233997626, "grad_norm": 0.002648318186402321, "learning_rate": 4.96864677098359e-07, "loss": 0.0, "num_input_tokens_seen": 13084352, "step": 26560 }, { "epoch": 3.506005015177511, "grad_norm": 0.00365750421769917, "learning_rate": 4.964665916660671e-07, "loss": 0.0611, "num_input_tokens_seen": 13086784, "step": 26565 }, { "epoch": 3.506664906955259, "grad_norm": 0.10935722291469574, "learning_rate": 4.960686131050945e-07, "loss": 0.0001, "num_input_tokens_seen": 13089344, "step": 26570 }, { "epoch": 3.507324798733008, "grad_norm": 0.006860377267003059, "learning_rate": 4.956707414999095e-07, "loss": 0.0, "num_input_tokens_seen": 13091712, "step": 26575 }, { "epoch": 3.507984690510756, "grad_norm": 0.0029719332233071327, "learning_rate": 4.95272976934958e-07, "loss": 0.0, "num_input_tokens_seen": 13094208, "step": 26580 }, { "epoch": 3.508644582288505, "grad_norm": 18.196840286254883, "learning_rate": 4.948753194946617e-07, "loss": 0.0759, "num_input_tokens_seen": 13096768, "step": 26585 }, { "epoch": 3.509304474066253, "grad_norm": 0.0007499216008000076, "learning_rate": 4.944777692634211e-07, "loss": 0.0011, "num_input_tokens_seen": 13099264, "step": 26590 }, { "epoch": 3.5099643658440014, "grad_norm": 0.0002792077139019966, "learning_rate": 4.940803263256133e-07, "loss": 0.0011, "num_input_tokens_seen": 13101952, "step": 26595 }, { "epoch": 3.51062425762175, "grad_norm": 0.0011871742317453027, "learning_rate": 4.936829907655929e-07, "loss": 0.0, "num_input_tokens_seen": 13104704, "step": 26600 }, { "epoch": 3.5112841493994984, "grad_norm": 0.0007210766198113561, "learning_rate": 4.932857626676914e-07, "loss": 0.0, "num_input_tokens_seen": 13107072, "step": 26605 }, { "epoch": 3.511944041177247, "grad_norm": 0.00100160192232579, "learning_rate": 4.928886421162166e-07, "loss": 0.0846, "num_input_tokens_seen": 13109376, "step": 26610 }, { "epoch": 3.5126039329549954, "grad_norm": 0.019927017390727997, "learning_rate": 4.924916291954561e-07, "loss": 0.0518, "num_input_tokens_seen": 13111936, "step": 26615 }, { "epoch": 3.5132638247327437, "grad_norm": 0.01956903748214245, "learning_rate": 4.920947239896717e-07, "loss": 0.0, "num_input_tokens_seen": 13114432, "step": 26620 }, { "epoch": 3.5139237165104924, "grad_norm": 0.007326959166675806, "learning_rate": 4.916979265831043e-07, "loss": 0.0, "num_input_tokens_seen": 13116800, "step": 26625 }, { "epoch": 3.5145836082882407, "grad_norm": 0.0005818351055495441, "learning_rate": 4.913012370599715e-07, "loss": 0.0003, "num_input_tokens_seen": 13119296, "step": 26630 }, { "epoch": 3.5152435000659894, "grad_norm": 0.06792768090963364, "learning_rate": 4.909046555044672e-07, "loss": 0.0001, "num_input_tokens_seen": 13121664, "step": 26635 }, { "epoch": 3.5159033918437377, "grad_norm": 24.14913558959961, "learning_rate": 4.905081820007634e-07, "loss": 0.0035, "num_input_tokens_seen": 13124288, "step": 26640 }, { "epoch": 3.516563283621486, "grad_norm": 0.002926342422142625, "learning_rate": 4.901118166330077e-07, "loss": 0.0002, "num_input_tokens_seen": 13126784, "step": 26645 }, { "epoch": 3.5172231753992347, "grad_norm": 0.282175749540329, "learning_rate": 4.897155594853275e-07, "loss": 0.0004, "num_input_tokens_seen": 13129216, "step": 26650 }, { "epoch": 3.517883067176983, "grad_norm": 0.006466337013989687, "learning_rate": 4.893194106418246e-07, "loss": 0.0004, "num_input_tokens_seen": 13131456, "step": 26655 }, { "epoch": 3.5185429589547317, "grad_norm": 0.004010173957794905, "learning_rate": 4.889233701865782e-07, "loss": 0.0004, "num_input_tokens_seen": 13133824, "step": 26660 }, { "epoch": 3.51920285073248, "grad_norm": 0.013327274471521378, "learning_rate": 4.885274382036457e-07, "loss": 0.0104, "num_input_tokens_seen": 13136256, "step": 26665 }, { "epoch": 3.5198627425102282, "grad_norm": 0.0012044442119076848, "learning_rate": 4.881316147770607e-07, "loss": 0.0014, "num_input_tokens_seen": 13138944, "step": 26670 }, { "epoch": 3.5205226342879765, "grad_norm": 0.0015769976889714599, "learning_rate": 4.877358999908339e-07, "loss": 0.0, "num_input_tokens_seen": 13141504, "step": 26675 }, { "epoch": 3.5211825260657252, "grad_norm": 0.10278157144784927, "learning_rate": 4.873402939289527e-07, "loss": 0.0001, "num_input_tokens_seen": 13143552, "step": 26680 }, { "epoch": 3.5218424178434735, "grad_norm": 0.00024413972278125584, "learning_rate": 4.869447966753816e-07, "loss": 0.0, "num_input_tokens_seen": 13146048, "step": 26685 }, { "epoch": 3.5225023096212222, "grad_norm": 0.00010406988440081477, "learning_rate": 4.865494083140627e-07, "loss": 0.0, "num_input_tokens_seen": 13148288, "step": 26690 }, { "epoch": 3.5231622013989705, "grad_norm": 0.07188454270362854, "learning_rate": 4.861541289289131e-07, "loss": 0.0001, "num_input_tokens_seen": 13150720, "step": 26695 }, { "epoch": 3.523822093176719, "grad_norm": 8.29110576887615e-05, "learning_rate": 4.857589586038289e-07, "loss": 0.0381, "num_input_tokens_seen": 13153344, "step": 26700 }, { "epoch": 3.5244819849544675, "grad_norm": 128.06494140625, "learning_rate": 4.853638974226822e-07, "loss": 0.0044, "num_input_tokens_seen": 13155840, "step": 26705 }, { "epoch": 3.525141876732216, "grad_norm": 0.4814980626106262, "learning_rate": 4.849689454693212e-07, "loss": 0.0003, "num_input_tokens_seen": 13158272, "step": 26710 }, { "epoch": 3.5258017685099645, "grad_norm": 0.003013887209817767, "learning_rate": 4.845741028275719e-07, "loss": 0.0004, "num_input_tokens_seen": 13160640, "step": 26715 }, { "epoch": 3.526461660287713, "grad_norm": 0.008023404516279697, "learning_rate": 4.841793695812369e-07, "loss": 0.0001, "num_input_tokens_seen": 13163008, "step": 26720 }, { "epoch": 3.527121552065461, "grad_norm": 0.0011691105319187045, "learning_rate": 4.837847458140959e-07, "loss": 0.075, "num_input_tokens_seen": 13165440, "step": 26725 }, { "epoch": 3.52778144384321, "grad_norm": 0.0500674843788147, "learning_rate": 4.833902316099039e-07, "loss": 0.0, "num_input_tokens_seen": 13167680, "step": 26730 }, { "epoch": 3.528441335620958, "grad_norm": 0.0005932372878305614, "learning_rate": 4.829958270523944e-07, "loss": 0.0, "num_input_tokens_seen": 13169728, "step": 26735 }, { "epoch": 3.529101227398707, "grad_norm": 0.012722421437501907, "learning_rate": 4.82601532225277e-07, "loss": 0.0626, "num_input_tokens_seen": 13172096, "step": 26740 }, { "epoch": 3.529761119176455, "grad_norm": 0.013261332176625729, "learning_rate": 4.822073472122374e-07, "loss": 0.0001, "num_input_tokens_seen": 13174528, "step": 26745 }, { "epoch": 3.5304210109542034, "grad_norm": 0.00034718215465545654, "learning_rate": 4.818132720969387e-07, "loss": 0.0, "num_input_tokens_seen": 13176960, "step": 26750 }, { "epoch": 3.531080902731952, "grad_norm": 5.128545761108398, "learning_rate": 4.814193069630211e-07, "loss": 0.002, "num_input_tokens_seen": 13179328, "step": 26755 }, { "epoch": 3.5317407945097004, "grad_norm": 0.00042824147385545075, "learning_rate": 4.810254518941e-07, "loss": 0.0, "num_input_tokens_seen": 13181824, "step": 26760 }, { "epoch": 3.532400686287449, "grad_norm": 0.035186175256967545, "learning_rate": 4.806317069737684e-07, "loss": 0.0, "num_input_tokens_seen": 13184256, "step": 26765 }, { "epoch": 3.5330605780651974, "grad_norm": 0.0006575493607670069, "learning_rate": 4.802380722855961e-07, "loss": 0.0, "num_input_tokens_seen": 13186560, "step": 26770 }, { "epoch": 3.5337204698429456, "grad_norm": 0.0046204268001019955, "learning_rate": 4.798445479131295e-07, "loss": 0.0, "num_input_tokens_seen": 13189120, "step": 26775 }, { "epoch": 3.5343803616206944, "grad_norm": 0.0010483438381925225, "learning_rate": 4.794511339398911e-07, "loss": 0.0, "num_input_tokens_seen": 13191552, "step": 26780 }, { "epoch": 3.5350402533984426, "grad_norm": 0.0001779908052412793, "learning_rate": 4.790578304493791e-07, "loss": 0.0001, "num_input_tokens_seen": 13193856, "step": 26785 }, { "epoch": 3.5357001451761914, "grad_norm": 0.0018780305981636047, "learning_rate": 4.786646375250711e-07, "loss": 0.0891, "num_input_tokens_seen": 13196288, "step": 26790 }, { "epoch": 3.5363600369539396, "grad_norm": 0.0004961126251146197, "learning_rate": 4.78271555250418e-07, "loss": 0.0003, "num_input_tokens_seen": 13198720, "step": 26795 }, { "epoch": 3.537019928731688, "grad_norm": 0.0003020392032340169, "learning_rate": 4.778785837088497e-07, "loss": 0.0001, "num_input_tokens_seen": 13201152, "step": 26800 }, { "epoch": 3.537679820509436, "grad_norm": 0.0001238805562024936, "learning_rate": 4.774857229837708e-07, "loss": 0.0001, "num_input_tokens_seen": 13203584, "step": 26805 }, { "epoch": 3.538339712287185, "grad_norm": 0.0013724665623158216, "learning_rate": 4.770929731585634e-07, "loss": 0.0, "num_input_tokens_seen": 13206016, "step": 26810 }, { "epoch": 3.538999604064933, "grad_norm": 39.34675216674805, "learning_rate": 4.7670033431658605e-07, "loss": 0.0797, "num_input_tokens_seen": 13208256, "step": 26815 }, { "epoch": 3.539659495842682, "grad_norm": 0.0005967853940092027, "learning_rate": 4.7630780654117273e-07, "loss": 0.0, "num_input_tokens_seen": 13210880, "step": 26820 }, { "epoch": 3.54031938762043, "grad_norm": 0.0013899434125050902, "learning_rate": 4.7591538991563594e-07, "loss": 0.0001, "num_input_tokens_seen": 13213248, "step": 26825 }, { "epoch": 3.5409792793981785, "grad_norm": 0.00026954489294439554, "learning_rate": 4.755230845232625e-07, "loss": 0.0, "num_input_tokens_seen": 13215616, "step": 26830 }, { "epoch": 3.541639171175927, "grad_norm": 0.00046751563786529005, "learning_rate": 4.7513089044731603e-07, "loss": 0.0213, "num_input_tokens_seen": 13217920, "step": 26835 }, { "epoch": 3.5422990629536755, "grad_norm": 4.4770134991267696e-05, "learning_rate": 4.7473880777103725e-07, "loss": 0.0, "num_input_tokens_seen": 13220288, "step": 26840 }, { "epoch": 3.542958954731424, "grad_norm": 0.0011170512298122048, "learning_rate": 4.74346836577643e-07, "loss": 0.0487, "num_input_tokens_seen": 13223040, "step": 26845 }, { "epoch": 3.5436188465091725, "grad_norm": 0.1462031602859497, "learning_rate": 4.7395497695032637e-07, "loss": 0.0017, "num_input_tokens_seen": 13225152, "step": 26850 }, { "epoch": 3.5442787382869207, "grad_norm": 0.004078585188835859, "learning_rate": 4.735632289722563e-07, "loss": 0.0, "num_input_tokens_seen": 13227648, "step": 26855 }, { "epoch": 3.5449386300646695, "grad_norm": 0.00015965760394465178, "learning_rate": 4.731715927265787e-07, "loss": 0.0, "num_input_tokens_seen": 13230080, "step": 26860 }, { "epoch": 3.5455985218424177, "grad_norm": 133.80470275878906, "learning_rate": 4.727800682964159e-07, "loss": 0.1657, "num_input_tokens_seen": 13232768, "step": 26865 }, { "epoch": 3.5462584136201665, "grad_norm": 0.006459526252001524, "learning_rate": 4.723886557648655e-07, "loss": 0.0, "num_input_tokens_seen": 13235008, "step": 26870 }, { "epoch": 3.5469183053979148, "grad_norm": 0.025274749845266342, "learning_rate": 4.719973552150022e-07, "loss": 0.0839, "num_input_tokens_seen": 13237696, "step": 26875 }, { "epoch": 3.547578197175663, "grad_norm": 0.004373045172542334, "learning_rate": 4.7160616672987674e-07, "loss": 0.0, "num_input_tokens_seen": 13240192, "step": 26880 }, { "epoch": 3.5482380889534118, "grad_norm": 0.0033362486865371466, "learning_rate": 4.712150903925165e-07, "loss": 0.0, "num_input_tokens_seen": 13242496, "step": 26885 }, { "epoch": 3.54889798073116, "grad_norm": 0.001436105347238481, "learning_rate": 4.708241262859237e-07, "loss": 0.0, "num_input_tokens_seen": 13244864, "step": 26890 }, { "epoch": 3.5495578725089088, "grad_norm": 0.0019471053965389729, "learning_rate": 4.7043327449307813e-07, "loss": 0.0, "num_input_tokens_seen": 13247104, "step": 26895 }, { "epoch": 3.550217764286657, "grad_norm": 0.0009903459576889873, "learning_rate": 4.700425350969357e-07, "loss": 0.0, "num_input_tokens_seen": 13249536, "step": 26900 }, { "epoch": 3.5508776560644053, "grad_norm": 1.5322378873825073, "learning_rate": 4.696519081804271e-07, "loss": 0.001, "num_input_tokens_seen": 13251904, "step": 26905 }, { "epoch": 3.551537547842154, "grad_norm": 0.0005009761080145836, "learning_rate": 4.6926139382646045e-07, "loss": 0.0, "num_input_tokens_seen": 13254336, "step": 26910 }, { "epoch": 3.5521974396199023, "grad_norm": 0.02354281395673752, "learning_rate": 4.6887099211792016e-07, "loss": 0.0, "num_input_tokens_seen": 13257088, "step": 26915 }, { "epoch": 3.552857331397651, "grad_norm": 0.003061910392716527, "learning_rate": 4.6848070313766507e-07, "loss": 0.0427, "num_input_tokens_seen": 13259584, "step": 26920 }, { "epoch": 3.5535172231753993, "grad_norm": 0.0007698460831306875, "learning_rate": 4.68090526968532e-07, "loss": 0.0, "num_input_tokens_seen": 13262208, "step": 26925 }, { "epoch": 3.5541771149531476, "grad_norm": 0.004608421120792627, "learning_rate": 4.677004636933327e-07, "loss": 0.052, "num_input_tokens_seen": 13264704, "step": 26930 }, { "epoch": 3.554837006730896, "grad_norm": 0.06963248550891876, "learning_rate": 4.673105133948557e-07, "loss": 0.0, "num_input_tokens_seen": 13266816, "step": 26935 }, { "epoch": 3.5554968985086446, "grad_norm": 0.00527073023840785, "learning_rate": 4.6692067615586493e-07, "loss": 0.0001, "num_input_tokens_seen": 13269120, "step": 26940 }, { "epoch": 3.556156790286393, "grad_norm": 0.02549833059310913, "learning_rate": 4.6653095205909955e-07, "loss": 0.0001, "num_input_tokens_seen": 13271232, "step": 26945 }, { "epoch": 3.5568166820641416, "grad_norm": 0.009850227274000645, "learning_rate": 4.661413411872772e-07, "loss": 0.0, "num_input_tokens_seen": 13273536, "step": 26950 }, { "epoch": 3.55747657384189, "grad_norm": 3.481121778488159, "learning_rate": 4.6575184362308904e-07, "loss": 0.0281, "num_input_tokens_seen": 13276160, "step": 26955 }, { "epoch": 3.558136465619638, "grad_norm": 0.0004499904753174633, "learning_rate": 4.653624594492033e-07, "loss": 0.0, "num_input_tokens_seen": 13278336, "step": 26960 }, { "epoch": 3.558796357397387, "grad_norm": 0.3562501072883606, "learning_rate": 4.649731887482644e-07, "loss": 0.0001, "num_input_tokens_seen": 13280832, "step": 26965 }, { "epoch": 3.559456249175135, "grad_norm": 0.01617368683218956, "learning_rate": 4.645840316028914e-07, "loss": 0.0, "num_input_tokens_seen": 13283328, "step": 26970 }, { "epoch": 3.560116140952884, "grad_norm": 0.0003085601201746613, "learning_rate": 4.641949880956809e-07, "loss": 0.002, "num_input_tokens_seen": 13285632, "step": 26975 }, { "epoch": 3.560776032730632, "grad_norm": 0.008649193681776524, "learning_rate": 4.638060583092035e-07, "loss": 0.0003, "num_input_tokens_seen": 13287872, "step": 26980 }, { "epoch": 3.5614359245083804, "grad_norm": 0.17648448050022125, "learning_rate": 4.634172423260081e-07, "loss": 0.069, "num_input_tokens_seen": 13290560, "step": 26985 }, { "epoch": 3.562095816286129, "grad_norm": 0.0014605034375563264, "learning_rate": 4.6302854022861735e-07, "loss": 0.0322, "num_input_tokens_seen": 13293056, "step": 26990 }, { "epoch": 3.5627557080638774, "grad_norm": 0.0008046274306252599, "learning_rate": 4.6263995209953024e-07, "loss": 0.0, "num_input_tokens_seen": 13295488, "step": 26995 }, { "epoch": 3.563415599841626, "grad_norm": 0.0001721447624731809, "learning_rate": 4.622514780212219e-07, "loss": 0.0, "num_input_tokens_seen": 13297856, "step": 27000 }, { "epoch": 3.5640754916193744, "grad_norm": 0.009151825681328773, "learning_rate": 4.618631180761434e-07, "loss": 0.0322, "num_input_tokens_seen": 13300416, "step": 27005 }, { "epoch": 3.5647353833971227, "grad_norm": 0.000537705491296947, "learning_rate": 4.6147487234672156e-07, "loss": 0.0, "num_input_tokens_seen": 13302848, "step": 27010 }, { "epoch": 3.5653952751748714, "grad_norm": 0.0005098577821627259, "learning_rate": 4.6108674091535795e-07, "loss": 0.0, "num_input_tokens_seen": 13305344, "step": 27015 }, { "epoch": 3.5660551669526197, "grad_norm": 0.0005886783474124968, "learning_rate": 4.6069872386443107e-07, "loss": 0.0, "num_input_tokens_seen": 13307840, "step": 27020 }, { "epoch": 3.5667150587303684, "grad_norm": 0.000730705913156271, "learning_rate": 4.6031082127629514e-07, "loss": 0.0323, "num_input_tokens_seen": 13310208, "step": 27025 }, { "epoch": 3.5673749505081167, "grad_norm": 0.01606333628296852, "learning_rate": 4.5992303323327885e-07, "loss": 0.0001, "num_input_tokens_seen": 13312576, "step": 27030 }, { "epoch": 3.568034842285865, "grad_norm": 0.006714434828609228, "learning_rate": 4.5953535981768786e-07, "loss": 0.0004, "num_input_tokens_seen": 13314752, "step": 27035 }, { "epoch": 3.5686947340636137, "grad_norm": 0.0010593448532745242, "learning_rate": 4.591478011118034e-07, "loss": 0.0, "num_input_tokens_seen": 13317184, "step": 27040 }, { "epoch": 3.569354625841362, "grad_norm": 0.00020904975826852024, "learning_rate": 4.5876035719788133e-07, "loss": 0.0585, "num_input_tokens_seen": 13320128, "step": 27045 }, { "epoch": 3.5700145176191107, "grad_norm": 0.000504173047374934, "learning_rate": 4.5837302815815394e-07, "loss": 0.0002, "num_input_tokens_seen": 13322816, "step": 27050 }, { "epoch": 3.570674409396859, "grad_norm": 9.949973173206672e-05, "learning_rate": 4.5798581407482927e-07, "loss": 0.0, "num_input_tokens_seen": 13325248, "step": 27055 }, { "epoch": 3.5713343011746073, "grad_norm": 0.0005612291861325502, "learning_rate": 4.5759871503009097e-07, "loss": 0.0693, "num_input_tokens_seen": 13327680, "step": 27060 }, { "epoch": 3.5719941929523555, "grad_norm": 0.03958077356219292, "learning_rate": 4.572117311060972e-07, "loss": 0.0, "num_input_tokens_seen": 13329984, "step": 27065 }, { "epoch": 3.5726540847301043, "grad_norm": 0.0063445377163589, "learning_rate": 4.56824862384983e-07, "loss": 0.0719, "num_input_tokens_seen": 13332288, "step": 27070 }, { "epoch": 3.573313976507853, "grad_norm": 0.0007765923510305583, "learning_rate": 4.564381089488587e-07, "loss": 0.0176, "num_input_tokens_seen": 13334400, "step": 27075 }, { "epoch": 3.5739738682856013, "grad_norm": 0.0009005771134980023, "learning_rate": 4.560514708798093e-07, "loss": 0.1063, "num_input_tokens_seen": 13337024, "step": 27080 }, { "epoch": 3.5746337600633495, "grad_norm": 0.025038283318281174, "learning_rate": 4.556649482598962e-07, "loss": 0.0, "num_input_tokens_seen": 13339328, "step": 27085 }, { "epoch": 3.575293651841098, "grad_norm": 0.0005765855894424021, "learning_rate": 4.552785411711565e-07, "loss": 0.0412, "num_input_tokens_seen": 13341632, "step": 27090 }, { "epoch": 3.5759535436188465, "grad_norm": 3.052090883255005, "learning_rate": 4.548922496956015e-07, "loss": 0.0011, "num_input_tokens_seen": 13343936, "step": 27095 }, { "epoch": 3.576613435396595, "grad_norm": 0.0007220639381557703, "learning_rate": 4.54506073915219e-07, "loss": 0.0, "num_input_tokens_seen": 13346688, "step": 27100 }, { "epoch": 3.5772733271743435, "grad_norm": 0.0006462151068262756, "learning_rate": 4.541200139119723e-07, "loss": 0.0001, "num_input_tokens_seen": 13349376, "step": 27105 }, { "epoch": 3.577933218952092, "grad_norm": 0.009303468279540539, "learning_rate": 4.537340697678e-07, "loss": 0.0673, "num_input_tokens_seen": 13351680, "step": 27110 }, { "epoch": 3.57859311072984, "grad_norm": 0.001099230838008225, "learning_rate": 4.533482415646157e-07, "loss": 0.0003, "num_input_tokens_seen": 13354048, "step": 27115 }, { "epoch": 3.579253002507589, "grad_norm": 11.371760368347168, "learning_rate": 4.529625293843078e-07, "loss": 0.004, "num_input_tokens_seen": 13356416, "step": 27120 }, { "epoch": 3.579912894285337, "grad_norm": 0.012447909452021122, "learning_rate": 4.525769333087425e-07, "loss": 0.0, "num_input_tokens_seen": 13358592, "step": 27125 }, { "epoch": 3.580572786063086, "grad_norm": 0.0015077232383191586, "learning_rate": 4.521914534197585e-07, "loss": 0.0, "num_input_tokens_seen": 13361216, "step": 27130 }, { "epoch": 3.581232677840834, "grad_norm": 0.0013660573167726398, "learning_rate": 4.518060897991721e-07, "loss": 0.0, "num_input_tokens_seen": 13363392, "step": 27135 }, { "epoch": 3.5818925696185824, "grad_norm": 0.003877345472574234, "learning_rate": 4.51420842528773e-07, "loss": 0.0, "num_input_tokens_seen": 13365760, "step": 27140 }, { "epoch": 3.582552461396331, "grad_norm": 0.0003621858195401728, "learning_rate": 4.510357116903275e-07, "loss": 0.0, "num_input_tokens_seen": 13368000, "step": 27145 }, { "epoch": 3.5832123531740794, "grad_norm": 0.00024860017583705485, "learning_rate": 4.5065069736557737e-07, "loss": 0.0, "num_input_tokens_seen": 13370368, "step": 27150 }, { "epoch": 3.583872244951828, "grad_norm": 0.0033539736177772284, "learning_rate": 4.502657996362379e-07, "loss": 0.0, "num_input_tokens_seen": 13373248, "step": 27155 }, { "epoch": 3.5845321367295764, "grad_norm": 1.584349513053894, "learning_rate": 4.498810185840023e-07, "loss": 0.0005, "num_input_tokens_seen": 13375488, "step": 27160 }, { "epoch": 3.5851920285073247, "grad_norm": 1.3402936458587646, "learning_rate": 4.494963542905369e-07, "loss": 0.0873, "num_input_tokens_seen": 13377856, "step": 27165 }, { "epoch": 3.5858519202850734, "grad_norm": 0.013916954398155212, "learning_rate": 4.491118068374835e-07, "loss": 0.0007, "num_input_tokens_seen": 13380544, "step": 27170 }, { "epoch": 3.5865118120628217, "grad_norm": 0.0025454284623265266, "learning_rate": 4.4872737630645984e-07, "loss": 0.0, "num_input_tokens_seen": 13382912, "step": 27175 }, { "epoch": 3.5871717038405704, "grad_norm": 0.0019730727653950453, "learning_rate": 4.4834306277905855e-07, "loss": 0.0001, "num_input_tokens_seen": 13385152, "step": 27180 }, { "epoch": 3.5878315956183187, "grad_norm": 0.0019074628362432122, "learning_rate": 4.4795886633684776e-07, "loss": 0.0, "num_input_tokens_seen": 13387392, "step": 27185 }, { "epoch": 3.588491487396067, "grad_norm": 0.06505951285362244, "learning_rate": 4.4757478706136974e-07, "loss": 0.0472, "num_input_tokens_seen": 13389696, "step": 27190 }, { "epoch": 3.5891513791738157, "grad_norm": 0.006583169102668762, "learning_rate": 4.4719082503414273e-07, "loss": 0.0004, "num_input_tokens_seen": 13391872, "step": 27195 }, { "epoch": 3.589811270951564, "grad_norm": 0.00811641477048397, "learning_rate": 4.468069803366604e-07, "loss": 0.0, "num_input_tokens_seen": 13394048, "step": 27200 }, { "epoch": 3.5904711627293127, "grad_norm": 0.0005078451940789819, "learning_rate": 4.464232530503902e-07, "loss": 0.0, "num_input_tokens_seen": 13396672, "step": 27205 }, { "epoch": 3.591131054507061, "grad_norm": 27.991722106933594, "learning_rate": 4.460396432567759e-07, "loss": 0.1157, "num_input_tokens_seen": 13399232, "step": 27210 }, { "epoch": 3.591790946284809, "grad_norm": 0.0011901069665327668, "learning_rate": 4.456561510372358e-07, "loss": 0.0, "num_input_tokens_seen": 13401600, "step": 27215 }, { "epoch": 3.5924508380625575, "grad_norm": 0.008773591369390488, "learning_rate": 4.4527277647316375e-07, "loss": 0.0, "num_input_tokens_seen": 13404160, "step": 27220 }, { "epoch": 3.593110729840306, "grad_norm": 0.0010363340843468904, "learning_rate": 4.448895196459275e-07, "loss": 0.0016, "num_input_tokens_seen": 13406592, "step": 27225 }, { "epoch": 3.5937706216180545, "grad_norm": 0.9809166789054871, "learning_rate": 4.4450638063687094e-07, "loss": 0.0012, "num_input_tokens_seen": 13409152, "step": 27230 }, { "epoch": 3.594430513395803, "grad_norm": 0.12178061902523041, "learning_rate": 4.4412335952731284e-07, "loss": 0.0001, "num_input_tokens_seen": 13411776, "step": 27235 }, { "epoch": 3.5950904051735515, "grad_norm": 0.0018833117792382836, "learning_rate": 4.437404563985461e-07, "loss": 0.0009, "num_input_tokens_seen": 13414272, "step": 27240 }, { "epoch": 3.5957502969512998, "grad_norm": 17.3830623626709, "learning_rate": 4.4335767133183923e-07, "loss": 0.0169, "num_input_tokens_seen": 13416832, "step": 27245 }, { "epoch": 3.5964101887290485, "grad_norm": 40.81116485595703, "learning_rate": 4.4297500440843616e-07, "loss": 0.075, "num_input_tokens_seen": 13419136, "step": 27250 }, { "epoch": 3.5970700805067968, "grad_norm": 0.001731569180265069, "learning_rate": 4.4259245570955437e-07, "loss": 0.0004, "num_input_tokens_seen": 13421632, "step": 27255 }, { "epoch": 3.5977299722845455, "grad_norm": 0.002258374122902751, "learning_rate": 4.422100253163874e-07, "loss": 0.0001, "num_input_tokens_seen": 13424000, "step": 27260 }, { "epoch": 3.5983898640622938, "grad_norm": 0.0061827958561480045, "learning_rate": 4.4182771331010347e-07, "loss": 0.0337, "num_input_tokens_seen": 13426368, "step": 27265 }, { "epoch": 3.599049755840042, "grad_norm": 5.052387237548828, "learning_rate": 4.414455197718457e-07, "loss": 0.002, "num_input_tokens_seen": 13428608, "step": 27270 }, { "epoch": 3.5997096476177908, "grad_norm": 0.6098920702934265, "learning_rate": 4.410634447827316e-07, "loss": 0.0002, "num_input_tokens_seen": 13430848, "step": 27275 }, { "epoch": 3.600369539395539, "grad_norm": 0.00019582083041314036, "learning_rate": 4.406814884238532e-07, "loss": 0.1103, "num_input_tokens_seen": 13433280, "step": 27280 }, { "epoch": 3.6010294311732878, "grad_norm": 0.08573462069034576, "learning_rate": 4.4029965077627927e-07, "loss": 0.0383, "num_input_tokens_seen": 13435584, "step": 27285 }, { "epoch": 3.601689322951036, "grad_norm": 0.006321965716779232, "learning_rate": 4.399179319210511e-07, "loss": 0.0, "num_input_tokens_seen": 13438080, "step": 27290 }, { "epoch": 3.6023492147287843, "grad_norm": 0.0017303403001278639, "learning_rate": 4.3953633193918606e-07, "loss": 0.0, "num_input_tokens_seen": 13440832, "step": 27295 }, { "epoch": 3.603009106506533, "grad_norm": 20.29780387878418, "learning_rate": 4.3915485091167647e-07, "loss": 0.1113, "num_input_tokens_seen": 13443520, "step": 27300 }, { "epoch": 3.6036689982842813, "grad_norm": 0.0009621708304621279, "learning_rate": 4.3877348891948794e-07, "loss": 0.0, "num_input_tokens_seen": 13445824, "step": 27305 }, { "epoch": 3.60432889006203, "grad_norm": 0.0025498121976852417, "learning_rate": 4.3839224604356274e-07, "loss": 0.0001, "num_input_tokens_seen": 13448512, "step": 27310 }, { "epoch": 3.6049887818397783, "grad_norm": 0.00046822839067317545, "learning_rate": 4.3801112236481575e-07, "loss": 0.0, "num_input_tokens_seen": 13450944, "step": 27315 }, { "epoch": 3.6056486736175266, "grad_norm": 0.003700240980833769, "learning_rate": 4.3763011796413915e-07, "loss": 0.0001, "num_input_tokens_seen": 13453376, "step": 27320 }, { "epoch": 3.6063085653952753, "grad_norm": 0.052340708673000336, "learning_rate": 4.372492329223977e-07, "loss": 0.0001, "num_input_tokens_seen": 13455936, "step": 27325 }, { "epoch": 3.6069684571730236, "grad_norm": 0.023182492703199387, "learning_rate": 4.3686846732043105e-07, "loss": 0.0281, "num_input_tokens_seen": 13458560, "step": 27330 }, { "epoch": 3.6076283489507723, "grad_norm": 0.08036337792873383, "learning_rate": 4.3648782123905424e-07, "loss": 0.0626, "num_input_tokens_seen": 13460864, "step": 27335 }, { "epoch": 3.6082882407285206, "grad_norm": 0.01171074528247118, "learning_rate": 4.361072947590568e-07, "loss": 0.0001, "num_input_tokens_seen": 13463360, "step": 27340 }, { "epoch": 3.608948132506269, "grad_norm": 0.0003683822287712246, "learning_rate": 4.3572688796120307e-07, "loss": 0.0001, "num_input_tokens_seen": 13466112, "step": 27345 }, { "epoch": 3.609608024284017, "grad_norm": 1.8278930187225342, "learning_rate": 4.353466009262309e-07, "loss": 0.001, "num_input_tokens_seen": 13468800, "step": 27350 }, { "epoch": 3.610267916061766, "grad_norm": 0.0013008704409003258, "learning_rate": 4.3496643373485367e-07, "loss": 0.0226, "num_input_tokens_seen": 13471296, "step": 27355 }, { "epoch": 3.610927807839514, "grad_norm": 0.0005728037795051932, "learning_rate": 4.345863864677596e-07, "loss": 0.0, "num_input_tokens_seen": 13473728, "step": 27360 }, { "epoch": 3.611587699617263, "grad_norm": 0.1258080005645752, "learning_rate": 4.342064592056103e-07, "loss": 0.0009, "num_input_tokens_seen": 13476032, "step": 27365 }, { "epoch": 3.612247591395011, "grad_norm": 0.0041159396059811115, "learning_rate": 4.338266520290428e-07, "loss": 0.0, "num_input_tokens_seen": 13478592, "step": 27370 }, { "epoch": 3.6129074831727594, "grad_norm": 25.62404441833496, "learning_rate": 4.3344696501866893e-07, "loss": 0.0688, "num_input_tokens_seen": 13481088, "step": 27375 }, { "epoch": 3.613567374950508, "grad_norm": 0.0026808977127075195, "learning_rate": 4.330673982550738e-07, "loss": 0.0001, "num_input_tokens_seen": 13483328, "step": 27380 }, { "epoch": 3.6142272667282564, "grad_norm": 0.0003970061952713877, "learning_rate": 4.326879518188178e-07, "loss": 0.0, "num_input_tokens_seen": 13485888, "step": 27385 }, { "epoch": 3.614887158506005, "grad_norm": 0.003292738925665617, "learning_rate": 4.323086257904359e-07, "loss": 0.0, "num_input_tokens_seen": 13488512, "step": 27390 }, { "epoch": 3.6155470502837534, "grad_norm": 0.0002803392708301544, "learning_rate": 4.319294202504378e-07, "loss": 0.0, "num_input_tokens_seen": 13490688, "step": 27395 }, { "epoch": 3.6162069420615017, "grad_norm": 0.0007644235738553107, "learning_rate": 4.3155033527930606e-07, "loss": 0.0, "num_input_tokens_seen": 13492992, "step": 27400 }, { "epoch": 3.6168668338392505, "grad_norm": 0.16152521967887878, "learning_rate": 4.3117137095749945e-07, "loss": 0.0201, "num_input_tokens_seen": 13495360, "step": 27405 }, { "epoch": 3.6175267256169987, "grad_norm": 1.5119383335113525, "learning_rate": 4.307925273654505e-07, "loss": 0.0018, "num_input_tokens_seen": 13497792, "step": 27410 }, { "epoch": 3.6181866173947475, "grad_norm": 5.329381019691937e-05, "learning_rate": 4.3041380458356534e-07, "loss": 0.0, "num_input_tokens_seen": 13500224, "step": 27415 }, { "epoch": 3.6188465091724957, "grad_norm": 0.00558890588581562, "learning_rate": 4.3003520269222557e-07, "loss": 0.0, "num_input_tokens_seen": 13502400, "step": 27420 }, { "epoch": 3.619506400950244, "grad_norm": 0.009107373654842377, "learning_rate": 4.29656721771787e-07, "loss": 0.0533, "num_input_tokens_seen": 13505216, "step": 27425 }, { "epoch": 3.6201662927279927, "grad_norm": 0.00024875879171304405, "learning_rate": 4.292783619025788e-07, "loss": 0.0549, "num_input_tokens_seen": 13507520, "step": 27430 }, { "epoch": 3.620826184505741, "grad_norm": 0.019647786393761635, "learning_rate": 4.289001231649054e-07, "loss": 0.0176, "num_input_tokens_seen": 13510144, "step": 27435 }, { "epoch": 3.6214860762834897, "grad_norm": 0.09665568917989731, "learning_rate": 4.285220056390454e-07, "loss": 0.0595, "num_input_tokens_seen": 13512640, "step": 27440 }, { "epoch": 3.622145968061238, "grad_norm": 0.0013785201590508223, "learning_rate": 4.2814400940525164e-07, "loss": 0.0001, "num_input_tokens_seen": 13515200, "step": 27445 }, { "epoch": 3.6228058598389863, "grad_norm": 0.008707517758011818, "learning_rate": 4.2776613454375087e-07, "loss": 0.0, "num_input_tokens_seen": 13517568, "step": 27450 }, { "epoch": 3.623465751616735, "grad_norm": 0.02214517630636692, "learning_rate": 4.2738838113474353e-07, "loss": 0.0004, "num_input_tokens_seen": 13520064, "step": 27455 }, { "epoch": 3.6241256433944833, "grad_norm": 0.008972934447228909, "learning_rate": 4.2701074925840643e-07, "loss": 0.0002, "num_input_tokens_seen": 13522688, "step": 27460 }, { "epoch": 3.624785535172232, "grad_norm": 0.005197230726480484, "learning_rate": 4.266332389948882e-07, "loss": 0.0, "num_input_tokens_seen": 13525376, "step": 27465 }, { "epoch": 3.6254454269499803, "grad_norm": 0.0023379288613796234, "learning_rate": 4.2625585042431347e-07, "loss": 0.0, "num_input_tokens_seen": 13527680, "step": 27470 }, { "epoch": 3.6261053187277286, "grad_norm": 0.013858218677341938, "learning_rate": 4.258785836267792e-07, "loss": 0.0, "num_input_tokens_seen": 13530112, "step": 27475 }, { "epoch": 3.626765210505477, "grad_norm": 0.12098375707864761, "learning_rate": 4.255014386823582e-07, "loss": 0.0002, "num_input_tokens_seen": 13532416, "step": 27480 }, { "epoch": 3.6274251022832256, "grad_norm": 0.0001731712545733899, "learning_rate": 4.25124415671097e-07, "loss": 0.0, "num_input_tokens_seen": 13535040, "step": 27485 }, { "epoch": 3.628084994060974, "grad_norm": 0.0029791847337037325, "learning_rate": 4.24747514673015e-07, "loss": 0.0008, "num_input_tokens_seen": 13537408, "step": 27490 }, { "epoch": 3.6287448858387226, "grad_norm": 12.79725456237793, "learning_rate": 4.24370735768108e-07, "loss": 0.0457, "num_input_tokens_seen": 13539584, "step": 27495 }, { "epoch": 3.629404777616471, "grad_norm": 0.0019413211848586798, "learning_rate": 4.23994079036344e-07, "loss": 0.0004, "num_input_tokens_seen": 13542144, "step": 27500 }, { "epoch": 3.630064669394219, "grad_norm": 0.0006674039177596569, "learning_rate": 4.2361754455766517e-07, "loss": 0.0005, "num_input_tokens_seen": 13544576, "step": 27505 }, { "epoch": 3.630724561171968, "grad_norm": 0.013020829297602177, "learning_rate": 4.232411324119888e-07, "loss": 0.0, "num_input_tokens_seen": 13546880, "step": 27510 }, { "epoch": 3.631384452949716, "grad_norm": 0.00012012778461212292, "learning_rate": 4.228648426792054e-07, "loss": 0.0736, "num_input_tokens_seen": 13549440, "step": 27515 }, { "epoch": 3.632044344727465, "grad_norm": 0.11743014305830002, "learning_rate": 4.224886754391803e-07, "loss": 0.0, "num_input_tokens_seen": 13552000, "step": 27520 }, { "epoch": 3.632704236505213, "grad_norm": 0.009049608372151852, "learning_rate": 4.2211263077175144e-07, "loss": 0.001, "num_input_tokens_seen": 13554688, "step": 27525 }, { "epoch": 3.6333641282829614, "grad_norm": 0.2897687554359436, "learning_rate": 4.2173670875673197e-07, "loss": 0.0611, "num_input_tokens_seen": 13557568, "step": 27530 }, { "epoch": 3.63402402006071, "grad_norm": 0.001158829894848168, "learning_rate": 4.213609094739089e-07, "loss": 0.0, "num_input_tokens_seen": 13560128, "step": 27535 }, { "epoch": 3.6346839118384584, "grad_norm": 0.004890472162514925, "learning_rate": 4.2098523300304236e-07, "loss": 0.0005, "num_input_tokens_seen": 13562560, "step": 27540 }, { "epoch": 3.635343803616207, "grad_norm": 0.35512468218803406, "learning_rate": 4.2060967942386715e-07, "loss": 0.0001, "num_input_tokens_seen": 13564928, "step": 27545 }, { "epoch": 3.6360036953939554, "grad_norm": 0.0007364979828707874, "learning_rate": 4.2023424881609195e-07, "loss": 0.0001, "num_input_tokens_seen": 13567360, "step": 27550 }, { "epoch": 3.6366635871717037, "grad_norm": 0.22686073184013367, "learning_rate": 4.1985894125939947e-07, "loss": 0.0002, "num_input_tokens_seen": 13569920, "step": 27555 }, { "epoch": 3.6373234789494524, "grad_norm": 0.0018527201609686017, "learning_rate": 4.194837568334452e-07, "loss": 0.0065, "num_input_tokens_seen": 13572288, "step": 27560 }, { "epoch": 3.6379833707272007, "grad_norm": 0.0005900466348975897, "learning_rate": 4.191086956178598e-07, "loss": 0.0, "num_input_tokens_seen": 13574720, "step": 27565 }, { "epoch": 3.6386432625049494, "grad_norm": 0.0005124110612086952, "learning_rate": 4.187337576922476e-07, "loss": 0.0, "num_input_tokens_seen": 13577152, "step": 27570 }, { "epoch": 3.6393031542826977, "grad_norm": 0.0024374271742999554, "learning_rate": 4.1835894313618593e-07, "loss": 0.0028, "num_input_tokens_seen": 13579584, "step": 27575 }, { "epoch": 3.639963046060446, "grad_norm": 0.0035164635628461838, "learning_rate": 4.179842520292265e-07, "loss": 0.0, "num_input_tokens_seen": 13582016, "step": 27580 }, { "epoch": 3.6406229378381947, "grad_norm": 0.047881029546260834, "learning_rate": 4.176096844508954e-07, "loss": 0.0, "num_input_tokens_seen": 13584192, "step": 27585 }, { "epoch": 3.641282829615943, "grad_norm": 0.09858769178390503, "learning_rate": 4.17235240480691e-07, "loss": 0.0002, "num_input_tokens_seen": 13586624, "step": 27590 }, { "epoch": 3.6419427213936917, "grad_norm": 0.001338004251010716, "learning_rate": 4.1686092019808685e-07, "loss": 0.0016, "num_input_tokens_seen": 13588864, "step": 27595 }, { "epoch": 3.64260261317144, "grad_norm": 0.00016618985682725906, "learning_rate": 4.164867236825296e-07, "loss": 0.0487, "num_input_tokens_seen": 13591552, "step": 27600 }, { "epoch": 3.6432625049491882, "grad_norm": 9.292057991027832, "learning_rate": 4.1611265101344005e-07, "loss": 0.028, "num_input_tokens_seen": 13593920, "step": 27605 }, { "epoch": 3.6439223967269365, "grad_norm": 0.009514679200947285, "learning_rate": 4.1573870227021224e-07, "loss": 0.066, "num_input_tokens_seen": 13596288, "step": 27610 }, { "epoch": 3.6445822885046852, "grad_norm": 0.0004131619061809033, "learning_rate": 4.153648775322132e-07, "loss": 0.0, "num_input_tokens_seen": 13598464, "step": 27615 }, { "epoch": 3.6452421802824335, "grad_norm": 0.0005515352240763605, "learning_rate": 4.1499117687878606e-07, "loss": 0.0014, "num_input_tokens_seen": 13600704, "step": 27620 }, { "epoch": 3.6459020720601822, "grad_norm": 0.003655769629403949, "learning_rate": 4.1461760038924496e-07, "loss": 0.0487, "num_input_tokens_seen": 13603136, "step": 27625 }, { "epoch": 3.6465619638379305, "grad_norm": 0.00046932691475376487, "learning_rate": 4.142441481428792e-07, "loss": 0.0, "num_input_tokens_seen": 13605440, "step": 27630 }, { "epoch": 3.647221855615679, "grad_norm": 0.015964679419994354, "learning_rate": 4.138708202189516e-07, "loss": 0.0, "num_input_tokens_seen": 13607744, "step": 27635 }, { "epoch": 3.6478817473934275, "grad_norm": 0.001884157769382, "learning_rate": 4.134976166966977e-07, "loss": 0.0, "num_input_tokens_seen": 13610240, "step": 27640 }, { "epoch": 3.648541639171176, "grad_norm": 0.0008349604904651642, "learning_rate": 4.131245376553278e-07, "loss": 0.0754, "num_input_tokens_seen": 13612480, "step": 27645 }, { "epoch": 3.6492015309489245, "grad_norm": 0.029129212722182274, "learning_rate": 4.1275158317402436e-07, "loss": 0.0028, "num_input_tokens_seen": 13615104, "step": 27650 }, { "epoch": 3.649861422726673, "grad_norm": 0.0011978574329987168, "learning_rate": 4.123787533319455e-07, "loss": 0.0, "num_input_tokens_seen": 13617536, "step": 27655 }, { "epoch": 3.650521314504421, "grad_norm": 0.0017305328510701656, "learning_rate": 4.1200604820822103e-07, "loss": 0.0018, "num_input_tokens_seen": 13619904, "step": 27660 }, { "epoch": 3.65118120628217, "grad_norm": 0.0013345396146178246, "learning_rate": 4.1163346788195465e-07, "loss": 0.0, "num_input_tokens_seen": 13622464, "step": 27665 }, { "epoch": 3.651841098059918, "grad_norm": 0.0014696142170578241, "learning_rate": 4.11261012432224e-07, "loss": 0.0018, "num_input_tokens_seen": 13625152, "step": 27670 }, { "epoch": 3.652500989837667, "grad_norm": 0.11104393750429153, "learning_rate": 4.1088868193808023e-07, "loss": 0.0004, "num_input_tokens_seen": 13627712, "step": 27675 }, { "epoch": 3.653160881615415, "grad_norm": 0.0015458540292456746, "learning_rate": 4.10516476478548e-07, "loss": 0.0, "num_input_tokens_seen": 13629952, "step": 27680 }, { "epoch": 3.6538207733931634, "grad_norm": 0.0003642126393970102, "learning_rate": 4.101443961326245e-07, "loss": 0.0688, "num_input_tokens_seen": 13632576, "step": 27685 }, { "epoch": 3.654480665170912, "grad_norm": 0.011262251064181328, "learning_rate": 4.0977244097928164e-07, "loss": 0.0, "num_input_tokens_seen": 13634944, "step": 27690 }, { "epoch": 3.6551405569486604, "grad_norm": 0.0009316291543655097, "learning_rate": 4.094006110974645e-07, "loss": 0.0, "num_input_tokens_seen": 13637248, "step": 27695 }, { "epoch": 3.655800448726409, "grad_norm": 0.029980765655636787, "learning_rate": 4.0902890656609044e-07, "loss": 0.0001, "num_input_tokens_seen": 13639744, "step": 27700 }, { "epoch": 3.6564603405041574, "grad_norm": 40.80172348022461, "learning_rate": 4.0865732746405145e-07, "loss": 0.1378, "num_input_tokens_seen": 13642240, "step": 27705 }, { "epoch": 3.6571202322819056, "grad_norm": 0.0032207188196480274, "learning_rate": 4.08285873870213e-07, "loss": 0.0, "num_input_tokens_seen": 13644672, "step": 27710 }, { "epoch": 3.6577801240596544, "grad_norm": 0.00017996614042203873, "learning_rate": 4.079145458634125e-07, "loss": 0.0004, "num_input_tokens_seen": 13647040, "step": 27715 }, { "epoch": 3.6584400158374026, "grad_norm": 0.00022340656141750515, "learning_rate": 4.075433435224621e-07, "loss": 0.0003, "num_input_tokens_seen": 13649600, "step": 27720 }, { "epoch": 3.6590999076151514, "grad_norm": 0.0005656805005855858, "learning_rate": 4.071722669261468e-07, "loss": 0.0072, "num_input_tokens_seen": 13652352, "step": 27725 }, { "epoch": 3.6597597993928996, "grad_norm": 0.00013878944446332753, "learning_rate": 4.068013161532253e-07, "loss": 0.0, "num_input_tokens_seen": 13654976, "step": 27730 }, { "epoch": 3.660419691170648, "grad_norm": 0.0004583010741043836, "learning_rate": 4.064304912824286e-07, "loss": 0.0007, "num_input_tokens_seen": 13657408, "step": 27735 }, { "epoch": 3.661079582948396, "grad_norm": 1.2669492959976196, "learning_rate": 4.0605979239246166e-07, "loss": 0.0004, "num_input_tokens_seen": 13659392, "step": 27740 }, { "epoch": 3.661739474726145, "grad_norm": 9.382128337165341e-05, "learning_rate": 4.056892195620032e-07, "loss": 0.0, "num_input_tokens_seen": 13661824, "step": 27745 }, { "epoch": 3.6623993665038936, "grad_norm": 0.002449862891808152, "learning_rate": 4.0531877286970397e-07, "loss": 0.0, "num_input_tokens_seen": 13664384, "step": 27750 }, { "epoch": 3.663059258281642, "grad_norm": 0.20444366335868835, "learning_rate": 4.0494845239418873e-07, "loss": 0.0001, "num_input_tokens_seen": 13666688, "step": 27755 }, { "epoch": 3.66371915005939, "grad_norm": 0.0016125873662531376, "learning_rate": 4.045782582140559e-07, "loss": 0.0844, "num_input_tokens_seen": 13669056, "step": 27760 }, { "epoch": 3.6643790418371385, "grad_norm": 0.005589081905782223, "learning_rate": 4.042081904078757e-07, "loss": 0.1125, "num_input_tokens_seen": 13671680, "step": 27765 }, { "epoch": 3.665038933614887, "grad_norm": 0.00195342511869967, "learning_rate": 4.0383824905419263e-07, "loss": 0.0, "num_input_tokens_seen": 13674240, "step": 27770 }, { "epoch": 3.6656988253926355, "grad_norm": 0.0008958014077506959, "learning_rate": 4.034684342315241e-07, "loss": 0.0549, "num_input_tokens_seen": 13676672, "step": 27775 }, { "epoch": 3.666358717170384, "grad_norm": 0.0006265094853006303, "learning_rate": 4.0309874601836114e-07, "loss": 0.0, "num_input_tokens_seen": 13678976, "step": 27780 }, { "epoch": 3.6670186089481325, "grad_norm": 0.0008544830488972366, "learning_rate": 4.0272918449316684e-07, "loss": 0.0626, "num_input_tokens_seen": 13681344, "step": 27785 }, { "epoch": 3.6676785007258808, "grad_norm": 0.0001300136063946411, "learning_rate": 4.0235974973437735e-07, "loss": 0.0, "num_input_tokens_seen": 13683456, "step": 27790 }, { "epoch": 3.6683383925036295, "grad_norm": 0.013282733038067818, "learning_rate": 4.0199044182040385e-07, "loss": 0.0, "num_input_tokens_seen": 13686272, "step": 27795 }, { "epoch": 3.6689982842813778, "grad_norm": 0.00022985691612120718, "learning_rate": 4.016212608296284e-07, "loss": 0.1113, "num_input_tokens_seen": 13688896, "step": 27800 }, { "epoch": 3.6696581760591265, "grad_norm": 0.07989729940891266, "learning_rate": 4.012522068404075e-07, "loss": 0.0472, "num_input_tokens_seen": 13691200, "step": 27805 }, { "epoch": 3.6703180678368748, "grad_norm": 0.0033196040894836187, "learning_rate": 4.0088327993106964e-07, "loss": 0.0, "num_input_tokens_seen": 13693888, "step": 27810 }, { "epoch": 3.670977959614623, "grad_norm": 0.000571762619074434, "learning_rate": 4.005144801799171e-07, "loss": 0.0308, "num_input_tokens_seen": 13696256, "step": 27815 }, { "epoch": 3.6716378513923718, "grad_norm": 0.00014293697313405573, "learning_rate": 4.001458076652253e-07, "loss": 0.1735, "num_input_tokens_seen": 13698752, "step": 27820 }, { "epoch": 3.67229774317012, "grad_norm": 0.00529489666223526, "learning_rate": 3.9977726246524133e-07, "loss": 0.0006, "num_input_tokens_seen": 13701376, "step": 27825 }, { "epoch": 3.6729576349478688, "grad_norm": 0.0012214038288220763, "learning_rate": 3.994088446581877e-07, "loss": 0.0, "num_input_tokens_seen": 13703808, "step": 27830 }, { "epoch": 3.673617526725617, "grad_norm": 0.013352553360164165, "learning_rate": 3.990405543222576e-07, "loss": 0.0, "num_input_tokens_seen": 13706176, "step": 27835 }, { "epoch": 3.6742774185033653, "grad_norm": 0.0007113065803423524, "learning_rate": 3.9867239153561774e-07, "loss": 0.0079, "num_input_tokens_seen": 13708480, "step": 27840 }, { "epoch": 3.674937310281114, "grad_norm": 0.0019296143436804414, "learning_rate": 3.9830435637640825e-07, "loss": 0.0003, "num_input_tokens_seen": 13710848, "step": 27845 }, { "epoch": 3.6755972020588623, "grad_norm": 0.005527435336261988, "learning_rate": 3.979364489227419e-07, "loss": 0.0, "num_input_tokens_seen": 13713024, "step": 27850 }, { "epoch": 3.676257093836611, "grad_norm": 0.15193168818950653, "learning_rate": 3.9756866925270494e-07, "loss": 0.0005, "num_input_tokens_seen": 13715776, "step": 27855 }, { "epoch": 3.6769169856143593, "grad_norm": 0.06932579725980759, "learning_rate": 3.972010174443551e-07, "loss": 0.0004, "num_input_tokens_seen": 13718336, "step": 27860 }, { "epoch": 3.6775768773921076, "grad_norm": 0.008432332426309586, "learning_rate": 3.9683349357572417e-07, "loss": 0.0, "num_input_tokens_seen": 13720896, "step": 27865 }, { "epoch": 3.678236769169856, "grad_norm": 0.00018875522073358297, "learning_rate": 3.9646609772481677e-07, "loss": 0.0, "num_input_tokens_seen": 13723136, "step": 27870 }, { "epoch": 3.6788966609476046, "grad_norm": 0.023280220106244087, "learning_rate": 3.960988299696094e-07, "loss": 0.0005, "num_input_tokens_seen": 13725568, "step": 27875 }, { "epoch": 3.6795565527253533, "grad_norm": 0.00022550317225977778, "learning_rate": 3.957316903880522e-07, "loss": 0.0, "num_input_tokens_seen": 13727936, "step": 27880 }, { "epoch": 3.6802164445031016, "grad_norm": 0.002597242360934615, "learning_rate": 3.953646790580679e-07, "loss": 0.0, "num_input_tokens_seen": 13730240, "step": 27885 }, { "epoch": 3.68087633628085, "grad_norm": 0.00300345616415143, "learning_rate": 3.949977960575525e-07, "loss": 0.0001, "num_input_tokens_seen": 13732928, "step": 27890 }, { "epoch": 3.681536228058598, "grad_norm": 4.238515853881836, "learning_rate": 3.946310414643734e-07, "loss": 0.0109, "num_input_tokens_seen": 13735616, "step": 27895 }, { "epoch": 3.682196119836347, "grad_norm": 0.0008129694033414125, "learning_rate": 3.94264415356372e-07, "loss": 0.0487, "num_input_tokens_seen": 13738048, "step": 27900 }, { "epoch": 3.682856011614095, "grad_norm": 0.004607574548572302, "learning_rate": 3.938979178113625e-07, "loss": 0.121, "num_input_tokens_seen": 13740544, "step": 27905 }, { "epoch": 3.683515903391844, "grad_norm": 0.00020315272558946162, "learning_rate": 3.9353154890713037e-07, "loss": 0.0005, "num_input_tokens_seen": 13743168, "step": 27910 }, { "epoch": 3.684175795169592, "grad_norm": 0.0006358879618346691, "learning_rate": 3.9316530872143537e-07, "loss": 0.0281, "num_input_tokens_seen": 13745408, "step": 27915 }, { "epoch": 3.6848356869473404, "grad_norm": 0.003894600784406066, "learning_rate": 3.927991973320096e-07, "loss": 0.0337, "num_input_tokens_seen": 13747904, "step": 27920 }, { "epoch": 3.685495578725089, "grad_norm": 0.00034760107519105077, "learning_rate": 3.924332148165569e-07, "loss": 0.0579, "num_input_tokens_seen": 13750400, "step": 27925 }, { "epoch": 3.6861554705028374, "grad_norm": 0.001135274418629706, "learning_rate": 3.9206736125275463e-07, "loss": 0.0, "num_input_tokens_seen": 13753024, "step": 27930 }, { "epoch": 3.686815362280586, "grad_norm": 9.904774196911603e-05, "learning_rate": 3.9170163671825265e-07, "loss": 0.0, "num_input_tokens_seen": 13755520, "step": 27935 }, { "epoch": 3.6874752540583344, "grad_norm": 1.1374342441558838, "learning_rate": 3.9133604129067364e-07, "loss": 0.001, "num_input_tokens_seen": 13758336, "step": 27940 }, { "epoch": 3.6881351458360827, "grad_norm": 0.00011956832167925313, "learning_rate": 3.9097057504761234e-07, "loss": 0.0, "num_input_tokens_seen": 13760960, "step": 27945 }, { "epoch": 3.6887950376138314, "grad_norm": 0.0032787492964416742, "learning_rate": 3.9060523806663556e-07, "loss": 0.0, "num_input_tokens_seen": 13763520, "step": 27950 }, { "epoch": 3.6894549293915797, "grad_norm": 0.00014116879901848733, "learning_rate": 3.9024003042528474e-07, "loss": 0.0, "num_input_tokens_seen": 13766144, "step": 27955 }, { "epoch": 3.6901148211693284, "grad_norm": 0.0004441550699993968, "learning_rate": 3.898749522010716e-07, "loss": 0.0, "num_input_tokens_seen": 13768768, "step": 27960 }, { "epoch": 3.6907747129470767, "grad_norm": 0.007838066667318344, "learning_rate": 3.895100034714817e-07, "loss": 0.0674, "num_input_tokens_seen": 13771264, "step": 27965 }, { "epoch": 3.691434604724825, "grad_norm": 0.006580795627087355, "learning_rate": 3.8914518431397305e-07, "loss": 0.0, "num_input_tokens_seen": 13773440, "step": 27970 }, { "epoch": 3.6920944965025737, "grad_norm": 0.045708343386650085, "learning_rate": 3.887804948059752e-07, "loss": 0.0007, "num_input_tokens_seen": 13775872, "step": 27975 }, { "epoch": 3.692754388280322, "grad_norm": 0.03007504530251026, "learning_rate": 3.8841593502489155e-07, "loss": 0.0, "num_input_tokens_seen": 13778112, "step": 27980 }, { "epoch": 3.6934142800580707, "grad_norm": 0.0015138540184125304, "learning_rate": 3.880515050480964e-07, "loss": 0.0, "num_input_tokens_seen": 13780544, "step": 27985 }, { "epoch": 3.694074171835819, "grad_norm": 1.4385871887207031, "learning_rate": 3.876872049529385e-07, "loss": 0.0012, "num_input_tokens_seen": 13782976, "step": 27990 }, { "epoch": 3.6947340636135673, "grad_norm": 0.0011436428176239133, "learning_rate": 3.8732303481673733e-07, "loss": 0.0, "num_input_tokens_seen": 13785472, "step": 27995 }, { "epoch": 3.695393955391316, "grad_norm": 56.57427978515625, "learning_rate": 3.869589947167851e-07, "loss": 0.0673, "num_input_tokens_seen": 13788032, "step": 28000 }, { "epoch": 3.6960538471690643, "grad_norm": 0.00022567510313820094, "learning_rate": 3.8659508473034684e-07, "loss": 0.0, "num_input_tokens_seen": 13790528, "step": 28005 }, { "epoch": 3.696713738946813, "grad_norm": 0.003226806875318289, "learning_rate": 3.8623130493465994e-07, "loss": 0.0029, "num_input_tokens_seen": 13793216, "step": 28010 }, { "epoch": 3.6973736307245613, "grad_norm": 0.0005611493834294379, "learning_rate": 3.8586765540693434e-07, "loss": 0.0001, "num_input_tokens_seen": 13795584, "step": 28015 }, { "epoch": 3.6980335225023095, "grad_norm": 0.00033650652039796114, "learning_rate": 3.855041362243514e-07, "loss": 0.0, "num_input_tokens_seen": 13797952, "step": 28020 }, { "epoch": 3.698693414280058, "grad_norm": 0.018644453957676888, "learning_rate": 3.8514074746406566e-07, "loss": 0.0, "num_input_tokens_seen": 13800576, "step": 28025 }, { "epoch": 3.6993533060578065, "grad_norm": 0.031189944595098495, "learning_rate": 3.847774892032042e-07, "loss": 0.0, "num_input_tokens_seen": 13803136, "step": 28030 }, { "epoch": 3.700013197835555, "grad_norm": 0.45781606435775757, "learning_rate": 3.844143615188652e-07, "loss": 0.0004, "num_input_tokens_seen": 13805248, "step": 28035 }, { "epoch": 3.7006730896133035, "grad_norm": 0.042275186628103256, "learning_rate": 3.8405136448812023e-07, "loss": 0.0, "num_input_tokens_seen": 13807424, "step": 28040 }, { "epoch": 3.701332981391052, "grad_norm": 0.0003274598275311291, "learning_rate": 3.8368849818801317e-07, "loss": 0.058, "num_input_tokens_seen": 13810304, "step": 28045 }, { "epoch": 3.7019928731688, "grad_norm": 0.005129341036081314, "learning_rate": 3.8332576269555906e-07, "loss": 0.0, "num_input_tokens_seen": 13812544, "step": 28050 }, { "epoch": 3.702652764946549, "grad_norm": 0.017442552372813225, "learning_rate": 3.8296315808774616e-07, "loss": 0.0, "num_input_tokens_seen": 13815040, "step": 28055 }, { "epoch": 3.703312656724297, "grad_norm": 0.00010095502511831, "learning_rate": 3.826006844415347e-07, "loss": 0.0, "num_input_tokens_seen": 13817536, "step": 28060 }, { "epoch": 3.703972548502046, "grad_norm": 0.00021431800269056112, "learning_rate": 3.822383418338576e-07, "loss": 0.0, "num_input_tokens_seen": 13819840, "step": 28065 }, { "epoch": 3.704632440279794, "grad_norm": 0.0005621056770905852, "learning_rate": 3.8187613034161847e-07, "loss": 0.0, "num_input_tokens_seen": 13822208, "step": 28070 }, { "epoch": 3.7052923320575424, "grad_norm": 0.0012197456089779735, "learning_rate": 3.815140500416947e-07, "loss": 0.0001, "num_input_tokens_seen": 13824512, "step": 28075 }, { "epoch": 3.705952223835291, "grad_norm": 0.00770995020866394, "learning_rate": 3.811521010109353e-07, "loss": 0.0008, "num_input_tokens_seen": 13826816, "step": 28080 }, { "epoch": 3.7066121156130394, "grad_norm": 0.00017412604938726872, "learning_rate": 3.807902833261609e-07, "loss": 0.0, "num_input_tokens_seen": 13829312, "step": 28085 }, { "epoch": 3.707272007390788, "grad_norm": 0.0002185389748774469, "learning_rate": 3.804285970641649e-07, "loss": 0.0518, "num_input_tokens_seen": 13831680, "step": 28090 }, { "epoch": 3.7079318991685364, "grad_norm": 0.0013204488204792142, "learning_rate": 3.800670423017128e-07, "loss": 0.0533, "num_input_tokens_seen": 13834240, "step": 28095 }, { "epoch": 3.7085917909462847, "grad_norm": 11.457676887512207, "learning_rate": 3.7970561911554143e-07, "loss": 0.0718, "num_input_tokens_seen": 13836352, "step": 28100 }, { "epoch": 3.7092516827240334, "grad_norm": 0.0008938403916545212, "learning_rate": 3.793443275823607e-07, "loss": 0.0, "num_input_tokens_seen": 13838976, "step": 28105 }, { "epoch": 3.7099115745017817, "grad_norm": 0.002852260135114193, "learning_rate": 3.7898316777885195e-07, "loss": 0.0011, "num_input_tokens_seen": 13841472, "step": 28110 }, { "epoch": 3.7105714662795304, "grad_norm": 0.008115318603813648, "learning_rate": 3.786221397816691e-07, "loss": 0.0003, "num_input_tokens_seen": 13843712, "step": 28115 }, { "epoch": 3.7112313580572787, "grad_norm": 0.0013575529446825385, "learning_rate": 3.782612436674375e-07, "loss": 0.0487, "num_input_tokens_seen": 13846208, "step": 28120 }, { "epoch": 3.711891249835027, "grad_norm": 0.0008263670606538653, "learning_rate": 3.7790047951275394e-07, "loss": 0.0401, "num_input_tokens_seen": 13848448, "step": 28125 }, { "epoch": 3.7125511416127757, "grad_norm": 0.07842446118593216, "learning_rate": 3.7753984739418945e-07, "loss": 0.0001, "num_input_tokens_seen": 13850880, "step": 28130 }, { "epoch": 3.713211033390524, "grad_norm": 0.007149757817387581, "learning_rate": 3.771793473882844e-07, "loss": 0.0025, "num_input_tokens_seen": 13853440, "step": 28135 }, { "epoch": 3.7138709251682727, "grad_norm": 0.9949508309364319, "learning_rate": 3.768189795715532e-07, "loss": 0.0912, "num_input_tokens_seen": 13856000, "step": 28140 }, { "epoch": 3.714530816946021, "grad_norm": 0.016497809439897537, "learning_rate": 3.764587440204804e-07, "loss": 0.0, "num_input_tokens_seen": 13858368, "step": 28145 }, { "epoch": 3.715190708723769, "grad_norm": 0.008333483710885048, "learning_rate": 3.7609864081152387e-07, "loss": 0.0002, "num_input_tokens_seen": 13861056, "step": 28150 }, { "epoch": 3.7158506005015175, "grad_norm": 0.0004311394295655191, "learning_rate": 3.7573867002111324e-07, "loss": 0.0, "num_input_tokens_seen": 13863552, "step": 28155 }, { "epoch": 3.716510492279266, "grad_norm": 0.0013469455298036337, "learning_rate": 3.753788317256488e-07, "loss": 0.0, "num_input_tokens_seen": 13866240, "step": 28160 }, { "epoch": 3.7171703840570145, "grad_norm": 0.0002838643849827349, "learning_rate": 3.7501912600150474e-07, "loss": 0.0, "num_input_tokens_seen": 13868480, "step": 28165 }, { "epoch": 3.717830275834763, "grad_norm": 0.0029573384672403336, "learning_rate": 3.7465955292502505e-07, "loss": 0.0, "num_input_tokens_seen": 13870592, "step": 28170 }, { "epoch": 3.7184901676125115, "grad_norm": 0.00849292054772377, "learning_rate": 3.7430011257252735e-07, "loss": 0.0308, "num_input_tokens_seen": 13872704, "step": 28175 }, { "epoch": 3.7191500593902598, "grad_norm": 0.7944841384887695, "learning_rate": 3.7394080502029934e-07, "loss": 0.0003, "num_input_tokens_seen": 13874880, "step": 28180 }, { "epoch": 3.7198099511680085, "grad_norm": 14.134904861450195, "learning_rate": 3.73581630344602e-07, "loss": 0.0302, "num_input_tokens_seen": 13877248, "step": 28185 }, { "epoch": 3.7204698429457568, "grad_norm": 0.06969019025564194, "learning_rate": 3.732225886216678e-07, "loss": 0.0, "num_input_tokens_seen": 13879744, "step": 28190 }, { "epoch": 3.7211297347235055, "grad_norm": 0.06846865266561508, "learning_rate": 3.7286367992769994e-07, "loss": 0.0001, "num_input_tokens_seen": 13882112, "step": 28195 }, { "epoch": 3.721789626501254, "grad_norm": 0.0003618435002863407, "learning_rate": 3.7250490433887473e-07, "loss": 0.0471, "num_input_tokens_seen": 13884416, "step": 28200 }, { "epoch": 3.722449518279002, "grad_norm": 24.19239616394043, "learning_rate": 3.7214626193133993e-07, "loss": 0.0548, "num_input_tokens_seen": 13886656, "step": 28205 }, { "epoch": 3.723109410056751, "grad_norm": 0.0008490770705975592, "learning_rate": 3.717877527812141e-07, "loss": 0.1484, "num_input_tokens_seen": 13889088, "step": 28210 }, { "epoch": 3.723769301834499, "grad_norm": 0.011017916724085808, "learning_rate": 3.714293769645886e-07, "loss": 0.0813, "num_input_tokens_seen": 13891456, "step": 28215 }, { "epoch": 3.724429193612248, "grad_norm": 0.0930677130818367, "learning_rate": 3.710711345575261e-07, "loss": 0.0005, "num_input_tokens_seen": 13894016, "step": 28220 }, { "epoch": 3.725089085389996, "grad_norm": 0.019668880850076675, "learning_rate": 3.707130256360614e-07, "loss": 0.0, "num_input_tokens_seen": 13896512, "step": 28225 }, { "epoch": 3.7257489771677443, "grad_norm": 0.008796813897788525, "learning_rate": 3.7035505027619964e-07, "loss": 0.0181, "num_input_tokens_seen": 13899008, "step": 28230 }, { "epoch": 3.726408868945493, "grad_norm": 53.365760803222656, "learning_rate": 3.6999720855391893e-07, "loss": 0.0411, "num_input_tokens_seen": 13901632, "step": 28235 }, { "epoch": 3.7270687607232413, "grad_norm": 27.761310577392578, "learning_rate": 3.696395005451689e-07, "loss": 0.0704, "num_input_tokens_seen": 13903936, "step": 28240 }, { "epoch": 3.72772865250099, "grad_norm": 0.003962219692766666, "learning_rate": 3.6928192632586986e-07, "loss": 0.0001, "num_input_tokens_seen": 13906368, "step": 28245 }, { "epoch": 3.7283885442787383, "grad_norm": 37.43833541870117, "learning_rate": 3.6892448597191463e-07, "loss": 0.0881, "num_input_tokens_seen": 13908992, "step": 28250 }, { "epoch": 3.7290484360564866, "grad_norm": 11.68809700012207, "learning_rate": 3.685671795591677e-07, "loss": 0.0367, "num_input_tokens_seen": 13911744, "step": 28255 }, { "epoch": 3.7297083278342353, "grad_norm": 0.7247753739356995, "learning_rate": 3.682100071634642e-07, "loss": 0.0018, "num_input_tokens_seen": 13914240, "step": 28260 }, { "epoch": 3.7303682196119836, "grad_norm": 0.0006752077606506646, "learning_rate": 3.6785296886061144e-07, "loss": 0.0003, "num_input_tokens_seen": 13917120, "step": 28265 }, { "epoch": 3.7310281113897323, "grad_norm": 0.010004118084907532, "learning_rate": 3.674960647263885e-07, "loss": 0.0, "num_input_tokens_seen": 13919616, "step": 28270 }, { "epoch": 3.7316880031674806, "grad_norm": 0.00652843713760376, "learning_rate": 3.671392948365458e-07, "loss": 0.0004, "num_input_tokens_seen": 13922560, "step": 28275 }, { "epoch": 3.732347894945229, "grad_norm": 0.09454436600208282, "learning_rate": 3.667826592668052e-07, "loss": 0.0002, "num_input_tokens_seen": 13925376, "step": 28280 }, { "epoch": 3.733007786722977, "grad_norm": 0.10800040513277054, "learning_rate": 3.664261580928589e-07, "loss": 0.0006, "num_input_tokens_seen": 13927936, "step": 28285 }, { "epoch": 3.733667678500726, "grad_norm": 0.0006057324353605509, "learning_rate": 3.660697913903733e-07, "loss": 0.0044, "num_input_tokens_seen": 13930176, "step": 28290 }, { "epoch": 3.734327570278474, "grad_norm": 0.025416888296604156, "learning_rate": 3.6571355923498346e-07, "loss": 0.0001, "num_input_tokens_seen": 13932800, "step": 28295 }, { "epoch": 3.734987462056223, "grad_norm": 0.0017488945741206408, "learning_rate": 3.6535746170229777e-07, "loss": 0.0049, "num_input_tokens_seen": 13935424, "step": 28300 }, { "epoch": 3.735647353833971, "grad_norm": 0.0009809860493987799, "learning_rate": 3.6500149886789524e-07, "loss": 0.0039, "num_input_tokens_seen": 13938176, "step": 28305 }, { "epoch": 3.7363072456117195, "grad_norm": 0.012923874892294407, "learning_rate": 3.64645670807326e-07, "loss": 0.0075, "num_input_tokens_seen": 13940672, "step": 28310 }, { "epoch": 3.736967137389468, "grad_norm": 0.005602862685918808, "learning_rate": 3.642899775961127e-07, "loss": 0.0013, "num_input_tokens_seen": 13943232, "step": 28315 }, { "epoch": 3.7376270291672165, "grad_norm": 0.004117357078939676, "learning_rate": 3.6393441930974734e-07, "loss": 0.0224, "num_input_tokens_seen": 13945472, "step": 28320 }, { "epoch": 3.738286920944965, "grad_norm": 0.0016863815253600478, "learning_rate": 3.6357899602369626e-07, "loss": 0.1003, "num_input_tokens_seen": 13948288, "step": 28325 }, { "epoch": 3.7389468127227135, "grad_norm": 0.009383490309119225, "learning_rate": 3.632237078133946e-07, "loss": 0.0, "num_input_tokens_seen": 13950464, "step": 28330 }, { "epoch": 3.7396067045004617, "grad_norm": 0.06896061450242996, "learning_rate": 3.628685547542496e-07, "loss": 0.0087, "num_input_tokens_seen": 13952640, "step": 28335 }, { "epoch": 3.7402665962782105, "grad_norm": 0.0010050699347630143, "learning_rate": 3.6251353692164e-07, "loss": 0.0, "num_input_tokens_seen": 13954944, "step": 28340 }, { "epoch": 3.7409264880559587, "grad_norm": 0.024658070877194405, "learning_rate": 3.6215865439091587e-07, "loss": 0.0097, "num_input_tokens_seen": 13957184, "step": 28345 }, { "epoch": 3.7415863798337075, "grad_norm": 0.00028802931774407625, "learning_rate": 3.6180390723739883e-07, "loss": 0.0, "num_input_tokens_seen": 13959552, "step": 28350 }, { "epoch": 3.7422462716114557, "grad_norm": 0.00033592613181099296, "learning_rate": 3.614492955363806e-07, "loss": 0.0004, "num_input_tokens_seen": 13962240, "step": 28355 }, { "epoch": 3.742906163389204, "grad_norm": 0.0001732255332171917, "learning_rate": 3.610948193631255e-07, "loss": 0.0844, "num_input_tokens_seen": 13964544, "step": 28360 }, { "epoch": 3.7435660551669527, "grad_norm": 0.0003821653372142464, "learning_rate": 3.607404787928686e-07, "loss": 0.0, "num_input_tokens_seen": 13967040, "step": 28365 }, { "epoch": 3.744225946944701, "grad_norm": 0.0010076966136693954, "learning_rate": 3.6038627390081567e-07, "loss": 0.0, "num_input_tokens_seen": 13969728, "step": 28370 }, { "epoch": 3.7448858387224497, "grad_norm": 0.011342594400048256, "learning_rate": 3.6003220476214445e-07, "loss": 0.0, "num_input_tokens_seen": 13972416, "step": 28375 }, { "epoch": 3.745545730500198, "grad_norm": 0.00022565149993170053, "learning_rate": 3.596782714520037e-07, "loss": 0.0004, "num_input_tokens_seen": 13975040, "step": 28380 }, { "epoch": 3.7462056222779463, "grad_norm": 0.0006821189308539033, "learning_rate": 3.593244740455127e-07, "loss": 0.0004, "num_input_tokens_seen": 13977472, "step": 28385 }, { "epoch": 3.746865514055695, "grad_norm": 0.00014877947978675365, "learning_rate": 3.5897081261776275e-07, "loss": 0.0176, "num_input_tokens_seen": 13979776, "step": 28390 }, { "epoch": 3.7475254058334433, "grad_norm": 0.0004527225682977587, "learning_rate": 3.586172872438158e-07, "loss": 0.0001, "num_input_tokens_seen": 13982336, "step": 28395 }, { "epoch": 3.748185297611192, "grad_norm": 0.000232151331147179, "learning_rate": 3.582638979987054e-07, "loss": 0.0352, "num_input_tokens_seen": 13984768, "step": 28400 }, { "epoch": 3.7488451893889403, "grad_norm": 0.0008931290940381587, "learning_rate": 3.579106449574353e-07, "loss": 0.0521, "num_input_tokens_seen": 13986880, "step": 28405 }, { "epoch": 3.7495050811666886, "grad_norm": 0.0014002250973135233, "learning_rate": 3.5755752819498107e-07, "loss": 0.0, "num_input_tokens_seen": 13989696, "step": 28410 }, { "epoch": 3.750164972944437, "grad_norm": 0.0012177800526842475, "learning_rate": 3.572045477862896e-07, "loss": 0.0109, "num_input_tokens_seen": 13991936, "step": 28415 }, { "epoch": 3.7508248647221856, "grad_norm": 0.00019355359836481512, "learning_rate": 3.568517038062778e-07, "loss": 0.0003, "num_input_tokens_seen": 13994496, "step": 28420 }, { "epoch": 3.751484756499934, "grad_norm": 0.002805389231070876, "learning_rate": 3.564989963298346e-07, "loss": 0.0243, "num_input_tokens_seen": 13996672, "step": 28425 }, { "epoch": 3.751484756499934, "eval_loss": 0.20364880561828613, "eval_runtime": 7.8652, "eval_samples_per_second": 856.306, "eval_steps_per_second": 107.054, "num_input_tokens_seen": 13996672, "step": 28425 }, { "epoch": 3.7521446482776826, "grad_norm": 0.0034394606482237577, "learning_rate": 3.5614642543181996e-07, "loss": 0.1141, "num_input_tokens_seen": 13998976, "step": 28430 }, { "epoch": 3.752804540055431, "grad_norm": 0.0035546584986150265, "learning_rate": 3.5579399118706364e-07, "loss": 0.0002, "num_input_tokens_seen": 14001152, "step": 28435 }, { "epoch": 3.753464431833179, "grad_norm": 0.007250829134136438, "learning_rate": 3.5544169367036783e-07, "loss": 0.0294, "num_input_tokens_seen": 14003520, "step": 28440 }, { "epoch": 3.754124323610928, "grad_norm": 0.001128159579820931, "learning_rate": 3.550895329565049e-07, "loss": 0.0, "num_input_tokens_seen": 14005824, "step": 28445 }, { "epoch": 3.754784215388676, "grad_norm": 0.22034797072410583, "learning_rate": 3.5473750912021894e-07, "loss": 0.0, "num_input_tokens_seen": 14008128, "step": 28450 }, { "epoch": 3.755444107166425, "grad_norm": 0.00023955103824846447, "learning_rate": 3.543856222362239e-07, "loss": 0.0096, "num_input_tokens_seen": 14010560, "step": 28455 }, { "epoch": 3.756103998944173, "grad_norm": 0.0013909810222685337, "learning_rate": 3.540338723792049e-07, "loss": 0.0, "num_input_tokens_seen": 14013184, "step": 28460 }, { "epoch": 3.7567638907219214, "grad_norm": 0.002331098075956106, "learning_rate": 3.5368225962381924e-07, "loss": 0.0, "num_input_tokens_seen": 14015552, "step": 28465 }, { "epoch": 3.75742378249967, "grad_norm": 0.00235453387722373, "learning_rate": 3.533307840446935e-07, "loss": 0.0002, "num_input_tokens_seen": 14018112, "step": 28470 }, { "epoch": 3.7580836742774184, "grad_norm": 0.012911655008792877, "learning_rate": 3.529794457164265e-07, "loss": 0.0, "num_input_tokens_seen": 14020736, "step": 28475 }, { "epoch": 3.758743566055167, "grad_norm": 13.316226959228516, "learning_rate": 3.526282447135862e-07, "loss": 0.0088, "num_input_tokens_seen": 14023104, "step": 28480 }, { "epoch": 3.7594034578329154, "grad_norm": 0.0003600022755563259, "learning_rate": 3.5227718111071316e-07, "loss": 0.0457, "num_input_tokens_seen": 14025664, "step": 28485 }, { "epoch": 3.7600633496106637, "grad_norm": 0.15423204004764557, "learning_rate": 3.519262549823183e-07, "loss": 0.0175, "num_input_tokens_seen": 14027776, "step": 28490 }, { "epoch": 3.7607232413884124, "grad_norm": 0.0038064823020249605, "learning_rate": 3.5157546640288227e-07, "loss": 0.0004, "num_input_tokens_seen": 14030144, "step": 28495 }, { "epoch": 3.7613831331661607, "grad_norm": 0.003817281685769558, "learning_rate": 3.5122481544685857e-07, "loss": 0.0067, "num_input_tokens_seen": 14032576, "step": 28500 }, { "epoch": 3.7620430249439094, "grad_norm": 21.253145217895508, "learning_rate": 3.5087430218866945e-07, "loss": 0.0166, "num_input_tokens_seen": 14034944, "step": 28505 }, { "epoch": 3.7627029167216577, "grad_norm": 0.00010077494516735896, "learning_rate": 3.505239267027094e-07, "loss": 0.0, "num_input_tokens_seen": 14037312, "step": 28510 }, { "epoch": 3.763362808499406, "grad_norm": 9.552456855773926, "learning_rate": 3.5017368906334235e-07, "loss": 0.0208, "num_input_tokens_seen": 14039872, "step": 28515 }, { "epoch": 3.7640227002771547, "grad_norm": 30.936616897583008, "learning_rate": 3.498235893449042e-07, "loss": 0.0324, "num_input_tokens_seen": 14042240, "step": 28520 }, { "epoch": 3.764682592054903, "grad_norm": 0.043132536113262177, "learning_rate": 3.494736276217013e-07, "loss": 0.0005, "num_input_tokens_seen": 14044672, "step": 28525 }, { "epoch": 3.7653424838326517, "grad_norm": 0.002851359313353896, "learning_rate": 3.4912380396800987e-07, "loss": 0.0003, "num_input_tokens_seen": 14047040, "step": 28530 }, { "epoch": 3.7660023756104, "grad_norm": 0.0006523252232000232, "learning_rate": 3.4877411845807783e-07, "loss": 0.0735, "num_input_tokens_seen": 14049856, "step": 28535 }, { "epoch": 3.7666622673881482, "grad_norm": 0.002385950880125165, "learning_rate": 3.4842457116612365e-07, "loss": 0.0352, "num_input_tokens_seen": 14052288, "step": 28540 }, { "epoch": 3.7673221591658965, "grad_norm": 0.0008141055586747825, "learning_rate": 3.4807516216633557e-07, "loss": 0.0001, "num_input_tokens_seen": 14054528, "step": 28545 }, { "epoch": 3.7679820509436452, "grad_norm": 0.014787948690354824, "learning_rate": 3.477258915328735e-07, "loss": 0.0, "num_input_tokens_seen": 14056896, "step": 28550 }, { "epoch": 3.768641942721394, "grad_norm": 0.001489198417402804, "learning_rate": 3.4737675933986744e-07, "loss": 0.0, "num_input_tokens_seen": 14059392, "step": 28555 }, { "epoch": 3.7693018344991422, "grad_norm": 0.00500260666012764, "learning_rate": 3.4702776566141864e-07, "loss": 0.0, "num_input_tokens_seen": 14061696, "step": 28560 }, { "epoch": 3.7699617262768905, "grad_norm": 0.6943917870521545, "learning_rate": 3.4667891057159784e-07, "loss": 0.0002, "num_input_tokens_seen": 14063744, "step": 28565 }, { "epoch": 3.770621618054639, "grad_norm": 0.0005322875804267824, "learning_rate": 3.463301941444473e-07, "loss": 0.0906, "num_input_tokens_seen": 14066240, "step": 28570 }, { "epoch": 3.7712815098323875, "grad_norm": 0.0003532171540427953, "learning_rate": 3.459816164539798e-07, "loss": 0.0, "num_input_tokens_seen": 14068736, "step": 28575 }, { "epoch": 3.771941401610136, "grad_norm": 0.07054813206195831, "learning_rate": 3.456331775741779e-07, "loss": 0.1, "num_input_tokens_seen": 14071232, "step": 28580 }, { "epoch": 3.7726012933878845, "grad_norm": 0.001612989348359406, "learning_rate": 3.452848775789955e-07, "loss": 0.0, "num_input_tokens_seen": 14073664, "step": 28585 }, { "epoch": 3.773261185165633, "grad_norm": 1.5302505493164062, "learning_rate": 3.449367165423571e-07, "loss": 0.0002, "num_input_tokens_seen": 14075904, "step": 28590 }, { "epoch": 3.773921076943381, "grad_norm": 0.011845918372273445, "learning_rate": 3.4458869453815674e-07, "loss": 0.002, "num_input_tokens_seen": 14078208, "step": 28595 }, { "epoch": 3.77458096872113, "grad_norm": 0.006674485746771097, "learning_rate": 3.4424081164025976e-07, "loss": 0.0446, "num_input_tokens_seen": 14080704, "step": 28600 }, { "epoch": 3.775240860498878, "grad_norm": 0.012882623821496964, "learning_rate": 3.4389306792250194e-07, "loss": 0.0, "num_input_tokens_seen": 14083072, "step": 28605 }, { "epoch": 3.775900752276627, "grad_norm": 0.004808145109564066, "learning_rate": 3.435454634586896e-07, "loss": 0.0, "num_input_tokens_seen": 14085248, "step": 28610 }, { "epoch": 3.776560644054375, "grad_norm": 0.0003251858288422227, "learning_rate": 3.431979983225987e-07, "loss": 0.0001, "num_input_tokens_seen": 14087936, "step": 28615 }, { "epoch": 3.7772205358321234, "grad_norm": 22.550294876098633, "learning_rate": 3.4285067258797626e-07, "loss": 0.0266, "num_input_tokens_seen": 14090368, "step": 28620 }, { "epoch": 3.777880427609872, "grad_norm": 0.0005045776488259435, "learning_rate": 3.425034863285404e-07, "loss": 0.0001, "num_input_tokens_seen": 14093568, "step": 28625 }, { "epoch": 3.7785403193876204, "grad_norm": 0.0031521327327936888, "learning_rate": 3.42156439617978e-07, "loss": 0.0341, "num_input_tokens_seen": 14095808, "step": 28630 }, { "epoch": 3.779200211165369, "grad_norm": 0.0008202116587199271, "learning_rate": 3.418095325299475e-07, "loss": 0.0, "num_input_tokens_seen": 14098368, "step": 28635 }, { "epoch": 3.7798601029431174, "grad_norm": 0.07252146303653717, "learning_rate": 3.414627651380778e-07, "loss": 0.0001, "num_input_tokens_seen": 14100736, "step": 28640 }, { "epoch": 3.7805199947208656, "grad_norm": 7.641245611011982e-05, "learning_rate": 3.4111613751596725e-07, "loss": 0.0, "num_input_tokens_seen": 14103104, "step": 28645 }, { "epoch": 3.7811798864986144, "grad_norm": 0.0003453242243267596, "learning_rate": 3.407696497371855e-07, "loss": 0.0004, "num_input_tokens_seen": 14105600, "step": 28650 }, { "epoch": 3.7818397782763626, "grad_norm": 0.0007029441185295582, "learning_rate": 3.40423301875271e-07, "loss": 0.0, "num_input_tokens_seen": 14107712, "step": 28655 }, { "epoch": 3.7824996700541114, "grad_norm": 0.0001722180750221014, "learning_rate": 3.400770940037353e-07, "loss": 0.0008, "num_input_tokens_seen": 14110080, "step": 28660 }, { "epoch": 3.7831595618318596, "grad_norm": 0.0002277433522976935, "learning_rate": 3.3973102619605753e-07, "loss": 0.0054, "num_input_tokens_seen": 14112512, "step": 28665 }, { "epoch": 3.783819453609608, "grad_norm": 0.0007299144635908306, "learning_rate": 3.3938509852568773e-07, "loss": 0.0, "num_input_tokens_seen": 14114624, "step": 28670 }, { "epoch": 3.784479345387356, "grad_norm": 0.07267485558986664, "learning_rate": 3.390393110660471e-07, "loss": 0.0, "num_input_tokens_seen": 14116928, "step": 28675 }, { "epoch": 3.785139237165105, "grad_norm": 0.0013063414953649044, "learning_rate": 3.386936638905263e-07, "loss": 0.0, "num_input_tokens_seen": 14119296, "step": 28680 }, { "epoch": 3.7857991289428536, "grad_norm": 0.008357301354408264, "learning_rate": 3.38348157072487e-07, "loss": 0.0, "num_input_tokens_seen": 14121600, "step": 28685 }, { "epoch": 3.786459020720602, "grad_norm": 0.0002192695828853175, "learning_rate": 3.380027906852596e-07, "loss": 0.0426, "num_input_tokens_seen": 14123840, "step": 28690 }, { "epoch": 3.78711891249835, "grad_norm": 0.00023211569350678474, "learning_rate": 3.3765756480214616e-07, "loss": 0.0, "num_input_tokens_seen": 14126208, "step": 28695 }, { "epoch": 3.7877788042760985, "grad_norm": 0.008235753513872623, "learning_rate": 3.373124794964185e-07, "loss": 0.0, "num_input_tokens_seen": 14128640, "step": 28700 }, { "epoch": 3.788438696053847, "grad_norm": 0.00028870353708043694, "learning_rate": 3.36967534841318e-07, "loss": 0.0004, "num_input_tokens_seen": 14130944, "step": 28705 }, { "epoch": 3.7890985878315955, "grad_norm": 0.019349442794919014, "learning_rate": 3.3662273091005687e-07, "loss": 0.0002, "num_input_tokens_seen": 14133504, "step": 28710 }, { "epoch": 3.789758479609344, "grad_norm": 0.0013696362730115652, "learning_rate": 3.3627806777581777e-07, "loss": 0.0011, "num_input_tokens_seen": 14136128, "step": 28715 }, { "epoch": 3.7904183713870925, "grad_norm": 0.043605487793684006, "learning_rate": 3.35933545511752e-07, "loss": 0.0919, "num_input_tokens_seen": 14138432, "step": 28720 }, { "epoch": 3.7910782631648408, "grad_norm": 0.00044478950439952314, "learning_rate": 3.3558916419098247e-07, "loss": 0.0203, "num_input_tokens_seen": 14140928, "step": 28725 }, { "epoch": 3.7917381549425895, "grad_norm": 0.00045032083289697766, "learning_rate": 3.3524492388660166e-07, "loss": 0.0014, "num_input_tokens_seen": 14143296, "step": 28730 }, { "epoch": 3.7923980467203378, "grad_norm": 0.12213001400232315, "learning_rate": 3.349008246716721e-07, "loss": 0.0003, "num_input_tokens_seen": 14145920, "step": 28735 }, { "epoch": 3.7930579384980865, "grad_norm": 0.015413171611726284, "learning_rate": 3.345568666192261e-07, "loss": 0.0854, "num_input_tokens_seen": 14148480, "step": 28740 }, { "epoch": 3.7937178302758348, "grad_norm": 0.009154030121862888, "learning_rate": 3.3421304980226627e-07, "loss": 0.0, "num_input_tokens_seen": 14150976, "step": 28745 }, { "epoch": 3.794377722053583, "grad_norm": 0.0014472492039203644, "learning_rate": 3.338693742937657e-07, "loss": 0.0001, "num_input_tokens_seen": 14153728, "step": 28750 }, { "epoch": 3.7950376138313318, "grad_norm": 0.8823553323745728, "learning_rate": 3.3352584016666654e-07, "loss": 0.0002, "num_input_tokens_seen": 14156288, "step": 28755 }, { "epoch": 3.79569750560908, "grad_norm": 0.38144099712371826, "learning_rate": 3.3318244749388136e-07, "loss": 0.0006, "num_input_tokens_seen": 14158976, "step": 28760 }, { "epoch": 3.7963573973868288, "grad_norm": 0.0008522819844074547, "learning_rate": 3.328391963482934e-07, "loss": 0.0048, "num_input_tokens_seen": 14161472, "step": 28765 }, { "epoch": 3.797017289164577, "grad_norm": 0.000978624913841486, "learning_rate": 3.3249608680275455e-07, "loss": 0.0, "num_input_tokens_seen": 14163968, "step": 28770 }, { "epoch": 3.7976771809423253, "grad_norm": 0.0012917830608785152, "learning_rate": 3.3215311893008744e-07, "loss": 0.0007, "num_input_tokens_seen": 14166592, "step": 28775 }, { "epoch": 3.798337072720074, "grad_norm": 0.00022298976546153426, "learning_rate": 3.318102928030848e-07, "loss": 0.0502, "num_input_tokens_seen": 14169344, "step": 28780 }, { "epoch": 3.7989969644978223, "grad_norm": 0.001345164841040969, "learning_rate": 3.3146760849450916e-07, "loss": 0.0001, "num_input_tokens_seen": 14171904, "step": 28785 }, { "epoch": 3.799656856275571, "grad_norm": 0.005270975176244974, "learning_rate": 3.3112506607709246e-07, "loss": 0.0, "num_input_tokens_seen": 14174336, "step": 28790 }, { "epoch": 3.8003167480533193, "grad_norm": 0.0006610043928958476, "learning_rate": 3.307826656235363e-07, "loss": 0.0001, "num_input_tokens_seen": 14176640, "step": 28795 }, { "epoch": 3.8009766398310676, "grad_norm": 0.0007634887588210404, "learning_rate": 3.304404072065139e-07, "loss": 0.028, "num_input_tokens_seen": 14178944, "step": 28800 }, { "epoch": 3.8016365316088163, "grad_norm": 0.003030109917744994, "learning_rate": 3.30098290898666e-07, "loss": 0.0, "num_input_tokens_seen": 14181568, "step": 28805 }, { "epoch": 3.8022964233865646, "grad_norm": 0.0007390666869468987, "learning_rate": 3.2975631677260505e-07, "loss": 0.0001, "num_input_tokens_seen": 14184128, "step": 28810 }, { "epoch": 3.8029563151643133, "grad_norm": 0.021578386425971985, "learning_rate": 3.294144849009122e-07, "loss": 0.0001, "num_input_tokens_seen": 14186560, "step": 28815 }, { "epoch": 3.8036162069420616, "grad_norm": 17.728506088256836, "learning_rate": 3.290727953561393e-07, "loss": 0.0382, "num_input_tokens_seen": 14189184, "step": 28820 }, { "epoch": 3.80427609871981, "grad_norm": 0.0007388376980088651, "learning_rate": 3.287312482108071e-07, "loss": 0.0, "num_input_tokens_seen": 14191616, "step": 28825 }, { "epoch": 3.804935990497558, "grad_norm": 0.26988065242767334, "learning_rate": 3.2838984353740593e-07, "loss": 0.0738, "num_input_tokens_seen": 14194432, "step": 28830 }, { "epoch": 3.805595882275307, "grad_norm": 0.09984458237886429, "learning_rate": 3.2804858140839764e-07, "loss": 0.0114, "num_input_tokens_seen": 14197120, "step": 28835 }, { "epoch": 3.806255774053055, "grad_norm": 0.012381376698613167, "learning_rate": 3.277074618962117e-07, "loss": 0.0564, "num_input_tokens_seen": 14199424, "step": 28840 }, { "epoch": 3.806915665830804, "grad_norm": 7.645833829883486e-05, "learning_rate": 3.2736648507324903e-07, "loss": 0.086, "num_input_tokens_seen": 14201792, "step": 28845 }, { "epoch": 3.807575557608552, "grad_norm": 0.006288577802479267, "learning_rate": 3.270256510118786e-07, "loss": 0.0, "num_input_tokens_seen": 14204416, "step": 28850 }, { "epoch": 3.8082354493863004, "grad_norm": 0.0009563757921569049, "learning_rate": 3.2668495978444065e-07, "loss": 0.0016, "num_input_tokens_seen": 14207104, "step": 28855 }, { "epoch": 3.808895341164049, "grad_norm": 4.422444908414036e-05, "learning_rate": 3.2634441146324445e-07, "loss": 0.063, "num_input_tokens_seen": 14209600, "step": 28860 }, { "epoch": 3.8095552329417974, "grad_norm": 13.252848625183105, "learning_rate": 3.26004006120568e-07, "loss": 0.0611, "num_input_tokens_seen": 14211840, "step": 28865 }, { "epoch": 3.810215124719546, "grad_norm": 77.8260269165039, "learning_rate": 3.256637438286612e-07, "loss": 0.094, "num_input_tokens_seen": 14214336, "step": 28870 }, { "epoch": 3.8108750164972944, "grad_norm": 0.018460115417838097, "learning_rate": 3.253236246597417e-07, "loss": 0.0, "num_input_tokens_seen": 14216640, "step": 28875 }, { "epoch": 3.8115349082750427, "grad_norm": 0.020576654002070427, "learning_rate": 3.2498364868599683e-07, "loss": 0.0006, "num_input_tokens_seen": 14219264, "step": 28880 }, { "epoch": 3.8121948000527914, "grad_norm": 69.54541778564453, "learning_rate": 3.2464381597958444e-07, "loss": 0.0352, "num_input_tokens_seen": 14221504, "step": 28885 }, { "epoch": 3.8128546918305397, "grad_norm": 0.0036860329564660788, "learning_rate": 3.243041266126316e-07, "loss": 0.0, "num_input_tokens_seen": 14223744, "step": 28890 }, { "epoch": 3.8135145836082884, "grad_norm": 0.000731949636247009, "learning_rate": 3.239645806572352e-07, "loss": 0.0, "num_input_tokens_seen": 14226304, "step": 28895 }, { "epoch": 3.8141744753860367, "grad_norm": 0.007175610400736332, "learning_rate": 3.2362517818546085e-07, "loss": 0.0213, "num_input_tokens_seen": 14228672, "step": 28900 }, { "epoch": 3.814834367163785, "grad_norm": 0.0002437162766000256, "learning_rate": 3.2328591926934446e-07, "loss": 0.0001, "num_input_tokens_seen": 14231360, "step": 28905 }, { "epoch": 3.8154942589415337, "grad_norm": 0.034070249646902084, "learning_rate": 3.229468039808916e-07, "loss": 0.0001, "num_input_tokens_seen": 14233856, "step": 28910 }, { "epoch": 3.816154150719282, "grad_norm": 0.000245957839069888, "learning_rate": 3.2260783239207644e-07, "loss": 0.0, "num_input_tokens_seen": 14236416, "step": 28915 }, { "epoch": 3.8168140424970307, "grad_norm": 0.028583209961652756, "learning_rate": 3.2226900457484354e-07, "loss": 0.0, "num_input_tokens_seen": 14238848, "step": 28920 }, { "epoch": 3.817473934274779, "grad_norm": 0.30260807275772095, "learning_rate": 3.21930320601107e-07, "loss": 0.0217, "num_input_tokens_seen": 14241728, "step": 28925 }, { "epoch": 3.8181338260525273, "grad_norm": 0.0004018806212116033, "learning_rate": 3.215917805427495e-07, "loss": 0.0001, "num_input_tokens_seen": 14243904, "step": 28930 }, { "epoch": 3.818793717830276, "grad_norm": 0.004953477066010237, "learning_rate": 3.2125338447162386e-07, "loss": 0.0, "num_input_tokens_seen": 14246336, "step": 28935 }, { "epoch": 3.8194536096080243, "grad_norm": 0.5943925380706787, "learning_rate": 3.209151324595523e-07, "loss": 0.0003, "num_input_tokens_seen": 14248512, "step": 28940 }, { "epoch": 3.820113501385773, "grad_norm": 0.06596149504184723, "learning_rate": 3.205770245783267e-07, "loss": 0.0657, "num_input_tokens_seen": 14250944, "step": 28945 }, { "epoch": 3.8207733931635213, "grad_norm": 0.006771343760192394, "learning_rate": 3.202390608997072e-07, "loss": 0.1313, "num_input_tokens_seen": 14253568, "step": 28950 }, { "epoch": 3.8214332849412695, "grad_norm": 0.015993310138583183, "learning_rate": 3.1990124149542465e-07, "loss": 0.0, "num_input_tokens_seen": 14256064, "step": 28955 }, { "epoch": 3.822093176719018, "grad_norm": 52.393898010253906, "learning_rate": 3.1956356643717896e-07, "loss": 0.1208, "num_input_tokens_seen": 14258304, "step": 28960 }, { "epoch": 3.8227530684967665, "grad_norm": 0.0416153222322464, "learning_rate": 3.1922603579663877e-07, "loss": 0.0001, "num_input_tokens_seen": 14260608, "step": 28965 }, { "epoch": 3.823412960274515, "grad_norm": 0.0024769201409071684, "learning_rate": 3.188886496454426e-07, "loss": 0.0001, "num_input_tokens_seen": 14263040, "step": 28970 }, { "epoch": 3.8240728520522635, "grad_norm": 26.59957504272461, "learning_rate": 3.185514080551986e-07, "loss": 0.0844, "num_input_tokens_seen": 14265344, "step": 28975 }, { "epoch": 3.824732743830012, "grad_norm": 0.014377378858625889, "learning_rate": 3.1821431109748344e-07, "loss": 0.0, "num_input_tokens_seen": 14267904, "step": 28980 }, { "epoch": 3.82539263560776, "grad_norm": 0.003249021479859948, "learning_rate": 3.178773588438438e-07, "loss": 0.1095, "num_input_tokens_seen": 14270400, "step": 28985 }, { "epoch": 3.826052527385509, "grad_norm": 0.004426421597599983, "learning_rate": 3.1754055136579463e-07, "loss": 0.0, "num_input_tokens_seen": 14272768, "step": 28990 }, { "epoch": 3.826712419163257, "grad_norm": 0.0005180624430067837, "learning_rate": 3.172038887348221e-07, "loss": 0.0001, "num_input_tokens_seen": 14275136, "step": 28995 }, { "epoch": 3.827372310941006, "grad_norm": 0.0005901391850784421, "learning_rate": 3.168673710223797e-07, "loss": 0.0001, "num_input_tokens_seen": 14277696, "step": 29000 }, { "epoch": 3.828032202718754, "grad_norm": 0.0007131620077416301, "learning_rate": 3.165309982998903e-07, "loss": 0.0001, "num_input_tokens_seen": 14279872, "step": 29005 }, { "epoch": 3.8286920944965024, "grad_norm": 0.0011300368933007121, "learning_rate": 3.161947706387479e-07, "loss": 0.0001, "num_input_tokens_seen": 14282432, "step": 29010 }, { "epoch": 3.829351986274251, "grad_norm": 0.7849501967430115, "learning_rate": 3.1585868811031337e-07, "loss": 0.0004, "num_input_tokens_seen": 14284864, "step": 29015 }, { "epoch": 3.8300118780519994, "grad_norm": 0.0024259083438664675, "learning_rate": 3.155227507859185e-07, "loss": 0.0003, "num_input_tokens_seen": 14287296, "step": 29020 }, { "epoch": 3.830671769829748, "grad_norm": 0.004194983281195164, "learning_rate": 3.1518695873686285e-07, "loss": 0.0674, "num_input_tokens_seen": 14289920, "step": 29025 }, { "epoch": 3.8313316616074964, "grad_norm": 0.0013464801013469696, "learning_rate": 3.1485131203441605e-07, "loss": 0.0001, "num_input_tokens_seen": 14292416, "step": 29030 }, { "epoch": 3.8319915533852447, "grad_norm": 14.864596366882324, "learning_rate": 3.1451581074981726e-07, "loss": 0.0065, "num_input_tokens_seen": 14294592, "step": 29035 }, { "epoch": 3.8326514451629934, "grad_norm": 0.004308775532990694, "learning_rate": 3.141804549542735e-07, "loss": 0.0003, "num_input_tokens_seen": 14297088, "step": 29040 }, { "epoch": 3.8333113369407417, "grad_norm": 32.45550537109375, "learning_rate": 3.138452447189617e-07, "loss": 0.1579, "num_input_tokens_seen": 14299712, "step": 29045 }, { "epoch": 3.8339712287184904, "grad_norm": 0.0018099230946972966, "learning_rate": 3.1351018011502837e-07, "loss": 0.0003, "num_input_tokens_seen": 14301888, "step": 29050 }, { "epoch": 3.8346311204962387, "grad_norm": 0.0064097810536623, "learning_rate": 3.1317526121358785e-07, "loss": 0.0567, "num_input_tokens_seen": 14304256, "step": 29055 }, { "epoch": 3.835291012273987, "grad_norm": 0.1308693140745163, "learning_rate": 3.128404880857244e-07, "loss": 0.0001, "num_input_tokens_seen": 14306752, "step": 29060 }, { "epoch": 3.8359509040517357, "grad_norm": 0.0016533080488443375, "learning_rate": 3.125058608024914e-07, "loss": 0.0005, "num_input_tokens_seen": 14309248, "step": 29065 }, { "epoch": 3.836610795829484, "grad_norm": 0.00398173276335001, "learning_rate": 3.1217137943491144e-07, "loss": 0.0164, "num_input_tokens_seen": 14311872, "step": 29070 }, { "epoch": 3.8372706876072327, "grad_norm": 0.0020239560399204493, "learning_rate": 3.1183704405397494e-07, "loss": 0.0001, "num_input_tokens_seen": 14314368, "step": 29075 }, { "epoch": 3.837930579384981, "grad_norm": 43.287757873535156, "learning_rate": 3.1150285473064255e-07, "loss": 0.0381, "num_input_tokens_seen": 14316864, "step": 29080 }, { "epoch": 3.8385904711627292, "grad_norm": 49.01985168457031, "learning_rate": 3.1116881153584387e-07, "loss": 0.0239, "num_input_tokens_seen": 14319360, "step": 29085 }, { "epoch": 3.8392503629404775, "grad_norm": 0.00048302547656930983, "learning_rate": 3.108349145404764e-07, "loss": 0.0, "num_input_tokens_seen": 14322048, "step": 29090 }, { "epoch": 3.8399102547182262, "grad_norm": 0.0018678263295441866, "learning_rate": 3.1050116381540793e-07, "loss": 0.0382, "num_input_tokens_seen": 14324480, "step": 29095 }, { "epoch": 3.8405701464959745, "grad_norm": 0.43559908866882324, "learning_rate": 3.101675594314747e-07, "loss": 0.0021, "num_input_tokens_seen": 14326976, "step": 29100 }, { "epoch": 3.8412300382737232, "grad_norm": 0.14082865417003632, "learning_rate": 3.098341014594813e-07, "loss": 0.0002, "num_input_tokens_seen": 14329600, "step": 29105 }, { "epoch": 3.8418899300514715, "grad_norm": 0.0021376083604991436, "learning_rate": 3.0950078997020214e-07, "loss": 0.0001, "num_input_tokens_seen": 14331968, "step": 29110 }, { "epoch": 3.84254982182922, "grad_norm": 0.00955208856612444, "learning_rate": 3.0916762503438e-07, "loss": 0.0719, "num_input_tokens_seen": 14334720, "step": 29115 }, { "epoch": 3.8432097136069685, "grad_norm": 0.024048855528235435, "learning_rate": 3.0883460672272724e-07, "loss": 0.0002, "num_input_tokens_seen": 14337088, "step": 29120 }, { "epoch": 3.843869605384717, "grad_norm": 0.05813174322247505, "learning_rate": 3.0850173510592415e-07, "loss": 0.001, "num_input_tokens_seen": 14339264, "step": 29125 }, { "epoch": 3.8445294971624655, "grad_norm": 0.013413142412900925, "learning_rate": 3.0816901025461974e-07, "loss": 0.0442, "num_input_tokens_seen": 14341632, "step": 29130 }, { "epoch": 3.845189388940214, "grad_norm": 0.003118762979283929, "learning_rate": 3.0783643223943367e-07, "loss": 0.0, "num_input_tokens_seen": 14343872, "step": 29135 }, { "epoch": 3.845849280717962, "grad_norm": 0.00037634363980032504, "learning_rate": 3.075040011309522e-07, "loss": 0.0003, "num_input_tokens_seen": 14346240, "step": 29140 }, { "epoch": 3.846509172495711, "grad_norm": 0.003813160816207528, "learning_rate": 3.0717171699973197e-07, "loss": 0.0001, "num_input_tokens_seen": 14348544, "step": 29145 }, { "epoch": 3.847169064273459, "grad_norm": 0.03408288210630417, "learning_rate": 3.068395799162976e-07, "loss": 0.0, "num_input_tokens_seen": 14350784, "step": 29150 }, { "epoch": 3.847828956051208, "grad_norm": 0.004677819553762674, "learning_rate": 3.0650758995114335e-07, "loss": 0.0, "num_input_tokens_seen": 14353408, "step": 29155 }, { "epoch": 3.848488847828956, "grad_norm": 0.051624976098537445, "learning_rate": 3.061757471747313e-07, "loss": 0.0, "num_input_tokens_seen": 14355712, "step": 29160 }, { "epoch": 3.8491487396067043, "grad_norm": 0.0331740602850914, "learning_rate": 3.058440516574918e-07, "loss": 0.0089, "num_input_tokens_seen": 14358016, "step": 29165 }, { "epoch": 3.849808631384453, "grad_norm": 0.0008157134870998561, "learning_rate": 3.055125034698265e-07, "loss": 0.0337, "num_input_tokens_seen": 14360576, "step": 29170 }, { "epoch": 3.8504685231622013, "grad_norm": 0.02268756367266178, "learning_rate": 3.051811026821027e-07, "loss": 0.0004, "num_input_tokens_seen": 14363008, "step": 29175 }, { "epoch": 3.85112841493995, "grad_norm": 0.0016416346188634634, "learning_rate": 3.04849849364659e-07, "loss": 0.0003, "num_input_tokens_seen": 14365376, "step": 29180 }, { "epoch": 3.8517883067176983, "grad_norm": 0.0015501509187743068, "learning_rate": 3.045187435878003e-07, "loss": 0.0001, "num_input_tokens_seen": 14367872, "step": 29185 }, { "epoch": 3.8524481984954466, "grad_norm": 0.0018056805711239576, "learning_rate": 3.041877854218021e-07, "loss": 0.0001, "num_input_tokens_seen": 14370304, "step": 29190 }, { "epoch": 3.8531080902731953, "grad_norm": 0.000659221550449729, "learning_rate": 3.0385697493690807e-07, "loss": 0.0, "num_input_tokens_seen": 14372928, "step": 29195 }, { "epoch": 3.8537679820509436, "grad_norm": 0.19510525465011597, "learning_rate": 3.0352631220332945e-07, "loss": 0.0004, "num_input_tokens_seen": 14375360, "step": 29200 }, { "epoch": 3.8544278738286923, "grad_norm": 11.538311958312988, "learning_rate": 3.031957972912482e-07, "loss": 0.1616, "num_input_tokens_seen": 14377920, "step": 29205 }, { "epoch": 3.8550877656064406, "grad_norm": 0.0012093938421458006, "learning_rate": 3.028654302708131e-07, "loss": 0.0, "num_input_tokens_seen": 14380352, "step": 29210 }, { "epoch": 3.855747657384189, "grad_norm": 1.0782610177993774, "learning_rate": 3.025352112121419e-07, "loss": 0.0007, "num_input_tokens_seen": 14382912, "step": 29215 }, { "epoch": 3.856407549161937, "grad_norm": 4.327111309976317e-05, "learning_rate": 3.022051401853214e-07, "loss": 0.0, "num_input_tokens_seen": 14385344, "step": 29220 }, { "epoch": 3.857067440939686, "grad_norm": 0.07471462339162827, "learning_rate": 3.018752172604069e-07, "loss": 0.0001, "num_input_tokens_seen": 14387840, "step": 29225 }, { "epoch": 3.857727332717434, "grad_norm": 0.00025852222461253405, "learning_rate": 3.015454425074224e-07, "loss": 0.0002, "num_input_tokens_seen": 14390016, "step": 29230 }, { "epoch": 3.858387224495183, "grad_norm": 0.016328802332282066, "learning_rate": 3.0121581599635973e-07, "loss": 0.0, "num_input_tokens_seen": 14392384, "step": 29235 }, { "epoch": 3.859047116272931, "grad_norm": 0.0008336760802194476, "learning_rate": 3.0088633779717975e-07, "loss": 0.0, "num_input_tokens_seen": 14394752, "step": 29240 }, { "epoch": 3.8597070080506795, "grad_norm": 0.0012462205486372113, "learning_rate": 3.0055700797981244e-07, "loss": 0.0004, "num_input_tokens_seen": 14397184, "step": 29245 }, { "epoch": 3.860366899828428, "grad_norm": 0.005022898782044649, "learning_rate": 3.002278266141548e-07, "loss": 0.0, "num_input_tokens_seen": 14399744, "step": 29250 }, { "epoch": 3.8610267916061765, "grad_norm": 0.0011758505133911967, "learning_rate": 2.9989879377007375e-07, "loss": 0.0, "num_input_tokens_seen": 14402112, "step": 29255 }, { "epoch": 3.861686683383925, "grad_norm": 0.0008672875701449811, "learning_rate": 2.995699095174041e-07, "loss": 0.0, "num_input_tokens_seen": 14404544, "step": 29260 }, { "epoch": 3.8623465751616735, "grad_norm": 0.037034500390291214, "learning_rate": 2.9924117392594893e-07, "loss": 0.0, "num_input_tokens_seen": 14406720, "step": 29265 }, { "epoch": 3.8630064669394217, "grad_norm": 0.0004481837968342006, "learning_rate": 2.9891258706547997e-07, "loss": 0.0, "num_input_tokens_seen": 14409472, "step": 29270 }, { "epoch": 3.8636663587171705, "grad_norm": 0.015096590854227543, "learning_rate": 2.9858414900573757e-07, "loss": 0.0366, "num_input_tokens_seen": 14411904, "step": 29275 }, { "epoch": 3.8643262504949187, "grad_norm": 0.0008525225566700101, "learning_rate": 2.9825585981643064e-07, "loss": 0.0411, "num_input_tokens_seen": 14414400, "step": 29280 }, { "epoch": 3.8649861422726675, "grad_norm": 0.030953820794820786, "learning_rate": 2.9792771956723537e-07, "loss": 0.0, "num_input_tokens_seen": 14416896, "step": 29285 }, { "epoch": 3.8656460340504157, "grad_norm": 0.0005172466626390815, "learning_rate": 2.9759972832779776e-07, "loss": 0.0, "num_input_tokens_seen": 14419328, "step": 29290 }, { "epoch": 3.866305925828164, "grad_norm": 1.1345044374465942, "learning_rate": 2.972718861677317e-07, "loss": 0.0002, "num_input_tokens_seen": 14421696, "step": 29295 }, { "epoch": 3.8669658176059127, "grad_norm": 0.002292012795805931, "learning_rate": 2.969441931566188e-07, "loss": 0.0, "num_input_tokens_seen": 14423936, "step": 29300 }, { "epoch": 3.867625709383661, "grad_norm": 0.00030207863892428577, "learning_rate": 2.9661664936400964e-07, "loss": 0.0001, "num_input_tokens_seen": 14426432, "step": 29305 }, { "epoch": 3.8682856011614097, "grad_norm": 0.0012762520927935839, "learning_rate": 2.9628925485942357e-07, "loss": 0.0, "num_input_tokens_seen": 14428672, "step": 29310 }, { "epoch": 3.868945492939158, "grad_norm": 0.014591632410883904, "learning_rate": 2.9596200971234687e-07, "loss": 0.0001, "num_input_tokens_seen": 14431040, "step": 29315 }, { "epoch": 3.8696053847169063, "grad_norm": 0.00019203654665034264, "learning_rate": 2.956349139922357e-07, "loss": 0.0657, "num_input_tokens_seen": 14433472, "step": 29320 }, { "epoch": 3.870265276494655, "grad_norm": 0.06158406659960747, "learning_rate": 2.9530796776851283e-07, "loss": 0.0001, "num_input_tokens_seen": 14435968, "step": 29325 }, { "epoch": 3.8709251682724033, "grad_norm": 0.03689207881689072, "learning_rate": 2.9498117111057155e-07, "loss": 0.0, "num_input_tokens_seen": 14438336, "step": 29330 }, { "epoch": 3.871585060050152, "grad_norm": 0.04720371589064598, "learning_rate": 2.9465452408777126e-07, "loss": 0.0001, "num_input_tokens_seen": 14440896, "step": 29335 }, { "epoch": 3.8722449518279003, "grad_norm": 0.0012083809124305844, "learning_rate": 2.943280267694399e-07, "loss": 0.0, "num_input_tokens_seen": 14443392, "step": 29340 }, { "epoch": 3.8729048436056486, "grad_norm": 0.00017764570657163858, "learning_rate": 2.940016792248754e-07, "loss": 0.0009, "num_input_tokens_seen": 14445952, "step": 29345 }, { "epoch": 3.873564735383397, "grad_norm": 0.0009236446931026876, "learning_rate": 2.936754815233417e-07, "loss": 0.0611, "num_input_tokens_seen": 14448256, "step": 29350 }, { "epoch": 3.8742246271611456, "grad_norm": 0.000315178360324353, "learning_rate": 2.933494337340726e-07, "loss": 0.0, "num_input_tokens_seen": 14450624, "step": 29355 }, { "epoch": 3.8748845189388943, "grad_norm": 0.005778376944363117, "learning_rate": 2.930235359262687e-07, "loss": 0.0, "num_input_tokens_seen": 14453056, "step": 29360 }, { "epoch": 3.8755444107166426, "grad_norm": 0.007840816862881184, "learning_rate": 2.9269778816909985e-07, "loss": 0.0, "num_input_tokens_seen": 14455616, "step": 29365 }, { "epoch": 3.876204302494391, "grad_norm": 0.0008592153899371624, "learning_rate": 2.9237219053170383e-07, "loss": 0.0, "num_input_tokens_seen": 14457792, "step": 29370 }, { "epoch": 3.876864194272139, "grad_norm": 0.0005324503872543573, "learning_rate": 2.920467430831858e-07, "loss": 0.0, "num_input_tokens_seen": 14460096, "step": 29375 }, { "epoch": 3.877524086049888, "grad_norm": 0.12947431206703186, "learning_rate": 2.917214458926199e-07, "loss": 0.0019, "num_input_tokens_seen": 14463040, "step": 29380 }, { "epoch": 3.878183977827636, "grad_norm": 0.001886560581624508, "learning_rate": 2.913962990290486e-07, "loss": 0.0, "num_input_tokens_seen": 14465472, "step": 29385 }, { "epoch": 3.878843869605385, "grad_norm": 0.0008303842623718083, "learning_rate": 2.910713025614812e-07, "loss": 0.0891, "num_input_tokens_seen": 14467968, "step": 29390 }, { "epoch": 3.879503761383133, "grad_norm": 0.003242628648877144, "learning_rate": 2.9074645655889604e-07, "loss": 0.0, "num_input_tokens_seen": 14470656, "step": 29395 }, { "epoch": 3.8801636531608814, "grad_norm": 0.0005708981771022081, "learning_rate": 2.904217610902396e-07, "loss": 0.0004, "num_input_tokens_seen": 14472704, "step": 29400 }, { "epoch": 3.88082354493863, "grad_norm": 0.0017631093505769968, "learning_rate": 2.900972162244263e-07, "loss": 0.0, "num_input_tokens_seen": 14475136, "step": 29405 }, { "epoch": 3.8814834367163784, "grad_norm": 15.109874725341797, "learning_rate": 2.897728220303378e-07, "loss": 0.0491, "num_input_tokens_seen": 14477504, "step": 29410 }, { "epoch": 3.882143328494127, "grad_norm": 0.0022156566847115755, "learning_rate": 2.894485785768248e-07, "loss": 0.0239, "num_input_tokens_seen": 14479936, "step": 29415 }, { "epoch": 3.8828032202718754, "grad_norm": 0.008582009002566338, "learning_rate": 2.891244859327059e-07, "loss": 0.1459, "num_input_tokens_seen": 14482368, "step": 29420 }, { "epoch": 3.8834631120496237, "grad_norm": 0.042062871158123016, "learning_rate": 2.888005441667668e-07, "loss": 0.0, "num_input_tokens_seen": 14484736, "step": 29425 }, { "epoch": 3.8841230038273724, "grad_norm": 144.55355834960938, "learning_rate": 2.88476753347762e-07, "loss": 0.0049, "num_input_tokens_seen": 14487296, "step": 29430 }, { "epoch": 3.8847828956051207, "grad_norm": 0.0020836309995502234, "learning_rate": 2.881531135444143e-07, "loss": 0.0, "num_input_tokens_seen": 14489344, "step": 29435 }, { "epoch": 3.8854427873828694, "grad_norm": 0.7218878865242004, "learning_rate": 2.878296248254131e-07, "loss": 0.0386, "num_input_tokens_seen": 14492096, "step": 29440 }, { "epoch": 3.8861026791606177, "grad_norm": 0.2119138091802597, "learning_rate": 2.8750628725941685e-07, "loss": 0.0065, "num_input_tokens_seen": 14494720, "step": 29445 }, { "epoch": 3.886762570938366, "grad_norm": 0.0037669632583856583, "learning_rate": 2.8718310091505173e-07, "loss": 0.0, "num_input_tokens_seen": 14497280, "step": 29450 }, { "epoch": 3.8874224627161147, "grad_norm": 0.08131466060876846, "learning_rate": 2.8686006586091183e-07, "loss": 0.0001, "num_input_tokens_seen": 14499904, "step": 29455 }, { "epoch": 3.888082354493863, "grad_norm": 10.323993682861328, "learning_rate": 2.8653718216555854e-07, "loss": 0.0854, "num_input_tokens_seen": 14502784, "step": 29460 }, { "epoch": 3.8887422462716117, "grad_norm": 0.00036318288766779006, "learning_rate": 2.8621444989752184e-07, "loss": 0.0, "num_input_tokens_seen": 14504960, "step": 29465 }, { "epoch": 3.88940213804936, "grad_norm": 0.0004923275555483997, "learning_rate": 2.858918691252997e-07, "loss": 0.0, "num_input_tokens_seen": 14507520, "step": 29470 }, { "epoch": 3.8900620298271082, "grad_norm": 0.0016408892115578055, "learning_rate": 2.855694399173568e-07, "loss": 0.0, "num_input_tokens_seen": 14510016, "step": 29475 }, { "epoch": 3.890721921604857, "grad_norm": 0.01022788044065237, "learning_rate": 2.8524716234212684e-07, "loss": 0.0009, "num_input_tokens_seen": 14512512, "step": 29480 }, { "epoch": 3.8913818133826052, "grad_norm": 0.0026390019338577986, "learning_rate": 2.849250364680108e-07, "loss": 0.0, "num_input_tokens_seen": 14514624, "step": 29485 }, { "epoch": 3.892041705160354, "grad_norm": 0.002436618087813258, "learning_rate": 2.846030623633778e-07, "loss": 0.0, "num_input_tokens_seen": 14516928, "step": 29490 }, { "epoch": 3.8927015969381022, "grad_norm": 0.022925982251763344, "learning_rate": 2.842812400965645e-07, "loss": 0.0, "num_input_tokens_seen": 14519296, "step": 29495 }, { "epoch": 3.8933614887158505, "grad_norm": 23.102983474731445, "learning_rate": 2.839595697358744e-07, "loss": 0.1298, "num_input_tokens_seen": 14521728, "step": 29500 }, { "epoch": 3.894021380493599, "grad_norm": 0.002253438113257289, "learning_rate": 2.836380513495812e-07, "loss": 0.0023, "num_input_tokens_seen": 14524224, "step": 29505 }, { "epoch": 3.8946812722713475, "grad_norm": 0.0016631442122161388, "learning_rate": 2.8331668500592374e-07, "loss": 0.0001, "num_input_tokens_seen": 14526912, "step": 29510 }, { "epoch": 3.895341164049096, "grad_norm": 0.004704699851572514, "learning_rate": 2.829954707731104e-07, "loss": 0.0003, "num_input_tokens_seen": 14529280, "step": 29515 }, { "epoch": 3.8960010558268445, "grad_norm": 0.000636607815977186, "learning_rate": 2.826744087193159e-07, "loss": 0.0008, "num_input_tokens_seen": 14531776, "step": 29520 }, { "epoch": 3.896660947604593, "grad_norm": 0.02008168026804924, "learning_rate": 2.823534989126838e-07, "loss": 0.0009, "num_input_tokens_seen": 14533952, "step": 29525 }, { "epoch": 3.897320839382341, "grad_norm": 0.0013008936075493693, "learning_rate": 2.820327414213249e-07, "loss": 0.0, "num_input_tokens_seen": 14536128, "step": 29530 }, { "epoch": 3.89798073116009, "grad_norm": 0.2298823744058609, "learning_rate": 2.8171213631331714e-07, "loss": 0.024, "num_input_tokens_seen": 14539072, "step": 29535 }, { "epoch": 3.898640622937838, "grad_norm": 0.1926964521408081, "learning_rate": 2.813916836567074e-07, "loss": 0.0002, "num_input_tokens_seen": 14541632, "step": 29540 }, { "epoch": 3.899300514715587, "grad_norm": 0.0007152509060688317, "learning_rate": 2.810713835195092e-07, "loss": 0.2078, "num_input_tokens_seen": 14543680, "step": 29545 }, { "epoch": 3.899960406493335, "grad_norm": 0.006173610687255859, "learning_rate": 2.807512359697034e-07, "loss": 0.0, "num_input_tokens_seen": 14546048, "step": 29550 }, { "epoch": 3.9006202982710834, "grad_norm": 0.11755349487066269, "learning_rate": 2.8043124107523943e-07, "loss": 0.0412, "num_input_tokens_seen": 14548480, "step": 29555 }, { "epoch": 3.901280190048832, "grad_norm": 0.4096464216709137, "learning_rate": 2.801113989040338e-07, "loss": 0.0004, "num_input_tokens_seen": 14550976, "step": 29560 }, { "epoch": 3.9019400818265804, "grad_norm": 0.0008820955990813673, "learning_rate": 2.7979170952397103e-07, "loss": 0.0, "num_input_tokens_seen": 14553600, "step": 29565 }, { "epoch": 3.902599973604329, "grad_norm": 0.002521621063351631, "learning_rate": 2.7947217300290225e-07, "loss": 0.0056, "num_input_tokens_seen": 14556160, "step": 29570 }, { "epoch": 3.9032598653820774, "grad_norm": 0.042136672884225845, "learning_rate": 2.791527894086472e-07, "loss": 0.0337, "num_input_tokens_seen": 14558912, "step": 29575 }, { "epoch": 3.9039197571598256, "grad_norm": 0.00030493823578581214, "learning_rate": 2.7883355880899286e-07, "loss": 0.002, "num_input_tokens_seen": 14561408, "step": 29580 }, { "epoch": 3.9045796489375744, "grad_norm": 0.6785450577735901, "learning_rate": 2.78514481271693e-07, "loss": 0.0005, "num_input_tokens_seen": 14563648, "step": 29585 }, { "epoch": 3.9052395407153226, "grad_norm": 0.0023029835429042578, "learning_rate": 2.7819555686447004e-07, "loss": 0.0, "num_input_tokens_seen": 14565888, "step": 29590 }, { "epoch": 3.9058994324930714, "grad_norm": 0.0056864372454583645, "learning_rate": 2.7787678565501347e-07, "loss": 0.008, "num_input_tokens_seen": 14568384, "step": 29595 }, { "epoch": 3.9065593242708196, "grad_norm": 0.0006198826595209539, "learning_rate": 2.7755816771097963e-07, "loss": 0.0, "num_input_tokens_seen": 14570432, "step": 29600 }, { "epoch": 3.907219216048568, "grad_norm": 0.0022146229166537523, "learning_rate": 2.7723970309999324e-07, "loss": 0.0154, "num_input_tokens_seen": 14572864, "step": 29605 }, { "epoch": 3.9078791078263166, "grad_norm": 0.011892078444361687, "learning_rate": 2.7692139188964594e-07, "loss": 0.0, "num_input_tokens_seen": 14575104, "step": 29610 }, { "epoch": 3.908538999604065, "grad_norm": 68.65377044677734, "learning_rate": 2.766032341474975e-07, "loss": 0.0083, "num_input_tokens_seen": 14577664, "step": 29615 }, { "epoch": 3.9091988913818136, "grad_norm": 0.418712854385376, "learning_rate": 2.762852299410738e-07, "loss": 0.0004, "num_input_tokens_seen": 14580352, "step": 29620 }, { "epoch": 3.909858783159562, "grad_norm": 0.0004645238514058292, "learning_rate": 2.759673793378694e-07, "loss": 0.0, "num_input_tokens_seen": 14582784, "step": 29625 }, { "epoch": 3.91051867493731, "grad_norm": 0.008189682848751545, "learning_rate": 2.7564968240534594e-07, "loss": 0.0001, "num_input_tokens_seen": 14585216, "step": 29630 }, { "epoch": 3.9111785667150585, "grad_norm": 0.01741664856672287, "learning_rate": 2.753321392109318e-07, "loss": 0.0611, "num_input_tokens_seen": 14587584, "step": 29635 }, { "epoch": 3.911838458492807, "grad_norm": 0.04173688963055611, "learning_rate": 2.7501474982202345e-07, "loss": 0.0001, "num_input_tokens_seen": 14589952, "step": 29640 }, { "epoch": 3.9124983502705555, "grad_norm": 0.05172654241323471, "learning_rate": 2.7469751430598486e-07, "loss": 0.0001, "num_input_tokens_seen": 14592320, "step": 29645 }, { "epoch": 3.913158242048304, "grad_norm": 0.009400231763720512, "learning_rate": 2.743804327301462e-07, "loss": 0.0266, "num_input_tokens_seen": 14594560, "step": 29650 }, { "epoch": 3.9138181338260525, "grad_norm": 0.01464253943413496, "learning_rate": 2.7406350516180666e-07, "loss": 0.0725, "num_input_tokens_seen": 14597248, "step": 29655 }, { "epoch": 3.9144780256038008, "grad_norm": 0.0002989550703205168, "learning_rate": 2.7374673166823057e-07, "loss": 0.0, "num_input_tokens_seen": 14599488, "step": 29660 }, { "epoch": 3.9151379173815495, "grad_norm": 0.0020613304805010557, "learning_rate": 2.7343011231665227e-07, "loss": 0.0, "num_input_tokens_seen": 14601728, "step": 29665 }, { "epoch": 3.9157978091592978, "grad_norm": 0.001036074128933251, "learning_rate": 2.731136471742712e-07, "loss": 0.0, "num_input_tokens_seen": 14604160, "step": 29670 }, { "epoch": 3.9164577009370465, "grad_norm": 0.0002730927080847323, "learning_rate": 2.7279733630825417e-07, "loss": 0.0, "num_input_tokens_seen": 14606592, "step": 29675 }, { "epoch": 3.9171175927147948, "grad_norm": 0.05069692060351372, "learning_rate": 2.7248117978573725e-07, "loss": 0.001, "num_input_tokens_seen": 14609024, "step": 29680 }, { "epoch": 3.917777484492543, "grad_norm": 0.0025865097995847464, "learning_rate": 2.721651776738212e-07, "loss": 0.1096, "num_input_tokens_seen": 14611392, "step": 29685 }, { "epoch": 3.9184373762702918, "grad_norm": 0.5897282361984253, "learning_rate": 2.71849330039576e-07, "loss": 0.0004, "num_input_tokens_seen": 14613760, "step": 29690 }, { "epoch": 3.91909726804804, "grad_norm": 0.007200206164270639, "learning_rate": 2.715336369500374e-07, "loss": 0.0, "num_input_tokens_seen": 14616128, "step": 29695 }, { "epoch": 3.9197571598257888, "grad_norm": 0.001574324443936348, "learning_rate": 2.712180984722091e-07, "loss": 0.0441, "num_input_tokens_seen": 14618816, "step": 29700 }, { "epoch": 3.920417051603537, "grad_norm": 0.1959327608346939, "learning_rate": 2.7090271467306235e-07, "loss": 0.0002, "num_input_tokens_seen": 14621184, "step": 29705 }, { "epoch": 3.9210769433812853, "grad_norm": 0.036899324506521225, "learning_rate": 2.705874856195344e-07, "loss": 0.0, "num_input_tokens_seen": 14623936, "step": 29710 }, { "epoch": 3.921736835159034, "grad_norm": 0.007074782159179449, "learning_rate": 2.702724113785305e-07, "loss": 0.0797, "num_input_tokens_seen": 14626176, "step": 29715 }, { "epoch": 3.9223967269367823, "grad_norm": 0.3928063213825226, "learning_rate": 2.6995749201692353e-07, "loss": 0.0506, "num_input_tokens_seen": 14628608, "step": 29720 }, { "epoch": 3.923056618714531, "grad_norm": 0.0009985992219299078, "learning_rate": 2.696427276015518e-07, "loss": 0.0011, "num_input_tokens_seen": 14631424, "step": 29725 }, { "epoch": 3.9237165104922793, "grad_norm": 0.00414725998416543, "learning_rate": 2.693281181992225e-07, "loss": 0.0049, "num_input_tokens_seen": 14633792, "step": 29730 }, { "epoch": 3.9243764022700276, "grad_norm": 0.0017239763401448727, "learning_rate": 2.6901366387670885e-07, "loss": 0.0009, "num_input_tokens_seen": 14636352, "step": 29735 }, { "epoch": 3.9250362940477763, "grad_norm": 0.0003734455385711044, "learning_rate": 2.6869936470075214e-07, "loss": 0.0001, "num_input_tokens_seen": 14638784, "step": 29740 }, { "epoch": 3.9256961858255246, "grad_norm": 0.0018246241379529238, "learning_rate": 2.6838522073805915e-07, "loss": 0.0, "num_input_tokens_seen": 14641408, "step": 29745 }, { "epoch": 3.9263560776032733, "grad_norm": 0.0012204793747514486, "learning_rate": 2.6807123205530523e-07, "loss": 0.0, "num_input_tokens_seen": 14643712, "step": 29750 }, { "epoch": 3.9270159693810216, "grad_norm": 17.668453216552734, "learning_rate": 2.677573987191323e-07, "loss": 0.0412, "num_input_tokens_seen": 14646336, "step": 29755 }, { "epoch": 3.92767586115877, "grad_norm": 0.19213633239269257, "learning_rate": 2.674437207961487e-07, "loss": 0.0008, "num_input_tokens_seen": 14648832, "step": 29760 }, { "epoch": 3.928335752936518, "grad_norm": 0.007988156750798225, "learning_rate": 2.671301983529307e-07, "loss": 0.0, "num_input_tokens_seen": 14651136, "step": 29765 }, { "epoch": 3.928995644714267, "grad_norm": 7.79569149017334, "learning_rate": 2.668168314560213e-07, "loss": 0.0823, "num_input_tokens_seen": 14653568, "step": 29770 }, { "epoch": 3.929655536492015, "grad_norm": 0.01193348877131939, "learning_rate": 2.6650362017192986e-07, "loss": 0.0239, "num_input_tokens_seen": 14656000, "step": 29775 }, { "epoch": 3.930315428269764, "grad_norm": 0.03314467892050743, "learning_rate": 2.661905645671335e-07, "loss": 0.0001, "num_input_tokens_seen": 14658432, "step": 29780 }, { "epoch": 3.930975320047512, "grad_norm": 0.0030468301847577095, "learning_rate": 2.658776647080759e-07, "loss": 0.0035, "num_input_tokens_seen": 14661056, "step": 29785 }, { "epoch": 3.9316352118252604, "grad_norm": 0.005730305332690477, "learning_rate": 2.655649206611683e-07, "loss": 0.0337, "num_input_tokens_seen": 14663360, "step": 29790 }, { "epoch": 3.932295103603009, "grad_norm": 0.00034679798409342766, "learning_rate": 2.652523324927876e-07, "loss": 0.0002, "num_input_tokens_seen": 14665856, "step": 29795 }, { "epoch": 3.9329549953807574, "grad_norm": 0.0049493578262627125, "learning_rate": 2.649399002692786e-07, "loss": 0.0, "num_input_tokens_seen": 14668224, "step": 29800 }, { "epoch": 3.933614887158506, "grad_norm": 0.00087630475172773, "learning_rate": 2.6462762405695314e-07, "loss": 0.0, "num_input_tokens_seen": 14670464, "step": 29805 }, { "epoch": 3.9342747789362544, "grad_norm": 11.067977905273438, "learning_rate": 2.6431550392208924e-07, "loss": 0.0352, "num_input_tokens_seen": 14673088, "step": 29810 }, { "epoch": 3.9349346707140027, "grad_norm": 0.0014208820648491383, "learning_rate": 2.6400353993093205e-07, "loss": 0.0, "num_input_tokens_seen": 14675584, "step": 29815 }, { "epoch": 3.9355945624917514, "grad_norm": 0.014897317625582218, "learning_rate": 2.636917321496939e-07, "loss": 0.0, "num_input_tokens_seen": 14678336, "step": 29820 }, { "epoch": 3.9362544542694997, "grad_norm": 0.003759880783036351, "learning_rate": 2.6338008064455395e-07, "loss": 0.0, "num_input_tokens_seen": 14680896, "step": 29825 }, { "epoch": 3.9369143460472484, "grad_norm": 16.379169464111328, "learning_rate": 2.6306858548165776e-07, "loss": 0.0008, "num_input_tokens_seen": 14683200, "step": 29830 }, { "epoch": 3.9375742378249967, "grad_norm": 0.0020549760665744543, "learning_rate": 2.627572467271172e-07, "loss": 0.0, "num_input_tokens_seen": 14685760, "step": 29835 }, { "epoch": 3.938234129602745, "grad_norm": 0.054575130343437195, "learning_rate": 2.62446064447013e-07, "loss": 0.0, "num_input_tokens_seen": 14688256, "step": 29840 }, { "epoch": 3.9388940213804937, "grad_norm": 4.02533114538528e-05, "learning_rate": 2.621350387073903e-07, "loss": 0.0617, "num_input_tokens_seen": 14690496, "step": 29845 }, { "epoch": 3.939553913158242, "grad_norm": 0.005808962509036064, "learning_rate": 2.618241695742628e-07, "loss": 0.0, "num_input_tokens_seen": 14692992, "step": 29850 }, { "epoch": 3.9402138049359907, "grad_norm": 0.005495645571500063, "learning_rate": 2.615134571136095e-07, "loss": 0.0001, "num_input_tokens_seen": 14695168, "step": 29855 }, { "epoch": 3.940873696713739, "grad_norm": 0.0014869053848087788, "learning_rate": 2.6120290139137726e-07, "loss": 0.1172, "num_input_tokens_seen": 14697664, "step": 29860 }, { "epoch": 3.9415335884914873, "grad_norm": 0.010261405259370804, "learning_rate": 2.608925024734795e-07, "loss": 0.0, "num_input_tokens_seen": 14700480, "step": 29865 }, { "epoch": 3.942193480269236, "grad_norm": 0.06723063439130783, "learning_rate": 2.605822604257953e-07, "loss": 0.0001, "num_input_tokens_seen": 14703296, "step": 29870 }, { "epoch": 3.9428533720469843, "grad_norm": 0.002988782711327076, "learning_rate": 2.6027217531417256e-07, "loss": 0.0, "num_input_tokens_seen": 14705408, "step": 29875 }, { "epoch": 3.943513263824733, "grad_norm": 0.14848151803016663, "learning_rate": 2.5996224720442394e-07, "loss": 0.0001, "num_input_tokens_seen": 14707712, "step": 29880 }, { "epoch": 3.9441731556024813, "grad_norm": 0.000740960007533431, "learning_rate": 2.59652476162329e-07, "loss": 0.0, "num_input_tokens_seen": 14710208, "step": 29885 }, { "epoch": 3.9448330473802296, "grad_norm": 0.002873439807444811, "learning_rate": 2.593428622536349e-07, "loss": 0.0001, "num_input_tokens_seen": 14712640, "step": 29890 }, { "epoch": 3.945492939157978, "grad_norm": 0.0018458872800692916, "learning_rate": 2.5903340554405485e-07, "loss": 0.061, "num_input_tokens_seen": 14715136, "step": 29895 }, { "epoch": 3.9461528309357266, "grad_norm": 0.01930380053818226, "learning_rate": 2.587241060992691e-07, "loss": 0.0, "num_input_tokens_seen": 14717824, "step": 29900 }, { "epoch": 3.946812722713475, "grad_norm": 0.007538790814578533, "learning_rate": 2.5841496398492366e-07, "loss": 0.0, "num_input_tokens_seen": 14720320, "step": 29905 }, { "epoch": 3.9474726144912236, "grad_norm": 0.02111750654876232, "learning_rate": 2.5810597926663205e-07, "loss": 0.0, "num_input_tokens_seen": 14722688, "step": 29910 }, { "epoch": 3.948132506268972, "grad_norm": 46.01771545410156, "learning_rate": 2.577971520099741e-07, "loss": 0.1273, "num_input_tokens_seen": 14724928, "step": 29915 }, { "epoch": 3.94879239804672, "grad_norm": 0.007815618999302387, "learning_rate": 2.574884822804958e-07, "loss": 0.0, "num_input_tokens_seen": 14727360, "step": 29920 }, { "epoch": 3.949452289824469, "grad_norm": 0.0002846256538759917, "learning_rate": 2.571799701437103e-07, "loss": 0.0001, "num_input_tokens_seen": 14729856, "step": 29925 }, { "epoch": 3.950112181602217, "grad_norm": 0.369070440530777, "learning_rate": 2.568716156650974e-07, "loss": 0.0003, "num_input_tokens_seen": 14732224, "step": 29930 }, { "epoch": 3.950772073379966, "grad_norm": 0.002331068040803075, "learning_rate": 2.5656341891010236e-07, "loss": 0.028, "num_input_tokens_seen": 14734912, "step": 29935 }, { "epoch": 3.951431965157714, "grad_norm": 0.0062283482402563095, "learning_rate": 2.5625537994413825e-07, "loss": 0.0, "num_input_tokens_seen": 14737216, "step": 29940 }, { "epoch": 3.9520918569354624, "grad_norm": 2.5231564044952393, "learning_rate": 2.559474988325838e-07, "loss": 0.0523, "num_input_tokens_seen": 14739648, "step": 29945 }, { "epoch": 3.952751748713211, "grad_norm": 19.905338287353516, "learning_rate": 2.556397756407852e-07, "loss": 0.1603, "num_input_tokens_seen": 14742400, "step": 29950 }, { "epoch": 3.9534116404909594, "grad_norm": 0.009885110892355442, "learning_rate": 2.5533221043405364e-07, "loss": 0.0001, "num_input_tokens_seen": 14744832, "step": 29955 }, { "epoch": 3.954071532268708, "grad_norm": 0.005242446903139353, "learning_rate": 2.5502480327766785e-07, "loss": 0.0003, "num_input_tokens_seen": 14747392, "step": 29960 }, { "epoch": 3.9547314240464564, "grad_norm": 12.402752876281738, "learning_rate": 2.5471755423687326e-07, "loss": 0.02, "num_input_tokens_seen": 14749952, "step": 29965 }, { "epoch": 3.9553913158242047, "grad_norm": 0.01866605319082737, "learning_rate": 2.5441046337688053e-07, "loss": 0.0, "num_input_tokens_seen": 14752384, "step": 29970 }, { "epoch": 3.9560512076019534, "grad_norm": 0.0028618343640118837, "learning_rate": 2.541035307628678e-07, "loss": 0.0849, "num_input_tokens_seen": 14754880, "step": 29975 }, { "epoch": 3.9567110993797017, "grad_norm": 0.0022470192052423954, "learning_rate": 2.5379675645997965e-07, "loss": 0.0002, "num_input_tokens_seen": 14757184, "step": 29980 }, { "epoch": 3.9573709911574504, "grad_norm": 14.184697151184082, "learning_rate": 2.5349014053332604e-07, "loss": 0.0546, "num_input_tokens_seen": 14759744, "step": 29985 }, { "epoch": 3.9580308829351987, "grad_norm": 0.010952018201351166, "learning_rate": 2.5318368304798464e-07, "loss": 0.0, "num_input_tokens_seen": 14762112, "step": 29990 }, { "epoch": 3.958690774712947, "grad_norm": 0.01866159960627556, "learning_rate": 2.5287738406899783e-07, "loss": 0.0, "num_input_tokens_seen": 14764608, "step": 29995 }, { "epoch": 3.9593506664906957, "grad_norm": 0.0018119042506441474, "learning_rate": 2.525712436613767e-07, "loss": 0.0015, "num_input_tokens_seen": 14767104, "step": 30000 }, { "epoch": 3.960010558268444, "grad_norm": 0.06305401027202606, "learning_rate": 2.5226526189009656e-07, "loss": 0.0012, "num_input_tokens_seen": 14769792, "step": 30005 }, { "epoch": 3.9606704500461927, "grad_norm": 0.1402633786201477, "learning_rate": 2.519594388200994e-07, "loss": 0.0001, "num_input_tokens_seen": 14772224, "step": 30010 }, { "epoch": 3.961330341823941, "grad_norm": 0.0025317424442619085, "learning_rate": 2.51653774516295e-07, "loss": 0.0007, "num_input_tokens_seen": 14774784, "step": 30015 }, { "epoch": 3.9619902336016892, "grad_norm": 31.066204071044922, "learning_rate": 2.5134826904355767e-07, "loss": 0.0472, "num_input_tokens_seen": 14777088, "step": 30020 }, { "epoch": 3.9626501253794375, "grad_norm": 0.0028152584563940763, "learning_rate": 2.510429224667291e-07, "loss": 0.0, "num_input_tokens_seen": 14779264, "step": 30025 }, { "epoch": 3.9633100171571862, "grad_norm": 0.011473532766103745, "learning_rate": 2.5073773485061645e-07, "loss": 0.0, "num_input_tokens_seen": 14781696, "step": 30030 }, { "epoch": 3.9639699089349345, "grad_norm": 0.004749370273202658, "learning_rate": 2.504327062599939e-07, "loss": 0.0704, "num_input_tokens_seen": 14784384, "step": 30035 }, { "epoch": 3.9646298007126832, "grad_norm": 0.008181129582226276, "learning_rate": 2.501278367596017e-07, "loss": 0.0657, "num_input_tokens_seen": 14786752, "step": 30040 }, { "epoch": 3.9652896924904315, "grad_norm": 0.021034974604845047, "learning_rate": 2.498231264141458e-07, "loss": 0.0, "num_input_tokens_seen": 14789312, "step": 30045 }, { "epoch": 3.96594958426818, "grad_norm": 0.0029732969123870134, "learning_rate": 2.495185752882989e-07, "loss": 0.0004, "num_input_tokens_seen": 14791488, "step": 30050 }, { "epoch": 3.9666094760459285, "grad_norm": 0.019406119361519814, "learning_rate": 2.492141834467002e-07, "loss": 0.0, "num_input_tokens_seen": 14793856, "step": 30055 }, { "epoch": 3.967269367823677, "grad_norm": 0.007835861295461655, "learning_rate": 2.4890995095395397e-07, "loss": 0.0518, "num_input_tokens_seen": 14796352, "step": 30060 }, { "epoch": 3.9679292596014255, "grad_norm": 0.0039335619658231735, "learning_rate": 2.486058778746316e-07, "loss": 0.0, "num_input_tokens_seen": 14798976, "step": 30065 }, { "epoch": 3.968589151379174, "grad_norm": 0.003976741805672646, "learning_rate": 2.4830196427327056e-07, "loss": 0.0518, "num_input_tokens_seen": 14801472, "step": 30070 }, { "epoch": 3.969249043156922, "grad_norm": 0.03188467025756836, "learning_rate": 2.4799821021437463e-07, "loss": 0.0003, "num_input_tokens_seen": 14804224, "step": 30075 }, { "epoch": 3.969908934934671, "grad_norm": 0.013812188059091568, "learning_rate": 2.476946157624126e-07, "loss": 0.0, "num_input_tokens_seen": 14806464, "step": 30080 }, { "epoch": 3.970568826712419, "grad_norm": 0.10443263500928879, "learning_rate": 2.4739118098182055e-07, "loss": 0.0001, "num_input_tokens_seen": 14809216, "step": 30085 }, { "epoch": 3.971228718490168, "grad_norm": 0.007619071286171675, "learning_rate": 2.470879059370008e-07, "loss": 0.0, "num_input_tokens_seen": 14811392, "step": 30090 }, { "epoch": 3.971888610267916, "grad_norm": 0.008477813564240932, "learning_rate": 2.467847906923205e-07, "loss": 0.0, "num_input_tokens_seen": 14813824, "step": 30095 }, { "epoch": 3.9725485020456643, "grad_norm": 0.0526459701359272, "learning_rate": 2.4648183531211397e-07, "loss": 0.0001, "num_input_tokens_seen": 14816000, "step": 30100 }, { "epoch": 3.973208393823413, "grad_norm": 0.011147763580083847, "learning_rate": 2.4617903986068146e-07, "loss": 0.0005, "num_input_tokens_seen": 14818368, "step": 30105 }, { "epoch": 3.9738682856011613, "grad_norm": 0.007487526163458824, "learning_rate": 2.458764044022892e-07, "loss": 0.0939, "num_input_tokens_seen": 14820544, "step": 30110 }, { "epoch": 3.97452817737891, "grad_norm": 0.23043251037597656, "learning_rate": 2.455739290011689e-07, "loss": 0.0003, "num_input_tokens_seen": 14822912, "step": 30115 }, { "epoch": 3.9751880691566583, "grad_norm": 0.005081063602119684, "learning_rate": 2.452716137215191e-07, "loss": 0.0626, "num_input_tokens_seen": 14825152, "step": 30120 }, { "epoch": 3.9758479609344066, "grad_norm": 0.0004440572520252317, "learning_rate": 2.449694586275042e-07, "loss": 0.0, "num_input_tokens_seen": 14827776, "step": 30125 }, { "epoch": 3.9765078527121553, "grad_norm": 0.012423052452504635, "learning_rate": 2.4466746378325384e-07, "loss": 0.0001, "num_input_tokens_seen": 14830336, "step": 30130 }, { "epoch": 3.9771677444899036, "grad_norm": 0.012256015092134476, "learning_rate": 2.4436562925286473e-07, "loss": 0.0005, "num_input_tokens_seen": 14832896, "step": 30135 }, { "epoch": 3.9778276362676523, "grad_norm": 0.003797542303800583, "learning_rate": 2.440639551003992e-07, "loss": 0.0213, "num_input_tokens_seen": 14835136, "step": 30140 }, { "epoch": 3.9784875280454006, "grad_norm": 0.008668414317071438, "learning_rate": 2.437624413898849e-07, "loss": 0.0001, "num_input_tokens_seen": 14837440, "step": 30145 }, { "epoch": 3.979147419823149, "grad_norm": 17.620750427246094, "learning_rate": 2.4346108818531605e-07, "loss": 0.0549, "num_input_tokens_seen": 14840128, "step": 30150 }, { "epoch": 3.979807311600897, "grad_norm": 0.28982800245285034, "learning_rate": 2.4315989555065284e-07, "loss": 0.0001, "num_input_tokens_seen": 14842560, "step": 30155 }, { "epoch": 3.980467203378646, "grad_norm": 0.0032483511604368687, "learning_rate": 2.428588635498215e-07, "loss": 0.0, "num_input_tokens_seen": 14844928, "step": 30160 }, { "epoch": 3.9811270951563946, "grad_norm": 0.0006455311668105423, "learning_rate": 2.425579922467137e-07, "loss": 0.0, "num_input_tokens_seen": 14847104, "step": 30165 }, { "epoch": 3.981786986934143, "grad_norm": 0.0021415213122963905, "learning_rate": 2.4225728170518636e-07, "loss": 0.0, "num_input_tokens_seen": 14849664, "step": 30170 }, { "epoch": 3.982446878711891, "grad_norm": 0.004268865566700697, "learning_rate": 2.419567319890645e-07, "loss": 0.0518, "num_input_tokens_seen": 14851968, "step": 30175 }, { "epoch": 3.9831067704896395, "grad_norm": 0.0018845315789803863, "learning_rate": 2.416563431621366e-07, "loss": 0.0, "num_input_tokens_seen": 14854400, "step": 30180 }, { "epoch": 3.983766662267388, "grad_norm": 0.014987271279096603, "learning_rate": 2.413561152881587e-07, "loss": 0.0024, "num_input_tokens_seen": 14857024, "step": 30185 }, { "epoch": 3.9844265540451365, "grad_norm": 0.0005198507569730282, "learning_rate": 2.410560484308514e-07, "loss": 0.0, "num_input_tokens_seen": 14859264, "step": 30190 }, { "epoch": 3.985086445822885, "grad_norm": 0.014474891126155853, "learning_rate": 2.407561426539019e-07, "loss": 0.0, "num_input_tokens_seen": 14861824, "step": 30195 }, { "epoch": 3.9857463376006335, "grad_norm": 0.40284138917922974, "learning_rate": 2.404563980209634e-07, "loss": 0.0037, "num_input_tokens_seen": 14864256, "step": 30200 }, { "epoch": 3.9864062293783817, "grad_norm": 0.0001849651162046939, "learning_rate": 2.401568145956537e-07, "loss": 0.0001, "num_input_tokens_seen": 14866432, "step": 30205 }, { "epoch": 3.9870661211561305, "grad_norm": 0.003026962745934725, "learning_rate": 2.398573924415583e-07, "loss": 0.0001, "num_input_tokens_seen": 14869248, "step": 30210 }, { "epoch": 3.9877260129338787, "grad_norm": 0.0028771180659532547, "learning_rate": 2.395581316222269e-07, "loss": 0.0, "num_input_tokens_seen": 14871552, "step": 30215 }, { "epoch": 3.9883859047116275, "grad_norm": 0.029063567519187927, "learning_rate": 2.3925903220117506e-07, "loss": 0.0503, "num_input_tokens_seen": 14874432, "step": 30220 }, { "epoch": 3.9890457964893757, "grad_norm": 0.001278785872273147, "learning_rate": 2.389600942418848e-07, "loss": 0.0253, "num_input_tokens_seen": 14876800, "step": 30225 }, { "epoch": 3.989705688267124, "grad_norm": 0.006419615354388952, "learning_rate": 2.386613178078035e-07, "loss": 0.0, "num_input_tokens_seen": 14879168, "step": 30230 }, { "epoch": 3.9903655800448727, "grad_norm": 0.013536657206714153, "learning_rate": 2.3836270296234463e-07, "loss": 0.0, "num_input_tokens_seen": 14881728, "step": 30235 }, { "epoch": 3.991025471822621, "grad_norm": 0.11201123893260956, "learning_rate": 2.3806424976888639e-07, "loss": 0.0001, "num_input_tokens_seen": 14884288, "step": 30240 }, { "epoch": 3.9916853636003697, "grad_norm": 0.009009967558085918, "learning_rate": 2.3776595829077362e-07, "loss": 0.0472, "num_input_tokens_seen": 14886976, "step": 30245 }, { "epoch": 3.992345255378118, "grad_norm": 0.8810946345329285, "learning_rate": 2.3746782859131685e-07, "loss": 0.0006, "num_input_tokens_seen": 14889600, "step": 30250 }, { "epoch": 3.9930051471558663, "grad_norm": 0.00014014226326253265, "learning_rate": 2.371698607337913e-07, "loss": 0.0565, "num_input_tokens_seen": 14892224, "step": 30255 }, { "epoch": 3.993665038933615, "grad_norm": 0.03605637326836586, "learning_rate": 2.368720547814389e-07, "loss": 0.0001, "num_input_tokens_seen": 14894592, "step": 30260 }, { "epoch": 3.9943249307113633, "grad_norm": 0.00678131403401494, "learning_rate": 2.3657441079746698e-07, "loss": 0.0, "num_input_tokens_seen": 14897344, "step": 30265 }, { "epoch": 3.994984822489112, "grad_norm": 0.002024312736466527, "learning_rate": 2.362769288450478e-07, "loss": 0.0411, "num_input_tokens_seen": 14899904, "step": 30270 }, { "epoch": 3.9956447142668603, "grad_norm": 0.06557377427816391, "learning_rate": 2.3597960898731995e-07, "loss": 0.0, "num_input_tokens_seen": 14902400, "step": 30275 }, { "epoch": 3.9963046060446086, "grad_norm": 0.0008136624819599092, "learning_rate": 2.356824512873876e-07, "loss": 0.0001, "num_input_tokens_seen": 14904704, "step": 30280 }, { "epoch": 3.9969644978223573, "grad_norm": 0.0027104674372822046, "learning_rate": 2.3538545580832047e-07, "loss": 0.0009, "num_input_tokens_seen": 14907520, "step": 30285 }, { "epoch": 3.9976243896001056, "grad_norm": 12.874913215637207, "learning_rate": 2.350886226131531e-07, "loss": 0.0414, "num_input_tokens_seen": 14909952, "step": 30290 }, { "epoch": 3.9982842813778543, "grad_norm": 0.008480180986225605, "learning_rate": 2.3479195176488664e-07, "loss": 0.0, "num_input_tokens_seen": 14912640, "step": 30295 }, { "epoch": 3.9989441731556026, "grad_norm": 0.0009021972655318677, "learning_rate": 2.344954433264874e-07, "loss": 0.0626, "num_input_tokens_seen": 14915136, "step": 30300 }, { "epoch": 3.999604064933351, "grad_norm": 0.0008285566582344472, "learning_rate": 2.3419909736088672e-07, "loss": 0.0, "num_input_tokens_seen": 14917504, "step": 30305 }, { "epoch": 4.000263956711099, "grad_norm": 0.003723538015037775, "learning_rate": 2.3390291393098215e-07, "loss": 0.0, "num_input_tokens_seen": 14919888, "step": 30310 }, { "epoch": 4.000923848488847, "grad_norm": 0.0029248909559100866, "learning_rate": 2.3360689309963666e-07, "loss": 0.0003, "num_input_tokens_seen": 14922640, "step": 30315 }, { "epoch": 4.001583740266597, "grad_norm": 0.010353301651775837, "learning_rate": 2.333110349296782e-07, "loss": 0.0002, "num_input_tokens_seen": 14924944, "step": 30320 }, { "epoch": 4.001583740266597, "eval_loss": 0.1816491037607193, "eval_runtime": 7.8672, "eval_samples_per_second": 856.086, "eval_steps_per_second": 107.027, "num_input_tokens_seen": 14924944, "step": 30320 }, { "epoch": 4.002243632044345, "grad_norm": 0.011572030372917652, "learning_rate": 2.3301533948390072e-07, "loss": 0.0, "num_input_tokens_seen": 14927632, "step": 30325 }, { "epoch": 4.002903523822093, "grad_norm": 0.005584248807281256, "learning_rate": 2.3271980682506297e-07, "loss": 0.0001, "num_input_tokens_seen": 14930128, "step": 30330 }, { "epoch": 4.003563415599841, "grad_norm": 0.018282150849699974, "learning_rate": 2.3242443701589054e-07, "loss": 0.0, "num_input_tokens_seen": 14932688, "step": 30335 }, { "epoch": 4.00422330737759, "grad_norm": 8.024270937312394e-05, "learning_rate": 2.3212923011907305e-07, "loss": 0.0002, "num_input_tokens_seen": 14935120, "step": 30340 }, { "epoch": 4.004883199155339, "grad_norm": 0.0024475615937262774, "learning_rate": 2.3183418619726523e-07, "loss": 0.0, "num_input_tokens_seen": 14937488, "step": 30345 }, { "epoch": 4.005543090933087, "grad_norm": 0.015102401375770569, "learning_rate": 2.3153930531308952e-07, "loss": 0.0176, "num_input_tokens_seen": 14939984, "step": 30350 }, { "epoch": 4.006202982710835, "grad_norm": 0.003354162210598588, "learning_rate": 2.3124458752913123e-07, "loss": 0.0, "num_input_tokens_seen": 14942416, "step": 30355 }, { "epoch": 4.006862874488584, "grad_norm": 1.4745862483978271, "learning_rate": 2.3095003290794258e-07, "loss": 0.0006, "num_input_tokens_seen": 14944656, "step": 30360 }, { "epoch": 4.007522766266332, "grad_norm": 2.048581518465653e-05, "learning_rate": 2.306556415120401e-07, "loss": 0.0018, "num_input_tokens_seen": 14947344, "step": 30365 }, { "epoch": 4.008182658044081, "grad_norm": 0.0013990188017487526, "learning_rate": 2.3036141340390657e-07, "loss": 0.0014, "num_input_tokens_seen": 14949648, "step": 30370 }, { "epoch": 4.008842549821829, "grad_norm": 0.1011928915977478, "learning_rate": 2.3006734864599008e-07, "loss": 0.0001, "num_input_tokens_seen": 14952464, "step": 30375 }, { "epoch": 4.009502441599578, "grad_norm": 0.002364733023568988, "learning_rate": 2.2977344730070314e-07, "loss": 0.0004, "num_input_tokens_seen": 14954896, "step": 30380 }, { "epoch": 4.010162333377326, "grad_norm": 0.1956844925880432, "learning_rate": 2.294797094304244e-07, "loss": 0.0504, "num_input_tokens_seen": 14957456, "step": 30385 }, { "epoch": 4.010822225155074, "grad_norm": 0.00027462790603749454, "learning_rate": 2.2918613509749795e-07, "loss": 0.0, "num_input_tokens_seen": 14959696, "step": 30390 }, { "epoch": 4.011482116932823, "grad_norm": 0.02837277203798294, "learning_rate": 2.2889272436423233e-07, "loss": 0.0001, "num_input_tokens_seen": 14962192, "step": 30395 }, { "epoch": 4.012142008710572, "grad_norm": 0.0019921245984733105, "learning_rate": 2.2859947729290207e-07, "loss": 0.0, "num_input_tokens_seen": 14964432, "step": 30400 }, { "epoch": 4.01280190048832, "grad_norm": 0.000512312282808125, "learning_rate": 2.2830639394574657e-07, "loss": 0.0, "num_input_tokens_seen": 14967056, "step": 30405 }, { "epoch": 4.013461792266068, "grad_norm": 0.0010396696161478758, "learning_rate": 2.280134743849712e-07, "loss": 0.0, "num_input_tokens_seen": 14969296, "step": 30410 }, { "epoch": 4.0141216840438165, "grad_norm": 8.928061485290527, "learning_rate": 2.2772071867274524e-07, "loss": 0.056, "num_input_tokens_seen": 14971600, "step": 30415 }, { "epoch": 4.014781575821566, "grad_norm": 0.005695475731045008, "learning_rate": 2.2742812687120438e-07, "loss": 0.0006, "num_input_tokens_seen": 14973904, "step": 30420 }, { "epoch": 4.015441467599314, "grad_norm": 0.0021254941821098328, "learning_rate": 2.2713569904244934e-07, "loss": 0.001, "num_input_tokens_seen": 14976592, "step": 30425 }, { "epoch": 4.016101359377062, "grad_norm": 0.00028210715390741825, "learning_rate": 2.268434352485452e-07, "loss": 0.0005, "num_input_tokens_seen": 14979408, "step": 30430 }, { "epoch": 4.0167612511548105, "grad_norm": 0.005418936721980572, "learning_rate": 2.265513355515233e-07, "loss": 0.0003, "num_input_tokens_seen": 14981776, "step": 30435 }, { "epoch": 4.017421142932559, "grad_norm": 0.0015027726767584682, "learning_rate": 2.262594000133795e-07, "loss": 0.0, "num_input_tokens_seen": 14984208, "step": 30440 }, { "epoch": 4.018081034710307, "grad_norm": 0.002356699900701642, "learning_rate": 2.2596762869607521e-07, "loss": 0.0007, "num_input_tokens_seen": 14986704, "step": 30445 }, { "epoch": 4.018740926488056, "grad_norm": 0.0013817485887557268, "learning_rate": 2.2567602166153653e-07, "loss": 0.0, "num_input_tokens_seen": 14989328, "step": 30450 }, { "epoch": 4.0194008182658045, "grad_norm": 0.09678442031145096, "learning_rate": 2.2538457897165498e-07, "loss": 0.0001, "num_input_tokens_seen": 14991568, "step": 30455 }, { "epoch": 4.020060710043553, "grad_norm": 0.0013658119132742286, "learning_rate": 2.2509330068828748e-07, "loss": 0.028, "num_input_tokens_seen": 14993680, "step": 30460 }, { "epoch": 4.020720601821301, "grad_norm": 0.009650514461100101, "learning_rate": 2.2480218687325515e-07, "loss": 0.0, "num_input_tokens_seen": 14996048, "step": 30465 }, { "epoch": 4.021380493599049, "grad_norm": 0.0006762424600310624, "learning_rate": 2.2451123758834512e-07, "loss": 0.0, "num_input_tokens_seen": 14998544, "step": 30470 }, { "epoch": 4.0220403853767985, "grad_norm": 142.475341796875, "learning_rate": 2.2422045289530967e-07, "loss": 0.0337, "num_input_tokens_seen": 15000976, "step": 30475 }, { "epoch": 4.022700277154547, "grad_norm": 0.04685702919960022, "learning_rate": 2.2392983285586487e-07, "loss": 0.0, "num_input_tokens_seen": 15003408, "step": 30480 }, { "epoch": 4.023360168932295, "grad_norm": 0.0030901760328561068, "learning_rate": 2.2363937753169338e-07, "loss": 0.0383, "num_input_tokens_seen": 15005904, "step": 30485 }, { "epoch": 4.024020060710043, "grad_norm": 0.0033339818473905325, "learning_rate": 2.2334908698444188e-07, "loss": 0.0239, "num_input_tokens_seen": 15008400, "step": 30490 }, { "epoch": 4.024679952487792, "grad_norm": 0.00055954564595595, "learning_rate": 2.23058961275723e-07, "loss": 0.0, "num_input_tokens_seen": 15010960, "step": 30495 }, { "epoch": 4.025339844265541, "grad_norm": 0.002326509216800332, "learning_rate": 2.2276900046711334e-07, "loss": 0.0, "num_input_tokens_seen": 15013392, "step": 30500 }, { "epoch": 4.025999736043289, "grad_norm": 0.002689636778086424, "learning_rate": 2.2247920462015458e-07, "loss": 0.0005, "num_input_tokens_seen": 15016080, "step": 30505 }, { "epoch": 4.026659627821037, "grad_norm": 0.0010973322205245495, "learning_rate": 2.2218957379635483e-07, "loss": 0.0, "num_input_tokens_seen": 15018320, "step": 30510 }, { "epoch": 4.027319519598786, "grad_norm": 0.005248472560197115, "learning_rate": 2.2190010805718528e-07, "loss": 0.0, "num_input_tokens_seen": 15021008, "step": 30515 }, { "epoch": 4.027979411376534, "grad_norm": 0.0011204908369109035, "learning_rate": 2.2161080746408345e-07, "loss": 0.0, "num_input_tokens_seen": 15023312, "step": 30520 }, { "epoch": 4.028639303154283, "grad_norm": 0.005970706697553396, "learning_rate": 2.2132167207845087e-07, "loss": 0.0023, "num_input_tokens_seen": 15025552, "step": 30525 }, { "epoch": 4.029299194932031, "grad_norm": 0.001030957093462348, "learning_rate": 2.2103270196165468e-07, "loss": 0.0, "num_input_tokens_seen": 15028176, "step": 30530 }, { "epoch": 4.02995908670978, "grad_norm": 0.0019524479284882545, "learning_rate": 2.2074389717502695e-07, "loss": 0.0, "num_input_tokens_seen": 15030416, "step": 30535 }, { "epoch": 4.030618978487528, "grad_norm": 0.0014918609522283077, "learning_rate": 2.204552577798635e-07, "loss": 0.0008, "num_input_tokens_seen": 15032720, "step": 30540 }, { "epoch": 4.031278870265276, "grad_norm": 0.0008159163990058005, "learning_rate": 2.2016678383742714e-07, "loss": 0.0004, "num_input_tokens_seen": 15035216, "step": 30545 }, { "epoch": 4.031938762043025, "grad_norm": 0.005126807373017073, "learning_rate": 2.1987847540894378e-07, "loss": 0.0, "num_input_tokens_seen": 15037648, "step": 30550 }, { "epoch": 4.032598653820774, "grad_norm": 0.00635765353217721, "learning_rate": 2.1959033255560455e-07, "loss": 0.0001, "num_input_tokens_seen": 15040016, "step": 30555 }, { "epoch": 4.033258545598522, "grad_norm": 0.004286654759198427, "learning_rate": 2.19302355338566e-07, "loss": 0.0009, "num_input_tokens_seen": 15042768, "step": 30560 }, { "epoch": 4.03391843737627, "grad_norm": 0.00043845814070664346, "learning_rate": 2.1901454381894914e-07, "loss": 0.028, "num_input_tokens_seen": 15045008, "step": 30565 }, { "epoch": 4.0345783291540185, "grad_norm": 0.027832932770252228, "learning_rate": 2.1872689805784007e-07, "loss": 0.0, "num_input_tokens_seen": 15047376, "step": 30570 }, { "epoch": 4.035238220931767, "grad_norm": 0.0007483892259187996, "learning_rate": 2.1843941811628918e-07, "loss": 0.0008, "num_input_tokens_seen": 15050064, "step": 30575 }, { "epoch": 4.035898112709516, "grad_norm": 0.0030488441698253155, "learning_rate": 2.1815210405531214e-07, "loss": 0.0, "num_input_tokens_seen": 15052560, "step": 30580 }, { "epoch": 4.036558004487264, "grad_norm": 0.0026415761094540358, "learning_rate": 2.1786495593588972e-07, "loss": 0.0, "num_input_tokens_seen": 15055056, "step": 30585 }, { "epoch": 4.0372178962650125, "grad_norm": 0.015966864302754402, "learning_rate": 2.1757797381896625e-07, "loss": 0.0, "num_input_tokens_seen": 15057168, "step": 30590 }, { "epoch": 4.037877788042761, "grad_norm": 0.020574091002345085, "learning_rate": 2.1729115776545192e-07, "loss": 0.0, "num_input_tokens_seen": 15059408, "step": 30595 }, { "epoch": 4.038537679820509, "grad_norm": 0.004963522776961327, "learning_rate": 2.170045078362218e-07, "loss": 0.0, "num_input_tokens_seen": 15062032, "step": 30600 }, { "epoch": 4.039197571598258, "grad_norm": 1.6431316137313843, "learning_rate": 2.167180240921145e-07, "loss": 0.001, "num_input_tokens_seen": 15064528, "step": 30605 }, { "epoch": 4.0398574633760065, "grad_norm": 0.004193543456494808, "learning_rate": 2.1643170659393461e-07, "loss": 0.0, "num_input_tokens_seen": 15066704, "step": 30610 }, { "epoch": 4.040517355153755, "grad_norm": 0.00035888998536393046, "learning_rate": 2.1614555540245083e-07, "loss": 0.0014, "num_input_tokens_seen": 15069200, "step": 30615 }, { "epoch": 4.041177246931503, "grad_norm": 0.009292280301451683, "learning_rate": 2.1585957057839688e-07, "loss": 0.0, "num_input_tokens_seen": 15071440, "step": 30620 }, { "epoch": 4.041837138709251, "grad_norm": 0.0003106459917034954, "learning_rate": 2.1557375218247053e-07, "loss": 0.0001, "num_input_tokens_seen": 15074192, "step": 30625 }, { "epoch": 4.0424970304870005, "grad_norm": 0.0001242978178197518, "learning_rate": 2.1528810027533495e-07, "loss": 0.0, "num_input_tokens_seen": 15076624, "step": 30630 }, { "epoch": 4.043156922264749, "grad_norm": 0.001095140934921801, "learning_rate": 2.1500261491761796e-07, "loss": 0.0, "num_input_tokens_seen": 15079248, "step": 30635 }, { "epoch": 4.043816814042497, "grad_norm": 9.301899262936786e-05, "learning_rate": 2.1471729616991107e-07, "loss": 0.0005, "num_input_tokens_seen": 15081488, "step": 30640 }, { "epoch": 4.044476705820245, "grad_norm": 0.0002437293151160702, "learning_rate": 2.1443214409277154e-07, "loss": 0.0, "num_input_tokens_seen": 15083856, "step": 30645 }, { "epoch": 4.045136597597994, "grad_norm": 0.6968335509300232, "learning_rate": 2.1414715874672117e-07, "loss": 0.0004, "num_input_tokens_seen": 15086160, "step": 30650 }, { "epoch": 4.045796489375743, "grad_norm": 0.004181780386716127, "learning_rate": 2.1386234019224525e-07, "loss": 0.0, "num_input_tokens_seen": 15088336, "step": 30655 }, { "epoch": 4.046456381153491, "grad_norm": 0.0003220188373234123, "learning_rate": 2.1357768848979518e-07, "loss": 0.0, "num_input_tokens_seen": 15090832, "step": 30660 }, { "epoch": 4.047116272931239, "grad_norm": 0.054620057344436646, "learning_rate": 2.1329320369978532e-07, "loss": 0.0162, "num_input_tokens_seen": 15093392, "step": 30665 }, { "epoch": 4.047776164708988, "grad_norm": 0.009810811839997768, "learning_rate": 2.130088858825967e-07, "loss": 0.0, "num_input_tokens_seen": 15096144, "step": 30670 }, { "epoch": 4.048436056486736, "grad_norm": 0.0014325518859550357, "learning_rate": 2.1272473509857313e-07, "loss": 0.0028, "num_input_tokens_seen": 15098512, "step": 30675 }, { "epoch": 4.049095948264485, "grad_norm": 0.013514799997210503, "learning_rate": 2.1244075140802298e-07, "loss": 0.0188, "num_input_tokens_seen": 15101008, "step": 30680 }, { "epoch": 4.049755840042233, "grad_norm": 0.0010740907164290547, "learning_rate": 2.1215693487122078e-07, "loss": 0.0352, "num_input_tokens_seen": 15103632, "step": 30685 }, { "epoch": 4.050415731819982, "grad_norm": 0.011525586247444153, "learning_rate": 2.118732855484038e-07, "loss": 0.0001, "num_input_tokens_seen": 15106128, "step": 30690 }, { "epoch": 4.05107562359773, "grad_norm": 4.342807369539514e-05, "learning_rate": 2.1158980349977496e-07, "loss": 0.0564, "num_input_tokens_seen": 15108496, "step": 30695 }, { "epoch": 4.051735515375478, "grad_norm": 0.00133869843557477, "learning_rate": 2.1130648878550095e-07, "loss": 0.0, "num_input_tokens_seen": 15111184, "step": 30700 }, { "epoch": 4.052395407153226, "grad_norm": 0.00475813914090395, "learning_rate": 2.1102334146571342e-07, "loss": 0.0468, "num_input_tokens_seen": 15114000, "step": 30705 }, { "epoch": 4.053055298930976, "grad_norm": 0.0002620435261633247, "learning_rate": 2.1074036160050867e-07, "loss": 0.0, "num_input_tokens_seen": 15116240, "step": 30710 }, { "epoch": 4.053715190708724, "grad_norm": 0.002503307070583105, "learning_rate": 2.104575492499464e-07, "loss": 0.0352, "num_input_tokens_seen": 15118864, "step": 30715 }, { "epoch": 4.054375082486472, "grad_norm": 0.001919831382110715, "learning_rate": 2.1017490447405195e-07, "loss": 0.0, "num_input_tokens_seen": 15121552, "step": 30720 }, { "epoch": 4.05503497426422, "grad_norm": 0.005105135962367058, "learning_rate": 2.0989242733281486e-07, "loss": 0.0, "num_input_tokens_seen": 15123792, "step": 30725 }, { "epoch": 4.055694866041969, "grad_norm": 0.012683448381721973, "learning_rate": 2.0961011788618833e-07, "loss": 0.0002, "num_input_tokens_seen": 15126224, "step": 30730 }, { "epoch": 4.056354757819718, "grad_norm": 0.023624008521437645, "learning_rate": 2.0932797619409058e-07, "loss": 0.0, "num_input_tokens_seen": 15128912, "step": 30735 }, { "epoch": 4.057014649597466, "grad_norm": 0.002604448702186346, "learning_rate": 2.0904600231640435e-07, "loss": 0.0, "num_input_tokens_seen": 15131472, "step": 30740 }, { "epoch": 4.057674541375214, "grad_norm": 0.004134779330343008, "learning_rate": 2.0876419631297682e-07, "loss": 0.0, "num_input_tokens_seen": 15133776, "step": 30745 }, { "epoch": 4.058334433152963, "grad_norm": 0.00834393035620451, "learning_rate": 2.084825582436186e-07, "loss": 0.0004, "num_input_tokens_seen": 15136400, "step": 30750 }, { "epoch": 4.058994324930711, "grad_norm": 0.00036648480454459786, "learning_rate": 2.0820108816810565e-07, "loss": 0.0, "num_input_tokens_seen": 15138832, "step": 30755 }, { "epoch": 4.05965421670846, "grad_norm": 0.00014605256728827953, "learning_rate": 2.0791978614617834e-07, "loss": 0.0, "num_input_tokens_seen": 15141520, "step": 30760 }, { "epoch": 4.060314108486208, "grad_norm": 0.00479362765327096, "learning_rate": 2.0763865223754028e-07, "loss": 0.0, "num_input_tokens_seen": 15143760, "step": 30765 }, { "epoch": 4.060974000263957, "grad_norm": 0.01058896817266941, "learning_rate": 2.0735768650186058e-07, "loss": 0.0, "num_input_tokens_seen": 15146128, "step": 30770 }, { "epoch": 4.061633892041705, "grad_norm": 0.046509820967912674, "learning_rate": 2.0707688899877195e-07, "loss": 0.0032, "num_input_tokens_seen": 15148752, "step": 30775 }, { "epoch": 4.062293783819453, "grad_norm": 0.003307884559035301, "learning_rate": 2.0679625978787196e-07, "loss": 0.0164, "num_input_tokens_seen": 15150928, "step": 30780 }, { "epoch": 4.062953675597202, "grad_norm": 0.00022394787811208516, "learning_rate": 2.0651579892872173e-07, "loss": 0.0, "num_input_tokens_seen": 15153424, "step": 30785 }, { "epoch": 4.063613567374951, "grad_norm": 0.0006549333338625729, "learning_rate": 2.0623550648084719e-07, "loss": 0.0, "num_input_tokens_seen": 15156112, "step": 30790 }, { "epoch": 4.064273459152699, "grad_norm": 0.0018182151252403855, "learning_rate": 2.0595538250373868e-07, "loss": 0.0, "num_input_tokens_seen": 15158608, "step": 30795 }, { "epoch": 4.064933350930447, "grad_norm": 0.04654252901673317, "learning_rate": 2.0567542705684992e-07, "loss": 0.0, "num_input_tokens_seen": 15161040, "step": 30800 }, { "epoch": 4.0655932427081956, "grad_norm": 0.0013130871811881661, "learning_rate": 2.0539564019959965e-07, "loss": 0.0, "num_input_tokens_seen": 15163792, "step": 30805 }, { "epoch": 4.066253134485945, "grad_norm": 0.0019590912852436304, "learning_rate": 2.05116021991371e-07, "loss": 0.0, "num_input_tokens_seen": 15166352, "step": 30810 }, { "epoch": 4.066913026263693, "grad_norm": 0.00028893371927551925, "learning_rate": 2.0483657249151043e-07, "loss": 0.0, "num_input_tokens_seen": 15168592, "step": 30815 }, { "epoch": 4.067572918041441, "grad_norm": 0.3325551152229309, "learning_rate": 2.045572917593291e-07, "loss": 0.0002, "num_input_tokens_seen": 15171344, "step": 30820 }, { "epoch": 4.0682328098191896, "grad_norm": 0.007113211788237095, "learning_rate": 2.0427817985410245e-07, "loss": 0.0, "num_input_tokens_seen": 15173776, "step": 30825 }, { "epoch": 4.068892701596938, "grad_norm": 0.008288971148431301, "learning_rate": 2.0399923683507026e-07, "loss": 0.0007, "num_input_tokens_seen": 15176208, "step": 30830 }, { "epoch": 4.069552593374686, "grad_norm": 0.010359534062445164, "learning_rate": 2.0372046276143596e-07, "loss": 0.0, "num_input_tokens_seen": 15178576, "step": 30835 }, { "epoch": 4.070212485152435, "grad_norm": 0.0014906317228451371, "learning_rate": 2.0344185769236654e-07, "loss": 0.0, "num_input_tokens_seen": 15180752, "step": 30840 }, { "epoch": 4.070872376930184, "grad_norm": 0.0037701940163969994, "learning_rate": 2.0316342168699517e-07, "loss": 0.0001, "num_input_tokens_seen": 15183248, "step": 30845 }, { "epoch": 4.071532268707932, "grad_norm": 0.00434601865708828, "learning_rate": 2.0288515480441714e-07, "loss": 0.0001, "num_input_tokens_seen": 15185936, "step": 30850 }, { "epoch": 4.07219216048568, "grad_norm": 0.0018540917662903666, "learning_rate": 2.0260705710369296e-07, "loss": 0.061, "num_input_tokens_seen": 15188176, "step": 30855 }, { "epoch": 4.072852052263428, "grad_norm": 0.006436683237552643, "learning_rate": 2.0232912864384644e-07, "loss": 0.0, "num_input_tokens_seen": 15190416, "step": 30860 }, { "epoch": 4.073511944041178, "grad_norm": 0.0005059536779299378, "learning_rate": 2.0205136948386604e-07, "loss": 0.0003, "num_input_tokens_seen": 15192848, "step": 30865 }, { "epoch": 4.074171835818926, "grad_norm": 0.018882203847169876, "learning_rate": 2.0177377968270438e-07, "loss": 0.0, "num_input_tokens_seen": 15195728, "step": 30870 }, { "epoch": 4.074831727596674, "grad_norm": 0.0033944041933864355, "learning_rate": 2.0149635929927723e-07, "loss": 0.0, "num_input_tokens_seen": 15198416, "step": 30875 }, { "epoch": 4.075491619374422, "grad_norm": 0.0015481224982067943, "learning_rate": 2.0121910839246593e-07, "loss": 0.0, "num_input_tokens_seen": 15200912, "step": 30880 }, { "epoch": 4.076151511152171, "grad_norm": 0.005322051700204611, "learning_rate": 2.0094202702111462e-07, "loss": 0.0, "num_input_tokens_seen": 15203280, "step": 30885 }, { "epoch": 4.07681140292992, "grad_norm": 0.003729539690539241, "learning_rate": 2.006651152440315e-07, "loss": 0.0, "num_input_tokens_seen": 15205840, "step": 30890 }, { "epoch": 4.077471294707668, "grad_norm": 0.0005679655005224049, "learning_rate": 2.0038837311998945e-07, "loss": 0.0, "num_input_tokens_seen": 15208208, "step": 30895 }, { "epoch": 4.078131186485416, "grad_norm": 7.510792784160003e-05, "learning_rate": 2.0011180070772472e-07, "loss": 0.0, "num_input_tokens_seen": 15210576, "step": 30900 }, { "epoch": 4.078791078263165, "grad_norm": 0.022109191864728928, "learning_rate": 1.998353980659383e-07, "loss": 0.0001, "num_input_tokens_seen": 15213072, "step": 30905 }, { "epoch": 4.079450970040913, "grad_norm": 0.0009357878006994724, "learning_rate": 1.9955916525329396e-07, "loss": 0.0, "num_input_tokens_seen": 15215504, "step": 30910 }, { "epoch": 4.080110861818662, "grad_norm": 0.00043642695527523756, "learning_rate": 1.992831023284205e-07, "loss": 0.0013, "num_input_tokens_seen": 15217680, "step": 30915 }, { "epoch": 4.08077075359641, "grad_norm": 0.0005060906405560672, "learning_rate": 1.9900720934991055e-07, "loss": 0.0, "num_input_tokens_seen": 15220176, "step": 30920 }, { "epoch": 4.081430645374159, "grad_norm": 0.002591827418655157, "learning_rate": 1.9873148637631977e-07, "loss": 0.0, "num_input_tokens_seen": 15222608, "step": 30925 }, { "epoch": 4.082090537151907, "grad_norm": 2.7808291633846238e-05, "learning_rate": 1.9845593346616861e-07, "loss": 0.13, "num_input_tokens_seen": 15224912, "step": 30930 }, { "epoch": 4.082750428929655, "grad_norm": 7.892550638644025e-05, "learning_rate": 1.981805506779416e-07, "loss": 0.0, "num_input_tokens_seen": 15227280, "step": 30935 }, { "epoch": 4.083410320707404, "grad_norm": 0.00016154882905539125, "learning_rate": 1.9790533807008613e-07, "loss": 0.0, "num_input_tokens_seen": 15229520, "step": 30940 }, { "epoch": 4.084070212485153, "grad_norm": 0.000336196506395936, "learning_rate": 1.976302957010143e-07, "loss": 0.0, "num_input_tokens_seen": 15232016, "step": 30945 }, { "epoch": 4.084730104262901, "grad_norm": 0.015776529908180237, "learning_rate": 1.9735542362910197e-07, "loss": 0.0188, "num_input_tokens_seen": 15234320, "step": 30950 }, { "epoch": 4.085389996040649, "grad_norm": 11.258391380310059, "learning_rate": 1.9708072191268886e-07, "loss": 0.0998, "num_input_tokens_seen": 15236752, "step": 30955 }, { "epoch": 4.0860498878183975, "grad_norm": 3.8240083085838705e-05, "learning_rate": 1.9680619061007796e-07, "loss": 0.0001, "num_input_tokens_seen": 15239248, "step": 30960 }, { "epoch": 4.086709779596147, "grad_norm": 0.0016334295505657792, "learning_rate": 1.9653182977953699e-07, "loss": 0.0005, "num_input_tokens_seen": 15241680, "step": 30965 }, { "epoch": 4.087369671373895, "grad_norm": 0.00161271751858294, "learning_rate": 1.9625763947929698e-07, "loss": 0.0001, "num_input_tokens_seen": 15244176, "step": 30970 }, { "epoch": 4.088029563151643, "grad_norm": 0.0008376072510145605, "learning_rate": 1.9598361976755252e-07, "loss": 0.0, "num_input_tokens_seen": 15246416, "step": 30975 }, { "epoch": 4.0886894549293915, "grad_norm": 0.000364994426490739, "learning_rate": 1.9570977070246254e-07, "loss": 0.0, "num_input_tokens_seen": 15248656, "step": 30980 }, { "epoch": 4.08934934670714, "grad_norm": 9.574399948120117, "learning_rate": 1.9543609234214987e-07, "loss": 0.0066, "num_input_tokens_seen": 15251216, "step": 30985 }, { "epoch": 4.090009238484888, "grad_norm": 0.0003706459829118103, "learning_rate": 1.9516258474470005e-07, "loss": 0.0, "num_input_tokens_seen": 15253840, "step": 30990 }, { "epoch": 4.090669130262637, "grad_norm": 0.0002485027362126857, "learning_rate": 1.948892479681634e-07, "loss": 0.0, "num_input_tokens_seen": 15256400, "step": 30995 }, { "epoch": 4.0913290220403855, "grad_norm": 0.0002536515239626169, "learning_rate": 1.946160820705538e-07, "loss": 0.0, "num_input_tokens_seen": 15258640, "step": 31000 }, { "epoch": 4.091988913818134, "grad_norm": 0.00037458192673511803, "learning_rate": 1.9434308710984893e-07, "loss": 0.0176, "num_input_tokens_seen": 15261264, "step": 31005 }, { "epoch": 4.092648805595882, "grad_norm": 0.0009328834130428731, "learning_rate": 1.9407026314398966e-07, "loss": 0.0, "num_input_tokens_seen": 15263696, "step": 31010 }, { "epoch": 4.09330869737363, "grad_norm": 0.0010519040515646338, "learning_rate": 1.9379761023088047e-07, "loss": 0.0066, "num_input_tokens_seen": 15266256, "step": 31015 }, { "epoch": 4.0939685891513795, "grad_norm": 0.021239787340164185, "learning_rate": 1.9352512842839096e-07, "loss": 0.0, "num_input_tokens_seen": 15268816, "step": 31020 }, { "epoch": 4.094628480929128, "grad_norm": 0.0001497942430432886, "learning_rate": 1.9325281779435265e-07, "loss": 0.0322, "num_input_tokens_seen": 15271248, "step": 31025 }, { "epoch": 4.095288372706876, "grad_norm": 0.013604072853922844, "learning_rate": 1.9298067838656196e-07, "loss": 0.0, "num_input_tokens_seen": 15273936, "step": 31030 }, { "epoch": 4.095948264484624, "grad_norm": 0.00011796157923527062, "learning_rate": 1.9270871026277812e-07, "loss": 0.0, "num_input_tokens_seen": 15276560, "step": 31035 }, { "epoch": 4.096608156262373, "grad_norm": 0.0051427981816232204, "learning_rate": 1.9243691348072454e-07, "loss": 0.0, "num_input_tokens_seen": 15279184, "step": 31040 }, { "epoch": 4.097268048040122, "grad_norm": 0.011375799775123596, "learning_rate": 1.9216528809808841e-07, "loss": 0.0, "num_input_tokens_seen": 15281424, "step": 31045 }, { "epoch": 4.09792793981787, "grad_norm": 0.005654303357005119, "learning_rate": 1.918938341725198e-07, "loss": 0.0, "num_input_tokens_seen": 15283984, "step": 31050 }, { "epoch": 4.098587831595618, "grad_norm": 0.00010724661115091294, "learning_rate": 1.91622551761633e-07, "loss": 0.0004, "num_input_tokens_seen": 15286544, "step": 31055 }, { "epoch": 4.099247723373367, "grad_norm": 0.0006716445786878467, "learning_rate": 1.9135144092300604e-07, "loss": 0.0001, "num_input_tokens_seen": 15289040, "step": 31060 }, { "epoch": 4.099907615151115, "grad_norm": 9.40972167882137e-05, "learning_rate": 1.9108050171417967e-07, "loss": 0.0, "num_input_tokens_seen": 15291728, "step": 31065 }, { "epoch": 4.100567506928864, "grad_norm": 0.0023098858073353767, "learning_rate": 1.9080973419265922e-07, "loss": 0.0, "num_input_tokens_seen": 15294160, "step": 31070 }, { "epoch": 4.101227398706612, "grad_norm": 0.00010492518777027726, "learning_rate": 1.9053913841591285e-07, "loss": 0.0095, "num_input_tokens_seen": 15296528, "step": 31075 }, { "epoch": 4.101887290484361, "grad_norm": 0.00028661335818469524, "learning_rate": 1.9026871444137306e-07, "loss": 0.0, "num_input_tokens_seen": 15298896, "step": 31080 }, { "epoch": 4.102547182262109, "grad_norm": 0.009167580865323544, "learning_rate": 1.8999846232643468e-07, "loss": 0.0, "num_input_tokens_seen": 15301584, "step": 31085 }, { "epoch": 4.103207074039857, "grad_norm": 0.04716065526008606, "learning_rate": 1.897283821284571e-07, "loss": 0.0, "num_input_tokens_seen": 15304208, "step": 31090 }, { "epoch": 4.103866965817606, "grad_norm": 8.909520149230957, "learning_rate": 1.894584739047631e-07, "loss": 0.0226, "num_input_tokens_seen": 15306768, "step": 31095 }, { "epoch": 4.104526857595355, "grad_norm": 0.0014904022682458162, "learning_rate": 1.8918873771263842e-07, "loss": 0.0, "num_input_tokens_seen": 15309200, "step": 31100 }, { "epoch": 4.105186749373103, "grad_norm": 0.0003525837091729045, "learning_rate": 1.8891917360933262e-07, "loss": 0.0, "num_input_tokens_seen": 15311632, "step": 31105 }, { "epoch": 4.105846641150851, "grad_norm": 0.00023026631970424205, "learning_rate": 1.8864978165205892e-07, "loss": 0.0, "num_input_tokens_seen": 15313936, "step": 31110 }, { "epoch": 4.1065065329285995, "grad_norm": 0.00022328045452013612, "learning_rate": 1.8838056189799388e-07, "loss": 0.0, "num_input_tokens_seen": 15316368, "step": 31115 }, { "epoch": 4.107166424706348, "grad_norm": 7.168061711126938e-05, "learning_rate": 1.881115144042771e-07, "loss": 0.0004, "num_input_tokens_seen": 15318736, "step": 31120 }, { "epoch": 4.107826316484097, "grad_norm": 0.09140996634960175, "learning_rate": 1.8784263922801212e-07, "loss": 0.0001, "num_input_tokens_seen": 15321360, "step": 31125 }, { "epoch": 4.108486208261845, "grad_norm": 0.0032238177955150604, "learning_rate": 1.8757393642626606e-07, "loss": 0.0001, "num_input_tokens_seen": 15323664, "step": 31130 }, { "epoch": 4.1091461000395935, "grad_norm": 0.001386750489473343, "learning_rate": 1.873054060560686e-07, "loss": 0.0, "num_input_tokens_seen": 15325904, "step": 31135 }, { "epoch": 4.109805991817342, "grad_norm": 0.05405682697892189, "learning_rate": 1.870370481744137e-07, "loss": 0.0, "num_input_tokens_seen": 15328208, "step": 31140 }, { "epoch": 4.11046588359509, "grad_norm": 0.000502249866258353, "learning_rate": 1.8676886283825843e-07, "loss": 0.0, "num_input_tokens_seen": 15330704, "step": 31145 }, { "epoch": 4.111125775372839, "grad_norm": 0.01588205061852932, "learning_rate": 1.8650085010452288e-07, "loss": 0.0, "num_input_tokens_seen": 15333072, "step": 31150 }, { "epoch": 4.1117856671505875, "grad_norm": 0.0001183047570521012, "learning_rate": 1.8623301003009106e-07, "loss": 0.0011, "num_input_tokens_seen": 15335440, "step": 31155 }, { "epoch": 4.112445558928336, "grad_norm": 0.00015695270849391818, "learning_rate": 1.8596534267180998e-07, "loss": 0.0001, "num_input_tokens_seen": 15338320, "step": 31160 }, { "epoch": 4.113105450706084, "grad_norm": 0.0009862695587798953, "learning_rate": 1.8569784808649035e-07, "loss": 0.0, "num_input_tokens_seen": 15341072, "step": 31165 }, { "epoch": 4.113765342483832, "grad_norm": 0.0006455021211877465, "learning_rate": 1.8543052633090582e-07, "loss": 0.0294, "num_input_tokens_seen": 15343504, "step": 31170 }, { "epoch": 4.1144252342615815, "grad_norm": 0.006330928765237331, "learning_rate": 1.8516337746179288e-07, "loss": 0.0266, "num_input_tokens_seen": 15346128, "step": 31175 }, { "epoch": 4.11508512603933, "grad_norm": 7.823634223314002e-05, "learning_rate": 1.8489640153585296e-07, "loss": 0.0, "num_input_tokens_seen": 15348752, "step": 31180 }, { "epoch": 4.115745017817078, "grad_norm": 0.0016246692975983024, "learning_rate": 1.8462959860974914e-07, "loss": 0.0, "num_input_tokens_seen": 15350992, "step": 31185 }, { "epoch": 4.116404909594826, "grad_norm": 0.0002753250009845942, "learning_rate": 1.843629687401085e-07, "loss": 0.0, "num_input_tokens_seen": 15353360, "step": 31190 }, { "epoch": 4.117064801372575, "grad_norm": 0.0006193838198669255, "learning_rate": 1.840965119835216e-07, "loss": 0.0, "num_input_tokens_seen": 15355856, "step": 31195 }, { "epoch": 4.117724693150324, "grad_norm": 3.5579931136453524e-05, "learning_rate": 1.838302283965415e-07, "loss": 0.0, "num_input_tokens_seen": 15358288, "step": 31200 }, { "epoch": 4.118384584928072, "grad_norm": 0.0013902924256399274, "learning_rate": 1.835641180356855e-07, "loss": 0.0, "num_input_tokens_seen": 15360592, "step": 31205 }, { "epoch": 4.11904447670582, "grad_norm": 0.0001232646027347073, "learning_rate": 1.8329818095743265e-07, "loss": 0.0001, "num_input_tokens_seen": 15362896, "step": 31210 }, { "epoch": 4.119704368483569, "grad_norm": 0.00045455177314579487, "learning_rate": 1.8303241721822737e-07, "loss": 0.0, "num_input_tokens_seen": 15365328, "step": 31215 }, { "epoch": 4.120364260261317, "grad_norm": 17.56751823425293, "learning_rate": 1.8276682687447553e-07, "loss": 0.0426, "num_input_tokens_seen": 15367632, "step": 31220 }, { "epoch": 4.121024152039066, "grad_norm": 0.0020516146905720234, "learning_rate": 1.825014099825466e-07, "loss": 0.0, "num_input_tokens_seen": 15370128, "step": 31225 }, { "epoch": 4.121684043816814, "grad_norm": 0.0730593279004097, "learning_rate": 1.822361665987734e-07, "loss": 0.0, "num_input_tokens_seen": 15372688, "step": 31230 }, { "epoch": 4.122343935594563, "grad_norm": 0.010133378207683563, "learning_rate": 1.819710967794521e-07, "loss": 0.0, "num_input_tokens_seen": 15375056, "step": 31235 }, { "epoch": 4.123003827372311, "grad_norm": 0.001488432171754539, "learning_rate": 1.8170620058084208e-07, "loss": 0.0, "num_input_tokens_seen": 15377552, "step": 31240 }, { "epoch": 4.123663719150059, "grad_norm": 5.022612094762735e-05, "learning_rate": 1.814414780591651e-07, "loss": 0.0, "num_input_tokens_seen": 15379920, "step": 31245 }, { "epoch": 4.124323610927807, "grad_norm": 0.0005323368241079152, "learning_rate": 1.811769292706068e-07, "loss": 0.0, "num_input_tokens_seen": 15382224, "step": 31250 }, { "epoch": 4.124983502705557, "grad_norm": 0.0006102940533310175, "learning_rate": 1.8091255427131614e-07, "loss": 0.0, "num_input_tokens_seen": 15384912, "step": 31255 }, { "epoch": 4.125643394483305, "grad_norm": 0.00011270979302935302, "learning_rate": 1.8064835311740422e-07, "loss": 0.0, "num_input_tokens_seen": 15387216, "step": 31260 }, { "epoch": 4.126303286261053, "grad_norm": 0.0010503892553970218, "learning_rate": 1.80384325864946e-07, "loss": 0.0035, "num_input_tokens_seen": 15389648, "step": 31265 }, { "epoch": 4.126963178038801, "grad_norm": 0.013230645097792149, "learning_rate": 1.8012047256997977e-07, "loss": 0.0001, "num_input_tokens_seen": 15392272, "step": 31270 }, { "epoch": 4.12762306981655, "grad_norm": 0.00023171912471298128, "learning_rate": 1.798567932885059e-07, "loss": 0.0, "num_input_tokens_seen": 15394896, "step": 31275 }, { "epoch": 4.128282961594299, "grad_norm": 0.1003655418753624, "learning_rate": 1.7959328807648856e-07, "loss": 0.0343, "num_input_tokens_seen": 15397584, "step": 31280 }, { "epoch": 4.128942853372047, "grad_norm": 0.5334244966506958, "learning_rate": 1.7932995698985486e-07, "loss": 0.0004, "num_input_tokens_seen": 15400144, "step": 31285 }, { "epoch": 4.129602745149795, "grad_norm": 0.00014007413119543344, "learning_rate": 1.7906680008449536e-07, "loss": 0.0, "num_input_tokens_seen": 15402832, "step": 31290 }, { "epoch": 4.130262636927544, "grad_norm": 0.004108428489416838, "learning_rate": 1.788038174162625e-07, "loss": 0.0002, "num_input_tokens_seen": 15405328, "step": 31295 }, { "epoch": 4.130922528705292, "grad_norm": 0.000142919976497069, "learning_rate": 1.785410090409727e-07, "loss": 0.0005, "num_input_tokens_seen": 15407952, "step": 31300 }, { "epoch": 4.131582420483041, "grad_norm": 0.0007745701004751027, "learning_rate": 1.7827837501440556e-07, "loss": 0.0001, "num_input_tokens_seen": 15410320, "step": 31305 }, { "epoch": 4.132242312260789, "grad_norm": 0.004103151150047779, "learning_rate": 1.7801591539230255e-07, "loss": 0.0001, "num_input_tokens_seen": 15412688, "step": 31310 }, { "epoch": 4.132902204038538, "grad_norm": 13.383068084716797, "learning_rate": 1.7775363023036916e-07, "loss": 0.0338, "num_input_tokens_seen": 15415056, "step": 31315 }, { "epoch": 4.133562095816286, "grad_norm": 0.0009205325040966272, "learning_rate": 1.7749151958427379e-07, "loss": 0.0, "num_input_tokens_seen": 15417488, "step": 31320 }, { "epoch": 4.134221987594034, "grad_norm": 0.00014656288840342313, "learning_rate": 1.77229583509647e-07, "loss": 0.0, "num_input_tokens_seen": 15419792, "step": 31325 }, { "epoch": 4.134881879371783, "grad_norm": 7.536137127317488e-05, "learning_rate": 1.7696782206208306e-07, "loss": 0.0, "num_input_tokens_seen": 15422480, "step": 31330 }, { "epoch": 4.135541771149532, "grad_norm": 0.15476743876934052, "learning_rate": 1.767062352971389e-07, "loss": 0.0001, "num_input_tokens_seen": 15424784, "step": 31335 }, { "epoch": 4.13620166292728, "grad_norm": 0.0006393285002559423, "learning_rate": 1.7644482327033484e-07, "loss": 0.0, "num_input_tokens_seen": 15427344, "step": 31340 }, { "epoch": 4.136861554705028, "grad_norm": 0.03930385038256645, "learning_rate": 1.761835860371532e-07, "loss": 0.0, "num_input_tokens_seen": 15430096, "step": 31345 }, { "epoch": 4.1375214464827765, "grad_norm": 0.03138414025306702, "learning_rate": 1.759225236530394e-07, "loss": 0.0, "num_input_tokens_seen": 15432784, "step": 31350 }, { "epoch": 4.138181338260526, "grad_norm": 9.463958849664778e-05, "learning_rate": 1.756616361734029e-07, "loss": 0.0, "num_input_tokens_seen": 15434832, "step": 31355 }, { "epoch": 4.138841230038274, "grad_norm": 2.535146474838257, "learning_rate": 1.754009236536146e-07, "loss": 0.0205, "num_input_tokens_seen": 15437264, "step": 31360 }, { "epoch": 4.139501121816022, "grad_norm": 0.0004646007146220654, "learning_rate": 1.7514038614900905e-07, "loss": 0.0, "num_input_tokens_seen": 15439952, "step": 31365 }, { "epoch": 4.1401610135937705, "grad_norm": 0.00032446475233882666, "learning_rate": 1.748800237148833e-07, "loss": 0.0, "num_input_tokens_seen": 15442192, "step": 31370 }, { "epoch": 4.140820905371519, "grad_norm": 0.00011745891242753714, "learning_rate": 1.7461983640649736e-07, "loss": 0.0, "num_input_tokens_seen": 15444560, "step": 31375 }, { "epoch": 4.141480797149267, "grad_norm": 0.06134350597858429, "learning_rate": 1.7435982427907446e-07, "loss": 0.0, "num_input_tokens_seen": 15447056, "step": 31380 }, { "epoch": 4.142140688927016, "grad_norm": 0.005177411716431379, "learning_rate": 1.7409998738779962e-07, "loss": 0.0, "num_input_tokens_seen": 15449680, "step": 31385 }, { "epoch": 4.1428005807047645, "grad_norm": 0.00234969868324697, "learning_rate": 1.7384032578782216e-07, "loss": 0.0, "num_input_tokens_seen": 15452048, "step": 31390 }, { "epoch": 4.143460472482513, "grad_norm": 0.0019324537133798003, "learning_rate": 1.7358083953425306e-07, "loss": 0.0, "num_input_tokens_seen": 15454736, "step": 31395 }, { "epoch": 4.144120364260261, "grad_norm": 0.00011362414807081223, "learning_rate": 1.7332152868216598e-07, "loss": 0.0001, "num_input_tokens_seen": 15457232, "step": 31400 }, { "epoch": 4.144780256038009, "grad_norm": 0.00027947762282565236, "learning_rate": 1.7306239328659822e-07, "loss": 0.0, "num_input_tokens_seen": 15459728, "step": 31405 }, { "epoch": 4.1454401478157585, "grad_norm": 0.0011989494087174535, "learning_rate": 1.728034334025491e-07, "loss": 0.0, "num_input_tokens_seen": 15462096, "step": 31410 }, { "epoch": 4.146100039593507, "grad_norm": 0.0004557797801680863, "learning_rate": 1.7254464908498156e-07, "loss": 0.0511, "num_input_tokens_seen": 15464720, "step": 31415 }, { "epoch": 4.146759931371255, "grad_norm": 0.00030610704561695457, "learning_rate": 1.7228604038882e-07, "loss": 0.0003, "num_input_tokens_seen": 15467024, "step": 31420 }, { "epoch": 4.147419823149003, "grad_norm": 2.5527537218295038e-05, "learning_rate": 1.720276073689525e-07, "loss": 0.0, "num_input_tokens_seen": 15469520, "step": 31425 }, { "epoch": 4.148079714926752, "grad_norm": 0.005444636568427086, "learning_rate": 1.7176935008022986e-07, "loss": 0.0411, "num_input_tokens_seen": 15471824, "step": 31430 }, { "epoch": 4.148739606704501, "grad_norm": 0.0002152034139726311, "learning_rate": 1.715112685774649e-07, "loss": 0.0001, "num_input_tokens_seen": 15474000, "step": 31435 }, { "epoch": 4.149399498482249, "grad_norm": 0.000859928026329726, "learning_rate": 1.7125336291543368e-07, "loss": 0.0, "num_input_tokens_seen": 15476560, "step": 31440 }, { "epoch": 4.150059390259997, "grad_norm": 0.0003301157266832888, "learning_rate": 1.7099563314887498e-07, "loss": 0.0426, "num_input_tokens_seen": 15478736, "step": 31445 }, { "epoch": 4.150719282037746, "grad_norm": 0.009691378101706505, "learning_rate": 1.7073807933249008e-07, "loss": 0.0, "num_input_tokens_seen": 15480976, "step": 31450 }, { "epoch": 4.151379173815494, "grad_norm": 0.0030957702547311783, "learning_rate": 1.7048070152094263e-07, "loss": 0.0595, "num_input_tokens_seen": 15483536, "step": 31455 }, { "epoch": 4.152039065593243, "grad_norm": 0.0003932028484996408, "learning_rate": 1.7022349976885941e-07, "loss": 0.0001, "num_input_tokens_seen": 15486032, "step": 31460 }, { "epoch": 4.152698957370991, "grad_norm": 12.39704418182373, "learning_rate": 1.6996647413082977e-07, "loss": 0.0519, "num_input_tokens_seen": 15488912, "step": 31465 }, { "epoch": 4.15335884914874, "grad_norm": 0.015966031700372696, "learning_rate": 1.6970962466140514e-07, "loss": 0.0, "num_input_tokens_seen": 15491408, "step": 31470 }, { "epoch": 4.154018740926488, "grad_norm": 0.00011750247358577326, "learning_rate": 1.6945295141510018e-07, "loss": 0.0, "num_input_tokens_seen": 15493776, "step": 31475 }, { "epoch": 4.154678632704236, "grad_norm": 8.13830629340373e-05, "learning_rate": 1.691964544463922e-07, "loss": 0.0, "num_input_tokens_seen": 15496272, "step": 31480 }, { "epoch": 4.155338524481985, "grad_norm": 0.00031953773577697575, "learning_rate": 1.6894013380972028e-07, "loss": 0.0, "num_input_tokens_seen": 15498512, "step": 31485 }, { "epoch": 4.155998416259734, "grad_norm": 0.008766383863985538, "learning_rate": 1.6868398955948693e-07, "loss": 0.0, "num_input_tokens_seen": 15501008, "step": 31490 }, { "epoch": 4.156658308037482, "grad_norm": 0.061419978737831116, "learning_rate": 1.684280217500569e-07, "loss": 0.0, "num_input_tokens_seen": 15503312, "step": 31495 }, { "epoch": 4.15731819981523, "grad_norm": 0.6979484558105469, "learning_rate": 1.6817223043575768e-07, "loss": 0.0005, "num_input_tokens_seen": 15506000, "step": 31500 }, { "epoch": 4.1579780915929785, "grad_norm": 0.00016205478459596634, "learning_rate": 1.6791661567087888e-07, "loss": 0.0253, "num_input_tokens_seen": 15508752, "step": 31505 }, { "epoch": 4.158637983370728, "grad_norm": 0.07795727998018265, "learning_rate": 1.6766117750967244e-07, "loss": 0.0, "num_input_tokens_seen": 15511440, "step": 31510 }, { "epoch": 4.159297875148476, "grad_norm": 2.1538477085414343e-05, "learning_rate": 1.6740591600635433e-07, "loss": 0.0, "num_input_tokens_seen": 15513808, "step": 31515 }, { "epoch": 4.159957766926224, "grad_norm": 0.00036920199636369944, "learning_rate": 1.671508312151011e-07, "loss": 0.0, "num_input_tokens_seen": 15516496, "step": 31520 }, { "epoch": 4.1606176587039725, "grad_norm": 6.141668563941494e-05, "learning_rate": 1.6689592319005296e-07, "loss": 0.0645, "num_input_tokens_seen": 15519056, "step": 31525 }, { "epoch": 4.161277550481721, "grad_norm": 0.0010340113658457994, "learning_rate": 1.6664119198531245e-07, "loss": 0.0001, "num_input_tokens_seen": 15521104, "step": 31530 }, { "epoch": 4.161937442259469, "grad_norm": 0.002041921252384782, "learning_rate": 1.6638663765494398e-07, "loss": 0.0294, "num_input_tokens_seen": 15523344, "step": 31535 }, { "epoch": 4.162597334037218, "grad_norm": 0.00012479268480092287, "learning_rate": 1.6613226025297545e-07, "loss": 0.0, "num_input_tokens_seen": 15525840, "step": 31540 }, { "epoch": 4.1632572258149665, "grad_norm": 0.0012602178612723947, "learning_rate": 1.6587805983339564e-07, "loss": 0.0, "num_input_tokens_seen": 15528144, "step": 31545 }, { "epoch": 4.163917117592715, "grad_norm": 0.0001506891567260027, "learning_rate": 1.65624036450158e-07, "loss": 0.0, "num_input_tokens_seen": 15530512, "step": 31550 }, { "epoch": 4.164577009370463, "grad_norm": 0.00014679189189337194, "learning_rate": 1.6537019015717647e-07, "loss": 0.0, "num_input_tokens_seen": 15532880, "step": 31555 }, { "epoch": 4.165236901148211, "grad_norm": 0.0002842925605364144, "learning_rate": 1.6511652100832797e-07, "loss": 0.0, "num_input_tokens_seen": 15535440, "step": 31560 }, { "epoch": 4.1658967929259605, "grad_norm": 0.00048440933460369706, "learning_rate": 1.648630290574522e-07, "loss": 0.0, "num_input_tokens_seen": 15538000, "step": 31565 }, { "epoch": 4.166556684703709, "grad_norm": 0.00023747571685817093, "learning_rate": 1.646097143583508e-07, "loss": 0.02, "num_input_tokens_seen": 15540688, "step": 31570 }, { "epoch": 4.167216576481457, "grad_norm": 5.907857484999113e-05, "learning_rate": 1.6435657696478844e-07, "loss": 0.0252, "num_input_tokens_seen": 15543120, "step": 31575 }, { "epoch": 4.167876468259205, "grad_norm": 0.002017110353335738, "learning_rate": 1.6410361693049114e-07, "loss": 0.0112, "num_input_tokens_seen": 15545232, "step": 31580 }, { "epoch": 4.168536360036954, "grad_norm": 0.006705759093165398, "learning_rate": 1.6385083430914792e-07, "loss": 0.0, "num_input_tokens_seen": 15547920, "step": 31585 }, { "epoch": 4.169196251814703, "grad_norm": 0.00014904749696142972, "learning_rate": 1.6359822915441058e-07, "loss": 0.0456, "num_input_tokens_seen": 15550224, "step": 31590 }, { "epoch": 4.169856143592451, "grad_norm": 0.09349919855594635, "learning_rate": 1.6334580151989207e-07, "loss": 0.0, "num_input_tokens_seen": 15552656, "step": 31595 }, { "epoch": 4.170516035370199, "grad_norm": 0.004773380700498819, "learning_rate": 1.630935514591686e-07, "loss": 0.0, "num_input_tokens_seen": 15555280, "step": 31600 }, { "epoch": 4.171175927147948, "grad_norm": 0.054015111178159714, "learning_rate": 1.6284147902577872e-07, "loss": 0.0, "num_input_tokens_seen": 15557776, "step": 31605 }, { "epoch": 4.171835818925696, "grad_norm": 0.00047588403685949743, "learning_rate": 1.6258958427322234e-07, "loss": 0.0001, "num_input_tokens_seen": 15560208, "step": 31610 }, { "epoch": 4.172495710703445, "grad_norm": 0.266117125749588, "learning_rate": 1.623378672549628e-07, "loss": 0.0002, "num_input_tokens_seen": 15562768, "step": 31615 }, { "epoch": 4.173155602481193, "grad_norm": 0.00938863679766655, "learning_rate": 1.620863280244249e-07, "loss": 0.0, "num_input_tokens_seen": 15565328, "step": 31620 }, { "epoch": 4.173815494258942, "grad_norm": 0.00039532931987196207, "learning_rate": 1.6183496663499652e-07, "loss": 0.0005, "num_input_tokens_seen": 15567632, "step": 31625 }, { "epoch": 4.17447538603669, "grad_norm": 0.00045435517677105963, "learning_rate": 1.6158378314002673e-07, "loss": 0.0, "num_input_tokens_seen": 15570064, "step": 31630 }, { "epoch": 4.175135277814438, "grad_norm": 0.0060187773779034615, "learning_rate": 1.613327775928276e-07, "loss": 0.0, "num_input_tokens_seen": 15572624, "step": 31635 }, { "epoch": 4.175795169592186, "grad_norm": 0.001109470147639513, "learning_rate": 1.6108195004667357e-07, "loss": 0.0, "num_input_tokens_seen": 15574672, "step": 31640 }, { "epoch": 4.176455061369936, "grad_norm": 0.26625481247901917, "learning_rate": 1.6083130055480033e-07, "loss": 0.0002, "num_input_tokens_seen": 15577488, "step": 31645 }, { "epoch": 4.177114953147684, "grad_norm": 6.225990364328027e-05, "learning_rate": 1.6058082917040682e-07, "loss": 0.0, "num_input_tokens_seen": 15579920, "step": 31650 }, { "epoch": 4.177774844925432, "grad_norm": 0.0013030472910031676, "learning_rate": 1.6033053594665402e-07, "loss": 0.0, "num_input_tokens_seen": 15582224, "step": 31655 }, { "epoch": 4.17843473670318, "grad_norm": 0.00037867153878323734, "learning_rate": 1.6008042093666428e-07, "loss": 0.0, "num_input_tokens_seen": 15584656, "step": 31660 }, { "epoch": 4.179094628480929, "grad_norm": 0.0003713365877047181, "learning_rate": 1.5983048419352297e-07, "loss": 0.0, "num_input_tokens_seen": 15587024, "step": 31665 }, { "epoch": 4.179754520258678, "grad_norm": 0.17333261668682098, "learning_rate": 1.5958072577027738e-07, "loss": 0.0002, "num_input_tokens_seen": 15589648, "step": 31670 }, { "epoch": 4.180414412036426, "grad_norm": 0.002129318891093135, "learning_rate": 1.5933114571993712e-07, "loss": 0.0, "num_input_tokens_seen": 15592464, "step": 31675 }, { "epoch": 4.181074303814174, "grad_norm": 0.00019260341650806367, "learning_rate": 1.5908174409547347e-07, "loss": 0.0381, "num_input_tokens_seen": 15595024, "step": 31680 }, { "epoch": 4.181734195591923, "grad_norm": 0.7180609107017517, "learning_rate": 1.588325209498198e-07, "loss": 0.0677, "num_input_tokens_seen": 15597648, "step": 31685 }, { "epoch": 4.182394087369671, "grad_norm": 6.745587597833946e-05, "learning_rate": 1.5858347633587277e-07, "loss": 0.0, "num_input_tokens_seen": 15600208, "step": 31690 }, { "epoch": 4.18305397914742, "grad_norm": 0.002034937497228384, "learning_rate": 1.5833461030648954e-07, "loss": 0.0, "num_input_tokens_seen": 15602768, "step": 31695 }, { "epoch": 4.183713870925168, "grad_norm": 0.08714821189641953, "learning_rate": 1.5808592291449074e-07, "loss": 0.0207, "num_input_tokens_seen": 15605456, "step": 31700 }, { "epoch": 4.184373762702917, "grad_norm": 0.0014131611678749323, "learning_rate": 1.5783741421265784e-07, "loss": 0.0003, "num_input_tokens_seen": 15608016, "step": 31705 }, { "epoch": 4.185033654480665, "grad_norm": 0.0023685896303504705, "learning_rate": 1.575890842537353e-07, "loss": 0.0, "num_input_tokens_seen": 15610256, "step": 31710 }, { "epoch": 4.185693546258413, "grad_norm": 0.0001617130619706586, "learning_rate": 1.573409330904296e-07, "loss": 0.0, "num_input_tokens_seen": 15612688, "step": 31715 }, { "epoch": 4.1863534380361624, "grad_norm": 0.059925127774477005, "learning_rate": 1.5709296077540835e-07, "loss": 0.0579, "num_input_tokens_seen": 15615376, "step": 31720 }, { "epoch": 4.187013329813911, "grad_norm": 0.01801411621272564, "learning_rate": 1.5684516736130283e-07, "loss": 0.0441, "num_input_tokens_seen": 15617680, "step": 31725 }, { "epoch": 4.187673221591659, "grad_norm": 0.0002701185003388673, "learning_rate": 1.5659755290070453e-07, "loss": 0.0, "num_input_tokens_seen": 15620432, "step": 31730 }, { "epoch": 4.188333113369407, "grad_norm": 0.0005498105310834944, "learning_rate": 1.5635011744616854e-07, "loss": 0.0, "num_input_tokens_seen": 15622736, "step": 31735 }, { "epoch": 4.188993005147156, "grad_norm": 0.0004175195062998682, "learning_rate": 1.5610286105021063e-07, "loss": 0.0, "num_input_tokens_seen": 15625424, "step": 31740 }, { "epoch": 4.189652896924905, "grad_norm": 0.0007348982035182416, "learning_rate": 1.5585578376530938e-07, "loss": 0.0003, "num_input_tokens_seen": 15627920, "step": 31745 }, { "epoch": 4.190312788702653, "grad_norm": 2.787469929899089e-05, "learning_rate": 1.556088856439055e-07, "loss": 0.0, "num_input_tokens_seen": 15630352, "step": 31750 }, { "epoch": 4.190972680480401, "grad_norm": 2.5024770366144367e-05, "learning_rate": 1.5536216673840084e-07, "loss": 0.0518, "num_input_tokens_seen": 15632848, "step": 31755 }, { "epoch": 4.19163257225815, "grad_norm": 0.018026202917099, "learning_rate": 1.551156271011599e-07, "loss": 0.0, "num_input_tokens_seen": 15635344, "step": 31760 }, { "epoch": 4.192292464035898, "grad_norm": 0.0006465193582698703, "learning_rate": 1.5486926678450907e-07, "loss": 0.0, "num_input_tokens_seen": 15637840, "step": 31765 }, { "epoch": 4.192952355813647, "grad_norm": 0.002082411665469408, "learning_rate": 1.5462308584073625e-07, "loss": 0.0, "num_input_tokens_seen": 15640272, "step": 31770 }, { "epoch": 4.193612247591395, "grad_norm": 0.0004274914681445807, "learning_rate": 1.5437708432209174e-07, "loss": 0.0, "num_input_tokens_seen": 15642832, "step": 31775 }, { "epoch": 4.194272139369144, "grad_norm": 0.0006353407516144216, "learning_rate": 1.5413126228078755e-07, "loss": 0.0, "num_input_tokens_seen": 15645136, "step": 31780 }, { "epoch": 4.194932031146892, "grad_norm": 7.021045166766271e-05, "learning_rate": 1.5388561976899784e-07, "loss": 0.0, "num_input_tokens_seen": 15647376, "step": 31785 }, { "epoch": 4.19559192292464, "grad_norm": 0.0003200802602805197, "learning_rate": 1.53640156838858e-07, "loss": 0.0, "num_input_tokens_seen": 15649616, "step": 31790 }, { "epoch": 4.196251814702388, "grad_norm": 0.000181476934812963, "learning_rate": 1.5339487354246605e-07, "loss": 0.0, "num_input_tokens_seen": 15652048, "step": 31795 }, { "epoch": 4.196911706480138, "grad_norm": 0.9393686652183533, "learning_rate": 1.5314976993188177e-07, "loss": 0.001, "num_input_tokens_seen": 15654288, "step": 31800 }, { "epoch": 4.197571598257886, "grad_norm": 0.0006523782503791153, "learning_rate": 1.5290484605912624e-07, "loss": 0.0, "num_input_tokens_seen": 15656784, "step": 31805 }, { "epoch": 4.198231490035634, "grad_norm": 0.0040655615739524364, "learning_rate": 1.5266010197618296e-07, "loss": 0.0, "num_input_tokens_seen": 15659536, "step": 31810 }, { "epoch": 4.198891381813382, "grad_norm": 2.516552209854126, "learning_rate": 1.5241553773499727e-07, "loss": 0.001, "num_input_tokens_seen": 15661776, "step": 31815 }, { "epoch": 4.199551273591131, "grad_norm": 0.0005888799205422401, "learning_rate": 1.5217115338747577e-07, "loss": 0.0, "num_input_tokens_seen": 15664208, "step": 31820 }, { "epoch": 4.20021116536888, "grad_norm": 0.0015442796284332871, "learning_rate": 1.5192694898548742e-07, "loss": 0.0132, "num_input_tokens_seen": 15666576, "step": 31825 }, { "epoch": 4.200871057146628, "grad_norm": 0.0013671378837898374, "learning_rate": 1.5168292458086286e-07, "loss": 0.0, "num_input_tokens_seen": 15668880, "step": 31830 }, { "epoch": 4.201530948924376, "grad_norm": 0.0011779994238168001, "learning_rate": 1.5143908022539487e-07, "loss": 0.028, "num_input_tokens_seen": 15671120, "step": 31835 }, { "epoch": 4.202190840702125, "grad_norm": 0.00035011785803362727, "learning_rate": 1.5119541597083718e-07, "loss": 0.0001, "num_input_tokens_seen": 15673424, "step": 31840 }, { "epoch": 4.202850732479873, "grad_norm": 0.0017878002254292369, "learning_rate": 1.5095193186890554e-07, "loss": 0.0, "num_input_tokens_seen": 15676112, "step": 31845 }, { "epoch": 4.203510624257622, "grad_norm": 0.42923977971076965, "learning_rate": 1.5070862797127847e-07, "loss": 0.0006, "num_input_tokens_seen": 15678608, "step": 31850 }, { "epoch": 4.20417051603537, "grad_norm": 0.000863776367623359, "learning_rate": 1.504655043295948e-07, "loss": 0.0074, "num_input_tokens_seen": 15680976, "step": 31855 }, { "epoch": 4.204830407813119, "grad_norm": 0.000916738819796592, "learning_rate": 1.5022256099545594e-07, "loss": 0.0, "num_input_tokens_seen": 15683280, "step": 31860 }, { "epoch": 4.205490299590867, "grad_norm": 0.00029176438692957163, "learning_rate": 1.4997979802042515e-07, "loss": 0.0, "num_input_tokens_seen": 15685648, "step": 31865 }, { "epoch": 4.206150191368615, "grad_norm": 4.51797604910098e-05, "learning_rate": 1.4973721545602668e-07, "loss": 0.0, "num_input_tokens_seen": 15688272, "step": 31870 }, { "epoch": 4.206810083146364, "grad_norm": 0.008635712787508965, "learning_rate": 1.4949481335374736e-07, "loss": 0.0001, "num_input_tokens_seen": 15690768, "step": 31875 }, { "epoch": 4.207469974924113, "grad_norm": 0.00044233829248696566, "learning_rate": 1.4925259176503446e-07, "loss": 0.0, "num_input_tokens_seen": 15693456, "step": 31880 }, { "epoch": 4.208129866701861, "grad_norm": 7.04431367921643e-05, "learning_rate": 1.4901055074129888e-07, "loss": 0.0, "num_input_tokens_seen": 15695888, "step": 31885 }, { "epoch": 4.208789758479609, "grad_norm": 0.02665085531771183, "learning_rate": 1.487686903339115e-07, "loss": 0.0, "num_input_tokens_seen": 15698064, "step": 31890 }, { "epoch": 4.2094496502573575, "grad_norm": 3.517913137329742e-05, "learning_rate": 1.4852701059420526e-07, "loss": 0.0, "num_input_tokens_seen": 15700368, "step": 31895 }, { "epoch": 4.210109542035106, "grad_norm": 0.14689721167087555, "learning_rate": 1.4828551157347514e-07, "loss": 0.0, "num_input_tokens_seen": 15702864, "step": 31900 }, { "epoch": 4.210769433812855, "grad_norm": 0.0003207788977306336, "learning_rate": 1.4804419332297746e-07, "loss": 0.0, "num_input_tokens_seen": 15705104, "step": 31905 }, { "epoch": 4.211429325590603, "grad_norm": 0.0017935391515493393, "learning_rate": 1.478030558939307e-07, "loss": 0.0, "num_input_tokens_seen": 15707344, "step": 31910 }, { "epoch": 4.2120892173683515, "grad_norm": 0.0030650501139461994, "learning_rate": 1.4756209933751396e-07, "loss": 0.0, "num_input_tokens_seen": 15709904, "step": 31915 }, { "epoch": 4.2127491091461, "grad_norm": 6.088883674237877e-05, "learning_rate": 1.4732132370486872e-07, "loss": 0.0, "num_input_tokens_seen": 15712272, "step": 31920 }, { "epoch": 4.213409000923848, "grad_norm": 0.042080190032720566, "learning_rate": 1.4708072904709812e-07, "loss": 0.0, "num_input_tokens_seen": 15714896, "step": 31925 }, { "epoch": 4.214068892701597, "grad_norm": 0.00020239691366441548, "learning_rate": 1.468403154152663e-07, "loss": 0.0011, "num_input_tokens_seen": 15717456, "step": 31930 }, { "epoch": 4.2147287844793455, "grad_norm": 0.0012077669380232692, "learning_rate": 1.4660008286039937e-07, "loss": 0.0113, "num_input_tokens_seen": 15720016, "step": 31935 }, { "epoch": 4.215388676257094, "grad_norm": 0.0007291028741747141, "learning_rate": 1.4636003143348518e-07, "loss": 0.0, "num_input_tokens_seen": 15722320, "step": 31940 }, { "epoch": 4.216048568034842, "grad_norm": 0.0003883135796058923, "learning_rate": 1.4612016118547265e-07, "loss": 0.0, "num_input_tokens_seen": 15724816, "step": 31945 }, { "epoch": 4.21670845981259, "grad_norm": 0.00250708544626832, "learning_rate": 1.4588047216727251e-07, "loss": 0.0396, "num_input_tokens_seen": 15727440, "step": 31950 }, { "epoch": 4.2173683515903395, "grad_norm": 0.00013595109339803457, "learning_rate": 1.4564096442975715e-07, "loss": 0.0, "num_input_tokens_seen": 15729744, "step": 31955 }, { "epoch": 4.218028243368088, "grad_norm": 1.639278889342677e-05, "learning_rate": 1.454016380237605e-07, "loss": 0.0, "num_input_tokens_seen": 15732304, "step": 31960 }, { "epoch": 4.218688135145836, "grad_norm": 0.0030679525807499886, "learning_rate": 1.4516249300007743e-07, "loss": 0.0, "num_input_tokens_seen": 15734608, "step": 31965 }, { "epoch": 4.219348026923584, "grad_norm": 0.002188972430303693, "learning_rate": 1.4492352940946506e-07, "loss": 0.0, "num_input_tokens_seen": 15736976, "step": 31970 }, { "epoch": 4.220007918701333, "grad_norm": 0.002761758165434003, "learning_rate": 1.4468474730264168e-07, "loss": 0.0019, "num_input_tokens_seen": 15739664, "step": 31975 }, { "epoch": 4.220667810479082, "grad_norm": 0.0013528363779187202, "learning_rate": 1.4444614673028687e-07, "loss": 0.0, "num_input_tokens_seen": 15742096, "step": 31980 }, { "epoch": 4.22132770225683, "grad_norm": 7.338653085753322e-05, "learning_rate": 1.442077277430419e-07, "loss": 0.0, "num_input_tokens_seen": 15744464, "step": 31985 }, { "epoch": 4.221987594034578, "grad_norm": 0.0034586521796882153, "learning_rate": 1.4396949039150984e-07, "loss": 0.0, "num_input_tokens_seen": 15746896, "step": 31990 }, { "epoch": 4.222647485812327, "grad_norm": 0.00011525737500051036, "learning_rate": 1.4373143472625438e-07, "loss": 0.0, "num_input_tokens_seen": 15749200, "step": 31995 }, { "epoch": 4.223307377590075, "grad_norm": 0.0010359887965023518, "learning_rate": 1.4349356079780116e-07, "loss": 0.0, "num_input_tokens_seen": 15751696, "step": 32000 }, { "epoch": 4.223967269367824, "grad_norm": 13.788939476013184, "learning_rate": 1.432558686566374e-07, "loss": 0.0308, "num_input_tokens_seen": 15754256, "step": 32005 }, { "epoch": 4.224627161145572, "grad_norm": 0.00019276590319350362, "learning_rate": 1.4301835835321175e-07, "loss": 0.0323, "num_input_tokens_seen": 15757008, "step": 32010 }, { "epoch": 4.225287052923321, "grad_norm": 0.0015638087643310428, "learning_rate": 1.4278102993793362e-07, "loss": 0.0, "num_input_tokens_seen": 15759312, "step": 32015 }, { "epoch": 4.225946944701069, "grad_norm": 0.00024061364820227027, "learning_rate": 1.4254388346117408e-07, "loss": 0.0, "num_input_tokens_seen": 15761616, "step": 32020 }, { "epoch": 4.226606836478817, "grad_norm": 0.00010773177200462669, "learning_rate": 1.423069189732664e-07, "loss": 0.0, "num_input_tokens_seen": 15764176, "step": 32025 }, { "epoch": 4.227266728256566, "grad_norm": 0.009924034588038921, "learning_rate": 1.4207013652450405e-07, "loss": 0.0042, "num_input_tokens_seen": 15766736, "step": 32030 }, { "epoch": 4.227926620034315, "grad_norm": 0.00042953903903253376, "learning_rate": 1.4183353616514293e-07, "loss": 0.0023, "num_input_tokens_seen": 15769424, "step": 32035 }, { "epoch": 4.228586511812063, "grad_norm": 0.006489480845630169, "learning_rate": 1.415971179453991e-07, "loss": 0.0, "num_input_tokens_seen": 15772240, "step": 32040 }, { "epoch": 4.229246403589811, "grad_norm": 0.0013794874539598823, "learning_rate": 1.4136088191545083e-07, "loss": 0.0001, "num_input_tokens_seen": 15774608, "step": 32045 }, { "epoch": 4.2299062953675595, "grad_norm": 0.027372226119041443, "learning_rate": 1.411248281254379e-07, "loss": 0.0, "num_input_tokens_seen": 15777040, "step": 32050 }, { "epoch": 4.230566187145308, "grad_norm": 0.00011632290261331946, "learning_rate": 1.408889566254603e-07, "loss": 0.0, "num_input_tokens_seen": 15779472, "step": 32055 }, { "epoch": 4.231226078923057, "grad_norm": 0.0005008972948417068, "learning_rate": 1.4065326746558092e-07, "loss": 0.0, "num_input_tokens_seen": 15781904, "step": 32060 }, { "epoch": 4.231885970700805, "grad_norm": 0.03096534125506878, "learning_rate": 1.4041776069582233e-07, "loss": 0.0, "num_input_tokens_seen": 15784592, "step": 32065 }, { "epoch": 4.2325458624785535, "grad_norm": 0.00018453155644237995, "learning_rate": 1.4018243636616967e-07, "loss": 0.0, "num_input_tokens_seen": 15787024, "step": 32070 }, { "epoch": 4.233205754256302, "grad_norm": 0.000387275853427127, "learning_rate": 1.399472945265684e-07, "loss": 0.0, "num_input_tokens_seen": 15789456, "step": 32075 }, { "epoch": 4.23386564603405, "grad_norm": 0.0003460666921455413, "learning_rate": 1.397123352269257e-07, "loss": 0.0176, "num_input_tokens_seen": 15791888, "step": 32080 }, { "epoch": 4.234525537811799, "grad_norm": 0.0038513634353876114, "learning_rate": 1.3947755851711053e-07, "loss": 0.0002, "num_input_tokens_seen": 15794128, "step": 32085 }, { "epoch": 4.2351854295895475, "grad_norm": 0.021419484168291092, "learning_rate": 1.3924296444695194e-07, "loss": 0.0, "num_input_tokens_seen": 15796304, "step": 32090 }, { "epoch": 4.235845321367296, "grad_norm": 0.021339669823646545, "learning_rate": 1.3900855306624093e-07, "loss": 0.0, "num_input_tokens_seen": 15798800, "step": 32095 }, { "epoch": 4.236505213145044, "grad_norm": 0.0009724851697683334, "learning_rate": 1.387743244247299e-07, "loss": 0.0007, "num_input_tokens_seen": 15801424, "step": 32100 }, { "epoch": 4.237165104922792, "grad_norm": 0.003946192096918821, "learning_rate": 1.385402785721319e-07, "loss": 0.0, "num_input_tokens_seen": 15804240, "step": 32105 }, { "epoch": 4.2378249967005415, "grad_norm": 1.2278902431717142e-05, "learning_rate": 1.3830641555812162e-07, "loss": 0.0, "num_input_tokens_seen": 15806544, "step": 32110 }, { "epoch": 4.23848488847829, "grad_norm": 0.3208927512168884, "learning_rate": 1.3807273543233466e-07, "loss": 0.0268, "num_input_tokens_seen": 15809552, "step": 32115 }, { "epoch": 4.239144780256038, "grad_norm": 0.19890807569026947, "learning_rate": 1.3783923824436817e-07, "loss": 0.0001, "num_input_tokens_seen": 15811984, "step": 32120 }, { "epoch": 4.239804672033786, "grad_norm": 0.00046882135211490095, "learning_rate": 1.3760592404377991e-07, "loss": 0.0, "num_input_tokens_seen": 15814608, "step": 32125 }, { "epoch": 4.240464563811535, "grad_norm": 0.000249588891165331, "learning_rate": 1.373727928800894e-07, "loss": 0.1054, "num_input_tokens_seen": 15817040, "step": 32130 }, { "epoch": 4.241124455589284, "grad_norm": 9.873955726623535, "learning_rate": 1.3713984480277708e-07, "loss": 0.0323, "num_input_tokens_seen": 15819600, "step": 32135 }, { "epoch": 4.241784347367032, "grad_norm": 0.00012521083408501, "learning_rate": 1.3690707986128414e-07, "loss": 0.0, "num_input_tokens_seen": 15822608, "step": 32140 }, { "epoch": 4.24244423914478, "grad_norm": 0.06816807389259338, "learning_rate": 1.3667449810501353e-07, "loss": 0.0, "num_input_tokens_seen": 15825360, "step": 32145 }, { "epoch": 4.243104130922529, "grad_norm": 8.7230589997489e-05, "learning_rate": 1.3644209958332908e-07, "loss": 0.0, "num_input_tokens_seen": 15827792, "step": 32150 }, { "epoch": 4.243764022700277, "grad_norm": 0.00036624076892621815, "learning_rate": 1.3620988434555546e-07, "loss": 0.0253, "num_input_tokens_seen": 15830224, "step": 32155 }, { "epoch": 4.244423914478026, "grad_norm": 2.8342570658423938e-05, "learning_rate": 1.3597785244097882e-07, "loss": 0.0381, "num_input_tokens_seen": 15832720, "step": 32160 }, { "epoch": 4.245083806255774, "grad_norm": 0.03173115476965904, "learning_rate": 1.3574600391884627e-07, "loss": 0.0, "num_input_tokens_seen": 15835152, "step": 32165 }, { "epoch": 4.245743698033523, "grad_norm": 0.0012725105043500662, "learning_rate": 1.3551433882836615e-07, "loss": 0.0, "num_input_tokens_seen": 15837648, "step": 32170 }, { "epoch": 4.246403589811271, "grad_norm": 0.004674192983657122, "learning_rate": 1.3528285721870747e-07, "loss": 0.0, "num_input_tokens_seen": 15839888, "step": 32175 }, { "epoch": 4.247063481589019, "grad_norm": 0.007815685123205185, "learning_rate": 1.3505155913900012e-07, "loss": 0.0, "num_input_tokens_seen": 15842640, "step": 32180 }, { "epoch": 4.247723373366767, "grad_norm": 14.349305152893066, "learning_rate": 1.3482044463833632e-07, "loss": 0.0411, "num_input_tokens_seen": 15845072, "step": 32185 }, { "epoch": 4.248383265144517, "grad_norm": 0.0006116251461207867, "learning_rate": 1.3458951376576778e-07, "loss": 0.0046, "num_input_tokens_seen": 15847504, "step": 32190 }, { "epoch": 4.249043156922265, "grad_norm": 0.001768477144651115, "learning_rate": 1.343587665703082e-07, "loss": 0.0, "num_input_tokens_seen": 15850064, "step": 32195 }, { "epoch": 4.249703048700013, "grad_norm": 0.004840330220758915, "learning_rate": 1.341282031009321e-07, "loss": 0.0, "num_input_tokens_seen": 15852752, "step": 32200 }, { "epoch": 4.250362940477761, "grad_norm": 15.46373462677002, "learning_rate": 1.338978234065745e-07, "loss": 0.0442, "num_input_tokens_seen": 15855056, "step": 32205 }, { "epoch": 4.25102283225551, "grad_norm": 9.834176063537598, "learning_rate": 1.3366762753613236e-07, "loss": 0.0143, "num_input_tokens_seen": 15857488, "step": 32210 }, { "epoch": 4.251682724033259, "grad_norm": 4.949681758880615, "learning_rate": 1.3343761553846222e-07, "loss": 0.0087, "num_input_tokens_seen": 15859920, "step": 32215 }, { "epoch": 4.251682724033259, "eval_loss": 0.24730534851551056, "eval_runtime": 7.904, "eval_samples_per_second": 852.096, "eval_steps_per_second": 106.528, "num_input_tokens_seen": 15859920, "step": 32215 }, { "epoch": 4.252342615811007, "grad_norm": 0.00021375197684392333, "learning_rate": 1.332077874623836e-07, "loss": 0.0, "num_input_tokens_seen": 15862480, "step": 32220 }, { "epoch": 4.253002507588755, "grad_norm": 1.4487995031231549e-05, "learning_rate": 1.3297814335667523e-07, "loss": 0.0577, "num_input_tokens_seen": 15865296, "step": 32225 }, { "epoch": 4.253662399366504, "grad_norm": 0.0018687325064092875, "learning_rate": 1.3274868327007715e-07, "loss": 0.0548, "num_input_tokens_seen": 15867600, "step": 32230 }, { "epoch": 4.254322291144252, "grad_norm": 2.8422791729099117e-05, "learning_rate": 1.3251940725129108e-07, "loss": 0.0122, "num_input_tokens_seen": 15870032, "step": 32235 }, { "epoch": 4.254982182922001, "grad_norm": 0.0015061901649460196, "learning_rate": 1.3229031534897882e-07, "loss": 0.0, "num_input_tokens_seen": 15872464, "step": 32240 }, { "epoch": 4.255642074699749, "grad_norm": 0.0064211683347821236, "learning_rate": 1.320614076117641e-07, "loss": 0.0, "num_input_tokens_seen": 15874768, "step": 32245 }, { "epoch": 4.256301966477498, "grad_norm": 0.0002376893098698929, "learning_rate": 1.318326840882301e-07, "loss": 0.0, "num_input_tokens_seen": 15877136, "step": 32250 }, { "epoch": 4.256961858255246, "grad_norm": 9.079680603463203e-05, "learning_rate": 1.3160414482692217e-07, "loss": 0.0, "num_input_tokens_seen": 15879312, "step": 32255 }, { "epoch": 4.257621750032994, "grad_norm": 0.0030448322650045156, "learning_rate": 1.3137578987634635e-07, "loss": 0.0, "num_input_tokens_seen": 15881936, "step": 32260 }, { "epoch": 4.258281641810743, "grad_norm": 0.00842120312154293, "learning_rate": 1.3114761928496875e-07, "loss": 0.0, "num_input_tokens_seen": 15884240, "step": 32265 }, { "epoch": 4.258941533588492, "grad_norm": 0.00013276837125886232, "learning_rate": 1.3091963310121734e-07, "loss": 0.001, "num_input_tokens_seen": 15886736, "step": 32270 }, { "epoch": 4.25960142536624, "grad_norm": 0.00280931917950511, "learning_rate": 1.306918313734805e-07, "loss": 0.0, "num_input_tokens_seen": 15888976, "step": 32275 }, { "epoch": 4.260261317143988, "grad_norm": 0.00019990344299003482, "learning_rate": 1.3046421415010732e-07, "loss": 0.0001, "num_input_tokens_seen": 15891088, "step": 32280 }, { "epoch": 4.2609212089217365, "grad_norm": 0.01581178978085518, "learning_rate": 1.3023678147940797e-07, "loss": 0.0, "num_input_tokens_seen": 15893712, "step": 32285 }, { "epoch": 4.261581100699486, "grad_norm": 0.00010969245340675116, "learning_rate": 1.3000953340965336e-07, "loss": 0.0213, "num_input_tokens_seen": 15896144, "step": 32290 }, { "epoch": 4.262240992477234, "grad_norm": 0.0008918251842260361, "learning_rate": 1.297824699890756e-07, "loss": 0.0, "num_input_tokens_seen": 15898640, "step": 32295 }, { "epoch": 4.262900884254982, "grad_norm": 9.960948955267668e-05, "learning_rate": 1.2955559126586667e-07, "loss": 0.0, "num_input_tokens_seen": 15901008, "step": 32300 }, { "epoch": 4.2635607760327305, "grad_norm": 0.0014763657236471772, "learning_rate": 1.293288972881803e-07, "loss": 0.0, "num_input_tokens_seen": 15903696, "step": 32305 }, { "epoch": 4.264220667810479, "grad_norm": 0.08612053841352463, "learning_rate": 1.2910238810413075e-07, "loss": 0.0, "num_input_tokens_seen": 15906128, "step": 32310 }, { "epoch": 4.264880559588228, "grad_norm": 0.015839478000998497, "learning_rate": 1.2887606376179262e-07, "loss": 0.0, "num_input_tokens_seen": 15908624, "step": 32315 }, { "epoch": 4.265540451365976, "grad_norm": 0.00016956948093138635, "learning_rate": 1.2864992430920164e-07, "loss": 0.0001, "num_input_tokens_seen": 15910864, "step": 32320 }, { "epoch": 4.2662003431437245, "grad_norm": 3.184717570547946e-05, "learning_rate": 1.2842396979435476e-07, "loss": 0.0004, "num_input_tokens_seen": 15913296, "step": 32325 }, { "epoch": 4.266860234921473, "grad_norm": 1.2898004570160992e-05, "learning_rate": 1.2819820026520856e-07, "loss": 0.0, "num_input_tokens_seen": 15915792, "step": 32330 }, { "epoch": 4.267520126699221, "grad_norm": 0.0007911850116215646, "learning_rate": 1.2797261576968133e-07, "loss": 0.0, "num_input_tokens_seen": 15917968, "step": 32335 }, { "epoch": 4.268180018476969, "grad_norm": 0.0031948827672749758, "learning_rate": 1.2774721635565156e-07, "loss": 0.0, "num_input_tokens_seen": 15920656, "step": 32340 }, { "epoch": 4.2688399102547185, "grad_norm": 0.0011284436332061887, "learning_rate": 1.275220020709591e-07, "loss": 0.0, "num_input_tokens_seen": 15923024, "step": 32345 }, { "epoch": 4.269499802032467, "grad_norm": 0.0007449170225299895, "learning_rate": 1.2729697296340358e-07, "loss": 0.0503, "num_input_tokens_seen": 15925328, "step": 32350 }, { "epoch": 4.270159693810215, "grad_norm": 0.0018473287345841527, "learning_rate": 1.270721290807456e-07, "loss": 0.0, "num_input_tokens_seen": 15927760, "step": 32355 }, { "epoch": 4.270819585587963, "grad_norm": 0.08810362219810486, "learning_rate": 1.268474704707073e-07, "loss": 0.0, "num_input_tokens_seen": 15930192, "step": 32360 }, { "epoch": 4.271479477365712, "grad_norm": 2.55685572483344e-05, "learning_rate": 1.2662299718097036e-07, "loss": 0.0747, "num_input_tokens_seen": 15932368, "step": 32365 }, { "epoch": 4.272139369143461, "grad_norm": 0.0030504302121698856, "learning_rate": 1.2639870925917805e-07, "loss": 0.0, "num_input_tokens_seen": 15934928, "step": 32370 }, { "epoch": 4.272799260921209, "grad_norm": 0.0010365161579102278, "learning_rate": 1.2617460675293312e-07, "loss": 0.0, "num_input_tokens_seen": 15937232, "step": 32375 }, { "epoch": 4.273459152698957, "grad_norm": 0.0027514533139765263, "learning_rate": 1.259506897098005e-07, "loss": 0.0, "num_input_tokens_seen": 15940176, "step": 32380 }, { "epoch": 4.274119044476706, "grad_norm": 1.8370121717453003, "learning_rate": 1.2572695817730473e-07, "loss": 0.0005, "num_input_tokens_seen": 15942608, "step": 32385 }, { "epoch": 4.274778936254454, "grad_norm": 0.006444776430726051, "learning_rate": 1.2550341220293059e-07, "loss": 0.0, "num_input_tokens_seen": 15945296, "step": 32390 }, { "epoch": 4.275438828032203, "grad_norm": 0.00025173244648613036, "learning_rate": 1.2528005183412503e-07, "loss": 0.0, "num_input_tokens_seen": 15947920, "step": 32395 }, { "epoch": 4.276098719809951, "grad_norm": 3.405748793738894e-05, "learning_rate": 1.2505687711829417e-07, "loss": 0.0, "num_input_tokens_seen": 15950672, "step": 32400 }, { "epoch": 4.2767586115877, "grad_norm": 0.00016917857283260673, "learning_rate": 1.2483388810280538e-07, "loss": 0.0016, "num_input_tokens_seen": 15953552, "step": 32405 }, { "epoch": 4.277418503365448, "grad_norm": 0.010513650253415108, "learning_rate": 1.2461108483498617e-07, "loss": 0.0007, "num_input_tokens_seen": 15955920, "step": 32410 }, { "epoch": 4.278078395143196, "grad_norm": 0.000138119314215146, "learning_rate": 1.2438846736212516e-07, "loss": 0.0, "num_input_tokens_seen": 15958544, "step": 32415 }, { "epoch": 4.278738286920945, "grad_norm": 0.012299539521336555, "learning_rate": 1.2416603573147155e-07, "loss": 0.0, "num_input_tokens_seen": 15961168, "step": 32420 }, { "epoch": 4.279398178698694, "grad_norm": 0.00020890981249976903, "learning_rate": 1.2394378999023426e-07, "loss": 0.0, "num_input_tokens_seen": 15963408, "step": 32425 }, { "epoch": 4.280058070476442, "grad_norm": 0.00022947814431972802, "learning_rate": 1.2372173018558373e-07, "loss": 0.0, "num_input_tokens_seen": 15966224, "step": 32430 }, { "epoch": 4.28071796225419, "grad_norm": 0.0012638174230232835, "learning_rate": 1.2349985636465054e-07, "loss": 0.0002, "num_input_tokens_seen": 15968464, "step": 32435 }, { "epoch": 4.2813778540319385, "grad_norm": 0.004594683647155762, "learning_rate": 1.2327816857452567e-07, "loss": 0.0, "num_input_tokens_seen": 15971280, "step": 32440 }, { "epoch": 4.282037745809687, "grad_norm": 0.00012326195428613573, "learning_rate": 1.230566668622607e-07, "loss": 0.0, "num_input_tokens_seen": 15973520, "step": 32445 }, { "epoch": 4.282697637587436, "grad_norm": 0.0018847265746444464, "learning_rate": 1.2283535127486789e-07, "loss": 0.0, "num_input_tokens_seen": 15976016, "step": 32450 }, { "epoch": 4.283357529365184, "grad_norm": 0.09880480915307999, "learning_rate": 1.2261422185932003e-07, "loss": 0.0, "num_input_tokens_seen": 15978320, "step": 32455 }, { "epoch": 4.2840174211429325, "grad_norm": 1.9902327039744705e-05, "learning_rate": 1.223932786625499e-07, "loss": 0.0, "num_input_tokens_seen": 15980880, "step": 32460 }, { "epoch": 4.284677312920681, "grad_norm": 0.0009088137885555625, "learning_rate": 1.221725217314512e-07, "loss": 0.0, "num_input_tokens_seen": 15983312, "step": 32465 }, { "epoch": 4.285337204698429, "grad_norm": 2.24759578704834, "learning_rate": 1.2195195111287827e-07, "loss": 0.0017, "num_input_tokens_seen": 15985872, "step": 32470 }, { "epoch": 4.285997096476178, "grad_norm": 0.0002791814331430942, "learning_rate": 1.2173156685364516e-07, "loss": 0.0, "num_input_tokens_seen": 15988304, "step": 32475 }, { "epoch": 4.2866569882539265, "grad_norm": 44.14458084106445, "learning_rate": 1.2151136900052706e-07, "loss": 0.024, "num_input_tokens_seen": 15990672, "step": 32480 }, { "epoch": 4.287316880031675, "grad_norm": 0.0004078407946508378, "learning_rate": 1.2129135760025955e-07, "loss": 0.0008, "num_input_tokens_seen": 15993040, "step": 32485 }, { "epoch": 4.287976771809423, "grad_norm": 5.2265910198912024e-05, "learning_rate": 1.2107153269953818e-07, "loss": 0.0007, "num_input_tokens_seen": 15995792, "step": 32490 }, { "epoch": 4.288636663587171, "grad_norm": 0.0001401216140948236, "learning_rate": 1.208518943450192e-07, "loss": 0.0, "num_input_tokens_seen": 15998288, "step": 32495 }, { "epoch": 4.2892965553649205, "grad_norm": 0.1450749933719635, "learning_rate": 1.2063244258331938e-07, "loss": 0.0001, "num_input_tokens_seen": 16000912, "step": 32500 }, { "epoch": 4.289956447142669, "grad_norm": 0.0006044969195500016, "learning_rate": 1.2041317746101599e-07, "loss": 0.0, "num_input_tokens_seen": 16003088, "step": 32505 }, { "epoch": 4.290616338920417, "grad_norm": 1.2627470823645126e-05, "learning_rate": 1.2019409902464616e-07, "loss": 0.0, "num_input_tokens_seen": 16005776, "step": 32510 }, { "epoch": 4.291276230698165, "grad_norm": 4.5773995225317776e-05, "learning_rate": 1.1997520732070742e-07, "loss": 0.0, "num_input_tokens_seen": 16008144, "step": 32515 }, { "epoch": 4.291936122475914, "grad_norm": 0.004673975054174662, "learning_rate": 1.197565023956586e-07, "loss": 0.0, "num_input_tokens_seen": 16010768, "step": 32520 }, { "epoch": 4.292596014253663, "grad_norm": 0.0012522018514573574, "learning_rate": 1.1953798429591778e-07, "loss": 0.0, "num_input_tokens_seen": 16013200, "step": 32525 }, { "epoch": 4.293255906031411, "grad_norm": 0.2917592525482178, "learning_rate": 1.1931965306786396e-07, "loss": 0.0002, "num_input_tokens_seen": 16015824, "step": 32530 }, { "epoch": 4.293915797809159, "grad_norm": 0.0011305802036076784, "learning_rate": 1.1910150875783664e-07, "loss": 0.0, "num_input_tokens_seen": 16018064, "step": 32535 }, { "epoch": 4.294575689586908, "grad_norm": 1.8471331713953987e-05, "learning_rate": 1.1888355141213491e-07, "loss": 0.0, "num_input_tokens_seen": 16020432, "step": 32540 }, { "epoch": 4.295235581364656, "grad_norm": 7.753491081530228e-05, "learning_rate": 1.1866578107701897e-07, "loss": 0.0001, "num_input_tokens_seen": 16023056, "step": 32545 }, { "epoch": 4.295895473142405, "grad_norm": 0.00025911873672157526, "learning_rate": 1.1844819779870862e-07, "loss": 0.0001, "num_input_tokens_seen": 16025360, "step": 32550 }, { "epoch": 4.296555364920153, "grad_norm": 0.0005726946983486414, "learning_rate": 1.1823080162338483e-07, "loss": 0.0, "num_input_tokens_seen": 16027920, "step": 32555 }, { "epoch": 4.297215256697902, "grad_norm": 0.017819080501794815, "learning_rate": 1.1801359259718823e-07, "loss": 0.0, "num_input_tokens_seen": 16030416, "step": 32560 }, { "epoch": 4.29787514847565, "grad_norm": 0.0017297941958531737, "learning_rate": 1.1779657076621951e-07, "loss": 0.0001, "num_input_tokens_seen": 16032784, "step": 32565 }, { "epoch": 4.298535040253398, "grad_norm": 0.0005568054039031267, "learning_rate": 1.1757973617654027e-07, "loss": 0.0, "num_input_tokens_seen": 16035216, "step": 32570 }, { "epoch": 4.299194932031147, "grad_norm": 0.006575642619282007, "learning_rate": 1.1736308887417201e-07, "loss": 0.0, "num_input_tokens_seen": 16037584, "step": 32575 }, { "epoch": 4.299854823808896, "grad_norm": 1.1569028174562845e-05, "learning_rate": 1.1714662890509685e-07, "loss": 0.0001, "num_input_tokens_seen": 16040016, "step": 32580 }, { "epoch": 4.300514715586644, "grad_norm": 7.299587014131248e-05, "learning_rate": 1.1693035631525628e-07, "loss": 0.0, "num_input_tokens_seen": 16042640, "step": 32585 }, { "epoch": 4.301174607364392, "grad_norm": 0.0003289075684733689, "learning_rate": 1.1671427115055299e-07, "loss": 0.0, "num_input_tokens_seen": 16045136, "step": 32590 }, { "epoch": 4.3018344991421404, "grad_norm": 1.6217174561461434e-05, "learning_rate": 1.1649837345684954e-07, "loss": 0.0006, "num_input_tokens_seen": 16047696, "step": 32595 }, { "epoch": 4.302494390919889, "grad_norm": 2.654941454238724e-05, "learning_rate": 1.1628266327996827e-07, "loss": 0.0004, "num_input_tokens_seen": 16050000, "step": 32600 }, { "epoch": 4.303154282697638, "grad_norm": 0.0025819791480898857, "learning_rate": 1.1606714066569235e-07, "loss": 0.0003, "num_input_tokens_seen": 16052624, "step": 32605 }, { "epoch": 4.303814174475386, "grad_norm": 22.277074813842773, "learning_rate": 1.1585180565976515e-07, "loss": 0.0361, "num_input_tokens_seen": 16054864, "step": 32610 }, { "epoch": 4.3044740662531344, "grad_norm": 0.00020241711172275245, "learning_rate": 1.1563665830788948e-07, "loss": 0.0, "num_input_tokens_seen": 16057104, "step": 32615 }, { "epoch": 4.305133958030883, "grad_norm": 0.00014118028047960252, "learning_rate": 1.1542169865572904e-07, "loss": 0.0001, "num_input_tokens_seen": 16059472, "step": 32620 }, { "epoch": 4.305793849808631, "grad_norm": 1.2410049748723395e-05, "learning_rate": 1.1520692674890741e-07, "loss": 0.0002, "num_input_tokens_seen": 16061712, "step": 32625 }, { "epoch": 4.30645374158638, "grad_norm": 5.3214229410514235e-05, "learning_rate": 1.149923426330086e-07, "loss": 0.0001, "num_input_tokens_seen": 16064016, "step": 32630 }, { "epoch": 4.3071136333641284, "grad_norm": 2.0495712306001224e-05, "learning_rate": 1.1477794635357618e-07, "loss": 0.0001, "num_input_tokens_seen": 16066192, "step": 32635 }, { "epoch": 4.307773525141877, "grad_norm": 0.00010133234172826633, "learning_rate": 1.145637379561144e-07, "loss": 0.0, "num_input_tokens_seen": 16068368, "step": 32640 }, { "epoch": 4.308433416919625, "grad_norm": 2.9838472983101383e-05, "learning_rate": 1.1434971748608757e-07, "loss": 0.0002, "num_input_tokens_seen": 16070416, "step": 32645 }, { "epoch": 4.309093308697373, "grad_norm": 0.0007358550792559981, "learning_rate": 1.1413588498891957e-07, "loss": 0.0, "num_input_tokens_seen": 16072784, "step": 32650 }, { "epoch": 4.3097532004751224, "grad_norm": 0.09885164350271225, "learning_rate": 1.139222405099951e-07, "loss": 0.0001, "num_input_tokens_seen": 16075280, "step": 32655 }, { "epoch": 4.310413092252871, "grad_norm": 0.0009102729964070022, "learning_rate": 1.137087840946589e-07, "loss": 0.0, "num_input_tokens_seen": 16078032, "step": 32660 }, { "epoch": 4.311072984030619, "grad_norm": 0.000612208095844835, "learning_rate": 1.1349551578821493e-07, "loss": 0.0133, "num_input_tokens_seen": 16080464, "step": 32665 }, { "epoch": 4.311732875808367, "grad_norm": 3.099453169852495e-05, "learning_rate": 1.1328243563592831e-07, "loss": 0.0, "num_input_tokens_seen": 16082960, "step": 32670 }, { "epoch": 4.312392767586116, "grad_norm": 3.5384764487389475e-05, "learning_rate": 1.1306954368302357e-07, "loss": 0.0, "num_input_tokens_seen": 16085456, "step": 32675 }, { "epoch": 4.313052659363865, "grad_norm": 2.569669231888838e-05, "learning_rate": 1.1285683997468564e-07, "loss": 0.0015, "num_input_tokens_seen": 16087504, "step": 32680 }, { "epoch": 4.313712551141613, "grad_norm": 0.0005885774153284729, "learning_rate": 1.1264432455605933e-07, "loss": 0.0, "num_input_tokens_seen": 16089936, "step": 32685 }, { "epoch": 4.314372442919361, "grad_norm": 4.543912291410379e-05, "learning_rate": 1.1243199747224897e-07, "loss": 0.0, "num_input_tokens_seen": 16092112, "step": 32690 }, { "epoch": 4.31503233469711, "grad_norm": 0.0004520398215390742, "learning_rate": 1.122198587683203e-07, "loss": 0.0, "num_input_tokens_seen": 16094544, "step": 32695 }, { "epoch": 4.315692226474858, "grad_norm": 1.6724603483453393e-05, "learning_rate": 1.1200790848929764e-07, "loss": 0.0239, "num_input_tokens_seen": 16096848, "step": 32700 }, { "epoch": 4.316352118252606, "grad_norm": 0.0003861555305775255, "learning_rate": 1.1179614668016624e-07, "loss": 0.0, "num_input_tokens_seen": 16099024, "step": 32705 }, { "epoch": 4.317012010030355, "grad_norm": 0.20465129613876343, "learning_rate": 1.1158457338587047e-07, "loss": 0.0144, "num_input_tokens_seen": 16101776, "step": 32710 }, { "epoch": 4.317671901808104, "grad_norm": 3.3590320526855066e-05, "learning_rate": 1.1137318865131595e-07, "loss": 0.0, "num_input_tokens_seen": 16104144, "step": 32715 }, { "epoch": 4.318331793585852, "grad_norm": 0.000302224128972739, "learning_rate": 1.1116199252136727e-07, "loss": 0.0, "num_input_tokens_seen": 16106512, "step": 32720 }, { "epoch": 4.3189916853636, "grad_norm": 0.12201271951198578, "learning_rate": 1.1095098504084877e-07, "loss": 0.0427, "num_input_tokens_seen": 16108944, "step": 32725 }, { "epoch": 4.319651577141348, "grad_norm": 0.04804328456521034, "learning_rate": 1.1074016625454607e-07, "loss": 0.0001, "num_input_tokens_seen": 16111312, "step": 32730 }, { "epoch": 4.320311468919098, "grad_norm": 0.00011896403884747997, "learning_rate": 1.1052953620720351e-07, "loss": 0.028, "num_input_tokens_seen": 16114000, "step": 32735 }, { "epoch": 4.320971360696846, "grad_norm": 4.947075649397448e-05, "learning_rate": 1.1031909494352588e-07, "loss": 0.0322, "num_input_tokens_seen": 16116112, "step": 32740 }, { "epoch": 4.321631252474594, "grad_norm": 0.0005829980946145952, "learning_rate": 1.1010884250817765e-07, "loss": 0.0533, "num_input_tokens_seen": 16118544, "step": 32745 }, { "epoch": 4.322291144252342, "grad_norm": 5.172559031052515e-05, "learning_rate": 1.098987789457836e-07, "loss": 0.0, "num_input_tokens_seen": 16120976, "step": 32750 }, { "epoch": 4.322951036030091, "grad_norm": 4.1301213059341535e-05, "learning_rate": 1.0968890430092825e-07, "loss": 0.0096, "num_input_tokens_seen": 16123600, "step": 32755 }, { "epoch": 4.32361092780784, "grad_norm": 0.0037700431421399117, "learning_rate": 1.0947921861815557e-07, "loss": 0.0, "num_input_tokens_seen": 16125840, "step": 32760 }, { "epoch": 4.324270819585588, "grad_norm": 0.0031601234804838896, "learning_rate": 1.0926972194197015e-07, "loss": 0.0518, "num_input_tokens_seen": 16128336, "step": 32765 }, { "epoch": 4.324930711363336, "grad_norm": 0.00015431219071615487, "learning_rate": 1.0906041431683632e-07, "loss": 0.0, "num_input_tokens_seen": 16131024, "step": 32770 }, { "epoch": 4.325590603141085, "grad_norm": 0.0005814318428747356, "learning_rate": 1.0885129578717767e-07, "loss": 0.0, "num_input_tokens_seen": 16133712, "step": 32775 }, { "epoch": 4.326250494918833, "grad_norm": 2.0374101950437762e-05, "learning_rate": 1.0864236639737823e-07, "loss": 0.0, "num_input_tokens_seen": 16136208, "step": 32780 }, { "epoch": 4.326910386696582, "grad_norm": 3.942536568501964e-05, "learning_rate": 1.0843362619178187e-07, "loss": 0.0, "num_input_tokens_seen": 16138576, "step": 32785 }, { "epoch": 4.32757027847433, "grad_norm": 5.525383472442627, "learning_rate": 1.0822507521469227e-07, "loss": 0.0014, "num_input_tokens_seen": 16141072, "step": 32790 }, { "epoch": 4.328230170252079, "grad_norm": 6.413905066438019e-05, "learning_rate": 1.0801671351037255e-07, "loss": 0.0, "num_input_tokens_seen": 16143632, "step": 32795 }, { "epoch": 4.328890062029827, "grad_norm": 0.00015124822675716132, "learning_rate": 1.0780854112304626e-07, "loss": 0.0018, "num_input_tokens_seen": 16146320, "step": 32800 }, { "epoch": 4.329549953807575, "grad_norm": 0.0004587690345942974, "learning_rate": 1.076005580968965e-07, "loss": 0.0, "num_input_tokens_seen": 16149008, "step": 32805 }, { "epoch": 4.330209845585324, "grad_norm": 0.0013646406587213278, "learning_rate": 1.0739276447606582e-07, "loss": 0.0, "num_input_tokens_seen": 16151504, "step": 32810 }, { "epoch": 4.330869737363073, "grad_norm": 8.358648483408615e-05, "learning_rate": 1.0718516030465708e-07, "loss": 0.0, "num_input_tokens_seen": 16154320, "step": 32815 }, { "epoch": 4.331529629140821, "grad_norm": 0.0003338223323225975, "learning_rate": 1.0697774562673312e-07, "loss": 0.0, "num_input_tokens_seen": 16156816, "step": 32820 }, { "epoch": 4.332189520918569, "grad_norm": 0.0018141282489523292, "learning_rate": 1.0677052048631563e-07, "loss": 0.0, "num_input_tokens_seen": 16158992, "step": 32825 }, { "epoch": 4.3328494126963175, "grad_norm": 3.3614989661145955e-05, "learning_rate": 1.0656348492738687e-07, "loss": 0.0, "num_input_tokens_seen": 16161296, "step": 32830 }, { "epoch": 4.333509304474067, "grad_norm": 9.122475603362545e-05, "learning_rate": 1.0635663899388881e-07, "loss": 0.0, "num_input_tokens_seen": 16163664, "step": 32835 }, { "epoch": 4.334169196251815, "grad_norm": 6.432763620978221e-05, "learning_rate": 1.0614998272972298e-07, "loss": 0.0533, "num_input_tokens_seen": 16165840, "step": 32840 }, { "epoch": 4.334829088029563, "grad_norm": 2.337733531021513e-05, "learning_rate": 1.0594351617875053e-07, "loss": 0.0683, "num_input_tokens_seen": 16168208, "step": 32845 }, { "epoch": 4.3354889798073115, "grad_norm": 4.805472417501733e-05, "learning_rate": 1.0573723938479217e-07, "loss": 0.0, "num_input_tokens_seen": 16170640, "step": 32850 }, { "epoch": 4.33614887158506, "grad_norm": 0.0001322894386248663, "learning_rate": 1.0553115239162935e-07, "loss": 0.0, "num_input_tokens_seen": 16172880, "step": 32855 }, { "epoch": 4.336808763362809, "grad_norm": 1.8390241166343912e-05, "learning_rate": 1.0532525524300206e-07, "loss": 0.0, "num_input_tokens_seen": 16175248, "step": 32860 }, { "epoch": 4.337468655140557, "grad_norm": 6.988491804804653e-05, "learning_rate": 1.0511954798261058e-07, "loss": 0.0, "num_input_tokens_seen": 16177680, "step": 32865 }, { "epoch": 4.3381285469183055, "grad_norm": 0.0004975342308171093, "learning_rate": 1.0491403065411508e-07, "loss": 0.0472, "num_input_tokens_seen": 16180048, "step": 32870 }, { "epoch": 4.338788438696054, "grad_norm": 0.001267925021238625, "learning_rate": 1.0470870330113457e-07, "loss": 0.0, "num_input_tokens_seen": 16182416, "step": 32875 }, { "epoch": 4.339448330473802, "grad_norm": 0.0012009447673335671, "learning_rate": 1.0450356596724886e-07, "loss": 0.1348, "num_input_tokens_seen": 16184848, "step": 32880 }, { "epoch": 4.34010822225155, "grad_norm": 2.034113094850909e-05, "learning_rate": 1.0429861869599622e-07, "loss": 0.0, "num_input_tokens_seen": 16187280, "step": 32885 }, { "epoch": 4.3407681140292995, "grad_norm": 1.8013641238212585e-05, "learning_rate": 1.0409386153087596e-07, "loss": 0.0004, "num_input_tokens_seen": 16189584, "step": 32890 }, { "epoch": 4.341428005807048, "grad_norm": 0.0014492205809801817, "learning_rate": 1.0388929451534601e-07, "loss": 0.0, "num_input_tokens_seen": 16191760, "step": 32895 }, { "epoch": 4.342087897584796, "grad_norm": 0.0023908542934805155, "learning_rate": 1.0368491769282395e-07, "loss": 0.0, "num_input_tokens_seen": 16194128, "step": 32900 }, { "epoch": 4.342747789362544, "grad_norm": 8.186150080291554e-05, "learning_rate": 1.0348073110668743e-07, "loss": 0.0, "num_input_tokens_seen": 16196752, "step": 32905 }, { "epoch": 4.343407681140293, "grad_norm": 0.0023419694043695927, "learning_rate": 1.0327673480027377e-07, "loss": 0.0, "num_input_tokens_seen": 16199248, "step": 32910 }, { "epoch": 4.344067572918042, "grad_norm": 0.0005303608486428857, "learning_rate": 1.0307292881687968e-07, "loss": 0.0002, "num_input_tokens_seen": 16201808, "step": 32915 }, { "epoch": 4.34472746469579, "grad_norm": 0.1571546196937561, "learning_rate": 1.0286931319976133e-07, "loss": 0.0, "num_input_tokens_seen": 16204304, "step": 32920 }, { "epoch": 4.345387356473538, "grad_norm": 4.8940040869638324e-05, "learning_rate": 1.026658879921346e-07, "loss": 0.0, "num_input_tokens_seen": 16206864, "step": 32925 }, { "epoch": 4.346047248251287, "grad_norm": 6.440455436706543, "learning_rate": 1.024626532371755e-07, "loss": 0.0061, "num_input_tokens_seen": 16209104, "step": 32930 }, { "epoch": 4.346707140029035, "grad_norm": 0.00065460434416309, "learning_rate": 1.0225960897801856e-07, "loss": 0.0001, "num_input_tokens_seen": 16211536, "step": 32935 }, { "epoch": 4.347367031806784, "grad_norm": 4.881566565018147e-05, "learning_rate": 1.0205675525775858e-07, "loss": 0.0, "num_input_tokens_seen": 16213840, "step": 32940 }, { "epoch": 4.348026923584532, "grad_norm": 0.008005255833268166, "learning_rate": 1.0185409211945017e-07, "loss": 0.0, "num_input_tokens_seen": 16216144, "step": 32945 }, { "epoch": 4.348686815362281, "grad_norm": 0.00028780216234736145, "learning_rate": 1.0165161960610669e-07, "loss": 0.0, "num_input_tokens_seen": 16218512, "step": 32950 }, { "epoch": 4.349346707140029, "grad_norm": 6.93507754476741e-05, "learning_rate": 1.0144933776070163e-07, "loss": 0.0, "num_input_tokens_seen": 16221200, "step": 32955 }, { "epoch": 4.350006598917777, "grad_norm": 1.4966816706873942e-05, "learning_rate": 1.012472466261678e-07, "loss": 0.0, "num_input_tokens_seen": 16223632, "step": 32960 }, { "epoch": 4.3506664906955255, "grad_norm": 0.0011455845087766647, "learning_rate": 1.0104534624539785e-07, "loss": 0.001, "num_input_tokens_seen": 16226192, "step": 32965 }, { "epoch": 4.351326382473275, "grad_norm": 0.00605833949521184, "learning_rate": 1.0084363666124318e-07, "loss": 0.0002, "num_input_tokens_seen": 16228432, "step": 32970 }, { "epoch": 4.351986274251023, "grad_norm": 0.0009054954862222075, "learning_rate": 1.0064211791651544e-07, "loss": 0.0, "num_input_tokens_seen": 16230736, "step": 32975 }, { "epoch": 4.352646166028771, "grad_norm": 1.4958550309529528e-05, "learning_rate": 1.0044079005398576e-07, "loss": 0.0001, "num_input_tokens_seen": 16232976, "step": 32980 }, { "epoch": 4.3533060578065195, "grad_norm": 2.5793897293624468e-05, "learning_rate": 1.0023965311638415e-07, "loss": 0.0, "num_input_tokens_seen": 16235408, "step": 32985 }, { "epoch": 4.353965949584268, "grad_norm": 5.313528163242154e-05, "learning_rate": 1.0003870714640061e-07, "loss": 0.0, "num_input_tokens_seen": 16238032, "step": 32990 }, { "epoch": 4.354625841362017, "grad_norm": 3.9087779441615567e-05, "learning_rate": 9.983795218668456e-08, "loss": 0.002, "num_input_tokens_seen": 16240976, "step": 32995 }, { "epoch": 4.355285733139765, "grad_norm": 3.251605812693015e-05, "learning_rate": 9.963738827984458e-08, "loss": 0.0384, "num_input_tokens_seen": 16243088, "step": 33000 }, { "epoch": 4.3559456249175135, "grad_norm": 0.00026064313715323806, "learning_rate": 9.943701546844906e-08, "loss": 0.0, "num_input_tokens_seen": 16245520, "step": 33005 }, { "epoch": 4.356605516695262, "grad_norm": 0.02836507558822632, "learning_rate": 9.923683379502557e-08, "loss": 0.0337, "num_input_tokens_seen": 16248016, "step": 33010 }, { "epoch": 4.35726540847301, "grad_norm": 2.0125011360505596e-05, "learning_rate": 9.903684330206152e-08, "loss": 0.0, "num_input_tokens_seen": 16250320, "step": 33015 }, { "epoch": 4.357925300250759, "grad_norm": 0.0007408508099615574, "learning_rate": 9.8837044032003e-08, "loss": 0.0, "num_input_tokens_seen": 16253072, "step": 33020 }, { "epoch": 4.3585851920285075, "grad_norm": 0.0003739091625902802, "learning_rate": 9.863743602725627e-08, "loss": 0.0, "num_input_tokens_seen": 16255696, "step": 33025 }, { "epoch": 4.359245083806256, "grad_norm": 2.6183059162576683e-05, "learning_rate": 9.843801933018669e-08, "loss": 0.0, "num_input_tokens_seen": 16258256, "step": 33030 }, { "epoch": 4.359904975584004, "grad_norm": 0.07573069632053375, "learning_rate": 9.823879398311874e-08, "loss": 0.0, "num_input_tokens_seen": 16260752, "step": 33035 }, { "epoch": 4.360564867361752, "grad_norm": 7.156374340411276e-05, "learning_rate": 9.803976002833692e-08, "loss": 0.0226, "num_input_tokens_seen": 16263440, "step": 33040 }, { "epoch": 4.3612247591395015, "grad_norm": 0.00026338372845202684, "learning_rate": 9.78409175080841e-08, "loss": 0.0009, "num_input_tokens_seen": 16266000, "step": 33045 }, { "epoch": 4.36188465091725, "grad_norm": 7.694336090935394e-05, "learning_rate": 9.764226646456408e-08, "loss": 0.0, "num_input_tokens_seen": 16268624, "step": 33050 }, { "epoch": 4.362544542694998, "grad_norm": 2.3234377295011654e-05, "learning_rate": 9.744380693993858e-08, "loss": 0.0, "num_input_tokens_seen": 16270992, "step": 33055 }, { "epoch": 4.363204434472746, "grad_norm": 0.00016954565944615752, "learning_rate": 9.724553897632893e-08, "loss": 0.0, "num_input_tokens_seen": 16273424, "step": 33060 }, { "epoch": 4.363864326250495, "grad_norm": 2.0216995835653506e-05, "learning_rate": 9.704746261581675e-08, "loss": 0.0441, "num_input_tokens_seen": 16275728, "step": 33065 }, { "epoch": 4.364524218028244, "grad_norm": 0.00014168783673085272, "learning_rate": 9.684957790044179e-08, "loss": 0.0, "num_input_tokens_seen": 16277904, "step": 33070 }, { "epoch": 4.365184109805992, "grad_norm": 0.0001042889998643659, "learning_rate": 9.665188487220399e-08, "loss": 0.0001, "num_input_tokens_seen": 16280720, "step": 33075 }, { "epoch": 4.36584400158374, "grad_norm": 6.27780391369015e-05, "learning_rate": 9.64543835730619e-08, "loss": 0.0, "num_input_tokens_seen": 16283088, "step": 33080 }, { "epoch": 4.366503893361489, "grad_norm": 2.529203447920736e-05, "learning_rate": 9.625707404493399e-08, "loss": 0.0, "num_input_tokens_seen": 16285520, "step": 33085 }, { "epoch": 4.367163785139237, "grad_norm": 0.00032823492074385285, "learning_rate": 9.605995632969787e-08, "loss": 0.0001, "num_input_tokens_seen": 16287888, "step": 33090 }, { "epoch": 4.367823676916986, "grad_norm": 19.19044303894043, "learning_rate": 9.586303046919008e-08, "loss": 0.0226, "num_input_tokens_seen": 16290256, "step": 33095 }, { "epoch": 4.368483568694734, "grad_norm": 0.0008443885017186403, "learning_rate": 9.566629650520675e-08, "loss": 0.0, "num_input_tokens_seen": 16292496, "step": 33100 }, { "epoch": 4.369143460472483, "grad_norm": 0.19148346781730652, "learning_rate": 9.546975447950345e-08, "loss": 0.0001, "num_input_tokens_seen": 16294864, "step": 33105 }, { "epoch": 4.369803352250231, "grad_norm": 0.0007681222632527351, "learning_rate": 9.527340443379461e-08, "loss": 0.0, "num_input_tokens_seen": 16297616, "step": 33110 }, { "epoch": 4.370463244027979, "grad_norm": 0.011789199896156788, "learning_rate": 9.507724640975412e-08, "loss": 0.0, "num_input_tokens_seen": 16300048, "step": 33115 }, { "epoch": 4.371123135805728, "grad_norm": 0.00011409088619984686, "learning_rate": 9.488128044901511e-08, "loss": 0.0, "num_input_tokens_seen": 16302608, "step": 33120 }, { "epoch": 4.371783027583477, "grad_norm": 0.00021923432359471917, "learning_rate": 9.468550659317009e-08, "loss": 0.0715, "num_input_tokens_seen": 16305232, "step": 33125 }, { "epoch": 4.372442919361225, "grad_norm": 0.00023443755344487727, "learning_rate": 9.44899248837705e-08, "loss": 0.028, "num_input_tokens_seen": 16307536, "step": 33130 }, { "epoch": 4.373102811138973, "grad_norm": 1.5468593119294383e-05, "learning_rate": 9.4294535362327e-08, "loss": 0.0003, "num_input_tokens_seen": 16310160, "step": 33135 }, { "epoch": 4.373762702916721, "grad_norm": 1.1573849405976944e-05, "learning_rate": 9.409933807031012e-08, "loss": 0.0001, "num_input_tokens_seen": 16312976, "step": 33140 }, { "epoch": 4.37442259469447, "grad_norm": 2.0115514416829683e-05, "learning_rate": 9.390433304914846e-08, "loss": 0.0, "num_input_tokens_seen": 16315216, "step": 33145 }, { "epoch": 4.375082486472219, "grad_norm": 0.0004016650200355798, "learning_rate": 9.370952034023061e-08, "loss": 0.0502, "num_input_tokens_seen": 16317584, "step": 33150 }, { "epoch": 4.375742378249967, "grad_norm": 0.019395913928747177, "learning_rate": 9.351489998490447e-08, "loss": 0.0, "num_input_tokens_seen": 16319952, "step": 33155 }, { "epoch": 4.376402270027715, "grad_norm": 2.075076918117702e-05, "learning_rate": 9.332047202447635e-08, "loss": 0.0, "num_input_tokens_seen": 16322576, "step": 33160 }, { "epoch": 4.377062161805464, "grad_norm": 0.0009933270048350096, "learning_rate": 9.312623650021245e-08, "loss": 0.0, "num_input_tokens_seen": 16325328, "step": 33165 }, { "epoch": 4.377722053583212, "grad_norm": 0.04395154118537903, "learning_rate": 9.29321934533378e-08, "loss": 0.0001, "num_input_tokens_seen": 16327568, "step": 33170 }, { "epoch": 4.378381945360961, "grad_norm": 0.003907402511686087, "learning_rate": 9.273834292503668e-08, "loss": 0.0, "num_input_tokens_seen": 16330384, "step": 33175 }, { "epoch": 4.379041837138709, "grad_norm": 0.04484650120139122, "learning_rate": 9.254468495645251e-08, "loss": 0.0, "num_input_tokens_seen": 16332624, "step": 33180 }, { "epoch": 4.379701728916458, "grad_norm": 6.542204937431961e-05, "learning_rate": 9.235121958868731e-08, "loss": 0.0, "num_input_tokens_seen": 16334928, "step": 33185 }, { "epoch": 4.380361620694206, "grad_norm": 6.131920963525772e-05, "learning_rate": 9.215794686280343e-08, "loss": 0.0004, "num_input_tokens_seen": 16337552, "step": 33190 }, { "epoch": 4.381021512471954, "grad_norm": 0.003037465503439307, "learning_rate": 9.196486681982096e-08, "loss": 0.0, "num_input_tokens_seen": 16340112, "step": 33195 }, { "epoch": 4.381681404249703, "grad_norm": 0.16938516497612, "learning_rate": 9.177197950072012e-08, "loss": 0.0001, "num_input_tokens_seen": 16342416, "step": 33200 }, { "epoch": 4.382341296027452, "grad_norm": 0.004777638241648674, "learning_rate": 9.157928494644007e-08, "loss": 0.0366, "num_input_tokens_seen": 16344912, "step": 33205 }, { "epoch": 4.3830011878052, "grad_norm": 5.403992690844461e-05, "learning_rate": 9.138678319787818e-08, "loss": 0.0001, "num_input_tokens_seen": 16347728, "step": 33210 }, { "epoch": 4.383661079582948, "grad_norm": 0.0020157424733042717, "learning_rate": 9.119447429589212e-08, "loss": 0.0, "num_input_tokens_seen": 16350352, "step": 33215 }, { "epoch": 4.3843209713606965, "grad_norm": 0.008629385381937027, "learning_rate": 9.100235828129743e-08, "loss": 0.0, "num_input_tokens_seen": 16352784, "step": 33220 }, { "epoch": 4.384980863138446, "grad_norm": 0.0016030854312703013, "learning_rate": 9.08104351948702e-08, "loss": 0.0066, "num_input_tokens_seen": 16355344, "step": 33225 }, { "epoch": 4.385640754916194, "grad_norm": 0.0016935844905674458, "learning_rate": 9.061870507734426e-08, "loss": 0.0003, "num_input_tokens_seen": 16357712, "step": 33230 }, { "epoch": 4.386300646693942, "grad_norm": 0.00036475996603257954, "learning_rate": 9.042716796941275e-08, "loss": 0.0, "num_input_tokens_seen": 16360144, "step": 33235 }, { "epoch": 4.3869605384716905, "grad_norm": 0.0006020345608703792, "learning_rate": 9.023582391172813e-08, "loss": 0.0, "num_input_tokens_seen": 16362576, "step": 33240 }, { "epoch": 4.387620430249439, "grad_norm": 5.8977642765967175e-05, "learning_rate": 9.004467294490203e-08, "loss": 0.0, "num_input_tokens_seen": 16365072, "step": 33245 }, { "epoch": 4.388280322027187, "grad_norm": 4.70478662464302e-05, "learning_rate": 8.98537151095048e-08, "loss": 0.0, "num_input_tokens_seen": 16367568, "step": 33250 }, { "epoch": 4.388940213804936, "grad_norm": 0.0005329661653377116, "learning_rate": 8.966295044606565e-08, "loss": 0.0, "num_input_tokens_seen": 16370128, "step": 33255 }, { "epoch": 4.3896001055826845, "grad_norm": 2.0265884813852608e-05, "learning_rate": 8.94723789950731e-08, "loss": 0.0, "num_input_tokens_seen": 16372688, "step": 33260 }, { "epoch": 4.390259997360433, "grad_norm": 3.663907409645617e-05, "learning_rate": 8.928200079697479e-08, "loss": 0.0, "num_input_tokens_seen": 16375120, "step": 33265 }, { "epoch": 4.390919889138181, "grad_norm": 1.8534060716629028, "learning_rate": 8.909181589217674e-08, "loss": 0.0006, "num_input_tokens_seen": 16377616, "step": 33270 }, { "epoch": 4.391579780915929, "grad_norm": 0.001200351631268859, "learning_rate": 8.890182432104443e-08, "loss": 0.0, "num_input_tokens_seen": 16380112, "step": 33275 }, { "epoch": 4.3922396726936785, "grad_norm": 1.7528962416690774e-05, "learning_rate": 8.871202612390249e-08, "loss": 0.0, "num_input_tokens_seen": 16382544, "step": 33280 }, { "epoch": 4.392899564471427, "grad_norm": 2.8107933758292347e-05, "learning_rate": 8.852242134103383e-08, "loss": 0.0, "num_input_tokens_seen": 16385104, "step": 33285 }, { "epoch": 4.393559456249175, "grad_norm": 0.0058558168821036816, "learning_rate": 8.833301001268078e-08, "loss": 0.0, "num_input_tokens_seen": 16387536, "step": 33290 }, { "epoch": 4.394219348026923, "grad_norm": 24.756563186645508, "learning_rate": 8.814379217904455e-08, "loss": 0.0188, "num_input_tokens_seen": 16389840, "step": 33295 }, { "epoch": 4.394879239804672, "grad_norm": 1.8636386812431738e-05, "learning_rate": 8.795476788028555e-08, "loss": 0.0153, "num_input_tokens_seen": 16392080, "step": 33300 }, { "epoch": 4.395539131582421, "grad_norm": 0.0011585361789911985, "learning_rate": 8.776593715652226e-08, "loss": 0.0001, "num_input_tokens_seen": 16394384, "step": 33305 }, { "epoch": 4.396199023360169, "grad_norm": 2.210846185684204, "learning_rate": 8.757730004783303e-08, "loss": 0.002, "num_input_tokens_seen": 16397072, "step": 33310 }, { "epoch": 4.396858915137917, "grad_norm": 0.018569767475128174, "learning_rate": 8.738885659425477e-08, "loss": 0.0626, "num_input_tokens_seen": 16399696, "step": 33315 }, { "epoch": 4.397518806915666, "grad_norm": 1.4774296687392052e-05, "learning_rate": 8.72006068357829e-08, "loss": 0.0, "num_input_tokens_seen": 16402256, "step": 33320 }, { "epoch": 4.398178698693414, "grad_norm": 0.1812116652727127, "learning_rate": 8.701255081237225e-08, "loss": 0.0001, "num_input_tokens_seen": 16404944, "step": 33325 }, { "epoch": 4.398838590471163, "grad_norm": 0.00019690478802658617, "learning_rate": 8.682468856393654e-08, "loss": 0.0, "num_input_tokens_seen": 16407248, "step": 33330 }, { "epoch": 4.399498482248911, "grad_norm": 5.9989270084770396e-05, "learning_rate": 8.66370201303478e-08, "loss": 0.0, "num_input_tokens_seen": 16409424, "step": 33335 }, { "epoch": 4.40015837402666, "grad_norm": 0.08432416617870331, "learning_rate": 8.644954555143757e-08, "loss": 0.0004, "num_input_tokens_seen": 16412048, "step": 33340 }, { "epoch": 4.400818265804408, "grad_norm": 0.000751759042032063, "learning_rate": 8.626226486699573e-08, "loss": 0.0002, "num_input_tokens_seen": 16414736, "step": 33345 }, { "epoch": 4.401478157582156, "grad_norm": 3.477888094494119e-05, "learning_rate": 8.607517811677168e-08, "loss": 0.0, "num_input_tokens_seen": 16417232, "step": 33350 }, { "epoch": 4.402138049359905, "grad_norm": 0.002841503359377384, "learning_rate": 8.588828534047276e-08, "loss": 0.0188, "num_input_tokens_seen": 16419728, "step": 33355 }, { "epoch": 4.402797941137654, "grad_norm": 0.0006798732210882008, "learning_rate": 8.570158657776582e-08, "loss": 0.0, "num_input_tokens_seen": 16422288, "step": 33360 }, { "epoch": 4.403457832915402, "grad_norm": 0.007670256774872541, "learning_rate": 8.551508186827639e-08, "loss": 0.0782, "num_input_tokens_seen": 16424784, "step": 33365 }, { "epoch": 4.40411772469315, "grad_norm": 0.00018999635358341038, "learning_rate": 8.532877125158854e-08, "loss": 0.0, "num_input_tokens_seen": 16427280, "step": 33370 }, { "epoch": 4.4047776164708985, "grad_norm": 0.0015463822055608034, "learning_rate": 8.514265476724547e-08, "loss": 0.0366, "num_input_tokens_seen": 16429840, "step": 33375 }, { "epoch": 4.405437508248648, "grad_norm": 5.6568584113847464e-05, "learning_rate": 8.49567324547491e-08, "loss": 0.0, "num_input_tokens_seen": 16432208, "step": 33380 }, { "epoch": 4.406097400026396, "grad_norm": 0.009160442277789116, "learning_rate": 8.47710043535601e-08, "loss": 0.0001, "num_input_tokens_seen": 16434960, "step": 33385 }, { "epoch": 4.406757291804144, "grad_norm": 2.5621367967687547e-05, "learning_rate": 8.458547050309794e-08, "loss": 0.0, "num_input_tokens_seen": 16437584, "step": 33390 }, { "epoch": 4.4074171835818925, "grad_norm": 2.901063453464303e-05, "learning_rate": 8.440013094274035e-08, "loss": 0.0, "num_input_tokens_seen": 16440144, "step": 33395 }, { "epoch": 4.408077075359641, "grad_norm": 0.425859659910202, "learning_rate": 8.421498571182517e-08, "loss": 0.0004, "num_input_tokens_seen": 16442704, "step": 33400 }, { "epoch": 4.40873696713739, "grad_norm": 0.0017851804150268435, "learning_rate": 8.403003484964743e-08, "loss": 0.0, "num_input_tokens_seen": 16445008, "step": 33405 }, { "epoch": 4.409396858915138, "grad_norm": 2.694344766496215e-05, "learning_rate": 8.384527839546196e-08, "loss": 0.0, "num_input_tokens_seen": 16447248, "step": 33410 }, { "epoch": 4.4100567506928865, "grad_norm": 0.001166831818409264, "learning_rate": 8.366071638848183e-08, "loss": 0.0, "num_input_tokens_seen": 16450128, "step": 33415 }, { "epoch": 4.410716642470635, "grad_norm": 0.00015432581130880862, "learning_rate": 8.347634886787901e-08, "loss": 0.028, "num_input_tokens_seen": 16452752, "step": 33420 }, { "epoch": 4.411376534248383, "grad_norm": 6.231493171071634e-05, "learning_rate": 8.329217587278437e-08, "loss": 0.0, "num_input_tokens_seen": 16455248, "step": 33425 }, { "epoch": 4.412036426026131, "grad_norm": 0.00045959983253851533, "learning_rate": 8.310819744228691e-08, "loss": 0.0, "num_input_tokens_seen": 16457616, "step": 33430 }, { "epoch": 4.4126963178038805, "grad_norm": 0.001372020342387259, "learning_rate": 8.29244136154349e-08, "loss": 0.0001, "num_input_tokens_seen": 16459984, "step": 33435 }, { "epoch": 4.413356209581629, "grad_norm": 2.6552370400168e-05, "learning_rate": 8.274082443123543e-08, "loss": 0.0, "num_input_tokens_seen": 16462480, "step": 33440 }, { "epoch": 4.414016101359377, "grad_norm": 0.0005929334438405931, "learning_rate": 8.255742992865356e-08, "loss": 0.0308, "num_input_tokens_seen": 16465040, "step": 33445 }, { "epoch": 4.414675993137125, "grad_norm": 0.00015467203047592193, "learning_rate": 8.237423014661348e-08, "loss": 0.0, "num_input_tokens_seen": 16467728, "step": 33450 }, { "epoch": 4.415335884914874, "grad_norm": 0.0003078359295614064, "learning_rate": 8.219122512399813e-08, "loss": 0.0, "num_input_tokens_seen": 16469968, "step": 33455 }, { "epoch": 4.415995776692623, "grad_norm": 2.4373392079724e-05, "learning_rate": 8.200841489964927e-08, "loss": 0.0002, "num_input_tokens_seen": 16472592, "step": 33460 }, { "epoch": 4.416655668470371, "grad_norm": 0.0010042509529739618, "learning_rate": 8.182579951236657e-08, "loss": 0.0, "num_input_tokens_seen": 16475024, "step": 33465 }, { "epoch": 4.417315560248119, "grad_norm": 7.21759715816006e-05, "learning_rate": 8.164337900090901e-08, "loss": 0.0, "num_input_tokens_seen": 16477520, "step": 33470 }, { "epoch": 4.417975452025868, "grad_norm": 7.079127681208774e-05, "learning_rate": 8.146115340399418e-08, "loss": 0.0003, "num_input_tokens_seen": 16480016, "step": 33475 }, { "epoch": 4.418635343803616, "grad_norm": 1.6037702152971178e-05, "learning_rate": 8.127912276029781e-08, "loss": 0.0, "num_input_tokens_seen": 16482256, "step": 33480 }, { "epoch": 4.419295235581365, "grad_norm": 5.478865568875335e-05, "learning_rate": 8.109728710845488e-08, "loss": 0.0, "num_input_tokens_seen": 16484496, "step": 33485 }, { "epoch": 4.419955127359113, "grad_norm": 2.6513811462791637e-05, "learning_rate": 8.091564648705874e-08, "loss": 0.0, "num_input_tokens_seen": 16486864, "step": 33490 }, { "epoch": 4.420615019136862, "grad_norm": 0.00030529368086718023, "learning_rate": 8.073420093466087e-08, "loss": 0.0006, "num_input_tokens_seen": 16489168, "step": 33495 }, { "epoch": 4.42127491091461, "grad_norm": 0.0018798833480104804, "learning_rate": 8.055295048977218e-08, "loss": 0.0426, "num_input_tokens_seen": 16491792, "step": 33500 }, { "epoch": 4.421934802692358, "grad_norm": 0.00010593536717351526, "learning_rate": 8.037189519086163e-08, "loss": 0.0472, "num_input_tokens_seen": 16494096, "step": 33505 }, { "epoch": 4.4225946944701064, "grad_norm": 0.00031165831023827195, "learning_rate": 8.019103507635704e-08, "loss": 0.0001, "num_input_tokens_seen": 16496720, "step": 33510 }, { "epoch": 4.423254586247856, "grad_norm": 0.00030534231336787343, "learning_rate": 8.00103701846443e-08, "loss": 0.0, "num_input_tokens_seen": 16499152, "step": 33515 }, { "epoch": 4.423914478025604, "grad_norm": 6.27004337310791, "learning_rate": 7.982990055406846e-08, "loss": 0.0025, "num_input_tokens_seen": 16501520, "step": 33520 }, { "epoch": 4.424574369803352, "grad_norm": 0.00011541576532181352, "learning_rate": 7.964962622293314e-08, "loss": 0.0001, "num_input_tokens_seen": 16503824, "step": 33525 }, { "epoch": 4.4252342615811004, "grad_norm": 2.0695533748948947e-05, "learning_rate": 7.946954722949972e-08, "loss": 0.0, "num_input_tokens_seen": 16506512, "step": 33530 }, { "epoch": 4.425894153358849, "grad_norm": 2.451527507218998e-05, "learning_rate": 7.928966361198897e-08, "loss": 0.0, "num_input_tokens_seen": 16508880, "step": 33535 }, { "epoch": 4.426554045136598, "grad_norm": 1.954801700776443e-05, "learning_rate": 7.910997540858011e-08, "loss": 0.0001, "num_input_tokens_seen": 16511120, "step": 33540 }, { "epoch": 4.427213936914346, "grad_norm": 0.00020451426098588854, "learning_rate": 7.89304826574102e-08, "loss": 0.0, "num_input_tokens_seen": 16513424, "step": 33545 }, { "epoch": 4.4278738286920944, "grad_norm": 0.003469746559858322, "learning_rate": 7.875118539657566e-08, "loss": 0.0, "num_input_tokens_seen": 16515664, "step": 33550 }, { "epoch": 4.428533720469843, "grad_norm": 0.00537085859104991, "learning_rate": 7.857208366413048e-08, "loss": 0.0, "num_input_tokens_seen": 16518224, "step": 33555 }, { "epoch": 4.429193612247591, "grad_norm": 4.522494418779388e-05, "learning_rate": 7.839317749808838e-08, "loss": 0.0006, "num_input_tokens_seen": 16520528, "step": 33560 }, { "epoch": 4.42985350402534, "grad_norm": 7.53160347812809e-05, "learning_rate": 7.821446693642064e-08, "loss": 0.0, "num_input_tokens_seen": 16522896, "step": 33565 }, { "epoch": 4.4305133958030885, "grad_norm": 0.0006427129846997559, "learning_rate": 7.803595201705692e-08, "loss": 0.0, "num_input_tokens_seen": 16525392, "step": 33570 }, { "epoch": 4.431173287580837, "grad_norm": 0.0001272416702704504, "learning_rate": 7.785763277788648e-08, "loss": 0.0001, "num_input_tokens_seen": 16527952, "step": 33575 }, { "epoch": 4.431833179358585, "grad_norm": 0.02497689425945282, "learning_rate": 7.767950925675559e-08, "loss": 0.0, "num_input_tokens_seen": 16530384, "step": 33580 }, { "epoch": 4.432493071136333, "grad_norm": 0.0012441120343282819, "learning_rate": 7.750158149147012e-08, "loss": 0.0, "num_input_tokens_seen": 16532752, "step": 33585 }, { "epoch": 4.4331529629140825, "grad_norm": 0.00025881710462272167, "learning_rate": 7.732384951979354e-08, "loss": 0.0, "num_input_tokens_seen": 16535248, "step": 33590 }, { "epoch": 4.433812854691831, "grad_norm": 0.03984666243195534, "learning_rate": 7.714631337944854e-08, "loss": 0.0, "num_input_tokens_seen": 16537680, "step": 33595 }, { "epoch": 4.434472746469579, "grad_norm": 0.004693345166742802, "learning_rate": 7.696897310811579e-08, "loss": 0.0, "num_input_tokens_seen": 16540304, "step": 33600 }, { "epoch": 4.435132638247327, "grad_norm": 0.13585327565670013, "learning_rate": 7.679182874343437e-08, "loss": 0.092, "num_input_tokens_seen": 16542992, "step": 33605 }, { "epoch": 4.435792530025076, "grad_norm": 0.0002721291675698012, "learning_rate": 7.66148803230019e-08, "loss": 0.0072, "num_input_tokens_seen": 16545616, "step": 33610 }, { "epoch": 4.436452421802825, "grad_norm": 0.9574349522590637, "learning_rate": 7.643812788437454e-08, "loss": 0.0002, "num_input_tokens_seen": 16548048, "step": 33615 }, { "epoch": 4.437112313580573, "grad_norm": 0.00035204915911890566, "learning_rate": 7.626157146506651e-08, "loss": 0.0, "num_input_tokens_seen": 16550288, "step": 33620 }, { "epoch": 4.437772205358321, "grad_norm": 0.0347675122320652, "learning_rate": 7.608521110255084e-08, "loss": 0.008, "num_input_tokens_seen": 16552720, "step": 33625 }, { "epoch": 4.43843209713607, "grad_norm": 6.068991933716461e-05, "learning_rate": 7.590904683425858e-08, "loss": 0.0, "num_input_tokens_seen": 16555024, "step": 33630 }, { "epoch": 4.439091988913818, "grad_norm": 0.004437305498868227, "learning_rate": 7.57330786975795e-08, "loss": 0.0, "num_input_tokens_seen": 16557520, "step": 33635 }, { "epoch": 4.439751880691567, "grad_norm": 5.0391710828989744e-05, "learning_rate": 7.555730672986138e-08, "loss": 0.0, "num_input_tokens_seen": 16559824, "step": 33640 }, { "epoch": 4.440411772469315, "grad_norm": 1.7985103113460355e-05, "learning_rate": 7.53817309684106e-08, "loss": 0.0, "num_input_tokens_seen": 16562256, "step": 33645 }, { "epoch": 4.441071664247064, "grad_norm": 0.0015585446963086724, "learning_rate": 7.520635145049193e-08, "loss": 0.0001, "num_input_tokens_seen": 16564688, "step": 33650 }, { "epoch": 4.441731556024812, "grad_norm": 0.0017445468110963702, "learning_rate": 7.503116821332834e-08, "loss": 0.028, "num_input_tokens_seen": 16566928, "step": 33655 }, { "epoch": 4.44239144780256, "grad_norm": 0.010277300141751766, "learning_rate": 7.485618129410109e-08, "loss": 0.0, "num_input_tokens_seen": 16569296, "step": 33660 }, { "epoch": 4.443051339580309, "grad_norm": 2.531162681407295e-05, "learning_rate": 7.468139072994994e-08, "loss": 0.0, "num_input_tokens_seen": 16571728, "step": 33665 }, { "epoch": 4.443711231358058, "grad_norm": 1.1785974502563477, "learning_rate": 7.450679655797321e-08, "loss": 0.0015, "num_input_tokens_seen": 16574160, "step": 33670 }, { "epoch": 4.444371123135806, "grad_norm": 1.7777702808380127, "learning_rate": 7.433239881522691e-08, "loss": 0.0018, "num_input_tokens_seen": 16576336, "step": 33675 }, { "epoch": 4.445031014913554, "grad_norm": 0.07361600548028946, "learning_rate": 7.415819753872576e-08, "loss": 0.0001, "num_input_tokens_seen": 16578768, "step": 33680 }, { "epoch": 4.445690906691302, "grad_norm": 0.00013445514196064323, "learning_rate": 7.398419276544287e-08, "loss": 0.0, "num_input_tokens_seen": 16581136, "step": 33685 }, { "epoch": 4.446350798469051, "grad_norm": 0.0029635755345225334, "learning_rate": 7.381038453230925e-08, "loss": 0.0049, "num_input_tokens_seen": 16583568, "step": 33690 }, { "epoch": 4.4470106902468, "grad_norm": 9.411584854125977, "learning_rate": 7.363677287621462e-08, "loss": 0.028, "num_input_tokens_seen": 16586000, "step": 33695 }, { "epoch": 4.447670582024548, "grad_norm": 0.00021924672182649374, "learning_rate": 7.346335783400693e-08, "loss": 0.0, "num_input_tokens_seen": 16588368, "step": 33700 }, { "epoch": 4.448330473802296, "grad_norm": 0.0011029281886294484, "learning_rate": 7.329013944249186e-08, "loss": 0.0, "num_input_tokens_seen": 16590736, "step": 33705 }, { "epoch": 4.448990365580045, "grad_norm": 4.7456309403060004e-05, "learning_rate": 7.311711773843399e-08, "loss": 0.0, "num_input_tokens_seen": 16593168, "step": 33710 }, { "epoch": 4.449650257357793, "grad_norm": 0.003539201570674777, "learning_rate": 7.294429275855596e-08, "loss": 0.0294, "num_input_tokens_seen": 16595472, "step": 33715 }, { "epoch": 4.450310149135542, "grad_norm": 4.4052645534975454e-05, "learning_rate": 7.277166453953865e-08, "loss": 0.0, "num_input_tokens_seen": 16597584, "step": 33720 }, { "epoch": 4.45097004091329, "grad_norm": 6.079300874262117e-05, "learning_rate": 7.259923311802119e-08, "loss": 0.0005, "num_input_tokens_seen": 16600080, "step": 33725 }, { "epoch": 4.451629932691039, "grad_norm": 9.078793482331093e-06, "learning_rate": 7.242699853060041e-08, "loss": 0.0, "num_input_tokens_seen": 16602576, "step": 33730 }, { "epoch": 4.452289824468787, "grad_norm": 0.0034223340917378664, "learning_rate": 7.225496081383264e-08, "loss": 0.0001, "num_input_tokens_seen": 16605200, "step": 33735 }, { "epoch": 4.452949716246535, "grad_norm": 2.6993599021807313e-05, "learning_rate": 7.2083120004231e-08, "loss": 0.0003, "num_input_tokens_seen": 16607568, "step": 33740 }, { "epoch": 4.453609608024284, "grad_norm": 4.951494702254422e-05, "learning_rate": 7.191147613826787e-08, "loss": 0.0, "num_input_tokens_seen": 16610448, "step": 33745 }, { "epoch": 4.454269499802033, "grad_norm": 3.3548680221429095e-05, "learning_rate": 7.17400292523731e-08, "loss": 0.0001, "num_input_tokens_seen": 16613136, "step": 33750 }, { "epoch": 4.454929391579781, "grad_norm": 5.778231570729986e-05, "learning_rate": 7.156877938293515e-08, "loss": 0.0, "num_input_tokens_seen": 16615632, "step": 33755 }, { "epoch": 4.455589283357529, "grad_norm": 5.531049828277901e-05, "learning_rate": 7.139772656630083e-08, "loss": 0.0, "num_input_tokens_seen": 16618192, "step": 33760 }, { "epoch": 4.4562491751352775, "grad_norm": 0.0002507556928321719, "learning_rate": 7.122687083877422e-08, "loss": 0.0, "num_input_tokens_seen": 16620496, "step": 33765 }, { "epoch": 4.456909066913026, "grad_norm": 1.9328092093928717e-05, "learning_rate": 7.105621223661906e-08, "loss": 0.0381, "num_input_tokens_seen": 16622864, "step": 33770 }, { "epoch": 4.457568958690775, "grad_norm": 0.000141787197208032, "learning_rate": 7.088575079605585e-08, "loss": 0.0, "num_input_tokens_seen": 16625360, "step": 33775 }, { "epoch": 4.458228850468523, "grad_norm": 1.3037359167356044e-05, "learning_rate": 7.071548655326387e-08, "loss": 0.0, "num_input_tokens_seen": 16627856, "step": 33780 }, { "epoch": 4.4588887422462715, "grad_norm": 0.00021317604114301503, "learning_rate": 7.054541954438053e-08, "loss": 0.0, "num_input_tokens_seen": 16630544, "step": 33785 }, { "epoch": 4.45954863402402, "grad_norm": 0.0002955278323497623, "learning_rate": 7.03755498055012e-08, "loss": 0.0, "num_input_tokens_seen": 16633104, "step": 33790 }, { "epoch": 4.460208525801768, "grad_norm": 1.046508550643921, "learning_rate": 7.02058773726798e-08, "loss": 0.0386, "num_input_tokens_seen": 16635728, "step": 33795 }, { "epoch": 4.460868417579517, "grad_norm": 0.00010222404671367258, "learning_rate": 7.003640228192775e-08, "loss": 0.0, "num_input_tokens_seen": 16637904, "step": 33800 }, { "epoch": 4.4615283093572655, "grad_norm": 0.0018340600654482841, "learning_rate": 6.986712456921506e-08, "loss": 0.0, "num_input_tokens_seen": 16640208, "step": 33805 }, { "epoch": 4.462188201135014, "grad_norm": 0.0010408456437289715, "learning_rate": 6.969804427046988e-08, "loss": 0.0, "num_input_tokens_seen": 16642640, "step": 33810 }, { "epoch": 4.462848092912762, "grad_norm": 4.860827175434679e-05, "learning_rate": 6.952916142157783e-08, "loss": 0.0239, "num_input_tokens_seen": 16645136, "step": 33815 }, { "epoch": 4.46350798469051, "grad_norm": 3.999754699179903e-05, "learning_rate": 6.936047605838347e-08, "loss": 0.0, "num_input_tokens_seen": 16647376, "step": 33820 }, { "epoch": 4.4641678764682595, "grad_norm": 4.946894841850735e-05, "learning_rate": 6.919198821668892e-08, "loss": 0.0, "num_input_tokens_seen": 16649616, "step": 33825 }, { "epoch": 4.464827768246008, "grad_norm": 5.7902558182831854e-05, "learning_rate": 6.902369793225437e-08, "loss": 0.0, "num_input_tokens_seen": 16652048, "step": 33830 }, { "epoch": 4.465487660023756, "grad_norm": 0.0020352238789200783, "learning_rate": 6.885560524079837e-08, "loss": 0.0, "num_input_tokens_seen": 16654544, "step": 33835 }, { "epoch": 4.466147551801504, "grad_norm": 0.0005612990353256464, "learning_rate": 6.868771017799735e-08, "loss": 0.001, "num_input_tokens_seen": 16657104, "step": 33840 }, { "epoch": 4.466807443579253, "grad_norm": 12.53954792022705, "learning_rate": 6.852001277948593e-08, "loss": 0.0366, "num_input_tokens_seen": 16659600, "step": 33845 }, { "epoch": 4.467467335357002, "grad_norm": 0.00031025375938043, "learning_rate": 6.835251308085644e-08, "loss": 0.0, "num_input_tokens_seen": 16662352, "step": 33850 }, { "epoch": 4.46812722713475, "grad_norm": 2.4980216039693914e-05, "learning_rate": 6.818521111765952e-08, "loss": 0.0, "num_input_tokens_seen": 16664592, "step": 33855 }, { "epoch": 4.468787118912498, "grad_norm": 1.1097313290520106e-05, "learning_rate": 6.801810692540411e-08, "loss": 0.0, "num_input_tokens_seen": 16667216, "step": 33860 }, { "epoch": 4.469447010690247, "grad_norm": 0.00029235193505883217, "learning_rate": 6.78512005395564e-08, "loss": 0.0001, "num_input_tokens_seen": 16669776, "step": 33865 }, { "epoch": 4.470106902467995, "grad_norm": 5.69798139622435e-05, "learning_rate": 6.768449199554127e-08, "loss": 0.0, "num_input_tokens_seen": 16672208, "step": 33870 }, { "epoch": 4.470766794245744, "grad_norm": 7.299717253772542e-05, "learning_rate": 6.751798132874154e-08, "loss": 0.0, "num_input_tokens_seen": 16674512, "step": 33875 }, { "epoch": 4.471426686023492, "grad_norm": 0.32424068450927734, "learning_rate": 6.73516685744977e-08, "loss": 0.0002, "num_input_tokens_seen": 16676816, "step": 33880 }, { "epoch": 4.472086577801241, "grad_norm": 5.323095683706924e-05, "learning_rate": 6.718555376810864e-08, "loss": 0.0192, "num_input_tokens_seen": 16679376, "step": 33885 }, { "epoch": 4.472746469578989, "grad_norm": 0.0003650693688541651, "learning_rate": 6.70196369448306e-08, "loss": 0.0, "num_input_tokens_seen": 16681808, "step": 33890 }, { "epoch": 4.473406361356737, "grad_norm": 8.204561163438484e-05, "learning_rate": 6.685391813987873e-08, "loss": 0.0, "num_input_tokens_seen": 16684560, "step": 33895 }, { "epoch": 4.474066253134486, "grad_norm": 0.00024618374300189316, "learning_rate": 6.668839738842547e-08, "loss": 0.0239, "num_input_tokens_seen": 16687056, "step": 33900 }, { "epoch": 4.474726144912235, "grad_norm": 80.98246002197266, "learning_rate": 6.652307472560103e-08, "loss": 0.0666, "num_input_tokens_seen": 16689424, "step": 33905 }, { "epoch": 4.475386036689983, "grad_norm": 3.296041177236475e-05, "learning_rate": 6.635795018649459e-08, "loss": 0.0, "num_input_tokens_seen": 16691856, "step": 33910 }, { "epoch": 4.476045928467731, "grad_norm": 9.287385940551758, "learning_rate": 6.61930238061521e-08, "loss": 0.0266, "num_input_tokens_seen": 16694288, "step": 33915 }, { "epoch": 4.4767058202454795, "grad_norm": 11.796046257019043, "learning_rate": 6.602829561957846e-08, "loss": 0.0395, "num_input_tokens_seen": 16696976, "step": 33920 }, { "epoch": 4.477365712023229, "grad_norm": 0.00013667892199009657, "learning_rate": 6.586376566173556e-08, "loss": 0.0, "num_input_tokens_seen": 16699536, "step": 33925 }, { "epoch": 4.478025603800977, "grad_norm": 3.166230089846067e-05, "learning_rate": 6.569943396754396e-08, "loss": 0.0009, "num_input_tokens_seen": 16701904, "step": 33930 }, { "epoch": 4.478685495578725, "grad_norm": 6.734608177794144e-05, "learning_rate": 6.553530057188206e-08, "loss": 0.0, "num_input_tokens_seen": 16704272, "step": 33935 }, { "epoch": 4.4793453873564735, "grad_norm": 8.24275120976381e-05, "learning_rate": 6.537136550958545e-08, "loss": 0.0, "num_input_tokens_seen": 16706896, "step": 33940 }, { "epoch": 4.480005279134222, "grad_norm": 2.8194315433502197, "learning_rate": 6.52076288154485e-08, "loss": 0.0016, "num_input_tokens_seen": 16709008, "step": 33945 }, { "epoch": 4.48066517091197, "grad_norm": 8.94250202178955, "learning_rate": 6.504409052422332e-08, "loss": 0.007, "num_input_tokens_seen": 16711440, "step": 33950 }, { "epoch": 4.481325062689719, "grad_norm": 717.7362060546875, "learning_rate": 6.488075067061927e-08, "loss": 0.0969, "num_input_tokens_seen": 16714128, "step": 33955 }, { "epoch": 4.4819849544674675, "grad_norm": 0.0002321966312592849, "learning_rate": 6.471760928930436e-08, "loss": 0.0, "num_input_tokens_seen": 16716560, "step": 33960 }, { "epoch": 4.482644846245216, "grad_norm": 0.0001448716939194128, "learning_rate": 6.455466641490403e-08, "loss": 0.0, "num_input_tokens_seen": 16719120, "step": 33965 }, { "epoch": 4.483304738022964, "grad_norm": 0.00013773588580079377, "learning_rate": 6.439192208200195e-08, "loss": 0.0, "num_input_tokens_seen": 16721552, "step": 33970 }, { "epoch": 4.483964629800712, "grad_norm": 6.926531932549551e-05, "learning_rate": 6.422937632513914e-08, "loss": 0.0, "num_input_tokens_seen": 16724304, "step": 33975 }, { "epoch": 4.4846245215784615, "grad_norm": 0.0003329158644191921, "learning_rate": 6.40670291788149e-08, "loss": 0.0, "num_input_tokens_seen": 16726992, "step": 33980 }, { "epoch": 4.48528441335621, "grad_norm": 0.00022437769803218544, "learning_rate": 6.390488067748634e-08, "loss": 0.0, "num_input_tokens_seen": 16729488, "step": 33985 }, { "epoch": 4.485944305133958, "grad_norm": 0.12374948710203171, "learning_rate": 6.374293085556814e-08, "loss": 0.0, "num_input_tokens_seen": 16731920, "step": 33990 }, { "epoch": 4.486604196911706, "grad_norm": 0.0772160217165947, "learning_rate": 6.358117974743293e-08, "loss": 0.0, "num_input_tokens_seen": 16734416, "step": 33995 }, { "epoch": 4.487264088689455, "grad_norm": 0.0010211360640823841, "learning_rate": 6.341962738741125e-08, "loss": 0.0, "num_input_tokens_seen": 16737104, "step": 34000 }, { "epoch": 4.487923980467204, "grad_norm": 4.8965968744596466e-05, "learning_rate": 6.325827380979176e-08, "loss": 0.0, "num_input_tokens_seen": 16739536, "step": 34005 }, { "epoch": 4.488583872244952, "grad_norm": 0.00012408196926116943, "learning_rate": 6.309711904882009e-08, "loss": 0.0, "num_input_tokens_seen": 16741712, "step": 34010 }, { "epoch": 4.4892437640227, "grad_norm": 2.4676581233507022e-05, "learning_rate": 6.293616313870032e-08, "loss": 0.0, "num_input_tokens_seen": 16743824, "step": 34015 }, { "epoch": 4.489903655800449, "grad_norm": 8.535667438991368e-05, "learning_rate": 6.277540611359445e-08, "loss": 0.0, "num_input_tokens_seen": 16746256, "step": 34020 }, { "epoch": 4.490563547578197, "grad_norm": 0.15944455564022064, "learning_rate": 6.261484800762163e-08, "loss": 0.0, "num_input_tokens_seen": 16748624, "step": 34025 }, { "epoch": 4.491223439355946, "grad_norm": 2.235090323665645e-05, "learning_rate": 6.245448885485938e-08, "loss": 0.0001, "num_input_tokens_seen": 16751248, "step": 34030 }, { "epoch": 4.491883331133694, "grad_norm": 0.006533946376293898, "learning_rate": 6.229432868934281e-08, "loss": 0.0, "num_input_tokens_seen": 16753680, "step": 34035 }, { "epoch": 4.492543222911443, "grad_norm": 9.534538548905402e-05, "learning_rate": 6.21343675450644e-08, "loss": 0.0001, "num_input_tokens_seen": 16756240, "step": 34040 }, { "epoch": 4.493203114689191, "grad_norm": 0.0004805122152902186, "learning_rate": 6.19746054559751e-08, "loss": 0.0001, "num_input_tokens_seen": 16758672, "step": 34045 }, { "epoch": 4.493863006466939, "grad_norm": 0.258656769990921, "learning_rate": 6.181504245598312e-08, "loss": 0.0, "num_input_tokens_seen": 16760848, "step": 34050 }, { "epoch": 4.494522898244687, "grad_norm": 2.5847604774753563e-05, "learning_rate": 6.165567857895471e-08, "loss": 0.0, "num_input_tokens_seen": 16763344, "step": 34055 }, { "epoch": 4.495182790022437, "grad_norm": 0.6106637120246887, "learning_rate": 6.149651385871358e-08, "loss": 0.0005, "num_input_tokens_seen": 16765904, "step": 34060 }, { "epoch": 4.495842681800185, "grad_norm": 0.002589078852906823, "learning_rate": 6.133754832904092e-08, "loss": 0.0, "num_input_tokens_seen": 16768336, "step": 34065 }, { "epoch": 4.496502573577933, "grad_norm": 6.54537943773903e-05, "learning_rate": 6.117878202367677e-08, "loss": 0.0213, "num_input_tokens_seen": 16770832, "step": 34070 }, { "epoch": 4.497162465355681, "grad_norm": 0.14802023768424988, "learning_rate": 6.102021497631749e-08, "loss": 0.0001, "num_input_tokens_seen": 16773264, "step": 34075 }, { "epoch": 4.49782235713343, "grad_norm": 1.6509698980371468e-05, "learning_rate": 6.086184722061826e-08, "loss": 0.0103, "num_input_tokens_seen": 16775824, "step": 34080 }, { "epoch": 4.498482248911179, "grad_norm": 0.0005468002054840326, "learning_rate": 6.070367879019101e-08, "loss": 0.0014, "num_input_tokens_seen": 16778064, "step": 34085 }, { "epoch": 4.499142140688927, "grad_norm": 1.2052417332597543e-05, "learning_rate": 6.054570971860618e-08, "loss": 0.0007, "num_input_tokens_seen": 16780624, "step": 34090 }, { "epoch": 4.499802032466675, "grad_norm": 1.0952991247177124, "learning_rate": 6.038794003939151e-08, "loss": 0.0242, "num_input_tokens_seen": 16783248, "step": 34095 }, { "epoch": 4.500461924244424, "grad_norm": 1.677956424828153e-05, "learning_rate": 6.023036978603213e-08, "loss": 0.0016, "num_input_tokens_seen": 16785552, "step": 34100 }, { "epoch": 4.501121816022172, "grad_norm": 4.611305848811753e-05, "learning_rate": 6.007299899197194e-08, "loss": 0.0, "num_input_tokens_seen": 16787728, "step": 34105 }, { "epoch": 4.501781707799921, "grad_norm": 0.00016956614854279906, "learning_rate": 5.991582769061121e-08, "loss": 0.0, "num_input_tokens_seen": 16790288, "step": 34110 }, { "epoch": 4.501781707799921, "eval_loss": 0.2763582170009613, "eval_runtime": 7.878, "eval_samples_per_second": 854.911, "eval_steps_per_second": 106.88, "num_input_tokens_seen": 16790288, "step": 34110 }, { "epoch": 4.502441599577669, "grad_norm": 0.00017157703405246139, "learning_rate": 5.975885591530827e-08, "loss": 0.0, "num_input_tokens_seen": 16792848, "step": 34115 }, { "epoch": 4.503101491355418, "grad_norm": 0.0001700647408142686, "learning_rate": 5.9602083699379577e-08, "loss": 0.0518, "num_input_tokens_seen": 16795408, "step": 34120 }, { "epoch": 4.503761383133166, "grad_norm": 2.089840199914761e-05, "learning_rate": 5.9445511076098745e-08, "loss": 0.028, "num_input_tokens_seen": 16798096, "step": 34125 }, { "epoch": 4.504421274910914, "grad_norm": 6.222442607395351e-05, "learning_rate": 5.92891380786974e-08, "loss": 0.0, "num_input_tokens_seen": 16800528, "step": 34130 }, { "epoch": 4.505081166688663, "grad_norm": 3.297501098131761e-05, "learning_rate": 5.913296474036422e-08, "loss": 0.0, "num_input_tokens_seen": 16803024, "step": 34135 }, { "epoch": 4.505741058466412, "grad_norm": 1.1789659765781835e-05, "learning_rate": 5.8976991094246034e-08, "loss": 0.0, "num_input_tokens_seen": 16805456, "step": 34140 }, { "epoch": 4.50640095024416, "grad_norm": 0.002153146080672741, "learning_rate": 5.882121717344735e-08, "loss": 0.0005, "num_input_tokens_seen": 16807632, "step": 34145 }, { "epoch": 4.507060842021908, "grad_norm": 0.0005576178664341569, "learning_rate": 5.866564301102972e-08, "loss": 0.0, "num_input_tokens_seen": 16810256, "step": 34150 }, { "epoch": 4.5077207337996565, "grad_norm": 2.7772233486175537, "learning_rate": 5.851026864001263e-08, "loss": 0.0047, "num_input_tokens_seen": 16813008, "step": 34155 }, { "epoch": 4.508380625577406, "grad_norm": 0.06701714545488358, "learning_rate": 5.835509409337358e-08, "loss": 0.0294, "num_input_tokens_seen": 16815376, "step": 34160 }, { "epoch": 4.509040517355154, "grad_norm": 8.746223102207296e-06, "learning_rate": 5.820011940404668e-08, "loss": 0.0, "num_input_tokens_seen": 16817680, "step": 34165 }, { "epoch": 4.509700409132902, "grad_norm": 0.0006987557862885296, "learning_rate": 5.804534460492449e-08, "loss": 0.0, "num_input_tokens_seen": 16820368, "step": 34170 }, { "epoch": 4.5103603009106505, "grad_norm": 1.2826088095607702e-05, "learning_rate": 5.789076972885687e-08, "loss": 0.0, "num_input_tokens_seen": 16822672, "step": 34175 }, { "epoch": 4.511020192688399, "grad_norm": 0.0025306891184300184, "learning_rate": 5.7736394808651226e-08, "loss": 0.0, "num_input_tokens_seen": 16824976, "step": 34180 }, { "epoch": 4.511680084466148, "grad_norm": 0.0032583356369286776, "learning_rate": 5.758221987707235e-08, "loss": 0.0, "num_input_tokens_seen": 16827472, "step": 34185 }, { "epoch": 4.512339976243896, "grad_norm": 6.325223512249067e-05, "learning_rate": 5.742824496684284e-08, "loss": 0.0, "num_input_tokens_seen": 16829840, "step": 34190 }, { "epoch": 4.5129998680216445, "grad_norm": 4.075995457242243e-05, "learning_rate": 5.72744701106429e-08, "loss": 0.0, "num_input_tokens_seen": 16832400, "step": 34195 }, { "epoch": 4.513659759799393, "grad_norm": 0.00010375280544394627, "learning_rate": 5.7120895341109864e-08, "loss": 0.0016, "num_input_tokens_seen": 16834832, "step": 34200 }, { "epoch": 4.514319651577141, "grad_norm": 5.2472357749938965, "learning_rate": 5.696752069083899e-08, "loss": 0.0066, "num_input_tokens_seen": 16837200, "step": 34205 }, { "epoch": 4.51497954335489, "grad_norm": 8.740870725887362e-06, "learning_rate": 5.6814346192383125e-08, "loss": 0.0, "num_input_tokens_seen": 16839632, "step": 34210 }, { "epoch": 4.5156394351326385, "grad_norm": 6.086594657972455e-05, "learning_rate": 5.666137187825204e-08, "loss": 0.0, "num_input_tokens_seen": 16842128, "step": 34215 }, { "epoch": 4.516299326910387, "grad_norm": 0.0001888852275442332, "learning_rate": 5.650859778091388e-08, "loss": 0.0002, "num_input_tokens_seen": 16844240, "step": 34220 }, { "epoch": 4.516959218688135, "grad_norm": 1.9013299606740475e-05, "learning_rate": 5.635602393279326e-08, "loss": 0.0, "num_input_tokens_seen": 16846352, "step": 34225 }, { "epoch": 4.517619110465883, "grad_norm": 2.5412498871446587e-05, "learning_rate": 5.62036503662735e-08, "loss": 0.0, "num_input_tokens_seen": 16848784, "step": 34230 }, { "epoch": 4.518279002243632, "grad_norm": 0.0002402652462478727, "learning_rate": 5.6051477113694625e-08, "loss": 0.0, "num_input_tokens_seen": 16850960, "step": 34235 }, { "epoch": 4.518938894021381, "grad_norm": 0.0011904650600627065, "learning_rate": 5.589950420735379e-08, "loss": 0.0, "num_input_tokens_seen": 16853968, "step": 34240 }, { "epoch": 4.519598785799129, "grad_norm": 4.7674417146481574e-05, "learning_rate": 5.574773167950697e-08, "loss": 0.045, "num_input_tokens_seen": 16856592, "step": 34245 }, { "epoch": 4.520258677576877, "grad_norm": 0.0012722613755613565, "learning_rate": 5.5596159562366076e-08, "loss": 0.0, "num_input_tokens_seen": 16859024, "step": 34250 }, { "epoch": 4.520918569354626, "grad_norm": 36.9177131652832, "learning_rate": 5.5444787888101696e-08, "loss": 0.0518, "num_input_tokens_seen": 16861264, "step": 34255 }, { "epoch": 4.521578461132374, "grad_norm": 1.2535748282971326e-05, "learning_rate": 5.529361668884103e-08, "loss": 0.0002, "num_input_tokens_seen": 16863696, "step": 34260 }, { "epoch": 4.522238352910123, "grad_norm": 1.3539308383769821e-05, "learning_rate": 5.514264599666918e-08, "loss": 0.0, "num_input_tokens_seen": 16866064, "step": 34265 }, { "epoch": 4.522898244687871, "grad_norm": 0.005475457292050123, "learning_rate": 5.4991875843628745e-08, "loss": 0.0, "num_input_tokens_seen": 16868688, "step": 34270 }, { "epoch": 4.52355813646562, "grad_norm": 0.0007006602245382965, "learning_rate": 5.484130626171923e-08, "loss": 0.0005, "num_input_tokens_seen": 16870800, "step": 34275 }, { "epoch": 4.524218028243368, "grad_norm": 0.0001456452300772071, "learning_rate": 5.46909372828982e-08, "loss": 0.0, "num_input_tokens_seen": 16873552, "step": 34280 }, { "epoch": 4.524877920021116, "grad_norm": 1.1672700643539429, "learning_rate": 5.454076893908055e-08, "loss": 0.0008, "num_input_tokens_seen": 16875984, "step": 34285 }, { "epoch": 4.5255378117988645, "grad_norm": 6.52900489512831e-05, "learning_rate": 5.439080126213802e-08, "loss": 0.0003, "num_input_tokens_seen": 16878544, "step": 34290 }, { "epoch": 4.526197703576614, "grad_norm": 0.0034513825085014105, "learning_rate": 5.4241034283900364e-08, "loss": 0.0, "num_input_tokens_seen": 16881168, "step": 34295 }, { "epoch": 4.526857595354362, "grad_norm": 1.3490453056874685e-05, "learning_rate": 5.40914680361545e-08, "loss": 0.0, "num_input_tokens_seen": 16883472, "step": 34300 }, { "epoch": 4.52751748713211, "grad_norm": 2.3728985979687423e-05, "learning_rate": 5.394210255064502e-08, "loss": 0.0, "num_input_tokens_seen": 16885648, "step": 34305 }, { "epoch": 4.5281773789098585, "grad_norm": 0.07513949275016785, "learning_rate": 5.379293785907335e-08, "loss": 0.0, "num_input_tokens_seen": 16887888, "step": 34310 }, { "epoch": 4.528837270687607, "grad_norm": 3.131102857878432e-05, "learning_rate": 5.364397399309861e-08, "loss": 0.0005, "num_input_tokens_seen": 16890128, "step": 34315 }, { "epoch": 4.529497162465356, "grad_norm": 0.00018491598893888295, "learning_rate": 5.349521098433762e-08, "loss": 0.0, "num_input_tokens_seen": 16892496, "step": 34320 }, { "epoch": 4.530157054243104, "grad_norm": 0.007551413960754871, "learning_rate": 5.334664886436391e-08, "loss": 0.0415, "num_input_tokens_seen": 16894608, "step": 34325 }, { "epoch": 4.5308169460208525, "grad_norm": 0.00030818648519925773, "learning_rate": 5.3198287664708907e-08, "loss": 0.0, "num_input_tokens_seen": 16897616, "step": 34330 }, { "epoch": 4.531476837798601, "grad_norm": 2.6269792215316556e-05, "learning_rate": 5.3050127416861104e-08, "loss": 0.0, "num_input_tokens_seen": 16900048, "step": 34335 }, { "epoch": 4.532136729576349, "grad_norm": 5.578194395639002e-05, "learning_rate": 5.290216815226656e-08, "loss": 0.0165, "num_input_tokens_seen": 16902416, "step": 34340 }, { "epoch": 4.532796621354098, "grad_norm": 0.0002465557190589607, "learning_rate": 5.275440990232838e-08, "loss": 0.0003, "num_input_tokens_seen": 16904656, "step": 34345 }, { "epoch": 4.5334565131318465, "grad_norm": 0.00656506372615695, "learning_rate": 5.2606852698407367e-08, "loss": 0.0, "num_input_tokens_seen": 16907216, "step": 34350 }, { "epoch": 4.534116404909595, "grad_norm": 14.822111129760742, "learning_rate": 5.245949657182136e-08, "loss": 0.0381, "num_input_tokens_seen": 16909840, "step": 34355 }, { "epoch": 4.534776296687343, "grad_norm": 2.3827880795579404e-05, "learning_rate": 5.231234155384567e-08, "loss": 0.0003, "num_input_tokens_seen": 16912464, "step": 34360 }, { "epoch": 4.535436188465091, "grad_norm": 1.264294496650109e-05, "learning_rate": 5.216538767571277e-08, "loss": 0.0, "num_input_tokens_seen": 16915088, "step": 34365 }, { "epoch": 4.5360960802428405, "grad_norm": 3.063208350795321e-05, "learning_rate": 5.201863496861292e-08, "loss": 0.002, "num_input_tokens_seen": 16917584, "step": 34370 }, { "epoch": 4.536755972020589, "grad_norm": 3.200107312295586e-05, "learning_rate": 5.187208346369276e-08, "loss": 0.0, "num_input_tokens_seen": 16920080, "step": 34375 }, { "epoch": 4.537415863798337, "grad_norm": 0.06736087054014206, "learning_rate": 5.17257331920572e-08, "loss": 0.0, "num_input_tokens_seen": 16922640, "step": 34380 }, { "epoch": 4.538075755576085, "grad_norm": 0.020671632140874863, "learning_rate": 5.157958418476793e-08, "loss": 0.0001, "num_input_tokens_seen": 16925200, "step": 34385 }, { "epoch": 4.538735647353834, "grad_norm": 0.00017729074170347303, "learning_rate": 5.1433636472844045e-08, "loss": 0.0123, "num_input_tokens_seen": 16927504, "step": 34390 }, { "epoch": 4.539395539131583, "grad_norm": 0.0018899147398769855, "learning_rate": 5.1287890087261864e-08, "loss": 0.0079, "num_input_tokens_seen": 16929872, "step": 34395 }, { "epoch": 4.540055430909331, "grad_norm": 0.3784496784210205, "learning_rate": 5.114234505895465e-08, "loss": 0.0001, "num_input_tokens_seen": 16931856, "step": 34400 }, { "epoch": 4.540715322687079, "grad_norm": 0.00019514707673806697, "learning_rate": 5.0997001418814025e-08, "loss": 0.0, "num_input_tokens_seen": 16934224, "step": 34405 }, { "epoch": 4.541375214464828, "grad_norm": 0.0017008042195811868, "learning_rate": 5.085185919768742e-08, "loss": 0.0, "num_input_tokens_seen": 16936592, "step": 34410 }, { "epoch": 4.542035106242576, "grad_norm": 0.00016672021592967212, "learning_rate": 5.0706918426380754e-08, "loss": 0.0, "num_input_tokens_seen": 16939024, "step": 34415 }, { "epoch": 4.542694998020325, "grad_norm": 0.00020502068218775094, "learning_rate": 5.056217913565619e-08, "loss": 0.0366, "num_input_tokens_seen": 16941456, "step": 34420 }, { "epoch": 4.543354889798073, "grad_norm": 3.393287261133082e-05, "learning_rate": 5.0417641356233943e-08, "loss": 0.0, "num_input_tokens_seen": 16943632, "step": 34425 }, { "epoch": 4.544014781575822, "grad_norm": 2.4823410058161244e-05, "learning_rate": 5.027330511879102e-08, "loss": 0.0, "num_input_tokens_seen": 16946000, "step": 34430 }, { "epoch": 4.54467467335357, "grad_norm": 0.00012084999616490677, "learning_rate": 5.012917045396148e-08, "loss": 0.0, "num_input_tokens_seen": 16948560, "step": 34435 }, { "epoch": 4.545334565131318, "grad_norm": 0.0009474708931520581, "learning_rate": 4.998523739233729e-08, "loss": 0.0001, "num_input_tokens_seen": 16950928, "step": 34440 }, { "epoch": 4.545994456909067, "grad_norm": 2.3217171474243514e-05, "learning_rate": 4.984150596446701e-08, "loss": 0.0005, "num_input_tokens_seen": 16953360, "step": 34445 }, { "epoch": 4.546654348686816, "grad_norm": 9.132823470281437e-05, "learning_rate": 4.9697976200856584e-08, "loss": 0.0, "num_input_tokens_seen": 16955856, "step": 34450 }, { "epoch": 4.547314240464564, "grad_norm": 3.412169826333411e-05, "learning_rate": 4.955464813196897e-08, "loss": 0.0, "num_input_tokens_seen": 16958160, "step": 34455 }, { "epoch": 4.547974132242312, "grad_norm": 5.927090023760684e-05, "learning_rate": 4.941152178822483e-08, "loss": 0.0001, "num_input_tokens_seen": 16960592, "step": 34460 }, { "epoch": 4.5486340240200605, "grad_norm": 1.7155516616185196e-05, "learning_rate": 4.926859720000165e-08, "loss": 0.0, "num_input_tokens_seen": 16963152, "step": 34465 }, { "epoch": 4.54929391579781, "grad_norm": 8.997808618005365e-05, "learning_rate": 4.912587439763394e-08, "loss": 0.0, "num_input_tokens_seen": 16965584, "step": 34470 }, { "epoch": 4.549953807575558, "grad_norm": 0.00021670824207831174, "learning_rate": 4.898335341141369e-08, "loss": 0.0, "num_input_tokens_seen": 16967888, "step": 34475 }, { "epoch": 4.550613699353306, "grad_norm": 0.0006072040996514261, "learning_rate": 4.884103427159014e-08, "loss": 0.0, "num_input_tokens_seen": 16970256, "step": 34480 }, { "epoch": 4.5512735911310545, "grad_norm": 0.0130154425278306, "learning_rate": 4.8698917008369144e-08, "loss": 0.0615, "num_input_tokens_seen": 16973200, "step": 34485 }, { "epoch": 4.551933482908803, "grad_norm": 5.232800685917027e-05, "learning_rate": 4.855700165191423e-08, "loss": 0.0, "num_input_tokens_seen": 16975568, "step": 34490 }, { "epoch": 4.552593374686552, "grad_norm": 0.0001216641758219339, "learning_rate": 4.841528823234609e-08, "loss": 0.0, "num_input_tokens_seen": 16978128, "step": 34495 }, { "epoch": 4.5532532664643, "grad_norm": 0.009573639370501041, "learning_rate": 4.8273776779741984e-08, "loss": 0.0, "num_input_tokens_seen": 16980560, "step": 34500 }, { "epoch": 4.5539131582420485, "grad_norm": 9.984229109250009e-05, "learning_rate": 4.8132467324136894e-08, "loss": 0.0, "num_input_tokens_seen": 16983184, "step": 34505 }, { "epoch": 4.554573050019797, "grad_norm": 0.004783345386385918, "learning_rate": 4.799135989552272e-08, "loss": 0.0, "num_input_tokens_seen": 16985488, "step": 34510 }, { "epoch": 4.555232941797545, "grad_norm": 0.0024777057114988565, "learning_rate": 4.7850454523848725e-08, "loss": 0.0019, "num_input_tokens_seen": 16987984, "step": 34515 }, { "epoch": 4.555892833575293, "grad_norm": 0.002937519922852516, "learning_rate": 4.770975123902066e-08, "loss": 0.0, "num_input_tokens_seen": 16990288, "step": 34520 }, { "epoch": 4.5565527253530425, "grad_norm": 4.716146213468164e-05, "learning_rate": 4.756925007090185e-08, "loss": 0.0, "num_input_tokens_seen": 16992912, "step": 34525 }, { "epoch": 4.557212617130791, "grad_norm": 19.31949234008789, "learning_rate": 4.7428951049312996e-08, "loss": 0.0294, "num_input_tokens_seen": 16995088, "step": 34530 }, { "epoch": 4.557872508908539, "grad_norm": 4.645546869141981e-05, "learning_rate": 4.728885420403117e-08, "loss": 0.0, "num_input_tokens_seen": 16997520, "step": 34535 }, { "epoch": 4.558532400686287, "grad_norm": 0.001822422374971211, "learning_rate": 4.714895956479104e-08, "loss": 0.0, "num_input_tokens_seen": 16999696, "step": 34540 }, { "epoch": 4.559192292464036, "grad_norm": 0.007913710549473763, "learning_rate": 4.700926716128428e-08, "loss": 0.0337, "num_input_tokens_seen": 17002256, "step": 34545 }, { "epoch": 4.559852184241785, "grad_norm": 2.7463400328997523e-05, "learning_rate": 4.686977702315953e-08, "loss": 0.0, "num_input_tokens_seen": 17004880, "step": 34550 }, { "epoch": 4.560512076019533, "grad_norm": 1.1041237485187594e-05, "learning_rate": 4.673048918002265e-08, "loss": 0.0719, "num_input_tokens_seen": 17007376, "step": 34555 }, { "epoch": 4.561171967797281, "grad_norm": 1.2896975022158585e-05, "learning_rate": 4.659140366143621e-08, "loss": 0.0, "num_input_tokens_seen": 17009808, "step": 34560 }, { "epoch": 4.56183185957503, "grad_norm": 0.0025346383918076754, "learning_rate": 4.64525204969205e-08, "loss": 0.0, "num_input_tokens_seen": 17012752, "step": 34565 }, { "epoch": 4.562491751352778, "grad_norm": 5.901870463276282e-05, "learning_rate": 4.631383971595226e-08, "loss": 0.0, "num_input_tokens_seen": 17015056, "step": 34570 }, { "epoch": 4.563151643130526, "grad_norm": 8.729910769034177e-05, "learning_rate": 4.617536134796529e-08, "loss": 0.0007, "num_input_tokens_seen": 17017424, "step": 34575 }, { "epoch": 4.563811534908275, "grad_norm": 0.00011786862160079181, "learning_rate": 4.6037085422351077e-08, "loss": 0.0, "num_input_tokens_seen": 17019984, "step": 34580 }, { "epoch": 4.564471426686024, "grad_norm": 1.6084588423836976e-05, "learning_rate": 4.5899011968457244e-08, "loss": 0.0, "num_input_tokens_seen": 17022864, "step": 34585 }, { "epoch": 4.565131318463772, "grad_norm": 0.00034129302366636693, "learning_rate": 4.576114101558914e-08, "loss": 0.0, "num_input_tokens_seen": 17025168, "step": 34590 }, { "epoch": 4.56579121024152, "grad_norm": 0.00021121930330991745, "learning_rate": 4.562347259300881e-08, "loss": 0.0, "num_input_tokens_seen": 17027728, "step": 34595 }, { "epoch": 4.566451102019268, "grad_norm": 3.6314009776106104e-05, "learning_rate": 4.54860067299353e-08, "loss": 0.0308, "num_input_tokens_seen": 17030096, "step": 34600 }, { "epoch": 4.567110993797018, "grad_norm": 0.000650825328193605, "learning_rate": 4.534874345554496e-08, "loss": 0.0253, "num_input_tokens_seen": 17032912, "step": 34605 }, { "epoch": 4.567770885574766, "grad_norm": 0.00017748665413819253, "learning_rate": 4.521168279897058e-08, "loss": 0.0, "num_input_tokens_seen": 17035600, "step": 34610 }, { "epoch": 4.568430777352514, "grad_norm": 3.97105141018983e-05, "learning_rate": 4.507482478930258e-08, "loss": 0.087, "num_input_tokens_seen": 17038096, "step": 34615 }, { "epoch": 4.569090669130262, "grad_norm": 3.461187952780165e-05, "learning_rate": 4.493816945558815e-08, "loss": 0.0398, "num_input_tokens_seen": 17040720, "step": 34620 }, { "epoch": 4.569750560908011, "grad_norm": 0.009485268965363503, "learning_rate": 4.480171682683098e-08, "loss": 0.0105, "num_input_tokens_seen": 17043280, "step": 34625 }, { "epoch": 4.57041045268576, "grad_norm": 0.003441791282966733, "learning_rate": 4.466546693199247e-08, "loss": 0.0226, "num_input_tokens_seen": 17045392, "step": 34630 }, { "epoch": 4.571070344463508, "grad_norm": 0.0006981107871979475, "learning_rate": 4.4529419799990695e-08, "loss": 0.0, "num_input_tokens_seen": 17047888, "step": 34635 }, { "epoch": 4.571730236241256, "grad_norm": 5.575758041231893e-05, "learning_rate": 4.439357545970068e-08, "loss": 0.0016, "num_input_tokens_seen": 17050320, "step": 34640 }, { "epoch": 4.572390128019005, "grad_norm": 1.394921764585888e-05, "learning_rate": 4.425793393995414e-08, "loss": 0.0, "num_input_tokens_seen": 17052944, "step": 34645 }, { "epoch": 4.573050019796753, "grad_norm": 0.002040313323959708, "learning_rate": 4.412249526954015e-08, "loss": 0.0, "num_input_tokens_seen": 17055248, "step": 34650 }, { "epoch": 4.573709911574502, "grad_norm": 3.8957525248406455e-05, "learning_rate": 4.398725947720483e-08, "loss": 0.0, "num_input_tokens_seen": 17057872, "step": 34655 }, { "epoch": 4.57436980335225, "grad_norm": 3.296605427749455e-05, "learning_rate": 4.385222659165067e-08, "loss": 0.0, "num_input_tokens_seen": 17060304, "step": 34660 }, { "epoch": 4.575029695129999, "grad_norm": 1.76807698153425e-05, "learning_rate": 4.3717396641537395e-08, "loss": 0.0, "num_input_tokens_seen": 17062928, "step": 34665 }, { "epoch": 4.575689586907747, "grad_norm": 0.0012917850399389863, "learning_rate": 4.358276965548202e-08, "loss": 0.0, "num_input_tokens_seen": 17065360, "step": 34670 }, { "epoch": 4.576349478685495, "grad_norm": 0.18802160024642944, "learning_rate": 4.344834566205802e-08, "loss": 0.0001, "num_input_tokens_seen": 17067792, "step": 34675 }, { "epoch": 4.577009370463244, "grad_norm": 0.00029112552874721587, "learning_rate": 4.331412468979567e-08, "loss": 0.0, "num_input_tokens_seen": 17069776, "step": 34680 }, { "epoch": 4.577669262240993, "grad_norm": 0.00016572128515690565, "learning_rate": 4.318010676718254e-08, "loss": 0.0, "num_input_tokens_seen": 17072272, "step": 34685 }, { "epoch": 4.578329154018741, "grad_norm": 3.444494359428063e-05, "learning_rate": 4.304629192266318e-08, "loss": 0.0, "num_input_tokens_seen": 17074832, "step": 34690 }, { "epoch": 4.578989045796489, "grad_norm": 0.04258698970079422, "learning_rate": 4.2912680184638564e-08, "loss": 0.0004, "num_input_tokens_seen": 17077776, "step": 34695 }, { "epoch": 4.5796489375742375, "grad_norm": 2.2253461793297902e-05, "learning_rate": 4.277927158146688e-08, "loss": 0.0657, "num_input_tokens_seen": 17080336, "step": 34700 }, { "epoch": 4.580308829351987, "grad_norm": 11.546168327331543, "learning_rate": 4.264606614146327e-08, "loss": 0.0066, "num_input_tokens_seen": 17082576, "step": 34705 }, { "epoch": 4.580968721129735, "grad_norm": 0.012570054270327091, "learning_rate": 4.251306389289944e-08, "loss": 0.0, "num_input_tokens_seen": 17084880, "step": 34710 }, { "epoch": 4.581628612907483, "grad_norm": 1.0254173503199127e-05, "learning_rate": 4.2380264864004143e-08, "loss": 0.0, "num_input_tokens_seen": 17087440, "step": 34715 }, { "epoch": 4.5822885046852315, "grad_norm": 0.0003984362119808793, "learning_rate": 4.2247669082963065e-08, "loss": 0.0, "num_input_tokens_seen": 17089808, "step": 34720 }, { "epoch": 4.58294839646298, "grad_norm": 0.00015969022933859378, "learning_rate": 4.211527657791891e-08, "loss": 0.0, "num_input_tokens_seen": 17092048, "step": 34725 }, { "epoch": 4.583608288240729, "grad_norm": 0.0001142570617957972, "learning_rate": 4.198308737697087e-08, "loss": 0.0, "num_input_tokens_seen": 17094544, "step": 34730 }, { "epoch": 4.584268180018477, "grad_norm": 1.6683903595549054e-05, "learning_rate": 4.1851101508174834e-08, "loss": 0.0001, "num_input_tokens_seen": 17096784, "step": 34735 }, { "epoch": 4.5849280717962255, "grad_norm": 0.0009563491330482066, "learning_rate": 4.171931899954439e-08, "loss": 0.0001, "num_input_tokens_seen": 17099152, "step": 34740 }, { "epoch": 4.585587963573974, "grad_norm": 5.6973076425492764e-05, "learning_rate": 4.1587739879049067e-08, "loss": 0.0, "num_input_tokens_seen": 17101264, "step": 34745 }, { "epoch": 4.586247855351722, "grad_norm": 0.00014773164002690464, "learning_rate": 4.145636417461573e-08, "loss": 0.0, "num_input_tokens_seen": 17103504, "step": 34750 }, { "epoch": 4.586907747129471, "grad_norm": 2.2371052182279527e-05, "learning_rate": 4.132519191412787e-08, "loss": 0.0, "num_input_tokens_seen": 17105744, "step": 34755 }, { "epoch": 4.5875676389072195, "grad_norm": 2.9566681405412965e-05, "learning_rate": 4.1194223125425753e-08, "loss": 0.0176, "num_input_tokens_seen": 17108304, "step": 34760 }, { "epoch": 4.588227530684968, "grad_norm": 2.690335168153979e-05, "learning_rate": 4.1063457836306716e-08, "loss": 0.0, "num_input_tokens_seen": 17110544, "step": 34765 }, { "epoch": 4.588887422462716, "grad_norm": 4.563686889014207e-05, "learning_rate": 4.0932896074524546e-08, "loss": 0.0, "num_input_tokens_seen": 17113104, "step": 34770 }, { "epoch": 4.589547314240464, "grad_norm": 0.001125271082855761, "learning_rate": 4.080253786779042e-08, "loss": 0.0226, "num_input_tokens_seen": 17115472, "step": 34775 }, { "epoch": 4.590207206018213, "grad_norm": 0.0025146508123725653, "learning_rate": 4.0672383243771643e-08, "loss": 0.0033, "num_input_tokens_seen": 17118032, "step": 34780 }, { "epoch": 4.590867097795962, "grad_norm": 1.4741677659912966e-05, "learning_rate": 4.054243223009246e-08, "loss": 0.0, "num_input_tokens_seen": 17120592, "step": 34785 }, { "epoch": 4.59152698957371, "grad_norm": 7.56392182665877e-05, "learning_rate": 4.041268485433413e-08, "loss": 0.0, "num_input_tokens_seen": 17122896, "step": 34790 }, { "epoch": 4.592186881351458, "grad_norm": 2.3552544007543474e-05, "learning_rate": 4.028314114403475e-08, "loss": 0.0, "num_input_tokens_seen": 17125456, "step": 34795 }, { "epoch": 4.592846773129207, "grad_norm": 3.42881066899281e-05, "learning_rate": 4.015380112668909e-08, "loss": 0.0, "num_input_tokens_seen": 17128016, "step": 34800 }, { "epoch": 4.593506664906955, "grad_norm": 1.2655588761845138e-05, "learning_rate": 4.002466482974831e-08, "loss": 0.0, "num_input_tokens_seen": 17130512, "step": 34805 }, { "epoch": 4.594166556684704, "grad_norm": 5.4459964303532615e-05, "learning_rate": 3.989573228062082e-08, "loss": 0.0364, "num_input_tokens_seen": 17132944, "step": 34810 }, { "epoch": 4.594826448462452, "grad_norm": 0.0026006638072431087, "learning_rate": 3.976700350667173e-08, "loss": 0.0, "num_input_tokens_seen": 17135440, "step": 34815 }, { "epoch": 4.595486340240201, "grad_norm": 5.579711069003679e-05, "learning_rate": 3.963847853522262e-08, "loss": 0.0, "num_input_tokens_seen": 17137872, "step": 34820 }, { "epoch": 4.596146232017949, "grad_norm": 0.01866592839360237, "learning_rate": 3.951015739355201e-08, "loss": 0.0, "num_input_tokens_seen": 17140176, "step": 34825 }, { "epoch": 4.596806123795697, "grad_norm": 0.004795776214450598, "learning_rate": 3.9382040108895344e-08, "loss": 0.0, "num_input_tokens_seen": 17142672, "step": 34830 }, { "epoch": 4.5974660155734455, "grad_norm": 0.04255275800824165, "learning_rate": 3.925412670844419e-08, "loss": 0.0004, "num_input_tokens_seen": 17145232, "step": 34835 }, { "epoch": 4.598125907351195, "grad_norm": 0.002885986352339387, "learning_rate": 3.9126417219347506e-08, "loss": 0.0, "num_input_tokens_seen": 17147600, "step": 34840 }, { "epoch": 4.598785799128943, "grad_norm": 0.0016264189034700394, "learning_rate": 3.899891166871072e-08, "loss": 0.0, "num_input_tokens_seen": 17150032, "step": 34845 }, { "epoch": 4.599445690906691, "grad_norm": 6.804332952015102e-05, "learning_rate": 3.8871610083595965e-08, "loss": 0.0003, "num_input_tokens_seen": 17152400, "step": 34850 }, { "epoch": 4.6001055826844395, "grad_norm": 0.009856065735220909, "learning_rate": 3.874451249102195e-08, "loss": 0.0214, "num_input_tokens_seen": 17154896, "step": 34855 }, { "epoch": 4.600765474462188, "grad_norm": 2.9393810109468177e-05, "learning_rate": 3.861761891796433e-08, "loss": 0.0002, "num_input_tokens_seen": 17157264, "step": 34860 }, { "epoch": 4.601425366239937, "grad_norm": 2.4371995095862076e-05, "learning_rate": 3.8490929391355345e-08, "loss": 0.0, "num_input_tokens_seen": 17159632, "step": 34865 }, { "epoch": 4.602085258017685, "grad_norm": 0.010247951373457909, "learning_rate": 3.83644439380838e-08, "loss": 0.0, "num_input_tokens_seen": 17162000, "step": 34870 }, { "epoch": 4.6027451497954335, "grad_norm": 2.818459688569419e-05, "learning_rate": 3.823816258499546e-08, "loss": 0.0, "num_input_tokens_seen": 17164304, "step": 34875 }, { "epoch": 4.603405041573182, "grad_norm": 0.0009323288686573505, "learning_rate": 3.811208535889265e-08, "loss": 0.0105, "num_input_tokens_seen": 17166992, "step": 34880 }, { "epoch": 4.60406493335093, "grad_norm": 0.002070516115054488, "learning_rate": 3.79862122865342e-08, "loss": 0.0411, "num_input_tokens_seen": 17169552, "step": 34885 }, { "epoch": 4.604724825128679, "grad_norm": 0.001990356482565403, "learning_rate": 3.786054339463596e-08, "loss": 0.0, "num_input_tokens_seen": 17172176, "step": 34890 }, { "epoch": 4.6053847169064275, "grad_norm": 2.619278893689625e-05, "learning_rate": 3.7735078709869804e-08, "loss": 0.0, "num_input_tokens_seen": 17174416, "step": 34895 }, { "epoch": 4.606044608684176, "grad_norm": 1.5399427866213955e-05, "learning_rate": 3.760981825886533e-08, "loss": 0.0, "num_input_tokens_seen": 17176720, "step": 34900 }, { "epoch": 4.606704500461924, "grad_norm": 0.00026046272250823677, "learning_rate": 3.748476206820783e-08, "loss": 0.0239, "num_input_tokens_seen": 17179216, "step": 34905 }, { "epoch": 4.607364392239672, "grad_norm": 0.006970543414354324, "learning_rate": 3.735991016443929e-08, "loss": 0.0016, "num_input_tokens_seen": 17181648, "step": 34910 }, { "epoch": 4.6080242840174215, "grad_norm": 0.00010547209240030497, "learning_rate": 3.723526257405929e-08, "loss": 0.0337, "num_input_tokens_seen": 17184272, "step": 34915 }, { "epoch": 4.60868417579517, "grad_norm": 0.08839015662670135, "learning_rate": 3.711081932352278e-08, "loss": 0.0, "num_input_tokens_seen": 17186704, "step": 34920 }, { "epoch": 4.609344067572918, "grad_norm": 0.00013485303497873247, "learning_rate": 3.698658043924241e-08, "loss": 0.0005, "num_input_tokens_seen": 17189200, "step": 34925 }, { "epoch": 4.610003959350666, "grad_norm": 2.3718108423054218e-05, "learning_rate": 3.686254594758653e-08, "loss": 0.0, "num_input_tokens_seen": 17191888, "step": 34930 }, { "epoch": 4.610663851128415, "grad_norm": 9.570590918883681e-05, "learning_rate": 3.673871587488076e-08, "loss": 0.0, "num_input_tokens_seen": 17194448, "step": 34935 }, { "epoch": 4.611323742906164, "grad_norm": 5.9927140682702884e-05, "learning_rate": 3.661509024740739e-08, "loss": 0.028, "num_input_tokens_seen": 17197136, "step": 34940 }, { "epoch": 4.611983634683912, "grad_norm": 0.0012820486444979906, "learning_rate": 3.6491669091404553e-08, "loss": 0.0, "num_input_tokens_seen": 17199504, "step": 34945 }, { "epoch": 4.61264352646166, "grad_norm": 2.176354428229388e-05, "learning_rate": 3.636845243306785e-08, "loss": 0.0, "num_input_tokens_seen": 17201808, "step": 34950 }, { "epoch": 4.613303418239409, "grad_norm": 3.7333906220737845e-05, "learning_rate": 3.624544029854914e-08, "loss": 0.0001, "num_input_tokens_seen": 17204432, "step": 34955 }, { "epoch": 4.613963310017157, "grad_norm": 1.2981083273189142e-05, "learning_rate": 3.6122632713956766e-08, "loss": 0.0001, "num_input_tokens_seen": 17206928, "step": 34960 }, { "epoch": 4.614623201794906, "grad_norm": 0.10895369201898575, "learning_rate": 3.600002970535565e-08, "loss": 0.0001, "num_input_tokens_seen": 17209552, "step": 34965 }, { "epoch": 4.615283093572654, "grad_norm": 0.0003632585285231471, "learning_rate": 3.587763129876753e-08, "loss": 0.0, "num_input_tokens_seen": 17212048, "step": 34970 }, { "epoch": 4.615942985350403, "grad_norm": 2.6129724574275315e-05, "learning_rate": 3.575543752017063e-08, "loss": 0.0, "num_input_tokens_seen": 17214160, "step": 34975 }, { "epoch": 4.616602877128151, "grad_norm": 0.03864043951034546, "learning_rate": 3.563344839549942e-08, "loss": 0.0003, "num_input_tokens_seen": 17216656, "step": 34980 }, { "epoch": 4.617262768905899, "grad_norm": 0.002652583410963416, "learning_rate": 3.5511663950645534e-08, "loss": 0.0001, "num_input_tokens_seen": 17219024, "step": 34985 }, { "epoch": 4.617922660683648, "grad_norm": 0.0005656993598677218, "learning_rate": 3.539008421145673e-08, "loss": 0.0, "num_input_tokens_seen": 17221648, "step": 34990 }, { "epoch": 4.618582552461397, "grad_norm": 1.3721179129788652e-05, "learning_rate": 3.526870920373726e-08, "loss": 0.0001, "num_input_tokens_seen": 17223952, "step": 34995 }, { "epoch": 4.619242444239145, "grad_norm": 0.00020168579067103565, "learning_rate": 3.514753895324829e-08, "loss": 0.0, "num_input_tokens_seen": 17226448, "step": 35000 }, { "epoch": 4.619902336016893, "grad_norm": 0.06954360753297806, "learning_rate": 3.5026573485707253e-08, "loss": 0.0, "num_input_tokens_seen": 17228944, "step": 35005 }, { "epoch": 4.620562227794641, "grad_norm": 4.667209577746689e-05, "learning_rate": 3.4905812826788285e-08, "loss": 0.0, "num_input_tokens_seen": 17231376, "step": 35010 }, { "epoch": 4.621222119572391, "grad_norm": 0.00416320376098156, "learning_rate": 3.478525700212176e-08, "loss": 0.0411, "num_input_tokens_seen": 17234384, "step": 35015 }, { "epoch": 4.621882011350139, "grad_norm": 5.151130608282983e-05, "learning_rate": 3.4664906037294996e-08, "loss": 0.004, "num_input_tokens_seen": 17236816, "step": 35020 }, { "epoch": 4.622541903127887, "grad_norm": 0.00011490224278531969, "learning_rate": 3.4544759957851553e-08, "loss": 0.0, "num_input_tokens_seen": 17238992, "step": 35025 }, { "epoch": 4.623201794905635, "grad_norm": 0.0010727453045547009, "learning_rate": 3.4424818789291373e-08, "loss": 0.0001, "num_input_tokens_seen": 17241616, "step": 35030 }, { "epoch": 4.623861686683384, "grad_norm": 1.041501309373416e-05, "learning_rate": 3.4305082557071316e-08, "loss": 0.0, "num_input_tokens_seen": 17243792, "step": 35035 }, { "epoch": 4.624521578461132, "grad_norm": 0.00030621461337432265, "learning_rate": 3.418555128660461e-08, "loss": 0.0, "num_input_tokens_seen": 17246288, "step": 35040 }, { "epoch": 4.625181470238881, "grad_norm": 0.0003549474640749395, "learning_rate": 3.406622500326062e-08, "loss": 0.0, "num_input_tokens_seen": 17249040, "step": 35045 }, { "epoch": 4.625841362016629, "grad_norm": 1.2623581824300345e-05, "learning_rate": 3.3947103732365646e-08, "loss": 0.0018, "num_input_tokens_seen": 17251664, "step": 35050 }, { "epoch": 4.626501253794378, "grad_norm": 0.00046227945131249726, "learning_rate": 3.382818749920224e-08, "loss": 0.0, "num_input_tokens_seen": 17254160, "step": 35055 }, { "epoch": 4.627161145572126, "grad_norm": 0.015203205868601799, "learning_rate": 3.370947632900978e-08, "loss": 0.0239, "num_input_tokens_seen": 17256784, "step": 35060 }, { "epoch": 4.627821037349874, "grad_norm": 0.09579546004533768, "learning_rate": 3.3590970246983654e-08, "loss": 0.0, "num_input_tokens_seen": 17259088, "step": 35065 }, { "epoch": 4.628480929127623, "grad_norm": 4.486577381612733e-05, "learning_rate": 3.3472669278275637e-08, "loss": 0.0, "num_input_tokens_seen": 17261648, "step": 35070 }, { "epoch": 4.629140820905372, "grad_norm": 1.4926635230949614e-05, "learning_rate": 3.3354573447994637e-08, "loss": 0.0035, "num_input_tokens_seen": 17264336, "step": 35075 }, { "epoch": 4.62980071268312, "grad_norm": 1.2104676898161415e-05, "learning_rate": 3.3236682781205616e-08, "loss": 0.0, "num_input_tokens_seen": 17266576, "step": 35080 }, { "epoch": 4.630460604460868, "grad_norm": 3.344564538565464e-05, "learning_rate": 3.311899730292989e-08, "loss": 0.0, "num_input_tokens_seen": 17268944, "step": 35085 }, { "epoch": 4.6311204962386165, "grad_norm": 0.00019546352268662304, "learning_rate": 3.3001517038145356e-08, "loss": 0.0, "num_input_tokens_seen": 17271376, "step": 35090 }, { "epoch": 4.631780388016365, "grad_norm": 0.004190162289887667, "learning_rate": 3.28842420117863e-08, "loss": 0.0, "num_input_tokens_seen": 17273808, "step": 35095 }, { "epoch": 4.632440279794114, "grad_norm": 6.786596350139007e-05, "learning_rate": 3.27671722487437e-08, "loss": 0.0, "num_input_tokens_seen": 17276112, "step": 35100 }, { "epoch": 4.633100171571862, "grad_norm": 0.00013934404705651104, "learning_rate": 3.265030777386446e-08, "loss": 0.0005, "num_input_tokens_seen": 17278480, "step": 35105 }, { "epoch": 4.6337600633496105, "grad_norm": 0.000934273237362504, "learning_rate": 3.2533648611952623e-08, "loss": 0.0252, "num_input_tokens_seen": 17281296, "step": 35110 }, { "epoch": 4.634419955127359, "grad_norm": 0.0025720647536218166, "learning_rate": 3.241719478776805e-08, "loss": 0.0, "num_input_tokens_seen": 17283984, "step": 35115 }, { "epoch": 4.635079846905107, "grad_norm": 0.0044370610266923904, "learning_rate": 3.230094632602698e-08, "loss": 0.0, "num_input_tokens_seen": 17286352, "step": 35120 }, { "epoch": 4.635739738682856, "grad_norm": 0.002383069135248661, "learning_rate": 3.218490325140266e-08, "loss": 0.0, "num_input_tokens_seen": 17289040, "step": 35125 }, { "epoch": 4.6363996304606045, "grad_norm": 0.0003934859996661544, "learning_rate": 3.206906558852418e-08, "loss": 0.0, "num_input_tokens_seen": 17291536, "step": 35130 }, { "epoch": 4.637059522238353, "grad_norm": 0.00044940304360352457, "learning_rate": 3.195343336197742e-08, "loss": 0.0, "num_input_tokens_seen": 17294160, "step": 35135 }, { "epoch": 4.637719414016101, "grad_norm": 81.78995513916016, "learning_rate": 3.183800659630431e-08, "loss": 0.0755, "num_input_tokens_seen": 17296912, "step": 35140 }, { "epoch": 4.638379305793849, "grad_norm": 2.1038411432527937e-05, "learning_rate": 3.1722785316003475e-08, "loss": 0.0, "num_input_tokens_seen": 17299344, "step": 35145 }, { "epoch": 4.6390391975715985, "grad_norm": 0.0004084999964106828, "learning_rate": 3.160776954552979e-08, "loss": 0.0011, "num_input_tokens_seen": 17301840, "step": 35150 }, { "epoch": 4.639699089349347, "grad_norm": 0.00018177898891735822, "learning_rate": 3.149295930929441e-08, "loss": 0.0518, "num_input_tokens_seen": 17304336, "step": 35155 }, { "epoch": 4.640358981127095, "grad_norm": 1.1378585440979805e-05, "learning_rate": 3.137835463166494e-08, "loss": 0.0016, "num_input_tokens_seen": 17306768, "step": 35160 }, { "epoch": 4.641018872904843, "grad_norm": 0.0009956208523362875, "learning_rate": 3.12639555369657e-08, "loss": 0.0305, "num_input_tokens_seen": 17308816, "step": 35165 }, { "epoch": 4.641678764682592, "grad_norm": 0.0009633854497224092, "learning_rate": 3.1149762049476724e-08, "loss": 0.0226, "num_input_tokens_seen": 17311696, "step": 35170 }, { "epoch": 4.642338656460341, "grad_norm": 0.1642853021621704, "learning_rate": 3.103577419343484e-08, "loss": 0.0, "num_input_tokens_seen": 17314000, "step": 35175 }, { "epoch": 4.642998548238089, "grad_norm": 0.015242863446474075, "learning_rate": 3.092199199303325e-08, "loss": 0.0, "num_input_tokens_seen": 17316368, "step": 35180 }, { "epoch": 4.643658440015837, "grad_norm": 0.010791040025651455, "learning_rate": 3.0808415472421413e-08, "loss": 0.0, "num_input_tokens_seen": 17318800, "step": 35185 }, { "epoch": 4.644318331793586, "grad_norm": 0.05711999163031578, "learning_rate": 3.069504465570505e-08, "loss": 0.036, "num_input_tokens_seen": 17321296, "step": 35190 }, { "epoch": 4.644978223571334, "grad_norm": 1.9168721337337047e-05, "learning_rate": 3.0581879566946243e-08, "loss": 0.0, "num_input_tokens_seen": 17323472, "step": 35195 }, { "epoch": 4.645638115349083, "grad_norm": 2.594672878331039e-05, "learning_rate": 3.046892023016356e-08, "loss": 0.0, "num_input_tokens_seen": 17325712, "step": 35200 }, { "epoch": 4.646298007126831, "grad_norm": 9.095690620597452e-05, "learning_rate": 3.035616666933183e-08, "loss": 0.0, "num_input_tokens_seen": 17328464, "step": 35205 }, { "epoch": 4.64695789890458, "grad_norm": 1.3668033716385253e-05, "learning_rate": 3.024361890838201e-08, "loss": 0.0, "num_input_tokens_seen": 17330960, "step": 35210 }, { "epoch": 4.647617790682328, "grad_norm": 0.0003607422695495188, "learning_rate": 3.013127697120166e-08, "loss": 0.0001, "num_input_tokens_seen": 17333776, "step": 35215 }, { "epoch": 4.648277682460076, "grad_norm": 3.8394380680983886e-05, "learning_rate": 3.00191408816346e-08, "loss": 0.0028, "num_input_tokens_seen": 17336144, "step": 35220 }, { "epoch": 4.648937574237825, "grad_norm": 3.6790715967072174e-05, "learning_rate": 2.99072106634809e-08, "loss": 0.0, "num_input_tokens_seen": 17338640, "step": 35225 }, { "epoch": 4.649597466015574, "grad_norm": 2.8719652618747205e-05, "learning_rate": 2.9795486340496557e-08, "loss": 0.0, "num_input_tokens_seen": 17341136, "step": 35230 }, { "epoch": 4.650257357793322, "grad_norm": 0.00010499545896891505, "learning_rate": 2.968396793639494e-08, "loss": 0.0, "num_input_tokens_seen": 17343248, "step": 35235 }, { "epoch": 4.65091724957107, "grad_norm": 1.3791161109111272e-05, "learning_rate": 2.9572655474844555e-08, "loss": 0.0, "num_input_tokens_seen": 17345744, "step": 35240 }, { "epoch": 4.6515771413488185, "grad_norm": 0.000668855500407517, "learning_rate": 2.9461548979470507e-08, "loss": 0.0, "num_input_tokens_seen": 17348432, "step": 35245 }, { "epoch": 4.652237033126568, "grad_norm": 1.8187585737905465e-05, "learning_rate": 2.9350648473854933e-08, "loss": 0.0001, "num_input_tokens_seen": 17350864, "step": 35250 }, { "epoch": 4.652896924904316, "grad_norm": 2.9176559110055678e-05, "learning_rate": 2.9239953981535116e-08, "loss": 0.0, "num_input_tokens_seen": 17353360, "step": 35255 }, { "epoch": 4.653556816682064, "grad_norm": 7.256161916302517e-05, "learning_rate": 2.9129465526005592e-08, "loss": 0.0, "num_input_tokens_seen": 17355664, "step": 35260 }, { "epoch": 4.6542167084598125, "grad_norm": 0.0024205967783927917, "learning_rate": 2.9019183130716386e-08, "loss": 0.0, "num_input_tokens_seen": 17358288, "step": 35265 }, { "epoch": 4.654876600237561, "grad_norm": 1.6926347598200664e-05, "learning_rate": 2.8909106819074214e-08, "loss": 0.0032, "num_input_tokens_seen": 17360400, "step": 35270 }, { "epoch": 4.65553649201531, "grad_norm": 0.003134387545287609, "learning_rate": 2.8799236614442168e-08, "loss": 0.0, "num_input_tokens_seen": 17363216, "step": 35275 }, { "epoch": 4.656196383793058, "grad_norm": 1.5004871784185525e-05, "learning_rate": 2.868957254013915e-08, "loss": 0.0, "num_input_tokens_seen": 17365584, "step": 35280 }, { "epoch": 4.6568562755708065, "grad_norm": 0.00010448491229908541, "learning_rate": 2.8580114619440655e-08, "loss": 0.0, "num_input_tokens_seen": 17367824, "step": 35285 }, { "epoch": 4.657516167348555, "grad_norm": 0.0001897216570796445, "learning_rate": 2.8470862875578427e-08, "loss": 0.0, "num_input_tokens_seen": 17370064, "step": 35290 }, { "epoch": 4.658176059126303, "grad_norm": 0.035831667482852936, "learning_rate": 2.836181733174037e-08, "loss": 0.0, "num_input_tokens_seen": 17372688, "step": 35295 }, { "epoch": 4.658835950904052, "grad_norm": 7.262377766892314e-05, "learning_rate": 2.8252978011070404e-08, "loss": 0.0, "num_input_tokens_seen": 17375312, "step": 35300 }, { "epoch": 4.6594958426818005, "grad_norm": 0.0005857625510543585, "learning_rate": 2.8144344936669062e-08, "loss": 0.0, "num_input_tokens_seen": 17377744, "step": 35305 }, { "epoch": 4.660155734459549, "grad_norm": 0.00033626792719587684, "learning_rate": 2.8035918131592895e-08, "loss": 0.0, "num_input_tokens_seen": 17380240, "step": 35310 }, { "epoch": 4.660815626237297, "grad_norm": 11.807950019836426, "learning_rate": 2.792769761885472e-08, "loss": 0.0132, "num_input_tokens_seen": 17382608, "step": 35315 }, { "epoch": 4.661475518015045, "grad_norm": 4.965622792951763e-05, "learning_rate": 2.781968342142349e-08, "loss": 0.0, "num_input_tokens_seen": 17385104, "step": 35320 }, { "epoch": 4.662135409792794, "grad_norm": 1.8443208318785764e-05, "learning_rate": 2.771187556222454e-08, "loss": 0.0, "num_input_tokens_seen": 17387664, "step": 35325 }, { "epoch": 4.662795301570543, "grad_norm": 0.007886563427746296, "learning_rate": 2.7604274064139123e-08, "loss": 0.0011, "num_input_tokens_seen": 17389712, "step": 35330 }, { "epoch": 4.663455193348291, "grad_norm": 5.378757487051189e-05, "learning_rate": 2.7496878950005077e-08, "loss": 0.0, "num_input_tokens_seen": 17392400, "step": 35335 }, { "epoch": 4.664115085126039, "grad_norm": 1.7856054910225794e-05, "learning_rate": 2.738969024261606e-08, "loss": 0.0, "num_input_tokens_seen": 17395088, "step": 35340 }, { "epoch": 4.664774976903788, "grad_norm": 1.7758238755050115e-05, "learning_rate": 2.7282707964722427e-08, "loss": 0.0, "num_input_tokens_seen": 17397520, "step": 35345 }, { "epoch": 4.665434868681536, "grad_norm": 0.00024594739079475403, "learning_rate": 2.7175932139030022e-08, "loss": 0.0008, "num_input_tokens_seen": 17399824, "step": 35350 }, { "epoch": 4.666094760459285, "grad_norm": 0.004140197765082121, "learning_rate": 2.7069362788201267e-08, "loss": 0.0, "num_input_tokens_seen": 17402384, "step": 35355 }, { "epoch": 4.666754652237033, "grad_norm": 5.721020698547363, "learning_rate": 2.6962999934855068e-08, "loss": 0.0039, "num_input_tokens_seen": 17404752, "step": 35360 }, { "epoch": 4.667414544014782, "grad_norm": 0.004005917347967625, "learning_rate": 2.6856843601565816e-08, "loss": 0.0, "num_input_tokens_seen": 17407184, "step": 35365 }, { "epoch": 4.66807443579253, "grad_norm": 2.8405822376953438e-05, "learning_rate": 2.6750893810864596e-08, "loss": 0.0, "num_input_tokens_seen": 17409680, "step": 35370 }, { "epoch": 4.668734327570278, "grad_norm": 3.6111789086135104e-05, "learning_rate": 2.6645150585238528e-08, "loss": 0.0381, "num_input_tokens_seen": 17412304, "step": 35375 }, { "epoch": 4.6693942193480265, "grad_norm": 6.57942146062851e-05, "learning_rate": 2.653961394713067e-08, "loss": 0.0005, "num_input_tokens_seen": 17414736, "step": 35380 }, { "epoch": 4.670054111125776, "grad_norm": 0.3089575469493866, "learning_rate": 2.6434283918940424e-08, "loss": 0.0002, "num_input_tokens_seen": 17417232, "step": 35385 }, { "epoch": 4.670714002903524, "grad_norm": 9.907195091247559, "learning_rate": 2.6329160523023587e-08, "loss": 0.0294, "num_input_tokens_seen": 17420240, "step": 35390 }, { "epoch": 4.671373894681272, "grad_norm": 1.850312764872797e-05, "learning_rate": 2.6224243781691636e-08, "loss": 0.0, "num_input_tokens_seen": 17422928, "step": 35395 }, { "epoch": 4.6720337864590205, "grad_norm": 3.9662245399085805e-05, "learning_rate": 2.6119533717212428e-08, "loss": 0.0, "num_input_tokens_seen": 17425424, "step": 35400 }, { "epoch": 4.672693678236769, "grad_norm": 6.341608241200447e-05, "learning_rate": 2.601503035180963e-08, "loss": 0.0, "num_input_tokens_seen": 17427920, "step": 35405 }, { "epoch": 4.673353570014518, "grad_norm": 2.3568481992697343e-05, "learning_rate": 2.5910733707663947e-08, "loss": 0.0, "num_input_tokens_seen": 17430416, "step": 35410 }, { "epoch": 4.674013461792266, "grad_norm": 2.081350248772651e-05, "learning_rate": 2.5806643806910998e-08, "loss": 0.0, "num_input_tokens_seen": 17432784, "step": 35415 }, { "epoch": 4.6746733535700145, "grad_norm": 0.011610975489020348, "learning_rate": 2.5702760671643455e-08, "loss": 0.0, "num_input_tokens_seen": 17434896, "step": 35420 }, { "epoch": 4.675333245347763, "grad_norm": 3.9640330214751884e-05, "learning_rate": 2.559908432390967e-08, "loss": 0.0, "num_input_tokens_seen": 17437200, "step": 35425 }, { "epoch": 4.675993137125511, "grad_norm": 0.9224775433540344, "learning_rate": 2.5495614785714047e-08, "loss": 0.0219, "num_input_tokens_seen": 17439504, "step": 35430 }, { "epoch": 4.67665302890326, "grad_norm": 7.126452692318708e-05, "learning_rate": 2.5392352079017576e-08, "loss": 0.0, "num_input_tokens_seen": 17442000, "step": 35435 }, { "epoch": 4.6773129206810085, "grad_norm": 0.007979700341820717, "learning_rate": 2.528929622573661e-08, "loss": 0.0, "num_input_tokens_seen": 17444240, "step": 35440 }, { "epoch": 4.677972812458757, "grad_norm": 5.438230436993763e-05, "learning_rate": 2.5186447247744436e-08, "loss": 0.0, "num_input_tokens_seen": 17446672, "step": 35445 }, { "epoch": 4.678632704236505, "grad_norm": 0.00035698155988939106, "learning_rate": 2.5083805166869698e-08, "loss": 0.0, "num_input_tokens_seen": 17449232, "step": 35450 }, { "epoch": 4.679292596014253, "grad_norm": 35.28067398071289, "learning_rate": 2.4981370004897527e-08, "loss": 0.0657, "num_input_tokens_seen": 17451920, "step": 35455 }, { "epoch": 4.6799524877920025, "grad_norm": 0.00012678831990342587, "learning_rate": 2.487914178356898e-08, "loss": 0.0, "num_input_tokens_seen": 17454224, "step": 35460 }, { "epoch": 4.680612379569751, "grad_norm": 4.944609827362001e-05, "learning_rate": 2.4777120524581364e-08, "loss": 0.0, "num_input_tokens_seen": 17456784, "step": 35465 }, { "epoch": 4.681272271347499, "grad_norm": 0.014934533275663853, "learning_rate": 2.4675306249587912e-08, "loss": 0.0487, "num_input_tokens_seen": 17459088, "step": 35470 }, { "epoch": 4.681932163125247, "grad_norm": 0.0011983781587332487, "learning_rate": 2.45736989801979e-08, "loss": 0.0411, "num_input_tokens_seen": 17461456, "step": 35475 }, { "epoch": 4.682592054902996, "grad_norm": 0.00014378594642039388, "learning_rate": 2.4472298737976848e-08, "loss": 0.0, "num_input_tokens_seen": 17463760, "step": 35480 }, { "epoch": 4.683251946680745, "grad_norm": 1.3808754374622367e-05, "learning_rate": 2.4371105544446323e-08, "loss": 0.0595, "num_input_tokens_seen": 17466128, "step": 35485 }, { "epoch": 4.683911838458493, "grad_norm": 0.4533805549144745, "learning_rate": 2.427011942108348e-08, "loss": 0.0002, "num_input_tokens_seen": 17468624, "step": 35490 }, { "epoch": 4.684571730236241, "grad_norm": 2.5709761757752858e-05, "learning_rate": 2.416934038932217e-08, "loss": 0.0595, "num_input_tokens_seen": 17470928, "step": 35495 }, { "epoch": 4.68523162201399, "grad_norm": 0.003099799156188965, "learning_rate": 2.406876847055206e-08, "loss": 0.0095, "num_input_tokens_seen": 17473552, "step": 35500 }, { "epoch": 4.685891513791738, "grad_norm": 2.1243256924208254e-05, "learning_rate": 2.396840368611852e-08, "loss": 0.0, "num_input_tokens_seen": 17475600, "step": 35505 }, { "epoch": 4.686551405569487, "grad_norm": 2.592039163573645e-05, "learning_rate": 2.3868246057323515e-08, "loss": 0.001, "num_input_tokens_seen": 17478224, "step": 35510 }, { "epoch": 4.687211297347235, "grad_norm": 1.40487991302507e-05, "learning_rate": 2.3768295605424703e-08, "loss": 0.1067, "num_input_tokens_seen": 17480656, "step": 35515 }, { "epoch": 4.687871189124984, "grad_norm": 0.00012769679597113281, "learning_rate": 2.3668552351635896e-08, "loss": 0.0, "num_input_tokens_seen": 17482960, "step": 35520 }, { "epoch": 4.688531080902732, "grad_norm": 2.972482798213605e-05, "learning_rate": 2.356901631712671e-08, "loss": 0.0, "num_input_tokens_seen": 17485648, "step": 35525 }, { "epoch": 4.68919097268048, "grad_norm": 0.00017019780352711678, "learning_rate": 2.346968752302303e-08, "loss": 0.0518, "num_input_tokens_seen": 17488208, "step": 35530 }, { "epoch": 4.689850864458229, "grad_norm": 0.004232440609484911, "learning_rate": 2.3370565990406877e-08, "loss": 0.0, "num_input_tokens_seen": 17490704, "step": 35535 }, { "epoch": 4.690510756235978, "grad_norm": 2.0393299564602785e-05, "learning_rate": 2.3271651740315755e-08, "loss": 0.0, "num_input_tokens_seen": 17493008, "step": 35540 }, { "epoch": 4.691170648013726, "grad_norm": 0.00018886018369812518, "learning_rate": 2.3172944793743653e-08, "loss": 0.0188, "num_input_tokens_seen": 17495376, "step": 35545 }, { "epoch": 4.691830539791474, "grad_norm": 1.3998830581840593e-05, "learning_rate": 2.3074445171640366e-08, "loss": 0.0, "num_input_tokens_seen": 17497616, "step": 35550 }, { "epoch": 4.692490431569222, "grad_norm": 5.956808308837935e-05, "learning_rate": 2.2976152894911838e-08, "loss": 0.0252, "num_input_tokens_seen": 17500368, "step": 35555 }, { "epoch": 4.693150323346972, "grad_norm": 0.014102444052696228, "learning_rate": 2.2878067984419825e-08, "loss": 0.0261, "num_input_tokens_seen": 17502736, "step": 35560 }, { "epoch": 4.69381021512472, "grad_norm": 6.130715337349102e-05, "learning_rate": 2.2780190460981896e-08, "loss": 0.0, "num_input_tokens_seen": 17505232, "step": 35565 }, { "epoch": 4.694470106902468, "grad_norm": 0.00011063072452088818, "learning_rate": 2.2682520345372325e-08, "loss": 0.0, "num_input_tokens_seen": 17507600, "step": 35570 }, { "epoch": 4.695129998680216, "grad_norm": 0.04662204161286354, "learning_rate": 2.258505765832064e-08, "loss": 0.0003, "num_input_tokens_seen": 17510032, "step": 35575 }, { "epoch": 4.695789890457965, "grad_norm": 0.03584326431155205, "learning_rate": 2.248780242051229e-08, "loss": 0.0001, "num_input_tokens_seen": 17512848, "step": 35580 }, { "epoch": 4.696449782235713, "grad_norm": 0.005998116452246904, "learning_rate": 2.239075465258966e-08, "loss": 0.0, "num_input_tokens_seen": 17515344, "step": 35585 }, { "epoch": 4.697109674013462, "grad_norm": 3.821205609710887e-05, "learning_rate": 2.2293914375149824e-08, "loss": 0.0, "num_input_tokens_seen": 17517776, "step": 35590 }, { "epoch": 4.69776956579121, "grad_norm": 1.1565753993636463e-05, "learning_rate": 2.2197281608746787e-08, "loss": 0.0, "num_input_tokens_seen": 17520272, "step": 35595 }, { "epoch": 4.698429457568959, "grad_norm": 3.2036539778346196e-05, "learning_rate": 2.210085637388992e-08, "loss": 0.0062, "num_input_tokens_seen": 17522512, "step": 35600 }, { "epoch": 4.699089349346707, "grad_norm": 1.2361926565063186e-05, "learning_rate": 2.2004638691044962e-08, "loss": 0.0, "num_input_tokens_seen": 17525264, "step": 35605 }, { "epoch": 4.699749241124455, "grad_norm": 0.0018424964509904385, "learning_rate": 2.190862858063347e-08, "loss": 0.0338, "num_input_tokens_seen": 17527568, "step": 35610 }, { "epoch": 4.700409132902204, "grad_norm": 5.0226128223584965e-05, "learning_rate": 2.1812826063032584e-08, "loss": 0.0, "num_input_tokens_seen": 17530064, "step": 35615 }, { "epoch": 4.701069024679953, "grad_norm": 2.2851856556371786e-05, "learning_rate": 2.1717231158576045e-08, "loss": 0.0, "num_input_tokens_seen": 17532688, "step": 35620 }, { "epoch": 4.701728916457701, "grad_norm": 6.59624784020707e-05, "learning_rate": 2.1621843887552948e-08, "loss": 0.0, "num_input_tokens_seen": 17535120, "step": 35625 }, { "epoch": 4.702388808235449, "grad_norm": 0.0010319275315850973, "learning_rate": 2.1526664270208662e-08, "loss": 0.0, "num_input_tokens_seen": 17537488, "step": 35630 }, { "epoch": 4.7030487000131975, "grad_norm": 6.349383329506963e-05, "learning_rate": 2.1431692326744244e-08, "loss": 0.0, "num_input_tokens_seen": 17540176, "step": 35635 }, { "epoch": 4.703708591790946, "grad_norm": 0.00023460011288989335, "learning_rate": 2.1336928077317017e-08, "loss": 0.0, "num_input_tokens_seen": 17542672, "step": 35640 }, { "epoch": 4.704368483568695, "grad_norm": 2.8320706405793317e-05, "learning_rate": 2.1242371542039893e-08, "loss": 0.0016, "num_input_tokens_seen": 17544848, "step": 35645 }, { "epoch": 4.705028375346443, "grad_norm": 0.1557927280664444, "learning_rate": 2.1148022740981708e-08, "loss": 0.0, "num_input_tokens_seen": 17547344, "step": 35650 }, { "epoch": 4.7056882671241915, "grad_norm": 0.00012379908002912998, "learning_rate": 2.1053881694167442e-08, "loss": 0.0, "num_input_tokens_seen": 17549968, "step": 35655 }, { "epoch": 4.70634815890194, "grad_norm": 0.0013328184140846133, "learning_rate": 2.095994842157789e-08, "loss": 0.02, "num_input_tokens_seen": 17552272, "step": 35660 }, { "epoch": 4.707008050679688, "grad_norm": 2.367728120589163e-05, "learning_rate": 2.086622294314955e-08, "loss": 0.0002, "num_input_tokens_seen": 17554768, "step": 35665 }, { "epoch": 4.707667942457437, "grad_norm": 5.5861837608972564e-05, "learning_rate": 2.077270527877495e-08, "loss": 0.0, "num_input_tokens_seen": 17557136, "step": 35670 }, { "epoch": 4.7083278342351855, "grad_norm": 1.653864273976069e-05, "learning_rate": 2.067939544830277e-08, "loss": 0.0164, "num_input_tokens_seen": 17559696, "step": 35675 }, { "epoch": 4.708987726012934, "grad_norm": 6.894586113048717e-05, "learning_rate": 2.0586293471537287e-08, "loss": 0.0, "num_input_tokens_seen": 17562128, "step": 35680 }, { "epoch": 4.709647617790682, "grad_norm": 0.18366917967796326, "learning_rate": 2.0493399368238573e-08, "loss": 0.0295, "num_input_tokens_seen": 17565136, "step": 35685 }, { "epoch": 4.71030750956843, "grad_norm": 0.019991910085082054, "learning_rate": 2.0400713158122863e-08, "loss": 0.0001, "num_input_tokens_seen": 17567952, "step": 35690 }, { "epoch": 4.7109674013461795, "grad_norm": 0.004305573645979166, "learning_rate": 2.0308234860862084e-08, "loss": 0.0, "num_input_tokens_seen": 17570256, "step": 35695 }, { "epoch": 4.711627293123928, "grad_norm": 2.053894058917649e-05, "learning_rate": 2.021596449608409e-08, "loss": 0.0, "num_input_tokens_seen": 17572624, "step": 35700 }, { "epoch": 4.712287184901676, "grad_norm": 0.00030093066743575037, "learning_rate": 2.0123902083372557e-08, "loss": 0.0, "num_input_tokens_seen": 17574800, "step": 35705 }, { "epoch": 4.712947076679424, "grad_norm": 0.00013274639786686748, "learning_rate": 2.003204764226718e-08, "loss": 0.0766, "num_input_tokens_seen": 17577360, "step": 35710 }, { "epoch": 4.713606968457173, "grad_norm": 5.4098480177344754e-05, "learning_rate": 1.9940401192263146e-08, "loss": 0.0, "num_input_tokens_seen": 17579856, "step": 35715 }, { "epoch": 4.714266860234922, "grad_norm": 8.203894685721025e-05, "learning_rate": 1.9848962752812006e-08, "loss": 0.0, "num_input_tokens_seen": 17582224, "step": 35720 }, { "epoch": 4.71492675201267, "grad_norm": 0.004259839653968811, "learning_rate": 1.9757732343320898e-08, "loss": 0.0, "num_input_tokens_seen": 17584656, "step": 35725 }, { "epoch": 4.715586643790418, "grad_norm": 0.001554732909426093, "learning_rate": 1.9666709983152674e-08, "loss": 0.0, "num_input_tokens_seen": 17587152, "step": 35730 }, { "epoch": 4.716246535568167, "grad_norm": 1.9275730664958246e-05, "learning_rate": 1.957589569162632e-08, "loss": 0.0, "num_input_tokens_seen": 17589520, "step": 35735 }, { "epoch": 4.716906427345915, "grad_norm": 15.459978103637695, "learning_rate": 1.948528948801631e-08, "loss": 0.0579, "num_input_tokens_seen": 17591824, "step": 35740 }, { "epoch": 4.717566319123664, "grad_norm": 0.0007542030070908368, "learning_rate": 1.939489139155337e-08, "loss": 0.0, "num_input_tokens_seen": 17594512, "step": 35745 }, { "epoch": 4.718226210901412, "grad_norm": 8.487552986480296e-05, "learning_rate": 1.9304701421423707e-08, "loss": 0.0, "num_input_tokens_seen": 17596880, "step": 35750 }, { "epoch": 4.718886102679161, "grad_norm": 2.4246677639894187e-05, "learning_rate": 1.921471959676957e-08, "loss": 0.0, "num_input_tokens_seen": 17599504, "step": 35755 }, { "epoch": 4.719545994456909, "grad_norm": 0.0041843983344733715, "learning_rate": 1.9124945936688896e-08, "loss": 0.02, "num_input_tokens_seen": 17601872, "step": 35760 }, { "epoch": 4.720205886234657, "grad_norm": 3.7647943827323616e-05, "learning_rate": 1.903538046023545e-08, "loss": 0.0016, "num_input_tokens_seen": 17604560, "step": 35765 }, { "epoch": 4.720865778012406, "grad_norm": 0.210673525929451, "learning_rate": 1.8946023186419025e-08, "loss": 0.0001, "num_input_tokens_seen": 17606736, "step": 35770 }, { "epoch": 4.721525669790155, "grad_norm": 9.897825293592177e-06, "learning_rate": 1.885687413420478e-08, "loss": 0.0, "num_input_tokens_seen": 17609360, "step": 35775 }, { "epoch": 4.722185561567903, "grad_norm": 4.5366341510089114e-05, "learning_rate": 1.876793332251425e-08, "loss": 0.0, "num_input_tokens_seen": 17611600, "step": 35780 }, { "epoch": 4.722845453345651, "grad_norm": 0.00021711646695621312, "learning_rate": 1.8679200770224445e-08, "loss": 0.0, "num_input_tokens_seen": 17614224, "step": 35785 }, { "epoch": 4.7235053451233995, "grad_norm": 0.00011851973249576986, "learning_rate": 1.859067649616797e-08, "loss": 0.1016, "num_input_tokens_seen": 17616656, "step": 35790 }, { "epoch": 4.724165236901149, "grad_norm": 0.00015636181342415512, "learning_rate": 1.8502360519133564e-08, "loss": 0.0009, "num_input_tokens_seen": 17619600, "step": 35795 }, { "epoch": 4.724825128678897, "grad_norm": 0.00039138575084507465, "learning_rate": 1.8414252857865688e-08, "loss": 0.0, "num_input_tokens_seen": 17622160, "step": 35800 }, { "epoch": 4.725485020456645, "grad_norm": 2.3353479264187627e-05, "learning_rate": 1.8326353531064708e-08, "loss": 0.0, "num_input_tokens_seen": 17624720, "step": 35805 }, { "epoch": 4.7261449122343935, "grad_norm": 0.004620248917490244, "learning_rate": 1.8238662557386262e-08, "loss": 0.0, "num_input_tokens_seen": 17627280, "step": 35810 }, { "epoch": 4.726804804012142, "grad_norm": 1.8144639398087747e-05, "learning_rate": 1.8151179955442463e-08, "loss": 0.0, "num_input_tokens_seen": 17630032, "step": 35815 }, { "epoch": 4.727464695789891, "grad_norm": 1.72847921930952e-05, "learning_rate": 1.806390574380079e-08, "loss": 0.0337, "num_input_tokens_seen": 17632720, "step": 35820 }, { "epoch": 4.728124587567639, "grad_norm": 1.156251528300345e-05, "learning_rate": 1.797683994098431e-08, "loss": 0.0, "num_input_tokens_seen": 17635280, "step": 35825 }, { "epoch": 4.7287844793453875, "grad_norm": 1.7015587218338624e-05, "learning_rate": 1.7889982565472473e-08, "loss": 0.0, "num_input_tokens_seen": 17637840, "step": 35830 }, { "epoch": 4.729444371123136, "grad_norm": 0.00022032709966879338, "learning_rate": 1.780333363569986e-08, "loss": 0.0001, "num_input_tokens_seen": 17640208, "step": 35835 }, { "epoch": 4.730104262900884, "grad_norm": 1.7302148989983834e-05, "learning_rate": 1.77168931700572e-08, "loss": 0.0, "num_input_tokens_seen": 17642768, "step": 35840 }, { "epoch": 4.730764154678632, "grad_norm": 5.2531137043843046e-05, "learning_rate": 1.7630661186890827e-08, "loss": 0.0647, "num_input_tokens_seen": 17645136, "step": 35845 }, { "epoch": 4.7314240464563815, "grad_norm": 3.242297316319309e-05, "learning_rate": 1.7544637704502875e-08, "loss": 0.0, "num_input_tokens_seen": 17647504, "step": 35850 }, { "epoch": 4.73208393823413, "grad_norm": 0.00018562580225989223, "learning_rate": 1.745882274115118e-08, "loss": 0.0, "num_input_tokens_seen": 17649808, "step": 35855 }, { "epoch": 4.732743830011878, "grad_norm": 1.8453007214702666e-05, "learning_rate": 1.7373216315049288e-08, "loss": 0.0, "num_input_tokens_seen": 17652624, "step": 35860 }, { "epoch": 4.733403721789626, "grad_norm": 6.539422611240298e-05, "learning_rate": 1.7287818444366663e-08, "loss": 0.0, "num_input_tokens_seen": 17655248, "step": 35865 }, { "epoch": 4.734063613567375, "grad_norm": 0.00016532238805666566, "learning_rate": 1.7202629147228365e-08, "loss": 0.0032, "num_input_tokens_seen": 17657744, "step": 35870 }, { "epoch": 4.734723505345124, "grad_norm": 1.4289161299529951e-05, "learning_rate": 1.711764844171515e-08, "loss": 0.0266, "num_input_tokens_seen": 17660432, "step": 35875 }, { "epoch": 4.735383397122872, "grad_norm": 0.0004984359256923199, "learning_rate": 1.7032876345863588e-08, "loss": 0.0023, "num_input_tokens_seen": 17662736, "step": 35880 }, { "epoch": 4.73604328890062, "grad_norm": 3.515151547617279e-05, "learning_rate": 1.694831287766596e-08, "loss": 0.0, "num_input_tokens_seen": 17665040, "step": 35885 }, { "epoch": 4.736703180678369, "grad_norm": 0.986121416091919, "learning_rate": 1.6863958055070126e-08, "loss": 0.0149, "num_input_tokens_seen": 17667088, "step": 35890 }, { "epoch": 4.737363072456117, "grad_norm": 1.1831551091745496e-05, "learning_rate": 1.677981189597988e-08, "loss": 0.0, "num_input_tokens_seen": 17669456, "step": 35895 }, { "epoch": 4.738022964233865, "grad_norm": 0.0015665609389543533, "learning_rate": 1.6695874418254707e-08, "loss": 0.0, "num_input_tokens_seen": 17671760, "step": 35900 }, { "epoch": 4.738682856011614, "grad_norm": 2.9511278626159765e-05, "learning_rate": 1.6612145639709696e-08, "loss": 0.0, "num_input_tokens_seen": 17674320, "step": 35905 }, { "epoch": 4.739342747789363, "grad_norm": 0.00012323328701313585, "learning_rate": 1.652862557811563e-08, "loss": 0.0239, "num_input_tokens_seen": 17676688, "step": 35910 }, { "epoch": 4.740002639567111, "grad_norm": 2.5474702852079645e-05, "learning_rate": 1.6445314251198884e-08, "loss": 0.0, "num_input_tokens_seen": 17678800, "step": 35915 }, { "epoch": 4.740662531344859, "grad_norm": 0.00870624277740717, "learning_rate": 1.636221167664209e-08, "loss": 0.0001, "num_input_tokens_seen": 17681296, "step": 35920 }, { "epoch": 4.741322423122607, "grad_norm": 3.422133158892393e-05, "learning_rate": 1.6279317872082697e-08, "loss": 0.0, "num_input_tokens_seen": 17683728, "step": 35925 }, { "epoch": 4.741982314900357, "grad_norm": 0.00010780996672110632, "learning_rate": 1.6196632855114745e-08, "loss": 0.0, "num_input_tokens_seen": 17686096, "step": 35930 }, { "epoch": 4.742642206678105, "grad_norm": 2.2824242478236556e-05, "learning_rate": 1.611415664328708e-08, "loss": 0.0001, "num_input_tokens_seen": 17688528, "step": 35935 }, { "epoch": 4.743302098455853, "grad_norm": 2.1149289750610478e-05, "learning_rate": 1.6031889254105148e-08, "loss": 0.0, "num_input_tokens_seen": 17691088, "step": 35940 }, { "epoch": 4.743961990233601, "grad_norm": 0.00030069437343627214, "learning_rate": 1.594983070502942e-08, "loss": 0.0381, "num_input_tokens_seen": 17693392, "step": 35945 }, { "epoch": 4.74462188201135, "grad_norm": 1.4820947399130091e-05, "learning_rate": 1.5867981013475974e-08, "loss": 0.0, "num_input_tokens_seen": 17695824, "step": 35950 }, { "epoch": 4.745281773789099, "grad_norm": 6.589458644157276e-05, "learning_rate": 1.5786340196817127e-08, "loss": 0.0177, "num_input_tokens_seen": 17698000, "step": 35955 }, { "epoch": 4.745941665566847, "grad_norm": 0.0008548679179511964, "learning_rate": 1.570490827238047e-08, "loss": 0.0, "num_input_tokens_seen": 17700176, "step": 35960 }, { "epoch": 4.746601557344595, "grad_norm": 0.22142356634140015, "learning_rate": 1.562368525744939e-08, "loss": 0.0001, "num_input_tokens_seen": 17702864, "step": 35965 }, { "epoch": 4.747261449122344, "grad_norm": 9.57998854573816e-05, "learning_rate": 1.5542671169262667e-08, "loss": 0.0032, "num_input_tokens_seen": 17705296, "step": 35970 }, { "epoch": 4.747921340900092, "grad_norm": 0.003186148591339588, "learning_rate": 1.5461866025015202e-08, "loss": 0.0, "num_input_tokens_seen": 17707920, "step": 35975 }, { "epoch": 4.748581232677841, "grad_norm": 0.0001743086177157238, "learning_rate": 1.5381269841857282e-08, "loss": 0.0, "num_input_tokens_seen": 17710352, "step": 35980 }, { "epoch": 4.749241124455589, "grad_norm": 8.552165672881529e-05, "learning_rate": 1.5300882636894662e-08, "loss": 0.0, "num_input_tokens_seen": 17712464, "step": 35985 }, { "epoch": 4.749901016233338, "grad_norm": 2.156283335352782e-05, "learning_rate": 1.5220704427189145e-08, "loss": 0.0426, "num_input_tokens_seen": 17714832, "step": 35990 }, { "epoch": 4.750560908011086, "grad_norm": 1.526574487797916e-05, "learning_rate": 1.5140735229757893e-08, "loss": 0.075, "num_input_tokens_seen": 17717200, "step": 35995 }, { "epoch": 4.751220799788834, "grad_norm": 2.099963057844434e-05, "learning_rate": 1.5060975061573777e-08, "loss": 0.0, "num_input_tokens_seen": 17719440, "step": 36000 }, { "epoch": 4.751880691566583, "grad_norm": 2.1203761207289062e-05, "learning_rate": 1.4981423939565364e-08, "loss": 0.0, "num_input_tokens_seen": 17721744, "step": 36005 }, { "epoch": 4.751880691566583, "eval_loss": 0.2836270332336426, "eval_runtime": 7.9949, "eval_samples_per_second": 842.412, "eval_steps_per_second": 105.317, "num_input_tokens_seen": 17721744, "step": 36005 }, { "epoch": 4.752540583344332, "grad_norm": 0.03689395636320114, "learning_rate": 1.49020818806167e-08, "loss": 0.0, "num_input_tokens_seen": 17724048, "step": 36010 }, { "epoch": 4.75320047512208, "grad_norm": 0.0009968762751668692, "learning_rate": 1.4822948901567767e-08, "loss": 0.0, "num_input_tokens_seen": 17726672, "step": 36015 }, { "epoch": 4.753860366899828, "grad_norm": 0.00012214395974297076, "learning_rate": 1.474402501921368e-08, "loss": 0.006, "num_input_tokens_seen": 17729168, "step": 36020 }, { "epoch": 4.7545202586775765, "grad_norm": 3.551087502273731e-05, "learning_rate": 1.4665310250305708e-08, "loss": 0.0001, "num_input_tokens_seen": 17731664, "step": 36025 }, { "epoch": 4.755180150455326, "grad_norm": 0.08151374757289886, "learning_rate": 1.4586804611550484e-08, "loss": 0.001, "num_input_tokens_seen": 17734224, "step": 36030 }, { "epoch": 4.755840042233074, "grad_norm": 0.041376352310180664, "learning_rate": 1.4508508119610019e-08, "loss": 0.0, "num_input_tokens_seen": 17736656, "step": 36035 }, { "epoch": 4.756499934010822, "grad_norm": 0.00023899480584077537, "learning_rate": 1.4430420791102461e-08, "loss": 0.0, "num_input_tokens_seen": 17738832, "step": 36040 }, { "epoch": 4.7571598257885706, "grad_norm": 0.0009312837501056492, "learning_rate": 1.4352542642601106e-08, "loss": 0.0, "num_input_tokens_seen": 17741328, "step": 36045 }, { "epoch": 4.757819717566319, "grad_norm": 0.00021485799516085535, "learning_rate": 1.427487369063507e-08, "loss": 0.0001, "num_input_tokens_seen": 17743952, "step": 36050 }, { "epoch": 4.758479609344068, "grad_norm": 0.4940636456012726, "learning_rate": 1.4197413951689052e-08, "loss": 0.0001, "num_input_tokens_seen": 17746512, "step": 36055 }, { "epoch": 4.759139501121816, "grad_norm": 0.0010852499399334192, "learning_rate": 1.4120163442203237e-08, "loss": 0.0, "num_input_tokens_seen": 17749072, "step": 36060 }, { "epoch": 4.7597993928995646, "grad_norm": 8.565572352381423e-05, "learning_rate": 1.404312217857373e-08, "loss": 0.0, "num_input_tokens_seen": 17752080, "step": 36065 }, { "epoch": 4.760459284677313, "grad_norm": 0.003699306631460786, "learning_rate": 1.3966290177151674e-08, "loss": 0.0, "num_input_tokens_seen": 17754704, "step": 36070 }, { "epoch": 4.761119176455061, "grad_norm": 3.3029118640115485e-05, "learning_rate": 1.3889667454244136e-08, "loss": 0.0, "num_input_tokens_seen": 17757456, "step": 36075 }, { "epoch": 4.76177906823281, "grad_norm": 0.0007963759708218277, "learning_rate": 1.3813254026113997e-08, "loss": 0.0, "num_input_tokens_seen": 17760080, "step": 36080 }, { "epoch": 4.7624389600105586, "grad_norm": 3.863330857711844e-05, "learning_rate": 1.373704990897917e-08, "loss": 0.0007, "num_input_tokens_seen": 17762576, "step": 36085 }, { "epoch": 4.763098851788307, "grad_norm": 0.0002565206668805331, "learning_rate": 1.3661055119013608e-08, "loss": 0.0, "num_input_tokens_seen": 17764944, "step": 36090 }, { "epoch": 4.763758743566055, "grad_norm": 0.001068632467649877, "learning_rate": 1.3585269672346633e-08, "loss": 0.0, "num_input_tokens_seen": 17767440, "step": 36095 }, { "epoch": 4.764418635343803, "grad_norm": 0.00014805530372541398, "learning_rate": 1.3509693585063042e-08, "loss": 0.0, "num_input_tokens_seen": 17769872, "step": 36100 }, { "epoch": 4.7650785271215526, "grad_norm": 5.604649049928412e-05, "learning_rate": 1.3434326873203449e-08, "loss": 0.0, "num_input_tokens_seen": 17772240, "step": 36105 }, { "epoch": 4.765738418899301, "grad_norm": 0.04028937220573425, "learning_rate": 1.3359169552763727e-08, "loss": 0.0, "num_input_tokens_seen": 17774608, "step": 36110 }, { "epoch": 4.766398310677049, "grad_norm": 2.779177884804085e-05, "learning_rate": 1.328422163969567e-08, "loss": 0.0, "num_input_tokens_seen": 17776976, "step": 36115 }, { "epoch": 4.767058202454797, "grad_norm": 3.3683489164104685e-05, "learning_rate": 1.320948314990633e-08, "loss": 0.0, "num_input_tokens_seen": 17779280, "step": 36120 }, { "epoch": 4.767718094232546, "grad_norm": 0.0006951736286282539, "learning_rate": 1.3134954099258466e-08, "loss": 0.0, "num_input_tokens_seen": 17781712, "step": 36125 }, { "epoch": 4.768377986010294, "grad_norm": 1.4812599147262517e-05, "learning_rate": 1.306063450357009e-08, "loss": 0.0, "num_input_tokens_seen": 17784208, "step": 36130 }, { "epoch": 4.769037877788043, "grad_norm": 2.1049470888101496e-05, "learning_rate": 1.298652437861536e-08, "loss": 0.0, "num_input_tokens_seen": 17786320, "step": 36135 }, { "epoch": 4.769697769565791, "grad_norm": 1.1275597898929846e-05, "learning_rate": 1.2912623740123362e-08, "loss": 0.0, "num_input_tokens_seen": 17788688, "step": 36140 }, { "epoch": 4.77035766134354, "grad_norm": 1.6923049770412035e-05, "learning_rate": 1.2838932603779107e-08, "loss": 0.0, "num_input_tokens_seen": 17791312, "step": 36145 }, { "epoch": 4.771017553121288, "grad_norm": 0.00019921225612051785, "learning_rate": 1.2765450985222859e-08, "loss": 0.0, "num_input_tokens_seen": 17793872, "step": 36150 }, { "epoch": 4.771677444899036, "grad_norm": 0.10752221941947937, "learning_rate": 1.269217890005081e-08, "loss": 0.0, "num_input_tokens_seen": 17796432, "step": 36155 }, { "epoch": 4.772337336676785, "grad_norm": 0.048550963401794434, "learning_rate": 1.2619116363814075e-08, "loss": 0.0, "num_input_tokens_seen": 17798864, "step": 36160 }, { "epoch": 4.772997228454534, "grad_norm": 2.1920983272138983e-05, "learning_rate": 1.2546263392019917e-08, "loss": 0.0, "num_input_tokens_seen": 17801168, "step": 36165 }, { "epoch": 4.773657120232282, "grad_norm": 0.00014910813479218632, "learning_rate": 1.2473620000130858e-08, "loss": 0.0, "num_input_tokens_seen": 17803344, "step": 36170 }, { "epoch": 4.77431701201003, "grad_norm": 1.8873417502618395e-05, "learning_rate": 1.2401186203564784e-08, "loss": 0.0, "num_input_tokens_seen": 17805840, "step": 36175 }, { "epoch": 4.7749769037877785, "grad_norm": 0.0004082721716258675, "learning_rate": 1.2328962017695288e-08, "loss": 0.0213, "num_input_tokens_seen": 17808336, "step": 36180 }, { "epoch": 4.775636795565527, "grad_norm": 2.1403398932307027e-05, "learning_rate": 1.225694745785144e-08, "loss": 0.0, "num_input_tokens_seen": 17810960, "step": 36185 }, { "epoch": 4.776296687343276, "grad_norm": 2.5181774617522024e-05, "learning_rate": 1.2185142539317905e-08, "loss": 0.0, "num_input_tokens_seen": 17813328, "step": 36190 }, { "epoch": 4.776956579121024, "grad_norm": 0.00021293084137141705, "learning_rate": 1.21135472773346e-08, "loss": 0.0, "num_input_tokens_seen": 17815760, "step": 36195 }, { "epoch": 4.7776164708987725, "grad_norm": 0.0020100793335586786, "learning_rate": 1.2042161687097152e-08, "loss": 0.0, "num_input_tokens_seen": 17818384, "step": 36200 }, { "epoch": 4.778276362676521, "grad_norm": 0.0014156547840684652, "learning_rate": 1.197098578375677e-08, "loss": 0.0, "num_input_tokens_seen": 17820752, "step": 36205 }, { "epoch": 4.778936254454269, "grad_norm": 1.781239188858308e-05, "learning_rate": 1.1900019582419818e-08, "loss": 0.0079, "num_input_tokens_seen": 17823248, "step": 36210 }, { "epoch": 4.779596146232018, "grad_norm": 0.00027068998315371573, "learning_rate": 1.1829263098148357e-08, "loss": 0.0, "num_input_tokens_seen": 17825680, "step": 36215 }, { "epoch": 4.7802560380097665, "grad_norm": 7.034857117105275e-05, "learning_rate": 1.1758716345960263e-08, "loss": 0.0, "num_input_tokens_seen": 17828112, "step": 36220 }, { "epoch": 4.780915929787515, "grad_norm": 1.102480473491596e-05, "learning_rate": 1.1688379340828224e-08, "loss": 0.0, "num_input_tokens_seen": 17830544, "step": 36225 }, { "epoch": 4.781575821565263, "grad_norm": 1.4273274246079382e-05, "learning_rate": 1.1618252097680858e-08, "loss": 0.0, "num_input_tokens_seen": 17833168, "step": 36230 }, { "epoch": 4.782235713343011, "grad_norm": 1.7388576452503912e-05, "learning_rate": 1.1548334631402146e-08, "loss": 0.0, "num_input_tokens_seen": 17835536, "step": 36235 }, { "epoch": 4.7828956051207605, "grad_norm": 0.0008372985175810754, "learning_rate": 1.1478626956831771e-08, "loss": 0.0, "num_input_tokens_seen": 17837712, "step": 36240 }, { "epoch": 4.783555496898509, "grad_norm": 6.483028118964285e-05, "learning_rate": 1.1409129088764346e-08, "loss": 0.0, "num_input_tokens_seen": 17840528, "step": 36245 }, { "epoch": 4.784215388676257, "grad_norm": 0.02836316078901291, "learning_rate": 1.1339841041950516e-08, "loss": 0.0, "num_input_tokens_seen": 17842896, "step": 36250 }, { "epoch": 4.784875280454005, "grad_norm": 0.0002625687629915774, "learning_rate": 1.1270762831096182e-08, "loss": 0.0, "num_input_tokens_seen": 17845520, "step": 36255 }, { "epoch": 4.785535172231754, "grad_norm": 1.4671531971544027e-05, "learning_rate": 1.1201894470862504e-08, "loss": 0.0, "num_input_tokens_seen": 17848144, "step": 36260 }, { "epoch": 4.786195064009503, "grad_norm": 5.6285505706910044e-05, "learning_rate": 1.1133235975866572e-08, "loss": 0.0, "num_input_tokens_seen": 17850320, "step": 36265 }, { "epoch": 4.786854955787251, "grad_norm": 2.7550644517759793e-05, "learning_rate": 1.1064787360680282e-08, "loss": 0.0, "num_input_tokens_seen": 17852816, "step": 36270 }, { "epoch": 4.787514847564999, "grad_norm": 0.000557609018869698, "learning_rate": 1.0996548639831793e-08, "loss": 0.0, "num_input_tokens_seen": 17855248, "step": 36275 }, { "epoch": 4.788174739342748, "grad_norm": 1.5351559341070242e-05, "learning_rate": 1.0928519827803961e-08, "loss": 0.0, "num_input_tokens_seen": 17857616, "step": 36280 }, { "epoch": 4.788834631120496, "grad_norm": 2.7812551707029343e-05, "learning_rate": 1.086070093903535e-08, "loss": 0.0, "num_input_tokens_seen": 17860240, "step": 36285 }, { "epoch": 4.789494522898245, "grad_norm": 0.004202886018902063, "learning_rate": 1.0793091987920444e-08, "loss": 0.0, "num_input_tokens_seen": 17862800, "step": 36290 }, { "epoch": 4.790154414675993, "grad_norm": 0.0016465377993881702, "learning_rate": 1.0725692988808322e-08, "loss": 0.0, "num_input_tokens_seen": 17865168, "step": 36295 }, { "epoch": 4.790814306453742, "grad_norm": 5.0558181101223454e-05, "learning_rate": 1.0658503956004206e-08, "loss": 0.0, "num_input_tokens_seen": 17867664, "step": 36300 }, { "epoch": 4.79147419823149, "grad_norm": 0.002537961583584547, "learning_rate": 1.0591524903768245e-08, "loss": 0.0, "num_input_tokens_seen": 17870160, "step": 36305 }, { "epoch": 4.792134090009238, "grad_norm": 4.224347503622994e-05, "learning_rate": 1.0524755846316402e-08, "loss": 0.0, "num_input_tokens_seen": 17872656, "step": 36310 }, { "epoch": 4.792793981786987, "grad_norm": 0.0007909387350082397, "learning_rate": 1.0458196797820007e-08, "loss": 0.0, "num_input_tokens_seen": 17875280, "step": 36315 }, { "epoch": 4.793453873564736, "grad_norm": 0.2758484482765198, "learning_rate": 1.039184777240565e-08, "loss": 0.0001, "num_input_tokens_seen": 17877776, "step": 36320 }, { "epoch": 4.794113765342484, "grad_norm": 0.0005735823069699109, "learning_rate": 1.0325708784155396e-08, "loss": 0.0, "num_input_tokens_seen": 17879952, "step": 36325 }, { "epoch": 4.794773657120232, "grad_norm": 2.0948098608641885e-05, "learning_rate": 1.0259779847106798e-08, "loss": 0.0, "num_input_tokens_seen": 17882512, "step": 36330 }, { "epoch": 4.7954335488979805, "grad_norm": 2.039031642198097e-05, "learning_rate": 1.0194060975252772e-08, "loss": 0.0, "num_input_tokens_seen": 17885072, "step": 36335 }, { "epoch": 4.79609344067573, "grad_norm": 0.00025202587130479515, "learning_rate": 1.0128552182541606e-08, "loss": 0.0, "num_input_tokens_seen": 17887568, "step": 36340 }, { "epoch": 4.796753332453478, "grad_norm": 0.0006386773893609643, "learning_rate": 1.0063253482877287e-08, "loss": 0.0, "num_input_tokens_seen": 17890192, "step": 36345 }, { "epoch": 4.797413224231226, "grad_norm": 0.0011661931639537215, "learning_rate": 9.998164890118844e-09, "loss": 0.0005, "num_input_tokens_seen": 17892880, "step": 36350 }, { "epoch": 4.7980731160089745, "grad_norm": 0.0006924106855876744, "learning_rate": 9.933286418080778e-09, "loss": 0.0, "num_input_tokens_seen": 17895376, "step": 36355 }, { "epoch": 4.798733007786723, "grad_norm": 8.853508916217834e-05, "learning_rate": 9.868618080533298e-09, "loss": 0.0012, "num_input_tokens_seen": 17897680, "step": 36360 }, { "epoch": 4.799392899564472, "grad_norm": 1.4001774616190232e-05, "learning_rate": 9.804159891201536e-09, "loss": 0.0011, "num_input_tokens_seen": 17900368, "step": 36365 }, { "epoch": 4.80005279134222, "grad_norm": 0.0002682250633370131, "learning_rate": 9.739911863766548e-09, "loss": 0.0, "num_input_tokens_seen": 17902928, "step": 36370 }, { "epoch": 4.8007126831199685, "grad_norm": 0.00022198254009708762, "learning_rate": 9.675874011864205e-09, "loss": 0.0, "num_input_tokens_seen": 17905488, "step": 36375 }, { "epoch": 4.801372574897717, "grad_norm": 2.1480387658812106e-05, "learning_rate": 9.612046349086411e-09, "loss": 0.0, "num_input_tokens_seen": 17907920, "step": 36380 }, { "epoch": 4.802032466675465, "grad_norm": 2.2065241864765994e-05, "learning_rate": 9.548428888979775e-09, "loss": 0.0396, "num_input_tokens_seen": 17910544, "step": 36385 }, { "epoch": 4.802692358453213, "grad_norm": 20.490070343017578, "learning_rate": 9.485021645046941e-09, "loss": 0.0054, "num_input_tokens_seen": 17912976, "step": 36390 }, { "epoch": 4.8033522502309625, "grad_norm": 0.0011220432352274656, "learning_rate": 9.421824630745478e-09, "loss": 0.0, "num_input_tokens_seen": 17915280, "step": 36395 }, { "epoch": 4.804012142008711, "grad_norm": 1.5031295333756134e-05, "learning_rate": 9.358837859488544e-09, "loss": 0.0, "num_input_tokens_seen": 17917648, "step": 36400 }, { "epoch": 4.804672033786459, "grad_norm": 0.019448235630989075, "learning_rate": 9.296061344644667e-09, "loss": 0.0, "num_input_tokens_seen": 17919952, "step": 36405 }, { "epoch": 4.805331925564207, "grad_norm": 0.00023923815751913935, "learning_rate": 9.233495099537525e-09, "loss": 0.0035, "num_input_tokens_seen": 17922512, "step": 36410 }, { "epoch": 4.805991817341956, "grad_norm": 0.00014659958833362907, "learning_rate": 9.171139137446605e-09, "loss": 0.0005, "num_input_tokens_seen": 17924944, "step": 36415 }, { "epoch": 4.806651709119705, "grad_norm": 4.474867455428466e-05, "learning_rate": 9.10899347160632e-09, "loss": 0.0, "num_input_tokens_seen": 17927312, "step": 36420 }, { "epoch": 4.807311600897453, "grad_norm": 1.4712712982145604e-05, "learning_rate": 9.047058115206674e-09, "loss": 0.0, "num_input_tokens_seen": 17929744, "step": 36425 }, { "epoch": 4.807971492675201, "grad_norm": 5.0706272304523736e-05, "learning_rate": 8.985333081393154e-09, "loss": 0.028, "num_input_tokens_seen": 17932112, "step": 36430 }, { "epoch": 4.80863138445295, "grad_norm": 4.384171188576147e-05, "learning_rate": 8.923818383266169e-09, "loss": 0.0, "num_input_tokens_seen": 17934480, "step": 36435 }, { "epoch": 4.809291276230698, "grad_norm": 4.905409150524065e-05, "learning_rate": 8.862514033882051e-09, "loss": 0.0, "num_input_tokens_seen": 17936912, "step": 36440 }, { "epoch": 4.809951168008446, "grad_norm": 5.2517101721605286e-05, "learning_rate": 8.80142004625195e-09, "loss": 0.0, "num_input_tokens_seen": 17939536, "step": 36445 }, { "epoch": 4.810611059786195, "grad_norm": 1.8874017769121565e-05, "learning_rate": 8.740536433342826e-09, "loss": 0.0001, "num_input_tokens_seen": 17941712, "step": 36450 }, { "epoch": 4.811270951563944, "grad_norm": 7.149603334255517e-05, "learning_rate": 8.679863208076787e-09, "loss": 0.0, "num_input_tokens_seen": 17944016, "step": 36455 }, { "epoch": 4.811930843341692, "grad_norm": 9.245514775102492e-06, "learning_rate": 8.619400383331088e-09, "loss": 0.0, "num_input_tokens_seen": 17946320, "step": 36460 }, { "epoch": 4.81259073511944, "grad_norm": 4.676095340983011e-05, "learning_rate": 8.559147971938574e-09, "loss": 0.0, "num_input_tokens_seen": 17948752, "step": 36465 }, { "epoch": 4.813250626897188, "grad_norm": 4.1727682400960475e-05, "learning_rate": 8.499105986687572e-09, "loss": 0.0001, "num_input_tokens_seen": 17951376, "step": 36470 }, { "epoch": 4.813910518674938, "grad_norm": 4.7045174142112955e-05, "learning_rate": 8.439274440321442e-09, "loss": 0.0, "num_input_tokens_seen": 17953616, "step": 36475 }, { "epoch": 4.814570410452686, "grad_norm": 0.01597621850669384, "learning_rate": 8.379653345538918e-09, "loss": 0.0, "num_input_tokens_seen": 17955792, "step": 36480 }, { "epoch": 4.815230302230434, "grad_norm": 0.0008904458954930305, "learning_rate": 8.320242714994319e-09, "loss": 0.0, "num_input_tokens_seen": 17958288, "step": 36485 }, { "epoch": 4.815890194008182, "grad_norm": 0.15588656067848206, "learning_rate": 8.261042561297004e-09, "loss": 0.0001, "num_input_tokens_seen": 17961104, "step": 36490 }, { "epoch": 4.816550085785931, "grad_norm": 0.00011970204650424421, "learning_rate": 8.202052897011702e-09, "loss": 0.0, "num_input_tokens_seen": 17963600, "step": 36495 }, { "epoch": 4.81720997756368, "grad_norm": 6.248672434594482e-05, "learning_rate": 8.143273734658729e-09, "loss": 0.0, "num_input_tokens_seen": 17966096, "step": 36500 }, { "epoch": 4.817869869341428, "grad_norm": 2.0748017050209455e-05, "learning_rate": 8.084705086713439e-09, "loss": 0.0, "num_input_tokens_seen": 17968592, "step": 36505 }, { "epoch": 4.818529761119176, "grad_norm": 14.28984546661377, "learning_rate": 8.026346965606556e-09, "loss": 0.0412, "num_input_tokens_seen": 17970832, "step": 36510 }, { "epoch": 4.819189652896925, "grad_norm": 1.3816493265039753e-05, "learning_rate": 7.968199383724283e-09, "loss": 0.0, "num_input_tokens_seen": 17973136, "step": 36515 }, { "epoch": 4.819849544674673, "grad_norm": 0.040203265845775604, "learning_rate": 7.91026235340786e-09, "loss": 0.0, "num_input_tokens_seen": 17975632, "step": 36520 }, { "epoch": 4.820509436452422, "grad_norm": 0.0003583792713470757, "learning_rate": 7.852535886954225e-09, "loss": 0.0, "num_input_tokens_seen": 17978128, "step": 36525 }, { "epoch": 4.82116932823017, "grad_norm": 8.75997357070446e-05, "learning_rate": 7.795019996615249e-09, "loss": 0.0, "num_input_tokens_seen": 17980752, "step": 36530 }, { "epoch": 4.821829220007919, "grad_norm": 1.4387996088771615e-05, "learning_rate": 7.737714694598274e-09, "loss": 0.0, "num_input_tokens_seen": 17983504, "step": 36535 }, { "epoch": 4.822489111785667, "grad_norm": 1.6647998563712463e-05, "learning_rate": 7.680619993065906e-09, "loss": 0.0, "num_input_tokens_seen": 17985872, "step": 36540 }, { "epoch": 4.823149003563415, "grad_norm": 2.520790440030396e-05, "learning_rate": 7.62373590413623e-09, "loss": 0.0, "num_input_tokens_seen": 17988560, "step": 36545 }, { "epoch": 4.823808895341164, "grad_norm": 0.43590307235717773, "learning_rate": 7.567062439882254e-09, "loss": 0.0001, "num_input_tokens_seen": 17990928, "step": 36550 }, { "epoch": 4.824468787118913, "grad_norm": 0.0005357779446057975, "learning_rate": 7.510599612332801e-09, "loss": 0.0025, "num_input_tokens_seen": 17993296, "step": 36555 }, { "epoch": 4.825128678896661, "grad_norm": 2.613514516269788e-05, "learning_rate": 7.454347433471397e-09, "loss": 0.0, "num_input_tokens_seen": 17995792, "step": 36560 }, { "epoch": 4.825788570674409, "grad_norm": 6.79643708281219e-05, "learning_rate": 7.398305915237379e-09, "loss": 0.028, "num_input_tokens_seen": 17998096, "step": 36565 }, { "epoch": 4.8264484624521575, "grad_norm": 0.011779813095927238, "learning_rate": 7.342475069525012e-09, "loss": 0.0, "num_input_tokens_seen": 18000528, "step": 36570 }, { "epoch": 4.827108354229907, "grad_norm": 0.0013433409621939063, "learning_rate": 7.2868549081841476e-09, "loss": 0.0, "num_input_tokens_seen": 18003024, "step": 36575 }, { "epoch": 4.827768246007655, "grad_norm": 3.9421811379725114e-05, "learning_rate": 7.2314454430195685e-09, "loss": 0.0, "num_input_tokens_seen": 18005712, "step": 36580 }, { "epoch": 4.828428137785403, "grad_norm": 54.147377014160156, "learning_rate": 7.176246685791754e-09, "loss": 0.0915, "num_input_tokens_seen": 18008144, "step": 36585 }, { "epoch": 4.8290880295631515, "grad_norm": 0.00025381697923876345, "learning_rate": 7.121258648216e-09, "loss": 0.0239, "num_input_tokens_seen": 18010256, "step": 36590 }, { "epoch": 4.8297479213409, "grad_norm": 0.00021304019901435822, "learning_rate": 7.066481341963304e-09, "loss": 0.0, "num_input_tokens_seen": 18012752, "step": 36595 }, { "epoch": 4.830407813118649, "grad_norm": 1.7139234842034057e-05, "learning_rate": 7.0119147786597e-09, "loss": 0.0, "num_input_tokens_seen": 18014992, "step": 36600 }, { "epoch": 4.831067704896397, "grad_norm": 1.4912050573911984e-05, "learning_rate": 6.957558969886368e-09, "loss": 0.0004, "num_input_tokens_seen": 18017552, "step": 36605 }, { "epoch": 4.8317275966741455, "grad_norm": 6.469316576840356e-05, "learning_rate": 6.9034139271803015e-09, "loss": 0.0, "num_input_tokens_seen": 18020048, "step": 36610 }, { "epoch": 4.832387488451894, "grad_norm": 0.2807008922100067, "learning_rate": 6.849479662033086e-09, "loss": 0.0002, "num_input_tokens_seen": 18022480, "step": 36615 }, { "epoch": 4.833047380229642, "grad_norm": 0.0032580739352852106, "learning_rate": 6.795756185891899e-09, "loss": 0.0487, "num_input_tokens_seen": 18024848, "step": 36620 }, { "epoch": 4.833707272007391, "grad_norm": 0.00011517686652950943, "learning_rate": 6.742243510159396e-09, "loss": 0.0, "num_input_tokens_seen": 18027152, "step": 36625 }, { "epoch": 4.8343671637851395, "grad_norm": 2.705694168980699e-05, "learning_rate": 6.688941646193047e-09, "loss": 0.0, "num_input_tokens_seen": 18029584, "step": 36630 }, { "epoch": 4.835027055562888, "grad_norm": 0.013773814775049686, "learning_rate": 6.635850605305804e-09, "loss": 0.0, "num_input_tokens_seen": 18031952, "step": 36635 }, { "epoch": 4.835686947340636, "grad_norm": 0.022058840841054916, "learning_rate": 6.582970398765986e-09, "loss": 0.0, "num_input_tokens_seen": 18034512, "step": 36640 }, { "epoch": 4.836346839118384, "grad_norm": 4.3627336708595976e-05, "learning_rate": 6.530301037796837e-09, "loss": 0.0, "num_input_tokens_seen": 18037200, "step": 36645 }, { "epoch": 4.8370067308961335, "grad_norm": 5.2935080020688474e-05, "learning_rate": 6.477842533577194e-09, "loss": 0.0, "num_input_tokens_seen": 18039952, "step": 36650 }, { "epoch": 4.837666622673882, "grad_norm": 10.672889709472656, "learning_rate": 6.4255948972409265e-09, "loss": 0.0337, "num_input_tokens_seen": 18042320, "step": 36655 }, { "epoch": 4.83832651445163, "grad_norm": 0.00011545175220817327, "learning_rate": 6.3735581398772775e-09, "loss": 0.0, "num_input_tokens_seen": 18044752, "step": 36660 }, { "epoch": 4.838986406229378, "grad_norm": 6.284004484768957e-05, "learning_rate": 6.321732272530633e-09, "loss": 0.0, "num_input_tokens_seen": 18047440, "step": 36665 }, { "epoch": 4.839646298007127, "grad_norm": 0.00020805255917366594, "learning_rate": 6.2701173062006396e-09, "loss": 0.0502, "num_input_tokens_seen": 18049680, "step": 36670 }, { "epoch": 4.840306189784875, "grad_norm": 0.0001228711480507627, "learning_rate": 6.2187132518422004e-09, "loss": 0.0381, "num_input_tokens_seen": 18052176, "step": 36675 }, { "epoch": 4.840966081562624, "grad_norm": 0.0029427942354232073, "learning_rate": 6.167520120365477e-09, "loss": 0.0, "num_input_tokens_seen": 18054992, "step": 36680 }, { "epoch": 4.841625973340372, "grad_norm": 0.12953588366508484, "learning_rate": 6.1165379226358895e-09, "loss": 0.0001, "num_input_tokens_seen": 18057488, "step": 36685 }, { "epoch": 4.842285865118121, "grad_norm": 4.0880309825297445e-05, "learning_rate": 6.065766669474004e-09, "loss": 0.0, "num_input_tokens_seen": 18059984, "step": 36690 }, { "epoch": 4.842945756895869, "grad_norm": 2.0065477656316943e-05, "learning_rate": 6.015206371655535e-09, "loss": 0.0, "num_input_tokens_seen": 18062352, "step": 36695 }, { "epoch": 4.843605648673617, "grad_norm": 6.099815436755307e-05, "learning_rate": 5.964857039911786e-09, "loss": 0.0, "num_input_tokens_seen": 18064464, "step": 36700 }, { "epoch": 4.8442655404513655, "grad_norm": 0.9859268665313721, "learning_rate": 5.914718684928766e-09, "loss": 0.001, "num_input_tokens_seen": 18066896, "step": 36705 }, { "epoch": 4.844925432229115, "grad_norm": 3.101829861407168e-05, "learning_rate": 5.864791317348183e-09, "loss": 0.0009, "num_input_tokens_seen": 18069328, "step": 36710 }, { "epoch": 4.845585324006863, "grad_norm": 1.0897661923081614e-05, "learning_rate": 5.815074947766674e-09, "loss": 0.0, "num_input_tokens_seen": 18071888, "step": 36715 }, { "epoch": 4.846245215784611, "grad_norm": 0.0002581208827905357, "learning_rate": 5.76556958673613e-09, "loss": 0.0, "num_input_tokens_seen": 18074448, "step": 36720 }, { "epoch": 4.8469051075623595, "grad_norm": 2.9608701879624277e-05, "learning_rate": 5.716275244763813e-09, "loss": 0.0, "num_input_tokens_seen": 18077008, "step": 36725 }, { "epoch": 4.847564999340108, "grad_norm": 0.0004536760679911822, "learning_rate": 5.667191932312021e-09, "loss": 0.0, "num_input_tokens_seen": 18079952, "step": 36730 }, { "epoch": 4.848224891117857, "grad_norm": 2.2530020942213014e-05, "learning_rate": 5.61831965979831e-09, "loss": 0.0, "num_input_tokens_seen": 18082384, "step": 36735 }, { "epoch": 4.848884782895605, "grad_norm": 0.0021661531645804644, "learning_rate": 5.5696584375956036e-09, "loss": 0.0, "num_input_tokens_seen": 18084816, "step": 36740 }, { "epoch": 4.8495446746733535, "grad_norm": 0.00015442879521287978, "learning_rate": 5.5212082760316415e-09, "loss": 0.0, "num_input_tokens_seen": 18087120, "step": 36745 }, { "epoch": 4.850204566451102, "grad_norm": 1.039473954733694e-05, "learning_rate": 5.472969185389975e-09, "loss": 0.0, "num_input_tokens_seen": 18089744, "step": 36750 }, { "epoch": 4.85086445822885, "grad_norm": 0.0004750068474095315, "learning_rate": 5.424941175908637e-09, "loss": 0.0, "num_input_tokens_seen": 18092368, "step": 36755 }, { "epoch": 4.851524350006599, "grad_norm": 0.000852764758747071, "learning_rate": 5.377124257781473e-09, "loss": 0.0, "num_input_tokens_seen": 18094928, "step": 36760 }, { "epoch": 4.8521842417843475, "grad_norm": 3.803055369644426e-05, "learning_rate": 5.329518441157144e-09, "loss": 0.0, "num_input_tokens_seen": 18097040, "step": 36765 }, { "epoch": 4.852844133562096, "grad_norm": 0.005006958730518818, "learning_rate": 5.282123736139677e-09, "loss": 0.0003, "num_input_tokens_seen": 18099280, "step": 36770 }, { "epoch": 4.853504025339844, "grad_norm": 1.367265394947026e-05, "learning_rate": 5.234940152788358e-09, "loss": 0.0, "num_input_tokens_seen": 18101520, "step": 36775 }, { "epoch": 4.854163917117592, "grad_norm": 0.00015550132957287133, "learning_rate": 5.187967701117401e-09, "loss": 0.0, "num_input_tokens_seen": 18103632, "step": 36780 }, { "epoch": 4.8548238088953415, "grad_norm": 2.042776941379998e-05, "learning_rate": 5.141206391096387e-09, "loss": 0.0, "num_input_tokens_seen": 18106000, "step": 36785 }, { "epoch": 4.85548370067309, "grad_norm": 0.0007538821664638817, "learning_rate": 5.094656232650263e-09, "loss": 0.0066, "num_input_tokens_seen": 18108624, "step": 36790 }, { "epoch": 4.856143592450838, "grad_norm": 9.818230319069698e-05, "learning_rate": 5.0483172356586835e-09, "loss": 0.0, "num_input_tokens_seen": 18111120, "step": 36795 }, { "epoch": 4.856803484228586, "grad_norm": 0.00048822807730175555, "learning_rate": 5.002189409956892e-09, "loss": 0.0, "num_input_tokens_seen": 18113424, "step": 36800 }, { "epoch": 4.857463376006335, "grad_norm": 0.013466020114719868, "learning_rate": 4.956272765335278e-09, "loss": 0.0, "num_input_tokens_seen": 18115792, "step": 36805 }, { "epoch": 4.858123267784084, "grad_norm": 4.1604354919400066e-05, "learning_rate": 4.91056731153916e-09, "loss": 0.0039, "num_input_tokens_seen": 18118416, "step": 36810 }, { "epoch": 4.858783159561832, "grad_norm": 0.00016768813657108694, "learning_rate": 4.865073058269331e-09, "loss": 0.0, "num_input_tokens_seen": 18120464, "step": 36815 }, { "epoch": 4.85944305133958, "grad_norm": 2.7609599783318117e-05, "learning_rate": 4.819790015181513e-09, "loss": 0.0, "num_input_tokens_seen": 18122960, "step": 36820 }, { "epoch": 4.860102943117329, "grad_norm": 2.8394108085194603e-05, "learning_rate": 4.774718191886684e-09, "loss": 0.0657, "num_input_tokens_seen": 18125520, "step": 36825 }, { "epoch": 4.860762834895077, "grad_norm": 5.970145866740495e-05, "learning_rate": 4.729857597951081e-09, "loss": 0.0, "num_input_tokens_seen": 18128080, "step": 36830 }, { "epoch": 4.861422726672826, "grad_norm": 0.001574253081344068, "learning_rate": 4.685208242896088e-09, "loss": 0.0, "num_input_tokens_seen": 18130064, "step": 36835 }, { "epoch": 4.862082618450574, "grad_norm": 2.4492735974490643e-05, "learning_rate": 4.6407701361981246e-09, "loss": 0.0, "num_input_tokens_seen": 18132688, "step": 36840 }, { "epoch": 4.862742510228323, "grad_norm": 7.663365977350622e-05, "learning_rate": 4.5965432872888675e-09, "loss": 0.001, "num_input_tokens_seen": 18135248, "step": 36845 }, { "epoch": 4.863402402006071, "grad_norm": 1.9701075871125795e-05, "learning_rate": 4.552527705555032e-09, "loss": 0.0, "num_input_tokens_seen": 18137808, "step": 36850 }, { "epoch": 4.864062293783819, "grad_norm": 4.3438776629045606e-05, "learning_rate": 4.5087234003388094e-09, "loss": 0.001, "num_input_tokens_seen": 18139984, "step": 36855 }, { "epoch": 4.864722185561568, "grad_norm": 0.00038492324529215693, "learning_rate": 4.465130380937321e-09, "loss": 0.0, "num_input_tokens_seen": 18142288, "step": 36860 }, { "epoch": 4.865382077339317, "grad_norm": 2.848833719326649e-05, "learning_rate": 4.42174865660283e-09, "loss": 0.0, "num_input_tokens_seen": 18144912, "step": 36865 }, { "epoch": 4.866041969117065, "grad_norm": 6.18175690760836e-05, "learning_rate": 4.37857823654264e-09, "loss": 0.0, "num_input_tokens_seen": 18147664, "step": 36870 }, { "epoch": 4.866701860894813, "grad_norm": 13.880156517028809, "learning_rate": 4.335619129919643e-09, "loss": 0.0061, "num_input_tokens_seen": 18150160, "step": 36875 }, { "epoch": 4.867361752672561, "grad_norm": 0.6847956776618958, "learning_rate": 4.292871345851323e-09, "loss": 0.0001, "num_input_tokens_seen": 18152656, "step": 36880 }, { "epoch": 4.868021644450311, "grad_norm": 2.3505672288592905e-05, "learning_rate": 4.250334893410867e-09, "loss": 0.0, "num_input_tokens_seen": 18155024, "step": 36885 }, { "epoch": 4.868681536228059, "grad_norm": 0.00011453049228293821, "learning_rate": 4.208009781626054e-09, "loss": 0.0, "num_input_tokens_seen": 18157584, "step": 36890 }, { "epoch": 4.869341428005807, "grad_norm": 0.0018516669515520334, "learning_rate": 4.165896019480253e-09, "loss": 0.0133, "num_input_tokens_seen": 18160080, "step": 36895 }, { "epoch": 4.870001319783555, "grad_norm": 0.0003991833655163646, "learning_rate": 4.123993615911759e-09, "loss": 0.0, "num_input_tokens_seen": 18162384, "step": 36900 }, { "epoch": 4.870661211561304, "grad_norm": 0.0008322819485329092, "learning_rate": 4.082302579814012e-09, "loss": 0.0, "num_input_tokens_seen": 18165008, "step": 36905 }, { "epoch": 4.871321103339053, "grad_norm": 0.00972724612802267, "learning_rate": 4.040822920035713e-09, "loss": 0.0054, "num_input_tokens_seen": 18167248, "step": 36910 }, { "epoch": 4.871980995116801, "grad_norm": 0.001637786510400474, "learning_rate": 3.999554645380487e-09, "loss": 0.0, "num_input_tokens_seen": 18169680, "step": 36915 }, { "epoch": 4.872640886894549, "grad_norm": 0.0003935332060791552, "learning_rate": 3.958497764607438e-09, "loss": 0.0, "num_input_tokens_seen": 18171920, "step": 36920 }, { "epoch": 4.873300778672298, "grad_norm": 0.0012009014608338475, "learning_rate": 3.917652286430484e-09, "loss": 0.0, "num_input_tokens_seen": 18174352, "step": 36925 }, { "epoch": 4.873960670450046, "grad_norm": 0.15180867910385132, "learning_rate": 3.87701821951869e-09, "loss": 0.0549, "num_input_tokens_seen": 18176848, "step": 36930 }, { "epoch": 4.874620562227794, "grad_norm": 0.00013223606219980866, "learning_rate": 3.836595572496493e-09, "loss": 0.0, "num_input_tokens_seen": 18179280, "step": 36935 }, { "epoch": 4.875280454005543, "grad_norm": 3.8269990909611806e-05, "learning_rate": 3.796384353943138e-09, "loss": 0.0, "num_input_tokens_seen": 18181776, "step": 36940 }, { "epoch": 4.875940345783292, "grad_norm": 0.12301933020353317, "learning_rate": 3.756384572393357e-09, "loss": 0.0001, "num_input_tokens_seen": 18184528, "step": 36945 }, { "epoch": 4.87660023756104, "grad_norm": 0.005153327248990536, "learning_rate": 3.7165962363366888e-09, "loss": 0.0, "num_input_tokens_seen": 18187408, "step": 36950 }, { "epoch": 4.877260129338788, "grad_norm": 5.5060783779481426e-05, "learning_rate": 3.677019354217936e-09, "loss": 0.0, "num_input_tokens_seen": 18189968, "step": 36955 }, { "epoch": 4.8779200211165366, "grad_norm": 2.651229988259729e-05, "learning_rate": 3.637653934437046e-09, "loss": 0.0, "num_input_tokens_seen": 18192400, "step": 36960 }, { "epoch": 4.878579912894286, "grad_norm": 1.6487107131979428e-05, "learning_rate": 3.5984999853490017e-09, "loss": 0.0, "num_input_tokens_seen": 18195024, "step": 36965 }, { "epoch": 4.879239804672034, "grad_norm": 0.002100097481161356, "learning_rate": 3.5595575152639333e-09, "loss": 0.0, "num_input_tokens_seen": 18197456, "step": 36970 }, { "epoch": 4.879899696449782, "grad_norm": 0.001038621529005468, "learning_rate": 3.5208265324472297e-09, "loss": 0.0, "num_input_tokens_seen": 18200272, "step": 36975 }, { "epoch": 4.8805595882275306, "grad_norm": 0.00012049706128891557, "learning_rate": 3.4823070451190926e-09, "loss": 0.0, "num_input_tokens_seen": 18202960, "step": 36980 }, { "epoch": 4.881219480005279, "grad_norm": 0.0003379734989721328, "learning_rate": 3.443999061455094e-09, "loss": 0.0, "num_input_tokens_seen": 18205136, "step": 36985 }, { "epoch": 4.881879371783027, "grad_norm": 0.008252648636698723, "learning_rate": 3.4059025895857295e-09, "loss": 0.0, "num_input_tokens_seen": 18207376, "step": 36990 }, { "epoch": 4.882539263560776, "grad_norm": 4.569829616229981e-05, "learning_rate": 3.368017637596865e-09, "loss": 0.0, "num_input_tokens_seen": 18210000, "step": 36995 }, { "epoch": 4.8831991553385246, "grad_norm": 0.0096468236297369, "learning_rate": 3.330344213529179e-09, "loss": 0.0, "num_input_tokens_seen": 18212560, "step": 37000 }, { "epoch": 4.883859047116273, "grad_norm": 2.2760867068427615e-05, "learning_rate": 3.29288232537861e-09, "loss": 0.0, "num_input_tokens_seen": 18214800, "step": 37005 }, { "epoch": 4.884518938894021, "grad_norm": 0.0077208224684000015, "learning_rate": 3.2556319810961297e-09, "loss": 0.0, "num_input_tokens_seen": 18217168, "step": 37010 }, { "epoch": 4.885178830671769, "grad_norm": 0.00016451945703011006, "learning_rate": 3.21859318858797e-09, "loss": 0.0, "num_input_tokens_seen": 18219920, "step": 37015 }, { "epoch": 4.885838722449519, "grad_norm": 0.00011153052764711902, "learning_rate": 3.1817659557152876e-09, "loss": 0.0, "num_input_tokens_seen": 18222352, "step": 37020 }, { "epoch": 4.886498614227267, "grad_norm": 1.4121486856311094e-05, "learning_rate": 3.1451502902943848e-09, "loss": 0.0, "num_input_tokens_seen": 18224848, "step": 37025 }, { "epoch": 4.887158506005015, "grad_norm": 108.1161880493164, "learning_rate": 3.1087462000967124e-09, "loss": 0.0782, "num_input_tokens_seen": 18227280, "step": 37030 }, { "epoch": 4.887818397782763, "grad_norm": 0.00013722782023251057, "learning_rate": 3.0725536928486452e-09, "loss": 0.001, "num_input_tokens_seen": 18229712, "step": 37035 }, { "epoch": 4.888478289560512, "grad_norm": 0.00022701297712046653, "learning_rate": 3.036572776231927e-09, "loss": 0.0, "num_input_tokens_seen": 18232400, "step": 37040 }, { "epoch": 4.889138181338261, "grad_norm": 0.00024063313321676105, "learning_rate": 3.0008034578832274e-09, "loss": 0.0213, "num_input_tokens_seen": 18234896, "step": 37045 }, { "epoch": 4.889798073116009, "grad_norm": 0.0005864155828021467, "learning_rate": 2.9652457453942515e-09, "loss": 0.0, "num_input_tokens_seen": 18237392, "step": 37050 }, { "epoch": 4.890457964893757, "grad_norm": 1.8028958947979845e-05, "learning_rate": 2.9298996463119618e-09, "loss": 0.0, "num_input_tokens_seen": 18240080, "step": 37055 }, { "epoch": 4.891117856671506, "grad_norm": 1.7613796444493346e-05, "learning_rate": 2.894765168138247e-09, "loss": 0.0, "num_input_tokens_seen": 18242448, "step": 37060 }, { "epoch": 4.891777748449254, "grad_norm": 3.3598429581616074e-05, "learning_rate": 2.85984231833003e-09, "loss": 0.0164, "num_input_tokens_seen": 18244880, "step": 37065 }, { "epoch": 4.892437640227003, "grad_norm": 19.759035110473633, "learning_rate": 2.825131104299716e-09, "loss": 0.0466, "num_input_tokens_seen": 18247248, "step": 37070 }, { "epoch": 4.893097532004751, "grad_norm": 0.006311311852186918, "learning_rate": 2.7906315334143004e-09, "loss": 0.0, "num_input_tokens_seen": 18249680, "step": 37075 }, { "epoch": 4.8937574237825, "grad_norm": 33.24702835083008, "learning_rate": 2.756343612996148e-09, "loss": 0.0072, "num_input_tokens_seen": 18251920, "step": 37080 }, { "epoch": 4.894417315560248, "grad_norm": 3.692014797707088e-05, "learning_rate": 2.722267350322549e-09, "loss": 0.0, "num_input_tokens_seen": 18254608, "step": 37085 }, { "epoch": 4.895077207337996, "grad_norm": 1.450990566809196e-05, "learning_rate": 2.6884027526259403e-09, "loss": 0.0, "num_input_tokens_seen": 18257040, "step": 37090 }, { "epoch": 4.895737099115745, "grad_norm": 2.8280426704441197e-05, "learning_rate": 2.654749827093905e-09, "loss": 0.0, "num_input_tokens_seen": 18259472, "step": 37095 }, { "epoch": 4.896396990893494, "grad_norm": 0.0005788062699139118, "learning_rate": 2.6213085808691747e-09, "loss": 0.0352, "num_input_tokens_seen": 18261840, "step": 37100 }, { "epoch": 4.897056882671242, "grad_norm": 2.147852319467347e-05, "learning_rate": 2.588079021049072e-09, "loss": 0.0, "num_input_tokens_seen": 18264464, "step": 37105 }, { "epoch": 4.89771677444899, "grad_norm": 0.00012815039372071624, "learning_rate": 2.5550611546866217e-09, "loss": 0.0, "num_input_tokens_seen": 18266832, "step": 37110 }, { "epoch": 4.8983766662267385, "grad_norm": 4.9210022552870214e-05, "learning_rate": 2.5222549887893295e-09, "loss": 0.0, "num_input_tokens_seen": 18269264, "step": 37115 }, { "epoch": 4.899036558004488, "grad_norm": 3.040397677978035e-05, "learning_rate": 2.4896605303204034e-09, "loss": 0.0, "num_input_tokens_seen": 18271760, "step": 37120 }, { "epoch": 4.899696449782236, "grad_norm": 1.7230806406587362e-05, "learning_rate": 2.4572777861976425e-09, "loss": 0.0, "num_input_tokens_seen": 18274000, "step": 37125 }, { "epoch": 4.900356341559984, "grad_norm": 0.00017458596266806126, "learning_rate": 2.425106763293994e-09, "loss": 0.0, "num_input_tokens_seen": 18276368, "step": 37130 }, { "epoch": 4.9010162333377325, "grad_norm": 0.021160904318094254, "learning_rate": 2.393147468437551e-09, "loss": 0.0, "num_input_tokens_seen": 18278800, "step": 37135 }, { "epoch": 4.901676125115481, "grad_norm": 0.0006504508783109486, "learning_rate": 2.3613999084114434e-09, "loss": 0.031, "num_input_tokens_seen": 18281232, "step": 37140 }, { "epoch": 4.90233601689323, "grad_norm": 2.090338966809213e-05, "learning_rate": 2.329864089953837e-09, "loss": 0.0, "num_input_tokens_seen": 18283536, "step": 37145 }, { "epoch": 4.902995908670978, "grad_norm": 4.0948219975689426e-05, "learning_rate": 2.298540019758155e-09, "loss": 0.0352, "num_input_tokens_seen": 18285904, "step": 37150 }, { "epoch": 4.9036558004487265, "grad_norm": 1.6493268049089238e-05, "learning_rate": 2.2674277044724134e-09, "loss": 0.0, "num_input_tokens_seen": 18288336, "step": 37155 }, { "epoch": 4.904315692226475, "grad_norm": 0.17416909337043762, "learning_rate": 2.236527150700218e-09, "loss": 0.0001, "num_input_tokens_seen": 18290832, "step": 37160 }, { "epoch": 4.904975584004223, "grad_norm": 0.000696924515068531, "learning_rate": 2.205838364999879e-09, "loss": 0.0, "num_input_tokens_seen": 18293264, "step": 37165 }, { "epoch": 4.905635475781972, "grad_norm": 2.5053914214367978e-05, "learning_rate": 2.1753613538849636e-09, "loss": 0.0, "num_input_tokens_seen": 18295632, "step": 37170 }, { "epoch": 4.9062953675597205, "grad_norm": 0.068586066365242, "learning_rate": 2.145096123823853e-09, "loss": 0.0007, "num_input_tokens_seen": 18298256, "step": 37175 }, { "epoch": 4.906955259337469, "grad_norm": 5.68434115848504e-05, "learning_rate": 2.1150426812401866e-09, "loss": 0.0, "num_input_tokens_seen": 18300304, "step": 37180 }, { "epoch": 4.907615151115217, "grad_norm": 1.4777840988244861e-05, "learning_rate": 2.0852010325125293e-09, "loss": 0.0006, "num_input_tokens_seen": 18302992, "step": 37185 }, { "epoch": 4.908275042892965, "grad_norm": 0.0001635983062442392, "learning_rate": 2.0555711839747026e-09, "loss": 0.0, "num_input_tokens_seen": 18305424, "step": 37190 }, { "epoch": 4.908934934670714, "grad_norm": 1.687292751739733e-05, "learning_rate": 2.0261531419153433e-09, "loss": 0.0, "num_input_tokens_seen": 18307664, "step": 37195 }, { "epoch": 4.909594826448463, "grad_norm": 0.9634570479393005, "learning_rate": 1.9969469125782346e-09, "loss": 0.0004, "num_input_tokens_seen": 18310288, "step": 37200 }, { "epoch": 4.910254718226211, "grad_norm": 2.745349775068462e-05, "learning_rate": 1.9679525021621955e-09, "loss": 0.0, "num_input_tokens_seen": 18312720, "step": 37205 }, { "epoch": 4.910914610003959, "grad_norm": 1.8449822164257057e-05, "learning_rate": 1.939169916820971e-09, "loss": 0.0, "num_input_tokens_seen": 18315536, "step": 37210 }, { "epoch": 4.911574501781708, "grad_norm": 0.00042026498704217374, "learning_rate": 1.910599162663673e-09, "loss": 0.0, "num_input_tokens_seen": 18317968, "step": 37215 }, { "epoch": 4.912234393559456, "grad_norm": 0.01246301457285881, "learning_rate": 1.8822402457540075e-09, "loss": 0.0001, "num_input_tokens_seen": 18320528, "step": 37220 }, { "epoch": 4.912894285337205, "grad_norm": 1.6955054888967425e-05, "learning_rate": 1.8540931721110487e-09, "loss": 0.0, "num_input_tokens_seen": 18322960, "step": 37225 }, { "epoch": 4.913554177114953, "grad_norm": 2.438508454360999e-05, "learning_rate": 1.8261579477087951e-09, "loss": 0.0001, "num_input_tokens_seen": 18325584, "step": 37230 }, { "epoch": 4.914214068892702, "grad_norm": 2.5170236767735332e-05, "learning_rate": 1.7984345784763932e-09, "loss": 0.0, "num_input_tokens_seen": 18328016, "step": 37235 }, { "epoch": 4.91487396067045, "grad_norm": 2.092982322210446e-05, "learning_rate": 1.770923070297803e-09, "loss": 0.0, "num_input_tokens_seen": 18330256, "step": 37240 }, { "epoch": 4.915533852448198, "grad_norm": 0.019744206219911575, "learning_rate": 1.743623429012131e-09, "loss": 0.0533, "num_input_tokens_seen": 18333072, "step": 37245 }, { "epoch": 4.9161937442259465, "grad_norm": 7.678641122765839e-05, "learning_rate": 1.7165356604136317e-09, "loss": 0.0, "num_input_tokens_seen": 18335440, "step": 37250 }, { "epoch": 4.916853636003696, "grad_norm": 4.612092743627727e-05, "learning_rate": 1.6896597702514837e-09, "loss": 0.0, "num_input_tokens_seen": 18337680, "step": 37255 }, { "epoch": 4.917513527781444, "grad_norm": 5.102691173553467, "learning_rate": 1.6629957642297908e-09, "loss": 0.0028, "num_input_tokens_seen": 18340496, "step": 37260 }, { "epoch": 4.918173419559192, "grad_norm": 0.002844721544533968, "learning_rate": 1.6365436480079153e-09, "loss": 0.0, "num_input_tokens_seen": 18342928, "step": 37265 }, { "epoch": 4.9188333113369405, "grad_norm": 7.894792361184955e-05, "learning_rate": 1.6103034272000326e-09, "loss": 0.0, "num_input_tokens_seen": 18345488, "step": 37270 }, { "epoch": 4.919493203114689, "grad_norm": 2.4739310902077705e-05, "learning_rate": 1.5842751073753546e-09, "loss": 0.0, "num_input_tokens_seen": 18348112, "step": 37275 }, { "epoch": 4.920153094892438, "grad_norm": 6.59223078400828e-05, "learning_rate": 1.5584586940584622e-09, "loss": 0.0, "num_input_tokens_seen": 18350416, "step": 37280 }, { "epoch": 4.920812986670186, "grad_norm": 0.00019666607840918005, "learning_rate": 1.5328541927286387e-09, "loss": 0.028, "num_input_tokens_seen": 18352912, "step": 37285 }, { "epoch": 4.9214728784479345, "grad_norm": 5.4916919907554984e-05, "learning_rate": 1.507461608819982e-09, "loss": 0.0, "num_input_tokens_seen": 18355536, "step": 37290 }, { "epoch": 4.922132770225683, "grad_norm": 6.035666592651978e-05, "learning_rate": 1.4822809477222919e-09, "loss": 0.0, "num_input_tokens_seen": 18357840, "step": 37295 }, { "epoch": 4.922792662003431, "grad_norm": 3.3862557756947353e-05, "learning_rate": 1.457312214779627e-09, "loss": 0.0004, "num_input_tokens_seen": 18360336, "step": 37300 }, { "epoch": 4.92345255378118, "grad_norm": 0.0001764145854394883, "learning_rate": 1.4325554152916364e-09, "loss": 0.0, "num_input_tokens_seen": 18363216, "step": 37305 }, { "epoch": 4.9241124455589285, "grad_norm": 1.4086094779486302e-05, "learning_rate": 1.408010554512673e-09, "loss": 0.0, "num_input_tokens_seen": 18365968, "step": 37310 }, { "epoch": 4.924772337336677, "grad_norm": 2.1404019207693636e-05, "learning_rate": 1.3836776376522364e-09, "loss": 0.0, "num_input_tokens_seen": 18368400, "step": 37315 }, { "epoch": 4.925432229114425, "grad_norm": 1.2312765647948254e-05, "learning_rate": 1.3595566698748617e-09, "loss": 0.0, "num_input_tokens_seen": 18370704, "step": 37320 }, { "epoch": 4.926092120892173, "grad_norm": 5.9814472479047254e-05, "learning_rate": 1.3356476562998986e-09, "loss": 0.0, "num_input_tokens_seen": 18373264, "step": 37325 }, { "epoch": 4.9267520126699225, "grad_norm": 0.13493262231349945, "learning_rate": 1.3119506020020653e-09, "loss": 0.0001, "num_input_tokens_seen": 18375632, "step": 37330 }, { "epoch": 4.927411904447671, "grad_norm": 0.0025102654471993446, "learning_rate": 1.2884655120107835e-09, "loss": 0.0226, "num_input_tokens_seen": 18378192, "step": 37335 }, { "epoch": 4.928071796225419, "grad_norm": 2.4892946385079995e-05, "learning_rate": 1.26519239131051e-09, "loss": 0.0, "num_input_tokens_seen": 18380560, "step": 37340 }, { "epoch": 4.928731688003167, "grad_norm": 0.0018648894038051367, "learning_rate": 1.2421312448408494e-09, "loss": 0.0, "num_input_tokens_seen": 18383248, "step": 37345 }, { "epoch": 4.929391579780916, "grad_norm": 1.4500590562820435, "learning_rate": 1.2192820774965529e-09, "loss": 0.0014, "num_input_tokens_seen": 18385616, "step": 37350 }, { "epoch": 4.930051471558665, "grad_norm": 0.009046858176589012, "learning_rate": 1.1966448941269635e-09, "loss": 0.0, "num_input_tokens_seen": 18387728, "step": 37355 }, { "epoch": 4.930711363336413, "grad_norm": 5.444921043817885e-05, "learning_rate": 1.1742196995366827e-09, "loss": 0.0, "num_input_tokens_seen": 18390096, "step": 37360 }, { "epoch": 4.931371255114161, "grad_norm": 5.099747431813739e-05, "learning_rate": 1.1520064984853473e-09, "loss": 0.0, "num_input_tokens_seen": 18392656, "step": 37365 }, { "epoch": 4.93203114689191, "grad_norm": 1.4332696082419716e-05, "learning_rate": 1.1300052956876304e-09, "loss": 0.0, "num_input_tokens_seen": 18395088, "step": 37370 }, { "epoch": 4.932691038669658, "grad_norm": 0.00013304037565831095, "learning_rate": 1.1082160958129082e-09, "loss": 0.0, "num_input_tokens_seen": 18397456, "step": 37375 }, { "epoch": 4.933350930447407, "grad_norm": 0.0004227849130984396, "learning_rate": 1.0866389034860368e-09, "loss": 0.0, "num_input_tokens_seen": 18400016, "step": 37380 }, { "epoch": 4.934010822225155, "grad_norm": 0.00013109679275657982, "learning_rate": 1.0652737232864639e-09, "loss": 0.0, "num_input_tokens_seen": 18402640, "step": 37385 }, { "epoch": 4.934670714002904, "grad_norm": 0.008443259634077549, "learning_rate": 1.0441205597487845e-09, "loss": 0.0, "num_input_tokens_seen": 18405072, "step": 37390 }, { "epoch": 4.935330605780652, "grad_norm": 0.0001453846925869584, "learning_rate": 1.0231794173626296e-09, "loss": 0.0001, "num_input_tokens_seen": 18407440, "step": 37395 }, { "epoch": 4.9359904975584, "grad_norm": 4.117728894925676e-05, "learning_rate": 1.002450300572666e-09, "loss": 0.0003, "num_input_tokens_seen": 18409872, "step": 37400 }, { "epoch": 4.936650389336149, "grad_norm": 0.0007367845973931253, "learning_rate": 9.819332137784853e-10, "loss": 0.0, "num_input_tokens_seen": 18412432, "step": 37405 }, { "epoch": 4.937310281113898, "grad_norm": 4.505567267187871e-05, "learning_rate": 9.616281613347155e-10, "loss": 0.0016, "num_input_tokens_seen": 18414608, "step": 37410 }, { "epoch": 4.937970172891646, "grad_norm": 0.00013725746248383075, "learning_rate": 9.415351475507982e-10, "loss": 0.0044, "num_input_tokens_seen": 18417424, "step": 37415 }, { "epoch": 4.938630064669394, "grad_norm": 3.152512726956047e-05, "learning_rate": 9.216541766914332e-10, "loss": 0.0, "num_input_tokens_seen": 18419664, "step": 37420 }, { "epoch": 4.939289956447142, "grad_norm": 0.0017625819891691208, "learning_rate": 9.019852529762451e-10, "loss": 0.0001, "num_input_tokens_seen": 18422352, "step": 37425 }, { "epoch": 4.939949848224892, "grad_norm": 2.2083482690504752e-05, "learning_rate": 8.825283805796724e-10, "loss": 0.0, "num_input_tokens_seen": 18424912, "step": 37430 }, { "epoch": 4.94060974000264, "grad_norm": 5.681900802301243e-05, "learning_rate": 8.632835636315227e-10, "loss": 0.0, "num_input_tokens_seen": 18427600, "step": 37435 }, { "epoch": 4.941269631780388, "grad_norm": 2.003555346163921e-05, "learning_rate": 8.442508062163068e-10, "loss": 0.0, "num_input_tokens_seen": 18430032, "step": 37440 }, { "epoch": 4.941929523558136, "grad_norm": 0.016679493710398674, "learning_rate": 8.254301123734597e-10, "loss": 0.0, "num_input_tokens_seen": 18432400, "step": 37445 }, { "epoch": 4.942589415335885, "grad_norm": 0.0004292270168662071, "learning_rate": 8.068214860976752e-10, "loss": 0.0322, "num_input_tokens_seen": 18434768, "step": 37450 }, { "epoch": 4.943249307113634, "grad_norm": 14.158968925476074, "learning_rate": 7.884249313383495e-10, "loss": 0.0861, "num_input_tokens_seen": 18436944, "step": 37455 }, { "epoch": 4.943909198891382, "grad_norm": 0.00015048046770971268, "learning_rate": 7.702404520002481e-10, "loss": 0.0308, "num_input_tokens_seen": 18439248, "step": 37460 }, { "epoch": 4.94456909066913, "grad_norm": 5.5279640946537256e-05, "learning_rate": 7.522680519426173e-10, "loss": 0.0001, "num_input_tokens_seen": 18441680, "step": 37465 }, { "epoch": 4.945228982446879, "grad_norm": 1.82662970473757e-05, "learning_rate": 7.345077349801832e-10, "loss": 0.0, "num_input_tokens_seen": 18443856, "step": 37470 }, { "epoch": 4.945888874224627, "grad_norm": 0.039773765951395035, "learning_rate": 7.169595048823751e-10, "loss": 0.0, "num_input_tokens_seen": 18446288, "step": 37475 }, { "epoch": 4.946548766002375, "grad_norm": 4.541295528411865, "learning_rate": 6.996233653736583e-10, "loss": 0.0496, "num_input_tokens_seen": 18448464, "step": 37480 }, { "epoch": 4.947208657780124, "grad_norm": 5.383110692491755e-05, "learning_rate": 6.824993201334228e-10, "loss": 0.0, "num_input_tokens_seen": 18450768, "step": 37485 }, { "epoch": 4.947868549557873, "grad_norm": 2.5847257347777486e-05, "learning_rate": 6.655873727963168e-10, "loss": 0.0, "num_input_tokens_seen": 18453136, "step": 37490 }, { "epoch": 4.948528441335621, "grad_norm": 1.4880834896757733e-05, "learning_rate": 6.488875269516914e-10, "loss": 0.0, "num_input_tokens_seen": 18455504, "step": 37495 }, { "epoch": 4.949188333113369, "grad_norm": 6.817045505158603e-05, "learning_rate": 6.323997861439334e-10, "loss": 0.0, "num_input_tokens_seen": 18458064, "step": 37500 }, { "epoch": 4.9498482248911175, "grad_norm": 0.0013366822386160493, "learning_rate": 6.16124153872466e-10, "loss": 0.0, "num_input_tokens_seen": 18460624, "step": 37505 }, { "epoch": 4.950508116668866, "grad_norm": 0.00018459931015968323, "learning_rate": 6.00060633591748e-10, "loss": 0.0, "num_input_tokens_seen": 18463120, "step": 37510 }, { "epoch": 4.951168008446615, "grad_norm": 0.008064552210271358, "learning_rate": 5.842092287109412e-10, "loss": 0.0012, "num_input_tokens_seen": 18465616, "step": 37515 }, { "epoch": 4.951827900224363, "grad_norm": 0.059500906616449356, "learning_rate": 5.685699425945767e-10, "loss": 0.0, "num_input_tokens_seen": 18467984, "step": 37520 }, { "epoch": 4.9524877920021115, "grad_norm": 0.020492108538746834, "learning_rate": 5.531427785619991e-10, "loss": 0.0, "num_input_tokens_seen": 18470032, "step": 37525 }, { "epoch": 4.95314768377986, "grad_norm": 2.5691040718811564e-05, "learning_rate": 5.379277398873671e-10, "loss": 0.0, "num_input_tokens_seen": 18472336, "step": 37530 }, { "epoch": 4.953807575557608, "grad_norm": 1.320360541343689, "learning_rate": 5.229248298000976e-10, "loss": 0.0022, "num_input_tokens_seen": 18475088, "step": 37535 }, { "epoch": 4.954467467335357, "grad_norm": 1.585727477504406e-05, "learning_rate": 5.081340514843102e-10, "loss": 0.0252, "num_input_tokens_seen": 18477776, "step": 37540 }, { "epoch": 4.9551273591131055, "grad_norm": 0.0001642427669139579, "learning_rate": 4.935554080793825e-10, "loss": 0.061, "num_input_tokens_seen": 18480336, "step": 37545 }, { "epoch": 4.955787250890854, "grad_norm": 1.2666053407883737e-05, "learning_rate": 4.791889026793949e-10, "loss": 0.0, "num_input_tokens_seen": 18482640, "step": 37550 }, { "epoch": 4.956447142668602, "grad_norm": 4.189881292404607e-05, "learning_rate": 4.6503453833368623e-10, "loss": 0.0, "num_input_tokens_seen": 18484880, "step": 37555 }, { "epoch": 4.95710703444635, "grad_norm": 1.872699249361176e-05, "learning_rate": 4.5109231804629776e-10, "loss": 0.0, "num_input_tokens_seen": 18487696, "step": 37560 }, { "epoch": 4.9577669262240995, "grad_norm": 12.058331489562988, "learning_rate": 4.37362244776307e-10, "loss": 0.0473, "num_input_tokens_seen": 18490384, "step": 37565 }, { "epoch": 4.958426818001848, "grad_norm": 0.00039062247378751636, "learning_rate": 4.238443214380494e-10, "loss": 0.0, "num_input_tokens_seen": 18492816, "step": 37570 }, { "epoch": 4.959086709779596, "grad_norm": 5.1051236368948594e-05, "learning_rate": 4.105385509004522e-10, "loss": 0.0, "num_input_tokens_seen": 18495376, "step": 37575 }, { "epoch": 4.959746601557344, "grad_norm": 1.5794721548445523e-05, "learning_rate": 3.974449359875898e-10, "loss": 0.0, "num_input_tokens_seen": 18497616, "step": 37580 }, { "epoch": 4.960406493335093, "grad_norm": 3.2267846108879894e-05, "learning_rate": 3.8456347947835034e-10, "loss": 0.0, "num_input_tokens_seen": 18500048, "step": 37585 }, { "epoch": 4.961066385112842, "grad_norm": 1.5826091839699075e-05, "learning_rate": 3.7189418410699114e-10, "loss": 0.0, "num_input_tokens_seen": 18502608, "step": 37590 }, { "epoch": 4.96172627689059, "grad_norm": 8.943623106461018e-05, "learning_rate": 3.5943705256236136e-10, "loss": 0.0044, "num_input_tokens_seen": 18505104, "step": 37595 }, { "epoch": 4.962386168668338, "grad_norm": 0.00010258001566398889, "learning_rate": 3.4719208748834607e-10, "loss": 0.0, "num_input_tokens_seen": 18507536, "step": 37600 }, { "epoch": 4.963046060446087, "grad_norm": 0.006609190255403519, "learning_rate": 3.3515929148397737e-10, "loss": 0.0001, "num_input_tokens_seen": 18510032, "step": 37605 }, { "epoch": 4.963705952223835, "grad_norm": 7.628079038113356e-05, "learning_rate": 3.2333866710299027e-10, "loss": 0.0, "num_input_tokens_seen": 18512592, "step": 37610 }, { "epoch": 4.964365844001584, "grad_norm": 1.7201859009219334e-05, "learning_rate": 3.1173021685426684e-10, "loss": 0.0, "num_input_tokens_seen": 18514704, "step": 37615 }, { "epoch": 4.965025735779332, "grad_norm": 3.107169322902337e-05, "learning_rate": 3.003339432016139e-10, "loss": 0.0095, "num_input_tokens_seen": 18517328, "step": 37620 }, { "epoch": 4.965685627557081, "grad_norm": 0.0002109938650391996, "learning_rate": 2.891498485638744e-10, "loss": 0.0006, "num_input_tokens_seen": 18520080, "step": 37625 }, { "epoch": 4.966345519334829, "grad_norm": 7.280964928213507e-05, "learning_rate": 2.781779353147051e-10, "loss": 0.0, "num_input_tokens_seen": 18522384, "step": 37630 }, { "epoch": 4.967005411112577, "grad_norm": 0.00024594253045506775, "learning_rate": 2.6741820578290997e-10, "loss": 0.0, "num_input_tokens_seen": 18525392, "step": 37635 }, { "epoch": 4.967665302890326, "grad_norm": 13.583490371704102, "learning_rate": 2.568706622519956e-10, "loss": 0.0322, "num_input_tokens_seen": 18527760, "step": 37640 }, { "epoch": 4.968325194668075, "grad_norm": 0.23574484884738922, "learning_rate": 2.465353069608378e-10, "loss": 0.0002, "num_input_tokens_seen": 18530000, "step": 37645 }, { "epoch": 4.968985086445823, "grad_norm": 0.005555503070354462, "learning_rate": 2.3641214210279314e-10, "loss": 0.0836, "num_input_tokens_seen": 18532624, "step": 37650 }, { "epoch": 4.969644978223571, "grad_norm": 0.00010899443441303447, "learning_rate": 2.265011698266983e-10, "loss": 0.0, "num_input_tokens_seen": 18535056, "step": 37655 }, { "epoch": 4.9703048700013195, "grad_norm": 1.8328459191252477e-05, "learning_rate": 2.168023922357598e-10, "loss": 0.0, "num_input_tokens_seen": 18537488, "step": 37660 }, { "epoch": 4.970964761779069, "grad_norm": 0.0003537725133355707, "learning_rate": 2.0731581138877518e-10, "loss": 0.0, "num_input_tokens_seen": 18539792, "step": 37665 }, { "epoch": 4.971624653556817, "grad_norm": 5.8356781664770097e-05, "learning_rate": 1.980414292990229e-10, "loss": 0.0, "num_input_tokens_seen": 18542224, "step": 37670 }, { "epoch": 4.972284545334565, "grad_norm": 1.1076231203333009e-05, "learning_rate": 1.889792479350394e-10, "loss": 0.0, "num_input_tokens_seen": 18544912, "step": 37675 }, { "epoch": 4.9729444371123135, "grad_norm": 0.002201800001785159, "learning_rate": 1.8012926922017502e-10, "loss": 0.0, "num_input_tokens_seen": 18547280, "step": 37680 }, { "epoch": 4.973604328890062, "grad_norm": 1.901209179777652e-05, "learning_rate": 1.714914950327051e-10, "loss": 0.0, "num_input_tokens_seen": 18549840, "step": 37685 }, { "epoch": 4.974264220667811, "grad_norm": 0.0002244171773782, "learning_rate": 1.6306592720594093e-10, "loss": 0.0, "num_input_tokens_seen": 18552208, "step": 37690 }, { "epoch": 4.974924112445559, "grad_norm": 0.01187474001199007, "learning_rate": 1.5485256752822973e-10, "loss": 0.0233, "num_input_tokens_seen": 18554640, "step": 37695 }, { "epoch": 4.9755840042233075, "grad_norm": 1.741272171784658e-05, "learning_rate": 1.4685141774273268e-10, "loss": 0.0, "num_input_tokens_seen": 18557392, "step": 37700 }, { "epoch": 4.976243896001056, "grad_norm": 0.00016994222823996097, "learning_rate": 1.3906247954764694e-10, "loss": 0.0, "num_input_tokens_seen": 18559696, "step": 37705 }, { "epoch": 4.976903787778804, "grad_norm": 1.2624673217942473e-05, "learning_rate": 1.3148575459609457e-10, "loss": 0.0, "num_input_tokens_seen": 18561936, "step": 37710 }, { "epoch": 4.977563679556553, "grad_norm": 0.0006298840162344277, "learning_rate": 1.241212444962336e-10, "loss": 0.0, "num_input_tokens_seen": 18564432, "step": 37715 }, { "epoch": 4.9782235713343015, "grad_norm": 0.037689466029405594, "learning_rate": 1.169689508111471e-10, "loss": 0.0, "num_input_tokens_seen": 18566928, "step": 37720 }, { "epoch": 4.97888346311205, "grad_norm": 1.6840593161759898e-05, "learning_rate": 1.1002887505873193e-10, "loss": 0.0, "num_input_tokens_seen": 18569744, "step": 37725 }, { "epoch": 4.979543354889798, "grad_norm": 0.2575373351573944, "learning_rate": 1.0330101871214303e-10, "loss": 0.0001, "num_input_tokens_seen": 18572048, "step": 37730 }, { "epoch": 4.980203246667546, "grad_norm": 2.676645090105012e-05, "learning_rate": 9.678538319923824e-11, "loss": 0.0, "num_input_tokens_seen": 18574544, "step": 37735 }, { "epoch": 4.980863138445295, "grad_norm": 1.6679727195878513e-05, "learning_rate": 9.048196990280033e-11, "loss": 0.0005, "num_input_tokens_seen": 18577296, "step": 37740 }, { "epoch": 4.981523030223044, "grad_norm": 0.0002634110569488257, "learning_rate": 8.439078016087009e-11, "loss": 0.0, "num_input_tokens_seen": 18579600, "step": 37745 }, { "epoch": 4.982182922000792, "grad_norm": 0.0004117540374863893, "learning_rate": 7.851181526619122e-11, "loss": 0.0997, "num_input_tokens_seen": 18582224, "step": 37750 }, { "epoch": 4.98284281377854, "grad_norm": 1.633869942452293e-05, "learning_rate": 7.284507646654336e-11, "loss": 0.0, "num_input_tokens_seen": 18584336, "step": 37755 }, { "epoch": 4.983502705556289, "grad_norm": 16.390350341796875, "learning_rate": 6.739056496452011e-11, "loss": 0.0472, "num_input_tokens_seen": 18586896, "step": 37760 }, { "epoch": 4.984162597334037, "grad_norm": 1.5212469406833407e-05, "learning_rate": 6.214828191797305e-11, "loss": 0.0001, "num_input_tokens_seen": 18589520, "step": 37765 }, { "epoch": 4.984822489111786, "grad_norm": 0.00015223871741909534, "learning_rate": 5.711822843945669e-11, "loss": 0.0324, "num_input_tokens_seen": 18591952, "step": 37770 }, { "epoch": 4.985482380889534, "grad_norm": 0.00014864149852655828, "learning_rate": 5.230040559656146e-11, "loss": 0.0, "num_input_tokens_seen": 18594384, "step": 37775 }, { "epoch": 4.986142272667283, "grad_norm": 5.9889051044592634e-05, "learning_rate": 4.769481441191381e-11, "loss": 0.0019, "num_input_tokens_seen": 18596880, "step": 37780 }, { "epoch": 4.986802164445031, "grad_norm": 0.9271034598350525, "learning_rate": 4.330145586284306e-11, "loss": 0.0038, "num_input_tokens_seen": 18599632, "step": 37785 }, { "epoch": 4.987462056222779, "grad_norm": 0.0002262652269564569, "learning_rate": 3.912033088204758e-11, "loss": 0.0, "num_input_tokens_seen": 18602128, "step": 37790 }, { "epoch": 4.988121948000527, "grad_norm": 2.7516141926753335e-05, "learning_rate": 3.515144035670658e-11, "loss": 0.0, "num_input_tokens_seen": 18604496, "step": 37795 }, { "epoch": 4.988781839778277, "grad_norm": 9.783699351828545e-05, "learning_rate": 3.139478512936833e-11, "loss": 0.0, "num_input_tokens_seen": 18606992, "step": 37800 }, { "epoch": 4.989441731556025, "grad_norm": 1.840178265410941e-05, "learning_rate": 2.7850365997283963e-11, "loss": 0.0, "num_input_tokens_seen": 18609232, "step": 37805 }, { "epoch": 4.990101623333773, "grad_norm": 8.516815432813019e-05, "learning_rate": 2.4518183712740615e-11, "loss": 0.0, "num_input_tokens_seen": 18611472, "step": 37810 }, { "epoch": 4.990761515111521, "grad_norm": 5.190412048250437e-05, "learning_rate": 2.1398238982839324e-11, "loss": 0.0, "num_input_tokens_seen": 18613776, "step": 37815 }, { "epoch": 4.99142140688927, "grad_norm": 1.6882424461073242e-05, "learning_rate": 1.8490532470050168e-11, "loss": 0.0, "num_input_tokens_seen": 18616272, "step": 37820 }, { "epoch": 4.992081298667019, "grad_norm": 0.00013847336231265217, "learning_rate": 1.5795064791213065e-11, "loss": 0.028, "num_input_tokens_seen": 18618768, "step": 37825 }, { "epoch": 4.992741190444767, "grad_norm": 0.00012111458636354655, "learning_rate": 1.3311836518647978e-11, "loss": 0.0, "num_input_tokens_seen": 18621328, "step": 37830 }, { "epoch": 4.993401082222515, "grad_norm": 0.02647862210869789, "learning_rate": 1.104084817926676e-11, "loss": 0.0, "num_input_tokens_seen": 18623568, "step": 37835 }, { "epoch": 4.994060974000264, "grad_norm": 0.00014961596752982587, "learning_rate": 8.982100255128244e-12, "loss": 0.0, "num_input_tokens_seen": 18626000, "step": 37840 }, { "epoch": 4.994720865778012, "grad_norm": 1.651822094572708e-05, "learning_rate": 7.135593183216215e-12, "loss": 0.0, "num_input_tokens_seen": 18628496, "step": 37845 }, { "epoch": 4.995380757555761, "grad_norm": 6.622510409215465e-05, "learning_rate": 5.501327355328378e-12, "loss": 0.0, "num_input_tokens_seen": 18630928, "step": 37850 }, { "epoch": 4.996040649333509, "grad_norm": 0.00030600413447245955, "learning_rate": 4.0793031184094275e-12, "loss": 0.0001, "num_input_tokens_seen": 18633488, "step": 37855 }, { "epoch": 4.996700541111258, "grad_norm": 1.81586710823467e-05, "learning_rate": 2.8695207742179816e-12, "loss": 0.0, "num_input_tokens_seen": 18635984, "step": 37860 }, { "epoch": 4.997360432889006, "grad_norm": 0.0009784384164959192, "learning_rate": 1.8719805796596487e-12, "loss": 0.0, "num_input_tokens_seen": 18638288, "step": 37865 }, { "epoch": 4.998020324666754, "grad_norm": 0.0030284288804978132, "learning_rate": 1.086682746231915e-12, "loss": 0.0, "num_input_tokens_seen": 18640720, "step": 37870 }, { "epoch": 4.998680216444503, "grad_norm": 3.776025914703496e-05, "learning_rate": 5.136274408013008e-13, "loss": 0.0, "num_input_tokens_seen": 18642960, "step": 37875 }, { "epoch": 4.999340108222252, "grad_norm": 2.063461215584539e-05, "learning_rate": 1.5281478493722745e-13, "loss": 0.0, "num_input_tokens_seen": 18645008, "step": 37880 }, { "epoch": 5.0, "grad_norm": 2.0273428162909113e-05, "learning_rate": 4.244855245083556e-15, "loss": 0.0001, "num_input_tokens_seen": 18647328, "step": 37885 }, { "epoch": 5.0, "num_input_tokens_seen": 18647328, "step": 37885, "total_flos": 1.0887944845433242e+17, "train_loss": 0.05778941776209685, "train_runtime": 3377.3501, "train_samples_per_second": 89.736, "train_steps_per_second": 11.217 } ], "logging_steps": 5, "max_steps": 37885, "num_input_tokens_seen": 18647328, "num_train_epochs": 5, "save_steps": 1895, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.0887944845433242e+17, "train_batch_size": 8, "trial_name": null, "trial_params": null }