{ "best_global_step": 1064, "best_metric": 0.18848362565040588, "best_model_checkpoint": "saves_bts_preliminary/base/llama-3.2-1b-instruct/train_boolq_42_1776331558/checkpoint-1064", "epoch": 5.0, "eval_steps": 266, "global_step": 5305, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00471253534401508, "grad_norm": 318.0024719238281, "learning_rate": 3.766478342749529e-08, "loss": 0.8967, "num_input_tokens_seen": 10752, "step": 5 }, { "epoch": 0.00942507068803016, "grad_norm": 311.7052001953125, "learning_rate": 8.474576271186442e-08, "loss": 0.865, "num_input_tokens_seen": 20736, "step": 10 }, { "epoch": 0.01413760603204524, "grad_norm": 320.9153747558594, "learning_rate": 1.3182674199623353e-07, "loss": 0.8712, "num_input_tokens_seen": 31232, "step": 15 }, { "epoch": 0.01885014137606032, "grad_norm": 198.89523315429688, "learning_rate": 1.7890772128060264e-07, "loss": 0.7056, "num_input_tokens_seen": 40320, "step": 20 }, { "epoch": 0.0235626767200754, "grad_norm": 119.11978149414062, "learning_rate": 2.2598870056497177e-07, "loss": 0.5698, "num_input_tokens_seen": 51136, "step": 25 }, { "epoch": 0.02827521206409048, "grad_norm": 40.273216247558594, "learning_rate": 2.730696798493409e-07, "loss": 0.4948, "num_input_tokens_seen": 63424, "step": 30 }, { "epoch": 0.03298774740810556, "grad_norm": 48.785831451416016, "learning_rate": 3.2015065913371e-07, "loss": 0.3516, "num_input_tokens_seen": 75328, "step": 35 }, { "epoch": 0.03770028275212064, "grad_norm": 40.042724609375, "learning_rate": 3.6723163841807916e-07, "loss": 0.3321, "num_input_tokens_seen": 86784, "step": 40 }, { "epoch": 0.04241281809613572, "grad_norm": 27.21363639831543, "learning_rate": 4.1431261770244826e-07, "loss": 0.3329, "num_input_tokens_seen": 99136, "step": 45 }, { "epoch": 0.0471253534401508, "grad_norm": 165.71754455566406, "learning_rate": 4.613935969868174e-07, "loss": 0.4395, "num_input_tokens_seen": 108672, "step": 50 }, { "epoch": 0.051837888784165884, "grad_norm": 41.76438903808594, "learning_rate": 5.084745762711865e-07, "loss": 0.4549, "num_input_tokens_seen": 119808, "step": 55 }, { "epoch": 0.05655042412818096, "grad_norm": 67.6883544921875, "learning_rate": 5.555555555555555e-07, "loss": 0.3434, "num_input_tokens_seen": 130048, "step": 60 }, { "epoch": 0.061262959472196045, "grad_norm": 72.87140655517578, "learning_rate": 6.026365348399247e-07, "loss": 0.3434, "num_input_tokens_seen": 142464, "step": 65 }, { "epoch": 0.06597549481621112, "grad_norm": 34.45458221435547, "learning_rate": 6.497175141242938e-07, "loss": 0.3516, "num_input_tokens_seen": 154048, "step": 70 }, { "epoch": 0.0706880301602262, "grad_norm": 33.45477294921875, "learning_rate": 6.96798493408663e-07, "loss": 0.3088, "num_input_tokens_seen": 166720, "step": 75 }, { "epoch": 0.07540056550424128, "grad_norm": 39.73252868652344, "learning_rate": 7.43879472693032e-07, "loss": 0.3218, "num_input_tokens_seen": 179200, "step": 80 }, { "epoch": 0.08011310084825636, "grad_norm": 39.0076789855957, "learning_rate": 7.909604519774013e-07, "loss": 0.3962, "num_input_tokens_seen": 190464, "step": 85 }, { "epoch": 0.08482563619227144, "grad_norm": 23.60104751586914, "learning_rate": 8.380414312617704e-07, "loss": 0.3243, "num_input_tokens_seen": 205376, "step": 90 }, { "epoch": 0.08953817153628653, "grad_norm": 43.6406364440918, "learning_rate": 8.851224105461394e-07, "loss": 0.383, "num_input_tokens_seen": 215424, "step": 95 }, { "epoch": 0.0942507068803016, "grad_norm": 20.507476806640625, "learning_rate": 9.322033898305086e-07, "loss": 0.2987, "num_input_tokens_seen": 226688, "step": 100 }, { "epoch": 0.09896324222431668, "grad_norm": 27.949003219604492, "learning_rate": 9.792843691148776e-07, "loss": 0.2859, "num_input_tokens_seen": 237248, "step": 105 }, { "epoch": 0.10367577756833177, "grad_norm": 44.55869674682617, "learning_rate": 1.0263653483992468e-06, "loss": 0.3517, "num_input_tokens_seen": 254144, "step": 110 }, { "epoch": 0.10838831291234684, "grad_norm": 65.89197540283203, "learning_rate": 1.073446327683616e-06, "loss": 0.296, "num_input_tokens_seen": 265920, "step": 115 }, { "epoch": 0.11310084825636192, "grad_norm": 77.50157165527344, "learning_rate": 1.120527306967985e-06, "loss": 0.4375, "num_input_tokens_seen": 277184, "step": 120 }, { "epoch": 0.117813383600377, "grad_norm": 48.51222229003906, "learning_rate": 1.167608286252354e-06, "loss": 0.2998, "num_input_tokens_seen": 289024, "step": 125 }, { "epoch": 0.12252591894439209, "grad_norm": 36.21791076660156, "learning_rate": 1.2146892655367234e-06, "loss": 0.2883, "num_input_tokens_seen": 299456, "step": 130 }, { "epoch": 0.12723845428840716, "grad_norm": 38.02757263183594, "learning_rate": 1.2617702448210926e-06, "loss": 0.3329, "num_input_tokens_seen": 313728, "step": 135 }, { "epoch": 0.13195098963242224, "grad_norm": 26.75225257873535, "learning_rate": 1.3088512241054615e-06, "loss": 0.2533, "num_input_tokens_seen": 326080, "step": 140 }, { "epoch": 0.13666352497643733, "grad_norm": 31.08643341064453, "learning_rate": 1.3559322033898307e-06, "loss": 0.2705, "num_input_tokens_seen": 338688, "step": 145 }, { "epoch": 0.1413760603204524, "grad_norm": 38.68683624267578, "learning_rate": 1.4030131826741996e-06, "loss": 0.3576, "num_input_tokens_seen": 349632, "step": 150 }, { "epoch": 0.1460885956644675, "grad_norm": 36.93141174316406, "learning_rate": 1.4500941619585688e-06, "loss": 0.2256, "num_input_tokens_seen": 363968, "step": 155 }, { "epoch": 0.15080113100848255, "grad_norm": 47.699241638183594, "learning_rate": 1.4971751412429381e-06, "loss": 0.4483, "num_input_tokens_seen": 375680, "step": 160 }, { "epoch": 0.15551366635249764, "grad_norm": 24.248348236083984, "learning_rate": 1.544256120527307e-06, "loss": 0.269, "num_input_tokens_seen": 386368, "step": 165 }, { "epoch": 0.16022620169651272, "grad_norm": 18.501712799072266, "learning_rate": 1.5913370998116762e-06, "loss": 0.3019, "num_input_tokens_seen": 396992, "step": 170 }, { "epoch": 0.1649387370405278, "grad_norm": 13.451111793518066, "learning_rate": 1.6384180790960452e-06, "loss": 0.3423, "num_input_tokens_seen": 408960, "step": 175 }, { "epoch": 0.1696512723845429, "grad_norm": 24.91837501525879, "learning_rate": 1.6854990583804145e-06, "loss": 0.2813, "num_input_tokens_seen": 419008, "step": 180 }, { "epoch": 0.17436380772855797, "grad_norm": 23.55257225036621, "learning_rate": 1.7325800376647837e-06, "loss": 0.2445, "num_input_tokens_seen": 430144, "step": 185 }, { "epoch": 0.17907634307257306, "grad_norm": 50.514373779296875, "learning_rate": 1.7796610169491526e-06, "loss": 0.2385, "num_input_tokens_seen": 441216, "step": 190 }, { "epoch": 0.18378887841658811, "grad_norm": 63.581478118896484, "learning_rate": 1.8267419962335218e-06, "loss": 0.2216, "num_input_tokens_seen": 451584, "step": 195 }, { "epoch": 0.1885014137606032, "grad_norm": 132.5011444091797, "learning_rate": 1.873822975517891e-06, "loss": 0.4569, "num_input_tokens_seen": 464384, "step": 200 }, { "epoch": 0.19321394910461828, "grad_norm": 17.86366081237793, "learning_rate": 1.92090395480226e-06, "loss": 0.4075, "num_input_tokens_seen": 481792, "step": 205 }, { "epoch": 0.19792648444863337, "grad_norm": 22.02827262878418, "learning_rate": 1.9679849340866293e-06, "loss": 0.2703, "num_input_tokens_seen": 493952, "step": 210 }, { "epoch": 0.20263901979264845, "grad_norm": 22.791790008544922, "learning_rate": 2.015065913370998e-06, "loss": 0.2604, "num_input_tokens_seen": 504832, "step": 215 }, { "epoch": 0.20735155513666353, "grad_norm": 26.63323974609375, "learning_rate": 2.062146892655367e-06, "loss": 0.277, "num_input_tokens_seen": 514368, "step": 220 }, { "epoch": 0.21206409048067862, "grad_norm": 48.75139617919922, "learning_rate": 2.1092278719397365e-06, "loss": 0.2405, "num_input_tokens_seen": 525568, "step": 225 }, { "epoch": 0.21677662582469368, "grad_norm": 29.34063148498535, "learning_rate": 2.1563088512241055e-06, "loss": 0.259, "num_input_tokens_seen": 537664, "step": 230 }, { "epoch": 0.22148916116870876, "grad_norm": 26.771190643310547, "learning_rate": 2.203389830508475e-06, "loss": 0.2561, "num_input_tokens_seen": 547584, "step": 235 }, { "epoch": 0.22620169651272384, "grad_norm": 32.1733512878418, "learning_rate": 2.2504708097928438e-06, "loss": 0.3491, "num_input_tokens_seen": 558144, "step": 240 }, { "epoch": 0.23091423185673893, "grad_norm": 17.223844528198242, "learning_rate": 2.297551789077213e-06, "loss": 0.2543, "num_input_tokens_seen": 569024, "step": 245 }, { "epoch": 0.235626767200754, "grad_norm": 28.303009033203125, "learning_rate": 2.344632768361582e-06, "loss": 0.3033, "num_input_tokens_seen": 580864, "step": 250 }, { "epoch": 0.2403393025447691, "grad_norm": 50.68221664428711, "learning_rate": 2.391713747645951e-06, "loss": 0.2747, "num_input_tokens_seen": 592768, "step": 255 }, { "epoch": 0.24505183788878418, "grad_norm": 22.0999698638916, "learning_rate": 2.4387947269303204e-06, "loss": 0.336, "num_input_tokens_seen": 604032, "step": 260 }, { "epoch": 0.24976437323279924, "grad_norm": 29.03078269958496, "learning_rate": 2.4858757062146898e-06, "loss": 0.2277, "num_input_tokens_seen": 616256, "step": 265 }, { "epoch": 0.25070688030160226, "eval_loss": 0.25048765540122986, "eval_runtime": 2.7618, "eval_samples_per_second": 341.444, "eval_steps_per_second": 42.726, "num_input_tokens_seen": 618432, "step": 266 }, { "epoch": 0.2544769085768143, "grad_norm": 33.82787322998047, "learning_rate": 2.5329566854990583e-06, "loss": 0.2331, "num_input_tokens_seen": 627072, "step": 270 }, { "epoch": 0.25918944392082943, "grad_norm": 19.519996643066406, "learning_rate": 2.5800376647834272e-06, "loss": 0.157, "num_input_tokens_seen": 638592, "step": 275 }, { "epoch": 0.2639019792648445, "grad_norm": 43.415740966796875, "learning_rate": 2.627118644067797e-06, "loss": 0.3209, "num_input_tokens_seen": 648448, "step": 280 }, { "epoch": 0.26861451460885954, "grad_norm": 33.58452606201172, "learning_rate": 2.674199623352166e-06, "loss": 0.2578, "num_input_tokens_seen": 662784, "step": 285 }, { "epoch": 0.27332704995287466, "grad_norm": 22.619098663330078, "learning_rate": 2.7212806026365353e-06, "loss": 0.3557, "num_input_tokens_seen": 673856, "step": 290 }, { "epoch": 0.2780395852968897, "grad_norm": 19.780139923095703, "learning_rate": 2.7683615819209043e-06, "loss": 0.2089, "num_input_tokens_seen": 683136, "step": 295 }, { "epoch": 0.2827521206409048, "grad_norm": 16.88971519470215, "learning_rate": 2.8154425612052732e-06, "loss": 0.2989, "num_input_tokens_seen": 694784, "step": 300 }, { "epoch": 0.2874646559849199, "grad_norm": 18.517738342285156, "learning_rate": 2.862523540489642e-06, "loss": 0.2632, "num_input_tokens_seen": 706624, "step": 305 }, { "epoch": 0.292177191328935, "grad_norm": 26.673988342285156, "learning_rate": 2.9096045197740115e-06, "loss": 0.2979, "num_input_tokens_seen": 716800, "step": 310 }, { "epoch": 0.29688972667295005, "grad_norm": 40.26359176635742, "learning_rate": 2.9566854990583805e-06, "loss": 0.3261, "num_input_tokens_seen": 728704, "step": 315 }, { "epoch": 0.3016022620169651, "grad_norm": 20.343751907348633, "learning_rate": 3.00376647834275e-06, "loss": 0.1851, "num_input_tokens_seen": 740352, "step": 320 }, { "epoch": 0.3063147973609802, "grad_norm": 30.753236770629883, "learning_rate": 3.0508474576271192e-06, "loss": 0.2727, "num_input_tokens_seen": 751936, "step": 325 }, { "epoch": 0.3110273327049953, "grad_norm": 27.787220001220703, "learning_rate": 3.097928436911488e-06, "loss": 0.3077, "num_input_tokens_seen": 763264, "step": 330 }, { "epoch": 0.3157398680490104, "grad_norm": 13.1635103225708, "learning_rate": 3.145009416195857e-06, "loss": 0.3285, "num_input_tokens_seen": 772992, "step": 335 }, { "epoch": 0.32045240339302544, "grad_norm": 32.643653869628906, "learning_rate": 3.192090395480226e-06, "loss": 0.2493, "num_input_tokens_seen": 787008, "step": 340 }, { "epoch": 0.32516493873704055, "grad_norm": 31.31944465637207, "learning_rate": 3.2391713747645954e-06, "loss": 0.233, "num_input_tokens_seen": 798848, "step": 345 }, { "epoch": 0.3298774740810556, "grad_norm": 69.64946746826172, "learning_rate": 3.2862523540489644e-06, "loss": 0.3409, "num_input_tokens_seen": 811584, "step": 350 }, { "epoch": 0.33459000942507067, "grad_norm": 21.657026290893555, "learning_rate": 3.3333333333333333e-06, "loss": 0.1946, "num_input_tokens_seen": 822208, "step": 355 }, { "epoch": 0.3393025447690858, "grad_norm": 48.79526138305664, "learning_rate": 3.3804143126177023e-06, "loss": 0.3024, "num_input_tokens_seen": 833792, "step": 360 }, { "epoch": 0.34401508011310084, "grad_norm": 22.62244415283203, "learning_rate": 3.427495291902072e-06, "loss": 0.347, "num_input_tokens_seen": 845568, "step": 365 }, { "epoch": 0.34872761545711595, "grad_norm": 23.822986602783203, "learning_rate": 3.474576271186441e-06, "loss": 0.3787, "num_input_tokens_seen": 855040, "step": 370 }, { "epoch": 0.353440150801131, "grad_norm": 8.774383544921875, "learning_rate": 3.5216572504708104e-06, "loss": 0.3066, "num_input_tokens_seen": 867712, "step": 375 }, { "epoch": 0.3581526861451461, "grad_norm": 8.585469245910645, "learning_rate": 3.5687382297551793e-06, "loss": 0.2843, "num_input_tokens_seen": 880960, "step": 380 }, { "epoch": 0.36286522148916117, "grad_norm": 16.818485260009766, "learning_rate": 3.6158192090395483e-06, "loss": 0.2422, "num_input_tokens_seen": 894784, "step": 385 }, { "epoch": 0.36757775683317623, "grad_norm": 16.943395614624023, "learning_rate": 3.662900188323917e-06, "loss": 0.2664, "num_input_tokens_seen": 905600, "step": 390 }, { "epoch": 0.37229029217719134, "grad_norm": 26.195528030395508, "learning_rate": 3.7099811676082866e-06, "loss": 0.3223, "num_input_tokens_seen": 915072, "step": 395 }, { "epoch": 0.3770028275212064, "grad_norm": 37.17734146118164, "learning_rate": 3.7570621468926555e-06, "loss": 0.2066, "num_input_tokens_seen": 927552, "step": 400 }, { "epoch": 0.3817153628652215, "grad_norm": 43.06858444213867, "learning_rate": 3.8041431261770245e-06, "loss": 0.2964, "num_input_tokens_seen": 940160, "step": 405 }, { "epoch": 0.38642789820923656, "grad_norm": 31.916301727294922, "learning_rate": 3.851224105461394e-06, "loss": 0.1989, "num_input_tokens_seen": 949760, "step": 410 }, { "epoch": 0.3911404335532517, "grad_norm": 21.781017303466797, "learning_rate": 3.898305084745763e-06, "loss": 0.2152, "num_input_tokens_seen": 960896, "step": 415 }, { "epoch": 0.39585296889726673, "grad_norm": 18.697711944580078, "learning_rate": 3.945386064030132e-06, "loss": 0.3237, "num_input_tokens_seen": 971648, "step": 420 }, { "epoch": 0.4005655042412818, "grad_norm": 24.69769287109375, "learning_rate": 3.992467043314501e-06, "loss": 0.2182, "num_input_tokens_seen": 981504, "step": 425 }, { "epoch": 0.4052780395852969, "grad_norm": 26.988998413085938, "learning_rate": 4.03954802259887e-06, "loss": 0.1811, "num_input_tokens_seen": 993664, "step": 430 }, { "epoch": 0.40999057492931196, "grad_norm": 39.637481689453125, "learning_rate": 4.08662900188324e-06, "loss": 0.2151, "num_input_tokens_seen": 1003008, "step": 435 }, { "epoch": 0.41470311027332707, "grad_norm": 36.60890579223633, "learning_rate": 4.133709981167609e-06, "loss": 0.2333, "num_input_tokens_seen": 1013248, "step": 440 }, { "epoch": 0.4194156456173421, "grad_norm": 21.840532302856445, "learning_rate": 4.180790960451978e-06, "loss": 0.2694, "num_input_tokens_seen": 1023296, "step": 445 }, { "epoch": 0.42412818096135724, "grad_norm": 48.131988525390625, "learning_rate": 4.2278719397363475e-06, "loss": 0.3493, "num_input_tokens_seen": 1033152, "step": 450 }, { "epoch": 0.4288407163053723, "grad_norm": 30.9437198638916, "learning_rate": 4.2749529190207165e-06, "loss": 0.2147, "num_input_tokens_seen": 1045248, "step": 455 }, { "epoch": 0.43355325164938735, "grad_norm": 38.648189544677734, "learning_rate": 4.322033898305085e-06, "loss": 0.3071, "num_input_tokens_seen": 1058944, "step": 460 }, { "epoch": 0.43826578699340246, "grad_norm": 45.260948181152344, "learning_rate": 4.369114877589454e-06, "loss": 0.2118, "num_input_tokens_seen": 1071680, "step": 465 }, { "epoch": 0.4429783223374175, "grad_norm": 30.458763122558594, "learning_rate": 4.416195856873823e-06, "loss": 0.2319, "num_input_tokens_seen": 1083328, "step": 470 }, { "epoch": 0.44769085768143263, "grad_norm": 44.823246002197266, "learning_rate": 4.463276836158192e-06, "loss": 0.4004, "num_input_tokens_seen": 1097600, "step": 475 }, { "epoch": 0.4524033930254477, "grad_norm": 30.685049057006836, "learning_rate": 4.510357815442561e-06, "loss": 0.218, "num_input_tokens_seen": 1109376, "step": 480 }, { "epoch": 0.4571159283694628, "grad_norm": 20.631444931030273, "learning_rate": 4.55743879472693e-06, "loss": 0.2579, "num_input_tokens_seen": 1120448, "step": 485 }, { "epoch": 0.46182846371347785, "grad_norm": 27.939373016357422, "learning_rate": 4.6045197740113e-06, "loss": 0.298, "num_input_tokens_seen": 1131392, "step": 490 }, { "epoch": 0.4665409990574929, "grad_norm": 13.28208065032959, "learning_rate": 4.651600753295669e-06, "loss": 0.2203, "num_input_tokens_seen": 1143552, "step": 495 }, { "epoch": 0.471253534401508, "grad_norm": 55.77337646484375, "learning_rate": 4.698681732580039e-06, "loss": 0.1785, "num_input_tokens_seen": 1153088, "step": 500 }, { "epoch": 0.4759660697455231, "grad_norm": 32.49051284790039, "learning_rate": 4.745762711864408e-06, "loss": 0.2125, "num_input_tokens_seen": 1166080, "step": 505 }, { "epoch": 0.4806786050895382, "grad_norm": 63.475467681884766, "learning_rate": 4.7928436911487765e-06, "loss": 0.3882, "num_input_tokens_seen": 1176896, "step": 510 }, { "epoch": 0.48539114043355325, "grad_norm": 49.208858489990234, "learning_rate": 4.8399246704331455e-06, "loss": 0.2743, "num_input_tokens_seen": 1187392, "step": 515 }, { "epoch": 0.49010367577756836, "grad_norm": 38.7435417175293, "learning_rate": 4.8870056497175144e-06, "loss": 0.3309, "num_input_tokens_seen": 1196672, "step": 520 }, { "epoch": 0.4948162111215834, "grad_norm": 37.79723358154297, "learning_rate": 4.934086629001883e-06, "loss": 0.3264, "num_input_tokens_seen": 1209344, "step": 525 }, { "epoch": 0.49952874646559847, "grad_norm": 14.449431419372559, "learning_rate": 4.981167608286252e-06, "loss": 0.2193, "num_input_tokens_seen": 1221504, "step": 530 }, { "epoch": 0.5014137606032045, "eval_loss": 0.31662699580192566, "eval_runtime": 3.716, "eval_samples_per_second": 253.768, "eval_steps_per_second": 31.755, "num_input_tokens_seen": 1225408, "step": 532 }, { "epoch": 0.5042412818096136, "grad_norm": 28.057838439941406, "learning_rate": 4.999995128224159e-06, "loss": 0.3059, "num_input_tokens_seen": 1232256, "step": 535 }, { "epoch": 0.5089538171536286, "grad_norm": 24.673755645751953, "learning_rate": 4.999965356329446e-06, "loss": 0.2494, "num_input_tokens_seen": 1242880, "step": 540 }, { "epoch": 0.5136663524976437, "grad_norm": 24.47283172607422, "learning_rate": 4.99990851940408e-06, "loss": 0.2812, "num_input_tokens_seen": 1253248, "step": 545 }, { "epoch": 0.5183788878416589, "grad_norm": 34.05777359008789, "learning_rate": 4.999824618063384e-06, "loss": 0.2639, "num_input_tokens_seen": 1265280, "step": 550 }, { "epoch": 0.5230914231856739, "grad_norm": 23.17024803161621, "learning_rate": 4.99971365321569e-06, "loss": 0.3403, "num_input_tokens_seen": 1275328, "step": 555 }, { "epoch": 0.527803958529689, "grad_norm": 14.374199867248535, "learning_rate": 4.9995756260623194e-06, "loss": 0.2942, "num_input_tokens_seen": 1286272, "step": 560 }, { "epoch": 0.532516493873704, "grad_norm": 23.844751358032227, "learning_rate": 4.999410538097579e-06, "loss": 0.2036, "num_input_tokens_seen": 1298816, "step": 565 }, { "epoch": 0.5372290292177191, "grad_norm": 25.96900177001953, "learning_rate": 4.999218391108735e-06, "loss": 0.2656, "num_input_tokens_seen": 1311680, "step": 570 }, { "epoch": 0.5419415645617343, "grad_norm": 43.49790954589844, "learning_rate": 4.9989991871760054e-06, "loss": 0.2828, "num_input_tokens_seen": 1324672, "step": 575 }, { "epoch": 0.5466540999057493, "grad_norm": 14.262967109680176, "learning_rate": 4.998752928672525e-06, "loss": 0.2081, "num_input_tokens_seen": 1336896, "step": 580 }, { "epoch": 0.5513666352497644, "grad_norm": 3.46243953704834, "learning_rate": 4.9984796182643285e-06, "loss": 0.1485, "num_input_tokens_seen": 1348928, "step": 585 }, { "epoch": 0.5560791705937794, "grad_norm": 34.33613204956055, "learning_rate": 4.99817925891032e-06, "loss": 0.6339, "num_input_tokens_seen": 1362496, "step": 590 }, { "epoch": 0.5607917059377945, "grad_norm": 34.32979965209961, "learning_rate": 4.997851853862237e-06, "loss": 0.5079, "num_input_tokens_seen": 1377152, "step": 595 }, { "epoch": 0.5655042412818096, "grad_norm": 37.79439926147461, "learning_rate": 4.997497406664621e-06, "loss": 0.2396, "num_input_tokens_seen": 1390016, "step": 600 }, { "epoch": 0.5702167766258247, "grad_norm": 17.340059280395508, "learning_rate": 4.997115921154774e-06, "loss": 0.2335, "num_input_tokens_seen": 1401856, "step": 605 }, { "epoch": 0.5749293119698398, "grad_norm": 29.506044387817383, "learning_rate": 4.9967074014627206e-06, "loss": 0.1719, "num_input_tokens_seen": 1412736, "step": 610 }, { "epoch": 0.5796418473138548, "grad_norm": 37.41396713256836, "learning_rate": 4.996271852011161e-06, "loss": 0.3201, "num_input_tokens_seen": 1425280, "step": 615 }, { "epoch": 0.58435438265787, "grad_norm": 80.07007598876953, "learning_rate": 4.995809277515424e-06, "loss": 0.1993, "num_input_tokens_seen": 1436480, "step": 620 }, { "epoch": 0.589066918001885, "grad_norm": 25.41455841064453, "learning_rate": 4.995319682983417e-06, "loss": 0.3072, "num_input_tokens_seen": 1447808, "step": 625 }, { "epoch": 0.5937794533459001, "grad_norm": 25.022064208984375, "learning_rate": 4.99480307371557e-06, "loss": 0.3263, "num_input_tokens_seen": 1460352, "step": 630 }, { "epoch": 0.5984919886899152, "grad_norm": 15.871000289916992, "learning_rate": 4.9942594553047775e-06, "loss": 0.2747, "num_input_tokens_seen": 1472640, "step": 635 }, { "epoch": 0.6032045240339302, "grad_norm": 9.052998542785645, "learning_rate": 4.993688833636341e-06, "loss": 0.2984, "num_input_tokens_seen": 1482688, "step": 640 }, { "epoch": 0.6079170593779454, "grad_norm": 11.553794860839844, "learning_rate": 4.993091214887904e-06, "loss": 0.2671, "num_input_tokens_seen": 1494336, "step": 645 }, { "epoch": 0.6126295947219604, "grad_norm": 11.756524085998535, "learning_rate": 4.992466605529384e-06, "loss": 0.1511, "num_input_tokens_seen": 1504896, "step": 650 }, { "epoch": 0.6173421300659755, "grad_norm": 34.58543395996094, "learning_rate": 4.991815012322902e-06, "loss": 0.3427, "num_input_tokens_seen": 1518592, "step": 655 }, { "epoch": 0.6220546654099905, "grad_norm": 26.234251022338867, "learning_rate": 4.991136442322713e-06, "loss": 0.2164, "num_input_tokens_seen": 1531264, "step": 660 }, { "epoch": 0.6267672007540056, "grad_norm": 22.865320205688477, "learning_rate": 4.990430902875125e-06, "loss": 0.2187, "num_input_tokens_seen": 1541376, "step": 665 }, { "epoch": 0.6314797360980208, "grad_norm": 17.565690994262695, "learning_rate": 4.989698401618423e-06, "loss": 0.2911, "num_input_tokens_seen": 1551424, "step": 670 }, { "epoch": 0.6361922714420358, "grad_norm": 31.6148624420166, "learning_rate": 4.988938946482786e-06, "loss": 0.1331, "num_input_tokens_seen": 1562624, "step": 675 }, { "epoch": 0.6409048067860509, "grad_norm": 18.770896911621094, "learning_rate": 4.988152545690197e-06, "loss": 0.2686, "num_input_tokens_seen": 1574016, "step": 680 }, { "epoch": 0.6456173421300659, "grad_norm": 43.168392181396484, "learning_rate": 4.987339207754358e-06, "loss": 0.3307, "num_input_tokens_seen": 1586688, "step": 685 }, { "epoch": 0.6503298774740811, "grad_norm": 11.960392951965332, "learning_rate": 4.9864989414806e-06, "loss": 0.247, "num_input_tokens_seen": 1596992, "step": 690 }, { "epoch": 0.6550424128180962, "grad_norm": 13.759454727172852, "learning_rate": 4.985631755965779e-06, "loss": 0.3232, "num_input_tokens_seen": 1609920, "step": 695 }, { "epoch": 0.6597549481621112, "grad_norm": 11.490863800048828, "learning_rate": 4.984737660598187e-06, "loss": 0.2132, "num_input_tokens_seen": 1620736, "step": 700 }, { "epoch": 0.6644674835061263, "grad_norm": 39.30780029296875, "learning_rate": 4.983816665057447e-06, "loss": 0.2797, "num_input_tokens_seen": 1632512, "step": 705 }, { "epoch": 0.6691800188501413, "grad_norm": 10.898017883300781, "learning_rate": 4.982868779314405e-06, "loss": 0.3142, "num_input_tokens_seen": 1643264, "step": 710 }, { "epoch": 0.6738925541941565, "grad_norm": 16.296348571777344, "learning_rate": 4.981894013631026e-06, "loss": 0.1914, "num_input_tokens_seen": 1654208, "step": 715 }, { "epoch": 0.6786050895381716, "grad_norm": 23.66090202331543, "learning_rate": 4.980892378560281e-06, "loss": 0.1985, "num_input_tokens_seen": 1664640, "step": 720 }, { "epoch": 0.6833176248821866, "grad_norm": 27.988893508911133, "learning_rate": 4.979863884946034e-06, "loss": 0.2831, "num_input_tokens_seen": 1676864, "step": 725 }, { "epoch": 0.6880301602262017, "grad_norm": 20.06635284423828, "learning_rate": 4.978808543922925e-06, "loss": 0.2082, "num_input_tokens_seen": 1691072, "step": 730 }, { "epoch": 0.6927426955702167, "grad_norm": 19.601367950439453, "learning_rate": 4.9777263669162465e-06, "loss": 0.1227, "num_input_tokens_seen": 1702400, "step": 735 }, { "epoch": 0.6974552309142319, "grad_norm": 6.3521199226379395, "learning_rate": 4.976617365641822e-06, "loss": 0.1471, "num_input_tokens_seen": 1714944, "step": 740 }, { "epoch": 0.702167766258247, "grad_norm": 87.46131134033203, "learning_rate": 4.97548155210588e-06, "loss": 0.3082, "num_input_tokens_seen": 1725376, "step": 745 }, { "epoch": 0.706880301602262, "grad_norm": 80.72607421875, "learning_rate": 4.974318938604921e-06, "loss": 0.4432, "num_input_tokens_seen": 1737152, "step": 750 }, { "epoch": 0.7115928369462771, "grad_norm": 13.310392379760742, "learning_rate": 4.9731295377255885e-06, "loss": 0.1969, "num_input_tokens_seen": 1749120, "step": 755 }, { "epoch": 0.7163053722902922, "grad_norm": 23.76306915283203, "learning_rate": 4.971913362344529e-06, "loss": 0.272, "num_input_tokens_seen": 1760384, "step": 760 }, { "epoch": 0.7210179076343073, "grad_norm": 33.018524169921875, "learning_rate": 4.970670425628255e-06, "loss": 0.1454, "num_input_tokens_seen": 1773632, "step": 765 }, { "epoch": 0.7257304429783223, "grad_norm": 21.914316177368164, "learning_rate": 4.969400741032999e-06, "loss": 0.184, "num_input_tokens_seen": 1787776, "step": 770 }, { "epoch": 0.7304429783223374, "grad_norm": 38.7669792175293, "learning_rate": 4.968104322304575e-06, "loss": 0.2148, "num_input_tokens_seen": 1798336, "step": 775 }, { "epoch": 0.7351555136663525, "grad_norm": 41.227027893066406, "learning_rate": 4.966781183478223e-06, "loss": 0.2897, "num_input_tokens_seen": 1809216, "step": 780 }, { "epoch": 0.7398680490103676, "grad_norm": 26.2169189453125, "learning_rate": 4.965431338878456e-06, "loss": 0.2981, "num_input_tokens_seen": 1822144, "step": 785 }, { "epoch": 0.7445805843543827, "grad_norm": 5.09712553024292, "learning_rate": 4.9640548031189125e-06, "loss": 0.2476, "num_input_tokens_seen": 1833088, "step": 790 }, { "epoch": 0.7492931196983977, "grad_norm": 15.851964950561523, "learning_rate": 4.962651591102191e-06, "loss": 0.2554, "num_input_tokens_seen": 1845056, "step": 795 }, { "epoch": 0.7521206409048068, "eval_loss": 0.2178538739681244, "eval_runtime": 2.7742, "eval_samples_per_second": 339.916, "eval_steps_per_second": 42.535, "num_input_tokens_seen": 1851072, "step": 798 }, { "epoch": 0.7540056550424128, "grad_norm": 14.348052978515625, "learning_rate": 4.961221718019695e-06, "loss": 0.2507, "num_input_tokens_seen": 1855168, "step": 800 }, { "epoch": 0.7587181903864278, "grad_norm": 20.442550659179688, "learning_rate": 4.9597651993514585e-06, "loss": 0.3006, "num_input_tokens_seen": 1867328, "step": 805 }, { "epoch": 0.763430725730443, "grad_norm": 18.405014038085938, "learning_rate": 4.9582820508659924e-06, "loss": 0.1949, "num_input_tokens_seen": 1882560, "step": 810 }, { "epoch": 0.7681432610744581, "grad_norm": 26.241788864135742, "learning_rate": 4.956772288620101e-06, "loss": 0.1866, "num_input_tokens_seen": 1893376, "step": 815 }, { "epoch": 0.7728557964184731, "grad_norm": 4.750776290893555, "learning_rate": 4.955235928958716e-06, "loss": 0.1114, "num_input_tokens_seen": 1906048, "step": 820 }, { "epoch": 0.7775683317624882, "grad_norm": 22.653051376342773, "learning_rate": 4.953672988514716e-06, "loss": 0.2425, "num_input_tokens_seen": 1917568, "step": 825 }, { "epoch": 0.7822808671065034, "grad_norm": 56.989315032958984, "learning_rate": 4.95208348420875e-06, "loss": 0.4121, "num_input_tokens_seen": 1929216, "step": 830 }, { "epoch": 0.7869934024505184, "grad_norm": 21.19652557373047, "learning_rate": 4.950467433249046e-06, "loss": 0.1859, "num_input_tokens_seen": 1940416, "step": 835 }, { "epoch": 0.7917059377945335, "grad_norm": 17.347103118896484, "learning_rate": 4.948824853131237e-06, "loss": 0.2065, "num_input_tokens_seen": 1949632, "step": 840 }, { "epoch": 0.7964184731385485, "grad_norm": 32.96878433227539, "learning_rate": 4.94715576163816e-06, "loss": 0.2102, "num_input_tokens_seen": 1961920, "step": 845 }, { "epoch": 0.8011310084825636, "grad_norm": 4.76591157913208, "learning_rate": 4.945460176839671e-06, "loss": 0.2975, "num_input_tokens_seen": 1973696, "step": 850 }, { "epoch": 0.8058435438265787, "grad_norm": 17.566848754882812, "learning_rate": 4.943738117092447e-06, "loss": 0.294, "num_input_tokens_seen": 1985280, "step": 855 }, { "epoch": 0.8105560791705938, "grad_norm": 34.71393966674805, "learning_rate": 4.941989601039785e-06, "loss": 0.2107, "num_input_tokens_seen": 1997504, "step": 860 }, { "epoch": 0.8152686145146089, "grad_norm": 15.716105461120605, "learning_rate": 4.940214647611405e-06, "loss": 0.2815, "num_input_tokens_seen": 2009600, "step": 865 }, { "epoch": 0.8199811498586239, "grad_norm": 5.163911819458008, "learning_rate": 4.9384132760232395e-06, "loss": 0.1509, "num_input_tokens_seen": 2020672, "step": 870 }, { "epoch": 0.824693685202639, "grad_norm": 32.56769943237305, "learning_rate": 4.93658550577723e-06, "loss": 0.258, "num_input_tokens_seen": 2033408, "step": 875 }, { "epoch": 0.8294062205466541, "grad_norm": 21.050493240356445, "learning_rate": 4.9347313566611145e-06, "loss": 0.2403, "num_input_tokens_seen": 2043328, "step": 880 }, { "epoch": 0.8341187558906692, "grad_norm": 13.551421165466309, "learning_rate": 4.9328508487482115e-06, "loss": 0.2631, "num_input_tokens_seen": 2054656, "step": 885 }, { "epoch": 0.8388312912346843, "grad_norm": 19.12700080871582, "learning_rate": 4.930944002397204e-06, "loss": 0.2302, "num_input_tokens_seen": 2064128, "step": 890 }, { "epoch": 0.8435438265786993, "grad_norm": 29.187570571899414, "learning_rate": 4.929010838251923e-06, "loss": 0.2009, "num_input_tokens_seen": 2076864, "step": 895 }, { "epoch": 0.8482563619227145, "grad_norm": 20.132150650024414, "learning_rate": 4.927051377241115e-06, "loss": 0.1868, "num_input_tokens_seen": 2087104, "step": 900 }, { "epoch": 0.8529688972667295, "grad_norm": 19.931499481201172, "learning_rate": 4.9250656405782215e-06, "loss": 0.3066, "num_input_tokens_seen": 2097728, "step": 905 }, { "epoch": 0.8576814326107446, "grad_norm": 21.25429916381836, "learning_rate": 4.9230536497611525e-06, "loss": 0.1685, "num_input_tokens_seen": 2107904, "step": 910 }, { "epoch": 0.8623939679547596, "grad_norm": 35.41661834716797, "learning_rate": 4.921015426572047e-06, "loss": 0.3358, "num_input_tokens_seen": 2120192, "step": 915 }, { "epoch": 0.8671065032987747, "grad_norm": 20.501426696777344, "learning_rate": 4.918950993077039e-06, "loss": 0.2411, "num_input_tokens_seen": 2131904, "step": 920 }, { "epoch": 0.8718190386427899, "grad_norm": 30.00881576538086, "learning_rate": 4.91686037162602e-06, "loss": 0.3069, "num_input_tokens_seen": 2144640, "step": 925 }, { "epoch": 0.8765315739868049, "grad_norm": 30.22358512878418, "learning_rate": 4.9147435848523975e-06, "loss": 0.1587, "num_input_tokens_seen": 2154112, "step": 930 }, { "epoch": 0.88124410933082, "grad_norm": 10.572684288024902, "learning_rate": 4.91260065567285e-06, "loss": 0.1468, "num_input_tokens_seen": 2167232, "step": 935 }, { "epoch": 0.885956644674835, "grad_norm": 61.71476745605469, "learning_rate": 4.910431607287075e-06, "loss": 0.2699, "num_input_tokens_seen": 2179264, "step": 940 }, { "epoch": 0.8906691800188501, "grad_norm": 10.816360473632812, "learning_rate": 4.908236463177544e-06, "loss": 0.3797, "num_input_tokens_seen": 2191488, "step": 945 }, { "epoch": 0.8953817153628653, "grad_norm": 39.76873016357422, "learning_rate": 4.906015247109242e-06, "loss": 0.1988, "num_input_tokens_seen": 2201856, "step": 950 }, { "epoch": 0.9000942507068803, "grad_norm": 23.409250259399414, "learning_rate": 4.903767983129414e-06, "loss": 0.3161, "num_input_tokens_seen": 2214464, "step": 955 }, { "epoch": 0.9048067860508954, "grad_norm": 23.47569465637207, "learning_rate": 4.901494695567306e-06, "loss": 0.2565, "num_input_tokens_seen": 2229184, "step": 960 }, { "epoch": 0.9095193213949104, "grad_norm": 12.153125762939453, "learning_rate": 4.899195409033897e-06, "loss": 0.2214, "num_input_tokens_seen": 2239104, "step": 965 }, { "epoch": 0.9142318567389256, "grad_norm": 13.904633522033691, "learning_rate": 4.896870148421637e-06, "loss": 0.1992, "num_input_tokens_seen": 2249152, "step": 970 }, { "epoch": 0.9189443920829407, "grad_norm": 9.68702507019043, "learning_rate": 4.894518938904175e-06, "loss": 0.1527, "num_input_tokens_seen": 2261312, "step": 975 }, { "epoch": 0.9236569274269557, "grad_norm": 35.594173431396484, "learning_rate": 4.892141805936085e-06, "loss": 0.1398, "num_input_tokens_seen": 2275008, "step": 980 }, { "epoch": 0.9283694627709708, "grad_norm": 30.414966583251953, "learning_rate": 4.889738775252596e-06, "loss": 0.276, "num_input_tokens_seen": 2287680, "step": 985 }, { "epoch": 0.9330819981149858, "grad_norm": 36.264251708984375, "learning_rate": 4.887309872869308e-06, "loss": 0.2869, "num_input_tokens_seen": 2299840, "step": 990 }, { "epoch": 0.937794533459001, "grad_norm": 34.444374084472656, "learning_rate": 4.884855125081912e-06, "loss": 0.2347, "num_input_tokens_seen": 2311104, "step": 995 }, { "epoch": 0.942507068803016, "grad_norm": 39.005767822265625, "learning_rate": 4.882374558465906e-06, "loss": 0.326, "num_input_tokens_seen": 2322432, "step": 1000 }, { "epoch": 0.9472196041470311, "grad_norm": 23.73866081237793, "learning_rate": 4.8798681998763056e-06, "loss": 0.2946, "num_input_tokens_seen": 2333120, "step": 1005 }, { "epoch": 0.9519321394910462, "grad_norm": 17.239654541015625, "learning_rate": 4.877336076447358e-06, "loss": 0.2846, "num_input_tokens_seen": 2345472, "step": 1010 }, { "epoch": 0.9566446748350612, "grad_norm": 16.902143478393555, "learning_rate": 4.87477821559224e-06, "loss": 0.1988, "num_input_tokens_seen": 2357568, "step": 1015 }, { "epoch": 0.9613572101790764, "grad_norm": 20.823362350463867, "learning_rate": 4.87219464500277e-06, "loss": 0.2295, "num_input_tokens_seen": 2368064, "step": 1020 }, { "epoch": 0.9660697455230914, "grad_norm": 22.501428604125977, "learning_rate": 4.869585392649102e-06, "loss": 0.2166, "num_input_tokens_seen": 2381184, "step": 1025 }, { "epoch": 0.9707822808671065, "grad_norm": 12.077306747436523, "learning_rate": 4.866950486779425e-06, "loss": 0.1964, "num_input_tokens_seen": 2393408, "step": 1030 }, { "epoch": 0.9754948162111216, "grad_norm": 24.82265281677246, "learning_rate": 4.864289955919658e-06, "loss": 0.2603, "num_input_tokens_seen": 2406720, "step": 1035 }, { "epoch": 0.9802073515551367, "grad_norm": 24.67642593383789, "learning_rate": 4.8616038288731394e-06, "loss": 0.3101, "num_input_tokens_seen": 2420288, "step": 1040 }, { "epoch": 0.9849198868991518, "grad_norm": 7.1168532371521, "learning_rate": 4.8588921347203175e-06, "loss": 0.1463, "num_input_tokens_seen": 2431488, "step": 1045 }, { "epoch": 0.9896324222431668, "grad_norm": 12.157154083251953, "learning_rate": 4.8561549028184315e-06, "loss": 0.2497, "num_input_tokens_seen": 2444032, "step": 1050 }, { "epoch": 0.9943449575871819, "grad_norm": 18.19011688232422, "learning_rate": 4.8533921628012e-06, "loss": 0.1574, "num_input_tokens_seen": 2454912, "step": 1055 }, { "epoch": 0.9990574929311969, "grad_norm": 22.441247940063477, "learning_rate": 4.850603944578494e-06, "loss": 0.3676, "num_input_tokens_seen": 2467584, "step": 1060 }, { "epoch": 1.002827521206409, "eval_loss": 0.18848362565040588, "eval_runtime": 2.7314, "eval_samples_per_second": 345.248, "eval_steps_per_second": 43.202, "num_input_tokens_seen": 2475808, "step": 1064 }, { "epoch": 1.003770028275212, "grad_norm": 12.251239776611328, "learning_rate": 4.847790278336017e-06, "loss": 0.1493, "num_input_tokens_seen": 2478048, "step": 1065 }, { "epoch": 1.0084825636192272, "grad_norm": 23.926055908203125, "learning_rate": 4.844951194534975e-06, "loss": 0.1749, "num_input_tokens_seen": 2492576, "step": 1070 }, { "epoch": 1.0131950989632421, "grad_norm": 2.156106472015381, "learning_rate": 4.842086723911751e-06, "loss": 0.1307, "num_input_tokens_seen": 2505440, "step": 1075 }, { "epoch": 1.0179076343072573, "grad_norm": 29.823352813720703, "learning_rate": 4.839196897477569e-06, "loss": 0.1119, "num_input_tokens_seen": 2515488, "step": 1080 }, { "epoch": 1.0226201696512724, "grad_norm": 7.730029106140137, "learning_rate": 4.836281746518159e-06, "loss": 0.1664, "num_input_tokens_seen": 2529504, "step": 1085 }, { "epoch": 1.0273327049952874, "grad_norm": 35.77005386352539, "learning_rate": 4.833341302593417e-06, "loss": 0.1393, "num_input_tokens_seen": 2539872, "step": 1090 }, { "epoch": 1.0320452403393026, "grad_norm": 0.4067946672439575, "learning_rate": 4.830375597537068e-06, "loss": 0.0376, "num_input_tokens_seen": 2549856, "step": 1095 }, { "epoch": 1.0367577756833177, "grad_norm": 0.01726607233285904, "learning_rate": 4.827384663456315e-06, "loss": 0.1836, "num_input_tokens_seen": 2559328, "step": 1100 }, { "epoch": 1.0414703110273327, "grad_norm": 201.8726043701172, "learning_rate": 4.824368532731496e-06, "loss": 0.369, "num_input_tokens_seen": 2569440, "step": 1105 }, { "epoch": 1.0461828463713478, "grad_norm": 123.39364624023438, "learning_rate": 4.821327238015732e-06, "loss": 0.084, "num_input_tokens_seen": 2580448, "step": 1110 }, { "epoch": 1.0508953817153628, "grad_norm": 93.78629302978516, "learning_rate": 4.818260812234572e-06, "loss": 0.4176, "num_input_tokens_seen": 2590752, "step": 1115 }, { "epoch": 1.055607917059378, "grad_norm": 54.99407196044922, "learning_rate": 4.815169288585641e-06, "loss": 0.0664, "num_input_tokens_seen": 2600160, "step": 1120 }, { "epoch": 1.0603204524033931, "grad_norm": 33.015010833740234, "learning_rate": 4.812052700538274e-06, "loss": 0.1558, "num_input_tokens_seen": 2611232, "step": 1125 }, { "epoch": 1.065032987747408, "grad_norm": 10.432161331176758, "learning_rate": 4.808911081833161e-06, "loss": 0.1476, "num_input_tokens_seen": 2623712, "step": 1130 }, { "epoch": 1.0697455230914232, "grad_norm": 21.43227767944336, "learning_rate": 4.805744466481974e-06, "loss": 0.0875, "num_input_tokens_seen": 2635936, "step": 1135 }, { "epoch": 1.0744580584354382, "grad_norm": 0.39066728949546814, "learning_rate": 4.802552888767005e-06, "loss": 0.1297, "num_input_tokens_seen": 2645920, "step": 1140 }, { "epoch": 1.0791705937794533, "grad_norm": 79.24580383300781, "learning_rate": 4.799336383240793e-06, "loss": 0.2563, "num_input_tokens_seen": 2660768, "step": 1145 }, { "epoch": 1.0838831291234685, "grad_norm": 90.48008728027344, "learning_rate": 4.796094984725749e-06, "loss": 0.1484, "num_input_tokens_seen": 2671200, "step": 1150 }, { "epoch": 1.0885956644674835, "grad_norm": 0.7040526270866394, "learning_rate": 4.792828728313778e-06, "loss": 0.1145, "num_input_tokens_seen": 2683040, "step": 1155 }, { "epoch": 1.0933081998114986, "grad_norm": 80.87930297851562, "learning_rate": 4.789537649365904e-06, "loss": 0.0767, "num_input_tokens_seen": 2694432, "step": 1160 }, { "epoch": 1.0980207351555136, "grad_norm": 0.26429542899131775, "learning_rate": 4.78622178351188e-06, "loss": 0.0079, "num_input_tokens_seen": 2707168, "step": 1165 }, { "epoch": 1.1027332704995287, "grad_norm": 0.19729509949684143, "learning_rate": 4.782881166649808e-06, "loss": 0.1644, "num_input_tokens_seen": 2717984, "step": 1170 }, { "epoch": 1.107445805843544, "grad_norm": 220.86328125, "learning_rate": 4.77951583494575e-06, "loss": 0.2543, "num_input_tokens_seen": 2730784, "step": 1175 }, { "epoch": 1.1121583411875589, "grad_norm": 21.004072189331055, "learning_rate": 4.77612582483333e-06, "loss": 0.4821, "num_input_tokens_seen": 2740704, "step": 1180 }, { "epoch": 1.116870876531574, "grad_norm": 45.33163833618164, "learning_rate": 4.772711173013352e-06, "loss": 0.2498, "num_input_tokens_seen": 2751968, "step": 1185 }, { "epoch": 1.121583411875589, "grad_norm": 4.449422359466553, "learning_rate": 4.769271916453387e-06, "loss": 0.1649, "num_input_tokens_seen": 2763808, "step": 1190 }, { "epoch": 1.1262959472196041, "grad_norm": 154.34603881835938, "learning_rate": 4.765808092387385e-06, "loss": 0.0735, "num_input_tokens_seen": 2774624, "step": 1195 }, { "epoch": 1.1310084825636193, "grad_norm": 100.58317565917969, "learning_rate": 4.762319738315269e-06, "loss": 0.2639, "num_input_tokens_seen": 2785888, "step": 1200 }, { "epoch": 1.1357210179076342, "grad_norm": 45.887027740478516, "learning_rate": 4.758806892002526e-06, "loss": 0.3194, "num_input_tokens_seen": 2797216, "step": 1205 }, { "epoch": 1.1404335532516494, "grad_norm": 36.13898849487305, "learning_rate": 4.7552695914798e-06, "loss": 0.1395, "num_input_tokens_seen": 2808032, "step": 1210 }, { "epoch": 1.1451460885956646, "grad_norm": 96.65644836425781, "learning_rate": 4.751707875042481e-06, "loss": 0.2734, "num_input_tokens_seen": 2823008, "step": 1215 }, { "epoch": 1.1498586239396795, "grad_norm": 2.167825698852539, "learning_rate": 4.748121781250288e-06, "loss": 0.0883, "num_input_tokens_seen": 2835936, "step": 1220 }, { "epoch": 1.1545711592836947, "grad_norm": 14.599705696105957, "learning_rate": 4.744511348926855e-06, "loss": 0.169, "num_input_tokens_seen": 2847584, "step": 1225 }, { "epoch": 1.1592836946277096, "grad_norm": 68.31897735595703, "learning_rate": 4.740876617159308e-06, "loss": 0.1451, "num_input_tokens_seen": 2857952, "step": 1230 }, { "epoch": 1.1639962299717248, "grad_norm": 77.1812515258789, "learning_rate": 4.737217625297844e-06, "loss": 0.2114, "num_input_tokens_seen": 2868192, "step": 1235 }, { "epoch": 1.1687087653157398, "grad_norm": 6.400179862976074, "learning_rate": 4.733534412955301e-06, "loss": 0.1145, "num_input_tokens_seen": 2879904, "step": 1240 }, { "epoch": 1.173421300659755, "grad_norm": 1.274997353553772, "learning_rate": 4.729827020006735e-06, "loss": 0.1768, "num_input_tokens_seen": 2892384, "step": 1245 }, { "epoch": 1.17813383600377, "grad_norm": 32.56444549560547, "learning_rate": 4.726095486588983e-06, "loss": 0.1507, "num_input_tokens_seen": 2905184, "step": 1250 }, { "epoch": 1.182846371347785, "grad_norm": 7.450242042541504, "learning_rate": 4.722339853100232e-06, "loss": 0.0958, "num_input_tokens_seen": 2916640, "step": 1255 }, { "epoch": 1.1875589066918002, "grad_norm": 4.951867580413818, "learning_rate": 4.718560160199579e-06, "loss": 0.1192, "num_input_tokens_seen": 2927072, "step": 1260 }, { "epoch": 1.1922714420358154, "grad_norm": 50.1746940612793, "learning_rate": 4.714756448806592e-06, "loss": 0.2693, "num_input_tokens_seen": 2937888, "step": 1265 }, { "epoch": 1.1969839773798303, "grad_norm": 0.2431841641664505, "learning_rate": 4.71092876010087e-06, "loss": 0.1689, "num_input_tokens_seen": 2950752, "step": 1270 }, { "epoch": 1.2016965127238455, "grad_norm": 40.15456771850586, "learning_rate": 4.70707713552159e-06, "loss": 0.0997, "num_input_tokens_seen": 2961056, "step": 1275 }, { "epoch": 1.2064090480678604, "grad_norm": 154.8431396484375, "learning_rate": 4.703201616767067e-06, "loss": 0.1164, "num_input_tokens_seen": 2971552, "step": 1280 }, { "epoch": 1.2111215834118756, "grad_norm": 67.9471206665039, "learning_rate": 4.699302245794293e-06, "loss": 0.0178, "num_input_tokens_seen": 2985120, "step": 1285 }, { "epoch": 1.2158341187558908, "grad_norm": 104.93325805664062, "learning_rate": 4.6953790648184924e-06, "loss": 0.1821, "num_input_tokens_seen": 2996128, "step": 1290 }, { "epoch": 1.2205466540999057, "grad_norm": 0.03052549809217453, "learning_rate": 4.691432116312661e-06, "loss": 0.0199, "num_input_tokens_seen": 3007072, "step": 1295 }, { "epoch": 1.2252591894439209, "grad_norm": 0.9742458462715149, "learning_rate": 4.687461443007101e-06, "loss": 0.006, "num_input_tokens_seen": 3018656, "step": 1300 }, { "epoch": 1.2299717247879358, "grad_norm": 0.007309742737561464, "learning_rate": 4.683467087888967e-06, "loss": 0.1915, "num_input_tokens_seen": 3030624, "step": 1305 }, { "epoch": 1.234684260131951, "grad_norm": 0.3931090831756592, "learning_rate": 4.6794490942017955e-06, "loss": 0.2276, "num_input_tokens_seen": 3043040, "step": 1310 }, { "epoch": 1.2393967954759662, "grad_norm": 8.714564323425293, "learning_rate": 4.6754075054450385e-06, "loss": 0.0236, "num_input_tokens_seen": 3057632, "step": 1315 }, { "epoch": 1.244109330819981, "grad_norm": 0.008542931638658047, "learning_rate": 4.671342365373592e-06, "loss": 0.1376, "num_input_tokens_seen": 3069792, "step": 1320 }, { "epoch": 1.2488218661639963, "grad_norm": 52.45071792602539, "learning_rate": 4.667253717997324e-06, "loss": 0.2062, "num_input_tokens_seen": 3080608, "step": 1325 }, { "epoch": 1.2535344015080114, "grad_norm": 10.894562721252441, "learning_rate": 4.663141607580589e-06, "loss": 0.165, "num_input_tokens_seen": 3091552, "step": 1330 }, { "epoch": 1.2535344015080114, "eval_loss": 0.4607957601547241, "eval_runtime": 2.7224, "eval_samples_per_second": 346.386, "eval_steps_per_second": 43.344, "num_input_tokens_seen": 3091552, "step": 1330 }, { "epoch": 1.2582469368520264, "grad_norm": 121.4914321899414, "learning_rate": 4.659006078641766e-06, "loss": 0.222, "num_input_tokens_seen": 3103712, "step": 1335 }, { "epoch": 1.2629594721960415, "grad_norm": 2.8751637935638428, "learning_rate": 4.6548471759527634e-06, "loss": 0.2312, "num_input_tokens_seen": 3115104, "step": 1340 }, { "epoch": 1.2676720075400565, "grad_norm": 3.6843035221099854, "learning_rate": 4.6506649445385335e-06, "loss": 0.011, "num_input_tokens_seen": 3127648, "step": 1345 }, { "epoch": 1.2723845428840717, "grad_norm": 26.937593460083008, "learning_rate": 4.646459429676594e-06, "loss": 0.2732, "num_input_tokens_seen": 3138208, "step": 1350 }, { "epoch": 1.2770970782280866, "grad_norm": 41.53554916381836, "learning_rate": 4.642230676896531e-06, "loss": 0.148, "num_input_tokens_seen": 3148256, "step": 1355 }, { "epoch": 1.2818096135721018, "grad_norm": 74.98961639404297, "learning_rate": 4.6379787319795076e-06, "loss": 0.0901, "num_input_tokens_seen": 3157856, "step": 1360 }, { "epoch": 1.286522148916117, "grad_norm": 1.2443631887435913, "learning_rate": 4.6337036409577705e-06, "loss": 0.24, "num_input_tokens_seen": 3167136, "step": 1365 }, { "epoch": 1.2912346842601319, "grad_norm": 0.20186370611190796, "learning_rate": 4.62940545011415e-06, "loss": 0.0842, "num_input_tokens_seen": 3181984, "step": 1370 }, { "epoch": 1.295947219604147, "grad_norm": 22.39756965637207, "learning_rate": 4.625084205981554e-06, "loss": 0.1368, "num_input_tokens_seen": 3195744, "step": 1375 }, { "epoch": 1.3006597549481622, "grad_norm": 9.254731178283691, "learning_rate": 4.620739955342476e-06, "loss": 0.2497, "num_input_tokens_seen": 3207776, "step": 1380 }, { "epoch": 1.3053722902921772, "grad_norm": 0.06419213116168976, "learning_rate": 4.616372745228477e-06, "loss": 0.0782, "num_input_tokens_seen": 3219296, "step": 1385 }, { "epoch": 1.3100848256361923, "grad_norm": 56.7759895324707, "learning_rate": 4.611982622919684e-06, "loss": 0.3956, "num_input_tokens_seen": 3230048, "step": 1390 }, { "epoch": 1.3147973609802073, "grad_norm": 68.79596710205078, "learning_rate": 4.607569635944271e-06, "loss": 0.1166, "num_input_tokens_seen": 3239200, "step": 1395 }, { "epoch": 1.3195098963242224, "grad_norm": 27.92612648010254, "learning_rate": 4.603133832077953e-06, "loss": 0.2557, "num_input_tokens_seen": 3255008, "step": 1400 }, { "epoch": 1.3242224316682374, "grad_norm": 13.399755477905273, "learning_rate": 4.598675259343462e-06, "loss": 0.2547, "num_input_tokens_seen": 3267040, "step": 1405 }, { "epoch": 1.3289349670122526, "grad_norm": 25.696258544921875, "learning_rate": 4.594193966010031e-06, "loss": 0.2374, "num_input_tokens_seen": 3276960, "step": 1410 }, { "epoch": 1.3336475023562677, "grad_norm": 29.0289363861084, "learning_rate": 4.589690000592868e-06, "loss": 0.0795, "num_input_tokens_seen": 3287840, "step": 1415 }, { "epoch": 1.3383600377002827, "grad_norm": 30.088584899902344, "learning_rate": 4.585163411852632e-06, "loss": 0.2095, "num_input_tokens_seen": 3300256, "step": 1420 }, { "epoch": 1.3430725730442978, "grad_norm": 3.960421562194824, "learning_rate": 4.58061424879491e-06, "loss": 0.3144, "num_input_tokens_seen": 3311712, "step": 1425 }, { "epoch": 1.347785108388313, "grad_norm": 75.69437408447266, "learning_rate": 4.576042560669678e-06, "loss": 0.1113, "num_input_tokens_seen": 3322144, "step": 1430 }, { "epoch": 1.352497643732328, "grad_norm": 53.89783477783203, "learning_rate": 4.571448396970773e-06, "loss": 0.4022, "num_input_tokens_seen": 3333856, "step": 1435 }, { "epoch": 1.3572101790763431, "grad_norm": 17.59637451171875, "learning_rate": 4.566831807435359e-06, "loss": 0.1542, "num_input_tokens_seen": 3345696, "step": 1440 }, { "epoch": 1.3619227144203583, "grad_norm": 15.906473159790039, "learning_rate": 4.562192842043381e-06, "loss": 0.2594, "num_input_tokens_seen": 3357024, "step": 1445 }, { "epoch": 1.3666352497643732, "grad_norm": 53.453163146972656, "learning_rate": 4.557531551017034e-06, "loss": 0.1721, "num_input_tokens_seen": 3368480, "step": 1450 }, { "epoch": 1.3713477851083884, "grad_norm": 10.427976608276367, "learning_rate": 4.552847984820208e-06, "loss": 0.1418, "num_input_tokens_seen": 3378720, "step": 1455 }, { "epoch": 1.3760603204524033, "grad_norm": 17.01227569580078, "learning_rate": 4.548142194157951e-06, "loss": 0.1344, "num_input_tokens_seen": 3390688, "step": 1460 }, { "epoch": 1.3807728557964185, "grad_norm": 0.41409215331077576, "learning_rate": 4.54341422997592e-06, "loss": 0.2518, "num_input_tokens_seen": 3403488, "step": 1465 }, { "epoch": 1.3854853911404335, "grad_norm": 3.571580410003662, "learning_rate": 4.538664143459819e-06, "loss": 0.1194, "num_input_tokens_seen": 3415648, "step": 1470 }, { "epoch": 1.3901979264844486, "grad_norm": 39.68430709838867, "learning_rate": 4.5338919860348565e-06, "loss": 0.1113, "num_input_tokens_seen": 3427168, "step": 1475 }, { "epoch": 1.3949104618284638, "grad_norm": 0.09742722660303116, "learning_rate": 4.529097809365184e-06, "loss": 0.1426, "num_input_tokens_seen": 3437664, "step": 1480 }, { "epoch": 1.3996229971724787, "grad_norm": 80.09423828125, "learning_rate": 4.524281665353334e-06, "loss": 0.3136, "num_input_tokens_seen": 3450144, "step": 1485 }, { "epoch": 1.404335532516494, "grad_norm": 38.64655303955078, "learning_rate": 4.519443606139665e-06, "loss": 0.1617, "num_input_tokens_seen": 3461280, "step": 1490 }, { "epoch": 1.409048067860509, "grad_norm": 60.909393310546875, "learning_rate": 4.514583684101792e-06, "loss": 0.2666, "num_input_tokens_seen": 3472608, "step": 1495 }, { "epoch": 1.413760603204524, "grad_norm": 89.08367919921875, "learning_rate": 4.509701951854018e-06, "loss": 0.105, "num_input_tokens_seen": 3485024, "step": 1500 }, { "epoch": 1.4184731385485392, "grad_norm": 73.14676666259766, "learning_rate": 4.504798462246768e-06, "loss": 0.2341, "num_input_tokens_seen": 3496096, "step": 1505 }, { "epoch": 1.4231856738925541, "grad_norm": 33.10121154785156, "learning_rate": 4.499873268366017e-06, "loss": 0.2829, "num_input_tokens_seen": 3506848, "step": 1510 }, { "epoch": 1.4278982092365693, "grad_norm": 45.99144744873047, "learning_rate": 4.494926423532715e-06, "loss": 0.1819, "num_input_tokens_seen": 3521568, "step": 1515 }, { "epoch": 1.4326107445805842, "grad_norm": 3.1161906719207764, "learning_rate": 4.4899579813022046e-06, "loss": 0.1103, "num_input_tokens_seen": 3533856, "step": 1520 }, { "epoch": 1.4373232799245994, "grad_norm": 1.9241315126419067, "learning_rate": 4.484967995463648e-06, "loss": 0.216, "num_input_tokens_seen": 3544544, "step": 1525 }, { "epoch": 1.4420358152686146, "grad_norm": 26.153079986572266, "learning_rate": 4.479956520039443e-06, "loss": 0.303, "num_input_tokens_seen": 3554336, "step": 1530 }, { "epoch": 1.4467483506126295, "grad_norm": 8.090953826904297, "learning_rate": 4.474923609284635e-06, "loss": 0.0434, "num_input_tokens_seen": 3564384, "step": 1535 }, { "epoch": 1.4514608859566447, "grad_norm": 0.26238393783569336, "learning_rate": 4.469869317686332e-06, "loss": 0.1438, "num_input_tokens_seen": 3576992, "step": 1540 }, { "epoch": 1.4561734213006599, "grad_norm": 92.67262268066406, "learning_rate": 4.464793699963116e-06, "loss": 0.1766, "num_input_tokens_seen": 3587872, "step": 1545 }, { "epoch": 1.4608859566446748, "grad_norm": 11.002724647521973, "learning_rate": 4.4596968110644484e-06, "loss": 0.0997, "num_input_tokens_seen": 3598560, "step": 1550 }, { "epoch": 1.46559849198869, "grad_norm": 77.25719451904297, "learning_rate": 4.454578706170075e-06, "loss": 0.1595, "num_input_tokens_seen": 3608864, "step": 1555 }, { "epoch": 1.4703110273327051, "grad_norm": 1.6689245700836182, "learning_rate": 4.44943944068943e-06, "loss": 0.0274, "num_input_tokens_seen": 3620960, "step": 1560 }, { "epoch": 1.47502356267672, "grad_norm": 103.46016693115234, "learning_rate": 4.444279070261035e-06, "loss": 0.4584, "num_input_tokens_seen": 3632096, "step": 1565 }, { "epoch": 1.479736098020735, "grad_norm": 57.57553482055664, "learning_rate": 4.4390976507518994e-06, "loss": 0.2423, "num_input_tokens_seen": 3643424, "step": 1570 }, { "epoch": 1.4844486333647502, "grad_norm": 0.6700392961502075, "learning_rate": 4.433895238256909e-06, "loss": 0.046, "num_input_tokens_seen": 3654624, "step": 1575 }, { "epoch": 1.4891611687087654, "grad_norm": 58.0783576965332, "learning_rate": 4.4286718890982275e-06, "loss": 0.0609, "num_input_tokens_seen": 3665504, "step": 1580 }, { "epoch": 1.4938737040527803, "grad_norm": 142.61090087890625, "learning_rate": 4.423427659824681e-06, "loss": 0.2488, "num_input_tokens_seen": 3676448, "step": 1585 }, { "epoch": 1.4985862393967955, "grad_norm": 40.1721305847168, "learning_rate": 4.418162607211146e-06, "loss": 0.4721, "num_input_tokens_seen": 3686432, "step": 1590 }, { "epoch": 1.5032987747408106, "grad_norm": 25.409154891967773, "learning_rate": 4.412876788257936e-06, "loss": 0.2207, "num_input_tokens_seen": 3697312, "step": 1595 }, { "epoch": 1.5042412818096136, "eval_loss": 0.35448023676872253, "eval_runtime": 2.7456, "eval_samples_per_second": 343.46, "eval_steps_per_second": 42.978, "num_input_tokens_seen": 3699104, "step": 1596 }, { "epoch": 1.5080113100848256, "grad_norm": 44.117496490478516, "learning_rate": 4.407570260190186e-06, "loss": 0.2648, "num_input_tokens_seen": 3707808, "step": 1600 }, { "epoch": 1.5127238454288408, "grad_norm": 26.070695877075195, "learning_rate": 4.402243080457229e-06, "loss": 0.3225, "num_input_tokens_seen": 3719840, "step": 1605 }, { "epoch": 1.517436380772856, "grad_norm": 1.1607394218444824, "learning_rate": 4.396895306731978e-06, "loss": 0.2234, "num_input_tokens_seen": 3731168, "step": 1610 }, { "epoch": 1.5221489161168709, "grad_norm": 103.62728881835938, "learning_rate": 4.391526996910298e-06, "loss": 0.2199, "num_input_tokens_seen": 3744160, "step": 1615 }, { "epoch": 1.5268614514608858, "grad_norm": 31.115297317504883, "learning_rate": 4.386138209110385e-06, "loss": 0.1515, "num_input_tokens_seen": 3754912, "step": 1620 }, { "epoch": 1.5315739868049012, "grad_norm": 1.294524073600769, "learning_rate": 4.3807290016721265e-06, "loss": 0.1179, "num_input_tokens_seen": 3767776, "step": 1625 }, { "epoch": 1.5362865221489161, "grad_norm": 92.95679473876953, "learning_rate": 4.375299433156483e-06, "loss": 0.1079, "num_input_tokens_seen": 3779104, "step": 1630 }, { "epoch": 1.540999057492931, "grad_norm": 72.8927001953125, "learning_rate": 4.3698495623448424e-06, "loss": 0.359, "num_input_tokens_seen": 3789408, "step": 1635 }, { "epoch": 1.5457115928369463, "grad_norm": 31.62137794494629, "learning_rate": 4.364379448238392e-06, "loss": 0.1058, "num_input_tokens_seen": 3799584, "step": 1640 }, { "epoch": 1.5504241281809614, "grad_norm": 80.54794311523438, "learning_rate": 4.358889150057476e-06, "loss": 0.3319, "num_input_tokens_seen": 3813344, "step": 1645 }, { "epoch": 1.5551366635249764, "grad_norm": 91.38248443603516, "learning_rate": 4.35337872724095e-06, "loss": 0.1354, "num_input_tokens_seen": 3823328, "step": 1650 }, { "epoch": 1.5598491988689915, "grad_norm": 86.33023071289062, "learning_rate": 4.347848239445548e-06, "loss": 0.1612, "num_input_tokens_seen": 3835232, "step": 1655 }, { "epoch": 1.5645617342130067, "grad_norm": 24.640047073364258, "learning_rate": 4.342297746545228e-06, "loss": 0.2858, "num_input_tokens_seen": 3846368, "step": 1660 }, { "epoch": 1.5692742695570217, "grad_norm": 0.5544624924659729, "learning_rate": 4.336727308630527e-06, "loss": 0.0313, "num_input_tokens_seen": 3858656, "step": 1665 }, { "epoch": 1.5739868049010366, "grad_norm": 23.30266761779785, "learning_rate": 4.33113698600791e-06, "loss": 0.1587, "num_input_tokens_seen": 3871776, "step": 1670 }, { "epoch": 1.578699340245052, "grad_norm": 0.21707068383693695, "learning_rate": 4.325526839199115e-06, "loss": 0.0377, "num_input_tokens_seen": 3884384, "step": 1675 }, { "epoch": 1.583411875589067, "grad_norm": 97.02978515625, "learning_rate": 4.319896928940505e-06, "loss": 0.2741, "num_input_tokens_seen": 3896224, "step": 1680 }, { "epoch": 1.5881244109330819, "grad_norm": 6.382898807525635, "learning_rate": 4.3142473161824e-06, "loss": 0.1037, "num_input_tokens_seen": 3906528, "step": 1685 }, { "epoch": 1.592836946277097, "grad_norm": 36.04171371459961, "learning_rate": 4.308578062088426e-06, "loss": 0.1437, "num_input_tokens_seen": 3917728, "step": 1690 }, { "epoch": 1.5975494816211122, "grad_norm": 61.61280822753906, "learning_rate": 4.302889228034846e-06, "loss": 0.3957, "num_input_tokens_seen": 3928032, "step": 1695 }, { "epoch": 1.6022620169651272, "grad_norm": 1.8270617723464966, "learning_rate": 4.297180875609902e-06, "loss": 0.1641, "num_input_tokens_seen": 3940384, "step": 1700 }, { "epoch": 1.6069745523091423, "grad_norm": 0.7876982092857361, "learning_rate": 4.2914530666131436e-06, "loss": 0.0949, "num_input_tokens_seen": 3951904, "step": 1705 }, { "epoch": 1.6116870876531575, "grad_norm": 59.75898742675781, "learning_rate": 4.285705863054759e-06, "loss": 0.2799, "num_input_tokens_seen": 3963360, "step": 1710 }, { "epoch": 1.6163996229971724, "grad_norm": 50.44517517089844, "learning_rate": 4.279939327154909e-06, "loss": 0.3126, "num_input_tokens_seen": 3974432, "step": 1715 }, { "epoch": 1.6211121583411876, "grad_norm": 22.407121658325195, "learning_rate": 4.274153521343047e-06, "loss": 0.2358, "num_input_tokens_seen": 3984352, "step": 1720 }, { "epoch": 1.6258246936852028, "grad_norm": 2.445833206176758, "learning_rate": 4.268348508257243e-06, "loss": 0.0892, "num_input_tokens_seen": 3994016, "step": 1725 }, { "epoch": 1.6305372290292177, "grad_norm": 79.69355010986328, "learning_rate": 4.262524350743512e-06, "loss": 0.3199, "num_input_tokens_seen": 4005856, "step": 1730 }, { "epoch": 1.6352497643732327, "grad_norm": 27.91238784790039, "learning_rate": 4.25668111185513e-06, "loss": 0.1497, "num_input_tokens_seen": 4017248, "step": 1735 }, { "epoch": 1.6399622997172478, "grad_norm": 65.74903106689453, "learning_rate": 4.250818854851948e-06, "loss": 0.1124, "num_input_tokens_seen": 4028128, "step": 1740 }, { "epoch": 1.644674835061263, "grad_norm": 16.284719467163086, "learning_rate": 4.244937643199711e-06, "loss": 0.1923, "num_input_tokens_seen": 4044768, "step": 1745 }, { "epoch": 1.649387370405278, "grad_norm": 68.08360290527344, "learning_rate": 4.239037540569373e-06, "loss": 0.1026, "num_input_tokens_seen": 4062432, "step": 1750 }, { "epoch": 1.654099905749293, "grad_norm": 16.83579444885254, "learning_rate": 4.233118610836401e-06, "loss": 0.0699, "num_input_tokens_seen": 4074016, "step": 1755 }, { "epoch": 1.6588124410933083, "grad_norm": 26.799367904663086, "learning_rate": 4.227180918080089e-06, "loss": 0.1875, "num_input_tokens_seen": 4084704, "step": 1760 }, { "epoch": 1.6635249764373232, "grad_norm": 10.665923118591309, "learning_rate": 4.221224526582863e-06, "loss": 0.0828, "num_input_tokens_seen": 4095136, "step": 1765 }, { "epoch": 1.6682375117813384, "grad_norm": 0.24358469247817993, "learning_rate": 4.215249500829583e-06, "loss": 0.1379, "num_input_tokens_seen": 4107744, "step": 1770 }, { "epoch": 1.6729500471253536, "grad_norm": 0.6852381229400635, "learning_rate": 4.209255905506847e-06, "loss": 0.2322, "num_input_tokens_seen": 4118624, "step": 1775 }, { "epoch": 1.6776625824693685, "grad_norm": 0.456554651260376, "learning_rate": 4.2032438055022925e-06, "loss": 0.1804, "num_input_tokens_seen": 4129184, "step": 1780 }, { "epoch": 1.6823751178133834, "grad_norm": 96.7328872680664, "learning_rate": 4.197213265903889e-06, "loss": 0.3414, "num_input_tokens_seen": 4141024, "step": 1785 }, { "epoch": 1.6870876531573988, "grad_norm": 16.629526138305664, "learning_rate": 4.191164351999236e-06, "loss": 0.3523, "num_input_tokens_seen": 4151840, "step": 1790 }, { "epoch": 1.6918001885014138, "grad_norm": 23.59195899963379, "learning_rate": 4.18509712927486e-06, "loss": 0.2797, "num_input_tokens_seen": 4164704, "step": 1795 }, { "epoch": 1.6965127238454287, "grad_norm": 38.683265686035156, "learning_rate": 4.179011663415494e-06, "loss": 0.2943, "num_input_tokens_seen": 4177184, "step": 1800 }, { "epoch": 1.701225259189444, "grad_norm": 20.35943031311035, "learning_rate": 4.172908020303384e-06, "loss": 0.0589, "num_input_tokens_seen": 4188768, "step": 1805 }, { "epoch": 1.705937794533459, "grad_norm": 25.21088218688965, "learning_rate": 4.166786266017557e-06, "loss": 0.1865, "num_input_tokens_seen": 4200480, "step": 1810 }, { "epoch": 1.710650329877474, "grad_norm": 18.756656646728516, "learning_rate": 4.160646466833121e-06, "loss": 0.1045, "num_input_tokens_seen": 4212064, "step": 1815 }, { "epoch": 1.7153628652214892, "grad_norm": 38.346832275390625, "learning_rate": 4.154488689220536e-06, "loss": 0.2373, "num_input_tokens_seen": 4221728, "step": 1820 }, { "epoch": 1.7200754005655043, "grad_norm": 61.90775680541992, "learning_rate": 4.1483129998449035e-06, "loss": 0.216, "num_input_tokens_seen": 4233888, "step": 1825 }, { "epoch": 1.7247879359095193, "grad_norm": 35.818946838378906, "learning_rate": 4.142119465565238e-06, "loss": 0.2308, "num_input_tokens_seen": 4245344, "step": 1830 }, { "epoch": 1.7295004712535345, "grad_norm": 42.63814163208008, "learning_rate": 4.135908153433748e-06, "loss": 0.0663, "num_input_tokens_seen": 4256992, "step": 1835 }, { "epoch": 1.7342130065975496, "grad_norm": 1.1722609996795654, "learning_rate": 4.129679130695105e-06, "loss": 0.0795, "num_input_tokens_seen": 4266784, "step": 1840 }, { "epoch": 1.7389255419415646, "grad_norm": 73.20691680908203, "learning_rate": 4.123432464785721e-06, "loss": 0.0953, "num_input_tokens_seen": 4281504, "step": 1845 }, { "epoch": 1.7436380772855795, "grad_norm": 61.06163024902344, "learning_rate": 4.117168223333015e-06, "loss": 0.3657, "num_input_tokens_seen": 4296032, "step": 1850 }, { "epoch": 1.7483506126295947, "grad_norm": 3.197977304458618, "learning_rate": 4.1108864741546815e-06, "loss": 0.0417, "num_input_tokens_seen": 4309280, "step": 1855 }, { "epoch": 1.7530631479736098, "grad_norm": 0.4998331665992737, "learning_rate": 4.1045872852579546e-06, "loss": 0.1138, "num_input_tokens_seen": 4319648, "step": 1860 }, { "epoch": 1.7549481621112157, "eval_loss": 0.3500010073184967, "eval_runtime": 2.7501, "eval_samples_per_second": 342.894, "eval_steps_per_second": 42.907, "num_input_tokens_seen": 4324256, "step": 1862 }, { "epoch": 1.7577756833176248, "grad_norm": 108.458740234375, "learning_rate": 4.098270724838879e-06, "loss": 0.0767, "num_input_tokens_seen": 4330144, "step": 1865 }, { "epoch": 1.76248821866164, "grad_norm": 0.2290242463350296, "learning_rate": 4.091936861281561e-06, "loss": 0.0415, "num_input_tokens_seen": 4343712, "step": 1870 }, { "epoch": 1.7672007540056551, "grad_norm": 93.17559814453125, "learning_rate": 4.085585763157435e-06, "loss": 0.4214, "num_input_tokens_seen": 4354144, "step": 1875 }, { "epoch": 1.77191328934967, "grad_norm": 10.659987449645996, "learning_rate": 4.07921749922452e-06, "loss": 0.013, "num_input_tokens_seen": 4364896, "step": 1880 }, { "epoch": 1.7766258246936852, "grad_norm": 0.5930144786834717, "learning_rate": 4.0728321384266764e-06, "loss": 0.1879, "num_input_tokens_seen": 4377120, "step": 1885 }, { "epoch": 1.7813383600377004, "grad_norm": 0.13112248480319977, "learning_rate": 4.066429749892854e-06, "loss": 0.1512, "num_input_tokens_seen": 4388128, "step": 1890 }, { "epoch": 1.7860508953817154, "grad_norm": 31.263877868652344, "learning_rate": 4.060010402936353e-06, "loss": 0.1946, "num_input_tokens_seen": 4402272, "step": 1895 }, { "epoch": 1.7907634307257303, "grad_norm": 66.94145965576172, "learning_rate": 4.053574167054063e-06, "loss": 0.0513, "num_input_tokens_seen": 4412640, "step": 1900 }, { "epoch": 1.7954759660697457, "grad_norm": 30.63470458984375, "learning_rate": 4.047121111925718e-06, "loss": 0.2935, "num_input_tokens_seen": 4424096, "step": 1905 }, { "epoch": 1.8001885014137606, "grad_norm": 64.27619171142578, "learning_rate": 4.040651307413142e-06, "loss": 0.1499, "num_input_tokens_seen": 4434144, "step": 1910 }, { "epoch": 1.8049010367577756, "grad_norm": 88.78367614746094, "learning_rate": 4.034164823559487e-06, "loss": 0.1671, "num_input_tokens_seen": 4446240, "step": 1915 }, { "epoch": 1.8096135721017907, "grad_norm": 47.201698303222656, "learning_rate": 4.02766173058848e-06, "loss": 0.183, "num_input_tokens_seen": 4455712, "step": 1920 }, { "epoch": 1.814326107445806, "grad_norm": 17.526779174804688, "learning_rate": 4.021142098903662e-06, "loss": 0.2619, "num_input_tokens_seen": 4466144, "step": 1925 }, { "epoch": 1.8190386427898209, "grad_norm": 18.032976150512695, "learning_rate": 4.014605999087623e-06, "loss": 0.2168, "num_input_tokens_seen": 4476064, "step": 1930 }, { "epoch": 1.823751178133836, "grad_norm": 4.104875564575195, "learning_rate": 4.008053501901239e-06, "loss": 0.1402, "num_input_tokens_seen": 4487456, "step": 1935 }, { "epoch": 1.8284637134778512, "grad_norm": 28.21024513244629, "learning_rate": 4.001484678282911e-06, "loss": 0.2318, "num_input_tokens_seen": 4498400, "step": 1940 }, { "epoch": 1.8331762488218661, "grad_norm": 36.88951873779297, "learning_rate": 3.994899599347787e-06, "loss": 0.1527, "num_input_tokens_seen": 4511520, "step": 1945 }, { "epoch": 1.837888784165881, "grad_norm": 12.032304763793945, "learning_rate": 3.9882983363869995e-06, "loss": 0.151, "num_input_tokens_seen": 4523232, "step": 1950 }, { "epoch": 1.8426013195098965, "grad_norm": 22.562625885009766, "learning_rate": 3.981680960866896e-06, "loss": 0.084, "num_input_tokens_seen": 4536416, "step": 1955 }, { "epoch": 1.8473138548539114, "grad_norm": 2.119037389755249, "learning_rate": 3.9750475444282545e-06, "loss": 0.1193, "num_input_tokens_seen": 4546528, "step": 1960 }, { "epoch": 1.8520263901979264, "grad_norm": 5.9970574378967285, "learning_rate": 3.968398158885519e-06, "loss": 0.0301, "num_input_tokens_seen": 4559008, "step": 1965 }, { "epoch": 1.8567389255419415, "grad_norm": 53.16204071044922, "learning_rate": 3.961732876226016e-06, "loss": 0.1272, "num_input_tokens_seen": 4569824, "step": 1970 }, { "epoch": 1.8614514608859567, "grad_norm": 34.37496566772461, "learning_rate": 3.955051768609179e-06, "loss": 0.0125, "num_input_tokens_seen": 4581664, "step": 1975 }, { "epoch": 1.8661639962299716, "grad_norm": 5.8095011711120605, "learning_rate": 3.948354908365762e-06, "loss": 0.2273, "num_input_tokens_seen": 4593696, "step": 1980 }, { "epoch": 1.8708765315739868, "grad_norm": 82.38545989990234, "learning_rate": 3.941642367997062e-06, "loss": 0.3306, "num_input_tokens_seen": 4604064, "step": 1985 }, { "epoch": 1.875589066918002, "grad_norm": 13.79807186126709, "learning_rate": 3.934914220174128e-06, "loss": 0.2246, "num_input_tokens_seen": 4613856, "step": 1990 }, { "epoch": 1.880301602262017, "grad_norm": 9.43858528137207, "learning_rate": 3.9281705377369814e-06, "loss": 0.262, "num_input_tokens_seen": 4624480, "step": 1995 }, { "epoch": 1.885014137606032, "grad_norm": 0.6858423352241516, "learning_rate": 3.921411393693823e-06, "loss": 0.0359, "num_input_tokens_seen": 4634720, "step": 2000 }, { "epoch": 1.8897266729500473, "grad_norm": 12.693150520324707, "learning_rate": 3.9146368612202425e-06, "loss": 0.1522, "num_input_tokens_seen": 4644320, "step": 2005 }, { "epoch": 1.8944392082940622, "grad_norm": 0.35528820753097534, "learning_rate": 3.907847013658429e-06, "loss": 0.1144, "num_input_tokens_seen": 4656672, "step": 2010 }, { "epoch": 1.8991517436380771, "grad_norm": 0.7190976142883301, "learning_rate": 3.901041924516372e-06, "loss": 0.152, "num_input_tokens_seen": 4668832, "step": 2015 }, { "epoch": 1.9038642789820923, "grad_norm": 18.8311767578125, "learning_rate": 3.894221667467074e-06, "loss": 0.0683, "num_input_tokens_seen": 4680096, "step": 2020 }, { "epoch": 1.9085768143261075, "grad_norm": 2.0841264724731445, "learning_rate": 3.887386316347742e-06, "loss": 0.0966, "num_input_tokens_seen": 4692320, "step": 2025 }, { "epoch": 1.9132893496701224, "grad_norm": 90.08401489257812, "learning_rate": 3.880535945158997e-06, "loss": 0.1503, "num_input_tokens_seen": 4709344, "step": 2030 }, { "epoch": 1.9180018850141376, "grad_norm": 0.7957233786582947, "learning_rate": 3.873670628064071e-06, "loss": 0.0726, "num_input_tokens_seen": 4721888, "step": 2035 }, { "epoch": 1.9227144203581528, "grad_norm": 115.30460357666016, "learning_rate": 3.866790439387998e-06, "loss": 0.117, "num_input_tokens_seen": 4732384, "step": 2040 }, { "epoch": 1.9274269557021677, "grad_norm": 0.2744818925857544, "learning_rate": 3.85989545361682e-06, "loss": 0.2188, "num_input_tokens_seen": 4743264, "step": 2045 }, { "epoch": 1.9321394910461829, "grad_norm": 0.26964840292930603, "learning_rate": 3.85298574539677e-06, "loss": 0.1091, "num_input_tokens_seen": 4753248, "step": 2050 }, { "epoch": 1.936852026390198, "grad_norm": 151.45645141601562, "learning_rate": 3.846061389533472e-06, "loss": 0.0907, "num_input_tokens_seen": 4764768, "step": 2055 }, { "epoch": 1.941564561734213, "grad_norm": 72.78887939453125, "learning_rate": 3.839122460991124e-06, "loss": 0.2683, "num_input_tokens_seen": 4775456, "step": 2060 }, { "epoch": 1.946277097078228, "grad_norm": 203.98098754882812, "learning_rate": 3.832169034891695e-06, "loss": 0.3549, "num_input_tokens_seen": 4789152, "step": 2065 }, { "epoch": 1.9509896324222433, "grad_norm": 12.131155014038086, "learning_rate": 3.825201186514103e-06, "loss": 0.0639, "num_input_tokens_seen": 4803488, "step": 2070 }, { "epoch": 1.9557021677662583, "grad_norm": 8.148255348205566, "learning_rate": 3.818218991293406e-06, "loss": 0.2019, "num_input_tokens_seen": 4813216, "step": 2075 }, { "epoch": 1.9604147031102732, "grad_norm": 39.3453369140625, "learning_rate": 3.811222524819983e-06, "loss": 0.1943, "num_input_tokens_seen": 4823584, "step": 2080 }, { "epoch": 1.9651272384542884, "grad_norm": 44.195316314697266, "learning_rate": 3.8042118628387138e-06, "loss": 0.0531, "num_input_tokens_seen": 4838624, "step": 2085 }, { "epoch": 1.9698397737983036, "grad_norm": 69.47586059570312, "learning_rate": 3.7971870812481636e-06, "loss": 0.0121, "num_input_tokens_seen": 4851552, "step": 2090 }, { "epoch": 1.9745523091423185, "grad_norm": 34.5429573059082, "learning_rate": 3.7901482560997577e-06, "loss": 0.1929, "num_input_tokens_seen": 4864352, "step": 2095 }, { "epoch": 1.9792648444863337, "grad_norm": 4.417181015014648, "learning_rate": 3.78309546359696e-06, "loss": 0.2053, "num_input_tokens_seen": 4875616, "step": 2100 }, { "epoch": 1.9839773798303488, "grad_norm": 43.39990997314453, "learning_rate": 3.776028780094446e-06, "loss": 0.0107, "num_input_tokens_seen": 4886560, "step": 2105 }, { "epoch": 1.9886899151743638, "grad_norm": 31.191131591796875, "learning_rate": 3.7689482820972797e-06, "loss": 0.2379, "num_input_tokens_seen": 4898592, "step": 2110 }, { "epoch": 1.993402450518379, "grad_norm": 87.375244140625, "learning_rate": 3.7618540462600792e-06, "loss": 0.2504, "num_input_tokens_seen": 4912160, "step": 2115 }, { "epoch": 1.998114985862394, "grad_norm": 16.684934616088867, "learning_rate": 3.7547461493861948e-06, "loss": 0.1832, "num_input_tokens_seen": 4923424, "step": 2120 }, { "epoch": 2.002827521206409, "grad_norm": 0.0688318982720375, "learning_rate": 3.7476246684268703e-06, "loss": 0.0762, "num_input_tokens_seen": 4932416, "step": 2125 }, { "epoch": 2.005655042412818, "eval_loss": 0.33445462584495544, "eval_runtime": 3.3719, "eval_samples_per_second": 279.667, "eval_steps_per_second": 34.996, "num_input_tokens_seen": 4940992, "step": 2128 }, { "epoch": 2.007540056550424, "grad_norm": 2.835258722305298, "learning_rate": 3.740489680480415e-06, "loss": 0.0528, "num_input_tokens_seen": 4948288, "step": 2130 }, { "epoch": 2.0122525918944394, "grad_norm": 0.02049732208251953, "learning_rate": 3.733341262791366e-06, "loss": 0.0067, "num_input_tokens_seen": 4960512, "step": 2135 }, { "epoch": 2.0169651272384543, "grad_norm": 0.09395653009414673, "learning_rate": 3.7261794927496535e-06, "loss": 0.0027, "num_input_tokens_seen": 4972352, "step": 2140 }, { "epoch": 2.0216776625824693, "grad_norm": 159.60162353515625, "learning_rate": 3.719004447889762e-06, "loss": 0.0681, "num_input_tokens_seen": 4982272, "step": 2145 }, { "epoch": 2.0263901979264842, "grad_norm": 0.5360152721405029, "learning_rate": 3.7118162058898915e-06, "loss": 0.1795, "num_input_tokens_seen": 4993088, "step": 2150 }, { "epoch": 2.0311027332704996, "grad_norm": 0.0288984514772892, "learning_rate": 3.704614844571117e-06, "loss": 0.0124, "num_input_tokens_seen": 5003392, "step": 2155 }, { "epoch": 2.0358152686145146, "grad_norm": 0.07737737149000168, "learning_rate": 3.6974004418965435e-06, "loss": 0.0007, "num_input_tokens_seen": 5014592, "step": 2160 }, { "epoch": 2.0405278039585295, "grad_norm": 81.7595443725586, "learning_rate": 3.6901730759704674e-06, "loss": 0.1943, "num_input_tokens_seen": 5028160, "step": 2165 }, { "epoch": 2.045240339302545, "grad_norm": 0.019410187378525734, "learning_rate": 3.682932825037523e-06, "loss": 0.1365, "num_input_tokens_seen": 5037504, "step": 2170 }, { "epoch": 2.04995287464656, "grad_norm": 15.080971717834473, "learning_rate": 3.675679767481842e-06, "loss": 0.0894, "num_input_tokens_seen": 5052288, "step": 2175 }, { "epoch": 2.054665409990575, "grad_norm": 10.959814071655273, "learning_rate": 3.6684139818262045e-06, "loss": 0.1397, "num_input_tokens_seen": 5064384, "step": 2180 }, { "epoch": 2.05937794533459, "grad_norm": 158.70689392089844, "learning_rate": 3.6611355467311825e-06, "loss": 0.0268, "num_input_tokens_seen": 5074240, "step": 2185 }, { "epoch": 2.064090480678605, "grad_norm": 0.12513531744480133, "learning_rate": 3.653844540994298e-06, "loss": 0.0081, "num_input_tokens_seen": 5085312, "step": 2190 }, { "epoch": 2.06880301602262, "grad_norm": 0.03574146702885628, "learning_rate": 3.6465410435491603e-06, "loss": 0.0006, "num_input_tokens_seen": 5094592, "step": 2195 }, { "epoch": 2.0735155513666355, "grad_norm": 0.017842473462224007, "learning_rate": 3.6392251334646194e-06, "loss": 0.0012, "num_input_tokens_seen": 5108544, "step": 2200 }, { "epoch": 2.0782280867106504, "grad_norm": 0.040509432554244995, "learning_rate": 3.6318968899439042e-06, "loss": 0.2164, "num_input_tokens_seen": 5118976, "step": 2205 }, { "epoch": 2.0829406220546653, "grad_norm": 0.03663352131843567, "learning_rate": 3.6245563923237692e-06, "loss": 0.0004, "num_input_tokens_seen": 5134272, "step": 2210 }, { "epoch": 2.0876531573986803, "grad_norm": 0.11897611618041992, "learning_rate": 3.617203720073633e-06, "loss": 0.0463, "num_input_tokens_seen": 5145408, "step": 2215 }, { "epoch": 2.0923656927426957, "grad_norm": 0.11080852895975113, "learning_rate": 3.6098389527947164e-06, "loss": 0.1413, "num_input_tokens_seen": 5157440, "step": 2220 }, { "epoch": 2.0970782280867106, "grad_norm": 0.09218670427799225, "learning_rate": 3.6024621702191876e-06, "loss": 0.0007, "num_input_tokens_seen": 5170176, "step": 2225 }, { "epoch": 2.1017907634307256, "grad_norm": 1.5784250497817993, "learning_rate": 3.5950734522092908e-06, "loss": 0.2877, "num_input_tokens_seen": 5178944, "step": 2230 }, { "epoch": 2.106503298774741, "grad_norm": 0.22626134753227234, "learning_rate": 3.587672878756487e-06, "loss": 0.0007, "num_input_tokens_seen": 5190272, "step": 2235 }, { "epoch": 2.111215834118756, "grad_norm": 0.011661054566502571, "learning_rate": 3.5802605299805843e-06, "loss": 0.0004, "num_input_tokens_seen": 5202304, "step": 2240 }, { "epoch": 2.115928369462771, "grad_norm": 129.8340301513672, "learning_rate": 3.5728364861288743e-06, "loss": 0.1757, "num_input_tokens_seen": 5215808, "step": 2245 }, { "epoch": 2.1206409048067862, "grad_norm": 0.05797062814235687, "learning_rate": 3.5654008275752607e-06, "loss": 0.0003, "num_input_tokens_seen": 5229056, "step": 2250 }, { "epoch": 2.125353440150801, "grad_norm": 0.6185352206230164, "learning_rate": 3.557953634819389e-06, "loss": 0.0007, "num_input_tokens_seen": 5239616, "step": 2255 }, { "epoch": 2.130065975494816, "grad_norm": 271.65594482421875, "learning_rate": 3.550494988485777e-06, "loss": 0.1511, "num_input_tokens_seen": 5249600, "step": 2260 }, { "epoch": 2.1347785108388315, "grad_norm": 0.7488783001899719, "learning_rate": 3.5430249693229403e-06, "loss": 0.2004, "num_input_tokens_seen": 5261888, "step": 2265 }, { "epoch": 2.1394910461828465, "grad_norm": 0.022314058616757393, "learning_rate": 3.5355436582025184e-06, "loss": 0.0272, "num_input_tokens_seen": 5272768, "step": 2270 }, { "epoch": 2.1442035815268614, "grad_norm": 0.029360728338360786, "learning_rate": 3.5280511361183995e-06, "loss": 0.142, "num_input_tokens_seen": 5283520, "step": 2275 }, { "epoch": 2.1489161168708764, "grad_norm": 0.04351954534649849, "learning_rate": 3.5205474841858444e-06, "loss": 0.0003, "num_input_tokens_seen": 5294336, "step": 2280 }, { "epoch": 2.1536286522148917, "grad_norm": 0.8838725090026855, "learning_rate": 3.513032783640605e-06, "loss": 0.0445, "num_input_tokens_seen": 5304960, "step": 2285 }, { "epoch": 2.1583411875589067, "grad_norm": 0.011690633371472359, "learning_rate": 3.5055071158380512e-06, "loss": 0.0002, "num_input_tokens_seen": 5317184, "step": 2290 }, { "epoch": 2.1630537229029216, "grad_norm": 0.16222970187664032, "learning_rate": 3.497970562252282e-06, "loss": 0.0003, "num_input_tokens_seen": 5329152, "step": 2295 }, { "epoch": 2.167766258246937, "grad_norm": 128.02944946289062, "learning_rate": 3.4904232044752507e-06, "loss": 0.232, "num_input_tokens_seen": 5342016, "step": 2300 }, { "epoch": 2.172478793590952, "grad_norm": 73.5108413696289, "learning_rate": 3.4828651242158764e-06, "loss": 0.1157, "num_input_tokens_seen": 5352768, "step": 2305 }, { "epoch": 2.177191328934967, "grad_norm": 0.029827579855918884, "learning_rate": 3.4752964032991638e-06, "loss": 0.1506, "num_input_tokens_seen": 5364160, "step": 2310 }, { "epoch": 2.181903864278982, "grad_norm": 0.14194637537002563, "learning_rate": 3.4677171236653133e-06, "loss": 0.1442, "num_input_tokens_seen": 5376448, "step": 2315 }, { "epoch": 2.1866163996229973, "grad_norm": 90.07513427734375, "learning_rate": 3.460127367368836e-06, "loss": 0.0562, "num_input_tokens_seen": 5386560, "step": 2320 }, { "epoch": 2.191328934967012, "grad_norm": 0.10351494699716568, "learning_rate": 3.452527216577665e-06, "loss": 0.1956, "num_input_tokens_seen": 5399296, "step": 2325 }, { "epoch": 2.196041470311027, "grad_norm": 0.17435222864151, "learning_rate": 3.444916753572267e-06, "loss": 0.1061, "num_input_tokens_seen": 5410944, "step": 2330 }, { "epoch": 2.2007540056550425, "grad_norm": 0.2240123301744461, "learning_rate": 3.4372960607447493e-06, "loss": 0.0012, "num_input_tokens_seen": 5423168, "step": 2335 }, { "epoch": 2.2054665409990575, "grad_norm": 0.03307259455323219, "learning_rate": 3.429665220597968e-06, "loss": 0.0111, "num_input_tokens_seen": 5436544, "step": 2340 }, { "epoch": 2.2101790763430724, "grad_norm": 0.026153933256864548, "learning_rate": 3.4220243157446388e-06, "loss": 0.0934, "num_input_tokens_seen": 5448512, "step": 2345 }, { "epoch": 2.214891611687088, "grad_norm": 150.3295440673828, "learning_rate": 3.4143734289064363e-06, "loss": 0.0139, "num_input_tokens_seen": 5460032, "step": 2350 }, { "epoch": 2.2196041470311028, "grad_norm": 0.09933875501155853, "learning_rate": 3.4067126429131035e-06, "loss": 0.0004, "num_input_tokens_seen": 5472896, "step": 2355 }, { "epoch": 2.2243166823751177, "grad_norm": 0.017140116542577744, "learning_rate": 3.3990420407015534e-06, "loss": 0.0005, "num_input_tokens_seen": 5482944, "step": 2360 }, { "epoch": 2.229029217719133, "grad_norm": 88.95214080810547, "learning_rate": 3.3913617053149694e-06, "loss": 0.0536, "num_input_tokens_seen": 5494336, "step": 2365 }, { "epoch": 2.233741753063148, "grad_norm": 0.016227245330810547, "learning_rate": 3.3836717199019087e-06, "loss": 0.0001, "num_input_tokens_seen": 5505728, "step": 2370 }, { "epoch": 2.238454288407163, "grad_norm": 0.01039363257586956, "learning_rate": 3.3759721677154022e-06, "loss": 0.0861, "num_input_tokens_seen": 5515328, "step": 2375 }, { "epoch": 2.243166823751178, "grad_norm": 0.014802216552197933, "learning_rate": 3.3682631321120507e-06, "loss": 0.0002, "num_input_tokens_seen": 5525760, "step": 2380 }, { "epoch": 2.2478793590951933, "grad_norm": 0.8376834392547607, "learning_rate": 3.3605446965511256e-06, "loss": 0.168, "num_input_tokens_seen": 5537280, "step": 2385 }, { "epoch": 2.2525918944392083, "grad_norm": 20.87982749938965, "learning_rate": 3.3528169445936616e-06, "loss": 0.0898, "num_input_tokens_seen": 5548928, "step": 2390 }, { "epoch": 2.2563619227144205, "eval_loss": 0.46465176343917847, "eval_runtime": 2.7461, "eval_samples_per_second": 343.401, "eval_steps_per_second": 42.971, "num_input_tokens_seen": 5558144, "step": 2394 }, { "epoch": 2.257304429783223, "grad_norm": 446.71234130859375, "learning_rate": 3.3450799599015567e-06, "loss": 0.1847, "num_input_tokens_seen": 5559872, "step": 2395 }, { "epoch": 2.2620169651272386, "grad_norm": 0.04414854571223259, "learning_rate": 3.3373338262366617e-06, "loss": 0.0234, "num_input_tokens_seen": 5571264, "step": 2400 }, { "epoch": 2.2667295004712535, "grad_norm": 0.1296156644821167, "learning_rate": 3.329578627459878e-06, "loss": 0.0881, "num_input_tokens_seen": 5581312, "step": 2405 }, { "epoch": 2.2714420358152685, "grad_norm": 0.03757103905081749, "learning_rate": 3.3218144475302444e-06, "loss": 0.0004, "num_input_tokens_seen": 5592384, "step": 2410 }, { "epoch": 2.276154571159284, "grad_norm": 0.04516446590423584, "learning_rate": 3.314041370504034e-06, "loss": 0.1036, "num_input_tokens_seen": 5603456, "step": 2415 }, { "epoch": 2.280867106503299, "grad_norm": 0.09362529218196869, "learning_rate": 3.30625948053384e-06, "loss": 0.0579, "num_input_tokens_seen": 5614464, "step": 2420 }, { "epoch": 2.2855796418473138, "grad_norm": 10.87307071685791, "learning_rate": 3.2984688618676665e-06, "loss": 0.089, "num_input_tokens_seen": 5626112, "step": 2425 }, { "epoch": 2.290292177191329, "grad_norm": 0.38255080580711365, "learning_rate": 3.2906695988480144e-06, "loss": 0.0886, "num_input_tokens_seen": 5637248, "step": 2430 }, { "epoch": 2.295004712535344, "grad_norm": 35.04936599731445, "learning_rate": 3.2828617759109715e-06, "loss": 0.0709, "num_input_tokens_seen": 5647552, "step": 2435 }, { "epoch": 2.299717247879359, "grad_norm": 0.16362737119197845, "learning_rate": 3.2750454775852956e-06, "loss": 0.0006, "num_input_tokens_seen": 5662080, "step": 2440 }, { "epoch": 2.304429783223374, "grad_norm": 0.023827245458960533, "learning_rate": 3.2672207884915017e-06, "loss": 0.0005, "num_input_tokens_seen": 5673856, "step": 2445 }, { "epoch": 2.3091423185673894, "grad_norm": 55.789493560791016, "learning_rate": 3.2593877933409436e-06, "loss": 0.107, "num_input_tokens_seen": 5683904, "step": 2450 }, { "epoch": 2.3138548539114043, "grad_norm": 0.02205372042953968, "learning_rate": 3.251546576934897e-06, "loss": 0.0003, "num_input_tokens_seen": 5694400, "step": 2455 }, { "epoch": 2.3185673892554193, "grad_norm": 65.09204864501953, "learning_rate": 3.2436972241636443e-06, "loss": 0.1635, "num_input_tokens_seen": 5705664, "step": 2460 }, { "epoch": 2.3232799245994347, "grad_norm": 0.022804006934165955, "learning_rate": 3.2358398200055515e-06, "loss": 0.0001, "num_input_tokens_seen": 5718848, "step": 2465 }, { "epoch": 2.3279924599434496, "grad_norm": 0.01745908334851265, "learning_rate": 3.227974449526152e-06, "loss": 0.0504, "num_input_tokens_seen": 5732096, "step": 2470 }, { "epoch": 2.3327049952874646, "grad_norm": 91.0146713256836, "learning_rate": 3.2201011978772224e-06, "loss": 0.09, "num_input_tokens_seen": 5742144, "step": 2475 }, { "epoch": 2.3374175306314795, "grad_norm": 0.06392789632081985, "learning_rate": 3.2122201502958635e-06, "loss": 0.0647, "num_input_tokens_seen": 5754176, "step": 2480 }, { "epoch": 2.342130065975495, "grad_norm": 0.008629159070551395, "learning_rate": 3.2043313921035747e-06, "loss": 0.0155, "num_input_tokens_seen": 5767104, "step": 2485 }, { "epoch": 2.34684260131951, "grad_norm": 113.13795471191406, "learning_rate": 3.1964350087053323e-06, "loss": 0.3015, "num_input_tokens_seen": 5779520, "step": 2490 }, { "epoch": 2.3515551366635252, "grad_norm": 243.08010864257812, "learning_rate": 3.1885310855886655e-06, "loss": 0.0284, "num_input_tokens_seen": 5792640, "step": 2495 }, { "epoch": 2.35626767200754, "grad_norm": 0.029916413128376007, "learning_rate": 3.1806197083227276e-06, "loss": 0.0001, "num_input_tokens_seen": 5805696, "step": 2500 }, { "epoch": 2.360980207351555, "grad_norm": 0.012451832182705402, "learning_rate": 3.172700962557373e-06, "loss": 0.168, "num_input_tokens_seen": 5819840, "step": 2505 }, { "epoch": 2.36569274269557, "grad_norm": 0.06405292451381683, "learning_rate": 3.1647749340222288e-06, "loss": 0.1209, "num_input_tokens_seen": 5830016, "step": 2510 }, { "epoch": 2.3704052780395855, "grad_norm": 31.766504287719727, "learning_rate": 3.1568417085257653e-06, "loss": 0.0744, "num_input_tokens_seen": 5840000, "step": 2515 }, { "epoch": 2.3751178133836004, "grad_norm": 117.67131805419922, "learning_rate": 3.1489013719543703e-06, "loss": 0.0681, "num_input_tokens_seen": 5849920, "step": 2520 }, { "epoch": 2.3798303487276153, "grad_norm": 17.114727020263672, "learning_rate": 3.140954010271416e-06, "loss": 0.2567, "num_input_tokens_seen": 5860480, "step": 2525 }, { "epoch": 2.3845428840716307, "grad_norm": 0.0346570685505867, "learning_rate": 3.132999709516329e-06, "loss": 0.0055, "num_input_tokens_seen": 5873408, "step": 2530 }, { "epoch": 2.3892554194156457, "grad_norm": 3.354789972305298, "learning_rate": 3.1250385558036606e-06, "loss": 0.0887, "num_input_tokens_seen": 5884608, "step": 2535 }, { "epoch": 2.3939679547596606, "grad_norm": 46.3475227355957, "learning_rate": 3.1170706353221525e-06, "loss": 0.2362, "num_input_tokens_seen": 5896064, "step": 2540 }, { "epoch": 2.3986804901036756, "grad_norm": 0.14980660378932953, "learning_rate": 3.109096034333805e-06, "loss": 0.0014, "num_input_tokens_seen": 5907776, "step": 2545 }, { "epoch": 2.403393025447691, "grad_norm": 63.976871490478516, "learning_rate": 3.1011148391729434e-06, "loss": 0.0292, "num_input_tokens_seen": 5919744, "step": 2550 }, { "epoch": 2.408105560791706, "grad_norm": 0.936824381351471, "learning_rate": 3.0931271362452803e-06, "loss": 0.18, "num_input_tokens_seen": 5932224, "step": 2555 }, { "epoch": 2.412818096135721, "grad_norm": 0.04161603003740311, "learning_rate": 3.085133012026985e-06, "loss": 0.001, "num_input_tokens_seen": 5943424, "step": 2560 }, { "epoch": 2.4175306314797362, "grad_norm": 50.99045181274414, "learning_rate": 3.0771325530637434e-06, "loss": 0.1243, "num_input_tokens_seen": 5955904, "step": 2565 }, { "epoch": 2.422243166823751, "grad_norm": 1.0831135511398315, "learning_rate": 3.0691258459698227e-06, "loss": 0.0789, "num_input_tokens_seen": 5967360, "step": 2570 }, { "epoch": 2.426955702167766, "grad_norm": 0.2694717049598694, "learning_rate": 3.0611129774271318e-06, "loss": 0.1948, "num_input_tokens_seen": 5980608, "step": 2575 }, { "epoch": 2.4316682375117815, "grad_norm": 0.017116645351052284, "learning_rate": 3.0530940341842883e-06, "loss": 0.0003, "num_input_tokens_seen": 5993472, "step": 2580 }, { "epoch": 2.4363807728557965, "grad_norm": 0.10097761452198029, "learning_rate": 3.045069103055672e-06, "loss": 0.0005, "num_input_tokens_seen": 6003520, "step": 2585 }, { "epoch": 2.4410933081998114, "grad_norm": 2.4190433025360107, "learning_rate": 3.037038270920489e-06, "loss": 0.0118, "num_input_tokens_seen": 6014720, "step": 2590 }, { "epoch": 2.445805843543827, "grad_norm": 0.22586967051029205, "learning_rate": 3.0290016247218323e-06, "loss": 0.0956, "num_input_tokens_seen": 6032192, "step": 2595 }, { "epoch": 2.4505183788878417, "grad_norm": 0.04388771951198578, "learning_rate": 3.0209592514657365e-06, "loss": 0.2412, "num_input_tokens_seen": 6043328, "step": 2600 }, { "epoch": 2.4552309142318567, "grad_norm": 27.575590133666992, "learning_rate": 3.012911238220241e-06, "loss": 0.0061, "num_input_tokens_seen": 6055424, "step": 2605 }, { "epoch": 2.4599434495758716, "grad_norm": 0.026035049930214882, "learning_rate": 3.004857672114443e-06, "loss": 0.2284, "num_input_tokens_seen": 6065472, "step": 2610 }, { "epoch": 2.464655984919887, "grad_norm": 1.065898060798645, "learning_rate": 2.996798640337556e-06, "loss": 0.0007, "num_input_tokens_seen": 6078016, "step": 2615 }, { "epoch": 2.469368520263902, "grad_norm": 17.635618209838867, "learning_rate": 2.9887342301379653e-06, "loss": 0.0974, "num_input_tokens_seen": 6089472, "step": 2620 }, { "epoch": 2.474081055607917, "grad_norm": 9.892471313476562, "learning_rate": 2.9806645288222854e-06, "loss": 0.1484, "num_input_tokens_seen": 6100992, "step": 2625 }, { "epoch": 2.4787935909519323, "grad_norm": 0.05071339011192322, "learning_rate": 2.9725896237544115e-06, "loss": 0.0821, "num_input_tokens_seen": 6112768, "step": 2630 }, { "epoch": 2.4835061262959472, "grad_norm": 0.19504772126674652, "learning_rate": 2.9645096023545774e-06, "loss": 0.0017, "num_input_tokens_seen": 6122752, "step": 2635 }, { "epoch": 2.488218661639962, "grad_norm": 0.5724993348121643, "learning_rate": 2.956424552098405e-06, "loss": 0.05, "num_input_tokens_seen": 6136256, "step": 2640 }, { "epoch": 2.492931196983977, "grad_norm": 0.17895296216011047, "learning_rate": 2.94833456051596e-06, "loss": 0.0714, "num_input_tokens_seen": 6147264, "step": 2645 }, { "epoch": 2.4976437323279925, "grad_norm": 0.440418004989624, "learning_rate": 2.9402397151908056e-06, "loss": 0.0012, "num_input_tokens_seen": 6161088, "step": 2650 }, { "epoch": 2.5023562676720075, "grad_norm": 0.04092998430132866, "learning_rate": 2.93214010375905e-06, "loss": 0.0567, "num_input_tokens_seen": 6173568, "step": 2655 }, { "epoch": 2.507068803016023, "grad_norm": 0.02762027271091938, "learning_rate": 2.924035813908402e-06, "loss": 0.0692, "num_input_tokens_seen": 6183872, "step": 2660 }, { "epoch": 2.507068803016023, "eval_loss": 0.40977799892425537, "eval_runtime": 2.7366, "eval_samples_per_second": 344.588, "eval_steps_per_second": 43.119, "num_input_tokens_seen": 6183872, "step": 2660 }, { "epoch": 2.511781338360038, "grad_norm": 0.12844966351985931, "learning_rate": 2.9159269333772173e-06, "loss": 0.0693, "num_input_tokens_seen": 6195648, "step": 2665 }, { "epoch": 2.5164938737040528, "grad_norm": 0.043845776468515396, "learning_rate": 2.9078135499535535e-06, "loss": 0.0003, "num_input_tokens_seen": 6205696, "step": 2670 }, { "epoch": 2.5212064090480677, "grad_norm": 0.1523384004831314, "learning_rate": 2.8996957514742164e-06, "loss": 0.0993, "num_input_tokens_seen": 6219648, "step": 2675 }, { "epoch": 2.525918944392083, "grad_norm": 0.03311268240213394, "learning_rate": 2.891573625823808e-06, "loss": 0.0016, "num_input_tokens_seen": 6233664, "step": 2680 }, { "epoch": 2.530631479736098, "grad_norm": 0.013956856913864613, "learning_rate": 2.883447260933781e-06, "loss": 0.0002, "num_input_tokens_seen": 6246400, "step": 2685 }, { "epoch": 2.535344015080113, "grad_norm": 0.00951316673308611, "learning_rate": 2.875316744781479e-06, "loss": 0.0776, "num_input_tokens_seen": 6256576, "step": 2690 }, { "epoch": 2.5400565504241284, "grad_norm": 18.4660587310791, "learning_rate": 2.8671821653891903e-06, "loss": 0.0909, "num_input_tokens_seen": 6266240, "step": 2695 }, { "epoch": 2.5447690857681433, "grad_norm": 14.168307304382324, "learning_rate": 2.85904361082319e-06, "loss": 0.1384, "num_input_tokens_seen": 6279872, "step": 2700 }, { "epoch": 2.5494816211121583, "grad_norm": 0.031402263790369034, "learning_rate": 2.8509011691927923e-06, "loss": 0.0001, "num_input_tokens_seen": 6290048, "step": 2705 }, { "epoch": 2.554194156456173, "grad_norm": 0.06130323186516762, "learning_rate": 2.8427549286493906e-06, "loss": 0.0368, "num_input_tokens_seen": 6301120, "step": 2710 }, { "epoch": 2.5589066918001886, "grad_norm": 0.13699810206890106, "learning_rate": 2.8346049773855077e-06, "loss": 0.1002, "num_input_tokens_seen": 6312512, "step": 2715 }, { "epoch": 2.5636192271442035, "grad_norm": 0.026166977360844612, "learning_rate": 2.8264514036338385e-06, "loss": 0.0002, "num_input_tokens_seen": 6323776, "step": 2720 }, { "epoch": 2.568331762488219, "grad_norm": 0.05908394604921341, "learning_rate": 2.818294295666295e-06, "loss": 0.0003, "num_input_tokens_seen": 6334208, "step": 2725 }, { "epoch": 2.573044297832234, "grad_norm": 1.3038756847381592, "learning_rate": 2.8101337417930523e-06, "loss": 0.0952, "num_input_tokens_seen": 6345216, "step": 2730 }, { "epoch": 2.577756833176249, "grad_norm": 39.35552215576172, "learning_rate": 2.8019698303615912e-06, "loss": 0.2239, "num_input_tokens_seen": 6354304, "step": 2735 }, { "epoch": 2.5824693685202638, "grad_norm": 0.07452750205993652, "learning_rate": 2.7938026497557414e-06, "loss": 0.0628, "num_input_tokens_seen": 6368192, "step": 2740 }, { "epoch": 2.5871819038642787, "grad_norm": 0.034812554717063904, "learning_rate": 2.7856322883947253e-06, "loss": 0.0454, "num_input_tokens_seen": 6382400, "step": 2745 }, { "epoch": 2.591894439208294, "grad_norm": 0.02569451369345188, "learning_rate": 2.7774588347322016e-06, "loss": 0.0836, "num_input_tokens_seen": 6395584, "step": 2750 }, { "epoch": 2.596606974552309, "grad_norm": 479.6157531738281, "learning_rate": 2.7692823772553057e-06, "loss": 0.1468, "num_input_tokens_seen": 6406720, "step": 2755 }, { "epoch": 2.6013195098963244, "grad_norm": 371.5435791015625, "learning_rate": 2.7611030044836927e-06, "loss": 0.1705, "num_input_tokens_seen": 6418112, "step": 2760 }, { "epoch": 2.6060320452403394, "grad_norm": 58.181087493896484, "learning_rate": 2.752920804968581e-06, "loss": 0.0602, "num_input_tokens_seen": 6431104, "step": 2765 }, { "epoch": 2.6107445805843543, "grad_norm": 3.1500463485717773, "learning_rate": 2.744735867291789e-06, "loss": 0.0038, "num_input_tokens_seen": 6441792, "step": 2770 }, { "epoch": 2.6154571159283693, "grad_norm": 0.12725692987442017, "learning_rate": 2.736548280064781e-06, "loss": 0.167, "num_input_tokens_seen": 6452672, "step": 2775 }, { "epoch": 2.6201696512723847, "grad_norm": 0.05792888626456261, "learning_rate": 2.728358131927704e-06, "loss": 0.1083, "num_input_tokens_seen": 6465600, "step": 2780 }, { "epoch": 2.6248821866163996, "grad_norm": 2.1484336853027344, "learning_rate": 2.720165511548433e-06, "loss": 0.0731, "num_input_tokens_seen": 6477312, "step": 2785 }, { "epoch": 2.6295947219604145, "grad_norm": 13.481400489807129, "learning_rate": 2.711970507621603e-06, "loss": 0.179, "num_input_tokens_seen": 6486592, "step": 2790 }, { "epoch": 2.63430725730443, "grad_norm": 0.2009446918964386, "learning_rate": 2.7037732088676583e-06, "loss": 0.0011, "num_input_tokens_seen": 6497088, "step": 2795 }, { "epoch": 2.639019792648445, "grad_norm": 0.11857722699642181, "learning_rate": 2.6955737040318853e-06, "loss": 0.0035, "num_input_tokens_seen": 6505984, "step": 2800 }, { "epoch": 2.64373232799246, "grad_norm": 0.26333293318748474, "learning_rate": 2.687372081883454e-06, "loss": 0.0009, "num_input_tokens_seen": 6516928, "step": 2805 }, { "epoch": 2.6484448633364748, "grad_norm": 0.025169173255562782, "learning_rate": 2.6791684312144565e-06, "loss": 0.0096, "num_input_tokens_seen": 6527424, "step": 2810 }, { "epoch": 2.65315739868049, "grad_norm": 0.04603775218129158, "learning_rate": 2.670962840838946e-06, "loss": 0.0955, "num_input_tokens_seen": 6538432, "step": 2815 }, { "epoch": 2.657869934024505, "grad_norm": 116.4062271118164, "learning_rate": 2.6627553995919763e-06, "loss": 0.0341, "num_input_tokens_seen": 6551552, "step": 2820 }, { "epoch": 2.6625824693685205, "grad_norm": 0.07408913224935532, "learning_rate": 2.6545461963286374e-06, "loss": 0.0005, "num_input_tokens_seen": 6566208, "step": 2825 }, { "epoch": 2.6672950047125354, "grad_norm": 67.10398864746094, "learning_rate": 2.646335319923097e-06, "loss": 0.1887, "num_input_tokens_seen": 6577472, "step": 2830 }, { "epoch": 2.6720075400565504, "grad_norm": 0.03928399085998535, "learning_rate": 2.6381228592676343e-06, "loss": 0.1243, "num_input_tokens_seen": 6588608, "step": 2835 }, { "epoch": 2.6767200754005653, "grad_norm": 0.28905507922172546, "learning_rate": 2.629908903271683e-06, "loss": 0.1048, "num_input_tokens_seen": 6601088, "step": 2840 }, { "epoch": 2.6814326107445807, "grad_norm": 0.06444710493087769, "learning_rate": 2.6216935408608617e-06, "loss": 0.0005, "num_input_tokens_seen": 6611392, "step": 2845 }, { "epoch": 2.6861451460885957, "grad_norm": 14.772208213806152, "learning_rate": 2.6134768609760187e-06, "loss": 0.001, "num_input_tokens_seen": 6622656, "step": 2850 }, { "epoch": 2.6908576814326106, "grad_norm": 222.56637573242188, "learning_rate": 2.605258952572263e-06, "loss": 0.0916, "num_input_tokens_seen": 6635264, "step": 2855 }, { "epoch": 2.695570216776626, "grad_norm": 38.73012924194336, "learning_rate": 2.5970399046180043e-06, "loss": 0.0028, "num_input_tokens_seen": 6647680, "step": 2860 }, { "epoch": 2.700282752120641, "grad_norm": 0.014610473066568375, "learning_rate": 2.588819806093991e-06, "loss": 0.0001, "num_input_tokens_seen": 6662016, "step": 2865 }, { "epoch": 2.704995287464656, "grad_norm": 0.013276712968945503, "learning_rate": 2.580598745992342e-06, "loss": 0.1805, "num_input_tokens_seen": 6673024, "step": 2870 }, { "epoch": 2.709707822808671, "grad_norm": 0.011643171310424805, "learning_rate": 2.5723768133155894e-06, "loss": 0.0001, "num_input_tokens_seen": 6684416, "step": 2875 }, { "epoch": 2.7144203581526862, "grad_norm": 0.04866794869303703, "learning_rate": 2.5641540970757105e-06, "loss": 0.0783, "num_input_tokens_seen": 6696448, "step": 2880 }, { "epoch": 2.719132893496701, "grad_norm": 0.007833253592252731, "learning_rate": 2.555930686293165e-06, "loss": 0.0002, "num_input_tokens_seen": 6710528, "step": 2885 }, { "epoch": 2.7238454288407166, "grad_norm": 0.03470620512962341, "learning_rate": 2.547706669995933e-06, "loss": 0.0004, "num_input_tokens_seen": 6722176, "step": 2890 }, { "epoch": 2.7285579641847315, "grad_norm": 31.088220596313477, "learning_rate": 2.53948213721855e-06, "loss": 0.1775, "num_input_tokens_seen": 6732416, "step": 2895 }, { "epoch": 2.7332704995287465, "grad_norm": 0.12457949668169022, "learning_rate": 2.531257177001141e-06, "loss": 0.1137, "num_input_tokens_seen": 6745728, "step": 2900 }, { "epoch": 2.7379830348727614, "grad_norm": 11.16380786895752, "learning_rate": 2.523031878388463e-06, "loss": 0.0956, "num_input_tokens_seen": 6756096, "step": 2905 }, { "epoch": 2.742695570216777, "grad_norm": 1.0219601392745972, "learning_rate": 2.5148063304289306e-06, "loss": 0.063, "num_input_tokens_seen": 6766976, "step": 2910 }, { "epoch": 2.7474081055607917, "grad_norm": 0.03135580196976662, "learning_rate": 2.5065806221736617e-06, "loss": 0.1039, "num_input_tokens_seen": 6777792, "step": 2915 }, { "epoch": 2.7521206409048067, "grad_norm": 0.06795566529035568, "learning_rate": 2.4983548426755104e-06, "loss": 0.0003, "num_input_tokens_seen": 6789568, "step": 2920 }, { "epoch": 2.756833176248822, "grad_norm": 0.024466486647725105, "learning_rate": 2.4901290809880984e-06, "loss": 0.227, "num_input_tokens_seen": 6803392, "step": 2925 }, { "epoch": 2.757775683317625, "eval_loss": 0.43027257919311523, "eval_runtime": 2.7751, "eval_samples_per_second": 339.806, "eval_steps_per_second": 42.521, "num_input_tokens_seen": 6806208, "step": 2926 }, { "epoch": 2.761545711592837, "grad_norm": 0.08801653981208801, "learning_rate": 2.4819034261648574e-06, "loss": 0.0645, "num_input_tokens_seen": 6821760, "step": 2930 }, { "epoch": 2.766258246936852, "grad_norm": 0.02651871182024479, "learning_rate": 2.4736779672580625e-06, "loss": 0.2084, "num_input_tokens_seen": 6834688, "step": 2935 }, { "epoch": 2.770970782280867, "grad_norm": 0.9018564224243164, "learning_rate": 2.465452793317865e-06, "loss": 0.0731, "num_input_tokens_seen": 6846784, "step": 2940 }, { "epoch": 2.7756833176248823, "grad_norm": 0.10618780553340912, "learning_rate": 2.457227993391333e-06, "loss": 0.0866, "num_input_tokens_seen": 6859520, "step": 2945 }, { "epoch": 2.7803958529688972, "grad_norm": 0.10207852721214294, "learning_rate": 2.4490036565214876e-06, "loss": 0.0008, "num_input_tokens_seen": 6871296, "step": 2950 }, { "epoch": 2.785108388312912, "grad_norm": 125.38693237304688, "learning_rate": 2.440779871746331e-06, "loss": 0.0151, "num_input_tokens_seen": 6882496, "step": 2955 }, { "epoch": 2.7898209236569276, "grad_norm": 0.05238529294729233, "learning_rate": 2.4325567280978937e-06, "loss": 0.0708, "num_input_tokens_seen": 6894528, "step": 2960 }, { "epoch": 2.7945334590009425, "grad_norm": 56.63581848144531, "learning_rate": 2.424334314601263e-06, "loss": 0.1738, "num_input_tokens_seen": 6904960, "step": 2965 }, { "epoch": 2.7992459943449575, "grad_norm": 0.020289117470383644, "learning_rate": 2.416112720273623e-06, "loss": 0.155, "num_input_tokens_seen": 6914944, "step": 2970 }, { "epoch": 2.8039585296889724, "grad_norm": 0.2439601868391037, "learning_rate": 2.4078920341232856e-06, "loss": 0.0006, "num_input_tokens_seen": 6926080, "step": 2975 }, { "epoch": 2.808671065032988, "grad_norm": 22.042722702026367, "learning_rate": 2.3996723451487344e-06, "loss": 0.0028, "num_input_tokens_seen": 6936832, "step": 2980 }, { "epoch": 2.8133836003770027, "grad_norm": 0.06483375281095505, "learning_rate": 2.391453742337657e-06, "loss": 0.2284, "num_input_tokens_seen": 6948160, "step": 2985 }, { "epoch": 2.818096135721018, "grad_norm": 0.019181005656719208, "learning_rate": 2.3832363146659806e-06, "loss": 0.0003, "num_input_tokens_seen": 6958848, "step": 2990 }, { "epoch": 2.822808671065033, "grad_norm": 265.2681579589844, "learning_rate": 2.37502015109691e-06, "loss": 0.1133, "num_input_tokens_seen": 6970432, "step": 2995 }, { "epoch": 2.827521206409048, "grad_norm": 0.018625818192958832, "learning_rate": 2.3668053405799667e-06, "loss": 0.0691, "num_input_tokens_seen": 6980480, "step": 3000 }, { "epoch": 2.832233741753063, "grad_norm": 241.9073486328125, "learning_rate": 2.3585919720500214e-06, "loss": 0.0368, "num_input_tokens_seen": 6989760, "step": 3005 }, { "epoch": 2.8369462770970784, "grad_norm": 0.006676084361970425, "learning_rate": 2.3503801344263347e-06, "loss": 0.093, "num_input_tokens_seen": 6999232, "step": 3010 }, { "epoch": 2.8416588124410933, "grad_norm": 97.66014862060547, "learning_rate": 2.3421699166115946e-06, "loss": 0.2148, "num_input_tokens_seen": 7010944, "step": 3015 }, { "epoch": 2.8463713477851083, "grad_norm": 0.0063159572891891, "learning_rate": 2.3339614074909495e-06, "loss": 0.1475, "num_input_tokens_seen": 7021824, "step": 3020 }, { "epoch": 2.8510838831291236, "grad_norm": 0.022584695369005203, "learning_rate": 2.325754695931054e-06, "loss": 0.1085, "num_input_tokens_seen": 7031488, "step": 3025 }, { "epoch": 2.8557964184731386, "grad_norm": 34.398460388183594, "learning_rate": 2.3175498707790964e-06, "loss": 0.0536, "num_input_tokens_seen": 7041088, "step": 3030 }, { "epoch": 2.8605089538171535, "grad_norm": 48.400482177734375, "learning_rate": 2.3093470208618467e-06, "loss": 0.1759, "num_input_tokens_seen": 7051840, "step": 3035 }, { "epoch": 2.8652214891611685, "grad_norm": 0.9638648629188538, "learning_rate": 2.3011462349846907e-06, "loss": 0.0005, "num_input_tokens_seen": 7062848, "step": 3040 }, { "epoch": 2.869934024505184, "grad_norm": 2.5327022075653076, "learning_rate": 2.292947601930664e-06, "loss": 0.0006, "num_input_tokens_seen": 7079296, "step": 3045 }, { "epoch": 2.874646559849199, "grad_norm": 27.3743896484375, "learning_rate": 2.2847512104595005e-06, "loss": 0.1614, "num_input_tokens_seen": 7090752, "step": 3050 }, { "epoch": 2.879359095193214, "grad_norm": 0.02387884445488453, "learning_rate": 2.2765571493066647e-06, "loss": 0.0003, "num_input_tokens_seen": 7102464, "step": 3055 }, { "epoch": 2.884071630537229, "grad_norm": 0.011423971503973007, "learning_rate": 2.2683655071823925e-06, "loss": 0.038, "num_input_tokens_seen": 7117376, "step": 3060 }, { "epoch": 2.888784165881244, "grad_norm": 0.048735495656728745, "learning_rate": 2.2601763727707295e-06, "loss": 0.0809, "num_input_tokens_seen": 7131584, "step": 3065 }, { "epoch": 2.893496701225259, "grad_norm": 0.3432343602180481, "learning_rate": 2.2519898347285745e-06, "loss": 0.1831, "num_input_tokens_seen": 7142720, "step": 3070 }, { "epoch": 2.8982092365692744, "grad_norm": 48.5273323059082, "learning_rate": 2.2438059816847165e-06, "loss": 0.1239, "num_input_tokens_seen": 7155520, "step": 3075 }, { "epoch": 2.9029217719132894, "grad_norm": 0.019747210666537285, "learning_rate": 2.235624902238879e-06, "loss": 0.0753, "num_input_tokens_seen": 7165504, "step": 3080 }, { "epoch": 2.9076343072573043, "grad_norm": 87.9238052368164, "learning_rate": 2.2274466849607526e-06, "loss": 0.118, "num_input_tokens_seen": 7176384, "step": 3085 }, { "epoch": 2.9123468426013197, "grad_norm": 0.024710891768336296, "learning_rate": 2.219271418389046e-06, "loss": 0.0012, "num_input_tokens_seen": 7188288, "step": 3090 }, { "epoch": 2.9170593779453347, "grad_norm": 48.7574577331543, "learning_rate": 2.2110991910305233e-06, "loss": 0.1523, "num_input_tokens_seen": 7199680, "step": 3095 }, { "epoch": 2.9217719132893496, "grad_norm": 24.760589599609375, "learning_rate": 2.2029300913590413e-06, "loss": 0.0548, "num_input_tokens_seen": 7211520, "step": 3100 }, { "epoch": 2.9264844486333645, "grad_norm": 0.13239729404449463, "learning_rate": 2.1947642078146005e-06, "loss": 0.0932, "num_input_tokens_seen": 7221440, "step": 3105 }, { "epoch": 2.93119698397738, "grad_norm": 0.09687553346157074, "learning_rate": 2.1866016288023815e-06, "loss": 0.0528, "num_input_tokens_seen": 7232128, "step": 3110 }, { "epoch": 2.935909519321395, "grad_norm": 23.3001651763916, "learning_rate": 2.178442442691789e-06, "loss": 0.1414, "num_input_tokens_seen": 7241984, "step": 3115 }, { "epoch": 2.9406220546654103, "grad_norm": 0.11602967977523804, "learning_rate": 2.170286737815495e-06, "loss": 0.0745, "num_input_tokens_seen": 7252672, "step": 3120 }, { "epoch": 2.945334590009425, "grad_norm": 0.5076259970664978, "learning_rate": 2.1621346024684854e-06, "loss": 0.0453, "num_input_tokens_seen": 7264064, "step": 3125 }, { "epoch": 2.95004712535344, "grad_norm": 2.5609045028686523, "learning_rate": 2.1539861249071004e-06, "loss": 0.0268, "num_input_tokens_seen": 7275776, "step": 3130 }, { "epoch": 2.954759660697455, "grad_norm": 1.9134023189544678, "learning_rate": 2.145841393348079e-06, "loss": 0.0361, "num_input_tokens_seen": 7287680, "step": 3135 }, { "epoch": 2.95947219604147, "grad_norm": 0.09775304049253464, "learning_rate": 2.1377004959676086e-06, "loss": 0.001, "num_input_tokens_seen": 7300032, "step": 3140 }, { "epoch": 2.9641847313854854, "grad_norm": 1.4209673404693604, "learning_rate": 2.129563520900364e-06, "loss": 0.0632, "num_input_tokens_seen": 7311616, "step": 3145 }, { "epoch": 2.9688972667295004, "grad_norm": 0.10756014287471771, "learning_rate": 2.1214305562385592e-06, "loss": 0.1604, "num_input_tokens_seen": 7321600, "step": 3150 }, { "epoch": 2.9736098020735158, "grad_norm": 0.026215003803372383, "learning_rate": 2.1133016900309876e-06, "loss": 0.0003, "num_input_tokens_seen": 7333376, "step": 3155 }, { "epoch": 2.9783223374175307, "grad_norm": 0.01626676134765148, "learning_rate": 2.1051770102820755e-06, "loss": 0.0002, "num_input_tokens_seen": 7344384, "step": 3160 }, { "epoch": 2.9830348727615457, "grad_norm": 0.2678651511669159, "learning_rate": 2.0970566049509236e-06, "loss": 0.0799, "num_input_tokens_seen": 7355840, "step": 3165 }, { "epoch": 2.9877474081055606, "grad_norm": 0.012307991273701191, "learning_rate": 2.088940561950359e-06, "loss": 0.0002, "num_input_tokens_seen": 7368128, "step": 3170 }, { "epoch": 2.992459943449576, "grad_norm": 0.03959092125296593, "learning_rate": 2.080828969145979e-06, "loss": 0.1426, "num_input_tokens_seen": 7381056, "step": 3175 }, { "epoch": 2.997172478793591, "grad_norm": 16.04501724243164, "learning_rate": 2.0727219143552034e-06, "loss": 0.094, "num_input_tokens_seen": 7393536, "step": 3180 }, { "epoch": 3.001885014137606, "grad_norm": 0.013006187044084072, "learning_rate": 2.0646194853463255e-06, "loss": 0.0923, "num_input_tokens_seen": 7402656, "step": 3185 }, { "epoch": 3.0065975494816213, "grad_norm": 0.05889980494976044, "learning_rate": 2.056521769837553e-06, "loss": 0.0004, "num_input_tokens_seen": 7416480, "step": 3190 }, { "epoch": 3.008482563619227, "eval_loss": 0.3936729431152344, "eval_runtime": 2.7505, "eval_samples_per_second": 342.849, "eval_steps_per_second": 42.902, "num_input_tokens_seen": 7421856, "step": 3192 }, { "epoch": 3.0113100848256362, "grad_norm": 0.11007480323314667, "learning_rate": 2.0484288554960707e-06, "loss": 0.0003, "num_input_tokens_seen": 7430304, "step": 3195 }, { "epoch": 3.016022620169651, "grad_norm": 0.023877175524830818, "learning_rate": 2.040340829937082e-06, "loss": 0.052, "num_input_tokens_seen": 7441568, "step": 3200 }, { "epoch": 3.0207351555136666, "grad_norm": 0.016011981293559074, "learning_rate": 2.032257780722865e-06, "loss": 0.0003, "num_input_tokens_seen": 7451744, "step": 3205 }, { "epoch": 3.0254476908576815, "grad_norm": 0.02110173925757408, "learning_rate": 2.0241797953618204e-06, "loss": 0.0002, "num_input_tokens_seen": 7463008, "step": 3210 }, { "epoch": 3.0301602262016964, "grad_norm": 0.018056534230709076, "learning_rate": 2.0161069613075295e-06, "loss": 0.0001, "num_input_tokens_seen": 7475424, "step": 3215 }, { "epoch": 3.0348727615457114, "grad_norm": 0.018598072230815887, "learning_rate": 2.008039365957804e-06, "loss": 0.0002, "num_input_tokens_seen": 7486368, "step": 3220 }, { "epoch": 3.039585296889727, "grad_norm": 2.553637981414795, "learning_rate": 1.9999770966537416e-06, "loss": 0.0005, "num_input_tokens_seen": 7497312, "step": 3225 }, { "epoch": 3.0442978322337417, "grad_norm": 81.77398681640625, "learning_rate": 1.991920240678776e-06, "loss": 0.0457, "num_input_tokens_seen": 7507552, "step": 3230 }, { "epoch": 3.0490103675777567, "grad_norm": 0.0028707189485430717, "learning_rate": 1.983868885257739e-06, "loss": 0.0001, "num_input_tokens_seen": 7519008, "step": 3235 }, { "epoch": 3.053722902921772, "grad_norm": 25.919538497924805, "learning_rate": 1.97582311755591e-06, "loss": 0.0908, "num_input_tokens_seen": 7530400, "step": 3240 }, { "epoch": 3.058435438265787, "grad_norm": 0.009589405730366707, "learning_rate": 1.9677830246780764e-06, "loss": 0.0002, "num_input_tokens_seen": 7544096, "step": 3245 }, { "epoch": 3.063147973609802, "grad_norm": 0.04477664828300476, "learning_rate": 1.9597486936675886e-06, "loss": 0.0044, "num_input_tokens_seen": 7554784, "step": 3250 }, { "epoch": 3.0678605089538173, "grad_norm": 0.0675143375992775, "learning_rate": 1.9517202115054174e-06, "loss": 0.0001, "num_input_tokens_seen": 7567392, "step": 3255 }, { "epoch": 3.0725730442978323, "grad_norm": 0.010941299609839916, "learning_rate": 1.9436976651092143e-06, "loss": 0.0001, "num_input_tokens_seen": 7578016, "step": 3260 }, { "epoch": 3.0772855796418472, "grad_norm": 0.00475684879347682, "learning_rate": 1.9356811413323686e-06, "loss": 0.0689, "num_input_tokens_seen": 7589728, "step": 3265 }, { "epoch": 3.081998114985862, "grad_norm": 0.009347557090222836, "learning_rate": 1.9276707269630664e-06, "loss": 0.0006, "num_input_tokens_seen": 7601184, "step": 3270 }, { "epoch": 3.0867106503298776, "grad_norm": 0.02251746505498886, "learning_rate": 1.9196665087233548e-06, "loss": 0.0001, "num_input_tokens_seen": 7612128, "step": 3275 }, { "epoch": 3.0914231856738925, "grad_norm": 0.006476939655840397, "learning_rate": 1.9116685732681995e-06, "loss": 0.0004, "num_input_tokens_seen": 7623776, "step": 3280 }, { "epoch": 3.0961357210179075, "grad_norm": 0.008666305802762508, "learning_rate": 1.9036770071845467e-06, "loss": 0.0001, "num_input_tokens_seen": 7636128, "step": 3285 }, { "epoch": 3.100848256361923, "grad_norm": 0.07765082269906998, "learning_rate": 1.8956918969903881e-06, "loss": 0.0002, "num_input_tokens_seen": 7646432, "step": 3290 }, { "epoch": 3.105560791705938, "grad_norm": 0.006188265047967434, "learning_rate": 1.887713329133824e-06, "loss": 0.0, "num_input_tokens_seen": 7657824, "step": 3295 }, { "epoch": 3.1102733270499527, "grad_norm": 28.873966217041016, "learning_rate": 1.8797413899921224e-06, "loss": 0.0829, "num_input_tokens_seen": 7669920, "step": 3300 }, { "epoch": 3.114985862393968, "grad_norm": 34.75840759277344, "learning_rate": 1.8717761658707916e-06, "loss": 0.0054, "num_input_tokens_seen": 7681952, "step": 3305 }, { "epoch": 3.119698397737983, "grad_norm": 0.005098584573715925, "learning_rate": 1.86381774300264e-06, "loss": 0.0, "num_input_tokens_seen": 7692832, "step": 3310 }, { "epoch": 3.124410933081998, "grad_norm": 0.012214220128953457, "learning_rate": 1.8558662075468468e-06, "loss": 0.1029, "num_input_tokens_seen": 7703072, "step": 3315 }, { "epoch": 3.1291234684260134, "grad_norm": 0.05773286893963814, "learning_rate": 1.8479216455880225e-06, "loss": 0.0, "num_input_tokens_seen": 7714016, "step": 3320 }, { "epoch": 3.1338360037700284, "grad_norm": 0.0058107743971049786, "learning_rate": 1.8399841431352855e-06, "loss": 0.0002, "num_input_tokens_seen": 7726688, "step": 3325 }, { "epoch": 3.1385485391140433, "grad_norm": 0.11002416163682938, "learning_rate": 1.8320537861213267e-06, "loss": 0.0001, "num_input_tokens_seen": 7739680, "step": 3330 }, { "epoch": 3.1432610744580582, "grad_norm": 0.04805764928460121, "learning_rate": 1.8241306604014761e-06, "loss": 0.0001, "num_input_tokens_seen": 7749024, "step": 3335 }, { "epoch": 3.1479736098020736, "grad_norm": 0.001311355852521956, "learning_rate": 1.816214851752779e-06, "loss": 0.0008, "num_input_tokens_seen": 7761568, "step": 3340 }, { "epoch": 3.1526861451460886, "grad_norm": 0.013387499377131462, "learning_rate": 1.8083064458730651e-06, "loss": 0.0001, "num_input_tokens_seen": 7772640, "step": 3345 }, { "epoch": 3.1573986804901035, "grad_norm": 0.017248639836907387, "learning_rate": 1.8004055283800204e-06, "loss": 0.0004, "num_input_tokens_seen": 7784672, "step": 3350 }, { "epoch": 3.162111215834119, "grad_norm": 0.0028156836051493883, "learning_rate": 1.7925121848102583e-06, "loss": 0.0, "num_input_tokens_seen": 7795872, "step": 3355 }, { "epoch": 3.166823751178134, "grad_norm": 0.005071389954537153, "learning_rate": 1.7846265006183976e-06, "loss": 0.0, "num_input_tokens_seen": 7808416, "step": 3360 }, { "epoch": 3.171536286522149, "grad_norm": 0.013269501738250256, "learning_rate": 1.776748561176137e-06, "loss": 0.0, "num_input_tokens_seen": 7820640, "step": 3365 }, { "epoch": 3.176248821866164, "grad_norm": 0.010563570074737072, "learning_rate": 1.7688784517713247e-06, "loss": 0.0, "num_input_tokens_seen": 7831072, "step": 3370 }, { "epoch": 3.180961357210179, "grad_norm": 29.668603897094727, "learning_rate": 1.761016257607044e-06, "loss": 0.0969, "num_input_tokens_seen": 7841888, "step": 3375 }, { "epoch": 3.185673892554194, "grad_norm": 0.07287805527448654, "learning_rate": 1.7531620638006834e-06, "loss": 0.0488, "num_input_tokens_seen": 7852896, "step": 3380 }, { "epoch": 3.190386427898209, "grad_norm": 27.53697395324707, "learning_rate": 1.7453159553830217e-06, "loss": 0.0013, "num_input_tokens_seen": 7868384, "step": 3385 }, { "epoch": 3.1950989632422244, "grad_norm": 0.006890235003083944, "learning_rate": 1.7374780172973004e-06, "loss": 0.0001, "num_input_tokens_seen": 7881312, "step": 3390 }, { "epoch": 3.1998114985862394, "grad_norm": 0.0019902060739696026, "learning_rate": 1.7296483343983095e-06, "loss": 0.0564, "num_input_tokens_seen": 7892128, "step": 3395 }, { "epoch": 3.2045240339302543, "grad_norm": 0.020912524312734604, "learning_rate": 1.7218269914514668e-06, "loss": 0.0002, "num_input_tokens_seen": 7902624, "step": 3400 }, { "epoch": 3.2092365692742697, "grad_norm": 0.007507129572331905, "learning_rate": 1.714014073131901e-06, "loss": 0.0001, "num_input_tokens_seen": 7915168, "step": 3405 }, { "epoch": 3.2139491046182846, "grad_norm": 0.053086619824171066, "learning_rate": 1.7062096640235327e-06, "loss": 0.0002, "num_input_tokens_seen": 7925472, "step": 3410 }, { "epoch": 3.2186616399622996, "grad_norm": 0.1808883100748062, "learning_rate": 1.6984138486181612e-06, "loss": 0.0001, "num_input_tokens_seen": 7940576, "step": 3415 }, { "epoch": 3.223374175306315, "grad_norm": 0.015256045386195183, "learning_rate": 1.6906267113145514e-06, "loss": 0.0323, "num_input_tokens_seen": 7956064, "step": 3420 }, { "epoch": 3.22808671065033, "grad_norm": 0.022556964308023453, "learning_rate": 1.6828483364175127e-06, "loss": 0.0, "num_input_tokens_seen": 7967264, "step": 3425 }, { "epoch": 3.232799245994345, "grad_norm": 1.4760725498199463, "learning_rate": 1.6750788081369951e-06, "loss": 0.0003, "num_input_tokens_seen": 7978144, "step": 3430 }, { "epoch": 3.23751178133836, "grad_norm": 18.443159103393555, "learning_rate": 1.6673182105871733e-06, "loss": 0.0443, "num_input_tokens_seen": 7989152, "step": 3435 }, { "epoch": 3.242224316682375, "grad_norm": 0.005763629917055368, "learning_rate": 1.659566627785536e-06, "loss": 0.0, "num_input_tokens_seen": 8000800, "step": 3440 }, { "epoch": 3.24693685202639, "grad_norm": 0.0024256331380456686, "learning_rate": 1.651824143651975e-06, "loss": 0.0004, "num_input_tokens_seen": 8014816, "step": 3445 }, { "epoch": 3.251649387370405, "grad_norm": 0.003130377735942602, "learning_rate": 1.644090842007881e-06, "loss": 0.0, "num_input_tokens_seen": 8025120, "step": 3450 }, { "epoch": 3.2563619227144205, "grad_norm": 0.02095922827720642, "learning_rate": 1.6363668065752336e-06, "loss": 0.0, "num_input_tokens_seen": 8037344, "step": 3455 }, { "epoch": 3.2591894439208295, "eval_loss": 0.5191035270690918, "eval_runtime": 2.7496, "eval_samples_per_second": 342.955, "eval_steps_per_second": 42.915, "num_input_tokens_seen": 8043744, "step": 3458 }, { "epoch": 3.2610744580584354, "grad_norm": 0.0036951308138668537, "learning_rate": 1.6286521209756917e-06, "loss": 0.0875, "num_input_tokens_seen": 8048096, "step": 3460 }, { "epoch": 3.2657869934024504, "grad_norm": 0.004653128329664469, "learning_rate": 1.6209468687296947e-06, "loss": 0.0, "num_input_tokens_seen": 8061344, "step": 3465 }, { "epoch": 3.2704995287464658, "grad_norm": 0.004204562399536371, "learning_rate": 1.613251133255554e-06, "loss": 0.0, "num_input_tokens_seen": 8073184, "step": 3470 }, { "epoch": 3.2752120640904807, "grad_norm": 0.005397483240813017, "learning_rate": 1.6055649978685517e-06, "loss": 0.0, "num_input_tokens_seen": 8082976, "step": 3475 }, { "epoch": 3.2799245994344957, "grad_norm": 0.01232027355581522, "learning_rate": 1.5978885457800348e-06, "loss": 0.0, "num_input_tokens_seen": 8094624, "step": 3480 }, { "epoch": 3.284637134778511, "grad_norm": 57.04811096191406, "learning_rate": 1.59022186009652e-06, "loss": 0.0843, "num_input_tokens_seen": 8104928, "step": 3485 }, { "epoch": 3.289349670122526, "grad_norm": 0.009285739623010159, "learning_rate": 1.5825650238187918e-06, "loss": 0.0, "num_input_tokens_seen": 8116896, "step": 3490 }, { "epoch": 3.294062205466541, "grad_norm": 0.009216835722327232, "learning_rate": 1.5749181198410014e-06, "loss": 0.0875, "num_input_tokens_seen": 8127968, "step": 3495 }, { "epoch": 3.298774740810556, "grad_norm": 0.00624003866687417, "learning_rate": 1.5672812309497722e-06, "loss": 0.0326, "num_input_tokens_seen": 8139936, "step": 3500 }, { "epoch": 3.3034872761545713, "grad_norm": 0.014653928577899933, "learning_rate": 1.5596544398233028e-06, "loss": 0.0001, "num_input_tokens_seen": 8151392, "step": 3505 }, { "epoch": 3.308199811498586, "grad_norm": 0.00760252121835947, "learning_rate": 1.5520378290304723e-06, "loss": 0.0, "num_input_tokens_seen": 8165280, "step": 3510 }, { "epoch": 3.312912346842601, "grad_norm": 0.037401266396045685, "learning_rate": 1.544431481029944e-06, "loss": 0.0, "num_input_tokens_seen": 8177696, "step": 3515 }, { "epoch": 3.3176248821866166, "grad_norm": 0.004210233688354492, "learning_rate": 1.5368354781692764e-06, "loss": 0.0, "num_input_tokens_seen": 8189280, "step": 3520 }, { "epoch": 3.3223374175306315, "grad_norm": 0.08442122489213943, "learning_rate": 1.5292499026840292e-06, "loss": 0.0001, "num_input_tokens_seen": 8202784, "step": 3525 }, { "epoch": 3.3270499528746464, "grad_norm": 0.0204016100615263, "learning_rate": 1.5216748366968743e-06, "loss": 0.1032, "num_input_tokens_seen": 8216032, "step": 3530 }, { "epoch": 3.331762488218662, "grad_norm": 0.002625198569148779, "learning_rate": 1.5141103622167042e-06, "loss": 0.0001, "num_input_tokens_seen": 8228320, "step": 3535 }, { "epoch": 3.336475023562677, "grad_norm": 0.00422442564740777, "learning_rate": 1.5065565611377472e-06, "loss": 0.0487, "num_input_tokens_seen": 8240416, "step": 3540 }, { "epoch": 3.3411875589066917, "grad_norm": 0.001890690764412284, "learning_rate": 1.4990135152386814e-06, "loss": 0.0, "num_input_tokens_seen": 8252640, "step": 3545 }, { "epoch": 3.345900094250707, "grad_norm": 0.04529090225696564, "learning_rate": 1.4914813061817434e-06, "loss": 0.0001, "num_input_tokens_seen": 8261984, "step": 3550 }, { "epoch": 3.350612629594722, "grad_norm": 0.004377726465463638, "learning_rate": 1.4839600155118525e-06, "loss": 0.0036, "num_input_tokens_seen": 8273568, "step": 3555 }, { "epoch": 3.355325164938737, "grad_norm": 0.003906300291419029, "learning_rate": 1.4764497246557214e-06, "loss": 0.0001, "num_input_tokens_seen": 8285472, "step": 3560 }, { "epoch": 3.360037700282752, "grad_norm": 5.829066753387451, "learning_rate": 1.4689505149209788e-06, "loss": 0.0008, "num_input_tokens_seen": 8294816, "step": 3565 }, { "epoch": 3.3647502356267673, "grad_norm": 0.004101856611669064, "learning_rate": 1.4614624674952843e-06, "loss": 0.0, "num_input_tokens_seen": 8305504, "step": 3570 }, { "epoch": 3.3694627709707823, "grad_norm": 0.002960493555292487, "learning_rate": 1.4539856634454558e-06, "loss": 0.0518, "num_input_tokens_seen": 8316320, "step": 3575 }, { "epoch": 3.3741753063147972, "grad_norm": 0.004375193268060684, "learning_rate": 1.4465201837165876e-06, "loss": 0.0384, "num_input_tokens_seen": 8327200, "step": 3580 }, { "epoch": 3.3788878416588126, "grad_norm": 0.0019584076944738626, "learning_rate": 1.4390661091311742e-06, "loss": 0.0, "num_input_tokens_seen": 8339488, "step": 3585 }, { "epoch": 3.3836003770028276, "grad_norm": 0.12523356080055237, "learning_rate": 1.4316235203882373e-06, "loss": 0.0642, "num_input_tokens_seen": 8353120, "step": 3590 }, { "epoch": 3.3883129123468425, "grad_norm": 0.002279081614688039, "learning_rate": 1.4241924980624485e-06, "loss": 0.0, "num_input_tokens_seen": 8364768, "step": 3595 }, { "epoch": 3.3930254476908575, "grad_norm": 7.863749027252197, "learning_rate": 1.4167731226032656e-06, "loss": 0.0029, "num_input_tokens_seen": 8376480, "step": 3600 }, { "epoch": 3.397737983034873, "grad_norm": 0.0011470771860331297, "learning_rate": 1.4093654743340462e-06, "loss": 0.0122, "num_input_tokens_seen": 8386784, "step": 3605 }, { "epoch": 3.402450518378888, "grad_norm": 0.001517343451268971, "learning_rate": 1.4019696334511962e-06, "loss": 0.0, "num_input_tokens_seen": 8397984, "step": 3610 }, { "epoch": 3.4071630537229027, "grad_norm": 0.005868109408766031, "learning_rate": 1.3945856800232874e-06, "loss": 0.0, "num_input_tokens_seen": 8408544, "step": 3615 }, { "epoch": 3.411875589066918, "grad_norm": 0.0029909429140388966, "learning_rate": 1.3872136939902004e-06, "loss": 0.0, "num_input_tokens_seen": 8419552, "step": 3620 }, { "epoch": 3.416588124410933, "grad_norm": 0.4216376841068268, "learning_rate": 1.379853755162249e-06, "loss": 0.0001, "num_input_tokens_seen": 8429664, "step": 3625 }, { "epoch": 3.421300659754948, "grad_norm": 0.0006833241204731166, "learning_rate": 1.3725059432193278e-06, "loss": 0.0, "num_input_tokens_seen": 8441376, "step": 3630 }, { "epoch": 3.4260131950989634, "grad_norm": 0.0014978962717577815, "learning_rate": 1.3651703377100406e-06, "loss": 0.0, "num_input_tokens_seen": 8452896, "step": 3635 }, { "epoch": 3.4307257304429783, "grad_norm": 0.0020364606752991676, "learning_rate": 1.3578470180508432e-06, "loss": 0.0, "num_input_tokens_seen": 8463328, "step": 3640 }, { "epoch": 3.4354382657869933, "grad_norm": 0.003118757624179125, "learning_rate": 1.3505360635251813e-06, "loss": 0.0, "num_input_tokens_seen": 8475808, "step": 3645 }, { "epoch": 3.4401508011310087, "grad_norm": 0.002336150733754039, "learning_rate": 1.3432375532826374e-06, "loss": 0.0122, "num_input_tokens_seen": 8487456, "step": 3650 }, { "epoch": 3.4448633364750236, "grad_norm": 0.04974250867962837, "learning_rate": 1.3359515663380668e-06, "loss": 0.0, "num_input_tokens_seen": 8503712, "step": 3655 }, { "epoch": 3.4495758718190386, "grad_norm": 0.069328673183918, "learning_rate": 1.3286781815707465e-06, "loss": 0.2188, "num_input_tokens_seen": 8514848, "step": 3660 }, { "epoch": 3.4542884071630535, "grad_norm": 0.0022926589008420706, "learning_rate": 1.3214174777235192e-06, "loss": 0.0985, "num_input_tokens_seen": 8524960, "step": 3665 }, { "epoch": 3.459000942507069, "grad_norm": 0.03560859337449074, "learning_rate": 1.3141695334019453e-06, "loss": 0.0001, "num_input_tokens_seen": 8535520, "step": 3670 }, { "epoch": 3.463713477851084, "grad_norm": 0.013610146008431911, "learning_rate": 1.3069344270734452e-06, "loss": 0.0023, "num_input_tokens_seen": 8544864, "step": 3675 }, { "epoch": 3.468426013195099, "grad_norm": 0.0055809905752539635, "learning_rate": 1.2997122370664538e-06, "loss": 0.0001, "num_input_tokens_seen": 8556960, "step": 3680 }, { "epoch": 3.473138548539114, "grad_norm": 0.011014264076948166, "learning_rate": 1.2925030415695727e-06, "loss": 0.0001, "num_input_tokens_seen": 8567968, "step": 3685 }, { "epoch": 3.477851083883129, "grad_norm": 30.655986785888672, "learning_rate": 1.285306918630722e-06, "loss": 0.0595, "num_input_tokens_seen": 8581920, "step": 3690 }, { "epoch": 3.482563619227144, "grad_norm": 0.047711387276649475, "learning_rate": 1.2781239461562966e-06, "loss": 0.0442, "num_input_tokens_seen": 8594720, "step": 3695 }, { "epoch": 3.4872761545711595, "grad_norm": 0.00917474739253521, "learning_rate": 1.2709542019103211e-06, "loss": 0.0001, "num_input_tokens_seen": 8606560, "step": 3700 }, { "epoch": 3.4919886899151744, "grad_norm": 0.016317633911967278, "learning_rate": 1.2637977635136123e-06, "loss": 0.0017, "num_input_tokens_seen": 8618208, "step": 3705 }, { "epoch": 3.4967012252591894, "grad_norm": 0.028710726648569107, "learning_rate": 1.2566547084429326e-06, "loss": 0.0089, "num_input_tokens_seen": 8631584, "step": 3710 }, { "epoch": 3.5014137606032048, "grad_norm": 0.0347968190908432, "learning_rate": 1.2495251140301553e-06, "loss": 0.0338, "num_input_tokens_seen": 8642912, "step": 3715 }, { "epoch": 3.5061262959472197, "grad_norm": 0.01212374772876501, "learning_rate": 1.2424090574614262e-06, "loss": 0.0002, "num_input_tokens_seen": 8652384, "step": 3720 }, { "epoch": 3.5098963242224315, "eval_loss": 0.4635506868362427, "eval_runtime": 2.7946, "eval_samples_per_second": 337.431, "eval_steps_per_second": 42.224, "num_input_tokens_seen": 8660768, "step": 3724 }, { "epoch": 3.5108388312912346, "grad_norm": 0.008465130813419819, "learning_rate": 1.2353066157763305e-06, "loss": 0.0008, "num_input_tokens_seen": 8662624, "step": 3725 }, { "epoch": 3.5155513666352496, "grad_norm": 0.02242710441350937, "learning_rate": 1.2282178658670514e-06, "loss": 0.0001, "num_input_tokens_seen": 8672864, "step": 3730 }, { "epoch": 3.520263901979265, "grad_norm": 0.004590487107634544, "learning_rate": 1.221142884477548e-06, "loss": 0.0001, "num_input_tokens_seen": 8684448, "step": 3735 }, { "epoch": 3.52497643732328, "grad_norm": 0.0026052999310195446, "learning_rate": 1.2140817482027155e-06, "loss": 0.0001, "num_input_tokens_seen": 8698336, "step": 3740 }, { "epoch": 3.529688972667295, "grad_norm": 0.002777635119855404, "learning_rate": 1.207034533487564e-06, "loss": 0.0, "num_input_tokens_seen": 8711072, "step": 3745 }, { "epoch": 3.5344015080113103, "grad_norm": 0.004557525273412466, "learning_rate": 1.2000013166263803e-06, "loss": 0.0001, "num_input_tokens_seen": 8723872, "step": 3750 }, { "epoch": 3.539114043355325, "grad_norm": 10.647988319396973, "learning_rate": 1.1929821737619132e-06, "loss": 0.0013, "num_input_tokens_seen": 8735776, "step": 3755 }, { "epoch": 3.54382657869934, "grad_norm": 0.0055831498466432095, "learning_rate": 1.1859771808845417e-06, "loss": 0.0, "num_input_tokens_seen": 8752736, "step": 3760 }, { "epoch": 3.548539114043355, "grad_norm": 0.007252705283463001, "learning_rate": 1.1789864138314577e-06, "loss": 0.0001, "num_input_tokens_seen": 8766688, "step": 3765 }, { "epoch": 3.5532516493873705, "grad_norm": 0.031091537326574326, "learning_rate": 1.1720099482858364e-06, "loss": 0.0, "num_input_tokens_seen": 8781536, "step": 3770 }, { "epoch": 3.5579641847313854, "grad_norm": 0.0167153999209404, "learning_rate": 1.1650478597760284e-06, "loss": 0.0001, "num_input_tokens_seen": 8792224, "step": 3775 }, { "epoch": 3.562676720075401, "grad_norm": 0.002952584996819496, "learning_rate": 1.158100223674733e-06, "loss": 0.0704, "num_input_tokens_seen": 8803168, "step": 3780 }, { "epoch": 3.5673892554194158, "grad_norm": 0.003907319158315659, "learning_rate": 1.1511671151981861e-06, "loss": 0.0001, "num_input_tokens_seen": 8813536, "step": 3785 }, { "epoch": 3.5721017907634307, "grad_norm": 0.004613762255758047, "learning_rate": 1.1442486094053445e-06, "loss": 0.0, "num_input_tokens_seen": 8823840, "step": 3790 }, { "epoch": 3.5768143261074457, "grad_norm": 0.008234544657170773, "learning_rate": 1.1373447811970762e-06, "loss": 0.0, "num_input_tokens_seen": 8836576, "step": 3795 }, { "epoch": 3.581526861451461, "grad_norm": 0.004143028054386377, "learning_rate": 1.130455705315345e-06, "loss": 0.0, "num_input_tokens_seen": 8849824, "step": 3800 }, { "epoch": 3.586239396795476, "grad_norm": 0.010429292917251587, "learning_rate": 1.1235814563424046e-06, "loss": 0.1829, "num_input_tokens_seen": 8860448, "step": 3805 }, { "epoch": 3.590951932139491, "grad_norm": 0.005767806898802519, "learning_rate": 1.1167221086999897e-06, "loss": 0.0001, "num_input_tokens_seen": 8871776, "step": 3810 }, { "epoch": 3.5956644674835063, "grad_norm": 4.553272724151611, "learning_rate": 1.10987773664851e-06, "loss": 0.0006, "num_input_tokens_seen": 8885728, "step": 3815 }, { "epoch": 3.6003770028275213, "grad_norm": 0.002544153481721878, "learning_rate": 1.1030484142862511e-06, "loss": 0.0, "num_input_tokens_seen": 8895904, "step": 3820 }, { "epoch": 3.605089538171536, "grad_norm": 0.5232880711555481, "learning_rate": 1.0962342155485613e-06, "loss": 0.0006, "num_input_tokens_seen": 8907808, "step": 3825 }, { "epoch": 3.609802073515551, "grad_norm": 0.0035159303806722164, "learning_rate": 1.0894352142070652e-06, "loss": 0.0, "num_input_tokens_seen": 8918432, "step": 3830 }, { "epoch": 3.6145146088595665, "grad_norm": 21.874401092529297, "learning_rate": 1.0826514838688533e-06, "loss": 0.072, "num_input_tokens_seen": 8929248, "step": 3835 }, { "epoch": 3.6192271442035815, "grad_norm": 0.03307318687438965, "learning_rate": 1.075883097975691e-06, "loss": 0.0001, "num_input_tokens_seen": 8940384, "step": 3840 }, { "epoch": 3.623939679547597, "grad_norm": 0.0033383166883140802, "learning_rate": 1.0691301298032218e-06, "loss": 0.0, "num_input_tokens_seen": 8950816, "step": 3845 }, { "epoch": 3.628652214891612, "grad_norm": 0.004945589695125818, "learning_rate": 1.0623926524601771e-06, "loss": 0.0001, "num_input_tokens_seen": 8963296, "step": 3850 }, { "epoch": 3.6333647502356268, "grad_norm": 0.014219646342098713, "learning_rate": 1.0556707388875786e-06, "loss": 0.0, "num_input_tokens_seen": 8974624, "step": 3855 }, { "epoch": 3.6380772855796417, "grad_norm": 13.508224487304688, "learning_rate": 1.048964461857954e-06, "loss": 0.0596, "num_input_tokens_seen": 8985952, "step": 3860 }, { "epoch": 3.6427898209236567, "grad_norm": 0.022754942998290062, "learning_rate": 1.0422738939745453e-06, "loss": 0.0002, "num_input_tokens_seen": 8996064, "step": 3865 }, { "epoch": 3.647502356267672, "grad_norm": 0.023511681705713272, "learning_rate": 1.035599107670529e-06, "loss": 0.0002, "num_input_tokens_seen": 9006368, "step": 3870 }, { "epoch": 3.652214891611687, "grad_norm": 0.01913359761238098, "learning_rate": 1.0289401752082214e-06, "loss": 0.0001, "num_input_tokens_seen": 9018272, "step": 3875 }, { "epoch": 3.6569274269557024, "grad_norm": 0.03516068682074547, "learning_rate": 1.0222971686783089e-06, "loss": 0.1112, "num_input_tokens_seen": 9029472, "step": 3880 }, { "epoch": 3.6616399622997173, "grad_norm": 0.10616306960582733, "learning_rate": 1.0156701599990562e-06, "loss": 0.0001, "num_input_tokens_seen": 9041824, "step": 3885 }, { "epoch": 3.6663524976437323, "grad_norm": 0.007794396486133337, "learning_rate": 1.0090592209155373e-06, "loss": 0.0381, "num_input_tokens_seen": 9054752, "step": 3890 }, { "epoch": 3.6710650329877472, "grad_norm": 2.6590635776519775, "learning_rate": 1.0024644229988484e-06, "loss": 0.002, "num_input_tokens_seen": 9064928, "step": 3895 }, { "epoch": 3.6757775683317626, "grad_norm": 0.02115059085190296, "learning_rate": 9.95885837645344e-07, "loss": 0.0001, "num_input_tokens_seen": 9076256, "step": 3900 }, { "epoch": 3.6804901036757776, "grad_norm": 0.035742953419685364, "learning_rate": 9.893235360758565e-07, "loss": 0.0954, "num_input_tokens_seen": 9086624, "step": 3905 }, { "epoch": 3.6852026390197925, "grad_norm": 0.007046286016702652, "learning_rate": 9.827775893349273e-07, "loss": 0.0001, "num_input_tokens_seen": 9097824, "step": 3910 }, { "epoch": 3.689915174363808, "grad_norm": 379.1387939453125, "learning_rate": 9.762480682900374e-07, "loss": 0.0323, "num_input_tokens_seen": 9107296, "step": 3915 }, { "epoch": 3.694627709707823, "grad_norm": 18.87242889404297, "learning_rate": 9.697350436308428e-07, "loss": 0.0039, "num_input_tokens_seen": 9119008, "step": 3920 }, { "epoch": 3.699340245051838, "grad_norm": 0.0049491687677800655, "learning_rate": 9.63238585868405e-07, "loss": 0.0001, "num_input_tokens_seen": 9131296, "step": 3925 }, { "epoch": 3.7040527803958527, "grad_norm": 0.01330646499991417, "learning_rate": 9.567587653344295e-07, "loss": 0.0001, "num_input_tokens_seen": 9141664, "step": 3930 }, { "epoch": 3.708765315739868, "grad_norm": 0.01977146603167057, "learning_rate": 9.502956521805054e-07, "loss": 0.0001, "num_input_tokens_seen": 9151328, "step": 3935 }, { "epoch": 3.713477851083883, "grad_norm": 0.08915020525455475, "learning_rate": 9.438493163773433e-07, "loss": 0.0002, "num_input_tokens_seen": 9164192, "step": 3940 }, { "epoch": 3.7181903864278985, "grad_norm": 0.004182067699730396, "learning_rate": 9.374198277140237e-07, "loss": 0.0003, "num_input_tokens_seen": 9176544, "step": 3945 }, { "epoch": 3.7229029217719134, "grad_norm": 0.028023963794112206, "learning_rate": 9.310072557972305e-07, "loss": 0.0162, "num_input_tokens_seen": 9188512, "step": 3950 }, { "epoch": 3.7276154571159283, "grad_norm": 0.003678038949146867, "learning_rate": 9.246116700505109e-07, "loss": 0.0001, "num_input_tokens_seen": 9200992, "step": 3955 }, { "epoch": 3.7323279924599433, "grad_norm": 0.004443558864295483, "learning_rate": 9.18233139713513e-07, "loss": 0.0001, "num_input_tokens_seen": 9211168, "step": 3960 }, { "epoch": 3.7370405278039587, "grad_norm": 0.012858624570071697, "learning_rate": 9.118717338412414e-07, "loss": 0.0, "num_input_tokens_seen": 9223456, "step": 3965 }, { "epoch": 3.7417530631479736, "grad_norm": 0.037973713129758835, "learning_rate": 9.055275213033077e-07, "loss": 0.0002, "num_input_tokens_seen": 9233632, "step": 3970 }, { "epoch": 3.7464655984919886, "grad_norm": 49.70805358886719, "learning_rate": 8.992005707831877e-07, "loss": 0.0751, "num_input_tokens_seen": 9243296, "step": 3975 }, { "epoch": 3.751178133836004, "grad_norm": 0.0022637660149484873, "learning_rate": 8.928909507774741e-07, "loss": 0.0002, "num_input_tokens_seen": 9259424, "step": 3980 }, { "epoch": 3.755890669180019, "grad_norm": 0.009885657578706741, "learning_rate": 8.86598729595137e-07, "loss": 0.0, "num_input_tokens_seen": 9271840, "step": 3985 }, { "epoch": 3.760603204524034, "grad_norm": 0.010455531068146229, "learning_rate": 8.80323975356783e-07, "loss": 0.0, "num_input_tokens_seen": 9286304, "step": 3990 }, { "epoch": 3.760603204524034, "eval_loss": 0.5201095938682556, "eval_runtime": 2.7914, "eval_samples_per_second": 337.823, "eval_steps_per_second": 42.273, "num_input_tokens_seen": 9286304, "step": 3990 }, { "epoch": 3.765315739868049, "grad_norm": 3.8269801139831543, "learning_rate": 8.740667559939217e-07, "loss": 0.0004, "num_input_tokens_seen": 9297056, "step": 3995 }, { "epoch": 3.770028275212064, "grad_norm": 0.010600591078400612, "learning_rate": 8.678271392482243e-07, "loss": 0.0, "num_input_tokens_seen": 9307872, "step": 4000 }, { "epoch": 3.774740810556079, "grad_norm": 0.00742421904578805, "learning_rate": 8.616051926707941e-07, "loss": 0.0, "num_input_tokens_seen": 9318816, "step": 4005 }, { "epoch": 3.7794533459000945, "grad_norm": 0.005168728996068239, "learning_rate": 8.554009836214345e-07, "loss": 0.0308, "num_input_tokens_seen": 9331232, "step": 4010 }, { "epoch": 3.7841658812441095, "grad_norm": 0.002414106857031584, "learning_rate": 8.49214579267921e-07, "loss": 0.0657, "num_input_tokens_seen": 9342112, "step": 4015 }, { "epoch": 3.7888784165881244, "grad_norm": 0.05582628399133682, "learning_rate": 8.430460465852683e-07, "loss": 0.0, "num_input_tokens_seen": 9355872, "step": 4020 }, { "epoch": 3.7935909519321394, "grad_norm": 0.0030152511317282915, "learning_rate": 8.368954523550146e-07, "loss": 0.0, "num_input_tokens_seen": 9367008, "step": 4025 }, { "epoch": 3.7983034872761543, "grad_norm": 0.008660019375383854, "learning_rate": 8.307628631644904e-07, "loss": 0.0001, "num_input_tokens_seen": 9380000, "step": 4030 }, { "epoch": 3.8030160226201697, "grad_norm": 0.012062069959938526, "learning_rate": 8.246483454061016e-07, "loss": 0.0, "num_input_tokens_seen": 9390368, "step": 4035 }, { "epoch": 3.8077285579641846, "grad_norm": 0.1445380598306656, "learning_rate": 8.185519652766091e-07, "loss": 0.0829, "num_input_tokens_seen": 9401952, "step": 4040 }, { "epoch": 3.8124410933082, "grad_norm": 0.017705656588077545, "learning_rate": 8.124737887764148e-07, "loss": 0.0, "num_input_tokens_seen": 9413536, "step": 4045 }, { "epoch": 3.817153628652215, "grad_norm": 13.497117042541504, "learning_rate": 8.064138817088429e-07, "loss": 0.09, "num_input_tokens_seen": 9424864, "step": 4050 }, { "epoch": 3.82186616399623, "grad_norm": 0.0018770827446132898, "learning_rate": 8.003723096794314e-07, "loss": 0.0, "num_input_tokens_seen": 9437152, "step": 4055 }, { "epoch": 3.826578699340245, "grad_norm": 0.005844899918884039, "learning_rate": 7.94349138095219e-07, "loss": 0.0001, "num_input_tokens_seen": 9448032, "step": 4060 }, { "epoch": 3.8312912346842602, "grad_norm": 0.07622718065977097, "learning_rate": 7.883444321640383e-07, "loss": 0.0001, "num_input_tokens_seen": 9459424, "step": 4065 }, { "epoch": 3.836003770028275, "grad_norm": 0.00536764832213521, "learning_rate": 7.82358256893812e-07, "loss": 0.0001, "num_input_tokens_seen": 9469536, "step": 4070 }, { "epoch": 3.84071630537229, "grad_norm": 0.0012705104891210794, "learning_rate": 7.763906770918428e-07, "loss": 0.0, "num_input_tokens_seen": 9482976, "step": 4075 }, { "epoch": 3.8454288407163055, "grad_norm": 0.008968825452029705, "learning_rate": 7.704417573641196e-07, "loss": 0.0001, "num_input_tokens_seen": 9492704, "step": 4080 }, { "epoch": 3.8501413760603205, "grad_norm": 0.00470514502376318, "learning_rate": 7.645115621146116e-07, "loss": 0.0, "num_input_tokens_seen": 9504864, "step": 4085 }, { "epoch": 3.8548539114043354, "grad_norm": 0.04696199670433998, "learning_rate": 7.586001555445773e-07, "loss": 0.1079, "num_input_tokens_seen": 9515424, "step": 4090 }, { "epoch": 3.8595664467483504, "grad_norm": 0.0076852161437273026, "learning_rate": 7.527076016518603e-07, "loss": 0.0001, "num_input_tokens_seen": 9525792, "step": 4095 }, { "epoch": 3.8642789820923658, "grad_norm": 0.04222777113318443, "learning_rate": 7.468339642302077e-07, "loss": 0.0001, "num_input_tokens_seen": 9536416, "step": 4100 }, { "epoch": 3.8689915174363807, "grad_norm": 0.024708032608032227, "learning_rate": 7.409793068685709e-07, "loss": 0.0722, "num_input_tokens_seen": 9550880, "step": 4105 }, { "epoch": 3.873704052780396, "grad_norm": 0.007110555190593004, "learning_rate": 7.351436929504203e-07, "loss": 0.0, "num_input_tokens_seen": 9564768, "step": 4110 }, { "epoch": 3.878416588124411, "grad_norm": 0.013784021139144897, "learning_rate": 7.293271856530585e-07, "loss": 0.0001, "num_input_tokens_seen": 9575776, "step": 4115 }, { "epoch": 3.883129123468426, "grad_norm": 0.0009743549744598567, "learning_rate": 7.235298479469391e-07, "loss": 0.0323, "num_input_tokens_seen": 9588192, "step": 4120 }, { "epoch": 3.887841658812441, "grad_norm": 0.006835806183516979, "learning_rate": 7.177517425949801e-07, "loss": 0.0, "num_input_tokens_seen": 9598432, "step": 4125 }, { "epoch": 3.8925541941564563, "grad_norm": 0.07676160335540771, "learning_rate": 7.119929321518876e-07, "loss": 0.0001, "num_input_tokens_seen": 9613920, "step": 4130 }, { "epoch": 3.8972667295004713, "grad_norm": 0.0023403996601700783, "learning_rate": 7.062534789634772e-07, "loss": 0.0001, "num_input_tokens_seen": 9624864, "step": 4135 }, { "epoch": 3.901979264844486, "grad_norm": 0.002742171287536621, "learning_rate": 7.005334451660034e-07, "loss": 0.0004, "num_input_tokens_seen": 9635232, "step": 4140 }, { "epoch": 3.9066918001885016, "grad_norm": 0.012737921439111233, "learning_rate": 6.948328926854767e-07, "loss": 0.0, "num_input_tokens_seen": 9648544, "step": 4145 }, { "epoch": 3.9114043355325165, "grad_norm": 0.05681464448571205, "learning_rate": 6.891518832370059e-07, "loss": 0.0074, "num_input_tokens_seen": 9659424, "step": 4150 }, { "epoch": 3.9161168708765315, "grad_norm": 0.034557852894067764, "learning_rate": 6.834904783241198e-07, "loss": 0.0, "num_input_tokens_seen": 9669920, "step": 4155 }, { "epoch": 3.9208294062205464, "grad_norm": 0.608920693397522, "learning_rate": 6.778487392381089e-07, "loss": 0.0002, "num_input_tokens_seen": 9681376, "step": 4160 }, { "epoch": 3.925541941564562, "grad_norm": 0.0012145772343501449, "learning_rate": 6.722267270573529e-07, "loss": 0.0, "num_input_tokens_seen": 9691552, "step": 4165 }, { "epoch": 3.9302544769085768, "grad_norm": 0.017469940707087517, "learning_rate": 6.666245026466708e-07, "loss": 0.0001, "num_input_tokens_seen": 9704288, "step": 4170 }, { "epoch": 3.934967012252592, "grad_norm": 0.0016305310418829322, "learning_rate": 6.61042126656652e-07, "loss": 0.0595, "num_input_tokens_seen": 9713952, "step": 4175 }, { "epoch": 3.939679547596607, "grad_norm": 26.813234329223633, "learning_rate": 6.554796595230051e-07, "loss": 0.0642, "num_input_tokens_seen": 9724576, "step": 4180 }, { "epoch": 3.944392082940622, "grad_norm": 0.0023128024768084288, "learning_rate": 6.499371614659019e-07, "loss": 0.0002, "num_input_tokens_seen": 9735392, "step": 4185 }, { "epoch": 3.949104618284637, "grad_norm": 0.00788215734064579, "learning_rate": 6.444146924893252e-07, "loss": 0.0766, "num_input_tokens_seen": 9745888, "step": 4190 }, { "epoch": 3.9538171536286524, "grad_norm": 0.0028776442632079124, "learning_rate": 6.389123123804217e-07, "loss": 0.111, "num_input_tokens_seen": 9755104, "step": 4195 }, { "epoch": 3.9585296889726673, "grad_norm": 0.0523090660572052, "learning_rate": 6.334300807088509e-07, "loss": 0.0003, "num_input_tokens_seen": 9766944, "step": 4200 }, { "epoch": 3.9632422243166823, "grad_norm": 0.008871479891240597, "learning_rate": 6.279680568261423e-07, "loss": 0.0782, "num_input_tokens_seen": 9778336, "step": 4205 }, { "epoch": 3.9679547596606977, "grad_norm": 0.22362910211086273, "learning_rate": 6.225262998650525e-07, "loss": 0.0004, "num_input_tokens_seen": 9789088, "step": 4210 }, { "epoch": 3.9726672950047126, "grad_norm": 1.362607479095459, "learning_rate": 6.171048687389273e-07, "loss": 0.0003, "num_input_tokens_seen": 9799392, "step": 4215 }, { "epoch": 3.9773798303487276, "grad_norm": 0.0025137532502412796, "learning_rate": 6.117038221410568e-07, "loss": 0.0, "num_input_tokens_seen": 9811360, "step": 4220 }, { "epoch": 3.9820923656927425, "grad_norm": 0.11764784902334213, "learning_rate": 6.063232185440507e-07, "loss": 0.1016, "num_input_tokens_seen": 9824160, "step": 4225 }, { "epoch": 3.986804901036758, "grad_norm": 0.011414180509746075, "learning_rate": 6.009631161991958e-07, "loss": 0.0007, "num_input_tokens_seen": 9834784, "step": 4230 }, { "epoch": 3.991517436380773, "grad_norm": 0.004349572584033012, "learning_rate": 5.956235731358298e-07, "loss": 0.0, "num_input_tokens_seen": 9845920, "step": 4235 }, { "epoch": 3.9962299717247878, "grad_norm": 0.0038933674804866314, "learning_rate": 5.903046471607121e-07, "loss": 0.0, "num_input_tokens_seen": 9858208, "step": 4240 }, { "epoch": 4.000942507068803, "grad_norm": 0.014259828254580498, "learning_rate": 5.850063958573993e-07, "loss": 0.032, "num_input_tokens_seen": 9868192, "step": 4245 }, { "epoch": 4.005655042412818, "grad_norm": 0.002472294494509697, "learning_rate": 5.797288765856196e-07, "loss": 0.0, "num_input_tokens_seen": 9882784, "step": 4250 }, { "epoch": 4.010367577756833, "grad_norm": 0.17401158809661865, "learning_rate": 5.74472146480653e-07, "loss": 0.0001, "num_input_tokens_seen": 9892448, "step": 4255 }, { "epoch": 4.011310084825636, "eval_loss": 0.5145591497421265, "eval_runtime": 2.7443, "eval_samples_per_second": 343.621, "eval_steps_per_second": 42.998, "num_input_tokens_seen": 9894624, "step": 4256 }, { "epoch": 4.015080113100848, "grad_norm": 0.0075249760411679745, "learning_rate": 5.692362624527117e-07, "loss": 0.0, "num_input_tokens_seen": 9905376, "step": 4260 }, { "epoch": 4.019792648444863, "grad_norm": 0.07316409796476364, "learning_rate": 5.640212811863277e-07, "loss": 0.0, "num_input_tokens_seen": 9915616, "step": 4265 }, { "epoch": 4.024505183788879, "grad_norm": 0.007110144477337599, "learning_rate": 5.588272591397337e-07, "loss": 0.0, "num_input_tokens_seen": 9928288, "step": 4270 }, { "epoch": 4.029217719132894, "grad_norm": 0.002868575043976307, "learning_rate": 5.536542525442554e-07, "loss": 0.0001, "num_input_tokens_seen": 9939232, "step": 4275 }, { "epoch": 4.033930254476909, "grad_norm": 0.004913229029625654, "learning_rate": 5.485023174037005e-07, "loss": 0.0, "num_input_tokens_seen": 9950688, "step": 4280 }, { "epoch": 4.038642789820924, "grad_norm": 0.0007955088512971997, "learning_rate": 5.433715094937575e-07, "loss": 0.0, "num_input_tokens_seen": 9961824, "step": 4285 }, { "epoch": 4.043355325164939, "grad_norm": 0.002959765959531069, "learning_rate": 5.382618843613827e-07, "loss": 0.0, "num_input_tokens_seen": 9974560, "step": 4290 }, { "epoch": 4.0480678605089535, "grad_norm": 0.0037353690713644028, "learning_rate": 5.331734973242089e-07, "loss": 0.0, "num_input_tokens_seen": 9987040, "step": 4295 }, { "epoch": 4.0527803958529685, "grad_norm": 0.001677420805208385, "learning_rate": 5.28106403469939e-07, "loss": 0.0, "num_input_tokens_seen": 10002400, "step": 4300 }, { "epoch": 4.057492931196984, "grad_norm": 0.024959493428468704, "learning_rate": 5.23060657655754e-07, "loss": 0.0, "num_input_tokens_seen": 10012448, "step": 4305 }, { "epoch": 4.062205466540999, "grad_norm": 0.002381374826654792, "learning_rate": 5.180363145077164e-07, "loss": 0.0001, "num_input_tokens_seen": 10023392, "step": 4310 }, { "epoch": 4.066918001885014, "grad_norm": 0.0020008094143122435, "learning_rate": 5.130334284201799e-07, "loss": 0.0002, "num_input_tokens_seen": 10034528, "step": 4315 }, { "epoch": 4.071630537229029, "grad_norm": 0.0024472337681800127, "learning_rate": 5.080520535552028e-07, "loss": 0.0, "num_input_tokens_seen": 10045024, "step": 4320 }, { "epoch": 4.076343072573044, "grad_norm": 0.05074908956885338, "learning_rate": 5.030922438419569e-07, "loss": 0.0, "num_input_tokens_seen": 10055328, "step": 4325 }, { "epoch": 4.081055607917059, "grad_norm": 0.008851269260048866, "learning_rate": 4.981540529761473e-07, "loss": 0.0, "num_input_tokens_seen": 10065184, "step": 4330 }, { "epoch": 4.085768143261075, "grad_norm": 0.009758401662111282, "learning_rate": 4.932375344194285e-07, "loss": 0.0, "num_input_tokens_seen": 10077088, "step": 4335 }, { "epoch": 4.09048067860509, "grad_norm": 0.005179739557206631, "learning_rate": 4.88342741398831e-07, "loss": 0.0, "num_input_tokens_seen": 10087840, "step": 4340 }, { "epoch": 4.095193213949105, "grad_norm": 0.020409001037478447, "learning_rate": 4.83469726906175e-07, "loss": 0.0, "num_input_tokens_seen": 10098656, "step": 4345 }, { "epoch": 4.09990574929312, "grad_norm": 0.0022380428854376078, "learning_rate": 4.786185436975085e-07, "loss": 0.0, "num_input_tokens_seen": 10111456, "step": 4350 }, { "epoch": 4.104618284637135, "grad_norm": 0.0027253238949924707, "learning_rate": 4.7378924429252735e-07, "loss": 0.0, "num_input_tokens_seen": 10122912, "step": 4355 }, { "epoch": 4.10933081998115, "grad_norm": 0.22917087376117706, "learning_rate": 4.689818809740118e-07, "loss": 0.0003, "num_input_tokens_seen": 10135072, "step": 4360 }, { "epoch": 4.1140433553251645, "grad_norm": 0.04845559597015381, "learning_rate": 4.641965057872552e-07, "loss": 0.0001, "num_input_tokens_seen": 10145760, "step": 4365 }, { "epoch": 4.11875589066918, "grad_norm": 0.004377515520900488, "learning_rate": 4.594331705395078e-07, "loss": 0.0001, "num_input_tokens_seen": 10156000, "step": 4370 }, { "epoch": 4.123468426013195, "grad_norm": 0.002658847952261567, "learning_rate": 4.5469192679940905e-07, "loss": 0.0, "num_input_tokens_seen": 10168736, "step": 4375 }, { "epoch": 4.12818096135721, "grad_norm": 0.022708803415298462, "learning_rate": 4.4997282589643363e-07, "loss": 0.0, "num_input_tokens_seen": 10181408, "step": 4380 }, { "epoch": 4.132893496701225, "grad_norm": 0.004133255686610937, "learning_rate": 4.4527591892033263e-07, "loss": 0.0, "num_input_tokens_seen": 10191904, "step": 4385 }, { "epoch": 4.13760603204524, "grad_norm": 0.0023588186595588923, "learning_rate": 4.406012567205847e-07, "loss": 0.0, "num_input_tokens_seen": 10202080, "step": 4390 }, { "epoch": 4.142318567389255, "grad_norm": 0.0017039207741618156, "learning_rate": 4.359488899058409e-07, "loss": 0.0, "num_input_tokens_seen": 10212064, "step": 4395 }, { "epoch": 4.147031102733271, "grad_norm": 0.004048160742968321, "learning_rate": 4.313188688433792e-07, "loss": 0.0, "num_input_tokens_seen": 10223136, "step": 4400 }, { "epoch": 4.151743638077286, "grad_norm": 0.0019121092045679688, "learning_rate": 4.2671124365855853e-07, "loss": 0.0, "num_input_tokens_seen": 10238432, "step": 4405 }, { "epoch": 4.156456173421301, "grad_norm": 0.004715082701295614, "learning_rate": 4.2212606423427867e-07, "loss": 0.0252, "num_input_tokens_seen": 10250784, "step": 4410 }, { "epoch": 4.161168708765316, "grad_norm": 0.04374701902270317, "learning_rate": 4.175633802104337e-07, "loss": 0.0, "num_input_tokens_seen": 10265440, "step": 4415 }, { "epoch": 4.165881244109331, "grad_norm": 0.014298969879746437, "learning_rate": 4.1302324098338315e-07, "loss": 0.0, "num_input_tokens_seen": 10276704, "step": 4420 }, { "epoch": 4.170593779453346, "grad_norm": 0.004060305189341307, "learning_rate": 4.0850569570541036e-07, "loss": 0.0, "num_input_tokens_seen": 10286496, "step": 4425 }, { "epoch": 4.175306314797361, "grad_norm": 0.0036463961005210876, "learning_rate": 4.0401079328419384e-07, "loss": 0.0, "num_input_tokens_seen": 10297376, "step": 4430 }, { "epoch": 4.180018850141376, "grad_norm": 0.007656124886125326, "learning_rate": 3.995385823822767e-07, "loss": 0.0, "num_input_tokens_seen": 10306976, "step": 4435 }, { "epoch": 4.184731385485391, "grad_norm": 0.009232879616320133, "learning_rate": 3.9508911141653896e-07, "loss": 0.0, "num_input_tokens_seen": 10318880, "step": 4440 }, { "epoch": 4.189443920829406, "grad_norm": 0.5042584538459778, "learning_rate": 3.906624285576771e-07, "loss": 0.0001, "num_input_tokens_seen": 10330784, "step": 4445 }, { "epoch": 4.194156456173421, "grad_norm": 0.009172679856419563, "learning_rate": 3.862585817296771e-07, "loss": 0.0, "num_input_tokens_seen": 10341088, "step": 4450 }, { "epoch": 4.198868991517436, "grad_norm": 0.00793259497731924, "learning_rate": 3.8187761860929956e-07, "loss": 0.0, "num_input_tokens_seen": 10352096, "step": 4455 }, { "epoch": 4.203581526861451, "grad_norm": 0.010999282822012901, "learning_rate": 3.775195866255618e-07, "loss": 0.0, "num_input_tokens_seen": 10364448, "step": 4460 }, { "epoch": 4.208294062205466, "grad_norm": 0.0006450503133237362, "learning_rate": 3.731845329592268e-07, "loss": 0.0, "num_input_tokens_seen": 10376928, "step": 4465 }, { "epoch": 4.213006597549482, "grad_norm": 0.0008582579903304577, "learning_rate": 3.6887250454228666e-07, "loss": 0.0, "num_input_tokens_seen": 10389216, "step": 4470 }, { "epoch": 4.217719132893497, "grad_norm": 0.002186185447499156, "learning_rate": 3.6458354805746304e-07, "loss": 0.0, "num_input_tokens_seen": 10406944, "step": 4475 }, { "epoch": 4.222431668237512, "grad_norm": 0.0032066500280052423, "learning_rate": 3.603177099376931e-07, "loss": 0.0, "num_input_tokens_seen": 10417760, "step": 4480 }, { "epoch": 4.227144203581527, "grad_norm": 0.0038139999378472567, "learning_rate": 3.5607503636563484e-07, "loss": 0.0, "num_input_tokens_seen": 10429216, "step": 4485 }, { "epoch": 4.231856738925542, "grad_norm": 0.004827831871807575, "learning_rate": 3.5185557327315797e-07, "loss": 0.0, "num_input_tokens_seen": 10442784, "step": 4490 }, { "epoch": 4.236569274269557, "grad_norm": 0.0007723537273705006, "learning_rate": 3.47659366340857e-07, "loss": 0.0, "num_input_tokens_seen": 10454496, "step": 4495 }, { "epoch": 4.2412818096135725, "grad_norm": 0.00883992575109005, "learning_rate": 3.43486460997548e-07, "loss": 0.0, "num_input_tokens_seen": 10466464, "step": 4500 }, { "epoch": 4.245994344957587, "grad_norm": 0.009929073974490166, "learning_rate": 3.393369024197826e-07, "loss": 0.0, "num_input_tokens_seen": 10476768, "step": 4505 }, { "epoch": 4.250706880301602, "grad_norm": 0.003360740141943097, "learning_rate": 3.352107355313536e-07, "loss": 0.0, "num_input_tokens_seen": 10487392, "step": 4510 }, { "epoch": 4.255419415645617, "grad_norm": 0.004852129612118006, "learning_rate": 3.311080050028148e-07, "loss": 0.0, "num_input_tokens_seen": 10498144, "step": 4515 }, { "epoch": 4.260131950989632, "grad_norm": 0.0017156790709123015, "learning_rate": 3.2702875525099235e-07, "loss": 0.0782, "num_input_tokens_seen": 10507808, "step": 4520 }, { "epoch": 4.262016965127239, "eval_loss": 0.5548250675201416, "eval_runtime": 2.7767, "eval_samples_per_second": 339.618, "eval_steps_per_second": 42.497, "num_input_tokens_seen": 10512416, "step": 4522 }, { "epoch": 4.264844486333647, "grad_norm": 0.014055570587515831, "learning_rate": 3.2297303043850564e-07, "loss": 0.0, "num_input_tokens_seen": 10517408, "step": 4525 }, { "epoch": 4.269557021677663, "grad_norm": 0.002077508484944701, "learning_rate": 3.189408744732897e-07, "loss": 0.0, "num_input_tokens_seen": 10528416, "step": 4530 }, { "epoch": 4.274269557021678, "grad_norm": 0.036649417132139206, "learning_rate": 3.149323310081201e-07, "loss": 0.0, "num_input_tokens_seen": 10541216, "step": 4535 }, { "epoch": 4.278982092365693, "grad_norm": 0.00107799272518605, "learning_rate": 3.1094744344013855e-07, "loss": 0.0, "num_input_tokens_seen": 10554016, "step": 4540 }, { "epoch": 4.283694627709708, "grad_norm": 0.011105876415967941, "learning_rate": 3.069862549103841e-07, "loss": 0.0, "num_input_tokens_seen": 10563552, "step": 4545 }, { "epoch": 4.288407163053723, "grad_norm": 0.005242721643298864, "learning_rate": 3.030488083033273e-07, "loss": 0.0, "num_input_tokens_seen": 10576288, "step": 4550 }, { "epoch": 4.293119698397738, "grad_norm": 0.013468295335769653, "learning_rate": 2.991351462464037e-07, "loss": 0.0, "num_input_tokens_seen": 10586784, "step": 4555 }, { "epoch": 4.297832233741753, "grad_norm": 0.005555103067308664, "learning_rate": 2.9524531110955406e-07, "loss": 0.0, "num_input_tokens_seen": 10597792, "step": 4560 }, { "epoch": 4.3025447690857686, "grad_norm": 0.009637890383601189, "learning_rate": 2.913793450047639e-07, "loss": 0.0, "num_input_tokens_seen": 10610720, "step": 4565 }, { "epoch": 4.3072573044297835, "grad_norm": 0.002298228908330202, "learning_rate": 2.875372897856113e-07, "loss": 0.0, "num_input_tokens_seen": 10622176, "step": 4570 }, { "epoch": 4.311969839773798, "grad_norm": 0.025600271299481392, "learning_rate": 2.837191870468084e-07, "loss": 0.0, "num_input_tokens_seen": 10632864, "step": 4575 }, { "epoch": 4.316682375117813, "grad_norm": 0.000993955647572875, "learning_rate": 2.7992507812375557e-07, "loss": 0.0039, "num_input_tokens_seen": 10642784, "step": 4580 }, { "epoch": 4.321394910461828, "grad_norm": 0.01237708143889904, "learning_rate": 2.76155004092091e-07, "loss": 0.0153, "num_input_tokens_seen": 10652896, "step": 4585 }, { "epoch": 4.326107445805843, "grad_norm": 0.002820044755935669, "learning_rate": 2.7240900576724904e-07, "loss": 0.1078, "num_input_tokens_seen": 10665248, "step": 4590 }, { "epoch": 4.330819981149858, "grad_norm": 0.0086582712829113, "learning_rate": 2.686871237040151e-07, "loss": 0.0001, "num_input_tokens_seen": 10676384, "step": 4595 }, { "epoch": 4.335532516493874, "grad_norm": 0.0017868748400360346, "learning_rate": 2.6498939819608827e-07, "loss": 0.0, "num_input_tokens_seen": 10688352, "step": 4600 }, { "epoch": 4.340245051837889, "grad_norm": 0.026829157024621964, "learning_rate": 2.613158692756443e-07, "loss": 0.0, "num_input_tokens_seen": 10698080, "step": 4605 }, { "epoch": 4.344957587181904, "grad_norm": 0.004218968562781811, "learning_rate": 2.576665767129055e-07, "loss": 0.0, "num_input_tokens_seen": 10710816, "step": 4610 }, { "epoch": 4.349670122525919, "grad_norm": 0.0005848800064995885, "learning_rate": 2.5404156001570257e-07, "loss": 0.0, "num_input_tokens_seen": 10722592, "step": 4615 }, { "epoch": 4.354382657869934, "grad_norm": 0.0013897489989176393, "learning_rate": 2.5044085842905686e-07, "loss": 0.0, "num_input_tokens_seen": 10734752, "step": 4620 }, { "epoch": 4.359095193213949, "grad_norm": 0.003906297497451305, "learning_rate": 2.4686451093474673e-07, "loss": 0.0001, "num_input_tokens_seen": 10746464, "step": 4625 }, { "epoch": 4.363807728557964, "grad_norm": 0.010411670431494713, "learning_rate": 2.433125562508917e-07, "loss": 0.0, "num_input_tokens_seen": 10757472, "step": 4630 }, { "epoch": 4.36852026390198, "grad_norm": 24.992229461669922, "learning_rate": 2.3978503283152847e-07, "loss": 0.1078, "num_input_tokens_seen": 10769056, "step": 4635 }, { "epoch": 4.3732327992459945, "grad_norm": 0.007977725006639957, "learning_rate": 2.3628197886619852e-07, "loss": 0.0, "num_input_tokens_seen": 10780384, "step": 4640 }, { "epoch": 4.3779453345900095, "grad_norm": 0.0031011472456157207, "learning_rate": 2.3280343227953305e-07, "loss": 0.0, "num_input_tokens_seen": 10792928, "step": 4645 }, { "epoch": 4.382657869934024, "grad_norm": 0.0017057248624041677, "learning_rate": 2.293494307308411e-07, "loss": 0.0, "num_input_tokens_seen": 10803808, "step": 4650 }, { "epoch": 4.387370405278039, "grad_norm": 0.002497282810509205, "learning_rate": 2.2592001161370392e-07, "loss": 0.0, "num_input_tokens_seen": 10814496, "step": 4655 }, { "epoch": 4.392082940622054, "grad_norm": 0.005025547929108143, "learning_rate": 2.2251521205557042e-07, "loss": 0.0, "num_input_tokens_seen": 10827168, "step": 4660 }, { "epoch": 4.39679547596607, "grad_norm": 0.014085683971643448, "learning_rate": 2.1913506891735242e-07, "loss": 0.0, "num_input_tokens_seen": 10839392, "step": 4665 }, { "epoch": 4.401508011310085, "grad_norm": 0.0015956445131450891, "learning_rate": 2.1577961879302807e-07, "loss": 0.0, "num_input_tokens_seen": 10851744, "step": 4670 }, { "epoch": 4.4062205466541, "grad_norm": 0.0027405947912484407, "learning_rate": 2.124488980092454e-07, "loss": 0.0, "num_input_tokens_seen": 10864608, "step": 4675 }, { "epoch": 4.410933081998115, "grad_norm": 0.0018958933651447296, "learning_rate": 2.0914294262492723e-07, "loss": 0.0, "num_input_tokens_seen": 10877856, "step": 4680 }, { "epoch": 4.41564561734213, "grad_norm": 0.0035127715673297644, "learning_rate": 2.0586178843088473e-07, "loss": 0.0044, "num_input_tokens_seen": 10891616, "step": 4685 }, { "epoch": 4.420358152686145, "grad_norm": 0.003918149974197149, "learning_rate": 2.026054709494235e-07, "loss": 0.0, "num_input_tokens_seen": 10901024, "step": 4690 }, { "epoch": 4.425070688030161, "grad_norm": 0.004163654521107674, "learning_rate": 1.9937402543396683e-07, "loss": 0.0, "num_input_tokens_seen": 10910560, "step": 4695 }, { "epoch": 4.429783223374176, "grad_norm": 0.0017460703384131193, "learning_rate": 1.961674868686675e-07, "loss": 0.0, "num_input_tokens_seen": 10921824, "step": 4700 }, { "epoch": 4.434495758718191, "grad_norm": 0.0011986172758042812, "learning_rate": 1.929858899680323e-07, "loss": 0.0, "num_input_tokens_seen": 10934944, "step": 4705 }, { "epoch": 4.4392082940622055, "grad_norm": 0.0017532928613945842, "learning_rate": 1.8982926917654575e-07, "loss": 0.0922, "num_input_tokens_seen": 10946400, "step": 4710 }, { "epoch": 4.4439208294062205, "grad_norm": 0.0029414647724479437, "learning_rate": 1.8669765866829724e-07, "loss": 0.0, "num_input_tokens_seen": 10958112, "step": 4715 }, { "epoch": 4.448633364750235, "grad_norm": 0.0008323266520164907, "learning_rate": 1.835910923466097e-07, "loss": 0.0, "num_input_tokens_seen": 10970528, "step": 4720 }, { "epoch": 4.45334590009425, "grad_norm": 0.002249309793114662, "learning_rate": 1.805096038436749e-07, "loss": 0.0, "num_input_tokens_seen": 10982048, "step": 4725 }, { "epoch": 4.458058435438266, "grad_norm": 0.001825433922931552, "learning_rate": 1.774532265201867e-07, "loss": 0.0, "num_input_tokens_seen": 10994848, "step": 4730 }, { "epoch": 4.462770970782281, "grad_norm": 0.04776029288768768, "learning_rate": 1.7442199346498294e-07, "loss": 0.0001, "num_input_tokens_seen": 11004896, "step": 4735 }, { "epoch": 4.467483506126296, "grad_norm": 0.01915101520717144, "learning_rate": 1.7141593749468361e-07, "loss": 0.0, "num_input_tokens_seen": 11017056, "step": 4740 }, { "epoch": 4.472196041470311, "grad_norm": 0.0076557123102247715, "learning_rate": 1.6843509115333917e-07, "loss": 0.0, "num_input_tokens_seen": 11026912, "step": 4745 }, { "epoch": 4.476908576814326, "grad_norm": 0.010826838202774525, "learning_rate": 1.6547948671207515e-07, "loss": 0.0, "num_input_tokens_seen": 11038176, "step": 4750 }, { "epoch": 4.481621112158341, "grad_norm": 0.0004848201060667634, "learning_rate": 1.6254915616874645e-07, "loss": 0.0, "num_input_tokens_seen": 11047648, "step": 4755 }, { "epoch": 4.486333647502356, "grad_norm": 0.004097946919500828, "learning_rate": 1.5964413124758492e-07, "loss": 0.0441, "num_input_tokens_seen": 11056864, "step": 4760 }, { "epoch": 4.491046182846372, "grad_norm": 0.0032770747784525156, "learning_rate": 1.5676444339886327e-07, "loss": 0.0, "num_input_tokens_seen": 11067744, "step": 4765 }, { "epoch": 4.495758718190387, "grad_norm": 0.0030054976232349873, "learning_rate": 1.5391012379854937e-07, "loss": 0.0, "num_input_tokens_seen": 11077920, "step": 4770 }, { "epoch": 4.500471253534402, "grad_norm": 0.0032186508178710938, "learning_rate": 1.5108120334797e-07, "loss": 0.0, "num_input_tokens_seen": 11088864, "step": 4775 }, { "epoch": 4.5051837888784165, "grad_norm": 0.006772062741219997, "learning_rate": 1.4827771267347662e-07, "loss": 0.0, "num_input_tokens_seen": 11098336, "step": 4780 }, { "epoch": 4.5098963242224315, "grad_norm": 0.003302761586382985, "learning_rate": 1.4549968212611538e-07, "loss": 0.0, "num_input_tokens_seen": 11107680, "step": 4785 }, { "epoch": 4.512723845428841, "eval_loss": 0.5418137311935425, "eval_runtime": 2.7286, "eval_samples_per_second": 345.594, "eval_steps_per_second": 43.245, "num_input_tokens_seen": 11115040, "step": 4788 }, { "epoch": 4.514608859566446, "grad_norm": 0.009535914286971092, "learning_rate": 1.4274714178129534e-07, "loss": 0.0, "num_input_tokens_seen": 11120480, "step": 4790 }, { "epoch": 4.519321394910461, "grad_norm": 0.0017795681487768888, "learning_rate": 1.4002012143846472e-07, "loss": 0.0, "num_input_tokens_seen": 11132320, "step": 4795 }, { "epoch": 4.524033930254477, "grad_norm": 0.004151622299104929, "learning_rate": 1.3731865062078853e-07, "loss": 0.0006, "num_input_tokens_seen": 11148960, "step": 4800 }, { "epoch": 4.528746465598492, "grad_norm": 0.009143157862126827, "learning_rate": 1.3464275857482778e-07, "loss": 0.0, "num_input_tokens_seen": 11159968, "step": 4805 }, { "epoch": 4.533459000942507, "grad_norm": 0.00789305754005909, "learning_rate": 1.3199247427022528e-07, "loss": 0.122, "num_input_tokens_seen": 11170848, "step": 4810 }, { "epoch": 4.538171536286522, "grad_norm": 0.0016827658982947469, "learning_rate": 1.293678263993872e-07, "loss": 0.0, "num_input_tokens_seen": 11184288, "step": 4815 }, { "epoch": 4.542884071630537, "grad_norm": 0.004033006262034178, "learning_rate": 1.2676884337717882e-07, "loss": 0.0, "num_input_tokens_seen": 11197856, "step": 4820 }, { "epoch": 4.547596606974552, "grad_norm": 0.0016169328009709716, "learning_rate": 1.241955533406114e-07, "loss": 0.0, "num_input_tokens_seen": 11209696, "step": 4825 }, { "epoch": 4.552309142318568, "grad_norm": 0.0008929000468924642, "learning_rate": 1.2164798414854073e-07, "loss": 0.0, "num_input_tokens_seen": 11220064, "step": 4830 }, { "epoch": 4.557021677662583, "grad_norm": 0.002499083988368511, "learning_rate": 1.1912616338136396e-07, "loss": 0.0, "num_input_tokens_seen": 11230304, "step": 4835 }, { "epoch": 4.561734213006598, "grad_norm": 0.0028031610418111086, "learning_rate": 1.1663011834072257e-07, "loss": 0.0, "num_input_tokens_seen": 11240096, "step": 4840 }, { "epoch": 4.566446748350613, "grad_norm": 0.002892805030569434, "learning_rate": 1.1415987604920492e-07, "loss": 0.0, "num_input_tokens_seen": 11251104, "step": 4845 }, { "epoch": 4.5711592836946275, "grad_norm": 0.002147044287994504, "learning_rate": 1.11715463250055e-07, "loss": 0.0, "num_input_tokens_seen": 11261088, "step": 4850 }, { "epoch": 4.5758718190386425, "grad_norm": 69.84650421142578, "learning_rate": 1.0929690640688218e-07, "loss": 0.0072, "num_input_tokens_seen": 11273312, "step": 4855 }, { "epoch": 4.580584354382658, "grad_norm": 0.00039674167055636644, "learning_rate": 1.0690423170337554e-07, "loss": 0.0003, "num_input_tokens_seen": 11284896, "step": 4860 }, { "epoch": 4.585296889726673, "grad_norm": 0.002157399896532297, "learning_rate": 1.0453746504302003e-07, "loss": 0.0, "num_input_tokens_seen": 11294560, "step": 4865 }, { "epoch": 4.590009425070688, "grad_norm": 0.012543817050755024, "learning_rate": 1.021966320488152e-07, "loss": 0.0813, "num_input_tokens_seen": 11308128, "step": 4870 }, { "epoch": 4.594721960414703, "grad_norm": 0.004514993634074926, "learning_rate": 9.988175806299877e-08, "loss": 0.0, "num_input_tokens_seen": 11321056, "step": 4875 }, { "epoch": 4.599434495758718, "grad_norm": 0.06353511661291122, "learning_rate": 9.759286814677305e-08, "loss": 0.0, "num_input_tokens_seen": 11334496, "step": 4880 }, { "epoch": 4.604147031102733, "grad_norm": 0.00232234806753695, "learning_rate": 9.532998708003061e-08, "loss": 0.0, "num_input_tokens_seen": 11346208, "step": 4885 }, { "epoch": 4.608859566446748, "grad_norm": 0.0016301957657560706, "learning_rate": 9.309313936108983e-08, "loss": 0.0, "num_input_tokens_seen": 11358112, "step": 4890 }, { "epoch": 4.613572101790764, "grad_norm": 0.0005986247560940683, "learning_rate": 9.088234920642703e-08, "loss": 0.0, "num_input_tokens_seen": 11368096, "step": 4895 }, { "epoch": 4.618284637134779, "grad_norm": 0.004071133676916361, "learning_rate": 8.869764055041501e-08, "loss": 0.0, "num_input_tokens_seen": 11378976, "step": 4900 }, { "epoch": 4.622997172478794, "grad_norm": 0.0028420898597687483, "learning_rate": 8.653903704506389e-08, "loss": 0.0, "num_input_tokens_seen": 11390688, "step": 4905 }, { "epoch": 4.627709707822809, "grad_norm": 0.011091876775026321, "learning_rate": 8.440656205976644e-08, "loss": 0.0, "num_input_tokens_seen": 11401440, "step": 4910 }, { "epoch": 4.632422243166824, "grad_norm": 0.001331353560090065, "learning_rate": 8.230023868104231e-08, "loss": 0.0, "num_input_tokens_seen": 11412448, "step": 4915 }, { "epoch": 4.6371347785108386, "grad_norm": 0.01889793761074543, "learning_rate": 8.022008971229039e-08, "loss": 0.0, "num_input_tokens_seen": 11422496, "step": 4920 }, { "epoch": 4.6418473138548535, "grad_norm": 0.002929375506937504, "learning_rate": 7.816613767354098e-08, "loss": 0.0, "num_input_tokens_seen": 11433632, "step": 4925 }, { "epoch": 4.646559849198869, "grad_norm": 0.002046855865046382, "learning_rate": 7.613840480121176e-08, "loss": 0.0, "num_input_tokens_seen": 11446112, "step": 4930 }, { "epoch": 4.651272384542884, "grad_norm": 0.0020440209191292524, "learning_rate": 7.41369130478689e-08, "loss": 0.0, "num_input_tokens_seen": 11459552, "step": 4935 }, { "epoch": 4.655984919886899, "grad_norm": 0.01186713483184576, "learning_rate": 7.216168408198554e-08, "loss": 0.0, "num_input_tokens_seen": 11469984, "step": 4940 }, { "epoch": 4.660697455230914, "grad_norm": 0.005305349826812744, "learning_rate": 7.021273928771221e-08, "loss": 0.0, "num_input_tokens_seen": 11481888, "step": 4945 }, { "epoch": 4.665409990574929, "grad_norm": 0.0028958169277757406, "learning_rate": 6.829009976464102e-08, "loss": 0.0579, "num_input_tokens_seen": 11494944, "step": 4950 }, { "epoch": 4.670122525918944, "grad_norm": 0.0005974304513074458, "learning_rate": 6.639378632757986e-08, "loss": 0.0, "num_input_tokens_seen": 11505184, "step": 4955 }, { "epoch": 4.674835061262959, "grad_norm": 0.0025155956391245127, "learning_rate": 6.452381950632469e-08, "loss": 0.0, "num_input_tokens_seen": 11517856, "step": 4960 }, { "epoch": 4.679547596606975, "grad_norm": 0.0031720330007374287, "learning_rate": 6.268021954544095e-08, "loss": 0.0, "num_input_tokens_seen": 11530016, "step": 4965 }, { "epoch": 4.68426013195099, "grad_norm": 0.004775336477905512, "learning_rate": 6.08630064040408e-08, "loss": 0.0, "num_input_tokens_seen": 11545376, "step": 4970 }, { "epoch": 4.688972667295005, "grad_norm": 0.001557852141559124, "learning_rate": 5.9072199755567936e-08, "loss": 0.0, "num_input_tokens_seen": 11556448, "step": 4975 }, { "epoch": 4.69368520263902, "grad_norm": 0.0037680910900235176, "learning_rate": 5.730781898758614e-08, "loss": 0.0, "num_input_tokens_seen": 11566304, "step": 4980 }, { "epoch": 4.698397737983035, "grad_norm": 0.0020881840027868748, "learning_rate": 5.556988320156831e-08, "loss": 0.0, "num_input_tokens_seen": 11577056, "step": 4985 }, { "epoch": 4.7031102733270505, "grad_norm": 0.0009613709407858551, "learning_rate": 5.3858411212689146e-08, "loss": 0.0, "num_input_tokens_seen": 11589536, "step": 4990 }, { "epoch": 4.707822808671065, "grad_norm": 0.11669395118951797, "learning_rate": 5.2173421549621685e-08, "loss": 0.0001, "num_input_tokens_seen": 11599648, "step": 4995 }, { "epoch": 4.71253534401508, "grad_norm": 0.0027235562447458506, "learning_rate": 5.051493245433775e-08, "loss": 0.0, "num_input_tokens_seen": 11610272, "step": 5000 }, { "epoch": 4.717247879359095, "grad_norm": 0.06230725720524788, "learning_rate": 4.888296188190977e-08, "loss": 0.0, "num_input_tokens_seen": 11620768, "step": 5005 }, { "epoch": 4.72196041470311, "grad_norm": 0.0018570433603599668, "learning_rate": 4.727752750031511e-08, "loss": 0.0, "num_input_tokens_seen": 11632608, "step": 5010 }, { "epoch": 4.726672950047125, "grad_norm": 0.01290284376591444, "learning_rate": 4.5698646690247874e-08, "loss": 0.0, "num_input_tokens_seen": 11644896, "step": 5015 }, { "epoch": 4.73138548539114, "grad_norm": 0.005762244574725628, "learning_rate": 4.414633654492767e-08, "loss": 0.0, "num_input_tokens_seen": 11661344, "step": 5020 }, { "epoch": 4.736098020735156, "grad_norm": 0.0015709196450188756, "learning_rate": 4.2620613869915894e-08, "loss": 0.0, "num_input_tokens_seen": 11672288, "step": 5025 }, { "epoch": 4.740810556079171, "grad_norm": 0.0015942390309646726, "learning_rate": 4.112149518293362e-08, "loss": 0.0, "num_input_tokens_seen": 11684960, "step": 5030 }, { "epoch": 4.745523091423186, "grad_norm": 0.013269704766571522, "learning_rate": 3.9648996713683715e-08, "loss": 0.0, "num_input_tokens_seen": 11696160, "step": 5035 }, { "epoch": 4.750235626767201, "grad_norm": 0.004723825957626104, "learning_rate": 3.8203134403672905e-08, "loss": 0.0, "num_input_tokens_seen": 11705952, "step": 5040 }, { "epoch": 4.754948162111216, "grad_norm": 0.006132619455456734, "learning_rate": 3.678392390604163e-08, "loss": 0.0, "num_input_tokens_seen": 11716192, "step": 5045 }, { "epoch": 4.759660697455231, "grad_norm": 0.003462289460003376, "learning_rate": 3.539138058539282e-08, "loss": 0.0, "num_input_tokens_seen": 11728160, "step": 5050 }, { "epoch": 4.763430725730443, "eval_loss": 0.54215008020401, "eval_runtime": 2.8215, "eval_samples_per_second": 334.225, "eval_steps_per_second": 41.822, "num_input_tokens_seen": 11736672, "step": 5054 }, { "epoch": 4.764373232799246, "grad_norm": 0.0019681937992572784, "learning_rate": 3.4025519517626174e-08, "loss": 0.0, "num_input_tokens_seen": 11738720, "step": 5055 }, { "epoch": 4.7690857681432615, "grad_norm": 0.0026169854681938887, "learning_rate": 3.268635548977633e-08, "loss": 0.0, "num_input_tokens_seen": 11750176, "step": 5060 }, { "epoch": 4.773798303487276, "grad_norm": 0.0066421544179320335, "learning_rate": 3.137390299984888e-08, "loss": 0.0, "num_input_tokens_seen": 11761312, "step": 5065 }, { "epoch": 4.778510838831291, "grad_norm": 0.0028810338117182255, "learning_rate": 3.0088176256668765e-08, "loss": 0.0, "num_input_tokens_seen": 11773728, "step": 5070 }, { "epoch": 4.783223374175306, "grad_norm": 0.001111470046453178, "learning_rate": 2.8829189179721552e-08, "loss": 0.0, "num_input_tokens_seen": 11784672, "step": 5075 }, { "epoch": 4.787935909519321, "grad_norm": 0.009747396223247051, "learning_rate": 2.759695539900603e-08, "loss": 0.0, "num_input_tokens_seen": 11796512, "step": 5080 }, { "epoch": 4.792648444863336, "grad_norm": 0.0038332142867147923, "learning_rate": 2.639148825488491e-08, "loss": 0.0, "num_input_tokens_seen": 11810464, "step": 5085 }, { "epoch": 4.797360980207351, "grad_norm": 0.003033042885363102, "learning_rate": 2.5212800797941582e-08, "loss": 0.0, "num_input_tokens_seen": 11820768, "step": 5090 }, { "epoch": 4.802073515551367, "grad_norm": 0.007614613976329565, "learning_rate": 2.406090578883691e-08, "loss": 0.0, "num_input_tokens_seen": 11831776, "step": 5095 }, { "epoch": 4.806786050895382, "grad_norm": 0.0031820612493902445, "learning_rate": 2.2935815698174045e-08, "loss": 0.0, "num_input_tokens_seen": 11843296, "step": 5100 }, { "epoch": 4.811498586239397, "grad_norm": 0.00583045044913888, "learning_rate": 2.1837542706359958e-08, "loss": 0.0, "num_input_tokens_seen": 11860000, "step": 5105 }, { "epoch": 4.816211121583412, "grad_norm": 0.016117779538035393, "learning_rate": 2.0766098703477178e-08, "loss": 0.0, "num_input_tokens_seen": 11872160, "step": 5110 }, { "epoch": 4.820923656927427, "grad_norm": 0.006356321275234222, "learning_rate": 1.9721495289152237e-08, "loss": 0.0, "num_input_tokens_seen": 11883168, "step": 5115 }, { "epoch": 4.825636192271442, "grad_norm": 0.0020537914242595434, "learning_rate": 1.8703743772430783e-08, "loss": 0.0, "num_input_tokens_seen": 11895584, "step": 5120 }, { "epoch": 4.830348727615457, "grad_norm": 0.006692873314023018, "learning_rate": 1.7712855171655996e-08, "loss": 0.0, "num_input_tokens_seen": 11906784, "step": 5125 }, { "epoch": 4.8350612629594725, "grad_norm": 0.0012435702374204993, "learning_rate": 1.6748840214348972e-08, "loss": 0.0, "num_input_tokens_seen": 11917600, "step": 5130 }, { "epoch": 4.839773798303487, "grad_norm": 0.00899266917258501, "learning_rate": 1.5811709337091862e-08, "loss": 0.0, "num_input_tokens_seen": 11929632, "step": 5135 }, { "epoch": 4.844486333647502, "grad_norm": 0.0016645839205011725, "learning_rate": 1.4901472685415475e-08, "loss": 0.0, "num_input_tokens_seen": 11938720, "step": 5140 }, { "epoch": 4.849198868991517, "grad_norm": 0.00441192090511322, "learning_rate": 1.4018140113689904e-08, "loss": 0.0072, "num_input_tokens_seen": 11951648, "step": 5145 }, { "epoch": 4.853911404335532, "grad_norm": 0.001967532094568014, "learning_rate": 1.3161721185016852e-08, "loss": 0.0, "num_input_tokens_seen": 11962336, "step": 5150 }, { "epoch": 4.858623939679548, "grad_norm": 0.005055475980043411, "learning_rate": 1.2332225171126366e-08, "loss": 0.0, "num_input_tokens_seen": 11975904, "step": 5155 }, { "epoch": 4.863336475023563, "grad_norm": 0.014400842599570751, "learning_rate": 1.152966105227693e-08, "loss": 0.0, "num_input_tokens_seen": 11986208, "step": 5160 }, { "epoch": 4.868049010367578, "grad_norm": 0.0008923859568312764, "learning_rate": 1.0754037517158312e-08, "loss": 0.0, "num_input_tokens_seen": 11999520, "step": 5165 }, { "epoch": 4.872761545711593, "grad_norm": 0.0033309967257082462, "learning_rate": 1.0005362962796362e-08, "loss": 0.0, "num_input_tokens_seen": 12011424, "step": 5170 }, { "epoch": 4.877474081055608, "grad_norm": 0.005915912799537182, "learning_rate": 9.283645494463368e-09, "loss": 0.0, "num_input_tokens_seen": 12024864, "step": 5175 }, { "epoch": 4.882186616399623, "grad_norm": 0.0010997394565492868, "learning_rate": 8.588892925590064e-09, "loss": 0.0, "num_input_tokens_seen": 12035936, "step": 5180 }, { "epoch": 4.886899151743638, "grad_norm": 0.005926317069679499, "learning_rate": 7.92111277768015e-09, "loss": 0.0, "num_input_tokens_seen": 12045920, "step": 5185 }, { "epoch": 4.891611687087654, "grad_norm": 0.011491699144244194, "learning_rate": 7.280312280230073e-09, "loss": 0.0, "num_input_tokens_seen": 12058144, "step": 5190 }, { "epoch": 4.8963242224316685, "grad_norm": 0.0012660275679081678, "learning_rate": 6.666498370650198e-09, "loss": 0.0, "num_input_tokens_seen": 12069792, "step": 5195 }, { "epoch": 4.9010367577756835, "grad_norm": 0.03111676499247551, "learning_rate": 6.079677694189046e-09, "loss": 0.0, "num_input_tokens_seen": 12080864, "step": 5200 }, { "epoch": 4.905749293119698, "grad_norm": 0.0013736054534092546, "learning_rate": 5.5198566038627835e-09, "loss": 0.0, "num_input_tokens_seen": 12092256, "step": 5205 }, { "epoch": 4.910461828463713, "grad_norm": 0.008520975708961487, "learning_rate": 4.987041160385287e-09, "loss": 0.0, "num_input_tokens_seen": 12106784, "step": 5210 }, { "epoch": 4.915174363807728, "grad_norm": 0.038982491940259933, "learning_rate": 4.481237132103189e-09, "loss": 0.0003, "num_input_tokens_seen": 12117088, "step": 5215 }, { "epoch": 4.919886899151743, "grad_norm": 0.0053964899852871895, "learning_rate": 4.002449994932878e-09, "loss": 0.0, "num_input_tokens_seen": 12128736, "step": 5220 }, { "epoch": 4.924599434495759, "grad_norm": 0.0056511214934289455, "learning_rate": 3.550684932301374e-09, "loss": 0.0, "num_input_tokens_seen": 12145376, "step": 5225 }, { "epoch": 4.929311969839774, "grad_norm": 0.004293152131140232, "learning_rate": 3.1259468350910982e-09, "loss": 0.0, "num_input_tokens_seen": 12156320, "step": 5230 }, { "epoch": 4.934024505183789, "grad_norm": 0.002154400572180748, "learning_rate": 2.7282403015849167e-09, "loss": 0.0, "num_input_tokens_seen": 12167968, "step": 5235 }, { "epoch": 4.938737040527804, "grad_norm": 0.0020166414324194193, "learning_rate": 2.3575696374189548e-09, "loss": 0.0, "num_input_tokens_seen": 12179744, "step": 5240 }, { "epoch": 4.943449575871819, "grad_norm": 0.0013167713768780231, "learning_rate": 2.013938855533748e-09, "loss": 0.0001, "num_input_tokens_seen": 12192288, "step": 5245 }, { "epoch": 4.948162111215834, "grad_norm": 0.00897101778537035, "learning_rate": 1.6973516761317755e-09, "loss": 0.0, "num_input_tokens_seen": 12203360, "step": 5250 }, { "epoch": 4.952874646559849, "grad_norm": 0.7944397330284119, "learning_rate": 1.407811526637215e-09, "loss": 0.0007, "num_input_tokens_seen": 12215392, "step": 5255 }, { "epoch": 4.957587181903865, "grad_norm": 0.004428845830261707, "learning_rate": 1.145321541659028e-09, "loss": 0.0, "num_input_tokens_seen": 12227680, "step": 5260 }, { "epoch": 4.9622997172478795, "grad_norm": 0.005368073936551809, "learning_rate": 9.098845629559871e-10, "loss": 0.0, "num_input_tokens_seen": 12242336, "step": 5265 }, { "epoch": 4.9670122525918945, "grad_norm": 0.055548761039972305, "learning_rate": 7.015031394072557e-10, "loss": 0.0, "num_input_tokens_seen": 12251936, "step": 5270 }, { "epoch": 4.971724787935909, "grad_norm": 0.006647925358265638, "learning_rate": 5.201795269837995e-10, "loss": 0.0, "num_input_tokens_seen": 12262432, "step": 5275 }, { "epoch": 4.976437323279924, "grad_norm": 0.0016112312441691756, "learning_rate": 3.6591568872451634e-10, "loss": 0.0, "num_input_tokens_seen": 12275872, "step": 5280 }, { "epoch": 4.981149858623939, "grad_norm": 0.003358457935974002, "learning_rate": 2.387132947151427e-10, "loss": 0.0, "num_input_tokens_seen": 12288928, "step": 5285 }, { "epoch": 4.985862393967954, "grad_norm": 0.011470218189060688, "learning_rate": 1.3857372206882436e-10, "loss": 0.0969, "num_input_tokens_seen": 12300832, "step": 5290 }, { "epoch": 4.99057492931197, "grad_norm": 0.018770242109894753, "learning_rate": 6.549805491307127e-11, "loss": 0.0, "num_input_tokens_seen": 12312352, "step": 5295 }, { "epoch": 4.995287464655985, "grad_norm": 0.002519431058317423, "learning_rate": 1.948708437726765e-11, "loss": 0.0, "num_input_tokens_seen": 12322528, "step": 5300 }, { "epoch": 5.0, "grad_norm": 0.0025235991925001144, "learning_rate": 5.413085829575338e-13, "loss": 0.0, "num_input_tokens_seen": 12333600, "step": 5305 }, { "epoch": 5.0, "num_input_tokens_seen": 12333600, "step": 5305, "total_flos": 7.20143693217792e+16, "train_loss": 0.11108403178044919, "train_runtime": 1575.5047, "train_samples_per_second": 26.925, "train_steps_per_second": 3.367 } ], "logging_steps": 5, "max_steps": 5305, "num_input_tokens_seen": 12333600, "num_train_epochs": 5, "save_steps": 266, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.20143693217792e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }