{ "best_global_step": 4097, "best_metric": 0.2517484128475189, "best_model_checkpoint": "saves_bts_preliminary/base/llama-3.2-1b-instruct/train_cola_42_1774791067/checkpoint-4097", "epoch": 5.0, "eval_steps": 241, "global_step": 4810, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005197505197505198, "grad_norm": 434.86285400390625, "learning_rate": 4.1580041580041583e-07, "loss": 1.2842, "num_input_tokens_seen": 2048, "step": 5 }, { "epoch": 0.010395010395010396, "grad_norm": 175.30227661132812, "learning_rate": 9.355509355509356e-07, "loss": 0.8545, "num_input_tokens_seen": 4224, "step": 10 }, { "epoch": 0.015592515592515593, "grad_norm": 26.521507263183594, "learning_rate": 1.4553014553014554e-06, "loss": 0.3278, "num_input_tokens_seen": 6272, "step": 15 }, { "epoch": 0.02079002079002079, "grad_norm": 86.457275390625, "learning_rate": 1.975051975051975e-06, "loss": 0.365, "num_input_tokens_seen": 8384, "step": 20 }, { "epoch": 0.02598752598752599, "grad_norm": 55.42351150512695, "learning_rate": 2.494802494802495e-06, "loss": 0.4433, "num_input_tokens_seen": 10496, "step": 25 }, { "epoch": 0.031185031185031187, "grad_norm": 22.866464614868164, "learning_rate": 3.014553014553015e-06, "loss": 0.2526, "num_input_tokens_seen": 12544, "step": 30 }, { "epoch": 0.036382536382536385, "grad_norm": 34.69972229003906, "learning_rate": 3.5343035343035348e-06, "loss": 0.3051, "num_input_tokens_seen": 14528, "step": 35 }, { "epoch": 0.04158004158004158, "grad_norm": 62.158714294433594, "learning_rate": 4.0540540540540545e-06, "loss": 0.5129, "num_input_tokens_seen": 16576, "step": 40 }, { "epoch": 0.04677754677754678, "grad_norm": 24.736021041870117, "learning_rate": 4.573804573804574e-06, "loss": 0.283, "num_input_tokens_seen": 18560, "step": 45 }, { "epoch": 0.05197505197505198, "grad_norm": 103.87641906738281, "learning_rate": 5.093555093555094e-06, "loss": 0.454, "num_input_tokens_seen": 20608, "step": 50 }, { "epoch": 0.057172557172557176, "grad_norm": 85.98382568359375, "learning_rate": 5.613305613305614e-06, "loss": 0.2518, "num_input_tokens_seen": 22656, "step": 55 }, { "epoch": 0.062370062370062374, "grad_norm": 15.937103271484375, "learning_rate": 6.1330561330561335e-06, "loss": 0.2538, "num_input_tokens_seen": 24640, "step": 60 }, { "epoch": 0.06756756756756757, "grad_norm": 71.92253112792969, "learning_rate": 6.652806652806653e-06, "loss": 0.3571, "num_input_tokens_seen": 26752, "step": 65 }, { "epoch": 0.07276507276507277, "grad_norm": 41.08773422241211, "learning_rate": 7.172557172557173e-06, "loss": 0.3588, "num_input_tokens_seen": 28608, "step": 70 }, { "epoch": 0.07796257796257797, "grad_norm": 39.851627349853516, "learning_rate": 7.692307692307694e-06, "loss": 0.2553, "num_input_tokens_seen": 30912, "step": 75 }, { "epoch": 0.08316008316008316, "grad_norm": 39.69013595581055, "learning_rate": 8.212058212058212e-06, "loss": 0.3869, "num_input_tokens_seen": 32896, "step": 80 }, { "epoch": 0.08835758835758836, "grad_norm": 15.486207962036133, "learning_rate": 8.731808731808733e-06, "loss": 0.3812, "num_input_tokens_seen": 34816, "step": 85 }, { "epoch": 0.09355509355509356, "grad_norm": 54.213748931884766, "learning_rate": 9.251559251559252e-06, "loss": 0.735, "num_input_tokens_seen": 36736, "step": 90 }, { "epoch": 0.09875259875259876, "grad_norm": 33.433189392089844, "learning_rate": 9.771309771309773e-06, "loss": 0.4027, "num_input_tokens_seen": 38720, "step": 95 }, { "epoch": 0.10395010395010396, "grad_norm": 45.393463134765625, "learning_rate": 1.0291060291060291e-05, "loss": 0.4605, "num_input_tokens_seen": 40640, "step": 100 }, { "epoch": 0.10914760914760915, "grad_norm": 4.37235164642334, "learning_rate": 1.0810810810810812e-05, "loss": 0.2999, "num_input_tokens_seen": 42688, "step": 105 }, { "epoch": 0.11434511434511435, "grad_norm": 41.325050354003906, "learning_rate": 1.1330561330561331e-05, "loss": 0.2839, "num_input_tokens_seen": 44544, "step": 110 }, { "epoch": 0.11954261954261955, "grad_norm": 19.053668975830078, "learning_rate": 1.1850311850311852e-05, "loss": 0.3329, "num_input_tokens_seen": 46400, "step": 115 }, { "epoch": 0.12474012474012475, "grad_norm": 11.979599952697754, "learning_rate": 1.2370062370062372e-05, "loss": 0.2089, "num_input_tokens_seen": 48448, "step": 120 }, { "epoch": 0.12993762993762994, "grad_norm": 20.051830291748047, "learning_rate": 1.2889812889812891e-05, "loss": 0.2829, "num_input_tokens_seen": 50496, "step": 125 }, { "epoch": 0.13513513513513514, "grad_norm": 41.367645263671875, "learning_rate": 1.3409563409563412e-05, "loss": 0.2434, "num_input_tokens_seen": 52416, "step": 130 }, { "epoch": 0.14033264033264034, "grad_norm": 97.8139419555664, "learning_rate": 1.392931392931393e-05, "loss": 0.3621, "num_input_tokens_seen": 54464, "step": 135 }, { "epoch": 0.14553014553014554, "grad_norm": 25.77781867980957, "learning_rate": 1.4449064449064451e-05, "loss": 0.3338, "num_input_tokens_seen": 56448, "step": 140 }, { "epoch": 0.15072765072765074, "grad_norm": 22.036914825439453, "learning_rate": 1.496881496881497e-05, "loss": 0.3185, "num_input_tokens_seen": 58368, "step": 145 }, { "epoch": 0.15592515592515593, "grad_norm": 13.202033996582031, "learning_rate": 1.548856548856549e-05, "loss": 0.2379, "num_input_tokens_seen": 60544, "step": 150 }, { "epoch": 0.16112266112266113, "grad_norm": 27.529069900512695, "learning_rate": 1.600831600831601e-05, "loss": 0.3634, "num_input_tokens_seen": 62592, "step": 155 }, { "epoch": 0.16632016632016633, "grad_norm": 25.028240203857422, "learning_rate": 1.652806652806653e-05, "loss": 0.3093, "num_input_tokens_seen": 64576, "step": 160 }, { "epoch": 0.17151767151767153, "grad_norm": 10.42611026763916, "learning_rate": 1.704781704781705e-05, "loss": 0.3377, "num_input_tokens_seen": 66688, "step": 165 }, { "epoch": 0.17671517671517672, "grad_norm": 4.2057390213012695, "learning_rate": 1.756756756756757e-05, "loss": 0.3224, "num_input_tokens_seen": 68544, "step": 170 }, { "epoch": 0.18191268191268192, "grad_norm": 36.7962532043457, "learning_rate": 1.808731808731809e-05, "loss": 0.3814, "num_input_tokens_seen": 70592, "step": 175 }, { "epoch": 0.18711018711018712, "grad_norm": 9.922293663024902, "learning_rate": 1.8607068607068607e-05, "loss": 0.338, "num_input_tokens_seen": 72576, "step": 180 }, { "epoch": 0.19230769230769232, "grad_norm": 2.9066672325134277, "learning_rate": 1.9126819126819128e-05, "loss": 0.2853, "num_input_tokens_seen": 74624, "step": 185 }, { "epoch": 0.19750519750519752, "grad_norm": 30.27046775817871, "learning_rate": 1.964656964656965e-05, "loss": 0.3374, "num_input_tokens_seen": 76608, "step": 190 }, { "epoch": 0.20270270270270271, "grad_norm": 17.140790939331055, "learning_rate": 2.016632016632017e-05, "loss": 0.3221, "num_input_tokens_seen": 78720, "step": 195 }, { "epoch": 0.2079002079002079, "grad_norm": 75.4980239868164, "learning_rate": 2.068607068607069e-05, "loss": 0.3944, "num_input_tokens_seen": 81152, "step": 200 }, { "epoch": 0.2130977130977131, "grad_norm": 10.222508430480957, "learning_rate": 2.1205821205821207e-05, "loss": 0.3755, "num_input_tokens_seen": 83200, "step": 205 }, { "epoch": 0.2182952182952183, "grad_norm": 9.712939262390137, "learning_rate": 2.1725571725571728e-05, "loss": 0.2949, "num_input_tokens_seen": 85184, "step": 210 }, { "epoch": 0.2234927234927235, "grad_norm": 20.923433303833008, "learning_rate": 2.2245322245322248e-05, "loss": 0.2174, "num_input_tokens_seen": 87232, "step": 215 }, { "epoch": 0.2286902286902287, "grad_norm": 27.973743438720703, "learning_rate": 2.276507276507277e-05, "loss": 0.6706, "num_input_tokens_seen": 89152, "step": 220 }, { "epoch": 0.2338877338877339, "grad_norm": 23.218599319458008, "learning_rate": 2.3284823284823286e-05, "loss": 0.4387, "num_input_tokens_seen": 91328, "step": 225 }, { "epoch": 0.2390852390852391, "grad_norm": 3.9333198070526123, "learning_rate": 2.3804573804573807e-05, "loss": 0.3047, "num_input_tokens_seen": 93312, "step": 230 }, { "epoch": 0.2442827442827443, "grad_norm": 3.1809186935424805, "learning_rate": 2.4324324324324327e-05, "loss": 0.2717, "num_input_tokens_seen": 95296, "step": 235 }, { "epoch": 0.2494802494802495, "grad_norm": 15.368531227111816, "learning_rate": 2.4844074844074848e-05, "loss": 0.2682, "num_input_tokens_seen": 97216, "step": 240 }, { "epoch": 0.2505197505197505, "eval_loss": 0.38468512892723083, "eval_runtime": 1.6403, "eval_samples_per_second": 521.872, "eval_steps_per_second": 65.234, "num_input_tokens_seen": 97664, "step": 241 }, { "epoch": 0.25467775467775466, "grad_norm": 2.458040952682495, "learning_rate": 2.5363825363825365e-05, "loss": 0.344, "num_input_tokens_seen": 99264, "step": 245 }, { "epoch": 0.2598752598752599, "grad_norm": 26.132585525512695, "learning_rate": 2.5883575883575882e-05, "loss": 0.3993, "num_input_tokens_seen": 101184, "step": 250 }, { "epoch": 0.26507276507276506, "grad_norm": 28.206130981445312, "learning_rate": 2.6403326403326406e-05, "loss": 0.4414, "num_input_tokens_seen": 103296, "step": 255 }, { "epoch": 0.2702702702702703, "grad_norm": 6.235988140106201, "learning_rate": 2.6923076923076923e-05, "loss": 0.3322, "num_input_tokens_seen": 105344, "step": 260 }, { "epoch": 0.27546777546777546, "grad_norm": 5.716452121734619, "learning_rate": 2.7442827442827447e-05, "loss": 0.3314, "num_input_tokens_seen": 107392, "step": 265 }, { "epoch": 0.2806652806652807, "grad_norm": 13.819499969482422, "learning_rate": 2.796257796257796e-05, "loss": 0.2951, "num_input_tokens_seen": 109440, "step": 270 }, { "epoch": 0.28586278586278585, "grad_norm": 27.50826644897461, "learning_rate": 2.8482328482328485e-05, "loss": 0.2986, "num_input_tokens_seen": 111424, "step": 275 }, { "epoch": 0.2910602910602911, "grad_norm": 24.54796600341797, "learning_rate": 2.9002079002079002e-05, "loss": 0.407, "num_input_tokens_seen": 113408, "step": 280 }, { "epoch": 0.29625779625779625, "grad_norm": 1.7183665037155151, "learning_rate": 2.9521829521829526e-05, "loss": 0.4506, "num_input_tokens_seen": 115392, "step": 285 }, { "epoch": 0.30145530145530147, "grad_norm": 38.75688552856445, "learning_rate": 3.0041580041580043e-05, "loss": 0.4738, "num_input_tokens_seen": 117440, "step": 290 }, { "epoch": 0.30665280665280664, "grad_norm": 20.901853561401367, "learning_rate": 3.056133056133057e-05, "loss": 0.5248, "num_input_tokens_seen": 119424, "step": 295 }, { "epoch": 0.31185031185031187, "grad_norm": 37.667694091796875, "learning_rate": 3.108108108108108e-05, "loss": 0.3321, "num_input_tokens_seen": 121344, "step": 300 }, { "epoch": 0.31704781704781704, "grad_norm": 21.214815139770508, "learning_rate": 3.16008316008316e-05, "loss": 0.3707, "num_input_tokens_seen": 123264, "step": 305 }, { "epoch": 0.32224532224532226, "grad_norm": 3.6351711750030518, "learning_rate": 3.212058212058212e-05, "loss": 0.3574, "num_input_tokens_seen": 125184, "step": 310 }, { "epoch": 0.32744282744282743, "grad_norm": 2.756866216659546, "learning_rate": 3.264033264033264e-05, "loss": 0.2851, "num_input_tokens_seen": 127296, "step": 315 }, { "epoch": 0.33264033264033266, "grad_norm": 8.542179107666016, "learning_rate": 3.3160083160083164e-05, "loss": 0.2794, "num_input_tokens_seen": 129408, "step": 320 }, { "epoch": 0.33783783783783783, "grad_norm": 5.599982738494873, "learning_rate": 3.3679833679833684e-05, "loss": 0.3015, "num_input_tokens_seen": 131520, "step": 325 }, { "epoch": 0.34303534303534305, "grad_norm": 0.6099509000778198, "learning_rate": 3.41995841995842e-05, "loss": 0.2826, "num_input_tokens_seen": 133568, "step": 330 }, { "epoch": 0.3482328482328482, "grad_norm": 10.96675968170166, "learning_rate": 3.4719334719334725e-05, "loss": 0.3704, "num_input_tokens_seen": 135616, "step": 335 }, { "epoch": 0.35343035343035345, "grad_norm": 36.376678466796875, "learning_rate": 3.523908523908524e-05, "loss": 0.4519, "num_input_tokens_seen": 137664, "step": 340 }, { "epoch": 0.3586278586278586, "grad_norm": 23.873687744140625, "learning_rate": 3.575883575883576e-05, "loss": 0.2863, "num_input_tokens_seen": 139584, "step": 345 }, { "epoch": 0.36382536382536385, "grad_norm": 3.04746675491333, "learning_rate": 3.627858627858628e-05, "loss": 0.4921, "num_input_tokens_seen": 141504, "step": 350 }, { "epoch": 0.369022869022869, "grad_norm": 22.159381866455078, "learning_rate": 3.67983367983368e-05, "loss": 0.3078, "num_input_tokens_seen": 143552, "step": 355 }, { "epoch": 0.37422037422037424, "grad_norm": 4.239355564117432, "learning_rate": 3.731808731808732e-05, "loss": 0.252, "num_input_tokens_seen": 145536, "step": 360 }, { "epoch": 0.3794178794178794, "grad_norm": 33.11611557006836, "learning_rate": 3.783783783783784e-05, "loss": 0.2894, "num_input_tokens_seen": 147456, "step": 365 }, { "epoch": 0.38461538461538464, "grad_norm": 3.5446720123291016, "learning_rate": 3.8357588357588356e-05, "loss": 0.1779, "num_input_tokens_seen": 149440, "step": 370 }, { "epoch": 0.3898128898128898, "grad_norm": 17.057636260986328, "learning_rate": 3.8877338877338883e-05, "loss": 0.4928, "num_input_tokens_seen": 151360, "step": 375 }, { "epoch": 0.39501039501039503, "grad_norm": 11.648699760437012, "learning_rate": 3.93970893970894e-05, "loss": 0.4363, "num_input_tokens_seen": 153344, "step": 380 }, { "epoch": 0.4002079002079002, "grad_norm": 6.347674369812012, "learning_rate": 3.991683991683992e-05, "loss": 0.2689, "num_input_tokens_seen": 155264, "step": 385 }, { "epoch": 0.40540540540540543, "grad_norm": 14.731024742126465, "learning_rate": 4.043659043659044e-05, "loss": 0.2539, "num_input_tokens_seen": 157248, "step": 390 }, { "epoch": 0.4106029106029106, "grad_norm": 11.619532585144043, "learning_rate": 4.095634095634096e-05, "loss": 0.332, "num_input_tokens_seen": 159296, "step": 395 }, { "epoch": 0.4158004158004158, "grad_norm": 20.94300079345703, "learning_rate": 4.147609147609148e-05, "loss": 0.3, "num_input_tokens_seen": 161344, "step": 400 }, { "epoch": 0.420997920997921, "grad_norm": 4.039135456085205, "learning_rate": 4.1995841995842e-05, "loss": 0.2376, "num_input_tokens_seen": 163328, "step": 405 }, { "epoch": 0.4261954261954262, "grad_norm": 9.615825653076172, "learning_rate": 4.2515592515592514e-05, "loss": 0.2709, "num_input_tokens_seen": 165312, "step": 410 }, { "epoch": 0.4313929313929314, "grad_norm": 4.961127758026123, "learning_rate": 4.303534303534304e-05, "loss": 0.1471, "num_input_tokens_seen": 167360, "step": 415 }, { "epoch": 0.4365904365904366, "grad_norm": 55.59805679321289, "learning_rate": 4.3555093555093555e-05, "loss": 0.9994, "num_input_tokens_seen": 169344, "step": 420 }, { "epoch": 0.4417879417879418, "grad_norm": 7.878658294677734, "learning_rate": 4.407484407484408e-05, "loss": 0.603, "num_input_tokens_seen": 171456, "step": 425 }, { "epoch": 0.446985446985447, "grad_norm": 5.158046722412109, "learning_rate": 4.4594594594594596e-05, "loss": 0.3054, "num_input_tokens_seen": 173568, "step": 430 }, { "epoch": 0.4521829521829522, "grad_norm": 1.331925630569458, "learning_rate": 4.511434511434512e-05, "loss": 0.3208, "num_input_tokens_seen": 175552, "step": 435 }, { "epoch": 0.4573804573804574, "grad_norm": 7.393728733062744, "learning_rate": 4.563409563409564e-05, "loss": 0.2942, "num_input_tokens_seen": 177536, "step": 440 }, { "epoch": 0.4625779625779626, "grad_norm": 15.99036693572998, "learning_rate": 4.615384615384616e-05, "loss": 0.3005, "num_input_tokens_seen": 179584, "step": 445 }, { "epoch": 0.4677754677754678, "grad_norm": 1.2690688371658325, "learning_rate": 4.667359667359668e-05, "loss": 1.7773, "num_input_tokens_seen": 181568, "step": 450 }, { "epoch": 0.47297297297297297, "grad_norm": 3.469167709350586, "learning_rate": 4.71933471933472e-05, "loss": 0.2889, "num_input_tokens_seen": 183552, "step": 455 }, { "epoch": 0.4781704781704782, "grad_norm": 90.938232421875, "learning_rate": 4.771309771309771e-05, "loss": 0.2556, "num_input_tokens_seen": 185600, "step": 460 }, { "epoch": 0.48336798336798337, "grad_norm": 2.4140217304229736, "learning_rate": 4.823284823284824e-05, "loss": 0.2378, "num_input_tokens_seen": 187584, "step": 465 }, { "epoch": 0.4885654885654886, "grad_norm": 4.565840721130371, "learning_rate": 4.8752598752598754e-05, "loss": 0.2859, "num_input_tokens_seen": 189568, "step": 470 }, { "epoch": 0.49376299376299376, "grad_norm": 12.585880279541016, "learning_rate": 4.9272349272349275e-05, "loss": 0.2438, "num_input_tokens_seen": 191680, "step": 475 }, { "epoch": 0.498960498960499, "grad_norm": 3.2450168132781982, "learning_rate": 4.9792099792099796e-05, "loss": 0.3383, "num_input_tokens_seen": 193728, "step": 480 }, { "epoch": 0.501039501039501, "eval_loss": 0.4080815315246582, "eval_runtime": 2.6441, "eval_samples_per_second": 323.736, "eval_steps_per_second": 40.467, "num_input_tokens_seen": 194560, "step": 482 }, { "epoch": 0.5041580041580042, "grad_norm": 29.770891189575195, "learning_rate": 4.999994075155936e-05, "loss": 0.381, "num_input_tokens_seen": 195776, "step": 485 }, { "epoch": 0.5093555093555093, "grad_norm": 19.471193313598633, "learning_rate": 4.999957867877242e-05, "loss": 0.2916, "num_input_tokens_seen": 197696, "step": 490 }, { "epoch": 0.5145530145530145, "grad_norm": 29.544788360595703, "learning_rate": 4.999888745376028e-05, "loss": 0.2766, "num_input_tokens_seen": 199680, "step": 495 }, { "epoch": 0.5197505197505198, "grad_norm": 11.05797290802002, "learning_rate": 4.9997867085623824e-05, "loss": 0.2596, "num_input_tokens_seen": 201792, "step": 500 }, { "epoch": 0.524948024948025, "grad_norm": 3.8948936462402344, "learning_rate": 4.999651758779754e-05, "loss": 0.2689, "num_input_tokens_seen": 203840, "step": 505 }, { "epoch": 0.5301455301455301, "grad_norm": 18.150039672851562, "learning_rate": 4.999483897804933e-05, "loss": 0.3363, "num_input_tokens_seen": 205824, "step": 510 }, { "epoch": 0.5353430353430353, "grad_norm": 1.8224319219589233, "learning_rate": 4.999283127848029e-05, "loss": 0.2583, "num_input_tokens_seen": 207936, "step": 515 }, { "epoch": 0.5405405405405406, "grad_norm": 9.910496711730957, "learning_rate": 4.999049451552443e-05, "loss": 0.3915, "num_input_tokens_seen": 209984, "step": 520 }, { "epoch": 0.5457380457380457, "grad_norm": 0.8137359619140625, "learning_rate": 4.9987828719948284e-05, "loss": 0.2461, "num_input_tokens_seen": 212096, "step": 525 }, { "epoch": 0.5509355509355509, "grad_norm": 2.0492968559265137, "learning_rate": 4.998483392685055e-05, "loss": 0.2863, "num_input_tokens_seen": 214080, "step": 530 }, { "epoch": 0.5561330561330561, "grad_norm": 24.322450637817383, "learning_rate": 4.9981510175661606e-05, "loss": 0.3052, "num_input_tokens_seen": 216128, "step": 535 }, { "epoch": 0.5613305613305614, "grad_norm": 28.344161987304688, "learning_rate": 4.9977857510143e-05, "loss": 0.3625, "num_input_tokens_seen": 218176, "step": 540 }, { "epoch": 0.5665280665280665, "grad_norm": 11.159996032714844, "learning_rate": 4.9973875978386843e-05, "loss": 0.2801, "num_input_tokens_seen": 220096, "step": 545 }, { "epoch": 0.5717255717255717, "grad_norm": 14.94636058807373, "learning_rate": 4.996956563281524e-05, "loss": 0.3041, "num_input_tokens_seen": 222080, "step": 550 }, { "epoch": 0.5769230769230769, "grad_norm": 8.506303787231445, "learning_rate": 4.996492653017952e-05, "loss": 0.2498, "num_input_tokens_seen": 224000, "step": 555 }, { "epoch": 0.5821205821205822, "grad_norm": 18.009078979492188, "learning_rate": 4.995995873155958e-05, "loss": 0.3224, "num_input_tokens_seen": 225984, "step": 560 }, { "epoch": 0.5873180873180873, "grad_norm": 19.32984733581543, "learning_rate": 4.9954662302362973e-05, "loss": 0.322, "num_input_tokens_seen": 227840, "step": 565 }, { "epoch": 0.5925155925155925, "grad_norm": 18.17301368713379, "learning_rate": 4.9949037312324155e-05, "loss": 0.302, "num_input_tokens_seen": 229824, "step": 570 }, { "epoch": 0.5977130977130977, "grad_norm": 14.943294525146484, "learning_rate": 4.9943083835503467e-05, "loss": 0.3875, "num_input_tokens_seen": 231872, "step": 575 }, { "epoch": 0.6029106029106029, "grad_norm": 12.56007194519043, "learning_rate": 4.993680195028626e-05, "loss": 0.314, "num_input_tokens_seen": 233920, "step": 580 }, { "epoch": 0.6081081081081081, "grad_norm": 12.372591972351074, "learning_rate": 4.9930191739381775e-05, "loss": 0.3189, "num_input_tokens_seen": 235840, "step": 585 }, { "epoch": 0.6133056133056133, "grad_norm": 11.213384628295898, "learning_rate": 4.9923253289822116e-05, "loss": 0.3418, "num_input_tokens_seen": 238016, "step": 590 }, { "epoch": 0.6185031185031185, "grad_norm": 12.969761848449707, "learning_rate": 4.9915986692961045e-05, "loss": 0.3307, "num_input_tokens_seen": 240064, "step": 595 }, { "epoch": 0.6237006237006237, "grad_norm": 1.1426069736480713, "learning_rate": 4.9908392044472865e-05, "loss": 0.2835, "num_input_tokens_seen": 242048, "step": 600 }, { "epoch": 0.6288981288981289, "grad_norm": 7.80474853515625, "learning_rate": 4.990046944435105e-05, "loss": 0.2584, "num_input_tokens_seen": 243968, "step": 605 }, { "epoch": 0.6340956340956341, "grad_norm": 1.6327171325683594, "learning_rate": 4.989221899690704e-05, "loss": 0.2953, "num_input_tokens_seen": 246016, "step": 610 }, { "epoch": 0.6392931392931392, "grad_norm": 16.204669952392578, "learning_rate": 4.9883640810768764e-05, "loss": 0.3214, "num_input_tokens_seen": 248000, "step": 615 }, { "epoch": 0.6444906444906445, "grad_norm": 0.29481250047683716, "learning_rate": 4.9874734998879316e-05, "loss": 0.312, "num_input_tokens_seen": 250048, "step": 620 }, { "epoch": 0.6496881496881497, "grad_norm": 18.51386833190918, "learning_rate": 4.9865501678495375e-05, "loss": 0.3215, "num_input_tokens_seen": 252096, "step": 625 }, { "epoch": 0.6548856548856549, "grad_norm": 5.452864646911621, "learning_rate": 4.98559409711857e-05, "loss": 0.2892, "num_input_tokens_seen": 254144, "step": 630 }, { "epoch": 0.66008316008316, "grad_norm": 11.4102783203125, "learning_rate": 4.984605300282954e-05, "loss": 0.3016, "num_input_tokens_seen": 256128, "step": 635 }, { "epoch": 0.6652806652806653, "grad_norm": 8.976471900939941, "learning_rate": 4.983583790361497e-05, "loss": 0.2596, "num_input_tokens_seen": 258048, "step": 640 }, { "epoch": 0.6704781704781705, "grad_norm": 27.530433654785156, "learning_rate": 4.982529580803714e-05, "loss": 0.363, "num_input_tokens_seen": 260352, "step": 645 }, { "epoch": 0.6756756756756757, "grad_norm": 12.061158180236816, "learning_rate": 4.981442685489659e-05, "loss": 0.2849, "num_input_tokens_seen": 262272, "step": 650 }, { "epoch": 0.6808731808731808, "grad_norm": 13.447040557861328, "learning_rate": 4.9803231187297304e-05, "loss": 0.2969, "num_input_tokens_seen": 264320, "step": 655 }, { "epoch": 0.6860706860706861, "grad_norm": 33.57612228393555, "learning_rate": 4.979170895264494e-05, "loss": 0.3962, "num_input_tokens_seen": 266240, "step": 660 }, { "epoch": 0.6912681912681913, "grad_norm": 4.9110002517700195, "learning_rate": 4.977986030264482e-05, "loss": 0.312, "num_input_tokens_seen": 268224, "step": 665 }, { "epoch": 0.6964656964656964, "grad_norm": 0.21739044785499573, "learning_rate": 4.976768539329994e-05, "loss": 0.2552, "num_input_tokens_seen": 270272, "step": 670 }, { "epoch": 0.7016632016632016, "grad_norm": 8.439529418945312, "learning_rate": 4.975518438490897e-05, "loss": 0.312, "num_input_tokens_seen": 272256, "step": 675 }, { "epoch": 0.7068607068607069, "grad_norm": 10.565657615661621, "learning_rate": 4.9742357442064045e-05, "loss": 0.2889, "num_input_tokens_seen": 274240, "step": 680 }, { "epoch": 0.7120582120582121, "grad_norm": 19.482200622558594, "learning_rate": 4.972920473364869e-05, "loss": 0.2841, "num_input_tokens_seen": 276288, "step": 685 }, { "epoch": 0.7172557172557172, "grad_norm": 21.85175132751465, "learning_rate": 4.971572643283557e-05, "loss": 0.3076, "num_input_tokens_seen": 278272, "step": 690 }, { "epoch": 0.7224532224532224, "grad_norm": 7.696235656738281, "learning_rate": 4.970192271708416e-05, "loss": 0.285, "num_input_tokens_seen": 280384, "step": 695 }, { "epoch": 0.7276507276507277, "grad_norm": 4.778509616851807, "learning_rate": 4.968779376813849e-05, "loss": 0.2376, "num_input_tokens_seen": 282368, "step": 700 }, { "epoch": 0.7328482328482329, "grad_norm": 9.415769577026367, "learning_rate": 4.967333977202469e-05, "loss": 0.2787, "num_input_tokens_seen": 284416, "step": 705 }, { "epoch": 0.738045738045738, "grad_norm": 4.963840484619141, "learning_rate": 4.965856091904855e-05, "loss": 0.214, "num_input_tokens_seen": 286464, "step": 710 }, { "epoch": 0.7432432432432432, "grad_norm": 12.690200805664062, "learning_rate": 4.964345740379307e-05, "loss": 0.2858, "num_input_tokens_seen": 288448, "step": 715 }, { "epoch": 0.7484407484407485, "grad_norm": 15.548615455627441, "learning_rate": 4.962802942511581e-05, "loss": 0.2962, "num_input_tokens_seen": 290496, "step": 720 }, { "epoch": 0.7515592515592515, "eval_loss": 0.29600390791893005, "eval_runtime": 3.3797, "eval_samples_per_second": 253.278, "eval_steps_per_second": 31.66, "num_input_tokens_seen": 291712, "step": 723 }, { "epoch": 0.7536382536382537, "grad_norm": 7.057698726654053, "learning_rate": 4.9612277186146335e-05, "loss": 0.3061, "num_input_tokens_seen": 292480, "step": 725 }, { "epoch": 0.7588357588357588, "grad_norm": 17.72517204284668, "learning_rate": 4.959620089428354e-05, "loss": 0.281, "num_input_tokens_seen": 294464, "step": 730 }, { "epoch": 0.764033264033264, "grad_norm": 1.6551860570907593, "learning_rate": 4.957980076119285e-05, "loss": 0.2702, "num_input_tokens_seen": 296448, "step": 735 }, { "epoch": 0.7692307692307693, "grad_norm": 9.719671249389648, "learning_rate": 4.956307700280354e-05, "loss": 0.3146, "num_input_tokens_seen": 298432, "step": 740 }, { "epoch": 0.7744282744282744, "grad_norm": 8.770389556884766, "learning_rate": 4.954602983930581e-05, "loss": 0.2567, "num_input_tokens_seen": 300480, "step": 745 }, { "epoch": 0.7796257796257796, "grad_norm": 7.721967697143555, "learning_rate": 4.95286594951479e-05, "loss": 0.2488, "num_input_tokens_seen": 302400, "step": 750 }, { "epoch": 0.7848232848232848, "grad_norm": 7.888345241546631, "learning_rate": 4.9510966199033174e-05, "loss": 0.2852, "num_input_tokens_seen": 304320, "step": 755 }, { "epoch": 0.7900207900207901, "grad_norm": 8.099255561828613, "learning_rate": 4.949295018391706e-05, "loss": 0.2968, "num_input_tokens_seen": 306240, "step": 760 }, { "epoch": 0.7952182952182952, "grad_norm": 7.542270183563232, "learning_rate": 4.947461168700402e-05, "loss": 0.2957, "num_input_tokens_seen": 308032, "step": 765 }, { "epoch": 0.8004158004158004, "grad_norm": 4.36430025100708, "learning_rate": 4.945595094974442e-05, "loss": 0.2531, "num_input_tokens_seen": 309952, "step": 770 }, { "epoch": 0.8056133056133056, "grad_norm": 11.39189624786377, "learning_rate": 4.94369682178313e-05, "loss": 0.2396, "num_input_tokens_seen": 311936, "step": 775 }, { "epoch": 0.8108108108108109, "grad_norm": 3.859365224838257, "learning_rate": 4.9417663741197236e-05, "loss": 0.3127, "num_input_tokens_seen": 313920, "step": 780 }, { "epoch": 0.816008316008316, "grad_norm": 10.639534950256348, "learning_rate": 4.939803777401095e-05, "loss": 0.2873, "num_input_tokens_seen": 315968, "step": 785 }, { "epoch": 0.8212058212058212, "grad_norm": 8.708645820617676, "learning_rate": 4.937809057467404e-05, "loss": 0.263, "num_input_tokens_seen": 317952, "step": 790 }, { "epoch": 0.8264033264033264, "grad_norm": 3.0771279335021973, "learning_rate": 4.935782240581752e-05, "loss": 0.2762, "num_input_tokens_seen": 319872, "step": 795 }, { "epoch": 0.8316008316008316, "grad_norm": 9.068258285522461, "learning_rate": 4.9337233534298425e-05, "loss": 0.2759, "num_input_tokens_seen": 321856, "step": 800 }, { "epoch": 0.8367983367983368, "grad_norm": 11.020606994628906, "learning_rate": 4.931632423119621e-05, "loss": 0.2849, "num_input_tokens_seen": 323968, "step": 805 }, { "epoch": 0.841995841995842, "grad_norm": 11.716889381408691, "learning_rate": 4.9295094771809285e-05, "loss": 0.2775, "num_input_tokens_seen": 325952, "step": 810 }, { "epoch": 0.8471933471933472, "grad_norm": 2.4100873470306396, "learning_rate": 4.92735454356513e-05, "loss": 0.1962, "num_input_tokens_seen": 328000, "step": 815 }, { "epoch": 0.8523908523908524, "grad_norm": 18.39780044555664, "learning_rate": 4.925167650644752e-05, "loss": 0.2237, "num_input_tokens_seen": 329984, "step": 820 }, { "epoch": 0.8575883575883576, "grad_norm": 31.814456939697266, "learning_rate": 4.9229488272131067e-05, "loss": 0.3432, "num_input_tokens_seen": 331904, "step": 825 }, { "epoch": 0.8627858627858628, "grad_norm": 13.310869216918945, "learning_rate": 4.920698102483912e-05, "loss": 0.3102, "num_input_tokens_seen": 333888, "step": 830 }, { "epoch": 0.867983367983368, "grad_norm": 1.8199642896652222, "learning_rate": 4.918415506090911e-05, "loss": 0.3035, "num_input_tokens_seen": 335872, "step": 835 }, { "epoch": 0.8731808731808732, "grad_norm": 14.503442764282227, "learning_rate": 4.916101068087476e-05, "loss": 0.2682, "num_input_tokens_seen": 337856, "step": 840 }, { "epoch": 0.8783783783783784, "grad_norm": 1.1979509592056274, "learning_rate": 4.913754818946219e-05, "loss": 0.2422, "num_input_tokens_seen": 339776, "step": 845 }, { "epoch": 0.8835758835758836, "grad_norm": 5.467031478881836, "learning_rate": 4.911376789558584e-05, "loss": 0.1949, "num_input_tokens_seen": 341760, "step": 850 }, { "epoch": 0.8887733887733887, "grad_norm": 26.362804412841797, "learning_rate": 4.9089670112344456e-05, "loss": 0.3477, "num_input_tokens_seen": 343680, "step": 855 }, { "epoch": 0.893970893970894, "grad_norm": 13.462319374084473, "learning_rate": 4.906525515701695e-05, "loss": 0.292, "num_input_tokens_seen": 345600, "step": 860 }, { "epoch": 0.8991683991683992, "grad_norm": 15.104516983032227, "learning_rate": 4.904052335105822e-05, "loss": 0.2896, "num_input_tokens_seen": 347520, "step": 865 }, { "epoch": 0.9043659043659044, "grad_norm": 2.1453566551208496, "learning_rate": 4.90154750200949e-05, "loss": 0.2644, "num_input_tokens_seen": 349568, "step": 870 }, { "epoch": 0.9095634095634095, "grad_norm": 30.04273223876953, "learning_rate": 4.8990110493921105e-05, "loss": 0.3212, "num_input_tokens_seen": 351552, "step": 875 }, { "epoch": 0.9147609147609148, "grad_norm": 9.758408546447754, "learning_rate": 4.8964430106494075e-05, "loss": 0.2787, "num_input_tokens_seen": 353472, "step": 880 }, { "epoch": 0.91995841995842, "grad_norm": 2.3094310760498047, "learning_rate": 4.893843419592977e-05, "loss": 0.2656, "num_input_tokens_seen": 355392, "step": 885 }, { "epoch": 0.9251559251559252, "grad_norm": 2.449030876159668, "learning_rate": 4.891212310449844e-05, "loss": 0.2593, "num_input_tokens_seen": 357440, "step": 890 }, { "epoch": 0.9303534303534303, "grad_norm": 5.086390972137451, "learning_rate": 4.8885497178620095e-05, "loss": 0.2785, "num_input_tokens_seen": 359488, "step": 895 }, { "epoch": 0.9355509355509356, "grad_norm": 12.254743576049805, "learning_rate": 4.8858556768859944e-05, "loss": 0.283, "num_input_tokens_seen": 361408, "step": 900 }, { "epoch": 0.9407484407484408, "grad_norm": 8.048048973083496, "learning_rate": 4.88313022299238e-05, "loss": 0.27, "num_input_tokens_seen": 363392, "step": 905 }, { "epoch": 0.9459459459459459, "grad_norm": 13.520707130432129, "learning_rate": 4.88037339206534e-05, "loss": 0.2987, "num_input_tokens_seen": 365440, "step": 910 }, { "epoch": 0.9511434511434511, "grad_norm": 18.515222549438477, "learning_rate": 4.8775852204021665e-05, "loss": 0.2998, "num_input_tokens_seen": 367616, "step": 915 }, { "epoch": 0.9563409563409564, "grad_norm": 25.785511016845703, "learning_rate": 4.874765744712796e-05, "loss": 0.358, "num_input_tokens_seen": 369600, "step": 920 }, { "epoch": 0.9615384615384616, "grad_norm": 8.675851821899414, "learning_rate": 4.871915002119321e-05, "loss": 0.2755, "num_input_tokens_seen": 371520, "step": 925 }, { "epoch": 0.9667359667359667, "grad_norm": 0.7464810013771057, "learning_rate": 4.8690330301555045e-05, "loss": 0.3085, "num_input_tokens_seen": 373568, "step": 930 }, { "epoch": 0.9719334719334719, "grad_norm": 21.288469314575195, "learning_rate": 4.8661198667662854e-05, "loss": 0.2548, "num_input_tokens_seen": 375488, "step": 935 }, { "epoch": 0.9771309771309772, "grad_norm": 15.424735069274902, "learning_rate": 4.86317555030728e-05, "loss": 0.3274, "num_input_tokens_seen": 377728, "step": 940 }, { "epoch": 0.9823284823284824, "grad_norm": 11.471841812133789, "learning_rate": 4.8602001195442725e-05, "loss": 0.2924, "num_input_tokens_seen": 379840, "step": 945 }, { "epoch": 0.9875259875259875, "grad_norm": 12.289169311523438, "learning_rate": 4.857193613652711e-05, "loss": 0.2685, "num_input_tokens_seen": 381760, "step": 950 }, { "epoch": 0.9927234927234927, "grad_norm": 2.2855138778686523, "learning_rate": 4.8541560722171855e-05, "loss": 0.2743, "num_input_tokens_seen": 383808, "step": 955 }, { "epoch": 0.997920997920998, "grad_norm": 14.171538352966309, "learning_rate": 4.8510875352309106e-05, "loss": 0.2807, "num_input_tokens_seen": 385856, "step": 960 }, { "epoch": 1.002079002079002, "eval_loss": 0.2738620638847351, "eval_runtime": 1.0463, "eval_samples_per_second": 818.137, "eval_steps_per_second": 102.267, "num_input_tokens_seen": 387464, "step": 964 }, { "epoch": 1.003118503118503, "grad_norm": 1.732373833656311, "learning_rate": 4.8479880430951995e-05, "loss": 0.2643, "num_input_tokens_seen": 387848, "step": 965 }, { "epoch": 1.0083160083160083, "grad_norm": 9.852456092834473, "learning_rate": 4.844857636618928e-05, "loss": 0.2613, "num_input_tokens_seen": 389640, "step": 970 }, { "epoch": 1.0135135135135136, "grad_norm": 13.70590877532959, "learning_rate": 4.8416963570180025e-05, "loss": 0.2824, "num_input_tokens_seen": 391624, "step": 975 }, { "epoch": 1.0187110187110187, "grad_norm": 10.222359657287598, "learning_rate": 4.838504245914812e-05, "loss": 0.3008, "num_input_tokens_seen": 393672, "step": 980 }, { "epoch": 1.023908523908524, "grad_norm": 24.265884399414062, "learning_rate": 4.8352813453376836e-05, "loss": 0.3048, "num_input_tokens_seen": 395784, "step": 985 }, { "epoch": 1.0291060291060292, "grad_norm": 20.11310577392578, "learning_rate": 4.83202769772033e-05, "loss": 0.3205, "num_input_tokens_seen": 397768, "step": 990 }, { "epoch": 1.0343035343035343, "grad_norm": 19.961780548095703, "learning_rate": 4.8287433459012844e-05, "loss": 0.2694, "num_input_tokens_seen": 399816, "step": 995 }, { "epoch": 1.0395010395010396, "grad_norm": 3.684941530227661, "learning_rate": 4.8254283331233464e-05, "loss": 0.3263, "num_input_tokens_seen": 401928, "step": 1000 }, { "epoch": 1.0446985446985446, "grad_norm": 1.1775870323181152, "learning_rate": 4.822082703033003e-05, "loss": 0.3028, "num_input_tokens_seen": 403912, "step": 1005 }, { "epoch": 1.04989604989605, "grad_norm": 7.7064595222473145, "learning_rate": 4.818706499679862e-05, "loss": 0.2501, "num_input_tokens_seen": 405832, "step": 1010 }, { "epoch": 1.0550935550935552, "grad_norm": 20.951908111572266, "learning_rate": 4.815299767516065e-05, "loss": 0.339, "num_input_tokens_seen": 407880, "step": 1015 }, { "epoch": 1.0602910602910602, "grad_norm": 0.5823318958282471, "learning_rate": 4.8118625513957074e-05, "loss": 0.2748, "num_input_tokens_seen": 410120, "step": 1020 }, { "epoch": 1.0654885654885655, "grad_norm": 5.90387487411499, "learning_rate": 4.808394896574245e-05, "loss": 0.3246, "num_input_tokens_seen": 412168, "step": 1025 }, { "epoch": 1.0706860706860706, "grad_norm": 13.779071807861328, "learning_rate": 4.8048968487079e-05, "loss": 0.2849, "num_input_tokens_seen": 414472, "step": 1030 }, { "epoch": 1.0758835758835759, "grad_norm": 15.818516731262207, "learning_rate": 4.8013684538530565e-05, "loss": 0.2908, "num_input_tokens_seen": 416520, "step": 1035 }, { "epoch": 1.0810810810810811, "grad_norm": 11.890518188476562, "learning_rate": 4.79780975846566e-05, "loss": 0.2774, "num_input_tokens_seen": 418568, "step": 1040 }, { "epoch": 1.0862785862785862, "grad_norm": 7.704615592956543, "learning_rate": 4.7942208094006e-05, "loss": 0.2417, "num_input_tokens_seen": 420488, "step": 1045 }, { "epoch": 1.0914760914760915, "grad_norm": 2.7734618186950684, "learning_rate": 4.790601653911094e-05, "loss": 0.2736, "num_input_tokens_seen": 422472, "step": 1050 }, { "epoch": 1.0966735966735968, "grad_norm": 12.339665412902832, "learning_rate": 4.786952339648071e-05, "loss": 0.3165, "num_input_tokens_seen": 424456, "step": 1055 }, { "epoch": 1.1018711018711018, "grad_norm": 3.863492250442505, "learning_rate": 4.783272914659535e-05, "loss": 0.305, "num_input_tokens_seen": 426568, "step": 1060 }, { "epoch": 1.107068607068607, "grad_norm": 5.703735828399658, "learning_rate": 4.77956342738994e-05, "loss": 0.2809, "num_input_tokens_seen": 428552, "step": 1065 }, { "epoch": 1.1122661122661124, "grad_norm": 5.661115646362305, "learning_rate": 4.775823926679548e-05, "loss": 0.2758, "num_input_tokens_seen": 430472, "step": 1070 }, { "epoch": 1.1174636174636174, "grad_norm": 5.488950729370117, "learning_rate": 4.77205446176379e-05, "loss": 0.2657, "num_input_tokens_seen": 432328, "step": 1075 }, { "epoch": 1.1226611226611227, "grad_norm": 10.569025993347168, "learning_rate": 4.768255082272611e-05, "loss": 0.2881, "num_input_tokens_seen": 434440, "step": 1080 }, { "epoch": 1.1278586278586278, "grad_norm": 3.0698752403259277, "learning_rate": 4.764425838229824e-05, "loss": 0.2938, "num_input_tokens_seen": 436488, "step": 1085 }, { "epoch": 1.133056133056133, "grad_norm": 23.730390548706055, "learning_rate": 4.760566780052445e-05, "loss": 0.4153, "num_input_tokens_seen": 438472, "step": 1090 }, { "epoch": 1.1382536382536383, "grad_norm": 0.6121450066566467, "learning_rate": 4.7566779585500347e-05, "loss": 0.3296, "num_input_tokens_seen": 440456, "step": 1095 }, { "epoch": 1.1434511434511434, "grad_norm": 4.771881103515625, "learning_rate": 4.7527594249240264e-05, "loss": 0.3082, "num_input_tokens_seen": 442440, "step": 1100 }, { "epoch": 1.1486486486486487, "grad_norm": 7.00406551361084, "learning_rate": 4.748811230767051e-05, "loss": 0.3109, "num_input_tokens_seen": 444424, "step": 1105 }, { "epoch": 1.1538461538461537, "grad_norm": 0.35836905241012573, "learning_rate": 4.744833428062262e-05, "loss": 0.2754, "num_input_tokens_seen": 446280, "step": 1110 }, { "epoch": 1.159043659043659, "grad_norm": 12.108142852783203, "learning_rate": 4.740826069182645e-05, "loss": 0.286, "num_input_tokens_seen": 448264, "step": 1115 }, { "epoch": 1.1642411642411643, "grad_norm": 2.960707902908325, "learning_rate": 4.736789206890332e-05, "loss": 0.2806, "num_input_tokens_seen": 450376, "step": 1120 }, { "epoch": 1.1694386694386694, "grad_norm": 1.6544653177261353, "learning_rate": 4.732722894335909e-05, "loss": 0.2575, "num_input_tokens_seen": 452552, "step": 1125 }, { "epoch": 1.1746361746361746, "grad_norm": 4.431951999664307, "learning_rate": 4.7286271850577105e-05, "loss": 0.2989, "num_input_tokens_seen": 454600, "step": 1130 }, { "epoch": 1.17983367983368, "grad_norm": 27.89082145690918, "learning_rate": 4.724502132981119e-05, "loss": 0.5214, "num_input_tokens_seen": 456648, "step": 1135 }, { "epoch": 1.185031185031185, "grad_norm": 12.9546480178833, "learning_rate": 4.7203477924178506e-05, "loss": 0.3325, "num_input_tokens_seen": 458632, "step": 1140 }, { "epoch": 1.1902286902286903, "grad_norm": 10.19326114654541, "learning_rate": 4.7161642180652464e-05, "loss": 0.2795, "num_input_tokens_seen": 460680, "step": 1145 }, { "epoch": 1.1954261954261955, "grad_norm": 11.509027481079102, "learning_rate": 4.7119514650055476e-05, "loss": 0.2697, "num_input_tokens_seen": 462728, "step": 1150 }, { "epoch": 1.2006237006237006, "grad_norm": 11.531643867492676, "learning_rate": 4.7077095887051686e-05, "loss": 0.2429, "num_input_tokens_seen": 464776, "step": 1155 }, { "epoch": 1.2058212058212059, "grad_norm": 2.4489336013793945, "learning_rate": 4.7034386450139735e-05, "loss": 0.2844, "num_input_tokens_seen": 466696, "step": 1160 }, { "epoch": 1.211018711018711, "grad_norm": 10.006536483764648, "learning_rate": 4.699138690164533e-05, "loss": 0.259, "num_input_tokens_seen": 468616, "step": 1165 }, { "epoch": 1.2162162162162162, "grad_norm": 5.630403995513916, "learning_rate": 4.694809780771391e-05, "loss": 0.2734, "num_input_tokens_seen": 470728, "step": 1170 }, { "epoch": 1.2214137214137215, "grad_norm": 9.379544258117676, "learning_rate": 4.690451973830313e-05, "loss": 0.2763, "num_input_tokens_seen": 472776, "step": 1175 }, { "epoch": 1.2266112266112266, "grad_norm": 4.9165167808532715, "learning_rate": 4.6860653267175416e-05, "loss": 0.2615, "num_input_tokens_seen": 474824, "step": 1180 }, { "epoch": 1.2318087318087318, "grad_norm": 7.163639068603516, "learning_rate": 4.681649897189036e-05, "loss": 0.2894, "num_input_tokens_seen": 476744, "step": 1185 }, { "epoch": 1.237006237006237, "grad_norm": 3.139596939086914, "learning_rate": 4.677205743379713e-05, "loss": 0.2065, "num_input_tokens_seen": 478856, "step": 1190 }, { "epoch": 1.2422037422037422, "grad_norm": 18.728031158447266, "learning_rate": 4.672732923802685e-05, "loss": 0.4129, "num_input_tokens_seen": 480776, "step": 1195 }, { "epoch": 1.2474012474012475, "grad_norm": 21.870121002197266, "learning_rate": 4.668231497348484e-05, "loss": 0.2716, "num_input_tokens_seen": 482952, "step": 1200 }, { "epoch": 1.2525987525987525, "grad_norm": 7.45676851272583, "learning_rate": 4.663701523284291e-05, "loss": 0.2836, "num_input_tokens_seen": 485192, "step": 1205 }, { "epoch": 1.2525987525987525, "eval_loss": 0.2581372559070587, "eval_runtime": 1.5453, "eval_samples_per_second": 553.946, "eval_steps_per_second": 69.243, "num_input_tokens_seen": 485192, "step": 1205 }, { "epoch": 1.2577962577962578, "grad_norm": 19.655874252319336, "learning_rate": 4.6591430612531515e-05, "loss": 0.2541, "num_input_tokens_seen": 487112, "step": 1210 }, { "epoch": 1.262993762993763, "grad_norm": 11.319998741149902, "learning_rate": 4.6545561712731954e-05, "loss": 0.3056, "num_input_tokens_seen": 489160, "step": 1215 }, { "epoch": 1.2681912681912682, "grad_norm": 6.425257205963135, "learning_rate": 4.649940913736841e-05, "loss": 0.2656, "num_input_tokens_seen": 491080, "step": 1220 }, { "epoch": 1.2733887733887734, "grad_norm": 7.79650354385376, "learning_rate": 4.645297349410005e-05, "loss": 0.2917, "num_input_tokens_seen": 493064, "step": 1225 }, { "epoch": 1.2785862785862787, "grad_norm": 8.849406242370605, "learning_rate": 4.640625539431298e-05, "loss": 0.2878, "num_input_tokens_seen": 494984, "step": 1230 }, { "epoch": 1.2837837837837838, "grad_norm": 10.014208793640137, "learning_rate": 4.635925545311224e-05, "loss": 0.2686, "num_input_tokens_seen": 496968, "step": 1235 }, { "epoch": 1.288981288981289, "grad_norm": 3.8225860595703125, "learning_rate": 4.6311974289313646e-05, "loss": 0.2747, "num_input_tokens_seen": 498824, "step": 1240 }, { "epoch": 1.2941787941787941, "grad_norm": 9.505216598510742, "learning_rate": 4.6264412525435716e-05, "loss": 0.2269, "num_input_tokens_seen": 500808, "step": 1245 }, { "epoch": 1.2993762993762994, "grad_norm": 12.160257339477539, "learning_rate": 4.6216570787691423e-05, "loss": 0.2595, "num_input_tokens_seen": 502856, "step": 1250 }, { "epoch": 1.3045738045738045, "grad_norm": 2.7872188091278076, "learning_rate": 4.6168449705979956e-05, "loss": 0.2367, "num_input_tokens_seen": 504712, "step": 1255 }, { "epoch": 1.3097713097713097, "grad_norm": 1.930995225906372, "learning_rate": 4.612004991387843e-05, "loss": 0.3177, "num_input_tokens_seen": 506696, "step": 1260 }, { "epoch": 1.314968814968815, "grad_norm": 0.9595280289649963, "learning_rate": 4.6071372048633566e-05, "loss": 0.2562, "num_input_tokens_seen": 508680, "step": 1265 }, { "epoch": 1.32016632016632, "grad_norm": 1.0570541620254517, "learning_rate": 4.6022416751153255e-05, "loss": 0.291, "num_input_tokens_seen": 510728, "step": 1270 }, { "epoch": 1.3253638253638254, "grad_norm": 3.702086925506592, "learning_rate": 4.5973184665998186e-05, "loss": 0.2441, "num_input_tokens_seen": 512712, "step": 1275 }, { "epoch": 1.3305613305613306, "grad_norm": 0.28220054507255554, "learning_rate": 4.5923676441373287e-05, "loss": 0.2674, "num_input_tokens_seen": 514696, "step": 1280 }, { "epoch": 1.3357588357588357, "grad_norm": 4.1724724769592285, "learning_rate": 4.5873892729119225e-05, "loss": 0.2628, "num_input_tokens_seen": 516808, "step": 1285 }, { "epoch": 1.340956340956341, "grad_norm": 5.697893142700195, "learning_rate": 4.582383418470386e-05, "loss": 0.208, "num_input_tokens_seen": 518792, "step": 1290 }, { "epoch": 1.3461538461538463, "grad_norm": 14.32170581817627, "learning_rate": 4.577350146721353e-05, "loss": 0.2791, "num_input_tokens_seen": 520840, "step": 1295 }, { "epoch": 1.3513513513513513, "grad_norm": 5.778555393218994, "learning_rate": 4.5722895239344435e-05, "loss": 0.2367, "num_input_tokens_seen": 522760, "step": 1300 }, { "epoch": 1.3565488565488566, "grad_norm": 10.224639892578125, "learning_rate": 4.567201616739393e-05, "loss": 0.2853, "num_input_tokens_seen": 524872, "step": 1305 }, { "epoch": 1.3617463617463619, "grad_norm": 10.455507278442383, "learning_rate": 4.562086492125167e-05, "loss": 0.2922, "num_input_tokens_seen": 526920, "step": 1310 }, { "epoch": 1.366943866943867, "grad_norm": 1.0684677362442017, "learning_rate": 4.556944217439088e-05, "loss": 0.2892, "num_input_tokens_seen": 528968, "step": 1315 }, { "epoch": 1.3721413721413722, "grad_norm": 8.995451927185059, "learning_rate": 4.5517748603859435e-05, "loss": 0.2689, "num_input_tokens_seen": 530888, "step": 1320 }, { "epoch": 1.3773388773388773, "grad_norm": 9.848129272460938, "learning_rate": 4.546578489027095e-05, "loss": 0.2348, "num_input_tokens_seen": 532872, "step": 1325 }, { "epoch": 1.3825363825363826, "grad_norm": 12.437982559204102, "learning_rate": 4.541355171779582e-05, "loss": 0.2971, "num_input_tokens_seen": 534920, "step": 1330 }, { "epoch": 1.3877338877338876, "grad_norm": 4.1738667488098145, "learning_rate": 4.5361049774152256e-05, "loss": 0.2582, "num_input_tokens_seen": 536840, "step": 1335 }, { "epoch": 1.392931392931393, "grad_norm": 11.655598640441895, "learning_rate": 4.530827975059715e-05, "loss": 0.2788, "num_input_tokens_seen": 538760, "step": 1340 }, { "epoch": 1.3981288981288982, "grad_norm": 16.621856689453125, "learning_rate": 4.5255242341917055e-05, "loss": 0.2367, "num_input_tokens_seen": 540680, "step": 1345 }, { "epoch": 1.4033264033264032, "grad_norm": 13.314364433288574, "learning_rate": 4.5201938246418976e-05, "loss": 0.2864, "num_input_tokens_seen": 542664, "step": 1350 }, { "epoch": 1.4085239085239085, "grad_norm": 0.3170285224914551, "learning_rate": 4.51483681659212e-05, "loss": 0.1771, "num_input_tokens_seen": 544712, "step": 1355 }, { "epoch": 1.4137214137214138, "grad_norm": 14.410113334655762, "learning_rate": 4.509453280574407e-05, "loss": 0.3864, "num_input_tokens_seen": 546824, "step": 1360 }, { "epoch": 1.4189189189189189, "grad_norm": 9.504006385803223, "learning_rate": 4.504043287470068e-05, "loss": 0.2952, "num_input_tokens_seen": 548936, "step": 1365 }, { "epoch": 1.4241164241164241, "grad_norm": 19.96672248840332, "learning_rate": 4.498606908508754e-05, "loss": 0.3433, "num_input_tokens_seen": 550920, "step": 1370 }, { "epoch": 1.4293139293139294, "grad_norm": 3.9341182708740234, "learning_rate": 4.4931442152675185e-05, "loss": 0.2757, "num_input_tokens_seen": 552904, "step": 1375 }, { "epoch": 1.4345114345114345, "grad_norm": 13.74646282196045, "learning_rate": 4.487655279669881e-05, "loss": 0.3025, "num_input_tokens_seen": 554824, "step": 1380 }, { "epoch": 1.4397089397089398, "grad_norm": 2.631537914276123, "learning_rate": 4.482140173984875e-05, "loss": 0.2663, "num_input_tokens_seen": 556872, "step": 1385 }, { "epoch": 1.444906444906445, "grad_norm": 6.440622329711914, "learning_rate": 4.476598970826094e-05, "loss": 0.2717, "num_input_tokens_seen": 558984, "step": 1390 }, { "epoch": 1.45010395010395, "grad_norm": 17.752002716064453, "learning_rate": 4.4710317431507434e-05, "loss": 0.2791, "num_input_tokens_seen": 560968, "step": 1395 }, { "epoch": 1.4553014553014554, "grad_norm": 5.554246425628662, "learning_rate": 4.465438564258673e-05, "loss": 0.2617, "num_input_tokens_seen": 562952, "step": 1400 }, { "epoch": 1.4604989604989604, "grad_norm": 5.866454601287842, "learning_rate": 4.4598195077914145e-05, "loss": 0.2452, "num_input_tokens_seen": 565064, "step": 1405 }, { "epoch": 1.4656964656964657, "grad_norm": 3.3153018951416016, "learning_rate": 4.454174647731213e-05, "loss": 0.2761, "num_input_tokens_seen": 567112, "step": 1410 }, { "epoch": 1.4708939708939708, "grad_norm": 20.00400733947754, "learning_rate": 4.4485040584000514e-05, "loss": 0.2875, "num_input_tokens_seen": 569160, "step": 1415 }, { "epoch": 1.476091476091476, "grad_norm": 12.827857971191406, "learning_rate": 4.442807814458672e-05, "loss": 0.2782, "num_input_tokens_seen": 571336, "step": 1420 }, { "epoch": 1.4812889812889813, "grad_norm": 1.1996235847473145, "learning_rate": 4.437085990905591e-05, "loss": 0.25, "num_input_tokens_seen": 573384, "step": 1425 }, { "epoch": 1.4864864864864864, "grad_norm": 1.4895740747451782, "learning_rate": 4.431338663076119e-05, "loss": 0.2596, "num_input_tokens_seen": 575304, "step": 1430 }, { "epoch": 1.4916839916839917, "grad_norm": 2.120258331298828, "learning_rate": 4.4255659066413595e-05, "loss": 0.2788, "num_input_tokens_seen": 577160, "step": 1435 }, { "epoch": 1.496881496881497, "grad_norm": 1.543442726135254, "learning_rate": 4.419767797607219e-05, "loss": 0.2892, "num_input_tokens_seen": 579208, "step": 1440 }, { "epoch": 1.502079002079002, "grad_norm": 7.000278472900391, "learning_rate": 4.413944412313405e-05, "loss": 0.2936, "num_input_tokens_seen": 581256, "step": 1445 }, { "epoch": 1.503118503118503, "eval_loss": 0.25699949264526367, "eval_runtime": 1.6561, "eval_samples_per_second": 516.892, "eval_steps_per_second": 64.612, "num_input_tokens_seen": 581704, "step": 1446 }, { "epoch": 1.5072765072765073, "grad_norm": 8.624284744262695, "learning_rate": 4.4080958274324155e-05, "loss": 0.2702, "num_input_tokens_seen": 583304, "step": 1450 }, { "epoch": 1.5124740124740126, "grad_norm": 1.7187201976776123, "learning_rate": 4.40222211996854e-05, "loss": 0.2252, "num_input_tokens_seen": 585224, "step": 1455 }, { "epoch": 1.5176715176715176, "grad_norm": 13.548686981201172, "learning_rate": 4.396323367256836e-05, "loss": 0.4066, "num_input_tokens_seen": 587272, "step": 1460 }, { "epoch": 1.5228690228690227, "grad_norm": 4.836349010467529, "learning_rate": 4.390399646962117e-05, "loss": 0.2413, "num_input_tokens_seen": 589320, "step": 1465 }, { "epoch": 1.5280665280665282, "grad_norm": 16.470077514648438, "learning_rate": 4.384451037077924e-05, "loss": 0.2593, "num_input_tokens_seen": 591304, "step": 1470 }, { "epoch": 1.5332640332640333, "grad_norm": 19.7336483001709, "learning_rate": 4.378477615925505e-05, "loss": 0.2499, "num_input_tokens_seen": 593224, "step": 1475 }, { "epoch": 1.5384615384615383, "grad_norm": 11.778541564941406, "learning_rate": 4.372479462152781e-05, "loss": 0.2672, "num_input_tokens_seen": 595336, "step": 1480 }, { "epoch": 1.5436590436590436, "grad_norm": 10.281831741333008, "learning_rate": 4.366456654733308e-05, "loss": 0.2898, "num_input_tokens_seen": 597256, "step": 1485 }, { "epoch": 1.5488565488565489, "grad_norm": 1.1301134824752808, "learning_rate": 4.360409272965242e-05, "loss": 0.2852, "num_input_tokens_seen": 599304, "step": 1490 }, { "epoch": 1.554054054054054, "grad_norm": 13.930249214172363, "learning_rate": 4.3543373964702907e-05, "loss": 0.2828, "num_input_tokens_seen": 601288, "step": 1495 }, { "epoch": 1.5592515592515592, "grad_norm": 6.923305988311768, "learning_rate": 4.348241105192668e-05, "loss": 0.2597, "num_input_tokens_seen": 603272, "step": 1500 }, { "epoch": 1.5644490644490645, "grad_norm": 6.9845356941223145, "learning_rate": 4.34212047939804e-05, "loss": 0.2584, "num_input_tokens_seen": 605256, "step": 1505 }, { "epoch": 1.5696465696465696, "grad_norm": 14.034126281738281, "learning_rate": 4.335975599672469e-05, "loss": 0.2713, "num_input_tokens_seen": 607304, "step": 1510 }, { "epoch": 1.5748440748440748, "grad_norm": 1.7358630895614624, "learning_rate": 4.329806546921353e-05, "loss": 0.2702, "num_input_tokens_seen": 609224, "step": 1515 }, { "epoch": 1.5800415800415801, "grad_norm": 7.067193031311035, "learning_rate": 4.323613402368357e-05, "loss": 0.2648, "num_input_tokens_seen": 611336, "step": 1520 }, { "epoch": 1.5852390852390852, "grad_norm": 2.2806382179260254, "learning_rate": 4.317396247554347e-05, "loss": 0.2879, "num_input_tokens_seen": 613320, "step": 1525 }, { "epoch": 1.5904365904365905, "grad_norm": 7.832094669342041, "learning_rate": 4.311155164336318e-05, "loss": 0.2953, "num_input_tokens_seen": 615176, "step": 1530 }, { "epoch": 1.5956340956340958, "grad_norm": 5.03207540512085, "learning_rate": 4.3048902348863116e-05, "loss": 0.2754, "num_input_tokens_seen": 617224, "step": 1535 }, { "epoch": 1.6008316008316008, "grad_norm": 9.225286483764648, "learning_rate": 4.298601541690336e-05, "loss": 0.2785, "num_input_tokens_seen": 619208, "step": 1540 }, { "epoch": 1.6060291060291059, "grad_norm": 8.74346923828125, "learning_rate": 4.292289167547281e-05, "loss": 0.278, "num_input_tokens_seen": 621192, "step": 1545 }, { "epoch": 1.6112266112266114, "grad_norm": 7.021914005279541, "learning_rate": 4.285953195567827e-05, "loss": 0.2618, "num_input_tokens_seen": 623176, "step": 1550 }, { "epoch": 1.6164241164241164, "grad_norm": 1.7642489671707153, "learning_rate": 4.2795937091733515e-05, "loss": 0.2506, "num_input_tokens_seen": 625160, "step": 1555 }, { "epoch": 1.6216216216216215, "grad_norm": 19.76347541809082, "learning_rate": 4.27321079209483e-05, "loss": 0.3095, "num_input_tokens_seen": 627144, "step": 1560 }, { "epoch": 1.6268191268191268, "grad_norm": 3.08026385307312, "learning_rate": 4.266804528371732e-05, "loss": 0.2951, "num_input_tokens_seen": 629192, "step": 1565 }, { "epoch": 1.632016632016632, "grad_norm": 6.732242584228516, "learning_rate": 4.260375002350917e-05, "loss": 0.2796, "num_input_tokens_seen": 631240, "step": 1570 }, { "epoch": 1.637214137214137, "grad_norm": 9.543598175048828, "learning_rate": 4.253922298685525e-05, "loss": 0.2407, "num_input_tokens_seen": 633224, "step": 1575 }, { "epoch": 1.6424116424116424, "grad_norm": 12.409037590026855, "learning_rate": 4.247446502333858e-05, "loss": 0.2386, "num_input_tokens_seen": 635208, "step": 1580 }, { "epoch": 1.6476091476091477, "grad_norm": 1.7029387950897217, "learning_rate": 4.2409476985582644e-05, "loss": 0.2872, "num_input_tokens_seen": 637256, "step": 1585 }, { "epoch": 1.6528066528066527, "grad_norm": 8.404769897460938, "learning_rate": 4.234425972924014e-05, "loss": 0.2806, "num_input_tokens_seen": 639176, "step": 1590 }, { "epoch": 1.658004158004158, "grad_norm": 3.5066511631011963, "learning_rate": 4.227881411298175e-05, "loss": 0.2715, "num_input_tokens_seen": 641224, "step": 1595 }, { "epoch": 1.6632016632016633, "grad_norm": 0.08306021988391876, "learning_rate": 4.221314099848481e-05, "loss": 0.2872, "num_input_tokens_seen": 643144, "step": 1600 }, { "epoch": 1.6683991683991684, "grad_norm": 3.17313551902771, "learning_rate": 4.2147241250421944e-05, "loss": 0.2204, "num_input_tokens_seen": 644936, "step": 1605 }, { "epoch": 1.6735966735966736, "grad_norm": 7.495476722717285, "learning_rate": 4.208111573644975e-05, "loss": 0.2557, "num_input_tokens_seen": 646984, "step": 1610 }, { "epoch": 1.678794178794179, "grad_norm": 11.988297462463379, "learning_rate": 4.201476532719728e-05, "loss": 0.2777, "num_input_tokens_seen": 649032, "step": 1615 }, { "epoch": 1.683991683991684, "grad_norm": 9.61047649383545, "learning_rate": 4.194819089625466e-05, "loss": 0.2778, "num_input_tokens_seen": 651080, "step": 1620 }, { "epoch": 1.689189189189189, "grad_norm": 12.550249099731445, "learning_rate": 4.188139332016154e-05, "loss": 0.2953, "num_input_tokens_seen": 653000, "step": 1625 }, { "epoch": 1.6943866943866945, "grad_norm": 9.836450576782227, "learning_rate": 4.1814373478395586e-05, "loss": 0.2955, "num_input_tokens_seen": 654920, "step": 1630 }, { "epoch": 1.6995841995841996, "grad_norm": 4.831801414489746, "learning_rate": 4.174713225336086e-05, "loss": 0.2599, "num_input_tokens_seen": 656904, "step": 1635 }, { "epoch": 1.7047817047817047, "grad_norm": 23.776752471923828, "learning_rate": 4.1679670530376244e-05, "loss": 0.2986, "num_input_tokens_seen": 658952, "step": 1640 }, { "epoch": 1.70997920997921, "grad_norm": 8.714804649353027, "learning_rate": 4.161198919766375e-05, "loss": 0.264, "num_input_tokens_seen": 660872, "step": 1645 }, { "epoch": 1.7151767151767152, "grad_norm": 7.620249271392822, "learning_rate": 4.154408914633685e-05, "loss": 0.2337, "num_input_tokens_seen": 662856, "step": 1650 }, { "epoch": 1.7203742203742203, "grad_norm": 12.147835731506348, "learning_rate": 4.147597127038873e-05, "loss": 0.2968, "num_input_tokens_seen": 664904, "step": 1655 }, { "epoch": 1.7255717255717256, "grad_norm": 4.487679481506348, "learning_rate": 4.140763646668052e-05, "loss": 0.2433, "num_input_tokens_seen": 666888, "step": 1660 }, { "epoch": 1.7307692307692308, "grad_norm": 4.300174236297607, "learning_rate": 4.1339085634929485e-05, "loss": 0.3127, "num_input_tokens_seen": 668936, "step": 1665 }, { "epoch": 1.735966735966736, "grad_norm": 5.227227687835693, "learning_rate": 4.12703196776972e-05, "loss": 0.2454, "num_input_tokens_seen": 670856, "step": 1670 }, { "epoch": 1.7411642411642412, "grad_norm": 3.5631868839263916, "learning_rate": 4.120133950037763e-05, "loss": 0.3639, "num_input_tokens_seen": 672840, "step": 1675 }, { "epoch": 1.7463617463617465, "grad_norm": 3.815650701522827, "learning_rate": 4.113214601118524e-05, "loss": 0.2468, "num_input_tokens_seen": 674824, "step": 1680 }, { "epoch": 1.7515592515592515, "grad_norm": 8.337157249450684, "learning_rate": 4.1062740121143016e-05, "loss": 0.2705, "num_input_tokens_seen": 676808, "step": 1685 }, { "epoch": 1.7536382536382535, "eval_loss": 0.25601524114608765, "eval_runtime": 1.0768, "eval_samples_per_second": 794.918, "eval_steps_per_second": 99.365, "num_input_tokens_seen": 677576, "step": 1687 }, { "epoch": 1.7567567567567568, "grad_norm": 5.897716999053955, "learning_rate": 4.099312274407048e-05, "loss": 0.2969, "num_input_tokens_seen": 678728, "step": 1690 }, { "epoch": 1.761954261954262, "grad_norm": 8.0389404296875, "learning_rate": 4.0923294796571676e-05, "loss": 0.2874, "num_input_tokens_seen": 680776, "step": 1695 }, { "epoch": 1.7671517671517671, "grad_norm": 8.596820831298828, "learning_rate": 4.085325719802307e-05, "loss": 0.2651, "num_input_tokens_seen": 683016, "step": 1700 }, { "epoch": 1.7723492723492722, "grad_norm": 9.364229202270508, "learning_rate": 4.078301087056144e-05, "loss": 0.2924, "num_input_tokens_seen": 685256, "step": 1705 }, { "epoch": 1.7775467775467777, "grad_norm": 17.756505966186523, "learning_rate": 4.0712556739071795e-05, "loss": 0.2762, "num_input_tokens_seen": 687304, "step": 1710 }, { "epoch": 1.7827442827442828, "grad_norm": 16.004009246826172, "learning_rate": 4.064189573117512e-05, "loss": 0.2888, "num_input_tokens_seen": 689224, "step": 1715 }, { "epoch": 1.7879417879417878, "grad_norm": 15.74494457244873, "learning_rate": 4.0571028777216214e-05, "loss": 0.2282, "num_input_tokens_seen": 691400, "step": 1720 }, { "epoch": 1.793139293139293, "grad_norm": 2.8662827014923096, "learning_rate": 4.049995681025143e-05, "loss": 0.187, "num_input_tokens_seen": 693320, "step": 1725 }, { "epoch": 1.7983367983367984, "grad_norm": 8.626184463500977, "learning_rate": 4.0428680766036384e-05, "loss": 0.4406, "num_input_tokens_seen": 695432, "step": 1730 }, { "epoch": 1.8035343035343034, "grad_norm": 3.6228630542755127, "learning_rate": 4.035720158301363e-05, "loss": 0.3552, "num_input_tokens_seen": 697544, "step": 1735 }, { "epoch": 1.8087318087318087, "grad_norm": 34.01264953613281, "learning_rate": 4.028552020230031e-05, "loss": 0.3263, "num_input_tokens_seen": 699592, "step": 1740 }, { "epoch": 1.813929313929314, "grad_norm": 5.828328609466553, "learning_rate": 4.0213637567675774e-05, "loss": 0.2859, "num_input_tokens_seen": 701576, "step": 1745 }, { "epoch": 1.819126819126819, "grad_norm": 6.004729747772217, "learning_rate": 4.0141554625569125e-05, "loss": 0.2657, "num_input_tokens_seen": 703688, "step": 1750 }, { "epoch": 1.8243243243243243, "grad_norm": 4.201369285583496, "learning_rate": 4.0069272325046816e-05, "loss": 0.2842, "num_input_tokens_seen": 705736, "step": 1755 }, { "epoch": 1.8295218295218296, "grad_norm": 12.624285697937012, "learning_rate": 3.999679161780005e-05, "loss": 0.2479, "num_input_tokens_seen": 707720, "step": 1760 }, { "epoch": 1.8347193347193347, "grad_norm": 4.109714031219482, "learning_rate": 3.99241134581324e-05, "loss": 0.3132, "num_input_tokens_seen": 709896, "step": 1765 }, { "epoch": 1.83991683991684, "grad_norm": 5.0265302658081055, "learning_rate": 3.985123880294708e-05, "loss": 0.2661, "num_input_tokens_seen": 711944, "step": 1770 }, { "epoch": 1.8451143451143452, "grad_norm": 6.782411575317383, "learning_rate": 3.9778168611734456e-05, "loss": 0.2664, "num_input_tokens_seen": 713992, "step": 1775 }, { "epoch": 1.8503118503118503, "grad_norm": 1.0791548490524292, "learning_rate": 3.970490384655939e-05, "loss": 0.2443, "num_input_tokens_seen": 715976, "step": 1780 }, { "epoch": 1.8555093555093554, "grad_norm": 8.595144271850586, "learning_rate": 3.963144547204856e-05, "loss": 0.2659, "num_input_tokens_seen": 718024, "step": 1785 }, { "epoch": 1.8607068607068609, "grad_norm": 6.319913387298584, "learning_rate": 3.955779445537776e-05, "loss": 0.2441, "num_input_tokens_seen": 720072, "step": 1790 }, { "epoch": 1.865904365904366, "grad_norm": 16.376117706298828, "learning_rate": 3.948395176625918e-05, "loss": 0.279, "num_input_tokens_seen": 722120, "step": 1795 }, { "epoch": 1.871101871101871, "grad_norm": 12.614494323730469, "learning_rate": 3.9409918376928604e-05, "loss": 0.2851, "num_input_tokens_seen": 724168, "step": 1800 }, { "epoch": 1.8762993762993763, "grad_norm": 0.3612583577632904, "learning_rate": 3.933569526213268e-05, "loss": 0.2928, "num_input_tokens_seen": 726280, "step": 1805 }, { "epoch": 1.8814968814968815, "grad_norm": 5.5901103019714355, "learning_rate": 3.926128339911599e-05, "loss": 0.2677, "num_input_tokens_seen": 728264, "step": 1810 }, { "epoch": 1.8866943866943866, "grad_norm": 19.419448852539062, "learning_rate": 3.918668376760827e-05, "loss": 0.2924, "num_input_tokens_seen": 730312, "step": 1815 }, { "epoch": 1.8918918918918919, "grad_norm": 19.855844497680664, "learning_rate": 3.9111897349811454e-05, "loss": 0.2771, "num_input_tokens_seen": 732296, "step": 1820 }, { "epoch": 1.8970893970893972, "grad_norm": 0.2629048228263855, "learning_rate": 3.903692513038677e-05, "loss": 0.2412, "num_input_tokens_seen": 734088, "step": 1825 }, { "epoch": 1.9022869022869022, "grad_norm": 10.053337097167969, "learning_rate": 3.896176809644178e-05, "loss": 0.2897, "num_input_tokens_seen": 736072, "step": 1830 }, { "epoch": 1.9074844074844075, "grad_norm": 14.880932807922363, "learning_rate": 3.8886427237517344e-05, "loss": 0.3063, "num_input_tokens_seen": 738120, "step": 1835 }, { "epoch": 1.9126819126819128, "grad_norm": 7.023550510406494, "learning_rate": 3.881090354557463e-05, "loss": 0.3038, "num_input_tokens_seen": 740168, "step": 1840 }, { "epoch": 1.9178794178794178, "grad_norm": 14.33624267578125, "learning_rate": 3.8735198014982064e-05, "loss": 0.2716, "num_input_tokens_seen": 742280, "step": 1845 }, { "epoch": 1.9230769230769231, "grad_norm": 3.6817572116851807, "learning_rate": 3.865931164250219e-05, "loss": 0.2834, "num_input_tokens_seen": 744328, "step": 1850 }, { "epoch": 1.9282744282744284, "grad_norm": 6.452430248260498, "learning_rate": 3.8583245427278584e-05, "loss": 0.2845, "num_input_tokens_seen": 746440, "step": 1855 }, { "epoch": 1.9334719334719335, "grad_norm": 5.071720123291016, "learning_rate": 3.850700037082268e-05, "loss": 0.3004, "num_input_tokens_seen": 748488, "step": 1860 }, { "epoch": 1.9386694386694385, "grad_norm": 7.624428749084473, "learning_rate": 3.8430577477000595e-05, "loss": 0.2696, "num_input_tokens_seen": 750344, "step": 1865 }, { "epoch": 1.943866943866944, "grad_norm": 11.881917953491211, "learning_rate": 3.835397775201991e-05, "loss": 0.2567, "num_input_tokens_seen": 752328, "step": 1870 }, { "epoch": 1.949064449064449, "grad_norm": 3.7098724842071533, "learning_rate": 3.827720220441642e-05, "loss": 0.269, "num_input_tokens_seen": 754312, "step": 1875 }, { "epoch": 1.9542619542619541, "grad_norm": 8.87547492980957, "learning_rate": 3.8200251845040855e-05, "loss": 0.2816, "num_input_tokens_seen": 756232, "step": 1880 }, { "epoch": 1.9594594594594594, "grad_norm": 4.694116592407227, "learning_rate": 3.812312768704557e-05, "loss": 0.2706, "num_input_tokens_seen": 758280, "step": 1885 }, { "epoch": 1.9646569646569647, "grad_norm": 0.5913885235786438, "learning_rate": 3.8045830745871195e-05, "loss": 0.2412, "num_input_tokens_seen": 760328, "step": 1890 }, { "epoch": 1.9698544698544698, "grad_norm": 0.12654490768909454, "learning_rate": 3.7968362039233316e-05, "loss": 0.2593, "num_input_tokens_seen": 762248, "step": 1895 }, { "epoch": 1.975051975051975, "grad_norm": 25.26936149597168, "learning_rate": 3.789072258710898e-05, "loss": 0.2765, "num_input_tokens_seen": 764168, "step": 1900 }, { "epoch": 1.9802494802494803, "grad_norm": 13.081347465515137, "learning_rate": 3.781291341172338e-05, "loss": 0.2703, "num_input_tokens_seen": 766216, "step": 1905 }, { "epoch": 1.9854469854469854, "grad_norm": 5.589529037475586, "learning_rate": 3.7734935537536276e-05, "loss": 0.2418, "num_input_tokens_seen": 768264, "step": 1910 }, { "epoch": 1.9906444906444907, "grad_norm": 10.241774559020996, "learning_rate": 3.7656789991228636e-05, "loss": 0.2502, "num_input_tokens_seen": 770184, "step": 1915 }, { "epoch": 1.995841995841996, "grad_norm": 2.5858964920043945, "learning_rate": 3.7578477801689e-05, "loss": 0.2432, "num_input_tokens_seen": 772168, "step": 1920 }, { "epoch": 2.001039501039501, "grad_norm": 3.724403142929077, "learning_rate": 3.7500000000000003e-05, "loss": 0.2243, "num_input_tokens_seen": 774160, "step": 1925 }, { "epoch": 2.004158004158004, "eval_loss": 0.2575376331806183, "eval_runtime": 1.0513, "eval_samples_per_second": 814.265, "eval_steps_per_second": 101.783, "num_input_tokens_seen": 775312, "step": 1928 }, { "epoch": 2.006237006237006, "grad_norm": 8.0248384475708, "learning_rate": 3.742135761942479e-05, "loss": 0.26, "num_input_tokens_seen": 776144, "step": 1930 }, { "epoch": 2.0114345114345116, "grad_norm": 8.558785438537598, "learning_rate": 3.734255169539337e-05, "loss": 0.2814, "num_input_tokens_seen": 778128, "step": 1935 }, { "epoch": 2.0166320166320166, "grad_norm": 29.213605880737305, "learning_rate": 3.7263583265489074e-05, "loss": 0.2911, "num_input_tokens_seen": 780176, "step": 1940 }, { "epoch": 2.0218295218295217, "grad_norm": 2.797147274017334, "learning_rate": 3.718445336943478e-05, "loss": 0.2723, "num_input_tokens_seen": 782160, "step": 1945 }, { "epoch": 2.027027027027027, "grad_norm": 11.302480697631836, "learning_rate": 3.710516304907931e-05, "loss": 0.3159, "num_input_tokens_seen": 784208, "step": 1950 }, { "epoch": 2.0322245322245323, "grad_norm": 10.434552192687988, "learning_rate": 3.702571334838365e-05, "loss": 0.2713, "num_input_tokens_seen": 786256, "step": 1955 }, { "epoch": 2.0374220374220373, "grad_norm": 18.884937286376953, "learning_rate": 3.694610531340729e-05, "loss": 0.2491, "num_input_tokens_seen": 788240, "step": 1960 }, { "epoch": 2.042619542619543, "grad_norm": 7.0856404304504395, "learning_rate": 3.6866339992294344e-05, "loss": 0.2663, "num_input_tokens_seen": 790288, "step": 1965 }, { "epoch": 2.047817047817048, "grad_norm": 8.749947547912598, "learning_rate": 3.6786418435259854e-05, "loss": 0.2681, "num_input_tokens_seen": 792272, "step": 1970 }, { "epoch": 2.053014553014553, "grad_norm": 7.0498785972595215, "learning_rate": 3.670634169457587e-05, "loss": 0.2757, "num_input_tokens_seen": 794384, "step": 1975 }, { "epoch": 2.0582120582120584, "grad_norm": 1.3708738088607788, "learning_rate": 3.662611082455766e-05, "loss": 0.2727, "num_input_tokens_seen": 796368, "step": 1980 }, { "epoch": 2.0634095634095635, "grad_norm": 17.901782989501953, "learning_rate": 3.654572688154979e-05, "loss": 0.2711, "num_input_tokens_seen": 798480, "step": 1985 }, { "epoch": 2.0686070686070686, "grad_norm": 3.3067057132720947, "learning_rate": 3.646519092391227e-05, "loss": 0.2843, "num_input_tokens_seen": 800528, "step": 1990 }, { "epoch": 2.0738045738045736, "grad_norm": 10.808554649353027, "learning_rate": 3.6384504012006544e-05, "loss": 0.2917, "num_input_tokens_seen": 802768, "step": 1995 }, { "epoch": 2.079002079002079, "grad_norm": 15.788969039916992, "learning_rate": 3.6303667208181575e-05, "loss": 0.2846, "num_input_tokens_seen": 804752, "step": 2000 }, { "epoch": 2.084199584199584, "grad_norm": 7.7113847732543945, "learning_rate": 3.622268157675986e-05, "loss": 0.2932, "num_input_tokens_seen": 806672, "step": 2005 }, { "epoch": 2.0893970893970892, "grad_norm": 1.0900391340255737, "learning_rate": 3.614154818402339e-05, "loss": 0.2602, "num_input_tokens_seen": 808656, "step": 2010 }, { "epoch": 2.0945945945945947, "grad_norm": 8.559063911437988, "learning_rate": 3.606026809819966e-05, "loss": 0.2402, "num_input_tokens_seen": 810640, "step": 2015 }, { "epoch": 2.0997920997921, "grad_norm": 13.94715690612793, "learning_rate": 3.597884238944752e-05, "loss": 0.2832, "num_input_tokens_seen": 812688, "step": 2020 }, { "epoch": 2.104989604989605, "grad_norm": 10.943618774414062, "learning_rate": 3.5897272129843194e-05, "loss": 0.262, "num_input_tokens_seen": 814800, "step": 2025 }, { "epoch": 2.1101871101871104, "grad_norm": 8.191487312316895, "learning_rate": 3.581555839336606e-05, "loss": 0.2348, "num_input_tokens_seen": 816912, "step": 2030 }, { "epoch": 2.1153846153846154, "grad_norm": 8.54643440246582, "learning_rate": 3.57337022558846e-05, "loss": 0.2981, "num_input_tokens_seen": 818896, "step": 2035 }, { "epoch": 2.1205821205821205, "grad_norm": 0.22179275751113892, "learning_rate": 3.565170479514214e-05, "loss": 0.2857, "num_input_tokens_seen": 820880, "step": 2040 }, { "epoch": 2.125779625779626, "grad_norm": 8.135652542114258, "learning_rate": 3.5569567090742764e-05, "loss": 0.2745, "num_input_tokens_seen": 822864, "step": 2045 }, { "epoch": 2.130977130977131, "grad_norm": 1.2629114389419556, "learning_rate": 3.548729022413701e-05, "loss": 0.2705, "num_input_tokens_seen": 825040, "step": 2050 }, { "epoch": 2.136174636174636, "grad_norm": 2.5352609157562256, "learning_rate": 3.540487527860769e-05, "loss": 0.2397, "num_input_tokens_seen": 827024, "step": 2055 }, { "epoch": 2.141372141372141, "grad_norm": 5.771927356719971, "learning_rate": 3.53223233392556e-05, "loss": 0.2921, "num_input_tokens_seen": 829136, "step": 2060 }, { "epoch": 2.1465696465696467, "grad_norm": 14.170849800109863, "learning_rate": 3.523963549298525e-05, "loss": 0.274, "num_input_tokens_seen": 831184, "step": 2065 }, { "epoch": 2.1517671517671517, "grad_norm": 35.1840705871582, "learning_rate": 3.51568128284905e-05, "loss": 0.3599, "num_input_tokens_seen": 833168, "step": 2070 }, { "epoch": 2.156964656964657, "grad_norm": 3.5509274005889893, "learning_rate": 3.5073856436240334e-05, "loss": 0.2991, "num_input_tokens_seen": 835216, "step": 2075 }, { "epoch": 2.1621621621621623, "grad_norm": 11.091768264770508, "learning_rate": 3.499076740846438e-05, "loss": 0.2711, "num_input_tokens_seen": 837136, "step": 2080 }, { "epoch": 2.1673596673596673, "grad_norm": 10.117902755737305, "learning_rate": 3.490754683913863e-05, "loss": 0.263, "num_input_tokens_seen": 839120, "step": 2085 }, { "epoch": 2.1725571725571724, "grad_norm": 8.886628150939941, "learning_rate": 3.482419582397095e-05, "loss": 0.2114, "num_input_tokens_seen": 841104, "step": 2090 }, { "epoch": 2.177754677754678, "grad_norm": 3.9707517623901367, "learning_rate": 3.474071546038673e-05, "loss": 0.3437, "num_input_tokens_seen": 843152, "step": 2095 }, { "epoch": 2.182952182952183, "grad_norm": 7.44064474105835, "learning_rate": 3.46571068475144e-05, "loss": 0.2665, "num_input_tokens_seen": 845136, "step": 2100 }, { "epoch": 2.188149688149688, "grad_norm": 15.758981704711914, "learning_rate": 3.4573371086170936e-05, "loss": 0.2736, "num_input_tokens_seen": 847120, "step": 2105 }, { "epoch": 2.1933471933471935, "grad_norm": 6.210805892944336, "learning_rate": 3.4489509278847414e-05, "loss": 0.2967, "num_input_tokens_seen": 849168, "step": 2110 }, { "epoch": 2.1985446985446986, "grad_norm": 5.870239734649658, "learning_rate": 3.4405522529694454e-05, "loss": 0.2804, "num_input_tokens_seen": 851152, "step": 2115 }, { "epoch": 2.2037422037422036, "grad_norm": 0.2958710789680481, "learning_rate": 3.432141194450772e-05, "loss": 0.2789, "num_input_tokens_seen": 853008, "step": 2120 }, { "epoch": 2.208939708939709, "grad_norm": 2.236769199371338, "learning_rate": 3.4237178630713314e-05, "loss": 0.2593, "num_input_tokens_seen": 855120, "step": 2125 }, { "epoch": 2.214137214137214, "grad_norm": 13.610929489135742, "learning_rate": 3.415282369735324e-05, "loss": 0.3184, "num_input_tokens_seen": 857232, "step": 2130 }, { "epoch": 2.2193347193347193, "grad_norm": 23.174781799316406, "learning_rate": 3.4068348255070763e-05, "loss": 0.2697, "num_input_tokens_seen": 859344, "step": 2135 }, { "epoch": 2.2245322245322248, "grad_norm": 5.864615440368652, "learning_rate": 3.3983753416095845e-05, "loss": 0.2666, "num_input_tokens_seen": 861328, "step": 2140 }, { "epoch": 2.22972972972973, "grad_norm": 4.22444486618042, "learning_rate": 3.389904029423041e-05, "loss": 0.2807, "num_input_tokens_seen": 863376, "step": 2145 }, { "epoch": 2.234927234927235, "grad_norm": 10.534299850463867, "learning_rate": 3.381421000483378e-05, "loss": 0.277, "num_input_tokens_seen": 865424, "step": 2150 }, { "epoch": 2.24012474012474, "grad_norm": 19.86652946472168, "learning_rate": 3.37292636648079e-05, "loss": 0.2884, "num_input_tokens_seen": 867472, "step": 2155 }, { "epoch": 2.2453222453222454, "grad_norm": 1.5927962064743042, "learning_rate": 3.36442023925827e-05, "loss": 0.259, "num_input_tokens_seen": 869584, "step": 2160 }, { "epoch": 2.2505197505197505, "grad_norm": 13.930359840393066, "learning_rate": 3.3559027308101345e-05, "loss": 0.2477, "num_input_tokens_seen": 871568, "step": 2165 }, { "epoch": 2.2546777546777546, "eval_loss": 0.2923731505870819, "eval_runtime": 2.2896, "eval_samples_per_second": 373.867, "eval_steps_per_second": 46.733, "num_input_tokens_seen": 873104, "step": 2169 }, { "epoch": 2.2557172557172556, "grad_norm": 13.714078903198242, "learning_rate": 3.3473739532805467e-05, "loss": 0.3482, "num_input_tokens_seen": 873488, "step": 2170 }, { "epoch": 2.260914760914761, "grad_norm": 6.342438697814941, "learning_rate": 3.3388340189620424e-05, "loss": 0.26, "num_input_tokens_seen": 875472, "step": 2175 }, { "epoch": 2.266112266112266, "grad_norm": 13.85311222076416, "learning_rate": 3.330283040294053e-05, "loss": 0.2554, "num_input_tokens_seen": 877392, "step": 2180 }, { "epoch": 2.271309771309771, "grad_norm": 4.5571370124816895, "learning_rate": 3.321721129861422e-05, "loss": 0.2621, "num_input_tokens_seen": 879504, "step": 2185 }, { "epoch": 2.2765072765072767, "grad_norm": 12.646343231201172, "learning_rate": 3.3131484003929246e-05, "loss": 0.29, "num_input_tokens_seen": 881360, "step": 2190 }, { "epoch": 2.2817047817047817, "grad_norm": 2.4754276275634766, "learning_rate": 3.3045649647597815e-05, "loss": 0.2916, "num_input_tokens_seen": 883280, "step": 2195 }, { "epoch": 2.286902286902287, "grad_norm": 9.653176307678223, "learning_rate": 3.2959709359741744e-05, "loss": 0.2572, "num_input_tokens_seen": 885328, "step": 2200 }, { "epoch": 2.2920997920997923, "grad_norm": 16.368059158325195, "learning_rate": 3.2873664271877584e-05, "loss": 0.3062, "num_input_tokens_seen": 887312, "step": 2205 }, { "epoch": 2.2972972972972974, "grad_norm": 11.021875381469727, "learning_rate": 3.278751551690172e-05, "loss": 0.2672, "num_input_tokens_seen": 889296, "step": 2210 }, { "epoch": 2.3024948024948024, "grad_norm": 8.907588958740234, "learning_rate": 3.270126422907544e-05, "loss": 0.2647, "num_input_tokens_seen": 891408, "step": 2215 }, { "epoch": 2.3076923076923075, "grad_norm": 9.290825843811035, "learning_rate": 3.261491154401001e-05, "loss": 0.2884, "num_input_tokens_seen": 893392, "step": 2220 }, { "epoch": 2.312889812889813, "grad_norm": 4.591054916381836, "learning_rate": 3.2528458598651734e-05, "loss": 0.2719, "num_input_tokens_seen": 895440, "step": 2225 }, { "epoch": 2.318087318087318, "grad_norm": 14.931044578552246, "learning_rate": 3.244190653126696e-05, "loss": 0.2718, "num_input_tokens_seen": 897616, "step": 2230 }, { "epoch": 2.323284823284823, "grad_norm": 5.752501487731934, "learning_rate": 3.2355256481427145e-05, "loss": 0.2636, "num_input_tokens_seen": 899536, "step": 2235 }, { "epoch": 2.3284823284823286, "grad_norm": 10.545312881469727, "learning_rate": 3.226850958999375e-05, "loss": 0.2544, "num_input_tokens_seen": 901648, "step": 2240 }, { "epoch": 2.3336798336798337, "grad_norm": 0.47291961312294006, "learning_rate": 3.2181666999103324e-05, "loss": 0.2258, "num_input_tokens_seen": 903696, "step": 2245 }, { "epoch": 2.3388773388773387, "grad_norm": 12.981254577636719, "learning_rate": 3.209472985215243e-05, "loss": 0.3054, "num_input_tokens_seen": 905552, "step": 2250 }, { "epoch": 2.3440748440748442, "grad_norm": 6.684492588043213, "learning_rate": 3.2007699293782555e-05, "loss": 0.3446, "num_input_tokens_seen": 907472, "step": 2255 }, { "epoch": 2.3492723492723493, "grad_norm": 4.928536891937256, "learning_rate": 3.1920576469865115e-05, "loss": 0.2495, "num_input_tokens_seen": 909584, "step": 2260 }, { "epoch": 2.3544698544698544, "grad_norm": 6.033250331878662, "learning_rate": 3.183336252748627e-05, "loss": 0.2754, "num_input_tokens_seen": 911632, "step": 2265 }, { "epoch": 2.35966735966736, "grad_norm": 2.3870551586151123, "learning_rate": 3.1746058614931916e-05, "loss": 0.2604, "num_input_tokens_seen": 913616, "step": 2270 }, { "epoch": 2.364864864864865, "grad_norm": 18.541162490844727, "learning_rate": 3.16586658816725e-05, "loss": 0.2711, "num_input_tokens_seen": 915728, "step": 2275 }, { "epoch": 2.37006237006237, "grad_norm": 13.122420310974121, "learning_rate": 3.157118547834793e-05, "loss": 0.2566, "num_input_tokens_seen": 917776, "step": 2280 }, { "epoch": 2.375259875259875, "grad_norm": 12.414369583129883, "learning_rate": 3.148361855675237e-05, "loss": 0.2684, "num_input_tokens_seen": 919952, "step": 2285 }, { "epoch": 2.3804573804573805, "grad_norm": 7.519947052001953, "learning_rate": 3.139596626981916e-05, "loss": 0.2294, "num_input_tokens_seen": 921872, "step": 2290 }, { "epoch": 2.3856548856548856, "grad_norm": 8.675261497497559, "learning_rate": 3.130822977160554e-05, "loss": 0.2603, "num_input_tokens_seen": 923856, "step": 2295 }, { "epoch": 2.390852390852391, "grad_norm": 5.88022518157959, "learning_rate": 3.122041021727755e-05, "loss": 0.3942, "num_input_tokens_seen": 925968, "step": 2300 }, { "epoch": 2.396049896049896, "grad_norm": 14.03941535949707, "learning_rate": 3.1132508763094715e-05, "loss": 0.3128, "num_input_tokens_seen": 927888, "step": 2305 }, { "epoch": 2.401247401247401, "grad_norm": 15.206921577453613, "learning_rate": 3.104452656639492e-05, "loss": 0.2467, "num_input_tokens_seen": 929808, "step": 2310 }, { "epoch": 2.4064449064449063, "grad_norm": 1.0389469861984253, "learning_rate": 3.0956464785579124e-05, "loss": 0.1963, "num_input_tokens_seen": 931728, "step": 2315 }, { "epoch": 2.4116424116424118, "grad_norm": 9.624686241149902, "learning_rate": 3.0868324580096114e-05, "loss": 0.3533, "num_input_tokens_seen": 933840, "step": 2320 }, { "epoch": 2.416839916839917, "grad_norm": 1.1855783462524414, "learning_rate": 3.078010711042723e-05, "loss": 0.2936, "num_input_tokens_seen": 935824, "step": 2325 }, { "epoch": 2.422037422037422, "grad_norm": 4.064991474151611, "learning_rate": 3.0691813538071105e-05, "loss": 0.274, "num_input_tokens_seen": 937872, "step": 2330 }, { "epoch": 2.4272349272349274, "grad_norm": 26.271160125732422, "learning_rate": 3.0603445025528376e-05, "loss": 0.3378, "num_input_tokens_seen": 939984, "step": 2335 }, { "epoch": 2.4324324324324325, "grad_norm": 12.682053565979004, "learning_rate": 3.051500273628633e-05, "loss": 0.2418, "num_input_tokens_seen": 941968, "step": 2340 }, { "epoch": 2.4376299376299375, "grad_norm": 3.6713008880615234, "learning_rate": 3.0426487834803657e-05, "loss": 0.2943, "num_input_tokens_seen": 943952, "step": 2345 }, { "epoch": 2.442827442827443, "grad_norm": 19.096328735351562, "learning_rate": 3.0337901486495073e-05, "loss": 0.2435, "num_input_tokens_seen": 945872, "step": 2350 }, { "epoch": 2.448024948024948, "grad_norm": 4.218629360198975, "learning_rate": 3.0249244857715976e-05, "loss": 0.267, "num_input_tokens_seen": 947856, "step": 2355 }, { "epoch": 2.453222453222453, "grad_norm": 0.270999550819397, "learning_rate": 3.01605191157471e-05, "loss": 0.2452, "num_input_tokens_seen": 949840, "step": 2360 }, { "epoch": 2.4584199584199586, "grad_norm": 1.739176630973816, "learning_rate": 3.007172542877915e-05, "loss": 0.2342, "num_input_tokens_seen": 951760, "step": 2365 }, { "epoch": 2.4636174636174637, "grad_norm": 13.98282241821289, "learning_rate": 2.998286496589742e-05, "loss": 0.3294, "num_input_tokens_seen": 953680, "step": 2370 }, { "epoch": 2.4688149688149688, "grad_norm": 9.587896347045898, "learning_rate": 2.9893938897066393e-05, "loss": 0.2417, "num_input_tokens_seen": 955600, "step": 2375 }, { "epoch": 2.474012474012474, "grad_norm": 6.101572036743164, "learning_rate": 2.9804948393114324e-05, "loss": 0.2781, "num_input_tokens_seen": 957456, "step": 2380 }, { "epoch": 2.4792099792099793, "grad_norm": 5.505262851715088, "learning_rate": 2.9715894625717866e-05, "loss": 0.2721, "num_input_tokens_seen": 959504, "step": 2385 }, { "epoch": 2.4844074844074844, "grad_norm": 0.05719423666596413, "learning_rate": 2.9626778767386604e-05, "loss": 0.277, "num_input_tokens_seen": 961488, "step": 2390 }, { "epoch": 2.4896049896049894, "grad_norm": 17.471576690673828, "learning_rate": 2.953760199144764e-05, "loss": 0.2759, "num_input_tokens_seen": 963408, "step": 2395 }, { "epoch": 2.494802494802495, "grad_norm": 12.931529998779297, "learning_rate": 2.9448365472030115e-05, "loss": 0.2633, "num_input_tokens_seen": 965392, "step": 2400 }, { "epoch": 2.5, "grad_norm": 8.689949989318848, "learning_rate": 2.935907038404981e-05, "loss": 0.2744, "num_input_tokens_seen": 967440, "step": 2405 }, { "epoch": 2.505197505197505, "grad_norm": 1.9627736806869507, "learning_rate": 2.92697179031936e-05, "loss": 0.2379, "num_input_tokens_seen": 969360, "step": 2410 }, { "epoch": 2.505197505197505, "eval_loss": 0.2576568126678467, "eval_runtime": 1.0822, "eval_samples_per_second": 790.985, "eval_steps_per_second": 98.873, "num_input_tokens_seen": 969360, "step": 2410 }, { "epoch": 2.51039501039501, "grad_norm": 0.891659140586853, "learning_rate": 2.9180309205904027e-05, "loss": 0.2923, "num_input_tokens_seen": 971472, "step": 2415 }, { "epoch": 2.5155925155925156, "grad_norm": 2.514449119567871, "learning_rate": 2.9090845469363805e-05, "loss": 0.296, "num_input_tokens_seen": 973456, "step": 2420 }, { "epoch": 2.5207900207900207, "grad_norm": 19.587871551513672, "learning_rate": 2.9001327871480294e-05, "loss": 0.2911, "num_input_tokens_seen": 975504, "step": 2425 }, { "epoch": 2.525987525987526, "grad_norm": 16.2788143157959, "learning_rate": 2.8911757590870027e-05, "loss": 0.285, "num_input_tokens_seen": 977552, "step": 2430 }, { "epoch": 2.5311850311850312, "grad_norm": 21.12313461303711, "learning_rate": 2.8822135806843154e-05, "loss": 0.2552, "num_input_tokens_seen": 979536, "step": 2435 }, { "epoch": 2.5363825363825363, "grad_norm": 1.4399539232254028, "learning_rate": 2.8732463699387968e-05, "loss": 0.2906, "num_input_tokens_seen": 981584, "step": 2440 }, { "epoch": 2.5415800415800414, "grad_norm": 13.594979286193848, "learning_rate": 2.8642742449155284e-05, "loss": 0.2795, "num_input_tokens_seen": 983632, "step": 2445 }, { "epoch": 2.546777546777547, "grad_norm": 16.668123245239258, "learning_rate": 2.855297323744301e-05, "loss": 0.228, "num_input_tokens_seen": 985680, "step": 2450 }, { "epoch": 2.551975051975052, "grad_norm": 1.3556967973709106, "learning_rate": 2.8463157246180468e-05, "loss": 0.2414, "num_input_tokens_seen": 987664, "step": 2455 }, { "epoch": 2.5571725571725574, "grad_norm": 8.519729614257812, "learning_rate": 2.8373295657912945e-05, "loss": 0.2636, "num_input_tokens_seen": 989648, "step": 2460 }, { "epoch": 2.5623700623700625, "grad_norm": 2.2857918739318848, "learning_rate": 2.828338965578603e-05, "loss": 0.2691, "num_input_tokens_seen": 991696, "step": 2465 }, { "epoch": 2.5675675675675675, "grad_norm": 5.966533184051514, "learning_rate": 2.8193440423530114e-05, "loss": 0.2598, "num_input_tokens_seen": 993616, "step": 2470 }, { "epoch": 2.5727650727650726, "grad_norm": 2.139225482940674, "learning_rate": 2.810344914544475e-05, "loss": 0.2688, "num_input_tokens_seen": 995664, "step": 2475 }, { "epoch": 2.577962577962578, "grad_norm": 1.0499998331069946, "learning_rate": 2.8013417006383076e-05, "loss": 0.295, "num_input_tokens_seen": 997648, "step": 2480 }, { "epoch": 2.583160083160083, "grad_norm": 8.98536205291748, "learning_rate": 2.792334519173624e-05, "loss": 0.2802, "num_input_tokens_seen": 999696, "step": 2485 }, { "epoch": 2.5883575883575882, "grad_norm": 9.025850296020508, "learning_rate": 2.7833234887417743e-05, "loss": 0.2897, "num_input_tokens_seen": 1001680, "step": 2490 }, { "epoch": 2.5935550935550937, "grad_norm": 4.961789131164551, "learning_rate": 2.7743087279847868e-05, "loss": 0.2723, "num_input_tokens_seen": 1003728, "step": 2495 }, { "epoch": 2.598752598752599, "grad_norm": 11.652027130126953, "learning_rate": 2.765290355593805e-05, "loss": 0.2874, "num_input_tokens_seen": 1005584, "step": 2500 }, { "epoch": 2.603950103950104, "grad_norm": 16.368242263793945, "learning_rate": 2.7562684903075238e-05, "loss": 0.2405, "num_input_tokens_seen": 1007696, "step": 2505 }, { "epoch": 2.609147609147609, "grad_norm": 12.281315803527832, "learning_rate": 2.7472432509106248e-05, "loss": 0.2737, "num_input_tokens_seen": 1009680, "step": 2510 }, { "epoch": 2.6143451143451144, "grad_norm": 16.369632720947266, "learning_rate": 2.7382147562322174e-05, "loss": 0.2753, "num_input_tokens_seen": 1011728, "step": 2515 }, { "epoch": 2.6195426195426195, "grad_norm": 3.395531177520752, "learning_rate": 2.729183125144269e-05, "loss": 0.2553, "num_input_tokens_seen": 1013840, "step": 2520 }, { "epoch": 2.624740124740125, "grad_norm": 7.065828323364258, "learning_rate": 2.7201484765600426e-05, "loss": 0.2564, "num_input_tokens_seen": 1015824, "step": 2525 }, { "epoch": 2.62993762993763, "grad_norm": 2.8372550010681152, "learning_rate": 2.7111109294325297e-05, "loss": 0.277, "num_input_tokens_seen": 1017744, "step": 2530 }, { "epoch": 2.635135135135135, "grad_norm": 12.833436012268066, "learning_rate": 2.702070602752887e-05, "loss": 0.2439, "num_input_tokens_seen": 1019728, "step": 2535 }, { "epoch": 2.64033264033264, "grad_norm": 0.17026355862617493, "learning_rate": 2.693027615548864e-05, "loss": 0.2958, "num_input_tokens_seen": 1021840, "step": 2540 }, { "epoch": 2.6455301455301456, "grad_norm": 10.469789505004883, "learning_rate": 2.6839820868832433e-05, "loss": 0.2809, "num_input_tokens_seen": 1023824, "step": 2545 }, { "epoch": 2.6507276507276507, "grad_norm": 10.419620513916016, "learning_rate": 2.6749341358522674e-05, "loss": 0.2583, "num_input_tokens_seen": 1025616, "step": 2550 }, { "epoch": 2.6559251559251558, "grad_norm": 3.5157597064971924, "learning_rate": 2.665883881584072e-05, "loss": 0.2494, "num_input_tokens_seen": 1027664, "step": 2555 }, { "epoch": 2.6611226611226613, "grad_norm": 1.5977754592895508, "learning_rate": 2.6568314432371183e-05, "loss": 0.2477, "num_input_tokens_seen": 1029648, "step": 2560 }, { "epoch": 2.6663201663201663, "grad_norm": 1.0980393886566162, "learning_rate": 2.6477769399986245e-05, "loss": 0.2402, "num_input_tokens_seen": 1031632, "step": 2565 }, { "epoch": 2.6715176715176714, "grad_norm": 8.981844902038574, "learning_rate": 2.6387204910829956e-05, "loss": 0.2583, "num_input_tokens_seen": 1033488, "step": 2570 }, { "epoch": 2.6767151767151764, "grad_norm": 3.0374419689178467, "learning_rate": 2.629662215730253e-05, "loss": 0.2162, "num_input_tokens_seen": 1035536, "step": 2575 }, { "epoch": 2.681912681912682, "grad_norm": 5.767524242401123, "learning_rate": 2.6206022332044667e-05, "loss": 0.2652, "num_input_tokens_seen": 1037584, "step": 2580 }, { "epoch": 2.687110187110187, "grad_norm": 0.40257641673088074, "learning_rate": 2.6115406627921825e-05, "loss": 0.2401, "num_input_tokens_seen": 1039568, "step": 2585 }, { "epoch": 2.6923076923076925, "grad_norm": 1.3719632625579834, "learning_rate": 2.6024776238008543e-05, "loss": 0.2416, "num_input_tokens_seen": 1041616, "step": 2590 }, { "epoch": 2.6975051975051976, "grad_norm": 0.23526664078235626, "learning_rate": 2.593413235557271e-05, "loss": 0.2479, "num_input_tokens_seen": 1043664, "step": 2595 }, { "epoch": 2.7027027027027026, "grad_norm": 5.976350784301758, "learning_rate": 2.5843476174059872e-05, "loss": 0.2624, "num_input_tokens_seen": 1045520, "step": 2600 }, { "epoch": 2.7079002079002077, "grad_norm": 1.9363540410995483, "learning_rate": 2.5752808887077477e-05, "loss": 0.2519, "num_input_tokens_seen": 1047376, "step": 2605 }, { "epoch": 2.713097713097713, "grad_norm": 0.9820289611816406, "learning_rate": 2.5662131688379242e-05, "loss": 0.2347, "num_input_tokens_seen": 1049360, "step": 2610 }, { "epoch": 2.7182952182952183, "grad_norm": 6.348036766052246, "learning_rate": 2.5571445771849327e-05, "loss": 0.2785, "num_input_tokens_seen": 1051344, "step": 2615 }, { "epoch": 2.7234927234927238, "grad_norm": 2.1040618419647217, "learning_rate": 2.548075233148674e-05, "loss": 0.2622, "num_input_tokens_seen": 1053264, "step": 2620 }, { "epoch": 2.728690228690229, "grad_norm": 3.914247989654541, "learning_rate": 2.5390052561389478e-05, "loss": 0.2798, "num_input_tokens_seen": 1055248, "step": 2625 }, { "epoch": 2.733887733887734, "grad_norm": 10.045319557189941, "learning_rate": 2.529934765573893e-05, "loss": 0.2568, "num_input_tokens_seen": 1057104, "step": 2630 }, { "epoch": 2.739085239085239, "grad_norm": 6.609117031097412, "learning_rate": 2.520863880878408e-05, "loss": 0.2622, "num_input_tokens_seen": 1059024, "step": 2635 }, { "epoch": 2.7442827442827444, "grad_norm": 8.553672790527344, "learning_rate": 2.511792721482581e-05, "loss": 0.2707, "num_input_tokens_seen": 1060944, "step": 2640 }, { "epoch": 2.7494802494802495, "grad_norm": 6.869534969329834, "learning_rate": 2.502721406820116e-05, "loss": 0.2525, "num_input_tokens_seen": 1062992, "step": 2645 }, { "epoch": 2.7546777546777546, "grad_norm": 8.741276741027832, "learning_rate": 2.4936500563267627e-05, "loss": 0.2934, "num_input_tokens_seen": 1064848, "step": 2650 }, { "epoch": 2.7557172557172556, "eval_loss": 0.2561495900154114, "eval_runtime": 1.071, "eval_samples_per_second": 799.235, "eval_steps_per_second": 99.904, "num_input_tokens_seen": 1065232, "step": 2651 }, { "epoch": 2.75987525987526, "grad_norm": 10.544535636901855, "learning_rate": 2.4845787894387425e-05, "loss": 0.2493, "num_input_tokens_seen": 1066832, "step": 2655 }, { "epoch": 2.765072765072765, "grad_norm": 7.622214317321777, "learning_rate": 2.4755077255911743e-05, "loss": 0.2661, "num_input_tokens_seen": 1068880, "step": 2660 }, { "epoch": 2.77027027027027, "grad_norm": 7.094521522521973, "learning_rate": 2.4664369842165068e-05, "loss": 0.2398, "num_input_tokens_seen": 1070864, "step": 2665 }, { "epoch": 2.7754677754677752, "grad_norm": 9.394824981689453, "learning_rate": 2.4573666847429384e-05, "loss": 0.2523, "num_input_tokens_seen": 1072848, "step": 2670 }, { "epoch": 2.7806652806652807, "grad_norm": 5.644352436065674, "learning_rate": 2.4482969465928543e-05, "loss": 0.3167, "num_input_tokens_seen": 1074832, "step": 2675 }, { "epoch": 2.785862785862786, "grad_norm": 11.34897232055664, "learning_rate": 2.4392278891812455e-05, "loss": 0.2951, "num_input_tokens_seen": 1076944, "step": 2680 }, { "epoch": 2.7910602910602913, "grad_norm": 4.7796196937561035, "learning_rate": 2.430159631914141e-05, "loss": 0.2844, "num_input_tokens_seen": 1078800, "step": 2685 }, { "epoch": 2.7962577962577964, "grad_norm": 6.175029277801514, "learning_rate": 2.4210922941870367e-05, "loss": 0.2706, "num_input_tokens_seen": 1080912, "step": 2690 }, { "epoch": 2.8014553014553014, "grad_norm": 2.2498137950897217, "learning_rate": 2.41202599538332e-05, "loss": 0.244, "num_input_tokens_seen": 1082960, "step": 2695 }, { "epoch": 2.8066528066528065, "grad_norm": 14.539408683776855, "learning_rate": 2.402960854872697e-05, "loss": 0.2877, "num_input_tokens_seen": 1085008, "step": 2700 }, { "epoch": 2.811850311850312, "grad_norm": 6.818408489227295, "learning_rate": 2.39389699200963e-05, "loss": 0.1818, "num_input_tokens_seen": 1087184, "step": 2705 }, { "epoch": 2.817047817047817, "grad_norm": 8.152698516845703, "learning_rate": 2.384834526131752e-05, "loss": 0.2384, "num_input_tokens_seen": 1089104, "step": 2710 }, { "epoch": 2.822245322245322, "grad_norm": 0.4875684976577759, "learning_rate": 2.3757735765583083e-05, "loss": 0.2707, "num_input_tokens_seen": 1091024, "step": 2715 }, { "epoch": 2.8274428274428276, "grad_norm": 0.5857343077659607, "learning_rate": 2.366714262588577e-05, "loss": 0.2399, "num_input_tokens_seen": 1093008, "step": 2720 }, { "epoch": 2.8326403326403327, "grad_norm": 0.10867218673229218, "learning_rate": 2.3576567035003027e-05, "loss": 0.2595, "num_input_tokens_seen": 1094992, "step": 2725 }, { "epoch": 2.8378378378378377, "grad_norm": 5.911508083343506, "learning_rate": 2.3486010185481248e-05, "loss": 0.2918, "num_input_tokens_seen": 1097040, "step": 2730 }, { "epoch": 2.8430353430353428, "grad_norm": 12.04796314239502, "learning_rate": 2.3395473269620056e-05, "loss": 0.274, "num_input_tokens_seen": 1098960, "step": 2735 }, { "epoch": 2.8482328482328483, "grad_norm": 15.308754920959473, "learning_rate": 2.330495747945665e-05, "loss": 0.2749, "num_input_tokens_seen": 1101200, "step": 2740 }, { "epoch": 2.8534303534303533, "grad_norm": 16.627357482910156, "learning_rate": 2.321446400675005e-05, "loss": 0.2766, "num_input_tokens_seen": 1103120, "step": 2745 }, { "epoch": 2.858627858627859, "grad_norm": 20.289390563964844, "learning_rate": 2.3123994042965453e-05, "loss": 0.255, "num_input_tokens_seen": 1105168, "step": 2750 }, { "epoch": 2.863825363825364, "grad_norm": 8.598325729370117, "learning_rate": 2.3033548779258535e-05, "loss": 0.2452, "num_input_tokens_seen": 1107152, "step": 2755 }, { "epoch": 2.869022869022869, "grad_norm": 13.070589065551758, "learning_rate": 2.294312940645975e-05, "loss": 0.3016, "num_input_tokens_seen": 1109200, "step": 2760 }, { "epoch": 2.874220374220374, "grad_norm": 2.4811654090881348, "learning_rate": 2.2852737115058682e-05, "loss": 0.282, "num_input_tokens_seen": 1111248, "step": 2765 }, { "epoch": 2.8794178794178795, "grad_norm": 9.034327507019043, "learning_rate": 2.276237309518834e-05, "loss": 0.2753, "num_input_tokens_seen": 1113232, "step": 2770 }, { "epoch": 2.8846153846153846, "grad_norm": 0.18150383234024048, "learning_rate": 2.2672038536609487e-05, "loss": 0.268, "num_input_tokens_seen": 1115216, "step": 2775 }, { "epoch": 2.88981288981289, "grad_norm": 4.699392318725586, "learning_rate": 2.2581734628695034e-05, "loss": 0.2816, "num_input_tokens_seen": 1117264, "step": 2780 }, { "epoch": 2.895010395010395, "grad_norm": 4.98245906829834, "learning_rate": 2.2491462560414287e-05, "loss": 0.2795, "num_input_tokens_seen": 1119376, "step": 2785 }, { "epoch": 2.9002079002079, "grad_norm": 7.173026084899902, "learning_rate": 2.2401223520317362e-05, "loss": 0.283, "num_input_tokens_seen": 1121424, "step": 2790 }, { "epoch": 2.9054054054054053, "grad_norm": 1.0886781215667725, "learning_rate": 2.2311018696519532e-05, "loss": 0.2663, "num_input_tokens_seen": 1123472, "step": 2795 }, { "epoch": 2.9106029106029108, "grad_norm": 3.959348440170288, "learning_rate": 2.222084927668553e-05, "loss": 0.218, "num_input_tokens_seen": 1125584, "step": 2800 }, { "epoch": 2.915800415800416, "grad_norm": 18.90048599243164, "learning_rate": 2.2130716448014e-05, "loss": 0.2474, "num_input_tokens_seen": 1127568, "step": 2805 }, { "epoch": 2.920997920997921, "grad_norm": 19.015106201171875, "learning_rate": 2.204062139722176e-05, "loss": 0.3308, "num_input_tokens_seen": 1129552, "step": 2810 }, { "epoch": 2.9261954261954264, "grad_norm": 3.917026996612549, "learning_rate": 2.1950565310528266e-05, "loss": 0.2914, "num_input_tokens_seen": 1131472, "step": 2815 }, { "epoch": 2.9313929313929314, "grad_norm": 11.03906536102295, "learning_rate": 2.186054937363996e-05, "loss": 0.3008, "num_input_tokens_seen": 1133392, "step": 2820 }, { "epoch": 2.9365904365904365, "grad_norm": 9.081315040588379, "learning_rate": 2.1770574771734642e-05, "loss": 0.2747, "num_input_tokens_seen": 1135440, "step": 2825 }, { "epoch": 2.9417879417879416, "grad_norm": 1.4751437902450562, "learning_rate": 2.168064268944591e-05, "loss": 0.2694, "num_input_tokens_seen": 1137424, "step": 2830 }, { "epoch": 2.946985446985447, "grad_norm": 0.2908379137516022, "learning_rate": 2.159075431084751e-05, "loss": 0.2574, "num_input_tokens_seen": 1139408, "step": 2835 }, { "epoch": 2.952182952182952, "grad_norm": 6.697641372680664, "learning_rate": 2.1500910819437766e-05, "loss": 0.2677, "num_input_tokens_seen": 1141456, "step": 2840 }, { "epoch": 2.9573804573804576, "grad_norm": 0.9215155243873596, "learning_rate": 2.141111339812405e-05, "loss": 0.2513, "num_input_tokens_seen": 1143440, "step": 2845 }, { "epoch": 2.9625779625779627, "grad_norm": 0.976637601852417, "learning_rate": 2.1321363229207096e-05, "loss": 0.256, "num_input_tokens_seen": 1145360, "step": 2850 }, { "epoch": 2.9677754677754677, "grad_norm": 13.551946640014648, "learning_rate": 2.123166149436556e-05, "loss": 0.2779, "num_input_tokens_seen": 1147280, "step": 2855 }, { "epoch": 2.972972972972973, "grad_norm": 11.918627738952637, "learning_rate": 2.114200937464035e-05, "loss": 0.2814, "num_input_tokens_seen": 1149200, "step": 2860 }, { "epoch": 2.9781704781704783, "grad_norm": 19.893056869506836, "learning_rate": 2.1052408050419152e-05, "loss": 0.2671, "num_input_tokens_seen": 1151184, "step": 2865 }, { "epoch": 2.9833679833679834, "grad_norm": 12.081989288330078, "learning_rate": 2.0962858701420866e-05, "loss": 0.2391, "num_input_tokens_seen": 1153232, "step": 2870 }, { "epoch": 2.9885654885654884, "grad_norm": 9.680088996887207, "learning_rate": 2.0873362506680057e-05, "loss": 0.251, "num_input_tokens_seen": 1155216, "step": 2875 }, { "epoch": 2.993762993762994, "grad_norm": 7.104877471923828, "learning_rate": 2.078392064453144e-05, "loss": 0.2661, "num_input_tokens_seen": 1157264, "step": 2880 }, { "epoch": 2.998960498960499, "grad_norm": 0.48920291662216187, "learning_rate": 2.0694534292594392e-05, "loss": 0.2609, "num_input_tokens_seen": 1159312, "step": 2885 }, { "epoch": 3.004158004158004, "grad_norm": 7.250494003295898, "learning_rate": 2.0605204627757403e-05, "loss": 0.2209, "num_input_tokens_seen": 1161248, "step": 2890 }, { "epoch": 3.006237006237006, "eval_loss": 0.257083535194397, "eval_runtime": 1.0821, "eval_samples_per_second": 791.084, "eval_steps_per_second": 98.885, "num_input_tokens_seen": 1162016, "step": 2892 }, { "epoch": 3.0093555093555096, "grad_norm": 14.574300765991211, "learning_rate": 2.051593282616262e-05, "loss": 0.2146, "num_input_tokens_seen": 1163168, "step": 2895 }, { "epoch": 3.0145530145530146, "grad_norm": 15.882224082946777, "learning_rate": 2.0426720063190335e-05, "loss": 0.2637, "num_input_tokens_seen": 1165088, "step": 2900 }, { "epoch": 3.0197505197505197, "grad_norm": 17.72052574157715, "learning_rate": 2.033756751344352e-05, "loss": 0.2303, "num_input_tokens_seen": 1167136, "step": 2905 }, { "epoch": 3.024948024948025, "grad_norm": 6.26023530960083, "learning_rate": 2.0248476350732368e-05, "loss": 0.2915, "num_input_tokens_seen": 1169120, "step": 2910 }, { "epoch": 3.0301455301455302, "grad_norm": 10.183695793151855, "learning_rate": 2.0159447748058805e-05, "loss": 0.3333, "num_input_tokens_seen": 1171040, "step": 2915 }, { "epoch": 3.0353430353430353, "grad_norm": 5.654576778411865, "learning_rate": 2.0070482877601127e-05, "loss": 0.2142, "num_input_tokens_seen": 1173024, "step": 2920 }, { "epoch": 3.0405405405405403, "grad_norm": 6.201435565948486, "learning_rate": 1.998158291069845e-05, "loss": 0.2912, "num_input_tokens_seen": 1174944, "step": 2925 }, { "epoch": 3.045738045738046, "grad_norm": 6.677492141723633, "learning_rate": 1.9892749017835384e-05, "loss": 0.2799, "num_input_tokens_seen": 1177056, "step": 2930 }, { "epoch": 3.050935550935551, "grad_norm": 22.7122859954834, "learning_rate": 1.9803982368626583e-05, "loss": 0.318, "num_input_tokens_seen": 1178976, "step": 2935 }, { "epoch": 3.056133056133056, "grad_norm": 8.921732902526855, "learning_rate": 1.9715284131801353e-05, "loss": 0.2836, "num_input_tokens_seen": 1181024, "step": 2940 }, { "epoch": 3.0613305613305615, "grad_norm": 1.3563765287399292, "learning_rate": 1.9626655475188238e-05, "loss": 0.2515, "num_input_tokens_seen": 1183008, "step": 2945 }, { "epoch": 3.0665280665280665, "grad_norm": 9.756816864013672, "learning_rate": 1.953809756569971e-05, "loss": 0.2949, "num_input_tokens_seen": 1185056, "step": 2950 }, { "epoch": 3.0717255717255716, "grad_norm": 10.324029922485352, "learning_rate": 1.9449611569316717e-05, "loss": 0.2646, "num_input_tokens_seen": 1186976, "step": 2955 }, { "epoch": 3.076923076923077, "grad_norm": 1.5440117120742798, "learning_rate": 1.9361198651073408e-05, "loss": 0.3027, "num_input_tokens_seen": 1188960, "step": 2960 }, { "epoch": 3.082120582120582, "grad_norm": 11.017732620239258, "learning_rate": 1.9272859975041754e-05, "loss": 0.2629, "num_input_tokens_seen": 1190944, "step": 2965 }, { "epoch": 3.087318087318087, "grad_norm": 19.006444931030273, "learning_rate": 1.918459670431622e-05, "loss": 0.2261, "num_input_tokens_seen": 1192928, "step": 2970 }, { "epoch": 3.0925155925155927, "grad_norm": 7.811370849609375, "learning_rate": 1.9096410000998475e-05, "loss": 0.239, "num_input_tokens_seen": 1194848, "step": 2975 }, { "epoch": 3.0977130977130978, "grad_norm": 1.5552674531936646, "learning_rate": 1.900830102618206e-05, "loss": 0.2484, "num_input_tokens_seen": 1196768, "step": 2980 }, { "epoch": 3.102910602910603, "grad_norm": 9.481167793273926, "learning_rate": 1.892027093993716e-05, "loss": 0.2739, "num_input_tokens_seen": 1198688, "step": 2985 }, { "epoch": 3.108108108108108, "grad_norm": 8.231619834899902, "learning_rate": 1.8832320901295227e-05, "loss": 0.2701, "num_input_tokens_seen": 1200672, "step": 2990 }, { "epoch": 3.1133056133056134, "grad_norm": 9.055398941040039, "learning_rate": 1.8744452068233825e-05, "loss": 0.29, "num_input_tokens_seen": 1202720, "step": 2995 }, { "epoch": 3.1185031185031185, "grad_norm": 1.7919214963912964, "learning_rate": 1.8656665597661333e-05, "loss": 0.2816, "num_input_tokens_seen": 1204768, "step": 3000 }, { "epoch": 3.1237006237006235, "grad_norm": 9.331473350524902, "learning_rate": 1.85689626454017e-05, "loss": 0.2677, "num_input_tokens_seen": 1206944, "step": 3005 }, { "epoch": 3.128898128898129, "grad_norm": 9.339173316955566, "learning_rate": 1.8481344366179284e-05, "loss": 0.2308, "num_input_tokens_seen": 1209056, "step": 3010 }, { "epoch": 3.134095634095634, "grad_norm": 0.4089166224002838, "learning_rate": 1.839381191360358e-05, "loss": 0.2745, "num_input_tokens_seen": 1210976, "step": 3015 }, { "epoch": 3.139293139293139, "grad_norm": 13.044018745422363, "learning_rate": 1.8306366440154066e-05, "loss": 0.2645, "num_input_tokens_seen": 1213024, "step": 3020 }, { "epoch": 3.1444906444906446, "grad_norm": 3.4399261474609375, "learning_rate": 1.821900909716504e-05, "loss": 0.2854, "num_input_tokens_seen": 1215136, "step": 3025 }, { "epoch": 3.1496881496881497, "grad_norm": 16.901412963867188, "learning_rate": 1.8131741034810435e-05, "loss": 0.2667, "num_input_tokens_seen": 1217056, "step": 3030 }, { "epoch": 3.1548856548856548, "grad_norm": 4.719562530517578, "learning_rate": 1.8044563402088684e-05, "loss": 0.2799, "num_input_tokens_seen": 1219168, "step": 3035 }, { "epoch": 3.1600831600831603, "grad_norm": 0.4354883134365082, "learning_rate": 1.795747734680762e-05, "loss": 0.2724, "num_input_tokens_seen": 1221088, "step": 3040 }, { "epoch": 3.1652806652806653, "grad_norm": 4.838202953338623, "learning_rate": 1.7870484015569306e-05, "loss": 0.2666, "num_input_tokens_seen": 1223264, "step": 3045 }, { "epoch": 3.1704781704781704, "grad_norm": 0.40550848841667175, "learning_rate": 1.7783584553755006e-05, "loss": 0.2586, "num_input_tokens_seen": 1225440, "step": 3050 }, { "epoch": 3.175675675675676, "grad_norm": 8.755316734313965, "learning_rate": 1.769678010551003e-05, "loss": 0.2519, "num_input_tokens_seen": 1227424, "step": 3055 }, { "epoch": 3.180873180873181, "grad_norm": 8.232521057128906, "learning_rate": 1.761007181372874e-05, "loss": 0.241, "num_input_tokens_seen": 1229344, "step": 3060 }, { "epoch": 3.186070686070686, "grad_norm": 7.669497013092041, "learning_rate": 1.7523460820039464e-05, "loss": 0.2406, "num_input_tokens_seen": 1231456, "step": 3065 }, { "epoch": 3.1912681912681915, "grad_norm": 12.419413566589355, "learning_rate": 1.7436948264789466e-05, "loss": 0.3145, "num_input_tokens_seen": 1233440, "step": 3070 }, { "epoch": 3.1964656964656966, "grad_norm": 2.3379440307617188, "learning_rate": 1.7350535287029957e-05, "loss": 0.2568, "num_input_tokens_seen": 1235552, "step": 3075 }, { "epoch": 3.2016632016632016, "grad_norm": 10.220464706420898, "learning_rate": 1.7264223024501064e-05, "loss": 0.2696, "num_input_tokens_seen": 1237536, "step": 3080 }, { "epoch": 3.2068607068607067, "grad_norm": 17.75920867919922, "learning_rate": 1.717801261361685e-05, "loss": 0.2861, "num_input_tokens_seen": 1239584, "step": 3085 }, { "epoch": 3.212058212058212, "grad_norm": 7.69419002532959, "learning_rate": 1.7091905189450423e-05, "loss": 0.2501, "num_input_tokens_seen": 1241504, "step": 3090 }, { "epoch": 3.2172557172557172, "grad_norm": 6.530004501342773, "learning_rate": 1.700590188571887e-05, "loss": 0.2578, "num_input_tokens_seen": 1243552, "step": 3095 }, { "epoch": 3.2224532224532223, "grad_norm": 2.1304476261138916, "learning_rate": 1.6920003834768438e-05, "loss": 0.3075, "num_input_tokens_seen": 1245600, "step": 3100 }, { "epoch": 3.227650727650728, "grad_norm": 19.96734046936035, "learning_rate": 1.6834212167559575e-05, "loss": 0.2389, "num_input_tokens_seen": 1247712, "step": 3105 }, { "epoch": 3.232848232848233, "grad_norm": 12.024674415588379, "learning_rate": 1.674852801365203e-05, "loss": 0.26, "num_input_tokens_seen": 1249696, "step": 3110 }, { "epoch": 3.238045738045738, "grad_norm": 6.835387229919434, "learning_rate": 1.6662952501190033e-05, "loss": 0.3027, "num_input_tokens_seen": 1251808, "step": 3115 }, { "epoch": 3.2432432432432434, "grad_norm": 8.725172996520996, "learning_rate": 1.6577486756887374e-05, "loss": 0.2315, "num_input_tokens_seen": 1253728, "step": 3120 }, { "epoch": 3.2484407484407485, "grad_norm": 0.420453816652298, "learning_rate": 1.649213190601261e-05, "loss": 0.249, "num_input_tokens_seen": 1255840, "step": 3125 }, { "epoch": 3.2536382536382535, "grad_norm": 8.639700889587402, "learning_rate": 1.640688907237425e-05, "loss": 0.2647, "num_input_tokens_seen": 1257888, "step": 3130 }, { "epoch": 3.2567567567567566, "eval_loss": 0.2563324272632599, "eval_runtime": 1.0592, "eval_samples_per_second": 808.124, "eval_steps_per_second": 101.015, "num_input_tokens_seen": 1259168, "step": 3133 }, { "epoch": 3.258835758835759, "grad_norm": 13.672746658325195, "learning_rate": 1.632175937830594e-05, "loss": 0.2584, "num_input_tokens_seen": 1259936, "step": 3135 }, { "epoch": 3.264033264033264, "grad_norm": 14.988076210021973, "learning_rate": 1.6236743944651703e-05, "loss": 0.2355, "num_input_tokens_seen": 1262112, "step": 3140 }, { "epoch": 3.269230769230769, "grad_norm": 4.268918037414551, "learning_rate": 1.615184389075117e-05, "loss": 0.2481, "num_input_tokens_seen": 1263904, "step": 3145 }, { "epoch": 3.274428274428274, "grad_norm": 2.765204429626465, "learning_rate": 1.6067060334424835e-05, "loss": 0.2607, "num_input_tokens_seen": 1265952, "step": 3150 }, { "epoch": 3.2796257796257797, "grad_norm": 18.85066795349121, "learning_rate": 1.5982394391959382e-05, "loss": 0.3119, "num_input_tokens_seen": 1267872, "step": 3155 }, { "epoch": 3.284823284823285, "grad_norm": 2.8350369930267334, "learning_rate": 1.58978471780929e-05, "loss": 0.2925, "num_input_tokens_seen": 1269792, "step": 3160 }, { "epoch": 3.29002079002079, "grad_norm": 6.0720438957214355, "learning_rate": 1.581341980600033e-05, "loss": 0.2586, "num_input_tokens_seen": 1271776, "step": 3165 }, { "epoch": 3.2952182952182953, "grad_norm": 15.48050308227539, "learning_rate": 1.5729113387278673e-05, "loss": 0.2671, "num_input_tokens_seen": 1273760, "step": 3170 }, { "epoch": 3.3004158004158004, "grad_norm": 4.232337951660156, "learning_rate": 1.5644929031932454e-05, "loss": 0.2864, "num_input_tokens_seen": 1275808, "step": 3175 }, { "epoch": 3.3056133056133055, "grad_norm": 4.032689094543457, "learning_rate": 1.5560867848359077e-05, "loss": 0.2666, "num_input_tokens_seen": 1277792, "step": 3180 }, { "epoch": 3.310810810810811, "grad_norm": 3.819479465484619, "learning_rate": 1.547693094333421e-05, "loss": 0.2648, "num_input_tokens_seen": 1279776, "step": 3185 }, { "epoch": 3.316008316008316, "grad_norm": 5.751096725463867, "learning_rate": 1.539311942199725e-05, "loss": 0.2586, "num_input_tokens_seen": 1281760, "step": 3190 }, { "epoch": 3.321205821205821, "grad_norm": 5.748921871185303, "learning_rate": 1.5309434387836735e-05, "loss": 0.2391, "num_input_tokens_seen": 1283744, "step": 3195 }, { "epoch": 3.3264033264033266, "grad_norm": 4.8532915115356445, "learning_rate": 1.5225876942675842e-05, "loss": 0.2907, "num_input_tokens_seen": 1285792, "step": 3200 }, { "epoch": 3.3316008316008316, "grad_norm": 2.2807741165161133, "learning_rate": 1.5142448186657878e-05, "loss": 0.2942, "num_input_tokens_seen": 1287776, "step": 3205 }, { "epoch": 3.3367983367983367, "grad_norm": 0.5820121169090271, "learning_rate": 1.505914921823178e-05, "loss": 0.3321, "num_input_tokens_seen": 1289696, "step": 3210 }, { "epoch": 3.3419958419958418, "grad_norm": 2.5034902095794678, "learning_rate": 1.4975981134137659e-05, "loss": 0.2486, "num_input_tokens_seen": 1291680, "step": 3215 }, { "epoch": 3.3471933471933473, "grad_norm": 1.1145634651184082, "learning_rate": 1.489294502939238e-05, "loss": 0.2502, "num_input_tokens_seen": 1293536, "step": 3220 }, { "epoch": 3.3523908523908523, "grad_norm": 7.527857780456543, "learning_rate": 1.4810041997275092e-05, "loss": 0.2878, "num_input_tokens_seen": 1295712, "step": 3225 }, { "epoch": 3.357588357588358, "grad_norm": 12.356915473937988, "learning_rate": 1.4727273129312918e-05, "loss": 0.2824, "num_input_tokens_seen": 1297760, "step": 3230 }, { "epoch": 3.362785862785863, "grad_norm": 4.496915817260742, "learning_rate": 1.4644639515266483e-05, "loss": 0.2772, "num_input_tokens_seen": 1299808, "step": 3235 }, { "epoch": 3.367983367983368, "grad_norm": 2.9005799293518066, "learning_rate": 1.4562142243115644e-05, "loss": 0.2602, "num_input_tokens_seen": 1301920, "step": 3240 }, { "epoch": 3.373180873180873, "grad_norm": 1.413136601448059, "learning_rate": 1.4479782399045152e-05, "loss": 0.2737, "num_input_tokens_seen": 1303904, "step": 3245 }, { "epoch": 3.3783783783783785, "grad_norm": 15.236632347106934, "learning_rate": 1.4397561067430298e-05, "loss": 0.2683, "num_input_tokens_seen": 1305888, "step": 3250 }, { "epoch": 3.3835758835758836, "grad_norm": 13.490982055664062, "learning_rate": 1.4315479330822712e-05, "loss": 0.261, "num_input_tokens_seen": 1308064, "step": 3255 }, { "epoch": 3.3887733887733886, "grad_norm": 4.421963691711426, "learning_rate": 1.4233538269936042e-05, "loss": 0.2702, "num_input_tokens_seen": 1310048, "step": 3260 }, { "epoch": 3.393970893970894, "grad_norm": 10.610108375549316, "learning_rate": 1.415173896363178e-05, "loss": 0.2761, "num_input_tokens_seen": 1311968, "step": 3265 }, { "epoch": 3.399168399168399, "grad_norm": 4.008954048156738, "learning_rate": 1.4070082488905034e-05, "loss": 0.2631, "num_input_tokens_seen": 1313888, "step": 3270 }, { "epoch": 3.4043659043659042, "grad_norm": 2.906602144241333, "learning_rate": 1.3988569920870314e-05, "loss": 0.2702, "num_input_tokens_seen": 1316064, "step": 3275 }, { "epoch": 3.4095634095634098, "grad_norm": 12.885231018066406, "learning_rate": 1.3907202332747454e-05, "loss": 0.2643, "num_input_tokens_seen": 1318112, "step": 3280 }, { "epoch": 3.414760914760915, "grad_norm": 0.1723124235868454, "learning_rate": 1.3825980795847402e-05, "loss": 0.2877, "num_input_tokens_seen": 1319968, "step": 3285 }, { "epoch": 3.41995841995842, "grad_norm": 7.7092390060424805, "learning_rate": 1.3744906379558165e-05, "loss": 0.271, "num_input_tokens_seen": 1322016, "step": 3290 }, { "epoch": 3.4251559251559254, "grad_norm": 1.4101628065109253, "learning_rate": 1.3663980151330732e-05, "loss": 0.2729, "num_input_tokens_seen": 1323936, "step": 3295 }, { "epoch": 3.4303534303534304, "grad_norm": 1.401780128479004, "learning_rate": 1.3583203176664961e-05, "loss": 0.261, "num_input_tokens_seen": 1325920, "step": 3300 }, { "epoch": 3.4355509355509355, "grad_norm": 21.787748336791992, "learning_rate": 1.350257651909562e-05, "loss": 0.2498, "num_input_tokens_seen": 1327840, "step": 3305 }, { "epoch": 3.4407484407484406, "grad_norm": 8.913102149963379, "learning_rate": 1.3422101240178365e-05, "loss": 0.2384, "num_input_tokens_seen": 1329760, "step": 3310 }, { "epoch": 3.445945945945946, "grad_norm": 2.156329870223999, "learning_rate": 1.3341778399475713e-05, "loss": 0.2789, "num_input_tokens_seen": 1331744, "step": 3315 }, { "epoch": 3.451143451143451, "grad_norm": 6.306075096130371, "learning_rate": 1.3261609054543179e-05, "loss": 0.26, "num_input_tokens_seen": 1333792, "step": 3320 }, { "epoch": 3.456340956340956, "grad_norm": 8.915762901306152, "learning_rate": 1.3181594260915262e-05, "loss": 0.2975, "num_input_tokens_seen": 1335776, "step": 3325 }, { "epoch": 3.4615384615384617, "grad_norm": 2.6008245944976807, "learning_rate": 1.3101735072091622e-05, "loss": 0.2479, "num_input_tokens_seen": 1337824, "step": 3330 }, { "epoch": 3.4667359667359667, "grad_norm": 10.535400390625, "learning_rate": 1.3022032539523176e-05, "loss": 0.223, "num_input_tokens_seen": 1339872, "step": 3335 }, { "epoch": 3.471933471933472, "grad_norm": 7.183012008666992, "learning_rate": 1.2942487712598234e-05, "loss": 0.2543, "num_input_tokens_seen": 1341920, "step": 3340 }, { "epoch": 3.4771309771309773, "grad_norm": 8.969158172607422, "learning_rate": 1.2863101638628717e-05, "loss": 0.2449, "num_input_tokens_seen": 1343904, "step": 3345 }, { "epoch": 3.4823284823284824, "grad_norm": 14.45447826385498, "learning_rate": 1.2783875362836373e-05, "loss": 0.2881, "num_input_tokens_seen": 1345952, "step": 3350 }, { "epoch": 3.4875259875259874, "grad_norm": 5.68330717086792, "learning_rate": 1.2704809928338956e-05, "loss": 0.2574, "num_input_tokens_seen": 1348128, "step": 3355 }, { "epoch": 3.492723492723493, "grad_norm": 4.9651079177856445, "learning_rate": 1.2625906376136581e-05, "loss": 0.2915, "num_input_tokens_seen": 1350048, "step": 3360 }, { "epoch": 3.497920997920998, "grad_norm": 0.1253027617931366, "learning_rate": 1.2547165745097928e-05, "loss": 0.2653, "num_input_tokens_seen": 1351968, "step": 3365 }, { "epoch": 3.503118503118503, "grad_norm": 7.796263217926025, "learning_rate": 1.2468589071946632e-05, "loss": 0.2795, "num_input_tokens_seen": 1353952, "step": 3370 }, { "epoch": 3.507276507276507, "eval_loss": 0.2641850709915161, "eval_runtime": 1.0564, "eval_samples_per_second": 810.335, "eval_steps_per_second": 101.292, "num_input_tokens_seen": 1355552, "step": 3374 }, { "epoch": 3.508316008316008, "grad_norm": 0.7158005237579346, "learning_rate": 1.2390177391247614e-05, "loss": 0.2673, "num_input_tokens_seen": 1356000, "step": 3375 }, { "epoch": 3.5135135135135136, "grad_norm": 2.2780776023864746, "learning_rate": 1.2311931735393417e-05, "loss": 0.2527, "num_input_tokens_seen": 1357984, "step": 3380 }, { "epoch": 3.5187110187110187, "grad_norm": 2.900038957595825, "learning_rate": 1.2233853134590697e-05, "loss": 0.2635, "num_input_tokens_seen": 1359904, "step": 3385 }, { "epoch": 3.523908523908524, "grad_norm": 9.76952075958252, "learning_rate": 1.215594261684656e-05, "loss": 0.2737, "num_input_tokens_seen": 1361952, "step": 3390 }, { "epoch": 3.529106029106029, "grad_norm": 0.9151262044906616, "learning_rate": 1.2078201207955123e-05, "loss": 0.2521, "num_input_tokens_seen": 1364000, "step": 3395 }, { "epoch": 3.5343035343035343, "grad_norm": 7.289126873016357, "learning_rate": 1.2000629931483947e-05, "loss": 0.246, "num_input_tokens_seen": 1366112, "step": 3400 }, { "epoch": 3.5395010395010393, "grad_norm": 7.83226203918457, "learning_rate": 1.1923229808760564e-05, "loss": 0.3136, "num_input_tokens_seen": 1368096, "step": 3405 }, { "epoch": 3.544698544698545, "grad_norm": 0.7650713324546814, "learning_rate": 1.1846001858859054e-05, "loss": 0.2365, "num_input_tokens_seen": 1370208, "step": 3410 }, { "epoch": 3.54989604989605, "grad_norm": 8.69223403930664, "learning_rate": 1.1768947098586628e-05, "loss": 0.2545, "num_input_tokens_seen": 1372192, "step": 3415 }, { "epoch": 3.555093555093555, "grad_norm": 8.523820877075195, "learning_rate": 1.1692066542470201e-05, "loss": 0.2743, "num_input_tokens_seen": 1374240, "step": 3420 }, { "epoch": 3.5602910602910605, "grad_norm": 13.287330627441406, "learning_rate": 1.1615361202743088e-05, "loss": 0.2821, "num_input_tokens_seen": 1376160, "step": 3425 }, { "epoch": 3.5654885654885655, "grad_norm": 6.943297386169434, "learning_rate": 1.1538832089331628e-05, "loss": 0.2579, "num_input_tokens_seen": 1378208, "step": 3430 }, { "epoch": 3.5706860706860706, "grad_norm": 9.258183479309082, "learning_rate": 1.1462480209841928e-05, "loss": 0.2526, "num_input_tokens_seen": 1380192, "step": 3435 }, { "epoch": 3.5758835758835756, "grad_norm": 0.5721746683120728, "learning_rate": 1.138630656954658e-05, "loss": 0.2369, "num_input_tokens_seen": 1382368, "step": 3440 }, { "epoch": 3.581081081081081, "grad_norm": 7.4510297775268555, "learning_rate": 1.1310312171371393e-05, "loss": 0.3155, "num_input_tokens_seen": 1384608, "step": 3445 }, { "epoch": 3.586278586278586, "grad_norm": 17.350297927856445, "learning_rate": 1.1234498015882261e-05, "loss": 0.2615, "num_input_tokens_seen": 1386592, "step": 3450 }, { "epoch": 3.5914760914760917, "grad_norm": 5.5357513427734375, "learning_rate": 1.1158865101271906e-05, "loss": 0.2614, "num_input_tokens_seen": 1388448, "step": 3455 }, { "epoch": 3.5966735966735968, "grad_norm": 7.910511016845703, "learning_rate": 1.1083414423346807e-05, "loss": 0.2222, "num_input_tokens_seen": 1390560, "step": 3460 }, { "epoch": 3.601871101871102, "grad_norm": 13.7831392288208, "learning_rate": 1.1008146975514059e-05, "loss": 0.3213, "num_input_tokens_seen": 1392736, "step": 3465 }, { "epoch": 3.607068607068607, "grad_norm": 0.9229243397712708, "learning_rate": 1.0933063748768254e-05, "loss": 0.2762, "num_input_tokens_seen": 1394720, "step": 3470 }, { "epoch": 3.6122661122661124, "grad_norm": 8.663399696350098, "learning_rate": 1.0858165731678513e-05, "loss": 0.23, "num_input_tokens_seen": 1396640, "step": 3475 }, { "epoch": 3.6174636174636174, "grad_norm": 13.28499984741211, "learning_rate": 1.0783453910375424e-05, "loss": 0.3322, "num_input_tokens_seen": 1398752, "step": 3480 }, { "epoch": 3.6226611226611225, "grad_norm": 8.6603364944458, "learning_rate": 1.0708929268538034e-05, "loss": 0.2494, "num_input_tokens_seen": 1400800, "step": 3485 }, { "epoch": 3.627858627858628, "grad_norm": 16.12564468383789, "learning_rate": 1.0634592787380965e-05, "loss": 0.2596, "num_input_tokens_seen": 1402720, "step": 3490 }, { "epoch": 3.633056133056133, "grad_norm": 7.862550735473633, "learning_rate": 1.0560445445641423e-05, "loss": 0.2563, "num_input_tokens_seen": 1404704, "step": 3495 }, { "epoch": 3.638253638253638, "grad_norm": 0.07106157392263412, "learning_rate": 1.048648821956637e-05, "loss": 0.2589, "num_input_tokens_seen": 1406560, "step": 3500 }, { "epoch": 3.643451143451143, "grad_norm": 0.42181286215782166, "learning_rate": 1.0412722082899644e-05, "loss": 0.2386, "num_input_tokens_seen": 1408544, "step": 3505 }, { "epoch": 3.6486486486486487, "grad_norm": 0.11339768022298813, "learning_rate": 1.033914800686912e-05, "loss": 0.2512, "num_input_tokens_seen": 1410464, "step": 3510 }, { "epoch": 3.6538461538461537, "grad_norm": 16.115554809570312, "learning_rate": 1.0265766960173965e-05, "loss": 0.2277, "num_input_tokens_seen": 1412448, "step": 3515 }, { "epoch": 3.6590436590436592, "grad_norm": 18.590940475463867, "learning_rate": 1.019257990897185e-05, "loss": 0.2869, "num_input_tokens_seen": 1414688, "step": 3520 }, { "epoch": 3.6642411642411643, "grad_norm": 5.8544158935546875, "learning_rate": 1.0119587816866258e-05, "loss": 0.2914, "num_input_tokens_seen": 1416672, "step": 3525 }, { "epoch": 3.6694386694386694, "grad_norm": 8.909049034118652, "learning_rate": 1.0046791644893758e-05, "loss": 0.2836, "num_input_tokens_seen": 1418592, "step": 3530 }, { "epoch": 3.6746361746361744, "grad_norm": 5.8211212158203125, "learning_rate": 9.974192351511368e-06, "loss": 0.2675, "num_input_tokens_seen": 1420576, "step": 3535 }, { "epoch": 3.67983367983368, "grad_norm": 7.190529823303223, "learning_rate": 9.901790892583974e-06, "loss": 0.2679, "num_input_tokens_seen": 1422560, "step": 3540 }, { "epoch": 3.685031185031185, "grad_norm": 6.408370018005371, "learning_rate": 9.829588221371694e-06, "loss": 0.2697, "num_input_tokens_seen": 1424608, "step": 3545 }, { "epoch": 3.6902286902286905, "grad_norm": 3.801816940307617, "learning_rate": 9.757585288517328e-06, "loss": 0.2612, "num_input_tokens_seen": 1426784, "step": 3550 }, { "epoch": 3.6954261954261955, "grad_norm": 2.5229806900024414, "learning_rate": 9.6857830420339e-06, "loss": 0.2501, "num_input_tokens_seen": 1428896, "step": 3555 }, { "epoch": 3.7006237006237006, "grad_norm": 7.06850004196167, "learning_rate": 9.614182427292077e-06, "loss": 0.2586, "num_input_tokens_seen": 1430880, "step": 3560 }, { "epoch": 3.7058212058212057, "grad_norm": 6.542532920837402, "learning_rate": 9.54278438700785e-06, "loss": 0.2484, "num_input_tokens_seen": 1432864, "step": 3565 }, { "epoch": 3.711018711018711, "grad_norm": 15.00790023803711, "learning_rate": 9.471589861229998e-06, "loss": 0.287, "num_input_tokens_seen": 1434912, "step": 3570 }, { "epoch": 3.7162162162162162, "grad_norm": 3.730722188949585, "learning_rate": 9.400599787327773e-06, "loss": 0.3025, "num_input_tokens_seen": 1436832, "step": 3575 }, { "epoch": 3.7214137214137213, "grad_norm": 0.24840876460075378, "learning_rate": 9.329815099978568e-06, "loss": 0.2433, "num_input_tokens_seen": 1438752, "step": 3580 }, { "epoch": 3.726611226611227, "grad_norm": 12.727954864501953, "learning_rate": 9.259236731155582e-06, "loss": 0.3809, "num_input_tokens_seen": 1440672, "step": 3585 }, { "epoch": 3.731808731808732, "grad_norm": 3.5419039726257324, "learning_rate": 9.18886561011557e-06, "loss": 0.2221, "num_input_tokens_seen": 1442784, "step": 3590 }, { "epoch": 3.737006237006237, "grad_norm": 4.471946716308594, "learning_rate": 9.118702663386584e-06, "loss": 0.2622, "num_input_tokens_seen": 1444960, "step": 3595 }, { "epoch": 3.742203742203742, "grad_norm": 7.390607833862305, "learning_rate": 9.048748814755784e-06, "loss": 0.3094, "num_input_tokens_seen": 1446880, "step": 3600 }, { "epoch": 3.7474012474012475, "grad_norm": 0.6072008013725281, "learning_rate": 8.979004985257294e-06, "loss": 0.2723, "num_input_tokens_seen": 1448992, "step": 3605 }, { "epoch": 3.7525987525987525, "grad_norm": 5.596619606018066, "learning_rate": 8.909472093160065e-06, "loss": 0.2755, "num_input_tokens_seen": 1450976, "step": 3610 }, { "epoch": 3.757796257796258, "grad_norm": 4.79258394241333, "learning_rate": 8.840151053955773e-06, "loss": 0.2751, "num_input_tokens_seen": 1453088, "step": 3615 }, { "epoch": 3.757796257796258, "eval_loss": 0.25871533155441284, "eval_runtime": 1.0494, "eval_samples_per_second": 815.737, "eval_steps_per_second": 101.967, "num_input_tokens_seen": 1453088, "step": 3615 }, { "epoch": 3.762993762993763, "grad_norm": 9.075246810913086, "learning_rate": 8.771042780346766e-06, "loss": 0.2834, "num_input_tokens_seen": 1455136, "step": 3620 }, { "epoch": 3.768191268191268, "grad_norm": 3.0494143962860107, "learning_rate": 8.702148182234043e-06, "loss": 0.2472, "num_input_tokens_seen": 1457120, "step": 3625 }, { "epoch": 3.773388773388773, "grad_norm": 3.5605342388153076, "learning_rate": 8.633468166705336e-06, "loss": 0.2772, "num_input_tokens_seen": 1459168, "step": 3630 }, { "epoch": 3.7785862785862787, "grad_norm": 3.971622943878174, "learning_rate": 8.565003638023065e-06, "loss": 0.2651, "num_input_tokens_seen": 1461152, "step": 3635 }, { "epoch": 3.7837837837837838, "grad_norm": 2.8847694396972656, "learning_rate": 8.496755497612492e-06, "loss": 0.2756, "num_input_tokens_seen": 1463136, "step": 3640 }, { "epoch": 3.788981288981289, "grad_norm": 9.094311714172363, "learning_rate": 8.42872464404986e-06, "loss": 0.2693, "num_input_tokens_seen": 1465120, "step": 3645 }, { "epoch": 3.7941787941787943, "grad_norm": 3.9456887245178223, "learning_rate": 8.360911973050537e-06, "loss": 0.2816, "num_input_tokens_seen": 1467104, "step": 3650 }, { "epoch": 3.7993762993762994, "grad_norm": 9.080158233642578, "learning_rate": 8.293318377457241e-06, "loss": 0.2571, "num_input_tokens_seen": 1469152, "step": 3655 }, { "epoch": 3.8045738045738045, "grad_norm": 3.958771228790283, "learning_rate": 8.225944747228257e-06, "loss": 0.268, "num_input_tokens_seen": 1471264, "step": 3660 }, { "epoch": 3.8097713097713095, "grad_norm": 8.621827125549316, "learning_rate": 8.158791969425738e-06, "loss": 0.2128, "num_input_tokens_seen": 1473248, "step": 3665 }, { "epoch": 3.814968814968815, "grad_norm": 14.013006210327148, "learning_rate": 8.091860928204049e-06, "loss": 0.3101, "num_input_tokens_seen": 1475360, "step": 3670 }, { "epoch": 3.82016632016632, "grad_norm": 11.601419448852539, "learning_rate": 8.025152504798078e-06, "loss": 0.3044, "num_input_tokens_seen": 1477472, "step": 3675 }, { "epoch": 3.8253638253638256, "grad_norm": 3.5920257568359375, "learning_rate": 7.958667577511683e-06, "loss": 0.2471, "num_input_tokens_seen": 1479328, "step": 3680 }, { "epoch": 3.8305613305613306, "grad_norm": 11.748164176940918, "learning_rate": 7.892407021706063e-06, "loss": 0.2552, "num_input_tokens_seen": 1481248, "step": 3685 }, { "epoch": 3.8357588357588357, "grad_norm": 11.761496543884277, "learning_rate": 7.826371709788313e-06, "loss": 0.3112, "num_input_tokens_seen": 1483168, "step": 3690 }, { "epoch": 3.8409563409563408, "grad_norm": 2.8758435249328613, "learning_rate": 7.760562511199882e-06, "loss": 0.2585, "num_input_tokens_seen": 1485152, "step": 3695 }, { "epoch": 3.8461538461538463, "grad_norm": 3.748758554458618, "learning_rate": 7.694980292405122e-06, "loss": 0.2673, "num_input_tokens_seen": 1487200, "step": 3700 }, { "epoch": 3.8513513513513513, "grad_norm": 5.409223556518555, "learning_rate": 7.629625916879932e-06, "loss": 0.2763, "num_input_tokens_seen": 1489184, "step": 3705 }, { "epoch": 3.856548856548857, "grad_norm": 13.337824821472168, "learning_rate": 7.564500245100325e-06, "loss": 0.278, "num_input_tokens_seen": 1491168, "step": 3710 }, { "epoch": 3.861746361746362, "grad_norm": 1.357743740081787, "learning_rate": 7.499604134531149e-06, "loss": 0.2727, "num_input_tokens_seen": 1493216, "step": 3715 }, { "epoch": 3.866943866943867, "grad_norm": 0.9974175095558167, "learning_rate": 7.434938439614781e-06, "loss": 0.2667, "num_input_tokens_seen": 1495200, "step": 3720 }, { "epoch": 3.872141372141372, "grad_norm": 1.3982412815093994, "learning_rate": 7.370504011759855e-06, "loss": 0.2707, "num_input_tokens_seen": 1497184, "step": 3725 }, { "epoch": 3.8773388773388775, "grad_norm": 2.715106248855591, "learning_rate": 7.306301699330065e-06, "loss": 0.2656, "num_input_tokens_seen": 1499040, "step": 3730 }, { "epoch": 3.8825363825363826, "grad_norm": 10.939849853515625, "learning_rate": 7.242332347633052e-06, "loss": 0.2423, "num_input_tokens_seen": 1501024, "step": 3735 }, { "epoch": 3.8877338877338876, "grad_norm": 8.844660758972168, "learning_rate": 7.178596798909159e-06, "loss": 0.2487, "num_input_tokens_seen": 1503072, "step": 3740 }, { "epoch": 3.892931392931393, "grad_norm": 0.9195927381515503, "learning_rate": 7.115095892320456e-06, "loss": 0.2847, "num_input_tokens_seen": 1505248, "step": 3745 }, { "epoch": 3.898128898128898, "grad_norm": 15.075143814086914, "learning_rate": 7.051830463939604e-06, "loss": 0.2334, "num_input_tokens_seen": 1507296, "step": 3750 }, { "epoch": 3.9033264033264032, "grad_norm": 5.580709934234619, "learning_rate": 6.98880134673891e-06, "loss": 0.2595, "num_input_tokens_seen": 1509344, "step": 3755 }, { "epoch": 3.9085239085239083, "grad_norm": 5.476256370544434, "learning_rate": 6.926009370579334e-06, "loss": 0.2098, "num_input_tokens_seen": 1511456, "step": 3760 }, { "epoch": 3.913721413721414, "grad_norm": 3.832021713256836, "learning_rate": 6.8634553621995416e-06, "loss": 0.2277, "num_input_tokens_seen": 1513440, "step": 3765 }, { "epoch": 3.918918918918919, "grad_norm": 15.132575035095215, "learning_rate": 6.80114014520507e-06, "loss": 0.3251, "num_input_tokens_seen": 1515488, "step": 3770 }, { "epoch": 3.9241164241164244, "grad_norm": 5.868591785430908, "learning_rate": 6.739064540057424e-06, "loss": 0.2604, "num_input_tokens_seen": 1517408, "step": 3775 }, { "epoch": 3.9293139293139294, "grad_norm": 11.784462928771973, "learning_rate": 6.677229364063328e-06, "loss": 0.2458, "num_input_tokens_seen": 1519392, "step": 3780 }, { "epoch": 3.9345114345114345, "grad_norm": 3.2488772869110107, "learning_rate": 6.615635431363942e-06, "loss": 0.2596, "num_input_tokens_seen": 1521440, "step": 3785 }, { "epoch": 3.9397089397089395, "grad_norm": 3.991219997406006, "learning_rate": 6.554283552924118e-06, "loss": 0.2766, "num_input_tokens_seen": 1523488, "step": 3790 }, { "epoch": 3.944906444906445, "grad_norm": 2.877652168273926, "learning_rate": 6.493174536521768e-06, "loss": 0.2551, "num_input_tokens_seen": 1525600, "step": 3795 }, { "epoch": 3.95010395010395, "grad_norm": 10.219852447509766, "learning_rate": 6.4323091867372095e-06, "loss": 0.2612, "num_input_tokens_seen": 1527584, "step": 3800 }, { "epoch": 3.955301455301455, "grad_norm": 9.425738334655762, "learning_rate": 6.371688304942544e-06, "loss": 0.2575, "num_input_tokens_seen": 1529504, "step": 3805 }, { "epoch": 3.9604989604989607, "grad_norm": 13.004129409790039, "learning_rate": 6.311312689291166e-06, "loss": 0.2897, "num_input_tokens_seen": 1531424, "step": 3810 }, { "epoch": 3.9656964656964657, "grad_norm": 7.159772872924805, "learning_rate": 6.251183134707184e-06, "loss": 0.2366, "num_input_tokens_seen": 1533408, "step": 3815 }, { "epoch": 3.970893970893971, "grad_norm": 6.52066707611084, "learning_rate": 6.191300432875017e-06, "loss": 0.2668, "num_input_tokens_seen": 1535392, "step": 3820 }, { "epoch": 3.976091476091476, "grad_norm": 13.160318374633789, "learning_rate": 6.13166537222894e-06, "loss": 0.1661, "num_input_tokens_seen": 1537312, "step": 3825 }, { "epoch": 3.9812889812889813, "grad_norm": 16.36733055114746, "learning_rate": 6.072278737942691e-06, "loss": 0.28, "num_input_tokens_seen": 1539360, "step": 3830 }, { "epoch": 3.9864864864864864, "grad_norm": 4.4265875816345215, "learning_rate": 6.0131413119191685e-06, "loss": 0.3038, "num_input_tokens_seen": 1541280, "step": 3835 }, { "epoch": 3.991683991683992, "grad_norm": 4.7540717124938965, "learning_rate": 5.954253872780102e-06, "loss": 0.2598, "num_input_tokens_seen": 1543136, "step": 3840 }, { "epoch": 3.996881496881497, "grad_norm": 1.5352482795715332, "learning_rate": 5.8956171958558266e-06, "loss": 0.2387, "num_input_tokens_seen": 1545120, "step": 3845 }, { "epoch": 4.002079002079002, "grad_norm": 0.05950174108147621, "learning_rate": 5.8372320531750655e-06, "loss": 0.2258, "num_input_tokens_seen": 1547056, "step": 3850 }, { "epoch": 4.007276507276507, "grad_norm": 9.390141487121582, "learning_rate": 5.77909921345475e-06, "loss": 0.279, "num_input_tokens_seen": 1548976, "step": 3855 }, { "epoch": 4.008316008316008, "eval_loss": 0.25588732957839966, "eval_runtime": 1.0892, "eval_samples_per_second": 785.884, "eval_steps_per_second": 98.235, "num_input_tokens_seen": 1549360, "step": 3856 }, { "epoch": 4.012474012474012, "grad_norm": 4.292252063751221, "learning_rate": 5.721219442089926e-06, "loss": 0.2636, "num_input_tokens_seen": 1550960, "step": 3860 }, { "epoch": 4.017671517671518, "grad_norm": 3.9697649478912354, "learning_rate": 5.663593501143663e-06, "loss": 0.2772, "num_input_tokens_seen": 1552944, "step": 3865 }, { "epoch": 4.022869022869023, "grad_norm": 1.579316258430481, "learning_rate": 5.6062221493370035e-06, "loss": 0.2654, "num_input_tokens_seen": 1554992, "step": 3870 }, { "epoch": 4.028066528066528, "grad_norm": 14.188766479492188, "learning_rate": 5.549106142039018e-06, "loss": 0.2682, "num_input_tokens_seen": 1557104, "step": 3875 }, { "epoch": 4.033264033264033, "grad_norm": 7.652184963226318, "learning_rate": 5.492246231256798e-06, "loss": 0.2818, "num_input_tokens_seen": 1559088, "step": 3880 }, { "epoch": 4.038461538461538, "grad_norm": 7.617486953735352, "learning_rate": 5.435643165625614e-06, "loss": 0.2739, "num_input_tokens_seen": 1561008, "step": 3885 }, { "epoch": 4.043659043659043, "grad_norm": 14.909598350524902, "learning_rate": 5.379297690399035e-06, "loss": 0.273, "num_input_tokens_seen": 1563056, "step": 3890 }, { "epoch": 4.048856548856548, "grad_norm": 13.364591598510742, "learning_rate": 5.3232105474390895e-06, "loss": 0.2571, "num_input_tokens_seen": 1565040, "step": 3895 }, { "epoch": 4.054054054054054, "grad_norm": 4.7986159324646, "learning_rate": 5.267382475206548e-06, "loss": 0.2529, "num_input_tokens_seen": 1567024, "step": 3900 }, { "epoch": 4.0592515592515594, "grad_norm": 4.830848693847656, "learning_rate": 5.2118142087511705e-06, "loss": 0.264, "num_input_tokens_seen": 1569136, "step": 3905 }, { "epoch": 4.0644490644490645, "grad_norm": 11.776159286499023, "learning_rate": 5.156506479702019e-06, "loss": 0.2638, "num_input_tokens_seen": 1571120, "step": 3910 }, { "epoch": 4.06964656964657, "grad_norm": 0.2953391969203949, "learning_rate": 5.101460016257859e-06, "loss": 0.2652, "num_input_tokens_seen": 1573040, "step": 3915 }, { "epoch": 4.074844074844075, "grad_norm": 0.2401997148990631, "learning_rate": 5.0466755431775316e-06, "loss": 0.2582, "num_input_tokens_seen": 1574896, "step": 3920 }, { "epoch": 4.08004158004158, "grad_norm": 0.6011865139007568, "learning_rate": 4.992153781770448e-06, "loss": 0.2618, "num_input_tokens_seen": 1576880, "step": 3925 }, { "epoch": 4.085239085239086, "grad_norm": 12.98713493347168, "learning_rate": 4.937895449887075e-06, "loss": 0.2455, "num_input_tokens_seen": 1578864, "step": 3930 }, { "epoch": 4.090436590436591, "grad_norm": 13.129234313964844, "learning_rate": 4.883901261909465e-06, "loss": 0.2813, "num_input_tokens_seen": 1580848, "step": 3935 }, { "epoch": 4.095634095634096, "grad_norm": 11.796737670898438, "learning_rate": 4.8301719287419e-06, "loss": 0.2815, "num_input_tokens_seen": 1582704, "step": 3940 }, { "epoch": 4.100831600831601, "grad_norm": 9.959646224975586, "learning_rate": 4.776708157801463e-06, "loss": 0.2796, "num_input_tokens_seen": 1584816, "step": 3945 }, { "epoch": 4.106029106029106, "grad_norm": 11.752809524536133, "learning_rate": 4.7235106530088085e-06, "loss": 0.2491, "num_input_tokens_seen": 1586800, "step": 3950 }, { "epoch": 4.111226611226611, "grad_norm": 5.360927104949951, "learning_rate": 4.670580114778813e-06, "loss": 0.2528, "num_input_tokens_seen": 1588720, "step": 3955 }, { "epoch": 4.116424116424117, "grad_norm": 5.705671310424805, "learning_rate": 4.617917240011394e-06, "loss": 0.2732, "num_input_tokens_seen": 1590576, "step": 3960 }, { "epoch": 4.121621621621622, "grad_norm": 1.100892186164856, "learning_rate": 4.565522722082336e-06, "loss": 0.2573, "num_input_tokens_seen": 1592496, "step": 3965 }, { "epoch": 4.126819126819127, "grad_norm": 1.0973014831542969, "learning_rate": 4.513397250834159e-06, "loss": 0.2638, "num_input_tokens_seen": 1594544, "step": 3970 }, { "epoch": 4.132016632016632, "grad_norm": 8.093790054321289, "learning_rate": 4.461541512567011e-06, "loss": 0.2925, "num_input_tokens_seen": 1596400, "step": 3975 }, { "epoch": 4.137214137214137, "grad_norm": 0.3267320990562439, "learning_rate": 4.409956190029674e-06, "loss": 0.2786, "num_input_tokens_seen": 1598320, "step": 3980 }, { "epoch": 4.142411642411642, "grad_norm": 0.30618995428085327, "learning_rate": 4.358641962410537e-06, "loss": 0.2286, "num_input_tokens_seen": 1600368, "step": 3985 }, { "epoch": 4.147609147609147, "grad_norm": 12.98468017578125, "learning_rate": 4.307599505328672e-06, "loss": 0.2986, "num_input_tokens_seen": 1602352, "step": 3990 }, { "epoch": 4.152806652806653, "grad_norm": 0.18394550681114197, "learning_rate": 4.256829490824949e-06, "loss": 0.2363, "num_input_tokens_seen": 1604336, "step": 3995 }, { "epoch": 4.158004158004158, "grad_norm": 6.980307102203369, "learning_rate": 4.206332587353149e-06, "loss": 0.2652, "num_input_tokens_seen": 1606256, "step": 4000 }, { "epoch": 4.163201663201663, "grad_norm": 0.6148359775543213, "learning_rate": 4.1561094597712155e-06, "loss": 0.2641, "num_input_tokens_seen": 1608304, "step": 4005 }, { "epoch": 4.168399168399168, "grad_norm": 9.185464859008789, "learning_rate": 4.106160769332443e-06, "loss": 0.2415, "num_input_tokens_seen": 1610480, "step": 4010 }, { "epoch": 4.173596673596673, "grad_norm": 0.5142713189125061, "learning_rate": 4.056487173676843e-06, "loss": 0.2449, "num_input_tokens_seen": 1612528, "step": 4015 }, { "epoch": 4.1787941787941785, "grad_norm": 15.734733581542969, "learning_rate": 4.007089326822405e-06, "loss": 0.2742, "num_input_tokens_seen": 1614576, "step": 4020 }, { "epoch": 4.183991683991684, "grad_norm": 1.1118292808532715, "learning_rate": 3.957967879156533e-06, "loss": 0.264, "num_input_tokens_seen": 1616624, "step": 4025 }, { "epoch": 4.1891891891891895, "grad_norm": 12.48095989227295, "learning_rate": 3.909123477427487e-06, "loss": 0.2548, "num_input_tokens_seen": 1618672, "step": 4030 }, { "epoch": 4.1943866943866945, "grad_norm": 5.244612693786621, "learning_rate": 3.860556764735842e-06, "loss": 0.2113, "num_input_tokens_seen": 1620784, "step": 4035 }, { "epoch": 4.1995841995842, "grad_norm": 21.30678367614746, "learning_rate": 3.812268380526046e-06, "loss": 0.3261, "num_input_tokens_seen": 1622768, "step": 4040 }, { "epoch": 4.204781704781705, "grad_norm": 8.002152442932129, "learning_rate": 3.764258960577971e-06, "loss": 0.2481, "num_input_tokens_seen": 1624688, "step": 4045 }, { "epoch": 4.20997920997921, "grad_norm": 2.3525662422180176, "learning_rate": 3.7165291369985618e-06, "loss": 0.2599, "num_input_tokens_seen": 1626672, "step": 4050 }, { "epoch": 4.215176715176715, "grad_norm": 4.8225202560424805, "learning_rate": 3.6690795382135186e-06, "loss": 0.258, "num_input_tokens_seen": 1628848, "step": 4055 }, { "epoch": 4.220374220374221, "grad_norm": 1.9619829654693604, "learning_rate": 3.6219107889590155e-06, "loss": 0.2809, "num_input_tokens_seen": 1630832, "step": 4060 }, { "epoch": 4.225571725571726, "grad_norm": 20.82301139831543, "learning_rate": 3.575023510273462e-06, "loss": 0.2292, "num_input_tokens_seen": 1632880, "step": 4065 }, { "epoch": 4.230769230769231, "grad_norm": 1.0340510606765747, "learning_rate": 3.5284183194893488e-06, "loss": 0.2871, "num_input_tokens_seen": 1634992, "step": 4070 }, { "epoch": 4.235966735966736, "grad_norm": 12.200738906860352, "learning_rate": 3.48209583022511e-06, "loss": 0.2634, "num_input_tokens_seen": 1636912, "step": 4075 }, { "epoch": 4.241164241164241, "grad_norm": 14.348763465881348, "learning_rate": 3.4360566523770426e-06, "loss": 0.2477, "num_input_tokens_seen": 1638832, "step": 4080 }, { "epoch": 4.246361746361746, "grad_norm": 3.283851385116577, "learning_rate": 3.3903013921112755e-06, "loss": 0.2351, "num_input_tokens_seen": 1641072, "step": 4085 }, { "epoch": 4.251559251559252, "grad_norm": 1.4203660488128662, "learning_rate": 3.3448306518557795e-06, "loss": 0.2808, "num_input_tokens_seen": 1642992, "step": 4090 }, { "epoch": 4.256756756756757, "grad_norm": 5.4867753982543945, "learning_rate": 3.299645030292467e-06, "loss": 0.2511, "num_input_tokens_seen": 1645040, "step": 4095 }, { "epoch": 4.258835758835759, "eval_loss": 0.2517484128475189, "eval_runtime": 1.069, "eval_samples_per_second": 800.752, "eval_steps_per_second": 100.094, "num_input_tokens_seen": 1645808, "step": 4097 }, { "epoch": 4.261954261954262, "grad_norm": 15.803162574768066, "learning_rate": 3.2547451223492786e-06, "loss": 0.2495, "num_input_tokens_seen": 1647024, "step": 4100 }, { "epoch": 4.267151767151767, "grad_norm": 6.709695816040039, "learning_rate": 3.2101315191923663e-06, "loss": 0.2774, "num_input_tokens_seen": 1649008, "step": 4105 }, { "epoch": 4.272349272349272, "grad_norm": 12.675971984863281, "learning_rate": 3.165804808218292e-06, "loss": 0.2208, "num_input_tokens_seen": 1651056, "step": 4110 }, { "epoch": 4.277546777546777, "grad_norm": 20.12381935119629, "learning_rate": 3.1217655730463093e-06, "loss": 0.2601, "num_input_tokens_seen": 1653104, "step": 4115 }, { "epoch": 4.282744282744282, "grad_norm": 5.935137748718262, "learning_rate": 3.078014393510695e-06, "loss": 0.2427, "num_input_tokens_seen": 1655344, "step": 4120 }, { "epoch": 4.287941787941788, "grad_norm": 3.3360893726348877, "learning_rate": 3.0345518456530665e-06, "loss": 0.2959, "num_input_tokens_seen": 1657392, "step": 4125 }, { "epoch": 4.293139293139293, "grad_norm": 8.30186653137207, "learning_rate": 2.991378501714856e-06, "loss": 0.2807, "num_input_tokens_seen": 1659312, "step": 4130 }, { "epoch": 4.298336798336798, "grad_norm": 7.54670524597168, "learning_rate": 2.9484949301297166e-06, "loss": 0.2579, "num_input_tokens_seen": 1661424, "step": 4135 }, { "epoch": 4.303534303534303, "grad_norm": 14.763995170593262, "learning_rate": 2.9059016955160916e-06, "loss": 0.2498, "num_input_tokens_seen": 1663408, "step": 4140 }, { "epoch": 4.3087318087318085, "grad_norm": 1.220461368560791, "learning_rate": 2.8635993586697553e-06, "loss": 0.226, "num_input_tokens_seen": 1665328, "step": 4145 }, { "epoch": 4.313929313929314, "grad_norm": 23.395973205566406, "learning_rate": 2.8215884765564193e-06, "loss": 0.2687, "num_input_tokens_seen": 1667312, "step": 4150 }, { "epoch": 4.3191268191268195, "grad_norm": 4.078779697418213, "learning_rate": 2.7798696023044163e-06, "loss": 0.1693, "num_input_tokens_seen": 1669296, "step": 4155 }, { "epoch": 4.324324324324325, "grad_norm": 5.839596748352051, "learning_rate": 2.73844328519742e-06, "loss": 0.2217, "num_input_tokens_seen": 1671280, "step": 4160 }, { "epoch": 4.32952182952183, "grad_norm": 21.18253517150879, "learning_rate": 2.6973100706672e-06, "loss": 0.2814, "num_input_tokens_seen": 1673456, "step": 4165 }, { "epoch": 4.334719334719335, "grad_norm": 6.5861101150512695, "learning_rate": 2.656470500286451e-06, "loss": 0.2523, "num_input_tokens_seen": 1675504, "step": 4170 }, { "epoch": 4.33991683991684, "grad_norm": 6.993749618530273, "learning_rate": 2.615925111761647e-06, "loss": 0.228, "num_input_tokens_seen": 1677488, "step": 4175 }, { "epoch": 4.345114345114345, "grad_norm": 11.91810417175293, "learning_rate": 2.5756744389259734e-06, "loss": 0.2399, "num_input_tokens_seen": 1679536, "step": 4180 }, { "epoch": 4.350311850311851, "grad_norm": 4.944723606109619, "learning_rate": 2.535719011732321e-06, "loss": 0.2345, "num_input_tokens_seen": 1681520, "step": 4185 }, { "epoch": 4.355509355509356, "grad_norm": 18.20621681213379, "learning_rate": 2.49605935624625e-06, "loss": 0.2726, "num_input_tokens_seen": 1683568, "step": 4190 }, { "epoch": 4.360706860706861, "grad_norm": 8.823187828063965, "learning_rate": 2.4566959946391243e-06, "loss": 0.236, "num_input_tokens_seen": 1685488, "step": 4195 }, { "epoch": 4.365904365904366, "grad_norm": 8.236385345458984, "learning_rate": 2.417629445181194e-06, "loss": 0.2476, "num_input_tokens_seen": 1687408, "step": 4200 }, { "epoch": 4.371101871101871, "grad_norm": 3.388185739517212, "learning_rate": 2.378860222234794e-06, "loss": 0.2325, "num_input_tokens_seen": 1689520, "step": 4205 }, { "epoch": 4.376299376299376, "grad_norm": 2.7209372520446777, "learning_rate": 2.3403888362475782e-06, "loss": 0.2749, "num_input_tokens_seen": 1691568, "step": 4210 }, { "epoch": 4.381496881496881, "grad_norm": 7.729001045227051, "learning_rate": 2.3022157937457627e-06, "loss": 0.2235, "num_input_tokens_seen": 1693616, "step": 4215 }, { "epoch": 4.386694386694387, "grad_norm": 9.053695678710938, "learning_rate": 2.2643415973275016e-06, "loss": 0.2369, "num_input_tokens_seen": 1695600, "step": 4220 }, { "epoch": 4.391891891891892, "grad_norm": 4.420502662658691, "learning_rate": 2.2267667456562307e-06, "loss": 0.285, "num_input_tokens_seen": 1697584, "step": 4225 }, { "epoch": 4.397089397089397, "grad_norm": 3.5097815990448, "learning_rate": 2.1894917334541354e-06, "loss": 0.2273, "num_input_tokens_seen": 1699568, "step": 4230 }, { "epoch": 4.402286902286902, "grad_norm": 23.638469696044922, "learning_rate": 2.15251705149562e-06, "loss": 0.2953, "num_input_tokens_seen": 1701744, "step": 4235 }, { "epoch": 4.407484407484407, "grad_norm": 4.72571325302124, "learning_rate": 2.11584318660083e-06, "loss": 0.2404, "num_input_tokens_seen": 1703600, "step": 4240 }, { "epoch": 4.412681912681912, "grad_norm": 6.068181991577148, "learning_rate": 2.0794706216292813e-06, "loss": 0.3067, "num_input_tokens_seen": 1705712, "step": 4245 }, { "epoch": 4.417879417879418, "grad_norm": 10.372715950012207, "learning_rate": 2.043399835473475e-06, "loss": 0.2088, "num_input_tokens_seen": 1707696, "step": 4250 }, { "epoch": 4.423076923076923, "grad_norm": 4.278080463409424, "learning_rate": 2.0076313030525844e-06, "loss": 0.2195, "num_input_tokens_seen": 1709744, "step": 4255 }, { "epoch": 4.428274428274428, "grad_norm": 9.075408935546875, "learning_rate": 1.972165495306241e-06, "loss": 0.2399, "num_input_tokens_seen": 1711792, "step": 4260 }, { "epoch": 4.4334719334719335, "grad_norm": 4.037149429321289, "learning_rate": 1.937002879188285e-06, "loss": 0.2491, "num_input_tokens_seen": 1713904, "step": 4265 }, { "epoch": 4.4386694386694385, "grad_norm": 7.768908977508545, "learning_rate": 1.9021439176606564e-06, "loss": 0.2257, "num_input_tokens_seen": 1715824, "step": 4270 }, { "epoch": 4.443866943866944, "grad_norm": 4.410028457641602, "learning_rate": 1.8675890696872838e-06, "loss": 0.2438, "num_input_tokens_seen": 1717808, "step": 4275 }, { "epoch": 4.4490644490644495, "grad_norm": 4.74402379989624, "learning_rate": 1.8333387902280314e-06, "loss": 0.2773, "num_input_tokens_seen": 1719856, "step": 4280 }, { "epoch": 4.454261954261955, "grad_norm": 6.120370864868164, "learning_rate": 1.7993935302327292e-06, "loss": 0.2193, "num_input_tokens_seen": 1721776, "step": 4285 }, { "epoch": 4.45945945945946, "grad_norm": 10.760017395019531, "learning_rate": 1.7657537366352338e-06, "loss": 0.238, "num_input_tokens_seen": 1723632, "step": 4290 }, { "epoch": 4.464656964656965, "grad_norm": 6.990230083465576, "learning_rate": 1.732419852347511e-06, "loss": 0.1772, "num_input_tokens_seen": 1725488, "step": 4295 }, { "epoch": 4.46985446985447, "grad_norm": 15.208341598510742, "learning_rate": 1.699392316253856e-06, "loss": 0.2837, "num_input_tokens_seen": 1727600, "step": 4300 }, { "epoch": 4.475051975051975, "grad_norm": 16.286996841430664, "learning_rate": 1.666671563205069e-06, "loss": 0.2494, "num_input_tokens_seen": 1729712, "step": 4305 }, { "epoch": 4.48024948024948, "grad_norm": 2.84039306640625, "learning_rate": 1.6342580240127582e-06, "loss": 0.269, "num_input_tokens_seen": 1731696, "step": 4310 }, { "epoch": 4.485446985446986, "grad_norm": 15.725839614868164, "learning_rate": 1.6021521254436678e-06, "loss": 0.2551, "num_input_tokens_seen": 1733744, "step": 4315 }, { "epoch": 4.490644490644491, "grad_norm": 4.358521461486816, "learning_rate": 1.5703542902140294e-06, "loss": 0.2408, "num_input_tokens_seen": 1735728, "step": 4320 }, { "epoch": 4.495841995841996, "grad_norm": 14.235426902770996, "learning_rate": 1.5388649369840357e-06, "loss": 0.1891, "num_input_tokens_seen": 1737776, "step": 4325 }, { "epoch": 4.501039501039501, "grad_norm": 7.304314136505127, "learning_rate": 1.5076844803522922e-06, "loss": 0.2684, "num_input_tokens_seen": 1739824, "step": 4330 }, { "epoch": 4.506237006237006, "grad_norm": 18.58903694152832, "learning_rate": 1.476813330850388e-06, "loss": 0.2709, "num_input_tokens_seen": 1741744, "step": 4335 }, { "epoch": 4.509355509355509, "eval_loss": 0.2577267587184906, "eval_runtime": 1.0636, "eval_samples_per_second": 804.851, "eval_steps_per_second": 100.606, "num_input_tokens_seen": 1742960, "step": 4338 }, { "epoch": 4.511434511434511, "grad_norm": 18.499958038330078, "learning_rate": 1.4462518949374838e-06, "loss": 0.2731, "num_input_tokens_seen": 1743728, "step": 4340 }, { "epoch": 4.516632016632016, "grad_norm": 4.451554298400879, "learning_rate": 1.4160005749949328e-06, "loss": 0.2431, "num_input_tokens_seen": 1745904, "step": 4345 }, { "epoch": 4.521829521829522, "grad_norm": 5.922986030578613, "learning_rate": 1.386059769321027e-06, "loss": 0.2649, "num_input_tokens_seen": 1747824, "step": 4350 }, { "epoch": 4.527027027027027, "grad_norm": 11.740480422973633, "learning_rate": 1.3564298721257223e-06, "loss": 0.2569, "num_input_tokens_seen": 1749872, "step": 4355 }, { "epoch": 4.532224532224532, "grad_norm": 10.447465896606445, "learning_rate": 1.3271112735254498e-06, "loss": 0.1998, "num_input_tokens_seen": 1751792, "step": 4360 }, { "epoch": 4.537422037422037, "grad_norm": 5.754942893981934, "learning_rate": 1.298104359538005e-06, "loss": 0.251, "num_input_tokens_seen": 1753776, "step": 4365 }, { "epoch": 4.542619542619542, "grad_norm": 8.049335479736328, "learning_rate": 1.269409512077427e-06, "loss": 0.2705, "num_input_tokens_seen": 1755824, "step": 4370 }, { "epoch": 4.547817047817047, "grad_norm": 4.818149089813232, "learning_rate": 1.241027108949e-06, "loss": 0.202, "num_input_tokens_seen": 1758000, "step": 4375 }, { "epoch": 4.553014553014553, "grad_norm": 10.757091522216797, "learning_rate": 1.2129575238442715e-06, "loss": 0.2565, "num_input_tokens_seen": 1759984, "step": 4380 }, { "epoch": 4.558212058212058, "grad_norm": 23.19622230529785, "learning_rate": 1.185201126336122e-06, "loss": 0.2607, "num_input_tokens_seen": 1761968, "step": 4385 }, { "epoch": 4.5634095634095635, "grad_norm": 6.780999660491943, "learning_rate": 1.1577582818739135e-06, "loss": 0.2392, "num_input_tokens_seen": 1764016, "step": 4390 }, { "epoch": 4.5686070686070686, "grad_norm": 6.052558422088623, "learning_rate": 1.1306293517786614e-06, "loss": 0.2808, "num_input_tokens_seen": 1765936, "step": 4395 }, { "epoch": 4.573804573804574, "grad_norm": 11.568082809448242, "learning_rate": 1.1038146932383004e-06, "loss": 0.1891, "num_input_tokens_seen": 1767984, "step": 4400 }, { "epoch": 4.579002079002079, "grad_norm": 17.461746215820312, "learning_rate": 1.0773146593029637e-06, "loss": 0.2029, "num_input_tokens_seen": 1769904, "step": 4405 }, { "epoch": 4.584199584199585, "grad_norm": 9.160344123840332, "learning_rate": 1.0511295988803294e-06, "loss": 0.2743, "num_input_tokens_seen": 1771888, "step": 4410 }, { "epoch": 4.58939708939709, "grad_norm": 8.415828704833984, "learning_rate": 1.0252598567310451e-06, "loss": 0.2115, "num_input_tokens_seen": 1773936, "step": 4415 }, { "epoch": 4.594594594594595, "grad_norm": 17.489177703857422, "learning_rate": 9.99705773464185e-07, "loss": 0.2778, "num_input_tokens_seen": 1775984, "step": 4420 }, { "epoch": 4.5997920997921, "grad_norm": 11.403511047363281, "learning_rate": 9.744676855327483e-07, "loss": 0.2428, "num_input_tokens_seen": 1777840, "step": 4425 }, { "epoch": 4.604989604989605, "grad_norm": 23.2257022857666, "learning_rate": 9.495459252292504e-07, "loss": 0.2124, "num_input_tokens_seen": 1779824, "step": 4430 }, { "epoch": 4.61018711018711, "grad_norm": 4.646495342254639, "learning_rate": 9.249408206813332e-07, "loss": 0.1939, "num_input_tokens_seen": 1781872, "step": 4435 }, { "epoch": 4.615384615384615, "grad_norm": 3.444089412689209, "learning_rate": 9.006526958474509e-07, "loss": 0.2364, "num_input_tokens_seen": 1783984, "step": 4440 }, { "epoch": 4.620582120582121, "grad_norm": 18.236093521118164, "learning_rate": 8.766818705126134e-07, "loss": 0.2023, "num_input_tokens_seen": 1786032, "step": 4445 }, { "epoch": 4.625779625779626, "grad_norm": 7.719527244567871, "learning_rate": 8.530286602841525e-07, "loss": 0.2455, "num_input_tokens_seen": 1788016, "step": 4450 }, { "epoch": 4.630977130977131, "grad_norm": 6.857132911682129, "learning_rate": 8.296933765875897e-07, "loss": 0.2154, "num_input_tokens_seen": 1790064, "step": 4455 }, { "epoch": 4.636174636174636, "grad_norm": 10.924470901489258, "learning_rate": 8.066763266625282e-07, "loss": 0.2046, "num_input_tokens_seen": 1791984, "step": 4460 }, { "epoch": 4.641372141372141, "grad_norm": 3.512413740158081, "learning_rate": 7.839778135586007e-07, "loss": 0.1884, "num_input_tokens_seen": 1793904, "step": 4465 }, { "epoch": 4.646569646569646, "grad_norm": 20.026166915893555, "learning_rate": 7.615981361314889e-07, "loss": 0.223, "num_input_tokens_seen": 1795888, "step": 4470 }, { "epoch": 4.651767151767151, "grad_norm": 13.404367446899414, "learning_rate": 7.3953758903898e-07, "loss": 0.3424, "num_input_tokens_seen": 1797872, "step": 4475 }, { "epoch": 4.656964656964657, "grad_norm": 8.044845581054688, "learning_rate": 7.177964627370997e-07, "loss": 0.2152, "num_input_tokens_seen": 1799920, "step": 4480 }, { "epoch": 4.662162162162162, "grad_norm": 6.080995082855225, "learning_rate": 6.963750434762745e-07, "loss": 0.2628, "num_input_tokens_seen": 1801776, "step": 4485 }, { "epoch": 4.667359667359667, "grad_norm": 7.769125461578369, "learning_rate": 6.752736132975696e-07, "loss": 0.2974, "num_input_tokens_seen": 1803824, "step": 4490 }, { "epoch": 4.672557172557172, "grad_norm": 9.061914443969727, "learning_rate": 6.54492450028979e-07, "loss": 0.1697, "num_input_tokens_seen": 1805744, "step": 4495 }, { "epoch": 4.6777546777546775, "grad_norm": 8.759089469909668, "learning_rate": 6.340318272817474e-07, "loss": 0.2489, "num_input_tokens_seen": 1807728, "step": 4500 }, { "epoch": 4.682952182952183, "grad_norm": 14.477424621582031, "learning_rate": 6.138920144468124e-07, "loss": 0.255, "num_input_tokens_seen": 1809712, "step": 4505 }, { "epoch": 4.6881496881496885, "grad_norm": 16.341243743896484, "learning_rate": 5.94073276691201e-07, "loss": 0.2946, "num_input_tokens_seen": 1811632, "step": 4510 }, { "epoch": 4.6933471933471935, "grad_norm": 4.825214385986328, "learning_rate": 5.745758749545749e-07, "loss": 0.2011, "num_input_tokens_seen": 1813552, "step": 4515 }, { "epoch": 4.698544698544699, "grad_norm": 6.129444122314453, "learning_rate": 5.554000659457881e-07, "loss": 0.2354, "num_input_tokens_seen": 1815664, "step": 4520 }, { "epoch": 4.703742203742204, "grad_norm": 4.319976806640625, "learning_rate": 5.365461021395096e-07, "loss": 0.2284, "num_input_tokens_seen": 1817648, "step": 4525 }, { "epoch": 4.708939708939709, "grad_norm": 4.276791572570801, "learning_rate": 5.180142317728815e-07, "loss": 0.2259, "num_input_tokens_seen": 1819696, "step": 4530 }, { "epoch": 4.714137214137214, "grad_norm": 10.404797554016113, "learning_rate": 4.998046988422766e-07, "loss": 0.269, "num_input_tokens_seen": 1821680, "step": 4535 }, { "epoch": 4.71933471933472, "grad_norm": 11.949972152709961, "learning_rate": 4.819177431000604e-07, "loss": 0.2786, "num_input_tokens_seen": 1823728, "step": 4540 }, { "epoch": 4.724532224532225, "grad_norm": 9.46826171875, "learning_rate": 4.6435360005145644e-07, "loss": 0.3228, "num_input_tokens_seen": 1825712, "step": 4545 }, { "epoch": 4.72972972972973, "grad_norm": 13.263773918151855, "learning_rate": 4.4711250095143267e-07, "loss": 0.2664, "num_input_tokens_seen": 1827760, "step": 4550 }, { "epoch": 4.734927234927235, "grad_norm": 5.771205902099609, "learning_rate": 4.30194672801662e-07, "loss": 0.2014, "num_input_tokens_seen": 1829680, "step": 4555 }, { "epoch": 4.74012474012474, "grad_norm": 23.836517333984375, "learning_rate": 4.136003383475251e-07, "loss": 0.2992, "num_input_tokens_seen": 1831728, "step": 4560 }, { "epoch": 4.745322245322245, "grad_norm": 12.346782684326172, "learning_rate": 3.9732971607519265e-07, "loss": 0.2033, "num_input_tokens_seen": 1833648, "step": 4565 }, { "epoch": 4.75051975051975, "grad_norm": 5.421136856079102, "learning_rate": 3.8138302020873373e-07, "loss": 0.2388, "num_input_tokens_seen": 1835696, "step": 4570 }, { "epoch": 4.755717255717256, "grad_norm": 12.613269805908203, "learning_rate": 3.6576046070730675e-07, "loss": 0.2582, "num_input_tokens_seen": 1837808, "step": 4575 }, { "epoch": 4.75987525987526, "eval_loss": 0.2604904770851135, "eval_runtime": 1.0828, "eval_samples_per_second": 790.52, "eval_steps_per_second": 98.815, "num_input_tokens_seen": 1839344, "step": 4579 }, { "epoch": 4.760914760914761, "grad_norm": 10.182611465454102, "learning_rate": 3.5046224326238107e-07, "loss": 0.2365, "num_input_tokens_seen": 1839728, "step": 4580 }, { "epoch": 4.766112266112266, "grad_norm": 5.537731647491455, "learning_rate": 3.3548856929505047e-07, "loss": 0.2569, "num_input_tokens_seen": 1841776, "step": 4585 }, { "epoch": 4.771309771309771, "grad_norm": 4.337884902954102, "learning_rate": 3.208396359533572e-07, "loss": 0.2548, "num_input_tokens_seen": 1843696, "step": 4590 }, { "epoch": 4.776507276507276, "grad_norm": 15.579423904418945, "learning_rate": 3.065156361097138e-07, "loss": 0.2391, "num_input_tokens_seen": 1845744, "step": 4595 }, { "epoch": 4.781704781704782, "grad_norm": 3.1390204429626465, "learning_rate": 2.925167583583577e-07, "loss": 0.2364, "num_input_tokens_seen": 1847792, "step": 4600 }, { "epoch": 4.786902286902287, "grad_norm": 12.236397743225098, "learning_rate": 2.7884318701285885e-07, "loss": 0.2843, "num_input_tokens_seen": 1849776, "step": 4605 }, { "epoch": 4.792099792099792, "grad_norm": 6.024417877197266, "learning_rate": 2.6549510210371607e-07, "loss": 0.2187, "num_input_tokens_seen": 1852016, "step": 4610 }, { "epoch": 4.797297297297297, "grad_norm": 13.414298057556152, "learning_rate": 2.524726793759591e-07, "loss": 0.246, "num_input_tokens_seen": 1854064, "step": 4615 }, { "epoch": 4.802494802494802, "grad_norm": 2.9425196647644043, "learning_rate": 2.397760902868612e-07, "loss": 0.2382, "num_input_tokens_seen": 1856112, "step": 4620 }, { "epoch": 4.8076923076923075, "grad_norm": 18.276071548461914, "learning_rate": 2.274055020036553e-07, "loss": 0.2361, "num_input_tokens_seen": 1858096, "step": 4625 }, { "epoch": 4.8128898128898125, "grad_norm": 3.6439883708953857, "learning_rate": 2.1536107740135482e-07, "loss": 0.2671, "num_input_tokens_seen": 1860272, "step": 4630 }, { "epoch": 4.8180873180873185, "grad_norm": 10.452295303344727, "learning_rate": 2.0364297506060003e-07, "loss": 0.2501, "num_input_tokens_seen": 1862256, "step": 4635 }, { "epoch": 4.8232848232848236, "grad_norm": 7.328881740570068, "learning_rate": 1.922513492655653e-07, "loss": 0.1984, "num_input_tokens_seen": 1864304, "step": 4640 }, { "epoch": 4.828482328482329, "grad_norm": 4.626097202301025, "learning_rate": 1.8118635000194396e-07, "loss": 0.2677, "num_input_tokens_seen": 1866224, "step": 4645 }, { "epoch": 4.833679833679834, "grad_norm": 6.644436836242676, "learning_rate": 1.704481229549526e-07, "loss": 0.2349, "num_input_tokens_seen": 1868336, "step": 4650 }, { "epoch": 4.838877338877339, "grad_norm": 6.079159259796143, "learning_rate": 1.6003680950742728e-07, "loss": 0.2927, "num_input_tokens_seen": 1870448, "step": 4655 }, { "epoch": 4.844074844074844, "grad_norm": 11.934816360473633, "learning_rate": 1.4995254673795812e-07, "loss": 0.2206, "num_input_tokens_seen": 1872368, "step": 4660 }, { "epoch": 4.849272349272349, "grad_norm": 2.1284427642822266, "learning_rate": 1.4019546741908251e-07, "loss": 0.2374, "num_input_tokens_seen": 1874480, "step": 4665 }, { "epoch": 4.854469854469855, "grad_norm": 7.95443868637085, "learning_rate": 1.3076570001553934e-07, "loss": 0.2712, "num_input_tokens_seen": 1876464, "step": 4670 }, { "epoch": 4.85966735966736, "grad_norm": 10.991232872009277, "learning_rate": 1.216633686825841e-07, "loss": 0.2406, "num_input_tokens_seen": 1878448, "step": 4675 }, { "epoch": 4.864864864864865, "grad_norm": 16.078187942504883, "learning_rate": 1.1288859326433477e-07, "loss": 0.2743, "num_input_tokens_seen": 1880432, "step": 4680 }, { "epoch": 4.87006237006237, "grad_norm": 19.031558990478516, "learning_rate": 1.0444148929221464e-07, "loss": 0.2828, "num_input_tokens_seen": 1882544, "step": 4685 }, { "epoch": 4.875259875259875, "grad_norm": 16.82112693786621, "learning_rate": 9.63221679834203e-08, "loss": 0.274, "num_input_tokens_seen": 1884528, "step": 4690 }, { "epoch": 4.88045738045738, "grad_norm": 6.512580394744873, "learning_rate": 8.853073623946162e-08, "loss": 0.2457, "num_input_tokens_seen": 1886640, "step": 4695 }, { "epoch": 4.885654885654886, "grad_norm": 9.699633598327637, "learning_rate": 8.106729664475176e-08, "loss": 0.2663, "num_input_tokens_seen": 1888688, "step": 4700 }, { "epoch": 4.890852390852391, "grad_norm": 14.048091888427734, "learning_rate": 7.393194746525279e-08, "loss": 0.2572, "num_input_tokens_seen": 1890736, "step": 4705 }, { "epoch": 4.896049896049896, "grad_norm": 7.062888145446777, "learning_rate": 6.712478264719601e-08, "loss": 0.2399, "num_input_tokens_seen": 1892720, "step": 4710 }, { "epoch": 4.901247401247401, "grad_norm": 11.242563247680664, "learning_rate": 6.064589181582481e-08, "loss": 0.2267, "num_input_tokens_seen": 1894704, "step": 4715 }, { "epoch": 4.906444906444906, "grad_norm": 1.6842583417892456, "learning_rate": 5.4495360274231524e-08, "loss": 0.2652, "num_input_tokens_seen": 1896624, "step": 4720 }, { "epoch": 4.911642411642411, "grad_norm": 4.220650672912598, "learning_rate": 4.867326900223068e-08, "loss": 0.2254, "num_input_tokens_seen": 1898544, "step": 4725 }, { "epoch": 4.916839916839917, "grad_norm": 3.8804283142089844, "learning_rate": 4.317969465527927e-08, "loss": 0.2342, "num_input_tokens_seen": 1900592, "step": 4730 }, { "epoch": 4.922037422037422, "grad_norm": 6.306464195251465, "learning_rate": 3.8014709563488625e-08, "loss": 0.2151, "num_input_tokens_seen": 1902576, "step": 4735 }, { "epoch": 4.927234927234927, "grad_norm": 5.872474193572998, "learning_rate": 3.317838173066135e-08, "loss": 0.2368, "num_input_tokens_seen": 1904624, "step": 4740 }, { "epoch": 4.9324324324324325, "grad_norm": 23.829078674316406, "learning_rate": 2.8670774833386426e-08, "loss": 0.2662, "num_input_tokens_seen": 1906736, "step": 4745 }, { "epoch": 4.9376299376299375, "grad_norm": 2.1569466590881348, "learning_rate": 2.449194822022327e-08, "loss": 0.221, "num_input_tokens_seen": 1908592, "step": 4750 }, { "epoch": 4.942827442827443, "grad_norm": 5.917527675628662, "learning_rate": 2.064195691089954e-08, "loss": 0.2286, "num_input_tokens_seen": 1910576, "step": 4755 }, { "epoch": 4.948024948024948, "grad_norm": 8.833841323852539, "learning_rate": 1.712085159559784e-08, "loss": 0.2104, "num_input_tokens_seen": 1912624, "step": 4760 }, { "epoch": 4.953222453222454, "grad_norm": 17.08448600769043, "learning_rate": 1.3928678634289593e-08, "loss": 0.2802, "num_input_tokens_seen": 1914608, "step": 4765 }, { "epoch": 4.958419958419959, "grad_norm": 2.2440571784973145, "learning_rate": 1.1065480056110522e-08, "loss": 0.2125, "num_input_tokens_seen": 1916592, "step": 4770 }, { "epoch": 4.963617463617464, "grad_norm": 6.129010200500488, "learning_rate": 8.531293558824982e-09, "loss": 0.2162, "num_input_tokens_seen": 1918704, "step": 4775 }, { "epoch": 4.968814968814969, "grad_norm": 2.9528563022613525, "learning_rate": 6.326152508320804e-09, "loss": 0.2508, "num_input_tokens_seen": 1920624, "step": 4780 }, { "epoch": 4.974012474012474, "grad_norm": 7.140697479248047, "learning_rate": 4.450085938170756e-09, "loss": 0.2453, "num_input_tokens_seen": 1922480, "step": 4785 }, { "epoch": 4.979209979209979, "grad_norm": 13.262871742248535, "learning_rate": 2.9031185492522926e-09, "loss": 0.2758, "num_input_tokens_seen": 1924464, "step": 4790 }, { "epoch": 4.984407484407484, "grad_norm": 8.832655906677246, "learning_rate": 1.6852707094172636e-09, "loss": 0.2275, "num_input_tokens_seen": 1926448, "step": 4795 }, { "epoch": 4.98960498960499, "grad_norm": 11.241654396057129, "learning_rate": 7.965584532282355e-10, "loss": 0.2635, "num_input_tokens_seen": 1928560, "step": 4800 }, { "epoch": 4.994802494802495, "grad_norm": 5.555950164794922, "learning_rate": 2.3699348174754945e-10, "loss": 0.2384, "num_input_tokens_seen": 1930544, "step": 4805 }, { "epoch": 5.0, "grad_norm": 12.928886413574219, "learning_rate": 6.583162381890162e-12, "loss": 0.2943, "num_input_tokens_seen": 1932608, "step": 4810 }, { "epoch": 5.0, "num_input_tokens_seen": 1932608, "step": 4810, "total_flos": 1.1284259767320576e+16, "train_loss": 0.28020706261022177, "train_runtime": 1219.1473, "train_samples_per_second": 31.559, "train_steps_per_second": 3.945 } ], "logging_steps": 5, "max_steps": 4810, "num_input_tokens_seen": 1932608, "num_train_epochs": 5, "save_steps": 241, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.1284259767320576e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }