{ "best_global_step": 964, "best_metric": 0.17627178132534027, "best_model_checkpoint": "saves_bts_preliminary/base/llama-3.2-1b-instruct/train_cola_42_1776331560/checkpoint-964", "epoch": 5.0, "eval_steps": 241, "global_step": 4810, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.005197505197505198, "grad_norm": 440.98797607421875, "learning_rate": 4.158004158004159e-08, "loss": 1.2917, "num_input_tokens_seen": 2048, "step": 5 }, { "epoch": 0.010395010395010396, "grad_norm": 396.0676574707031, "learning_rate": 9.355509355509357e-08, "loss": 1.2491, "num_input_tokens_seen": 4224, "step": 10 }, { "epoch": 0.015592515592515593, "grad_norm": 480.88055419921875, "learning_rate": 1.4553014553014554e-07, "loss": 1.3117, "num_input_tokens_seen": 6272, "step": 15 }, { "epoch": 0.02079002079002079, "grad_norm": 408.1519775390625, "learning_rate": 1.9750519750519752e-07, "loss": 1.1366, "num_input_tokens_seen": 8384, "step": 20 }, { "epoch": 0.02598752598752599, "grad_norm": 239.34207153320312, "learning_rate": 2.494802494802495e-07, "loss": 0.7792, "num_input_tokens_seen": 10496, "step": 25 }, { "epoch": 0.031185031185031187, "grad_norm": 36.67585754394531, "learning_rate": 3.014553014553015e-07, "loss": 0.523, "num_input_tokens_seen": 12544, "step": 30 }, { "epoch": 0.036382536382536385, "grad_norm": 53.71092987060547, "learning_rate": 3.534303534303535e-07, "loss": 0.3194, "num_input_tokens_seen": 14528, "step": 35 }, { "epoch": 0.04158004158004158, "grad_norm": 72.29695129394531, "learning_rate": 4.0540540540540546e-07, "loss": 0.3482, "num_input_tokens_seen": 16576, "step": 40 }, { "epoch": 0.04677754677754678, "grad_norm": 158.2495880126953, "learning_rate": 4.5738045738045745e-07, "loss": 0.377, "num_input_tokens_seen": 18560, "step": 45 }, { "epoch": 0.05197505197505198, "grad_norm": 47.052833557128906, "learning_rate": 5.093555093555094e-07, "loss": 0.2695, "num_input_tokens_seen": 20608, "step": 50 }, { "epoch": 0.057172557172557176, "grad_norm": 29.388397216796875, "learning_rate": 5.613305613305614e-07, "loss": 0.2431, "num_input_tokens_seen": 22656, "step": 55 }, { "epoch": 0.062370062370062374, "grad_norm": 94.9759292602539, "learning_rate": 6.133056133056134e-07, "loss": 0.3162, "num_input_tokens_seen": 24640, "step": 60 }, { "epoch": 0.06756756756756757, "grad_norm": 108.12734985351562, "learning_rate": 6.652806652806654e-07, "loss": 0.3171, "num_input_tokens_seen": 26752, "step": 65 }, { "epoch": 0.07276507276507277, "grad_norm": 64.33253479003906, "learning_rate": 7.172557172557173e-07, "loss": 0.4943, "num_input_tokens_seen": 28608, "step": 70 }, { "epoch": 0.07796257796257797, "grad_norm": 85.49552154541016, "learning_rate": 7.692307692307694e-07, "loss": 0.3067, "num_input_tokens_seen": 30912, "step": 75 }, { "epoch": 0.08316008316008316, "grad_norm": 71.20551300048828, "learning_rate": 8.212058212058213e-07, "loss": 0.5316, "num_input_tokens_seen": 32896, "step": 80 }, { "epoch": 0.08835758835758836, "grad_norm": 8.865546226501465, "learning_rate": 8.731808731808733e-07, "loss": 0.3228, "num_input_tokens_seen": 34816, "step": 85 }, { "epoch": 0.09355509355509356, "grad_norm": 39.09188461303711, "learning_rate": 9.251559251559253e-07, "loss": 0.3339, "num_input_tokens_seen": 36736, "step": 90 }, { "epoch": 0.09875259875259876, "grad_norm": 8.73692512512207, "learning_rate": 9.771309771309773e-07, "loss": 0.2951, "num_input_tokens_seen": 38720, "step": 95 }, { "epoch": 0.10395010395010396, "grad_norm": 16.94306182861328, "learning_rate": 1.0291060291060292e-06, "loss": 0.2279, "num_input_tokens_seen": 40640, "step": 100 }, { "epoch": 0.10914760914760915, "grad_norm": 23.53331756591797, "learning_rate": 1.0810810810810812e-06, "loss": 0.2562, "num_input_tokens_seen": 42688, "step": 105 }, { "epoch": 0.11434511434511435, "grad_norm": 104.1171646118164, "learning_rate": 1.1330561330561333e-06, "loss": 0.2794, "num_input_tokens_seen": 44544, "step": 110 }, { "epoch": 0.11954261954261955, "grad_norm": 18.66437530517578, "learning_rate": 1.1850311850311852e-06, "loss": 0.2511, "num_input_tokens_seen": 46400, "step": 115 }, { "epoch": 0.12474012474012475, "grad_norm": 52.965763092041016, "learning_rate": 1.2370062370062372e-06, "loss": 0.2503, "num_input_tokens_seen": 48448, "step": 120 }, { "epoch": 0.12993762993762994, "grad_norm": 38.35150146484375, "learning_rate": 1.288981288981289e-06, "loss": 0.3088, "num_input_tokens_seen": 50496, "step": 125 }, { "epoch": 0.13513513513513514, "grad_norm": 67.04692840576172, "learning_rate": 1.340956340956341e-06, "loss": 0.2409, "num_input_tokens_seen": 52416, "step": 130 }, { "epoch": 0.14033264033264034, "grad_norm": 44.75727844238281, "learning_rate": 1.3929313929313932e-06, "loss": 0.2575, "num_input_tokens_seen": 54464, "step": 135 }, { "epoch": 0.14553014553014554, "grad_norm": 21.614540100097656, "learning_rate": 1.4449064449064451e-06, "loss": 0.2283, "num_input_tokens_seen": 56448, "step": 140 }, { "epoch": 0.15072765072765074, "grad_norm": 66.04798126220703, "learning_rate": 1.496881496881497e-06, "loss": 0.2004, "num_input_tokens_seen": 58368, "step": 145 }, { "epoch": 0.15592515592515593, "grad_norm": 141.60061645507812, "learning_rate": 1.548856548856549e-06, "loss": 0.3166, "num_input_tokens_seen": 60544, "step": 150 }, { "epoch": 0.16112266112266113, "grad_norm": 102.77252960205078, "learning_rate": 1.6008316008316011e-06, "loss": 0.3419, "num_input_tokens_seen": 62592, "step": 155 }, { "epoch": 0.16632016632016633, "grad_norm": 21.740901947021484, "learning_rate": 1.652806652806653e-06, "loss": 0.2605, "num_input_tokens_seen": 64576, "step": 160 }, { "epoch": 0.17151767151767153, "grad_norm": 58.935909271240234, "learning_rate": 1.704781704781705e-06, "loss": 0.2617, "num_input_tokens_seen": 66688, "step": 165 }, { "epoch": 0.17671517671517672, "grad_norm": 14.498135566711426, "learning_rate": 1.756756756756757e-06, "loss": 0.2505, "num_input_tokens_seen": 68544, "step": 170 }, { "epoch": 0.18191268191268192, "grad_norm": 55.772621154785156, "learning_rate": 1.808731808731809e-06, "loss": 0.2672, "num_input_tokens_seen": 70592, "step": 175 }, { "epoch": 0.18711018711018712, "grad_norm": 37.71332931518555, "learning_rate": 1.860706860706861e-06, "loss": 0.2488, "num_input_tokens_seen": 72576, "step": 180 }, { "epoch": 0.19230769230769232, "grad_norm": 14.720597267150879, "learning_rate": 1.912681912681913e-06, "loss": 0.1863, "num_input_tokens_seen": 74624, "step": 185 }, { "epoch": 0.19750519750519752, "grad_norm": 25.971519470214844, "learning_rate": 1.964656964656965e-06, "loss": 0.1639, "num_input_tokens_seen": 76608, "step": 190 }, { "epoch": 0.20270270270270271, "grad_norm": 135.57969665527344, "learning_rate": 2.016632016632017e-06, "loss": 0.3799, "num_input_tokens_seen": 78720, "step": 195 }, { "epoch": 0.2079002079002079, "grad_norm": 85.43441009521484, "learning_rate": 2.0686070686070687e-06, "loss": 0.455, "num_input_tokens_seen": 81152, "step": 200 }, { "epoch": 0.2130977130977131, "grad_norm": 21.497081756591797, "learning_rate": 2.120582120582121e-06, "loss": 0.2303, "num_input_tokens_seen": 83200, "step": 205 }, { "epoch": 0.2182952182952183, "grad_norm": 43.83966064453125, "learning_rate": 2.172557172557173e-06, "loss": 0.2985, "num_input_tokens_seen": 85184, "step": 210 }, { "epoch": 0.2234927234927235, "grad_norm": 21.950963973999023, "learning_rate": 2.2245322245322247e-06, "loss": 0.1462, "num_input_tokens_seen": 87232, "step": 215 }, { "epoch": 0.2286902286902287, "grad_norm": 16.27555274963379, "learning_rate": 2.276507276507277e-06, "loss": 0.2323, "num_input_tokens_seen": 89152, "step": 220 }, { "epoch": 0.2338877338877339, "grad_norm": 20.64867401123047, "learning_rate": 2.3284823284823286e-06, "loss": 0.3453, "num_input_tokens_seen": 91328, "step": 225 }, { "epoch": 0.2390852390852391, "grad_norm": 17.483060836791992, "learning_rate": 2.3804573804573807e-06, "loss": 0.1965, "num_input_tokens_seen": 93312, "step": 230 }, { "epoch": 0.2442827442827443, "grad_norm": 18.55425262451172, "learning_rate": 2.432432432432433e-06, "loss": 0.186, "num_input_tokens_seen": 95296, "step": 235 }, { "epoch": 0.2494802494802495, "grad_norm": 59.2343635559082, "learning_rate": 2.4844074844074846e-06, "loss": 0.2021, "num_input_tokens_seen": 97216, "step": 240 }, { "epoch": 0.2505197505197505, "eval_loss": 0.2780425250530243, "eval_runtime": 1.0326, "eval_samples_per_second": 829.001, "eval_steps_per_second": 103.625, "num_input_tokens_seen": 97664, "step": 241 }, { "epoch": 0.25467775467775466, "grad_norm": 32.85165023803711, "learning_rate": 2.5363825363825367e-06, "loss": 0.2316, "num_input_tokens_seen": 99264, "step": 245 }, { "epoch": 0.2598752598752599, "grad_norm": 52.35283660888672, "learning_rate": 2.5883575883575885e-06, "loss": 0.2509, "num_input_tokens_seen": 101184, "step": 250 }, { "epoch": 0.26507276507276506, "grad_norm": 18.137977600097656, "learning_rate": 2.6403326403326406e-06, "loss": 0.1696, "num_input_tokens_seen": 103296, "step": 255 }, { "epoch": 0.2702702702702703, "grad_norm": 35.994441986083984, "learning_rate": 2.6923076923076923e-06, "loss": 0.3451, "num_input_tokens_seen": 105344, "step": 260 }, { "epoch": 0.27546777546777546, "grad_norm": 18.656810760498047, "learning_rate": 2.7442827442827445e-06, "loss": 0.1851, "num_input_tokens_seen": 107392, "step": 265 }, { "epoch": 0.2806652806652807, "grad_norm": 21.467487335205078, "learning_rate": 2.796257796257796e-06, "loss": 0.2177, "num_input_tokens_seen": 109440, "step": 270 }, { "epoch": 0.28586278586278585, "grad_norm": 61.47985076904297, "learning_rate": 2.8482328482328488e-06, "loss": 0.2767, "num_input_tokens_seen": 111424, "step": 275 }, { "epoch": 0.2910602910602911, "grad_norm": 38.212928771972656, "learning_rate": 2.9002079002079005e-06, "loss": 0.3802, "num_input_tokens_seen": 113408, "step": 280 }, { "epoch": 0.29625779625779625, "grad_norm": 29.204410552978516, "learning_rate": 2.9521829521829526e-06, "loss": 0.2173, "num_input_tokens_seen": 115392, "step": 285 }, { "epoch": 0.30145530145530147, "grad_norm": 38.67171859741211, "learning_rate": 3.0041580041580043e-06, "loss": 0.2155, "num_input_tokens_seen": 117440, "step": 290 }, { "epoch": 0.30665280665280664, "grad_norm": 17.80721664428711, "learning_rate": 3.0561330561330565e-06, "loss": 0.2116, "num_input_tokens_seen": 119424, "step": 295 }, { "epoch": 0.31185031185031187, "grad_norm": 30.9665470123291, "learning_rate": 3.1081081081081082e-06, "loss": 0.2158, "num_input_tokens_seen": 121344, "step": 300 }, { "epoch": 0.31704781704781704, "grad_norm": 24.32500648498535, "learning_rate": 3.1600831600831604e-06, "loss": 0.2048, "num_input_tokens_seen": 123264, "step": 305 }, { "epoch": 0.32224532224532226, "grad_norm": 24.350160598754883, "learning_rate": 3.212058212058212e-06, "loss": 0.2194, "num_input_tokens_seen": 125184, "step": 310 }, { "epoch": 0.32744282744282743, "grad_norm": 17.5281982421875, "learning_rate": 3.2640332640332646e-06, "loss": 0.1868, "num_input_tokens_seen": 127296, "step": 315 }, { "epoch": 0.33264033264033266, "grad_norm": 30.43378257751465, "learning_rate": 3.3160083160083164e-06, "loss": 0.217, "num_input_tokens_seen": 129408, "step": 320 }, { "epoch": 0.33783783783783783, "grad_norm": 7.501856327056885, "learning_rate": 3.3679833679833685e-06, "loss": 0.179, "num_input_tokens_seen": 131520, "step": 325 }, { "epoch": 0.34303534303534305, "grad_norm": 37.26128005981445, "learning_rate": 3.4199584199584202e-06, "loss": 0.2437, "num_input_tokens_seen": 133568, "step": 330 }, { "epoch": 0.3482328482328482, "grad_norm": 50.59195327758789, "learning_rate": 3.4719334719334724e-06, "loss": 0.1981, "num_input_tokens_seen": 135616, "step": 335 }, { "epoch": 0.35343035343035345, "grad_norm": 27.73749542236328, "learning_rate": 3.523908523908524e-06, "loss": 0.3892, "num_input_tokens_seen": 137664, "step": 340 }, { "epoch": 0.3586278586278586, "grad_norm": 25.262042999267578, "learning_rate": 3.5758835758835762e-06, "loss": 0.1327, "num_input_tokens_seen": 139584, "step": 345 }, { "epoch": 0.36382536382536385, "grad_norm": 33.850399017333984, "learning_rate": 3.627858627858628e-06, "loss": 0.2165, "num_input_tokens_seen": 141504, "step": 350 }, { "epoch": 0.369022869022869, "grad_norm": 40.24353790283203, "learning_rate": 3.6798336798336805e-06, "loss": 0.2907, "num_input_tokens_seen": 143552, "step": 355 }, { "epoch": 0.37422037422037424, "grad_norm": 26.471458435058594, "learning_rate": 3.7318087318087322e-06, "loss": 0.3012, "num_input_tokens_seen": 145536, "step": 360 }, { "epoch": 0.3794178794178794, "grad_norm": 24.082931518554688, "learning_rate": 3.7837837837837844e-06, "loss": 0.2293, "num_input_tokens_seen": 147456, "step": 365 }, { "epoch": 0.38461538461538464, "grad_norm": 17.22445297241211, "learning_rate": 3.835758835758836e-06, "loss": 0.1782, "num_input_tokens_seen": 149440, "step": 370 }, { "epoch": 0.3898128898128898, "grad_norm": 28.956668853759766, "learning_rate": 3.887733887733889e-06, "loss": 0.3696, "num_input_tokens_seen": 151360, "step": 375 }, { "epoch": 0.39501039501039503, "grad_norm": 33.45877456665039, "learning_rate": 3.9397089397089396e-06, "loss": 0.309, "num_input_tokens_seen": 153344, "step": 380 }, { "epoch": 0.4002079002079002, "grad_norm": 5.932255744934082, "learning_rate": 3.991683991683992e-06, "loss": 0.2413, "num_input_tokens_seen": 155264, "step": 385 }, { "epoch": 0.40540540540540543, "grad_norm": 21.102771759033203, "learning_rate": 4.043659043659044e-06, "loss": 0.3064, "num_input_tokens_seen": 157248, "step": 390 }, { "epoch": 0.4106029106029106, "grad_norm": 4.685667991638184, "learning_rate": 4.095634095634096e-06, "loss": 0.2798, "num_input_tokens_seen": 159296, "step": 395 }, { "epoch": 0.4158004158004158, "grad_norm": 17.0113525390625, "learning_rate": 4.147609147609148e-06, "loss": 0.3488, "num_input_tokens_seen": 161344, "step": 400 }, { "epoch": 0.420997920997921, "grad_norm": 6.919645309448242, "learning_rate": 4.1995841995842e-06, "loss": 0.2072, "num_input_tokens_seen": 163328, "step": 405 }, { "epoch": 0.4261954261954262, "grad_norm": 69.12760925292969, "learning_rate": 4.2515592515592516e-06, "loss": 0.1704, "num_input_tokens_seen": 165312, "step": 410 }, { "epoch": 0.4313929313929314, "grad_norm": 5.609197616577148, "learning_rate": 4.303534303534304e-06, "loss": 0.0573, "num_input_tokens_seen": 167360, "step": 415 }, { "epoch": 0.4365904365904366, "grad_norm": 99.32202911376953, "learning_rate": 4.355509355509356e-06, "loss": 0.9576, "num_input_tokens_seen": 169344, "step": 420 }, { "epoch": 0.4417879417879418, "grad_norm": 10.58239459991455, "learning_rate": 4.4074844074844084e-06, "loss": 0.3222, "num_input_tokens_seen": 171456, "step": 425 }, { "epoch": 0.446985446985447, "grad_norm": 20.188488006591797, "learning_rate": 4.45945945945946e-06, "loss": 0.3442, "num_input_tokens_seen": 173568, "step": 430 }, { "epoch": 0.4521829521829522, "grad_norm": 8.674958229064941, "learning_rate": 4.511434511434512e-06, "loss": 0.1851, "num_input_tokens_seen": 175552, "step": 435 }, { "epoch": 0.4573804573804574, "grad_norm": 22.08028793334961, "learning_rate": 4.563409563409564e-06, "loss": 0.2573, "num_input_tokens_seen": 177536, "step": 440 }, { "epoch": 0.4625779625779626, "grad_norm": 11.568997383117676, "learning_rate": 4.615384615384616e-06, "loss": 0.2972, "num_input_tokens_seen": 179584, "step": 445 }, { "epoch": 0.4677754677754678, "grad_norm": 6.849438190460205, "learning_rate": 4.667359667359668e-06, "loss": 0.2247, "num_input_tokens_seen": 181568, "step": 450 }, { "epoch": 0.47297297297297297, "grad_norm": 3.9055252075195312, "learning_rate": 4.71933471933472e-06, "loss": 0.2355, "num_input_tokens_seen": 183552, "step": 455 }, { "epoch": 0.4781704781704782, "grad_norm": 20.351293563842773, "learning_rate": 4.771309771309771e-06, "loss": 0.1821, "num_input_tokens_seen": 185600, "step": 460 }, { "epoch": 0.48336798336798337, "grad_norm": 21.34255599975586, "learning_rate": 4.823284823284824e-06, "loss": 0.1938, "num_input_tokens_seen": 187584, "step": 465 }, { "epoch": 0.4885654885654886, "grad_norm": 28.844085693359375, "learning_rate": 4.875259875259876e-06, "loss": 0.2747, "num_input_tokens_seen": 189568, "step": 470 }, { "epoch": 0.49376299376299376, "grad_norm": 14.666620254516602, "learning_rate": 4.927234927234928e-06, "loss": 0.2394, "num_input_tokens_seen": 191680, "step": 475 }, { "epoch": 0.498960498960499, "grad_norm": 23.078649520874023, "learning_rate": 4.97920997920998e-06, "loss": 0.2402, "num_input_tokens_seen": 193728, "step": 480 }, { "epoch": 0.501039501039501, "eval_loss": 0.20022711157798767, "eval_runtime": 1.0474, "eval_samples_per_second": 817.231, "eval_steps_per_second": 102.154, "num_input_tokens_seen": 194560, "step": 482 }, { "epoch": 0.5041580041580042, "grad_norm": 49.961700439453125, "learning_rate": 4.999994075155936e-06, "loss": 0.1873, "num_input_tokens_seen": 195776, "step": 485 }, { "epoch": 0.5093555093555093, "grad_norm": 21.11049461364746, "learning_rate": 4.999957867877242e-06, "loss": 0.1905, "num_input_tokens_seen": 197696, "step": 490 }, { "epoch": 0.5145530145530145, "grad_norm": 40.248802185058594, "learning_rate": 4.999888745376028e-06, "loss": 0.1952, "num_input_tokens_seen": 199680, "step": 495 }, { "epoch": 0.5197505197505198, "grad_norm": 25.25174903869629, "learning_rate": 4.999786708562382e-06, "loss": 0.2149, "num_input_tokens_seen": 201792, "step": 500 }, { "epoch": 0.524948024948025, "grad_norm": 30.329490661621094, "learning_rate": 4.999651758779753e-06, "loss": 0.2066, "num_input_tokens_seen": 203840, "step": 505 }, { "epoch": 0.5301455301455301, "grad_norm": 23.636180877685547, "learning_rate": 4.999483897804933e-06, "loss": 0.2161, "num_input_tokens_seen": 205824, "step": 510 }, { "epoch": 0.5353430353430353, "grad_norm": 29.035306930541992, "learning_rate": 4.999283127848029e-06, "loss": 0.1777, "num_input_tokens_seen": 207936, "step": 515 }, { "epoch": 0.5405405405405406, "grad_norm": 21.316884994506836, "learning_rate": 4.999049451552443e-06, "loss": 0.1931, "num_input_tokens_seen": 209984, "step": 520 }, { "epoch": 0.5457380457380457, "grad_norm": 39.675086975097656, "learning_rate": 4.998782871994828e-06, "loss": 0.3235, "num_input_tokens_seen": 212096, "step": 525 }, { "epoch": 0.5509355509355509, "grad_norm": 20.291854858398438, "learning_rate": 4.998483392685055e-06, "loss": 0.2083, "num_input_tokens_seen": 214080, "step": 530 }, { "epoch": 0.5561330561330561, "grad_norm": 11.547039985656738, "learning_rate": 4.9981510175661606e-06, "loss": 0.2592, "num_input_tokens_seen": 216128, "step": 535 }, { "epoch": 0.5613305613305614, "grad_norm": 14.435676574707031, "learning_rate": 4.9977857510143e-06, "loss": 0.2199, "num_input_tokens_seen": 218176, "step": 540 }, { "epoch": 0.5665280665280665, "grad_norm": 11.747395515441895, "learning_rate": 4.997387597838684e-06, "loss": 0.1414, "num_input_tokens_seen": 220096, "step": 545 }, { "epoch": 0.5717255717255717, "grad_norm": 39.84230422973633, "learning_rate": 4.996956563281524e-06, "loss": 0.1874, "num_input_tokens_seen": 222080, "step": 550 }, { "epoch": 0.5769230769230769, "grad_norm": 41.40126419067383, "learning_rate": 4.996492653017953e-06, "loss": 0.2643, "num_input_tokens_seen": 224000, "step": 555 }, { "epoch": 0.5821205821205822, "grad_norm": 26.15458106994629, "learning_rate": 4.995995873155958e-06, "loss": 0.2975, "num_input_tokens_seen": 225984, "step": 560 }, { "epoch": 0.5873180873180873, "grad_norm": 17.151378631591797, "learning_rate": 4.995466230236298e-06, "loss": 0.1955, "num_input_tokens_seen": 227840, "step": 565 }, { "epoch": 0.5925155925155925, "grad_norm": 15.602777481079102, "learning_rate": 4.994903731232415e-06, "loss": 0.2476, "num_input_tokens_seen": 229824, "step": 570 }, { "epoch": 0.5977130977130977, "grad_norm": 6.400282859802246, "learning_rate": 4.994308383550347e-06, "loss": 0.213, "num_input_tokens_seen": 231872, "step": 575 }, { "epoch": 0.6029106029106029, "grad_norm": 21.05705451965332, "learning_rate": 4.993680195028626e-06, "loss": 0.2039, "num_input_tokens_seen": 233920, "step": 580 }, { "epoch": 0.6081081081081081, "grad_norm": 20.211143493652344, "learning_rate": 4.993019173938178e-06, "loss": 0.2036, "num_input_tokens_seen": 235840, "step": 585 }, { "epoch": 0.6133056133056133, "grad_norm": 7.714716911315918, "learning_rate": 4.992325328982212e-06, "loss": 0.2111, "num_input_tokens_seen": 238016, "step": 590 }, { "epoch": 0.6185031185031185, "grad_norm": 11.061738967895508, "learning_rate": 4.991598669296105e-06, "loss": 0.1706, "num_input_tokens_seen": 240064, "step": 595 }, { "epoch": 0.6237006237006237, "grad_norm": 37.05807113647461, "learning_rate": 4.990839204447287e-06, "loss": 0.2236, "num_input_tokens_seen": 242048, "step": 600 }, { "epoch": 0.6288981288981289, "grad_norm": 23.512836456298828, "learning_rate": 4.990046944435105e-06, "loss": 0.1908, "num_input_tokens_seen": 243968, "step": 605 }, { "epoch": 0.6340956340956341, "grad_norm": 26.345306396484375, "learning_rate": 4.989221899690704e-06, "loss": 0.2409, "num_input_tokens_seen": 246016, "step": 610 }, { "epoch": 0.6392931392931392, "grad_norm": 8.184889793395996, "learning_rate": 4.988364081076877e-06, "loss": 0.2135, "num_input_tokens_seen": 248000, "step": 615 }, { "epoch": 0.6444906444906445, "grad_norm": 8.331661224365234, "learning_rate": 4.987473499887932e-06, "loss": 0.203, "num_input_tokens_seen": 250048, "step": 620 }, { "epoch": 0.6496881496881497, "grad_norm": 19.97974967956543, "learning_rate": 4.986550167849538e-06, "loss": 0.1867, "num_input_tokens_seen": 252096, "step": 625 }, { "epoch": 0.6548856548856549, "grad_norm": 15.157999992370605, "learning_rate": 4.9855940971185705e-06, "loss": 0.1162, "num_input_tokens_seen": 254144, "step": 630 }, { "epoch": 0.66008316008316, "grad_norm": 9.33337116241455, "learning_rate": 4.984605300282955e-06, "loss": 0.2562, "num_input_tokens_seen": 256128, "step": 635 }, { "epoch": 0.6652806652806653, "grad_norm": 28.826885223388672, "learning_rate": 4.983583790361497e-06, "loss": 0.1389, "num_input_tokens_seen": 258048, "step": 640 }, { "epoch": 0.6704781704781705, "grad_norm": 48.085391998291016, "learning_rate": 4.982529580803714e-06, "loss": 0.3054, "num_input_tokens_seen": 260352, "step": 645 }, { "epoch": 0.6756756756756757, "grad_norm": 24.728063583374023, "learning_rate": 4.981442685489659e-06, "loss": 0.2884, "num_input_tokens_seen": 262272, "step": 650 }, { "epoch": 0.6808731808731808, "grad_norm": 24.10839080810547, "learning_rate": 4.9803231187297305e-06, "loss": 0.1599, "num_input_tokens_seen": 264320, "step": 655 }, { "epoch": 0.6860706860706861, "grad_norm": 10.08352279663086, "learning_rate": 4.979170895264494e-06, "loss": 0.1946, "num_input_tokens_seen": 266240, "step": 660 }, { "epoch": 0.6912681912681913, "grad_norm": 17.471120834350586, "learning_rate": 4.977986030264483e-06, "loss": 0.2128, "num_input_tokens_seen": 268224, "step": 665 }, { "epoch": 0.6964656964656964, "grad_norm": 19.734243392944336, "learning_rate": 4.9767685393299946e-06, "loss": 0.2326, "num_input_tokens_seen": 270272, "step": 670 }, { "epoch": 0.7016632016632016, "grad_norm": 8.745848655700684, "learning_rate": 4.975518438490897e-06, "loss": 0.2276, "num_input_tokens_seen": 272256, "step": 675 }, { "epoch": 0.7068607068607069, "grad_norm": 24.683629989624023, "learning_rate": 4.974235744206405e-06, "loss": 0.1786, "num_input_tokens_seen": 274240, "step": 680 }, { "epoch": 0.7120582120582121, "grad_norm": 32.86091232299805, "learning_rate": 4.972920473364869e-06, "loss": 0.1923, "num_input_tokens_seen": 276288, "step": 685 }, { "epoch": 0.7172557172557172, "grad_norm": 13.548423767089844, "learning_rate": 4.971572643283557e-06, "loss": 0.1661, "num_input_tokens_seen": 278272, "step": 690 }, { "epoch": 0.7224532224532224, "grad_norm": 31.974199295043945, "learning_rate": 4.970192271708416e-06, "loss": 0.1867, "num_input_tokens_seen": 280384, "step": 695 }, { "epoch": 0.7276507276507277, "grad_norm": 16.395275115966797, "learning_rate": 4.968779376813849e-06, "loss": 0.3333, "num_input_tokens_seen": 282368, "step": 700 }, { "epoch": 0.7328482328482329, "grad_norm": 12.498151779174805, "learning_rate": 4.967333977202469e-06, "loss": 0.1327, "num_input_tokens_seen": 284416, "step": 705 }, { "epoch": 0.738045738045738, "grad_norm": 70.73739624023438, "learning_rate": 4.965856091904855e-06, "loss": 0.2235, "num_input_tokens_seen": 286464, "step": 710 }, { "epoch": 0.7432432432432432, "grad_norm": 11.769681930541992, "learning_rate": 4.964345740379307e-06, "loss": 0.3413, "num_input_tokens_seen": 288448, "step": 715 }, { "epoch": 0.7484407484407485, "grad_norm": 9.002899169921875, "learning_rate": 4.962802942511582e-06, "loss": 0.1906, "num_input_tokens_seen": 290496, "step": 720 }, { "epoch": 0.7515592515592515, "eval_loss": 0.20943090319633484, "eval_runtime": 1.0284, "eval_samples_per_second": 832.355, "eval_steps_per_second": 104.044, "num_input_tokens_seen": 291712, "step": 723 }, { "epoch": 0.7536382536382537, "grad_norm": 30.15701675415039, "learning_rate": 4.961227718614634e-06, "loss": 0.2576, "num_input_tokens_seen": 292480, "step": 725 }, { "epoch": 0.7588357588357588, "grad_norm": 16.669214248657227, "learning_rate": 4.959620089428354e-06, "loss": 0.2352, "num_input_tokens_seen": 294464, "step": 730 }, { "epoch": 0.764033264033264, "grad_norm": 24.175790786743164, "learning_rate": 4.957980076119285e-06, "loss": 0.2617, "num_input_tokens_seen": 296448, "step": 735 }, { "epoch": 0.7692307692307693, "grad_norm": 14.268982887268066, "learning_rate": 4.956307700280354e-06, "loss": 0.2079, "num_input_tokens_seen": 298432, "step": 740 }, { "epoch": 0.7744282744282744, "grad_norm": 6.003777980804443, "learning_rate": 4.954602983930581e-06, "loss": 0.2712, "num_input_tokens_seen": 300480, "step": 745 }, { "epoch": 0.7796257796257796, "grad_norm": 9.822731018066406, "learning_rate": 4.95286594951479e-06, "loss": 0.2211, "num_input_tokens_seen": 302400, "step": 750 }, { "epoch": 0.7848232848232848, "grad_norm": 13.10158920288086, "learning_rate": 4.951096619903317e-06, "loss": 0.2161, "num_input_tokens_seen": 304320, "step": 755 }, { "epoch": 0.7900207900207901, "grad_norm": 6.7825775146484375, "learning_rate": 4.949295018391707e-06, "loss": 0.1828, "num_input_tokens_seen": 306240, "step": 760 }, { "epoch": 0.7952182952182952, "grad_norm": 16.962614059448242, "learning_rate": 4.9474611687004025e-06, "loss": 0.2155, "num_input_tokens_seen": 308032, "step": 765 }, { "epoch": 0.8004158004158004, "grad_norm": 11.7578125, "learning_rate": 4.945595094974442e-06, "loss": 0.2009, "num_input_tokens_seen": 309952, "step": 770 }, { "epoch": 0.8056133056133056, "grad_norm": 11.759556770324707, "learning_rate": 4.94369682178313e-06, "loss": 0.1813, "num_input_tokens_seen": 311936, "step": 775 }, { "epoch": 0.8108108108108109, "grad_norm": 10.483379364013672, "learning_rate": 4.941766374119724e-06, "loss": 0.1603, "num_input_tokens_seen": 313920, "step": 780 }, { "epoch": 0.816008316008316, "grad_norm": 33.7660026550293, "learning_rate": 4.939803777401096e-06, "loss": 0.2613, "num_input_tokens_seen": 315968, "step": 785 }, { "epoch": 0.8212058212058212, "grad_norm": 12.025551795959473, "learning_rate": 4.937809057467404e-06, "loss": 0.2641, "num_input_tokens_seen": 317952, "step": 790 }, { "epoch": 0.8264033264033264, "grad_norm": 14.894133567810059, "learning_rate": 4.935782240581753e-06, "loss": 0.1934, "num_input_tokens_seen": 319872, "step": 795 }, { "epoch": 0.8316008316008316, "grad_norm": 7.2731733322143555, "learning_rate": 4.933723353429842e-06, "loss": 0.2498, "num_input_tokens_seen": 321856, "step": 800 }, { "epoch": 0.8367983367983368, "grad_norm": 7.900448799133301, "learning_rate": 4.931632423119621e-06, "loss": 0.1671, "num_input_tokens_seen": 323968, "step": 805 }, { "epoch": 0.841995841995842, "grad_norm": 13.05286693572998, "learning_rate": 4.929509477180929e-06, "loss": 0.2092, "num_input_tokens_seen": 325952, "step": 810 }, { "epoch": 0.8471933471933472, "grad_norm": 0.7964070439338684, "learning_rate": 4.927354543565131e-06, "loss": 0.0581, "num_input_tokens_seen": 328000, "step": 815 }, { "epoch": 0.8523908523908524, "grad_norm": 68.79032135009766, "learning_rate": 4.925167650644752e-06, "loss": 0.1592, "num_input_tokens_seen": 329984, "step": 820 }, { "epoch": 0.8575883575883576, "grad_norm": 15.014650344848633, "learning_rate": 4.922948827213107e-06, "loss": 0.4462, "num_input_tokens_seen": 331904, "step": 825 }, { "epoch": 0.8627858627858628, "grad_norm": 11.443034172058105, "learning_rate": 4.920698102483913e-06, "loss": 0.4518, "num_input_tokens_seen": 333888, "step": 830 }, { "epoch": 0.867983367983368, "grad_norm": 61.09624481201172, "learning_rate": 4.9184155060909115e-06, "loss": 0.2671, "num_input_tokens_seen": 335872, "step": 835 }, { "epoch": 0.8731808731808732, "grad_norm": 69.81952667236328, "learning_rate": 4.916101068087477e-06, "loss": 0.3681, "num_input_tokens_seen": 337856, "step": 840 }, { "epoch": 0.8783783783783784, "grad_norm": 26.779844284057617, "learning_rate": 4.9137548189462185e-06, "loss": 0.2011, "num_input_tokens_seen": 339776, "step": 845 }, { "epoch": 0.8835758835758836, "grad_norm": 14.02595043182373, "learning_rate": 4.911376789558584e-06, "loss": 0.1852, "num_input_tokens_seen": 341760, "step": 850 }, { "epoch": 0.8887733887733887, "grad_norm": 13.688316345214844, "learning_rate": 4.908967011234446e-06, "loss": 0.3553, "num_input_tokens_seen": 343680, "step": 855 }, { "epoch": 0.893970893970894, "grad_norm": 12.0100679397583, "learning_rate": 4.9065255157016955e-06, "loss": 0.2092, "num_input_tokens_seen": 345600, "step": 860 }, { "epoch": 0.8991683991683992, "grad_norm": 13.758508682250977, "learning_rate": 4.904052335105822e-06, "loss": 0.2165, "num_input_tokens_seen": 347520, "step": 865 }, { "epoch": 0.9043659043659044, "grad_norm": 21.069822311401367, "learning_rate": 4.90154750200949e-06, "loss": 0.1773, "num_input_tokens_seen": 349568, "step": 870 }, { "epoch": 0.9095634095634095, "grad_norm": 12.611119270324707, "learning_rate": 4.899011049392111e-06, "loss": 0.1146, "num_input_tokens_seen": 351552, "step": 875 }, { "epoch": 0.9147609147609148, "grad_norm": 10.34527587890625, "learning_rate": 4.896443010649408e-06, "loss": 0.1213, "num_input_tokens_seen": 353472, "step": 880 }, { "epoch": 0.91995841995842, "grad_norm": 7.383549690246582, "learning_rate": 4.893843419592977e-06, "loss": 0.123, "num_input_tokens_seen": 355392, "step": 885 }, { "epoch": 0.9251559251559252, "grad_norm": 36.267250061035156, "learning_rate": 4.891212310449845e-06, "loss": 0.1794, "num_input_tokens_seen": 357440, "step": 890 }, { "epoch": 0.9303534303534303, "grad_norm": 21.83590316772461, "learning_rate": 4.88854971786201e-06, "loss": 0.1822, "num_input_tokens_seen": 359488, "step": 895 }, { "epoch": 0.9355509355509356, "grad_norm": 74.82781982421875, "learning_rate": 4.885855676885995e-06, "loss": 0.282, "num_input_tokens_seen": 361408, "step": 900 }, { "epoch": 0.9407484407484408, "grad_norm": 27.140975952148438, "learning_rate": 4.88313022299238e-06, "loss": 0.1931, "num_input_tokens_seen": 363392, "step": 905 }, { "epoch": 0.9459459459459459, "grad_norm": 45.93625259399414, "learning_rate": 4.880373392065339e-06, "loss": 0.318, "num_input_tokens_seen": 365440, "step": 910 }, { "epoch": 0.9511434511434511, "grad_norm": 22.00739860534668, "learning_rate": 4.877585220402167e-06, "loss": 0.1793, "num_input_tokens_seen": 367616, "step": 915 }, { "epoch": 0.9563409563409564, "grad_norm": 20.41562843322754, "learning_rate": 4.874765744712796e-06, "loss": 0.1164, "num_input_tokens_seen": 369600, "step": 920 }, { "epoch": 0.9615384615384616, "grad_norm": 30.4830379486084, "learning_rate": 4.8719150021193206e-06, "loss": 0.2515, "num_input_tokens_seen": 371520, "step": 925 }, { "epoch": 0.9667359667359667, "grad_norm": 19.38152503967285, "learning_rate": 4.869033030155504e-06, "loss": 0.3492, "num_input_tokens_seen": 373568, "step": 930 }, { "epoch": 0.9719334719334719, "grad_norm": 14.191635131835938, "learning_rate": 4.866119866766286e-06, "loss": 0.1902, "num_input_tokens_seen": 375488, "step": 935 }, { "epoch": 0.9771309771309772, "grad_norm": 24.351335525512695, "learning_rate": 4.86317555030728e-06, "loss": 0.2238, "num_input_tokens_seen": 377728, "step": 940 }, { "epoch": 0.9823284823284824, "grad_norm": 7.04970121383667, "learning_rate": 4.860200119544273e-06, "loss": 0.11, "num_input_tokens_seen": 379840, "step": 945 }, { "epoch": 0.9875259875259875, "grad_norm": 40.61119079589844, "learning_rate": 4.857193613652711e-06, "loss": 0.2154, "num_input_tokens_seen": 381760, "step": 950 }, { "epoch": 0.9927234927234927, "grad_norm": 18.396310806274414, "learning_rate": 4.854156072217185e-06, "loss": 0.1666, "num_input_tokens_seen": 383808, "step": 955 }, { "epoch": 0.997920997920998, "grad_norm": 8.14262866973877, "learning_rate": 4.851087535230911e-06, "loss": 0.2397, "num_input_tokens_seen": 385856, "step": 960 }, { "epoch": 1.002079002079002, "eval_loss": 0.17627178132534027, "eval_runtime": 1.0408, "eval_samples_per_second": 822.411, "eval_steps_per_second": 102.801, "num_input_tokens_seen": 387464, "step": 964 }, { "epoch": 1.003118503118503, "grad_norm": 17.4815731048584, "learning_rate": 4.8479880430952e-06, "loss": 0.176, "num_input_tokens_seen": 387848, "step": 965 }, { "epoch": 1.0083160083160083, "grad_norm": 3.335278272628784, "learning_rate": 4.844857636618928e-06, "loss": 0.0833, "num_input_tokens_seen": 389640, "step": 970 }, { "epoch": 1.0135135135135136, "grad_norm": 37.46596908569336, "learning_rate": 4.841696357018003e-06, "loss": 0.1134, "num_input_tokens_seen": 391624, "step": 975 }, { "epoch": 1.0187110187110187, "grad_norm": 12.097825050354004, "learning_rate": 4.838504245914812e-06, "loss": 0.0776, "num_input_tokens_seen": 393672, "step": 980 }, { "epoch": 1.023908523908524, "grad_norm": 0.35555675625801086, "learning_rate": 4.835281345337684e-06, "loss": 0.0266, "num_input_tokens_seen": 395784, "step": 985 }, { "epoch": 1.0291060291060292, "grad_norm": 99.47309112548828, "learning_rate": 4.832027697720329e-06, "loss": 0.2075, "num_input_tokens_seen": 397768, "step": 990 }, { "epoch": 1.0343035343035343, "grad_norm": 65.3523178100586, "learning_rate": 4.828743345901285e-06, "loss": 0.4063, "num_input_tokens_seen": 399816, "step": 995 }, { "epoch": 1.0395010395010396, "grad_norm": 1.6000525951385498, "learning_rate": 4.825428333123346e-06, "loss": 0.1017, "num_input_tokens_seen": 401928, "step": 1000 }, { "epoch": 1.0446985446985446, "grad_norm": 5.073741436004639, "learning_rate": 4.822082703033003e-06, "loss": 0.0338, "num_input_tokens_seen": 403912, "step": 1005 }, { "epoch": 1.04989604989605, "grad_norm": 31.460180282592773, "learning_rate": 4.818706499679862e-06, "loss": 0.1392, "num_input_tokens_seen": 405832, "step": 1010 }, { "epoch": 1.0550935550935552, "grad_norm": 13.538461685180664, "learning_rate": 4.815299767516065e-06, "loss": 0.1168, "num_input_tokens_seen": 407880, "step": 1015 }, { "epoch": 1.0602910602910602, "grad_norm": 48.505496978759766, "learning_rate": 4.811862551395707e-06, "loss": 0.1001, "num_input_tokens_seen": 410120, "step": 1020 }, { "epoch": 1.0654885654885655, "grad_norm": 25.246902465820312, "learning_rate": 4.808394896574246e-06, "loss": 0.0944, "num_input_tokens_seen": 412168, "step": 1025 }, { "epoch": 1.0706860706860706, "grad_norm": 40.07487487792969, "learning_rate": 4.8048968487079e-06, "loss": 0.1433, "num_input_tokens_seen": 414472, "step": 1030 }, { "epoch": 1.0758835758835759, "grad_norm": 77.1120834350586, "learning_rate": 4.801368453853057e-06, "loss": 0.3131, "num_input_tokens_seen": 416520, "step": 1035 }, { "epoch": 1.0810810810810811, "grad_norm": 22.368253707885742, "learning_rate": 4.79780975846566e-06, "loss": 0.171, "num_input_tokens_seen": 418568, "step": 1040 }, { "epoch": 1.0862785862785862, "grad_norm": 13.560877799987793, "learning_rate": 4.7942208094006e-06, "loss": 0.1287, "num_input_tokens_seen": 420488, "step": 1045 }, { "epoch": 1.0914760914760915, "grad_norm": 31.480270385742188, "learning_rate": 4.790601653911094e-06, "loss": 0.1098, "num_input_tokens_seen": 422472, "step": 1050 }, { "epoch": 1.0966735966735968, "grad_norm": 34.814151763916016, "learning_rate": 4.786952339648071e-06, "loss": 0.297, "num_input_tokens_seen": 424456, "step": 1055 }, { "epoch": 1.1018711018711018, "grad_norm": 38.07038116455078, "learning_rate": 4.783272914659535e-06, "loss": 0.3308, "num_input_tokens_seen": 426568, "step": 1060 }, { "epoch": 1.107068607068607, "grad_norm": 114.09796905517578, "learning_rate": 4.77956342738994e-06, "loss": 0.1061, "num_input_tokens_seen": 428552, "step": 1065 }, { "epoch": 1.1122661122661124, "grad_norm": 26.920055389404297, "learning_rate": 4.775823926679549e-06, "loss": 0.0996, "num_input_tokens_seen": 430472, "step": 1070 }, { "epoch": 1.1174636174636174, "grad_norm": 20.18915557861328, "learning_rate": 4.77205446176379e-06, "loss": 0.1315, "num_input_tokens_seen": 432328, "step": 1075 }, { "epoch": 1.1226611226611227, "grad_norm": 146.2836151123047, "learning_rate": 4.768255082272612e-06, "loss": 0.2841, "num_input_tokens_seen": 434440, "step": 1080 }, { "epoch": 1.1278586278586278, "grad_norm": 140.84530639648438, "learning_rate": 4.764425838229823e-06, "loss": 0.0783, "num_input_tokens_seen": 436488, "step": 1085 }, { "epoch": 1.133056133056133, "grad_norm": 28.586977005004883, "learning_rate": 4.760566780052445e-06, "loss": 0.346, "num_input_tokens_seen": 438472, "step": 1090 }, { "epoch": 1.1382536382536383, "grad_norm": 42.30632400512695, "learning_rate": 4.756677958550035e-06, "loss": 0.4155, "num_input_tokens_seen": 440456, "step": 1095 }, { "epoch": 1.1434511434511434, "grad_norm": 41.02433395385742, "learning_rate": 4.752759424924026e-06, "loss": 0.1236, "num_input_tokens_seen": 442440, "step": 1100 }, { "epoch": 1.1486486486486487, "grad_norm": 22.878646850585938, "learning_rate": 4.7488112307670515e-06, "loss": 0.099, "num_input_tokens_seen": 444424, "step": 1105 }, { "epoch": 1.1538461538461537, "grad_norm": 18.822031021118164, "learning_rate": 4.7448334280622624e-06, "loss": 0.1891, "num_input_tokens_seen": 446280, "step": 1110 }, { "epoch": 1.159043659043659, "grad_norm": 26.573184967041016, "learning_rate": 4.740826069182645e-06, "loss": 0.1802, "num_input_tokens_seen": 448264, "step": 1115 }, { "epoch": 1.1642411642411643, "grad_norm": 14.406500816345215, "learning_rate": 4.736789206890332e-06, "loss": 0.2325, "num_input_tokens_seen": 450376, "step": 1120 }, { "epoch": 1.1694386694386694, "grad_norm": 2.926020860671997, "learning_rate": 4.732722894335909e-06, "loss": 0.1142, "num_input_tokens_seen": 452552, "step": 1125 }, { "epoch": 1.1746361746361746, "grad_norm": 13.685401916503906, "learning_rate": 4.728627185057711e-06, "loss": 0.1432, "num_input_tokens_seen": 454600, "step": 1130 }, { "epoch": 1.17983367983368, "grad_norm": 37.38111877441406, "learning_rate": 4.724502132981119e-06, "loss": 0.1061, "num_input_tokens_seen": 456648, "step": 1135 }, { "epoch": 1.185031185031185, "grad_norm": 28.67649269104004, "learning_rate": 4.720347792417851e-06, "loss": 0.078, "num_input_tokens_seen": 458632, "step": 1140 }, { "epoch": 1.1902286902286903, "grad_norm": 83.61138153076172, "learning_rate": 4.716164218065246e-06, "loss": 0.1068, "num_input_tokens_seen": 460680, "step": 1145 }, { "epoch": 1.1954261954261955, "grad_norm": 14.42905330657959, "learning_rate": 4.711951465005548e-06, "loss": 0.2177, "num_input_tokens_seen": 462728, "step": 1150 }, { "epoch": 1.2006237006237006, "grad_norm": 9.12094783782959, "learning_rate": 4.707709588705169e-06, "loss": 0.058, "num_input_tokens_seen": 464776, "step": 1155 }, { "epoch": 1.2058212058212059, "grad_norm": 33.16276550292969, "learning_rate": 4.7034386450139735e-06, "loss": 0.3544, "num_input_tokens_seen": 466696, "step": 1160 }, { "epoch": 1.211018711018711, "grad_norm": 29.38105010986328, "learning_rate": 4.699138690164533e-06, "loss": 0.1744, "num_input_tokens_seen": 468616, "step": 1165 }, { "epoch": 1.2162162162162162, "grad_norm": 41.46559143066406, "learning_rate": 4.694809780771391e-06, "loss": 0.1842, "num_input_tokens_seen": 470728, "step": 1170 }, { "epoch": 1.2214137214137215, "grad_norm": 15.253453254699707, "learning_rate": 4.690451973830314e-06, "loss": 0.1067, "num_input_tokens_seen": 472776, "step": 1175 }, { "epoch": 1.2266112266112266, "grad_norm": 32.71086120605469, "learning_rate": 4.6860653267175425e-06, "loss": 0.177, "num_input_tokens_seen": 474824, "step": 1180 }, { "epoch": 1.2318087318087318, "grad_norm": 14.900812149047852, "learning_rate": 4.681649897189036e-06, "loss": 0.2562, "num_input_tokens_seen": 476744, "step": 1185 }, { "epoch": 1.237006237006237, "grad_norm": 11.960613250732422, "learning_rate": 4.677205743379714e-06, "loss": 0.053, "num_input_tokens_seen": 478856, "step": 1190 }, { "epoch": 1.2422037422037422, "grad_norm": 33.26080322265625, "learning_rate": 4.672732923802685e-06, "loss": 0.1686, "num_input_tokens_seen": 480776, "step": 1195 }, { "epoch": 1.2474012474012475, "grad_norm": 13.924117088317871, "learning_rate": 4.6682314973484844e-06, "loss": 0.0292, "num_input_tokens_seen": 482952, "step": 1200 }, { "epoch": 1.2525987525987525, "grad_norm": 18.63437271118164, "learning_rate": 4.663701523284291e-06, "loss": 0.0622, "num_input_tokens_seen": 485192, "step": 1205 }, { "epoch": 1.2525987525987525, "eval_loss": 0.26757940649986267, "eval_runtime": 1.0561, "eval_samples_per_second": 810.538, "eval_steps_per_second": 101.317, "num_input_tokens_seen": 485192, "step": 1205 }, { "epoch": 1.2577962577962578, "grad_norm": 0.11255653202533722, "learning_rate": 4.659143061253152e-06, "loss": 0.1299, "num_input_tokens_seen": 487112, "step": 1210 }, { "epoch": 1.262993762993763, "grad_norm": 38.90354537963867, "learning_rate": 4.654556171273196e-06, "loss": 0.2685, "num_input_tokens_seen": 489160, "step": 1215 }, { "epoch": 1.2681912681912682, "grad_norm": 0.6540949940681458, "learning_rate": 4.649940913736841e-06, "loss": 0.2017, "num_input_tokens_seen": 491080, "step": 1220 }, { "epoch": 1.2733887733887734, "grad_norm": 0.8767725825309753, "learning_rate": 4.645297349410005e-06, "loss": 0.0607, "num_input_tokens_seen": 493064, "step": 1225 }, { "epoch": 1.2785862785862787, "grad_norm": 1.3416281938552856, "learning_rate": 4.640625539431298e-06, "loss": 0.1537, "num_input_tokens_seen": 494984, "step": 1230 }, { "epoch": 1.2837837837837838, "grad_norm": 11.072347640991211, "learning_rate": 4.635925545311224e-06, "loss": 0.2946, "num_input_tokens_seen": 496968, "step": 1235 }, { "epoch": 1.288981288981289, "grad_norm": 5.309337615966797, "learning_rate": 4.631197428931365e-06, "loss": 0.0799, "num_input_tokens_seen": 498824, "step": 1240 }, { "epoch": 1.2941787941787941, "grad_norm": 42.28898239135742, "learning_rate": 4.626441252543572e-06, "loss": 0.0804, "num_input_tokens_seen": 500808, "step": 1245 }, { "epoch": 1.2993762993762994, "grad_norm": 99.69770812988281, "learning_rate": 4.621657078769143e-06, "loss": 0.251, "num_input_tokens_seen": 502856, "step": 1250 }, { "epoch": 1.3045738045738045, "grad_norm": 48.22378921508789, "learning_rate": 4.616844970597996e-06, "loss": 0.0856, "num_input_tokens_seen": 504712, "step": 1255 }, { "epoch": 1.3097713097713097, "grad_norm": 22.802143096923828, "learning_rate": 4.612004991387843e-06, "loss": 0.3719, "num_input_tokens_seen": 506696, "step": 1260 }, { "epoch": 1.314968814968815, "grad_norm": 20.26570701599121, "learning_rate": 4.607137204863356e-06, "loss": 0.0936, "num_input_tokens_seen": 508680, "step": 1265 }, { "epoch": 1.32016632016632, "grad_norm": 4.629741191864014, "learning_rate": 4.602241675115326e-06, "loss": 0.1072, "num_input_tokens_seen": 510728, "step": 1270 }, { "epoch": 1.3253638253638254, "grad_norm": 2.296597957611084, "learning_rate": 4.597318466599819e-06, "loss": 0.0841, "num_input_tokens_seen": 512712, "step": 1275 }, { "epoch": 1.3305613305613306, "grad_norm": 0.3281061351299286, "learning_rate": 4.592367644137329e-06, "loss": 0.1067, "num_input_tokens_seen": 514696, "step": 1280 }, { "epoch": 1.3357588357588357, "grad_norm": 28.257986068725586, "learning_rate": 4.587389272911923e-06, "loss": 0.1895, "num_input_tokens_seen": 516808, "step": 1285 }, { "epoch": 1.340956340956341, "grad_norm": 49.04169464111328, "learning_rate": 4.582383418470386e-06, "loss": 0.2118, "num_input_tokens_seen": 518792, "step": 1290 }, { "epoch": 1.3461538461538463, "grad_norm": 75.7163314819336, "learning_rate": 4.5773501467213525e-06, "loss": 0.1325, "num_input_tokens_seen": 520840, "step": 1295 }, { "epoch": 1.3513513513513513, "grad_norm": 0.3487839698791504, "learning_rate": 4.572289523934444e-06, "loss": 0.0526, "num_input_tokens_seen": 522760, "step": 1300 }, { "epoch": 1.3565488565488566, "grad_norm": 21.736961364746094, "learning_rate": 4.567201616739393e-06, "loss": 0.2152, "num_input_tokens_seen": 524872, "step": 1305 }, { "epoch": 1.3617463617463619, "grad_norm": 66.38067626953125, "learning_rate": 4.562086492125167e-06, "loss": 0.1978, "num_input_tokens_seen": 526920, "step": 1310 }, { "epoch": 1.366943866943867, "grad_norm": 11.493204116821289, "learning_rate": 4.5569442174390885e-06, "loss": 0.1374, "num_input_tokens_seen": 528968, "step": 1315 }, { "epoch": 1.3721413721413722, "grad_norm": 0.7820735573768616, "learning_rate": 4.551774860385944e-06, "loss": 0.0818, "num_input_tokens_seen": 530888, "step": 1320 }, { "epoch": 1.3773388773388773, "grad_norm": 0.8318536281585693, "learning_rate": 4.546578489027095e-06, "loss": 0.1644, "num_input_tokens_seen": 532872, "step": 1325 }, { "epoch": 1.3825363825363826, "grad_norm": 10.547067642211914, "learning_rate": 4.541355171779582e-06, "loss": 0.118, "num_input_tokens_seen": 534920, "step": 1330 }, { "epoch": 1.3877338877338876, "grad_norm": 1.9730658531188965, "learning_rate": 4.536104977415225e-06, "loss": 0.0039, "num_input_tokens_seen": 536840, "step": 1335 }, { "epoch": 1.392931392931393, "grad_norm": 21.502456665039062, "learning_rate": 4.530827975059715e-06, "loss": 0.3705, "num_input_tokens_seen": 538760, "step": 1340 }, { "epoch": 1.3981288981288982, "grad_norm": 0.3489514887332916, "learning_rate": 4.525524234191705e-06, "loss": 0.2364, "num_input_tokens_seen": 540680, "step": 1345 }, { "epoch": 1.4033264033264032, "grad_norm": 44.12948989868164, "learning_rate": 4.520193824641898e-06, "loss": 0.1405, "num_input_tokens_seen": 542664, "step": 1350 }, { "epoch": 1.4085239085239085, "grad_norm": 91.0547103881836, "learning_rate": 4.51483681659212e-06, "loss": 0.1596, "num_input_tokens_seen": 544712, "step": 1355 }, { "epoch": 1.4137214137214138, "grad_norm": 16.955787658691406, "learning_rate": 4.5094532805744075e-06, "loss": 0.2662, "num_input_tokens_seen": 546824, "step": 1360 }, { "epoch": 1.4189189189189189, "grad_norm": 24.00728416442871, "learning_rate": 4.504043287470068e-06, "loss": 0.0791, "num_input_tokens_seen": 548936, "step": 1365 }, { "epoch": 1.4241164241164241, "grad_norm": 1.3119056224822998, "learning_rate": 4.498606908508754e-06, "loss": 0.1218, "num_input_tokens_seen": 550920, "step": 1370 }, { "epoch": 1.4293139293139294, "grad_norm": 0.5111730694770813, "learning_rate": 4.493144215267519e-06, "loss": 0.0307, "num_input_tokens_seen": 552904, "step": 1375 }, { "epoch": 1.4345114345114345, "grad_norm": 5.568216800689697, "learning_rate": 4.4876552796698814e-06, "loss": 0.1616, "num_input_tokens_seen": 554824, "step": 1380 }, { "epoch": 1.4397089397089398, "grad_norm": 21.439517974853516, "learning_rate": 4.482140173984875e-06, "loss": 0.214, "num_input_tokens_seen": 556872, "step": 1385 }, { "epoch": 1.444906444906445, "grad_norm": 4.337334156036377, "learning_rate": 4.476598970826093e-06, "loss": 0.1453, "num_input_tokens_seen": 558984, "step": 1390 }, { "epoch": 1.45010395010395, "grad_norm": 15.975934982299805, "learning_rate": 4.471031743150744e-06, "loss": 0.2061, "num_input_tokens_seen": 560968, "step": 1395 }, { "epoch": 1.4553014553014554, "grad_norm": 65.18135070800781, "learning_rate": 4.465438564258673e-06, "loss": 0.2358, "num_input_tokens_seen": 562952, "step": 1400 }, { "epoch": 1.4604989604989604, "grad_norm": 0.4641796052455902, "learning_rate": 4.459819507791415e-06, "loss": 0.0357, "num_input_tokens_seen": 565064, "step": 1405 }, { "epoch": 1.4656964656964657, "grad_norm": 0.4620436728000641, "learning_rate": 4.454174647731213e-06, "loss": 0.1194, "num_input_tokens_seen": 567112, "step": 1410 }, { "epoch": 1.4708939708939708, "grad_norm": 0.40077999234199524, "learning_rate": 4.448504058400052e-06, "loss": 0.2261, "num_input_tokens_seen": 569160, "step": 1415 }, { "epoch": 1.476091476091476, "grad_norm": 31.147504806518555, "learning_rate": 4.4428078144586715e-06, "loss": 0.1794, "num_input_tokens_seen": 571336, "step": 1420 }, { "epoch": 1.4812889812889813, "grad_norm": 60.33259582519531, "learning_rate": 4.437085990905591e-06, "loss": 0.2622, "num_input_tokens_seen": 573384, "step": 1425 }, { "epoch": 1.4864864864864864, "grad_norm": 42.84424591064453, "learning_rate": 4.431338663076119e-06, "loss": 0.1625, "num_input_tokens_seen": 575304, "step": 1430 }, { "epoch": 1.4916839916839917, "grad_norm": 1.8637181520462036, "learning_rate": 4.42556590664136e-06, "loss": 0.0647, "num_input_tokens_seen": 577160, "step": 1435 }, { "epoch": 1.496881496881497, "grad_norm": 34.14229965209961, "learning_rate": 4.41976779760722e-06, "loss": 0.11, "num_input_tokens_seen": 579208, "step": 1440 }, { "epoch": 1.502079002079002, "grad_norm": 48.55718994140625, "learning_rate": 4.413944412313405e-06, "loss": 0.0911, "num_input_tokens_seen": 581256, "step": 1445 }, { "epoch": 1.503118503118503, "eval_loss": 0.3145564794540405, "eval_runtime": 1.1073, "eval_samples_per_second": 773.075, "eval_steps_per_second": 96.634, "num_input_tokens_seen": 581704, "step": 1446 }, { "epoch": 1.5072765072765073, "grad_norm": 0.2164192944765091, "learning_rate": 4.408095827432416e-06, "loss": 0.1191, "num_input_tokens_seen": 583304, "step": 1450 }, { "epoch": 1.5124740124740126, "grad_norm": 50.20052719116211, "learning_rate": 4.40222211996854e-06, "loss": 0.3479, "num_input_tokens_seen": 585224, "step": 1455 }, { "epoch": 1.5176715176715176, "grad_norm": 31.401309967041016, "learning_rate": 4.396323367256836e-06, "loss": 0.2617, "num_input_tokens_seen": 587272, "step": 1460 }, { "epoch": 1.5228690228690227, "grad_norm": 32.85145568847656, "learning_rate": 4.390399646962117e-06, "loss": 0.1985, "num_input_tokens_seen": 589320, "step": 1465 }, { "epoch": 1.5280665280665282, "grad_norm": 13.456771850585938, "learning_rate": 4.384451037077924e-06, "loss": 0.1369, "num_input_tokens_seen": 591304, "step": 1470 }, { "epoch": 1.5332640332640333, "grad_norm": 1.2188056707382202, "learning_rate": 4.378477615925506e-06, "loss": 0.1433, "num_input_tokens_seen": 593224, "step": 1475 }, { "epoch": 1.5384615384615383, "grad_norm": 28.065401077270508, "learning_rate": 4.372479462152781e-06, "loss": 0.1273, "num_input_tokens_seen": 595336, "step": 1480 }, { "epoch": 1.5436590436590436, "grad_norm": 21.94362449645996, "learning_rate": 4.366456654733308e-06, "loss": 0.2715, "num_input_tokens_seen": 597256, "step": 1485 }, { "epoch": 1.5488565488565489, "grad_norm": 11.613262176513672, "learning_rate": 4.360409272965242e-06, "loss": 0.1859, "num_input_tokens_seen": 599304, "step": 1490 }, { "epoch": 1.554054054054054, "grad_norm": 3.856565237045288, "learning_rate": 4.354337396470291e-06, "loss": 0.0745, "num_input_tokens_seen": 601288, "step": 1495 }, { "epoch": 1.5592515592515592, "grad_norm": 52.13639831542969, "learning_rate": 4.348241105192668e-06, "loss": 0.1641, "num_input_tokens_seen": 603272, "step": 1500 }, { "epoch": 1.5644490644490645, "grad_norm": 4.8234357833862305, "learning_rate": 4.34212047939804e-06, "loss": 0.1365, "num_input_tokens_seen": 605256, "step": 1505 }, { "epoch": 1.5696465696465696, "grad_norm": 17.75581169128418, "learning_rate": 4.335975599672469e-06, "loss": 0.0868, "num_input_tokens_seen": 607304, "step": 1510 }, { "epoch": 1.5748440748440748, "grad_norm": 6.615152835845947, "learning_rate": 4.329806546921354e-06, "loss": 0.1281, "num_input_tokens_seen": 609224, "step": 1515 }, { "epoch": 1.5800415800415801, "grad_norm": 29.3903865814209, "learning_rate": 4.3236134023683565e-06, "loss": 0.0465, "num_input_tokens_seen": 611336, "step": 1520 }, { "epoch": 1.5852390852390852, "grad_norm": 66.3274154663086, "learning_rate": 4.3173962475543475e-06, "loss": 0.1156, "num_input_tokens_seen": 613320, "step": 1525 }, { "epoch": 1.5904365904365905, "grad_norm": 45.9361686706543, "learning_rate": 4.311155164336318e-06, "loss": 0.2405, "num_input_tokens_seen": 615176, "step": 1530 }, { "epoch": 1.5956340956340958, "grad_norm": 15.004530906677246, "learning_rate": 4.3048902348863116e-06, "loss": 0.1673, "num_input_tokens_seen": 617224, "step": 1535 }, { "epoch": 1.6008316008316008, "grad_norm": 46.72109603881836, "learning_rate": 4.298601541690336e-06, "loss": 0.1683, "num_input_tokens_seen": 619208, "step": 1540 }, { "epoch": 1.6060291060291059, "grad_norm": 25.378122329711914, "learning_rate": 4.292289167547281e-06, "loss": 0.221, "num_input_tokens_seen": 621192, "step": 1545 }, { "epoch": 1.6112266112266114, "grad_norm": 15.957310676574707, "learning_rate": 4.285953195567827e-06, "loss": 0.1458, "num_input_tokens_seen": 623176, "step": 1550 }, { "epoch": 1.6164241164241164, "grad_norm": 20.942054748535156, "learning_rate": 4.279593709173352e-06, "loss": 0.246, "num_input_tokens_seen": 625160, "step": 1555 }, { "epoch": 1.6216216216216215, "grad_norm": 0.6662601232528687, "learning_rate": 4.27321079209483e-06, "loss": 0.1381, "num_input_tokens_seen": 627144, "step": 1560 }, { "epoch": 1.6268191268191268, "grad_norm": 24.978178024291992, "learning_rate": 4.266804528371732e-06, "loss": 0.1634, "num_input_tokens_seen": 629192, "step": 1565 }, { "epoch": 1.632016632016632, "grad_norm": 7.083024024963379, "learning_rate": 4.260375002350917e-06, "loss": 0.1174, "num_input_tokens_seen": 631240, "step": 1570 }, { "epoch": 1.637214137214137, "grad_norm": 20.857555389404297, "learning_rate": 4.253922298685525e-06, "loss": 0.2274, "num_input_tokens_seen": 633224, "step": 1575 }, { "epoch": 1.6424116424116424, "grad_norm": 46.30590057373047, "learning_rate": 4.2474465023338586e-06, "loss": 0.1367, "num_input_tokens_seen": 635208, "step": 1580 }, { "epoch": 1.6476091476091477, "grad_norm": 0.8250938057899475, "learning_rate": 4.2409476985582645e-06, "loss": 0.1048, "num_input_tokens_seen": 637256, "step": 1585 }, { "epoch": 1.6528066528066527, "grad_norm": 2.617978572845459, "learning_rate": 4.234425972924014e-06, "loss": 0.0156, "num_input_tokens_seen": 639176, "step": 1590 }, { "epoch": 1.658004158004158, "grad_norm": 0.5611851811408997, "learning_rate": 4.227881411298175e-06, "loss": 0.1551, "num_input_tokens_seen": 641224, "step": 1595 }, { "epoch": 1.6632016632016633, "grad_norm": 0.4693892002105713, "learning_rate": 4.221314099848481e-06, "loss": 0.1125, "num_input_tokens_seen": 643144, "step": 1600 }, { "epoch": 1.6683991683991684, "grad_norm": 38.350101470947266, "learning_rate": 4.214724125042195e-06, "loss": 0.1457, "num_input_tokens_seen": 644936, "step": 1605 }, { "epoch": 1.6735966735966736, "grad_norm": 36.42043685913086, "learning_rate": 4.208111573644975e-06, "loss": 0.1623, "num_input_tokens_seen": 646984, "step": 1610 }, { "epoch": 1.678794178794179, "grad_norm": 0.15917447209358215, "learning_rate": 4.2014765327197285e-06, "loss": 0.2052, "num_input_tokens_seen": 649032, "step": 1615 }, { "epoch": 1.683991683991684, "grad_norm": 25.833524703979492, "learning_rate": 4.194819089625466e-06, "loss": 0.2047, "num_input_tokens_seen": 651080, "step": 1620 }, { "epoch": 1.689189189189189, "grad_norm": 13.379853248596191, "learning_rate": 4.188139332016154e-06, "loss": 0.2123, "num_input_tokens_seen": 653000, "step": 1625 }, { "epoch": 1.6943866943866945, "grad_norm": 10.135590553283691, "learning_rate": 4.181437347839559e-06, "loss": 0.2089, "num_input_tokens_seen": 654920, "step": 1630 }, { "epoch": 1.6995841995841996, "grad_norm": 3.057936191558838, "learning_rate": 4.174713225336087e-06, "loss": 0.1685, "num_input_tokens_seen": 656904, "step": 1635 }, { "epoch": 1.7047817047817047, "grad_norm": 29.033493041992188, "learning_rate": 4.167967053037625e-06, "loss": 0.105, "num_input_tokens_seen": 658952, "step": 1640 }, { "epoch": 1.70997920997921, "grad_norm": 0.9139642715454102, "learning_rate": 4.161198919766375e-06, "loss": 0.0899, "num_input_tokens_seen": 660872, "step": 1645 }, { "epoch": 1.7151767151767152, "grad_norm": 37.07249069213867, "learning_rate": 4.154408914633685e-06, "loss": 0.2054, "num_input_tokens_seen": 662856, "step": 1650 }, { "epoch": 1.7203742203742203, "grad_norm": 6.607808589935303, "learning_rate": 4.147597127038873e-06, "loss": 0.2025, "num_input_tokens_seen": 664904, "step": 1655 }, { "epoch": 1.7255717255717256, "grad_norm": 20.834936141967773, "learning_rate": 4.140763646668051e-06, "loss": 0.141, "num_input_tokens_seen": 666888, "step": 1660 }, { "epoch": 1.7307692307692308, "grad_norm": 8.577420234680176, "learning_rate": 4.133908563492949e-06, "loss": 0.0252, "num_input_tokens_seen": 668936, "step": 1665 }, { "epoch": 1.735966735966736, "grad_norm": 86.40946960449219, "learning_rate": 4.12703196776972e-06, "loss": 0.2066, "num_input_tokens_seen": 670856, "step": 1670 }, { "epoch": 1.7411642411642412, "grad_norm": 30.79783058166504, "learning_rate": 4.120133950037763e-06, "loss": 0.3627, "num_input_tokens_seen": 672840, "step": 1675 }, { "epoch": 1.7463617463617465, "grad_norm": 40.20522689819336, "learning_rate": 4.113214601118523e-06, "loss": 0.2218, "num_input_tokens_seen": 674824, "step": 1680 }, { "epoch": 1.7515592515592515, "grad_norm": 26.033588409423828, "learning_rate": 4.106274012114302e-06, "loss": 0.1042, "num_input_tokens_seen": 676808, "step": 1685 }, { "epoch": 1.7536382536382535, "eval_loss": 0.2114141583442688, "eval_runtime": 1.0685, "eval_samples_per_second": 801.127, "eval_steps_per_second": 100.141, "num_input_tokens_seen": 677576, "step": 1687 }, { "epoch": 1.7567567567567568, "grad_norm": 20.88821792602539, "learning_rate": 4.099312274407049e-06, "loss": 0.1712, "num_input_tokens_seen": 678728, "step": 1690 }, { "epoch": 1.761954261954262, "grad_norm": 36.20766067504883, "learning_rate": 4.092329479657168e-06, "loss": 0.1031, "num_input_tokens_seen": 680776, "step": 1695 }, { "epoch": 1.7671517671517671, "grad_norm": 1.624861478805542, "learning_rate": 4.085325719802307e-06, "loss": 0.1288, "num_input_tokens_seen": 683016, "step": 1700 }, { "epoch": 1.7723492723492722, "grad_norm": 5.351211071014404, "learning_rate": 4.0783010870561445e-06, "loss": 0.0556, "num_input_tokens_seen": 685256, "step": 1705 }, { "epoch": 1.7775467775467777, "grad_norm": 45.75218963623047, "learning_rate": 4.07125567390718e-06, "loss": 0.3125, "num_input_tokens_seen": 687304, "step": 1710 }, { "epoch": 1.7827442827442828, "grad_norm": 26.084054946899414, "learning_rate": 4.064189573117512e-06, "loss": 0.2158, "num_input_tokens_seen": 689224, "step": 1715 }, { "epoch": 1.7879417879417878, "grad_norm": 1.3422380685806274, "learning_rate": 4.057102877721621e-06, "loss": 0.1701, "num_input_tokens_seen": 691400, "step": 1720 }, { "epoch": 1.793139293139293, "grad_norm": 14.056601524353027, "learning_rate": 4.049995681025143e-06, "loss": 0.1154, "num_input_tokens_seen": 693320, "step": 1725 }, { "epoch": 1.7983367983367984, "grad_norm": 18.775554656982422, "learning_rate": 4.0428680766036386e-06, "loss": 0.1654, "num_input_tokens_seen": 695432, "step": 1730 }, { "epoch": 1.8035343035343034, "grad_norm": 18.788192749023438, "learning_rate": 4.035720158301363e-06, "loss": 0.2169, "num_input_tokens_seen": 697544, "step": 1735 }, { "epoch": 1.8087318087318087, "grad_norm": 15.670763969421387, "learning_rate": 4.028552020230031e-06, "loss": 0.1438, "num_input_tokens_seen": 699592, "step": 1740 }, { "epoch": 1.813929313929314, "grad_norm": 27.88116455078125, "learning_rate": 4.021363756767577e-06, "loss": 0.2247, "num_input_tokens_seen": 701576, "step": 1745 }, { "epoch": 1.819126819126819, "grad_norm": 17.78391456604004, "learning_rate": 4.014155462556913e-06, "loss": 0.2586, "num_input_tokens_seen": 703688, "step": 1750 }, { "epoch": 1.8243243243243243, "grad_norm": 16.56380844116211, "learning_rate": 4.006927232504682e-06, "loss": 0.2187, "num_input_tokens_seen": 705736, "step": 1755 }, { "epoch": 1.8295218295218296, "grad_norm": 6.663086414337158, "learning_rate": 3.999679161780006e-06, "loss": 0.043, "num_input_tokens_seen": 707720, "step": 1760 }, { "epoch": 1.8347193347193347, "grad_norm": 0.7404634952545166, "learning_rate": 3.99241134581324e-06, "loss": 0.08, "num_input_tokens_seen": 709896, "step": 1765 }, { "epoch": 1.83991683991684, "grad_norm": 39.46183395385742, "learning_rate": 3.985123880294708e-06, "loss": 0.1669, "num_input_tokens_seen": 711944, "step": 1770 }, { "epoch": 1.8451143451143452, "grad_norm": 7.113590717315674, "learning_rate": 3.977816861173446e-06, "loss": 0.1912, "num_input_tokens_seen": 713992, "step": 1775 }, { "epoch": 1.8503118503118503, "grad_norm": 48.9863395690918, "learning_rate": 3.970490384655939e-06, "loss": 0.1846, "num_input_tokens_seen": 715976, "step": 1780 }, { "epoch": 1.8555093555093554, "grad_norm": 31.94883918762207, "learning_rate": 3.963144547204856e-06, "loss": 0.105, "num_input_tokens_seen": 718024, "step": 1785 }, { "epoch": 1.8607068607068609, "grad_norm": 11.180656433105469, "learning_rate": 3.955779445537776e-06, "loss": 0.2342, "num_input_tokens_seen": 720072, "step": 1790 }, { "epoch": 1.865904365904366, "grad_norm": 5.974298000335693, "learning_rate": 3.948395176625918e-06, "loss": 0.2314, "num_input_tokens_seen": 722120, "step": 1795 }, { "epoch": 1.871101871101871, "grad_norm": 3.235103130340576, "learning_rate": 3.940991837692861e-06, "loss": 0.1187, "num_input_tokens_seen": 724168, "step": 1800 }, { "epoch": 1.8762993762993763, "grad_norm": 25.299224853515625, "learning_rate": 3.933569526213268e-06, "loss": 0.1292, "num_input_tokens_seen": 726280, "step": 1805 }, { "epoch": 1.8814968814968815, "grad_norm": 2.7856338024139404, "learning_rate": 3.926128339911599e-06, "loss": 0.0843, "num_input_tokens_seen": 728264, "step": 1810 }, { "epoch": 1.8866943866943866, "grad_norm": 0.9576424360275269, "learning_rate": 3.918668376760827e-06, "loss": 0.1791, "num_input_tokens_seen": 730312, "step": 1815 }, { "epoch": 1.8918918918918919, "grad_norm": 55.718650817871094, "learning_rate": 3.9111897349811455e-06, "loss": 0.1365, "num_input_tokens_seen": 732296, "step": 1820 }, { "epoch": 1.8970893970893972, "grad_norm": 20.79973793029785, "learning_rate": 3.903692513038677e-06, "loss": 0.1369, "num_input_tokens_seen": 734088, "step": 1825 }, { "epoch": 1.9022869022869022, "grad_norm": 51.692867279052734, "learning_rate": 3.896176809644178e-06, "loss": 0.2305, "num_input_tokens_seen": 736072, "step": 1830 }, { "epoch": 1.9074844074844075, "grad_norm": 3.8833014965057373, "learning_rate": 3.8886427237517345e-06, "loss": 0.2062, "num_input_tokens_seen": 738120, "step": 1835 }, { "epoch": 1.9126819126819128, "grad_norm": 26.46794891357422, "learning_rate": 3.881090354557463e-06, "loss": 0.2077, "num_input_tokens_seen": 740168, "step": 1840 }, { "epoch": 1.9178794178794178, "grad_norm": 11.211897850036621, "learning_rate": 3.8735198014982066e-06, "loss": 0.1425, "num_input_tokens_seen": 742280, "step": 1845 }, { "epoch": 1.9230769230769231, "grad_norm": 26.17072296142578, "learning_rate": 3.865931164250219e-06, "loss": 0.0702, "num_input_tokens_seen": 744328, "step": 1850 }, { "epoch": 1.9282744282744284, "grad_norm": 13.180768966674805, "learning_rate": 3.858324542727859e-06, "loss": 0.1732, "num_input_tokens_seen": 746440, "step": 1855 }, { "epoch": 1.9334719334719335, "grad_norm": 41.96379470825195, "learning_rate": 3.8507000370822675e-06, "loss": 0.1543, "num_input_tokens_seen": 748488, "step": 1860 }, { "epoch": 1.9386694386694385, "grad_norm": 14.179240226745605, "learning_rate": 3.84305774770006e-06, "loss": 0.1298, "num_input_tokens_seen": 750344, "step": 1865 }, { "epoch": 1.943866943866944, "grad_norm": 10.15986442565918, "learning_rate": 3.835397775201991e-06, "loss": 0.0507, "num_input_tokens_seen": 752328, "step": 1870 }, { "epoch": 1.949064449064449, "grad_norm": 14.734403610229492, "learning_rate": 3.827720220441642e-06, "loss": 0.2625, "num_input_tokens_seen": 754312, "step": 1875 }, { "epoch": 1.9542619542619541, "grad_norm": 38.59032440185547, "learning_rate": 3.820025184504085e-06, "loss": 0.4145, "num_input_tokens_seen": 756232, "step": 1880 }, { "epoch": 1.9594594594594594, "grad_norm": 22.36400604248047, "learning_rate": 3.812312768704557e-06, "loss": 0.2626, "num_input_tokens_seen": 758280, "step": 1885 }, { "epoch": 1.9646569646569647, "grad_norm": 5.247732162475586, "learning_rate": 3.80458307458712e-06, "loss": 0.1128, "num_input_tokens_seen": 760328, "step": 1890 }, { "epoch": 1.9698544698544698, "grad_norm": 14.368209838867188, "learning_rate": 3.7968362039233315e-06, "loss": 0.1213, "num_input_tokens_seen": 762248, "step": 1895 }, { "epoch": 1.975051975051975, "grad_norm": 22.700754165649414, "learning_rate": 3.7890722587108985e-06, "loss": 0.077, "num_input_tokens_seen": 764168, "step": 1900 }, { "epoch": 1.9802494802494803, "grad_norm": 1.1314526796340942, "learning_rate": 3.7812913411723377e-06, "loss": 0.0655, "num_input_tokens_seen": 766216, "step": 1905 }, { "epoch": 1.9854469854469854, "grad_norm": 22.53055763244629, "learning_rate": 3.773493553753628e-06, "loss": 0.0962, "num_input_tokens_seen": 768264, "step": 1910 }, { "epoch": 1.9906444906444907, "grad_norm": 19.964323043823242, "learning_rate": 3.7656789991228638e-06, "loss": 0.0219, "num_input_tokens_seen": 770184, "step": 1915 }, { "epoch": 1.995841995841996, "grad_norm": 3.517256259918213, "learning_rate": 3.7578477801689e-06, "loss": 0.1279, "num_input_tokens_seen": 772168, "step": 1920 }, { "epoch": 2.001039501039501, "grad_norm": 0.14674918353557587, "learning_rate": 3.7500000000000005e-06, "loss": 0.096, "num_input_tokens_seen": 774160, "step": 1925 }, { "epoch": 2.004158004158004, "eval_loss": 0.3561875522136688, "eval_runtime": 1.0449, "eval_samples_per_second": 819.204, "eval_steps_per_second": 102.401, "num_input_tokens_seen": 775312, "step": 1928 }, { "epoch": 2.006237006237006, "grad_norm": 1.6722761392593384, "learning_rate": 3.7421357619424793e-06, "loss": 0.0698, "num_input_tokens_seen": 776144, "step": 1930 }, { "epoch": 2.0114345114345116, "grad_norm": 0.06727920472621918, "learning_rate": 3.7342551695393375e-06, "loss": 0.0941, "num_input_tokens_seen": 778128, "step": 1935 }, { "epoch": 2.0166320166320166, "grad_norm": 0.06038059666752815, "learning_rate": 3.7263583265489077e-06, "loss": 0.0863, "num_input_tokens_seen": 780176, "step": 1940 }, { "epoch": 2.0218295218295217, "grad_norm": 52.84983444213867, "learning_rate": 3.718445336943478e-06, "loss": 0.0572, "num_input_tokens_seen": 782160, "step": 1945 }, { "epoch": 2.027027027027027, "grad_norm": 35.81016540527344, "learning_rate": 3.7105163049079305e-06, "loss": 0.0675, "num_input_tokens_seen": 784208, "step": 1950 }, { "epoch": 2.0322245322245323, "grad_norm": 0.14884886145591736, "learning_rate": 3.702571334838365e-06, "loss": 0.0002, "num_input_tokens_seen": 786256, "step": 1955 }, { "epoch": 2.0374220374220373, "grad_norm": 0.03602315112948418, "learning_rate": 3.6946105313407287e-06, "loss": 0.1288, "num_input_tokens_seen": 788240, "step": 1960 }, { "epoch": 2.042619542619543, "grad_norm": 152.0932159423828, "learning_rate": 3.6866339992294347e-06, "loss": 0.1179, "num_input_tokens_seen": 790288, "step": 1965 }, { "epoch": 2.047817047817048, "grad_norm": 0.03992204740643501, "learning_rate": 3.678641843525986e-06, "loss": 0.0768, "num_input_tokens_seen": 792272, "step": 1970 }, { "epoch": 2.053014553014553, "grad_norm": 0.021054543554782867, "learning_rate": 3.670634169457587e-06, "loss": 0.0297, "num_input_tokens_seen": 794384, "step": 1975 }, { "epoch": 2.0582120582120584, "grad_norm": 0.09833300858736038, "learning_rate": 3.662611082455766e-06, "loss": 0.1305, "num_input_tokens_seen": 796368, "step": 1980 }, { "epoch": 2.0634095634095635, "grad_norm": 1.8203060626983643, "learning_rate": 3.6545726881549792e-06, "loss": 0.0029, "num_input_tokens_seen": 798480, "step": 1985 }, { "epoch": 2.0686070686070686, "grad_norm": 13.759748458862305, "learning_rate": 3.6465190923912275e-06, "loss": 0.0937, "num_input_tokens_seen": 800528, "step": 1990 }, { "epoch": 2.0738045738045736, "grad_norm": 0.23287345468997955, "learning_rate": 3.6384504012006544e-06, "loss": 0.1904, "num_input_tokens_seen": 802768, "step": 1995 }, { "epoch": 2.079002079002079, "grad_norm": 15.67501163482666, "learning_rate": 3.6303667208181576e-06, "loss": 0.1647, "num_input_tokens_seen": 804752, "step": 2000 }, { "epoch": 2.084199584199584, "grad_norm": 0.5168598294258118, "learning_rate": 3.622268157675986e-06, "loss": 0.0649, "num_input_tokens_seen": 806672, "step": 2005 }, { "epoch": 2.0893970893970892, "grad_norm": 0.3060367703437805, "learning_rate": 3.614154818402339e-06, "loss": 0.0186, "num_input_tokens_seen": 808656, "step": 2010 }, { "epoch": 2.0945945945945947, "grad_norm": 0.8559133410453796, "learning_rate": 3.6060268098199656e-06, "loss": 0.0494, "num_input_tokens_seen": 810640, "step": 2015 }, { "epoch": 2.0997920997921, "grad_norm": 0.40390461683273315, "learning_rate": 3.5978842389447523e-06, "loss": 0.0657, "num_input_tokens_seen": 812688, "step": 2020 }, { "epoch": 2.104989604989605, "grad_norm": 0.16403798758983612, "learning_rate": 3.5897272129843198e-06, "loss": 0.0206, "num_input_tokens_seen": 814800, "step": 2025 }, { "epoch": 2.1101871101871104, "grad_norm": 0.8833001255989075, "learning_rate": 3.5815558393366064e-06, "loss": 0.0252, "num_input_tokens_seen": 816912, "step": 2030 }, { "epoch": 2.1153846153846154, "grad_norm": 0.044099193066358566, "learning_rate": 3.57337022558846e-06, "loss": 0.1156, "num_input_tokens_seen": 818896, "step": 2035 }, { "epoch": 2.1205821205821205, "grad_norm": 20.8973445892334, "learning_rate": 3.5651704795142137e-06, "loss": 0.0855, "num_input_tokens_seen": 820880, "step": 2040 }, { "epoch": 2.125779625779626, "grad_norm": 21.765148162841797, "learning_rate": 3.5569567090742763e-06, "loss": 0.1594, "num_input_tokens_seen": 822864, "step": 2045 }, { "epoch": 2.130977130977131, "grad_norm": 2.817866325378418, "learning_rate": 3.548729022413701e-06, "loss": 0.0265, "num_input_tokens_seen": 825040, "step": 2050 }, { "epoch": 2.136174636174636, "grad_norm": 0.0856303721666336, "learning_rate": 3.5404875278607693e-06, "loss": 0.0995, "num_input_tokens_seen": 827024, "step": 2055 }, { "epoch": 2.141372141372141, "grad_norm": 0.09817512333393097, "learning_rate": 3.5322323339255602e-06, "loss": 0.072, "num_input_tokens_seen": 829136, "step": 2060 }, { "epoch": 2.1465696465696467, "grad_norm": 4.946967601776123, "learning_rate": 3.5239635492985248e-06, "loss": 0.0483, "num_input_tokens_seen": 831184, "step": 2065 }, { "epoch": 2.1517671517671517, "grad_norm": 0.04570393264293671, "learning_rate": 3.5156812828490507e-06, "loss": 0.0007, "num_input_tokens_seen": 833168, "step": 2070 }, { "epoch": 2.156964656964657, "grad_norm": 0.031534019857645035, "learning_rate": 3.5073856436240335e-06, "loss": 0.0685, "num_input_tokens_seen": 835216, "step": 2075 }, { "epoch": 2.1621621621621623, "grad_norm": 0.12049651145935059, "learning_rate": 3.4990767408464383e-06, "loss": 0.0004, "num_input_tokens_seen": 837136, "step": 2080 }, { "epoch": 2.1673596673596673, "grad_norm": 0.0549314022064209, "learning_rate": 3.4907546839138627e-06, "loss": 0.1832, "num_input_tokens_seen": 839120, "step": 2085 }, { "epoch": 2.1725571725571724, "grad_norm": 0.11109847575426102, "learning_rate": 3.4824195823970954e-06, "loss": 0.0608, "num_input_tokens_seen": 841104, "step": 2090 }, { "epoch": 2.177754677754678, "grad_norm": 9.288878440856934, "learning_rate": 3.4740715460386732e-06, "loss": 0.0894, "num_input_tokens_seen": 843152, "step": 2095 }, { "epoch": 2.182952182952183, "grad_norm": 0.1733572781085968, "learning_rate": 3.46571068475144e-06, "loss": 0.0972, "num_input_tokens_seen": 845136, "step": 2100 }, { "epoch": 2.188149688149688, "grad_norm": 0.4034070670604706, "learning_rate": 3.457337108617094e-06, "loss": 0.1887, "num_input_tokens_seen": 847120, "step": 2105 }, { "epoch": 2.1933471933471935, "grad_norm": 26.97930908203125, "learning_rate": 3.4489509278847415e-06, "loss": 0.2052, "num_input_tokens_seen": 849168, "step": 2110 }, { "epoch": 2.1985446985446986, "grad_norm": 74.03186798095703, "learning_rate": 3.440552252969446e-06, "loss": 0.0731, "num_input_tokens_seen": 851152, "step": 2115 }, { "epoch": 2.2037422037422036, "grad_norm": 3.1255908012390137, "learning_rate": 3.432141194450772e-06, "loss": 0.0078, "num_input_tokens_seen": 853008, "step": 2120 }, { "epoch": 2.208939708939709, "grad_norm": 0.36267173290252686, "learning_rate": 3.4237178630713312e-06, "loss": 0.0651, "num_input_tokens_seen": 855120, "step": 2125 }, { "epoch": 2.214137214137214, "grad_norm": 0.01146312803030014, "learning_rate": 3.4152823697353237e-06, "loss": 0.1599, "num_input_tokens_seen": 857232, "step": 2130 }, { "epoch": 2.2193347193347193, "grad_norm": 28.126773834228516, "learning_rate": 3.4068348255070764e-06, "loss": 0.057, "num_input_tokens_seen": 859344, "step": 2135 }, { "epoch": 2.2245322245322248, "grad_norm": 15.560206413269043, "learning_rate": 3.3983753416095844e-06, "loss": 0.0868, "num_input_tokens_seen": 861328, "step": 2140 }, { "epoch": 2.22972972972973, "grad_norm": 13.887256622314453, "learning_rate": 3.3899040294230413e-06, "loss": 0.2098, "num_input_tokens_seen": 863376, "step": 2145 }, { "epoch": 2.234927234927235, "grad_norm": 0.028023365885019302, "learning_rate": 3.381421000483378e-06, "loss": 0.0096, "num_input_tokens_seen": 865424, "step": 2150 }, { "epoch": 2.24012474012474, "grad_norm": 32.4400520324707, "learning_rate": 3.37292636648079e-06, "loss": 0.0981, "num_input_tokens_seen": 867472, "step": 2155 }, { "epoch": 2.2453222453222454, "grad_norm": 72.8097915649414, "learning_rate": 3.3644202392582703e-06, "loss": 0.1542, "num_input_tokens_seen": 869584, "step": 2160 }, { "epoch": 2.2505197505197505, "grad_norm": 1.8866627216339111, "learning_rate": 3.3559027308101344e-06, "loss": 0.0094, "num_input_tokens_seen": 871568, "step": 2165 }, { "epoch": 2.2546777546777546, "eval_loss": 0.30345332622528076, "eval_runtime": 1.0686, "eval_samples_per_second": 801.043, "eval_steps_per_second": 100.13, "num_input_tokens_seen": 873104, "step": 2169 }, { "epoch": 2.2557172557172556, "grad_norm": 12.88599967956543, "learning_rate": 3.3473739532805464e-06, "loss": 0.0945, "num_input_tokens_seen": 873488, "step": 2170 }, { "epoch": 2.260914760914761, "grad_norm": 2.817438840866089, "learning_rate": 3.3388340189620427e-06, "loss": 0.1038, "num_input_tokens_seen": 875472, "step": 2175 }, { "epoch": 2.266112266112266, "grad_norm": 0.07076973468065262, "learning_rate": 3.3302830402940534e-06, "loss": 0.0275, "num_input_tokens_seen": 877392, "step": 2180 }, { "epoch": 2.271309771309771, "grad_norm": 0.515289306640625, "learning_rate": 3.3217211298614225e-06, "loss": 0.1037, "num_input_tokens_seen": 879504, "step": 2185 }, { "epoch": 2.2765072765072767, "grad_norm": 4.857708930969238, "learning_rate": 3.313148400392925e-06, "loss": 0.0551, "num_input_tokens_seen": 881360, "step": 2190 }, { "epoch": 2.2817047817047817, "grad_norm": 0.10649969428777695, "learning_rate": 3.3045649647597814e-06, "loss": 0.094, "num_input_tokens_seen": 883280, "step": 2195 }, { "epoch": 2.286902286902287, "grad_norm": 0.1783861368894577, "learning_rate": 3.2959709359741743e-06, "loss": 0.0053, "num_input_tokens_seen": 885328, "step": 2200 }, { "epoch": 2.2920997920997923, "grad_norm": 23.481298446655273, "learning_rate": 3.2873664271877588e-06, "loss": 0.0732, "num_input_tokens_seen": 887312, "step": 2205 }, { "epoch": 2.2972972972972974, "grad_norm": 54.74378967285156, "learning_rate": 3.2787515516901717e-06, "loss": 0.0574, "num_input_tokens_seen": 889296, "step": 2210 }, { "epoch": 2.3024948024948024, "grad_norm": 0.0618121400475502, "learning_rate": 3.2701264229075443e-06, "loss": 0.0007, "num_input_tokens_seen": 891408, "step": 2215 }, { "epoch": 2.3076923076923075, "grad_norm": 0.05290354788303375, "learning_rate": 3.261491154401001e-06, "loss": 0.001, "num_input_tokens_seen": 893392, "step": 2220 }, { "epoch": 2.312889812889813, "grad_norm": 21.230501174926758, "learning_rate": 3.2528458598651735e-06, "loss": 0.0047, "num_input_tokens_seen": 895440, "step": 2225 }, { "epoch": 2.318087318087318, "grad_norm": 2.141026258468628, "learning_rate": 3.2441906531266963e-06, "loss": 0.1493, "num_input_tokens_seen": 897616, "step": 2230 }, { "epoch": 2.323284823284823, "grad_norm": 35.35129165649414, "learning_rate": 3.2355256481427145e-06, "loss": 0.0359, "num_input_tokens_seen": 899536, "step": 2235 }, { "epoch": 2.3284823284823286, "grad_norm": 0.017625411972403526, "learning_rate": 3.2268509589993745e-06, "loss": 0.0408, "num_input_tokens_seen": 901648, "step": 2240 }, { "epoch": 2.3336798336798337, "grad_norm": 3.886178731918335, "learning_rate": 3.218166699910332e-06, "loss": 0.1054, "num_input_tokens_seen": 903696, "step": 2245 }, { "epoch": 2.3388773388773387, "grad_norm": 27.935588836669922, "learning_rate": 3.209472985215243e-06, "loss": 0.1455, "num_input_tokens_seen": 905552, "step": 2250 }, { "epoch": 2.3440748440748442, "grad_norm": 35.09407043457031, "learning_rate": 3.2007699293782557e-06, "loss": 0.0118, "num_input_tokens_seen": 907472, "step": 2255 }, { "epoch": 2.3492723492723493, "grad_norm": 58.58681869506836, "learning_rate": 3.1920576469865115e-06, "loss": 0.1043, "num_input_tokens_seen": 909584, "step": 2260 }, { "epoch": 2.3544698544698544, "grad_norm": 45.3444938659668, "learning_rate": 3.183336252748627e-06, "loss": 0.0544, "num_input_tokens_seen": 911632, "step": 2265 }, { "epoch": 2.35966735966736, "grad_norm": 0.2568526566028595, "learning_rate": 3.1746058614931918e-06, "loss": 0.0396, "num_input_tokens_seen": 913616, "step": 2270 }, { "epoch": 2.364864864864865, "grad_norm": 55.77798080444336, "learning_rate": 3.16586658816725e-06, "loss": 0.0559, "num_input_tokens_seen": 915728, "step": 2275 }, { "epoch": 2.37006237006237, "grad_norm": 0.06812699884176254, "learning_rate": 3.157118547834793e-06, "loss": 0.1154, "num_input_tokens_seen": 917776, "step": 2280 }, { "epoch": 2.375259875259875, "grad_norm": 0.2111903578042984, "learning_rate": 3.1483618556752373e-06, "loss": 0.1803, "num_input_tokens_seen": 919952, "step": 2285 }, { "epoch": 2.3804573804573805, "grad_norm": 0.02562599442899227, "learning_rate": 3.139596626981916e-06, "loss": 0.0648, "num_input_tokens_seen": 921872, "step": 2290 }, { "epoch": 2.3856548856548856, "grad_norm": 0.2788306176662445, "learning_rate": 3.1308229771605546e-06, "loss": 0.1079, "num_input_tokens_seen": 923856, "step": 2295 }, { "epoch": 2.390852390852391, "grad_norm": 10.78072738647461, "learning_rate": 3.1220410217277546e-06, "loss": 0.1516, "num_input_tokens_seen": 925968, "step": 2300 }, { "epoch": 2.396049896049896, "grad_norm": 3.1442511081695557, "learning_rate": 3.1132508763094715e-06, "loss": 0.0496, "num_input_tokens_seen": 927888, "step": 2305 }, { "epoch": 2.401247401247401, "grad_norm": 0.10060002654790878, "learning_rate": 3.1044526566394924e-06, "loss": 0.0691, "num_input_tokens_seen": 929808, "step": 2310 }, { "epoch": 2.4064449064449063, "grad_norm": 0.07174642384052277, "learning_rate": 3.0956464785579125e-06, "loss": 0.0009, "num_input_tokens_seen": 931728, "step": 2315 }, { "epoch": 2.4116424116424118, "grad_norm": 0.3574046790599823, "learning_rate": 3.0868324580096113e-06, "loss": 0.0309, "num_input_tokens_seen": 933840, "step": 2320 }, { "epoch": 2.416839916839917, "grad_norm": 0.8769842982292175, "learning_rate": 3.078010711042723e-06, "loss": 0.1115, "num_input_tokens_seen": 935824, "step": 2325 }, { "epoch": 2.422037422037422, "grad_norm": 0.044372253119945526, "learning_rate": 3.069181353807111e-06, "loss": 0.043, "num_input_tokens_seen": 937872, "step": 2330 }, { "epoch": 2.4272349272349274, "grad_norm": 0.3668450713157654, "learning_rate": 3.0603445025528377e-06, "loss": 0.098, "num_input_tokens_seen": 939984, "step": 2335 }, { "epoch": 2.4324324324324325, "grad_norm": 0.3943478465080261, "learning_rate": 3.051500273628633e-06, "loss": 0.0482, "num_input_tokens_seen": 941968, "step": 2340 }, { "epoch": 2.4376299376299375, "grad_norm": 40.15293502807617, "learning_rate": 3.042648783480366e-06, "loss": 0.0265, "num_input_tokens_seen": 943952, "step": 2345 }, { "epoch": 2.442827442827443, "grad_norm": 0.6149891018867493, "learning_rate": 3.0337901486495073e-06, "loss": 0.0727, "num_input_tokens_seen": 945872, "step": 2350 }, { "epoch": 2.448024948024948, "grad_norm": 0.031426846981048584, "learning_rate": 3.0249244857715977e-06, "loss": 0.1045, "num_input_tokens_seen": 947856, "step": 2355 }, { "epoch": 2.453222453222453, "grad_norm": 0.029239047318696976, "learning_rate": 3.01605191157471e-06, "loss": 0.0835, "num_input_tokens_seen": 949840, "step": 2360 }, { "epoch": 2.4584199584199586, "grad_norm": 43.117408752441406, "learning_rate": 3.0071725428779152e-06, "loss": 0.0307, "num_input_tokens_seen": 951760, "step": 2365 }, { "epoch": 2.4636174636174637, "grad_norm": 5.0166168212890625, "learning_rate": 2.9982864965897423e-06, "loss": 0.0294, "num_input_tokens_seen": 953680, "step": 2370 }, { "epoch": 2.4688149688149688, "grad_norm": 0.023427043110132217, "learning_rate": 2.9893938897066392e-06, "loss": 0.0349, "num_input_tokens_seen": 955600, "step": 2375 }, { "epoch": 2.474012474012474, "grad_norm": 0.23864233493804932, "learning_rate": 2.9804948393114325e-06, "loss": 0.2071, "num_input_tokens_seen": 957456, "step": 2380 }, { "epoch": 2.4792099792099793, "grad_norm": 0.4926133453845978, "learning_rate": 2.9715894625717868e-06, "loss": 0.0055, "num_input_tokens_seen": 959504, "step": 2385 }, { "epoch": 2.4844074844074844, "grad_norm": 0.32122310996055603, "learning_rate": 2.9626778767386604e-06, "loss": 0.0277, "num_input_tokens_seen": 961488, "step": 2390 }, { "epoch": 2.4896049896049894, "grad_norm": 0.13178545236587524, "learning_rate": 2.953760199144764e-06, "loss": 0.1288, "num_input_tokens_seen": 963408, "step": 2395 }, { "epoch": 2.494802494802495, "grad_norm": 0.05135256052017212, "learning_rate": 2.9448365472030116e-06, "loss": 0.0595, "num_input_tokens_seen": 965392, "step": 2400 }, { "epoch": 2.5, "grad_norm": 0.09723832458257675, "learning_rate": 2.935907038404981e-06, "loss": 0.0664, "num_input_tokens_seen": 967440, "step": 2405 }, { "epoch": 2.505197505197505, "grad_norm": 90.10528564453125, "learning_rate": 2.9269717903193603e-06, "loss": 0.0894, "num_input_tokens_seen": 969360, "step": 2410 }, { "epoch": 2.505197505197505, "eval_loss": 0.3648892641067505, "eval_runtime": 1.0725, "eval_samples_per_second": 798.114, "eval_steps_per_second": 99.764, "num_input_tokens_seen": 969360, "step": 2410 }, { "epoch": 2.51039501039501, "grad_norm": 116.97930908203125, "learning_rate": 2.918030920590403e-06, "loss": 0.0082, "num_input_tokens_seen": 971472, "step": 2415 }, { "epoch": 2.5155925155925156, "grad_norm": 0.011047163046896458, "learning_rate": 2.9090845469363804e-06, "loss": 0.0006, "num_input_tokens_seen": 973456, "step": 2420 }, { "epoch": 2.5207900207900207, "grad_norm": 0.1614125818014145, "learning_rate": 2.9001327871480296e-06, "loss": 0.0004, "num_input_tokens_seen": 975504, "step": 2425 }, { "epoch": 2.525987525987526, "grad_norm": 0.01074185874313116, "learning_rate": 2.8911757590870028e-06, "loss": 0.0019, "num_input_tokens_seen": 977552, "step": 2430 }, { "epoch": 2.5311850311850312, "grad_norm": 173.61000061035156, "learning_rate": 2.8822135806843156e-06, "loss": 0.1355, "num_input_tokens_seen": 979536, "step": 2435 }, { "epoch": 2.5363825363825363, "grad_norm": 0.009233055636286736, "learning_rate": 2.873246369938797e-06, "loss": 0.084, "num_input_tokens_seen": 981584, "step": 2440 }, { "epoch": 2.5415800415800414, "grad_norm": 3.7363264560699463, "learning_rate": 2.8642742449155287e-06, "loss": 0.0365, "num_input_tokens_seen": 983632, "step": 2445 }, { "epoch": 2.546777546777547, "grad_norm": 13.669214248657227, "learning_rate": 2.855297323744301e-06, "loss": 0.1776, "num_input_tokens_seen": 985680, "step": 2450 }, { "epoch": 2.551975051975052, "grad_norm": 17.678695678710938, "learning_rate": 2.8463157246180465e-06, "loss": 0.0731, "num_input_tokens_seen": 987664, "step": 2455 }, { "epoch": 2.5571725571725574, "grad_norm": 0.007595015689730644, "learning_rate": 2.8373295657912947e-06, "loss": 0.0002, "num_input_tokens_seen": 989648, "step": 2460 }, { "epoch": 2.5623700623700625, "grad_norm": 0.08272235840559006, "learning_rate": 2.828338965578603e-06, "loss": 0.0005, "num_input_tokens_seen": 991696, "step": 2465 }, { "epoch": 2.5675675675675675, "grad_norm": 14.857572555541992, "learning_rate": 2.8193440423530117e-06, "loss": 0.2142, "num_input_tokens_seen": 993616, "step": 2470 }, { "epoch": 2.5727650727650726, "grad_norm": 155.0998077392578, "learning_rate": 2.810344914544475e-06, "loss": 0.0503, "num_input_tokens_seen": 995664, "step": 2475 }, { "epoch": 2.577962577962578, "grad_norm": 0.12371411919593811, "learning_rate": 2.8013417006383078e-06, "loss": 0.1017, "num_input_tokens_seen": 997648, "step": 2480 }, { "epoch": 2.583160083160083, "grad_norm": 0.05877931788563728, "learning_rate": 2.792334519173624e-06, "loss": 0.0796, "num_input_tokens_seen": 999696, "step": 2485 }, { "epoch": 2.5883575883575882, "grad_norm": 0.09183234721422195, "learning_rate": 2.7833234887417745e-06, "loss": 0.1002, "num_input_tokens_seen": 1001680, "step": 2490 }, { "epoch": 2.5935550935550937, "grad_norm": 0.1903308629989624, "learning_rate": 2.774308727984787e-06, "loss": 0.0836, "num_input_tokens_seen": 1003728, "step": 2495 }, { "epoch": 2.598752598752599, "grad_norm": 0.0603751540184021, "learning_rate": 2.7652903555938047e-06, "loss": 0.0495, "num_input_tokens_seen": 1005584, "step": 2500 }, { "epoch": 2.603950103950104, "grad_norm": 0.11998436599969864, "learning_rate": 2.756268490307524e-06, "loss": 0.061, "num_input_tokens_seen": 1007696, "step": 2505 }, { "epoch": 2.609147609147609, "grad_norm": 15.387810707092285, "learning_rate": 2.747243250910625e-06, "loss": 0.2945, "num_input_tokens_seen": 1009680, "step": 2510 }, { "epoch": 2.6143451143451144, "grad_norm": 0.8963765501976013, "learning_rate": 2.7382147562322175e-06, "loss": 0.0414, "num_input_tokens_seen": 1011728, "step": 2515 }, { "epoch": 2.6195426195426195, "grad_norm": 0.18006020784378052, "learning_rate": 2.729183125144269e-06, "loss": 0.0023, "num_input_tokens_seen": 1013840, "step": 2520 }, { "epoch": 2.624740124740125, "grad_norm": 33.4202880859375, "learning_rate": 2.7201484765600426e-06, "loss": 0.1403, "num_input_tokens_seen": 1015824, "step": 2525 }, { "epoch": 2.62993762993763, "grad_norm": 0.056018222123384476, "learning_rate": 2.71111092943253e-06, "loss": 0.1792, "num_input_tokens_seen": 1017744, "step": 2530 }, { "epoch": 2.635135135135135, "grad_norm": 0.27936848998069763, "learning_rate": 2.702070602752887e-06, "loss": 0.0616, "num_input_tokens_seen": 1019728, "step": 2535 }, { "epoch": 2.64033264033264, "grad_norm": 0.3187178075313568, "learning_rate": 2.693027615548864e-06, "loss": 0.0836, "num_input_tokens_seen": 1021840, "step": 2540 }, { "epoch": 2.6455301455301456, "grad_norm": 18.821897506713867, "learning_rate": 2.6839820868832433e-06, "loss": 0.0909, "num_input_tokens_seen": 1023824, "step": 2545 }, { "epoch": 2.6507276507276507, "grad_norm": 0.6045968532562256, "learning_rate": 2.6749341358522675e-06, "loss": 0.0143, "num_input_tokens_seen": 1025616, "step": 2550 }, { "epoch": 2.6559251559251558, "grad_norm": 0.18768300116062164, "learning_rate": 2.665883881584072e-06, "loss": 0.0105, "num_input_tokens_seen": 1027664, "step": 2555 }, { "epoch": 2.6611226611226613, "grad_norm": 0.04695185646414757, "learning_rate": 2.6568314432371183e-06, "loss": 0.0167, "num_input_tokens_seen": 1029648, "step": 2560 }, { "epoch": 2.6663201663201663, "grad_norm": 0.04115242138504982, "learning_rate": 2.647776939998625e-06, "loss": 0.0354, "num_input_tokens_seen": 1031632, "step": 2565 }, { "epoch": 2.6715176715176714, "grad_norm": 0.029054885730147362, "learning_rate": 2.6387204910829954e-06, "loss": 0.0416, "num_input_tokens_seen": 1033488, "step": 2570 }, { "epoch": 2.6767151767151764, "grad_norm": 12.681103706359863, "learning_rate": 2.629662215730253e-06, "loss": 0.0011, "num_input_tokens_seen": 1035536, "step": 2575 }, { "epoch": 2.681912681912682, "grad_norm": 0.039936427026987076, "learning_rate": 2.620602233204467e-06, "loss": 0.0636, "num_input_tokens_seen": 1037584, "step": 2580 }, { "epoch": 2.687110187110187, "grad_norm": 17.539627075195312, "learning_rate": 2.6115406627921823e-06, "loss": 0.1506, "num_input_tokens_seen": 1039568, "step": 2585 }, { "epoch": 2.6923076923076925, "grad_norm": 0.6394942998886108, "learning_rate": 2.6024776238008543e-06, "loss": 0.0269, "num_input_tokens_seen": 1041616, "step": 2590 }, { "epoch": 2.6975051975051976, "grad_norm": 0.01985405571758747, "learning_rate": 2.5934132355572713e-06, "loss": 0.1038, "num_input_tokens_seen": 1043664, "step": 2595 }, { "epoch": 2.7027027027027026, "grad_norm": 12.785253524780273, "learning_rate": 2.5843476174059874e-06, "loss": 0.159, "num_input_tokens_seen": 1045520, "step": 2600 }, { "epoch": 2.7079002079002077, "grad_norm": 0.04270019382238388, "learning_rate": 2.575280888707748e-06, "loss": 0.1412, "num_input_tokens_seen": 1047376, "step": 2605 }, { "epoch": 2.713097713097713, "grad_norm": 7.434192657470703, "learning_rate": 2.5662131688379244e-06, "loss": 0.0029, "num_input_tokens_seen": 1049360, "step": 2610 }, { "epoch": 2.7182952182952183, "grad_norm": 0.07113603502511978, "learning_rate": 2.557144577184933e-06, "loss": 0.054, "num_input_tokens_seen": 1051344, "step": 2615 }, { "epoch": 2.7234927234927238, "grad_norm": 0.07835633307695389, "learning_rate": 2.5480752331486742e-06, "loss": 0.0051, "num_input_tokens_seen": 1053264, "step": 2620 }, { "epoch": 2.728690228690229, "grad_norm": 0.1012062355875969, "learning_rate": 2.539005256138948e-06, "loss": 0.0494, "num_input_tokens_seen": 1055248, "step": 2625 }, { "epoch": 2.733887733887734, "grad_norm": 87.71424865722656, "learning_rate": 2.529934765573893e-06, "loss": 0.0155, "num_input_tokens_seen": 1057104, "step": 2630 }, { "epoch": 2.739085239085239, "grad_norm": 0.09936363995075226, "learning_rate": 2.520863880878408e-06, "loss": 0.0379, "num_input_tokens_seen": 1059024, "step": 2635 }, { "epoch": 2.7442827442827444, "grad_norm": 1.4544926881790161, "learning_rate": 2.511792721482581e-06, "loss": 0.2379, "num_input_tokens_seen": 1060944, "step": 2640 }, { "epoch": 2.7494802494802495, "grad_norm": 1.8390294313430786, "learning_rate": 2.502721406820116e-06, "loss": 0.038, "num_input_tokens_seen": 1062992, "step": 2645 }, { "epoch": 2.7546777546777546, "grad_norm": 0.23238161206245422, "learning_rate": 2.493650056326763e-06, "loss": 0.0705, "num_input_tokens_seen": 1064848, "step": 2650 }, { "epoch": 2.7557172557172556, "eval_loss": 0.306118369102478, "eval_runtime": 1.0992, "eval_samples_per_second": 778.717, "eval_steps_per_second": 97.34, "num_input_tokens_seen": 1065232, "step": 2651 }, { "epoch": 2.75987525987526, "grad_norm": 1.4501862525939941, "learning_rate": 2.4845787894387427e-06, "loss": 0.2106, "num_input_tokens_seen": 1066832, "step": 2655 }, { "epoch": 2.765072765072765, "grad_norm": 0.20231160521507263, "learning_rate": 2.4755077255911746e-06, "loss": 0.0032, "num_input_tokens_seen": 1068880, "step": 2660 }, { "epoch": 2.77027027027027, "grad_norm": 12.596285820007324, "learning_rate": 2.466436984216507e-06, "loss": 0.151, "num_input_tokens_seen": 1070864, "step": 2665 }, { "epoch": 2.7754677754677752, "grad_norm": 2.4909775257110596, "learning_rate": 2.4573666847429383e-06, "loss": 0.1102, "num_input_tokens_seen": 1072848, "step": 2670 }, { "epoch": 2.7806652806652807, "grad_norm": 0.3123326301574707, "learning_rate": 2.4482969465928545e-06, "loss": 0.0628, "num_input_tokens_seen": 1074832, "step": 2675 }, { "epoch": 2.785862785862786, "grad_norm": 0.03955717012286186, "learning_rate": 2.4392278891812457e-06, "loss": 0.002, "num_input_tokens_seen": 1076944, "step": 2680 }, { "epoch": 2.7910602910602913, "grad_norm": 1.0874260663986206, "learning_rate": 2.430159631914141e-06, "loss": 0.0233, "num_input_tokens_seen": 1078800, "step": 2685 }, { "epoch": 2.7962577962577964, "grad_norm": 0.6165662407875061, "learning_rate": 2.421092294187037e-06, "loss": 0.1463, "num_input_tokens_seen": 1080912, "step": 2690 }, { "epoch": 2.8014553014553014, "grad_norm": 0.12875588238239288, "learning_rate": 2.41202599538332e-06, "loss": 0.0068, "num_input_tokens_seen": 1082960, "step": 2695 }, { "epoch": 2.8066528066528065, "grad_norm": 0.024786395952105522, "learning_rate": 2.402960854872697e-06, "loss": 0.0591, "num_input_tokens_seen": 1085008, "step": 2700 }, { "epoch": 2.811850311850312, "grad_norm": 0.05379832535982132, "learning_rate": 2.39389699200963e-06, "loss": 0.0729, "num_input_tokens_seen": 1087184, "step": 2705 }, { "epoch": 2.817047817047817, "grad_norm": 0.04001461714506149, "learning_rate": 2.3848345261317523e-06, "loss": 0.0013, "num_input_tokens_seen": 1089104, "step": 2710 }, { "epoch": 2.822245322245322, "grad_norm": 0.09780512005090714, "learning_rate": 2.3757735765583083e-06, "loss": 0.1587, "num_input_tokens_seen": 1091024, "step": 2715 }, { "epoch": 2.8274428274428276, "grad_norm": 0.06699176877737045, "learning_rate": 2.3667142625885774e-06, "loss": 0.0685, "num_input_tokens_seen": 1093008, "step": 2720 }, { "epoch": 2.8326403326403327, "grad_norm": 0.03752860054373741, "learning_rate": 2.357656703500303e-06, "loss": 0.0005, "num_input_tokens_seen": 1094992, "step": 2725 }, { "epoch": 2.8378378378378377, "grad_norm": 0.09200336039066315, "learning_rate": 2.3486010185481247e-06, "loss": 0.0003, "num_input_tokens_seen": 1097040, "step": 2730 }, { "epoch": 2.8430353430353428, "grad_norm": 0.3943188786506653, "learning_rate": 2.3395473269620055e-06, "loss": 0.1532, "num_input_tokens_seen": 1098960, "step": 2735 }, { "epoch": 2.8482328482328483, "grad_norm": 0.03474006429314613, "learning_rate": 2.330495747945665e-06, "loss": 0.0005, "num_input_tokens_seen": 1101200, "step": 2740 }, { "epoch": 2.8534303534303533, "grad_norm": 0.25783851742744446, "learning_rate": 2.321446400675005e-06, "loss": 0.1635, "num_input_tokens_seen": 1103120, "step": 2745 }, { "epoch": 2.858627858627859, "grad_norm": 27.41282081604004, "learning_rate": 2.3123994042965454e-06, "loss": 0.0648, "num_input_tokens_seen": 1105168, "step": 2750 }, { "epoch": 2.863825363825364, "grad_norm": 0.015091204084455967, "learning_rate": 2.3033548779258535e-06, "loss": 0.0463, "num_input_tokens_seen": 1107152, "step": 2755 }, { "epoch": 2.869022869022869, "grad_norm": 48.32891845703125, "learning_rate": 2.2943129406459754e-06, "loss": 0.2765, "num_input_tokens_seen": 1109200, "step": 2760 }, { "epoch": 2.874220374220374, "grad_norm": 0.029947001487016678, "learning_rate": 2.2852737115058684e-06, "loss": 0.2216, "num_input_tokens_seen": 1111248, "step": 2765 }, { "epoch": 2.8794178794178795, "grad_norm": 54.1898307800293, "learning_rate": 2.2762373095188344e-06, "loss": 0.1188, "num_input_tokens_seen": 1113232, "step": 2770 }, { "epoch": 2.8846153846153846, "grad_norm": 10.942605972290039, "learning_rate": 2.2672038536609487e-06, "loss": 0.0557, "num_input_tokens_seen": 1115216, "step": 2775 }, { "epoch": 2.88981288981289, "grad_norm": 0.1424887627363205, "learning_rate": 2.2581734628695034e-06, "loss": 0.0011, "num_input_tokens_seen": 1117264, "step": 2780 }, { "epoch": 2.895010395010395, "grad_norm": 16.02339744567871, "learning_rate": 2.2491462560414287e-06, "loss": 0.1068, "num_input_tokens_seen": 1119376, "step": 2785 }, { "epoch": 2.9002079002079, "grad_norm": 1.1276624202728271, "learning_rate": 2.2401223520317363e-06, "loss": 0.1178, "num_input_tokens_seen": 1121424, "step": 2790 }, { "epoch": 2.9054054054054053, "grad_norm": 13.925151824951172, "learning_rate": 2.2311018696519532e-06, "loss": 0.0582, "num_input_tokens_seen": 1123472, "step": 2795 }, { "epoch": 2.9106029106029108, "grad_norm": 0.04868851974606514, "learning_rate": 2.2220849276685533e-06, "loss": 0.0007, "num_input_tokens_seen": 1125584, "step": 2800 }, { "epoch": 2.915800415800416, "grad_norm": 0.11241783946752548, "learning_rate": 2.2130716448014e-06, "loss": 0.0783, "num_input_tokens_seen": 1127568, "step": 2805 }, { "epoch": 2.920997920997921, "grad_norm": 0.37742850184440613, "learning_rate": 2.2040621397221762e-06, "loss": 0.0946, "num_input_tokens_seen": 1129552, "step": 2810 }, { "epoch": 2.9261954261954264, "grad_norm": 0.31173890829086304, "learning_rate": 2.1950565310528264e-06, "loss": 0.0011, "num_input_tokens_seen": 1131472, "step": 2815 }, { "epoch": 2.9313929313929314, "grad_norm": 0.06812157481908798, "learning_rate": 2.186054937363996e-06, "loss": 0.0005, "num_input_tokens_seen": 1133392, "step": 2820 }, { "epoch": 2.9365904365904365, "grad_norm": 0.7941020727157593, "learning_rate": 2.1770574771734644e-06, "loss": 0.0004, "num_input_tokens_seen": 1135440, "step": 2825 }, { "epoch": 2.9417879417879416, "grad_norm": 23.382444381713867, "learning_rate": 2.168064268944591e-06, "loss": 0.0037, "num_input_tokens_seen": 1137424, "step": 2830 }, { "epoch": 2.946985446985447, "grad_norm": 0.1391032636165619, "learning_rate": 2.1590754310847513e-06, "loss": 0.018, "num_input_tokens_seen": 1139408, "step": 2835 }, { "epoch": 2.952182952182952, "grad_norm": 0.01357136107981205, "learning_rate": 2.150091081943777e-06, "loss": 0.1722, "num_input_tokens_seen": 1141456, "step": 2840 }, { "epoch": 2.9573804573804576, "grad_norm": 0.020888514816761017, "learning_rate": 2.141111339812405e-06, "loss": 0.1002, "num_input_tokens_seen": 1143440, "step": 2845 }, { "epoch": 2.9625779625779627, "grad_norm": 0.11883700639009476, "learning_rate": 2.1321363229207097e-06, "loss": 0.0783, "num_input_tokens_seen": 1145360, "step": 2850 }, { "epoch": 2.9677754677754677, "grad_norm": 0.4169588088989258, "learning_rate": 2.123166149436556e-06, "loss": 0.1061, "num_input_tokens_seen": 1147280, "step": 2855 }, { "epoch": 2.972972972972973, "grad_norm": 0.14435216784477234, "learning_rate": 2.114200937464035e-06, "loss": 0.1705, "num_input_tokens_seen": 1149200, "step": 2860 }, { "epoch": 2.9781704781704783, "grad_norm": 0.27636605501174927, "learning_rate": 2.1052408050419153e-06, "loss": 0.003, "num_input_tokens_seen": 1151184, "step": 2865 }, { "epoch": 2.9833679833679834, "grad_norm": 0.32042670249938965, "learning_rate": 2.0962858701420867e-06, "loss": 0.0952, "num_input_tokens_seen": 1153232, "step": 2870 }, { "epoch": 2.9885654885654884, "grad_norm": 12.104057312011719, "learning_rate": 2.087336250668006e-06, "loss": 0.1992, "num_input_tokens_seen": 1155216, "step": 2875 }, { "epoch": 2.993762993762994, "grad_norm": 0.12431977689266205, "learning_rate": 2.0783920644531443e-06, "loss": 0.1408, "num_input_tokens_seen": 1157264, "step": 2880 }, { "epoch": 2.998960498960499, "grad_norm": 0.302804559469223, "learning_rate": 2.069453429259439e-06, "loss": 0.2101, "num_input_tokens_seen": 1159312, "step": 2885 }, { "epoch": 3.004158004158004, "grad_norm": 0.18154755234718323, "learning_rate": 2.06052046277574e-06, "loss": 0.0016, "num_input_tokens_seen": 1161248, "step": 2890 }, { "epoch": 3.006237006237006, "eval_loss": 0.2698093056678772, "eval_runtime": 1.0525, "eval_samples_per_second": 813.316, "eval_steps_per_second": 101.664, "num_input_tokens_seen": 1162016, "step": 2892 }, { "epoch": 3.0093555093555096, "grad_norm": 0.132065549492836, "learning_rate": 2.051593282616262e-06, "loss": 0.0012, "num_input_tokens_seen": 1163168, "step": 2895 }, { "epoch": 3.0145530145530146, "grad_norm": 0.12736886739730835, "learning_rate": 2.0426720063190335e-06, "loss": 0.0559, "num_input_tokens_seen": 1165088, "step": 2900 }, { "epoch": 3.0197505197505197, "grad_norm": 0.15903866291046143, "learning_rate": 2.0337567513443518e-06, "loss": 0.0012, "num_input_tokens_seen": 1167136, "step": 2905 }, { "epoch": 3.024948024948025, "grad_norm": 0.06871409714221954, "learning_rate": 2.0248476350732368e-06, "loss": 0.046, "num_input_tokens_seen": 1169120, "step": 2910 }, { "epoch": 3.0301455301455302, "grad_norm": 6.423719882965088, "learning_rate": 2.0159447748058803e-06, "loss": 0.0235, "num_input_tokens_seen": 1171040, "step": 2915 }, { "epoch": 3.0353430353430353, "grad_norm": 0.08531547337770462, "learning_rate": 2.007048287760113e-06, "loss": 0.1135, "num_input_tokens_seen": 1173024, "step": 2920 }, { "epoch": 3.0405405405405403, "grad_norm": 0.07908215373754501, "learning_rate": 1.998158291069845e-06, "loss": 0.0007, "num_input_tokens_seen": 1174944, "step": 2925 }, { "epoch": 3.045738045738046, "grad_norm": 15.975863456726074, "learning_rate": 1.989274901783538e-06, "loss": 0.009, "num_input_tokens_seen": 1177056, "step": 2930 }, { "epoch": 3.050935550935551, "grad_norm": 0.03592640534043312, "learning_rate": 1.9803982368626582e-06, "loss": 0.0004, "num_input_tokens_seen": 1178976, "step": 2935 }, { "epoch": 3.056133056133056, "grad_norm": 0.09219586849212646, "learning_rate": 1.9715284131801353e-06, "loss": 0.0007, "num_input_tokens_seen": 1181024, "step": 2940 }, { "epoch": 3.0613305613305615, "grad_norm": 0.022503485903143883, "learning_rate": 1.9626655475188237e-06, "loss": 0.0003, "num_input_tokens_seen": 1183008, "step": 2945 }, { "epoch": 3.0665280665280665, "grad_norm": 0.01857338473200798, "learning_rate": 1.953809756569971e-06, "loss": 0.0003, "num_input_tokens_seen": 1185056, "step": 2950 }, { "epoch": 3.0717255717255716, "grad_norm": 0.03196537122130394, "learning_rate": 1.9449611569316716e-06, "loss": 0.0623, "num_input_tokens_seen": 1186976, "step": 2955 }, { "epoch": 3.076923076923077, "grad_norm": 0.016185106709599495, "learning_rate": 1.936119865107341e-06, "loss": 0.1065, "num_input_tokens_seen": 1188960, "step": 2960 }, { "epoch": 3.082120582120582, "grad_norm": 0.03204691782593727, "learning_rate": 1.9272859975041757e-06, "loss": 0.0002, "num_input_tokens_seen": 1190944, "step": 2965 }, { "epoch": 3.087318087318087, "grad_norm": 0.03665002062916756, "learning_rate": 1.918459670431622e-06, "loss": 0.0381, "num_input_tokens_seen": 1192928, "step": 2970 }, { "epoch": 3.0925155925155927, "grad_norm": 0.01817925274372101, "learning_rate": 1.9096410000998478e-06, "loss": 0.0045, "num_input_tokens_seen": 1194848, "step": 2975 }, { "epoch": 3.0977130977130978, "grad_norm": 0.28329595923423767, "learning_rate": 1.9008301026182064e-06, "loss": 0.0019, "num_input_tokens_seen": 1196768, "step": 2980 }, { "epoch": 3.102910602910603, "grad_norm": 0.016043463721871376, "learning_rate": 1.892027093993716e-06, "loss": 0.0002, "num_input_tokens_seen": 1198688, "step": 2985 }, { "epoch": 3.108108108108108, "grad_norm": 0.028100663796067238, "learning_rate": 1.883232090129523e-06, "loss": 0.0002, "num_input_tokens_seen": 1200672, "step": 2990 }, { "epoch": 3.1133056133056134, "grad_norm": 0.06433451920747757, "learning_rate": 1.8744452068233826e-06, "loss": 0.0713, "num_input_tokens_seen": 1202720, "step": 2995 }, { "epoch": 3.1185031185031185, "grad_norm": 0.017661597579717636, "learning_rate": 1.8656665597661334e-06, "loss": 0.0002, "num_input_tokens_seen": 1204768, "step": 3000 }, { "epoch": 3.1237006237006235, "grad_norm": 0.02003006637096405, "learning_rate": 1.8568962645401702e-06, "loss": 0.0001, "num_input_tokens_seen": 1206944, "step": 3005 }, { "epoch": 3.128898128898129, "grad_norm": 0.017040062695741653, "learning_rate": 1.8481344366179284e-06, "loss": 0.095, "num_input_tokens_seen": 1209056, "step": 3010 }, { "epoch": 3.134095634095634, "grad_norm": 0.030846811830997467, "learning_rate": 1.8393811913603583e-06, "loss": 0.0002, "num_input_tokens_seen": 1210976, "step": 3015 }, { "epoch": 3.139293139293139, "grad_norm": 0.029971648007631302, "learning_rate": 1.8306366440154067e-06, "loss": 0.0002, "num_input_tokens_seen": 1213024, "step": 3020 }, { "epoch": 3.1444906444906446, "grad_norm": 0.033417243510484695, "learning_rate": 1.8219009097165042e-06, "loss": 0.0302, "num_input_tokens_seen": 1215136, "step": 3025 }, { "epoch": 3.1496881496881497, "grad_norm": 0.3327726423740387, "learning_rate": 1.8131741034810436e-06, "loss": 0.0004, "num_input_tokens_seen": 1217056, "step": 3030 }, { "epoch": 3.1548856548856548, "grad_norm": 0.030325112864375114, "learning_rate": 1.8044563402088686e-06, "loss": 0.0003, "num_input_tokens_seen": 1219168, "step": 3035 }, { "epoch": 3.1600831600831603, "grad_norm": 0.0346427820622921, "learning_rate": 1.7957477346807622e-06, "loss": 0.0002, "num_input_tokens_seen": 1221088, "step": 3040 }, { "epoch": 3.1652806652806653, "grad_norm": 0.013777323067188263, "learning_rate": 1.7870484015569306e-06, "loss": 0.0002, "num_input_tokens_seen": 1223264, "step": 3045 }, { "epoch": 3.1704781704781704, "grad_norm": 0.006741571240127087, "learning_rate": 1.7783584553755007e-06, "loss": 0.0002, "num_input_tokens_seen": 1225440, "step": 3050 }, { "epoch": 3.175675675675676, "grad_norm": 0.1481812596321106, "learning_rate": 1.769678010551003e-06, "loss": 0.0864, "num_input_tokens_seen": 1227424, "step": 3055 }, { "epoch": 3.180873180873181, "grad_norm": 0.15245255827903748, "learning_rate": 1.7610071813728741e-06, "loss": 0.0793, "num_input_tokens_seen": 1229344, "step": 3060 }, { "epoch": 3.186070686070686, "grad_norm": 6.337021827697754, "learning_rate": 1.7523460820039466e-06, "loss": 0.0974, "num_input_tokens_seen": 1231456, "step": 3065 }, { "epoch": 3.1912681912681915, "grad_norm": 0.27218034863471985, "learning_rate": 1.7436948264789465e-06, "loss": 0.0003, "num_input_tokens_seen": 1233440, "step": 3070 }, { "epoch": 3.1964656964656966, "grad_norm": 0.02221021242439747, "learning_rate": 1.7350535287029957e-06, "loss": 0.0779, "num_input_tokens_seen": 1235552, "step": 3075 }, { "epoch": 3.2016632016632016, "grad_norm": 0.04011327400803566, "learning_rate": 1.7264223024501064e-06, "loss": 0.152, "num_input_tokens_seen": 1237536, "step": 3080 }, { "epoch": 3.2068607068607067, "grad_norm": 0.02025497704744339, "learning_rate": 1.717801261361685e-06, "loss": 0.0004, "num_input_tokens_seen": 1239584, "step": 3085 }, { "epoch": 3.212058212058212, "grad_norm": 1.1895182132720947, "learning_rate": 1.7091905189450425e-06, "loss": 0.0013, "num_input_tokens_seen": 1241504, "step": 3090 }, { "epoch": 3.2172557172557172, "grad_norm": 251.4841766357422, "learning_rate": 1.700590188571887e-06, "loss": 0.0375, "num_input_tokens_seen": 1243552, "step": 3095 }, { "epoch": 3.2224532224532223, "grad_norm": 0.022575953975319862, "learning_rate": 1.6920003834768438e-06, "loss": 0.0002, "num_input_tokens_seen": 1245600, "step": 3100 }, { "epoch": 3.227650727650728, "grad_norm": 0.019538020715117455, "learning_rate": 1.6834212167559578e-06, "loss": 0.0002, "num_input_tokens_seen": 1247712, "step": 3105 }, { "epoch": 3.232848232848233, "grad_norm": 102.0543212890625, "learning_rate": 1.6748528013652032e-06, "loss": 0.031, "num_input_tokens_seen": 1249696, "step": 3110 }, { "epoch": 3.238045738045738, "grad_norm": 0.1734342724084854, "learning_rate": 1.6662952501190032e-06, "loss": 0.0647, "num_input_tokens_seen": 1251808, "step": 3115 }, { "epoch": 3.2432432432432434, "grad_norm": 106.23870086669922, "learning_rate": 1.6577486756887376e-06, "loss": 0.0462, "num_input_tokens_seen": 1253728, "step": 3120 }, { "epoch": 3.2484407484407485, "grad_norm": 0.021452955901622772, "learning_rate": 1.6492131906012608e-06, "loss": 0.0289, "num_input_tokens_seen": 1255840, "step": 3125 }, { "epoch": 3.2536382536382535, "grad_norm": 0.07759949564933777, "learning_rate": 1.640688907237425e-06, "loss": 0.0469, "num_input_tokens_seen": 1257888, "step": 3130 }, { "epoch": 3.2567567567567566, "eval_loss": 0.36025160551071167, "eval_runtime": 1.086, "eval_samples_per_second": 788.217, "eval_steps_per_second": 98.527, "num_input_tokens_seen": 1259168, "step": 3133 }, { "epoch": 3.258835758835759, "grad_norm": 17.87605857849121, "learning_rate": 1.632175937830594e-06, "loss": 0.0767, "num_input_tokens_seen": 1259936, "step": 3135 }, { "epoch": 3.264033264033264, "grad_norm": 0.020596951246261597, "learning_rate": 1.6236743944651703e-06, "loss": 0.0504, "num_input_tokens_seen": 1262112, "step": 3140 }, { "epoch": 3.269230769230769, "grad_norm": 20.961088180541992, "learning_rate": 1.6151843890751172e-06, "loss": 0.0185, "num_input_tokens_seen": 1263904, "step": 3145 }, { "epoch": 3.274428274428274, "grad_norm": 0.048750557005405426, "learning_rate": 1.6067060334424836e-06, "loss": 0.0131, "num_input_tokens_seen": 1265952, "step": 3150 }, { "epoch": 3.2796257796257797, "grad_norm": 0.010466455481946468, "learning_rate": 1.5982394391959382e-06, "loss": 0.0002, "num_input_tokens_seen": 1267872, "step": 3155 }, { "epoch": 3.284823284823285, "grad_norm": 70.06320190429688, "learning_rate": 1.5897847178092902e-06, "loss": 0.0937, "num_input_tokens_seen": 1269792, "step": 3160 }, { "epoch": 3.29002079002079, "grad_norm": 1.3579819202423096, "learning_rate": 1.5813419806000329e-06, "loss": 0.0014, "num_input_tokens_seen": 1271776, "step": 3165 }, { "epoch": 3.2952182952182953, "grad_norm": 0.01165761612355709, "learning_rate": 1.5729113387278675e-06, "loss": 0.0785, "num_input_tokens_seen": 1273760, "step": 3170 }, { "epoch": 3.3004158004158004, "grad_norm": 0.04458131268620491, "learning_rate": 1.5644929031932455e-06, "loss": 0.1213, "num_input_tokens_seen": 1275808, "step": 3175 }, { "epoch": 3.3056133056133055, "grad_norm": 0.052004504948854446, "learning_rate": 1.556086784835908e-06, "loss": 0.0576, "num_input_tokens_seen": 1277792, "step": 3180 }, { "epoch": 3.310810810810811, "grad_norm": 0.08415602892637253, "learning_rate": 1.547693094333421e-06, "loss": 0.0004, "num_input_tokens_seen": 1279776, "step": 3185 }, { "epoch": 3.316008316008316, "grad_norm": 38.53419494628906, "learning_rate": 1.5393119421997252e-06, "loss": 0.1482, "num_input_tokens_seen": 1281760, "step": 3190 }, { "epoch": 3.321205821205821, "grad_norm": 0.08246491849422455, "learning_rate": 1.5309434387836737e-06, "loss": 0.0042, "num_input_tokens_seen": 1283744, "step": 3195 }, { "epoch": 3.3264033264033266, "grad_norm": 0.12744662165641785, "learning_rate": 1.5225876942675844e-06, "loss": 0.0005, "num_input_tokens_seen": 1285792, "step": 3200 }, { "epoch": 3.3316008316008316, "grad_norm": 19.751220703125, "learning_rate": 1.514244818665788e-06, "loss": 0.0525, "num_input_tokens_seen": 1287776, "step": 3205 }, { "epoch": 3.3367983367983367, "grad_norm": 0.04199739545583725, "learning_rate": 1.505914921823178e-06, "loss": 0.0002, "num_input_tokens_seen": 1289696, "step": 3210 }, { "epoch": 3.3419958419958418, "grad_norm": 0.19491150975227356, "learning_rate": 1.497598113413766e-06, "loss": 0.0006, "num_input_tokens_seen": 1291680, "step": 3215 }, { "epoch": 3.3471933471933473, "grad_norm": 0.019337935373187065, "learning_rate": 1.489294502939238e-06, "loss": 0.0003, "num_input_tokens_seen": 1293536, "step": 3220 }, { "epoch": 3.3523908523908523, "grad_norm": 0.01680462807416916, "learning_rate": 1.4810041997275094e-06, "loss": 0.0003, "num_input_tokens_seen": 1295712, "step": 3225 }, { "epoch": 3.357588357588358, "grad_norm": 0.025287525728344917, "learning_rate": 1.4727273129312918e-06, "loss": 0.0008, "num_input_tokens_seen": 1297760, "step": 3230 }, { "epoch": 3.362785862785863, "grad_norm": 0.015904569998383522, "learning_rate": 1.4644639515266484e-06, "loss": 0.0001, "num_input_tokens_seen": 1299808, "step": 3235 }, { "epoch": 3.367983367983368, "grad_norm": 0.02150922454893589, "learning_rate": 1.4562142243115646e-06, "loss": 0.0002, "num_input_tokens_seen": 1301920, "step": 3240 }, { "epoch": 3.373180873180873, "grad_norm": 0.014152280054986477, "learning_rate": 1.4479782399045152e-06, "loss": 0.0054, "num_input_tokens_seen": 1303904, "step": 3245 }, { "epoch": 3.3783783783783785, "grad_norm": 0.002619291888549924, "learning_rate": 1.43975610674303e-06, "loss": 0.0424, "num_input_tokens_seen": 1305888, "step": 3250 }, { "epoch": 3.3835758835758836, "grad_norm": 16.205442428588867, "learning_rate": 1.4315479330822711e-06, "loss": 0.1061, "num_input_tokens_seen": 1308064, "step": 3255 }, { "epoch": 3.3887733887733886, "grad_norm": 0.017430748790502548, "learning_rate": 1.4233538269936042e-06, "loss": 0.0016, "num_input_tokens_seen": 1310048, "step": 3260 }, { "epoch": 3.393970893970894, "grad_norm": 0.022695308551192284, "learning_rate": 1.415173896363178e-06, "loss": 0.1162, "num_input_tokens_seen": 1311968, "step": 3265 }, { "epoch": 3.399168399168399, "grad_norm": 0.009034757502377033, "learning_rate": 1.4070082488905034e-06, "loss": 0.0003, "num_input_tokens_seen": 1313888, "step": 3270 }, { "epoch": 3.4043659043659042, "grad_norm": 0.014441024512052536, "learning_rate": 1.3988569920870315e-06, "loss": 0.0648, "num_input_tokens_seen": 1316064, "step": 3275 }, { "epoch": 3.4095634095634098, "grad_norm": 0.019770730286836624, "learning_rate": 1.3907202332747454e-06, "loss": 0.0011, "num_input_tokens_seen": 1318112, "step": 3280 }, { "epoch": 3.414760914760915, "grad_norm": 0.18654315173625946, "learning_rate": 1.3825980795847401e-06, "loss": 0.0003, "num_input_tokens_seen": 1319968, "step": 3285 }, { "epoch": 3.41995841995842, "grad_norm": 0.016684433445334435, "learning_rate": 1.3744906379558165e-06, "loss": 0.038, "num_input_tokens_seen": 1322016, "step": 3290 }, { "epoch": 3.4251559251559254, "grad_norm": 0.012927900068461895, "learning_rate": 1.3663980151330734e-06, "loss": 0.0009, "num_input_tokens_seen": 1323936, "step": 3295 }, { "epoch": 3.4303534303534304, "grad_norm": 0.034205105155706406, "learning_rate": 1.358320317666496e-06, "loss": 0.0241, "num_input_tokens_seen": 1325920, "step": 3300 }, { "epoch": 3.4355509355509355, "grad_norm": 0.016045430675148964, "learning_rate": 1.350257651909562e-06, "loss": 0.0668, "num_input_tokens_seen": 1327840, "step": 3305 }, { "epoch": 3.4407484407484406, "grad_norm": 0.01693640649318695, "learning_rate": 1.3422101240178365e-06, "loss": 0.0001, "num_input_tokens_seen": 1329760, "step": 3310 }, { "epoch": 3.445945945945946, "grad_norm": 0.12216309458017349, "learning_rate": 1.3341778399475714e-06, "loss": 0.0002, "num_input_tokens_seen": 1331744, "step": 3315 }, { "epoch": 3.451143451143451, "grad_norm": 72.96051025390625, "learning_rate": 1.3261609054543178e-06, "loss": 0.0278, "num_input_tokens_seen": 1333792, "step": 3320 }, { "epoch": 3.456340956340956, "grad_norm": 0.03497939929366112, "learning_rate": 1.3181594260915263e-06, "loss": 0.0412, "num_input_tokens_seen": 1335776, "step": 3325 }, { "epoch": 3.4615384615384617, "grad_norm": 0.28867605328559875, "learning_rate": 1.3101735072091624e-06, "loss": 0.0003, "num_input_tokens_seen": 1337824, "step": 3330 }, { "epoch": 3.4667359667359667, "grad_norm": 0.010723100043833256, "learning_rate": 1.3022032539523177e-06, "loss": 0.0311, "num_input_tokens_seen": 1339872, "step": 3335 }, { "epoch": 3.471933471933472, "grad_norm": 0.06669893115758896, "learning_rate": 1.2942487712598234e-06, "loss": 0.0937, "num_input_tokens_seen": 1341920, "step": 3340 }, { "epoch": 3.4771309771309773, "grad_norm": 41.53193664550781, "learning_rate": 1.2863101638628716e-06, "loss": 0.0176, "num_input_tokens_seen": 1343904, "step": 3345 }, { "epoch": 3.4823284823284824, "grad_norm": 0.002587628783658147, "learning_rate": 1.2783875362836373e-06, "loss": 0.0738, "num_input_tokens_seen": 1345952, "step": 3350 }, { "epoch": 3.4875259875259874, "grad_norm": 0.01352632511407137, "learning_rate": 1.2704809928338957e-06, "loss": 0.0394, "num_input_tokens_seen": 1348128, "step": 3355 }, { "epoch": 3.492723492723493, "grad_norm": 0.01383188832551241, "learning_rate": 1.2625906376136582e-06, "loss": 0.0012, "num_input_tokens_seen": 1350048, "step": 3360 }, { "epoch": 3.497920997920998, "grad_norm": 0.03481636196374893, "learning_rate": 1.2547165745097927e-06, "loss": 0.1121, "num_input_tokens_seen": 1351968, "step": 3365 }, { "epoch": 3.503118503118503, "grad_norm": 0.007815233431756496, "learning_rate": 1.2468589071946632e-06, "loss": 0.0682, "num_input_tokens_seen": 1353952, "step": 3370 }, { "epoch": 3.507276507276507, "eval_loss": 0.4127735495567322, "eval_runtime": 1.0486, "eval_samples_per_second": 816.351, "eval_steps_per_second": 102.044, "num_input_tokens_seen": 1355552, "step": 3374 }, { "epoch": 3.508316008316008, "grad_norm": 0.027879195287823677, "learning_rate": 1.2390177391247616e-06, "loss": 0.0726, "num_input_tokens_seen": 1356000, "step": 3375 }, { "epoch": 3.5135135135135136, "grad_norm": 26.8095760345459, "learning_rate": 1.2311931735393417e-06, "loss": 0.1161, "num_input_tokens_seen": 1357984, "step": 3380 }, { "epoch": 3.5187110187110187, "grad_norm": 0.03682544827461243, "learning_rate": 1.2233853134590698e-06, "loss": 0.0002, "num_input_tokens_seen": 1359904, "step": 3385 }, { "epoch": 3.523908523908524, "grad_norm": 0.015444116666913033, "learning_rate": 1.2155942616846562e-06, "loss": 0.0385, "num_input_tokens_seen": 1361952, "step": 3390 }, { "epoch": 3.529106029106029, "grad_norm": 0.0405961312353611, "learning_rate": 1.2078201207955122e-06, "loss": 0.1318, "num_input_tokens_seen": 1364000, "step": 3395 }, { "epoch": 3.5343035343035343, "grad_norm": 0.05229797586798668, "learning_rate": 1.2000629931483947e-06, "loss": 0.0008, "num_input_tokens_seen": 1366112, "step": 3400 }, { "epoch": 3.5395010395010393, "grad_norm": 0.09235959500074387, "learning_rate": 1.1923229808760565e-06, "loss": 0.0016, "num_input_tokens_seen": 1368096, "step": 3405 }, { "epoch": 3.544698544698545, "grad_norm": 0.044506847858428955, "learning_rate": 1.1846001858859054e-06, "loss": 0.0661, "num_input_tokens_seen": 1370208, "step": 3410 }, { "epoch": 3.54989604989605, "grad_norm": 0.27768710255622864, "learning_rate": 1.1768947098586628e-06, "loss": 0.0004, "num_input_tokens_seen": 1372192, "step": 3415 }, { "epoch": 3.555093555093555, "grad_norm": 0.05268951505422592, "learning_rate": 1.1692066542470202e-06, "loss": 0.0171, "num_input_tokens_seen": 1374240, "step": 3420 }, { "epoch": 3.5602910602910605, "grad_norm": 0.04955555871129036, "learning_rate": 1.1615361202743088e-06, "loss": 0.0003, "num_input_tokens_seen": 1376160, "step": 3425 }, { "epoch": 3.5654885654885655, "grad_norm": 0.020622428506612778, "learning_rate": 1.1538832089331628e-06, "loss": 0.0008, "num_input_tokens_seen": 1378208, "step": 3430 }, { "epoch": 3.5706860706860706, "grad_norm": 0.04335801303386688, "learning_rate": 1.1462480209841928e-06, "loss": 0.0007, "num_input_tokens_seen": 1380192, "step": 3435 }, { "epoch": 3.5758835758835756, "grad_norm": 0.053323231637477875, "learning_rate": 1.1386306569546578e-06, "loss": 0.0491, "num_input_tokens_seen": 1382368, "step": 3440 }, { "epoch": 3.581081081081081, "grad_norm": 0.09630803763866425, "learning_rate": 1.1310312171371394e-06, "loss": 0.0002, "num_input_tokens_seen": 1384608, "step": 3445 }, { "epoch": 3.586278586278586, "grad_norm": 0.24890074133872986, "learning_rate": 1.123449801588226e-06, "loss": 0.1426, "num_input_tokens_seen": 1386592, "step": 3450 }, { "epoch": 3.5914760914760917, "grad_norm": 51.86346435546875, "learning_rate": 1.1158865101271906e-06, "loss": 0.098, "num_input_tokens_seen": 1388448, "step": 3455 }, { "epoch": 3.5966735966735968, "grad_norm": 0.017590023577213287, "learning_rate": 1.1083414423346807e-06, "loss": 0.0001, "num_input_tokens_seen": 1390560, "step": 3460 }, { "epoch": 3.601871101871102, "grad_norm": 0.01884053274989128, "learning_rate": 1.100814697551406e-06, "loss": 0.0977, "num_input_tokens_seen": 1392736, "step": 3465 }, { "epoch": 3.607068607068607, "grad_norm": 173.05203247070312, "learning_rate": 1.0933063748768254e-06, "loss": 0.1036, "num_input_tokens_seen": 1394720, "step": 3470 }, { "epoch": 3.6122661122661124, "grad_norm": 0.04371850937604904, "learning_rate": 1.0858165731678514e-06, "loss": 0.0001, "num_input_tokens_seen": 1396640, "step": 3475 }, { "epoch": 3.6174636174636174, "grad_norm": 78.75630187988281, "learning_rate": 1.0783453910375423e-06, "loss": 0.0528, "num_input_tokens_seen": 1398752, "step": 3480 }, { "epoch": 3.6226611226611225, "grad_norm": 17.215036392211914, "learning_rate": 1.0708929268538034e-06, "loss": 0.0787, "num_input_tokens_seen": 1400800, "step": 3485 }, { "epoch": 3.627858627858628, "grad_norm": 0.05456389859318733, "learning_rate": 1.0634592787380964e-06, "loss": 0.0007, "num_input_tokens_seen": 1402720, "step": 3490 }, { "epoch": 3.633056133056133, "grad_norm": 0.06369329243898392, "learning_rate": 1.0560445445641423e-06, "loss": 0.0827, "num_input_tokens_seen": 1404704, "step": 3495 }, { "epoch": 3.638253638253638, "grad_norm": 0.02703475020825863, "learning_rate": 1.048648821956637e-06, "loss": 0.0002, "num_input_tokens_seen": 1406560, "step": 3500 }, { "epoch": 3.643451143451143, "grad_norm": 0.0234812144190073, "learning_rate": 1.0412722082899647e-06, "loss": 0.0586, "num_input_tokens_seen": 1408544, "step": 3505 }, { "epoch": 3.6486486486486487, "grad_norm": 0.03321904316544533, "learning_rate": 1.033914800686912e-06, "loss": 0.0003, "num_input_tokens_seen": 1410464, "step": 3510 }, { "epoch": 3.6538461538461537, "grad_norm": 0.021713286638259888, "learning_rate": 1.0265766960173964e-06, "loss": 0.0001, "num_input_tokens_seen": 1412448, "step": 3515 }, { "epoch": 3.6590436590436592, "grad_norm": 19.148372650146484, "learning_rate": 1.019257990897185e-06, "loss": 0.042, "num_input_tokens_seen": 1414688, "step": 3520 }, { "epoch": 3.6642411642411643, "grad_norm": 13.719977378845215, "learning_rate": 1.0119587816866258e-06, "loss": 0.0036, "num_input_tokens_seen": 1416672, "step": 3525 }, { "epoch": 3.6694386694386694, "grad_norm": 0.012155055068433285, "learning_rate": 1.0046791644893757e-06, "loss": 0.0002, "num_input_tokens_seen": 1418592, "step": 3530 }, { "epoch": 3.6746361746361744, "grad_norm": 0.015267685987055302, "learning_rate": 9.97419235151137e-07, "loss": 0.0004, "num_input_tokens_seen": 1420576, "step": 3535 }, { "epoch": 3.67983367983368, "grad_norm": 0.4185558259487152, "learning_rate": 9.901790892583973e-07, "loss": 0.0005, "num_input_tokens_seen": 1422560, "step": 3540 }, { "epoch": 3.685031185031185, "grad_norm": 0.01660173013806343, "learning_rate": 9.829588221371694e-07, "loss": 0.0001, "num_input_tokens_seen": 1424608, "step": 3545 }, { "epoch": 3.6902286902286905, "grad_norm": 0.06823495030403137, "learning_rate": 9.757585288517329e-07, "loss": 0.0002, "num_input_tokens_seen": 1426784, "step": 3550 }, { "epoch": 3.6954261954261955, "grad_norm": 0.010435913689434528, "learning_rate": 9.6857830420339e-07, "loss": 0.0507, "num_input_tokens_seen": 1428896, "step": 3555 }, { "epoch": 3.7006237006237006, "grad_norm": 0.03763195872306824, "learning_rate": 9.614182427292076e-07, "loss": 0.0001, "num_input_tokens_seen": 1430880, "step": 3560 }, { "epoch": 3.7058212058212057, "grad_norm": 0.07442791014909744, "learning_rate": 9.54278438700785e-07, "loss": 0.0706, "num_input_tokens_seen": 1432864, "step": 3565 }, { "epoch": 3.711018711018711, "grad_norm": 13.558998107910156, "learning_rate": 9.471589861229999e-07, "loss": 0.0558, "num_input_tokens_seen": 1434912, "step": 3570 }, { "epoch": 3.7162162162162162, "grad_norm": 0.03634670376777649, "learning_rate": 9.400599787327774e-07, "loss": 0.0451, "num_input_tokens_seen": 1436832, "step": 3575 }, { "epoch": 3.7214137214137213, "grad_norm": 0.015272362157702446, "learning_rate": 9.329815099978567e-07, "loss": 0.0456, "num_input_tokens_seen": 1438752, "step": 3580 }, { "epoch": 3.726611226611227, "grad_norm": 0.06222844123840332, "learning_rate": 9.259236731155583e-07, "loss": 0.0002, "num_input_tokens_seen": 1440672, "step": 3585 }, { "epoch": 3.731808731808732, "grad_norm": 0.31334197521209717, "learning_rate": 9.188865610115572e-07, "loss": 0.0311, "num_input_tokens_seen": 1442784, "step": 3590 }, { "epoch": 3.737006237006237, "grad_norm": 51.054107666015625, "learning_rate": 9.118702663386583e-07, "loss": 0.0596, "num_input_tokens_seen": 1444960, "step": 3595 }, { "epoch": 3.742203742203742, "grad_norm": 33.01020431518555, "learning_rate": 9.048748814755783e-07, "loss": 0.0648, "num_input_tokens_seen": 1446880, "step": 3600 }, { "epoch": 3.7474012474012475, "grad_norm": 0.032987091690301895, "learning_rate": 8.979004985257294e-07, "loss": 0.0394, "num_input_tokens_seen": 1448992, "step": 3605 }, { "epoch": 3.7525987525987525, "grad_norm": 0.09595970064401627, "learning_rate": 8.909472093160066e-07, "loss": 0.0295, "num_input_tokens_seen": 1450976, "step": 3610 }, { "epoch": 3.757796257796258, "grad_norm": 1.557525396347046, "learning_rate": 8.840151053955773e-07, "loss": 0.0128, "num_input_tokens_seen": 1453088, "step": 3615 }, { "epoch": 3.757796257796258, "eval_loss": 0.36968719959259033, "eval_runtime": 1.2334, "eval_samples_per_second": 694.025, "eval_steps_per_second": 86.753, "num_input_tokens_seen": 1453088, "step": 3615 }, { "epoch": 3.762993762993763, "grad_norm": 0.020010627806186676, "learning_rate": 8.771042780346767e-07, "loss": 0.0014, "num_input_tokens_seen": 1455136, "step": 3620 }, { "epoch": 3.768191268191268, "grad_norm": 12.859967231750488, "learning_rate": 8.702148182234043e-07, "loss": 0.1087, "num_input_tokens_seen": 1457120, "step": 3625 }, { "epoch": 3.773388773388773, "grad_norm": 0.03449089452624321, "learning_rate": 8.633468166705336e-07, "loss": 0.0001, "num_input_tokens_seen": 1459168, "step": 3630 }, { "epoch": 3.7785862785862787, "grad_norm": 0.013074683956801891, "learning_rate": 8.565003638023065e-07, "loss": 0.0061, "num_input_tokens_seen": 1461152, "step": 3635 }, { "epoch": 3.7837837837837838, "grad_norm": 0.00507075572386384, "learning_rate": 8.496755497612491e-07, "loss": 0.0002, "num_input_tokens_seen": 1463136, "step": 3640 }, { "epoch": 3.788981288981289, "grad_norm": 0.010262695141136646, "learning_rate": 8.42872464404986e-07, "loss": 0.0001, "num_input_tokens_seen": 1465120, "step": 3645 }, { "epoch": 3.7941787941787943, "grad_norm": 4.041860103607178, "learning_rate": 8.360911973050537e-07, "loss": 0.0322, "num_input_tokens_seen": 1467104, "step": 3650 }, { "epoch": 3.7993762993762994, "grad_norm": 0.005001334939152002, "learning_rate": 8.29331837745724e-07, "loss": 0.0004, "num_input_tokens_seen": 1469152, "step": 3655 }, { "epoch": 3.8045738045738045, "grad_norm": 17.126569747924805, "learning_rate": 8.225944747228257e-07, "loss": 0.1215, "num_input_tokens_seen": 1471264, "step": 3660 }, { "epoch": 3.8097713097713095, "grad_norm": 0.0037782315630465746, "learning_rate": 8.158791969425739e-07, "loss": 0.0868, "num_input_tokens_seen": 1473248, "step": 3665 }, { "epoch": 3.814968814968815, "grad_norm": 0.027992993593215942, "learning_rate": 8.091860928204048e-07, "loss": 0.0009, "num_input_tokens_seen": 1475360, "step": 3670 }, { "epoch": 3.82016632016632, "grad_norm": 0.006942141801118851, "learning_rate": 8.025152504798078e-07, "loss": 0.0001, "num_input_tokens_seen": 1477472, "step": 3675 }, { "epoch": 3.8253638253638256, "grad_norm": 19.416587829589844, "learning_rate": 7.958667577511684e-07, "loss": 0.0912, "num_input_tokens_seen": 1479328, "step": 3680 }, { "epoch": 3.8305613305613306, "grad_norm": 0.010084366425871849, "learning_rate": 7.892407021706064e-07, "loss": 0.0447, "num_input_tokens_seen": 1481248, "step": 3685 }, { "epoch": 3.8357588357588357, "grad_norm": 0.02589116431772709, "learning_rate": 7.826371709788314e-07, "loss": 0.0001, "num_input_tokens_seen": 1483168, "step": 3690 }, { "epoch": 3.8409563409563408, "grad_norm": 0.12098560482263565, "learning_rate": 7.760562511199881e-07, "loss": 0.0007, "num_input_tokens_seen": 1485152, "step": 3695 }, { "epoch": 3.8461538461538463, "grad_norm": 0.0336734913289547, "learning_rate": 7.694980292405122e-07, "loss": 0.0407, "num_input_tokens_seen": 1487200, "step": 3700 }, { "epoch": 3.8513513513513513, "grad_norm": 0.08973251283168793, "learning_rate": 7.629625916879932e-07, "loss": 0.0294, "num_input_tokens_seen": 1489184, "step": 3705 }, { "epoch": 3.856548856548857, "grad_norm": 17.128236770629883, "learning_rate": 7.564500245100326e-07, "loss": 0.0046, "num_input_tokens_seen": 1491168, "step": 3710 }, { "epoch": 3.861746361746362, "grad_norm": 0.03917059302330017, "learning_rate": 7.49960413453115e-07, "loss": 0.0001, "num_input_tokens_seen": 1493216, "step": 3715 }, { "epoch": 3.866943866943867, "grad_norm": 0.022577917203307152, "learning_rate": 7.434938439614781e-07, "loss": 0.0738, "num_input_tokens_seen": 1495200, "step": 3720 }, { "epoch": 3.872141372141372, "grad_norm": 0.04259275645017624, "learning_rate": 7.370504011759855e-07, "loss": 0.0047, "num_input_tokens_seen": 1497184, "step": 3725 }, { "epoch": 3.8773388773388775, "grad_norm": 39.698997497558594, "learning_rate": 7.306301699330065e-07, "loss": 0.0633, "num_input_tokens_seen": 1499040, "step": 3730 }, { "epoch": 3.8825363825363826, "grad_norm": 21.861370086669922, "learning_rate": 7.242332347633052e-07, "loss": 0.0354, "num_input_tokens_seen": 1501024, "step": 3735 }, { "epoch": 3.8877338877338876, "grad_norm": 0.0236463975161314, "learning_rate": 7.17859679890916e-07, "loss": 0.042, "num_input_tokens_seen": 1503072, "step": 3740 }, { "epoch": 3.892931392931393, "grad_norm": 0.09350544959306717, "learning_rate": 7.115095892320456e-07, "loss": 0.0002, "num_input_tokens_seen": 1505248, "step": 3745 }, { "epoch": 3.898128898128898, "grad_norm": 0.004034217447042465, "learning_rate": 7.051830463939605e-07, "loss": 0.0084, "num_input_tokens_seen": 1507296, "step": 3750 }, { "epoch": 3.9033264033264032, "grad_norm": 0.026631083339452744, "learning_rate": 6.988801346738911e-07, "loss": 0.0226, "num_input_tokens_seen": 1509344, "step": 3755 }, { "epoch": 3.9085239085239083, "grad_norm": 0.008157435804605484, "learning_rate": 6.926009370579334e-07, "loss": 0.0001, "num_input_tokens_seen": 1511456, "step": 3760 }, { "epoch": 3.913721413721414, "grad_norm": 72.86700439453125, "learning_rate": 6.863455362199542e-07, "loss": 0.0235, "num_input_tokens_seen": 1513440, "step": 3765 }, { "epoch": 3.918918918918919, "grad_norm": 0.05969979614019394, "learning_rate": 6.801140145205071e-07, "loss": 0.0001, "num_input_tokens_seen": 1515488, "step": 3770 }, { "epoch": 3.9241164241164244, "grad_norm": 4.924336910247803, "learning_rate": 6.739064540057425e-07, "loss": 0.0065, "num_input_tokens_seen": 1517408, "step": 3775 }, { "epoch": 3.9293139293139294, "grad_norm": 0.07060942053794861, "learning_rate": 6.677229364063329e-07, "loss": 0.0335, "num_input_tokens_seen": 1519392, "step": 3780 }, { "epoch": 3.9345114345114345, "grad_norm": 0.025277776643633842, "learning_rate": 6.615635431363943e-07, "loss": 0.0001, "num_input_tokens_seen": 1521440, "step": 3785 }, { "epoch": 3.9397089397089395, "grad_norm": 22.37493896484375, "learning_rate": 6.554283552924118e-07, "loss": 0.0844, "num_input_tokens_seen": 1523488, "step": 3790 }, { "epoch": 3.944906444906445, "grad_norm": 0.008414591662585735, "learning_rate": 6.493174536521768e-07, "loss": 0.0001, "num_input_tokens_seen": 1525600, "step": 3795 }, { "epoch": 3.95010395010395, "grad_norm": 4.057095527648926, "learning_rate": 6.43230918673721e-07, "loss": 0.0715, "num_input_tokens_seen": 1527584, "step": 3800 }, { "epoch": 3.955301455301455, "grad_norm": 0.2397640198469162, "learning_rate": 6.371688304942544e-07, "loss": 0.0002, "num_input_tokens_seen": 1529504, "step": 3805 }, { "epoch": 3.9604989604989607, "grad_norm": 0.024253297597169876, "learning_rate": 6.311312689291166e-07, "loss": 0.0805, "num_input_tokens_seen": 1531424, "step": 3810 }, { "epoch": 3.9656964656964657, "grad_norm": 0.006427168846130371, "learning_rate": 6.251183134707183e-07, "loss": 0.0, "num_input_tokens_seen": 1533408, "step": 3815 }, { "epoch": 3.970893970893971, "grad_norm": 22.389490127563477, "learning_rate": 6.191300432875017e-07, "loss": 0.1432, "num_input_tokens_seen": 1535392, "step": 3820 }, { "epoch": 3.976091476091476, "grad_norm": 42.83168029785156, "learning_rate": 6.13166537222894e-07, "loss": 0.0178, "num_input_tokens_seen": 1537312, "step": 3825 }, { "epoch": 3.9812889812889813, "grad_norm": 34.80426788330078, "learning_rate": 6.072278737942691e-07, "loss": 0.0611, "num_input_tokens_seen": 1539360, "step": 3830 }, { "epoch": 3.9864864864864864, "grad_norm": 0.005531808827072382, "learning_rate": 6.013141311919168e-07, "loss": 0.0019, "num_input_tokens_seen": 1541280, "step": 3835 }, { "epoch": 3.991683991683992, "grad_norm": 0.09399595111608505, "learning_rate": 5.954253872780102e-07, "loss": 0.0644, "num_input_tokens_seen": 1543136, "step": 3840 }, { "epoch": 3.996881496881497, "grad_norm": 0.004355916753411293, "learning_rate": 5.895617195855827e-07, "loss": 0.1091, "num_input_tokens_seen": 1545120, "step": 3845 }, { "epoch": 4.002079002079002, "grad_norm": 0.013024209067225456, "learning_rate": 5.837232053175065e-07, "loss": 0.0001, "num_input_tokens_seen": 1547056, "step": 3850 }, { "epoch": 4.007276507276507, "grad_norm": 0.05919545143842697, "learning_rate": 5.77909921345475e-07, "loss": 0.0238, "num_input_tokens_seen": 1548976, "step": 3855 }, { "epoch": 4.008316008316008, "eval_loss": 0.3716074526309967, "eval_runtime": 1.0785, "eval_samples_per_second": 793.715, "eval_steps_per_second": 99.214, "num_input_tokens_seen": 1549360, "step": 3856 }, { "epoch": 4.012474012474012, "grad_norm": 0.22275064885616302, "learning_rate": 5.721219442089925e-07, "loss": 0.0133, "num_input_tokens_seen": 1550960, "step": 3860 }, { "epoch": 4.017671517671518, "grad_norm": 11.842212677001953, "learning_rate": 5.663593501143663e-07, "loss": 0.011, "num_input_tokens_seen": 1552944, "step": 3865 }, { "epoch": 4.022869022869023, "grad_norm": 0.035551466047763824, "learning_rate": 5.606222149337004e-07, "loss": 0.0378, "num_input_tokens_seen": 1554992, "step": 3870 }, { "epoch": 4.028066528066528, "grad_norm": 0.21466241776943207, "learning_rate": 5.549106142039018e-07, "loss": 0.0001, "num_input_tokens_seen": 1557104, "step": 3875 }, { "epoch": 4.033264033264033, "grad_norm": 0.010968453250825405, "learning_rate": 5.492246231256798e-07, "loss": 0.0008, "num_input_tokens_seen": 1559088, "step": 3880 }, { "epoch": 4.038461538461538, "grad_norm": 0.0740390494465828, "learning_rate": 5.435643165625615e-07, "loss": 0.0001, "num_input_tokens_seen": 1561008, "step": 3885 }, { "epoch": 4.043659043659043, "grad_norm": 0.03413901478052139, "learning_rate": 5.379297690399035e-07, "loss": 0.0007, "num_input_tokens_seen": 1563056, "step": 3890 }, { "epoch": 4.048856548856548, "grad_norm": 0.023828689008951187, "learning_rate": 5.323210547439089e-07, "loss": 0.0001, "num_input_tokens_seen": 1565040, "step": 3895 }, { "epoch": 4.054054054054054, "grad_norm": 0.02368989959359169, "learning_rate": 5.267382475206548e-07, "loss": 0.0001, "num_input_tokens_seen": 1567024, "step": 3900 }, { "epoch": 4.0592515592515594, "grad_norm": 0.1620592474937439, "learning_rate": 5.21181420875117e-07, "loss": 0.0003, "num_input_tokens_seen": 1569136, "step": 3905 }, { "epoch": 4.0644490644490645, "grad_norm": 0.013055311515927315, "learning_rate": 5.15650647970202e-07, "loss": 0.0001, "num_input_tokens_seen": 1571120, "step": 3910 }, { "epoch": 4.06964656964657, "grad_norm": 0.005612197332084179, "learning_rate": 5.101460016257858e-07, "loss": 0.0001, "num_input_tokens_seen": 1573040, "step": 3915 }, { "epoch": 4.074844074844075, "grad_norm": 0.016595976427197456, "learning_rate": 5.046675543177531e-07, "loss": 0.0005, "num_input_tokens_seen": 1574896, "step": 3920 }, { "epoch": 4.08004158004158, "grad_norm": 0.05645221471786499, "learning_rate": 4.992153781770448e-07, "loss": 0.0001, "num_input_tokens_seen": 1576880, "step": 3925 }, { "epoch": 4.085239085239086, "grad_norm": 0.02893124334514141, "learning_rate": 4.937895449887076e-07, "loss": 0.0001, "num_input_tokens_seen": 1578864, "step": 3930 }, { "epoch": 4.090436590436591, "grad_norm": 0.010248606093227863, "learning_rate": 4.883901261909466e-07, "loss": 0.0, "num_input_tokens_seen": 1580848, "step": 3935 }, { "epoch": 4.095634095634096, "grad_norm": 0.019447464495897293, "learning_rate": 4.830171928741901e-07, "loss": 0.0001, "num_input_tokens_seen": 1582704, "step": 3940 }, { "epoch": 4.100831600831601, "grad_norm": 0.15405897796154022, "learning_rate": 4.776708157801463e-07, "loss": 0.0008, "num_input_tokens_seen": 1584816, "step": 3945 }, { "epoch": 4.106029106029106, "grad_norm": 8.753682136535645, "learning_rate": 4.723510653008809e-07, "loss": 0.0387, "num_input_tokens_seen": 1586800, "step": 3950 }, { "epoch": 4.111226611226611, "grad_norm": 0.06123171001672745, "learning_rate": 4.6705801147788136e-07, "loss": 0.081, "num_input_tokens_seen": 1588720, "step": 3955 }, { "epoch": 4.116424116424117, "grad_norm": 0.004952425602823496, "learning_rate": 4.617917240011394e-07, "loss": 0.0001, "num_input_tokens_seen": 1590576, "step": 3960 }, { "epoch": 4.121621621621622, "grad_norm": 0.00792229175567627, "learning_rate": 4.5655227220823355e-07, "loss": 0.0001, "num_input_tokens_seen": 1592496, "step": 3965 }, { "epoch": 4.126819126819127, "grad_norm": 0.013923810794949532, "learning_rate": 4.513397250834159e-07, "loss": 0.0123, "num_input_tokens_seen": 1594544, "step": 3970 }, { "epoch": 4.132016632016632, "grad_norm": 0.029175899922847748, "learning_rate": 4.461541512567011e-07, "loss": 0.0007, "num_input_tokens_seen": 1596400, "step": 3975 }, { "epoch": 4.137214137214137, "grad_norm": 0.04299869015812874, "learning_rate": 4.409956190029674e-07, "loss": 0.0585, "num_input_tokens_seen": 1598320, "step": 3980 }, { "epoch": 4.142411642411642, "grad_norm": 36.72762680053711, "learning_rate": 4.358641962410537e-07, "loss": 0.0202, "num_input_tokens_seen": 1600368, "step": 3985 }, { "epoch": 4.147609147609147, "grad_norm": 0.005658295005559921, "learning_rate": 4.3075995053286716e-07, "loss": 0.0, "num_input_tokens_seen": 1602352, "step": 3990 }, { "epoch": 4.152806652806653, "grad_norm": 0.00978625938296318, "learning_rate": 4.2568294908249486e-07, "loss": 0.0002, "num_input_tokens_seen": 1604336, "step": 3995 }, { "epoch": 4.158004158004158, "grad_norm": 0.005897314287722111, "learning_rate": 4.2063325873531485e-07, "loss": 0.0, "num_input_tokens_seen": 1606256, "step": 4000 }, { "epoch": 4.163201663201663, "grad_norm": 0.059251993894577026, "learning_rate": 4.156109459771215e-07, "loss": 0.0001, "num_input_tokens_seen": 1608304, "step": 4005 }, { "epoch": 4.168399168399168, "grad_norm": 0.004152240231633186, "learning_rate": 4.106160769332443e-07, "loss": 0.0001, "num_input_tokens_seen": 1610480, "step": 4010 }, { "epoch": 4.173596673596673, "grad_norm": 0.047246526926755905, "learning_rate": 4.056487173676843e-07, "loss": 0.0382, "num_input_tokens_seen": 1612528, "step": 4015 }, { "epoch": 4.1787941787941785, "grad_norm": 0.026120582595467567, "learning_rate": 4.0070893268224055e-07, "loss": 0.0001, "num_input_tokens_seen": 1614576, "step": 4020 }, { "epoch": 4.183991683991684, "grad_norm": 0.012839434668421745, "learning_rate": 3.9579678791565323e-07, "loss": 0.0, "num_input_tokens_seen": 1616624, "step": 4025 }, { "epoch": 4.1891891891891895, "grad_norm": 16.232559204101562, "learning_rate": 3.9091234774274873e-07, "loss": 0.0378, "num_input_tokens_seen": 1618672, "step": 4030 }, { "epoch": 4.1943866943866945, "grad_norm": 0.0076831188052892685, "learning_rate": 3.8605567647358426e-07, "loss": 0.0029, "num_input_tokens_seen": 1620784, "step": 4035 }, { "epoch": 4.1995841995842, "grad_norm": 0.009812161326408386, "learning_rate": 3.812268380526046e-07, "loss": 0.0002, "num_input_tokens_seen": 1622768, "step": 4040 }, { "epoch": 4.204781704781705, "grad_norm": 0.12099117040634155, "learning_rate": 3.764258960577971e-07, "loss": 0.0001, "num_input_tokens_seen": 1624688, "step": 4045 }, { "epoch": 4.20997920997921, "grad_norm": 0.005353657063096762, "learning_rate": 3.7165291369985616e-07, "loss": 0.0004, "num_input_tokens_seen": 1626672, "step": 4050 }, { "epoch": 4.215176715176715, "grad_norm": 0.001504407380707562, "learning_rate": 3.6690795382135184e-07, "loss": 0.0001, "num_input_tokens_seen": 1628848, "step": 4055 }, { "epoch": 4.220374220374221, "grad_norm": 0.009774814359843731, "learning_rate": 3.6219107889590154e-07, "loss": 0.0001, "num_input_tokens_seen": 1630832, "step": 4060 }, { "epoch": 4.225571725571726, "grad_norm": 0.00985631812363863, "learning_rate": 3.575023510273462e-07, "loss": 0.0007, "num_input_tokens_seen": 1632880, "step": 4065 }, { "epoch": 4.230769230769231, "grad_norm": 0.01718440279364586, "learning_rate": 3.528418319489349e-07, "loss": 0.0001, "num_input_tokens_seen": 1634992, "step": 4070 }, { "epoch": 4.235966735966736, "grad_norm": 0.021337008103728294, "learning_rate": 3.48209583022511e-07, "loss": 0.0001, "num_input_tokens_seen": 1636912, "step": 4075 }, { "epoch": 4.241164241164241, "grad_norm": 0.03264433145523071, "learning_rate": 3.436056652377043e-07, "loss": 0.0, "num_input_tokens_seen": 1638832, "step": 4080 }, { "epoch": 4.246361746361746, "grad_norm": 0.028791099786758423, "learning_rate": 3.3903013921112753e-07, "loss": 0.056, "num_input_tokens_seen": 1641072, "step": 4085 }, { "epoch": 4.251559251559252, "grad_norm": 0.00902112852782011, "learning_rate": 3.3448306518557795e-07, "loss": 0.0001, "num_input_tokens_seen": 1642992, "step": 4090 }, { "epoch": 4.256756756756757, "grad_norm": 0.0031842426396906376, "learning_rate": 3.299645030292467e-07, "loss": 0.0, "num_input_tokens_seen": 1645040, "step": 4095 }, { "epoch": 4.258835758835759, "eval_loss": 0.4492134153842926, "eval_runtime": 1.0401, "eval_samples_per_second": 823.003, "eval_steps_per_second": 102.875, "num_input_tokens_seen": 1645808, "step": 4097 }, { "epoch": 4.261954261954262, "grad_norm": 0.008271156810224056, "learning_rate": 3.254745122349279e-07, "loss": 0.0, "num_input_tokens_seen": 1647024, "step": 4100 }, { "epoch": 4.267151767151767, "grad_norm": 0.009126213379204273, "learning_rate": 3.2101315191923667e-07, "loss": 0.0001, "num_input_tokens_seen": 1649008, "step": 4105 }, { "epoch": 4.272349272349272, "grad_norm": 0.008243863470852375, "learning_rate": 3.1658048082182926e-07, "loss": 0.0003, "num_input_tokens_seen": 1651056, "step": 4110 }, { "epoch": 4.277546777546777, "grad_norm": 0.016346026211977005, "learning_rate": 3.1217655730463094e-07, "loss": 0.0001, "num_input_tokens_seen": 1653104, "step": 4115 }, { "epoch": 4.282744282744282, "grad_norm": 0.014476928859949112, "learning_rate": 3.078014393510695e-07, "loss": 0.0001, "num_input_tokens_seen": 1655344, "step": 4120 }, { "epoch": 4.287941787941788, "grad_norm": 0.00862564891576767, "learning_rate": 3.0345518456530666e-07, "loss": 0.042, "num_input_tokens_seen": 1657392, "step": 4125 }, { "epoch": 4.293139293139293, "grad_norm": 0.011305141262710094, "learning_rate": 2.9913785017148563e-07, "loss": 0.0002, "num_input_tokens_seen": 1659312, "step": 4130 }, { "epoch": 4.298336798336798, "grad_norm": 17.00044822692871, "learning_rate": 2.9484949301297166e-07, "loss": 0.0557, "num_input_tokens_seen": 1661424, "step": 4135 }, { "epoch": 4.303534303534303, "grad_norm": 0.0021855896338820457, "learning_rate": 2.905901695516092e-07, "loss": 0.0239, "num_input_tokens_seen": 1663408, "step": 4140 }, { "epoch": 4.3087318087318085, "grad_norm": 0.005250105168670416, "learning_rate": 2.8635993586697555e-07, "loss": 0.0001, "num_input_tokens_seen": 1665328, "step": 4145 }, { "epoch": 4.313929313929314, "grad_norm": 0.02172735519707203, "learning_rate": 2.8215884765564197e-07, "loss": 0.0001, "num_input_tokens_seen": 1667312, "step": 4150 }, { "epoch": 4.3191268191268195, "grad_norm": 0.3306088447570801, "learning_rate": 2.779869602304416e-07, "loss": 0.0003, "num_input_tokens_seen": 1669296, "step": 4155 }, { "epoch": 4.324324324324325, "grad_norm": 0.0034492157865315676, "learning_rate": 2.73844328519742e-07, "loss": 0.0003, "num_input_tokens_seen": 1671280, "step": 4160 }, { "epoch": 4.32952182952183, "grad_norm": 0.3147349953651428, "learning_rate": 2.6973100706672e-07, "loss": 0.0002, "num_input_tokens_seen": 1673456, "step": 4165 }, { "epoch": 4.334719334719335, "grad_norm": 0.0011071843327954412, "learning_rate": 2.656470500286451e-07, "loss": 0.0001, "num_input_tokens_seen": 1675504, "step": 4170 }, { "epoch": 4.33991683991684, "grad_norm": 0.00639357278123498, "learning_rate": 2.615925111761647e-07, "loss": 0.0, "num_input_tokens_seen": 1677488, "step": 4175 }, { "epoch": 4.345114345114345, "grad_norm": 0.00608447939157486, "learning_rate": 2.575674438925974e-07, "loss": 0.0633, "num_input_tokens_seen": 1679536, "step": 4180 }, { "epoch": 4.350311850311851, "grad_norm": 84.28992462158203, "learning_rate": 2.535719011732321e-07, "loss": 0.0875, "num_input_tokens_seen": 1681520, "step": 4185 }, { "epoch": 4.355509355509356, "grad_norm": 13.161194801330566, "learning_rate": 2.4960593562462496e-07, "loss": 0.0372, "num_input_tokens_seen": 1683568, "step": 4190 }, { "epoch": 4.360706860706861, "grad_norm": 0.006262065842747688, "learning_rate": 2.4566959946391246e-07, "loss": 0.0001, "num_input_tokens_seen": 1685488, "step": 4195 }, { "epoch": 4.365904365904366, "grad_norm": 0.010419082827866077, "learning_rate": 2.4176294451811936e-07, "loss": 0.0341, "num_input_tokens_seen": 1687408, "step": 4200 }, { "epoch": 4.371101871101871, "grad_norm": 0.0028410113882273436, "learning_rate": 2.378860222234794e-07, "loss": 0.0001, "num_input_tokens_seen": 1689520, "step": 4205 }, { "epoch": 4.376299376299376, "grad_norm": 0.00360031402669847, "learning_rate": 2.3403888362475784e-07, "loss": 0.0003, "num_input_tokens_seen": 1691568, "step": 4210 }, { "epoch": 4.381496881496881, "grad_norm": 0.006181271746754646, "learning_rate": 2.3022157937457628e-07, "loss": 0.0, "num_input_tokens_seen": 1693616, "step": 4215 }, { "epoch": 4.386694386694387, "grad_norm": 0.014252823777496815, "learning_rate": 2.2643415973275017e-07, "loss": 0.0001, "num_input_tokens_seen": 1695600, "step": 4220 }, { "epoch": 4.391891891891892, "grad_norm": 0.008665296249091625, "learning_rate": 2.226766745656231e-07, "loss": 0.0001, "num_input_tokens_seen": 1697584, "step": 4225 }, { "epoch": 4.397089397089397, "grad_norm": 0.004776356276124716, "learning_rate": 2.1894917334541355e-07, "loss": 0.0001, "num_input_tokens_seen": 1699568, "step": 4230 }, { "epoch": 4.402286902286902, "grad_norm": 0.01509240921586752, "learning_rate": 2.15251705149562e-07, "loss": 0.0017, "num_input_tokens_seen": 1701744, "step": 4235 }, { "epoch": 4.407484407484407, "grad_norm": 0.002179246162995696, "learning_rate": 2.11584318660083e-07, "loss": 0.0, "num_input_tokens_seen": 1703600, "step": 4240 }, { "epoch": 4.412681912681912, "grad_norm": 0.01842692494392395, "learning_rate": 2.0794706216292815e-07, "loss": 0.0613, "num_input_tokens_seen": 1705712, "step": 4245 }, { "epoch": 4.417879417879418, "grad_norm": 0.007841149345040321, "learning_rate": 2.043399835473475e-07, "loss": 0.0001, "num_input_tokens_seen": 1707696, "step": 4250 }, { "epoch": 4.423076923076923, "grad_norm": 0.006627705413848162, "learning_rate": 2.0076313030525845e-07, "loss": 0.0012, "num_input_tokens_seen": 1709744, "step": 4255 }, { "epoch": 4.428274428274428, "grad_norm": 0.0027992126997560263, "learning_rate": 1.9721654953062412e-07, "loss": 0.0001, "num_input_tokens_seen": 1711792, "step": 4260 }, { "epoch": 4.4334719334719335, "grad_norm": 0.02479691430926323, "learning_rate": 1.937002879188285e-07, "loss": 0.0002, "num_input_tokens_seen": 1713904, "step": 4265 }, { "epoch": 4.4386694386694385, "grad_norm": 0.011448011733591557, "learning_rate": 1.9021439176606565e-07, "loss": 0.0, "num_input_tokens_seen": 1715824, "step": 4270 }, { "epoch": 4.443866943866944, "grad_norm": 0.01309084240347147, "learning_rate": 1.8675890696872838e-07, "loss": 0.0001, "num_input_tokens_seen": 1717808, "step": 4275 }, { "epoch": 4.4490644490644495, "grad_norm": 16.534465789794922, "learning_rate": 1.8333387902280314e-07, "loss": 0.0326, "num_input_tokens_seen": 1719856, "step": 4280 }, { "epoch": 4.454261954261955, "grad_norm": 0.006772617343813181, "learning_rate": 1.799393530232729e-07, "loss": 0.0001, "num_input_tokens_seen": 1721776, "step": 4285 }, { "epoch": 4.45945945945946, "grad_norm": 0.04006092995405197, "learning_rate": 1.765753736635234e-07, "loss": 0.0001, "num_input_tokens_seen": 1723632, "step": 4290 }, { "epoch": 4.464656964656965, "grad_norm": 0.004494254942983389, "learning_rate": 1.7324198523475111e-07, "loss": 0.0001, "num_input_tokens_seen": 1725488, "step": 4295 }, { "epoch": 4.46985446985447, "grad_norm": 0.005892970599234104, "learning_rate": 1.6993923162538562e-07, "loss": 0.0001, "num_input_tokens_seen": 1727600, "step": 4300 }, { "epoch": 4.475051975051975, "grad_norm": 0.016387267038226128, "learning_rate": 1.666671563205069e-07, "loss": 0.0462, "num_input_tokens_seen": 1729712, "step": 4305 }, { "epoch": 4.48024948024948, "grad_norm": 0.003626425750553608, "learning_rate": 1.6342580240127582e-07, "loss": 0.0, "num_input_tokens_seen": 1731696, "step": 4310 }, { "epoch": 4.485446985446986, "grad_norm": 0.050125960260629654, "learning_rate": 1.6021521254436678e-07, "loss": 0.0169, "num_input_tokens_seen": 1733744, "step": 4315 }, { "epoch": 4.490644490644491, "grad_norm": 0.010477319359779358, "learning_rate": 1.5703542902140296e-07, "loss": 0.0001, "num_input_tokens_seen": 1735728, "step": 4320 }, { "epoch": 4.495841995841996, "grad_norm": 0.18304939568042755, "learning_rate": 1.538864936984036e-07, "loss": 0.0001, "num_input_tokens_seen": 1737776, "step": 4325 }, { "epoch": 4.501039501039501, "grad_norm": 0.0033877205569297075, "learning_rate": 1.507684480352292e-07, "loss": 0.0313, "num_input_tokens_seen": 1739824, "step": 4330 }, { "epoch": 4.506237006237006, "grad_norm": 0.005890438798815012, "learning_rate": 1.476813330850388e-07, "loss": 0.0202, "num_input_tokens_seen": 1741744, "step": 4335 }, { "epoch": 4.509355509355509, "eval_loss": 0.43684616684913635, "eval_runtime": 1.0364, "eval_samples_per_second": 825.951, "eval_steps_per_second": 103.244, "num_input_tokens_seen": 1742960, "step": 4338 }, { "epoch": 4.511434511434511, "grad_norm": 0.004618450067937374, "learning_rate": 1.4462518949374838e-07, "loss": 0.0002, "num_input_tokens_seen": 1743728, "step": 4340 }, { "epoch": 4.516632016632016, "grad_norm": 15.636531829833984, "learning_rate": 1.4160005749949328e-07, "loss": 0.0723, "num_input_tokens_seen": 1745904, "step": 4345 }, { "epoch": 4.521829521829522, "grad_norm": 0.009265006519854069, "learning_rate": 1.386059769321027e-07, "loss": 0.0001, "num_input_tokens_seen": 1747824, "step": 4350 }, { "epoch": 4.527027027027027, "grad_norm": 0.14768032729625702, "learning_rate": 1.3564298721257223e-07, "loss": 0.0002, "num_input_tokens_seen": 1749872, "step": 4355 }, { "epoch": 4.532224532224532, "grad_norm": 0.004393266513943672, "learning_rate": 1.32711127352545e-07, "loss": 0.0, "num_input_tokens_seen": 1751792, "step": 4360 }, { "epoch": 4.537422037422037, "grad_norm": 0.0018293843604624271, "learning_rate": 1.2981043595380048e-07, "loss": 0.0001, "num_input_tokens_seen": 1753776, "step": 4365 }, { "epoch": 4.542619542619542, "grad_norm": 0.010307732038199902, "learning_rate": 1.269409512077427e-07, "loss": 0.0, "num_input_tokens_seen": 1755824, "step": 4370 }, { "epoch": 4.547817047817047, "grad_norm": 0.02882198989391327, "learning_rate": 1.241027108949e-07, "loss": 0.0001, "num_input_tokens_seen": 1758000, "step": 4375 }, { "epoch": 4.553014553014553, "grad_norm": 0.02132793888449669, "learning_rate": 1.2129575238442715e-07, "loss": 0.0006, "num_input_tokens_seen": 1759984, "step": 4380 }, { "epoch": 4.558212058212058, "grad_norm": 0.014733387157320976, "learning_rate": 1.1852011263361218e-07, "loss": 0.0002, "num_input_tokens_seen": 1761968, "step": 4385 }, { "epoch": 4.5634095634095635, "grad_norm": 0.009524204768240452, "learning_rate": 1.1577582818739136e-07, "loss": 0.0, "num_input_tokens_seen": 1764016, "step": 4390 }, { "epoch": 4.5686070686070686, "grad_norm": 17.770906448364258, "learning_rate": 1.1306293517786615e-07, "loss": 0.0046, "num_input_tokens_seen": 1765936, "step": 4395 }, { "epoch": 4.573804573804574, "grad_norm": 0.0056666964665055275, "learning_rate": 1.1038146932383003e-07, "loss": 0.0002, "num_input_tokens_seen": 1767984, "step": 4400 }, { "epoch": 4.579002079002079, "grad_norm": 0.2879636287689209, "learning_rate": 1.0773146593029637e-07, "loss": 0.0266, "num_input_tokens_seen": 1769904, "step": 4405 }, { "epoch": 4.584199584199585, "grad_norm": 0.01593073643743992, "learning_rate": 1.0511295988803293e-07, "loss": 0.0001, "num_input_tokens_seen": 1771888, "step": 4410 }, { "epoch": 4.58939708939709, "grad_norm": 0.005392360966652632, "learning_rate": 1.0252598567310451e-07, "loss": 0.0027, "num_input_tokens_seen": 1773936, "step": 4415 }, { "epoch": 4.594594594594595, "grad_norm": 0.00791078433394432, "learning_rate": 9.997057734641852e-08, "loss": 0.0, "num_input_tokens_seen": 1775984, "step": 4420 }, { "epoch": 4.5997920997921, "grad_norm": 0.006835015490651131, "learning_rate": 9.744676855327484e-08, "loss": 0.0, "num_input_tokens_seen": 1777840, "step": 4425 }, { "epoch": 4.604989604989605, "grad_norm": 11.176219940185547, "learning_rate": 9.495459252292505e-08, "loss": 0.0267, "num_input_tokens_seen": 1779824, "step": 4430 }, { "epoch": 4.61018711018711, "grad_norm": 0.013272907584905624, "learning_rate": 9.249408206813332e-08, "loss": 0.0723, "num_input_tokens_seen": 1781872, "step": 4435 }, { "epoch": 4.615384615384615, "grad_norm": 0.012476145289838314, "learning_rate": 9.00652695847451e-08, "loss": 0.0001, "num_input_tokens_seen": 1783984, "step": 4440 }, { "epoch": 4.620582120582121, "grad_norm": 0.02734716795384884, "learning_rate": 8.766818705126134e-08, "loss": 0.0, "num_input_tokens_seen": 1786032, "step": 4445 }, { "epoch": 4.625779625779626, "grad_norm": 0.0005791023722849786, "learning_rate": 8.530286602841525e-08, "loss": 0.0058, "num_input_tokens_seen": 1788016, "step": 4450 }, { "epoch": 4.630977130977131, "grad_norm": 0.0094565125182271, "learning_rate": 8.296933765875898e-08, "loss": 0.0001, "num_input_tokens_seen": 1790064, "step": 4455 }, { "epoch": 4.636174636174636, "grad_norm": 0.0023999966215342283, "learning_rate": 8.066763266625283e-08, "loss": 0.0003, "num_input_tokens_seen": 1791984, "step": 4460 }, { "epoch": 4.641372141372141, "grad_norm": 0.004338675644248724, "learning_rate": 7.839778135586007e-08, "loss": 0.0321, "num_input_tokens_seen": 1793904, "step": 4465 }, { "epoch": 4.646569646569646, "grad_norm": 0.7023778557777405, "learning_rate": 7.61598136131489e-08, "loss": 0.0003, "num_input_tokens_seen": 1795888, "step": 4470 }, { "epoch": 4.651767151767151, "grad_norm": 0.019446399062871933, "learning_rate": 7.3953758903898e-08, "loss": 0.028, "num_input_tokens_seen": 1797872, "step": 4475 }, { "epoch": 4.656964656964657, "grad_norm": 0.06941288709640503, "learning_rate": 7.177964627370999e-08, "loss": 0.0007, "num_input_tokens_seen": 1799920, "step": 4480 }, { "epoch": 4.662162162162162, "grad_norm": 0.009321698918938637, "learning_rate": 6.963750434762745e-08, "loss": 0.0001, "num_input_tokens_seen": 1801776, "step": 4485 }, { "epoch": 4.667359667359667, "grad_norm": 61.291290283203125, "learning_rate": 6.752736132975696e-08, "loss": 0.0157, "num_input_tokens_seen": 1803824, "step": 4490 }, { "epoch": 4.672557172557172, "grad_norm": 0.003786651650443673, "learning_rate": 6.544924500289789e-08, "loss": 0.0562, "num_input_tokens_seen": 1805744, "step": 4495 }, { "epoch": 4.6777546777546775, "grad_norm": 0.01198617834597826, "learning_rate": 6.340318272817476e-08, "loss": 0.0, "num_input_tokens_seen": 1807728, "step": 4500 }, { "epoch": 4.682952182952183, "grad_norm": 0.024431385099887848, "learning_rate": 6.138920144468124e-08, "loss": 0.0329, "num_input_tokens_seen": 1809712, "step": 4505 }, { "epoch": 4.6881496881496885, "grad_norm": 0.011199146509170532, "learning_rate": 5.940732766912011e-08, "loss": 0.1284, "num_input_tokens_seen": 1811632, "step": 4510 }, { "epoch": 4.6933471933471935, "grad_norm": 0.015243390575051308, "learning_rate": 5.745758749545749e-08, "loss": 0.0, "num_input_tokens_seen": 1813552, "step": 4515 }, { "epoch": 4.698544698544699, "grad_norm": 0.02215772680938244, "learning_rate": 5.554000659457881e-08, "loss": 0.0, "num_input_tokens_seen": 1815664, "step": 4520 }, { "epoch": 4.703742203742204, "grad_norm": 0.07075236737728119, "learning_rate": 5.365461021395096e-08, "loss": 0.0056, "num_input_tokens_seen": 1817648, "step": 4525 }, { "epoch": 4.708939708939709, "grad_norm": 0.6212006211280823, "learning_rate": 5.1801423177288146e-08, "loss": 0.0226, "num_input_tokens_seen": 1819696, "step": 4530 }, { "epoch": 4.714137214137214, "grad_norm": 0.021660171449184418, "learning_rate": 4.998046988422767e-08, "loss": 0.0, "num_input_tokens_seen": 1821680, "step": 4535 }, { "epoch": 4.71933471933472, "grad_norm": 0.003982728812843561, "learning_rate": 4.8191774310006045e-08, "loss": 0.0, "num_input_tokens_seen": 1823728, "step": 4540 }, { "epoch": 4.724532224532225, "grad_norm": 0.005566044710576534, "learning_rate": 4.6435360005145647e-08, "loss": 0.0006, "num_input_tokens_seen": 1825712, "step": 4545 }, { "epoch": 4.72972972972973, "grad_norm": 0.006296331528574228, "learning_rate": 4.471125009514326e-08, "loss": 0.0258, "num_input_tokens_seen": 1827760, "step": 4550 }, { "epoch": 4.734927234927235, "grad_norm": 0.007365924771875143, "learning_rate": 4.30194672801662e-08, "loss": 0.0, "num_input_tokens_seen": 1829680, "step": 4555 }, { "epoch": 4.74012474012474, "grad_norm": 0.032850153744220734, "learning_rate": 4.136003383475251e-08, "loss": 0.0002, "num_input_tokens_seen": 1831728, "step": 4560 }, { "epoch": 4.745322245322245, "grad_norm": 0.048938535153865814, "learning_rate": 3.9732971607519264e-08, "loss": 0.0001, "num_input_tokens_seen": 1833648, "step": 4565 }, { "epoch": 4.75051975051975, "grad_norm": 0.010438877157866955, "learning_rate": 3.813830202087338e-08, "loss": 0.0, "num_input_tokens_seen": 1835696, "step": 4570 }, { "epoch": 4.755717255717256, "grad_norm": 0.24429504573345184, "learning_rate": 3.6576046070730676e-08, "loss": 0.0001, "num_input_tokens_seen": 1837808, "step": 4575 }, { "epoch": 4.75987525987526, "eval_loss": 0.4380520284175873, "eval_runtime": 1.0491, "eval_samples_per_second": 815.908, "eval_steps_per_second": 101.988, "num_input_tokens_seen": 1839344, "step": 4579 }, { "epoch": 4.760914760914761, "grad_norm": 0.021253783255815506, "learning_rate": 3.504622432623811e-08, "loss": 0.0003, "num_input_tokens_seen": 1839728, "step": 4580 }, { "epoch": 4.766112266112266, "grad_norm": 0.007059005554765463, "learning_rate": 3.354885692950505e-08, "loss": 0.002, "num_input_tokens_seen": 1841776, "step": 4585 }, { "epoch": 4.771309771309771, "grad_norm": 0.0066725509241223335, "learning_rate": 3.208396359533572e-08, "loss": 0.0001, "num_input_tokens_seen": 1843696, "step": 4590 }, { "epoch": 4.776507276507276, "grad_norm": 0.006126644089818001, "learning_rate": 3.065156361097138e-08, "loss": 0.0002, "num_input_tokens_seen": 1845744, "step": 4595 }, { "epoch": 4.781704781704782, "grad_norm": 5.130758285522461, "learning_rate": 2.925167583583577e-08, "loss": 0.0009, "num_input_tokens_seen": 1847792, "step": 4600 }, { "epoch": 4.786902286902287, "grad_norm": 0.009116302244365215, "learning_rate": 2.7884318701285883e-08, "loss": 0.0712, "num_input_tokens_seen": 1849776, "step": 4605 }, { "epoch": 4.792099792099792, "grad_norm": 0.005589164327830076, "learning_rate": 2.654951021037161e-08, "loss": 0.0, "num_input_tokens_seen": 1852016, "step": 4610 }, { "epoch": 4.797297297297297, "grad_norm": 0.0037636614870280027, "learning_rate": 2.524726793759591e-08, "loss": 0.0001, "num_input_tokens_seen": 1854064, "step": 4615 }, { "epoch": 4.802494802494802, "grad_norm": 0.012889928184449673, "learning_rate": 2.3977609028686123e-08, "loss": 0.0002, "num_input_tokens_seen": 1856112, "step": 4620 }, { "epoch": 4.8076923076923075, "grad_norm": 0.0022313897497951984, "learning_rate": 2.2740550200365528e-08, "loss": 0.0, "num_input_tokens_seen": 1858096, "step": 4625 }, { "epoch": 4.8128898128898125, "grad_norm": 0.004886255133897066, "learning_rate": 2.153610774013548e-08, "loss": 0.0, "num_input_tokens_seen": 1860272, "step": 4630 }, { "epoch": 4.8180873180873185, "grad_norm": 0.004527249839156866, "learning_rate": 2.0364297506060005e-08, "loss": 0.0001, "num_input_tokens_seen": 1862256, "step": 4635 }, { "epoch": 4.8232848232848236, "grad_norm": 0.005999819375574589, "learning_rate": 1.922513492655653e-08, "loss": 0.0, "num_input_tokens_seen": 1864304, "step": 4640 }, { "epoch": 4.828482328482329, "grad_norm": 0.003096930915489793, "learning_rate": 1.8118635000194395e-08, "loss": 0.0, "num_input_tokens_seen": 1866224, "step": 4645 }, { "epoch": 4.833679833679834, "grad_norm": 0.011734005995094776, "learning_rate": 1.704481229549526e-08, "loss": 0.0002, "num_input_tokens_seen": 1868336, "step": 4650 }, { "epoch": 4.838877338877339, "grad_norm": 0.005439637694507837, "learning_rate": 1.6003680950742728e-08, "loss": 0.0001, "num_input_tokens_seen": 1870448, "step": 4655 }, { "epoch": 4.844074844074844, "grad_norm": 21.41458511352539, "learning_rate": 1.499525467379581e-08, "loss": 0.0076, "num_input_tokens_seen": 1872368, "step": 4660 }, { "epoch": 4.849272349272349, "grad_norm": 0.007195206359028816, "learning_rate": 1.4019546741908252e-08, "loss": 0.0001, "num_input_tokens_seen": 1874480, "step": 4665 }, { "epoch": 4.854469854469855, "grad_norm": 0.019606366753578186, "learning_rate": 1.3076570001553934e-08, "loss": 0.0214, "num_input_tokens_seen": 1876464, "step": 4670 }, { "epoch": 4.85966735966736, "grad_norm": 0.021399203687906265, "learning_rate": 1.216633686825841e-08, "loss": 0.0, "num_input_tokens_seen": 1878448, "step": 4675 }, { "epoch": 4.864864864864865, "grad_norm": 0.008349803276360035, "learning_rate": 1.1288859326433477e-08, "loss": 0.0426, "num_input_tokens_seen": 1880432, "step": 4680 }, { "epoch": 4.87006237006237, "grad_norm": 0.00672512361779809, "learning_rate": 1.0444148929221466e-08, "loss": 0.0598, "num_input_tokens_seen": 1882544, "step": 4685 }, { "epoch": 4.875259875259875, "grad_norm": 0.009454301558434963, "learning_rate": 9.632216798342032e-09, "loss": 0.0001, "num_input_tokens_seen": 1884528, "step": 4690 }, { "epoch": 4.88045738045738, "grad_norm": 0.001981085864827037, "learning_rate": 8.853073623946163e-09, "loss": 0.0, "num_input_tokens_seen": 1886640, "step": 4695 }, { "epoch": 4.885654885654886, "grad_norm": 13.079166412353516, "learning_rate": 8.106729664475178e-09, "loss": 0.0369, "num_input_tokens_seen": 1888688, "step": 4700 }, { "epoch": 4.890852390852391, "grad_norm": 0.020751064643263817, "learning_rate": 7.3931947465252786e-09, "loss": 0.0001, "num_input_tokens_seen": 1890736, "step": 4705 }, { "epoch": 4.896049896049896, "grad_norm": 0.0023189696948975325, "learning_rate": 6.7124782647196015e-09, "loss": 0.0, "num_input_tokens_seen": 1892720, "step": 4710 }, { "epoch": 4.901247401247401, "grad_norm": 0.010008195415139198, "learning_rate": 6.064589181582481e-09, "loss": 0.0, "num_input_tokens_seen": 1894704, "step": 4715 }, { "epoch": 4.906444906444906, "grad_norm": 0.011259862221777439, "learning_rate": 5.4495360274231526e-09, "loss": 0.0287, "num_input_tokens_seen": 1896624, "step": 4720 }, { "epoch": 4.911642411642411, "grad_norm": 0.006877740379422903, "learning_rate": 4.867326900223068e-09, "loss": 0.0307, "num_input_tokens_seen": 1898544, "step": 4725 }, { "epoch": 4.916839916839917, "grad_norm": 0.0025101625360548496, "learning_rate": 4.317969465527927e-09, "loss": 0.0353, "num_input_tokens_seen": 1900592, "step": 4730 }, { "epoch": 4.922037422037422, "grad_norm": 0.022444335743784904, "learning_rate": 3.801470956348863e-09, "loss": 0.0287, "num_input_tokens_seen": 1902576, "step": 4735 }, { "epoch": 4.927234927234927, "grad_norm": 0.0029787139501422644, "learning_rate": 3.3178381730661345e-09, "loss": 0.0001, "num_input_tokens_seen": 1904624, "step": 4740 }, { "epoch": 4.9324324324324325, "grad_norm": 0.0010304702445864677, "learning_rate": 2.8670774833386427e-09, "loss": 0.0, "num_input_tokens_seen": 1906736, "step": 4745 }, { "epoch": 4.9376299376299375, "grad_norm": 0.0025558616034686565, "learning_rate": 2.449194822022327e-09, "loss": 0.0, "num_input_tokens_seen": 1908592, "step": 4750 }, { "epoch": 4.942827442827443, "grad_norm": 0.019764816388487816, "learning_rate": 2.064195691089954e-09, "loss": 0.0006, "num_input_tokens_seen": 1910576, "step": 4755 }, { "epoch": 4.948024948024948, "grad_norm": 0.004834398627281189, "learning_rate": 1.7120851595597842e-09, "loss": 0.0, "num_input_tokens_seen": 1912624, "step": 4760 }, { "epoch": 4.953222453222454, "grad_norm": 0.03484058007597923, "learning_rate": 1.3928678634289595e-09, "loss": 0.0283, "num_input_tokens_seen": 1914608, "step": 4765 }, { "epoch": 4.958419958419959, "grad_norm": 0.008615722879767418, "learning_rate": 1.1065480056110521e-09, "loss": 0.0004, "num_input_tokens_seen": 1916592, "step": 4770 }, { "epoch": 4.963617463617464, "grad_norm": 0.16332073509693146, "learning_rate": 8.531293558824983e-10, "loss": 0.0001, "num_input_tokens_seen": 1918704, "step": 4775 }, { "epoch": 4.968814968814969, "grad_norm": 0.0005217403522692621, "learning_rate": 6.326152508320804e-10, "loss": 0.0001, "num_input_tokens_seen": 1920624, "step": 4780 }, { "epoch": 4.974012474012474, "grad_norm": 1.4390920400619507, "learning_rate": 4.450085938170756e-10, "loss": 0.0013, "num_input_tokens_seen": 1922480, "step": 4785 }, { "epoch": 4.979209979209979, "grad_norm": 0.018787242472171783, "learning_rate": 2.903118549252293e-10, "loss": 0.0, "num_input_tokens_seen": 1924464, "step": 4790 }, { "epoch": 4.984407484407484, "grad_norm": 0.14635036885738373, "learning_rate": 1.6852707094172637e-10, "loss": 0.0001, "num_input_tokens_seen": 1926448, "step": 4795 }, { "epoch": 4.98960498960499, "grad_norm": 0.04664904624223709, "learning_rate": 7.965584532282356e-11, "loss": 0.0002, "num_input_tokens_seen": 1928560, "step": 4800 }, { "epoch": 4.994802494802495, "grad_norm": 0.024172263219952583, "learning_rate": 2.3699348174754943e-11, "loss": 0.0177, "num_input_tokens_seen": 1930544, "step": 4805 }, { "epoch": 5.0, "grad_norm": 0.0018674664897844195, "learning_rate": 6.583162381890162e-13, "loss": 0.0001, "num_input_tokens_seen": 1932608, "step": 4810 }, { "epoch": 5.0, "num_input_tokens_seen": 1932608, "step": 4810, "total_flos": 1.1284259767320576e+16, "train_loss": 0.10950150515592155, "train_runtime": 1431.7139, "train_samples_per_second": 26.873, "train_steps_per_second": 3.36 } ], "logging_steps": 5, "max_steps": 4810, "num_input_tokens_seen": 1932608, "num_train_epochs": 5, "save_steps": 241, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.1284259767320576e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }