{ "best_global_step": 104, "best_metric": 0.17402823269367218, "best_model_checkpoint": "saves_bts_preliminary/base/llama-3.2-1b-instruct/train_mrpc_42_1774791061/checkpoint-104", "epoch": 5.0, "eval_steps": 104, "global_step": 2065, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.012106537530266344, "grad_norm": 262.4778747558594, "learning_rate": 9.66183574879227e-07, "loss": 0.7681, "num_input_tokens_seen": 4352, "step": 5 }, { "epoch": 0.024213075060532687, "grad_norm": 26.363384246826172, "learning_rate": 2.173913043478261e-06, "loss": 0.3056, "num_input_tokens_seen": 8768, "step": 10 }, { "epoch": 0.03631961259079903, "grad_norm": 10.327119827270508, "learning_rate": 3.3816425120772947e-06, "loss": 0.183, "num_input_tokens_seen": 12992, "step": 15 }, { "epoch": 0.048426150121065374, "grad_norm": 36.403324127197266, "learning_rate": 4.589371980676329e-06, "loss": 0.4041, "num_input_tokens_seen": 17344, "step": 20 }, { "epoch": 0.06053268765133172, "grad_norm": 8.729621887207031, "learning_rate": 5.797101449275362e-06, "loss": 0.4147, "num_input_tokens_seen": 21696, "step": 25 }, { "epoch": 0.07263922518159806, "grad_norm": 4.769359111785889, "learning_rate": 7.004830917874397e-06, "loss": 0.2132, "num_input_tokens_seen": 26112, "step": 30 }, { "epoch": 0.0847457627118644, "grad_norm": 4.588466644287109, "learning_rate": 8.212560386473431e-06, "loss": 0.2587, "num_input_tokens_seen": 30208, "step": 35 }, { "epoch": 0.09685230024213075, "grad_norm": 21.823162078857422, "learning_rate": 9.420289855072464e-06, "loss": 0.2076, "num_input_tokens_seen": 34688, "step": 40 }, { "epoch": 0.1089588377723971, "grad_norm": 18.038860321044922, "learning_rate": 1.0628019323671499e-05, "loss": 0.1842, "num_input_tokens_seen": 38784, "step": 45 }, { "epoch": 0.12106537530266344, "grad_norm": 12.918279647827148, "learning_rate": 1.1835748792270531e-05, "loss": 0.3012, "num_input_tokens_seen": 43200, "step": 50 }, { "epoch": 0.13317191283292978, "grad_norm": 24.635744094848633, "learning_rate": 1.3043478260869566e-05, "loss": 0.1951, "num_input_tokens_seen": 47296, "step": 55 }, { "epoch": 0.14527845036319612, "grad_norm": 14.053600311279297, "learning_rate": 1.4251207729468599e-05, "loss": 0.2332, "num_input_tokens_seen": 51712, "step": 60 }, { "epoch": 0.15738498789346247, "grad_norm": 8.166345596313477, "learning_rate": 1.5458937198067633e-05, "loss": 0.2049, "num_input_tokens_seen": 55872, "step": 65 }, { "epoch": 0.1694915254237288, "grad_norm": 27.84511947631836, "learning_rate": 1.6666666666666667e-05, "loss": 0.2103, "num_input_tokens_seen": 59840, "step": 70 }, { "epoch": 0.18159806295399517, "grad_norm": 55.02257537841797, "learning_rate": 1.78743961352657e-05, "loss": 0.3072, "num_input_tokens_seen": 64000, "step": 75 }, { "epoch": 0.1937046004842615, "grad_norm": 11.449199676513672, "learning_rate": 1.9082125603864733e-05, "loss": 0.3841, "num_input_tokens_seen": 68352, "step": 80 }, { "epoch": 0.20581113801452786, "grad_norm": 11.381855964660645, "learning_rate": 2.028985507246377e-05, "loss": 0.232, "num_input_tokens_seen": 72768, "step": 85 }, { "epoch": 0.2179176755447942, "grad_norm": 42.495670318603516, "learning_rate": 2.1497584541062805e-05, "loss": 0.2474, "num_input_tokens_seen": 77120, "step": 90 }, { "epoch": 0.23002421307506055, "grad_norm": 21.28970718383789, "learning_rate": 2.2705314009661836e-05, "loss": 0.1841, "num_input_tokens_seen": 81664, "step": 95 }, { "epoch": 0.24213075060532688, "grad_norm": 17.023759841918945, "learning_rate": 2.391304347826087e-05, "loss": 0.1681, "num_input_tokens_seen": 86080, "step": 100 }, { "epoch": 0.25181598062953997, "eval_loss": 0.17402823269367218, "eval_runtime": 0.639, "eval_samples_per_second": 574.368, "eval_steps_per_second": 71.992, "num_input_tokens_seen": 89600, "step": 104 }, { "epoch": 0.2542372881355932, "grad_norm": 16.713178634643555, "learning_rate": 2.5120772946859905e-05, "loss": 0.1488, "num_input_tokens_seen": 90432, "step": 105 }, { "epoch": 0.26634382566585957, "grad_norm": 6.363961219787598, "learning_rate": 2.632850241545894e-05, "loss": 0.2051, "num_input_tokens_seen": 94528, "step": 110 }, { "epoch": 0.2784503631961259, "grad_norm": 7.700758934020996, "learning_rate": 2.753623188405797e-05, "loss": 0.16, "num_input_tokens_seen": 98816, "step": 115 }, { "epoch": 0.29055690072639223, "grad_norm": 8.657270431518555, "learning_rate": 2.8743961352657005e-05, "loss": 0.205, "num_input_tokens_seen": 103104, "step": 120 }, { "epoch": 0.3026634382566586, "grad_norm": 7.297232151031494, "learning_rate": 2.995169082125604e-05, "loss": 0.1846, "num_input_tokens_seen": 107328, "step": 125 }, { "epoch": 0.31476997578692495, "grad_norm": 13.21757984161377, "learning_rate": 3.1159420289855074e-05, "loss": 0.2243, "num_input_tokens_seen": 111488, "step": 130 }, { "epoch": 0.3268765133171913, "grad_norm": 6.457214832305908, "learning_rate": 3.236714975845411e-05, "loss": 0.2013, "num_input_tokens_seen": 115968, "step": 135 }, { "epoch": 0.3389830508474576, "grad_norm": 29.321474075317383, "learning_rate": 3.357487922705314e-05, "loss": 0.2278, "num_input_tokens_seen": 120192, "step": 140 }, { "epoch": 0.35108958837772397, "grad_norm": 10.676529884338379, "learning_rate": 3.478260869565218e-05, "loss": 0.1886, "num_input_tokens_seen": 124416, "step": 145 }, { "epoch": 0.36319612590799033, "grad_norm": 11.802507400512695, "learning_rate": 3.5990338164251205e-05, "loss": 0.1635, "num_input_tokens_seen": 128832, "step": 150 }, { "epoch": 0.37530266343825663, "grad_norm": 9.175806999206543, "learning_rate": 3.719806763285024e-05, "loss": 0.2118, "num_input_tokens_seen": 132992, "step": 155 }, { "epoch": 0.387409200968523, "grad_norm": 17.557262420654297, "learning_rate": 3.8405797101449274e-05, "loss": 0.3186, "num_input_tokens_seen": 137280, "step": 160 }, { "epoch": 0.39951573849878935, "grad_norm": 31.175756454467773, "learning_rate": 3.961352657004831e-05, "loss": 0.2002, "num_input_tokens_seen": 141568, "step": 165 }, { "epoch": 0.4116222760290557, "grad_norm": 12.988505363464355, "learning_rate": 4.082125603864734e-05, "loss": 0.1792, "num_input_tokens_seen": 145984, "step": 170 }, { "epoch": 0.423728813559322, "grad_norm": 43.43312454223633, "learning_rate": 4.202898550724638e-05, "loss": 0.3197, "num_input_tokens_seen": 150144, "step": 175 }, { "epoch": 0.4358353510895884, "grad_norm": 10.99770736694336, "learning_rate": 4.323671497584541e-05, "loss": 0.3561, "num_input_tokens_seen": 154624, "step": 180 }, { "epoch": 0.44794188861985473, "grad_norm": 8.507532119750977, "learning_rate": 4.4444444444444447e-05, "loss": 0.373, "num_input_tokens_seen": 158784, "step": 185 }, { "epoch": 0.4600484261501211, "grad_norm": 129.54592895507812, "learning_rate": 4.565217391304348e-05, "loss": 0.3924, "num_input_tokens_seen": 163072, "step": 190 }, { "epoch": 0.4721549636803874, "grad_norm": 15.108623504638672, "learning_rate": 4.6859903381642516e-05, "loss": 0.2368, "num_input_tokens_seen": 167104, "step": 195 }, { "epoch": 0.48426150121065376, "grad_norm": 9.902148246765137, "learning_rate": 4.806763285024155e-05, "loss": 0.4497, "num_input_tokens_seen": 171456, "step": 200 }, { "epoch": 0.4963680387409201, "grad_norm": 16.188369750976562, "learning_rate": 4.9275362318840584e-05, "loss": 0.2715, "num_input_tokens_seen": 175808, "step": 205 }, { "epoch": 0.5036319612590799, "eval_loss": 0.23122040927410126, "eval_runtime": 0.6326, "eval_samples_per_second": 580.165, "eval_steps_per_second": 72.718, "num_input_tokens_seen": 178688, "step": 208 }, { "epoch": 0.5084745762711864, "grad_norm": 0.7912328839302063, "learning_rate": 4.9999857052054956e-05, "loss": 0.1981, "num_input_tokens_seen": 180224, "step": 210 }, { "epoch": 0.5205811138014528, "grad_norm": 4.983211040496826, "learning_rate": 4.999824890644693e-05, "loss": 0.1989, "num_input_tokens_seen": 184704, "step": 215 }, { "epoch": 0.5326876513317191, "grad_norm": 16.626827239990234, "learning_rate": 4.9994854045622684e-05, "loss": 0.2336, "num_input_tokens_seen": 189184, "step": 220 }, { "epoch": 0.5447941888619855, "grad_norm": 5.18185567855835, "learning_rate": 4.9989672712225204e-05, "loss": 0.1595, "num_input_tokens_seen": 193536, "step": 225 }, { "epoch": 0.5569007263922519, "grad_norm": 8.547920227050781, "learning_rate": 4.998270527658311e-05, "loss": 0.2147, "num_input_tokens_seen": 197888, "step": 230 }, { "epoch": 0.5690072639225182, "grad_norm": 1.19011652469635, "learning_rate": 4.9973952236684216e-05, "loss": 0.1959, "num_input_tokens_seen": 202112, "step": 235 }, { "epoch": 0.5811138014527845, "grad_norm": 12.658636093139648, "learning_rate": 4.996341421813993e-05, "loss": 0.2085, "num_input_tokens_seen": 206528, "step": 240 }, { "epoch": 0.5932203389830508, "grad_norm": 20.122756958007812, "learning_rate": 4.9951091974140506e-05, "loss": 0.2304, "num_input_tokens_seen": 210944, "step": 245 }, { "epoch": 0.6053268765133172, "grad_norm": 10.99802303314209, "learning_rate": 4.99369863854013e-05, "loss": 0.2171, "num_input_tokens_seen": 215104, "step": 250 }, { "epoch": 0.6174334140435835, "grad_norm": 7.956684112548828, "learning_rate": 4.992109846009972e-05, "loss": 0.2458, "num_input_tokens_seen": 219328, "step": 255 }, { "epoch": 0.6295399515738499, "grad_norm": 19.862939834594727, "learning_rate": 4.990342933380321e-05, "loss": 0.219, "num_input_tokens_seen": 223680, "step": 260 }, { "epoch": 0.6416464891041163, "grad_norm": 7.302405834197998, "learning_rate": 4.9883980269388106e-05, "loss": 0.3803, "num_input_tokens_seen": 227904, "step": 265 }, { "epoch": 0.6537530266343826, "grad_norm": 9.361984252929688, "learning_rate": 4.986275265694935e-05, "loss": 0.3005, "num_input_tokens_seen": 231936, "step": 270 }, { "epoch": 0.6658595641646489, "grad_norm": 16.678607940673828, "learning_rate": 4.9839748013701145e-05, "loss": 0.2954, "num_input_tokens_seen": 236160, "step": 275 }, { "epoch": 0.6779661016949152, "grad_norm": 9.596780776977539, "learning_rate": 4.981496798386849e-05, "loss": 0.2924, "num_input_tokens_seen": 240320, "step": 280 }, { "epoch": 0.6900726392251816, "grad_norm": 6.522184371948242, "learning_rate": 4.978841433856971e-05, "loss": 0.1771, "num_input_tokens_seen": 244800, "step": 285 }, { "epoch": 0.7021791767554479, "grad_norm": 8.720867156982422, "learning_rate": 4.976008897568981e-05, "loss": 0.194, "num_input_tokens_seen": 249152, "step": 290 }, { "epoch": 0.7142857142857143, "grad_norm": 11.178607940673828, "learning_rate": 4.972999391974488e-05, "loss": 0.2064, "num_input_tokens_seen": 253376, "step": 295 }, { "epoch": 0.7263922518159807, "grad_norm": 12.191368103027344, "learning_rate": 4.969813132173735e-05, "loss": 0.2096, "num_input_tokens_seen": 257664, "step": 300 }, { "epoch": 0.738498789346247, "grad_norm": 5.037217617034912, "learning_rate": 4.966450345900229e-05, "loss": 0.1712, "num_input_tokens_seen": 262016, "step": 305 }, { "epoch": 0.7506053268765133, "grad_norm": 10.153473854064941, "learning_rate": 4.962911273504461e-05, "loss": 0.2276, "num_input_tokens_seen": 266432, "step": 310 }, { "epoch": 0.7554479418886199, "eval_loss": 0.22853781282901764, "eval_runtime": 2.3445, "eval_samples_per_second": 156.536, "eval_steps_per_second": 19.62, "num_input_tokens_seen": 267968, "step": 312 }, { "epoch": 0.7627118644067796, "grad_norm": 12.040881156921387, "learning_rate": 4.9591961679367284e-05, "loss": 0.2349, "num_input_tokens_seen": 270464, "step": 315 }, { "epoch": 0.774818401937046, "grad_norm": 12.473306655883789, "learning_rate": 4.955305294729056e-05, "loss": 0.2824, "num_input_tokens_seen": 274688, "step": 320 }, { "epoch": 0.7869249394673123, "grad_norm": 21.77474594116211, "learning_rate": 4.951238931976216e-05, "loss": 0.3105, "num_input_tokens_seen": 278848, "step": 325 }, { "epoch": 0.7990314769975787, "grad_norm": 17.280487060546875, "learning_rate": 4.9469973703158565e-05, "loss": 0.2667, "num_input_tokens_seen": 283136, "step": 330 }, { "epoch": 0.8111380145278451, "grad_norm": 6.448112487792969, "learning_rate": 4.9425809129077204e-05, "loss": 0.2213, "num_input_tokens_seen": 287680, "step": 335 }, { "epoch": 0.8232445520581114, "grad_norm": 1.0759979486465454, "learning_rate": 4.937989875411985e-05, "loss": 0.1887, "num_input_tokens_seen": 292224, "step": 340 }, { "epoch": 0.8353510895883777, "grad_norm": 8.703038215637207, "learning_rate": 4.933224585966696e-05, "loss": 0.2499, "num_input_tokens_seen": 296448, "step": 345 }, { "epoch": 0.847457627118644, "grad_norm": 16.416717529296875, "learning_rate": 4.928285385164315e-05, "loss": 0.2431, "num_input_tokens_seen": 300736, "step": 350 }, { "epoch": 0.8595641646489104, "grad_norm": 6.670568943023682, "learning_rate": 4.923172626027379e-05, "loss": 0.2588, "num_input_tokens_seen": 304960, "step": 355 }, { "epoch": 0.8716707021791767, "grad_norm": 3.8800857067108154, "learning_rate": 4.917886673983267e-05, "loss": 0.2322, "num_input_tokens_seen": 309184, "step": 360 }, { "epoch": 0.8837772397094431, "grad_norm": 8.991925239562988, "learning_rate": 4.912427906838078e-05, "loss": 0.2314, "num_input_tokens_seen": 313408, "step": 365 }, { "epoch": 0.8958837772397095, "grad_norm": 9.208677291870117, "learning_rate": 4.906796714749635e-05, "loss": 0.1782, "num_input_tokens_seen": 317888, "step": 370 }, { "epoch": 0.9079903147699758, "grad_norm": 6.636046886444092, "learning_rate": 4.900993500199591e-05, "loss": 0.1873, "num_input_tokens_seen": 322048, "step": 375 }, { "epoch": 0.9200968523002422, "grad_norm": 10.718189239501953, "learning_rate": 4.895018677964669e-05, "loss": 0.1985, "num_input_tokens_seen": 326592, "step": 380 }, { "epoch": 0.9322033898305084, "grad_norm": 22.99626922607422, "learning_rate": 4.8888726750870126e-05, "loss": 0.3036, "num_input_tokens_seen": 330880, "step": 385 }, { "epoch": 0.9443099273607748, "grad_norm": 3.320899486541748, "learning_rate": 4.882555930843664e-05, "loss": 0.2224, "num_input_tokens_seen": 335104, "step": 390 }, { "epoch": 0.9564164648910412, "grad_norm": 5.677978038787842, "learning_rate": 4.87606889671517e-05, "loss": 0.1898, "num_input_tokens_seen": 339392, "step": 395 }, { "epoch": 0.9685230024213075, "grad_norm": 11.17044448852539, "learning_rate": 4.8694120363533104e-05, "loss": 0.1663, "num_input_tokens_seen": 343744, "step": 400 }, { "epoch": 0.9806295399515739, "grad_norm": 9.493459701538086, "learning_rate": 4.8625858255479574e-05, "loss": 0.1954, "num_input_tokens_seen": 348160, "step": 405 }, { "epoch": 0.9927360774818402, "grad_norm": 13.322687149047852, "learning_rate": 4.855590752193076e-05, "loss": 0.2606, "num_input_tokens_seen": 352448, "step": 410 }, { "epoch": 1.0048426150121066, "grad_norm": 13.647954940795898, "learning_rate": 4.848427316251842e-05, "loss": 0.5572, "num_input_tokens_seen": 356656, "step": 415 }, { "epoch": 1.0072639225181599, "eval_loss": 0.2624819278717041, "eval_runtime": 0.8628, "eval_samples_per_second": 425.363, "eval_steps_per_second": 53.315, "num_input_tokens_seen": 357488, "step": 416 }, { "epoch": 1.0169491525423728, "grad_norm": 43.02584457397461, "learning_rate": 4.841096029720921e-05, "loss": 0.2346, "num_input_tokens_seen": 360880, "step": 420 }, { "epoch": 1.0290556900726393, "grad_norm": 8.104162216186523, "learning_rate": 4.8335974165938615e-05, "loss": 0.1819, "num_input_tokens_seen": 365104, "step": 425 }, { "epoch": 1.0411622276029056, "grad_norm": 5.002182483673096, "learning_rate": 4.825932012823652e-05, "loss": 0.1495, "num_input_tokens_seen": 369776, "step": 430 }, { "epoch": 1.053268765133172, "grad_norm": 27.77912139892578, "learning_rate": 4.8181003662844074e-05, "loss": 0.2583, "num_input_tokens_seen": 374000, "step": 435 }, { "epoch": 1.0653753026634383, "grad_norm": 9.262914657592773, "learning_rate": 4.8101030367322195e-05, "loss": 0.2093, "num_input_tokens_seen": 378096, "step": 440 }, { "epoch": 1.0774818401937045, "grad_norm": 5.5975823402404785, "learning_rate": 4.8019405957651395e-05, "loss": 0.1806, "num_input_tokens_seen": 382256, "step": 445 }, { "epoch": 1.089588377723971, "grad_norm": 10.306631088256836, "learning_rate": 4.793613626782331e-05, "loss": 0.3307, "num_input_tokens_seen": 386672, "step": 450 }, { "epoch": 1.1016949152542372, "grad_norm": 4.157079696655273, "learning_rate": 4.785122724942367e-05, "loss": 0.2208, "num_input_tokens_seen": 390960, "step": 455 }, { "epoch": 1.1138014527845037, "grad_norm": 0.7576245069503784, "learning_rate": 4.776468497120698e-05, "loss": 0.2978, "num_input_tokens_seen": 395440, "step": 460 }, { "epoch": 1.12590799031477, "grad_norm": 6.9619035720825195, "learning_rate": 4.7676515618662684e-05, "loss": 0.2315, "num_input_tokens_seen": 399600, "step": 465 }, { "epoch": 1.1380145278450362, "grad_norm": 1.4395357370376587, "learning_rate": 4.758672549357316e-05, "loss": 0.2236, "num_input_tokens_seen": 403888, "step": 470 }, { "epoch": 1.1501210653753027, "grad_norm": 18.561601638793945, "learning_rate": 4.749532101356322e-05, "loss": 0.1689, "num_input_tokens_seen": 408176, "step": 475 }, { "epoch": 1.162227602905569, "grad_norm": 16.139604568481445, "learning_rate": 4.740230871164147e-05, "loss": 0.2012, "num_input_tokens_seen": 412208, "step": 480 }, { "epoch": 1.1743341404358354, "grad_norm": 1.962085247039795, "learning_rate": 4.730769523573337e-05, "loss": 0.1816, "num_input_tokens_seen": 416624, "step": 485 }, { "epoch": 1.1864406779661016, "grad_norm": 3.118806838989258, "learning_rate": 4.7211487348206054e-05, "loss": 0.2491, "num_input_tokens_seen": 421040, "step": 490 }, { "epoch": 1.1985472154963681, "grad_norm": 3.9620296955108643, "learning_rate": 4.711369192538503e-05, "loss": 0.203, "num_input_tokens_seen": 425136, "step": 495 }, { "epoch": 1.2106537530266344, "grad_norm": 4.469512462615967, "learning_rate": 4.7014315957062685e-05, "loss": 0.4102, "num_input_tokens_seen": 429680, "step": 500 }, { "epoch": 1.2227602905569008, "grad_norm": 8.607080459594727, "learning_rate": 4.691336654599873e-05, "loss": 0.2409, "num_input_tokens_seen": 434224, "step": 505 }, { "epoch": 1.234866828087167, "grad_norm": 9.237229347229004, "learning_rate": 4.6810850907412484e-05, "loss": 0.2191, "num_input_tokens_seen": 438320, "step": 510 }, { "epoch": 1.2469733656174333, "grad_norm": 5.81946325302124, "learning_rate": 4.670677636846723e-05, "loss": 0.1975, "num_input_tokens_seen": 442672, "step": 515 }, { "epoch": 1.2590799031476998, "grad_norm": 2.934025764465332, "learning_rate": 4.660115036774648e-05, "loss": 0.1881, "num_input_tokens_seen": 446896, "step": 520 }, { "epoch": 1.2590799031476998, "eval_loss": 0.1976936012506485, "eval_runtime": 0.6676, "eval_samples_per_second": 549.73, "eval_steps_per_second": 68.904, "num_input_tokens_seen": 446896, "step": 520 }, { "epoch": 1.271186440677966, "grad_norm": 2.785706043243408, "learning_rate": 4.6493980454722344e-05, "loss": 0.2485, "num_input_tokens_seen": 451312, "step": 525 }, { "epoch": 1.2832929782082325, "grad_norm": 9.8702392578125, "learning_rate": 4.638527428921592e-05, "loss": 0.2053, "num_input_tokens_seen": 455408, "step": 530 }, { "epoch": 1.2953995157384988, "grad_norm": 7.424989223480225, "learning_rate": 4.627503964084981e-05, "loss": 0.1867, "num_input_tokens_seen": 460080, "step": 535 }, { "epoch": 1.307506053268765, "grad_norm": 4.052550792694092, "learning_rate": 4.6163284388492835e-05, "loss": 0.1674, "num_input_tokens_seen": 464496, "step": 540 }, { "epoch": 1.3196125907990315, "grad_norm": 2.9404428005218506, "learning_rate": 4.605001651969686e-05, "loss": 0.2045, "num_input_tokens_seen": 468720, "step": 545 }, { "epoch": 1.331719128329298, "grad_norm": 6.4158148765563965, "learning_rate": 4.593524413012592e-05, "loss": 0.191, "num_input_tokens_seen": 473264, "step": 550 }, { "epoch": 1.3438256658595642, "grad_norm": 2.213015556335449, "learning_rate": 4.5818975422977606e-05, "loss": 0.1828, "num_input_tokens_seen": 477552, "step": 555 }, { "epoch": 1.3559322033898304, "grad_norm": 5.9616804122924805, "learning_rate": 4.570121870839671e-05, "loss": 0.1546, "num_input_tokens_seen": 482032, "step": 560 }, { "epoch": 1.368038740920097, "grad_norm": 0.6267197132110596, "learning_rate": 4.558198240288131e-05, "loss": 0.2025, "num_input_tokens_seen": 486384, "step": 565 }, { "epoch": 1.3801452784503632, "grad_norm": 9.450618743896484, "learning_rate": 4.546127502868118e-05, "loss": 0.2413, "num_input_tokens_seen": 490672, "step": 570 }, { "epoch": 1.3922518159806296, "grad_norm": 5.918724536895752, "learning_rate": 4.5339105213188714e-05, "loss": 0.2163, "num_input_tokens_seen": 494960, "step": 575 }, { "epoch": 1.4043583535108959, "grad_norm": 2.0229716300964355, "learning_rate": 4.521548168832227e-05, "loss": 0.3013, "num_input_tokens_seen": 499120, "step": 580 }, { "epoch": 1.4164648910411621, "grad_norm": 4.871718406677246, "learning_rate": 4.509041328990204e-05, "loss": 0.2324, "num_input_tokens_seen": 503408, "step": 585 }, { "epoch": 1.4285714285714286, "grad_norm": 4.264101028442383, "learning_rate": 4.4963908957018576e-05, "loss": 0.1956, "num_input_tokens_seen": 507312, "step": 590 }, { "epoch": 1.4406779661016949, "grad_norm": 0.7742087841033936, "learning_rate": 4.483597773139386e-05, "loss": 0.2206, "num_input_tokens_seen": 511600, "step": 595 }, { "epoch": 1.4527845036319613, "grad_norm": 1.387762427330017, "learning_rate": 4.470662875673506e-05, "loss": 0.1973, "num_input_tokens_seen": 515888, "step": 600 }, { "epoch": 1.4648910411622276, "grad_norm": 8.138726234436035, "learning_rate": 4.457587127808096e-05, "loss": 0.1848, "num_input_tokens_seen": 519920, "step": 605 }, { "epoch": 1.4769975786924938, "grad_norm": 3.1052446365356445, "learning_rate": 4.4443714641141255e-05, "loss": 0.1922, "num_input_tokens_seen": 524336, "step": 610 }, { "epoch": 1.4891041162227603, "grad_norm": 1.7755212783813477, "learning_rate": 4.4310168291628504e-05, "loss": 0.1922, "num_input_tokens_seen": 528496, "step": 615 }, { "epoch": 1.5012106537530268, "grad_norm": 8.44454288482666, "learning_rate": 4.4175241774583084e-05, "loss": 0.1809, "num_input_tokens_seen": 532784, "step": 620 }, { "epoch": 1.5108958837772397, "eval_loss": 0.19258780777454376, "eval_runtime": 0.6591, "eval_samples_per_second": 556.848, "eval_steps_per_second": 69.796, "num_input_tokens_seen": 536176, "step": 624 }, { "epoch": 1.513317191283293, "grad_norm": 6.506056785583496, "learning_rate": 4.403894473369092e-05, "loss": 0.2205, "num_input_tokens_seen": 537136, "step": 625 }, { "epoch": 1.5254237288135593, "grad_norm": 15.012322425842285, "learning_rate": 4.390128691059423e-05, "loss": 0.26, "num_input_tokens_seen": 541552, "step": 630 }, { "epoch": 1.5375302663438255, "grad_norm": 2.567143440246582, "learning_rate": 4.3762278144195236e-05, "loss": 0.2678, "num_input_tokens_seen": 545648, "step": 635 }, { "epoch": 1.549636803874092, "grad_norm": 9.604016304016113, "learning_rate": 4.362192836995299e-05, "loss": 0.2246, "num_input_tokens_seen": 550256, "step": 640 }, { "epoch": 1.5617433414043584, "grad_norm": 6.7328104972839355, "learning_rate": 4.348024761917321e-05, "loss": 0.2397, "num_input_tokens_seen": 554928, "step": 645 }, { "epoch": 1.5738498789346247, "grad_norm": 13.930996894836426, "learning_rate": 4.333724601829132e-05, "loss": 0.2303, "num_input_tokens_seen": 559344, "step": 650 }, { "epoch": 1.585956416464891, "grad_norm": 7.173315048217773, "learning_rate": 4.319293378814868e-05, "loss": 0.2178, "num_input_tokens_seen": 563760, "step": 655 }, { "epoch": 1.5980629539951574, "grad_norm": 1.3246958255767822, "learning_rate": 4.304732124326206e-05, "loss": 0.1945, "num_input_tokens_seen": 568112, "step": 660 }, { "epoch": 1.6101694915254239, "grad_norm": 10.188156127929688, "learning_rate": 4.2900418791086403e-05, "loss": 0.1908, "num_input_tokens_seen": 572464, "step": 665 }, { "epoch": 1.6222760290556901, "grad_norm": 7.808104515075684, "learning_rate": 4.275223693127103e-05, "loss": 0.2026, "num_input_tokens_seen": 576752, "step": 670 }, { "epoch": 1.6343825665859564, "grad_norm": 0.8921657204627991, "learning_rate": 4.260278625490911e-05, "loss": 0.1959, "num_input_tokens_seen": 580976, "step": 675 }, { "epoch": 1.6464891041162226, "grad_norm": 6.147708892822266, "learning_rate": 4.2452077443780744e-05, "loss": 0.2025, "num_input_tokens_seen": 585264, "step": 680 }, { "epoch": 1.658595641646489, "grad_norm": 5.73768424987793, "learning_rate": 4.2300121269589475e-05, "loss": 0.1777, "num_input_tokens_seen": 589744, "step": 685 }, { "epoch": 1.6707021791767556, "grad_norm": 5.188973426818848, "learning_rate": 4.214692859319237e-05, "loss": 0.2142, "num_input_tokens_seen": 593968, "step": 690 }, { "epoch": 1.6828087167070218, "grad_norm": 20.29938316345215, "learning_rate": 4.19925103638238e-05, "loss": 0.2096, "num_input_tokens_seen": 598256, "step": 695 }, { "epoch": 1.694915254237288, "grad_norm": 3.481995105743408, "learning_rate": 4.183687761831281e-05, "loss": 0.1881, "num_input_tokens_seen": 602608, "step": 700 }, { "epoch": 1.7070217917675545, "grad_norm": 2.9380016326904297, "learning_rate": 4.168004148029435e-05, "loss": 0.1678, "num_input_tokens_seen": 607088, "step": 705 }, { "epoch": 1.7191283292978208, "grad_norm": 6.645642280578613, "learning_rate": 4.1522013159414144e-05, "loss": 0.243, "num_input_tokens_seen": 611248, "step": 710 }, { "epoch": 1.7312348668280872, "grad_norm": 5.701453685760498, "learning_rate": 4.136280395052754e-05, "loss": 0.2024, "num_input_tokens_seen": 615536, "step": 715 }, { "epoch": 1.7433414043583535, "grad_norm": 4.573903560638428, "learning_rate": 4.120242523289223e-05, "loss": 0.1803, "num_input_tokens_seen": 619952, "step": 720 }, { "epoch": 1.7554479418886197, "grad_norm": 3.025674819946289, "learning_rate": 4.1040888469354925e-05, "loss": 0.1949, "num_input_tokens_seen": 624368, "step": 725 }, { "epoch": 1.7627118644067796, "eval_loss": 0.19822187721729279, "eval_runtime": 1.1195, "eval_samples_per_second": 327.835, "eval_steps_per_second": 41.091, "num_input_tokens_seen": 626992, "step": 728 }, { "epoch": 1.7675544794188862, "grad_norm": 5.934816360473633, "learning_rate": 4.087820520553205e-05, "loss": 0.1935, "num_input_tokens_seen": 628720, "step": 730 }, { "epoch": 1.7796610169491527, "grad_norm": 1.3624376058578491, "learning_rate": 4.0714387068984574e-05, "loss": 0.1884, "num_input_tokens_seen": 633008, "step": 735 }, { "epoch": 1.791767554479419, "grad_norm": 2.1475796699523926, "learning_rate": 4.05494457683869e-05, "loss": 0.2014, "num_input_tokens_seen": 637360, "step": 740 }, { "epoch": 1.8038740920096852, "grad_norm": 10.264263153076172, "learning_rate": 4.038339309269002e-05, "loss": 0.2152, "num_input_tokens_seen": 641648, "step": 745 }, { "epoch": 1.8159806295399514, "grad_norm": 4.37279748916626, "learning_rate": 4.021624091027895e-05, "loss": 0.192, "num_input_tokens_seen": 645552, "step": 750 }, { "epoch": 1.828087167070218, "grad_norm": 10.11119270324707, "learning_rate": 4.004800116812441e-05, "loss": 0.3049, "num_input_tokens_seen": 649904, "step": 755 }, { "epoch": 1.8401937046004844, "grad_norm": 0.4716910719871521, "learning_rate": 3.987868589092893e-05, "loss": 0.184, "num_input_tokens_seen": 654128, "step": 760 }, { "epoch": 1.8523002421307506, "grad_norm": 8.259904861450195, "learning_rate": 3.9708307180267456e-05, "loss": 0.1914, "num_input_tokens_seen": 658672, "step": 765 }, { "epoch": 1.8644067796610169, "grad_norm": 14.706856727600098, "learning_rate": 3.953687721372233e-05, "loss": 0.4553, "num_input_tokens_seen": 663088, "step": 770 }, { "epoch": 1.8765133171912833, "grad_norm": 9.08963394165039, "learning_rate": 3.936440824401299e-05, "loss": 0.1709, "num_input_tokens_seen": 667440, "step": 775 }, { "epoch": 1.8886198547215496, "grad_norm": 4.246565818786621, "learning_rate": 3.919091259812013e-05, "loss": 0.1831, "num_input_tokens_seen": 671792, "step": 780 }, { "epoch": 1.900726392251816, "grad_norm": 11.860783576965332, "learning_rate": 3.9016402676404753e-05, "loss": 0.2175, "num_input_tokens_seen": 676336, "step": 785 }, { "epoch": 1.9128329297820823, "grad_norm": 5.474867820739746, "learning_rate": 3.884089095172181e-05, "loss": 0.18, "num_input_tokens_seen": 680624, "step": 790 }, { "epoch": 1.9249394673123486, "grad_norm": 2.7666966915130615, "learning_rate": 3.866438996852872e-05, "loss": 0.1914, "num_input_tokens_seen": 685040, "step": 795 }, { "epoch": 1.937046004842615, "grad_norm": 10.039326667785645, "learning_rate": 3.848691234198879e-05, "loss": 0.1935, "num_input_tokens_seen": 689392, "step": 800 }, { "epoch": 1.9491525423728815, "grad_norm": 3.919206142425537, "learning_rate": 3.830847075706956e-05, "loss": 0.2046, "num_input_tokens_seen": 693552, "step": 805 }, { "epoch": 1.9612590799031477, "grad_norm": 16.429906845092773, "learning_rate": 3.812907796763616e-05, "loss": 0.2291, "num_input_tokens_seen": 698032, "step": 810 }, { "epoch": 1.973365617433414, "grad_norm": 6.558701992034912, "learning_rate": 3.7948746795539745e-05, "loss": 0.1751, "num_input_tokens_seen": 702000, "step": 815 }, { "epoch": 1.9854721549636802, "grad_norm": 8.950061798095703, "learning_rate": 3.776749012970105e-05, "loss": 0.1795, "num_input_tokens_seen": 706160, "step": 820 }, { "epoch": 1.9975786924939467, "grad_norm": 3.701720714569092, "learning_rate": 3.758532092518924e-05, "loss": 0.1852, "num_input_tokens_seen": 710768, "step": 825 }, { "epoch": 2.009685230024213, "grad_norm": 6.777426719665527, "learning_rate": 3.740225220229587e-05, "loss": 0.256, "num_input_tokens_seen": 714744, "step": 830 }, { "epoch": 2.0145278450363198, "eval_loss": 0.1934857964515686, "eval_runtime": 0.6627, "eval_samples_per_second": 553.776, "eval_steps_per_second": 69.411, "num_input_tokens_seen": 716344, "step": 832 }, { "epoch": 2.0217917675544794, "grad_norm": 7.20669412612915, "learning_rate": 3.721829704560436e-05, "loss": 0.1878, "num_input_tokens_seen": 718776, "step": 835 }, { "epoch": 2.0338983050847457, "grad_norm": 6.232179164886475, "learning_rate": 3.7033468603054725e-05, "loss": 0.2215, "num_input_tokens_seen": 722744, "step": 840 }, { "epoch": 2.046004842615012, "grad_norm": 8.393187522888184, "learning_rate": 3.6847780085003905e-05, "loss": 0.1657, "num_input_tokens_seen": 727160, "step": 845 }, { "epoch": 2.0581113801452786, "grad_norm": 9.579306602478027, "learning_rate": 3.666124476328155e-05, "loss": 0.1957, "num_input_tokens_seen": 731576, "step": 850 }, { "epoch": 2.070217917675545, "grad_norm": 8.12859058380127, "learning_rate": 3.647387597024139e-05, "loss": 0.1881, "num_input_tokens_seen": 736184, "step": 855 }, { "epoch": 2.082324455205811, "grad_norm": 11.758556365966797, "learning_rate": 3.6285687097808394e-05, "loss": 0.2041, "num_input_tokens_seen": 740472, "step": 860 }, { "epoch": 2.0944309927360774, "grad_norm": 1.7637454271316528, "learning_rate": 3.609669159652158e-05, "loss": 0.213, "num_input_tokens_seen": 744760, "step": 865 }, { "epoch": 2.106537530266344, "grad_norm": 5.633957386016846, "learning_rate": 3.590690297457262e-05, "loss": 0.1913, "num_input_tokens_seen": 749176, "step": 870 }, { "epoch": 2.1186440677966103, "grad_norm": 4.531621932983398, "learning_rate": 3.57163347968404e-05, "loss": 0.1961, "num_input_tokens_seen": 753528, "step": 875 }, { "epoch": 2.1307506053268765, "grad_norm": 6.524752140045166, "learning_rate": 3.552500068392147e-05, "loss": 0.1981, "num_input_tokens_seen": 757688, "step": 880 }, { "epoch": 2.142857142857143, "grad_norm": 5.924046516418457, "learning_rate": 3.533291431115653e-05, "loss": 0.2002, "num_input_tokens_seen": 762040, "step": 885 }, { "epoch": 2.154963680387409, "grad_norm": 4.7628068923950195, "learning_rate": 3.514008940765304e-05, "loss": 0.1856, "num_input_tokens_seen": 766200, "step": 890 }, { "epoch": 2.1670702179176757, "grad_norm": 9.14155101776123, "learning_rate": 3.494653975530388e-05, "loss": 0.2107, "num_input_tokens_seen": 770680, "step": 895 }, { "epoch": 2.179176755447942, "grad_norm": 7.742560386657715, "learning_rate": 3.475227918780239e-05, "loss": 0.1771, "num_input_tokens_seen": 774840, "step": 900 }, { "epoch": 2.1912832929782082, "grad_norm": 1.2218825817108154, "learning_rate": 3.4557321589653556e-05, "loss": 0.1924, "num_input_tokens_seen": 779192, "step": 905 }, { "epoch": 2.2033898305084745, "grad_norm": 10.382070541381836, "learning_rate": 3.436168089518168e-05, "loss": 0.1687, "num_input_tokens_seen": 783608, "step": 910 }, { "epoch": 2.2154963680387407, "grad_norm": 2.07893967628479, "learning_rate": 3.416537108753443e-05, "loss": 0.1922, "num_input_tokens_seen": 788088, "step": 915 }, { "epoch": 2.2276029055690074, "grad_norm": 14.99792194366455, "learning_rate": 3.3968406197683376e-05, "loss": 0.1721, "num_input_tokens_seen": 792568, "step": 920 }, { "epoch": 2.2397094430992737, "grad_norm": 4.237668037414551, "learning_rate": 3.3770800303421254e-05, "loss": 0.2058, "num_input_tokens_seen": 797176, "step": 925 }, { "epoch": 2.25181598062954, "grad_norm": 2.3142411708831787, "learning_rate": 3.357256752835561e-05, "loss": 0.1925, "num_input_tokens_seen": 801400, "step": 930 }, { "epoch": 2.263922518159806, "grad_norm": 3.0918896198272705, "learning_rate": 3.3373722040899517e-05, "loss": 0.1601, "num_input_tokens_seen": 805944, "step": 935 }, { "epoch": 2.2663438256658597, "eval_loss": 0.38670673966407776, "eval_runtime": 2.26, "eval_samples_per_second": 162.386, "eval_steps_per_second": 20.354, "num_input_tokens_seen": 806712, "step": 936 }, { "epoch": 2.2760290556900724, "grad_norm": 3.9013168811798096, "learning_rate": 3.317427805325875e-05, "loss": 0.9421, "num_input_tokens_seen": 810040, "step": 940 }, { "epoch": 2.288135593220339, "grad_norm": 1.7496920824050903, "learning_rate": 3.297424982041609e-05, "loss": 0.191, "num_input_tokens_seen": 814392, "step": 945 }, { "epoch": 2.3002421307506054, "grad_norm": 6.5397491455078125, "learning_rate": 3.277365163911243e-05, "loss": 0.1962, "num_input_tokens_seen": 818872, "step": 950 }, { "epoch": 2.3123486682808716, "grad_norm": 2.407987594604492, "learning_rate": 3.257249784682492e-05, "loss": 0.2261, "num_input_tokens_seen": 823096, "step": 955 }, { "epoch": 2.324455205811138, "grad_norm": 3.1127803325653076, "learning_rate": 3.2370802820742275e-05, "loss": 0.1945, "num_input_tokens_seen": 827128, "step": 960 }, { "epoch": 2.3365617433414045, "grad_norm": 10.151595115661621, "learning_rate": 3.2168580976737104e-05, "loss": 0.2272, "num_input_tokens_seen": 831288, "step": 965 }, { "epoch": 2.348668280871671, "grad_norm": 1.2875597476959229, "learning_rate": 3.196584676833562e-05, "loss": 0.1824, "num_input_tokens_seen": 835640, "step": 970 }, { "epoch": 2.360774818401937, "grad_norm": 0.8216660022735596, "learning_rate": 3.1762614685684567e-05, "loss": 0.156, "num_input_tokens_seen": 839736, "step": 975 }, { "epoch": 2.3728813559322033, "grad_norm": 7.343863010406494, "learning_rate": 3.155889925451557e-05, "loss": 0.2199, "num_input_tokens_seen": 844024, "step": 980 }, { "epoch": 2.38498789346247, "grad_norm": 2.2787206172943115, "learning_rate": 3.1354715035106894e-05, "loss": 0.1885, "num_input_tokens_seen": 848248, "step": 985 }, { "epoch": 2.3970944309927362, "grad_norm": 6.654670238494873, "learning_rate": 3.1150076621242816e-05, "loss": 0.1645, "num_input_tokens_seen": 852472, "step": 990 }, { "epoch": 2.4092009685230025, "grad_norm": 3.4156064987182617, "learning_rate": 3.0944998639170544e-05, "loss": 0.1747, "num_input_tokens_seen": 856824, "step": 995 }, { "epoch": 2.4213075060532687, "grad_norm": 0.4972361624240875, "learning_rate": 3.073949574655479e-05, "loss": 0.1751, "num_input_tokens_seen": 860984, "step": 1000 }, { "epoch": 2.433414043583535, "grad_norm": 0.7988845705986023, "learning_rate": 3.053358263143015e-05, "loss": 0.1975, "num_input_tokens_seen": 865272, "step": 1005 }, { "epoch": 2.4455205811138017, "grad_norm": 5.293003082275391, "learning_rate": 3.032727401115135e-05, "loss": 0.1765, "num_input_tokens_seen": 869560, "step": 1010 }, { "epoch": 2.457627118644068, "grad_norm": 3.4668216705322266, "learning_rate": 3.012058463134126e-05, "loss": 0.1624, "num_input_tokens_seen": 873976, "step": 1015 }, { "epoch": 2.469733656174334, "grad_norm": 1.981259822845459, "learning_rate": 2.991352926483702e-05, "loss": 0.2237, "num_input_tokens_seen": 878200, "step": 1020 }, { "epoch": 2.4818401937046004, "grad_norm": 15.534086227416992, "learning_rate": 2.9706122710634165e-05, "loss": 0.2024, "num_input_tokens_seen": 882872, "step": 1025 }, { "epoch": 2.4939467312348667, "grad_norm": 2.0866310596466064, "learning_rate": 2.949837979282889e-05, "loss": 0.2673, "num_input_tokens_seen": 887096, "step": 1030 }, { "epoch": 2.5060532687651333, "grad_norm": 1.296164870262146, "learning_rate": 2.92903153595585e-05, "loss": 0.2168, "num_input_tokens_seen": 891576, "step": 1035 }, { "epoch": 2.5181598062953996, "grad_norm": 3.0610435009002686, "learning_rate": 2.908194428194019e-05, "loss": 0.1768, "num_input_tokens_seen": 895736, "step": 1040 }, { "epoch": 2.5181598062953996, "eval_loss": 0.1943914145231247, "eval_runtime": 0.6714, "eval_samples_per_second": 546.608, "eval_steps_per_second": 68.512, "num_input_tokens_seen": 895736, "step": 1040 }, { "epoch": 2.530266343825666, "grad_norm": 13.436739921569824, "learning_rate": 2.88732814530081e-05, "loss": 0.1555, "num_input_tokens_seen": 900024, "step": 1045 }, { "epoch": 2.542372881355932, "grad_norm": 9.469161987304688, "learning_rate": 2.866434178664893e-05, "loss": 0.1744, "num_input_tokens_seen": 904440, "step": 1050 }, { "epoch": 2.5544794188861983, "grad_norm": 6.683951377868652, "learning_rate": 2.8455140216535947e-05, "loss": 0.1842, "num_input_tokens_seen": 908728, "step": 1055 }, { "epoch": 2.566585956416465, "grad_norm": 4.156672954559326, "learning_rate": 2.8245691695061604e-05, "loss": 0.2018, "num_input_tokens_seen": 913016, "step": 1060 }, { "epoch": 2.5786924939467313, "grad_norm": 2.5280745029449463, "learning_rate": 2.8036011192268863e-05, "loss": 0.2027, "num_input_tokens_seen": 917304, "step": 1065 }, { "epoch": 2.5907990314769975, "grad_norm": 3.3346853256225586, "learning_rate": 2.7826113694781252e-05, "loss": 0.1984, "num_input_tokens_seen": 921528, "step": 1070 }, { "epoch": 2.6029055690072638, "grad_norm": 6.732588768005371, "learning_rate": 2.761601420473168e-05, "loss": 0.1674, "num_input_tokens_seen": 925944, "step": 1075 }, { "epoch": 2.61501210653753, "grad_norm": 5.7978363037109375, "learning_rate": 2.740572773869019e-05, "loss": 0.1523, "num_input_tokens_seen": 930744, "step": 1080 }, { "epoch": 2.6271186440677967, "grad_norm": 4.692154884338379, "learning_rate": 2.7195269326590682e-05, "loss": 0.1263, "num_input_tokens_seen": 935352, "step": 1085 }, { "epoch": 2.639225181598063, "grad_norm": 8.889333724975586, "learning_rate": 2.6984654010656667e-05, "loss": 0.1656, "num_input_tokens_seen": 939640, "step": 1090 }, { "epoch": 2.651331719128329, "grad_norm": 4.259967803955078, "learning_rate": 2.6773896844326125e-05, "loss": 0.2926, "num_input_tokens_seen": 943672, "step": 1095 }, { "epoch": 2.663438256658596, "grad_norm": 3.0391273498535156, "learning_rate": 2.656301289117561e-05, "loss": 0.1547, "num_input_tokens_seen": 947704, "step": 1100 }, { "epoch": 2.6755447941888617, "grad_norm": 9.067920684814453, "learning_rate": 2.6352017223843585e-05, "loss": 0.2428, "num_input_tokens_seen": 951928, "step": 1105 }, { "epoch": 2.6876513317191284, "grad_norm": 7.765347957611084, "learning_rate": 2.6140924922953125e-05, "loss": 0.1649, "num_input_tokens_seen": 956216, "step": 1110 }, { "epoch": 2.6997578692493946, "grad_norm": 1.6490931510925293, "learning_rate": 2.5929751076034058e-05, "loss": 0.1597, "num_input_tokens_seen": 960504, "step": 1115 }, { "epoch": 2.711864406779661, "grad_norm": 1.5548573732376099, "learning_rate": 2.571851077644461e-05, "loss": 0.1407, "num_input_tokens_seen": 965048, "step": 1120 }, { "epoch": 2.7239709443099276, "grad_norm": 5.526769161224365, "learning_rate": 2.5507219122292598e-05, "loss": 0.1667, "num_input_tokens_seen": 969208, "step": 1125 }, { "epoch": 2.736077481840194, "grad_norm": 5.792220115661621, "learning_rate": 2.529589121535636e-05, "loss": 0.1438, "num_input_tokens_seen": 973624, "step": 1130 }, { "epoch": 2.74818401937046, "grad_norm": 6.361023902893066, "learning_rate": 2.5084542160005335e-05, "loss": 0.2294, "num_input_tokens_seen": 977976, "step": 1135 }, { "epoch": 2.7602905569007263, "grad_norm": 1.0617471933364868, "learning_rate": 2.487318706212051e-05, "loss": 0.1964, "num_input_tokens_seen": 982200, "step": 1140 }, { "epoch": 2.7699757869249395, "eval_loss": 0.19318054616451263, "eval_runtime": 0.6508, "eval_samples_per_second": 563.894, "eval_steps_per_second": 70.679, "num_input_tokens_seen": 985592, "step": 1144 }, { "epoch": 2.7723970944309926, "grad_norm": 7.693630695343018, "learning_rate": 2.4661841028014785e-05, "loss": 0.203, "num_input_tokens_seen": 986488, "step": 1145 }, { "epoch": 2.7845036319612593, "grad_norm": 4.296042442321777, "learning_rate": 2.445051916335321e-05, "loss": 0.1983, "num_input_tokens_seen": 990456, "step": 1150 }, { "epoch": 2.7966101694915255, "grad_norm": 2.928414821624756, "learning_rate": 2.4239236572073352e-05, "loss": 0.1825, "num_input_tokens_seen": 994744, "step": 1155 }, { "epoch": 2.8087167070217918, "grad_norm": 2.411320686340332, "learning_rate": 2.4028008355305815e-05, "loss": 0.178, "num_input_tokens_seen": 999160, "step": 1160 }, { "epoch": 2.820823244552058, "grad_norm": 6.881911754608154, "learning_rate": 2.3816849610294783e-05, "loss": 0.1709, "num_input_tokens_seen": 1003256, "step": 1165 }, { "epoch": 2.8329297820823243, "grad_norm": 4.286351680755615, "learning_rate": 2.3605775429319115e-05, "loss": 0.1853, "num_input_tokens_seen": 1007480, "step": 1170 }, { "epoch": 2.845036319612591, "grad_norm": 3.7688863277435303, "learning_rate": 2.3394800898613535e-05, "loss": 0.1431, "num_input_tokens_seen": 1011896, "step": 1175 }, { "epoch": 2.857142857142857, "grad_norm": 3.717094898223877, "learning_rate": 2.318394109729041e-05, "loss": 0.2253, "num_input_tokens_seen": 1015992, "step": 1180 }, { "epoch": 2.8692493946731235, "grad_norm": 7.443727493286133, "learning_rate": 2.297321109626198e-05, "loss": 0.1686, "num_input_tokens_seen": 1020408, "step": 1185 }, { "epoch": 2.8813559322033897, "grad_norm": 12.574480056762695, "learning_rate": 2.27626259571632e-05, "loss": 0.1988, "num_input_tokens_seen": 1025016, "step": 1190 }, { "epoch": 2.893462469733656, "grad_norm": 9.311829566955566, "learning_rate": 2.2552200731275213e-05, "loss": 0.1682, "num_input_tokens_seen": 1029368, "step": 1195 }, { "epoch": 2.9055690072639226, "grad_norm": 4.659236431121826, "learning_rate": 2.2341950458449576e-05, "loss": 0.1918, "num_input_tokens_seen": 1033592, "step": 1200 }, { "epoch": 2.917675544794189, "grad_norm": 1.1926063299179077, "learning_rate": 2.213189016603333e-05, "loss": 0.2047, "num_input_tokens_seen": 1037688, "step": 1205 }, { "epoch": 2.929782082324455, "grad_norm": 1.54401433467865, "learning_rate": 2.1922034867794925e-05, "loss": 0.1686, "num_input_tokens_seen": 1041912, "step": 1210 }, { "epoch": 2.9418886198547214, "grad_norm": 6.956883430480957, "learning_rate": 2.1712399562851147e-05, "loss": 0.1663, "num_input_tokens_seen": 1046392, "step": 1215 }, { "epoch": 2.9539951573849876, "grad_norm": 6.875396728515625, "learning_rate": 2.150299923459505e-05, "loss": 0.1158, "num_input_tokens_seen": 1050616, "step": 1220 }, { "epoch": 2.9661016949152543, "grad_norm": 4.653652191162109, "learning_rate": 2.1293848849625065e-05, "loss": 0.1857, "num_input_tokens_seen": 1054840, "step": 1225 }, { "epoch": 2.9782082324455206, "grad_norm": 4.641164302825928, "learning_rate": 2.108496335667527e-05, "loss": 0.2051, "num_input_tokens_seen": 1058936, "step": 1230 }, { "epoch": 2.990314769975787, "grad_norm": 4.4205002784729, "learning_rate": 2.0876357685546944e-05, "loss": 0.137, "num_input_tokens_seen": 1063288, "step": 1235 }, { "epoch": 3.002421307506053, "grad_norm": 9.87366771697998, "learning_rate": 2.06680467460415e-05, "loss": 0.294, "num_input_tokens_seen": 1067392, "step": 1240 }, { "epoch": 3.0145278450363198, "grad_norm": 1.3809499740600586, "learning_rate": 2.0460045426894817e-05, "loss": 0.1436, "num_input_tokens_seen": 1071872, "step": 1245 }, { "epoch": 3.0217917675544794, "eval_loss": 0.20527909696102142, "eval_runtime": 0.667, "eval_samples_per_second": 550.187, "eval_steps_per_second": 68.961, "num_input_tokens_seen": 1074624, "step": 1248 }, { "epoch": 3.026634382566586, "grad_norm": 1.1184051036834717, "learning_rate": 2.0252368594713083e-05, "loss": 0.1503, "num_input_tokens_seen": 1076416, "step": 1250 }, { "epoch": 3.0387409200968523, "grad_norm": 3.941237211227417, "learning_rate": 2.004503109291023e-05, "loss": 0.156, "num_input_tokens_seen": 1080512, "step": 1255 }, { "epoch": 3.0508474576271185, "grad_norm": 2.0000264644622803, "learning_rate": 1.9838047740647026e-05, "loss": 0.1971, "num_input_tokens_seen": 1084608, "step": 1260 }, { "epoch": 3.062953995157385, "grad_norm": 11.35123062133789, "learning_rate": 1.9631433331771886e-05, "loss": 0.1813, "num_input_tokens_seen": 1089024, "step": 1265 }, { "epoch": 3.0750605326876514, "grad_norm": 2.1008217334747314, "learning_rate": 1.9425202633763513e-05, "loss": 0.133, "num_input_tokens_seen": 1093376, "step": 1270 }, { "epoch": 3.0871670702179177, "grad_norm": 5.499813556671143, "learning_rate": 1.9219370386675388e-05, "loss": 0.089, "num_input_tokens_seen": 1097728, "step": 1275 }, { "epoch": 3.099273607748184, "grad_norm": 8.502225875854492, "learning_rate": 1.901395130208229e-05, "loss": 0.2836, "num_input_tokens_seen": 1101888, "step": 1280 }, { "epoch": 3.11138014527845, "grad_norm": 14.45283031463623, "learning_rate": 1.880896006202876e-05, "loss": 0.1116, "num_input_tokens_seen": 1106176, "step": 1285 }, { "epoch": 3.123486682808717, "grad_norm": 3.364891767501831, "learning_rate": 1.860441131797977e-05, "loss": 0.1027, "num_input_tokens_seen": 1110272, "step": 1290 }, { "epoch": 3.135593220338983, "grad_norm": 8.516124725341797, "learning_rate": 1.8400319689773474e-05, "loss": 0.1582, "num_input_tokens_seen": 1114496, "step": 1295 }, { "epoch": 3.1476997578692494, "grad_norm": 11.724932670593262, "learning_rate": 1.8196699764576318e-05, "loss": 0.0408, "num_input_tokens_seen": 1118784, "step": 1300 }, { "epoch": 3.1598062953995156, "grad_norm": 8.753253936767578, "learning_rate": 1.7993566095840443e-05, "loss": 0.1234, "num_input_tokens_seen": 1123008, "step": 1305 }, { "epoch": 3.171912832929782, "grad_norm": 8.221136093139648, "learning_rate": 1.7790933202263434e-05, "loss": 0.2236, "num_input_tokens_seen": 1127424, "step": 1310 }, { "epoch": 3.1840193704600486, "grad_norm": 17.435853958129883, "learning_rate": 1.758881556675073e-05, "loss": 0.1958, "num_input_tokens_seen": 1131840, "step": 1315 }, { "epoch": 3.196125907990315, "grad_norm": 5.691689491271973, "learning_rate": 1.738722763538036e-05, "loss": 0.1238, "num_input_tokens_seen": 1136192, "step": 1320 }, { "epoch": 3.208232445520581, "grad_norm": 2.6163206100463867, "learning_rate": 1.7186183816370522e-05, "loss": 0.1027, "num_input_tokens_seen": 1140544, "step": 1325 }, { "epoch": 3.2203389830508473, "grad_norm": 5.7949724197387695, "learning_rate": 1.6985698479049702e-05, "loss": 0.0907, "num_input_tokens_seen": 1145280, "step": 1330 }, { "epoch": 3.232445520581114, "grad_norm": 5.007083892822266, "learning_rate": 1.6785785952829717e-05, "loss": 0.1037, "num_input_tokens_seen": 1149888, "step": 1335 }, { "epoch": 3.2445520581113803, "grad_norm": 12.367361068725586, "learning_rate": 1.6586460526181473e-05, "loss": 0.1776, "num_input_tokens_seen": 1153920, "step": 1340 }, { "epoch": 3.2566585956416465, "grad_norm": 16.06878089904785, "learning_rate": 1.6387736445613772e-05, "loss": 0.2125, "num_input_tokens_seen": 1158592, "step": 1345 }, { "epoch": 3.2687651331719128, "grad_norm": 7.7484588623046875, "learning_rate": 1.6189627914655008e-05, "loss": 0.2252, "num_input_tokens_seen": 1162816, "step": 1350 }, { "epoch": 3.2736077481840193, "eval_loss": 0.2091810256242752, "eval_runtime": 0.6785, "eval_samples_per_second": 540.886, "eval_steps_per_second": 67.795, "num_input_tokens_seen": 1164544, "step": 1352 }, { "epoch": 3.280871670702179, "grad_norm": 9.04961109161377, "learning_rate": 1.599214909283805e-05, "loss": 0.1163, "num_input_tokens_seen": 1167232, "step": 1355 }, { "epoch": 3.2929782082324457, "grad_norm": 3.317920446395874, "learning_rate": 1.579531409468815e-05, "loss": 0.1094, "num_input_tokens_seen": 1171648, "step": 1360 }, { "epoch": 3.305084745762712, "grad_norm": 8.250765800476074, "learning_rate": 1.5599136988714186e-05, "loss": 0.141, "num_input_tokens_seen": 1175808, "step": 1365 }, { "epoch": 3.317191283292978, "grad_norm": 5.985897541046143, "learning_rate": 1.5403631796403085e-05, "loss": 0.1296, "num_input_tokens_seen": 1180224, "step": 1370 }, { "epoch": 3.3292978208232444, "grad_norm": 4.8227314949035645, "learning_rate": 1.520881249121767e-05, "loss": 0.1375, "num_input_tokens_seen": 1184704, "step": 1375 }, { "epoch": 3.341404358353511, "grad_norm": 2.318727970123291, "learning_rate": 1.5014692997597962e-05, "loss": 0.1459, "num_input_tokens_seen": 1188992, "step": 1380 }, { "epoch": 3.3535108958837774, "grad_norm": 13.753244400024414, "learning_rate": 1.4821287189965866e-05, "loss": 0.1535, "num_input_tokens_seen": 1193408, "step": 1385 }, { "epoch": 3.3656174334140436, "grad_norm": 1.9978270530700684, "learning_rate": 1.4628608891733625e-05, "loss": 0.1246, "num_input_tokens_seen": 1197760, "step": 1390 }, { "epoch": 3.37772397094431, "grad_norm": 6.705835819244385, "learning_rate": 1.4436671874315722e-05, "loss": 0.0863, "num_input_tokens_seen": 1201792, "step": 1395 }, { "epoch": 3.389830508474576, "grad_norm": 7.748871326446533, "learning_rate": 1.4245489856144634e-05, "loss": 0.0968, "num_input_tokens_seen": 1205824, "step": 1400 }, { "epoch": 3.401937046004843, "grad_norm": 4.018503189086914, "learning_rate": 1.4055076501690311e-05, "loss": 0.0749, "num_input_tokens_seen": 1210240, "step": 1405 }, { "epoch": 3.414043583535109, "grad_norm": 4.750000953674316, "learning_rate": 1.3865445420483526e-05, "loss": 0.09, "num_input_tokens_seen": 1214464, "step": 1410 }, { "epoch": 3.4261501210653753, "grad_norm": 9.335100173950195, "learning_rate": 1.367661016614315e-05, "loss": 0.1746, "num_input_tokens_seen": 1218752, "step": 1415 }, { "epoch": 3.4382566585956416, "grad_norm": 4.242533206939697, "learning_rate": 1.3488584235407439e-05, "loss": 0.0826, "num_input_tokens_seen": 1223168, "step": 1420 }, { "epoch": 3.450363196125908, "grad_norm": 1.9875125885009766, "learning_rate": 1.3301381067169366e-05, "loss": 0.1469, "num_input_tokens_seen": 1227328, "step": 1425 }, { "epoch": 3.4624697336561745, "grad_norm": 10.304492950439453, "learning_rate": 1.3115014041516089e-05, "loss": 0.1454, "num_input_tokens_seen": 1231360, "step": 1430 }, { "epoch": 3.4745762711864407, "grad_norm": 2.467794418334961, "learning_rate": 1.2929496478772635e-05, "loss": 0.0455, "num_input_tokens_seen": 1235456, "step": 1435 }, { "epoch": 3.486682808716707, "grad_norm": 5.000001907348633, "learning_rate": 1.2744841638549842e-05, "loss": 0.106, "num_input_tokens_seen": 1239616, "step": 1440 }, { "epoch": 3.4987893462469732, "grad_norm": 0.32030388712882996, "learning_rate": 1.2561062718796662e-05, "loss": 0.0763, "num_input_tokens_seen": 1243968, "step": 1445 }, { "epoch": 3.5108958837772395, "grad_norm": 1.8182225227355957, "learning_rate": 1.2378172854856831e-05, "loss": 0.0978, "num_input_tokens_seen": 1248128, "step": 1450 }, { "epoch": 3.523002421307506, "grad_norm": 5.48933219909668, "learning_rate": 1.2196185118530063e-05, "loss": 0.1328, "num_input_tokens_seen": 1252288, "step": 1455 }, { "epoch": 3.5254237288135593, "eval_loss": 0.3491859436035156, "eval_runtime": 0.6747, "eval_samples_per_second": 543.942, "eval_steps_per_second": 68.178, "num_input_tokens_seen": 1253248, "step": 1456 }, { "epoch": 3.5351089588377724, "grad_norm": 1.86709725856781, "learning_rate": 1.2015112517137744e-05, "loss": 0.1139, "num_input_tokens_seen": 1256640, "step": 1460 }, { "epoch": 3.5472154963680387, "grad_norm": 10.584001541137695, "learning_rate": 1.183496799259326e-05, "loss": 0.1247, "num_input_tokens_seen": 1261440, "step": 1465 }, { "epoch": 3.559322033898305, "grad_norm": 0.81782066822052, "learning_rate": 1.1655764420476988e-05, "loss": 0.0777, "num_input_tokens_seen": 1265664, "step": 1470 }, { "epoch": 3.571428571428571, "grad_norm": 4.23323917388916, "learning_rate": 1.1477514609116039e-05, "loss": 0.0848, "num_input_tokens_seen": 1270016, "step": 1475 }, { "epoch": 3.583535108958838, "grad_norm": 4.22898006439209, "learning_rate": 1.1300231298668786e-05, "loss": 0.1263, "num_input_tokens_seen": 1274560, "step": 1480 }, { "epoch": 3.595641646489104, "grad_norm": 7.585851669311523, "learning_rate": 1.1123927160214289e-05, "loss": 0.1362, "num_input_tokens_seen": 1278976, "step": 1485 }, { "epoch": 3.6077481840193704, "grad_norm": 2.0685174465179443, "learning_rate": 1.0948614794846668e-05, "loss": 0.1068, "num_input_tokens_seen": 1283200, "step": 1490 }, { "epoch": 3.619854721549637, "grad_norm": 4.345080852508545, "learning_rate": 1.0774306732774414e-05, "loss": 0.2069, "num_input_tokens_seen": 1287296, "step": 1495 }, { "epoch": 3.6319612590799033, "grad_norm": 15.997807502746582, "learning_rate": 1.0601015432424819e-05, "loss": 0.1368, "num_input_tokens_seen": 1291712, "step": 1500 }, { "epoch": 3.6440677966101696, "grad_norm": 6.712691783905029, "learning_rate": 1.042875327955356e-05, "loss": 0.1959, "num_input_tokens_seen": 1295936, "step": 1505 }, { "epoch": 3.656174334140436, "grad_norm": 5.0442962646484375, "learning_rate": 1.0257532586359422e-05, "loss": 0.0932, "num_input_tokens_seen": 1300608, "step": 1510 }, { "epoch": 3.668280871670702, "grad_norm": 5.707069396972656, "learning_rate": 1.0087365590604289e-05, "loss": 0.1347, "num_input_tokens_seen": 1305024, "step": 1515 }, { "epoch": 3.6803874092009687, "grad_norm": 2.964393138885498, "learning_rate": 9.918264454738504e-06, "loss": 0.1287, "num_input_tokens_seen": 1309376, "step": 1520 }, { "epoch": 3.692493946731235, "grad_norm": 10.144442558288574, "learning_rate": 9.75024126503153e-06, "loss": 0.0818, "num_input_tokens_seen": 1313664, "step": 1525 }, { "epoch": 3.7046004842615012, "grad_norm": 8.710615158081055, "learning_rate": 9.583308030708135e-06, "loss": 0.0869, "num_input_tokens_seen": 1318080, "step": 1530 }, { "epoch": 3.7167070217917675, "grad_norm": 2.1846084594726562, "learning_rate": 9.417476683090007e-06, "loss": 0.0893, "num_input_tokens_seen": 1322432, "step": 1535 }, { "epoch": 3.7288135593220337, "grad_norm": 3.826754570007324, "learning_rate": 9.252759074743034e-06, "loss": 0.1556, "num_input_tokens_seen": 1326848, "step": 1540 }, { "epoch": 3.7409200968523004, "grad_norm": 10.382698059082031, "learning_rate": 9.08916697863014e-06, "loss": 0.0774, "num_input_tokens_seen": 1331328, "step": 1545 }, { "epoch": 3.7530266343825667, "grad_norm": 7.099722862243652, "learning_rate": 8.926712087269801e-06, "loss": 0.1253, "num_input_tokens_seen": 1335424, "step": 1550 }, { "epoch": 3.765133171912833, "grad_norm": 5.015311241149902, "learning_rate": 8.765406011900368e-06, "loss": 0.1276, "num_input_tokens_seen": 1339712, "step": 1555 }, { "epoch": 3.777239709443099, "grad_norm": 4.82669734954834, "learning_rate": 8.605260281650152e-06, "loss": 0.1842, "num_input_tokens_seen": 1344000, "step": 1560 }, { "epoch": 3.777239709443099, "eval_loss": 0.21899566054344177, "eval_runtime": 0.6796, "eval_samples_per_second": 539.994, "eval_steps_per_second": 67.683, "num_input_tokens_seen": 1344000, "step": 1560 }, { "epoch": 3.7893462469733654, "grad_norm": 3.010295867919922, "learning_rate": 8.446286342713419e-06, "loss": 0.0881, "num_input_tokens_seen": 1348224, "step": 1565 }, { "epoch": 3.801452784503632, "grad_norm": 2.3779475688934326, "learning_rate": 8.288495557532241e-06, "loss": 0.1348, "num_input_tokens_seen": 1352576, "step": 1570 }, { "epoch": 3.8135593220338984, "grad_norm": 6.911816120147705, "learning_rate": 8.131899203984463e-06, "loss": 0.134, "num_input_tokens_seen": 1356864, "step": 1575 }, { "epoch": 3.8256658595641646, "grad_norm": 9.250137329101562, "learning_rate": 7.976508474577548e-06, "loss": 0.1141, "num_input_tokens_seen": 1361152, "step": 1580 }, { "epoch": 3.837772397094431, "grad_norm": 4.86985445022583, "learning_rate": 7.822334475648654e-06, "loss": 0.0705, "num_input_tokens_seen": 1365376, "step": 1585 }, { "epoch": 3.849878934624697, "grad_norm": 0.7732688188552856, "learning_rate": 7.669388226570809e-06, "loss": 0.0907, "num_input_tokens_seen": 1369728, "step": 1590 }, { "epoch": 3.861985472154964, "grad_norm": 5.062341213226318, "learning_rate": 7.517680658965329e-06, "loss": 0.1261, "num_input_tokens_seen": 1374144, "step": 1595 }, { "epoch": 3.87409200968523, "grad_norm": 8.762838363647461, "learning_rate": 7.367222615920477e-06, "loss": 0.1084, "num_input_tokens_seen": 1378368, "step": 1600 }, { "epoch": 3.8861985472154963, "grad_norm": 8.905739784240723, "learning_rate": 7.2180248512164896e-06, "loss": 0.0813, "num_input_tokens_seen": 1382464, "step": 1605 }, { "epoch": 3.898305084745763, "grad_norm": 0.5714547038078308, "learning_rate": 7.070098028556948e-06, "loss": 0.0805, "num_input_tokens_seen": 1386880, "step": 1610 }, { "epoch": 3.910411622276029, "grad_norm": 8.167064666748047, "learning_rate": 6.923452720806611e-06, "loss": 0.1924, "num_input_tokens_seen": 1391296, "step": 1615 }, { "epoch": 3.9225181598062955, "grad_norm": 3.438431739807129, "learning_rate": 6.778099409235739e-06, "loss": 0.0609, "num_input_tokens_seen": 1395456, "step": 1620 }, { "epoch": 3.9346246973365617, "grad_norm": 7.784511089324951, "learning_rate": 6.634048482770946e-06, "loss": 0.0932, "num_input_tokens_seen": 1399616, "step": 1625 }, { "epoch": 3.946731234866828, "grad_norm": 13.272894859313965, "learning_rate": 6.491310237252679e-06, "loss": 0.1241, "num_input_tokens_seen": 1403712, "step": 1630 }, { "epoch": 3.9588377723970947, "grad_norm": 12.38925838470459, "learning_rate": 6.349894874699344e-06, "loss": 0.1232, "num_input_tokens_seen": 1408128, "step": 1635 }, { "epoch": 3.970944309927361, "grad_norm": 5.343148231506348, "learning_rate": 6.209812502578114e-06, "loss": 0.0787, "num_input_tokens_seen": 1412480, "step": 1640 }, { "epoch": 3.983050847457627, "grad_norm": 1.2886254787445068, "learning_rate": 6.071073133082492e-06, "loss": 0.0494, "num_input_tokens_seen": 1416704, "step": 1645 }, { "epoch": 3.9951573849878934, "grad_norm": 10.778816223144531, "learning_rate": 5.933686682416758e-06, "loss": 0.0969, "num_input_tokens_seen": 1421120, "step": 1650 }, { "epoch": 4.00726392251816, "grad_norm": 0.2529144883155823, "learning_rate": 5.797662970087184e-06, "loss": 0.09, "num_input_tokens_seen": 1424944, "step": 1655 }, { "epoch": 4.019370460048426, "grad_norm": 6.2160162925720215, "learning_rate": 5.663011718200201e-06, "loss": 0.0897, "num_input_tokens_seen": 1429296, "step": 1660 }, { "epoch": 4.0290556900726395, "eval_loss": 0.2532218098640442, "eval_runtime": 0.672, "eval_samples_per_second": 546.104, "eval_steps_per_second": 68.449, "num_input_tokens_seen": 1432880, "step": 1664 }, { "epoch": 4.031476997578692, "grad_norm": 0.9374585747718811, "learning_rate": 5.529742550767544e-06, "loss": 0.0316, "num_input_tokens_seen": 1433776, "step": 1665 }, { "epoch": 4.043583535108959, "grad_norm": 1.9009536504745483, "learning_rate": 5.397864993018367e-06, "loss": 0.0492, "num_input_tokens_seen": 1438000, "step": 1670 }, { "epoch": 4.0556900726392255, "grad_norm": 7.239864349365234, "learning_rate": 5.267388470718449e-06, "loss": 0.029, "num_input_tokens_seen": 1442352, "step": 1675 }, { "epoch": 4.067796610169491, "grad_norm": 2.098872661590576, "learning_rate": 5.138322309496504e-06, "loss": 0.052, "num_input_tokens_seen": 1446704, "step": 1680 }, { "epoch": 4.079903147699758, "grad_norm": 1.4036399126052856, "learning_rate": 5.010675734177631e-06, "loss": 0.0469, "num_input_tokens_seen": 1450864, "step": 1685 }, { "epoch": 4.092009685230024, "grad_norm": 11.33265495300293, "learning_rate": 4.884457868124001e-06, "loss": 0.0316, "num_input_tokens_seen": 1455088, "step": 1690 }, { "epoch": 4.1041162227602905, "grad_norm": 1.9709900617599487, "learning_rate": 4.759677732582782e-06, "loss": 0.0228, "num_input_tokens_seen": 1459376, "step": 1695 }, { "epoch": 4.116222760290557, "grad_norm": 0.01155536063015461, "learning_rate": 4.636344246041321e-06, "loss": 0.0529, "num_input_tokens_seen": 1463600, "step": 1700 }, { "epoch": 4.128329297820823, "grad_norm": 19.08058738708496, "learning_rate": 4.514466223589753e-06, "loss": 0.0565, "num_input_tokens_seen": 1468080, "step": 1705 }, { "epoch": 4.14043583535109, "grad_norm": 1.3092641830444336, "learning_rate": 4.3940523762909135e-06, "loss": 0.0695, "num_input_tokens_seen": 1472624, "step": 1710 }, { "epoch": 4.1525423728813555, "grad_norm": 0.055544547736644745, "learning_rate": 4.275111310557758e-06, "loss": 0.0511, "num_input_tokens_seen": 1477040, "step": 1715 }, { "epoch": 4.164648910411622, "grad_norm": 0.16590368747711182, "learning_rate": 4.1576515275382226e-06, "loss": 0.0311, "num_input_tokens_seen": 1481328, "step": 1720 }, { "epoch": 4.176755447941889, "grad_norm": 0.1331050992012024, "learning_rate": 4.0416814225076035e-06, "loss": 0.0394, "num_input_tokens_seen": 1485808, "step": 1725 }, { "epoch": 4.188861985472155, "grad_norm": 1.6521071195602417, "learning_rate": 3.9272092842685345e-06, "loss": 0.0255, "num_input_tokens_seen": 1490160, "step": 1730 }, { "epoch": 4.200968523002421, "grad_norm": 0.42354145646095276, "learning_rate": 3.814243294558542e-06, "loss": 0.0073, "num_input_tokens_seen": 1494512, "step": 1735 }, { "epoch": 4.213075060532688, "grad_norm": 2.2178032398223877, "learning_rate": 3.702791527465274e-06, "loss": 0.0562, "num_input_tokens_seen": 1498480, "step": 1740 }, { "epoch": 4.225181598062954, "grad_norm": 13.911809921264648, "learning_rate": 3.592861948849416e-06, "loss": 0.0463, "num_input_tokens_seen": 1502768, "step": 1745 }, { "epoch": 4.237288135593221, "grad_norm": 0.01323059480637312, "learning_rate": 3.484462415775333e-06, "loss": 0.0429, "num_input_tokens_seen": 1506992, "step": 1750 }, { "epoch": 4.249394673123486, "grad_norm": 0.1997198611497879, "learning_rate": 3.377600675949527e-06, "loss": 0.0035, "num_input_tokens_seen": 1511472, "step": 1755 }, { "epoch": 4.261501210653753, "grad_norm": 9.309453010559082, "learning_rate": 3.272284367166825e-06, "loss": 0.0395, "num_input_tokens_seen": 1515824, "step": 1760 }, { "epoch": 4.27360774818402, "grad_norm": 1.514168620109558, "learning_rate": 3.1685210167645335e-06, "loss": 0.0337, "num_input_tokens_seen": 1520176, "step": 1765 }, { "epoch": 4.280871670702179, "eval_loss": 0.4314914643764496, "eval_runtime": 0.8115, "eval_samples_per_second": 452.254, "eval_steps_per_second": 56.686, "num_input_tokens_seen": 1522544, "step": 1768 }, { "epoch": 4.285714285714286, "grad_norm": 0.23039455711841583, "learning_rate": 3.0663180410843982e-06, "loss": 0.008, "num_input_tokens_seen": 1524336, "step": 1770 }, { "epoch": 4.297820823244552, "grad_norm": 0.17007200419902802, "learning_rate": 2.9656827449425494e-06, "loss": 0.1379, "num_input_tokens_seen": 1528560, "step": 1775 }, { "epoch": 4.309927360774818, "grad_norm": 5.092523097991943, "learning_rate": 2.86662232110739e-06, "loss": 0.0391, "num_input_tokens_seen": 1532720, "step": 1780 }, { "epoch": 4.322033898305085, "grad_norm": 8.858246803283691, "learning_rate": 2.7691438497855134e-06, "loss": 0.0481, "num_input_tokens_seen": 1536944, "step": 1785 }, { "epoch": 4.3341404358353515, "grad_norm": 0.16653333604335785, "learning_rate": 2.673254298115646e-06, "loss": 0.0365, "num_input_tokens_seen": 1541168, "step": 1790 }, { "epoch": 4.346246973365617, "grad_norm": 0.057360630482435226, "learning_rate": 2.5789605196706674e-06, "loss": 0.0094, "num_input_tokens_seen": 1545456, "step": 1795 }, { "epoch": 4.358353510895884, "grad_norm": 18.321725845336914, "learning_rate": 2.4862692539677906e-06, "loss": 0.0798, "num_input_tokens_seen": 1549872, "step": 1800 }, { "epoch": 4.37046004842615, "grad_norm": 0.05611402168869972, "learning_rate": 2.3951871259868503e-06, "loss": 0.113, "num_input_tokens_seen": 1554288, "step": 1805 }, { "epoch": 4.3825665859564165, "grad_norm": 7.665430068969727, "learning_rate": 2.3057206456967905e-06, "loss": 0.1113, "num_input_tokens_seen": 1558384, "step": 1810 }, { "epoch": 4.394673123486683, "grad_norm": 9.430697441101074, "learning_rate": 2.217876207590375e-06, "loss": 0.0523, "num_input_tokens_seen": 1562544, "step": 1815 }, { "epoch": 4.406779661016949, "grad_norm": 0.0549406073987484, "learning_rate": 2.131660090227139e-06, "loss": 0.0659, "num_input_tokens_seen": 1567216, "step": 1820 }, { "epoch": 4.418886198547216, "grad_norm": 0.08962647616863251, "learning_rate": 2.0470784557846652e-06, "loss": 0.0756, "num_input_tokens_seen": 1571568, "step": 1825 }, { "epoch": 4.4309927360774815, "grad_norm": 0.09955435991287231, "learning_rate": 1.964137349618114e-06, "loss": 0.0018, "num_input_tokens_seen": 1575792, "step": 1830 }, { "epoch": 4.443099273607748, "grad_norm": 0.7829030156135559, "learning_rate": 1.8828426998281689e-06, "loss": 0.0419, "num_input_tokens_seen": 1580080, "step": 1835 }, { "epoch": 4.455205811138015, "grad_norm": 3.134791851043701, "learning_rate": 1.8032003168373306e-06, "loss": 0.0692, "num_input_tokens_seen": 1584112, "step": 1840 }, { "epoch": 4.467312348668281, "grad_norm": 1.7574918270111084, "learning_rate": 1.7252158929746131e-06, "loss": 0.0456, "num_input_tokens_seen": 1588400, "step": 1845 }, { "epoch": 4.479418886198547, "grad_norm": 27.999475479125977, "learning_rate": 1.6488950020686955e-06, "loss": 0.0504, "num_input_tokens_seen": 1592816, "step": 1850 }, { "epoch": 4.491525423728813, "grad_norm": 0.16101866960525513, "learning_rate": 1.5742430990495466e-06, "loss": 0.0573, "num_input_tokens_seen": 1597296, "step": 1855 }, { "epoch": 4.50363196125908, "grad_norm": 0.11599753797054291, "learning_rate": 1.5012655195585368e-06, "loss": 0.0293, "num_input_tokens_seen": 1601648, "step": 1860 }, { "epoch": 4.5157384987893465, "grad_norm": 6.17954683303833, "learning_rate": 1.4299674795670764e-06, "loss": 0.1156, "num_input_tokens_seen": 1605936, "step": 1865 }, { "epoch": 4.527845036319612, "grad_norm": 1.049548625946045, "learning_rate": 1.360354075003828e-06, "loss": 0.126, "num_input_tokens_seen": 1610096, "step": 1870 }, { "epoch": 4.532687651331719, "eval_loss": 0.42201921343803406, "eval_runtime": 0.693, "eval_samples_per_second": 529.558, "eval_steps_per_second": 66.375, "num_input_tokens_seen": 1611760, "step": 1872 }, { "epoch": 4.539951573849879, "grad_norm": 13.409820556640625, "learning_rate": 1.2924302813904582e-06, "loss": 0.0436, "num_input_tokens_seen": 1614384, "step": 1875 }, { "epoch": 4.552058111380145, "grad_norm": 3.9212989807128906, "learning_rate": 1.226200953486037e-06, "loss": 0.0591, "num_input_tokens_seen": 1618800, "step": 1880 }, { "epoch": 4.5641646489104115, "grad_norm": 0.7789947986602783, "learning_rate": 1.1616708249400449e-06, "loss": 0.0027, "num_input_tokens_seen": 1622960, "step": 1885 }, { "epoch": 4.576271186440678, "grad_norm": 16.51002311706543, "learning_rate": 1.0988445079540388e-06, "loss": 0.037, "num_input_tokens_seen": 1627056, "step": 1890 }, { "epoch": 4.588377723970944, "grad_norm": 0.03825072944164276, "learning_rate": 1.0377264929520125e-06, "loss": 0.0205, "num_input_tokens_seen": 1631408, "step": 1895 }, { "epoch": 4.600484261501211, "grad_norm": 13.03893756866455, "learning_rate": 9.783211482594285e-07, "loss": 0.0687, "num_input_tokens_seen": 1635888, "step": 1900 }, { "epoch": 4.6125907990314765, "grad_norm": 0.19233529269695282, "learning_rate": 9.206327197910203e-07, "loss": 0.0049, "num_input_tokens_seen": 1640176, "step": 1905 }, { "epoch": 4.624697336561743, "grad_norm": 9.149880409240723, "learning_rate": 8.646653307473079e-07, "loss": 0.056, "num_input_tokens_seen": 1644528, "step": 1910 }, { "epoch": 4.63680387409201, "grad_norm": 0.09057964384555817, "learning_rate": 8.10422981319911e-07, "loss": 0.002, "num_input_tokens_seen": 1649264, "step": 1915 }, { "epoch": 4.648910411622276, "grad_norm": 0.645796537399292, "learning_rate": 7.579095484056192e-07, "loss": 0.0111, "num_input_tokens_seen": 1653808, "step": 1920 }, { "epoch": 4.661016949152542, "grad_norm": 0.02393440343439579, "learning_rate": 7.07128785329314e-07, "loss": 0.0023, "num_input_tokens_seen": 1658288, "step": 1925 }, { "epoch": 4.673123486682809, "grad_norm": 0.03354793041944504, "learning_rate": 6.580843215757082e-07, "loss": 0.0228, "num_input_tokens_seen": 1662576, "step": 1930 }, { "epoch": 4.685230024213075, "grad_norm": 1.0874401330947876, "learning_rate": 6.107796625299117e-07, "loss": 0.0221, "num_input_tokens_seen": 1667056, "step": 1935 }, { "epoch": 4.697336561743342, "grad_norm": 0.94743412733078, "learning_rate": 5.652181892269181e-07, "loss": 0.0733, "num_input_tokens_seen": 1671536, "step": 1940 }, { "epoch": 4.709443099273607, "grad_norm": 0.04832937568426132, "learning_rate": 5.214031581099149e-07, "loss": 0.0023, "num_input_tokens_seen": 1675888, "step": 1945 }, { "epoch": 4.721549636803874, "grad_norm": 12.764242172241211, "learning_rate": 4.793377007975719e-07, "loss": 0.0341, "num_input_tokens_seen": 1680176, "step": 1950 }, { "epoch": 4.733656174334141, "grad_norm": 6.990570068359375, "learning_rate": 4.3902482386018186e-07, "loss": 0.0568, "num_input_tokens_seen": 1684400, "step": 1955 }, { "epoch": 4.745762711864407, "grad_norm": 26.958158493041992, "learning_rate": 4.004674086047905e-07, "loss": 0.1211, "num_input_tokens_seen": 1688816, "step": 1960 }, { "epoch": 4.757869249394673, "grad_norm": 1.086872935295105, "learning_rate": 3.636682108692502e-07, "loss": 0.0408, "num_input_tokens_seen": 1693360, "step": 1965 }, { "epoch": 4.76997578692494, "grad_norm": 15.128409385681152, "learning_rate": 3.2862986082524416e-07, "loss": 0.0647, "num_input_tokens_seen": 1697584, "step": 1970 }, { "epoch": 4.782082324455206, "grad_norm": 7.18263053894043, "learning_rate": 2.953548627903202e-07, "loss": 0.0336, "num_input_tokens_seen": 1702000, "step": 1975 }, { "epoch": 4.784503631961259, "eval_loss": 0.4348176121711731, "eval_runtime": 0.6821, "eval_samples_per_second": 538.039, "eval_steps_per_second": 67.438, "num_input_tokens_seen": 1702832, "step": 1976 }, { "epoch": 4.7941888619854724, "grad_norm": 0.357972115278244, "learning_rate": 2.6384559504886166e-07, "loss": 0.1448, "num_input_tokens_seen": 1706416, "step": 1980 }, { "epoch": 4.806295399515738, "grad_norm": 5.933152198791504, "learning_rate": 2.3410430968214824e-07, "loss": 0.0163, "num_input_tokens_seen": 1710960, "step": 1985 }, { "epoch": 4.818401937046005, "grad_norm": 21.378908157348633, "learning_rate": 2.0613313240735454e-07, "loss": 0.1048, "num_input_tokens_seen": 1715440, "step": 1990 }, { "epoch": 4.830508474576272, "grad_norm": 0.03769972547888756, "learning_rate": 1.7993406242563238e-07, "loss": 0.0295, "num_input_tokens_seen": 1719728, "step": 1995 }, { "epoch": 4.842615012106537, "grad_norm": 0.04536456614732742, "learning_rate": 1.5550897227922523e-07, "loss": 0.0007, "num_input_tokens_seen": 1724272, "step": 2000 }, { "epoch": 4.854721549636804, "grad_norm": 12.351763725280762, "learning_rate": 1.3285960771761697e-07, "loss": 0.064, "num_input_tokens_seen": 1728560, "step": 2005 }, { "epoch": 4.86682808716707, "grad_norm": 11.032571792602539, "learning_rate": 1.119875875727705e-07, "loss": 0.0289, "num_input_tokens_seen": 1733104, "step": 2010 }, { "epoch": 4.878934624697337, "grad_norm": 21.032617568969727, "learning_rate": 9.289440364341485e-08, "loss": 0.0127, "num_input_tokens_seen": 1737264, "step": 2015 }, { "epoch": 4.891041162227603, "grad_norm": 3.019296169281006, "learning_rate": 7.558142058842754e-08, "loss": 0.0664, "num_input_tokens_seen": 1741424, "step": 2020 }, { "epoch": 4.903147699757869, "grad_norm": 0.06446848809719086, "learning_rate": 6.004987582929055e-08, "loss": 0.0657, "num_input_tokens_seen": 1745648, "step": 2025 }, { "epoch": 4.915254237288136, "grad_norm": 15.37187385559082, "learning_rate": 4.63008794616554e-08, "loss": 0.045, "num_input_tokens_seen": 1749872, "step": 2030 }, { "epoch": 4.927360774818402, "grad_norm": 0.0873086079955101, "learning_rate": 3.433541417599551e-08, "loss": 0.0431, "num_input_tokens_seen": 1754288, "step": 2035 }, { "epoch": 4.939467312348668, "grad_norm": 0.19230781495571136, "learning_rate": 2.4154335187365207e-08, "loss": 0.0332, "num_input_tokens_seen": 1758640, "step": 2040 }, { "epoch": 4.951573849878935, "grad_norm": 0.0936799943447113, "learning_rate": 1.5758370174284722e-08, "loss": 0.0602, "num_input_tokens_seen": 1762928, "step": 2045 }, { "epoch": 4.963680387409201, "grad_norm": 0.07743958383798599, "learning_rate": 9.14811922672898e-09, "loss": 0.0118, "num_input_tokens_seen": 1767344, "step": 2050 }, { "epoch": 4.9757869249394675, "grad_norm": 0.36676138639450073, "learning_rate": 4.324054803223065e-09, "loss": 0.0392, "num_input_tokens_seen": 1771632, "step": 2055 }, { "epoch": 4.987893462469733, "grad_norm": 11.666873931884766, "learning_rate": 1.286521697091425e-09, "loss": 0.0333, "num_input_tokens_seen": 1775728, "step": 2060 }, { "epoch": 5.0, "grad_norm": 0.10359911620616913, "learning_rate": 3.5737011805370145e-11, "loss": 0.0653, "num_input_tokens_seen": 1780000, "step": 2065 }, { "epoch": 5.0, "num_input_tokens_seen": 1780000, "step": 2065, "total_flos": 1.039320047616e+16, "train_loss": 0.16683834154997698, "train_runtime": 1017.6301, "train_samples_per_second": 16.219, "train_steps_per_second": 2.029 } ], "logging_steps": 5, "max_steps": 2065, "num_input_tokens_seen": 1780000, "num_train_epochs": 5, "save_steps": 104, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.039320047616e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }