{ "best_global_step": 71, "best_metric": 0.14072927832603455, "best_model_checkpoint": "saves_bts_preliminary/base/llama-3.2-1b-instruct/train_rte_42_1774791065/checkpoint-71", "epoch": 5.0, "eval_steps": 71, "global_step": 1405, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.017793594306049824, "grad_norm": 100.99552917480469, "learning_rate": 1.4184397163120568e-06, "loss": 0.6704, "num_input_tokens_seen": 7872, "step": 5 }, { "epoch": 0.03558718861209965, "grad_norm": 94.50279235839844, "learning_rate": 3.1914893617021277e-06, "loss": 0.2548, "num_input_tokens_seen": 14784, "step": 10 }, { "epoch": 0.05338078291814947, "grad_norm": 112.95467376708984, "learning_rate": 4.964539007092199e-06, "loss": 0.9227, "num_input_tokens_seen": 23424, "step": 15 }, { "epoch": 0.0711743772241993, "grad_norm": 4.330768585205078, "learning_rate": 6.73758865248227e-06, "loss": 0.1819, "num_input_tokens_seen": 29824, "step": 20 }, { "epoch": 0.08896797153024912, "grad_norm": 5.050032138824463, "learning_rate": 8.510638297872341e-06, "loss": 0.228, "num_input_tokens_seen": 37824, "step": 25 }, { "epoch": 0.10676156583629894, "grad_norm": 12.993303298950195, "learning_rate": 1.0283687943262411e-05, "loss": 0.1572, "num_input_tokens_seen": 44608, "step": 30 }, { "epoch": 0.12455516014234876, "grad_norm": 17.3212947845459, "learning_rate": 1.2056737588652483e-05, "loss": 0.1609, "num_input_tokens_seen": 51968, "step": 35 }, { "epoch": 0.1423487544483986, "grad_norm": 15.35787296295166, "learning_rate": 1.3829787234042554e-05, "loss": 0.2143, "num_input_tokens_seen": 59456, "step": 40 }, { "epoch": 0.1601423487544484, "grad_norm": 31.886463165283203, "learning_rate": 1.5602836879432626e-05, "loss": 0.2034, "num_input_tokens_seen": 66496, "step": 45 }, { "epoch": 0.17793594306049823, "grad_norm": 18.710391998291016, "learning_rate": 1.7375886524822697e-05, "loss": 0.2702, "num_input_tokens_seen": 73408, "step": 50 }, { "epoch": 0.19572953736654805, "grad_norm": 23.454370498657227, "learning_rate": 1.9148936170212766e-05, "loss": 0.1793, "num_input_tokens_seen": 80576, "step": 55 }, { "epoch": 0.21352313167259787, "grad_norm": 2.1824188232421875, "learning_rate": 2.0921985815602837e-05, "loss": 0.161, "num_input_tokens_seen": 88256, "step": 60 }, { "epoch": 0.2313167259786477, "grad_norm": 4.793557167053223, "learning_rate": 2.269503546099291e-05, "loss": 0.1808, "num_input_tokens_seen": 96256, "step": 65 }, { "epoch": 0.2491103202846975, "grad_norm": 27.524141311645508, "learning_rate": 2.446808510638298e-05, "loss": 0.2508, "num_input_tokens_seen": 103424, "step": 70 }, { "epoch": 0.2526690391459075, "eval_loss": 0.14072927832603455, "eval_runtime": 0.6083, "eval_samples_per_second": 409.332, "eval_steps_per_second": 52.605, "num_input_tokens_seen": 105024, "step": 71 }, { "epoch": 0.2669039145907473, "grad_norm": 7.07442045211792, "learning_rate": 2.624113475177305e-05, "loss": 0.143, "num_input_tokens_seen": 110528, "step": 75 }, { "epoch": 0.2846975088967972, "grad_norm": 16.91026496887207, "learning_rate": 2.8014184397163124e-05, "loss": 0.2326, "num_input_tokens_seen": 117440, "step": 80 }, { "epoch": 0.302491103202847, "grad_norm": 15.627188682556152, "learning_rate": 2.9787234042553192e-05, "loss": 0.2053, "num_input_tokens_seen": 125504, "step": 85 }, { "epoch": 0.3202846975088968, "grad_norm": 24.881179809570312, "learning_rate": 3.156028368794326e-05, "loss": 0.2409, "num_input_tokens_seen": 132352, "step": 90 }, { "epoch": 0.33807829181494664, "grad_norm": 19.5406494140625, "learning_rate": 3.3333333333333335e-05, "loss": 0.2063, "num_input_tokens_seen": 139200, "step": 95 }, { "epoch": 0.35587188612099646, "grad_norm": 23.64601707458496, "learning_rate": 3.5106382978723407e-05, "loss": 0.244, "num_input_tokens_seen": 147904, "step": 100 }, { "epoch": 0.3736654804270463, "grad_norm": 13.374123573303223, "learning_rate": 3.687943262411347e-05, "loss": 0.183, "num_input_tokens_seen": 154240, "step": 105 }, { "epoch": 0.3914590747330961, "grad_norm": 7.41645622253418, "learning_rate": 3.865248226950355e-05, "loss": 0.1615, "num_input_tokens_seen": 161472, "step": 110 }, { "epoch": 0.4092526690391459, "grad_norm": 11.39084243774414, "learning_rate": 4.0425531914893614e-05, "loss": 0.1703, "num_input_tokens_seen": 168192, "step": 115 }, { "epoch": 0.42704626334519574, "grad_norm": 11.221735000610352, "learning_rate": 4.219858156028369e-05, "loss": 0.246, "num_input_tokens_seen": 174656, "step": 120 }, { "epoch": 0.44483985765124556, "grad_norm": 10.728532791137695, "learning_rate": 4.3971631205673764e-05, "loss": 0.1665, "num_input_tokens_seen": 181632, "step": 125 }, { "epoch": 0.4626334519572954, "grad_norm": 14.208120346069336, "learning_rate": 4.574468085106383e-05, "loss": 0.1695, "num_input_tokens_seen": 191488, "step": 130 }, { "epoch": 0.4804270462633452, "grad_norm": 1.208547830581665, "learning_rate": 4.751773049645391e-05, "loss": 0.1764, "num_input_tokens_seen": 198848, "step": 135 }, { "epoch": 0.498220640569395, "grad_norm": 19.034914016723633, "learning_rate": 4.929078014184397e-05, "loss": 0.1769, "num_input_tokens_seen": 207232, "step": 140 }, { "epoch": 0.505338078291815, "eval_loss": 0.15581394731998444, "eval_runtime": 0.5742, "eval_samples_per_second": 433.643, "eval_steps_per_second": 55.729, "num_input_tokens_seen": 209536, "step": 142 }, { "epoch": 0.5160142348754448, "grad_norm": 15.478862762451172, "learning_rate": 4.9999305045921804e-05, "loss": 0.2155, "num_input_tokens_seen": 213952, "step": 145 }, { "epoch": 0.5338078291814946, "grad_norm": 15.320956230163574, "learning_rate": 4.9995058244251644e-05, "loss": 0.185, "num_input_tokens_seen": 221376, "step": 150 }, { "epoch": 0.5516014234875445, "grad_norm": 14.336426734924316, "learning_rate": 4.998695138156149e-05, "loss": 0.2471, "num_input_tokens_seen": 228928, "step": 155 }, { "epoch": 0.5693950177935944, "grad_norm": 9.861719131469727, "learning_rate": 4.997498570981822e-05, "loss": 0.2061, "num_input_tokens_seen": 236352, "step": 160 }, { "epoch": 0.5871886120996441, "grad_norm": 10.547555923461914, "learning_rate": 4.995916307691601e-05, "loss": 0.1488, "num_input_tokens_seen": 244416, "step": 165 }, { "epoch": 0.604982206405694, "grad_norm": 6.471895217895508, "learning_rate": 4.993948592639104e-05, "loss": 0.1625, "num_input_tokens_seen": 251456, "step": 170 }, { "epoch": 0.6227758007117438, "grad_norm": 10.839587211608887, "learning_rate": 4.991595729704405e-05, "loss": 0.1635, "num_input_tokens_seen": 258880, "step": 175 }, { "epoch": 0.6405693950177936, "grad_norm": 14.87012767791748, "learning_rate": 4.9888580822471086e-05, "loss": 0.163, "num_input_tokens_seen": 265152, "step": 180 }, { "epoch": 0.6583629893238434, "grad_norm": 12.832857131958008, "learning_rate": 4.985736073050237e-05, "loss": 0.1599, "num_input_tokens_seen": 272576, "step": 185 }, { "epoch": 0.6761565836298933, "grad_norm": 9.781329154968262, "learning_rate": 4.982230184254933e-05, "loss": 0.1669, "num_input_tokens_seen": 279744, "step": 190 }, { "epoch": 0.693950177935943, "grad_norm": 9.258131980895996, "learning_rate": 4.9783409572860105e-05, "loss": 0.1659, "num_input_tokens_seen": 287680, "step": 195 }, { "epoch": 0.7117437722419929, "grad_norm": 6.527733325958252, "learning_rate": 4.974068992768331e-05, "loss": 0.1729, "num_input_tokens_seen": 294592, "step": 200 }, { "epoch": 0.7295373665480427, "grad_norm": 17.004568099975586, "learning_rate": 4.9694149504340517e-05, "loss": 0.2655, "num_input_tokens_seen": 301440, "step": 205 }, { "epoch": 0.7473309608540926, "grad_norm": 12.954022407531738, "learning_rate": 4.964379549020741e-05, "loss": 0.1924, "num_input_tokens_seen": 308416, "step": 210 }, { "epoch": 0.7580071174377224, "eval_loss": 0.1600140929222107, "eval_runtime": 0.5686, "eval_samples_per_second": 437.954, "eval_steps_per_second": 56.283, "num_input_tokens_seen": 312576, "step": 213 }, { "epoch": 0.7651245551601423, "grad_norm": 2.3744094371795654, "learning_rate": 4.958963566160384e-05, "loss": 0.1666, "num_input_tokens_seen": 315328, "step": 215 }, { "epoch": 0.7829181494661922, "grad_norm": 4.415204048156738, "learning_rate": 4.953167838259285e-05, "loss": 0.1668, "num_input_tokens_seen": 322688, "step": 220 }, { "epoch": 0.800711743772242, "grad_norm": 4.4129319190979, "learning_rate": 4.946993260368904e-05, "loss": 0.1826, "num_input_tokens_seen": 329280, "step": 225 }, { "epoch": 0.8185053380782918, "grad_norm": 1.2767548561096191, "learning_rate": 4.940440786047628e-05, "loss": 0.1488, "num_input_tokens_seen": 336896, "step": 230 }, { "epoch": 0.8362989323843416, "grad_norm": 10.839607238769531, "learning_rate": 4.933511427213511e-05, "loss": 0.2852, "num_input_tokens_seen": 344128, "step": 235 }, { "epoch": 0.8540925266903915, "grad_norm": 16.616424560546875, "learning_rate": 4.926206253988001e-05, "loss": 0.1901, "num_input_tokens_seen": 350912, "step": 240 }, { "epoch": 0.8718861209964412, "grad_norm": 3.9430079460144043, "learning_rate": 4.91852639453068e-05, "loss": 0.1972, "num_input_tokens_seen": 358016, "step": 245 }, { "epoch": 0.8896797153024911, "grad_norm": 6.706320762634277, "learning_rate": 4.910473034865033e-05, "loss": 0.3136, "num_input_tokens_seen": 364736, "step": 250 }, { "epoch": 0.9074733096085409, "grad_norm": 3.7334418296813965, "learning_rate": 4.902047418695292e-05, "loss": 0.1648, "num_input_tokens_seen": 371648, "step": 255 }, { "epoch": 0.9252669039145908, "grad_norm": 11.57023811340332, "learning_rate": 4.893250847214369e-05, "loss": 0.1706, "num_input_tokens_seen": 379200, "step": 260 }, { "epoch": 0.9430604982206405, "grad_norm": 1.522990345954895, "learning_rate": 4.884084678902898e-05, "loss": 0.2379, "num_input_tokens_seen": 387200, "step": 265 }, { "epoch": 0.9608540925266904, "grad_norm": 6.809507846832275, "learning_rate": 4.874550329319457e-05, "loss": 0.1618, "num_input_tokens_seen": 395264, "step": 270 }, { "epoch": 0.9786476868327402, "grad_norm": 9.76811695098877, "learning_rate": 4.864649270881944e-05, "loss": 0.1637, "num_input_tokens_seen": 402176, "step": 275 }, { "epoch": 0.99644128113879, "grad_norm": 15.906750679016113, "learning_rate": 4.8543830326401954e-05, "loss": 0.1956, "num_input_tokens_seen": 409984, "step": 280 }, { "epoch": 1.01067615658363, "eval_loss": 0.16843144595623016, "eval_runtime": 0.6085, "eval_samples_per_second": 409.228, "eval_steps_per_second": 52.592, "num_input_tokens_seen": 414040, "step": 284 }, { "epoch": 1.0142348754448398, "grad_norm": 7.785819053649902, "learning_rate": 4.843753200039851e-05, "loss": 0.1483, "num_input_tokens_seen": 415256, "step": 285 }, { "epoch": 1.0320284697508897, "grad_norm": 2.8784444332122803, "learning_rate": 4.832761414677503e-05, "loss": 0.1508, "num_input_tokens_seen": 422808, "step": 290 }, { "epoch": 1.0498220640569396, "grad_norm": 9.171720504760742, "learning_rate": 4.8214093740471836e-05, "loss": 0.1599, "num_input_tokens_seen": 430104, "step": 295 }, { "epoch": 1.0676156583629894, "grad_norm": 0.9587394595146179, "learning_rate": 4.8096988312782174e-05, "loss": 0.1629, "num_input_tokens_seen": 436760, "step": 300 }, { "epoch": 1.085409252669039, "grad_norm": 5.9907379150390625, "learning_rate": 4.7976315948644745e-05, "loss": 0.1729, "num_input_tokens_seen": 444952, "step": 305 }, { "epoch": 1.103202846975089, "grad_norm": 0.4214398264884949, "learning_rate": 4.7852095283850866e-05, "loss": 3.0413, "num_input_tokens_seen": 452760, "step": 310 }, { "epoch": 1.1209964412811388, "grad_norm": 0.5086872577667236, "learning_rate": 4.772434550216643e-05, "loss": 0.1785, "num_input_tokens_seen": 458392, "step": 315 }, { "epoch": 1.1387900355871885, "grad_norm": 0.5129872560501099, "learning_rate": 4.7593086332369344e-05, "loss": 0.1666, "num_input_tokens_seen": 465112, "step": 320 }, { "epoch": 1.1565836298932384, "grad_norm": 7.883773326873779, "learning_rate": 4.74583380452027e-05, "loss": 0.2395, "num_input_tokens_seen": 472216, "step": 325 }, { "epoch": 1.1743772241992882, "grad_norm": 3.8998472690582275, "learning_rate": 4.7320121450244394e-05, "loss": 0.2229, "num_input_tokens_seen": 479576, "step": 330 }, { "epoch": 1.1921708185053381, "grad_norm": 11.560748100280762, "learning_rate": 4.717845789269333e-05, "loss": 0.2531, "num_input_tokens_seen": 486552, "step": 335 }, { "epoch": 1.209964412811388, "grad_norm": 20.51876449584961, "learning_rate": 4.703336925007311e-05, "loss": 0.2223, "num_input_tokens_seen": 494616, "step": 340 }, { "epoch": 1.2277580071174377, "grad_norm": 10.914800643920898, "learning_rate": 4.68848779288534e-05, "loss": 0.1898, "num_input_tokens_seen": 501400, "step": 345 }, { "epoch": 1.2455516014234875, "grad_norm": 6.894437789916992, "learning_rate": 4.673300686098957e-05, "loss": 0.1662, "num_input_tokens_seen": 508888, "step": 350 }, { "epoch": 1.2633451957295374, "grad_norm": 4.296377658843994, "learning_rate": 4.657777950038133e-05, "loss": 0.1589, "num_input_tokens_seen": 517656, "step": 355 }, { "epoch": 1.2633451957295374, "eval_loss": 0.1600693166255951, "eval_runtime": 0.607, "eval_samples_per_second": 410.202, "eval_steps_per_second": 52.717, "num_input_tokens_seen": 517656, "step": 355 }, { "epoch": 1.281138790035587, "grad_norm": 6.819537162780762, "learning_rate": 4.6419219819250636e-05, "loss": 0.1538, "num_input_tokens_seen": 526232, "step": 360 }, { "epoch": 1.298932384341637, "grad_norm": 14.25802230834961, "learning_rate": 4.62573523044396e-05, "loss": 0.1811, "num_input_tokens_seen": 533400, "step": 365 }, { "epoch": 1.3167259786476868, "grad_norm": 3.1280250549316406, "learning_rate": 4.609220195362886e-05, "loss": 0.174, "num_input_tokens_seen": 542168, "step": 370 }, { "epoch": 1.3345195729537367, "grad_norm": 7.372785568237305, "learning_rate": 4.5923794271477217e-05, "loss": 0.1571, "num_input_tokens_seen": 549976, "step": 375 }, { "epoch": 1.3523131672597866, "grad_norm": 7.614220142364502, "learning_rate": 4.575215526568278e-05, "loss": 0.1641, "num_input_tokens_seen": 557016, "step": 380 }, { "epoch": 1.3701067615658362, "grad_norm": 38.93210983276367, "learning_rate": 4.5577311442966584e-05, "loss": 1.4814, "num_input_tokens_seen": 564504, "step": 385 }, { "epoch": 1.387900355871886, "grad_norm": 5.316745281219482, "learning_rate": 4.539928980497903e-05, "loss": 0.1601, "num_input_tokens_seen": 571864, "step": 390 }, { "epoch": 1.405693950177936, "grad_norm": 9.071686744689941, "learning_rate": 4.521811784412996e-05, "loss": 0.2213, "num_input_tokens_seen": 578456, "step": 395 }, { "epoch": 1.4234875444839858, "grad_norm": 36.631160736083984, "learning_rate": 4.503382353934294e-05, "loss": 1.4493, "num_input_tokens_seen": 584600, "step": 400 }, { "epoch": 1.4412811387900355, "grad_norm": 17.439191818237305, "learning_rate": 4.4846435351734376e-05, "loss": 0.1729, "num_input_tokens_seen": 591128, "step": 405 }, { "epoch": 1.4590747330960854, "grad_norm": 4.0148138999938965, "learning_rate": 4.4655982220218176e-05, "loss": 0.1539, "num_input_tokens_seen": 598552, "step": 410 }, { "epoch": 1.4768683274021353, "grad_norm": 0.6515812873840332, "learning_rate": 4.446249355703661e-05, "loss": 0.1612, "num_input_tokens_seen": 607320, "step": 415 }, { "epoch": 1.4946619217081851, "grad_norm": 4.950193881988525, "learning_rate": 4.426599924321815e-05, "loss": 0.1594, "num_input_tokens_seen": 614744, "step": 420 }, { "epoch": 1.512455516014235, "grad_norm": 4.902361869812012, "learning_rate": 4.4066529623962784e-05, "loss": 0.1947, "num_input_tokens_seen": 622808, "step": 425 }, { "epoch": 1.5160142348754448, "eval_loss": 0.18150445818901062, "eval_runtime": 0.6062, "eval_samples_per_second": 410.733, "eval_steps_per_second": 52.785, "num_input_tokens_seen": 624344, "step": 426 }, { "epoch": 1.5302491103202847, "grad_norm": 0.29520076513290405, "learning_rate": 4.386411550395576e-05, "loss": 0.1523, "num_input_tokens_seen": 630488, "step": 430 }, { "epoch": 1.5480427046263345, "grad_norm": 1.9226378202438354, "learning_rate": 4.365878814261032e-05, "loss": 0.1721, "num_input_tokens_seen": 638424, "step": 435 }, { "epoch": 1.5658362989323842, "grad_norm": 6.8878493309021, "learning_rate": 4.34505792492402e-05, "loss": 0.1551, "num_input_tokens_seen": 645208, "step": 440 }, { "epoch": 1.583629893238434, "grad_norm": 9.136181831359863, "learning_rate": 4.323952097816269e-05, "loss": 0.1499, "num_input_tokens_seen": 653016, "step": 445 }, { "epoch": 1.601423487544484, "grad_norm": 7.4756178855896, "learning_rate": 4.3025645923732926e-05, "loss": 0.1843, "num_input_tokens_seen": 659992, "step": 450 }, { "epoch": 1.6192170818505338, "grad_norm": 7.807384490966797, "learning_rate": 4.2808987115310255e-05, "loss": 0.1579, "num_input_tokens_seen": 667224, "step": 455 }, { "epoch": 1.6370106761565837, "grad_norm": 0.17006787657737732, "learning_rate": 4.2589578012157426e-05, "loss": 0.1563, "num_input_tokens_seen": 675160, "step": 460 }, { "epoch": 1.6548042704626336, "grad_norm": 0.41114601492881775, "learning_rate": 4.236745249827336e-05, "loss": 0.1556, "num_input_tokens_seen": 683544, "step": 465 }, { "epoch": 1.6725978647686834, "grad_norm": 2.4918622970581055, "learning_rate": 4.214264487716033e-05, "loss": 0.1593, "num_input_tokens_seen": 689368, "step": 470 }, { "epoch": 1.690391459074733, "grad_norm": 10.712060928344727, "learning_rate": 4.191518986652642e-05, "loss": 0.1699, "num_input_tokens_seen": 695832, "step": 475 }, { "epoch": 1.708185053380783, "grad_norm": 0.39044228196144104, "learning_rate": 4.168512259292391e-05, "loss": 0.1563, "num_input_tokens_seen": 703128, "step": 480 }, { "epoch": 1.7259786476868326, "grad_norm": 4.815671443939209, "learning_rate": 4.1452478586324605e-05, "loss": 0.1507, "num_input_tokens_seen": 709528, "step": 485 }, { "epoch": 1.7437722419928825, "grad_norm": 0.5018470287322998, "learning_rate": 4.121729377463285e-05, "loss": 0.1558, "num_input_tokens_seen": 716312, "step": 490 }, { "epoch": 1.7615658362989324, "grad_norm": 10.01478099822998, "learning_rate": 4.097960447813705e-05, "loss": 0.1825, "num_input_tokens_seen": 722776, "step": 495 }, { "epoch": 1.7686832740213523, "eval_loss": 0.16469639539718628, "eval_runtime": 0.5964, "eval_samples_per_second": 417.484, "eval_steps_per_second": 53.653, "num_input_tokens_seen": 725656, "step": 497 }, { "epoch": 1.7793594306049823, "grad_norm": 3.8590610027313232, "learning_rate": 4.073944740390061e-05, "loss": 0.1798, "num_input_tokens_seen": 729944, "step": 500 }, { "epoch": 1.7971530249110321, "grad_norm": 4.1739020347595215, "learning_rate": 4.049685964009321e-05, "loss": 0.1694, "num_input_tokens_seen": 737112, "step": 505 }, { "epoch": 1.814946619217082, "grad_norm": 10.671394348144531, "learning_rate": 4.025187865026311e-05, "loss": 0.1605, "num_input_tokens_seen": 744408, "step": 510 }, { "epoch": 1.8327402135231317, "grad_norm": 0.9396809935569763, "learning_rate": 4.000454226755159e-05, "loss": 0.1574, "num_input_tokens_seen": 750488, "step": 515 }, { "epoch": 1.8505338078291815, "grad_norm": 6.7215447425842285, "learning_rate": 3.975488868885021e-05, "loss": 0.1703, "num_input_tokens_seen": 757528, "step": 520 }, { "epoch": 1.8683274021352312, "grad_norm": 0.5858572721481323, "learning_rate": 3.9502956468902014e-05, "loss": 0.1545, "num_input_tokens_seen": 763736, "step": 525 }, { "epoch": 1.886120996441281, "grad_norm": 2.219594955444336, "learning_rate": 3.924878451434735e-05, "loss": 0.1534, "num_input_tokens_seen": 771864, "step": 530 }, { "epoch": 1.903914590747331, "grad_norm": 1.9175541400909424, "learning_rate": 3.899241207771546e-05, "loss": 0.1537, "num_input_tokens_seen": 778712, "step": 535 }, { "epoch": 1.9217081850533808, "grad_norm": 12.399153709411621, "learning_rate": 3.873387875136252e-05, "loss": 0.1917, "num_input_tokens_seen": 784280, "step": 540 }, { "epoch": 1.9395017793594307, "grad_norm": 7.259119987487793, "learning_rate": 3.847322446135736e-05, "loss": 0.1743, "num_input_tokens_seen": 792280, "step": 545 }, { "epoch": 1.9572953736654806, "grad_norm": 7.568546772003174, "learning_rate": 3.821048946131549e-05, "loss": 0.1752, "num_input_tokens_seen": 798488, "step": 550 }, { "epoch": 1.9750889679715302, "grad_norm": 6.783497333526611, "learning_rate": 3.794571432618267e-05, "loss": 0.1578, "num_input_tokens_seen": 806104, "step": 555 }, { "epoch": 1.99288256227758, "grad_norm": 9.681258201599121, "learning_rate": 3.767893994596876e-05, "loss": 0.1774, "num_input_tokens_seen": 813336, "step": 560 }, { "epoch": 2.0106761565836297, "grad_norm": 3.2600245475769043, "learning_rate": 3.741020751943297e-05, "loss": 0.1568, "num_input_tokens_seen": 817576, "step": 565 }, { "epoch": 2.02135231316726, "eval_loss": 0.15550938248634338, "eval_runtime": 0.6255, "eval_samples_per_second": 398.079, "eval_steps_per_second": 51.159, "num_input_tokens_seen": 821416, "step": 568 }, { "epoch": 2.0284697508896796, "grad_norm": 3.0256900787353516, "learning_rate": 3.713955854772144e-05, "loss": 0.1565, "num_input_tokens_seen": 823848, "step": 570 }, { "epoch": 2.0462633451957295, "grad_norm": 1.889113187789917, "learning_rate": 3.686703482795802e-05, "loss": 0.1536, "num_input_tokens_seen": 832232, "step": 575 }, { "epoch": 2.0640569395017794, "grad_norm": 3.334212303161621, "learning_rate": 3.6592678446789516e-05, "loss": 0.1624, "num_input_tokens_seen": 840424, "step": 580 }, { "epoch": 2.0818505338078293, "grad_norm": 3.6044702529907227, "learning_rate": 3.631653177388605e-05, "loss": 0.1395, "num_input_tokens_seen": 846824, "step": 585 }, { "epoch": 2.099644128113879, "grad_norm": 8.975861549377441, "learning_rate": 3.60386374553978e-05, "loss": 0.196, "num_input_tokens_seen": 853608, "step": 590 }, { "epoch": 2.117437722419929, "grad_norm": 10.559611320495605, "learning_rate": 3.5759038407369056e-05, "loss": 0.1637, "num_input_tokens_seen": 860968, "step": 595 }, { "epoch": 2.135231316725979, "grad_norm": 6.914389610290527, "learning_rate": 3.547777780911055e-05, "loss": 0.194, "num_input_tokens_seen": 868904, "step": 600 }, { "epoch": 2.1530249110320283, "grad_norm": 8.329413414001465, "learning_rate": 3.519489909653113e-05, "loss": 0.1592, "num_input_tokens_seen": 876072, "step": 605 }, { "epoch": 2.170818505338078, "grad_norm": 4.701565742492676, "learning_rate": 3.4910445955429854e-05, "loss": 0.1549, "num_input_tokens_seen": 883752, "step": 610 }, { "epoch": 2.188612099644128, "grad_norm": 7.797508716583252, "learning_rate": 3.4624462314749443e-05, "loss": 0.1533, "num_input_tokens_seen": 891304, "step": 615 }, { "epoch": 2.206405693950178, "grad_norm": 1.7337656021118164, "learning_rate": 3.433699233979222e-05, "loss": 0.1483, "num_input_tokens_seen": 899176, "step": 620 }, { "epoch": 2.224199288256228, "grad_norm": 5.721285343170166, "learning_rate": 3.4048080425399505e-05, "loss": 0.1436, "num_input_tokens_seen": 907560, "step": 625 }, { "epoch": 2.2419928825622777, "grad_norm": 3.0777595043182373, "learning_rate": 3.375777118909561e-05, "loss": 0.1413, "num_input_tokens_seen": 915240, "step": 630 }, { "epoch": 2.2597864768683276, "grad_norm": 15.890474319458008, "learning_rate": 3.3466109464197426e-05, "loss": 0.1597, "num_input_tokens_seen": 921384, "step": 635 }, { "epoch": 2.2740213523131674, "eval_loss": 0.1567462682723999, "eval_runtime": 0.6255, "eval_samples_per_second": 398.087, "eval_steps_per_second": 51.16, "num_input_tokens_seen": 926760, "step": 639 }, { "epoch": 2.277580071174377, "grad_norm": 1.5718131065368652, "learning_rate": 3.317314029289067e-05, "loss": 0.1653, "num_input_tokens_seen": 927528, "step": 640 }, { "epoch": 2.295373665480427, "grad_norm": 3.7291853427886963, "learning_rate": 3.287890891927386e-05, "loss": 0.1594, "num_input_tokens_seen": 934568, "step": 645 }, { "epoch": 2.3131672597864767, "grad_norm": 4.549835205078125, "learning_rate": 3.258346078237122e-05, "loss": 0.1402, "num_input_tokens_seen": 942248, "step": 650 }, { "epoch": 2.3309608540925266, "grad_norm": 14.683507919311523, "learning_rate": 3.228684150911527e-05, "loss": 0.2418, "num_input_tokens_seen": 949096, "step": 655 }, { "epoch": 2.3487544483985765, "grad_norm": 1.7894399166107178, "learning_rate": 3.198909690730063e-05, "loss": 0.1845, "num_input_tokens_seen": 955752, "step": 660 }, { "epoch": 2.3665480427046264, "grad_norm": 15.066572189331055, "learning_rate": 3.169027295850977e-05, "loss": 0.1664, "num_input_tokens_seen": 963176, "step": 665 }, { "epoch": 2.3843416370106763, "grad_norm": 4.301926136016846, "learning_rate": 3.139041581101187e-05, "loss": 0.1627, "num_input_tokens_seen": 968232, "step": 670 }, { "epoch": 2.402135231316726, "grad_norm": 5.145651340484619, "learning_rate": 3.108957177263608e-05, "loss": 0.1498, "num_input_tokens_seen": 976552, "step": 675 }, { "epoch": 2.419928825622776, "grad_norm": 2.5066633224487305, "learning_rate": 3.078778730362003e-05, "loss": 0.1656, "num_input_tokens_seen": 983720, "step": 680 }, { "epoch": 2.4377224199288254, "grad_norm": 3.9444332122802734, "learning_rate": 3.048510900943484e-05, "loss": 0.1567, "num_input_tokens_seen": 991976, "step": 685 }, { "epoch": 2.4555160142348753, "grad_norm": 4.341545581817627, "learning_rate": 3.018158363358773e-05, "loss": 0.1807, "num_input_tokens_seen": 998184, "step": 690 }, { "epoch": 2.473309608540925, "grad_norm": 4.363418102264404, "learning_rate": 2.9877258050403212e-05, "loss": 0.1678, "num_input_tokens_seen": 1005672, "step": 695 }, { "epoch": 2.491103202846975, "grad_norm": 3.3406949043273926, "learning_rate": 2.9572179257784215e-05, "loss": 0.1531, "num_input_tokens_seen": 1013096, "step": 700 }, { "epoch": 2.508896797153025, "grad_norm": 2.7513387203216553, "learning_rate": 2.9266394369954052e-05, "loss": 0.1337, "num_input_tokens_seen": 1019304, "step": 705 }, { "epoch": 2.526690391459075, "grad_norm": 7.649652481079102, "learning_rate": 2.8959950610180374e-05, "loss": 0.1431, "num_input_tokens_seen": 1025320, "step": 710 }, { "epoch": 2.526690391459075, "eval_loss": 0.16391661763191223, "eval_runtime": 0.6072, "eval_samples_per_second": 410.078, "eval_steps_per_second": 52.701, "num_input_tokens_seen": 1025320, "step": 710 }, { "epoch": 2.5444839857651247, "grad_norm": 15.210580825805664, "learning_rate": 2.865289530348243e-05, "loss": 0.1675, "num_input_tokens_seen": 1032552, "step": 715 }, { "epoch": 2.562277580071174, "grad_norm": 4.497170925140381, "learning_rate": 2.834527586932243e-05, "loss": 2.4615, "num_input_tokens_seen": 1039912, "step": 720 }, { "epoch": 2.580071174377224, "grad_norm": 10.657808303833008, "learning_rate": 2.8037139814282493e-05, "loss": 0.1636, "num_input_tokens_seen": 1047208, "step": 725 }, { "epoch": 2.597864768683274, "grad_norm": 1.3169434070587158, "learning_rate": 2.7728534724728027e-05, "loss": 0.1652, "num_input_tokens_seen": 1053928, "step": 730 }, { "epoch": 2.6156583629893237, "grad_norm": 2.855050802230835, "learning_rate": 2.741950825945881e-05, "loss": 0.1482, "num_input_tokens_seen": 1061608, "step": 735 }, { "epoch": 2.6334519572953736, "grad_norm": 2.2470901012420654, "learning_rate": 2.711010814234896e-05, "loss": 0.1501, "num_input_tokens_seen": 1067560, "step": 740 }, { "epoch": 2.6512455516014235, "grad_norm": 4.065670967102051, "learning_rate": 2.6800382154976732e-05, "loss": 0.1743, "num_input_tokens_seen": 1074152, "step": 745 }, { "epoch": 2.6690391459074734, "grad_norm": 5.455725193023682, "learning_rate": 2.6490378129245498e-05, "loss": 0.1441, "num_input_tokens_seen": 1082856, "step": 750 }, { "epoch": 2.6868327402135233, "grad_norm": 3.1051108837127686, "learning_rate": 2.6180143939996925e-05, "loss": 0.1495, "num_input_tokens_seen": 1089512, "step": 755 }, { "epoch": 2.704626334519573, "grad_norm": 2.337266206741333, "learning_rate": 2.5869727497617495e-05, "loss": 0.1464, "num_input_tokens_seen": 1096232, "step": 760 }, { "epoch": 2.722419928825623, "grad_norm": 4.207283973693848, "learning_rate": 2.55591767406396e-05, "loss": 0.1572, "num_input_tokens_seen": 1104168, "step": 765 }, { "epoch": 2.7402135231316724, "grad_norm": 2.140827178955078, "learning_rate": 2.5248539628338246e-05, "loss": 0.1326, "num_input_tokens_seen": 1112232, "step": 770 }, { "epoch": 2.7580071174377223, "grad_norm": 8.35146713256836, "learning_rate": 2.4937864133324516e-05, "loss": 0.1734, "num_input_tokens_seen": 1119016, "step": 775 }, { "epoch": 2.775800711743772, "grad_norm": 18.731395721435547, "learning_rate": 2.462719823413707e-05, "loss": 0.1986, "num_input_tokens_seen": 1126696, "step": 780 }, { "epoch": 2.7793594306049823, "eval_loss": 0.15414386987686157, "eval_runtime": 0.6372, "eval_samples_per_second": 390.788, "eval_steps_per_second": 50.222, "num_input_tokens_seen": 1128104, "step": 781 }, { "epoch": 2.793594306049822, "grad_norm": 6.263734817504883, "learning_rate": 2.4316589907832654e-05, "loss": 0.1576, "num_input_tokens_seen": 1134184, "step": 785 }, { "epoch": 2.811387900355872, "grad_norm": 1.7886258363723755, "learning_rate": 2.4006087122576863e-05, "loss": 0.1392, "num_input_tokens_seen": 1140392, "step": 790 }, { "epoch": 2.829181494661922, "grad_norm": 9.585826873779297, "learning_rate": 2.3695737830236266e-05, "loss": 0.2025, "num_input_tokens_seen": 1148328, "step": 795 }, { "epoch": 2.8469750889679717, "grad_norm": 3.7239151000976562, "learning_rate": 2.338558995897307e-05, "loss": 0.1781, "num_input_tokens_seen": 1154024, "step": 800 }, { "epoch": 2.864768683274021, "grad_norm": 7.329390525817871, "learning_rate": 2.3075691405843435e-05, "loss": 0.195, "num_input_tokens_seen": 1160808, "step": 805 }, { "epoch": 2.882562277580071, "grad_norm": 5.577742099761963, "learning_rate": 2.2766090029400573e-05, "loss": 0.1597, "num_input_tokens_seen": 1167912, "step": 810 }, { "epoch": 2.900355871886121, "grad_norm": 8.529340744018555, "learning_rate": 2.2456833642303822e-05, "loss": 0.1433, "num_input_tokens_seen": 1174568, "step": 815 }, { "epoch": 2.9181494661921707, "grad_norm": 5.017305374145508, "learning_rate": 2.214797000393479e-05, "loss": 0.1553, "num_input_tokens_seen": 1181480, "step": 820 }, { "epoch": 2.9359430604982206, "grad_norm": 3.5880136489868164, "learning_rate": 2.183954681302173e-05, "loss": 0.1614, "num_input_tokens_seen": 1189928, "step": 825 }, { "epoch": 2.9537366548042705, "grad_norm": 1.7257145643234253, "learning_rate": 2.1531611700273297e-05, "loss": 0.1351, "num_input_tokens_seen": 1197480, "step": 830 }, { "epoch": 2.9715302491103204, "grad_norm": 4.875583171844482, "learning_rate": 2.1224212221022777e-05, "loss": 0.1845, "num_input_tokens_seen": 1204584, "step": 835 }, { "epoch": 2.9893238434163703, "grad_norm": 5.411481857299805, "learning_rate": 2.0917395847883995e-05, "loss": 0.1616, "num_input_tokens_seen": 1212584, "step": 840 }, { "epoch": 3.00711743772242, "grad_norm": 4.330006122589111, "learning_rate": 2.0611209963419958e-05, "loss": 0.1625, "num_input_tokens_seen": 1217856, "step": 845 }, { "epoch": 3.0249110320284696, "grad_norm": 10.39330768585205, "learning_rate": 2.030570185282544e-05, "loss": 0.137, "num_input_tokens_seen": 1226624, "step": 850 }, { "epoch": 3.0320284697508897, "eval_loss": 0.1851627230644226, "eval_runtime": 0.6345, "eval_samples_per_second": 392.434, "eval_steps_per_second": 50.433, "num_input_tokens_seen": 1229440, "step": 852 }, { "epoch": 3.0427046263345194, "grad_norm": 3.0105044841766357, "learning_rate": 2.0000918696624588e-05, "loss": 0.1453, "num_input_tokens_seen": 1233152, "step": 855 }, { "epoch": 3.0604982206405693, "grad_norm": 2.1030280590057373, "learning_rate": 1.9696907563384687e-05, "loss": 0.138, "num_input_tokens_seen": 1240128, "step": 860 }, { "epoch": 3.078291814946619, "grad_norm": 2.1849405765533447, "learning_rate": 1.939371540244723e-05, "loss": 0.1148, "num_input_tokens_seen": 1248064, "step": 865 }, { "epoch": 3.096085409252669, "grad_norm": 6.3520402908325195, "learning_rate": 1.9091389036677382e-05, "loss": 0.1106, "num_input_tokens_seen": 1255232, "step": 870 }, { "epoch": 3.113879003558719, "grad_norm": 3.93772554397583, "learning_rate": 1.878997515523299e-05, "loss": 0.1169, "num_input_tokens_seen": 1262272, "step": 875 }, { "epoch": 3.131672597864769, "grad_norm": 6.558725833892822, "learning_rate": 1.848952030635424e-05, "loss": 0.1161, "num_input_tokens_seen": 1269632, "step": 880 }, { "epoch": 3.1494661921708187, "grad_norm": 3.3383939266204834, "learning_rate": 1.819007089017508e-05, "loss": 0.123, "num_input_tokens_seen": 1277312, "step": 885 }, { "epoch": 3.167259786476868, "grad_norm": 15.820018768310547, "learning_rate": 1.789167315155749e-05, "loss": 0.1599, "num_input_tokens_seen": 1284096, "step": 890 }, { "epoch": 3.185053380782918, "grad_norm": 2.621346950531006, "learning_rate": 1.7594373172949784e-05, "loss": 0.1109, "num_input_tokens_seen": 1291648, "step": 895 }, { "epoch": 3.202846975088968, "grad_norm": 6.172404766082764, "learning_rate": 1.7298216867269906e-05, "loss": 0.1569, "num_input_tokens_seen": 1299712, "step": 900 }, { "epoch": 3.2206405693950177, "grad_norm": 10.012272834777832, "learning_rate": 1.7003249970815026e-05, "loss": 0.1082, "num_input_tokens_seen": 1306176, "step": 905 }, { "epoch": 3.2384341637010676, "grad_norm": 3.6646652221679688, "learning_rate": 1.6709518036198308e-05, "loss": 0.1387, "num_input_tokens_seen": 1314112, "step": 910 }, { "epoch": 3.2562277580071175, "grad_norm": 9.655856132507324, "learning_rate": 1.6417066425314087e-05, "loss": 0.1199, "num_input_tokens_seen": 1321088, "step": 915 }, { "epoch": 3.2740213523131674, "grad_norm": 7.546687602996826, "learning_rate": 1.612594030233252e-05, "loss": 0.1422, "num_input_tokens_seen": 1328512, "step": 920 }, { "epoch": 3.284697508896797, "eval_loss": 0.16463510692119598, "eval_runtime": 0.6174, "eval_samples_per_second": 403.311, "eval_steps_per_second": 51.831, "num_input_tokens_seen": 1332544, "step": 923 }, { "epoch": 3.2918149466192173, "grad_norm": 3.2389485836029053, "learning_rate": 1.583618462672472e-05, "loss": 0.0863, "num_input_tokens_seen": 1336128, "step": 925 }, { "epoch": 3.309608540925267, "grad_norm": 3.8101906776428223, "learning_rate": 1.5547844146319545e-05, "loss": 0.1155, "num_input_tokens_seen": 1343552, "step": 930 }, { "epoch": 3.3274021352313166, "grad_norm": 5.337780475616455, "learning_rate": 1.5260963390393075e-05, "loss": 0.1691, "num_input_tokens_seen": 1351552, "step": 935 }, { "epoch": 3.3451957295373664, "grad_norm": 4.4513840675354, "learning_rate": 1.4975586662791783e-05, "loss": 0.0983, "num_input_tokens_seen": 1358272, "step": 940 }, { "epoch": 3.3629893238434163, "grad_norm": 7.950605392456055, "learning_rate": 1.4691758035090602e-05, "loss": 0.137, "num_input_tokens_seen": 1366784, "step": 945 }, { "epoch": 3.380782918149466, "grad_norm": 2.973015785217285, "learning_rate": 1.4409521339786808e-05, "loss": 0.1389, "num_input_tokens_seen": 1373312, "step": 950 }, { "epoch": 3.398576512455516, "grad_norm": 1.8699113130569458, "learning_rate": 1.41289201635308e-05, "loss": 0.0916, "num_input_tokens_seen": 1380736, "step": 955 }, { "epoch": 3.416370106761566, "grad_norm": 1.629996657371521, "learning_rate": 1.3849997840394943e-05, "loss": 0.096, "num_input_tokens_seen": 1388544, "step": 960 }, { "epoch": 3.434163701067616, "grad_norm": 3.142674446105957, "learning_rate": 1.3572797445181345e-05, "loss": 0.1252, "num_input_tokens_seen": 1396160, "step": 965 }, { "epoch": 3.4519572953736652, "grad_norm": 1.9603294134140015, "learning_rate": 1.3297361786769652e-05, "loss": 0.0988, "num_input_tokens_seen": 1404096, "step": 970 }, { "epoch": 3.469750889679715, "grad_norm": 18.924589157104492, "learning_rate": 1.3023733401505981e-05, "loss": 0.1135, "num_input_tokens_seen": 1411008, "step": 975 }, { "epoch": 3.487544483985765, "grad_norm": 4.6644487380981445, "learning_rate": 1.2751954546633871e-05, "loss": 0.155, "num_input_tokens_seen": 1418880, "step": 980 }, { "epoch": 3.505338078291815, "grad_norm": 8.87281608581543, "learning_rate": 1.2482067193768417e-05, "loss": 0.1302, "num_input_tokens_seen": 1426048, "step": 985 }, { "epoch": 3.5231316725978647, "grad_norm": 6.374912738800049, "learning_rate": 1.2214113022414448e-05, "loss": 0.0911, "num_input_tokens_seen": 1432064, "step": 990 }, { "epoch": 3.5373665480427046, "eval_loss": 0.1803617924451828, "eval_runtime": 0.6287, "eval_samples_per_second": 396.078, "eval_steps_per_second": 50.902, "num_input_tokens_seen": 1438336, "step": 994 }, { "epoch": 3.5409252669039146, "grad_norm": 7.5531110763549805, "learning_rate": 1.1948133413529817e-05, "loss": 0.1165, "num_input_tokens_seen": 1439808, "step": 995 }, { "epoch": 3.5587188612099645, "grad_norm": 10.984672546386719, "learning_rate": 1.168416944313486e-05, "loss": 0.156, "num_input_tokens_seen": 1447616, "step": 1000 }, { "epoch": 3.5765124555160144, "grad_norm": 5.665327072143555, "learning_rate": 1.1422261875968845e-05, "loss": 0.0978, "num_input_tokens_seen": 1454208, "step": 1005 }, { "epoch": 3.5943060498220643, "grad_norm": 5.291867256164551, "learning_rate": 1.1162451159194614e-05, "loss": 0.0784, "num_input_tokens_seen": 1463296, "step": 1010 }, { "epoch": 3.612099644128114, "grad_norm": 4.302516937255859, "learning_rate": 1.0904777416152166e-05, "loss": 0.1698, "num_input_tokens_seen": 1469952, "step": 1015 }, { "epoch": 3.6298932384341636, "grad_norm": 2.612572193145752, "learning_rate": 1.0649280440162326e-05, "loss": 0.1033, "num_input_tokens_seen": 1477184, "step": 1020 }, { "epoch": 3.6476868327402134, "grad_norm": 7.643741607666016, "learning_rate": 1.0395999688381314e-05, "loss": 0.1025, "num_input_tokens_seen": 1484160, "step": 1025 }, { "epoch": 3.6654804270462633, "grad_norm": 1.1666496992111206, "learning_rate": 1.0144974275707241e-05, "loss": 0.0885, "num_input_tokens_seen": 1491200, "step": 1030 }, { "epoch": 3.683274021352313, "grad_norm": 8.459441184997559, "learning_rate": 9.896242968739539e-06, "loss": 0.1678, "num_input_tokens_seen": 1498368, "step": 1035 }, { "epoch": 3.701067615658363, "grad_norm": 7.720543384552002, "learning_rate": 9.649844179792081e-06, "loss": 0.1068, "num_input_tokens_seen": 1505984, "step": 1040 }, { "epoch": 3.718861209964413, "grad_norm": 1.8878631591796875, "learning_rate": 9.405815960961054e-06, "loss": 0.0978, "num_input_tokens_seen": 1511680, "step": 1045 }, { "epoch": 3.7366548042704624, "grad_norm": 2.47867488861084, "learning_rate": 9.16419599824847e-06, "loss": 0.0966, "num_input_tokens_seen": 1517888, "step": 1050 }, { "epoch": 3.7544483985765122, "grad_norm": 3.3050386905670166, "learning_rate": 8.925021605742211e-06, "loss": 0.1815, "num_input_tokens_seen": 1525568, "step": 1055 }, { "epoch": 3.772241992882562, "grad_norm": 6.0262837409973145, "learning_rate": 8.68832971985347e-06, "loss": 0.1028, "num_input_tokens_seen": 1532480, "step": 1060 }, { "epoch": 3.790035587188612, "grad_norm": 2.8200912475585938, "learning_rate": 8.454156893612591e-06, "loss": 0.1203, "num_input_tokens_seen": 1539072, "step": 1065 }, { "epoch": 3.790035587188612, "eval_loss": 0.17713916301727295, "eval_runtime": 0.6261, "eval_samples_per_second": 397.715, "eval_steps_per_second": 51.112, "num_input_tokens_seen": 1539072, "step": 1065 }, { "epoch": 3.807829181494662, "grad_norm": 2.3930211067199707, "learning_rate": 8.222539291024078e-06, "loss": 0.1178, "num_input_tokens_seen": 1547584, "step": 1070 }, { "epoch": 3.8256227758007118, "grad_norm": 7.24454402923584, "learning_rate": 7.993512681481639e-06, "loss": 0.0999, "num_input_tokens_seen": 1554304, "step": 1075 }, { "epoch": 3.8434163701067616, "grad_norm": 7.17146110534668, "learning_rate": 7.767112434244253e-06, "loss": 0.145, "num_input_tokens_seen": 1560896, "step": 1080 }, { "epoch": 3.8612099644128115, "grad_norm": 4.711667060852051, "learning_rate": 7.543373512973947e-06, "loss": 0.0627, "num_input_tokens_seen": 1567744, "step": 1085 }, { "epoch": 3.8790035587188614, "grad_norm": 12.18324089050293, "learning_rate": 7.3223304703363135e-06, "loss": 0.1558, "num_input_tokens_seen": 1574400, "step": 1090 }, { "epoch": 3.8967971530249113, "grad_norm": 2.6999011039733887, "learning_rate": 7.104017442664393e-06, "loss": 0.0965, "num_input_tokens_seen": 1581504, "step": 1095 }, { "epoch": 3.914590747330961, "grad_norm": 5.639074802398682, "learning_rate": 6.8884681446869105e-06, "loss": 0.0914, "num_input_tokens_seen": 1589504, "step": 1100 }, { "epoch": 3.9323843416370106, "grad_norm": 6.777685165405273, "learning_rate": 6.67571586432163e-06, "loss": 0.124, "num_input_tokens_seen": 1597696, "step": 1105 }, { "epoch": 3.9501779359430604, "grad_norm": 5.154758453369141, "learning_rate": 6.465793457534553e-06, "loss": 0.1388, "num_input_tokens_seen": 1605248, "step": 1110 }, { "epoch": 3.9679715302491103, "grad_norm": 4.713754653930664, "learning_rate": 6.258733343265932e-06, "loss": 0.1646, "num_input_tokens_seen": 1613952, "step": 1115 }, { "epoch": 3.98576512455516, "grad_norm": 5.546712875366211, "learning_rate": 6.0545674984236826e-06, "loss": 0.1024, "num_input_tokens_seen": 1620224, "step": 1120 }, { "epoch": 4.00355871886121, "grad_norm": 1.0218762159347534, "learning_rate": 5.853327452945115e-06, "loss": 0.0889, "num_input_tokens_seen": 1625800, "step": 1125 }, { "epoch": 4.0213523131672595, "grad_norm": 7.033966541290283, "learning_rate": 5.655044284927657e-06, "loss": 0.0747, "num_input_tokens_seen": 1633352, "step": 1130 }, { "epoch": 4.039145907473309, "grad_norm": 1.1709257364273071, "learning_rate": 5.459748615829355e-06, "loss": 0.0551, "num_input_tokens_seen": 1640840, "step": 1135 }, { "epoch": 4.04270462633452, "eval_loss": 0.19830213487148285, "eval_runtime": 0.616, "eval_samples_per_second": 404.216, "eval_steps_per_second": 51.947, "num_input_tokens_seen": 1642696, "step": 1136 }, { "epoch": 4.056939501779359, "grad_norm": 2.399528980255127, "learning_rate": 5.267470605739952e-06, "loss": 0.0395, "num_input_tokens_seen": 1648520, "step": 1140 }, { "epoch": 4.074733096085409, "grad_norm": 3.8567628860473633, "learning_rate": 5.078239948723154e-06, "loss": 0.0215, "num_input_tokens_seen": 1655752, "step": 1145 }, { "epoch": 4.092526690391459, "grad_norm": 2.231137990951538, "learning_rate": 4.892085868230881e-06, "loss": 0.0073, "num_input_tokens_seen": 1662920, "step": 1150 }, { "epoch": 4.110320284697509, "grad_norm": 8.699728012084961, "learning_rate": 4.709037112590217e-06, "loss": 0.0348, "num_input_tokens_seen": 1669896, "step": 1155 }, { "epoch": 4.128113879003559, "grad_norm": 8.660861015319824, "learning_rate": 4.529121950563716e-06, "loss": 0.076, "num_input_tokens_seen": 1675400, "step": 1160 }, { "epoch": 4.145907473309609, "grad_norm": 7.111387252807617, "learning_rate": 4.352368166983753e-06, "loss": 0.0705, "num_input_tokens_seen": 1682952, "step": 1165 }, { "epoch": 4.1637010676156585, "grad_norm": 6.721922874450684, "learning_rate": 4.178803058461664e-06, "loss": 0.088, "num_input_tokens_seen": 1690248, "step": 1170 }, { "epoch": 4.181494661921708, "grad_norm": 1.4173535108566284, "learning_rate": 4.0084534291722376e-06, "loss": 0.05, "num_input_tokens_seen": 1696840, "step": 1175 }, { "epoch": 4.199288256227758, "grad_norm": 0.436257928609848, "learning_rate": 3.841345586714251e-06, "loss": 0.0689, "num_input_tokens_seen": 1703624, "step": 1180 }, { "epoch": 4.217081850533808, "grad_norm": 0.09257882088422775, "learning_rate": 3.677505338047729e-06, "loss": 0.0218, "num_input_tokens_seen": 1710024, "step": 1185 }, { "epoch": 4.234875444839858, "grad_norm": 0.0730605497956276, "learning_rate": 3.516957985508476e-06, "loss": 0.068, "num_input_tokens_seen": 1717768, "step": 1190 }, { "epoch": 4.252669039145908, "grad_norm": 0.23621395230293274, "learning_rate": 3.3597283229005877e-06, "loss": 0.021, "num_input_tokens_seen": 1727240, "step": 1195 }, { "epoch": 4.270462633451958, "grad_norm": 0.33008334040641785, "learning_rate": 3.205840631667456e-06, "loss": 0.0422, "num_input_tokens_seen": 1734408, "step": 1200 }, { "epoch": 4.288256227758007, "grad_norm": 9.555450439453125, "learning_rate": 3.0553186771419162e-06, "loss": 0.0577, "num_input_tokens_seen": 1740936, "step": 1205 }, { "epoch": 4.295373665480427, "eval_loss": 0.3402128219604492, "eval_runtime": 0.6132, "eval_samples_per_second": 406.087, "eval_steps_per_second": 52.188, "num_input_tokens_seen": 1743624, "step": 1207 }, { "epoch": 4.306049822064057, "grad_norm": 1.463619589805603, "learning_rate": 2.908185704876101e-06, "loss": 0.0397, "num_input_tokens_seen": 1747784, "step": 1210 }, { "epoch": 4.3238434163701065, "grad_norm": 8.920357704162598, "learning_rate": 2.7644644370515365e-06, "loss": 0.0636, "num_input_tokens_seen": 1754888, "step": 1215 }, { "epoch": 4.341637010676156, "grad_norm": 1.068237543106079, "learning_rate": 2.624177068970124e-06, "loss": 0.0083, "num_input_tokens_seen": 1762632, "step": 1220 }, { "epoch": 4.359430604982206, "grad_norm": 15.559476852416992, "learning_rate": 2.4873452656264313e-06, "loss": 0.0331, "num_input_tokens_seen": 1769928, "step": 1225 }, { "epoch": 4.377224199288256, "grad_norm": 0.16921275854110718, "learning_rate": 2.3539901583619185e-06, "loss": 0.0824, "num_input_tokens_seen": 1777480, "step": 1230 }, { "epoch": 4.395017793594306, "grad_norm": 0.30731886625289917, "learning_rate": 2.2241323416015453e-06, "loss": 0.0384, "num_input_tokens_seen": 1784840, "step": 1235 }, { "epoch": 4.412811387900356, "grad_norm": 0.8764639496803284, "learning_rate": 2.09779186967331e-06, "loss": 0.0435, "num_input_tokens_seen": 1792584, "step": 1240 }, { "epoch": 4.430604982206406, "grad_norm": 10.101332664489746, "learning_rate": 1.9749882537112296e-06, "loss": 0.0525, "num_input_tokens_seen": 1800968, "step": 1245 }, { "epoch": 4.448398576512456, "grad_norm": 0.037536390125751495, "learning_rate": 1.8557404586421413e-06, "loss": 0.0777, "num_input_tokens_seen": 1808456, "step": 1250 }, { "epoch": 4.4661921708185055, "grad_norm": 14.205132484436035, "learning_rate": 1.7400669002569232e-06, "loss": 0.1469, "num_input_tokens_seen": 1816136, "step": 1255 }, { "epoch": 4.483985765124555, "grad_norm": 16.095531463623047, "learning_rate": 1.6279854423664697e-06, "loss": 0.0696, "num_input_tokens_seen": 1824136, "step": 1260 }, { "epoch": 4.501779359430605, "grad_norm": 0.2532411217689514, "learning_rate": 1.5195133940429345e-06, "loss": 0.0084, "num_input_tokens_seen": 1831304, "step": 1265 }, { "epoch": 4.519572953736655, "grad_norm": 5.228630065917969, "learning_rate": 1.4146675069466403e-06, "loss": 0.0259, "num_input_tokens_seen": 1837512, "step": 1270 }, { "epoch": 4.537366548042705, "grad_norm": 1.4012762308120728, "learning_rate": 1.313463972739068e-06, "loss": 0.0319, "num_input_tokens_seen": 1844296, "step": 1275 }, { "epoch": 4.548042704626335, "eval_loss": 0.3532261848449707, "eval_runtime": 0.6553, "eval_samples_per_second": 379.991, "eval_steps_per_second": 48.834, "num_input_tokens_seen": 1849416, "step": 1278 }, { "epoch": 4.555160142348754, "grad_norm": 0.7564399838447571, "learning_rate": 1.2159184205823432e-06, "loss": 0.0338, "num_input_tokens_seen": 1851720, "step": 1280 }, { "epoch": 4.572953736654805, "grad_norm": 0.5453316569328308, "learning_rate": 1.122045914725564e-06, "loss": 0.0457, "num_input_tokens_seen": 1858120, "step": 1285 }, { "epoch": 4.590747330960854, "grad_norm": 9.23385238647461, "learning_rate": 1.0318609521783818e-06, "loss": 0.0645, "num_input_tokens_seen": 1865928, "step": 1290 }, { "epoch": 4.608540925266904, "grad_norm": 6.625101566314697, "learning_rate": 9.453774604721938e-07, "loss": 0.0261, "num_input_tokens_seen": 1873800, "step": 1295 }, { "epoch": 4.6263345195729535, "grad_norm": 11.140019416809082, "learning_rate": 8.62608795509276e-07, "loss": 0.054, "num_input_tokens_seen": 1881800, "step": 1300 }, { "epoch": 4.644128113879003, "grad_norm": 1.8604605197906494, "learning_rate": 7.835677395001795e-07, "loss": 0.0036, "num_input_tokens_seen": 1888648, "step": 1305 }, { "epoch": 4.661921708185053, "grad_norm": 10.582422256469727, "learning_rate": 7.082664989897487e-07, "loss": 0.1115, "num_input_tokens_seen": 1895432, "step": 1310 }, { "epoch": 4.679715302491103, "grad_norm": 5.248901844024658, "learning_rate": 6.367167029720234e-07, "loss": 0.0608, "num_input_tokens_seen": 1902408, "step": 1315 }, { "epoch": 4.697508896797153, "grad_norm": 0.2927665412425995, "learning_rate": 5.68929401094323e-07, "loss": 0.0289, "num_input_tokens_seen": 1910344, "step": 1320 }, { "epoch": 4.715302491103203, "grad_norm": 0.10143531113862991, "learning_rate": 5.049150619508502e-07, "loss": 0.0309, "num_input_tokens_seen": 1918472, "step": 1325 }, { "epoch": 4.733096085409253, "grad_norm": 1.0533421039581299, "learning_rate": 4.4468357146596475e-07, "loss": 0.0078, "num_input_tokens_seen": 1924744, "step": 1330 }, { "epoch": 4.750889679715303, "grad_norm": 0.11136168986558914, "learning_rate": 3.8824423136748777e-07, "loss": 0.0676, "num_input_tokens_seen": 1932872, "step": 1335 }, { "epoch": 4.7686832740213525, "grad_norm": 7.673605442047119, "learning_rate": 3.3560575775019864e-07, "loss": 0.0673, "num_input_tokens_seen": 1940040, "step": 1340 }, { "epoch": 4.786476868327402, "grad_norm": 9.804219245910645, "learning_rate": 2.8677627972978906e-07, "loss": 0.0846, "num_input_tokens_seen": 1948936, "step": 1345 }, { "epoch": 4.800711743772242, "eval_loss": 0.34229812026023865, "eval_runtime": 0.623, "eval_samples_per_second": 399.663, "eval_steps_per_second": 51.362, "num_input_tokens_seen": 1954568, "step": 1349 }, { "epoch": 4.804270462633452, "grad_norm": 0.05879069119691849, "learning_rate": 2.417633381874534e-07, "loss": 0.001, "num_input_tokens_seen": 1955912, "step": 1350 }, { "epoch": 4.822064056939502, "grad_norm": 5.924869537353516, "learning_rate": 2.0057388460533732e-07, "loss": 0.0243, "num_input_tokens_seen": 1962760, "step": 1355 }, { "epoch": 4.839857651245552, "grad_norm": 5.939824104309082, "learning_rate": 1.6321427999298755e-07, "loss": 0.0594, "num_input_tokens_seen": 1969160, "step": 1360 }, { "epoch": 4.857651245551601, "grad_norm": 1.1909672021865845, "learning_rate": 1.2969029390501597e-07, "loss": 0.0329, "num_input_tokens_seen": 1975752, "step": 1365 }, { "epoch": 4.875444839857651, "grad_norm": 2.516611099243164, "learning_rate": 1.0000710355008159e-07, "loss": 0.0349, "num_input_tokens_seen": 1983240, "step": 1370 }, { "epoch": 4.893238434163701, "grad_norm": 2.7162880897521973, "learning_rate": 7.416929299135511e-08, "loss": 0.004, "num_input_tokens_seen": 1990792, "step": 1375 }, { "epoch": 4.911032028469751, "grad_norm": 0.11341112107038498, "learning_rate": 5.218085243859638e-08, "loss": 0.028, "num_input_tokens_seen": 1998728, "step": 1380 }, { "epoch": 4.9288256227758005, "grad_norm": 12.289280891418457, "learning_rate": 3.4045177631936155e-08, "loss": 0.046, "num_input_tokens_seen": 2006920, "step": 1385 }, { "epoch": 4.94661921708185, "grad_norm": 5.766960144042969, "learning_rate": 1.976506931745392e-08, "loss": 0.0136, "num_input_tokens_seen": 2013128, "step": 1390 }, { "epoch": 4.9644128113879, "grad_norm": 0.9045501351356506, "learning_rate": 9.3427328146517e-09, "loss": 0.0718, "num_input_tokens_seen": 2021704, "step": 1395 }, { "epoch": 4.98220640569395, "grad_norm": 1.2223786115646362, "learning_rate": 2.779777675890327e-09, "loss": 0.1224, "num_input_tokens_seen": 2028872, "step": 1400 }, { "epoch": 5.0, "grad_norm": 0.01060063298791647, "learning_rate": 7.72174378022017e-11, "loss": 0.0499, "num_input_tokens_seen": 2035272, "step": 1405 }, { "epoch": 5.0, "num_input_tokens_seen": 2035272, "step": 1405, "total_flos": 1.1883702201974784e+16, "train_loss": 0.17133487164477065, "train_runtime": 699.7603, "train_samples_per_second": 16.013, "train_steps_per_second": 2.008 } ], "logging_steps": 5, "max_steps": 1405, "num_input_tokens_seen": 2035272, "num_train_epochs": 5, "save_steps": 71, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.1883702201974784e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }