{ "best_global_step": 284, "best_metric": 0.11889845132827759, "best_model_checkpoint": "saves_bts_preliminary/base/llama-3.2-1b-instruct/train_rte_42_1776331559/checkpoint-284", "epoch": 5.0, "eval_steps": 71, "global_step": 1405, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.017793594306049824, "grad_norm": 214.55776977539062, "learning_rate": 1.4184397163120568e-07, "loss": 0.7401, "num_input_tokens_seen": 7872, "step": 5 }, { "epoch": 0.03558718861209965, "grad_norm": 161.54965209960938, "learning_rate": 3.1914893617021275e-07, "loss": 0.6171, "num_input_tokens_seen": 14784, "step": 10 }, { "epoch": 0.05338078291814947, "grad_norm": 70.1294937133789, "learning_rate": 4.964539007092199e-07, "loss": 0.4092, "num_input_tokens_seen": 23424, "step": 15 }, { "epoch": 0.0711743772241993, "grad_norm": 83.8577880859375, "learning_rate": 6.73758865248227e-07, "loss": 0.2659, "num_input_tokens_seen": 29824, "step": 20 }, { "epoch": 0.08896797153024912, "grad_norm": 70.67394256591797, "learning_rate": 8.510638297872341e-07, "loss": 0.2666, "num_input_tokens_seen": 37824, "step": 25 }, { "epoch": 0.10676156583629894, "grad_norm": 35.297691345214844, "learning_rate": 1.0283687943262412e-06, "loss": 0.2573, "num_input_tokens_seen": 44608, "step": 30 }, { "epoch": 0.12455516014234876, "grad_norm": 8.436112403869629, "learning_rate": 1.2056737588652482e-06, "loss": 0.3322, "num_input_tokens_seen": 51968, "step": 35 }, { "epoch": 0.1423487544483986, "grad_norm": 13.455877304077148, "learning_rate": 1.3829787234042555e-06, "loss": 0.1441, "num_input_tokens_seen": 59456, "step": 40 }, { "epoch": 0.1601423487544484, "grad_norm": 32.384830474853516, "learning_rate": 1.5602836879432626e-06, "loss": 0.163, "num_input_tokens_seen": 66496, "step": 45 }, { "epoch": 0.17793594306049823, "grad_norm": 26.830564498901367, "learning_rate": 1.7375886524822697e-06, "loss": 0.1815, "num_input_tokens_seen": 73408, "step": 50 }, { "epoch": 0.19572953736654805, "grad_norm": 7.963785171508789, "learning_rate": 1.9148936170212767e-06, "loss": 0.1484, "num_input_tokens_seen": 80576, "step": 55 }, { "epoch": 0.21352313167259787, "grad_norm": 8.084745407104492, "learning_rate": 2.092198581560284e-06, "loss": 0.1828, "num_input_tokens_seen": 88256, "step": 60 }, { "epoch": 0.2313167259786477, "grad_norm": 11.574740409851074, "learning_rate": 2.269503546099291e-06, "loss": 0.1709, "num_input_tokens_seen": 96256, "step": 65 }, { "epoch": 0.2491103202846975, "grad_norm": 20.736160278320312, "learning_rate": 2.446808510638298e-06, "loss": 0.2309, "num_input_tokens_seen": 103424, "step": 70 }, { "epoch": 0.2526690391459075, "eval_loss": 0.18017518520355225, "eval_runtime": 0.6321, "eval_samples_per_second": 393.895, "eval_steps_per_second": 50.621, "num_input_tokens_seen": 105024, "step": 71 }, { "epoch": 0.2669039145907473, "grad_norm": 20.432262420654297, "learning_rate": 2.624113475177305e-06, "loss": 0.1733, "num_input_tokens_seen": 110528, "step": 75 }, { "epoch": 0.2846975088967972, "grad_norm": 4.497078895568848, "learning_rate": 2.8014184397163125e-06, "loss": 0.1469, "num_input_tokens_seen": 117440, "step": 80 }, { "epoch": 0.302491103202847, "grad_norm": 13.809417724609375, "learning_rate": 2.978723404255319e-06, "loss": 0.1417, "num_input_tokens_seen": 125504, "step": 85 }, { "epoch": 0.3202846975088968, "grad_norm": 47.335994720458984, "learning_rate": 3.1560283687943267e-06, "loss": 0.1887, "num_input_tokens_seen": 132352, "step": 90 }, { "epoch": 0.33807829181494664, "grad_norm": 33.13681411743164, "learning_rate": 3.3333333333333333e-06, "loss": 0.1507, "num_input_tokens_seen": 139200, "step": 95 }, { "epoch": 0.35587188612099646, "grad_norm": 21.207801818847656, "learning_rate": 3.510638297872341e-06, "loss": 0.0966, "num_input_tokens_seen": 147904, "step": 100 }, { "epoch": 0.3736654804270463, "grad_norm": 27.387420654296875, "learning_rate": 3.6879432624113475e-06, "loss": 0.2218, "num_input_tokens_seen": 154240, "step": 105 }, { "epoch": 0.3914590747330961, "grad_norm": 14.321903228759766, "learning_rate": 3.865248226950355e-06, "loss": 0.1246, "num_input_tokens_seen": 161472, "step": 110 }, { "epoch": 0.4092526690391459, "grad_norm": 24.01630401611328, "learning_rate": 4.042553191489362e-06, "loss": 0.1689, "num_input_tokens_seen": 168192, "step": 115 }, { "epoch": 0.42704626334519574, "grad_norm": 5.953591346740723, "learning_rate": 4.219858156028369e-06, "loss": 0.1818, "num_input_tokens_seen": 174656, "step": 120 }, { "epoch": 0.44483985765124556, "grad_norm": 7.9348039627075195, "learning_rate": 4.397163120567377e-06, "loss": 0.1189, "num_input_tokens_seen": 181632, "step": 125 }, { "epoch": 0.4626334519572954, "grad_norm": 31.200048446655273, "learning_rate": 4.574468085106383e-06, "loss": 0.0973, "num_input_tokens_seen": 191488, "step": 130 }, { "epoch": 0.4804270462633452, "grad_norm": 20.26383399963379, "learning_rate": 4.751773049645391e-06, "loss": 0.1978, "num_input_tokens_seen": 198848, "step": 135 }, { "epoch": 0.498220640569395, "grad_norm": 34.11137771606445, "learning_rate": 4.929078014184397e-06, "loss": 0.1861, "num_input_tokens_seen": 207232, "step": 140 }, { "epoch": 0.505338078291815, "eval_loss": 0.2461702525615692, "eval_runtime": 0.5958, "eval_samples_per_second": 417.954, "eval_steps_per_second": 53.713, "num_input_tokens_seen": 209536, "step": 142 }, { "epoch": 0.5160142348754448, "grad_norm": 5.17560338973999, "learning_rate": 4.999930504592181e-06, "loss": 0.2676, "num_input_tokens_seen": 213952, "step": 145 }, { "epoch": 0.5338078291814946, "grad_norm": 4.064759731292725, "learning_rate": 4.999505824425164e-06, "loss": 0.1686, "num_input_tokens_seen": 221376, "step": 150 }, { "epoch": 0.5516014234875445, "grad_norm": 6.850118160247803, "learning_rate": 4.998695138156149e-06, "loss": 0.1074, "num_input_tokens_seen": 228928, "step": 155 }, { "epoch": 0.5693950177935944, "grad_norm": 12.681975364685059, "learning_rate": 4.997498570981822e-06, "loss": 0.1216, "num_input_tokens_seen": 236352, "step": 160 }, { "epoch": 0.5871886120996441, "grad_norm": 20.49947166442871, "learning_rate": 4.995916307691601e-06, "loss": 0.1426, "num_input_tokens_seen": 244416, "step": 165 }, { "epoch": 0.604982206405694, "grad_norm": 17.282085418701172, "learning_rate": 4.993948592639105e-06, "loss": 0.175, "num_input_tokens_seen": 251456, "step": 170 }, { "epoch": 0.6227758007117438, "grad_norm": 12.41250991821289, "learning_rate": 4.991595729704405e-06, "loss": 0.1179, "num_input_tokens_seen": 258880, "step": 175 }, { "epoch": 0.6405693950177936, "grad_norm": 28.129770278930664, "learning_rate": 4.988858082247109e-06, "loss": 0.1235, "num_input_tokens_seen": 265152, "step": 180 }, { "epoch": 0.6583629893238434, "grad_norm": 27.979137420654297, "learning_rate": 4.985736073050237e-06, "loss": 0.1596, "num_input_tokens_seen": 272576, "step": 185 }, { "epoch": 0.6761565836298933, "grad_norm": 5.348273754119873, "learning_rate": 4.982230184254934e-06, "loss": 0.1188, "num_input_tokens_seen": 279744, "step": 190 }, { "epoch": 0.693950177935943, "grad_norm": 21.296180725097656, "learning_rate": 4.9783409572860105e-06, "loss": 0.1255, "num_input_tokens_seen": 287680, "step": 195 }, { "epoch": 0.7117437722419929, "grad_norm": 8.518939018249512, "learning_rate": 4.9740689927683314e-06, "loss": 0.0801, "num_input_tokens_seen": 294592, "step": 200 }, { "epoch": 0.7295373665480427, "grad_norm": 22.954431533813477, "learning_rate": 4.9694149504340515e-06, "loss": 0.0902, "num_input_tokens_seen": 301440, "step": 205 }, { "epoch": 0.7473309608540926, "grad_norm": 6.020153999328613, "learning_rate": 4.964379549020741e-06, "loss": 0.0658, "num_input_tokens_seen": 308416, "step": 210 }, { "epoch": 0.7580071174377224, "eval_loss": 0.1589389145374298, "eval_runtime": 1.701, "eval_samples_per_second": 146.387, "eval_steps_per_second": 18.813, "num_input_tokens_seen": 312576, "step": 213 }, { "epoch": 0.7651245551601423, "grad_norm": 22.559890747070312, "learning_rate": 4.9589635661603845e-06, "loss": 0.1047, "num_input_tokens_seen": 315328, "step": 215 }, { "epoch": 0.7829181494661922, "grad_norm": 21.644763946533203, "learning_rate": 4.953167838259285e-06, "loss": 0.0899, "num_input_tokens_seen": 322688, "step": 220 }, { "epoch": 0.800711743772242, "grad_norm": 21.93659782409668, "learning_rate": 4.946993260368904e-06, "loss": 0.1884, "num_input_tokens_seen": 329280, "step": 225 }, { "epoch": 0.8185053380782918, "grad_norm": 4.014484405517578, "learning_rate": 4.9404407860476275e-06, "loss": 0.0862, "num_input_tokens_seen": 336896, "step": 230 }, { "epoch": 0.8362989323843416, "grad_norm": 9.570290565490723, "learning_rate": 4.933511427213511e-06, "loss": 0.1129, "num_input_tokens_seen": 344128, "step": 235 }, { "epoch": 0.8540925266903915, "grad_norm": 8.622743606567383, "learning_rate": 4.926206253988001e-06, "loss": 0.0612, "num_input_tokens_seen": 350912, "step": 240 }, { "epoch": 0.8718861209964412, "grad_norm": 20.623458862304688, "learning_rate": 4.91852639453068e-06, "loss": 0.1364, "num_input_tokens_seen": 358016, "step": 245 }, { "epoch": 0.8896797153024911, "grad_norm": 20.129985809326172, "learning_rate": 4.910473034865033e-06, "loss": 0.0751, "num_input_tokens_seen": 364736, "step": 250 }, { "epoch": 0.9074733096085409, "grad_norm": 38.003440856933594, "learning_rate": 4.902047418695293e-06, "loss": 0.1051, "num_input_tokens_seen": 371648, "step": 255 }, { "epoch": 0.9252669039145908, "grad_norm": 12.691219329833984, "learning_rate": 4.893250847214369e-06, "loss": 0.0675, "num_input_tokens_seen": 379200, "step": 260 }, { "epoch": 0.9430604982206405, "grad_norm": 13.248903274536133, "learning_rate": 4.884084678902898e-06, "loss": 0.1611, "num_input_tokens_seen": 387200, "step": 265 }, { "epoch": 0.9608540925266904, "grad_norm": 19.279905319213867, "learning_rate": 4.874550329319457e-06, "loss": 0.1232, "num_input_tokens_seen": 395264, "step": 270 }, { "epoch": 0.9786476868327402, "grad_norm": 10.716254234313965, "learning_rate": 4.864649270881944e-06, "loss": 0.135, "num_input_tokens_seen": 402176, "step": 275 }, { "epoch": 0.99644128113879, "grad_norm": 10.358949661254883, "learning_rate": 4.854383032640196e-06, "loss": 0.0765, "num_input_tokens_seen": 409984, "step": 280 }, { "epoch": 1.01067615658363, "eval_loss": 0.11889845132827759, "eval_runtime": 0.6175, "eval_samples_per_second": 403.229, "eval_steps_per_second": 51.821, "num_input_tokens_seen": 414040, "step": 284 }, { "epoch": 1.0142348754448398, "grad_norm": 19.193450927734375, "learning_rate": 4.843753200039851e-06, "loss": 0.0754, "num_input_tokens_seen": 415256, "step": 285 }, { "epoch": 1.0320284697508897, "grad_norm": 15.135278701782227, "learning_rate": 4.832761414677502e-06, "loss": 0.067, "num_input_tokens_seen": 422808, "step": 290 }, { "epoch": 1.0498220640569396, "grad_norm": 39.43471145629883, "learning_rate": 4.821409374047184e-06, "loss": 0.0342, "num_input_tokens_seen": 430104, "step": 295 }, { "epoch": 1.0676156583629894, "grad_norm": 88.81342315673828, "learning_rate": 4.809698831278217e-06, "loss": 0.0683, "num_input_tokens_seen": 436760, "step": 300 }, { "epoch": 1.085409252669039, "grad_norm": 0.4617443382740021, "learning_rate": 4.797631594864475e-06, "loss": 0.1073, "num_input_tokens_seen": 444952, "step": 305 }, { "epoch": 1.103202846975089, "grad_norm": 40.835601806640625, "learning_rate": 4.785209528385087e-06, "loss": 0.1453, "num_input_tokens_seen": 452760, "step": 310 }, { "epoch": 1.1209964412811388, "grad_norm": 47.22534942626953, "learning_rate": 4.7724345502166435e-06, "loss": 0.2851, "num_input_tokens_seen": 458392, "step": 315 }, { "epoch": 1.1387900355871885, "grad_norm": 22.309940338134766, "learning_rate": 4.759308633236934e-06, "loss": 0.0427, "num_input_tokens_seen": 465112, "step": 320 }, { "epoch": 1.1565836298932384, "grad_norm": 0.2836686968803406, "learning_rate": 4.74583380452027e-06, "loss": 0.0369, "num_input_tokens_seen": 472216, "step": 325 }, { "epoch": 1.1743772241992882, "grad_norm": 20.73933219909668, "learning_rate": 4.7320121450244395e-06, "loss": 0.1237, "num_input_tokens_seen": 479576, "step": 330 }, { "epoch": 1.1921708185053381, "grad_norm": 0.6483802199363708, "learning_rate": 4.717845789269333e-06, "loss": 0.0822, "num_input_tokens_seen": 486552, "step": 335 }, { "epoch": 1.209964412811388, "grad_norm": 10.158080101013184, "learning_rate": 4.703336925007311e-06, "loss": 0.0835, "num_input_tokens_seen": 494616, "step": 340 }, { "epoch": 1.2277580071174377, "grad_norm": 22.884260177612305, "learning_rate": 4.68848779288534e-06, "loss": 0.0413, "num_input_tokens_seen": 501400, "step": 345 }, { "epoch": 1.2455516014234875, "grad_norm": 0.23994407057762146, "learning_rate": 4.673300686098957e-06, "loss": 0.0284, "num_input_tokens_seen": 508888, "step": 350 }, { "epoch": 1.2633451957295374, "grad_norm": 28.652849197387695, "learning_rate": 4.657777950038133e-06, "loss": 0.1848, "num_input_tokens_seen": 517656, "step": 355 }, { "epoch": 1.2633451957295374, "eval_loss": 0.21280953288078308, "eval_runtime": 0.6154, "eval_samples_per_second": 404.643, "eval_steps_per_second": 52.002, "num_input_tokens_seen": 517656, "step": 355 }, { "epoch": 1.281138790035587, "grad_norm": 29.27073860168457, "learning_rate": 4.641921981925064e-06, "loss": 0.0684, "num_input_tokens_seen": 526232, "step": 360 }, { "epoch": 1.298932384341637, "grad_norm": 6.552826404571533, "learning_rate": 4.625735230443959e-06, "loss": 0.0556, "num_input_tokens_seen": 533400, "step": 365 }, { "epoch": 1.3167259786476868, "grad_norm": 19.556528091430664, "learning_rate": 4.609220195362886e-06, "loss": 0.1059, "num_input_tokens_seen": 542168, "step": 370 }, { "epoch": 1.3345195729537367, "grad_norm": 0.45224103331565857, "learning_rate": 4.592379427147722e-06, "loss": 0.0525, "num_input_tokens_seen": 549976, "step": 375 }, { "epoch": 1.3523131672597866, "grad_norm": 7.306344032287598, "learning_rate": 4.575215526568278e-06, "loss": 0.118, "num_input_tokens_seen": 557016, "step": 380 }, { "epoch": 1.3701067615658362, "grad_norm": 17.069351196289062, "learning_rate": 4.557731144296659e-06, "loss": 0.0831, "num_input_tokens_seen": 564504, "step": 385 }, { "epoch": 1.387900355871886, "grad_norm": 2.3357198238372803, "learning_rate": 4.539928980497903e-06, "loss": 0.0283, "num_input_tokens_seen": 571864, "step": 390 }, { "epoch": 1.405693950177936, "grad_norm": 18.61362648010254, "learning_rate": 4.521811784412996e-06, "loss": 0.0483, "num_input_tokens_seen": 578456, "step": 395 }, { "epoch": 1.4234875444839858, "grad_norm": 51.09469223022461, "learning_rate": 4.503382353934295e-06, "loss": 0.1155, "num_input_tokens_seen": 584600, "step": 400 }, { "epoch": 1.4412811387900355, "grad_norm": 0.02246745675802231, "learning_rate": 4.484643535173438e-06, "loss": 0.0086, "num_input_tokens_seen": 591128, "step": 405 }, { "epoch": 1.4590747330960854, "grad_norm": 0.0882764533162117, "learning_rate": 4.465598222021818e-06, "loss": 0.0969, "num_input_tokens_seen": 598552, "step": 410 }, { "epoch": 1.4768683274021353, "grad_norm": 33.67585372924805, "learning_rate": 4.446249355703661e-06, "loss": 0.0715, "num_input_tokens_seen": 607320, "step": 415 }, { "epoch": 1.4946619217081851, "grad_norm": 24.378616333007812, "learning_rate": 4.426599924321815e-06, "loss": 0.094, "num_input_tokens_seen": 614744, "step": 420 }, { "epoch": 1.512455516014235, "grad_norm": 9.61978530883789, "learning_rate": 4.406652962396278e-06, "loss": 0.0306, "num_input_tokens_seen": 622808, "step": 425 }, { "epoch": 1.5160142348754448, "eval_loss": 0.17913997173309326, "eval_runtime": 0.6176, "eval_samples_per_second": 403.205, "eval_steps_per_second": 51.817, "num_input_tokens_seen": 624344, "step": 426 }, { "epoch": 1.5302491103202847, "grad_norm": 18.067750930786133, "learning_rate": 4.386411550395576e-06, "loss": 0.103, "num_input_tokens_seen": 630488, "step": 430 }, { "epoch": 1.5480427046263345, "grad_norm": 2.518416404724121, "learning_rate": 4.365878814261032e-06, "loss": 0.0775, "num_input_tokens_seen": 638424, "step": 435 }, { "epoch": 1.5658362989323842, "grad_norm": 17.197315216064453, "learning_rate": 4.34505792492402e-06, "loss": 0.0767, "num_input_tokens_seen": 645208, "step": 440 }, { "epoch": 1.583629893238434, "grad_norm": 1.7272682189941406, "learning_rate": 4.3239520978162685e-06, "loss": 0.013, "num_input_tokens_seen": 653016, "step": 445 }, { "epoch": 1.601423487544484, "grad_norm": 2.9054439067840576, "learning_rate": 4.302564592373293e-06, "loss": 0.0129, "num_input_tokens_seen": 659992, "step": 450 }, { "epoch": 1.6192170818505338, "grad_norm": 0.1965407431125641, "learning_rate": 4.280898711531026e-06, "loss": 0.1234, "num_input_tokens_seen": 667224, "step": 455 }, { "epoch": 1.6370106761565837, "grad_norm": 1.4352664947509766, "learning_rate": 4.258957801215743e-06, "loss": 0.1334, "num_input_tokens_seen": 675160, "step": 460 }, { "epoch": 1.6548042704626336, "grad_norm": 7.041714191436768, "learning_rate": 4.236745249827336e-06, "loss": 0.1297, "num_input_tokens_seen": 683544, "step": 465 }, { "epoch": 1.6725978647686834, "grad_norm": 22.762420654296875, "learning_rate": 4.2142644877160334e-06, "loss": 0.0334, "num_input_tokens_seen": 689368, "step": 470 }, { "epoch": 1.690391459074733, "grad_norm": 15.38829231262207, "learning_rate": 4.191518986652642e-06, "loss": 0.0779, "num_input_tokens_seen": 695832, "step": 475 }, { "epoch": 1.708185053380783, "grad_norm": 5.282833576202393, "learning_rate": 4.168512259292391e-06, "loss": 0.0085, "num_input_tokens_seen": 703128, "step": 480 }, { "epoch": 1.7259786476868326, "grad_norm": 0.2967665493488312, "learning_rate": 4.14524785863246e-06, "loss": 0.0617, "num_input_tokens_seen": 709528, "step": 485 }, { "epoch": 1.7437722419928825, "grad_norm": 18.050338745117188, "learning_rate": 4.121729377463285e-06, "loss": 0.0537, "num_input_tokens_seen": 716312, "step": 490 }, { "epoch": 1.7615658362989324, "grad_norm": 34.60630416870117, "learning_rate": 4.0979604478137045e-06, "loss": 0.1029, "num_input_tokens_seen": 722776, "step": 495 }, { "epoch": 1.7686832740213523, "eval_loss": 0.13597266376018524, "eval_runtime": 0.6055, "eval_samples_per_second": 411.239, "eval_steps_per_second": 52.85, "num_input_tokens_seen": 725656, "step": 497 }, { "epoch": 1.7793594306049823, "grad_norm": 19.958894729614258, "learning_rate": 4.0739447403900605e-06, "loss": 0.1142, "num_input_tokens_seen": 729944, "step": 500 }, { "epoch": 1.7971530249110321, "grad_norm": 13.6007719039917, "learning_rate": 4.0496859640093215e-06, "loss": 0.0837, "num_input_tokens_seen": 737112, "step": 505 }, { "epoch": 1.814946619217082, "grad_norm": 2.8377349376678467, "learning_rate": 4.025187865026311e-06, "loss": 0.0124, "num_input_tokens_seen": 744408, "step": 510 }, { "epoch": 1.8327402135231317, "grad_norm": 4.554283618927002, "learning_rate": 4.0004542267551585e-06, "loss": 0.059, "num_input_tokens_seen": 750488, "step": 515 }, { "epoch": 1.8505338078291815, "grad_norm": 20.501800537109375, "learning_rate": 3.975488868885022e-06, "loss": 0.0662, "num_input_tokens_seen": 757528, "step": 520 }, { "epoch": 1.8683274021352312, "grad_norm": 37.35130310058594, "learning_rate": 3.950295646890202e-06, "loss": 0.0299, "num_input_tokens_seen": 763736, "step": 525 }, { "epoch": 1.886120996441281, "grad_norm": 39.31193161010742, "learning_rate": 3.924878451434736e-06, "loss": 0.0666, "num_input_tokens_seen": 771864, "step": 530 }, { "epoch": 1.903914590747331, "grad_norm": 1.3388175964355469, "learning_rate": 3.899241207771546e-06, "loss": 0.072, "num_input_tokens_seen": 778712, "step": 535 }, { "epoch": 1.9217081850533808, "grad_norm": 12.562256813049316, "learning_rate": 3.873387875136252e-06, "loss": 0.0475, "num_input_tokens_seen": 784280, "step": 540 }, { "epoch": 1.9395017793594307, "grad_norm": 20.693452835083008, "learning_rate": 3.847322446135736e-06, "loss": 0.1443, "num_input_tokens_seen": 792280, "step": 545 }, { "epoch": 1.9572953736654806, "grad_norm": 10.482217788696289, "learning_rate": 3.821048946131549e-06, "loss": 0.1501, "num_input_tokens_seen": 798488, "step": 550 }, { "epoch": 1.9750889679715302, "grad_norm": 0.7575608491897583, "learning_rate": 3.794571432618267e-06, "loss": 0.0502, "num_input_tokens_seen": 806104, "step": 555 }, { "epoch": 1.99288256227758, "grad_norm": 13.330556869506836, "learning_rate": 3.767893994596876e-06, "loss": 0.0142, "num_input_tokens_seen": 813336, "step": 560 }, { "epoch": 2.0106761565836297, "grad_norm": 0.28736042976379395, "learning_rate": 3.7410207519432972e-06, "loss": 0.1868, "num_input_tokens_seen": 817576, "step": 565 }, { "epoch": 2.02135231316726, "eval_loss": 0.16060441732406616, "eval_runtime": 0.6313, "eval_samples_per_second": 394.424, "eval_steps_per_second": 50.689, "num_input_tokens_seen": 821416, "step": 568 }, { "epoch": 2.0284697508896796, "grad_norm": 35.438270568847656, "learning_rate": 3.713955854772144e-06, "loss": 0.0138, "num_input_tokens_seen": 823848, "step": 570 }, { "epoch": 2.0462633451957295, "grad_norm": 23.84980010986328, "learning_rate": 3.686703482795802e-06, "loss": 0.1069, "num_input_tokens_seen": 832232, "step": 575 }, { "epoch": 2.0640569395017794, "grad_norm": 0.2273331582546234, "learning_rate": 3.6592678446789516e-06, "loss": 0.042, "num_input_tokens_seen": 840424, "step": 580 }, { "epoch": 2.0818505338078293, "grad_norm": 0.2911248505115509, "learning_rate": 3.631653177388605e-06, "loss": 0.0325, "num_input_tokens_seen": 846824, "step": 585 }, { "epoch": 2.099644128113879, "grad_norm": 0.03189073130488396, "learning_rate": 3.6038637455397802e-06, "loss": 0.003, "num_input_tokens_seen": 853608, "step": 590 }, { "epoch": 2.117437722419929, "grad_norm": 0.06126879155635834, "learning_rate": 3.575903840736906e-06, "loss": 0.1002, "num_input_tokens_seen": 860968, "step": 595 }, { "epoch": 2.135231316725979, "grad_norm": 0.12086429446935654, "learning_rate": 3.547777780911055e-06, "loss": 0.0251, "num_input_tokens_seen": 868904, "step": 600 }, { "epoch": 2.1530249110320283, "grad_norm": 0.013016794808208942, "learning_rate": 3.519489909653113e-06, "loss": 0.0005, "num_input_tokens_seen": 876072, "step": 605 }, { "epoch": 2.170818505338078, "grad_norm": 41.4961051940918, "learning_rate": 3.4910445955429856e-06, "loss": 0.0155, "num_input_tokens_seen": 883752, "step": 610 }, { "epoch": 2.188612099644128, "grad_norm": 0.10037070512771606, "learning_rate": 3.4624462314749447e-06, "loss": 0.0004, "num_input_tokens_seen": 891304, "step": 615 }, { "epoch": 2.206405693950178, "grad_norm": 0.04163924232125282, "learning_rate": 3.433699233979222e-06, "loss": 0.0163, "num_input_tokens_seen": 899176, "step": 620 }, { "epoch": 2.224199288256228, "grad_norm": 0.09111510962247849, "learning_rate": 3.4048080425399506e-06, "loss": 0.0001, "num_input_tokens_seen": 907560, "step": 625 }, { "epoch": 2.2419928825622777, "grad_norm": 0.023458287119865417, "learning_rate": 3.375777118909561e-06, "loss": 0.0027, "num_input_tokens_seen": 915240, "step": 630 }, { "epoch": 2.2597864768683276, "grad_norm": 0.007639422547072172, "learning_rate": 3.346610946419743e-06, "loss": 0.0259, "num_input_tokens_seen": 921384, "step": 635 }, { "epoch": 2.2740213523131674, "eval_loss": 0.25424328446388245, "eval_runtime": 0.622, "eval_samples_per_second": 400.338, "eval_steps_per_second": 51.449, "num_input_tokens_seen": 926760, "step": 639 }, { "epoch": 2.277580071174377, "grad_norm": 0.02619522623717785, "learning_rate": 3.3173140292890673e-06, "loss": 0.0233, "num_input_tokens_seen": 927528, "step": 640 }, { "epoch": 2.295373665480427, "grad_norm": 0.031525298953056335, "learning_rate": 3.2878908919273867e-06, "loss": 0.0266, "num_input_tokens_seen": 934568, "step": 645 }, { "epoch": 2.3131672597864767, "grad_norm": 0.08830317109823227, "learning_rate": 3.2583460782371217e-06, "loss": 0.0006, "num_input_tokens_seen": 942248, "step": 650 }, { "epoch": 2.3309608540925266, "grad_norm": 0.005108945071697235, "learning_rate": 3.228684150911527e-06, "loss": 0.0003, "num_input_tokens_seen": 949096, "step": 655 }, { "epoch": 2.3487544483985765, "grad_norm": 0.04930766299366951, "learning_rate": 3.1989096907300634e-06, "loss": 0.0082, "num_input_tokens_seen": 955752, "step": 660 }, { "epoch": 2.3665480427046264, "grad_norm": 0.06299162656068802, "learning_rate": 3.1690272958509772e-06, "loss": 0.0486, "num_input_tokens_seen": 963176, "step": 665 }, { "epoch": 2.3843416370106763, "grad_norm": 0.6633197665214539, "learning_rate": 3.139041581101187e-06, "loss": 0.0001, "num_input_tokens_seen": 968232, "step": 670 }, { "epoch": 2.402135231316726, "grad_norm": 59.2174186706543, "learning_rate": 3.108957177263608e-06, "loss": 0.0222, "num_input_tokens_seen": 976552, "step": 675 }, { "epoch": 2.419928825622776, "grad_norm": 0.0013539546635001898, "learning_rate": 3.078778730362003e-06, "loss": 0.0075, "num_input_tokens_seen": 983720, "step": 680 }, { "epoch": 2.4377224199288254, "grad_norm": 0.01872558705508709, "learning_rate": 3.0485109009434844e-06, "loss": 0.0003, "num_input_tokens_seen": 991976, "step": 685 }, { "epoch": 2.4555160142348753, "grad_norm": 0.06228525564074516, "learning_rate": 3.018158363358773e-06, "loss": 0.061, "num_input_tokens_seen": 998184, "step": 690 }, { "epoch": 2.473309608540925, "grad_norm": 0.015886032953858376, "learning_rate": 2.9877258050403214e-06, "loss": 0.0, "num_input_tokens_seen": 1005672, "step": 695 }, { "epoch": 2.491103202846975, "grad_norm": 0.49660900235176086, "learning_rate": 2.9572179257784215e-06, "loss": 0.0152, "num_input_tokens_seen": 1013096, "step": 700 }, { "epoch": 2.508896797153025, "grad_norm": 0.01401636004447937, "learning_rate": 2.9266394369954056e-06, "loss": 0.0006, "num_input_tokens_seen": 1019304, "step": 705 }, { "epoch": 2.526690391459075, "grad_norm": 0.011517788283526897, "learning_rate": 2.8959950610180376e-06, "loss": 0.029, "num_input_tokens_seen": 1025320, "step": 710 }, { "epoch": 2.526690391459075, "eval_loss": 0.23608553409576416, "eval_runtime": 0.6224, "eval_samples_per_second": 400.086, "eval_steps_per_second": 51.417, "num_input_tokens_seen": 1025320, "step": 710 }, { "epoch": 2.5444839857651247, "grad_norm": 0.09000097960233688, "learning_rate": 2.865289530348243e-06, "loss": 0.0, "num_input_tokens_seen": 1032552, "step": 715 }, { "epoch": 2.562277580071174, "grad_norm": 0.0029357271268963814, "learning_rate": 2.8345275869322432e-06, "loss": 0.0, "num_input_tokens_seen": 1039912, "step": 720 }, { "epoch": 2.580071174377224, "grad_norm": 0.00977697316557169, "learning_rate": 2.8037139814282494e-06, "loss": 0.0092, "num_input_tokens_seen": 1047208, "step": 725 }, { "epoch": 2.597864768683274, "grad_norm": 0.03362439572811127, "learning_rate": 2.7728534724728027e-06, "loss": 0.0001, "num_input_tokens_seen": 1053928, "step": 730 }, { "epoch": 2.6156583629893237, "grad_norm": 109.98116302490234, "learning_rate": 2.741950825945881e-06, "loss": 0.0806, "num_input_tokens_seen": 1061608, "step": 735 }, { "epoch": 2.6334519572953736, "grad_norm": 0.026239968836307526, "learning_rate": 2.7110108142348962e-06, "loss": 0.0747, "num_input_tokens_seen": 1067560, "step": 740 }, { "epoch": 2.6512455516014235, "grad_norm": 0.13907551765441895, "learning_rate": 2.6800382154976734e-06, "loss": 0.0001, "num_input_tokens_seen": 1074152, "step": 745 }, { "epoch": 2.6690391459074734, "grad_norm": 0.02416655234992504, "learning_rate": 2.64903781292455e-06, "loss": 0.0005, "num_input_tokens_seen": 1082856, "step": 750 }, { "epoch": 2.6868327402135233, "grad_norm": 1.5046820640563965, "learning_rate": 2.6180143939996926e-06, "loss": 0.0003, "num_input_tokens_seen": 1089512, "step": 755 }, { "epoch": 2.704626334519573, "grad_norm": 0.027851078659296036, "learning_rate": 2.5869727497617495e-06, "loss": 0.0433, "num_input_tokens_seen": 1096232, "step": 760 }, { "epoch": 2.722419928825623, "grad_norm": 0.0055125863291323185, "learning_rate": 2.55591767406396e-06, "loss": 0.0004, "num_input_tokens_seen": 1104168, "step": 765 }, { "epoch": 2.7402135231316724, "grad_norm": 26.093463897705078, "learning_rate": 2.524853962833825e-06, "loss": 0.1194, "num_input_tokens_seen": 1112232, "step": 770 }, { "epoch": 2.7580071174377223, "grad_norm": 0.31872352957725525, "learning_rate": 2.4937864133324514e-06, "loss": 0.0031, "num_input_tokens_seen": 1119016, "step": 775 }, { "epoch": 2.775800711743772, "grad_norm": 0.7696136832237244, "learning_rate": 2.462719823413707e-06, "loss": 0.0005, "num_input_tokens_seen": 1126696, "step": 780 }, { "epoch": 2.7793594306049823, "eval_loss": 0.23524385690689087, "eval_runtime": 1.141, "eval_samples_per_second": 218.224, "eval_steps_per_second": 28.045, "num_input_tokens_seen": 1128104, "step": 781 }, { "epoch": 2.793594306049822, "grad_norm": 0.014183886349201202, "learning_rate": 2.4316589907832654e-06, "loss": 0.0423, "num_input_tokens_seen": 1134184, "step": 785 }, { "epoch": 2.811387900355872, "grad_norm": 0.04211263358592987, "learning_rate": 2.4006087122576867e-06, "loss": 0.001, "num_input_tokens_seen": 1140392, "step": 790 }, { "epoch": 2.829181494661922, "grad_norm": 0.09497106820344925, "learning_rate": 2.3695737830236263e-06, "loss": 0.031, "num_input_tokens_seen": 1148328, "step": 795 }, { "epoch": 2.8469750889679717, "grad_norm": 0.040660299360752106, "learning_rate": 2.3385589958973073e-06, "loss": 0.0007, "num_input_tokens_seen": 1154024, "step": 800 }, { "epoch": 2.864768683274021, "grad_norm": 0.06477546691894531, "learning_rate": 2.3075691405843435e-06, "loss": 0.0003, "num_input_tokens_seen": 1160808, "step": 805 }, { "epoch": 2.882562277580071, "grad_norm": 0.022568199783563614, "learning_rate": 2.2766090029400573e-06, "loss": 0.0299, "num_input_tokens_seen": 1167912, "step": 810 }, { "epoch": 2.900355871886121, "grad_norm": 0.012872009538114071, "learning_rate": 2.2456833642303825e-06, "loss": 0.0001, "num_input_tokens_seen": 1174568, "step": 815 }, { "epoch": 2.9181494661921707, "grad_norm": 0.0069219921715557575, "learning_rate": 2.214797000393479e-06, "loss": 0.0001, "num_input_tokens_seen": 1181480, "step": 820 }, { "epoch": 2.9359430604982206, "grad_norm": 74.77655792236328, "learning_rate": 2.183954681302173e-06, "loss": 0.0251, "num_input_tokens_seen": 1189928, "step": 825 }, { "epoch": 2.9537366548042705, "grad_norm": 0.00504131056368351, "learning_rate": 2.15316117002733e-06, "loss": 0.0001, "num_input_tokens_seen": 1197480, "step": 830 }, { "epoch": 2.9715302491103204, "grad_norm": 0.014680106192827225, "learning_rate": 2.122421222102278e-06, "loss": 0.039, "num_input_tokens_seen": 1204584, "step": 835 }, { "epoch": 2.9893238434163703, "grad_norm": 32.642799377441406, "learning_rate": 2.0917395847884e-06, "loss": 0.0311, "num_input_tokens_seen": 1212584, "step": 840 }, { "epoch": 3.00711743772242, "grad_norm": 55.226280212402344, "learning_rate": 2.061120996341996e-06, "loss": 0.0104, "num_input_tokens_seen": 1217856, "step": 845 }, { "epoch": 3.0249110320284696, "grad_norm": 0.005825079046189785, "learning_rate": 2.030570185282544e-06, "loss": 0.0001, "num_input_tokens_seen": 1226624, "step": 850 }, { "epoch": 3.0320284697508897, "eval_loss": 0.25802624225616455, "eval_runtime": 0.6232, "eval_samples_per_second": 399.564, "eval_steps_per_second": 51.35, "num_input_tokens_seen": 1229440, "step": 852 }, { "epoch": 3.0427046263345194, "grad_norm": 0.0032935605850070715, "learning_rate": 2.0000918696624587e-06, "loss": 0.0, "num_input_tokens_seen": 1233152, "step": 855 }, { "epoch": 3.0604982206405693, "grad_norm": 0.024462368339300156, "learning_rate": 1.9696907563384687e-06, "loss": 0.0, "num_input_tokens_seen": 1240128, "step": 860 }, { "epoch": 3.078291814946619, "grad_norm": 0.004567572381347418, "learning_rate": 1.9393715402447228e-06, "loss": 0.0, "num_input_tokens_seen": 1248064, "step": 865 }, { "epoch": 3.096085409252669, "grad_norm": 0.008781103417277336, "learning_rate": 1.9091389036677384e-06, "loss": 0.0, "num_input_tokens_seen": 1255232, "step": 870 }, { "epoch": 3.113879003558719, "grad_norm": 0.003714508144184947, "learning_rate": 1.878997515523299e-06, "loss": 0.0486, "num_input_tokens_seen": 1262272, "step": 875 }, { "epoch": 3.131672597864769, "grad_norm": 0.0036656211595982313, "learning_rate": 1.8489520306354243e-06, "loss": 0.0001, "num_input_tokens_seen": 1269632, "step": 880 }, { "epoch": 3.1494661921708187, "grad_norm": 0.06997384876012802, "learning_rate": 1.8190070890175082e-06, "loss": 0.0001, "num_input_tokens_seen": 1277312, "step": 885 }, { "epoch": 3.167259786476868, "grad_norm": 0.0681416317820549, "learning_rate": 1.7891673151557493e-06, "loss": 0.0502, "num_input_tokens_seen": 1284096, "step": 890 }, { "epoch": 3.185053380782918, "grad_norm": 0.008159984834492207, "learning_rate": 1.7594373172949786e-06, "loss": 0.0001, "num_input_tokens_seen": 1291648, "step": 895 }, { "epoch": 3.202846975088968, "grad_norm": 0.008854905143380165, "learning_rate": 1.7298216867269906e-06, "loss": 0.0001, "num_input_tokens_seen": 1299712, "step": 900 }, { "epoch": 3.2206405693950177, "grad_norm": 0.016451064497232437, "learning_rate": 1.7003249970815028e-06, "loss": 0.0001, "num_input_tokens_seen": 1306176, "step": 905 }, { "epoch": 3.2384341637010676, "grad_norm": 0.010798790492117405, "learning_rate": 1.6709518036198307e-06, "loss": 0.0001, "num_input_tokens_seen": 1314112, "step": 910 }, { "epoch": 3.2562277580071175, "grad_norm": 13.17015266418457, "learning_rate": 1.6417066425314088e-06, "loss": 0.0251, "num_input_tokens_seen": 1321088, "step": 915 }, { "epoch": 3.2740213523131674, "grad_norm": 0.006120134145021439, "learning_rate": 1.612594030233252e-06, "loss": 0.0001, "num_input_tokens_seen": 1328512, "step": 920 }, { "epoch": 3.284697508896797, "eval_loss": 0.22950421273708344, "eval_runtime": 0.6162, "eval_samples_per_second": 404.1, "eval_steps_per_second": 51.932, "num_input_tokens_seen": 1332544, "step": 923 }, { "epoch": 3.2918149466192173, "grad_norm": 0.008929496631026268, "learning_rate": 1.5836184626724722e-06, "loss": 0.0005, "num_input_tokens_seen": 1336128, "step": 925 }, { "epoch": 3.309608540925267, "grad_norm": 0.004387512803077698, "learning_rate": 1.5547844146319547e-06, "loss": 0.0, "num_input_tokens_seen": 1343552, "step": 930 }, { "epoch": 3.3274021352313166, "grad_norm": 0.011066491715610027, "learning_rate": 1.5260963390393075e-06, "loss": 0.0383, "num_input_tokens_seen": 1351552, "step": 935 }, { "epoch": 3.3451957295373664, "grad_norm": 0.006294109858572483, "learning_rate": 1.4975586662791783e-06, "loss": 0.0002, "num_input_tokens_seen": 1358272, "step": 940 }, { "epoch": 3.3629893238434163, "grad_norm": 0.01710674725472927, "learning_rate": 1.4691758035090603e-06, "loss": 0.0001, "num_input_tokens_seen": 1366784, "step": 945 }, { "epoch": 3.380782918149466, "grad_norm": 0.006381936836987734, "learning_rate": 1.4409521339786809e-06, "loss": 0.0001, "num_input_tokens_seen": 1373312, "step": 950 }, { "epoch": 3.398576512455516, "grad_norm": 0.012110439129173756, "learning_rate": 1.41289201635308e-06, "loss": 0.0001, "num_input_tokens_seen": 1380736, "step": 955 }, { "epoch": 3.416370106761566, "grad_norm": 0.03375524654984474, "learning_rate": 1.3849997840394943e-06, "loss": 0.0001, "num_input_tokens_seen": 1388544, "step": 960 }, { "epoch": 3.434163701067616, "grad_norm": 0.010791816748678684, "learning_rate": 1.3572797445181346e-06, "loss": 0.0001, "num_input_tokens_seen": 1396160, "step": 965 }, { "epoch": 3.4519572953736652, "grad_norm": 0.006806841120123863, "learning_rate": 1.3297361786769654e-06, "loss": 0.0, "num_input_tokens_seen": 1404096, "step": 970 }, { "epoch": 3.469750889679715, "grad_norm": 0.006986913271248341, "learning_rate": 1.302373340150598e-06, "loss": 0.0004, "num_input_tokens_seen": 1411008, "step": 975 }, { "epoch": 3.487544483985765, "grad_norm": 0.006954096723347902, "learning_rate": 1.2751954546633872e-06, "loss": 0.0001, "num_input_tokens_seen": 1418880, "step": 980 }, { "epoch": 3.505338078291815, "grad_norm": 0.0037973152939230204, "learning_rate": 1.2482067193768419e-06, "loss": 0.0, "num_input_tokens_seen": 1426048, "step": 985 }, { "epoch": 3.5231316725978647, "grad_norm": 0.1800023317337036, "learning_rate": 1.2214113022414448e-06, "loss": 0.0001, "num_input_tokens_seen": 1432064, "step": 990 }, { "epoch": 3.5373665480427046, "eval_loss": 0.24046999216079712, "eval_runtime": 0.6864, "eval_samples_per_second": 362.784, "eval_steps_per_second": 46.623, "num_input_tokens_seen": 1438336, "step": 994 }, { "epoch": 3.5409252669039146, "grad_norm": 0.008008265867829323, "learning_rate": 1.1948133413529817e-06, "loss": 0.0, "num_input_tokens_seen": 1439808, "step": 995 }, { "epoch": 3.5587188612099645, "grad_norm": 0.004162625875324011, "learning_rate": 1.168416944313486e-06, "loss": 0.0001, "num_input_tokens_seen": 1447616, "step": 1000 }, { "epoch": 3.5765124555160144, "grad_norm": 0.0032155588269233704, "learning_rate": 1.1422261875968845e-06, "loss": 0.0, "num_input_tokens_seen": 1454208, "step": 1005 }, { "epoch": 3.5943060498220643, "grad_norm": 0.007771800272166729, "learning_rate": 1.1162451159194615e-06, "loss": 0.0, "num_input_tokens_seen": 1463296, "step": 1010 }, { "epoch": 3.612099644128114, "grad_norm": 0.12433820962905884, "learning_rate": 1.0904777416152166e-06, "loss": 0.0009, "num_input_tokens_seen": 1469952, "step": 1015 }, { "epoch": 3.6298932384341636, "grad_norm": 0.003224864834919572, "learning_rate": 1.0649280440162326e-06, "loss": 0.0, "num_input_tokens_seen": 1477184, "step": 1020 }, { "epoch": 3.6476868327402134, "grad_norm": 0.00848240777850151, "learning_rate": 1.0395999688381313e-06, "loss": 0.0, "num_input_tokens_seen": 1484160, "step": 1025 }, { "epoch": 3.6654804270462633, "grad_norm": 0.005719688255339861, "learning_rate": 1.0144974275707243e-06, "loss": 0.0001, "num_input_tokens_seen": 1491200, "step": 1030 }, { "epoch": 3.683274021352313, "grad_norm": 0.0028636339120566845, "learning_rate": 9.896242968739538e-07, "loss": 0.0, "num_input_tokens_seen": 1498368, "step": 1035 }, { "epoch": 3.701067615658363, "grad_norm": 0.0037352785002440214, "learning_rate": 9.649844179792082e-07, "loss": 0.0, "num_input_tokens_seen": 1505984, "step": 1040 }, { "epoch": 3.718861209964413, "grad_norm": 0.0014299631584435701, "learning_rate": 9.405815960961054e-07, "loss": 0.0, "num_input_tokens_seen": 1511680, "step": 1045 }, { "epoch": 3.7366548042704624, "grad_norm": 0.002295982325449586, "learning_rate": 9.164195998248471e-07, "loss": 0.0, "num_input_tokens_seen": 1517888, "step": 1050 }, { "epoch": 3.7544483985765122, "grad_norm": 0.0017496125074103475, "learning_rate": 8.925021605742212e-07, "loss": 0.0109, "num_input_tokens_seen": 1525568, "step": 1055 }, { "epoch": 3.772241992882562, "grad_norm": 0.0030260924249887466, "learning_rate": 8.68832971985347e-07, "loss": 0.0, "num_input_tokens_seen": 1532480, "step": 1060 }, { "epoch": 3.790035587188612, "grad_norm": 0.0027729899156838655, "learning_rate": 8.454156893612592e-07, "loss": 0.0, "num_input_tokens_seen": 1539072, "step": 1065 }, { "epoch": 3.790035587188612, "eval_loss": 0.2512344419956207, "eval_runtime": 0.6676, "eval_samples_per_second": 372.986, "eval_steps_per_second": 47.934, "num_input_tokens_seen": 1539072, "step": 1065 }, { "epoch": 3.807829181494662, "grad_norm": 0.010402582585811615, "learning_rate": 8.222539291024079e-07, "loss": 0.0, "num_input_tokens_seen": 1547584, "step": 1070 }, { "epoch": 3.8256227758007118, "grad_norm": 0.0076858168467879295, "learning_rate": 7.993512681481638e-07, "loss": 0.0, "num_input_tokens_seen": 1554304, "step": 1075 }, { "epoch": 3.8434163701067616, "grad_norm": 0.005163070745766163, "learning_rate": 7.767112434244254e-07, "loss": 0.0, "num_input_tokens_seen": 1560896, "step": 1080 }, { "epoch": 3.8612099644128115, "grad_norm": 0.007071156986057758, "learning_rate": 7.543373512973947e-07, "loss": 0.0, "num_input_tokens_seen": 1567744, "step": 1085 }, { "epoch": 3.8790035587188614, "grad_norm": 0.004036191385239363, "learning_rate": 7.322330470336314e-07, "loss": 0.032, "num_input_tokens_seen": 1574400, "step": 1090 }, { "epoch": 3.8967971530249113, "grad_norm": 0.001716184546239674, "learning_rate": 7.104017442664393e-07, "loss": 0.0187, "num_input_tokens_seen": 1581504, "step": 1095 }, { "epoch": 3.914590747330961, "grad_norm": 0.03370984271168709, "learning_rate": 6.88846814468691e-07, "loss": 0.0, "num_input_tokens_seen": 1589504, "step": 1100 }, { "epoch": 3.9323843416370106, "grad_norm": 19.512357711791992, "learning_rate": 6.67571586432163e-07, "loss": 0.0369, "num_input_tokens_seen": 1597696, "step": 1105 }, { "epoch": 3.9501779359430604, "grad_norm": 0.0021878909319639206, "learning_rate": 6.465793457534553e-07, "loss": 0.0, "num_input_tokens_seen": 1605248, "step": 1110 }, { "epoch": 3.9679715302491103, "grad_norm": 0.0022776706609874964, "learning_rate": 6.258733343265933e-07, "loss": 0.0002, "num_input_tokens_seen": 1613952, "step": 1115 }, { "epoch": 3.98576512455516, "grad_norm": 0.010679907165467739, "learning_rate": 6.054567498423683e-07, "loss": 0.0, "num_input_tokens_seen": 1620224, "step": 1120 }, { "epoch": 4.00355871886121, "grad_norm": 0.003633465152233839, "learning_rate": 5.853327452945115e-07, "loss": 0.0, "num_input_tokens_seen": 1625800, "step": 1125 }, { "epoch": 4.0213523131672595, "grad_norm": 0.0048196627758443356, "learning_rate": 5.655044284927658e-07, "loss": 0.0, "num_input_tokens_seen": 1633352, "step": 1130 }, { "epoch": 4.039145907473309, "grad_norm": 0.0030679679475724697, "learning_rate": 5.459748615829355e-07, "loss": 0.0, "num_input_tokens_seen": 1640840, "step": 1135 }, { "epoch": 4.04270462633452, "eval_loss": 0.2551669180393219, "eval_runtime": 1.6934, "eval_samples_per_second": 147.043, "eval_steps_per_second": 18.897, "num_input_tokens_seen": 1642696, "step": 1136 }, { "epoch": 4.056939501779359, "grad_norm": 0.00447492441162467, "learning_rate": 5.267470605739953e-07, "loss": 0.0, "num_input_tokens_seen": 1648520, "step": 1140 }, { "epoch": 4.074733096085409, "grad_norm": 0.0019011611584573984, "learning_rate": 5.078239948723154e-07, "loss": 0.0, "num_input_tokens_seen": 1655752, "step": 1145 }, { "epoch": 4.092526690391459, "grad_norm": 0.005783023778349161, "learning_rate": 4.892085868230881e-07, "loss": 0.0, "num_input_tokens_seen": 1662920, "step": 1150 }, { "epoch": 4.110320284697509, "grad_norm": 0.0052789985202252865, "learning_rate": 4.7090371125902175e-07, "loss": 0.0, "num_input_tokens_seen": 1669896, "step": 1155 }, { "epoch": 4.128113879003559, "grad_norm": 0.005814805161207914, "learning_rate": 4.529121950563717e-07, "loss": 0.0, "num_input_tokens_seen": 1675400, "step": 1160 }, { "epoch": 4.145907473309609, "grad_norm": 0.010795537382364273, "learning_rate": 4.352368166983753e-07, "loss": 0.0, "num_input_tokens_seen": 1682952, "step": 1165 }, { "epoch": 4.1637010676156585, "grad_norm": 0.013921362347900867, "learning_rate": 4.178803058461664e-07, "loss": 0.0, "num_input_tokens_seen": 1690248, "step": 1170 }, { "epoch": 4.181494661921708, "grad_norm": 0.00226654764264822, "learning_rate": 4.0084534291722375e-07, "loss": 0.0, "num_input_tokens_seen": 1696840, "step": 1175 }, { "epoch": 4.199288256227758, "grad_norm": 0.005677198059856892, "learning_rate": 3.8413455867142513e-07, "loss": 0.0, "num_input_tokens_seen": 1703624, "step": 1180 }, { "epoch": 4.217081850533808, "grad_norm": 0.004136247094720602, "learning_rate": 3.6775053380477296e-07, "loss": 0.0, "num_input_tokens_seen": 1710024, "step": 1185 }, { "epoch": 4.234875444839858, "grad_norm": 0.0022235463839024305, "learning_rate": 3.516957985508476e-07, "loss": 0.0, "num_input_tokens_seen": 1717768, "step": 1190 }, { "epoch": 4.252669039145908, "grad_norm": 0.0016525887185707688, "learning_rate": 3.3597283229005877e-07, "loss": 0.0, "num_input_tokens_seen": 1727240, "step": 1195 }, { "epoch": 4.270462633451958, "grad_norm": 0.0030577515717595816, "learning_rate": 3.2058406316674563e-07, "loss": 0.0, "num_input_tokens_seen": 1734408, "step": 1200 }, { "epoch": 4.288256227758007, "grad_norm": 0.004889196250587702, "learning_rate": 3.055318677141916e-07, "loss": 0.0, "num_input_tokens_seen": 1740936, "step": 1205 }, { "epoch": 4.295373665480427, "eval_loss": 0.257210373878479, "eval_runtime": 0.6184, "eval_samples_per_second": 402.654, "eval_steps_per_second": 51.747, "num_input_tokens_seen": 1743624, "step": 1207 }, { "epoch": 4.306049822064057, "grad_norm": 0.010472727008163929, "learning_rate": 2.9081857048761014e-07, "loss": 0.0, "num_input_tokens_seen": 1747784, "step": 1210 }, { "epoch": 4.3238434163701065, "grad_norm": 0.002632361836731434, "learning_rate": 2.764464437051537e-07, "loss": 0.0, "num_input_tokens_seen": 1754888, "step": 1215 }, { "epoch": 4.341637010676156, "grad_norm": 0.003168675350025296, "learning_rate": 2.624177068970124e-07, "loss": 0.0, "num_input_tokens_seen": 1762632, "step": 1220 }, { "epoch": 4.359430604982206, "grad_norm": 0.001775842742063105, "learning_rate": 2.4873452656264316e-07, "loss": 0.0, "num_input_tokens_seen": 1769928, "step": 1225 }, { "epoch": 4.377224199288256, "grad_norm": 0.0023636040277779102, "learning_rate": 2.3539901583619186e-07, "loss": 0.0, "num_input_tokens_seen": 1777480, "step": 1230 }, { "epoch": 4.395017793594306, "grad_norm": 0.003756535705178976, "learning_rate": 2.2241323416015452e-07, "loss": 0.0, "num_input_tokens_seen": 1784840, "step": 1235 }, { "epoch": 4.412811387900356, "grad_norm": 0.0014317089226096869, "learning_rate": 2.0977918696733103e-07, "loss": 0.0, "num_input_tokens_seen": 1792584, "step": 1240 }, { "epoch": 4.430604982206406, "grad_norm": 0.0015214212471619248, "learning_rate": 1.9749882537112297e-07, "loss": 0.0, "num_input_tokens_seen": 1800968, "step": 1245 }, { "epoch": 4.448398576512456, "grad_norm": 0.003403919516131282, "learning_rate": 1.8557404586421413e-07, "loss": 0.0, "num_input_tokens_seen": 1808456, "step": 1250 }, { "epoch": 4.4661921708185055, "grad_norm": 0.013262175023555756, "learning_rate": 1.7400669002569233e-07, "loss": 0.0, "num_input_tokens_seen": 1816136, "step": 1255 }, { "epoch": 4.483985765124555, "grad_norm": 0.001888920902274549, "learning_rate": 1.62798544236647e-07, "loss": 0.0, "num_input_tokens_seen": 1824136, "step": 1260 }, { "epoch": 4.501779359430605, "grad_norm": 0.00637847138568759, "learning_rate": 1.5195133940429345e-07, "loss": 0.0, "num_input_tokens_seen": 1831304, "step": 1265 }, { "epoch": 4.519572953736655, "grad_norm": 0.0010983651736751199, "learning_rate": 1.4146675069466403e-07, "loss": 0.0, "num_input_tokens_seen": 1837512, "step": 1270 }, { "epoch": 4.537366548042705, "grad_norm": 0.0015767315635457635, "learning_rate": 1.313463972739068e-07, "loss": 0.0, "num_input_tokens_seen": 1844296, "step": 1275 }, { "epoch": 4.548042704626335, "eval_loss": 0.259037584066391, "eval_runtime": 0.6197, "eval_samples_per_second": 401.818, "eval_steps_per_second": 51.639, "num_input_tokens_seen": 1849416, "step": 1278 }, { "epoch": 4.555160142348754, "grad_norm": 0.003252014284953475, "learning_rate": 1.215918420582343e-07, "loss": 0.0, "num_input_tokens_seen": 1851720, "step": 1280 }, { "epoch": 4.572953736654805, "grad_norm": 0.0038456034380942583, "learning_rate": 1.1220459147255642e-07, "loss": 0.0, "num_input_tokens_seen": 1858120, "step": 1285 }, { "epoch": 4.590747330960854, "grad_norm": 0.002327981637790799, "learning_rate": 1.0318609521783818e-07, "loss": 0.0, "num_input_tokens_seen": 1865928, "step": 1290 }, { "epoch": 4.608540925266904, "grad_norm": 0.002249909332022071, "learning_rate": 9.453774604721937e-08, "loss": 0.0, "num_input_tokens_seen": 1873800, "step": 1295 }, { "epoch": 4.6263345195729535, "grad_norm": 0.00320955878123641, "learning_rate": 8.62608795509276e-08, "loss": 0.0, "num_input_tokens_seen": 1881800, "step": 1300 }, { "epoch": 4.644128113879003, "grad_norm": 0.010727422311902046, "learning_rate": 7.835677395001795e-08, "loss": 0.0, "num_input_tokens_seen": 1888648, "step": 1305 }, { "epoch": 4.661921708185053, "grad_norm": 0.0015563094057142735, "learning_rate": 7.082664989897486e-08, "loss": 0.0, "num_input_tokens_seen": 1895432, "step": 1310 }, { "epoch": 4.679715302491103, "grad_norm": 0.0029733225237578154, "learning_rate": 6.367167029720234e-08, "loss": 0.0, "num_input_tokens_seen": 1902408, "step": 1315 }, { "epoch": 4.697508896797153, "grad_norm": 0.0011107242899015546, "learning_rate": 5.68929401094323e-08, "loss": 0.0277, "num_input_tokens_seen": 1910344, "step": 1320 }, { "epoch": 4.715302491103203, "grad_norm": 0.0024087605997920036, "learning_rate": 5.049150619508503e-08, "loss": 0.0, "num_input_tokens_seen": 1918472, "step": 1325 }, { "epoch": 4.733096085409253, "grad_norm": 0.002195857698097825, "learning_rate": 4.446835714659647e-08, "loss": 0.0, "num_input_tokens_seen": 1924744, "step": 1330 }, { "epoch": 4.750889679715303, "grad_norm": 0.0017717446899041533, "learning_rate": 3.882442313674878e-08, "loss": 0.0, "num_input_tokens_seen": 1932872, "step": 1335 }, { "epoch": 4.7686832740213525, "grad_norm": 0.0018043630989268422, "learning_rate": 3.3560575775019866e-08, "loss": 0.0, "num_input_tokens_seen": 1940040, "step": 1340 }, { "epoch": 4.786476868327402, "grad_norm": 0.0031781333964318037, "learning_rate": 2.8677627972978905e-08, "loss": 0.0, "num_input_tokens_seen": 1948936, "step": 1345 }, { "epoch": 4.800711743772242, "eval_loss": 0.2602100372314453, "eval_runtime": 0.6213, "eval_samples_per_second": 400.766, "eval_steps_per_second": 51.504, "num_input_tokens_seen": 1954568, "step": 1349 }, { "epoch": 4.804270462633452, "grad_norm": 0.001851449953392148, "learning_rate": 2.4176333818745347e-08, "loss": 0.0, "num_input_tokens_seen": 1955912, "step": 1350 }, { "epoch": 4.822064056939502, "grad_norm": 0.003628035541623831, "learning_rate": 2.0057388460533733e-08, "loss": 0.0, "num_input_tokens_seen": 1962760, "step": 1355 }, { "epoch": 4.839857651245552, "grad_norm": 0.0011652238899841905, "learning_rate": 1.6321427999298754e-08, "loss": 0.0, "num_input_tokens_seen": 1969160, "step": 1360 }, { "epoch": 4.857651245551601, "grad_norm": 0.003991110250353813, "learning_rate": 1.2969029390501597e-08, "loss": 0.0, "num_input_tokens_seen": 1975752, "step": 1365 }, { "epoch": 4.875444839857651, "grad_norm": 0.004909892100840807, "learning_rate": 1.000071035500816e-08, "loss": 0.0, "num_input_tokens_seen": 1983240, "step": 1370 }, { "epoch": 4.893238434163701, "grad_norm": 0.00255265599116683, "learning_rate": 7.416929299135511e-09, "loss": 0.0, "num_input_tokens_seen": 1990792, "step": 1375 }, { "epoch": 4.911032028469751, "grad_norm": 0.0013633174821734428, "learning_rate": 5.218085243859639e-09, "loss": 0.0, "num_input_tokens_seen": 1998728, "step": 1380 }, { "epoch": 4.9288256227758005, "grad_norm": 0.005876463372260332, "learning_rate": 3.4045177631936154e-09, "loss": 0.0, "num_input_tokens_seen": 2006920, "step": 1385 }, { "epoch": 4.94661921708185, "grad_norm": 0.0021194759756326675, "learning_rate": 1.976506931745392e-09, "loss": 0.0, "num_input_tokens_seen": 2013128, "step": 1390 }, { "epoch": 4.9644128113879, "grad_norm": 0.0024136609863489866, "learning_rate": 9.3427328146517e-10, "loss": 0.0, "num_input_tokens_seen": 2021704, "step": 1395 }, { "epoch": 4.98220640569395, "grad_norm": 0.16585078835487366, "learning_rate": 2.7797776758903274e-10, "loss": 0.0, "num_input_tokens_seen": 2028872, "step": 1400 }, { "epoch": 5.0, "grad_norm": 0.002416080329567194, "learning_rate": 7.72174378022017e-12, "loss": 0.0, "num_input_tokens_seen": 2035272, "step": 1405 }, { "epoch": 5.0, "num_input_tokens_seen": 2035272, "step": 1405, "total_flos": 1.1883702201974784e+16, "train_loss": 0.05568007128206763, "train_runtime": 1085.6649, "train_samples_per_second": 10.321, "train_steps_per_second": 1.294 } ], "logging_steps": 5, "max_steps": 1405, "num_input_tokens_seen": 2035272, "num_train_epochs": 5, "save_steps": 71, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.1883702201974784e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }