{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 200, "global_step": 936, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0010683760683760685, "grad_norm": 4.745241496830083e+18, "learning_rate": 0.0, "loss": 1.5719, "step": 1 }, { "epoch": 0.005341880341880342, "grad_norm": 3565.890869140625, "learning_rate": 8.510638297872341e-07, "loss": 1.5783, "step": 5 }, { "epoch": 0.010683760683760684, "grad_norm": 37.561458587646484, "learning_rate": 1.9148936170212767e-06, "loss": 1.5379, "step": 10 }, { "epoch": 0.016025641025641024, "grad_norm": 5.307524681091309, "learning_rate": 2.978723404255319e-06, "loss": 1.439, "step": 15 }, { "epoch": 0.021367521367521368, "grad_norm": 2.2638120651245117, "learning_rate": 4.042553191489362e-06, "loss": 1.3521, "step": 20 }, { "epoch": 0.026709401709401708, "grad_norm": 3.3778412342071533, "learning_rate": 5.106382978723404e-06, "loss": 1.2871, "step": 25 }, { "epoch": 0.03205128205128205, "grad_norm": 1.7132008075714111, "learning_rate": 6.170212765957447e-06, "loss": 1.2607, "step": 30 }, { "epoch": 0.03739316239316239, "grad_norm": 1.8523601293563843, "learning_rate": 7.234042553191491e-06, "loss": 1.2357, "step": 35 }, { "epoch": 0.042735042735042736, "grad_norm": 1.5859768390655518, "learning_rate": 8.297872340425532e-06, "loss": 1.2121, "step": 40 }, { "epoch": 0.04807692307692308, "grad_norm": 1.557377815246582, "learning_rate": 9.361702127659576e-06, "loss": 1.1803, "step": 45 }, { "epoch": 0.053418803418803416, "grad_norm": 1.4963977336883545, "learning_rate": 1.0425531914893619e-05, "loss": 1.1915, "step": 50 }, { "epoch": 0.05876068376068376, "grad_norm": 1.3628768920898438, "learning_rate": 1.1489361702127662e-05, "loss": 1.1701, "step": 55 }, { "epoch": 0.0641025641025641, "grad_norm": 2.2901501655578613, "learning_rate": 1.2553191489361702e-05, "loss": 1.1598, "step": 60 }, { "epoch": 0.06944444444444445, "grad_norm": 1.3150509595870972, "learning_rate": 1.3617021276595745e-05, "loss": 1.1611, "step": 65 }, { "epoch": 0.07478632478632478, "grad_norm": 1.1793471574783325, "learning_rate": 1.4680851063829789e-05, "loss": 1.1656, "step": 70 }, { "epoch": 0.08012820512820513, "grad_norm": 1.3063637018203735, "learning_rate": 1.5744680851063832e-05, "loss": 1.1599, "step": 75 }, { "epoch": 0.08547008547008547, "grad_norm": 1.6266546249389648, "learning_rate": 1.6808510638297873e-05, "loss": 1.1404, "step": 80 }, { "epoch": 0.09081196581196581, "grad_norm": 1.2766536474227905, "learning_rate": 1.7872340425531915e-05, "loss": 1.1515, "step": 85 }, { "epoch": 0.09615384615384616, "grad_norm": 1.1077594757080078, "learning_rate": 1.893617021276596e-05, "loss": 1.145, "step": 90 }, { "epoch": 0.1014957264957265, "grad_norm": 1.000414252281189, "learning_rate": 2e-05, "loss": 1.148, "step": 95 }, { "epoch": 0.10683760683760683, "grad_norm": 1.1821677684783936, "learning_rate": 1.9998259904917257e-05, "loss": 1.1424, "step": 100 }, { "epoch": 0.11217948717948718, "grad_norm": 1.3549473285675049, "learning_rate": 1.9993040225255205e-05, "loss": 1.1624, "step": 105 }, { "epoch": 0.11752136752136752, "grad_norm": 0.9064257144927979, "learning_rate": 1.998434277756163e-05, "loss": 1.1321, "step": 110 }, { "epoch": 0.12286324786324786, "grad_norm": 1.0382879972457886, "learning_rate": 1.9972170588713715e-05, "loss": 1.1526, "step": 115 }, { "epoch": 0.1282051282051282, "grad_norm": 1.089872121810913, "learning_rate": 1.9956527894864662e-05, "loss": 1.1479, "step": 120 }, { "epoch": 0.13354700854700854, "grad_norm": 1.2133479118347168, "learning_rate": 1.9937420139969397e-05, "loss": 1.1415, "step": 125 }, { "epoch": 0.1388888888888889, "grad_norm": 0.9869184494018555, "learning_rate": 1.9914853973889988e-05, "loss": 1.1373, "step": 130 }, { "epoch": 0.14423076923076922, "grad_norm": 1.0042775869369507, "learning_rate": 1.988883725008136e-05, "loss": 1.1368, "step": 135 }, { "epoch": 0.14957264957264957, "grad_norm": 1.358765959739685, "learning_rate": 1.985937902285815e-05, "loss": 1.1299, "step": 140 }, { "epoch": 0.15491452991452992, "grad_norm": 1.0506223440170288, "learning_rate": 1.9826489544243623e-05, "loss": 1.1424, "step": 145 }, { "epoch": 0.16025641025641027, "grad_norm": 0.8935067057609558, "learning_rate": 1.9790180260401778e-05, "loss": 1.1409, "step": 150 }, { "epoch": 0.1655982905982906, "grad_norm": 0.910125195980072, "learning_rate": 1.9750463807653873e-05, "loss": 1.1436, "step": 155 }, { "epoch": 0.17094017094017094, "grad_norm": 0.9480465650558472, "learning_rate": 1.9707354008080736e-05, "loss": 1.1289, "step": 160 }, { "epoch": 0.1762820512820513, "grad_norm": 1.0701904296875, "learning_rate": 1.9660865864712413e-05, "loss": 1.1308, "step": 165 }, { "epoch": 0.18162393162393162, "grad_norm": 0.8851361870765686, "learning_rate": 1.9611015556306845e-05, "loss": 1.1171, "step": 170 }, { "epoch": 0.18696581196581197, "grad_norm": 0.9653987884521484, "learning_rate": 1.9557820431719333e-05, "loss": 1.128, "step": 175 }, { "epoch": 0.19230769230769232, "grad_norm": 0.8165105581283569, "learning_rate": 1.9501299003864828e-05, "loss": 1.1329, "step": 180 }, { "epoch": 0.19764957264957264, "grad_norm": 0.7411690354347229, "learning_rate": 1.944147094327506e-05, "loss": 1.1363, "step": 185 }, { "epoch": 0.202991452991453, "grad_norm": 0.8322554230690002, "learning_rate": 1.937835707125284e-05, "loss": 1.1302, "step": 190 }, { "epoch": 0.20833333333333334, "grad_norm": 0.7390133738517761, "learning_rate": 1.9311979352625837e-05, "loss": 1.1375, "step": 195 }, { "epoch": 0.21367521367521367, "grad_norm": 0.7468808889389038, "learning_rate": 1.924236088810241e-05, "loss": 1.1453, "step": 200 }, { "epoch": 0.21367521367521367, "eval_loss": 1.1343013048171997, "eval_runtime": 121.3407, "eval_samples_per_second": 109.164, "eval_steps_per_second": 0.857, "step": 200 }, { "epoch": 0.21901709401709402, "grad_norm": 0.8430936932563782, "learning_rate": 1.916952590623212e-05, "loss": 1.1097, "step": 205 }, { "epoch": 0.22435897435897437, "grad_norm": 0.8264766335487366, "learning_rate": 1.909349975497372e-05, "loss": 1.1243, "step": 210 }, { "epoch": 0.2297008547008547, "grad_norm": 0.7218984961509705, "learning_rate": 1.9014308892873612e-05, "loss": 1.1454, "step": 215 }, { "epoch": 0.23504273504273504, "grad_norm": 0.950661301612854, "learning_rate": 1.8931980879857737e-05, "loss": 1.113, "step": 220 }, { "epoch": 0.2403846153846154, "grad_norm": 0.8158000707626343, "learning_rate": 1.8846544367640218e-05, "loss": 1.125, "step": 225 }, { "epoch": 0.24572649572649571, "grad_norm": 0.8023744225502014, "learning_rate": 1.8758029089752023e-05, "loss": 1.1271, "step": 230 }, { "epoch": 0.25106837606837606, "grad_norm": 0.7522373795509338, "learning_rate": 1.86664658511931e-05, "loss": 1.1027, "step": 235 }, { "epoch": 0.2564102564102564, "grad_norm": 0.7865638136863708, "learning_rate": 1.85718865177117e-05, "loss": 1.1091, "step": 240 }, { "epoch": 0.26175213675213677, "grad_norm": 0.7381576895713806, "learning_rate": 1.847432400471443e-05, "loss": 1.1202, "step": 245 }, { "epoch": 0.2670940170940171, "grad_norm": 0.8119399547576904, "learning_rate": 1.8373812265811126e-05, "loss": 1.1246, "step": 250 }, { "epoch": 0.2724358974358974, "grad_norm": 0.8148014545440674, "learning_rate": 1.827038628099831e-05, "loss": 1.1305, "step": 255 }, { "epoch": 0.2777777777777778, "grad_norm": 0.7722187042236328, "learning_rate": 1.81640820444855e-05, "loss": 1.1103, "step": 260 }, { "epoch": 0.2831196581196581, "grad_norm": 0.7955113053321838, "learning_rate": 1.8054936552168548e-05, "loss": 1.1134, "step": 265 }, { "epoch": 0.28846153846153844, "grad_norm": 0.8004475235939026, "learning_rate": 1.7942987788754348e-05, "loss": 1.1135, "step": 270 }, { "epoch": 0.2938034188034188, "grad_norm": 0.786965012550354, "learning_rate": 1.7828274714541445e-05, "loss": 1.1247, "step": 275 }, { "epoch": 0.29914529914529914, "grad_norm": 0.7405045032501221, "learning_rate": 1.771083725186111e-05, "loss": 1.1107, "step": 280 }, { "epoch": 0.30448717948717946, "grad_norm": 0.7558034062385559, "learning_rate": 1.759071627118362e-05, "loss": 1.1081, "step": 285 }, { "epoch": 0.30982905982905984, "grad_norm": 0.7463499903678894, "learning_rate": 1.746795357689453e-05, "loss": 1.1193, "step": 290 }, { "epoch": 0.31517094017094016, "grad_norm": 0.7769708633422852, "learning_rate": 1.7342591892745978e-05, "loss": 1.1026, "step": 295 }, { "epoch": 0.32051282051282054, "grad_norm": 0.7373056411743164, "learning_rate": 1.7214674846987992e-05, "loss": 1.1107, "step": 300 }, { "epoch": 0.32585470085470086, "grad_norm": 0.7885181307792664, "learning_rate": 1.7084246957185036e-05, "loss": 1.1282, "step": 305 }, { "epoch": 0.3311965811965812, "grad_norm": 0.7831401824951172, "learning_rate": 1.695135361472305e-05, "loss": 1.1166, "step": 310 }, { "epoch": 0.33653846153846156, "grad_norm": 0.7612439393997192, "learning_rate": 1.681604106901239e-05, "loss": 1.1096, "step": 315 }, { "epoch": 0.3418803418803419, "grad_norm": 0.8480131030082703, "learning_rate": 1.6678356411392135e-05, "loss": 1.1128, "step": 320 }, { "epoch": 0.3472222222222222, "grad_norm": 0.7431190609931946, "learning_rate": 1.6538347558741424e-05, "loss": 1.1052, "step": 325 }, { "epoch": 0.3525641025641026, "grad_norm": 0.8087280988693237, "learning_rate": 1.6396063236803465e-05, "loss": 1.1119, "step": 330 }, { "epoch": 0.3579059829059829, "grad_norm": 0.7224681973457336, "learning_rate": 1.625155296322805e-05, "loss": 1.1182, "step": 335 }, { "epoch": 0.36324786324786323, "grad_norm": 0.8140762448310852, "learning_rate": 1.610486703033847e-05, "loss": 1.093, "step": 340 }, { "epoch": 0.3685897435897436, "grad_norm": 0.7991831302642822, "learning_rate": 1.5956056487628832e-05, "loss": 1.1214, "step": 345 }, { "epoch": 0.37393162393162394, "grad_norm": 0.7950995564460754, "learning_rate": 1.5805173123997856e-05, "loss": 1.1171, "step": 350 }, { "epoch": 0.37927350427350426, "grad_norm": 0.7558692097663879, "learning_rate": 1.5652269449725375e-05, "loss": 1.0817, "step": 355 }, { "epoch": 0.38461538461538464, "grad_norm": 0.8574461936950684, "learning_rate": 1.549739867819773e-05, "loss": 1.1228, "step": 360 }, { "epoch": 0.38995726495726496, "grad_norm": 0.7260861992835999, "learning_rate": 1.534061470738852e-05, "loss": 1.0933, "step": 365 }, { "epoch": 0.3952991452991453, "grad_norm": 0.7083914875984192, "learning_rate": 1.5181972101101083e-05, "loss": 1.0952, "step": 370 }, { "epoch": 0.40064102564102566, "grad_norm": 0.6901055574417114, "learning_rate": 1.5021526069979232e-05, "loss": 1.1023, "step": 375 }, { "epoch": 0.405982905982906, "grad_norm": 0.8109995722770691, "learning_rate": 1.4859332452292937e-05, "loss": 1.1167, "step": 380 }, { "epoch": 0.4113247863247863, "grad_norm": 0.7325941324234009, "learning_rate": 1.4695447694505512e-05, "loss": 1.11, "step": 385 }, { "epoch": 0.4166666666666667, "grad_norm": 0.7975173592567444, "learning_rate": 1.4529928831629185e-05, "loss": 1.0997, "step": 390 }, { "epoch": 0.422008547008547, "grad_norm": 0.7777805328369141, "learning_rate": 1.4362833467375839e-05, "loss": 1.1172, "step": 395 }, { "epoch": 0.42735042735042733, "grad_norm": 0.7026362419128418, "learning_rate": 1.4194219754109812e-05, "loss": 1.0929, "step": 400 }, { "epoch": 0.42735042735042733, "eval_loss": 1.1095771789550781, "eval_runtime": 121.5104, "eval_samples_per_second": 109.011, "eval_steps_per_second": 0.856, "step": 400 }, { "epoch": 0.4326923076923077, "grad_norm": 0.690746545791626, "learning_rate": 1.402414637260977e-05, "loss": 1.1014, "step": 405 }, { "epoch": 0.43803418803418803, "grad_norm": 0.7299688458442688, "learning_rate": 1.3852672511646683e-05, "loss": 1.1059, "step": 410 }, { "epoch": 0.44337606837606836, "grad_norm": 0.7541385889053345, "learning_rate": 1.367985784738501e-05, "loss": 1.0929, "step": 415 }, { "epoch": 0.44871794871794873, "grad_norm": 0.7011000514030457, "learning_rate": 1.350576252261425e-05, "loss": 1.069, "step": 420 }, { "epoch": 0.45405982905982906, "grad_norm": 0.7681401371955872, "learning_rate": 1.3330447125818115e-05, "loss": 1.1181, "step": 425 }, { "epoch": 0.4594017094017094, "grad_norm": 0.7124375104904175, "learning_rate": 1.3153972670088584e-05, "loss": 1.084, "step": 430 }, { "epoch": 0.46474358974358976, "grad_norm": 0.7539063096046448, "learning_rate": 1.2976400571892189e-05, "loss": 1.0933, "step": 435 }, { "epoch": 0.4700854700854701, "grad_norm": 0.7165161967277527, "learning_rate": 1.2797792629695909e-05, "loss": 1.1012, "step": 440 }, { "epoch": 0.4754273504273504, "grad_norm": 0.714011549949646, "learning_rate": 1.2618211002460135e-05, "loss": 1.093, "step": 445 }, { "epoch": 0.4807692307692308, "grad_norm": 0.7356306910514832, "learning_rate": 1.2437718188006165e-05, "loss": 1.0975, "step": 450 }, { "epoch": 0.4861111111111111, "grad_norm": 0.9547709822654724, "learning_rate": 1.2256377001265785e-05, "loss": 1.0849, "step": 455 }, { "epoch": 0.49145299145299143, "grad_norm": 0.7575970888137817, "learning_rate": 1.2074250552420459e-05, "loss": 1.1048, "step": 460 }, { "epoch": 0.4967948717948718, "grad_norm": 0.7571170926094055, "learning_rate": 1.1891402224937805e-05, "loss": 1.0833, "step": 465 }, { "epoch": 0.5021367521367521, "grad_norm": 0.7408652901649475, "learning_rate": 1.170789565351293e-05, "loss": 1.091, "step": 470 }, { "epoch": 0.5074786324786325, "grad_norm": 0.7377583384513855, "learning_rate": 1.1523794701922351e-05, "loss": 1.0995, "step": 475 }, { "epoch": 0.5128205128205128, "grad_norm": 0.7226018309593201, "learning_rate": 1.1339163440798187e-05, "loss": 1.0883, "step": 480 }, { "epoch": 0.5181623931623932, "grad_norm": 0.6740500330924988, "learning_rate": 1.1154066125330358e-05, "loss": 1.0853, "step": 485 }, { "epoch": 0.5235042735042735, "grad_norm": 0.7039321660995483, "learning_rate": 1.0968567172904558e-05, "loss": 1.0793, "step": 490 }, { "epoch": 0.5288461538461539, "grad_norm": 0.6822574734687805, "learning_rate": 1.0782731140683786e-05, "loss": 1.084, "step": 495 }, { "epoch": 0.5341880341880342, "grad_norm": 0.6553980112075806, "learning_rate": 1.0596622703141209e-05, "loss": 1.0718, "step": 500 }, { "epoch": 0.5395299145299145, "grad_norm": 0.70383620262146, "learning_rate": 1.0410306629552231e-05, "loss": 1.064, "step": 505 }, { "epoch": 0.5448717948717948, "grad_norm": 0.6900469064712524, "learning_rate": 1.0223847761453558e-05, "loss": 1.0768, "step": 510 }, { "epoch": 0.5502136752136753, "grad_norm": 0.689424455165863, "learning_rate": 1.0037310990077083e-05, "loss": 1.0839, "step": 515 }, { "epoch": 0.5555555555555556, "grad_norm": 0.6523409485816956, "learning_rate": 9.850761233766537e-06, "loss": 1.103, "step": 520 }, { "epoch": 0.5608974358974359, "grad_norm": 0.718777596950531, "learning_rate": 9.664263415384644e-06, "loss": 1.0881, "step": 525 }, { "epoch": 0.5662393162393162, "grad_norm": 0.7004057168960571, "learning_rate": 9.47788243971875e-06, "loss": 1.0919, "step": 530 }, { "epoch": 0.5715811965811965, "grad_norm": 0.6746854186058044, "learning_rate": 9.291683170892712e-06, "loss": 1.0911, "step": 535 }, { "epoch": 0.5769230769230769, "grad_norm": 0.716135561466217, "learning_rate": 9.10573040979294e-06, "loss": 1.0774, "step": 540 }, { "epoch": 0.5822649572649573, "grad_norm": 0.6947251558303833, "learning_rate": 8.920088871516482e-06, "loss": 1.0855, "step": 545 }, { "epoch": 0.5876068376068376, "grad_norm": 0.6928468346595764, "learning_rate": 8.734823162848919e-06, "loss": 1.0743, "step": 550 }, { "epoch": 0.592948717948718, "grad_norm": 0.6887853145599365, "learning_rate": 8.549997759779981e-06, "loss": 1.0879, "step": 555 }, { "epoch": 0.5982905982905983, "grad_norm": 0.6782488822937012, "learning_rate": 8.365676985064684e-06, "loss": 1.078, "step": 560 }, { "epoch": 0.6036324786324786, "grad_norm": 0.6661266088485718, "learning_rate": 8.181924985837762e-06, "loss": 1.0582, "step": 565 }, { "epoch": 0.6089743589743589, "grad_norm": 0.7845523953437805, "learning_rate": 7.998805711289281e-06, "loss": 1.0851, "step": 570 }, { "epoch": 0.6143162393162394, "grad_norm": 0.6643468141555786, "learning_rate": 7.81638289040908e-06, "loss": 1.0725, "step": 575 }, { "epoch": 0.6196581196581197, "grad_norm": 0.6565442085266113, "learning_rate": 7.634720009807879e-06, "loss": 1.0822, "step": 580 }, { "epoch": 0.625, "grad_norm": 0.6695041656494141, "learning_rate": 7.453880291622726e-06, "loss": 1.0824, "step": 585 }, { "epoch": 0.6303418803418803, "grad_norm": 0.6693868041038513, "learning_rate": 7.273926671514503e-06, "loss": 1.0872, "step": 590 }, { "epoch": 0.6356837606837606, "grad_norm": 0.6269449591636658, "learning_rate": 7.094921776765095e-06, "loss": 1.077, "step": 595 }, { "epoch": 0.6410256410256411, "grad_norm": 0.6727346181869507, "learning_rate": 6.916927904481934e-06, "loss": 1.0808, "step": 600 }, { "epoch": 0.6410256410256411, "eval_loss": 1.0848172903060913, "eval_runtime": 121.4332, "eval_samples_per_second": 109.081, "eval_steps_per_second": 0.856, "step": 600 }, { "epoch": 0.6463675213675214, "grad_norm": 0.6319820880889893, "learning_rate": 6.740006999917406e-06, "loss": 1.0898, "step": 605 }, { "epoch": 0.6517094017094017, "grad_norm": 0.6709337830543518, "learning_rate": 6.56422063491072e-06, "loss": 1.0778, "step": 610 }, { "epoch": 0.657051282051282, "grad_norm": 0.6739898920059204, "learning_rate": 6.389629986459756e-06, "loss": 1.0536, "step": 615 }, { "epoch": 0.6623931623931624, "grad_norm": 0.6539934873580933, "learning_rate": 6.216295815430277e-06, "loss": 1.0579, "step": 620 }, { "epoch": 0.6677350427350427, "grad_norm": 0.6796319484710693, "learning_rate": 6.044278445410025e-06, "loss": 1.0788, "step": 625 }, { "epoch": 0.6730769230769231, "grad_norm": 0.6338521242141724, "learning_rate": 5.873637741714941e-06, "loss": 1.0553, "step": 630 }, { "epoch": 0.6784188034188035, "grad_norm": 0.6625267267227173, "learning_rate": 5.704433090554912e-06, "loss": 1.0889, "step": 635 }, { "epoch": 0.6837606837606838, "grad_norm": 0.6456872820854187, "learning_rate": 5.536723378366226e-06, "loss": 1.0694, "step": 640 }, { "epoch": 0.6891025641025641, "grad_norm": 0.6204415559768677, "learning_rate": 5.37056697131799e-06, "loss": 1.0883, "step": 645 }, { "epoch": 0.6944444444444444, "grad_norm": 0.6364960670471191, "learning_rate": 5.206021694999571e-06, "loss": 1.0727, "step": 650 }, { "epoch": 0.6997863247863247, "grad_norm": 0.6018571853637695, "learning_rate": 5.043144814296214e-06, "loss": 1.0509, "step": 655 }, { "epoch": 0.7051282051282052, "grad_norm": 0.648232102394104, "learning_rate": 4.881993013459762e-06, "loss": 1.0766, "step": 660 }, { "epoch": 0.7104700854700855, "grad_norm": 0.6507673263549805, "learning_rate": 4.722622376381455e-06, "loss": 1.0764, "step": 665 }, { "epoch": 0.7158119658119658, "grad_norm": 0.62617427110672, "learning_rate": 4.565088367073675e-06, "loss": 1.0583, "step": 670 }, { "epoch": 0.7211538461538461, "grad_norm": 0.642150342464447, "learning_rate": 4.409445810367421e-06, "loss": 1.0596, "step": 675 }, { "epoch": 0.7264957264957265, "grad_norm": 0.651789665222168, "learning_rate": 4.255748872832201e-06, "loss": 1.064, "step": 680 }, { "epoch": 0.7318376068376068, "grad_norm": 0.6348583698272705, "learning_rate": 4.104051043925068e-06, "loss": 1.085, "step": 685 }, { "epoch": 0.7371794871794872, "grad_norm": 0.6330149173736572, "learning_rate": 3.9544051173752504e-06, "loss": 1.0775, "step": 690 }, { "epoch": 0.7425213675213675, "grad_norm": 0.6291777491569519, "learning_rate": 3.8068631728109364e-06, "loss": 1.081, "step": 695 }, { "epoch": 0.7478632478632479, "grad_norm": 0.6168299913406372, "learning_rate": 3.6614765576345755e-06, "loss": 1.0543, "step": 700 }, { "epoch": 0.7532051282051282, "grad_norm": 0.6146724820137024, "learning_rate": 3.5182958691529945e-06, "loss": 1.0612, "step": 705 }, { "epoch": 0.7585470085470085, "grad_norm": 0.6585379838943481, "learning_rate": 3.3773709369685924e-06, "loss": 1.0629, "step": 710 }, { "epoch": 0.7638888888888888, "grad_norm": 0.6729740500450134, "learning_rate": 3.2387508056376726e-06, "loss": 1.0638, "step": 715 }, { "epoch": 0.7692307692307693, "grad_norm": 0.6358993053436279, "learning_rate": 3.1024837176020173e-06, "loss": 1.0638, "step": 720 }, { "epoch": 0.7745726495726496, "grad_norm": 0.6569262146949768, "learning_rate": 2.968617096399592e-06, "loss": 1.0589, "step": 725 }, { "epoch": 0.7799145299145299, "grad_norm": 0.608600378036499, "learning_rate": 2.8371975301602572e-06, "loss": 1.0911, "step": 730 }, { "epoch": 0.7852564102564102, "grad_norm": 0.6112855076789856, "learning_rate": 2.708270755392207e-06, "loss": 1.0705, "step": 735 }, { "epoch": 0.7905982905982906, "grad_norm": 0.6621512174606323, "learning_rate": 2.581881641064806e-06, "loss": 1.0614, "step": 740 }, { "epoch": 0.7959401709401709, "grad_norm": 0.6552026271820068, "learning_rate": 2.4580741729933246e-06, "loss": 1.0683, "step": 745 }, { "epoch": 0.8012820512820513, "grad_norm": 0.6412573456764221, "learning_rate": 2.3368914385310415e-06, "loss": 1.0498, "step": 750 }, { "epoch": 0.8066239316239316, "grad_norm": 0.6315770745277405, "learning_rate": 2.2183756115740274e-06, "loss": 1.0629, "step": 755 }, { "epoch": 0.811965811965812, "grad_norm": 0.6639691591262817, "learning_rate": 2.1025679378838247e-06, "loss": 1.0596, "step": 760 }, { "epoch": 0.8173076923076923, "grad_norm": 0.6195425391197205, "learning_rate": 1.9895087207331422e-06, "loss": 1.0579, "step": 765 }, { "epoch": 0.8226495726495726, "grad_norm": 0.630287766456604, "learning_rate": 1.8792373068795422e-06, "loss": 1.0606, "step": 770 }, { "epoch": 0.8279914529914529, "grad_norm": 0.6329755187034607, "learning_rate": 1.7717920728720284e-06, "loss": 1.055, "step": 775 }, { "epoch": 0.8333333333333334, "grad_norm": 0.6025441884994507, "learning_rate": 1.6672104116952748e-06, "loss": 1.0719, "step": 780 }, { "epoch": 0.8386752136752137, "grad_norm": 0.6310837268829346, "learning_rate": 1.5655287197561497e-06, "loss": 1.0692, "step": 785 }, { "epoch": 0.844017094017094, "grad_norm": 0.6214393377304077, "learning_rate": 1.4667823842170837e-06, "loss": 1.0737, "step": 790 }, { "epoch": 0.8493589743589743, "grad_norm": 0.653542160987854, "learning_rate": 1.371005770680659e-06, "loss": 1.0525, "step": 795 }, { "epoch": 0.8547008547008547, "grad_norm": 0.6011979579925537, "learning_rate": 1.2782322112297274e-06, "loss": 1.0529, "step": 800 }, { "epoch": 0.8547008547008547, "eval_loss": 1.0704631805419922, "eval_runtime": 121.3074, "eval_samples_per_second": 109.194, "eval_steps_per_second": 0.857, "step": 800 }, { "epoch": 0.8600427350427351, "grad_norm": 0.6574413776397705, "learning_rate": 1.188493992827211e-06, "loss": 1.0474, "step": 805 }, { "epoch": 0.8653846153846154, "grad_norm": 0.5898586511611938, "learning_rate": 1.101822346079625e-06, "loss": 1.0446, "step": 810 }, { "epoch": 0.8707264957264957, "grad_norm": 0.6238599419593811, "learning_rate": 1.0182474343682346e-06, "loss": 1.0539, "step": 815 }, { "epoch": 0.8760683760683761, "grad_norm": 0.6406736373901367, "learning_rate": 9.377983433516181e-07, "loss": 1.0657, "step": 820 }, { "epoch": 0.8814102564102564, "grad_norm": 0.6025185585021973, "learning_rate": 8.605030708433149e-07, "loss": 1.0585, "step": 825 }, { "epoch": 0.8867521367521367, "grad_norm": 0.6066309213638306, "learning_rate": 7.863885170680486e-07, "loss": 1.0596, "step": 830 }, { "epoch": 0.8920940170940171, "grad_norm": 0.6598129272460938, "learning_rate": 7.154804752999344e-07, "loss": 1.0407, "step": 835 }, { "epoch": 0.8974358974358975, "grad_norm": 0.6132450699806213, "learning_rate": 6.478036228859363e-07, "loss": 1.0487, "step": 840 }, { "epoch": 0.9027777777777778, "grad_norm": 0.6001634001731873, "learning_rate": 5.833815126576714e-07, "loss": 1.0485, "step": 845 }, { "epoch": 0.9081196581196581, "grad_norm": 0.6118801236152649, "learning_rate": 5.222365647345862e-07, "loss": 1.0666, "step": 850 }, { "epoch": 0.9134615384615384, "grad_norm": 0.6073253750801086, "learning_rate": 4.6439005872132457e-07, "loss": 1.0574, "step": 855 }, { "epoch": 0.9188034188034188, "grad_norm": 0.610341489315033, "learning_rate": 4.0986212630201974e-07, "loss": 1.0599, "step": 860 }, { "epoch": 0.9241452991452992, "grad_norm": 0.589881420135498, "learning_rate": 3.58671744234087e-07, "loss": 1.073, "step": 865 }, { "epoch": 0.9294871794871795, "grad_norm": 0.6028986573219299, "learning_rate": 3.1083672774395055e-07, "loss": 1.0768, "step": 870 }, { "epoch": 0.9348290598290598, "grad_norm": 0.5990201830863953, "learning_rate": 2.6637372432700483e-07, "loss": 1.058, "step": 875 }, { "epoch": 0.9401709401709402, "grad_norm": 0.621417760848999, "learning_rate": 2.2529820795397228e-07, "loss": 1.066, "step": 880 }, { "epoch": 0.9455128205128205, "grad_norm": 0.5974612236022949, "learning_rate": 1.8762447368566582e-07, "loss": 1.0543, "step": 885 }, { "epoch": 0.9508547008547008, "grad_norm": 0.60768061876297, "learning_rate": 1.5336563269803372e-07, "loss": 1.0598, "step": 890 }, { "epoch": 0.9561965811965812, "grad_norm": 0.605929434299469, "learning_rate": 1.225336077192274e-07, "loss": 1.0558, "step": 895 }, { "epoch": 0.9615384615384616, "grad_norm": 0.6178026795387268, "learning_rate": 9.513912888025611e-08, "loss": 1.0661, "step": 900 }, { "epoch": 0.9668803418803419, "grad_norm": 0.603589653968811, "learning_rate": 7.119172998070412e-08, "loss": 1.0557, "step": 905 }, { "epoch": 0.9722222222222222, "grad_norm": 0.6099143028259277, "learning_rate": 5.0699745170785796e-08, "loss": 1.0732, "step": 910 }, { "epoch": 0.9775641025641025, "grad_norm": 0.6262799501419067, "learning_rate": 3.367030605090249e-08, "loss": 1.0671, "step": 915 }, { "epoch": 0.9829059829059829, "grad_norm": 0.6268653273582458, "learning_rate": 2.010933918970781e-08, "loss": 1.0524, "step": 920 }, { "epoch": 0.9882478632478633, "grad_norm": 0.5946672558784485, "learning_rate": 1.0021564061554189e-08, "loss": 1.0477, "step": 925 }, { "epoch": 0.9935897435897436, "grad_norm": 0.6294344067573547, "learning_rate": 3.410491404017835e-09, "loss": 1.0792, "step": 930 }, { "epoch": 0.9989316239316239, "grad_norm": 0.5966866612434387, "learning_rate": 2.784219961060597e-10, "loss": 1.0449, "step": 935 }, { "epoch": 1.0, "step": 936, "total_flos": 1.381094473127166e+18, "train_loss": 1.105873676828849, "train_runtime": 5784.2377, "train_samples_per_second": 20.701, "train_steps_per_second": 0.162 } ], "logging_steps": 5, "max_steps": 936, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.381094473127166e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }