{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 25, "global_step": 335, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0149812734082397, "grad_norm": 3.1280667781829834, "learning_rate": 0.0, "loss": 0.287, "num_input_tokens_seen": 106224, "step": 1, "train_runtime": 24.4359, "train_tokens_per_second": 4347.041 }, { "epoch": 0.0299625468164794, "grad_norm": 1.6757160425186157, "learning_rate": 1.4705882352941177e-06, "loss": 0.1593, "num_input_tokens_seen": 222384, "step": 2, "train_runtime": 34.7317, "train_tokens_per_second": 6402.905 }, { "epoch": 0.0449438202247191, "grad_norm": 2.345803737640381, "learning_rate": 2.9411764705882355e-06, "loss": 0.1572, "num_input_tokens_seen": 332728, "step": 3, "train_runtime": 45.0408, "train_tokens_per_second": 7387.255 }, { "epoch": 0.0599250936329588, "grad_norm": 3.6148295402526855, "learning_rate": 4.411764705882353e-06, "loss": 0.2334, "num_input_tokens_seen": 439824, "step": 4, "train_runtime": 55.4327, "train_tokens_per_second": 7934.377 }, { "epoch": 0.0749063670411985, "grad_norm": 1.2359391450881958, "learning_rate": 5.882352941176471e-06, "loss": 0.0885, "num_input_tokens_seen": 552832, "step": 5, "train_runtime": 65.9903, "train_tokens_per_second": 8377.468 }, { "epoch": 0.0898876404494382, "grad_norm": 1.303969144821167, "learning_rate": 7.3529411764705884e-06, "loss": 0.1054, "num_input_tokens_seen": 667008, "step": 6, "train_runtime": 77.8897, "train_tokens_per_second": 8563.494 }, { "epoch": 0.10486891385767791, "grad_norm": 2.0778756141662598, "learning_rate": 8.823529411764707e-06, "loss": 0.2192, "num_input_tokens_seen": 779728, "step": 7, "train_runtime": 89.3699, "train_tokens_per_second": 8724.73 }, { "epoch": 0.1198501872659176, "grad_norm": 1.932761788368225, "learning_rate": 1.0294117647058824e-05, "loss": 0.1713, "num_input_tokens_seen": 893296, "step": 8, "train_runtime": 101.0598, "train_tokens_per_second": 8839.281 }, { "epoch": 0.1348314606741573, "grad_norm": 1.013716220855713, "learning_rate": 1.1764705882352942e-05, "loss": 0.1316, "num_input_tokens_seen": 1010424, "step": 9, "train_runtime": 112.9423, "train_tokens_per_second": 8946.377 }, { "epoch": 0.149812734082397, "grad_norm": 0.4578853249549866, "learning_rate": 1.323529411764706e-05, "loss": 0.0925, "num_input_tokens_seen": 1122608, "step": 10, "train_runtime": 124.9021, "train_tokens_per_second": 8987.903 }, { "epoch": 0.1647940074906367, "grad_norm": 1.1105873584747314, "learning_rate": 1.4705882352941177e-05, "loss": 0.1682, "num_input_tokens_seen": 1232432, "step": 11, "train_runtime": 136.2736, "train_tokens_per_second": 9043.807 }, { "epoch": 0.1797752808988764, "grad_norm": 0.8049420714378357, "learning_rate": 1.6176470588235296e-05, "loss": 0.1501, "num_input_tokens_seen": 1341448, "step": 12, "train_runtime": 147.7057, "train_tokens_per_second": 9081.896 }, { "epoch": 0.1947565543071161, "grad_norm": 0.7054234743118286, "learning_rate": 1.7647058823529414e-05, "loss": 0.1541, "num_input_tokens_seen": 1438440, "step": 13, "train_runtime": 158.7691, "train_tokens_per_second": 9059.95 }, { "epoch": 0.20973782771535582, "grad_norm": 3.3378353118896484, "learning_rate": 1.9117647058823528e-05, "loss": 0.1972, "num_input_tokens_seen": 1547408, "step": 14, "train_runtime": 170.332, "train_tokens_per_second": 9084.656 }, { "epoch": 0.2247191011235955, "grad_norm": 0.47882241010665894, "learning_rate": 2.058823529411765e-05, "loss": 0.1355, "num_input_tokens_seen": 1660336, "step": 15, "train_runtime": 181.7719, "train_tokens_per_second": 9134.175 }, { "epoch": 0.2397003745318352, "grad_norm": 0.4519363045692444, "learning_rate": 2.2058823529411766e-05, "loss": 0.1175, "num_input_tokens_seen": 1774704, "step": 16, "train_runtime": 193.265, "train_tokens_per_second": 9182.748 }, { "epoch": 0.2546816479400749, "grad_norm": 0.8406792283058167, "learning_rate": 2.3529411764705884e-05, "loss": 0.2153, "num_input_tokens_seen": 1887056, "step": 17, "train_runtime": 204.716, "train_tokens_per_second": 9217.922 }, { "epoch": 0.2696629213483146, "grad_norm": 0.5358920097351074, "learning_rate": 2.5e-05, "loss": 0.1604, "num_input_tokens_seen": 1995456, "step": 18, "train_runtime": 216.103, "train_tokens_per_second": 9233.821 }, { "epoch": 0.2846441947565543, "grad_norm": 0.5870547294616699, "learning_rate": 2.647058823529412e-05, "loss": 0.1922, "num_input_tokens_seen": 2111824, "step": 19, "train_runtime": 227.5641, "train_tokens_per_second": 9280.126 }, { "epoch": 0.299625468164794, "grad_norm": 0.7451322078704834, "learning_rate": 2.7941176470588236e-05, "loss": 0.2839, "num_input_tokens_seen": 2217760, "step": 20, "train_runtime": 238.9081, "train_tokens_per_second": 9282.899 }, { "epoch": 0.3146067415730337, "grad_norm": 0.5490975975990295, "learning_rate": 2.9411764705882354e-05, "loss": 0.1694, "num_input_tokens_seen": 2331736, "step": 21, "train_runtime": 250.286, "train_tokens_per_second": 9316.285 }, { "epoch": 0.3295880149812734, "grad_norm": 0.2864564061164856, "learning_rate": 3.0882352941176475e-05, "loss": 0.0844, "num_input_tokens_seen": 2436696, "step": 22, "train_runtime": 261.5777, "train_tokens_per_second": 9315.381 }, { "epoch": 0.3445692883895131, "grad_norm": 0.5809709429740906, "learning_rate": 3.235294117647059e-05, "loss": 0.1601, "num_input_tokens_seen": 2549584, "step": 23, "train_runtime": 272.813, "train_tokens_per_second": 9345.538 }, { "epoch": 0.3595505617977528, "grad_norm": 0.5216901302337646, "learning_rate": 3.382352941176471e-05, "loss": 0.1835, "num_input_tokens_seen": 2655016, "step": 24, "train_runtime": 284.1449, "train_tokens_per_second": 9343.881 }, { "epoch": 0.37453183520599254, "grad_norm": 0.5510843992233276, "learning_rate": 3.529411764705883e-05, "loss": 0.2083, "num_input_tokens_seen": 2761784, "step": 25, "train_runtime": 295.5762, "train_tokens_per_second": 9343.731 }, { "epoch": 0.37453183520599254, "eval_accuracy": 0.9302976935430894, "eval_loss": 0.22450505197048187, "eval_runtime": 4.9446, "eval_samples_per_second": 11.528, "eval_steps_per_second": 3.034, "num_input_tokens_seen": 2761784, "step": 25 }, { "epoch": 0.3895131086142322, "grad_norm": 0.40877798199653625, "learning_rate": 3.6764705882352945e-05, "loss": 0.1667, "num_input_tokens_seen": 2876408, "step": 26, "train_runtime": 312.0108, "train_tokens_per_second": 9218.938 }, { "epoch": 0.4044943820224719, "grad_norm": 0.2976829409599304, "learning_rate": 3.8235294117647055e-05, "loss": 0.0896, "num_input_tokens_seen": 2987992, "step": 27, "train_runtime": 323.1092, "train_tokens_per_second": 9247.622 }, { "epoch": 0.41947565543071164, "grad_norm": 0.68152916431427, "learning_rate": 3.970588235294117e-05, "loss": 0.2299, "num_input_tokens_seen": 3093200, "step": 28, "train_runtime": 334.4361, "train_tokens_per_second": 9249.001 }, { "epoch": 0.4344569288389513, "grad_norm": 0.6799381375312805, "learning_rate": 4.11764705882353e-05, "loss": 0.269, "num_input_tokens_seen": 3192624, "step": 29, "train_runtime": 345.4411, "train_tokens_per_second": 9242.167 }, { "epoch": 0.449438202247191, "grad_norm": 0.4044613242149353, "learning_rate": 4.2647058823529415e-05, "loss": 0.159, "num_input_tokens_seen": 3306120, "step": 30, "train_runtime": 356.9051, "train_tokens_per_second": 9263.303 }, { "epoch": 0.46441947565543074, "grad_norm": 0.4966118037700653, "learning_rate": 4.411764705882353e-05, "loss": 0.229, "num_input_tokens_seen": 3419688, "step": 31, "train_runtime": 368.3658, "train_tokens_per_second": 9283.402 }, { "epoch": 0.4794007490636704, "grad_norm": 0.6686931848526001, "learning_rate": 4.558823529411765e-05, "loss": 0.1994, "num_input_tokens_seen": 3521896, "step": 32, "train_runtime": 379.7162, "train_tokens_per_second": 9275.074 }, { "epoch": 0.4943820224719101, "grad_norm": 0.2366495132446289, "learning_rate": 4.705882352941177e-05, "loss": 0.109, "num_input_tokens_seen": 3644912, "step": 33, "train_runtime": 391.2145, "train_tokens_per_second": 9316.916 }, { "epoch": 0.5093632958801498, "grad_norm": 0.49944090843200684, "learning_rate": 4.8529411764705885e-05, "loss": 0.1888, "num_input_tokens_seen": 3751480, "step": 34, "train_runtime": 402.5823, "train_tokens_per_second": 9318.541 }, { "epoch": 0.5243445692883895, "grad_norm": 0.5719208121299744, "learning_rate": 5e-05, "loss": 0.1841, "num_input_tokens_seen": 3853072, "step": 35, "train_runtime": 413.9404, "train_tokens_per_second": 9308.277 }, { "epoch": 0.5393258426966292, "grad_norm": 0.5086914896965027, "learning_rate": 4.999863832700438e-05, "loss": 0.2391, "num_input_tokens_seen": 3954992, "step": 36, "train_runtime": 424.9062, "train_tokens_per_second": 9307.918 }, { "epoch": 0.5543071161048689, "grad_norm": 0.616763174533844, "learning_rate": 4.999455345634978e-05, "loss": 0.2589, "num_input_tokens_seen": 4060312, "step": 37, "train_runtime": 436.2608, "train_tokens_per_second": 9307.075 }, { "epoch": 0.5692883895131086, "grad_norm": 0.3873949646949768, "learning_rate": 4.9987745833016855e-05, "loss": 0.1603, "num_input_tokens_seen": 4159664, "step": 38, "train_runtime": 447.7626, "train_tokens_per_second": 9289.888 }, { "epoch": 0.5842696629213483, "grad_norm": 0.42982256412506104, "learning_rate": 4.9978216198586135e-05, "loss": 0.1837, "num_input_tokens_seen": 4273696, "step": 39, "train_runtime": 459.1806, "train_tokens_per_second": 9307.223 }, { "epoch": 0.599250936329588, "grad_norm": 0.45362767577171326, "learning_rate": 4.996596559115731e-05, "loss": 0.2044, "num_input_tokens_seen": 4381080, "step": 40, "train_runtime": 470.3339, "train_tokens_per_second": 9314.829 }, { "epoch": 0.6142322097378277, "grad_norm": 0.32139134407043457, "learning_rate": 4.995099534523607e-05, "loss": 0.1326, "num_input_tokens_seen": 4499912, "step": 41, "train_runtime": 481.7193, "train_tokens_per_second": 9341.357 }, { "epoch": 0.6292134831460674, "grad_norm": 0.39159446954727173, "learning_rate": 4.9933307091588796e-05, "loss": 0.1795, "num_input_tokens_seen": 4606816, "step": 42, "train_runtime": 493.112, "train_tokens_per_second": 9342.332 }, { "epoch": 0.6441947565543071, "grad_norm": 0.4465094804763794, "learning_rate": 4.991290275706486e-05, "loss": 0.188, "num_input_tokens_seen": 4720528, "step": 43, "train_runtime": 504.578, "train_tokens_per_second": 9355.398 }, { "epoch": 0.6591760299625468, "grad_norm": 0.5234239101409912, "learning_rate": 4.988978456438678e-05, "loss": 0.1692, "num_input_tokens_seen": 4834552, "step": 44, "train_runtime": 516.0278, "train_tokens_per_second": 9368.781 }, { "epoch": 0.6741573033707865, "grad_norm": 0.36853307485580444, "learning_rate": 4.986395503190805e-05, "loss": 0.1526, "num_input_tokens_seen": 4940840, "step": 45, "train_runtime": 527.1679, "train_tokens_per_second": 9372.422 }, { "epoch": 0.6891385767790262, "grad_norm": 0.5273284912109375, "learning_rate": 4.983541697333881e-05, "loss": 0.2274, "num_input_tokens_seen": 5044880, "step": 46, "train_runtime": 538.3953, "train_tokens_per_second": 9370.215 }, { "epoch": 0.704119850187266, "grad_norm": 0.24699360132217407, "learning_rate": 4.980417349743936e-05, "loss": 0.1199, "num_input_tokens_seen": 5164256, "step": 47, "train_runtime": 549.2882, "train_tokens_per_second": 9401.724 }, { "epoch": 0.7191011235955056, "grad_norm": 0.5065047144889832, "learning_rate": 4.9770228007681494e-05, "loss": 0.2262, "num_input_tokens_seen": 5262840, "step": 48, "train_runtime": 559.2344, "train_tokens_per_second": 9410.795 }, { "epoch": 0.7340823970037453, "grad_norm": 0.3612704873085022, "learning_rate": 4.973358420187776e-05, "loss": 0.1684, "num_input_tokens_seen": 5374992, "step": 49, "train_runtime": 569.5899, "train_tokens_per_second": 9436.6 }, { "epoch": 0.7490636704119851, "grad_norm": 0.40374258160591125, "learning_rate": 4.9694246071778604e-05, "loss": 0.1599, "num_input_tokens_seen": 5486368, "step": 50, "train_runtime": 580.0566, "train_tokens_per_second": 9458.332 }, { "epoch": 0.7490636704119851, "eval_accuracy": 0.9305199059084309, "eval_loss": 0.22489887475967407, "eval_runtime": 4.9257, "eval_samples_per_second": 11.572, "eval_steps_per_second": 3.045, "num_input_tokens_seen": 5486368, "step": 50 }, { "epoch": 0.7640449438202247, "grad_norm": 0.41528448462486267, "learning_rate": 4.9652217902637596e-05, "loss": 0.2025, "num_input_tokens_seen": 5582648, "step": 51, "train_runtime": 594.7066, "train_tokens_per_second": 9387.23 }, { "epoch": 0.7790262172284644, "grad_norm": 0.32743218541145325, "learning_rate": 4.9607504272744575e-05, "loss": 0.1592, "num_input_tokens_seen": 5692920, "step": 52, "train_runtime": 604.9175, "train_tokens_per_second": 9411.069 }, { "epoch": 0.7940074906367042, "grad_norm": 0.484891802072525, "learning_rate": 4.956011005292692e-05, "loss": 0.2657, "num_input_tokens_seen": 5795728, "step": 53, "train_runtime": 615.187, "train_tokens_per_second": 9421.083 }, { "epoch": 0.8089887640449438, "grad_norm": 0.335283488035202, "learning_rate": 4.951004040601898e-05, "loss": 0.1878, "num_input_tokens_seen": 5911816, "step": 54, "train_runtime": 625.4996, "train_tokens_per_second": 9451.351 }, { "epoch": 0.8239700374531835, "grad_norm": 0.44197705388069153, "learning_rate": 4.945730078629964e-05, "loss": 0.2157, "num_input_tokens_seen": 6015648, "step": 55, "train_runtime": 635.6908, "train_tokens_per_second": 9463.167 }, { "epoch": 0.8389513108614233, "grad_norm": 0.3809565603733063, "learning_rate": 4.9401896938898185e-05, "loss": 0.1789, "num_input_tokens_seen": 6132248, "step": 56, "train_runtime": 646.1102, "train_tokens_per_second": 9491.025 }, { "epoch": 0.8539325842696629, "grad_norm": 0.4315880239009857, "learning_rate": 4.934383489916843e-05, "loss": 0.2019, "num_input_tokens_seen": 6249344, "step": 57, "train_runtime": 656.5451, "train_tokens_per_second": 9518.529 }, { "epoch": 0.8689138576779026, "grad_norm": 0.27884915471076965, "learning_rate": 4.928312099203131e-05, "loss": 0.132, "num_input_tokens_seen": 6366872, "step": 58, "train_runtime": 667.0032, "train_tokens_per_second": 9545.49 }, { "epoch": 0.8838951310861424, "grad_norm": 0.4101852476596832, "learning_rate": 4.921976183128585e-05, "loss": 0.2022, "num_input_tokens_seen": 6475464, "step": 59, "train_runtime": 677.3685, "train_tokens_per_second": 9559.736 }, { "epoch": 0.898876404494382, "grad_norm": 0.35576295852661133, "learning_rate": 4.9153764318888706e-05, "loss": 0.1605, "num_input_tokens_seen": 6587040, "step": 60, "train_runtime": 688.9819, "train_tokens_per_second": 9560.542 }, { "epoch": 0.9138576779026217, "grad_norm": 0.4770338833332062, "learning_rate": 4.908513564420231e-05, "loss": 0.2062, "num_input_tokens_seen": 6702552, "step": 61, "train_runtime": 700.508, "train_tokens_per_second": 9568.131 }, { "epoch": 0.9288389513108615, "grad_norm": 0.3157297372817993, "learning_rate": 4.90138832832117e-05, "loss": 0.1485, "num_input_tokens_seen": 6809352, "step": 62, "train_runtime": 712.0733, "train_tokens_per_second": 9562.713 }, { "epoch": 0.9438202247191011, "grad_norm": 0.4924875497817993, "learning_rate": 4.894001499771015e-05, "loss": 0.1896, "num_input_tokens_seen": 6909928, "step": 63, "train_runtime": 723.6298, "train_tokens_per_second": 9548.983 }, { "epoch": 0.9588014981273408, "grad_norm": 0.245199054479599, "learning_rate": 4.886353883445363e-05, "loss": 0.1141, "num_input_tokens_seen": 7029288, "step": 64, "train_runtime": 735.3069, "train_tokens_per_second": 9559.665 }, { "epoch": 0.9737827715355806, "grad_norm": 2.8648366928100586, "learning_rate": 4.878446312428424e-05, "loss": 0.2227, "num_input_tokens_seen": 7136544, "step": 65, "train_runtime": 746.8368, "train_tokens_per_second": 9555.693 }, { "epoch": 0.9887640449438202, "grad_norm": 0.28591519594192505, "learning_rate": 4.8702796481222714e-05, "loss": 0.1648, "num_input_tokens_seen": 7244184, "step": 66, "train_runtime": 758.3309, "train_tokens_per_second": 9552.801 }, { "epoch": 1.0, "grad_norm": 0.28591519594192505, "learning_rate": 4.861854780153004e-05, "loss": 0.2552, "num_input_tokens_seen": 7319544, "step": 67, "train_runtime": 763.1534, "train_tokens_per_second": 9591.183 }, { "epoch": 1.0149812734082397, "grad_norm": 0.5447623133659363, "learning_rate": 4.853172626273841e-05, "loss": 0.1038, "num_input_tokens_seen": 7437632, "step": 68, "train_runtime": 774.79, "train_tokens_per_second": 9599.546 }, { "epoch": 1.0299625468164795, "grad_norm": 0.27388796210289, "learning_rate": 4.8442341322651385e-05, "loss": 0.1202, "num_input_tokens_seen": 7547280, "step": 69, "train_runtime": 786.2831, "train_tokens_per_second": 9598.68 }, { "epoch": 1.0449438202247192, "grad_norm": 0.3335851728916168, "learning_rate": 4.83504027183137e-05, "loss": 0.1851, "num_input_tokens_seen": 7658904, "step": 70, "train_runtime": 798.2298, "train_tokens_per_second": 9594.861 }, { "epoch": 1.0599250936329587, "grad_norm": 0.2915020287036896, "learning_rate": 4.825592046495054e-05, "loss": 0.1193, "num_input_tokens_seen": 7762712, "step": 71, "train_runtime": 809.7872, "train_tokens_per_second": 9586.113 }, { "epoch": 1.0749063670411985, "grad_norm": 0.3645778000354767, "learning_rate": 4.8158904854876555e-05, "loss": 0.1442, "num_input_tokens_seen": 7875080, "step": 72, "train_runtime": 821.7369, "train_tokens_per_second": 9583.457 }, { "epoch": 1.0898876404494382, "grad_norm": 0.21766622364521027, "learning_rate": 4.805936645637463e-05, "loss": 0.1783, "num_input_tokens_seen": 7989424, "step": 73, "train_runtime": 833.6118, "train_tokens_per_second": 9584.106 }, { "epoch": 1.104868913857678, "grad_norm": 0.24854034185409546, "learning_rate": 4.795731611254473e-05, "loss": 0.096, "num_input_tokens_seen": 8104200, "step": 74, "train_runtime": 845.3499, "train_tokens_per_second": 9586.799 }, { "epoch": 1.1198501872659177, "grad_norm": 0.2844558358192444, "learning_rate": 4.785276494012263e-05, "loss": 0.1223, "num_input_tokens_seen": 8216400, "step": 75, "train_runtime": 857.2782, "train_tokens_per_second": 9584.287 }, { "epoch": 1.1198501872659177, "eval_accuracy": 0.9364104124311492, "eval_loss": 0.20777302980422974, "eval_runtime": 4.9438, "eval_samples_per_second": 11.53, "eval_steps_per_second": 3.034, "num_input_tokens_seen": 8216400, "step": 75 }, { "epoch": 1.1348314606741572, "grad_norm": 0.2329414337873459, "learning_rate": 4.7745724328269e-05, "loss": 0.1293, "num_input_tokens_seen": 8330424, "step": 76, "train_runtime": 874.2192, "train_tokens_per_second": 9528.988 }, { "epoch": 1.149812734082397, "grad_norm": 0.32167452573776245, "learning_rate": 4.763620593732867e-05, "loss": 0.1562, "num_input_tokens_seen": 8438312, "step": 77, "train_runtime": 886.018, "train_tokens_per_second": 9523.86 }, { "epoch": 1.1647940074906367, "grad_norm": 0.3214111030101776, "learning_rate": 4.752422169756048e-05, "loss": 0.1081, "num_input_tokens_seen": 8538856, "step": 78, "train_runtime": 897.8564, "train_tokens_per_second": 9510.269 }, { "epoch": 1.1797752808988764, "grad_norm": 0.22370071709156036, "learning_rate": 4.740978380783765e-05, "loss": 0.0907, "num_input_tokens_seen": 8648688, "step": 79, "train_runtime": 909.7476, "train_tokens_per_second": 9506.689 }, { "epoch": 1.1947565543071161, "grad_norm": 0.2359580248594284, "learning_rate": 4.7292904734318924e-05, "loss": 0.1497, "num_input_tokens_seen": 8757528, "step": 80, "train_runtime": 921.657, "train_tokens_per_second": 9501.938 }, { "epoch": 1.2097378277153559, "grad_norm": 0.45581308007240295, "learning_rate": 4.7173597209090534e-05, "loss": 0.1343, "num_input_tokens_seen": 8871600, "step": 81, "train_runtime": 933.4439, "train_tokens_per_second": 9504.16 }, { "epoch": 1.2247191011235956, "grad_norm": 0.33737245202064514, "learning_rate": 4.70518742287793e-05, "loss": 0.1842, "num_input_tokens_seen": 8975328, "step": 82, "train_runtime": 944.8618, "train_tokens_per_second": 9499.091 }, { "epoch": 1.2397003745318351, "grad_norm": 0.43545785546302795, "learning_rate": 4.6927749053136866e-05, "loss": 0.1342, "num_input_tokens_seen": 9090992, "step": 83, "train_runtime": 956.4008, "train_tokens_per_second": 9505.421 }, { "epoch": 1.2546816479400749, "grad_norm": 0.33271756768226624, "learning_rate": 4.6801235203595195e-05, "loss": 0.1938, "num_input_tokens_seen": 9201320, "step": 84, "train_runtime": 967.7879, "train_tokens_per_second": 9507.579 }, { "epoch": 1.2696629213483146, "grad_norm": 0.3993559777736664, "learning_rate": 4.667234646179368e-05, "loss": 0.1673, "num_input_tokens_seen": 9304160, "step": 85, "train_runtime": 978.9186, "train_tokens_per_second": 9504.529 }, { "epoch": 1.2846441947565543, "grad_norm": 0.3416566252708435, "learning_rate": 4.654109686807787e-05, "loss": 0.2025, "num_input_tokens_seen": 9409224, "step": 86, "train_runtime": 990.4873, "train_tokens_per_second": 9499.59 }, { "epoch": 1.299625468164794, "grad_norm": 0.3253297805786133, "learning_rate": 4.640750071996995e-05, "loss": 0.1421, "num_input_tokens_seen": 9514232, "step": 87, "train_runtime": 1002.0082, "train_tokens_per_second": 9495.164 }, { "epoch": 1.3146067415730336, "grad_norm": 0.3348604738712311, "learning_rate": 4.6271572570611296e-05, "loss": 0.1485, "num_input_tokens_seen": 9623752, "step": 88, "train_runtime": 1013.5776, "train_tokens_per_second": 9494.835 }, { "epoch": 1.3295880149812733, "grad_norm": 0.34817907214164734, "learning_rate": 4.613332722717714e-05, "loss": 0.1504, "num_input_tokens_seen": 9734808, "step": 89, "train_runtime": 1025.1774, "train_tokens_per_second": 9495.73 }, { "epoch": 1.344569288389513, "grad_norm": 0.3490277826786041, "learning_rate": 4.5992779749263546e-05, "loss": 0.1232, "num_input_tokens_seen": 9847464, "step": 90, "train_runtime": 1036.7917, "train_tokens_per_second": 9498.016 }, { "epoch": 1.3595505617977528, "grad_norm": 0.2823807895183563, "learning_rate": 4.584994544724695e-05, "loss": 0.1916, "num_input_tokens_seen": 9940464, "step": 91, "train_runtime": 1048.2397, "train_tokens_per_second": 9483.006 }, { "epoch": 1.3745318352059925, "grad_norm": 0.3263910114765167, "learning_rate": 4.5704839880616296e-05, "loss": 0.1665, "num_input_tokens_seen": 10054728, "step": 92, "train_runtime": 1059.8602, "train_tokens_per_second": 9486.844 }, { "epoch": 1.3895131086142323, "grad_norm": 0.26731908321380615, "learning_rate": 4.5557478856278114e-05, "loss": 0.102, "num_input_tokens_seen": 10172456, "step": 93, "train_runtime": 1071.5397, "train_tokens_per_second": 9493.307 }, { "epoch": 1.404494382022472, "grad_norm": 0.2940012216567993, "learning_rate": 4.5407878426834596e-05, "loss": 0.1167, "num_input_tokens_seen": 10279024, "step": 94, "train_runtime": 1083.1405, "train_tokens_per_second": 9490.019 }, { "epoch": 1.4194756554307117, "grad_norm": 0.24917353689670563, "learning_rate": 4.5256054888834934e-05, "loss": 0.1945, "num_input_tokens_seen": 10394120, "step": 95, "train_runtime": 1094.8039, "train_tokens_per_second": 9494.047 }, { "epoch": 1.4344569288389513, "grad_norm": 0.3618624806404114, "learning_rate": 4.5102024781000077e-05, "loss": 0.1576, "num_input_tokens_seen": 10503768, "step": 96, "train_runtime": 1106.3393, "train_tokens_per_second": 9494.165 }, { "epoch": 1.449438202247191, "grad_norm": 0.596593976020813, "learning_rate": 4.4945804882421086e-05, "loss": 0.1266, "num_input_tokens_seen": 10616136, "step": 97, "train_runtime": 1117.9131, "train_tokens_per_second": 9496.388 }, { "epoch": 1.4644194756554307, "grad_norm": 0.2645472586154938, "learning_rate": 4.478741221073136e-05, "loss": 0.0974, "num_input_tokens_seen": 10725704, "step": 98, "train_runtime": 1129.4583, "train_tokens_per_second": 9496.326 }, { "epoch": 1.4794007490636705, "grad_norm": 0.3144528269767761, "learning_rate": 4.4626864020252774e-05, "loss": 0.0942, "num_input_tokens_seen": 10838848, "step": 99, "train_runtime": 1141.0205, "train_tokens_per_second": 9499.258 }, { "epoch": 1.49438202247191, "grad_norm": 0.2749118208885193, "learning_rate": 4.446417780011618e-05, "loss": 0.16, "num_input_tokens_seen": 10953704, "step": 100, "train_runtime": 1152.6526, "train_tokens_per_second": 9503.04 }, { "epoch": 1.49438202247191, "eval_accuracy": 0.9357448056062344, "eval_loss": 0.20240993797779083, "eval_runtime": 4.9441, "eval_samples_per_second": 11.529, "eval_steps_per_second": 3.034, "num_input_tokens_seen": 10953704, "step": 100 }, { "epoch": 1.5093632958801497, "grad_norm": 0.3232828378677368, "learning_rate": 4.42993712723562e-05, "loss": 0.1192, "num_input_tokens_seen": 11073888, "step": 101, "train_runtime": 1169.2748, "train_tokens_per_second": 9470.732 }, { "epoch": 1.5243445692883895, "grad_norm": 0.3835008144378662, "learning_rate": 4.413246238998069e-05, "loss": 0.1767, "num_input_tokens_seen": 11178896, "step": 102, "train_runtime": 1180.9051, "train_tokens_per_second": 9466.38 }, { "epoch": 1.5393258426966292, "grad_norm": 0.29268574714660645, "learning_rate": 4.3963469335015085e-05, "loss": 0.1383, "num_input_tokens_seen": 11289112, "step": 103, "train_runtime": 1192.4003, "train_tokens_per_second": 9467.552 }, { "epoch": 1.554307116104869, "grad_norm": 0.24543774127960205, "learning_rate": 4.379241051652174e-05, "loss": 0.1421, "num_input_tokens_seen": 11401952, "step": 104, "train_runtime": 1203.8514, "train_tokens_per_second": 9471.229 }, { "epoch": 1.5692883895131087, "grad_norm": 0.3485076427459717, "learning_rate": 4.361930456859455e-05, "loss": 0.1201, "num_input_tokens_seen": 11511848, "step": 105, "train_runtime": 1215.263, "train_tokens_per_second": 9472.722 }, { "epoch": 1.5842696629213484, "grad_norm": 0.21573545038700104, "learning_rate": 4.34441703483291e-05, "loss": 0.0623, "num_input_tokens_seen": 11625728, "step": 106, "train_runtime": 1226.7662, "train_tokens_per_second": 9476.726 }, { "epoch": 1.5992509363295881, "grad_norm": 0.578035831451416, "learning_rate": 4.326702693376844e-05, "loss": 0.193, "num_input_tokens_seen": 11741544, "step": 107, "train_runtime": 1238.3403, "train_tokens_per_second": 9481.678 }, { "epoch": 1.6142322097378277, "grad_norm": 0.21446435153484344, "learning_rate": 4.308789362182492e-05, "loss": 0.0936, "num_input_tokens_seen": 11851240, "step": 108, "train_runtime": 1249.8102, "train_tokens_per_second": 9482.432 }, { "epoch": 1.6292134831460674, "grad_norm": 0.30708542466163635, "learning_rate": 4.2906789926177975e-05, "loss": 0.1468, "num_input_tokens_seen": 11963664, "step": 109, "train_runtime": 1261.6521, "train_tokens_per_second": 9482.538 }, { "epoch": 1.6441947565543071, "grad_norm": 0.3326849341392517, "learning_rate": 4.272373557514858e-05, "loss": 0.1707, "num_input_tokens_seen": 12067544, "step": 110, "train_runtime": 1272.7519, "train_tokens_per_second": 9481.459 }, { "epoch": 1.6591760299625467, "grad_norm": 0.4741860032081604, "learning_rate": 4.2538750509550054e-05, "loss": 0.1829, "num_input_tokens_seen": 12164792, "step": 111, "train_runtime": 1284.2245, "train_tokens_per_second": 9472.481 }, { "epoch": 1.6741573033707864, "grad_norm": 0.29018136858940125, "learning_rate": 4.235185488051585e-05, "loss": 0.1401, "num_input_tokens_seen": 12281440, "step": 112, "train_runtime": 1296.1999, "train_tokens_per_second": 9474.958 }, { "epoch": 1.6891385767790261, "grad_norm": 0.2956504225730896, "learning_rate": 4.216306904730447e-05, "loss": 0.1412, "num_input_tokens_seen": 12389800, "step": 113, "train_runtime": 1308.0426, "train_tokens_per_second": 9472.015 }, { "epoch": 1.7041198501872659, "grad_norm": 0.27467837929725647, "learning_rate": 4.1972413575081595e-05, "loss": 0.1908, "num_input_tokens_seen": 12498360, "step": 114, "train_runtime": 1319.6005, "train_tokens_per_second": 9471.321 }, { "epoch": 1.7191011235955056, "grad_norm": 0.5112754702568054, "learning_rate": 4.177990923267986e-05, "loss": 0.1783, "num_input_tokens_seen": 12601072, "step": 115, "train_runtime": 1331.1821, "train_tokens_per_second": 9466.077 }, { "epoch": 1.7340823970037453, "grad_norm": 0.25173839926719666, "learning_rate": 4.158557699033644e-05, "loss": 0.1246, "num_input_tokens_seen": 12704456, "step": 116, "train_runtime": 1342.9235, "train_tokens_per_second": 9460.298 }, { "epoch": 1.749063670411985, "grad_norm": 0.34037086367607117, "learning_rate": 4.138943801740865e-05, "loss": 0.0917, "num_input_tokens_seen": 12801568, "step": 117, "train_runtime": 1354.7403, "train_tokens_per_second": 9449.463 }, { "epoch": 1.7640449438202248, "grad_norm": 0.20387206971645355, "learning_rate": 4.119151368006793e-05, "loss": 0.0672, "num_input_tokens_seen": 12917448, "step": 118, "train_runtime": 1366.4787, "train_tokens_per_second": 9453.092 }, { "epoch": 1.7790262172284645, "grad_norm": 0.19825316965579987, "learning_rate": 4.099182553897229e-05, "loss": 0.1358, "num_input_tokens_seen": 13022432, "step": 119, "train_runtime": 1378.2058, "train_tokens_per_second": 9448.83 }, { "epoch": 1.7940074906367043, "grad_norm": 0.2554757297039032, "learning_rate": 4.079039534691767e-05, "loss": 0.1048, "num_input_tokens_seen": 13129888, "step": 120, "train_runtime": 1390.1082, "train_tokens_per_second": 9445.227 }, { "epoch": 1.8089887640449438, "grad_norm": 0.256199449300766, "learning_rate": 4.058724504646834e-05, "loss": 0.1369, "num_input_tokens_seen": 13235312, "step": 121, "train_runtime": 1401.929, "train_tokens_per_second": 9440.786 }, { "epoch": 1.8239700374531835, "grad_norm": 0.22924847900867462, "learning_rate": 4.0382396767566536e-05, "loss": 0.1564, "num_input_tokens_seen": 13350920, "step": 122, "train_runtime": 1413.75, "train_tokens_per_second": 9443.622 }, { "epoch": 1.8389513108614233, "grad_norm": 0.3208468556404114, "learning_rate": 4.017587282512181e-05, "loss": 0.1292, "num_input_tokens_seen": 13458096, "step": 123, "train_runtime": 1425.7005, "train_tokens_per_second": 9439.637 }, { "epoch": 1.8539325842696628, "grad_norm": 0.2948530614376068, "learning_rate": 3.9967695716580224e-05, "loss": 0.1175, "num_input_tokens_seen": 13566016, "step": 124, "train_runtime": 1437.6399, "train_tokens_per_second": 9436.31 }, { "epoch": 1.8689138576779025, "grad_norm": 0.22196036577224731, "learning_rate": 3.975788811947351e-05, "loss": 0.1814, "num_input_tokens_seen": 13676808, "step": 125, "train_runtime": 1449.4544, "train_tokens_per_second": 9435.832 }, { "epoch": 1.8689138576779025, "eval_accuracy": 0.9412339186185766, "eval_loss": 0.18464037775993347, "eval_runtime": 4.9271, "eval_samples_per_second": 11.569, "eval_steps_per_second": 3.044, "num_input_tokens_seen": 13676808, "step": 125 }, { "epoch": 1.8838951310861423, "grad_norm": 0.3131586015224457, "learning_rate": 3.954647288894883e-05, "loss": 0.0969, "num_input_tokens_seen": 13785624, "step": 126, "train_runtime": 1466.2368, "train_tokens_per_second": 9402.045 }, { "epoch": 1.898876404494382, "grad_norm": 0.2623322606086731, "learning_rate": 3.933347305527898e-05, "loss": 0.1431, "num_input_tokens_seen": 13896368, "step": 127, "train_runtime": 1478.1099, "train_tokens_per_second": 9401.444 }, { "epoch": 1.9138576779026217, "grad_norm": 0.3807845711708069, "learning_rate": 3.911891182135371e-05, "loss": 0.1552, "num_input_tokens_seen": 14010984, "step": 128, "train_runtime": 1490.1185, "train_tokens_per_second": 9402.597 }, { "epoch": 1.9288389513108615, "grad_norm": 0.36109474301338196, "learning_rate": 3.8902812560152066e-05, "loss": 0.1472, "num_input_tokens_seen": 14112168, "step": 129, "train_runtime": 1501.7018, "train_tokens_per_second": 9397.45 }, { "epoch": 1.9438202247191012, "grad_norm": 0.20871855318546295, "learning_rate": 3.868519881219631e-05, "loss": 0.1115, "num_input_tokens_seen": 14227128, "step": 130, "train_runtime": 1513.4596, "train_tokens_per_second": 9400.402 }, { "epoch": 1.958801498127341, "grad_norm": 0.8112205266952515, "learning_rate": 3.846609428298757e-05, "loss": 0.1027, "num_input_tokens_seen": 14342592, "step": 131, "train_runtime": 1525.4083, "train_tokens_per_second": 9402.461 }, { "epoch": 1.9737827715355807, "grad_norm": 0.20138965547084808, "learning_rate": 3.824552284042351e-05, "loss": 0.1057, "num_input_tokens_seen": 14461768, "step": 132, "train_runtime": 1537.3708, "train_tokens_per_second": 9406.818 }, { "epoch": 1.9887640449438202, "grad_norm": 0.2765468657016754, "learning_rate": 3.8023508512198256e-05, "loss": 0.1326, "num_input_tokens_seen": 14568520, "step": 133, "train_runtime": 1549.1006, "train_tokens_per_second": 9404.502 }, { "epoch": 2.0, "grad_norm": 0.3718896806240082, "learning_rate": 3.780007548318507e-05, "loss": 0.1245, "num_input_tokens_seen": 14641496, "step": 134, "train_runtime": 1558.8821, "train_tokens_per_second": 9392.305 }, { "epoch": 2.0149812734082397, "grad_norm": 0.3595190942287445, "learning_rate": 3.7575248092801686e-05, "loss": 0.158, "num_input_tokens_seen": 14745856, "step": 135, "train_runtime": 1570.765, "train_tokens_per_second": 9387.691 }, { "epoch": 2.0299625468164795, "grad_norm": 0.31272247433662415, "learning_rate": 3.734905083235901e-05, "loss": 0.122, "num_input_tokens_seen": 14851856, "step": 136, "train_runtime": 1582.6874, "train_tokens_per_second": 9383.948 }, { "epoch": 2.044943820224719, "grad_norm": 0.3598696291446686, "learning_rate": 3.712150834239313e-05, "loss": 0.1392, "num_input_tokens_seen": 14962208, "step": 137, "train_runtime": 1594.3777, "train_tokens_per_second": 9384.356 }, { "epoch": 2.059925093632959, "grad_norm": 0.45219355821609497, "learning_rate": 3.689264540998116e-05, "loss": 0.0892, "num_input_tokens_seen": 15071712, "step": 138, "train_runtime": 1606.014, "train_tokens_per_second": 9384.546 }, { "epoch": 2.0749063670411987, "grad_norm": 0.18551455438137054, "learning_rate": 3.66624869660411e-05, "loss": 0.0706, "num_input_tokens_seen": 15178568, "step": 139, "train_runtime": 1617.5902, "train_tokens_per_second": 9383.444 }, { "epoch": 2.0898876404494384, "grad_norm": 0.1813335120677948, "learning_rate": 3.6431058082615964e-05, "loss": 0.0695, "num_input_tokens_seen": 15295296, "step": 140, "train_runtime": 1629.2133, "train_tokens_per_second": 9388.148 }, { "epoch": 2.1048689138576777, "grad_norm": 0.25384795665740967, "learning_rate": 3.619838397014263e-05, "loss": 0.1314, "num_input_tokens_seen": 15401968, "step": 141, "train_runtime": 1640.8051, "train_tokens_per_second": 9386.836 }, { "epoch": 2.1198501872659175, "grad_norm": 1.163858413696289, "learning_rate": 3.5964489974705553e-05, "loss": 0.1043, "num_input_tokens_seen": 15510128, "step": 142, "train_runtime": 1652.1467, "train_tokens_per_second": 9387.864 }, { "epoch": 2.134831460674157, "grad_norm": 0.280141144990921, "learning_rate": 3.572940157527572e-05, "loss": 0.1566, "num_input_tokens_seen": 15606536, "step": 143, "train_runtime": 1663.2096, "train_tokens_per_second": 9383.385 }, { "epoch": 2.149812734082397, "grad_norm": 0.371442973613739, "learning_rate": 3.549314438093515e-05, "loss": 0.0907, "num_input_tokens_seen": 15717520, "step": 144, "train_runtime": 1674.5727, "train_tokens_per_second": 9385.989 }, { "epoch": 2.1647940074906367, "grad_norm": 0.2719246447086334, "learning_rate": 3.525574412808717e-05, "loss": 0.1258, "num_input_tokens_seen": 15827848, "step": 145, "train_runtime": 1686.0755, "train_tokens_per_second": 9387.39 }, { "epoch": 2.1797752808988764, "grad_norm": 0.25204575061798096, "learning_rate": 3.501722667765286e-05, "loss": 0.1402, "num_input_tokens_seen": 15934960, "step": 146, "train_runtime": 1697.5903, "train_tokens_per_second": 9386.812 }, { "epoch": 2.194756554307116, "grad_norm": 0.20074494183063507, "learning_rate": 3.47776180122539e-05, "loss": 0.0751, "num_input_tokens_seen": 16038664, "step": 147, "train_runtime": 1708.9323, "train_tokens_per_second": 9385.196 }, { "epoch": 2.209737827715356, "grad_norm": 0.32247740030288696, "learning_rate": 3.453694423338225e-05, "loss": 0.1599, "num_input_tokens_seen": 16142344, "step": 148, "train_runtime": 1719.9895, "train_tokens_per_second": 9385.141 }, { "epoch": 2.2247191011235956, "grad_norm": 0.2926752269268036, "learning_rate": 3.4295231558556715e-05, "loss": 0.1017, "num_input_tokens_seen": 16242008, "step": 149, "train_runtime": 1731.3942, "train_tokens_per_second": 9380.884 }, { "epoch": 2.2397003745318353, "grad_norm": 0.284390926361084, "learning_rate": 3.4052506318467084e-05, "loss": 0.0857, "num_input_tokens_seen": 16353368, "step": 150, "train_runtime": 1742.8962, "train_tokens_per_second": 9382.87 }, { "epoch": 2.2397003745318353, "eval_accuracy": 0.9414076449679479, "eval_loss": 0.1802486777305603, "eval_runtime": 4.927, "eval_samples_per_second": 11.569, "eval_steps_per_second": 3.044, "num_input_tokens_seen": 16353368, "step": 150 }, { "epoch": 2.254681647940075, "grad_norm": 0.24835243821144104, "learning_rate": 3.3808794954105716e-05, "loss": 0.12, "num_input_tokens_seen": 16462800, "step": 151, "train_runtime": 1759.3705, "train_tokens_per_second": 9357.211 }, { "epoch": 2.2696629213483144, "grad_norm": 8.10318660736084, "learning_rate": 3.356412401388732e-05, "loss": 0.202, "num_input_tokens_seen": 16576136, "step": 152, "train_runtime": 1770.9609, "train_tokens_per_second": 9359.967 }, { "epoch": 2.284644194756554, "grad_norm": 0.20310688018798828, "learning_rate": 3.3318520150756846e-05, "loss": 0.0774, "num_input_tokens_seen": 16685072, "step": 153, "train_runtime": 1782.4318, "train_tokens_per_second": 9360.847 }, { "epoch": 2.299625468164794, "grad_norm": 0.2707997262477875, "learning_rate": 3.307201011928616e-05, "loss": 0.0896, "num_input_tokens_seen": 16799472, "step": 154, "train_runtime": 1793.9015, "train_tokens_per_second": 9364.768 }, { "epoch": 2.3146067415730336, "grad_norm": 0.33697864413261414, "learning_rate": 3.282462077275947e-05, "loss": 0.1516, "num_input_tokens_seen": 16916072, "step": 155, "train_runtime": 1805.4847, "train_tokens_per_second": 9369.269 }, { "epoch": 2.3295880149812733, "grad_norm": 0.27440527081489563, "learning_rate": 3.257637906024822e-05, "loss": 0.1394, "num_input_tokens_seen": 17036352, "step": 156, "train_runtime": 1817.14, "train_tokens_per_second": 9375.366 }, { "epoch": 2.344569288389513, "grad_norm": 0.24285127222537994, "learning_rate": 3.2327312023675287e-05, "loss": 0.1162, "num_input_tokens_seen": 17141704, "step": 157, "train_runtime": 1828.6308, "train_tokens_per_second": 9374.065 }, { "epoch": 2.359550561797753, "grad_norm": 0.3084551692008972, "learning_rate": 3.2077446794869295e-05, "loss": 0.1081, "num_input_tokens_seen": 17247616, "step": 158, "train_runtime": 1840.114, "train_tokens_per_second": 9373.123 }, { "epoch": 2.3745318352059925, "grad_norm": 0.2344665825366974, "learning_rate": 3.1826810592609036e-05, "loss": 0.1278, "num_input_tokens_seen": 17360352, "step": 159, "train_runtime": 1851.4615, "train_tokens_per_second": 9376.567 }, { "epoch": 2.3895131086142323, "grad_norm": 0.26749441027641296, "learning_rate": 3.157543071965835e-05, "loss": 0.1027, "num_input_tokens_seen": 17472040, "step": 160, "train_runtime": 1863.0215, "train_tokens_per_second": 9378.335 }, { "epoch": 2.404494382022472, "grad_norm": 0.543249785900116, "learning_rate": 3.132333455979202e-05, "loss": 0.1247, "num_input_tokens_seen": 17579232, "step": 161, "train_runtime": 1874.575, "train_tokens_per_second": 9377.716 }, { "epoch": 2.4194756554307117, "grad_norm": 0.1824382096529007, "learning_rate": 3.107054957481271e-05, "loss": 0.0773, "num_input_tokens_seen": 17686392, "step": 162, "train_runtime": 1886.1923, "train_tokens_per_second": 9376.77 }, { "epoch": 2.4344569288389515, "grad_norm": 0.1845661848783493, "learning_rate": 3.081710330155942e-05, "loss": 0.0579, "num_input_tokens_seen": 17800024, "step": 163, "train_runtime": 1897.7131, "train_tokens_per_second": 9379.723 }, { "epoch": 2.449438202247191, "grad_norm": 0.6334578394889832, "learning_rate": 3.056302334890786e-05, "loss": 0.0756, "num_input_tokens_seen": 17909576, "step": 164, "train_runtime": 1909.3707, "train_tokens_per_second": 9379.832 }, { "epoch": 2.464419475655431, "grad_norm": 0.28113627433776855, "learning_rate": 3.030833739476285e-05, "loss": 0.1386, "num_input_tokens_seen": 18009360, "step": 165, "train_runtime": 1920.56, "train_tokens_per_second": 9377.14 }, { "epoch": 2.4794007490636703, "grad_norm": 0.3643280267715454, "learning_rate": 3.0053073183043256e-05, "loss": 0.1432, "num_input_tokens_seen": 18114736, "step": 166, "train_runtime": 1931.8708, "train_tokens_per_second": 9376.785 }, { "epoch": 2.49438202247191, "grad_norm": 0.2565111815929413, "learning_rate": 2.979725852065981e-05, "loss": 0.1071, "num_input_tokens_seen": 18226888, "step": 167, "train_runtime": 1943.3904, "train_tokens_per_second": 9378.912 }, { "epoch": 2.5093632958801497, "grad_norm": 0.27584174275398254, "learning_rate": 2.954092127448591e-05, "loss": 0.114, "num_input_tokens_seen": 18338720, "step": 168, "train_runtime": 1954.9951, "train_tokens_per_second": 9380.443 }, { "epoch": 2.5243445692883895, "grad_norm": 0.22883868217468262, "learning_rate": 2.9284089368322045e-05, "loss": 0.0981, "num_input_tokens_seen": 18451496, "step": 169, "train_runtime": 1966.6195, "train_tokens_per_second": 9382.341 }, { "epoch": 2.539325842696629, "grad_norm": 0.33810093998908997, "learning_rate": 2.9026790779853874e-05, "loss": 0.1347, "num_input_tokens_seen": 18556776, "step": 170, "train_runtime": 1977.785, "train_tokens_per_second": 9382.605 }, { "epoch": 2.554307116104869, "grad_norm": 0.31047242879867554, "learning_rate": 2.876905353760459e-05, "loss": 0.0833, "num_input_tokens_seen": 18664112, "step": 171, "train_runtime": 1989.3899, "train_tokens_per_second": 9381.827 }, { "epoch": 2.5692883895131087, "grad_norm": 0.25530344247817993, "learning_rate": 2.8510905717881614e-05, "loss": 0.1111, "num_input_tokens_seen": 18769448, "step": 172, "train_runtime": 2000.6604, "train_tokens_per_second": 9381.626 }, { "epoch": 2.5842696629213484, "grad_norm": 0.2946469187736511, "learning_rate": 2.8252375441718137e-05, "loss": 0.1501, "num_input_tokens_seen": 18884864, "step": 173, "train_runtime": 2012.1806, "train_tokens_per_second": 9385.273 }, { "epoch": 2.599250936329588, "grad_norm": 0.3773297667503357, "learning_rate": 2.7993490871809808e-05, "loss": 0.1171, "num_input_tokens_seen": 18993424, "step": 174, "train_runtime": 2023.6738, "train_tokens_per_second": 9385.616 }, { "epoch": 2.6142322097378274, "grad_norm": 0.2301235944032669, "learning_rate": 2.7734280209446865e-05, "loss": 0.1261, "num_input_tokens_seen": 19111296, "step": 175, "train_runtime": 2035.2921, "train_tokens_per_second": 9389.953 }, { "epoch": 2.6142322097378274, "eval_accuracy": 0.943552523084463, "eval_loss": 0.1688879132270813, "eval_runtime": 4.9275, "eval_samples_per_second": 11.568, "eval_steps_per_second": 3.044, "num_input_tokens_seen": 19111296, "step": 175 }, { "epoch": 2.629213483146067, "grad_norm": 0.22906967997550964, "learning_rate": 2.7474771691442018e-05, "loss": 0.0987, "num_input_tokens_seen": 19213824, "step": 176, "train_runtime": 2051.4065, "train_tokens_per_second": 9366.171 }, { "epoch": 2.644194756554307, "grad_norm": 0.2511672079563141, "learning_rate": 2.721499358705458e-05, "loss": 0.054, "num_input_tokens_seen": 19338104, "step": 177, "train_runtime": 2063.0236, "train_tokens_per_second": 9373.671 }, { "epoch": 2.6591760299625467, "grad_norm": 0.3024023771286011, "learning_rate": 2.6954974194910888e-05, "loss": 0.0683, "num_input_tokens_seen": 19449848, "step": 178, "train_runtime": 2074.6512, "train_tokens_per_second": 9374.997 }, { "epoch": 2.6741573033707864, "grad_norm": 0.1516094207763672, "learning_rate": 2.6694741839921732e-05, "loss": 0.1121, "num_input_tokens_seen": 19571008, "step": 179, "train_runtime": 2086.2509, "train_tokens_per_second": 9380.946 }, { "epoch": 2.689138576779026, "grad_norm": 0.20220297574996948, "learning_rate": 2.6434324870196748e-05, "loss": 0.0888, "num_input_tokens_seen": 19686872, "step": 180, "train_runtime": 2097.6886, "train_tokens_per_second": 9385.031 }, { "epoch": 2.704119850187266, "grad_norm": 0.504995584487915, "learning_rate": 2.617375165395634e-05, "loss": 0.0751, "num_input_tokens_seen": 19797960, "step": 181, "train_runtime": 2109.1207, "train_tokens_per_second": 9386.831 }, { "epoch": 2.7191011235955056, "grad_norm": 0.24343626201152802, "learning_rate": 2.5913050576441477e-05, "loss": 0.1033, "num_input_tokens_seen": 19905184, "step": 182, "train_runtime": 2120.593, "train_tokens_per_second": 9386.612 }, { "epoch": 2.7340823970037453, "grad_norm": 0.20072679221630096, "learning_rate": 2.5652250036821523e-05, "loss": 0.0867, "num_input_tokens_seen": 20013120, "step": 183, "train_runtime": 2132.1281, "train_tokens_per_second": 9386.453 }, { "epoch": 2.749063670411985, "grad_norm": 0.22807280719280243, "learning_rate": 2.5391378445100644e-05, "loss": 0.1323, "num_input_tokens_seen": 20109488, "step": 184, "train_runtime": 2143.2117, "train_tokens_per_second": 9382.875 }, { "epoch": 2.764044943820225, "grad_norm": 0.2813800275325775, "learning_rate": 2.5130464219022992e-05, "loss": 0.0935, "num_input_tokens_seen": 20227088, "step": 185, "train_runtime": 2154.761, "train_tokens_per_second": 9387.161 }, { "epoch": 2.7790262172284645, "grad_norm": 0.21835725009441376, "learning_rate": 2.486953578097702e-05, "loss": 0.095, "num_input_tokens_seen": 20330176, "step": 186, "train_runtime": 2166.2477, "train_tokens_per_second": 9384.973 }, { "epoch": 2.7940074906367043, "grad_norm": 0.23183397948741913, "learning_rate": 2.4608621554899362e-05, "loss": 0.1094, "num_input_tokens_seen": 20448288, "step": 187, "train_runtime": 2177.8806, "train_tokens_per_second": 9389.077 }, { "epoch": 2.808988764044944, "grad_norm": 0.24513450264930725, "learning_rate": 2.4347749963178486e-05, "loss": 0.094, "num_input_tokens_seen": 20552120, "step": 188, "train_runtime": 2189.3542, "train_tokens_per_second": 9387.298 }, { "epoch": 2.8239700374531838, "grad_norm": 0.23791368305683136, "learning_rate": 2.4086949423558526e-05, "loss": 0.0948, "num_input_tokens_seen": 20664640, "step": 189, "train_runtime": 2201.0212, "train_tokens_per_second": 9388.66 }, { "epoch": 2.8389513108614235, "grad_norm": 0.2168198823928833, "learning_rate": 2.3826248346043663e-05, "loss": 0.0838, "num_input_tokens_seen": 20777328, "step": 190, "train_runtime": 2212.6238, "train_tokens_per_second": 9390.357 }, { "epoch": 2.853932584269663, "grad_norm": 0.20405888557434082, "learning_rate": 2.356567512980326e-05, "loss": 0.1071, "num_input_tokens_seen": 20895424, "step": 191, "train_runtime": 2224.2739, "train_tokens_per_second": 9394.267 }, { "epoch": 2.8689138576779025, "grad_norm": 0.21420727670192719, "learning_rate": 2.3305258160078274e-05, "loss": 0.0939, "num_input_tokens_seen": 21007912, "step": 192, "train_runtime": 2235.8311, "train_tokens_per_second": 9396.019 }, { "epoch": 2.8838951310861423, "grad_norm": 0.27938148379325867, "learning_rate": 2.3045025805089118e-05, "loss": 0.1093, "num_input_tokens_seen": 21112424, "step": 193, "train_runtime": 2246.9601, "train_tokens_per_second": 9395.994 }, { "epoch": 2.898876404494382, "grad_norm": 0.3431660830974579, "learning_rate": 2.278500641294543e-05, "loss": 0.1156, "num_input_tokens_seen": 21221136, "step": 194, "train_runtime": 2258.1117, "train_tokens_per_second": 9397.735 }, { "epoch": 2.9138576779026217, "grad_norm": 0.40309008955955505, "learning_rate": 2.252522830855798e-05, "loss": 0.0693, "num_input_tokens_seen": 21331720, "step": 195, "train_runtime": 2269.7189, "train_tokens_per_second": 9398.397 }, { "epoch": 2.9288389513108615, "grad_norm": 0.1703529804944992, "learning_rate": 2.2265719790553147e-05, "loss": 0.0907, "num_input_tokens_seen": 21447512, "step": 196, "train_runtime": 2281.3042, "train_tokens_per_second": 9401.426 }, { "epoch": 2.943820224719101, "grad_norm": 0.1772414594888687, "learning_rate": 2.2006509128190195e-05, "loss": 0.0821, "num_input_tokens_seen": 21553192, "step": 197, "train_runtime": 2292.9216, "train_tokens_per_second": 9399.882 }, { "epoch": 2.958801498127341, "grad_norm": 0.22802460193634033, "learning_rate": 2.174762455828187e-05, "loss": 0.1252, "num_input_tokens_seen": 21655488, "step": 198, "train_runtime": 2304.2302, "train_tokens_per_second": 9398.144 }, { "epoch": 2.9737827715355807, "grad_norm": 0.20786331593990326, "learning_rate": 2.1489094282118395e-05, "loss": 0.0859, "num_input_tokens_seen": 21767256, "step": 199, "train_runtime": 2315.7419, "train_tokens_per_second": 9399.69 }, { "epoch": 2.98876404494382, "grad_norm": 0.23133137822151184, "learning_rate": 2.123094646239541e-05, "loss": 0.1024, "num_input_tokens_seen": 21879928, "step": 200, "train_runtime": 2327.2273, "train_tokens_per_second": 9401.715 }, { "epoch": 2.98876404494382, "eval_accuracy": 0.9457574927378564, "eval_loss": 0.1642482578754425, "eval_runtime": 4.9178, "eval_samples_per_second": 11.59, "eval_steps_per_second": 3.05, "num_input_tokens_seen": 21879928, "step": 200 }, { "epoch": 3.0, "grad_norm": 0.3909849524497986, "learning_rate": 2.0973209220146135e-05, "loss": 0.1114, "num_input_tokens_seen": 21962520, "step": 201, "train_runtime": 2341.9985, "train_tokens_per_second": 9377.683 }, { "epoch": 3.0149812734082397, "grad_norm": 0.2800769805908203, "learning_rate": 2.0715910631677968e-05, "loss": 0.0762, "num_input_tokens_seen": 22064872, "step": 202, "train_runtime": 2353.5238, "train_tokens_per_second": 9375.249 }, { "epoch": 3.0299625468164795, "grad_norm": 0.17772997915744781, "learning_rate": 2.0459078725514092e-05, "loss": 0.0883, "num_input_tokens_seen": 22169728, "step": 203, "train_runtime": 2364.4445, "train_tokens_per_second": 9376.295 }, { "epoch": 3.044943820224719, "grad_norm": 0.21998678147792816, "learning_rate": 2.020274147934019e-05, "loss": 0.0756, "num_input_tokens_seen": 22285928, "step": 204, "train_runtime": 2376.0541, "train_tokens_per_second": 9379.386 }, { "epoch": 3.059925093632959, "grad_norm": 0.2388005405664444, "learning_rate": 1.9946926816956743e-05, "loss": 0.0887, "num_input_tokens_seen": 22387040, "step": 205, "train_runtime": 2387.1231, "train_tokens_per_second": 9378.251 }, { "epoch": 3.0749063670411987, "grad_norm": 0.2441033124923706, "learning_rate": 1.9691662605237166e-05, "loss": 0.0926, "num_input_tokens_seen": 22498720, "step": 206, "train_runtime": 2398.4091, "train_tokens_per_second": 9380.685 }, { "epoch": 3.0898876404494384, "grad_norm": 0.16214871406555176, "learning_rate": 1.9436976651092144e-05, "loss": 0.1224, "num_input_tokens_seen": 22621072, "step": 207, "train_runtime": 2410.0394, "train_tokens_per_second": 9386.184 }, { "epoch": 3.1048689138576777, "grad_norm": 0.39781343936920166, "learning_rate": 1.9182896698440584e-05, "loss": 0.0856, "num_input_tokens_seen": 22724704, "step": 208, "train_runtime": 2421.5513, "train_tokens_per_second": 9384.358 }, { "epoch": 3.1198501872659175, "grad_norm": 0.2390083521604538, "learning_rate": 1.89294504251873e-05, "loss": 0.0621, "num_input_tokens_seen": 22838936, "step": 209, "train_runtime": 2433.1409, "train_tokens_per_second": 9386.606 }, { "epoch": 3.134831460674157, "grad_norm": 0.2981702387332916, "learning_rate": 1.867666544020798e-05, "loss": 0.1196, "num_input_tokens_seen": 22939008, "step": 210, "train_runtime": 2444.3526, "train_tokens_per_second": 9384.492 }, { "epoch": 3.149812734082397, "grad_norm": 0.3137620687484741, "learning_rate": 1.8424569280341653e-05, "loss": 0.1071, "num_input_tokens_seen": 23054112, "step": 211, "train_runtime": 2455.964, "train_tokens_per_second": 9386.991 }, { "epoch": 3.1647940074906367, "grad_norm": 0.1887480467557907, "learning_rate": 1.817318940739098e-05, "loss": 0.0932, "num_input_tokens_seen": 23156632, "step": 212, "train_runtime": 2467.4786, "train_tokens_per_second": 9384.735 }, { "epoch": 3.1797752808988764, "grad_norm": 0.2509893774986267, "learning_rate": 1.7922553205130707e-05, "loss": 0.0792, "num_input_tokens_seen": 23271912, "step": 213, "train_runtime": 2479.0903, "train_tokens_per_second": 9387.279 }, { "epoch": 3.194756554307116, "grad_norm": 0.15841956436634064, "learning_rate": 1.767268797632472e-05, "loss": 0.0513, "num_input_tokens_seen": 23381816, "step": 214, "train_runtime": 2490.6787, "train_tokens_per_second": 9387.729 }, { "epoch": 3.209737827715356, "grad_norm": 0.18807053565979004, "learning_rate": 1.7423620939751788e-05, "loss": 0.0903, "num_input_tokens_seen": 23489552, "step": 215, "train_runtime": 2502.2201, "train_tokens_per_second": 9387.484 }, { "epoch": 3.2247191011235956, "grad_norm": 0.27771255373954773, "learning_rate": 1.7175379227240523e-05, "loss": 0.0763, "num_input_tokens_seen": 23602136, "step": 216, "train_runtime": 2513.7932, "train_tokens_per_second": 9389.053 }, { "epoch": 3.2397003745318353, "grad_norm": 0.23832547664642334, "learning_rate": 1.692798988071385e-05, "loss": 0.0656, "num_input_tokens_seen": 23705952, "step": 217, "train_runtime": 2525.2798, "train_tokens_per_second": 9387.456 }, { "epoch": 3.254681647940075, "grad_norm": 0.20118069648742676, "learning_rate": 1.6681479849243153e-05, "loss": 0.1015, "num_input_tokens_seen": 23821824, "step": 218, "train_runtime": 2536.8541, "train_tokens_per_second": 9390.301 }, { "epoch": 3.2696629213483144, "grad_norm": 0.24935227632522583, "learning_rate": 1.6435875986112685e-05, "loss": 0.1126, "num_input_tokens_seen": 23933400, "step": 219, "train_runtime": 2548.3936, "train_tokens_per_second": 9391.563 }, { "epoch": 3.284644194756554, "grad_norm": 0.20586885511875153, "learning_rate": 1.6191205045894283e-05, "loss": 0.0704, "num_input_tokens_seen": 24044912, "step": 220, "train_runtime": 2559.9579, "train_tokens_per_second": 9392.698 }, { "epoch": 3.299625468164794, "grad_norm": 0.1685791313648224, "learning_rate": 1.594749368153292e-05, "loss": 0.0695, "num_input_tokens_seen": 24165512, "step": 221, "train_runtime": 2571.5221, "train_tokens_per_second": 9397.357 }, { "epoch": 3.3146067415730336, "grad_norm": 0.21374556422233582, "learning_rate": 1.570476844144329e-05, "loss": 0.0775, "num_input_tokens_seen": 24265384, "step": 222, "train_runtime": 2582.824, "train_tokens_per_second": 9394.904 }, { "epoch": 3.3295880149812733, "grad_norm": 0.2579873204231262, "learning_rate": 1.546305576661776e-05, "loss": 0.0852, "num_input_tokens_seen": 24373048, "step": 223, "train_runtime": 2594.4338, "train_tokens_per_second": 9394.361 }, { "epoch": 3.344569288389513, "grad_norm": 0.2597576379776001, "learning_rate": 1.5222381987746104e-05, "loss": 0.0791, "num_input_tokens_seen": 24483840, "step": 224, "train_runtime": 2605.9851, "train_tokens_per_second": 9395.234 }, { "epoch": 3.359550561797753, "grad_norm": 0.26184743642807007, "learning_rate": 1.4982773322347144e-05, "loss": 0.0617, "num_input_tokens_seen": 24591096, "step": 225, "train_runtime": 2617.5518, "train_tokens_per_second": 9394.693 }, { "epoch": 3.359550561797753, "eval_accuracy": 0.9478207874272477, "eval_loss": 0.1583455204963684, "eval_runtime": 4.9331, "eval_samples_per_second": 11.555, "eval_steps_per_second": 3.041, "num_input_tokens_seen": 24591096, "step": 225 }, { "epoch": 3.3745318352059925, "grad_norm": 0.21227402985095978, "learning_rate": 1.4744255871912823e-05, "loss": 0.0616, "num_input_tokens_seen": 24690968, "step": 226, "train_runtime": 2633.5828, "train_tokens_per_second": 9375.429 }, { "epoch": 3.3895131086142323, "grad_norm": 0.21474283933639526, "learning_rate": 1.4506855619064846e-05, "loss": 0.0903, "num_input_tokens_seen": 24799096, "step": 227, "train_runtime": 2644.9253, "train_tokens_per_second": 9376.104 }, { "epoch": 3.404494382022472, "grad_norm": 0.287079393863678, "learning_rate": 1.4270598424724292e-05, "loss": 0.0394, "num_input_tokens_seen": 24909896, "step": 228, "train_runtime": 2656.516, "train_tokens_per_second": 9376.904 }, { "epoch": 3.4194756554307117, "grad_norm": 0.13763266801834106, "learning_rate": 1.4035510025294462e-05, "loss": 0.0985, "num_input_tokens_seen": 25020096, "step": 229, "train_runtime": 2668.2051, "train_tokens_per_second": 9377.126 }, { "epoch": 3.4344569288389515, "grad_norm": 0.5997536778450012, "learning_rate": 1.3801616029857378e-05, "loss": 0.0929, "num_input_tokens_seen": 25134904, "step": 230, "train_runtime": 2679.8291, "train_tokens_per_second": 9379.294 }, { "epoch": 3.449438202247191, "grad_norm": 0.20695947110652924, "learning_rate": 1.3568941917384036e-05, "loss": 0.0724, "num_input_tokens_seen": 25238032, "step": 231, "train_runtime": 2691.1959, "train_tokens_per_second": 9377.999 }, { "epoch": 3.464419475655431, "grad_norm": 0.20964248478412628, "learning_rate": 1.3337513033958904e-05, "loss": 0.0646, "num_input_tokens_seen": 25346080, "step": 232, "train_runtime": 2702.7843, "train_tokens_per_second": 9377.766 }, { "epoch": 3.4794007490636703, "grad_norm": 0.33709076046943665, "learning_rate": 1.310735459001884e-05, "loss": 0.0783, "num_input_tokens_seen": 25456760, "step": 233, "train_runtime": 2714.3037, "train_tokens_per_second": 9378.744 }, { "epoch": 3.49438202247191, "grad_norm": 0.26522397994995117, "learning_rate": 1.2878491657606872e-05, "loss": 0.0632, "num_input_tokens_seen": 25565392, "step": 234, "train_runtime": 2725.4324, "train_tokens_per_second": 9380.307 }, { "epoch": 3.5093632958801497, "grad_norm": 0.4283091425895691, "learning_rate": 1.2650949167640997e-05, "loss": 0.0887, "num_input_tokens_seen": 25678520, "step": 235, "train_runtime": 2737.0518, "train_tokens_per_second": 9381.817 }, { "epoch": 3.5243445692883895, "grad_norm": 0.20751389861106873, "learning_rate": 1.2424751907198312e-05, "loss": 0.094, "num_input_tokens_seen": 25789432, "step": 236, "train_runtime": 2748.5904, "train_tokens_per_second": 9382.785 }, { "epoch": 3.539325842696629, "grad_norm": 0.26652851700782776, "learning_rate": 1.2199924516814939e-05, "loss": 0.0623, "num_input_tokens_seen": 25893768, "step": 237, "train_runtime": 2760.1255, "train_tokens_per_second": 9381.373 }, { "epoch": 3.554307116104869, "grad_norm": 0.2422049343585968, "learning_rate": 1.1976491487801748e-05, "loss": 0.1051, "num_input_tokens_seen": 26005272, "step": 238, "train_runtime": 2771.6123, "train_tokens_per_second": 9382.723 }, { "epoch": 3.5692883895131087, "grad_norm": 0.20235666632652283, "learning_rate": 1.1754477159576499e-05, "loss": 0.069, "num_input_tokens_seen": 26112160, "step": 239, "train_runtime": 2783.0863, "train_tokens_per_second": 9382.447 }, { "epoch": 3.5842696629213484, "grad_norm": 0.6034291386604309, "learning_rate": 1.1533905717012424e-05, "loss": 0.0561, "num_input_tokens_seen": 26227496, "step": 240, "train_runtime": 2794.6305, "train_tokens_per_second": 9384.96 }, { "epoch": 3.599250936329588, "grad_norm": 0.17024967074394226, "learning_rate": 1.1314801187803686e-05, "loss": 0.0824, "num_input_tokens_seen": 26323944, "step": 241, "train_runtime": 2805.7185, "train_tokens_per_second": 9382.247 }, { "epoch": 3.6142322097378274, "grad_norm": 0.27029407024383545, "learning_rate": 1.1097187439847939e-05, "loss": 0.083, "num_input_tokens_seen": 26423816, "step": 242, "train_runtime": 2816.7296, "train_tokens_per_second": 9381.027 }, { "epoch": 3.629213483146067, "grad_norm": 0.20020800828933716, "learning_rate": 1.088108817864629e-05, "loss": 0.0969, "num_input_tokens_seen": 26530000, "step": 243, "train_runtime": 2828.3199, "train_tokens_per_second": 9380.127 }, { "epoch": 3.644194756554307, "grad_norm": 0.19449175894260406, "learning_rate": 1.0666526944721016e-05, "loss": 0.0487, "num_input_tokens_seen": 26639920, "step": 244, "train_runtime": 2839.8372, "train_tokens_per_second": 9380.791 }, { "epoch": 3.6591760299625467, "grad_norm": 0.19385063648223877, "learning_rate": 1.0453527111051184e-05, "loss": 0.0861, "num_input_tokens_seen": 26755952, "step": 245, "train_runtime": 2851.3786, "train_tokens_per_second": 9383.514 }, { "epoch": 3.6741573033707864, "grad_norm": 0.1747702807188034, "learning_rate": 1.0242111880526495e-05, "loss": 0.0879, "num_input_tokens_seen": 26867776, "step": 246, "train_runtime": 2862.9101, "train_tokens_per_second": 9384.778 }, { "epoch": 3.689138576779026, "grad_norm": 0.2870045304298401, "learning_rate": 1.003230428341979e-05, "loss": 0.081, "num_input_tokens_seen": 26975080, "step": 247, "train_runtime": 2874.3933, "train_tokens_per_second": 9384.617 }, { "epoch": 3.704119850187266, "grad_norm": 0.18543480336666107, "learning_rate": 9.824127174878195e-06, "loss": 0.0758, "num_input_tokens_seen": 27088208, "step": 248, "train_runtime": 2885.9347, "train_tokens_per_second": 9386.286 }, { "epoch": 3.7191011235955056, "grad_norm": 0.4089682698249817, "learning_rate": 9.617603232433475e-06, "loss": 0.1284, "num_input_tokens_seen": 27199040, "step": 249, "train_runtime": 2897.423, "train_tokens_per_second": 9387.321 }, { "epoch": 3.7340823970037453, "grad_norm": 0.23248536884784698, "learning_rate": 9.412754953531663e-06, "loss": 0.0883, "num_input_tokens_seen": 27307192, "step": 250, "train_runtime": 2909.0185, "train_tokens_per_second": 9387.081 }, { "epoch": 3.7340823970037453, "eval_accuracy": 0.9487738647538684, "eval_loss": 0.15280824899673462, "eval_runtime": 4.9317, "eval_samples_per_second": 11.558, "eval_steps_per_second": 3.042, "num_input_tokens_seen": 27307192, "step": 250 }, { "epoch": 3.749063670411985, "grad_norm": 0.2929266691207886, "learning_rate": 9.209604653082326e-06, "loss": 0.0618, "num_input_tokens_seen": 27419216, "step": 251, "train_runtime": 2925.3909, "train_tokens_per_second": 9372.838 }, { "epoch": 3.764044943820225, "grad_norm": 0.18417492508888245, "learning_rate": 9.008174461027724e-06, "loss": 0.0664, "num_input_tokens_seen": 27534416, "step": 252, "train_runtime": 2936.9289, "train_tokens_per_second": 9375.241 }, { "epoch": 3.7790262172284645, "grad_norm": 0.1642679125070572, "learning_rate": 8.808486319932083e-06, "loss": 0.0691, "num_input_tokens_seen": 27650456, "step": 253, "train_runtime": 2948.5436, "train_tokens_per_second": 9377.666 }, { "epoch": 3.7940074906367043, "grad_norm": 0.27271735668182373, "learning_rate": 8.610561982591357e-06, "loss": 0.1072, "num_input_tokens_seen": 27766296, "step": 254, "train_runtime": 2960.1358, "train_tokens_per_second": 9380.075 }, { "epoch": 3.808988764044944, "grad_norm": 0.2314184457063675, "learning_rate": 8.414423009663563e-06, "loss": 0.1113, "num_input_tokens_seen": 27877960, "step": 255, "train_runtime": 2971.7254, "train_tokens_per_second": 9381.069 }, { "epoch": 3.8239700374531838, "grad_norm": 4.12896203994751, "learning_rate": 8.220090767320137e-06, "loss": 0.0787, "num_input_tokens_seen": 27992400, "step": 256, "train_runtime": 2983.2661, "train_tokens_per_second": 9383.139 }, { "epoch": 3.8389513108614235, "grad_norm": 0.18129047751426697, "learning_rate": 8.027586424918412e-06, "loss": 0.0436, "num_input_tokens_seen": 28099232, "step": 257, "train_runtime": 2994.7467, "train_tokens_per_second": 9382.841 }, { "epoch": 3.853932584269663, "grad_norm": 0.1452488899230957, "learning_rate": 7.836930952695533e-06, "loss": 0.0761, "num_input_tokens_seen": 28212712, "step": 258, "train_runtime": 3006.2908, "train_tokens_per_second": 9384.559 }, { "epoch": 3.8689138576779025, "grad_norm": 0.22081199288368225, "learning_rate": 7.648145119484152e-06, "loss": 0.0876, "num_input_tokens_seen": 28327232, "step": 259, "train_runtime": 3017.7295, "train_tokens_per_second": 9386.935 }, { "epoch": 3.8838951310861423, "grad_norm": 0.1896650493144989, "learning_rate": 7.461249490449954e-06, "loss": 0.0689, "num_input_tokens_seen": 28444136, "step": 260, "train_runtime": 3029.4265, "train_tokens_per_second": 9389.281 }, { "epoch": 3.898876404494382, "grad_norm": 0.3891507685184479, "learning_rate": 7.2762644248514255e-06, "loss": 0.0934, "num_input_tokens_seen": 28553608, "step": 261, "train_runtime": 3040.9155, "train_tokens_per_second": 9389.806 }, { "epoch": 3.9138576779026217, "grad_norm": 0.20802177488803864, "learning_rate": 7.0932100738220265e-06, "loss": 0.0616, "num_input_tokens_seen": 28655944, "step": 262, "train_runtime": 3052.4258, "train_tokens_per_second": 9387.925 }, { "epoch": 3.9288389513108615, "grad_norm": 0.162275493144989, "learning_rate": 6.912106378175098e-06, "loss": 0.0505, "num_input_tokens_seen": 28770240, "step": 263, "train_runtime": 3063.975, "train_tokens_per_second": 9389.842 }, { "epoch": 3.943820224719101, "grad_norm": 0.1334082931280136, "learning_rate": 6.732973066231563e-06, "loss": 0.0716, "num_input_tokens_seen": 28879896, "step": 264, "train_runtime": 3075.4811, "train_tokens_per_second": 9390.367 }, { "epoch": 3.958801498127341, "grad_norm": 0.24781842529773712, "learning_rate": 6.555829651670911e-06, "loss": 0.0925, "num_input_tokens_seen": 28979616, "step": 265, "train_runtime": 3086.8721, "train_tokens_per_second": 9388.02 }, { "epoch": 3.9737827715355807, "grad_norm": 0.25727924704551697, "learning_rate": 6.380695431405453e-06, "loss": 0.082, "num_input_tokens_seen": 29095336, "step": 266, "train_runtime": 3098.3306, "train_tokens_per_second": 9390.649 }, { "epoch": 3.98876404494382, "grad_norm": 0.20583029091358185, "learning_rate": 6.207589483478266e-06, "loss": 0.1735, "num_input_tokens_seen": 29200208, "step": 267, "train_runtime": 3109.8583, "train_tokens_per_second": 9389.562 }, { "epoch": 4.0, "grad_norm": 0.23410587012767792, "learning_rate": 6.0365306649849214e-06, "loss": 0.0554, "num_input_tokens_seen": 29282608, "step": 268, "train_runtime": 3119.7239, "train_tokens_per_second": 9386.282 }, { "epoch": 4.01498127340824, "grad_norm": 0.08256790041923523, "learning_rate": 5.867537610019317e-06, "loss": 0.0374, "num_input_tokens_seen": 29391848, "step": 269, "train_runtime": 3131.2887, "train_tokens_per_second": 9386.502 }, { "epoch": 4.0299625468164795, "grad_norm": 0.20500850677490234, "learning_rate": 5.700628727643806e-06, "loss": 0.0644, "num_input_tokens_seen": 29507360, "step": 270, "train_runtime": 3142.8457, "train_tokens_per_second": 9388.74 }, { "epoch": 4.044943820224719, "grad_norm": 0.1724829226732254, "learning_rate": 5.53582219988382e-06, "loss": 0.0621, "num_input_tokens_seen": 29607936, "step": 271, "train_runtime": 3154.3349, "train_tokens_per_second": 9386.428 }, { "epoch": 4.059925093632959, "grad_norm": 0.17760220170021057, "learning_rate": 5.373135979747227e-06, "loss": 0.0525, "num_input_tokens_seen": 29710240, "step": 272, "train_runtime": 3165.4438, "train_tokens_per_second": 9385.806 }, { "epoch": 4.074906367041199, "grad_norm": 0.20548486709594727, "learning_rate": 5.2125877892686496e-06, "loss": 0.072, "num_input_tokens_seen": 29819600, "step": 273, "train_runtime": 3176.9528, "train_tokens_per_second": 9386.227 }, { "epoch": 4.089887640449438, "grad_norm": 2.707559108734131, "learning_rate": 5.054195117578914e-06, "loss": 0.1253, "num_input_tokens_seen": 29927712, "step": 274, "train_runtime": 3188.3808, "train_tokens_per_second": 9386.492 }, { "epoch": 4.104868913857678, "grad_norm": 0.19858214259147644, "learning_rate": 4.897975218999926e-06, "loss": 0.0516, "num_input_tokens_seen": 30036912, "step": 275, "train_runtime": 3199.9225, "train_tokens_per_second": 9386.762 }, { "epoch": 4.104868913857678, "eval_accuracy": 0.9503720481817801, "eval_loss": 0.148418128490448, "eval_runtime": 4.9281, "eval_samples_per_second": 11.566, "eval_steps_per_second": 3.044, "num_input_tokens_seen": 30036912, "step": 275 }, { "epoch": 4.119850187265918, "grad_norm": 0.20856685936450958, "learning_rate": 4.743945111165068e-06, "loss": 0.0597, "num_input_tokens_seen": 30142632, "step": 276, "train_runtime": 3216.4293, "train_tokens_per_second": 9371.458 }, { "epoch": 4.134831460674158, "grad_norm": 0.15552882850170135, "learning_rate": 4.592121573165414e-06, "loss": 0.0481, "num_input_tokens_seen": 30249816, "step": 277, "train_runtime": 3228.0168, "train_tokens_per_second": 9371.022 }, { "epoch": 4.149812734082397, "grad_norm": 0.19117474555969238, "learning_rate": 4.442521143721892e-06, "loss": 0.0528, "num_input_tokens_seen": 30360248, "step": 278, "train_runtime": 3239.5535, "train_tokens_per_second": 9371.738 }, { "epoch": 4.164794007490637, "grad_norm": 0.1939282864332199, "learning_rate": 4.295160119383712e-06, "loss": 0.0558, "num_input_tokens_seen": 30466592, "step": 279, "train_runtime": 3251.0279, "train_tokens_per_second": 9371.372 }, { "epoch": 4.179775280898877, "grad_norm": 0.21391624212265015, "learning_rate": 4.150054552753055e-06, "loss": 0.0739, "num_input_tokens_seen": 30567952, "step": 280, "train_runtime": 3262.5285, "train_tokens_per_second": 9369.405 }, { "epoch": 4.194756554307116, "grad_norm": 0.18282581865787506, "learning_rate": 4.007220250736454e-06, "loss": 0.059, "num_input_tokens_seen": 30674984, "step": 281, "train_runtime": 3274.0659, "train_tokens_per_second": 9369.08 }, { "epoch": 4.209737827715355, "grad_norm": 0.5102422833442688, "learning_rate": 3.866672772822863e-06, "loss": 0.0275, "num_input_tokens_seen": 30791864, "step": 282, "train_runtime": 3285.6956, "train_tokens_per_second": 9371.49 }, { "epoch": 4.224719101123595, "grad_norm": 0.15346960723400116, "learning_rate": 3.728427429388709e-06, "loss": 0.041, "num_input_tokens_seen": 30908384, "step": 283, "train_runtime": 3297.3237, "train_tokens_per_second": 9373.779 }, { "epoch": 4.239700374531835, "grad_norm": 0.17301329970359802, "learning_rate": 3.592499280030057e-06, "loss": 0.0492, "num_input_tokens_seen": 31023848, "step": 284, "train_runtime": 3308.9234, "train_tokens_per_second": 9375.813 }, { "epoch": 4.254681647940075, "grad_norm": 0.1514940708875656, "learning_rate": 3.458903131922134e-06, "loss": 0.0555, "num_input_tokens_seen": 31137384, "step": 285, "train_runtime": 3320.5419, "train_tokens_per_second": 9377.199 }, { "epoch": 4.269662921348314, "grad_norm": 0.18485209345817566, "learning_rate": 3.3276535382063183e-06, "loss": 0.0493, "num_input_tokens_seen": 31244936, "step": 286, "train_runtime": 3332.0917, "train_tokens_per_second": 9376.974 }, { "epoch": 4.284644194756554, "grad_norm": 0.200953871011734, "learning_rate": 3.198764796404807e-06, "loss": 0.0492, "num_input_tokens_seen": 31355616, "step": 287, "train_runtime": 3343.5886, "train_tokens_per_second": 9377.833 }, { "epoch": 4.299625468164794, "grad_norm": 0.21441112458705902, "learning_rate": 3.0722509468631392e-06, "loss": 0.0649, "num_input_tokens_seen": 31463648, "step": 288, "train_runtime": 3354.9275, "train_tokens_per_second": 9378.339 }, { "epoch": 4.314606741573034, "grad_norm": 0.1840512454509735, "learning_rate": 2.948125771220697e-06, "loss": 0.0481, "num_input_tokens_seen": 31577056, "step": 289, "train_runtime": 3366.6336, "train_tokens_per_second": 9379.416 }, { "epoch": 4.329588014981273, "grad_norm": 0.166469007730484, "learning_rate": 2.8264027909094715e-06, "loss": 0.0455, "num_input_tokens_seen": 31682424, "step": 290, "train_runtime": 3378.1026, "train_tokens_per_second": 9378.763 }, { "epoch": 4.344569288389513, "grad_norm": 0.23863935470581055, "learning_rate": 2.707095265681081e-06, "loss": 0.0588, "num_input_tokens_seen": 31790168, "step": 291, "train_runtime": 3389.5951, "train_tokens_per_second": 9378.751 }, { "epoch": 4.359550561797753, "grad_norm": 0.22671280801296234, "learning_rate": 2.5902161921623454e-06, "loss": 0.0553, "num_input_tokens_seen": 31905520, "step": 292, "train_runtime": 3401.1676, "train_tokens_per_second": 9380.755 }, { "epoch": 4.3745318352059925, "grad_norm": 0.19666582345962524, "learning_rate": 2.475778302439524e-06, "loss": 0.0452, "num_input_tokens_seen": 32020200, "step": 293, "train_runtime": 3412.8228, "train_tokens_per_second": 9382.321 }, { "epoch": 4.389513108614232, "grad_norm": 0.30095529556274414, "learning_rate": 2.3637940626713346e-06, "loss": 0.0707, "num_input_tokens_seen": 32129744, "step": 294, "train_runtime": 3424.2939, "train_tokens_per_second": 9382.881 }, { "epoch": 4.404494382022472, "grad_norm": 0.21905633807182312, "learning_rate": 2.254275671731007e-06, "loss": 0.0611, "num_input_tokens_seen": 32247024, "step": 295, "train_runtime": 3435.8655, "train_tokens_per_second": 9385.415 }, { "epoch": 4.419475655430712, "grad_norm": 0.18735012412071228, "learning_rate": 2.14723505987737e-06, "loss": 0.058, "num_input_tokens_seen": 32361392, "step": 296, "train_runtime": 3447.3852, "train_tokens_per_second": 9387.228 }, { "epoch": 4.4344569288389515, "grad_norm": 0.18301299214363098, "learning_rate": 2.0426838874552714e-06, "loss": 0.0571, "num_input_tokens_seen": 32469248, "step": 297, "train_runtime": 3458.8947, "train_tokens_per_second": 9387.174 }, { "epoch": 4.449438202247191, "grad_norm": 0.09776000678539276, "learning_rate": 1.9406335436253724e-06, "loss": 0.0364, "num_input_tokens_seen": 32582736, "step": 298, "train_runtime": 3470.5436, "train_tokens_per_second": 9388.367 }, { "epoch": 4.464419475655431, "grad_norm": 0.15819956362247467, "learning_rate": 1.8410951451234533e-06, "loss": 0.034, "num_input_tokens_seen": 32691704, "step": 299, "train_runtime": 3481.9348, "train_tokens_per_second": 9388.948 }, { "epoch": 4.479400749063671, "grad_norm": 0.22488094866275787, "learning_rate": 1.7440795350494588e-06, "loss": 0.0675, "num_input_tokens_seen": 32807520, "step": 300, "train_runtime": 3493.4629, "train_tokens_per_second": 9391.117 }, { "epoch": 4.479400749063671, "eval_accuracy": 0.9514037008261675, "eval_loss": 0.14898425340652466, "eval_runtime": 4.93, "eval_samples_per_second": 11.562, "eval_steps_per_second": 3.043, "num_input_tokens_seen": 32807520, "step": 300 }, { "epoch": 4.49438202247191, "grad_norm": 0.16308434307575226, "learning_rate": 1.649597281686302e-06, "loss": 0.0563, "num_input_tokens_seen": 32917472, "step": 301, "train_runtime": 3509.8805, "train_tokens_per_second": 9378.516 }, { "epoch": 4.50936329588015, "grad_norm": 0.1949169635772705, "learning_rate": 1.5576586773486195e-06, "loss": 0.0582, "num_input_tokens_seen": 33026552, "step": 302, "train_runtime": 3521.2763, "train_tokens_per_second": 9379.143 }, { "epoch": 4.52434456928839, "grad_norm": 0.20031088590621948, "learning_rate": 1.4682737372615967e-06, "loss": 0.048, "num_input_tokens_seen": 33135312, "step": 303, "train_runtime": 3532.5735, "train_tokens_per_second": 9379.936 }, { "epoch": 4.539325842696629, "grad_norm": 0.16514037549495697, "learning_rate": 1.3814521984699596e-06, "loss": 0.0556, "num_input_tokens_seen": 33249640, "step": 304, "train_runtime": 3544.1498, "train_tokens_per_second": 9381.556 }, { "epoch": 4.554307116104869, "grad_norm": 0.17982099950313568, "learning_rate": 1.297203518777293e-06, "loss": 0.0427, "num_input_tokens_seen": 33356584, "step": 305, "train_runtime": 3555.468, "train_tokens_per_second": 9381.77 }, { "epoch": 4.569288389513108, "grad_norm": 0.30250856280326843, "learning_rate": 1.2155368757157643e-06, "loss": 0.095, "num_input_tokens_seen": 33465096, "step": 306, "train_runtime": 3567.021, "train_tokens_per_second": 9381.805 }, { "epoch": 4.584269662921348, "grad_norm": 0.14334945380687714, "learning_rate": 1.1364611655463736e-06, "loss": 0.0329, "num_input_tokens_seen": 33589904, "step": 307, "train_runtime": 3578.618, "train_tokens_per_second": 9386.278 }, { "epoch": 4.599250936329588, "grad_norm": 0.11703667044639587, "learning_rate": 1.0599850022898539e-06, "loss": 0.048, "num_input_tokens_seen": 33693528, "step": 308, "train_runtime": 3590.015, "train_tokens_per_second": 9385.345 }, { "epoch": 4.614232209737827, "grad_norm": 0.25832632184028625, "learning_rate": 9.861167167883046e-07, "loss": 0.0709, "num_input_tokens_seen": 33800928, "step": 309, "train_runtime": 3601.3539, "train_tokens_per_second": 9385.617 }, { "epoch": 4.629213483146067, "grad_norm": 0.2718651592731476, "learning_rate": 9.148643557976955e-07, "loss": 0.0807, "num_input_tokens_seen": 33904464, "step": 310, "train_runtime": 3612.5883, "train_tokens_per_second": 9385.089 }, { "epoch": 4.644194756554307, "grad_norm": 0.14838100969791412, "learning_rate": 8.462356811112987e-07, "loss": 0.0501, "num_input_tokens_seen": 34020608, "step": 311, "train_runtime": 3623.9265, "train_tokens_per_second": 9387.775 }, { "epoch": 4.659176029962547, "grad_norm": 0.2020755410194397, "learning_rate": 7.802381687141535e-07, "loss": 0.0499, "num_input_tokens_seen": 34129480, "step": 312, "train_runtime": 3635.4612, "train_tokens_per_second": 9387.937 }, { "epoch": 4.674157303370786, "grad_norm": 0.2344108372926712, "learning_rate": 7.168790079686932e-07, "loss": 0.086, "num_input_tokens_seen": 34229672, "step": 313, "train_runtime": 3646.7065, "train_tokens_per_second": 9386.462 }, { "epoch": 4.689138576779026, "grad_norm": 0.21764852106571198, "learning_rate": 6.561651008315738e-07, "loss": 0.0711, "num_input_tokens_seen": 34335640, "step": 314, "train_runtime": 3657.926, "train_tokens_per_second": 9386.642 }, { "epoch": 4.704119850187266, "grad_norm": 0.1388695240020752, "learning_rate": 5.981030611018234e-07, "loss": 0.0417, "num_input_tokens_seen": 34431984, "step": 315, "train_runtime": 3669.1157, "train_tokens_per_second": 9384.273 }, { "epoch": 4.719101123595506, "grad_norm": 0.19717121124267578, "learning_rate": 5.426992137003622e-07, "loss": 0.0668, "num_input_tokens_seen": 34547560, "step": 316, "train_runtime": 3680.779, "train_tokens_per_second": 9385.937 }, { "epoch": 4.734082397003745, "grad_norm": 0.1789878010749817, "learning_rate": 4.899595939810236e-07, "loss": 0.0582, "num_input_tokens_seen": 34651384, "step": 317, "train_runtime": 3691.8469, "train_tokens_per_second": 9385.921 }, { "epoch": 4.749063670411985, "grad_norm": 0.20900003612041473, "learning_rate": 4.398899470730827e-07, "loss": 0.0559, "num_input_tokens_seen": 34759152, "step": 318, "train_runtime": 3703.8343, "train_tokens_per_second": 9384.64 }, { "epoch": 4.764044943820225, "grad_norm": 0.18526972830295563, "learning_rate": 3.9249572725543196e-07, "loss": 0.0529, "num_input_tokens_seen": 34874632, "step": 319, "train_runtime": 3715.8268, "train_tokens_per_second": 9385.43 }, { "epoch": 4.7790262172284645, "grad_norm": 0.18614766001701355, "learning_rate": 3.477820973624063e-07, "loss": 0.0524, "num_input_tokens_seen": 34988104, "step": 320, "train_runtime": 3727.7427, "train_tokens_per_second": 9385.869 }, { "epoch": 4.794007490636704, "grad_norm": 0.2322590947151184, "learning_rate": 3.0575392822139726e-07, "loss": 0.0521, "num_input_tokens_seen": 35096592, "step": 321, "train_runtime": 3739.5414, "train_tokens_per_second": 9385.266 }, { "epoch": 4.808988764044944, "grad_norm": 0.1705033928155899, "learning_rate": 2.664157981222437e-07, "loss": 0.0796, "num_input_tokens_seen": 35211304, "step": 322, "train_runtime": 3751.4526, "train_tokens_per_second": 9386.045 }, { "epoch": 4.823970037453184, "grad_norm": 0.26094940304756165, "learning_rate": 2.297719923185032e-07, "loss": 0.0674, "num_input_tokens_seen": 35323056, "step": 323, "train_runtime": 3763.0198, "train_tokens_per_second": 9386.891 }, { "epoch": 4.8389513108614235, "grad_norm": 0.3194412291049957, "learning_rate": 1.9582650256064205e-07, "loss": 0.0803, "num_input_tokens_seen": 35436552, "step": 324, "train_runtime": 3774.8806, "train_tokens_per_second": 9387.463 }, { "epoch": 4.853932584269663, "grad_norm": 0.2129000723361969, "learning_rate": 1.645830266611914e-07, "loss": 0.0626, "num_input_tokens_seen": 35549872, "step": 325, "train_runtime": 3786.8829, "train_tokens_per_second": 9387.634 }, { "epoch": 4.853932584269663, "eval_accuracy": 0.9510650506531415, "eval_loss": 0.14768485724925995, "eval_runtime": 4.9341, "eval_samples_per_second": 11.552, "eval_steps_per_second": 3.04, "num_input_tokens_seen": 35549872, "step": 325 }, { "epoch": 4.868913857677903, "grad_norm": 0.2143908590078354, "learning_rate": 1.3604496809195288e-07, "loss": 0.0551, "num_input_tokens_seen": 35659600, "step": 326, "train_runtime": 3803.6163, "train_tokens_per_second": 9375.183 }, { "epoch": 4.883895131086143, "grad_norm": 0.15451987087726593, "learning_rate": 1.1021543561322012e-07, "loss": 0.0536, "num_input_tokens_seen": 35770904, "step": 327, "train_runtime": 3815.5627, "train_tokens_per_second": 9375.001 }, { "epoch": 4.898876404494382, "grad_norm": 0.19724752008914948, "learning_rate": 8.709724293513854e-08, "loss": 0.0664, "num_input_tokens_seen": 35879784, "step": 328, "train_runtime": 3827.5139, "train_tokens_per_second": 9374.175 }, { "epoch": 4.913857677902621, "grad_norm": 0.23586316406726837, "learning_rate": 6.66929084112089e-08, "loss": 0.0641, "num_input_tokens_seen": 35988344, "step": 329, "train_runtime": 3839.4639, "train_tokens_per_second": 9373.273 }, { "epoch": 4.928838951310862, "grad_norm": 0.19379207491874695, "learning_rate": 4.900465476393168e-08, "loss": 0.0624, "num_input_tokens_seen": 36093032, "step": 330, "train_runtime": 3851.3388, "train_tokens_per_second": 9371.555 }, { "epoch": 4.943820224719101, "grad_norm": 0.19698284566402435, "learning_rate": 3.403440884269526e-08, "loss": 0.0484, "num_input_tokens_seen": 36199864, "step": 331, "train_runtime": 3863.2488, "train_tokens_per_second": 9370.317 }, { "epoch": 4.9588014981273405, "grad_norm": 0.19212862849235535, "learning_rate": 2.1783801413866046e-08, "loss": 0.0649, "num_input_tokens_seen": 36302712, "step": 332, "train_runtime": 3875.1667, "train_tokens_per_second": 9368.039 }, { "epoch": 4.97378277153558, "grad_norm": 0.3132294714450836, "learning_rate": 1.2254166983152737e-08, "loss": 0.0684, "num_input_tokens_seen": 36412088, "step": 333, "train_runtime": 3886.7479, "train_tokens_per_second": 9368.266 }, { "epoch": 4.98876404494382, "grad_norm": 0.24174365401268005, "learning_rate": 5.446543650219904e-09, "loss": 0.0744, "num_input_tokens_seen": 36523328, "step": 334, "train_runtime": 3898.6721, "train_tokens_per_second": 9368.146 }, { "epoch": 5.0, "grad_norm": 0.24174365401268005, "learning_rate": 1.3616729956228425e-09, "loss": 0.0815, "num_input_tokens_seen": 36600520, "step": 335, "train_runtime": 3903.2788, "train_tokens_per_second": 9376.865 }, { "epoch": 5.0, "num_input_tokens_seen": 36600520, "step": 335, "total_flos": 1.6620454705385964e+18, "train_loss": 0.11312562568641421, "train_runtime": 4017.5693, "train_samples_per_second": 1.325, "train_steps_per_second": 0.083 } ], "logging_steps": 1, "max_steps": 335, "num_input_tokens_seen": 36600520, "num_train_epochs": 5, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.6620454705385964e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }