3525 lines
100 KiB
JSON
3525 lines
100 KiB
JSON
{
|
|
"best_global_step": null,
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 5.0,
|
|
"eval_steps": 25,
|
|
"global_step": 335,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"epoch": 0.0149812734082397,
|
|
"grad_norm": 3.1280667781829834,
|
|
"learning_rate": 0.0,
|
|
"loss": 0.287,
|
|
"num_input_tokens_seen": 106224,
|
|
"step": 1,
|
|
"train_runtime": 24.4359,
|
|
"train_tokens_per_second": 4347.041
|
|
},
|
|
{
|
|
"epoch": 0.0299625468164794,
|
|
"grad_norm": 1.6757160425186157,
|
|
"learning_rate": 1.4705882352941177e-06,
|
|
"loss": 0.1593,
|
|
"num_input_tokens_seen": 222384,
|
|
"step": 2,
|
|
"train_runtime": 34.7317,
|
|
"train_tokens_per_second": 6402.905
|
|
},
|
|
{
|
|
"epoch": 0.0449438202247191,
|
|
"grad_norm": 2.345803737640381,
|
|
"learning_rate": 2.9411764705882355e-06,
|
|
"loss": 0.1572,
|
|
"num_input_tokens_seen": 332728,
|
|
"step": 3,
|
|
"train_runtime": 45.0408,
|
|
"train_tokens_per_second": 7387.255
|
|
},
|
|
{
|
|
"epoch": 0.0599250936329588,
|
|
"grad_norm": 3.6148295402526855,
|
|
"learning_rate": 4.411764705882353e-06,
|
|
"loss": 0.2334,
|
|
"num_input_tokens_seen": 439824,
|
|
"step": 4,
|
|
"train_runtime": 55.4327,
|
|
"train_tokens_per_second": 7934.377
|
|
},
|
|
{
|
|
"epoch": 0.0749063670411985,
|
|
"grad_norm": 1.2359391450881958,
|
|
"learning_rate": 5.882352941176471e-06,
|
|
"loss": 0.0885,
|
|
"num_input_tokens_seen": 552832,
|
|
"step": 5,
|
|
"train_runtime": 65.9903,
|
|
"train_tokens_per_second": 8377.468
|
|
},
|
|
{
|
|
"epoch": 0.0898876404494382,
|
|
"grad_norm": 1.303969144821167,
|
|
"learning_rate": 7.3529411764705884e-06,
|
|
"loss": 0.1054,
|
|
"num_input_tokens_seen": 667008,
|
|
"step": 6,
|
|
"train_runtime": 77.8897,
|
|
"train_tokens_per_second": 8563.494
|
|
},
|
|
{
|
|
"epoch": 0.10486891385767791,
|
|
"grad_norm": 2.0778756141662598,
|
|
"learning_rate": 8.823529411764707e-06,
|
|
"loss": 0.2192,
|
|
"num_input_tokens_seen": 779728,
|
|
"step": 7,
|
|
"train_runtime": 89.3699,
|
|
"train_tokens_per_second": 8724.73
|
|
},
|
|
{
|
|
"epoch": 0.1198501872659176,
|
|
"grad_norm": 1.932761788368225,
|
|
"learning_rate": 1.0294117647058824e-05,
|
|
"loss": 0.1713,
|
|
"num_input_tokens_seen": 893296,
|
|
"step": 8,
|
|
"train_runtime": 101.0598,
|
|
"train_tokens_per_second": 8839.281
|
|
},
|
|
{
|
|
"epoch": 0.1348314606741573,
|
|
"grad_norm": 1.013716220855713,
|
|
"learning_rate": 1.1764705882352942e-05,
|
|
"loss": 0.1316,
|
|
"num_input_tokens_seen": 1010424,
|
|
"step": 9,
|
|
"train_runtime": 112.9423,
|
|
"train_tokens_per_second": 8946.377
|
|
},
|
|
{
|
|
"epoch": 0.149812734082397,
|
|
"grad_norm": 0.4578853249549866,
|
|
"learning_rate": 1.323529411764706e-05,
|
|
"loss": 0.0925,
|
|
"num_input_tokens_seen": 1122608,
|
|
"step": 10,
|
|
"train_runtime": 124.9021,
|
|
"train_tokens_per_second": 8987.903
|
|
},
|
|
{
|
|
"epoch": 0.1647940074906367,
|
|
"grad_norm": 1.1105873584747314,
|
|
"learning_rate": 1.4705882352941177e-05,
|
|
"loss": 0.1682,
|
|
"num_input_tokens_seen": 1232432,
|
|
"step": 11,
|
|
"train_runtime": 136.2736,
|
|
"train_tokens_per_second": 9043.807
|
|
},
|
|
{
|
|
"epoch": 0.1797752808988764,
|
|
"grad_norm": 0.8049420714378357,
|
|
"learning_rate": 1.6176470588235296e-05,
|
|
"loss": 0.1501,
|
|
"num_input_tokens_seen": 1341448,
|
|
"step": 12,
|
|
"train_runtime": 147.7057,
|
|
"train_tokens_per_second": 9081.896
|
|
},
|
|
{
|
|
"epoch": 0.1947565543071161,
|
|
"grad_norm": 0.7054234743118286,
|
|
"learning_rate": 1.7647058823529414e-05,
|
|
"loss": 0.1541,
|
|
"num_input_tokens_seen": 1438440,
|
|
"step": 13,
|
|
"train_runtime": 158.7691,
|
|
"train_tokens_per_second": 9059.95
|
|
},
|
|
{
|
|
"epoch": 0.20973782771535582,
|
|
"grad_norm": 3.3378353118896484,
|
|
"learning_rate": 1.9117647058823528e-05,
|
|
"loss": 0.1972,
|
|
"num_input_tokens_seen": 1547408,
|
|
"step": 14,
|
|
"train_runtime": 170.332,
|
|
"train_tokens_per_second": 9084.656
|
|
},
|
|
{
|
|
"epoch": 0.2247191011235955,
|
|
"grad_norm": 0.47882241010665894,
|
|
"learning_rate": 2.058823529411765e-05,
|
|
"loss": 0.1355,
|
|
"num_input_tokens_seen": 1660336,
|
|
"step": 15,
|
|
"train_runtime": 181.7719,
|
|
"train_tokens_per_second": 9134.175
|
|
},
|
|
{
|
|
"epoch": 0.2397003745318352,
|
|
"grad_norm": 0.4519363045692444,
|
|
"learning_rate": 2.2058823529411766e-05,
|
|
"loss": 0.1175,
|
|
"num_input_tokens_seen": 1774704,
|
|
"step": 16,
|
|
"train_runtime": 193.265,
|
|
"train_tokens_per_second": 9182.748
|
|
},
|
|
{
|
|
"epoch": 0.2546816479400749,
|
|
"grad_norm": 0.8406792283058167,
|
|
"learning_rate": 2.3529411764705884e-05,
|
|
"loss": 0.2153,
|
|
"num_input_tokens_seen": 1887056,
|
|
"step": 17,
|
|
"train_runtime": 204.716,
|
|
"train_tokens_per_second": 9217.922
|
|
},
|
|
{
|
|
"epoch": 0.2696629213483146,
|
|
"grad_norm": 0.5358920097351074,
|
|
"learning_rate": 2.5e-05,
|
|
"loss": 0.1604,
|
|
"num_input_tokens_seen": 1995456,
|
|
"step": 18,
|
|
"train_runtime": 216.103,
|
|
"train_tokens_per_second": 9233.821
|
|
},
|
|
{
|
|
"epoch": 0.2846441947565543,
|
|
"grad_norm": 0.5870547294616699,
|
|
"learning_rate": 2.647058823529412e-05,
|
|
"loss": 0.1922,
|
|
"num_input_tokens_seen": 2111824,
|
|
"step": 19,
|
|
"train_runtime": 227.5641,
|
|
"train_tokens_per_second": 9280.126
|
|
},
|
|
{
|
|
"epoch": 0.299625468164794,
|
|
"grad_norm": 0.7451322078704834,
|
|
"learning_rate": 2.7941176470588236e-05,
|
|
"loss": 0.2839,
|
|
"num_input_tokens_seen": 2217760,
|
|
"step": 20,
|
|
"train_runtime": 238.9081,
|
|
"train_tokens_per_second": 9282.899
|
|
},
|
|
{
|
|
"epoch": 0.3146067415730337,
|
|
"grad_norm": 0.5490975975990295,
|
|
"learning_rate": 2.9411764705882354e-05,
|
|
"loss": 0.1694,
|
|
"num_input_tokens_seen": 2331736,
|
|
"step": 21,
|
|
"train_runtime": 250.286,
|
|
"train_tokens_per_second": 9316.285
|
|
},
|
|
{
|
|
"epoch": 0.3295880149812734,
|
|
"grad_norm": 0.2864564061164856,
|
|
"learning_rate": 3.0882352941176475e-05,
|
|
"loss": 0.0844,
|
|
"num_input_tokens_seen": 2436696,
|
|
"step": 22,
|
|
"train_runtime": 261.5777,
|
|
"train_tokens_per_second": 9315.381
|
|
},
|
|
{
|
|
"epoch": 0.3445692883895131,
|
|
"grad_norm": 0.5809709429740906,
|
|
"learning_rate": 3.235294117647059e-05,
|
|
"loss": 0.1601,
|
|
"num_input_tokens_seen": 2549584,
|
|
"step": 23,
|
|
"train_runtime": 272.813,
|
|
"train_tokens_per_second": 9345.538
|
|
},
|
|
{
|
|
"epoch": 0.3595505617977528,
|
|
"grad_norm": 0.5216901302337646,
|
|
"learning_rate": 3.382352941176471e-05,
|
|
"loss": 0.1835,
|
|
"num_input_tokens_seen": 2655016,
|
|
"step": 24,
|
|
"train_runtime": 284.1449,
|
|
"train_tokens_per_second": 9343.881
|
|
},
|
|
{
|
|
"epoch": 0.37453183520599254,
|
|
"grad_norm": 0.5510843992233276,
|
|
"learning_rate": 3.529411764705883e-05,
|
|
"loss": 0.2083,
|
|
"num_input_tokens_seen": 2761784,
|
|
"step": 25,
|
|
"train_runtime": 295.5762,
|
|
"train_tokens_per_second": 9343.731
|
|
},
|
|
{
|
|
"epoch": 0.37453183520599254,
|
|
"eval_accuracy": 0.9302976935430894,
|
|
"eval_loss": 0.22450505197048187,
|
|
"eval_runtime": 4.9446,
|
|
"eval_samples_per_second": 11.528,
|
|
"eval_steps_per_second": 3.034,
|
|
"num_input_tokens_seen": 2761784,
|
|
"step": 25
|
|
},
|
|
{
|
|
"epoch": 0.3895131086142322,
|
|
"grad_norm": 0.40877798199653625,
|
|
"learning_rate": 3.6764705882352945e-05,
|
|
"loss": 0.1667,
|
|
"num_input_tokens_seen": 2876408,
|
|
"step": 26,
|
|
"train_runtime": 312.0108,
|
|
"train_tokens_per_second": 9218.938
|
|
},
|
|
{
|
|
"epoch": 0.4044943820224719,
|
|
"grad_norm": 0.2976829409599304,
|
|
"learning_rate": 3.8235294117647055e-05,
|
|
"loss": 0.0896,
|
|
"num_input_tokens_seen": 2987992,
|
|
"step": 27,
|
|
"train_runtime": 323.1092,
|
|
"train_tokens_per_second": 9247.622
|
|
},
|
|
{
|
|
"epoch": 0.41947565543071164,
|
|
"grad_norm": 0.68152916431427,
|
|
"learning_rate": 3.970588235294117e-05,
|
|
"loss": 0.2299,
|
|
"num_input_tokens_seen": 3093200,
|
|
"step": 28,
|
|
"train_runtime": 334.4361,
|
|
"train_tokens_per_second": 9249.001
|
|
},
|
|
{
|
|
"epoch": 0.4344569288389513,
|
|
"grad_norm": 0.6799381375312805,
|
|
"learning_rate": 4.11764705882353e-05,
|
|
"loss": 0.269,
|
|
"num_input_tokens_seen": 3192624,
|
|
"step": 29,
|
|
"train_runtime": 345.4411,
|
|
"train_tokens_per_second": 9242.167
|
|
},
|
|
{
|
|
"epoch": 0.449438202247191,
|
|
"grad_norm": 0.4044613242149353,
|
|
"learning_rate": 4.2647058823529415e-05,
|
|
"loss": 0.159,
|
|
"num_input_tokens_seen": 3306120,
|
|
"step": 30,
|
|
"train_runtime": 356.9051,
|
|
"train_tokens_per_second": 9263.303
|
|
},
|
|
{
|
|
"epoch": 0.46441947565543074,
|
|
"grad_norm": 0.4966118037700653,
|
|
"learning_rate": 4.411764705882353e-05,
|
|
"loss": 0.229,
|
|
"num_input_tokens_seen": 3419688,
|
|
"step": 31,
|
|
"train_runtime": 368.3658,
|
|
"train_tokens_per_second": 9283.402
|
|
},
|
|
{
|
|
"epoch": 0.4794007490636704,
|
|
"grad_norm": 0.6686931848526001,
|
|
"learning_rate": 4.558823529411765e-05,
|
|
"loss": 0.1994,
|
|
"num_input_tokens_seen": 3521896,
|
|
"step": 32,
|
|
"train_runtime": 379.7162,
|
|
"train_tokens_per_second": 9275.074
|
|
},
|
|
{
|
|
"epoch": 0.4943820224719101,
|
|
"grad_norm": 0.2366495132446289,
|
|
"learning_rate": 4.705882352941177e-05,
|
|
"loss": 0.109,
|
|
"num_input_tokens_seen": 3644912,
|
|
"step": 33,
|
|
"train_runtime": 391.2145,
|
|
"train_tokens_per_second": 9316.916
|
|
},
|
|
{
|
|
"epoch": 0.5093632958801498,
|
|
"grad_norm": 0.49944090843200684,
|
|
"learning_rate": 4.8529411764705885e-05,
|
|
"loss": 0.1888,
|
|
"num_input_tokens_seen": 3751480,
|
|
"step": 34,
|
|
"train_runtime": 402.5823,
|
|
"train_tokens_per_second": 9318.541
|
|
},
|
|
{
|
|
"epoch": 0.5243445692883895,
|
|
"grad_norm": 0.5719208121299744,
|
|
"learning_rate": 5e-05,
|
|
"loss": 0.1841,
|
|
"num_input_tokens_seen": 3853072,
|
|
"step": 35,
|
|
"train_runtime": 413.9404,
|
|
"train_tokens_per_second": 9308.277
|
|
},
|
|
{
|
|
"epoch": 0.5393258426966292,
|
|
"grad_norm": 0.5086914896965027,
|
|
"learning_rate": 4.999863832700438e-05,
|
|
"loss": 0.2391,
|
|
"num_input_tokens_seen": 3954992,
|
|
"step": 36,
|
|
"train_runtime": 424.9062,
|
|
"train_tokens_per_second": 9307.918
|
|
},
|
|
{
|
|
"epoch": 0.5543071161048689,
|
|
"grad_norm": 0.616763174533844,
|
|
"learning_rate": 4.999455345634978e-05,
|
|
"loss": 0.2589,
|
|
"num_input_tokens_seen": 4060312,
|
|
"step": 37,
|
|
"train_runtime": 436.2608,
|
|
"train_tokens_per_second": 9307.075
|
|
},
|
|
{
|
|
"epoch": 0.5692883895131086,
|
|
"grad_norm": 0.3873949646949768,
|
|
"learning_rate": 4.9987745833016855e-05,
|
|
"loss": 0.1603,
|
|
"num_input_tokens_seen": 4159664,
|
|
"step": 38,
|
|
"train_runtime": 447.7626,
|
|
"train_tokens_per_second": 9289.888
|
|
},
|
|
{
|
|
"epoch": 0.5842696629213483,
|
|
"grad_norm": 0.42982256412506104,
|
|
"learning_rate": 4.9978216198586135e-05,
|
|
"loss": 0.1837,
|
|
"num_input_tokens_seen": 4273696,
|
|
"step": 39,
|
|
"train_runtime": 459.1806,
|
|
"train_tokens_per_second": 9307.223
|
|
},
|
|
{
|
|
"epoch": 0.599250936329588,
|
|
"grad_norm": 0.45362767577171326,
|
|
"learning_rate": 4.996596559115731e-05,
|
|
"loss": 0.2044,
|
|
"num_input_tokens_seen": 4381080,
|
|
"step": 40,
|
|
"train_runtime": 470.3339,
|
|
"train_tokens_per_second": 9314.829
|
|
},
|
|
{
|
|
"epoch": 0.6142322097378277,
|
|
"grad_norm": 0.32139134407043457,
|
|
"learning_rate": 4.995099534523607e-05,
|
|
"loss": 0.1326,
|
|
"num_input_tokens_seen": 4499912,
|
|
"step": 41,
|
|
"train_runtime": 481.7193,
|
|
"train_tokens_per_second": 9341.357
|
|
},
|
|
{
|
|
"epoch": 0.6292134831460674,
|
|
"grad_norm": 0.39159446954727173,
|
|
"learning_rate": 4.9933307091588796e-05,
|
|
"loss": 0.1795,
|
|
"num_input_tokens_seen": 4606816,
|
|
"step": 42,
|
|
"train_runtime": 493.112,
|
|
"train_tokens_per_second": 9342.332
|
|
},
|
|
{
|
|
"epoch": 0.6441947565543071,
|
|
"grad_norm": 0.4465094804763794,
|
|
"learning_rate": 4.991290275706486e-05,
|
|
"loss": 0.188,
|
|
"num_input_tokens_seen": 4720528,
|
|
"step": 43,
|
|
"train_runtime": 504.578,
|
|
"train_tokens_per_second": 9355.398
|
|
},
|
|
{
|
|
"epoch": 0.6591760299625468,
|
|
"grad_norm": 0.5234239101409912,
|
|
"learning_rate": 4.988978456438678e-05,
|
|
"loss": 0.1692,
|
|
"num_input_tokens_seen": 4834552,
|
|
"step": 44,
|
|
"train_runtime": 516.0278,
|
|
"train_tokens_per_second": 9368.781
|
|
},
|
|
{
|
|
"epoch": 0.6741573033707865,
|
|
"grad_norm": 0.36853307485580444,
|
|
"learning_rate": 4.986395503190805e-05,
|
|
"loss": 0.1526,
|
|
"num_input_tokens_seen": 4940840,
|
|
"step": 45,
|
|
"train_runtime": 527.1679,
|
|
"train_tokens_per_second": 9372.422
|
|
},
|
|
{
|
|
"epoch": 0.6891385767790262,
|
|
"grad_norm": 0.5273284912109375,
|
|
"learning_rate": 4.983541697333881e-05,
|
|
"loss": 0.2274,
|
|
"num_input_tokens_seen": 5044880,
|
|
"step": 46,
|
|
"train_runtime": 538.3953,
|
|
"train_tokens_per_second": 9370.215
|
|
},
|
|
{
|
|
"epoch": 0.704119850187266,
|
|
"grad_norm": 0.24699360132217407,
|
|
"learning_rate": 4.980417349743936e-05,
|
|
"loss": 0.1199,
|
|
"num_input_tokens_seen": 5164256,
|
|
"step": 47,
|
|
"train_runtime": 549.2882,
|
|
"train_tokens_per_second": 9401.724
|
|
},
|
|
{
|
|
"epoch": 0.7191011235955056,
|
|
"grad_norm": 0.5065047144889832,
|
|
"learning_rate": 4.9770228007681494e-05,
|
|
"loss": 0.2262,
|
|
"num_input_tokens_seen": 5262840,
|
|
"step": 48,
|
|
"train_runtime": 559.2344,
|
|
"train_tokens_per_second": 9410.795
|
|
},
|
|
{
|
|
"epoch": 0.7340823970037453,
|
|
"grad_norm": 0.3612704873085022,
|
|
"learning_rate": 4.973358420187776e-05,
|
|
"loss": 0.1684,
|
|
"num_input_tokens_seen": 5374992,
|
|
"step": 49,
|
|
"train_runtime": 569.5899,
|
|
"train_tokens_per_second": 9436.6
|
|
},
|
|
{
|
|
"epoch": 0.7490636704119851,
|
|
"grad_norm": 0.40374258160591125,
|
|
"learning_rate": 4.9694246071778604e-05,
|
|
"loss": 0.1599,
|
|
"num_input_tokens_seen": 5486368,
|
|
"step": 50,
|
|
"train_runtime": 580.0566,
|
|
"train_tokens_per_second": 9458.332
|
|
},
|
|
{
|
|
"epoch": 0.7490636704119851,
|
|
"eval_accuracy": 0.9305199059084309,
|
|
"eval_loss": 0.22489887475967407,
|
|
"eval_runtime": 4.9257,
|
|
"eval_samples_per_second": 11.572,
|
|
"eval_steps_per_second": 3.045,
|
|
"num_input_tokens_seen": 5486368,
|
|
"step": 50
|
|
},
|
|
{
|
|
"epoch": 0.7640449438202247,
|
|
"grad_norm": 0.41528448462486267,
|
|
"learning_rate": 4.9652217902637596e-05,
|
|
"loss": 0.2025,
|
|
"num_input_tokens_seen": 5582648,
|
|
"step": 51,
|
|
"train_runtime": 594.7066,
|
|
"train_tokens_per_second": 9387.23
|
|
},
|
|
{
|
|
"epoch": 0.7790262172284644,
|
|
"grad_norm": 0.32743218541145325,
|
|
"learning_rate": 4.9607504272744575e-05,
|
|
"loss": 0.1592,
|
|
"num_input_tokens_seen": 5692920,
|
|
"step": 52,
|
|
"train_runtime": 604.9175,
|
|
"train_tokens_per_second": 9411.069
|
|
},
|
|
{
|
|
"epoch": 0.7940074906367042,
|
|
"grad_norm": 0.484891802072525,
|
|
"learning_rate": 4.956011005292692e-05,
|
|
"loss": 0.2657,
|
|
"num_input_tokens_seen": 5795728,
|
|
"step": 53,
|
|
"train_runtime": 615.187,
|
|
"train_tokens_per_second": 9421.083
|
|
},
|
|
{
|
|
"epoch": 0.8089887640449438,
|
|
"grad_norm": 0.335283488035202,
|
|
"learning_rate": 4.951004040601898e-05,
|
|
"loss": 0.1878,
|
|
"num_input_tokens_seen": 5911816,
|
|
"step": 54,
|
|
"train_runtime": 625.4996,
|
|
"train_tokens_per_second": 9451.351
|
|
},
|
|
{
|
|
"epoch": 0.8239700374531835,
|
|
"grad_norm": 0.44197705388069153,
|
|
"learning_rate": 4.945730078629964e-05,
|
|
"loss": 0.2157,
|
|
"num_input_tokens_seen": 6015648,
|
|
"step": 55,
|
|
"train_runtime": 635.6908,
|
|
"train_tokens_per_second": 9463.167
|
|
},
|
|
{
|
|
"epoch": 0.8389513108614233,
|
|
"grad_norm": 0.3809565603733063,
|
|
"learning_rate": 4.9401896938898185e-05,
|
|
"loss": 0.1789,
|
|
"num_input_tokens_seen": 6132248,
|
|
"step": 56,
|
|
"train_runtime": 646.1102,
|
|
"train_tokens_per_second": 9491.025
|
|
},
|
|
{
|
|
"epoch": 0.8539325842696629,
|
|
"grad_norm": 0.4315880239009857,
|
|
"learning_rate": 4.934383489916843e-05,
|
|
"loss": 0.2019,
|
|
"num_input_tokens_seen": 6249344,
|
|
"step": 57,
|
|
"train_runtime": 656.5451,
|
|
"train_tokens_per_second": 9518.529
|
|
},
|
|
{
|
|
"epoch": 0.8689138576779026,
|
|
"grad_norm": 0.27884915471076965,
|
|
"learning_rate": 4.928312099203131e-05,
|
|
"loss": 0.132,
|
|
"num_input_tokens_seen": 6366872,
|
|
"step": 58,
|
|
"train_runtime": 667.0032,
|
|
"train_tokens_per_second": 9545.49
|
|
},
|
|
{
|
|
"epoch": 0.8838951310861424,
|
|
"grad_norm": 0.4101852476596832,
|
|
"learning_rate": 4.921976183128585e-05,
|
|
"loss": 0.2022,
|
|
"num_input_tokens_seen": 6475464,
|
|
"step": 59,
|
|
"train_runtime": 677.3685,
|
|
"train_tokens_per_second": 9559.736
|
|
},
|
|
{
|
|
"epoch": 0.898876404494382,
|
|
"grad_norm": 0.35576295852661133,
|
|
"learning_rate": 4.9153764318888706e-05,
|
|
"loss": 0.1605,
|
|
"num_input_tokens_seen": 6587040,
|
|
"step": 60,
|
|
"train_runtime": 688.9819,
|
|
"train_tokens_per_second": 9560.542
|
|
},
|
|
{
|
|
"epoch": 0.9138576779026217,
|
|
"grad_norm": 0.4770338833332062,
|
|
"learning_rate": 4.908513564420231e-05,
|
|
"loss": 0.2062,
|
|
"num_input_tokens_seen": 6702552,
|
|
"step": 61,
|
|
"train_runtime": 700.508,
|
|
"train_tokens_per_second": 9568.131
|
|
},
|
|
{
|
|
"epoch": 0.9288389513108615,
|
|
"grad_norm": 0.3157297372817993,
|
|
"learning_rate": 4.90138832832117e-05,
|
|
"loss": 0.1485,
|
|
"num_input_tokens_seen": 6809352,
|
|
"step": 62,
|
|
"train_runtime": 712.0733,
|
|
"train_tokens_per_second": 9562.713
|
|
},
|
|
{
|
|
"epoch": 0.9438202247191011,
|
|
"grad_norm": 0.4924875497817993,
|
|
"learning_rate": 4.894001499771015e-05,
|
|
"loss": 0.1896,
|
|
"num_input_tokens_seen": 6909928,
|
|
"step": 63,
|
|
"train_runtime": 723.6298,
|
|
"train_tokens_per_second": 9548.983
|
|
},
|
|
{
|
|
"epoch": 0.9588014981273408,
|
|
"grad_norm": 0.245199054479599,
|
|
"learning_rate": 4.886353883445363e-05,
|
|
"loss": 0.1141,
|
|
"num_input_tokens_seen": 7029288,
|
|
"step": 64,
|
|
"train_runtime": 735.3069,
|
|
"train_tokens_per_second": 9559.665
|
|
},
|
|
{
|
|
"epoch": 0.9737827715355806,
|
|
"grad_norm": 2.8648366928100586,
|
|
"learning_rate": 4.878446312428424e-05,
|
|
"loss": 0.2227,
|
|
"num_input_tokens_seen": 7136544,
|
|
"step": 65,
|
|
"train_runtime": 746.8368,
|
|
"train_tokens_per_second": 9555.693
|
|
},
|
|
{
|
|
"epoch": 0.9887640449438202,
|
|
"grad_norm": 0.28591519594192505,
|
|
"learning_rate": 4.8702796481222714e-05,
|
|
"loss": 0.1648,
|
|
"num_input_tokens_seen": 7244184,
|
|
"step": 66,
|
|
"train_runtime": 758.3309,
|
|
"train_tokens_per_second": 9552.801
|
|
},
|
|
{
|
|
"epoch": 1.0,
|
|
"grad_norm": 0.28591519594192505,
|
|
"learning_rate": 4.861854780153004e-05,
|
|
"loss": 0.2552,
|
|
"num_input_tokens_seen": 7319544,
|
|
"step": 67,
|
|
"train_runtime": 763.1534,
|
|
"train_tokens_per_second": 9591.183
|
|
},
|
|
{
|
|
"epoch": 1.0149812734082397,
|
|
"grad_norm": 0.5447623133659363,
|
|
"learning_rate": 4.853172626273841e-05,
|
|
"loss": 0.1038,
|
|
"num_input_tokens_seen": 7437632,
|
|
"step": 68,
|
|
"train_runtime": 774.79,
|
|
"train_tokens_per_second": 9599.546
|
|
},
|
|
{
|
|
"epoch": 1.0299625468164795,
|
|
"grad_norm": 0.27388796210289,
|
|
"learning_rate": 4.8442341322651385e-05,
|
|
"loss": 0.1202,
|
|
"num_input_tokens_seen": 7547280,
|
|
"step": 69,
|
|
"train_runtime": 786.2831,
|
|
"train_tokens_per_second": 9598.68
|
|
},
|
|
{
|
|
"epoch": 1.0449438202247192,
|
|
"grad_norm": 0.3335851728916168,
|
|
"learning_rate": 4.83504027183137e-05,
|
|
"loss": 0.1851,
|
|
"num_input_tokens_seen": 7658904,
|
|
"step": 70,
|
|
"train_runtime": 798.2298,
|
|
"train_tokens_per_second": 9594.861
|
|
},
|
|
{
|
|
"epoch": 1.0599250936329587,
|
|
"grad_norm": 0.2915020287036896,
|
|
"learning_rate": 4.825592046495054e-05,
|
|
"loss": 0.1193,
|
|
"num_input_tokens_seen": 7762712,
|
|
"step": 71,
|
|
"train_runtime": 809.7872,
|
|
"train_tokens_per_second": 9586.113
|
|
},
|
|
{
|
|
"epoch": 1.0749063670411985,
|
|
"grad_norm": 0.3645778000354767,
|
|
"learning_rate": 4.8158904854876555e-05,
|
|
"loss": 0.1442,
|
|
"num_input_tokens_seen": 7875080,
|
|
"step": 72,
|
|
"train_runtime": 821.7369,
|
|
"train_tokens_per_second": 9583.457
|
|
},
|
|
{
|
|
"epoch": 1.0898876404494382,
|
|
"grad_norm": 0.21766622364521027,
|
|
"learning_rate": 4.805936645637463e-05,
|
|
"loss": 0.1783,
|
|
"num_input_tokens_seen": 7989424,
|
|
"step": 73,
|
|
"train_runtime": 833.6118,
|
|
"train_tokens_per_second": 9584.106
|
|
},
|
|
{
|
|
"epoch": 1.104868913857678,
|
|
"grad_norm": 0.24854034185409546,
|
|
"learning_rate": 4.795731611254473e-05,
|
|
"loss": 0.096,
|
|
"num_input_tokens_seen": 8104200,
|
|
"step": 74,
|
|
"train_runtime": 845.3499,
|
|
"train_tokens_per_second": 9586.799
|
|
},
|
|
{
|
|
"epoch": 1.1198501872659177,
|
|
"grad_norm": 0.2844558358192444,
|
|
"learning_rate": 4.785276494012263e-05,
|
|
"loss": 0.1223,
|
|
"num_input_tokens_seen": 8216400,
|
|
"step": 75,
|
|
"train_runtime": 857.2782,
|
|
"train_tokens_per_second": 9584.287
|
|
},
|
|
{
|
|
"epoch": 1.1198501872659177,
|
|
"eval_accuracy": 0.9364104124311492,
|
|
"eval_loss": 0.20777302980422974,
|
|
"eval_runtime": 4.9438,
|
|
"eval_samples_per_second": 11.53,
|
|
"eval_steps_per_second": 3.034,
|
|
"num_input_tokens_seen": 8216400,
|
|
"step": 75
|
|
},
|
|
{
|
|
"epoch": 1.1348314606741572,
|
|
"grad_norm": 0.2329414337873459,
|
|
"learning_rate": 4.7745724328269e-05,
|
|
"loss": 0.1293,
|
|
"num_input_tokens_seen": 8330424,
|
|
"step": 76,
|
|
"train_runtime": 874.2192,
|
|
"train_tokens_per_second": 9528.988
|
|
},
|
|
{
|
|
"epoch": 1.149812734082397,
|
|
"grad_norm": 0.32167452573776245,
|
|
"learning_rate": 4.763620593732867e-05,
|
|
"loss": 0.1562,
|
|
"num_input_tokens_seen": 8438312,
|
|
"step": 77,
|
|
"train_runtime": 886.018,
|
|
"train_tokens_per_second": 9523.86
|
|
},
|
|
{
|
|
"epoch": 1.1647940074906367,
|
|
"grad_norm": 0.3214111030101776,
|
|
"learning_rate": 4.752422169756048e-05,
|
|
"loss": 0.1081,
|
|
"num_input_tokens_seen": 8538856,
|
|
"step": 78,
|
|
"train_runtime": 897.8564,
|
|
"train_tokens_per_second": 9510.269
|
|
},
|
|
{
|
|
"epoch": 1.1797752808988764,
|
|
"grad_norm": 0.22370071709156036,
|
|
"learning_rate": 4.740978380783765e-05,
|
|
"loss": 0.0907,
|
|
"num_input_tokens_seen": 8648688,
|
|
"step": 79,
|
|
"train_runtime": 909.7476,
|
|
"train_tokens_per_second": 9506.689
|
|
},
|
|
{
|
|
"epoch": 1.1947565543071161,
|
|
"grad_norm": 0.2359580248594284,
|
|
"learning_rate": 4.7292904734318924e-05,
|
|
"loss": 0.1497,
|
|
"num_input_tokens_seen": 8757528,
|
|
"step": 80,
|
|
"train_runtime": 921.657,
|
|
"train_tokens_per_second": 9501.938
|
|
},
|
|
{
|
|
"epoch": 1.2097378277153559,
|
|
"grad_norm": 0.45581308007240295,
|
|
"learning_rate": 4.7173597209090534e-05,
|
|
"loss": 0.1343,
|
|
"num_input_tokens_seen": 8871600,
|
|
"step": 81,
|
|
"train_runtime": 933.4439,
|
|
"train_tokens_per_second": 9504.16
|
|
},
|
|
{
|
|
"epoch": 1.2247191011235956,
|
|
"grad_norm": 0.33737245202064514,
|
|
"learning_rate": 4.70518742287793e-05,
|
|
"loss": 0.1842,
|
|
"num_input_tokens_seen": 8975328,
|
|
"step": 82,
|
|
"train_runtime": 944.8618,
|
|
"train_tokens_per_second": 9499.091
|
|
},
|
|
{
|
|
"epoch": 1.2397003745318351,
|
|
"grad_norm": 0.43545785546302795,
|
|
"learning_rate": 4.6927749053136866e-05,
|
|
"loss": 0.1342,
|
|
"num_input_tokens_seen": 9090992,
|
|
"step": 83,
|
|
"train_runtime": 956.4008,
|
|
"train_tokens_per_second": 9505.421
|
|
},
|
|
{
|
|
"epoch": 1.2546816479400749,
|
|
"grad_norm": 0.33271756768226624,
|
|
"learning_rate": 4.6801235203595195e-05,
|
|
"loss": 0.1938,
|
|
"num_input_tokens_seen": 9201320,
|
|
"step": 84,
|
|
"train_runtime": 967.7879,
|
|
"train_tokens_per_second": 9507.579
|
|
},
|
|
{
|
|
"epoch": 1.2696629213483146,
|
|
"grad_norm": 0.3993559777736664,
|
|
"learning_rate": 4.667234646179368e-05,
|
|
"loss": 0.1673,
|
|
"num_input_tokens_seen": 9304160,
|
|
"step": 85,
|
|
"train_runtime": 978.9186,
|
|
"train_tokens_per_second": 9504.529
|
|
},
|
|
{
|
|
"epoch": 1.2846441947565543,
|
|
"grad_norm": 0.3416566252708435,
|
|
"learning_rate": 4.654109686807787e-05,
|
|
"loss": 0.2025,
|
|
"num_input_tokens_seen": 9409224,
|
|
"step": 86,
|
|
"train_runtime": 990.4873,
|
|
"train_tokens_per_second": 9499.59
|
|
},
|
|
{
|
|
"epoch": 1.299625468164794,
|
|
"grad_norm": 0.3253297805786133,
|
|
"learning_rate": 4.640750071996995e-05,
|
|
"loss": 0.1421,
|
|
"num_input_tokens_seen": 9514232,
|
|
"step": 87,
|
|
"train_runtime": 1002.0082,
|
|
"train_tokens_per_second": 9495.164
|
|
},
|
|
{
|
|
"epoch": 1.3146067415730336,
|
|
"grad_norm": 0.3348604738712311,
|
|
"learning_rate": 4.6271572570611296e-05,
|
|
"loss": 0.1485,
|
|
"num_input_tokens_seen": 9623752,
|
|
"step": 88,
|
|
"train_runtime": 1013.5776,
|
|
"train_tokens_per_second": 9494.835
|
|
},
|
|
{
|
|
"epoch": 1.3295880149812733,
|
|
"grad_norm": 0.34817907214164734,
|
|
"learning_rate": 4.613332722717714e-05,
|
|
"loss": 0.1504,
|
|
"num_input_tokens_seen": 9734808,
|
|
"step": 89,
|
|
"train_runtime": 1025.1774,
|
|
"train_tokens_per_second": 9495.73
|
|
},
|
|
{
|
|
"epoch": 1.344569288389513,
|
|
"grad_norm": 0.3490277826786041,
|
|
"learning_rate": 4.5992779749263546e-05,
|
|
"loss": 0.1232,
|
|
"num_input_tokens_seen": 9847464,
|
|
"step": 90,
|
|
"train_runtime": 1036.7917,
|
|
"train_tokens_per_second": 9498.016
|
|
},
|
|
{
|
|
"epoch": 1.3595505617977528,
|
|
"grad_norm": 0.2823807895183563,
|
|
"learning_rate": 4.584994544724695e-05,
|
|
"loss": 0.1916,
|
|
"num_input_tokens_seen": 9940464,
|
|
"step": 91,
|
|
"train_runtime": 1048.2397,
|
|
"train_tokens_per_second": 9483.006
|
|
},
|
|
{
|
|
"epoch": 1.3745318352059925,
|
|
"grad_norm": 0.3263910114765167,
|
|
"learning_rate": 4.5704839880616296e-05,
|
|
"loss": 0.1665,
|
|
"num_input_tokens_seen": 10054728,
|
|
"step": 92,
|
|
"train_runtime": 1059.8602,
|
|
"train_tokens_per_second": 9486.844
|
|
},
|
|
{
|
|
"epoch": 1.3895131086142323,
|
|
"grad_norm": 0.26731908321380615,
|
|
"learning_rate": 4.5557478856278114e-05,
|
|
"loss": 0.102,
|
|
"num_input_tokens_seen": 10172456,
|
|
"step": 93,
|
|
"train_runtime": 1071.5397,
|
|
"train_tokens_per_second": 9493.307
|
|
},
|
|
{
|
|
"epoch": 1.404494382022472,
|
|
"grad_norm": 0.2940012216567993,
|
|
"learning_rate": 4.5407878426834596e-05,
|
|
"loss": 0.1167,
|
|
"num_input_tokens_seen": 10279024,
|
|
"step": 94,
|
|
"train_runtime": 1083.1405,
|
|
"train_tokens_per_second": 9490.019
|
|
},
|
|
{
|
|
"epoch": 1.4194756554307117,
|
|
"grad_norm": 0.24917353689670563,
|
|
"learning_rate": 4.5256054888834934e-05,
|
|
"loss": 0.1945,
|
|
"num_input_tokens_seen": 10394120,
|
|
"step": 95,
|
|
"train_runtime": 1094.8039,
|
|
"train_tokens_per_second": 9494.047
|
|
},
|
|
{
|
|
"epoch": 1.4344569288389513,
|
|
"grad_norm": 0.3618624806404114,
|
|
"learning_rate": 4.5102024781000077e-05,
|
|
"loss": 0.1576,
|
|
"num_input_tokens_seen": 10503768,
|
|
"step": 96,
|
|
"train_runtime": 1106.3393,
|
|
"train_tokens_per_second": 9494.165
|
|
},
|
|
{
|
|
"epoch": 1.449438202247191,
|
|
"grad_norm": 0.596593976020813,
|
|
"learning_rate": 4.4945804882421086e-05,
|
|
"loss": 0.1266,
|
|
"num_input_tokens_seen": 10616136,
|
|
"step": 97,
|
|
"train_runtime": 1117.9131,
|
|
"train_tokens_per_second": 9496.388
|
|
},
|
|
{
|
|
"epoch": 1.4644194756554307,
|
|
"grad_norm": 0.2645472586154938,
|
|
"learning_rate": 4.478741221073136e-05,
|
|
"loss": 0.0974,
|
|
"num_input_tokens_seen": 10725704,
|
|
"step": 98,
|
|
"train_runtime": 1129.4583,
|
|
"train_tokens_per_second": 9496.326
|
|
},
|
|
{
|
|
"epoch": 1.4794007490636705,
|
|
"grad_norm": 0.3144528269767761,
|
|
"learning_rate": 4.4626864020252774e-05,
|
|
"loss": 0.0942,
|
|
"num_input_tokens_seen": 10838848,
|
|
"step": 99,
|
|
"train_runtime": 1141.0205,
|
|
"train_tokens_per_second": 9499.258
|
|
},
|
|
{
|
|
"epoch": 1.49438202247191,
|
|
"grad_norm": 0.2749118208885193,
|
|
"learning_rate": 4.446417780011618e-05,
|
|
"loss": 0.16,
|
|
"num_input_tokens_seen": 10953704,
|
|
"step": 100,
|
|
"train_runtime": 1152.6526,
|
|
"train_tokens_per_second": 9503.04
|
|
},
|
|
{
|
|
"epoch": 1.49438202247191,
|
|
"eval_accuracy": 0.9357448056062344,
|
|
"eval_loss": 0.20240993797779083,
|
|
"eval_runtime": 4.9441,
|
|
"eval_samples_per_second": 11.529,
|
|
"eval_steps_per_second": 3.034,
|
|
"num_input_tokens_seen": 10953704,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 1.5093632958801497,
|
|
"grad_norm": 0.3232828378677368,
|
|
"learning_rate": 4.42993712723562e-05,
|
|
"loss": 0.1192,
|
|
"num_input_tokens_seen": 11073888,
|
|
"step": 101,
|
|
"train_runtime": 1169.2748,
|
|
"train_tokens_per_second": 9470.732
|
|
},
|
|
{
|
|
"epoch": 1.5243445692883895,
|
|
"grad_norm": 0.3835008144378662,
|
|
"learning_rate": 4.413246238998069e-05,
|
|
"loss": 0.1767,
|
|
"num_input_tokens_seen": 11178896,
|
|
"step": 102,
|
|
"train_runtime": 1180.9051,
|
|
"train_tokens_per_second": 9466.38
|
|
},
|
|
{
|
|
"epoch": 1.5393258426966292,
|
|
"grad_norm": 0.29268574714660645,
|
|
"learning_rate": 4.3963469335015085e-05,
|
|
"loss": 0.1383,
|
|
"num_input_tokens_seen": 11289112,
|
|
"step": 103,
|
|
"train_runtime": 1192.4003,
|
|
"train_tokens_per_second": 9467.552
|
|
},
|
|
{
|
|
"epoch": 1.554307116104869,
|
|
"grad_norm": 0.24543774127960205,
|
|
"learning_rate": 4.379241051652174e-05,
|
|
"loss": 0.1421,
|
|
"num_input_tokens_seen": 11401952,
|
|
"step": 104,
|
|
"train_runtime": 1203.8514,
|
|
"train_tokens_per_second": 9471.229
|
|
},
|
|
{
|
|
"epoch": 1.5692883895131087,
|
|
"grad_norm": 0.3485076427459717,
|
|
"learning_rate": 4.361930456859455e-05,
|
|
"loss": 0.1201,
|
|
"num_input_tokens_seen": 11511848,
|
|
"step": 105,
|
|
"train_runtime": 1215.263,
|
|
"train_tokens_per_second": 9472.722
|
|
},
|
|
{
|
|
"epoch": 1.5842696629213484,
|
|
"grad_norm": 0.21573545038700104,
|
|
"learning_rate": 4.34441703483291e-05,
|
|
"loss": 0.0623,
|
|
"num_input_tokens_seen": 11625728,
|
|
"step": 106,
|
|
"train_runtime": 1226.7662,
|
|
"train_tokens_per_second": 9476.726
|
|
},
|
|
{
|
|
"epoch": 1.5992509363295881,
|
|
"grad_norm": 0.578035831451416,
|
|
"learning_rate": 4.326702693376844e-05,
|
|
"loss": 0.193,
|
|
"num_input_tokens_seen": 11741544,
|
|
"step": 107,
|
|
"train_runtime": 1238.3403,
|
|
"train_tokens_per_second": 9481.678
|
|
},
|
|
{
|
|
"epoch": 1.6142322097378277,
|
|
"grad_norm": 0.21446435153484344,
|
|
"learning_rate": 4.308789362182492e-05,
|
|
"loss": 0.0936,
|
|
"num_input_tokens_seen": 11851240,
|
|
"step": 108,
|
|
"train_runtime": 1249.8102,
|
|
"train_tokens_per_second": 9482.432
|
|
},
|
|
{
|
|
"epoch": 1.6292134831460674,
|
|
"grad_norm": 0.30708542466163635,
|
|
"learning_rate": 4.2906789926177975e-05,
|
|
"loss": 0.1468,
|
|
"num_input_tokens_seen": 11963664,
|
|
"step": 109,
|
|
"train_runtime": 1261.6521,
|
|
"train_tokens_per_second": 9482.538
|
|
},
|
|
{
|
|
"epoch": 1.6441947565543071,
|
|
"grad_norm": 0.3326849341392517,
|
|
"learning_rate": 4.272373557514858e-05,
|
|
"loss": 0.1707,
|
|
"num_input_tokens_seen": 12067544,
|
|
"step": 110,
|
|
"train_runtime": 1272.7519,
|
|
"train_tokens_per_second": 9481.459
|
|
},
|
|
{
|
|
"epoch": 1.6591760299625467,
|
|
"grad_norm": 0.4741860032081604,
|
|
"learning_rate": 4.2538750509550054e-05,
|
|
"loss": 0.1829,
|
|
"num_input_tokens_seen": 12164792,
|
|
"step": 111,
|
|
"train_runtime": 1284.2245,
|
|
"train_tokens_per_second": 9472.481
|
|
},
|
|
{
|
|
"epoch": 1.6741573033707864,
|
|
"grad_norm": 0.29018136858940125,
|
|
"learning_rate": 4.235185488051585e-05,
|
|
"loss": 0.1401,
|
|
"num_input_tokens_seen": 12281440,
|
|
"step": 112,
|
|
"train_runtime": 1296.1999,
|
|
"train_tokens_per_second": 9474.958
|
|
},
|
|
{
|
|
"epoch": 1.6891385767790261,
|
|
"grad_norm": 0.2956504225730896,
|
|
"learning_rate": 4.216306904730447e-05,
|
|
"loss": 0.1412,
|
|
"num_input_tokens_seen": 12389800,
|
|
"step": 113,
|
|
"train_runtime": 1308.0426,
|
|
"train_tokens_per_second": 9472.015
|
|
},
|
|
{
|
|
"epoch": 1.7041198501872659,
|
|
"grad_norm": 0.27467837929725647,
|
|
"learning_rate": 4.1972413575081595e-05,
|
|
"loss": 0.1908,
|
|
"num_input_tokens_seen": 12498360,
|
|
"step": 114,
|
|
"train_runtime": 1319.6005,
|
|
"train_tokens_per_second": 9471.321
|
|
},
|
|
{
|
|
"epoch": 1.7191011235955056,
|
|
"grad_norm": 0.5112754702568054,
|
|
"learning_rate": 4.177990923267986e-05,
|
|
"loss": 0.1783,
|
|
"num_input_tokens_seen": 12601072,
|
|
"step": 115,
|
|
"train_runtime": 1331.1821,
|
|
"train_tokens_per_second": 9466.077
|
|
},
|
|
{
|
|
"epoch": 1.7340823970037453,
|
|
"grad_norm": 0.25173839926719666,
|
|
"learning_rate": 4.158557699033644e-05,
|
|
"loss": 0.1246,
|
|
"num_input_tokens_seen": 12704456,
|
|
"step": 116,
|
|
"train_runtime": 1342.9235,
|
|
"train_tokens_per_second": 9460.298
|
|
},
|
|
{
|
|
"epoch": 1.749063670411985,
|
|
"grad_norm": 0.34037086367607117,
|
|
"learning_rate": 4.138943801740865e-05,
|
|
"loss": 0.0917,
|
|
"num_input_tokens_seen": 12801568,
|
|
"step": 117,
|
|
"train_runtime": 1354.7403,
|
|
"train_tokens_per_second": 9449.463
|
|
},
|
|
{
|
|
"epoch": 1.7640449438202248,
|
|
"grad_norm": 0.20387206971645355,
|
|
"learning_rate": 4.119151368006793e-05,
|
|
"loss": 0.0672,
|
|
"num_input_tokens_seen": 12917448,
|
|
"step": 118,
|
|
"train_runtime": 1366.4787,
|
|
"train_tokens_per_second": 9453.092
|
|
},
|
|
{
|
|
"epoch": 1.7790262172284645,
|
|
"grad_norm": 0.19825316965579987,
|
|
"learning_rate": 4.099182553897229e-05,
|
|
"loss": 0.1358,
|
|
"num_input_tokens_seen": 13022432,
|
|
"step": 119,
|
|
"train_runtime": 1378.2058,
|
|
"train_tokens_per_second": 9448.83
|
|
},
|
|
{
|
|
"epoch": 1.7940074906367043,
|
|
"grad_norm": 0.2554757297039032,
|
|
"learning_rate": 4.079039534691767e-05,
|
|
"loss": 0.1048,
|
|
"num_input_tokens_seen": 13129888,
|
|
"step": 120,
|
|
"train_runtime": 1390.1082,
|
|
"train_tokens_per_second": 9445.227
|
|
},
|
|
{
|
|
"epoch": 1.8089887640449438,
|
|
"grad_norm": 0.256199449300766,
|
|
"learning_rate": 4.058724504646834e-05,
|
|
"loss": 0.1369,
|
|
"num_input_tokens_seen": 13235312,
|
|
"step": 121,
|
|
"train_runtime": 1401.929,
|
|
"train_tokens_per_second": 9440.786
|
|
},
|
|
{
|
|
"epoch": 1.8239700374531835,
|
|
"grad_norm": 0.22924847900867462,
|
|
"learning_rate": 4.0382396767566536e-05,
|
|
"loss": 0.1564,
|
|
"num_input_tokens_seen": 13350920,
|
|
"step": 122,
|
|
"train_runtime": 1413.75,
|
|
"train_tokens_per_second": 9443.622
|
|
},
|
|
{
|
|
"epoch": 1.8389513108614233,
|
|
"grad_norm": 0.3208468556404114,
|
|
"learning_rate": 4.017587282512181e-05,
|
|
"loss": 0.1292,
|
|
"num_input_tokens_seen": 13458096,
|
|
"step": 123,
|
|
"train_runtime": 1425.7005,
|
|
"train_tokens_per_second": 9439.637
|
|
},
|
|
{
|
|
"epoch": 1.8539325842696628,
|
|
"grad_norm": 0.2948530614376068,
|
|
"learning_rate": 3.9967695716580224e-05,
|
|
"loss": 0.1175,
|
|
"num_input_tokens_seen": 13566016,
|
|
"step": 124,
|
|
"train_runtime": 1437.6399,
|
|
"train_tokens_per_second": 9436.31
|
|
},
|
|
{
|
|
"epoch": 1.8689138576779025,
|
|
"grad_norm": 0.22196036577224731,
|
|
"learning_rate": 3.975788811947351e-05,
|
|
"loss": 0.1814,
|
|
"num_input_tokens_seen": 13676808,
|
|
"step": 125,
|
|
"train_runtime": 1449.4544,
|
|
"train_tokens_per_second": 9435.832
|
|
},
|
|
{
|
|
"epoch": 1.8689138576779025,
|
|
"eval_accuracy": 0.9412339186185766,
|
|
"eval_loss": 0.18464037775993347,
|
|
"eval_runtime": 4.9271,
|
|
"eval_samples_per_second": 11.569,
|
|
"eval_steps_per_second": 3.044,
|
|
"num_input_tokens_seen": 13676808,
|
|
"step": 125
|
|
},
|
|
{
|
|
"epoch": 1.8838951310861423,
|
|
"grad_norm": 0.3131586015224457,
|
|
"learning_rate": 3.954647288894883e-05,
|
|
"loss": 0.0969,
|
|
"num_input_tokens_seen": 13785624,
|
|
"step": 126,
|
|
"train_runtime": 1466.2368,
|
|
"train_tokens_per_second": 9402.045
|
|
},
|
|
{
|
|
"epoch": 1.898876404494382,
|
|
"grad_norm": 0.2623322606086731,
|
|
"learning_rate": 3.933347305527898e-05,
|
|
"loss": 0.1431,
|
|
"num_input_tokens_seen": 13896368,
|
|
"step": 127,
|
|
"train_runtime": 1478.1099,
|
|
"train_tokens_per_second": 9401.444
|
|
},
|
|
{
|
|
"epoch": 1.9138576779026217,
|
|
"grad_norm": 0.3807845711708069,
|
|
"learning_rate": 3.911891182135371e-05,
|
|
"loss": 0.1552,
|
|
"num_input_tokens_seen": 14010984,
|
|
"step": 128,
|
|
"train_runtime": 1490.1185,
|
|
"train_tokens_per_second": 9402.597
|
|
},
|
|
{
|
|
"epoch": 1.9288389513108615,
|
|
"grad_norm": 0.36109474301338196,
|
|
"learning_rate": 3.8902812560152066e-05,
|
|
"loss": 0.1472,
|
|
"num_input_tokens_seen": 14112168,
|
|
"step": 129,
|
|
"train_runtime": 1501.7018,
|
|
"train_tokens_per_second": 9397.45
|
|
},
|
|
{
|
|
"epoch": 1.9438202247191012,
|
|
"grad_norm": 0.20871855318546295,
|
|
"learning_rate": 3.868519881219631e-05,
|
|
"loss": 0.1115,
|
|
"num_input_tokens_seen": 14227128,
|
|
"step": 130,
|
|
"train_runtime": 1513.4596,
|
|
"train_tokens_per_second": 9400.402
|
|
},
|
|
{
|
|
"epoch": 1.958801498127341,
|
|
"grad_norm": 0.8112205266952515,
|
|
"learning_rate": 3.846609428298757e-05,
|
|
"loss": 0.1027,
|
|
"num_input_tokens_seen": 14342592,
|
|
"step": 131,
|
|
"train_runtime": 1525.4083,
|
|
"train_tokens_per_second": 9402.461
|
|
},
|
|
{
|
|
"epoch": 1.9737827715355807,
|
|
"grad_norm": 0.20138965547084808,
|
|
"learning_rate": 3.824552284042351e-05,
|
|
"loss": 0.1057,
|
|
"num_input_tokens_seen": 14461768,
|
|
"step": 132,
|
|
"train_runtime": 1537.3708,
|
|
"train_tokens_per_second": 9406.818
|
|
},
|
|
{
|
|
"epoch": 1.9887640449438202,
|
|
"grad_norm": 0.2765468657016754,
|
|
"learning_rate": 3.8023508512198256e-05,
|
|
"loss": 0.1326,
|
|
"num_input_tokens_seen": 14568520,
|
|
"step": 133,
|
|
"train_runtime": 1549.1006,
|
|
"train_tokens_per_second": 9404.502
|
|
},
|
|
{
|
|
"epoch": 2.0,
|
|
"grad_norm": 0.3718896806240082,
|
|
"learning_rate": 3.780007548318507e-05,
|
|
"loss": 0.1245,
|
|
"num_input_tokens_seen": 14641496,
|
|
"step": 134,
|
|
"train_runtime": 1558.8821,
|
|
"train_tokens_per_second": 9392.305
|
|
},
|
|
{
|
|
"epoch": 2.0149812734082397,
|
|
"grad_norm": 0.3595190942287445,
|
|
"learning_rate": 3.7575248092801686e-05,
|
|
"loss": 0.158,
|
|
"num_input_tokens_seen": 14745856,
|
|
"step": 135,
|
|
"train_runtime": 1570.765,
|
|
"train_tokens_per_second": 9387.691
|
|
},
|
|
{
|
|
"epoch": 2.0299625468164795,
|
|
"grad_norm": 0.31272247433662415,
|
|
"learning_rate": 3.734905083235901e-05,
|
|
"loss": 0.122,
|
|
"num_input_tokens_seen": 14851856,
|
|
"step": 136,
|
|
"train_runtime": 1582.6874,
|
|
"train_tokens_per_second": 9383.948
|
|
},
|
|
{
|
|
"epoch": 2.044943820224719,
|
|
"grad_norm": 0.3598696291446686,
|
|
"learning_rate": 3.712150834239313e-05,
|
|
"loss": 0.1392,
|
|
"num_input_tokens_seen": 14962208,
|
|
"step": 137,
|
|
"train_runtime": 1594.3777,
|
|
"train_tokens_per_second": 9384.356
|
|
},
|
|
{
|
|
"epoch": 2.059925093632959,
|
|
"grad_norm": 0.45219355821609497,
|
|
"learning_rate": 3.689264540998116e-05,
|
|
"loss": 0.0892,
|
|
"num_input_tokens_seen": 15071712,
|
|
"step": 138,
|
|
"train_runtime": 1606.014,
|
|
"train_tokens_per_second": 9384.546
|
|
},
|
|
{
|
|
"epoch": 2.0749063670411987,
|
|
"grad_norm": 0.18551455438137054,
|
|
"learning_rate": 3.66624869660411e-05,
|
|
"loss": 0.0706,
|
|
"num_input_tokens_seen": 15178568,
|
|
"step": 139,
|
|
"train_runtime": 1617.5902,
|
|
"train_tokens_per_second": 9383.444
|
|
},
|
|
{
|
|
"epoch": 2.0898876404494384,
|
|
"grad_norm": 0.1813335120677948,
|
|
"learning_rate": 3.6431058082615964e-05,
|
|
"loss": 0.0695,
|
|
"num_input_tokens_seen": 15295296,
|
|
"step": 140,
|
|
"train_runtime": 1629.2133,
|
|
"train_tokens_per_second": 9388.148
|
|
},
|
|
{
|
|
"epoch": 2.1048689138576777,
|
|
"grad_norm": 0.25384795665740967,
|
|
"learning_rate": 3.619838397014263e-05,
|
|
"loss": 0.1314,
|
|
"num_input_tokens_seen": 15401968,
|
|
"step": 141,
|
|
"train_runtime": 1640.8051,
|
|
"train_tokens_per_second": 9386.836
|
|
},
|
|
{
|
|
"epoch": 2.1198501872659175,
|
|
"grad_norm": 1.163858413696289,
|
|
"learning_rate": 3.5964489974705553e-05,
|
|
"loss": 0.1043,
|
|
"num_input_tokens_seen": 15510128,
|
|
"step": 142,
|
|
"train_runtime": 1652.1467,
|
|
"train_tokens_per_second": 9387.864
|
|
},
|
|
{
|
|
"epoch": 2.134831460674157,
|
|
"grad_norm": 0.280141144990921,
|
|
"learning_rate": 3.572940157527572e-05,
|
|
"loss": 0.1566,
|
|
"num_input_tokens_seen": 15606536,
|
|
"step": 143,
|
|
"train_runtime": 1663.2096,
|
|
"train_tokens_per_second": 9383.385
|
|
},
|
|
{
|
|
"epoch": 2.149812734082397,
|
|
"grad_norm": 0.371442973613739,
|
|
"learning_rate": 3.549314438093515e-05,
|
|
"loss": 0.0907,
|
|
"num_input_tokens_seen": 15717520,
|
|
"step": 144,
|
|
"train_runtime": 1674.5727,
|
|
"train_tokens_per_second": 9385.989
|
|
},
|
|
{
|
|
"epoch": 2.1647940074906367,
|
|
"grad_norm": 0.2719246447086334,
|
|
"learning_rate": 3.525574412808717e-05,
|
|
"loss": 0.1258,
|
|
"num_input_tokens_seen": 15827848,
|
|
"step": 145,
|
|
"train_runtime": 1686.0755,
|
|
"train_tokens_per_second": 9387.39
|
|
},
|
|
{
|
|
"epoch": 2.1797752808988764,
|
|
"grad_norm": 0.25204575061798096,
|
|
"learning_rate": 3.501722667765286e-05,
|
|
"loss": 0.1402,
|
|
"num_input_tokens_seen": 15934960,
|
|
"step": 146,
|
|
"train_runtime": 1697.5903,
|
|
"train_tokens_per_second": 9386.812
|
|
},
|
|
{
|
|
"epoch": 2.194756554307116,
|
|
"grad_norm": 0.20074494183063507,
|
|
"learning_rate": 3.47776180122539e-05,
|
|
"loss": 0.0751,
|
|
"num_input_tokens_seen": 16038664,
|
|
"step": 147,
|
|
"train_runtime": 1708.9323,
|
|
"train_tokens_per_second": 9385.196
|
|
},
|
|
{
|
|
"epoch": 2.209737827715356,
|
|
"grad_norm": 0.32247740030288696,
|
|
"learning_rate": 3.453694423338225e-05,
|
|
"loss": 0.1599,
|
|
"num_input_tokens_seen": 16142344,
|
|
"step": 148,
|
|
"train_runtime": 1719.9895,
|
|
"train_tokens_per_second": 9385.141
|
|
},
|
|
{
|
|
"epoch": 2.2247191011235956,
|
|
"grad_norm": 0.2926752269268036,
|
|
"learning_rate": 3.4295231558556715e-05,
|
|
"loss": 0.1017,
|
|
"num_input_tokens_seen": 16242008,
|
|
"step": 149,
|
|
"train_runtime": 1731.3942,
|
|
"train_tokens_per_second": 9380.884
|
|
},
|
|
{
|
|
"epoch": 2.2397003745318353,
|
|
"grad_norm": 0.284390926361084,
|
|
"learning_rate": 3.4052506318467084e-05,
|
|
"loss": 0.0857,
|
|
"num_input_tokens_seen": 16353368,
|
|
"step": 150,
|
|
"train_runtime": 1742.8962,
|
|
"train_tokens_per_second": 9382.87
|
|
},
|
|
{
|
|
"epoch": 2.2397003745318353,
|
|
"eval_accuracy": 0.9414076449679479,
|
|
"eval_loss": 0.1802486777305603,
|
|
"eval_runtime": 4.927,
|
|
"eval_samples_per_second": 11.569,
|
|
"eval_steps_per_second": 3.044,
|
|
"num_input_tokens_seen": 16353368,
|
|
"step": 150
|
|
},
|
|
{
|
|
"epoch": 2.254681647940075,
|
|
"grad_norm": 0.24835243821144104,
|
|
"learning_rate": 3.3808794954105716e-05,
|
|
"loss": 0.12,
|
|
"num_input_tokens_seen": 16462800,
|
|
"step": 151,
|
|
"train_runtime": 1759.3705,
|
|
"train_tokens_per_second": 9357.211
|
|
},
|
|
{
|
|
"epoch": 2.2696629213483144,
|
|
"grad_norm": 8.10318660736084,
|
|
"learning_rate": 3.356412401388732e-05,
|
|
"loss": 0.202,
|
|
"num_input_tokens_seen": 16576136,
|
|
"step": 152,
|
|
"train_runtime": 1770.9609,
|
|
"train_tokens_per_second": 9359.967
|
|
},
|
|
{
|
|
"epoch": 2.284644194756554,
|
|
"grad_norm": 0.20310688018798828,
|
|
"learning_rate": 3.3318520150756846e-05,
|
|
"loss": 0.0774,
|
|
"num_input_tokens_seen": 16685072,
|
|
"step": 153,
|
|
"train_runtime": 1782.4318,
|
|
"train_tokens_per_second": 9360.847
|
|
},
|
|
{
|
|
"epoch": 2.299625468164794,
|
|
"grad_norm": 0.2707997262477875,
|
|
"learning_rate": 3.307201011928616e-05,
|
|
"loss": 0.0896,
|
|
"num_input_tokens_seen": 16799472,
|
|
"step": 154,
|
|
"train_runtime": 1793.9015,
|
|
"train_tokens_per_second": 9364.768
|
|
},
|
|
{
|
|
"epoch": 2.3146067415730336,
|
|
"grad_norm": 0.33697864413261414,
|
|
"learning_rate": 3.282462077275947e-05,
|
|
"loss": 0.1516,
|
|
"num_input_tokens_seen": 16916072,
|
|
"step": 155,
|
|
"train_runtime": 1805.4847,
|
|
"train_tokens_per_second": 9369.269
|
|
},
|
|
{
|
|
"epoch": 2.3295880149812733,
|
|
"grad_norm": 0.27440527081489563,
|
|
"learning_rate": 3.257637906024822e-05,
|
|
"loss": 0.1394,
|
|
"num_input_tokens_seen": 17036352,
|
|
"step": 156,
|
|
"train_runtime": 1817.14,
|
|
"train_tokens_per_second": 9375.366
|
|
},
|
|
{
|
|
"epoch": 2.344569288389513,
|
|
"grad_norm": 0.24285127222537994,
|
|
"learning_rate": 3.2327312023675287e-05,
|
|
"loss": 0.1162,
|
|
"num_input_tokens_seen": 17141704,
|
|
"step": 157,
|
|
"train_runtime": 1828.6308,
|
|
"train_tokens_per_second": 9374.065
|
|
},
|
|
{
|
|
"epoch": 2.359550561797753,
|
|
"grad_norm": 0.3084551692008972,
|
|
"learning_rate": 3.2077446794869295e-05,
|
|
"loss": 0.1081,
|
|
"num_input_tokens_seen": 17247616,
|
|
"step": 158,
|
|
"train_runtime": 1840.114,
|
|
"train_tokens_per_second": 9373.123
|
|
},
|
|
{
|
|
"epoch": 2.3745318352059925,
|
|
"grad_norm": 0.2344665825366974,
|
|
"learning_rate": 3.1826810592609036e-05,
|
|
"loss": 0.1278,
|
|
"num_input_tokens_seen": 17360352,
|
|
"step": 159,
|
|
"train_runtime": 1851.4615,
|
|
"train_tokens_per_second": 9376.567
|
|
},
|
|
{
|
|
"epoch": 2.3895131086142323,
|
|
"grad_norm": 0.26749441027641296,
|
|
"learning_rate": 3.157543071965835e-05,
|
|
"loss": 0.1027,
|
|
"num_input_tokens_seen": 17472040,
|
|
"step": 160,
|
|
"train_runtime": 1863.0215,
|
|
"train_tokens_per_second": 9378.335
|
|
},
|
|
{
|
|
"epoch": 2.404494382022472,
|
|
"grad_norm": 0.543249785900116,
|
|
"learning_rate": 3.132333455979202e-05,
|
|
"loss": 0.1247,
|
|
"num_input_tokens_seen": 17579232,
|
|
"step": 161,
|
|
"train_runtime": 1874.575,
|
|
"train_tokens_per_second": 9377.716
|
|
},
|
|
{
|
|
"epoch": 2.4194756554307117,
|
|
"grad_norm": 0.1824382096529007,
|
|
"learning_rate": 3.107054957481271e-05,
|
|
"loss": 0.0773,
|
|
"num_input_tokens_seen": 17686392,
|
|
"step": 162,
|
|
"train_runtime": 1886.1923,
|
|
"train_tokens_per_second": 9376.77
|
|
},
|
|
{
|
|
"epoch": 2.4344569288389515,
|
|
"grad_norm": 0.1845661848783493,
|
|
"learning_rate": 3.081710330155942e-05,
|
|
"loss": 0.0579,
|
|
"num_input_tokens_seen": 17800024,
|
|
"step": 163,
|
|
"train_runtime": 1897.7131,
|
|
"train_tokens_per_second": 9379.723
|
|
},
|
|
{
|
|
"epoch": 2.449438202247191,
|
|
"grad_norm": 0.6334578394889832,
|
|
"learning_rate": 3.056302334890786e-05,
|
|
"loss": 0.0756,
|
|
"num_input_tokens_seen": 17909576,
|
|
"step": 164,
|
|
"train_runtime": 1909.3707,
|
|
"train_tokens_per_second": 9379.832
|
|
},
|
|
{
|
|
"epoch": 2.464419475655431,
|
|
"grad_norm": 0.28113627433776855,
|
|
"learning_rate": 3.030833739476285e-05,
|
|
"loss": 0.1386,
|
|
"num_input_tokens_seen": 18009360,
|
|
"step": 165,
|
|
"train_runtime": 1920.56,
|
|
"train_tokens_per_second": 9377.14
|
|
},
|
|
{
|
|
"epoch": 2.4794007490636703,
|
|
"grad_norm": 0.3643280267715454,
|
|
"learning_rate": 3.0053073183043256e-05,
|
|
"loss": 0.1432,
|
|
"num_input_tokens_seen": 18114736,
|
|
"step": 166,
|
|
"train_runtime": 1931.8708,
|
|
"train_tokens_per_second": 9376.785
|
|
},
|
|
{
|
|
"epoch": 2.49438202247191,
|
|
"grad_norm": 0.2565111815929413,
|
|
"learning_rate": 2.979725852065981e-05,
|
|
"loss": 0.1071,
|
|
"num_input_tokens_seen": 18226888,
|
|
"step": 167,
|
|
"train_runtime": 1943.3904,
|
|
"train_tokens_per_second": 9378.912
|
|
},
|
|
{
|
|
"epoch": 2.5093632958801497,
|
|
"grad_norm": 0.27584174275398254,
|
|
"learning_rate": 2.954092127448591e-05,
|
|
"loss": 0.114,
|
|
"num_input_tokens_seen": 18338720,
|
|
"step": 168,
|
|
"train_runtime": 1954.9951,
|
|
"train_tokens_per_second": 9380.443
|
|
},
|
|
{
|
|
"epoch": 2.5243445692883895,
|
|
"grad_norm": 0.22883868217468262,
|
|
"learning_rate": 2.9284089368322045e-05,
|
|
"loss": 0.0981,
|
|
"num_input_tokens_seen": 18451496,
|
|
"step": 169,
|
|
"train_runtime": 1966.6195,
|
|
"train_tokens_per_second": 9382.341
|
|
},
|
|
{
|
|
"epoch": 2.539325842696629,
|
|
"grad_norm": 0.33810093998908997,
|
|
"learning_rate": 2.9026790779853874e-05,
|
|
"loss": 0.1347,
|
|
"num_input_tokens_seen": 18556776,
|
|
"step": 170,
|
|
"train_runtime": 1977.785,
|
|
"train_tokens_per_second": 9382.605
|
|
},
|
|
{
|
|
"epoch": 2.554307116104869,
|
|
"grad_norm": 0.31047242879867554,
|
|
"learning_rate": 2.876905353760459e-05,
|
|
"loss": 0.0833,
|
|
"num_input_tokens_seen": 18664112,
|
|
"step": 171,
|
|
"train_runtime": 1989.3899,
|
|
"train_tokens_per_second": 9381.827
|
|
},
|
|
{
|
|
"epoch": 2.5692883895131087,
|
|
"grad_norm": 0.25530344247817993,
|
|
"learning_rate": 2.8510905717881614e-05,
|
|
"loss": 0.1111,
|
|
"num_input_tokens_seen": 18769448,
|
|
"step": 172,
|
|
"train_runtime": 2000.6604,
|
|
"train_tokens_per_second": 9381.626
|
|
},
|
|
{
|
|
"epoch": 2.5842696629213484,
|
|
"grad_norm": 0.2946469187736511,
|
|
"learning_rate": 2.8252375441718137e-05,
|
|
"loss": 0.1501,
|
|
"num_input_tokens_seen": 18884864,
|
|
"step": 173,
|
|
"train_runtime": 2012.1806,
|
|
"train_tokens_per_second": 9385.273
|
|
},
|
|
{
|
|
"epoch": 2.599250936329588,
|
|
"grad_norm": 0.3773297667503357,
|
|
"learning_rate": 2.7993490871809808e-05,
|
|
"loss": 0.1171,
|
|
"num_input_tokens_seen": 18993424,
|
|
"step": 174,
|
|
"train_runtime": 2023.6738,
|
|
"train_tokens_per_second": 9385.616
|
|
},
|
|
{
|
|
"epoch": 2.6142322097378274,
|
|
"grad_norm": 0.2301235944032669,
|
|
"learning_rate": 2.7734280209446865e-05,
|
|
"loss": 0.1261,
|
|
"num_input_tokens_seen": 19111296,
|
|
"step": 175,
|
|
"train_runtime": 2035.2921,
|
|
"train_tokens_per_second": 9389.953
|
|
},
|
|
{
|
|
"epoch": 2.6142322097378274,
|
|
"eval_accuracy": 0.943552523084463,
|
|
"eval_loss": 0.1688879132270813,
|
|
"eval_runtime": 4.9275,
|
|
"eval_samples_per_second": 11.568,
|
|
"eval_steps_per_second": 3.044,
|
|
"num_input_tokens_seen": 19111296,
|
|
"step": 175
|
|
},
|
|
{
|
|
"epoch": 2.629213483146067,
|
|
"grad_norm": 0.22906967997550964,
|
|
"learning_rate": 2.7474771691442018e-05,
|
|
"loss": 0.0987,
|
|
"num_input_tokens_seen": 19213824,
|
|
"step": 176,
|
|
"train_runtime": 2051.4065,
|
|
"train_tokens_per_second": 9366.171
|
|
},
|
|
{
|
|
"epoch": 2.644194756554307,
|
|
"grad_norm": 0.2511672079563141,
|
|
"learning_rate": 2.721499358705458e-05,
|
|
"loss": 0.054,
|
|
"num_input_tokens_seen": 19338104,
|
|
"step": 177,
|
|
"train_runtime": 2063.0236,
|
|
"train_tokens_per_second": 9373.671
|
|
},
|
|
{
|
|
"epoch": 2.6591760299625467,
|
|
"grad_norm": 0.3024023771286011,
|
|
"learning_rate": 2.6954974194910888e-05,
|
|
"loss": 0.0683,
|
|
"num_input_tokens_seen": 19449848,
|
|
"step": 178,
|
|
"train_runtime": 2074.6512,
|
|
"train_tokens_per_second": 9374.997
|
|
},
|
|
{
|
|
"epoch": 2.6741573033707864,
|
|
"grad_norm": 0.1516094207763672,
|
|
"learning_rate": 2.6694741839921732e-05,
|
|
"loss": 0.1121,
|
|
"num_input_tokens_seen": 19571008,
|
|
"step": 179,
|
|
"train_runtime": 2086.2509,
|
|
"train_tokens_per_second": 9380.946
|
|
},
|
|
{
|
|
"epoch": 2.689138576779026,
|
|
"grad_norm": 0.20220297574996948,
|
|
"learning_rate": 2.6434324870196748e-05,
|
|
"loss": 0.0888,
|
|
"num_input_tokens_seen": 19686872,
|
|
"step": 180,
|
|
"train_runtime": 2097.6886,
|
|
"train_tokens_per_second": 9385.031
|
|
},
|
|
{
|
|
"epoch": 2.704119850187266,
|
|
"grad_norm": 0.504995584487915,
|
|
"learning_rate": 2.617375165395634e-05,
|
|
"loss": 0.0751,
|
|
"num_input_tokens_seen": 19797960,
|
|
"step": 181,
|
|
"train_runtime": 2109.1207,
|
|
"train_tokens_per_second": 9386.831
|
|
},
|
|
{
|
|
"epoch": 2.7191011235955056,
|
|
"grad_norm": 0.24343626201152802,
|
|
"learning_rate": 2.5913050576441477e-05,
|
|
"loss": 0.1033,
|
|
"num_input_tokens_seen": 19905184,
|
|
"step": 182,
|
|
"train_runtime": 2120.593,
|
|
"train_tokens_per_second": 9386.612
|
|
},
|
|
{
|
|
"epoch": 2.7340823970037453,
|
|
"grad_norm": 0.20072679221630096,
|
|
"learning_rate": 2.5652250036821523e-05,
|
|
"loss": 0.0867,
|
|
"num_input_tokens_seen": 20013120,
|
|
"step": 183,
|
|
"train_runtime": 2132.1281,
|
|
"train_tokens_per_second": 9386.453
|
|
},
|
|
{
|
|
"epoch": 2.749063670411985,
|
|
"grad_norm": 0.22807280719280243,
|
|
"learning_rate": 2.5391378445100644e-05,
|
|
"loss": 0.1323,
|
|
"num_input_tokens_seen": 20109488,
|
|
"step": 184,
|
|
"train_runtime": 2143.2117,
|
|
"train_tokens_per_second": 9382.875
|
|
},
|
|
{
|
|
"epoch": 2.764044943820225,
|
|
"grad_norm": 0.2813800275325775,
|
|
"learning_rate": 2.5130464219022992e-05,
|
|
"loss": 0.0935,
|
|
"num_input_tokens_seen": 20227088,
|
|
"step": 185,
|
|
"train_runtime": 2154.761,
|
|
"train_tokens_per_second": 9387.161
|
|
},
|
|
{
|
|
"epoch": 2.7790262172284645,
|
|
"grad_norm": 0.21835725009441376,
|
|
"learning_rate": 2.486953578097702e-05,
|
|
"loss": 0.095,
|
|
"num_input_tokens_seen": 20330176,
|
|
"step": 186,
|
|
"train_runtime": 2166.2477,
|
|
"train_tokens_per_second": 9384.973
|
|
},
|
|
{
|
|
"epoch": 2.7940074906367043,
|
|
"grad_norm": 0.23183397948741913,
|
|
"learning_rate": 2.4608621554899362e-05,
|
|
"loss": 0.1094,
|
|
"num_input_tokens_seen": 20448288,
|
|
"step": 187,
|
|
"train_runtime": 2177.8806,
|
|
"train_tokens_per_second": 9389.077
|
|
},
|
|
{
|
|
"epoch": 2.808988764044944,
|
|
"grad_norm": 0.24513450264930725,
|
|
"learning_rate": 2.4347749963178486e-05,
|
|
"loss": 0.094,
|
|
"num_input_tokens_seen": 20552120,
|
|
"step": 188,
|
|
"train_runtime": 2189.3542,
|
|
"train_tokens_per_second": 9387.298
|
|
},
|
|
{
|
|
"epoch": 2.8239700374531838,
|
|
"grad_norm": 0.23791368305683136,
|
|
"learning_rate": 2.4086949423558526e-05,
|
|
"loss": 0.0948,
|
|
"num_input_tokens_seen": 20664640,
|
|
"step": 189,
|
|
"train_runtime": 2201.0212,
|
|
"train_tokens_per_second": 9388.66
|
|
},
|
|
{
|
|
"epoch": 2.8389513108614235,
|
|
"grad_norm": 0.2168198823928833,
|
|
"learning_rate": 2.3826248346043663e-05,
|
|
"loss": 0.0838,
|
|
"num_input_tokens_seen": 20777328,
|
|
"step": 190,
|
|
"train_runtime": 2212.6238,
|
|
"train_tokens_per_second": 9390.357
|
|
},
|
|
{
|
|
"epoch": 2.853932584269663,
|
|
"grad_norm": 0.20405888557434082,
|
|
"learning_rate": 2.356567512980326e-05,
|
|
"loss": 0.1071,
|
|
"num_input_tokens_seen": 20895424,
|
|
"step": 191,
|
|
"train_runtime": 2224.2739,
|
|
"train_tokens_per_second": 9394.267
|
|
},
|
|
{
|
|
"epoch": 2.8689138576779025,
|
|
"grad_norm": 0.21420727670192719,
|
|
"learning_rate": 2.3305258160078274e-05,
|
|
"loss": 0.0939,
|
|
"num_input_tokens_seen": 21007912,
|
|
"step": 192,
|
|
"train_runtime": 2235.8311,
|
|
"train_tokens_per_second": 9396.019
|
|
},
|
|
{
|
|
"epoch": 2.8838951310861423,
|
|
"grad_norm": 0.27938148379325867,
|
|
"learning_rate": 2.3045025805089118e-05,
|
|
"loss": 0.1093,
|
|
"num_input_tokens_seen": 21112424,
|
|
"step": 193,
|
|
"train_runtime": 2246.9601,
|
|
"train_tokens_per_second": 9395.994
|
|
},
|
|
{
|
|
"epoch": 2.898876404494382,
|
|
"grad_norm": 0.3431660830974579,
|
|
"learning_rate": 2.278500641294543e-05,
|
|
"loss": 0.1156,
|
|
"num_input_tokens_seen": 21221136,
|
|
"step": 194,
|
|
"train_runtime": 2258.1117,
|
|
"train_tokens_per_second": 9397.735
|
|
},
|
|
{
|
|
"epoch": 2.9138576779026217,
|
|
"grad_norm": 0.40309008955955505,
|
|
"learning_rate": 2.252522830855798e-05,
|
|
"loss": 0.0693,
|
|
"num_input_tokens_seen": 21331720,
|
|
"step": 195,
|
|
"train_runtime": 2269.7189,
|
|
"train_tokens_per_second": 9398.397
|
|
},
|
|
{
|
|
"epoch": 2.9288389513108615,
|
|
"grad_norm": 0.1703529804944992,
|
|
"learning_rate": 2.2265719790553147e-05,
|
|
"loss": 0.0907,
|
|
"num_input_tokens_seen": 21447512,
|
|
"step": 196,
|
|
"train_runtime": 2281.3042,
|
|
"train_tokens_per_second": 9401.426
|
|
},
|
|
{
|
|
"epoch": 2.943820224719101,
|
|
"grad_norm": 0.1772414594888687,
|
|
"learning_rate": 2.2006509128190195e-05,
|
|
"loss": 0.0821,
|
|
"num_input_tokens_seen": 21553192,
|
|
"step": 197,
|
|
"train_runtime": 2292.9216,
|
|
"train_tokens_per_second": 9399.882
|
|
},
|
|
{
|
|
"epoch": 2.958801498127341,
|
|
"grad_norm": 0.22802460193634033,
|
|
"learning_rate": 2.174762455828187e-05,
|
|
"loss": 0.1252,
|
|
"num_input_tokens_seen": 21655488,
|
|
"step": 198,
|
|
"train_runtime": 2304.2302,
|
|
"train_tokens_per_second": 9398.144
|
|
},
|
|
{
|
|
"epoch": 2.9737827715355807,
|
|
"grad_norm": 0.20786331593990326,
|
|
"learning_rate": 2.1489094282118395e-05,
|
|
"loss": 0.0859,
|
|
"num_input_tokens_seen": 21767256,
|
|
"step": 199,
|
|
"train_runtime": 2315.7419,
|
|
"train_tokens_per_second": 9399.69
|
|
},
|
|
{
|
|
"epoch": 2.98876404494382,
|
|
"grad_norm": 0.23133137822151184,
|
|
"learning_rate": 2.123094646239541e-05,
|
|
"loss": 0.1024,
|
|
"num_input_tokens_seen": 21879928,
|
|
"step": 200,
|
|
"train_runtime": 2327.2273,
|
|
"train_tokens_per_second": 9401.715
|
|
},
|
|
{
|
|
"epoch": 2.98876404494382,
|
|
"eval_accuracy": 0.9457574927378564,
|
|
"eval_loss": 0.1642482578754425,
|
|
"eval_runtime": 4.9178,
|
|
"eval_samples_per_second": 11.59,
|
|
"eval_steps_per_second": 3.05,
|
|
"num_input_tokens_seen": 21879928,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 3.0,
|
|
"grad_norm": 0.3909849524497986,
|
|
"learning_rate": 2.0973209220146135e-05,
|
|
"loss": 0.1114,
|
|
"num_input_tokens_seen": 21962520,
|
|
"step": 201,
|
|
"train_runtime": 2341.9985,
|
|
"train_tokens_per_second": 9377.683
|
|
},
|
|
{
|
|
"epoch": 3.0149812734082397,
|
|
"grad_norm": 0.2800769805908203,
|
|
"learning_rate": 2.0715910631677968e-05,
|
|
"loss": 0.0762,
|
|
"num_input_tokens_seen": 22064872,
|
|
"step": 202,
|
|
"train_runtime": 2353.5238,
|
|
"train_tokens_per_second": 9375.249
|
|
},
|
|
{
|
|
"epoch": 3.0299625468164795,
|
|
"grad_norm": 0.17772997915744781,
|
|
"learning_rate": 2.0459078725514092e-05,
|
|
"loss": 0.0883,
|
|
"num_input_tokens_seen": 22169728,
|
|
"step": 203,
|
|
"train_runtime": 2364.4445,
|
|
"train_tokens_per_second": 9376.295
|
|
},
|
|
{
|
|
"epoch": 3.044943820224719,
|
|
"grad_norm": 0.21998678147792816,
|
|
"learning_rate": 2.020274147934019e-05,
|
|
"loss": 0.0756,
|
|
"num_input_tokens_seen": 22285928,
|
|
"step": 204,
|
|
"train_runtime": 2376.0541,
|
|
"train_tokens_per_second": 9379.386
|
|
},
|
|
{
|
|
"epoch": 3.059925093632959,
|
|
"grad_norm": 0.2388005405664444,
|
|
"learning_rate": 1.9946926816956743e-05,
|
|
"loss": 0.0887,
|
|
"num_input_tokens_seen": 22387040,
|
|
"step": 205,
|
|
"train_runtime": 2387.1231,
|
|
"train_tokens_per_second": 9378.251
|
|
},
|
|
{
|
|
"epoch": 3.0749063670411987,
|
|
"grad_norm": 0.2441033124923706,
|
|
"learning_rate": 1.9691662605237166e-05,
|
|
"loss": 0.0926,
|
|
"num_input_tokens_seen": 22498720,
|
|
"step": 206,
|
|
"train_runtime": 2398.4091,
|
|
"train_tokens_per_second": 9380.685
|
|
},
|
|
{
|
|
"epoch": 3.0898876404494384,
|
|
"grad_norm": 0.16214871406555176,
|
|
"learning_rate": 1.9436976651092144e-05,
|
|
"loss": 0.1224,
|
|
"num_input_tokens_seen": 22621072,
|
|
"step": 207,
|
|
"train_runtime": 2410.0394,
|
|
"train_tokens_per_second": 9386.184
|
|
},
|
|
{
|
|
"epoch": 3.1048689138576777,
|
|
"grad_norm": 0.39781343936920166,
|
|
"learning_rate": 1.9182896698440584e-05,
|
|
"loss": 0.0856,
|
|
"num_input_tokens_seen": 22724704,
|
|
"step": 208,
|
|
"train_runtime": 2421.5513,
|
|
"train_tokens_per_second": 9384.358
|
|
},
|
|
{
|
|
"epoch": 3.1198501872659175,
|
|
"grad_norm": 0.2390083521604538,
|
|
"learning_rate": 1.89294504251873e-05,
|
|
"loss": 0.0621,
|
|
"num_input_tokens_seen": 22838936,
|
|
"step": 209,
|
|
"train_runtime": 2433.1409,
|
|
"train_tokens_per_second": 9386.606
|
|
},
|
|
{
|
|
"epoch": 3.134831460674157,
|
|
"grad_norm": 0.2981702387332916,
|
|
"learning_rate": 1.867666544020798e-05,
|
|
"loss": 0.1196,
|
|
"num_input_tokens_seen": 22939008,
|
|
"step": 210,
|
|
"train_runtime": 2444.3526,
|
|
"train_tokens_per_second": 9384.492
|
|
},
|
|
{
|
|
"epoch": 3.149812734082397,
|
|
"grad_norm": 0.3137620687484741,
|
|
"learning_rate": 1.8424569280341653e-05,
|
|
"loss": 0.1071,
|
|
"num_input_tokens_seen": 23054112,
|
|
"step": 211,
|
|
"train_runtime": 2455.964,
|
|
"train_tokens_per_second": 9386.991
|
|
},
|
|
{
|
|
"epoch": 3.1647940074906367,
|
|
"grad_norm": 0.1887480467557907,
|
|
"learning_rate": 1.817318940739098e-05,
|
|
"loss": 0.0932,
|
|
"num_input_tokens_seen": 23156632,
|
|
"step": 212,
|
|
"train_runtime": 2467.4786,
|
|
"train_tokens_per_second": 9384.735
|
|
},
|
|
{
|
|
"epoch": 3.1797752808988764,
|
|
"grad_norm": 0.2509893774986267,
|
|
"learning_rate": 1.7922553205130707e-05,
|
|
"loss": 0.0792,
|
|
"num_input_tokens_seen": 23271912,
|
|
"step": 213,
|
|
"train_runtime": 2479.0903,
|
|
"train_tokens_per_second": 9387.279
|
|
},
|
|
{
|
|
"epoch": 3.194756554307116,
|
|
"grad_norm": 0.15841956436634064,
|
|
"learning_rate": 1.767268797632472e-05,
|
|
"loss": 0.0513,
|
|
"num_input_tokens_seen": 23381816,
|
|
"step": 214,
|
|
"train_runtime": 2490.6787,
|
|
"train_tokens_per_second": 9387.729
|
|
},
|
|
{
|
|
"epoch": 3.209737827715356,
|
|
"grad_norm": 0.18807053565979004,
|
|
"learning_rate": 1.7423620939751788e-05,
|
|
"loss": 0.0903,
|
|
"num_input_tokens_seen": 23489552,
|
|
"step": 215,
|
|
"train_runtime": 2502.2201,
|
|
"train_tokens_per_second": 9387.484
|
|
},
|
|
{
|
|
"epoch": 3.2247191011235956,
|
|
"grad_norm": 0.27771255373954773,
|
|
"learning_rate": 1.7175379227240523e-05,
|
|
"loss": 0.0763,
|
|
"num_input_tokens_seen": 23602136,
|
|
"step": 216,
|
|
"train_runtime": 2513.7932,
|
|
"train_tokens_per_second": 9389.053
|
|
},
|
|
{
|
|
"epoch": 3.2397003745318353,
|
|
"grad_norm": 0.23832547664642334,
|
|
"learning_rate": 1.692798988071385e-05,
|
|
"loss": 0.0656,
|
|
"num_input_tokens_seen": 23705952,
|
|
"step": 217,
|
|
"train_runtime": 2525.2798,
|
|
"train_tokens_per_second": 9387.456
|
|
},
|
|
{
|
|
"epoch": 3.254681647940075,
|
|
"grad_norm": 0.20118069648742676,
|
|
"learning_rate": 1.6681479849243153e-05,
|
|
"loss": 0.1015,
|
|
"num_input_tokens_seen": 23821824,
|
|
"step": 218,
|
|
"train_runtime": 2536.8541,
|
|
"train_tokens_per_second": 9390.301
|
|
},
|
|
{
|
|
"epoch": 3.2696629213483144,
|
|
"grad_norm": 0.24935227632522583,
|
|
"learning_rate": 1.6435875986112685e-05,
|
|
"loss": 0.1126,
|
|
"num_input_tokens_seen": 23933400,
|
|
"step": 219,
|
|
"train_runtime": 2548.3936,
|
|
"train_tokens_per_second": 9391.563
|
|
},
|
|
{
|
|
"epoch": 3.284644194756554,
|
|
"grad_norm": 0.20586885511875153,
|
|
"learning_rate": 1.6191205045894283e-05,
|
|
"loss": 0.0704,
|
|
"num_input_tokens_seen": 24044912,
|
|
"step": 220,
|
|
"train_runtime": 2559.9579,
|
|
"train_tokens_per_second": 9392.698
|
|
},
|
|
{
|
|
"epoch": 3.299625468164794,
|
|
"grad_norm": 0.1685791313648224,
|
|
"learning_rate": 1.594749368153292e-05,
|
|
"loss": 0.0695,
|
|
"num_input_tokens_seen": 24165512,
|
|
"step": 221,
|
|
"train_runtime": 2571.5221,
|
|
"train_tokens_per_second": 9397.357
|
|
},
|
|
{
|
|
"epoch": 3.3146067415730336,
|
|
"grad_norm": 0.21374556422233582,
|
|
"learning_rate": 1.570476844144329e-05,
|
|
"loss": 0.0775,
|
|
"num_input_tokens_seen": 24265384,
|
|
"step": 222,
|
|
"train_runtime": 2582.824,
|
|
"train_tokens_per_second": 9394.904
|
|
},
|
|
{
|
|
"epoch": 3.3295880149812733,
|
|
"grad_norm": 0.2579873204231262,
|
|
"learning_rate": 1.546305576661776e-05,
|
|
"loss": 0.0852,
|
|
"num_input_tokens_seen": 24373048,
|
|
"step": 223,
|
|
"train_runtime": 2594.4338,
|
|
"train_tokens_per_second": 9394.361
|
|
},
|
|
{
|
|
"epoch": 3.344569288389513,
|
|
"grad_norm": 0.2597576379776001,
|
|
"learning_rate": 1.5222381987746104e-05,
|
|
"loss": 0.0791,
|
|
"num_input_tokens_seen": 24483840,
|
|
"step": 224,
|
|
"train_runtime": 2605.9851,
|
|
"train_tokens_per_second": 9395.234
|
|
},
|
|
{
|
|
"epoch": 3.359550561797753,
|
|
"grad_norm": 0.26184743642807007,
|
|
"learning_rate": 1.4982773322347144e-05,
|
|
"loss": 0.0617,
|
|
"num_input_tokens_seen": 24591096,
|
|
"step": 225,
|
|
"train_runtime": 2617.5518,
|
|
"train_tokens_per_second": 9394.693
|
|
},
|
|
{
|
|
"epoch": 3.359550561797753,
|
|
"eval_accuracy": 0.9478207874272477,
|
|
"eval_loss": 0.1583455204963684,
|
|
"eval_runtime": 4.9331,
|
|
"eval_samples_per_second": 11.555,
|
|
"eval_steps_per_second": 3.041,
|
|
"num_input_tokens_seen": 24591096,
|
|
"step": 225
|
|
},
|
|
{
|
|
"epoch": 3.3745318352059925,
|
|
"grad_norm": 0.21227402985095978,
|
|
"learning_rate": 1.4744255871912823e-05,
|
|
"loss": 0.0616,
|
|
"num_input_tokens_seen": 24690968,
|
|
"step": 226,
|
|
"train_runtime": 2633.5828,
|
|
"train_tokens_per_second": 9375.429
|
|
},
|
|
{
|
|
"epoch": 3.3895131086142323,
|
|
"grad_norm": 0.21474283933639526,
|
|
"learning_rate": 1.4506855619064846e-05,
|
|
"loss": 0.0903,
|
|
"num_input_tokens_seen": 24799096,
|
|
"step": 227,
|
|
"train_runtime": 2644.9253,
|
|
"train_tokens_per_second": 9376.104
|
|
},
|
|
{
|
|
"epoch": 3.404494382022472,
|
|
"grad_norm": 0.287079393863678,
|
|
"learning_rate": 1.4270598424724292e-05,
|
|
"loss": 0.0394,
|
|
"num_input_tokens_seen": 24909896,
|
|
"step": 228,
|
|
"train_runtime": 2656.516,
|
|
"train_tokens_per_second": 9376.904
|
|
},
|
|
{
|
|
"epoch": 3.4194756554307117,
|
|
"grad_norm": 0.13763266801834106,
|
|
"learning_rate": 1.4035510025294462e-05,
|
|
"loss": 0.0985,
|
|
"num_input_tokens_seen": 25020096,
|
|
"step": 229,
|
|
"train_runtime": 2668.2051,
|
|
"train_tokens_per_second": 9377.126
|
|
},
|
|
{
|
|
"epoch": 3.4344569288389515,
|
|
"grad_norm": 0.5997536778450012,
|
|
"learning_rate": 1.3801616029857378e-05,
|
|
"loss": 0.0929,
|
|
"num_input_tokens_seen": 25134904,
|
|
"step": 230,
|
|
"train_runtime": 2679.8291,
|
|
"train_tokens_per_second": 9379.294
|
|
},
|
|
{
|
|
"epoch": 3.449438202247191,
|
|
"grad_norm": 0.20695947110652924,
|
|
"learning_rate": 1.3568941917384036e-05,
|
|
"loss": 0.0724,
|
|
"num_input_tokens_seen": 25238032,
|
|
"step": 231,
|
|
"train_runtime": 2691.1959,
|
|
"train_tokens_per_second": 9377.999
|
|
},
|
|
{
|
|
"epoch": 3.464419475655431,
|
|
"grad_norm": 0.20964248478412628,
|
|
"learning_rate": 1.3337513033958904e-05,
|
|
"loss": 0.0646,
|
|
"num_input_tokens_seen": 25346080,
|
|
"step": 232,
|
|
"train_runtime": 2702.7843,
|
|
"train_tokens_per_second": 9377.766
|
|
},
|
|
{
|
|
"epoch": 3.4794007490636703,
|
|
"grad_norm": 0.33709076046943665,
|
|
"learning_rate": 1.310735459001884e-05,
|
|
"loss": 0.0783,
|
|
"num_input_tokens_seen": 25456760,
|
|
"step": 233,
|
|
"train_runtime": 2714.3037,
|
|
"train_tokens_per_second": 9378.744
|
|
},
|
|
{
|
|
"epoch": 3.49438202247191,
|
|
"grad_norm": 0.26522397994995117,
|
|
"learning_rate": 1.2878491657606872e-05,
|
|
"loss": 0.0632,
|
|
"num_input_tokens_seen": 25565392,
|
|
"step": 234,
|
|
"train_runtime": 2725.4324,
|
|
"train_tokens_per_second": 9380.307
|
|
},
|
|
{
|
|
"epoch": 3.5093632958801497,
|
|
"grad_norm": 0.4283091425895691,
|
|
"learning_rate": 1.2650949167640997e-05,
|
|
"loss": 0.0887,
|
|
"num_input_tokens_seen": 25678520,
|
|
"step": 235,
|
|
"train_runtime": 2737.0518,
|
|
"train_tokens_per_second": 9381.817
|
|
},
|
|
{
|
|
"epoch": 3.5243445692883895,
|
|
"grad_norm": 0.20751389861106873,
|
|
"learning_rate": 1.2424751907198312e-05,
|
|
"loss": 0.094,
|
|
"num_input_tokens_seen": 25789432,
|
|
"step": 236,
|
|
"train_runtime": 2748.5904,
|
|
"train_tokens_per_second": 9382.785
|
|
},
|
|
{
|
|
"epoch": 3.539325842696629,
|
|
"grad_norm": 0.26652851700782776,
|
|
"learning_rate": 1.2199924516814939e-05,
|
|
"loss": 0.0623,
|
|
"num_input_tokens_seen": 25893768,
|
|
"step": 237,
|
|
"train_runtime": 2760.1255,
|
|
"train_tokens_per_second": 9381.373
|
|
},
|
|
{
|
|
"epoch": 3.554307116104869,
|
|
"grad_norm": 0.2422049343585968,
|
|
"learning_rate": 1.1976491487801748e-05,
|
|
"loss": 0.1051,
|
|
"num_input_tokens_seen": 26005272,
|
|
"step": 238,
|
|
"train_runtime": 2771.6123,
|
|
"train_tokens_per_second": 9382.723
|
|
},
|
|
{
|
|
"epoch": 3.5692883895131087,
|
|
"grad_norm": 0.20235666632652283,
|
|
"learning_rate": 1.1754477159576499e-05,
|
|
"loss": 0.069,
|
|
"num_input_tokens_seen": 26112160,
|
|
"step": 239,
|
|
"train_runtime": 2783.0863,
|
|
"train_tokens_per_second": 9382.447
|
|
},
|
|
{
|
|
"epoch": 3.5842696629213484,
|
|
"grad_norm": 0.6034291386604309,
|
|
"learning_rate": 1.1533905717012424e-05,
|
|
"loss": 0.0561,
|
|
"num_input_tokens_seen": 26227496,
|
|
"step": 240,
|
|
"train_runtime": 2794.6305,
|
|
"train_tokens_per_second": 9384.96
|
|
},
|
|
{
|
|
"epoch": 3.599250936329588,
|
|
"grad_norm": 0.17024967074394226,
|
|
"learning_rate": 1.1314801187803686e-05,
|
|
"loss": 0.0824,
|
|
"num_input_tokens_seen": 26323944,
|
|
"step": 241,
|
|
"train_runtime": 2805.7185,
|
|
"train_tokens_per_second": 9382.247
|
|
},
|
|
{
|
|
"epoch": 3.6142322097378274,
|
|
"grad_norm": 0.27029407024383545,
|
|
"learning_rate": 1.1097187439847939e-05,
|
|
"loss": 0.083,
|
|
"num_input_tokens_seen": 26423816,
|
|
"step": 242,
|
|
"train_runtime": 2816.7296,
|
|
"train_tokens_per_second": 9381.027
|
|
},
|
|
{
|
|
"epoch": 3.629213483146067,
|
|
"grad_norm": 0.20020800828933716,
|
|
"learning_rate": 1.088108817864629e-05,
|
|
"loss": 0.0969,
|
|
"num_input_tokens_seen": 26530000,
|
|
"step": 243,
|
|
"train_runtime": 2828.3199,
|
|
"train_tokens_per_second": 9380.127
|
|
},
|
|
{
|
|
"epoch": 3.644194756554307,
|
|
"grad_norm": 0.19449175894260406,
|
|
"learning_rate": 1.0666526944721016e-05,
|
|
"loss": 0.0487,
|
|
"num_input_tokens_seen": 26639920,
|
|
"step": 244,
|
|
"train_runtime": 2839.8372,
|
|
"train_tokens_per_second": 9380.791
|
|
},
|
|
{
|
|
"epoch": 3.6591760299625467,
|
|
"grad_norm": 0.19385063648223877,
|
|
"learning_rate": 1.0453527111051184e-05,
|
|
"loss": 0.0861,
|
|
"num_input_tokens_seen": 26755952,
|
|
"step": 245,
|
|
"train_runtime": 2851.3786,
|
|
"train_tokens_per_second": 9383.514
|
|
},
|
|
{
|
|
"epoch": 3.6741573033707864,
|
|
"grad_norm": 0.1747702807188034,
|
|
"learning_rate": 1.0242111880526495e-05,
|
|
"loss": 0.0879,
|
|
"num_input_tokens_seen": 26867776,
|
|
"step": 246,
|
|
"train_runtime": 2862.9101,
|
|
"train_tokens_per_second": 9384.778
|
|
},
|
|
{
|
|
"epoch": 3.689138576779026,
|
|
"grad_norm": 0.2870045304298401,
|
|
"learning_rate": 1.003230428341979e-05,
|
|
"loss": 0.081,
|
|
"num_input_tokens_seen": 26975080,
|
|
"step": 247,
|
|
"train_runtime": 2874.3933,
|
|
"train_tokens_per_second": 9384.617
|
|
},
|
|
{
|
|
"epoch": 3.704119850187266,
|
|
"grad_norm": 0.18543480336666107,
|
|
"learning_rate": 9.824127174878195e-06,
|
|
"loss": 0.0758,
|
|
"num_input_tokens_seen": 27088208,
|
|
"step": 248,
|
|
"train_runtime": 2885.9347,
|
|
"train_tokens_per_second": 9386.286
|
|
},
|
|
{
|
|
"epoch": 3.7191011235955056,
|
|
"grad_norm": 0.4089682698249817,
|
|
"learning_rate": 9.617603232433475e-06,
|
|
"loss": 0.1284,
|
|
"num_input_tokens_seen": 27199040,
|
|
"step": 249,
|
|
"train_runtime": 2897.423,
|
|
"train_tokens_per_second": 9387.321
|
|
},
|
|
{
|
|
"epoch": 3.7340823970037453,
|
|
"grad_norm": 0.23248536884784698,
|
|
"learning_rate": 9.412754953531663e-06,
|
|
"loss": 0.0883,
|
|
"num_input_tokens_seen": 27307192,
|
|
"step": 250,
|
|
"train_runtime": 2909.0185,
|
|
"train_tokens_per_second": 9387.081
|
|
},
|
|
{
|
|
"epoch": 3.7340823970037453,
|
|
"eval_accuracy": 0.9487738647538684,
|
|
"eval_loss": 0.15280824899673462,
|
|
"eval_runtime": 4.9317,
|
|
"eval_samples_per_second": 11.558,
|
|
"eval_steps_per_second": 3.042,
|
|
"num_input_tokens_seen": 27307192,
|
|
"step": 250
|
|
},
|
|
{
|
|
"epoch": 3.749063670411985,
|
|
"grad_norm": 0.2929266691207886,
|
|
"learning_rate": 9.209604653082326e-06,
|
|
"loss": 0.0618,
|
|
"num_input_tokens_seen": 27419216,
|
|
"step": 251,
|
|
"train_runtime": 2925.3909,
|
|
"train_tokens_per_second": 9372.838
|
|
},
|
|
{
|
|
"epoch": 3.764044943820225,
|
|
"grad_norm": 0.18417492508888245,
|
|
"learning_rate": 9.008174461027724e-06,
|
|
"loss": 0.0664,
|
|
"num_input_tokens_seen": 27534416,
|
|
"step": 252,
|
|
"train_runtime": 2936.9289,
|
|
"train_tokens_per_second": 9375.241
|
|
},
|
|
{
|
|
"epoch": 3.7790262172284645,
|
|
"grad_norm": 0.1642679125070572,
|
|
"learning_rate": 8.808486319932083e-06,
|
|
"loss": 0.0691,
|
|
"num_input_tokens_seen": 27650456,
|
|
"step": 253,
|
|
"train_runtime": 2948.5436,
|
|
"train_tokens_per_second": 9377.666
|
|
},
|
|
{
|
|
"epoch": 3.7940074906367043,
|
|
"grad_norm": 0.27271735668182373,
|
|
"learning_rate": 8.610561982591357e-06,
|
|
"loss": 0.1072,
|
|
"num_input_tokens_seen": 27766296,
|
|
"step": 254,
|
|
"train_runtime": 2960.1358,
|
|
"train_tokens_per_second": 9380.075
|
|
},
|
|
{
|
|
"epoch": 3.808988764044944,
|
|
"grad_norm": 0.2314184457063675,
|
|
"learning_rate": 8.414423009663563e-06,
|
|
"loss": 0.1113,
|
|
"num_input_tokens_seen": 27877960,
|
|
"step": 255,
|
|
"train_runtime": 2971.7254,
|
|
"train_tokens_per_second": 9381.069
|
|
},
|
|
{
|
|
"epoch": 3.8239700374531838,
|
|
"grad_norm": 4.12896203994751,
|
|
"learning_rate": 8.220090767320137e-06,
|
|
"loss": 0.0787,
|
|
"num_input_tokens_seen": 27992400,
|
|
"step": 256,
|
|
"train_runtime": 2983.2661,
|
|
"train_tokens_per_second": 9383.139
|
|
},
|
|
{
|
|
"epoch": 3.8389513108614235,
|
|
"grad_norm": 0.18129047751426697,
|
|
"learning_rate": 8.027586424918412e-06,
|
|
"loss": 0.0436,
|
|
"num_input_tokens_seen": 28099232,
|
|
"step": 257,
|
|
"train_runtime": 2994.7467,
|
|
"train_tokens_per_second": 9382.841
|
|
},
|
|
{
|
|
"epoch": 3.853932584269663,
|
|
"grad_norm": 0.1452488899230957,
|
|
"learning_rate": 7.836930952695533e-06,
|
|
"loss": 0.0761,
|
|
"num_input_tokens_seen": 28212712,
|
|
"step": 258,
|
|
"train_runtime": 3006.2908,
|
|
"train_tokens_per_second": 9384.559
|
|
},
|
|
{
|
|
"epoch": 3.8689138576779025,
|
|
"grad_norm": 0.22081199288368225,
|
|
"learning_rate": 7.648145119484152e-06,
|
|
"loss": 0.0876,
|
|
"num_input_tokens_seen": 28327232,
|
|
"step": 259,
|
|
"train_runtime": 3017.7295,
|
|
"train_tokens_per_second": 9386.935
|
|
},
|
|
{
|
|
"epoch": 3.8838951310861423,
|
|
"grad_norm": 0.1896650493144989,
|
|
"learning_rate": 7.461249490449954e-06,
|
|
"loss": 0.0689,
|
|
"num_input_tokens_seen": 28444136,
|
|
"step": 260,
|
|
"train_runtime": 3029.4265,
|
|
"train_tokens_per_second": 9389.281
|
|
},
|
|
{
|
|
"epoch": 3.898876404494382,
|
|
"grad_norm": 0.3891507685184479,
|
|
"learning_rate": 7.2762644248514255e-06,
|
|
"loss": 0.0934,
|
|
"num_input_tokens_seen": 28553608,
|
|
"step": 261,
|
|
"train_runtime": 3040.9155,
|
|
"train_tokens_per_second": 9389.806
|
|
},
|
|
{
|
|
"epoch": 3.9138576779026217,
|
|
"grad_norm": 0.20802177488803864,
|
|
"learning_rate": 7.0932100738220265e-06,
|
|
"loss": 0.0616,
|
|
"num_input_tokens_seen": 28655944,
|
|
"step": 262,
|
|
"train_runtime": 3052.4258,
|
|
"train_tokens_per_second": 9387.925
|
|
},
|
|
{
|
|
"epoch": 3.9288389513108615,
|
|
"grad_norm": 0.162275493144989,
|
|
"learning_rate": 6.912106378175098e-06,
|
|
"loss": 0.0505,
|
|
"num_input_tokens_seen": 28770240,
|
|
"step": 263,
|
|
"train_runtime": 3063.975,
|
|
"train_tokens_per_second": 9389.842
|
|
},
|
|
{
|
|
"epoch": 3.943820224719101,
|
|
"grad_norm": 0.1334082931280136,
|
|
"learning_rate": 6.732973066231563e-06,
|
|
"loss": 0.0716,
|
|
"num_input_tokens_seen": 28879896,
|
|
"step": 264,
|
|
"train_runtime": 3075.4811,
|
|
"train_tokens_per_second": 9390.367
|
|
},
|
|
{
|
|
"epoch": 3.958801498127341,
|
|
"grad_norm": 0.24781842529773712,
|
|
"learning_rate": 6.555829651670911e-06,
|
|
"loss": 0.0925,
|
|
"num_input_tokens_seen": 28979616,
|
|
"step": 265,
|
|
"train_runtime": 3086.8721,
|
|
"train_tokens_per_second": 9388.02
|
|
},
|
|
{
|
|
"epoch": 3.9737827715355807,
|
|
"grad_norm": 0.25727924704551697,
|
|
"learning_rate": 6.380695431405453e-06,
|
|
"loss": 0.082,
|
|
"num_input_tokens_seen": 29095336,
|
|
"step": 266,
|
|
"train_runtime": 3098.3306,
|
|
"train_tokens_per_second": 9390.649
|
|
},
|
|
{
|
|
"epoch": 3.98876404494382,
|
|
"grad_norm": 0.20583029091358185,
|
|
"learning_rate": 6.207589483478266e-06,
|
|
"loss": 0.1735,
|
|
"num_input_tokens_seen": 29200208,
|
|
"step": 267,
|
|
"train_runtime": 3109.8583,
|
|
"train_tokens_per_second": 9389.562
|
|
},
|
|
{
|
|
"epoch": 4.0,
|
|
"grad_norm": 0.23410587012767792,
|
|
"learning_rate": 6.0365306649849214e-06,
|
|
"loss": 0.0554,
|
|
"num_input_tokens_seen": 29282608,
|
|
"step": 268,
|
|
"train_runtime": 3119.7239,
|
|
"train_tokens_per_second": 9386.282
|
|
},
|
|
{
|
|
"epoch": 4.01498127340824,
|
|
"grad_norm": 0.08256790041923523,
|
|
"learning_rate": 5.867537610019317e-06,
|
|
"loss": 0.0374,
|
|
"num_input_tokens_seen": 29391848,
|
|
"step": 269,
|
|
"train_runtime": 3131.2887,
|
|
"train_tokens_per_second": 9386.502
|
|
},
|
|
{
|
|
"epoch": 4.0299625468164795,
|
|
"grad_norm": 0.20500850677490234,
|
|
"learning_rate": 5.700628727643806e-06,
|
|
"loss": 0.0644,
|
|
"num_input_tokens_seen": 29507360,
|
|
"step": 270,
|
|
"train_runtime": 3142.8457,
|
|
"train_tokens_per_second": 9388.74
|
|
},
|
|
{
|
|
"epoch": 4.044943820224719,
|
|
"grad_norm": 0.1724829226732254,
|
|
"learning_rate": 5.53582219988382e-06,
|
|
"loss": 0.0621,
|
|
"num_input_tokens_seen": 29607936,
|
|
"step": 271,
|
|
"train_runtime": 3154.3349,
|
|
"train_tokens_per_second": 9386.428
|
|
},
|
|
{
|
|
"epoch": 4.059925093632959,
|
|
"grad_norm": 0.17760220170021057,
|
|
"learning_rate": 5.373135979747227e-06,
|
|
"loss": 0.0525,
|
|
"num_input_tokens_seen": 29710240,
|
|
"step": 272,
|
|
"train_runtime": 3165.4438,
|
|
"train_tokens_per_second": 9385.806
|
|
},
|
|
{
|
|
"epoch": 4.074906367041199,
|
|
"grad_norm": 0.20548486709594727,
|
|
"learning_rate": 5.2125877892686496e-06,
|
|
"loss": 0.072,
|
|
"num_input_tokens_seen": 29819600,
|
|
"step": 273,
|
|
"train_runtime": 3176.9528,
|
|
"train_tokens_per_second": 9386.227
|
|
},
|
|
{
|
|
"epoch": 4.089887640449438,
|
|
"grad_norm": 2.707559108734131,
|
|
"learning_rate": 5.054195117578914e-06,
|
|
"loss": 0.1253,
|
|
"num_input_tokens_seen": 29927712,
|
|
"step": 274,
|
|
"train_runtime": 3188.3808,
|
|
"train_tokens_per_second": 9386.492
|
|
},
|
|
{
|
|
"epoch": 4.104868913857678,
|
|
"grad_norm": 0.19858214259147644,
|
|
"learning_rate": 4.897975218999926e-06,
|
|
"loss": 0.0516,
|
|
"num_input_tokens_seen": 30036912,
|
|
"step": 275,
|
|
"train_runtime": 3199.9225,
|
|
"train_tokens_per_second": 9386.762
|
|
},
|
|
{
|
|
"epoch": 4.104868913857678,
|
|
"eval_accuracy": 0.9503720481817801,
|
|
"eval_loss": 0.148418128490448,
|
|
"eval_runtime": 4.9281,
|
|
"eval_samples_per_second": 11.566,
|
|
"eval_steps_per_second": 3.044,
|
|
"num_input_tokens_seen": 30036912,
|
|
"step": 275
|
|
},
|
|
{
|
|
"epoch": 4.119850187265918,
|
|
"grad_norm": 0.20856685936450958,
|
|
"learning_rate": 4.743945111165068e-06,
|
|
"loss": 0.0597,
|
|
"num_input_tokens_seen": 30142632,
|
|
"step": 276,
|
|
"train_runtime": 3216.4293,
|
|
"train_tokens_per_second": 9371.458
|
|
},
|
|
{
|
|
"epoch": 4.134831460674158,
|
|
"grad_norm": 0.15552882850170135,
|
|
"learning_rate": 4.592121573165414e-06,
|
|
"loss": 0.0481,
|
|
"num_input_tokens_seen": 30249816,
|
|
"step": 277,
|
|
"train_runtime": 3228.0168,
|
|
"train_tokens_per_second": 9371.022
|
|
},
|
|
{
|
|
"epoch": 4.149812734082397,
|
|
"grad_norm": 0.19117474555969238,
|
|
"learning_rate": 4.442521143721892e-06,
|
|
"loss": 0.0528,
|
|
"num_input_tokens_seen": 30360248,
|
|
"step": 278,
|
|
"train_runtime": 3239.5535,
|
|
"train_tokens_per_second": 9371.738
|
|
},
|
|
{
|
|
"epoch": 4.164794007490637,
|
|
"grad_norm": 0.1939282864332199,
|
|
"learning_rate": 4.295160119383712e-06,
|
|
"loss": 0.0558,
|
|
"num_input_tokens_seen": 30466592,
|
|
"step": 279,
|
|
"train_runtime": 3251.0279,
|
|
"train_tokens_per_second": 9371.372
|
|
},
|
|
{
|
|
"epoch": 4.179775280898877,
|
|
"grad_norm": 0.21391624212265015,
|
|
"learning_rate": 4.150054552753055e-06,
|
|
"loss": 0.0739,
|
|
"num_input_tokens_seen": 30567952,
|
|
"step": 280,
|
|
"train_runtime": 3262.5285,
|
|
"train_tokens_per_second": 9369.405
|
|
},
|
|
{
|
|
"epoch": 4.194756554307116,
|
|
"grad_norm": 0.18282581865787506,
|
|
"learning_rate": 4.007220250736454e-06,
|
|
"loss": 0.059,
|
|
"num_input_tokens_seen": 30674984,
|
|
"step": 281,
|
|
"train_runtime": 3274.0659,
|
|
"train_tokens_per_second": 9369.08
|
|
},
|
|
{
|
|
"epoch": 4.209737827715355,
|
|
"grad_norm": 0.5102422833442688,
|
|
"learning_rate": 3.866672772822863e-06,
|
|
"loss": 0.0275,
|
|
"num_input_tokens_seen": 30791864,
|
|
"step": 282,
|
|
"train_runtime": 3285.6956,
|
|
"train_tokens_per_second": 9371.49
|
|
},
|
|
{
|
|
"epoch": 4.224719101123595,
|
|
"grad_norm": 0.15346960723400116,
|
|
"learning_rate": 3.728427429388709e-06,
|
|
"loss": 0.041,
|
|
"num_input_tokens_seen": 30908384,
|
|
"step": 283,
|
|
"train_runtime": 3297.3237,
|
|
"train_tokens_per_second": 9373.779
|
|
},
|
|
{
|
|
"epoch": 4.239700374531835,
|
|
"grad_norm": 0.17301329970359802,
|
|
"learning_rate": 3.592499280030057e-06,
|
|
"loss": 0.0492,
|
|
"num_input_tokens_seen": 31023848,
|
|
"step": 284,
|
|
"train_runtime": 3308.9234,
|
|
"train_tokens_per_second": 9375.813
|
|
},
|
|
{
|
|
"epoch": 4.254681647940075,
|
|
"grad_norm": 0.1514940708875656,
|
|
"learning_rate": 3.458903131922134e-06,
|
|
"loss": 0.0555,
|
|
"num_input_tokens_seen": 31137384,
|
|
"step": 285,
|
|
"train_runtime": 3320.5419,
|
|
"train_tokens_per_second": 9377.199
|
|
},
|
|
{
|
|
"epoch": 4.269662921348314,
|
|
"grad_norm": 0.18485209345817566,
|
|
"learning_rate": 3.3276535382063183e-06,
|
|
"loss": 0.0493,
|
|
"num_input_tokens_seen": 31244936,
|
|
"step": 286,
|
|
"train_runtime": 3332.0917,
|
|
"train_tokens_per_second": 9376.974
|
|
},
|
|
{
|
|
"epoch": 4.284644194756554,
|
|
"grad_norm": 0.200953871011734,
|
|
"learning_rate": 3.198764796404807e-06,
|
|
"loss": 0.0492,
|
|
"num_input_tokens_seen": 31355616,
|
|
"step": 287,
|
|
"train_runtime": 3343.5886,
|
|
"train_tokens_per_second": 9377.833
|
|
},
|
|
{
|
|
"epoch": 4.299625468164794,
|
|
"grad_norm": 0.21441112458705902,
|
|
"learning_rate": 3.0722509468631392e-06,
|
|
"loss": 0.0649,
|
|
"num_input_tokens_seen": 31463648,
|
|
"step": 288,
|
|
"train_runtime": 3354.9275,
|
|
"train_tokens_per_second": 9378.339
|
|
},
|
|
{
|
|
"epoch": 4.314606741573034,
|
|
"grad_norm": 0.1840512454509735,
|
|
"learning_rate": 2.948125771220697e-06,
|
|
"loss": 0.0481,
|
|
"num_input_tokens_seen": 31577056,
|
|
"step": 289,
|
|
"train_runtime": 3366.6336,
|
|
"train_tokens_per_second": 9379.416
|
|
},
|
|
{
|
|
"epoch": 4.329588014981273,
|
|
"grad_norm": 0.166469007730484,
|
|
"learning_rate": 2.8264027909094715e-06,
|
|
"loss": 0.0455,
|
|
"num_input_tokens_seen": 31682424,
|
|
"step": 290,
|
|
"train_runtime": 3378.1026,
|
|
"train_tokens_per_second": 9378.763
|
|
},
|
|
{
|
|
"epoch": 4.344569288389513,
|
|
"grad_norm": 0.23863935470581055,
|
|
"learning_rate": 2.707095265681081e-06,
|
|
"loss": 0.0588,
|
|
"num_input_tokens_seen": 31790168,
|
|
"step": 291,
|
|
"train_runtime": 3389.5951,
|
|
"train_tokens_per_second": 9378.751
|
|
},
|
|
{
|
|
"epoch": 4.359550561797753,
|
|
"grad_norm": 0.22671280801296234,
|
|
"learning_rate": 2.5902161921623454e-06,
|
|
"loss": 0.0553,
|
|
"num_input_tokens_seen": 31905520,
|
|
"step": 292,
|
|
"train_runtime": 3401.1676,
|
|
"train_tokens_per_second": 9380.755
|
|
},
|
|
{
|
|
"epoch": 4.3745318352059925,
|
|
"grad_norm": 0.19666582345962524,
|
|
"learning_rate": 2.475778302439524e-06,
|
|
"loss": 0.0452,
|
|
"num_input_tokens_seen": 32020200,
|
|
"step": 293,
|
|
"train_runtime": 3412.8228,
|
|
"train_tokens_per_second": 9382.321
|
|
},
|
|
{
|
|
"epoch": 4.389513108614232,
|
|
"grad_norm": 0.30095529556274414,
|
|
"learning_rate": 2.3637940626713346e-06,
|
|
"loss": 0.0707,
|
|
"num_input_tokens_seen": 32129744,
|
|
"step": 294,
|
|
"train_runtime": 3424.2939,
|
|
"train_tokens_per_second": 9382.881
|
|
},
|
|
{
|
|
"epoch": 4.404494382022472,
|
|
"grad_norm": 0.21905633807182312,
|
|
"learning_rate": 2.254275671731007e-06,
|
|
"loss": 0.0611,
|
|
"num_input_tokens_seen": 32247024,
|
|
"step": 295,
|
|
"train_runtime": 3435.8655,
|
|
"train_tokens_per_second": 9385.415
|
|
},
|
|
{
|
|
"epoch": 4.419475655430712,
|
|
"grad_norm": 0.18735012412071228,
|
|
"learning_rate": 2.14723505987737e-06,
|
|
"loss": 0.058,
|
|
"num_input_tokens_seen": 32361392,
|
|
"step": 296,
|
|
"train_runtime": 3447.3852,
|
|
"train_tokens_per_second": 9387.228
|
|
},
|
|
{
|
|
"epoch": 4.4344569288389515,
|
|
"grad_norm": 0.18301299214363098,
|
|
"learning_rate": 2.0426838874552714e-06,
|
|
"loss": 0.0571,
|
|
"num_input_tokens_seen": 32469248,
|
|
"step": 297,
|
|
"train_runtime": 3458.8947,
|
|
"train_tokens_per_second": 9387.174
|
|
},
|
|
{
|
|
"epoch": 4.449438202247191,
|
|
"grad_norm": 0.09776000678539276,
|
|
"learning_rate": 1.9406335436253724e-06,
|
|
"loss": 0.0364,
|
|
"num_input_tokens_seen": 32582736,
|
|
"step": 298,
|
|
"train_runtime": 3470.5436,
|
|
"train_tokens_per_second": 9388.367
|
|
},
|
|
{
|
|
"epoch": 4.464419475655431,
|
|
"grad_norm": 0.15819956362247467,
|
|
"learning_rate": 1.8410951451234533e-06,
|
|
"loss": 0.034,
|
|
"num_input_tokens_seen": 32691704,
|
|
"step": 299,
|
|
"train_runtime": 3481.9348,
|
|
"train_tokens_per_second": 9388.948
|
|
},
|
|
{
|
|
"epoch": 4.479400749063671,
|
|
"grad_norm": 0.22488094866275787,
|
|
"learning_rate": 1.7440795350494588e-06,
|
|
"loss": 0.0675,
|
|
"num_input_tokens_seen": 32807520,
|
|
"step": 300,
|
|
"train_runtime": 3493.4629,
|
|
"train_tokens_per_second": 9391.117
|
|
},
|
|
{
|
|
"epoch": 4.479400749063671,
|
|
"eval_accuracy": 0.9514037008261675,
|
|
"eval_loss": 0.14898425340652466,
|
|
"eval_runtime": 4.93,
|
|
"eval_samples_per_second": 11.562,
|
|
"eval_steps_per_second": 3.043,
|
|
"num_input_tokens_seen": 32807520,
|
|
"step": 300
|
|
},
|
|
{
|
|
"epoch": 4.49438202247191,
|
|
"grad_norm": 0.16308434307575226,
|
|
"learning_rate": 1.649597281686302e-06,
|
|
"loss": 0.0563,
|
|
"num_input_tokens_seen": 32917472,
|
|
"step": 301,
|
|
"train_runtime": 3509.8805,
|
|
"train_tokens_per_second": 9378.516
|
|
},
|
|
{
|
|
"epoch": 4.50936329588015,
|
|
"grad_norm": 0.1949169635772705,
|
|
"learning_rate": 1.5576586773486195e-06,
|
|
"loss": 0.0582,
|
|
"num_input_tokens_seen": 33026552,
|
|
"step": 302,
|
|
"train_runtime": 3521.2763,
|
|
"train_tokens_per_second": 9379.143
|
|
},
|
|
{
|
|
"epoch": 4.52434456928839,
|
|
"grad_norm": 0.20031088590621948,
|
|
"learning_rate": 1.4682737372615967e-06,
|
|
"loss": 0.048,
|
|
"num_input_tokens_seen": 33135312,
|
|
"step": 303,
|
|
"train_runtime": 3532.5735,
|
|
"train_tokens_per_second": 9379.936
|
|
},
|
|
{
|
|
"epoch": 4.539325842696629,
|
|
"grad_norm": 0.16514037549495697,
|
|
"learning_rate": 1.3814521984699596e-06,
|
|
"loss": 0.0556,
|
|
"num_input_tokens_seen": 33249640,
|
|
"step": 304,
|
|
"train_runtime": 3544.1498,
|
|
"train_tokens_per_second": 9381.556
|
|
},
|
|
{
|
|
"epoch": 4.554307116104869,
|
|
"grad_norm": 0.17982099950313568,
|
|
"learning_rate": 1.297203518777293e-06,
|
|
"loss": 0.0427,
|
|
"num_input_tokens_seen": 33356584,
|
|
"step": 305,
|
|
"train_runtime": 3555.468,
|
|
"train_tokens_per_second": 9381.77
|
|
},
|
|
{
|
|
"epoch": 4.569288389513108,
|
|
"grad_norm": 0.30250856280326843,
|
|
"learning_rate": 1.2155368757157643e-06,
|
|
"loss": 0.095,
|
|
"num_input_tokens_seen": 33465096,
|
|
"step": 306,
|
|
"train_runtime": 3567.021,
|
|
"train_tokens_per_second": 9381.805
|
|
},
|
|
{
|
|
"epoch": 4.584269662921348,
|
|
"grad_norm": 0.14334945380687714,
|
|
"learning_rate": 1.1364611655463736e-06,
|
|
"loss": 0.0329,
|
|
"num_input_tokens_seen": 33589904,
|
|
"step": 307,
|
|
"train_runtime": 3578.618,
|
|
"train_tokens_per_second": 9386.278
|
|
},
|
|
{
|
|
"epoch": 4.599250936329588,
|
|
"grad_norm": 0.11703667044639587,
|
|
"learning_rate": 1.0599850022898539e-06,
|
|
"loss": 0.048,
|
|
"num_input_tokens_seen": 33693528,
|
|
"step": 308,
|
|
"train_runtime": 3590.015,
|
|
"train_tokens_per_second": 9385.345
|
|
},
|
|
{
|
|
"epoch": 4.614232209737827,
|
|
"grad_norm": 0.25832632184028625,
|
|
"learning_rate": 9.861167167883046e-07,
|
|
"loss": 0.0709,
|
|
"num_input_tokens_seen": 33800928,
|
|
"step": 309,
|
|
"train_runtime": 3601.3539,
|
|
"train_tokens_per_second": 9385.617
|
|
},
|
|
{
|
|
"epoch": 4.629213483146067,
|
|
"grad_norm": 0.2718651592731476,
|
|
"learning_rate": 9.148643557976955e-07,
|
|
"loss": 0.0807,
|
|
"num_input_tokens_seen": 33904464,
|
|
"step": 310,
|
|
"train_runtime": 3612.5883,
|
|
"train_tokens_per_second": 9385.089
|
|
},
|
|
{
|
|
"epoch": 4.644194756554307,
|
|
"grad_norm": 0.14838100969791412,
|
|
"learning_rate": 8.462356811112987e-07,
|
|
"loss": 0.0501,
|
|
"num_input_tokens_seen": 34020608,
|
|
"step": 311,
|
|
"train_runtime": 3623.9265,
|
|
"train_tokens_per_second": 9387.775
|
|
},
|
|
{
|
|
"epoch": 4.659176029962547,
|
|
"grad_norm": 0.2020755410194397,
|
|
"learning_rate": 7.802381687141535e-07,
|
|
"loss": 0.0499,
|
|
"num_input_tokens_seen": 34129480,
|
|
"step": 312,
|
|
"train_runtime": 3635.4612,
|
|
"train_tokens_per_second": 9387.937
|
|
},
|
|
{
|
|
"epoch": 4.674157303370786,
|
|
"grad_norm": 0.2344108372926712,
|
|
"learning_rate": 7.168790079686932e-07,
|
|
"loss": 0.086,
|
|
"num_input_tokens_seen": 34229672,
|
|
"step": 313,
|
|
"train_runtime": 3646.7065,
|
|
"train_tokens_per_second": 9386.462
|
|
},
|
|
{
|
|
"epoch": 4.689138576779026,
|
|
"grad_norm": 0.21764852106571198,
|
|
"learning_rate": 6.561651008315738e-07,
|
|
"loss": 0.0711,
|
|
"num_input_tokens_seen": 34335640,
|
|
"step": 314,
|
|
"train_runtime": 3657.926,
|
|
"train_tokens_per_second": 9386.642
|
|
},
|
|
{
|
|
"epoch": 4.704119850187266,
|
|
"grad_norm": 0.1388695240020752,
|
|
"learning_rate": 5.981030611018234e-07,
|
|
"loss": 0.0417,
|
|
"num_input_tokens_seen": 34431984,
|
|
"step": 315,
|
|
"train_runtime": 3669.1157,
|
|
"train_tokens_per_second": 9384.273
|
|
},
|
|
{
|
|
"epoch": 4.719101123595506,
|
|
"grad_norm": 0.19717121124267578,
|
|
"learning_rate": 5.426992137003622e-07,
|
|
"loss": 0.0668,
|
|
"num_input_tokens_seen": 34547560,
|
|
"step": 316,
|
|
"train_runtime": 3680.779,
|
|
"train_tokens_per_second": 9385.937
|
|
},
|
|
{
|
|
"epoch": 4.734082397003745,
|
|
"grad_norm": 0.1789878010749817,
|
|
"learning_rate": 4.899595939810236e-07,
|
|
"loss": 0.0582,
|
|
"num_input_tokens_seen": 34651384,
|
|
"step": 317,
|
|
"train_runtime": 3691.8469,
|
|
"train_tokens_per_second": 9385.921
|
|
},
|
|
{
|
|
"epoch": 4.749063670411985,
|
|
"grad_norm": 0.20900003612041473,
|
|
"learning_rate": 4.398899470730827e-07,
|
|
"loss": 0.0559,
|
|
"num_input_tokens_seen": 34759152,
|
|
"step": 318,
|
|
"train_runtime": 3703.8343,
|
|
"train_tokens_per_second": 9384.64
|
|
},
|
|
{
|
|
"epoch": 4.764044943820225,
|
|
"grad_norm": 0.18526972830295563,
|
|
"learning_rate": 3.9249572725543196e-07,
|
|
"loss": 0.0529,
|
|
"num_input_tokens_seen": 34874632,
|
|
"step": 319,
|
|
"train_runtime": 3715.8268,
|
|
"train_tokens_per_second": 9385.43
|
|
},
|
|
{
|
|
"epoch": 4.7790262172284645,
|
|
"grad_norm": 0.18614766001701355,
|
|
"learning_rate": 3.477820973624063e-07,
|
|
"loss": 0.0524,
|
|
"num_input_tokens_seen": 34988104,
|
|
"step": 320,
|
|
"train_runtime": 3727.7427,
|
|
"train_tokens_per_second": 9385.869
|
|
},
|
|
{
|
|
"epoch": 4.794007490636704,
|
|
"grad_norm": 0.2322590947151184,
|
|
"learning_rate": 3.0575392822139726e-07,
|
|
"loss": 0.0521,
|
|
"num_input_tokens_seen": 35096592,
|
|
"step": 321,
|
|
"train_runtime": 3739.5414,
|
|
"train_tokens_per_second": 9385.266
|
|
},
|
|
{
|
|
"epoch": 4.808988764044944,
|
|
"grad_norm": 0.1705033928155899,
|
|
"learning_rate": 2.664157981222437e-07,
|
|
"loss": 0.0796,
|
|
"num_input_tokens_seen": 35211304,
|
|
"step": 322,
|
|
"train_runtime": 3751.4526,
|
|
"train_tokens_per_second": 9386.045
|
|
},
|
|
{
|
|
"epoch": 4.823970037453184,
|
|
"grad_norm": 0.26094940304756165,
|
|
"learning_rate": 2.297719923185032e-07,
|
|
"loss": 0.0674,
|
|
"num_input_tokens_seen": 35323056,
|
|
"step": 323,
|
|
"train_runtime": 3763.0198,
|
|
"train_tokens_per_second": 9386.891
|
|
},
|
|
{
|
|
"epoch": 4.8389513108614235,
|
|
"grad_norm": 0.3194412291049957,
|
|
"learning_rate": 1.9582650256064205e-07,
|
|
"loss": 0.0803,
|
|
"num_input_tokens_seen": 35436552,
|
|
"step": 324,
|
|
"train_runtime": 3774.8806,
|
|
"train_tokens_per_second": 9387.463
|
|
},
|
|
{
|
|
"epoch": 4.853932584269663,
|
|
"grad_norm": 0.2129000723361969,
|
|
"learning_rate": 1.645830266611914e-07,
|
|
"loss": 0.0626,
|
|
"num_input_tokens_seen": 35549872,
|
|
"step": 325,
|
|
"train_runtime": 3786.8829,
|
|
"train_tokens_per_second": 9387.634
|
|
},
|
|
{
|
|
"epoch": 4.853932584269663,
|
|
"eval_accuracy": 0.9510650506531415,
|
|
"eval_loss": 0.14768485724925995,
|
|
"eval_runtime": 4.9341,
|
|
"eval_samples_per_second": 11.552,
|
|
"eval_steps_per_second": 3.04,
|
|
"num_input_tokens_seen": 35549872,
|
|
"step": 325
|
|
},
|
|
{
|
|
"epoch": 4.868913857677903,
|
|
"grad_norm": 0.2143908590078354,
|
|
"learning_rate": 1.3604496809195288e-07,
|
|
"loss": 0.0551,
|
|
"num_input_tokens_seen": 35659600,
|
|
"step": 326,
|
|
"train_runtime": 3803.6163,
|
|
"train_tokens_per_second": 9375.183
|
|
},
|
|
{
|
|
"epoch": 4.883895131086143,
|
|
"grad_norm": 0.15451987087726593,
|
|
"learning_rate": 1.1021543561322012e-07,
|
|
"loss": 0.0536,
|
|
"num_input_tokens_seen": 35770904,
|
|
"step": 327,
|
|
"train_runtime": 3815.5627,
|
|
"train_tokens_per_second": 9375.001
|
|
},
|
|
{
|
|
"epoch": 4.898876404494382,
|
|
"grad_norm": 0.19724752008914948,
|
|
"learning_rate": 8.709724293513854e-08,
|
|
"loss": 0.0664,
|
|
"num_input_tokens_seen": 35879784,
|
|
"step": 328,
|
|
"train_runtime": 3827.5139,
|
|
"train_tokens_per_second": 9374.175
|
|
},
|
|
{
|
|
"epoch": 4.913857677902621,
|
|
"grad_norm": 0.23586316406726837,
|
|
"learning_rate": 6.66929084112089e-08,
|
|
"loss": 0.0641,
|
|
"num_input_tokens_seen": 35988344,
|
|
"step": 329,
|
|
"train_runtime": 3839.4639,
|
|
"train_tokens_per_second": 9373.273
|
|
},
|
|
{
|
|
"epoch": 4.928838951310862,
|
|
"grad_norm": 0.19379207491874695,
|
|
"learning_rate": 4.900465476393168e-08,
|
|
"loss": 0.0624,
|
|
"num_input_tokens_seen": 36093032,
|
|
"step": 330,
|
|
"train_runtime": 3851.3388,
|
|
"train_tokens_per_second": 9371.555
|
|
},
|
|
{
|
|
"epoch": 4.943820224719101,
|
|
"grad_norm": 0.19698284566402435,
|
|
"learning_rate": 3.403440884269526e-08,
|
|
"loss": 0.0484,
|
|
"num_input_tokens_seen": 36199864,
|
|
"step": 331,
|
|
"train_runtime": 3863.2488,
|
|
"train_tokens_per_second": 9370.317
|
|
},
|
|
{
|
|
"epoch": 4.9588014981273405,
|
|
"grad_norm": 0.19212862849235535,
|
|
"learning_rate": 2.1783801413866046e-08,
|
|
"loss": 0.0649,
|
|
"num_input_tokens_seen": 36302712,
|
|
"step": 332,
|
|
"train_runtime": 3875.1667,
|
|
"train_tokens_per_second": 9368.039
|
|
},
|
|
{
|
|
"epoch": 4.97378277153558,
|
|
"grad_norm": 0.3132294714450836,
|
|
"learning_rate": 1.2254166983152737e-08,
|
|
"loss": 0.0684,
|
|
"num_input_tokens_seen": 36412088,
|
|
"step": 333,
|
|
"train_runtime": 3886.7479,
|
|
"train_tokens_per_second": 9368.266
|
|
},
|
|
{
|
|
"epoch": 4.98876404494382,
|
|
"grad_norm": 0.24174365401268005,
|
|
"learning_rate": 5.446543650219904e-09,
|
|
"loss": 0.0744,
|
|
"num_input_tokens_seen": 36523328,
|
|
"step": 334,
|
|
"train_runtime": 3898.6721,
|
|
"train_tokens_per_second": 9368.146
|
|
},
|
|
{
|
|
"epoch": 5.0,
|
|
"grad_norm": 0.24174365401268005,
|
|
"learning_rate": 1.3616729956228425e-09,
|
|
"loss": 0.0815,
|
|
"num_input_tokens_seen": 36600520,
|
|
"step": 335,
|
|
"train_runtime": 3903.2788,
|
|
"train_tokens_per_second": 9376.865
|
|
},
|
|
{
|
|
"epoch": 5.0,
|
|
"num_input_tokens_seen": 36600520,
|
|
"step": 335,
|
|
"total_flos": 1.6620454705385964e+18,
|
|
"train_loss": 0.11312562568641421,
|
|
"train_runtime": 4017.5693,
|
|
"train_samples_per_second": 1.325,
|
|
"train_steps_per_second": 0.083
|
|
}
|
|
],
|
|
"logging_steps": 1,
|
|
"max_steps": 335,
|
|
"num_input_tokens_seen": 36600520,
|
|
"num_train_epochs": 5,
|
|
"save_steps": 1000,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": true,
|
|
"should_training_stop": true
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 1.6620454705385964e+18,
|
|
"train_batch_size": 1,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|