5174 lines
135 KiB
JSON
5174 lines
135 KiB
JSON
|
|
{
|
||
|
|
"best_global_step": null,
|
||
|
|
"best_metric": null,
|
||
|
|
"best_model_checkpoint": null,
|
||
|
|
"epoch": 1.0,
|
||
|
|
"eval_steps": 500,
|
||
|
|
"global_step": 570,
|
||
|
|
"is_hyper_param_search": false,
|
||
|
|
"is_local_process_zero": true,
|
||
|
|
"is_world_process_zero": true,
|
||
|
|
"log_history": [
|
||
|
|
{
|
||
|
|
"epoch": 0.0017543859649122807,
|
||
|
|
"grad_norm": 6.018052101135254,
|
||
|
|
"learning_rate": 0.0,
|
||
|
|
"loss": 1.789,
|
||
|
|
"mean_token_accuracy": 0.5678549408912659,
|
||
|
|
"num_tokens": 429478.0,
|
||
|
|
"step": 1
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.0035087719298245615,
|
||
|
|
"grad_norm": 6.373941898345947,
|
||
|
|
"learning_rate": 1.7543859649122807e-06,
|
||
|
|
"loss": 1.7812,
|
||
|
|
"mean_token_accuracy": 0.5701305270195007,
|
||
|
|
"num_tokens": 824562.0,
|
||
|
|
"step": 2
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.005263157894736842,
|
||
|
|
"grad_norm": 6.300591468811035,
|
||
|
|
"learning_rate": 3.5087719298245615e-06,
|
||
|
|
"loss": 1.787,
|
||
|
|
"mean_token_accuracy": 0.5685350894927979,
|
||
|
|
"num_tokens": 1228422.0,
|
||
|
|
"step": 3
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.007017543859649123,
|
||
|
|
"grad_norm": 4.870020389556885,
|
||
|
|
"learning_rate": 5.263157894736842e-06,
|
||
|
|
"loss": 1.7577,
|
||
|
|
"mean_token_accuracy": 0.570589542388916,
|
||
|
|
"num_tokens": 1633223.0,
|
||
|
|
"step": 4
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.008771929824561403,
|
||
|
|
"grad_norm": 3.564033269882202,
|
||
|
|
"learning_rate": 7.017543859649123e-06,
|
||
|
|
"loss": 1.6964,
|
||
|
|
"mean_token_accuracy": 0.5784072279930115,
|
||
|
|
"num_tokens": 2060143.0,
|
||
|
|
"step": 5
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.010526315789473684,
|
||
|
|
"grad_norm": 2.9137721061706543,
|
||
|
|
"learning_rate": 8.771929824561403e-06,
|
||
|
|
"loss": 1.646,
|
||
|
|
"mean_token_accuracy": 0.5835782289505005,
|
||
|
|
"num_tokens": 2503690.0,
|
||
|
|
"step": 6
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.012280701754385965,
|
||
|
|
"grad_norm": 2.481250047683716,
|
||
|
|
"learning_rate": 1.0526315789473684e-05,
|
||
|
|
"loss": 1.5991,
|
||
|
|
"mean_token_accuracy": 0.5927026271820068,
|
||
|
|
"num_tokens": 2904426.0,
|
||
|
|
"step": 7
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.014035087719298246,
|
||
|
|
"grad_norm": 3.379573345184326,
|
||
|
|
"learning_rate": 1.2280701754385964e-05,
|
||
|
|
"loss": 1.5736,
|
||
|
|
"mean_token_accuracy": 0.5973439812660217,
|
||
|
|
"num_tokens": 3299360.0,
|
||
|
|
"step": 8
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.015789473684210527,
|
||
|
|
"grad_norm": 2.4704713821411133,
|
||
|
|
"learning_rate": 1.4035087719298246e-05,
|
||
|
|
"loss": 1.5427,
|
||
|
|
"mean_token_accuracy": 0.6033717393875122,
|
||
|
|
"num_tokens": 3709604.0,
|
||
|
|
"step": 9
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.017543859649122806,
|
||
|
|
"grad_norm": 1.8616167306900024,
|
||
|
|
"learning_rate": 1.5789473684210526e-05,
|
||
|
|
"loss": 1.5179,
|
||
|
|
"mean_token_accuracy": 0.606142520904541,
|
||
|
|
"num_tokens": 4122427.0,
|
||
|
|
"step": 10
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.01929824561403509,
|
||
|
|
"grad_norm": 1.90486478805542,
|
||
|
|
"learning_rate": 1.7543859649122806e-05,
|
||
|
|
"loss": 1.5291,
|
||
|
|
"mean_token_accuracy": 0.6004294157028198,
|
||
|
|
"num_tokens": 4549800.0,
|
||
|
|
"step": 11
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.021052631578947368,
|
||
|
|
"grad_norm": 1.6314690113067627,
|
||
|
|
"learning_rate": 1.929824561403509e-05,
|
||
|
|
"loss": 1.5063,
|
||
|
|
"mean_token_accuracy": 0.6060437560081482,
|
||
|
|
"num_tokens": 4972367.0,
|
||
|
|
"step": 12
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.02280701754385965,
|
||
|
|
"grad_norm": 1.4108632802963257,
|
||
|
|
"learning_rate": 2.105263157894737e-05,
|
||
|
|
"loss": 1.4803,
|
||
|
|
"mean_token_accuracy": 0.6093297004699707,
|
||
|
|
"num_tokens": 5381784.0,
|
||
|
|
"step": 13
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.02456140350877193,
|
||
|
|
"grad_norm": 1.3794684410095215,
|
||
|
|
"learning_rate": 2.280701754385965e-05,
|
||
|
|
"loss": 1.4411,
|
||
|
|
"mean_token_accuracy": 0.6182389259338379,
|
||
|
|
"num_tokens": 5783941.0,
|
||
|
|
"step": 14
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.02631578947368421,
|
||
|
|
"grad_norm": 1.2287817001342773,
|
||
|
|
"learning_rate": 2.456140350877193e-05,
|
||
|
|
"loss": 1.423,
|
||
|
|
"mean_token_accuracy": 0.621440052986145,
|
||
|
|
"num_tokens": 6173946.0,
|
||
|
|
"step": 15
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.028070175438596492,
|
||
|
|
"grad_norm": 1.2201406955718994,
|
||
|
|
"learning_rate": 2.6315789473684212e-05,
|
||
|
|
"loss": 1.4455,
|
||
|
|
"mean_token_accuracy": 0.6170344352722168,
|
||
|
|
"num_tokens": 6588292.0,
|
||
|
|
"step": 16
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.02982456140350877,
|
||
|
|
"grad_norm": 1.1274272203445435,
|
||
|
|
"learning_rate": 2.8070175438596492e-05,
|
||
|
|
"loss": 1.4051,
|
||
|
|
"mean_token_accuracy": 0.6251938343048096,
|
||
|
|
"num_tokens": 7005165.0,
|
||
|
|
"step": 17
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.031578947368421054,
|
||
|
|
"grad_norm": 1.040313482284546,
|
||
|
|
"learning_rate": 2.9824561403508772e-05,
|
||
|
|
"loss": 1.4229,
|
||
|
|
"mean_token_accuracy": 0.6200644373893738,
|
||
|
|
"num_tokens": 7422596.0,
|
||
|
|
"step": 18
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.03333333333333333,
|
||
|
|
"grad_norm": 1.0897908210754395,
|
||
|
|
"learning_rate": 3.157894736842105e-05,
|
||
|
|
"loss": 1.3782,
|
||
|
|
"mean_token_accuracy": 0.6302369236946106,
|
||
|
|
"num_tokens": 7812857.0,
|
||
|
|
"step": 19
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.03508771929824561,
|
||
|
|
"grad_norm": 1.08786940574646,
|
||
|
|
"learning_rate": 3.3333333333333335e-05,
|
||
|
|
"loss": 1.4219,
|
||
|
|
"mean_token_accuracy": 0.6207268834114075,
|
||
|
|
"num_tokens": 8244992.0,
|
||
|
|
"step": 20
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.03684210526315789,
|
||
|
|
"grad_norm": 0.991358757019043,
|
||
|
|
"learning_rate": 3.508771929824561e-05,
|
||
|
|
"loss": 1.3912,
|
||
|
|
"mean_token_accuracy": 0.6278927326202393,
|
||
|
|
"num_tokens": 8664134.0,
|
||
|
|
"step": 21
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.03859649122807018,
|
||
|
|
"grad_norm": 1.1284328699111938,
|
||
|
|
"learning_rate": 3.6842105263157895e-05,
|
||
|
|
"loss": 1.4004,
|
||
|
|
"mean_token_accuracy": 0.6226587295532227,
|
||
|
|
"num_tokens": 9082923.0,
|
||
|
|
"step": 22
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.04035087719298246,
|
||
|
|
"grad_norm": 1.1181979179382324,
|
||
|
|
"learning_rate": 3.859649122807018e-05,
|
||
|
|
"loss": 1.3971,
|
||
|
|
"mean_token_accuracy": 0.6253641247749329,
|
||
|
|
"num_tokens": 9497934.0,
|
||
|
|
"step": 23
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.042105263157894736,
|
||
|
|
"grad_norm": 1.2639045715332031,
|
||
|
|
"learning_rate": 4.0350877192982455e-05,
|
||
|
|
"loss": 1.4259,
|
||
|
|
"mean_token_accuracy": 0.6176168322563171,
|
||
|
|
"num_tokens": 9923716.0,
|
||
|
|
"step": 24
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.043859649122807015,
|
||
|
|
"grad_norm": 1.0910519361495972,
|
||
|
|
"learning_rate": 4.210526315789474e-05,
|
||
|
|
"loss": 1.3733,
|
||
|
|
"mean_token_accuracy": 0.6294311285018921,
|
||
|
|
"num_tokens": 10353257.0,
|
||
|
|
"step": 25
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.0456140350877193,
|
||
|
|
"grad_norm": 1.0363545417785645,
|
||
|
|
"learning_rate": 4.3859649122807014e-05,
|
||
|
|
"loss": 1.3809,
|
||
|
|
"mean_token_accuracy": 0.6275283098220825,
|
||
|
|
"num_tokens": 10767965.0,
|
||
|
|
"step": 26
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.04736842105263158,
|
||
|
|
"grad_norm": 1.048935055732727,
|
||
|
|
"learning_rate": 4.56140350877193e-05,
|
||
|
|
"loss": 1.3815,
|
||
|
|
"mean_token_accuracy": 0.6274988651275635,
|
||
|
|
"num_tokens": 11188411.0,
|
||
|
|
"step": 27
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.04912280701754386,
|
||
|
|
"grad_norm": 1.3705120086669922,
|
||
|
|
"learning_rate": 4.736842105263158e-05,
|
||
|
|
"loss": 1.357,
|
||
|
|
"mean_token_accuracy": 0.6328938007354736,
|
||
|
|
"num_tokens": 11597031.0,
|
||
|
|
"step": 28
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.05087719298245614,
|
||
|
|
"grad_norm": 1.2502895593643188,
|
||
|
|
"learning_rate": 4.912280701754386e-05,
|
||
|
|
"loss": 1.3848,
|
||
|
|
"mean_token_accuracy": 0.626395583152771,
|
||
|
|
"num_tokens": 11996650.0,
|
||
|
|
"step": 29
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.05263157894736842,
|
||
|
|
"grad_norm": 0.9786079525947571,
|
||
|
|
"learning_rate": 5.087719298245615e-05,
|
||
|
|
"loss": 1.3669,
|
||
|
|
"mean_token_accuracy": 0.6301195621490479,
|
||
|
|
"num_tokens": 12414351.0,
|
||
|
|
"step": 30
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.054385964912280704,
|
||
|
|
"grad_norm": 1.322411298751831,
|
||
|
|
"learning_rate": 5.2631578947368424e-05,
|
||
|
|
"loss": 1.3703,
|
||
|
|
"mean_token_accuracy": 0.6289023756980896,
|
||
|
|
"num_tokens": 12853897.0,
|
||
|
|
"step": 31
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.056140350877192984,
|
||
|
|
"grad_norm": 1.1092149019241333,
|
||
|
|
"learning_rate": 5.438596491228071e-05,
|
||
|
|
"loss": 1.343,
|
||
|
|
"mean_token_accuracy": 0.6342843770980835,
|
||
|
|
"num_tokens": 13246819.0,
|
||
|
|
"step": 32
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.05789473684210526,
|
||
|
|
"grad_norm": 1.39752995967865,
|
||
|
|
"learning_rate": 5.6140350877192984e-05,
|
||
|
|
"loss": 1.3795,
|
||
|
|
"mean_token_accuracy": 0.6259216666221619,
|
||
|
|
"num_tokens": 13668134.0,
|
||
|
|
"step": 33
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.05964912280701754,
|
||
|
|
"grad_norm": 1.2338861227035522,
|
||
|
|
"learning_rate": 5.789473684210527e-05,
|
||
|
|
"loss": 1.3461,
|
||
|
|
"mean_token_accuracy": 0.6343478560447693,
|
||
|
|
"num_tokens": 14077677.0,
|
||
|
|
"step": 34
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.06140350877192982,
|
||
|
|
"grad_norm": 1.1885336637496948,
|
||
|
|
"learning_rate": 5.9649122807017544e-05,
|
||
|
|
"loss": 1.3637,
|
||
|
|
"mean_token_accuracy": 0.6306077241897583,
|
||
|
|
"num_tokens": 14502863.0,
|
||
|
|
"step": 35
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.06315789473684211,
|
||
|
|
"grad_norm": 1.471887469291687,
|
||
|
|
"learning_rate": 6.140350877192983e-05,
|
||
|
|
"loss": 1.3589,
|
||
|
|
"mean_token_accuracy": 0.6298720836639404,
|
||
|
|
"num_tokens": 14940331.0,
|
||
|
|
"step": 36
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.06491228070175438,
|
||
|
|
"grad_norm": 1.0302767753601074,
|
||
|
|
"learning_rate": 6.31578947368421e-05,
|
||
|
|
"loss": 1.3397,
|
||
|
|
"mean_token_accuracy": 0.6345314979553223,
|
||
|
|
"num_tokens": 15343369.0,
|
||
|
|
"step": 37
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.06666666666666667,
|
||
|
|
"grad_norm": 1.42778480052948,
|
||
|
|
"learning_rate": 6.49122807017544e-05,
|
||
|
|
"loss": 1.3546,
|
||
|
|
"mean_token_accuracy": 0.6308321356773376,
|
||
|
|
"num_tokens": 15752951.0,
|
||
|
|
"step": 38
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.06842105263157895,
|
||
|
|
"grad_norm": 1.52997624874115,
|
||
|
|
"learning_rate": 6.666666666666667e-05,
|
||
|
|
"loss": 1.3207,
|
||
|
|
"mean_token_accuracy": 0.6400465965270996,
|
||
|
|
"num_tokens": 16147599.0,
|
||
|
|
"step": 39
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.07017543859649122,
|
||
|
|
"grad_norm": 1.1962817907333374,
|
||
|
|
"learning_rate": 6.842105263157895e-05,
|
||
|
|
"loss": 1.3581,
|
||
|
|
"mean_token_accuracy": 0.6308744549751282,
|
||
|
|
"num_tokens": 16557268.0,
|
||
|
|
"step": 40
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.07192982456140351,
|
||
|
|
"grad_norm": 1.1255979537963867,
|
||
|
|
"learning_rate": 7.017543859649122e-05,
|
||
|
|
"loss": 1.3227,
|
||
|
|
"mean_token_accuracy": 0.639037013053894,
|
||
|
|
"num_tokens": 16950170.0,
|
||
|
|
"step": 41
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.07368421052631578,
|
||
|
|
"grad_norm": 1.3424605131149292,
|
||
|
|
"learning_rate": 7.192982456140351e-05,
|
||
|
|
"loss": 1.3326,
|
||
|
|
"mean_token_accuracy": 0.6364561319351196,
|
||
|
|
"num_tokens": 17369691.0,
|
||
|
|
"step": 42
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.07543859649122807,
|
||
|
|
"grad_norm": 1.4676284790039062,
|
||
|
|
"learning_rate": 7.368421052631579e-05,
|
||
|
|
"loss": 1.3922,
|
||
|
|
"mean_token_accuracy": 0.6229093074798584,
|
||
|
|
"num_tokens": 17777844.0,
|
||
|
|
"step": 43
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.07719298245614035,
|
||
|
|
"grad_norm": 1.339996099472046,
|
||
|
|
"learning_rate": 7.543859649122808e-05,
|
||
|
|
"loss": 1.3405,
|
||
|
|
"mean_token_accuracy": 0.6354778409004211,
|
||
|
|
"num_tokens": 18191802.0,
|
||
|
|
"step": 44
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.07894736842105263,
|
||
|
|
"grad_norm": 1.7620866298675537,
|
||
|
|
"learning_rate": 7.719298245614036e-05,
|
||
|
|
"loss": 1.3225,
|
||
|
|
"mean_token_accuracy": 0.6384661197662354,
|
||
|
|
"num_tokens": 18608551.0,
|
||
|
|
"step": 45
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.08070175438596491,
|
||
|
|
"grad_norm": 1.4890868663787842,
|
||
|
|
"learning_rate": 7.894736842105263e-05,
|
||
|
|
"loss": 1.3829,
|
||
|
|
"mean_token_accuracy": 0.6231105327606201,
|
||
|
|
"num_tokens": 19020275.0,
|
||
|
|
"step": 46
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.0824561403508772,
|
||
|
|
"grad_norm": 1.3470134735107422,
|
||
|
|
"learning_rate": 8.070175438596491e-05,
|
||
|
|
"loss": 1.3068,
|
||
|
|
"mean_token_accuracy": 0.6428367495536804,
|
||
|
|
"num_tokens": 19416792.0,
|
||
|
|
"step": 47
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.08421052631578947,
|
||
|
|
"grad_norm": 1.2967629432678223,
|
||
|
|
"learning_rate": 8.24561403508772e-05,
|
||
|
|
"loss": 1.3352,
|
||
|
|
"mean_token_accuracy": 0.6349055171012878,
|
||
|
|
"num_tokens": 19842380.0,
|
||
|
|
"step": 48
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.08596491228070176,
|
||
|
|
"grad_norm": 1.5379173755645752,
|
||
|
|
"learning_rate": 8.421052631578948e-05,
|
||
|
|
"loss": 1.3475,
|
||
|
|
"mean_token_accuracy": 0.6317988634109497,
|
||
|
|
"num_tokens": 20257340.0,
|
||
|
|
"step": 49
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.08771929824561403,
|
||
|
|
"grad_norm": 1.2810230255126953,
|
||
|
|
"learning_rate": 8.596491228070177e-05,
|
||
|
|
"loss": 1.3337,
|
||
|
|
"mean_token_accuracy": 0.6345160007476807,
|
||
|
|
"num_tokens": 20693853.0,
|
||
|
|
"step": 50
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.08947368421052632,
|
||
|
|
"grad_norm": 1.5687311887741089,
|
||
|
|
"learning_rate": 8.771929824561403e-05,
|
||
|
|
"loss": 1.3213,
|
||
|
|
"mean_token_accuracy": 0.637017011642456,
|
||
|
|
"num_tokens": 21145069.0,
|
||
|
|
"step": 51
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.0912280701754386,
|
||
|
|
"grad_norm": 1.3021150827407837,
|
||
|
|
"learning_rate": 8.947368421052632e-05,
|
||
|
|
"loss": 1.3366,
|
||
|
|
"mean_token_accuracy": 0.6351794600486755,
|
||
|
|
"num_tokens": 21548907.0,
|
||
|
|
"step": 52
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.09298245614035087,
|
||
|
|
"grad_norm": 1.6907377243041992,
|
||
|
|
"learning_rate": 9.12280701754386e-05,
|
||
|
|
"loss": 1.3326,
|
||
|
|
"mean_token_accuracy": 0.6350468397140503,
|
||
|
|
"num_tokens": 21947011.0,
|
||
|
|
"step": 53
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.09473684210526316,
|
||
|
|
"grad_norm": 1.4103087186813354,
|
||
|
|
"learning_rate": 9.298245614035089e-05,
|
||
|
|
"loss": 1.3593,
|
||
|
|
"mean_token_accuracy": 0.6285296082496643,
|
||
|
|
"num_tokens": 22384995.0,
|
||
|
|
"step": 54
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.09649122807017543,
|
||
|
|
"grad_norm": 1.3662679195404053,
|
||
|
|
"learning_rate": 9.473684210526316e-05,
|
||
|
|
"loss": 1.2921,
|
||
|
|
"mean_token_accuracy": 0.644428014755249,
|
||
|
|
"num_tokens": 22774976.0,
|
||
|
|
"step": 55
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.09824561403508772,
|
||
|
|
"grad_norm": 1.4143177270889282,
|
||
|
|
"learning_rate": 9.649122807017544e-05,
|
||
|
|
"loss": 1.3537,
|
||
|
|
"mean_token_accuracy": 0.6295624375343323,
|
||
|
|
"num_tokens": 23192796.0,
|
||
|
|
"step": 56
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1,
|
||
|
|
"grad_norm": 1.085375189781189,
|
||
|
|
"learning_rate": 9.824561403508771e-05,
|
||
|
|
"loss": 1.3102,
|
||
|
|
"mean_token_accuracy": 0.6399141550064087,
|
||
|
|
"num_tokens": 23610316.0,
|
||
|
|
"step": 57
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.10175438596491228,
|
||
|
|
"grad_norm": 1.3309866189956665,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.3685,
|
||
|
|
"mean_token_accuracy": 0.6267704367637634,
|
||
|
|
"num_tokens": 24043550.0,
|
||
|
|
"step": 58
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.10350877192982456,
|
||
|
|
"grad_norm": 1.4298138618469238,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.3263,
|
||
|
|
"mean_token_accuracy": 0.6358023285865784,
|
||
|
|
"num_tokens": 24482051.0,
|
||
|
|
"step": 59
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.10526315789473684,
|
||
|
|
"grad_norm": 1.3495875597000122,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.3231,
|
||
|
|
"mean_token_accuracy": 0.6365145444869995,
|
||
|
|
"num_tokens": 24889567.0,
|
||
|
|
"step": 60
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.10701754385964912,
|
||
|
|
"grad_norm": 1.3433363437652588,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.3265,
|
||
|
|
"mean_token_accuracy": 0.6364420652389526,
|
||
|
|
"num_tokens": 25296636.0,
|
||
|
|
"step": 61
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.10877192982456141,
|
||
|
|
"grad_norm": 1.4023200273513794,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2956,
|
||
|
|
"mean_token_accuracy": 0.6436514854431152,
|
||
|
|
"num_tokens": 25694614.0,
|
||
|
|
"step": 62
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.11052631578947368,
|
||
|
|
"grad_norm": 1.4170814752578735,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.308,
|
||
|
|
"mean_token_accuracy": 0.6407404541969299,
|
||
|
|
"num_tokens": 26107689.0,
|
||
|
|
"step": 63
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.11228070175438597,
|
||
|
|
"grad_norm": 1.198994755744934,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.3057,
|
||
|
|
"mean_token_accuracy": 0.6392735838890076,
|
||
|
|
"num_tokens": 26522427.0,
|
||
|
|
"step": 64
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.11403508771929824,
|
||
|
|
"grad_norm": 1.422518014907837,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.3237,
|
||
|
|
"mean_token_accuracy": 0.6369431018829346,
|
||
|
|
"num_tokens": 26934123.0,
|
||
|
|
"step": 65
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.11578947368421053,
|
||
|
|
"grad_norm": 1.3225864171981812,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.3213,
|
||
|
|
"mean_token_accuracy": 0.637446403503418,
|
||
|
|
"num_tokens": 27358915.0,
|
||
|
|
"step": 66
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.11754385964912281,
|
||
|
|
"grad_norm": 1.1103287935256958,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.3107,
|
||
|
|
"mean_token_accuracy": 0.6421770453453064,
|
||
|
|
"num_tokens": 27775556.0,
|
||
|
|
"step": 67
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.11929824561403508,
|
||
|
|
"grad_norm": 1.1607317924499512,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.3219,
|
||
|
|
"mean_token_accuracy": 0.6369372010231018,
|
||
|
|
"num_tokens": 28174439.0,
|
||
|
|
"step": 68
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.12105263157894737,
|
||
|
|
"grad_norm": 1.121587872505188,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2874,
|
||
|
|
"mean_token_accuracy": 0.6452154517173767,
|
||
|
|
"num_tokens": 28591718.0,
|
||
|
|
"step": 69
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.12280701754385964,
|
||
|
|
"grad_norm": 1.347907304763794,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.3031,
|
||
|
|
"mean_token_accuracy": 0.6407017707824707,
|
||
|
|
"num_tokens": 29010174.0,
|
||
|
|
"step": 70
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.12456140350877193,
|
||
|
|
"grad_norm": 0.9920047521591187,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.3081,
|
||
|
|
"mean_token_accuracy": 0.6407448053359985,
|
||
|
|
"num_tokens": 29430243.0,
|
||
|
|
"step": 71
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.12631578947368421,
|
||
|
|
"grad_norm": 1.4440033435821533,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.317,
|
||
|
|
"mean_token_accuracy": 0.6371080279350281,
|
||
|
|
"num_tokens": 29852311.0,
|
||
|
|
"step": 72
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1280701754385965,
|
||
|
|
"grad_norm": 1.172947645187378,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2955,
|
||
|
|
"mean_token_accuracy": 0.6423896551132202,
|
||
|
|
"num_tokens": 30267287.0,
|
||
|
|
"step": 73
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.12982456140350876,
|
||
|
|
"grad_norm": 1.2112936973571777,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.3155,
|
||
|
|
"mean_token_accuracy": 0.6372196674346924,
|
||
|
|
"num_tokens": 30715583.0,
|
||
|
|
"step": 74
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.13157894736842105,
|
||
|
|
"grad_norm": 1.1959091424942017,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.326,
|
||
|
|
"mean_token_accuracy": 0.6362699866294861,
|
||
|
|
"num_tokens": 31143599.0,
|
||
|
|
"step": 75
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.13333333333333333,
|
||
|
|
"grad_norm": 1.3436111211776733,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.3162,
|
||
|
|
"mean_token_accuracy": 0.6371738910675049,
|
||
|
|
"num_tokens": 31573952.0,
|
||
|
|
"step": 76
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.13508771929824562,
|
||
|
|
"grad_norm": 1.101008653640747,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.334,
|
||
|
|
"mean_token_accuracy": 0.6339811086654663,
|
||
|
|
"num_tokens": 31985309.0,
|
||
|
|
"step": 77
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1368421052631579,
|
||
|
|
"grad_norm": 1.2296723127365112,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.3346,
|
||
|
|
"mean_token_accuracy": 0.6317209005355835,
|
||
|
|
"num_tokens": 32422743.0,
|
||
|
|
"step": 78
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.13859649122807016,
|
||
|
|
"grad_norm": 1.0157369375228882,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.3197,
|
||
|
|
"mean_token_accuracy": 0.6366320848464966,
|
||
|
|
"num_tokens": 32830001.0,
|
||
|
|
"step": 79
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.14035087719298245,
|
||
|
|
"grad_norm": 1.1848087310791016,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2867,
|
||
|
|
"mean_token_accuracy": 0.6441957950592041,
|
||
|
|
"num_tokens": 33242795.0,
|
||
|
|
"step": 80
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.14210526315789473,
|
||
|
|
"grad_norm": 1.035370945930481,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.291,
|
||
|
|
"mean_token_accuracy": 0.6427246332168579,
|
||
|
|
"num_tokens": 33631163.0,
|
||
|
|
"step": 81
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.14385964912280702,
|
||
|
|
"grad_norm": 1.2173899412155151,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.331,
|
||
|
|
"mean_token_accuracy": 0.6329866647720337,
|
||
|
|
"num_tokens": 34030802.0,
|
||
|
|
"step": 82
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1456140350877193,
|
||
|
|
"grad_norm": 1.2178702354431152,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.3159,
|
||
|
|
"mean_token_accuracy": 0.6359639167785645,
|
||
|
|
"num_tokens": 34443782.0,
|
||
|
|
"step": 83
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.14736842105263157,
|
||
|
|
"grad_norm": 1.045278787612915,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.3248,
|
||
|
|
"mean_token_accuracy": 0.6354624629020691,
|
||
|
|
"num_tokens": 34854553.0,
|
||
|
|
"step": 84
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.14912280701754385,
|
||
|
|
"grad_norm": 1.0509958267211914,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.3038,
|
||
|
|
"mean_token_accuracy": 0.6399705410003662,
|
||
|
|
"num_tokens": 35265577.0,
|
||
|
|
"step": 85
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.15087719298245614,
|
||
|
|
"grad_norm": 1.1449450254440308,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.3004,
|
||
|
|
"mean_token_accuracy": 0.6418460011482239,
|
||
|
|
"num_tokens": 35705670.0,
|
||
|
|
"step": 86
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.15263157894736842,
|
||
|
|
"grad_norm": 1.254193902015686,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.3088,
|
||
|
|
"mean_token_accuracy": 0.6380743980407715,
|
||
|
|
"num_tokens": 36117458.0,
|
||
|
|
"step": 87
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1543859649122807,
|
||
|
|
"grad_norm": 1.107653021812439,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.3009,
|
||
|
|
"mean_token_accuracy": 0.6418921947479248,
|
||
|
|
"num_tokens": 36528322.0,
|
||
|
|
"step": 88
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.156140350877193,
|
||
|
|
"grad_norm": 0.9854401350021362,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.3022,
|
||
|
|
"mean_token_accuracy": 0.6411072015762329,
|
||
|
|
"num_tokens": 36937089.0,
|
||
|
|
"step": 89
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.15789473684210525,
|
||
|
|
"grad_norm": 0.9852709174156189,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2818,
|
||
|
|
"mean_token_accuracy": 0.6445783972740173,
|
||
|
|
"num_tokens": 37347449.0,
|
||
|
|
"step": 90
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.15964912280701754,
|
||
|
|
"grad_norm": 1.0607930421829224,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.3042,
|
||
|
|
"mean_token_accuracy": 0.6411298513412476,
|
||
|
|
"num_tokens": 37768132.0,
|
||
|
|
"step": 91
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.16140350877192983,
|
||
|
|
"grad_norm": 0.8618792295455933,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2899,
|
||
|
|
"mean_token_accuracy": 0.6437462568283081,
|
||
|
|
"num_tokens": 38173420.0,
|
||
|
|
"step": 92
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1631578947368421,
|
||
|
|
"grad_norm": 0.9967447519302368,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2854,
|
||
|
|
"mean_token_accuracy": 0.6431381106376648,
|
||
|
|
"num_tokens": 38564492.0,
|
||
|
|
"step": 93
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1649122807017544,
|
||
|
|
"grad_norm": 0.984609842300415,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.3059,
|
||
|
|
"mean_token_accuracy": 0.6391075849533081,
|
||
|
|
"num_tokens": 38973529.0,
|
||
|
|
"step": 94
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.16666666666666666,
|
||
|
|
"grad_norm": 1.2071311473846436,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.3399,
|
||
|
|
"mean_token_accuracy": 0.6329282522201538,
|
||
|
|
"num_tokens": 39394470.0,
|
||
|
|
"step": 95
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.16842105263157894,
|
||
|
|
"grad_norm": 0.976823627948761,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.3189,
|
||
|
|
"mean_token_accuracy": 0.6357463598251343,
|
||
|
|
"num_tokens": 39817247.0,
|
||
|
|
"step": 96
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.17017543859649123,
|
||
|
|
"grad_norm": 1.1396266222000122,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.3212,
|
||
|
|
"mean_token_accuracy": 0.6357501745223999,
|
||
|
|
"num_tokens": 40219578.0,
|
||
|
|
"step": 97
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.17192982456140352,
|
||
|
|
"grad_norm": 1.375174880027771,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.3187,
|
||
|
|
"mean_token_accuracy": 0.6361098289489746,
|
||
|
|
"num_tokens": 40631135.0,
|
||
|
|
"step": 98
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1736842105263158,
|
||
|
|
"grad_norm": 1.1790404319763184,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2962,
|
||
|
|
"mean_token_accuracy": 0.6430338621139526,
|
||
|
|
"num_tokens": 41060591.0,
|
||
|
|
"step": 99
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.17543859649122806,
|
||
|
|
"grad_norm": 1.1208826303482056,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.3243,
|
||
|
|
"mean_token_accuracy": 0.6350250244140625,
|
||
|
|
"num_tokens": 41491310.0,
|
||
|
|
"step": 100
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.17719298245614035,
|
||
|
|
"grad_norm": 0.9812876582145691,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2989,
|
||
|
|
"mean_token_accuracy": 0.6403151154518127,
|
||
|
|
"num_tokens": 41928251.0,
|
||
|
|
"step": 101
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.17894736842105263,
|
||
|
|
"grad_norm": 1.118895173072815,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2881,
|
||
|
|
"mean_token_accuracy": 0.643653392791748,
|
||
|
|
"num_tokens": 42328746.0,
|
||
|
|
"step": 102
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.18070175438596492,
|
||
|
|
"grad_norm": 1.0872011184692383,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.3338,
|
||
|
|
"mean_token_accuracy": 0.6348128318786621,
|
||
|
|
"num_tokens": 42770651.0,
|
||
|
|
"step": 103
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1824561403508772,
|
||
|
|
"grad_norm": 1.0117576122283936,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.305,
|
||
|
|
"mean_token_accuracy": 0.6394751071929932,
|
||
|
|
"num_tokens": 43173759.0,
|
||
|
|
"step": 104
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.18421052631578946,
|
||
|
|
"grad_norm": 0.9142250418663025,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2908,
|
||
|
|
"mean_token_accuracy": 0.642844557762146,
|
||
|
|
"num_tokens": 43604295.0,
|
||
|
|
"step": 105
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.18596491228070175,
|
||
|
|
"grad_norm": 1.1038587093353271,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2963,
|
||
|
|
"mean_token_accuracy": 0.6419985294342041,
|
||
|
|
"num_tokens": 44008348.0,
|
||
|
|
"step": 106
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.18771929824561404,
|
||
|
|
"grad_norm": 0.928559422492981,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.3107,
|
||
|
|
"mean_token_accuracy": 0.6373360753059387,
|
||
|
|
"num_tokens": 44444613.0,
|
||
|
|
"step": 107
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.18947368421052632,
|
||
|
|
"grad_norm": 1.0053200721740723,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2887,
|
||
|
|
"mean_token_accuracy": 0.6448897123336792,
|
||
|
|
"num_tokens": 44875312.0,
|
||
|
|
"step": 108
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.1912280701754386,
|
||
|
|
"grad_norm": 0.9399821758270264,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2996,
|
||
|
|
"mean_token_accuracy": 0.6389566659927368,
|
||
|
|
"num_tokens": 45273670.0,
|
||
|
|
"step": 109
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.19298245614035087,
|
||
|
|
"grad_norm": 1.2514432668685913,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2788,
|
||
|
|
"mean_token_accuracy": 0.6447431445121765,
|
||
|
|
"num_tokens": 45696861.0,
|
||
|
|
"step": 110
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.19473684210526315,
|
||
|
|
"grad_norm": 0.9928343892097473,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.303,
|
||
|
|
"mean_token_accuracy": 0.639816403388977,
|
||
|
|
"num_tokens": 46115387.0,
|
||
|
|
"step": 111
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.19649122807017544,
|
||
|
|
"grad_norm": 1.0918611288070679,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2904,
|
||
|
|
"mean_token_accuracy": 0.6424538493156433,
|
||
|
|
"num_tokens": 46521933.0,
|
||
|
|
"step": 112
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.19824561403508772,
|
||
|
|
"grad_norm": 1.1192419528961182,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.3263,
|
||
|
|
"mean_token_accuracy": 0.634386420249939,
|
||
|
|
"num_tokens": 46941357.0,
|
||
|
|
"step": 113
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2,
|
||
|
|
"grad_norm": 0.9753395318984985,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2792,
|
||
|
|
"mean_token_accuracy": 0.6461848020553589,
|
||
|
|
"num_tokens": 47343683.0,
|
||
|
|
"step": 114
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.20175438596491227,
|
||
|
|
"grad_norm": 0.8872193694114685,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2928,
|
||
|
|
"mean_token_accuracy": 0.6433566808700562,
|
||
|
|
"num_tokens": 47787665.0,
|
||
|
|
"step": 115
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.20350877192982456,
|
||
|
|
"grad_norm": 0.9394273161888123,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2839,
|
||
|
|
"mean_token_accuracy": 0.643855094909668,
|
||
|
|
"num_tokens": 48190313.0,
|
||
|
|
"step": 116
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.20526315789473684,
|
||
|
|
"grad_norm": 1.136915922164917,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2904,
|
||
|
|
"mean_token_accuracy": 0.6421671509742737,
|
||
|
|
"num_tokens": 48630247.0,
|
||
|
|
"step": 117
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.20701754385964913,
|
||
|
|
"grad_norm": 0.9522098898887634,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2942,
|
||
|
|
"mean_token_accuracy": 0.6419311761856079,
|
||
|
|
"num_tokens": 49009657.0,
|
||
|
|
"step": 118
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.20877192982456141,
|
||
|
|
"grad_norm": 1.1538357734680176,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2708,
|
||
|
|
"mean_token_accuracy": 0.6458480358123779,
|
||
|
|
"num_tokens": 49398873.0,
|
||
|
|
"step": 119
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.21052631578947367,
|
||
|
|
"grad_norm": 0.9239334464073181,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2768,
|
||
|
|
"mean_token_accuracy": 0.6459267139434814,
|
||
|
|
"num_tokens": 49804381.0,
|
||
|
|
"step": 120
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.21228070175438596,
|
||
|
|
"grad_norm": 0.9793084859848022,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2712,
|
||
|
|
"mean_token_accuracy": 0.6456162929534912,
|
||
|
|
"num_tokens": 50213766.0,
|
||
|
|
"step": 121
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.21403508771929824,
|
||
|
|
"grad_norm": 1.1136904954910278,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2877,
|
||
|
|
"mean_token_accuracy": 0.6435809135437012,
|
||
|
|
"num_tokens": 50625155.0,
|
||
|
|
"step": 122
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.21578947368421053,
|
||
|
|
"grad_norm": 0.8962170481681824,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2929,
|
||
|
|
"mean_token_accuracy": 0.6421340703964233,
|
||
|
|
"num_tokens": 51028769.0,
|
||
|
|
"step": 123
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.21754385964912282,
|
||
|
|
"grad_norm": 1.0955440998077393,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2801,
|
||
|
|
"mean_token_accuracy": 0.6436057090759277,
|
||
|
|
"num_tokens": 51442144.0,
|
||
|
|
"step": 124
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.21929824561403508,
|
||
|
|
"grad_norm": 0.9009307622909546,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2709,
|
||
|
|
"mean_token_accuracy": 0.6477597951889038,
|
||
|
|
"num_tokens": 51847840.0,
|
||
|
|
"step": 125
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.22105263157894736,
|
||
|
|
"grad_norm": 1.0885659456253052,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2771,
|
||
|
|
"mean_token_accuracy": 0.6435371041297913,
|
||
|
|
"num_tokens": 52284344.0,
|
||
|
|
"step": 126
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.22280701754385965,
|
||
|
|
"grad_norm": 0.92705899477005,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.3212,
|
||
|
|
"mean_token_accuracy": 0.6340687870979309,
|
||
|
|
"num_tokens": 52685505.0,
|
||
|
|
"step": 127
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.22456140350877193,
|
||
|
|
"grad_norm": 0.9139009118080139,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2932,
|
||
|
|
"mean_token_accuracy": 0.6407190561294556,
|
||
|
|
"num_tokens": 53110059.0,
|
||
|
|
"step": 128
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.22631578947368422,
|
||
|
|
"grad_norm": 0.8279791474342346,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2659,
|
||
|
|
"mean_token_accuracy": 0.6474618911743164,
|
||
|
|
"num_tokens": 53502719.0,
|
||
|
|
"step": 129
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.22807017543859648,
|
||
|
|
"grad_norm": 0.9933703541755676,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2681,
|
||
|
|
"mean_token_accuracy": 0.6483220458030701,
|
||
|
|
"num_tokens": 53917317.0,
|
||
|
|
"step": 130
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.22982456140350876,
|
||
|
|
"grad_norm": 0.887478768825531,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.3002,
|
||
|
|
"mean_token_accuracy": 0.6406154632568359,
|
||
|
|
"num_tokens": 54338305.0,
|
||
|
|
"step": 131
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.23157894736842105,
|
||
|
|
"grad_norm": 0.8612638711929321,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2774,
|
||
|
|
"mean_token_accuracy": 0.6442515850067139,
|
||
|
|
"num_tokens": 54754376.0,
|
||
|
|
"step": 132
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.23333333333333334,
|
||
|
|
"grad_norm": 0.850595235824585,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.3222,
|
||
|
|
"mean_token_accuracy": 0.6351276636123657,
|
||
|
|
"num_tokens": 55177668.0,
|
||
|
|
"step": 133
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.23508771929824562,
|
||
|
|
"grad_norm": 1.1265441179275513,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2869,
|
||
|
|
"mean_token_accuracy": 0.6440436840057373,
|
||
|
|
"num_tokens": 55588413.0,
|
||
|
|
"step": 134
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.23684210526315788,
|
||
|
|
"grad_norm": 0.8181601762771606,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2689,
|
||
|
|
"mean_token_accuracy": 0.6486095190048218,
|
||
|
|
"num_tokens": 56002661.0,
|
||
|
|
"step": 135
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.23859649122807017,
|
||
|
|
"grad_norm": 0.9597206115722656,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2685,
|
||
|
|
"mean_token_accuracy": 0.6476383209228516,
|
||
|
|
"num_tokens": 56399978.0,
|
||
|
|
"step": 136
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.24035087719298245,
|
||
|
|
"grad_norm": 0.9021192193031311,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.277,
|
||
|
|
"mean_token_accuracy": 0.6456701159477234,
|
||
|
|
"num_tokens": 56805057.0,
|
||
|
|
"step": 137
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.24210526315789474,
|
||
|
|
"grad_norm": 0.9269475936889648,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2937,
|
||
|
|
"mean_token_accuracy": 0.6424517631530762,
|
||
|
|
"num_tokens": 57221548.0,
|
||
|
|
"step": 138
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.24385964912280703,
|
||
|
|
"grad_norm": 0.9395855069160461,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2905,
|
||
|
|
"mean_token_accuracy": 0.6421390175819397,
|
||
|
|
"num_tokens": 57619205.0,
|
||
|
|
"step": 139
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.24561403508771928,
|
||
|
|
"grad_norm": 1.0334845781326294,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2727,
|
||
|
|
"mean_token_accuracy": 0.6469994783401489,
|
||
|
|
"num_tokens": 58029104.0,
|
||
|
|
"step": 140
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.24736842105263157,
|
||
|
|
"grad_norm": 1.080823302268982,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.3086,
|
||
|
|
"mean_token_accuracy": 0.6387298107147217,
|
||
|
|
"num_tokens": 58444372.0,
|
||
|
|
"step": 141
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.24912280701754386,
|
||
|
|
"grad_norm": 0.8953016400337219,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.3045,
|
||
|
|
"mean_token_accuracy": 0.6378800272941589,
|
||
|
|
"num_tokens": 58859539.0,
|
||
|
|
"step": 142
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.25087719298245614,
|
||
|
|
"grad_norm": 0.8567958474159241,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.3121,
|
||
|
|
"mean_token_accuracy": 0.6372794508934021,
|
||
|
|
"num_tokens": 59268101.0,
|
||
|
|
"step": 143
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.25263157894736843,
|
||
|
|
"grad_norm": 1.158692479133606,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2691,
|
||
|
|
"mean_token_accuracy": 0.6473510265350342,
|
||
|
|
"num_tokens": 59708311.0,
|
||
|
|
"step": 144
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2543859649122807,
|
||
|
|
"grad_norm": 0.9232509732246399,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2682,
|
||
|
|
"mean_token_accuracy": 0.648154616355896,
|
||
|
|
"num_tokens": 60135806.0,
|
||
|
|
"step": 145
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.256140350877193,
|
||
|
|
"grad_norm": 0.9411163330078125,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.3155,
|
||
|
|
"mean_token_accuracy": 0.6371098756790161,
|
||
|
|
"num_tokens": 60546162.0,
|
||
|
|
"step": 146
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2578947368421053,
|
||
|
|
"grad_norm": 1.013136863708496,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2655,
|
||
|
|
"mean_token_accuracy": 0.646664023399353,
|
||
|
|
"num_tokens": 60977521.0,
|
||
|
|
"step": 147
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2596491228070175,
|
||
|
|
"grad_norm": 1.1551271677017212,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2798,
|
||
|
|
"mean_token_accuracy": 0.6451727747917175,
|
||
|
|
"num_tokens": 61372998.0,
|
||
|
|
"step": 148
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2614035087719298,
|
||
|
|
"grad_norm": 0.8795229196548462,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2982,
|
||
|
|
"mean_token_accuracy": 0.6401211023330688,
|
||
|
|
"num_tokens": 61781320.0,
|
||
|
|
"step": 149
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2631578947368421,
|
||
|
|
"grad_norm": 0.965307891368866,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2788,
|
||
|
|
"mean_token_accuracy": 0.6439850330352783,
|
||
|
|
"num_tokens": 62199535.0,
|
||
|
|
"step": 150
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2649122807017544,
|
||
|
|
"grad_norm": 0.9804089665412903,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2765,
|
||
|
|
"mean_token_accuracy": 0.6456645727157593,
|
||
|
|
"num_tokens": 62632976.0,
|
||
|
|
"step": 151
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.26666666666666666,
|
||
|
|
"grad_norm": 0.9098561406135559,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2564,
|
||
|
|
"mean_token_accuracy": 0.6501985192298889,
|
||
|
|
"num_tokens": 63043041.0,
|
||
|
|
"step": 152
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.26842105263157895,
|
||
|
|
"grad_norm": 0.7934507727622986,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2638,
|
||
|
|
"mean_token_accuracy": 0.6469926834106445,
|
||
|
|
"num_tokens": 63453330.0,
|
||
|
|
"step": 153
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.27017543859649124,
|
||
|
|
"grad_norm": 1.0823460817337036,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2983,
|
||
|
|
"mean_token_accuracy": 0.6407555341720581,
|
||
|
|
"num_tokens": 63864814.0,
|
||
|
|
"step": 154
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2719298245614035,
|
||
|
|
"grad_norm": 0.7126585841178894,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2528,
|
||
|
|
"mean_token_accuracy": 0.6504640579223633,
|
||
|
|
"num_tokens": 64280965.0,
|
||
|
|
"step": 155
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2736842105263158,
|
||
|
|
"grad_norm": 0.9585691690444946,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2686,
|
||
|
|
"mean_token_accuracy": 0.6472254395484924,
|
||
|
|
"num_tokens": 64672526.0,
|
||
|
|
"step": 156
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2754385964912281,
|
||
|
|
"grad_norm": 0.752656102180481,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2474,
|
||
|
|
"mean_token_accuracy": 0.6523414850234985,
|
||
|
|
"num_tokens": 65077313.0,
|
||
|
|
"step": 157
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2771929824561403,
|
||
|
|
"grad_norm": 0.9288751482963562,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2624,
|
||
|
|
"mean_token_accuracy": 0.646138072013855,
|
||
|
|
"num_tokens": 65487692.0,
|
||
|
|
"step": 158
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2789473684210526,
|
||
|
|
"grad_norm": 0.8809065222740173,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2607,
|
||
|
|
"mean_token_accuracy": 0.6489483118057251,
|
||
|
|
"num_tokens": 65905410.0,
|
||
|
|
"step": 159
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2807017543859649,
|
||
|
|
"grad_norm": 0.9240980744361877,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2958,
|
||
|
|
"mean_token_accuracy": 0.6388487219810486,
|
||
|
|
"num_tokens": 66310475.0,
|
||
|
|
"step": 160
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2824561403508772,
|
||
|
|
"grad_norm": 0.8388931751251221,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2838,
|
||
|
|
"mean_token_accuracy": 0.6431634426116943,
|
||
|
|
"num_tokens": 66727367.0,
|
||
|
|
"step": 161
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.28421052631578947,
|
||
|
|
"grad_norm": 0.8820334076881409,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.303,
|
||
|
|
"mean_token_accuracy": 0.63872230052948,
|
||
|
|
"num_tokens": 67154750.0,
|
||
|
|
"step": 162
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.28596491228070176,
|
||
|
|
"grad_norm": 0.8385342359542847,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2632,
|
||
|
|
"mean_token_accuracy": 0.6464277505874634,
|
||
|
|
"num_tokens": 67571403.0,
|
||
|
|
"step": 163
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.28771929824561404,
|
||
|
|
"grad_norm": 0.8737322092056274,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.248,
|
||
|
|
"mean_token_accuracy": 0.6486573219299316,
|
||
|
|
"num_tokens": 67947644.0,
|
||
|
|
"step": 164
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2894736842105263,
|
||
|
|
"grad_norm": 0.8021559119224548,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2716,
|
||
|
|
"mean_token_accuracy": 0.644127607345581,
|
||
|
|
"num_tokens": 68362596.0,
|
||
|
|
"step": 165
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2912280701754386,
|
||
|
|
"grad_norm": 0.7599727511405945,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2511,
|
||
|
|
"mean_token_accuracy": 0.6510897874832153,
|
||
|
|
"num_tokens": 68768455.0,
|
||
|
|
"step": 166
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2929824561403509,
|
||
|
|
"grad_norm": 0.8421241044998169,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2839,
|
||
|
|
"mean_token_accuracy": 0.6423199772834778,
|
||
|
|
"num_tokens": 69182971.0,
|
||
|
|
"step": 167
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.29473684210526313,
|
||
|
|
"grad_norm": 0.8853815793991089,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2698,
|
||
|
|
"mean_token_accuracy": 0.6462802886962891,
|
||
|
|
"num_tokens": 69633484.0,
|
||
|
|
"step": 168
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2964912280701754,
|
||
|
|
"grad_norm": 1.0347827672958374,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2935,
|
||
|
|
"mean_token_accuracy": 0.6394780874252319,
|
||
|
|
"num_tokens": 70069352.0,
|
||
|
|
"step": 169
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.2982456140350877,
|
||
|
|
"grad_norm": 0.8993912935256958,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2689,
|
||
|
|
"mean_token_accuracy": 0.6471203565597534,
|
||
|
|
"num_tokens": 70478616.0,
|
||
|
|
"step": 170
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3,
|
||
|
|
"grad_norm": 0.8886722922325134,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2943,
|
||
|
|
"mean_token_accuracy": 0.6398030519485474,
|
||
|
|
"num_tokens": 70905952.0,
|
||
|
|
"step": 171
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3017543859649123,
|
||
|
|
"grad_norm": 1.1024540662765503,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2799,
|
||
|
|
"mean_token_accuracy": 0.6444812417030334,
|
||
|
|
"num_tokens": 71317263.0,
|
||
|
|
"step": 172
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.30350877192982456,
|
||
|
|
"grad_norm": 0.8578200340270996,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2851,
|
||
|
|
"mean_token_accuracy": 0.6433255672454834,
|
||
|
|
"num_tokens": 71764261.0,
|
||
|
|
"step": 173
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.30526315789473685,
|
||
|
|
"grad_norm": 0.9540239572525024,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2777,
|
||
|
|
"mean_token_accuracy": 0.644009530544281,
|
||
|
|
"num_tokens": 72179518.0,
|
||
|
|
"step": 174
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.30701754385964913,
|
||
|
|
"grad_norm": 0.9197105169296265,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2664,
|
||
|
|
"mean_token_accuracy": 0.6461474895477295,
|
||
|
|
"num_tokens": 72594932.0,
|
||
|
|
"step": 175
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3087719298245614,
|
||
|
|
"grad_norm": 0.7414684891700745,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2688,
|
||
|
|
"mean_token_accuracy": 0.6475616097450256,
|
||
|
|
"num_tokens": 73005473.0,
|
||
|
|
"step": 176
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3105263157894737,
|
||
|
|
"grad_norm": 0.9558865427970886,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2673,
|
||
|
|
"mean_token_accuracy": 0.6460444927215576,
|
||
|
|
"num_tokens": 73397415.0,
|
||
|
|
"step": 177
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.312280701754386,
|
||
|
|
"grad_norm": 0.9229446053504944,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2738,
|
||
|
|
"mean_token_accuracy": 0.6450560092926025,
|
||
|
|
"num_tokens": 73793964.0,
|
||
|
|
"step": 178
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3140350877192982,
|
||
|
|
"grad_norm": 0.7821291089057922,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2489,
|
||
|
|
"mean_token_accuracy": 0.6500634551048279,
|
||
|
|
"num_tokens": 74195270.0,
|
||
|
|
"step": 179
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3157894736842105,
|
||
|
|
"grad_norm": 0.7419561743736267,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2612,
|
||
|
|
"mean_token_accuracy": 0.6482713222503662,
|
||
|
|
"num_tokens": 74603802.0,
|
||
|
|
"step": 180
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3175438596491228,
|
||
|
|
"grad_norm": 0.7956511378288269,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2644,
|
||
|
|
"mean_token_accuracy": 0.6461498141288757,
|
||
|
|
"num_tokens": 75026083.0,
|
||
|
|
"step": 181
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3192982456140351,
|
||
|
|
"grad_norm": 0.7759901285171509,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2647,
|
||
|
|
"mean_token_accuracy": 0.6459044218063354,
|
||
|
|
"num_tokens": 75426052.0,
|
||
|
|
"step": 182
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.32105263157894737,
|
||
|
|
"grad_norm": 0.8206394910812378,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2661,
|
||
|
|
"mean_token_accuracy": 0.646410346031189,
|
||
|
|
"num_tokens": 75830225.0,
|
||
|
|
"step": 183
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.32280701754385965,
|
||
|
|
"grad_norm": 0.9032196402549744,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2677,
|
||
|
|
"mean_token_accuracy": 0.64692223072052,
|
||
|
|
"num_tokens": 76241940.0,
|
||
|
|
"step": 184
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.32456140350877194,
|
||
|
|
"grad_norm": 0.7018728256225586,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2505,
|
||
|
|
"mean_token_accuracy": 0.6487954258918762,
|
||
|
|
"num_tokens": 76663142.0,
|
||
|
|
"step": 185
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3263157894736842,
|
||
|
|
"grad_norm": 0.9026210904121399,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2483,
|
||
|
|
"mean_token_accuracy": 0.652391254901886,
|
||
|
|
"num_tokens": 77102129.0,
|
||
|
|
"step": 186
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3280701754385965,
|
||
|
|
"grad_norm": 0.8878228068351746,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2736,
|
||
|
|
"mean_token_accuracy": 0.6432400941848755,
|
||
|
|
"num_tokens": 77508487.0,
|
||
|
|
"step": 187
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3298245614035088,
|
||
|
|
"grad_norm": 0.9250103235244751,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2426,
|
||
|
|
"mean_token_accuracy": 0.651581883430481,
|
||
|
|
"num_tokens": 77908351.0,
|
||
|
|
"step": 188
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.33157894736842103,
|
||
|
|
"grad_norm": 0.6793785095214844,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2453,
|
||
|
|
"mean_token_accuracy": 0.6514815092086792,
|
||
|
|
"num_tokens": 78320845.0,
|
||
|
|
"step": 189
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3333333333333333,
|
||
|
|
"grad_norm": 0.7402032017707825,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2523,
|
||
|
|
"mean_token_accuracy": 0.6495726704597473,
|
||
|
|
"num_tokens": 78732383.0,
|
||
|
|
"step": 190
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3350877192982456,
|
||
|
|
"grad_norm": 0.9974339604377747,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2545,
|
||
|
|
"mean_token_accuracy": 0.6485158801078796,
|
||
|
|
"num_tokens": 79123256.0,
|
||
|
|
"step": 191
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3368421052631579,
|
||
|
|
"grad_norm": 0.9054931998252869,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2538,
|
||
|
|
"mean_token_accuracy": 0.6507511734962463,
|
||
|
|
"num_tokens": 79525484.0,
|
||
|
|
"step": 192
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3385964912280702,
|
||
|
|
"grad_norm": 0.7434863448143005,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2615,
|
||
|
|
"mean_token_accuracy": 0.6481617093086243,
|
||
|
|
"num_tokens": 79955140.0,
|
||
|
|
"step": 193
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.34035087719298246,
|
||
|
|
"grad_norm": 0.7779750823974609,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2739,
|
||
|
|
"mean_token_accuracy": 0.6448099613189697,
|
||
|
|
"num_tokens": 80387845.0,
|
||
|
|
"step": 194
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.34210526315789475,
|
||
|
|
"grad_norm": 0.8742808103561401,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2674,
|
||
|
|
"mean_token_accuracy": 0.6466174125671387,
|
||
|
|
"num_tokens": 80810849.0,
|
||
|
|
"step": 195
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.34385964912280703,
|
||
|
|
"grad_norm": 0.810045063495636,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2722,
|
||
|
|
"mean_token_accuracy": 0.6458349227905273,
|
||
|
|
"num_tokens": 81226626.0,
|
||
|
|
"step": 196
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3456140350877193,
|
||
|
|
"grad_norm": 0.7127732634544373,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2912,
|
||
|
|
"mean_token_accuracy": 0.6405543684959412,
|
||
|
|
"num_tokens": 81648045.0,
|
||
|
|
"step": 197
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3473684210526316,
|
||
|
|
"grad_norm": 0.8309784531593323,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.254,
|
||
|
|
"mean_token_accuracy": 0.6493059396743774,
|
||
|
|
"num_tokens": 82055621.0,
|
||
|
|
"step": 198
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.34912280701754383,
|
||
|
|
"grad_norm": 0.8503166437149048,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2772,
|
||
|
|
"mean_token_accuracy": 0.6441330909729004,
|
||
|
|
"num_tokens": 82454820.0,
|
||
|
|
"step": 199
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3508771929824561,
|
||
|
|
"grad_norm": 0.8834285736083984,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2701,
|
||
|
|
"mean_token_accuracy": 0.6456678509712219,
|
||
|
|
"num_tokens": 82881912.0,
|
||
|
|
"step": 200
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3526315789473684,
|
||
|
|
"grad_norm": 0.7746639847755432,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2731,
|
||
|
|
"mean_token_accuracy": 0.6454135775566101,
|
||
|
|
"num_tokens": 83294708.0,
|
||
|
|
"step": 201
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3543859649122807,
|
||
|
|
"grad_norm": 0.8626236915588379,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2677,
|
||
|
|
"mean_token_accuracy": 0.6472684144973755,
|
||
|
|
"num_tokens": 83692153.0,
|
||
|
|
"step": 202
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.356140350877193,
|
||
|
|
"grad_norm": 0.8129353523254395,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2504,
|
||
|
|
"mean_token_accuracy": 0.649857223033905,
|
||
|
|
"num_tokens": 84106215.0,
|
||
|
|
"step": 203
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.35789473684210527,
|
||
|
|
"grad_norm": 0.9501094818115234,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2788,
|
||
|
|
"mean_token_accuracy": 0.6440906524658203,
|
||
|
|
"num_tokens": 84533326.0,
|
||
|
|
"step": 204
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.35964912280701755,
|
||
|
|
"grad_norm": 0.7424087524414062,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2663,
|
||
|
|
"mean_token_accuracy": 0.6469358205795288,
|
||
|
|
"num_tokens": 84958198.0,
|
||
|
|
"step": 205
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.36140350877192984,
|
||
|
|
"grad_norm": 0.7956259846687317,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2691,
|
||
|
|
"mean_token_accuracy": 0.6459769010543823,
|
||
|
|
"num_tokens": 85375870.0,
|
||
|
|
"step": 206
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3631578947368421,
|
||
|
|
"grad_norm": 0.7288737893104553,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2707,
|
||
|
|
"mean_token_accuracy": 0.6465530395507812,
|
||
|
|
"num_tokens": 85800348.0,
|
||
|
|
"step": 207
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3649122807017544,
|
||
|
|
"grad_norm": 0.7138190865516663,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2415,
|
||
|
|
"mean_token_accuracy": 0.6529696583747864,
|
||
|
|
"num_tokens": 86195245.0,
|
||
|
|
"step": 208
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.36666666666666664,
|
||
|
|
"grad_norm": 0.9041345119476318,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2673,
|
||
|
|
"mean_token_accuracy": 0.6487336754798889,
|
||
|
|
"num_tokens": 86599515.0,
|
||
|
|
"step": 209
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3684210526315789,
|
||
|
|
"grad_norm": 0.7553381323814392,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.298,
|
||
|
|
"mean_token_accuracy": 0.6391161680221558,
|
||
|
|
"num_tokens": 87039537.0,
|
||
|
|
"step": 210
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3701754385964912,
|
||
|
|
"grad_norm": 0.7526540160179138,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2465,
|
||
|
|
"mean_token_accuracy": 0.6537913084030151,
|
||
|
|
"num_tokens": 87426029.0,
|
||
|
|
"step": 211
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3719298245614035,
|
||
|
|
"grad_norm": 0.9352124333381653,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2655,
|
||
|
|
"mean_token_accuracy": 0.6462576389312744,
|
||
|
|
"num_tokens": 87817987.0,
|
||
|
|
"step": 212
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3736842105263158,
|
||
|
|
"grad_norm": 0.8342838883399963,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2356,
|
||
|
|
"mean_token_accuracy": 0.653677225112915,
|
||
|
|
"num_tokens": 88218779.0,
|
||
|
|
"step": 213
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.37543859649122807,
|
||
|
|
"grad_norm": 0.7606971263885498,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2423,
|
||
|
|
"mean_token_accuracy": 0.6509567499160767,
|
||
|
|
"num_tokens": 88610670.0,
|
||
|
|
"step": 214
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.37719298245614036,
|
||
|
|
"grad_norm": 0.9147993326187134,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2777,
|
||
|
|
"mean_token_accuracy": 0.6447807550430298,
|
||
|
|
"num_tokens": 89031094.0,
|
||
|
|
"step": 215
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.37894736842105264,
|
||
|
|
"grad_norm": 0.8798630833625793,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.282,
|
||
|
|
"mean_token_accuracy": 0.6422438025474548,
|
||
|
|
"num_tokens": 89465235.0,
|
||
|
|
"step": 216
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.38070175438596493,
|
||
|
|
"grad_norm": 0.7571805119514465,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2503,
|
||
|
|
"mean_token_accuracy": 0.6498503684997559,
|
||
|
|
"num_tokens": 89867145.0,
|
||
|
|
"step": 217
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3824561403508772,
|
||
|
|
"grad_norm": 0.9793193936347961,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.248,
|
||
|
|
"mean_token_accuracy": 0.6518094539642334,
|
||
|
|
"num_tokens": 90262494.0,
|
||
|
|
"step": 218
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.38421052631578945,
|
||
|
|
"grad_norm": 0.871235728263855,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2707,
|
||
|
|
"mean_token_accuracy": 0.6453557014465332,
|
||
|
|
"num_tokens": 90671131.0,
|
||
|
|
"step": 219
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.38596491228070173,
|
||
|
|
"grad_norm": 0.7807226181030273,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2607,
|
||
|
|
"mean_token_accuracy": 0.6465795040130615,
|
||
|
|
"num_tokens": 91092593.0,
|
||
|
|
"step": 220
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.387719298245614,
|
||
|
|
"grad_norm": 0.9600160121917725,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2791,
|
||
|
|
"mean_token_accuracy": 0.6422662734985352,
|
||
|
|
"num_tokens": 91501248.0,
|
||
|
|
"step": 221
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3894736842105263,
|
||
|
|
"grad_norm": 0.8549517393112183,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.243,
|
||
|
|
"mean_token_accuracy": 0.6511504650115967,
|
||
|
|
"num_tokens": 91913007.0,
|
||
|
|
"step": 222
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3912280701754386,
|
||
|
|
"grad_norm": 0.7951960563659668,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2377,
|
||
|
|
"mean_token_accuracy": 0.6538807153701782,
|
||
|
|
"num_tokens": 92321188.0,
|
||
|
|
"step": 223
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.3929824561403509,
|
||
|
|
"grad_norm": 0.8606045842170715,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2678,
|
||
|
|
"mean_token_accuracy": 0.645931601524353,
|
||
|
|
"num_tokens": 92726934.0,
|
||
|
|
"step": 224
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.39473684210526316,
|
||
|
|
"grad_norm": 0.7008436322212219,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2384,
|
||
|
|
"mean_token_accuracy": 0.6525442600250244,
|
||
|
|
"num_tokens": 93128966.0,
|
||
|
|
"step": 225
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.39649122807017545,
|
||
|
|
"grad_norm": 0.7526488304138184,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2738,
|
||
|
|
"mean_token_accuracy": 0.6442047357559204,
|
||
|
|
"num_tokens": 93567917.0,
|
||
|
|
"step": 226
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.39824561403508774,
|
||
|
|
"grad_norm": 0.8679794669151306,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2548,
|
||
|
|
"mean_token_accuracy": 0.6482324600219727,
|
||
|
|
"num_tokens": 93979268.0,
|
||
|
|
"step": 227
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4,
|
||
|
|
"grad_norm": 0.8233749270439148,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2215,
|
||
|
|
"mean_token_accuracy": 0.6573722958564758,
|
||
|
|
"num_tokens": 94367195.0,
|
||
|
|
"step": 228
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4017543859649123,
|
||
|
|
"grad_norm": 0.7261408567428589,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2415,
|
||
|
|
"mean_token_accuracy": 0.6515704989433289,
|
||
|
|
"num_tokens": 94759639.0,
|
||
|
|
"step": 229
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.40350877192982454,
|
||
|
|
"grad_norm": 0.7959755659103394,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2722,
|
||
|
|
"mean_token_accuracy": 0.6438157558441162,
|
||
|
|
"num_tokens": 95191668.0,
|
||
|
|
"step": 230
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4052631578947368,
|
||
|
|
"grad_norm": 0.8794543147087097,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2477,
|
||
|
|
"mean_token_accuracy": 0.6511826515197754,
|
||
|
|
"num_tokens": 95614874.0,
|
||
|
|
"step": 231
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4070175438596491,
|
||
|
|
"grad_norm": 0.7663288116455078,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2175,
|
||
|
|
"mean_token_accuracy": 0.656987190246582,
|
||
|
|
"num_tokens": 95990695.0,
|
||
|
|
"step": 232
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4087719298245614,
|
||
|
|
"grad_norm": 0.7509688138961792,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2395,
|
||
|
|
"mean_token_accuracy": 0.6518152952194214,
|
||
|
|
"num_tokens": 96377947.0,
|
||
|
|
"step": 233
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4105263157894737,
|
||
|
|
"grad_norm": 0.9182112812995911,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2567,
|
||
|
|
"mean_token_accuracy": 0.6489981412887573,
|
||
|
|
"num_tokens": 96787894.0,
|
||
|
|
"step": 234
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.41228070175438597,
|
||
|
|
"grad_norm": 0.8123442530632019,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2541,
|
||
|
|
"mean_token_accuracy": 0.6488667726516724,
|
||
|
|
"num_tokens": 97203762.0,
|
||
|
|
"step": 235
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.41403508771929826,
|
||
|
|
"grad_norm": 0.8581697344779968,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2595,
|
||
|
|
"mean_token_accuracy": 0.6481375694274902,
|
||
|
|
"num_tokens": 97598494.0,
|
||
|
|
"step": 236
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.41578947368421054,
|
||
|
|
"grad_norm": 0.8051207065582275,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2763,
|
||
|
|
"mean_token_accuracy": 0.644673228263855,
|
||
|
|
"num_tokens": 98011465.0,
|
||
|
|
"step": 237
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.41754385964912283,
|
||
|
|
"grad_norm": 0.7852127552032471,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2765,
|
||
|
|
"mean_token_accuracy": 0.6438398361206055,
|
||
|
|
"num_tokens": 98447067.0,
|
||
|
|
"step": 238
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4192982456140351,
|
||
|
|
"grad_norm": 0.7962046265602112,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2429,
|
||
|
|
"mean_token_accuracy": 0.6508150100708008,
|
||
|
|
"num_tokens": 98854782.0,
|
||
|
|
"step": 239
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.42105263157894735,
|
||
|
|
"grad_norm": 0.8521065711975098,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2499,
|
||
|
|
"mean_token_accuracy": 0.64998859167099,
|
||
|
|
"num_tokens": 99276423.0,
|
||
|
|
"step": 240
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.42280701754385963,
|
||
|
|
"grad_norm": 0.8006791472434998,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2523,
|
||
|
|
"mean_token_accuracy": 0.65036940574646,
|
||
|
|
"num_tokens": 99705527.0,
|
||
|
|
"step": 241
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4245614035087719,
|
||
|
|
"grad_norm": 0.6923927664756775,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2698,
|
||
|
|
"mean_token_accuracy": 0.645989179611206,
|
||
|
|
"num_tokens": 100144851.0,
|
||
|
|
"step": 242
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4263157894736842,
|
||
|
|
"grad_norm": 0.8310588002204895,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2131,
|
||
|
|
"mean_token_accuracy": 0.6578903198242188,
|
||
|
|
"num_tokens": 100546756.0,
|
||
|
|
"step": 243
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4280701754385965,
|
||
|
|
"grad_norm": 0.7767439484596252,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2374,
|
||
|
|
"mean_token_accuracy": 0.6534087061882019,
|
||
|
|
"num_tokens": 100961732.0,
|
||
|
|
"step": 244
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4298245614035088,
|
||
|
|
"grad_norm": 0.7211782932281494,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2627,
|
||
|
|
"mean_token_accuracy": 0.6475375294685364,
|
||
|
|
"num_tokens": 101396866.0,
|
||
|
|
"step": 245
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.43157894736842106,
|
||
|
|
"grad_norm": 0.754098117351532,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2535,
|
||
|
|
"mean_token_accuracy": 0.6487017869949341,
|
||
|
|
"num_tokens": 101823395.0,
|
||
|
|
"step": 246
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.43333333333333335,
|
||
|
|
"grad_norm": 0.887698233127594,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2372,
|
||
|
|
"mean_token_accuracy": 0.6514610052108765,
|
||
|
|
"num_tokens": 102218896.0,
|
||
|
|
"step": 247
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.43508771929824563,
|
||
|
|
"grad_norm": 0.6688896417617798,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2135,
|
||
|
|
"mean_token_accuracy": 0.6570154428482056,
|
||
|
|
"num_tokens": 102635943.0,
|
||
|
|
"step": 248
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4368421052631579,
|
||
|
|
"grad_norm": 0.6720183491706848,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2314,
|
||
|
|
"mean_token_accuracy": 0.653835654258728,
|
||
|
|
"num_tokens": 103060426.0,
|
||
|
|
"step": 249
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.43859649122807015,
|
||
|
|
"grad_norm": 0.6985954642295837,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2302,
|
||
|
|
"mean_token_accuracy": 0.6543055176734924,
|
||
|
|
"num_tokens": 103480891.0,
|
||
|
|
"step": 250
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.44035087719298244,
|
||
|
|
"grad_norm": 0.7861040234565735,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2259,
|
||
|
|
"mean_token_accuracy": 0.6543919444084167,
|
||
|
|
"num_tokens": 103896368.0,
|
||
|
|
"step": 251
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4421052631578947,
|
||
|
|
"grad_norm": 0.7467155456542969,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2425,
|
||
|
|
"mean_token_accuracy": 0.6521680355072021,
|
||
|
|
"num_tokens": 104318424.0,
|
||
|
|
"step": 252
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.443859649122807,
|
||
|
|
"grad_norm": 0.689565896987915,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2384,
|
||
|
|
"mean_token_accuracy": 0.6529023051261902,
|
||
|
|
"num_tokens": 104743917.0,
|
||
|
|
"step": 253
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4456140350877193,
|
||
|
|
"grad_norm": 0.8311668634414673,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2286,
|
||
|
|
"mean_token_accuracy": 0.6543080806732178,
|
||
|
|
"num_tokens": 105146836.0,
|
||
|
|
"step": 254
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4473684210526316,
|
||
|
|
"grad_norm": 0.8047776818275452,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2555,
|
||
|
|
"mean_token_accuracy": 0.6480646133422852,
|
||
|
|
"num_tokens": 105552302.0,
|
||
|
|
"step": 255
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.44912280701754387,
|
||
|
|
"grad_norm": 0.6903892159461975,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2878,
|
||
|
|
"mean_token_accuracy": 0.6401119232177734,
|
||
|
|
"num_tokens": 105963414.0,
|
||
|
|
"step": 256
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.45087719298245615,
|
||
|
|
"grad_norm": 0.7000618577003479,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2557,
|
||
|
|
"mean_token_accuracy": 0.6474447846412659,
|
||
|
|
"num_tokens": 106368736.0,
|
||
|
|
"step": 257
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.45263157894736844,
|
||
|
|
"grad_norm": 0.7351795434951782,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2237,
|
||
|
|
"mean_token_accuracy": 0.6556580662727356,
|
||
|
|
"num_tokens": 106769771.0,
|
||
|
|
"step": 258
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4543859649122807,
|
||
|
|
"grad_norm": 0.7257981300354004,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2416,
|
||
|
|
"mean_token_accuracy": 0.6521273255348206,
|
||
|
|
"num_tokens": 107170029.0,
|
||
|
|
"step": 259
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.45614035087719296,
|
||
|
|
"grad_norm": 0.8522328734397888,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2301,
|
||
|
|
"mean_token_accuracy": 0.6529244184494019,
|
||
|
|
"num_tokens": 107576140.0,
|
||
|
|
"step": 260
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.45789473684210524,
|
||
|
|
"grad_norm": 0.6672490835189819,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2414,
|
||
|
|
"mean_token_accuracy": 0.6505205631256104,
|
||
|
|
"num_tokens": 108009067.0,
|
||
|
|
"step": 261
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.45964912280701753,
|
||
|
|
"grad_norm": 0.8998327255249023,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2365,
|
||
|
|
"mean_token_accuracy": 0.6523317694664001,
|
||
|
|
"num_tokens": 108434633.0,
|
||
|
|
"step": 262
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4614035087719298,
|
||
|
|
"grad_norm": 0.7883278727531433,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2511,
|
||
|
|
"mean_token_accuracy": 0.648801326751709,
|
||
|
|
"num_tokens": 108878862.0,
|
||
|
|
"step": 263
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4631578947368421,
|
||
|
|
"grad_norm": 0.9719793796539307,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2326,
|
||
|
|
"mean_token_accuracy": 0.6538045406341553,
|
||
|
|
"num_tokens": 109287222.0,
|
||
|
|
"step": 264
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4649122807017544,
|
||
|
|
"grad_norm": 0.6874752044677734,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2319,
|
||
|
|
"mean_token_accuracy": 0.6524173021316528,
|
||
|
|
"num_tokens": 109693157.0,
|
||
|
|
"step": 265
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4666666666666667,
|
||
|
|
"grad_norm": 0.8174811601638794,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2235,
|
||
|
|
"mean_token_accuracy": 0.6545587182044983,
|
||
|
|
"num_tokens": 110099662.0,
|
||
|
|
"step": 266
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.46842105263157896,
|
||
|
|
"grad_norm": 0.7676987051963806,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.252,
|
||
|
|
"mean_token_accuracy": 0.6494687795639038,
|
||
|
|
"num_tokens": 110511759.0,
|
||
|
|
"step": 267
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.47017543859649125,
|
||
|
|
"grad_norm": 0.7034929394721985,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2328,
|
||
|
|
"mean_token_accuracy": 0.653315544128418,
|
||
|
|
"num_tokens": 110927626.0,
|
||
|
|
"step": 268
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.47192982456140353,
|
||
|
|
"grad_norm": 0.6947440505027771,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2451,
|
||
|
|
"mean_token_accuracy": 0.6503375172615051,
|
||
|
|
"num_tokens": 111371879.0,
|
||
|
|
"step": 269
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.47368421052631576,
|
||
|
|
"grad_norm": 0.7659525871276855,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.251,
|
||
|
|
"mean_token_accuracy": 0.6494304537773132,
|
||
|
|
"num_tokens": 111784069.0,
|
||
|
|
"step": 270
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.47543859649122805,
|
||
|
|
"grad_norm": 0.7740342617034912,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2253,
|
||
|
|
"mean_token_accuracy": 0.6542062759399414,
|
||
|
|
"num_tokens": 112186870.0,
|
||
|
|
"step": 271
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.47719298245614034,
|
||
|
|
"grad_norm": 0.65045565366745,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2305,
|
||
|
|
"mean_token_accuracy": 0.6532946228981018,
|
||
|
|
"num_tokens": 112616585.0,
|
||
|
|
"step": 272
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4789473684210526,
|
||
|
|
"grad_norm": 0.7001651525497437,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2247,
|
||
|
|
"mean_token_accuracy": 0.654727041721344,
|
||
|
|
"num_tokens": 113009798.0,
|
||
|
|
"step": 273
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4807017543859649,
|
||
|
|
"grad_norm": 0.6165850162506104,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2174,
|
||
|
|
"mean_token_accuracy": 0.6552860736846924,
|
||
|
|
"num_tokens": 113410244.0,
|
||
|
|
"step": 274
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4824561403508772,
|
||
|
|
"grad_norm": 0.7424379587173462,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2379,
|
||
|
|
"mean_token_accuracy": 0.653006911277771,
|
||
|
|
"num_tokens": 113805215.0,
|
||
|
|
"step": 275
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4842105263157895,
|
||
|
|
"grad_norm": 0.7236623167991638,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2748,
|
||
|
|
"mean_token_accuracy": 0.6438848972320557,
|
||
|
|
"num_tokens": 114224914.0,
|
||
|
|
"step": 276
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.48596491228070177,
|
||
|
|
"grad_norm": 0.6665499210357666,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.205,
|
||
|
|
"mean_token_accuracy": 0.6590371131896973,
|
||
|
|
"num_tokens": 114606381.0,
|
||
|
|
"step": 277
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.48771929824561405,
|
||
|
|
"grad_norm": 0.6881427764892578,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2456,
|
||
|
|
"mean_token_accuracy": 0.6502711772918701,
|
||
|
|
"num_tokens": 115021603.0,
|
||
|
|
"step": 278
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.48947368421052634,
|
||
|
|
"grad_norm": 0.8498430848121643,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2443,
|
||
|
|
"mean_token_accuracy": 0.6494314670562744,
|
||
|
|
"num_tokens": 115427417.0,
|
||
|
|
"step": 279
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.49122807017543857,
|
||
|
|
"grad_norm": 0.724355936050415,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2574,
|
||
|
|
"mean_token_accuracy": 0.6479068398475647,
|
||
|
|
"num_tokens": 115868301.0,
|
||
|
|
"step": 280
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.49298245614035086,
|
||
|
|
"grad_norm": 0.6625252366065979,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2128,
|
||
|
|
"mean_token_accuracy": 0.6565937995910645,
|
||
|
|
"num_tokens": 116278418.0,
|
||
|
|
"step": 281
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.49473684210526314,
|
||
|
|
"grad_norm": 0.8329636454582214,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2419,
|
||
|
|
"mean_token_accuracy": 0.650545060634613,
|
||
|
|
"num_tokens": 116681770.0,
|
||
|
|
"step": 282
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4964912280701754,
|
||
|
|
"grad_norm": 0.8298386335372925,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2356,
|
||
|
|
"mean_token_accuracy": 0.6532111167907715,
|
||
|
|
"num_tokens": 117054940.0,
|
||
|
|
"step": 283
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.4982456140350877,
|
||
|
|
"grad_norm": 0.7011889219284058,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2418,
|
||
|
|
"mean_token_accuracy": 0.6518392562866211,
|
||
|
|
"num_tokens": 117477299.0,
|
||
|
|
"step": 284
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5,
|
||
|
|
"grad_norm": 0.710082471370697,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2479,
|
||
|
|
"mean_token_accuracy": 0.6494768857955933,
|
||
|
|
"num_tokens": 117885331.0,
|
||
|
|
"step": 285
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5017543859649123,
|
||
|
|
"grad_norm": 0.8371219038963318,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2577,
|
||
|
|
"mean_token_accuracy": 0.6471771001815796,
|
||
|
|
"num_tokens": 118307149.0,
|
||
|
|
"step": 286
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5035087719298246,
|
||
|
|
"grad_norm": 0.8411158919334412,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.219,
|
||
|
|
"mean_token_accuracy": 0.6554292440414429,
|
||
|
|
"num_tokens": 118697262.0,
|
||
|
|
"step": 287
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5052631578947369,
|
||
|
|
"grad_norm": 0.7115722298622131,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2501,
|
||
|
|
"mean_token_accuracy": 0.6482685804367065,
|
||
|
|
"num_tokens": 119124053.0,
|
||
|
|
"step": 288
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5070175438596491,
|
||
|
|
"grad_norm": 0.6575236916542053,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2555,
|
||
|
|
"mean_token_accuracy": 0.6482589840888977,
|
||
|
|
"num_tokens": 119569185.0,
|
||
|
|
"step": 289
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5087719298245614,
|
||
|
|
"grad_norm": 0.8516756892204285,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2332,
|
||
|
|
"mean_token_accuracy": 0.6534713506698608,
|
||
|
|
"num_tokens": 119987270.0,
|
||
|
|
"step": 290
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5105263157894737,
|
||
|
|
"grad_norm": 0.7346055507659912,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2347,
|
||
|
|
"mean_token_accuracy": 0.6515175700187683,
|
||
|
|
"num_tokens": 120377177.0,
|
||
|
|
"step": 291
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.512280701754386,
|
||
|
|
"grad_norm": 0.6637006402015686,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2241,
|
||
|
|
"mean_token_accuracy": 0.6554847359657288,
|
||
|
|
"num_tokens": 120794750.0,
|
||
|
|
"step": 292
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5140350877192983,
|
||
|
|
"grad_norm": 0.8050562143325806,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2462,
|
||
|
|
"mean_token_accuracy": 0.6506670713424683,
|
||
|
|
"num_tokens": 121223883.0,
|
||
|
|
"step": 293
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5157894736842106,
|
||
|
|
"grad_norm": 0.7059856057167053,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2287,
|
||
|
|
"mean_token_accuracy": 0.6533610820770264,
|
||
|
|
"num_tokens": 121644108.0,
|
||
|
|
"step": 294
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5175438596491229,
|
||
|
|
"grad_norm": 0.6939064860343933,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2357,
|
||
|
|
"mean_token_accuracy": 0.6511529684066772,
|
||
|
|
"num_tokens": 122051612.0,
|
||
|
|
"step": 295
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.519298245614035,
|
||
|
|
"grad_norm": 0.8220492601394653,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2257,
|
||
|
|
"mean_token_accuracy": 0.6545118093490601,
|
||
|
|
"num_tokens": 122430847.0,
|
||
|
|
"step": 296
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5210526315789473,
|
||
|
|
"grad_norm": 0.7044985294342041,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2309,
|
||
|
|
"mean_token_accuracy": 0.654273271560669,
|
||
|
|
"num_tokens": 122849234.0,
|
||
|
|
"step": 297
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5228070175438596,
|
||
|
|
"grad_norm": 0.8146756291389465,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2489,
|
||
|
|
"mean_token_accuracy": 0.6501311659812927,
|
||
|
|
"num_tokens": 123258290.0,
|
||
|
|
"step": 298
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5245614035087719,
|
||
|
|
"grad_norm": 0.766899824142456,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2436,
|
||
|
|
"mean_token_accuracy": 0.6506932973861694,
|
||
|
|
"num_tokens": 123663735.0,
|
||
|
|
"step": 299
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5263157894736842,
|
||
|
|
"grad_norm": 0.7193543910980225,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2451,
|
||
|
|
"mean_token_accuracy": 0.6491553783416748,
|
||
|
|
"num_tokens": 124058727.0,
|
||
|
|
"step": 300
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5280701754385965,
|
||
|
|
"grad_norm": 0.6504607200622559,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2455,
|
||
|
|
"mean_token_accuracy": 0.6509132385253906,
|
||
|
|
"num_tokens": 124484021.0,
|
||
|
|
"step": 301
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5298245614035088,
|
||
|
|
"grad_norm": 0.7661638259887695,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2158,
|
||
|
|
"mean_token_accuracy": 0.6565036177635193,
|
||
|
|
"num_tokens": 124902607.0,
|
||
|
|
"step": 302
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.531578947368421,
|
||
|
|
"grad_norm": 0.73735511302948,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2408,
|
||
|
|
"mean_token_accuracy": 0.6503796577453613,
|
||
|
|
"num_tokens": 125338175.0,
|
||
|
|
"step": 303
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5333333333333333,
|
||
|
|
"grad_norm": 0.9022007584571838,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2332,
|
||
|
|
"mean_token_accuracy": 0.6524848937988281,
|
||
|
|
"num_tokens": 125757431.0,
|
||
|
|
"step": 304
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5350877192982456,
|
||
|
|
"grad_norm": 0.6961904764175415,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2217,
|
||
|
|
"mean_token_accuracy": 0.6560453176498413,
|
||
|
|
"num_tokens": 126155529.0,
|
||
|
|
"step": 305
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5368421052631579,
|
||
|
|
"grad_norm": 0.6821785569190979,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.223,
|
||
|
|
"mean_token_accuracy": 0.6551496982574463,
|
||
|
|
"num_tokens": 126572012.0,
|
||
|
|
"step": 306
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5385964912280702,
|
||
|
|
"grad_norm": 0.8659482002258301,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2393,
|
||
|
|
"mean_token_accuracy": 0.6508756875991821,
|
||
|
|
"num_tokens": 126990573.0,
|
||
|
|
"step": 307
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5403508771929825,
|
||
|
|
"grad_norm": 0.6646002531051636,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2173,
|
||
|
|
"mean_token_accuracy": 0.6561950445175171,
|
||
|
|
"num_tokens": 127418976.0,
|
||
|
|
"step": 308
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5421052631578948,
|
||
|
|
"grad_norm": 0.6923218369483948,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1911,
|
||
|
|
"mean_token_accuracy": 0.6624599695205688,
|
||
|
|
"num_tokens": 127827614.0,
|
||
|
|
"step": 309
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.543859649122807,
|
||
|
|
"grad_norm": 0.6864442825317383,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2267,
|
||
|
|
"mean_token_accuracy": 0.6538540720939636,
|
||
|
|
"num_tokens": 128258313.0,
|
||
|
|
"step": 310
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5456140350877193,
|
||
|
|
"grad_norm": 0.7230309247970581,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2495,
|
||
|
|
"mean_token_accuracy": 0.6486865282058716,
|
||
|
|
"num_tokens": 128702682.0,
|
||
|
|
"step": 311
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5473684210526316,
|
||
|
|
"grad_norm": 0.6914284825325012,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2211,
|
||
|
|
"mean_token_accuracy": 0.6558956503868103,
|
||
|
|
"num_tokens": 129094449.0,
|
||
|
|
"step": 312
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5491228070175439,
|
||
|
|
"grad_norm": 0.6948025822639465,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2232,
|
||
|
|
"mean_token_accuracy": 0.6554611325263977,
|
||
|
|
"num_tokens": 129492795.0,
|
||
|
|
"step": 313
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5508771929824562,
|
||
|
|
"grad_norm": 0.6883065104484558,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.225,
|
||
|
|
"mean_token_accuracy": 0.6549999713897705,
|
||
|
|
"num_tokens": 129923670.0,
|
||
|
|
"step": 314
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5526315789473685,
|
||
|
|
"grad_norm": 0.7065843939781189,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.211,
|
||
|
|
"mean_token_accuracy": 0.6596200466156006,
|
||
|
|
"num_tokens": 130333012.0,
|
||
|
|
"step": 315
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5543859649122806,
|
||
|
|
"grad_norm": 0.8073469996452332,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2073,
|
||
|
|
"mean_token_accuracy": 0.6588984727859497,
|
||
|
|
"num_tokens": 130749305.0,
|
||
|
|
"step": 316
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5561403508771929,
|
||
|
|
"grad_norm": 0.8134505748748779,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2544,
|
||
|
|
"mean_token_accuracy": 0.6476566791534424,
|
||
|
|
"num_tokens": 131169421.0,
|
||
|
|
"step": 317
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5578947368421052,
|
||
|
|
"grad_norm": 0.6765173077583313,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2078,
|
||
|
|
"mean_token_accuracy": 0.6600826978683472,
|
||
|
|
"num_tokens": 131574788.0,
|
||
|
|
"step": 318
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5596491228070175,
|
||
|
|
"grad_norm": 0.7156663537025452,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2267,
|
||
|
|
"mean_token_accuracy": 0.6545342206954956,
|
||
|
|
"num_tokens": 131984459.0,
|
||
|
|
"step": 319
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5614035087719298,
|
||
|
|
"grad_norm": 0.8103324174880981,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2612,
|
||
|
|
"mean_token_accuracy": 0.6481289267539978,
|
||
|
|
"num_tokens": 132415165.0,
|
||
|
|
"step": 320
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5631578947368421,
|
||
|
|
"grad_norm": 0.742142915725708,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2039,
|
||
|
|
"mean_token_accuracy": 0.6596871018409729,
|
||
|
|
"num_tokens": 132838824.0,
|
||
|
|
"step": 321
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5649122807017544,
|
||
|
|
"grad_norm": 0.7613045573234558,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2095,
|
||
|
|
"mean_token_accuracy": 0.6572511792182922,
|
||
|
|
"num_tokens": 133247120.0,
|
||
|
|
"step": 322
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5666666666666667,
|
||
|
|
"grad_norm": 0.7817480564117432,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2232,
|
||
|
|
"mean_token_accuracy": 0.6545703411102295,
|
||
|
|
"num_tokens": 133672706.0,
|
||
|
|
"step": 323
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5684210526315789,
|
||
|
|
"grad_norm": 0.6124296188354492,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2342,
|
||
|
|
"mean_token_accuracy": 0.6552791595458984,
|
||
|
|
"num_tokens": 134079103.0,
|
||
|
|
"step": 324
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5701754385964912,
|
||
|
|
"grad_norm": 0.6886869668960571,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2231,
|
||
|
|
"mean_token_accuracy": 0.6547764539718628,
|
||
|
|
"num_tokens": 134494826.0,
|
||
|
|
"step": 325
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5719298245614035,
|
||
|
|
"grad_norm": 0.6630454659461975,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2345,
|
||
|
|
"mean_token_accuracy": 0.6520872116088867,
|
||
|
|
"num_tokens": 134919504.0,
|
||
|
|
"step": 326
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5736842105263158,
|
||
|
|
"grad_norm": 0.8173869252204895,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2276,
|
||
|
|
"mean_token_accuracy": 0.6533281207084656,
|
||
|
|
"num_tokens": 135331962.0,
|
||
|
|
"step": 327
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5754385964912281,
|
||
|
|
"grad_norm": 0.6743276715278625,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2281,
|
||
|
|
"mean_token_accuracy": 0.6535520553588867,
|
||
|
|
"num_tokens": 135745710.0,
|
||
|
|
"step": 328
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5771929824561404,
|
||
|
|
"grad_norm": 0.6731691360473633,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2346,
|
||
|
|
"mean_token_accuracy": 0.6528003215789795,
|
||
|
|
"num_tokens": 136174418.0,
|
||
|
|
"step": 329
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5789473684210527,
|
||
|
|
"grad_norm": 0.6211588382720947,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2329,
|
||
|
|
"mean_token_accuracy": 0.653133749961853,
|
||
|
|
"num_tokens": 136597949.0,
|
||
|
|
"step": 330
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5807017543859649,
|
||
|
|
"grad_norm": 0.8585658073425293,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2506,
|
||
|
|
"mean_token_accuracy": 0.648890495300293,
|
||
|
|
"num_tokens": 137047696.0,
|
||
|
|
"step": 331
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5824561403508772,
|
||
|
|
"grad_norm": 0.8006256222724915,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2119,
|
||
|
|
"mean_token_accuracy": 0.658997118473053,
|
||
|
|
"num_tokens": 137428701.0,
|
||
|
|
"step": 332
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5842105263157895,
|
||
|
|
"grad_norm": 0.692973792552948,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2167,
|
||
|
|
"mean_token_accuracy": 0.6570533514022827,
|
||
|
|
"num_tokens": 137823083.0,
|
||
|
|
"step": 333
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5859649122807018,
|
||
|
|
"grad_norm": 0.7685320973396301,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.271,
|
||
|
|
"mean_token_accuracy": 0.6423413157463074,
|
||
|
|
"num_tokens": 138250591.0,
|
||
|
|
"step": 334
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5877192982456141,
|
||
|
|
"grad_norm": 0.7700155377388,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2252,
|
||
|
|
"mean_token_accuracy": 0.6543079614639282,
|
||
|
|
"num_tokens": 138660562.0,
|
||
|
|
"step": 335
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5894736842105263,
|
||
|
|
"grad_norm": 0.7410191893577576,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2156,
|
||
|
|
"mean_token_accuracy": 0.6561688184738159,
|
||
|
|
"num_tokens": 139077597.0,
|
||
|
|
"step": 336
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5912280701754385,
|
||
|
|
"grad_norm": 0.7632637619972229,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.235,
|
||
|
|
"mean_token_accuracy": 0.6516165733337402,
|
||
|
|
"num_tokens": 139482698.0,
|
||
|
|
"step": 337
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5929824561403508,
|
||
|
|
"grad_norm": 0.690731942653656,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2408,
|
||
|
|
"mean_token_accuracy": 0.6502476930618286,
|
||
|
|
"num_tokens": 139906177.0,
|
||
|
|
"step": 338
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5947368421052631,
|
||
|
|
"grad_norm": 0.6513046026229858,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2297,
|
||
|
|
"mean_token_accuracy": 0.6529406905174255,
|
||
|
|
"num_tokens": 140319741.0,
|
||
|
|
"step": 339
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5964912280701754,
|
||
|
|
"grad_norm": 0.6879235506057739,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2234,
|
||
|
|
"mean_token_accuracy": 0.6540219783782959,
|
||
|
|
"num_tokens": 140731809.0,
|
||
|
|
"step": 340
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.5982456140350877,
|
||
|
|
"grad_norm": 0.7240639925003052,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2043,
|
||
|
|
"mean_token_accuracy": 0.6592921018600464,
|
||
|
|
"num_tokens": 141126659.0,
|
||
|
|
"step": 341
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6,
|
||
|
|
"grad_norm": 0.6559076905250549,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2353,
|
||
|
|
"mean_token_accuracy": 0.6514889001846313,
|
||
|
|
"num_tokens": 141551619.0,
|
||
|
|
"step": 342
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6017543859649123,
|
||
|
|
"grad_norm": 0.7054679989814758,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2225,
|
||
|
|
"mean_token_accuracy": 0.6558979749679565,
|
||
|
|
"num_tokens": 141970744.0,
|
||
|
|
"step": 343
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6035087719298246,
|
||
|
|
"grad_norm": 0.6867666244506836,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2162,
|
||
|
|
"mean_token_accuracy": 0.6566264629364014,
|
||
|
|
"num_tokens": 142395401.0,
|
||
|
|
"step": 344
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6052631578947368,
|
||
|
|
"grad_norm": 0.6507348418235779,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2244,
|
||
|
|
"mean_token_accuracy": 0.6545271277427673,
|
||
|
|
"num_tokens": 142827910.0,
|
||
|
|
"step": 345
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6070175438596491,
|
||
|
|
"grad_norm": 0.7520820498466492,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2315,
|
||
|
|
"mean_token_accuracy": 0.6522303819656372,
|
||
|
|
"num_tokens": 143256693.0,
|
||
|
|
"step": 346
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6087719298245614,
|
||
|
|
"grad_norm": 0.7250421047210693,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2126,
|
||
|
|
"mean_token_accuracy": 0.6569870710372925,
|
||
|
|
"num_tokens": 143670113.0,
|
||
|
|
"step": 347
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6105263157894737,
|
||
|
|
"grad_norm": 0.707240104675293,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2337,
|
||
|
|
"mean_token_accuracy": 0.6521450281143188,
|
||
|
|
"num_tokens": 144085012.0,
|
||
|
|
"step": 348
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.612280701754386,
|
||
|
|
"grad_norm": 0.6530799269676208,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2366,
|
||
|
|
"mean_token_accuracy": 0.650078296661377,
|
||
|
|
"num_tokens": 144511304.0,
|
||
|
|
"step": 349
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6140350877192983,
|
||
|
|
"grad_norm": 0.7164869904518127,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1957,
|
||
|
|
"mean_token_accuracy": 0.6614063382148743,
|
||
|
|
"num_tokens": 144903389.0,
|
||
|
|
"step": 350
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6157894736842106,
|
||
|
|
"grad_norm": 0.6941936612129211,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2244,
|
||
|
|
"mean_token_accuracy": 0.6538809537887573,
|
||
|
|
"num_tokens": 145317796.0,
|
||
|
|
"step": 351
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6175438596491228,
|
||
|
|
"grad_norm": 0.5569853186607361,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2303,
|
||
|
|
"mean_token_accuracy": 0.6540141701698303,
|
||
|
|
"num_tokens": 145728692.0,
|
||
|
|
"step": 352
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6192982456140351,
|
||
|
|
"grad_norm": 0.6453179121017456,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2219,
|
||
|
|
"mean_token_accuracy": 0.6558411121368408,
|
||
|
|
"num_tokens": 146149428.0,
|
||
|
|
"step": 353
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6210526315789474,
|
||
|
|
"grad_norm": 0.7571195363998413,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2371,
|
||
|
|
"mean_token_accuracy": 0.6512084007263184,
|
||
|
|
"num_tokens": 146567803.0,
|
||
|
|
"step": 354
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6228070175438597,
|
||
|
|
"grad_norm": 0.7026142477989197,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2577,
|
||
|
|
"mean_token_accuracy": 0.6465089917182922,
|
||
|
|
"num_tokens": 146996287.0,
|
||
|
|
"step": 355
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.624561403508772,
|
||
|
|
"grad_norm": 0.7396862506866455,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.207,
|
||
|
|
"mean_token_accuracy": 0.657769501209259,
|
||
|
|
"num_tokens": 147406032.0,
|
||
|
|
"step": 356
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6263157894736842,
|
||
|
|
"grad_norm": 0.7301826477050781,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2549,
|
||
|
|
"mean_token_accuracy": 0.648101270198822,
|
||
|
|
"num_tokens": 147818711.0,
|
||
|
|
"step": 357
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6280701754385964,
|
||
|
|
"grad_norm": 0.6443963646888733,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2247,
|
||
|
|
"mean_token_accuracy": 0.6540451645851135,
|
||
|
|
"num_tokens": 148224782.0,
|
||
|
|
"step": 358
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6298245614035087,
|
||
|
|
"grad_norm": 0.5962257981300354,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2095,
|
||
|
|
"mean_token_accuracy": 0.6578296422958374,
|
||
|
|
"num_tokens": 148638041.0,
|
||
|
|
"step": 359
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.631578947368421,
|
||
|
|
"grad_norm": 0.553277850151062,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1954,
|
||
|
|
"mean_token_accuracy": 0.6618661880493164,
|
||
|
|
"num_tokens": 149034209.0,
|
||
|
|
"step": 360
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6333333333333333,
|
||
|
|
"grad_norm": 0.8141903281211853,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2181,
|
||
|
|
"mean_token_accuracy": 0.6552863121032715,
|
||
|
|
"num_tokens": 149462197.0,
|
||
|
|
"step": 361
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6350877192982456,
|
||
|
|
"grad_norm": 0.6312337517738342,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2237,
|
||
|
|
"mean_token_accuracy": 0.6552015542984009,
|
||
|
|
"num_tokens": 149874309.0,
|
||
|
|
"step": 362
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6368421052631579,
|
||
|
|
"grad_norm": 0.6863110661506653,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2144,
|
||
|
|
"mean_token_accuracy": 0.6574706435203552,
|
||
|
|
"num_tokens": 150278354.0,
|
||
|
|
"step": 363
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6385964912280702,
|
||
|
|
"grad_norm": 0.7062144875526428,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2309,
|
||
|
|
"mean_token_accuracy": 0.6519078016281128,
|
||
|
|
"num_tokens": 150696393.0,
|
||
|
|
"step": 364
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6403508771929824,
|
||
|
|
"grad_norm": 0.6141137480735779,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2175,
|
||
|
|
"mean_token_accuracy": 0.6562414169311523,
|
||
|
|
"num_tokens": 151097966.0,
|
||
|
|
"step": 365
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6421052631578947,
|
||
|
|
"grad_norm": 0.6939074993133545,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2115,
|
||
|
|
"mean_token_accuracy": 0.6587799787521362,
|
||
|
|
"num_tokens": 151487261.0,
|
||
|
|
"step": 366
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.643859649122807,
|
||
|
|
"grad_norm": 0.6834867596626282,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2492,
|
||
|
|
"mean_token_accuracy": 0.6492800116539001,
|
||
|
|
"num_tokens": 151915716.0,
|
||
|
|
"step": 367
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6456140350877193,
|
||
|
|
"grad_norm": 0.6845062971115112,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2045,
|
||
|
|
"mean_token_accuracy": 0.6578772664070129,
|
||
|
|
"num_tokens": 152333716.0,
|
||
|
|
"step": 368
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6473684210526316,
|
||
|
|
"grad_norm": 0.6263954639434814,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2285,
|
||
|
|
"mean_token_accuracy": 0.6524069905281067,
|
||
|
|
"num_tokens": 152758004.0,
|
||
|
|
"step": 369
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6491228070175439,
|
||
|
|
"grad_norm": 0.7604780793190002,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2321,
|
||
|
|
"mean_token_accuracy": 0.6509186029434204,
|
||
|
|
"num_tokens": 153175726.0,
|
||
|
|
"step": 370
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6508771929824562,
|
||
|
|
"grad_norm": 0.6607220768928528,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.193,
|
||
|
|
"mean_token_accuracy": 0.6612952947616577,
|
||
|
|
"num_tokens": 153573526.0,
|
||
|
|
"step": 371
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6526315789473685,
|
||
|
|
"grad_norm": 0.7317623496055603,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2233,
|
||
|
|
"mean_token_accuracy": 0.6555420756340027,
|
||
|
|
"num_tokens": 154001303.0,
|
||
|
|
"step": 372
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6543859649122807,
|
||
|
|
"grad_norm": 0.5643908381462097,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1888,
|
||
|
|
"mean_token_accuracy": 0.6617960929870605,
|
||
|
|
"num_tokens": 154409874.0,
|
||
|
|
"step": 373
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.656140350877193,
|
||
|
|
"grad_norm": 0.631582498550415,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2192,
|
||
|
|
"mean_token_accuracy": 0.6558030843734741,
|
||
|
|
"num_tokens": 154826554.0,
|
||
|
|
"step": 374
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6578947368421053,
|
||
|
|
"grad_norm": 0.745689332485199,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2146,
|
||
|
|
"mean_token_accuracy": 0.6560918688774109,
|
||
|
|
"num_tokens": 155230585.0,
|
||
|
|
"step": 375
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6596491228070176,
|
||
|
|
"grad_norm": 0.651474118232727,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2055,
|
||
|
|
"mean_token_accuracy": 0.6597638726234436,
|
||
|
|
"num_tokens": 155636974.0,
|
||
|
|
"step": 376
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6614035087719298,
|
||
|
|
"grad_norm": 0.7227398753166199,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2211,
|
||
|
|
"mean_token_accuracy": 0.6551980972290039,
|
||
|
|
"num_tokens": 156063068.0,
|
||
|
|
"step": 377
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6631578947368421,
|
||
|
|
"grad_norm": 0.6124153137207031,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2435,
|
||
|
|
"mean_token_accuracy": 0.6511872410774231,
|
||
|
|
"num_tokens": 156510643.0,
|
||
|
|
"step": 378
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6649122807017543,
|
||
|
|
"grad_norm": 0.7193928360939026,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2242,
|
||
|
|
"mean_token_accuracy": 0.6536823511123657,
|
||
|
|
"num_tokens": 156947423.0,
|
||
|
|
"step": 379
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6666666666666666,
|
||
|
|
"grad_norm": 0.7923741936683655,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2149,
|
||
|
|
"mean_token_accuracy": 0.6561374664306641,
|
||
|
|
"num_tokens": 157370426.0,
|
||
|
|
"step": 380
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6684210526315789,
|
||
|
|
"grad_norm": 0.7290387153625488,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2132,
|
||
|
|
"mean_token_accuracy": 0.6568667888641357,
|
||
|
|
"num_tokens": 157782963.0,
|
||
|
|
"step": 381
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6701754385964912,
|
||
|
|
"grad_norm": 0.6192464232444763,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2309,
|
||
|
|
"mean_token_accuracy": 0.6526235342025757,
|
||
|
|
"num_tokens": 158180387.0,
|
||
|
|
"step": 382
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6719298245614035,
|
||
|
|
"grad_norm": 0.7137374877929688,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2328,
|
||
|
|
"mean_token_accuracy": 0.6518644094467163,
|
||
|
|
"num_tokens": 158582401.0,
|
||
|
|
"step": 383
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6736842105263158,
|
||
|
|
"grad_norm": 0.7550848126411438,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2166,
|
||
|
|
"mean_token_accuracy": 0.6573696136474609,
|
||
|
|
"num_tokens": 158991919.0,
|
||
|
|
"step": 384
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6754385964912281,
|
||
|
|
"grad_norm": 0.6890254020690918,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2107,
|
||
|
|
"mean_token_accuracy": 0.6569054126739502,
|
||
|
|
"num_tokens": 159393180.0,
|
||
|
|
"step": 385
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6771929824561403,
|
||
|
|
"grad_norm": 0.7258317470550537,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2151,
|
||
|
|
"mean_token_accuracy": 0.6549711227416992,
|
||
|
|
"num_tokens": 159800687.0,
|
||
|
|
"step": 386
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6789473684210526,
|
||
|
|
"grad_norm": 0.7973881363868713,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1913,
|
||
|
|
"mean_token_accuracy": 0.6616858839988708,
|
||
|
|
"num_tokens": 160206677.0,
|
||
|
|
"step": 387
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6807017543859649,
|
||
|
|
"grad_norm": 0.6781461238861084,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2296,
|
||
|
|
"mean_token_accuracy": 0.6531736850738525,
|
||
|
|
"num_tokens": 160623973.0,
|
||
|
|
"step": 388
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6824561403508772,
|
||
|
|
"grad_norm": 0.8034713268280029,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2306,
|
||
|
|
"mean_token_accuracy": 0.6528879404067993,
|
||
|
|
"num_tokens": 161033575.0,
|
||
|
|
"step": 389
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6842105263157895,
|
||
|
|
"grad_norm": 0.7085846066474915,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1892,
|
||
|
|
"mean_token_accuracy": 0.6614021062850952,
|
||
|
|
"num_tokens": 161420334.0,
|
||
|
|
"step": 390
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6859649122807018,
|
||
|
|
"grad_norm": 0.712842583656311,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2096,
|
||
|
|
"mean_token_accuracy": 0.6570684909820557,
|
||
|
|
"num_tokens": 161823214.0,
|
||
|
|
"step": 391
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6877192982456141,
|
||
|
|
"grad_norm": 0.6031337380409241,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2131,
|
||
|
|
"mean_token_accuracy": 0.6579930782318115,
|
||
|
|
"num_tokens": 162253222.0,
|
||
|
|
"step": 392
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6894736842105263,
|
||
|
|
"grad_norm": 0.6571363806724548,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2151,
|
||
|
|
"mean_token_accuracy": 0.6550261974334717,
|
||
|
|
"num_tokens": 162673396.0,
|
||
|
|
"step": 393
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6912280701754386,
|
||
|
|
"grad_norm": 0.590053915977478,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1913,
|
||
|
|
"mean_token_accuracy": 0.6606940031051636,
|
||
|
|
"num_tokens": 163095701.0,
|
||
|
|
"step": 394
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6929824561403509,
|
||
|
|
"grad_norm": 0.660569429397583,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2168,
|
||
|
|
"mean_token_accuracy": 0.6552713513374329,
|
||
|
|
"num_tokens": 163503487.0,
|
||
|
|
"step": 395
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6947368421052632,
|
||
|
|
"grad_norm": 0.5482744574546814,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1966,
|
||
|
|
"mean_token_accuracy": 0.6622109413146973,
|
||
|
|
"num_tokens": 163908638.0,
|
||
|
|
"step": 396
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6964912280701754,
|
||
|
|
"grad_norm": 0.6649277210235596,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2082,
|
||
|
|
"mean_token_accuracy": 0.6560900211334229,
|
||
|
|
"num_tokens": 164321664.0,
|
||
|
|
"step": 397
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.6982456140350877,
|
||
|
|
"grad_norm": 0.6546705365180969,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.208,
|
||
|
|
"mean_token_accuracy": 0.6577179431915283,
|
||
|
|
"num_tokens": 164739198.0,
|
||
|
|
"step": 398
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7,
|
||
|
|
"grad_norm": 0.6374883651733398,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1893,
|
||
|
|
"mean_token_accuracy": 0.660727322101593,
|
||
|
|
"num_tokens": 165130707.0,
|
||
|
|
"step": 399
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7017543859649122,
|
||
|
|
"grad_norm": 0.6626867055892944,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2056,
|
||
|
|
"mean_token_accuracy": 0.6570228934288025,
|
||
|
|
"num_tokens": 165544679.0,
|
||
|
|
"step": 400
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7035087719298245,
|
||
|
|
"grad_norm": 0.648720920085907,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1889,
|
||
|
|
"mean_token_accuracy": 0.6612677574157715,
|
||
|
|
"num_tokens": 165963969.0,
|
||
|
|
"step": 401
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7052631578947368,
|
||
|
|
"grad_norm": 0.6660583019256592,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.201,
|
||
|
|
"mean_token_accuracy": 0.6595137119293213,
|
||
|
|
"num_tokens": 166342479.0,
|
||
|
|
"step": 402
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7070175438596491,
|
||
|
|
"grad_norm": 0.6676925420761108,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2178,
|
||
|
|
"mean_token_accuracy": 0.655318558216095,
|
||
|
|
"num_tokens": 166746524.0,
|
||
|
|
"step": 403
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7087719298245614,
|
||
|
|
"grad_norm": 0.6398362517356873,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2217,
|
||
|
|
"mean_token_accuracy": 0.6538881063461304,
|
||
|
|
"num_tokens": 167166144.0,
|
||
|
|
"step": 404
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7105263157894737,
|
||
|
|
"grad_norm": 0.6486631035804749,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2107,
|
||
|
|
"mean_token_accuracy": 0.6568524837493896,
|
||
|
|
"num_tokens": 167576053.0,
|
||
|
|
"step": 405
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.712280701754386,
|
||
|
|
"grad_norm": 0.6971449851989746,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2072,
|
||
|
|
"mean_token_accuracy": 0.6588019132614136,
|
||
|
|
"num_tokens": 168005716.0,
|
||
|
|
"step": 406
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7140350877192982,
|
||
|
|
"grad_norm": 0.5594667792320251,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1815,
|
||
|
|
"mean_token_accuracy": 0.6640152931213379,
|
||
|
|
"num_tokens": 168425061.0,
|
||
|
|
"step": 407
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7157894736842105,
|
||
|
|
"grad_norm": 0.6978932619094849,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2123,
|
||
|
|
"mean_token_accuracy": 0.6560491323471069,
|
||
|
|
"num_tokens": 168832540.0,
|
||
|
|
"step": 408
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7175438596491228,
|
||
|
|
"grad_norm": 0.577872097492218,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1961,
|
||
|
|
"mean_token_accuracy": 0.6605911254882812,
|
||
|
|
"num_tokens": 169243355.0,
|
||
|
|
"step": 409
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7192982456140351,
|
||
|
|
"grad_norm": 0.6972746849060059,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2263,
|
||
|
|
"mean_token_accuracy": 0.6549021005630493,
|
||
|
|
"num_tokens": 169678758.0,
|
||
|
|
"step": 410
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7210526315789474,
|
||
|
|
"grad_norm": 0.6528338193893433,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2193,
|
||
|
|
"mean_token_accuracy": 0.6543501615524292,
|
||
|
|
"num_tokens": 170107843.0,
|
||
|
|
"step": 411
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7228070175438597,
|
||
|
|
"grad_norm": 0.6352643370628357,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.21,
|
||
|
|
"mean_token_accuracy": 0.6568456292152405,
|
||
|
|
"num_tokens": 170512414.0,
|
||
|
|
"step": 412
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.724561403508772,
|
||
|
|
"grad_norm": 0.6633725762367249,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2273,
|
||
|
|
"mean_token_accuracy": 0.6531171798706055,
|
||
|
|
"num_tokens": 170927891.0,
|
||
|
|
"step": 413
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7263157894736842,
|
||
|
|
"grad_norm": 0.7003793716430664,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2471,
|
||
|
|
"mean_token_accuracy": 0.6485984921455383,
|
||
|
|
"num_tokens": 171347602.0,
|
||
|
|
"step": 414
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7280701754385965,
|
||
|
|
"grad_norm": 0.6166436076164246,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1822,
|
||
|
|
"mean_token_accuracy": 0.664222240447998,
|
||
|
|
"num_tokens": 171764325.0,
|
||
|
|
"step": 415
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7298245614035088,
|
||
|
|
"grad_norm": 0.6370410323143005,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2288,
|
||
|
|
"mean_token_accuracy": 0.6530359387397766,
|
||
|
|
"num_tokens": 172161316.0,
|
||
|
|
"step": 416
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7315789473684211,
|
||
|
|
"grad_norm": 0.5680028200149536,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.188,
|
||
|
|
"mean_token_accuracy": 0.663171112537384,
|
||
|
|
"num_tokens": 172557979.0,
|
||
|
|
"step": 417
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7333333333333333,
|
||
|
|
"grad_norm": 0.6317917704582214,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2088,
|
||
|
|
"mean_token_accuracy": 0.6587448120117188,
|
||
|
|
"num_tokens": 172977293.0,
|
||
|
|
"step": 418
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7350877192982456,
|
||
|
|
"grad_norm": 0.6629990935325623,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.206,
|
||
|
|
"mean_token_accuracy": 0.657719612121582,
|
||
|
|
"num_tokens": 173386905.0,
|
||
|
|
"step": 419
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7368421052631579,
|
||
|
|
"grad_norm": 0.7318717241287231,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1874,
|
||
|
|
"mean_token_accuracy": 0.662236750125885,
|
||
|
|
"num_tokens": 173790039.0,
|
||
|
|
"step": 420
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7385964912280701,
|
||
|
|
"grad_norm": 0.5909295678138733,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1857,
|
||
|
|
"mean_token_accuracy": 0.66287761926651,
|
||
|
|
"num_tokens": 174200214.0,
|
||
|
|
"step": 421
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7403508771929824,
|
||
|
|
"grad_norm": 0.7244629859924316,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.198,
|
||
|
|
"mean_token_accuracy": 0.6586729288101196,
|
||
|
|
"num_tokens": 174594155.0,
|
||
|
|
"step": 422
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7421052631578947,
|
||
|
|
"grad_norm": 0.7065144777297974,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.189,
|
||
|
|
"mean_token_accuracy": 0.6611475348472595,
|
||
|
|
"num_tokens": 175025672.0,
|
||
|
|
"step": 423
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.743859649122807,
|
||
|
|
"grad_norm": 0.6348630785942078,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2285,
|
||
|
|
"mean_token_accuracy": 0.6518391370773315,
|
||
|
|
"num_tokens": 175452636.0,
|
||
|
|
"step": 424
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7456140350877193,
|
||
|
|
"grad_norm": 0.6401616930961609,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2204,
|
||
|
|
"mean_token_accuracy": 0.655087947845459,
|
||
|
|
"num_tokens": 175879906.0,
|
||
|
|
"step": 425
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7473684210526316,
|
||
|
|
"grad_norm": 0.6971575617790222,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2111,
|
||
|
|
"mean_token_accuracy": 0.6562093496322632,
|
||
|
|
"num_tokens": 176292162.0,
|
||
|
|
"step": 426
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7491228070175439,
|
||
|
|
"grad_norm": 0.6440587043762207,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2012,
|
||
|
|
"mean_token_accuracy": 0.6593471765518188,
|
||
|
|
"num_tokens": 176720725.0,
|
||
|
|
"step": 427
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7508771929824561,
|
||
|
|
"grad_norm": 0.597520649433136,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2284,
|
||
|
|
"mean_token_accuracy": 0.6531672477722168,
|
||
|
|
"num_tokens": 177161243.0,
|
||
|
|
"step": 428
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7526315789473684,
|
||
|
|
"grad_norm": 0.8046004772186279,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1928,
|
||
|
|
"mean_token_accuracy": 0.6594202518463135,
|
||
|
|
"num_tokens": 177562007.0,
|
||
|
|
"step": 429
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7543859649122807,
|
||
|
|
"grad_norm": 0.6298813223838806,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.219,
|
||
|
|
"mean_token_accuracy": 0.6546623706817627,
|
||
|
|
"num_tokens": 177983323.0,
|
||
|
|
"step": 430
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.756140350877193,
|
||
|
|
"grad_norm": 0.5731974840164185,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2153,
|
||
|
|
"mean_token_accuracy": 0.6574859619140625,
|
||
|
|
"num_tokens": 178409419.0,
|
||
|
|
"step": 431
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7578947368421053,
|
||
|
|
"grad_norm": 0.7396548390388489,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2113,
|
||
|
|
"mean_token_accuracy": 0.6574329137802124,
|
||
|
|
"num_tokens": 178832025.0,
|
||
|
|
"step": 432
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7596491228070176,
|
||
|
|
"grad_norm": 0.6398889422416687,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2159,
|
||
|
|
"mean_token_accuracy": 0.6554309129714966,
|
||
|
|
"num_tokens": 179246477.0,
|
||
|
|
"step": 433
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7614035087719299,
|
||
|
|
"grad_norm": 0.6085229516029358,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2216,
|
||
|
|
"mean_token_accuracy": 0.6549739837646484,
|
||
|
|
"num_tokens": 179666041.0,
|
||
|
|
"step": 434
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7631578947368421,
|
||
|
|
"grad_norm": 0.7816640734672546,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2119,
|
||
|
|
"mean_token_accuracy": 0.6565245389938354,
|
||
|
|
"num_tokens": 180092225.0,
|
||
|
|
"step": 435
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7649122807017544,
|
||
|
|
"grad_norm": 0.8083506226539612,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1961,
|
||
|
|
"mean_token_accuracy": 0.6606351733207703,
|
||
|
|
"num_tokens": 180498123.0,
|
||
|
|
"step": 436
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7666666666666667,
|
||
|
|
"grad_norm": 0.6019986271858215,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1972,
|
||
|
|
"mean_token_accuracy": 0.6611742377281189,
|
||
|
|
"num_tokens": 180904735.0,
|
||
|
|
"step": 437
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7684210526315789,
|
||
|
|
"grad_norm": 0.6621778011322021,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1987,
|
||
|
|
"mean_token_accuracy": 0.6592767238616943,
|
||
|
|
"num_tokens": 181297827.0,
|
||
|
|
"step": 438
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7701754385964912,
|
||
|
|
"grad_norm": 0.5817862749099731,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2096,
|
||
|
|
"mean_token_accuracy": 0.6584010124206543,
|
||
|
|
"num_tokens": 181722096.0,
|
||
|
|
"step": 439
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7719298245614035,
|
||
|
|
"grad_norm": 0.6433981657028198,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1872,
|
||
|
|
"mean_token_accuracy": 0.6626437902450562,
|
||
|
|
"num_tokens": 182124515.0,
|
||
|
|
"step": 440
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7736842105263158,
|
||
|
|
"grad_norm": 0.6573434472084045,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2159,
|
||
|
|
"mean_token_accuracy": 0.6551169157028198,
|
||
|
|
"num_tokens": 182542265.0,
|
||
|
|
"step": 441
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.775438596491228,
|
||
|
|
"grad_norm": 0.684744656085968,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2095,
|
||
|
|
"mean_token_accuracy": 0.6574633121490479,
|
||
|
|
"num_tokens": 182942368.0,
|
||
|
|
"step": 442
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7771929824561403,
|
||
|
|
"grad_norm": 0.5961515307426453,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2012,
|
||
|
|
"mean_token_accuracy": 0.6580845713615417,
|
||
|
|
"num_tokens": 183350652.0,
|
||
|
|
"step": 443
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7789473684210526,
|
||
|
|
"grad_norm": 0.7268422842025757,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2082,
|
||
|
|
"mean_token_accuracy": 0.657654345035553,
|
||
|
|
"num_tokens": 183786290.0,
|
||
|
|
"step": 444
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7807017543859649,
|
||
|
|
"grad_norm": 0.7548661231994629,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.203,
|
||
|
|
"mean_token_accuracy": 0.6581261157989502,
|
||
|
|
"num_tokens": 184191985.0,
|
||
|
|
"step": 445
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7824561403508772,
|
||
|
|
"grad_norm": 0.589838981628418,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2253,
|
||
|
|
"mean_token_accuracy": 0.652956485748291,
|
||
|
|
"num_tokens": 184617087.0,
|
||
|
|
"step": 446
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7842105263157895,
|
||
|
|
"grad_norm": 0.7901304960250854,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2023,
|
||
|
|
"mean_token_accuracy": 0.6594702005386353,
|
||
|
|
"num_tokens": 185046113.0,
|
||
|
|
"step": 447
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7859649122807018,
|
||
|
|
"grad_norm": 0.681577205657959,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1765,
|
||
|
|
"mean_token_accuracy": 0.6648210287094116,
|
||
|
|
"num_tokens": 185440210.0,
|
||
|
|
"step": 448
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.787719298245614,
|
||
|
|
"grad_norm": 0.619105339050293,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2151,
|
||
|
|
"mean_token_accuracy": 0.6544394493103027,
|
||
|
|
"num_tokens": 185866240.0,
|
||
|
|
"step": 449
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7894736842105263,
|
||
|
|
"grad_norm": 0.6568613648414612,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1808,
|
||
|
|
"mean_token_accuracy": 0.6645166277885437,
|
||
|
|
"num_tokens": 186262559.0,
|
||
|
|
"step": 450
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7912280701754386,
|
||
|
|
"grad_norm": 0.6452411413192749,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2289,
|
||
|
|
"mean_token_accuracy": 0.6531310677528381,
|
||
|
|
"num_tokens": 186677017.0,
|
||
|
|
"step": 451
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7929824561403509,
|
||
|
|
"grad_norm": 0.6799737215042114,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2207,
|
||
|
|
"mean_token_accuracy": 0.6556583046913147,
|
||
|
|
"num_tokens": 187108135.0,
|
||
|
|
"step": 452
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7947368421052632,
|
||
|
|
"grad_norm": 0.5680040717124939,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1886,
|
||
|
|
"mean_token_accuracy": 0.6613532900810242,
|
||
|
|
"num_tokens": 187533913.0,
|
||
|
|
"step": 453
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7964912280701755,
|
||
|
|
"grad_norm": 0.6380943655967712,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2136,
|
||
|
|
"mean_token_accuracy": 0.6577192544937134,
|
||
|
|
"num_tokens": 187943777.0,
|
||
|
|
"step": 454
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.7982456140350878,
|
||
|
|
"grad_norm": 0.5565281510353088,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1941,
|
||
|
|
"mean_token_accuracy": 0.6604122519493103,
|
||
|
|
"num_tokens": 188365013.0,
|
||
|
|
"step": 455
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8,
|
||
|
|
"grad_norm": 0.6176914572715759,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1957,
|
||
|
|
"mean_token_accuracy": 0.6601607799530029,
|
||
|
|
"num_tokens": 188805546.0,
|
||
|
|
"step": 456
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8017543859649123,
|
||
|
|
"grad_norm": 0.6163376569747925,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1788,
|
||
|
|
"mean_token_accuracy": 0.6634429097175598,
|
||
|
|
"num_tokens": 189204261.0,
|
||
|
|
"step": 457
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8035087719298246,
|
||
|
|
"grad_norm": 0.6874009966850281,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2061,
|
||
|
|
"mean_token_accuracy": 0.6587145924568176,
|
||
|
|
"num_tokens": 189609457.0,
|
||
|
|
"step": 458
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8052631578947368,
|
||
|
|
"grad_norm": 0.6584733724594116,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2077,
|
||
|
|
"mean_token_accuracy": 0.6568840742111206,
|
||
|
|
"num_tokens": 190024904.0,
|
||
|
|
"step": 459
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8070175438596491,
|
||
|
|
"grad_norm": 0.554511547088623,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1883,
|
||
|
|
"mean_token_accuracy": 0.661857008934021,
|
||
|
|
"num_tokens": 190435843.0,
|
||
|
|
"step": 460
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8087719298245614,
|
||
|
|
"grad_norm": 0.6625659465789795,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.209,
|
||
|
|
"mean_token_accuracy": 0.6570659875869751,
|
||
|
|
"num_tokens": 190844879.0,
|
||
|
|
"step": 461
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8105263157894737,
|
||
|
|
"grad_norm": 0.6230789422988892,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1932,
|
||
|
|
"mean_token_accuracy": 0.6605242490768433,
|
||
|
|
"num_tokens": 191240668.0,
|
||
|
|
"step": 462
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8122807017543859,
|
||
|
|
"grad_norm": 0.5848865509033203,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2055,
|
||
|
|
"mean_token_accuracy": 0.6577123999595642,
|
||
|
|
"num_tokens": 191649912.0,
|
||
|
|
"step": 463
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8140350877192982,
|
||
|
|
"grad_norm": 0.7131868600845337,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1945,
|
||
|
|
"mean_token_accuracy": 0.6616647839546204,
|
||
|
|
"num_tokens": 192065093.0,
|
||
|
|
"step": 464
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8157894736842105,
|
||
|
|
"grad_norm": 0.620922863483429,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2134,
|
||
|
|
"mean_token_accuracy": 0.657670259475708,
|
||
|
|
"num_tokens": 192458897.0,
|
||
|
|
"step": 465
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8175438596491228,
|
||
|
|
"grad_norm": 0.6825653910636902,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2083,
|
||
|
|
"mean_token_accuracy": 0.6574592590332031,
|
||
|
|
"num_tokens": 192871837.0,
|
||
|
|
"step": 466
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8192982456140351,
|
||
|
|
"grad_norm": 0.649117648601532,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1904,
|
||
|
|
"mean_token_accuracy": 0.661907970905304,
|
||
|
|
"num_tokens": 193297231.0,
|
||
|
|
"step": 467
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8210526315789474,
|
||
|
|
"grad_norm": 0.5843600630760193,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1613,
|
||
|
|
"mean_token_accuracy": 0.6669655442237854,
|
||
|
|
"num_tokens": 193684403.0,
|
||
|
|
"step": 468
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8228070175438597,
|
||
|
|
"grad_norm": 0.6877574324607849,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2131,
|
||
|
|
"mean_token_accuracy": 0.6550207138061523,
|
||
|
|
"num_tokens": 194113357.0,
|
||
|
|
"step": 469
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8245614035087719,
|
||
|
|
"grad_norm": 0.6516855955123901,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1979,
|
||
|
|
"mean_token_accuracy": 0.65838223695755,
|
||
|
|
"num_tokens": 194526469.0,
|
||
|
|
"step": 470
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8263157894736842,
|
||
|
|
"grad_norm": 0.6000040769577026,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2303,
|
||
|
|
"mean_token_accuracy": 0.6509213447570801,
|
||
|
|
"num_tokens": 194953909.0,
|
||
|
|
"step": 471
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8280701754385965,
|
||
|
|
"grad_norm": 0.6414221525192261,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2276,
|
||
|
|
"mean_token_accuracy": 0.6521142721176147,
|
||
|
|
"num_tokens": 195388769.0,
|
||
|
|
"step": 472
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8298245614035088,
|
||
|
|
"grad_norm": 0.614547848701477,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2174,
|
||
|
|
"mean_token_accuracy": 0.6551234126091003,
|
||
|
|
"num_tokens": 195818916.0,
|
||
|
|
"step": 473
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8315789473684211,
|
||
|
|
"grad_norm": 0.6391692161560059,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1963,
|
||
|
|
"mean_token_accuracy": 0.659719705581665,
|
||
|
|
"num_tokens": 196233034.0,
|
||
|
|
"step": 474
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8333333333333334,
|
||
|
|
"grad_norm": 0.6614966988563538,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2407,
|
||
|
|
"mean_token_accuracy": 0.6485875844955444,
|
||
|
|
"num_tokens": 196660034.0,
|
||
|
|
"step": 475
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8350877192982457,
|
||
|
|
"grad_norm": 0.5896729826927185,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2188,
|
||
|
|
"mean_token_accuracy": 0.6551612615585327,
|
||
|
|
"num_tokens": 197080673.0,
|
||
|
|
"step": 476
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8368421052631579,
|
||
|
|
"grad_norm": 0.6428948044776917,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1962,
|
||
|
|
"mean_token_accuracy": 0.6604630947113037,
|
||
|
|
"num_tokens": 197493003.0,
|
||
|
|
"step": 477
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8385964912280702,
|
||
|
|
"grad_norm": 0.6853853464126587,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2116,
|
||
|
|
"mean_token_accuracy": 0.6553216576576233,
|
||
|
|
"num_tokens": 197923635.0,
|
||
|
|
"step": 478
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8403508771929824,
|
||
|
|
"grad_norm": 0.6877092719078064,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2017,
|
||
|
|
"mean_token_accuracy": 0.657435417175293,
|
||
|
|
"num_tokens": 198339854.0,
|
||
|
|
"step": 479
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8421052631578947,
|
||
|
|
"grad_norm": 0.5886791348457336,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2414,
|
||
|
|
"mean_token_accuracy": 0.6491390466690063,
|
||
|
|
"num_tokens": 198768483.0,
|
||
|
|
"step": 480
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.843859649122807,
|
||
|
|
"grad_norm": 0.8585889935493469,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2168,
|
||
|
|
"mean_token_accuracy": 0.6541799902915955,
|
||
|
|
"num_tokens": 199190842.0,
|
||
|
|
"step": 481
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8456140350877193,
|
||
|
|
"grad_norm": 0.6527767181396484,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2236,
|
||
|
|
"mean_token_accuracy": 0.6539558172225952,
|
||
|
|
"num_tokens": 199620649.0,
|
||
|
|
"step": 482
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8473684210526315,
|
||
|
|
"grad_norm": 0.6834746599197388,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2015,
|
||
|
|
"mean_token_accuracy": 0.6586301326751709,
|
||
|
|
"num_tokens": 200023406.0,
|
||
|
|
"step": 483
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8491228070175438,
|
||
|
|
"grad_norm": 0.6827247142791748,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.178,
|
||
|
|
"mean_token_accuracy": 0.6644470691680908,
|
||
|
|
"num_tokens": 200430239.0,
|
||
|
|
"step": 484
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8508771929824561,
|
||
|
|
"grad_norm": 0.6491426229476929,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1927,
|
||
|
|
"mean_token_accuracy": 0.6621779799461365,
|
||
|
|
"num_tokens": 200833864.0,
|
||
|
|
"step": 485
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8526315789473684,
|
||
|
|
"grad_norm": 0.6229031682014465,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1923,
|
||
|
|
"mean_token_accuracy": 0.6596782207489014,
|
||
|
|
"num_tokens": 201241869.0,
|
||
|
|
"step": 486
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8543859649122807,
|
||
|
|
"grad_norm": 0.5779981017112732,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1857,
|
||
|
|
"mean_token_accuracy": 0.6614409685134888,
|
||
|
|
"num_tokens": 201658653.0,
|
||
|
|
"step": 487
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.856140350877193,
|
||
|
|
"grad_norm": 0.6096077561378479,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2096,
|
||
|
|
"mean_token_accuracy": 0.6574957370758057,
|
||
|
|
"num_tokens": 202086503.0,
|
||
|
|
"step": 488
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8578947368421053,
|
||
|
|
"grad_norm": 0.7495996952056885,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2005,
|
||
|
|
"mean_token_accuracy": 0.6597346067428589,
|
||
|
|
"num_tokens": 202509306.0,
|
||
|
|
"step": 489
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8596491228070176,
|
||
|
|
"grad_norm": 0.6209189295768738,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1753,
|
||
|
|
"mean_token_accuracy": 0.6628221273422241,
|
||
|
|
"num_tokens": 202909087.0,
|
||
|
|
"step": 490
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8614035087719298,
|
||
|
|
"grad_norm": 0.563208281993866,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2005,
|
||
|
|
"mean_token_accuracy": 0.6574559211730957,
|
||
|
|
"num_tokens": 203337615.0,
|
||
|
|
"step": 491
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8631578947368421,
|
||
|
|
"grad_norm": 0.6872074604034424,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1982,
|
||
|
|
"mean_token_accuracy": 0.6597882509231567,
|
||
|
|
"num_tokens": 203754527.0,
|
||
|
|
"step": 492
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8649122807017544,
|
||
|
|
"grad_norm": 0.6505935192108154,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1734,
|
||
|
|
"mean_token_accuracy": 0.666144609451294,
|
||
|
|
"num_tokens": 204166768.0,
|
||
|
|
"step": 493
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8666666666666667,
|
||
|
|
"grad_norm": 0.7290279269218445,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1923,
|
||
|
|
"mean_token_accuracy": 0.6601245403289795,
|
||
|
|
"num_tokens": 204554076.0,
|
||
|
|
"step": 494
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.868421052631579,
|
||
|
|
"grad_norm": 0.6451328992843628,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2272,
|
||
|
|
"mean_token_accuracy": 0.653006911277771,
|
||
|
|
"num_tokens": 204962578.0,
|
||
|
|
"step": 495
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8701754385964913,
|
||
|
|
"grad_norm": 0.7413347363471985,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.153,
|
||
|
|
"mean_token_accuracy": 0.6694173216819763,
|
||
|
|
"num_tokens": 205358271.0,
|
||
|
|
"step": 496
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8719298245614036,
|
||
|
|
"grad_norm": 0.6787010431289673,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2085,
|
||
|
|
"mean_token_accuracy": 0.6561261415481567,
|
||
|
|
"num_tokens": 205786890.0,
|
||
|
|
"step": 497
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8736842105263158,
|
||
|
|
"grad_norm": 0.6698117256164551,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2019,
|
||
|
|
"mean_token_accuracy": 0.6592704057693481,
|
||
|
|
"num_tokens": 206193572.0,
|
||
|
|
"step": 498
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.875438596491228,
|
||
|
|
"grad_norm": 0.6170295476913452,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1723,
|
||
|
|
"mean_token_accuracy": 0.6650887727737427,
|
||
|
|
"num_tokens": 206607450.0,
|
||
|
|
"step": 499
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8771929824561403,
|
||
|
|
"grad_norm": 0.5921252965927124,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1823,
|
||
|
|
"mean_token_accuracy": 0.6633787155151367,
|
||
|
|
"num_tokens": 207005126.0,
|
||
|
|
"step": 500
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8789473684210526,
|
||
|
|
"grad_norm": 0.69658362865448,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1793,
|
||
|
|
"mean_token_accuracy": 0.6646496057510376,
|
||
|
|
"num_tokens": 207394794.0,
|
||
|
|
"step": 501
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8807017543859649,
|
||
|
|
"grad_norm": 0.6810624599456787,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1979,
|
||
|
|
"mean_token_accuracy": 0.6584210395812988,
|
||
|
|
"num_tokens": 207783539.0,
|
||
|
|
"step": 502
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8824561403508772,
|
||
|
|
"grad_norm": 0.6264888644218445,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2045,
|
||
|
|
"mean_token_accuracy": 0.6583248376846313,
|
||
|
|
"num_tokens": 208196198.0,
|
||
|
|
"step": 503
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8842105263157894,
|
||
|
|
"grad_norm": 0.6070482730865479,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1995,
|
||
|
|
"mean_token_accuracy": 0.6601771712303162,
|
||
|
|
"num_tokens": 208602598.0,
|
||
|
|
"step": 504
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8859649122807017,
|
||
|
|
"grad_norm": 0.6856517791748047,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1909,
|
||
|
|
"mean_token_accuracy": 0.6614357233047485,
|
||
|
|
"num_tokens": 209039139.0,
|
||
|
|
"step": 505
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.887719298245614,
|
||
|
|
"grad_norm": 0.5697737336158752,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1967,
|
||
|
|
"mean_token_accuracy": 0.6589823961257935,
|
||
|
|
"num_tokens": 209437950.0,
|
||
|
|
"step": 506
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8894736842105263,
|
||
|
|
"grad_norm": 0.7310987114906311,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.205,
|
||
|
|
"mean_token_accuracy": 0.6580671668052673,
|
||
|
|
"num_tokens": 209862559.0,
|
||
|
|
"step": 507
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8912280701754386,
|
||
|
|
"grad_norm": 0.6229117512702942,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.222,
|
||
|
|
"mean_token_accuracy": 0.6536027789115906,
|
||
|
|
"num_tokens": 210284769.0,
|
||
|
|
"step": 508
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8929824561403509,
|
||
|
|
"grad_norm": 0.5739285349845886,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2059,
|
||
|
|
"mean_token_accuracy": 0.6585639119148254,
|
||
|
|
"num_tokens": 210708859.0,
|
||
|
|
"step": 509
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8947368421052632,
|
||
|
|
"grad_norm": 0.6239802837371826,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1986,
|
||
|
|
"mean_token_accuracy": 0.6589633822441101,
|
||
|
|
"num_tokens": 211130168.0,
|
||
|
|
"step": 510
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8964912280701754,
|
||
|
|
"grad_norm": 0.6617391705513,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2027,
|
||
|
|
"mean_token_accuracy": 0.6577974557876587,
|
||
|
|
"num_tokens": 211529753.0,
|
||
|
|
"step": 511
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.8982456140350877,
|
||
|
|
"grad_norm": 0.638733983039856,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2142,
|
||
|
|
"mean_token_accuracy": 0.6540495157241821,
|
||
|
|
"num_tokens": 211963430.0,
|
||
|
|
"step": 512
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9,
|
||
|
|
"grad_norm": 0.6008735299110413,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2136,
|
||
|
|
"mean_token_accuracy": 0.6559524536132812,
|
||
|
|
"num_tokens": 212387404.0,
|
||
|
|
"step": 513
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9017543859649123,
|
||
|
|
"grad_norm": 0.6343475580215454,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1718,
|
||
|
|
"mean_token_accuracy": 0.6651860475540161,
|
||
|
|
"num_tokens": 212802763.0,
|
||
|
|
"step": 514
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9035087719298246,
|
||
|
|
"grad_norm": 0.637675940990448,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1694,
|
||
|
|
"mean_token_accuracy": 0.6661834716796875,
|
||
|
|
"num_tokens": 213219406.0,
|
||
|
|
"step": 515
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9052631578947369,
|
||
|
|
"grad_norm": 0.5518184900283813,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1954,
|
||
|
|
"mean_token_accuracy": 0.6603313684463501,
|
||
|
|
"num_tokens": 213623710.0,
|
||
|
|
"step": 516
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9070175438596492,
|
||
|
|
"grad_norm": 0.6756175756454468,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1701,
|
||
|
|
"mean_token_accuracy": 0.6667043566703796,
|
||
|
|
"num_tokens": 214053806.0,
|
||
|
|
"step": 517
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9087719298245615,
|
||
|
|
"grad_norm": 0.5964516401290894,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2007,
|
||
|
|
"mean_token_accuracy": 0.6573567390441895,
|
||
|
|
"num_tokens": 214457394.0,
|
||
|
|
"step": 518
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9105263157894737,
|
||
|
|
"grad_norm": 0.745707094669342,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1697,
|
||
|
|
"mean_token_accuracy": 0.6656190156936646,
|
||
|
|
"num_tokens": 214841472.0,
|
||
|
|
"step": 519
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9122807017543859,
|
||
|
|
"grad_norm": 0.5971705317497253,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2061,
|
||
|
|
"mean_token_accuracy": 0.656207799911499,
|
||
|
|
"num_tokens": 215261046.0,
|
||
|
|
"step": 520
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9140350877192982,
|
||
|
|
"grad_norm": 0.7177700400352478,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1753,
|
||
|
|
"mean_token_accuracy": 0.6650264263153076,
|
||
|
|
"num_tokens": 215664423.0,
|
||
|
|
"step": 521
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9157894736842105,
|
||
|
|
"grad_norm": 0.5945612788200378,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1813,
|
||
|
|
"mean_token_accuracy": 0.66515052318573,
|
||
|
|
"num_tokens": 216072733.0,
|
||
|
|
"step": 522
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9175438596491228,
|
||
|
|
"grad_norm": 0.7161288857460022,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1964,
|
||
|
|
"mean_token_accuracy": 0.6598995923995972,
|
||
|
|
"num_tokens": 216490157.0,
|
||
|
|
"step": 523
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9192982456140351,
|
||
|
|
"grad_norm": 0.6490321159362793,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2151,
|
||
|
|
"mean_token_accuracy": 0.6550993919372559,
|
||
|
|
"num_tokens": 216933714.0,
|
||
|
|
"step": 524
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9210526315789473,
|
||
|
|
"grad_norm": 0.6328585743904114,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2008,
|
||
|
|
"mean_token_accuracy": 0.658663272857666,
|
||
|
|
"num_tokens": 217360826.0,
|
||
|
|
"step": 525
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9228070175438596,
|
||
|
|
"grad_norm": 0.6045000553131104,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1837,
|
||
|
|
"mean_token_accuracy": 0.6624784469604492,
|
||
|
|
"num_tokens": 217767964.0,
|
||
|
|
"step": 526
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9245614035087719,
|
||
|
|
"grad_norm": 0.5896552205085754,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1785,
|
||
|
|
"mean_token_accuracy": 0.663489043712616,
|
||
|
|
"num_tokens": 218182502.0,
|
||
|
|
"step": 527
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9263157894736842,
|
||
|
|
"grad_norm": 0.6433465480804443,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1866,
|
||
|
|
"mean_token_accuracy": 0.662973940372467,
|
||
|
|
"num_tokens": 218599164.0,
|
||
|
|
"step": 528
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9280701754385965,
|
||
|
|
"grad_norm": 0.6225712895393372,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1732,
|
||
|
|
"mean_token_accuracy": 0.6657634973526001,
|
||
|
|
"num_tokens": 219038853.0,
|
||
|
|
"step": 529
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9298245614035088,
|
||
|
|
"grad_norm": 0.6584674715995789,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1931,
|
||
|
|
"mean_token_accuracy": 0.6607257127761841,
|
||
|
|
"num_tokens": 219455763.0,
|
||
|
|
"step": 530
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9315789473684211,
|
||
|
|
"grad_norm": 0.5859020352363586,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2043,
|
||
|
|
"mean_token_accuracy": 0.657474935054779,
|
||
|
|
"num_tokens": 219857096.0,
|
||
|
|
"step": 531
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9333333333333333,
|
||
|
|
"grad_norm": 0.6879558563232422,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1761,
|
||
|
|
"mean_token_accuracy": 0.6635361909866333,
|
||
|
|
"num_tokens": 220269043.0,
|
||
|
|
"step": 532
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9350877192982456,
|
||
|
|
"grad_norm": 0.6866979002952576,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2073,
|
||
|
|
"mean_token_accuracy": 0.6566738486289978,
|
||
|
|
"num_tokens": 220650778.0,
|
||
|
|
"step": 533
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9368421052631579,
|
||
|
|
"grad_norm": 0.6336923241615295,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2039,
|
||
|
|
"mean_token_accuracy": 0.6579070091247559,
|
||
|
|
"num_tokens": 221055825.0,
|
||
|
|
"step": 534
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9385964912280702,
|
||
|
|
"grad_norm": 0.6081579327583313,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2108,
|
||
|
|
"mean_token_accuracy": 0.6550259590148926,
|
||
|
|
"num_tokens": 221459868.0,
|
||
|
|
"step": 535
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9403508771929825,
|
||
|
|
"grad_norm": 0.6312009692192078,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1702,
|
||
|
|
"mean_token_accuracy": 0.6641944646835327,
|
||
|
|
"num_tokens": 221860324.0,
|
||
|
|
"step": 536
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9421052631578948,
|
||
|
|
"grad_norm": 0.5887439250946045,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1778,
|
||
|
|
"mean_token_accuracy": 0.6638119220733643,
|
||
|
|
"num_tokens": 222291251.0,
|
||
|
|
"step": 537
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9438596491228071,
|
||
|
|
"grad_norm": 0.543400764465332,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1805,
|
||
|
|
"mean_token_accuracy": 0.6627988815307617,
|
||
|
|
"num_tokens": 222699559.0,
|
||
|
|
"step": 538
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9456140350877194,
|
||
|
|
"grad_norm": 0.5787383913993835,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1914,
|
||
|
|
"mean_token_accuracy": 0.6620433330535889,
|
||
|
|
"num_tokens": 223156357.0,
|
||
|
|
"step": 539
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9473684210526315,
|
||
|
|
"grad_norm": 0.6597963571548462,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1871,
|
||
|
|
"mean_token_accuracy": 0.6626171469688416,
|
||
|
|
"num_tokens": 223562411.0,
|
||
|
|
"step": 540
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9491228070175438,
|
||
|
|
"grad_norm": 0.5731210112571716,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2078,
|
||
|
|
"mean_token_accuracy": 0.6560046076774597,
|
||
|
|
"num_tokens": 223988252.0,
|
||
|
|
"step": 541
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9508771929824561,
|
||
|
|
"grad_norm": 0.7036701440811157,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1917,
|
||
|
|
"mean_token_accuracy": 0.6613459587097168,
|
||
|
|
"num_tokens": 224401217.0,
|
||
|
|
"step": 542
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9526315789473684,
|
||
|
|
"grad_norm": 0.5783252120018005,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1757,
|
||
|
|
"mean_token_accuracy": 0.6623378396034241,
|
||
|
|
"num_tokens": 224807482.0,
|
||
|
|
"step": 543
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9543859649122807,
|
||
|
|
"grad_norm": 0.7617517113685608,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1937,
|
||
|
|
"mean_token_accuracy": 0.659681499004364,
|
||
|
|
"num_tokens": 225220680.0,
|
||
|
|
"step": 544
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.956140350877193,
|
||
|
|
"grad_norm": 0.6007680296897888,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2401,
|
||
|
|
"mean_token_accuracy": 0.6486543416976929,
|
||
|
|
"num_tokens": 225640539.0,
|
||
|
|
"step": 545
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9578947368421052,
|
||
|
|
"grad_norm": 0.7272628545761108,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1822,
|
||
|
|
"mean_token_accuracy": 0.6626489162445068,
|
||
|
|
"num_tokens": 226035924.0,
|
||
|
|
"step": 546
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9596491228070175,
|
||
|
|
"grad_norm": 0.700038492679596,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1658,
|
||
|
|
"mean_token_accuracy": 0.666049599647522,
|
||
|
|
"num_tokens": 226425232.0,
|
||
|
|
"step": 547
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9614035087719298,
|
||
|
|
"grad_norm": 0.6490049958229065,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1691,
|
||
|
|
"mean_token_accuracy": 0.6649153828620911,
|
||
|
|
"num_tokens": 226802383.0,
|
||
|
|
"step": 548
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9631578947368421,
|
||
|
|
"grad_norm": 0.7154028415679932,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1986,
|
||
|
|
"mean_token_accuracy": 0.6584769487380981,
|
||
|
|
"num_tokens": 227227610.0,
|
||
|
|
"step": 549
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9649122807017544,
|
||
|
|
"grad_norm": 0.6601865887641907,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2034,
|
||
|
|
"mean_token_accuracy": 0.6581115126609802,
|
||
|
|
"num_tokens": 227657882.0,
|
||
|
|
"step": 550
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9666666666666667,
|
||
|
|
"grad_norm": 0.6211066842079163,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1749,
|
||
|
|
"mean_token_accuracy": 0.6637754440307617,
|
||
|
|
"num_tokens": 228081460.0,
|
||
|
|
"step": 551
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.968421052631579,
|
||
|
|
"grad_norm": 0.6879007816314697,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2181,
|
||
|
|
"mean_token_accuracy": 0.6532254219055176,
|
||
|
|
"num_tokens": 228507388.0,
|
||
|
|
"step": 552
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9701754385964912,
|
||
|
|
"grad_norm": 0.6297675371170044,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2126,
|
||
|
|
"mean_token_accuracy": 0.654970645904541,
|
||
|
|
"num_tokens": 228935033.0,
|
||
|
|
"step": 553
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9719298245614035,
|
||
|
|
"grad_norm": 0.5917762517929077,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1517,
|
||
|
|
"mean_token_accuracy": 0.6693041920661926,
|
||
|
|
"num_tokens": 229325690.0,
|
||
|
|
"step": 554
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9736842105263158,
|
||
|
|
"grad_norm": 0.6466293334960938,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1573,
|
||
|
|
"mean_token_accuracy": 0.6677490472793579,
|
||
|
|
"num_tokens": 229726549.0,
|
||
|
|
"step": 555
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9754385964912281,
|
||
|
|
"grad_norm": 0.6341378688812256,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.185,
|
||
|
|
"mean_token_accuracy": 0.6614177227020264,
|
||
|
|
"num_tokens": 230122274.0,
|
||
|
|
"step": 556
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9771929824561404,
|
||
|
|
"grad_norm": 0.604850172996521,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1959,
|
||
|
|
"mean_token_accuracy": 0.6580736637115479,
|
||
|
|
"num_tokens": 230526104.0,
|
||
|
|
"step": 557
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9789473684210527,
|
||
|
|
"grad_norm": 0.7436766624450684,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1642,
|
||
|
|
"mean_token_accuracy": 0.6674508452415466,
|
||
|
|
"num_tokens": 230941199.0,
|
||
|
|
"step": 558
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.980701754385965,
|
||
|
|
"grad_norm": 0.6362001895904541,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1751,
|
||
|
|
"mean_token_accuracy": 0.6646236181259155,
|
||
|
|
"num_tokens": 231367872.0,
|
||
|
|
"step": 559
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9824561403508771,
|
||
|
|
"grad_norm": 0.6686745882034302,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2065,
|
||
|
|
"mean_token_accuracy": 0.6581393480300903,
|
||
|
|
"num_tokens": 231804511.0,
|
||
|
|
"step": 560
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9842105263157894,
|
||
|
|
"grad_norm": 0.7186607718467712,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1692,
|
||
|
|
"mean_token_accuracy": 0.6657729148864746,
|
||
|
|
"num_tokens": 232201554.0,
|
||
|
|
"step": 561
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9859649122807017,
|
||
|
|
"grad_norm": 0.5875235795974731,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.185,
|
||
|
|
"mean_token_accuracy": 0.6623810529708862,
|
||
|
|
"num_tokens": 232632247.0,
|
||
|
|
"step": 562
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.987719298245614,
|
||
|
|
"grad_norm": 0.6285355687141418,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1932,
|
||
|
|
"mean_token_accuracy": 0.6584882140159607,
|
||
|
|
"num_tokens": 233040299.0,
|
||
|
|
"step": 563
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9894736842105263,
|
||
|
|
"grad_norm": 0.6787013411521912,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2017,
|
||
|
|
"mean_token_accuracy": 0.6589356660842896,
|
||
|
|
"num_tokens": 233476898.0,
|
||
|
|
"step": 564
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9912280701754386,
|
||
|
|
"grad_norm": 0.5261335372924805,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1674,
|
||
|
|
"mean_token_accuracy": 0.6651347279548645,
|
||
|
|
"num_tokens": 233898203.0,
|
||
|
|
"step": 565
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9929824561403509,
|
||
|
|
"grad_norm": 0.6217278242111206,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.2212,
|
||
|
|
"mean_token_accuracy": 0.653939962387085,
|
||
|
|
"num_tokens": 234316491.0,
|
||
|
|
"step": 566
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9947368421052631,
|
||
|
|
"grad_norm": 0.6469559073448181,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1848,
|
||
|
|
"mean_token_accuracy": 0.6615195870399475,
|
||
|
|
"num_tokens": 234725455.0,
|
||
|
|
"step": 567
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9964912280701754,
|
||
|
|
"grad_norm": 0.6558631062507629,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.173,
|
||
|
|
"mean_token_accuracy": 0.6650323867797852,
|
||
|
|
"num_tokens": 235152094.0,
|
||
|
|
"step": 568
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 0.9982456140350877,
|
||
|
|
"grad_norm": 0.6159579157829285,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1885,
|
||
|
|
"mean_token_accuracy": 0.6604526042938232,
|
||
|
|
"num_tokens": 235558911.0,
|
||
|
|
"step": 569
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0,
|
||
|
|
"grad_norm": 0.6799984574317932,
|
||
|
|
"learning_rate": 0.0001,
|
||
|
|
"loss": 1.1975,
|
||
|
|
"mean_token_accuracy": 0.6584136486053467,
|
||
|
|
"num_tokens": 235994347.0,
|
||
|
|
"step": 570
|
||
|
|
},
|
||
|
|
{
|
||
|
|
"epoch": 1.0,
|
||
|
|
"step": 570,
|
||
|
|
"total_flos": 1.377941890090926e+18,
|
||
|
|
"train_loss": 1.2564991597543682,
|
||
|
|
"train_runtime": 1307.2842,
|
||
|
|
"train_samples_per_second": 111.621,
|
||
|
|
"train_steps_per_second": 0.436
|
||
|
|
}
|
||
|
|
],
|
||
|
|
"logging_steps": 1,
|
||
|
|
"max_steps": 570,
|
||
|
|
"num_input_tokens_seen": 0,
|
||
|
|
"num_train_epochs": 1,
|
||
|
|
"save_steps": 285,
|
||
|
|
"stateful_callbacks": {
|
||
|
|
"TrainerControl": {
|
||
|
|
"args": {
|
||
|
|
"should_epoch_stop": false,
|
||
|
|
"should_evaluate": false,
|
||
|
|
"should_log": false,
|
||
|
|
"should_save": true,
|
||
|
|
"should_training_stop": true
|
||
|
|
},
|
||
|
|
"attributes": {}
|
||
|
|
}
|
||
|
|
},
|
||
|
|
"total_flos": 1.377941890090926e+18,
|
||
|
|
"train_batch_size": 64,
|
||
|
|
"trial_name": null,
|
||
|
|
"trial_params": null
|
||
|
|
}
|