{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 25.0,
  "eval_steps": 500,
  "global_step": 7650,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.016366612111292964,
      "grad_norm": 55.62101037513415,
      "learning_rate": 2.61437908496732e-11,
      "loss": 2.6905,
      "mean_token_accuracy": 0.6009584546089173,
      "num_tokens": 316429.0,
      "step": 5
    },
    {
      "epoch": 0.03273322422258593,
      "grad_norm": 58.172720274495184,
      "learning_rate": 5.88235294117647e-11,
      "loss": 2.7382,
      "mean_token_accuracy": 0.595202910900116,
      "num_tokens": 631415.0,
      "step": 10
    },
    {
      "epoch": 0.049099836333878884,
      "grad_norm": 56.310876284999395,
      "learning_rate": 9.150326797385621e-11,
      "loss": 2.7181,
      "mean_token_accuracy": 0.5965762197971344,
      "num_tokens": 948728.0,
      "step": 15
    },
{
|
|
"epoch": 0.06546644844517185,
|
|
"grad_norm": 55.77884956792775,
|
|
"learning_rate": 1.241830065359477e-10,
|
|
"loss": 2.7651,
|
|
"mean_token_accuracy": 0.5920272588729858,
|
|
"num_tokens": 1262537.0,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 0.08183306055646482,
|
|
"grad_norm": 56.771410740550785,
|
|
"learning_rate": 1.5686274509803922e-10,
|
|
"loss": 2.75,
|
|
"mean_token_accuracy": 0.5928871929645538,
|
|
"num_tokens": 1578196.0,
|
|
"step": 25
|
|
},
|
|
{
|
|
"epoch": 0.09819967266775777,
|
|
"grad_norm": 55.98400313077124,
|
|
"learning_rate": 1.895424836601307e-10,
|
|
"loss": 2.7047,
|
|
"mean_token_accuracy": 0.6000989556312561,
|
|
"num_tokens": 1894739.0,
|
|
"step": 30
|
|
},
|
|
{
|
|
"epoch": 0.11456628477905073,
|
|
"grad_norm": 57.760741569113556,
|
|
"learning_rate": 2.2222222222222224e-10,
|
|
"loss": 2.7384,
|
|
"mean_token_accuracy": 0.5951428174972534,
|
|
"num_tokens": 2210581.0,
|
|
"step": 35
|
|
},
|
|
{
|
|
"epoch": 0.1309328968903437,
|
|
"grad_norm": 57.28938487977958,
|
|
"learning_rate": 2.5490196078431375e-10,
|
|
"loss": 2.7062,
|
|
"mean_token_accuracy": 0.5997058689594269,
|
|
"num_tokens": 2526492.0,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 0.14729950900163666,
|
|
"grad_norm": 56.76416543532324,
|
|
"learning_rate": 2.8758169934640523e-10,
|
|
"loss": 2.7623,
|
|
"mean_token_accuracy": 0.5918912470340729,
|
|
"num_tokens": 2842714.0,
|
|
"step": 45
|
|
},
|
|
{
|
|
"epoch": 0.16366612111292964,
|
|
"grad_norm": 56.38556168991693,
|
|
"learning_rate": 3.202614379084967e-10,
|
|
"loss": 2.6966,
|
|
"mean_token_accuracy": 0.5994902729988099,
|
|
"num_tokens": 3159154.0,
|
|
"step": 50
|
|
},
|
|
{
|
|
"epoch": 0.18003273322422259,
|
|
"grad_norm": 56.99858489246395,
|
|
"learning_rate": 3.5294117647058825e-10,
|
|
"loss": 2.6908,
|
|
"mean_token_accuracy": 0.600296539068222,
|
|
"num_tokens": 3472831.0,
|
|
"step": 55
|
|
},
|
|
{
|
|
"epoch": 0.19639934533551553,
|
|
"grad_norm": 54.51487201849461,
|
|
"learning_rate": 3.856209150326798e-10,
|
|
"loss": 2.678,
|
|
"mean_token_accuracy": 0.60253204703331,
|
|
"num_tokens": 3788192.0,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 0.2127659574468085,
|
|
"grad_norm": 56.88214275656488,
|
|
"learning_rate": 4.183006535947712e-10,
|
|
"loss": 2.746,
|
|
"mean_token_accuracy": 0.5942470014095307,
|
|
"num_tokens": 4103915.0,
|
|
"step": 65
|
|
},
|
|
{
|
|
"epoch": 0.22913256955810146,
|
|
"grad_norm": 59.43648730633251,
|
|
"learning_rate": 4.5098039215686275e-10,
|
|
"loss": 2.7032,
|
|
"mean_token_accuracy": 0.5992533683776855,
|
|
"num_tokens": 4420361.0,
|
|
"step": 70
|
|
},
|
|
{
|
|
"epoch": 0.24549918166939444,
|
|
"grad_norm": 56.43974466098924,
|
|
"learning_rate": 4.836601307189543e-10,
|
|
"loss": 2.7446,
|
|
"mean_token_accuracy": 0.5943080246448517,
|
|
"num_tokens": 4735381.0,
|
|
"step": 75
|
|
},
|
|
{
|
|
"epoch": 0.2618657937806874,
|
|
"grad_norm": 56.16523920809429,
|
|
"learning_rate": 5.163398692810458e-10,
|
|
"loss": 2.6893,
|
|
"mean_token_accuracy": 0.6008193671703339,
|
|
"num_tokens": 5050563.0,
|
|
"step": 80
|
|
},
|
|
{
|
|
"epoch": 0.27823240589198034,
|
|
"grad_norm": 57.720048665459046,
|
|
"learning_rate": 5.490196078431373e-10,
|
|
"loss": 2.7405,
|
|
"mean_token_accuracy": 0.5943594813346863,
|
|
"num_tokens": 5364896.0,
|
|
"step": 85
|
|
},
|
|
{
|
|
"epoch": 0.2945990180032733,
|
|
"grad_norm": 56.75066213164205,
|
|
"learning_rate": 5.816993464052287e-10,
|
|
"loss": 2.715,
|
|
"mean_token_accuracy": 0.5964757144451142,
|
|
"num_tokens": 5680185.0,
|
|
"step": 90
|
|
},
|
|
{
|
|
"epoch": 0.3109656301145663,
|
|
"grad_norm": 54.15505392392569,
|
|
"learning_rate": 6.143790849673202e-10,
|
|
"loss": 2.664,
|
|
"mean_token_accuracy": 0.60406773686409,
|
|
"num_tokens": 5994650.0,
|
|
"step": 95
|
|
},
|
|
{
|
|
"epoch": 0.32733224222585927,
|
|
"grad_norm": 59.45987110326202,
|
|
"learning_rate": 6.470588235294118e-10,
|
|
"loss": 2.7524,
|
|
"mean_token_accuracy": 0.5922898650169373,
|
|
"num_tokens": 6310228.0,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 0.3436988543371522,
|
|
"grad_norm": 57.085451412144366,
|
|
"learning_rate": 6.797385620915032e-10,
|
|
"loss": 2.7131,
|
|
"mean_token_accuracy": 0.5981797277927399,
|
|
"num_tokens": 6626412.0,
|
|
"step": 105
|
|
},
|
|
{
|
|
"epoch": 0.36006546644844517,
|
|
"grad_norm": 55.63396297277116,
|
|
"learning_rate": 7.124183006535948e-10,
|
|
"loss": 2.7468,
|
|
"mean_token_accuracy": 0.5937129497528076,
|
|
"num_tokens": 6942407.0,
|
|
"step": 110
|
|
},
|
|
{
|
|
"epoch": 0.37643207855973815,
|
|
"grad_norm": 55.04424122552027,
|
|
"learning_rate": 7.450980392156863e-10,
|
|
"loss": 2.6654,
|
|
"mean_token_accuracy": 0.6039653539657592,
|
|
"num_tokens": 7259157.0,
|
|
"step": 115
|
|
},
|
|
{
|
|
"epoch": 0.39279869067103107,
|
|
"grad_norm": 55.91055581080712,
|
|
"learning_rate": 7.777777777777778e-10,
|
|
"loss": 2.7051,
|
|
"mean_token_accuracy": 0.5981750130653382,
|
|
"num_tokens": 7572014.0,
|
|
"step": 120
|
|
},
|
|
{
|
|
"epoch": 0.40916530278232405,
|
|
"grad_norm": 57.70042748713812,
|
|
"learning_rate": 8.104575163398693e-10,
|
|
"loss": 2.7115,
|
|
"mean_token_accuracy": 0.5981932282447815,
|
|
"num_tokens": 7887390.0,
|
|
"step": 125
|
|
},
|
|
{
|
|
"epoch": 0.425531914893617,
|
|
"grad_norm": 53.28893293552655,
|
|
"learning_rate": 8.431372549019608e-10,
|
|
"loss": 2.6906,
|
|
"mean_token_accuracy": 0.6010222673416138,
|
|
"num_tokens": 8202912.0,
|
|
"step": 130
|
|
},
|
|
{
|
|
"epoch": 0.44189852700491,
|
|
"grad_norm": 56.73313551252287,
|
|
"learning_rate": 8.758169934640522e-10,
|
|
"loss": 2.7097,
|
|
"mean_token_accuracy": 0.5978877365589141,
|
|
"num_tokens": 8521524.0,
|
|
"step": 135
|
|
},
|
|
{
|
|
"epoch": 0.4582651391162029,
|
|
"grad_norm": 58.905904373776984,
|
|
"learning_rate": 9.084967320261438e-10,
|
|
"loss": 2.7888,
|
|
"mean_token_accuracy": 0.5877204239368439,
|
|
"num_tokens": 8837257.0,
|
|
"step": 140
|
|
},
|
|
{
|
|
"epoch": 0.4746317512274959,
|
|
"grad_norm": 55.71496830883279,
|
|
"learning_rate": 9.411764705882353e-10,
|
|
"loss": 2.72,
|
|
"mean_token_accuracy": 0.5970898568630219,
|
|
"num_tokens": 9152286.0,
|
|
"step": 145
|
|
},
|
|
{
|
|
"epoch": 0.4909983633387889,
|
|
"grad_norm": 60.74321699143503,
|
|
"learning_rate": 9.738562091503268e-10,
|
|
"loss": 2.7175,
|
|
"mean_token_accuracy": 0.5960192620754242,
|
|
"num_tokens": 9467071.0,
|
|
"step": 150
|
|
},
|
|
{
|
|
"epoch": 0.5073649754500819,
|
|
"grad_norm": 55.91981881970168,
|
|
"learning_rate": 1.0065359477124184e-09,
|
|
"loss": 2.727,
|
|
"mean_token_accuracy": 0.5952532231807709,
|
|
"num_tokens": 9782388.0,
|
|
"step": 155
|
|
},
|
|
{
|
|
"epoch": 0.5237315875613748,
|
|
"grad_norm": 59.43252838902965,
|
|
"learning_rate": 1.03921568627451e-09,
|
|
"loss": 2.7349,
|
|
"mean_token_accuracy": 0.5958111166954041,
|
|
"num_tokens": 10098522.0,
|
|
"step": 160
|
|
},
|
|
{
|
|
"epoch": 0.5400981996726678,
|
|
"grad_norm": 54.711678745868866,
|
|
"learning_rate": 1.0718954248366012e-09,
|
|
"loss": 2.6838,
|
|
"mean_token_accuracy": 0.6017765641212464,
|
|
"num_tokens": 10414165.0,
|
|
"step": 165
|
|
},
|
|
{
|
|
"epoch": 0.5564648117839607,
|
|
"grad_norm": 59.2576714699596,
|
|
"learning_rate": 1.1045751633986929e-09,
|
|
"loss": 2.7696,
|
|
"mean_token_accuracy": 0.5895721673965454,
|
|
"num_tokens": 10728716.0,
|
|
"step": 170
|
|
},
|
|
{
|
|
"epoch": 0.5728314238952537,
|
|
"grad_norm": 55.33516265128209,
|
|
"learning_rate": 1.1372549019607844e-09,
|
|
"loss": 2.7015,
|
|
"mean_token_accuracy": 0.5986992299556733,
|
|
"num_tokens": 11044535.0,
|
|
"step": 175
|
|
},
|
|
{
|
|
"epoch": 0.5891980360065466,
|
|
"grad_norm": 54.35208599375252,
|
|
"learning_rate": 1.1699346405228759e-09,
|
|
"loss": 2.6997,
|
|
"mean_token_accuracy": 0.5996046185493469,
|
|
"num_tokens": 11360787.0,
|
|
"step": 180
|
|
},
|
|
{
|
|
"epoch": 0.6055646481178396,
|
|
"grad_norm": 54.908204606493655,
|
|
"learning_rate": 1.2026143790849673e-09,
|
|
"loss": 2.6522,
|
|
"mean_token_accuracy": 0.6056609749794006,
|
|
"num_tokens": 11676189.0,
|
|
"step": 185
|
|
},
|
|
{
|
|
"epoch": 0.6219312602291326,
|
|
"grad_norm": 57.15438435248898,
|
|
"learning_rate": 1.2352941176470588e-09,
|
|
"loss": 2.7403,
|
|
"mean_token_accuracy": 0.5933549106121063,
|
|
"num_tokens": 11993732.0,
|
|
"step": 190
|
|
},
|
|
{
|
|
"epoch": 0.6382978723404256,
|
|
"grad_norm": 55.85187532544056,
|
|
"learning_rate": 1.2679738562091503e-09,
|
|
"loss": 2.7242,
|
|
"mean_token_accuracy": 0.5965588092803955,
|
|
"num_tokens": 12309686.0,
|
|
"step": 195
|
|
},
|
|
{
|
|
"epoch": 0.6546644844517185,
|
|
"grad_norm": 57.169912622360634,
|
|
"learning_rate": 1.300653594771242e-09,
|
|
"loss": 2.6877,
|
|
"mean_token_accuracy": 0.6016717553138733,
|
|
"num_tokens": 12626255.0,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 0.6710310965630114,
|
|
"grad_norm": 56.35014678877227,
|
|
"learning_rate": 1.3333333333333333e-09,
|
|
"loss": 2.7287,
|
|
"mean_token_accuracy": 0.594957309961319,
|
|
"num_tokens": 12942690.0,
|
|
"step": 205
|
|
},
|
|
{
|
|
"epoch": 0.6873977086743044,
|
|
"grad_norm": 58.920641210373454,
|
|
"learning_rate": 1.3660130718954248e-09,
|
|
"loss": 2.7485,
|
|
"mean_token_accuracy": 0.5927645146846772,
|
|
"num_tokens": 13259396.0,
|
|
"step": 210
|
|
},
|
|
{
|
|
"epoch": 0.7037643207855974,
|
|
"grad_norm": 57.018665034459005,
|
|
"learning_rate": 1.3986928104575165e-09,
|
|
"loss": 2.7263,
|
|
"mean_token_accuracy": 0.5958705544471741,
|
|
"num_tokens": 13574685.0,
|
|
"step": 215
|
|
},
|
|
{
|
|
"epoch": 0.7201309328968903,
|
|
"grad_norm": 57.70089348693714,
|
|
"learning_rate": 1.4313725490196077e-09,
|
|
"loss": 2.7246,
|
|
"mean_token_accuracy": 0.5950082004070282,
|
|
"num_tokens": 13889357.0,
|
|
"step": 220
|
|
},
|
|
{
|
|
"epoch": 0.7364975450081833,
|
|
"grad_norm": 57.363546508146634,
|
|
"learning_rate": 1.4640522875816994e-09,
|
|
"loss": 2.7261,
|
|
"mean_token_accuracy": 0.5963014245033265,
|
|
"num_tokens": 14205437.0,
|
|
"step": 225
|
|
},
|
|
{
|
|
"epoch": 0.7528641571194763,
|
|
"grad_norm": 53.49987696781798,
|
|
"learning_rate": 1.496732026143791e-09,
|
|
"loss": 2.7102,
|
|
"mean_token_accuracy": 0.5987072646617889,
|
|
"num_tokens": 14521840.0,
|
|
"step": 230
|
|
},
|
|
{
|
|
"epoch": 0.7692307692307693,
|
|
"grad_norm": 56.20359405171631,
|
|
"learning_rate": 1.5294117647058826e-09,
|
|
"loss": 2.7486,
|
|
"mean_token_accuracy": 0.5931043148040771,
|
|
"num_tokens": 14836289.0,
|
|
"step": 235
|
|
},
|
|
{
|
|
"epoch": 0.7855973813420621,
|
|
"grad_norm": 57.02411335901064,
|
|
"learning_rate": 1.5620915032679739e-09,
|
|
"loss": 2.7297,
|
|
"mean_token_accuracy": 0.5947885155677796,
|
|
"num_tokens": 15150656.0,
|
|
"step": 240
|
|
},
|
|
{
|
|
"epoch": 0.8019639934533551,
|
|
"grad_norm": 56.33397544722411,
|
|
"learning_rate": 1.5947712418300654e-09,
|
|
"loss": 2.7665,
|
|
"mean_token_accuracy": 0.5909724771976471,
|
|
"num_tokens": 15465617.0,
|
|
"step": 245
|
|
},
|
|
{
|
|
"epoch": 0.8183306055646481,
|
|
"grad_norm": 57.676254136521834,
|
|
"learning_rate": 1.627450980392157e-09,
|
|
"loss": 2.7472,
|
|
"mean_token_accuracy": 0.5932631254196167,
|
|
"num_tokens": 15781412.0,
|
|
"step": 250
|
|
},
|
|
{
|
|
"epoch": 0.8346972176759411,
|
|
"grad_norm": 56.46876654435729,
|
|
"learning_rate": 1.6601307189542483e-09,
|
|
"loss": 2.7183,
|
|
"mean_token_accuracy": 0.5978624880313873,
|
|
"num_tokens": 16097194.0,
|
|
"step": 255
|
|
},
|
|
{
|
|
"epoch": 0.851063829787234,
|
|
"grad_norm": 58.43011011505955,
|
|
"learning_rate": 1.69281045751634e-09,
|
|
"loss": 2.7817,
|
|
"mean_token_accuracy": 0.5885932326316834,
|
|
"num_tokens": 16414161.0,
|
|
"step": 260
|
|
},
|
|
{
|
|
"epoch": 0.867430441898527,
|
|
"grad_norm": 57.54400391752138,
|
|
"learning_rate": 1.7254901960784313e-09,
|
|
"loss": 2.6971,
|
|
"mean_token_accuracy": 0.5998899936676025,
|
|
"num_tokens": 16729455.0,
|
|
"step": 265
|
|
},
|
|
{
|
|
"epoch": 0.88379705400982,
|
|
"grad_norm": 55.728573679624986,
|
|
"learning_rate": 1.758169934640523e-09,
|
|
"loss": 2.7623,
|
|
"mean_token_accuracy": 0.5904077827930451,
|
|
"num_tokens": 17044493.0,
|
|
"step": 270
|
|
},
|
|
{
|
|
"epoch": 0.900163666121113,
|
|
"grad_norm": 57.60438824990003,
|
|
"learning_rate": 1.7908496732026145e-09,
|
|
"loss": 2.7451,
|
|
"mean_token_accuracy": 0.5936128437519074,
|
|
"num_tokens": 17360258.0,
|
|
"step": 275
|
|
},
|
|
{
|
|
"epoch": 0.9165302782324058,
|
|
"grad_norm": 52.461559653921434,
|
|
"learning_rate": 1.8235294117647057e-09,
|
|
"loss": 2.6809,
|
|
"mean_token_accuracy": 0.6017558097839355,
|
|
"num_tokens": 17674914.0,
|
|
"step": 280
|
|
},
|
|
{
|
|
"epoch": 0.9328968903436988,
|
|
"grad_norm": 54.08802081407188,
|
|
"learning_rate": 1.8562091503267974e-09,
|
|
"loss": 2.6901,
|
|
"mean_token_accuracy": 0.6013341307640075,
|
|
"num_tokens": 17991190.0,
|
|
"step": 285
|
|
},
|
|
{
|
|
"epoch": 0.9492635024549918,
|
|
"grad_norm": 54.9744366583653,
|
|
"learning_rate": 1.8888888888888887e-09,
|
|
"loss": 2.6909,
|
|
"mean_token_accuracy": 0.600698959827423,
|
|
"num_tokens": 18306141.0,
|
|
"step": 290
|
|
},
|
|
{
|
|
"epoch": 0.9656301145662848,
|
|
"grad_norm": 58.033499216049144,
|
|
"learning_rate": 1.9215686274509804e-09,
|
|
"loss": 2.7827,
|
|
"mean_token_accuracy": 0.5892059445381165,
|
|
"num_tokens": 18622962.0,
|
|
"step": 295
|
|
},
|
|
{
|
|
"epoch": 0.9819967266775778,
|
|
"grad_norm": 56.18744968213324,
|
|
"learning_rate": 1.954248366013072e-09,
|
|
"loss": 2.7342,
|
|
"mean_token_accuracy": 0.5947219908237458,
|
|
"num_tokens": 18938998.0,
|
|
"step": 300
|
|
},
|
|
{
|
|
"epoch": 0.9983633387888707,
|
|
"grad_norm": 57.00662709599486,
|
|
"learning_rate": 1.9869281045751634e-09,
|
|
"loss": 2.7185,
|
|
"mean_token_accuracy": 0.5977497279644013,
|
|
"num_tokens": 19254740.0,
|
|
"step": 305
|
|
},
|
|
{
|
|
"epoch": 1.0130932896890343,
|
|
"grad_norm": 58.15220601527463,
|
|
"learning_rate": 2.019607843137255e-09,
|
|
"loss": 2.7109,
|
|
"mean_token_accuracy": 0.5939835707346598,
|
|
"num_tokens": 19514459.0,
|
|
"step": 310
|
|
},
|
|
{
|
|
"epoch": 1.0294599018003274,
|
|
"grad_norm": 57.382494721681105,
|
|
"learning_rate": 2.0522875816993463e-09,
|
|
"loss": 2.7038,
|
|
"mean_token_accuracy": 0.5981919527053833,
|
|
"num_tokens": 19830901.0,
|
|
"step": 315
|
|
},
|
|
{
|
|
"epoch": 1.0458265139116203,
|
|
"grad_norm": 56.81487821310481,
|
|
"learning_rate": 2.084967320261438e-09,
|
|
"loss": 2.7198,
|
|
"mean_token_accuracy": 0.5969063222408295,
|
|
"num_tokens": 20147766.0,
|
|
"step": 320
|
|
},
|
|
{
|
|
"epoch": 1.0621931260229132,
|
|
"grad_norm": 56.514014528428106,
|
|
"learning_rate": 2.1176470588235293e-09,
|
|
"loss": 2.7307,
|
|
"mean_token_accuracy": 0.5957262694835663,
|
|
"num_tokens": 20463289.0,
|
|
"step": 325
|
|
},
|
|
{
|
|
"epoch": 1.0785597381342062,
|
|
"grad_norm": 55.71056833038111,
|
|
"learning_rate": 2.150326797385621e-09,
|
|
"loss": 2.7166,
|
|
"mean_token_accuracy": 0.5964736342430115,
|
|
"num_tokens": 20778436.0,
|
|
"step": 330
|
|
},
|
|
{
|
|
"epoch": 1.0949263502454991,
|
|
"grad_norm": 57.82464672665498,
|
|
"learning_rate": 2.1830065359477127e-09,
|
|
"loss": 2.6896,
|
|
"mean_token_accuracy": 0.6009782791137696,
|
|
"num_tokens": 21094311.0,
|
|
"step": 335
|
|
},
|
|
{
|
|
"epoch": 1.1112929623567922,
|
|
"grad_norm": 57.361293489004666,
|
|
"learning_rate": 2.215686274509804e-09,
|
|
"loss": 2.713,
|
|
"mean_token_accuracy": 0.5986053884029389,
|
|
"num_tokens": 21410544.0,
|
|
"step": 340
|
|
},
|
|
{
|
|
"epoch": 1.127659574468085,
|
|
"grad_norm": 57.72987276907856,
|
|
"learning_rate": 2.2483660130718956e-09,
|
|
"loss": 2.7463,
|
|
"mean_token_accuracy": 0.5940020740032196,
|
|
"num_tokens": 21725832.0,
|
|
"step": 345
|
|
},
|
|
{
|
|
"epoch": 1.1440261865793782,
|
|
"grad_norm": 55.803725938254715,
|
|
"learning_rate": 2.281045751633987e-09,
|
|
"loss": 2.7144,
|
|
"mean_token_accuracy": 0.5972111761569977,
|
|
"num_tokens": 22041212.0,
|
|
"step": 350
|
|
},
|
|
{
|
|
"epoch": 1.160392798690671,
|
|
"grad_norm": 55.27391304057763,
|
|
"learning_rate": 2.3137254901960786e-09,
|
|
"loss": 2.7043,
|
|
"mean_token_accuracy": 0.59928178191185,
|
|
"num_tokens": 22357705.0,
|
|
"step": 355
|
|
},
|
|
{
|
|
"epoch": 1.1767594108019641,
|
|
"grad_norm": 55.886619087218165,
|
|
"learning_rate": 2.34640522875817e-09,
|
|
"loss": 2.7442,
|
|
"mean_token_accuracy": 0.5932347357273102,
|
|
"num_tokens": 22673798.0,
|
|
"step": 360
|
|
},
|
|
{
|
|
"epoch": 1.193126022913257,
|
|
"grad_norm": 57.751208013174015,
|
|
"learning_rate": 2.379084967320261e-09,
|
|
"loss": 2.7045,
|
|
"mean_token_accuracy": 0.5995621025562287,
|
|
"num_tokens": 22989381.0,
|
|
"step": 365
|
|
},
|
|
{
|
|
"epoch": 1.2094926350245498,
|
|
"grad_norm": 57.77951094605076,
|
|
"learning_rate": 2.411764705882353e-09,
|
|
"loss": 2.7395,
|
|
"mean_token_accuracy": 0.5948926329612731,
|
|
"num_tokens": 23305377.0,
|
|
"step": 370
|
|
},
|
|
{
|
|
"epoch": 1.225859247135843,
|
|
"grad_norm": 54.89485818089428,
|
|
"learning_rate": 2.4444444444444446e-09,
|
|
"loss": 2.6894,
|
|
"mean_token_accuracy": 0.6005812406539917,
|
|
"num_tokens": 23620440.0,
|
|
"step": 375
|
|
},
|
|
{
|
|
"epoch": 1.2422258592471358,
|
|
"grad_norm": 57.33835006471561,
|
|
"learning_rate": 2.4771241830065362e-09,
|
|
"loss": 2.7431,
|
|
"mean_token_accuracy": 0.5935854852199555,
|
|
"num_tokens": 23935615.0,
|
|
"step": 380
|
|
},
|
|
{
|
|
"epoch": 1.2585924713584289,
|
|
"grad_norm": 56.69754723412471,
|
|
"learning_rate": 2.5098039215686275e-09,
|
|
"loss": 2.7055,
|
|
"mean_token_accuracy": 0.5975941598415375,
|
|
"num_tokens": 24250759.0,
|
|
"step": 385
|
|
},
|
|
{
|
|
"epoch": 1.2749590834697218,
|
|
"grad_norm": 58.64054742433329,
|
|
"learning_rate": 2.542483660130719e-09,
|
|
"loss": 2.7932,
|
|
"mean_token_accuracy": 0.5873025774955749,
|
|
"num_tokens": 24566546.0,
|
|
"step": 390
|
|
},
|
|
{
|
|
"epoch": 1.2913256955810146,
|
|
"grad_norm": 53.341820755206776,
|
|
"learning_rate": 2.57516339869281e-09,
|
|
"loss": 2.7041,
|
|
"mean_token_accuracy": 0.5986806511878967,
|
|
"num_tokens": 24882934.0,
|
|
"step": 395
|
|
},
|
|
{
|
|
"epoch": 1.3076923076923077,
|
|
"grad_norm": 55.89678065009442,
|
|
"learning_rate": 2.607843137254902e-09,
|
|
"loss": 2.7215,
|
|
"mean_token_accuracy": 0.596479618549347,
|
|
"num_tokens": 25199952.0,
|
|
"step": 400
|
|
},
|
|
{
|
|
"epoch": 1.3240589198036006,
|
|
"grad_norm": 56.031812370748284,
|
|
"learning_rate": 2.6405228758169935e-09,
|
|
"loss": 2.7416,
|
|
"mean_token_accuracy": 0.5937622725963593,
|
|
"num_tokens": 25515305.0,
|
|
"step": 405
|
|
},
|
|
{
|
|
"epoch": 1.3404255319148937,
|
|
"grad_norm": 56.53930697071187,
|
|
"learning_rate": 2.673202614379085e-09,
|
|
"loss": 2.7287,
|
|
"mean_token_accuracy": 0.5946074604988099,
|
|
"num_tokens": 25831044.0,
|
|
"step": 410
|
|
},
|
|
{
|
|
"epoch": 1.3567921440261865,
|
|
"grad_norm": 58.366158564347025,
|
|
"learning_rate": 2.7058823529411764e-09,
|
|
"loss": 2.7198,
|
|
"mean_token_accuracy": 0.596553748846054,
|
|
"num_tokens": 26146292.0,
|
|
"step": 415
|
|
},
|
|
{
|
|
"epoch": 1.3731587561374796,
|
|
"grad_norm": 54.865600304232686,
|
|
"learning_rate": 2.738562091503268e-09,
|
|
"loss": 2.7124,
|
|
"mean_token_accuracy": 0.5966074049472809,
|
|
"num_tokens": 26462672.0,
|
|
"step": 420
|
|
},
|
|
{
|
|
"epoch": 1.3895253682487725,
|
|
"grad_norm": 56.79859615381639,
|
|
"learning_rate": 2.77124183006536e-09,
|
|
"loss": 2.6805,
|
|
"mean_token_accuracy": 0.6013938546180725,
|
|
"num_tokens": 26775810.0,
|
|
"step": 425
|
|
},
|
|
{
|
|
"epoch": 1.4058919803600656,
|
|
"grad_norm": 56.88030488511343,
|
|
"learning_rate": 2.803921568627451e-09,
|
|
"loss": 2.759,
|
|
"mean_token_accuracy": 0.5923470973968505,
|
|
"num_tokens": 27091768.0,
|
|
"step": 430
|
|
},
|
|
{
|
|
"epoch": 1.4222585924713584,
|
|
"grad_norm": 54.682735365518894,
|
|
"learning_rate": 2.8366013071895424e-09,
|
|
"loss": 2.7374,
|
|
"mean_token_accuracy": 0.5940196335315704,
|
|
"num_tokens": 27406064.0,
|
|
"step": 435
|
|
},
|
|
{
|
|
"epoch": 1.4386252045826513,
|
|
"grad_norm": 56.01861141931974,
|
|
"learning_rate": 2.869281045751634e-09,
|
|
"loss": 2.7245,
|
|
"mean_token_accuracy": 0.5955065369606019,
|
|
"num_tokens": 27721370.0,
|
|
"step": 440
|
|
},
|
|
{
|
|
"epoch": 1.4549918166939444,
|
|
"grad_norm": 55.81282965504614,
|
|
"learning_rate": 2.9019607843137257e-09,
|
|
"loss": 2.7094,
|
|
"mean_token_accuracy": 0.5974486649036408,
|
|
"num_tokens": 28037451.0,
|
|
"step": 445
|
|
},
|
|
{
|
|
"epoch": 1.4713584288052373,
|
|
"grad_norm": 54.735450983454065,
|
|
"learning_rate": 2.934640522875817e-09,
|
|
"loss": 2.7323,
|
|
"mean_token_accuracy": 0.5936485469341278,
|
|
"num_tokens": 28352352.0,
|
|
"step": 450
|
|
},
|
|
{
|
|
"epoch": 1.4877250409165304,
|
|
"grad_norm": 56.490867272630034,
|
|
"learning_rate": 2.9673202614379087e-09,
|
|
"loss": 2.7201,
|
|
"mean_token_accuracy": 0.5969516217708588,
|
|
"num_tokens": 28667282.0,
|
|
"step": 455
|
|
},
|
|
{
|
|
"epoch": 1.5040916530278232,
|
|
"grad_norm": 57.59062069435996,
|
|
"learning_rate": 3e-09,
|
|
"loss": 2.7465,
|
|
"mean_token_accuracy": 0.5931427359580994,
|
|
"num_tokens": 28982011.0,
|
|
"step": 460
|
|
},
|
|
{
|
|
"epoch": 1.520458265139116,
|
|
"grad_norm": 56.46328376363721,
|
|
"learning_rate": 3.0326797385620913e-09,
|
|
"loss": 2.7429,
|
|
"mean_token_accuracy": 0.5932705223560333,
|
|
"num_tokens": 29296827.0,
|
|
"step": 465
|
|
},
|
|
{
|
|
"epoch": 1.5368248772504092,
|
|
"grad_norm": 55.01577140250396,
|
|
"learning_rate": 3.0653594771241834e-09,
|
|
"loss": 2.7166,
|
|
"mean_token_accuracy": 0.5958990335464478,
|
|
"num_tokens": 29613826.0,
|
|
"step": 470
|
|
},
|
|
{
|
|
"epoch": 1.5531914893617023,
|
|
"grad_norm": 57.52972493680479,
|
|
"learning_rate": 3.0980392156862746e-09,
|
|
"loss": 2.7465,
|
|
"mean_token_accuracy": 0.5942055583000183,
|
|
"num_tokens": 29929096.0,
|
|
"step": 475
|
|
},
|
|
{
|
|
"epoch": 1.5695581014729951,
|
|
"grad_norm": 58.84850414001869,
|
|
"learning_rate": 3.130718954248366e-09,
|
|
"loss": 2.709,
|
|
"mean_token_accuracy": 0.5981499969959259,
|
|
"num_tokens": 30243368.0,
|
|
"step": 480
|
|
},
|
|
{
|
|
"epoch": 1.585924713584288,
|
|
"grad_norm": 57.50064518392254,
|
|
"learning_rate": 3.1633986928104576e-09,
|
|
"loss": 2.7609,
|
|
"mean_token_accuracy": 0.5903271436691284,
|
|
"num_tokens": 30557343.0,
|
|
"step": 485
|
|
},
|
|
{
|
|
"epoch": 1.6022913256955809,
|
|
"grad_norm": 55.5149653191555,
|
|
"learning_rate": 3.196078431372549e-09,
|
|
"loss": 2.7082,
|
|
"mean_token_accuracy": 0.5975622475147248,
|
|
"num_tokens": 30873056.0,
|
|
"step": 490
|
|
},
|
|
{
|
|
"epoch": 1.618657937806874,
|
|
"grad_norm": 55.87400536065197,
|
|
"learning_rate": 3.2287581699346406e-09,
|
|
"loss": 2.738,
|
|
"mean_token_accuracy": 0.5949967682361603,
|
|
"num_tokens": 31189263.0,
|
|
"step": 495
|
|
},
|
|
{
|
|
"epoch": 1.635024549918167,
|
|
"grad_norm": 57.28584477551273,
|
|
"learning_rate": 3.2614379084967323e-09,
|
|
"loss": 2.7052,
|
|
"mean_token_accuracy": 0.5990139007568359,
|
|
"num_tokens": 31504291.0,
|
|
"step": 500
|
|
},
|
|
{
|
|
"epoch": 1.65139116202946,
|
|
"grad_norm": 54.83291742448827,
|
|
"learning_rate": 3.2941176470588235e-09,
|
|
"loss": 2.6666,
|
|
"mean_token_accuracy": 0.6031867802143097,
|
|
"num_tokens": 31821042.0,
|
|
"step": 505
|
|
},
|
|
{
|
|
"epoch": 1.6677577741407528,
|
|
"grad_norm": 58.40949288203933,
|
|
"learning_rate": 3.326797385620915e-09,
|
|
"loss": 2.7014,
|
|
"mean_token_accuracy": 0.5982214510440826,
|
|
"num_tokens": 32136842.0,
|
|
"step": 510
|
|
},
|
|
{
|
|
"epoch": 1.6841243862520459,
|
|
"grad_norm": 55.77363797416089,
|
|
"learning_rate": 3.359477124183007e-09,
|
|
"loss": 2.7135,
|
|
"mean_token_accuracy": 0.5962281107902527,
|
|
"num_tokens": 32453472.0,
|
|
"step": 515
|
|
},
|
|
{
|
|
"epoch": 1.700490998363339,
|
|
"grad_norm": 54.68653787956224,
|
|
"learning_rate": 3.392156862745098e-09,
|
|
"loss": 2.6678,
|
|
"mean_token_accuracy": 0.6035849511623382,
|
|
"num_tokens": 32768088.0,
|
|
"step": 520
|
|
},
|
|
{
|
|
"epoch": 1.7168576104746318,
|
|
"grad_norm": 54.62194321614914,
|
|
"learning_rate": 3.4248366013071895e-09,
|
|
"loss": 2.7084,
|
|
"mean_token_accuracy": 0.5977645874023437,
|
|
"num_tokens": 33082497.0,
|
|
"step": 525
|
|
},
|
|
{
|
|
"epoch": 1.7332242225859247,
|
|
"grad_norm": 53.093565639192974,
|
|
"learning_rate": 3.457516339869281e-09,
|
|
"loss": 2.7051,
|
|
"mean_token_accuracy": 0.5985934197902679,
|
|
"num_tokens": 33398550.0,
|
|
"step": 530
|
|
},
|
|
{
|
|
"epoch": 1.7495908346972175,
|
|
"grad_norm": 56.59068400803138,
|
|
"learning_rate": 3.4901960784313724e-09,
|
|
"loss": 2.724,
|
|
"mean_token_accuracy": 0.5946956694126129,
|
|
"num_tokens": 33714169.0,
|
|
"step": 535
|
|
},
|
|
{
|
|
"epoch": 1.7659574468085106,
|
|
"grad_norm": 56.89444327874508,
|
|
"learning_rate": 3.5228758169934645e-09,
|
|
"loss": 2.6444,
|
|
"mean_token_accuracy": 0.6075450599193573,
|
|
"num_tokens": 34030238.0,
|
|
"step": 540
|
|
},
|
|
{
|
|
"epoch": 1.7823240589198037,
|
|
"grad_norm": 56.40460626976272,
|
|
"learning_rate": 3.555555555555556e-09,
|
|
"loss": 2.7523,
|
|
"mean_token_accuracy": 0.5912691414356231,
|
|
"num_tokens": 34344423.0,
|
|
"step": 545
|
|
},
|
|
{
|
|
"epoch": 1.7986906710310966,
|
|
"grad_norm": 54.590169589414685,
|
|
"learning_rate": 3.588235294117647e-09,
|
|
"loss": 2.6558,
|
|
"mean_token_accuracy": 0.6027090668678283,
|
|
"num_tokens": 34661768.0,
|
|
"step": 550
|
|
},
|
|
{
|
|
"epoch": 1.8150572831423895,
|
|
"grad_norm": 54.738525111336244,
|
|
"learning_rate": 3.6209150326797384e-09,
|
|
"loss": 2.674,
|
|
"mean_token_accuracy": 0.5994877934455871,
|
|
"num_tokens": 34978338.0,
|
|
"step": 555
|
|
},
|
|
{
|
|
"epoch": 1.8314238952536823,
|
|
"grad_norm": 54.44736090622008,
|
|
"learning_rate": 3.65359477124183e-09,
|
|
"loss": 2.7485,
|
|
"mean_token_accuracy": 0.5917412161827087,
|
|
"num_tokens": 35294554.0,
|
|
"step": 560
|
|
},
|
|
{
|
|
"epoch": 1.8477905073649754,
|
|
"grad_norm": 56.99256411648127,
|
|
"learning_rate": 3.6862745098039218e-09,
|
|
"loss": 2.7338,
|
|
"mean_token_accuracy": 0.5929640531539917,
|
|
"num_tokens": 35611110.0,
|
|
"step": 565
|
|
},
|
|
{
|
|
"epoch": 1.8641571194762685,
|
|
"grad_norm": 54.79344913325811,
|
|
"learning_rate": 3.7189542483660134e-09,
|
|
"loss": 2.7106,
|
|
"mean_token_accuracy": 0.5953186571598053,
|
|
"num_tokens": 35926411.0,
|
|
"step": 570
|
|
},
|
|
{
|
|
"epoch": 1.8805237315875614,
|
|
"grad_norm": 52.797498352731125,
|
|
"learning_rate": 3.751633986928105e-09,
|
|
"loss": 2.7266,
|
|
"mean_token_accuracy": 0.5939077198505401,
|
|
"num_tokens": 36244250.0,
|
|
"step": 575
|
|
},
|
|
{
|
|
"epoch": 1.8968903436988542,
|
|
"grad_norm": 57.216007808474345,
|
|
"learning_rate": 3.784313725490196e-09,
|
|
"loss": 2.6608,
|
|
"mean_token_accuracy": 0.6030964910984039,
|
|
"num_tokens": 36561165.0,
|
|
"step": 580
|
|
},
|
|
{
|
|
"epoch": 1.9132569558101473,
|
|
"grad_norm": 53.97785850665361,
|
|
"learning_rate": 3.816993464052287e-09,
|
|
"loss": 2.6583,
|
|
"mean_token_accuracy": 0.6021470665931702,
|
|
"num_tokens": 36877458.0,
|
|
"step": 585
|
|
},
|
|
{
|
|
"epoch": 1.9296235679214404,
|
|
"grad_norm": 54.143082330381475,
|
|
"learning_rate": 3.849673202614379e-09,
|
|
"loss": 2.7001,
|
|
"mean_token_accuracy": 0.5981690645217895,
|
|
"num_tokens": 37193872.0,
|
|
"step": 590
|
|
},
|
|
{
|
|
"epoch": 1.9459901800327333,
|
|
"grad_norm": 56.092454263999294,
|
|
"learning_rate": 3.882352941176471e-09,
|
|
"loss": 2.6859,
|
|
"mean_token_accuracy": 0.5982384502887725,
|
|
"num_tokens": 37508485.0,
|
|
"step": 595
|
|
},
|
|
{
|
|
"epoch": 1.9623567921440261,
|
|
"grad_norm": 53.68828562669948,
|
|
"learning_rate": 3.915032679738562e-09,
|
|
"loss": 2.701,
|
|
"mean_token_accuracy": 0.5964562237262726,
|
|
"num_tokens": 37823707.0,
|
|
"step": 600
|
|
},
|
|
{
|
|
"epoch": 1.978723404255319,
|
|
"grad_norm": 56.201055702078115,
|
|
"learning_rate": 3.947712418300653e-09,
|
|
"loss": 2.7612,
|
|
"mean_token_accuracy": 0.5894128203392028,
|
|
"num_tokens": 38138418.0,
|
|
"step": 605
|
|
},
|
|
{
|
|
"epoch": 1.995090016366612,
|
|
"grad_norm": 54.62282636390223,
|
|
"learning_rate": 3.980392156862745e-09,
|
|
"loss": 2.7223,
|
|
"mean_token_accuracy": 0.5942505180835724,
|
|
"num_tokens": 38454032.0,
|
|
"step": 610
|
|
},
|
|
{
|
|
"epoch": 2.009819967266776,
|
|
"grad_norm": 56.10486030635885,
|
|
"learning_rate": 4.013071895424837e-09,
|
|
"loss": 2.7005,
|
|
"mean_token_accuracy": 0.5975761082437303,
|
|
"num_tokens": 38713730.0,
|
|
"step": 615
|
|
},
|
|
{
|
|
"epoch": 2.0261865793780687,
|
|
"grad_norm": 52.7688119052421,
|
|
"learning_rate": 4.045751633986928e-09,
|
|
"loss": 2.6856,
|
|
"mean_token_accuracy": 0.5990508139133454,
|
|
"num_tokens": 39030761.0,
|
|
"step": 620
|
|
},
|
|
{
|
|
"epoch": 2.0425531914893615,
|
|
"grad_norm": 55.17135344620159,
|
|
"learning_rate": 4.07843137254902e-09,
|
|
"loss": 2.7085,
|
|
"mean_token_accuracy": 0.5959590137004852,
|
|
"num_tokens": 39347637.0,
|
|
"step": 625
|
|
},
|
|
{
|
|
"epoch": 2.058919803600655,
|
|
"grad_norm": 57.995842457694756,
|
|
"learning_rate": 4.111111111111111e-09,
|
|
"loss": 2.7192,
|
|
"mean_token_accuracy": 0.594757741689682,
|
|
"num_tokens": 39661859.0,
|
|
"step": 630
|
|
},
|
|
{
|
|
"epoch": 2.0752864157119477,
|
|
"grad_norm": 54.38975526380071,
|
|
"learning_rate": 4.143790849673203e-09,
|
|
"loss": 2.709,
|
|
"mean_token_accuracy": 0.5958807587623596,
|
|
"num_tokens": 39978360.0,
|
|
"step": 635
|
|
},
|
|
{
|
|
"epoch": 2.0916530278232406,
|
|
"grad_norm": 54.68354125601648,
|
|
"learning_rate": 4.176470588235295e-09,
|
|
"loss": 2.7235,
|
|
"mean_token_accuracy": 0.594566798210144,
|
|
"num_tokens": 40294927.0,
|
|
"step": 640
|
|
},
|
|
{
|
|
"epoch": 2.1080196399345335,
|
|
"grad_norm": 52.99792679525105,
|
|
"learning_rate": 4.209150326797386e-09,
|
|
"loss": 2.6955,
|
|
"mean_token_accuracy": 0.5974120557308197,
|
|
"num_tokens": 40610315.0,
|
|
"step": 645
|
|
},
|
|
{
|
|
"epoch": 2.1243862520458263,
|
|
"grad_norm": 55.908240404817846,
|
|
"learning_rate": 4.241830065359477e-09,
|
|
"loss": 2.7015,
|
|
"mean_token_accuracy": 0.5971021175384521,
|
|
"num_tokens": 40925803.0,
|
|
"step": 650
|
|
},
|
|
{
|
|
"epoch": 2.1407528641571196,
|
|
"grad_norm": 54.811323789860005,
|
|
"learning_rate": 4.2745098039215685e-09,
|
|
"loss": 2.6697,
|
|
"mean_token_accuracy": 0.6004336535930633,
|
|
"num_tokens": 41241287.0,
|
|
"step": 655
|
|
},
|
|
{
|
|
"epoch": 2.1571194762684125,
|
|
"grad_norm": 54.65515873667756,
|
|
"learning_rate": 4.3071895424836606e-09,
|
|
"loss": 2.7401,
|
|
"mean_token_accuracy": 0.5916546046733856,
|
|
"num_tokens": 41556298.0,
|
|
"step": 660
|
|
},
|
|
{
|
|
"epoch": 2.1734860883797054,
|
|
"grad_norm": 54.878154658789114,
|
|
"learning_rate": 4.339869281045752e-09,
|
|
"loss": 2.734,
|
|
"mean_token_accuracy": 0.5918886005878449,
|
|
"num_tokens": 41871045.0,
|
|
"step": 665
|
|
},
|
|
{
|
|
"epoch": 2.1898527004909982,
|
|
"grad_norm": 52.7218116891618,
|
|
"learning_rate": 4.372549019607843e-09,
|
|
"loss": 2.6912,
|
|
"mean_token_accuracy": 0.5978993952274323,
|
|
"num_tokens": 42186308.0,
|
|
"step": 670
|
|
},
|
|
{
|
|
"epoch": 2.2062193126022915,
|
|
"grad_norm": 54.32965865225877,
|
|
"learning_rate": 4.405228758169934e-09,
|
|
"loss": 2.6798,
|
|
"mean_token_accuracy": 0.5993401885032654,
|
|
"num_tokens": 42501889.0,
|
|
"step": 675
|
|
},
|
|
{
|
|
"epoch": 2.2225859247135844,
|
|
"grad_norm": 53.89889503525825,
|
|
"learning_rate": 4.437908496732026e-09,
|
|
"loss": 2.6935,
|
|
"mean_token_accuracy": 0.596401983499527,
|
|
"num_tokens": 42817655.0,
|
|
"step": 680
|
|
},
|
|
{
|
|
"epoch": 2.2389525368248773,
|
|
"grad_norm": 58.23012419600424,
|
|
"learning_rate": 4.470588235294118e-09,
|
|
"loss": 2.7287,
|
|
"mean_token_accuracy": 0.5919546246528625,
|
|
"num_tokens": 43133962.0,
|
|
"step": 685
|
|
},
|
|
{
|
|
"epoch": 2.25531914893617,
|
|
"grad_norm": 58.184484453333866,
|
|
"learning_rate": 4.503267973856209e-09,
|
|
"loss": 2.7211,
|
|
"mean_token_accuracy": 0.59313685297966,
|
|
"num_tokens": 43448995.0,
|
|
"step": 690
|
|
},
|
|
{
|
|
"epoch": 2.271685761047463,
|
|
"grad_norm": 54.85524353717034,
|
|
"learning_rate": 4.5359477124183e-09,
|
|
"loss": 2.7183,
|
|
"mean_token_accuracy": 0.59401575922966,
|
|
"num_tokens": 43763497.0,
|
|
"step": 695
|
|
},
|
|
{
|
|
"epoch": 2.2880523731587563,
|
|
"grad_norm": 54.61408181909944,
|
|
"learning_rate": 4.5686274509803924e-09,
|
|
"loss": 2.673,
|
|
"mean_token_accuracy": 0.6004755556583404,
|
|
"num_tokens": 44078904.0,
|
|
"step": 700
|
|
},
|
|
{
|
|
"epoch": 2.304418985270049,
|
|
"grad_norm": 53.910655475887154,
|
|
"learning_rate": 4.601307189542484e-09,
|
|
"loss": 2.687,
|
|
"mean_token_accuracy": 0.5986486673355103,
|
|
"num_tokens": 44393042.0,
|
|
"step": 705
|
|
},
|
|
{
|
|
"epoch": 2.320785597381342,
|
|
"grad_norm": 55.10138236422902,
|
|
"learning_rate": 4.633986928104576e-09,
|
|
"loss": 2.7297,
|
|
"mean_token_accuracy": 0.5921121656894683,
|
|
"num_tokens": 44707827.0,
|
|
"step": 710
|
|
},
|
|
{
|
|
"epoch": 2.337152209492635,
|
|
"grad_norm": 54.73885538831377,
|
|
"learning_rate": 4.666666666666667e-09,
|
|
"loss": 2.6563,
|
|
"mean_token_accuracy": 0.6028881072998047,
|
|
"num_tokens": 45023884.0,
|
|
"step": 715
|
|
},
|
|
{
|
|
"epoch": 2.3535188216039282,
|
|
"grad_norm": 53.44933175127846,
|
|
"learning_rate": 4.699346405228758e-09,
|
|
"loss": 2.7157,
|
|
"mean_token_accuracy": 0.5943553507328033,
|
|
"num_tokens": 45338498.0,
|
|
"step": 720
|
|
},
|
|
{
|
|
"epoch": 2.369885433715221,
|
|
"grad_norm": 55.740342198653835,
|
|
"learning_rate": 4.73202614379085e-09,
|
|
"loss": 2.711,
|
|
"mean_token_accuracy": 0.5951617062091827,
|
|
"num_tokens": 45653999.0,
|
|
"step": 725
|
|
},
|
|
{
|
|
"epoch": 2.386252045826514,
|
|
"grad_norm": 54.62824485548636,
|
|
"learning_rate": 4.764705882352941e-09,
|
|
"loss": 2.7073,
|
|
"mean_token_accuracy": 0.5961497783660888,
|
|
"num_tokens": 45970321.0,
|
|
"step": 730
|
|
},
|
|
{
|
|
"epoch": 2.402618657937807,
|
|
"grad_norm": 55.40457150277988,
|
|
"learning_rate": 4.797385620915033e-09,
|
|
"loss": 2.659,
|
|
"mean_token_accuracy": 0.6022453665733337,
|
|
"num_tokens": 46286216.0,
|
|
"step": 735
|
|
},
|
|
{
|
|
"epoch": 2.4189852700490997,
|
|
"grad_norm": 55.97649140796812,
|
|
"learning_rate": 4.830065359477124e-09,
|
|
"loss": 2.6926,
|
|
"mean_token_accuracy": 0.5985015273094177,
|
|
"num_tokens": 46601739.0,
|
|
"step": 740
|
|
},
|
|
{
|
|
"epoch": 2.4353518821603926,
|
|
"grad_norm": 52.96577140428808,
|
|
"learning_rate": 4.8627450980392156e-09,
|
|
"loss": 2.6167,
|
|
"mean_token_accuracy": 0.6079153478145599,
|
|
"num_tokens": 46917261.0,
|
|
"step": 745
|
|
},
|
|
{
|
|
"epoch": 2.451718494271686,
|
|
"grad_norm": 54.239981137658475,
|
|
"learning_rate": 4.895424836601307e-09,
|
|
"loss": 2.7005,
|
|
"mean_token_accuracy": 0.5955610156059266,
|
|
"num_tokens": 47232983.0,
|
|
"step": 750
|
|
},
|
|
{
|
|
"epoch": 2.4680851063829787,
|
|
"grad_norm": 55.64879350500573,
|
|
"learning_rate": 4.928104575163399e-09,
|
|
"loss": 2.6989,
|
|
"mean_token_accuracy": 0.5967093467712402,
|
|
"num_tokens": 47548001.0,
|
|
"step": 755
|
|
},
|
|
{
|
|
"epoch": 2.4844517184942716,
|
|
"grad_norm": 55.46875075671352,
|
|
"learning_rate": 4.96078431372549e-09,
|
|
"loss": 2.7075,
|
|
"mean_token_accuracy": 0.5952642858028412,
|
|
"num_tokens": 47864499.0,
|
|
"step": 760
|
|
},
|
|
{
|
|
"epoch": 2.500818330605565,
|
|
"grad_norm": 56.02832550186899,
|
|
"learning_rate": 4.9934640522875815e-09,
|
|
"loss": 2.6982,
|
|
"mean_token_accuracy": 0.5948630213737488,
|
|
"num_tokens": 48181788.0,
|
|
"step": 765
|
|
},
|
|
{
|
|
"epoch": 2.5171849427168578,
|
|
"grad_norm": 51.02977447624827,
|
|
"learning_rate": 4.999995835894431e-09,
|
|
"loss": 2.6569,
|
|
"mean_token_accuracy": 0.5976875245571136,
|
|
"num_tokens": 48497291.0,
|
|
"step": 770
|
|
},
|
|
{
|
|
"epoch": 2.5335515548281506,
|
|
"grad_norm": 52.132741336495044,
|
|
"learning_rate": 4.999978919239329e-09,
|
|
"loss": 2.6595,
|
|
"mean_token_accuracy": 0.5953408718109131,
|
|
"num_tokens": 48813596.0,
|
|
"step": 775
|
|
},
|
|
{
|
|
"epoch": 2.5499181669394435,
|
|
"grad_norm": 52.59810785644208,
|
|
"learning_rate": 4.999948989866086e-09,
|
|
"loss": 2.6407,
|
|
"mean_token_accuracy": 0.5970385134220123,
|
|
"num_tokens": 49128618.0,
|
|
"step": 780
|
|
},
|
|
{
|
|
"epoch": 2.5662847790507364,
|
|
"grad_norm": 52.64943844109409,
|
|
"learning_rate": 4.999906047930483e-09,
|
|
"loss": 2.6657,
|
|
"mean_token_accuracy": 0.5940295934677124,
|
|
"num_tokens": 49444350.0,
|
|
"step": 785
|
|
},
|
|
{
|
|
"epoch": 2.5826513911620292,
|
|
"grad_norm": 51.58494049727213,
|
|
"learning_rate": 4.999850093656042e-09,
|
|
"loss": 2.6456,
|
|
"mean_token_accuracy": 0.5964194118976593,
|
|
"num_tokens": 49760874.0,
|
|
"step": 790
|
|
},
|
|
{
|
|
"epoch": 2.5990180032733226,
|
|
"grad_norm": 50.4229415460418,
|
|
"learning_rate": 4.999781127334011e-09,
|
|
"loss": 2.6463,
|
|
"mean_token_accuracy": 0.5977925717830658,
|
|
"num_tokens": 50077471.0,
|
|
"step": 795
|
|
},
|
|
{
|
|
"epoch": 2.6153846153846154,
|
|
"grad_norm": 51.61467397516237,
|
|
"learning_rate": 4.999699149323369e-09,
|
|
"loss": 2.6322,
|
|
"mean_token_accuracy": 0.5987462520599365,
|
|
"num_tokens": 50394292.0,
|
|
"step": 800
|
|
},
|
|
{
|
|
"epoch": 2.6317512274959083,
|
|
"grad_norm": 51.35425116447994,
|
|
"learning_rate": 4.9996041600508215e-09,
|
|
"loss": 2.6848,
|
|
"mean_token_accuracy": 0.5933543801307678,
|
|
"num_tokens": 50708508.0,
|
|
"step": 805
|
|
},
|
|
{
|
|
"epoch": 2.648117839607201,
|
|
"grad_norm": 54.23489070568219,
|
|
"learning_rate": 4.999496160010801e-09,
|
|
"loss": 2.7238,
|
|
"mean_token_accuracy": 0.5853737473487854,
|
|
"num_tokens": 51024224.0,
|
|
"step": 810
|
|
},
|
|
{
|
|
"epoch": 2.6644844517184945,
|
|
"grad_norm": 49.584559362850236,
|
|
"learning_rate": 4.999375149765462e-09,
|
|
"loss": 2.6308,
|
|
"mean_token_accuracy": 0.600192254781723,
|
|
"num_tokens": 51339876.0,
|
|
"step": 815
|
|
},
|
|
{
|
|
"epoch": 2.6808510638297873,
|
|
"grad_norm": 52.75483476576607,
|
|
"learning_rate": 4.999241129944679e-09,
|
|
"loss": 2.644,
|
|
"mean_token_accuracy": 0.5973427474498749,
|
|
"num_tokens": 51653756.0,
|
|
"step": 820
|
|
},
|
|
{
|
|
"epoch": 2.69721767594108,
|
|
"grad_norm": 49.18710368234475,
|
|
"learning_rate": 4.9990941012460426e-09,
|
|
"loss": 2.6092,
|
|
"mean_token_accuracy": 0.6024131655693055,
|
|
"num_tokens": 51970308.0,
|
|
"step": 825
|
|
},
|
|
{
|
|
"epoch": 2.713584288052373,
|
|
"grad_norm": 53.787832600682705,
|
|
"learning_rate": 4.9989340644348545e-09,
|
|
"loss": 2.6728,
|
|
"mean_token_accuracy": 0.5934543550014496,
|
|
"num_tokens": 52287190.0,
|
|
"step": 830
|
|
},
|
|
{
|
|
"epoch": 2.729950900163666,
|
|
"grad_norm": 51.579113331419386,
|
|
"learning_rate": 4.998761020344129e-09,
|
|
"loss": 2.6233,
|
|
"mean_token_accuracy": 0.6000344455242157,
|
|
"num_tokens": 52604275.0,
|
|
"step": 835
|
|
},
|
|
{
|
|
"epoch": 2.7463175122749592,
|
|
"grad_norm": 49.84173315383667,
|
|
"learning_rate": 4.998574969874584e-09,
|
|
"loss": 2.6406,
|
|
"mean_token_accuracy": 0.5963773608207703,
|
|
"num_tokens": 52919052.0,
|
|
"step": 840
|
|
},
|
|
{
|
|
"epoch": 2.762684124386252,
|
|
"grad_norm": 50.39862997599813,
|
|
"learning_rate": 4.998375913994635e-09,
|
|
"loss": 2.6845,
|
|
"mean_token_accuracy": 0.5930020451545716,
|
|
"num_tokens": 53234461.0,
|
|
"step": 845
|
|
},
|
|
{
|
|
"epoch": 2.779050736497545,
|
|
"grad_norm": 50.560791912654636,
|
|
"learning_rate": 4.998163853740395e-09,
|
|
"loss": 2.6236,
|
|
"mean_token_accuracy": 0.6000065624713897,
|
|
"num_tokens": 53550429.0,
|
|
"step": 850
|
|
},
|
|
{
|
|
"epoch": 2.795417348608838,
|
|
"grad_norm": 49.666468120698156,
|
|
"learning_rate": 4.997938790215665e-09,
|
|
"loss": 2.6166,
|
|
"mean_token_accuracy": 0.5981848835945129,
|
|
"num_tokens": 53867271.0,
|
|
"step": 855
|
|
},
|
|
{
|
|
"epoch": 2.811783960720131,
|
|
"grad_norm": 51.23962535977192,
|
|
"learning_rate": 4.997700724591931e-09,
|
|
"loss": 2.6119,
|
|
"mean_token_accuracy": 0.6018172323703765,
|
|
"num_tokens": 54183461.0,
|
|
"step": 860
|
|
},
|
|
{
|
|
"epoch": 2.828150572831424,
|
|
"grad_norm": 48.83583292298752,
|
|
"learning_rate": 4.997449658108354e-09,
|
|
"loss": 2.6079,
|
|
"mean_token_accuracy": 0.6019073128700256,
|
|
"num_tokens": 54498182.0,
|
|
"step": 865
|
|
},
|
|
{
|
|
"epoch": 2.844517184942717,
|
|
"grad_norm": 53.57936650160131,
|
|
"learning_rate": 4.997185592071769e-09,
|
|
"loss": 2.6676,
|
|
"mean_token_accuracy": 0.5932618260383606,
|
|
"num_tokens": 54813016.0,
|
|
"step": 870
|
|
},
|
|
{
|
|
"epoch": 2.8608837970540097,
|
|
"grad_norm": 48.43631537595537,
|
|
"learning_rate": 4.996908527856674e-09,
|
|
"loss": 2.642,
|
|
"mean_token_accuracy": 0.5957655310630798,
|
|
"num_tokens": 55128483.0,
|
|
"step": 875
|
|
},
|
|
{
|
|
"epoch": 2.8772504091653026,
|
|
"grad_norm": 51.59484809137071,
|
|
"learning_rate": 4.996618466905226e-09,
|
|
"loss": 2.6599,
|
|
"mean_token_accuracy": 0.5930556237697602,
|
|
"num_tokens": 55444969.0,
|
|
"step": 880
|
|
},
|
|
{
|
|
"epoch": 2.8936170212765955,
|
|
"grad_norm": 49.41078450422501,
|
|
"learning_rate": 4.996315410727229e-09,
|
|
"loss": 2.6121,
|
|
"mean_token_accuracy": 0.6001948356628418,
|
|
"num_tokens": 55761639.0,
|
|
"step": 885
|
|
},
|
|
{
|
|
"epoch": 2.909983633387889,
|
|
"grad_norm": 52.77375569773051,
|
|
"learning_rate": 4.995999360900131e-09,
|
|
"loss": 2.695,
|
|
"mean_token_accuracy": 0.5912358403205872,
|
|
"num_tokens": 56077525.0,
|
|
"step": 890
|
|
},
|
|
{
|
|
"epoch": 2.9263502454991817,
|
|
"grad_norm": 51.62554625125403,
|
|
"learning_rate": 4.995670319069011e-09,
|
|
"loss": 2.6763,
|
|
"mean_token_accuracy": 0.5933657169342041,
|
|
"num_tokens": 56394235.0,
|
|
"step": 895
|
|
},
|
|
{
|
|
"epoch": 2.9427168576104745,
|
|
"grad_norm": 50.955150373337716,
|
|
"learning_rate": 4.995328286946577e-09,
|
|
"loss": 2.66,
|
|
"mean_token_accuracy": 0.5938617050647735,
|
|
"num_tokens": 56709470.0,
|
|
"step": 900
|
|
},
|
|
{
|
|
"epoch": 2.959083469721768,
|
|
"grad_norm": 51.61162113532987,
|
|
"learning_rate": 4.994973266313149e-09,
|
|
"loss": 2.6376,
|
|
"mean_token_accuracy": 0.5965611219406128,
|
|
"num_tokens": 57024644.0,
|
|
"step": 905
|
|
},
|
|
{
|
|
"epoch": 2.9754500818330607,
|
|
"grad_norm": 50.4765635733869,
|
|
"learning_rate": 4.9946052590166576e-09,
|
|
"loss": 2.6603,
|
|
"mean_token_accuracy": 0.5933335602283478,
|
|
"num_tokens": 57339368.0,
|
|
"step": 910
|
|
},
|
|
{
|
|
"epoch": 2.9918166939443536,
|
|
"grad_norm": 49.80334090841871,
|
|
"learning_rate": 4.994224266972629e-09,
|
|
"loss": 2.6482,
|
|
"mean_token_accuracy": 0.5958111524581909,
|
|
"num_tokens": 57654849.0,
|
|
"step": 915
|
|
},
|
|
{
|
|
"epoch": 3.006546644844517,
|
|
"grad_norm": 51.06592869430665,
|
|
"learning_rate": 4.993830292164176e-09,
|
|
"loss": 2.6796,
|
|
"mean_token_accuracy": 0.592134588294559,
|
|
"num_tokens": 57915824.0,
|
|
"step": 920
|
|
},
|
|
{
|
|
"epoch": 3.0229132569558104,
|
|
"grad_norm": 49.96950667591423,
|
|
"learning_rate": 4.99342333664199e-09,
|
|
"loss": 2.6534,
|
|
"mean_token_accuracy": 0.594581139087677,
|
|
"num_tokens": 58231596.0,
|
|
"step": 925
|
|
},
|
|
{
|
|
"epoch": 3.0392798690671032,
|
|
"grad_norm": 51.99343130345125,
|
|
"learning_rate": 4.993003402524327e-09,
|
|
"loss": 2.6668,
|
|
"mean_token_accuracy": 0.5940016269683838,
|
|
"num_tokens": 58548804.0,
|
|
"step": 930
|
|
},
|
|
{
|
|
"epoch": 3.055646481178396,
|
|
"grad_norm": 51.79899324274999,
|
|
"learning_rate": 4.992570491996999e-09,
|
|
"loss": 2.6066,
|
|
"mean_token_accuracy": 0.6022139132022858,
|
|
"num_tokens": 58864353.0,
|
|
"step": 935
|
|
},
|
|
{
|
|
"epoch": 3.072013093289689,
|
|
"grad_norm": 49.269387362251386,
|
|
"learning_rate": 4.992124607313363e-09,
|
|
"loss": 2.6598,
|
|
"mean_token_accuracy": 0.5946215331554413,
|
|
"num_tokens": 59179864.0,
|
|
"step": 940
|
|
},
|
|
{
|
|
"epoch": 3.088379705400982,
|
|
"grad_norm": 49.892803527802585,
|
|
"learning_rate": 4.991665750794306e-09,
|
|
"loss": 2.6403,
|
|
"mean_token_accuracy": 0.5959248483181,
|
|
"num_tokens": 59494088.0,
|
|
"step": 945
|
|
},
|
|
{
|
|
"epoch": 3.104746317512275,
|
|
"grad_norm": 49.68211690452728,
|
|
"learning_rate": 4.991193924828238e-09,
|
|
"loss": 2.6347,
|
|
"mean_token_accuracy": 0.5975275099277496,
|
|
"num_tokens": 59809665.0,
|
|
"step": 950
|
|
},
|
|
{
|
|
"epoch": 3.121112929623568,
|
|
"grad_norm": 50.56798569601125,
|
|
"learning_rate": 4.990709131871074e-09,
|
|
"loss": 2.6194,
|
|
"mean_token_accuracy": 0.5987059652805329,
|
|
"num_tokens": 60124351.0,
|
|
"step": 955
|
|
},
|
|
{
|
|
"epoch": 3.137479541734861,
|
|
"grad_norm": 52.1522983883937,
|
|
"learning_rate": 4.990211374446225e-09,
|
|
"loss": 2.6575,
|
|
"mean_token_accuracy": 0.5933027803897858,
|
|
"num_tokens": 60440561.0,
|
|
"step": 960
|
|
},
|
|
{
|
|
"epoch": 3.1538461538461537,
|
|
"grad_norm": 48.35836366419251,
|
|
"learning_rate": 4.989700655144584e-09,
|
|
"loss": 2.6014,
|
|
"mean_token_accuracy": 0.600798100233078,
|
|
"num_tokens": 60757057.0,
|
|
"step": 965
|
|
},
|
|
{
|
|
"epoch": 3.1702127659574466,
|
|
"grad_norm": 49.44218785880177,
|
|
"learning_rate": 4.989176976624511e-09,
|
|
"loss": 2.6277,
|
|
"mean_token_accuracy": 0.5984482586383819,
|
|
"num_tokens": 61072035.0,
|
|
"step": 970
|
|
},
|
|
{
|
|
"epoch": 3.18657937806874,
|
|
"grad_norm": 51.01216759285889,
|
|
"learning_rate": 4.988640341611823e-09,
|
|
"loss": 2.6489,
|
|
"mean_token_accuracy": 0.5925885379314423,
|
|
"num_tokens": 61387223.0,
|
|
"step": 975
|
|
},
|
|
{
|
|
"epoch": 3.202945990180033,
|
|
"grad_norm": 49.07223807349433,
|
|
"learning_rate": 4.988090752899774e-09,
|
|
"loss": 2.6261,
|
|
"mean_token_accuracy": 0.596798449754715,
|
|
"num_tokens": 61702198.0,
|
|
"step": 980
|
|
},
|
|
{
|
|
"epoch": 3.2193126022913257,
|
|
"grad_norm": 51.019708804907054,
|
|
"learning_rate": 4.987528213349046e-09,
|
|
"loss": 2.6434,
|
|
"mean_token_accuracy": 0.5930721282958984,
|
|
"num_tokens": 62016279.0,
|
|
"step": 985
|
|
},
|
|
{
|
|
"epoch": 3.2356792144026185,
|
|
"grad_norm": 50.396042389119174,
|
|
"learning_rate": 4.986952725887732e-09,
|
|
"loss": 2.6436,
|
|
"mean_token_accuracy": 0.5951732397079468,
|
|
"num_tokens": 62332853.0,
|
|
"step": 990
|
|
},
|
|
{
|
|
"epoch": 3.2520458265139114,
|
|
"grad_norm": 51.4770414159445,
|
|
"learning_rate": 4.9863642935113184e-09,
|
|
"loss": 2.6492,
|
|
"mean_token_accuracy": 0.5950457334518433,
|
|
"num_tokens": 62649395.0,
|
|
"step": 995
|
|
},
|
|
{
|
|
"epoch": 3.2684124386252047,
|
|
"grad_norm": 49.22644140858429,
|
|
"learning_rate": 4.985762919282674e-09,
|
|
"loss": 2.62,
|
|
"mean_token_accuracy": 0.5985433161258698,
|
|
"num_tokens": 62963623.0,
|
|
"step": 1000
|
|
},
|
|
{
|
|
"epoch": 3.2847790507364976,
|
|
"grad_norm": 52.7613105615504,
|
|
"learning_rate": 4.9851486063320286e-09,
|
|
"loss": 2.6547,
|
|
"mean_token_accuracy": 0.5932449102401733,
|
|
"num_tokens": 63279133.0,
|
|
"step": 1005
|
|
},
|
|
{
|
|
"epoch": 3.3011456628477904,
|
|
"grad_norm": 50.00996107413749,
|
|
"learning_rate": 4.9845213578569636e-09,
|
|
"loss": 2.596,
|
|
"mean_token_accuracy": 0.601094377040863,
|
|
"num_tokens": 63594496.0,
|
|
"step": 1010
|
|
},
|
|
{
|
|
"epoch": 3.3175122749590833,
|
|
"grad_norm": 51.50632807901888,
|
|
"learning_rate": 4.983881177122389e-09,
|
|
"loss": 2.658,
|
|
"mean_token_accuracy": 0.5926860809326172,
|
|
"num_tokens": 63909565.0,
|
|
"step": 1015
|
|
},
|
|
{
|
|
"epoch": 3.3338788870703766,
|
|
"grad_norm": 48.43933199467551,
|
|
"learning_rate": 4.98322806746053e-09,
|
|
"loss": 2.6263,
|
|
"mean_token_accuracy": 0.5961672484874725,
|
|
"num_tokens": 64225776.0,
|
|
"step": 1020
|
|
},
|
|
{
|
|
"epoch": 3.3502454991816695,
|
|
"grad_norm": 49.670231681135135,
|
|
"learning_rate": 4.982562032270907e-09,
|
|
"loss": 2.6224,
|
|
"mean_token_accuracy": 0.5976954877376557,
|
|
"num_tokens": 64542549.0,
|
|
"step": 1025
|
|
},
|
|
{
|
|
"epoch": 3.3666121112929623,
|
|
"grad_norm": 51.24543012467001,
|
|
"learning_rate": 4.981883075020321e-09,
|
|
"loss": 2.6385,
|
|
"mean_token_accuracy": 0.5948790550231934,
|
|
"num_tokens": 64857858.0,
|
|
"step": 1030
|
|
},
|
|
{
|
|
"epoch": 3.382978723404255,
|
|
"grad_norm": 49.50915341555558,
|
|
"learning_rate": 4.981191199242833e-09,
|
|
"loss": 2.6233,
|
|
"mean_token_accuracy": 0.5965835630893708,
|
|
"num_tokens": 65173178.0,
|
|
"step": 1035
|
|
},
|
|
{
|
|
"epoch": 3.399345335515548,
|
|
"grad_norm": 52.16472811170959,
|
|
"learning_rate": 4.980486408539749e-09,
|
|
"loss": 2.6528,
|
|
"mean_token_accuracy": 0.5927862703800202,
|
|
"num_tokens": 65490079.0,
|
|
"step": 1040
|
|
},
|
|
{
|
|
"epoch": 3.4157119476268414,
|
|
"grad_norm": 50.313527732755254,
|
|
"learning_rate": 4.979768706579595e-09,
|
|
"loss": 2.6437,
|
|
"mean_token_accuracy": 0.5943560242652893,
|
|
"num_tokens": 65803108.0,
|
|
"step": 1045
|
|
},
|
|
{
|
|
"epoch": 3.4320785597381342,
|
|
"grad_norm": 49.48251544830384,
|
|
"learning_rate": 4.979038097098104e-09,
|
|
"loss": 2.647,
|
|
"mean_token_accuracy": 0.5918027520179748,
|
|
"num_tokens": 66118713.0,
|
|
"step": 1050
|
|
},
|
|
{
|
|
"epoch": 3.448445171849427,
|
|
"grad_norm": 48.972433105538634,
|
|
"learning_rate": 4.978294583898196e-09,
|
|
"loss": 2.6254,
|
|
"mean_token_accuracy": 0.5962560415267945,
|
|
"num_tokens": 66433951.0,
|
|
"step": 1055
|
|
},
|
|
{
|
|
"epoch": 3.46481178396072,
|
|
"grad_norm": 50.314477031346506,
|
|
"learning_rate": 4.9775381708499526e-09,
|
|
"loss": 2.5885,
|
|
"mean_token_accuracy": 0.599977308511734,
|
|
"num_tokens": 66749970.0,
|
|
"step": 1060
|
|
},
|
|
{
|
|
"epoch": 3.4811783960720133,
|
|
"grad_norm": 49.31565636425134,
|
|
"learning_rate": 4.9767688618906034e-09,
|
|
"loss": 2.6078,
|
|
"mean_token_accuracy": 0.598986804485321,
|
|
"num_tokens": 67065501.0,
|
|
"step": 1065
|
|
},
|
|
{
|
|
"epoch": 3.497545008183306,
|
|
"grad_norm": 49.146350694301276,
|
|
"learning_rate": 4.9759866610245045e-09,
|
|
"loss": 2.6128,
|
|
"mean_token_accuracy": 0.5985178530216217,
|
|
"num_tokens": 67380313.0,
|
|
"step": 1070
|
|
},
|
|
{
|
|
"epoch": 3.513911620294599,
|
|
"grad_norm": 50.30669827447236,
|
|
"learning_rate": 4.9751915723231105e-09,
|
|
"loss": 2.6244,
|
|
"mean_token_accuracy": 0.595407634973526,
|
|
"num_tokens": 67695668.0,
|
|
"step": 1075
|
|
},
|
|
{
|
|
"epoch": 3.530278232405892,
|
|
"grad_norm": 51.803022503947055,
|
|
"learning_rate": 4.974383599924965e-09,
|
|
"loss": 2.6499,
|
|
"mean_token_accuracy": 0.5911674559116363,
|
|
"num_tokens": 68011201.0,
|
|
"step": 1080
|
|
},
|
|
{
|
|
"epoch": 3.5466448445171848,
|
|
"grad_norm": 49.796669023866826,
|
|
"learning_rate": 4.973562748035669e-09,
|
|
"loss": 2.6225,
|
|
"mean_token_accuracy": 0.5949772834777832,
|
|
"num_tokens": 68327902.0,
|
|
"step": 1085
|
|
},
|
|
{
|
|
"epoch": 3.563011456628478,
|
|
"grad_norm": 47.91885017050742,
|
|
"learning_rate": 4.972729020927865e-09,
|
|
"loss": 2.6125,
|
|
"mean_token_accuracy": 0.5971624255180359,
|
|
"num_tokens": 68644931.0,
|
|
"step": 1090
|
|
},
|
|
{
|
|
"epoch": 3.579378068739771,
|
|
"grad_norm": 49.87817891616827,
|
|
"learning_rate": 4.971882422941212e-09,
|
|
"loss": 2.602,
|
|
"mean_token_accuracy": 0.5991207838058472,
|
|
"num_tokens": 68962382.0,
|
|
"step": 1095
|
|
},
|
|
{
|
|
"epoch": 3.595744680851064,
|
|
"grad_norm": 48.683634051145795,
|
|
"learning_rate": 4.971022958482363e-09,
|
|
"loss": 2.6374,
|
|
"mean_token_accuracy": 0.5917447209358215,
|
|
"num_tokens": 69279280.0,
|
|
"step": 1100
|
|
},
|
|
{
|
|
"epoch": 3.6121112929623567,
|
|
"grad_norm": 52.2877151961869,
|
|
"learning_rate": 4.970150632024943e-09,
|
|
"loss": 2.6103,
|
|
"mean_token_accuracy": 0.5968870699405671,
|
|
"num_tokens": 69596738.0,
|
|
"step": 1105
|
|
},
|
|
{
|
|
"epoch": 3.62847790507365,
|
|
"grad_norm": 50.138960236262314,
|
|
"learning_rate": 4.969265448109526e-09,
|
|
"loss": 2.6396,
|
|
"mean_token_accuracy": 0.5927744507789612,
|
|
"num_tokens": 69911145.0,
|
|
"step": 1110
|
|
},
|
|
{
|
|
"epoch": 3.644844517184943,
|
|
"grad_norm": 48.7717696588962,
|
|
"learning_rate": 4.968367411343611e-09,
|
|
"loss": 2.5959,
|
|
"mean_token_accuracy": 0.5978303670883178,
|
|
"num_tokens": 70226940.0,
|
|
"step": 1115
|
|
},
|
|
{
|
|
"epoch": 3.6612111292962357,
|
|
"grad_norm": 49.49374181018141,
|
|
"learning_rate": 4.967456526401595e-09,
|
|
"loss": 2.6144,
|
|
"mean_token_accuracy": 0.5930372834205627,
|
|
"num_tokens": 70544098.0,
|
|
"step": 1120
|
|
},
|
|
{
|
|
"epoch": 3.6775777414075286,
|
|
"grad_norm": 47.7742894266053,
|
|
"learning_rate": 4.966532798024756e-09,
|
|
"loss": 2.5993,
|
|
"mean_token_accuracy": 0.5967276990413666,
|
|
"num_tokens": 70858464.0,
|
|
"step": 1125
|
|
},
|
|
{
|
|
"epoch": 3.6939443535188214,
|
|
"grad_norm": 48.24018085532676,
|
|
"learning_rate": 4.965596231021221e-09,
|
|
"loss": 2.6043,
|
|
"mean_token_accuracy": 0.5943282008171081,
|
|
"num_tokens": 71173847.0,
|
|
"step": 1130
|
|
},
|
|
{
|
|
"epoch": 3.7103109656301143,
|
|
"grad_norm": 46.88934534155509,
|
|
"learning_rate": 4.964646830265944e-09,
|
|
"loss": 2.6083,
|
|
"mean_token_accuracy": 0.5917117774486542,
|
|
"num_tokens": 71490884.0,
|
|
"step": 1135
|
|
},
|
|
{
|
|
"epoch": 3.7266775777414076,
|
|
"grad_norm": 47.822075277292676,
|
|
"learning_rate": 4.9636846007006784e-09,
|
|
"loss": 2.5422,
|
|
"mean_token_accuracy": 0.5986848413944245,
|
|
"num_tokens": 71807302.0,
|
|
"step": 1140
|
|
},
|
|
{
|
|
"epoch": 3.7430441898527005,
|
|
"grad_norm": 46.80028842269654,
|
|
"learning_rate": 4.9627095473339576e-09,
|
|
"loss": 2.5654,
|
|
"mean_token_accuracy": 0.5952131927013398,
|
|
"num_tokens": 72122097.0,
|
|
"step": 1145
|
|
},
|
|
{
|
|
"epoch": 3.7594108019639934,
|
|
"grad_norm": 43.95158266274471,
|
|
"learning_rate": 4.961721675241062e-09,
|
|
"loss": 2.5409,
|
|
"mean_token_accuracy": 0.5981755375862121,
|
|
"num_tokens": 72436073.0,
|
|
"step": 1150
|
|
},
|
|
{
|
|
"epoch": 3.7757774140752867,
|
|
"grad_norm": 44.184577219614965,
|
|
"learning_rate": 4.960720989563995e-09,
|
|
"loss": 2.5074,
|
|
"mean_token_accuracy": 0.5997650861740113,
|
|
"num_tokens": 72751719.0,
|
|
"step": 1155
|
|
},
|
|
{
|
|
"epoch": 3.7921440261865795,
|
|
"grad_norm": 41.34369211387233,
|
|
"learning_rate": 4.959707495511456e-09,
|
|
"loss": 2.5088,
|
|
"mean_token_accuracy": 0.5975705504417419,
|
|
"num_tokens": 73067477.0,
|
|
"step": 1160
|
|
},
|
|
{
|
|
"epoch": 3.8085106382978724,
|
|
"grad_norm": 41.04319366765971,
|
|
"learning_rate": 4.958681198358815e-09,
|
|
"loss": 2.4731,
|
|
"mean_token_accuracy": 0.6023198246955872,
|
|
"num_tokens": 73384446.0,
|
|
"step": 1165
|
|
},
|
|
{
|
|
"epoch": 3.8248772504091653,
|
|
"grad_norm": 40.35840500310339,
|
|
"learning_rate": 4.957642103448085e-09,
|
|
"loss": 2.5386,
|
|
"mean_token_accuracy": 0.5931589961051941,
|
|
"num_tokens": 73698463.0,
|
|
"step": 1170
|
|
},
|
|
{
|
|
"epoch": 3.841243862520458,
|
|
"grad_norm": 40.15978306256828,
|
|
"learning_rate": 4.956590216187888e-09,
|
|
"loss": 2.5528,
|
|
"mean_token_accuracy": 0.5887567341327667,
|
|
"num_tokens": 74012661.0,
|
|
"step": 1175
|
|
},
|
|
{
|
|
"epoch": 3.857610474631751,
|
|
"grad_norm": 38.37902076680877,
|
|
"learning_rate": 4.955525542053438e-09,
|
|
"loss": 2.4691,
|
|
"mean_token_accuracy": 0.5988661289215088,
|
|
"num_tokens": 74328504.0,
|
|
"step": 1180
|
|
},
|
|
{
|
|
"epoch": 3.8739770867430443,
|
|
"grad_norm": 38.07754678822132,
|
|
"learning_rate": 4.954448086586502e-09,
|
|
"loss": 2.479,
|
|
"mean_token_accuracy": 0.5997465968132019,
|
|
"num_tokens": 74644682.0,
|
|
"step": 1185
|
|
},
|
|
{
|
|
"epoch": 3.890343698854337,
|
|
"grad_norm": 39.03855697418089,
|
|
"learning_rate": 4.953357855395377e-09,
|
|
"loss": 2.4996,
|
|
"mean_token_accuracy": 0.597508841753006,
|
|
"num_tokens": 74960381.0,
|
|
"step": 1190
|
|
},
|
|
{
|
|
"epoch": 3.90671031096563,
|
|
"grad_norm": 37.97979041564868,
|
|
"learning_rate": 4.952254854154861e-09,
|
|
"loss": 2.4706,
|
|
"mean_token_accuracy": 0.6000802993774415,
|
|
"num_tokens": 75275290.0,
|
|
"step": 1195
|
|
},
|
|
{
|
|
"epoch": 3.9230769230769234,
|
|
"grad_norm": 39.84397161684973,
|
|
"learning_rate": 4.951139088606217e-09,
|
|
"loss": 2.493,
|
|
"mean_token_accuracy": 0.5967117786407471,
|
|
"num_tokens": 75590574.0,
|
|
"step": 1200
|
|
},
|
|
{
|
|
"epoch": 3.939443535188216,
|
|
"grad_norm": 37.68774052927674,
|
|
"learning_rate": 4.950010564557154e-09,
|
|
"loss": 2.4645,
|
|
"mean_token_accuracy": 0.6019287884235383,
|
|
"num_tokens": 75905357.0,
|
|
"step": 1205
|
|
},
|
|
{
|
|
"epoch": 3.955810147299509,
|
|
"grad_norm": 39.52747879667662,
|
|
"learning_rate": 4.9488692878817865e-09,
|
|
"loss": 2.4973,
|
|
"mean_token_accuracy": 0.595619136095047,
|
|
"num_tokens": 76221434.0,
|
|
"step": 1210
|
|
},
|
|
{
|
|
"epoch": 3.972176759410802,
|
|
"grad_norm": 36.7441206054698,
|
|
"learning_rate": 4.947715264520609e-09,
|
|
"loss": 2.4541,
|
|
"mean_token_accuracy": 0.6009849727153778,
|
|
"num_tokens": 76536701.0,
|
|
"step": 1215
|
|
},
|
|
{
|
|
"epoch": 3.988543371522095,
|
|
"grad_norm": 36.456613261964904,
|
|
"learning_rate": 4.946548500480466e-09,
|
|
"loss": 2.4683,
|
|
"mean_token_accuracy": 0.6003902852535248,
|
|
"num_tokens": 76853831.0,
|
|
"step": 1220
|
|
},
|
|
{
|
|
"epoch": 4.003273322422259,
|
|
"grad_norm": 36.21497438273292,
|
|
"learning_rate": 4.9453690018345145e-09,
|
|
"loss": 2.4962,
|
|
"mean_token_accuracy": 0.5997353924645318,
|
|
"num_tokens": 77113832.0,
|
|
"step": 1225
|
|
},
|
|
{
|
|
"epoch": 4.019639934533552,
|
|
"grad_norm": 35.87047882495412,
|
|
"learning_rate": 4.944176774722201e-09,
|
|
"loss": 2.4601,
|
|
"mean_token_accuracy": 0.59957355260849,
|
|
"num_tokens": 77428282.0,
|
|
"step": 1230
|
|
},
|
|
{
|
|
"epoch": 4.0360065466448445,
|
|
"grad_norm": 37.237635620382,
|
|
"learning_rate": 4.9429718253492254e-09,
|
|
"loss": 2.4653,
|
|
"mean_token_accuracy": 0.5990661978721619,
|
|
"num_tokens": 77746000.0,
|
|
"step": 1235
|
|
},
|
|
{
|
|
"epoch": 4.052373158756137,
|
|
"grad_norm": 38.89147301666522,
|
|
"learning_rate": 4.941754159987506e-09,
|
|
"loss": 2.4703,
|
|
"mean_token_accuracy": 0.6005004703998565,
|
|
"num_tokens": 78061879.0,
|
|
"step": 1240
|
|
},
|
|
{
|
|
"epoch": 4.06873977086743,
|
|
"grad_norm": 38.168350494896394,
|
|
"learning_rate": 4.94052378497515e-09,
|
|
"loss": 2.4951,
|
|
"mean_token_accuracy": 0.5971946477890014,
|
|
"num_tokens": 78377346.0,
|
|
"step": 1245
|
|
},
|
|
{
|
|
"epoch": 4.085106382978723,
|
|
"grad_norm": 37.01849369940714,
|
|
"learning_rate": 4.939280706716422e-09,
|
|
"loss": 2.4873,
|
|
"mean_token_accuracy": 0.5959964275360108,
|
|
"num_tokens": 78694009.0,
|
|
"step": 1250
|
|
},
|
|
{
|
|
"epoch": 4.101472995090016,
|
|
"grad_norm": 35.2694041688307,
|
|
"learning_rate": 4.938024931681706e-09,
|
|
"loss": 2.4285,
|
|
"mean_token_accuracy": 0.6062565207481384,
|
|
"num_tokens": 79006603.0,
|
|
"step": 1255
|
|
},
|
|
{
|
|
"epoch": 4.11783960720131,
|
|
"grad_norm": 38.65142188861583,
|
|
"learning_rate": 4.936756466407477e-09,
|
|
"loss": 2.4936,
|
|
"mean_token_accuracy": 0.5969399809837341,
|
|
"num_tokens": 79320758.0,
|
|
"step": 1260
|
|
},
|
|
{
|
|
"epoch": 4.134206219312603,
|
|
"grad_norm": 37.82578716741412,
|
|
"learning_rate": 4.935475317496264e-09,
|
|
"loss": 2.4921,
|
|
"mean_token_accuracy": 0.5950555443763733,
|
|
"num_tokens": 79636923.0,
|
|
"step": 1265
|
|
},
|
|
{
|
|
"epoch": 4.150572831423895,
|
|
"grad_norm": 36.637270956716655,
|
|
"learning_rate": 4.934181491616612e-09,
|
|
"loss": 2.4587,
|
|
"mean_token_accuracy": 0.6006620645523071,
|
|
"num_tokens": 79953014.0,
|
|
"step": 1270
|
|
},
|
|
{
|
|
"epoch": 4.166939443535188,
|
|
"grad_norm": 35.400359882953225,
|
|
"learning_rate": 4.9328749955030575e-09,
|
|
"loss": 2.4658,
|
|
"mean_token_accuracy": 0.5980123102664947,
|
|
"num_tokens": 80268771.0,
|
|
"step": 1275
|
|
},
|
|
{
|
|
"epoch": 4.183306055646481,
|
|
"grad_norm": 36.72319719460392,
|
|
"learning_rate": 4.931555835956082e-09,
|
|
"loss": 2.4862,
|
|
"mean_token_accuracy": 0.5970359206199646,
|
|
"num_tokens": 80583167.0,
|
|
"step": 1280
|
|
},
|
|
{
|
|
"epoch": 4.199672667757774,
|
|
"grad_norm": 36.62421262807592,
|
|
"learning_rate": 4.930224019842085e-09,
|
|
"loss": 2.4709,
|
|
"mean_token_accuracy": 0.600509512424469,
|
|
"num_tokens": 80897726.0,
|
|
"step": 1285
|
|
},
|
|
{
|
|
"epoch": 4.216039279869067,
|
|
"grad_norm": 37.137724593533896,
|
|
"learning_rate": 4.928879554093343e-09,
|
|
"loss": 2.4701,
|
|
"mean_token_accuracy": 0.5969075500965119,
|
|
"num_tokens": 81213422.0,
|
|
"step": 1290
|
|
},
|
|
{
|
|
"epoch": 4.23240589198036,
|
|
"grad_norm": 36.02686338812537,
|
|
"learning_rate": 4.927522445707978e-09,
|
|
"loss": 2.4485,
|
|
"mean_token_accuracy": 0.600042587518692,
|
|
"num_tokens": 81530245.0,
|
|
"step": 1295
|
|
},
|
|
{
|
|
"epoch": 4.248772504091653,
|
|
"grad_norm": 36.28731293885398,
|
|
"learning_rate": 4.926152701749917e-09,
|
|
"loss": 2.4685,
|
|
"mean_token_accuracy": 0.5975348889827728,
|
|
"num_tokens": 81843704.0,
|
|
"step": 1300
|
|
},
|
|
{
|
|
"epoch": 4.265139116202946,
|
|
"grad_norm": 36.618856102825845,
|
|
"learning_rate": 4.924770329348854e-09,
|
|
"loss": 2.49,
|
|
"mean_token_accuracy": 0.5962951421737671,
|
|
"num_tokens": 82160459.0,
|
|
"step": 1305
|
|
},
|
|
{
|
|
"epoch": 4.281505728314239,
|
|
"grad_norm": 38.21558740491567,
|
|
"learning_rate": 4.923375335700223e-09,
|
|
"loss": 2.49,
|
|
"mean_token_accuracy": 0.5959554374217987,
|
|
"num_tokens": 82476106.0,
|
|
"step": 1310
|
|
},
|
|
{
|
|
"epoch": 4.297872340425532,
|
|
"grad_norm": 37.497039235613,
|
|
"learning_rate": 4.921967728065147e-09,
|
|
"loss": 2.4849,
|
|
"mean_token_accuracy": 0.5967646718025208,
|
|
"num_tokens": 82792051.0,
|
|
"step": 1315
|
|
},
|
|
{
|
|
"epoch": 4.314238952536825,
|
|
"grad_norm": 34.48094366633381,
|
|
"learning_rate": 4.920547513770408e-09,
|
|
"loss": 2.445,
|
|
"mean_token_accuracy": 0.6008966147899628,
|
|
"num_tokens": 83106820.0,
|
|
"step": 1320
|
|
},
|
|
{
|
|
"epoch": 4.330605564648118,
|
|
"grad_norm": 37.813730656736176,
|
|
"learning_rate": 4.919114700208408e-09,
|
|
"loss": 2.4607,
|
|
"mean_token_accuracy": 0.5983804702758789,
|
|
"num_tokens": 83421720.0,
|
|
"step": 1325
|
|
},
|
|
{
|
|
"epoch": 4.346972176759411,
|
|
"grad_norm": 35.513809408398906,
|
|
"learning_rate": 4.917669294837129e-09,
|
|
"loss": 2.4551,
|
|
"mean_token_accuracy": 0.5996524155139923,
|
|
"num_tokens": 83738044.0,
|
|
"step": 1330
|
|
},
|
|
{
|
|
"epoch": 4.363338788870704,
|
|
"grad_norm": 36.18508976141847,
|
|
"learning_rate": 4.916211305180096e-09,
|
|
"loss": 2.42,
|
|
"mean_token_accuracy": 0.6065082490444184,
|
|
"num_tokens": 84055112.0,
|
|
"step": 1335
|
|
},
|
|
{
|
|
"epoch": 4.3797054009819965,
|
|
"grad_norm": 35.41417337406567,
|
|
"learning_rate": 4.9147407388263365e-09,
|
|
"loss": 2.472,
|
|
"mean_token_accuracy": 0.5982517778873444,
|
|
"num_tokens": 84370037.0,
|
|
"step": 1340
|
|
},
|
|
{
|
|
"epoch": 4.396072013093289,
|
|
"grad_norm": 35.797678528257016,
|
|
"learning_rate": 4.913257603430341e-09,
|
|
"loss": 2.479,
|
|
"mean_token_accuracy": 0.5972541332244873,
|
|
"num_tokens": 84685592.0,
|
|
"step": 1345
|
|
},
|
|
{
|
|
"epoch": 4.412438625204583,
|
|
"grad_norm": 37.599742811404646,
|
|
"learning_rate": 4.9117619067120245e-09,
|
|
"loss": 2.5071,
|
|
"mean_token_accuracy": 0.5945683479309082,
|
|
"num_tokens": 84999613.0,
|
|
"step": 1350
|
|
},
|
|
{
|
|
"epoch": 4.428805237315876,
|
|
"grad_norm": 35.00424778002652,
|
|
"learning_rate": 4.910253656456683e-09,
|
|
"loss": 2.4543,
|
|
"mean_token_accuracy": 0.5988426804542542,
|
|
"num_tokens": 85316360.0,
|
|
"step": 1355
|
|
},
|
|
{
|
|
"epoch": 4.445171849427169,
|
|
"grad_norm": 36.30117151566015,
|
|
"learning_rate": 4.908732860514958e-09,
|
|
"loss": 2.4747,
|
|
"mean_token_accuracy": 0.5954937934875488,
|
|
"num_tokens": 85633390.0,
|
|
"step": 1360
|
|
},
|
|
{
|
|
"epoch": 4.461538461538462,
|
|
"grad_norm": 36.06452936238572,
|
|
"learning_rate": 4.907199526802791e-09,
|
|
"loss": 2.4289,
|
|
"mean_token_accuracy": 0.6033567905426025,
|
|
"num_tokens": 85949251.0,
|
|
"step": 1365
|
|
},
|
|
{
|
|
"epoch": 4.4779050736497545,
|
|
"grad_norm": 37.51693653126231,
|
|
"learning_rate": 4.905653663301387e-09,
|
|
"loss": 2.5224,
|
|
"mean_token_accuracy": 0.5897888660430908,
|
|
"num_tokens": 86265830.0,
|
|
"step": 1370
|
|
},
|
|
{
|
|
"epoch": 4.494271685761047,
|
|
"grad_norm": 35.854037607697414,
|
|
"learning_rate": 4.904095278057166e-09,
|
|
"loss": 2.4691,
|
|
"mean_token_accuracy": 0.5975383460521698,
|
|
"num_tokens": 86581476.0,
|
|
"step": 1375
|
|
},
|
|
{
|
|
"epoch": 4.51063829787234,
|
|
"grad_norm": 36.325492671950144,
|
|
"learning_rate": 4.902524379181728e-09,
|
|
"loss": 2.4419,
|
|
"mean_token_accuracy": 0.5999314069747925,
|
|
"num_tokens": 86898519.0,
|
|
"step": 1380
|
|
},
|
|
{
|
|
"epoch": 4.527004909983633,
|
|
"grad_norm": 36.93795273948186,
|
|
"learning_rate": 4.90094097485181e-09,
|
|
"loss": 2.4828,
|
|
"mean_token_accuracy": 0.59447141289711,
|
|
"num_tokens": 87216046.0,
|
|
"step": 1385
|
|
},
|
|
{
|
|
"epoch": 4.543371522094926,
|
|
"grad_norm": 34.85070068347699,
|
|
"learning_rate": 4.899345073309236e-09,
|
|
"loss": 2.4256,
|
|
"mean_token_accuracy": 0.6024092137813568,
|
|
"num_tokens": 87532275.0,
|
|
"step": 1390
|
|
},
|
|
{
|
|
"epoch": 4.559738134206219,
|
|
"grad_norm": 37.081910141065485,
|
|
"learning_rate": 4.8977366828608846e-09,
|
|
"loss": 2.5232,
|
|
"mean_token_accuracy": 0.5894258916378021,
|
|
"num_tokens": 87847744.0,
|
|
"step": 1395
|
|
},
|
|
{
|
|
"epoch": 4.576104746317513,
|
|
"grad_norm": 35.447906810365794,
|
|
"learning_rate": 4.896115811878639e-09,
|
|
"loss": 2.4367,
|
|
"mean_token_accuracy": 0.6020602822303772,
|
|
"num_tokens": 88163634.0,
|
|
"step": 1400
|
|
},
|
|
{
|
|
"epoch": 4.5924713584288055,
|
|
"grad_norm": 37.20055072424884,
|
|
"learning_rate": 4.8944824687993435e-09,
|
|
"loss": 2.4496,
|
|
"mean_token_accuracy": 0.6003972291946411,
|
|
"num_tokens": 88478549.0,
|
|
"step": 1405
|
|
},
|
|
{
|
|
"epoch": 4.608837970540098,
|
|
"grad_norm": 37.04974152317881,
|
|
"learning_rate": 4.892836662124766e-09,
|
|
"loss": 2.4788,
|
|
"mean_token_accuracy": 0.5948118209838867,
|
|
"num_tokens": 88794566.0,
|
|
"step": 1410
|
|
},
|
|
{
|
|
"epoch": 4.625204582651391,
|
|
"grad_norm": 35.91992247765496,
|
|
"learning_rate": 4.891178400421543e-09,
|
|
"loss": 2.4532,
|
|
"mean_token_accuracy": 0.5996142148971557,
|
|
"num_tokens": 89109615.0,
|
|
"step": 1415
|
|
},
|
|
{
|
|
"epoch": 4.641571194762684,
|
|
"grad_norm": 36.935699347637375,
|
|
"learning_rate": 4.889507692321146e-09,
|
|
"loss": 2.4788,
|
|
"mean_token_accuracy": 0.5942489326000213,
|
|
"num_tokens": 89424020.0,
|
|
"step": 1420
|
|
},
|
|
{
|
|
"epoch": 4.657937806873977,
|
|
"grad_norm": 36.535263648437336,
|
|
"learning_rate": 4.88782454651983e-09,
|
|
"loss": 2.4642,
|
|
"mean_token_accuracy": 0.5973877847194672,
|
|
"num_tokens": 89739762.0,
|
|
"step": 1425
|
|
},
|
|
{
|
|
"epoch": 4.67430441898527,
|
|
"grad_norm": 35.09943792092194,
|
|
"learning_rate": 4.88612897177859e-09,
|
|
"loss": 2.4584,
|
|
"mean_token_accuracy": 0.5972561419010163,
|
|
"num_tokens": 90056358.0,
|
|
"step": 1430
|
|
},
|
|
{
|
|
"epoch": 4.690671031096563,
|
|
"grad_norm": 36.40251641806779,
|
|
"learning_rate": 4.884420976923112e-09,
|
|
"loss": 2.4699,
|
|
"mean_token_accuracy": 0.594540125131607,
|
|
"num_tokens": 90373815.0,
|
|
"step": 1435
|
|
},
|
|
{
|
|
"epoch": 4.7070376432078564,
|
|
"grad_norm": 36.20096084241157,
|
|
"learning_rate": 4.882700570843737e-09,
|
|
"loss": 2.4284,
|
|
"mean_token_accuracy": 0.6023145020008087,
|
|
"num_tokens": 90690273.0,
|
|
"step": 1440
|
|
},
|
|
{
|
|
"epoch": 4.723404255319149,
|
|
"grad_norm": 34.99455458968427,
|
|
"learning_rate": 4.880967762495401e-09,
|
|
"loss": 2.481,
|
|
"mean_token_accuracy": 0.5950899481773376,
|
|
"num_tokens": 91007144.0,
|
|
"step": 1445
|
|
},
|
|
{
|
|
"epoch": 4.739770867430442,
|
|
"grad_norm": 35.72119070392356,
|
|
"learning_rate": 4.8792225608976e-09,
|
|
"loss": 2.452,
|
|
"mean_token_accuracy": 0.5972677767276764,
|
|
"num_tokens": 91321113.0,
|
|
"step": 1450
|
|
},
|
|
{
|
|
"epoch": 4.756137479541735,
|
|
"grad_norm": 35.9579226923624,
|
|
"learning_rate": 4.8774649751343384e-09,
|
|
"loss": 2.4639,
|
|
"mean_token_accuracy": 0.5965704917907715,
|
|
"num_tokens": 91636320.0,
|
|
"step": 1455
|
|
},
|
|
{
|
|
"epoch": 4.772504091653028,
|
|
"grad_norm": 36.28261360146757,
|
|
"learning_rate": 4.875695014354079e-09,
|
|
"loss": 2.4266,
|
|
"mean_token_accuracy": 0.6007283627986908,
|
|
"num_tokens": 91951445.0,
|
|
"step": 1460
|
|
},
|
|
{
|
|
"epoch": 4.788870703764321,
|
|
"grad_norm": 37.16953048348806,
|
|
"learning_rate": 4.8739126877697e-09,
|
|
"loss": 2.4722,
|
|
"mean_token_accuracy": 0.5935324609279633,
|
|
"num_tokens": 92265893.0,
|
|
"step": 1465
|
|
},
|
|
{
|
|
"epoch": 4.805237315875614,
|
|
"grad_norm": 36.25198111318839,
|
|
"learning_rate": 4.872118004658446e-09,
|
|
"loss": 2.4446,
|
|
"mean_token_accuracy": 0.5985255479812622,
|
|
"num_tokens": 92582049.0,
|
|
"step": 1470
|
|
},
|
|
{
|
|
"epoch": 4.8216039279869065,
|
|
"grad_norm": 34.95866540091871,
|
|
"learning_rate": 4.8703109743618775e-09,
|
|
"loss": 2.4311,
|
|
"mean_token_accuracy": 0.6004527628421783,
|
|
"num_tokens": 92899566.0,
|
|
"step": 1475
|
|
},
|
|
{
|
|
"epoch": 4.837970540098199,
|
|
"grad_norm": 35.953089394319996,
|
|
"learning_rate": 4.868491606285823e-09,
|
|
"loss": 2.4457,
|
|
"mean_token_accuracy": 0.5963627815246582,
|
|
"num_tokens": 93212824.0,
|
|
"step": 1480
|
|
},
|
|
{
|
|
"epoch": 4.854337152209492,
|
|
"grad_norm": 36.34600921156728,
|
|
"learning_rate": 4.866659909900334e-09,
|
|
"loss": 2.4735,
|
|
"mean_token_accuracy": 0.5948220491409302,
|
|
"num_tokens": 93528357.0,
|
|
"step": 1485
|
|
},
|
|
{
|
|
"epoch": 4.870703764320785,
|
|
"grad_norm": 36.11778890464434,
|
|
"learning_rate": 4.864815894739629e-09,
|
|
"loss": 2.444,
|
|
"mean_token_accuracy": 0.5960727274417877,
|
|
"num_tokens": 93843459.0,
|
|
"step": 1490
|
|
},
|
|
{
|
|
"epoch": 4.887070376432079,
|
|
"grad_norm": 36.19249009284775,
|
|
"learning_rate": 4.862959570402049e-09,
|
|
"loss": 2.4493,
|
|
"mean_token_accuracy": 0.5964035809040069,
|
|
"num_tokens": 94158516.0,
|
|
"step": 1495
|
|
},
|
|
{
|
|
"epoch": 4.903436988543372,
|
|
"grad_norm": 35.86509172201521,
|
|
"learning_rate": 4.8610909465500055e-09,
|
|
"loss": 2.4221,
|
|
"mean_token_accuracy": 0.5996748507022858,
|
|
"num_tokens": 94473674.0,
|
|
"step": 1500
|
|
},
|
|
{
|
|
"epoch": 4.919803600654665,
|
|
"grad_norm": 36.534709819616,
|
|
"learning_rate": 4.859210032909931e-09,
|
|
"loss": 2.4551,
|
|
"mean_token_accuracy": 0.5964705228805542,
|
|
"num_tokens": 94789597.0,
|
|
"step": 1505
|
|
},
|
|
{
|
|
"epoch": 4.9361702127659575,
|
|
"grad_norm": 36.071966143335516,
|
|
"learning_rate": 4.857316839272229e-09,
|
|
"loss": 2.4456,
|
|
"mean_token_accuracy": 0.5960875868797302,
|
|
"num_tokens": 95103661.0,
|
|
"step": 1510
|
|
},
|
|
{
|
|
"epoch": 4.95253682487725,
|
|
"grad_norm": 33.949806954557204,
|
|
"learning_rate": 4.855411375491217e-09,
|
|
"loss": 2.4029,
|
|
"mean_token_accuracy": 0.6011349201202393,
|
|
"num_tokens": 95419831.0,
|
|
"step": 1515
|
|
},
|
|
{
|
|
"epoch": 4.968903436988543,
|
|
"grad_norm": 36.025707856738336,
|
|
"learning_rate": 4.853493651485088e-09,
|
|
"loss": 2.4658,
|
|
"mean_token_accuracy": 0.5928048253059387,
|
|
"num_tokens": 95735502.0,
|
|
"step": 1520
|
|
},
|
|
{
|
|
"epoch": 4.985270049099836,
|
|
"grad_norm": 35.54506660841942,
|
|
"learning_rate": 4.851563677235845e-09,
|
|
"loss": 2.4165,
|
|
"mean_token_accuracy": 0.6018769025802613,
|
|
"num_tokens": 96053078.0,
|
|
"step": 1525
|
|
},
|
|
{
|
|
"epoch": 5.0,
|
|
"grad_norm": 35.316681076979684,
|
|
"learning_rate": 4.849621462789257e-09,
|
|
"loss": 2.466,
|
|
"mean_token_accuracy": 0.5903970334264967,
|
|
"num_tokens": 96313602.0,
|
|
"step": 1530
|
|
},
|
|
{
|
|
"epoch": 5.016366612111293,
|
|
"grad_norm": 34.48683940022847,
|
|
"learning_rate": 4.8476670182548045e-09,
|
|
"loss": 2.4255,
|
|
"mean_token_accuracy": 0.598287183046341,
|
|
"num_tokens": 96629562.0,
|
|
"step": 1535
|
|
},
|
|
{
|
|
"epoch": 5.032733224222586,
|
|
"grad_norm": 34.658985731341446,
|
|
"learning_rate": 4.8457003538056285e-09,
|
|
"loss": 2.3785,
|
|
"mean_token_accuracy": 0.6074756979942322,
|
|
"num_tokens": 96946467.0,
|
|
"step": 1540
|
|
},
|
|
{
|
|
"epoch": 5.049099836333879,
|
|
"grad_norm": 33.853524601076565,
|
|
"learning_rate": 4.843721479678476e-09,
|
|
"loss": 2.4122,
|
|
"mean_token_accuracy": 0.6005301892757415,
|
|
"num_tokens": 97260313.0,
|
|
"step": 1545
|
|
},
|
|
{
|
|
"epoch": 5.0654664484451715,
|
|
"grad_norm": 35.74543512906391,
|
|
"learning_rate": 4.841730406173645e-09,
|
|
"loss": 2.4103,
|
|
"mean_token_accuracy": 0.6016847252845764,
|
|
"num_tokens": 97578758.0,
|
|
"step": 1550
|
|
},
|
|
{
|
|
"epoch": 5.081833060556465,
|
|
"grad_norm": 35.48837441704123,
|
|
"learning_rate": 4.839727143654934e-09,
|
|
"loss": 2.3891,
|
|
"mean_token_accuracy": 0.6052005052566528,
|
|
"num_tokens": 97895124.0,
|
|
"step": 1555
|
|
},
|
|
{
|
|
"epoch": 5.098199672667758,
|
|
"grad_norm": 35.52394836029277,
|
|
"learning_rate": 4.837711702549589e-09,
|
|
"loss": 2.3923,
|
|
"mean_token_accuracy": 0.6032778918743134,
|
|
"num_tokens": 98209910.0,
|
|
"step": 1560
|
|
},
|
|
{
|
|
"epoch": 5.114566284779051,
|
|
"grad_norm": 35.90933473348734,
|
|
"learning_rate": 4.835684093348244e-09,
|
|
"loss": 2.4441,
|
|
"mean_token_accuracy": 0.5969228565692901,
|
|
"num_tokens": 98525651.0,
|
|
"step": 1565
|
|
},
|
|
{
|
|
"epoch": 5.130932896890344,
|
|
"grad_norm": 36.39247016426329,
|
|
"learning_rate": 4.83364432660487e-09,
|
|
"loss": 2.4319,
|
|
"mean_token_accuracy": 0.5964118003845215,
|
|
"num_tokens": 98842269.0,
|
|
"step": 1570
|
|
},
|
|
{
|
|
"epoch": 5.147299509001637,
|
|
"grad_norm": 35.63480593062428,
|
|
"learning_rate": 4.8315924129367224e-09,
|
|
"loss": 2.4305,
|
|
"mean_token_accuracy": 0.597532719373703,
|
|
"num_tokens": 99157104.0,
|
|
"step": 1575
|
|
},
|
|
{
|
|
"epoch": 5.1636661211129296,
|
|
"grad_norm": 35.23834160461372,
|
|
"learning_rate": 4.829528363024279e-09,
|
|
"loss": 2.4339,
|
|
"mean_token_accuracy": 0.5947438180446625,
|
|
"num_tokens": 99472398.0,
|
|
"step": 1580
|
|
},
|
|
{
|
|
"epoch": 5.180032733224222,
|
|
"grad_norm": 35.93484257317461,
|
|
"learning_rate": 4.827452187611192e-09,
|
|
"loss": 2.4657,
|
|
"mean_token_accuracy": 0.5917093694210053,
|
|
"num_tokens": 99785710.0,
|
|
"step": 1585
|
|
},
|
|
{
|
|
"epoch": 5.196399345335515,
|
|
"grad_norm": 34.64447666981932,
|
|
"learning_rate": 4.825363897504226e-09,
|
|
"loss": 2.409,
|
|
"mean_token_accuracy": 0.5984897494316102,
|
|
"num_tokens": 100101052.0,
|
|
"step": 1590
|
|
},
|
|
{
|
|
"epoch": 5.212765957446808,
|
|
"grad_norm": 36.10117173060778,
|
|
"learning_rate": 4.823263503573204e-09,
|
|
"loss": 2.421,
|
|
"mean_token_accuracy": 0.5984189927577972,
|
|
"num_tokens": 100417907.0,
|
|
"step": 1595
|
|
},
|
|
{
|
|
"epoch": 5.229132569558102,
|
|
"grad_norm": 32.118403250633314,
|
|
"learning_rate": 4.821151016750953e-09,
|
|
"loss": 2.3962,
|
|
"mean_token_accuracy": 0.6026824653148651,
|
|
"num_tokens": 100734371.0,
|
|
"step": 1600
|
|
},
|
|
{
|
|
"epoch": 5.245499181669395,
|
|
"grad_norm": 34.52896891196535,
|
|
"learning_rate": 4.819026448033244e-09,
|
|
"loss": 2.3906,
|
|
"mean_token_accuracy": 0.6012833952903748,
|
|
"num_tokens": 101050013.0,
|
|
"step": 1605
|
|
},
|
|
{
|
|
"epoch": 5.261865793780688,
|
|
"grad_norm": 32.79111439062977,
|
|
"learning_rate": 4.816889808478735e-09,
|
|
"loss": 2.3625,
|
|
"mean_token_accuracy": 0.6078511595726013,
|
|
"num_tokens": 101365831.0,
|
|
"step": 1610
|
|
},
|
|
{
|
|
"epoch": 5.2782324058919805,
|
|
"grad_norm": 33.66051847624121,
|
|
"learning_rate": 4.814741109208916e-09,
|
|
"loss": 2.3859,
|
|
"mean_token_accuracy": 0.6022830188274384,
|
|
"num_tokens": 101681937.0,
|
|
"step": 1615
|
|
},
|
|
{
|
|
"epoch": 5.294599018003273,
|
|
"grad_norm": 32.46266480007908,
|
|
"learning_rate": 4.812580361408048e-09,
|
|
"loss": 2.3444,
|
|
"mean_token_accuracy": 0.6072778820991516,
|
|
"num_tokens": 101998221.0,
|
|
"step": 1620
|
|
},
|
|
{
|
|
"epoch": 5.310965630114566,
|
|
"grad_norm": 33.090685851416744,
|
|
"learning_rate": 4.810407576323107e-09,
|
|
"loss": 2.3883,
|
|
"mean_token_accuracy": 0.6031101226806641,
|
|
"num_tokens": 102314712.0,
|
|
"step": 1625
|
|
},
|
|
{
|
|
"epoch": 5.327332242225859,
|
|
"grad_norm": 34.59572822062589,
|
|
"learning_rate": 4.808222765263724e-09,
|
|
"loss": 2.3654,
|
|
"mean_token_accuracy": 0.6064081609249115,
|
|
"num_tokens": 102630928.0,
|
|
"step": 1630
|
|
},
|
|
{
|
|
"epoch": 5.343698854337152,
|
|
"grad_norm": 33.27667097985031,
|
|
"learning_rate": 4.8060259396021264e-09,
|
|
"loss": 2.3848,
|
|
"mean_token_accuracy": 0.6004369139671326,
|
|
"num_tokens": 102947814.0,
|
|
"step": 1635
|
|
},
|
|
{
|
|
"epoch": 5.360065466448445,
|
|
"grad_norm": 32.59372126839617,
|
|
"learning_rate": 4.803817110773081e-09,
|
|
"loss": 2.3775,
|
|
"mean_token_accuracy": 0.6084607303142547,
|
|
"num_tokens": 103263194.0,
|
|
"step": 1640
|
|
},
|
|
{
|
|
"epoch": 5.376432078559738,
|
|
"grad_norm": 33.619174051544235,
|
|
"learning_rate": 4.801596290273832e-09,
|
|
"loss": 2.3844,
|
|
"mean_token_accuracy": 0.606970465183258,
|
|
"num_tokens": 103578803.0,
|
|
"step": 1645
|
|
},
|
|
{
|
|
"epoch": 5.3927986906710315,
|
|
"grad_norm": 32.14783977338467,
|
|
"learning_rate": 4.799363489664039e-09,
|
|
"loss": 2.3604,
|
|
"mean_token_accuracy": 0.6091885685920715,
|
|
"num_tokens": 103894623.0,
|
|
"step": 1650
|
|
},
|
|
{
|
|
"epoch": 5.409165302782324,
|
|
"grad_norm": 32.73860509638947,
|
|
"learning_rate": 4.797118720565724e-09,
|
|
"loss": 2.4045,
|
|
"mean_token_accuracy": 0.599544358253479,
|
|
"num_tokens": 104211684.0,
|
|
"step": 1655
|
|
},
|
|
{
|
|
"epoch": 5.425531914893617,
|
|
"grad_norm": 33.06579842685092,
|
|
"learning_rate": 4.794861994663205e-09,
|
|
"loss": 2.3551,
|
|
"mean_token_accuracy": 0.6075682699680328,
|
|
"num_tokens": 104527791.0,
|
|
"step": 1660
|
|
},
|
|
{
|
|
"epoch": 5.44189852700491,
|
|
"grad_norm": 32.745445010813356,
|
|
"learning_rate": 4.792593323703035e-09,
|
|
"loss": 2.3591,
|
|
"mean_token_accuracy": 0.6083964228630065,
|
|
"num_tokens": 104842509.0,
|
|
"step": 1665
|
|
},
|
|
{
|
|
"epoch": 5.458265139116203,
|
|
"grad_norm": 33.35194456579634,
|
|
"learning_rate": 4.790312719493944e-09,
|
|
"loss": 2.3717,
|
|
"mean_token_accuracy": 0.6044698596000672,
|
|
"num_tokens": 105157961.0,
|
|
"step": 1670
|
|
},
|
|
{
|
|
"epoch": 5.474631751227496,
|
|
"grad_norm": 33.6853627434255,
|
|
"learning_rate": 4.788020193906776e-09,
|
|
"loss": 2.4123,
|
|
"mean_token_accuracy": 0.5995143771171569,
|
|
"num_tokens": 105472057.0,
|
|
"step": 1675
|
|
},
|
|
{
|
|
"epoch": 5.490998363338789,
|
|
"grad_norm": 32.41724618948233,
|
|
"learning_rate": 4.785715758874428e-09,
|
|
"loss": 2.36,
|
|
"mean_token_accuracy": 0.6099441170692443,
|
|
"num_tokens": 105789028.0,
|
|
"step": 1680
|
|
},
|
|
{
|
|
"epoch": 5.5073649754500815,
|
|
"grad_norm": 33.039731094631016,
|
|
"learning_rate": 4.783399426391786e-09,
|
|
"loss": 2.3617,
|
|
"mean_token_accuracy": 0.6067679703235627,
|
|
"num_tokens": 106105035.0,
|
|
"step": 1685
|
|
},
|
|
{
|
|
"epoch": 5.523731587561375,
|
|
"grad_norm": 32.798434224683184,
|
|
"learning_rate": 4.781071208515665e-09,
|
|
"loss": 2.3335,
|
|
"mean_token_accuracy": 0.6111400902271271,
|
|
"num_tokens": 106420164.0,
|
|
"step": 1690
|
|
},
|
|
{
|
|
"epoch": 5.540098199672668,
|
|
"grad_norm": 31.195355670151503,
|
|
"learning_rate": 4.778731117364744e-09,
|
|
"loss": 2.3537,
|
|
"mean_token_accuracy": 0.6082989335060119,
|
|
"num_tokens": 106733630.0,
|
|
"step": 1695
|
|
},
|
|
{
|
|
"epoch": 5.556464811783961,
|
|
"grad_norm": 32.15524904031231,
|
|
"learning_rate": 4.7763791651195035e-09,
|
|
"loss": 2.3555,
|
|
"mean_token_accuracy": 0.607761561870575,
|
|
"num_tokens": 107049953.0,
|
|
"step": 1700
|
|
},
|
|
{
|
|
"epoch": 5.572831423895254,
|
|
"grad_norm": 31.920502634421535,
|
|
"learning_rate": 4.774015364022165e-09,
|
|
"loss": 2.3545,
|
|
"mean_token_accuracy": 0.6080652475357056,
|
|
"num_tokens": 107365569.0,
|
|
"step": 1705
|
|
},
|
|
{
|
|
"epoch": 5.589198036006547,
|
|
"grad_norm": 30.981864636239646,
|
|
"learning_rate": 4.7716397263766215e-09,
|
|
"loss": 2.3248,
|
|
"mean_token_accuracy": 0.6130104541778565,
|
|
"num_tokens": 107681393.0,
|
|
"step": 1710
|
|
},
|
|
{
|
|
"epoch": 5.60556464811784,
|
|
"grad_norm": 31.488785549629814,
|
|
"learning_rate": 4.7692522645483796e-09,
|
|
"loss": 2.3347,
|
|
"mean_token_accuracy": 0.6084707498550415,
|
|
"num_tokens": 107996454.0,
|
|
"step": 1715
|
|
},
|
|
{
|
|
"epoch": 5.6219312602291325,
|
|
"grad_norm": 32.70963616909119,
|
|
"learning_rate": 4.766852990964491e-09,
|
|
"loss": 2.3575,
|
|
"mean_token_accuracy": 0.6064568817615509,
|
|
"num_tokens": 108311689.0,
|
|
"step": 1720
|
|
},
|
|
{
|
|
"epoch": 5.638297872340425,
|
|
"grad_norm": 30.48960666827236,
|
|
"learning_rate": 4.76444191811349e-09,
|
|
"loss": 2.3296,
|
|
"mean_token_accuracy": 0.6116202116012573,
|
|
"num_tokens": 108628481.0,
|
|
"step": 1725
|
|
},
|
|
{
|
|
"epoch": 5.654664484451718,
|
|
"grad_norm": 31.46744994781824,
|
|
"learning_rate": 4.762019058545326e-09,
|
|
"loss": 2.3357,
|
|
"mean_token_accuracy": 0.6112555265426636,
|
|
"num_tokens": 108944800.0,
|
|
"step": 1730
|
|
},
|
|
{
|
|
"epoch": 5.671031096563011,
|
|
"grad_norm": 31.5451605308921,
|
|
"learning_rate": 4.759584424871302e-09,
|
|
"loss": 2.329,
|
|
"mean_token_accuracy": 0.6095126152038575,
|
|
"num_tokens": 109260521.0,
|
|
"step": 1735
|
|
},
|
|
{
|
|
"epoch": 5.687397708674304,
|
|
"grad_norm": 31.012545675589063,
|
|
"learning_rate": 4.757138029764003e-09,
|
|
"loss": 2.3571,
|
|
"mean_token_accuracy": 0.6056615591049195,
|
|
"num_tokens": 109574830.0,
|
|
"step": 1740
|
|
},
|
|
{
|
|
"epoch": 5.703764320785598,
|
|
"grad_norm": 30.138938836035837,
|
|
"learning_rate": 4.754679885957239e-09,
|
|
"loss": 2.3023,
|
|
"mean_token_accuracy": 0.6127878904342652,
|
|
"num_tokens": 109890335.0,
|
|
"step": 1745
|
|
},
|
|
{
|
|
"epoch": 5.720130932896891,
|
|
"grad_norm": 29.217074277624654,
|
|
"learning_rate": 4.7522100062459706e-09,
|
|
"loss": 2.3089,
|
|
"mean_token_accuracy": 0.6136601090431213,
|
|
"num_tokens": 110205835.0,
|
|
"step": 1750
|
|
},
|
|
{
|
|
"epoch": 5.736497545008183,
|
|
"grad_norm": 30.51562401439012,
|
|
"learning_rate": 4.749728403486245e-09,
|
|
"loss": 2.3161,
|
|
"mean_token_accuracy": 0.6105396926403046,
|
|
"num_tokens": 110521915.0,
|
|
"step": 1755
|
|
},
|
|
{
|
|
"epoch": 5.752864157119476,
|
|
"grad_norm": 29.819759106075676,
|
|
"learning_rate": 4.747235090595129e-09,
|
|
"loss": 2.349,
|
|
"mean_token_accuracy": 0.6054076015949249,
|
|
"num_tokens": 110837238.0,
|
|
"step": 1760
|
|
},
|
|
{
|
|
"epoch": 5.769230769230769,
|
|
"grad_norm": 30.89858633499367,
|
|
"learning_rate": 4.744730080550646e-09,
|
|
"loss": 2.3386,
|
|
"mean_token_accuracy": 0.6067017555236817,
|
|
"num_tokens": 111153241.0,
|
|
"step": 1765
|
|
},
|
|
{
|
|
"epoch": 5.785597381342062,
|
|
"grad_norm": 30.91938995645458,
|
|
"learning_rate": 4.742213386391698e-09,
|
|
"loss": 2.3483,
|
|
"mean_token_accuracy": 0.604193365573883,
|
|
"num_tokens": 111468960.0,
|
|
"step": 1770
|
|
},
|
|
{
|
|
"epoch": 5.801963993453355,
|
|
"grad_norm": 27.323433001798556,
|
|
"learning_rate": 4.739685021218012e-09,
|
|
"loss": 2.2693,
|
|
"mean_token_accuracy": 0.6164486765861511,
|
|
"num_tokens": 111786342.0,
|
|
"step": 1775
|
|
},
|
|
{
|
|
"epoch": 5.818330605564648,
|
|
"grad_norm": 30.739373596899014,
|
|
"learning_rate": 4.737144998190057e-09,
|
|
"loss": 2.3301,
|
|
"mean_token_accuracy": 0.6068127572536468,
|
|
"num_tokens": 112101386.0,
|
|
"step": 1780
|
|
},
|
|
{
|
|
"epoch": 5.8346972176759415,
|
|
"grad_norm": 29.304255959591316,
|
|
"learning_rate": 4.734593330528989e-09,
|
|
"loss": 2.2716,
|
|
"mean_token_accuracy": 0.6155919015407563,
|
|
"num_tokens": 112415452.0,
|
|
"step": 1785
|
|
},
|
|
{
|
|
"epoch": 5.851063829787234,
|
|
"grad_norm": 29.784953751928743,
|
|
"learning_rate": 4.732030031516571e-09,
|
|
"loss": 2.2885,
|
|
"mean_token_accuracy": 0.6114451467990876,
|
|
"num_tokens": 112730033.0,
|
|
"step": 1790
|
|
},
|
|
{
|
|
"epoch": 5.867430441898527,
|
|
"grad_norm": 28.262213285997248,
|
|
"learning_rate": 4.72945511449511e-09,
|
|
"loss": 2.3039,
|
|
"mean_token_accuracy": 0.6072455883026123,
|
|
"num_tokens": 113045711.0,
|
|
"step": 1795
|
|
},
|
|
{
|
|
"epoch": 5.88379705400982,
|
|
"grad_norm": 28.352633321650952,
|
|
"learning_rate": 4.726868592867388e-09,
|
|
"loss": 2.266,
|
|
"mean_token_accuracy": 0.6129730522632599,
|
|
"num_tokens": 113360554.0,
|
|
"step": 1800
|
|
},
|
|
{
|
|
"epoch": 5.900163666121113,
|
|
"grad_norm": 28.894218746546375,
|
|
"learning_rate": 4.724270480096589e-09,
|
|
"loss": 2.298,
|
|
"mean_token_accuracy": 0.6091433703899384,
|
|
"num_tokens": 113675208.0,
|
|
"step": 1805
|
|
},
|
|
{
|
|
"epoch": 5.916530278232406,
|
|
"grad_norm": 26.932849836839313,
|
|
"learning_rate": 4.721660789706232e-09,
|
|
"loss": 2.2583,
|
|
"mean_token_accuracy": 0.6158608973026276,
|
|
"num_tokens": 113990879.0,
|
|
"step": 1810
|
|
},
|
|
{
|
|
"epoch": 5.932896890343699,
|
|
"grad_norm": 26.642378048564314,
|
|
"learning_rate": 4.719039535280095e-09,
|
|
"loss": 2.2566,
|
|
"mean_token_accuracy": 0.6155524849891663,
|
|
"num_tokens": 114306432.0,
|
|
"step": 1815
|
|
},
|
|
{
|
|
"epoch": 5.949263502454992,
|
|
"grad_norm": 26.350950920645698,
|
|
"learning_rate": 4.716406730462153e-09,
|
|
"loss": 2.2372,
|
|
"mean_token_accuracy": 0.6172758162021637,
|
|
"num_tokens": 114622631.0,
|
|
"step": 1820
|
|
},
|
|
{
|
|
"epoch": 5.9656301145662844,
|
|
"grad_norm": 27.44466078553468,
|
|
"learning_rate": 4.713762388956501e-09,
|
|
"loss": 2.262,
|
|
"mean_token_accuracy": 0.6123747944831848,
|
|
"num_tokens": 114937555.0,
|
|
"step": 1825
|
|
},
|
|
{
|
|
"epoch": 5.981996726677577,
|
|
"grad_norm": 26.323128620614657,
|
|
"learning_rate": 4.71110652452728e-09,
|
|
"loss": 2.2493,
|
|
"mean_token_accuracy": 0.6143295288085937,
|
|
"num_tokens": 115252837.0,
|
|
"step": 1830
|
|
},
|
|
{
|
|
"epoch": 5.998363338788871,
|
|
"grad_norm": 25.783928944992137,
|
|
"learning_rate": 4.7084391509986155e-09,
|
|
"loss": 2.2469,
|
|
"mean_token_accuracy": 0.613012844324112,
|
|
"num_tokens": 115568359.0,
|
|
"step": 1835
|
|
},
|
|
{
|
|
"epoch": 6.013093289689034,
|
|
"grad_norm": 25.525666646792722,
|
|
"learning_rate": 4.705760282254537e-09,
|
|
"loss": 2.2152,
|
|
"mean_token_accuracy": 0.6208844979604086,
|
|
"num_tokens": 115828706.0,
|
|
"step": 1840
|
|
},
|
|
{
|
|
"epoch": 6.029459901800327,
|
|
"grad_norm": 26.755470190379572,
|
|
"learning_rate": 4.703069932238906e-09,
|
|
"loss": 2.263,
|
|
"mean_token_accuracy": 0.61106738448143,
|
|
"num_tokens": 116144638.0,
|
|
"step": 1845
|
|
},
|
|
{
|
|
"epoch": 6.045826513911621,
|
|
"grad_norm": 24.507907534529128,
|
|
"learning_rate": 4.7003681149553475e-09,
|
|
"loss": 2.263,
|
|
"mean_token_accuracy": 0.6099263489246368,
|
|
"num_tokens": 116458739.0,
|
|
"step": 1850
|
|
},
|
|
{
|
|
"epoch": 6.062193126022914,
|
|
"grad_norm": 26.141828238324674,
|
|
"learning_rate": 4.697654844467175e-09,
|
|
"loss": 2.21,
|
|
"mean_token_accuracy": 0.6175226509571076,
|
|
"num_tokens": 116773686.0,
|
|
"step": 1855
|
|
},
|
|
{
|
|
"epoch": 6.0785597381342065,
|
|
"grad_norm": 25.07581997868601,
|
|
"learning_rate": 4.6949301348973174e-09,
|
|
"loss": 2.2532,
|
|
"mean_token_accuracy": 0.6104423046112061,
|
|
"num_tokens": 117090382.0,
|
|
"step": 1860
|
|
},
|
|
{
|
|
"epoch": 6.094926350245499,
|
|
"grad_norm": 24.482814976190816,
|
|
"learning_rate": 4.692194000428245e-09,
|
|
"loss": 2.2089,
|
|
"mean_token_accuracy": 0.6172590911388397,
|
|
"num_tokens": 117405941.0,
|
|
"step": 1865
|
|
},
|
|
{
|
|
"epoch": 6.111292962356792,
|
|
"grad_norm": 24.43522181875196,
|
|
"learning_rate": 4.6894464553018976e-09,
|
|
"loss": 2.2496,
|
|
"mean_token_accuracy": 0.6107006132602691,
|
|
"num_tokens": 117722695.0,
|
|
"step": 1870
|
|
},
|
|
{
|
|
"epoch": 6.127659574468085,
|
|
"grad_norm": 27.28050902421836,
|
|
"learning_rate": 4.686687513819606e-09,
|
|
"loss": 2.187,
|
|
"mean_token_accuracy": 0.6217321693897248,
|
|
"num_tokens": 118037466.0,
|
|
"step": 1875
|
|
},
|
|
{
|
|
"epoch": 6.144026186579378,
|
|
"grad_norm": 24.463275743963848,
|
|
"learning_rate": 4.6839171903420245e-09,
|
|
"loss": 2.1935,
|
|
"mean_token_accuracy": 0.6205372273921966,
|
|
"num_tokens": 118352819.0,
|
|
"step": 1880
|
|
},
|
|
{
|
|
"epoch": 6.160392798690671,
|
|
"grad_norm": 24.00192156906821,
|
|
"learning_rate": 4.681135499289048e-09,
|
|
"loss": 2.2127,
|
|
"mean_token_accuracy": 0.6138154864311218,
|
|
"num_tokens": 118668463.0,
|
|
"step": 1885
|
|
},
|
|
{
|
|
"epoch": 6.176759410801964,
|
|
"grad_norm": 24.372092039508235,
|
|
"learning_rate": 4.678342455139744e-09,
|
|
"loss": 2.1885,
|
|
"mean_token_accuracy": 0.6191801726818085,
|
|
"num_tokens": 118983231.0,
|
|
"step": 1890
|
|
},
|
|
{
|
|
"epoch": 6.1931260229132565,
|
|
"grad_norm": 23.624206455606668,
|
|
"learning_rate": 4.675538072432276e-09,
|
|
"loss": 2.2036,
|
|
"mean_token_accuracy": 0.6157269537448883,
|
|
"num_tokens": 119298059.0,
|
|
"step": 1895
|
|
},
|
|
{
|
|
"epoch": 6.20949263502455,
|
|
"grad_norm": 23.629247772201367,
|
|
"learning_rate": 4.672722365763821e-09,
|
|
"loss": 2.1807,
|
|
"mean_token_accuracy": 0.6190297603607178,
|
|
"num_tokens": 119613994.0,
|
|
"step": 1900
|
|
},
|
|
{
|
|
"epoch": 6.225859247135843,
|
|
"grad_norm": 23.635670695000215,
|
|
"learning_rate": 4.669895349790502e-09,
|
|
"loss": 2.2345,
|
|
"mean_token_accuracy": 0.6094496965408325,
|
|
"num_tokens": 119930637.0,
|
|
"step": 1905
|
|
},
|
|
{
|
|
"epoch": 6.242225859247136,
|
|
"grad_norm": 23.665973763482956,
|
|
"learning_rate": 4.667057039227308e-09,
|
|
"loss": 2.1728,
|
|
"mean_token_accuracy": 0.6191839039325714,
|
|
"num_tokens": 120245981.0,
|
|
"step": 1910
|
|
},
|
|
{
|
|
"epoch": 6.258592471358429,
|
|
"grad_norm": 24.159573167994676,
|
|
"learning_rate": 4.664207448848018e-09,
|
|
"loss": 2.1897,
|
|
"mean_token_accuracy": 0.6166786015033722,
|
|
"num_tokens": 120560959.0,
|
|
"step": 1915
|
|
},
|
|
{
|
|
"epoch": 6.274959083469722,
|
|
"grad_norm": 22.631008864459677,
|
|
"learning_rate": 4.661346593485124e-09,
|
|
"loss": 2.1623,
|
|
"mean_token_accuracy": 0.6211640655994415,
|
|
"num_tokens": 120878150.0,
|
|
"step": 1920
|
|
},
|
|
{
|
|
"epoch": 6.291325695581015,
|
|
"grad_norm": 23.296439842018362,
|
|
"learning_rate": 4.658474488029753e-09,
|
|
"loss": 2.1886,
|
|
"mean_token_accuracy": 0.6151513159275055,
|
|
"num_tokens": 121191333.0,
|
|
"step": 1925
|
|
},
|
|
{
|
|
"epoch": 6.3076923076923075,
|
|
"grad_norm": 22.80831617398982,
|
|
"learning_rate": 4.655591147431589e-09,
|
|
"loss": 2.1641,
|
|
"mean_token_accuracy": 0.6197402775287628,
|
|
"num_tokens": 121506516.0,
|
|
"step": 1930
|
|
},
|
|
{
|
|
"epoch": 6.3240589198036,
|
|
"grad_norm": 22.872432441327586,
|
|
"learning_rate": 4.652696586698801e-09,
|
|
"loss": 2.1717,
|
|
"mean_token_accuracy": 0.6172288477420806,
|
|
"num_tokens": 121820087.0,
|
|
"step": 1935
|
|
},
|
|
{
|
|
"epoch": 6.340425531914893,
|
|
"grad_norm": 21.855626803707125,
|
|
"learning_rate": 4.649790820897955e-09,
|
|
"loss": 2.1661,
|
|
"mean_token_accuracy": 0.6183198809623718,
|
|
"num_tokens": 122136793.0,
|
|
"step": 1940
|
|
},
|
|
{
|
|
"epoch": 6.356792144026187,
|
|
"grad_norm": 21.4551134123129,
|
|
"learning_rate": 4.646873865153944e-09,
|
|
"loss": 2.1404,
|
|
"mean_token_accuracy": 0.622417813539505,
|
|
"num_tokens": 122452611.0,
|
|
"step": 1945
|
|
},
|
|
{
|
|
"epoch": 6.37315875613748,
|
|
"grad_norm": 22.172464286307513,
|
|
"learning_rate": 4.6439457346499045e-09,
|
|
"loss": 2.1567,
|
|
"mean_token_accuracy": 0.6196485161781311,
|
|
"num_tokens": 122766828.0,
|
|
"step": 1950
|
|
},
|
|
{
|
|
"epoch": 6.389525368248773,
|
|
"grad_norm": 22.38339117966395,
|
|
"learning_rate": 4.641006444627141e-09,
|
|
"loss": 2.1793,
|
|
"mean_token_accuracy": 0.6164099514484406,
|
|
"num_tokens": 123081686.0,
|
|
"step": 1955
|
|
},
|
|
{
|
|
"epoch": 6.405891980360066,
|
|
"grad_norm": 23.618424749681687,
|
|
"learning_rate": 4.638056010385042e-09,
|
|
"loss": 2.1862,
|
|
"mean_token_accuracy": 0.6123735785484314,
|
|
"num_tokens": 123398051.0,
|
|
"step": 1960
|
|
},
|
|
{
|
|
"epoch": 6.422258592471358,
|
|
"grad_norm": 21.65478232539387,
|
|
"learning_rate": 4.635094447281006e-09,
|
|
"loss": 2.16,
|
|
"mean_token_accuracy": 0.6157838463783264,
|
|
"num_tokens": 123712386.0,
|
|
"step": 1965
|
|
},
|
|
{
|
|
"epoch": 6.438625204582651,
|
|
"grad_norm": 21.99814700391443,
|
|
"learning_rate": 4.632121770730357e-09,
|
|
"loss": 2.1315,
|
|
"mean_token_accuracy": 0.6206058323383331,
|
|
"num_tokens": 124028129.0,
|
|
"step": 1970
|
|
},
|
|
{
|
|
"epoch": 6.454991816693944,
|
|
"grad_norm": 21.24229531471878,
|
|
"learning_rate": 4.629137996206266e-09,
|
|
"loss": 2.1896,
|
|
"mean_token_accuracy": 0.6120827198028564,
|
|
"num_tokens": 124343973.0,
|
|
"step": 1975
|
|
},
|
|
{
|
|
"epoch": 6.471358428805237,
|
|
"grad_norm": 21.20761731125876,
|
|
"learning_rate": 4.62614313923967e-09,
|
|
"loss": 2.1334,
|
|
"mean_token_accuracy": 0.619620543718338,
|
|
"num_tokens": 124659135.0,
|
|
"step": 1980
|
|
},
|
|
{
|
|
"epoch": 6.48772504091653,
|
|
"grad_norm": 21.694275642010822,
|
|
"learning_rate": 4.623137215419194e-09,
|
|
"loss": 2.1736,
|
|
"mean_token_accuracy": 0.6120718121528625,
|
|
"num_tokens": 124975035.0,
|
|
"step": 1985
|
|
},
|
|
{
|
|
"epoch": 6.504091653027823,
|
|
"grad_norm": 20.732129794066694,
|
|
"learning_rate": 4.620120240391064e-09,
|
|
"loss": 2.135,
|
|
"mean_token_accuracy": 0.6189526736736297,
|
|
"num_tokens": 125290692.0,
|
|
"step": 1990
|
|
},
|
|
{
|
|
"epoch": 6.5204582651391165,
|
|
"grad_norm": 21.938805671841106,
|
|
"learning_rate": 4.6170922298590336e-09,
|
|
"loss": 2.1461,
|
|
"mean_token_accuracy": 0.6168884932994843,
|
|
"num_tokens": 125605387.0,
|
|
"step": 1995
|
|
},
|
|
{
|
|
"epoch": 6.536824877250409,
|
|
"grad_norm": 21.852887235656542,
|
|
"learning_rate": 4.614053199584291e-09,
|
|
"loss": 2.1565,
|
|
"mean_token_accuracy": 0.615131002664566,
|
|
"num_tokens": 125923044.0,
|
|
"step": 2000
|
|
},
|
|
{
|
|
"epoch": 6.553191489361702,
|
|
"grad_norm": 21.61436281642574,
|
|
"learning_rate": 4.611003165385389e-09,
|
|
"loss": 2.1705,
|
|
"mean_token_accuracy": 0.614033317565918,
|
|
"num_tokens": 126239452.0,
|
|
"step": 2005
|
|
},
|
|
{
|
|
"epoch": 6.569558101472995,
|
|
"grad_norm": 21.50966636839898,
|
|
"learning_rate": 4.607942143138157e-09,
|
|
"loss": 2.1953,
|
|
"mean_token_accuracy": 0.6101115584373474,
|
|
"num_tokens": 126556662.0,
|
|
"step": 2010
|
|
},
|
|
{
|
|
"epoch": 6.585924713584288,
|
|
"grad_norm": 20.995613724620096,
|
|
"learning_rate": 4.6048701487756136e-09,
|
|
"loss": 2.1428,
|
|
"mean_token_accuracy": 0.6168196797370911,
|
|
"num_tokens": 126871126.0,
|
|
"step": 2015
|
|
},
|
|
{
|
|
"epoch": 6.602291325695581,
|
|
"grad_norm": 22.384610556987017,
|
|
"learning_rate": 4.601787198287896e-09,
|
|
"loss": 2.1824,
|
|
"mean_token_accuracy": 0.6081099033355712,
|
|
"num_tokens": 127187229.0,
|
|
"step": 2020
|
|
},
|
|
{
|
|
"epoch": 6.618657937806874,
|
|
"grad_norm": 21.879272078386037,
|
|
"learning_rate": 4.598693307722165e-09,
|
|
"loss": 2.1545,
|
|
"mean_token_accuracy": 0.6163994073867798,
|
|
"num_tokens": 127503607.0,
|
|
"step": 2025
|
|
},
|
|
{
|
|
"epoch": 6.635024549918167,
|
|
"grad_norm": 21.28896281508814,
|
|
"learning_rate": 4.595588493182525e-09,
|
|
"loss": 2.1428,
|
|
"mean_token_accuracy": 0.6177895963191986,
|
|
"num_tokens": 127820570.0,
|
|
"step": 2030
|
|
},
|
|
{
|
|
"epoch": 6.65139116202946,
|
|
"grad_norm": 20.16858032321011,
|
|
"learning_rate": 4.592472770829945e-09,
|
|
"loss": 2.0875,
|
|
"mean_token_accuracy": 0.6251370906829834,
|
|
"num_tokens": 128136630.0,
|
|
"step": 2035
|
|
},
|
|
{
|
|
"epoch": 6.667757774140753,
|
|
"grad_norm": 20.37161020175876,
|
|
"learning_rate": 4.589346156882167e-09,
|
|
"loss": 2.1434,
|
|
"mean_token_accuracy": 0.6160525560379029,
|
|
"num_tokens": 128451184.0,
|
|
"step": 2040
|
|
},
|
|
{
|
|
"epoch": 6.684124386252046,
|
|
"grad_norm": 20.995068375593643,
|
|
"learning_rate": 4.5862086676136275e-09,
|
|
"loss": 2.1157,
|
|
"mean_token_accuracy": 0.6187912404537201,
|
|
"num_tokens": 128767986.0,
|
|
"step": 2045
|
|
},
|
|
{
|
|
"epoch": 6.700490998363339,
|
|
"grad_norm": 20.03837123793174,
|
|
"learning_rate": 4.5830603193553685e-09,
|
|
"loss": 2.1128,
|
|
"mean_token_accuracy": 0.6212022960186004,
|
|
"num_tokens": 129083628.0,
|
|
"step": 2050
|
|
},
|
|
{
|
|
"epoch": 6.716857610474632,
|
|
"grad_norm": 21.732839017733074,
|
|
"learning_rate": 4.579901128494958e-09,
|
|
"loss": 2.138,
|
|
"mean_token_accuracy": 0.6142685532569885,
|
|
"num_tokens": 129398358.0,
|
|
"step": 2055
|
|
},
|
|
{
|
|
"epoch": 6.733224222585925,
|
|
"grad_norm": 21.3996768207252,
|
|
"learning_rate": 4.576731111476395e-09,
|
|
"loss": 2.1406,
|
|
"mean_token_accuracy": 0.614562475681305,
|
|
"num_tokens": 129713765.0,
|
|
"step": 2060
|
|
},
|
|
{
|
|
"epoch": 6.7495908346972175,
|
|
"grad_norm": 20.86887078387194,
|
|
"learning_rate": 4.5735502848000375e-09,
|
|
"loss": 2.1023,
|
|
"mean_token_accuracy": 0.6210401713848114,
|
|
"num_tokens": 130029521.0,
|
|
"step": 2065
|
|
},
|
|
{
|
|
"epoch": 6.76595744680851,
|
|
"grad_norm": 21.35259791338893,
|
|
"learning_rate": 4.570358665022504e-09,
|
|
"loss": 2.1534,
|
|
"mean_token_accuracy": 0.611782455444336,
|
|
"num_tokens": 130343583.0,
|
|
"step": 2070
|
|
},
|
|
{
|
|
"epoch": 6.782324058919803,
|
|
"grad_norm": 20.350039593083146,
|
|
"learning_rate": 4.567156268756594e-09,
|
|
"loss": 2.117,
|
|
"mean_token_accuracy": 0.6170144140720367,
|
|
"num_tokens": 130660087.0,
|
|
"step": 2075
|
|
},
|
|
{
|
|
"epoch": 6.798690671031096,
|
|
"grad_norm": 21.746541271091658,
|
|
"learning_rate": 4.5639431126712e-09,
|
|
"loss": 2.1353,
|
|
"mean_token_accuracy": 0.6146936535835266,
|
|
"num_tokens": 130976790.0,
|
|
"step": 2080
|
|
},
|
|
{
|
|
"epoch": 6.81505728314239,
|
|
"grad_norm": 21.651718945925616,
|
|
"learning_rate": 4.56071921349122e-09,
|
|
"loss": 2.1343,
|
|
"mean_token_accuracy": 0.6141116917133331,
|
|
"num_tokens": 131294454.0,
|
|
"step": 2085
|
|
},
|
|
{
|
|
"epoch": 6.831423895253683,
|
|
"grad_norm": 21.237894277524884,
|
|
"learning_rate": 4.557484587997473e-09,
|
|
"loss": 2.1273,
|
|
"mean_token_accuracy": 0.6154871582984924,
|
|
"num_tokens": 131610087.0,
|
|
"step": 2090
|
|
},
|
|
{
|
|
"epoch": 6.847790507364976,
|
|
"grad_norm": 21.281926833038302,
|
|
"learning_rate": 4.55423925302661e-09,
|
|
"loss": 2.1393,
|
|
"mean_token_accuracy": 0.6141292870044708,
|
|
"num_tokens": 131926207.0,
|
|
"step": 2095
|
|
},
|
|
{
|
|
"epoch": 6.8641571194762685,
|
|
"grad_norm": 20.402026254120138,
|
|
"learning_rate": 4.550983225471023e-09,
|
|
"loss": 2.1273,
|
|
"mean_token_accuracy": 0.61502326130867,
|
|
"num_tokens": 132239715.0,
|
|
"step": 2100
|
|
},
|
|
{
|
|
"epoch": 6.880523731587561,
|
|
"grad_norm": 21.317416165671755,
|
|
"learning_rate": 4.547716522278764e-09,
|
|
"loss": 2.1389,
|
|
"mean_token_accuracy": 0.6126947045326233,
|
|
"num_tokens": 132556394.0,
|
|
"step": 2105
|
|
},
|
|
{
|
|
"epoch": 6.896890343698854,
|
|
"grad_norm": 20.709431005668037,
|
|
"learning_rate": 4.5444391604534505e-09,
|
|
"loss": 2.1358,
|
|
"mean_token_accuracy": 0.6119840562343597,
|
|
"num_tokens": 132870997.0,
|
|
"step": 2110
|
|
},
|
|
{
|
|
"epoch": 6.913256955810147,
|
|
"grad_norm": 20.72493284331367,
|
|
"learning_rate": 4.5411511570541815e-09,
|
|
"loss": 2.1086,
|
|
"mean_token_accuracy": 0.6191326141357422,
|
|
"num_tokens": 133187277.0,
|
|
"step": 2115
|
|
},
|
|
{
|
|
"epoch": 6.92962356792144,
|
|
"grad_norm": 20.401506768438352,
|
|
"learning_rate": 4.5378525291954456e-09,
|
|
"loss": 2.1036,
|
|
"mean_token_accuracy": 0.6179743111133575,
|
|
"num_tokens": 133504379.0,
|
|
"step": 2120
|
|
},
|
|
{
|
|
"epoch": 6.945990180032734,
|
|
"grad_norm": 21.151405028030936,
|
|
"learning_rate": 4.534543294047033e-09,
|
|
"loss": 2.1407,
|
|
"mean_token_accuracy": 0.6133243441581726,
|
|
"num_tokens": 133819166.0,
|
|
"step": 2125
|
|
},
|
|
{
|
|
"epoch": 6.962356792144027,
|
|
"grad_norm": 20.77846794569156,
|
|
"learning_rate": 4.5312234688339474e-09,
|
|
"loss": 2.1495,
|
|
"mean_token_accuracy": 0.6119216084480286,
|
|
"num_tokens": 134135526.0,
|
|
"step": 2130
|
|
},
|
|
{
|
|
"epoch": 6.9787234042553195,
|
|
"grad_norm": 20.024591823948676,
|
|
"learning_rate": 4.527893070836314e-09,
|
|
"loss": 2.0845,
|
|
"mean_token_accuracy": 0.6207044720649719,
|
|
"num_tokens": 134451520.0,
|
|
"step": 2135
|
|
},
|
|
{
|
|
"epoch": 6.995090016366612,
|
|
"grad_norm": 20.40853541843752,
|
|
"learning_rate": 4.52455211738929e-09,
|
|
"loss": 2.083,
|
|
"mean_token_accuracy": 0.6208429157733917,
|
|
"num_tokens": 134768176.0,
|
|
"step": 2140
|
|
},
|
|
{
|
|
"epoch": 7.009819967266775,
|
|
"grad_norm": 20.932141294235674,
|
|
"learning_rate": 4.521200625882978e-09,
|
|
"loss": 2.0872,
|
|
"mean_token_accuracy": 0.6209465795093112,
|
|
"num_tokens": 135029611.0,
|
|
"step": 2145
|
|
},
|
|
{
|
|
"epoch": 7.026186579378069,
|
|
"grad_norm": 19.94403526331167,
|
|
"learning_rate": 4.517838613762331e-09,
|
|
"loss": 2.071,
|
|
"mean_token_accuracy": 0.6247387766838074,
|
|
"num_tokens": 135344576.0,
|
|
"step": 2150
|
|
},
|
|
{
|
|
"epoch": 7.042553191489362,
|
|
"grad_norm": 20.2959867414438,
|
|
"learning_rate": 4.514466098527062e-09,
|
|
"loss": 2.1312,
|
|
"mean_token_accuracy": 0.6129837095737457,
|
|
"num_tokens": 135660597.0,
|
|
"step": 2155
|
|
},
|
|
{
|
|
"epoch": 7.058919803600655,
|
|
"grad_norm": 20.13114916587158,
|
|
"learning_rate": 4.5110830977315555e-09,
|
|
"loss": 2.1074,
|
|
"mean_token_accuracy": 0.6167091131210327,
|
|
"num_tokens": 135976424.0,
|
|
"step": 2160
|
|
},
|
|
{
|
|
"epoch": 7.075286415711948,
|
|
"grad_norm": 20.66542558789711,
|
|
"learning_rate": 4.5076896289847735e-09,
|
|
"loss": 2.1191,
|
|
"mean_token_accuracy": 0.6145562171936035,
|
|
"num_tokens": 136292501.0,
|
|
"step": 2165
|
|
},
|
|
{
|
|
"epoch": 7.091653027823241,
|
|
"grad_norm": 21.114797520019756,
|
|
"learning_rate": 4.504285709950167e-09,
|
|
"loss": 2.1162,
|
|
"mean_token_accuracy": 0.6152336478233338,
|
|
"num_tokens": 136606279.0,
|
|
"step": 2170
|
|
},
|
|
{
|
|
"epoch": 7.1080196399345335,
|
|
"grad_norm": 20.14558765288978,
|
|
"learning_rate": 4.50087135834558e-09,
|
|
"loss": 2.0733,
|
|
"mean_token_accuracy": 0.6247061014175415,
|
|
"num_tokens": 136922670.0,
|
|
"step": 2175
|
|
},
|
|
{
|
|
"epoch": 7.124386252045826,
|
|
"grad_norm": 20.9247828857839,
|
|
"learning_rate": 4.497446591943162e-09,
|
|
"loss": 2.0978,
|
|
"mean_token_accuracy": 0.6188047885894775,
|
|
"num_tokens": 137236588.0,
|
|
"step": 2180
|
|
},
|
|
{
|
|
"epoch": 7.140752864157119,
|
|
"grad_norm": 21.082001804171576,
|
|
"learning_rate": 4.494011428569269e-09,
|
|
"loss": 2.1465,
|
|
"mean_token_accuracy": 0.609574556350708,
|
|
"num_tokens": 137550798.0,
|
|
"step": 2185
|
|
},
|
|
{
|
|
"epoch": 7.157119476268412,
|
|
"grad_norm": 20.902296160839647,
|
|
"learning_rate": 4.490565886104378e-09,
|
|
"loss": 2.1155,
|
|
"mean_token_accuracy": 0.6145494759082795,
|
|
"num_tokens": 137866384.0,
|
|
"step": 2190
|
|
},
|
|
{
|
|
"epoch": 7.173486088379706,
|
|
"grad_norm": 20.191528730933655,
|
|
"learning_rate": 4.487109982482991e-09,
|
|
"loss": 2.0624,
|
|
"mean_token_accuracy": 0.6228398621082306,
|
|
"num_tokens": 138183138.0,
|
|
"step": 2195
|
|
},
|
|
{
|
|
"epoch": 7.189852700490999,
|
|
"grad_norm": 19.70846607577594,
|
|
"learning_rate": 4.483643735693537e-09,
|
|
"loss": 2.0661,
|
|
"mean_token_accuracy": 0.6221046924591065,
|
|
"num_tokens": 138497274.0,
|
|
"step": 2200
|
|
},
|
|
{
|
|
"epoch": 7.2062193126022915,
|
|
"grad_norm": 21.054465864675535,
|
|
"learning_rate": 4.480167163778287e-09,
|
|
"loss": 2.1081,
|
|
"mean_token_accuracy": 0.614901089668274,
|
|
"num_tokens": 138812244.0,
|
|
"step": 2205
|
|
},
|
|
{
|
|
"epoch": 7.222585924713584,
|
|
"grad_norm": 20.177884489958767,
|
|
"learning_rate": 4.476680284833252e-09,
|
|
"loss": 2.059,
|
|
"mean_token_accuracy": 0.6242061018943786,
|
|
"num_tokens": 139127574.0,
|
|
"step": 2210
|
|
},
|
|
{
|
|
"epoch": 7.238952536824877,
|
|
"grad_norm": 20.50976864112745,
|
|
"learning_rate": 4.473183117008096e-09,
|
|
"loss": 2.104,
|
|
"mean_token_accuracy": 0.6162344992160798,
|
|
"num_tokens": 139444561.0,
|
|
"step": 2215
|
|
},
|
|
{
|
|
"epoch": 7.25531914893617,
|
|
"grad_norm": 19.844812284201744,
|
|
"learning_rate": 4.469675678506035e-09,
|
|
"loss": 2.0767,
|
|
"mean_token_accuracy": 0.6197092831134796,
|
|
"num_tokens": 139759101.0,
|
|
"step": 2220
|
|
},
|
|
{
|
|
"epoch": 7.271685761047463,
|
|
"grad_norm": 20.125104087682654,
|
|
"learning_rate": 4.466157987583747e-09,
|
|
"loss": 2.0822,
|
|
"mean_token_accuracy": 0.6182936906814576,
|
|
"num_tokens": 140073771.0,
|
|
"step": 2225
|
|
},
|
|
{
|
|
"epoch": 7.288052373158756,
|
|
"grad_norm": 20.428231845938967,
|
|
"learning_rate": 4.462630062551274e-09,
|
|
"loss": 2.0704,
|
|
"mean_token_accuracy": 0.621297436952591,
|
|
"num_tokens": 140389001.0,
|
|
"step": 2230
|
|
},
|
|
{
|
|
"epoch": 7.304418985270049,
|
|
"grad_norm": 19.954165544812422,
|
|
"learning_rate": 4.459091921771929e-09,
|
|
"loss": 2.0792,
|
|
"mean_token_accuracy": 0.6181824147701264,
|
|
"num_tokens": 140704788.0,
|
|
"step": 2235
|
|
},
|
|
{
|
|
"epoch": 7.320785597381342,
|
|
"grad_norm": 20.204841527212057,
|
|
"learning_rate": 4.455543583662199e-09,
|
|
"loss": 2.0928,
|
|
"mean_token_accuracy": 0.6172252476215363,
|
|
"num_tokens": 141020036.0,
|
|
"step": 2240
|
|
},
|
|
{
|
|
"epoch": 7.337152209492635,
|
|
"grad_norm": 20.162303951401036,
|
|
"learning_rate": 4.451985066691649e-09,
|
|
"loss": 2.0907,
|
|
"mean_token_accuracy": 0.6169468641281128,
|
|
"num_tokens": 141335334.0,
|
|
"step": 2245
|
|
},
|
|
{
|
|
"epoch": 7.353518821603928,
|
|
"grad_norm": 19.880239575791986,
|
|
"learning_rate": 4.448416389382826e-09,
|
|
"loss": 2.0814,
|
|
"mean_token_accuracy": 0.6201544046401978,
|
|
"num_tokens": 141651325.0,
|
|
"step": 2250
|
|
},
|
|
{
|
|
"epoch": 7.369885433715221,
|
|
"grad_norm": 19.837865962881644,
|
|
"learning_rate": 4.444837570311163e-09,
|
|
"loss": 2.077,
|
|
"mean_token_accuracy": 0.6198208451271057,
|
|
"num_tokens": 141967976.0,
|
|
"step": 2255
|
|
},
|
|
{
|
|
"epoch": 7.386252045826514,
|
|
"grad_norm": 19.56879716151567,
|
|
"learning_rate": 4.441248628104884e-09,
|
|
"loss": 2.0843,
|
|
"mean_token_accuracy": 0.617261779308319,
|
|
"num_tokens": 142284034.0,
|
|
"step": 2260
|
|
},
|
|
{
|
|
"epoch": 7.402618657937807,
|
|
"grad_norm": 20.20902374782504,
|
|
"learning_rate": 4.4376495814449034e-09,
|
|
"loss": 2.062,
|
|
"mean_token_accuracy": 0.6235301315784454,
|
|
"num_tokens": 142600358.0,
|
|
"step": 2265
|
|
},
|
|
{
|
|
"epoch": 7.4189852700491,
|
|
"grad_norm": 20.30083756077551,
|
|
"learning_rate": 4.4340404490647316e-09,
|
|
"loss": 2.1089,
|
|
"mean_token_accuracy": 0.6131757915019989,
|
|
"num_tokens": 142915407.0,
|
|
"step": 2270
|
|
},
|
|
{
|
|
"epoch": 7.435351882160393,
|
|
"grad_norm": 20.30697450610285,
|
|
"learning_rate": 4.4304212497503735e-09,
|
|
"loss": 2.0812,
|
|
"mean_token_accuracy": 0.6191363453865051,
|
|
"num_tokens": 143231393.0,
|
|
"step": 2275
|
|
},
|
|
{
|
|
"epoch": 7.451718494271685,
|
|
"grad_norm": 20.52437090532947,
|
|
"learning_rate": 4.42679200234024e-09,
|
|
"loss": 2.0942,
|
|
"mean_token_accuracy": 0.6162575602531433,
|
|
"num_tokens": 143547968.0,
|
|
"step": 2280
|
|
},
|
|
{
|
|
"epoch": 7.468085106382979,
|
|
"grad_norm": 19.98790638119669,
|
|
"learning_rate": 4.423152725725037e-09,
|
|
"loss": 2.0759,
|
|
"mean_token_accuracy": 0.6203132092952728,
|
|
"num_tokens": 143864798.0,
|
|
"step": 2285
|
|
},
|
|
{
|
|
"epoch": 7.484451718494272,
|
|
"grad_norm": 20.723567179964625,
|
|
"learning_rate": 4.419503438847678e-09,
|
|
"loss": 2.0803,
|
|
"mean_token_accuracy": 0.6180489838123322,
|
|
"num_tokens": 144180405.0,
|
|
"step": 2290
|
|
},
|
|
{
|
|
"epoch": 7.500818330605565,
|
|
"grad_norm": 20.363175730134444,
|
|
"learning_rate": 4.415844160703178e-09,
|
|
"loss": 2.1056,
|
|
"mean_token_accuracy": 0.6134061455726624,
|
|
"num_tokens": 144496031.0,
|
|
"step": 2295
|
|
},
|
|
{
|
|
"epoch": 7.517184942716858,
|
|
"grad_norm": 19.504255018620267,
|
|
"learning_rate": 4.412174910338562e-09,
|
|
"loss": 2.0759,
|
|
"mean_token_accuracy": 0.6175196766853333,
|
|
"num_tokens": 144813411.0,
|
|
"step": 2300
|
|
},
|
|
{
|
|
"epoch": 7.533551554828151,
|
|
"grad_norm": 19.571313987498225,
|
|
"learning_rate": 4.408495706852758e-09,
|
|
"loss": 2.065,
|
|
"mean_token_accuracy": 0.6217400550842285,
|
|
"num_tokens": 145129640.0,
|
|
"step": 2305
|
|
},
|
|
{
|
|
"epoch": 7.5499181669394435,
|
|
"grad_norm": 19.88010468801649,
|
|
"learning_rate": 4.404806569396502e-09,
|
|
"loss": 2.0739,
|
|
"mean_token_accuracy": 0.6190234243869781,
|
|
"num_tokens": 145445200.0,
|
|
"step": 2310
|
|
},
|
|
{
|
|
"epoch": 7.566284779050736,
|
|
"grad_norm": 20.30094612692668,
|
|
"learning_rate": 4.40110751717224e-09,
|
|
"loss": 2.0807,
|
|
"mean_token_accuracy": 0.615682327747345,
|
|
"num_tokens": 145761534.0,
|
|
"step": 2315
|
|
},
|
|
{
|
|
"epoch": 7.582651391162029,
|
|
"grad_norm": 19.59261400508155,
|
|
"learning_rate": 4.397398569434024e-09,
|
|
"loss": 2.0588,
|
|
"mean_token_accuracy": 0.6192507922649384,
|
|
"num_tokens": 146078820.0,
|
|
"step": 2320
|
|
},
|
|
{
|
|
"epoch": 7.599018003273322,
|
|
"grad_norm": 19.185470644015048,
|
|
"learning_rate": 4.393679745487411e-09,
|
|
"loss": 2.0413,
|
|
"mean_token_accuracy": 0.6238569915294647,
|
|
"num_tokens": 146394440.0,
|
|
"step": 2325
|
|
},
|
|
{
|
|
"epoch": 7.615384615384615,
|
|
"grad_norm": 19.495292421388935,
|
|
"learning_rate": 4.3899510646893695e-09,
|
|
"loss": 2.0612,
|
|
"mean_token_accuracy": 0.6186753571033478,
|
|
"num_tokens": 146709930.0,
|
|
"step": 2330
|
|
},
|
|
{
|
|
"epoch": 7.631751227495909,
|
|
"grad_norm": 19.79956682637677,
|
|
"learning_rate": 4.386212546448172e-09,
|
|
"loss": 2.0987,
|
|
"mean_token_accuracy": 0.6140192031860352,
|
|
"num_tokens": 147023216.0,
|
|
"step": 2335
|
|
},
|
|
{
|
|
"epoch": 7.648117839607202,
|
|
"grad_norm": 19.411576587443466,
|
|
"learning_rate": 4.3824642102232955e-09,
|
|
"loss": 2.0512,
|
|
"mean_token_accuracy": 0.6212476313114166,
|
|
"num_tokens": 147338846.0,
|
|
"step": 2340
|
|
},
|
|
{
|
|
"epoch": 7.6644844517184945,
|
|
"grad_norm": 19.61725592268653,
|
|
"learning_rate": 4.378706075525322e-09,
|
|
"loss": 2.0814,
|
|
"mean_token_accuracy": 0.614804869890213,
|
|
"num_tokens": 147655009.0,
|
|
"step": 2345
|
|
},
|
|
{
|
|
"epoch": 7.680851063829787,
|
|
"grad_norm": 19.2538668350145,
|
|
"learning_rate": 4.374938161915835e-09,
|
|
"loss": 2.0888,
|
|
"mean_token_accuracy": 0.6152995645999908,
|
|
"num_tokens": 147971418.0,
|
|
"step": 2350
|
|
},
|
|
{
|
|
"epoch": 7.69721767594108,
|
|
"grad_norm": 19.309480308973157,
|
|
"learning_rate": 4.371160489007319e-09,
|
|
"loss": 2.0576,
|
|
"mean_token_accuracy": 0.6194174110889434,
|
|
"num_tokens": 148286907.0,
|
|
"step": 2355
|
|
},
|
|
{
|
|
"epoch": 7.713584288052373,
|
|
"grad_norm": 20.163070273135265,
|
|
"learning_rate": 4.367373076463057e-09,
|
|
"loss": 2.0506,
|
|
"mean_token_accuracy": 0.6217274129390716,
|
|
"num_tokens": 148602683.0,
|
|
"step": 2360
|
|
},
|
|
{
|
|
"epoch": 7.729950900163666,
|
|
"grad_norm": 19.981014947076595,
|
|
"learning_rate": 4.3635759439970294e-09,
|
|
"loss": 2.0834,
|
|
"mean_token_accuracy": 0.6150952398777008,
|
|
"num_tokens": 148917173.0,
|
|
"step": 2365
|
|
},
|
|
{
|
|
"epoch": 7.746317512274959,
|
|
"grad_norm": 19.968450563246567,
|
|
"learning_rate": 4.359769111373807e-09,
|
|
"loss": 2.0796,
|
|
"mean_token_accuracy": 0.6174699187278747,
|
|
"num_tokens": 149233197.0,
|
|
"step": 2370
|
|
},
|
|
{
|
|
"epoch": 7.762684124386252,
|
|
"grad_norm": 19.960468429955657,
|
|
"learning_rate": 4.355952598408453e-09,
|
|
"loss": 2.0785,
|
|
"mean_token_accuracy": 0.6167018711566925,
|
|
"num_tokens": 149548312.0,
|
|
"step": 2375
|
|
},
|
|
{
|
|
"epoch": 7.779050736497545,
|
|
"grad_norm": 20.6340513147405,
|
|
"learning_rate": 4.35212642496642e-09,
|
|
"loss": 2.0784,
|
|
"mean_token_accuracy": 0.6170047044754028,
|
|
"num_tokens": 149863164.0,
|
|
"step": 2380
|
|
},
|
|
{
|
|
"epoch": 7.795417348608838,
|
|
"grad_norm": 20.354735657801292,
|
|
"learning_rate": 4.348290610963439e-09,
|
|
"loss": 2.1153,
|
|
"mean_token_accuracy": 0.6087307870388031,
|
|
"num_tokens": 150177524.0,
|
|
"step": 2385
|
|
},
|
|
{
|
|
"epoch": 7.811783960720131,
|
|
"grad_norm": 20.087151707090026,
|
|
"learning_rate": 4.344445176365428e-09,
|
|
"loss": 2.09,
|
|
"mean_token_accuracy": 0.6149612128734588,
|
|
"num_tokens": 150493173.0,
|
|
"step": 2390
|
|
},
|
|
{
|
|
"epoch": 7.828150572831424,
|
|
"grad_norm": 20.13559372519946,
|
|
"learning_rate": 4.3405901411883765e-09,
|
|
"loss": 2.0901,
|
|
"mean_token_accuracy": 0.6129091382026672,
|
|
"num_tokens": 150810335.0,
|
|
"step": 2395
|
|
},
|
|
{
|
|
"epoch": 7.844517184942717,
|
|
"grad_norm": 19.013326282949027,
|
|
"learning_rate": 4.336725525498249e-09,
|
|
"loss": 2.0497,
|
|
"mean_token_accuracy": 0.6194914102554321,
|
|
"num_tokens": 151126438.0,
|
|
"step": 2400
|
|
},
|
|
{
|
|
"epoch": 7.86088379705401,
|
|
"grad_norm": 19.222782991435427,
|
|
"learning_rate": 4.3328513494108774e-09,
|
|
"loss": 2.086,
|
|
"mean_token_accuracy": 0.6155364334583282,
|
|
"num_tokens": 151441771.0,
|
|
"step": 2405
|
|
},
|
|
{
|
|
"epoch": 7.877250409165303,
|
|
"grad_norm": 19.228039566412573,
|
|
"learning_rate": 4.328967633091856e-09,
|
|
"loss": 2.0898,
|
|
"mean_token_accuracy": 0.6135625004768371,
|
|
"num_tokens": 151758082.0,
|
|
"step": 2410
|
|
},
|
|
{
|
|
"epoch": 7.8936170212765955,
|
|
"grad_norm": 20.01595055988023,
|
|
"learning_rate": 4.325074396756437e-09,
|
|
"loss": 2.0227,
|
|
"mean_token_accuracy": 0.6251669287681579,
|
|
"num_tokens": 152073684.0,
|
|
"step": 2415
|
|
},
|
|
{
|
|
"epoch": 7.909983633387888,
|
|
"grad_norm": 19.752325541846453,
|
|
"learning_rate": 4.321171660669426e-09,
|
|
"loss": 2.0636,
|
|
"mean_token_accuracy": 0.6182598769664764,
|
|
"num_tokens": 152390090.0,
|
|
"step": 2420
|
|
},
|
|
{
|
|
"epoch": 7.926350245499181,
|
|
"grad_norm": 19.321038430559828,
|
|
"learning_rate": 4.3172594451450775e-09,
|
|
"loss": 2.0471,
|
|
"mean_token_accuracy": 0.6199876964092255,
|
|
"num_tokens": 152704416.0,
|
|
"step": 2425
|
|
},
|
|
{
|
|
"epoch": 7.942716857610475,
|
|
"grad_norm": 19.966443244824358,
|
|
"learning_rate": 4.313337770546986e-09,
|
|
"loss": 2.0788,
|
|
"mean_token_accuracy": 0.6131214797496796,
|
|
"num_tokens": 153019785.0,
|
|
"step": 2430
|
|
},
|
|
{
|
|
"epoch": 7.959083469721768,
|
|
"grad_norm": 19.24138520226915,
|
|
"learning_rate": 4.309406657287981e-09,
|
|
"loss": 2.0363,
|
|
"mean_token_accuracy": 0.620780485868454,
|
|
"num_tokens": 153335233.0,
|
|
"step": 2435
|
|
},
|
|
{
|
|
"epoch": 7.975450081833061,
|
|
"grad_norm": 19.360751757947458,
|
|
"learning_rate": 4.305466125830023e-09,
|
|
"loss": 2.0598,
|
|
"mean_token_accuracy": 0.6176416158676148,
|
|
"num_tokens": 153651679.0,
|
|
"step": 2440
|
|
},
|
|
{
|
|
"epoch": 7.991816693944354,
|
|
"grad_norm": 19.437309161187518,
|
|
"learning_rate": 4.301516196684097e-09,
|
|
"loss": 2.0537,
|
|
"mean_token_accuracy": 0.618579763174057,
|
|
"num_tokens": 153967435.0,
|
|
"step": 2445
|
|
},
|
|
{
|
|
"epoch": 8.006546644844518,
|
|
"grad_norm": 19.317444336785467,
|
|
"learning_rate": 4.297556890410099e-09,
|
|
"loss": 2.0168,
|
|
"mean_token_accuracy": 0.6221174928877089,
|
|
"num_tokens": 154228527.0,
|
|
"step": 2450
|
|
},
|
|
{
|
|
"epoch": 8.02291325695581,
|
|
"grad_norm": 18.955046602063693,
|
|
"learning_rate": 4.29358822761674e-09,
|
|
"loss": 2.0435,
|
|
"mean_token_accuracy": 0.6193597435951232,
|
|
"num_tokens": 154544523.0,
|
|
"step": 2455
|
|
},
|
|
{
|
|
"epoch": 8.039279869067103,
|
|
"grad_norm": 19.754475500032857,
|
|
"learning_rate": 4.2896102289614284e-09,
|
|
"loss": 2.0634,
|
|
"mean_token_accuracy": 0.6151875674724578,
|
|
"num_tokens": 154858664.0,
|
|
"step": 2460
|
|
},
|
|
{
|
|
"epoch": 8.055646481178396,
|
|
"grad_norm": 19.374158690398726,
|
|
"learning_rate": 4.28562291515017e-09,
|
|
"loss": 2.0172,
|
|
"mean_token_accuracy": 0.6231228113174438,
|
|
"num_tokens": 155174006.0,
|
|
"step": 2465
|
|
},
|
|
{
|
|
"epoch": 8.072013093289689,
|
|
"grad_norm": 18.2855602378677,
|
|
"learning_rate": 4.281626306937456e-09,
|
|
"loss": 2.0179,
|
|
"mean_token_accuracy": 0.6234484672546386,
|
|
"num_tokens": 155489837.0,
|
|
"step": 2470
|
|
},
|
|
{
|
|
"epoch": 8.088379705400982,
|
|
"grad_norm": 19.157790006915853,
|
|
"learning_rate": 4.277620425126156e-09,
|
|
"loss": 2.0431,
|
|
"mean_token_accuracy": 0.6187334835529328,
|
|
"num_tokens": 155805554.0,
|
|
"step": 2475
|
|
},
|
|
{
|
|
"epoch": 8.104746317512275,
|
|
"grad_norm": 19.10121563513015,
|
|
"learning_rate": 4.273605290567412e-09,
|
|
"loss": 2.0224,
|
|
"mean_token_accuracy": 0.6224100470542908,
|
|
"num_tokens": 156119528.0,
|
|
"step": 2480
|
|
},
|
|
{
|
|
"epoch": 8.121112929623568,
|
|
"grad_norm": 20.052105369610455,
|
|
"learning_rate": 4.269580924160523e-09,
|
|
"loss": 2.074,
|
|
"mean_token_accuracy": 0.6132338345050812,
|
|
"num_tokens": 156435330.0,
|
|
"step": 2485
|
|
},
|
|
{
|
|
"epoch": 8.13747954173486,
|
|
"grad_norm": 18.93313246732794,
|
|
"learning_rate": 4.265547346852845e-09,
|
|
"loss": 2.0608,
|
|
"mean_token_accuracy": 0.6157853841781616,
|
|
"num_tokens": 156750303.0,
|
|
"step": 2490
|
|
},
|
|
{
|
|
"epoch": 8.153846153846153,
|
|
"grad_norm": 19.151946930372876,
|
|
"learning_rate": 4.261504579639678e-09,
|
|
"loss": 2.0286,
|
|
"mean_token_accuracy": 0.6220345199108124,
|
|
"num_tokens": 157065896.0,
|
|
"step": 2495
|
|
},
|
|
{
|
|
"epoch": 8.170212765957446,
|
|
"grad_norm": 19.20698734133269,
|
|
"learning_rate": 4.257452643564154e-09,
|
|
"loss": 2.0148,
|
|
"mean_token_accuracy": 0.6229146420955658,
|
|
"num_tokens": 157380698.0,
|
|
"step": 2500
|
|
},
|
|
{
|
|
"epoch": 8.186579378068739,
|
|
"grad_norm": 18.895147128140277,
|
|
"learning_rate": 4.253391559717134e-09,
|
|
"loss": 2.0162,
|
|
"mean_token_accuracy": 0.623430597782135,
|
|
"num_tokens": 157696848.0,
|
|
"step": 2505
|
|
},
|
|
{
|
|
"epoch": 8.202945990180032,
|
|
"grad_norm": 18.59917375497589,
|
|
"learning_rate": 4.249321349237088e-09,
|
|
"loss": 1.9741,
|
|
"mean_token_accuracy": 0.6283079147338867,
|
|
"num_tokens": 158013554.0,
|
|
"step": 2510
|
|
},
|
|
{
|
|
"epoch": 8.219312602291327,
|
|
"grad_norm": 18.81115825320335,
|
|
"learning_rate": 4.24524203331e-09,
|
|
"loss": 2.0411,
|
|
"mean_token_accuracy": 0.6198984026908875,
|
|
"num_tokens": 158329281.0,
|
|
"step": 2515
|
|
},
|
|
{
|
|
"epoch": 8.23567921440262,
|
|
"grad_norm": 19.40948052890125,
|
|
"learning_rate": 4.241153633169241e-09,
|
|
"loss": 2.0434,
|
|
"mean_token_accuracy": 0.6172616899013519,
|
|
"num_tokens": 158645204.0,
|
|
"step": 2520
|
|
},
|
|
{
|
|
"epoch": 8.252045826513912,
|
|
"grad_norm": 19.128928676302635,
|
|
"learning_rate": 4.237056170095473e-09,
|
|
"loss": 2.0208,
|
|
"mean_token_accuracy": 0.6226502656936646,
|
|
"num_tokens": 158960971.0,
|
|
"step": 2525
|
|
},
|
|
{
|
|
"epoch": 8.268412438625205,
|
|
"grad_norm": 18.487170030947755,
|
|
"learning_rate": 4.232949665416525e-09,
|
|
"loss": 2.0205,
|
|
"mean_token_accuracy": 0.6218908786773681,
|
|
"num_tokens": 159275712.0,
|
|
"step": 2530
|
|
},
|
|
{
|
|
"epoch": 8.284779050736498,
|
|
"grad_norm": 19.130608652611837,
|
|
"learning_rate": 4.2288341405072946e-09,
|
|
"loss": 2.0634,
|
|
"mean_token_accuracy": 0.6136705696582794,
|
|
"num_tokens": 159591288.0,
|
|
"step": 2535
|
|
},
|
|
{
|
|
"epoch": 8.30114566284779,
|
|
"grad_norm": 19.69139917071449,
|
|
"learning_rate": 4.224709616789628e-09,
|
|
"loss": 2.0317,
|
|
"mean_token_accuracy": 0.6212322771549225,
|
|
"num_tokens": 159907007.0,
|
|
"step": 2540
|
|
},
|
|
{
|
|
"epoch": 8.317512274959084,
|
|
"grad_norm": 18.98342744228345,
|
|
"learning_rate": 4.220576115732213e-09,
|
|
"loss": 2.0423,
|
|
"mean_token_accuracy": 0.6142703175544739,
|
|
"num_tokens": 160220532.0,
|
|
"step": 2545
|
|
},
|
|
{
|
|
"epoch": 8.333878887070377,
|
|
"grad_norm": 18.36077241118662,
|
|
"learning_rate": 4.216433658850464e-09,
|
|
"loss": 2.0142,
|
|
"mean_token_accuracy": 0.6223547279834747,
|
|
"num_tokens": 160536285.0,
|
|
"step": 2550
|
|
},
|
|
{
|
|
"epoch": 8.35024549918167,
|
|
"grad_norm": 18.94827333595501,
|
|
"learning_rate": 4.212282267706413e-09,
|
|
"loss": 2.0065,
|
|
"mean_token_accuracy": 0.6223433613777161,
|
|
"num_tokens": 160850576.0,
|
|
"step": 2555
|
|
},
|
|
{
|
|
"epoch": 8.366612111292962,
|
|
"grad_norm": 18.298738766625497,
|
|
"learning_rate": 4.208121963908594e-09,
|
|
"loss": 2.006,
|
|
"mean_token_accuracy": 0.623051005601883,
|
|
"num_tokens": 161167036.0,
|
|
"step": 2560
|
|
},
|
|
{
|
|
"epoch": 8.382978723404255,
|
|
"grad_norm": 18.265369067443615,
|
|
"learning_rate": 4.203952769111935e-09,
|
|
"loss": 2.0195,
|
|
"mean_token_accuracy": 0.6212840378284454,
|
|
"num_tokens": 161483084.0,
|
|
"step": 2565
|
|
},
|
|
{
|
|
"epoch": 8.399345335515548,
|
|
"grad_norm": 19.688986359575967,
|
|
"learning_rate": 4.199774705017642e-09,
|
|
"loss": 2.0623,
|
|
"mean_token_accuracy": 0.6111198544502259,
|
|
"num_tokens": 161799520.0,
|
|
"step": 2570
|
|
},
|
|
{
|
|
"epoch": 8.415711947626841,
|
|
"grad_norm": 18.566585593107703,
|
|
"learning_rate": 4.195587793373085e-09,
|
|
"loss": 1.9965,
|
|
"mean_token_accuracy": 0.6247856795787812,
|
|
"num_tokens": 162115336.0,
|
|
"step": 2575
|
|
},
|
|
{
|
|
"epoch": 8.432078559738134,
|
|
"grad_norm": 18.55307853336787,
|
|
"learning_rate": 4.19139205597169e-09,
|
|
"loss": 2.0091,
|
|
"mean_token_accuracy": 0.621464341878891,
|
|
"num_tokens": 162431400.0,
|
|
"step": 2580
|
|
},
|
|
{
|
|
"epoch": 8.448445171849427,
|
|
"grad_norm": 18.55129652765704,
|
|
"learning_rate": 4.1871875146528196e-09,
|
|
"loss": 1.9971,
|
|
"mean_token_accuracy": 0.6246702432632446,
|
|
"num_tokens": 162748170.0,
|
|
"step": 2585
|
|
},
|
|
{
|
|
"epoch": 8.46481178396072,
|
|
"grad_norm": 18.318036230564275,
|
|
"learning_rate": 4.182974191301662e-09,
|
|
"loss": 1.9903,
|
|
"mean_token_accuracy": 0.6247745037078858,
|
|
"num_tokens": 163065470.0,
|
|
"step": 2590
|
|
},
|
|
{
|
|
"epoch": 8.481178396072012,
|
|
"grad_norm": 18.823692124621207,
|
|
"learning_rate": 4.178752107849119e-09,
|
|
"loss": 2.0466,
|
|
"mean_token_accuracy": 0.6138881623744965,
|
|
"num_tokens": 163381018.0,
|
|
"step": 2595
|
|
},
|
|
{
|
|
"epoch": 8.497545008183305,
|
|
"grad_norm": 19.035020757446386,
|
|
"learning_rate": 4.1745212862716885e-09,
|
|
"loss": 2.0251,
|
|
"mean_token_accuracy": 0.6184180915355683,
|
|
"num_tokens": 163694158.0,
|
|
"step": 2600
|
|
},
|
|
{
|
|
"epoch": 8.5139116202946,
|
|
"grad_norm": 18.33063656676717,
|
|
"learning_rate": 4.170281748591351e-09,
|
|
"loss": 1.9991,
|
|
"mean_token_accuracy": 0.6240422368049622,
|
|
"num_tokens": 164010496.0,
|
|
"step": 2605
|
|
},
|
|
{
|
|
"epoch": 8.530278232405893,
|
|
"grad_norm": 19.26904730748046,
|
|
"learning_rate": 4.166033516875457e-09,
|
|
"loss": 2.0101,
|
|
"mean_token_accuracy": 0.6214252293109894,
|
|
"num_tokens": 164325861.0,
|
|
"step": 2610
|
|
},
|
|
{
|
|
"epoch": 8.546644844517186,
|
|
"grad_norm": 19.671297978586228,
|
|
"learning_rate": 4.16177661323661e-09,
|
|
"loss": 2.0256,
|
|
"mean_token_accuracy": 0.6183163821697235,
|
|
"num_tokens": 164641507.0,
|
|
"step": 2615
|
|
},
|
|
{
|
|
"epoch": 8.563011456628479,
|
|
"grad_norm": 18.608109118678463,
|
|
"learning_rate": 4.157511059832551e-09,
|
|
"loss": 2.0225,
|
|
"mean_token_accuracy": 0.6190088152885437,
|
|
"num_tokens": 164957579.0,
|
|
"step": 2620
|
|
},
|
|
{
|
|
"epoch": 8.579378068739771,
|
|
"grad_norm": 19.66971661058079,
|
|
"learning_rate": 4.1532368788660435e-09,
|
|
"loss": 2.0026,
|
|
"mean_token_accuracy": 0.6196180164813996,
|
|
"num_tokens": 165274043.0,
|
|
"step": 2625
|
|
},
|
|
{
|
|
"epoch": 8.595744680851064,
|
|
"grad_norm": 18.648238572936588,
|
|
"learning_rate": 4.1489540925847624e-09,
|
|
"loss": 2.0198,
|
|
"mean_token_accuracy": 0.6171901106834412,
|
|
"num_tokens": 165588679.0,
|
|
"step": 2630
|
|
},
|
|
{
|
|
"epoch": 8.612111292962357,
|
|
"grad_norm": 18.0144822329861,
|
|
"learning_rate": 4.144662723281171e-09,
|
|
"loss": 1.978,
|
|
"mean_token_accuracy": 0.6250441014766693,
|
|
"num_tokens": 165903404.0,
|
|
"step": 2635
|
|
},
|
|
{
|
|
"epoch": 8.62847790507365,
|
|
"grad_norm": 17.95362559890803,
|
|
"learning_rate": 4.14036279329241e-09,
|
|
"loss": 1.99,
|
|
"mean_token_accuracy": 0.6221159398555756,
|
|
"num_tokens": 166219798.0,
|
|
"step": 2640
|
|
},
|
|
{
|
|
"epoch": 8.644844517184943,
|
|
"grad_norm": 18.310831388232366,
|
|
"learning_rate": 4.136054325000178e-09,
|
|
"loss": 1.9402,
|
|
"mean_token_accuracy": 0.6302378952503205,
|
|
"num_tokens": 166536213.0,
|
|
"step": 2645
|
|
},
|
|
{
|
|
"epoch": 8.661211129296236,
|
|
"grad_norm": 18.59936082392863,
|
|
"learning_rate": 4.131737340830618e-09,
|
|
"loss": 1.9867,
|
|
"mean_token_accuracy": 0.6227317154407501,
|
|
"num_tokens": 166852864.0,
|
|
"step": 2650
|
|
},
|
|
{
|
|
"epoch": 8.677577741407529,
|
|
"grad_norm": 17.936983386521433,
|
|
"learning_rate": 4.127411863254198e-09,
|
|
"loss": 1.9897,
|
|
"mean_token_accuracy": 0.6210679411888123,
|
|
"num_tokens": 167168596.0,
|
|
"step": 2655
|
|
},
|
|
{
|
|
"epoch": 8.693944353518821,
|
|
"grad_norm": 18.516548610843525,
|
|
"learning_rate": 4.123077914785597e-09,
|
|
"loss": 1.977,
|
|
"mean_token_accuracy": 0.6251841723918915,
|
|
"num_tokens": 167484584.0,
|
|
"step": 2660
|
|
},
|
|
{
|
|
"epoch": 8.710310965630114,
|
|
"grad_norm": 18.359500979183156,
|
|
"learning_rate": 4.1187355179835836e-09,
|
|
"loss": 1.984,
|
|
"mean_token_accuracy": 0.623200523853302,
|
|
"num_tokens": 167801147.0,
|
|
"step": 2665
|
|
},
|
|
{
|
|
"epoch": 8.726677577741407,
|
|
"grad_norm": 18.161306481323752,
|
|
"learning_rate": 4.114384695450906e-09,
|
|
"loss": 2.0179,
|
|
"mean_token_accuracy": 0.618176156282425,
|
|
"num_tokens": 168116240.0,
|
|
"step": 2670
|
|
},
|
|
{
|
|
"epoch": 8.7430441898527,
|
|
"grad_norm": 18.927299349251303,
|
|
"learning_rate": 4.110025469834162e-09,
|
|
"loss": 1.9805,
|
|
"mean_token_accuracy": 0.6226609289646149,
|
|
"num_tokens": 168431117.0,
|
|
"step": 2675
|
|
},
|
|
{
|
|
"epoch": 8.759410801963993,
|
|
"grad_norm": 19.012736318695463,
|
|
"learning_rate": 4.105657863823697e-09,
|
|
"loss": 1.9767,
|
|
"mean_token_accuracy": 0.6218480348587037,
|
|
"num_tokens": 168746772.0,
|
|
"step": 2680
|
|
},
|
|
{
|
|
"epoch": 8.775777414075286,
|
|
"grad_norm": 18.295532227167453,
|
|
"learning_rate": 4.101281900153469e-09,
|
|
"loss": 1.9511,
|
|
"mean_token_accuracy": 0.627713143825531,
|
|
"num_tokens": 169063269.0,
|
|
"step": 2685
|
|
},
|
|
{
|
|
"epoch": 8.792144026186579,
|
|
"grad_norm": 18.61795636373396,
|
|
"learning_rate": 4.096897601600944e-09,
|
|
"loss": 1.9883,
|
|
"mean_token_accuracy": 0.6205691993236542,
|
|
"num_tokens": 169379160.0,
|
|
"step": 2690
|
|
},
|
|
{
|
|
"epoch": 8.808510638297872,
|
|
"grad_norm": 18.673489459493226,
|
|
"learning_rate": 4.092504990986972e-09,
|
|
"loss": 1.9751,
|
|
"mean_token_accuracy": 0.6233503878116607,
|
|
"num_tokens": 169695599.0,
|
|
"step": 2695
|
|
},
|
|
{
|
|
"epoch": 8.824877250409166,
|
|
"grad_norm": 18.331509805767606,
|
|
"learning_rate": 4.088104091175667e-09,
|
|
"loss": 1.9806,
|
|
"mean_token_accuracy": 0.6208960115909576,
|
|
"num_tokens": 170010952.0,
|
|
"step": 2700
|
|
},
|
|
{
|
|
"epoch": 8.841243862520459,
|
|
"grad_norm": 18.329443723081933,
|
|
"learning_rate": 4.08369492507429e-09,
|
|
"loss": 1.9784,
|
|
"mean_token_accuracy": 0.6229167640209198,
|
|
"num_tokens": 170326218.0,
|
|
"step": 2705
|
|
},
|
|
{
|
|
"epoch": 8.857610474631752,
|
|
"grad_norm": 18.045575742161883,
|
|
"learning_rate": 4.0792775156331276e-09,
|
|
"loss": 1.9635,
|
|
"mean_token_accuracy": 0.6248275518417359,
|
|
"num_tokens": 170641654.0,
|
|
"step": 2710
|
|
},
|
|
{
|
|
"epoch": 8.873977086743045,
|
|
"grad_norm": 18.464840629768773,
|
|
"learning_rate": 4.0748518858453756e-09,
|
|
"loss": 1.9731,
|
|
"mean_token_accuracy": 0.6229199707508087,
|
|
"num_tokens": 170957841.0,
|
|
"step": 2715
|
|
},
|
|
{
|
|
"epoch": 8.890343698854338,
|
|
"grad_norm": 18.609226014251735,
|
|
"learning_rate": 4.070418058747018e-09,
|
|
"loss": 1.9753,
|
|
"mean_token_accuracy": 0.6225042223930359,
|
|
"num_tokens": 171274797.0,
|
|
"step": 2720
|
|
},
|
|
{
|
|
"epoch": 8.90671031096563,
|
|
"grad_norm": 19.047635263213472,
|
|
"learning_rate": 4.065976057416707e-09,
|
|
"loss": 1.9723,
|
|
"mean_token_accuracy": 0.622650933265686,
|
|
"num_tokens": 171589606.0,
|
|
"step": 2725
|
|
},
|
|
{
|
|
"epoch": 8.923076923076923,
|
|
"grad_norm": 19.226875174529695,
|
|
"learning_rate": 4.061525904975642e-09,
|
|
"loss": 1.9748,
|
|
"mean_token_accuracy": 0.624282443523407,
|
|
"num_tokens": 171905454.0,
|
|
"step": 2730
|
|
},
|
|
{
|
|
"epoch": 8.939443535188216,
|
|
"grad_norm": 18.25508447639412,
|
|
"learning_rate": 4.057067624587448e-09,
|
|
"loss": 1.9444,
|
|
"mean_token_accuracy": 0.626853859424591,
|
|
"num_tokens": 172220595.0,
|
|
"step": 2735
|
|
},
|
|
{
|
|
"epoch": 8.955810147299509,
|
|
"grad_norm": 17.82977614301095,
|
|
"learning_rate": 4.052601239458061e-09,
|
|
"loss": 1.9464,
|
|
"mean_token_accuracy": 0.6270898818969727,
|
|
"num_tokens": 172537829.0,
|
|
"step": 2740
|
|
},
|
|
{
|
|
"epoch": 8.972176759410802,
|
|
"grad_norm": 18.155334994112835,
|
|
"learning_rate": 4.0481267728356e-09,
|
|
"loss": 1.9575,
|
|
"mean_token_accuracy": 0.6241896629333497,
|
|
"num_tokens": 172853529.0,
|
|
"step": 2745
|
|
},
|
|
{
|
|
"epoch": 8.988543371522095,
|
|
"grad_norm": 17.88360441782584,
|
|
"learning_rate": 4.043644248010252e-09,
|
|
"loss": 1.9545,
|
|
"mean_token_accuracy": 0.6252379953861237,
|
|
"num_tokens": 173167883.0,
|
|
"step": 2750
|
|
},
|
|
{
|
|
"epoch": 9.003273322422258,
|
|
"grad_norm": 18.04480210223081,
|
|
"learning_rate": 4.039153688314146e-09,
|
|
"loss": 1.9658,
|
|
"mean_token_accuracy": 0.619963526725769,
|
|
"num_tokens": 173428375.0,
|
|
"step": 2755
|
|
},
|
|
{
|
|
"epoch": 9.01963993453355,
|
|
"grad_norm": 18.72621974154579,
|
|
"learning_rate": 4.0346551171212344e-09,
|
|
"loss": 1.9569,
|
|
"mean_token_accuracy": 0.6238627254962921,
|
|
"num_tokens": 173745176.0,
|
|
"step": 2760
|
|
},
|
|
{
|
|
"epoch": 9.036006546644845,
|
|
"grad_norm": 17.527238066541386,
|
|
"learning_rate": 4.030148557847169e-09,
|
|
"loss": 1.9277,
|
|
"mean_token_accuracy": 0.6290217995643616,
|
|
"num_tokens": 174062085.0,
|
|
"step": 2765
|
|
},
|
|
{
|
|
"epoch": 9.052373158756138,
|
|
"grad_norm": 18.57546301322759,
|
|
"learning_rate": 4.025634033949184e-09,
|
|
"loss": 1.9642,
|
|
"mean_token_accuracy": 0.6221346378326416,
|
|
"num_tokens": 174377672.0,
|
|
"step": 2770
|
|
},
|
|
{
|
|
"epoch": 9.068739770867431,
|
|
"grad_norm": 18.032035732313652,
|
|
"learning_rate": 4.021111568925967e-09,
|
|
"loss": 1.9594,
|
|
"mean_token_accuracy": 0.6231431603431702,
|
|
"num_tokens": 174693197.0,
|
|
"step": 2775
|
|
},
|
|
{
|
|
"epoch": 9.085106382978724,
|
|
"grad_norm": 17.62498580152985,
|
|
"learning_rate": 4.016581186317542e-09,
|
|
"loss": 1.9394,
|
|
"mean_token_accuracy": 0.6259716868400573,
|
|
"num_tokens": 175008227.0,
|
|
"step": 2780
|
|
},
|
|
{
|
|
"epoch": 9.101472995090017,
|
|
"grad_norm": 17.966771537737063,
|
|
"learning_rate": 4.012042909705143e-09,
|
|
"loss": 1.9411,
|
|
"mean_token_accuracy": 0.62475745677948,
|
|
"num_tokens": 175323205.0,
|
|
"step": 2785
|
|
},
|
|
{
|
|
"epoch": 9.11783960720131,
|
|
"grad_norm": 17.47952411808635,
|
|
"learning_rate": 4.007496762711098e-09,
|
|
"loss": 1.9577,
|
|
"mean_token_accuracy": 0.621793794631958,
|
|
"num_tokens": 175638747.0,
|
|
"step": 2790
|
|
},
|
|
{
|
|
"epoch": 9.134206219312603,
|
|
"grad_norm": 18.843414914975632,
|
|
"learning_rate": 4.002942768998696e-09,
|
|
"loss": 1.9772,
|
|
"mean_token_accuracy": 0.6197193324565887,
|
|
"num_tokens": 175955103.0,
|
|
"step": 2795
|
|
},
|
|
{
|
|
"epoch": 9.150572831423895,
|
|
"grad_norm": 17.098878985984722,
|
|
"learning_rate": 3.998380952272073e-09,
|
|
"loss": 1.9096,
|
|
"mean_token_accuracy": 0.6299042642116547,
|
|
"num_tokens": 176270139.0,
|
|
"step": 2800
|
|
},
|
|
{
|
|
"epoch": 9.166939443535188,
|
|
"grad_norm": 17.182681882574787,
|
|
"learning_rate": 3.993811336276081e-09,
|
|
"loss": 1.9223,
|
|
"mean_token_accuracy": 0.6275332272052765,
|
|
"num_tokens": 176585317.0,
|
|
"step": 2805
|
|
},
|
|
{
|
|
"epoch": 9.183306055646481,
|
|
"grad_norm": 18.248471061146546,
|
|
"learning_rate": 3.989233944796173e-09,
|
|
"loss": 1.956,
|
|
"mean_token_accuracy": 0.6209408044815063,
|
|
"num_tokens": 176901360.0,
|
|
"step": 2810
|
|
},
|
|
{
|
|
"epoch": 9.199672667757774,
|
|
"grad_norm": 19.34514275913769,
|
|
"learning_rate": 3.984648801658272e-09,
|
|
"loss": 1.9525,
|
|
"mean_token_accuracy": 0.6216700792312622,
|
|
"num_tokens": 177216887.0,
|
|
"step": 2815
|
|
},
|
|
{
|
|
"epoch": 9.216039279869067,
|
|
"grad_norm": 18.33216037869195,
|
|
"learning_rate": 3.980055930728647e-09,
|
|
"loss": 1.9736,
|
|
"mean_token_accuracy": 0.6176897406578064,
|
|
"num_tokens": 177531745.0,
|
|
"step": 2820
|
|
},
|
|
{
|
|
"epoch": 9.23240589198036,
|
|
"grad_norm": 18.01373662008413,
|
|
"learning_rate": 3.975455355913796e-09,
|
|
"loss": 1.9179,
|
|
"mean_token_accuracy": 0.6283258557319641,
|
|
"num_tokens": 177847164.0,
|
|
"step": 2825
|
|
},
|
|
{
|
|
"epoch": 9.248772504091653,
|
|
"grad_norm": 18.644065662113995,
|
|
"learning_rate": 3.970847101160312e-09,
|
|
"loss": 1.9228,
|
|
"mean_token_accuracy": 0.6259247720241546,
|
|
"num_tokens": 178161807.0,
|
|
"step": 2830
|
|
},
|
|
{
|
|
"epoch": 9.265139116202946,
|
|
"grad_norm": 18.19303408352185,
|
|
"learning_rate": 3.966231190454767e-09,
|
|
"loss": 1.934,
|
|
"mean_token_accuracy": 0.6245276927947998,
|
|
"num_tokens": 178476348.0,
|
|
"step": 2835
|
|
},
|
|
{
|
|
"epoch": 9.281505728314238,
|
|
"grad_norm": 17.573488111961076,
|
|
"learning_rate": 3.961607647823583e-09,
|
|
"loss": 1.9011,
|
|
"mean_token_accuracy": 0.6307465970516205,
|
|
"num_tokens": 178792537.0,
|
|
"step": 2840
|
|
},
|
|
{
|
|
"epoch": 9.297872340425531,
|
|
"grad_norm": 18.230014313895904,
|
|
"learning_rate": 3.956976497332903e-09,
|
|
"loss": 1.9031,
|
|
"mean_token_accuracy": 0.6299893498420716,
|
|
"num_tokens": 179108205.0,
|
|
"step": 2845
|
|
},
|
|
{
|
|
"epoch": 9.314238952536824,
|
|
"grad_norm": 17.007404114028006,
|
|
"learning_rate": 3.952337763088473e-09,
|
|
"loss": 1.9081,
|
|
"mean_token_accuracy": 0.6293568968772888,
|
|
"num_tokens": 179422450.0,
|
|
"step": 2850
|
|
},
|
|
{
|
|
"epoch": 9.330605564648117,
|
|
"grad_norm": 18.076430353756503,
|
|
"learning_rate": 3.947691469235514e-09,
|
|
"loss": 1.9208,
|
|
"mean_token_accuracy": 0.6249583125114441,
|
|
"num_tokens": 179738583.0,
|
|
"step": 2855
|
|
},
|
|
{
|
|
"epoch": 9.346972176759412,
|
|
"grad_norm": 17.885738938494494,
|
|
"learning_rate": 3.9430376399585945e-09,
|
|
"loss": 1.9007,
|
|
"mean_token_accuracy": 0.6298418581485749,
|
|
"num_tokens": 180055204.0,
|
|
"step": 2860
|
|
},
|
|
{
|
|
"epoch": 9.363338788870704,
|
|
"grad_norm": 18.66586545840845,
|
|
"learning_rate": 3.938376299481506e-09,
|
|
"loss": 1.921,
|
|
"mean_token_accuracy": 0.6242359042167663,
|
|
"num_tokens": 180372131.0,
|
|
"step": 2865
|
|
},
|
|
{
|
|
"epoch": 9.379705400981997,
|
|
"grad_norm": 18.250054937583442,
|
|
"learning_rate": 3.93370747206714e-09,
|
|
"loss": 1.9274,
|
|
"mean_token_accuracy": 0.6258101642131806,
|
|
"num_tokens": 180687489.0,
|
|
"step": 2870
|
|
},
|
|
{
|
|
"epoch": 9.39607201309329,
|
|
"grad_norm": 17.662285336054975,
|
|
"learning_rate": 3.92903118201735e-09,
|
|
"loss": 1.9026,
|
|
"mean_token_accuracy": 0.6290419936180115,
|
|
"num_tokens": 181002519.0,
|
|
"step": 2875
|
|
},
|
|
{
|
|
"epoch": 9.412438625204583,
|
|
"grad_norm": 18.323648354162366,
|
|
"learning_rate": 3.924347453672843e-09,
|
|
"loss": 1.9207,
|
|
"mean_token_accuracy": 0.6275826394557953,
|
|
"num_tokens": 181317643.0,
|
|
"step": 2880
|
|
},
|
|
{
|
|
"epoch": 9.428805237315876,
|
|
"grad_norm": 18.03313243291237,
|
|
"learning_rate": 3.919656311413038e-09,
|
|
"loss": 1.8935,
|
|
"mean_token_accuracy": 0.6286854863166809,
|
|
"num_tokens": 181633606.0,
|
|
"step": 2885
|
|
},
|
|
{
|
|
"epoch": 9.445171849427169,
|
|
"grad_norm": 18.57778730511887,
|
|
"learning_rate": 3.914957779655946e-09,
|
|
"loss": 1.9335,
|
|
"mean_token_accuracy": 0.623274952173233,
|
|
"num_tokens": 181948723.0,
|
|
"step": 2890
|
|
},
|
|
{
|
|
"epoch": 9.461538461538462,
|
|
"grad_norm": 18.227838310934676,
|
|
"learning_rate": 3.91025188285804e-09,
|
|
"loss": 1.9188,
|
|
"mean_token_accuracy": 0.6270026624202728,
|
|
"num_tokens": 182267064.0,
|
|
"step": 2895
|
|
},
|
|
{
|
|
"epoch": 9.477905073649755,
|
|
"grad_norm": 18.34210442651482,
|
|
"learning_rate": 3.9055386455141314e-09,
|
|
"loss": 1.9202,
|
|
"mean_token_accuracy": 0.6245188891887665,
|
|
"num_tokens": 182581869.0,
|
|
"step": 2900
|
|
},
|
|
{
|
|
"epoch": 9.494271685761047,
|
|
"grad_norm": 17.687141855032003,
|
|
"learning_rate": 3.900818092157239e-09,
|
|
"loss": 1.8737,
|
|
"mean_token_accuracy": 0.6342037916183472,
|
|
"num_tokens": 182898292.0,
|
|
"step": 2905
|
|
},
|
|
{
|
|
"epoch": 9.51063829787234,
|
|
"grad_norm": 17.933760186583516,
|
|
"learning_rate": 3.89609024735846e-09,
|
|
"loss": 1.906,
|
|
"mean_token_accuracy": 0.6296324312686921,
|
|
"num_tokens": 183212511.0,
|
|
"step": 2910
|
|
},
|
|
{
|
|
"epoch": 9.527004909983633,
|
|
"grad_norm": 18.867128515643696,
|
|
"learning_rate": 3.891355135726849e-09,
|
|
"loss": 1.911,
|
|
"mean_token_accuracy": 0.6257581770420074,
|
|
"num_tokens": 183527923.0,
|
|
"step": 2915
|
|
},
|
|
{
|
|
"epoch": 9.543371522094926,
|
|
"grad_norm": 17.53962602317489,
|
|
"learning_rate": 3.886612781909281e-09,
|
|
"loss": 1.8653,
|
|
"mean_token_accuracy": 0.6354433178901673,
|
|
"num_tokens": 183843275.0,
|
|
"step": 2920
|
|
},
|
|
{
|
|
"epoch": 9.559738134206219,
|
|
"grad_norm": 18.501450293771324,
|
|
"learning_rate": 3.881863210590332e-09,
|
|
"loss": 1.9218,
|
|
"mean_token_accuracy": 0.6226227402687072,
|
|
"num_tokens": 184160244.0,
|
|
"step": 2925
|
|
},
|
|
{
|
|
"epoch": 9.576104746317512,
|
|
"grad_norm": 18.344935811703643,
|
|
"learning_rate": 3.877106446492141e-09,
|
|
"loss": 1.8926,
|
|
"mean_token_accuracy": 0.6301439940929413,
|
|
"num_tokens": 184476831.0,
|
|
"step": 2930
|
|
},
|
|
{
|
|
"epoch": 9.592471358428805,
|
|
"grad_norm": 17.456391655964303,
|
|
"learning_rate": 3.8723425143742904e-09,
|
|
"loss": 1.8564,
|
|
"mean_token_accuracy": 0.6359524667263031,
|
|
"num_tokens": 184793601.0,
|
|
"step": 2935
|
|
},
|
|
{
|
|
"epoch": 9.608837970540097,
|
|
"grad_norm": 17.362489760879928,
|
|
"learning_rate": 3.867571439033671e-09,
|
|
"loss": 1.8698,
|
|
"mean_token_accuracy": 0.6319121718406677,
|
|
"num_tokens": 185110931.0,
|
|
"step": 2940
|
|
},
|
|
{
|
|
"epoch": 9.62520458265139,
|
|
"grad_norm": 17.78448055098728,
|
|
"learning_rate": 3.862793245304358e-09,
|
|
"loss": 1.8868,
|
|
"mean_token_accuracy": 0.6301158666610718,
|
|
"num_tokens": 185427628.0,
|
|
"step": 2945
|
|
},
|
|
{
|
|
"epoch": 9.641571194762683,
|
|
"grad_norm": 17.87940817258405,
|
|
"learning_rate": 3.858007958057473e-09,
|
|
"loss": 1.8552,
|
|
"mean_token_accuracy": 0.6365744888782501,
|
|
"num_tokens": 185742371.0,
|
|
"step": 2950
|
|
},
|
|
{
|
|
"epoch": 9.657937806873978,
|
|
"grad_norm": 18.114260046662046,
|
|
"learning_rate": 3.853215602201065e-09,
|
|
"loss": 1.8855,
|
|
"mean_token_accuracy": 0.6294423818588257,
|
|
"num_tokens": 186058058.0,
|
|
"step": 2955
|
|
},
|
|
{
|
|
"epoch": 9.67430441898527,
|
|
"grad_norm": 17.069902906334928,
|
|
"learning_rate": 3.848416202679975e-09,
|
|
"loss": 1.8699,
|
|
"mean_token_accuracy": 0.6321876406669616,
|
|
"num_tokens": 186374896.0,
|
|
"step": 2960
|
|
},
|
|
{
|
|
"epoch": 9.690671031096564,
|
|
"grad_norm": 17.26492781592171,
|
|
"learning_rate": 3.843609784475708e-09,
|
|
"loss": 1.8976,
|
|
"mean_token_accuracy": 0.6271225333213806,
|
|
"num_tokens": 186691633.0,
|
|
"step": 2965
|
|
},
|
|
{
|
|
"epoch": 9.707037643207856,
|
|
"grad_norm": 18.95360140068516,
|
|
"learning_rate": 3.838796372606299e-09,
|
|
"loss": 1.8978,
|
|
"mean_token_accuracy": 0.6272413194179535,
|
|
"num_tokens": 187006529.0,
|
|
"step": 2970
|
|
},
|
|
{
|
|
"epoch": 9.72340425531915,
|
|
"grad_norm": 18.122093609301107,
|
|
"learning_rate": 3.833975992126189e-09,
|
|
"loss": 1.8674,
|
|
"mean_token_accuracy": 0.6307429075241089,
|
|
"num_tokens": 187321093.0,
|
|
"step": 2975
|
|
},
|
|
{
|
|
"epoch": 9.739770867430442,
|
|
"grad_norm": 18.056802859104373,
|
|
"learning_rate": 3.82914866812609e-09,
|
|
"loss": 1.8869,
|
|
"mean_token_accuracy": 0.627680492401123,
|
|
"num_tokens": 187634669.0,
|
|
"step": 2980
|
|
},
|
|
{
|
|
"epoch": 9.756137479541735,
|
|
"grad_norm": 17.361076661953092,
|
|
"learning_rate": 3.824314425732859e-09,
|
|
"loss": 1.8616,
|
|
"mean_token_accuracy": 0.6338369309902191,
|
|
"num_tokens": 187950365.0,
|
|
"step": 2985
|
|
},
|
|
{
|
|
"epoch": 9.772504091653028,
|
|
"grad_norm": 18.01075845513964,
|
|
"learning_rate": 3.819473290109359e-09,
|
|
"loss": 1.8595,
|
|
"mean_token_accuracy": 0.633523577451706,
|
|
"num_tokens": 188266181.0,
|
|
"step": 2990
|
|
},
|
|
{
|
|
"epoch": 9.78887070376432,
|
|
"grad_norm": 17.57825984359166,
|
|
"learning_rate": 3.814625286454335e-09,
|
|
"loss": 1.8644,
|
|
"mean_token_accuracy": 0.6331724166870117,
|
|
"num_tokens": 188582595.0,
|
|
"step": 2995
|
|
},
|
|
{
|
|
"epoch": 9.805237315875614,
|
|
"grad_norm": 18.02430311833049,
|
|
"learning_rate": 3.809770440002286e-09,
|
|
"loss": 1.8491,
|
|
"mean_token_accuracy": 0.6342060387134552,
|
|
"num_tokens": 188897418.0,
|
|
"step": 3000
|
|
},
|
|
{
|
|
"epoch": 9.821603927986907,
|
|
"grad_norm": 17.839992907497123,
|
|
"learning_rate": 3.80490877602332e-09,
|
|
"loss": 1.8468,
|
|
"mean_token_accuracy": 0.63299680352211,
|
|
"num_tokens": 189213354.0,
|
|
"step": 3005
|
|
},
|
|
{
|
|
"epoch": 9.8379705400982,
|
|
"grad_norm": 17.707546662790627,
|
|
"learning_rate": 3.800040319823038e-09,
|
|
"loss": 1.8587,
|
|
"mean_token_accuracy": 0.6331423938274383,
|
|
"num_tokens": 189528334.0,
|
|
"step": 3010
|
|
},
|
|
{
|
|
"epoch": 9.854337152209492,
|
|
"grad_norm": 17.12413666228983,
|
|
"learning_rate": 3.795165096742394e-09,
|
|
"loss": 1.858,
|
|
"mean_token_accuracy": 0.6314215183258056,
|
|
"num_tokens": 189842582.0,
|
|
"step": 3015
|
|
},
|
|
{
|
|
"epoch": 9.870703764320785,
|
|
"grad_norm": 17.26456725836023,
|
|
"learning_rate": 3.790283132157561e-09,
|
|
"loss": 1.8425,
|
|
"mean_token_accuracy": 0.6359744787216186,
|
|
"num_tokens": 190158840.0,
|
|
"step": 3020
|
|
},
|
|
{
|
|
"epoch": 9.887070376432078,
|
|
"grad_norm": 17.385229623404953,
|
|
"learning_rate": 3.785394451479806e-09,
|
|
"loss": 1.8484,
|
|
"mean_token_accuracy": 0.6332973361015319,
|
|
"num_tokens": 190473395.0,
|
|
"step": 3025
|
|
},
|
|
{
|
|
"epoch": 9.90343698854337,
|
|
"grad_norm": 17.777934882860322,
|
|
"learning_rate": 3.780499080155353e-09,
|
|
"loss": 1.884,
|
|
"mean_token_accuracy": 0.6261075854301452,
|
|
"num_tokens": 190789849.0,
|
|
"step": 3030
|
|
},
|
|
{
|
|
"epoch": 9.919803600654664,
|
|
"grad_norm": 16.90729981545112,
|
|
"learning_rate": 3.775597043665252e-09,
|
|
"loss": 1.8599,
|
|
"mean_token_accuracy": 0.635511976480484,
|
|
"num_tokens": 191105958.0,
|
|
"step": 3035
|
|
},
|
|
{
|
|
"epoch": 9.936170212765958,
|
|
"grad_norm": 17.900635313977407,
|
|
"learning_rate": 3.770688367525247e-09,
|
|
"loss": 1.8418,
|
|
"mean_token_accuracy": 0.6317579805850982,
|
|
"num_tokens": 191422525.0,
|
|
"step": 3040
|
|
},
|
|
{
|
|
"epoch": 9.952536824877251,
|
|
"grad_norm": 17.038381383182745,
|
|
"learning_rate": 3.765773077285639e-09,
|
|
"loss": 1.7945,
|
|
"mean_token_accuracy": 0.6432504296302796,
|
|
"num_tokens": 191736691.0,
|
|
"step": 3045
|
|
},
|
|
{
|
|
"epoch": 9.968903436988544,
|
|
"grad_norm": 17.954414671181723,
|
|
"learning_rate": 3.7608511985311575e-09,
|
|
"loss": 1.8422,
|
|
"mean_token_accuracy": 0.6309450984001159,
|
|
"num_tokens": 192052351.0,
|
|
"step": 3050
|
|
},
|
|
{
|
|
"epoch": 9.985270049099837,
|
|
"grad_norm": 17.827699266023465,
|
|
"learning_rate": 3.755922756880831e-09,
|
|
"loss": 1.828,
|
|
"mean_token_accuracy": 0.6335801184177399,
|
|
"num_tokens": 192367730.0,
|
|
"step": 3055
|
|
},
|
|
{
|
|
"epoch": 10.0,
|
|
"grad_norm": 17.792353956261156,
|
|
"learning_rate": 3.750987777987841e-09,
|
|
"loss": 1.8302,
|
|
"mean_token_accuracy": 0.6340090367529128,
|
|
"num_tokens": 192627053.0,
|
|
"step": 3060
|
|
},
|
|
{
|
|
"epoch": 10.016366612111293,
|
|
"grad_norm": 17.4349917278818,
|
|
"learning_rate": 3.7460462875394e-09,
|
|
"loss": 1.813,
|
|
"mean_token_accuracy": 0.6366794407367706,
|
|
"num_tokens": 192942851.0,
|
|
"step": 3065
|
|
},
|
|
{
|
|
"epoch": 10.032733224222586,
|
|
"grad_norm": 17.541872465331707,
|
|
"learning_rate": 3.741098311256616e-09,
|
|
"loss": 1.8463,
|
|
"mean_token_accuracy": 0.6313372552394867,
|
|
"num_tokens": 193257200.0,
|
|
"step": 3070
|
|
},
|
|
{
|
|
"epoch": 10.049099836333879,
|
|
"grad_norm": 17.231122883700056,
|
|
"learning_rate": 3.736143874894354e-09,
|
|
"loss": 1.812,
|
|
"mean_token_accuracy": 0.636261624097824,
|
|
"num_tokens": 193574165.0,
|
|
"step": 3075
|
|
},
|
|
{
|
|
"epoch": 10.065466448445171,
|
|
"grad_norm": 16.57540736745677,
|
|
"learning_rate": 3.731183004241103e-09,
|
|
"loss": 1.8135,
|
|
"mean_token_accuracy": 0.6378425419330597,
|
|
"num_tokens": 193890421.0,
|
|
"step": 3080
|
|
},
|
|
{
|
|
"epoch": 10.081833060556464,
|
|
"grad_norm": 17.806512444498733,
|
|
"learning_rate": 3.726215725118848e-09,
|
|
"loss": 1.8265,
|
|
"mean_token_accuracy": 0.6336937725543976,
|
|
"num_tokens": 194205428.0,
|
|
"step": 3085
|
|
},
|
|
{
|
|
"epoch": 10.098199672667757,
|
|
"grad_norm": 17.813348873544196,
|
|
"learning_rate": 3.721242063382926e-09,
|
|
"loss": 1.827,
|
|
"mean_token_accuracy": 0.6328922867774963,
|
|
"num_tokens": 194520954.0,
|
|
"step": 3090
|
|
},
|
|
{
|
|
"epoch": 10.11456628477905,
|
|
"grad_norm": 17.36429708821553,
|
|
"learning_rate": 3.7162620449218997e-09,
|
|
"loss": 1.796,
|
|
"mean_token_accuracy": 0.6400941967964172,
|
|
"num_tokens": 194835641.0,
|
|
"step": 3095
|
|
},
|
|
{
|
|
"epoch": 10.130932896890343,
|
|
"grad_norm": 16.481248380963812,
|
|
"learning_rate": 3.711275695657419e-09,
|
|
"loss": 1.8043,
|
|
"mean_token_accuracy": 0.6384086728096008,
|
|
"num_tokens": 195151450.0,
|
|
"step": 3100
|
|
},
|
|
{
|
|
"epoch": 10.147299509001636,
|
|
"grad_norm": 17.113804257065954,
|
|
"learning_rate": 3.7062830415440844e-09,
|
|
"loss": 1.8284,
|
|
"mean_token_accuracy": 0.631797057390213,
|
|
"num_tokens": 195467613.0,
|
|
"step": 3105
|
|
},
|
|
{
|
|
"epoch": 10.16366612111293,
|
|
"grad_norm": 16.672796542423484,
|
|
"learning_rate": 3.7012841085693164e-09,
|
|
"loss": 1.7864,
|
|
"mean_token_accuracy": 0.6400185823440552,
|
|
"num_tokens": 195783439.0,
|
|
"step": 3110
|
|
},
|
|
{
|
|
"epoch": 10.180032733224223,
|
|
"grad_norm": 16.679087590315454,
|
|
"learning_rate": 3.696278922753216e-09,
|
|
"loss": 1.8161,
|
|
"mean_token_accuracy": 0.634281975030899,
|
|
"num_tokens": 196099260.0,
|
|
"step": 3115
|
|
},
|
|
{
|
|
"epoch": 10.196399345335516,
|
|
"grad_norm": 16.53721256199728,
|
|
"learning_rate": 3.6912675101484327e-09,
|
|
"loss": 1.7851,
|
|
"mean_token_accuracy": 0.640500259399414,
|
|
"num_tokens": 196414809.0,
|
|
"step": 3120
|
|
},
|
|
{
|
|
"epoch": 10.212765957446809,
|
|
"grad_norm": 16.569237354892337,
|
|
"learning_rate": 3.686249896840026e-09,
|
|
"loss": 1.8038,
|
|
"mean_token_accuracy": 0.6380644977092743,
|
|
"num_tokens": 196729837.0,
|
|
"step": 3125
|
|
},
|
|
{
|
|
"epoch": 10.229132569558102,
|
|
"grad_norm": 16.786788982952096,
|
|
"learning_rate": 3.68122610894533e-09,
|
|
"loss": 1.8194,
|
|
"mean_token_accuracy": 0.6346737504005432,
|
|
"num_tokens": 197044385.0,
|
|
"step": 3130
|
|
},
|
|
{
|
|
"epoch": 10.245499181669395,
|
|
"grad_norm": 16.761298565172286,
|
|
"learning_rate": 3.676196172613821e-09,
|
|
"loss": 1.7857,
|
|
"mean_token_accuracy": 0.6384332537651062,
|
|
"num_tokens": 197360546.0,
|
|
"step": 3135
|
|
},
|
|
{
|
|
"epoch": 10.261865793780688,
|
|
"grad_norm": 16.87315468806941,
|
|
"learning_rate": 3.671160114026977e-09,
|
|
"loss": 1.7984,
|
|
"mean_token_accuracy": 0.63452108502388,
|
|
"num_tokens": 197676489.0,
|
|
"step": 3140
|
|
},
|
|
{
|
|
"epoch": 10.27823240589198,
|
|
"grad_norm": 16.491704243241642,
|
|
"learning_rate": 3.666117959398143e-09,
|
|
"loss": 1.8026,
|
|
"mean_token_accuracy": 0.6362354993820191,
|
|
"num_tokens": 197990771.0,
|
|
"step": 3145
|
|
},
|
|
{
|
|
"epoch": 10.294599018003273,
|
|
"grad_norm": 16.280069613051808,
|
|
"learning_rate": 3.6610697349723955e-09,
|
|
"loss": 1.7907,
|
|
"mean_token_accuracy": 0.6383823394775391,
|
|
"num_tokens": 198305211.0,
|
|
"step": 3150
|
|
},
|
|
{
|
|
"epoch": 10.310965630114566,
|
|
"grad_norm": 16.52034352373918,
|
|
"learning_rate": 3.6560154670264046e-09,
|
|
"loss": 1.7935,
|
|
"mean_token_accuracy": 0.6377637565135956,
|
|
"num_tokens": 198621331.0,
|
|
"step": 3155
|
|
},
|
|
{
|
|
"epoch": 10.327332242225859,
|
|
"grad_norm": 15.499203423449122,
|
|
"learning_rate": 3.650955181868298e-09,
|
|
"loss": 1.7475,
|
|
"mean_token_accuracy": 0.6453053712844848,
|
|
"num_tokens": 198935989.0,
|
|
"step": 3160
|
|
},
|
|
{
|
|
"epoch": 10.343698854337152,
|
|
"grad_norm": 16.370463459072013,
|
|
"learning_rate": 3.645888905837523e-09,
|
|
"loss": 1.7855,
|
|
"mean_token_accuracy": 0.6374387383460999,
|
|
"num_tokens": 199251515.0,
|
|
"step": 3165
|
|
},
|
|
{
|
|
"epoch": 10.360065466448445,
|
|
"grad_norm": 16.299945785170074,
|
|
"learning_rate": 3.6408166653047108e-09,
|
|
"loss": 1.7836,
|
|
"mean_token_accuracy": 0.6420963048934937,
|
|
"num_tokens": 199567139.0,
|
|
"step": 3170
|
|
},
|
|
{
|
|
"epoch": 10.376432078559738,
|
|
"grad_norm": 16.335326417897214,
|
|
"learning_rate": 3.63573848667154e-09,
|
|
"loss": 1.7806,
|
|
"mean_token_accuracy": 0.6386357069015502,
|
|
"num_tokens": 199881323.0,
|
|
"step": 3175
|
|
},
|
|
{
|
|
"epoch": 10.39279869067103,
|
|
"grad_norm": 16.121960949269923,
|
|
"learning_rate": 3.630654396370594e-09,
|
|
"loss": 1.7666,
|
|
"mean_token_accuracy": 0.6415923595428467,
|
|
"num_tokens": 200197082.0,
|
|
"step": 3180
|
|
},
|
|
{
|
|
"epoch": 10.409165302782323,
|
|
"grad_norm": 15.517362094147444,
|
|
"learning_rate": 3.6255644208652316e-09,
|
|
"loss": 1.7784,
|
|
"mean_token_accuracy": 0.6394968807697297,
|
|
"num_tokens": 200513159.0,
|
|
"step": 3185
|
|
},
|
|
{
|
|
"epoch": 10.425531914893616,
|
|
"grad_norm": 15.99746353634616,
|
|
"learning_rate": 3.6204685866494426e-09,
|
|
"loss": 1.7605,
|
|
"mean_token_accuracy": 0.6430169403553009,
|
|
"num_tokens": 200829112.0,
|
|
"step": 3190
|
|
},
|
|
{
|
|
"epoch": 10.44189852700491,
|
|
"grad_norm": 16.322051281430188,
|
|
"learning_rate": 3.6153669202477113e-09,
|
|
"loss": 1.8034,
|
|
"mean_token_accuracy": 0.6361609637737274,
|
|
"num_tokens": 201145377.0,
|
|
"step": 3195
|
|
},
|
|
{
|
|
"epoch": 10.458265139116204,
|
|
"grad_norm": 17.010695164653885,
|
|
"learning_rate": 3.6102594482148815e-09,
|
|
"loss": 1.7819,
|
|
"mean_token_accuracy": 0.637069708108902,
|
|
"num_tokens": 201460905.0,
|
|
"step": 3200
|
|
},
|
|
{
|
|
"epoch": 10.474631751227497,
|
|
"grad_norm": 16.00993109809842,
|
|
"learning_rate": 3.6051461971360142e-09,
|
|
"loss": 1.7922,
|
|
"mean_token_accuracy": 0.635745245218277,
|
|
"num_tokens": 201777160.0,
|
|
"step": 3205
|
|
},
|
|
{
|
|
"epoch": 10.49099836333879,
|
|
"grad_norm": 16.528275926750045,
|
|
"learning_rate": 3.600027193626253e-09,
|
|
"loss": 1.7771,
|
|
"mean_token_accuracy": 0.6353870570659638,
|
|
"num_tokens": 202091924.0,
|
|
"step": 3210
|
|
},
|
|
{
|
|
"epoch": 10.507364975450082,
|
|
"grad_norm": 15.64504960410459,
|
|
"learning_rate": 3.5949024643306816e-09,
|
|
"loss": 1.7763,
|
|
"mean_token_accuracy": 0.6359361112117767,
|
|
"num_tokens": 202407738.0,
|
|
"step": 3215
|
|
},
|
|
{
|
|
"epoch": 10.523731587561375,
|
|
"grad_norm": 15.891551126219262,
|
|
"learning_rate": 3.5897720359241876e-09,
|
|
"loss": 1.7615,
|
|
"mean_token_accuracy": 0.6402227580547333,
|
|
"num_tokens": 202722791.0,
|
|
"step": 3220
|
|
},
|
|
{
|
|
"epoch": 10.540098199672668,
|
|
"grad_norm": 15.248335632489418,
|
|
"learning_rate": 3.5846359351113244e-09,
|
|
"loss": 1.7675,
|
|
"mean_token_accuracy": 0.6383937776088715,
|
|
"num_tokens": 203038092.0,
|
|
"step": 3225
|
|
},
|
|
{
|
|
"epoch": 10.556464811783961,
|
|
"grad_norm": 16.003380638768103,
|
|
"learning_rate": 3.57949418862617e-09,
|
|
"loss": 1.7808,
|
|
"mean_token_accuracy": 0.6339847385883332,
|
|
"num_tokens": 203354370.0,
|
|
"step": 3230
|
|
},
|
|
{
|
|
"epoch": 10.572831423895254,
|
|
"grad_norm": 15.14019211081649,
|
|
"learning_rate": 3.5743468232321897e-09,
|
|
"loss": 1.7503,
|
|
"mean_token_accuracy": 0.6410067021846771,
|
|
"num_tokens": 203671582.0,
|
|
"step": 3235
|
|
},
|
|
{
|
|
"epoch": 10.589198036006547,
|
|
"grad_norm": 15.519984653479437,
|
|
"learning_rate": 3.569193865722096e-09,
|
|
"loss": 1.7321,
|
|
"mean_token_accuracy": 0.644438773393631,
|
|
"num_tokens": 203987499.0,
|
|
"step": 3240
|
|
},
|
|
{
|
|
"epoch": 10.60556464811784,
|
|
"grad_norm": 15.407299743600658,
|
|
"learning_rate": 3.564035342917707e-09,
|
|
"loss": 1.7361,
|
|
"mean_token_accuracy": 0.6415053546428681,
|
|
"num_tokens": 204305165.0,
|
|
"step": 3245
|
|
},
|
|
{
|
|
"epoch": 10.621931260229132,
|
|
"grad_norm": 15.432492843596417,
|
|
"learning_rate": 3.558871281669811e-09,
|
|
"loss": 1.7325,
|
|
"mean_token_accuracy": 0.6469219923019409,
|
|
"num_tokens": 204621541.0,
|
|
"step": 3250
|
|
},
|
|
{
|
|
"epoch": 10.638297872340425,
|
|
"grad_norm": 15.141663299177573,
|
|
"learning_rate": 3.5537017088580244e-09,
|
|
"loss": 1.7466,
|
|
"mean_token_accuracy": 0.6414005696773529,
|
|
"num_tokens": 204938513.0,
|
|
"step": 3255
|
|
},
|
|
{
|
|
"epoch": 10.654664484451718,
|
|
"grad_norm": 15.2352538843922,
|
|
"learning_rate": 3.548526651390651e-09,
|
|
"loss": 1.7561,
|
|
"mean_token_accuracy": 0.6406930923461914,
|
|
"num_tokens": 205254497.0,
|
|
"step": 3260
|
|
},
|
|
{
|
|
"epoch": 10.671031096563011,
|
|
"grad_norm": 15.018961061358699,
|
|
"learning_rate": 3.543346136204545e-09,
|
|
"loss": 1.7721,
|
|
"mean_token_accuracy": 0.637039589881897,
|
|
"num_tokens": 205570795.0,
|
|
"step": 3265
|
|
},
|
|
{
|
|
"epoch": 10.687397708674304,
|
|
"grad_norm": 15.643328087526758,
|
|
"learning_rate": 3.538160190264966e-09,
|
|
"loss": 1.7636,
|
|
"mean_token_accuracy": 0.6392747342586518,
|
|
"num_tokens": 205886038.0,
|
|
"step": 3270
|
|
},
|
|
{
|
|
"epoch": 10.703764320785597,
|
|
"grad_norm": 14.821441582923677,
|
|
"learning_rate": 3.532968840565443e-09,
|
|
"loss": 1.7234,
|
|
"mean_token_accuracy": 0.6462870895862579,
|
|
"num_tokens": 206200570.0,
|
|
"step": 3275
|
|
},
|
|
{
|
|
"epoch": 10.72013093289689,
|
|
"grad_norm": 15.34731712498548,
|
|
"learning_rate": 3.5277721141276327e-09,
|
|
"loss": 1.7762,
|
|
"mean_token_accuracy": 0.6353504419326782,
|
|
"num_tokens": 206516423.0,
|
|
"step": 3280
|
|
},
|
|
{
|
|
"epoch": 10.736497545008183,
|
|
"grad_norm": 14.598592610119189,
|
|
"learning_rate": 3.522570038001177e-09,
|
|
"loss": 1.7293,
|
|
"mean_token_accuracy": 0.6442684292793274,
|
|
"num_tokens": 206833415.0,
|
|
"step": 3285
|
|
},
|
|
{
|
|
"epoch": 10.752864157119475,
|
|
"grad_norm": 15.030434843310996,
|
|
"learning_rate": 3.5173626392635645e-09,
|
|
"loss": 1.7071,
|
|
"mean_token_accuracy": 0.648097550868988,
|
|
"num_tokens": 207148402.0,
|
|
"step": 3290
|
|
},
|
|
{
|
|
"epoch": 10.76923076923077,
|
|
"grad_norm": 14.42367654172462,
|
|
"learning_rate": 3.512149945019989e-09,
|
|
"loss": 1.7067,
|
|
"mean_token_accuracy": 0.6503620088100434,
|
|
"num_tokens": 207463073.0,
|
|
"step": 3295
|
|
},
|
|
{
|
|
"epoch": 10.785597381342063,
|
|
"grad_norm": 14.850989940280288,
|
|
"learning_rate": 3.5069319824032076e-09,
|
|
"loss": 1.7476,
|
|
"mean_token_accuracy": 0.638144338130951,
|
|
"num_tokens": 207777543.0,
|
|
"step": 3300
|
|
},
|
|
{
|
|
"epoch": 10.801963993453356,
|
|
"grad_norm": 15.007681069243336,
|
|
"learning_rate": 3.5017087785734e-09,
|
|
"loss": 1.7264,
|
|
"mean_token_accuracy": 0.6454558491706848,
|
|
"num_tokens": 208094500.0,
|
|
"step": 3305
|
|
},
|
|
{
|
|
"epoch": 10.818330605564649,
|
|
"grad_norm": 15.777790947959936,
|
|
"learning_rate": 3.496480360718026e-09,
|
|
"loss": 1.745,
|
|
"mean_token_accuracy": 0.6422793984413147,
|
|
"num_tokens": 208410348.0,
|
|
"step": 3310
|
|
},
|
|
{
|
|
"epoch": 10.834697217675942,
|
|
"grad_norm": 15.124131114888696,
|
|
"learning_rate": 3.4912467560516886e-09,
|
|
"loss": 1.7451,
|
|
"mean_token_accuracy": 0.6406438410282135,
|
|
"num_tokens": 208725794.0,
|
|
"step": 3315
|
|
},
|
|
{
|
|
"epoch": 10.851063829787234,
|
|
"grad_norm": 14.901110226184628,
|
|
"learning_rate": 3.4860079918159844e-09,
|
|
"loss": 1.7217,
|
|
"mean_token_accuracy": 0.6472250401973725,
|
|
"num_tokens": 209041520.0,
|
|
"step": 3320
|
|
},
|
|
{
|
|
"epoch": 10.867430441898527,
|
|
"grad_norm": 14.874077465654924,
|
|
"learning_rate": 3.4807640952793695e-09,
|
|
"loss": 1.7252,
|
|
"mean_token_accuracy": 0.6431742966175079,
|
|
"num_tokens": 209357603.0,
|
|
"step": 3325
|
|
},
|
|
{
|
|
"epoch": 10.88379705400982,
|
|
"grad_norm": 14.995454699176682,
|
|
"learning_rate": 3.4755150937370124e-09,
|
|
"loss": 1.7256,
|
|
"mean_token_accuracy": 0.6463587701320648,
|
|
"num_tokens": 209673991.0,
|
|
"step": 3330
|
|
},
|
|
{
|
|
"epoch": 10.900163666121113,
|
|
"grad_norm": 14.494540634301961,
|
|
"learning_rate": 3.4702610145106545e-09,
|
|
"loss": 1.6856,
|
|
"mean_token_accuracy": 0.6515645325183869,
|
|
"num_tokens": 209989177.0,
|
|
"step": 3335
|
|
},
|
|
{
|
|
"epoch": 10.916530278232406,
|
|
"grad_norm": 14.950428049798738,
|
|
"learning_rate": 3.465001884948468e-09,
|
|
"loss": 1.7412,
|
|
"mean_token_accuracy": 0.6403026700019836,
|
|
"num_tokens": 210304282.0,
|
|
"step": 3340
|
|
},
|
|
{
|
|
"epoch": 10.932896890343699,
|
|
"grad_norm": 14.463577816118367,
|
|
"learning_rate": 3.45973773242491e-09,
|
|
"loss": 1.7279,
|
|
"mean_token_accuracy": 0.6433824181556702,
|
|
"num_tokens": 210618390.0,
|
|
"step": 3345
|
|
},
|
|
{
|
|
"epoch": 10.949263502454992,
|
|
"grad_norm": 14.518198523486582,
|
|
"learning_rate": 3.4544685843405875e-09,
|
|
"loss": 1.696,
|
|
"mean_token_accuracy": 0.6500982105731964,
|
|
"num_tokens": 210932450.0,
|
|
"step": 3350
|
|
},
|
|
{
|
|
"epoch": 10.965630114566284,
|
|
"grad_norm": 14.202566979681045,
|
|
"learning_rate": 3.4491944681221065e-09,
|
|
"loss": 1.7046,
|
|
"mean_token_accuracy": 0.6483056366443634,
|
|
"num_tokens": 211249906.0,
|
|
"step": 3355
|
|
},
|
|
{
|
|
"epoch": 10.981996726677577,
|
|
"grad_norm": 14.357942906401359,
|
|
"learning_rate": 3.443915411221933e-09,
|
|
"loss": 1.6951,
|
|
"mean_token_accuracy": 0.6466932356357574,
|
|
"num_tokens": 211566650.0,
|
|
"step": 3360
|
|
},
|
|
{
|
|
"epoch": 10.99836333878887,
|
|
"grad_norm": 13.989121868685887,
|
|
"learning_rate": 3.43863144111825e-09,
|
|
"loss": 1.716,
|
|
"mean_token_accuracy": 0.6450442373752594,
|
|
"num_tokens": 211881793.0,
|
|
"step": 3365
|
|
},
|
|
{
|
|
"epoch": 11.013093289689035,
|
|
"grad_norm": 14.34498478284,
|
|
"learning_rate": 3.4333425853148157e-09,
|
|
"loss": 1.6897,
|
|
"mean_token_accuracy": 0.64995010693868,
|
|
"num_tokens": 212143193.0,
|
|
"step": 3370
|
|
},
|
|
{
|
|
"epoch": 11.029459901800328,
|
|
"grad_norm": 14.53268159695013,
|
|
"learning_rate": 3.4280488713408185e-09,
|
|
"loss": 1.7068,
|
|
"mean_token_accuracy": 0.6474012017250061,
|
|
"num_tokens": 212460404.0,
|
|
"step": 3375
|
|
},
|
|
{
|
|
"epoch": 11.04582651391162,
|
|
"grad_norm": 14.166663868063697,
|
|
"learning_rate": 3.4227503267507332e-09,
|
|
"loss": 1.7053,
|
|
"mean_token_accuracy": 0.6485930442810058,
|
|
"num_tokens": 212777430.0,
|
|
"step": 3380
|
|
},
|
|
{
|
|
"epoch": 11.062193126022914,
|
|
"grad_norm": 14.456689486688052,
|
|
"learning_rate": 3.41744697912418e-09,
|
|
"loss": 1.7112,
|
|
"mean_token_accuracy": 0.6475742220878601,
|
|
"num_tokens": 213093496.0,
|
|
"step": 3385
|
|
},
|
|
{
|
|
"epoch": 11.078559738134206,
|
|
"grad_norm": 14.097537246192404,
|
|
"learning_rate": 3.4121388560657785e-09,
|
|
"loss": 1.7106,
|
|
"mean_token_accuracy": 0.6455345630645752,
|
|
"num_tokens": 213409098.0,
|
|
"step": 3390
|
|
},
|
|
{
|
|
"epoch": 11.0949263502455,
|
|
"grad_norm": 13.933961411959768,
|
|
"learning_rate": 3.406825985205005e-09,
|
|
"loss": 1.729,
|
|
"mean_token_accuracy": 0.6434445321559906,
|
|
"num_tokens": 213725910.0,
|
|
"step": 3395
|
|
},
|
|
{
|
|
"epoch": 11.111292962356792,
|
|
"grad_norm": 14.752945706255815,
|
|
"learning_rate": 3.401508394196049e-09,
|
|
"loss": 1.6957,
|
|
"mean_token_accuracy": 0.6487627983093261,
|
|
"num_tokens": 214040857.0,
|
|
"step": 3400
|
|
},
|
|
{
|
|
"epoch": 11.127659574468085,
|
|
"grad_norm": 14.20709457845416,
|
|
"learning_rate": 3.39618611071767e-09,
|
|
"loss": 1.6807,
|
|
"mean_token_accuracy": 0.6522836029529572,
|
|
"num_tokens": 214356466.0,
|
|
"step": 3405
|
|
},
|
|
{
|
|
"epoch": 11.144026186579378,
|
|
"grad_norm": 13.398213700828151,
|
|
"learning_rate": 3.3908591624730512e-09,
|
|
"loss": 1.6381,
|
|
"mean_token_accuracy": 0.6592851996421814,
|
|
"num_tokens": 214672248.0,
|
|
"step": 3410
|
|
},
|
|
{
|
|
"epoch": 11.16039279869067,
|
|
"grad_norm": 13.731779015035574,
|
|
"learning_rate": 3.385527577189656e-09,
|
|
"loss": 1.7024,
|
|
"mean_token_accuracy": 0.6492987155914307,
|
|
"num_tokens": 214986750.0,
|
|
"step": 3415
|
|
},
|
|
{
|
|
"epoch": 11.176759410801964,
|
|
"grad_norm": 13.62502150214815,
|
|
"learning_rate": 3.3801913826190855e-09,
|
|
"loss": 1.6919,
|
|
"mean_token_accuracy": 0.6497934758663177,
|
|
"num_tokens": 215300415.0,
|
|
"step": 3420
|
|
},
|
|
{
|
|
"epoch": 11.193126022913257,
|
|
"grad_norm": 13.588807308802734,
|
|
"learning_rate": 3.374850606536933e-09,
|
|
"loss": 1.6774,
|
|
"mean_token_accuracy": 0.6529603004455566,
|
|
"num_tokens": 215616055.0,
|
|
"step": 3425
|
|
},
|
|
{
|
|
"epoch": 11.20949263502455,
|
|
"grad_norm": 14.004348021352275,
|
|
"learning_rate": 3.3695052767426376e-09,
|
|
"loss": 1.6831,
|
|
"mean_token_accuracy": 0.6526573598384857,
|
|
"num_tokens": 215931529.0,
|
|
"step": 3430
|
|
},
|
|
{
|
|
"epoch": 11.225859247135842,
|
|
"grad_norm": 14.469828986147332,
|
|
"learning_rate": 3.3641554210593416e-09,
|
|
"loss": 1.6917,
|
|
"mean_token_accuracy": 0.6479479074478149,
|
|
"num_tokens": 216246536.0,
|
|
"step": 3435
|
|
},
|
|
{
|
|
"epoch": 11.242225859247135,
|
|
"grad_norm": 13.724821289796603,
|
|
"learning_rate": 3.358801067333747e-09,
|
|
"loss": 1.6773,
|
|
"mean_token_accuracy": 0.6549168825149536,
|
|
"num_tokens": 216560534.0,
|
|
"step": 3440
|
|
},
|
|
{
|
|
"epoch": 11.258592471358428,
|
|
"grad_norm": 13.96963647530131,
|
|
"learning_rate": 3.3534422434359656e-09,
|
|
"loss": 1.6632,
|
|
"mean_token_accuracy": 0.6518442153930664,
|
|
"num_tokens": 216874009.0,
|
|
"step": 3445
|
|
},
|
|
{
|
|
"epoch": 11.27495908346972,
|
|
"grad_norm": 14.002011605528583,
|
|
"learning_rate": 3.3480789772593793e-09,
|
|
"loss": 1.6514,
|
|
"mean_token_accuracy": 0.6585093021392823,
|
|
"num_tokens": 217189692.0,
|
|
"step": 3450
|
|
},
|
|
{
|
|
"epoch": 11.291325695581016,
|
|
"grad_norm": 13.844181087235965,
|
|
"learning_rate": 3.342711296720492e-09,
|
|
"loss": 1.6611,
|
|
"mean_token_accuracy": 0.6569892466068268,
|
|
"num_tokens": 217505372.0,
|
|
"step": 3455
|
|
},
|
|
{
|
|
"epoch": 11.307692307692308,
|
|
"grad_norm": 13.798099661785878,
|
|
"learning_rate": 3.3373392297587847e-09,
|
|
"loss": 1.6746,
|
|
"mean_token_accuracy": 0.6542344272136689,
|
|
"num_tokens": 217820392.0,
|
|
"step": 3460
|
|
},
|
|
{
|
|
"epoch": 11.324058919803601,
|
|
"grad_norm": 13.633044394240601,
|
|
"learning_rate": 3.3319628043365703e-09,
|
|
"loss": 1.6726,
|
|
"mean_token_accuracy": 0.6545675575733185,
|
|
"num_tokens": 218136541.0,
|
|
"step": 3465
|
|
},
|
|
{
|
|
"epoch": 11.340425531914894,
|
|
"grad_norm": 13.646712769096844,
|
|
"learning_rate": 3.3265820484388485e-09,
|
|
"loss": 1.6754,
|
|
"mean_token_accuracy": 0.6528559982776642,
|
|
"num_tokens": 218453741.0,
|
|
"step": 3470
|
|
},
|
|
{
|
|
"epoch": 11.356792144026187,
|
|
"grad_norm": 13.174588490549636,
|
|
"learning_rate": 3.3211969900731597e-09,
|
|
"loss": 1.6592,
|
|
"mean_token_accuracy": 0.6580550074577332,
|
|
"num_tokens": 218771030.0,
|
|
"step": 3475
|
|
},
|
|
{
|
|
"epoch": 11.37315875613748,
|
|
"grad_norm": 13.901574990786047,
|
|
"learning_rate": 3.3158076572694386e-09,
|
|
"loss": 1.6747,
|
|
"mean_token_accuracy": 0.6550671398639679,
|
|
"num_tokens": 219087292.0,
|
|
"step": 3480
|
|
},
|
|
{
|
|
"epoch": 11.389525368248773,
|
|
"grad_norm": 14.015642465284007,
|
|
"learning_rate": 3.3104140780798685e-09,
|
|
"loss": 1.6646,
|
|
"mean_token_accuracy": 0.6581855952739716,
|
|
"num_tokens": 219403238.0,
|
|
"step": 3485
|
|
},
|
|
{
|
|
"epoch": 11.405891980360066,
|
|
"grad_norm": 13.904282659392736,
|
|
"learning_rate": 3.3050162805787375e-09,
|
|
"loss": 1.6741,
|
|
"mean_token_accuracy": 0.6535765409469605,
|
|
"num_tokens": 219719693.0,
|
|
"step": 3490
|
|
},
|
|
{
|
|
"epoch": 11.422258592471358,
|
|
"grad_norm": 13.789370291753515,
|
|
"learning_rate": 3.2996142928622896e-09,
|
|
"loss": 1.6654,
|
|
"mean_token_accuracy": 0.6573232293128968,
|
|
"num_tokens": 220037015.0,
|
|
"step": 3495
|
|
},
|
|
{
|
|
"epoch": 11.438625204582651,
|
|
"grad_norm": 13.832705931075548,
|
|
"learning_rate": 3.2942081430485782e-09,
|
|
"loss": 1.6465,
|
|
"mean_token_accuracy": 0.6604346215724946,
|
|
"num_tokens": 220354265.0,
|
|
"step": 3500
|
|
},
|
|
{
|
|
"epoch": 11.454991816693944,
|
|
"grad_norm": 13.557976866092726,
|
|
"learning_rate": 3.2887978592773234e-09,
|
|
"loss": 1.6666,
|
|
"mean_token_accuracy": 0.6565652191638947,
|
|
"num_tokens": 220669974.0,
|
|
"step": 3505
|
|
},
|
|
{
|
|
"epoch": 11.471358428805237,
|
|
"grad_norm": 13.210160428205072,
|
|
"learning_rate": 3.2833834697097608e-09,
|
|
"loss": 1.6527,
|
|
"mean_token_accuracy": 0.6604192018508911,
|
|
"num_tokens": 220986724.0,
|
|
"step": 3510
|
|
},
|
|
{
|
|
"epoch": 11.48772504091653,
|
|
"grad_norm": 13.689016733980383,
|
|
"learning_rate": 3.2779650025284985e-09,
|
|
"loss": 1.6738,
|
|
"mean_token_accuracy": 0.6558438301086426,
|
|
"num_tokens": 221302826.0,
|
|
"step": 3515
|
|
},
|
|
{
|
|
"epoch": 11.504091653027823,
|
|
"grad_norm": 13.329552275037532,
|
|
"learning_rate": 3.2725424859373687e-09,
|
|
"loss": 1.6531,
|
|
"mean_token_accuracy": 0.6575049161911011,
|
|
"num_tokens": 221617698.0,
|
|
"step": 3520
|
|
},
|
|
{
|
|
"epoch": 11.520458265139116,
|
|
"grad_norm": 13.630816765028563,
|
|
"learning_rate": 3.267115948161282e-09,
|
|
"loss": 1.6479,
|
|
"mean_token_accuracy": 0.6591286897659302,
|
|
"num_tokens": 221931887.0,
|
|
"step": 3525
|
|
},
|
|
{
|
|
"epoch": 11.536824877250409,
|
|
"grad_norm": 12.94733730296298,
|
|
"learning_rate": 3.2616854174460786e-09,
|
|
"loss": 1.6383,
|
|
"mean_token_accuracy": 0.6619257152080535,
|
|
"num_tokens": 222247478.0,
|
|
"step": 3530
|
|
},
|
|
{
|
|
"epoch": 11.553191489361701,
|
|
"grad_norm": 13.878682812123111,
|
|
"learning_rate": 3.256250922058383e-09,
|
|
"loss": 1.6523,
|
|
"mean_token_accuracy": 0.6560933649539947,
|
|
"num_tokens": 222561836.0,
|
|
"step": 3535
|
|
},
|
|
{
|
|
"epoch": 11.569558101472996,
|
|
"grad_norm": 13.544551226928728,
|
|
"learning_rate": 3.2508124902854567e-09,
|
|
"loss": 1.6409,
|
|
"mean_token_accuracy": 0.6584199607372284,
|
|
"num_tokens": 222878184.0,
|
|
"step": 3540
|
|
},
|
|
{
|
|
"epoch": 11.585924713584289,
|
|
"grad_norm": 13.67811999848458,
|
|
"learning_rate": 3.2453701504350507e-09,
|
|
"loss": 1.6471,
|
|
"mean_token_accuracy": 0.6592689633369446,
|
|
"num_tokens": 223192021.0,
|
|
"step": 3545
|
|
},
|
|
{
|
|
"epoch": 11.602291325695582,
|
|
"grad_norm": 13.194026543743341,
|
|
"learning_rate": 3.239923930835257e-09,
|
|
"loss": 1.6546,
|
|
"mean_token_accuracy": 0.6571596205234528,
|
|
"num_tokens": 223507736.0,
|
|
"step": 3550
|
|
},
|
|
{
|
|
"epoch": 11.618657937806875,
|
|
"grad_norm": 13.438919843074075,
|
|
"learning_rate": 3.234473859834364e-09,
|
|
"loss": 1.6612,
|
|
"mean_token_accuracy": 0.6565041303634643,
|
|
"num_tokens": 223822076.0,
|
|
"step": 3555
|
|
},
|
|
{
|
|
"epoch": 11.635024549918167,
|
|
"grad_norm": 13.572873566039807,
|
|
"learning_rate": 3.229019965800705e-09,
|
|
"loss": 1.6361,
|
|
"mean_token_accuracy": 0.6581840515136719,
|
|
"num_tokens": 224137036.0,
|
|
"step": 3560
|
|
},
|
|
{
|
|
"epoch": 11.65139116202946,
|
|
"grad_norm": 13.11267130097918,
|
|
"learning_rate": 3.2235622771225127e-09,
|
|
"loss": 1.6344,
|
|
"mean_token_accuracy": 0.6608539044857025,
|
|
"num_tokens": 224455359.0,
|
|
"step": 3565
|
|
},
|
|
{
|
|
"epoch": 11.667757774140753,
|
|
"grad_norm": 13.745388564823362,
|
|
"learning_rate": 3.2181008222077746e-09,
|
|
"loss": 1.6102,
|
|
"mean_token_accuracy": 0.6658849954605103,
|
|
"num_tokens": 224771875.0,
|
|
"step": 3570
|
|
},
|
|
{
|
|
"epoch": 11.684124386252046,
|
|
"grad_norm": 13.368804864950883,
|
|
"learning_rate": 3.2126356294840787e-09,
|
|
"loss": 1.6308,
|
|
"mean_token_accuracy": 0.6618900954723358,
|
|
"num_tokens": 225086494.0,
|
|
"step": 3575
|
|
},
|
|
{
|
|
"epoch": 11.700490998363339,
|
|
"grad_norm": 13.071279177114466,
|
|
"learning_rate": 3.2071667273984706e-09,
|
|
"loss": 1.622,
|
|
"mean_token_accuracy": 0.6624108016490936,
|
|
"num_tokens": 225399906.0,
|
|
"step": 3580
|
|
},
|
|
{
|
|
"epoch": 11.716857610474632,
|
|
"grad_norm": 13.744495890253397,
|
|
"learning_rate": 3.2016941444173014e-09,
|
|
"loss": 1.6291,
|
|
"mean_token_accuracy": 0.6606933116912842,
|
|
"num_tokens": 225716561.0,
|
|
"step": 3585
|
|
},
|
|
{
|
|
"epoch": 11.733224222585925,
|
|
"grad_norm": 13.264943229878893,
|
|
"learning_rate": 3.1962179090260845e-09,
|
|
"loss": 1.6268,
|
|
"mean_token_accuracy": 0.6614998817443848,
|
|
"num_tokens": 226031872.0,
|
|
"step": 3590
|
|
},
|
|
{
|
|
"epoch": 11.749590834697218,
|
|
"grad_norm": 13.062220154925717,
|
|
"learning_rate": 3.1907380497293427e-09,
|
|
"loss": 1.6423,
|
|
"mean_token_accuracy": 0.6583823144435883,
|
|
"num_tokens": 226348444.0,
|
|
"step": 3595
|
|
},
|
|
{
|
|
"epoch": 11.76595744680851,
|
|
"grad_norm": 13.416738533921773,
|
|
"learning_rate": 3.185254595050463e-09,
|
|
"loss": 1.619,
|
|
"mean_token_accuracy": 0.6620403587818146,
|
|
"num_tokens": 226664829.0,
|
|
"step": 3600
|
|
},
|
|
{
|
|
"epoch": 11.782324058919803,
|
|
"grad_norm": 12.674798008155374,
|
|
"learning_rate": 3.1797675735315457e-09,
|
|
"loss": 1.6193,
|
|
"mean_token_accuracy": 0.6634869813919068,
|
|
"num_tokens": 226980423.0,
|
|
"step": 3605
|
|
},
|
|
{
|
|
"epoch": 11.798690671031096,
|
|
"grad_norm": 13.115142248657635,
|
|
"learning_rate": 3.174277013733257e-09,
|
|
"loss": 1.6076,
|
|
"mean_token_accuracy": 0.6654283106327057,
|
|
"num_tokens": 227295351.0,
|
|
"step": 3610
|
|
},
|
|
{
|
|
"epoch": 11.815057283142389,
|
|
"grad_norm": 13.29350625734606,
|
|
"learning_rate": 3.1687829442346814e-09,
|
|
"loss": 1.6139,
|
|
"mean_token_accuracy": 0.6620844781398774,
|
|
"num_tokens": 227611552.0,
|
|
"step": 3615
|
|
},
|
|
{
|
|
"epoch": 11.831423895253682,
|
|
"grad_norm": 13.061041661166026,
|
|
"learning_rate": 3.1632853936331713e-09,
|
|
"loss": 1.635,
|
|
"mean_token_accuracy": 0.6598536133766174,
|
|
"num_tokens": 227925667.0,
|
|
"step": 3620
|
|
},
|
|
{
|
|
"epoch": 11.847790507364975,
|
|
"grad_norm": 13.537642032805756,
|
|
"learning_rate": 3.1577843905441977e-09,
|
|
"loss": 1.6446,
|
|
"mean_token_accuracy": 0.6571721136569977,
|
|
"num_tokens": 228241524.0,
|
|
"step": 3625
|
|
},
|
|
{
|
|
"epoch": 11.864157119476268,
|
|
"grad_norm": 13.81972607646031,
|
|
"learning_rate": 3.152279963601204e-09,
|
|
"loss": 1.6274,
|
|
"mean_token_accuracy": 0.6625904440879822,
|
|
"num_tokens": 228554391.0,
|
|
"step": 3630
|
|
},
|
|
{
|
|
"epoch": 11.880523731587562,
|
|
"grad_norm": 12.818969485029632,
|
|
"learning_rate": 3.146772141455454e-09,
|
|
"loss": 1.5954,
|
|
"mean_token_accuracy": 0.6688964426517486,
|
|
"num_tokens": 228871206.0,
|
|
"step": 3635
|
|
},
|
|
{
|
|
"epoch": 11.896890343698855,
|
|
"grad_norm": 13.422568818132685,
|
|
"learning_rate": 3.1412609527758852e-09,
|
|
"loss": 1.5942,
|
|
"mean_token_accuracy": 0.6679134428501129,
|
|
"num_tokens": 229187692.0,
|
|
"step": 3640
|
|
},
|
|
{
|
|
"epoch": 11.913256955810148,
|
|
"grad_norm": 12.777528450533671,
|
|
"learning_rate": 3.1357464262489556e-09,
|
|
"loss": 1.6013,
|
|
"mean_token_accuracy": 0.6679581284523011,
|
|
"num_tokens": 229503728.0,
|
|
"step": 3645
|
|
},
|
|
{
|
|
"epoch": 11.92962356792144,
|
|
"grad_norm": 13.562107747673085,
|
|
"learning_rate": 3.1302285905785e-09,
|
|
"loss": 1.632,
|
|
"mean_token_accuracy": 0.6611423671245575,
|
|
"num_tokens": 229819869.0,
|
|
"step": 3650
|
|
},
|
|
{
|
|
"epoch": 11.945990180032734,
|
|
"grad_norm": 12.523050886924725,
|
|
"learning_rate": 3.124707474485577e-09,
|
|
"loss": 1.6184,
|
|
"mean_token_accuracy": 0.6638299524784088,
|
|
"num_tokens": 230134907.0,
|
|
"step": 3655
|
|
},
|
|
{
|
|
"epoch": 11.962356792144027,
|
|
"grad_norm": 12.611898786093215,
|
|
"learning_rate": 3.11918310670832e-09,
|
|
"loss": 1.6214,
|
|
"mean_token_accuracy": 0.6640681028366089,
|
|
"num_tokens": 230450604.0,
|
|
"step": 3660
|
|
},
|
|
{
|
|
"epoch": 11.97872340425532,
|
|
"grad_norm": 13.276037366533183,
|
|
"learning_rate": 3.1136555160017866e-09,
|
|
"loss": 1.6167,
|
|
"mean_token_accuracy": 0.6637542247772217,
|
|
"num_tokens": 230767066.0,
|
|
"step": 3665
|
|
},
|
|
{
|
|
"epoch": 11.995090016366612,
|
|
"grad_norm": 12.74004340390755,
|
|
"learning_rate": 3.1081247311378134e-09,
|
|
"loss": 1.5976,
|
|
"mean_token_accuracy": 0.6676555573940277,
|
|
"num_tokens": 231081724.0,
|
|
"step": 3670
|
|
},
|
|
{
|
|
"epoch": 12.009819967266775,
|
|
"grad_norm": 12.948745150559041,
|
|
"learning_rate": 3.1025907809048586e-09,
|
|
"loss": 1.6031,
|
|
"mean_token_accuracy": 0.6675824787881639,
|
|
"num_tokens": 231343267.0,
|
|
"step": 3675
|
|
},
|
|
{
|
|
"epoch": 12.026186579378068,
|
|
"grad_norm": 12.843382353063111,
|
|
"learning_rate": 3.0970536941078607e-09,
|
|
"loss": 1.6057,
|
|
"mean_token_accuracy": 0.6665944814682007,
|
|
"num_tokens": 231659119.0,
|
|
"step": 3680
|
|
},
|
|
{
|
|
"epoch": 12.042553191489361,
|
|
"grad_norm": 12.824640395729176,
|
|
"learning_rate": 3.091513499568082e-09,
|
|
"loss": 1.6,
|
|
"mean_token_accuracy": 0.6633087992668152,
|
|
"num_tokens": 231975244.0,
|
|
"step": 3685
|
|
},
|
|
{
|
|
"epoch": 12.058919803600654,
|
|
"grad_norm": 12.908617166906525,
|
|
"learning_rate": 3.0859702261229617e-09,
|
|
"loss": 1.6164,
|
|
"mean_token_accuracy": 0.6648675918579101,
|
|
"num_tokens": 232292418.0,
|
|
"step": 3690
|
|
},
|
|
{
|
|
"epoch": 12.075286415711947,
|
|
"grad_norm": 13.006079515249429,
|
|
"learning_rate": 3.0804239026259663e-09,
|
|
"loss": 1.6116,
|
|
"mean_token_accuracy": 0.6631251335144043,
|
|
"num_tokens": 232607064.0,
|
|
"step": 3695
|
|
},
|
|
{
|
|
"epoch": 12.091653027823241,
|
|
"grad_norm": 12.954002093848636,
|
|
"learning_rate": 3.074874557946434e-09,
|
|
"loss": 1.6229,
|
|
"mean_token_accuracy": 0.6616682350635529,
|
|
"num_tokens": 232922763.0,
|
|
"step": 3700
|
|
},
|
|
{
|
|
"epoch": 12.108019639934534,
|
|
"grad_norm": 13.00134579290529,
|
|
"learning_rate": 3.0693222209694336e-09,
|
|
"loss": 1.5862,
|
|
"mean_token_accuracy": 0.669516533613205,
|
|
"num_tokens": 233239213.0,
|
|
"step": 3705
|
|
},
|
|
{
|
|
"epoch": 12.124386252045827,
|
|
"grad_norm": 12.585401536226662,
|
|
"learning_rate": 3.063766920595608e-09,
|
|
"loss": 1.5904,
|
|
"mean_token_accuracy": 0.6675543785095215,
|
|
"num_tokens": 233554467.0,
|
|
"step": 3710
|
|
},
|
|
{
|
|
"epoch": 12.14075286415712,
|
|
"grad_norm": 12.663377303240502,
|
|
"learning_rate": 3.058208685741023e-09,
|
|
"loss": 1.5966,
|
|
"mean_token_accuracy": 0.6668773412704467,
|
|
"num_tokens": 233871092.0,
|
|
"step": 3715
|
|
},
|
|
{
|
|
"epoch": 12.157119476268413,
|
|
"grad_norm": 12.911660404059452,
|
|
"learning_rate": 3.0526475453370206e-09,
|
|
"loss": 1.5989,
|
|
"mean_token_accuracy": 0.6680683135986328,
|
|
"num_tokens": 234188066.0,
|
|
"step": 3720
|
|
},
|
|
{
|
|
"epoch": 12.173486088379706,
|
|
"grad_norm": 12.911278000841309,
|
|
"learning_rate": 3.047083528330066e-09,
|
|
"loss": 1.5815,
|
|
"mean_token_accuracy": 0.6699133396148682,
|
|
"num_tokens": 234503095.0,
|
|
"step": 3725
|
|
},
|
|
{
|
|
"epoch": 12.189852700490999,
|
|
"grad_norm": 12.794569499674953,
|
|
"learning_rate": 3.0415166636815965e-09,
|
|
"loss": 1.5895,
|
|
"mean_token_accuracy": 0.6685011863708497,
|
|
"num_tokens": 234818492.0,
|
|
"step": 3730
|
|
},
|
|
{
|
|
"epoch": 12.206219312602292,
|
|
"grad_norm": 12.502821002737141,
|
|
"learning_rate": 3.035946980367873e-09,
|
|
"loss": 1.6057,
|
|
"mean_token_accuracy": 0.6661465525627136,
|
|
"num_tokens": 235133128.0,
|
|
"step": 3735
|
|
},
|
|
{
|
|
"epoch": 12.222585924713584,
|
|
"grad_norm": 12.054248094194744,
|
|
"learning_rate": 3.0303745073798283e-09,
|
|
"loss": 1.585,
|
|
"mean_token_accuracy": 0.6686634719371796,
|
|
"num_tokens": 235449485.0,
|
|
"step": 3740
|
|
},
|
|
{
|
|
"epoch": 12.238952536824877,
|
|
"grad_norm": 12.434412926443686,
|
|
"learning_rate": 3.0247992737229145e-09,
|
|
"loss": 1.5734,
|
|
"mean_token_accuracy": 0.6727577865123748,
|
|
"num_tokens": 235765311.0,
|
|
"step": 3745
|
|
},
|
|
{
|
|
"epoch": 12.25531914893617,
|
|
"grad_norm": 13.13401634274974,
|
|
"learning_rate": 3.0192213084169547e-09,
|
|
"loss": 1.584,
|
|
"mean_token_accuracy": 0.6712262392044067,
|
|
"num_tokens": 236081014.0,
|
|
"step": 3750
|
|
},
|
|
{
|
|
"epoch": 12.271685761047463,
|
|
"grad_norm": 12.253526888229622,
|
|
"learning_rate": 3.0136406404959894e-09,
|
|
"loss": 1.5837,
|
|
"mean_token_accuracy": 0.6716396868228912,
|
|
"num_tokens": 236396309.0,
|
|
"step": 3755
|
|
},
|
|
{
|
|
"epoch": 12.288052373158756,
|
|
"grad_norm": 12.300536705102283,
|
|
"learning_rate": 3.008057299008127e-09,
|
|
"loss": 1.5915,
|
|
"mean_token_accuracy": 0.6713536083698273,
|
|
"num_tokens": 236711861.0,
|
|
"step": 3760
|
|
},
|
|
{
|
|
"epoch": 12.304418985270049,
|
|
"grad_norm": 12.542173938542376,
|
|
"learning_rate": 3.0024713130153915e-09,
|
|
"loss": 1.5832,
|
|
"mean_token_accuracy": 0.673738706111908,
|
|
"num_tokens": 237027950.0,
|
|
"step": 3765
|
|
},
|
|
{
|
|
"epoch": 12.320785597381342,
|
|
"grad_norm": 12.858376257223238,
|
|
"learning_rate": 2.9968827115935733e-09,
|
|
"loss": 1.5897,
|
|
"mean_token_accuracy": 0.6709151029586792,
|
|
"num_tokens": 237345505.0,
|
|
"step": 3770
|
|
},
|
|
{
|
|
"epoch": 12.337152209492634,
|
|
"grad_norm": 12.689502324676397,
|
|
"learning_rate": 2.9912915238320756e-09,
|
|
"loss": 1.5791,
|
|
"mean_token_accuracy": 0.6716641247272491,
|
|
"num_tokens": 237659458.0,
|
|
"step": 3775
|
|
},
|
|
{
|
|
"epoch": 12.353518821603927,
|
|
"grad_norm": 12.381858812642847,
|
|
"learning_rate": 2.985697778833765e-09,
|
|
"loss": 1.5628,
|
|
"mean_token_accuracy": 0.6747232496738433,
|
|
"num_tokens": 237975560.0,
|
|
"step": 3780
|
|
},
|
|
{
|
|
"epoch": 12.36988543371522,
|
|
"grad_norm": 13.294914967347232,
|
|
"learning_rate": 2.9801015057148156e-09,
|
|
"loss": 1.6012,
|
|
"mean_token_accuracy": 0.6689247190952301,
|
|
"num_tokens": 238291458.0,
|
|
"step": 3785
|
|
},
|
|
{
|
|
"epoch": 12.386252045826513,
|
|
"grad_norm": 12.5479953722173,
|
|
"learning_rate": 2.974502733604565e-09,
|
|
"loss": 1.553,
|
|
"mean_token_accuracy": 0.6777917623519898,
|
|
"num_tokens": 238607733.0,
|
|
"step": 3790
|
|
},
|
|
{
|
|
"epoch": 12.402618657937808,
|
|
"grad_norm": 12.801549719803207,
|
|
"learning_rate": 2.968901491645355e-09,
|
|
"loss": 1.5887,
|
|
"mean_token_accuracy": 0.6701488494873047,
|
|
"num_tokens": 238923547.0,
|
|
"step": 3795
|
|
},
|
|
{
|
|
"epoch": 12.4189852700491,
|
|
"grad_norm": 12.751637893126095,
|
|
"learning_rate": 2.963297808992385e-09,
|
|
"loss": 1.5703,
|
|
"mean_token_accuracy": 0.6744679152965546,
|
|
"num_tokens": 239240577.0,
|
|
"step": 3800
|
|
},
|
|
{
|
|
"epoch": 12.435351882160393,
|
|
"grad_norm": 13.321683101603757,
|
|
"learning_rate": 2.9576917148135583e-09,
|
|
"loss": 1.5673,
|
|
"mean_token_accuracy": 0.6739533841609955,
|
|
"num_tokens": 239555986.0,
|
|
"step": 3805
|
|
},
|
|
{
|
|
"epoch": 12.451718494271686,
|
|
"grad_norm": 12.627225187767989,
|
|
"learning_rate": 2.9520832382893313e-09,
|
|
"loss": 1.5755,
|
|
"mean_token_accuracy": 0.6728475570678711,
|
|
"num_tokens": 239872210.0,
|
|
"step": 3810
|
|
},
|
|
{
|
|
"epoch": 12.46808510638298,
|
|
"grad_norm": 12.773046439619442,
|
|
"learning_rate": 2.9464724086125582e-09,
|
|
"loss": 1.5773,
|
|
"mean_token_accuracy": 0.6726521015167236,
|
|
"num_tokens": 240188732.0,
|
|
"step": 3815
|
|
},
|
|
{
|
|
"epoch": 12.484451718494272,
|
|
"grad_norm": 13.121405300009847,
|
|
"learning_rate": 2.940859254988344e-09,
|
|
"loss": 1.5637,
|
|
"mean_token_accuracy": 0.6766914367675781,
|
|
"num_tokens": 240503830.0,
|
|
"step": 3820
|
|
},
|
|
{
|
|
"epoch": 12.500818330605565,
|
|
"grad_norm": 12.688014889794543,
|
|
"learning_rate": 2.9352438066338895e-09,
|
|
"loss": 1.5945,
|
|
"mean_token_accuracy": 0.671603900194168,
|
|
"num_tokens": 240819434.0,
|
|
"step": 3825
|
|
},
|
|
{
|
|
"epoch": 12.517184942716858,
|
|
"grad_norm": 12.343229509904557,
|
|
"learning_rate": 2.9296260927783397e-09,
|
|
"loss": 1.5781,
|
|
"mean_token_accuracy": 0.6743817329406738,
|
|
"num_tokens": 241135183.0,
|
|
"step": 3830
|
|
},
|
|
{
|
|
"epoch": 12.53355155482815,
|
|
"grad_norm": 12.241788090729434,
|
|
"learning_rate": 2.924006142662632e-09,
|
|
"loss": 1.5489,
|
|
"mean_token_accuracy": 0.6783745586872101,
|
|
"num_tokens": 241452289.0,
|
|
"step": 3835
|
|
},
|
|
{
|
|
"epoch": 12.549918166939444,
|
|
"grad_norm": 12.800379983122136,
|
|
"learning_rate": 2.918383985539344e-09,
|
|
"loss": 1.581,
|
|
"mean_token_accuracy": 0.6698661625385285,
|
|
"num_tokens": 241766768.0,
|
|
"step": 3840
|
|
},
|
|
{
|
|
"epoch": 12.566284779050736,
|
|
"grad_norm": 12.628445454216601,
|
|
"learning_rate": 2.9127596506725405e-09,
|
|
"loss": 1.5541,
|
|
"mean_token_accuracy": 0.6765264332294464,
|
|
"num_tokens": 242082935.0,
|
|
"step": 3845
|
|
},
|
|
{
|
|
"epoch": 12.58265139116203,
|
|
"grad_norm": 12.663307813547721,
|
|
"learning_rate": 2.9071331673376223e-09,
|
|
"loss": 1.5785,
|
|
"mean_token_accuracy": 0.6735577821731568,
|
|
"num_tokens": 242398564.0,
|
|
"step": 3850
|
|
},
|
|
{
|
|
"epoch": 12.599018003273322,
|
|
"grad_norm": 11.901477804269593,
|
|
"learning_rate": 2.901504564821173e-09,
|
|
"loss": 1.5631,
|
|
"mean_token_accuracy": 0.6759593665599823,
|
|
"num_tokens": 242713296.0,
|
|
"step": 3855
|
|
},
|
|
{
|
|
"epoch": 12.615384615384615,
|
|
"grad_norm": 12.861647771494315,
|
|
"learning_rate": 2.8958738724208073e-09,
|
|
"loss": 1.5661,
|
|
"mean_token_accuracy": 0.6735385596752167,
|
|
"num_tokens": 243027655.0,
|
|
"step": 3860
|
|
},
|
|
{
|
|
"epoch": 12.631751227495908,
|
|
"grad_norm": 12.532695918194063,
|
|
"learning_rate": 2.8902411194450174e-09,
|
|
"loss": 1.5713,
|
|
"mean_token_accuracy": 0.6764239609241486,
|
|
"num_tokens": 243341790.0,
|
|
"step": 3865
|
|
},
|
|
{
|
|
"epoch": 12.6481178396072,
|
|
"grad_norm": 12.658498402394635,
|
|
"learning_rate": 2.884606335213021e-09,
|
|
"loss": 1.5679,
|
|
"mean_token_accuracy": 0.6758080422878265,
|
|
"num_tokens": 243657854.0,
|
|
"step": 3870
|
|
},
|
|
{
|
|
"epoch": 12.664484451718494,
|
|
"grad_norm": 12.54243307666182,
|
|
"learning_rate": 2.8789695490546086e-09,
|
|
"loss": 1.5813,
|
|
"mean_token_accuracy": 0.6715005517005921,
|
|
"num_tokens": 243973655.0,
|
|
"step": 3875
|
|
},
|
|
{
|
|
"epoch": 12.680851063829786,
|
|
"grad_norm": 12.530149231407798,
|
|
"learning_rate": 2.8733307903099926e-09,
|
|
"loss": 1.5675,
|
|
"mean_token_accuracy": 0.675896018743515,
|
|
"num_tokens": 244290378.0,
|
|
"step": 3880
|
|
},
|
|
{
|
|
"epoch": 12.69721767594108,
|
|
"grad_norm": 12.608277259330517,
|
|
"learning_rate": 2.867690088329651e-09,
|
|
"loss": 1.5454,
|
|
"mean_token_accuracy": 0.6792291462421417,
|
|
"num_tokens": 244605616.0,
|
|
"step": 3885
|
|
},
|
|
{
|
|
"epoch": 12.713584288052374,
|
|
"grad_norm": 12.672281948216312,
|
|
"learning_rate": 2.8620474724741764e-09,
|
|
"loss": 1.5589,
|
|
"mean_token_accuracy": 0.6749845445156097,
|
|
"num_tokens": 244920074.0,
|
|
"step": 3890
|
|
},
|
|
{
|
|
"epoch": 12.729950900163667,
|
|
"grad_norm": 13.022006524153223,
|
|
"learning_rate": 2.8564029721141272e-09,
|
|
"loss": 1.5733,
|
|
"mean_token_accuracy": 0.6721878945827484,
|
|
"num_tokens": 245236410.0,
|
|
"step": 3895
|
|
},
|
|
{
|
|
"epoch": 12.74631751227496,
|
|
"grad_norm": 12.367334432965174,
|
|
"learning_rate": 2.850756616629865e-09,
|
|
"loss": 1.5556,
|
|
"mean_token_accuracy": 0.6784782648086548,
|
|
"num_tokens": 245550979.0,
|
|
"step": 3900
|
|
},
|
|
{
|
|
"epoch": 12.762684124386253,
|
|
"grad_norm": 12.58712217252632,
|
|
"learning_rate": 2.8451084354114132e-09,
|
|
"loss": 1.5548,
|
|
"mean_token_accuracy": 0.6767111480236053,
|
|
"num_tokens": 245866265.0,
|
|
"step": 3905
|
|
},
|
|
{
|
|
"epoch": 12.779050736497545,
|
|
"grad_norm": 12.763252688413466,
|
|
"learning_rate": 2.839458457858294e-09,
|
|
"loss": 1.5485,
|
|
"mean_token_accuracy": 0.677623575925827,
|
|
"num_tokens": 246182198.0,
|
|
"step": 3910
|
|
},
|
|
{
|
|
"epoch": 12.795417348608838,
|
|
"grad_norm": 12.300357752454952,
|
|
"learning_rate": 2.8338067133793816e-09,
|
|
"loss": 1.5598,
|
|
"mean_token_accuracy": 0.6755125164985657,
|
|
"num_tokens": 246497923.0,
|
|
"step": 3915
|
|
},
|
|
{
|
|
"epoch": 12.811783960720131,
|
|
"grad_norm": 12.940198469333163,
|
|
"learning_rate": 2.8281532313927477e-09,
|
|
"loss": 1.5444,
|
|
"mean_token_accuracy": 0.6788832068443298,
|
|
"num_tokens": 246812317.0,
|
|
"step": 3920
|
|
},
|
|
{
|
|
"epoch": 12.828150572831424,
|
|
"grad_norm": 12.408410874213551,
|
|
"learning_rate": 2.8224980413255086e-09,
|
|
"loss": 1.5558,
|
|
"mean_token_accuracy": 0.6768961608409881,
|
|
"num_tokens": 247128288.0,
|
|
"step": 3925
|
|
},
|
|
{
|
|
"epoch": 12.844517184942717,
|
|
"grad_norm": 12.506008892788866,
|
|
"learning_rate": 2.8168411726136682e-09,
|
|
"loss": 1.5463,
|
|
"mean_token_accuracy": 0.6788853287696839,
|
|
"num_tokens": 247443898.0,
|
|
"step": 3930
|
|
},
|
|
{
|
|
"epoch": 12.86088379705401,
|
|
"grad_norm": 12.158494287742517,
|
|
"learning_rate": 2.8111826547019715e-09,
|
|
"loss": 1.5577,
|
|
"mean_token_accuracy": 0.6765970945358276,
|
|
"num_tokens": 247757914.0,
|
|
"step": 3935
|
|
},
|
|
{
|
|
"epoch": 12.877250409165303,
|
|
"grad_norm": 12.62478478655889,
|
|
"learning_rate": 2.8055225170437455e-09,
|
|
"loss": 1.5492,
|
|
"mean_token_accuracy": 0.6792675971984863,
|
|
"num_tokens": 248072481.0,
|
|
"step": 3940
|
|
},
|
|
{
|
|
"epoch": 12.893617021276595,
|
|
"grad_norm": 12.75354807025127,
|
|
"learning_rate": 2.7998607891007495e-09,
|
|
"loss": 1.565,
|
|
"mean_token_accuracy": 0.67542844414711,
|
|
"num_tokens": 248387709.0,
|
|
"step": 3945
|
|
},
|
|
{
|
|
"epoch": 12.909983633387888,
|
|
"grad_norm": 13.058935530125755,
|
|
"learning_rate": 2.7941975003430204e-09,
|
|
"loss": 1.5746,
|
|
"mean_token_accuracy": 0.6717760920524597,
|
|
"num_tokens": 248704386.0,
|
|
"step": 3950
|
|
},
|
|
{
|
|
"epoch": 12.926350245499181,
|
|
"grad_norm": 12.297537130228084,
|
|
"learning_rate": 2.7885326802487175e-09,
|
|
"loss": 1.5368,
|
|
"mean_token_accuracy": 0.6805076479911805,
|
|
"num_tokens": 249020305.0,
|
|
"step": 3955
|
|
},
|
|
{
|
|
"epoch": 12.942716857610474,
|
|
"grad_norm": 12.224890011682676,
|
|
"learning_rate": 2.782866358303973e-09,
|
|
"loss": 1.5402,
|
|
"mean_token_accuracy": 0.6809644043445587,
|
|
"num_tokens": 249335996.0,
|
|
"step": 3960
|
|
},
|
|
{
|
|
"epoch": 12.959083469721767,
|
|
"grad_norm": 12.010577456081947,
|
|
"learning_rate": 2.777198564002737e-09,
|
|
"loss": 1.5294,
|
|
"mean_token_accuracy": 0.6835005700588226,
|
|
"num_tokens": 249650759.0,
|
|
"step": 3965
|
|
},
|
|
{
|
|
"epoch": 12.97545008183306,
|
|
"grad_norm": 12.755477098006706,
|
|
"learning_rate": 2.7715293268466204e-09,
|
|
"loss": 1.5377,
|
|
"mean_token_accuracy": 0.6816042780876159,
|
|
"num_tokens": 249965419.0,
|
|
"step": 3970
|
|
},
|
|
{
|
|
"epoch": 12.991816693944354,
|
|
"grad_norm": 12.11533297566282,
|
|
"learning_rate": 2.765858676344747e-09,
|
|
"loss": 1.556,
|
|
"mean_token_accuracy": 0.6773697674274445,
|
|
"num_tokens": 250280974.0,
|
|
"step": 3975
|
|
},
|
|
{
|
|
"epoch": 13.006546644844518,
|
|
"grad_norm": 12.53886715357263,
|
|
"learning_rate": 2.7601866420135955e-09,
|
|
"loss": 1.5426,
|
|
"mean_token_accuracy": 0.6790764596727159,
|
|
"num_tokens": 250541723.0,
|
|
"step": 3980
|
|
},
|
|
{
|
|
"epoch": 13.02291325695581,
|
|
"grad_norm": 12.267173509734528,
|
|
"learning_rate": 2.7545132533768503e-09,
|
|
"loss": 1.5455,
|
|
"mean_token_accuracy": 0.6799408495426178,
|
|
"num_tokens": 250857786.0,
|
|
"step": 3985
|
|
},
|
|
{
|
|
"epoch": 13.039279869067103,
|
|
"grad_norm": 12.659666618100243,
|
|
"learning_rate": 2.7488385399652418e-09,
|
|
"loss": 1.5442,
|
|
"mean_token_accuracy": 0.6778158783912659,
|
|
"num_tokens": 251172605.0,
|
|
"step": 3990
|
|
},
|
|
{
|
|
"epoch": 13.055646481178396,
|
|
"grad_norm": 13.190436807019603,
|
|
"learning_rate": 2.7431625313163973e-09,
|
|
"loss": 1.5529,
|
|
"mean_token_accuracy": 0.6766701757907867,
|
|
"num_tokens": 251489221.0,
|
|
"step": 3995
|
|
},
|
|
{
|
|
"epoch": 13.072013093289689,
|
|
"grad_norm": 11.806954687900037,
|
|
"learning_rate": 2.7374852569746872e-09,
|
|
"loss": 1.5532,
|
|
"mean_token_accuracy": 0.6783233880996704,
|
|
"num_tokens": 251805450.0,
|
|
"step": 4000
|
|
},
|
|
{
|
|
"epoch": 13.088379705400982,
|
|
"grad_norm": 12.442400974599957,
|
|
"learning_rate": 2.7318067464910685e-09,
|
|
"loss": 1.5376,
|
|
"mean_token_accuracy": 0.6806444525718689,
|
|
"num_tokens": 252120975.0,
|
|
"step": 4005
|
|
},
|
|
{
|
|
"epoch": 13.104746317512275,
|
|
"grad_norm": 12.48596610332013,
|
|
"learning_rate": 2.726127029422934e-09,
|
|
"loss": 1.5271,
|
|
"mean_token_accuracy": 0.6825267374515533,
|
|
"num_tokens": 252436301.0,
|
|
"step": 4010
|
|
},
|
|
{
|
|
"epoch": 13.121112929623568,
|
|
"grad_norm": 12.5605303792605,
|
|
"learning_rate": 2.7204461353339542e-09,
|
|
"loss": 1.5284,
|
|
"mean_token_accuracy": 0.6824884533882141,
|
|
"num_tokens": 252752055.0,
|
|
"step": 4015
|
|
},
|
|
{
|
|
"epoch": 13.13747954173486,
|
|
"grad_norm": 12.961259371866541,
|
|
"learning_rate": 2.714764093793929e-09,
|
|
"loss": 1.5503,
|
|
"mean_token_accuracy": 0.6780583560466766,
|
|
"num_tokens": 253069583.0,
|
|
"step": 4020
|
|
},
|
|
{
|
|
"epoch": 13.153846153846153,
|
|
"grad_norm": 12.485200498103994,
|
|
"learning_rate": 2.7090809343786294e-09,
|
|
"loss": 1.5426,
|
|
"mean_token_accuracy": 0.6800074696540832,
|
|
"num_tokens": 253385821.0,
|
|
"step": 4025
|
|
},
|
|
{
|
|
"epoch": 13.170212765957446,
|
|
"grad_norm": 12.47147244014249,
|
|
"learning_rate": 2.703396686669646e-09,
|
|
"loss": 1.5212,
|
|
"mean_token_accuracy": 0.6839386880397796,
|
|
"num_tokens": 253701910.0,
|
|
"step": 4030
|
|
},
|
|
{
|
|
"epoch": 13.186579378068739,
|
|
"grad_norm": 12.52921187779744,
|
|
"learning_rate": 2.6977113802542337e-09,
|
|
"loss": 1.5301,
|
|
"mean_token_accuracy": 0.6811168432235718,
|
|
"num_tokens": 254018498.0,
|
|
"step": 4035
|
|
},
|
|
{
|
|
"epoch": 13.202945990180032,
|
|
"grad_norm": 12.47194604814109,
|
|
"learning_rate": 2.6920250447251564e-09,
|
|
"loss": 1.5287,
|
|
"mean_token_accuracy": 0.6823192477226258,
|
|
"num_tokens": 254335166.0,
|
|
"step": 4040
|
|
},
|
|
{
|
|
"epoch": 13.219312602291327,
|
|
"grad_norm": 13.154414990831478,
|
|
"learning_rate": 2.686337709680538e-09,
|
|
"loss": 1.5417,
|
|
"mean_token_accuracy": 0.6793733775615692,
|
|
"num_tokens": 254648953.0,
|
|
"step": 4045
|
|
},
|
|
{
|
|
"epoch": 13.23567921440262,
|
|
"grad_norm": 12.742574961680194,
|
|
"learning_rate": 2.6806494047237022e-09,
|
|
"loss": 1.5403,
|
|
"mean_token_accuracy": 0.6783177971839904,
|
|
"num_tokens": 254964346.0,
|
|
"step": 4050
|
|
},
|
|
{
|
|
"epoch": 13.252045826513912,
|
|
"grad_norm": 12.024039931655503,
|
|
"learning_rate": 2.6749601594630236e-09,
|
|
"loss": 1.523,
|
|
"mean_token_accuracy": 0.6833492398262024,
|
|
"num_tokens": 255280007.0,
|
|
"step": 4055
|
|
},
|
|
{
|
|
"epoch": 13.268412438625205,
|
|
"grad_norm": 12.35117661325111,
|
|
"learning_rate": 2.669270003511769e-09,
|
|
"loss": 1.5343,
|
|
"mean_token_accuracy": 0.6811910986900329,
|
|
"num_tokens": 255596838.0,
|
|
"step": 4060
|
|
},
|
|
{
|
|
"epoch": 13.284779050736498,
|
|
"grad_norm": 12.549651974040733,
|
|
"learning_rate": 2.663578966487946e-09,
|
|
"loss": 1.5343,
|
|
"mean_token_accuracy": 0.6812186896800995,
|
|
"num_tokens": 255911585.0,
|
|
"step": 4065
|
|
},
|
|
{
|
|
"epoch": 13.30114566284779,
|
|
"grad_norm": 12.89348141219778,
|
|
"learning_rate": 2.65788707801415e-09,
|
|
"loss": 1.5395,
|
|
"mean_token_accuracy": 0.6799389600753785,
|
|
"num_tokens": 256227004.0,
|
|
"step": 4070
|
|
},
|
|
{
|
|
"epoch": 13.317512274959084,
|
|
"grad_norm": 12.66495954354997,
|
|
"learning_rate": 2.652194367717406e-09,
|
|
"loss": 1.5107,
|
|
"mean_token_accuracy": 0.6846579313278198,
|
|
"num_tokens": 256543458.0,
|
|
"step": 4075
|
|
},
|
|
{
|
|
"epoch": 13.333878887070377,
|
|
"grad_norm": 12.621706259288624,
|
|
"learning_rate": 2.6465008652290177e-09,
|
|
"loss": 1.518,
|
|
"mean_token_accuracy": 0.684733635187149,
|
|
"num_tokens": 256859415.0,
|
|
"step": 4080
|
|
},
|
|
{
|
|
"epoch": 13.35024549918167,
|
|
"grad_norm": 12.097297652839174,
|
|
"learning_rate": 2.6408066001844127e-09,
|
|
"loss": 1.5196,
|
|
"mean_token_accuracy": 0.6841577529907227,
|
|
"num_tokens": 257172225.0,
|
|
"step": 4085
|
|
},
|
|
{
|
|
"epoch": 13.366612111292962,
|
|
"grad_norm": 12.508833721919608,
|
|
"learning_rate": 2.6351116022229872e-09,
|
|
"loss": 1.528,
|
|
"mean_token_accuracy": 0.6815730452537536,
|
|
"num_tokens": 257489871.0,
|
|
"step": 4090
|
|
},
|
|
{
|
|
"epoch": 13.382978723404255,
|
|
"grad_norm": 12.15282167610772,
|
|
"learning_rate": 2.6294159009879524e-09,
|
|
"loss": 1.5253,
|
|
"mean_token_accuracy": 0.6824976325035095,
|
|
"num_tokens": 257804998.0,
|
|
"step": 4095
|
|
},
|
|
{
|
|
"epoch": 13.399345335515548,
|
|
"grad_norm": 12.609861031701218,
|
|
"learning_rate": 2.6237195261261803e-09,
|
|
"loss": 1.5336,
|
|
"mean_token_accuracy": 0.6804631054401398,
|
|
"num_tokens": 258121618.0,
|
|
"step": 4100
|
|
},
|
|
{
|
|
"epoch": 13.415711947626841,
|
|
"grad_norm": 12.81485883900022,
|
|
"learning_rate": 2.6180225072880485e-09,
|
|
"loss": 1.5177,
|
|
"mean_token_accuracy": 0.683049613237381,
|
|
"num_tokens": 258438728.0,
|
|
"step": 4105
|
|
},
|
|
{
|
|
"epoch": 13.432078559738134,
|
|
"grad_norm": 12.505318879085046,
|
|
"learning_rate": 2.6123248741272883e-09,
|
|
"loss": 1.5054,
|
|
"mean_token_accuracy": 0.6859242916107178,
|
|
"num_tokens": 258753766.0,
|
|
"step": 4110
|
|
},
|
|
{
|
|
"epoch": 13.448445171849427,
|
|
"grad_norm": 12.599140341885622,
|
|
"learning_rate": 2.606626656300827e-09,
|
|
"loss": 1.5288,
|
|
"mean_token_accuracy": 0.6825321853160858,
|
|
"num_tokens": 259070867.0,
|
|
"step": 4115
|
|
},
|
|
{
|
|
"epoch": 13.46481178396072,
|
|
"grad_norm": 12.59190273859541,
|
|
"learning_rate": 2.600927883468635e-09,
|
|
"loss": 1.5409,
|
|
"mean_token_accuracy": 0.6785263419151306,
|
|
"num_tokens": 259386335.0,
|
|
"step": 4120
|
|
},
|
|
{
|
|
"epoch": 13.481178396072012,
|
|
"grad_norm": 12.959983447834418,
|
|
"learning_rate": 2.595228585293574e-09,
|
|
"loss": 1.5222,
|
|
"mean_token_accuracy": 0.6814795911312104,
|
|
"num_tokens": 259701062.0,
|
|
"step": 4125
|
|
},
|
|
{
|
|
"epoch": 13.497545008183305,
|
|
"grad_norm": 12.246060045870713,
|
|
"learning_rate": 2.589528791441237e-09,
|
|
"loss": 1.5258,
|
|
"mean_token_accuracy": 0.6814365029335022,
|
|
"num_tokens": 260016188.0,
|
|
"step": 4130
|
|
},
|
|
{
|
|
"epoch": 13.5139116202946,
|
|
"grad_norm": 11.762747726590536,
|
|
"learning_rate": 2.5838285315797988e-09,
|
|
"loss": 1.5287,
|
|
"mean_token_accuracy": 0.6819169402122498,
|
|
"num_tokens": 260332549.0,
|
|
"step": 4135
|
|
},
|
|
{
|
|
"epoch": 13.530278232405893,
|
|
"grad_norm": 12.205042415924925,
|
|
"learning_rate": 2.57812783537986e-09,
|
|
"loss": 1.5121,
|
|
"mean_token_accuracy": 0.6839991450309754,
|
|
"num_tokens": 260649111.0,
|
|
"step": 4140
|
|
},
|
|
{
|
|
"epoch": 13.546644844517186,
|
|
"grad_norm": 11.791312121137548,
|
|
"learning_rate": 2.572426732514291e-09,
|
|
"loss": 1.5255,
|
|
"mean_token_accuracy": 0.6823135852813721,
|
|
"num_tokens": 260963017.0,
|
|
"step": 4145
|
|
},
|
|
{
|
|
"epoch": 13.563011456628479,
|
|
"grad_norm": 11.911116010773625,
|
|
"learning_rate": 2.566725252658081e-09,
|
|
"loss": 1.5041,
|
|
"mean_token_accuracy": 0.6869804382324218,
|
|
"num_tokens": 261279118.0,
|
|
"step": 4150
|
|
},
|
|
{
|
|
"epoch": 13.579378068739771,
|
|
"grad_norm": 11.587422849493581,
|
|
"learning_rate": 2.56102342548818e-09,
|
|
"loss": 1.4929,
|
|
"mean_token_accuracy": 0.6880825638771058,
|
|
"num_tokens": 261595332.0,
|
|
"step": 4155
|
|
},
|
|
{
|
|
"epoch": 13.595744680851064,
|
|
"grad_norm": 12.957142991751684,
|
|
"learning_rate": 2.555321280683346e-09,
|
|
"loss": 1.54,
|
|
"mean_token_accuracy": 0.6787398636341095,
|
|
"num_tokens": 261910265.0,
|
|
"step": 4160
|
|
},
|
|
{
|
|
"epoch": 13.612111292962357,
|
|
"grad_norm": 12.257003877444124,
|
|
"learning_rate": 2.549618847923991e-09,
|
|
"loss": 1.5058,
|
|
"mean_token_accuracy": 0.6841898381710052,
|
|
"num_tokens": 262224697.0,
|
|
"step": 4165
|
|
},
|
|
{
|
|
"epoch": 13.62847790507365,
|
|
"grad_norm": 12.452796927706355,
|
|
"learning_rate": 2.543916156892025e-09,
|
|
"loss": 1.5111,
|
|
"mean_token_accuracy": 0.6824253857135772,
|
|
"num_tokens": 262541480.0,
|
|
"step": 4170
|
|
},
|
|
{
|
|
"epoch": 13.644844517184943,
|
|
"grad_norm": 12.030249176136618,
|
|
"learning_rate": 2.5382132372707027e-09,
|
|
"loss": 1.5126,
|
|
"mean_token_accuracy": 0.6844086408615112,
|
|
"num_tokens": 262855801.0,
|
|
"step": 4175
|
|
},
|
|
{
|
|
"epoch": 13.661211129296236,
|
|
"grad_norm": 12.387048565345177,
|
|
"learning_rate": 2.5325101187444694e-09,
|
|
"loss": 1.5159,
|
|
"mean_token_accuracy": 0.6827063441276551,
|
|
"num_tokens": 263170799.0,
|
|
"step": 4180
|
|
},
|
|
{
|
|
"epoch": 13.677577741407529,
|
|
"grad_norm": 12.22309480247876,
|
|
"learning_rate": 2.526806830998804e-09,
|
|
"loss": 1.523,
|
|
"mean_token_accuracy": 0.6828779339790344,
|
|
"num_tokens": 263486969.0,
|
|
"step": 4185
|
|
},
|
|
{
|
|
"epoch": 13.693944353518821,
|
|
"grad_norm": 12.525816516791572,
|
|
"learning_rate": 2.5211034037200675e-09,
|
|
"loss": 1.5194,
|
|
"mean_token_accuracy": 0.6844118356704711,
|
|
"num_tokens": 263801197.0,
|
|
"step": 4190
|
|
},
|
|
{
|
|
"epoch": 13.710310965630114,
|
|
"grad_norm": 12.434982126404309,
|
|
"learning_rate": 2.515399866595347e-09,
|
|
"loss": 1.4834,
|
|
"mean_token_accuracy": 0.6887698948383332,
|
|
"num_tokens": 264117363.0,
|
|
"step": 4195
|
|
},
|
|
{
|
|
"epoch": 13.726677577741407,
|
|
"grad_norm": 12.238598723305001,
|
|
"learning_rate": 2.509696249312301e-09,
|
|
"loss": 1.5066,
|
|
"mean_token_accuracy": 0.685460901260376,
|
|
"num_tokens": 264432646.0,
|
|
"step": 4200
|
|
},
|
|
{
|
|
"epoch": 13.7430441898527,
|
|
"grad_norm": 12.650740657273452,
|
|
"learning_rate": 2.503992581559005e-09,
|
|
"loss": 1.5332,
|
|
"mean_token_accuracy": 0.678538054227829,
|
|
"num_tokens": 264747827.0,
|
|
"step": 4205
|
|
},
|
|
{
|
|
"epoch": 13.759410801963993,
|
|
"grad_norm": 12.099360007266315,
|
|
"learning_rate": 2.4982888930237996e-09,
|
|
"loss": 1.4965,
|
|
"mean_token_accuracy": 0.6864968955516815,
|
|
"num_tokens": 265063292.0,
|
|
"step": 4210
|
|
},
|
|
{
|
|
"epoch": 13.775777414075286,
|
|
"grad_norm": 12.271944931579954,
|
|
"learning_rate": 2.49258521339513e-09,
|
|
"loss": 1.4828,
|
|
"mean_token_accuracy": 0.6900125324726105,
|
|
"num_tokens": 265377285.0,
|
|
"step": 4215
|
|
},
|
|
{
|
|
"epoch": 13.792144026186579,
|
|
"grad_norm": 12.611396153670063,
|
|
"learning_rate": 2.4868815723613977e-09,
|
|
"loss": 1.5133,
|
|
"mean_token_accuracy": 0.6828788638114929,
|
|
"num_tokens": 265692743.0,
|
|
"step": 4220
|
|
},
|
|
{
|
|
"epoch": 13.808510638297872,
|
|
"grad_norm": 12.01276121557013,
|
|
"learning_rate": 2.4811779996108013e-09,
|
|
"loss": 1.4912,
|
|
"mean_token_accuracy": 0.6870843648910523,
|
|
"num_tokens": 266007767.0,
|
|
"step": 4225
|
|
},
|
|
{
|
|
"epoch": 13.824877250409166,
|
|
"grad_norm": 12.19758047419805,
|
|
"learning_rate": 2.475474524831185e-09,
|
|
"loss": 1.5152,
|
|
"mean_token_accuracy": 0.683402705192566,
|
|
"num_tokens": 266324778.0,
|
|
"step": 4230
|
|
},
|
|
{
|
|
"epoch": 13.841243862520459,
|
|
"grad_norm": 12.479958154608948,
|
|
"learning_rate": 2.4697711777098836e-09,
|
|
"loss": 1.5082,
|
|
"mean_token_accuracy": 0.6837309658527374,
|
|
"num_tokens": 266641733.0,
|
|
"step": 4235
|
|
},
|
|
{
|
|
"epoch": 13.857610474631752,
|
|
"grad_norm": 12.400439983424395,
|
|
"learning_rate": 2.464067987933567e-09,
|
|
"loss": 1.4813,
|
|
"mean_token_accuracy": 0.688559752702713,
|
|
"num_tokens": 266958456.0,
|
|
"step": 4240
|
|
},
|
|
{
|
|
"epoch": 13.873977086743045,
|
|
"grad_norm": 12.642592990777057,
|
|
"learning_rate": 2.458364985188085e-09,
|
|
"loss": 1.5132,
|
|
"mean_token_accuracy": 0.6824531733989716,
|
|
"num_tokens": 267273047.0,
|
|
"step": 4245
|
|
},
|
|
{
|
|
"epoch": 13.890343698854338,
|
|
"grad_norm": 12.72320574929076,
|
|
"learning_rate": 2.452662199158316e-09,
|
|
"loss": 1.4868,
|
|
"mean_token_accuracy": 0.6883831202983857,
|
|
"num_tokens": 267588938.0,
|
|
"step": 4250
|
|
},
|
|
{
|
|
"epoch": 13.90671031096563,
|
|
"grad_norm": 12.287005056680044,
|
|
"learning_rate": 2.4469596595280084e-09,
|
|
"loss": 1.509,
|
|
"mean_token_accuracy": 0.6836120009422302,
|
|
"num_tokens": 267905040.0,
|
|
"step": 4255
|
|
},
|
|
{
|
|
"epoch": 13.923076923076923,
|
|
"grad_norm": 12.593872648580946,
|
|
"learning_rate": 2.441257395979629e-09,
|
|
"loss": 1.4969,
|
|
"mean_token_accuracy": 0.6880811274051666,
|
|
"num_tokens": 268218957.0,
|
|
"step": 4260
|
|
},
|
|
{
|
|
"epoch": 13.939443535188216,
|
|
"grad_norm": 12.215400254590408,
|
|
"learning_rate": 2.435555438194208e-09,
|
|
"loss": 1.4883,
|
|
"mean_token_accuracy": 0.6877467036247253,
|
|
"num_tokens": 268534219.0,
|
|
"step": 4265
|
|
},
|
|
{
|
|
"epoch": 13.955810147299509,
|
|
"grad_norm": 11.71218778076416,
|
|
"learning_rate": 2.429853815851183e-09,
|
|
"loss": 1.5188,
|
|
"mean_token_accuracy": 0.6834926068782806,
|
|
"num_tokens": 268849335.0,
|
|
"step": 4270
|
|
},
|
|
{
|
|
"epoch": 13.972176759410802,
|
|
"grad_norm": 11.811582966108432,
|
|
"learning_rate": 2.424152558628246e-09,
|
|
"loss": 1.4734,
|
|
"mean_token_accuracy": 0.6901955604553223,
|
|
"num_tokens": 269164477.0,
|
|
"step": 4275
|
|
},
|
|
{
|
|
"epoch": 13.988543371522095,
|
|
"grad_norm": 12.391517223881769,
|
|
"learning_rate": 2.4184516962011894e-09,
|
|
"loss": 1.4777,
|
|
"mean_token_accuracy": 0.6901420176029205,
|
|
"num_tokens": 269481191.0,
|
|
"step": 4280
|
|
},
|
|
{
|
|
"epoch": 14.003273322422258,
|
|
"grad_norm": 11.85243276024841,
|
|
"learning_rate": 2.412751258243748e-09,
|
|
"loss": 1.4796,
|
|
"mean_token_accuracy": 0.6884656879636977,
|
|
"num_tokens": 269740807.0,
|
|
"step": 4285
|
|
},
|
|
{
|
|
"epoch": 14.01963993453355,
|
|
"grad_norm": 12.127537335092095,
|
|
"learning_rate": 2.4070512744274503e-09,
|
|
"loss": 1.4912,
|
|
"mean_token_accuracy": 0.6857654273509979,
|
|
"num_tokens": 270056263.0,
|
|
"step": 4290
|
|
},
|
|
{
|
|
"epoch": 14.036006546644845,
|
|
"grad_norm": 13.025005489656099,
|
|
"learning_rate": 2.4013517744214595e-09,
|
|
"loss": 1.5247,
|
|
"mean_token_accuracy": 0.6816625893115997,
|
|
"num_tokens": 270371668.0,
|
|
"step": 4295
|
|
},
|
|
{
|
|
"epoch": 14.052373158756138,
|
|
"grad_norm": 12.219489354645315,
|
|
"learning_rate": 2.3956527878924202e-09,
|
|
"loss": 1.5032,
|
|
"mean_token_accuracy": 0.6837205052375793,
|
|
"num_tokens": 270686642.0,
|
|
"step": 4300
|
|
},
|
|
{
|
|
"epoch": 14.068739770867431,
|
|
"grad_norm": 12.456845642626293,
|
|
"learning_rate": 2.3899543445043044e-09,
|
|
"loss": 1.4786,
|
|
"mean_token_accuracy": 0.6902859628200531,
|
|
"num_tokens": 271001819.0,
|
|
"step": 4305
|
|
},
|
|
{
|
|
"epoch": 14.085106382978724,
|
|
"grad_norm": 12.28464819163932,
|
|
"learning_rate": 2.3842564739182586e-09,
|
|
"loss": 1.496,
|
|
"mean_token_accuracy": 0.6836179614067077,
|
|
"num_tokens": 271317420.0,
|
|
"step": 4310
|
|
},
|
|
{
|
|
"epoch": 14.101472995090017,
|
|
"grad_norm": 12.790520935026995,
|
|
"learning_rate": 2.378559205792445e-09,
|
|
"loss": 1.5081,
|
|
"mean_token_accuracy": 0.6849936664104461,
|
|
"num_tokens": 271632759.0,
|
|
"step": 4315
|
|
},
|
|
{
|
|
"epoch": 14.11783960720131,
|
|
"grad_norm": 12.595762861453693,
|
|
"learning_rate": 2.372862569781893e-09,
|
|
"loss": 1.4957,
|
|
"mean_token_accuracy": 0.6861394762992858,
|
|
"num_tokens": 271947230.0,
|
|
"step": 4320
|
|
},
|
|
{
|
|
"epoch": 14.134206219312603,
|
|
"grad_norm": 12.571761249252841,
|
|
"learning_rate": 2.3671665955383383e-09,
|
|
"loss": 1.5141,
|
|
"mean_token_accuracy": 0.6836262583732605,
|
|
"num_tokens": 272263460.0,
|
|
"step": 4325
|
|
},
|
|
{
|
|
"epoch": 14.150572831423895,
|
|
"grad_norm": 12.156579183616737,
|
|
"learning_rate": 2.3614713127100753e-09,
|
|
"loss": 1.4833,
|
|
"mean_token_accuracy": 0.687548840045929,
|
|
"num_tokens": 272580508.0,
|
|
"step": 4330
|
|
},
|
|
{
|
|
"epoch": 14.166939443535188,
|
|
"grad_norm": 12.06870266875961,
|
|
"learning_rate": 2.3557767509417978e-09,
|
|
"loss": 1.4899,
|
|
"mean_token_accuracy": 0.6882768452167511,
|
|
"num_tokens": 272895912.0,
|
|
"step": 4335
|
|
},
|
|
{
|
|
"epoch": 14.183306055646481,
|
|
"grad_norm": 11.801476081580352,
|
|
"learning_rate": 2.3500829398744456e-09,
|
|
"loss": 1.4797,
|
|
"mean_token_accuracy": 0.6898080468177795,
|
|
"num_tokens": 273209887.0,
|
|
"step": 4340
|
|
},
|
|
{
|
|
"epoch": 14.199672667757774,
|
|
"grad_norm": 13.11999339098903,
|
|
"learning_rate": 2.3443899091450532e-09,
|
|
"loss": 1.4996,
|
|
"mean_token_accuracy": 0.6869996070861817,
|
|
"num_tokens": 273525314.0,
|
|
"step": 4345
|
|
},
|
|
{
|
|
"epoch": 14.216039279869067,
|
|
"grad_norm": 12.88555178536362,
|
|
"learning_rate": 2.3386976883865917e-09,
|
|
"loss": 1.4877,
|
|
"mean_token_accuracy": 0.6876905500888825,
|
|
"num_tokens": 273841131.0,
|
|
"step": 4350
|
|
},
|
|
{
|
|
"epoch": 14.23240589198036,
|
|
"grad_norm": 12.61192381171216,
|
|
"learning_rate": 2.333006307227817e-09,
|
|
"loss": 1.4876,
|
|
"mean_token_accuracy": 0.6866623759269714,
|
|
"num_tokens": 274156716.0,
|
|
"step": 4355
|
|
},
|
|
{
|
|
"epoch": 14.248772504091653,
|
|
"grad_norm": 12.102832652830054,
|
|
"learning_rate": 2.3273157952931137e-09,
|
|
"loss": 1.4881,
|
|
"mean_token_accuracy": 0.6889342963695526,
|
|
"num_tokens": 274470358.0,
|
|
"step": 4360
|
|
},
|
|
{
|
|
"epoch": 14.265139116202946,
|
|
"grad_norm": 12.137000267042431,
|
|
"learning_rate": 2.321626182202343e-09,
|
|
"loss": 1.4712,
|
|
"mean_token_accuracy": 0.6922279059886932,
|
|
"num_tokens": 274786272.0,
|
|
"step": 4365
|
|
},
|
|
{
|
|
"epoch": 14.281505728314238,
|
|
"grad_norm": 12.497262980217268,
|
|
"learning_rate": 2.315937497570688e-09,
|
|
"loss": 1.5073,
|
|
"mean_token_accuracy": 0.685647439956665,
|
|
"num_tokens": 275100575.0,
|
|
"step": 4370
|
|
},
|
|
{
|
|
"epoch": 14.297872340425531,
|
|
"grad_norm": 13.06102305602527,
|
|
"learning_rate": 2.3102497710084977e-09,
|
|
"loss": 1.5056,
|
|
"mean_token_accuracy": 0.6866560220718384,
|
|
"num_tokens": 275415316.0,
|
|
"step": 4375
|
|
},
|
|
{
|
|
"epoch": 14.314238952536824,
|
|
"grad_norm": 12.410306438367476,
|
|
"learning_rate": 2.304563032121135e-09,
|
|
"loss": 1.464,
|
|
"mean_token_accuracy": 0.691516500711441,
|
|
"num_tokens": 275731387.0,
|
|
"step": 4380
|
|
},
|
|
{
|
|
"epoch": 14.330605564648117,
|
|
"grad_norm": 12.57267060527453,
|
|
"learning_rate": 2.2988773105088208e-09,
|
|
"loss": 1.4798,
|
|
"mean_token_accuracy": 0.6907178342342377,
|
|
"num_tokens": 276048294.0,
|
|
"step": 4385
|
|
},
|
|
{
|
|
"epoch": 14.346972176759412,
|
|
"grad_norm": 12.71377949896571,
|
|
"learning_rate": 2.2931926357664828e-09,
|
|
"loss": 1.4918,
|
|
"mean_token_accuracy": 0.6845012664794922,
|
|
"num_tokens": 276364776.0,
|
|
"step": 4390
|
|
},
|
|
{
|
|
"epoch": 14.363338788870704,
|
|
"grad_norm": 11.806182237450276,
|
|
"learning_rate": 2.2875090374835995e-09,
|
|
"loss": 1.471,
|
|
"mean_token_accuracy": 0.6929399073123932,
|
|
"num_tokens": 276680814.0,
|
|
"step": 4395
|
|
},
|
|
{
|
|
"epoch": 14.379705400981997,
|
|
"grad_norm": 12.939282258504374,
|
|
"learning_rate": 2.281826545244042e-09,
|
|
"loss": 1.4964,
|
|
"mean_token_accuracy": 0.6841404914855957,
|
|
"num_tokens": 276997538.0,
|
|
"step": 4400
|
|
},
|
|
{
|
|
"epoch": 14.39607201309329,
|
|
"grad_norm": 12.87439071641723,
|
|
"learning_rate": 2.2761451886259303e-09,
|
|
"loss": 1.4989,
|
|
"mean_token_accuracy": 0.6866525292396546,
|
|
"num_tokens": 277313452.0,
|
|
"step": 4405
|
|
},
|
|
{
|
|
"epoch": 14.412438625204583,
|
|
"grad_norm": 12.779019358899058,
|
|
"learning_rate": 2.27046499720147e-09,
|
|
"loss": 1.4862,
|
|
"mean_token_accuracy": 0.6897963464260102,
|
|
"num_tokens": 277629752.0,
|
|
"step": 4410
|
|
},
|
|
{
|
|
"epoch": 14.428805237315876,
|
|
"grad_norm": 12.611209638210697,
|
|
"learning_rate": 2.2647860005368025e-09,
|
|
"loss": 1.4835,
|
|
"mean_token_accuracy": 0.6918173551559448,
|
|
"num_tokens": 277946687.0,
|
|
"step": 4415
|
|
},
|
|
{
|
|
"epoch": 14.445171849427169,
|
|
"grad_norm": 12.201971543911935,
|
|
"learning_rate": 2.259108228191851e-09,
|
|
"loss": 1.4688,
|
|
"mean_token_accuracy": 0.6911928713321686,
|
|
"num_tokens": 278264205.0,
|
|
"step": 4420
|
|
},
|
|
{
|
|
"epoch": 14.461538461538462,
|
|
"grad_norm": 12.967808337071494,
|
|
"learning_rate": 2.2534317097201633e-09,
|
|
"loss": 1.4748,
|
|
"mean_token_accuracy": 0.6898449957370758,
|
|
"num_tokens": 278578719.0,
|
|
"step": 4425
|
|
},
|
|
{
|
|
"epoch": 14.477905073649755,
|
|
"grad_norm": 12.69460481544349,
|
|
"learning_rate": 2.2477564746687644e-09,
|
|
"loss": 1.4825,
|
|
"mean_token_accuracy": 0.6889922320842743,
|
|
"num_tokens": 278895508.0,
|
|
"step": 4430
|
|
},
|
|
{
|
|
"epoch": 14.494271685761047,
|
|
"grad_norm": 12.038321940990556,
|
|
"learning_rate": 2.242082552577996e-09,
|
|
"loss": 1.4809,
|
|
"mean_token_accuracy": 0.6891875326633453,
|
|
"num_tokens": 279210632.0,
|
|
"step": 4435
|
|
},
|
|
{
|
|
"epoch": 14.51063829787234,
|
|
"grad_norm": 12.647305286438627,
|
|
"learning_rate": 2.2364099729813668e-09,
|
|
"loss": 1.4796,
|
|
"mean_token_accuracy": 0.6954532265663147,
|
|
"num_tokens": 279526403.0,
|
|
"step": 4440
|
|
},
|
|
{
|
|
"epoch": 14.527004909983633,
|
|
"grad_norm": 12.055013468342107,
|
|
"learning_rate": 2.2307387654053978e-09,
|
|
"loss": 1.4674,
|
|
"mean_token_accuracy": 0.6918425500392914,
|
|
"num_tokens": 279842256.0,
|
|
"step": 4445
|
|
},
|
|
{
|
|
"epoch": 14.543371522094926,
|
|
"grad_norm": 12.690408025111353,
|
|
"learning_rate": 2.2250689593694696e-09,
|
|
"loss": 1.5005,
|
|
"mean_token_accuracy": 0.6831823945045471,
|
|
"num_tokens": 280157536.0,
|
|
"step": 4450
|
|
},
|
|
{
|
|
"epoch": 14.559738134206219,
|
|
"grad_norm": 12.193205510406495,
|
|
"learning_rate": 2.2194005843856633e-09,
|
|
"loss": 1.4756,
|
|
"mean_token_accuracy": 0.6923376679420471,
|
|
"num_tokens": 280474231.0,
|
|
"step": 4455
|
|
},
|
|
{
|
|
"epoch": 14.576104746317512,
|
|
"grad_norm": 12.09933217624297,
|
|
"learning_rate": 2.2137336699586157e-09,
|
|
"loss": 1.46,
|
|
"mean_token_accuracy": 0.6976064741611481,
|
|
"num_tokens": 280789726.0,
|
|
"step": 4460
|
|
},
|
|
{
|
|
"epoch": 14.592471358428805,
|
|
"grad_norm": 12.577292347337101,
|
|
"learning_rate": 2.2080682455853595e-09,
|
|
"loss": 1.474,
|
|
"mean_token_accuracy": 0.693066680431366,
|
|
"num_tokens": 281106145.0,
|
|
"step": 4465
|
|
},
|
|
{
|
|
"epoch": 14.608837970540097,
|
|
"grad_norm": 11.937726197090774,
|
|
"learning_rate": 2.2024043407551717e-09,
|
|
"loss": 1.4704,
|
|
"mean_token_accuracy": 0.6958605766296386,
|
|
"num_tokens": 281421354.0,
|
|
"step": 4470
|
|
},
|
|
{
|
|
"epoch": 14.62520458265139,
|
|
"grad_norm": 12.321027290389669,
|
|
"learning_rate": 2.196741984949419e-09,
|
|
"loss": 1.4515,
|
|
"mean_token_accuracy": 0.6971550405025482,
|
|
"num_tokens": 281735612.0,
|
|
"step": 4475
|
|
},
|
|
{
|
|
"epoch": 14.641571194762683,
|
|
"grad_norm": 12.440597736777564,
|
|
"learning_rate": 2.1910812076414075e-09,
|
|
"loss": 1.4636,
|
|
"mean_token_accuracy": 0.6930567026138306,
|
|
"num_tokens": 282050979.0,
|
|
"step": 4480
|
|
},
|
|
{
|
|
"epoch": 14.657937806873978,
|
|
"grad_norm": 12.248636275112409,
|
|
"learning_rate": 2.185422038296224e-09,
|
|
"loss": 1.4589,
|
|
"mean_token_accuracy": 0.6959433734416962,
|
|
"num_tokens": 282366764.0,
|
|
"step": 4485
|
|
},
|
|
{
|
|
"epoch": 14.67430441898527,
|
|
"grad_norm": 12.90372128226122,
|
|
"learning_rate": 2.1797645063705874e-09,
|
|
"loss": 1.4877,
|
|
"mean_token_accuracy": 0.6922920167446136,
|
|
"num_tokens": 282681458.0,
|
|
"step": 4490
|
|
},
|
|
{
|
|
"epoch": 14.690671031096564,
|
|
"grad_norm": 11.926305941030886,
|
|
"learning_rate": 2.174108641312694e-09,
|
|
"loss": 1.4507,
|
|
"mean_token_accuracy": 0.6994022250175476,
|
|
"num_tokens": 282998330.0,
|
|
"step": 4495
|
|
},
|
|
{
|
|
"epoch": 14.707037643207856,
|
|
"grad_norm": 12.889652279538323,
|
|
"learning_rate": 2.1684544725620626e-09,
|
|
"loss": 1.4876,
|
|
"mean_token_accuracy": 0.6908430218696594,
|
|
"num_tokens": 283314245.0,
|
|
"step": 4500
|
|
},
|
|
{
|
|
"epoch": 14.72340425531915,
|
|
"grad_norm": 12.216650977375846,
|
|
"learning_rate": 2.1628020295493844e-09,
|
|
"loss": 1.4446,
|
|
"mean_token_accuracy": 0.6986382126808166,
|
|
"num_tokens": 283631182.0,
|
|
"step": 4505
|
|
},
|
|
{
|
|
"epoch": 14.739770867430442,
|
|
"grad_norm": 12.24979629911043,
|
|
"learning_rate": 2.1571513416963645e-09,
|
|
"loss": 1.4586,
|
|
"mean_token_accuracy": 0.6951400220394135,
|
|
"num_tokens": 283947490.0,
|
|
"step": 4510
|
|
},
|
|
{
|
|
"epoch": 14.756137479541735,
|
|
"grad_norm": 12.339552640099162,
|
|
"learning_rate": 2.1515024384155752e-09,
|
|
"loss": 1.4631,
|
|
"mean_token_accuracy": 0.698326563835144,
|
|
"num_tokens": 284261505.0,
|
|
"step": 4515
|
|
},
|
|
{
|
|
"epoch": 14.772504091653028,
|
|
"grad_norm": 12.757398483091528,
|
|
"learning_rate": 2.145855349110299e-09,
|
|
"loss": 1.4715,
|
|
"mean_token_accuracy": 0.6933198750019074,
|
|
"num_tokens": 284576696.0,
|
|
"step": 4520
|
|
},
|
|
{
|
|
"epoch": 14.78887070376432,
|
|
"grad_norm": 12.728594598720028,
|
|
"learning_rate": 2.1402101031743764e-09,
|
|
"loss": 1.4623,
|
|
"mean_token_accuracy": 0.6925253510475159,
|
|
"num_tokens": 284892532.0,
|
|
"step": 4525
|
|
},
|
|
{
|
|
"epoch": 14.805237315875614,
|
|
"grad_norm": 12.400149509699013,
|
|
"learning_rate": 2.134566729992053e-09,
|
|
"loss": 1.4514,
|
|
"mean_token_accuracy": 0.6978048384189606,
|
|
"num_tokens": 285208161.0,
|
|
"step": 4530
|
|
},
|
|
{
|
|
"epoch": 14.821603927986907,
|
|
"grad_norm": 12.30488766418087,
|
|
"learning_rate": 2.128925258937826e-09,
|
|
"loss": 1.4797,
|
|
"mean_token_accuracy": 0.6912436008453369,
|
|
"num_tokens": 285525048.0,
|
|
"step": 4535
|
|
},
|
|
{
|
|
"epoch": 14.8379705400982,
|
|
"grad_norm": 12.357791717436664,
|
|
"learning_rate": 2.123285719376292e-09,
|
|
"loss": 1.4742,
|
|
"mean_token_accuracy": 0.695197343826294,
|
|
"num_tokens": 285840446.0,
|
|
"step": 4540
|
|
},
|
|
{
|
|
"epoch": 14.854337152209492,
|
|
"grad_norm": 12.342749508582655,
|
|
"learning_rate": 2.1176481406619947e-09,
|
|
"loss": 1.4587,
|
|
"mean_token_accuracy": 0.6979107022285461,
|
|
"num_tokens": 286156416.0,
|
|
"step": 4545
|
|
},
|
|
{
|
|
"epoch": 14.870703764320785,
|
|
"grad_norm": 12.728714363435309,
|
|
"learning_rate": 2.11201255213927e-09,
|
|
"loss": 1.4589,
|
|
"mean_token_accuracy": 0.6975548446178437,
|
|
"num_tokens": 286472721.0,
|
|
"step": 4550
|
|
},
|
|
{
|
|
"epoch": 14.887070376432078,
|
|
"grad_norm": 12.96147876703625,
|
|
"learning_rate": 2.1063789831420955e-09,
|
|
"loss": 1.4911,
|
|
"mean_token_accuracy": 0.6897732436656951,
|
|
"num_tokens": 286789509.0,
|
|
"step": 4555
|
|
},
|
|
{
|
|
"epoch": 14.90343698854337,
|
|
"grad_norm": 12.619512227634361,
|
|
"learning_rate": 2.1007474629939365e-09,
|
|
"loss": 1.4643,
|
|
"mean_token_accuracy": 0.6959713280200959,
|
|
"num_tokens": 287102963.0,
|
|
"step": 4560
|
|
},
|
|
{
|
|
"epoch": 14.919803600654664,
|
|
"grad_norm": 11.942604179577812,
|
|
"learning_rate": 2.0951180210075957e-09,
|
|
"loss": 1.4478,
|
|
"mean_token_accuracy": 0.6993954658508301,
|
|
"num_tokens": 287418442.0,
|
|
"step": 4565
|
|
},
|
|
{
|
|
"epoch": 14.936170212765958,
|
|
"grad_norm": 12.625415378283076,
|
|
"learning_rate": 2.089490686485054e-09,
|
|
"loss": 1.454,
|
|
"mean_token_accuracy": 0.6950019478797913,
|
|
"num_tokens": 287733944.0,
|
|
"step": 4570
|
|
},
|
|
{
|
|
"epoch": 14.952536824877251,
|
|
"grad_norm": 13.1432068644648,
|
|
"learning_rate": 2.0838654887173267e-09,
|
|
"loss": 1.476,
|
|
"mean_token_accuracy": 0.6924629271030426,
|
|
"num_tokens": 288049250.0,
|
|
"step": 4575
|
|
},
|
|
{
|
|
"epoch": 14.968903436988544,
|
|
"grad_norm": 12.757524167271344,
|
|
"learning_rate": 2.0782424569843065e-09,
|
|
"loss": 1.4576,
|
|
"mean_token_accuracy": 0.6961781263351441,
|
|
"num_tokens": 288365769.0,
|
|
"step": 4580
|
|
},
|
|
{
|
|
"epoch": 14.985270049099837,
|
|
"grad_norm": 12.63342489493526,
|
|
"learning_rate": 2.07262162055461e-09,
|
|
"loss": 1.4477,
|
|
"mean_token_accuracy": 0.6964144468307495,
|
|
"num_tokens": 288679654.0,
|
|
"step": 4585
|
|
},
|
|
{
|
|
"epoch": 15.0,
|
|
"grad_norm": 11.887885024431077,
|
|
"learning_rate": 2.0670030086854292e-09,
|
|
"loss": 1.4382,
|
|
"mean_token_accuracy": 0.6943457788891263,
|
|
"num_tokens": 288940517.0,
|
|
"step": 4590
|
|
},
|
|
{
|
|
"epoch": 15.016366612111293,
|
|
"grad_norm": 12.537734466637337,
|
|
"learning_rate": 2.061386650622375e-09,
|
|
"loss": 1.4629,
|
|
"mean_token_accuracy": 0.6950094759464264,
|
|
"num_tokens": 289255149.0,
|
|
"step": 4595
|
|
},
|
|
{
|
|
"epoch": 15.032733224222586,
|
|
"grad_norm": 13.189978916684646,
|
|
"learning_rate": 2.0557725755993286e-09,
|
|
"loss": 1.4733,
|
|
"mean_token_accuracy": 0.6929343402385711,
|
|
"num_tokens": 289569537.0,
|
|
"step": 4600
|
|
},
|
|
{
|
|
"epoch": 15.049099836333879,
|
|
"grad_norm": 12.160429562262673,
|
|
"learning_rate": 2.0501608128382854e-09,
|
|
"loss": 1.4469,
|
|
"mean_token_accuracy": 0.69942986369133,
|
|
"num_tokens": 289884661.0,
|
|
"step": 4605
|
|
},
|
|
{
|
|
"epoch": 15.065466448445171,
|
|
"grad_norm": 12.820755065669108,
|
|
"learning_rate": 2.0445513915492077e-09,
|
|
"loss": 1.458,
|
|
"mean_token_accuracy": 0.7008291482925415,
|
|
"num_tokens": 290201349.0,
|
|
"step": 4610
|
|
},
|
|
{
|
|
"epoch": 15.081833060556464,
|
|
"grad_norm": 12.469195158529283,
|
|
"learning_rate": 2.038944340929868e-09,
|
|
"loss": 1.4598,
|
|
"mean_token_accuracy": 0.695182865858078,
|
|
"num_tokens": 290517988.0,
|
|
"step": 4615
|
|
},
|
|
{
|
|
"epoch": 15.098199672667757,
|
|
"grad_norm": 12.539429180588215,
|
|
"learning_rate": 2.033339690165702e-09,
|
|
"loss": 1.4582,
|
|
"mean_token_accuracy": 0.6969483494758606,
|
|
"num_tokens": 290834097.0,
|
|
"step": 4620
|
|
},
|
|
{
|
|
"epoch": 15.11456628477905,
|
|
"grad_norm": 12.347141898709026,
|
|
"learning_rate": 2.0277374684296497e-09,
|
|
"loss": 1.4446,
|
|
"mean_token_accuracy": 0.698224401473999,
|
|
"num_tokens": 291148346.0,
|
|
"step": 4625
|
|
},
|
|
{
|
|
"epoch": 15.130932896890343,
|
|
"grad_norm": 12.477486605316148,
|
|
"learning_rate": 2.0221377048820108e-09,
|
|
"loss": 1.4726,
|
|
"mean_token_accuracy": 0.6941590189933777,
|
|
"num_tokens": 291460917.0,
|
|
"step": 4630
|
|
},
|
|
{
|
|
"epoch": 15.147299509001636,
|
|
"grad_norm": 12.597641418413549,
|
|
"learning_rate": 2.016540428670289e-09,
|
|
"loss": 1.4511,
|
|
"mean_token_accuracy": 0.6982307612895966,
|
|
"num_tokens": 291777004.0,
|
|
"step": 4635
|
|
},
|
|
{
|
|
"epoch": 15.16366612111293,
|
|
"grad_norm": 12.259001662920467,
|
|
"learning_rate": 2.0109456689290413e-09,
|
|
"loss": 1.4512,
|
|
"mean_token_accuracy": 0.6984145939350128,
|
|
"num_tokens": 292093657.0,
|
|
"step": 4640
|
|
},
|
|
{
|
|
"epoch": 15.180032733224223,
|
|
"grad_norm": 12.297638018990243,
|
|
"learning_rate": 2.0053534547797256e-09,
|
|
"loss": 1.4641,
|
|
"mean_token_accuracy": 0.693070936203003,
|
|
"num_tokens": 292409436.0,
|
|
"step": 4645
|
|
},
|
|
{
|
|
"epoch": 15.196399345335516,
|
|
"grad_norm": 12.425519490074832,
|
|
"learning_rate": 1.99976381533055e-09,
|
|
"loss": 1.4527,
|
|
"mean_token_accuracy": 0.6970273613929748,
|
|
"num_tokens": 292725817.0,
|
|
"step": 4650
|
|
},
|
|
{
|
|
"epoch": 15.212765957446809,
|
|
"grad_norm": 12.572790188395173,
|
|
"learning_rate": 1.994176779676321e-09,
|
|
"loss": 1.4643,
|
|
"mean_token_accuracy": 0.6970184624195099,
|
|
"num_tokens": 293041899.0,
|
|
"step": 4655
|
|
},
|
|
{
|
|
"epoch": 15.229132569558102,
|
|
"grad_norm": 12.559415461824194,
|
|
"learning_rate": 1.988592376898292e-09,
|
|
"loss": 1.4544,
|
|
"mean_token_accuracy": 0.6966762185096741,
|
|
"num_tokens": 293357735.0,
|
|
"step": 4660
|
|
},
|
|
{
|
|
"epoch": 15.245499181669395,
|
|
"grad_norm": 12.840229816853638,
|
|
"learning_rate": 1.9830106360640117e-09,
|
|
"loss": 1.4607,
|
|
"mean_token_accuracy": 0.6955319404602051,
|
|
"num_tokens": 293673395.0,
|
|
"step": 4665
|
|
},
|
|
{
|
|
"epoch": 15.261865793780688,
|
|
"grad_norm": 12.921981395383156,
|
|
"learning_rate": 1.977431586227173e-09,
|
|
"loss": 1.4485,
|
|
"mean_token_accuracy": 0.7030904352664947,
|
|
"num_tokens": 293988719.0,
|
|
"step": 4670
|
|
},
|
|
{
|
|
"epoch": 15.27823240589198,
|
|
"grad_norm": 12.686769441565513,
|
|
"learning_rate": 1.9718552564274626e-09,
|
|
"loss": 1.4536,
|
|
"mean_token_accuracy": 0.6988371670246124,
|
|
"num_tokens": 294305177.0,
|
|
"step": 4675
|
|
},
|
|
{
|
|
"epoch": 15.294599018003273,
|
|
"grad_norm": 12.317038010818157,
|
|
"learning_rate": 1.9662816756904084e-09,
|
|
"loss": 1.4345,
|
|
"mean_token_accuracy": 0.7039202451705933,
|
|
"num_tokens": 294621539.0,
|
|
"step": 4680
|
|
},
|
|
{
|
|
"epoch": 15.310965630114566,
|
|
"grad_norm": 12.794939081964158,
|
|
"learning_rate": 1.960710873027228e-09,
|
|
"loss": 1.4501,
|
|
"mean_token_accuracy": 0.6983423411846161,
|
|
"num_tokens": 294936167.0,
|
|
"step": 4685
|
|
},
|
|
{
|
|
"epoch": 15.327332242225859,
|
|
"grad_norm": 11.698580162988128,
|
|
"learning_rate": 1.955142877434681e-09,
|
|
"loss": 1.452,
|
|
"mean_token_accuracy": 0.700901734828949,
|
|
"num_tokens": 295252190.0,
|
|
"step": 4690
|
|
},
|
|
{
|
|
"epoch": 15.343698854337152,
|
|
"grad_norm": 12.556785946500368,
|
|
"learning_rate": 1.949577717894914e-09,
|
|
"loss": 1.4536,
|
|
"mean_token_accuracy": 0.7017082333564758,
|
|
"num_tokens": 295567293.0,
|
|
"step": 4695
|
|
},
|
|
{
|
|
"epoch": 15.360065466448445,
|
|
"grad_norm": 13.21370608963778,
|
|
"learning_rate": 1.9440154233753125e-09,
|
|
"loss": 1.4282,
|
|
"mean_token_accuracy": 0.7037009358406067,
|
|
"num_tokens": 295882469.0,
|
|
"step": 4700
|
|
},
|
|
{
|
|
"epoch": 15.376432078559738,
|
|
"grad_norm": 12.822389095916504,
|
|
"learning_rate": 1.9384560228283493e-09,
|
|
"loss": 1.4396,
|
|
"mean_token_accuracy": 0.7042408645153045,
|
|
"num_tokens": 296198439.0,
|
|
"step": 4705
|
|
},
|
|
{
|
|
"epoch": 15.39279869067103,
|
|
"grad_norm": 12.68294166232564,
|
|
"learning_rate": 1.932899545191433e-09,
|
|
"loss": 1.4679,
|
|
"mean_token_accuracy": 0.6939967513084412,
|
|
"num_tokens": 296514887.0,
|
|
"step": 4710
|
|
},
|
|
{
|
|
"epoch": 15.409165302782323,
|
|
"grad_norm": 12.8287458513874,
|
|
"learning_rate": 1.9273460193867585e-09,
|
|
"loss": 1.4523,
|
|
"mean_token_accuracy": 0.6993020355701447,
|
|
"num_tokens": 296831104.0,
|
|
"step": 4715
|
|
},
|
|
{
|
|
"epoch": 15.425531914893616,
|
|
"grad_norm": 13.046240796523204,
|
|
"learning_rate": 1.921795474321156e-09,
|
|
"loss": 1.4629,
|
|
"mean_token_accuracy": 0.6947001516819,
|
|
"num_tokens": 297145680.0,
|
|
"step": 4720
|
|
},
|
|
{
|
|
"epoch": 15.44189852700491,
|
|
"grad_norm": 13.02764337966389,
|
|
"learning_rate": 1.9162479388859405e-09,
|
|
"loss": 1.4428,
|
|
"mean_token_accuracy": 0.6996423721313476,
|
|
"num_tokens": 297462302.0,
|
|
"step": 4725
|
|
},
|
|
{
|
|
"epoch": 15.458265139116204,
|
|
"grad_norm": 12.706079510070456,
|
|
"learning_rate": 1.9107034419567616e-09,
|
|
"loss": 1.4356,
|
|
"mean_token_accuracy": 0.6992950081825257,
|
|
"num_tokens": 297778557.0,
|
|
"step": 4730
|
|
},
|
|
{
|
|
"epoch": 15.474631751227497,
|
|
"grad_norm": 12.368405600391505,
|
|
"learning_rate": 1.905162012393454e-09,
|
|
"loss": 1.4301,
|
|
"mean_token_accuracy": 0.7041154444217682,
|
|
"num_tokens": 298094510.0,
|
|
"step": 4735
|
|
},
|
|
{
|
|
"epoch": 15.49099836333879,
|
|
"grad_norm": 12.132688423350789,
|
|
"learning_rate": 1.8996236790398827e-09,
|
|
"loss": 1.4461,
|
|
"mean_token_accuracy": 0.6984711229801178,
|
|
"num_tokens": 298408407.0,
|
|
"step": 4740
|
|
},
|
|
{
|
|
"epoch": 15.507364975450082,
|
|
"grad_norm": 12.584146973765089,
|
|
"learning_rate": 1.894088470723801e-09,
|
|
"loss": 1.4434,
|
|
"mean_token_accuracy": 0.7076845765113831,
|
|
"num_tokens": 298723960.0,
|
|
"step": 4745
|
|
},
|
|
{
|
|
"epoch": 15.523731587561375,
|
|
"grad_norm": 12.461445555120452,
|
|
"learning_rate": 1.8885564162566935e-09,
|
|
"loss": 1.4504,
|
|
"mean_token_accuracy": 0.7017831683158875,
|
|
"num_tokens": 299038837.0,
|
|
"step": 4750
|
|
},
|
|
{
|
|
"epoch": 15.540098199672668,
|
|
"grad_norm": 12.049024412508352,
|
|
"learning_rate": 1.8830275444336294e-09,
|
|
"loss": 1.4351,
|
|
"mean_token_accuracy": 0.7035296440124512,
|
|
"num_tokens": 299353453.0,
|
|
"step": 4755
|
|
},
|
|
{
|
|
"epoch": 15.556464811783961,
|
|
"grad_norm": 12.489061092870097,
|
|
"learning_rate": 1.877501884033112e-09,
|
|
"loss": 1.4631,
|
|
"mean_token_accuracy": 0.7008910655975342,
|
|
"num_tokens": 299668337.0,
|
|
"step": 4760
|
|
},
|
|
{
|
|
"epoch": 15.572831423895254,
|
|
"grad_norm": 12.754571106004892,
|
|
"learning_rate": 1.871979463816928e-09,
|
|
"loss": 1.4403,
|
|
"mean_token_accuracy": 0.7025842666625977,
|
|
"num_tokens": 299984450.0,
|
|
"step": 4765
|
|
},
|
|
{
|
|
"epoch": 15.589198036006547,
|
|
"grad_norm": 12.501616382655923,
|
|
"learning_rate": 1.866460312529999e-09,
|
|
"loss": 1.4298,
|
|
"mean_token_accuracy": 0.7049470484256745,
|
|
"num_tokens": 300299888.0,
|
|
"step": 4770
|
|
},
|
|
{
|
|
"epoch": 15.60556464811784,
|
|
"grad_norm": 13.137669630274404,
|
|
"learning_rate": 1.8609444589002305e-09,
|
|
"loss": 1.4457,
|
|
"mean_token_accuracy": 0.704905104637146,
|
|
"num_tokens": 300614429.0,
|
|
"step": 4775
|
|
},
|
|
{
|
|
"epoch": 15.621931260229132,
|
|
"grad_norm": 12.794971835716657,
|
|
"learning_rate": 1.8554319316383656e-09,
|
|
"loss": 1.4323,
|
|
"mean_token_accuracy": 0.7025792956352234,
|
|
"num_tokens": 300930547.0,
|
|
"step": 4780
|
|
},
|
|
{
|
|
"epoch": 15.638297872340425,
|
|
"grad_norm": 12.796486073759842,
|
|
"learning_rate": 1.8499227594378307e-09,
|
|
"loss": 1.4345,
|
|
"mean_token_accuracy": 0.7019464492797851,
|
|
"num_tokens": 301246649.0,
|
|
"step": 4785
|
|
},
|
|
{
|
|
"epoch": 15.654664484451718,
|
|
"grad_norm": 13.004969358776405,
|
|
"learning_rate": 1.8444169709745909e-09,
|
|
"loss": 1.4368,
|
|
"mean_token_accuracy": 0.7053989946842194,
|
|
"num_tokens": 301563164.0,
|
|
"step": 4790
|
|
},
|
|
{
|
|
"epoch": 15.671031096563011,
|
|
"grad_norm": 12.547077558098191,
|
|
"learning_rate": 1.8389145949069952e-09,
|
|
"loss": 1.4333,
|
|
"mean_token_accuracy": 0.703494918346405,
|
|
"num_tokens": 301878774.0,
|
|
"step": 4795
|
|
},
|
|
{
|
|
"epoch": 15.687397708674304,
|
|
"grad_norm": 12.864772984217574,
|
|
"learning_rate": 1.8334156598756332e-09,
|
|
"loss": 1.4456,
|
|
"mean_token_accuracy": 0.7033185422420501,
|
|
"num_tokens": 302194764.0,
|
|
"step": 4800
|
|
},
|
|
{
|
|
"epoch": 15.703764320785597,
|
|
"grad_norm": 12.889303348512994,
|
|
"learning_rate": 1.8279201945031835e-09,
|
|
"loss": 1.4443,
|
|
"mean_token_accuracy": 0.7018985509872436,
|
|
"num_tokens": 302511045.0,
|
|
"step": 4805
|
|
},
|
|
{
|
|
"epoch": 15.72013093289689,
|
|
"grad_norm": 11.93106030575331,
|
|
"learning_rate": 1.8224282273942639e-09,
|
|
"loss": 1.4423,
|
|
"mean_token_accuracy": 0.7027697741985321,
|
|
"num_tokens": 302826404.0,
|
|
"step": 4810
|
|
},
|
|
{
|
|
"epoch": 15.736497545008183,
|
|
"grad_norm": 12.753590751329146,
|
|
"learning_rate": 1.8169397871352833e-09,
|
|
"loss": 1.4321,
|
|
"mean_token_accuracy": 0.702825516462326,
|
|
"num_tokens": 303143255.0,
|
|
"step": 4815
|
|
},
|
|
{
|
|
"epoch": 15.752864157119475,
|
|
"grad_norm": 12.986619290364375,
|
|
"learning_rate": 1.8114549022942933e-09,
|
|
"loss": 1.4477,
|
|
"mean_token_accuracy": 0.703691053390503,
|
|
"num_tokens": 303460830.0,
|
|
"step": 4820
|
|
},
|
|
{
|
|
"epoch": 15.76923076923077,
|
|
"grad_norm": 13.099929730581625,
|
|
"learning_rate": 1.8059736014208387e-09,
|
|
"loss": 1.4308,
|
|
"mean_token_accuracy": 0.708087545633316,
|
|
"num_tokens": 303776766.0,
|
|
"step": 4825
|
|
},
|
|
{
|
|
"epoch": 15.785597381342063,
|
|
"grad_norm": 13.00016809628579,
|
|
"learning_rate": 1.8004959130458092e-09,
|
|
"loss": 1.4495,
|
|
"mean_token_accuracy": 0.7040505766868591,
|
|
"num_tokens": 304092857.0,
|
|
"step": 4830
|
|
},
|
|
{
|
|
"epoch": 15.801963993453356,
|
|
"grad_norm": 12.622790826032567,
|
|
"learning_rate": 1.7950218656812916e-09,
|
|
"loss": 1.4421,
|
|
"mean_token_accuracy": 0.7057828962802887,
|
|
"num_tokens": 304407912.0,
|
|
"step": 4835
|
|
},
|
|
{
|
|
"epoch": 15.818330605564649,
|
|
"grad_norm": 13.074411965123678,
|
|
"learning_rate": 1.7895514878204203e-09,
|
|
"loss": 1.4364,
|
|
"mean_token_accuracy": 0.7043783128261566,
|
|
"num_tokens": 304724801.0,
|
|
"step": 4840
|
|
},
|
|
{
|
|
"epoch": 15.834697217675942,
|
|
"grad_norm": 12.357229368576297,
|
|
"learning_rate": 1.7840848079372291e-09,
|
|
"loss": 1.4217,
|
|
"mean_token_accuracy": 0.7087771832942963,
|
|
"num_tokens": 305041073.0,
|
|
"step": 4845
|
|
},
|
|
{
|
|
"epoch": 15.851063829787234,
|
|
"grad_norm": 12.527322694946676,
|
|
"learning_rate": 1.7786218544865048e-09,
|
|
"loss": 1.4421,
|
|
"mean_token_accuracy": 0.7043534576892853,
|
|
"num_tokens": 305357141.0,
|
|
"step": 4850
|
|
},
|
|
{
|
|
"epoch": 15.867430441898527,
|
|
"grad_norm": 11.632455478155906,
|
|
"learning_rate": 1.773162655903635e-09,
|
|
"loss": 1.4156,
|
|
"mean_token_accuracy": 0.7109094977378845,
|
|
"num_tokens": 305673330.0,
|
|
"step": 4855
|
|
},
|
|
{
|
|
"epoch": 15.88379705400982,
|
|
"grad_norm": 12.484383450949295,
|
|
"learning_rate": 1.7677072406044653e-09,
|
|
"loss": 1.4291,
|
|
"mean_token_accuracy": 0.7088080048561096,
|
|
"num_tokens": 305988151.0,
|
|
"step": 4860
|
|
},
|
|
{
|
|
"epoch": 15.900163666121113,
|
|
"grad_norm": 12.366432589792248,
|
|
"learning_rate": 1.7622556369851476e-09,
|
|
"loss": 1.4219,
|
|
"mean_token_accuracy": 0.7094823539257049,
|
|
"num_tokens": 306303670.0,
|
|
"step": 4865
|
|
},
|
|
{
|
|
"epoch": 15.916530278232406,
|
|
"grad_norm": 12.747791579732302,
|
|
"learning_rate": 1.7568078734219933e-09,
|
|
"loss": 1.4459,
|
|
"mean_token_accuracy": 0.7039444029331208,
|
|
"num_tokens": 306617897.0,
|
|
"step": 4870
|
|
},
|
|
{
|
|
"epoch": 15.932896890343699,
|
|
"grad_norm": 12.532223512008814,
|
|
"learning_rate": 1.751363978271327e-09,
|
|
"loss": 1.4239,
|
|
"mean_token_accuracy": 0.7083085179328918,
|
|
"num_tokens": 306933638.0,
|
|
"step": 4875
|
|
},
|
|
{
|
|
"epoch": 15.949263502454992,
|
|
"grad_norm": 12.416976278316078,
|
|
"learning_rate": 1.7459239798693363e-09,
|
|
"loss": 1.4146,
|
|
"mean_token_accuracy": 0.7101383566856384,
|
|
"num_tokens": 307249025.0,
|
|
"step": 4880
|
|
},
|
|
{
|
|
"epoch": 15.965630114566284,
|
|
"grad_norm": 13.01261682583028,
|
|
"learning_rate": 1.7404879065319268e-09,
|
|
"loss": 1.442,
|
|
"mean_token_accuracy": 0.7075551569461822,
|
|
"num_tokens": 307564915.0,
|
|
"step": 4885
|
|
},
|
|
{
|
|
"epoch": 15.981996726677577,
|
|
"grad_norm": 12.602326240128173,
|
|
"learning_rate": 1.7350557865545724e-09,
|
|
"loss": 1.4359,
|
|
"mean_token_accuracy": 0.7061196863651276,
|
|
"num_tokens": 307879676.0,
|
|
"step": 4890
|
|
},
|
|
{
|
|
"epoch": 15.99836333878887,
|
|
"grad_norm": 12.026995154848029,
|
|
"learning_rate": 1.729627648212171e-09,
|
|
"loss": 1.4224,
|
|
"mean_token_accuracy": 0.7099742531776428,
|
|
"num_tokens": 308195258.0,
|
|
"step": 4895
|
|
},
|
|
{
|
|
"epoch": 16.013093289689035,
|
|
"grad_norm": 12.487645695354066,
|
|
"learning_rate": 1.7242035197588937e-09,
|
|
"loss": 1.4465,
|
|
"mean_token_accuracy": 0.7059299283557467,
|
|
"num_tokens": 308455045.0,
|
|
"step": 4900
|
|
},
|
|
{
|
|
"epoch": 16.029459901800326,
|
|
"grad_norm": 12.742840987338324,
|
|
"learning_rate": 1.7187834294280422e-09,
|
|
"loss": 1.4347,
|
|
"mean_token_accuracy": 0.7084084928035737,
|
|
"num_tokens": 308769309.0,
|
|
"step": 4905
|
|
},
|
|
{
|
|
"epoch": 16.04582651391162,
|
|
"grad_norm": 12.28965784309087,
|
|
"learning_rate": 1.7133674054318947e-09,
|
|
"loss": 1.4004,
|
|
"mean_token_accuracy": 0.7148695111274719,
|
|
"num_tokens": 309085266.0,
|
|
"step": 4910
|
|
},
|
|
{
|
|
"epoch": 16.062193126022912,
|
|
"grad_norm": 12.425330561068234,
|
|
"learning_rate": 1.7079554759615685e-09,
|
|
"loss": 1.4026,
|
|
"mean_token_accuracy": 0.7121374189853669,
|
|
"num_tokens": 309399971.0,
|
|
"step": 4915
|
|
},
|
|
{
|
|
"epoch": 16.078559738134206,
|
|
"grad_norm": 12.318608831255458,
|
|
"learning_rate": 1.702547669186865e-09,
|
|
"loss": 1.4247,
|
|
"mean_token_accuracy": 0.7116099834442139,
|
|
"num_tokens": 309716297.0,
|
|
"step": 4920
|
|
},
|
|
{
|
|
"epoch": 16.094926350245498,
|
|
"grad_norm": 12.209031557381481,
|
|
"learning_rate": 1.6971440132561283e-09,
|
|
"loss": 1.4393,
|
|
"mean_token_accuracy": 0.7075030922889709,
|
|
"num_tokens": 310030581.0,
|
|
"step": 4925
|
|
},
|
|
{
|
|
"epoch": 16.111292962356792,
|
|
"grad_norm": 12.519677365451061,
|
|
"learning_rate": 1.6917445362960965e-09,
|
|
"loss": 1.4413,
|
|
"mean_token_accuracy": 0.7066156387329101,
|
|
"num_tokens": 310345610.0,
|
|
"step": 4930
|
|
},
|
|
{
|
|
"epoch": 16.127659574468087,
|
|
"grad_norm": 12.828979086104322,
|
|
"learning_rate": 1.6863492664117547e-09,
|
|
"loss": 1.4193,
|
|
"mean_token_accuracy": 0.7108834385871887,
|
|
"num_tokens": 310662055.0,
|
|
"step": 4935
|
|
},
|
|
{
|
|
"epoch": 16.144026186579378,
|
|
"grad_norm": 12.966364808553605,
|
|
"learning_rate": 1.680958231686191e-09,
|
|
"loss": 1.4467,
|
|
"mean_token_accuracy": 0.7091517686843872,
|
|
"num_tokens": 310977412.0,
|
|
"step": 4940
|
|
},
|
|
{
|
|
"epoch": 16.160392798690673,
|
|
"grad_norm": 12.4486500881121,
|
|
"learning_rate": 1.6755714601804473e-09,
|
|
"loss": 1.4123,
|
|
"mean_token_accuracy": 0.7110908329486847,
|
|
"num_tokens": 311293114.0,
|
|
"step": 4945
|
|
},
|
|
{
|
|
"epoch": 16.176759410801964,
|
|
"grad_norm": 12.12641458468891,
|
|
"learning_rate": 1.6701889799333764e-09,
|
|
"loss": 1.4338,
|
|
"mean_token_accuracy": 0.7081463217735291,
|
|
"num_tokens": 311608822.0,
|
|
"step": 4950
|
|
},
|
|
{
|
|
"epoch": 16.19312602291326,
|
|
"grad_norm": 12.419806067336424,
|
|
"learning_rate": 1.6648108189614937e-09,
|
|
"loss": 1.4285,
|
|
"mean_token_accuracy": 0.7072434544563293,
|
|
"num_tokens": 311924352.0,
|
|
"step": 4955
|
|
},
|
|
{
|
|
"epoch": 16.20949263502455,
|
|
"grad_norm": 12.528113062059223,
|
|
"learning_rate": 1.6594370052588325e-09,
|
|
"loss": 1.4158,
|
|
"mean_token_accuracy": 0.7136779129505157,
|
|
"num_tokens": 312239834.0,
|
|
"step": 4960
|
|
},
|
|
{
|
|
"epoch": 16.225859247135844,
|
|
"grad_norm": 12.26285948778274,
|
|
"learning_rate": 1.6540675667967975e-09,
|
|
"loss": 1.4241,
|
|
"mean_token_accuracy": 0.7105782389640808,
|
|
"num_tokens": 312555700.0,
|
|
"step": 4965
|
|
},
|
|
{
|
|
"epoch": 16.242225859247135,
|
|
"grad_norm": 12.857900754501056,
|
|
"learning_rate": 1.6487025315240205e-09,
|
|
"loss": 1.4192,
|
|
"mean_token_accuracy": 0.7106291711330414,
|
|
"num_tokens": 312873912.0,
|
|
"step": 4970
|
|
},
|
|
{
|
|
"epoch": 16.25859247135843,
|
|
"grad_norm": 12.471903624085858,
|
|
"learning_rate": 1.6433419273662134e-09,
|
|
"loss": 1.4212,
|
|
"mean_token_accuracy": 0.711182713508606,
|
|
"num_tokens": 313188562.0,
|
|
"step": 4975
|
|
},
|
|
{
|
|
"epoch": 16.27495908346972,
|
|
"grad_norm": 12.977401525766625,
|
|
"learning_rate": 1.6379857822260242e-09,
|
|
"loss": 1.4246,
|
|
"mean_token_accuracy": 0.712104445695877,
|
|
"num_tokens": 313504755.0,
|
|
"step": 4980
|
|
},
|
|
{
|
|
"epoch": 16.291325695581016,
|
|
"grad_norm": 12.38125234612746,
|
|
"learning_rate": 1.63263412398289e-09,
|
|
"loss": 1.4211,
|
|
"mean_token_accuracy": 0.7122440755367279,
|
|
"num_tokens": 313821196.0,
|
|
"step": 4985
|
|
},
|
|
{
|
|
"epoch": 16.307692307692307,
|
|
"grad_norm": 13.00957545801318,
|
|
"learning_rate": 1.6272869804928953e-09,
|
|
"loss": 1.4188,
|
|
"mean_token_accuracy": 0.712299644947052,
|
|
"num_tokens": 314135776.0,
|
|
"step": 4990
|
|
},
|
|
{
|
|
"epoch": 16.3240589198036,
|
|
"grad_norm": 12.191919695501758,
|
|
"learning_rate": 1.621944379588622e-09,
|
|
"loss": 1.4022,
|
|
"mean_token_accuracy": 0.7136437952518463,
|
|
"num_tokens": 314450399.0,
|
|
"step": 4995
|
|
},
|
|
{
|
|
"epoch": 16.340425531914892,
|
|
"grad_norm": 12.2615909933301,
|
|
"learning_rate": 1.616606349079009e-09,
|
|
"loss": 1.4097,
|
|
"mean_token_accuracy": 0.713315773010254,
|
|
"num_tokens": 314766520.0,
|
|
"step": 5000
|
|
},
|
|
{
|
|
"epoch": 16.356792144026187,
|
|
"grad_norm": 13.113137902963848,
|
|
"learning_rate": 1.611272916749205e-09,
|
|
"loss": 1.4274,
|
|
"mean_token_accuracy": 0.7096070230007172,
|
|
"num_tokens": 315080587.0,
|
|
"step": 5005
|
|
},
|
|
{
|
|
"epoch": 16.373158756137478,
|
|
"grad_norm": 13.089840378773003,
|
|
"learning_rate": 1.6059441103604248e-09,
|
|
"loss": 1.4345,
|
|
"mean_token_accuracy": 0.7088598847389221,
|
|
"num_tokens": 315396046.0,
|
|
"step": 5010
|
|
},
|
|
{
|
|
"epoch": 16.389525368248773,
|
|
"grad_norm": 13.007606738230844,
|
|
"learning_rate": 1.6006199576498043e-09,
|
|
"loss": 1.4324,
|
|
"mean_token_accuracy": 0.7114957571029663,
|
|
"num_tokens": 315712887.0,
|
|
"step": 5015
|
|
},
|
|
{
|
|
"epoch": 16.405891980360064,
|
|
"grad_norm": 12.73546978432187,
|
|
"learning_rate": 1.5953004863302579e-09,
|
|
"loss": 1.4243,
|
|
"mean_token_accuracy": 0.7114252030849457,
|
|
"num_tokens": 316028929.0,
|
|
"step": 5020
|
|
},
|
|
{
|
|
"epoch": 16.42225859247136,
|
|
"grad_norm": 11.962805948309864,
|
|
"learning_rate": 1.5899857240903293e-09,
|
|
"loss": 1.4221,
|
|
"mean_token_accuracy": 0.7109561562538147,
|
|
"num_tokens": 316344275.0,
|
|
"step": 5025
|
|
},
|
|
{
|
|
"epoch": 16.438625204582653,
|
|
"grad_norm": 12.6828623743294,
|
|
"learning_rate": 1.5846756985940544e-09,
|
|
"loss": 1.4271,
|
|
"mean_token_accuracy": 0.709166294336319,
|
|
"num_tokens": 316659921.0,
|
|
"step": 5030
|
|
},
|
|
{
|
|
"epoch": 16.454991816693944,
|
|
"grad_norm": 12.139305827793805,
|
|
"learning_rate": 1.5793704374808121e-09,
|
|
"loss": 1.4121,
|
|
"mean_token_accuracy": 0.7111846745014191,
|
|
"num_tokens": 316974336.0,
|
|
"step": 5035
|
|
},
|
|
{
|
|
"epoch": 16.47135842880524,
|
|
"grad_norm": 12.273749837279315,
|
|
"learning_rate": 1.574069968365182e-09,
|
|
"loss": 1.4156,
|
|
"mean_token_accuracy": 0.712162172794342,
|
|
"num_tokens": 317291107.0,
|
|
"step": 5040
|
|
},
|
|
{
|
|
"epoch": 16.48772504091653,
|
|
"grad_norm": 12.577531348932242,
|
|
"learning_rate": 1.5687743188368012e-09,
|
|
"loss": 1.4272,
|
|
"mean_token_accuracy": 0.7101832747459411,
|
|
"num_tokens": 317606002.0,
|
|
"step": 5045
|
|
},
|
|
{
|
|
"epoch": 16.504091653027825,
|
|
"grad_norm": 12.636590864899892,
|
|
"learning_rate": 1.5634835164602198e-09,
|
|
"loss": 1.4001,
|
|
"mean_token_accuracy": 0.7117600500583648,
|
|
"num_tokens": 317921351.0,
|
|
"step": 5050
|
|
},
|
|
{
|
|
"epoch": 16.520458265139116,
|
|
"grad_norm": 12.546261992814705,
|
|
"learning_rate": 1.5581975887747584e-09,
|
|
"loss": 1.4454,
|
|
"mean_token_accuracy": 0.7062943160533905,
|
|
"num_tokens": 318239595.0,
|
|
"step": 5055
|
|
},
|
|
{
|
|
"epoch": 16.53682487725041,
|
|
"grad_norm": 12.820298302371835,
|
|
"learning_rate": 1.5529165632943637e-09,
|
|
"loss": 1.4285,
|
|
"mean_token_accuracy": 0.7094673216342926,
|
|
"num_tokens": 318555791.0,
|
|
"step": 5060
|
|
},
|
|
{
|
|
"epoch": 16.5531914893617,
|
|
"grad_norm": 12.81934414091327,
|
|
"learning_rate": 1.5476404675074662e-09,
|
|
"loss": 1.4185,
|
|
"mean_token_accuracy": 0.7127328336238861,
|
|
"num_tokens": 318871455.0,
|
|
"step": 5065
|
|
},
|
|
{
|
|
"epoch": 16.569558101472996,
|
|
"grad_norm": 12.528634024114519,
|
|
"learning_rate": 1.5423693288768356e-09,
|
|
"loss": 1.4257,
|
|
"mean_token_accuracy": 0.7096285283565521,
|
|
"num_tokens": 319186216.0,
|
|
"step": 5070
|
|
},
|
|
{
|
|
"epoch": 16.585924713584287,
|
|
"grad_norm": 12.45352197076654,
|
|
"learning_rate": 1.5371031748394415e-09,
|
|
"loss": 1.4133,
|
|
"mean_token_accuracy": 0.7132382929325104,
|
|
"num_tokens": 319503790.0,
|
|
"step": 5075
|
|
},
|
|
{
|
|
"epoch": 16.60229132569558,
|
|
"grad_norm": 12.823560917577696,
|
|
"learning_rate": 1.5318420328063042e-09,
|
|
"loss": 1.433,
|
|
"mean_token_accuracy": 0.7091699779033661,
|
|
"num_tokens": 319819206.0,
|
|
"step": 5080
|
|
},
|
|
{
|
|
"epoch": 16.618657937806873,
|
|
"grad_norm": 12.551449755539915,
|
|
"learning_rate": 1.526585930162359e-09,
|
|
"loss": 1.3969,
|
|
"mean_token_accuracy": 0.7174212098121643,
|
|
"num_tokens": 320135045.0,
|
|
"step": 5085
|
|
},
|
|
{
|
|
"epoch": 16.635024549918167,
|
|
"grad_norm": 12.432937590754685,
|
|
"learning_rate": 1.5213348942663091e-09,
|
|
"loss": 1.4038,
|
|
"mean_token_accuracy": 0.712946742773056,
|
|
"num_tokens": 320451345.0,
|
|
"step": 5090
|
|
},
|
|
{
|
|
"epoch": 16.65139116202946,
|
|
"grad_norm": 12.591699551531104,
|
|
"learning_rate": 1.5160889524504857e-09,
|
|
"loss": 1.402,
|
|
"mean_token_accuracy": 0.7149172186851501,
|
|
"num_tokens": 320765107.0,
|
|
"step": 5095
|
|
},
|
|
{
|
|
"epoch": 16.667757774140753,
|
|
"grad_norm": 12.558780429229012,
|
|
"learning_rate": 1.5108481320207031e-09,
|
|
"loss": 1.409,
|
|
"mean_token_accuracy": 0.7116455256938934,
|
|
"num_tokens": 321080526.0,
|
|
"step": 5100
|
|
},
|
|
{
|
|
"epoch": 16.684124386252044,
|
|
"grad_norm": 12.556788522001066,
|
|
"learning_rate": 1.5056124602561197e-09,
|
|
"loss": 1.4188,
|
|
"mean_token_accuracy": 0.7126824915409088,
|
|
"num_tokens": 321395505.0,
|
|
"step": 5105
|
|
},
|
|
{
|
|
"epoch": 16.70049099836334,
|
|
"grad_norm": 12.56475488051728,
|
|
"learning_rate": 1.5003819644090933e-09,
|
|
"loss": 1.4172,
|
|
"mean_token_accuracy": 0.7109236419200897,
|
|
"num_tokens": 321711520.0,
|
|
"step": 5110
|
|
},
|
|
{
|
|
"epoch": 16.71685761047463,
|
|
"grad_norm": 12.673240593488543,
|
|
"learning_rate": 1.4951566717050408e-09,
|
|
"loss": 1.4185,
|
|
"mean_token_accuracy": 0.7114186406135559,
|
|
"num_tokens": 322028854.0,
|
|
"step": 5115
|
|
},
|
|
{
|
|
"epoch": 16.733224222585925,
|
|
"grad_norm": 12.749139578170611,
|
|
"learning_rate": 1.4899366093422962e-09,
|
|
"loss": 1.411,
|
|
"mean_token_accuracy": 0.7120306134223938,
|
|
"num_tokens": 322345696.0,
|
|
"step": 5120
|
|
},
|
|
{
|
|
"epoch": 16.74959083469722,
|
|
"grad_norm": 12.565219692692061,
|
|
"learning_rate": 1.4847218044919685e-09,
|
|
"loss": 1.3943,
|
|
"mean_token_accuracy": 0.7164922475814819,
|
|
"num_tokens": 322662655.0,
|
|
"step": 5125
|
|
},
|
|
{
|
|
"epoch": 16.76595744680851,
|
|
"grad_norm": 13.494265061680965,
|
|
"learning_rate": 1.479512284297801e-09,
|
|
"loss": 1.4221,
|
|
"mean_token_accuracy": 0.7114957809448242,
|
|
"num_tokens": 322977947.0,
|
|
"step": 5130
|
|
},
|
|
{
|
|
"epoch": 16.782324058919805,
|
|
"grad_norm": 12.453801890723668,
|
|
"learning_rate": 1.47430807587603e-09,
|
|
"loss": 1.4019,
|
|
"mean_token_accuracy": 0.7145236849784851,
|
|
"num_tokens": 323294224.0,
|
|
"step": 5135
|
|
},
|
|
{
|
|
"epoch": 16.798690671031096,
|
|
"grad_norm": 12.835301219712429,
|
|
"learning_rate": 1.4691092063152418e-09,
|
|
"loss": 1.4176,
|
|
"mean_token_accuracy": 0.7122674882411957,
|
|
"num_tokens": 323609615.0,
|
|
"step": 5140
|
|
},
|
|
{
|
|
"epoch": 16.81505728314239,
|
|
"grad_norm": 12.174453724474661,
|
|
"learning_rate": 1.4639157026762344e-09,
|
|
"loss": 1.4172,
|
|
"mean_token_accuracy": 0.7103416383266449,
|
|
"num_tokens": 323924353.0,
|
|
"step": 5145
|
|
},
|
|
{
|
|
"epoch": 16.831423895253682,
|
|
"grad_norm": 12.302664934760815,
|
|
"learning_rate": 1.458727591991877e-09,
|
|
"loss": 1.4035,
|
|
"mean_token_accuracy": 0.7144258797168732,
|
|
"num_tokens": 324241521.0,
|
|
"step": 5150
|
|
},
|
|
{
|
|
"epoch": 16.847790507364977,
|
|
"grad_norm": 12.624334579460958,
|
|
"learning_rate": 1.4535449012669638e-09,
|
|
"loss": 1.3817,
|
|
"mean_token_accuracy": 0.7187546908855438,
|
|
"num_tokens": 324556732.0,
|
|
"step": 5155
|
|
},
|
|
{
|
|
"epoch": 16.864157119476268,
|
|
"grad_norm": 12.862259746060234,
|
|
"learning_rate": 1.4483676574780814e-09,
|
|
"loss": 1.4156,
|
|
"mean_token_accuracy": 0.7101677834987641,
|
|
"num_tokens": 324872836.0,
|
|
"step": 5160
|
|
},
|
|
{
|
|
"epoch": 16.880523731587562,
|
|
"grad_norm": 12.59492906662343,
|
|
"learning_rate": 1.4431958875734616e-09,
|
|
"loss": 1.4117,
|
|
"mean_token_accuracy": 0.7114337801933288,
|
|
"num_tokens": 325187319.0,
|
|
"step": 5165
|
|
},
|
|
{
|
|
"epoch": 16.896890343698853,
|
|
"grad_norm": 12.904641520216886,
|
|
"learning_rate": 1.4380296184728447e-09,
|
|
"loss": 1.4,
|
|
"mean_token_accuracy": 0.7136520922183991,
|
|
"num_tokens": 325502445.0,
|
|
"step": 5170
|
|
},
|
|
{
|
|
"epoch": 16.913256955810148,
|
|
"grad_norm": 12.997644941022777,
|
|
"learning_rate": 1.432868877067341e-09,
|
|
"loss": 1.4183,
|
|
"mean_token_accuracy": 0.7120531141757965,
|
|
"num_tokens": 325819093.0,
|
|
"step": 5175
|
|
},
|
|
{
|
|
"epoch": 16.92962356792144,
|
|
"grad_norm": 12.457739496706811,
|
|
"learning_rate": 1.427713690219285e-09,
|
|
"loss": 1.4025,
|
|
"mean_token_accuracy": 0.7128809988498688,
|
|
"num_tokens": 326134906.0,
|
|
"step": 5180
|
|
},
|
|
{
|
|
"epoch": 16.945990180032734,
|
|
"grad_norm": 12.349458584670058,
|
|
"learning_rate": 1.4225640847621006e-09,
|
|
"loss": 1.4191,
|
|
"mean_token_accuracy": 0.7112360656261444,
|
|
"num_tokens": 326450570.0,
|
|
"step": 5185
|
|
},
|
|
{
|
|
"epoch": 16.962356792144025,
|
|
"grad_norm": 12.43600358041225,
|
|
"learning_rate": 1.4174200875001603e-09,
|
|
"loss": 1.3981,
|
|
"mean_token_accuracy": 0.7156706392765045,
|
|
"num_tokens": 326766790.0,
|
|
"step": 5190
|
|
},
|
|
{
|
|
"epoch": 16.97872340425532,
|
|
"grad_norm": 12.57363296167154,
|
|
"learning_rate": 1.4122817252086426e-09,
|
|
"loss": 1.4069,
|
|
"mean_token_accuracy": 0.7124275147914887,
|
|
"num_tokens": 327079981.0,
|
|
"step": 5195
|
|
},
|
|
{
|
|
"epoch": 16.99509001636661,
|
|
"grad_norm": 12.224492851778363,
|
|
"learning_rate": 1.4071490246333978e-09,
|
|
"loss": 1.3923,
|
|
"mean_token_accuracy": 0.7143379509449005,
|
|
"num_tokens": 327394913.0,
|
|
"step": 5200
|
|
},
|
|
{
|
|
"epoch": 17.009819967266775,
|
|
"grad_norm": 12.660254127021258,
|
|
"learning_rate": 1.4020220124908064e-09,
|
|
"loss": 1.3851,
|
|
"mean_token_accuracy": 0.7145334217283461,
|
|
"num_tokens": 327655529.0,
|
|
"step": 5205
|
|
},
|
|
{
|
|
"epoch": 17.02618657937807,
|
|
"grad_norm": 12.374122101450418,
|
|
"learning_rate": 1.3969007154676383e-09,
|
|
"loss": 1.3968,
|
|
"mean_token_accuracy": 0.7153842210769653,
|
|
"num_tokens": 327971787.0,
|
|
"step": 5210
|
|
},
|
|
{
|
|
"epoch": 17.04255319148936,
|
|
"grad_norm": 13.523104807250503,
|
|
"learning_rate": 1.3917851602209163e-09,
|
|
"loss": 1.4022,
|
|
"mean_token_accuracy": 0.712611448764801,
|
|
"num_tokens": 328288325.0,
|
|
"step": 5215
|
|
},
|
|
{
|
|
"epoch": 17.058919803600656,
|
|
"grad_norm": 12.828752397429142,
|
|
"learning_rate": 1.3866753733777766e-09,
|
|
"loss": 1.415,
|
|
"mean_token_accuracy": 0.7102250695228577,
|
|
"num_tokens": 328603210.0,
|
|
"step": 5220
|
|
},
|
|
{
|
|
"epoch": 17.075286415711947,
|
|
"grad_norm": 12.137538036991014,
|
|
"learning_rate": 1.3815713815353295e-09,
|
|
"loss": 1.3892,
|
|
"mean_token_accuracy": 0.7171003341674804,
|
|
"num_tokens": 328919502.0,
|
|
"step": 5225
|
|
},
|
|
{
|
|
"epoch": 17.09165302782324,
|
|
"grad_norm": 12.799277571065465,
|
|
"learning_rate": 1.376473211260522e-09,
|
|
"loss": 1.4185,
|
|
"mean_token_accuracy": 0.7131861090660095,
|
|
"num_tokens": 329234120.0,
|
|
"step": 5230
|
|
},
|
|
{
|
|
"epoch": 17.108019639934533,
|
|
"grad_norm": 12.603093386870297,
|
|
"learning_rate": 1.3713808890899993e-09,
|
|
"loss": 1.4136,
|
|
"mean_token_accuracy": 0.7117380559444427,
|
|
"num_tokens": 329550898.0,
|
|
"step": 5235
|
|
},
|
|
{
|
|
"epoch": 17.124386252045827,
|
|
"grad_norm": 11.975315830322039,
|
|
"learning_rate": 1.3662944415299658e-09,
|
|
"loss": 1.4063,
|
|
"mean_token_accuracy": 0.7126054346561432,
|
|
"num_tokens": 329865571.0,
|
|
"step": 5240
|
|
},
|
|
{
|
|
"epoch": 17.14075286415712,
|
|
"grad_norm": 13.417997807400582,
|
|
"learning_rate": 1.3612138950560493e-09,
|
|
"loss": 1.402,
|
|
"mean_token_accuracy": 0.7132501244544983,
|
|
"num_tokens": 330181355.0,
|
|
"step": 5245
|
|
},
|
|
{
|
|
"epoch": 17.157119476268413,
|
|
"grad_norm": 12.89015778822025,
|
|
"learning_rate": 1.3561392761131583e-09,
|
|
"loss": 1.4048,
|
|
"mean_token_accuracy": 0.7087698340415954,
|
|
"num_tokens": 330496848.0,
|
|
"step": 5250
|
|
},
|
|
{
|
|
"epoch": 17.173486088379704,
|
|
"grad_norm": 12.561411499706335,
|
|
"learning_rate": 1.3510706111153515e-09,
|
|
"loss": 1.4171,
|
|
"mean_token_accuracy": 0.710574495792389,
|
|
"num_tokens": 330812707.0,
|
|
"step": 5255
|
|
},
|
|
{
|
|
"epoch": 17.189852700491,
|
|
"grad_norm": 12.451761146260129,
|
|
"learning_rate": 1.346007926445694e-09,
|
|
"loss": 1.3968,
|
|
"mean_token_accuracy": 0.7150484561920166,
|
|
"num_tokens": 331129434.0,
|
|
"step": 5260
|
|
},
|
|
{
|
|
"epoch": 17.20621931260229,
|
|
"grad_norm": 12.336908491072222,
|
|
"learning_rate": 1.3409512484561242e-09,
|
|
"loss": 1.3909,
|
|
"mean_token_accuracy": 0.7159785687923431,
|
|
"num_tokens": 331446337.0,
|
|
"step": 5265
|
|
},
|
|
{
|
|
"epoch": 17.222585924713584,
|
|
"grad_norm": 12.821838773919659,
|
|
"learning_rate": 1.3359006034673144e-09,
|
|
"loss": 1.3834,
|
|
"mean_token_accuracy": 0.7165203213691711,
|
|
"num_tokens": 331761818.0,
|
|
"step": 5270
|
|
},
|
|
{
|
|
"epoch": 17.238952536824875,
|
|
"grad_norm": 12.394587970969967,
|
|
"learning_rate": 1.3308560177685334e-09,
|
|
"loss": 1.3957,
|
|
"mean_token_accuracy": 0.7148654341697693,
|
|
"num_tokens": 332077756.0,
|
|
"step": 5275
|
|
},
|
|
{
|
|
"epoch": 17.25531914893617,
|
|
"grad_norm": 12.842214481966593,
|
|
"learning_rate": 1.325817517617512e-09,
|
|
"loss": 1.3947,
|
|
"mean_token_accuracy": 0.7158268690109253,
|
|
"num_tokens": 332393546.0,
|
|
"step": 5280
|
|
},
|
|
{
|
|
"epoch": 17.271685761047465,
|
|
"grad_norm": 12.662037146870594,
|
|
"learning_rate": 1.3207851292403036e-09,
|
|
"loss": 1.3972,
|
|
"mean_token_accuracy": 0.7155718326568603,
|
|
"num_tokens": 332710522.0,
|
|
"step": 5285
|
|
},
|
|
{
|
|
"epoch": 17.288052373158756,
|
|
"grad_norm": 12.631565840669538,
|
|
"learning_rate": 1.3157588788311504e-09,
|
|
"loss": 1.3822,
|
|
"mean_token_accuracy": 0.715112566947937,
|
|
"num_tokens": 333026066.0,
|
|
"step": 5290
|
|
},
|
|
{
|
|
"epoch": 17.30441898527005,
|
|
"grad_norm": 13.060137316177375,
|
|
"learning_rate": 1.3107387925523445e-09,
|
|
"loss": 1.4061,
|
|
"mean_token_accuracy": 0.7136071026325226,
|
|
"num_tokens": 333338968.0,
|
|
"step": 5295
|
|
},
|
|
{
|
|
"epoch": 17.32078559738134,
|
|
"grad_norm": 12.497161370908389,
|
|
"learning_rate": 1.305724896534094e-09,
|
|
"loss": 1.3911,
|
|
"mean_token_accuracy": 0.7150938749313355,
|
|
"num_tokens": 333656222.0,
|
|
"step": 5300
|
|
},
|
|
{
|
|
"epoch": 17.337152209492636,
|
|
"grad_norm": 12.310988293329496,
|
|
"learning_rate": 1.3007172168743852e-09,
|
|
"loss": 1.3997,
|
|
"mean_token_accuracy": 0.712196159362793,
|
|
"num_tokens": 333972297.0,
|
|
"step": 5305
|
|
},
|
|
{
|
|
"epoch": 17.353518821603927,
|
|
"grad_norm": 13.00195560661611,
|
|
"learning_rate": 1.2957157796388463e-09,
|
|
"loss": 1.4093,
|
|
"mean_token_accuracy": 0.7123797535896301,
|
|
"num_tokens": 334288137.0,
|
|
"step": 5310
|
|
},
|
|
{
|
|
"epoch": 17.369885433715222,
|
|
"grad_norm": 12.53115445959651,
|
|
"learning_rate": 1.2907206108606151e-09,
|
|
"loss": 1.4021,
|
|
"mean_token_accuracy": 0.7121556222438812,
|
|
"num_tokens": 334603646.0,
|
|
"step": 5315
|
|
},
|
|
{
|
|
"epoch": 17.386252045826513,
|
|
"grad_norm": 12.920395716601327,
|
|
"learning_rate": 1.2857317365401996e-09,
|
|
"loss": 1.4012,
|
|
"mean_token_accuracy": 0.7114644229412079,
|
|
"num_tokens": 334918136.0,
|
|
"step": 5320
|
|
},
|
|
{
|
|
"epoch": 17.402618657937808,
|
|
"grad_norm": 12.925708294532836,
|
|
"learning_rate": 1.2807491826453455e-09,
|
|
"loss": 1.4002,
|
|
"mean_token_accuracy": 0.7134134829044342,
|
|
"num_tokens": 335233847.0,
|
|
"step": 5325
|
|
},
|
|
{
|
|
"epoch": 17.4189852700491,
|
|
"grad_norm": 13.075160728738613,
|
|
"learning_rate": 1.2757729751108988e-09,
|
|
"loss": 1.4126,
|
|
"mean_token_accuracy": 0.7115934014320373,
|
|
"num_tokens": 335550114.0,
|
|
"step": 5330
|
|
},
|
|
{
|
|
"epoch": 17.435351882160393,
|
|
"grad_norm": 13.185241533755097,
|
|
"learning_rate": 1.2708031398386724e-09,
|
|
"loss": 1.3959,
|
|
"mean_token_accuracy": 0.7153161346912384,
|
|
"num_tokens": 335866911.0,
|
|
"step": 5335
|
|
},
|
|
{
|
|
"epoch": 17.451718494271685,
|
|
"grad_norm": 12.65442216382764,
|
|
"learning_rate": 1.2658397026973112e-09,
|
|
"loss": 1.4011,
|
|
"mean_token_accuracy": 0.7110715806484222,
|
|
"num_tokens": 336184022.0,
|
|
"step": 5340
|
|
},
|
|
{
|
|
"epoch": 17.46808510638298,
|
|
"grad_norm": 12.753114054132789,
|
|
"learning_rate": 1.2608826895221558e-09,
|
|
"loss": 1.4102,
|
|
"mean_token_accuracy": 0.7115493476390838,
|
|
"num_tokens": 336499508.0,
|
|
"step": 5345
|
|
},
|
|
{
|
|
"epoch": 17.48445171849427,
|
|
"grad_norm": 12.936481191853076,
|
|
"learning_rate": 1.2559321261151103e-09,
|
|
"loss": 1.3971,
|
|
"mean_token_accuracy": 0.7129360318183899,
|
|
"num_tokens": 336815137.0,
|
|
"step": 5350
|
|
},
|
|
{
|
|
"epoch": 17.500818330605565,
|
|
"grad_norm": 12.383260787314933,
|
|
"learning_rate": 1.2509880382445062e-09,
|
|
"loss": 1.3937,
|
|
"mean_token_accuracy": 0.7159365475177765,
|
|
"num_tokens": 337130392.0,
|
|
"step": 5355
|
|
},
|
|
{
|
|
"epoch": 17.517184942716856,
|
|
"grad_norm": 12.692724465470993,
|
|
"learning_rate": 1.2460504516449696e-09,
|
|
"loss": 1.4046,
|
|
"mean_token_accuracy": 0.7109850525856019,
|
|
"num_tokens": 337445070.0,
|
|
"step": 5360
|
|
},
|
|
{
|
|
"epoch": 17.53355155482815,
|
|
"grad_norm": 12.712425609177462,
|
|
"learning_rate": 1.2411193920172864e-09,
|
|
"loss": 1.4089,
|
|
"mean_token_accuracy": 0.7130200266838074,
|
|
"num_tokens": 337761551.0,
|
|
"step": 5365
|
|
},
|
|
{
|
|
"epoch": 17.54991816693944,
|
|
"grad_norm": 12.350995391081879,
|
|
"learning_rate": 1.236194885028268e-09,
|
|
"loss": 1.3759,
|
|
"mean_token_accuracy": 0.7158704221248626,
|
|
"num_tokens": 338075491.0,
|
|
"step": 5370
|
|
},
|
|
{
|
|
"epoch": 17.566284779050736,
|
|
"grad_norm": 12.99858596953847,
|
|
"learning_rate": 1.23127695631062e-09,
|
|
"loss": 1.3916,
|
|
"mean_token_accuracy": 0.7128506302833557,
|
|
"num_tokens": 338391201.0,
|
|
"step": 5375
|
|
},
|
|
{
|
|
"epoch": 17.58265139116203,
|
|
"grad_norm": 12.434309697315385,
|
|
"learning_rate": 1.2263656314628056e-09,
|
|
"loss": 1.3785,
|
|
"mean_token_accuracy": 0.7174550950527191,
|
|
"num_tokens": 338707150.0,
|
|
"step": 5380
|
|
},
|
|
{
|
|
"epoch": 17.599018003273322,
|
|
"grad_norm": 12.836552647774617,
|
|
"learning_rate": 1.221460936048915e-09,
|
|
"loss": 1.3864,
|
|
"mean_token_accuracy": 0.7141911685466766,
|
|
"num_tokens": 339022998.0,
|
|
"step": 5385
|
|
},
|
|
{
|
|
"epoch": 17.615384615384617,
|
|
"grad_norm": 12.571682517325653,
|
|
"learning_rate": 1.2165628955985313e-09,
|
|
"loss": 1.3889,
|
|
"mean_token_accuracy": 0.7143755376338958,
|
|
"num_tokens": 339340065.0,
|
|
"step": 5390
|
|
},
|
|
{
|
|
"epoch": 17.631751227495908,
|
|
"grad_norm": 13.395496985978712,
|
|
"learning_rate": 1.2116715356065971e-09,
|
|
"loss": 1.4006,
|
|
"mean_token_accuracy": 0.7140235543251038,
|
|
"num_tokens": 339654668.0,
|
|
"step": 5395
|
|
},
|
|
{
|
|
"epoch": 17.648117839607202,
|
|
"grad_norm": 12.479045524766219,
|
|
"learning_rate": 1.206786881533283e-09,
|
|
"loss": 1.3942,
|
|
"mean_token_accuracy": 0.7156869411468506,
|
|
"num_tokens": 339970606.0,
|
|
"step": 5400
|
|
},
|
|
{
|
|
"epoch": 17.664484451718494,
|
|
"grad_norm": 12.206575662961889,
|
|
"learning_rate": 1.2019089588038538e-09,
|
|
"loss": 1.3719,
|
|
"mean_token_accuracy": 0.7192688524723053,
|
|
"num_tokens": 340286065.0,
|
|
"step": 5405
|
|
},
|
|
{
|
|
"epoch": 17.680851063829788,
|
|
"grad_norm": 12.73772441105973,
|
|
"learning_rate": 1.1970377928085372e-09,
|
|
"loss": 1.3989,
|
|
"mean_token_accuracy": 0.7134172976016998,
|
|
"num_tokens": 340601645.0,
|
|
"step": 5410
|
|
},
|
|
{
|
|
"epoch": 17.69721767594108,
|
|
"grad_norm": 13.067032223425121,
|
|
"learning_rate": 1.1921734089023916e-09,
|
|
"loss": 1.416,
|
|
"mean_token_accuracy": 0.7102409243583679,
|
|
"num_tokens": 340916817.0,
|
|
"step": 5415
|
|
},
|
|
{
|
|
"epoch": 17.713584288052374,
|
|
"grad_norm": 12.860371801785337,
|
|
"learning_rate": 1.1873158324051716e-09,
|
|
"loss": 1.3921,
|
|
"mean_token_accuracy": 0.7153089880943299,
|
|
"num_tokens": 341231812.0,
|
|
"step": 5420
|
|
},
|
|
{
|
|
"epoch": 17.729950900163665,
|
|
"grad_norm": 12.237459579013946,
|
|
"learning_rate": 1.1824650886012012e-09,
|
|
"loss": 1.3719,
|
|
"mean_token_accuracy": 0.7181912899017334,
|
|
"num_tokens": 341546324.0,
|
|
"step": 5425
|
|
},
|
|
{
|
|
"epoch": 17.74631751227496,
|
|
"grad_norm": 12.851633508224957,
|
|
"learning_rate": 1.1776212027392376e-09,
|
|
"loss": 1.3814,
|
|
"mean_token_accuracy": 0.7169644117355347,
|
|
"num_tokens": 341862416.0,
|
|
"step": 5430
|
|
},
|
|
{
|
|
"epoch": 17.76268412438625,
|
|
"grad_norm": 12.715774815841636,
|
|
"learning_rate": 1.1727842000323422e-09,
|
|
"loss": 1.3832,
|
|
"mean_token_accuracy": 0.7157602131366729,
|
|
"num_tokens": 342179110.0,
|
|
"step": 5435
|
|
},
|
|
{
|
|
"epoch": 17.779050736497545,
|
|
"grad_norm": 13.101118093674382,
|
|
"learning_rate": 1.1679541056577482e-09,
|
|
"loss": 1.41,
|
|
"mean_token_accuracy": 0.7121834099292755,
|
|
"num_tokens": 342493037.0,
|
|
"step": 5440
|
|
},
|
|
{
|
|
"epoch": 17.795417348608837,
|
|
"grad_norm": 12.471346133514018,
|
|
"learning_rate": 1.1631309447567306e-09,
|
|
"loss": 1.3978,
|
|
"mean_token_accuracy": 0.7117744445800781,
|
|
"num_tokens": 342808622.0,
|
|
"step": 5445
|
|
},
|
|
{
|
|
"epoch": 17.81178396072013,
|
|
"grad_norm": 12.930267752585188,
|
|
"learning_rate": 1.1583147424344746e-09,
|
|
"loss": 1.3777,
|
|
"mean_token_accuracy": 0.7164147138595581,
|
|
"num_tokens": 343123030.0,
|
|
"step": 5450
|
|
},
|
|
{
|
|
"epoch": 17.828150572831422,
|
|
"grad_norm": 12.703543993441485,
|
|
"learning_rate": 1.153505523759944e-09,
|
|
"loss": 1.3916,
|
|
"mean_token_accuracy": 0.7136729001998902,
|
|
"num_tokens": 343439667.0,
|
|
"step": 5455
|
|
},
|
|
{
|
|
"epoch": 17.844517184942717,
|
|
"grad_norm": 13.14210044589097,
|
|
"learning_rate": 1.1487033137657538e-09,
|
|
"loss": 1.406,
|
|
"mean_token_accuracy": 0.7114916920661927,
|
|
"num_tokens": 343754489.0,
|
|
"step": 5460
|
|
},
|
|
{
|
|
"epoch": 17.86088379705401,
|
|
"grad_norm": 12.301357268333014,
|
|
"learning_rate": 1.1439081374480362e-09,
|
|
"loss": 1.3737,
|
|
"mean_token_accuracy": 0.7186058342456818,
|
|
"num_tokens": 344069924.0,
|
|
"step": 5465
|
|
},
|
|
{
|
|
"epoch": 17.877250409165303,
|
|
"grad_norm": 13.156843241094293,
|
|
"learning_rate": 1.1391200197663132e-09,
|
|
"loss": 1.3965,
|
|
"mean_token_accuracy": 0.7138831853866577,
|
|
"num_tokens": 344384088.0,
|
|
"step": 5470
|
|
},
|
|
{
|
|
"epoch": 17.893617021276597,
|
|
"grad_norm": 12.692490380092018,
|
|
"learning_rate": 1.134338985643366e-09,
|
|
"loss": 1.3866,
|
|
"mean_token_accuracy": 0.7155437529087066,
|
|
"num_tokens": 344701422.0,
|
|
"step": 5475
|
|
},
|
|
{
|
|
"epoch": 17.90998363338789,
|
|
"grad_norm": 12.715473298247183,
|
|
"learning_rate": 1.1295650599651023e-09,
|
|
"loss": 1.4021,
|
|
"mean_token_accuracy": 0.7133882701396942,
|
|
"num_tokens": 345016651.0,
|
|
"step": 5480
|
|
},
|
|
{
|
|
"epoch": 17.926350245499183,
|
|
"grad_norm": 12.238271624667775,
|
|
"learning_rate": 1.1247982675804322e-09,
|
|
"loss": 1.4036,
|
|
"mean_token_accuracy": 0.7109548151493073,
|
|
"num_tokens": 345332748.0,
|
|
"step": 5485
|
|
},
|
|
{
|
|
"epoch": 17.942716857610474,
|
|
"grad_norm": 13.07904637799471,
|
|
"learning_rate": 1.1200386333011356e-09,
|
|
"loss": 1.3852,
|
|
"mean_token_accuracy": 0.7151263058185577,
|
|
"num_tokens": 345649158.0,
|
|
"step": 5490
|
|
},
|
|
{
|
|
"epoch": 17.95908346972177,
|
|
"grad_norm": 12.589623435885747,
|
|
"learning_rate": 1.115286181901733e-09,
|
|
"loss": 1.3763,
|
|
"mean_token_accuracy": 0.7184123635292053,
|
|
"num_tokens": 345963757.0,
|
|
"step": 5495
|
|
},
|
|
{
|
|
"epoch": 17.97545008183306,
|
|
"grad_norm": 12.90918631521044,
|
|
"learning_rate": 1.1105409381193571e-09,
|
|
"loss": 1.3848,
|
|
"mean_token_accuracy": 0.7155660688877106,
|
|
"num_tokens": 346280806.0,
|
|
"step": 5500
|
|
},
|
|
{
|
|
"epoch": 17.991816693944354,
|
|
"grad_norm": 12.314582020763753,
|
|
"learning_rate": 1.105802926653624e-09,
|
|
"loss": 1.4122,
|
|
"mean_token_accuracy": 0.7101211845874786,
|
|
"num_tokens": 346594573.0,
|
|
"step": 5505
|
|
},
|
|
{
|
|
"epoch": 18.006546644844516,
|
|
"grad_norm": 13.235208784657654,
|
|
"learning_rate": 1.101072172166505e-09,
|
|
"loss": 1.407,
|
|
"mean_token_accuracy": 0.7137935625182258,
|
|
"num_tokens": 346854659.0,
|
|
"step": 5510
|
|
},
|
|
{
|
|
"epoch": 18.02291325695581,
|
|
"grad_norm": 12.763597579363163,
|
|
"learning_rate": 1.0963486992821977e-09,
|
|
"loss": 1.3887,
|
|
"mean_token_accuracy": 0.7150306701660156,
|
|
"num_tokens": 347169565.0,
|
|
"step": 5515
|
|
},
|
|
{
|
|
"epoch": 18.0392798690671,
|
|
"grad_norm": 12.655615875345337,
|
|
"learning_rate": 1.091632532586998e-09,
|
|
"loss": 1.3813,
|
|
"mean_token_accuracy": 0.715842741727829,
|
|
"num_tokens": 347484856.0,
|
|
"step": 5520
|
|
},
|
|
{
|
|
"epoch": 18.055646481178396,
|
|
"grad_norm": 12.64337473403488,
|
|
"learning_rate": 1.0869236966291715e-09,
|
|
"loss": 1.3727,
|
|
"mean_token_accuracy": 0.7158585131168366,
|
|
"num_tokens": 347801962.0,
|
|
"step": 5525
|
|
},
|
|
{
|
|
"epoch": 18.07201309328969,
|
|
"grad_norm": 12.462334312203728,
|
|
"learning_rate": 1.0822222159188275e-09,
|
|
"loss": 1.3852,
|
|
"mean_token_accuracy": 0.7147506773471832,
|
|
"num_tokens": 348117074.0,
|
|
"step": 5530
|
|
},
|
|
{
|
|
"epoch": 18.088379705400982,
|
|
"grad_norm": 12.565042037190233,
|
|
"learning_rate": 1.0775281149277897e-09,
|
|
"loss": 1.3925,
|
|
"mean_token_accuracy": 0.7138961970806121,
|
|
"num_tokens": 348433613.0,
|
|
"step": 5535
|
|
},
|
|
{
|
|
"epoch": 18.104746317512276,
|
|
"grad_norm": 12.646266276743935,
|
|
"learning_rate": 1.072841418089469e-09,
|
|
"loss": 1.3787,
|
|
"mean_token_accuracy": 0.7157909095287323,
|
|
"num_tokens": 348749005.0,
|
|
"step": 5540
|
|
},
|
|
{
|
|
"epoch": 18.121112929623568,
|
|
"grad_norm": 12.38061218796714,
|
|
"learning_rate": 1.068162149798737e-09,
|
|
"loss": 1.3877,
|
|
"mean_token_accuracy": 0.7154739260673523,
|
|
"num_tokens": 349064208.0,
|
|
"step": 5545
|
|
},
|
|
{
|
|
"epoch": 18.137479541734862,
|
|
"grad_norm": 11.968930657167016,
|
|
"learning_rate": 1.0634903344117995e-09,
|
|
"loss": 1.3649,
|
|
"mean_token_accuracy": 0.718022209405899,
|
|
"num_tokens": 349379547.0,
|
|
"step": 5550
|
|
},
|
|
{
|
|
"epoch": 18.153846153846153,
|
|
"grad_norm": 12.780826346237394,
|
|
"learning_rate": 1.0588259962460676e-09,
|
|
"loss": 1.369,
|
|
"mean_token_accuracy": 0.718323028087616,
|
|
"num_tokens": 349692928.0,
|
|
"step": 5555
|
|
},
|
|
{
|
|
"epoch": 18.170212765957448,
|
|
"grad_norm": 12.422856867889942,
|
|
"learning_rate": 1.0541691595800337e-09,
|
|
"loss": 1.389,
|
|
"mean_token_accuracy": 0.713786643743515,
|
|
"num_tokens": 350009624.0,
|
|
"step": 5560
|
|
},
|
|
{
|
|
"epoch": 18.18657937806874,
|
|
"grad_norm": 12.413005292724472,
|
|
"learning_rate": 1.049519848653143e-09,
|
|
"loss": 1.3808,
|
|
"mean_token_accuracy": 0.7161435902118682,
|
|
"num_tokens": 350326433.0,
|
|
"step": 5565
|
|
},
|
|
{
|
|
"epoch": 18.202945990180034,
|
|
"grad_norm": 13.381397742985778,
|
|
"learning_rate": 1.0448780876656688e-09,
|
|
"loss": 1.389,
|
|
"mean_token_accuracy": 0.714937961101532,
|
|
"num_tokens": 350641346.0,
|
|
"step": 5570
|
|
},
|
|
{
|
|
"epoch": 18.219312602291325,
|
|
"grad_norm": 12.49129531641033,
|
|
"learning_rate": 1.0402439007785862e-09,
|
|
"loss": 1.4007,
|
|
"mean_token_accuracy": 0.7126554310321808,
|
|
"num_tokens": 350957360.0,
|
|
"step": 5575
|
|
},
|
|
{
|
|
"epoch": 18.23567921440262,
|
|
"grad_norm": 12.308619890450021,
|
|
"learning_rate": 1.0356173121134446e-09,
|
|
"loss": 1.3815,
|
|
"mean_token_accuracy": 0.7172755897045135,
|
|
"num_tokens": 351273459.0,
|
|
"step": 5580
|
|
},
|
|
{
|
|
"epoch": 18.25204582651391,
|
|
"grad_norm": 13.242074621025358,
|
|
"learning_rate": 1.030998345752246e-09,
|
|
"loss": 1.3967,
|
|
"mean_token_accuracy": 0.7135592460632324,
|
|
"num_tokens": 351588136.0,
|
|
"step": 5585
|
|
},
|
|
{
|
|
"epoch": 18.268412438625205,
|
|
"grad_norm": 11.861574062896562,
|
|
"learning_rate": 1.0263870257373162e-09,
|
|
"loss": 1.3948,
|
|
"mean_token_accuracy": 0.712981390953064,
|
|
"num_tokens": 351903224.0,
|
|
"step": 5590
|
|
},
|
|
{
|
|
"epoch": 18.284779050736496,
|
|
"grad_norm": 12.395041538470807,
|
|
"learning_rate": 1.0217833760711792e-09,
|
|
"loss": 1.3683,
|
|
"mean_token_accuracy": 0.7180960893630981,
|
|
"num_tokens": 352218383.0,
|
|
"step": 5595
|
|
},
|
|
{
|
|
"epoch": 18.30114566284779,
|
|
"grad_norm": 12.894649709921252,
|
|
"learning_rate": 1.0171874207164362e-09,
|
|
"loss": 1.3978,
|
|
"mean_token_accuracy": 0.7124384999275207,
|
|
"num_tokens": 352533109.0,
|
|
"step": 5600
|
|
},
|
|
{
|
|
"epoch": 18.317512274959082,
|
|
"grad_norm": 13.096345184514144,
|
|
"learning_rate": 1.0125991835956376e-09,
|
|
"loss": 1.3872,
|
|
"mean_token_accuracy": 0.7149949491024017,
|
|
"num_tokens": 352848562.0,
|
|
"step": 5605
|
|
},
|
|
{
|
|
"epoch": 18.333878887070377,
|
|
"grad_norm": 11.9497648848681,
|
|
"learning_rate": 1.0080186885911588e-09,
|
|
"loss": 1.3819,
|
|
"mean_token_accuracy": 0.7147423505783081,
|
|
"num_tokens": 353164572.0,
|
|
"step": 5610
|
|
},
|
|
{
|
|
"epoch": 18.350245499181668,
|
|
"grad_norm": 12.339471746049261,
|
|
"learning_rate": 1.0034459595450776e-09,
|
|
"loss": 1.3802,
|
|
"mean_token_accuracy": 0.715108597278595,
|
|
"num_tokens": 353480615.0,
|
|
"step": 5615
|
|
},
|
|
{
|
|
"epoch": 18.366612111292962,
|
|
"grad_norm": 12.481496979460216,
|
|
"learning_rate": 9.988810202590481e-10,
|
|
"loss": 1.3905,
|
|
"mean_token_accuracy": 0.7153669118881225,
|
|
"num_tokens": 353794865.0,
|
|
"step": 5620
|
|
},
|
|
{
|
|
"epoch": 18.382978723404257,
|
|
"grad_norm": 12.518437673992242,
|
|
"learning_rate": 9.943238944941782e-10,
|
|
"loss": 1.3622,
|
|
"mean_token_accuracy": 0.7209295690059662,
|
|
"num_tokens": 354109372.0,
|
|
"step": 5625
|
|
},
|
|
{
|
|
"epoch": 18.399345335515548,
|
|
"grad_norm": 13.186352873301463,
|
|
"learning_rate": 9.897746059709054e-10,
|
|
"loss": 1.3925,
|
|
"mean_token_accuracy": 0.7112901329994201,
|
|
"num_tokens": 354424071.0,
|
|
"step": 5630
|
|
},
|
|
{
|
|
"epoch": 18.415711947626843,
|
|
"grad_norm": 12.906442859579837,
|
|
"learning_rate": 9.852331783688722e-10,
|
|
"loss": 1.4138,
|
|
"mean_token_accuracy": 0.7088511765003205,
|
|
"num_tokens": 354738984.0,
|
|
"step": 5635
|
|
},
|
|
{
|
|
"epoch": 18.432078559738134,
|
|
"grad_norm": 12.987822195411988,
|
|
"learning_rate": 9.806996353268057e-10,
|
|
"loss": 1.3849,
|
|
"mean_token_accuracy": 0.7144550621509552,
|
|
"num_tokens": 355054114.0,
|
|
"step": 5640
|
|
},
|
|
{
|
|
"epoch": 18.44844517184943,
|
|
"grad_norm": 12.674483928796581,
|
|
"learning_rate": 9.761740004423927e-10,
|
|
"loss": 1.3794,
|
|
"mean_token_accuracy": 0.7165055871009827,
|
|
"num_tokens": 355370777.0,
|
|
"step": 5645
|
|
},
|
|
{
|
|
"epoch": 18.46481178396072,
|
|
"grad_norm": 12.215728589409949,
|
|
"learning_rate": 9.716562972721544e-10,
|
|
"loss": 1.3814,
|
|
"mean_token_accuracy": 0.7154396593570709,
|
|
"num_tokens": 355687582.0,
|
|
"step": 5650
|
|
},
|
|
{
|
|
"epoch": 18.481178396072014,
|
|
"grad_norm": 12.38282753815485,
|
|
"learning_rate": 9.671465493313292e-10,
|
|
"loss": 1.3649,
|
|
"mean_token_accuracy": 0.719496488571167,
|
|
"num_tokens": 356001954.0,
|
|
"step": 5655
|
|
},
|
|
{
|
|
"epoch": 18.497545008183305,
|
|
"grad_norm": 13.079847773691125,
|
|
"learning_rate": 9.626447800937467e-10,
|
|
"loss": 1.3823,
|
|
"mean_token_accuracy": 0.7160343945026397,
|
|
"num_tokens": 356317376.0,
|
|
"step": 5660
|
|
},
|
|
{
|
|
"epoch": 18.5139116202946,
|
|
"grad_norm": 12.663364759533373,
|
|
"learning_rate": 9.581510129917063e-10,
|
|
"loss": 1.3899,
|
|
"mean_token_accuracy": 0.7153219997882843,
|
|
"num_tokens": 356631136.0,
|
|
"step": 5665
|
|
},
|
|
{
|
|
"epoch": 18.53027823240589,
|
|
"grad_norm": 12.740117263835618,
|
|
"learning_rate": 9.536652714158545e-10,
|
|
"loss": 1.3838,
|
|
"mean_token_accuracy": 0.7163176774978638,
|
|
"num_tokens": 356946372.0,
|
|
"step": 5670
|
|
},
|
|
{
|
|
"epoch": 18.546644844517186,
|
|
"grad_norm": 12.482008982127597,
|
|
"learning_rate": 9.49187578715065e-10,
|
|
"loss": 1.3932,
|
|
"mean_token_accuracy": 0.7150310695171356,
|
|
"num_tokens": 357263208.0,
|
|
"step": 5675
|
|
},
|
|
{
|
|
"epoch": 18.563011456628477,
|
|
"grad_norm": 12.335105657867352,
|
|
"learning_rate": 9.447179581963155e-10,
|
|
"loss": 1.3794,
|
|
"mean_token_accuracy": 0.7140471935272217,
|
|
"num_tokens": 357580121.0,
|
|
"step": 5680
|
|
},
|
|
{
|
|
"epoch": 18.57937806873977,
|
|
"grad_norm": 13.168790147885918,
|
|
"learning_rate": 9.402564331245673e-10,
|
|
"loss": 1.375,
|
|
"mean_token_accuracy": 0.7164913952350617,
|
|
"num_tokens": 357898026.0,
|
|
"step": 5685
|
|
},
|
|
{
|
|
"epoch": 18.595744680851062,
|
|
"grad_norm": 12.1581165136031,
|
|
"learning_rate": 9.358030267226429e-10,
|
|
"loss": 1.3558,
|
|
"mean_token_accuracy": 0.7206232190132141,
|
|
"num_tokens": 358213880.0,
|
|
"step": 5690
|
|
},
|
|
{
|
|
"epoch": 18.612111292962357,
|
|
"grad_norm": 12.59819387730946,
|
|
"learning_rate": 9.313577621711069e-10,
|
|
"loss": 1.3658,
|
|
"mean_token_accuracy": 0.7186311244964599,
|
|
"num_tokens": 358530242.0,
|
|
"step": 5695
|
|
},
|
|
{
|
|
"epoch": 18.628477905073648,
|
|
"grad_norm": 13.046071159861091,
|
|
"learning_rate": 9.269206626081444e-10,
|
|
"loss": 1.3939,
|
|
"mean_token_accuracy": 0.7141192197799683,
|
|
"num_tokens": 358844767.0,
|
|
"step": 5700
|
|
},
|
|
{
|
|
"epoch": 18.644844517184943,
|
|
"grad_norm": 12.737927621667515,
|
|
"learning_rate": 9.224917511294406e-10,
|
|
"loss": 1.3672,
|
|
"mean_token_accuracy": 0.7181422650814057,
|
|
"num_tokens": 359159924.0,
|
|
"step": 5705
|
|
},
|
|
{
|
|
"epoch": 18.661211129296234,
|
|
"grad_norm": 12.386579552377142,
|
|
"learning_rate": 9.180710507880605e-10,
|
|
"loss": 1.3655,
|
|
"mean_token_accuracy": 0.7202182531356811,
|
|
"num_tokens": 359474736.0,
|
|
"step": 5710
|
|
},
|
|
{
|
|
"epoch": 18.67757774140753,
|
|
"grad_norm": 12.365715715935234,
|
|
"learning_rate": 9.136585845943287e-10,
|
|
"loss": 1.3644,
|
|
"mean_token_accuracy": 0.7191633701324462,
|
|
"num_tokens": 359788829.0,
|
|
"step": 5715
|
|
},
|
|
{
|
|
"epoch": 18.693944353518823,
|
|
"grad_norm": 12.369603146497102,
|
|
"learning_rate": 9.092543755157112e-10,
|
|
"loss": 1.3658,
|
|
"mean_token_accuracy": 0.7192545533180237,
|
|
"num_tokens": 360105016.0,
|
|
"step": 5720
|
|
},
|
|
{
|
|
"epoch": 18.710310965630114,
|
|
"grad_norm": 12.996855685166068,
|
|
"learning_rate": 9.048584464766937e-10,
|
|
"loss": 1.3864,
|
|
"mean_token_accuracy": 0.7158155918121338,
|
|
"num_tokens": 360420518.0,
|
|
"step": 5725
|
|
},
|
|
{
|
|
"epoch": 18.72667757774141,
|
|
"grad_norm": 12.953296229285426,
|
|
"learning_rate": 9.004708203586629e-10,
|
|
"loss": 1.3861,
|
|
"mean_token_accuracy": 0.7145832180976868,
|
|
"num_tokens": 360737454.0,
|
|
"step": 5730
|
|
},
|
|
{
|
|
"epoch": 18.7430441898527,
|
|
"grad_norm": 12.402893762350994,
|
|
"learning_rate": 8.960915199997885e-10,
|
|
"loss": 1.3823,
|
|
"mean_token_accuracy": 0.7143435657024384,
|
|
"num_tokens": 361052926.0,
|
|
"step": 5735
|
|
},
|
|
{
|
|
"epoch": 18.759410801963995,
|
|
"grad_norm": 13.008197404879443,
|
|
"learning_rate": 8.917205681949034e-10,
|
|
"loss": 1.3759,
|
|
"mean_token_accuracy": 0.7186384499073029,
|
|
"num_tokens": 361368937.0,
|
|
"step": 5740
|
|
},
|
|
{
|
|
"epoch": 18.775777414075286,
|
|
"grad_norm": 13.423075286610377,
|
|
"learning_rate": 8.873579876953844e-10,
|
|
"loss": 1.3835,
|
|
"mean_token_accuracy": 0.7161759972572327,
|
|
"num_tokens": 361684524.0,
|
|
"step": 5745
|
|
},
|
|
{
|
|
"epoch": 18.79214402618658,
|
|
"grad_norm": 12.811505010435685,
|
|
"learning_rate": 8.830038012090357e-10,
|
|
"loss": 1.3862,
|
|
"mean_token_accuracy": 0.7145053625106812,
|
|
"num_tokens": 362001450.0,
|
|
"step": 5750
|
|
},
|
|
{
|
|
"epoch": 18.80851063829787,
|
|
"grad_norm": 13.027449136406155,
|
|
"learning_rate": 8.78658031399969e-10,
|
|
"loss": 1.3694,
|
|
"mean_token_accuracy": 0.716279661655426,
|
|
"num_tokens": 362317884.0,
|
|
"step": 5755
|
|
},
|
|
{
|
|
"epoch": 18.824877250409166,
|
|
"grad_norm": 12.360573369881436,
|
|
"learning_rate": 8.743207008884865e-10,
|
|
"loss": 1.3789,
|
|
"mean_token_accuracy": 0.7169335722923279,
|
|
"num_tokens": 362632884.0,
|
|
"step": 5760
|
|
},
|
|
{
|
|
"epoch": 18.841243862520457,
|
|
"grad_norm": 12.645584794759708,
|
|
"learning_rate": 8.699918322509609e-10,
|
|
"loss": 1.3622,
|
|
"mean_token_accuracy": 0.7194086253643036,
|
|
"num_tokens": 362949683.0,
|
|
"step": 5765
|
|
},
|
|
{
|
|
"epoch": 18.857610474631752,
|
|
"grad_norm": 12.975756722712436,
|
|
"learning_rate": 8.65671448019722e-10,
|
|
"loss": 1.3828,
|
|
"mean_token_accuracy": 0.7150522589683532,
|
|
"num_tokens": 363265994.0,
|
|
"step": 5770
|
|
},
|
|
{
|
|
"epoch": 18.873977086743043,
|
|
"grad_norm": 12.723655904397996,
|
|
"learning_rate": 8.613595706829366e-10,
|
|
"loss": 1.3576,
|
|
"mean_token_accuracy": 0.7214667618274688,
|
|
"num_tokens": 363580698.0,
|
|
"step": 5775
|
|
},
|
|
{
|
|
"epoch": 18.890343698854338,
|
|
"grad_norm": 12.685832080615537,
|
|
"learning_rate": 8.570562226844914e-10,
|
|
"loss": 1.3775,
|
|
"mean_token_accuracy": 0.7159756720066071,
|
|
"num_tokens": 363897121.0,
|
|
"step": 5780
|
|
},
|
|
{
|
|
"epoch": 18.90671031096563,
|
|
"grad_norm": 12.602544332252107,
|
|
"learning_rate": 8.527614264238773e-10,
|
|
"loss": 1.378,
|
|
"mean_token_accuracy": 0.7161263406276703,
|
|
"num_tokens": 364213776.0,
|
|
"step": 5785
|
|
},
|
|
{
|
|
"epoch": 18.923076923076923,
|
|
"grad_norm": 11.854014351161501,
|
|
"learning_rate": 8.48475204256072e-10,
|
|
"loss": 1.3496,
|
|
"mean_token_accuracy": 0.722278642654419,
|
|
"num_tokens": 364531018.0,
|
|
"step": 5790
|
|
},
|
|
{
|
|
"epoch": 18.939443535188214,
|
|
"grad_norm": 13.056726278143165,
|
|
"learning_rate": 8.441975784914241e-10,
|
|
"loss": 1.3706,
|
|
"mean_token_accuracy": 0.7184775650501252,
|
|
"num_tokens": 364846166.0,
|
|
"step": 5795
|
|
},
|
|
{
|
|
"epoch": 18.95581014729951,
|
|
"grad_norm": 12.693598074284184,
|
|
"learning_rate": 8.399285713955366e-10,
|
|
"loss": 1.3778,
|
|
"mean_token_accuracy": 0.7162315368652343,
|
|
"num_tokens": 365161528.0,
|
|
"step": 5800
|
|
},
|
|
{
|
|
"epoch": 18.9721767594108,
|
|
"grad_norm": 12.712806515492721,
|
|
"learning_rate": 8.356682051891512e-10,
|
|
"loss": 1.3732,
|
|
"mean_token_accuracy": 0.717895120382309,
|
|
"num_tokens": 365477749.0,
|
|
"step": 5805
|
|
},
|
|
{
|
|
"epoch": 18.988543371522095,
|
|
"grad_norm": 12.61013695062063,
|
|
"learning_rate": 8.31416502048033e-10,
|
|
"loss": 1.3729,
|
|
"mean_token_accuracy": 0.7183041274547577,
|
|
"num_tokens": 365794318.0,
|
|
"step": 5810
|
|
},
|
|
{
|
|
"epoch": 19.00327332242226,
|
|
"grad_norm": 12.846197841429813,
|
|
"learning_rate": 8.271734841028553e-10,
|
|
"loss": 1.3871,
|
|
"mean_token_accuracy": 0.7166521615452237,
|
|
"num_tokens": 366054135.0,
|
|
"step": 5815
|
|
},
|
|
{
|
|
"epoch": 19.01963993453355,
|
|
"grad_norm": 12.781031512055613,
|
|
"learning_rate": 8.229391734390809e-10,
|
|
"loss": 1.378,
|
|
"mean_token_accuracy": 0.7174058675765991,
|
|
"num_tokens": 366369329.0,
|
|
"step": 5820
|
|
},
|
|
{
|
|
"epoch": 19.036006546644845,
|
|
"grad_norm": 12.817751563572923,
|
|
"learning_rate": 8.187135920968536e-10,
|
|
"loss": 1.357,
|
|
"mean_token_accuracy": 0.7178467869758606,
|
|
"num_tokens": 366684911.0,
|
|
"step": 5825
|
|
},
|
|
{
|
|
"epoch": 19.052373158756136,
|
|
"grad_norm": 12.361476407235411,
|
|
"learning_rate": 8.14496762070878e-10,
|
|
"loss": 1.3882,
|
|
"mean_token_accuracy": 0.7144702851772309,
|
|
"num_tokens": 367001229.0,
|
|
"step": 5830
|
|
},
|
|
{
|
|
"epoch": 19.06873977086743,
|
|
"grad_norm": 13.611363516655436,
|
|
"learning_rate": 8.102887053103075e-10,
|
|
"loss": 1.388,
|
|
"mean_token_accuracy": 0.7142431914806366,
|
|
"num_tokens": 367316510.0,
|
|
"step": 5835
|
|
},
|
|
{
|
|
"epoch": 19.085106382978722,
|
|
"grad_norm": 12.616111993185598,
|
|
"learning_rate": 8.060894437186295e-10,
|
|
"loss": 1.3735,
|
|
"mean_token_accuracy": 0.7148357987403869,
|
|
"num_tokens": 367631892.0,
|
|
"step": 5840
|
|
},
|
|
{
|
|
"epoch": 19.101472995090017,
|
|
"grad_norm": 12.774290787727974,
|
|
"learning_rate": 8.018989991535513e-10,
|
|
"loss": 1.3843,
|
|
"mean_token_accuracy": 0.7155320584774018,
|
|
"num_tokens": 367949305.0,
|
|
"step": 5845
|
|
},
|
|
{
|
|
"epoch": 19.117839607201308,
|
|
"grad_norm": 12.874638084090844,
|
|
"learning_rate": 7.977173934268864e-10,
|
|
"loss": 1.3674,
|
|
"mean_token_accuracy": 0.7179316282272339,
|
|
"num_tokens": 368264037.0,
|
|
"step": 5850
|
|
},
|
|
{
|
|
"epoch": 19.134206219312603,
|
|
"grad_norm": 12.84731693292664,
|
|
"learning_rate": 7.935446483044412e-10,
|
|
"loss": 1.3841,
|
|
"mean_token_accuracy": 0.7166074812412262,
|
|
"num_tokens": 368579670.0,
|
|
"step": 5855
|
|
},
|
|
{
|
|
"epoch": 19.150572831423894,
|
|
"grad_norm": 12.958459519702389,
|
|
"learning_rate": 7.89380785505901e-10,
|
|
"loss": 1.3699,
|
|
"mean_token_accuracy": 0.7167773723602295,
|
|
"num_tokens": 368896056.0,
|
|
"step": 5860
|
|
},
|
|
{
|
|
"epoch": 19.16693944353519,
|
|
"grad_norm": 12.650749076343129,
|
|
"learning_rate": 7.852258267047177e-10,
|
|
"loss": 1.3757,
|
|
"mean_token_accuracy": 0.7171500504016877,
|
|
"num_tokens": 369210471.0,
|
|
"step": 5865
|
|
},
|
|
{
|
|
"epoch": 19.183306055646483,
|
|
"grad_norm": 12.283216913121649,
|
|
"learning_rate": 7.810797935279973e-10,
|
|
"loss": 1.3608,
|
|
"mean_token_accuracy": 0.7200214743614197,
|
|
"num_tokens": 369527272.0,
|
|
"step": 5870
|
|
},
|
|
{
|
|
"epoch": 19.199672667757774,
|
|
"grad_norm": 12.842889599405346,
|
|
"learning_rate": 7.769427075563856e-10,
|
|
"loss": 1.3817,
|
|
"mean_token_accuracy": 0.7152867078781128,
|
|
"num_tokens": 369841125.0,
|
|
"step": 5875
|
|
},
|
|
{
|
|
"epoch": 19.21603927986907,
|
|
"grad_norm": 12.480638763825803,
|
|
"learning_rate": 7.728145903239584e-10,
|
|
"loss": 1.3648,
|
|
"mean_token_accuracy": 0.718204790353775,
|
|
"num_tokens": 370157321.0,
|
|
"step": 5880
|
|
},
|
|
{
|
|
"epoch": 19.23240589198036,
|
|
"grad_norm": 12.117212343748356,
|
|
"learning_rate": 7.686954633181065e-10,
|
|
"loss": 1.3505,
|
|
"mean_token_accuracy": 0.7220339953899384,
|
|
"num_tokens": 370472593.0,
|
|
"step": 5885
|
|
},
|
|
{
|
|
"epoch": 19.248772504091654,
|
|
"grad_norm": 13.132183687091533,
|
|
"learning_rate": 7.645853479794265e-10,
|
|
"loss": 1.3708,
|
|
"mean_token_accuracy": 0.7181020259857178,
|
|
"num_tokens": 370788104.0,
|
|
"step": 5890
|
|
},
|
|
{
|
|
"epoch": 19.265139116202946,
|
|
"grad_norm": 13.285147654938097,
|
|
"learning_rate": 7.604842657016078e-10,
|
|
"loss": 1.3554,
|
|
"mean_token_accuracy": 0.7207320153713226,
|
|
"num_tokens": 371105424.0,
|
|
"step": 5895
|
|
},
|
|
{
|
|
"epoch": 19.28150572831424,
|
|
"grad_norm": 12.941522431737862,
|
|
"learning_rate": 7.563922378313218e-10,
|
|
"loss": 1.3829,
|
|
"mean_token_accuracy": 0.7163071990013122,
|
|
"num_tokens": 371420806.0,
|
|
"step": 5900
|
|
},
|
|
{
|
|
"epoch": 19.29787234042553,
|
|
"grad_norm": 12.66823555573856,
|
|
"learning_rate": 7.523092856681099e-10,
|
|
"loss": 1.3719,
|
|
"mean_token_accuracy": 0.7173808157444,
|
|
"num_tokens": 371737127.0,
|
|
"step": 5905
|
|
},
|
|
{
|
|
"epoch": 19.314238952536826,
|
|
"grad_norm": 12.468782774971105,
|
|
"learning_rate": 7.482354304642735e-10,
|
|
"loss": 1.3572,
|
|
"mean_token_accuracy": 0.7218125700950623,
|
|
"num_tokens": 372053743.0,
|
|
"step": 5910
|
|
},
|
|
{
|
|
"epoch": 19.330605564648117,
|
|
"grad_norm": 12.673430481906253,
|
|
"learning_rate": 7.441706934247633e-10,
|
|
"loss": 1.3714,
|
|
"mean_token_accuracy": 0.7178383052349091,
|
|
"num_tokens": 372368942.0,
|
|
"step": 5915
|
|
},
|
|
{
|
|
"epoch": 19.34697217675941,
|
|
"grad_norm": 12.64582960304754,
|
|
"learning_rate": 7.401150957070687e-10,
|
|
"loss": 1.3615,
|
|
"mean_token_accuracy": 0.7213193953037262,
|
|
"num_tokens": 372684501.0,
|
|
"step": 5920
|
|
},
|
|
{
|
|
"epoch": 19.363338788870703,
|
|
"grad_norm": 12.881251697477449,
|
|
"learning_rate": 7.360686584211079e-10,
|
|
"loss": 1.3582,
|
|
"mean_token_accuracy": 0.7224127888679505,
|
|
"num_tokens": 373000557.0,
|
|
"step": 5925
|
|
},
|
|
{
|
|
"epoch": 19.379705400981997,
|
|
"grad_norm": 13.016655812957188,
|
|
"learning_rate": 7.320314026291183e-10,
|
|
"loss": 1.373,
|
|
"mean_token_accuracy": 0.7166104733943939,
|
|
"num_tokens": 373314587.0,
|
|
"step": 5930
|
|
},
|
|
{
|
|
"epoch": 19.39607201309329,
|
|
"grad_norm": 12.703007701115933,
|
|
"learning_rate": 7.28003349345544e-10,
|
|
"loss": 1.3467,
|
|
"mean_token_accuracy": 0.7213922083377838,
|
|
"num_tokens": 373629407.0,
|
|
"step": 5935
|
|
},
|
|
{
|
|
"epoch": 19.412438625204583,
|
|
"grad_norm": 13.11920416031955,
|
|
"learning_rate": 7.239845195369319e-10,
|
|
"loss": 1.3712,
|
|
"mean_token_accuracy": 0.7184795260429382,
|
|
"num_tokens": 373945797.0,
|
|
"step": 5940
|
|
},
|
|
{
|
|
"epoch": 19.428805237315874,
|
|
"grad_norm": 12.745812531059482,
|
|
"learning_rate": 7.199749341218176e-10,
|
|
"loss": 1.3669,
|
|
"mean_token_accuracy": 0.7184098780155181,
|
|
"num_tokens": 374261376.0,
|
|
"step": 5945
|
|
},
|
|
{
|
|
"epoch": 19.44517184942717,
|
|
"grad_norm": 12.825356488169058,
|
|
"learning_rate": 7.159746139706194e-10,
|
|
"loss": 1.3745,
|
|
"mean_token_accuracy": 0.7177829325199128,
|
|
"num_tokens": 374577040.0,
|
|
"step": 5950
|
|
},
|
|
{
|
|
"epoch": 19.46153846153846,
|
|
"grad_norm": 13.67165270930649,
|
|
"learning_rate": 7.119835799055285e-10,
|
|
"loss": 1.382,
|
|
"mean_token_accuracy": 0.7166715204715729,
|
|
"num_tokens": 374892491.0,
|
|
"step": 5955
|
|
},
|
|
{
|
|
"epoch": 19.477905073649755,
|
|
"grad_norm": 12.360439867627685,
|
|
"learning_rate": 7.080018527004001e-10,
|
|
"loss": 1.3682,
|
|
"mean_token_accuracy": 0.7182742238044739,
|
|
"num_tokens": 375207737.0,
|
|
"step": 5960
|
|
},
|
|
{
|
|
"epoch": 19.49427168576105,
|
|
"grad_norm": 13.031003624035211,
|
|
"learning_rate": 7.040294530806468e-10,
|
|
"loss": 1.3874,
|
|
"mean_token_accuracy": 0.7151427447795868,
|
|
"num_tokens": 375523822.0,
|
|
"step": 5965
|
|
},
|
|
{
|
|
"epoch": 19.51063829787234,
|
|
"grad_norm": 13.00731945578167,
|
|
"learning_rate": 7.000664017231297e-10,
|
|
"loss": 1.3652,
|
|
"mean_token_accuracy": 0.7197602808475494,
|
|
"num_tokens": 375839994.0,
|
|
"step": 5970
|
|
},
|
|
{
|
|
"epoch": 19.527004909983635,
|
|
"grad_norm": 12.556107146083836,
|
|
"learning_rate": 6.961127192560509e-10,
|
|
"loss": 1.3689,
|
|
"mean_token_accuracy": 0.7184596836566925,
|
|
"num_tokens": 376155305.0,
|
|
"step": 5975
|
|
},
|
|
{
|
|
"epoch": 19.543371522094926,
|
|
"grad_norm": 12.4673726717845,
|
|
"learning_rate": 6.92168426258846e-10,
|
|
"loss": 1.3711,
|
|
"mean_token_accuracy": 0.7177050232887268,
|
|
"num_tokens": 376470754.0,
|
|
"step": 5980
|
|
},
|
|
{
|
|
"epoch": 19.55973813420622,
|
|
"grad_norm": 13.13075618583375,
|
|
"learning_rate": 6.882335432620779e-10,
|
|
"loss": 1.3629,
|
|
"mean_token_accuracy": 0.7193731069564819,
|
|
"num_tokens": 376786620.0,
|
|
"step": 5985
|
|
},
|
|
{
|
|
"epoch": 19.57610474631751,
|
|
"grad_norm": 12.170960660813558,
|
|
"learning_rate": 6.843080907473276e-10,
|
|
"loss": 1.3608,
|
|
"mean_token_accuracy": 0.7193795144557953,
|
|
"num_tokens": 377102222.0,
|
|
"step": 5990
|
|
},
|
|
{
|
|
"epoch": 19.592471358428806,
|
|
"grad_norm": 12.309205288205849,
|
|
"learning_rate": 6.803920891470905e-10,
|
|
"loss": 1.354,
|
|
"mean_token_accuracy": 0.7204973518848419,
|
|
"num_tokens": 377417349.0,
|
|
"step": 5995
|
|
},
|
|
{
|
|
"epoch": 19.608837970540097,
|
|
"grad_norm": 13.031618542541247,
|
|
"learning_rate": 6.764855588446689e-10,
|
|
"loss": 1.3725,
|
|
"mean_token_accuracy": 0.7182239472866059,
|
|
"num_tokens": 377732207.0,
|
|
"step": 6000
|
|
},
|
|
{
|
|
"epoch": 19.625204582651392,
|
|
"grad_norm": 12.540057625658143,
|
|
"learning_rate": 6.725885201740653e-10,
|
|
"loss": 1.36,
|
|
"mean_token_accuracy": 0.7203054070472718,
|
|
"num_tokens": 378046720.0,
|
|
"step": 6005
|
|
},
|
|
{
|
|
"epoch": 19.641571194762683,
|
|
"grad_norm": 12.352507333255614,
|
|
"learning_rate": 6.687009934198771e-10,
|
|
"loss": 1.3408,
|
|
"mean_token_accuracy": 0.7238680064678192,
|
|
"num_tokens": 378361684.0,
|
|
"step": 6010
|
|
},
|
|
{
|
|
"epoch": 19.657937806873978,
|
|
"grad_norm": 12.420982668782106,
|
|
"learning_rate": 6.648229988171906e-10,
|
|
"loss": 1.3639,
|
|
"mean_token_accuracy": 0.7186601638793946,
|
|
"num_tokens": 378677558.0,
|
|
"step": 6015
|
|
},
|
|
{
|
|
"epoch": 19.67430441898527,
|
|
"grad_norm": 12.720448227814035,
|
|
"learning_rate": 6.609545565514766e-10,
|
|
"loss": 1.3821,
|
|
"mean_token_accuracy": 0.7174455404281617,
|
|
"num_tokens": 378992818.0,
|
|
"step": 6020
|
|
},
|
|
{
|
|
"epoch": 19.690671031096564,
|
|
"grad_norm": 12.501448331802214,
|
|
"learning_rate": 6.570956867584843e-10,
|
|
"loss": 1.3572,
|
|
"mean_token_accuracy": 0.7190974712371826,
|
|
"num_tokens": 379309437.0,
|
|
"step": 6025
|
|
},
|
|
{
|
|
"epoch": 19.707037643207855,
|
|
"grad_norm": 12.147760849737383,
|
|
"learning_rate": 6.532464095241372e-10,
|
|
"loss": 1.3423,
|
|
"mean_token_accuracy": 0.7238264679908752,
|
|
"num_tokens": 379626036.0,
|
|
"step": 6030
|
|
},
|
|
{
|
|
"epoch": 19.72340425531915,
|
|
"grad_norm": 12.720576028448043,
|
|
"learning_rate": 6.494067448844279e-10,
|
|
"loss": 1.3574,
|
|
"mean_token_accuracy": 0.7195711672306061,
|
|
"num_tokens": 379940886.0,
|
|
"step": 6035
|
|
},
|
|
{
|
|
"epoch": 19.73977086743044,
|
|
"grad_norm": 13.135370961706531,
|
|
"learning_rate": 6.455767128253149e-10,
|
|
"loss": 1.3735,
|
|
"mean_token_accuracy": 0.7186027526855469,
|
|
"num_tokens": 380257522.0,
|
|
"step": 6040
|
|
},
|
|
{
|
|
"epoch": 19.756137479541735,
|
|
"grad_norm": 12.672254675360048,
|
|
"learning_rate": 6.417563332826165e-10,
|
|
"loss": 1.3791,
|
|
"mean_token_accuracy": 0.7168498575687409,
|
|
"num_tokens": 380572688.0,
|
|
"step": 6045
|
|
},
|
|
{
|
|
"epoch": 19.772504091653026,
|
|
"grad_norm": 13.156088402541622,
|
|
"learning_rate": 6.3794562614191e-10,
|
|
"loss": 1.3739,
|
|
"mean_token_accuracy": 0.7170314073562623,
|
|
"num_tokens": 380889136.0,
|
|
"step": 6050
|
|
},
|
|
{
|
|
"epoch": 19.78887070376432,
|
|
"grad_norm": 13.239645957722919,
|
|
"learning_rate": 6.341446112384259e-10,
|
|
"loss": 1.3756,
|
|
"mean_token_accuracy": 0.7166466414928436,
|
|
"num_tokens": 381203100.0,
|
|
"step": 6055
|
|
},
|
|
{
|
|
"epoch": 19.805237315875615,
|
|
"grad_norm": 12.621465653927025,
|
|
"learning_rate": 6.303533083569448e-10,
|
|
"loss": 1.3809,
|
|
"mean_token_accuracy": 0.7170312762260437,
|
|
"num_tokens": 381518716.0,
|
|
"step": 6060
|
|
},
|
|
{
|
|
"epoch": 19.821603927986907,
|
|
"grad_norm": 12.684645310238352,
|
|
"learning_rate": 6.265717372316957e-10,
|
|
"loss": 1.3801,
|
|
"mean_token_accuracy": 0.7160680174827576,
|
|
"num_tokens": 381834327.0,
|
|
"step": 6065
|
|
},
|
|
{
|
|
"epoch": 19.8379705400982,
|
|
"grad_norm": 13.310217231405765,
|
|
"learning_rate": 6.227999175462521e-10,
|
|
"loss": 1.3618,
|
|
"mean_token_accuracy": 0.7182558119297028,
|
|
"num_tokens": 382151144.0,
|
|
"step": 6070
|
|
},
|
|
{
|
|
"epoch": 19.854337152209492,
|
|
"grad_norm": 12.592065764147137,
|
|
"learning_rate": 6.1903786893343e-10,
|
|
"loss": 1.3767,
|
|
"mean_token_accuracy": 0.7171549320220947,
|
|
"num_tokens": 382467394.0,
|
|
"step": 6075
|
|
},
|
|
{
|
|
"epoch": 19.870703764320787,
|
|
"grad_norm": 13.131250487547925,
|
|
"learning_rate": 6.152856109751861e-10,
|
|
"loss": 1.3784,
|
|
"mean_token_accuracy": 0.7171645045280457,
|
|
"num_tokens": 382783474.0,
|
|
"step": 6080
|
|
},
|
|
{
|
|
"epoch": 19.887070376432078,
|
|
"grad_norm": 12.201328973734851,
|
|
"learning_rate": 6.115431632025153e-10,
|
|
"loss": 1.3511,
|
|
"mean_token_accuracy": 0.7223453462123871,
|
|
"num_tokens": 383098936.0,
|
|
"step": 6085
|
|
},
|
|
{
|
|
"epoch": 19.903436988543373,
|
|
"grad_norm": 13.796231255641032,
|
|
"learning_rate": 6.078105450953488e-10,
|
|
"loss": 1.3668,
|
|
"mean_token_accuracy": 0.7188094437122345,
|
|
"num_tokens": 383414381.0,
|
|
"step": 6090
|
|
},
|
|
{
|
|
"epoch": 19.919803600654664,
|
|
"grad_norm": 11.806977173308766,
|
|
"learning_rate": 6.040877760824535e-10,
|
|
"loss": 1.348,
|
|
"mean_token_accuracy": 0.721502012014389,
|
|
"num_tokens": 383729234.0,
|
|
"step": 6095
|
|
},
|
|
{
|
|
"epoch": 19.93617021276596,
|
|
"grad_norm": 12.735720188720164,
|
|
"learning_rate": 6.003748755413311e-10,
|
|
"loss": 1.3473,
|
|
"mean_token_accuracy": 0.7218765377998352,
|
|
"num_tokens": 384045885.0,
|
|
"step": 6100
|
|
},
|
|
{
|
|
"epoch": 19.95253682487725,
|
|
"grad_norm": 12.838838181022378,
|
|
"learning_rate": 5.966718627981141e-10,
|
|
"loss": 1.3493,
|
|
"mean_token_accuracy": 0.7213131487369537,
|
|
"num_tokens": 384360353.0,
|
|
"step": 6105
|
|
},
|
|
{
|
|
"epoch": 19.968903436988544,
|
|
"grad_norm": 12.407763840948075,
|
|
"learning_rate": 5.929787571274706e-10,
|
|
"loss": 1.3594,
|
|
"mean_token_accuracy": 0.7206071019172668,
|
|
"num_tokens": 384675717.0,
|
|
"step": 6110
|
|
},
|
|
{
|
|
"epoch": 19.985270049099835,
|
|
"grad_norm": 12.94758071340867,
|
|
"learning_rate": 5.892955777524997e-10,
|
|
"loss": 1.3678,
|
|
"mean_token_accuracy": 0.7174268543720246,
|
|
"num_tokens": 384993655.0,
|
|
"step": 6115
|
|
},
|
|
{
|
|
"epoch": 20.0,
|
|
"grad_norm": 12.510272040402462,
|
|
"learning_rate": 5.856223438446331e-10,
|
|
"loss": 1.349,
|
|
"mean_token_accuracy": 0.720099237230089,
|
|
"num_tokens": 385254493.0,
|
|
"step": 6120
|
|
},
|
|
{
|
|
"epoch": 20.016366612111295,
|
|
"grad_norm": 12.408361502858067,
|
|
"learning_rate": 5.819590745235353e-10,
|
|
"loss": 1.3581,
|
|
"mean_token_accuracy": 0.7212778329849243,
|
|
"num_tokens": 385569935.0,
|
|
"step": 6125
|
|
},
|
|
{
|
|
"epoch": 20.032733224222586,
|
|
"grad_norm": 12.89064406341307,
|
|
"learning_rate": 5.783057888570034e-10,
|
|
"loss": 1.3627,
|
|
"mean_token_accuracy": 0.7189658522605896,
|
|
"num_tokens": 385886048.0,
|
|
"step": 6130
|
|
},
|
|
{
|
|
"epoch": 20.04909983633388,
|
|
"grad_norm": 12.656855994493375,
|
|
"learning_rate": 5.746625058608681e-10,
|
|
"loss": 1.3511,
|
|
"mean_token_accuracy": 0.7225908994674682,
|
|
"num_tokens": 386202625.0,
|
|
"step": 6135
|
|
},
|
|
{
|
|
"epoch": 20.06546644844517,
|
|
"grad_norm": 12.59303551321297,
|
|
"learning_rate": 5.710292444988957e-10,
|
|
"loss": 1.3544,
|
|
"mean_token_accuracy": 0.7210854589939117,
|
|
"num_tokens": 386516480.0,
|
|
"step": 6140
|
|
},
|
|
{
|
|
"epoch": 20.081833060556466,
|
|
"grad_norm": 13.373786710360742,
|
|
"learning_rate": 5.674060236826881e-10,
|
|
"loss": 1.3666,
|
|
"mean_token_accuracy": 0.7191102504730225,
|
|
"num_tokens": 386831825.0,
|
|
"step": 6145
|
|
},
|
|
{
|
|
"epoch": 20.098199672667757,
|
|
"grad_norm": 13.039962709930045,
|
|
"learning_rate": 5.637928622715844e-10,
|
|
"loss": 1.3633,
|
|
"mean_token_accuracy": 0.71987065076828,
|
|
"num_tokens": 387147364.0,
|
|
"step": 6150
|
|
},
|
|
{
|
|
"epoch": 20.114566284779052,
|
|
"grad_norm": 12.87146333006844,
|
|
"learning_rate": 5.601897790725643e-10,
|
|
"loss": 1.3736,
|
|
"mean_token_accuracy": 0.7173676788806915,
|
|
"num_tokens": 387461434.0,
|
|
"step": 6155
|
|
},
|
|
{
|
|
"epoch": 20.130932896890343,
|
|
"grad_norm": 12.769528987719692,
|
|
"learning_rate": 5.565967928401475e-10,
|
|
"loss": 1.3827,
|
|
"mean_token_accuracy": 0.7164701819419861,
|
|
"num_tokens": 387776913.0,
|
|
"step": 6160
|
|
},
|
|
{
|
|
"epoch": 20.147299509001638,
|
|
"grad_norm": 13.104881890764489,
|
|
"learning_rate": 5.530139222762986e-10,
|
|
"loss": 1.3562,
|
|
"mean_token_accuracy": 0.7204134702682495,
|
|
"num_tokens": 388094220.0,
|
|
"step": 6165
|
|
},
|
|
{
|
|
"epoch": 20.16366612111293,
|
|
"grad_norm": 13.026891970002072,
|
|
"learning_rate": 5.494411860303295e-10,
|
|
"loss": 1.3525,
|
|
"mean_token_accuracy": 0.7217986106872558,
|
|
"num_tokens": 388409234.0,
|
|
"step": 6170
|
|
},
|
|
{
|
|
"epoch": 20.180032733224223,
|
|
"grad_norm": 12.217206361830561,
|
|
"learning_rate": 5.458786026988006e-10,
|
|
"loss": 1.3779,
|
|
"mean_token_accuracy": 0.7165840566158295,
|
|
"num_tokens": 388725714.0,
|
|
"step": 6175
|
|
},
|
|
{
|
|
"epoch": 20.196399345335514,
|
|
"grad_norm": 13.636015132017496,
|
|
"learning_rate": 5.423261908254251e-10,
|
|
"loss": 1.3791,
|
|
"mean_token_accuracy": 0.716965913772583,
|
|
"num_tokens": 389040838.0,
|
|
"step": 6180
|
|
},
|
|
{
|
|
"epoch": 20.21276595744681,
|
|
"grad_norm": 13.055484726435909,
|
|
"learning_rate": 5.38783968900973e-10,
|
|
"loss": 1.3529,
|
|
"mean_token_accuracy": 0.7214811325073243,
|
|
"num_tokens": 389358099.0,
|
|
"step": 6185
|
|
},
|
|
{
|
|
"epoch": 20.2291325695581,
|
|
"grad_norm": 12.80499515972432,
|
|
"learning_rate": 5.352519553631738e-10,
|
|
"loss": 1.3586,
|
|
"mean_token_accuracy": 0.7204404175281525,
|
|
"num_tokens": 389674079.0,
|
|
"step": 6190
|
|
},
|
|
{
|
|
"epoch": 20.245499181669395,
|
|
"grad_norm": 12.579179331917139,
|
|
"learning_rate": 5.317301685966214e-10,
|
|
"loss": 1.355,
|
|
"mean_token_accuracy": 0.7205852210521698,
|
|
"num_tokens": 389990677.0,
|
|
"step": 6195
|
|
},
|
|
{
|
|
"epoch": 20.261865793780686,
|
|
"grad_norm": 12.410957372710085,
|
|
"learning_rate": 5.282186269326778e-10,
|
|
"loss": 1.3492,
|
|
"mean_token_accuracy": 0.7207090735435486,
|
|
"num_tokens": 390306224.0,
|
|
"step": 6200
|
|
},
|
|
{
|
|
"epoch": 20.27823240589198,
|
|
"grad_norm": 13.79551759700545,
|
|
"learning_rate": 5.247173486493775e-10,
|
|
"loss": 1.377,
|
|
"mean_token_accuracy": 0.717050963640213,
|
|
"num_tokens": 390621762.0,
|
|
"step": 6205
|
|
},
|
|
{
|
|
"epoch": 20.29459901800327,
|
|
"grad_norm": 12.442990671978828,
|
|
"learning_rate": 5.212263519713337e-10,
|
|
"loss": 1.3427,
|
|
"mean_token_accuracy": 0.7230148434638977,
|
|
"num_tokens": 390937426.0,
|
|
"step": 6210
|
|
},
|
|
{
|
|
"epoch": 20.310965630114566,
|
|
"grad_norm": 12.517953307134098,
|
|
"learning_rate": 5.177456550696413e-10,
|
|
"loss": 1.3487,
|
|
"mean_token_accuracy": 0.7233328282833099,
|
|
"num_tokens": 391252916.0,
|
|
"step": 6215
|
|
},
|
|
{
|
|
"epoch": 20.32733224222586,
|
|
"grad_norm": 13.092747881609323,
|
|
"learning_rate": 5.14275276061785e-10,
|
|
"loss": 1.3579,
|
|
"mean_token_accuracy": 0.7207543611526489,
|
|
"num_tokens": 391567998.0,
|
|
"step": 6220
|
|
},
|
|
{
|
|
"epoch": 20.343698854337152,
|
|
"grad_norm": 12.726469014852299,
|
|
"learning_rate": 5.108152330115417e-10,
|
|
"loss": 1.3628,
|
|
"mean_token_accuracy": 0.7196714520454407,
|
|
"num_tokens": 391885189.0,
|
|
"step": 6225
|
|
},
|
|
{
|
|
"epoch": 20.360065466448447,
|
|
"grad_norm": 13.334299082905925,
|
|
"learning_rate": 5.073655439288902e-10,
|
|
"loss": 1.3714,
|
|
"mean_token_accuracy": 0.7176052629947662,
|
|
"num_tokens": 392200295.0,
|
|
"step": 6230
|
|
},
|
|
{
|
|
"epoch": 20.376432078559738,
|
|
"grad_norm": 12.91998412060829,
|
|
"learning_rate": 5.039262267699141e-10,
|
|
"loss": 1.3658,
|
|
"mean_token_accuracy": 0.7184491395950318,
|
|
"num_tokens": 392516438.0,
|
|
"step": 6235
|
|
},
|
|
{
|
|
"epoch": 20.392798690671032,
|
|
"grad_norm": 12.970503138080948,
|
|
"learning_rate": 5.004972994367102e-10,
|
|
"loss": 1.3655,
|
|
"mean_token_accuracy": 0.7194463074207306,
|
|
"num_tokens": 392830717.0,
|
|
"step": 6240
|
|
},
|
|
{
|
|
"epoch": 20.409165302782323,
|
|
"grad_norm": 12.644866646541756,
|
|
"learning_rate": 4.970787797772949e-10,
|
|
"loss": 1.3755,
|
|
"mean_token_accuracy": 0.7172148823738098,
|
|
"num_tokens": 393146212.0,
|
|
"step": 6245
|
|
},
|
|
{
|
|
"epoch": 20.425531914893618,
|
|
"grad_norm": 13.62925757634868,
|
|
"learning_rate": 4.936706855855119e-10,
|
|
"loss": 1.3698,
|
|
"mean_token_accuracy": 0.7158120036125183,
|
|
"num_tokens": 393462795.0,
|
|
"step": 6250
|
|
},
|
|
{
|
|
"epoch": 20.44189852700491,
|
|
"grad_norm": 13.441202382467907,
|
|
"learning_rate": 4.902730346009382e-10,
|
|
"loss": 1.3592,
|
|
"mean_token_accuracy": 0.7201913297176361,
|
|
"num_tokens": 393779211.0,
|
|
"step": 6255
|
|
},
|
|
{
|
|
"epoch": 20.458265139116204,
|
|
"grad_norm": 12.66638053909492,
|
|
"learning_rate": 4.868858445087923e-10,
|
|
"loss": 1.3327,
|
|
"mean_token_accuracy": 0.7251097619533539,
|
|
"num_tokens": 394094924.0,
|
|
"step": 6260
|
|
},
|
|
{
|
|
"epoch": 20.474631751227495,
|
|
"grad_norm": 12.748745055757105,
|
|
"learning_rate": 4.835091329398436e-10,
|
|
"loss": 1.3632,
|
|
"mean_token_accuracy": 0.7194243013858795,
|
|
"num_tokens": 394410468.0,
|
|
"step": 6265
|
|
},
|
|
{
|
|
"epoch": 20.49099836333879,
|
|
"grad_norm": 13.324491028604893,
|
|
"learning_rate": 4.801429174703187e-10,
|
|
"loss": 1.3647,
|
|
"mean_token_accuracy": 0.7182667195796967,
|
|
"num_tokens": 394724947.0,
|
|
"step": 6270
|
|
},
|
|
{
|
|
"epoch": 20.50736497545008,
|
|
"grad_norm": 12.473896879426514,
|
|
"learning_rate": 4.767872156218097e-10,
|
|
"loss": 1.3349,
|
|
"mean_token_accuracy": 0.7238370895385742,
|
|
"num_tokens": 395040911.0,
|
|
"step": 6275
|
|
},
|
|
{
|
|
"epoch": 20.523731587561375,
|
|
"grad_norm": 12.454795272312785,
|
|
"learning_rate": 4.734420448611851e-10,
|
|
"loss": 1.3587,
|
|
"mean_token_accuracy": 0.7194471001625061,
|
|
"num_tokens": 395356010.0,
|
|
"step": 6280
|
|
},
|
|
{
|
|
"epoch": 20.540098199672666,
|
|
"grad_norm": 13.386471916617674,
|
|
"learning_rate": 4.701074226004978e-10,
|
|
"loss": 1.3493,
|
|
"mean_token_accuracy": 0.7223416805267334,
|
|
"num_tokens": 395672392.0,
|
|
"step": 6285
|
|
},
|
|
{
|
|
"epoch": 20.55646481178396,
|
|
"grad_norm": 12.922461612723442,
|
|
"learning_rate": 4.66783366196894e-10,
|
|
"loss": 1.3626,
|
|
"mean_token_accuracy": 0.7206296324729919,
|
|
"num_tokens": 395987449.0,
|
|
"step": 6290
|
|
},
|
|
{
|
|
"epoch": 20.572831423895252,
|
|
"grad_norm": 13.186551545631183,
|
|
"learning_rate": 4.6346989295252274e-10,
|
|
"loss": 1.3761,
|
|
"mean_token_accuracy": 0.7167114853858948,
|
|
"num_tokens": 396302511.0,
|
|
"step": 6295
|
|
},
|
|
{
|
|
"epoch": 20.589198036006547,
|
|
"grad_norm": 12.453339257872456,
|
|
"learning_rate": 4.601670201144473e-10,
|
|
"loss": 1.3703,
|
|
"mean_token_accuracy": 0.7186362683773041,
|
|
"num_tokens": 396618897.0,
|
|
"step": 6300
|
|
},
|
|
{
|
|
"epoch": 20.60556464811784,
|
|
"grad_norm": 12.578505216956795,
|
|
"learning_rate": 4.568747648745539e-10,
|
|
"loss": 1.3424,
|
|
"mean_token_accuracy": 0.7220037519931793,
|
|
"num_tokens": 396932845.0,
|
|
"step": 6305
|
|
},
|
|
{
|
|
"epoch": 20.621931260229132,
|
|
"grad_norm": 13.14266322328383,
|
|
"learning_rate": 4.535931443694627e-10,
|
|
"loss": 1.3597,
|
|
"mean_token_accuracy": 0.7195671617984771,
|
|
"num_tokens": 397249593.0,
|
|
"step": 6310
|
|
},
|
|
{
|
|
"epoch": 20.638297872340427,
|
|
"grad_norm": 13.155868390509191,
|
|
"learning_rate": 4.5032217568043874e-10,
|
|
"loss": 1.3528,
|
|
"mean_token_accuracy": 0.72027388215065,
|
|
"num_tokens": 397566272.0,
|
|
"step": 6315
|
|
},
|
|
{
|
|
"epoch": 20.654664484451718,
|
|
"grad_norm": 12.650985827442252,
|
|
"learning_rate": 4.470618758333031e-10,
|
|
"loss": 1.3709,
|
|
"mean_token_accuracy": 0.7184613645076752,
|
|
"num_tokens": 397881061.0,
|
|
"step": 6320
|
|
},
|
|
{
|
|
"epoch": 20.671031096563013,
|
|
"grad_norm": 12.834968827273396,
|
|
"learning_rate": 4.4381226179834424e-10,
|
|
"loss": 1.3596,
|
|
"mean_token_accuracy": 0.7198035597801209,
|
|
"num_tokens": 398196496.0,
|
|
"step": 6325
|
|
},
|
|
{
|
|
"epoch": 20.687397708674304,
|
|
"grad_norm": 12.472663814811538,
|
|
"learning_rate": 4.405733504902298e-10,
|
|
"loss": 1.3241,
|
|
"mean_token_accuracy": 0.7258457541465759,
|
|
"num_tokens": 398513160.0,
|
|
"step": 6330
|
|
},
|
|
{
|
|
"epoch": 20.7037643207856,
|
|
"grad_norm": 12.03529616087969,
|
|
"learning_rate": 4.3734515876791695e-10,
|
|
"loss": 1.3543,
|
|
"mean_token_accuracy": 0.720082575082779,
|
|
"num_tokens": 398828925.0,
|
|
"step": 6335
|
|
},
|
|
{
|
|
"epoch": 20.72013093289689,
|
|
"grad_norm": 13.045430517890043,
|
|
"learning_rate": 4.3412770343456725e-10,
|
|
"loss": 1.3688,
|
|
"mean_token_accuracy": 0.7178512036800384,
|
|
"num_tokens": 399143548.0,
|
|
"step": 6340
|
|
},
|
|
{
|
|
"epoch": 20.736497545008184,
|
|
"grad_norm": 13.023269314604761,
|
|
"learning_rate": 4.3092100123745786e-10,
|
|
"loss": 1.3537,
|
|
"mean_token_accuracy": 0.7202800393104554,
|
|
"num_tokens": 399457769.0,
|
|
"step": 6345
|
|
},
|
|
{
|
|
"epoch": 20.752864157119475,
|
|
"grad_norm": 13.120357401735287,
|
|
"learning_rate": 4.2772506886789434e-10,
|
|
"loss": 1.3537,
|
|
"mean_token_accuracy": 0.7197918653488159,
|
|
"num_tokens": 399772688.0,
|
|
"step": 6350
|
|
},
|
|
{
|
|
"epoch": 20.76923076923077,
|
|
"grad_norm": 12.497378191320612,
|
|
"learning_rate": 4.245399229611238e-10,
|
|
"loss": 1.3514,
|
|
"mean_token_accuracy": 0.7215233445167542,
|
|
"num_tokens": 400090305.0,
|
|
"step": 6355
|
|
},
|
|
{
|
|
"epoch": 20.78559738134206,
|
|
"grad_norm": 13.224221419816752,
|
|
"learning_rate": 4.213655800962482e-10,
|
|
"loss": 1.3766,
|
|
"mean_token_accuracy": 0.7170398950576782,
|
|
"num_tokens": 400406189.0,
|
|
"step": 6360
|
|
},
|
|
{
|
|
"epoch": 20.801963993453356,
|
|
"grad_norm": 13.125172702519224,
|
|
"learning_rate": 4.1820205679613866e-10,
|
|
"loss": 1.3437,
|
|
"mean_token_accuracy": 0.722397255897522,
|
|
"num_tokens": 400722467.0,
|
|
"step": 6365
|
|
},
|
|
{
|
|
"epoch": 20.818330605564647,
|
|
"grad_norm": 13.20586057784677,
|
|
"learning_rate": 4.1504936952734855e-10,
|
|
"loss": 1.35,
|
|
"mean_token_accuracy": 0.7202737271785736,
|
|
"num_tokens": 401037657.0,
|
|
"step": 6370
|
|
},
|
|
{
|
|
"epoch": 20.83469721767594,
|
|
"grad_norm": 12.761874738404625,
|
|
"learning_rate": 4.119075347000292e-10,
|
|
"loss": 1.3612,
|
|
"mean_token_accuracy": 0.7196288645267487,
|
|
"num_tokens": 401353210.0,
|
|
"step": 6375
|
|
},
|
|
{
|
|
"epoch": 20.851063829787233,
|
|
"grad_norm": 13.407483508696666,
|
|
"learning_rate": 4.087765686678424e-10,
|
|
"loss": 1.3623,
|
|
"mean_token_accuracy": 0.7191239655017853,
|
|
"num_tokens": 401668048.0,
|
|
"step": 6380
|
|
},
|
|
{
|
|
"epoch": 20.867430441898527,
|
|
"grad_norm": 13.144433390958858,
|
|
"learning_rate": 4.0565648772787703e-10,
|
|
"loss": 1.362,
|
|
"mean_token_accuracy": 0.7194087266921997,
|
|
"num_tokens": 401984410.0,
|
|
"step": 6385
|
|
},
|
|
{
|
|
"epoch": 20.88379705400982,
|
|
"grad_norm": 13.27548156246285,
|
|
"learning_rate": 4.0254730812056384e-10,
|
|
"loss": 1.3469,
|
|
"mean_token_accuracy": 0.7210758686065674,
|
|
"num_tokens": 402300070.0,
|
|
"step": 6390
|
|
},
|
|
{
|
|
"epoch": 20.900163666121113,
|
|
"grad_norm": 13.149919970213444,
|
|
"learning_rate": 3.9944904602958994e-10,
|
|
"loss": 1.3596,
|
|
"mean_token_accuracy": 0.7191324174404145,
|
|
"num_tokens": 402614769.0,
|
|
"step": 6395
|
|
},
|
|
{
|
|
"epoch": 20.916530278232408,
|
|
"grad_norm": 12.361842765694908,
|
|
"learning_rate": 3.9636171758181655e-10,
|
|
"loss": 1.3608,
|
|
"mean_token_accuracy": 0.7194510757923126,
|
|
"num_tokens": 402929102.0,
|
|
"step": 6400
|
|
},
|
|
{
|
|
"epoch": 20.9328968903437,
|
|
"grad_norm": 12.48137044626653,
|
|
"learning_rate": 3.9328533884719267e-10,
|
|
"loss": 1.3334,
|
|
"mean_token_accuracy": 0.7238231897354126,
|
|
"num_tokens": 403245266.0,
|
|
"step": 6405
|
|
},
|
|
{
|
|
"epoch": 20.949263502454993,
|
|
"grad_norm": 13.341785056031886,
|
|
"learning_rate": 3.902199258386732e-10,
|
|
"loss": 1.3811,
|
|
"mean_token_accuracy": 0.7154872000217438,
|
|
"num_tokens": 403561358.0,
|
|
"step": 6410
|
|
},
|
|
{
|
|
"epoch": 20.965630114566284,
|
|
"grad_norm": 11.86055012503018,
|
|
"learning_rate": 3.8716549451213473e-10,
|
|
"loss": 1.3537,
|
|
"mean_token_accuracy": 0.719732791185379,
|
|
"num_tokens": 403877016.0,
|
|
"step": 6415
|
|
},
|
|
{
|
|
"epoch": 20.98199672667758,
|
|
"grad_norm": 12.66194804811954,
|
|
"learning_rate": 3.841220607662932e-10,
|
|
"loss": 1.345,
|
|
"mean_token_accuracy": 0.7209204435348511,
|
|
"num_tokens": 404193031.0,
|
|
"step": 6420
|
|
},
|
|
{
|
|
"epoch": 20.99836333878887,
|
|
"grad_norm": 13.057343518581341,
|
|
"learning_rate": 3.8108964044262034e-10,
|
|
"loss": 1.3601,
|
|
"mean_token_accuracy": 0.718568354845047,
|
|
"num_tokens": 404509316.0,
|
|
"step": 6425
|
|
},
|
|
{
|
|
"epoch": 21.013093289689035,
|
|
"grad_norm": 12.848348091317341,
|
|
"learning_rate": 3.780682493252613e-10,
|
|
"loss": 1.3604,
|
|
"mean_token_accuracy": 0.7186027036772834,
|
|
"num_tokens": 404770618.0,
|
|
"step": 6430
|
|
},
|
|
{
|
|
"epoch": 21.029459901800326,
|
|
"grad_norm": 12.514267821398358,
|
|
"learning_rate": 3.7505790314095347e-10,
|
|
"loss": 1.3397,
|
|
"mean_token_accuracy": 0.722569715976715,
|
|
"num_tokens": 405087475.0,
|
|
"step": 6435
|
|
},
|
|
{
|
|
"epoch": 21.04582651391162,
|
|
"grad_norm": 12.552588439453062,
|
|
"learning_rate": 3.720586175589438e-10,
|
|
"loss": 1.349,
|
|
"mean_token_accuracy": 0.7212419390678406,
|
|
"num_tokens": 405402509.0,
|
|
"step": 6440
|
|
},
|
|
{
|
|
"epoch": 21.062193126022912,
|
|
"grad_norm": 12.402093691703406,
|
|
"learning_rate": 3.69070408190906e-10,
|
|
"loss": 1.335,
|
|
"mean_token_accuracy": 0.7232251703739166,
|
|
"num_tokens": 405718756.0,
|
|
"step": 6445
|
|
},
|
|
{
|
|
"epoch": 21.078559738134206,
|
|
"grad_norm": 13.312628292563984,
|
|
"learning_rate": 3.6609329059086286e-10,
|
|
"loss": 1.3703,
|
|
"mean_token_accuracy": 0.7168916404247284,
|
|
"num_tokens": 406034042.0,
|
|
"step": 6450
|
|
},
|
|
{
|
|
"epoch": 21.094926350245498,
|
|
"grad_norm": 13.20544818974223,
|
|
"learning_rate": 3.631272802551011e-10,
|
|
"loss": 1.3492,
|
|
"mean_token_accuracy": 0.7211582005023957,
|
|
"num_tokens": 406349959.0,
|
|
"step": 6455
|
|
},
|
|
{
|
|
"epoch": 21.111292962356792,
|
|
"grad_norm": 13.23125049133312,
|
|
"learning_rate": 3.60172392622094e-10,
|
|
"loss": 1.3448,
|
|
"mean_token_accuracy": 0.7220036685466766,
|
|
"num_tokens": 406665577.0,
|
|
"step": 6460
|
|
},
|
|
{
|
|
"epoch": 21.127659574468087,
|
|
"grad_norm": 13.005624491873656,
|
|
"learning_rate": 3.572286430724192e-10,
|
|
"loss": 1.3542,
|
|
"mean_token_accuracy": 0.7203407347202301,
|
|
"num_tokens": 406980864.0,
|
|
"step": 6465
|
|
},
|
|
{
|
|
"epoch": 21.144026186579378,
|
|
"grad_norm": 12.765581546026217,
|
|
"learning_rate": 3.5429604692867905e-10,
|
|
"loss": 1.3463,
|
|
"mean_token_accuracy": 0.7219978153705597,
|
|
"num_tokens": 407298180.0,
|
|
"step": 6470
|
|
},
|
|
{
|
|
"epoch": 21.160392798690673,
|
|
"grad_norm": 12.667059462802193,
|
|
"learning_rate": 3.5137461945542125e-10,
|
|
"loss": 1.3499,
|
|
"mean_token_accuracy": 0.7211233079433441,
|
|
"num_tokens": 407614589.0,
|
|
"step": 6475
|
|
},
|
|
{
|
|
"epoch": 21.176759410801964,
|
|
"grad_norm": 12.979083992930654,
|
|
"learning_rate": 3.484643758590586e-10,
|
|
"loss": 1.342,
|
|
"mean_token_accuracy": 0.7224942982196808,
|
|
"num_tokens": 407929635.0,
|
|
"step": 6480
|
|
},
|
|
{
|
|
"epoch": 21.19312602291326,
|
|
"grad_norm": 12.249019308152713,
|
|
"learning_rate": 3.455653312877913e-10,
|
|
"loss": 1.3385,
|
|
"mean_token_accuracy": 0.7228384554386139,
|
|
"num_tokens": 408244228.0,
|
|
"step": 6485
|
|
},
|
|
{
|
|
"epoch": 21.20949263502455,
|
|
"grad_norm": 12.699289474269534,
|
|
"learning_rate": 3.426775008315258e-10,
|
|
"loss": 1.3584,
|
|
"mean_token_accuracy": 0.7195417165756226,
|
|
"num_tokens": 408560800.0,
|
|
"step": 6490
|
|
},
|
|
{
|
|
"epoch": 21.225859247135844,
|
|
"grad_norm": 12.99167124681205,
|
|
"learning_rate": 3.398008995217988e-10,
|
|
"loss": 1.3527,
|
|
"mean_token_accuracy": 0.7201470553874969,
|
|
"num_tokens": 408875635.0,
|
|
"step": 6495
|
|
},
|
|
{
|
|
"epoch": 21.242225859247135,
|
|
"grad_norm": 12.629510378355464,
|
|
"learning_rate": 3.3693554233169777e-10,
|
|
"loss": 1.365,
|
|
"mean_token_accuracy": 0.7178857147693634,
|
|
"num_tokens": 409191753.0,
|
|
"step": 6500
|
|
},
|
|
{
|
|
"epoch": 21.25859247135843,
|
|
"grad_norm": 12.879603899060175,
|
|
"learning_rate": 3.3408144417578196e-10,
|
|
"loss": 1.3523,
|
|
"mean_token_accuracy": 0.7209328949451447,
|
|
"num_tokens": 409508642.0,
|
|
"step": 6505
|
|
},
|
|
{
|
|
"epoch": 21.27495908346972,
|
|
"grad_norm": 13.309240097642768,
|
|
"learning_rate": 3.3123861991000646e-10,
|
|
"loss": 1.3794,
|
|
"mean_token_accuracy": 0.7156186401844025,
|
|
"num_tokens": 409824571.0,
|
|
"step": 6510
|
|
},
|
|
{
|
|
"epoch": 21.291325695581016,
|
|
"grad_norm": 12.92867376437594,
|
|
"learning_rate": 3.28407084331645e-10,
|
|
"loss": 1.3657,
|
|
"mean_token_accuracy": 0.7183760046958924,
|
|
"num_tokens": 410142157.0,
|
|
"step": 6515
|
|
},
|
|
{
|
|
"epoch": 21.307692307692307,
|
|
"grad_norm": 12.472225455278851,
|
|
"learning_rate": 3.255868521792113e-10,
|
|
"loss": 1.3482,
|
|
"mean_token_accuracy": 0.7216750860214234,
|
|
"num_tokens": 410457614.0,
|
|
"step": 6520
|
|
},
|
|
{
|
|
"epoch": 21.3240589198036,
|
|
"grad_norm": 12.365890438292144,
|
|
"learning_rate": 3.2277793813238393e-10,
|
|
"loss": 1.3437,
|
|
"mean_token_accuracy": 0.7215527594089508,
|
|
"num_tokens": 410774679.0,
|
|
"step": 6525
|
|
},
|
|
{
|
|
"epoch": 21.340425531914892,
|
|
"grad_norm": 13.965813014874932,
|
|
"learning_rate": 3.199803568119283e-10,
|
|
"loss": 1.3708,
|
|
"mean_token_accuracy": 0.7182475388050079,
|
|
"num_tokens": 411089535.0,
|
|
"step": 6530
|
|
},
|
|
{
|
|
"epoch": 21.356792144026187,
|
|
"grad_norm": 12.9264410294177,
|
|
"learning_rate": 3.171941227796227e-10,
|
|
"loss": 1.3516,
|
|
"mean_token_accuracy": 0.7200949966907502,
|
|
"num_tokens": 411402595.0,
|
|
"step": 6535
|
|
},
|
|
{
|
|
"epoch": 21.373158756137478,
|
|
"grad_norm": 12.718262389396598,
|
|
"learning_rate": 3.1441925053818015e-10,
|
|
"loss": 1.3586,
|
|
"mean_token_accuracy": 0.7214510560035705,
|
|
"num_tokens": 411718212.0,
|
|
"step": 6540
|
|
},
|
|
{
|
|
"epoch": 21.389525368248773,
|
|
"grad_norm": 12.393441417705136,
|
|
"learning_rate": 3.116557545311749e-10,
|
|
"loss": 1.3643,
|
|
"mean_token_accuracy": 0.7181841313838959,
|
|
"num_tokens": 412034328.0,
|
|
"step": 6545
|
|
},
|
|
{
|
|
"epoch": 21.405891980360064,
|
|
"grad_norm": 13.397061349747721,
|
|
"learning_rate": 3.0890364914296614e-10,
|
|
"loss": 1.3665,
|
|
"mean_token_accuracy": 0.7183075726032258,
|
|
"num_tokens": 412349959.0,
|
|
"step": 6550
|
|
},
|
|
{
|
|
"epoch": 21.42225859247136,
|
|
"grad_norm": 12.686656256195334,
|
|
"learning_rate": 3.0616294869862364e-10,
|
|
"loss": 1.3633,
|
|
"mean_token_accuracy": 0.7172622561454773,
|
|
"num_tokens": 412665283.0,
|
|
"step": 6555
|
|
},
|
|
{
|
|
"epoch": 21.438625204582653,
|
|
"grad_norm": 13.68919572012682,
|
|
"learning_rate": 3.0343366746385133e-10,
|
|
"loss": 1.3577,
|
|
"mean_token_accuracy": 0.7190939366817475,
|
|
"num_tokens": 412981575.0,
|
|
"step": 6560
|
|
},
|
|
{
|
|
"epoch": 21.454991816693944,
|
|
"grad_norm": 13.02482080433939,
|
|
"learning_rate": 3.0071581964491723e-10,
|
|
"loss": 1.3513,
|
|
"mean_token_accuracy": 0.720437103509903,
|
|
"num_tokens": 413297068.0,
|
|
"step": 6565
|
|
},
|
|
{
|
|
"epoch": 21.47135842880524,
|
|
"grad_norm": 13.332749583564876,
|
|
"learning_rate": 2.9800941938857574e-10,
|
|
"loss": 1.3646,
|
|
"mean_token_accuracy": 0.7186848640441894,
|
|
"num_tokens": 413612227.0,
|
|
"step": 6570
|
|
},
|
|
{
|
|
"epoch": 21.48772504091653,
|
|
"grad_norm": 13.087991897108951,
|
|
"learning_rate": 2.9531448078199436e-10,
|
|
"loss": 1.3563,
|
|
"mean_token_accuracy": 0.7198020398616791,
|
|
"num_tokens": 413927177.0,
|
|
"step": 6575
|
|
},
|
|
{
|
|
"epoch": 21.504091653027825,
|
|
"grad_norm": 13.335835947964268,
|
|
"learning_rate": 2.9263101785268255e-10,
|
|
"loss": 1.3518,
|
|
"mean_token_accuracy": 0.7200286328792572,
|
|
"num_tokens": 414243696.0,
|
|
"step": 6580
|
|
},
|
|
{
|
|
"epoch": 21.520458265139116,
|
|
"grad_norm": 13.160382556221116,
|
|
"learning_rate": 2.8995904456841664e-10,
|
|
"loss": 1.3605,
|
|
"mean_token_accuracy": 0.7191519558429718,
|
|
"num_tokens": 414559555.0,
|
|
"step": 6585
|
|
},
|
|
{
|
|
"epoch": 21.53682487725041,
|
|
"grad_norm": 13.058908572794502,
|
|
"learning_rate": 2.872985748371679e-10,
|
|
"loss": 1.3462,
|
|
"mean_token_accuracy": 0.7216069400310516,
|
|
"num_tokens": 414874568.0,
|
|
"step": 6590
|
|
},
|
|
{
|
|
"epoch": 21.5531914893617,
|
|
"grad_norm": 12.931286937951938,
|
|
"learning_rate": 2.8464962250703023e-10,
|
|
"loss": 1.3589,
|
|
"mean_token_accuracy": 0.7190203011035919,
|
|
"num_tokens": 415190194.0,
|
|
"step": 6595
|
|
},
|
|
{
|
|
"epoch": 21.569558101472996,
|
|
"grad_norm": 12.855372446575597,
|
|
"learning_rate": 2.8201220136614805e-10,
|
|
"loss": 1.346,
|
|
"mean_token_accuracy": 0.7213197529315949,
|
|
"num_tokens": 415506361.0,
|
|
"step": 6600
|
|
},
|
|
{
|
|
"epoch": 21.585924713584287,
|
|
"grad_norm": 13.099093485339138,
|
|
"learning_rate": 2.79386325142644e-10,
|
|
"loss": 1.3508,
|
|
"mean_token_accuracy": 0.720091027021408,
|
|
"num_tokens": 415820106.0,
|
|
"step": 6605
|
|
},
|
|
{
|
|
"epoch": 21.60229132569558,
|
|
"grad_norm": 12.51626494162,
|
|
"learning_rate": 2.7677200750454904e-10,
|
|
"loss": 1.3449,
|
|
"mean_token_accuracy": 0.7217469274997711,
|
|
"num_tokens": 416136377.0,
|
|
"step": 6610
|
|
},
|
|
{
|
|
"epoch": 21.618657937806873,
|
|
"grad_norm": 12.463459434825667,
|
|
"learning_rate": 2.7416926205972833e-10,
|
|
"loss": 1.332,
|
|
"mean_token_accuracy": 0.7255747377872467,
|
|
"num_tokens": 416452893.0,
|
|
"step": 6615
|
|
},
|
|
{
|
|
"epoch": 21.635024549918167,
|
|
"grad_norm": 12.977423501928877,
|
|
"learning_rate": 2.7157810235581335e-10,
|
|
"loss": 1.3569,
|
|
"mean_token_accuracy": 0.7192984879016876,
|
|
"num_tokens": 416768235.0,
|
|
"step": 6620
|
|
},
|
|
{
|
|
"epoch": 21.65139116202946,
|
|
"grad_norm": 12.47697245725263,
|
|
"learning_rate": 2.689985418801305e-10,
|
|
"loss": 1.3514,
|
|
"mean_token_accuracy": 0.7202287018299103,
|
|
"num_tokens": 417084339.0,
|
|
"step": 6625
|
|
},
|
|
{
|
|
"epoch": 21.667757774140753,
|
|
"grad_norm": 12.874623370443464,
|
|
"learning_rate": 2.6643059405963036e-10,
|
|
"loss": 1.3392,
|
|
"mean_token_accuracy": 0.7226230144500733,
|
|
"num_tokens": 417399309.0,
|
|
"step": 6630
|
|
},
|
|
{
|
|
"epoch": 21.684124386252044,
|
|
"grad_norm": 13.289710561216035,
|
|
"learning_rate": 2.638742722608184e-10,
|
|
"loss": 1.3531,
|
|
"mean_token_accuracy": 0.7205904841423034,
|
|
"num_tokens": 417715195.0,
|
|
"step": 6635
|
|
},
|
|
{
|
|
"epoch": 21.70049099836334,
|
|
"grad_norm": 13.159041280263851,
|
|
"learning_rate": 2.613295897896842e-10,
|
|
"loss": 1.3396,
|
|
"mean_token_accuracy": 0.7222914814949035,
|
|
"num_tokens": 418032025.0,
|
|
"step": 6640
|
|
},
|
|
{
|
|
"epoch": 21.71685761047463,
|
|
"grad_norm": 13.05845630365163,
|
|
"learning_rate": 2.587965598916342e-10,
|
|
"loss": 1.3535,
|
|
"mean_token_accuracy": 0.7190665304660797,
|
|
"num_tokens": 418346152.0,
|
|
"step": 6645
|
|
},
|
|
{
|
|
"epoch": 21.733224222585925,
|
|
"grad_norm": 12.825672435680955,
|
|
"learning_rate": 2.5627519575142086e-10,
|
|
"loss": 1.3706,
|
|
"mean_token_accuracy": 0.716326767206192,
|
|
"num_tokens": 418661650.0,
|
|
"step": 6650
|
|
},
|
|
{
|
|
"epoch": 21.74959083469722,
|
|
"grad_norm": 12.497004382450312,
|
|
"learning_rate": 2.5376551049307554e-10,
|
|
"loss": 1.3463,
|
|
"mean_token_accuracy": 0.7214847326278686,
|
|
"num_tokens": 418977651.0,
|
|
"step": 6655
|
|
},
|
|
{
|
|
"epoch": 21.76595744680851,
|
|
"grad_norm": 13.241035170630276,
|
|
"learning_rate": 2.5126751717983923e-10,
|
|
"loss": 1.3524,
|
|
"mean_token_accuracy": 0.7203598082065582,
|
|
"num_tokens": 419293460.0,
|
|
"step": 6660
|
|
},
|
|
{
|
|
"epoch": 21.782324058919805,
|
|
"grad_norm": 12.72372468441974,
|
|
"learning_rate": 2.4878122881409446e-10,
|
|
"loss": 1.355,
|
|
"mean_token_accuracy": 0.7196272253990174,
|
|
"num_tokens": 419607370.0,
|
|
"step": 6665
|
|
},
|
|
{
|
|
"epoch": 21.798690671031096,
|
|
"grad_norm": 13.029294249856717,
|
|
"learning_rate": 2.463066583372989e-10,
|
|
"loss": 1.346,
|
|
"mean_token_accuracy": 0.7200318992137908,
|
|
"num_tokens": 419921816.0,
|
|
"step": 6670
|
|
},
|
|
{
|
|
"epoch": 21.81505728314239,
|
|
"grad_norm": 12.912529889183855,
|
|
"learning_rate": 2.4384381862991523e-10,
|
|
"loss": 1.3534,
|
|
"mean_token_accuracy": 0.7199345767498017,
|
|
"num_tokens": 420237025.0,
|
|
"step": 6675
|
|
},
|
|
{
|
|
"epoch": 21.831423895253682,
|
|
"grad_norm": 12.933571334197762,
|
|
"learning_rate": 2.41392722511348e-10,
|
|
"loss": 1.3403,
|
|
"mean_token_accuracy": 0.7232010543346405,
|
|
"num_tokens": 420551721.0,
|
|
"step": 6680
|
|
},
|
|
{
|
|
"epoch": 21.847790507364977,
|
|
"grad_norm": 12.387962123212533,
|
|
"learning_rate": 2.389533827398735e-10,
|
|
"loss": 1.3377,
|
|
"mean_token_accuracy": 0.7233124315738678,
|
|
"num_tokens": 420867419.0,
|
|
"step": 6685
|
|
},
|
|
{
|
|
"epoch": 21.864157119476268,
|
|
"grad_norm": 12.274817137377648,
|
|
"learning_rate": 2.3652581201257547e-10,
|
|
"loss": 1.3473,
|
|
"mean_token_accuracy": 0.7211391031742096,
|
|
"num_tokens": 421182137.0,
|
|
"step": 6690
|
|
},
|
|
{
|
|
"epoch": 21.880523731587562,
|
|
"grad_norm": 12.755298828886199,
|
|
"learning_rate": 2.341100229652779e-10,
|
|
"loss": 1.3506,
|
|
"mean_token_accuracy": 0.7204267501831054,
|
|
"num_tokens": 421498754.0,
|
|
"step": 6695
|
|
},
|
|
{
|
|
"epoch": 21.896890343698853,
|
|
"grad_norm": 13.474176207147067,
|
|
"learning_rate": 2.317060281724795e-10,
|
|
"loss": 1.3818,
|
|
"mean_token_accuracy": 0.7143054306507111,
|
|
"num_tokens": 421814794.0,
|
|
"step": 6700
|
|
},
|
|
{
|
|
"epoch": 21.913256955810148,
|
|
"grad_norm": 12.999095205544377,
|
|
"learning_rate": 2.2931384014728856e-10,
|
|
"loss": 1.364,
|
|
"mean_token_accuracy": 0.7177674889564514,
|
|
"num_tokens": 422131641.0,
|
|
"step": 6705
|
|
},
|
|
{
|
|
"epoch": 21.92962356792144,
|
|
"grad_norm": 12.997861914085577,
|
|
"learning_rate": 2.2693347134135733e-10,
|
|
"loss": 1.3648,
|
|
"mean_token_accuracy": 0.717613697052002,
|
|
"num_tokens": 422447861.0,
|
|
"step": 6710
|
|
},
|
|
{
|
|
"epoch": 21.945990180032734,
|
|
"grad_norm": 13.01026998846727,
|
|
"learning_rate": 2.2456493414481776e-10,
|
|
"loss": 1.3353,
|
|
"mean_token_accuracy": 0.7228084802627563,
|
|
"num_tokens": 422761860.0,
|
|
"step": 6715
|
|
},
|
|
{
|
|
"epoch": 21.962356792144025,
|
|
"grad_norm": 12.99630753112788,
|
|
"learning_rate": 2.2220824088621638e-10,
|
|
"loss": 1.357,
|
|
"mean_token_accuracy": 0.7195675671100616,
|
|
"num_tokens": 423075970.0,
|
|
"step": 6720
|
|
},
|
|
{
|
|
"epoch": 21.97872340425532,
|
|
"grad_norm": 12.551715322951035,
|
|
"learning_rate": 2.1986340383245152e-10,
|
|
"loss": 1.3367,
|
|
"mean_token_accuracy": 0.7229941666126252,
|
|
"num_tokens": 423391794.0,
|
|
"step": 6725
|
|
},
|
|
{
|
|
"epoch": 21.99509001636661,
|
|
"grad_norm": 12.632402232140949,
|
|
"learning_rate": 2.1753043518870613e-10,
|
|
"loss": 1.3371,
|
|
"mean_token_accuracy": 0.722889506816864,
|
|
"num_tokens": 423708843.0,
|
|
"step": 6730
|
|
},
|
|
{
|
|
"epoch": 22.009819967266775,
|
|
"grad_norm": 12.886380554794895,
|
|
"learning_rate": 2.1520934709838901e-10,
|
|
"loss": 1.3502,
|
|
"mean_token_accuracy": 0.7217356893751357,
|
|
"num_tokens": 423970325.0,
|
|
"step": 6735
|
|
},
|
|
{
|
|
"epoch": 22.02618657937807,
|
|
"grad_norm": 13.370664170255838,
|
|
"learning_rate": 2.1290015164306758e-10,
|
|
"loss": 1.3538,
|
|
"mean_token_accuracy": 0.7191697001457215,
|
|
"num_tokens": 424285761.0,
|
|
"step": 6740
|
|
},
|
|
{
|
|
"epoch": 22.04255319148936,
|
|
"grad_norm": 12.845599012476232,
|
|
"learning_rate": 2.1060286084240738e-10,
|
|
"loss": 1.3512,
|
|
"mean_token_accuracy": 0.7195539712905884,
|
|
"num_tokens": 424601367.0,
|
|
"step": 6745
|
|
},
|
|
{
|
|
"epoch": 22.058919803600656,
|
|
"grad_norm": 13.358957826186089,
|
|
"learning_rate": 2.0831748665410767e-10,
|
|
"loss": 1.3607,
|
|
"mean_token_accuracy": 0.7183383524417877,
|
|
"num_tokens": 424918318.0,
|
|
"step": 6750
|
|
},
|
|
{
|
|
"epoch": 22.075286415711947,
|
|
"grad_norm": 12.603249648959288,
|
|
"learning_rate": 2.0604404097384178e-10,
|
|
"loss": 1.3344,
|
|
"mean_token_accuracy": 0.7244606792926789,
|
|
"num_tokens": 425235565.0,
|
|
"step": 6755
|
|
},
|
|
{
|
|
"epoch": 22.09165302782324,
|
|
"grad_norm": 12.673344237200768,
|
|
"learning_rate": 2.0378253563519245e-10,
|
|
"loss": 1.3364,
|
|
"mean_token_accuracy": 0.7232702493667602,
|
|
"num_tokens": 425549227.0,
|
|
"step": 6760
|
|
},
|
|
{
|
|
"epoch": 22.108019639934533,
|
|
"grad_norm": 12.991911952026106,
|
|
"learning_rate": 2.01532982409591e-10,
|
|
"loss": 1.3644,
|
|
"mean_token_accuracy": 0.7179206192493439,
|
|
"num_tokens": 425866443.0,
|
|
"step": 6765
|
|
},
|
|
{
|
|
"epoch": 22.124386252045827,
|
|
"grad_norm": 13.238495456255036,
|
|
"learning_rate": 1.9929539300625744e-10,
|
|
"loss": 1.3388,
|
|
"mean_token_accuracy": 0.7227777302265167,
|
|
"num_tokens": 426181410.0,
|
|
"step": 6770
|
|
},
|
|
{
|
|
"epoch": 22.14075286415712,
|
|
"grad_norm": 13.432269527486834,
|
|
"learning_rate": 1.9706977907213763e-10,
|
|
"loss": 1.3639,
|
|
"mean_token_accuracy": 0.717751395702362,
|
|
"num_tokens": 426497130.0,
|
|
"step": 6775
|
|
},
|
|
{
|
|
"epoch": 22.157119476268413,
|
|
"grad_norm": 12.539273082066268,
|
|
"learning_rate": 1.948561521918446e-10,
|
|
"loss": 1.3424,
|
|
"mean_token_accuracy": 0.7216863572597504,
|
|
"num_tokens": 426812100.0,
|
|
"step": 6780
|
|
},
|
|
{
|
|
"epoch": 22.173486088379704,
|
|
"grad_norm": 13.243823023413835,
|
|
"learning_rate": 1.9265452388759652e-10,
|
|
"loss": 1.3539,
|
|
"mean_token_accuracy": 0.7189646363258362,
|
|
"num_tokens": 427125130.0,
|
|
"step": 6785
|
|
},
|
|
{
|
|
"epoch": 22.189852700491,
|
|
"grad_norm": 13.079190798755858,
|
|
"learning_rate": 1.9046490561915708e-10,
|
|
"loss": 1.3606,
|
|
"mean_token_accuracy": 0.7185764789581299,
|
|
"num_tokens": 427442723.0,
|
|
"step": 6790
|
|
},
|
|
{
|
|
"epoch": 22.20621931260229,
|
|
"grad_norm": 12.959085959087055,
|
|
"learning_rate": 1.8828730878377638e-10,
|
|
"loss": 1.3423,
|
|
"mean_token_accuracy": 0.7217658877372741,
|
|
"num_tokens": 427758619.0,
|
|
"step": 6795
|
|
},
|
|
{
|
|
"epoch": 22.222585924713584,
|
|
"grad_norm": 12.599950436626239,
|
|
"learning_rate": 1.8612174471613174e-10,
|
|
"loss": 1.3487,
|
|
"mean_token_accuracy": 0.7207997500896454,
|
|
"num_tokens": 428074285.0,
|
|
"step": 6800
|
|
},
|
|
{
|
|
"epoch": 22.238952536824875,
|
|
"grad_norm": 13.44831286777348,
|
|
"learning_rate": 1.8396822468826819e-10,
|
|
"loss": 1.3766,
|
|
"mean_token_accuracy": 0.7160114467144012,
|
|
"num_tokens": 428390105.0,
|
|
"step": 6805
|
|
},
|
|
{
|
|
"epoch": 22.25531914893617,
|
|
"grad_norm": 13.214936849964493,
|
|
"learning_rate": 1.8182675990954022e-10,
|
|
"loss": 1.3517,
|
|
"mean_token_accuracy": 0.7194897770881653,
|
|
"num_tokens": 428705606.0,
|
|
"step": 6810
|
|
},
|
|
{
|
|
"epoch": 22.271685761047465,
|
|
"grad_norm": 13.11981613970459,
|
|
"learning_rate": 1.7969736152655237e-10,
|
|
"loss": 1.3591,
|
|
"mean_token_accuracy": 0.7187436342239379,
|
|
"num_tokens": 429020487.0,
|
|
"step": 6815
|
|
},
|
|
{
|
|
"epoch": 22.288052373158756,
|
|
"grad_norm": 12.507560434328925,
|
|
"learning_rate": 1.775800406231026e-10,
|
|
"loss": 1.3439,
|
|
"mean_token_accuracy": 0.7216341137886048,
|
|
"num_tokens": 429336937.0,
|
|
"step": 6820
|
|
},
|
|
{
|
|
"epoch": 22.30441898527005,
|
|
"grad_norm": 12.734388947281815,
|
|
"learning_rate": 1.7547480822012408e-10,
|
|
"loss": 1.3453,
|
|
"mean_token_accuracy": 0.7207939743995666,
|
|
"num_tokens": 429653469.0,
|
|
"step": 6825
|
|
},
|
|
{
|
|
"epoch": 22.32078559738134,
|
|
"grad_norm": 12.63205594092822,
|
|
"learning_rate": 1.7338167527562732e-10,
|
|
"loss": 1.3377,
|
|
"mean_token_accuracy": 0.7222929179668427,
|
|
"num_tokens": 429969131.0,
|
|
"step": 6830
|
|
},
|
|
{
|
|
"epoch": 22.337152209492636,
|
|
"grad_norm": 13.162783662994846,
|
|
"learning_rate": 1.713006526846439e-10,
|
|
"loss": 1.3487,
|
|
"mean_token_accuracy": 0.7200253903865814,
|
|
"num_tokens": 430284447.0,
|
|
"step": 6835
|
|
},
|
|
{
|
|
"epoch": 22.353518821603927,
|
|
"grad_norm": 12.60058719095576,
|
|
"learning_rate": 1.6923175127916994e-10,
|
|
"loss": 1.3484,
|
|
"mean_token_accuracy": 0.7198909163475037,
|
|
"num_tokens": 430599699.0,
|
|
"step": 6840
|
|
},
|
|
{
|
|
"epoch": 22.369885433715222,
|
|
"grad_norm": 12.855197780418354,
|
|
"learning_rate": 1.6717498182810765e-10,
|
|
"loss": 1.3405,
|
|
"mean_token_accuracy": 0.7213269472122192,
|
|
"num_tokens": 430913774.0,
|
|
"step": 6845
|
|
},
|
|
{
|
|
"epoch": 22.386252045826513,
|
|
"grad_norm": 12.782693683323123,
|
|
"learning_rate": 1.6513035503721212e-10,
|
|
"loss": 1.3462,
|
|
"mean_token_accuracy": 0.7212722063064575,
|
|
"num_tokens": 431227584.0,
|
|
"step": 6850
|
|
},
|
|
{
|
|
"epoch": 22.402618657937808,
|
|
"grad_norm": 12.563522351858756,
|
|
"learning_rate": 1.630978815490339e-10,
|
|
"loss": 1.3287,
|
|
"mean_token_accuracy": 0.7248152911663055,
|
|
"num_tokens": 431543507.0,
|
|
"step": 6855
|
|
},
|
|
{
|
|
"epoch": 22.4189852700491,
|
|
"grad_norm": 12.861715310841179,
|
|
"learning_rate": 1.610775719428642e-10,
|
|
"loss": 1.3492,
|
|
"mean_token_accuracy": 0.7201033174991608,
|
|
"num_tokens": 431858223.0,
|
|
"step": 6860
|
|
},
|
|
{
|
|
"epoch": 22.435351882160393,
|
|
"grad_norm": 13.174983537506147,
|
|
"learning_rate": 1.5906943673467955e-10,
|
|
"loss": 1.3467,
|
|
"mean_token_accuracy": 0.721436756849289,
|
|
"num_tokens": 432173885.0,
|
|
"step": 6865
|
|
},
|
|
{
|
|
"epoch": 22.451718494271685,
|
|
"grad_norm": 12.86280817976593,
|
|
"learning_rate": 1.5707348637708674e-10,
|
|
"loss": 1.3527,
|
|
"mean_token_accuracy": 0.7199465811252594,
|
|
"num_tokens": 432489890.0,
|
|
"step": 6870
|
|
},
|
|
{
|
|
"epoch": 22.46808510638298,
|
|
"grad_norm": 13.075985796707062,
|
|
"learning_rate": 1.5508973125926918e-10,
|
|
"loss": 1.3635,
|
|
"mean_token_accuracy": 0.7167506515979767,
|
|
"num_tokens": 432805161.0,
|
|
"step": 6875
|
|
},
|
|
{
|
|
"epoch": 22.48445171849427,
|
|
"grad_norm": 12.930055267973149,
|
|
"learning_rate": 1.531181817069327e-10,
|
|
"loss": 1.3514,
|
|
"mean_token_accuracy": 0.7211288928985595,
|
|
"num_tokens": 433121001.0,
|
|
"step": 6880
|
|
},
|
|
{
|
|
"epoch": 22.500818330605565,
|
|
"grad_norm": 12.801291847440664,
|
|
"learning_rate": 1.5115884798225122e-10,
|
|
"loss": 1.3521,
|
|
"mean_token_accuracy": 0.7190639019012451,
|
|
"num_tokens": 433435828.0,
|
|
"step": 6885
|
|
},
|
|
{
|
|
"epoch": 22.517184942716856,
|
|
"grad_norm": 13.245523412336043,
|
|
"learning_rate": 1.4921174028381362e-10,
|
|
"loss": 1.3559,
|
|
"mean_token_accuracy": 0.7189247727394104,
|
|
"num_tokens": 433750208.0,
|
|
"step": 6890
|
|
},
|
|
{
|
|
"epoch": 22.53355155482815,
|
|
"grad_norm": 13.490417527494259,
|
|
"learning_rate": 1.4727686874657143e-10,
|
|
"loss": 1.3482,
|
|
"mean_token_accuracy": 0.720479530096054,
|
|
"num_tokens": 434066217.0,
|
|
"step": 6895
|
|
},
|
|
{
|
|
"epoch": 22.54991816693944,
|
|
"grad_norm": 12.307057662694007,
|
|
"learning_rate": 1.4535424344178372e-10,
|
|
"loss": 1.3303,
|
|
"mean_token_accuracy": 0.7245355308055877,
|
|
"num_tokens": 434382326.0,
|
|
"step": 6900
|
|
},
|
|
{
|
|
"epoch": 22.566284779050736,
|
|
"grad_norm": 12.389134965064018,
|
|
"learning_rate": 1.4344387437696781e-10,
|
|
"loss": 1.3513,
|
|
"mean_token_accuracy": 0.7200321733951569,
|
|
"num_tokens": 434697233.0,
|
|
"step": 6905
|
|
},
|
|
{
|
|
"epoch": 22.58265139116203,
|
|
"grad_norm": 12.910124027424311,
|
|
"learning_rate": 1.4154577149584542e-10,
|
|
"loss": 1.3451,
|
|
"mean_token_accuracy": 0.7210674345493316,
|
|
"num_tokens": 435012252.0,
|
|
"step": 6910
|
|
},
|
|
{
|
|
"epoch": 22.599018003273322,
|
|
"grad_norm": 13.600220736718454,
|
|
"learning_rate": 1.396599446782909e-10,
|
|
"loss": 1.339,
|
|
"mean_token_accuracy": 0.7226611793041229,
|
|
"num_tokens": 435328644.0,
|
|
"step": 6915
|
|
},
|
|
{
|
|
"epoch": 22.615384615384617,
|
|
"grad_norm": 12.52738378390851,
|
|
"learning_rate": 1.3778640374027983e-10,
|
|
"loss": 1.336,
|
|
"mean_token_accuracy": 0.7228205442428589,
|
|
"num_tokens": 435642951.0,
|
|
"step": 6920
|
|
},
|
|
{
|
|
"epoch": 22.631751227495908,
|
|
"grad_norm": 13.62863010832926,
|
|
"learning_rate": 1.359251584338389e-10,
|
|
"loss": 1.3641,
|
|
"mean_token_accuracy": 0.7171992361545563,
|
|
"num_tokens": 435958506.0,
|
|
"step": 6925
|
|
},
|
|
{
|
|
"epoch": 22.648117839607202,
|
|
"grad_norm": 12.552383076369432,
|
|
"learning_rate": 1.3407621844699374e-10,
|
|
"loss": 1.3317,
|
|
"mean_token_accuracy": 0.7235641539096832,
|
|
"num_tokens": 436276318.0,
|
|
"step": 6930
|
|
},
|
|
{
|
|
"epoch": 22.664484451718494,
|
|
"grad_norm": 13.664599298303148,
|
|
"learning_rate": 1.322395934037199e-10,
|
|
"loss": 1.3645,
|
|
"mean_token_accuracy": 0.7180260837078094,
|
|
"num_tokens": 436593498.0,
|
|
"step": 6935
|
|
},
|
|
{
|
|
"epoch": 22.680851063829788,
|
|
"grad_norm": 13.351252233873854,
|
|
"learning_rate": 1.3041529286389076e-10,
|
|
"loss": 1.3621,
|
|
"mean_token_accuracy": 0.7189351558685303,
|
|
"num_tokens": 436909048.0,
|
|
"step": 6940
|
|
},
|
|
{
|
|
"epoch": 22.69721767594108,
|
|
"grad_norm": 13.465048819018577,
|
|
"learning_rate": 1.2860332632323085e-10,
|
|
"loss": 1.3742,
|
|
"mean_token_accuracy": 0.7154442071914673,
|
|
"num_tokens": 437225143.0,
|
|
"step": 6945
|
|
},
|
|
{
|
|
"epoch": 22.713584288052374,
|
|
"grad_norm": 13.822851613415096,
|
|
"learning_rate": 1.2680370321326323e-10,
|
|
"loss": 1.3526,
|
|
"mean_token_accuracy": 0.7198641777038575,
|
|
"num_tokens": 437541140.0,
|
|
"step": 6950
|
|
},
|
|
{
|
|
"epoch": 22.729950900163665,
|
|
"grad_norm": 12.152454968770806,
|
|
"learning_rate": 1.2501643290126263e-10,
|
|
"loss": 1.3428,
|
|
"mean_token_accuracy": 0.7212522566318512,
|
|
"num_tokens": 437854378.0,
|
|
"step": 6955
|
|
},
|
|
{
|
|
"epoch": 22.74631751227496,
|
|
"grad_norm": 12.660960103280965,
|
|
"learning_rate": 1.2324152469020465e-10,
|
|
"loss": 1.3406,
|
|
"mean_token_accuracy": 0.7219815969467163,
|
|
"num_tokens": 438169410.0,
|
|
"step": 6960
|
|
},
|
|
{
|
|
"epoch": 22.76268412438625,
|
|
"grad_norm": 12.902062843477703,
|
|
"learning_rate": 1.2147898781871974e-10,
|
|
"loss": 1.3517,
|
|
"mean_token_accuracy": 0.7200750589370728,
|
|
"num_tokens": 438484479.0,
|
|
"step": 6965
|
|
},
|
|
{
|
|
"epoch": 22.779050736497545,
|
|
"grad_norm": 12.874574336103201,
|
|
"learning_rate": 1.197288314610434e-10,
|
|
"loss": 1.3485,
|
|
"mean_token_accuracy": 0.7214947521686554,
|
|
"num_tokens": 438801983.0,
|
|
"step": 6970
|
|
},
|
|
{
|
|
"epoch": 22.795417348608837,
|
|
"grad_norm": 12.866352405422353,
|
|
"learning_rate": 1.1799106472696912e-10,
|
|
"loss": 1.3536,
|
|
"mean_token_accuracy": 0.7188063561916351,
|
|
"num_tokens": 439118078.0,
|
|
"step": 6975
|
|
},
|
|
{
|
|
"epoch": 22.81178396072013,
|
|
"grad_norm": 12.925726362826204,
|
|
"learning_rate": 1.1626569666180031e-10,
|
|
"loss": 1.367,
|
|
"mean_token_accuracy": 0.7190398752689362,
|
|
"num_tokens": 439432429.0,
|
|
"step": 6980
|
|
},
|
|
{
|
|
"epoch": 22.828150572831422,
|
|
"grad_norm": 12.56058886584475,
|
|
"learning_rate": 1.1455273624630419e-10,
|
|
"loss": 1.341,
|
|
"mean_token_accuracy": 0.7224170148372651,
|
|
"num_tokens": 439748343.0,
|
|
"step": 6985
|
|
},
|
|
{
|
|
"epoch": 22.844517184942717,
|
|
"grad_norm": 12.931255482808035,
|
|
"learning_rate": 1.1285219239666467e-10,
|
|
"loss": 1.347,
|
|
"mean_token_accuracy": 0.7204574346542358,
|
|
"num_tokens": 440064663.0,
|
|
"step": 6990
|
|
},
|
|
{
|
|
"epoch": 22.86088379705401,
|
|
"grad_norm": 12.603505356494258,
|
|
"learning_rate": 1.111640739644354e-10,
|
|
"loss": 1.3633,
|
|
"mean_token_accuracy": 0.7168758630752563,
|
|
"num_tokens": 440381472.0,
|
|
"step": 6995
|
|
},
|
|
{
|
|
"epoch": 22.877250409165303,
|
|
"grad_norm": 13.503457434866782,
|
|
"learning_rate": 1.0948838973649372e-10,
|
|
"loss": 1.3557,
|
|
"mean_token_accuracy": 0.7181106150150299,
|
|
"num_tokens": 440698593.0,
|
|
"step": 7000
|
|
},
|
|
{
|
|
"epoch": 22.893617021276597,
|
|
"grad_norm": 13.288625175162997,
|
|
"learning_rate": 1.0782514843499652e-10,
|
|
"loss": 1.3569,
|
|
"mean_token_accuracy": 0.7186809659004212,
|
|
"num_tokens": 441014934.0,
|
|
"step": 7005
|
|
},
|
|
{
|
|
"epoch": 22.90998363338789,
|
|
"grad_norm": 13.067345412316314,
|
|
"learning_rate": 1.0617435871733277e-10,
|
|
"loss": 1.3531,
|
|
"mean_token_accuracy": 0.719657689332962,
|
|
"num_tokens": 441331673.0,
|
|
"step": 7010
|
|
},
|
|
{
|
|
"epoch": 22.926350245499183,
|
|
"grad_norm": 12.297508956037928,
|
|
"learning_rate": 1.0453602917607885e-10,
|
|
"loss": 1.3357,
|
|
"mean_token_accuracy": 0.7232572436332703,
|
|
"num_tokens": 441646020.0,
|
|
"step": 7015
|
|
},
|
|
{
|
|
"epoch": 22.942716857610474,
|
|
"grad_norm": 12.411782822458209,
|
|
"learning_rate": 1.029101683389555e-10,
|
|
"loss": 1.3627,
|
|
"mean_token_accuracy": 0.7166090369224548,
|
|
"num_tokens": 441960330.0,
|
|
"step": 7020
|
|
},
|
|
{
|
|
"epoch": 22.95908346972177,
|
|
"grad_norm": 12.400946609701034,
|
|
"learning_rate": 1.0129678466878123e-10,
|
|
"loss": 1.3347,
|
|
"mean_token_accuracy": 0.7229482293128967,
|
|
"num_tokens": 442277761.0,
|
|
"step": 7025
|
|
},
|
|
{
|
|
"epoch": 22.97545008183306,
|
|
"grad_norm": 13.114720694822743,
|
|
"learning_rate": 9.969588656342981e-11,
|
|
"loss": 1.3389,
|
|
"mean_token_accuracy": 0.7220352053642273,
|
|
"num_tokens": 442593144.0,
|
|
"step": 7030
|
|
},
|
|
{
|
|
"epoch": 22.991816693944354,
|
|
"grad_norm": 12.880875573574846,
|
|
"learning_rate": 9.810748235578592e-11,
|
|
"loss": 1.3402,
|
|
"mean_token_accuracy": 0.7220639884471893,
|
|
"num_tokens": 442908607.0,
|
|
"step": 7035
|
|
},
|
|
{
|
|
"epoch": 23.006546644844516,
|
|
"grad_norm": 12.885175681405324,
|
|
"learning_rate": 9.653158031370152e-11,
|
|
"loss": 1.3477,
|
|
"mean_token_accuracy": 0.7198513878716363,
|
|
"num_tokens": 443169643.0,
|
|
"step": 7040
|
|
},
|
|
{
|
|
"epoch": 23.02291325695581,
|
|
"grad_norm": 12.979282554677258,
|
|
"learning_rate": 9.496818863995365e-11,
|
|
"loss": 1.3509,
|
|
"mean_token_accuracy": 0.7200859129428864,
|
|
"num_tokens": 443483874.0,
|
|
"step": 7045
|
|
},
|
|
{
|
|
"epoch": 23.0392798690671,
|
|
"grad_norm": 12.65359546368029,
|
|
"learning_rate": 9.341731547220094e-11,
|
|
"loss": 1.3582,
|
|
"mean_token_accuracy": 0.7197104752063751,
|
|
"num_tokens": 443799489.0,
|
|
"step": 7050
|
|
},
|
|
{
|
|
"epoch": 23.055646481178396,
|
|
"grad_norm": 13.24581632006229,
|
|
"learning_rate": 9.187896888294189e-11,
|
|
"loss": 1.3628,
|
|
"mean_token_accuracy": 0.7178315281867981,
|
|
"num_tokens": 444115077.0,
|
|
"step": 7055
|
|
},
|
|
{
|
|
"epoch": 23.07201309328969,
|
|
"grad_norm": 12.786295664937347,
|
|
"learning_rate": 9.03531568794716e-11,
|
|
"loss": 1.3401,
|
|
"mean_token_accuracy": 0.7225833356380462,
|
|
"num_tokens": 444429590.0,
|
|
"step": 7060
|
|
},
|
|
{
|
|
"epoch": 23.088379705400982,
|
|
"grad_norm": 13.64398814950113,
|
|
"learning_rate": 8.883988740384264e-11,
|
|
"loss": 1.3619,
|
|
"mean_token_accuracy": 0.7186536669731141,
|
|
"num_tokens": 444746524.0,
|
|
"step": 7065
|
|
},
|
|
{
|
|
"epoch": 23.104746317512276,
|
|
"grad_norm": 12.267877353989375,
|
|
"learning_rate": 8.733916833282008e-11,
|
|
"loss": 1.3198,
|
|
"mean_token_accuracy": 0.7258092045783997,
|
|
"num_tokens": 445062057.0,
|
|
"step": 7070
|
|
},
|
|
{
|
|
"epoch": 23.121112929623568,
|
|
"grad_norm": 12.485433355461446,
|
|
"learning_rate": 8.585100747784374e-11,
|
|
"loss": 1.3392,
|
|
"mean_token_accuracy": 0.7215893030166626,
|
|
"num_tokens": 445378595.0,
|
|
"step": 7075
|
|
},
|
|
{
|
|
"epoch": 23.137479541734862,
|
|
"grad_norm": 13.150909792468429,
|
|
"learning_rate": 8.437541258498633e-11,
|
|
"loss": 1.3416,
|
|
"mean_token_accuracy": 0.7213694810867309,
|
|
"num_tokens": 445695024.0,
|
|
"step": 7080
|
|
},
|
|
{
|
|
"epoch": 23.153846153846153,
|
|
"grad_norm": 12.56982379223737,
|
|
"learning_rate": 8.29123913349128e-11,
|
|
"loss": 1.3451,
|
|
"mean_token_accuracy": 0.721850723028183,
|
|
"num_tokens": 446013483.0,
|
|
"step": 7085
|
|
},
|
|
{
|
|
"epoch": 23.170212765957448,
|
|
"grad_norm": 12.74917405617213,
|
|
"learning_rate": 8.146195134284052e-11,
|
|
"loss": 1.3256,
|
|
"mean_token_accuracy": 0.7241310834884643,
|
|
"num_tokens": 446331438.0,
|
|
"step": 7090
|
|
},
|
|
{
|
|
"epoch": 23.18657937806874,
|
|
"grad_norm": 12.861658218052662,
|
|
"learning_rate": 8.002410015849948e-11,
|
|
"loss": 1.3507,
|
|
"mean_token_accuracy": 0.7200630605220795,
|
|
"num_tokens": 446646339.0,
|
|
"step": 7095
|
|
},
|
|
{
|
|
"epoch": 23.202945990180034,
|
|
"grad_norm": 12.629385644245952,
|
|
"learning_rate": 7.859884526609434e-11,
|
|
"loss": 1.3431,
|
|
"mean_token_accuracy": 0.7209880650043488,
|
|
"num_tokens": 446961928.0,
|
|
"step": 7100
|
|
},
|
|
{
|
|
"epoch": 23.219312602291325,
|
|
"grad_norm": 13.07788475145339,
|
|
"learning_rate": 7.718619408426358e-11,
|
|
"loss": 1.3566,
|
|
"mean_token_accuracy": 0.7183891952037811,
|
|
"num_tokens": 447277873.0,
|
|
"step": 7105
|
|
},
|
|
{
|
|
"epoch": 23.23567921440262,
|
|
"grad_norm": 12.952389492992099,
|
|
"learning_rate": 7.578615396604149e-11,
|
|
"loss": 1.3488,
|
|
"mean_token_accuracy": 0.7210026741027832,
|
|
"num_tokens": 447593799.0,
|
|
"step": 7110
|
|
},
|
|
{
|
|
"epoch": 23.25204582651391,
|
|
"grad_norm": 12.743509865987193,
|
|
"learning_rate": 7.439873219882098e-11,
|
|
"loss": 1.3422,
|
|
"mean_token_accuracy": 0.721809321641922,
|
|
"num_tokens": 447908979.0,
|
|
"step": 7115
|
|
},
|
|
{
|
|
"epoch": 23.268412438625205,
|
|
"grad_norm": 12.783610210000228,
|
|
"learning_rate": 7.30239360043139e-11,
|
|
"loss": 1.3338,
|
|
"mean_token_accuracy": 0.7223511099815368,
|
|
"num_tokens": 448224475.0,
|
|
"step": 7120
|
|
},
|
|
{
|
|
"epoch": 23.284779050736496,
|
|
"grad_norm": 12.445475133174945,
|
|
"learning_rate": 7.166177253851491e-11,
|
|
"loss": 1.3395,
|
|
"mean_token_accuracy": 0.721209728717804,
|
|
"num_tokens": 448540511.0,
|
|
"step": 7125
|
|
},
|
|
{
|
|
"epoch": 23.30114566284779,
|
|
"grad_norm": 13.170033737273622,
|
|
"learning_rate": 7.031224889166326e-11,
|
|
"loss": 1.3473,
|
|
"mean_token_accuracy": 0.7205008029937744,
|
|
"num_tokens": 448856961.0,
|
|
"step": 7130
|
|
},
|
|
{
|
|
"epoch": 23.317512274959082,
|
|
"grad_norm": 12.733430417544065,
|
|
"learning_rate": 6.89753720882072e-11,
|
|
"loss": 1.3572,
|
|
"mean_token_accuracy": 0.7191106677055359,
|
|
"num_tokens": 449172747.0,
|
|
"step": 7135
|
|
},
|
|
{
|
|
"epoch": 23.333878887070377,
|
|
"grad_norm": 12.820932566449374,
|
|
"learning_rate": 6.765114908676512e-11,
|
|
"loss": 1.3541,
|
|
"mean_token_accuracy": 0.7197527647018432,
|
|
"num_tokens": 449489574.0,
|
|
"step": 7140
|
|
},
|
|
{
|
|
"epoch": 23.350245499181668,
|
|
"grad_norm": 12.334727024147877,
|
|
"learning_rate": 6.633958678009172e-11,
|
|
"loss": 1.3572,
|
|
"mean_token_accuracy": 0.7184008717536926,
|
|
"num_tokens": 449805224.0,
|
|
"step": 7145
|
|
},
|
|
{
|
|
"epoch": 23.366612111292962,
|
|
"grad_norm": 13.243260617772965,
|
|
"learning_rate": 6.504069199504081e-11,
|
|
"loss": 1.3533,
|
|
"mean_token_accuracy": 0.720440012216568,
|
|
"num_tokens": 450121499.0,
|
|
"step": 7150
|
|
},
|
|
{
|
|
"epoch": 23.382978723404257,
|
|
"grad_norm": 13.111364694962516,
|
|
"learning_rate": 6.375447149253005e-11,
|
|
"loss": 1.3438,
|
|
"mean_token_accuracy": 0.7227679491043091,
|
|
"num_tokens": 450438216.0,
|
|
"step": 7155
|
|
},
|
|
{
|
|
"epoch": 23.399345335515548,
|
|
"grad_norm": 13.02081588241417,
|
|
"learning_rate": 6.24809319675057e-11,
|
|
"loss": 1.3588,
|
|
"mean_token_accuracy": 0.718359899520874,
|
|
"num_tokens": 450753408.0,
|
|
"step": 7160
|
|
},
|
|
{
|
|
"epoch": 23.415711947626843,
|
|
"grad_norm": 16.994427042019105,
|
|
"learning_rate": 6.12200800489085e-11,
|
|
"loss": 1.3611,
|
|
"mean_token_accuracy": 0.7182079792022705,
|
|
"num_tokens": 451069176.0,
|
|
"step": 7165
|
|
},
|
|
{
|
|
"epoch": 23.432078559738134,
|
|
"grad_norm": 12.825635117115837,
|
|
"learning_rate": 5.997192229963727e-11,
|
|
"loss": 1.3464,
|
|
"mean_token_accuracy": 0.720733916759491,
|
|
"num_tokens": 451384214.0,
|
|
"step": 7170
|
|
},
|
|
{
|
|
"epoch": 23.44844517184943,
|
|
"grad_norm": 12.356272462085352,
|
|
"learning_rate": 5.873646521651759e-11,
|
|
"loss": 1.35,
|
|
"mean_token_accuracy": 0.7204180717468261,
|
|
"num_tokens": 451700388.0,
|
|
"step": 7175
|
|
},
|
|
{
|
|
"epoch": 23.46481178396072,
|
|
"grad_norm": 13.158569750292818,
|
|
"learning_rate": 5.7513715230265165e-11,
|
|
"loss": 1.3539,
|
|
"mean_token_accuracy": 0.7190254867076874,
|
|
"num_tokens": 452015367.0,
|
|
"step": 7180
|
|
},
|
|
{
|
|
"epoch": 23.481178396072014,
|
|
"grad_norm": 12.094869366015358,
|
|
"learning_rate": 5.630367870545411e-11,
|
|
"loss": 1.3408,
|
|
"mean_token_accuracy": 0.722713416814804,
|
|
"num_tokens": 452329550.0,
|
|
"step": 7185
|
|
},
|
|
{
|
|
"epoch": 23.497545008183305,
|
|
"grad_norm": 11.893876981267749,
|
|
"learning_rate": 5.510636194048318e-11,
|
|
"loss": 1.3471,
|
|
"mean_token_accuracy": 0.7201150059700012,
|
|
"num_tokens": 452644884.0,
|
|
"step": 7190
|
|
},
|
|
{
|
|
"epoch": 23.5139116202946,
|
|
"grad_norm": 12.109253702866202,
|
|
"learning_rate": 5.3921771167542985e-11,
|
|
"loss": 1.3517,
|
|
"mean_token_accuracy": 0.7199622452259063,
|
|
"num_tokens": 452959253.0,
|
|
"step": 7195
|
|
},
|
|
{
|
|
"epoch": 23.53027823240589,
|
|
"grad_norm": 13.099421347139643,
|
|
"learning_rate": 5.274991255258432e-11,
|
|
"loss": 1.3623,
|
|
"mean_token_accuracy": 0.7182047188282012,
|
|
"num_tokens": 453273720.0,
|
|
"step": 7200
|
|
},
|
|
{
|
|
"epoch": 23.546644844517186,
|
|
"grad_norm": 12.605437677197763,
|
|
"learning_rate": 5.1590792195284616e-11,
|
|
"loss": 1.3436,
|
|
"mean_token_accuracy": 0.7211773514747619,
|
|
"num_tokens": 453589369.0,
|
|
"step": 7205
|
|
},
|
|
{
|
|
"epoch": 23.563011456628477,
|
|
"grad_norm": 13.084592752849911,
|
|
"learning_rate": 5.044441612901768e-11,
|
|
"loss": 1.3417,
|
|
"mean_token_accuracy": 0.7220472991466522,
|
|
"num_tokens": 453906277.0,
|
|
"step": 7210
|
|
},
|
|
{
|
|
"epoch": 23.57937806873977,
|
|
"grad_norm": 12.357535357551479,
|
|
"learning_rate": 4.931079032082092e-11,
|
|
"loss": 1.3437,
|
|
"mean_token_accuracy": 0.7210754454135895,
|
|
"num_tokens": 454222243.0,
|
|
"step": 7215
|
|
},
|
|
{
|
|
"epoch": 23.595744680851062,
|
|
"grad_norm": 13.283999157997714,
|
|
"learning_rate": 4.8189920671365405e-11,
|
|
"loss": 1.3503,
|
|
"mean_token_accuracy": 0.7204611420631408,
|
|
"num_tokens": 454537914.0,
|
|
"step": 7220
|
|
},
|
|
{
|
|
"epoch": 23.612111292962357,
|
|
"grad_norm": 12.647991933541059,
|
|
"learning_rate": 4.7081813014924755e-11,
|
|
"loss": 1.3495,
|
|
"mean_token_accuracy": 0.720174902677536,
|
|
"num_tokens": 454853441.0,
|
|
"step": 7225
|
|
},
|
|
{
|
|
"epoch": 23.628477905073648,
|
|
"grad_norm": 12.833034997500734,
|
|
"learning_rate": 4.598647311934462e-11,
|
|
"loss": 1.3365,
|
|
"mean_token_accuracy": 0.722919511795044,
|
|
"num_tokens": 455169051.0,
|
|
"step": 7230
|
|
},
|
|
{
|
|
"epoch": 23.644844517184943,
|
|
"grad_norm": 12.726194071489417,
|
|
"learning_rate": 4.490390668601296e-11,
|
|
"loss": 1.3635,
|
|
"mean_token_accuracy": 0.7177496433258057,
|
|
"num_tokens": 455483328.0,
|
|
"step": 7235
|
|
},
|
|
{
|
|
"epoch": 23.661211129296234,
|
|
"grad_norm": 12.836985723011823,
|
|
"learning_rate": 4.383411934983012e-11,
|
|
"loss": 1.3529,
|
|
"mean_token_accuracy": 0.7187625408172608,
|
|
"num_tokens": 455798691.0,
|
|
"step": 7240
|
|
},
|
|
{
|
|
"epoch": 23.67757774140753,
|
|
"grad_norm": 12.369616415044517,
|
|
"learning_rate": 4.277711667917877e-11,
|
|
"loss": 1.3202,
|
|
"mean_token_accuracy": 0.7258556962013245,
|
|
"num_tokens": 456113780.0,
|
|
"step": 7245
|
|
},
|
|
{
|
|
"epoch": 23.693944353518823,
|
|
"grad_norm": 13.777116689233463,
|
|
"learning_rate": 4.173290417589737e-11,
|
|
"loss": 1.3503,
|
|
"mean_token_accuracy": 0.7203498423099518,
|
|
"num_tokens": 456429261.0,
|
|
"step": 7250
|
|
},
|
|
{
|
|
"epoch": 23.710310965630114,
|
|
"grad_norm": 13.33248130407782,
|
|
"learning_rate": 4.070148727524814e-11,
|
|
"loss": 1.3794,
|
|
"mean_token_accuracy": 0.7169650435447693,
|
|
"num_tokens": 456744751.0,
|
|
"step": 7255
|
|
},
|
|
{
|
|
"epoch": 23.72667757774141,
|
|
"grad_norm": 12.73782241993151,
|
|
"learning_rate": 3.968287134589188e-11,
|
|
"loss": 1.3483,
|
|
"mean_token_accuracy": 0.7209219753742218,
|
|
"num_tokens": 457061525.0,
|
|
"step": 7260
|
|
},
|
|
{
|
|
"epoch": 23.7430441898527,
|
|
"grad_norm": 12.612029883251239,
|
|
"learning_rate": 3.867706168985768e-11,
|
|
"loss": 1.33,
|
|
"mean_token_accuracy": 0.7241364538669586,
|
|
"num_tokens": 457377985.0,
|
|
"step": 7265
|
|
},
|
|
{
|
|
"epoch": 23.759410801963995,
|
|
"grad_norm": 13.238807566993644,
|
|
"learning_rate": 3.768406354251713e-11,
|
|
"loss": 1.3444,
|
|
"mean_token_accuracy": 0.7211177289485932,
|
|
"num_tokens": 457692359.0,
|
|
"step": 7270
|
|
},
|
|
{
|
|
"epoch": 23.775777414075286,
|
|
"grad_norm": 13.62214958956788,
|
|
"learning_rate": 3.6703882072555706e-11,
|
|
"loss": 1.3632,
|
|
"mean_token_accuracy": 0.7179272472858429,
|
|
"num_tokens": 458007552.0,
|
|
"step": 7275
|
|
},
|
|
{
|
|
"epoch": 23.79214402618658,
|
|
"grad_norm": 12.76761616842123,
|
|
"learning_rate": 3.5736522381946137e-11,
|
|
"loss": 1.364,
|
|
"mean_token_accuracy": 0.7169786393642426,
|
|
"num_tokens": 458324087.0,
|
|
"step": 7280
|
|
},
|
|
{
|
|
"epoch": 23.80851063829787,
|
|
"grad_norm": 12.821563995336735,
|
|
"learning_rate": 3.478198950592315e-11,
|
|
"loss": 1.3535,
|
|
"mean_token_accuracy": 0.7197467684745789,
|
|
"num_tokens": 458641310.0,
|
|
"step": 7285
|
|
},
|
|
{
|
|
"epoch": 23.824877250409166,
|
|
"grad_norm": 12.34030320089937,
|
|
"learning_rate": 3.384028841295489e-11,
|
|
"loss": 1.3606,
|
|
"mean_token_accuracy": 0.7179623246192932,
|
|
"num_tokens": 458954389.0,
|
|
"step": 7290
|
|
},
|
|
{
|
|
"epoch": 23.841243862520457,
|
|
"grad_norm": 13.306594697494013,
|
|
"learning_rate": 3.2911424004719305e-11,
|
|
"loss": 1.3504,
|
|
"mean_token_accuracy": 0.7206150174140931,
|
|
"num_tokens": 459269086.0,
|
|
"step": 7295
|
|
},
|
|
{
|
|
"epoch": 23.857610474631752,
|
|
"grad_norm": 12.879399761128884,
|
|
"learning_rate": 3.199540111607752e-11,
|
|
"loss": 1.3522,
|
|
"mean_token_accuracy": 0.721342933177948,
|
|
"num_tokens": 459583992.0,
|
|
"step": 7300
|
|
},
|
|
{
|
|
"epoch": 23.873977086743043,
|
|
"grad_norm": 12.935481056677569,
|
|
"learning_rate": 3.109222451504884e-11,
|
|
"loss": 1.3714,
|
|
"mean_token_accuracy": 0.7164525389671326,
|
|
"num_tokens": 459899132.0,
|
|
"step": 7305
|
|
},
|
|
{
|
|
"epoch": 23.890343698854338,
|
|
"grad_norm": 12.846970895626185,
|
|
"learning_rate": 3.020189890278579e-11,
|
|
"loss": 1.3363,
|
|
"mean_token_accuracy": 0.7231940507888794,
|
|
"num_tokens": 460213554.0,
|
|
"step": 7310
|
|
},
|
|
{
|
|
"epoch": 23.90671031096563,
|
|
"grad_norm": 13.115590780433335,
|
|
"learning_rate": 2.932442891354997e-11,
|
|
"loss": 1.3411,
|
|
"mean_token_accuracy": 0.7229074001312256,
|
|
"num_tokens": 460530104.0,
|
|
"step": 7315
|
|
},
|
|
{
|
|
"epoch": 23.923076923076923,
|
|
"grad_norm": 13.49431363989052,
|
|
"learning_rate": 2.8459819114687868e-11,
|
|
"loss": 1.3552,
|
|
"mean_token_accuracy": 0.7194628655910492,
|
|
"num_tokens": 460846228.0,
|
|
"step": 7320
|
|
},
|
|
{
|
|
"epoch": 23.939443535188214,
|
|
"grad_norm": 13.019870380653288,
|
|
"learning_rate": 2.7608074006606755e-11,
|
|
"loss": 1.3532,
|
|
"mean_token_accuracy": 0.7177223682403564,
|
|
"num_tokens": 461161597.0,
|
|
"step": 7325
|
|
},
|
|
{
|
|
"epoch": 23.95581014729951,
|
|
"grad_norm": 12.789360155231835,
|
|
"learning_rate": 2.676919802275163e-11,
|
|
"loss": 1.3247,
|
|
"mean_token_accuracy": 0.7247434377670288,
|
|
"num_tokens": 461476843.0,
|
|
"step": 7330
|
|
},
|
|
{
|
|
"epoch": 23.9721767594108,
|
|
"grad_norm": 12.918338648170026,
|
|
"learning_rate": 2.594319552958191e-11,
|
|
"loss": 1.349,
|
|
"mean_token_accuracy": 0.7203211784362793,
|
|
"num_tokens": 461793859.0,
|
|
"step": 7335
|
|
},
|
|
{
|
|
"epoch": 23.988543371522095,
|
|
"grad_norm": 13.01942740314573,
|
|
"learning_rate": 2.513007082654922e-11,
|
|
"loss": 1.3625,
|
|
"mean_token_accuracy": 0.7167957127094269,
|
|
"num_tokens": 462108257.0,
|
|
"step": 7340
|
|
},
|
|
{
|
|
"epoch": 24.00327332242226,
|
|
"grad_norm": 12.751880986482613,
|
|
"learning_rate": 2.4329828146074095e-11,
|
|
"loss": 1.3563,
|
|
"mean_token_accuracy": 0.7189247012138367,
|
|
"num_tokens": 462368701.0,
|
|
"step": 7345
|
|
},
|
|
{
|
|
"epoch": 24.01963993453355,
|
|
"grad_norm": 12.621994373110237,
|
|
"learning_rate": 2.3542471653524856e-11,
|
|
"loss": 1.3477,
|
|
"mean_token_accuracy": 0.7204642951488495,
|
|
"num_tokens": 462685003.0,
|
|
"step": 7350
|
|
},
|
|
{
|
|
"epoch": 24.036006546644845,
|
|
"grad_norm": 12.69132203011607,
|
|
"learning_rate": 2.2768005447194872e-11,
|
|
"loss": 1.3548,
|
|
"mean_token_accuracy": 0.7195856988430023,
|
|
"num_tokens": 463000988.0,
|
|
"step": 7355
|
|
},
|
|
{
|
|
"epoch": 24.052373158756136,
|
|
"grad_norm": 13.590120640008223,
|
|
"learning_rate": 2.200643355828258e-11,
|
|
"loss": 1.3477,
|
|
"mean_token_accuracy": 0.7200317621231079,
|
|
"num_tokens": 463316798.0,
|
|
"step": 7360
|
|
},
|
|
{
|
|
"epoch": 24.06873977086743,
|
|
"grad_norm": 13.013557565391965,
|
|
"learning_rate": 2.125775995086926e-11,
|
|
"loss": 1.3695,
|
|
"mean_token_accuracy": 0.7159531533718109,
|
|
"num_tokens": 463631806.0,
|
|
"step": 7365
|
|
},
|
|
{
|
|
"epoch": 24.085106382978722,
|
|
"grad_norm": 12.907712960234285,
|
|
"learning_rate": 2.0521988521899628e-11,
|
|
"loss": 1.3413,
|
|
"mean_token_accuracy": 0.7220550358295441,
|
|
"num_tokens": 463948216.0,
|
|
"step": 7370
|
|
},
|
|
{
|
|
"epoch": 24.101472995090017,
|
|
"grad_norm": 13.256062819208426,
|
|
"learning_rate": 1.9799123101160444e-11,
|
|
"loss": 1.3465,
|
|
"mean_token_accuracy": 0.7210370361804962,
|
|
"num_tokens": 464263741.0,
|
|
"step": 7375
|
|
},
|
|
{
|
|
"epoch": 24.117839607201308,
|
|
"grad_norm": 12.994341656646789,
|
|
"learning_rate": 1.9089167451260547e-11,
|
|
"loss": 1.3577,
|
|
"mean_token_accuracy": 0.7184650480747223,
|
|
"num_tokens": 464580296.0,
|
|
"step": 7380
|
|
},
|
|
{
|
|
"epoch": 24.134206219312603,
|
|
"grad_norm": 12.81826767989765,
|
|
"learning_rate": 1.8392125267612803e-11,
|
|
"loss": 1.3583,
|
|
"mean_token_accuracy": 0.7188352525234223,
|
|
"num_tokens": 464893834.0,
|
|
"step": 7385
|
|
},
|
|
{
|
|
"epoch": 24.150572831423894,
|
|
"grad_norm": 12.7856551090598,
|
|
"learning_rate": 1.7708000178413008e-11,
|
|
"loss": 1.3379,
|
|
"mean_token_accuracy": 0.7223597168922424,
|
|
"num_tokens": 465209881.0,
|
|
"step": 7390
|
|
},
|
|
{
|
|
"epoch": 24.16693944353519,
|
|
"grad_norm": 12.585297565916532,
|
|
"learning_rate": 1.703679574462158e-11,
|
|
"loss": 1.3376,
|
|
"mean_token_accuracy": 0.7221680283546448,
|
|
"num_tokens": 465525042.0,
|
|
"step": 7395
|
|
},
|
|
{
|
|
"epoch": 24.183306055646483,
|
|
"grad_norm": 13.97539060097907,
|
|
"learning_rate": 1.6378515459946065e-11,
|
|
"loss": 1.375,
|
|
"mean_token_accuracy": 0.714812695980072,
|
|
"num_tokens": 465840909.0,
|
|
"step": 7400
|
|
},
|
|
{
|
|
"epoch": 24.199672667757774,
|
|
"grad_norm": 13.5560172635617,
|
|
"learning_rate": 1.5733162750821706e-11,
|
|
"loss": 1.3535,
|
|
"mean_token_accuracy": 0.719173789024353,
|
|
"num_tokens": 466154795.0,
|
|
"step": 7405
|
|
},
|
|
{
|
|
"epoch": 24.21603927986907,
|
|
"grad_norm": 13.078641012785111,
|
|
"learning_rate": 1.5100740976393968e-11,
|
|
"loss": 1.3598,
|
|
"mean_token_accuracy": 0.7185278415679932,
|
|
"num_tokens": 466470138.0,
|
|
"step": 7410
|
|
},
|
|
{
|
|
"epoch": 24.23240589198036,
|
|
"grad_norm": 12.897154852485773,
|
|
"learning_rate": 1.4481253428500763e-11,
|
|
"loss": 1.3591,
|
|
"mean_token_accuracy": 0.7187921524047851,
|
|
"num_tokens": 466784749.0,
|
|
"step": 7415
|
|
},
|
|
{
|
|
"epoch": 24.248772504091654,
|
|
"grad_norm": 12.492321955565348,
|
|
"learning_rate": 1.387470333165608e-11,
|
|
"loss": 1.3379,
|
|
"mean_token_accuracy": 0.722098046541214,
|
|
"num_tokens": 467098461.0,
|
|
"step": 7420
|
|
},
|
|
{
|
|
"epoch": 24.265139116202946,
|
|
"grad_norm": 13.007611243527537,
|
|
"learning_rate": 1.3281093843033055e-11,
|
|
"loss": 1.3448,
|
|
"mean_token_accuracy": 0.7209655940532684,
|
|
"num_tokens": 467416352.0,
|
|
"step": 7425
|
|
},
|
|
{
|
|
"epoch": 24.28150572831424,
|
|
"grad_norm": 12.697667313743437,
|
|
"learning_rate": 1.2700428052447033e-11,
|
|
"loss": 1.3579,
|
|
"mean_token_accuracy": 0.7195048153400421,
|
|
"num_tokens": 467731305.0,
|
|
"step": 7430
|
|
},
|
|
{
|
|
"epoch": 24.29787234042553,
|
|
"grad_norm": 13.353247427811356,
|
|
"learning_rate": 1.2132708982338924e-11,
|
|
"loss": 1.3429,
|
|
"mean_token_accuracy": 0.7209743320941925,
|
|
"num_tokens": 468047361.0,
|
|
"step": 7435
|
|
},
|
|
{
|
|
"epoch": 24.314238952536826,
|
|
"grad_norm": 12.837440884503787,
|
|
"learning_rate": 1.15779395877616e-11,
|
|
"loss": 1.3584,
|
|
"mean_token_accuracy": 0.7181209921836853,
|
|
"num_tokens": 468361613.0,
|
|
"step": 7440
|
|
},
|
|
{
|
|
"epoch": 24.330605564648117,
|
|
"grad_norm": 12.962933524119704,
|
|
"learning_rate": 1.10361227563624e-11,
|
|
"loss": 1.3361,
|
|
"mean_token_accuracy": 0.7226485967636108,
|
|
"num_tokens": 468676912.0,
|
|
"step": 7445
|
|
},
|
|
{
|
|
"epoch": 24.34697217675941,
|
|
"grad_norm": 12.23329675615014,
|
|
"learning_rate": 1.0507261308368709e-11,
|
|
"loss": 1.3351,
|
|
"mean_token_accuracy": 0.7236675322055817,
|
|
"num_tokens": 468994360.0,
|
|
"step": 7450
|
|
},
|
|
{
|
|
"epoch": 24.363338788870703,
|
|
"grad_norm": 12.726903918157424,
|
|
"learning_rate": 9.991357996573803e-12,
|
|
"loss": 1.3755,
|
|
"mean_token_accuracy": 0.7160296976566315,
|
|
"num_tokens": 469310842.0,
|
|
"step": 7455
|
|
},
|
|
{
|
|
"epoch": 24.379705400981997,
|
|
"grad_norm": 13.198393892424019,
|
|
"learning_rate": 9.488415506322123e-12,
|
|
"loss": 1.3622,
|
|
"mean_token_accuracy": 0.7167876899242401,
|
|
"num_tokens": 469626170.0,
|
|
"step": 7460
|
|
},
|
|
{
|
|
"epoch": 24.39607201309329,
|
|
"grad_norm": 12.368761103188476,
|
|
"learning_rate": 8.998436455495696e-12,
|
|
"loss": 1.3258,
|
|
"mean_token_accuracy": 0.7249380946159363,
|
|
"num_tokens": 469943305.0,
|
|
"step": 7465
|
|
},
|
|
{
|
|
"epoch": 24.412438625204583,
|
|
"grad_norm": 12.962191619603873,
|
|
"learning_rate": 8.521423394499129e-12,
|
|
"loss": 1.3428,
|
|
"mean_token_accuracy": 0.7214452266693115,
|
|
"num_tokens": 470260118.0,
|
|
"step": 7470
|
|
},
|
|
{
|
|
"epoch": 24.428805237315874,
|
|
"grad_norm": 12.849320327271007,
|
|
"learning_rate": 8.05737880624824e-12,
|
|
"loss": 1.3236,
|
|
"mean_token_accuracy": 0.7250566124916077,
|
|
"num_tokens": 470576565.0,
|
|
"step": 7475
|
|
},
|
|
{
|
|
"epoch": 24.44517184942717,
|
|
"grad_norm": 12.474474797345762,
|
|
"learning_rate": 7.606305106155897e-12,
|
|
"loss": 1.3249,
|
|
"mean_token_accuracy": 0.7245312631130219,
|
|
"num_tokens": 470893001.0,
|
|
"step": 7480
|
|
},
|
|
{
|
|
"epoch": 24.46153846153846,
|
|
"grad_norm": 13.300980774411267,
|
|
"learning_rate": 7.168204642119813e-12,
|
|
"loss": 1.3604,
|
|
"mean_token_accuracy": 0.7191662549972534,
|
|
"num_tokens": 471208820.0,
|
|
"step": 7485
|
|
},
|
|
{
|
|
"epoch": 24.477905073649755,
|
|
"grad_norm": 12.968424055077652,
|
|
"learning_rate": 6.743079694510601e-12,
|
|
"loss": 1.362,
|
|
"mean_token_accuracy": 0.7179892778396606,
|
|
"num_tokens": 471525954.0,
|
|
"step": 7490
|
|
},
|
|
{
|
|
"epoch": 24.49427168576105,
|
|
"grad_norm": 12.536855830459842,
|
|
"learning_rate": 6.33093247615929e-12,
|
|
"loss": 1.3545,
|
|
"mean_token_accuracy": 0.7187951862812042,
|
|
"num_tokens": 471840937.0,
|
|
"step": 7495
|
|
},
|
|
{
|
|
"epoch": 24.51063829787234,
|
|
"grad_norm": 13.178979626599647,
|
|
"learning_rate": 5.931765132346223e-12,
|
|
"loss": 1.3329,
|
|
"mean_token_accuracy": 0.7232393980026245,
|
|
"num_tokens": 472157742.0,
|
|
"step": 7500
|
|
},
|
|
{
|
|
"epoch": 24.527004909983635,
|
|
"grad_norm": 12.475923633074265,
|
|
"learning_rate": 5.545579740789397e-12,
|
|
"loss": 1.3248,
|
|
"mean_token_accuracy": 0.7247317433357239,
|
|
"num_tokens": 472474594.0,
|
|
"step": 7505
|
|
},
|
|
{
|
|
"epoch": 24.543371522094926,
|
|
"grad_norm": 13.289472428562675,
|
|
"learning_rate": 5.1723783116350284e-12,
|
|
"loss": 1.3621,
|
|
"mean_token_accuracy": 0.7177056550979615,
|
|
"num_tokens": 472790990.0,
|
|
"step": 7510
|
|
},
|
|
{
|
|
"epoch": 24.55973813420622,
|
|
"grad_norm": 12.370563451684093,
|
|
"learning_rate": 4.812162787445063e-12,
|
|
"loss": 1.3681,
|
|
"mean_token_accuracy": 0.7161191761493683,
|
|
"num_tokens": 473106808.0,
|
|
"step": 7515
|
|
},
|
|
{
|
|
"epoch": 24.57610474631751,
|
|
"grad_norm": 13.28225745876113,
|
|
"learning_rate": 4.464935043188567e-12,
|
|
"loss": 1.3724,
|
|
"mean_token_accuracy": 0.7154851138591767,
|
|
"num_tokens": 473421989.0,
|
|
"step": 7520
|
|
},
|
|
{
|
|
"epoch": 24.592471358428806,
|
|
"grad_norm": 13.301245647293618,
|
|
"learning_rate": 4.130696886231744e-12,
|
|
"loss": 1.3605,
|
|
"mean_token_accuracy": 0.7182529032230377,
|
|
"num_tokens": 473736855.0,
|
|
"step": 7525
|
|
},
|
|
{
|
|
"epoch": 24.608837970540097,
|
|
"grad_norm": 12.426546296528416,
|
|
"learning_rate": 3.809450056327934e-12,
|
|
"loss": 1.3496,
|
|
"mean_token_accuracy": 0.7204344034194946,
|
|
"num_tokens": 474052212.0,
|
|
"step": 7530
|
|
},
|
|
{
|
|
"epoch": 24.625204582651392,
|
|
"grad_norm": 12.648395109151407,
|
|
"learning_rate": 3.501196225608738e-12,
|
|
"loss": 1.3309,
|
|
"mean_token_accuracy": 0.7240665137767792,
|
|
"num_tokens": 474369338.0,
|
|
"step": 7535
|
|
},
|
|
{
|
|
"epoch": 24.641571194762683,
|
|
"grad_norm": 12.734272716729766,
|
|
"learning_rate": 3.2059369985762423e-12,
|
|
"loss": 1.3427,
|
|
"mean_token_accuracy": 0.7211298882961273,
|
|
"num_tokens": 474685028.0,
|
|
"step": 7540
|
|
},
|
|
{
|
|
"epoch": 24.657937806873978,
|
|
"grad_norm": 13.209935222715595,
|
|
"learning_rate": 2.923673912093028e-12,
|
|
"loss": 1.3385,
|
|
"mean_token_accuracy": 0.721641993522644,
|
|
"num_tokens": 475001822.0,
|
|
"step": 7545
|
|
},
|
|
{
|
|
"epoch": 24.67430441898527,
|
|
"grad_norm": 12.898389985570757,
|
|
"learning_rate": 2.654408435375788e-12,
|
|
"loss": 1.3624,
|
|
"mean_token_accuracy": 0.7173764705657959,
|
|
"num_tokens": 475316276.0,
|
|
"step": 7550
|
|
},
|
|
{
|
|
"epoch": 24.690671031096564,
|
|
"grad_norm": 12.541776646335519,
|
|
"learning_rate": 2.398141969986445e-12,
|
|
"loss": 1.3503,
|
|
"mean_token_accuracy": 0.720172131061554,
|
|
"num_tokens": 475630298.0,
|
|
"step": 7555
|
|
},
|
|
{
|
|
"epoch": 24.707037643207855,
|
|
"grad_norm": 13.762831164718952,
|
|
"learning_rate": 2.154875849825766e-12,
|
|
"loss": 1.3443,
|
|
"mean_token_accuracy": 0.7212839305400849,
|
|
"num_tokens": 475944133.0,
|
|
"step": 7560
|
|
},
|
|
{
|
|
"epoch": 24.72340425531915,
|
|
"grad_norm": 12.460693550510845,
|
|
"learning_rate": 1.924611341125315e-12,
|
|
"loss": 1.3492,
|
|
"mean_token_accuracy": 0.7199400305747986,
|
|
"num_tokens": 476257411.0,
|
|
"step": 7565
|
|
},
|
|
{
|
|
"epoch": 24.73977086743044,
|
|
"grad_norm": 12.636399236316878,
|
|
"learning_rate": 1.7073496424427347e-12,
|
|
"loss": 1.3606,
|
|
"mean_token_accuracy": 0.7177873611450195,
|
|
"num_tokens": 476572234.0,
|
|
"step": 7570
|
|
},
|
|
{
|
|
"epoch": 24.756137479541735,
|
|
"grad_norm": 12.806079318268207,
|
|
"learning_rate": 1.5030918846534182e-12,
|
|
"loss": 1.3509,
|
|
"mean_token_accuracy": 0.7192403972148895,
|
|
"num_tokens": 476885443.0,
|
|
"step": 7575
|
|
},
|
|
{
|
|
"epoch": 24.772504091653026,
|
|
"grad_norm": 12.87340982617205,
|
|
"learning_rate": 1.3118391309455136e-12,
|
|
"loss": 1.3321,
|
|
"mean_token_accuracy": 0.7227283775806427,
|
|
"num_tokens": 477200707.0,
|
|
"step": 7580
|
|
},
|
|
{
|
|
"epoch": 24.78887070376432,
|
|
"grad_norm": 13.196133476693376,
|
|
"learning_rate": 1.1335923768149292e-12,
|
|
"loss": 1.3599,
|
|
"mean_token_accuracy": 0.7180555760860443,
|
|
"num_tokens": 477517157.0,
|
|
"step": 7585
|
|
},
|
|
{
|
|
"epoch": 24.805237315875615,
|
|
"grad_norm": 12.525929111718444,
|
|
"learning_rate": 9.68352550059226e-13,
|
|
"loss": 1.3507,
|
|
"mean_token_accuracy": 0.7201619267463684,
|
|
"num_tokens": 477832923.0,
|
|
"step": 7590
|
|
},
|
|
{
|
|
"epoch": 24.821603927986907,
|
|
"grad_norm": 12.495416050685176,
|
|
"learning_rate": 8.161205107737324e-13,
|
|
"loss": 1.3392,
|
|
"mean_token_accuracy": 0.7222834944725036,
|
|
"num_tokens": 478149099.0,
|
|
"step": 7595
|
|
},
|
|
{
|
|
"epoch": 24.8379705400982,
|
|
"grad_norm": 12.655900316269998,
|
|
"learning_rate": 6.768970513457151e-13,
|
|
"loss": 1.3523,
|
|
"mean_token_accuracy": 0.7203588485717773,
|
|
"num_tokens": 478464829.0,
|
|
"step": 7600
|
|
},
|
|
{
|
|
"epoch": 24.854337152209492,
|
|
"grad_norm": 13.126910205340408,
|
|
"learning_rate": 5.506828964518818e-13,
|
|
"loss": 1.351,
|
|
"mean_token_accuracy": 0.7200382113456726,
|
|
"num_tokens": 478780816.0,
|
|
"step": 7605
|
|
},
|
|
{
|
|
"epoch": 24.870703764320787,
|
|
"grad_norm": 12.611751802636238,
|
|
"learning_rate": 4.3747870305338443e-13,
|
|
"loss": 1.3485,
|
|
"mean_token_accuracy": 0.7205796897411346,
|
|
"num_tokens": 479097853.0,
|
|
"step": 7610
|
|
},
|
|
{
|
|
"epoch": 24.887070376432078,
|
|
"grad_norm": 13.003538205209928,
|
|
"learning_rate": 3.3728506039276683e-13,
|
|
"loss": 1.3477,
|
|
"mean_token_accuracy": 0.7211360156536102,
|
|
"num_tokens": 479413268.0,
|
|
"step": 7615
|
|
},
|
|
{
|
|
"epoch": 24.903436988543373,
|
|
"grad_norm": 12.900651331291833,
|
|
"learning_rate": 2.501024899914661e-13,
|
|
"loss": 1.3452,
|
|
"mean_token_accuracy": 0.7210480213165283,
|
|
"num_tokens": 479730891.0,
|
|
"step": 7620
|
|
},
|
|
{
|
|
"epoch": 24.919803600654664,
|
|
"grad_norm": 13.073814232685633,
|
|
"learning_rate": 1.7593144564564956e-13,
|
|
"loss": 1.34,
|
|
"mean_token_accuracy": 0.7227759063243866,
|
|
"num_tokens": 480044395.0,
|
|
"step": 7625
|
|
},
|
|
{
|
|
"epoch": 24.93617021276596,
|
|
"grad_norm": 12.350548181385452,
|
|
"learning_rate": 1.1477231342538198e-13,
|
|
"loss": 1.3295,
|
|
"mean_token_accuracy": 0.7247556269168853,
|
|
"num_tokens": 480359255.0,
|
|
"step": 7630
|
|
},
|
|
{
|
|
"epoch": 24.95253682487725,
|
|
"grad_norm": 13.327420468075404,
|
|
"learning_rate": 6.662541167240521e-14,
|
|
"loss": 1.3351,
|
|
"mean_token_accuracy": 0.7232851803302764,
|
|
"num_tokens": 480675647.0,
|
|
"step": 7635
|
|
},
|
|
{
|
|
"epoch": 24.968903436988544,
|
|
"grad_norm": 12.389629604640263,
|
|
"learning_rate": 3.1490990997362634e-14,
|
|
"loss": 1.3502,
|
|
"mean_token_accuracy": 0.7206938445568085,
|
|
"num_tokens": 480991118.0,
|
|
"step": 7640
|
|
},
|
|
{
|
|
"epoch": 24.985270049099835,
|
|
"grad_norm": 12.850968329093641,
|
|
"learning_rate": 9.369234279799077e-15,
|
|
"loss": 1.3521,
|
|
"mean_token_accuracy": 0.7204296112060546,
|
|
"num_tokens": 481307400.0,
|
|
"step": 7645
|
|
},
|
|
{
|
|
"epoch": 25.0,
|
|
"grad_norm": 13.285064991937862,
|
|
"learning_rate": 2.6025666594042817e-16,
|
|
"loss": 1.3573,
|
|
"mean_token_accuracy": 0.7185248600112067,
|
|
"num_tokens": 481567917.0,
|
|
"step": 7650
|
|
},
|
|
{
|
|
"epoch": 25.0,
|
|
"step": 7650,
|
|
"total_flos": 567260733210624.0,
|
|
"train_loss": 1.8163700196010615,
|
|
"train_runtime": 15926.6953,
|
|
"train_samples_per_second": 30.644,
|
|
"train_steps_per_second": 0.48
|
|
}
|
|
],
|
|
"logging_steps": 5,
|
|
"max_steps": 7650,
|
|
"num_input_tokens_seen": 0,
|
|
"num_train_epochs": 25,
|
|
"save_steps": 500,
|
|
"stateful_callbacks": {
|
|
"TrainerControl": {
|
|
"args": {
|
|
"should_epoch_stop": false,
|
|
"should_evaluate": false,
|
|
"should_log": false,
|
|
"should_save": false,
|
|
"should_training_stop": false
|
|
},
|
|
"attributes": {}
|
|
}
|
|
},
|
|
"total_flos": 567260733210624.0,
|
|
"train_batch_size": 4,
|
|
"trial_name": null,
|
|
"trial_params": null
|
|
}
|