Files
qwen3-4b-spectrum-nl2sql/trainer_state.json
ModelHub XC 522ca7e1e6 初始化项目,由ModelHub XC社区提供模型
Model: riv25-aim410/qwen3-4b-spectrum-nl2sql
Source: Original Platform
2026-04-30 00:50:07 +08:00

13814 lines
382 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 25.0,
"eval_steps": 500,
"global_step": 7650,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.016366612111292964,
"grad_norm": 55.62101037513415,
"learning_rate": 2.61437908496732e-11,
"loss": 2.6905,
"mean_token_accuracy": 0.6009584546089173,
"num_tokens": 316429.0,
"step": 5
},
{
"epoch": 0.03273322422258593,
"grad_norm": 58.172720274495184,
"learning_rate": 5.88235294117647e-11,
"loss": 2.7382,
"mean_token_accuracy": 0.595202910900116,
"num_tokens": 631415.0,
"step": 10
},
{
"epoch": 0.049099836333878884,
"grad_norm": 56.310876284999395,
"learning_rate": 9.150326797385621e-11,
"loss": 2.7181,
"mean_token_accuracy": 0.5965762197971344,
"num_tokens": 948728.0,
"step": 15
},
{
"epoch": 0.06546644844517185,
"grad_norm": 55.77884956792775,
"learning_rate": 1.241830065359477e-10,
"loss": 2.7651,
"mean_token_accuracy": 0.5920272588729858,
"num_tokens": 1262537.0,
"step": 20
},
{
"epoch": 0.08183306055646482,
"grad_norm": 56.771410740550785,
"learning_rate": 1.5686274509803922e-10,
"loss": 2.75,
"mean_token_accuracy": 0.5928871929645538,
"num_tokens": 1578196.0,
"step": 25
},
{
"epoch": 0.09819967266775777,
"grad_norm": 55.98400313077124,
"learning_rate": 1.895424836601307e-10,
"loss": 2.7047,
"mean_token_accuracy": 0.6000989556312561,
"num_tokens": 1894739.0,
"step": 30
},
{
"epoch": 0.11456628477905073,
"grad_norm": 57.760741569113556,
"learning_rate": 2.2222222222222224e-10,
"loss": 2.7384,
"mean_token_accuracy": 0.5951428174972534,
"num_tokens": 2210581.0,
"step": 35
},
{
"epoch": 0.1309328968903437,
"grad_norm": 57.28938487977958,
"learning_rate": 2.5490196078431375e-10,
"loss": 2.7062,
"mean_token_accuracy": 0.5997058689594269,
"num_tokens": 2526492.0,
"step": 40
},
{
"epoch": 0.14729950900163666,
"grad_norm": 56.76416543532324,
"learning_rate": 2.8758169934640523e-10,
"loss": 2.7623,
"mean_token_accuracy": 0.5918912470340729,
"num_tokens": 2842714.0,
"step": 45
},
{
"epoch": 0.16366612111292964,
"grad_norm": 56.38556168991693,
"learning_rate": 3.202614379084967e-10,
"loss": 2.6966,
"mean_token_accuracy": 0.5994902729988099,
"num_tokens": 3159154.0,
"step": 50
},
{
"epoch": 0.18003273322422259,
"grad_norm": 56.99858489246395,
"learning_rate": 3.5294117647058825e-10,
"loss": 2.6908,
"mean_token_accuracy": 0.600296539068222,
"num_tokens": 3472831.0,
"step": 55
},
{
"epoch": 0.19639934533551553,
"grad_norm": 54.51487201849461,
"learning_rate": 3.856209150326798e-10,
"loss": 2.678,
"mean_token_accuracy": 0.60253204703331,
"num_tokens": 3788192.0,
"step": 60
},
{
"epoch": 0.2127659574468085,
"grad_norm": 56.88214275656488,
"learning_rate": 4.183006535947712e-10,
"loss": 2.746,
"mean_token_accuracy": 0.5942470014095307,
"num_tokens": 4103915.0,
"step": 65
},
{
"epoch": 0.22913256955810146,
"grad_norm": 59.43648730633251,
"learning_rate": 4.5098039215686275e-10,
"loss": 2.7032,
"mean_token_accuracy": 0.5992533683776855,
"num_tokens": 4420361.0,
"step": 70
},
{
"epoch": 0.24549918166939444,
"grad_norm": 56.43974466098924,
"learning_rate": 4.836601307189543e-10,
"loss": 2.7446,
"mean_token_accuracy": 0.5943080246448517,
"num_tokens": 4735381.0,
"step": 75
},
{
"epoch": 0.2618657937806874,
"grad_norm": 56.16523920809429,
"learning_rate": 5.163398692810458e-10,
"loss": 2.6893,
"mean_token_accuracy": 0.6008193671703339,
"num_tokens": 5050563.0,
"step": 80
},
{
"epoch": 0.27823240589198034,
"grad_norm": 57.720048665459046,
"learning_rate": 5.490196078431373e-10,
"loss": 2.7405,
"mean_token_accuracy": 0.5943594813346863,
"num_tokens": 5364896.0,
"step": 85
},
{
"epoch": 0.2945990180032733,
"grad_norm": 56.75066213164205,
"learning_rate": 5.816993464052287e-10,
"loss": 2.715,
"mean_token_accuracy": 0.5964757144451142,
"num_tokens": 5680185.0,
"step": 90
},
{
"epoch": 0.3109656301145663,
"grad_norm": 54.15505392392569,
"learning_rate": 6.143790849673202e-10,
"loss": 2.664,
"mean_token_accuracy": 0.60406773686409,
"num_tokens": 5994650.0,
"step": 95
},
{
"epoch": 0.32733224222585927,
"grad_norm": 59.45987110326202,
"learning_rate": 6.470588235294118e-10,
"loss": 2.7524,
"mean_token_accuracy": 0.5922898650169373,
"num_tokens": 6310228.0,
"step": 100
},
{
"epoch": 0.3436988543371522,
"grad_norm": 57.085451412144366,
"learning_rate": 6.797385620915032e-10,
"loss": 2.7131,
"mean_token_accuracy": 0.5981797277927399,
"num_tokens": 6626412.0,
"step": 105
},
{
"epoch": 0.36006546644844517,
"grad_norm": 55.63396297277116,
"learning_rate": 7.124183006535948e-10,
"loss": 2.7468,
"mean_token_accuracy": 0.5937129497528076,
"num_tokens": 6942407.0,
"step": 110
},
{
"epoch": 0.37643207855973815,
"grad_norm": 55.04424122552027,
"learning_rate": 7.450980392156863e-10,
"loss": 2.6654,
"mean_token_accuracy": 0.6039653539657592,
"num_tokens": 7259157.0,
"step": 115
},
{
"epoch": 0.39279869067103107,
"grad_norm": 55.91055581080712,
"learning_rate": 7.777777777777778e-10,
"loss": 2.7051,
"mean_token_accuracy": 0.5981750130653382,
"num_tokens": 7572014.0,
"step": 120
},
{
"epoch": 0.40916530278232405,
"grad_norm": 57.70042748713812,
"learning_rate": 8.104575163398693e-10,
"loss": 2.7115,
"mean_token_accuracy": 0.5981932282447815,
"num_tokens": 7887390.0,
"step": 125
},
{
"epoch": 0.425531914893617,
"grad_norm": 53.28893293552655,
"learning_rate": 8.431372549019608e-10,
"loss": 2.6906,
"mean_token_accuracy": 0.6010222673416138,
"num_tokens": 8202912.0,
"step": 130
},
{
"epoch": 0.44189852700491,
"grad_norm": 56.73313551252287,
"learning_rate": 8.758169934640522e-10,
"loss": 2.7097,
"mean_token_accuracy": 0.5978877365589141,
"num_tokens": 8521524.0,
"step": 135
},
{
"epoch": 0.4582651391162029,
"grad_norm": 58.905904373776984,
"learning_rate": 9.084967320261438e-10,
"loss": 2.7888,
"mean_token_accuracy": 0.5877204239368439,
"num_tokens": 8837257.0,
"step": 140
},
{
"epoch": 0.4746317512274959,
"grad_norm": 55.71496830883279,
"learning_rate": 9.411764705882353e-10,
"loss": 2.72,
"mean_token_accuracy": 0.5970898568630219,
"num_tokens": 9152286.0,
"step": 145
},
{
"epoch": 0.4909983633387889,
"grad_norm": 60.74321699143503,
"learning_rate": 9.738562091503268e-10,
"loss": 2.7175,
"mean_token_accuracy": 0.5960192620754242,
"num_tokens": 9467071.0,
"step": 150
},
{
"epoch": 0.5073649754500819,
"grad_norm": 55.91981881970168,
"learning_rate": 1.0065359477124184e-09,
"loss": 2.727,
"mean_token_accuracy": 0.5952532231807709,
"num_tokens": 9782388.0,
"step": 155
},
{
"epoch": 0.5237315875613748,
"grad_norm": 59.43252838902965,
"learning_rate": 1.03921568627451e-09,
"loss": 2.7349,
"mean_token_accuracy": 0.5958111166954041,
"num_tokens": 10098522.0,
"step": 160
},
{
"epoch": 0.5400981996726678,
"grad_norm": 54.711678745868866,
"learning_rate": 1.0718954248366012e-09,
"loss": 2.6838,
"mean_token_accuracy": 0.6017765641212464,
"num_tokens": 10414165.0,
"step": 165
},
{
"epoch": 0.5564648117839607,
"grad_norm": 59.2576714699596,
"learning_rate": 1.1045751633986929e-09,
"loss": 2.7696,
"mean_token_accuracy": 0.5895721673965454,
"num_tokens": 10728716.0,
"step": 170
},
{
"epoch": 0.5728314238952537,
"grad_norm": 55.33516265128209,
"learning_rate": 1.1372549019607844e-09,
"loss": 2.7015,
"mean_token_accuracy": 0.5986992299556733,
"num_tokens": 11044535.0,
"step": 175
},
{
"epoch": 0.5891980360065466,
"grad_norm": 54.35208599375252,
"learning_rate": 1.1699346405228759e-09,
"loss": 2.6997,
"mean_token_accuracy": 0.5996046185493469,
"num_tokens": 11360787.0,
"step": 180
},
{
"epoch": 0.6055646481178396,
"grad_norm": 54.908204606493655,
"learning_rate": 1.2026143790849673e-09,
"loss": 2.6522,
"mean_token_accuracy": 0.6056609749794006,
"num_tokens": 11676189.0,
"step": 185
},
{
"epoch": 0.6219312602291326,
"grad_norm": 57.15438435248898,
"learning_rate": 1.2352941176470588e-09,
"loss": 2.7403,
"mean_token_accuracy": 0.5933549106121063,
"num_tokens": 11993732.0,
"step": 190
},
{
"epoch": 0.6382978723404256,
"grad_norm": 55.85187532544056,
"learning_rate": 1.2679738562091503e-09,
"loss": 2.7242,
"mean_token_accuracy": 0.5965588092803955,
"num_tokens": 12309686.0,
"step": 195
},
{
"epoch": 0.6546644844517185,
"grad_norm": 57.169912622360634,
"learning_rate": 1.300653594771242e-09,
"loss": 2.6877,
"mean_token_accuracy": 0.6016717553138733,
"num_tokens": 12626255.0,
"step": 200
},
{
"epoch": 0.6710310965630114,
"grad_norm": 56.35014678877227,
"learning_rate": 1.3333333333333333e-09,
"loss": 2.7287,
"mean_token_accuracy": 0.594957309961319,
"num_tokens": 12942690.0,
"step": 205
},
{
"epoch": 0.6873977086743044,
"grad_norm": 58.920641210373454,
"learning_rate": 1.3660130718954248e-09,
"loss": 2.7485,
"mean_token_accuracy": 0.5927645146846772,
"num_tokens": 13259396.0,
"step": 210
},
{
"epoch": 0.7037643207855974,
"grad_norm": 57.018665034459005,
"learning_rate": 1.3986928104575165e-09,
"loss": 2.7263,
"mean_token_accuracy": 0.5958705544471741,
"num_tokens": 13574685.0,
"step": 215
},
{
"epoch": 0.7201309328968903,
"grad_norm": 57.70089348693714,
"learning_rate": 1.4313725490196077e-09,
"loss": 2.7246,
"mean_token_accuracy": 0.5950082004070282,
"num_tokens": 13889357.0,
"step": 220
},
{
"epoch": 0.7364975450081833,
"grad_norm": 57.363546508146634,
"learning_rate": 1.4640522875816994e-09,
"loss": 2.7261,
"mean_token_accuracy": 0.5963014245033265,
"num_tokens": 14205437.0,
"step": 225
},
{
"epoch": 0.7528641571194763,
"grad_norm": 53.49987696781798,
"learning_rate": 1.496732026143791e-09,
"loss": 2.7102,
"mean_token_accuracy": 0.5987072646617889,
"num_tokens": 14521840.0,
"step": 230
},
{
"epoch": 0.7692307692307693,
"grad_norm": 56.20359405171631,
"learning_rate": 1.5294117647058826e-09,
"loss": 2.7486,
"mean_token_accuracy": 0.5931043148040771,
"num_tokens": 14836289.0,
"step": 235
},
{
"epoch": 0.7855973813420621,
"grad_norm": 57.02411335901064,
"learning_rate": 1.5620915032679739e-09,
"loss": 2.7297,
"mean_token_accuracy": 0.5947885155677796,
"num_tokens": 15150656.0,
"step": 240
},
{
"epoch": 0.8019639934533551,
"grad_norm": 56.33397544722411,
"learning_rate": 1.5947712418300654e-09,
"loss": 2.7665,
"mean_token_accuracy": 0.5909724771976471,
"num_tokens": 15465617.0,
"step": 245
},
{
"epoch": 0.8183306055646481,
"grad_norm": 57.676254136521834,
"learning_rate": 1.627450980392157e-09,
"loss": 2.7472,
"mean_token_accuracy": 0.5932631254196167,
"num_tokens": 15781412.0,
"step": 250
},
{
"epoch": 0.8346972176759411,
"grad_norm": 56.46876654435729,
"learning_rate": 1.6601307189542483e-09,
"loss": 2.7183,
"mean_token_accuracy": 0.5978624880313873,
"num_tokens": 16097194.0,
"step": 255
},
{
"epoch": 0.851063829787234,
"grad_norm": 58.43011011505955,
"learning_rate": 1.69281045751634e-09,
"loss": 2.7817,
"mean_token_accuracy": 0.5885932326316834,
"num_tokens": 16414161.0,
"step": 260
},
{
"epoch": 0.867430441898527,
"grad_norm": 57.54400391752138,
"learning_rate": 1.7254901960784313e-09,
"loss": 2.6971,
"mean_token_accuracy": 0.5998899936676025,
"num_tokens": 16729455.0,
"step": 265
},
{
"epoch": 0.88379705400982,
"grad_norm": 55.728573679624986,
"learning_rate": 1.758169934640523e-09,
"loss": 2.7623,
"mean_token_accuracy": 0.5904077827930451,
"num_tokens": 17044493.0,
"step": 270
},
{
"epoch": 0.900163666121113,
"grad_norm": 57.60438824990003,
"learning_rate": 1.7908496732026145e-09,
"loss": 2.7451,
"mean_token_accuracy": 0.5936128437519074,
"num_tokens": 17360258.0,
"step": 275
},
{
"epoch": 0.9165302782324058,
"grad_norm": 52.461559653921434,
"learning_rate": 1.8235294117647057e-09,
"loss": 2.6809,
"mean_token_accuracy": 0.6017558097839355,
"num_tokens": 17674914.0,
"step": 280
},
{
"epoch": 0.9328968903436988,
"grad_norm": 54.08802081407188,
"learning_rate": 1.8562091503267974e-09,
"loss": 2.6901,
"mean_token_accuracy": 0.6013341307640075,
"num_tokens": 17991190.0,
"step": 285
},
{
"epoch": 0.9492635024549918,
"grad_norm": 54.9744366583653,
"learning_rate": 1.8888888888888887e-09,
"loss": 2.6909,
"mean_token_accuracy": 0.600698959827423,
"num_tokens": 18306141.0,
"step": 290
},
{
"epoch": 0.9656301145662848,
"grad_norm": 58.033499216049144,
"learning_rate": 1.9215686274509804e-09,
"loss": 2.7827,
"mean_token_accuracy": 0.5892059445381165,
"num_tokens": 18622962.0,
"step": 295
},
{
"epoch": 0.9819967266775778,
"grad_norm": 56.18744968213324,
"learning_rate": 1.954248366013072e-09,
"loss": 2.7342,
"mean_token_accuracy": 0.5947219908237458,
"num_tokens": 18938998.0,
"step": 300
},
{
"epoch": 0.9983633387888707,
"grad_norm": 57.00662709599486,
"learning_rate": 1.9869281045751634e-09,
"loss": 2.7185,
"mean_token_accuracy": 0.5977497279644013,
"num_tokens": 19254740.0,
"step": 305
},
{
"epoch": 1.0130932896890343,
"grad_norm": 58.15220601527463,
"learning_rate": 2.019607843137255e-09,
"loss": 2.7109,
"mean_token_accuracy": 0.5939835707346598,
"num_tokens": 19514459.0,
"step": 310
},
{
"epoch": 1.0294599018003274,
"grad_norm": 57.382494721681105,
"learning_rate": 2.0522875816993463e-09,
"loss": 2.7038,
"mean_token_accuracy": 0.5981919527053833,
"num_tokens": 19830901.0,
"step": 315
},
{
"epoch": 1.0458265139116203,
"grad_norm": 56.81487821310481,
"learning_rate": 2.084967320261438e-09,
"loss": 2.7198,
"mean_token_accuracy": 0.5969063222408295,
"num_tokens": 20147766.0,
"step": 320
},
{
"epoch": 1.0621931260229132,
"grad_norm": 56.514014528428106,
"learning_rate": 2.1176470588235293e-09,
"loss": 2.7307,
"mean_token_accuracy": 0.5957262694835663,
"num_tokens": 20463289.0,
"step": 325
},
{
"epoch": 1.0785597381342062,
"grad_norm": 55.71056833038111,
"learning_rate": 2.150326797385621e-09,
"loss": 2.7166,
"mean_token_accuracy": 0.5964736342430115,
"num_tokens": 20778436.0,
"step": 330
},
{
"epoch": 1.0949263502454991,
"grad_norm": 57.82464672665498,
"learning_rate": 2.1830065359477127e-09,
"loss": 2.6896,
"mean_token_accuracy": 0.6009782791137696,
"num_tokens": 21094311.0,
"step": 335
},
{
"epoch": 1.1112929623567922,
"grad_norm": 57.361293489004666,
"learning_rate": 2.215686274509804e-09,
"loss": 2.713,
"mean_token_accuracy": 0.5986053884029389,
"num_tokens": 21410544.0,
"step": 340
},
{
"epoch": 1.127659574468085,
"grad_norm": 57.72987276907856,
"learning_rate": 2.2483660130718956e-09,
"loss": 2.7463,
"mean_token_accuracy": 0.5940020740032196,
"num_tokens": 21725832.0,
"step": 345
},
{
"epoch": 1.1440261865793782,
"grad_norm": 55.803725938254715,
"learning_rate": 2.281045751633987e-09,
"loss": 2.7144,
"mean_token_accuracy": 0.5972111761569977,
"num_tokens": 22041212.0,
"step": 350
},
{
"epoch": 1.160392798690671,
"grad_norm": 55.27391304057763,
"learning_rate": 2.3137254901960786e-09,
"loss": 2.7043,
"mean_token_accuracy": 0.59928178191185,
"num_tokens": 22357705.0,
"step": 355
},
{
"epoch": 1.1767594108019641,
"grad_norm": 55.886619087218165,
"learning_rate": 2.34640522875817e-09,
"loss": 2.7442,
"mean_token_accuracy": 0.5932347357273102,
"num_tokens": 22673798.0,
"step": 360
},
{
"epoch": 1.193126022913257,
"grad_norm": 57.751208013174015,
"learning_rate": 2.379084967320261e-09,
"loss": 2.7045,
"mean_token_accuracy": 0.5995621025562287,
"num_tokens": 22989381.0,
"step": 365
},
{
"epoch": 1.2094926350245498,
"grad_norm": 57.77951094605076,
"learning_rate": 2.411764705882353e-09,
"loss": 2.7395,
"mean_token_accuracy": 0.5948926329612731,
"num_tokens": 23305377.0,
"step": 370
},
{
"epoch": 1.225859247135843,
"grad_norm": 54.89485818089428,
"learning_rate": 2.4444444444444446e-09,
"loss": 2.6894,
"mean_token_accuracy": 0.6005812406539917,
"num_tokens": 23620440.0,
"step": 375
},
{
"epoch": 1.2422258592471358,
"grad_norm": 57.33835006471561,
"learning_rate": 2.4771241830065362e-09,
"loss": 2.7431,
"mean_token_accuracy": 0.5935854852199555,
"num_tokens": 23935615.0,
"step": 380
},
{
"epoch": 1.2585924713584289,
"grad_norm": 56.69754723412471,
"learning_rate": 2.5098039215686275e-09,
"loss": 2.7055,
"mean_token_accuracy": 0.5975941598415375,
"num_tokens": 24250759.0,
"step": 385
},
{
"epoch": 1.2749590834697218,
"grad_norm": 58.64054742433329,
"learning_rate": 2.542483660130719e-09,
"loss": 2.7932,
"mean_token_accuracy": 0.5873025774955749,
"num_tokens": 24566546.0,
"step": 390
},
{
"epoch": 1.2913256955810146,
"grad_norm": 53.341820755206776,
"learning_rate": 2.57516339869281e-09,
"loss": 2.7041,
"mean_token_accuracy": 0.5986806511878967,
"num_tokens": 24882934.0,
"step": 395
},
{
"epoch": 1.3076923076923077,
"grad_norm": 55.89678065009442,
"learning_rate": 2.607843137254902e-09,
"loss": 2.7215,
"mean_token_accuracy": 0.596479618549347,
"num_tokens": 25199952.0,
"step": 400
},
{
"epoch": 1.3240589198036006,
"grad_norm": 56.031812370748284,
"learning_rate": 2.6405228758169935e-09,
"loss": 2.7416,
"mean_token_accuracy": 0.5937622725963593,
"num_tokens": 25515305.0,
"step": 405
},
{
"epoch": 1.3404255319148937,
"grad_norm": 56.53930697071187,
"learning_rate": 2.673202614379085e-09,
"loss": 2.7287,
"mean_token_accuracy": 0.5946074604988099,
"num_tokens": 25831044.0,
"step": 410
},
{
"epoch": 1.3567921440261865,
"grad_norm": 58.366158564347025,
"learning_rate": 2.7058823529411764e-09,
"loss": 2.7198,
"mean_token_accuracy": 0.596553748846054,
"num_tokens": 26146292.0,
"step": 415
},
{
"epoch": 1.3731587561374796,
"grad_norm": 54.865600304232686,
"learning_rate": 2.738562091503268e-09,
"loss": 2.7124,
"mean_token_accuracy": 0.5966074049472809,
"num_tokens": 26462672.0,
"step": 420
},
{
"epoch": 1.3895253682487725,
"grad_norm": 56.79859615381639,
"learning_rate": 2.77124183006536e-09,
"loss": 2.6805,
"mean_token_accuracy": 0.6013938546180725,
"num_tokens": 26775810.0,
"step": 425
},
{
"epoch": 1.4058919803600656,
"grad_norm": 56.88030488511343,
"learning_rate": 2.803921568627451e-09,
"loss": 2.759,
"mean_token_accuracy": 0.5923470973968505,
"num_tokens": 27091768.0,
"step": 430
},
{
"epoch": 1.4222585924713584,
"grad_norm": 54.682735365518894,
"learning_rate": 2.8366013071895424e-09,
"loss": 2.7374,
"mean_token_accuracy": 0.5940196335315704,
"num_tokens": 27406064.0,
"step": 435
},
{
"epoch": 1.4386252045826513,
"grad_norm": 56.01861141931974,
"learning_rate": 2.869281045751634e-09,
"loss": 2.7245,
"mean_token_accuracy": 0.5955065369606019,
"num_tokens": 27721370.0,
"step": 440
},
{
"epoch": 1.4549918166939444,
"grad_norm": 55.81282965504614,
"learning_rate": 2.9019607843137257e-09,
"loss": 2.7094,
"mean_token_accuracy": 0.5974486649036408,
"num_tokens": 28037451.0,
"step": 445
},
{
"epoch": 1.4713584288052373,
"grad_norm": 54.735450983454065,
"learning_rate": 2.934640522875817e-09,
"loss": 2.7323,
"mean_token_accuracy": 0.5936485469341278,
"num_tokens": 28352352.0,
"step": 450
},
{
"epoch": 1.4877250409165304,
"grad_norm": 56.490867272630034,
"learning_rate": 2.9673202614379087e-09,
"loss": 2.7201,
"mean_token_accuracy": 0.5969516217708588,
"num_tokens": 28667282.0,
"step": 455
},
{
"epoch": 1.5040916530278232,
"grad_norm": 57.59062069435996,
"learning_rate": 3e-09,
"loss": 2.7465,
"mean_token_accuracy": 0.5931427359580994,
"num_tokens": 28982011.0,
"step": 460
},
{
"epoch": 1.520458265139116,
"grad_norm": 56.46328376363721,
"learning_rate": 3.0326797385620913e-09,
"loss": 2.7429,
"mean_token_accuracy": 0.5932705223560333,
"num_tokens": 29296827.0,
"step": 465
},
{
"epoch": 1.5368248772504092,
"grad_norm": 55.01577140250396,
"learning_rate": 3.0653594771241834e-09,
"loss": 2.7166,
"mean_token_accuracy": 0.5958990335464478,
"num_tokens": 29613826.0,
"step": 470
},
{
"epoch": 1.5531914893617023,
"grad_norm": 57.52972493680479,
"learning_rate": 3.0980392156862746e-09,
"loss": 2.7465,
"mean_token_accuracy": 0.5942055583000183,
"num_tokens": 29929096.0,
"step": 475
},
{
"epoch": 1.5695581014729951,
"grad_norm": 58.84850414001869,
"learning_rate": 3.130718954248366e-09,
"loss": 2.709,
"mean_token_accuracy": 0.5981499969959259,
"num_tokens": 30243368.0,
"step": 480
},
{
"epoch": 1.585924713584288,
"grad_norm": 57.50064518392254,
"learning_rate": 3.1633986928104576e-09,
"loss": 2.7609,
"mean_token_accuracy": 0.5903271436691284,
"num_tokens": 30557343.0,
"step": 485
},
{
"epoch": 1.6022913256955809,
"grad_norm": 55.5149653191555,
"learning_rate": 3.196078431372549e-09,
"loss": 2.7082,
"mean_token_accuracy": 0.5975622475147248,
"num_tokens": 30873056.0,
"step": 490
},
{
"epoch": 1.618657937806874,
"grad_norm": 55.87400536065197,
"learning_rate": 3.2287581699346406e-09,
"loss": 2.738,
"mean_token_accuracy": 0.5949967682361603,
"num_tokens": 31189263.0,
"step": 495
},
{
"epoch": 1.635024549918167,
"grad_norm": 57.28584477551273,
"learning_rate": 3.2614379084967323e-09,
"loss": 2.7052,
"mean_token_accuracy": 0.5990139007568359,
"num_tokens": 31504291.0,
"step": 500
},
{
"epoch": 1.65139116202946,
"grad_norm": 54.83291742448827,
"learning_rate": 3.2941176470588235e-09,
"loss": 2.6666,
"mean_token_accuracy": 0.6031867802143097,
"num_tokens": 31821042.0,
"step": 505
},
{
"epoch": 1.6677577741407528,
"grad_norm": 58.40949288203933,
"learning_rate": 3.326797385620915e-09,
"loss": 2.7014,
"mean_token_accuracy": 0.5982214510440826,
"num_tokens": 32136842.0,
"step": 510
},
{
"epoch": 1.6841243862520459,
"grad_norm": 55.77363797416089,
"learning_rate": 3.359477124183007e-09,
"loss": 2.7135,
"mean_token_accuracy": 0.5962281107902527,
"num_tokens": 32453472.0,
"step": 515
},
{
"epoch": 1.700490998363339,
"grad_norm": 54.68653787956224,
"learning_rate": 3.392156862745098e-09,
"loss": 2.6678,
"mean_token_accuracy": 0.6035849511623382,
"num_tokens": 32768088.0,
"step": 520
},
{
"epoch": 1.7168576104746318,
"grad_norm": 54.62194321614914,
"learning_rate": 3.4248366013071895e-09,
"loss": 2.7084,
"mean_token_accuracy": 0.5977645874023437,
"num_tokens": 33082497.0,
"step": 525
},
{
"epoch": 1.7332242225859247,
"grad_norm": 53.093565639192974,
"learning_rate": 3.457516339869281e-09,
"loss": 2.7051,
"mean_token_accuracy": 0.5985934197902679,
"num_tokens": 33398550.0,
"step": 530
},
{
"epoch": 1.7495908346972175,
"grad_norm": 56.59068400803138,
"learning_rate": 3.4901960784313724e-09,
"loss": 2.724,
"mean_token_accuracy": 0.5946956694126129,
"num_tokens": 33714169.0,
"step": 535
},
{
"epoch": 1.7659574468085106,
"grad_norm": 56.89444327874508,
"learning_rate": 3.5228758169934645e-09,
"loss": 2.6444,
"mean_token_accuracy": 0.6075450599193573,
"num_tokens": 34030238.0,
"step": 540
},
{
"epoch": 1.7823240589198037,
"grad_norm": 56.40460626976272,
"learning_rate": 3.555555555555556e-09,
"loss": 2.7523,
"mean_token_accuracy": 0.5912691414356231,
"num_tokens": 34344423.0,
"step": 545
},
{
"epoch": 1.7986906710310966,
"grad_norm": 54.590169589414685,
"learning_rate": 3.588235294117647e-09,
"loss": 2.6558,
"mean_token_accuracy": 0.6027090668678283,
"num_tokens": 34661768.0,
"step": 550
},
{
"epoch": 1.8150572831423895,
"grad_norm": 54.738525111336244,
"learning_rate": 3.6209150326797384e-09,
"loss": 2.674,
"mean_token_accuracy": 0.5994877934455871,
"num_tokens": 34978338.0,
"step": 555
},
{
"epoch": 1.8314238952536823,
"grad_norm": 54.44736090622008,
"learning_rate": 3.65359477124183e-09,
"loss": 2.7485,
"mean_token_accuracy": 0.5917412161827087,
"num_tokens": 35294554.0,
"step": 560
},
{
"epoch": 1.8477905073649754,
"grad_norm": 56.99256411648127,
"learning_rate": 3.6862745098039218e-09,
"loss": 2.7338,
"mean_token_accuracy": 0.5929640531539917,
"num_tokens": 35611110.0,
"step": 565
},
{
"epoch": 1.8641571194762685,
"grad_norm": 54.79344913325811,
"learning_rate": 3.7189542483660134e-09,
"loss": 2.7106,
"mean_token_accuracy": 0.5953186571598053,
"num_tokens": 35926411.0,
"step": 570
},
{
"epoch": 1.8805237315875614,
"grad_norm": 52.797498352731125,
"learning_rate": 3.751633986928105e-09,
"loss": 2.7266,
"mean_token_accuracy": 0.5939077198505401,
"num_tokens": 36244250.0,
"step": 575
},
{
"epoch": 1.8968903436988542,
"grad_norm": 57.216007808474345,
"learning_rate": 3.784313725490196e-09,
"loss": 2.6608,
"mean_token_accuracy": 0.6030964910984039,
"num_tokens": 36561165.0,
"step": 580
},
{
"epoch": 1.9132569558101473,
"grad_norm": 53.97785850665361,
"learning_rate": 3.816993464052287e-09,
"loss": 2.6583,
"mean_token_accuracy": 0.6021470665931702,
"num_tokens": 36877458.0,
"step": 585
},
{
"epoch": 1.9296235679214404,
"grad_norm": 54.143082330381475,
"learning_rate": 3.849673202614379e-09,
"loss": 2.7001,
"mean_token_accuracy": 0.5981690645217895,
"num_tokens": 37193872.0,
"step": 590
},
{
"epoch": 1.9459901800327333,
"grad_norm": 56.092454263999294,
"learning_rate": 3.882352941176471e-09,
"loss": 2.6859,
"mean_token_accuracy": 0.5982384502887725,
"num_tokens": 37508485.0,
"step": 595
},
{
"epoch": 1.9623567921440261,
"grad_norm": 53.68828562669948,
"learning_rate": 3.915032679738562e-09,
"loss": 2.701,
"mean_token_accuracy": 0.5964562237262726,
"num_tokens": 37823707.0,
"step": 600
},
{
"epoch": 1.978723404255319,
"grad_norm": 56.201055702078115,
"learning_rate": 3.947712418300653e-09,
"loss": 2.7612,
"mean_token_accuracy": 0.5894128203392028,
"num_tokens": 38138418.0,
"step": 605
},
{
"epoch": 1.995090016366612,
"grad_norm": 54.62282636390223,
"learning_rate": 3.980392156862745e-09,
"loss": 2.7223,
"mean_token_accuracy": 0.5942505180835724,
"num_tokens": 38454032.0,
"step": 610
},
{
"epoch": 2.009819967266776,
"grad_norm": 56.10486030635885,
"learning_rate": 4.013071895424837e-09,
"loss": 2.7005,
"mean_token_accuracy": 0.5975761082437303,
"num_tokens": 38713730.0,
"step": 615
},
{
"epoch": 2.0261865793780687,
"grad_norm": 52.7688119052421,
"learning_rate": 4.045751633986928e-09,
"loss": 2.6856,
"mean_token_accuracy": 0.5990508139133454,
"num_tokens": 39030761.0,
"step": 620
},
{
"epoch": 2.0425531914893615,
"grad_norm": 55.17135344620159,
"learning_rate": 4.07843137254902e-09,
"loss": 2.7085,
"mean_token_accuracy": 0.5959590137004852,
"num_tokens": 39347637.0,
"step": 625
},
{
"epoch": 2.058919803600655,
"grad_norm": 57.995842457694756,
"learning_rate": 4.111111111111111e-09,
"loss": 2.7192,
"mean_token_accuracy": 0.594757741689682,
"num_tokens": 39661859.0,
"step": 630
},
{
"epoch": 2.0752864157119477,
"grad_norm": 54.38975526380071,
"learning_rate": 4.143790849673203e-09,
"loss": 2.709,
"mean_token_accuracy": 0.5958807587623596,
"num_tokens": 39978360.0,
"step": 635
},
{
"epoch": 2.0916530278232406,
"grad_norm": 54.68354125601648,
"learning_rate": 4.176470588235295e-09,
"loss": 2.7235,
"mean_token_accuracy": 0.594566798210144,
"num_tokens": 40294927.0,
"step": 640
},
{
"epoch": 2.1080196399345335,
"grad_norm": 52.99792679525105,
"learning_rate": 4.209150326797386e-09,
"loss": 2.6955,
"mean_token_accuracy": 0.5974120557308197,
"num_tokens": 40610315.0,
"step": 645
},
{
"epoch": 2.1243862520458263,
"grad_norm": 55.908240404817846,
"learning_rate": 4.241830065359477e-09,
"loss": 2.7015,
"mean_token_accuracy": 0.5971021175384521,
"num_tokens": 40925803.0,
"step": 650
},
{
"epoch": 2.1407528641571196,
"grad_norm": 54.811323789860005,
"learning_rate": 4.2745098039215685e-09,
"loss": 2.6697,
"mean_token_accuracy": 0.6004336535930633,
"num_tokens": 41241287.0,
"step": 655
},
{
"epoch": 2.1571194762684125,
"grad_norm": 54.65515873667756,
"learning_rate": 4.3071895424836606e-09,
"loss": 2.7401,
"mean_token_accuracy": 0.5916546046733856,
"num_tokens": 41556298.0,
"step": 660
},
{
"epoch": 2.1734860883797054,
"grad_norm": 54.878154658789114,
"learning_rate": 4.339869281045752e-09,
"loss": 2.734,
"mean_token_accuracy": 0.5918886005878449,
"num_tokens": 41871045.0,
"step": 665
},
{
"epoch": 2.1898527004909982,
"grad_norm": 52.7218116891618,
"learning_rate": 4.372549019607843e-09,
"loss": 2.6912,
"mean_token_accuracy": 0.5978993952274323,
"num_tokens": 42186308.0,
"step": 670
},
{
"epoch": 2.2062193126022915,
"grad_norm": 54.32965865225877,
"learning_rate": 4.405228758169934e-09,
"loss": 2.6798,
"mean_token_accuracy": 0.5993401885032654,
"num_tokens": 42501889.0,
"step": 675
},
{
"epoch": 2.2225859247135844,
"grad_norm": 53.89889503525825,
"learning_rate": 4.437908496732026e-09,
"loss": 2.6935,
"mean_token_accuracy": 0.596401983499527,
"num_tokens": 42817655.0,
"step": 680
},
{
"epoch": 2.2389525368248773,
"grad_norm": 58.23012419600424,
"learning_rate": 4.470588235294118e-09,
"loss": 2.7287,
"mean_token_accuracy": 0.5919546246528625,
"num_tokens": 43133962.0,
"step": 685
},
{
"epoch": 2.25531914893617,
"grad_norm": 58.184484453333866,
"learning_rate": 4.503267973856209e-09,
"loss": 2.7211,
"mean_token_accuracy": 0.59313685297966,
"num_tokens": 43448995.0,
"step": 690
},
{
"epoch": 2.271685761047463,
"grad_norm": 54.85524353717034,
"learning_rate": 4.5359477124183e-09,
"loss": 2.7183,
"mean_token_accuracy": 0.59401575922966,
"num_tokens": 43763497.0,
"step": 695
},
{
"epoch": 2.2880523731587563,
"grad_norm": 54.61408181909944,
"learning_rate": 4.5686274509803924e-09,
"loss": 2.673,
"mean_token_accuracy": 0.6004755556583404,
"num_tokens": 44078904.0,
"step": 700
},
{
"epoch": 2.304418985270049,
"grad_norm": 53.910655475887154,
"learning_rate": 4.601307189542484e-09,
"loss": 2.687,
"mean_token_accuracy": 0.5986486673355103,
"num_tokens": 44393042.0,
"step": 705
},
{
"epoch": 2.320785597381342,
"grad_norm": 55.10138236422902,
"learning_rate": 4.633986928104576e-09,
"loss": 2.7297,
"mean_token_accuracy": 0.5921121656894683,
"num_tokens": 44707827.0,
"step": 710
},
{
"epoch": 2.337152209492635,
"grad_norm": 54.73885538831377,
"learning_rate": 4.666666666666667e-09,
"loss": 2.6563,
"mean_token_accuracy": 0.6028881072998047,
"num_tokens": 45023884.0,
"step": 715
},
{
"epoch": 2.3535188216039282,
"grad_norm": 53.44933175127846,
"learning_rate": 4.699346405228758e-09,
"loss": 2.7157,
"mean_token_accuracy": 0.5943553507328033,
"num_tokens": 45338498.0,
"step": 720
},
{
"epoch": 2.369885433715221,
"grad_norm": 55.740342198653835,
"learning_rate": 4.73202614379085e-09,
"loss": 2.711,
"mean_token_accuracy": 0.5951617062091827,
"num_tokens": 45653999.0,
"step": 725
},
{
"epoch": 2.386252045826514,
"grad_norm": 54.62824485548636,
"learning_rate": 4.764705882352941e-09,
"loss": 2.7073,
"mean_token_accuracy": 0.5961497783660888,
"num_tokens": 45970321.0,
"step": 730
},
{
"epoch": 2.402618657937807,
"grad_norm": 55.40457150277988,
"learning_rate": 4.797385620915033e-09,
"loss": 2.659,
"mean_token_accuracy": 0.6022453665733337,
"num_tokens": 46286216.0,
"step": 735
},
{
"epoch": 2.4189852700490997,
"grad_norm": 55.97649140796812,
"learning_rate": 4.830065359477124e-09,
"loss": 2.6926,
"mean_token_accuracy": 0.5985015273094177,
"num_tokens": 46601739.0,
"step": 740
},
{
"epoch": 2.4353518821603926,
"grad_norm": 52.96577140428808,
"learning_rate": 4.8627450980392156e-09,
"loss": 2.6167,
"mean_token_accuracy": 0.6079153478145599,
"num_tokens": 46917261.0,
"step": 745
},
{
"epoch": 2.451718494271686,
"grad_norm": 54.239981137658475,
"learning_rate": 4.895424836601307e-09,
"loss": 2.7005,
"mean_token_accuracy": 0.5955610156059266,
"num_tokens": 47232983.0,
"step": 750
},
{
"epoch": 2.4680851063829787,
"grad_norm": 55.64879350500573,
"learning_rate": 4.928104575163399e-09,
"loss": 2.6989,
"mean_token_accuracy": 0.5967093467712402,
"num_tokens": 47548001.0,
"step": 755
},
{
"epoch": 2.4844517184942716,
"grad_norm": 55.46875075671352,
"learning_rate": 4.96078431372549e-09,
"loss": 2.7075,
"mean_token_accuracy": 0.5952642858028412,
"num_tokens": 47864499.0,
"step": 760
},
{
"epoch": 2.500818330605565,
"grad_norm": 56.02832550186899,
"learning_rate": 4.9934640522875815e-09,
"loss": 2.6982,
"mean_token_accuracy": 0.5948630213737488,
"num_tokens": 48181788.0,
"step": 765
},
{
"epoch": 2.5171849427168578,
"grad_norm": 51.02977447624827,
"learning_rate": 4.999995835894431e-09,
"loss": 2.6569,
"mean_token_accuracy": 0.5976875245571136,
"num_tokens": 48497291.0,
"step": 770
},
{
"epoch": 2.5335515548281506,
"grad_norm": 52.132741336495044,
"learning_rate": 4.999978919239329e-09,
"loss": 2.6595,
"mean_token_accuracy": 0.5953408718109131,
"num_tokens": 48813596.0,
"step": 775
},
{
"epoch": 2.5499181669394435,
"grad_norm": 52.59810785644208,
"learning_rate": 4.999948989866086e-09,
"loss": 2.6407,
"mean_token_accuracy": 0.5970385134220123,
"num_tokens": 49128618.0,
"step": 780
},
{
"epoch": 2.5662847790507364,
"grad_norm": 52.64943844109409,
"learning_rate": 4.999906047930483e-09,
"loss": 2.6657,
"mean_token_accuracy": 0.5940295934677124,
"num_tokens": 49444350.0,
"step": 785
},
{
"epoch": 2.5826513911620292,
"grad_norm": 51.58494049727213,
"learning_rate": 4.999850093656042e-09,
"loss": 2.6456,
"mean_token_accuracy": 0.5964194118976593,
"num_tokens": 49760874.0,
"step": 790
},
{
"epoch": 2.5990180032733226,
"grad_norm": 50.4229415460418,
"learning_rate": 4.999781127334011e-09,
"loss": 2.6463,
"mean_token_accuracy": 0.5977925717830658,
"num_tokens": 50077471.0,
"step": 795
},
{
"epoch": 2.6153846153846154,
"grad_norm": 51.61467397516237,
"learning_rate": 4.999699149323369e-09,
"loss": 2.6322,
"mean_token_accuracy": 0.5987462520599365,
"num_tokens": 50394292.0,
"step": 800
},
{
"epoch": 2.6317512274959083,
"grad_norm": 51.35425116447994,
"learning_rate": 4.9996041600508215e-09,
"loss": 2.6848,
"mean_token_accuracy": 0.5933543801307678,
"num_tokens": 50708508.0,
"step": 805
},
{
"epoch": 2.648117839607201,
"grad_norm": 54.23489070568219,
"learning_rate": 4.999496160010801e-09,
"loss": 2.7238,
"mean_token_accuracy": 0.5853737473487854,
"num_tokens": 51024224.0,
"step": 810
},
{
"epoch": 2.6644844517184945,
"grad_norm": 49.584559362850236,
"learning_rate": 4.999375149765462e-09,
"loss": 2.6308,
"mean_token_accuracy": 0.600192254781723,
"num_tokens": 51339876.0,
"step": 815
},
{
"epoch": 2.6808510638297873,
"grad_norm": 52.75483476576607,
"learning_rate": 4.999241129944679e-09,
"loss": 2.644,
"mean_token_accuracy": 0.5973427474498749,
"num_tokens": 51653756.0,
"step": 820
},
{
"epoch": 2.69721767594108,
"grad_norm": 49.18710368234475,
"learning_rate": 4.9990941012460426e-09,
"loss": 2.6092,
"mean_token_accuracy": 0.6024131655693055,
"num_tokens": 51970308.0,
"step": 825
},
{
"epoch": 2.713584288052373,
"grad_norm": 53.787832600682705,
"learning_rate": 4.9989340644348545e-09,
"loss": 2.6728,
"mean_token_accuracy": 0.5934543550014496,
"num_tokens": 52287190.0,
"step": 830
},
{
"epoch": 2.729950900163666,
"grad_norm": 51.579113331419386,
"learning_rate": 4.998761020344129e-09,
"loss": 2.6233,
"mean_token_accuracy": 0.6000344455242157,
"num_tokens": 52604275.0,
"step": 835
},
{
"epoch": 2.7463175122749592,
"grad_norm": 49.84173315383667,
"learning_rate": 4.998574969874584e-09,
"loss": 2.6406,
"mean_token_accuracy": 0.5963773608207703,
"num_tokens": 52919052.0,
"step": 840
},
{
"epoch": 2.762684124386252,
"grad_norm": 50.39862997599813,
"learning_rate": 4.998375913994635e-09,
"loss": 2.6845,
"mean_token_accuracy": 0.5930020451545716,
"num_tokens": 53234461.0,
"step": 845
},
{
"epoch": 2.779050736497545,
"grad_norm": 50.560791912654636,
"learning_rate": 4.998163853740395e-09,
"loss": 2.6236,
"mean_token_accuracy": 0.6000065624713897,
"num_tokens": 53550429.0,
"step": 850
},
{
"epoch": 2.795417348608838,
"grad_norm": 49.666468120698156,
"learning_rate": 4.997938790215665e-09,
"loss": 2.6166,
"mean_token_accuracy": 0.5981848835945129,
"num_tokens": 53867271.0,
"step": 855
},
{
"epoch": 2.811783960720131,
"grad_norm": 51.23962535977192,
"learning_rate": 4.997700724591931e-09,
"loss": 2.6119,
"mean_token_accuracy": 0.6018172323703765,
"num_tokens": 54183461.0,
"step": 860
},
{
"epoch": 2.828150572831424,
"grad_norm": 48.83583292298752,
"learning_rate": 4.997449658108354e-09,
"loss": 2.6079,
"mean_token_accuracy": 0.6019073128700256,
"num_tokens": 54498182.0,
"step": 865
},
{
"epoch": 2.844517184942717,
"grad_norm": 53.57936650160131,
"learning_rate": 4.997185592071769e-09,
"loss": 2.6676,
"mean_token_accuracy": 0.5932618260383606,
"num_tokens": 54813016.0,
"step": 870
},
{
"epoch": 2.8608837970540097,
"grad_norm": 48.43631537595537,
"learning_rate": 4.996908527856674e-09,
"loss": 2.642,
"mean_token_accuracy": 0.5957655310630798,
"num_tokens": 55128483.0,
"step": 875
},
{
"epoch": 2.8772504091653026,
"grad_norm": 51.59484809137071,
"learning_rate": 4.996618466905226e-09,
"loss": 2.6599,
"mean_token_accuracy": 0.5930556237697602,
"num_tokens": 55444969.0,
"step": 880
},
{
"epoch": 2.8936170212765955,
"grad_norm": 49.41078450422501,
"learning_rate": 4.996315410727229e-09,
"loss": 2.6121,
"mean_token_accuracy": 0.6001948356628418,
"num_tokens": 55761639.0,
"step": 885
},
{
"epoch": 2.909983633387889,
"grad_norm": 52.77375569773051,
"learning_rate": 4.995999360900131e-09,
"loss": 2.695,
"mean_token_accuracy": 0.5912358403205872,
"num_tokens": 56077525.0,
"step": 890
},
{
"epoch": 2.9263502454991817,
"grad_norm": 51.62554625125403,
"learning_rate": 4.995670319069011e-09,
"loss": 2.6763,
"mean_token_accuracy": 0.5933657169342041,
"num_tokens": 56394235.0,
"step": 895
},
{
"epoch": 2.9427168576104745,
"grad_norm": 50.955150373337716,
"learning_rate": 4.995328286946577e-09,
"loss": 2.66,
"mean_token_accuracy": 0.5938617050647735,
"num_tokens": 56709470.0,
"step": 900
},
{
"epoch": 2.959083469721768,
"grad_norm": 51.61162113532987,
"learning_rate": 4.994973266313149e-09,
"loss": 2.6376,
"mean_token_accuracy": 0.5965611219406128,
"num_tokens": 57024644.0,
"step": 905
},
{
"epoch": 2.9754500818330607,
"grad_norm": 50.4765635733869,
"learning_rate": 4.9946052590166576e-09,
"loss": 2.6603,
"mean_token_accuracy": 0.5933335602283478,
"num_tokens": 57339368.0,
"step": 910
},
{
"epoch": 2.9918166939443536,
"grad_norm": 49.80334090841871,
"learning_rate": 4.994224266972629e-09,
"loss": 2.6482,
"mean_token_accuracy": 0.5958111524581909,
"num_tokens": 57654849.0,
"step": 915
},
{
"epoch": 3.006546644844517,
"grad_norm": 51.06592869430665,
"learning_rate": 4.993830292164176e-09,
"loss": 2.6796,
"mean_token_accuracy": 0.592134588294559,
"num_tokens": 57915824.0,
"step": 920
},
{
"epoch": 3.0229132569558104,
"grad_norm": 49.96950667591423,
"learning_rate": 4.99342333664199e-09,
"loss": 2.6534,
"mean_token_accuracy": 0.594581139087677,
"num_tokens": 58231596.0,
"step": 925
},
{
"epoch": 3.0392798690671032,
"grad_norm": 51.99343130345125,
"learning_rate": 4.993003402524327e-09,
"loss": 2.6668,
"mean_token_accuracy": 0.5940016269683838,
"num_tokens": 58548804.0,
"step": 930
},
{
"epoch": 3.055646481178396,
"grad_norm": 51.79899324274999,
"learning_rate": 4.992570491996999e-09,
"loss": 2.6066,
"mean_token_accuracy": 0.6022139132022858,
"num_tokens": 58864353.0,
"step": 935
},
{
"epoch": 3.072013093289689,
"grad_norm": 49.269387362251386,
"learning_rate": 4.992124607313363e-09,
"loss": 2.6598,
"mean_token_accuracy": 0.5946215331554413,
"num_tokens": 59179864.0,
"step": 940
},
{
"epoch": 3.088379705400982,
"grad_norm": 49.892803527802585,
"learning_rate": 4.991665750794306e-09,
"loss": 2.6403,
"mean_token_accuracy": 0.5959248483181,
"num_tokens": 59494088.0,
"step": 945
},
{
"epoch": 3.104746317512275,
"grad_norm": 49.68211690452728,
"learning_rate": 4.991193924828238e-09,
"loss": 2.6347,
"mean_token_accuracy": 0.5975275099277496,
"num_tokens": 59809665.0,
"step": 950
},
{
"epoch": 3.121112929623568,
"grad_norm": 50.56798569601125,
"learning_rate": 4.990709131871074e-09,
"loss": 2.6194,
"mean_token_accuracy": 0.5987059652805329,
"num_tokens": 60124351.0,
"step": 955
},
{
"epoch": 3.137479541734861,
"grad_norm": 52.1522983883937,
"learning_rate": 4.990211374446225e-09,
"loss": 2.6575,
"mean_token_accuracy": 0.5933027803897858,
"num_tokens": 60440561.0,
"step": 960
},
{
"epoch": 3.1538461538461537,
"grad_norm": 48.35836366419251,
"learning_rate": 4.989700655144584e-09,
"loss": 2.6014,
"mean_token_accuracy": 0.600798100233078,
"num_tokens": 60757057.0,
"step": 965
},
{
"epoch": 3.1702127659574466,
"grad_norm": 49.44218785880177,
"learning_rate": 4.989176976624511e-09,
"loss": 2.6277,
"mean_token_accuracy": 0.5984482586383819,
"num_tokens": 61072035.0,
"step": 970
},
{
"epoch": 3.18657937806874,
"grad_norm": 51.01216759285889,
"learning_rate": 4.988640341611823e-09,
"loss": 2.6489,
"mean_token_accuracy": 0.5925885379314423,
"num_tokens": 61387223.0,
"step": 975
},
{
"epoch": 3.202945990180033,
"grad_norm": 49.07223807349433,
"learning_rate": 4.988090752899774e-09,
"loss": 2.6261,
"mean_token_accuracy": 0.596798449754715,
"num_tokens": 61702198.0,
"step": 980
},
{
"epoch": 3.2193126022913257,
"grad_norm": 51.019708804907054,
"learning_rate": 4.987528213349046e-09,
"loss": 2.6434,
"mean_token_accuracy": 0.5930721282958984,
"num_tokens": 62016279.0,
"step": 985
},
{
"epoch": 3.2356792144026185,
"grad_norm": 50.396042389119174,
"learning_rate": 4.986952725887732e-09,
"loss": 2.6436,
"mean_token_accuracy": 0.5951732397079468,
"num_tokens": 62332853.0,
"step": 990
},
{
"epoch": 3.2520458265139114,
"grad_norm": 51.4770414159445,
"learning_rate": 4.9863642935113184e-09,
"loss": 2.6492,
"mean_token_accuracy": 0.5950457334518433,
"num_tokens": 62649395.0,
"step": 995
},
{
"epoch": 3.2684124386252047,
"grad_norm": 49.22644140858429,
"learning_rate": 4.985762919282674e-09,
"loss": 2.62,
"mean_token_accuracy": 0.5985433161258698,
"num_tokens": 62963623.0,
"step": 1000
},
{
"epoch": 3.2847790507364976,
"grad_norm": 52.7613105615504,
"learning_rate": 4.9851486063320286e-09,
"loss": 2.6547,
"mean_token_accuracy": 0.5932449102401733,
"num_tokens": 63279133.0,
"step": 1005
},
{
"epoch": 3.3011456628477904,
"grad_norm": 50.00996107413749,
"learning_rate": 4.9845213578569636e-09,
"loss": 2.596,
"mean_token_accuracy": 0.601094377040863,
"num_tokens": 63594496.0,
"step": 1010
},
{
"epoch": 3.3175122749590833,
"grad_norm": 51.50632807901888,
"learning_rate": 4.983881177122389e-09,
"loss": 2.658,
"mean_token_accuracy": 0.5926860809326172,
"num_tokens": 63909565.0,
"step": 1015
},
{
"epoch": 3.3338788870703766,
"grad_norm": 48.43933199467551,
"learning_rate": 4.98322806746053e-09,
"loss": 2.6263,
"mean_token_accuracy": 0.5961672484874725,
"num_tokens": 64225776.0,
"step": 1020
},
{
"epoch": 3.3502454991816695,
"grad_norm": 49.670231681135135,
"learning_rate": 4.982562032270907e-09,
"loss": 2.6224,
"mean_token_accuracy": 0.5976954877376557,
"num_tokens": 64542549.0,
"step": 1025
},
{
"epoch": 3.3666121112929623,
"grad_norm": 51.24543012467001,
"learning_rate": 4.981883075020321e-09,
"loss": 2.6385,
"mean_token_accuracy": 0.5948790550231934,
"num_tokens": 64857858.0,
"step": 1030
},
{
"epoch": 3.382978723404255,
"grad_norm": 49.50915341555558,
"learning_rate": 4.981191199242833e-09,
"loss": 2.6233,
"mean_token_accuracy": 0.5965835630893708,
"num_tokens": 65173178.0,
"step": 1035
},
{
"epoch": 3.399345335515548,
"grad_norm": 52.16472811170959,
"learning_rate": 4.980486408539749e-09,
"loss": 2.6528,
"mean_token_accuracy": 0.5927862703800202,
"num_tokens": 65490079.0,
"step": 1040
},
{
"epoch": 3.4157119476268414,
"grad_norm": 50.313527732755254,
"learning_rate": 4.979768706579595e-09,
"loss": 2.6437,
"mean_token_accuracy": 0.5943560242652893,
"num_tokens": 65803108.0,
"step": 1045
},
{
"epoch": 3.4320785597381342,
"grad_norm": 49.48251544830384,
"learning_rate": 4.979038097098104e-09,
"loss": 2.647,
"mean_token_accuracy": 0.5918027520179748,
"num_tokens": 66118713.0,
"step": 1050
},
{
"epoch": 3.448445171849427,
"grad_norm": 48.972433105538634,
"learning_rate": 4.978294583898196e-09,
"loss": 2.6254,
"mean_token_accuracy": 0.5962560415267945,
"num_tokens": 66433951.0,
"step": 1055
},
{
"epoch": 3.46481178396072,
"grad_norm": 50.314477031346506,
"learning_rate": 4.9775381708499526e-09,
"loss": 2.5885,
"mean_token_accuracy": 0.599977308511734,
"num_tokens": 66749970.0,
"step": 1060
},
{
"epoch": 3.4811783960720133,
"grad_norm": 49.31565636425134,
"learning_rate": 4.9767688618906034e-09,
"loss": 2.6078,
"mean_token_accuracy": 0.598986804485321,
"num_tokens": 67065501.0,
"step": 1065
},
{
"epoch": 3.497545008183306,
"grad_norm": 49.146350694301276,
"learning_rate": 4.9759866610245045e-09,
"loss": 2.6128,
"mean_token_accuracy": 0.5985178530216217,
"num_tokens": 67380313.0,
"step": 1070
},
{
"epoch": 3.513911620294599,
"grad_norm": 50.30669827447236,
"learning_rate": 4.9751915723231105e-09,
"loss": 2.6244,
"mean_token_accuracy": 0.595407634973526,
"num_tokens": 67695668.0,
"step": 1075
},
{
"epoch": 3.530278232405892,
"grad_norm": 51.803022503947055,
"learning_rate": 4.974383599924965e-09,
"loss": 2.6499,
"mean_token_accuracy": 0.5911674559116363,
"num_tokens": 68011201.0,
"step": 1080
},
{
"epoch": 3.5466448445171848,
"grad_norm": 49.796669023866826,
"learning_rate": 4.973562748035669e-09,
"loss": 2.6225,
"mean_token_accuracy": 0.5949772834777832,
"num_tokens": 68327902.0,
"step": 1085
},
{
"epoch": 3.563011456628478,
"grad_norm": 47.91885017050742,
"learning_rate": 4.972729020927865e-09,
"loss": 2.6125,
"mean_token_accuracy": 0.5971624255180359,
"num_tokens": 68644931.0,
"step": 1090
},
{
"epoch": 3.579378068739771,
"grad_norm": 49.87817891616827,
"learning_rate": 4.971882422941212e-09,
"loss": 2.602,
"mean_token_accuracy": 0.5991207838058472,
"num_tokens": 68962382.0,
"step": 1095
},
{
"epoch": 3.595744680851064,
"grad_norm": 48.683634051145795,
"learning_rate": 4.971022958482363e-09,
"loss": 2.6374,
"mean_token_accuracy": 0.5917447209358215,
"num_tokens": 69279280.0,
"step": 1100
},
{
"epoch": 3.6121112929623567,
"grad_norm": 52.2877151961869,
"learning_rate": 4.970150632024943e-09,
"loss": 2.6103,
"mean_token_accuracy": 0.5968870699405671,
"num_tokens": 69596738.0,
"step": 1105
},
{
"epoch": 3.62847790507365,
"grad_norm": 50.138960236262314,
"learning_rate": 4.969265448109526e-09,
"loss": 2.6396,
"mean_token_accuracy": 0.5927744507789612,
"num_tokens": 69911145.0,
"step": 1110
},
{
"epoch": 3.644844517184943,
"grad_norm": 48.7717696588962,
"learning_rate": 4.968367411343611e-09,
"loss": 2.5959,
"mean_token_accuracy": 0.5978303670883178,
"num_tokens": 70226940.0,
"step": 1115
},
{
"epoch": 3.6612111292962357,
"grad_norm": 49.49374181018141,
"learning_rate": 4.967456526401595e-09,
"loss": 2.6144,
"mean_token_accuracy": 0.5930372834205627,
"num_tokens": 70544098.0,
"step": 1120
},
{
"epoch": 3.6775777414075286,
"grad_norm": 47.7742894266053,
"learning_rate": 4.966532798024756e-09,
"loss": 2.5993,
"mean_token_accuracy": 0.5967276990413666,
"num_tokens": 70858464.0,
"step": 1125
},
{
"epoch": 3.6939443535188214,
"grad_norm": 48.24018085532676,
"learning_rate": 4.965596231021221e-09,
"loss": 2.6043,
"mean_token_accuracy": 0.5943282008171081,
"num_tokens": 71173847.0,
"step": 1130
},
{
"epoch": 3.7103109656301143,
"grad_norm": 46.88934534155509,
"learning_rate": 4.964646830265944e-09,
"loss": 2.6083,
"mean_token_accuracy": 0.5917117774486542,
"num_tokens": 71490884.0,
"step": 1135
},
{
"epoch": 3.7266775777414076,
"grad_norm": 47.822075277292676,
"learning_rate": 4.9636846007006784e-09,
"loss": 2.5422,
"mean_token_accuracy": 0.5986848413944245,
"num_tokens": 71807302.0,
"step": 1140
},
{
"epoch": 3.7430441898527005,
"grad_norm": 46.80028842269654,
"learning_rate": 4.9627095473339576e-09,
"loss": 2.5654,
"mean_token_accuracy": 0.5952131927013398,
"num_tokens": 72122097.0,
"step": 1145
},
{
"epoch": 3.7594108019639934,
"grad_norm": 43.95158266274471,
"learning_rate": 4.961721675241062e-09,
"loss": 2.5409,
"mean_token_accuracy": 0.5981755375862121,
"num_tokens": 72436073.0,
"step": 1150
},
{
"epoch": 3.7757774140752867,
"grad_norm": 44.184577219614965,
"learning_rate": 4.960720989563995e-09,
"loss": 2.5074,
"mean_token_accuracy": 0.5997650861740113,
"num_tokens": 72751719.0,
"step": 1155
},
{
"epoch": 3.7921440261865795,
"grad_norm": 41.34369211387233,
"learning_rate": 4.959707495511456e-09,
"loss": 2.5088,
"mean_token_accuracy": 0.5975705504417419,
"num_tokens": 73067477.0,
"step": 1160
},
{
"epoch": 3.8085106382978724,
"grad_norm": 41.04319366765971,
"learning_rate": 4.958681198358815e-09,
"loss": 2.4731,
"mean_token_accuracy": 0.6023198246955872,
"num_tokens": 73384446.0,
"step": 1165
},
{
"epoch": 3.8248772504091653,
"grad_norm": 40.35840500310339,
"learning_rate": 4.957642103448085e-09,
"loss": 2.5386,
"mean_token_accuracy": 0.5931589961051941,
"num_tokens": 73698463.0,
"step": 1170
},
{
"epoch": 3.841243862520458,
"grad_norm": 40.15978306256828,
"learning_rate": 4.956590216187888e-09,
"loss": 2.5528,
"mean_token_accuracy": 0.5887567341327667,
"num_tokens": 74012661.0,
"step": 1175
},
{
"epoch": 3.857610474631751,
"grad_norm": 38.37902076680877,
"learning_rate": 4.955525542053438e-09,
"loss": 2.4691,
"mean_token_accuracy": 0.5988661289215088,
"num_tokens": 74328504.0,
"step": 1180
},
{
"epoch": 3.8739770867430443,
"grad_norm": 38.07754678822132,
"learning_rate": 4.954448086586502e-09,
"loss": 2.479,
"mean_token_accuracy": 0.5997465968132019,
"num_tokens": 74644682.0,
"step": 1185
},
{
"epoch": 3.890343698854337,
"grad_norm": 39.03855697418089,
"learning_rate": 4.953357855395377e-09,
"loss": 2.4996,
"mean_token_accuracy": 0.597508841753006,
"num_tokens": 74960381.0,
"step": 1190
},
{
"epoch": 3.90671031096563,
"grad_norm": 37.97979041564868,
"learning_rate": 4.952254854154861e-09,
"loss": 2.4706,
"mean_token_accuracy": 0.6000802993774415,
"num_tokens": 75275290.0,
"step": 1195
},
{
"epoch": 3.9230769230769234,
"grad_norm": 39.84397161684973,
"learning_rate": 4.951139088606217e-09,
"loss": 2.493,
"mean_token_accuracy": 0.5967117786407471,
"num_tokens": 75590574.0,
"step": 1200
},
{
"epoch": 3.939443535188216,
"grad_norm": 37.68774052927674,
"learning_rate": 4.950010564557154e-09,
"loss": 2.4645,
"mean_token_accuracy": 0.6019287884235383,
"num_tokens": 75905357.0,
"step": 1205
},
{
"epoch": 3.955810147299509,
"grad_norm": 39.52747879667662,
"learning_rate": 4.9488692878817865e-09,
"loss": 2.4973,
"mean_token_accuracy": 0.595619136095047,
"num_tokens": 76221434.0,
"step": 1210
},
{
"epoch": 3.972176759410802,
"grad_norm": 36.7441206054698,
"learning_rate": 4.947715264520609e-09,
"loss": 2.4541,
"mean_token_accuracy": 0.6009849727153778,
"num_tokens": 76536701.0,
"step": 1215
},
{
"epoch": 3.988543371522095,
"grad_norm": 36.456613261964904,
"learning_rate": 4.946548500480466e-09,
"loss": 2.4683,
"mean_token_accuracy": 0.6003902852535248,
"num_tokens": 76853831.0,
"step": 1220
},
{
"epoch": 4.003273322422259,
"grad_norm": 36.21497438273292,
"learning_rate": 4.9453690018345145e-09,
"loss": 2.4962,
"mean_token_accuracy": 0.5997353924645318,
"num_tokens": 77113832.0,
"step": 1225
},
{
"epoch": 4.019639934533552,
"grad_norm": 35.87047882495412,
"learning_rate": 4.944176774722201e-09,
"loss": 2.4601,
"mean_token_accuracy": 0.59957355260849,
"num_tokens": 77428282.0,
"step": 1230
},
{
"epoch": 4.0360065466448445,
"grad_norm": 37.237635620382,
"learning_rate": 4.9429718253492254e-09,
"loss": 2.4653,
"mean_token_accuracy": 0.5990661978721619,
"num_tokens": 77746000.0,
"step": 1235
},
{
"epoch": 4.052373158756137,
"grad_norm": 38.89147301666522,
"learning_rate": 4.941754159987506e-09,
"loss": 2.4703,
"mean_token_accuracy": 0.6005004703998565,
"num_tokens": 78061879.0,
"step": 1240
},
{
"epoch": 4.06873977086743,
"grad_norm": 38.168350494896394,
"learning_rate": 4.94052378497515e-09,
"loss": 2.4951,
"mean_token_accuracy": 0.5971946477890014,
"num_tokens": 78377346.0,
"step": 1245
},
{
"epoch": 4.085106382978723,
"grad_norm": 37.01849369940714,
"learning_rate": 4.939280706716422e-09,
"loss": 2.4873,
"mean_token_accuracy": 0.5959964275360108,
"num_tokens": 78694009.0,
"step": 1250
},
{
"epoch": 4.101472995090016,
"grad_norm": 35.2694041688307,
"learning_rate": 4.938024931681706e-09,
"loss": 2.4285,
"mean_token_accuracy": 0.6062565207481384,
"num_tokens": 79006603.0,
"step": 1255
},
{
"epoch": 4.11783960720131,
"grad_norm": 38.65142188861583,
"learning_rate": 4.936756466407477e-09,
"loss": 2.4936,
"mean_token_accuracy": 0.5969399809837341,
"num_tokens": 79320758.0,
"step": 1260
},
{
"epoch": 4.134206219312603,
"grad_norm": 37.82578716741412,
"learning_rate": 4.935475317496264e-09,
"loss": 2.4921,
"mean_token_accuracy": 0.5950555443763733,
"num_tokens": 79636923.0,
"step": 1265
},
{
"epoch": 4.150572831423895,
"grad_norm": 36.637270956716655,
"learning_rate": 4.934181491616612e-09,
"loss": 2.4587,
"mean_token_accuracy": 0.6006620645523071,
"num_tokens": 79953014.0,
"step": 1270
},
{
"epoch": 4.166939443535188,
"grad_norm": 35.400359882953225,
"learning_rate": 4.9328749955030575e-09,
"loss": 2.4658,
"mean_token_accuracy": 0.5980123102664947,
"num_tokens": 80268771.0,
"step": 1275
},
{
"epoch": 4.183306055646481,
"grad_norm": 36.72319719460392,
"learning_rate": 4.931555835956082e-09,
"loss": 2.4862,
"mean_token_accuracy": 0.5970359206199646,
"num_tokens": 80583167.0,
"step": 1280
},
{
"epoch": 4.199672667757774,
"grad_norm": 36.62421262807592,
"learning_rate": 4.930224019842085e-09,
"loss": 2.4709,
"mean_token_accuracy": 0.600509512424469,
"num_tokens": 80897726.0,
"step": 1285
},
{
"epoch": 4.216039279869067,
"grad_norm": 37.137724593533896,
"learning_rate": 4.928879554093343e-09,
"loss": 2.4701,
"mean_token_accuracy": 0.5969075500965119,
"num_tokens": 81213422.0,
"step": 1290
},
{
"epoch": 4.23240589198036,
"grad_norm": 36.02686338812537,
"learning_rate": 4.927522445707978e-09,
"loss": 2.4485,
"mean_token_accuracy": 0.600042587518692,
"num_tokens": 81530245.0,
"step": 1295
},
{
"epoch": 4.248772504091653,
"grad_norm": 36.28731293885398,
"learning_rate": 4.926152701749917e-09,
"loss": 2.4685,
"mean_token_accuracy": 0.5975348889827728,
"num_tokens": 81843704.0,
"step": 1300
},
{
"epoch": 4.265139116202946,
"grad_norm": 36.618856102825845,
"learning_rate": 4.924770329348854e-09,
"loss": 2.49,
"mean_token_accuracy": 0.5962951421737671,
"num_tokens": 82160459.0,
"step": 1305
},
{
"epoch": 4.281505728314239,
"grad_norm": 38.21558740491567,
"learning_rate": 4.923375335700223e-09,
"loss": 2.49,
"mean_token_accuracy": 0.5959554374217987,
"num_tokens": 82476106.0,
"step": 1310
},
{
"epoch": 4.297872340425532,
"grad_norm": 37.497039235613,
"learning_rate": 4.921967728065147e-09,
"loss": 2.4849,
"mean_token_accuracy": 0.5967646718025208,
"num_tokens": 82792051.0,
"step": 1315
},
{
"epoch": 4.314238952536825,
"grad_norm": 34.48094366633381,
"learning_rate": 4.920547513770408e-09,
"loss": 2.445,
"mean_token_accuracy": 0.6008966147899628,
"num_tokens": 83106820.0,
"step": 1320
},
{
"epoch": 4.330605564648118,
"grad_norm": 37.813730656736176,
"learning_rate": 4.919114700208408e-09,
"loss": 2.4607,
"mean_token_accuracy": 0.5983804702758789,
"num_tokens": 83421720.0,
"step": 1325
},
{
"epoch": 4.346972176759411,
"grad_norm": 35.513809408398906,
"learning_rate": 4.917669294837129e-09,
"loss": 2.4551,
"mean_token_accuracy": 0.5996524155139923,
"num_tokens": 83738044.0,
"step": 1330
},
{
"epoch": 4.363338788870704,
"grad_norm": 36.18508976141847,
"learning_rate": 4.916211305180096e-09,
"loss": 2.42,
"mean_token_accuracy": 0.6065082490444184,
"num_tokens": 84055112.0,
"step": 1335
},
{
"epoch": 4.3797054009819965,
"grad_norm": 35.41417337406567,
"learning_rate": 4.9147407388263365e-09,
"loss": 2.472,
"mean_token_accuracy": 0.5982517778873444,
"num_tokens": 84370037.0,
"step": 1340
},
{
"epoch": 4.396072013093289,
"grad_norm": 35.797678528257016,
"learning_rate": 4.913257603430341e-09,
"loss": 2.479,
"mean_token_accuracy": 0.5972541332244873,
"num_tokens": 84685592.0,
"step": 1345
},
{
"epoch": 4.412438625204583,
"grad_norm": 37.599742811404646,
"learning_rate": 4.9117619067120245e-09,
"loss": 2.5071,
"mean_token_accuracy": 0.5945683479309082,
"num_tokens": 84999613.0,
"step": 1350
},
{
"epoch": 4.428805237315876,
"grad_norm": 35.00424778002652,
"learning_rate": 4.910253656456683e-09,
"loss": 2.4543,
"mean_token_accuracy": 0.5988426804542542,
"num_tokens": 85316360.0,
"step": 1355
},
{
"epoch": 4.445171849427169,
"grad_norm": 36.30117151566015,
"learning_rate": 4.908732860514958e-09,
"loss": 2.4747,
"mean_token_accuracy": 0.5954937934875488,
"num_tokens": 85633390.0,
"step": 1360
},
{
"epoch": 4.461538461538462,
"grad_norm": 36.06452936238572,
"learning_rate": 4.907199526802791e-09,
"loss": 2.4289,
"mean_token_accuracy": 0.6033567905426025,
"num_tokens": 85949251.0,
"step": 1365
},
{
"epoch": 4.4779050736497545,
"grad_norm": 37.51693653126231,
"learning_rate": 4.905653663301387e-09,
"loss": 2.5224,
"mean_token_accuracy": 0.5897888660430908,
"num_tokens": 86265830.0,
"step": 1370
},
{
"epoch": 4.494271685761047,
"grad_norm": 35.854037607697414,
"learning_rate": 4.904095278057166e-09,
"loss": 2.4691,
"mean_token_accuracy": 0.5975383460521698,
"num_tokens": 86581476.0,
"step": 1375
},
{
"epoch": 4.51063829787234,
"grad_norm": 36.325492671950144,
"learning_rate": 4.902524379181728e-09,
"loss": 2.4419,
"mean_token_accuracy": 0.5999314069747925,
"num_tokens": 86898519.0,
"step": 1380
},
{
"epoch": 4.527004909983633,
"grad_norm": 36.93795273948186,
"learning_rate": 4.90094097485181e-09,
"loss": 2.4828,
"mean_token_accuracy": 0.59447141289711,
"num_tokens": 87216046.0,
"step": 1385
},
{
"epoch": 4.543371522094926,
"grad_norm": 34.85070068347699,
"learning_rate": 4.899345073309236e-09,
"loss": 2.4256,
"mean_token_accuracy": 0.6024092137813568,
"num_tokens": 87532275.0,
"step": 1390
},
{
"epoch": 4.559738134206219,
"grad_norm": 37.081910141065485,
"learning_rate": 4.8977366828608846e-09,
"loss": 2.5232,
"mean_token_accuracy": 0.5894258916378021,
"num_tokens": 87847744.0,
"step": 1395
},
{
"epoch": 4.576104746317513,
"grad_norm": 35.447906810365794,
"learning_rate": 4.896115811878639e-09,
"loss": 2.4367,
"mean_token_accuracy": 0.6020602822303772,
"num_tokens": 88163634.0,
"step": 1400
},
{
"epoch": 4.5924713584288055,
"grad_norm": 37.20055072424884,
"learning_rate": 4.8944824687993435e-09,
"loss": 2.4496,
"mean_token_accuracy": 0.6003972291946411,
"num_tokens": 88478549.0,
"step": 1405
},
{
"epoch": 4.608837970540098,
"grad_norm": 37.04974152317881,
"learning_rate": 4.892836662124766e-09,
"loss": 2.4788,
"mean_token_accuracy": 0.5948118209838867,
"num_tokens": 88794566.0,
"step": 1410
},
{
"epoch": 4.625204582651391,
"grad_norm": 35.91992247765496,
"learning_rate": 4.891178400421543e-09,
"loss": 2.4532,
"mean_token_accuracy": 0.5996142148971557,
"num_tokens": 89109615.0,
"step": 1415
},
{
"epoch": 4.641571194762684,
"grad_norm": 36.935699347637375,
"learning_rate": 4.889507692321146e-09,
"loss": 2.4788,
"mean_token_accuracy": 0.5942489326000213,
"num_tokens": 89424020.0,
"step": 1420
},
{
"epoch": 4.657937806873977,
"grad_norm": 36.535263648437336,
"learning_rate": 4.88782454651983e-09,
"loss": 2.4642,
"mean_token_accuracy": 0.5973877847194672,
"num_tokens": 89739762.0,
"step": 1425
},
{
"epoch": 4.67430441898527,
"grad_norm": 35.09943792092194,
"learning_rate": 4.88612897177859e-09,
"loss": 2.4584,
"mean_token_accuracy": 0.5972561419010163,
"num_tokens": 90056358.0,
"step": 1430
},
{
"epoch": 4.690671031096563,
"grad_norm": 36.40251641806779,
"learning_rate": 4.884420976923112e-09,
"loss": 2.4699,
"mean_token_accuracy": 0.594540125131607,
"num_tokens": 90373815.0,
"step": 1435
},
{
"epoch": 4.7070376432078564,
"grad_norm": 36.20096084241157,
"learning_rate": 4.882700570843737e-09,
"loss": 2.4284,
"mean_token_accuracy": 0.6023145020008087,
"num_tokens": 90690273.0,
"step": 1440
},
{
"epoch": 4.723404255319149,
"grad_norm": 34.99455458968427,
"learning_rate": 4.880967762495401e-09,
"loss": 2.481,
"mean_token_accuracy": 0.5950899481773376,
"num_tokens": 91007144.0,
"step": 1445
},
{
"epoch": 4.739770867430442,
"grad_norm": 35.72119070392356,
"learning_rate": 4.8792225608976e-09,
"loss": 2.452,
"mean_token_accuracy": 0.5972677767276764,
"num_tokens": 91321113.0,
"step": 1450
},
{
"epoch": 4.756137479541735,
"grad_norm": 35.9579226923624,
"learning_rate": 4.8774649751343384e-09,
"loss": 2.4639,
"mean_token_accuracy": 0.5965704917907715,
"num_tokens": 91636320.0,
"step": 1455
},
{
"epoch": 4.772504091653028,
"grad_norm": 36.28261360146757,
"learning_rate": 4.875695014354079e-09,
"loss": 2.4266,
"mean_token_accuracy": 0.6007283627986908,
"num_tokens": 91951445.0,
"step": 1460
},
{
"epoch": 4.788870703764321,
"grad_norm": 37.16953048348806,
"learning_rate": 4.8739126877697e-09,
"loss": 2.4722,
"mean_token_accuracy": 0.5935324609279633,
"num_tokens": 92265893.0,
"step": 1465
},
{
"epoch": 4.805237315875614,
"grad_norm": 36.25198111318839,
"learning_rate": 4.872118004658446e-09,
"loss": 2.4446,
"mean_token_accuracy": 0.5985255479812622,
"num_tokens": 92582049.0,
"step": 1470
},
{
"epoch": 4.8216039279869065,
"grad_norm": 34.95866540091871,
"learning_rate": 4.8703109743618775e-09,
"loss": 2.4311,
"mean_token_accuracy": 0.6004527628421783,
"num_tokens": 92899566.0,
"step": 1475
},
{
"epoch": 4.837970540098199,
"grad_norm": 35.953089394319996,
"learning_rate": 4.868491606285823e-09,
"loss": 2.4457,
"mean_token_accuracy": 0.5963627815246582,
"num_tokens": 93212824.0,
"step": 1480
},
{
"epoch": 4.854337152209492,
"grad_norm": 36.34600921156728,
"learning_rate": 4.866659909900334e-09,
"loss": 2.4735,
"mean_token_accuracy": 0.5948220491409302,
"num_tokens": 93528357.0,
"step": 1485
},
{
"epoch": 4.870703764320785,
"grad_norm": 36.11778890464434,
"learning_rate": 4.864815894739629e-09,
"loss": 2.444,
"mean_token_accuracy": 0.5960727274417877,
"num_tokens": 93843459.0,
"step": 1490
},
{
"epoch": 4.887070376432079,
"grad_norm": 36.19249009284775,
"learning_rate": 4.862959570402049e-09,
"loss": 2.4493,
"mean_token_accuracy": 0.5964035809040069,
"num_tokens": 94158516.0,
"step": 1495
},
{
"epoch": 4.903436988543372,
"grad_norm": 35.86509172201521,
"learning_rate": 4.8610909465500055e-09,
"loss": 2.4221,
"mean_token_accuracy": 0.5996748507022858,
"num_tokens": 94473674.0,
"step": 1500
},
{
"epoch": 4.919803600654665,
"grad_norm": 36.534709819616,
"learning_rate": 4.859210032909931e-09,
"loss": 2.4551,
"mean_token_accuracy": 0.5964705228805542,
"num_tokens": 94789597.0,
"step": 1505
},
{
"epoch": 4.9361702127659575,
"grad_norm": 36.071966143335516,
"learning_rate": 4.857316839272229e-09,
"loss": 2.4456,
"mean_token_accuracy": 0.5960875868797302,
"num_tokens": 95103661.0,
"step": 1510
},
{
"epoch": 4.95253682487725,
"grad_norm": 33.949806954557204,
"learning_rate": 4.855411375491217e-09,
"loss": 2.4029,
"mean_token_accuracy": 0.6011349201202393,
"num_tokens": 95419831.0,
"step": 1515
},
{
"epoch": 4.968903436988543,
"grad_norm": 36.025707856738336,
"learning_rate": 4.853493651485088e-09,
"loss": 2.4658,
"mean_token_accuracy": 0.5928048253059387,
"num_tokens": 95735502.0,
"step": 1520
},
{
"epoch": 4.985270049099836,
"grad_norm": 35.54506660841942,
"learning_rate": 4.851563677235845e-09,
"loss": 2.4165,
"mean_token_accuracy": 0.6018769025802613,
"num_tokens": 96053078.0,
"step": 1525
},
{
"epoch": 5.0,
"grad_norm": 35.316681076979684,
"learning_rate": 4.849621462789257e-09,
"loss": 2.466,
"mean_token_accuracy": 0.5903970334264967,
"num_tokens": 96313602.0,
"step": 1530
},
{
"epoch": 5.016366612111293,
"grad_norm": 34.48683940022847,
"learning_rate": 4.8476670182548045e-09,
"loss": 2.4255,
"mean_token_accuracy": 0.598287183046341,
"num_tokens": 96629562.0,
"step": 1535
},
{
"epoch": 5.032733224222586,
"grad_norm": 34.658985731341446,
"learning_rate": 4.8457003538056285e-09,
"loss": 2.3785,
"mean_token_accuracy": 0.6074756979942322,
"num_tokens": 96946467.0,
"step": 1540
},
{
"epoch": 5.049099836333879,
"grad_norm": 33.853524601076565,
"learning_rate": 4.843721479678476e-09,
"loss": 2.4122,
"mean_token_accuracy": 0.6005301892757415,
"num_tokens": 97260313.0,
"step": 1545
},
{
"epoch": 5.0654664484451715,
"grad_norm": 35.74543512906391,
"learning_rate": 4.841730406173645e-09,
"loss": 2.4103,
"mean_token_accuracy": 0.6016847252845764,
"num_tokens": 97578758.0,
"step": 1550
},
{
"epoch": 5.081833060556465,
"grad_norm": 35.48837441704123,
"learning_rate": 4.839727143654934e-09,
"loss": 2.3891,
"mean_token_accuracy": 0.6052005052566528,
"num_tokens": 97895124.0,
"step": 1555
},
{
"epoch": 5.098199672667758,
"grad_norm": 35.52394836029277,
"learning_rate": 4.837711702549589e-09,
"loss": 2.3923,
"mean_token_accuracy": 0.6032778918743134,
"num_tokens": 98209910.0,
"step": 1560
},
{
"epoch": 5.114566284779051,
"grad_norm": 35.90933473348734,
"learning_rate": 4.835684093348244e-09,
"loss": 2.4441,
"mean_token_accuracy": 0.5969228565692901,
"num_tokens": 98525651.0,
"step": 1565
},
{
"epoch": 5.130932896890344,
"grad_norm": 36.39247016426329,
"learning_rate": 4.83364432660487e-09,
"loss": 2.4319,
"mean_token_accuracy": 0.5964118003845215,
"num_tokens": 98842269.0,
"step": 1570
},
{
"epoch": 5.147299509001637,
"grad_norm": 35.63480593062428,
"learning_rate": 4.8315924129367224e-09,
"loss": 2.4305,
"mean_token_accuracy": 0.597532719373703,
"num_tokens": 99157104.0,
"step": 1575
},
{
"epoch": 5.1636661211129296,
"grad_norm": 35.23834160461372,
"learning_rate": 4.829528363024279e-09,
"loss": 2.4339,
"mean_token_accuracy": 0.5947438180446625,
"num_tokens": 99472398.0,
"step": 1580
},
{
"epoch": 5.180032733224222,
"grad_norm": 35.93484257317461,
"learning_rate": 4.827452187611192e-09,
"loss": 2.4657,
"mean_token_accuracy": 0.5917093694210053,
"num_tokens": 99785710.0,
"step": 1585
},
{
"epoch": 5.196399345335515,
"grad_norm": 34.64447666981932,
"learning_rate": 4.825363897504226e-09,
"loss": 2.409,
"mean_token_accuracy": 0.5984897494316102,
"num_tokens": 100101052.0,
"step": 1590
},
{
"epoch": 5.212765957446808,
"grad_norm": 36.10117173060778,
"learning_rate": 4.823263503573204e-09,
"loss": 2.421,
"mean_token_accuracy": 0.5984189927577972,
"num_tokens": 100417907.0,
"step": 1595
},
{
"epoch": 5.229132569558102,
"grad_norm": 32.118403250633314,
"learning_rate": 4.821151016750953e-09,
"loss": 2.3962,
"mean_token_accuracy": 0.6026824653148651,
"num_tokens": 100734371.0,
"step": 1600
},
{
"epoch": 5.245499181669395,
"grad_norm": 34.52896891196535,
"learning_rate": 4.819026448033244e-09,
"loss": 2.3906,
"mean_token_accuracy": 0.6012833952903748,
"num_tokens": 101050013.0,
"step": 1605
},
{
"epoch": 5.261865793780688,
"grad_norm": 32.79111439062977,
"learning_rate": 4.816889808478735e-09,
"loss": 2.3625,
"mean_token_accuracy": 0.6078511595726013,
"num_tokens": 101365831.0,
"step": 1610
},
{
"epoch": 5.2782324058919805,
"grad_norm": 33.66051847624121,
"learning_rate": 4.814741109208916e-09,
"loss": 2.3859,
"mean_token_accuracy": 0.6022830188274384,
"num_tokens": 101681937.0,
"step": 1615
},
{
"epoch": 5.294599018003273,
"grad_norm": 32.46266480007908,
"learning_rate": 4.812580361408048e-09,
"loss": 2.3444,
"mean_token_accuracy": 0.6072778820991516,
"num_tokens": 101998221.0,
"step": 1620
},
{
"epoch": 5.310965630114566,
"grad_norm": 33.090685851416744,
"learning_rate": 4.810407576323107e-09,
"loss": 2.3883,
"mean_token_accuracy": 0.6031101226806641,
"num_tokens": 102314712.0,
"step": 1625
},
{
"epoch": 5.327332242225859,
"grad_norm": 34.59572822062589,
"learning_rate": 4.808222765263724e-09,
"loss": 2.3654,
"mean_token_accuracy": 0.6064081609249115,
"num_tokens": 102630928.0,
"step": 1630
},
{
"epoch": 5.343698854337152,
"grad_norm": 33.27667097985031,
"learning_rate": 4.8060259396021264e-09,
"loss": 2.3848,
"mean_token_accuracy": 0.6004369139671326,
"num_tokens": 102947814.0,
"step": 1635
},
{
"epoch": 5.360065466448445,
"grad_norm": 32.59372126839617,
"learning_rate": 4.803817110773081e-09,
"loss": 2.3775,
"mean_token_accuracy": 0.6084607303142547,
"num_tokens": 103263194.0,
"step": 1640
},
{
"epoch": 5.376432078559738,
"grad_norm": 33.619174051544235,
"learning_rate": 4.801596290273832e-09,
"loss": 2.3844,
"mean_token_accuracy": 0.606970465183258,
"num_tokens": 103578803.0,
"step": 1645
},
{
"epoch": 5.3927986906710315,
"grad_norm": 32.14783977338467,
"learning_rate": 4.799363489664039e-09,
"loss": 2.3604,
"mean_token_accuracy": 0.6091885685920715,
"num_tokens": 103894623.0,
"step": 1650
},
{
"epoch": 5.409165302782324,
"grad_norm": 32.73860509638947,
"learning_rate": 4.797118720565724e-09,
"loss": 2.4045,
"mean_token_accuracy": 0.599544358253479,
"num_tokens": 104211684.0,
"step": 1655
},
{
"epoch": 5.425531914893617,
"grad_norm": 33.06579842685092,
"learning_rate": 4.794861994663205e-09,
"loss": 2.3551,
"mean_token_accuracy": 0.6075682699680328,
"num_tokens": 104527791.0,
"step": 1660
},
{
"epoch": 5.44189852700491,
"grad_norm": 32.745445010813356,
"learning_rate": 4.792593323703035e-09,
"loss": 2.3591,
"mean_token_accuracy": 0.6083964228630065,
"num_tokens": 104842509.0,
"step": 1665
},
{
"epoch": 5.458265139116203,
"grad_norm": 33.35194456579634,
"learning_rate": 4.790312719493944e-09,
"loss": 2.3717,
"mean_token_accuracy": 0.6044698596000672,
"num_tokens": 105157961.0,
"step": 1670
},
{
"epoch": 5.474631751227496,
"grad_norm": 33.6853627434255,
"learning_rate": 4.788020193906776e-09,
"loss": 2.4123,
"mean_token_accuracy": 0.5995143771171569,
"num_tokens": 105472057.0,
"step": 1675
},
{
"epoch": 5.490998363338789,
"grad_norm": 32.41724618948233,
"learning_rate": 4.785715758874428e-09,
"loss": 2.36,
"mean_token_accuracy": 0.6099441170692443,
"num_tokens": 105789028.0,
"step": 1680
},
{
"epoch": 5.5073649754500815,
"grad_norm": 33.039731094631016,
"learning_rate": 4.783399426391786e-09,
"loss": 2.3617,
"mean_token_accuracy": 0.6067679703235627,
"num_tokens": 106105035.0,
"step": 1685
},
{
"epoch": 5.523731587561375,
"grad_norm": 32.798434224683184,
"learning_rate": 4.781071208515665e-09,
"loss": 2.3335,
"mean_token_accuracy": 0.6111400902271271,
"num_tokens": 106420164.0,
"step": 1690
},
{
"epoch": 5.540098199672668,
"grad_norm": 31.195355670151503,
"learning_rate": 4.778731117364744e-09,
"loss": 2.3537,
"mean_token_accuracy": 0.6082989335060119,
"num_tokens": 106733630.0,
"step": 1695
},
{
"epoch": 5.556464811783961,
"grad_norm": 32.15524904031231,
"learning_rate": 4.7763791651195035e-09,
"loss": 2.3555,
"mean_token_accuracy": 0.607761561870575,
"num_tokens": 107049953.0,
"step": 1700
},
{
"epoch": 5.572831423895254,
"grad_norm": 31.920502634421535,
"learning_rate": 4.774015364022165e-09,
"loss": 2.3545,
"mean_token_accuracy": 0.6080652475357056,
"num_tokens": 107365569.0,
"step": 1705
},
{
"epoch": 5.589198036006547,
"grad_norm": 30.981864636239646,
"learning_rate": 4.7716397263766215e-09,
"loss": 2.3248,
"mean_token_accuracy": 0.6130104541778565,
"num_tokens": 107681393.0,
"step": 1710
},
{
"epoch": 5.60556464811784,
"grad_norm": 31.488785549629814,
"learning_rate": 4.7692522645483796e-09,
"loss": 2.3347,
"mean_token_accuracy": 0.6084707498550415,
"num_tokens": 107996454.0,
"step": 1715
},
{
"epoch": 5.6219312602291325,
"grad_norm": 32.70963616909119,
"learning_rate": 4.766852990964491e-09,
"loss": 2.3575,
"mean_token_accuracy": 0.6064568817615509,
"num_tokens": 108311689.0,
"step": 1720
},
{
"epoch": 5.638297872340425,
"grad_norm": 30.48960666827236,
"learning_rate": 4.76444191811349e-09,
"loss": 2.3296,
"mean_token_accuracy": 0.6116202116012573,
"num_tokens": 108628481.0,
"step": 1725
},
{
"epoch": 5.654664484451718,
"grad_norm": 31.46744994781824,
"learning_rate": 4.762019058545326e-09,
"loss": 2.3357,
"mean_token_accuracy": 0.6112555265426636,
"num_tokens": 108944800.0,
"step": 1730
},
{
"epoch": 5.671031096563011,
"grad_norm": 31.5451605308921,
"learning_rate": 4.759584424871302e-09,
"loss": 2.329,
"mean_token_accuracy": 0.6095126152038575,
"num_tokens": 109260521.0,
"step": 1735
},
{
"epoch": 5.687397708674304,
"grad_norm": 31.012545675589063,
"learning_rate": 4.757138029764003e-09,
"loss": 2.3571,
"mean_token_accuracy": 0.6056615591049195,
"num_tokens": 109574830.0,
"step": 1740
},
{
"epoch": 5.703764320785598,
"grad_norm": 30.138938836035837,
"learning_rate": 4.754679885957239e-09,
"loss": 2.3023,
"mean_token_accuracy": 0.6127878904342652,
"num_tokens": 109890335.0,
"step": 1745
},
{
"epoch": 5.720130932896891,
"grad_norm": 29.217074277624654,
"learning_rate": 4.7522100062459706e-09,
"loss": 2.3089,
"mean_token_accuracy": 0.6136601090431213,
"num_tokens": 110205835.0,
"step": 1750
},
{
"epoch": 5.736497545008183,
"grad_norm": 30.51562401439012,
"learning_rate": 4.749728403486245e-09,
"loss": 2.3161,
"mean_token_accuracy": 0.6105396926403046,
"num_tokens": 110521915.0,
"step": 1755
},
{
"epoch": 5.752864157119476,
"grad_norm": 29.819759106075676,
"learning_rate": 4.747235090595129e-09,
"loss": 2.349,
"mean_token_accuracy": 0.6054076015949249,
"num_tokens": 110837238.0,
"step": 1760
},
{
"epoch": 5.769230769230769,
"grad_norm": 30.89858633499367,
"learning_rate": 4.744730080550646e-09,
"loss": 2.3386,
"mean_token_accuracy": 0.6067017555236817,
"num_tokens": 111153241.0,
"step": 1765
},
{
"epoch": 5.785597381342062,
"grad_norm": 30.91938995645458,
"learning_rate": 4.742213386391698e-09,
"loss": 2.3483,
"mean_token_accuracy": 0.604193365573883,
"num_tokens": 111468960.0,
"step": 1770
},
{
"epoch": 5.801963993453355,
"grad_norm": 27.323433001798556,
"learning_rate": 4.739685021218012e-09,
"loss": 2.2693,
"mean_token_accuracy": 0.6164486765861511,
"num_tokens": 111786342.0,
"step": 1775
},
{
"epoch": 5.818330605564648,
"grad_norm": 30.739373596899014,
"learning_rate": 4.737144998190057e-09,
"loss": 2.3301,
"mean_token_accuracy": 0.6068127572536468,
"num_tokens": 112101386.0,
"step": 1780
},
{
"epoch": 5.8346972176759415,
"grad_norm": 29.304255959591316,
"learning_rate": 4.734593330528989e-09,
"loss": 2.2716,
"mean_token_accuracy": 0.6155919015407563,
"num_tokens": 112415452.0,
"step": 1785
},
{
"epoch": 5.851063829787234,
"grad_norm": 29.784953751928743,
"learning_rate": 4.732030031516571e-09,
"loss": 2.2885,
"mean_token_accuracy": 0.6114451467990876,
"num_tokens": 112730033.0,
"step": 1790
},
{
"epoch": 5.867430441898527,
"grad_norm": 28.262213285997248,
"learning_rate": 4.72945511449511e-09,
"loss": 2.3039,
"mean_token_accuracy": 0.6072455883026123,
"num_tokens": 113045711.0,
"step": 1795
},
{
"epoch": 5.88379705400982,
"grad_norm": 28.352633321650952,
"learning_rate": 4.726868592867388e-09,
"loss": 2.266,
"mean_token_accuracy": 0.6129730522632599,
"num_tokens": 113360554.0,
"step": 1800
},
{
"epoch": 5.900163666121113,
"grad_norm": 28.894218746546375,
"learning_rate": 4.724270480096589e-09,
"loss": 2.298,
"mean_token_accuracy": 0.6091433703899384,
"num_tokens": 113675208.0,
"step": 1805
},
{
"epoch": 5.916530278232406,
"grad_norm": 26.932849836839313,
"learning_rate": 4.721660789706232e-09,
"loss": 2.2583,
"mean_token_accuracy": 0.6158608973026276,
"num_tokens": 113990879.0,
"step": 1810
},
{
"epoch": 5.932896890343699,
"grad_norm": 26.642378048564314,
"learning_rate": 4.719039535280095e-09,
"loss": 2.2566,
"mean_token_accuracy": 0.6155524849891663,
"num_tokens": 114306432.0,
"step": 1815
},
{
"epoch": 5.949263502454992,
"grad_norm": 26.350950920645698,
"learning_rate": 4.716406730462153e-09,
"loss": 2.2372,
"mean_token_accuracy": 0.6172758162021637,
"num_tokens": 114622631.0,
"step": 1820
},
{
"epoch": 5.9656301145662844,
"grad_norm": 27.44466078553468,
"learning_rate": 4.713762388956501e-09,
"loss": 2.262,
"mean_token_accuracy": 0.6123747944831848,
"num_tokens": 114937555.0,
"step": 1825
},
{
"epoch": 5.981996726677577,
"grad_norm": 26.323128620614657,
"learning_rate": 4.71110652452728e-09,
"loss": 2.2493,
"mean_token_accuracy": 0.6143295288085937,
"num_tokens": 115252837.0,
"step": 1830
},
{
"epoch": 5.998363338788871,
"grad_norm": 25.783928944992137,
"learning_rate": 4.7084391509986155e-09,
"loss": 2.2469,
"mean_token_accuracy": 0.613012844324112,
"num_tokens": 115568359.0,
"step": 1835
},
{
"epoch": 6.013093289689034,
"grad_norm": 25.525666646792722,
"learning_rate": 4.705760282254537e-09,
"loss": 2.2152,
"mean_token_accuracy": 0.6208844979604086,
"num_tokens": 115828706.0,
"step": 1840
},
{
"epoch": 6.029459901800327,
"grad_norm": 26.755470190379572,
"learning_rate": 4.703069932238906e-09,
"loss": 2.263,
"mean_token_accuracy": 0.61106738448143,
"num_tokens": 116144638.0,
"step": 1845
},
{
"epoch": 6.045826513911621,
"grad_norm": 24.507907534529128,
"learning_rate": 4.7003681149553475e-09,
"loss": 2.263,
"mean_token_accuracy": 0.6099263489246368,
"num_tokens": 116458739.0,
"step": 1850
},
{
"epoch": 6.062193126022914,
"grad_norm": 26.141828238324674,
"learning_rate": 4.697654844467175e-09,
"loss": 2.21,
"mean_token_accuracy": 0.6175226509571076,
"num_tokens": 116773686.0,
"step": 1855
},
{
"epoch": 6.0785597381342065,
"grad_norm": 25.07581997868601,
"learning_rate": 4.6949301348973174e-09,
"loss": 2.2532,
"mean_token_accuracy": 0.6104423046112061,
"num_tokens": 117090382.0,
"step": 1860
},
{
"epoch": 6.094926350245499,
"grad_norm": 24.482814976190816,
"learning_rate": 4.692194000428245e-09,
"loss": 2.2089,
"mean_token_accuracy": 0.6172590911388397,
"num_tokens": 117405941.0,
"step": 1865
},
{
"epoch": 6.111292962356792,
"grad_norm": 24.43522181875196,
"learning_rate": 4.6894464553018976e-09,
"loss": 2.2496,
"mean_token_accuracy": 0.6107006132602691,
"num_tokens": 117722695.0,
"step": 1870
},
{
"epoch": 6.127659574468085,
"grad_norm": 27.28050902421836,
"learning_rate": 4.686687513819606e-09,
"loss": 2.187,
"mean_token_accuracy": 0.6217321693897248,
"num_tokens": 118037466.0,
"step": 1875
},
{
"epoch": 6.144026186579378,
"grad_norm": 24.463275743963848,
"learning_rate": 4.6839171903420245e-09,
"loss": 2.1935,
"mean_token_accuracy": 0.6205372273921966,
"num_tokens": 118352819.0,
"step": 1880
},
{
"epoch": 6.160392798690671,
"grad_norm": 24.00192156906821,
"learning_rate": 4.681135499289048e-09,
"loss": 2.2127,
"mean_token_accuracy": 0.6138154864311218,
"num_tokens": 118668463.0,
"step": 1885
},
{
"epoch": 6.176759410801964,
"grad_norm": 24.372092039508235,
"learning_rate": 4.678342455139744e-09,
"loss": 2.1885,
"mean_token_accuracy": 0.6191801726818085,
"num_tokens": 118983231.0,
"step": 1890
},
{
"epoch": 6.1931260229132565,
"grad_norm": 23.624206455606668,
"learning_rate": 4.675538072432276e-09,
"loss": 2.2036,
"mean_token_accuracy": 0.6157269537448883,
"num_tokens": 119298059.0,
"step": 1895
},
{
"epoch": 6.20949263502455,
"grad_norm": 23.629247772201367,
"learning_rate": 4.672722365763821e-09,
"loss": 2.1807,
"mean_token_accuracy": 0.6190297603607178,
"num_tokens": 119613994.0,
"step": 1900
},
{
"epoch": 6.225859247135843,
"grad_norm": 23.635670695000215,
"learning_rate": 4.669895349790502e-09,
"loss": 2.2345,
"mean_token_accuracy": 0.6094496965408325,
"num_tokens": 119930637.0,
"step": 1905
},
{
"epoch": 6.242225859247136,
"grad_norm": 23.665973763482956,
"learning_rate": 4.667057039227308e-09,
"loss": 2.1728,
"mean_token_accuracy": 0.6191839039325714,
"num_tokens": 120245981.0,
"step": 1910
},
{
"epoch": 6.258592471358429,
"grad_norm": 24.159573167994676,
"learning_rate": 4.664207448848018e-09,
"loss": 2.1897,
"mean_token_accuracy": 0.6166786015033722,
"num_tokens": 120560959.0,
"step": 1915
},
{
"epoch": 6.274959083469722,
"grad_norm": 22.631008864459677,
"learning_rate": 4.661346593485124e-09,
"loss": 2.1623,
"mean_token_accuracy": 0.6211640655994415,
"num_tokens": 120878150.0,
"step": 1920
},
{
"epoch": 6.291325695581015,
"grad_norm": 23.296439842018362,
"learning_rate": 4.658474488029753e-09,
"loss": 2.1886,
"mean_token_accuracy": 0.6151513159275055,
"num_tokens": 121191333.0,
"step": 1925
},
{
"epoch": 6.3076923076923075,
"grad_norm": 22.80831617398982,
"learning_rate": 4.655591147431589e-09,
"loss": 2.1641,
"mean_token_accuracy": 0.6197402775287628,
"num_tokens": 121506516.0,
"step": 1930
},
{
"epoch": 6.3240589198036,
"grad_norm": 22.872432441327586,
"learning_rate": 4.652696586698801e-09,
"loss": 2.1717,
"mean_token_accuracy": 0.6172288477420806,
"num_tokens": 121820087.0,
"step": 1935
},
{
"epoch": 6.340425531914893,
"grad_norm": 21.855626803707125,
"learning_rate": 4.649790820897955e-09,
"loss": 2.1661,
"mean_token_accuracy": 0.6183198809623718,
"num_tokens": 122136793.0,
"step": 1940
},
{
"epoch": 6.356792144026187,
"grad_norm": 21.4551134123129,
"learning_rate": 4.646873865153944e-09,
"loss": 2.1404,
"mean_token_accuracy": 0.622417813539505,
"num_tokens": 122452611.0,
"step": 1945
},
{
"epoch": 6.37315875613748,
"grad_norm": 22.172464286307513,
"learning_rate": 4.6439457346499045e-09,
"loss": 2.1567,
"mean_token_accuracy": 0.6196485161781311,
"num_tokens": 122766828.0,
"step": 1950
},
{
"epoch": 6.389525368248773,
"grad_norm": 22.38339117966395,
"learning_rate": 4.641006444627141e-09,
"loss": 2.1793,
"mean_token_accuracy": 0.6164099514484406,
"num_tokens": 123081686.0,
"step": 1955
},
{
"epoch": 6.405891980360066,
"grad_norm": 23.618424749681687,
"learning_rate": 4.638056010385042e-09,
"loss": 2.1862,
"mean_token_accuracy": 0.6123735785484314,
"num_tokens": 123398051.0,
"step": 1960
},
{
"epoch": 6.422258592471358,
"grad_norm": 21.65478232539387,
"learning_rate": 4.635094447281006e-09,
"loss": 2.16,
"mean_token_accuracy": 0.6157838463783264,
"num_tokens": 123712386.0,
"step": 1965
},
{
"epoch": 6.438625204582651,
"grad_norm": 21.99814700391443,
"learning_rate": 4.632121770730357e-09,
"loss": 2.1315,
"mean_token_accuracy": 0.6206058323383331,
"num_tokens": 124028129.0,
"step": 1970
},
{
"epoch": 6.454991816693944,
"grad_norm": 21.24229531471878,
"learning_rate": 4.629137996206266e-09,
"loss": 2.1896,
"mean_token_accuracy": 0.6120827198028564,
"num_tokens": 124343973.0,
"step": 1975
},
{
"epoch": 6.471358428805237,
"grad_norm": 21.20761731125876,
"learning_rate": 4.62614313923967e-09,
"loss": 2.1334,
"mean_token_accuracy": 0.619620543718338,
"num_tokens": 124659135.0,
"step": 1980
},
{
"epoch": 6.48772504091653,
"grad_norm": 21.694275642010822,
"learning_rate": 4.623137215419194e-09,
"loss": 2.1736,
"mean_token_accuracy": 0.6120718121528625,
"num_tokens": 124975035.0,
"step": 1985
},
{
"epoch": 6.504091653027823,
"grad_norm": 20.732129794066694,
"learning_rate": 4.620120240391064e-09,
"loss": 2.135,
"mean_token_accuracy": 0.6189526736736297,
"num_tokens": 125290692.0,
"step": 1990
},
{
"epoch": 6.5204582651391165,
"grad_norm": 21.938805671841106,
"learning_rate": 4.6170922298590336e-09,
"loss": 2.1461,
"mean_token_accuracy": 0.6168884932994843,
"num_tokens": 125605387.0,
"step": 1995
},
{
"epoch": 6.536824877250409,
"grad_norm": 21.852887235656542,
"learning_rate": 4.614053199584291e-09,
"loss": 2.1565,
"mean_token_accuracy": 0.615131002664566,
"num_tokens": 125923044.0,
"step": 2000
},
{
"epoch": 6.553191489361702,
"grad_norm": 21.61436281642574,
"learning_rate": 4.611003165385389e-09,
"loss": 2.1705,
"mean_token_accuracy": 0.614033317565918,
"num_tokens": 126239452.0,
"step": 2005
},
{
"epoch": 6.569558101472995,
"grad_norm": 21.50966636839898,
"learning_rate": 4.607942143138157e-09,
"loss": 2.1953,
"mean_token_accuracy": 0.6101115584373474,
"num_tokens": 126556662.0,
"step": 2010
},
{
"epoch": 6.585924713584288,
"grad_norm": 20.995613724620096,
"learning_rate": 4.6048701487756136e-09,
"loss": 2.1428,
"mean_token_accuracy": 0.6168196797370911,
"num_tokens": 126871126.0,
"step": 2015
},
{
"epoch": 6.602291325695581,
"grad_norm": 22.384610556987017,
"learning_rate": 4.601787198287896e-09,
"loss": 2.1824,
"mean_token_accuracy": 0.6081099033355712,
"num_tokens": 127187229.0,
"step": 2020
},
{
"epoch": 6.618657937806874,
"grad_norm": 21.879272078386037,
"learning_rate": 4.598693307722165e-09,
"loss": 2.1545,
"mean_token_accuracy": 0.6163994073867798,
"num_tokens": 127503607.0,
"step": 2025
},
{
"epoch": 6.635024549918167,
"grad_norm": 21.28896281508814,
"learning_rate": 4.595588493182525e-09,
"loss": 2.1428,
"mean_token_accuracy": 0.6177895963191986,
"num_tokens": 127820570.0,
"step": 2030
},
{
"epoch": 6.65139116202946,
"grad_norm": 20.16858032321011,
"learning_rate": 4.592472770829945e-09,
"loss": 2.0875,
"mean_token_accuracy": 0.6251370906829834,
"num_tokens": 128136630.0,
"step": 2035
},
{
"epoch": 6.667757774140753,
"grad_norm": 20.37161020175876,
"learning_rate": 4.589346156882167e-09,
"loss": 2.1434,
"mean_token_accuracy": 0.6160525560379029,
"num_tokens": 128451184.0,
"step": 2040
},
{
"epoch": 6.684124386252046,
"grad_norm": 20.995068375593643,
"learning_rate": 4.5862086676136275e-09,
"loss": 2.1157,
"mean_token_accuracy": 0.6187912404537201,
"num_tokens": 128767986.0,
"step": 2045
},
{
"epoch": 6.700490998363339,
"grad_norm": 20.03837123793174,
"learning_rate": 4.5830603193553685e-09,
"loss": 2.1128,
"mean_token_accuracy": 0.6212022960186004,
"num_tokens": 129083628.0,
"step": 2050
},
{
"epoch": 6.716857610474632,
"grad_norm": 21.732839017733074,
"learning_rate": 4.579901128494958e-09,
"loss": 2.138,
"mean_token_accuracy": 0.6142685532569885,
"num_tokens": 129398358.0,
"step": 2055
},
{
"epoch": 6.733224222585925,
"grad_norm": 21.3996768207252,
"learning_rate": 4.576731111476395e-09,
"loss": 2.1406,
"mean_token_accuracy": 0.614562475681305,
"num_tokens": 129713765.0,
"step": 2060
},
{
"epoch": 6.7495908346972175,
"grad_norm": 20.86887078387194,
"learning_rate": 4.5735502848000375e-09,
"loss": 2.1023,
"mean_token_accuracy": 0.6210401713848114,
"num_tokens": 130029521.0,
"step": 2065
},
{
"epoch": 6.76595744680851,
"grad_norm": 21.35259791338893,
"learning_rate": 4.570358665022504e-09,
"loss": 2.1534,
"mean_token_accuracy": 0.611782455444336,
"num_tokens": 130343583.0,
"step": 2070
},
{
"epoch": 6.782324058919803,
"grad_norm": 20.350039593083146,
"learning_rate": 4.567156268756594e-09,
"loss": 2.117,
"mean_token_accuracy": 0.6170144140720367,
"num_tokens": 130660087.0,
"step": 2075
},
{
"epoch": 6.798690671031096,
"grad_norm": 21.746541271091658,
"learning_rate": 4.5639431126712e-09,
"loss": 2.1353,
"mean_token_accuracy": 0.6146936535835266,
"num_tokens": 130976790.0,
"step": 2080
},
{
"epoch": 6.81505728314239,
"grad_norm": 21.651718945925616,
"learning_rate": 4.56071921349122e-09,
"loss": 2.1343,
"mean_token_accuracy": 0.6141116917133331,
"num_tokens": 131294454.0,
"step": 2085
},
{
"epoch": 6.831423895253683,
"grad_norm": 21.237894277524884,
"learning_rate": 4.557484587997473e-09,
"loss": 2.1273,
"mean_token_accuracy": 0.6154871582984924,
"num_tokens": 131610087.0,
"step": 2090
},
{
"epoch": 6.847790507364976,
"grad_norm": 21.281926833038302,
"learning_rate": 4.55423925302661e-09,
"loss": 2.1393,
"mean_token_accuracy": 0.6141292870044708,
"num_tokens": 131926207.0,
"step": 2095
},
{
"epoch": 6.8641571194762685,
"grad_norm": 20.402026254120138,
"learning_rate": 4.550983225471023e-09,
"loss": 2.1273,
"mean_token_accuracy": 0.61502326130867,
"num_tokens": 132239715.0,
"step": 2100
},
{
"epoch": 6.880523731587561,
"grad_norm": 21.317416165671755,
"learning_rate": 4.547716522278764e-09,
"loss": 2.1389,
"mean_token_accuracy": 0.6126947045326233,
"num_tokens": 132556394.0,
"step": 2105
},
{
"epoch": 6.896890343698854,
"grad_norm": 20.709431005668037,
"learning_rate": 4.5444391604534505e-09,
"loss": 2.1358,
"mean_token_accuracy": 0.6119840562343597,
"num_tokens": 132870997.0,
"step": 2110
},
{
"epoch": 6.913256955810147,
"grad_norm": 20.72493284331367,
"learning_rate": 4.5411511570541815e-09,
"loss": 2.1086,
"mean_token_accuracy": 0.6191326141357422,
"num_tokens": 133187277.0,
"step": 2115
},
{
"epoch": 6.92962356792144,
"grad_norm": 20.401506768438352,
"learning_rate": 4.5378525291954456e-09,
"loss": 2.1036,
"mean_token_accuracy": 0.6179743111133575,
"num_tokens": 133504379.0,
"step": 2120
},
{
"epoch": 6.945990180032734,
"grad_norm": 21.151405028030936,
"learning_rate": 4.534543294047033e-09,
"loss": 2.1407,
"mean_token_accuracy": 0.6133243441581726,
"num_tokens": 133819166.0,
"step": 2125
},
{
"epoch": 6.962356792144027,
"grad_norm": 20.77846794569156,
"learning_rate": 4.5312234688339474e-09,
"loss": 2.1495,
"mean_token_accuracy": 0.6119216084480286,
"num_tokens": 134135526.0,
"step": 2130
},
{
"epoch": 6.9787234042553195,
"grad_norm": 20.024591823948676,
"learning_rate": 4.527893070836314e-09,
"loss": 2.0845,
"mean_token_accuracy": 0.6207044720649719,
"num_tokens": 134451520.0,
"step": 2135
},
{
"epoch": 6.995090016366612,
"grad_norm": 20.40853541843752,
"learning_rate": 4.52455211738929e-09,
"loss": 2.083,
"mean_token_accuracy": 0.6208429157733917,
"num_tokens": 134768176.0,
"step": 2140
},
{
"epoch": 7.009819967266775,
"grad_norm": 20.932141294235674,
"learning_rate": 4.521200625882978e-09,
"loss": 2.0872,
"mean_token_accuracy": 0.6209465795093112,
"num_tokens": 135029611.0,
"step": 2145
},
{
"epoch": 7.026186579378069,
"grad_norm": 19.94403526331167,
"learning_rate": 4.517838613762331e-09,
"loss": 2.071,
"mean_token_accuracy": 0.6247387766838074,
"num_tokens": 135344576.0,
"step": 2150
},
{
"epoch": 7.042553191489362,
"grad_norm": 20.2959867414438,
"learning_rate": 4.514466098527062e-09,
"loss": 2.1312,
"mean_token_accuracy": 0.6129837095737457,
"num_tokens": 135660597.0,
"step": 2155
},
{
"epoch": 7.058919803600655,
"grad_norm": 20.13114916587158,
"learning_rate": 4.5110830977315555e-09,
"loss": 2.1074,
"mean_token_accuracy": 0.6167091131210327,
"num_tokens": 135976424.0,
"step": 2160
},
{
"epoch": 7.075286415711948,
"grad_norm": 20.66542558789711,
"learning_rate": 4.5076896289847735e-09,
"loss": 2.1191,
"mean_token_accuracy": 0.6145562171936035,
"num_tokens": 136292501.0,
"step": 2165
},
{
"epoch": 7.091653027823241,
"grad_norm": 21.114797520019756,
"learning_rate": 4.504285709950167e-09,
"loss": 2.1162,
"mean_token_accuracy": 0.6152336478233338,
"num_tokens": 136606279.0,
"step": 2170
},
{
"epoch": 7.1080196399345335,
"grad_norm": 20.14558765288978,
"learning_rate": 4.50087135834558e-09,
"loss": 2.0733,
"mean_token_accuracy": 0.6247061014175415,
"num_tokens": 136922670.0,
"step": 2175
},
{
"epoch": 7.124386252045826,
"grad_norm": 20.9247828857839,
"learning_rate": 4.497446591943162e-09,
"loss": 2.0978,
"mean_token_accuracy": 0.6188047885894775,
"num_tokens": 137236588.0,
"step": 2180
},
{
"epoch": 7.140752864157119,
"grad_norm": 21.082001804171576,
"learning_rate": 4.494011428569269e-09,
"loss": 2.1465,
"mean_token_accuracy": 0.609574556350708,
"num_tokens": 137550798.0,
"step": 2185
},
{
"epoch": 7.157119476268412,
"grad_norm": 20.902296160839647,
"learning_rate": 4.490565886104378e-09,
"loss": 2.1155,
"mean_token_accuracy": 0.6145494759082795,
"num_tokens": 137866384.0,
"step": 2190
},
{
"epoch": 7.173486088379706,
"grad_norm": 20.191528730933655,
"learning_rate": 4.487109982482991e-09,
"loss": 2.0624,
"mean_token_accuracy": 0.6228398621082306,
"num_tokens": 138183138.0,
"step": 2195
},
{
"epoch": 7.189852700490999,
"grad_norm": 19.70846607577594,
"learning_rate": 4.483643735693537e-09,
"loss": 2.0661,
"mean_token_accuracy": 0.6221046924591065,
"num_tokens": 138497274.0,
"step": 2200
},
{
"epoch": 7.2062193126022915,
"grad_norm": 21.054465864675535,
"learning_rate": 4.480167163778287e-09,
"loss": 2.1081,
"mean_token_accuracy": 0.614901089668274,
"num_tokens": 138812244.0,
"step": 2205
},
{
"epoch": 7.222585924713584,
"grad_norm": 20.177884489958767,
"learning_rate": 4.476680284833252e-09,
"loss": 2.059,
"mean_token_accuracy": 0.6242061018943786,
"num_tokens": 139127574.0,
"step": 2210
},
{
"epoch": 7.238952536824877,
"grad_norm": 20.50976864112745,
"learning_rate": 4.473183117008096e-09,
"loss": 2.104,
"mean_token_accuracy": 0.6162344992160798,
"num_tokens": 139444561.0,
"step": 2215
},
{
"epoch": 7.25531914893617,
"grad_norm": 19.844812284201744,
"learning_rate": 4.469675678506035e-09,
"loss": 2.0767,
"mean_token_accuracy": 0.6197092831134796,
"num_tokens": 139759101.0,
"step": 2220
},
{
"epoch": 7.271685761047463,
"grad_norm": 20.125104087682654,
"learning_rate": 4.466157987583747e-09,
"loss": 2.0822,
"mean_token_accuracy": 0.6182936906814576,
"num_tokens": 140073771.0,
"step": 2225
},
{
"epoch": 7.288052373158756,
"grad_norm": 20.428231845938967,
"learning_rate": 4.462630062551274e-09,
"loss": 2.0704,
"mean_token_accuracy": 0.621297436952591,
"num_tokens": 140389001.0,
"step": 2230
},
{
"epoch": 7.304418985270049,
"grad_norm": 19.954165544812422,
"learning_rate": 4.459091921771929e-09,
"loss": 2.0792,
"mean_token_accuracy": 0.6181824147701264,
"num_tokens": 140704788.0,
"step": 2235
},
{
"epoch": 7.320785597381342,
"grad_norm": 20.204841527212057,
"learning_rate": 4.455543583662199e-09,
"loss": 2.0928,
"mean_token_accuracy": 0.6172252476215363,
"num_tokens": 141020036.0,
"step": 2240
},
{
"epoch": 7.337152209492635,
"grad_norm": 20.162303951401036,
"learning_rate": 4.451985066691649e-09,
"loss": 2.0907,
"mean_token_accuracy": 0.6169468641281128,
"num_tokens": 141335334.0,
"step": 2245
},
{
"epoch": 7.353518821603928,
"grad_norm": 19.880239575791986,
"learning_rate": 4.448416389382826e-09,
"loss": 2.0814,
"mean_token_accuracy": 0.6201544046401978,
"num_tokens": 141651325.0,
"step": 2250
},
{
"epoch": 7.369885433715221,
"grad_norm": 19.837865962881644,
"learning_rate": 4.444837570311163e-09,
"loss": 2.077,
"mean_token_accuracy": 0.6198208451271057,
"num_tokens": 141967976.0,
"step": 2255
},
{
"epoch": 7.386252045826514,
"grad_norm": 19.56879716151567,
"learning_rate": 4.441248628104884e-09,
"loss": 2.0843,
"mean_token_accuracy": 0.617261779308319,
"num_tokens": 142284034.0,
"step": 2260
},
{
"epoch": 7.402618657937807,
"grad_norm": 20.20902374782504,
"learning_rate": 4.4376495814449034e-09,
"loss": 2.062,
"mean_token_accuracy": 0.6235301315784454,
"num_tokens": 142600358.0,
"step": 2265
},
{
"epoch": 7.4189852700491,
"grad_norm": 20.30083756077551,
"learning_rate": 4.4340404490647316e-09,
"loss": 2.1089,
"mean_token_accuracy": 0.6131757915019989,
"num_tokens": 142915407.0,
"step": 2270
},
{
"epoch": 7.435351882160393,
"grad_norm": 20.30697450610285,
"learning_rate": 4.4304212497503735e-09,
"loss": 2.0812,
"mean_token_accuracy": 0.6191363453865051,
"num_tokens": 143231393.0,
"step": 2275
},
{
"epoch": 7.451718494271685,
"grad_norm": 20.52437090532947,
"learning_rate": 4.42679200234024e-09,
"loss": 2.0942,
"mean_token_accuracy": 0.6162575602531433,
"num_tokens": 143547968.0,
"step": 2280
},
{
"epoch": 7.468085106382979,
"grad_norm": 19.98790638119669,
"learning_rate": 4.423152725725037e-09,
"loss": 2.0759,
"mean_token_accuracy": 0.6203132092952728,
"num_tokens": 143864798.0,
"step": 2285
},
{
"epoch": 7.484451718494272,
"grad_norm": 20.723567179964625,
"learning_rate": 4.419503438847678e-09,
"loss": 2.0803,
"mean_token_accuracy": 0.6180489838123322,
"num_tokens": 144180405.0,
"step": 2290
},
{
"epoch": 7.500818330605565,
"grad_norm": 20.363175730134444,
"learning_rate": 4.415844160703178e-09,
"loss": 2.1056,
"mean_token_accuracy": 0.6134061455726624,
"num_tokens": 144496031.0,
"step": 2295
},
{
"epoch": 7.517184942716858,
"grad_norm": 19.504255018620267,
"learning_rate": 4.412174910338562e-09,
"loss": 2.0759,
"mean_token_accuracy": 0.6175196766853333,
"num_tokens": 144813411.0,
"step": 2300
},
{
"epoch": 7.533551554828151,
"grad_norm": 19.571313987498225,
"learning_rate": 4.408495706852758e-09,
"loss": 2.065,
"mean_token_accuracy": 0.6217400550842285,
"num_tokens": 145129640.0,
"step": 2305
},
{
"epoch": 7.5499181669394435,
"grad_norm": 19.88010468801649,
"learning_rate": 4.404806569396502e-09,
"loss": 2.0739,
"mean_token_accuracy": 0.6190234243869781,
"num_tokens": 145445200.0,
"step": 2310
},
{
"epoch": 7.566284779050736,
"grad_norm": 20.30094612692668,
"learning_rate": 4.40110751717224e-09,
"loss": 2.0807,
"mean_token_accuracy": 0.615682327747345,
"num_tokens": 145761534.0,
"step": 2315
},
{
"epoch": 7.582651391162029,
"grad_norm": 19.59261400508155,
"learning_rate": 4.397398569434024e-09,
"loss": 2.0588,
"mean_token_accuracy": 0.6192507922649384,
"num_tokens": 146078820.0,
"step": 2320
},
{
"epoch": 7.599018003273322,
"grad_norm": 19.185470644015048,
"learning_rate": 4.393679745487411e-09,
"loss": 2.0413,
"mean_token_accuracy": 0.6238569915294647,
"num_tokens": 146394440.0,
"step": 2325
},
{
"epoch": 7.615384615384615,
"grad_norm": 19.495292421388935,
"learning_rate": 4.3899510646893695e-09,
"loss": 2.0612,
"mean_token_accuracy": 0.6186753571033478,
"num_tokens": 146709930.0,
"step": 2330
},
{
"epoch": 7.631751227495909,
"grad_norm": 19.79956682637677,
"learning_rate": 4.386212546448172e-09,
"loss": 2.0987,
"mean_token_accuracy": 0.6140192031860352,
"num_tokens": 147023216.0,
"step": 2335
},
{
"epoch": 7.648117839607202,
"grad_norm": 19.411576587443466,
"learning_rate": 4.3824642102232955e-09,
"loss": 2.0512,
"mean_token_accuracy": 0.6212476313114166,
"num_tokens": 147338846.0,
"step": 2340
},
{
"epoch": 7.6644844517184945,
"grad_norm": 19.61725592268653,
"learning_rate": 4.378706075525322e-09,
"loss": 2.0814,
"mean_token_accuracy": 0.614804869890213,
"num_tokens": 147655009.0,
"step": 2345
},
{
"epoch": 7.680851063829787,
"grad_norm": 19.2538668350145,
"learning_rate": 4.374938161915835e-09,
"loss": 2.0888,
"mean_token_accuracy": 0.6152995645999908,
"num_tokens": 147971418.0,
"step": 2350
},
{
"epoch": 7.69721767594108,
"grad_norm": 19.309480308973157,
"learning_rate": 4.371160489007319e-09,
"loss": 2.0576,
"mean_token_accuracy": 0.6194174110889434,
"num_tokens": 148286907.0,
"step": 2355
},
{
"epoch": 7.713584288052373,
"grad_norm": 20.163070273135265,
"learning_rate": 4.367373076463057e-09,
"loss": 2.0506,
"mean_token_accuracy": 0.6217274129390716,
"num_tokens": 148602683.0,
"step": 2360
},
{
"epoch": 7.729950900163666,
"grad_norm": 19.981014947076595,
"learning_rate": 4.3635759439970294e-09,
"loss": 2.0834,
"mean_token_accuracy": 0.6150952398777008,
"num_tokens": 148917173.0,
"step": 2365
},
{
"epoch": 7.746317512274959,
"grad_norm": 19.968450563246567,
"learning_rate": 4.359769111373807e-09,
"loss": 2.0796,
"mean_token_accuracy": 0.6174699187278747,
"num_tokens": 149233197.0,
"step": 2370
},
{
"epoch": 7.762684124386252,
"grad_norm": 19.960468429955657,
"learning_rate": 4.355952598408453e-09,
"loss": 2.0785,
"mean_token_accuracy": 0.6167018711566925,
"num_tokens": 149548312.0,
"step": 2375
},
{
"epoch": 7.779050736497545,
"grad_norm": 20.6340513147405,
"learning_rate": 4.35212642496642e-09,
"loss": 2.0784,
"mean_token_accuracy": 0.6170047044754028,
"num_tokens": 149863164.0,
"step": 2380
},
{
"epoch": 7.795417348608838,
"grad_norm": 20.354735657801292,
"learning_rate": 4.348290610963439e-09,
"loss": 2.1153,
"mean_token_accuracy": 0.6087307870388031,
"num_tokens": 150177524.0,
"step": 2385
},
{
"epoch": 7.811783960720131,
"grad_norm": 20.087151707090026,
"learning_rate": 4.344445176365428e-09,
"loss": 2.09,
"mean_token_accuracy": 0.6149612128734588,
"num_tokens": 150493173.0,
"step": 2390
},
{
"epoch": 7.828150572831424,
"grad_norm": 20.13559372519946,
"learning_rate": 4.3405901411883765e-09,
"loss": 2.0901,
"mean_token_accuracy": 0.6129091382026672,
"num_tokens": 150810335.0,
"step": 2395
},
{
"epoch": 7.844517184942717,
"grad_norm": 19.013326282949027,
"learning_rate": 4.336725525498249e-09,
"loss": 2.0497,
"mean_token_accuracy": 0.6194914102554321,
"num_tokens": 151126438.0,
"step": 2400
},
{
"epoch": 7.86088379705401,
"grad_norm": 19.222782991435427,
"learning_rate": 4.3328513494108774e-09,
"loss": 2.086,
"mean_token_accuracy": 0.6155364334583282,
"num_tokens": 151441771.0,
"step": 2405
},
{
"epoch": 7.877250409165303,
"grad_norm": 19.228039566412573,
"learning_rate": 4.328967633091856e-09,
"loss": 2.0898,
"mean_token_accuracy": 0.6135625004768371,
"num_tokens": 151758082.0,
"step": 2410
},
{
"epoch": 7.8936170212765955,
"grad_norm": 20.01595055988023,
"learning_rate": 4.325074396756437e-09,
"loss": 2.0227,
"mean_token_accuracy": 0.6251669287681579,
"num_tokens": 152073684.0,
"step": 2415
},
{
"epoch": 7.909983633387888,
"grad_norm": 19.752325541846453,
"learning_rate": 4.321171660669426e-09,
"loss": 2.0636,
"mean_token_accuracy": 0.6182598769664764,
"num_tokens": 152390090.0,
"step": 2420
},
{
"epoch": 7.926350245499181,
"grad_norm": 19.321038430559828,
"learning_rate": 4.3172594451450775e-09,
"loss": 2.0471,
"mean_token_accuracy": 0.6199876964092255,
"num_tokens": 152704416.0,
"step": 2425
},
{
"epoch": 7.942716857610475,
"grad_norm": 19.966443244824358,
"learning_rate": 4.313337770546986e-09,
"loss": 2.0788,
"mean_token_accuracy": 0.6131214797496796,
"num_tokens": 153019785.0,
"step": 2430
},
{
"epoch": 7.959083469721768,
"grad_norm": 19.24138520226915,
"learning_rate": 4.309406657287981e-09,
"loss": 2.0363,
"mean_token_accuracy": 0.620780485868454,
"num_tokens": 153335233.0,
"step": 2435
},
{
"epoch": 7.975450081833061,
"grad_norm": 19.360751757947458,
"learning_rate": 4.305466125830023e-09,
"loss": 2.0598,
"mean_token_accuracy": 0.6176416158676148,
"num_tokens": 153651679.0,
"step": 2440
},
{
"epoch": 7.991816693944354,
"grad_norm": 19.437309161187518,
"learning_rate": 4.301516196684097e-09,
"loss": 2.0537,
"mean_token_accuracy": 0.618579763174057,
"num_tokens": 153967435.0,
"step": 2445
},
{
"epoch": 8.006546644844518,
"grad_norm": 19.317444336785467,
"learning_rate": 4.297556890410099e-09,
"loss": 2.0168,
"mean_token_accuracy": 0.6221174928877089,
"num_tokens": 154228527.0,
"step": 2450
},
{
"epoch": 8.02291325695581,
"grad_norm": 18.955046602063693,
"learning_rate": 4.29358822761674e-09,
"loss": 2.0435,
"mean_token_accuracy": 0.6193597435951232,
"num_tokens": 154544523.0,
"step": 2455
},
{
"epoch": 8.039279869067103,
"grad_norm": 19.754475500032857,
"learning_rate": 4.2896102289614284e-09,
"loss": 2.0634,
"mean_token_accuracy": 0.6151875674724578,
"num_tokens": 154858664.0,
"step": 2460
},
{
"epoch": 8.055646481178396,
"grad_norm": 19.374158690398726,
"learning_rate": 4.28562291515017e-09,
"loss": 2.0172,
"mean_token_accuracy": 0.6231228113174438,
"num_tokens": 155174006.0,
"step": 2465
},
{
"epoch": 8.072013093289689,
"grad_norm": 18.2855602378677,
"learning_rate": 4.281626306937456e-09,
"loss": 2.0179,
"mean_token_accuracy": 0.6234484672546386,
"num_tokens": 155489837.0,
"step": 2470
},
{
"epoch": 8.088379705400982,
"grad_norm": 19.157790006915853,
"learning_rate": 4.277620425126156e-09,
"loss": 2.0431,
"mean_token_accuracy": 0.6187334835529328,
"num_tokens": 155805554.0,
"step": 2475
},
{
"epoch": 8.104746317512275,
"grad_norm": 19.10121563513015,
"learning_rate": 4.273605290567412e-09,
"loss": 2.0224,
"mean_token_accuracy": 0.6224100470542908,
"num_tokens": 156119528.0,
"step": 2480
},
{
"epoch": 8.121112929623568,
"grad_norm": 20.052105369610455,
"learning_rate": 4.269580924160523e-09,
"loss": 2.074,
"mean_token_accuracy": 0.6132338345050812,
"num_tokens": 156435330.0,
"step": 2485
},
{
"epoch": 8.13747954173486,
"grad_norm": 18.93313246732794,
"learning_rate": 4.265547346852845e-09,
"loss": 2.0608,
"mean_token_accuracy": 0.6157853841781616,
"num_tokens": 156750303.0,
"step": 2490
},
{
"epoch": 8.153846153846153,
"grad_norm": 19.151946930372876,
"learning_rate": 4.261504579639678e-09,
"loss": 2.0286,
"mean_token_accuracy": 0.6220345199108124,
"num_tokens": 157065896.0,
"step": 2495
},
{
"epoch": 8.170212765957446,
"grad_norm": 19.20698734133269,
"learning_rate": 4.257452643564154e-09,
"loss": 2.0148,
"mean_token_accuracy": 0.6229146420955658,
"num_tokens": 157380698.0,
"step": 2500
},
{
"epoch": 8.186579378068739,
"grad_norm": 18.895147128140277,
"learning_rate": 4.253391559717134e-09,
"loss": 2.0162,
"mean_token_accuracy": 0.623430597782135,
"num_tokens": 157696848.0,
"step": 2505
},
{
"epoch": 8.202945990180032,
"grad_norm": 18.59917375497589,
"learning_rate": 4.249321349237088e-09,
"loss": 1.9741,
"mean_token_accuracy": 0.6283079147338867,
"num_tokens": 158013554.0,
"step": 2510
},
{
"epoch": 8.219312602291327,
"grad_norm": 18.81115825320335,
"learning_rate": 4.24524203331e-09,
"loss": 2.0411,
"mean_token_accuracy": 0.6198984026908875,
"num_tokens": 158329281.0,
"step": 2515
},
{
"epoch": 8.23567921440262,
"grad_norm": 19.40948052890125,
"learning_rate": 4.241153633169241e-09,
"loss": 2.0434,
"mean_token_accuracy": 0.6172616899013519,
"num_tokens": 158645204.0,
"step": 2520
},
{
"epoch": 8.252045826513912,
"grad_norm": 19.128928676302635,
"learning_rate": 4.237056170095473e-09,
"loss": 2.0208,
"mean_token_accuracy": 0.6226502656936646,
"num_tokens": 158960971.0,
"step": 2525
},
{
"epoch": 8.268412438625205,
"grad_norm": 18.487170030947755,
"learning_rate": 4.232949665416525e-09,
"loss": 2.0205,
"mean_token_accuracy": 0.6218908786773681,
"num_tokens": 159275712.0,
"step": 2530
},
{
"epoch": 8.284779050736498,
"grad_norm": 19.130608652611837,
"learning_rate": 4.2288341405072946e-09,
"loss": 2.0634,
"mean_token_accuracy": 0.6136705696582794,
"num_tokens": 159591288.0,
"step": 2535
},
{
"epoch": 8.30114566284779,
"grad_norm": 19.69139917071449,
"learning_rate": 4.224709616789628e-09,
"loss": 2.0317,
"mean_token_accuracy": 0.6212322771549225,
"num_tokens": 159907007.0,
"step": 2540
},
{
"epoch": 8.317512274959084,
"grad_norm": 18.98342744228345,
"learning_rate": 4.220576115732213e-09,
"loss": 2.0423,
"mean_token_accuracy": 0.6142703175544739,
"num_tokens": 160220532.0,
"step": 2545
},
{
"epoch": 8.333878887070377,
"grad_norm": 18.36077241118662,
"learning_rate": 4.216433658850464e-09,
"loss": 2.0142,
"mean_token_accuracy": 0.6223547279834747,
"num_tokens": 160536285.0,
"step": 2550
},
{
"epoch": 8.35024549918167,
"grad_norm": 18.94827333595501,
"learning_rate": 4.212282267706413e-09,
"loss": 2.0065,
"mean_token_accuracy": 0.6223433613777161,
"num_tokens": 160850576.0,
"step": 2555
},
{
"epoch": 8.366612111292962,
"grad_norm": 18.298738766625497,
"learning_rate": 4.208121963908594e-09,
"loss": 2.006,
"mean_token_accuracy": 0.623051005601883,
"num_tokens": 161167036.0,
"step": 2560
},
{
"epoch": 8.382978723404255,
"grad_norm": 18.265369067443615,
"learning_rate": 4.203952769111935e-09,
"loss": 2.0195,
"mean_token_accuracy": 0.6212840378284454,
"num_tokens": 161483084.0,
"step": 2565
},
{
"epoch": 8.399345335515548,
"grad_norm": 19.688986359575967,
"learning_rate": 4.199774705017642e-09,
"loss": 2.0623,
"mean_token_accuracy": 0.6111198544502259,
"num_tokens": 161799520.0,
"step": 2570
},
{
"epoch": 8.415711947626841,
"grad_norm": 18.566585593107703,
"learning_rate": 4.195587793373085e-09,
"loss": 1.9965,
"mean_token_accuracy": 0.6247856795787812,
"num_tokens": 162115336.0,
"step": 2575
},
{
"epoch": 8.432078559738134,
"grad_norm": 18.55307853336787,
"learning_rate": 4.19139205597169e-09,
"loss": 2.0091,
"mean_token_accuracy": 0.621464341878891,
"num_tokens": 162431400.0,
"step": 2580
},
{
"epoch": 8.448445171849427,
"grad_norm": 18.55129652765704,
"learning_rate": 4.1871875146528196e-09,
"loss": 1.9971,
"mean_token_accuracy": 0.6246702432632446,
"num_tokens": 162748170.0,
"step": 2585
},
{
"epoch": 8.46481178396072,
"grad_norm": 18.318036230564275,
"learning_rate": 4.182974191301662e-09,
"loss": 1.9903,
"mean_token_accuracy": 0.6247745037078858,
"num_tokens": 163065470.0,
"step": 2590
},
{
"epoch": 8.481178396072012,
"grad_norm": 18.823692124621207,
"learning_rate": 4.178752107849119e-09,
"loss": 2.0466,
"mean_token_accuracy": 0.6138881623744965,
"num_tokens": 163381018.0,
"step": 2595
},
{
"epoch": 8.497545008183305,
"grad_norm": 19.035020757446386,
"learning_rate": 4.1745212862716885e-09,
"loss": 2.0251,
"mean_token_accuracy": 0.6184180915355683,
"num_tokens": 163694158.0,
"step": 2600
},
{
"epoch": 8.5139116202946,
"grad_norm": 18.33063656676717,
"learning_rate": 4.170281748591351e-09,
"loss": 1.9991,
"mean_token_accuracy": 0.6240422368049622,
"num_tokens": 164010496.0,
"step": 2605
},
{
"epoch": 8.530278232405893,
"grad_norm": 19.26904730748046,
"learning_rate": 4.166033516875457e-09,
"loss": 2.0101,
"mean_token_accuracy": 0.6214252293109894,
"num_tokens": 164325861.0,
"step": 2610
},
{
"epoch": 8.546644844517186,
"grad_norm": 19.671297978586228,
"learning_rate": 4.16177661323661e-09,
"loss": 2.0256,
"mean_token_accuracy": 0.6183163821697235,
"num_tokens": 164641507.0,
"step": 2615
},
{
"epoch": 8.563011456628479,
"grad_norm": 18.608109118678463,
"learning_rate": 4.157511059832551e-09,
"loss": 2.0225,
"mean_token_accuracy": 0.6190088152885437,
"num_tokens": 164957579.0,
"step": 2620
},
{
"epoch": 8.579378068739771,
"grad_norm": 19.66971661058079,
"learning_rate": 4.1532368788660435e-09,
"loss": 2.0026,
"mean_token_accuracy": 0.6196180164813996,
"num_tokens": 165274043.0,
"step": 2625
},
{
"epoch": 8.595744680851064,
"grad_norm": 18.648238572936588,
"learning_rate": 4.1489540925847624e-09,
"loss": 2.0198,
"mean_token_accuracy": 0.6171901106834412,
"num_tokens": 165588679.0,
"step": 2630
},
{
"epoch": 8.612111292962357,
"grad_norm": 18.0144822329861,
"learning_rate": 4.144662723281171e-09,
"loss": 1.978,
"mean_token_accuracy": 0.6250441014766693,
"num_tokens": 165903404.0,
"step": 2635
},
{
"epoch": 8.62847790507365,
"grad_norm": 17.95362559890803,
"learning_rate": 4.14036279329241e-09,
"loss": 1.99,
"mean_token_accuracy": 0.6221159398555756,
"num_tokens": 166219798.0,
"step": 2640
},
{
"epoch": 8.644844517184943,
"grad_norm": 18.310831388232366,
"learning_rate": 4.136054325000178e-09,
"loss": 1.9402,
"mean_token_accuracy": 0.6302378952503205,
"num_tokens": 166536213.0,
"step": 2645
},
{
"epoch": 8.661211129296236,
"grad_norm": 18.59936082392863,
"learning_rate": 4.131737340830618e-09,
"loss": 1.9867,
"mean_token_accuracy": 0.6227317154407501,
"num_tokens": 166852864.0,
"step": 2650
},
{
"epoch": 8.677577741407529,
"grad_norm": 17.936983386521433,
"learning_rate": 4.127411863254198e-09,
"loss": 1.9897,
"mean_token_accuracy": 0.6210679411888123,
"num_tokens": 167168596.0,
"step": 2655
},
{
"epoch": 8.693944353518821,
"grad_norm": 18.516548610843525,
"learning_rate": 4.123077914785597e-09,
"loss": 1.977,
"mean_token_accuracy": 0.6251841723918915,
"num_tokens": 167484584.0,
"step": 2660
},
{
"epoch": 8.710310965630114,
"grad_norm": 18.359500979183156,
"learning_rate": 4.1187355179835836e-09,
"loss": 1.984,
"mean_token_accuracy": 0.623200523853302,
"num_tokens": 167801147.0,
"step": 2665
},
{
"epoch": 8.726677577741407,
"grad_norm": 18.161306481323752,
"learning_rate": 4.114384695450906e-09,
"loss": 2.0179,
"mean_token_accuracy": 0.618176156282425,
"num_tokens": 168116240.0,
"step": 2670
},
{
"epoch": 8.7430441898527,
"grad_norm": 18.927299349251303,
"learning_rate": 4.110025469834162e-09,
"loss": 1.9805,
"mean_token_accuracy": 0.6226609289646149,
"num_tokens": 168431117.0,
"step": 2675
},
{
"epoch": 8.759410801963993,
"grad_norm": 19.012736318695463,
"learning_rate": 4.105657863823697e-09,
"loss": 1.9767,
"mean_token_accuracy": 0.6218480348587037,
"num_tokens": 168746772.0,
"step": 2680
},
{
"epoch": 8.775777414075286,
"grad_norm": 18.295532227167453,
"learning_rate": 4.101281900153469e-09,
"loss": 1.9511,
"mean_token_accuracy": 0.627713143825531,
"num_tokens": 169063269.0,
"step": 2685
},
{
"epoch": 8.792144026186579,
"grad_norm": 18.61795636373396,
"learning_rate": 4.096897601600944e-09,
"loss": 1.9883,
"mean_token_accuracy": 0.6205691993236542,
"num_tokens": 169379160.0,
"step": 2690
},
{
"epoch": 8.808510638297872,
"grad_norm": 18.673489459493226,
"learning_rate": 4.092504990986972e-09,
"loss": 1.9751,
"mean_token_accuracy": 0.6233503878116607,
"num_tokens": 169695599.0,
"step": 2695
},
{
"epoch": 8.824877250409166,
"grad_norm": 18.331509805767606,
"learning_rate": 4.088104091175667e-09,
"loss": 1.9806,
"mean_token_accuracy": 0.6208960115909576,
"num_tokens": 170010952.0,
"step": 2700
},
{
"epoch": 8.841243862520459,
"grad_norm": 18.329443723081933,
"learning_rate": 4.08369492507429e-09,
"loss": 1.9784,
"mean_token_accuracy": 0.6229167640209198,
"num_tokens": 170326218.0,
"step": 2705
},
{
"epoch": 8.857610474631752,
"grad_norm": 18.045575742161883,
"learning_rate": 4.0792775156331276e-09,
"loss": 1.9635,
"mean_token_accuracy": 0.6248275518417359,
"num_tokens": 170641654.0,
"step": 2710
},
{
"epoch": 8.873977086743045,
"grad_norm": 18.464840629768773,
"learning_rate": 4.0748518858453756e-09,
"loss": 1.9731,
"mean_token_accuracy": 0.6229199707508087,
"num_tokens": 170957841.0,
"step": 2715
},
{
"epoch": 8.890343698854338,
"grad_norm": 18.609226014251735,
"learning_rate": 4.070418058747018e-09,
"loss": 1.9753,
"mean_token_accuracy": 0.6225042223930359,
"num_tokens": 171274797.0,
"step": 2720
},
{
"epoch": 8.90671031096563,
"grad_norm": 19.047635263213472,
"learning_rate": 4.065976057416707e-09,
"loss": 1.9723,
"mean_token_accuracy": 0.622650933265686,
"num_tokens": 171589606.0,
"step": 2725
},
{
"epoch": 8.923076923076923,
"grad_norm": 19.226875174529695,
"learning_rate": 4.061525904975642e-09,
"loss": 1.9748,
"mean_token_accuracy": 0.624282443523407,
"num_tokens": 171905454.0,
"step": 2730
},
{
"epoch": 8.939443535188216,
"grad_norm": 18.25508447639412,
"learning_rate": 4.057067624587448e-09,
"loss": 1.9444,
"mean_token_accuracy": 0.626853859424591,
"num_tokens": 172220595.0,
"step": 2735
},
{
"epoch": 8.955810147299509,
"grad_norm": 17.82977614301095,
"learning_rate": 4.052601239458061e-09,
"loss": 1.9464,
"mean_token_accuracy": 0.6270898818969727,
"num_tokens": 172537829.0,
"step": 2740
},
{
"epoch": 8.972176759410802,
"grad_norm": 18.155334994112835,
"learning_rate": 4.0481267728356e-09,
"loss": 1.9575,
"mean_token_accuracy": 0.6241896629333497,
"num_tokens": 172853529.0,
"step": 2745
},
{
"epoch": 8.988543371522095,
"grad_norm": 17.88360441782584,
"learning_rate": 4.043644248010252e-09,
"loss": 1.9545,
"mean_token_accuracy": 0.6252379953861237,
"num_tokens": 173167883.0,
"step": 2750
},
{
"epoch": 9.003273322422258,
"grad_norm": 18.04480210223081,
"learning_rate": 4.039153688314146e-09,
"loss": 1.9658,
"mean_token_accuracy": 0.619963526725769,
"num_tokens": 173428375.0,
"step": 2755
},
{
"epoch": 9.01963993453355,
"grad_norm": 18.72621974154579,
"learning_rate": 4.0346551171212344e-09,
"loss": 1.9569,
"mean_token_accuracy": 0.6238627254962921,
"num_tokens": 173745176.0,
"step": 2760
},
{
"epoch": 9.036006546644845,
"grad_norm": 17.527238066541386,
"learning_rate": 4.030148557847169e-09,
"loss": 1.9277,
"mean_token_accuracy": 0.6290217995643616,
"num_tokens": 174062085.0,
"step": 2765
},
{
"epoch": 9.052373158756138,
"grad_norm": 18.57546301322759,
"learning_rate": 4.025634033949184e-09,
"loss": 1.9642,
"mean_token_accuracy": 0.6221346378326416,
"num_tokens": 174377672.0,
"step": 2770
},
{
"epoch": 9.068739770867431,
"grad_norm": 18.032035732313652,
"learning_rate": 4.021111568925967e-09,
"loss": 1.9594,
"mean_token_accuracy": 0.6231431603431702,
"num_tokens": 174693197.0,
"step": 2775
},
{
"epoch": 9.085106382978724,
"grad_norm": 17.62498580152985,
"learning_rate": 4.016581186317542e-09,
"loss": 1.9394,
"mean_token_accuracy": 0.6259716868400573,
"num_tokens": 175008227.0,
"step": 2780
},
{
"epoch": 9.101472995090017,
"grad_norm": 17.966771537737063,
"learning_rate": 4.012042909705143e-09,
"loss": 1.9411,
"mean_token_accuracy": 0.62475745677948,
"num_tokens": 175323205.0,
"step": 2785
},
{
"epoch": 9.11783960720131,
"grad_norm": 17.47952411808635,
"learning_rate": 4.007496762711098e-09,
"loss": 1.9577,
"mean_token_accuracy": 0.621793794631958,
"num_tokens": 175638747.0,
"step": 2790
},
{
"epoch": 9.134206219312603,
"grad_norm": 18.843414914975632,
"learning_rate": 4.002942768998696e-09,
"loss": 1.9772,
"mean_token_accuracy": 0.6197193324565887,
"num_tokens": 175955103.0,
"step": 2795
},
{
"epoch": 9.150572831423895,
"grad_norm": 17.098878985984722,
"learning_rate": 3.998380952272073e-09,
"loss": 1.9096,
"mean_token_accuracy": 0.6299042642116547,
"num_tokens": 176270139.0,
"step": 2800
},
{
"epoch": 9.166939443535188,
"grad_norm": 17.182681882574787,
"learning_rate": 3.993811336276081e-09,
"loss": 1.9223,
"mean_token_accuracy": 0.6275332272052765,
"num_tokens": 176585317.0,
"step": 2805
},
{
"epoch": 9.183306055646481,
"grad_norm": 18.248471061146546,
"learning_rate": 3.989233944796173e-09,
"loss": 1.956,
"mean_token_accuracy": 0.6209408044815063,
"num_tokens": 176901360.0,
"step": 2810
},
{
"epoch": 9.199672667757774,
"grad_norm": 19.34514275913769,
"learning_rate": 3.984648801658272e-09,
"loss": 1.9525,
"mean_token_accuracy": 0.6216700792312622,
"num_tokens": 177216887.0,
"step": 2815
},
{
"epoch": 9.216039279869067,
"grad_norm": 18.33216037869195,
"learning_rate": 3.980055930728647e-09,
"loss": 1.9736,
"mean_token_accuracy": 0.6176897406578064,
"num_tokens": 177531745.0,
"step": 2820
},
{
"epoch": 9.23240589198036,
"grad_norm": 18.01373662008413,
"learning_rate": 3.975455355913796e-09,
"loss": 1.9179,
"mean_token_accuracy": 0.6283258557319641,
"num_tokens": 177847164.0,
"step": 2825
},
{
"epoch": 9.248772504091653,
"grad_norm": 18.644065662113995,
"learning_rate": 3.970847101160312e-09,
"loss": 1.9228,
"mean_token_accuracy": 0.6259247720241546,
"num_tokens": 178161807.0,
"step": 2830
},
{
"epoch": 9.265139116202946,
"grad_norm": 18.19303408352185,
"learning_rate": 3.966231190454767e-09,
"loss": 1.934,
"mean_token_accuracy": 0.6245276927947998,
"num_tokens": 178476348.0,
"step": 2835
},
{
"epoch": 9.281505728314238,
"grad_norm": 17.573488111961076,
"learning_rate": 3.961607647823583e-09,
"loss": 1.9011,
"mean_token_accuracy": 0.6307465970516205,
"num_tokens": 178792537.0,
"step": 2840
},
{
"epoch": 9.297872340425531,
"grad_norm": 18.230014313895904,
"learning_rate": 3.956976497332903e-09,
"loss": 1.9031,
"mean_token_accuracy": 0.6299893498420716,
"num_tokens": 179108205.0,
"step": 2845
},
{
"epoch": 9.314238952536824,
"grad_norm": 17.007404114028006,
"learning_rate": 3.952337763088473e-09,
"loss": 1.9081,
"mean_token_accuracy": 0.6293568968772888,
"num_tokens": 179422450.0,
"step": 2850
},
{
"epoch": 9.330605564648117,
"grad_norm": 18.076430353756503,
"learning_rate": 3.947691469235514e-09,
"loss": 1.9208,
"mean_token_accuracy": 0.6249583125114441,
"num_tokens": 179738583.0,
"step": 2855
},
{
"epoch": 9.346972176759412,
"grad_norm": 17.885738938494494,
"learning_rate": 3.9430376399585945e-09,
"loss": 1.9007,
"mean_token_accuracy": 0.6298418581485749,
"num_tokens": 180055204.0,
"step": 2860
},
{
"epoch": 9.363338788870704,
"grad_norm": 18.66586545840845,
"learning_rate": 3.938376299481506e-09,
"loss": 1.921,
"mean_token_accuracy": 0.6242359042167663,
"num_tokens": 180372131.0,
"step": 2865
},
{
"epoch": 9.379705400981997,
"grad_norm": 18.250054937583442,
"learning_rate": 3.93370747206714e-09,
"loss": 1.9274,
"mean_token_accuracy": 0.6258101642131806,
"num_tokens": 180687489.0,
"step": 2870
},
{
"epoch": 9.39607201309329,
"grad_norm": 17.662285336054975,
"learning_rate": 3.92903118201735e-09,
"loss": 1.9026,
"mean_token_accuracy": 0.6290419936180115,
"num_tokens": 181002519.0,
"step": 2875
},
{
"epoch": 9.412438625204583,
"grad_norm": 18.323648354162366,
"learning_rate": 3.924347453672843e-09,
"loss": 1.9207,
"mean_token_accuracy": 0.6275826394557953,
"num_tokens": 181317643.0,
"step": 2880
},
{
"epoch": 9.428805237315876,
"grad_norm": 18.03313243291237,
"learning_rate": 3.919656311413038e-09,
"loss": 1.8935,
"mean_token_accuracy": 0.6286854863166809,
"num_tokens": 181633606.0,
"step": 2885
},
{
"epoch": 9.445171849427169,
"grad_norm": 18.57778730511887,
"learning_rate": 3.914957779655946e-09,
"loss": 1.9335,
"mean_token_accuracy": 0.623274952173233,
"num_tokens": 181948723.0,
"step": 2890
},
{
"epoch": 9.461538461538462,
"grad_norm": 18.227838310934676,
"learning_rate": 3.91025188285804e-09,
"loss": 1.9188,
"mean_token_accuracy": 0.6270026624202728,
"num_tokens": 182267064.0,
"step": 2895
},
{
"epoch": 9.477905073649755,
"grad_norm": 18.34210442651482,
"learning_rate": 3.9055386455141314e-09,
"loss": 1.9202,
"mean_token_accuracy": 0.6245188891887665,
"num_tokens": 182581869.0,
"step": 2900
},
{
"epoch": 9.494271685761047,
"grad_norm": 17.687141855032003,
"learning_rate": 3.900818092157239e-09,
"loss": 1.8737,
"mean_token_accuracy": 0.6342037916183472,
"num_tokens": 182898292.0,
"step": 2905
},
{
"epoch": 9.51063829787234,
"grad_norm": 17.933760186583516,
"learning_rate": 3.89609024735846e-09,
"loss": 1.906,
"mean_token_accuracy": 0.6296324312686921,
"num_tokens": 183212511.0,
"step": 2910
},
{
"epoch": 9.527004909983633,
"grad_norm": 18.867128515643696,
"learning_rate": 3.891355135726849e-09,
"loss": 1.911,
"mean_token_accuracy": 0.6257581770420074,
"num_tokens": 183527923.0,
"step": 2915
},
{
"epoch": 9.543371522094926,
"grad_norm": 17.53962602317489,
"learning_rate": 3.886612781909281e-09,
"loss": 1.8653,
"mean_token_accuracy": 0.6354433178901673,
"num_tokens": 183843275.0,
"step": 2920
},
{
"epoch": 9.559738134206219,
"grad_norm": 18.501450293771324,
"learning_rate": 3.881863210590332e-09,
"loss": 1.9218,
"mean_token_accuracy": 0.6226227402687072,
"num_tokens": 184160244.0,
"step": 2925
},
{
"epoch": 9.576104746317512,
"grad_norm": 18.344935811703643,
"learning_rate": 3.877106446492141e-09,
"loss": 1.8926,
"mean_token_accuracy": 0.6301439940929413,
"num_tokens": 184476831.0,
"step": 2930
},
{
"epoch": 9.592471358428805,
"grad_norm": 17.456391655964303,
"learning_rate": 3.8723425143742904e-09,
"loss": 1.8564,
"mean_token_accuracy": 0.6359524667263031,
"num_tokens": 184793601.0,
"step": 2935
},
{
"epoch": 9.608837970540097,
"grad_norm": 17.362489760879928,
"learning_rate": 3.867571439033671e-09,
"loss": 1.8698,
"mean_token_accuracy": 0.6319121718406677,
"num_tokens": 185110931.0,
"step": 2940
},
{
"epoch": 9.62520458265139,
"grad_norm": 17.78448055098728,
"learning_rate": 3.862793245304358e-09,
"loss": 1.8868,
"mean_token_accuracy": 0.6301158666610718,
"num_tokens": 185427628.0,
"step": 2945
},
{
"epoch": 9.641571194762683,
"grad_norm": 17.87940817258405,
"learning_rate": 3.858007958057473e-09,
"loss": 1.8552,
"mean_token_accuracy": 0.6365744888782501,
"num_tokens": 185742371.0,
"step": 2950
},
{
"epoch": 9.657937806873978,
"grad_norm": 18.114260046662046,
"learning_rate": 3.853215602201065e-09,
"loss": 1.8855,
"mean_token_accuracy": 0.6294423818588257,
"num_tokens": 186058058.0,
"step": 2955
},
{
"epoch": 9.67430441898527,
"grad_norm": 17.069902906334928,
"learning_rate": 3.848416202679975e-09,
"loss": 1.8699,
"mean_token_accuracy": 0.6321876406669616,
"num_tokens": 186374896.0,
"step": 2960
},
{
"epoch": 9.690671031096564,
"grad_norm": 17.26492781592171,
"learning_rate": 3.843609784475708e-09,
"loss": 1.8976,
"mean_token_accuracy": 0.6271225333213806,
"num_tokens": 186691633.0,
"step": 2965
},
{
"epoch": 9.707037643207856,
"grad_norm": 18.95360140068516,
"learning_rate": 3.838796372606299e-09,
"loss": 1.8978,
"mean_token_accuracy": 0.6272413194179535,
"num_tokens": 187006529.0,
"step": 2970
},
{
"epoch": 9.72340425531915,
"grad_norm": 18.122093609301107,
"learning_rate": 3.833975992126189e-09,
"loss": 1.8674,
"mean_token_accuracy": 0.6307429075241089,
"num_tokens": 187321093.0,
"step": 2975
},
{
"epoch": 9.739770867430442,
"grad_norm": 18.056802859104373,
"learning_rate": 3.82914866812609e-09,
"loss": 1.8869,
"mean_token_accuracy": 0.627680492401123,
"num_tokens": 187634669.0,
"step": 2980
},
{
"epoch": 9.756137479541735,
"grad_norm": 17.361076661953092,
"learning_rate": 3.824314425732859e-09,
"loss": 1.8616,
"mean_token_accuracy": 0.6338369309902191,
"num_tokens": 187950365.0,
"step": 2985
},
{
"epoch": 9.772504091653028,
"grad_norm": 18.01075845513964,
"learning_rate": 3.819473290109359e-09,
"loss": 1.8595,
"mean_token_accuracy": 0.633523577451706,
"num_tokens": 188266181.0,
"step": 2990
},
{
"epoch": 9.78887070376432,
"grad_norm": 17.57825984359166,
"learning_rate": 3.814625286454335e-09,
"loss": 1.8644,
"mean_token_accuracy": 0.6331724166870117,
"num_tokens": 188582595.0,
"step": 2995
},
{
"epoch": 9.805237315875614,
"grad_norm": 18.02430311833049,
"learning_rate": 3.809770440002286e-09,
"loss": 1.8491,
"mean_token_accuracy": 0.6342060387134552,
"num_tokens": 188897418.0,
"step": 3000
},
{
"epoch": 9.821603927986907,
"grad_norm": 17.839992907497123,
"learning_rate": 3.80490877602332e-09,
"loss": 1.8468,
"mean_token_accuracy": 0.63299680352211,
"num_tokens": 189213354.0,
"step": 3005
},
{
"epoch": 9.8379705400982,
"grad_norm": 17.707546662790627,
"learning_rate": 3.800040319823038e-09,
"loss": 1.8587,
"mean_token_accuracy": 0.6331423938274383,
"num_tokens": 189528334.0,
"step": 3010
},
{
"epoch": 9.854337152209492,
"grad_norm": 17.12413666228983,
"learning_rate": 3.795165096742394e-09,
"loss": 1.858,
"mean_token_accuracy": 0.6314215183258056,
"num_tokens": 189842582.0,
"step": 3015
},
{
"epoch": 9.870703764320785,
"grad_norm": 17.26456725836023,
"learning_rate": 3.790283132157561e-09,
"loss": 1.8425,
"mean_token_accuracy": 0.6359744787216186,
"num_tokens": 190158840.0,
"step": 3020
},
{
"epoch": 9.887070376432078,
"grad_norm": 17.385229623404953,
"learning_rate": 3.785394451479806e-09,
"loss": 1.8484,
"mean_token_accuracy": 0.6332973361015319,
"num_tokens": 190473395.0,
"step": 3025
},
{
"epoch": 9.90343698854337,
"grad_norm": 17.777934882860322,
"learning_rate": 3.780499080155353e-09,
"loss": 1.884,
"mean_token_accuracy": 0.6261075854301452,
"num_tokens": 190789849.0,
"step": 3030
},
{
"epoch": 9.919803600654664,
"grad_norm": 16.90729981545112,
"learning_rate": 3.775597043665252e-09,
"loss": 1.8599,
"mean_token_accuracy": 0.635511976480484,
"num_tokens": 191105958.0,
"step": 3035
},
{
"epoch": 9.936170212765958,
"grad_norm": 17.900635313977407,
"learning_rate": 3.770688367525247e-09,
"loss": 1.8418,
"mean_token_accuracy": 0.6317579805850982,
"num_tokens": 191422525.0,
"step": 3040
},
{
"epoch": 9.952536824877251,
"grad_norm": 17.038381383182745,
"learning_rate": 3.765773077285639e-09,
"loss": 1.7945,
"mean_token_accuracy": 0.6432504296302796,
"num_tokens": 191736691.0,
"step": 3045
},
{
"epoch": 9.968903436988544,
"grad_norm": 17.954414671181723,
"learning_rate": 3.7608511985311575e-09,
"loss": 1.8422,
"mean_token_accuracy": 0.6309450984001159,
"num_tokens": 192052351.0,
"step": 3050
},
{
"epoch": 9.985270049099837,
"grad_norm": 17.827699266023465,
"learning_rate": 3.755922756880831e-09,
"loss": 1.828,
"mean_token_accuracy": 0.6335801184177399,
"num_tokens": 192367730.0,
"step": 3055
},
{
"epoch": 10.0,
"grad_norm": 17.792353956261156,
"learning_rate": 3.750987777987841e-09,
"loss": 1.8302,
"mean_token_accuracy": 0.6340090367529128,
"num_tokens": 192627053.0,
"step": 3060
},
{
"epoch": 10.016366612111293,
"grad_norm": 17.4349917278818,
"learning_rate": 3.7460462875394e-09,
"loss": 1.813,
"mean_token_accuracy": 0.6366794407367706,
"num_tokens": 192942851.0,
"step": 3065
},
{
"epoch": 10.032733224222586,
"grad_norm": 17.541872465331707,
"learning_rate": 3.741098311256616e-09,
"loss": 1.8463,
"mean_token_accuracy": 0.6313372552394867,
"num_tokens": 193257200.0,
"step": 3070
},
{
"epoch": 10.049099836333879,
"grad_norm": 17.231122883700056,
"learning_rate": 3.736143874894354e-09,
"loss": 1.812,
"mean_token_accuracy": 0.636261624097824,
"num_tokens": 193574165.0,
"step": 3075
},
{
"epoch": 10.065466448445171,
"grad_norm": 16.57540736745677,
"learning_rate": 3.731183004241103e-09,
"loss": 1.8135,
"mean_token_accuracy": 0.6378425419330597,
"num_tokens": 193890421.0,
"step": 3080
},
{
"epoch": 10.081833060556464,
"grad_norm": 17.806512444498733,
"learning_rate": 3.726215725118848e-09,
"loss": 1.8265,
"mean_token_accuracy": 0.6336937725543976,
"num_tokens": 194205428.0,
"step": 3085
},
{
"epoch": 10.098199672667757,
"grad_norm": 17.813348873544196,
"learning_rate": 3.721242063382926e-09,
"loss": 1.827,
"mean_token_accuracy": 0.6328922867774963,
"num_tokens": 194520954.0,
"step": 3090
},
{
"epoch": 10.11456628477905,
"grad_norm": 17.36429708821553,
"learning_rate": 3.7162620449218997e-09,
"loss": 1.796,
"mean_token_accuracy": 0.6400941967964172,
"num_tokens": 194835641.0,
"step": 3095
},
{
"epoch": 10.130932896890343,
"grad_norm": 16.481248380963812,
"learning_rate": 3.711275695657419e-09,
"loss": 1.8043,
"mean_token_accuracy": 0.6384086728096008,
"num_tokens": 195151450.0,
"step": 3100
},
{
"epoch": 10.147299509001636,
"grad_norm": 17.113804257065954,
"learning_rate": 3.7062830415440844e-09,
"loss": 1.8284,
"mean_token_accuracy": 0.631797057390213,
"num_tokens": 195467613.0,
"step": 3105
},
{
"epoch": 10.16366612111293,
"grad_norm": 16.672796542423484,
"learning_rate": 3.7012841085693164e-09,
"loss": 1.7864,
"mean_token_accuracy": 0.6400185823440552,
"num_tokens": 195783439.0,
"step": 3110
},
{
"epoch": 10.180032733224223,
"grad_norm": 16.679087590315454,
"learning_rate": 3.696278922753216e-09,
"loss": 1.8161,
"mean_token_accuracy": 0.634281975030899,
"num_tokens": 196099260.0,
"step": 3115
},
{
"epoch": 10.196399345335516,
"grad_norm": 16.53721256199728,
"learning_rate": 3.6912675101484327e-09,
"loss": 1.7851,
"mean_token_accuracy": 0.640500259399414,
"num_tokens": 196414809.0,
"step": 3120
},
{
"epoch": 10.212765957446809,
"grad_norm": 16.569237354892337,
"learning_rate": 3.686249896840026e-09,
"loss": 1.8038,
"mean_token_accuracy": 0.6380644977092743,
"num_tokens": 196729837.0,
"step": 3125
},
{
"epoch": 10.229132569558102,
"grad_norm": 16.786788982952096,
"learning_rate": 3.68122610894533e-09,
"loss": 1.8194,
"mean_token_accuracy": 0.6346737504005432,
"num_tokens": 197044385.0,
"step": 3130
},
{
"epoch": 10.245499181669395,
"grad_norm": 16.761298565172286,
"learning_rate": 3.676196172613821e-09,
"loss": 1.7857,
"mean_token_accuracy": 0.6384332537651062,
"num_tokens": 197360546.0,
"step": 3135
},
{
"epoch": 10.261865793780688,
"grad_norm": 16.87315468806941,
"learning_rate": 3.671160114026977e-09,
"loss": 1.7984,
"mean_token_accuracy": 0.63452108502388,
"num_tokens": 197676489.0,
"step": 3140
},
{
"epoch": 10.27823240589198,
"grad_norm": 16.491704243241642,
"learning_rate": 3.666117959398143e-09,
"loss": 1.8026,
"mean_token_accuracy": 0.6362354993820191,
"num_tokens": 197990771.0,
"step": 3145
},
{
"epoch": 10.294599018003273,
"grad_norm": 16.280069613051808,
"learning_rate": 3.6610697349723955e-09,
"loss": 1.7907,
"mean_token_accuracy": 0.6383823394775391,
"num_tokens": 198305211.0,
"step": 3150
},
{
"epoch": 10.310965630114566,
"grad_norm": 16.52034352373918,
"learning_rate": 3.6560154670264046e-09,
"loss": 1.7935,
"mean_token_accuracy": 0.6377637565135956,
"num_tokens": 198621331.0,
"step": 3155
},
{
"epoch": 10.327332242225859,
"grad_norm": 15.499203423449122,
"learning_rate": 3.650955181868298e-09,
"loss": 1.7475,
"mean_token_accuracy": 0.6453053712844848,
"num_tokens": 198935989.0,
"step": 3160
},
{
"epoch": 10.343698854337152,
"grad_norm": 16.370463459072013,
"learning_rate": 3.645888905837523e-09,
"loss": 1.7855,
"mean_token_accuracy": 0.6374387383460999,
"num_tokens": 199251515.0,
"step": 3165
},
{
"epoch": 10.360065466448445,
"grad_norm": 16.299945785170074,
"learning_rate": 3.6408166653047108e-09,
"loss": 1.7836,
"mean_token_accuracy": 0.6420963048934937,
"num_tokens": 199567139.0,
"step": 3170
},
{
"epoch": 10.376432078559738,
"grad_norm": 16.335326417897214,
"learning_rate": 3.63573848667154e-09,
"loss": 1.7806,
"mean_token_accuracy": 0.6386357069015502,
"num_tokens": 199881323.0,
"step": 3175
},
{
"epoch": 10.39279869067103,
"grad_norm": 16.121960949269923,
"learning_rate": 3.630654396370594e-09,
"loss": 1.7666,
"mean_token_accuracy": 0.6415923595428467,
"num_tokens": 200197082.0,
"step": 3180
},
{
"epoch": 10.409165302782323,
"grad_norm": 15.517362094147444,
"learning_rate": 3.6255644208652316e-09,
"loss": 1.7784,
"mean_token_accuracy": 0.6394968807697297,
"num_tokens": 200513159.0,
"step": 3185
},
{
"epoch": 10.425531914893616,
"grad_norm": 15.99746353634616,
"learning_rate": 3.6204685866494426e-09,
"loss": 1.7605,
"mean_token_accuracy": 0.6430169403553009,
"num_tokens": 200829112.0,
"step": 3190
},
{
"epoch": 10.44189852700491,
"grad_norm": 16.322051281430188,
"learning_rate": 3.6153669202477113e-09,
"loss": 1.8034,
"mean_token_accuracy": 0.6361609637737274,
"num_tokens": 201145377.0,
"step": 3195
},
{
"epoch": 10.458265139116204,
"grad_norm": 17.010695164653885,
"learning_rate": 3.6102594482148815e-09,
"loss": 1.7819,
"mean_token_accuracy": 0.637069708108902,
"num_tokens": 201460905.0,
"step": 3200
},
{
"epoch": 10.474631751227497,
"grad_norm": 16.00993109809842,
"learning_rate": 3.6051461971360142e-09,
"loss": 1.7922,
"mean_token_accuracy": 0.635745245218277,
"num_tokens": 201777160.0,
"step": 3205
},
{
"epoch": 10.49099836333879,
"grad_norm": 16.528275926750045,
"learning_rate": 3.600027193626253e-09,
"loss": 1.7771,
"mean_token_accuracy": 0.6353870570659638,
"num_tokens": 202091924.0,
"step": 3210
},
{
"epoch": 10.507364975450082,
"grad_norm": 15.64504960410459,
"learning_rate": 3.5949024643306816e-09,
"loss": 1.7763,
"mean_token_accuracy": 0.6359361112117767,
"num_tokens": 202407738.0,
"step": 3215
},
{
"epoch": 10.523731587561375,
"grad_norm": 15.891551126219262,
"learning_rate": 3.5897720359241876e-09,
"loss": 1.7615,
"mean_token_accuracy": 0.6402227580547333,
"num_tokens": 202722791.0,
"step": 3220
},
{
"epoch": 10.540098199672668,
"grad_norm": 15.248335632489418,
"learning_rate": 3.5846359351113244e-09,
"loss": 1.7675,
"mean_token_accuracy": 0.6383937776088715,
"num_tokens": 203038092.0,
"step": 3225
},
{
"epoch": 10.556464811783961,
"grad_norm": 16.003380638768103,
"learning_rate": 3.57949418862617e-09,
"loss": 1.7808,
"mean_token_accuracy": 0.6339847385883332,
"num_tokens": 203354370.0,
"step": 3230
},
{
"epoch": 10.572831423895254,
"grad_norm": 15.14019211081649,
"learning_rate": 3.5743468232321897e-09,
"loss": 1.7503,
"mean_token_accuracy": 0.6410067021846771,
"num_tokens": 203671582.0,
"step": 3235
},
{
"epoch": 10.589198036006547,
"grad_norm": 15.519984653479437,
"learning_rate": 3.569193865722096e-09,
"loss": 1.7321,
"mean_token_accuracy": 0.644438773393631,
"num_tokens": 203987499.0,
"step": 3240
},
{
"epoch": 10.60556464811784,
"grad_norm": 15.407299743600658,
"learning_rate": 3.564035342917707e-09,
"loss": 1.7361,
"mean_token_accuracy": 0.6415053546428681,
"num_tokens": 204305165.0,
"step": 3245
},
{
"epoch": 10.621931260229132,
"grad_norm": 15.432492843596417,
"learning_rate": 3.558871281669811e-09,
"loss": 1.7325,
"mean_token_accuracy": 0.6469219923019409,
"num_tokens": 204621541.0,
"step": 3250
},
{
"epoch": 10.638297872340425,
"grad_norm": 15.141663299177573,
"learning_rate": 3.5537017088580244e-09,
"loss": 1.7466,
"mean_token_accuracy": 0.6414005696773529,
"num_tokens": 204938513.0,
"step": 3255
},
{
"epoch": 10.654664484451718,
"grad_norm": 15.2352538843922,
"learning_rate": 3.548526651390651e-09,
"loss": 1.7561,
"mean_token_accuracy": 0.6406930923461914,
"num_tokens": 205254497.0,
"step": 3260
},
{
"epoch": 10.671031096563011,
"grad_norm": 15.018961061358699,
"learning_rate": 3.543346136204545e-09,
"loss": 1.7721,
"mean_token_accuracy": 0.637039589881897,
"num_tokens": 205570795.0,
"step": 3265
},
{
"epoch": 10.687397708674304,
"grad_norm": 15.643328087526758,
"learning_rate": 3.538160190264966e-09,
"loss": 1.7636,
"mean_token_accuracy": 0.6392747342586518,
"num_tokens": 205886038.0,
"step": 3270
},
{
"epoch": 10.703764320785597,
"grad_norm": 14.821441582923677,
"learning_rate": 3.532968840565443e-09,
"loss": 1.7234,
"mean_token_accuracy": 0.6462870895862579,
"num_tokens": 206200570.0,
"step": 3275
},
{
"epoch": 10.72013093289689,
"grad_norm": 15.34731712498548,
"learning_rate": 3.5277721141276327e-09,
"loss": 1.7762,
"mean_token_accuracy": 0.6353504419326782,
"num_tokens": 206516423.0,
"step": 3280
},
{
"epoch": 10.736497545008183,
"grad_norm": 14.598592610119189,
"learning_rate": 3.522570038001177e-09,
"loss": 1.7293,
"mean_token_accuracy": 0.6442684292793274,
"num_tokens": 206833415.0,
"step": 3285
},
{
"epoch": 10.752864157119475,
"grad_norm": 15.030434843310996,
"learning_rate": 3.5173626392635645e-09,
"loss": 1.7071,
"mean_token_accuracy": 0.648097550868988,
"num_tokens": 207148402.0,
"step": 3290
},
{
"epoch": 10.76923076923077,
"grad_norm": 14.42367654172462,
"learning_rate": 3.512149945019989e-09,
"loss": 1.7067,
"mean_token_accuracy": 0.6503620088100434,
"num_tokens": 207463073.0,
"step": 3295
},
{
"epoch": 10.785597381342063,
"grad_norm": 14.850989940280288,
"learning_rate": 3.5069319824032076e-09,
"loss": 1.7476,
"mean_token_accuracy": 0.638144338130951,
"num_tokens": 207777543.0,
"step": 3300
},
{
"epoch": 10.801963993453356,
"grad_norm": 15.007681069243336,
"learning_rate": 3.5017087785734e-09,
"loss": 1.7264,
"mean_token_accuracy": 0.6454558491706848,
"num_tokens": 208094500.0,
"step": 3305
},
{
"epoch": 10.818330605564649,
"grad_norm": 15.777790947959936,
"learning_rate": 3.496480360718026e-09,
"loss": 1.745,
"mean_token_accuracy": 0.6422793984413147,
"num_tokens": 208410348.0,
"step": 3310
},
{
"epoch": 10.834697217675942,
"grad_norm": 15.124131114888696,
"learning_rate": 3.4912467560516886e-09,
"loss": 1.7451,
"mean_token_accuracy": 0.6406438410282135,
"num_tokens": 208725794.0,
"step": 3315
},
{
"epoch": 10.851063829787234,
"grad_norm": 14.901110226184628,
"learning_rate": 3.4860079918159844e-09,
"loss": 1.7217,
"mean_token_accuracy": 0.6472250401973725,
"num_tokens": 209041520.0,
"step": 3320
},
{
"epoch": 10.867430441898527,
"grad_norm": 14.874077465654924,
"learning_rate": 3.4807640952793695e-09,
"loss": 1.7252,
"mean_token_accuracy": 0.6431742966175079,
"num_tokens": 209357603.0,
"step": 3325
},
{
"epoch": 10.88379705400982,
"grad_norm": 14.995454699176682,
"learning_rate": 3.4755150937370124e-09,
"loss": 1.7256,
"mean_token_accuracy": 0.6463587701320648,
"num_tokens": 209673991.0,
"step": 3330
},
{
"epoch": 10.900163666121113,
"grad_norm": 14.494540634301961,
"learning_rate": 3.4702610145106545e-09,
"loss": 1.6856,
"mean_token_accuracy": 0.6515645325183869,
"num_tokens": 209989177.0,
"step": 3335
},
{
"epoch": 10.916530278232406,
"grad_norm": 14.950428049798738,
"learning_rate": 3.465001884948468e-09,
"loss": 1.7412,
"mean_token_accuracy": 0.6403026700019836,
"num_tokens": 210304282.0,
"step": 3340
},
{
"epoch": 10.932896890343699,
"grad_norm": 14.463577816118367,
"learning_rate": 3.45973773242491e-09,
"loss": 1.7279,
"mean_token_accuracy": 0.6433824181556702,
"num_tokens": 210618390.0,
"step": 3345
},
{
"epoch": 10.949263502454992,
"grad_norm": 14.518198523486582,
"learning_rate": 3.4544685843405875e-09,
"loss": 1.696,
"mean_token_accuracy": 0.6500982105731964,
"num_tokens": 210932450.0,
"step": 3350
},
{
"epoch": 10.965630114566284,
"grad_norm": 14.202566979681045,
"learning_rate": 3.4491944681221065e-09,
"loss": 1.7046,
"mean_token_accuracy": 0.6483056366443634,
"num_tokens": 211249906.0,
"step": 3355
},
{
"epoch": 10.981996726677577,
"grad_norm": 14.357942906401359,
"learning_rate": 3.443915411221933e-09,
"loss": 1.6951,
"mean_token_accuracy": 0.6466932356357574,
"num_tokens": 211566650.0,
"step": 3360
},
{
"epoch": 10.99836333878887,
"grad_norm": 13.989121868685887,
"learning_rate": 3.43863144111825e-09,
"loss": 1.716,
"mean_token_accuracy": 0.6450442373752594,
"num_tokens": 211881793.0,
"step": 3365
},
{
"epoch": 11.013093289689035,
"grad_norm": 14.34498478284,
"learning_rate": 3.4333425853148157e-09,
"loss": 1.6897,
"mean_token_accuracy": 0.64995010693868,
"num_tokens": 212143193.0,
"step": 3370
},
{
"epoch": 11.029459901800328,
"grad_norm": 14.53268159695013,
"learning_rate": 3.4280488713408185e-09,
"loss": 1.7068,
"mean_token_accuracy": 0.6474012017250061,
"num_tokens": 212460404.0,
"step": 3375
},
{
"epoch": 11.04582651391162,
"grad_norm": 14.166663868063697,
"learning_rate": 3.4227503267507332e-09,
"loss": 1.7053,
"mean_token_accuracy": 0.6485930442810058,
"num_tokens": 212777430.0,
"step": 3380
},
{
"epoch": 11.062193126022914,
"grad_norm": 14.456689486688052,
"learning_rate": 3.41744697912418e-09,
"loss": 1.7112,
"mean_token_accuracy": 0.6475742220878601,
"num_tokens": 213093496.0,
"step": 3385
},
{
"epoch": 11.078559738134206,
"grad_norm": 14.097537246192404,
"learning_rate": 3.4121388560657785e-09,
"loss": 1.7106,
"mean_token_accuracy": 0.6455345630645752,
"num_tokens": 213409098.0,
"step": 3390
},
{
"epoch": 11.0949263502455,
"grad_norm": 13.933961411959768,
"learning_rate": 3.406825985205005e-09,
"loss": 1.729,
"mean_token_accuracy": 0.6434445321559906,
"num_tokens": 213725910.0,
"step": 3395
},
{
"epoch": 11.111292962356792,
"grad_norm": 14.752945706255815,
"learning_rate": 3.401508394196049e-09,
"loss": 1.6957,
"mean_token_accuracy": 0.6487627983093261,
"num_tokens": 214040857.0,
"step": 3400
},
{
"epoch": 11.127659574468085,
"grad_norm": 14.20709457845416,
"learning_rate": 3.39618611071767e-09,
"loss": 1.6807,
"mean_token_accuracy": 0.6522836029529572,
"num_tokens": 214356466.0,
"step": 3405
},
{
"epoch": 11.144026186579378,
"grad_norm": 13.398213700828151,
"learning_rate": 3.3908591624730512e-09,
"loss": 1.6381,
"mean_token_accuracy": 0.6592851996421814,
"num_tokens": 214672248.0,
"step": 3410
},
{
"epoch": 11.16039279869067,
"grad_norm": 13.731779015035574,
"learning_rate": 3.385527577189656e-09,
"loss": 1.7024,
"mean_token_accuracy": 0.6492987155914307,
"num_tokens": 214986750.0,
"step": 3415
},
{
"epoch": 11.176759410801964,
"grad_norm": 13.62502150214815,
"learning_rate": 3.3801913826190855e-09,
"loss": 1.6919,
"mean_token_accuracy": 0.6497934758663177,
"num_tokens": 215300415.0,
"step": 3420
},
{
"epoch": 11.193126022913257,
"grad_norm": 13.588807308802734,
"learning_rate": 3.374850606536933e-09,
"loss": 1.6774,
"mean_token_accuracy": 0.6529603004455566,
"num_tokens": 215616055.0,
"step": 3425
},
{
"epoch": 11.20949263502455,
"grad_norm": 14.004348021352275,
"learning_rate": 3.3695052767426376e-09,
"loss": 1.6831,
"mean_token_accuracy": 0.6526573598384857,
"num_tokens": 215931529.0,
"step": 3430
},
{
"epoch": 11.225859247135842,
"grad_norm": 14.469828986147332,
"learning_rate": 3.3641554210593416e-09,
"loss": 1.6917,
"mean_token_accuracy": 0.6479479074478149,
"num_tokens": 216246536.0,
"step": 3435
},
{
"epoch": 11.242225859247135,
"grad_norm": 13.724821289796603,
"learning_rate": 3.358801067333747e-09,
"loss": 1.6773,
"mean_token_accuracy": 0.6549168825149536,
"num_tokens": 216560534.0,
"step": 3440
},
{
"epoch": 11.258592471358428,
"grad_norm": 13.96963647530131,
"learning_rate": 3.3534422434359656e-09,
"loss": 1.6632,
"mean_token_accuracy": 0.6518442153930664,
"num_tokens": 216874009.0,
"step": 3445
},
{
"epoch": 11.27495908346972,
"grad_norm": 14.002011605528583,
"learning_rate": 3.3480789772593793e-09,
"loss": 1.6514,
"mean_token_accuracy": 0.6585093021392823,
"num_tokens": 217189692.0,
"step": 3450
},
{
"epoch": 11.291325695581016,
"grad_norm": 13.844181087235965,
"learning_rate": 3.342711296720492e-09,
"loss": 1.6611,
"mean_token_accuracy": 0.6569892466068268,
"num_tokens": 217505372.0,
"step": 3455
},
{
"epoch": 11.307692307692308,
"grad_norm": 13.798099661785878,
"learning_rate": 3.3373392297587847e-09,
"loss": 1.6746,
"mean_token_accuracy": 0.6542344272136689,
"num_tokens": 217820392.0,
"step": 3460
},
{
"epoch": 11.324058919803601,
"grad_norm": 13.633044394240601,
"learning_rate": 3.3319628043365703e-09,
"loss": 1.6726,
"mean_token_accuracy": 0.6545675575733185,
"num_tokens": 218136541.0,
"step": 3465
},
{
"epoch": 11.340425531914894,
"grad_norm": 13.646712769096844,
"learning_rate": 3.3265820484388485e-09,
"loss": 1.6754,
"mean_token_accuracy": 0.6528559982776642,
"num_tokens": 218453741.0,
"step": 3470
},
{
"epoch": 11.356792144026187,
"grad_norm": 13.174588490549636,
"learning_rate": 3.3211969900731597e-09,
"loss": 1.6592,
"mean_token_accuracy": 0.6580550074577332,
"num_tokens": 218771030.0,
"step": 3475
},
{
"epoch": 11.37315875613748,
"grad_norm": 13.901574990786047,
"learning_rate": 3.3158076572694386e-09,
"loss": 1.6747,
"mean_token_accuracy": 0.6550671398639679,
"num_tokens": 219087292.0,
"step": 3480
},
{
"epoch": 11.389525368248773,
"grad_norm": 14.015642465284007,
"learning_rate": 3.3104140780798685e-09,
"loss": 1.6646,
"mean_token_accuracy": 0.6581855952739716,
"num_tokens": 219403238.0,
"step": 3485
},
{
"epoch": 11.405891980360066,
"grad_norm": 13.904282659392736,
"learning_rate": 3.3050162805787375e-09,
"loss": 1.6741,
"mean_token_accuracy": 0.6535765409469605,
"num_tokens": 219719693.0,
"step": 3490
},
{
"epoch": 11.422258592471358,
"grad_norm": 13.789370291753515,
"learning_rate": 3.2996142928622896e-09,
"loss": 1.6654,
"mean_token_accuracy": 0.6573232293128968,
"num_tokens": 220037015.0,
"step": 3495
},
{
"epoch": 11.438625204582651,
"grad_norm": 13.832705931075548,
"learning_rate": 3.2942081430485782e-09,
"loss": 1.6465,
"mean_token_accuracy": 0.6604346215724946,
"num_tokens": 220354265.0,
"step": 3500
},
{
"epoch": 11.454991816693944,
"grad_norm": 13.557976866092726,
"learning_rate": 3.2887978592773234e-09,
"loss": 1.6666,
"mean_token_accuracy": 0.6565652191638947,
"num_tokens": 220669974.0,
"step": 3505
},
{
"epoch": 11.471358428805237,
"grad_norm": 13.210160428205072,
"learning_rate": 3.2833834697097608e-09,
"loss": 1.6527,
"mean_token_accuracy": 0.6604192018508911,
"num_tokens": 220986724.0,
"step": 3510
},
{
"epoch": 11.48772504091653,
"grad_norm": 13.689016733980383,
"learning_rate": 3.2779650025284985e-09,
"loss": 1.6738,
"mean_token_accuracy": 0.6558438301086426,
"num_tokens": 221302826.0,
"step": 3515
},
{
"epoch": 11.504091653027823,
"grad_norm": 13.329552275037532,
"learning_rate": 3.2725424859373687e-09,
"loss": 1.6531,
"mean_token_accuracy": 0.6575049161911011,
"num_tokens": 221617698.0,
"step": 3520
},
{
"epoch": 11.520458265139116,
"grad_norm": 13.630816765028563,
"learning_rate": 3.267115948161282e-09,
"loss": 1.6479,
"mean_token_accuracy": 0.6591286897659302,
"num_tokens": 221931887.0,
"step": 3525
},
{
"epoch": 11.536824877250409,
"grad_norm": 12.94733730296298,
"learning_rate": 3.2616854174460786e-09,
"loss": 1.6383,
"mean_token_accuracy": 0.6619257152080535,
"num_tokens": 222247478.0,
"step": 3530
},
{
"epoch": 11.553191489361701,
"grad_norm": 13.878682812123111,
"learning_rate": 3.256250922058383e-09,
"loss": 1.6523,
"mean_token_accuracy": 0.6560933649539947,
"num_tokens": 222561836.0,
"step": 3535
},
{
"epoch": 11.569558101472996,
"grad_norm": 13.544551226928728,
"learning_rate": 3.2508124902854567e-09,
"loss": 1.6409,
"mean_token_accuracy": 0.6584199607372284,
"num_tokens": 222878184.0,
"step": 3540
},
{
"epoch": 11.585924713584289,
"grad_norm": 13.67811999848458,
"learning_rate": 3.2453701504350507e-09,
"loss": 1.6471,
"mean_token_accuracy": 0.6592689633369446,
"num_tokens": 223192021.0,
"step": 3545
},
{
"epoch": 11.602291325695582,
"grad_norm": 13.194026543743341,
"learning_rate": 3.239923930835257e-09,
"loss": 1.6546,
"mean_token_accuracy": 0.6571596205234528,
"num_tokens": 223507736.0,
"step": 3550
},
{
"epoch": 11.618657937806875,
"grad_norm": 13.438919843074075,
"learning_rate": 3.234473859834364e-09,
"loss": 1.6612,
"mean_token_accuracy": 0.6565041303634643,
"num_tokens": 223822076.0,
"step": 3555
},
{
"epoch": 11.635024549918167,
"grad_norm": 13.572873566039807,
"learning_rate": 3.229019965800705e-09,
"loss": 1.6361,
"mean_token_accuracy": 0.6581840515136719,
"num_tokens": 224137036.0,
"step": 3560
},
{
"epoch": 11.65139116202946,
"grad_norm": 13.11267130097918,
"learning_rate": 3.2235622771225127e-09,
"loss": 1.6344,
"mean_token_accuracy": 0.6608539044857025,
"num_tokens": 224455359.0,
"step": 3565
},
{
"epoch": 11.667757774140753,
"grad_norm": 13.745388564823362,
"learning_rate": 3.2181008222077746e-09,
"loss": 1.6102,
"mean_token_accuracy": 0.6658849954605103,
"num_tokens": 224771875.0,
"step": 3570
},
{
"epoch": 11.684124386252046,
"grad_norm": 13.368804864950883,
"learning_rate": 3.2126356294840787e-09,
"loss": 1.6308,
"mean_token_accuracy": 0.6618900954723358,
"num_tokens": 225086494.0,
"step": 3575
},
{
"epoch": 11.700490998363339,
"grad_norm": 13.071279177114466,
"learning_rate": 3.2071667273984706e-09,
"loss": 1.622,
"mean_token_accuracy": 0.6624108016490936,
"num_tokens": 225399906.0,
"step": 3580
},
{
"epoch": 11.716857610474632,
"grad_norm": 13.744495890253397,
"learning_rate": 3.2016941444173014e-09,
"loss": 1.6291,
"mean_token_accuracy": 0.6606933116912842,
"num_tokens": 225716561.0,
"step": 3585
},
{
"epoch": 11.733224222585925,
"grad_norm": 13.264943229878893,
"learning_rate": 3.1962179090260845e-09,
"loss": 1.6268,
"mean_token_accuracy": 0.6614998817443848,
"num_tokens": 226031872.0,
"step": 3590
},
{
"epoch": 11.749590834697218,
"grad_norm": 13.062220154925717,
"learning_rate": 3.1907380497293427e-09,
"loss": 1.6423,
"mean_token_accuracy": 0.6583823144435883,
"num_tokens": 226348444.0,
"step": 3595
},
{
"epoch": 11.76595744680851,
"grad_norm": 13.416738533921773,
"learning_rate": 3.185254595050463e-09,
"loss": 1.619,
"mean_token_accuracy": 0.6620403587818146,
"num_tokens": 226664829.0,
"step": 3600
},
{
"epoch": 11.782324058919803,
"grad_norm": 12.674798008155374,
"learning_rate": 3.1797675735315457e-09,
"loss": 1.6193,
"mean_token_accuracy": 0.6634869813919068,
"num_tokens": 226980423.0,
"step": 3605
},
{
"epoch": 11.798690671031096,
"grad_norm": 13.115142248657635,
"learning_rate": 3.174277013733257e-09,
"loss": 1.6076,
"mean_token_accuracy": 0.6654283106327057,
"num_tokens": 227295351.0,
"step": 3610
},
{
"epoch": 11.815057283142389,
"grad_norm": 13.29350625734606,
"learning_rate": 3.1687829442346814e-09,
"loss": 1.6139,
"mean_token_accuracy": 0.6620844781398774,
"num_tokens": 227611552.0,
"step": 3615
},
{
"epoch": 11.831423895253682,
"grad_norm": 13.061041661166026,
"learning_rate": 3.1632853936331713e-09,
"loss": 1.635,
"mean_token_accuracy": 0.6598536133766174,
"num_tokens": 227925667.0,
"step": 3620
},
{
"epoch": 11.847790507364975,
"grad_norm": 13.537642032805756,
"learning_rate": 3.1577843905441977e-09,
"loss": 1.6446,
"mean_token_accuracy": 0.6571721136569977,
"num_tokens": 228241524.0,
"step": 3625
},
{
"epoch": 11.864157119476268,
"grad_norm": 13.81972607646031,
"learning_rate": 3.152279963601204e-09,
"loss": 1.6274,
"mean_token_accuracy": 0.6625904440879822,
"num_tokens": 228554391.0,
"step": 3630
},
{
"epoch": 11.880523731587562,
"grad_norm": 12.818969485029632,
"learning_rate": 3.146772141455454e-09,
"loss": 1.5954,
"mean_token_accuracy": 0.6688964426517486,
"num_tokens": 228871206.0,
"step": 3635
},
{
"epoch": 11.896890343698855,
"grad_norm": 13.422568818132685,
"learning_rate": 3.1412609527758852e-09,
"loss": 1.5942,
"mean_token_accuracy": 0.6679134428501129,
"num_tokens": 229187692.0,
"step": 3640
},
{
"epoch": 11.913256955810148,
"grad_norm": 12.777528450533671,
"learning_rate": 3.1357464262489556e-09,
"loss": 1.6013,
"mean_token_accuracy": 0.6679581284523011,
"num_tokens": 229503728.0,
"step": 3645
},
{
"epoch": 11.92962356792144,
"grad_norm": 13.562107747673085,
"learning_rate": 3.1302285905785e-09,
"loss": 1.632,
"mean_token_accuracy": 0.6611423671245575,
"num_tokens": 229819869.0,
"step": 3650
},
{
"epoch": 11.945990180032734,
"grad_norm": 12.523050886924725,
"learning_rate": 3.124707474485577e-09,
"loss": 1.6184,
"mean_token_accuracy": 0.6638299524784088,
"num_tokens": 230134907.0,
"step": 3655
},
{
"epoch": 11.962356792144027,
"grad_norm": 12.611898786093215,
"learning_rate": 3.11918310670832e-09,
"loss": 1.6214,
"mean_token_accuracy": 0.6640681028366089,
"num_tokens": 230450604.0,
"step": 3660
},
{
"epoch": 11.97872340425532,
"grad_norm": 13.276037366533183,
"learning_rate": 3.1136555160017866e-09,
"loss": 1.6167,
"mean_token_accuracy": 0.6637542247772217,
"num_tokens": 230767066.0,
"step": 3665
},
{
"epoch": 11.995090016366612,
"grad_norm": 12.74004340390755,
"learning_rate": 3.1081247311378134e-09,
"loss": 1.5976,
"mean_token_accuracy": 0.6676555573940277,
"num_tokens": 231081724.0,
"step": 3670
},
{
"epoch": 12.009819967266775,
"grad_norm": 12.948745150559041,
"learning_rate": 3.1025907809048586e-09,
"loss": 1.6031,
"mean_token_accuracy": 0.6675824787881639,
"num_tokens": 231343267.0,
"step": 3675
},
{
"epoch": 12.026186579378068,
"grad_norm": 12.843382353063111,
"learning_rate": 3.0970536941078607e-09,
"loss": 1.6057,
"mean_token_accuracy": 0.6665944814682007,
"num_tokens": 231659119.0,
"step": 3680
},
{
"epoch": 12.042553191489361,
"grad_norm": 12.824640395729176,
"learning_rate": 3.091513499568082e-09,
"loss": 1.6,
"mean_token_accuracy": 0.6633087992668152,
"num_tokens": 231975244.0,
"step": 3685
},
{
"epoch": 12.058919803600654,
"grad_norm": 12.908617166906525,
"learning_rate": 3.0859702261229617e-09,
"loss": 1.6164,
"mean_token_accuracy": 0.6648675918579101,
"num_tokens": 232292418.0,
"step": 3690
},
{
"epoch": 12.075286415711947,
"grad_norm": 13.006079515249429,
"learning_rate": 3.0804239026259663e-09,
"loss": 1.6116,
"mean_token_accuracy": 0.6631251335144043,
"num_tokens": 232607064.0,
"step": 3695
},
{
"epoch": 12.091653027823241,
"grad_norm": 12.954002093848636,
"learning_rate": 3.074874557946434e-09,
"loss": 1.6229,
"mean_token_accuracy": 0.6616682350635529,
"num_tokens": 232922763.0,
"step": 3700
},
{
"epoch": 12.108019639934534,
"grad_norm": 13.00134579290529,
"learning_rate": 3.0693222209694336e-09,
"loss": 1.5862,
"mean_token_accuracy": 0.669516533613205,
"num_tokens": 233239213.0,
"step": 3705
},
{
"epoch": 12.124386252045827,
"grad_norm": 12.585401536226662,
"learning_rate": 3.063766920595608e-09,
"loss": 1.5904,
"mean_token_accuracy": 0.6675543785095215,
"num_tokens": 233554467.0,
"step": 3710
},
{
"epoch": 12.14075286415712,
"grad_norm": 12.663377303240502,
"learning_rate": 3.058208685741023e-09,
"loss": 1.5966,
"mean_token_accuracy": 0.6668773412704467,
"num_tokens": 233871092.0,
"step": 3715
},
{
"epoch": 12.157119476268413,
"grad_norm": 12.911660404059452,
"learning_rate": 3.0526475453370206e-09,
"loss": 1.5989,
"mean_token_accuracy": 0.6680683135986328,
"num_tokens": 234188066.0,
"step": 3720
},
{
"epoch": 12.173486088379706,
"grad_norm": 12.911278000841309,
"learning_rate": 3.047083528330066e-09,
"loss": 1.5815,
"mean_token_accuracy": 0.6699133396148682,
"num_tokens": 234503095.0,
"step": 3725
},
{
"epoch": 12.189852700490999,
"grad_norm": 12.794569499674953,
"learning_rate": 3.0415166636815965e-09,
"loss": 1.5895,
"mean_token_accuracy": 0.6685011863708497,
"num_tokens": 234818492.0,
"step": 3730
},
{
"epoch": 12.206219312602292,
"grad_norm": 12.502821002737141,
"learning_rate": 3.035946980367873e-09,
"loss": 1.6057,
"mean_token_accuracy": 0.6661465525627136,
"num_tokens": 235133128.0,
"step": 3735
},
{
"epoch": 12.222585924713584,
"grad_norm": 12.054248094194744,
"learning_rate": 3.0303745073798283e-09,
"loss": 1.585,
"mean_token_accuracy": 0.6686634719371796,
"num_tokens": 235449485.0,
"step": 3740
},
{
"epoch": 12.238952536824877,
"grad_norm": 12.434412926443686,
"learning_rate": 3.0247992737229145e-09,
"loss": 1.5734,
"mean_token_accuracy": 0.6727577865123748,
"num_tokens": 235765311.0,
"step": 3745
},
{
"epoch": 12.25531914893617,
"grad_norm": 13.13401634274974,
"learning_rate": 3.0192213084169547e-09,
"loss": 1.584,
"mean_token_accuracy": 0.6712262392044067,
"num_tokens": 236081014.0,
"step": 3750
},
{
"epoch": 12.271685761047463,
"grad_norm": 12.253526888229622,
"learning_rate": 3.0136406404959894e-09,
"loss": 1.5837,
"mean_token_accuracy": 0.6716396868228912,
"num_tokens": 236396309.0,
"step": 3755
},
{
"epoch": 12.288052373158756,
"grad_norm": 12.300536705102283,
"learning_rate": 3.008057299008127e-09,
"loss": 1.5915,
"mean_token_accuracy": 0.6713536083698273,
"num_tokens": 236711861.0,
"step": 3760
},
{
"epoch": 12.304418985270049,
"grad_norm": 12.542173938542376,
"learning_rate": 3.0024713130153915e-09,
"loss": 1.5832,
"mean_token_accuracy": 0.673738706111908,
"num_tokens": 237027950.0,
"step": 3765
},
{
"epoch": 12.320785597381342,
"grad_norm": 12.858376257223238,
"learning_rate": 2.9968827115935733e-09,
"loss": 1.5897,
"mean_token_accuracy": 0.6709151029586792,
"num_tokens": 237345505.0,
"step": 3770
},
{
"epoch": 12.337152209492634,
"grad_norm": 12.689502324676397,
"learning_rate": 2.9912915238320756e-09,
"loss": 1.5791,
"mean_token_accuracy": 0.6716641247272491,
"num_tokens": 237659458.0,
"step": 3775
},
{
"epoch": 12.353518821603927,
"grad_norm": 12.381858812642847,
"learning_rate": 2.985697778833765e-09,
"loss": 1.5628,
"mean_token_accuracy": 0.6747232496738433,
"num_tokens": 237975560.0,
"step": 3780
},
{
"epoch": 12.36988543371522,
"grad_norm": 13.294914967347232,
"learning_rate": 2.9801015057148156e-09,
"loss": 1.6012,
"mean_token_accuracy": 0.6689247190952301,
"num_tokens": 238291458.0,
"step": 3785
},
{
"epoch": 12.386252045826513,
"grad_norm": 12.5479953722173,
"learning_rate": 2.974502733604565e-09,
"loss": 1.553,
"mean_token_accuracy": 0.6777917623519898,
"num_tokens": 238607733.0,
"step": 3790
},
{
"epoch": 12.402618657937808,
"grad_norm": 12.801549719803207,
"learning_rate": 2.968901491645355e-09,
"loss": 1.5887,
"mean_token_accuracy": 0.6701488494873047,
"num_tokens": 238923547.0,
"step": 3795
},
{
"epoch": 12.4189852700491,
"grad_norm": 12.751637893126095,
"learning_rate": 2.963297808992385e-09,
"loss": 1.5703,
"mean_token_accuracy": 0.6744679152965546,
"num_tokens": 239240577.0,
"step": 3800
},
{
"epoch": 12.435351882160393,
"grad_norm": 13.321683101603757,
"learning_rate": 2.9576917148135583e-09,
"loss": 1.5673,
"mean_token_accuracy": 0.6739533841609955,
"num_tokens": 239555986.0,
"step": 3805
},
{
"epoch": 12.451718494271686,
"grad_norm": 12.627225187767989,
"learning_rate": 2.9520832382893313e-09,
"loss": 1.5755,
"mean_token_accuracy": 0.6728475570678711,
"num_tokens": 239872210.0,
"step": 3810
},
{
"epoch": 12.46808510638298,
"grad_norm": 12.773046439619442,
"learning_rate": 2.9464724086125582e-09,
"loss": 1.5773,
"mean_token_accuracy": 0.6726521015167236,
"num_tokens": 240188732.0,
"step": 3815
},
{
"epoch": 12.484451718494272,
"grad_norm": 13.121405300009847,
"learning_rate": 2.940859254988344e-09,
"loss": 1.5637,
"mean_token_accuracy": 0.6766914367675781,
"num_tokens": 240503830.0,
"step": 3820
},
{
"epoch": 12.500818330605565,
"grad_norm": 12.688014889794543,
"learning_rate": 2.9352438066338895e-09,
"loss": 1.5945,
"mean_token_accuracy": 0.671603900194168,
"num_tokens": 240819434.0,
"step": 3825
},
{
"epoch": 12.517184942716858,
"grad_norm": 12.343229509904557,
"learning_rate": 2.9296260927783397e-09,
"loss": 1.5781,
"mean_token_accuracy": 0.6743817329406738,
"num_tokens": 241135183.0,
"step": 3830
},
{
"epoch": 12.53355155482815,
"grad_norm": 12.241788090729434,
"learning_rate": 2.924006142662632e-09,
"loss": 1.5489,
"mean_token_accuracy": 0.6783745586872101,
"num_tokens": 241452289.0,
"step": 3835
},
{
"epoch": 12.549918166939444,
"grad_norm": 12.800379983122136,
"learning_rate": 2.918383985539344e-09,
"loss": 1.581,
"mean_token_accuracy": 0.6698661625385285,
"num_tokens": 241766768.0,
"step": 3840
},
{
"epoch": 12.566284779050736,
"grad_norm": 12.628445454216601,
"learning_rate": 2.9127596506725405e-09,
"loss": 1.5541,
"mean_token_accuracy": 0.6765264332294464,
"num_tokens": 242082935.0,
"step": 3845
},
{
"epoch": 12.58265139116203,
"grad_norm": 12.663307813547721,
"learning_rate": 2.9071331673376223e-09,
"loss": 1.5785,
"mean_token_accuracy": 0.6735577821731568,
"num_tokens": 242398564.0,
"step": 3850
},
{
"epoch": 12.599018003273322,
"grad_norm": 11.901477804269593,
"learning_rate": 2.901504564821173e-09,
"loss": 1.5631,
"mean_token_accuracy": 0.6759593665599823,
"num_tokens": 242713296.0,
"step": 3855
},
{
"epoch": 12.615384615384615,
"grad_norm": 12.861647771494315,
"learning_rate": 2.8958738724208073e-09,
"loss": 1.5661,
"mean_token_accuracy": 0.6735385596752167,
"num_tokens": 243027655.0,
"step": 3860
},
{
"epoch": 12.631751227495908,
"grad_norm": 12.532695918194063,
"learning_rate": 2.8902411194450174e-09,
"loss": 1.5713,
"mean_token_accuracy": 0.6764239609241486,
"num_tokens": 243341790.0,
"step": 3865
},
{
"epoch": 12.6481178396072,
"grad_norm": 12.658498402394635,
"learning_rate": 2.884606335213021e-09,
"loss": 1.5679,
"mean_token_accuracy": 0.6758080422878265,
"num_tokens": 243657854.0,
"step": 3870
},
{
"epoch": 12.664484451718494,
"grad_norm": 12.54243307666182,
"learning_rate": 2.8789695490546086e-09,
"loss": 1.5813,
"mean_token_accuracy": 0.6715005517005921,
"num_tokens": 243973655.0,
"step": 3875
},
{
"epoch": 12.680851063829786,
"grad_norm": 12.530149231407798,
"learning_rate": 2.8733307903099926e-09,
"loss": 1.5675,
"mean_token_accuracy": 0.675896018743515,
"num_tokens": 244290378.0,
"step": 3880
},
{
"epoch": 12.69721767594108,
"grad_norm": 12.608277259330517,
"learning_rate": 2.867690088329651e-09,
"loss": 1.5454,
"mean_token_accuracy": 0.6792291462421417,
"num_tokens": 244605616.0,
"step": 3885
},
{
"epoch": 12.713584288052374,
"grad_norm": 12.672281948216312,
"learning_rate": 2.8620474724741764e-09,
"loss": 1.5589,
"mean_token_accuracy": 0.6749845445156097,
"num_tokens": 244920074.0,
"step": 3890
},
{
"epoch": 12.729950900163667,
"grad_norm": 13.022006524153223,
"learning_rate": 2.8564029721141272e-09,
"loss": 1.5733,
"mean_token_accuracy": 0.6721878945827484,
"num_tokens": 245236410.0,
"step": 3895
},
{
"epoch": 12.74631751227496,
"grad_norm": 12.367334432965174,
"learning_rate": 2.850756616629865e-09,
"loss": 1.5556,
"mean_token_accuracy": 0.6784782648086548,
"num_tokens": 245550979.0,
"step": 3900
},
{
"epoch": 12.762684124386253,
"grad_norm": 12.58712217252632,
"learning_rate": 2.8451084354114132e-09,
"loss": 1.5548,
"mean_token_accuracy": 0.6767111480236053,
"num_tokens": 245866265.0,
"step": 3905
},
{
"epoch": 12.779050736497545,
"grad_norm": 12.763252688413466,
"learning_rate": 2.839458457858294e-09,
"loss": 1.5485,
"mean_token_accuracy": 0.677623575925827,
"num_tokens": 246182198.0,
"step": 3910
},
{
"epoch": 12.795417348608838,
"grad_norm": 12.300357752454952,
"learning_rate": 2.8338067133793816e-09,
"loss": 1.5598,
"mean_token_accuracy": 0.6755125164985657,
"num_tokens": 246497923.0,
"step": 3915
},
{
"epoch": 12.811783960720131,
"grad_norm": 12.940198469333163,
"learning_rate": 2.8281532313927477e-09,
"loss": 1.5444,
"mean_token_accuracy": 0.6788832068443298,
"num_tokens": 246812317.0,
"step": 3920
},
{
"epoch": 12.828150572831424,
"grad_norm": 12.408410874213551,
"learning_rate": 2.8224980413255086e-09,
"loss": 1.5558,
"mean_token_accuracy": 0.6768961608409881,
"num_tokens": 247128288.0,
"step": 3925
},
{
"epoch": 12.844517184942717,
"grad_norm": 12.506008892788866,
"learning_rate": 2.8168411726136682e-09,
"loss": 1.5463,
"mean_token_accuracy": 0.6788853287696839,
"num_tokens": 247443898.0,
"step": 3930
},
{
"epoch": 12.86088379705401,
"grad_norm": 12.158494287742517,
"learning_rate": 2.8111826547019715e-09,
"loss": 1.5577,
"mean_token_accuracy": 0.6765970945358276,
"num_tokens": 247757914.0,
"step": 3935
},
{
"epoch": 12.877250409165303,
"grad_norm": 12.62478478655889,
"learning_rate": 2.8055225170437455e-09,
"loss": 1.5492,
"mean_token_accuracy": 0.6792675971984863,
"num_tokens": 248072481.0,
"step": 3940
},
{
"epoch": 12.893617021276595,
"grad_norm": 12.75354807025127,
"learning_rate": 2.7998607891007495e-09,
"loss": 1.565,
"mean_token_accuracy": 0.67542844414711,
"num_tokens": 248387709.0,
"step": 3945
},
{
"epoch": 12.909983633387888,
"grad_norm": 13.058935530125755,
"learning_rate": 2.7941975003430204e-09,
"loss": 1.5746,
"mean_token_accuracy": 0.6717760920524597,
"num_tokens": 248704386.0,
"step": 3950
},
{
"epoch": 12.926350245499181,
"grad_norm": 12.297537130228084,
"learning_rate": 2.7885326802487175e-09,
"loss": 1.5368,
"mean_token_accuracy": 0.6805076479911805,
"num_tokens": 249020305.0,
"step": 3955
},
{
"epoch": 12.942716857610474,
"grad_norm": 12.224890011682676,
"learning_rate": 2.782866358303973e-09,
"loss": 1.5402,
"mean_token_accuracy": 0.6809644043445587,
"num_tokens": 249335996.0,
"step": 3960
},
{
"epoch": 12.959083469721767,
"grad_norm": 12.010577456081947,
"learning_rate": 2.777198564002737e-09,
"loss": 1.5294,
"mean_token_accuracy": 0.6835005700588226,
"num_tokens": 249650759.0,
"step": 3965
},
{
"epoch": 12.97545008183306,
"grad_norm": 12.755477098006706,
"learning_rate": 2.7715293268466204e-09,
"loss": 1.5377,
"mean_token_accuracy": 0.6816042780876159,
"num_tokens": 249965419.0,
"step": 3970
},
{
"epoch": 12.991816693944354,
"grad_norm": 12.11533297566282,
"learning_rate": 2.765858676344747e-09,
"loss": 1.556,
"mean_token_accuracy": 0.6773697674274445,
"num_tokens": 250280974.0,
"step": 3975
},
{
"epoch": 13.006546644844518,
"grad_norm": 12.53886715357263,
"learning_rate": 2.7601866420135955e-09,
"loss": 1.5426,
"mean_token_accuracy": 0.6790764596727159,
"num_tokens": 250541723.0,
"step": 3980
},
{
"epoch": 13.02291325695581,
"grad_norm": 12.267173509734528,
"learning_rate": 2.7545132533768503e-09,
"loss": 1.5455,
"mean_token_accuracy": 0.6799408495426178,
"num_tokens": 250857786.0,
"step": 3985
},
{
"epoch": 13.039279869067103,
"grad_norm": 12.659666618100243,
"learning_rate": 2.7488385399652418e-09,
"loss": 1.5442,
"mean_token_accuracy": 0.6778158783912659,
"num_tokens": 251172605.0,
"step": 3990
},
{
"epoch": 13.055646481178396,
"grad_norm": 13.190436807019603,
"learning_rate": 2.7431625313163973e-09,
"loss": 1.5529,
"mean_token_accuracy": 0.6766701757907867,
"num_tokens": 251489221.0,
"step": 3995
},
{
"epoch": 13.072013093289689,
"grad_norm": 11.806954687900037,
"learning_rate": 2.7374852569746872e-09,
"loss": 1.5532,
"mean_token_accuracy": 0.6783233880996704,
"num_tokens": 251805450.0,
"step": 4000
},
{
"epoch": 13.088379705400982,
"grad_norm": 12.442400974599957,
"learning_rate": 2.7318067464910685e-09,
"loss": 1.5376,
"mean_token_accuracy": 0.6806444525718689,
"num_tokens": 252120975.0,
"step": 4005
},
{
"epoch": 13.104746317512275,
"grad_norm": 12.48596610332013,
"learning_rate": 2.726127029422934e-09,
"loss": 1.5271,
"mean_token_accuracy": 0.6825267374515533,
"num_tokens": 252436301.0,
"step": 4010
},
{
"epoch": 13.121112929623568,
"grad_norm": 12.5605303792605,
"learning_rate": 2.7204461353339542e-09,
"loss": 1.5284,
"mean_token_accuracy": 0.6824884533882141,
"num_tokens": 252752055.0,
"step": 4015
},
{
"epoch": 13.13747954173486,
"grad_norm": 12.961259371866541,
"learning_rate": 2.714764093793929e-09,
"loss": 1.5503,
"mean_token_accuracy": 0.6780583560466766,
"num_tokens": 253069583.0,
"step": 4020
},
{
"epoch": 13.153846153846153,
"grad_norm": 12.485200498103994,
"learning_rate": 2.7090809343786294e-09,
"loss": 1.5426,
"mean_token_accuracy": 0.6800074696540832,
"num_tokens": 253385821.0,
"step": 4025
},
{
"epoch": 13.170212765957446,
"grad_norm": 12.47147244014249,
"learning_rate": 2.703396686669646e-09,
"loss": 1.5212,
"mean_token_accuracy": 0.6839386880397796,
"num_tokens": 253701910.0,
"step": 4030
},
{
"epoch": 13.186579378068739,
"grad_norm": 12.52921187779744,
"learning_rate": 2.6977113802542337e-09,
"loss": 1.5301,
"mean_token_accuracy": 0.6811168432235718,
"num_tokens": 254018498.0,
"step": 4035
},
{
"epoch": 13.202945990180032,
"grad_norm": 12.47194604814109,
"learning_rate": 2.6920250447251564e-09,
"loss": 1.5287,
"mean_token_accuracy": 0.6823192477226258,
"num_tokens": 254335166.0,
"step": 4040
},
{
"epoch": 13.219312602291327,
"grad_norm": 13.154414990831478,
"learning_rate": 2.686337709680538e-09,
"loss": 1.5417,
"mean_token_accuracy": 0.6793733775615692,
"num_tokens": 254648953.0,
"step": 4045
},
{
"epoch": 13.23567921440262,
"grad_norm": 12.742574961680194,
"learning_rate": 2.6806494047237022e-09,
"loss": 1.5403,
"mean_token_accuracy": 0.6783177971839904,
"num_tokens": 254964346.0,
"step": 4050
},
{
"epoch": 13.252045826513912,
"grad_norm": 12.024039931655503,
"learning_rate": 2.6749601594630236e-09,
"loss": 1.523,
"mean_token_accuracy": 0.6833492398262024,
"num_tokens": 255280007.0,
"step": 4055
},
{
"epoch": 13.268412438625205,
"grad_norm": 12.35117661325111,
"learning_rate": 2.669270003511769e-09,
"loss": 1.5343,
"mean_token_accuracy": 0.6811910986900329,
"num_tokens": 255596838.0,
"step": 4060
},
{
"epoch": 13.284779050736498,
"grad_norm": 12.549651974040733,
"learning_rate": 2.663578966487946e-09,
"loss": 1.5343,
"mean_token_accuracy": 0.6812186896800995,
"num_tokens": 255911585.0,
"step": 4065
},
{
"epoch": 13.30114566284779,
"grad_norm": 12.89348141219778,
"learning_rate": 2.65788707801415e-09,
"loss": 1.5395,
"mean_token_accuracy": 0.6799389600753785,
"num_tokens": 256227004.0,
"step": 4070
},
{
"epoch": 13.317512274959084,
"grad_norm": 12.66495954354997,
"learning_rate": 2.652194367717406e-09,
"loss": 1.5107,
"mean_token_accuracy": 0.6846579313278198,
"num_tokens": 256543458.0,
"step": 4075
},
{
"epoch": 13.333878887070377,
"grad_norm": 12.621706259288624,
"learning_rate": 2.6465008652290177e-09,
"loss": 1.518,
"mean_token_accuracy": 0.684733635187149,
"num_tokens": 256859415.0,
"step": 4080
},
{
"epoch": 13.35024549918167,
"grad_norm": 12.097297652839174,
"learning_rate": 2.6408066001844127e-09,
"loss": 1.5196,
"mean_token_accuracy": 0.6841577529907227,
"num_tokens": 257172225.0,
"step": 4085
},
{
"epoch": 13.366612111292962,
"grad_norm": 12.508833721919608,
"learning_rate": 2.6351116022229872e-09,
"loss": 1.528,
"mean_token_accuracy": 0.6815730452537536,
"num_tokens": 257489871.0,
"step": 4090
},
{
"epoch": 13.382978723404255,
"grad_norm": 12.15282167610772,
"learning_rate": 2.6294159009879524e-09,
"loss": 1.5253,
"mean_token_accuracy": 0.6824976325035095,
"num_tokens": 257804998.0,
"step": 4095
},
{
"epoch": 13.399345335515548,
"grad_norm": 12.609861031701218,
"learning_rate": 2.6237195261261803e-09,
"loss": 1.5336,
"mean_token_accuracy": 0.6804631054401398,
"num_tokens": 258121618.0,
"step": 4100
},
{
"epoch": 13.415711947626841,
"grad_norm": 12.81485883900022,
"learning_rate": 2.6180225072880485e-09,
"loss": 1.5177,
"mean_token_accuracy": 0.683049613237381,
"num_tokens": 258438728.0,
"step": 4105
},
{
"epoch": 13.432078559738134,
"grad_norm": 12.505318879085046,
"learning_rate": 2.6123248741272883e-09,
"loss": 1.5054,
"mean_token_accuracy": 0.6859242916107178,
"num_tokens": 258753766.0,
"step": 4110
},
{
"epoch": 13.448445171849427,
"grad_norm": 12.599140341885622,
"learning_rate": 2.606626656300827e-09,
"loss": 1.5288,
"mean_token_accuracy": 0.6825321853160858,
"num_tokens": 259070867.0,
"step": 4115
},
{
"epoch": 13.46481178396072,
"grad_norm": 12.59190273859541,
"learning_rate": 2.600927883468635e-09,
"loss": 1.5409,
"mean_token_accuracy": 0.6785263419151306,
"num_tokens": 259386335.0,
"step": 4120
},
{
"epoch": 13.481178396072012,
"grad_norm": 12.959983447834418,
"learning_rate": 2.595228585293574e-09,
"loss": 1.5222,
"mean_token_accuracy": 0.6814795911312104,
"num_tokens": 259701062.0,
"step": 4125
},
{
"epoch": 13.497545008183305,
"grad_norm": 12.246060045870713,
"learning_rate": 2.589528791441237e-09,
"loss": 1.5258,
"mean_token_accuracy": 0.6814365029335022,
"num_tokens": 260016188.0,
"step": 4130
},
{
"epoch": 13.5139116202946,
"grad_norm": 11.762747726590536,
"learning_rate": 2.5838285315797988e-09,
"loss": 1.5287,
"mean_token_accuracy": 0.6819169402122498,
"num_tokens": 260332549.0,
"step": 4135
},
{
"epoch": 13.530278232405893,
"grad_norm": 12.205042415924925,
"learning_rate": 2.57812783537986e-09,
"loss": 1.5121,
"mean_token_accuracy": 0.6839991450309754,
"num_tokens": 260649111.0,
"step": 4140
},
{
"epoch": 13.546644844517186,
"grad_norm": 11.791312121137548,
"learning_rate": 2.572426732514291e-09,
"loss": 1.5255,
"mean_token_accuracy": 0.6823135852813721,
"num_tokens": 260963017.0,
"step": 4145
},
{
"epoch": 13.563011456628479,
"grad_norm": 11.911116010773625,
"learning_rate": 2.566725252658081e-09,
"loss": 1.5041,
"mean_token_accuracy": 0.6869804382324218,
"num_tokens": 261279118.0,
"step": 4150
},
{
"epoch": 13.579378068739771,
"grad_norm": 11.587422849493581,
"learning_rate": 2.56102342548818e-09,
"loss": 1.4929,
"mean_token_accuracy": 0.6880825638771058,
"num_tokens": 261595332.0,
"step": 4155
},
{
"epoch": 13.595744680851064,
"grad_norm": 12.957142991751684,
"learning_rate": 2.555321280683346e-09,
"loss": 1.54,
"mean_token_accuracy": 0.6787398636341095,
"num_tokens": 261910265.0,
"step": 4160
},
{
"epoch": 13.612111292962357,
"grad_norm": 12.257003877444124,
"learning_rate": 2.549618847923991e-09,
"loss": 1.5058,
"mean_token_accuracy": 0.6841898381710052,
"num_tokens": 262224697.0,
"step": 4165
},
{
"epoch": 13.62847790507365,
"grad_norm": 12.452796927706355,
"learning_rate": 2.543916156892025e-09,
"loss": 1.5111,
"mean_token_accuracy": 0.6824253857135772,
"num_tokens": 262541480.0,
"step": 4170
},
{
"epoch": 13.644844517184943,
"grad_norm": 12.030249176136618,
"learning_rate": 2.5382132372707027e-09,
"loss": 1.5126,
"mean_token_accuracy": 0.6844086408615112,
"num_tokens": 262855801.0,
"step": 4175
},
{
"epoch": 13.661211129296236,
"grad_norm": 12.387048565345177,
"learning_rate": 2.5325101187444694e-09,
"loss": 1.5159,
"mean_token_accuracy": 0.6827063441276551,
"num_tokens": 263170799.0,
"step": 4180
},
{
"epoch": 13.677577741407529,
"grad_norm": 12.22309480247876,
"learning_rate": 2.526806830998804e-09,
"loss": 1.523,
"mean_token_accuracy": 0.6828779339790344,
"num_tokens": 263486969.0,
"step": 4185
},
{
"epoch": 13.693944353518821,
"grad_norm": 12.525816516791572,
"learning_rate": 2.5211034037200675e-09,
"loss": 1.5194,
"mean_token_accuracy": 0.6844118356704711,
"num_tokens": 263801197.0,
"step": 4190
},
{
"epoch": 13.710310965630114,
"grad_norm": 12.434982126404309,
"learning_rate": 2.515399866595347e-09,
"loss": 1.4834,
"mean_token_accuracy": 0.6887698948383332,
"num_tokens": 264117363.0,
"step": 4195
},
{
"epoch": 13.726677577741407,
"grad_norm": 12.238598723305001,
"learning_rate": 2.509696249312301e-09,
"loss": 1.5066,
"mean_token_accuracy": 0.685460901260376,
"num_tokens": 264432646.0,
"step": 4200
},
{
"epoch": 13.7430441898527,
"grad_norm": 12.650740657273452,
"learning_rate": 2.503992581559005e-09,
"loss": 1.5332,
"mean_token_accuracy": 0.678538054227829,
"num_tokens": 264747827.0,
"step": 4205
},
{
"epoch": 13.759410801963993,
"grad_norm": 12.099360007266315,
"learning_rate": 2.4982888930237996e-09,
"loss": 1.4965,
"mean_token_accuracy": 0.6864968955516815,
"num_tokens": 265063292.0,
"step": 4210
},
{
"epoch": 13.775777414075286,
"grad_norm": 12.271944931579954,
"learning_rate": 2.49258521339513e-09,
"loss": 1.4828,
"mean_token_accuracy": 0.6900125324726105,
"num_tokens": 265377285.0,
"step": 4215
},
{
"epoch": 13.792144026186579,
"grad_norm": 12.611396153670063,
"learning_rate": 2.4868815723613977e-09,
"loss": 1.5133,
"mean_token_accuracy": 0.6828788638114929,
"num_tokens": 265692743.0,
"step": 4220
},
{
"epoch": 13.808510638297872,
"grad_norm": 12.01276121557013,
"learning_rate": 2.4811779996108013e-09,
"loss": 1.4912,
"mean_token_accuracy": 0.6870843648910523,
"num_tokens": 266007767.0,
"step": 4225
},
{
"epoch": 13.824877250409166,
"grad_norm": 12.19758047419805,
"learning_rate": 2.475474524831185e-09,
"loss": 1.5152,
"mean_token_accuracy": 0.683402705192566,
"num_tokens": 266324778.0,
"step": 4230
},
{
"epoch": 13.841243862520459,
"grad_norm": 12.479958154608948,
"learning_rate": 2.4697711777098836e-09,
"loss": 1.5082,
"mean_token_accuracy": 0.6837309658527374,
"num_tokens": 266641733.0,
"step": 4235
},
{
"epoch": 13.857610474631752,
"grad_norm": 12.400439983424395,
"learning_rate": 2.464067987933567e-09,
"loss": 1.4813,
"mean_token_accuracy": 0.688559752702713,
"num_tokens": 266958456.0,
"step": 4240
},
{
"epoch": 13.873977086743045,
"grad_norm": 12.642592990777057,
"learning_rate": 2.458364985188085e-09,
"loss": 1.5132,
"mean_token_accuracy": 0.6824531733989716,
"num_tokens": 267273047.0,
"step": 4245
},
{
"epoch": 13.890343698854338,
"grad_norm": 12.72320574929076,
"learning_rate": 2.452662199158316e-09,
"loss": 1.4868,
"mean_token_accuracy": 0.6883831202983857,
"num_tokens": 267588938.0,
"step": 4250
},
{
"epoch": 13.90671031096563,
"grad_norm": 12.287005056680044,
"learning_rate": 2.4469596595280084e-09,
"loss": 1.509,
"mean_token_accuracy": 0.6836120009422302,
"num_tokens": 267905040.0,
"step": 4255
},
{
"epoch": 13.923076923076923,
"grad_norm": 12.593872648580946,
"learning_rate": 2.441257395979629e-09,
"loss": 1.4969,
"mean_token_accuracy": 0.6880811274051666,
"num_tokens": 268218957.0,
"step": 4260
},
{
"epoch": 13.939443535188216,
"grad_norm": 12.215400254590408,
"learning_rate": 2.435555438194208e-09,
"loss": 1.4883,
"mean_token_accuracy": 0.6877467036247253,
"num_tokens": 268534219.0,
"step": 4265
},
{
"epoch": 13.955810147299509,
"grad_norm": 11.71218778076416,
"learning_rate": 2.429853815851183e-09,
"loss": 1.5188,
"mean_token_accuracy": 0.6834926068782806,
"num_tokens": 268849335.0,
"step": 4270
},
{
"epoch": 13.972176759410802,
"grad_norm": 11.811582966108432,
"learning_rate": 2.424152558628246e-09,
"loss": 1.4734,
"mean_token_accuracy": 0.6901955604553223,
"num_tokens": 269164477.0,
"step": 4275
},
{
"epoch": 13.988543371522095,
"grad_norm": 12.391517223881769,
"learning_rate": 2.4184516962011894e-09,
"loss": 1.4777,
"mean_token_accuracy": 0.6901420176029205,
"num_tokens": 269481191.0,
"step": 4280
},
{
"epoch": 14.003273322422258,
"grad_norm": 11.85243276024841,
"learning_rate": 2.412751258243748e-09,
"loss": 1.4796,
"mean_token_accuracy": 0.6884656879636977,
"num_tokens": 269740807.0,
"step": 4285
},
{
"epoch": 14.01963993453355,
"grad_norm": 12.127537335092095,
"learning_rate": 2.4070512744274503e-09,
"loss": 1.4912,
"mean_token_accuracy": 0.6857654273509979,
"num_tokens": 270056263.0,
"step": 4290
},
{
"epoch": 14.036006546644845,
"grad_norm": 13.025005489656099,
"learning_rate": 2.4013517744214595e-09,
"loss": 1.5247,
"mean_token_accuracy": 0.6816625893115997,
"num_tokens": 270371668.0,
"step": 4295
},
{
"epoch": 14.052373158756138,
"grad_norm": 12.219489354645315,
"learning_rate": 2.3956527878924202e-09,
"loss": 1.5032,
"mean_token_accuracy": 0.6837205052375793,
"num_tokens": 270686642.0,
"step": 4300
},
{
"epoch": 14.068739770867431,
"grad_norm": 12.456845642626293,
"learning_rate": 2.3899543445043044e-09,
"loss": 1.4786,
"mean_token_accuracy": 0.6902859628200531,
"num_tokens": 271001819.0,
"step": 4305
},
{
"epoch": 14.085106382978724,
"grad_norm": 12.28464819163932,
"learning_rate": 2.3842564739182586e-09,
"loss": 1.496,
"mean_token_accuracy": 0.6836179614067077,
"num_tokens": 271317420.0,
"step": 4310
},
{
"epoch": 14.101472995090017,
"grad_norm": 12.790520935026995,
"learning_rate": 2.378559205792445e-09,
"loss": 1.5081,
"mean_token_accuracy": 0.6849936664104461,
"num_tokens": 271632759.0,
"step": 4315
},
{
"epoch": 14.11783960720131,
"grad_norm": 12.595762861453693,
"learning_rate": 2.372862569781893e-09,
"loss": 1.4957,
"mean_token_accuracy": 0.6861394762992858,
"num_tokens": 271947230.0,
"step": 4320
},
{
"epoch": 14.134206219312603,
"grad_norm": 12.571761249252841,
"learning_rate": 2.3671665955383383e-09,
"loss": 1.5141,
"mean_token_accuracy": 0.6836262583732605,
"num_tokens": 272263460.0,
"step": 4325
},
{
"epoch": 14.150572831423895,
"grad_norm": 12.156579183616737,
"learning_rate": 2.3614713127100753e-09,
"loss": 1.4833,
"mean_token_accuracy": 0.687548840045929,
"num_tokens": 272580508.0,
"step": 4330
},
{
"epoch": 14.166939443535188,
"grad_norm": 12.06870266875961,
"learning_rate": 2.3557767509417978e-09,
"loss": 1.4899,
"mean_token_accuracy": 0.6882768452167511,
"num_tokens": 272895912.0,
"step": 4335
},
{
"epoch": 14.183306055646481,
"grad_norm": 11.801476081580352,
"learning_rate": 2.3500829398744456e-09,
"loss": 1.4797,
"mean_token_accuracy": 0.6898080468177795,
"num_tokens": 273209887.0,
"step": 4340
},
{
"epoch": 14.199672667757774,
"grad_norm": 13.11999339098903,
"learning_rate": 2.3443899091450532e-09,
"loss": 1.4996,
"mean_token_accuracy": 0.6869996070861817,
"num_tokens": 273525314.0,
"step": 4345
},
{
"epoch": 14.216039279869067,
"grad_norm": 12.88555178536362,
"learning_rate": 2.3386976883865917e-09,
"loss": 1.4877,
"mean_token_accuracy": 0.6876905500888825,
"num_tokens": 273841131.0,
"step": 4350
},
{
"epoch": 14.23240589198036,
"grad_norm": 12.61192381171216,
"learning_rate": 2.333006307227817e-09,
"loss": 1.4876,
"mean_token_accuracy": 0.6866623759269714,
"num_tokens": 274156716.0,
"step": 4355
},
{
"epoch": 14.248772504091653,
"grad_norm": 12.102832652830054,
"learning_rate": 2.3273157952931137e-09,
"loss": 1.4881,
"mean_token_accuracy": 0.6889342963695526,
"num_tokens": 274470358.0,
"step": 4360
},
{
"epoch": 14.265139116202946,
"grad_norm": 12.137000267042431,
"learning_rate": 2.321626182202343e-09,
"loss": 1.4712,
"mean_token_accuracy": 0.6922279059886932,
"num_tokens": 274786272.0,
"step": 4365
},
{
"epoch": 14.281505728314238,
"grad_norm": 12.497262980217268,
"learning_rate": 2.315937497570688e-09,
"loss": 1.5073,
"mean_token_accuracy": 0.685647439956665,
"num_tokens": 275100575.0,
"step": 4370
},
{
"epoch": 14.297872340425531,
"grad_norm": 13.06102305602527,
"learning_rate": 2.3102497710084977e-09,
"loss": 1.5056,
"mean_token_accuracy": 0.6866560220718384,
"num_tokens": 275415316.0,
"step": 4375
},
{
"epoch": 14.314238952536824,
"grad_norm": 12.410306438367476,
"learning_rate": 2.304563032121135e-09,
"loss": 1.464,
"mean_token_accuracy": 0.691516500711441,
"num_tokens": 275731387.0,
"step": 4380
},
{
"epoch": 14.330605564648117,
"grad_norm": 12.57267060527453,
"learning_rate": 2.2988773105088208e-09,
"loss": 1.4798,
"mean_token_accuracy": 0.6907178342342377,
"num_tokens": 276048294.0,
"step": 4385
},
{
"epoch": 14.346972176759412,
"grad_norm": 12.71377949896571,
"learning_rate": 2.2931926357664828e-09,
"loss": 1.4918,
"mean_token_accuracy": 0.6845012664794922,
"num_tokens": 276364776.0,
"step": 4390
},
{
"epoch": 14.363338788870704,
"grad_norm": 11.806182237450276,
"learning_rate": 2.2875090374835995e-09,
"loss": 1.471,
"mean_token_accuracy": 0.6929399073123932,
"num_tokens": 276680814.0,
"step": 4395
},
{
"epoch": 14.379705400981997,
"grad_norm": 12.939282258504374,
"learning_rate": 2.281826545244042e-09,
"loss": 1.4964,
"mean_token_accuracy": 0.6841404914855957,
"num_tokens": 276997538.0,
"step": 4400
},
{
"epoch": 14.39607201309329,
"grad_norm": 12.87439071641723,
"learning_rate": 2.2761451886259303e-09,
"loss": 1.4989,
"mean_token_accuracy": 0.6866525292396546,
"num_tokens": 277313452.0,
"step": 4405
},
{
"epoch": 14.412438625204583,
"grad_norm": 12.779019358899058,
"learning_rate": 2.27046499720147e-09,
"loss": 1.4862,
"mean_token_accuracy": 0.6897963464260102,
"num_tokens": 277629752.0,
"step": 4410
},
{
"epoch": 14.428805237315876,
"grad_norm": 12.611209638210697,
"learning_rate": 2.2647860005368025e-09,
"loss": 1.4835,
"mean_token_accuracy": 0.6918173551559448,
"num_tokens": 277946687.0,
"step": 4415
},
{
"epoch": 14.445171849427169,
"grad_norm": 12.201971543911935,
"learning_rate": 2.259108228191851e-09,
"loss": 1.4688,
"mean_token_accuracy": 0.6911928713321686,
"num_tokens": 278264205.0,
"step": 4420
},
{
"epoch": 14.461538461538462,
"grad_norm": 12.967808337071494,
"learning_rate": 2.2534317097201633e-09,
"loss": 1.4748,
"mean_token_accuracy": 0.6898449957370758,
"num_tokens": 278578719.0,
"step": 4425
},
{
"epoch": 14.477905073649755,
"grad_norm": 12.69460481544349,
"learning_rate": 2.2477564746687644e-09,
"loss": 1.4825,
"mean_token_accuracy": 0.6889922320842743,
"num_tokens": 278895508.0,
"step": 4430
},
{
"epoch": 14.494271685761047,
"grad_norm": 12.038321940990556,
"learning_rate": 2.242082552577996e-09,
"loss": 1.4809,
"mean_token_accuracy": 0.6891875326633453,
"num_tokens": 279210632.0,
"step": 4435
},
{
"epoch": 14.51063829787234,
"grad_norm": 12.647305286438627,
"learning_rate": 2.2364099729813668e-09,
"loss": 1.4796,
"mean_token_accuracy": 0.6954532265663147,
"num_tokens": 279526403.0,
"step": 4440
},
{
"epoch": 14.527004909983633,
"grad_norm": 12.055013468342107,
"learning_rate": 2.2307387654053978e-09,
"loss": 1.4674,
"mean_token_accuracy": 0.6918425500392914,
"num_tokens": 279842256.0,
"step": 4445
},
{
"epoch": 14.543371522094926,
"grad_norm": 12.690408025111353,
"learning_rate": 2.2250689593694696e-09,
"loss": 1.5005,
"mean_token_accuracy": 0.6831823945045471,
"num_tokens": 280157536.0,
"step": 4450
},
{
"epoch": 14.559738134206219,
"grad_norm": 12.193205510406495,
"learning_rate": 2.2194005843856633e-09,
"loss": 1.4756,
"mean_token_accuracy": 0.6923376679420471,
"num_tokens": 280474231.0,
"step": 4455
},
{
"epoch": 14.576104746317512,
"grad_norm": 12.09933217624297,
"learning_rate": 2.2137336699586157e-09,
"loss": 1.46,
"mean_token_accuracy": 0.6976064741611481,
"num_tokens": 280789726.0,
"step": 4460
},
{
"epoch": 14.592471358428805,
"grad_norm": 12.577292347337101,
"learning_rate": 2.2080682455853595e-09,
"loss": 1.474,
"mean_token_accuracy": 0.693066680431366,
"num_tokens": 281106145.0,
"step": 4465
},
{
"epoch": 14.608837970540097,
"grad_norm": 11.937726197090774,
"learning_rate": 2.2024043407551717e-09,
"loss": 1.4704,
"mean_token_accuracy": 0.6958605766296386,
"num_tokens": 281421354.0,
"step": 4470
},
{
"epoch": 14.62520458265139,
"grad_norm": 12.321027290389669,
"learning_rate": 2.196741984949419e-09,
"loss": 1.4515,
"mean_token_accuracy": 0.6971550405025482,
"num_tokens": 281735612.0,
"step": 4475
},
{
"epoch": 14.641571194762683,
"grad_norm": 12.440597736777564,
"learning_rate": 2.1910812076414075e-09,
"loss": 1.4636,
"mean_token_accuracy": 0.6930567026138306,
"num_tokens": 282050979.0,
"step": 4480
},
{
"epoch": 14.657937806873978,
"grad_norm": 12.248636275112409,
"learning_rate": 2.185422038296224e-09,
"loss": 1.4589,
"mean_token_accuracy": 0.6959433734416962,
"num_tokens": 282366764.0,
"step": 4485
},
{
"epoch": 14.67430441898527,
"grad_norm": 12.90372128226122,
"learning_rate": 2.1797645063705874e-09,
"loss": 1.4877,
"mean_token_accuracy": 0.6922920167446136,
"num_tokens": 282681458.0,
"step": 4490
},
{
"epoch": 14.690671031096564,
"grad_norm": 11.926305941030886,
"learning_rate": 2.174108641312694e-09,
"loss": 1.4507,
"mean_token_accuracy": 0.6994022250175476,
"num_tokens": 282998330.0,
"step": 4495
},
{
"epoch": 14.707037643207856,
"grad_norm": 12.889652279538323,
"learning_rate": 2.1684544725620626e-09,
"loss": 1.4876,
"mean_token_accuracy": 0.6908430218696594,
"num_tokens": 283314245.0,
"step": 4500
},
{
"epoch": 14.72340425531915,
"grad_norm": 12.216650977375846,
"learning_rate": 2.1628020295493844e-09,
"loss": 1.4446,
"mean_token_accuracy": 0.6986382126808166,
"num_tokens": 283631182.0,
"step": 4505
},
{
"epoch": 14.739770867430442,
"grad_norm": 12.24979629911043,
"learning_rate": 2.1571513416963645e-09,
"loss": 1.4586,
"mean_token_accuracy": 0.6951400220394135,
"num_tokens": 283947490.0,
"step": 4510
},
{
"epoch": 14.756137479541735,
"grad_norm": 12.339552640099162,
"learning_rate": 2.1515024384155752e-09,
"loss": 1.4631,
"mean_token_accuracy": 0.698326563835144,
"num_tokens": 284261505.0,
"step": 4515
},
{
"epoch": 14.772504091653028,
"grad_norm": 12.757398483091528,
"learning_rate": 2.145855349110299e-09,
"loss": 1.4715,
"mean_token_accuracy": 0.6933198750019074,
"num_tokens": 284576696.0,
"step": 4520
},
{
"epoch": 14.78887070376432,
"grad_norm": 12.728594598720028,
"learning_rate": 2.1402101031743764e-09,
"loss": 1.4623,
"mean_token_accuracy": 0.6925253510475159,
"num_tokens": 284892532.0,
"step": 4525
},
{
"epoch": 14.805237315875614,
"grad_norm": 12.400149509699013,
"learning_rate": 2.134566729992053e-09,
"loss": 1.4514,
"mean_token_accuracy": 0.6978048384189606,
"num_tokens": 285208161.0,
"step": 4530
},
{
"epoch": 14.821603927986907,
"grad_norm": 12.30488766418087,
"learning_rate": 2.128925258937826e-09,
"loss": 1.4797,
"mean_token_accuracy": 0.6912436008453369,
"num_tokens": 285525048.0,
"step": 4535
},
{
"epoch": 14.8379705400982,
"grad_norm": 12.357791717436664,
"learning_rate": 2.123285719376292e-09,
"loss": 1.4742,
"mean_token_accuracy": 0.695197343826294,
"num_tokens": 285840446.0,
"step": 4540
},
{
"epoch": 14.854337152209492,
"grad_norm": 12.342749508582655,
"learning_rate": 2.1176481406619947e-09,
"loss": 1.4587,
"mean_token_accuracy": 0.6979107022285461,
"num_tokens": 286156416.0,
"step": 4545
},
{
"epoch": 14.870703764320785,
"grad_norm": 12.728714363435309,
"learning_rate": 2.11201255213927e-09,
"loss": 1.4589,
"mean_token_accuracy": 0.6975548446178437,
"num_tokens": 286472721.0,
"step": 4550
},
{
"epoch": 14.887070376432078,
"grad_norm": 12.96147876703625,
"learning_rate": 2.1063789831420955e-09,
"loss": 1.4911,
"mean_token_accuracy": 0.6897732436656951,
"num_tokens": 286789509.0,
"step": 4555
},
{
"epoch": 14.90343698854337,
"grad_norm": 12.619512227634361,
"learning_rate": 2.1007474629939365e-09,
"loss": 1.4643,
"mean_token_accuracy": 0.6959713280200959,
"num_tokens": 287102963.0,
"step": 4560
},
{
"epoch": 14.919803600654664,
"grad_norm": 11.942604179577812,
"learning_rate": 2.0951180210075957e-09,
"loss": 1.4478,
"mean_token_accuracy": 0.6993954658508301,
"num_tokens": 287418442.0,
"step": 4565
},
{
"epoch": 14.936170212765958,
"grad_norm": 12.625415378283076,
"learning_rate": 2.089490686485054e-09,
"loss": 1.454,
"mean_token_accuracy": 0.6950019478797913,
"num_tokens": 287733944.0,
"step": 4570
},
{
"epoch": 14.952536824877251,
"grad_norm": 13.1432068644648,
"learning_rate": 2.0838654887173267e-09,
"loss": 1.476,
"mean_token_accuracy": 0.6924629271030426,
"num_tokens": 288049250.0,
"step": 4575
},
{
"epoch": 14.968903436988544,
"grad_norm": 12.757524167271344,
"learning_rate": 2.0782424569843065e-09,
"loss": 1.4576,
"mean_token_accuracy": 0.6961781263351441,
"num_tokens": 288365769.0,
"step": 4580
},
{
"epoch": 14.985270049099837,
"grad_norm": 12.63342489493526,
"learning_rate": 2.07262162055461e-09,
"loss": 1.4477,
"mean_token_accuracy": 0.6964144468307495,
"num_tokens": 288679654.0,
"step": 4585
},
{
"epoch": 15.0,
"grad_norm": 11.887885024431077,
"learning_rate": 2.0670030086854292e-09,
"loss": 1.4382,
"mean_token_accuracy": 0.6943457788891263,
"num_tokens": 288940517.0,
"step": 4590
},
{
"epoch": 15.016366612111293,
"grad_norm": 12.537734466637337,
"learning_rate": 2.061386650622375e-09,
"loss": 1.4629,
"mean_token_accuracy": 0.6950094759464264,
"num_tokens": 289255149.0,
"step": 4595
},
{
"epoch": 15.032733224222586,
"grad_norm": 13.189978916684646,
"learning_rate": 2.0557725755993286e-09,
"loss": 1.4733,
"mean_token_accuracy": 0.6929343402385711,
"num_tokens": 289569537.0,
"step": 4600
},
{
"epoch": 15.049099836333879,
"grad_norm": 12.160429562262673,
"learning_rate": 2.0501608128382854e-09,
"loss": 1.4469,
"mean_token_accuracy": 0.69942986369133,
"num_tokens": 289884661.0,
"step": 4605
},
{
"epoch": 15.065466448445171,
"grad_norm": 12.820755065669108,
"learning_rate": 2.0445513915492077e-09,
"loss": 1.458,
"mean_token_accuracy": 0.7008291482925415,
"num_tokens": 290201349.0,
"step": 4610
},
{
"epoch": 15.081833060556464,
"grad_norm": 12.469195158529283,
"learning_rate": 2.038944340929868e-09,
"loss": 1.4598,
"mean_token_accuracy": 0.695182865858078,
"num_tokens": 290517988.0,
"step": 4615
},
{
"epoch": 15.098199672667757,
"grad_norm": 12.539429180588215,
"learning_rate": 2.033339690165702e-09,
"loss": 1.4582,
"mean_token_accuracy": 0.6969483494758606,
"num_tokens": 290834097.0,
"step": 4620
},
{
"epoch": 15.11456628477905,
"grad_norm": 12.347141898709026,
"learning_rate": 2.0277374684296497e-09,
"loss": 1.4446,
"mean_token_accuracy": 0.698224401473999,
"num_tokens": 291148346.0,
"step": 4625
},
{
"epoch": 15.130932896890343,
"grad_norm": 12.477486605316148,
"learning_rate": 2.0221377048820108e-09,
"loss": 1.4726,
"mean_token_accuracy": 0.6941590189933777,
"num_tokens": 291460917.0,
"step": 4630
},
{
"epoch": 15.147299509001636,
"grad_norm": 12.597641418413549,
"learning_rate": 2.016540428670289e-09,
"loss": 1.4511,
"mean_token_accuracy": 0.6982307612895966,
"num_tokens": 291777004.0,
"step": 4635
},
{
"epoch": 15.16366612111293,
"grad_norm": 12.259001662920467,
"learning_rate": 2.0109456689290413e-09,
"loss": 1.4512,
"mean_token_accuracy": 0.6984145939350128,
"num_tokens": 292093657.0,
"step": 4640
},
{
"epoch": 15.180032733224223,
"grad_norm": 12.297638018990243,
"learning_rate": 2.0053534547797256e-09,
"loss": 1.4641,
"mean_token_accuracy": 0.693070936203003,
"num_tokens": 292409436.0,
"step": 4645
},
{
"epoch": 15.196399345335516,
"grad_norm": 12.425519490074832,
"learning_rate": 1.99976381533055e-09,
"loss": 1.4527,
"mean_token_accuracy": 0.6970273613929748,
"num_tokens": 292725817.0,
"step": 4650
},
{
"epoch": 15.212765957446809,
"grad_norm": 12.572790188395173,
"learning_rate": 1.994176779676321e-09,
"loss": 1.4643,
"mean_token_accuracy": 0.6970184624195099,
"num_tokens": 293041899.0,
"step": 4655
},
{
"epoch": 15.229132569558102,
"grad_norm": 12.559415461824194,
"learning_rate": 1.988592376898292e-09,
"loss": 1.4544,
"mean_token_accuracy": 0.6966762185096741,
"num_tokens": 293357735.0,
"step": 4660
},
{
"epoch": 15.245499181669395,
"grad_norm": 12.840229816853638,
"learning_rate": 1.9830106360640117e-09,
"loss": 1.4607,
"mean_token_accuracy": 0.6955319404602051,
"num_tokens": 293673395.0,
"step": 4665
},
{
"epoch": 15.261865793780688,
"grad_norm": 12.921981395383156,
"learning_rate": 1.977431586227173e-09,
"loss": 1.4485,
"mean_token_accuracy": 0.7030904352664947,
"num_tokens": 293988719.0,
"step": 4670
},
{
"epoch": 15.27823240589198,
"grad_norm": 12.686769441565513,
"learning_rate": 1.9718552564274626e-09,
"loss": 1.4536,
"mean_token_accuracy": 0.6988371670246124,
"num_tokens": 294305177.0,
"step": 4675
},
{
"epoch": 15.294599018003273,
"grad_norm": 12.317038010818157,
"learning_rate": 1.9662816756904084e-09,
"loss": 1.4345,
"mean_token_accuracy": 0.7039202451705933,
"num_tokens": 294621539.0,
"step": 4680
},
{
"epoch": 15.310965630114566,
"grad_norm": 12.794939081964158,
"learning_rate": 1.960710873027228e-09,
"loss": 1.4501,
"mean_token_accuracy": 0.6983423411846161,
"num_tokens": 294936167.0,
"step": 4685
},
{
"epoch": 15.327332242225859,
"grad_norm": 11.698580162988128,
"learning_rate": 1.955142877434681e-09,
"loss": 1.452,
"mean_token_accuracy": 0.700901734828949,
"num_tokens": 295252190.0,
"step": 4690
},
{
"epoch": 15.343698854337152,
"grad_norm": 12.556785946500368,
"learning_rate": 1.949577717894914e-09,
"loss": 1.4536,
"mean_token_accuracy": 0.7017082333564758,
"num_tokens": 295567293.0,
"step": 4695
},
{
"epoch": 15.360065466448445,
"grad_norm": 13.21370608963778,
"learning_rate": 1.9440154233753125e-09,
"loss": 1.4282,
"mean_token_accuracy": 0.7037009358406067,
"num_tokens": 295882469.0,
"step": 4700
},
{
"epoch": 15.376432078559738,
"grad_norm": 12.822389095916504,
"learning_rate": 1.9384560228283493e-09,
"loss": 1.4396,
"mean_token_accuracy": 0.7042408645153045,
"num_tokens": 296198439.0,
"step": 4705
},
{
"epoch": 15.39279869067103,
"grad_norm": 12.68294166232564,
"learning_rate": 1.932899545191433e-09,
"loss": 1.4679,
"mean_token_accuracy": 0.6939967513084412,
"num_tokens": 296514887.0,
"step": 4710
},
{
"epoch": 15.409165302782323,
"grad_norm": 12.8287458513874,
"learning_rate": 1.9273460193867585e-09,
"loss": 1.4523,
"mean_token_accuracy": 0.6993020355701447,
"num_tokens": 296831104.0,
"step": 4715
},
{
"epoch": 15.425531914893616,
"grad_norm": 13.046240796523204,
"learning_rate": 1.921795474321156e-09,
"loss": 1.4629,
"mean_token_accuracy": 0.6947001516819,
"num_tokens": 297145680.0,
"step": 4720
},
{
"epoch": 15.44189852700491,
"grad_norm": 13.02764337966389,
"learning_rate": 1.9162479388859405e-09,
"loss": 1.4428,
"mean_token_accuracy": 0.6996423721313476,
"num_tokens": 297462302.0,
"step": 4725
},
{
"epoch": 15.458265139116204,
"grad_norm": 12.706079510070456,
"learning_rate": 1.9107034419567616e-09,
"loss": 1.4356,
"mean_token_accuracy": 0.6992950081825257,
"num_tokens": 297778557.0,
"step": 4730
},
{
"epoch": 15.474631751227497,
"grad_norm": 12.368405600391505,
"learning_rate": 1.905162012393454e-09,
"loss": 1.4301,
"mean_token_accuracy": 0.7041154444217682,
"num_tokens": 298094510.0,
"step": 4735
},
{
"epoch": 15.49099836333879,
"grad_norm": 12.132688423350789,
"learning_rate": 1.8996236790398827e-09,
"loss": 1.4461,
"mean_token_accuracy": 0.6984711229801178,
"num_tokens": 298408407.0,
"step": 4740
},
{
"epoch": 15.507364975450082,
"grad_norm": 12.584146973765089,
"learning_rate": 1.894088470723801e-09,
"loss": 1.4434,
"mean_token_accuracy": 0.7076845765113831,
"num_tokens": 298723960.0,
"step": 4745
},
{
"epoch": 15.523731587561375,
"grad_norm": 12.461445555120452,
"learning_rate": 1.8885564162566935e-09,
"loss": 1.4504,
"mean_token_accuracy": 0.7017831683158875,
"num_tokens": 299038837.0,
"step": 4750
},
{
"epoch": 15.540098199672668,
"grad_norm": 12.049024412508352,
"learning_rate": 1.8830275444336294e-09,
"loss": 1.4351,
"mean_token_accuracy": 0.7035296440124512,
"num_tokens": 299353453.0,
"step": 4755
},
{
"epoch": 15.556464811783961,
"grad_norm": 12.489061092870097,
"learning_rate": 1.877501884033112e-09,
"loss": 1.4631,
"mean_token_accuracy": 0.7008910655975342,
"num_tokens": 299668337.0,
"step": 4760
},
{
"epoch": 15.572831423895254,
"grad_norm": 12.754571106004892,
"learning_rate": 1.871979463816928e-09,
"loss": 1.4403,
"mean_token_accuracy": 0.7025842666625977,
"num_tokens": 299984450.0,
"step": 4765
},
{
"epoch": 15.589198036006547,
"grad_norm": 12.501616382655923,
"learning_rate": 1.866460312529999e-09,
"loss": 1.4298,
"mean_token_accuracy": 0.7049470484256745,
"num_tokens": 300299888.0,
"step": 4770
},
{
"epoch": 15.60556464811784,
"grad_norm": 13.137669630274404,
"learning_rate": 1.8609444589002305e-09,
"loss": 1.4457,
"mean_token_accuracy": 0.704905104637146,
"num_tokens": 300614429.0,
"step": 4775
},
{
"epoch": 15.621931260229132,
"grad_norm": 12.794971835716657,
"learning_rate": 1.8554319316383656e-09,
"loss": 1.4323,
"mean_token_accuracy": 0.7025792956352234,
"num_tokens": 300930547.0,
"step": 4780
},
{
"epoch": 15.638297872340425,
"grad_norm": 12.796486073759842,
"learning_rate": 1.8499227594378307e-09,
"loss": 1.4345,
"mean_token_accuracy": 0.7019464492797851,
"num_tokens": 301246649.0,
"step": 4785
},
{
"epoch": 15.654664484451718,
"grad_norm": 13.004969358776405,
"learning_rate": 1.8444169709745909e-09,
"loss": 1.4368,
"mean_token_accuracy": 0.7053989946842194,
"num_tokens": 301563164.0,
"step": 4790
},
{
"epoch": 15.671031096563011,
"grad_norm": 12.547077558098191,
"learning_rate": 1.8389145949069952e-09,
"loss": 1.4333,
"mean_token_accuracy": 0.703494918346405,
"num_tokens": 301878774.0,
"step": 4795
},
{
"epoch": 15.687397708674304,
"grad_norm": 12.864772984217574,
"learning_rate": 1.8334156598756332e-09,
"loss": 1.4456,
"mean_token_accuracy": 0.7033185422420501,
"num_tokens": 302194764.0,
"step": 4800
},
{
"epoch": 15.703764320785597,
"grad_norm": 12.889303348512994,
"learning_rate": 1.8279201945031835e-09,
"loss": 1.4443,
"mean_token_accuracy": 0.7018985509872436,
"num_tokens": 302511045.0,
"step": 4805
},
{
"epoch": 15.72013093289689,
"grad_norm": 11.93106030575331,
"learning_rate": 1.8224282273942639e-09,
"loss": 1.4423,
"mean_token_accuracy": 0.7027697741985321,
"num_tokens": 302826404.0,
"step": 4810
},
{
"epoch": 15.736497545008183,
"grad_norm": 12.753590751329146,
"learning_rate": 1.8169397871352833e-09,
"loss": 1.4321,
"mean_token_accuracy": 0.702825516462326,
"num_tokens": 303143255.0,
"step": 4815
},
{
"epoch": 15.752864157119475,
"grad_norm": 12.986619290364375,
"learning_rate": 1.8114549022942933e-09,
"loss": 1.4477,
"mean_token_accuracy": 0.703691053390503,
"num_tokens": 303460830.0,
"step": 4820
},
{
"epoch": 15.76923076923077,
"grad_norm": 13.099929730581625,
"learning_rate": 1.8059736014208387e-09,
"loss": 1.4308,
"mean_token_accuracy": 0.708087545633316,
"num_tokens": 303776766.0,
"step": 4825
},
{
"epoch": 15.785597381342063,
"grad_norm": 13.00016809628579,
"learning_rate": 1.8004959130458092e-09,
"loss": 1.4495,
"mean_token_accuracy": 0.7040505766868591,
"num_tokens": 304092857.0,
"step": 4830
},
{
"epoch": 15.801963993453356,
"grad_norm": 12.622790826032567,
"learning_rate": 1.7950218656812916e-09,
"loss": 1.4421,
"mean_token_accuracy": 0.7057828962802887,
"num_tokens": 304407912.0,
"step": 4835
},
{
"epoch": 15.818330605564649,
"grad_norm": 13.074411965123678,
"learning_rate": 1.7895514878204203e-09,
"loss": 1.4364,
"mean_token_accuracy": 0.7043783128261566,
"num_tokens": 304724801.0,
"step": 4840
},
{
"epoch": 15.834697217675942,
"grad_norm": 12.357229368576297,
"learning_rate": 1.7840848079372291e-09,
"loss": 1.4217,
"mean_token_accuracy": 0.7087771832942963,
"num_tokens": 305041073.0,
"step": 4845
},
{
"epoch": 15.851063829787234,
"grad_norm": 12.527322694946676,
"learning_rate": 1.7786218544865048e-09,
"loss": 1.4421,
"mean_token_accuracy": 0.7043534576892853,
"num_tokens": 305357141.0,
"step": 4850
},
{
"epoch": 15.867430441898527,
"grad_norm": 11.632455478155906,
"learning_rate": 1.773162655903635e-09,
"loss": 1.4156,
"mean_token_accuracy": 0.7109094977378845,
"num_tokens": 305673330.0,
"step": 4855
},
{
"epoch": 15.88379705400982,
"grad_norm": 12.484383450949295,
"learning_rate": 1.7677072406044653e-09,
"loss": 1.4291,
"mean_token_accuracy": 0.7088080048561096,
"num_tokens": 305988151.0,
"step": 4860
},
{
"epoch": 15.900163666121113,
"grad_norm": 12.366432589792248,
"learning_rate": 1.7622556369851476e-09,
"loss": 1.4219,
"mean_token_accuracy": 0.7094823539257049,
"num_tokens": 306303670.0,
"step": 4865
},
{
"epoch": 15.916530278232406,
"grad_norm": 12.747791579732302,
"learning_rate": 1.7568078734219933e-09,
"loss": 1.4459,
"mean_token_accuracy": 0.7039444029331208,
"num_tokens": 306617897.0,
"step": 4870
},
{
"epoch": 15.932896890343699,
"grad_norm": 12.532223512008814,
"learning_rate": 1.751363978271327e-09,
"loss": 1.4239,
"mean_token_accuracy": 0.7083085179328918,
"num_tokens": 306933638.0,
"step": 4875
},
{
"epoch": 15.949263502454992,
"grad_norm": 12.416976278316078,
"learning_rate": 1.7459239798693363e-09,
"loss": 1.4146,
"mean_token_accuracy": 0.7101383566856384,
"num_tokens": 307249025.0,
"step": 4880
},
{
"epoch": 15.965630114566284,
"grad_norm": 13.01261682583028,
"learning_rate": 1.7404879065319268e-09,
"loss": 1.442,
"mean_token_accuracy": 0.7075551569461822,
"num_tokens": 307564915.0,
"step": 4885
},
{
"epoch": 15.981996726677577,
"grad_norm": 12.602326240128173,
"learning_rate": 1.7350557865545724e-09,
"loss": 1.4359,
"mean_token_accuracy": 0.7061196863651276,
"num_tokens": 307879676.0,
"step": 4890
},
{
"epoch": 15.99836333878887,
"grad_norm": 12.026995154848029,
"learning_rate": 1.729627648212171e-09,
"loss": 1.4224,
"mean_token_accuracy": 0.7099742531776428,
"num_tokens": 308195258.0,
"step": 4895
},
{
"epoch": 16.013093289689035,
"grad_norm": 12.487645695354066,
"learning_rate": 1.7242035197588937e-09,
"loss": 1.4465,
"mean_token_accuracy": 0.7059299283557467,
"num_tokens": 308455045.0,
"step": 4900
},
{
"epoch": 16.029459901800326,
"grad_norm": 12.742840987338324,
"learning_rate": 1.7187834294280422e-09,
"loss": 1.4347,
"mean_token_accuracy": 0.7084084928035737,
"num_tokens": 308769309.0,
"step": 4905
},
{
"epoch": 16.04582651391162,
"grad_norm": 12.28965784309087,
"learning_rate": 1.7133674054318947e-09,
"loss": 1.4004,
"mean_token_accuracy": 0.7148695111274719,
"num_tokens": 309085266.0,
"step": 4910
},
{
"epoch": 16.062193126022912,
"grad_norm": 12.425330561068234,
"learning_rate": 1.7079554759615685e-09,
"loss": 1.4026,
"mean_token_accuracy": 0.7121374189853669,
"num_tokens": 309399971.0,
"step": 4915
},
{
"epoch": 16.078559738134206,
"grad_norm": 12.318608831255458,
"learning_rate": 1.702547669186865e-09,
"loss": 1.4247,
"mean_token_accuracy": 0.7116099834442139,
"num_tokens": 309716297.0,
"step": 4920
},
{
"epoch": 16.094926350245498,
"grad_norm": 12.209031557381481,
"learning_rate": 1.6971440132561283e-09,
"loss": 1.4393,
"mean_token_accuracy": 0.7075030922889709,
"num_tokens": 310030581.0,
"step": 4925
},
{
"epoch": 16.111292962356792,
"grad_norm": 12.519677365451061,
"learning_rate": 1.6917445362960965e-09,
"loss": 1.4413,
"mean_token_accuracy": 0.7066156387329101,
"num_tokens": 310345610.0,
"step": 4930
},
{
"epoch": 16.127659574468087,
"grad_norm": 12.828979086104322,
"learning_rate": 1.6863492664117547e-09,
"loss": 1.4193,
"mean_token_accuracy": 0.7108834385871887,
"num_tokens": 310662055.0,
"step": 4935
},
{
"epoch": 16.144026186579378,
"grad_norm": 12.966364808553605,
"learning_rate": 1.680958231686191e-09,
"loss": 1.4467,
"mean_token_accuracy": 0.7091517686843872,
"num_tokens": 310977412.0,
"step": 4940
},
{
"epoch": 16.160392798690673,
"grad_norm": 12.4486500881121,
"learning_rate": 1.6755714601804473e-09,
"loss": 1.4123,
"mean_token_accuracy": 0.7110908329486847,
"num_tokens": 311293114.0,
"step": 4945
},
{
"epoch": 16.176759410801964,
"grad_norm": 12.12641458468891,
"learning_rate": 1.6701889799333764e-09,
"loss": 1.4338,
"mean_token_accuracy": 0.7081463217735291,
"num_tokens": 311608822.0,
"step": 4950
},
{
"epoch": 16.19312602291326,
"grad_norm": 12.419806067336424,
"learning_rate": 1.6648108189614937e-09,
"loss": 1.4285,
"mean_token_accuracy": 0.7072434544563293,
"num_tokens": 311924352.0,
"step": 4955
},
{
"epoch": 16.20949263502455,
"grad_norm": 12.528113062059223,
"learning_rate": 1.6594370052588325e-09,
"loss": 1.4158,
"mean_token_accuracy": 0.7136779129505157,
"num_tokens": 312239834.0,
"step": 4960
},
{
"epoch": 16.225859247135844,
"grad_norm": 12.26285948778274,
"learning_rate": 1.6540675667967975e-09,
"loss": 1.4241,
"mean_token_accuracy": 0.7105782389640808,
"num_tokens": 312555700.0,
"step": 4965
},
{
"epoch": 16.242225859247135,
"grad_norm": 12.857900754501056,
"learning_rate": 1.6487025315240205e-09,
"loss": 1.4192,
"mean_token_accuracy": 0.7106291711330414,
"num_tokens": 312873912.0,
"step": 4970
},
{
"epoch": 16.25859247135843,
"grad_norm": 12.471903624085858,
"learning_rate": 1.6433419273662134e-09,
"loss": 1.4212,
"mean_token_accuracy": 0.711182713508606,
"num_tokens": 313188562.0,
"step": 4975
},
{
"epoch": 16.27495908346972,
"grad_norm": 12.977401525766625,
"learning_rate": 1.6379857822260242e-09,
"loss": 1.4246,
"mean_token_accuracy": 0.712104445695877,
"num_tokens": 313504755.0,
"step": 4980
},
{
"epoch": 16.291325695581016,
"grad_norm": 12.38125234612746,
"learning_rate": 1.63263412398289e-09,
"loss": 1.4211,
"mean_token_accuracy": 0.7122440755367279,
"num_tokens": 313821196.0,
"step": 4985
},
{
"epoch": 16.307692307692307,
"grad_norm": 13.00957545801318,
"learning_rate": 1.6272869804928953e-09,
"loss": 1.4188,
"mean_token_accuracy": 0.712299644947052,
"num_tokens": 314135776.0,
"step": 4990
},
{
"epoch": 16.3240589198036,
"grad_norm": 12.191919695501758,
"learning_rate": 1.621944379588622e-09,
"loss": 1.4022,
"mean_token_accuracy": 0.7136437952518463,
"num_tokens": 314450399.0,
"step": 4995
},
{
"epoch": 16.340425531914892,
"grad_norm": 12.2615909933301,
"learning_rate": 1.616606349079009e-09,
"loss": 1.4097,
"mean_token_accuracy": 0.713315773010254,
"num_tokens": 314766520.0,
"step": 5000
},
{
"epoch": 16.356792144026187,
"grad_norm": 13.113137902963848,
"learning_rate": 1.611272916749205e-09,
"loss": 1.4274,
"mean_token_accuracy": 0.7096070230007172,
"num_tokens": 315080587.0,
"step": 5005
},
{
"epoch": 16.373158756137478,
"grad_norm": 13.089840378773003,
"learning_rate": 1.6059441103604248e-09,
"loss": 1.4345,
"mean_token_accuracy": 0.7088598847389221,
"num_tokens": 315396046.0,
"step": 5010
},
{
"epoch": 16.389525368248773,
"grad_norm": 13.007606738230844,
"learning_rate": 1.6006199576498043e-09,
"loss": 1.4324,
"mean_token_accuracy": 0.7114957571029663,
"num_tokens": 315712887.0,
"step": 5015
},
{
"epoch": 16.405891980360064,
"grad_norm": 12.73546978432187,
"learning_rate": 1.5953004863302579e-09,
"loss": 1.4243,
"mean_token_accuracy": 0.7114252030849457,
"num_tokens": 316028929.0,
"step": 5020
},
{
"epoch": 16.42225859247136,
"grad_norm": 11.962805948309864,
"learning_rate": 1.5899857240903293e-09,
"loss": 1.4221,
"mean_token_accuracy": 0.7109561562538147,
"num_tokens": 316344275.0,
"step": 5025
},
{
"epoch": 16.438625204582653,
"grad_norm": 12.6828623743294,
"learning_rate": 1.5846756985940544e-09,
"loss": 1.4271,
"mean_token_accuracy": 0.709166294336319,
"num_tokens": 316659921.0,
"step": 5030
},
{
"epoch": 16.454991816693944,
"grad_norm": 12.139305827793805,
"learning_rate": 1.5793704374808121e-09,
"loss": 1.4121,
"mean_token_accuracy": 0.7111846745014191,
"num_tokens": 316974336.0,
"step": 5035
},
{
"epoch": 16.47135842880524,
"grad_norm": 12.273749837279315,
"learning_rate": 1.574069968365182e-09,
"loss": 1.4156,
"mean_token_accuracy": 0.712162172794342,
"num_tokens": 317291107.0,
"step": 5040
},
{
"epoch": 16.48772504091653,
"grad_norm": 12.577531348932242,
"learning_rate": 1.5687743188368012e-09,
"loss": 1.4272,
"mean_token_accuracy": 0.7101832747459411,
"num_tokens": 317606002.0,
"step": 5045
},
{
"epoch": 16.504091653027825,
"grad_norm": 12.636590864899892,
"learning_rate": 1.5634835164602198e-09,
"loss": 1.4001,
"mean_token_accuracy": 0.7117600500583648,
"num_tokens": 317921351.0,
"step": 5050
},
{
"epoch": 16.520458265139116,
"grad_norm": 12.546261992814705,
"learning_rate": 1.5581975887747584e-09,
"loss": 1.4454,
"mean_token_accuracy": 0.7062943160533905,
"num_tokens": 318239595.0,
"step": 5055
},
{
"epoch": 16.53682487725041,
"grad_norm": 12.820298302371835,
"learning_rate": 1.5529165632943637e-09,
"loss": 1.4285,
"mean_token_accuracy": 0.7094673216342926,
"num_tokens": 318555791.0,
"step": 5060
},
{
"epoch": 16.5531914893617,
"grad_norm": 12.81934414091327,
"learning_rate": 1.5476404675074662e-09,
"loss": 1.4185,
"mean_token_accuracy": 0.7127328336238861,
"num_tokens": 318871455.0,
"step": 5065
},
{
"epoch": 16.569558101472996,
"grad_norm": 12.528634024114519,
"learning_rate": 1.5423693288768356e-09,
"loss": 1.4257,
"mean_token_accuracy": 0.7096285283565521,
"num_tokens": 319186216.0,
"step": 5070
},
{
"epoch": 16.585924713584287,
"grad_norm": 12.45352197076654,
"learning_rate": 1.5371031748394415e-09,
"loss": 1.4133,
"mean_token_accuracy": 0.7132382929325104,
"num_tokens": 319503790.0,
"step": 5075
},
{
"epoch": 16.60229132569558,
"grad_norm": 12.823560917577696,
"learning_rate": 1.5318420328063042e-09,
"loss": 1.433,
"mean_token_accuracy": 0.7091699779033661,
"num_tokens": 319819206.0,
"step": 5080
},
{
"epoch": 16.618657937806873,
"grad_norm": 12.551449755539915,
"learning_rate": 1.526585930162359e-09,
"loss": 1.3969,
"mean_token_accuracy": 0.7174212098121643,
"num_tokens": 320135045.0,
"step": 5085
},
{
"epoch": 16.635024549918167,
"grad_norm": 12.432937590754685,
"learning_rate": 1.5213348942663091e-09,
"loss": 1.4038,
"mean_token_accuracy": 0.712946742773056,
"num_tokens": 320451345.0,
"step": 5090
},
{
"epoch": 16.65139116202946,
"grad_norm": 12.591699551531104,
"learning_rate": 1.5160889524504857e-09,
"loss": 1.402,
"mean_token_accuracy": 0.7149172186851501,
"num_tokens": 320765107.0,
"step": 5095
},
{
"epoch": 16.667757774140753,
"grad_norm": 12.558780429229012,
"learning_rate": 1.5108481320207031e-09,
"loss": 1.409,
"mean_token_accuracy": 0.7116455256938934,
"num_tokens": 321080526.0,
"step": 5100
},
{
"epoch": 16.684124386252044,
"grad_norm": 12.556788522001066,
"learning_rate": 1.5056124602561197e-09,
"loss": 1.4188,
"mean_token_accuracy": 0.7126824915409088,
"num_tokens": 321395505.0,
"step": 5105
},
{
"epoch": 16.70049099836334,
"grad_norm": 12.56475488051728,
"learning_rate": 1.5003819644090933e-09,
"loss": 1.4172,
"mean_token_accuracy": 0.7109236419200897,
"num_tokens": 321711520.0,
"step": 5110
},
{
"epoch": 16.71685761047463,
"grad_norm": 12.673240593488543,
"learning_rate": 1.4951566717050408e-09,
"loss": 1.4185,
"mean_token_accuracy": 0.7114186406135559,
"num_tokens": 322028854.0,
"step": 5115
},
{
"epoch": 16.733224222585925,
"grad_norm": 12.749139578170611,
"learning_rate": 1.4899366093422962e-09,
"loss": 1.411,
"mean_token_accuracy": 0.7120306134223938,
"num_tokens": 322345696.0,
"step": 5120
},
{
"epoch": 16.74959083469722,
"grad_norm": 12.565219692692061,
"learning_rate": 1.4847218044919685e-09,
"loss": 1.3943,
"mean_token_accuracy": 0.7164922475814819,
"num_tokens": 322662655.0,
"step": 5125
},
{
"epoch": 16.76595744680851,
"grad_norm": 13.494265061680965,
"learning_rate": 1.479512284297801e-09,
"loss": 1.4221,
"mean_token_accuracy": 0.7114957809448242,
"num_tokens": 322977947.0,
"step": 5130
},
{
"epoch": 16.782324058919805,
"grad_norm": 12.453801890723668,
"learning_rate": 1.47430807587603e-09,
"loss": 1.4019,
"mean_token_accuracy": 0.7145236849784851,
"num_tokens": 323294224.0,
"step": 5135
},
{
"epoch": 16.798690671031096,
"grad_norm": 12.835301219712429,
"learning_rate": 1.4691092063152418e-09,
"loss": 1.4176,
"mean_token_accuracy": 0.7122674882411957,
"num_tokens": 323609615.0,
"step": 5140
},
{
"epoch": 16.81505728314239,
"grad_norm": 12.174453724474661,
"learning_rate": 1.4639157026762344e-09,
"loss": 1.4172,
"mean_token_accuracy": 0.7103416383266449,
"num_tokens": 323924353.0,
"step": 5145
},
{
"epoch": 16.831423895253682,
"grad_norm": 12.302664934760815,
"learning_rate": 1.458727591991877e-09,
"loss": 1.4035,
"mean_token_accuracy": 0.7144258797168732,
"num_tokens": 324241521.0,
"step": 5150
},
{
"epoch": 16.847790507364977,
"grad_norm": 12.624334579460958,
"learning_rate": 1.4535449012669638e-09,
"loss": 1.3817,
"mean_token_accuracy": 0.7187546908855438,
"num_tokens": 324556732.0,
"step": 5155
},
{
"epoch": 16.864157119476268,
"grad_norm": 12.862259746060234,
"learning_rate": 1.4483676574780814e-09,
"loss": 1.4156,
"mean_token_accuracy": 0.7101677834987641,
"num_tokens": 324872836.0,
"step": 5160
},
{
"epoch": 16.880523731587562,
"grad_norm": 12.59492906662343,
"learning_rate": 1.4431958875734616e-09,
"loss": 1.4117,
"mean_token_accuracy": 0.7114337801933288,
"num_tokens": 325187319.0,
"step": 5165
},
{
"epoch": 16.896890343698853,
"grad_norm": 12.904641520216886,
"learning_rate": 1.4380296184728447e-09,
"loss": 1.4,
"mean_token_accuracy": 0.7136520922183991,
"num_tokens": 325502445.0,
"step": 5170
},
{
"epoch": 16.913256955810148,
"grad_norm": 12.997644941022777,
"learning_rate": 1.432868877067341e-09,
"loss": 1.4183,
"mean_token_accuracy": 0.7120531141757965,
"num_tokens": 325819093.0,
"step": 5175
},
{
"epoch": 16.92962356792144,
"grad_norm": 12.457739496706811,
"learning_rate": 1.427713690219285e-09,
"loss": 1.4025,
"mean_token_accuracy": 0.7128809988498688,
"num_tokens": 326134906.0,
"step": 5180
},
{
"epoch": 16.945990180032734,
"grad_norm": 12.349458584670058,
"learning_rate": 1.4225640847621006e-09,
"loss": 1.4191,
"mean_token_accuracy": 0.7112360656261444,
"num_tokens": 326450570.0,
"step": 5185
},
{
"epoch": 16.962356792144025,
"grad_norm": 12.43600358041225,
"learning_rate": 1.4174200875001603e-09,
"loss": 1.3981,
"mean_token_accuracy": 0.7156706392765045,
"num_tokens": 326766790.0,
"step": 5190
},
{
"epoch": 16.97872340425532,
"grad_norm": 12.57363296167154,
"learning_rate": 1.4122817252086426e-09,
"loss": 1.4069,
"mean_token_accuracy": 0.7124275147914887,
"num_tokens": 327079981.0,
"step": 5195
},
{
"epoch": 16.99509001636661,
"grad_norm": 12.224492851778363,
"learning_rate": 1.4071490246333978e-09,
"loss": 1.3923,
"mean_token_accuracy": 0.7143379509449005,
"num_tokens": 327394913.0,
"step": 5200
},
{
"epoch": 17.009819967266775,
"grad_norm": 12.660254127021258,
"learning_rate": 1.4020220124908064e-09,
"loss": 1.3851,
"mean_token_accuracy": 0.7145334217283461,
"num_tokens": 327655529.0,
"step": 5205
},
{
"epoch": 17.02618657937807,
"grad_norm": 12.374122101450418,
"learning_rate": 1.3969007154676383e-09,
"loss": 1.3968,
"mean_token_accuracy": 0.7153842210769653,
"num_tokens": 327971787.0,
"step": 5210
},
{
"epoch": 17.04255319148936,
"grad_norm": 13.523104807250503,
"learning_rate": 1.3917851602209163e-09,
"loss": 1.4022,
"mean_token_accuracy": 0.712611448764801,
"num_tokens": 328288325.0,
"step": 5215
},
{
"epoch": 17.058919803600656,
"grad_norm": 12.828752397429142,
"learning_rate": 1.3866753733777766e-09,
"loss": 1.415,
"mean_token_accuracy": 0.7102250695228577,
"num_tokens": 328603210.0,
"step": 5220
},
{
"epoch": 17.075286415711947,
"grad_norm": 12.137538036991014,
"learning_rate": 1.3815713815353295e-09,
"loss": 1.3892,
"mean_token_accuracy": 0.7171003341674804,
"num_tokens": 328919502.0,
"step": 5225
},
{
"epoch": 17.09165302782324,
"grad_norm": 12.799277571065465,
"learning_rate": 1.376473211260522e-09,
"loss": 1.4185,
"mean_token_accuracy": 0.7131861090660095,
"num_tokens": 329234120.0,
"step": 5230
},
{
"epoch": 17.108019639934533,
"grad_norm": 12.603093386870297,
"learning_rate": 1.3713808890899993e-09,
"loss": 1.4136,
"mean_token_accuracy": 0.7117380559444427,
"num_tokens": 329550898.0,
"step": 5235
},
{
"epoch": 17.124386252045827,
"grad_norm": 11.975315830322039,
"learning_rate": 1.3662944415299658e-09,
"loss": 1.4063,
"mean_token_accuracy": 0.7126054346561432,
"num_tokens": 329865571.0,
"step": 5240
},
{
"epoch": 17.14075286415712,
"grad_norm": 13.417997807400582,
"learning_rate": 1.3612138950560493e-09,
"loss": 1.402,
"mean_token_accuracy": 0.7132501244544983,
"num_tokens": 330181355.0,
"step": 5245
},
{
"epoch": 17.157119476268413,
"grad_norm": 12.89015778822025,
"learning_rate": 1.3561392761131583e-09,
"loss": 1.4048,
"mean_token_accuracy": 0.7087698340415954,
"num_tokens": 330496848.0,
"step": 5250
},
{
"epoch": 17.173486088379704,
"grad_norm": 12.561411499706335,
"learning_rate": 1.3510706111153515e-09,
"loss": 1.4171,
"mean_token_accuracy": 0.710574495792389,
"num_tokens": 330812707.0,
"step": 5255
},
{
"epoch": 17.189852700491,
"grad_norm": 12.451761146260129,
"learning_rate": 1.346007926445694e-09,
"loss": 1.3968,
"mean_token_accuracy": 0.7150484561920166,
"num_tokens": 331129434.0,
"step": 5260
},
{
"epoch": 17.20621931260229,
"grad_norm": 12.336908491072222,
"learning_rate": 1.3409512484561242e-09,
"loss": 1.3909,
"mean_token_accuracy": 0.7159785687923431,
"num_tokens": 331446337.0,
"step": 5265
},
{
"epoch": 17.222585924713584,
"grad_norm": 12.821838773919659,
"learning_rate": 1.3359006034673144e-09,
"loss": 1.3834,
"mean_token_accuracy": 0.7165203213691711,
"num_tokens": 331761818.0,
"step": 5270
},
{
"epoch": 17.238952536824875,
"grad_norm": 12.394587970969967,
"learning_rate": 1.3308560177685334e-09,
"loss": 1.3957,
"mean_token_accuracy": 0.7148654341697693,
"num_tokens": 332077756.0,
"step": 5275
},
{
"epoch": 17.25531914893617,
"grad_norm": 12.842214481966593,
"learning_rate": 1.325817517617512e-09,
"loss": 1.3947,
"mean_token_accuracy": 0.7158268690109253,
"num_tokens": 332393546.0,
"step": 5280
},
{
"epoch": 17.271685761047465,
"grad_norm": 12.662037146870594,
"learning_rate": 1.3207851292403036e-09,
"loss": 1.3972,
"mean_token_accuracy": 0.7155718326568603,
"num_tokens": 332710522.0,
"step": 5285
},
{
"epoch": 17.288052373158756,
"grad_norm": 12.631565840669538,
"learning_rate": 1.3157588788311504e-09,
"loss": 1.3822,
"mean_token_accuracy": 0.715112566947937,
"num_tokens": 333026066.0,
"step": 5290
},
{
"epoch": 17.30441898527005,
"grad_norm": 13.060137316177375,
"learning_rate": 1.3107387925523445e-09,
"loss": 1.4061,
"mean_token_accuracy": 0.7136071026325226,
"num_tokens": 333338968.0,
"step": 5295
},
{
"epoch": 17.32078559738134,
"grad_norm": 12.497161370908389,
"learning_rate": 1.305724896534094e-09,
"loss": 1.3911,
"mean_token_accuracy": 0.7150938749313355,
"num_tokens": 333656222.0,
"step": 5300
},
{
"epoch": 17.337152209492636,
"grad_norm": 12.310988293329496,
"learning_rate": 1.3007172168743852e-09,
"loss": 1.3997,
"mean_token_accuracy": 0.712196159362793,
"num_tokens": 333972297.0,
"step": 5305
},
{
"epoch": 17.353518821603927,
"grad_norm": 13.00195560661611,
"learning_rate": 1.2957157796388463e-09,
"loss": 1.4093,
"mean_token_accuracy": 0.7123797535896301,
"num_tokens": 334288137.0,
"step": 5310
},
{
"epoch": 17.369885433715222,
"grad_norm": 12.53115445959651,
"learning_rate": 1.2907206108606151e-09,
"loss": 1.4021,
"mean_token_accuracy": 0.7121556222438812,
"num_tokens": 334603646.0,
"step": 5315
},
{
"epoch": 17.386252045826513,
"grad_norm": 12.920395716601327,
"learning_rate": 1.2857317365401996e-09,
"loss": 1.4012,
"mean_token_accuracy": 0.7114644229412079,
"num_tokens": 334918136.0,
"step": 5320
},
{
"epoch": 17.402618657937808,
"grad_norm": 12.925708294532836,
"learning_rate": 1.2807491826453455e-09,
"loss": 1.4002,
"mean_token_accuracy": 0.7134134829044342,
"num_tokens": 335233847.0,
"step": 5325
},
{
"epoch": 17.4189852700491,
"grad_norm": 13.075160728738613,
"learning_rate": 1.2757729751108988e-09,
"loss": 1.4126,
"mean_token_accuracy": 0.7115934014320373,
"num_tokens": 335550114.0,
"step": 5330
},
{
"epoch": 17.435351882160393,
"grad_norm": 13.185241533755097,
"learning_rate": 1.2708031398386724e-09,
"loss": 1.3959,
"mean_token_accuracy": 0.7153161346912384,
"num_tokens": 335866911.0,
"step": 5335
},
{
"epoch": 17.451718494271685,
"grad_norm": 12.65442216382764,
"learning_rate": 1.2658397026973112e-09,
"loss": 1.4011,
"mean_token_accuracy": 0.7110715806484222,
"num_tokens": 336184022.0,
"step": 5340
},
{
"epoch": 17.46808510638298,
"grad_norm": 12.753114054132789,
"learning_rate": 1.2608826895221558e-09,
"loss": 1.4102,
"mean_token_accuracy": 0.7115493476390838,
"num_tokens": 336499508.0,
"step": 5345
},
{
"epoch": 17.48445171849427,
"grad_norm": 12.936481191853076,
"learning_rate": 1.2559321261151103e-09,
"loss": 1.3971,
"mean_token_accuracy": 0.7129360318183899,
"num_tokens": 336815137.0,
"step": 5350
},
{
"epoch": 17.500818330605565,
"grad_norm": 12.383260787314933,
"learning_rate": 1.2509880382445062e-09,
"loss": 1.3937,
"mean_token_accuracy": 0.7159365475177765,
"num_tokens": 337130392.0,
"step": 5355
},
{
"epoch": 17.517184942716856,
"grad_norm": 12.692724465470993,
"learning_rate": 1.2460504516449696e-09,
"loss": 1.4046,
"mean_token_accuracy": 0.7109850525856019,
"num_tokens": 337445070.0,
"step": 5360
},
{
"epoch": 17.53355155482815,
"grad_norm": 12.712425609177462,
"learning_rate": 1.2411193920172864e-09,
"loss": 1.4089,
"mean_token_accuracy": 0.7130200266838074,
"num_tokens": 337761551.0,
"step": 5365
},
{
"epoch": 17.54991816693944,
"grad_norm": 12.350995391081879,
"learning_rate": 1.236194885028268e-09,
"loss": 1.3759,
"mean_token_accuracy": 0.7158704221248626,
"num_tokens": 338075491.0,
"step": 5370
},
{
"epoch": 17.566284779050736,
"grad_norm": 12.99858596953847,
"learning_rate": 1.23127695631062e-09,
"loss": 1.3916,
"mean_token_accuracy": 0.7128506302833557,
"num_tokens": 338391201.0,
"step": 5375
},
{
"epoch": 17.58265139116203,
"grad_norm": 12.434309697315385,
"learning_rate": 1.2263656314628056e-09,
"loss": 1.3785,
"mean_token_accuracy": 0.7174550950527191,
"num_tokens": 338707150.0,
"step": 5380
},
{
"epoch": 17.599018003273322,
"grad_norm": 12.836552647774617,
"learning_rate": 1.221460936048915e-09,
"loss": 1.3864,
"mean_token_accuracy": 0.7141911685466766,
"num_tokens": 339022998.0,
"step": 5385
},
{
"epoch": 17.615384615384617,
"grad_norm": 12.571682517325653,
"learning_rate": 1.2165628955985313e-09,
"loss": 1.3889,
"mean_token_accuracy": 0.7143755376338958,
"num_tokens": 339340065.0,
"step": 5390
},
{
"epoch": 17.631751227495908,
"grad_norm": 13.395496985978712,
"learning_rate": 1.2116715356065971e-09,
"loss": 1.4006,
"mean_token_accuracy": 0.7140235543251038,
"num_tokens": 339654668.0,
"step": 5395
},
{
"epoch": 17.648117839607202,
"grad_norm": 12.479045524766219,
"learning_rate": 1.206786881533283e-09,
"loss": 1.3942,
"mean_token_accuracy": 0.7156869411468506,
"num_tokens": 339970606.0,
"step": 5400
},
{
"epoch": 17.664484451718494,
"grad_norm": 12.206575662961889,
"learning_rate": 1.2019089588038538e-09,
"loss": 1.3719,
"mean_token_accuracy": 0.7192688524723053,
"num_tokens": 340286065.0,
"step": 5405
},
{
"epoch": 17.680851063829788,
"grad_norm": 12.73772441105973,
"learning_rate": 1.1970377928085372e-09,
"loss": 1.3989,
"mean_token_accuracy": 0.7134172976016998,
"num_tokens": 340601645.0,
"step": 5410
},
{
"epoch": 17.69721767594108,
"grad_norm": 13.067032223425121,
"learning_rate": 1.1921734089023916e-09,
"loss": 1.416,
"mean_token_accuracy": 0.7102409243583679,
"num_tokens": 340916817.0,
"step": 5415
},
{
"epoch": 17.713584288052374,
"grad_norm": 12.860371801785337,
"learning_rate": 1.1873158324051716e-09,
"loss": 1.3921,
"mean_token_accuracy": 0.7153089880943299,
"num_tokens": 341231812.0,
"step": 5420
},
{
"epoch": 17.729950900163665,
"grad_norm": 12.237459579013946,
"learning_rate": 1.1824650886012012e-09,
"loss": 1.3719,
"mean_token_accuracy": 0.7181912899017334,
"num_tokens": 341546324.0,
"step": 5425
},
{
"epoch": 17.74631751227496,
"grad_norm": 12.851633508224957,
"learning_rate": 1.1776212027392376e-09,
"loss": 1.3814,
"mean_token_accuracy": 0.7169644117355347,
"num_tokens": 341862416.0,
"step": 5430
},
{
"epoch": 17.76268412438625,
"grad_norm": 12.715774815841636,
"learning_rate": 1.1727842000323422e-09,
"loss": 1.3832,
"mean_token_accuracy": 0.7157602131366729,
"num_tokens": 342179110.0,
"step": 5435
},
{
"epoch": 17.779050736497545,
"grad_norm": 13.101118093674382,
"learning_rate": 1.1679541056577482e-09,
"loss": 1.41,
"mean_token_accuracy": 0.7121834099292755,
"num_tokens": 342493037.0,
"step": 5440
},
{
"epoch": 17.795417348608837,
"grad_norm": 12.471346133514018,
"learning_rate": 1.1631309447567306e-09,
"loss": 1.3978,
"mean_token_accuracy": 0.7117744445800781,
"num_tokens": 342808622.0,
"step": 5445
},
{
"epoch": 17.81178396072013,
"grad_norm": 12.930267752585188,
"learning_rate": 1.1583147424344746e-09,
"loss": 1.3777,
"mean_token_accuracy": 0.7164147138595581,
"num_tokens": 343123030.0,
"step": 5450
},
{
"epoch": 17.828150572831422,
"grad_norm": 12.703543993441485,
"learning_rate": 1.153505523759944e-09,
"loss": 1.3916,
"mean_token_accuracy": 0.7136729001998902,
"num_tokens": 343439667.0,
"step": 5455
},
{
"epoch": 17.844517184942717,
"grad_norm": 13.14210044589097,
"learning_rate": 1.1487033137657538e-09,
"loss": 1.406,
"mean_token_accuracy": 0.7114916920661927,
"num_tokens": 343754489.0,
"step": 5460
},
{
"epoch": 17.86088379705401,
"grad_norm": 12.301357268333014,
"learning_rate": 1.1439081374480362e-09,
"loss": 1.3737,
"mean_token_accuracy": 0.7186058342456818,
"num_tokens": 344069924.0,
"step": 5465
},
{
"epoch": 17.877250409165303,
"grad_norm": 13.156843241094293,
"learning_rate": 1.1391200197663132e-09,
"loss": 1.3965,
"mean_token_accuracy": 0.7138831853866577,
"num_tokens": 344384088.0,
"step": 5470
},
{
"epoch": 17.893617021276597,
"grad_norm": 12.692490380092018,
"learning_rate": 1.134338985643366e-09,
"loss": 1.3866,
"mean_token_accuracy": 0.7155437529087066,
"num_tokens": 344701422.0,
"step": 5475
},
{
"epoch": 17.90998363338789,
"grad_norm": 12.715473298247183,
"learning_rate": 1.1295650599651023e-09,
"loss": 1.4021,
"mean_token_accuracy": 0.7133882701396942,
"num_tokens": 345016651.0,
"step": 5480
},
{
"epoch": 17.926350245499183,
"grad_norm": 12.238271624667775,
"learning_rate": 1.1247982675804322e-09,
"loss": 1.4036,
"mean_token_accuracy": 0.7109548151493073,
"num_tokens": 345332748.0,
"step": 5485
},
{
"epoch": 17.942716857610474,
"grad_norm": 13.07904637799471,
"learning_rate": 1.1200386333011356e-09,
"loss": 1.3852,
"mean_token_accuracy": 0.7151263058185577,
"num_tokens": 345649158.0,
"step": 5490
},
{
"epoch": 17.95908346972177,
"grad_norm": 12.589623435885747,
"learning_rate": 1.115286181901733e-09,
"loss": 1.3763,
"mean_token_accuracy": 0.7184123635292053,
"num_tokens": 345963757.0,
"step": 5495
},
{
"epoch": 17.97545008183306,
"grad_norm": 12.90918631521044,
"learning_rate": 1.1105409381193571e-09,
"loss": 1.3848,
"mean_token_accuracy": 0.7155660688877106,
"num_tokens": 346280806.0,
"step": 5500
},
{
"epoch": 17.991816693944354,
"grad_norm": 12.314582020763753,
"learning_rate": 1.105802926653624e-09,
"loss": 1.4122,
"mean_token_accuracy": 0.7101211845874786,
"num_tokens": 346594573.0,
"step": 5505
},
{
"epoch": 18.006546644844516,
"grad_norm": 13.235208784657654,
"learning_rate": 1.101072172166505e-09,
"loss": 1.407,
"mean_token_accuracy": 0.7137935625182258,
"num_tokens": 346854659.0,
"step": 5510
},
{
"epoch": 18.02291325695581,
"grad_norm": 12.763597579363163,
"learning_rate": 1.0963486992821977e-09,
"loss": 1.3887,
"mean_token_accuracy": 0.7150306701660156,
"num_tokens": 347169565.0,
"step": 5515
},
{
"epoch": 18.0392798690671,
"grad_norm": 12.655615875345337,
"learning_rate": 1.091632532586998e-09,
"loss": 1.3813,
"mean_token_accuracy": 0.715842741727829,
"num_tokens": 347484856.0,
"step": 5520
},
{
"epoch": 18.055646481178396,
"grad_norm": 12.64337473403488,
"learning_rate": 1.0869236966291715e-09,
"loss": 1.3727,
"mean_token_accuracy": 0.7158585131168366,
"num_tokens": 347801962.0,
"step": 5525
},
{
"epoch": 18.07201309328969,
"grad_norm": 12.462334312203728,
"learning_rate": 1.0822222159188275e-09,
"loss": 1.3852,
"mean_token_accuracy": 0.7147506773471832,
"num_tokens": 348117074.0,
"step": 5530
},
{
"epoch": 18.088379705400982,
"grad_norm": 12.565042037190233,
"learning_rate": 1.0775281149277897e-09,
"loss": 1.3925,
"mean_token_accuracy": 0.7138961970806121,
"num_tokens": 348433613.0,
"step": 5535
},
{
"epoch": 18.104746317512276,
"grad_norm": 12.646266276743935,
"learning_rate": 1.072841418089469e-09,
"loss": 1.3787,
"mean_token_accuracy": 0.7157909095287323,
"num_tokens": 348749005.0,
"step": 5540
},
{
"epoch": 18.121112929623568,
"grad_norm": 12.38061218796714,
"learning_rate": 1.068162149798737e-09,
"loss": 1.3877,
"mean_token_accuracy": 0.7154739260673523,
"num_tokens": 349064208.0,
"step": 5545
},
{
"epoch": 18.137479541734862,
"grad_norm": 11.968930657167016,
"learning_rate": 1.0634903344117995e-09,
"loss": 1.3649,
"mean_token_accuracy": 0.718022209405899,
"num_tokens": 349379547.0,
"step": 5550
},
{
"epoch": 18.153846153846153,
"grad_norm": 12.780826346237394,
"learning_rate": 1.0588259962460676e-09,
"loss": 1.369,
"mean_token_accuracy": 0.718323028087616,
"num_tokens": 349692928.0,
"step": 5555
},
{
"epoch": 18.170212765957448,
"grad_norm": 12.422856867889942,
"learning_rate": 1.0541691595800337e-09,
"loss": 1.389,
"mean_token_accuracy": 0.713786643743515,
"num_tokens": 350009624.0,
"step": 5560
},
{
"epoch": 18.18657937806874,
"grad_norm": 12.413005292724472,
"learning_rate": 1.049519848653143e-09,
"loss": 1.3808,
"mean_token_accuracy": 0.7161435902118682,
"num_tokens": 350326433.0,
"step": 5565
},
{
"epoch": 18.202945990180034,
"grad_norm": 13.381397742985778,
"learning_rate": 1.0448780876656688e-09,
"loss": 1.389,
"mean_token_accuracy": 0.714937961101532,
"num_tokens": 350641346.0,
"step": 5570
},
{
"epoch": 18.219312602291325,
"grad_norm": 12.49129531641033,
"learning_rate": 1.0402439007785862e-09,
"loss": 1.4007,
"mean_token_accuracy": 0.7126554310321808,
"num_tokens": 350957360.0,
"step": 5575
},
{
"epoch": 18.23567921440262,
"grad_norm": 12.308619890450021,
"learning_rate": 1.0356173121134446e-09,
"loss": 1.3815,
"mean_token_accuracy": 0.7172755897045135,
"num_tokens": 351273459.0,
"step": 5580
},
{
"epoch": 18.25204582651391,
"grad_norm": 13.242074621025358,
"learning_rate": 1.030998345752246e-09,
"loss": 1.3967,
"mean_token_accuracy": 0.7135592460632324,
"num_tokens": 351588136.0,
"step": 5585
},
{
"epoch": 18.268412438625205,
"grad_norm": 11.861574062896562,
"learning_rate": 1.0263870257373162e-09,
"loss": 1.3948,
"mean_token_accuracy": 0.712981390953064,
"num_tokens": 351903224.0,
"step": 5590
},
{
"epoch": 18.284779050736496,
"grad_norm": 12.395041538470807,
"learning_rate": 1.0217833760711792e-09,
"loss": 1.3683,
"mean_token_accuracy": 0.7180960893630981,
"num_tokens": 352218383.0,
"step": 5595
},
{
"epoch": 18.30114566284779,
"grad_norm": 12.894649709921252,
"learning_rate": 1.0171874207164362e-09,
"loss": 1.3978,
"mean_token_accuracy": 0.7124384999275207,
"num_tokens": 352533109.0,
"step": 5600
},
{
"epoch": 18.317512274959082,
"grad_norm": 13.096345184514144,
"learning_rate": 1.0125991835956376e-09,
"loss": 1.3872,
"mean_token_accuracy": 0.7149949491024017,
"num_tokens": 352848562.0,
"step": 5605
},
{
"epoch": 18.333878887070377,
"grad_norm": 11.9497648848681,
"learning_rate": 1.0080186885911588e-09,
"loss": 1.3819,
"mean_token_accuracy": 0.7147423505783081,
"num_tokens": 353164572.0,
"step": 5610
},
{
"epoch": 18.350245499181668,
"grad_norm": 12.339471746049261,
"learning_rate": 1.0034459595450776e-09,
"loss": 1.3802,
"mean_token_accuracy": 0.715108597278595,
"num_tokens": 353480615.0,
"step": 5615
},
{
"epoch": 18.366612111292962,
"grad_norm": 12.481496979460216,
"learning_rate": 9.988810202590481e-10,
"loss": 1.3905,
"mean_token_accuracy": 0.7153669118881225,
"num_tokens": 353794865.0,
"step": 5620
},
{
"epoch": 18.382978723404257,
"grad_norm": 12.518437673992242,
"learning_rate": 9.943238944941782e-10,
"loss": 1.3622,
"mean_token_accuracy": 0.7209295690059662,
"num_tokens": 354109372.0,
"step": 5625
},
{
"epoch": 18.399345335515548,
"grad_norm": 13.186352873301463,
"learning_rate": 9.897746059709054e-10,
"loss": 1.3925,
"mean_token_accuracy": 0.7112901329994201,
"num_tokens": 354424071.0,
"step": 5630
},
{
"epoch": 18.415711947626843,
"grad_norm": 12.906442859579837,
"learning_rate": 9.852331783688722e-10,
"loss": 1.4138,
"mean_token_accuracy": 0.7088511765003205,
"num_tokens": 354738984.0,
"step": 5635
},
{
"epoch": 18.432078559738134,
"grad_norm": 12.987822195411988,
"learning_rate": 9.806996353268057e-10,
"loss": 1.3849,
"mean_token_accuracy": 0.7144550621509552,
"num_tokens": 355054114.0,
"step": 5640
},
{
"epoch": 18.44844517184943,
"grad_norm": 12.674483928796581,
"learning_rate": 9.761740004423927e-10,
"loss": 1.3794,
"mean_token_accuracy": 0.7165055871009827,
"num_tokens": 355370777.0,
"step": 5645
},
{
"epoch": 18.46481178396072,
"grad_norm": 12.215728589409949,
"learning_rate": 9.716562972721544e-10,
"loss": 1.3814,
"mean_token_accuracy": 0.7154396593570709,
"num_tokens": 355687582.0,
"step": 5650
},
{
"epoch": 18.481178396072014,
"grad_norm": 12.38282753815485,
"learning_rate": 9.671465493313292e-10,
"loss": 1.3649,
"mean_token_accuracy": 0.719496488571167,
"num_tokens": 356001954.0,
"step": 5655
},
{
"epoch": 18.497545008183305,
"grad_norm": 13.079847773691125,
"learning_rate": 9.626447800937467e-10,
"loss": 1.3823,
"mean_token_accuracy": 0.7160343945026397,
"num_tokens": 356317376.0,
"step": 5660
},
{
"epoch": 18.5139116202946,
"grad_norm": 12.663364759533373,
"learning_rate": 9.581510129917063e-10,
"loss": 1.3899,
"mean_token_accuracy": 0.7153219997882843,
"num_tokens": 356631136.0,
"step": 5665
},
{
"epoch": 18.53027823240589,
"grad_norm": 12.740117263835618,
"learning_rate": 9.536652714158545e-10,
"loss": 1.3838,
"mean_token_accuracy": 0.7163176774978638,
"num_tokens": 356946372.0,
"step": 5670
},
{
"epoch": 18.546644844517186,
"grad_norm": 12.482008982127597,
"learning_rate": 9.49187578715065e-10,
"loss": 1.3932,
"mean_token_accuracy": 0.7150310695171356,
"num_tokens": 357263208.0,
"step": 5675
},
{
"epoch": 18.563011456628477,
"grad_norm": 12.335105657867352,
"learning_rate": 9.447179581963155e-10,
"loss": 1.3794,
"mean_token_accuracy": 0.7140471935272217,
"num_tokens": 357580121.0,
"step": 5680
},
{
"epoch": 18.57937806873977,
"grad_norm": 13.168790147885918,
"learning_rate": 9.402564331245673e-10,
"loss": 1.375,
"mean_token_accuracy": 0.7164913952350617,
"num_tokens": 357898026.0,
"step": 5685
},
{
"epoch": 18.595744680851062,
"grad_norm": 12.1581165136031,
"learning_rate": 9.358030267226429e-10,
"loss": 1.3558,
"mean_token_accuracy": 0.7206232190132141,
"num_tokens": 358213880.0,
"step": 5690
},
{
"epoch": 18.612111292962357,
"grad_norm": 12.59819387730946,
"learning_rate": 9.313577621711069e-10,
"loss": 1.3658,
"mean_token_accuracy": 0.7186311244964599,
"num_tokens": 358530242.0,
"step": 5695
},
{
"epoch": 18.628477905073648,
"grad_norm": 13.046071159861091,
"learning_rate": 9.269206626081444e-10,
"loss": 1.3939,
"mean_token_accuracy": 0.7141192197799683,
"num_tokens": 358844767.0,
"step": 5700
},
{
"epoch": 18.644844517184943,
"grad_norm": 12.737927621667515,
"learning_rate": 9.224917511294406e-10,
"loss": 1.3672,
"mean_token_accuracy": 0.7181422650814057,
"num_tokens": 359159924.0,
"step": 5705
},
{
"epoch": 18.661211129296234,
"grad_norm": 12.386579552377142,
"learning_rate": 9.180710507880605e-10,
"loss": 1.3655,
"mean_token_accuracy": 0.7202182531356811,
"num_tokens": 359474736.0,
"step": 5710
},
{
"epoch": 18.67757774140753,
"grad_norm": 12.365715715935234,
"learning_rate": 9.136585845943287e-10,
"loss": 1.3644,
"mean_token_accuracy": 0.7191633701324462,
"num_tokens": 359788829.0,
"step": 5715
},
{
"epoch": 18.693944353518823,
"grad_norm": 12.369603146497102,
"learning_rate": 9.092543755157112e-10,
"loss": 1.3658,
"mean_token_accuracy": 0.7192545533180237,
"num_tokens": 360105016.0,
"step": 5720
},
{
"epoch": 18.710310965630114,
"grad_norm": 12.996855685166068,
"learning_rate": 9.048584464766937e-10,
"loss": 1.3864,
"mean_token_accuracy": 0.7158155918121338,
"num_tokens": 360420518.0,
"step": 5725
},
{
"epoch": 18.72667757774141,
"grad_norm": 12.953296229285426,
"learning_rate": 9.004708203586629e-10,
"loss": 1.3861,
"mean_token_accuracy": 0.7145832180976868,
"num_tokens": 360737454.0,
"step": 5730
},
{
"epoch": 18.7430441898527,
"grad_norm": 12.402893762350994,
"learning_rate": 8.960915199997885e-10,
"loss": 1.3823,
"mean_token_accuracy": 0.7143435657024384,
"num_tokens": 361052926.0,
"step": 5735
},
{
"epoch": 18.759410801963995,
"grad_norm": 13.008197404879443,
"learning_rate": 8.917205681949034e-10,
"loss": 1.3759,
"mean_token_accuracy": 0.7186384499073029,
"num_tokens": 361368937.0,
"step": 5740
},
{
"epoch": 18.775777414075286,
"grad_norm": 13.423075286610377,
"learning_rate": 8.873579876953844e-10,
"loss": 1.3835,
"mean_token_accuracy": 0.7161759972572327,
"num_tokens": 361684524.0,
"step": 5745
},
{
"epoch": 18.79214402618658,
"grad_norm": 12.811505010435685,
"learning_rate": 8.830038012090357e-10,
"loss": 1.3862,
"mean_token_accuracy": 0.7145053625106812,
"num_tokens": 362001450.0,
"step": 5750
},
{
"epoch": 18.80851063829787,
"grad_norm": 13.027449136406155,
"learning_rate": 8.78658031399969e-10,
"loss": 1.3694,
"mean_token_accuracy": 0.716279661655426,
"num_tokens": 362317884.0,
"step": 5755
},
{
"epoch": 18.824877250409166,
"grad_norm": 12.360573369881436,
"learning_rate": 8.743207008884865e-10,
"loss": 1.3789,
"mean_token_accuracy": 0.7169335722923279,
"num_tokens": 362632884.0,
"step": 5760
},
{
"epoch": 18.841243862520457,
"grad_norm": 12.645584794759708,
"learning_rate": 8.699918322509609e-10,
"loss": 1.3622,
"mean_token_accuracy": 0.7194086253643036,
"num_tokens": 362949683.0,
"step": 5765
},
{
"epoch": 18.857610474631752,
"grad_norm": 12.975756722712436,
"learning_rate": 8.65671448019722e-10,
"loss": 1.3828,
"mean_token_accuracy": 0.7150522589683532,
"num_tokens": 363265994.0,
"step": 5770
},
{
"epoch": 18.873977086743043,
"grad_norm": 12.723655904397996,
"learning_rate": 8.613595706829366e-10,
"loss": 1.3576,
"mean_token_accuracy": 0.7214667618274688,
"num_tokens": 363580698.0,
"step": 5775
},
{
"epoch": 18.890343698854338,
"grad_norm": 12.685832080615537,
"learning_rate": 8.570562226844914e-10,
"loss": 1.3775,
"mean_token_accuracy": 0.7159756720066071,
"num_tokens": 363897121.0,
"step": 5780
},
{
"epoch": 18.90671031096563,
"grad_norm": 12.602544332252107,
"learning_rate": 8.527614264238773e-10,
"loss": 1.378,
"mean_token_accuracy": 0.7161263406276703,
"num_tokens": 364213776.0,
"step": 5785
},
{
"epoch": 18.923076923076923,
"grad_norm": 11.854014351161501,
"learning_rate": 8.48475204256072e-10,
"loss": 1.3496,
"mean_token_accuracy": 0.722278642654419,
"num_tokens": 364531018.0,
"step": 5790
},
{
"epoch": 18.939443535188214,
"grad_norm": 13.056726278143165,
"learning_rate": 8.441975784914241e-10,
"loss": 1.3706,
"mean_token_accuracy": 0.7184775650501252,
"num_tokens": 364846166.0,
"step": 5795
},
{
"epoch": 18.95581014729951,
"grad_norm": 12.693598074284184,
"learning_rate": 8.399285713955366e-10,
"loss": 1.3778,
"mean_token_accuracy": 0.7162315368652343,
"num_tokens": 365161528.0,
"step": 5800
},
{
"epoch": 18.9721767594108,
"grad_norm": 12.712806515492721,
"learning_rate": 8.356682051891512e-10,
"loss": 1.3732,
"mean_token_accuracy": 0.717895120382309,
"num_tokens": 365477749.0,
"step": 5805
},
{
"epoch": 18.988543371522095,
"grad_norm": 12.61013695062063,
"learning_rate": 8.31416502048033e-10,
"loss": 1.3729,
"mean_token_accuracy": 0.7183041274547577,
"num_tokens": 365794318.0,
"step": 5810
},
{
"epoch": 19.00327332242226,
"grad_norm": 12.846197841429813,
"learning_rate": 8.271734841028553e-10,
"loss": 1.3871,
"mean_token_accuracy": 0.7166521615452237,
"num_tokens": 366054135.0,
"step": 5815
},
{
"epoch": 19.01963993453355,
"grad_norm": 12.781031512055613,
"learning_rate": 8.229391734390809e-10,
"loss": 1.378,
"mean_token_accuracy": 0.7174058675765991,
"num_tokens": 366369329.0,
"step": 5820
},
{
"epoch": 19.036006546644845,
"grad_norm": 12.817751563572923,
"learning_rate": 8.187135920968536e-10,
"loss": 1.357,
"mean_token_accuracy": 0.7178467869758606,
"num_tokens": 366684911.0,
"step": 5825
},
{
"epoch": 19.052373158756136,
"grad_norm": 12.361476407235411,
"learning_rate": 8.14496762070878e-10,
"loss": 1.3882,
"mean_token_accuracy": 0.7144702851772309,
"num_tokens": 367001229.0,
"step": 5830
},
{
"epoch": 19.06873977086743,
"grad_norm": 13.611363516655436,
"learning_rate": 8.102887053103075e-10,
"loss": 1.388,
"mean_token_accuracy": 0.7142431914806366,
"num_tokens": 367316510.0,
"step": 5835
},
{
"epoch": 19.085106382978722,
"grad_norm": 12.616111993185598,
"learning_rate": 8.060894437186295e-10,
"loss": 1.3735,
"mean_token_accuracy": 0.7148357987403869,
"num_tokens": 367631892.0,
"step": 5840
},
{
"epoch": 19.101472995090017,
"grad_norm": 12.774290787727974,
"learning_rate": 8.018989991535513e-10,
"loss": 1.3843,
"mean_token_accuracy": 0.7155320584774018,
"num_tokens": 367949305.0,
"step": 5845
},
{
"epoch": 19.117839607201308,
"grad_norm": 12.874638084090844,
"learning_rate": 7.977173934268864e-10,
"loss": 1.3674,
"mean_token_accuracy": 0.7179316282272339,
"num_tokens": 368264037.0,
"step": 5850
},
{
"epoch": 19.134206219312603,
"grad_norm": 12.84731693292664,
"learning_rate": 7.935446483044412e-10,
"loss": 1.3841,
"mean_token_accuracy": 0.7166074812412262,
"num_tokens": 368579670.0,
"step": 5855
},
{
"epoch": 19.150572831423894,
"grad_norm": 12.958459519702389,
"learning_rate": 7.89380785505901e-10,
"loss": 1.3699,
"mean_token_accuracy": 0.7167773723602295,
"num_tokens": 368896056.0,
"step": 5860
},
{
"epoch": 19.16693944353519,
"grad_norm": 12.650749076343129,
"learning_rate": 7.852258267047177e-10,
"loss": 1.3757,
"mean_token_accuracy": 0.7171500504016877,
"num_tokens": 369210471.0,
"step": 5865
},
{
"epoch": 19.183306055646483,
"grad_norm": 12.283216913121649,
"learning_rate": 7.810797935279973e-10,
"loss": 1.3608,
"mean_token_accuracy": 0.7200214743614197,
"num_tokens": 369527272.0,
"step": 5870
},
{
"epoch": 19.199672667757774,
"grad_norm": 12.842889599405346,
"learning_rate": 7.769427075563856e-10,
"loss": 1.3817,
"mean_token_accuracy": 0.7152867078781128,
"num_tokens": 369841125.0,
"step": 5875
},
{
"epoch": 19.21603927986907,
"grad_norm": 12.480638763825803,
"learning_rate": 7.728145903239584e-10,
"loss": 1.3648,
"mean_token_accuracy": 0.718204790353775,
"num_tokens": 370157321.0,
"step": 5880
},
{
"epoch": 19.23240589198036,
"grad_norm": 12.117212343748356,
"learning_rate": 7.686954633181065e-10,
"loss": 1.3505,
"mean_token_accuracy": 0.7220339953899384,
"num_tokens": 370472593.0,
"step": 5885
},
{
"epoch": 19.248772504091654,
"grad_norm": 13.132183687091533,
"learning_rate": 7.645853479794265e-10,
"loss": 1.3708,
"mean_token_accuracy": 0.7181020259857178,
"num_tokens": 370788104.0,
"step": 5890
},
{
"epoch": 19.265139116202946,
"grad_norm": 13.285147654938097,
"learning_rate": 7.604842657016078e-10,
"loss": 1.3554,
"mean_token_accuracy": 0.7207320153713226,
"num_tokens": 371105424.0,
"step": 5895
},
{
"epoch": 19.28150572831424,
"grad_norm": 12.941522431737862,
"learning_rate": 7.563922378313218e-10,
"loss": 1.3829,
"mean_token_accuracy": 0.7163071990013122,
"num_tokens": 371420806.0,
"step": 5900
},
{
"epoch": 19.29787234042553,
"grad_norm": 12.66823555573856,
"learning_rate": 7.523092856681099e-10,
"loss": 1.3719,
"mean_token_accuracy": 0.7173808157444,
"num_tokens": 371737127.0,
"step": 5905
},
{
"epoch": 19.314238952536826,
"grad_norm": 12.468782774971105,
"learning_rate": 7.482354304642735e-10,
"loss": 1.3572,
"mean_token_accuracy": 0.7218125700950623,
"num_tokens": 372053743.0,
"step": 5910
},
{
"epoch": 19.330605564648117,
"grad_norm": 12.673430481906253,
"learning_rate": 7.441706934247633e-10,
"loss": 1.3714,
"mean_token_accuracy": 0.7178383052349091,
"num_tokens": 372368942.0,
"step": 5915
},
{
"epoch": 19.34697217675941,
"grad_norm": 12.64582960304754,
"learning_rate": 7.401150957070687e-10,
"loss": 1.3615,
"mean_token_accuracy": 0.7213193953037262,
"num_tokens": 372684501.0,
"step": 5920
},
{
"epoch": 19.363338788870703,
"grad_norm": 12.881251697477449,
"learning_rate": 7.360686584211079e-10,
"loss": 1.3582,
"mean_token_accuracy": 0.7224127888679505,
"num_tokens": 373000557.0,
"step": 5925
},
{
"epoch": 19.379705400981997,
"grad_norm": 13.016655812957188,
"learning_rate": 7.320314026291183e-10,
"loss": 1.373,
"mean_token_accuracy": 0.7166104733943939,
"num_tokens": 373314587.0,
"step": 5930
},
{
"epoch": 19.39607201309329,
"grad_norm": 12.703007701115933,
"learning_rate": 7.28003349345544e-10,
"loss": 1.3467,
"mean_token_accuracy": 0.7213922083377838,
"num_tokens": 373629407.0,
"step": 5935
},
{
"epoch": 19.412438625204583,
"grad_norm": 13.11920416031955,
"learning_rate": 7.239845195369319e-10,
"loss": 1.3712,
"mean_token_accuracy": 0.7184795260429382,
"num_tokens": 373945797.0,
"step": 5940
},
{
"epoch": 19.428805237315874,
"grad_norm": 12.745812531059482,
"learning_rate": 7.199749341218176e-10,
"loss": 1.3669,
"mean_token_accuracy": 0.7184098780155181,
"num_tokens": 374261376.0,
"step": 5945
},
{
"epoch": 19.44517184942717,
"grad_norm": 12.825356488169058,
"learning_rate": 7.159746139706194e-10,
"loss": 1.3745,
"mean_token_accuracy": 0.7177829325199128,
"num_tokens": 374577040.0,
"step": 5950
},
{
"epoch": 19.46153846153846,
"grad_norm": 13.67165270930649,
"learning_rate": 7.119835799055285e-10,
"loss": 1.382,
"mean_token_accuracy": 0.7166715204715729,
"num_tokens": 374892491.0,
"step": 5955
},
{
"epoch": 19.477905073649755,
"grad_norm": 12.360439867627685,
"learning_rate": 7.080018527004001e-10,
"loss": 1.3682,
"mean_token_accuracy": 0.7182742238044739,
"num_tokens": 375207737.0,
"step": 5960
},
{
"epoch": 19.49427168576105,
"grad_norm": 13.031003624035211,
"learning_rate": 7.040294530806468e-10,
"loss": 1.3874,
"mean_token_accuracy": 0.7151427447795868,
"num_tokens": 375523822.0,
"step": 5965
},
{
"epoch": 19.51063829787234,
"grad_norm": 13.00731945578167,
"learning_rate": 7.000664017231297e-10,
"loss": 1.3652,
"mean_token_accuracy": 0.7197602808475494,
"num_tokens": 375839994.0,
"step": 5970
},
{
"epoch": 19.527004909983635,
"grad_norm": 12.556107146083836,
"learning_rate": 6.961127192560509e-10,
"loss": 1.3689,
"mean_token_accuracy": 0.7184596836566925,
"num_tokens": 376155305.0,
"step": 5975
},
{
"epoch": 19.543371522094926,
"grad_norm": 12.4673726717845,
"learning_rate": 6.92168426258846e-10,
"loss": 1.3711,
"mean_token_accuracy": 0.7177050232887268,
"num_tokens": 376470754.0,
"step": 5980
},
{
"epoch": 19.55973813420622,
"grad_norm": 13.13075618583375,
"learning_rate": 6.882335432620779e-10,
"loss": 1.3629,
"mean_token_accuracy": 0.7193731069564819,
"num_tokens": 376786620.0,
"step": 5985
},
{
"epoch": 19.57610474631751,
"grad_norm": 12.170960660813558,
"learning_rate": 6.843080907473276e-10,
"loss": 1.3608,
"mean_token_accuracy": 0.7193795144557953,
"num_tokens": 377102222.0,
"step": 5990
},
{
"epoch": 19.592471358428806,
"grad_norm": 12.309205288205849,
"learning_rate": 6.803920891470905e-10,
"loss": 1.354,
"mean_token_accuracy": 0.7204973518848419,
"num_tokens": 377417349.0,
"step": 5995
},
{
"epoch": 19.608837970540097,
"grad_norm": 13.031618542541247,
"learning_rate": 6.764855588446689e-10,
"loss": 1.3725,
"mean_token_accuracy": 0.7182239472866059,
"num_tokens": 377732207.0,
"step": 6000
},
{
"epoch": 19.625204582651392,
"grad_norm": 12.540057625658143,
"learning_rate": 6.725885201740653e-10,
"loss": 1.36,
"mean_token_accuracy": 0.7203054070472718,
"num_tokens": 378046720.0,
"step": 6005
},
{
"epoch": 19.641571194762683,
"grad_norm": 12.352507333255614,
"learning_rate": 6.687009934198771e-10,
"loss": 1.3408,
"mean_token_accuracy": 0.7238680064678192,
"num_tokens": 378361684.0,
"step": 6010
},
{
"epoch": 19.657937806873978,
"grad_norm": 12.420982668782106,
"learning_rate": 6.648229988171906e-10,
"loss": 1.3639,
"mean_token_accuracy": 0.7186601638793946,
"num_tokens": 378677558.0,
"step": 6015
},
{
"epoch": 19.67430441898527,
"grad_norm": 12.720448227814035,
"learning_rate": 6.609545565514766e-10,
"loss": 1.3821,
"mean_token_accuracy": 0.7174455404281617,
"num_tokens": 378992818.0,
"step": 6020
},
{
"epoch": 19.690671031096564,
"grad_norm": 12.501448331802214,
"learning_rate": 6.570956867584843e-10,
"loss": 1.3572,
"mean_token_accuracy": 0.7190974712371826,
"num_tokens": 379309437.0,
"step": 6025
},
{
"epoch": 19.707037643207855,
"grad_norm": 12.147760849737383,
"learning_rate": 6.532464095241372e-10,
"loss": 1.3423,
"mean_token_accuracy": 0.7238264679908752,
"num_tokens": 379626036.0,
"step": 6030
},
{
"epoch": 19.72340425531915,
"grad_norm": 12.720576028448043,
"learning_rate": 6.494067448844279e-10,
"loss": 1.3574,
"mean_token_accuracy": 0.7195711672306061,
"num_tokens": 379940886.0,
"step": 6035
},
{
"epoch": 19.73977086743044,
"grad_norm": 13.135370961706531,
"learning_rate": 6.455767128253149e-10,
"loss": 1.3735,
"mean_token_accuracy": 0.7186027526855469,
"num_tokens": 380257522.0,
"step": 6040
},
{
"epoch": 19.756137479541735,
"grad_norm": 12.672254675360048,
"learning_rate": 6.417563332826165e-10,
"loss": 1.3791,
"mean_token_accuracy": 0.7168498575687409,
"num_tokens": 380572688.0,
"step": 6045
},
{
"epoch": 19.772504091653026,
"grad_norm": 13.156088402541622,
"learning_rate": 6.3794562614191e-10,
"loss": 1.3739,
"mean_token_accuracy": 0.7170314073562623,
"num_tokens": 380889136.0,
"step": 6050
},
{
"epoch": 19.78887070376432,
"grad_norm": 13.239645957722919,
"learning_rate": 6.341446112384259e-10,
"loss": 1.3756,
"mean_token_accuracy": 0.7166466414928436,
"num_tokens": 381203100.0,
"step": 6055
},
{
"epoch": 19.805237315875615,
"grad_norm": 12.621465653927025,
"learning_rate": 6.303533083569448e-10,
"loss": 1.3809,
"mean_token_accuracy": 0.7170312762260437,
"num_tokens": 381518716.0,
"step": 6060
},
{
"epoch": 19.821603927986907,
"grad_norm": 12.684645310238352,
"learning_rate": 6.265717372316957e-10,
"loss": 1.3801,
"mean_token_accuracy": 0.7160680174827576,
"num_tokens": 381834327.0,
"step": 6065
},
{
"epoch": 19.8379705400982,
"grad_norm": 13.310217231405765,
"learning_rate": 6.227999175462521e-10,
"loss": 1.3618,
"mean_token_accuracy": 0.7182558119297028,
"num_tokens": 382151144.0,
"step": 6070
},
{
"epoch": 19.854337152209492,
"grad_norm": 12.592065764147137,
"learning_rate": 6.1903786893343e-10,
"loss": 1.3767,
"mean_token_accuracy": 0.7171549320220947,
"num_tokens": 382467394.0,
"step": 6075
},
{
"epoch": 19.870703764320787,
"grad_norm": 13.131250487547925,
"learning_rate": 6.152856109751861e-10,
"loss": 1.3784,
"mean_token_accuracy": 0.7171645045280457,
"num_tokens": 382783474.0,
"step": 6080
},
{
"epoch": 19.887070376432078,
"grad_norm": 12.201328973734851,
"learning_rate": 6.115431632025153e-10,
"loss": 1.3511,
"mean_token_accuracy": 0.7223453462123871,
"num_tokens": 383098936.0,
"step": 6085
},
{
"epoch": 19.903436988543373,
"grad_norm": 13.796231255641032,
"learning_rate": 6.078105450953488e-10,
"loss": 1.3668,
"mean_token_accuracy": 0.7188094437122345,
"num_tokens": 383414381.0,
"step": 6090
},
{
"epoch": 19.919803600654664,
"grad_norm": 11.806977173308766,
"learning_rate": 6.040877760824535e-10,
"loss": 1.348,
"mean_token_accuracy": 0.721502012014389,
"num_tokens": 383729234.0,
"step": 6095
},
{
"epoch": 19.93617021276596,
"grad_norm": 12.735720188720164,
"learning_rate": 6.003748755413311e-10,
"loss": 1.3473,
"mean_token_accuracy": 0.7218765377998352,
"num_tokens": 384045885.0,
"step": 6100
},
{
"epoch": 19.95253682487725,
"grad_norm": 12.838838181022378,
"learning_rate": 5.966718627981141e-10,
"loss": 1.3493,
"mean_token_accuracy": 0.7213131487369537,
"num_tokens": 384360353.0,
"step": 6105
},
{
"epoch": 19.968903436988544,
"grad_norm": 12.407763840948075,
"learning_rate": 5.929787571274706e-10,
"loss": 1.3594,
"mean_token_accuracy": 0.7206071019172668,
"num_tokens": 384675717.0,
"step": 6110
},
{
"epoch": 19.985270049099835,
"grad_norm": 12.94758071340867,
"learning_rate": 5.892955777524997e-10,
"loss": 1.3678,
"mean_token_accuracy": 0.7174268543720246,
"num_tokens": 384993655.0,
"step": 6115
},
{
"epoch": 20.0,
"grad_norm": 12.510272040402462,
"learning_rate": 5.856223438446331e-10,
"loss": 1.349,
"mean_token_accuracy": 0.720099237230089,
"num_tokens": 385254493.0,
"step": 6120
},
{
"epoch": 20.016366612111295,
"grad_norm": 12.408361502858067,
"learning_rate": 5.819590745235353e-10,
"loss": 1.3581,
"mean_token_accuracy": 0.7212778329849243,
"num_tokens": 385569935.0,
"step": 6125
},
{
"epoch": 20.032733224222586,
"grad_norm": 12.89064406341307,
"learning_rate": 5.783057888570034e-10,
"loss": 1.3627,
"mean_token_accuracy": 0.7189658522605896,
"num_tokens": 385886048.0,
"step": 6130
},
{
"epoch": 20.04909983633388,
"grad_norm": 12.656855994493375,
"learning_rate": 5.746625058608681e-10,
"loss": 1.3511,
"mean_token_accuracy": 0.7225908994674682,
"num_tokens": 386202625.0,
"step": 6135
},
{
"epoch": 20.06546644844517,
"grad_norm": 12.59303551321297,
"learning_rate": 5.710292444988957e-10,
"loss": 1.3544,
"mean_token_accuracy": 0.7210854589939117,
"num_tokens": 386516480.0,
"step": 6140
},
{
"epoch": 20.081833060556466,
"grad_norm": 13.373786710360742,
"learning_rate": 5.674060236826881e-10,
"loss": 1.3666,
"mean_token_accuracy": 0.7191102504730225,
"num_tokens": 386831825.0,
"step": 6145
},
{
"epoch": 20.098199672667757,
"grad_norm": 13.039962709930045,
"learning_rate": 5.637928622715844e-10,
"loss": 1.3633,
"mean_token_accuracy": 0.71987065076828,
"num_tokens": 387147364.0,
"step": 6150
},
{
"epoch": 20.114566284779052,
"grad_norm": 12.87146333006844,
"learning_rate": 5.601897790725643e-10,
"loss": 1.3736,
"mean_token_accuracy": 0.7173676788806915,
"num_tokens": 387461434.0,
"step": 6155
},
{
"epoch": 20.130932896890343,
"grad_norm": 12.769528987719692,
"learning_rate": 5.565967928401475e-10,
"loss": 1.3827,
"mean_token_accuracy": 0.7164701819419861,
"num_tokens": 387776913.0,
"step": 6160
},
{
"epoch": 20.147299509001638,
"grad_norm": 13.104881890764489,
"learning_rate": 5.530139222762986e-10,
"loss": 1.3562,
"mean_token_accuracy": 0.7204134702682495,
"num_tokens": 388094220.0,
"step": 6165
},
{
"epoch": 20.16366612111293,
"grad_norm": 13.026891970002072,
"learning_rate": 5.494411860303295e-10,
"loss": 1.3525,
"mean_token_accuracy": 0.7217986106872558,
"num_tokens": 388409234.0,
"step": 6170
},
{
"epoch": 20.180032733224223,
"grad_norm": 12.217206361830561,
"learning_rate": 5.458786026988006e-10,
"loss": 1.3779,
"mean_token_accuracy": 0.7165840566158295,
"num_tokens": 388725714.0,
"step": 6175
},
{
"epoch": 20.196399345335514,
"grad_norm": 13.636015132017496,
"learning_rate": 5.423261908254251e-10,
"loss": 1.3791,
"mean_token_accuracy": 0.716965913772583,
"num_tokens": 389040838.0,
"step": 6180
},
{
"epoch": 20.21276595744681,
"grad_norm": 13.055484726435909,
"learning_rate": 5.38783968900973e-10,
"loss": 1.3529,
"mean_token_accuracy": 0.7214811325073243,
"num_tokens": 389358099.0,
"step": 6185
},
{
"epoch": 20.2291325695581,
"grad_norm": 12.80499515972432,
"learning_rate": 5.352519553631738e-10,
"loss": 1.3586,
"mean_token_accuracy": 0.7204404175281525,
"num_tokens": 389674079.0,
"step": 6190
},
{
"epoch": 20.245499181669395,
"grad_norm": 12.579179331917139,
"learning_rate": 5.317301685966214e-10,
"loss": 1.355,
"mean_token_accuracy": 0.7205852210521698,
"num_tokens": 389990677.0,
"step": 6195
},
{
"epoch": 20.261865793780686,
"grad_norm": 12.410957372710085,
"learning_rate": 5.282186269326778e-10,
"loss": 1.3492,
"mean_token_accuracy": 0.7207090735435486,
"num_tokens": 390306224.0,
"step": 6200
},
{
"epoch": 20.27823240589198,
"grad_norm": 13.79551759700545,
"learning_rate": 5.247173486493775e-10,
"loss": 1.377,
"mean_token_accuracy": 0.717050963640213,
"num_tokens": 390621762.0,
"step": 6205
},
{
"epoch": 20.29459901800327,
"grad_norm": 12.442990671978828,
"learning_rate": 5.212263519713337e-10,
"loss": 1.3427,
"mean_token_accuracy": 0.7230148434638977,
"num_tokens": 390937426.0,
"step": 6210
},
{
"epoch": 20.310965630114566,
"grad_norm": 12.517953307134098,
"learning_rate": 5.177456550696413e-10,
"loss": 1.3487,
"mean_token_accuracy": 0.7233328282833099,
"num_tokens": 391252916.0,
"step": 6215
},
{
"epoch": 20.32733224222586,
"grad_norm": 13.092747881609323,
"learning_rate": 5.14275276061785e-10,
"loss": 1.3579,
"mean_token_accuracy": 0.7207543611526489,
"num_tokens": 391567998.0,
"step": 6220
},
{
"epoch": 20.343698854337152,
"grad_norm": 12.726469014852299,
"learning_rate": 5.108152330115417e-10,
"loss": 1.3628,
"mean_token_accuracy": 0.7196714520454407,
"num_tokens": 391885189.0,
"step": 6225
},
{
"epoch": 20.360065466448447,
"grad_norm": 13.334299082905925,
"learning_rate": 5.073655439288902e-10,
"loss": 1.3714,
"mean_token_accuracy": 0.7176052629947662,
"num_tokens": 392200295.0,
"step": 6230
},
{
"epoch": 20.376432078559738,
"grad_norm": 12.91998412060829,
"learning_rate": 5.039262267699141e-10,
"loss": 1.3658,
"mean_token_accuracy": 0.7184491395950318,
"num_tokens": 392516438.0,
"step": 6235
},
{
"epoch": 20.392798690671032,
"grad_norm": 12.970503138080948,
"learning_rate": 5.004972994367102e-10,
"loss": 1.3655,
"mean_token_accuracy": 0.7194463074207306,
"num_tokens": 392830717.0,
"step": 6240
},
{
"epoch": 20.409165302782323,
"grad_norm": 12.644866646541756,
"learning_rate": 4.970787797772949e-10,
"loss": 1.3755,
"mean_token_accuracy": 0.7172148823738098,
"num_tokens": 393146212.0,
"step": 6245
},
{
"epoch": 20.425531914893618,
"grad_norm": 13.62925757634868,
"learning_rate": 4.936706855855119e-10,
"loss": 1.3698,
"mean_token_accuracy": 0.7158120036125183,
"num_tokens": 393462795.0,
"step": 6250
},
{
"epoch": 20.44189852700491,
"grad_norm": 13.441202382467907,
"learning_rate": 4.902730346009382e-10,
"loss": 1.3592,
"mean_token_accuracy": 0.7201913297176361,
"num_tokens": 393779211.0,
"step": 6255
},
{
"epoch": 20.458265139116204,
"grad_norm": 12.66638053909492,
"learning_rate": 4.868858445087923e-10,
"loss": 1.3327,
"mean_token_accuracy": 0.7251097619533539,
"num_tokens": 394094924.0,
"step": 6260
},
{
"epoch": 20.474631751227495,
"grad_norm": 12.748745055757105,
"learning_rate": 4.835091329398436e-10,
"loss": 1.3632,
"mean_token_accuracy": 0.7194243013858795,
"num_tokens": 394410468.0,
"step": 6265
},
{
"epoch": 20.49099836333879,
"grad_norm": 13.324491028604893,
"learning_rate": 4.801429174703187e-10,
"loss": 1.3647,
"mean_token_accuracy": 0.7182667195796967,
"num_tokens": 394724947.0,
"step": 6270
},
{
"epoch": 20.50736497545008,
"grad_norm": 12.473896879426514,
"learning_rate": 4.767872156218097e-10,
"loss": 1.3349,
"mean_token_accuracy": 0.7238370895385742,
"num_tokens": 395040911.0,
"step": 6275
},
{
"epoch": 20.523731587561375,
"grad_norm": 12.454795272312785,
"learning_rate": 4.734420448611851e-10,
"loss": 1.3587,
"mean_token_accuracy": 0.7194471001625061,
"num_tokens": 395356010.0,
"step": 6280
},
{
"epoch": 20.540098199672666,
"grad_norm": 13.386471916617674,
"learning_rate": 4.701074226004978e-10,
"loss": 1.3493,
"mean_token_accuracy": 0.7223416805267334,
"num_tokens": 395672392.0,
"step": 6285
},
{
"epoch": 20.55646481178396,
"grad_norm": 12.922461612723442,
"learning_rate": 4.66783366196894e-10,
"loss": 1.3626,
"mean_token_accuracy": 0.7206296324729919,
"num_tokens": 395987449.0,
"step": 6290
},
{
"epoch": 20.572831423895252,
"grad_norm": 13.186551545631183,
"learning_rate": 4.6346989295252274e-10,
"loss": 1.3761,
"mean_token_accuracy": 0.7167114853858948,
"num_tokens": 396302511.0,
"step": 6295
},
{
"epoch": 20.589198036006547,
"grad_norm": 12.453339257872456,
"learning_rate": 4.601670201144473e-10,
"loss": 1.3703,
"mean_token_accuracy": 0.7186362683773041,
"num_tokens": 396618897.0,
"step": 6300
},
{
"epoch": 20.60556464811784,
"grad_norm": 12.578505216956795,
"learning_rate": 4.568747648745539e-10,
"loss": 1.3424,
"mean_token_accuracy": 0.7220037519931793,
"num_tokens": 396932845.0,
"step": 6305
},
{
"epoch": 20.621931260229132,
"grad_norm": 13.14266322328383,
"learning_rate": 4.535931443694627e-10,
"loss": 1.3597,
"mean_token_accuracy": 0.7195671617984771,
"num_tokens": 397249593.0,
"step": 6310
},
{
"epoch": 20.638297872340427,
"grad_norm": 13.155868390509191,
"learning_rate": 4.5032217568043874e-10,
"loss": 1.3528,
"mean_token_accuracy": 0.72027388215065,
"num_tokens": 397566272.0,
"step": 6315
},
{
"epoch": 20.654664484451718,
"grad_norm": 12.650985827442252,
"learning_rate": 4.470618758333031e-10,
"loss": 1.3709,
"mean_token_accuracy": 0.7184613645076752,
"num_tokens": 397881061.0,
"step": 6320
},
{
"epoch": 20.671031096563013,
"grad_norm": 12.834968827273396,
"learning_rate": 4.4381226179834424e-10,
"loss": 1.3596,
"mean_token_accuracy": 0.7198035597801209,
"num_tokens": 398196496.0,
"step": 6325
},
{
"epoch": 20.687397708674304,
"grad_norm": 12.472663814811538,
"learning_rate": 4.405733504902298e-10,
"loss": 1.3241,
"mean_token_accuracy": 0.7258457541465759,
"num_tokens": 398513160.0,
"step": 6330
},
{
"epoch": 20.7037643207856,
"grad_norm": 12.03529616087969,
"learning_rate": 4.3734515876791695e-10,
"loss": 1.3543,
"mean_token_accuracy": 0.720082575082779,
"num_tokens": 398828925.0,
"step": 6335
},
{
"epoch": 20.72013093289689,
"grad_norm": 13.045430517890043,
"learning_rate": 4.3412770343456725e-10,
"loss": 1.3688,
"mean_token_accuracy": 0.7178512036800384,
"num_tokens": 399143548.0,
"step": 6340
},
{
"epoch": 20.736497545008184,
"grad_norm": 13.023269314604761,
"learning_rate": 4.3092100123745786e-10,
"loss": 1.3537,
"mean_token_accuracy": 0.7202800393104554,
"num_tokens": 399457769.0,
"step": 6345
},
{
"epoch": 20.752864157119475,
"grad_norm": 13.120357401735287,
"learning_rate": 4.2772506886789434e-10,
"loss": 1.3537,
"mean_token_accuracy": 0.7197918653488159,
"num_tokens": 399772688.0,
"step": 6350
},
{
"epoch": 20.76923076923077,
"grad_norm": 12.497378191320612,
"learning_rate": 4.245399229611238e-10,
"loss": 1.3514,
"mean_token_accuracy": 0.7215233445167542,
"num_tokens": 400090305.0,
"step": 6355
},
{
"epoch": 20.78559738134206,
"grad_norm": 13.224221419816752,
"learning_rate": 4.213655800962482e-10,
"loss": 1.3766,
"mean_token_accuracy": 0.7170398950576782,
"num_tokens": 400406189.0,
"step": 6360
},
{
"epoch": 20.801963993453356,
"grad_norm": 13.125172702519224,
"learning_rate": 4.1820205679613866e-10,
"loss": 1.3437,
"mean_token_accuracy": 0.722397255897522,
"num_tokens": 400722467.0,
"step": 6365
},
{
"epoch": 20.818330605564647,
"grad_norm": 13.20586057784677,
"learning_rate": 4.1504936952734855e-10,
"loss": 1.35,
"mean_token_accuracy": 0.7202737271785736,
"num_tokens": 401037657.0,
"step": 6370
},
{
"epoch": 20.83469721767594,
"grad_norm": 12.761874738404625,
"learning_rate": 4.119075347000292e-10,
"loss": 1.3612,
"mean_token_accuracy": 0.7196288645267487,
"num_tokens": 401353210.0,
"step": 6375
},
{
"epoch": 20.851063829787233,
"grad_norm": 13.407483508696666,
"learning_rate": 4.087765686678424e-10,
"loss": 1.3623,
"mean_token_accuracy": 0.7191239655017853,
"num_tokens": 401668048.0,
"step": 6380
},
{
"epoch": 20.867430441898527,
"grad_norm": 13.144433390958858,
"learning_rate": 4.0565648772787703e-10,
"loss": 1.362,
"mean_token_accuracy": 0.7194087266921997,
"num_tokens": 401984410.0,
"step": 6385
},
{
"epoch": 20.88379705400982,
"grad_norm": 13.27548156246285,
"learning_rate": 4.0254730812056384e-10,
"loss": 1.3469,
"mean_token_accuracy": 0.7210758686065674,
"num_tokens": 402300070.0,
"step": 6390
},
{
"epoch": 20.900163666121113,
"grad_norm": 13.149919970213444,
"learning_rate": 3.9944904602958994e-10,
"loss": 1.3596,
"mean_token_accuracy": 0.7191324174404145,
"num_tokens": 402614769.0,
"step": 6395
},
{
"epoch": 20.916530278232408,
"grad_norm": 12.361842765694908,
"learning_rate": 3.9636171758181655e-10,
"loss": 1.3608,
"mean_token_accuracy": 0.7194510757923126,
"num_tokens": 402929102.0,
"step": 6400
},
{
"epoch": 20.9328968903437,
"grad_norm": 12.48137044626653,
"learning_rate": 3.9328533884719267e-10,
"loss": 1.3334,
"mean_token_accuracy": 0.7238231897354126,
"num_tokens": 403245266.0,
"step": 6405
},
{
"epoch": 20.949263502454993,
"grad_norm": 13.341785056031886,
"learning_rate": 3.902199258386732e-10,
"loss": 1.3811,
"mean_token_accuracy": 0.7154872000217438,
"num_tokens": 403561358.0,
"step": 6410
},
{
"epoch": 20.965630114566284,
"grad_norm": 11.86055012503018,
"learning_rate": 3.8716549451213473e-10,
"loss": 1.3537,
"mean_token_accuracy": 0.719732791185379,
"num_tokens": 403877016.0,
"step": 6415
},
{
"epoch": 20.98199672667758,
"grad_norm": 12.66194804811954,
"learning_rate": 3.841220607662932e-10,
"loss": 1.345,
"mean_token_accuracy": 0.7209204435348511,
"num_tokens": 404193031.0,
"step": 6420
},
{
"epoch": 20.99836333878887,
"grad_norm": 13.057343518581341,
"learning_rate": 3.8108964044262034e-10,
"loss": 1.3601,
"mean_token_accuracy": 0.718568354845047,
"num_tokens": 404509316.0,
"step": 6425
},
{
"epoch": 21.013093289689035,
"grad_norm": 12.848348091317341,
"learning_rate": 3.780682493252613e-10,
"loss": 1.3604,
"mean_token_accuracy": 0.7186027036772834,
"num_tokens": 404770618.0,
"step": 6430
},
{
"epoch": 21.029459901800326,
"grad_norm": 12.514267821398358,
"learning_rate": 3.7505790314095347e-10,
"loss": 1.3397,
"mean_token_accuracy": 0.722569715976715,
"num_tokens": 405087475.0,
"step": 6435
},
{
"epoch": 21.04582651391162,
"grad_norm": 12.552588439453062,
"learning_rate": 3.720586175589438e-10,
"loss": 1.349,
"mean_token_accuracy": 0.7212419390678406,
"num_tokens": 405402509.0,
"step": 6440
},
{
"epoch": 21.062193126022912,
"grad_norm": 12.402093691703406,
"learning_rate": 3.69070408190906e-10,
"loss": 1.335,
"mean_token_accuracy": 0.7232251703739166,
"num_tokens": 405718756.0,
"step": 6445
},
{
"epoch": 21.078559738134206,
"grad_norm": 13.312628292563984,
"learning_rate": 3.6609329059086286e-10,
"loss": 1.3703,
"mean_token_accuracy": 0.7168916404247284,
"num_tokens": 406034042.0,
"step": 6450
},
{
"epoch": 21.094926350245498,
"grad_norm": 13.20544818974223,
"learning_rate": 3.631272802551011e-10,
"loss": 1.3492,
"mean_token_accuracy": 0.7211582005023957,
"num_tokens": 406349959.0,
"step": 6455
},
{
"epoch": 21.111292962356792,
"grad_norm": 13.23125049133312,
"learning_rate": 3.60172392622094e-10,
"loss": 1.3448,
"mean_token_accuracy": 0.7220036685466766,
"num_tokens": 406665577.0,
"step": 6460
},
{
"epoch": 21.127659574468087,
"grad_norm": 13.005624491873656,
"learning_rate": 3.572286430724192e-10,
"loss": 1.3542,
"mean_token_accuracy": 0.7203407347202301,
"num_tokens": 406980864.0,
"step": 6465
},
{
"epoch": 21.144026186579378,
"grad_norm": 12.765581546026217,
"learning_rate": 3.5429604692867905e-10,
"loss": 1.3463,
"mean_token_accuracy": 0.7219978153705597,
"num_tokens": 407298180.0,
"step": 6470
},
{
"epoch": 21.160392798690673,
"grad_norm": 12.667059462802193,
"learning_rate": 3.5137461945542125e-10,
"loss": 1.3499,
"mean_token_accuracy": 0.7211233079433441,
"num_tokens": 407614589.0,
"step": 6475
},
{
"epoch": 21.176759410801964,
"grad_norm": 12.979083992930654,
"learning_rate": 3.484643758590586e-10,
"loss": 1.342,
"mean_token_accuracy": 0.7224942982196808,
"num_tokens": 407929635.0,
"step": 6480
},
{
"epoch": 21.19312602291326,
"grad_norm": 12.249019308152713,
"learning_rate": 3.455653312877913e-10,
"loss": 1.3385,
"mean_token_accuracy": 0.7228384554386139,
"num_tokens": 408244228.0,
"step": 6485
},
{
"epoch": 21.20949263502455,
"grad_norm": 12.699289474269534,
"learning_rate": 3.426775008315258e-10,
"loss": 1.3584,
"mean_token_accuracy": 0.7195417165756226,
"num_tokens": 408560800.0,
"step": 6490
},
{
"epoch": 21.225859247135844,
"grad_norm": 12.99167124681205,
"learning_rate": 3.398008995217988e-10,
"loss": 1.3527,
"mean_token_accuracy": 0.7201470553874969,
"num_tokens": 408875635.0,
"step": 6495
},
{
"epoch": 21.242225859247135,
"grad_norm": 12.629510378355464,
"learning_rate": 3.3693554233169777e-10,
"loss": 1.365,
"mean_token_accuracy": 0.7178857147693634,
"num_tokens": 409191753.0,
"step": 6500
},
{
"epoch": 21.25859247135843,
"grad_norm": 12.879603899060175,
"learning_rate": 3.3408144417578196e-10,
"loss": 1.3523,
"mean_token_accuracy": 0.7209328949451447,
"num_tokens": 409508642.0,
"step": 6505
},
{
"epoch": 21.27495908346972,
"grad_norm": 13.309240097642768,
"learning_rate": 3.3123861991000646e-10,
"loss": 1.3794,
"mean_token_accuracy": 0.7156186401844025,
"num_tokens": 409824571.0,
"step": 6510
},
{
"epoch": 21.291325695581016,
"grad_norm": 12.92867376437594,
"learning_rate": 3.28407084331645e-10,
"loss": 1.3657,
"mean_token_accuracy": 0.7183760046958924,
"num_tokens": 410142157.0,
"step": 6515
},
{
"epoch": 21.307692307692307,
"grad_norm": 12.472225455278851,
"learning_rate": 3.255868521792113e-10,
"loss": 1.3482,
"mean_token_accuracy": 0.7216750860214234,
"num_tokens": 410457614.0,
"step": 6520
},
{
"epoch": 21.3240589198036,
"grad_norm": 12.365890438292144,
"learning_rate": 3.2277793813238393e-10,
"loss": 1.3437,
"mean_token_accuracy": 0.7215527594089508,
"num_tokens": 410774679.0,
"step": 6525
},
{
"epoch": 21.340425531914892,
"grad_norm": 13.965813014874932,
"learning_rate": 3.199803568119283e-10,
"loss": 1.3708,
"mean_token_accuracy": 0.7182475388050079,
"num_tokens": 411089535.0,
"step": 6530
},
{
"epoch": 21.356792144026187,
"grad_norm": 12.9264410294177,
"learning_rate": 3.171941227796227e-10,
"loss": 1.3516,
"mean_token_accuracy": 0.7200949966907502,
"num_tokens": 411402595.0,
"step": 6535
},
{
"epoch": 21.373158756137478,
"grad_norm": 12.718262389396598,
"learning_rate": 3.1441925053818015e-10,
"loss": 1.3586,
"mean_token_accuracy": 0.7214510560035705,
"num_tokens": 411718212.0,
"step": 6540
},
{
"epoch": 21.389525368248773,
"grad_norm": 12.393441417705136,
"learning_rate": 3.116557545311749e-10,
"loss": 1.3643,
"mean_token_accuracy": 0.7181841313838959,
"num_tokens": 412034328.0,
"step": 6545
},
{
"epoch": 21.405891980360064,
"grad_norm": 13.397061349747721,
"learning_rate": 3.0890364914296614e-10,
"loss": 1.3665,
"mean_token_accuracy": 0.7183075726032258,
"num_tokens": 412349959.0,
"step": 6550
},
{
"epoch": 21.42225859247136,
"grad_norm": 12.686656256195334,
"learning_rate": 3.0616294869862364e-10,
"loss": 1.3633,
"mean_token_accuracy": 0.7172622561454773,
"num_tokens": 412665283.0,
"step": 6555
},
{
"epoch": 21.438625204582653,
"grad_norm": 13.68919572012682,
"learning_rate": 3.0343366746385133e-10,
"loss": 1.3577,
"mean_token_accuracy": 0.7190939366817475,
"num_tokens": 412981575.0,
"step": 6560
},
{
"epoch": 21.454991816693944,
"grad_norm": 13.02482080433939,
"learning_rate": 3.0071581964491723e-10,
"loss": 1.3513,
"mean_token_accuracy": 0.720437103509903,
"num_tokens": 413297068.0,
"step": 6565
},
{
"epoch": 21.47135842880524,
"grad_norm": 13.332749583564876,
"learning_rate": 2.9800941938857574e-10,
"loss": 1.3646,
"mean_token_accuracy": 0.7186848640441894,
"num_tokens": 413612227.0,
"step": 6570
},
{
"epoch": 21.48772504091653,
"grad_norm": 13.087991897108951,
"learning_rate": 2.9531448078199436e-10,
"loss": 1.3563,
"mean_token_accuracy": 0.7198020398616791,
"num_tokens": 413927177.0,
"step": 6575
},
{
"epoch": 21.504091653027825,
"grad_norm": 13.335835947964268,
"learning_rate": 2.9263101785268255e-10,
"loss": 1.3518,
"mean_token_accuracy": 0.7200286328792572,
"num_tokens": 414243696.0,
"step": 6580
},
{
"epoch": 21.520458265139116,
"grad_norm": 13.160382556221116,
"learning_rate": 2.8995904456841664e-10,
"loss": 1.3605,
"mean_token_accuracy": 0.7191519558429718,
"num_tokens": 414559555.0,
"step": 6585
},
{
"epoch": 21.53682487725041,
"grad_norm": 13.058908572794502,
"learning_rate": 2.872985748371679e-10,
"loss": 1.3462,
"mean_token_accuracy": 0.7216069400310516,
"num_tokens": 414874568.0,
"step": 6590
},
{
"epoch": 21.5531914893617,
"grad_norm": 12.931286937951938,
"learning_rate": 2.8464962250703023e-10,
"loss": 1.3589,
"mean_token_accuracy": 0.7190203011035919,
"num_tokens": 415190194.0,
"step": 6595
},
{
"epoch": 21.569558101472996,
"grad_norm": 12.855372446575597,
"learning_rate": 2.8201220136614805e-10,
"loss": 1.346,
"mean_token_accuracy": 0.7213197529315949,
"num_tokens": 415506361.0,
"step": 6600
},
{
"epoch": 21.585924713584287,
"grad_norm": 13.099093485339138,
"learning_rate": 2.79386325142644e-10,
"loss": 1.3508,
"mean_token_accuracy": 0.720091027021408,
"num_tokens": 415820106.0,
"step": 6605
},
{
"epoch": 21.60229132569558,
"grad_norm": 12.51626494162,
"learning_rate": 2.7677200750454904e-10,
"loss": 1.3449,
"mean_token_accuracy": 0.7217469274997711,
"num_tokens": 416136377.0,
"step": 6610
},
{
"epoch": 21.618657937806873,
"grad_norm": 12.463459434825667,
"learning_rate": 2.7416926205972833e-10,
"loss": 1.332,
"mean_token_accuracy": 0.7255747377872467,
"num_tokens": 416452893.0,
"step": 6615
},
{
"epoch": 21.635024549918167,
"grad_norm": 12.977423501928877,
"learning_rate": 2.7157810235581335e-10,
"loss": 1.3569,
"mean_token_accuracy": 0.7192984879016876,
"num_tokens": 416768235.0,
"step": 6620
},
{
"epoch": 21.65139116202946,
"grad_norm": 12.47697245725263,
"learning_rate": 2.689985418801305e-10,
"loss": 1.3514,
"mean_token_accuracy": 0.7202287018299103,
"num_tokens": 417084339.0,
"step": 6625
},
{
"epoch": 21.667757774140753,
"grad_norm": 12.874623370443464,
"learning_rate": 2.6643059405963036e-10,
"loss": 1.3392,
"mean_token_accuracy": 0.7226230144500733,
"num_tokens": 417399309.0,
"step": 6630
},
{
"epoch": 21.684124386252044,
"grad_norm": 13.289710561216035,
"learning_rate": 2.638742722608184e-10,
"loss": 1.3531,
"mean_token_accuracy": 0.7205904841423034,
"num_tokens": 417715195.0,
"step": 6635
},
{
"epoch": 21.70049099836334,
"grad_norm": 13.159041280263851,
"learning_rate": 2.613295897896842e-10,
"loss": 1.3396,
"mean_token_accuracy": 0.7222914814949035,
"num_tokens": 418032025.0,
"step": 6640
},
{
"epoch": 21.71685761047463,
"grad_norm": 13.05845630365163,
"learning_rate": 2.587965598916342e-10,
"loss": 1.3535,
"mean_token_accuracy": 0.7190665304660797,
"num_tokens": 418346152.0,
"step": 6645
},
{
"epoch": 21.733224222585925,
"grad_norm": 12.825672435680955,
"learning_rate": 2.5627519575142086e-10,
"loss": 1.3706,
"mean_token_accuracy": 0.716326767206192,
"num_tokens": 418661650.0,
"step": 6650
},
{
"epoch": 21.74959083469722,
"grad_norm": 12.497004382450312,
"learning_rate": 2.5376551049307554e-10,
"loss": 1.3463,
"mean_token_accuracy": 0.7214847326278686,
"num_tokens": 418977651.0,
"step": 6655
},
{
"epoch": 21.76595744680851,
"grad_norm": 13.241035170630276,
"learning_rate": 2.5126751717983923e-10,
"loss": 1.3524,
"mean_token_accuracy": 0.7203598082065582,
"num_tokens": 419293460.0,
"step": 6660
},
{
"epoch": 21.782324058919805,
"grad_norm": 12.72372468441974,
"learning_rate": 2.4878122881409446e-10,
"loss": 1.355,
"mean_token_accuracy": 0.7196272253990174,
"num_tokens": 419607370.0,
"step": 6665
},
{
"epoch": 21.798690671031096,
"grad_norm": 13.029294249856717,
"learning_rate": 2.463066583372989e-10,
"loss": 1.346,
"mean_token_accuracy": 0.7200318992137908,
"num_tokens": 419921816.0,
"step": 6670
},
{
"epoch": 21.81505728314239,
"grad_norm": 12.912529889183855,
"learning_rate": 2.4384381862991523e-10,
"loss": 1.3534,
"mean_token_accuracy": 0.7199345767498017,
"num_tokens": 420237025.0,
"step": 6675
},
{
"epoch": 21.831423895253682,
"grad_norm": 12.933571334197762,
"learning_rate": 2.41392722511348e-10,
"loss": 1.3403,
"mean_token_accuracy": 0.7232010543346405,
"num_tokens": 420551721.0,
"step": 6680
},
{
"epoch": 21.847790507364977,
"grad_norm": 12.387962123212533,
"learning_rate": 2.389533827398735e-10,
"loss": 1.3377,
"mean_token_accuracy": 0.7233124315738678,
"num_tokens": 420867419.0,
"step": 6685
},
{
"epoch": 21.864157119476268,
"grad_norm": 12.274817137377648,
"learning_rate": 2.3652581201257547e-10,
"loss": 1.3473,
"mean_token_accuracy": 0.7211391031742096,
"num_tokens": 421182137.0,
"step": 6690
},
{
"epoch": 21.880523731587562,
"grad_norm": 12.755298828886199,
"learning_rate": 2.341100229652779e-10,
"loss": 1.3506,
"mean_token_accuracy": 0.7204267501831054,
"num_tokens": 421498754.0,
"step": 6695
},
{
"epoch": 21.896890343698853,
"grad_norm": 13.474176207147067,
"learning_rate": 2.317060281724795e-10,
"loss": 1.3818,
"mean_token_accuracy": 0.7143054306507111,
"num_tokens": 421814794.0,
"step": 6700
},
{
"epoch": 21.913256955810148,
"grad_norm": 12.999095205544377,
"learning_rate": 2.2931384014728856e-10,
"loss": 1.364,
"mean_token_accuracy": 0.7177674889564514,
"num_tokens": 422131641.0,
"step": 6705
},
{
"epoch": 21.92962356792144,
"grad_norm": 12.997861914085577,
"learning_rate": 2.2693347134135733e-10,
"loss": 1.3648,
"mean_token_accuracy": 0.717613697052002,
"num_tokens": 422447861.0,
"step": 6710
},
{
"epoch": 21.945990180032734,
"grad_norm": 13.01026998846727,
"learning_rate": 2.2456493414481776e-10,
"loss": 1.3353,
"mean_token_accuracy": 0.7228084802627563,
"num_tokens": 422761860.0,
"step": 6715
},
{
"epoch": 21.962356792144025,
"grad_norm": 12.99630753112788,
"learning_rate": 2.2220824088621638e-10,
"loss": 1.357,
"mean_token_accuracy": 0.7195675671100616,
"num_tokens": 423075970.0,
"step": 6720
},
{
"epoch": 21.97872340425532,
"grad_norm": 12.551715322951035,
"learning_rate": 2.1986340383245152e-10,
"loss": 1.3367,
"mean_token_accuracy": 0.7229941666126252,
"num_tokens": 423391794.0,
"step": 6725
},
{
"epoch": 21.99509001636661,
"grad_norm": 12.632402232140949,
"learning_rate": 2.1753043518870613e-10,
"loss": 1.3371,
"mean_token_accuracy": 0.722889506816864,
"num_tokens": 423708843.0,
"step": 6730
},
{
"epoch": 22.009819967266775,
"grad_norm": 12.886380554794895,
"learning_rate": 2.1520934709838901e-10,
"loss": 1.3502,
"mean_token_accuracy": 0.7217356893751357,
"num_tokens": 423970325.0,
"step": 6735
},
{
"epoch": 22.02618657937807,
"grad_norm": 13.370664170255838,
"learning_rate": 2.1290015164306758e-10,
"loss": 1.3538,
"mean_token_accuracy": 0.7191697001457215,
"num_tokens": 424285761.0,
"step": 6740
},
{
"epoch": 22.04255319148936,
"grad_norm": 12.845599012476232,
"learning_rate": 2.1060286084240738e-10,
"loss": 1.3512,
"mean_token_accuracy": 0.7195539712905884,
"num_tokens": 424601367.0,
"step": 6745
},
{
"epoch": 22.058919803600656,
"grad_norm": 13.358957826186089,
"learning_rate": 2.0831748665410767e-10,
"loss": 1.3607,
"mean_token_accuracy": 0.7183383524417877,
"num_tokens": 424918318.0,
"step": 6750
},
{
"epoch": 22.075286415711947,
"grad_norm": 12.603249648959288,
"learning_rate": 2.0604404097384178e-10,
"loss": 1.3344,
"mean_token_accuracy": 0.7244606792926789,
"num_tokens": 425235565.0,
"step": 6755
},
{
"epoch": 22.09165302782324,
"grad_norm": 12.673344237200768,
"learning_rate": 2.0378253563519245e-10,
"loss": 1.3364,
"mean_token_accuracy": 0.7232702493667602,
"num_tokens": 425549227.0,
"step": 6760
},
{
"epoch": 22.108019639934533,
"grad_norm": 12.991911952026106,
"learning_rate": 2.01532982409591e-10,
"loss": 1.3644,
"mean_token_accuracy": 0.7179206192493439,
"num_tokens": 425866443.0,
"step": 6765
},
{
"epoch": 22.124386252045827,
"grad_norm": 13.238495456255036,
"learning_rate": 1.9929539300625744e-10,
"loss": 1.3388,
"mean_token_accuracy": 0.7227777302265167,
"num_tokens": 426181410.0,
"step": 6770
},
{
"epoch": 22.14075286415712,
"grad_norm": 13.432269527486834,
"learning_rate": 1.9706977907213763e-10,
"loss": 1.3639,
"mean_token_accuracy": 0.717751395702362,
"num_tokens": 426497130.0,
"step": 6775
},
{
"epoch": 22.157119476268413,
"grad_norm": 12.539273082066268,
"learning_rate": 1.948561521918446e-10,
"loss": 1.3424,
"mean_token_accuracy": 0.7216863572597504,
"num_tokens": 426812100.0,
"step": 6780
},
{
"epoch": 22.173486088379704,
"grad_norm": 13.243823023413835,
"learning_rate": 1.9265452388759652e-10,
"loss": 1.3539,
"mean_token_accuracy": 0.7189646363258362,
"num_tokens": 427125130.0,
"step": 6785
},
{
"epoch": 22.189852700491,
"grad_norm": 13.079190798755858,
"learning_rate": 1.9046490561915708e-10,
"loss": 1.3606,
"mean_token_accuracy": 0.7185764789581299,
"num_tokens": 427442723.0,
"step": 6790
},
{
"epoch": 22.20621931260229,
"grad_norm": 12.959085959087055,
"learning_rate": 1.8828730878377638e-10,
"loss": 1.3423,
"mean_token_accuracy": 0.7217658877372741,
"num_tokens": 427758619.0,
"step": 6795
},
{
"epoch": 22.222585924713584,
"grad_norm": 12.599950436626239,
"learning_rate": 1.8612174471613174e-10,
"loss": 1.3487,
"mean_token_accuracy": 0.7207997500896454,
"num_tokens": 428074285.0,
"step": 6800
},
{
"epoch": 22.238952536824875,
"grad_norm": 13.44831286777348,
"learning_rate": 1.8396822468826819e-10,
"loss": 1.3766,
"mean_token_accuracy": 0.7160114467144012,
"num_tokens": 428390105.0,
"step": 6805
},
{
"epoch": 22.25531914893617,
"grad_norm": 13.214936849964493,
"learning_rate": 1.8182675990954022e-10,
"loss": 1.3517,
"mean_token_accuracy": 0.7194897770881653,
"num_tokens": 428705606.0,
"step": 6810
},
{
"epoch": 22.271685761047465,
"grad_norm": 13.11981613970459,
"learning_rate": 1.7969736152655237e-10,
"loss": 1.3591,
"mean_token_accuracy": 0.7187436342239379,
"num_tokens": 429020487.0,
"step": 6815
},
{
"epoch": 22.288052373158756,
"grad_norm": 12.507560434328925,
"learning_rate": 1.775800406231026e-10,
"loss": 1.3439,
"mean_token_accuracy": 0.7216341137886048,
"num_tokens": 429336937.0,
"step": 6820
},
{
"epoch": 22.30441898527005,
"grad_norm": 12.734388947281815,
"learning_rate": 1.7547480822012408e-10,
"loss": 1.3453,
"mean_token_accuracy": 0.7207939743995666,
"num_tokens": 429653469.0,
"step": 6825
},
{
"epoch": 22.32078559738134,
"grad_norm": 12.63205594092822,
"learning_rate": 1.7338167527562732e-10,
"loss": 1.3377,
"mean_token_accuracy": 0.7222929179668427,
"num_tokens": 429969131.0,
"step": 6830
},
{
"epoch": 22.337152209492636,
"grad_norm": 13.162783662994846,
"learning_rate": 1.713006526846439e-10,
"loss": 1.3487,
"mean_token_accuracy": 0.7200253903865814,
"num_tokens": 430284447.0,
"step": 6835
},
{
"epoch": 22.353518821603927,
"grad_norm": 12.60058719095576,
"learning_rate": 1.6923175127916994e-10,
"loss": 1.3484,
"mean_token_accuracy": 0.7198909163475037,
"num_tokens": 430599699.0,
"step": 6840
},
{
"epoch": 22.369885433715222,
"grad_norm": 12.855197780418354,
"learning_rate": 1.6717498182810765e-10,
"loss": 1.3405,
"mean_token_accuracy": 0.7213269472122192,
"num_tokens": 430913774.0,
"step": 6845
},
{
"epoch": 22.386252045826513,
"grad_norm": 12.782693683323123,
"learning_rate": 1.6513035503721212e-10,
"loss": 1.3462,
"mean_token_accuracy": 0.7212722063064575,
"num_tokens": 431227584.0,
"step": 6850
},
{
"epoch": 22.402618657937808,
"grad_norm": 12.563522351858756,
"learning_rate": 1.630978815490339e-10,
"loss": 1.3287,
"mean_token_accuracy": 0.7248152911663055,
"num_tokens": 431543507.0,
"step": 6855
},
{
"epoch": 22.4189852700491,
"grad_norm": 12.861715310841179,
"learning_rate": 1.610775719428642e-10,
"loss": 1.3492,
"mean_token_accuracy": 0.7201033174991608,
"num_tokens": 431858223.0,
"step": 6860
},
{
"epoch": 22.435351882160393,
"grad_norm": 13.174983537506147,
"learning_rate": 1.5906943673467955e-10,
"loss": 1.3467,
"mean_token_accuracy": 0.721436756849289,
"num_tokens": 432173885.0,
"step": 6865
},
{
"epoch": 22.451718494271685,
"grad_norm": 12.86280817976593,
"learning_rate": 1.5707348637708674e-10,
"loss": 1.3527,
"mean_token_accuracy": 0.7199465811252594,
"num_tokens": 432489890.0,
"step": 6870
},
{
"epoch": 22.46808510638298,
"grad_norm": 13.075985796707062,
"learning_rate": 1.5508973125926918e-10,
"loss": 1.3635,
"mean_token_accuracy": 0.7167506515979767,
"num_tokens": 432805161.0,
"step": 6875
},
{
"epoch": 22.48445171849427,
"grad_norm": 12.930055267973149,
"learning_rate": 1.531181817069327e-10,
"loss": 1.3514,
"mean_token_accuracy": 0.7211288928985595,
"num_tokens": 433121001.0,
"step": 6880
},
{
"epoch": 22.500818330605565,
"grad_norm": 12.801291847440664,
"learning_rate": 1.5115884798225122e-10,
"loss": 1.3521,
"mean_token_accuracy": 0.7190639019012451,
"num_tokens": 433435828.0,
"step": 6885
},
{
"epoch": 22.517184942716856,
"grad_norm": 13.245523412336043,
"learning_rate": 1.4921174028381362e-10,
"loss": 1.3559,
"mean_token_accuracy": 0.7189247727394104,
"num_tokens": 433750208.0,
"step": 6890
},
{
"epoch": 22.53355155482815,
"grad_norm": 13.490417527494259,
"learning_rate": 1.4727686874657143e-10,
"loss": 1.3482,
"mean_token_accuracy": 0.720479530096054,
"num_tokens": 434066217.0,
"step": 6895
},
{
"epoch": 22.54991816693944,
"grad_norm": 12.307057662694007,
"learning_rate": 1.4535424344178372e-10,
"loss": 1.3303,
"mean_token_accuracy": 0.7245355308055877,
"num_tokens": 434382326.0,
"step": 6900
},
{
"epoch": 22.566284779050736,
"grad_norm": 12.389134965064018,
"learning_rate": 1.4344387437696781e-10,
"loss": 1.3513,
"mean_token_accuracy": 0.7200321733951569,
"num_tokens": 434697233.0,
"step": 6905
},
{
"epoch": 22.58265139116203,
"grad_norm": 12.910124027424311,
"learning_rate": 1.4154577149584542e-10,
"loss": 1.3451,
"mean_token_accuracy": 0.7210674345493316,
"num_tokens": 435012252.0,
"step": 6910
},
{
"epoch": 22.599018003273322,
"grad_norm": 13.600220736718454,
"learning_rate": 1.396599446782909e-10,
"loss": 1.339,
"mean_token_accuracy": 0.7226611793041229,
"num_tokens": 435328644.0,
"step": 6915
},
{
"epoch": 22.615384615384617,
"grad_norm": 12.52738378390851,
"learning_rate": 1.3778640374027983e-10,
"loss": 1.336,
"mean_token_accuracy": 0.7228205442428589,
"num_tokens": 435642951.0,
"step": 6920
},
{
"epoch": 22.631751227495908,
"grad_norm": 13.62863010832926,
"learning_rate": 1.359251584338389e-10,
"loss": 1.3641,
"mean_token_accuracy": 0.7171992361545563,
"num_tokens": 435958506.0,
"step": 6925
},
{
"epoch": 22.648117839607202,
"grad_norm": 12.552383076369432,
"learning_rate": 1.3407621844699374e-10,
"loss": 1.3317,
"mean_token_accuracy": 0.7235641539096832,
"num_tokens": 436276318.0,
"step": 6930
},
{
"epoch": 22.664484451718494,
"grad_norm": 13.664599298303148,
"learning_rate": 1.322395934037199e-10,
"loss": 1.3645,
"mean_token_accuracy": 0.7180260837078094,
"num_tokens": 436593498.0,
"step": 6935
},
{
"epoch": 22.680851063829788,
"grad_norm": 13.351252233873854,
"learning_rate": 1.3041529286389076e-10,
"loss": 1.3621,
"mean_token_accuracy": 0.7189351558685303,
"num_tokens": 436909048.0,
"step": 6940
},
{
"epoch": 22.69721767594108,
"grad_norm": 13.465048819018577,
"learning_rate": 1.2860332632323085e-10,
"loss": 1.3742,
"mean_token_accuracy": 0.7154442071914673,
"num_tokens": 437225143.0,
"step": 6945
},
{
"epoch": 22.713584288052374,
"grad_norm": 13.822851613415096,
"learning_rate": 1.2680370321326323e-10,
"loss": 1.3526,
"mean_token_accuracy": 0.7198641777038575,
"num_tokens": 437541140.0,
"step": 6950
},
{
"epoch": 22.729950900163665,
"grad_norm": 12.152454968770806,
"learning_rate": 1.2501643290126263e-10,
"loss": 1.3428,
"mean_token_accuracy": 0.7212522566318512,
"num_tokens": 437854378.0,
"step": 6955
},
{
"epoch": 22.74631751227496,
"grad_norm": 12.660960103280965,
"learning_rate": 1.2324152469020465e-10,
"loss": 1.3406,
"mean_token_accuracy": 0.7219815969467163,
"num_tokens": 438169410.0,
"step": 6960
},
{
"epoch": 22.76268412438625,
"grad_norm": 12.902062843477703,
"learning_rate": 1.2147898781871974e-10,
"loss": 1.3517,
"mean_token_accuracy": 0.7200750589370728,
"num_tokens": 438484479.0,
"step": 6965
},
{
"epoch": 22.779050736497545,
"grad_norm": 12.874574336103201,
"learning_rate": 1.197288314610434e-10,
"loss": 1.3485,
"mean_token_accuracy": 0.7214947521686554,
"num_tokens": 438801983.0,
"step": 6970
},
{
"epoch": 22.795417348608837,
"grad_norm": 12.866352405422353,
"learning_rate": 1.1799106472696912e-10,
"loss": 1.3536,
"mean_token_accuracy": 0.7188063561916351,
"num_tokens": 439118078.0,
"step": 6975
},
{
"epoch": 22.81178396072013,
"grad_norm": 12.925726362826204,
"learning_rate": 1.1626569666180031e-10,
"loss": 1.367,
"mean_token_accuracy": 0.7190398752689362,
"num_tokens": 439432429.0,
"step": 6980
},
{
"epoch": 22.828150572831422,
"grad_norm": 12.56058886584475,
"learning_rate": 1.1455273624630419e-10,
"loss": 1.341,
"mean_token_accuracy": 0.7224170148372651,
"num_tokens": 439748343.0,
"step": 6985
},
{
"epoch": 22.844517184942717,
"grad_norm": 12.931255482808035,
"learning_rate": 1.1285219239666467e-10,
"loss": 1.347,
"mean_token_accuracy": 0.7204574346542358,
"num_tokens": 440064663.0,
"step": 6990
},
{
"epoch": 22.86088379705401,
"grad_norm": 12.603505356494258,
"learning_rate": 1.111640739644354e-10,
"loss": 1.3633,
"mean_token_accuracy": 0.7168758630752563,
"num_tokens": 440381472.0,
"step": 6995
},
{
"epoch": 22.877250409165303,
"grad_norm": 13.503457434866782,
"learning_rate": 1.0948838973649372e-10,
"loss": 1.3557,
"mean_token_accuracy": 0.7181106150150299,
"num_tokens": 440698593.0,
"step": 7000
},
{
"epoch": 22.893617021276597,
"grad_norm": 13.288625175162997,
"learning_rate": 1.0782514843499652e-10,
"loss": 1.3569,
"mean_token_accuracy": 0.7186809659004212,
"num_tokens": 441014934.0,
"step": 7005
},
{
"epoch": 22.90998363338789,
"grad_norm": 13.067345412316314,
"learning_rate": 1.0617435871733277e-10,
"loss": 1.3531,
"mean_token_accuracy": 0.719657689332962,
"num_tokens": 441331673.0,
"step": 7010
},
{
"epoch": 22.926350245499183,
"grad_norm": 12.297508956037928,
"learning_rate": 1.0453602917607885e-10,
"loss": 1.3357,
"mean_token_accuracy": 0.7232572436332703,
"num_tokens": 441646020.0,
"step": 7015
},
{
"epoch": 22.942716857610474,
"grad_norm": 12.411782822458209,
"learning_rate": 1.029101683389555e-10,
"loss": 1.3627,
"mean_token_accuracy": 0.7166090369224548,
"num_tokens": 441960330.0,
"step": 7020
},
{
"epoch": 22.95908346972177,
"grad_norm": 12.400946609701034,
"learning_rate": 1.0129678466878123e-10,
"loss": 1.3347,
"mean_token_accuracy": 0.7229482293128967,
"num_tokens": 442277761.0,
"step": 7025
},
{
"epoch": 22.97545008183306,
"grad_norm": 13.114720694822743,
"learning_rate": 9.969588656342981e-11,
"loss": 1.3389,
"mean_token_accuracy": 0.7220352053642273,
"num_tokens": 442593144.0,
"step": 7030
},
{
"epoch": 22.991816693944354,
"grad_norm": 12.880875573574846,
"learning_rate": 9.810748235578592e-11,
"loss": 1.3402,
"mean_token_accuracy": 0.7220639884471893,
"num_tokens": 442908607.0,
"step": 7035
},
{
"epoch": 23.006546644844516,
"grad_norm": 12.885175681405324,
"learning_rate": 9.653158031370152e-11,
"loss": 1.3477,
"mean_token_accuracy": 0.7198513878716363,
"num_tokens": 443169643.0,
"step": 7040
},
{
"epoch": 23.02291325695581,
"grad_norm": 12.979282554677258,
"learning_rate": 9.496818863995365e-11,
"loss": 1.3509,
"mean_token_accuracy": 0.7200859129428864,
"num_tokens": 443483874.0,
"step": 7045
},
{
"epoch": 23.0392798690671,
"grad_norm": 12.65359546368029,
"learning_rate": 9.341731547220094e-11,
"loss": 1.3582,
"mean_token_accuracy": 0.7197104752063751,
"num_tokens": 443799489.0,
"step": 7050
},
{
"epoch": 23.055646481178396,
"grad_norm": 13.24581632006229,
"learning_rate": 9.187896888294189e-11,
"loss": 1.3628,
"mean_token_accuracy": 0.7178315281867981,
"num_tokens": 444115077.0,
"step": 7055
},
{
"epoch": 23.07201309328969,
"grad_norm": 12.786295664937347,
"learning_rate": 9.03531568794716e-11,
"loss": 1.3401,
"mean_token_accuracy": 0.7225833356380462,
"num_tokens": 444429590.0,
"step": 7060
},
{
"epoch": 23.088379705400982,
"grad_norm": 13.64398814950113,
"learning_rate": 8.883988740384264e-11,
"loss": 1.3619,
"mean_token_accuracy": 0.7186536669731141,
"num_tokens": 444746524.0,
"step": 7065
},
{
"epoch": 23.104746317512276,
"grad_norm": 12.267877353989375,
"learning_rate": 8.733916833282008e-11,
"loss": 1.3198,
"mean_token_accuracy": 0.7258092045783997,
"num_tokens": 445062057.0,
"step": 7070
},
{
"epoch": 23.121112929623568,
"grad_norm": 12.485433355461446,
"learning_rate": 8.585100747784374e-11,
"loss": 1.3392,
"mean_token_accuracy": 0.7215893030166626,
"num_tokens": 445378595.0,
"step": 7075
},
{
"epoch": 23.137479541734862,
"grad_norm": 13.150909792468429,
"learning_rate": 8.437541258498633e-11,
"loss": 1.3416,
"mean_token_accuracy": 0.7213694810867309,
"num_tokens": 445695024.0,
"step": 7080
},
{
"epoch": 23.153846153846153,
"grad_norm": 12.56982379223737,
"learning_rate": 8.29123913349128e-11,
"loss": 1.3451,
"mean_token_accuracy": 0.721850723028183,
"num_tokens": 446013483.0,
"step": 7085
},
{
"epoch": 23.170212765957448,
"grad_norm": 12.74917405617213,
"learning_rate": 8.146195134284052e-11,
"loss": 1.3256,
"mean_token_accuracy": 0.7241310834884643,
"num_tokens": 446331438.0,
"step": 7090
},
{
"epoch": 23.18657937806874,
"grad_norm": 12.861658218052662,
"learning_rate": 8.002410015849948e-11,
"loss": 1.3507,
"mean_token_accuracy": 0.7200630605220795,
"num_tokens": 446646339.0,
"step": 7095
},
{
"epoch": 23.202945990180034,
"grad_norm": 12.629385644245952,
"learning_rate": 7.859884526609434e-11,
"loss": 1.3431,
"mean_token_accuracy": 0.7209880650043488,
"num_tokens": 446961928.0,
"step": 7100
},
{
"epoch": 23.219312602291325,
"grad_norm": 13.07788475145339,
"learning_rate": 7.718619408426358e-11,
"loss": 1.3566,
"mean_token_accuracy": 0.7183891952037811,
"num_tokens": 447277873.0,
"step": 7105
},
{
"epoch": 23.23567921440262,
"grad_norm": 12.952389492992099,
"learning_rate": 7.578615396604149e-11,
"loss": 1.3488,
"mean_token_accuracy": 0.7210026741027832,
"num_tokens": 447593799.0,
"step": 7110
},
{
"epoch": 23.25204582651391,
"grad_norm": 12.743509865987193,
"learning_rate": 7.439873219882098e-11,
"loss": 1.3422,
"mean_token_accuracy": 0.721809321641922,
"num_tokens": 447908979.0,
"step": 7115
},
{
"epoch": 23.268412438625205,
"grad_norm": 12.783610210000228,
"learning_rate": 7.30239360043139e-11,
"loss": 1.3338,
"mean_token_accuracy": 0.7223511099815368,
"num_tokens": 448224475.0,
"step": 7120
},
{
"epoch": 23.284779050736496,
"grad_norm": 12.445475133174945,
"learning_rate": 7.166177253851491e-11,
"loss": 1.3395,
"mean_token_accuracy": 0.721209728717804,
"num_tokens": 448540511.0,
"step": 7125
},
{
"epoch": 23.30114566284779,
"grad_norm": 13.170033737273622,
"learning_rate": 7.031224889166326e-11,
"loss": 1.3473,
"mean_token_accuracy": 0.7205008029937744,
"num_tokens": 448856961.0,
"step": 7130
},
{
"epoch": 23.317512274959082,
"grad_norm": 12.733430417544065,
"learning_rate": 6.89753720882072e-11,
"loss": 1.3572,
"mean_token_accuracy": 0.7191106677055359,
"num_tokens": 449172747.0,
"step": 7135
},
{
"epoch": 23.333878887070377,
"grad_norm": 12.820932566449374,
"learning_rate": 6.765114908676512e-11,
"loss": 1.3541,
"mean_token_accuracy": 0.7197527647018432,
"num_tokens": 449489574.0,
"step": 7140
},
{
"epoch": 23.350245499181668,
"grad_norm": 12.334727024147877,
"learning_rate": 6.633958678009172e-11,
"loss": 1.3572,
"mean_token_accuracy": 0.7184008717536926,
"num_tokens": 449805224.0,
"step": 7145
},
{
"epoch": 23.366612111292962,
"grad_norm": 13.243260617772965,
"learning_rate": 6.504069199504081e-11,
"loss": 1.3533,
"mean_token_accuracy": 0.720440012216568,
"num_tokens": 450121499.0,
"step": 7150
},
{
"epoch": 23.382978723404257,
"grad_norm": 13.111364694962516,
"learning_rate": 6.375447149253005e-11,
"loss": 1.3438,
"mean_token_accuracy": 0.7227679491043091,
"num_tokens": 450438216.0,
"step": 7155
},
{
"epoch": 23.399345335515548,
"grad_norm": 13.02081588241417,
"learning_rate": 6.24809319675057e-11,
"loss": 1.3588,
"mean_token_accuracy": 0.718359899520874,
"num_tokens": 450753408.0,
"step": 7160
},
{
"epoch": 23.415711947626843,
"grad_norm": 16.994427042019105,
"learning_rate": 6.12200800489085e-11,
"loss": 1.3611,
"mean_token_accuracy": 0.7182079792022705,
"num_tokens": 451069176.0,
"step": 7165
},
{
"epoch": 23.432078559738134,
"grad_norm": 12.825635117115837,
"learning_rate": 5.997192229963727e-11,
"loss": 1.3464,
"mean_token_accuracy": 0.720733916759491,
"num_tokens": 451384214.0,
"step": 7170
},
{
"epoch": 23.44844517184943,
"grad_norm": 12.356272462085352,
"learning_rate": 5.873646521651759e-11,
"loss": 1.35,
"mean_token_accuracy": 0.7204180717468261,
"num_tokens": 451700388.0,
"step": 7175
},
{
"epoch": 23.46481178396072,
"grad_norm": 13.158569750292818,
"learning_rate": 5.7513715230265165e-11,
"loss": 1.3539,
"mean_token_accuracy": 0.7190254867076874,
"num_tokens": 452015367.0,
"step": 7180
},
{
"epoch": 23.481178396072014,
"grad_norm": 12.094869366015358,
"learning_rate": 5.630367870545411e-11,
"loss": 1.3408,
"mean_token_accuracy": 0.722713416814804,
"num_tokens": 452329550.0,
"step": 7185
},
{
"epoch": 23.497545008183305,
"grad_norm": 11.893876981267749,
"learning_rate": 5.510636194048318e-11,
"loss": 1.3471,
"mean_token_accuracy": 0.7201150059700012,
"num_tokens": 452644884.0,
"step": 7190
},
{
"epoch": 23.5139116202946,
"grad_norm": 12.109253702866202,
"learning_rate": 5.3921771167542985e-11,
"loss": 1.3517,
"mean_token_accuracy": 0.7199622452259063,
"num_tokens": 452959253.0,
"step": 7195
},
{
"epoch": 23.53027823240589,
"grad_norm": 13.099421347139643,
"learning_rate": 5.274991255258432e-11,
"loss": 1.3623,
"mean_token_accuracy": 0.7182047188282012,
"num_tokens": 453273720.0,
"step": 7200
},
{
"epoch": 23.546644844517186,
"grad_norm": 12.605437677197763,
"learning_rate": 5.1590792195284616e-11,
"loss": 1.3436,
"mean_token_accuracy": 0.7211773514747619,
"num_tokens": 453589369.0,
"step": 7205
},
{
"epoch": 23.563011456628477,
"grad_norm": 13.084592752849911,
"learning_rate": 5.044441612901768e-11,
"loss": 1.3417,
"mean_token_accuracy": 0.7220472991466522,
"num_tokens": 453906277.0,
"step": 7210
},
{
"epoch": 23.57937806873977,
"grad_norm": 12.357535357551479,
"learning_rate": 4.931079032082092e-11,
"loss": 1.3437,
"mean_token_accuracy": 0.7210754454135895,
"num_tokens": 454222243.0,
"step": 7215
},
{
"epoch": 23.595744680851062,
"grad_norm": 13.283999157997714,
"learning_rate": 4.8189920671365405e-11,
"loss": 1.3503,
"mean_token_accuracy": 0.7204611420631408,
"num_tokens": 454537914.0,
"step": 7220
},
{
"epoch": 23.612111292962357,
"grad_norm": 12.647991933541059,
"learning_rate": 4.7081813014924755e-11,
"loss": 1.3495,
"mean_token_accuracy": 0.720174902677536,
"num_tokens": 454853441.0,
"step": 7225
},
{
"epoch": 23.628477905073648,
"grad_norm": 12.833034997500734,
"learning_rate": 4.598647311934462e-11,
"loss": 1.3365,
"mean_token_accuracy": 0.722919511795044,
"num_tokens": 455169051.0,
"step": 7230
},
{
"epoch": 23.644844517184943,
"grad_norm": 12.726194071489417,
"learning_rate": 4.490390668601296e-11,
"loss": 1.3635,
"mean_token_accuracy": 0.7177496433258057,
"num_tokens": 455483328.0,
"step": 7235
},
{
"epoch": 23.661211129296234,
"grad_norm": 12.836985723011823,
"learning_rate": 4.383411934983012e-11,
"loss": 1.3529,
"mean_token_accuracy": 0.7187625408172608,
"num_tokens": 455798691.0,
"step": 7240
},
{
"epoch": 23.67757774140753,
"grad_norm": 12.369616415044517,
"learning_rate": 4.277711667917877e-11,
"loss": 1.3202,
"mean_token_accuracy": 0.7258556962013245,
"num_tokens": 456113780.0,
"step": 7245
},
{
"epoch": 23.693944353518823,
"grad_norm": 13.777116689233463,
"learning_rate": 4.173290417589737e-11,
"loss": 1.3503,
"mean_token_accuracy": 0.7203498423099518,
"num_tokens": 456429261.0,
"step": 7250
},
{
"epoch": 23.710310965630114,
"grad_norm": 13.33248130407782,
"learning_rate": 4.070148727524814e-11,
"loss": 1.3794,
"mean_token_accuracy": 0.7169650435447693,
"num_tokens": 456744751.0,
"step": 7255
},
{
"epoch": 23.72667757774141,
"grad_norm": 12.73782241993151,
"learning_rate": 3.968287134589188e-11,
"loss": 1.3483,
"mean_token_accuracy": 0.7209219753742218,
"num_tokens": 457061525.0,
"step": 7260
},
{
"epoch": 23.7430441898527,
"grad_norm": 12.612029883251239,
"learning_rate": 3.867706168985768e-11,
"loss": 1.33,
"mean_token_accuracy": 0.7241364538669586,
"num_tokens": 457377985.0,
"step": 7265
},
{
"epoch": 23.759410801963995,
"grad_norm": 13.238807566993644,
"learning_rate": 3.768406354251713e-11,
"loss": 1.3444,
"mean_token_accuracy": 0.7211177289485932,
"num_tokens": 457692359.0,
"step": 7270
},
{
"epoch": 23.775777414075286,
"grad_norm": 13.62214958956788,
"learning_rate": 3.6703882072555706e-11,
"loss": 1.3632,
"mean_token_accuracy": 0.7179272472858429,
"num_tokens": 458007552.0,
"step": 7275
},
{
"epoch": 23.79214402618658,
"grad_norm": 12.76761616842123,
"learning_rate": 3.5736522381946137e-11,
"loss": 1.364,
"mean_token_accuracy": 0.7169786393642426,
"num_tokens": 458324087.0,
"step": 7280
},
{
"epoch": 23.80851063829787,
"grad_norm": 12.821563995336735,
"learning_rate": 3.478198950592315e-11,
"loss": 1.3535,
"mean_token_accuracy": 0.7197467684745789,
"num_tokens": 458641310.0,
"step": 7285
},
{
"epoch": 23.824877250409166,
"grad_norm": 12.34030320089937,
"learning_rate": 3.384028841295489e-11,
"loss": 1.3606,
"mean_token_accuracy": 0.7179623246192932,
"num_tokens": 458954389.0,
"step": 7290
},
{
"epoch": 23.841243862520457,
"grad_norm": 13.306594697494013,
"learning_rate": 3.2911424004719305e-11,
"loss": 1.3504,
"mean_token_accuracy": 0.7206150174140931,
"num_tokens": 459269086.0,
"step": 7295
},
{
"epoch": 23.857610474631752,
"grad_norm": 12.879399761128884,
"learning_rate": 3.199540111607752e-11,
"loss": 1.3522,
"mean_token_accuracy": 0.721342933177948,
"num_tokens": 459583992.0,
"step": 7300
},
{
"epoch": 23.873977086743043,
"grad_norm": 12.935481056677569,
"learning_rate": 3.109222451504884e-11,
"loss": 1.3714,
"mean_token_accuracy": 0.7164525389671326,
"num_tokens": 459899132.0,
"step": 7305
},
{
"epoch": 23.890343698854338,
"grad_norm": 12.846970895626185,
"learning_rate": 3.020189890278579e-11,
"loss": 1.3363,
"mean_token_accuracy": 0.7231940507888794,
"num_tokens": 460213554.0,
"step": 7310
},
{
"epoch": 23.90671031096563,
"grad_norm": 13.115590780433335,
"learning_rate": 2.932442891354997e-11,
"loss": 1.3411,
"mean_token_accuracy": 0.7229074001312256,
"num_tokens": 460530104.0,
"step": 7315
},
{
"epoch": 23.923076923076923,
"grad_norm": 13.49431363989052,
"learning_rate": 2.8459819114687868e-11,
"loss": 1.3552,
"mean_token_accuracy": 0.7194628655910492,
"num_tokens": 460846228.0,
"step": 7320
},
{
"epoch": 23.939443535188214,
"grad_norm": 13.019870380653288,
"learning_rate": 2.7608074006606755e-11,
"loss": 1.3532,
"mean_token_accuracy": 0.7177223682403564,
"num_tokens": 461161597.0,
"step": 7325
},
{
"epoch": 23.95581014729951,
"grad_norm": 12.789360155231835,
"learning_rate": 2.676919802275163e-11,
"loss": 1.3247,
"mean_token_accuracy": 0.7247434377670288,
"num_tokens": 461476843.0,
"step": 7330
},
{
"epoch": 23.9721767594108,
"grad_norm": 12.918338648170026,
"learning_rate": 2.594319552958191e-11,
"loss": 1.349,
"mean_token_accuracy": 0.7203211784362793,
"num_tokens": 461793859.0,
"step": 7335
},
{
"epoch": 23.988543371522095,
"grad_norm": 13.01942740314573,
"learning_rate": 2.513007082654922e-11,
"loss": 1.3625,
"mean_token_accuracy": 0.7167957127094269,
"num_tokens": 462108257.0,
"step": 7340
},
{
"epoch": 24.00327332242226,
"grad_norm": 12.751880986482613,
"learning_rate": 2.4329828146074095e-11,
"loss": 1.3563,
"mean_token_accuracy": 0.7189247012138367,
"num_tokens": 462368701.0,
"step": 7345
},
{
"epoch": 24.01963993453355,
"grad_norm": 12.621994373110237,
"learning_rate": 2.3542471653524856e-11,
"loss": 1.3477,
"mean_token_accuracy": 0.7204642951488495,
"num_tokens": 462685003.0,
"step": 7350
},
{
"epoch": 24.036006546644845,
"grad_norm": 12.69132203011607,
"learning_rate": 2.2768005447194872e-11,
"loss": 1.3548,
"mean_token_accuracy": 0.7195856988430023,
"num_tokens": 463000988.0,
"step": 7355
},
{
"epoch": 24.052373158756136,
"grad_norm": 13.590120640008223,
"learning_rate": 2.200643355828258e-11,
"loss": 1.3477,
"mean_token_accuracy": 0.7200317621231079,
"num_tokens": 463316798.0,
"step": 7360
},
{
"epoch": 24.06873977086743,
"grad_norm": 13.013557565391965,
"learning_rate": 2.125775995086926e-11,
"loss": 1.3695,
"mean_token_accuracy": 0.7159531533718109,
"num_tokens": 463631806.0,
"step": 7365
},
{
"epoch": 24.085106382978722,
"grad_norm": 12.907712960234285,
"learning_rate": 2.0521988521899628e-11,
"loss": 1.3413,
"mean_token_accuracy": 0.7220550358295441,
"num_tokens": 463948216.0,
"step": 7370
},
{
"epoch": 24.101472995090017,
"grad_norm": 13.256062819208426,
"learning_rate": 1.9799123101160444e-11,
"loss": 1.3465,
"mean_token_accuracy": 0.7210370361804962,
"num_tokens": 464263741.0,
"step": 7375
},
{
"epoch": 24.117839607201308,
"grad_norm": 12.994341656646789,
"learning_rate": 1.9089167451260547e-11,
"loss": 1.3577,
"mean_token_accuracy": 0.7184650480747223,
"num_tokens": 464580296.0,
"step": 7380
},
{
"epoch": 24.134206219312603,
"grad_norm": 12.81826767989765,
"learning_rate": 1.8392125267612803e-11,
"loss": 1.3583,
"mean_token_accuracy": 0.7188352525234223,
"num_tokens": 464893834.0,
"step": 7385
},
{
"epoch": 24.150572831423894,
"grad_norm": 12.7856551090598,
"learning_rate": 1.7708000178413008e-11,
"loss": 1.3379,
"mean_token_accuracy": 0.7223597168922424,
"num_tokens": 465209881.0,
"step": 7390
},
{
"epoch": 24.16693944353519,
"grad_norm": 12.585297565916532,
"learning_rate": 1.703679574462158e-11,
"loss": 1.3376,
"mean_token_accuracy": 0.7221680283546448,
"num_tokens": 465525042.0,
"step": 7395
},
{
"epoch": 24.183306055646483,
"grad_norm": 13.97539060097907,
"learning_rate": 1.6378515459946065e-11,
"loss": 1.375,
"mean_token_accuracy": 0.714812695980072,
"num_tokens": 465840909.0,
"step": 7400
},
{
"epoch": 24.199672667757774,
"grad_norm": 13.5560172635617,
"learning_rate": 1.5733162750821706e-11,
"loss": 1.3535,
"mean_token_accuracy": 0.719173789024353,
"num_tokens": 466154795.0,
"step": 7405
},
{
"epoch": 24.21603927986907,
"grad_norm": 13.078641012785111,
"learning_rate": 1.5100740976393968e-11,
"loss": 1.3598,
"mean_token_accuracy": 0.7185278415679932,
"num_tokens": 466470138.0,
"step": 7410
},
{
"epoch": 24.23240589198036,
"grad_norm": 12.897154852485773,
"learning_rate": 1.4481253428500763e-11,
"loss": 1.3591,
"mean_token_accuracy": 0.7187921524047851,
"num_tokens": 466784749.0,
"step": 7415
},
{
"epoch": 24.248772504091654,
"grad_norm": 12.492321955565348,
"learning_rate": 1.387470333165608e-11,
"loss": 1.3379,
"mean_token_accuracy": 0.722098046541214,
"num_tokens": 467098461.0,
"step": 7420
},
{
"epoch": 24.265139116202946,
"grad_norm": 13.007611243527537,
"learning_rate": 1.3281093843033055e-11,
"loss": 1.3448,
"mean_token_accuracy": 0.7209655940532684,
"num_tokens": 467416352.0,
"step": 7425
},
{
"epoch": 24.28150572831424,
"grad_norm": 12.697667313743437,
"learning_rate": 1.2700428052447033e-11,
"loss": 1.3579,
"mean_token_accuracy": 0.7195048153400421,
"num_tokens": 467731305.0,
"step": 7430
},
{
"epoch": 24.29787234042553,
"grad_norm": 13.353247427811356,
"learning_rate": 1.2132708982338924e-11,
"loss": 1.3429,
"mean_token_accuracy": 0.7209743320941925,
"num_tokens": 468047361.0,
"step": 7435
},
{
"epoch": 24.314238952536826,
"grad_norm": 12.837440884503787,
"learning_rate": 1.15779395877616e-11,
"loss": 1.3584,
"mean_token_accuracy": 0.7181209921836853,
"num_tokens": 468361613.0,
"step": 7440
},
{
"epoch": 24.330605564648117,
"grad_norm": 12.962933524119704,
"learning_rate": 1.10361227563624e-11,
"loss": 1.3361,
"mean_token_accuracy": 0.7226485967636108,
"num_tokens": 468676912.0,
"step": 7445
},
{
"epoch": 24.34697217675941,
"grad_norm": 12.23329675615014,
"learning_rate": 1.0507261308368709e-11,
"loss": 1.3351,
"mean_token_accuracy": 0.7236675322055817,
"num_tokens": 468994360.0,
"step": 7450
},
{
"epoch": 24.363338788870703,
"grad_norm": 12.726903918157424,
"learning_rate": 9.991357996573803e-12,
"loss": 1.3755,
"mean_token_accuracy": 0.7160296976566315,
"num_tokens": 469310842.0,
"step": 7455
},
{
"epoch": 24.379705400981997,
"grad_norm": 13.198393892424019,
"learning_rate": 9.488415506322123e-12,
"loss": 1.3622,
"mean_token_accuracy": 0.7167876899242401,
"num_tokens": 469626170.0,
"step": 7460
},
{
"epoch": 24.39607201309329,
"grad_norm": 12.368761103188476,
"learning_rate": 8.998436455495696e-12,
"loss": 1.3258,
"mean_token_accuracy": 0.7249380946159363,
"num_tokens": 469943305.0,
"step": 7465
},
{
"epoch": 24.412438625204583,
"grad_norm": 12.962191619603873,
"learning_rate": 8.521423394499129e-12,
"loss": 1.3428,
"mean_token_accuracy": 0.7214452266693115,
"num_tokens": 470260118.0,
"step": 7470
},
{
"epoch": 24.428805237315874,
"grad_norm": 12.849320327271007,
"learning_rate": 8.05737880624824e-12,
"loss": 1.3236,
"mean_token_accuracy": 0.7250566124916077,
"num_tokens": 470576565.0,
"step": 7475
},
{
"epoch": 24.44517184942717,
"grad_norm": 12.474474797345762,
"learning_rate": 7.606305106155897e-12,
"loss": 1.3249,
"mean_token_accuracy": 0.7245312631130219,
"num_tokens": 470893001.0,
"step": 7480
},
{
"epoch": 24.46153846153846,
"grad_norm": 13.300980774411267,
"learning_rate": 7.168204642119813e-12,
"loss": 1.3604,
"mean_token_accuracy": 0.7191662549972534,
"num_tokens": 471208820.0,
"step": 7485
},
{
"epoch": 24.477905073649755,
"grad_norm": 12.968424055077652,
"learning_rate": 6.743079694510601e-12,
"loss": 1.362,
"mean_token_accuracy": 0.7179892778396606,
"num_tokens": 471525954.0,
"step": 7490
},
{
"epoch": 24.49427168576105,
"grad_norm": 12.536855830459842,
"learning_rate": 6.33093247615929e-12,
"loss": 1.3545,
"mean_token_accuracy": 0.7187951862812042,
"num_tokens": 471840937.0,
"step": 7495
},
{
"epoch": 24.51063829787234,
"grad_norm": 13.178979626599647,
"learning_rate": 5.931765132346223e-12,
"loss": 1.3329,
"mean_token_accuracy": 0.7232393980026245,
"num_tokens": 472157742.0,
"step": 7500
},
{
"epoch": 24.527004909983635,
"grad_norm": 12.475923633074265,
"learning_rate": 5.545579740789397e-12,
"loss": 1.3248,
"mean_token_accuracy": 0.7247317433357239,
"num_tokens": 472474594.0,
"step": 7505
},
{
"epoch": 24.543371522094926,
"grad_norm": 13.289472428562675,
"learning_rate": 5.1723783116350284e-12,
"loss": 1.3621,
"mean_token_accuracy": 0.7177056550979615,
"num_tokens": 472790990.0,
"step": 7510
},
{
"epoch": 24.55973813420622,
"grad_norm": 12.370563451684093,
"learning_rate": 4.812162787445063e-12,
"loss": 1.3681,
"mean_token_accuracy": 0.7161191761493683,
"num_tokens": 473106808.0,
"step": 7515
},
{
"epoch": 24.57610474631751,
"grad_norm": 13.28225745876113,
"learning_rate": 4.464935043188567e-12,
"loss": 1.3724,
"mean_token_accuracy": 0.7154851138591767,
"num_tokens": 473421989.0,
"step": 7520
},
{
"epoch": 24.592471358428806,
"grad_norm": 13.301245647293618,
"learning_rate": 4.130696886231744e-12,
"loss": 1.3605,
"mean_token_accuracy": 0.7182529032230377,
"num_tokens": 473736855.0,
"step": 7525
},
{
"epoch": 24.608837970540097,
"grad_norm": 12.426546296528416,
"learning_rate": 3.809450056327934e-12,
"loss": 1.3496,
"mean_token_accuracy": 0.7204344034194946,
"num_tokens": 474052212.0,
"step": 7530
},
{
"epoch": 24.625204582651392,
"grad_norm": 12.648395109151407,
"learning_rate": 3.501196225608738e-12,
"loss": 1.3309,
"mean_token_accuracy": 0.7240665137767792,
"num_tokens": 474369338.0,
"step": 7535
},
{
"epoch": 24.641571194762683,
"grad_norm": 12.734272716729766,
"learning_rate": 3.2059369985762423e-12,
"loss": 1.3427,
"mean_token_accuracy": 0.7211298882961273,
"num_tokens": 474685028.0,
"step": 7540
},
{
"epoch": 24.657937806873978,
"grad_norm": 13.209935222715595,
"learning_rate": 2.923673912093028e-12,
"loss": 1.3385,
"mean_token_accuracy": 0.721641993522644,
"num_tokens": 475001822.0,
"step": 7545
},
{
"epoch": 24.67430441898527,
"grad_norm": 12.898389985570757,
"learning_rate": 2.654408435375788e-12,
"loss": 1.3624,
"mean_token_accuracy": 0.7173764705657959,
"num_tokens": 475316276.0,
"step": 7550
},
{
"epoch": 24.690671031096564,
"grad_norm": 12.541776646335519,
"learning_rate": 2.398141969986445e-12,
"loss": 1.3503,
"mean_token_accuracy": 0.720172131061554,
"num_tokens": 475630298.0,
"step": 7555
},
{
"epoch": 24.707037643207855,
"grad_norm": 13.762831164718952,
"learning_rate": 2.154875849825766e-12,
"loss": 1.3443,
"mean_token_accuracy": 0.7212839305400849,
"num_tokens": 475944133.0,
"step": 7560
},
{
"epoch": 24.72340425531915,
"grad_norm": 12.460693550510845,
"learning_rate": 1.924611341125315e-12,
"loss": 1.3492,
"mean_token_accuracy": 0.7199400305747986,
"num_tokens": 476257411.0,
"step": 7565
},
{
"epoch": 24.73977086743044,
"grad_norm": 12.636399236316878,
"learning_rate": 1.7073496424427347e-12,
"loss": 1.3606,
"mean_token_accuracy": 0.7177873611450195,
"num_tokens": 476572234.0,
"step": 7570
},
{
"epoch": 24.756137479541735,
"grad_norm": 12.806079318268207,
"learning_rate": 1.5030918846534182e-12,
"loss": 1.3509,
"mean_token_accuracy": 0.7192403972148895,
"num_tokens": 476885443.0,
"step": 7575
},
{
"epoch": 24.772504091653026,
"grad_norm": 12.87340982617205,
"learning_rate": 1.3118391309455136e-12,
"loss": 1.3321,
"mean_token_accuracy": 0.7227283775806427,
"num_tokens": 477200707.0,
"step": 7580
},
{
"epoch": 24.78887070376432,
"grad_norm": 13.196133476693376,
"learning_rate": 1.1335923768149292e-12,
"loss": 1.3599,
"mean_token_accuracy": 0.7180555760860443,
"num_tokens": 477517157.0,
"step": 7585
},
{
"epoch": 24.805237315875615,
"grad_norm": 12.525929111718444,
"learning_rate": 9.68352550059226e-13,
"loss": 1.3507,
"mean_token_accuracy": 0.7201619267463684,
"num_tokens": 477832923.0,
"step": 7590
},
{
"epoch": 24.821603927986907,
"grad_norm": 12.495416050685176,
"learning_rate": 8.161205107737324e-13,
"loss": 1.3392,
"mean_token_accuracy": 0.7222834944725036,
"num_tokens": 478149099.0,
"step": 7595
},
{
"epoch": 24.8379705400982,
"grad_norm": 12.655900316269998,
"learning_rate": 6.768970513457151e-13,
"loss": 1.3523,
"mean_token_accuracy": 0.7203588485717773,
"num_tokens": 478464829.0,
"step": 7600
},
{
"epoch": 24.854337152209492,
"grad_norm": 13.126910205340408,
"learning_rate": 5.506828964518818e-13,
"loss": 1.351,
"mean_token_accuracy": 0.7200382113456726,
"num_tokens": 478780816.0,
"step": 7605
},
{
"epoch": 24.870703764320787,
"grad_norm": 12.611751802636238,
"learning_rate": 4.3747870305338443e-13,
"loss": 1.3485,
"mean_token_accuracy": 0.7205796897411346,
"num_tokens": 479097853.0,
"step": 7610
},
{
"epoch": 24.887070376432078,
"grad_norm": 13.003538205209928,
"learning_rate": 3.3728506039276683e-13,
"loss": 1.3477,
"mean_token_accuracy": 0.7211360156536102,
"num_tokens": 479413268.0,
"step": 7615
},
{
"epoch": 24.903436988543373,
"grad_norm": 12.900651331291833,
"learning_rate": 2.501024899914661e-13,
"loss": 1.3452,
"mean_token_accuracy": 0.7210480213165283,
"num_tokens": 479730891.0,
"step": 7620
},
{
"epoch": 24.919803600654664,
"grad_norm": 13.073814232685633,
"learning_rate": 1.7593144564564956e-13,
"loss": 1.34,
"mean_token_accuracy": 0.7227759063243866,
"num_tokens": 480044395.0,
"step": 7625
},
{
"epoch": 24.93617021276596,
"grad_norm": 12.350548181385452,
"learning_rate": 1.1477231342538198e-13,
"loss": 1.3295,
"mean_token_accuracy": 0.7247556269168853,
"num_tokens": 480359255.0,
"step": 7630
},
{
"epoch": 24.95253682487725,
"grad_norm": 13.327420468075404,
"learning_rate": 6.662541167240521e-14,
"loss": 1.3351,
"mean_token_accuracy": 0.7232851803302764,
"num_tokens": 480675647.0,
"step": 7635
},
{
"epoch": 24.968903436988544,
"grad_norm": 12.389629604640263,
"learning_rate": 3.1490990997362634e-14,
"loss": 1.3502,
"mean_token_accuracy": 0.7206938445568085,
"num_tokens": 480991118.0,
"step": 7640
},
{
"epoch": 24.985270049099835,
"grad_norm": 12.850968329093641,
"learning_rate": 9.369234279799077e-15,
"loss": 1.3521,
"mean_token_accuracy": 0.7204296112060546,
"num_tokens": 481307400.0,
"step": 7645
},
{
"epoch": 25.0,
"grad_norm": 13.285064991937862,
"learning_rate": 2.6025666594042817e-16,
"loss": 1.3573,
"mean_token_accuracy": 0.7185248600112067,
"num_tokens": 481567917.0,
"step": 7650
},
{
"epoch": 25.0,
"step": 7650,
"total_flos": 567260733210624.0,
"train_loss": 1.8163700196010615,
"train_runtime": 15926.6953,
"train_samples_per_second": 30.644,
"train_steps_per_second": 0.48
}
],
"logging_steps": 5,
"max_steps": 7650,
"num_input_tokens_seen": 0,
"num_train_epochs": 25,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 567260733210624.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}