Files
Qwen1.5-MOE-sft-math7k-sft-…/trainer_state.json
ModelHub XC 467ad095ae 初始化项目,由ModelHub XC社区提供模型
Model: xd2010/Qwen1.5-MOE-sft-math7k-sft-2epochs-frozen-router
Source: Original Platform
2026-04-11 22:26:02 +08:00

3484 lines
94 KiB
JSON

{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 430,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004651162790697674,
"grad_norm": 19.502025604248047,
"learning_rate": 0.0,
"loss": 0.842,
"mean_token_accuracy": 0.8234314918518066,
"step": 1
},
{
"epoch": 0.009302325581395349,
"grad_norm": 21.47846031188965,
"learning_rate": 2.3255813953488374e-07,
"loss": 0.8778,
"mean_token_accuracy": 0.8233309388160706,
"step": 2
},
{
"epoch": 0.013953488372093023,
"grad_norm": 19.13929557800293,
"learning_rate": 4.651162790697675e-07,
"loss": 0.8513,
"mean_token_accuracy": 0.8330637216567993,
"step": 3
},
{
"epoch": 0.018604651162790697,
"grad_norm": 18.666189193725586,
"learning_rate": 6.976744186046513e-07,
"loss": 0.8217,
"mean_token_accuracy": 0.8303450345993042,
"step": 4
},
{
"epoch": 0.023255813953488372,
"grad_norm": 22.670207977294922,
"learning_rate": 9.30232558139535e-07,
"loss": 0.9557,
"mean_token_accuracy": 0.8066383600234985,
"step": 5
},
{
"epoch": 0.027906976744186046,
"grad_norm": 21.466005325317383,
"learning_rate": 1.1627906976744188e-06,
"loss": 0.845,
"mean_token_accuracy": 0.8213537931442261,
"step": 6
},
{
"epoch": 0.03255813953488372,
"grad_norm": 20.89145851135254,
"learning_rate": 1.3953488372093025e-06,
"loss": 0.8985,
"mean_token_accuracy": 0.8202102184295654,
"step": 7
},
{
"epoch": 0.037209302325581395,
"grad_norm": 13.6134614944458,
"learning_rate": 1.6279069767441862e-06,
"loss": 0.755,
"mean_token_accuracy": 0.8332804441452026,
"step": 8
},
{
"epoch": 0.04186046511627907,
"grad_norm": 12.933883666992188,
"learning_rate": 1.86046511627907e-06,
"loss": 0.6942,
"mean_token_accuracy": 0.8436604738235474,
"step": 9
},
{
"epoch": 0.046511627906976744,
"grad_norm": 11.615815162658691,
"learning_rate": 2.0930232558139536e-06,
"loss": 0.645,
"mean_token_accuracy": 0.8547393083572388,
"step": 10
},
{
"epoch": 0.05116279069767442,
"grad_norm": 11.619194030761719,
"learning_rate": 2.3255813953488376e-06,
"loss": 0.5963,
"mean_token_accuracy": 0.8689830303192139,
"step": 11
},
{
"epoch": 0.05581395348837209,
"grad_norm": 9.622608184814453,
"learning_rate": 2.558139534883721e-06,
"loss": 0.6343,
"mean_token_accuracy": 0.8541088104248047,
"step": 12
},
{
"epoch": 0.06046511627906977,
"grad_norm": 8.54611873626709,
"learning_rate": 2.790697674418605e-06,
"loss": 0.5877,
"mean_token_accuracy": 0.8466028571128845,
"step": 13
},
{
"epoch": 0.06511627906976744,
"grad_norm": 9.963361740112305,
"learning_rate": 3.0232558139534885e-06,
"loss": 0.4568,
"mean_token_accuracy": 0.8818826079368591,
"step": 14
},
{
"epoch": 0.06976744186046512,
"grad_norm": 8.291352272033691,
"learning_rate": 3.2558139534883724e-06,
"loss": 0.4129,
"mean_token_accuracy": 0.8876066207885742,
"step": 15
},
{
"epoch": 0.07441860465116279,
"grad_norm": 7.158071994781494,
"learning_rate": 3.4883720930232564e-06,
"loss": 0.4261,
"mean_token_accuracy": 0.8846276998519897,
"step": 16
},
{
"epoch": 0.07906976744186046,
"grad_norm": 4.239832878112793,
"learning_rate": 3.72093023255814e-06,
"loss": 0.3677,
"mean_token_accuracy": 0.8992332816123962,
"step": 17
},
{
"epoch": 0.08372093023255814,
"grad_norm": 4.137484073638916,
"learning_rate": 3.953488372093024e-06,
"loss": 0.3967,
"mean_token_accuracy": 0.8831896185874939,
"step": 18
},
{
"epoch": 0.08837209302325581,
"grad_norm": 7.036076068878174,
"learning_rate": 4.186046511627907e-06,
"loss": 0.4065,
"mean_token_accuracy": 0.8782775402069092,
"step": 19
},
{
"epoch": 0.09302325581395349,
"grad_norm": 3.3408806324005127,
"learning_rate": 4.418604651162791e-06,
"loss": 0.3653,
"mean_token_accuracy": 0.8928460478782654,
"step": 20
},
{
"epoch": 0.09767441860465116,
"grad_norm": 3.5764951705932617,
"learning_rate": 4.651162790697675e-06,
"loss": 0.4158,
"mean_token_accuracy": 0.8744533658027649,
"step": 21
},
{
"epoch": 0.10232558139534884,
"grad_norm": 3.2680561542510986,
"learning_rate": 4.883720930232559e-06,
"loss": 0.3608,
"mean_token_accuracy": 0.8843502998352051,
"step": 22
},
{
"epoch": 0.10697674418604651,
"grad_norm": 3.0347847938537598,
"learning_rate": 5.116279069767442e-06,
"loss": 0.3442,
"mean_token_accuracy": 0.8947029709815979,
"step": 23
},
{
"epoch": 0.11162790697674418,
"grad_norm": 3.145249843597412,
"learning_rate": 5.348837209302326e-06,
"loss": 0.3726,
"mean_token_accuracy": 0.882703423500061,
"step": 24
},
{
"epoch": 0.11627906976744186,
"grad_norm": 3.2718427181243896,
"learning_rate": 5.58139534883721e-06,
"loss": 0.3467,
"mean_token_accuracy": 0.8916211128234863,
"step": 25
},
{
"epoch": 0.12093023255813953,
"grad_norm": 3.159402847290039,
"learning_rate": 5.8139534883720935e-06,
"loss": 0.3374,
"mean_token_accuracy": 0.8960058689117432,
"step": 26
},
{
"epoch": 0.12558139534883722,
"grad_norm": 3.0180864334106445,
"learning_rate": 6.046511627906977e-06,
"loss": 0.3579,
"mean_token_accuracy": 0.8927838802337646,
"step": 27
},
{
"epoch": 0.13023255813953488,
"grad_norm": 3.1647789478302,
"learning_rate": 6.279069767441861e-06,
"loss": 0.3569,
"mean_token_accuracy": 0.8896810412406921,
"step": 28
},
{
"epoch": 0.13488372093023257,
"grad_norm": 2.948369264602661,
"learning_rate": 6.511627906976745e-06,
"loss": 0.3782,
"mean_token_accuracy": 0.8793545365333557,
"step": 29
},
{
"epoch": 0.13953488372093023,
"grad_norm": 3.0937540531158447,
"learning_rate": 6.744186046511628e-06,
"loss": 0.3703,
"mean_token_accuracy": 0.8813475966453552,
"step": 30
},
{
"epoch": 0.14418604651162792,
"grad_norm": 3.3827579021453857,
"learning_rate": 6.976744186046513e-06,
"loss": 0.3959,
"mean_token_accuracy": 0.8739346861839294,
"step": 31
},
{
"epoch": 0.14883720930232558,
"grad_norm": 3.005185127258301,
"learning_rate": 7.209302325581395e-06,
"loss": 0.3191,
"mean_token_accuracy": 0.9007467031478882,
"step": 32
},
{
"epoch": 0.15348837209302327,
"grad_norm": 3.6860156059265137,
"learning_rate": 7.44186046511628e-06,
"loss": 0.3518,
"mean_token_accuracy": 0.8902492523193359,
"step": 33
},
{
"epoch": 0.15813953488372093,
"grad_norm": 2.850628137588501,
"learning_rate": 7.674418604651164e-06,
"loss": 0.3055,
"mean_token_accuracy": 0.9046649932861328,
"step": 34
},
{
"epoch": 0.16279069767441862,
"grad_norm": 3.145270347595215,
"learning_rate": 7.906976744186048e-06,
"loss": 0.3553,
"mean_token_accuracy": 0.893163800239563,
"step": 35
},
{
"epoch": 0.16744186046511628,
"grad_norm": 3.0683932304382324,
"learning_rate": 8.139534883720931e-06,
"loss": 0.3326,
"mean_token_accuracy": 0.8977251648902893,
"step": 36
},
{
"epoch": 0.17209302325581396,
"grad_norm": 3.071561336517334,
"learning_rate": 8.372093023255815e-06,
"loss": 0.3218,
"mean_token_accuracy": 0.9021276831626892,
"step": 37
},
{
"epoch": 0.17674418604651163,
"grad_norm": 3.128781318664551,
"learning_rate": 8.604651162790698e-06,
"loss": 0.3417,
"mean_token_accuracy": 0.899016261100769,
"step": 38
},
{
"epoch": 0.1813953488372093,
"grad_norm": 2.9364817142486572,
"learning_rate": 8.837209302325582e-06,
"loss": 0.3202,
"mean_token_accuracy": 0.9026311039924622,
"step": 39
},
{
"epoch": 0.18604651162790697,
"grad_norm": 2.9265284538269043,
"learning_rate": 9.069767441860465e-06,
"loss": 0.3426,
"mean_token_accuracy": 0.8922522664070129,
"step": 40
},
{
"epoch": 0.19069767441860466,
"grad_norm": 2.898973226547241,
"learning_rate": 9.30232558139535e-06,
"loss": 0.3363,
"mean_token_accuracy": 0.8978930115699768,
"step": 41
},
{
"epoch": 0.19534883720930232,
"grad_norm": 3.2877488136291504,
"learning_rate": 9.534883720930234e-06,
"loss": 0.3378,
"mean_token_accuracy": 0.8954758048057556,
"step": 42
},
{
"epoch": 0.2,
"grad_norm": 3.0985326766967773,
"learning_rate": 9.767441860465117e-06,
"loss": 0.3116,
"mean_token_accuracy": 0.9019308090209961,
"step": 43
},
{
"epoch": 0.20465116279069767,
"grad_norm": 2.912959575653076,
"learning_rate": 1e-05,
"loss": 0.3394,
"mean_token_accuracy": 0.8979052901268005,
"step": 44
},
{
"epoch": 0.20930232558139536,
"grad_norm": 3.3199048042297363,
"learning_rate": 9.999851728408726e-06,
"loss": 0.3074,
"mean_token_accuracy": 0.9037721157073975,
"step": 45
},
{
"epoch": 0.21395348837209302,
"grad_norm": 3.068047046661377,
"learning_rate": 9.999406923405777e-06,
"loss": 0.3081,
"mean_token_accuracy": 0.9021856188774109,
"step": 46
},
{
"epoch": 0.2186046511627907,
"grad_norm": 2.947935104370117,
"learning_rate": 9.998665614303127e-06,
"loss": 0.3174,
"mean_token_accuracy": 0.900806725025177,
"step": 47
},
{
"epoch": 0.22325581395348837,
"grad_norm": 2.833920478820801,
"learning_rate": 9.997627849951926e-06,
"loss": 0.3281,
"mean_token_accuracy": 0.8946992754936218,
"step": 48
},
{
"epoch": 0.22790697674418606,
"grad_norm": 3.0928986072540283,
"learning_rate": 9.996293698739271e-06,
"loss": 0.3201,
"mean_token_accuracy": 0.8966386318206787,
"step": 49
},
{
"epoch": 0.23255813953488372,
"grad_norm": 2.9233152866363525,
"learning_rate": 9.994663248583704e-06,
"loss": 0.3263,
"mean_token_accuracy": 0.8985762000083923,
"step": 50
},
{
"epoch": 0.2372093023255814,
"grad_norm": 3.122283697128296,
"learning_rate": 9.992736606929422e-06,
"loss": 0.3401,
"mean_token_accuracy": 0.8825533390045166,
"step": 51
},
{
"epoch": 0.24186046511627907,
"grad_norm": 2.8861806392669678,
"learning_rate": 9.990513900739192e-06,
"loss": 0.3384,
"mean_token_accuracy": 0.8865090012550354,
"step": 52
},
{
"epoch": 0.24651162790697675,
"grad_norm": 3.0938143730163574,
"learning_rate": 9.987995276485984e-06,
"loss": 0.3231,
"mean_token_accuracy": 0.8968112468719482,
"step": 53
},
{
"epoch": 0.25116279069767444,
"grad_norm": 2.87567400932312,
"learning_rate": 9.985180900143318e-06,
"loss": 0.3352,
"mean_token_accuracy": 0.8892236948013306,
"step": 54
},
{
"epoch": 0.2558139534883721,
"grad_norm": 2.760570764541626,
"learning_rate": 9.982070957174334e-06,
"loss": 0.3362,
"mean_token_accuracy": 0.8886131048202515,
"step": 55
},
{
"epoch": 0.26046511627906976,
"grad_norm": 2.875344753265381,
"learning_rate": 9.978665652519562e-06,
"loss": 0.3292,
"mean_token_accuracy": 0.8967674374580383,
"step": 56
},
{
"epoch": 0.2651162790697674,
"grad_norm": 2.552971124649048,
"learning_rate": 9.97496521058342e-06,
"loss": 0.2887,
"mean_token_accuracy": 0.9075325727462769,
"step": 57
},
{
"epoch": 0.26976744186046514,
"grad_norm": 2.6377763748168945,
"learning_rate": 9.970969875219422e-06,
"loss": 0.316,
"mean_token_accuracy": 0.8951440453529358,
"step": 58
},
{
"epoch": 0.2744186046511628,
"grad_norm": 2.8528380393981934,
"learning_rate": 9.96667990971412e-06,
"loss": 0.2988,
"mean_token_accuracy": 0.9043198823928833,
"step": 59
},
{
"epoch": 0.27906976744186046,
"grad_norm": 2.762105703353882,
"learning_rate": 9.962095596769738e-06,
"loss": 0.3143,
"mean_token_accuracy": 0.894010066986084,
"step": 60
},
{
"epoch": 0.2837209302325581,
"grad_norm": 2.994401216506958,
"learning_rate": 9.957217238485557e-06,
"loss": 0.3472,
"mean_token_accuracy": 0.8801930546760559,
"step": 61
},
{
"epoch": 0.28837209302325584,
"grad_norm": 2.898961067199707,
"learning_rate": 9.952045156337998e-06,
"loss": 0.345,
"mean_token_accuracy": 0.8863232731819153,
"step": 62
},
{
"epoch": 0.2930232558139535,
"grad_norm": 2.6734066009521484,
"learning_rate": 9.946579691159434e-06,
"loss": 0.3159,
"mean_token_accuracy": 0.8985881805419922,
"step": 63
},
{
"epoch": 0.29767441860465116,
"grad_norm": 2.6497905254364014,
"learning_rate": 9.940821203115742e-06,
"loss": 0.3156,
"mean_token_accuracy": 0.8967551589012146,
"step": 64
},
{
"epoch": 0.3023255813953488,
"grad_norm": 2.952893018722534,
"learning_rate": 9.934770071682563e-06,
"loss": 0.3379,
"mean_token_accuracy": 0.8862144351005554,
"step": 65
},
{
"epoch": 0.30697674418604654,
"grad_norm": 2.4500250816345215,
"learning_rate": 9.928426695620288e-06,
"loss": 0.3051,
"mean_token_accuracy": 0.8942438960075378,
"step": 66
},
{
"epoch": 0.3116279069767442,
"grad_norm": 2.6351726055145264,
"learning_rate": 9.92179149294779e-06,
"loss": 0.3226,
"mean_token_accuracy": 0.8906832337379456,
"step": 67
},
{
"epoch": 0.31627906976744186,
"grad_norm": 2.530116558074951,
"learning_rate": 9.914864900914875e-06,
"loss": 0.2882,
"mean_token_accuracy": 0.899300217628479,
"step": 68
},
{
"epoch": 0.3209302325581395,
"grad_norm": 2.615708827972412,
"learning_rate": 9.907647375973461e-06,
"loss": 0.309,
"mean_token_accuracy": 0.8936994075775146,
"step": 69
},
{
"epoch": 0.32558139534883723,
"grad_norm": 2.7337634563446045,
"learning_rate": 9.90013939374751e-06,
"loss": 0.3138,
"mean_token_accuracy": 0.8906617164611816,
"step": 70
},
{
"epoch": 0.3302325581395349,
"grad_norm": 2.6460962295532227,
"learning_rate": 9.892341449001673e-06,
"loss": 0.3118,
"mean_token_accuracy": 0.8964554071426392,
"step": 71
},
{
"epoch": 0.33488372093023255,
"grad_norm": 2.540764093399048,
"learning_rate": 9.884254055608696e-06,
"loss": 0.3142,
"mean_token_accuracy": 0.8957387208938599,
"step": 72
},
{
"epoch": 0.3395348837209302,
"grad_norm": 2.7574427127838135,
"learning_rate": 9.875877746515556e-06,
"loss": 0.3377,
"mean_token_accuracy": 0.8824289441108704,
"step": 73
},
{
"epoch": 0.34418604651162793,
"grad_norm": 2.4597768783569336,
"learning_rate": 9.867213073708324e-06,
"loss": 0.2907,
"mean_token_accuracy": 0.8995174169540405,
"step": 74
},
{
"epoch": 0.3488372093023256,
"grad_norm": 2.7972140312194824,
"learning_rate": 9.858260608175816e-06,
"loss": 0.3253,
"mean_token_accuracy": 0.8898718357086182,
"step": 75
},
{
"epoch": 0.35348837209302325,
"grad_norm": 2.4993696212768555,
"learning_rate": 9.849020939871951e-06,
"loss": 0.2904,
"mean_token_accuracy": 0.9023594856262207,
"step": 76
},
{
"epoch": 0.3581395348837209,
"grad_norm": 2.5091350078582764,
"learning_rate": 9.839494677676865e-06,
"loss": 0.3146,
"mean_token_accuracy": 0.8932511210441589,
"step": 77
},
{
"epoch": 0.3627906976744186,
"grad_norm": 2.4673449993133545,
"learning_rate": 9.829682449356807e-06,
"loss": 0.2777,
"mean_token_accuracy": 0.9046984314918518,
"step": 78
},
{
"epoch": 0.3674418604651163,
"grad_norm": 2.791132688522339,
"learning_rate": 9.819584901522761e-06,
"loss": 0.3151,
"mean_token_accuracy": 0.8915500044822693,
"step": 79
},
{
"epoch": 0.37209302325581395,
"grad_norm": 2.322134017944336,
"learning_rate": 9.809202699587828e-06,
"loss": 0.3315,
"mean_token_accuracy": 0.8860378861427307,
"step": 80
},
{
"epoch": 0.3767441860465116,
"grad_norm": 2.75315523147583,
"learning_rate": 9.798536527723388e-06,
"loss": 0.2886,
"mean_token_accuracy": 0.8997381329536438,
"step": 81
},
{
"epoch": 0.3813953488372093,
"grad_norm": 2.623223066329956,
"learning_rate": 9.787587088814007e-06,
"loss": 0.2831,
"mean_token_accuracy": 0.902046799659729,
"step": 82
},
{
"epoch": 0.386046511627907,
"grad_norm": 2.417131185531616,
"learning_rate": 9.776355104411123e-06,
"loss": 0.2905,
"mean_token_accuracy": 0.902239978313446,
"step": 83
},
{
"epoch": 0.39069767441860465,
"grad_norm": 2.4174351692199707,
"learning_rate": 9.764841314685487e-06,
"loss": 0.3314,
"mean_token_accuracy": 0.8844476938247681,
"step": 84
},
{
"epoch": 0.3953488372093023,
"grad_norm": 2.363462448120117,
"learning_rate": 9.753046478378403e-06,
"loss": 0.3021,
"mean_token_accuracy": 0.8904982805252075,
"step": 85
},
{
"epoch": 0.4,
"grad_norm": 2.6582350730895996,
"learning_rate": 9.740971372751715e-06,
"loss": 0.3318,
"mean_token_accuracy": 0.8901240825653076,
"step": 86
},
{
"epoch": 0.4046511627906977,
"grad_norm": 2.460733652114868,
"learning_rate": 9.728616793536588e-06,
"loss": 0.3071,
"mean_token_accuracy": 0.8879771828651428,
"step": 87
},
{
"epoch": 0.40930232558139534,
"grad_norm": 3.4467663764953613,
"learning_rate": 9.715983554881077e-06,
"loss": 0.3561,
"mean_token_accuracy": 0.8794929385185242,
"step": 88
},
{
"epoch": 0.413953488372093,
"grad_norm": 2.4044029712677,
"learning_rate": 9.703072489296467e-06,
"loss": 0.3067,
"mean_token_accuracy": 0.9014084339141846,
"step": 89
},
{
"epoch": 0.4186046511627907,
"grad_norm": 2.3434884548187256,
"learning_rate": 9.689884447602423e-06,
"loss": 0.2613,
"mean_token_accuracy": 0.9113546013832092,
"step": 90
},
{
"epoch": 0.4232558139534884,
"grad_norm": 2.498213529586792,
"learning_rate": 9.67642029887091e-06,
"loss": 0.2702,
"mean_token_accuracy": 0.9063438177108765,
"step": 91
},
{
"epoch": 0.42790697674418604,
"grad_norm": 2.4761126041412354,
"learning_rate": 9.662680930368934e-06,
"loss": 0.3227,
"mean_token_accuracy": 0.8934637308120728,
"step": 92
},
{
"epoch": 0.4325581395348837,
"grad_norm": 2.3710319995880127,
"learning_rate": 9.648667247500065e-06,
"loss": 0.2956,
"mean_token_accuracy": 0.8998901844024658,
"step": 93
},
{
"epoch": 0.4372093023255814,
"grad_norm": 2.3822901248931885,
"learning_rate": 9.634380173744771e-06,
"loss": 0.3107,
"mean_token_accuracy": 0.8973243236541748,
"step": 94
},
{
"epoch": 0.4418604651162791,
"grad_norm": 2.4040417671203613,
"learning_rate": 9.619820650599568e-06,
"loss": 0.305,
"mean_token_accuracy": 0.8939420580863953,
"step": 95
},
{
"epoch": 0.44651162790697674,
"grad_norm": 2.35666823387146,
"learning_rate": 9.604989637514976e-06,
"loss": 0.2911,
"mean_token_accuracy": 0.9011731147766113,
"step": 96
},
{
"epoch": 0.4511627906976744,
"grad_norm": 2.3194568157196045,
"learning_rate": 9.589888111832284e-06,
"loss": 0.2524,
"mean_token_accuracy": 0.910751223564148,
"step": 97
},
{
"epoch": 0.4558139534883721,
"grad_norm": 2.354339838027954,
"learning_rate": 9.57451706871916e-06,
"loss": 0.3063,
"mean_token_accuracy": 0.8894273638725281,
"step": 98
},
{
"epoch": 0.4604651162790698,
"grad_norm": 2.2645998001098633,
"learning_rate": 9.558877521104059e-06,
"loss": 0.2627,
"mean_token_accuracy": 0.9070965647697449,
"step": 99
},
{
"epoch": 0.46511627906976744,
"grad_norm": 2.35956072807312,
"learning_rate": 9.54297049960947e-06,
"loss": 0.3314,
"mean_token_accuracy": 0.8871257901191711,
"step": 100
},
{
"epoch": 0.4697674418604651,
"grad_norm": 2.3613924980163574,
"learning_rate": 9.526797052484013e-06,
"loss": 0.3145,
"mean_token_accuracy": 0.8908148407936096,
"step": 101
},
{
"epoch": 0.4744186046511628,
"grad_norm": 2.3541440963745117,
"learning_rate": 9.510358245533355e-06,
"loss": 0.3106,
"mean_token_accuracy": 0.8964611291885376,
"step": 102
},
{
"epoch": 0.4790697674418605,
"grad_norm": 2.2093734741210938,
"learning_rate": 9.493655162049963e-06,
"loss": 0.3013,
"mean_token_accuracy": 0.8970993161201477,
"step": 103
},
{
"epoch": 0.48372093023255813,
"grad_norm": 2.1879260540008545,
"learning_rate": 9.476688902741737e-06,
"loss": 0.2825,
"mean_token_accuracy": 0.9041797518730164,
"step": 104
},
{
"epoch": 0.4883720930232558,
"grad_norm": 2.3790950775146484,
"learning_rate": 9.459460585659461e-06,
"loss": 0.3142,
"mean_token_accuracy": 0.8897626399993896,
"step": 105
},
{
"epoch": 0.4930232558139535,
"grad_norm": 2.383648633956909,
"learning_rate": 9.44197134612313e-06,
"loss": 0.2819,
"mean_token_accuracy": 0.9058862924575806,
"step": 106
},
{
"epoch": 0.49767441860465117,
"grad_norm": 2.3105363845825195,
"learning_rate": 9.424222336647135e-06,
"loss": 0.3438,
"mean_token_accuracy": 0.889424741268158,
"step": 107
},
{
"epoch": 0.5023255813953489,
"grad_norm": 2.4852828979492188,
"learning_rate": 9.406214726864308e-06,
"loss": 0.3239,
"mean_token_accuracy": 0.884105384349823,
"step": 108
},
{
"epoch": 0.5069767441860465,
"grad_norm": 2.5007684230804443,
"learning_rate": 9.387949703448855e-06,
"loss": 0.2802,
"mean_token_accuracy": 0.9021934270858765,
"step": 109
},
{
"epoch": 0.5116279069767442,
"grad_norm": 2.4788923263549805,
"learning_rate": 9.369428470038146e-06,
"loss": 0.2924,
"mean_token_accuracy": 0.8947466611862183,
"step": 110
},
{
"epoch": 0.5162790697674419,
"grad_norm": 2.3672244548797607,
"learning_rate": 9.350652247153405e-06,
"loss": 0.317,
"mean_token_accuracy": 0.8881039023399353,
"step": 111
},
{
"epoch": 0.5209302325581395,
"grad_norm": 2.3644192218780518,
"learning_rate": 9.331622272119272e-06,
"loss": 0.2936,
"mean_token_accuracy": 0.897639274597168,
"step": 112
},
{
"epoch": 0.5255813953488372,
"grad_norm": 2.3581173419952393,
"learning_rate": 9.312339798982271e-06,
"loss": 0.3333,
"mean_token_accuracy": 0.8910724520683289,
"step": 113
},
{
"epoch": 0.5302325581395348,
"grad_norm": 2.267174005508423,
"learning_rate": 9.292806098428174e-06,
"loss": 0.3053,
"mean_token_accuracy": 0.9002794027328491,
"step": 114
},
{
"epoch": 0.5348837209302325,
"grad_norm": 2.5837225914001465,
"learning_rate": 9.27302245769825e-06,
"loss": 0.3058,
"mean_token_accuracy": 0.8905591368675232,
"step": 115
},
{
"epoch": 0.5395348837209303,
"grad_norm": 2.686472177505493,
"learning_rate": 9.252990180504451e-06,
"loss": 0.3117,
"mean_token_accuracy": 0.89449542760849,
"step": 116
},
{
"epoch": 0.5441860465116279,
"grad_norm": 2.654763698577881,
"learning_rate": 9.232710586943498e-06,
"loss": 0.3193,
"mean_token_accuracy": 0.8888066411018372,
"step": 117
},
{
"epoch": 0.5488372093023256,
"grad_norm": 2.2310616970062256,
"learning_rate": 9.21218501340988e-06,
"loss": 0.2677,
"mean_token_accuracy": 0.9084425568580627,
"step": 118
},
{
"epoch": 0.5534883720930233,
"grad_norm": 2.363785743713379,
"learning_rate": 9.1914148125078e-06,
"loss": 0.2578,
"mean_token_accuracy": 0.9098054766654968,
"step": 119
},
{
"epoch": 0.5581395348837209,
"grad_norm": 2.268791437149048,
"learning_rate": 9.170401352962028e-06,
"loss": 0.2621,
"mean_token_accuracy": 0.9081822633743286,
"step": 120
},
{
"epoch": 0.5627906976744186,
"grad_norm": 2.216200590133667,
"learning_rate": 9.149146019527715e-06,
"loss": 0.2358,
"mean_token_accuracy": 0.9134396314620972,
"step": 121
},
{
"epoch": 0.5674418604651162,
"grad_norm": 2.481264114379883,
"learning_rate": 9.127650212899133e-06,
"loss": 0.3148,
"mean_token_accuracy": 0.8915533423423767,
"step": 122
},
{
"epoch": 0.5720930232558139,
"grad_norm": 2.7739858627319336,
"learning_rate": 9.105915349617372e-06,
"loss": 0.3498,
"mean_token_accuracy": 0.8751838207244873,
"step": 123
},
{
"epoch": 0.5767441860465117,
"grad_norm": 2.3887064456939697,
"learning_rate": 9.083942861976991e-06,
"loss": 0.3148,
"mean_token_accuracy": 0.8894332647323608,
"step": 124
},
{
"epoch": 0.5813953488372093,
"grad_norm": 2.2876875400543213,
"learning_rate": 9.061734197931645e-06,
"loss": 0.2463,
"mean_token_accuracy": 0.9108673334121704,
"step": 125
},
{
"epoch": 0.586046511627907,
"grad_norm": 2.95443058013916,
"learning_rate": 9.03929082099864e-06,
"loss": 0.33,
"mean_token_accuracy": 0.8911877274513245,
"step": 126
},
{
"epoch": 0.5906976744186047,
"grad_norm": 2.2911572456359863,
"learning_rate": 9.016614210162523e-06,
"loss": 0.257,
"mean_token_accuracy": 0.9141084551811218,
"step": 127
},
{
"epoch": 0.5953488372093023,
"grad_norm": 2.452328681945801,
"learning_rate": 8.993705859777587e-06,
"loss": 0.3074,
"mean_token_accuracy": 0.8989298343658447,
"step": 128
},
{
"epoch": 0.6,
"grad_norm": 2.8409242630004883,
"learning_rate": 8.970567279469417e-06,
"loss": 0.3673,
"mean_token_accuracy": 0.8829707503318787,
"step": 129
},
{
"epoch": 0.6046511627906976,
"grad_norm": 2.5934958457946777,
"learning_rate": 8.947199994035402e-06,
"loss": 0.292,
"mean_token_accuracy": 0.8922224640846252,
"step": 130
},
{
"epoch": 0.6093023255813953,
"grad_norm": 2.518958330154419,
"learning_rate": 8.923605543344252e-06,
"loss": 0.2793,
"mean_token_accuracy": 0.9032871127128601,
"step": 131
},
{
"epoch": 0.6139534883720931,
"grad_norm": 2.331040620803833,
"learning_rate": 8.89978548223452e-06,
"loss": 0.2803,
"mean_token_accuracy": 0.8997518420219421,
"step": 132
},
{
"epoch": 0.6186046511627907,
"grad_norm": 2.249236583709717,
"learning_rate": 8.875741380412149e-06,
"loss": 0.3143,
"mean_token_accuracy": 0.8874413967132568,
"step": 133
},
{
"epoch": 0.6232558139534884,
"grad_norm": 2.3988537788391113,
"learning_rate": 8.85147482234702e-06,
"loss": 0.3187,
"mean_token_accuracy": 0.8939599394798279,
"step": 134
},
{
"epoch": 0.627906976744186,
"grad_norm": 2.3432514667510986,
"learning_rate": 8.826987407168546e-06,
"loss": 0.308,
"mean_token_accuracy": 0.8958589434623718,
"step": 135
},
{
"epoch": 0.6325581395348837,
"grad_norm": 2.433220386505127,
"learning_rate": 8.80228074856029e-06,
"loss": 0.3033,
"mean_token_accuracy": 0.8961129784584045,
"step": 136
},
{
"epoch": 0.6372093023255814,
"grad_norm": 2.2877557277679443,
"learning_rate": 8.777356474653623e-06,
"loss": 0.2938,
"mean_token_accuracy": 0.9027256965637207,
"step": 137
},
{
"epoch": 0.641860465116279,
"grad_norm": 2.2342700958251953,
"learning_rate": 8.752216227920436e-06,
"loss": 0.3012,
"mean_token_accuracy": 0.8945578336715698,
"step": 138
},
{
"epoch": 0.6465116279069767,
"grad_norm": 2.2655529975891113,
"learning_rate": 8.726861665064903e-06,
"loss": 0.2812,
"mean_token_accuracy": 0.9012300372123718,
"step": 139
},
{
"epoch": 0.6511627906976745,
"grad_norm": 2.375288248062134,
"learning_rate": 8.701294456914301e-06,
"loss": 0.3202,
"mean_token_accuracy": 0.8921115398406982,
"step": 140
},
{
"epoch": 0.6558139534883721,
"grad_norm": 2.3848204612731934,
"learning_rate": 8.675516288308916e-06,
"loss": 0.3188,
"mean_token_accuracy": 0.891019344329834,
"step": 141
},
{
"epoch": 0.6604651162790698,
"grad_norm": 2.4406163692474365,
"learning_rate": 8.649528857991005e-06,
"loss": 0.2917,
"mean_token_accuracy": 0.8987188339233398,
"step": 142
},
{
"epoch": 0.6651162790697674,
"grad_norm": 2.3761322498321533,
"learning_rate": 8.623333878492853e-06,
"loss": 0.2779,
"mean_token_accuracy": 0.9028142094612122,
"step": 143
},
{
"epoch": 0.6697674418604651,
"grad_norm": 2.5542666912078857,
"learning_rate": 8.596933076023927e-06,
"loss": 0.3606,
"mean_token_accuracy": 0.8800684213638306,
"step": 144
},
{
"epoch": 0.6744186046511628,
"grad_norm": 2.644892692565918,
"learning_rate": 8.57032819035711e-06,
"loss": 0.3358,
"mean_token_accuracy": 0.8846637010574341,
"step": 145
},
{
"epoch": 0.6790697674418604,
"grad_norm": 2.3639047145843506,
"learning_rate": 8.543520974714062e-06,
"loss": 0.3492,
"mean_token_accuracy": 0.8764985203742981,
"step": 146
},
{
"epoch": 0.6837209302325581,
"grad_norm": 2.4288532733917236,
"learning_rate": 8.516513195649686e-06,
"loss": 0.2936,
"mean_token_accuracy": 0.8934412002563477,
"step": 147
},
{
"epoch": 0.6883720930232559,
"grad_norm": 2.626235246658325,
"learning_rate": 8.489306632935698e-06,
"loss": 0.2826,
"mean_token_accuracy": 0.9025585055351257,
"step": 148
},
{
"epoch": 0.6930232558139535,
"grad_norm": 2.367561101913452,
"learning_rate": 8.461903079443367e-06,
"loss": 0.3243,
"mean_token_accuracy": 0.8897247314453125,
"step": 149
},
{
"epoch": 0.6976744186046512,
"grad_norm": 2.6119275093078613,
"learning_rate": 8.434304341025352e-06,
"loss": 0.2953,
"mean_token_accuracy": 0.8976051211357117,
"step": 150
},
{
"epoch": 0.7023255813953488,
"grad_norm": 2.146310329437256,
"learning_rate": 8.406512236396705e-06,
"loss": 0.2886,
"mean_token_accuracy": 0.8988166451454163,
"step": 151
},
{
"epoch": 0.7069767441860465,
"grad_norm": 2.6668293476104736,
"learning_rate": 8.378528597015011e-06,
"loss": 0.3446,
"mean_token_accuracy": 0.8813462853431702,
"step": 152
},
{
"epoch": 0.7116279069767442,
"grad_norm": 2.6185600757598877,
"learning_rate": 8.350355266959715e-06,
"loss": 0.3204,
"mean_token_accuracy": 0.8888283371925354,
"step": 153
},
{
"epoch": 0.7162790697674418,
"grad_norm": 2.2908973693847656,
"learning_rate": 8.321994102810585e-06,
"loss": 0.2912,
"mean_token_accuracy": 0.8983022570610046,
"step": 154
},
{
"epoch": 0.7209302325581395,
"grad_norm": 2.1010096073150635,
"learning_rate": 8.293446973525368e-06,
"loss": 0.2701,
"mean_token_accuracy": 0.9079765677452087,
"step": 155
},
{
"epoch": 0.7255813953488373,
"grad_norm": 2.3426928520202637,
"learning_rate": 8.26471576031664e-06,
"loss": 0.2493,
"mean_token_accuracy": 0.9087682962417603,
"step": 156
},
{
"epoch": 0.7302325581395349,
"grad_norm": 2.581718683242798,
"learning_rate": 8.235802356527821e-06,
"loss": 0.3091,
"mean_token_accuracy": 0.8938986659049988,
"step": 157
},
{
"epoch": 0.7348837209302326,
"grad_norm": 2.3071372509002686,
"learning_rate": 8.206708667508418e-06,
"loss": 0.2966,
"mean_token_accuracy": 0.8992342352867126,
"step": 158
},
{
"epoch": 0.7395348837209302,
"grad_norm": 2.2428340911865234,
"learning_rate": 8.177436610488459e-06,
"loss": 0.3142,
"mean_token_accuracy": 0.8913880586624146,
"step": 159
},
{
"epoch": 0.7441860465116279,
"grad_norm": 2.289311647415161,
"learning_rate": 8.147988114452159e-06,
"loss": 0.3302,
"mean_token_accuracy": 0.885582685470581,
"step": 160
},
{
"epoch": 0.7488372093023256,
"grad_norm": 2.49139666557312,
"learning_rate": 8.11836512001079e-06,
"loss": 0.2575,
"mean_token_accuracy": 0.9106836318969727,
"step": 161
},
{
"epoch": 0.7534883720930232,
"grad_norm": 2.2492876052856445,
"learning_rate": 8.088569579274804e-06,
"loss": 0.2915,
"mean_token_accuracy": 0.9044934511184692,
"step": 162
},
{
"epoch": 0.7581395348837209,
"grad_norm": 2.5655603408813477,
"learning_rate": 8.058603455725202e-06,
"loss": 0.3009,
"mean_token_accuracy": 0.9001610279083252,
"step": 163
},
{
"epoch": 0.7627906976744186,
"grad_norm": 2.376429319381714,
"learning_rate": 8.028468724084121e-06,
"loss": 0.3134,
"mean_token_accuracy": 0.8932521343231201,
"step": 164
},
{
"epoch": 0.7674418604651163,
"grad_norm": 2.4902546405792236,
"learning_rate": 7.99816737018473e-06,
"loss": 0.3132,
"mean_token_accuracy": 0.8874028325080872,
"step": 165
},
{
"epoch": 0.772093023255814,
"grad_norm": 2.286018133163452,
"learning_rate": 7.967701390840339e-06,
"loss": 0.3051,
"mean_token_accuracy": 0.8954407572746277,
"step": 166
},
{
"epoch": 0.7767441860465116,
"grad_norm": 2.1808741092681885,
"learning_rate": 7.93707279371283e-06,
"loss": 0.2888,
"mean_token_accuracy": 0.9057142734527588,
"step": 167
},
{
"epoch": 0.7813953488372093,
"grad_norm": 2.4743106365203857,
"learning_rate": 7.906283597180357e-06,
"loss": 0.3272,
"mean_token_accuracy": 0.8907923102378845,
"step": 168
},
{
"epoch": 0.786046511627907,
"grad_norm": 2.4288330078125,
"learning_rate": 7.875335830204323e-06,
"loss": 0.2612,
"mean_token_accuracy": 0.9085434675216675,
"step": 169
},
{
"epoch": 0.7906976744186046,
"grad_norm": 2.390599012374878,
"learning_rate": 7.844231532195686e-06,
"loss": 0.2717,
"mean_token_accuracy": 0.9026373028755188,
"step": 170
},
{
"epoch": 0.7953488372093023,
"grad_norm": 2.386613368988037,
"learning_rate": 7.812972752880566e-06,
"loss": 0.3131,
"mean_token_accuracy": 0.8961803913116455,
"step": 171
},
{
"epoch": 0.8,
"grad_norm": 2.338465929031372,
"learning_rate": 7.781561552165156e-06,
"loss": 0.2846,
"mean_token_accuracy": 0.9064022302627563,
"step": 172
},
{
"epoch": 0.8046511627906977,
"grad_norm": 2.4404520988464355,
"learning_rate": 7.75e-06,
"loss": 0.301,
"mean_token_accuracy": 0.9013170003890991,
"step": 173
},
{
"epoch": 0.8093023255813954,
"grad_norm": 2.441054582595825,
"learning_rate": 7.71829017624357e-06,
"loss": 0.2902,
"mean_token_accuracy": 0.8965333104133606,
"step": 174
},
{
"epoch": 0.813953488372093,
"grad_norm": 2.3165605068206787,
"learning_rate": 7.686434170525213e-06,
"loss": 0.2986,
"mean_token_accuracy": 0.8884420394897461,
"step": 175
},
{
"epoch": 0.8186046511627907,
"grad_norm": 2.340376377105713,
"learning_rate": 7.654434082107442e-06,
"loss": 0.2753,
"mean_token_accuracy": 0.9065356254577637,
"step": 176
},
{
"epoch": 0.8232558139534883,
"grad_norm": 5.1856608390808105,
"learning_rate": 7.622292019747604e-06,
"loss": 0.3261,
"mean_token_accuracy": 0.887184739112854,
"step": 177
},
{
"epoch": 0.827906976744186,
"grad_norm": 2.327040195465088,
"learning_rate": 7.590010101558913e-06,
"loss": 0.2815,
"mean_token_accuracy": 0.9034022688865662,
"step": 178
},
{
"epoch": 0.8325581395348837,
"grad_norm": 2.146498203277588,
"learning_rate": 7.557590454870874e-06,
"loss": 0.2995,
"mean_token_accuracy": 0.8991208672523499,
"step": 179
},
{
"epoch": 0.8372093023255814,
"grad_norm": 2.472043514251709,
"learning_rate": 7.525035216089086e-06,
"loss": 0.2884,
"mean_token_accuracy": 0.900874137878418,
"step": 180
},
{
"epoch": 0.8418604651162791,
"grad_norm": 2.309894561767578,
"learning_rate": 7.492346530554463e-06,
"loss": 0.2919,
"mean_token_accuracy": 0.8961777091026306,
"step": 181
},
{
"epoch": 0.8465116279069768,
"grad_norm": 2.3329412937164307,
"learning_rate": 7.459526552401861e-06,
"loss": 0.2993,
"mean_token_accuracy": 0.9014893174171448,
"step": 182
},
{
"epoch": 0.8511627906976744,
"grad_norm": 2.2241170406341553,
"learning_rate": 7.4265774444181184e-06,
"loss": 0.3036,
"mean_token_accuracy": 0.8940504193305969,
"step": 183
},
{
"epoch": 0.8558139534883721,
"grad_norm": 2.4457859992980957,
"learning_rate": 7.39350137789953e-06,
"loss": 0.3062,
"mean_token_accuracy": 0.894417941570282,
"step": 184
},
{
"epoch": 0.8604651162790697,
"grad_norm": 2.253269672393799,
"learning_rate": 7.360300532508775e-06,
"loss": 0.282,
"mean_token_accuracy": 0.9023967385292053,
"step": 185
},
{
"epoch": 0.8651162790697674,
"grad_norm": 2.2496144771575928,
"learning_rate": 7.3269770961312616e-06,
"loss": 0.3094,
"mean_token_accuracy": 0.8949607610702515,
"step": 186
},
{
"epoch": 0.8697674418604651,
"grad_norm": 2.237771511077881,
"learning_rate": 7.2935332647309624e-06,
"loss": 0.271,
"mean_token_accuracy": 0.9009068608283997,
"step": 187
},
{
"epoch": 0.8744186046511628,
"grad_norm": 2.1698238849639893,
"learning_rate": 7.259971242205702e-06,
"loss": 0.293,
"mean_token_accuracy": 0.9026434421539307,
"step": 188
},
{
"epoch": 0.8790697674418605,
"grad_norm": 2.1061506271362305,
"learning_rate": 7.226293240241918e-06,
"loss": 0.2693,
"mean_token_accuracy": 0.9091058969497681,
"step": 189
},
{
"epoch": 0.8837209302325582,
"grad_norm": 2.3756353855133057,
"learning_rate": 7.1925014781689185e-06,
"loss": 0.2892,
"mean_token_accuracy": 0.8960843086242676,
"step": 190
},
{
"epoch": 0.8883720930232558,
"grad_norm": 2.4632489681243896,
"learning_rate": 7.158598182812628e-06,
"loss": 0.337,
"mean_token_accuracy": 0.8823441863059998,
"step": 191
},
{
"epoch": 0.8930232558139535,
"grad_norm": 2.2445359230041504,
"learning_rate": 7.12458558834885e-06,
"loss": 0.2502,
"mean_token_accuracy": 0.9130662083625793,
"step": 192
},
{
"epoch": 0.8976744186046511,
"grad_norm": 2.2357635498046875,
"learning_rate": 7.090465936156028e-06,
"loss": 0.2964,
"mean_token_accuracy": 0.8986391425132751,
"step": 193
},
{
"epoch": 0.9023255813953488,
"grad_norm": 2.2954163551330566,
"learning_rate": 7.056241474667552e-06,
"loss": 0.2769,
"mean_token_accuracy": 0.9078544974327087,
"step": 194
},
{
"epoch": 0.9069767441860465,
"grad_norm": 2.27939510345459,
"learning_rate": 7.021914459223586e-06,
"loss": 0.2765,
"mean_token_accuracy": 0.9080632925033569,
"step": 195
},
{
"epoch": 0.9116279069767442,
"grad_norm": 2.234778881072998,
"learning_rate": 6.987487151922439e-06,
"loss": 0.3069,
"mean_token_accuracy": 0.8933604955673218,
"step": 196
},
{
"epoch": 0.9162790697674419,
"grad_norm": 2.3990750312805176,
"learning_rate": 6.952961821471509e-06,
"loss": 0.2639,
"mean_token_accuracy": 0.9116538166999817,
"step": 197
},
{
"epoch": 0.9209302325581395,
"grad_norm": 2.3618667125701904,
"learning_rate": 6.9183407430377645e-06,
"loss": 0.2734,
"mean_token_accuracy": 0.9068870544433594,
"step": 198
},
{
"epoch": 0.9255813953488372,
"grad_norm": 2.492704153060913,
"learning_rate": 6.883626198097825e-06,
"loss": 0.3028,
"mean_token_accuracy": 0.8957169651985168,
"step": 199
},
{
"epoch": 0.9302325581395349,
"grad_norm": 2.458460569381714,
"learning_rate": 6.84882047428761e-06,
"loss": 0.2783,
"mean_token_accuracy": 0.906191349029541,
"step": 200
},
{
"epoch": 0.9348837209302325,
"grad_norm": 2.3292219638824463,
"learning_rate": 6.813925865251587e-06,
"loss": 0.305,
"mean_token_accuracy": 0.8985507488250732,
"step": 201
},
{
"epoch": 0.9395348837209302,
"grad_norm": 2.1781818866729736,
"learning_rate": 6.77894467049163e-06,
"loss": 0.2698,
"mean_token_accuracy": 0.9075833559036255,
"step": 202
},
{
"epoch": 0.9441860465116279,
"grad_norm": 2.580416679382324,
"learning_rate": 6.743879195215472e-06,
"loss": 0.2944,
"mean_token_accuracy": 0.8997113704681396,
"step": 203
},
{
"epoch": 0.9488372093023256,
"grad_norm": 2.0918219089508057,
"learning_rate": 6.708731750184815e-06,
"loss": 0.2585,
"mean_token_accuracy": 0.9011285305023193,
"step": 204
},
{
"epoch": 0.9534883720930233,
"grad_norm": 2.086010456085205,
"learning_rate": 6.673504651563035e-06,
"loss": 0.2583,
"mean_token_accuracy": 0.9062029719352722,
"step": 205
},
{
"epoch": 0.958139534883721,
"grad_norm": 2.3076541423797607,
"learning_rate": 6.638200220762563e-06,
"loss": 0.2861,
"mean_token_accuracy": 0.9016419649124146,
"step": 206
},
{
"epoch": 0.9627906976744186,
"grad_norm": 2.5171306133270264,
"learning_rate": 6.602820784291907e-06,
"loss": 0.3105,
"mean_token_accuracy": 0.8922398090362549,
"step": 207
},
{
"epoch": 0.9674418604651163,
"grad_norm": 2.2755181789398193,
"learning_rate": 6.5673686736023245e-06,
"loss": 0.2753,
"mean_token_accuracy": 0.9048746824264526,
"step": 208
},
{
"epoch": 0.9720930232558139,
"grad_norm": 2.164433479309082,
"learning_rate": 6.531846224934206e-06,
"loss": 0.2596,
"mean_token_accuracy": 0.9133898019790649,
"step": 209
},
{
"epoch": 0.9767441860465116,
"grad_norm": 2.0087199211120605,
"learning_rate": 6.4962557791631e-06,
"loss": 0.2715,
"mean_token_accuracy": 0.9074162840843201,
"step": 210
},
{
"epoch": 0.9813953488372092,
"grad_norm": 2.2094898223876953,
"learning_rate": 6.460599681645462e-06,
"loss": 0.2664,
"mean_token_accuracy": 0.907378077507019,
"step": 211
},
{
"epoch": 0.986046511627907,
"grad_norm": 2.1591098308563232,
"learning_rate": 6.424880282064103e-06,
"loss": 0.2926,
"mean_token_accuracy": 0.9012137055397034,
"step": 212
},
{
"epoch": 0.9906976744186047,
"grad_norm": 2.1089112758636475,
"learning_rate": 6.3890999342733396e-06,
"loss": 0.2566,
"mean_token_accuracy": 0.9117971062660217,
"step": 213
},
{
"epoch": 0.9953488372093023,
"grad_norm": 2.244309186935425,
"learning_rate": 6.353260996143884e-06,
"loss": 0.2977,
"mean_token_accuracy": 0.8972173929214478,
"step": 214
},
{
"epoch": 1.0,
"grad_norm": 1.9823395013809204,
"learning_rate": 6.317365829407465e-06,
"loss": 0.2511,
"mean_token_accuracy": 0.9041892886161804,
"step": 215
},
{
"epoch": 1.0046511627906978,
"grad_norm": 3.1557061672210693,
"learning_rate": 6.281416799501188e-06,
"loss": 0.1454,
"mean_token_accuracy": 0.9540554881095886,
"step": 216
},
{
"epoch": 1.0093023255813953,
"grad_norm": 2.734393835067749,
"learning_rate": 6.245416275411661e-06,
"loss": 0.1622,
"mean_token_accuracy": 0.9430840611457825,
"step": 217
},
{
"epoch": 1.013953488372093,
"grad_norm": 2.5812294483184814,
"learning_rate": 6.2093666295188816e-06,
"loss": 0.1363,
"mean_token_accuracy": 0.955654501914978,
"step": 218
},
{
"epoch": 1.0186046511627906,
"grad_norm": 2.294768810272217,
"learning_rate": 6.173270237439901e-06,
"loss": 0.1409,
"mean_token_accuracy": 0.9525974988937378,
"step": 219
},
{
"epoch": 1.0232558139534884,
"grad_norm": 2.10762619972229,
"learning_rate": 6.1371294778722705e-06,
"loss": 0.1142,
"mean_token_accuracy": 0.9630529284477234,
"step": 220
},
{
"epoch": 1.027906976744186,
"grad_norm": 1.7485405206680298,
"learning_rate": 6.100946732437291e-06,
"loss": 0.1396,
"mean_token_accuracy": 0.9498718976974487,
"step": 221
},
{
"epoch": 1.0325581395348837,
"grad_norm": 1.6996605396270752,
"learning_rate": 6.064724385523073e-06,
"loss": 0.1211,
"mean_token_accuracy": 0.9572222232818604,
"step": 222
},
{
"epoch": 1.0372093023255813,
"grad_norm": 1.8055946826934814,
"learning_rate": 6.028464824127399e-06,
"loss": 0.1245,
"mean_token_accuracy": 0.9569948315620422,
"step": 223
},
{
"epoch": 1.041860465116279,
"grad_norm": 2.2612037658691406,
"learning_rate": 5.992170437700436e-06,
"loss": 0.1184,
"mean_token_accuracy": 0.9553605318069458,
"step": 224
},
{
"epoch": 1.0465116279069768,
"grad_norm": 2.0997769832611084,
"learning_rate": 5.955843617987259e-06,
"loss": 0.1302,
"mean_token_accuracy": 0.9527406692504883,
"step": 225
},
{
"epoch": 1.0511627906976744,
"grad_norm": 2.1007702350616455,
"learning_rate": 5.919486758870257e-06,
"loss": 0.118,
"mean_token_accuracy": 0.9590943455696106,
"step": 226
},
{
"epoch": 1.0558139534883721,
"grad_norm": 2.0043294429779053,
"learning_rate": 5.883102256211361e-06,
"loss": 0.1404,
"mean_token_accuracy": 0.9549180269241333,
"step": 227
},
{
"epoch": 1.0604651162790697,
"grad_norm": 2.222476005554199,
"learning_rate": 5.8466925076941785e-06,
"loss": 0.142,
"mean_token_accuracy": 0.9515007734298706,
"step": 228
},
{
"epoch": 1.0651162790697675,
"grad_norm": 2.368314504623413,
"learning_rate": 5.810259912665973e-06,
"loss": 0.1406,
"mean_token_accuracy": 0.9539972543716431,
"step": 229
},
{
"epoch": 1.069767441860465,
"grad_norm": 1.954593300819397,
"learning_rate": 5.773806871979564e-06,
"loss": 0.1184,
"mean_token_accuracy": 0.958109974861145,
"step": 230
},
{
"epoch": 1.0744186046511628,
"grad_norm": 1.8620705604553223,
"learning_rate": 5.7373357878351055e-06,
"loss": 0.1202,
"mean_token_accuracy": 0.9591098427772522,
"step": 231
},
{
"epoch": 1.0790697674418606,
"grad_norm": 1.718745231628418,
"learning_rate": 5.700849063621789e-06,
"loss": 0.1072,
"mean_token_accuracy": 0.9625619053840637,
"step": 232
},
{
"epoch": 1.083720930232558,
"grad_norm": 2.1164543628692627,
"learning_rate": 5.664349103759467e-06,
"loss": 0.1303,
"mean_token_accuracy": 0.9507201313972473,
"step": 233
},
{
"epoch": 1.0883720930232559,
"grad_norm": 1.8693439960479736,
"learning_rate": 5.627838313540191e-06,
"loss": 0.1475,
"mean_token_accuracy": 0.9470409750938416,
"step": 234
},
{
"epoch": 1.0930232558139534,
"grad_norm": 1.944366216659546,
"learning_rate": 5.591319098969727e-06,
"loss": 0.1476,
"mean_token_accuracy": 0.9515094757080078,
"step": 235
},
{
"epoch": 1.0976744186046512,
"grad_norm": 1.9349125623703003,
"learning_rate": 5.55479386660899e-06,
"loss": 0.1238,
"mean_token_accuracy": 0.9569832682609558,
"step": 236
},
{
"epoch": 1.1023255813953488,
"grad_norm": 1.7475922107696533,
"learning_rate": 5.5182650234154544e-06,
"loss": 0.1181,
"mean_token_accuracy": 0.9598400592803955,
"step": 237
},
{
"epoch": 1.1069767441860465,
"grad_norm": 1.7131977081298828,
"learning_rate": 5.481734976584546e-06,
"loss": 0.1207,
"mean_token_accuracy": 0.955795168876648,
"step": 238
},
{
"epoch": 1.1116279069767443,
"grad_norm": 1.8039817810058594,
"learning_rate": 5.4452061333910125e-06,
"loss": 0.1257,
"mean_token_accuracy": 0.9571937918663025,
"step": 239
},
{
"epoch": 1.1162790697674418,
"grad_norm": 1.6558111906051636,
"learning_rate": 5.4086809010302734e-06,
"loss": 0.1084,
"mean_token_accuracy": 0.9620627164840698,
"step": 240
},
{
"epoch": 1.1209302325581396,
"grad_norm": 1.880353331565857,
"learning_rate": 5.3721616864598094e-06,
"loss": 0.1209,
"mean_token_accuracy": 0.95799320936203,
"step": 241
},
{
"epoch": 1.1255813953488372,
"grad_norm": 1.9320411682128906,
"learning_rate": 5.3356508962405355e-06,
"loss": 0.1293,
"mean_token_accuracy": 0.9533839821815491,
"step": 242
},
{
"epoch": 1.130232558139535,
"grad_norm": 1.9183322191238403,
"learning_rate": 5.299150936378212e-06,
"loss": 0.1248,
"mean_token_accuracy": 0.9582386016845703,
"step": 243
},
{
"epoch": 1.1348837209302325,
"grad_norm": 3.065585136413574,
"learning_rate": 5.262664212164898e-06,
"loss": 0.1176,
"mean_token_accuracy": 0.9561497569084167,
"step": 244
},
{
"epoch": 1.1395348837209303,
"grad_norm": 1.8231462240219116,
"learning_rate": 5.226193128020438e-06,
"loss": 0.1244,
"mean_token_accuracy": 0.9571564793586731,
"step": 245
},
{
"epoch": 1.1441860465116278,
"grad_norm": 1.9364738464355469,
"learning_rate": 5.189740087334029e-06,
"loss": 0.1279,
"mean_token_accuracy": 0.952070415019989,
"step": 246
},
{
"epoch": 1.1488372093023256,
"grad_norm": 1.7137261629104614,
"learning_rate": 5.153307492305824e-06,
"loss": 0.1109,
"mean_token_accuracy": 0.959574818611145,
"step": 247
},
{
"epoch": 1.1534883720930234,
"grad_norm": 1.8068032264709473,
"learning_rate": 5.116897743788639e-06,
"loss": 0.1196,
"mean_token_accuracy": 0.9579454064369202,
"step": 248
},
{
"epoch": 1.158139534883721,
"grad_norm": 2.0845437049865723,
"learning_rate": 5.080513241129745e-06,
"loss": 0.1388,
"mean_token_accuracy": 0.950688898563385,
"step": 249
},
{
"epoch": 1.1627906976744187,
"grad_norm": 2.2837700843811035,
"learning_rate": 5.044156382012742e-06,
"loss": 0.1194,
"mean_token_accuracy": 0.959080159664154,
"step": 250
},
{
"epoch": 1.1674418604651162,
"grad_norm": 3.3996706008911133,
"learning_rate": 5.007829562299567e-06,
"loss": 0.1162,
"mean_token_accuracy": 0.9592645764350891,
"step": 251
},
{
"epoch": 1.172093023255814,
"grad_norm": 1.7496733665466309,
"learning_rate": 4.9715351758726015e-06,
"loss": 0.1233,
"mean_token_accuracy": 0.957980215549469,
"step": 252
},
{
"epoch": 1.1767441860465115,
"grad_norm": 2.1975224018096924,
"learning_rate": 4.9352756144769285e-06,
"loss": 0.1487,
"mean_token_accuracy": 0.9537729620933533,
"step": 253
},
{
"epoch": 1.1813953488372093,
"grad_norm": 2.0753679275512695,
"learning_rate": 4.89905326756271e-06,
"loss": 0.115,
"mean_token_accuracy": 0.9599113464355469,
"step": 254
},
{
"epoch": 1.1860465116279069,
"grad_norm": 2.137821912765503,
"learning_rate": 4.862870522127731e-06,
"loss": 0.1306,
"mean_token_accuracy": 0.9522392749786377,
"step": 255
},
{
"epoch": 1.1906976744186046,
"grad_norm": 1.751489520072937,
"learning_rate": 4.8267297625601e-06,
"loss": 0.1194,
"mean_token_accuracy": 0.9590802192687988,
"step": 256
},
{
"epoch": 1.1953488372093024,
"grad_norm": 1.7990120649337769,
"learning_rate": 4.790633370481121e-06,
"loss": 0.119,
"mean_token_accuracy": 0.9594948291778564,
"step": 257
},
{
"epoch": 1.2,
"grad_norm": 1.755265474319458,
"learning_rate": 4.754583724588342e-06,
"loss": 0.1143,
"mean_token_accuracy": 0.9570131301879883,
"step": 258
},
{
"epoch": 1.2046511627906977,
"grad_norm": 1.8024441003799438,
"learning_rate": 4.718583200498814e-06,
"loss": 0.1064,
"mean_token_accuracy": 0.9623507857322693,
"step": 259
},
{
"epoch": 1.2093023255813953,
"grad_norm": 1.8982021808624268,
"learning_rate": 4.682634170592537e-06,
"loss": 0.1362,
"mean_token_accuracy": 0.9527772665023804,
"step": 260
},
{
"epoch": 1.213953488372093,
"grad_norm": 1.9453184604644775,
"learning_rate": 4.646739003856117e-06,
"loss": 0.1338,
"mean_token_accuracy": 0.9539026618003845,
"step": 261
},
{
"epoch": 1.2186046511627908,
"grad_norm": 1.8251960277557373,
"learning_rate": 4.610900065726661e-06,
"loss": 0.1315,
"mean_token_accuracy": 0.9544464349746704,
"step": 262
},
{
"epoch": 1.2232558139534884,
"grad_norm": 1.9196908473968506,
"learning_rate": 4.575119717935898e-06,
"loss": 0.1272,
"mean_token_accuracy": 0.9516383409500122,
"step": 263
},
{
"epoch": 1.2279069767441861,
"grad_norm": 1.615856647491455,
"learning_rate": 4.53940031835454e-06,
"loss": 0.1187,
"mean_token_accuracy": 0.9601250886917114,
"step": 264
},
{
"epoch": 1.2325581395348837,
"grad_norm": 1.9295324087142944,
"learning_rate": 4.503744220836902e-06,
"loss": 0.1485,
"mean_token_accuracy": 0.9450379610061646,
"step": 265
},
{
"epoch": 1.2372093023255815,
"grad_norm": 2.03997802734375,
"learning_rate": 4.468153775065795e-06,
"loss": 0.1439,
"mean_token_accuracy": 0.9504145383834839,
"step": 266
},
{
"epoch": 1.241860465116279,
"grad_norm": 2.085965394973755,
"learning_rate": 4.432631326397676e-06,
"loss": 0.1598,
"mean_token_accuracy": 0.9417144656181335,
"step": 267
},
{
"epoch": 1.2465116279069768,
"grad_norm": 1.768497347831726,
"learning_rate": 4.397179215708095e-06,
"loss": 0.108,
"mean_token_accuracy": 0.9582984447479248,
"step": 268
},
{
"epoch": 1.2511627906976743,
"grad_norm": 1.8647841215133667,
"learning_rate": 4.3617997792374365e-06,
"loss": 0.1178,
"mean_token_accuracy": 0.9578408598899841,
"step": 269
},
{
"epoch": 1.255813953488372,
"grad_norm": 1.831526279449463,
"learning_rate": 4.326495348436966e-06,
"loss": 0.1085,
"mean_token_accuracy": 0.9653323292732239,
"step": 270
},
{
"epoch": 1.2604651162790699,
"grad_norm": 1.904144287109375,
"learning_rate": 4.291268249815188e-06,
"loss": 0.1176,
"mean_token_accuracy": 0.9586750268936157,
"step": 271
},
{
"epoch": 1.2651162790697674,
"grad_norm": 2.3330485820770264,
"learning_rate": 4.256120804784528e-06,
"loss": 0.1105,
"mean_token_accuracy": 0.9589130878448486,
"step": 272
},
{
"epoch": 1.2697674418604652,
"grad_norm": 1.7716329097747803,
"learning_rate": 4.221055329508372e-06,
"loss": 0.1061,
"mean_token_accuracy": 0.9632218480110168,
"step": 273
},
{
"epoch": 1.2744186046511627,
"grad_norm": 1.6046407222747803,
"learning_rate": 4.186074134748414e-06,
"loss": 0.1012,
"mean_token_accuracy": 0.9630866646766663,
"step": 274
},
{
"epoch": 1.2790697674418605,
"grad_norm": 1.8529785871505737,
"learning_rate": 4.151179525712392e-06,
"loss": 0.1206,
"mean_token_accuracy": 0.9566202163696289,
"step": 275
},
{
"epoch": 1.283720930232558,
"grad_norm": 1.8019949197769165,
"learning_rate": 4.116373801902176e-06,
"loss": 0.1035,
"mean_token_accuracy": 0.9640029072761536,
"step": 276
},
{
"epoch": 1.2883720930232558,
"grad_norm": 2.1905837059020996,
"learning_rate": 4.081659256962237e-06,
"loss": 0.1273,
"mean_token_accuracy": 0.9552264213562012,
"step": 277
},
{
"epoch": 1.2930232558139534,
"grad_norm": 1.8310658931732178,
"learning_rate": 4.047038178528494e-06,
"loss": 0.1202,
"mean_token_accuracy": 0.9595242142677307,
"step": 278
},
{
"epoch": 1.2976744186046512,
"grad_norm": 1.7499853372573853,
"learning_rate": 4.012512848077562e-06,
"loss": 0.1129,
"mean_token_accuracy": 0.9607168436050415,
"step": 279
},
{
"epoch": 1.302325581395349,
"grad_norm": 1.76249361038208,
"learning_rate": 3.978085540776416e-06,
"loss": 0.1158,
"mean_token_accuracy": 0.9601153135299683,
"step": 280
},
{
"epoch": 1.3069767441860465,
"grad_norm": 1.9077409505844116,
"learning_rate": 3.94375852533245e-06,
"loss": 0.1326,
"mean_token_accuracy": 0.9523097276687622,
"step": 281
},
{
"epoch": 1.3116279069767443,
"grad_norm": 2.2027199268341064,
"learning_rate": 3.9095340638439735e-06,
"loss": 0.1613,
"mean_token_accuracy": 0.9432445764541626,
"step": 282
},
{
"epoch": 1.3162790697674418,
"grad_norm": 1.6710178852081299,
"learning_rate": 3.8754144116511516e-06,
"loss": 0.106,
"mean_token_accuracy": 0.9620450735092163,
"step": 283
},
{
"epoch": 1.3209302325581396,
"grad_norm": 1.8264927864074707,
"learning_rate": 3.8414018171873725e-06,
"loss": 0.1296,
"mean_token_accuracy": 0.9531369209289551,
"step": 284
},
{
"epoch": 1.3255813953488373,
"grad_norm": 1.7415143251419067,
"learning_rate": 3.8074985218310833e-06,
"loss": 0.0949,
"mean_token_accuracy": 0.9662027955055237,
"step": 285
},
{
"epoch": 1.330232558139535,
"grad_norm": 1.7828929424285889,
"learning_rate": 3.7737067597580822e-06,
"loss": 0.1238,
"mean_token_accuracy": 0.9578856825828552,
"step": 286
},
{
"epoch": 1.3348837209302324,
"grad_norm": 1.7337418794631958,
"learning_rate": 3.7400287577942994e-06,
"loss": 0.1313,
"mean_token_accuracy": 0.9521440863609314,
"step": 287
},
{
"epoch": 1.3395348837209302,
"grad_norm": 2.2492868900299072,
"learning_rate": 3.7064667352690386e-06,
"loss": 0.1318,
"mean_token_accuracy": 0.9546619653701782,
"step": 288
},
{
"epoch": 1.344186046511628,
"grad_norm": 1.9296540021896362,
"learning_rate": 3.6730229038687403e-06,
"loss": 0.1205,
"mean_token_accuracy": 0.9589089155197144,
"step": 289
},
{
"epoch": 1.3488372093023255,
"grad_norm": 2.0041182041168213,
"learning_rate": 3.639699467491228e-06,
"loss": 0.1267,
"mean_token_accuracy": 0.9551070332527161,
"step": 290
},
{
"epoch": 1.3534883720930233,
"grad_norm": 2.0701072216033936,
"learning_rate": 3.6064986221004704e-06,
"loss": 0.1407,
"mean_token_accuracy": 0.9523890614509583,
"step": 291
},
{
"epoch": 1.3581395348837209,
"grad_norm": 1.9877480268478394,
"learning_rate": 3.5734225555818847e-06,
"loss": 0.1376,
"mean_token_accuracy": 0.9489011168479919,
"step": 292
},
{
"epoch": 1.3627906976744186,
"grad_norm": 1.8878151178359985,
"learning_rate": 3.5404734475981405e-06,
"loss": 0.1275,
"mean_token_accuracy": 0.9542827010154724,
"step": 293
},
{
"epoch": 1.3674418604651164,
"grad_norm": 2.095825672149658,
"learning_rate": 3.5076534694455376e-06,
"loss": 0.1026,
"mean_token_accuracy": 0.9636394381523132,
"step": 294
},
{
"epoch": 1.372093023255814,
"grad_norm": 1.8154828548431396,
"learning_rate": 3.474964783910916e-06,
"loss": 0.1295,
"mean_token_accuracy": 0.9553645253181458,
"step": 295
},
{
"epoch": 1.3767441860465115,
"grad_norm": 1.8968881368637085,
"learning_rate": 3.4424095451291273e-06,
"loss": 0.1288,
"mean_token_accuracy": 0.9542672038078308,
"step": 296
},
{
"epoch": 1.3813953488372093,
"grad_norm": 1.8607019186019897,
"learning_rate": 3.409989898441086e-06,
"loss": 0.1278,
"mean_token_accuracy": 0.9535946846008301,
"step": 297
},
{
"epoch": 1.386046511627907,
"grad_norm": 1.7035014629364014,
"learning_rate": 3.3777079802523976e-06,
"loss": 0.117,
"mean_token_accuracy": 0.9564270377159119,
"step": 298
},
{
"epoch": 1.3906976744186046,
"grad_norm": 1.8858115673065186,
"learning_rate": 3.345565917892561e-06,
"loss": 0.1174,
"mean_token_accuracy": 0.9577510952949524,
"step": 299
},
{
"epoch": 1.3953488372093024,
"grad_norm": 2.2695724964141846,
"learning_rate": 3.3135658294747886e-06,
"loss": 0.1232,
"mean_token_accuracy": 0.9565290212631226,
"step": 300
},
{
"epoch": 1.4,
"grad_norm": 1.6535850763320923,
"learning_rate": 3.2817098237564292e-06,
"loss": 0.1078,
"mean_token_accuracy": 0.95890873670578,
"step": 301
},
{
"epoch": 1.4046511627906977,
"grad_norm": 1.7052584886550903,
"learning_rate": 3.2500000000000015e-06,
"loss": 0.1302,
"mean_token_accuracy": 0.9495029449462891,
"step": 302
},
{
"epoch": 1.4093023255813955,
"grad_norm": 1.785459280014038,
"learning_rate": 3.218438447834845e-06,
"loss": 0.1311,
"mean_token_accuracy": 0.9482329487800598,
"step": 303
},
{
"epoch": 1.413953488372093,
"grad_norm": 1.7197625637054443,
"learning_rate": 3.1870272471194363e-06,
"loss": 0.1245,
"mean_token_accuracy": 0.9550527930259705,
"step": 304
},
{
"epoch": 1.4186046511627908,
"grad_norm": 1.738937497138977,
"learning_rate": 3.1557684678043145e-06,
"loss": 0.1393,
"mean_token_accuracy": 0.9483538866043091,
"step": 305
},
{
"epoch": 1.4232558139534883,
"grad_norm": 1.6273069381713867,
"learning_rate": 3.124664169795677e-06,
"loss": 0.1112,
"mean_token_accuracy": 0.9574396014213562,
"step": 306
},
{
"epoch": 1.427906976744186,
"grad_norm": 2.0904576778411865,
"learning_rate": 3.0937164028196443e-06,
"loss": 0.1204,
"mean_token_accuracy": 0.958777666091919,
"step": 307
},
{
"epoch": 1.4325581395348836,
"grad_norm": 1.9097553491592407,
"learning_rate": 3.0629272062871697e-06,
"loss": 0.1296,
"mean_token_accuracy": 0.9571428298950195,
"step": 308
},
{
"epoch": 1.4372093023255814,
"grad_norm": 1.8212406635284424,
"learning_rate": 3.032298609159664e-06,
"loss": 0.1247,
"mean_token_accuracy": 0.9599398374557495,
"step": 309
},
{
"epoch": 1.441860465116279,
"grad_norm": 2.2745423316955566,
"learning_rate": 3.0018326298152716e-06,
"loss": 0.1258,
"mean_token_accuracy": 0.9553403854370117,
"step": 310
},
{
"epoch": 1.4465116279069767,
"grad_norm": 1.7779735326766968,
"learning_rate": 2.9715312759158776e-06,
"loss": 0.1004,
"mean_token_accuracy": 0.9625091552734375,
"step": 311
},
{
"epoch": 1.4511627906976745,
"grad_norm": 1.987384557723999,
"learning_rate": 2.9413965442748e-06,
"loss": 0.1444,
"mean_token_accuracy": 0.9496188759803772,
"step": 312
},
{
"epoch": 1.455813953488372,
"grad_norm": 1.733598232269287,
"learning_rate": 2.9114304207251966e-06,
"loss": 0.1261,
"mean_token_accuracy": 0.9566228985786438,
"step": 313
},
{
"epoch": 1.4604651162790698,
"grad_norm": 1.6986511945724487,
"learning_rate": 2.8816348799892134e-06,
"loss": 0.115,
"mean_token_accuracy": 0.9578772783279419,
"step": 314
},
{
"epoch": 1.4651162790697674,
"grad_norm": 1.7694047689437866,
"learning_rate": 2.8520118855478425e-06,
"loss": 0.1163,
"mean_token_accuracy": 0.9581395387649536,
"step": 315
},
{
"epoch": 1.4697674418604652,
"grad_norm": 2.0580880641937256,
"learning_rate": 2.822563389511542e-06,
"loss": 0.1496,
"mean_token_accuracy": 0.9496130347251892,
"step": 316
},
{
"epoch": 1.474418604651163,
"grad_norm": 1.9992523193359375,
"learning_rate": 2.793291332491584e-06,
"loss": 0.1246,
"mean_token_accuracy": 0.9544215202331543,
"step": 317
},
{
"epoch": 1.4790697674418605,
"grad_norm": 1.7976362705230713,
"learning_rate": 2.7641976434721795e-06,
"loss": 0.1236,
"mean_token_accuracy": 0.9601340293884277,
"step": 318
},
{
"epoch": 1.483720930232558,
"grad_norm": 1.844316840171814,
"learning_rate": 2.735284239683361e-06,
"loss": 0.1216,
"mean_token_accuracy": 0.9563649296760559,
"step": 319
},
{
"epoch": 1.4883720930232558,
"grad_norm": 1.6500184535980225,
"learning_rate": 2.706553026474632e-06,
"loss": 0.1058,
"mean_token_accuracy": 0.9624313116073608,
"step": 320
},
{
"epoch": 1.4930232558139536,
"grad_norm": 1.7818459272384644,
"learning_rate": 2.6780058971894175e-06,
"loss": 0.1181,
"mean_token_accuracy": 0.9556201696395874,
"step": 321
},
{
"epoch": 1.4976744186046511,
"grad_norm": 1.8266305923461914,
"learning_rate": 2.6496447330402857e-06,
"loss": 0.1125,
"mean_token_accuracy": 0.9616958498954773,
"step": 322
},
{
"epoch": 1.5023255813953489,
"grad_norm": 1.7001821994781494,
"learning_rate": 2.621471402984991e-06,
"loss": 0.1056,
"mean_token_accuracy": 0.9641386270523071,
"step": 323
},
{
"epoch": 1.5069767441860464,
"grad_norm": 1.7136989831924438,
"learning_rate": 2.5934877636032975e-06,
"loss": 0.094,
"mean_token_accuracy": 0.965951144695282,
"step": 324
},
{
"epoch": 1.5116279069767442,
"grad_norm": 1.9149391651153564,
"learning_rate": 2.5656956589746486e-06,
"loss": 0.1203,
"mean_token_accuracy": 0.9579139947891235,
"step": 325
},
{
"epoch": 1.516279069767442,
"grad_norm": 1.828525185585022,
"learning_rate": 2.538096920556635e-06,
"loss": 0.1244,
"mean_token_accuracy": 0.9578738212585449,
"step": 326
},
{
"epoch": 1.5209302325581395,
"grad_norm": 1.8189513683319092,
"learning_rate": 2.510693367064304e-06,
"loss": 0.1187,
"mean_token_accuracy": 0.9572099447250366,
"step": 327
},
{
"epoch": 1.525581395348837,
"grad_norm": 2.128922939300537,
"learning_rate": 2.4834868043503176e-06,
"loss": 0.1543,
"mean_token_accuracy": 0.9494983553886414,
"step": 328
},
{
"epoch": 1.5302325581395348,
"grad_norm": 1.8340766429901123,
"learning_rate": 2.4564790252859377e-06,
"loss": 0.1143,
"mean_token_accuracy": 0.9591530561447144,
"step": 329
},
{
"epoch": 1.5348837209302326,
"grad_norm": 1.4942302703857422,
"learning_rate": 2.4296718096428903e-06,
"loss": 0.0954,
"mean_token_accuracy": 0.9681605696678162,
"step": 330
},
{
"epoch": 1.5395348837209304,
"grad_norm": 2.0481481552124023,
"learning_rate": 2.403066923976075e-06,
"loss": 0.1042,
"mean_token_accuracy": 0.9619013071060181,
"step": 331
},
{
"epoch": 1.544186046511628,
"grad_norm": 1.8036472797393799,
"learning_rate": 2.3766661215071473e-06,
"loss": 0.1153,
"mean_token_accuracy": 0.9580026268959045,
"step": 332
},
{
"epoch": 1.5488372093023255,
"grad_norm": 1.7446261644363403,
"learning_rate": 2.3504711420089975e-06,
"loss": 0.1212,
"mean_token_accuracy": 0.957176148891449,
"step": 333
},
{
"epoch": 1.5534883720930233,
"grad_norm": 1.9047147035598755,
"learning_rate": 2.324483711691085e-06,
"loss": 0.1185,
"mean_token_accuracy": 0.9612976908683777,
"step": 334
},
{
"epoch": 1.558139534883721,
"grad_norm": 1.7788504362106323,
"learning_rate": 2.298705543085701e-06,
"loss": 0.1367,
"mean_token_accuracy": 0.9494468569755554,
"step": 335
},
{
"epoch": 1.5627906976744186,
"grad_norm": 1.6594794988632202,
"learning_rate": 2.273138334935099e-06,
"loss": 0.1044,
"mean_token_accuracy": 0.9624683260917664,
"step": 336
},
{
"epoch": 1.5674418604651161,
"grad_norm": 1.673941969871521,
"learning_rate": 2.2477837720795647e-06,
"loss": 0.0971,
"mean_token_accuracy": 0.965850293636322,
"step": 337
},
{
"epoch": 1.572093023255814,
"grad_norm": 1.8094561100006104,
"learning_rate": 2.222643525346379e-06,
"loss": 0.1199,
"mean_token_accuracy": 0.9571694731712341,
"step": 338
},
{
"epoch": 1.5767441860465117,
"grad_norm": 1.9664356708526611,
"learning_rate": 2.1977192514397115e-06,
"loss": 0.1432,
"mean_token_accuracy": 0.9504490494728088,
"step": 339
},
{
"epoch": 1.5813953488372094,
"grad_norm": 1.7851786613464355,
"learning_rate": 2.1730125928314566e-06,
"loss": 0.1267,
"mean_token_accuracy": 0.9532294273376465,
"step": 340
},
{
"epoch": 1.586046511627907,
"grad_norm": 1.9649142026901245,
"learning_rate": 2.148525177652982e-06,
"loss": 0.1133,
"mean_token_accuracy": 0.9610248804092407,
"step": 341
},
{
"epoch": 1.5906976744186045,
"grad_norm": 1.8593871593475342,
"learning_rate": 2.124258619587853e-06,
"loss": 0.1147,
"mean_token_accuracy": 0.9592034220695496,
"step": 342
},
{
"epoch": 1.5953488372093023,
"grad_norm": 1.9876712560653687,
"learning_rate": 2.100214517765481e-06,
"loss": 0.1309,
"mean_token_accuracy": 0.9497568607330322,
"step": 343
},
{
"epoch": 1.6,
"grad_norm": 1.8158270120620728,
"learning_rate": 2.076394456655749e-06,
"loss": 0.1225,
"mean_token_accuracy": 0.9589306116104126,
"step": 344
},
{
"epoch": 1.6046511627906976,
"grad_norm": 1.985721468925476,
"learning_rate": 2.0528000059646e-06,
"loss": 0.1192,
"mean_token_accuracy": 0.9585253596305847,
"step": 345
},
{
"epoch": 1.6093023255813952,
"grad_norm": 1.5803555250167847,
"learning_rate": 2.029432720530585e-06,
"loss": 0.1024,
"mean_token_accuracy": 0.9614368677139282,
"step": 346
},
{
"epoch": 1.613953488372093,
"grad_norm": 1.97838294506073,
"learning_rate": 2.006294140222416e-06,
"loss": 0.1354,
"mean_token_accuracy": 0.9507514238357544,
"step": 347
},
{
"epoch": 1.6186046511627907,
"grad_norm": 1.9568642377853394,
"learning_rate": 1.9833857898374796e-06,
"loss": 0.129,
"mean_token_accuracy": 0.9536830186843872,
"step": 348
},
{
"epoch": 1.6232558139534885,
"grad_norm": 1.998573899269104,
"learning_rate": 1.960709179001361e-06,
"loss": 0.1113,
"mean_token_accuracy": 0.9594219923019409,
"step": 349
},
{
"epoch": 1.627906976744186,
"grad_norm": 1.615460991859436,
"learning_rate": 1.9382658020683572e-06,
"loss": 0.0981,
"mean_token_accuracy": 0.9683917760848999,
"step": 350
},
{
"epoch": 1.6325581395348836,
"grad_norm": 2.0428853034973145,
"learning_rate": 1.9160571380230087e-06,
"loss": 0.1365,
"mean_token_accuracy": 0.9510607123374939,
"step": 351
},
{
"epoch": 1.6372093023255814,
"grad_norm": 1.8019459247589111,
"learning_rate": 1.8940846503826302e-06,
"loss": 0.1203,
"mean_token_accuracy": 0.9561994671821594,
"step": 352
},
{
"epoch": 1.6418604651162791,
"grad_norm": 1.7695238590240479,
"learning_rate": 1.8723497871008678e-06,
"loss": 0.1181,
"mean_token_accuracy": 0.9560151100158691,
"step": 353
},
{
"epoch": 1.6465116279069767,
"grad_norm": 2.7435801029205322,
"learning_rate": 1.8508539804722847e-06,
"loss": 0.1274,
"mean_token_accuracy": 0.9515418410301208,
"step": 354
},
{
"epoch": 1.6511627906976745,
"grad_norm": 1.7049776315689087,
"learning_rate": 1.8295986470379726e-06,
"loss": 0.1002,
"mean_token_accuracy": 0.9640846252441406,
"step": 355
},
{
"epoch": 1.655813953488372,
"grad_norm": 1.8068418502807617,
"learning_rate": 1.8085851874922012e-06,
"loss": 0.1215,
"mean_token_accuracy": 0.9560132622718811,
"step": 356
},
{
"epoch": 1.6604651162790698,
"grad_norm": 1.7952479124069214,
"learning_rate": 1.7878149865901207e-06,
"loss": 0.1124,
"mean_token_accuracy": 0.9578981399536133,
"step": 357
},
{
"epoch": 1.6651162790697676,
"grad_norm": 1.6922228336334229,
"learning_rate": 1.7672894130565033e-06,
"loss": 0.1118,
"mean_token_accuracy": 0.9599378705024719,
"step": 358
},
{
"epoch": 1.669767441860465,
"grad_norm": 1.9545196294784546,
"learning_rate": 1.7470098194955502e-06,
"loss": 0.1497,
"mean_token_accuracy": 0.9433708786964417,
"step": 359
},
{
"epoch": 1.6744186046511627,
"grad_norm": 1.7300385236740112,
"learning_rate": 1.7269775423017513e-06,
"loss": 0.1078,
"mean_token_accuracy": 0.9611701965332031,
"step": 360
},
{
"epoch": 1.6790697674418604,
"grad_norm": 1.9443196058273315,
"learning_rate": 1.7071939015718264e-06,
"loss": 0.1382,
"mean_token_accuracy": 0.9518130421638489,
"step": 361
},
{
"epoch": 1.6837209302325582,
"grad_norm": 1.7919453382492065,
"learning_rate": 1.687660201017729e-06,
"loss": 0.1257,
"mean_token_accuracy": 0.9533194899559021,
"step": 362
},
{
"epoch": 1.688372093023256,
"grad_norm": 1.6777992248535156,
"learning_rate": 1.6683777278807296e-06,
"loss": 0.1083,
"mean_token_accuracy": 0.9628692269325256,
"step": 363
},
{
"epoch": 1.6930232558139535,
"grad_norm": 1.7619105577468872,
"learning_rate": 1.6493477528465974e-06,
"loss": 0.1051,
"mean_token_accuracy": 0.9633173942565918,
"step": 364
},
{
"epoch": 1.697674418604651,
"grad_norm": 1.6752265691757202,
"learning_rate": 1.6305715299618547e-06,
"loss": 0.1187,
"mean_token_accuracy": 0.9575539827346802,
"step": 365
},
{
"epoch": 1.7023255813953488,
"grad_norm": 1.687470555305481,
"learning_rate": 1.6120502965511467e-06,
"loss": 0.1188,
"mean_token_accuracy": 0.9554470777511597,
"step": 366
},
{
"epoch": 1.7069767441860466,
"grad_norm": 1.5344772338867188,
"learning_rate": 1.5937852731356923e-06,
"loss": 0.0966,
"mean_token_accuracy": 0.9682371616363525,
"step": 367
},
{
"epoch": 1.7116279069767442,
"grad_norm": 1.6204265356063843,
"learning_rate": 1.5757776633528654e-06,
"loss": 0.1017,
"mean_token_accuracy": 0.9634027481079102,
"step": 368
},
{
"epoch": 1.7162790697674417,
"grad_norm": 1.8507808446884155,
"learning_rate": 1.5580286538768705e-06,
"loss": 0.1104,
"mean_token_accuracy": 0.9628064036369324,
"step": 369
},
{
"epoch": 1.7209302325581395,
"grad_norm": 1.6015089750289917,
"learning_rate": 1.5405394143405394e-06,
"loss": 0.103,
"mean_token_accuracy": 0.9659707546234131,
"step": 370
},
{
"epoch": 1.7255813953488373,
"grad_norm": 1.7210160493850708,
"learning_rate": 1.5233110972582646e-06,
"loss": 0.13,
"mean_token_accuracy": 0.9564670920372009,
"step": 371
},
{
"epoch": 1.730232558139535,
"grad_norm": 1.7505390644073486,
"learning_rate": 1.506344837950038e-06,
"loss": 0.1151,
"mean_token_accuracy": 0.959705650806427,
"step": 372
},
{
"epoch": 1.7348837209302326,
"grad_norm": 1.6202950477600098,
"learning_rate": 1.4896417544666476e-06,
"loss": 0.1068,
"mean_token_accuracy": 0.9642030000686646,
"step": 373
},
{
"epoch": 1.7395348837209301,
"grad_norm": 1.9536902904510498,
"learning_rate": 1.473202947515987e-06,
"loss": 0.1329,
"mean_token_accuracy": 0.9557550549507141,
"step": 374
},
{
"epoch": 1.744186046511628,
"grad_norm": 1.72047758102417,
"learning_rate": 1.4570295003905314e-06,
"loss": 0.1082,
"mean_token_accuracy": 0.9637593030929565,
"step": 375
},
{
"epoch": 1.7488372093023257,
"grad_norm": 1.5606462955474854,
"learning_rate": 1.4411224788959439e-06,
"loss": 0.0997,
"mean_token_accuracy": 0.9654178619384766,
"step": 376
},
{
"epoch": 1.7534883720930232,
"grad_norm": 1.6534537076950073,
"learning_rate": 1.4254829312808405e-06,
"loss": 0.1086,
"mean_token_accuracy": 0.9666008353233337,
"step": 377
},
{
"epoch": 1.7581395348837208,
"grad_norm": 1.7377055883407593,
"learning_rate": 1.4101118881677161e-06,
"loss": 0.1235,
"mean_token_accuracy": 0.9573296904563904,
"step": 378
},
{
"epoch": 1.7627906976744185,
"grad_norm": 1.8244922161102295,
"learning_rate": 1.3950103624850264e-06,
"loss": 0.1085,
"mean_token_accuracy": 0.9609296321868896,
"step": 379
},
{
"epoch": 1.7674418604651163,
"grad_norm": 1.6825976371765137,
"learning_rate": 1.3801793494004336e-06,
"loss": 0.117,
"mean_token_accuracy": 0.9583475589752197,
"step": 380
},
{
"epoch": 1.772093023255814,
"grad_norm": 1.9061481952667236,
"learning_rate": 1.365619826255231e-06,
"loss": 0.1181,
"mean_token_accuracy": 0.955722451210022,
"step": 381
},
{
"epoch": 1.7767441860465116,
"grad_norm": 2.0009758472442627,
"learning_rate": 1.351332752499936e-06,
"loss": 0.1313,
"mean_token_accuracy": 0.9538022875785828,
"step": 382
},
{
"epoch": 1.7813953488372092,
"grad_norm": 1.6647913455963135,
"learning_rate": 1.3373190696310664e-06,
"loss": 0.0934,
"mean_token_accuracy": 0.9652481079101562,
"step": 383
},
{
"epoch": 1.786046511627907,
"grad_norm": 1.8045130968093872,
"learning_rate": 1.3235797011290902e-06,
"loss": 0.118,
"mean_token_accuracy": 0.9571335911750793,
"step": 384
},
{
"epoch": 1.7906976744186047,
"grad_norm": 1.8397167921066284,
"learning_rate": 1.3101155523975787e-06,
"loss": 0.1196,
"mean_token_accuracy": 0.9594667553901672,
"step": 385
},
{
"epoch": 1.7953488372093023,
"grad_norm": 2.044651508331299,
"learning_rate": 1.2969275107035344e-06,
"loss": 0.1493,
"mean_token_accuracy": 0.949529767036438,
"step": 386
},
{
"epoch": 1.8,
"grad_norm": 2.365513563156128,
"learning_rate": 1.2840164451189253e-06,
"loss": 0.1246,
"mean_token_accuracy": 0.9561926126480103,
"step": 387
},
{
"epoch": 1.8046511627906976,
"grad_norm": 1.9553759098052979,
"learning_rate": 1.2713832064634127e-06,
"loss": 0.1206,
"mean_token_accuracy": 0.9597529172897339,
"step": 388
},
{
"epoch": 1.8093023255813954,
"grad_norm": 1.8584918975830078,
"learning_rate": 1.2590286272482852e-06,
"loss": 0.1085,
"mean_token_accuracy": 0.9604840278625488,
"step": 389
},
{
"epoch": 1.8139534883720931,
"grad_norm": 1.8316679000854492,
"learning_rate": 1.246953521621597e-06,
"loss": 0.1202,
"mean_token_accuracy": 0.957573652267456,
"step": 390
},
{
"epoch": 1.8186046511627907,
"grad_norm": 1.8838489055633545,
"learning_rate": 1.2351586853145135e-06,
"loss": 0.1148,
"mean_token_accuracy": 0.9598619937896729,
"step": 391
},
{
"epoch": 1.8232558139534882,
"grad_norm": 1.7429184913635254,
"learning_rate": 1.2236448955888793e-06,
"loss": 0.1129,
"mean_token_accuracy": 0.9605519771575928,
"step": 392
},
{
"epoch": 1.827906976744186,
"grad_norm": 1.4080933332443237,
"learning_rate": 1.212412911185994e-06,
"loss": 0.0993,
"mean_token_accuracy": 0.9594095945358276,
"step": 393
},
{
"epoch": 1.8325581395348838,
"grad_norm": 1.9343010187149048,
"learning_rate": 1.2014634722766138e-06,
"loss": 0.1342,
"mean_token_accuracy": 0.9440457224845886,
"step": 394
},
{
"epoch": 1.8372093023255816,
"grad_norm": 1.7764984369277954,
"learning_rate": 1.190797300412174e-06,
"loss": 0.1123,
"mean_token_accuracy": 0.9590568542480469,
"step": 395
},
{
"epoch": 1.841860465116279,
"grad_norm": 1.8751682043075562,
"learning_rate": 1.1804150984772405e-06,
"loss": 0.1265,
"mean_token_accuracy": 0.9582030773162842,
"step": 396
},
{
"epoch": 1.8465116279069766,
"grad_norm": 1.5821163654327393,
"learning_rate": 1.1703175506431936e-06,
"loss": 0.1142,
"mean_token_accuracy": 0.9597623348236084,
"step": 397
},
{
"epoch": 1.8511627906976744,
"grad_norm": 1.7600274085998535,
"learning_rate": 1.1605053223231367e-06,
"loss": 0.1189,
"mean_token_accuracy": 0.9547197222709656,
"step": 398
},
{
"epoch": 1.8558139534883722,
"grad_norm": 1.8238446712493896,
"learning_rate": 1.1509790601280508e-06,
"loss": 0.1183,
"mean_token_accuracy": 0.9584389328956604,
"step": 399
},
{
"epoch": 1.8604651162790697,
"grad_norm": 1.825140357017517,
"learning_rate": 1.1417393918241832e-06,
"loss": 0.1149,
"mean_token_accuracy": 0.9614356756210327,
"step": 400
},
{
"epoch": 1.8651162790697673,
"grad_norm": 1.8221229314804077,
"learning_rate": 1.1327869262916764e-06,
"loss": 0.1177,
"mean_token_accuracy": 0.954285740852356,
"step": 401
},
{
"epoch": 1.869767441860465,
"grad_norm": 2.279554843902588,
"learning_rate": 1.1241222534844456e-06,
"loss": 0.1456,
"mean_token_accuracy": 0.9513813853263855,
"step": 402
},
{
"epoch": 1.8744186046511628,
"grad_norm": 1.7494654655456543,
"learning_rate": 1.1157459443913036e-06,
"loss": 0.1034,
"mean_token_accuracy": 0.9605114459991455,
"step": 403
},
{
"epoch": 1.8790697674418606,
"grad_norm": 1.9064695835113525,
"learning_rate": 1.1076585509983285e-06,
"loss": 0.1237,
"mean_token_accuracy": 0.9582035541534424,
"step": 404
},
{
"epoch": 1.8837209302325582,
"grad_norm": 1.8712602853775024,
"learning_rate": 1.0998606062524917e-06,
"loss": 0.121,
"mean_token_accuracy": 0.9547223448753357,
"step": 405
},
{
"epoch": 1.8883720930232557,
"grad_norm": 1.6676530838012695,
"learning_rate": 1.0923526240265397e-06,
"loss": 0.1131,
"mean_token_accuracy": 0.9593326449394226,
"step": 406
},
{
"epoch": 1.8930232558139535,
"grad_norm": 1.7135958671569824,
"learning_rate": 1.085135099085126e-06,
"loss": 0.102,
"mean_token_accuracy": 0.961244523525238,
"step": 407
},
{
"epoch": 1.8976744186046512,
"grad_norm": 2.4428012371063232,
"learning_rate": 1.07820850705221e-06,
"loss": 0.1375,
"mean_token_accuracy": 0.9498010873794556,
"step": 408
},
{
"epoch": 1.9023255813953488,
"grad_norm": 1.8211802244186401,
"learning_rate": 1.0715733043797121e-06,
"loss": 0.1109,
"mean_token_accuracy": 0.9600291848182678,
"step": 409
},
{
"epoch": 1.9069767441860463,
"grad_norm": 1.9840325117111206,
"learning_rate": 1.065229928317438e-06,
"loss": 0.1219,
"mean_token_accuracy": 0.95467209815979,
"step": 410
},
{
"epoch": 1.9116279069767441,
"grad_norm": 1.722679615020752,
"learning_rate": 1.0591787968842587e-06,
"loss": 0.1223,
"mean_token_accuracy": 0.9608275294303894,
"step": 411
},
{
"epoch": 1.916279069767442,
"grad_norm": 1.979507327079773,
"learning_rate": 1.0534203088405679e-06,
"loss": 0.1175,
"mean_token_accuracy": 0.9590908885002136,
"step": 412
},
{
"epoch": 1.9209302325581397,
"grad_norm": 1.834620714187622,
"learning_rate": 1.047954843662004e-06,
"loss": 0.1246,
"mean_token_accuracy": 0.9556223750114441,
"step": 413
},
{
"epoch": 1.9255813953488372,
"grad_norm": 1.8021700382232666,
"learning_rate": 1.0427827615144432e-06,
"loss": 0.1171,
"mean_token_accuracy": 0.9610346555709839,
"step": 414
},
{
"epoch": 1.9302325581395348,
"grad_norm": 1.7287272214889526,
"learning_rate": 1.0379044032302621e-06,
"loss": 0.1111,
"mean_token_accuracy": 0.95878666639328,
"step": 415
},
{
"epoch": 1.9348837209302325,
"grad_norm": 1.7230780124664307,
"learning_rate": 1.0333200902858814e-06,
"loss": 0.1162,
"mean_token_accuracy": 0.9592291712760925,
"step": 416
},
{
"epoch": 1.9395348837209303,
"grad_norm": 1.9648188352584839,
"learning_rate": 1.0290301247805788e-06,
"loss": 0.1217,
"mean_token_accuracy": 0.9570950269699097,
"step": 417
},
{
"epoch": 1.9441860465116279,
"grad_norm": 1.5934516191482544,
"learning_rate": 1.0250347894165825e-06,
"loss": 0.1006,
"mean_token_accuracy": 0.9645111560821533,
"step": 418
},
{
"epoch": 1.9488372093023256,
"grad_norm": 1.874199628829956,
"learning_rate": 1.021334347480439e-06,
"loss": 0.123,
"mean_token_accuracy": 0.9575681686401367,
"step": 419
},
{
"epoch": 1.9534883720930232,
"grad_norm": 1.9127286672592163,
"learning_rate": 1.0179290428256663e-06,
"loss": 0.1208,
"mean_token_accuracy": 0.9570922255516052,
"step": 420
},
{
"epoch": 1.958139534883721,
"grad_norm": 1.699216604232788,
"learning_rate": 1.014819099856683e-06,
"loss": 0.1182,
"mean_token_accuracy": 0.9542940855026245,
"step": 421
},
{
"epoch": 1.9627906976744187,
"grad_norm": 1.9871116876602173,
"learning_rate": 1.0120047235140178e-06,
"loss": 0.1239,
"mean_token_accuracy": 0.9596614241600037,
"step": 422
},
{
"epoch": 1.9674418604651163,
"grad_norm": 1.7161730527877808,
"learning_rate": 1.0094860992608083e-06,
"loss": 0.0969,
"mean_token_accuracy": 0.9646767973899841,
"step": 423
},
{
"epoch": 1.9720930232558138,
"grad_norm": 1.6434606313705444,
"learning_rate": 1.0072633930705777e-06,
"loss": 0.1096,
"mean_token_accuracy": 0.9614861011505127,
"step": 424
},
{
"epoch": 1.9767441860465116,
"grad_norm": 1.6597245931625366,
"learning_rate": 1.0053367514162967e-06,
"loss": 0.1082,
"mean_token_accuracy": 0.963832676410675,
"step": 425
},
{
"epoch": 1.9813953488372094,
"grad_norm": 1.8078020811080933,
"learning_rate": 1.0037063012607302e-06,
"loss": 0.1163,
"mean_token_accuracy": 0.9587554335594177,
"step": 426
},
{
"epoch": 1.9860465116279071,
"grad_norm": 1.8626351356506348,
"learning_rate": 1.0023721500480747e-06,
"loss": 0.1105,
"mean_token_accuracy": 0.9619902968406677,
"step": 427
},
{
"epoch": 1.9906976744186047,
"grad_norm": 1.7608290910720825,
"learning_rate": 1.001334385696873e-06,
"loss": 0.1287,
"mean_token_accuracy": 0.9509281516075134,
"step": 428
},
{
"epoch": 1.9953488372093022,
"grad_norm": 1.7384284734725952,
"learning_rate": 1.0005930765942238e-06,
"loss": 0.1167,
"mean_token_accuracy": 0.9590829014778137,
"step": 429
},
{
"epoch": 2.0,
"grad_norm": 1.298376202583313,
"learning_rate": 1.0001482715912744e-06,
"loss": 0.0638,
"mean_token_accuracy": 0.9736953973770142,
"step": 430
},
{
"epoch": 2.0,
"step": 430,
"total_flos": 4.0376628208572826e+17,
"train_loss": 0.22927807642276896,
"train_runtime": 2180.6252,
"train_samples_per_second": 6.284,
"train_steps_per_second": 0.197
}
],
"logging_steps": 1,
"max_steps": 430,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4.0376628208572826e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}