Files
Qwen1.5-MOE-sft-ESFT-summary/trainer_state.json
ModelHub XC 220b277a01 初始化项目,由ModelHub XC社区提供模型
Model: jayzou3773/Qwen1.5-MOE-sft-ESFT-summary
Source: Original Platform
2026-05-05 19:50:49 +08:00

4947 lines
133 KiB
JSON

{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 613,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0016313213703099511,
"grad_norm": 54.38072967529297,
"learning_rate": 1.6129032258064518e-07,
"loss": 3.9722,
"mean_token_accuracy": 0.314461886882782,
"step": 1
},
{
"epoch": 0.0032626427406199023,
"grad_norm": 59.083343505859375,
"learning_rate": 3.2258064516129035e-07,
"loss": 3.7752,
"mean_token_accuracy": 0.3500784933567047,
"step": 2
},
{
"epoch": 0.004893964110929853,
"grad_norm": 52.31679153442383,
"learning_rate": 4.838709677419355e-07,
"loss": 3.9767,
"mean_token_accuracy": 0.32198143005371094,
"step": 3
},
{
"epoch": 0.0065252854812398045,
"grad_norm": 56.8325080871582,
"learning_rate": 6.451612903225807e-07,
"loss": 3.8677,
"mean_token_accuracy": 0.34073251485824585,
"step": 4
},
{
"epoch": 0.008156606851549755,
"grad_norm": 46.90914535522461,
"learning_rate": 8.064516129032258e-07,
"loss": 3.7833,
"mean_token_accuracy": 0.3529976010322571,
"step": 5
},
{
"epoch": 0.009787928221859706,
"grad_norm": 50.84980010986328,
"learning_rate": 9.67741935483871e-07,
"loss": 3.6046,
"mean_token_accuracy": 0.36332181096076965,
"step": 6
},
{
"epoch": 0.011419249592169658,
"grad_norm": 44.124671936035156,
"learning_rate": 1.1290322580645162e-06,
"loss": 3.5605,
"mean_token_accuracy": 0.3784194588661194,
"step": 7
},
{
"epoch": 0.013050570962479609,
"grad_norm": 38.687442779541016,
"learning_rate": 1.2903225806451614e-06,
"loss": 3.6567,
"mean_token_accuracy": 0.36630603671073914,
"step": 8
},
{
"epoch": 0.01468189233278956,
"grad_norm": 32.46002960205078,
"learning_rate": 1.4516129032258066e-06,
"loss": 3.7645,
"mean_token_accuracy": 0.3374135196208954,
"step": 9
},
{
"epoch": 0.01631321370309951,
"grad_norm": 29.601980209350586,
"learning_rate": 1.6129032258064516e-06,
"loss": 3.735,
"mean_token_accuracy": 0.340471088886261,
"step": 10
},
{
"epoch": 0.01794453507340946,
"grad_norm": 25.11663818359375,
"learning_rate": 1.774193548387097e-06,
"loss": 3.3774,
"mean_token_accuracy": 0.3931350111961365,
"step": 11
},
{
"epoch": 0.01957585644371941,
"grad_norm": 18.90343475341797,
"learning_rate": 1.935483870967742e-06,
"loss": 3.2297,
"mean_token_accuracy": 0.41412118077278137,
"step": 12
},
{
"epoch": 0.021207177814029365,
"grad_norm": 21.3724422454834,
"learning_rate": 2.096774193548387e-06,
"loss": 3.1907,
"mean_token_accuracy": 0.43043479323387146,
"step": 13
},
{
"epoch": 0.022838499184339316,
"grad_norm": 18.062108993530273,
"learning_rate": 2.2580645161290324e-06,
"loss": 3.0692,
"mean_token_accuracy": 0.43661972880363464,
"step": 14
},
{
"epoch": 0.024469820554649267,
"grad_norm": 18.955305099487305,
"learning_rate": 2.4193548387096776e-06,
"loss": 3.0939,
"mean_token_accuracy": 0.4284232258796692,
"step": 15
},
{
"epoch": 0.026101141924959218,
"grad_norm": 19.71297264099121,
"learning_rate": 2.580645161290323e-06,
"loss": 2.9745,
"mean_token_accuracy": 0.45571428537368774,
"step": 16
},
{
"epoch": 0.02773246329526917,
"grad_norm": 15.891701698303223,
"learning_rate": 2.7419354838709676e-06,
"loss": 2.843,
"mean_token_accuracy": 0.4642857015132904,
"step": 17
},
{
"epoch": 0.02936378466557912,
"grad_norm": 14.574506759643555,
"learning_rate": 2.903225806451613e-06,
"loss": 2.6097,
"mean_token_accuracy": 0.4918205738067627,
"step": 18
},
{
"epoch": 0.03099510603588907,
"grad_norm": 13.931673049926758,
"learning_rate": 3.0645161290322584e-06,
"loss": 2.5162,
"mean_token_accuracy": 0.5131129026412964,
"step": 19
},
{
"epoch": 0.03262642740619902,
"grad_norm": 13.101471900939941,
"learning_rate": 3.225806451612903e-06,
"loss": 2.7058,
"mean_token_accuracy": 0.47575756907463074,
"step": 20
},
{
"epoch": 0.03425774877650897,
"grad_norm": 12.979852676391602,
"learning_rate": 3.3870967741935484e-06,
"loss": 2.5696,
"mean_token_accuracy": 0.4878854751586914,
"step": 21
},
{
"epoch": 0.03588907014681892,
"grad_norm": 14.335384368896484,
"learning_rate": 3.548387096774194e-06,
"loss": 2.5031,
"mean_token_accuracy": 0.48602256178855896,
"step": 22
},
{
"epoch": 0.037520391517128875,
"grad_norm": 14.542072296142578,
"learning_rate": 3.7096774193548392e-06,
"loss": 2.4432,
"mean_token_accuracy": 0.5048364996910095,
"step": 23
},
{
"epoch": 0.03915171288743882,
"grad_norm": 12.069889068603516,
"learning_rate": 3.870967741935484e-06,
"loss": 2.2043,
"mean_token_accuracy": 0.5519013404846191,
"step": 24
},
{
"epoch": 0.040783034257748776,
"grad_norm": 9.698949813842773,
"learning_rate": 4.032258064516129e-06,
"loss": 2.0031,
"mean_token_accuracy": 0.5868473649024963,
"step": 25
},
{
"epoch": 0.04241435562805873,
"grad_norm": 10.89166259765625,
"learning_rate": 4.193548387096774e-06,
"loss": 2.3342,
"mean_token_accuracy": 0.5324609875679016,
"step": 26
},
{
"epoch": 0.04404567699836868,
"grad_norm": 9.197402000427246,
"learning_rate": 4.35483870967742e-06,
"loss": 2.2205,
"mean_token_accuracy": 0.5475698113441467,
"step": 27
},
{
"epoch": 0.04567699836867863,
"grad_norm": 9.47153377532959,
"learning_rate": 4.516129032258065e-06,
"loss": 2.0431,
"mean_token_accuracy": 0.5686706900596619,
"step": 28
},
{
"epoch": 0.04730831973898858,
"grad_norm": 8.886749267578125,
"learning_rate": 4.67741935483871e-06,
"loss": 2.1793,
"mean_token_accuracy": 0.5322735905647278,
"step": 29
},
{
"epoch": 0.048939641109298535,
"grad_norm": 10.089822769165039,
"learning_rate": 4.838709677419355e-06,
"loss": 1.987,
"mean_token_accuracy": 0.5579903721809387,
"step": 30
},
{
"epoch": 0.05057096247960848,
"grad_norm": 11.309324264526367,
"learning_rate": 5e-06,
"loss": 2.0749,
"mean_token_accuracy": 0.5649139285087585,
"step": 31
},
{
"epoch": 0.052202283849918436,
"grad_norm": 9.036641120910645,
"learning_rate": 5.161290322580646e-06,
"loss": 2.1629,
"mean_token_accuracy": 0.5413948893547058,
"step": 32
},
{
"epoch": 0.053833605220228384,
"grad_norm": 8.936366081237793,
"learning_rate": 5.322580645161291e-06,
"loss": 1.9053,
"mean_token_accuracy": 0.5875675678253174,
"step": 33
},
{
"epoch": 0.05546492659053834,
"grad_norm": 8.523772239685059,
"learning_rate": 5.483870967741935e-06,
"loss": 1.962,
"mean_token_accuracy": 0.5871559381484985,
"step": 34
},
{
"epoch": 0.057096247960848286,
"grad_norm": 8.703071594238281,
"learning_rate": 5.645161290322582e-06,
"loss": 2.0717,
"mean_token_accuracy": 0.5543113350868225,
"step": 35
},
{
"epoch": 0.05872756933115824,
"grad_norm": 8.243901252746582,
"learning_rate": 5.806451612903226e-06,
"loss": 1.9278,
"mean_token_accuracy": 0.5818815231323242,
"step": 36
},
{
"epoch": 0.06035889070146819,
"grad_norm": 8.658400535583496,
"learning_rate": 5.967741935483872e-06,
"loss": 1.9476,
"mean_token_accuracy": 0.5738636255264282,
"step": 37
},
{
"epoch": 0.06199021207177814,
"grad_norm": 8.671000480651855,
"learning_rate": 6.129032258064517e-06,
"loss": 2.0554,
"mean_token_accuracy": 0.5679658055305481,
"step": 38
},
{
"epoch": 0.0636215334420881,
"grad_norm": 9.466026306152344,
"learning_rate": 6.290322580645162e-06,
"loss": 1.9489,
"mean_token_accuracy": 0.5673534274101257,
"step": 39
},
{
"epoch": 0.06525285481239804,
"grad_norm": 8.415104866027832,
"learning_rate": 6.451612903225806e-06,
"loss": 2.1262,
"mean_token_accuracy": 0.5633999109268188,
"step": 40
},
{
"epoch": 0.06688417618270799,
"grad_norm": 7.783365726470947,
"learning_rate": 6.612903225806452e-06,
"loss": 1.7869,
"mean_token_accuracy": 0.6028110384941101,
"step": 41
},
{
"epoch": 0.06851549755301795,
"grad_norm": 8.495488166809082,
"learning_rate": 6.774193548387097e-06,
"loss": 1.7062,
"mean_token_accuracy": 0.6242873668670654,
"step": 42
},
{
"epoch": 0.0701468189233279,
"grad_norm": 8.216286659240723,
"learning_rate": 6.935483870967743e-06,
"loss": 1.8002,
"mean_token_accuracy": 0.6164383292198181,
"step": 43
},
{
"epoch": 0.07177814029363784,
"grad_norm": 7.681854724884033,
"learning_rate": 7.096774193548388e-06,
"loss": 1.8663,
"mean_token_accuracy": 0.5869767665863037,
"step": 44
},
{
"epoch": 0.0734094616639478,
"grad_norm": 7.960548400878906,
"learning_rate": 7.258064516129033e-06,
"loss": 1.5801,
"mean_token_accuracy": 0.6295210123062134,
"step": 45
},
{
"epoch": 0.07504078303425775,
"grad_norm": 8.843791007995605,
"learning_rate": 7.4193548387096784e-06,
"loss": 1.9695,
"mean_token_accuracy": 0.5767748951911926,
"step": 46
},
{
"epoch": 0.0766721044045677,
"grad_norm": 7.562375068664551,
"learning_rate": 7.580645161290323e-06,
"loss": 1.8982,
"mean_token_accuracy": 0.5856515169143677,
"step": 47
},
{
"epoch": 0.07830342577487764,
"grad_norm": 7.976773738861084,
"learning_rate": 7.741935483870968e-06,
"loss": 1.8455,
"mean_token_accuracy": 0.5857519507408142,
"step": 48
},
{
"epoch": 0.0799347471451876,
"grad_norm": 7.795076847076416,
"learning_rate": 7.903225806451613e-06,
"loss": 1.738,
"mean_token_accuracy": 0.602642297744751,
"step": 49
},
{
"epoch": 0.08156606851549755,
"grad_norm": 9.113154411315918,
"learning_rate": 8.064516129032258e-06,
"loss": 1.7887,
"mean_token_accuracy": 0.6150583028793335,
"step": 50
},
{
"epoch": 0.08319738988580751,
"grad_norm": 9.503119468688965,
"learning_rate": 8.225806451612904e-06,
"loss": 1.6738,
"mean_token_accuracy": 0.6308540105819702,
"step": 51
},
{
"epoch": 0.08482871125611746,
"grad_norm": 7.7233757972717285,
"learning_rate": 8.387096774193549e-06,
"loss": 1.8524,
"mean_token_accuracy": 0.6068170666694641,
"step": 52
},
{
"epoch": 0.0864600326264274,
"grad_norm": 8.368830680847168,
"learning_rate": 8.548387096774194e-06,
"loss": 1.6863,
"mean_token_accuracy": 0.641238272190094,
"step": 53
},
{
"epoch": 0.08809135399673736,
"grad_norm": 8.289685249328613,
"learning_rate": 8.70967741935484e-06,
"loss": 1.7527,
"mean_token_accuracy": 0.6219838857650757,
"step": 54
},
{
"epoch": 0.08972267536704731,
"grad_norm": 8.580499649047852,
"learning_rate": 8.870967741935484e-06,
"loss": 1.7605,
"mean_token_accuracy": 0.622188150882721,
"step": 55
},
{
"epoch": 0.09135399673735727,
"grad_norm": 8.407153129577637,
"learning_rate": 9.03225806451613e-06,
"loss": 1.9015,
"mean_token_accuracy": 0.6121242046356201,
"step": 56
},
{
"epoch": 0.0929853181076672,
"grad_norm": 7.347232818603516,
"learning_rate": 9.193548387096775e-06,
"loss": 1.6066,
"mean_token_accuracy": 0.6575052738189697,
"step": 57
},
{
"epoch": 0.09461663947797716,
"grad_norm": 7.600398063659668,
"learning_rate": 9.35483870967742e-06,
"loss": 1.6309,
"mean_token_accuracy": 0.6496000289916992,
"step": 58
},
{
"epoch": 0.09624796084828711,
"grad_norm": 9.03729248046875,
"learning_rate": 9.516129032258065e-06,
"loss": 1.5208,
"mean_token_accuracy": 0.6523297429084778,
"step": 59
},
{
"epoch": 0.09787928221859707,
"grad_norm": 7.88900899887085,
"learning_rate": 9.67741935483871e-06,
"loss": 1.5696,
"mean_token_accuracy": 0.6507083773612976,
"step": 60
},
{
"epoch": 0.09951060358890701,
"grad_norm": 7.398552417755127,
"learning_rate": 9.838709677419356e-06,
"loss": 1.4991,
"mean_token_accuracy": 0.6561679840087891,
"step": 61
},
{
"epoch": 0.10114192495921696,
"grad_norm": 7.690386772155762,
"learning_rate": 1e-05,
"loss": 1.4677,
"mean_token_accuracy": 0.6609534025192261,
"step": 62
},
{
"epoch": 0.10277324632952692,
"grad_norm": 7.935258865356445,
"learning_rate": 9.999926856137682e-06,
"loss": 1.5293,
"mean_token_accuracy": 0.6509740352630615,
"step": 63
},
{
"epoch": 0.10440456769983687,
"grad_norm": 7.435649871826172,
"learning_rate": 9.999707426928513e-06,
"loss": 1.5408,
"mean_token_accuracy": 0.6423665881156921,
"step": 64
},
{
"epoch": 0.10603588907014681,
"grad_norm": 7.0717668533325195,
"learning_rate": 9.999341719505784e-06,
"loss": 1.2598,
"mean_token_accuracy": 0.7105831503868103,
"step": 65
},
{
"epoch": 0.10766721044045677,
"grad_norm": 7.5760722160339355,
"learning_rate": 9.998829745758052e-06,
"loss": 1.5635,
"mean_token_accuracy": 0.6381751298904419,
"step": 66
},
{
"epoch": 0.10929853181076672,
"grad_norm": 7.556014060974121,
"learning_rate": 9.998171522328753e-06,
"loss": 1.6741,
"mean_token_accuracy": 0.6098901033401489,
"step": 67
},
{
"epoch": 0.11092985318107668,
"grad_norm": 7.316895008087158,
"learning_rate": 9.99736707061567e-06,
"loss": 1.698,
"mean_token_accuracy": 0.6228723526000977,
"step": 68
},
{
"epoch": 0.11256117455138662,
"grad_norm": 8.193136215209961,
"learning_rate": 9.996416416770227e-06,
"loss": 1.6473,
"mean_token_accuracy": 0.6394094228744507,
"step": 69
},
{
"epoch": 0.11419249592169657,
"grad_norm": 6.792864799499512,
"learning_rate": 9.995319591696643e-06,
"loss": 1.6064,
"mean_token_accuracy": 0.6287455558776855,
"step": 70
},
{
"epoch": 0.11582381729200653,
"grad_norm": 7.596305847167969,
"learning_rate": 9.994076631050926e-06,
"loss": 1.8675,
"mean_token_accuracy": 0.5812404155731201,
"step": 71
},
{
"epoch": 0.11745513866231648,
"grad_norm": 6.764160633087158,
"learning_rate": 9.99268757523972e-06,
"loss": 1.5861,
"mean_token_accuracy": 0.64697265625,
"step": 72
},
{
"epoch": 0.11908646003262642,
"grad_norm": 7.583809852600098,
"learning_rate": 9.991152469418984e-06,
"loss": 1.3654,
"mean_token_accuracy": 0.6922652125358582,
"step": 73
},
{
"epoch": 0.12071778140293637,
"grad_norm": 7.365781307220459,
"learning_rate": 9.989471363492523e-06,
"loss": 1.6449,
"mean_token_accuracy": 0.6340000033378601,
"step": 74
},
{
"epoch": 0.12234910277324633,
"grad_norm": 7.349303722381592,
"learning_rate": 9.987644312110373e-06,
"loss": 1.7496,
"mean_token_accuracy": 0.6141689419746399,
"step": 75
},
{
"epoch": 0.12398042414355628,
"grad_norm": 6.4074273109436035,
"learning_rate": 9.985671374667024e-06,
"loss": 1.5874,
"mean_token_accuracy": 0.6464434862136841,
"step": 76
},
{
"epoch": 0.12561174551386622,
"grad_norm": 6.483602046966553,
"learning_rate": 9.98355261529948e-06,
"loss": 1.6916,
"mean_token_accuracy": 0.6172904372215271,
"step": 77
},
{
"epoch": 0.1272430668841762,
"grad_norm": 6.887275695800781,
"learning_rate": 9.981288102885185e-06,
"loss": 1.6873,
"mean_token_accuracy": 0.6121962666511536,
"step": 78
},
{
"epoch": 0.12887438825448613,
"grad_norm": 6.4050703048706055,
"learning_rate": 9.978877911039772e-06,
"loss": 1.4187,
"mean_token_accuracy": 0.6751824617385864,
"step": 79
},
{
"epoch": 0.13050570962479607,
"grad_norm": 6.44724178314209,
"learning_rate": 9.976322118114685e-06,
"loss": 1.4161,
"mean_token_accuracy": 0.6592556834220886,
"step": 80
},
{
"epoch": 0.13213703099510604,
"grad_norm": 5.995436668395996,
"learning_rate": 9.97362080719462e-06,
"loss": 1.3907,
"mean_token_accuracy": 0.6656084656715393,
"step": 81
},
{
"epoch": 0.13376835236541598,
"grad_norm": 6.501825332641602,
"learning_rate": 9.970774066094825e-06,
"loss": 1.6026,
"mean_token_accuracy": 0.603732168674469,
"step": 82
},
{
"epoch": 0.13539967373572595,
"grad_norm": 7.173989772796631,
"learning_rate": 9.967781987358252e-06,
"loss": 1.7378,
"mean_token_accuracy": 0.6143959164619446,
"step": 83
},
{
"epoch": 0.1370309951060359,
"grad_norm": 6.576292991638184,
"learning_rate": 9.964644668252544e-06,
"loss": 1.4204,
"mean_token_accuracy": 0.6584976315498352,
"step": 84
},
{
"epoch": 0.13866231647634583,
"grad_norm": 8.727774620056152,
"learning_rate": 9.961362210766871e-06,
"loss": 1.6993,
"mean_token_accuracy": 0.6126176118850708,
"step": 85
},
{
"epoch": 0.1402936378466558,
"grad_norm": 6.580403804779053,
"learning_rate": 9.957934721608621e-06,
"loss": 1.6845,
"mean_token_accuracy": 0.6215676665306091,
"step": 86
},
{
"epoch": 0.14192495921696574,
"grad_norm": 5.9920830726623535,
"learning_rate": 9.954362312199926e-06,
"loss": 1.3893,
"mean_token_accuracy": 0.6767676472663879,
"step": 87
},
{
"epoch": 0.14355628058727568,
"grad_norm": 5.893803119659424,
"learning_rate": 9.950645098674037e-06,
"loss": 1.4447,
"mean_token_accuracy": 0.6626806259155273,
"step": 88
},
{
"epoch": 0.14518760195758565,
"grad_norm": 6.5982770919799805,
"learning_rate": 9.946783201871558e-06,
"loss": 1.3436,
"mean_token_accuracy": 0.6762666702270508,
"step": 89
},
{
"epoch": 0.1468189233278956,
"grad_norm": 5.981234550476074,
"learning_rate": 9.942776747336509e-06,
"loss": 1.5784,
"mean_token_accuracy": 0.6174784898757935,
"step": 90
},
{
"epoch": 0.14845024469820556,
"grad_norm": 6.088432788848877,
"learning_rate": 9.938625865312252e-06,
"loss": 1.7807,
"mean_token_accuracy": 0.5808597803115845,
"step": 91
},
{
"epoch": 0.1500815660685155,
"grad_norm": 6.743659973144531,
"learning_rate": 9.934330690737247e-06,
"loss": 1.6376,
"mean_token_accuracy": 0.604613721370697,
"step": 92
},
{
"epoch": 0.15171288743882544,
"grad_norm": 5.764866828918457,
"learning_rate": 9.929891363240679e-06,
"loss": 1.6292,
"mean_token_accuracy": 0.6264821887016296,
"step": 93
},
{
"epoch": 0.1533442088091354,
"grad_norm": 5.750985622406006,
"learning_rate": 9.925308027137906e-06,
"loss": 1.3667,
"mean_token_accuracy": 0.6758104562759399,
"step": 94
},
{
"epoch": 0.15497553017944535,
"grad_norm": 5.635873317718506,
"learning_rate": 9.920580831425774e-06,
"loss": 1.442,
"mean_token_accuracy": 0.6777954697608948,
"step": 95
},
{
"epoch": 0.1566068515497553,
"grad_norm": 5.207980632781982,
"learning_rate": 9.915709929777773e-06,
"loss": 1.1315,
"mean_token_accuracy": 0.7171201705932617,
"step": 96
},
{
"epoch": 0.15823817292006526,
"grad_norm": 6.929599761962891,
"learning_rate": 9.910695480539043e-06,
"loss": 1.5498,
"mean_token_accuracy": 0.6462904810905457,
"step": 97
},
{
"epoch": 0.1598694942903752,
"grad_norm": 6.597740173339844,
"learning_rate": 9.905537646721215e-06,
"loss": 1.3707,
"mean_token_accuracy": 0.6714513301849365,
"step": 98
},
{
"epoch": 0.16150081566068517,
"grad_norm": 5.562872409820557,
"learning_rate": 9.900236595997138e-06,
"loss": 1.2183,
"mean_token_accuracy": 0.709775984287262,
"step": 99
},
{
"epoch": 0.1631321370309951,
"grad_norm": 5.840291976928711,
"learning_rate": 9.89479250069539e-06,
"loss": 1.2124,
"mean_token_accuracy": 0.7145169973373413,
"step": 100
},
{
"epoch": 0.16476345840130505,
"grad_norm": 5.99063777923584,
"learning_rate": 9.889205537794715e-06,
"loss": 1.3492,
"mean_token_accuracy": 0.6756311655044556,
"step": 101
},
{
"epoch": 0.16639477977161501,
"grad_norm": 6.224008560180664,
"learning_rate": 9.883475888918241e-06,
"loss": 1.2016,
"mean_token_accuracy": 0.7054827809333801,
"step": 102
},
{
"epoch": 0.16802610114192496,
"grad_norm": 5.562602519989014,
"learning_rate": 9.87760374032759e-06,
"loss": 1.5352,
"mean_token_accuracy": 0.6521076560020447,
"step": 103
},
{
"epoch": 0.16965742251223492,
"grad_norm": 5.726022243499756,
"learning_rate": 9.87158928291682e-06,
"loss": 1.3858,
"mean_token_accuracy": 0.6717791557312012,
"step": 104
},
{
"epoch": 0.17128874388254486,
"grad_norm": 6.054457664489746,
"learning_rate": 9.865432712206215e-06,
"loss": 1.6255,
"mean_token_accuracy": 0.6333163976669312,
"step": 105
},
{
"epoch": 0.1729200652528548,
"grad_norm": 5.757321357727051,
"learning_rate": 9.859134228335937e-06,
"loss": 1.3847,
"mean_token_accuracy": 0.6641345620155334,
"step": 106
},
{
"epoch": 0.17455138662316477,
"grad_norm": 5.4531450271606445,
"learning_rate": 9.852694036059514e-06,
"loss": 1.4778,
"mean_token_accuracy": 0.680861234664917,
"step": 107
},
{
"epoch": 0.1761827079934747,
"grad_norm": 6.217274188995361,
"learning_rate": 9.846112344737182e-06,
"loss": 1.3624,
"mean_token_accuracy": 0.6645264625549316,
"step": 108
},
{
"epoch": 0.17781402936378465,
"grad_norm": 5.447512626647949,
"learning_rate": 9.839389368329088e-06,
"loss": 1.5179,
"mean_token_accuracy": 0.6528394818305969,
"step": 109
},
{
"epoch": 0.17944535073409462,
"grad_norm": 6.115851402282715,
"learning_rate": 9.832525325388326e-06,
"loss": 1.6997,
"mean_token_accuracy": 0.6170212626457214,
"step": 110
},
{
"epoch": 0.18107667210440456,
"grad_norm": 5.800912857055664,
"learning_rate": 9.825520439053832e-06,
"loss": 1.4313,
"mean_token_accuracy": 0.6626384854316711,
"step": 111
},
{
"epoch": 0.18270799347471453,
"grad_norm": 6.369785785675049,
"learning_rate": 9.818374937043138e-06,
"loss": 1.5534,
"mean_token_accuracy": 0.6290909051895142,
"step": 112
},
{
"epoch": 0.18433931484502447,
"grad_norm": 6.613420009613037,
"learning_rate": 9.811089051644959e-06,
"loss": 1.6318,
"mean_token_accuracy": 0.6186726689338684,
"step": 113
},
{
"epoch": 0.1859706362153344,
"grad_norm": 5.590596675872803,
"learning_rate": 9.803663019711654e-06,
"loss": 1.3043,
"mean_token_accuracy": 0.6894215941429138,
"step": 114
},
{
"epoch": 0.18760195758564438,
"grad_norm": 6.427780628204346,
"learning_rate": 9.796097082651511e-06,
"loss": 1.6446,
"mean_token_accuracy": 0.6234225034713745,
"step": 115
},
{
"epoch": 0.18923327895595432,
"grad_norm": 6.452088356018066,
"learning_rate": 9.788391486420914e-06,
"loss": 1.4595,
"mean_token_accuracy": 0.6346368789672852,
"step": 116
},
{
"epoch": 0.19086460032626426,
"grad_norm": 5.884222984313965,
"learning_rate": 9.780546481516338e-06,
"loss": 1.3437,
"mean_token_accuracy": 0.6792058348655701,
"step": 117
},
{
"epoch": 0.19249592169657423,
"grad_norm": 5.718683242797852,
"learning_rate": 9.772562322966209e-06,
"loss": 1.2696,
"mean_token_accuracy": 0.6850185394287109,
"step": 118
},
{
"epoch": 0.19412724306688417,
"grad_norm": 5.645365238189697,
"learning_rate": 9.764439270322612e-06,
"loss": 1.5184,
"mean_token_accuracy": 0.6474390625953674,
"step": 119
},
{
"epoch": 0.19575856443719414,
"grad_norm": 5.762539386749268,
"learning_rate": 9.756177587652857e-06,
"loss": 1.4345,
"mean_token_accuracy": 0.6544578075408936,
"step": 120
},
{
"epoch": 0.19738988580750408,
"grad_norm": 5.77543306350708,
"learning_rate": 9.74777754353089e-06,
"loss": 1.7153,
"mean_token_accuracy": 0.6169678568840027,
"step": 121
},
{
"epoch": 0.19902120717781402,
"grad_norm": 5.565819263458252,
"learning_rate": 9.739239411028565e-06,
"loss": 1.3033,
"mean_token_accuracy": 0.6986506581306458,
"step": 122
},
{
"epoch": 0.200652528548124,
"grad_norm": 5.547922134399414,
"learning_rate": 9.730563467706765e-06,
"loss": 1.327,
"mean_token_accuracy": 0.683811604976654,
"step": 123
},
{
"epoch": 0.20228384991843393,
"grad_norm": 5.765176296234131,
"learning_rate": 9.721749995606381e-06,
"loss": 1.3776,
"mean_token_accuracy": 0.6884735226631165,
"step": 124
},
{
"epoch": 0.2039151712887439,
"grad_norm": 5.340542793273926,
"learning_rate": 9.712799281239142e-06,
"loss": 1.4246,
"mean_token_accuracy": 0.6791791915893555,
"step": 125
},
{
"epoch": 0.20554649265905384,
"grad_norm": 5.423886775970459,
"learning_rate": 9.703711615578301e-06,
"loss": 1.1438,
"mean_token_accuracy": 0.7353861927986145,
"step": 126
},
{
"epoch": 0.20717781402936378,
"grad_norm": 5.641276836395264,
"learning_rate": 9.694487294049174e-06,
"loss": 1.4128,
"mean_token_accuracy": 0.6514989137649536,
"step": 127
},
{
"epoch": 0.20880913539967375,
"grad_norm": 5.543446063995361,
"learning_rate": 9.685126616519545e-06,
"loss": 1.4135,
"mean_token_accuracy": 0.6586325764656067,
"step": 128
},
{
"epoch": 0.21044045676998369,
"grad_norm": 6.770927906036377,
"learning_rate": 9.675629887289904e-06,
"loss": 1.4884,
"mean_token_accuracy": 0.6546052694320679,
"step": 129
},
{
"epoch": 0.21207177814029363,
"grad_norm": 5.887889385223389,
"learning_rate": 9.665997415083565e-06,
"loss": 1.4939,
"mean_token_accuracy": 0.653674840927124,
"step": 130
},
{
"epoch": 0.2137030995106036,
"grad_norm": 5.511849880218506,
"learning_rate": 9.656229513036623e-06,
"loss": 1.2267,
"mean_token_accuracy": 0.7116374969482422,
"step": 131
},
{
"epoch": 0.21533442088091354,
"grad_norm": 5.637845039367676,
"learning_rate": 9.646326498687787e-06,
"loss": 1.5632,
"mean_token_accuracy": 0.6471421718597412,
"step": 132
},
{
"epoch": 0.2169657422512235,
"grad_norm": 5.33619499206543,
"learning_rate": 9.636288693968039e-06,
"loss": 1.4464,
"mean_token_accuracy": 0.656867265701294,
"step": 133
},
{
"epoch": 0.21859706362153344,
"grad_norm": 5.903771877288818,
"learning_rate": 9.626116425190182e-06,
"loss": 1.5197,
"mean_token_accuracy": 0.6431440114974976,
"step": 134
},
{
"epoch": 0.22022838499184338,
"grad_norm": 5.29071569442749,
"learning_rate": 9.615810023038228e-06,
"loss": 1.4022,
"mean_token_accuracy": 0.646789014339447,
"step": 135
},
{
"epoch": 0.22185970636215335,
"grad_norm": 5.770832538604736,
"learning_rate": 9.605369822556651e-06,
"loss": 1.3488,
"mean_token_accuracy": 0.672672688961029,
"step": 136
},
{
"epoch": 0.2234910277324633,
"grad_norm": 5.827826023101807,
"learning_rate": 9.594796163139487e-06,
"loss": 1.2913,
"mean_token_accuracy": 0.707563042640686,
"step": 137
},
{
"epoch": 0.22512234910277323,
"grad_norm": 6.449001312255859,
"learning_rate": 9.584089388519307e-06,
"loss": 1.6024,
"mean_token_accuracy": 0.6305343508720398,
"step": 138
},
{
"epoch": 0.2267536704730832,
"grad_norm": 5.251701831817627,
"learning_rate": 9.573249846756048e-06,
"loss": 1.4945,
"mean_token_accuracy": 0.6551724076271057,
"step": 139
},
{
"epoch": 0.22838499184339314,
"grad_norm": 5.719169616699219,
"learning_rate": 9.562277890225683e-06,
"loss": 1.4588,
"mean_token_accuracy": 0.6551551818847656,
"step": 140
},
{
"epoch": 0.2300163132137031,
"grad_norm": 5.2488226890563965,
"learning_rate": 9.551173875608785e-06,
"loss": 1.235,
"mean_token_accuracy": 0.6981236338615417,
"step": 141
},
{
"epoch": 0.23164763458401305,
"grad_norm": 5.853959083557129,
"learning_rate": 9.539938163878916e-06,
"loss": 1.3501,
"mean_token_accuracy": 0.6693121790885925,
"step": 142
},
{
"epoch": 0.233278955954323,
"grad_norm": 5.647499084472656,
"learning_rate": 9.528571120290894e-06,
"loss": 1.2444,
"mean_token_accuracy": 0.7117318511009216,
"step": 143
},
{
"epoch": 0.23491027732463296,
"grad_norm": 5.933478832244873,
"learning_rate": 9.517073114368933e-06,
"loss": 1.4919,
"mean_token_accuracy": 0.6552088856697083,
"step": 144
},
{
"epoch": 0.2365415986949429,
"grad_norm": 5.842235565185547,
"learning_rate": 9.505444519894616e-06,
"loss": 1.52,
"mean_token_accuracy": 0.6385658979415894,
"step": 145
},
{
"epoch": 0.23817292006525284,
"grad_norm": 6.486652374267578,
"learning_rate": 9.493685714894746e-06,
"loss": 1.1983,
"mean_token_accuracy": 0.7004634737968445,
"step": 146
},
{
"epoch": 0.2398042414355628,
"grad_norm": 4.8720245361328125,
"learning_rate": 9.481797081629068e-06,
"loss": 1.3004,
"mean_token_accuracy": 0.709541380405426,
"step": 147
},
{
"epoch": 0.24143556280587275,
"grad_norm": 5.410114288330078,
"learning_rate": 9.469779006577822e-06,
"loss": 1.2591,
"mean_token_accuracy": 0.690431535243988,
"step": 148
},
{
"epoch": 0.24306688417618272,
"grad_norm": 5.812628746032715,
"learning_rate": 9.4576318804292e-06,
"loss": 1.612,
"mean_token_accuracy": 0.6232700943946838,
"step": 149
},
{
"epoch": 0.24469820554649266,
"grad_norm": 6.259674072265625,
"learning_rate": 9.445356098066638e-06,
"loss": 1.3041,
"mean_token_accuracy": 0.6718587875366211,
"step": 150
},
{
"epoch": 0.2463295269168026,
"grad_norm": 6.436178207397461,
"learning_rate": 9.43295205855597e-06,
"loss": 1.6111,
"mean_token_accuracy": 0.6207820177078247,
"step": 151
},
{
"epoch": 0.24796084828711257,
"grad_norm": 5.527941703796387,
"learning_rate": 9.420420165132466e-06,
"loss": 1.6642,
"mean_token_accuracy": 0.6238217949867249,
"step": 152
},
{
"epoch": 0.2495921696574225,
"grad_norm": 5.792147159576416,
"learning_rate": 9.407760825187722e-06,
"loss": 1.4365,
"mean_token_accuracy": 0.6555671095848083,
"step": 153
},
{
"epoch": 0.25122349102773245,
"grad_norm": 5.005126953125,
"learning_rate": 9.39497445025641e-06,
"loss": 1.2446,
"mean_token_accuracy": 0.7050209045410156,
"step": 154
},
{
"epoch": 0.2528548123980424,
"grad_norm": 5.894453048706055,
"learning_rate": 9.38206145600291e-06,
"loss": 1.5225,
"mean_token_accuracy": 0.6514018774032593,
"step": 155
},
{
"epoch": 0.2544861337683524,
"grad_norm": 5.637172698974609,
"learning_rate": 9.369022262207788e-06,
"loss": 1.5141,
"mean_token_accuracy": 0.622454047203064,
"step": 156
},
{
"epoch": 0.2561174551386623,
"grad_norm": 5.716491222381592,
"learning_rate": 9.355857292754152e-06,
"loss": 1.5215,
"mean_token_accuracy": 0.6571729779243469,
"step": 157
},
{
"epoch": 0.25774877650897227,
"grad_norm": 6.088312149047852,
"learning_rate": 9.342566975613875e-06,
"loss": 1.5606,
"mean_token_accuracy": 0.6172152161598206,
"step": 158
},
{
"epoch": 0.25938009787928223,
"grad_norm": 6.6313796043396,
"learning_rate": 9.329151742833678e-06,
"loss": 1.261,
"mean_token_accuracy": 0.6948052048683167,
"step": 159
},
{
"epoch": 0.26101141924959215,
"grad_norm": 6.572261333465576,
"learning_rate": 9.315612030521091e-06,
"loss": 1.174,
"mean_token_accuracy": 0.7152777910232544,
"step": 160
},
{
"epoch": 0.2626427406199021,
"grad_norm": 6.0583882331848145,
"learning_rate": 9.301948278830273e-06,
"loss": 1.4,
"mean_token_accuracy": 0.6757156848907471,
"step": 161
},
{
"epoch": 0.2642740619902121,
"grad_norm": 5.715542316436768,
"learning_rate": 9.288160931947698e-06,
"loss": 1.3266,
"mean_token_accuracy": 0.6793855428695679,
"step": 162
},
{
"epoch": 0.265905383360522,
"grad_norm": 5.376319408416748,
"learning_rate": 9.274250438077724e-06,
"loss": 1.1109,
"mean_token_accuracy": 0.7322580814361572,
"step": 163
},
{
"epoch": 0.26753670473083196,
"grad_norm": 5.3145012855529785,
"learning_rate": 9.260217249428016e-06,
"loss": 1.1862,
"mean_token_accuracy": 0.7048360109329224,
"step": 164
},
{
"epoch": 0.26916802610114193,
"grad_norm": 6.1805338859558105,
"learning_rate": 9.246061822194849e-06,
"loss": 1.5489,
"mean_token_accuracy": 0.6458333134651184,
"step": 165
},
{
"epoch": 0.2707993474714519,
"grad_norm": 5.672875881195068,
"learning_rate": 9.231784616548277e-06,
"loss": 1.3288,
"mean_token_accuracy": 0.6853932738304138,
"step": 166
},
{
"epoch": 0.2724306688417618,
"grad_norm": 5.999112606048584,
"learning_rate": 9.217386096617175e-06,
"loss": 1.5361,
"mean_token_accuracy": 0.6438902616500854,
"step": 167
},
{
"epoch": 0.2740619902120718,
"grad_norm": 6.415194511413574,
"learning_rate": 9.202866730474143e-06,
"loss": 1.5405,
"mean_token_accuracy": 0.6401821970939636,
"step": 168
},
{
"epoch": 0.27569331158238175,
"grad_norm": 6.119101524353027,
"learning_rate": 9.188226990120303e-06,
"loss": 1.4685,
"mean_token_accuracy": 0.6468571424484253,
"step": 169
},
{
"epoch": 0.27732463295269166,
"grad_norm": 5.0899434089660645,
"learning_rate": 9.173467351469943e-06,
"loss": 1.1837,
"mean_token_accuracy": 0.7153804302215576,
"step": 170
},
{
"epoch": 0.27895595432300163,
"grad_norm": 5.665865421295166,
"learning_rate": 9.158588294335055e-06,
"loss": 1.271,
"mean_token_accuracy": 0.6892816424369812,
"step": 171
},
{
"epoch": 0.2805872756933116,
"grad_norm": 5.781040668487549,
"learning_rate": 9.14359030240973e-06,
"loss": 1.1938,
"mean_token_accuracy": 0.7185488343238831,
"step": 172
},
{
"epoch": 0.2822185970636215,
"grad_norm": 4.997267723083496,
"learning_rate": 9.128473863254438e-06,
"loss": 1.2519,
"mean_token_accuracy": 0.6875,
"step": 173
},
{
"epoch": 0.2838499184339315,
"grad_norm": 5.392592906951904,
"learning_rate": 9.113239468280175e-06,
"loss": 1.5819,
"mean_token_accuracy": 0.6332082748413086,
"step": 174
},
{
"epoch": 0.28548123980424145,
"grad_norm": 4.405828952789307,
"learning_rate": 9.097887612732495e-06,
"loss": 0.9685,
"mean_token_accuracy": 0.7657608985900879,
"step": 175
},
{
"epoch": 0.28711256117455136,
"grad_norm": 4.870915412902832,
"learning_rate": 9.082418795675397e-06,
"loss": 1.2698,
"mean_token_accuracy": 0.7014712691307068,
"step": 176
},
{
"epoch": 0.28874388254486133,
"grad_norm": 5.485860824584961,
"learning_rate": 9.066833519975118e-06,
"loss": 1.3616,
"mean_token_accuracy": 0.6694870591163635,
"step": 177
},
{
"epoch": 0.2903752039151713,
"grad_norm": 5.251032829284668,
"learning_rate": 9.051132292283772e-06,
"loss": 1.1863,
"mean_token_accuracy": 0.6943209767341614,
"step": 178
},
{
"epoch": 0.29200652528548127,
"grad_norm": 5.481298923492432,
"learning_rate": 9.035315623022886e-06,
"loss": 1.3581,
"mean_token_accuracy": 0.6696730256080627,
"step": 179
},
{
"epoch": 0.2936378466557912,
"grad_norm": 5.111570358276367,
"learning_rate": 9.019384026366807e-06,
"loss": 1.3505,
"mean_token_accuracy": 0.6688086986541748,
"step": 180
},
{
"epoch": 0.29526916802610115,
"grad_norm": 4.826779842376709,
"learning_rate": 9.003338020225986e-06,
"loss": 1.1635,
"mean_token_accuracy": 0.7186034321784973,
"step": 181
},
{
"epoch": 0.2969004893964111,
"grad_norm": 5.660580635070801,
"learning_rate": 8.987178126230138e-06,
"loss": 1.5801,
"mean_token_accuracy": 0.6331775784492493,
"step": 182
},
{
"epoch": 0.29853181076672103,
"grad_norm": 5.761633396148682,
"learning_rate": 8.97090486971129e-06,
"loss": 1.1748,
"mean_token_accuracy": 0.7208150029182434,
"step": 183
},
{
"epoch": 0.300163132137031,
"grad_norm": 5.576194763183594,
"learning_rate": 8.954518779686704e-06,
"loss": 1.4442,
"mean_token_accuracy": 0.6586382389068604,
"step": 184
},
{
"epoch": 0.30179445350734097,
"grad_norm": 5.576228618621826,
"learning_rate": 8.938020388841673e-06,
"loss": 1.3454,
"mean_token_accuracy": 0.6765140295028687,
"step": 185
},
{
"epoch": 0.3034257748776509,
"grad_norm": 4.994912624359131,
"learning_rate": 8.921410233512211e-06,
"loss": 1.24,
"mean_token_accuracy": 0.7072243094444275,
"step": 186
},
{
"epoch": 0.30505709624796085,
"grad_norm": 5.298640251159668,
"learning_rate": 8.904688853667612e-06,
"loss": 1.3136,
"mean_token_accuracy": 0.6705882549285889,
"step": 187
},
{
"epoch": 0.3066884176182708,
"grad_norm": 5.550191879272461,
"learning_rate": 8.887856792892902e-06,
"loss": 1.3868,
"mean_token_accuracy": 0.6856528520584106,
"step": 188
},
{
"epoch": 0.3083197389885807,
"grad_norm": 5.478514671325684,
"learning_rate": 8.87091459837116e-06,
"loss": 1.2973,
"mean_token_accuracy": 0.6864721775054932,
"step": 189
},
{
"epoch": 0.3099510603588907,
"grad_norm": 5.3640546798706055,
"learning_rate": 8.853862820865742e-06,
"loss": 1.4836,
"mean_token_accuracy": 0.6382033824920654,
"step": 190
},
{
"epoch": 0.31158238172920066,
"grad_norm": 4.50584077835083,
"learning_rate": 8.83670201470237e-06,
"loss": 1.0835,
"mean_token_accuracy": 0.7182095646858215,
"step": 191
},
{
"epoch": 0.3132137030995106,
"grad_norm": 5.293252944946289,
"learning_rate": 8.819432737751097e-06,
"loss": 1.2622,
"mean_token_accuracy": 0.6940993666648865,
"step": 192
},
{
"epoch": 0.31484502446982054,
"grad_norm": 4.696035861968994,
"learning_rate": 8.802055551408207e-06,
"loss": 1.189,
"mean_token_accuracy": 0.7159493565559387,
"step": 193
},
{
"epoch": 0.3164763458401305,
"grad_norm": 4.758869171142578,
"learning_rate": 8.784571020577926e-06,
"loss": 1.0363,
"mean_token_accuracy": 0.7414075136184692,
"step": 194
},
{
"epoch": 0.3181076672104405,
"grad_norm": 5.393585681915283,
"learning_rate": 8.76697971365409e-06,
"loss": 1.3754,
"mean_token_accuracy": 0.6663179993629456,
"step": 195
},
{
"epoch": 0.3197389885807504,
"grad_norm": 5.480104446411133,
"learning_rate": 8.74928220250164e-06,
"loss": 1.7055,
"mean_token_accuracy": 0.6046082973480225,
"step": 196
},
{
"epoch": 0.32137030995106036,
"grad_norm": 5.184609413146973,
"learning_rate": 8.731479062438056e-06,
"loss": 1.4335,
"mean_token_accuracy": 0.6592000126838684,
"step": 197
},
{
"epoch": 0.32300163132137033,
"grad_norm": 5.132387638092041,
"learning_rate": 8.713570872214637e-06,
"loss": 1.4172,
"mean_token_accuracy": 0.6633475422859192,
"step": 198
},
{
"epoch": 0.32463295269168024,
"grad_norm": 5.561227798461914,
"learning_rate": 8.695558213997692e-06,
"loss": 1.5116,
"mean_token_accuracy": 0.6382217407226562,
"step": 199
},
{
"epoch": 0.3262642740619902,
"grad_norm": 6.255463123321533,
"learning_rate": 8.677441673349622e-06,
"loss": 1.3863,
"mean_token_accuracy": 0.6630803942680359,
"step": 200
},
{
"epoch": 0.3278955954323002,
"grad_norm": 4.947396755218506,
"learning_rate": 8.659221839209869e-06,
"loss": 1.4143,
"mean_token_accuracy": 0.6645483374595642,
"step": 201
},
{
"epoch": 0.3295269168026101,
"grad_norm": 5.235170364379883,
"learning_rate": 8.640899303875785e-06,
"loss": 1.2793,
"mean_token_accuracy": 0.6936695575714111,
"step": 202
},
{
"epoch": 0.33115823817292006,
"grad_norm": 5.727679252624512,
"learning_rate": 8.622474662983372e-06,
"loss": 1.428,
"mean_token_accuracy": 0.6479238867759705,
"step": 203
},
{
"epoch": 0.33278955954323003,
"grad_norm": 5.557906627655029,
"learning_rate": 8.60394851548792e-06,
"loss": 1.3305,
"mean_token_accuracy": 0.6868632435798645,
"step": 204
},
{
"epoch": 0.33442088091353994,
"grad_norm": 5.403807640075684,
"learning_rate": 8.585321463644525e-06,
"loss": 1.3701,
"mean_token_accuracy": 0.6680100560188293,
"step": 205
},
{
"epoch": 0.3360522022838499,
"grad_norm": 5.334835052490234,
"learning_rate": 8.566594112988534e-06,
"loss": 1.3598,
"mean_token_accuracy": 0.6583541035652161,
"step": 206
},
{
"epoch": 0.3376835236541599,
"grad_norm": 4.983403205871582,
"learning_rate": 8.547767072315835e-06,
"loss": 1.2434,
"mean_token_accuracy": 0.6838777661323547,
"step": 207
},
{
"epoch": 0.33931484502446985,
"grad_norm": 5.587502956390381,
"learning_rate": 8.528840953663086e-06,
"loss": 1.3061,
"mean_token_accuracy": 0.688642680644989,
"step": 208
},
{
"epoch": 0.34094616639477976,
"grad_norm": 5.853117942810059,
"learning_rate": 8.5098163722878e-06,
"loss": 1.4633,
"mean_token_accuracy": 0.6635462641716003,
"step": 209
},
{
"epoch": 0.3425774877650897,
"grad_norm": 5.541942596435547,
"learning_rate": 8.490693946648364e-06,
"loss": 1.2622,
"mean_token_accuracy": 0.7057894468307495,
"step": 210
},
{
"epoch": 0.3442088091353997,
"grad_norm": 5.35739278793335,
"learning_rate": 8.47147429838392e-06,
"loss": 1.2618,
"mean_token_accuracy": 0.689638078212738,
"step": 211
},
{
"epoch": 0.3458401305057096,
"grad_norm": 5.423904895782471,
"learning_rate": 8.452158052294158e-06,
"loss": 1.5032,
"mean_token_accuracy": 0.6418230533599854,
"step": 212
},
{
"epoch": 0.3474714518760196,
"grad_norm": 4.8785223960876465,
"learning_rate": 8.432745836319007e-06,
"loss": 1.4344,
"mean_token_accuracy": 0.6615913510322571,
"step": 213
},
{
"epoch": 0.34910277324632955,
"grad_norm": 4.893246650695801,
"learning_rate": 8.413238281518225e-06,
"loss": 1.2007,
"mean_token_accuracy": 0.6991991996765137,
"step": 214
},
{
"epoch": 0.35073409461663946,
"grad_norm": 5.7973504066467285,
"learning_rate": 8.39363602205088e-06,
"loss": 1.5249,
"mean_token_accuracy": 0.6353210806846619,
"step": 215
},
{
"epoch": 0.3523654159869494,
"grad_norm": 5.406508922576904,
"learning_rate": 8.373939695154739e-06,
"loss": 1.2806,
"mean_token_accuracy": 0.6916395425796509,
"step": 216
},
{
"epoch": 0.3539967373572594,
"grad_norm": 4.771231174468994,
"learning_rate": 8.354149941125539e-06,
"loss": 1.1256,
"mean_token_accuracy": 0.7322953343391418,
"step": 217
},
{
"epoch": 0.3556280587275693,
"grad_norm": 5.047488689422607,
"learning_rate": 8.334267403296193e-06,
"loss": 1.1106,
"mean_token_accuracy": 0.7239696383476257,
"step": 218
},
{
"epoch": 0.3572593800978793,
"grad_norm": 5.410397529602051,
"learning_rate": 8.314292728015859e-06,
"loss": 1.182,
"mean_token_accuracy": 0.7058823704719543,
"step": 219
},
{
"epoch": 0.35889070146818924,
"grad_norm": 6.237778663635254,
"learning_rate": 8.294226564628936e-06,
"loss": 1.2493,
"mean_token_accuracy": 0.6834862232208252,
"step": 220
},
{
"epoch": 0.3605220228384992,
"grad_norm": 5.143507957458496,
"learning_rate": 8.274069565453955e-06,
"loss": 1.352,
"mean_token_accuracy": 0.6808404326438904,
"step": 221
},
{
"epoch": 0.3621533442088091,
"grad_norm": 5.389186859130859,
"learning_rate": 8.25382238576237e-06,
"loss": 1.2109,
"mean_token_accuracy": 0.7188329100608826,
"step": 222
},
{
"epoch": 0.3637846655791191,
"grad_norm": 5.256932735443115,
"learning_rate": 8.23348568375726e-06,
"loss": 1.3621,
"mean_token_accuracy": 0.679024875164032,
"step": 223
},
{
"epoch": 0.36541598694942906,
"grad_norm": 5.2731146812438965,
"learning_rate": 8.213060120551923e-06,
"loss": 1.4888,
"mean_token_accuracy": 0.644489586353302,
"step": 224
},
{
"epoch": 0.367047308319739,
"grad_norm": 5.008488655090332,
"learning_rate": 8.1925463601484e-06,
"loss": 1.3388,
"mean_token_accuracy": 0.6928645372390747,
"step": 225
},
{
"epoch": 0.36867862969004894,
"grad_norm": 6.0909247398376465,
"learning_rate": 8.171945069415877e-06,
"loss": 1.3308,
"mean_token_accuracy": 0.6703540086746216,
"step": 226
},
{
"epoch": 0.3703099510603589,
"grad_norm": 6.270472526550293,
"learning_rate": 8.151256918069002e-06,
"loss": 1.5142,
"mean_token_accuracy": 0.6341871023178101,
"step": 227
},
{
"epoch": 0.3719412724306688,
"grad_norm": 5.570935249328613,
"learning_rate": 8.130482578646137e-06,
"loss": 1.1315,
"mean_token_accuracy": 0.7041916251182556,
"step": 228
},
{
"epoch": 0.3735725938009788,
"grad_norm": 5.195607662200928,
"learning_rate": 8.109622726487463e-06,
"loss": 1.54,
"mean_token_accuracy": 0.6397637724876404,
"step": 229
},
{
"epoch": 0.37520391517128876,
"grad_norm": 4.792831897735596,
"learning_rate": 8.088678039713052e-06,
"loss": 1.2567,
"mean_token_accuracy": 0.7066537141799927,
"step": 230
},
{
"epoch": 0.3768352365415987,
"grad_norm": 5.558446407318115,
"learning_rate": 8.067649199200807e-06,
"loss": 1.3282,
"mean_token_accuracy": 0.6886616945266724,
"step": 231
},
{
"epoch": 0.37846655791190864,
"grad_norm": 5.962700366973877,
"learning_rate": 8.046536888564335e-06,
"loss": 1.2761,
"mean_token_accuracy": 0.6899516582489014,
"step": 232
},
{
"epoch": 0.3800978792822186,
"grad_norm": 4.565369129180908,
"learning_rate": 8.025341794130722e-06,
"loss": 1.1489,
"mean_token_accuracy": 0.7200378775596619,
"step": 233
},
{
"epoch": 0.3817292006525285,
"grad_norm": 5.34097146987915,
"learning_rate": 8.004064604918219e-06,
"loss": 1.5369,
"mean_token_accuracy": 0.6295350193977356,
"step": 234
},
{
"epoch": 0.3833605220228385,
"grad_norm": 4.983196258544922,
"learning_rate": 7.982706012613854e-06,
"loss": 1.1661,
"mean_token_accuracy": 0.6999412775039673,
"step": 235
},
{
"epoch": 0.38499184339314846,
"grad_norm": 5.128100395202637,
"learning_rate": 7.961266711550922e-06,
"loss": 1.345,
"mean_token_accuracy": 0.6874102354049683,
"step": 236
},
{
"epoch": 0.3866231647634584,
"grad_norm": 5.386168479919434,
"learning_rate": 7.939747398686445e-06,
"loss": 1.3224,
"mean_token_accuracy": 0.6796019673347473,
"step": 237
},
{
"epoch": 0.38825448613376834,
"grad_norm": 5.458306312561035,
"learning_rate": 7.918148773578492e-06,
"loss": 1.4898,
"mean_token_accuracy": 0.6451776623725891,
"step": 238
},
{
"epoch": 0.3898858075040783,
"grad_norm": 5.1783552169799805,
"learning_rate": 7.896471538363442e-06,
"loss": 1.5354,
"mean_token_accuracy": 0.6542155742645264,
"step": 239
},
{
"epoch": 0.3915171288743883,
"grad_norm": 5.7401580810546875,
"learning_rate": 7.874716397733172e-06,
"loss": 1.4129,
"mean_token_accuracy": 0.6713286638259888,
"step": 240
},
{
"epoch": 0.3931484502446982,
"grad_norm": 5.0389180183410645,
"learning_rate": 7.852884058912124e-06,
"loss": 1.4643,
"mean_token_accuracy": 0.6414728760719299,
"step": 241
},
{
"epoch": 0.39477977161500816,
"grad_norm": 4.874699592590332,
"learning_rate": 7.830975231634341e-06,
"loss": 1.0325,
"mean_token_accuracy": 0.740480363368988,
"step": 242
},
{
"epoch": 0.3964110929853181,
"grad_norm": 4.744316101074219,
"learning_rate": 7.808990628120374e-06,
"loss": 1.154,
"mean_token_accuracy": 0.7321428656578064,
"step": 243
},
{
"epoch": 0.39804241435562804,
"grad_norm": 4.903902530670166,
"learning_rate": 7.786930963054142e-06,
"loss": 1.2538,
"mean_token_accuracy": 0.6969696879386902,
"step": 244
},
{
"epoch": 0.399673735725938,
"grad_norm": 4.861123085021973,
"learning_rate": 7.76479695355969e-06,
"loss": 1.1761,
"mean_token_accuracy": 0.7024747133255005,
"step": 245
},
{
"epoch": 0.401305057096248,
"grad_norm": 5.309647083282471,
"learning_rate": 7.742589319177879e-06,
"loss": 1.2522,
"mean_token_accuracy": 0.7030481696128845,
"step": 246
},
{
"epoch": 0.4029363784665579,
"grad_norm": 4.72802209854126,
"learning_rate": 7.720308781843003e-06,
"loss": 1.1986,
"mean_token_accuracy": 0.7094155550003052,
"step": 247
},
{
"epoch": 0.40456769983686786,
"grad_norm": 6.070117473602295,
"learning_rate": 7.697956065859308e-06,
"loss": 1.295,
"mean_token_accuracy": 0.6842672228813171,
"step": 248
},
{
"epoch": 0.4061990212071778,
"grad_norm": 4.879459857940674,
"learning_rate": 7.67553189787745e-06,
"loss": 1.2096,
"mean_token_accuracy": 0.686804473400116,
"step": 249
},
{
"epoch": 0.4078303425774878,
"grad_norm": 5.451211452484131,
"learning_rate": 7.653037006870878e-06,
"loss": 1.4763,
"mean_token_accuracy": 0.637888491153717,
"step": 250
},
{
"epoch": 0.4094616639477977,
"grad_norm": 4.923818588256836,
"learning_rate": 7.630472124112125e-06,
"loss": 1.2607,
"mean_token_accuracy": 0.6872745752334595,
"step": 251
},
{
"epoch": 0.4110929853181077,
"grad_norm": 5.415319442749023,
"learning_rate": 7.607837983149057e-06,
"loss": 1.1446,
"mean_token_accuracy": 0.7222545146942139,
"step": 252
},
{
"epoch": 0.41272430668841764,
"grad_norm": 5.4529900550842285,
"learning_rate": 7.585135319780995e-06,
"loss": 1.4468,
"mean_token_accuracy": 0.6554580926895142,
"step": 253
},
{
"epoch": 0.41435562805872755,
"grad_norm": 5.247809410095215,
"learning_rate": 7.562364872034823e-06,
"loss": 1.3883,
"mean_token_accuracy": 0.6721068024635315,
"step": 254
},
{
"epoch": 0.4159869494290375,
"grad_norm": 5.47812557220459,
"learning_rate": 7.5395273801409854e-06,
"loss": 1.4343,
"mean_token_accuracy": 0.6608517169952393,
"step": 255
},
{
"epoch": 0.4176182707993475,
"grad_norm": 5.498720645904541,
"learning_rate": 7.5166235865094174e-06,
"loss": 1.4222,
"mean_token_accuracy": 0.6456736326217651,
"step": 256
},
{
"epoch": 0.4192495921696574,
"grad_norm": 4.786160945892334,
"learning_rate": 7.493654235705422e-06,
"loss": 1.4204,
"mean_token_accuracy": 0.6773132681846619,
"step": 257
},
{
"epoch": 0.42088091353996737,
"grad_norm": 5.397915840148926,
"learning_rate": 7.470620074425459e-06,
"loss": 1.4843,
"mean_token_accuracy": 0.6380900740623474,
"step": 258
},
{
"epoch": 0.42251223491027734,
"grad_norm": 5.466760158538818,
"learning_rate": 7.447521851472872e-06,
"loss": 1.4852,
"mean_token_accuracy": 0.6487154364585876,
"step": 259
},
{
"epoch": 0.42414355628058725,
"grad_norm": 5.804627895355225,
"learning_rate": 7.424360317733544e-06,
"loss": 1.3923,
"mean_token_accuracy": 0.6545741558074951,
"step": 260
},
{
"epoch": 0.4257748776508972,
"grad_norm": 5.381365776062012,
"learning_rate": 7.401136226151488e-06,
"loss": 1.4495,
"mean_token_accuracy": 0.6681222915649414,
"step": 261
},
{
"epoch": 0.4274061990212072,
"grad_norm": 4.776740550994873,
"learning_rate": 7.377850331704377e-06,
"loss": 1.0082,
"mean_token_accuracy": 0.7397812604904175,
"step": 262
},
{
"epoch": 0.4290375203915171,
"grad_norm": 5.0946149826049805,
"learning_rate": 7.354503391378992e-06,
"loss": 1.1745,
"mean_token_accuracy": 0.7127882838249207,
"step": 263
},
{
"epoch": 0.43066884176182707,
"grad_norm": 5.161426067352295,
"learning_rate": 7.331096164146616e-06,
"loss": 1.4598,
"mean_token_accuracy": 0.6507353186607361,
"step": 264
},
{
"epoch": 0.43230016313213704,
"grad_norm": 5.14084005355835,
"learning_rate": 7.307629410938364e-06,
"loss": 1.3107,
"mean_token_accuracy": 0.6751173734664917,
"step": 265
},
{
"epoch": 0.433931484502447,
"grad_norm": 4.744462966918945,
"learning_rate": 7.28410389462044e-06,
"loss": 1.2483,
"mean_token_accuracy": 0.6957618594169617,
"step": 266
},
{
"epoch": 0.4355628058727569,
"grad_norm": 5.074441432952881,
"learning_rate": 7.260520379969347e-06,
"loss": 1.2429,
"mean_token_accuracy": 0.7157652378082275,
"step": 267
},
{
"epoch": 0.4371941272430669,
"grad_norm": 5.745429992675781,
"learning_rate": 7.236879633647018e-06,
"loss": 1.3938,
"mean_token_accuracy": 0.6745472550392151,
"step": 268
},
{
"epoch": 0.43882544861337686,
"grad_norm": 4.631476879119873,
"learning_rate": 7.213182424175895e-06,
"loss": 1.141,
"mean_token_accuracy": 0.726822018623352,
"step": 269
},
{
"epoch": 0.44045676998368677,
"grad_norm": 4.983883857727051,
"learning_rate": 7.189429521913942e-06,
"loss": 1.4304,
"mean_token_accuracy": 0.656844973564148,
"step": 270
},
{
"epoch": 0.44208809135399674,
"grad_norm": 5.129734039306641,
"learning_rate": 7.165621699029615e-06,
"loss": 1.2641,
"mean_token_accuracy": 0.7042542099952698,
"step": 271
},
{
"epoch": 0.4437194127243067,
"grad_norm": 5.031182765960693,
"learning_rate": 7.1417597294767405e-06,
"loss": 1.0971,
"mean_token_accuracy": 0.7178630828857422,
"step": 272
},
{
"epoch": 0.4453507340946166,
"grad_norm": 4.948353290557861,
"learning_rate": 7.1178443889693694e-06,
"loss": 0.9821,
"mean_token_accuracy": 0.7603078484535217,
"step": 273
},
{
"epoch": 0.4469820554649266,
"grad_norm": 5.791008472442627,
"learning_rate": 7.0938764549565605e-06,
"loss": 1.3631,
"mean_token_accuracy": 0.6715368032455444,
"step": 274
},
{
"epoch": 0.44861337683523655,
"grad_norm": 5.602413654327393,
"learning_rate": 7.069856706597095e-06,
"loss": 1.4013,
"mean_token_accuracy": 0.6645569801330566,
"step": 275
},
{
"epoch": 0.45024469820554647,
"grad_norm": 4.785682201385498,
"learning_rate": 7.04578592473416e-06,
"loss": 1.2362,
"mean_token_accuracy": 0.6916077136993408,
"step": 276
},
{
"epoch": 0.45187601957585644,
"grad_norm": 4.269145488739014,
"learning_rate": 7.021664891869955e-06,
"loss": 1.1638,
"mean_token_accuracy": 0.7208147048950195,
"step": 277
},
{
"epoch": 0.4535073409461664,
"grad_norm": 5.598394870758057,
"learning_rate": 6.997494392140264e-06,
"loss": 1.449,
"mean_token_accuracy": 0.6509479880332947,
"step": 278
},
{
"epoch": 0.4551386623164764,
"grad_norm": 4.253592491149902,
"learning_rate": 6.973275211288953e-06,
"loss": 0.962,
"mean_token_accuracy": 0.7480490803718567,
"step": 279
},
{
"epoch": 0.4567699836867863,
"grad_norm": 5.125161647796631,
"learning_rate": 6.949008136642437e-06,
"loss": 1.3255,
"mean_token_accuracy": 0.6781437397003174,
"step": 280
},
{
"epoch": 0.45840130505709625,
"grad_norm": 5.201466083526611,
"learning_rate": 6.924693957084079e-06,
"loss": 1.3969,
"mean_token_accuracy": 0.6604675650596619,
"step": 281
},
{
"epoch": 0.4600326264274062,
"grad_norm": 6.443404674530029,
"learning_rate": 6.900333463028546e-06,
"loss": 1.4835,
"mean_token_accuracy": 0.6526094079017639,
"step": 282
},
{
"epoch": 0.46166394779771613,
"grad_norm": 5.083189964294434,
"learning_rate": 6.8759274463961145e-06,
"loss": 1.3969,
"mean_token_accuracy": 0.657814085483551,
"step": 283
},
{
"epoch": 0.4632952691680261,
"grad_norm": 5.082605838775635,
"learning_rate": 6.851476700586926e-06,
"loss": 1.1498,
"mean_token_accuracy": 0.7049723863601685,
"step": 284
},
{
"epoch": 0.46492659053833607,
"grad_norm": 5.190065860748291,
"learning_rate": 6.8269820204551985e-06,
"loss": 1.3005,
"mean_token_accuracy": 0.6958598494529724,
"step": 285
},
{
"epoch": 0.466557911908646,
"grad_norm": 5.341015338897705,
"learning_rate": 6.802444202283381e-06,
"loss": 1.3399,
"mean_token_accuracy": 0.6875981092453003,
"step": 286
},
{
"epoch": 0.46818923327895595,
"grad_norm": 5.5725226402282715,
"learning_rate": 6.777864043756268e-06,
"loss": 1.2856,
"mean_token_accuracy": 0.6699952483177185,
"step": 287
},
{
"epoch": 0.4698205546492659,
"grad_norm": 5.266445636749268,
"learning_rate": 6.7532423439350794e-06,
"loss": 1.4138,
"mean_token_accuracy": 0.6606606841087341,
"step": 288
},
{
"epoch": 0.47145187601957583,
"grad_norm": 4.3366780281066895,
"learning_rate": 6.728579903231463e-06,
"loss": 0.9495,
"mean_token_accuracy": 0.7561102509498596,
"step": 289
},
{
"epoch": 0.4730831973898858,
"grad_norm": 5.066049575805664,
"learning_rate": 6.703877523381495e-06,
"loss": 1.4154,
"mean_token_accuracy": 0.6551030874252319,
"step": 290
},
{
"epoch": 0.47471451876019577,
"grad_norm": 5.101365089416504,
"learning_rate": 6.679136007419607e-06,
"loss": 1.1613,
"mean_token_accuracy": 0.7094940543174744,
"step": 291
},
{
"epoch": 0.4763458401305057,
"grad_norm": 5.026820182800293,
"learning_rate": 6.654356159652483e-06,
"loss": 1.1103,
"mean_token_accuracy": 0.7332636117935181,
"step": 292
},
{
"epoch": 0.47797716150081565,
"grad_norm": 5.343873977661133,
"learning_rate": 6.629538785632912e-06,
"loss": 1.2417,
"mean_token_accuracy": 0.6913783550262451,
"step": 293
},
{
"epoch": 0.4796084828711256,
"grad_norm": 4.890134334564209,
"learning_rate": 6.604684692133597e-06,
"loss": 1.1577,
"mean_token_accuracy": 0.7185184955596924,
"step": 294
},
{
"epoch": 0.4812398042414356,
"grad_norm": 4.612360000610352,
"learning_rate": 6.579794687120938e-06,
"loss": 1.1759,
"mean_token_accuracy": 0.7133533954620361,
"step": 295
},
{
"epoch": 0.4828711256117455,
"grad_norm": 5.377026081085205,
"learning_rate": 6.554869579728753e-06,
"loss": 1.3571,
"mean_token_accuracy": 0.6833840012550354,
"step": 296
},
{
"epoch": 0.48450244698205547,
"grad_norm": 4.644443988800049,
"learning_rate": 6.5299101802319905e-06,
"loss": 1.2068,
"mean_token_accuracy": 0.678475558757782,
"step": 297
},
{
"epoch": 0.48613376835236544,
"grad_norm": 5.2673869132995605,
"learning_rate": 6.504917300020373e-06,
"loss": 1.2203,
"mean_token_accuracy": 0.7017102837562561,
"step": 298
},
{
"epoch": 0.48776508972267535,
"grad_norm": 4.513294219970703,
"learning_rate": 6.479891751572026e-06,
"loss": 1.0001,
"mean_token_accuracy": 0.7441986203193665,
"step": 299
},
{
"epoch": 0.4893964110929853,
"grad_norm": 5.3284454345703125,
"learning_rate": 6.454834348427077e-06,
"loss": 1.4106,
"mean_token_accuracy": 0.6686747074127197,
"step": 300
},
{
"epoch": 0.4910277324632953,
"grad_norm": 4.518412113189697,
"learning_rate": 6.429745905161183e-06,
"loss": 1.0715,
"mean_token_accuracy": 0.7324516773223877,
"step": 301
},
{
"epoch": 0.4926590538336052,
"grad_norm": 6.271851062774658,
"learning_rate": 6.404627237359078e-06,
"loss": 1.3864,
"mean_token_accuracy": 0.6706717610359192,
"step": 302
},
{
"epoch": 0.49429037520391517,
"grad_norm": 5.340519905090332,
"learning_rate": 6.379479161588039e-06,
"loss": 1.3695,
"mean_token_accuracy": 0.6908436417579651,
"step": 303
},
{
"epoch": 0.49592169657422513,
"grad_norm": 5.797330856323242,
"learning_rate": 6.354302495371352e-06,
"loss": 1.5499,
"mean_token_accuracy": 0.6514008641242981,
"step": 304
},
{
"epoch": 0.49755301794453505,
"grad_norm": 5.730583667755127,
"learning_rate": 6.329098057161731e-06,
"loss": 1.2407,
"mean_token_accuracy": 0.7070135474205017,
"step": 305
},
{
"epoch": 0.499184339314845,
"grad_norm": 5.447910308837891,
"learning_rate": 6.303866666314715e-06,
"loss": 1.2594,
"mean_token_accuracy": 0.6743515729904175,
"step": 306
},
{
"epoch": 0.5008156606851549,
"grad_norm": 4.888600826263428,
"learning_rate": 6.278609143062026e-06,
"loss": 1.4212,
"mean_token_accuracy": 0.6533401012420654,
"step": 307
},
{
"epoch": 0.5024469820554649,
"grad_norm": 5.023259162902832,
"learning_rate": 6.2533263084849095e-06,
"loss": 1.149,
"mean_token_accuracy": 0.7093348503112793,
"step": 308
},
{
"epoch": 0.5040783034257749,
"grad_norm": 5.078878879547119,
"learning_rate": 6.228018984487443e-06,
"loss": 1.4057,
"mean_token_accuracy": 0.6541189551353455,
"step": 309
},
{
"epoch": 0.5057096247960848,
"grad_norm": 5.9124298095703125,
"learning_rate": 6.202687993769811e-06,
"loss": 1.381,
"mean_token_accuracy": 0.6719298362731934,
"step": 310
},
{
"epoch": 0.5073409461663948,
"grad_norm": 4.667541980743408,
"learning_rate": 6.177334159801571e-06,
"loss": 1.2029,
"mean_token_accuracy": 0.703399121761322,
"step": 311
},
{
"epoch": 0.5089722675367048,
"grad_norm": 5.1174116134643555,
"learning_rate": 6.151958306794878e-06,
"loss": 1.2424,
"mean_token_accuracy": 0.6848514080047607,
"step": 312
},
{
"epoch": 0.5106035889070146,
"grad_norm": 4.411402702331543,
"learning_rate": 6.126561259677679e-06,
"loss": 1.0155,
"mean_token_accuracy": 0.7456547021865845,
"step": 313
},
{
"epoch": 0.5122349102773246,
"grad_norm": 5.111578464508057,
"learning_rate": 6.101143844066919e-06,
"loss": 1.5141,
"mean_token_accuracy": 0.6380020380020142,
"step": 314
},
{
"epoch": 0.5138662316476346,
"grad_norm": 4.498473167419434,
"learning_rate": 6.0757068862416855e-06,
"loss": 1.0826,
"mean_token_accuracy": 0.7336174845695496,
"step": 315
},
{
"epoch": 0.5154975530179445,
"grad_norm": 4.840219974517822,
"learning_rate": 6.050251213116356e-06,
"loss": 1.1874,
"mean_token_accuracy": 0.7011764645576477,
"step": 316
},
{
"epoch": 0.5171288743882545,
"grad_norm": 5.577286243438721,
"learning_rate": 6.024777652213702e-06,
"loss": 1.3661,
"mean_token_accuracy": 0.6853360533714294,
"step": 317
},
{
"epoch": 0.5187601957585645,
"grad_norm": 5.854229927062988,
"learning_rate": 5.9992870316380085e-06,
"loss": 1.3195,
"mean_token_accuracy": 0.6762208342552185,
"step": 318
},
{
"epoch": 0.5203915171288744,
"grad_norm": 4.742856502532959,
"learning_rate": 5.973780180048138e-06,
"loss": 1.3327,
"mean_token_accuracy": 0.6779661178588867,
"step": 319
},
{
"epoch": 0.5220228384991843,
"grad_norm": 4.710707187652588,
"learning_rate": 5.948257926630594e-06,
"loss": 1.2339,
"mean_token_accuracy": 0.6876561641693115,
"step": 320
},
{
"epoch": 0.5236541598694943,
"grad_norm": 4.941910266876221,
"learning_rate": 5.9227211010725774e-06,
"loss": 1.2255,
"mean_token_accuracy": 0.6985294222831726,
"step": 321
},
{
"epoch": 0.5252854812398042,
"grad_norm": 4.412341594696045,
"learning_rate": 5.897170533534997e-06,
"loss": 1.0061,
"mean_token_accuracy": 0.7371076345443726,
"step": 322
},
{
"epoch": 0.5269168026101142,
"grad_norm": 5.1144843101501465,
"learning_rate": 5.871607054625497e-06,
"loss": 1.2831,
"mean_token_accuracy": 0.6948955655097961,
"step": 323
},
{
"epoch": 0.5285481239804242,
"grad_norm": 4.727828025817871,
"learning_rate": 5.846031495371445e-06,
"loss": 1.1593,
"mean_token_accuracy": 0.716786801815033,
"step": 324
},
{
"epoch": 0.5301794453507341,
"grad_norm": 4.1832451820373535,
"learning_rate": 5.820444687192922e-06,
"loss": 0.8687,
"mean_token_accuracy": 0.7938080430030823,
"step": 325
},
{
"epoch": 0.531810766721044,
"grad_norm": 4.599882125854492,
"learning_rate": 5.794847461875699e-06,
"loss": 1.2684,
"mean_token_accuracy": 0.7069767713546753,
"step": 326
},
{
"epoch": 0.533442088091354,
"grad_norm": 4.93676233291626,
"learning_rate": 5.769240651544182e-06,
"loss": 1.3537,
"mean_token_accuracy": 0.6710861921310425,
"step": 327
},
{
"epoch": 0.5350734094616639,
"grad_norm": 5.627974510192871,
"learning_rate": 5.74362508863438e-06,
"loss": 1.1545,
"mean_token_accuracy": 0.6976369619369507,
"step": 328
},
{
"epoch": 0.5367047308319739,
"grad_norm": 4.828066349029541,
"learning_rate": 5.7180016058668255e-06,
"loss": 1.3031,
"mean_token_accuracy": 0.6644359230995178,
"step": 329
},
{
"epoch": 0.5383360522022839,
"grad_norm": 4.859476089477539,
"learning_rate": 5.692371036219517e-06,
"loss": 1.2398,
"mean_token_accuracy": 0.6936061382293701,
"step": 330
},
{
"epoch": 0.5399673735725938,
"grad_norm": 5.192332744598389,
"learning_rate": 5.666734212900838e-06,
"loss": 1.4352,
"mean_token_accuracy": 0.6518259048461914,
"step": 331
},
{
"epoch": 0.5415986949429038,
"grad_norm": 5.073202610015869,
"learning_rate": 5.641091969322462e-06,
"loss": 1.4968,
"mean_token_accuracy": 0.6290949583053589,
"step": 332
},
{
"epoch": 0.5432300163132137,
"grad_norm": 5.155499458312988,
"learning_rate": 5.615445139072276e-06,
"loss": 1.2214,
"mean_token_accuracy": 0.6994413137435913,
"step": 333
},
{
"epoch": 0.5448613376835236,
"grad_norm": 5.361922264099121,
"learning_rate": 5.589794555887261e-06,
"loss": 1.3211,
"mean_token_accuracy": 0.6952879428863525,
"step": 334
},
{
"epoch": 0.5464926590538336,
"grad_norm": 4.938032627105713,
"learning_rate": 5.564141053626412e-06,
"loss": 1.0671,
"mean_token_accuracy": 0.7365955114364624,
"step": 335
},
{
"epoch": 0.5481239804241436,
"grad_norm": 5.547269344329834,
"learning_rate": 5.538485466243609e-06,
"loss": 1.1093,
"mean_token_accuracy": 0.7243272662162781,
"step": 336
},
{
"epoch": 0.5497553017944535,
"grad_norm": 4.6895318031311035,
"learning_rate": 5.512828627760519e-06,
"loss": 1.1681,
"mean_token_accuracy": 0.721276581287384,
"step": 337
},
{
"epoch": 0.5513866231647635,
"grad_norm": 4.660412311553955,
"learning_rate": 5.487171372239484e-06,
"loss": 1.0067,
"mean_token_accuracy": 0.756860613822937,
"step": 338
},
{
"epoch": 0.5530179445350734,
"grad_norm": 4.920834541320801,
"learning_rate": 5.461514533756394e-06,
"loss": 1.1513,
"mean_token_accuracy": 0.7190653085708618,
"step": 339
},
{
"epoch": 0.5546492659053833,
"grad_norm": 4.938427925109863,
"learning_rate": 5.435858946373589e-06,
"loss": 1.2603,
"mean_token_accuracy": 0.6953125,
"step": 340
},
{
"epoch": 0.5562805872756933,
"grad_norm": 5.092044830322266,
"learning_rate": 5.410205444112739e-06,
"loss": 1.3949,
"mean_token_accuracy": 0.6499231457710266,
"step": 341
},
{
"epoch": 0.5579119086460033,
"grad_norm": 5.051768779754639,
"learning_rate": 5.384554860927727e-06,
"loss": 1.2452,
"mean_token_accuracy": 0.6954964399337769,
"step": 342
},
{
"epoch": 0.5595432300163132,
"grad_norm": 5.367892265319824,
"learning_rate": 5.35890803067754e-06,
"loss": 1.4017,
"mean_token_accuracy": 0.6810073256492615,
"step": 343
},
{
"epoch": 0.5611745513866232,
"grad_norm": 5.445290565490723,
"learning_rate": 5.333265787099165e-06,
"loss": 1.3892,
"mean_token_accuracy": 0.6558409929275513,
"step": 344
},
{
"epoch": 0.5628058727569332,
"grad_norm": 4.5011091232299805,
"learning_rate": 5.307628963780486e-06,
"loss": 1.134,
"mean_token_accuracy": 0.7164948582649231,
"step": 345
},
{
"epoch": 0.564437194127243,
"grad_norm": 4.943375587463379,
"learning_rate": 5.281998394133177e-06,
"loss": 1.2984,
"mean_token_accuracy": 0.6739248633384705,
"step": 346
},
{
"epoch": 0.566068515497553,
"grad_norm": 5.216898441314697,
"learning_rate": 5.256374911365621e-06,
"loss": 1.0943,
"mean_token_accuracy": 0.742380678653717,
"step": 347
},
{
"epoch": 0.567699836867863,
"grad_norm": 5.109920501708984,
"learning_rate": 5.2307593484558175e-06,
"loss": 1.2526,
"mean_token_accuracy": 0.7040214538574219,
"step": 348
},
{
"epoch": 0.5693311582381729,
"grad_norm": 5.156416416168213,
"learning_rate": 5.205152538124303e-06,
"loss": 1.3782,
"mean_token_accuracy": 0.6628924608230591,
"step": 349
},
{
"epoch": 0.5709624796084829,
"grad_norm": 4.267928600311279,
"learning_rate": 5.179555312807079e-06,
"loss": 1.0406,
"mean_token_accuracy": 0.7428425550460815,
"step": 350
},
{
"epoch": 0.5725938009787929,
"grad_norm": 4.501908302307129,
"learning_rate": 5.153968504628558e-06,
"loss": 1.177,
"mean_token_accuracy": 0.7172932624816895,
"step": 351
},
{
"epoch": 0.5742251223491027,
"grad_norm": 4.767745494842529,
"learning_rate": 5.1283929453745055e-06,
"loss": 1.0175,
"mean_token_accuracy": 0.7485062479972839,
"step": 352
},
{
"epoch": 0.5758564437194127,
"grad_norm": 4.706437110900879,
"learning_rate": 5.102829466465005e-06,
"loss": 1.4045,
"mean_token_accuracy": 0.6768350601196289,
"step": 353
},
{
"epoch": 0.5774877650897227,
"grad_norm": 4.489633083343506,
"learning_rate": 5.077278898927425e-06,
"loss": 1.1147,
"mean_token_accuracy": 0.7224782109260559,
"step": 354
},
{
"epoch": 0.5791190864600326,
"grad_norm": 5.124320030212402,
"learning_rate": 5.051742073369407e-06,
"loss": 1.3278,
"mean_token_accuracy": 0.6733261346817017,
"step": 355
},
{
"epoch": 0.5807504078303426,
"grad_norm": 4.827151298522949,
"learning_rate": 5.026219819951865e-06,
"loss": 1.0634,
"mean_token_accuracy": 0.7362812757492065,
"step": 356
},
{
"epoch": 0.5823817292006526,
"grad_norm": 4.903264045715332,
"learning_rate": 5.000712968361994e-06,
"loss": 1.2472,
"mean_token_accuracy": 0.6971870064735413,
"step": 357
},
{
"epoch": 0.5840130505709625,
"grad_norm": 5.2526421546936035,
"learning_rate": 4.975222347786299e-06,
"loss": 1.4272,
"mean_token_accuracy": 0.6600102186203003,
"step": 358
},
{
"epoch": 0.5856443719412724,
"grad_norm": 4.291179656982422,
"learning_rate": 4.949748786883647e-06,
"loss": 1.058,
"mean_token_accuracy": 0.7493276000022888,
"step": 359
},
{
"epoch": 0.5872756933115824,
"grad_norm": 5.2762041091918945,
"learning_rate": 4.924293113758314e-06,
"loss": 1.3768,
"mean_token_accuracy": 0.6687370538711548,
"step": 360
},
{
"epoch": 0.5889070146818923,
"grad_norm": 4.883618354797363,
"learning_rate": 4.898856155933084e-06,
"loss": 1.2404,
"mean_token_accuracy": 0.7043189406394958,
"step": 361
},
{
"epoch": 0.5905383360522023,
"grad_norm": 5.123283863067627,
"learning_rate": 4.873438740322325e-06,
"loss": 1.2315,
"mean_token_accuracy": 0.7245850563049316,
"step": 362
},
{
"epoch": 0.5921696574225123,
"grad_norm": 5.0717267990112305,
"learning_rate": 4.8480416932051255e-06,
"loss": 1.29,
"mean_token_accuracy": 0.6664901971817017,
"step": 363
},
{
"epoch": 0.5938009787928222,
"grad_norm": 5.272220611572266,
"learning_rate": 4.8226658401984295e-06,
"loss": 1.4004,
"mean_token_accuracy": 0.665610134601593,
"step": 364
},
{
"epoch": 0.5954323001631321,
"grad_norm": 4.845883846282959,
"learning_rate": 4.79731200623019e-06,
"loss": 1.2454,
"mean_token_accuracy": 0.695147693157196,
"step": 365
},
{
"epoch": 0.5970636215334421,
"grad_norm": 4.378237247467041,
"learning_rate": 4.771981015512559e-06,
"loss": 0.8819,
"mean_token_accuracy": 0.7768194079399109,
"step": 366
},
{
"epoch": 0.598694942903752,
"grad_norm": 5.140429973602295,
"learning_rate": 4.746673691515093e-06,
"loss": 1.2651,
"mean_token_accuracy": 0.6864282488822937,
"step": 367
},
{
"epoch": 0.600326264274062,
"grad_norm": 5.289265155792236,
"learning_rate": 4.721390856937976e-06,
"loss": 1.1395,
"mean_token_accuracy": 0.7040935754776001,
"step": 368
},
{
"epoch": 0.601957585644372,
"grad_norm": 5.229256629943848,
"learning_rate": 4.696133333685286e-06,
"loss": 1.2456,
"mean_token_accuracy": 0.6875337362289429,
"step": 369
},
{
"epoch": 0.6035889070146819,
"grad_norm": 5.632972240447998,
"learning_rate": 4.67090194283827e-06,
"loss": 1.3961,
"mean_token_accuracy": 0.670738160610199,
"step": 370
},
{
"epoch": 0.6052202283849919,
"grad_norm": 4.017366409301758,
"learning_rate": 4.645697504628649e-06,
"loss": 0.9787,
"mean_token_accuracy": 0.7547547817230225,
"step": 371
},
{
"epoch": 0.6068515497553018,
"grad_norm": 5.327452182769775,
"learning_rate": 4.6205208384119626e-06,
"loss": 1.201,
"mean_token_accuracy": 0.7167056202888489,
"step": 372
},
{
"epoch": 0.6084828711256117,
"grad_norm": 5.935764312744141,
"learning_rate": 4.595372762640924e-06,
"loss": 1.6929,
"mean_token_accuracy": 0.598901093006134,
"step": 373
},
{
"epoch": 0.6101141924959217,
"grad_norm": 4.699948310852051,
"learning_rate": 4.57025409483882e-06,
"loss": 1.0746,
"mean_token_accuracy": 0.753125011920929,
"step": 374
},
{
"epoch": 0.6117455138662317,
"grad_norm": 4.765663146972656,
"learning_rate": 4.545165651572926e-06,
"loss": 1.2652,
"mean_token_accuracy": 0.707344651222229,
"step": 375
},
{
"epoch": 0.6133768352365416,
"grad_norm": 5.047967433929443,
"learning_rate": 4.520108248427975e-06,
"loss": 1.3292,
"mean_token_accuracy": 0.67514967918396,
"step": 376
},
{
"epoch": 0.6150081566068516,
"grad_norm": 4.735743045806885,
"learning_rate": 4.49508269997963e-06,
"loss": 1.2413,
"mean_token_accuracy": 0.6973969340324402,
"step": 377
},
{
"epoch": 0.6166394779771615,
"grad_norm": 4.652902603149414,
"learning_rate": 4.470089819768011e-06,
"loss": 1.088,
"mean_token_accuracy": 0.7307506203651428,
"step": 378
},
{
"epoch": 0.6182707993474714,
"grad_norm": 5.012315273284912,
"learning_rate": 4.4451304202712486e-06,
"loss": 1.1939,
"mean_token_accuracy": 0.6982803344726562,
"step": 379
},
{
"epoch": 0.6199021207177814,
"grad_norm": 5.312098026275635,
"learning_rate": 4.420205312879065e-06,
"loss": 1.1707,
"mean_token_accuracy": 0.7069069147109985,
"step": 380
},
{
"epoch": 0.6215334420880914,
"grad_norm": 5.2296462059021,
"learning_rate": 4.395315307866404e-06,
"loss": 1.4177,
"mean_token_accuracy": 0.6581963300704956,
"step": 381
},
{
"epoch": 0.6231647634584013,
"grad_norm": 5.124307155609131,
"learning_rate": 4.37046121436709e-06,
"loss": 1.4677,
"mean_token_accuracy": 0.6682761907577515,
"step": 382
},
{
"epoch": 0.6247960848287113,
"grad_norm": 5.106579303741455,
"learning_rate": 4.3456438403475174e-06,
"loss": 1.3623,
"mean_token_accuracy": 0.670346200466156,
"step": 383
},
{
"epoch": 0.6264274061990212,
"grad_norm": 5.028356552124023,
"learning_rate": 4.320863992580393e-06,
"loss": 1.2408,
"mean_token_accuracy": 0.6920192837715149,
"step": 384
},
{
"epoch": 0.6280587275693311,
"grad_norm": 4.444116115570068,
"learning_rate": 4.296122476618507e-06,
"loss": 1.1429,
"mean_token_accuracy": 0.704580545425415,
"step": 385
},
{
"epoch": 0.6296900489396411,
"grad_norm": 4.459901809692383,
"learning_rate": 4.2714200967685405e-06,
"loss": 1.1629,
"mean_token_accuracy": 0.72246915102005,
"step": 386
},
{
"epoch": 0.6313213703099511,
"grad_norm": 6.285675525665283,
"learning_rate": 4.246757656064924e-06,
"loss": 1.4322,
"mean_token_accuracy": 0.6744464635848999,
"step": 387
},
{
"epoch": 0.632952691680261,
"grad_norm": 4.624763011932373,
"learning_rate": 4.222135956243732e-06,
"loss": 1.2574,
"mean_token_accuracy": 0.681078314781189,
"step": 388
},
{
"epoch": 0.634584013050571,
"grad_norm": 4.69317626953125,
"learning_rate": 4.19755579771662e-06,
"loss": 0.912,
"mean_token_accuracy": 0.7686527371406555,
"step": 389
},
{
"epoch": 0.636215334420881,
"grad_norm": 5.052898406982422,
"learning_rate": 4.173017979544804e-06,
"loss": 1.2663,
"mean_token_accuracy": 0.6840921640396118,
"step": 390
},
{
"epoch": 0.6378466557911908,
"grad_norm": 4.830198287963867,
"learning_rate": 4.148523299413075e-06,
"loss": 1.1944,
"mean_token_accuracy": 0.7132551670074463,
"step": 391
},
{
"epoch": 0.6394779771615008,
"grad_norm": 4.418455600738525,
"learning_rate": 4.124072553603887e-06,
"loss": 1.1429,
"mean_token_accuracy": 0.720703125,
"step": 392
},
{
"epoch": 0.6411092985318108,
"grad_norm": 5.233757019042969,
"learning_rate": 4.099666536971456e-06,
"loss": 1.4604,
"mean_token_accuracy": 0.6442708373069763,
"step": 393
},
{
"epoch": 0.6427406199021207,
"grad_norm": 5.798137187957764,
"learning_rate": 4.075306042915922e-06,
"loss": 1.4581,
"mean_token_accuracy": 0.6500260233879089,
"step": 394
},
{
"epoch": 0.6443719412724307,
"grad_norm": 5.040640354156494,
"learning_rate": 4.050991863357564e-06,
"loss": 1.0962,
"mean_token_accuracy": 0.7073915600776672,
"step": 395
},
{
"epoch": 0.6460032626427407,
"grad_norm": 4.5930352210998535,
"learning_rate": 4.026724788711047e-06,
"loss": 1.1013,
"mean_token_accuracy": 0.7120794057846069,
"step": 396
},
{
"epoch": 0.6476345840130505,
"grad_norm": 4.828030586242676,
"learning_rate": 4.002505607859738e-06,
"loss": 1.1984,
"mean_token_accuracy": 0.7033898234367371,
"step": 397
},
{
"epoch": 0.6492659053833605,
"grad_norm": 4.7295331954956055,
"learning_rate": 3.978335108130047e-06,
"loss": 1.0876,
"mean_token_accuracy": 0.7375543713569641,
"step": 398
},
{
"epoch": 0.6508972267536705,
"grad_norm": 5.275457859039307,
"learning_rate": 3.954214075265842e-06,
"loss": 1.2306,
"mean_token_accuracy": 0.7018927335739136,
"step": 399
},
{
"epoch": 0.6525285481239804,
"grad_norm": 5.105504989624023,
"learning_rate": 3.930143293402907e-06,
"loss": 1.3803,
"mean_token_accuracy": 0.637172520160675,
"step": 400
},
{
"epoch": 0.6541598694942904,
"grad_norm": 4.749675750732422,
"learning_rate": 3.906123545043441e-06,
"loss": 1.1234,
"mean_token_accuracy": 0.7189365029335022,
"step": 401
},
{
"epoch": 0.6557911908646004,
"grad_norm": 4.926899433135986,
"learning_rate": 3.882155611030631e-06,
"loss": 1.2681,
"mean_token_accuracy": 0.6934037208557129,
"step": 402
},
{
"epoch": 0.6574225122349103,
"grad_norm": 5.266970157623291,
"learning_rate": 3.858240270523262e-06,
"loss": 1.3901,
"mean_token_accuracy": 0.6708167195320129,
"step": 403
},
{
"epoch": 0.6590538336052202,
"grad_norm": 4.678843975067139,
"learning_rate": 3.834378300970385e-06,
"loss": 1.096,
"mean_token_accuracy": 0.7200217247009277,
"step": 404
},
{
"epoch": 0.6606851549755302,
"grad_norm": 5.216601371765137,
"learning_rate": 3.8105704780860575e-06,
"loss": 1.515,
"mean_token_accuracy": 0.6313887238502502,
"step": 405
},
{
"epoch": 0.6623164763458401,
"grad_norm": 5.138411045074463,
"learning_rate": 3.7868175758241065e-06,
"loss": 1.2448,
"mean_token_accuracy": 0.7139689326286316,
"step": 406
},
{
"epoch": 0.6639477977161501,
"grad_norm": 4.843050479888916,
"learning_rate": 3.7631203663529823e-06,
"loss": 1.3766,
"mean_token_accuracy": 0.6812297701835632,
"step": 407
},
{
"epoch": 0.6655791190864601,
"grad_norm": 4.814765453338623,
"learning_rate": 3.739479620030655e-06,
"loss": 1.0831,
"mean_token_accuracy": 0.7297152280807495,
"step": 408
},
{
"epoch": 0.66721044045677,
"grad_norm": 4.954052448272705,
"learning_rate": 3.715896105379562e-06,
"loss": 1.2928,
"mean_token_accuracy": 0.6796213388442993,
"step": 409
},
{
"epoch": 0.6688417618270799,
"grad_norm": 4.973556995391846,
"learning_rate": 3.692370589061639e-06,
"loss": 1.203,
"mean_token_accuracy": 0.7126886248588562,
"step": 410
},
{
"epoch": 0.6704730831973899,
"grad_norm": 4.508687973022461,
"learning_rate": 3.668903835853386e-06,
"loss": 1.0417,
"mean_token_accuracy": 0.7396226525306702,
"step": 411
},
{
"epoch": 0.6721044045676998,
"grad_norm": 4.325466632843018,
"learning_rate": 3.64549660862101e-06,
"loss": 1.0965,
"mean_token_accuracy": 0.7506775259971619,
"step": 412
},
{
"epoch": 0.6737357259380098,
"grad_norm": 4.78257417678833,
"learning_rate": 3.6221496682956236e-06,
"loss": 1.2328,
"mean_token_accuracy": 0.6968838572502136,
"step": 413
},
{
"epoch": 0.6753670473083198,
"grad_norm": 5.217673301696777,
"learning_rate": 3.5988637738485146e-06,
"loss": 1.1468,
"mean_token_accuracy": 0.7180641293525696,
"step": 414
},
{
"epoch": 0.6769983686786297,
"grad_norm": 5.608780384063721,
"learning_rate": 3.5756396822664595e-06,
"loss": 1.4427,
"mean_token_accuracy": 0.6482036113739014,
"step": 415
},
{
"epoch": 0.6786296900489397,
"grad_norm": 4.913776397705078,
"learning_rate": 3.5524781485271287e-06,
"loss": 1.3126,
"mean_token_accuracy": 0.703459620475769,
"step": 416
},
{
"epoch": 0.6802610114192496,
"grad_norm": 4.990585803985596,
"learning_rate": 3.5293799255745407e-06,
"loss": 1.425,
"mean_token_accuracy": 0.6552053689956665,
"step": 417
},
{
"epoch": 0.6818923327895595,
"grad_norm": 5.035621643066406,
"learning_rate": 3.5063457642945788e-06,
"loss": 1.3351,
"mean_token_accuracy": 0.6864407062530518,
"step": 418
},
{
"epoch": 0.6835236541598695,
"grad_norm": 5.281700134277344,
"learning_rate": 3.4833764134905835e-06,
"loss": 1.2133,
"mean_token_accuracy": 0.6881720423698425,
"step": 419
},
{
"epoch": 0.6851549755301795,
"grad_norm": 4.842787742614746,
"learning_rate": 3.4604726198590177e-06,
"loss": 1.1954,
"mean_token_accuracy": 0.7155085802078247,
"step": 420
},
{
"epoch": 0.6867862969004894,
"grad_norm": 4.937472820281982,
"learning_rate": 3.4376351279651788e-06,
"loss": 1.3095,
"mean_token_accuracy": 0.6968302726745605,
"step": 421
},
{
"epoch": 0.6884176182707994,
"grad_norm": 4.842049598693848,
"learning_rate": 3.4148646802190066e-06,
"loss": 1.0614,
"mean_token_accuracy": 0.7444320917129517,
"step": 422
},
{
"epoch": 0.6900489396411092,
"grad_norm": 4.4336628913879395,
"learning_rate": 3.392162016850945e-06,
"loss": 1.0914,
"mean_token_accuracy": 0.729891300201416,
"step": 423
},
{
"epoch": 0.6916802610114192,
"grad_norm": 5.191675186157227,
"learning_rate": 3.369527875887875e-06,
"loss": 1.2101,
"mean_token_accuracy": 0.7204244136810303,
"step": 424
},
{
"epoch": 0.6933115823817292,
"grad_norm": 5.435412406921387,
"learning_rate": 3.346962993129125e-06,
"loss": 1.2044,
"mean_token_accuracy": 0.7158119678497314,
"step": 425
},
{
"epoch": 0.6949429037520392,
"grad_norm": 4.86824369430542,
"learning_rate": 3.3244681021225506e-06,
"loss": 1.1128,
"mean_token_accuracy": 0.7002801299095154,
"step": 426
},
{
"epoch": 0.6965742251223491,
"grad_norm": 4.692442417144775,
"learning_rate": 3.302043934140693e-06,
"loss": 1.247,
"mean_token_accuracy": 0.683964729309082,
"step": 427
},
{
"epoch": 0.6982055464926591,
"grad_norm": 4.847585201263428,
"learning_rate": 3.279691218156998e-06,
"loss": 1.2886,
"mean_token_accuracy": 0.6823869347572327,
"step": 428
},
{
"epoch": 0.6998368678629691,
"grad_norm": 4.947258472442627,
"learning_rate": 3.2574106808221206e-06,
"loss": 1.1626,
"mean_token_accuracy": 0.7202127575874329,
"step": 429
},
{
"epoch": 0.7014681892332789,
"grad_norm": 4.548014163970947,
"learning_rate": 3.2352030464403117e-06,
"loss": 1.1406,
"mean_token_accuracy": 0.7432366013526917,
"step": 430
},
{
"epoch": 0.7030995106035889,
"grad_norm": 4.8469743728637695,
"learning_rate": 3.2130690369458594e-06,
"loss": 1.2848,
"mean_token_accuracy": 0.6986584067344666,
"step": 431
},
{
"epoch": 0.7047308319738989,
"grad_norm": 4.121768474578857,
"learning_rate": 3.191009371879627e-06,
"loss": 0.9523,
"mean_token_accuracy": 0.7665964365005493,
"step": 432
},
{
"epoch": 0.7063621533442088,
"grad_norm": 4.720678329467773,
"learning_rate": 3.1690247683656617e-06,
"loss": 1.2706,
"mean_token_accuracy": 0.6949771642684937,
"step": 433
},
{
"epoch": 0.7079934747145188,
"grad_norm": 4.939698219299316,
"learning_rate": 3.1471159410878784e-06,
"loss": 1.3505,
"mean_token_accuracy": 0.6539000272750854,
"step": 434
},
{
"epoch": 0.7096247960848288,
"grad_norm": 4.132518291473389,
"learning_rate": 3.125283602266832e-06,
"loss": 0.9859,
"mean_token_accuracy": 0.7509416341781616,
"step": 435
},
{
"epoch": 0.7112561174551386,
"grad_norm": 5.197821617126465,
"learning_rate": 3.1035284616365586e-06,
"loss": 1.1386,
"mean_token_accuracy": 0.714631199836731,
"step": 436
},
{
"epoch": 0.7128874388254486,
"grad_norm": 4.722414016723633,
"learning_rate": 3.0818512264215107e-06,
"loss": 0.9581,
"mean_token_accuracy": 0.7724301815032959,
"step": 437
},
{
"epoch": 0.7145187601957586,
"grad_norm": 4.7281599044799805,
"learning_rate": 3.060252601313557e-06,
"loss": 1.1291,
"mean_token_accuracy": 0.717391312122345,
"step": 438
},
{
"epoch": 0.7161500815660685,
"grad_norm": 4.817330360412598,
"learning_rate": 3.0387332884490806e-06,
"loss": 1.1184,
"mean_token_accuracy": 0.725653886795044,
"step": 439
},
{
"epoch": 0.7177814029363785,
"grad_norm": 4.662072658538818,
"learning_rate": 3.0172939873861486e-06,
"loss": 1.1475,
"mean_token_accuracy": 0.7279778122901917,
"step": 440
},
{
"epoch": 0.7194127243066885,
"grad_norm": 4.278316020965576,
"learning_rate": 2.995935395081781e-06,
"loss": 0.9249,
"mean_token_accuracy": 0.7603439092636108,
"step": 441
},
{
"epoch": 0.7210440456769984,
"grad_norm": 4.619575500488281,
"learning_rate": 2.9746582058692803e-06,
"loss": 1.0338,
"mean_token_accuracy": 0.7423180341720581,
"step": 442
},
{
"epoch": 0.7226753670473083,
"grad_norm": 4.950908660888672,
"learning_rate": 2.953463111435666e-06,
"loss": 1.1649,
"mean_token_accuracy": 0.7079599499702454,
"step": 443
},
{
"epoch": 0.7243066884176182,
"grad_norm": 5.330234050750732,
"learning_rate": 2.932350800799196e-06,
"loss": 1.308,
"mean_token_accuracy": 0.6914836764335632,
"step": 444
},
{
"epoch": 0.7259380097879282,
"grad_norm": 5.278100490570068,
"learning_rate": 2.9113219602869515e-06,
"loss": 1.5142,
"mean_token_accuracy": 0.6575073599815369,
"step": 445
},
{
"epoch": 0.7275693311582382,
"grad_norm": 4.93350076675415,
"learning_rate": 2.890377273512538e-06,
"loss": 1.3363,
"mean_token_accuracy": 0.6751728057861328,
"step": 446
},
{
"epoch": 0.7292006525285482,
"grad_norm": 4.3188910484313965,
"learning_rate": 2.8695174213538647e-06,
"loss": 1.0682,
"mean_token_accuracy": 0.7265364527702332,
"step": 447
},
{
"epoch": 0.7308319738988581,
"grad_norm": 4.598663806915283,
"learning_rate": 2.848743081930998e-06,
"loss": 1.1568,
"mean_token_accuracy": 0.7006726264953613,
"step": 448
},
{
"epoch": 0.732463295269168,
"grad_norm": 5.178636074066162,
"learning_rate": 2.8280549305841265e-06,
"loss": 1.2928,
"mean_token_accuracy": 0.6874651908874512,
"step": 449
},
{
"epoch": 0.734094616639478,
"grad_norm": 5.297123908996582,
"learning_rate": 2.8074536398516004e-06,
"loss": 1.2612,
"mean_token_accuracy": 0.6888131499290466,
"step": 450
},
{
"epoch": 0.7357259380097879,
"grad_norm": 5.056674957275391,
"learning_rate": 2.7869398794480778e-06,
"loss": 1.1595,
"mean_token_accuracy": 0.7092235684394836,
"step": 451
},
{
"epoch": 0.7373572593800979,
"grad_norm": 4.8112030029296875,
"learning_rate": 2.7665143162427427e-06,
"loss": 1.2288,
"mean_token_accuracy": 0.7047522664070129,
"step": 452
},
{
"epoch": 0.7389885807504079,
"grad_norm": 4.844231605529785,
"learning_rate": 2.746177614237631e-06,
"loss": 1.3594,
"mean_token_accuracy": 0.6892874240875244,
"step": 453
},
{
"epoch": 0.7406199021207178,
"grad_norm": 5.323098659515381,
"learning_rate": 2.7259304345460445e-06,
"loss": 1.4409,
"mean_token_accuracy": 0.6324736475944519,
"step": 454
},
{
"epoch": 0.7422512234910277,
"grad_norm": 4.965455532073975,
"learning_rate": 2.7057734353710655e-06,
"loss": 1.2032,
"mean_token_accuracy": 0.6982530355453491,
"step": 455
},
{
"epoch": 0.7438825448613376,
"grad_norm": 4.611636161804199,
"learning_rate": 2.6857072719841436e-06,
"loss": 1.0921,
"mean_token_accuracy": 0.7258726954460144,
"step": 456
},
{
"epoch": 0.7455138662316476,
"grad_norm": 5.415761470794678,
"learning_rate": 2.6657325967038084e-06,
"loss": 1.4882,
"mean_token_accuracy": 0.6622621417045593,
"step": 457
},
{
"epoch": 0.7471451876019576,
"grad_norm": 5.130191326141357,
"learning_rate": 2.645850058874463e-06,
"loss": 1.2448,
"mean_token_accuracy": 0.6971870064735413,
"step": 458
},
{
"epoch": 0.7487765089722676,
"grad_norm": 4.7735748291015625,
"learning_rate": 2.6260603048452636e-06,
"loss": 1.2079,
"mean_token_accuracy": 0.7042531967163086,
"step": 459
},
{
"epoch": 0.7504078303425775,
"grad_norm": 4.764122486114502,
"learning_rate": 2.6063639779491197e-06,
"loss": 1.3132,
"mean_token_accuracy": 0.677205502986908,
"step": 460
},
{
"epoch": 0.7520391517128875,
"grad_norm": 4.8977556228637695,
"learning_rate": 2.586761718481776e-06,
"loss": 1.0483,
"mean_token_accuracy": 0.7458379864692688,
"step": 461
},
{
"epoch": 0.7536704730831973,
"grad_norm": 5.250521183013916,
"learning_rate": 2.5672541636809957e-06,
"loss": 1.3854,
"mean_token_accuracy": 0.6714285612106323,
"step": 462
},
{
"epoch": 0.7553017944535073,
"grad_norm": 4.352292537689209,
"learning_rate": 2.5478419477058446e-06,
"loss": 1.2105,
"mean_token_accuracy": 0.714142918586731,
"step": 463
},
{
"epoch": 0.7569331158238173,
"grad_norm": 4.649628162384033,
"learning_rate": 2.52852570161608e-06,
"loss": 1.1386,
"mean_token_accuracy": 0.721030056476593,
"step": 464
},
{
"epoch": 0.7585644371941273,
"grad_norm": 5.159845352172852,
"learning_rate": 2.5093060533516357e-06,
"loss": 1.0597,
"mean_token_accuracy": 0.7296990156173706,
"step": 465
},
{
"epoch": 0.7601957585644372,
"grad_norm": 4.948349475860596,
"learning_rate": 2.4901836277122e-06,
"loss": 1.2113,
"mean_token_accuracy": 0.6993117928504944,
"step": 466
},
{
"epoch": 0.7618270799347472,
"grad_norm": 4.682156085968018,
"learning_rate": 2.4711590463369163e-06,
"loss": 1.1495,
"mean_token_accuracy": 0.7079691290855408,
"step": 467
},
{
"epoch": 0.763458401305057,
"grad_norm": 4.9600830078125,
"learning_rate": 2.4522329276841664e-06,
"loss": 1.2248,
"mean_token_accuracy": 0.7208632826805115,
"step": 468
},
{
"epoch": 0.765089722675367,
"grad_norm": 5.011682033538818,
"learning_rate": 2.4334058870114685e-06,
"loss": 1.2514,
"mean_token_accuracy": 0.690378725528717,
"step": 469
},
{
"epoch": 0.766721044045677,
"grad_norm": 6.021939754486084,
"learning_rate": 2.414678536355476e-06,
"loss": 1.1848,
"mean_token_accuracy": 0.7004357576370239,
"step": 470
},
{
"epoch": 0.768352365415987,
"grad_norm": 5.621747970581055,
"learning_rate": 2.3960514845120835e-06,
"loss": 1.3135,
"mean_token_accuracy": 0.6799768805503845,
"step": 471
},
{
"epoch": 0.7699836867862969,
"grad_norm": 5.001407623291016,
"learning_rate": 2.377525337016629e-06,
"loss": 1.1641,
"mean_token_accuracy": 0.7319232821464539,
"step": 472
},
{
"epoch": 0.7716150081566069,
"grad_norm": 4.856801509857178,
"learning_rate": 2.359100696124217e-06,
"loss": 1.2248,
"mean_token_accuracy": 0.7054263353347778,
"step": 473
},
{
"epoch": 0.7732463295269169,
"grad_norm": 5.092650890350342,
"learning_rate": 2.340778160790133e-06,
"loss": 1.2368,
"mean_token_accuracy": 0.6984392404556274,
"step": 474
},
{
"epoch": 0.7748776508972267,
"grad_norm": 5.131616592407227,
"learning_rate": 2.32255832665038e-06,
"loss": 1.1432,
"mean_token_accuracy": 0.7190889120101929,
"step": 475
},
{
"epoch": 0.7765089722675367,
"grad_norm": 5.5193047523498535,
"learning_rate": 2.3044417860023082e-06,
"loss": 1.4145,
"mean_token_accuracy": 0.6792343258857727,
"step": 476
},
{
"epoch": 0.7781402936378466,
"grad_norm": 4.5522356033325195,
"learning_rate": 2.286429127785365e-06,
"loss": 1.2906,
"mean_token_accuracy": 0.6974206566810608,
"step": 477
},
{
"epoch": 0.7797716150081566,
"grad_norm": 4.760054588317871,
"learning_rate": 2.2685209375619433e-06,
"loss": 1.2122,
"mean_token_accuracy": 0.7080909609794617,
"step": 478
},
{
"epoch": 0.7814029363784666,
"grad_norm": 4.7698187828063965,
"learning_rate": 2.250717797498361e-06,
"loss": 1.2056,
"mean_token_accuracy": 0.7150395512580872,
"step": 479
},
{
"epoch": 0.7830342577487766,
"grad_norm": 5.215602397918701,
"learning_rate": 2.2330202863459123e-06,
"loss": 1.417,
"mean_token_accuracy": 0.6677489280700684,
"step": 480
},
{
"epoch": 0.7846655791190864,
"grad_norm": 5.066779136657715,
"learning_rate": 2.215428979422074e-06,
"loss": 1.3455,
"mean_token_accuracy": 0.6654175519943237,
"step": 481
},
{
"epoch": 0.7862969004893964,
"grad_norm": 4.236968994140625,
"learning_rate": 2.1979444485917957e-06,
"loss": 1.2404,
"mean_token_accuracy": 0.7059952020645142,
"step": 482
},
{
"epoch": 0.7879282218597063,
"grad_norm": 4.6524224281311035,
"learning_rate": 2.1805672622489044e-06,
"loss": 1.2244,
"mean_token_accuracy": 0.6920965909957886,
"step": 483
},
{
"epoch": 0.7895595432300163,
"grad_norm": 4.233443737030029,
"learning_rate": 2.163297985297633e-06,
"loss": 1.015,
"mean_token_accuracy": 0.7436676621437073,
"step": 484
},
{
"epoch": 0.7911908646003263,
"grad_norm": 4.818909168243408,
"learning_rate": 2.1461371791342572e-06,
"loss": 1.1409,
"mean_token_accuracy": 0.7303598523139954,
"step": 485
},
{
"epoch": 0.7928221859706363,
"grad_norm": 5.0629448890686035,
"learning_rate": 2.129085401628841e-06,
"loss": 1.263,
"mean_token_accuracy": 0.6733444333076477,
"step": 486
},
{
"epoch": 0.7944535073409462,
"grad_norm": 5.042863368988037,
"learning_rate": 2.1121432071071008e-06,
"loss": 1.2654,
"mean_token_accuracy": 0.6947311162948608,
"step": 487
},
{
"epoch": 0.7960848287112561,
"grad_norm": 4.359389305114746,
"learning_rate": 2.0953111463323885e-06,
"loss": 1.09,
"mean_token_accuracy": 0.7307896018028259,
"step": 488
},
{
"epoch": 0.797716150081566,
"grad_norm": 4.828915119171143,
"learning_rate": 2.07858976648779e-06,
"loss": 1.3271,
"mean_token_accuracy": 0.6866028904914856,
"step": 489
},
{
"epoch": 0.799347471451876,
"grad_norm": 5.311947822570801,
"learning_rate": 2.061979611158329e-06,
"loss": 1.4026,
"mean_token_accuracy": 0.6727748513221741,
"step": 490
},
{
"epoch": 0.800978792822186,
"grad_norm": 5.242700576782227,
"learning_rate": 2.045481220313298e-06,
"loss": 1.3683,
"mean_token_accuracy": 0.6764549016952515,
"step": 491
},
{
"epoch": 0.802610114192496,
"grad_norm": 4.709912300109863,
"learning_rate": 2.0290951302887117e-06,
"loss": 1.1447,
"mean_token_accuracy": 0.7429931163787842,
"step": 492
},
{
"epoch": 0.8042414355628059,
"grad_norm": 4.1881184577941895,
"learning_rate": 2.0128218737698653e-06,
"loss": 1.0764,
"mean_token_accuracy": 0.7385087013244629,
"step": 493
},
{
"epoch": 0.8058727569331158,
"grad_norm": 4.042761325836182,
"learning_rate": 1.996661979774017e-06,
"loss": 1.0007,
"mean_token_accuracy": 0.743196427822113,
"step": 494
},
{
"epoch": 0.8075040783034257,
"grad_norm": 4.446390151977539,
"learning_rate": 1.9806159736331935e-06,
"loss": 1.0239,
"mean_token_accuracy": 0.7473176121711731,
"step": 495
},
{
"epoch": 0.8091353996737357,
"grad_norm": 4.78018856048584,
"learning_rate": 1.964684376977115e-06,
"loss": 1.1063,
"mean_token_accuracy": 0.7371134161949158,
"step": 496
},
{
"epoch": 0.8107667210440457,
"grad_norm": 5.604861736297607,
"learning_rate": 1.94886770771623e-06,
"loss": 1.4752,
"mean_token_accuracy": 0.6601036190986633,
"step": 497
},
{
"epoch": 0.8123980424143556,
"grad_norm": 5.058335304260254,
"learning_rate": 1.933166480024883e-06,
"loss": 1.055,
"mean_token_accuracy": 0.7369833588600159,
"step": 498
},
{
"epoch": 0.8140293637846656,
"grad_norm": 4.705621242523193,
"learning_rate": 1.9175812043246034e-06,
"loss": 1.2298,
"mean_token_accuracy": 0.6933262944221497,
"step": 499
},
{
"epoch": 0.8156606851549756,
"grad_norm": 4.777103424072266,
"learning_rate": 1.9021123872675062e-06,
"loss": 1.1538,
"mean_token_accuracy": 0.7174683809280396,
"step": 500
},
{
"epoch": 0.8172920065252854,
"grad_norm": 4.314986705780029,
"learning_rate": 1.886760531719825e-06,
"loss": 0.9366,
"mean_token_accuracy": 0.7647951245307922,
"step": 501
},
{
"epoch": 0.8189233278955954,
"grad_norm": 4.484466075897217,
"learning_rate": 1.8715261367455634e-06,
"loss": 1.0794,
"mean_token_accuracy": 0.744053304195404,
"step": 502
},
{
"epoch": 0.8205546492659054,
"grad_norm": 4.761155605316162,
"learning_rate": 1.8564096975902715e-06,
"loss": 1.1912,
"mean_token_accuracy": 0.7101010084152222,
"step": 503
},
{
"epoch": 0.8221859706362153,
"grad_norm": 5.64600944519043,
"learning_rate": 1.8414117056649466e-06,
"loss": 1.3092,
"mean_token_accuracy": 0.6834645867347717,
"step": 504
},
{
"epoch": 0.8238172920065253,
"grad_norm": 4.866972923278809,
"learning_rate": 1.8265326485300582e-06,
"loss": 1.0176,
"mean_token_accuracy": 0.7384013533592224,
"step": 505
},
{
"epoch": 0.8254486133768353,
"grad_norm": 4.5388994216918945,
"learning_rate": 1.8117730098796996e-06,
"loss": 1.2966,
"mean_token_accuracy": 0.701646089553833,
"step": 506
},
{
"epoch": 0.8270799347471451,
"grad_norm": 4.454381942749023,
"learning_rate": 1.7971332695258592e-06,
"loss": 1.1112,
"mean_token_accuracy": 0.7266221642494202,
"step": 507
},
{
"epoch": 0.8287112561174551,
"grad_norm": 4.481594085693359,
"learning_rate": 1.7826139033828263e-06,
"loss": 1.2742,
"mean_token_accuracy": 0.6912720799446106,
"step": 508
},
{
"epoch": 0.8303425774877651,
"grad_norm": 4.99500036239624,
"learning_rate": 1.768215383451723e-06,
"loss": 1.1617,
"mean_token_accuracy": 0.710889995098114,
"step": 509
},
{
"epoch": 0.831973898858075,
"grad_norm": 4.590748310089111,
"learning_rate": 1.7539381778051511e-06,
"loss": 1.046,
"mean_token_accuracy": 0.7437499761581421,
"step": 510
},
{
"epoch": 0.833605220228385,
"grad_norm": 4.781766414642334,
"learning_rate": 1.7397827505719852e-06,
"loss": 1.2756,
"mean_token_accuracy": 0.6818851232528687,
"step": 511
},
{
"epoch": 0.835236541598695,
"grad_norm": 4.8062744140625,
"learning_rate": 1.7257495619222763e-06,
"loss": 1.2438,
"mean_token_accuracy": 0.6988636255264282,
"step": 512
},
{
"epoch": 0.8368678629690048,
"grad_norm": 4.5913615226745605,
"learning_rate": 1.7118390680523023e-06,
"loss": 1.1542,
"mean_token_accuracy": 0.7089864015579224,
"step": 513
},
{
"epoch": 0.8384991843393148,
"grad_norm": 4.614170551300049,
"learning_rate": 1.6980517211697293e-06,
"loss": 1.0838,
"mean_token_accuracy": 0.7278003692626953,
"step": 514
},
{
"epoch": 0.8401305057096248,
"grad_norm": 4.4173359870910645,
"learning_rate": 1.6843879694789095e-06,
"loss": 1.1843,
"mean_token_accuracy": 0.7148330807685852,
"step": 515
},
{
"epoch": 0.8417618270799347,
"grad_norm": 4.110933303833008,
"learning_rate": 1.6708482571663238e-06,
"loss": 1.0402,
"mean_token_accuracy": 0.7376889586448669,
"step": 516
},
{
"epoch": 0.8433931484502447,
"grad_norm": 4.51687479019165,
"learning_rate": 1.657433024386127e-06,
"loss": 1.0383,
"mean_token_accuracy": 0.7657114267349243,
"step": 517
},
{
"epoch": 0.8450244698205547,
"grad_norm": 5.177441596984863,
"learning_rate": 1.6441427072458493e-06,
"loss": 1.3209,
"mean_token_accuracy": 0.6875,
"step": 518
},
{
"epoch": 0.8466557911908646,
"grad_norm": 4.650432109832764,
"learning_rate": 1.630977737792212e-06,
"loss": 1.1279,
"mean_token_accuracy": 0.7242990732192993,
"step": 519
},
{
"epoch": 0.8482871125611745,
"grad_norm": 4.902032852172852,
"learning_rate": 1.6179385439970897e-06,
"loss": 1.1124,
"mean_token_accuracy": 0.7066738605499268,
"step": 520
},
{
"epoch": 0.8499184339314845,
"grad_norm": 4.605056285858154,
"learning_rate": 1.6050255497435902e-06,
"loss": 1.0645,
"mean_token_accuracy": 0.7346938848495483,
"step": 521
},
{
"epoch": 0.8515497553017944,
"grad_norm": 5.043729305267334,
"learning_rate": 1.592239174812279e-06,
"loss": 1.3279,
"mean_token_accuracy": 0.6896191835403442,
"step": 522
},
{
"epoch": 0.8531810766721044,
"grad_norm": 5.051156520843506,
"learning_rate": 1.5795798348675352e-06,
"loss": 1.0265,
"mean_token_accuracy": 0.7457534074783325,
"step": 523
},
{
"epoch": 0.8548123980424144,
"grad_norm": 4.62628173828125,
"learning_rate": 1.5670479414440315e-06,
"loss": 1.0211,
"mean_token_accuracy": 0.7560975551605225,
"step": 524
},
{
"epoch": 0.8564437194127243,
"grad_norm": 5.277249813079834,
"learning_rate": 1.5546439019333632e-06,
"loss": 1.3336,
"mean_token_accuracy": 0.681064784526825,
"step": 525
},
{
"epoch": 0.8580750407830342,
"grad_norm": 4.982065677642822,
"learning_rate": 1.5423681195707997e-06,
"loss": 1.4144,
"mean_token_accuracy": 0.6686686873435974,
"step": 526
},
{
"epoch": 0.8597063621533442,
"grad_norm": 4.6587605476379395,
"learning_rate": 1.5302209934221796e-06,
"loss": 1.1911,
"mean_token_accuracy": 0.7020725607872009,
"step": 527
},
{
"epoch": 0.8613376835236541,
"grad_norm": 5.415839195251465,
"learning_rate": 1.5182029183709345e-06,
"loss": 1.3637,
"mean_token_accuracy": 0.6866196990013123,
"step": 528
},
{
"epoch": 0.8629690048939641,
"grad_norm": 4.830744743347168,
"learning_rate": 1.5063142851052535e-06,
"loss": 1.0927,
"mean_token_accuracy": 0.7163712382316589,
"step": 529
},
{
"epoch": 0.8646003262642741,
"grad_norm": 4.314631938934326,
"learning_rate": 1.4945554801053852e-06,
"loss": 1.0773,
"mean_token_accuracy": 0.7398513555526733,
"step": 530
},
{
"epoch": 0.866231647634584,
"grad_norm": 4.3542680740356445,
"learning_rate": 1.4829268856310677e-06,
"loss": 1.1271,
"mean_token_accuracy": 0.7248595952987671,
"step": 531
},
{
"epoch": 0.867862969004894,
"grad_norm": 4.48630952835083,
"learning_rate": 1.471428879709107e-06,
"loss": 1.0675,
"mean_token_accuracy": 0.7440000176429749,
"step": 532
},
{
"epoch": 0.8694942903752039,
"grad_norm": 4.849664211273193,
"learning_rate": 1.4600618361210857e-06,
"loss": 1.2855,
"mean_token_accuracy": 0.713458776473999,
"step": 533
},
{
"epoch": 0.8711256117455138,
"grad_norm": 4.989716529846191,
"learning_rate": 1.448826124391215e-06,
"loss": 1.2499,
"mean_token_accuracy": 0.7188649773597717,
"step": 534
},
{
"epoch": 0.8727569331158238,
"grad_norm": 4.539302825927734,
"learning_rate": 1.437722109774317e-06,
"loss": 1.1633,
"mean_token_accuracy": 0.7338669300079346,
"step": 535
},
{
"epoch": 0.8743882544861338,
"grad_norm": 4.66331148147583,
"learning_rate": 1.4267501532439526e-06,
"loss": 1.2576,
"mean_token_accuracy": 0.6965973377227783,
"step": 536
},
{
"epoch": 0.8760195758564437,
"grad_norm": 4.61297607421875,
"learning_rate": 1.4159106114806943e-06,
"loss": 1.3736,
"mean_token_accuracy": 0.6653734445571899,
"step": 537
},
{
"epoch": 0.8776508972267537,
"grad_norm": 4.935201644897461,
"learning_rate": 1.4052038368605156e-06,
"loss": 1.3792,
"mean_token_accuracy": 0.6775679588317871,
"step": 538
},
{
"epoch": 0.8792822185970636,
"grad_norm": 4.569594383239746,
"learning_rate": 1.3946301774433502e-06,
"loss": 1.105,
"mean_token_accuracy": 0.7271789312362671,
"step": 539
},
{
"epoch": 0.8809135399673735,
"grad_norm": 4.568352699279785,
"learning_rate": 1.3841899769617723e-06,
"loss": 1.1148,
"mean_token_accuracy": 0.7321231961250305,
"step": 540
},
{
"epoch": 0.8825448613376835,
"grad_norm": 5.049271583557129,
"learning_rate": 1.3738835748098198e-06,
"loss": 1.0984,
"mean_token_accuracy": 0.7366254925727844,
"step": 541
},
{
"epoch": 0.8841761827079935,
"grad_norm": 5.136232376098633,
"learning_rate": 1.3637113060319629e-06,
"loss": 1.2849,
"mean_token_accuracy": 0.6897223591804504,
"step": 542
},
{
"epoch": 0.8858075040783034,
"grad_norm": 4.453695774078369,
"learning_rate": 1.3536735013122144e-06,
"loss": 1.0962,
"mean_token_accuracy": 0.7319535613059998,
"step": 543
},
{
"epoch": 0.8874388254486134,
"grad_norm": 4.621738910675049,
"learning_rate": 1.3437704869633772e-06,
"loss": 1.0924,
"mean_token_accuracy": 0.7451643943786621,
"step": 544
},
{
"epoch": 0.8890701468189234,
"grad_norm": 4.363915920257568,
"learning_rate": 1.334002584916437e-06,
"loss": 1.2547,
"mean_token_accuracy": 0.6975655555725098,
"step": 545
},
{
"epoch": 0.8907014681892332,
"grad_norm": 4.77221155166626,
"learning_rate": 1.3243701127100971e-06,
"loss": 1.1272,
"mean_token_accuracy": 0.732022762298584,
"step": 546
},
{
"epoch": 0.8923327895595432,
"grad_norm": 4.910726070404053,
"learning_rate": 1.314873383480455e-06,
"loss": 1.1381,
"mean_token_accuracy": 0.7128287553787231,
"step": 547
},
{
"epoch": 0.8939641109298532,
"grad_norm": 4.650912284851074,
"learning_rate": 1.3055127059508257e-06,
"loss": 1.0727,
"mean_token_accuracy": 0.7480478882789612,
"step": 548
},
{
"epoch": 0.8955954323001631,
"grad_norm": 3.9856724739074707,
"learning_rate": 1.2962883844217e-06,
"loss": 0.8759,
"mean_token_accuracy": 0.7789642214775085,
"step": 549
},
{
"epoch": 0.8972267536704731,
"grad_norm": 4.78012752532959,
"learning_rate": 1.287200718760859e-06,
"loss": 1.2732,
"mean_token_accuracy": 0.6914893388748169,
"step": 550
},
{
"epoch": 0.8988580750407831,
"grad_norm": 4.302763938903809,
"learning_rate": 1.27825000439362e-06,
"loss": 1.0871,
"mean_token_accuracy": 0.7311992049217224,
"step": 551
},
{
"epoch": 0.9004893964110929,
"grad_norm": 4.6384100914001465,
"learning_rate": 1.2694365322932365e-06,
"loss": 1.3448,
"mean_token_accuracy": 0.6719226837158203,
"step": 552
},
{
"epoch": 0.9021207177814029,
"grad_norm": 4.745211124420166,
"learning_rate": 1.2607605889714359e-06,
"loss": 1.19,
"mean_token_accuracy": 0.7090080976486206,
"step": 553
},
{
"epoch": 0.9037520391517129,
"grad_norm": 4.419302940368652,
"learning_rate": 1.252222456469111e-06,
"loss": 0.9335,
"mean_token_accuracy": 0.774678111076355,
"step": 554
},
{
"epoch": 0.9053833605220228,
"grad_norm": 5.066204071044922,
"learning_rate": 1.2438224123471442e-06,
"loss": 1.3473,
"mean_token_accuracy": 0.6653266549110413,
"step": 555
},
{
"epoch": 0.9070146818923328,
"grad_norm": 4.375471115112305,
"learning_rate": 1.2355607296773896e-06,
"loss": 1.2947,
"mean_token_accuracy": 0.6962790489196777,
"step": 556
},
{
"epoch": 0.9086460032626428,
"grad_norm": 5.035999774932861,
"learning_rate": 1.2274376770337925e-06,
"loss": 1.1271,
"mean_token_accuracy": 0.7255297899246216,
"step": 557
},
{
"epoch": 0.9102773246329527,
"grad_norm": 4.534280776977539,
"learning_rate": 1.2194535184836633e-06,
"loss": 1.1659,
"mean_token_accuracy": 0.7146624326705933,
"step": 558
},
{
"epoch": 0.9119086460032626,
"grad_norm": 4.192361354827881,
"learning_rate": 1.2116085135790872e-06,
"loss": 0.9654,
"mean_token_accuracy": 0.7518177628517151,
"step": 559
},
{
"epoch": 0.9135399673735726,
"grad_norm": 5.638926982879639,
"learning_rate": 1.2039029173484892e-06,
"loss": 1.6001,
"mean_token_accuracy": 0.6247368454933167,
"step": 560
},
{
"epoch": 0.9151712887438825,
"grad_norm": 4.600732326507568,
"learning_rate": 1.1963369802883478e-06,
"loss": 1.2123,
"mean_token_accuracy": 0.7063252925872803,
"step": 561
},
{
"epoch": 0.9168026101141925,
"grad_norm": 4.525058746337891,
"learning_rate": 1.1889109483550411e-06,
"loss": 1.0932,
"mean_token_accuracy": 0.7251037359237671,
"step": 562
},
{
"epoch": 0.9184339314845025,
"grad_norm": 4.5724005699157715,
"learning_rate": 1.1816250629568632e-06,
"loss": 1.0861,
"mean_token_accuracy": 0.7240241765975952,
"step": 563
},
{
"epoch": 0.9200652528548124,
"grad_norm": 5.578955173492432,
"learning_rate": 1.1744795609461683e-06,
"loss": 1.2629,
"mean_token_accuracy": 0.6909705400466919,
"step": 564
},
{
"epoch": 0.9216965742251223,
"grad_norm": 5.318408489227295,
"learning_rate": 1.167474674611675e-06,
"loss": 1.0538,
"mean_token_accuracy": 0.7338252067565918,
"step": 565
},
{
"epoch": 0.9233278955954323,
"grad_norm": 4.251341342926025,
"learning_rate": 1.1606106316709122e-06,
"loss": 1.0875,
"mean_token_accuracy": 0.7354260087013245,
"step": 566
},
{
"epoch": 0.9249592169657422,
"grad_norm": 5.110576629638672,
"learning_rate": 1.1538876552628183e-06,
"loss": 1.1861,
"mean_token_accuracy": 0.7216981053352356,
"step": 567
},
{
"epoch": 0.9265905383360522,
"grad_norm": 4.9769721031188965,
"learning_rate": 1.147305963940488e-06,
"loss": 1.0369,
"mean_token_accuracy": 0.744041919708252,
"step": 568
},
{
"epoch": 0.9282218597063622,
"grad_norm": 5.02736759185791,
"learning_rate": 1.1408657716640643e-06,
"loss": 1.5051,
"mean_token_accuracy": 0.6656504273414612,
"step": 569
},
{
"epoch": 0.9298531810766721,
"grad_norm": 4.389795303344727,
"learning_rate": 1.134567287793787e-06,
"loss": 1.1081,
"mean_token_accuracy": 0.7329843044281006,
"step": 570
},
{
"epoch": 0.9314845024469821,
"grad_norm": 4.3082427978515625,
"learning_rate": 1.128410717083182e-06,
"loss": 1.0839,
"mean_token_accuracy": 0.729187548160553,
"step": 571
},
{
"epoch": 0.933115823817292,
"grad_norm": 5.201175212860107,
"learning_rate": 1.1223962596724115e-06,
"loss": 1.2717,
"mean_token_accuracy": 0.6742309927940369,
"step": 572
},
{
"epoch": 0.9347471451876019,
"grad_norm": 4.306964874267578,
"learning_rate": 1.1165241110817602e-06,
"loss": 1.1214,
"mean_token_accuracy": 0.721930742263794,
"step": 573
},
{
"epoch": 0.9363784665579119,
"grad_norm": 4.683149814605713,
"learning_rate": 1.1107944622052857e-06,
"loss": 1.2618,
"mean_token_accuracy": 0.6971399188041687,
"step": 574
},
{
"epoch": 0.9380097879282219,
"grad_norm": 5.620746612548828,
"learning_rate": 1.1052074993046102e-06,
"loss": 1.2447,
"mean_token_accuracy": 0.6849538087844849,
"step": 575
},
{
"epoch": 0.9396411092985318,
"grad_norm": 4.673566818237305,
"learning_rate": 1.0997634040028643e-06,
"loss": 1.1948,
"mean_token_accuracy": 0.7181038856506348,
"step": 576
},
{
"epoch": 0.9412724306688418,
"grad_norm": 4.916784763336182,
"learning_rate": 1.0944623532787844e-06,
"loss": 1.0561,
"mean_token_accuracy": 0.7292323708534241,
"step": 577
},
{
"epoch": 0.9429037520391517,
"grad_norm": 4.703412055969238,
"learning_rate": 1.0893045194609596e-06,
"loss": 1.1676,
"mean_token_accuracy": 0.7098938226699829,
"step": 578
},
{
"epoch": 0.9445350734094616,
"grad_norm": 4.568572521209717,
"learning_rate": 1.0842900702222283e-06,
"loss": 1.3776,
"mean_token_accuracy": 0.6739327907562256,
"step": 579
},
{
"epoch": 0.9461663947797716,
"grad_norm": 4.693262577056885,
"learning_rate": 1.0794191685742276e-06,
"loss": 1.3097,
"mean_token_accuracy": 0.6928605437278748,
"step": 580
},
{
"epoch": 0.9477977161500816,
"grad_norm": 4.470661640167236,
"learning_rate": 1.074691972862095e-06,
"loss": 1.0411,
"mean_token_accuracy": 0.7344121932983398,
"step": 581
},
{
"epoch": 0.9494290375203915,
"grad_norm": 4.835824966430664,
"learning_rate": 1.070108636759322e-06,
"loss": 1.037,
"mean_token_accuracy": 0.7305210828781128,
"step": 582
},
{
"epoch": 0.9510603588907015,
"grad_norm": 5.557530403137207,
"learning_rate": 1.0656693092627534e-06,
"loss": 1.4262,
"mean_token_accuracy": 0.6733524203300476,
"step": 583
},
{
"epoch": 0.9526916802610114,
"grad_norm": 5.196942329406738,
"learning_rate": 1.0613741346877498e-06,
"loss": 1.0261,
"mean_token_accuracy": 0.7382140755653381,
"step": 584
},
{
"epoch": 0.9543230016313213,
"grad_norm": 4.611567497253418,
"learning_rate": 1.0572232526634918e-06,
"loss": 1.1281,
"mean_token_accuracy": 0.7303561568260193,
"step": 585
},
{
"epoch": 0.9559543230016313,
"grad_norm": 4.973482608795166,
"learning_rate": 1.0532167981284437e-06,
"loss": 1.1927,
"mean_token_accuracy": 0.7078921794891357,
"step": 586
},
{
"epoch": 0.9575856443719413,
"grad_norm": 4.535336017608643,
"learning_rate": 1.0493549013259644e-06,
"loss": 1.1746,
"mean_token_accuracy": 0.736328125,
"step": 587
},
{
"epoch": 0.9592169657422512,
"grad_norm": 4.868354320526123,
"learning_rate": 1.0456376878000754e-06,
"loss": 1.1741,
"mean_token_accuracy": 0.7153007984161377,
"step": 588
},
{
"epoch": 0.9608482871125612,
"grad_norm": 4.772627353668213,
"learning_rate": 1.0420652783913794e-06,
"loss": 1.2043,
"mean_token_accuracy": 0.7127602696418762,
"step": 589
},
{
"epoch": 0.9624796084828712,
"grad_norm": 4.772705554962158,
"learning_rate": 1.03863778923313e-06,
"loss": 1.2719,
"mean_token_accuracy": 0.6881773471832275,
"step": 590
},
{
"epoch": 0.964110929853181,
"grad_norm": 4.778547286987305,
"learning_rate": 1.0353553317474574e-06,
"loss": 1.0815,
"mean_token_accuracy": 0.7438063025474548,
"step": 591
},
{
"epoch": 0.965742251223491,
"grad_norm": 4.736347198486328,
"learning_rate": 1.0322180126417494e-06,
"loss": 1.1622,
"mean_token_accuracy": 0.7216761708259583,
"step": 592
},
{
"epoch": 0.967373572593801,
"grad_norm": 4.148738384246826,
"learning_rate": 1.0292259339051769e-06,
"loss": 1.1596,
"mean_token_accuracy": 0.7182163000106812,
"step": 593
},
{
"epoch": 0.9690048939641109,
"grad_norm": 4.727193832397461,
"learning_rate": 1.026379192805382e-06,
"loss": 1.4,
"mean_token_accuracy": 0.6754344701766968,
"step": 594
},
{
"epoch": 0.9706362153344209,
"grad_norm": 4.908797264099121,
"learning_rate": 1.0236778818853158e-06,
"loss": 1.3418,
"mean_token_accuracy": 0.6792452931404114,
"step": 595
},
{
"epoch": 0.9722675367047309,
"grad_norm": 5.056847095489502,
"learning_rate": 1.0211220889602289e-06,
"loss": 1.1988,
"mean_token_accuracy": 0.715332567691803,
"step": 596
},
{
"epoch": 0.9738988580750407,
"grad_norm": 4.906404495239258,
"learning_rate": 1.018711897114817e-06,
"loss": 1.3387,
"mean_token_accuracy": 0.6841831207275391,
"step": 597
},
{
"epoch": 0.9755301794453507,
"grad_norm": 4.806992530822754,
"learning_rate": 1.0164473847005205e-06,
"loss": 1.2102,
"mean_token_accuracy": 0.7100494503974915,
"step": 598
},
{
"epoch": 0.9771615008156607,
"grad_norm": 4.936591148376465,
"learning_rate": 1.0143286253329769e-06,
"loss": 1.1404,
"mean_token_accuracy": 0.7201149463653564,
"step": 599
},
{
"epoch": 0.9787928221859706,
"grad_norm": 4.549412727355957,
"learning_rate": 1.0123556878896274e-06,
"loss": 1.2092,
"mean_token_accuracy": 0.7039577960968018,
"step": 600
},
{
"epoch": 0.9804241435562806,
"grad_norm": 4.218964576721191,
"learning_rate": 1.0105286365074788e-06,
"loss": 0.9088,
"mean_token_accuracy": 0.775624692440033,
"step": 601
},
{
"epoch": 0.9820554649265906,
"grad_norm": 5.200554847717285,
"learning_rate": 1.0088475305810178e-06,
"loss": 1.1501,
"mean_token_accuracy": 0.7204116582870483,
"step": 602
},
{
"epoch": 0.9836867862969005,
"grad_norm": 4.525951385498047,
"learning_rate": 1.0073124247602805e-06,
"loss": 1.1539,
"mean_token_accuracy": 0.7239478826522827,
"step": 603
},
{
"epoch": 0.9853181076672104,
"grad_norm": 4.843019008636475,
"learning_rate": 1.0059233689490742e-06,
"loss": 1.3085,
"mean_token_accuracy": 0.6880208253860474,
"step": 604
},
{
"epoch": 0.9869494290375204,
"grad_norm": 4.720979690551758,
"learning_rate": 1.0046804083033585e-06,
"loss": 0.753,
"mean_token_accuracy": 0.8125383853912354,
"step": 605
},
{
"epoch": 0.9885807504078303,
"grad_norm": 4.803732395172119,
"learning_rate": 1.0035835832297736e-06,
"loss": 1.1941,
"mean_token_accuracy": 0.6987314820289612,
"step": 606
},
{
"epoch": 0.9902120717781403,
"grad_norm": 4.542924880981445,
"learning_rate": 1.00263292938433e-06,
"loss": 1.0593,
"mean_token_accuracy": 0.730215847492218,
"step": 607
},
{
"epoch": 0.9918433931484503,
"grad_norm": 4.79932975769043,
"learning_rate": 1.0018284776712475e-06,
"loss": 1.3496,
"mean_token_accuracy": 0.6967418789863586,
"step": 608
},
{
"epoch": 0.9934747145187602,
"grad_norm": 4.847198963165283,
"learning_rate": 1.0011702542419498e-06,
"loss": 1.1661,
"mean_token_accuracy": 0.7184059023857117,
"step": 609
},
{
"epoch": 0.9951060358890701,
"grad_norm": 5.3224406242370605,
"learning_rate": 1.0006582804942171e-06,
"loss": 1.2955,
"mean_token_accuracy": 0.6828246712684631,
"step": 610
},
{
"epoch": 0.9967373572593801,
"grad_norm": 4.992889404296875,
"learning_rate": 1.000292573071488e-06,
"loss": 1.2474,
"mean_token_accuracy": 0.7027914524078369,
"step": 611
},
{
"epoch": 0.99836867862969,
"grad_norm": 5.054308891296387,
"learning_rate": 1.000073143862319e-06,
"loss": 1.1465,
"mean_token_accuracy": 0.7231833934783936,
"step": 612
},
{
"epoch": 1.0,
"grad_norm": 5.045431613922119,
"learning_rate": 1.0000000000000002e-06,
"loss": 1.0511,
"mean_token_accuracy": 0.7292899489402771,
"step": 613
},
{
"epoch": 1.0,
"step": 613,
"total_flos": 1.770321796592042e+18,
"train_loss": 1.3923614178746209,
"train_runtime": 2541.2937,
"train_samples_per_second": 7.707,
"train_steps_per_second": 0.241
}
],
"logging_steps": 1,
"max_steps": 613,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.770321796592042e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}