4947 lines
133 KiB
JSON
4947 lines
133 KiB
JSON
{
|
|
"best_metric": null,
|
|
"best_model_checkpoint": null,
|
|
"epoch": 1.0,
|
|
"eval_steps": 500,
|
|
"global_step": 613,
|
|
"is_hyper_param_search": false,
|
|
"is_local_process_zero": true,
|
|
"is_world_process_zero": true,
|
|
"log_history": [
|
|
{
|
|
"epoch": 0.0016313213703099511,
|
|
"grad_norm": 54.38072967529297,
|
|
"learning_rate": 1.6129032258064518e-07,
|
|
"loss": 3.9722,
|
|
"mean_token_accuracy": 0.314461886882782,
|
|
"step": 1
|
|
},
|
|
{
|
|
"epoch": 0.0032626427406199023,
|
|
"grad_norm": 59.083343505859375,
|
|
"learning_rate": 3.2258064516129035e-07,
|
|
"loss": 3.7752,
|
|
"mean_token_accuracy": 0.3500784933567047,
|
|
"step": 2
|
|
},
|
|
{
|
|
"epoch": 0.004893964110929853,
|
|
"grad_norm": 52.31679153442383,
|
|
"learning_rate": 4.838709677419355e-07,
|
|
"loss": 3.9767,
|
|
"mean_token_accuracy": 0.32198143005371094,
|
|
"step": 3
|
|
},
|
|
{
|
|
"epoch": 0.0065252854812398045,
|
|
"grad_norm": 56.8325080871582,
|
|
"learning_rate": 6.451612903225807e-07,
|
|
"loss": 3.8677,
|
|
"mean_token_accuracy": 0.34073251485824585,
|
|
"step": 4
|
|
},
|
|
{
|
|
"epoch": 0.008156606851549755,
|
|
"grad_norm": 46.90914535522461,
|
|
"learning_rate": 8.064516129032258e-07,
|
|
"loss": 3.7833,
|
|
"mean_token_accuracy": 0.3529976010322571,
|
|
"step": 5
|
|
},
|
|
{
|
|
"epoch": 0.009787928221859706,
|
|
"grad_norm": 50.84980010986328,
|
|
"learning_rate": 9.67741935483871e-07,
|
|
"loss": 3.6046,
|
|
"mean_token_accuracy": 0.36332181096076965,
|
|
"step": 6
|
|
},
|
|
{
|
|
"epoch": 0.011419249592169658,
|
|
"grad_norm": 44.124671936035156,
|
|
"learning_rate": 1.1290322580645162e-06,
|
|
"loss": 3.5605,
|
|
"mean_token_accuracy": 0.3784194588661194,
|
|
"step": 7
|
|
},
|
|
{
|
|
"epoch": 0.013050570962479609,
|
|
"grad_norm": 38.687442779541016,
|
|
"learning_rate": 1.2903225806451614e-06,
|
|
"loss": 3.6567,
|
|
"mean_token_accuracy": 0.36630603671073914,
|
|
"step": 8
|
|
},
|
|
{
|
|
"epoch": 0.01468189233278956,
|
|
"grad_norm": 32.46002960205078,
|
|
"learning_rate": 1.4516129032258066e-06,
|
|
"loss": 3.7645,
|
|
"mean_token_accuracy": 0.3374135196208954,
|
|
"step": 9
|
|
},
|
|
{
|
|
"epoch": 0.01631321370309951,
|
|
"grad_norm": 29.601980209350586,
|
|
"learning_rate": 1.6129032258064516e-06,
|
|
"loss": 3.735,
|
|
"mean_token_accuracy": 0.340471088886261,
|
|
"step": 10
|
|
},
|
|
{
|
|
"epoch": 0.01794453507340946,
|
|
"grad_norm": 25.11663818359375,
|
|
"learning_rate": 1.774193548387097e-06,
|
|
"loss": 3.3774,
|
|
"mean_token_accuracy": 0.3931350111961365,
|
|
"step": 11
|
|
},
|
|
{
|
|
"epoch": 0.01957585644371941,
|
|
"grad_norm": 18.90343475341797,
|
|
"learning_rate": 1.935483870967742e-06,
|
|
"loss": 3.2297,
|
|
"mean_token_accuracy": 0.41412118077278137,
|
|
"step": 12
|
|
},
|
|
{
|
|
"epoch": 0.021207177814029365,
|
|
"grad_norm": 21.3724422454834,
|
|
"learning_rate": 2.096774193548387e-06,
|
|
"loss": 3.1907,
|
|
"mean_token_accuracy": 0.43043479323387146,
|
|
"step": 13
|
|
},
|
|
{
|
|
"epoch": 0.022838499184339316,
|
|
"grad_norm": 18.062108993530273,
|
|
"learning_rate": 2.2580645161290324e-06,
|
|
"loss": 3.0692,
|
|
"mean_token_accuracy": 0.43661972880363464,
|
|
"step": 14
|
|
},
|
|
{
|
|
"epoch": 0.024469820554649267,
|
|
"grad_norm": 18.955305099487305,
|
|
"learning_rate": 2.4193548387096776e-06,
|
|
"loss": 3.0939,
|
|
"mean_token_accuracy": 0.4284232258796692,
|
|
"step": 15
|
|
},
|
|
{
|
|
"epoch": 0.026101141924959218,
|
|
"grad_norm": 19.71297264099121,
|
|
"learning_rate": 2.580645161290323e-06,
|
|
"loss": 2.9745,
|
|
"mean_token_accuracy": 0.45571428537368774,
|
|
"step": 16
|
|
},
|
|
{
|
|
"epoch": 0.02773246329526917,
|
|
"grad_norm": 15.891701698303223,
|
|
"learning_rate": 2.7419354838709676e-06,
|
|
"loss": 2.843,
|
|
"mean_token_accuracy": 0.4642857015132904,
|
|
"step": 17
|
|
},
|
|
{
|
|
"epoch": 0.02936378466557912,
|
|
"grad_norm": 14.574506759643555,
|
|
"learning_rate": 2.903225806451613e-06,
|
|
"loss": 2.6097,
|
|
"mean_token_accuracy": 0.4918205738067627,
|
|
"step": 18
|
|
},
|
|
{
|
|
"epoch": 0.03099510603588907,
|
|
"grad_norm": 13.931673049926758,
|
|
"learning_rate": 3.0645161290322584e-06,
|
|
"loss": 2.5162,
|
|
"mean_token_accuracy": 0.5131129026412964,
|
|
"step": 19
|
|
},
|
|
{
|
|
"epoch": 0.03262642740619902,
|
|
"grad_norm": 13.101471900939941,
|
|
"learning_rate": 3.225806451612903e-06,
|
|
"loss": 2.7058,
|
|
"mean_token_accuracy": 0.47575756907463074,
|
|
"step": 20
|
|
},
|
|
{
|
|
"epoch": 0.03425774877650897,
|
|
"grad_norm": 12.979852676391602,
|
|
"learning_rate": 3.3870967741935484e-06,
|
|
"loss": 2.5696,
|
|
"mean_token_accuracy": 0.4878854751586914,
|
|
"step": 21
|
|
},
|
|
{
|
|
"epoch": 0.03588907014681892,
|
|
"grad_norm": 14.335384368896484,
|
|
"learning_rate": 3.548387096774194e-06,
|
|
"loss": 2.5031,
|
|
"mean_token_accuracy": 0.48602256178855896,
|
|
"step": 22
|
|
},
|
|
{
|
|
"epoch": 0.037520391517128875,
|
|
"grad_norm": 14.542072296142578,
|
|
"learning_rate": 3.7096774193548392e-06,
|
|
"loss": 2.4432,
|
|
"mean_token_accuracy": 0.5048364996910095,
|
|
"step": 23
|
|
},
|
|
{
|
|
"epoch": 0.03915171288743882,
|
|
"grad_norm": 12.069889068603516,
|
|
"learning_rate": 3.870967741935484e-06,
|
|
"loss": 2.2043,
|
|
"mean_token_accuracy": 0.5519013404846191,
|
|
"step": 24
|
|
},
|
|
{
|
|
"epoch": 0.040783034257748776,
|
|
"grad_norm": 9.698949813842773,
|
|
"learning_rate": 4.032258064516129e-06,
|
|
"loss": 2.0031,
|
|
"mean_token_accuracy": 0.5868473649024963,
|
|
"step": 25
|
|
},
|
|
{
|
|
"epoch": 0.04241435562805873,
|
|
"grad_norm": 10.89166259765625,
|
|
"learning_rate": 4.193548387096774e-06,
|
|
"loss": 2.3342,
|
|
"mean_token_accuracy": 0.5324609875679016,
|
|
"step": 26
|
|
},
|
|
{
|
|
"epoch": 0.04404567699836868,
|
|
"grad_norm": 9.197402000427246,
|
|
"learning_rate": 4.35483870967742e-06,
|
|
"loss": 2.2205,
|
|
"mean_token_accuracy": 0.5475698113441467,
|
|
"step": 27
|
|
},
|
|
{
|
|
"epoch": 0.04567699836867863,
|
|
"grad_norm": 9.47153377532959,
|
|
"learning_rate": 4.516129032258065e-06,
|
|
"loss": 2.0431,
|
|
"mean_token_accuracy": 0.5686706900596619,
|
|
"step": 28
|
|
},
|
|
{
|
|
"epoch": 0.04730831973898858,
|
|
"grad_norm": 8.886749267578125,
|
|
"learning_rate": 4.67741935483871e-06,
|
|
"loss": 2.1793,
|
|
"mean_token_accuracy": 0.5322735905647278,
|
|
"step": 29
|
|
},
|
|
{
|
|
"epoch": 0.048939641109298535,
|
|
"grad_norm": 10.089822769165039,
|
|
"learning_rate": 4.838709677419355e-06,
|
|
"loss": 1.987,
|
|
"mean_token_accuracy": 0.5579903721809387,
|
|
"step": 30
|
|
},
|
|
{
|
|
"epoch": 0.05057096247960848,
|
|
"grad_norm": 11.309324264526367,
|
|
"learning_rate": 5e-06,
|
|
"loss": 2.0749,
|
|
"mean_token_accuracy": 0.5649139285087585,
|
|
"step": 31
|
|
},
|
|
{
|
|
"epoch": 0.052202283849918436,
|
|
"grad_norm": 9.036641120910645,
|
|
"learning_rate": 5.161290322580646e-06,
|
|
"loss": 2.1629,
|
|
"mean_token_accuracy": 0.5413948893547058,
|
|
"step": 32
|
|
},
|
|
{
|
|
"epoch": 0.053833605220228384,
|
|
"grad_norm": 8.936366081237793,
|
|
"learning_rate": 5.322580645161291e-06,
|
|
"loss": 1.9053,
|
|
"mean_token_accuracy": 0.5875675678253174,
|
|
"step": 33
|
|
},
|
|
{
|
|
"epoch": 0.05546492659053834,
|
|
"grad_norm": 8.523772239685059,
|
|
"learning_rate": 5.483870967741935e-06,
|
|
"loss": 1.962,
|
|
"mean_token_accuracy": 0.5871559381484985,
|
|
"step": 34
|
|
},
|
|
{
|
|
"epoch": 0.057096247960848286,
|
|
"grad_norm": 8.703071594238281,
|
|
"learning_rate": 5.645161290322582e-06,
|
|
"loss": 2.0717,
|
|
"mean_token_accuracy": 0.5543113350868225,
|
|
"step": 35
|
|
},
|
|
{
|
|
"epoch": 0.05872756933115824,
|
|
"grad_norm": 8.243901252746582,
|
|
"learning_rate": 5.806451612903226e-06,
|
|
"loss": 1.9278,
|
|
"mean_token_accuracy": 0.5818815231323242,
|
|
"step": 36
|
|
},
|
|
{
|
|
"epoch": 0.06035889070146819,
|
|
"grad_norm": 8.658400535583496,
|
|
"learning_rate": 5.967741935483872e-06,
|
|
"loss": 1.9476,
|
|
"mean_token_accuracy": 0.5738636255264282,
|
|
"step": 37
|
|
},
|
|
{
|
|
"epoch": 0.06199021207177814,
|
|
"grad_norm": 8.671000480651855,
|
|
"learning_rate": 6.129032258064517e-06,
|
|
"loss": 2.0554,
|
|
"mean_token_accuracy": 0.5679658055305481,
|
|
"step": 38
|
|
},
|
|
{
|
|
"epoch": 0.0636215334420881,
|
|
"grad_norm": 9.466026306152344,
|
|
"learning_rate": 6.290322580645162e-06,
|
|
"loss": 1.9489,
|
|
"mean_token_accuracy": 0.5673534274101257,
|
|
"step": 39
|
|
},
|
|
{
|
|
"epoch": 0.06525285481239804,
|
|
"grad_norm": 8.415104866027832,
|
|
"learning_rate": 6.451612903225806e-06,
|
|
"loss": 2.1262,
|
|
"mean_token_accuracy": 0.5633999109268188,
|
|
"step": 40
|
|
},
|
|
{
|
|
"epoch": 0.06688417618270799,
|
|
"grad_norm": 7.783365726470947,
|
|
"learning_rate": 6.612903225806452e-06,
|
|
"loss": 1.7869,
|
|
"mean_token_accuracy": 0.6028110384941101,
|
|
"step": 41
|
|
},
|
|
{
|
|
"epoch": 0.06851549755301795,
|
|
"grad_norm": 8.495488166809082,
|
|
"learning_rate": 6.774193548387097e-06,
|
|
"loss": 1.7062,
|
|
"mean_token_accuracy": 0.6242873668670654,
|
|
"step": 42
|
|
},
|
|
{
|
|
"epoch": 0.0701468189233279,
|
|
"grad_norm": 8.216286659240723,
|
|
"learning_rate": 6.935483870967743e-06,
|
|
"loss": 1.8002,
|
|
"mean_token_accuracy": 0.6164383292198181,
|
|
"step": 43
|
|
},
|
|
{
|
|
"epoch": 0.07177814029363784,
|
|
"grad_norm": 7.681854724884033,
|
|
"learning_rate": 7.096774193548388e-06,
|
|
"loss": 1.8663,
|
|
"mean_token_accuracy": 0.5869767665863037,
|
|
"step": 44
|
|
},
|
|
{
|
|
"epoch": 0.0734094616639478,
|
|
"grad_norm": 7.960548400878906,
|
|
"learning_rate": 7.258064516129033e-06,
|
|
"loss": 1.5801,
|
|
"mean_token_accuracy": 0.6295210123062134,
|
|
"step": 45
|
|
},
|
|
{
|
|
"epoch": 0.07504078303425775,
|
|
"grad_norm": 8.843791007995605,
|
|
"learning_rate": 7.4193548387096784e-06,
|
|
"loss": 1.9695,
|
|
"mean_token_accuracy": 0.5767748951911926,
|
|
"step": 46
|
|
},
|
|
{
|
|
"epoch": 0.0766721044045677,
|
|
"grad_norm": 7.562375068664551,
|
|
"learning_rate": 7.580645161290323e-06,
|
|
"loss": 1.8982,
|
|
"mean_token_accuracy": 0.5856515169143677,
|
|
"step": 47
|
|
},
|
|
{
|
|
"epoch": 0.07830342577487764,
|
|
"grad_norm": 7.976773738861084,
|
|
"learning_rate": 7.741935483870968e-06,
|
|
"loss": 1.8455,
|
|
"mean_token_accuracy": 0.5857519507408142,
|
|
"step": 48
|
|
},
|
|
{
|
|
"epoch": 0.0799347471451876,
|
|
"grad_norm": 7.795076847076416,
|
|
"learning_rate": 7.903225806451613e-06,
|
|
"loss": 1.738,
|
|
"mean_token_accuracy": 0.602642297744751,
|
|
"step": 49
|
|
},
|
|
{
|
|
"epoch": 0.08156606851549755,
|
|
"grad_norm": 9.113154411315918,
|
|
"learning_rate": 8.064516129032258e-06,
|
|
"loss": 1.7887,
|
|
"mean_token_accuracy": 0.6150583028793335,
|
|
"step": 50
|
|
},
|
|
{
|
|
"epoch": 0.08319738988580751,
|
|
"grad_norm": 9.503119468688965,
|
|
"learning_rate": 8.225806451612904e-06,
|
|
"loss": 1.6738,
|
|
"mean_token_accuracy": 0.6308540105819702,
|
|
"step": 51
|
|
},
|
|
{
|
|
"epoch": 0.08482871125611746,
|
|
"grad_norm": 7.7233757972717285,
|
|
"learning_rate": 8.387096774193549e-06,
|
|
"loss": 1.8524,
|
|
"mean_token_accuracy": 0.6068170666694641,
|
|
"step": 52
|
|
},
|
|
{
|
|
"epoch": 0.0864600326264274,
|
|
"grad_norm": 8.368830680847168,
|
|
"learning_rate": 8.548387096774194e-06,
|
|
"loss": 1.6863,
|
|
"mean_token_accuracy": 0.641238272190094,
|
|
"step": 53
|
|
},
|
|
{
|
|
"epoch": 0.08809135399673736,
|
|
"grad_norm": 8.289685249328613,
|
|
"learning_rate": 8.70967741935484e-06,
|
|
"loss": 1.7527,
|
|
"mean_token_accuracy": 0.6219838857650757,
|
|
"step": 54
|
|
},
|
|
{
|
|
"epoch": 0.08972267536704731,
|
|
"grad_norm": 8.580499649047852,
|
|
"learning_rate": 8.870967741935484e-06,
|
|
"loss": 1.7605,
|
|
"mean_token_accuracy": 0.622188150882721,
|
|
"step": 55
|
|
},
|
|
{
|
|
"epoch": 0.09135399673735727,
|
|
"grad_norm": 8.407153129577637,
|
|
"learning_rate": 9.03225806451613e-06,
|
|
"loss": 1.9015,
|
|
"mean_token_accuracy": 0.6121242046356201,
|
|
"step": 56
|
|
},
|
|
{
|
|
"epoch": 0.0929853181076672,
|
|
"grad_norm": 7.347232818603516,
|
|
"learning_rate": 9.193548387096775e-06,
|
|
"loss": 1.6066,
|
|
"mean_token_accuracy": 0.6575052738189697,
|
|
"step": 57
|
|
},
|
|
{
|
|
"epoch": 0.09461663947797716,
|
|
"grad_norm": 7.600398063659668,
|
|
"learning_rate": 9.35483870967742e-06,
|
|
"loss": 1.6309,
|
|
"mean_token_accuracy": 0.6496000289916992,
|
|
"step": 58
|
|
},
|
|
{
|
|
"epoch": 0.09624796084828711,
|
|
"grad_norm": 9.03729248046875,
|
|
"learning_rate": 9.516129032258065e-06,
|
|
"loss": 1.5208,
|
|
"mean_token_accuracy": 0.6523297429084778,
|
|
"step": 59
|
|
},
|
|
{
|
|
"epoch": 0.09787928221859707,
|
|
"grad_norm": 7.88900899887085,
|
|
"learning_rate": 9.67741935483871e-06,
|
|
"loss": 1.5696,
|
|
"mean_token_accuracy": 0.6507083773612976,
|
|
"step": 60
|
|
},
|
|
{
|
|
"epoch": 0.09951060358890701,
|
|
"grad_norm": 7.398552417755127,
|
|
"learning_rate": 9.838709677419356e-06,
|
|
"loss": 1.4991,
|
|
"mean_token_accuracy": 0.6561679840087891,
|
|
"step": 61
|
|
},
|
|
{
|
|
"epoch": 0.10114192495921696,
|
|
"grad_norm": 7.690386772155762,
|
|
"learning_rate": 1e-05,
|
|
"loss": 1.4677,
|
|
"mean_token_accuracy": 0.6609534025192261,
|
|
"step": 62
|
|
},
|
|
{
|
|
"epoch": 0.10277324632952692,
|
|
"grad_norm": 7.935258865356445,
|
|
"learning_rate": 9.999926856137682e-06,
|
|
"loss": 1.5293,
|
|
"mean_token_accuracy": 0.6509740352630615,
|
|
"step": 63
|
|
},
|
|
{
|
|
"epoch": 0.10440456769983687,
|
|
"grad_norm": 7.435649871826172,
|
|
"learning_rate": 9.999707426928513e-06,
|
|
"loss": 1.5408,
|
|
"mean_token_accuracy": 0.6423665881156921,
|
|
"step": 64
|
|
},
|
|
{
|
|
"epoch": 0.10603588907014681,
|
|
"grad_norm": 7.0717668533325195,
|
|
"learning_rate": 9.999341719505784e-06,
|
|
"loss": 1.2598,
|
|
"mean_token_accuracy": 0.7105831503868103,
|
|
"step": 65
|
|
},
|
|
{
|
|
"epoch": 0.10766721044045677,
|
|
"grad_norm": 7.5760722160339355,
|
|
"learning_rate": 9.998829745758052e-06,
|
|
"loss": 1.5635,
|
|
"mean_token_accuracy": 0.6381751298904419,
|
|
"step": 66
|
|
},
|
|
{
|
|
"epoch": 0.10929853181076672,
|
|
"grad_norm": 7.556014060974121,
|
|
"learning_rate": 9.998171522328753e-06,
|
|
"loss": 1.6741,
|
|
"mean_token_accuracy": 0.6098901033401489,
|
|
"step": 67
|
|
},
|
|
{
|
|
"epoch": 0.11092985318107668,
|
|
"grad_norm": 7.316895008087158,
|
|
"learning_rate": 9.99736707061567e-06,
|
|
"loss": 1.698,
|
|
"mean_token_accuracy": 0.6228723526000977,
|
|
"step": 68
|
|
},
|
|
{
|
|
"epoch": 0.11256117455138662,
|
|
"grad_norm": 8.193136215209961,
|
|
"learning_rate": 9.996416416770227e-06,
|
|
"loss": 1.6473,
|
|
"mean_token_accuracy": 0.6394094228744507,
|
|
"step": 69
|
|
},
|
|
{
|
|
"epoch": 0.11419249592169657,
|
|
"grad_norm": 6.792864799499512,
|
|
"learning_rate": 9.995319591696643e-06,
|
|
"loss": 1.6064,
|
|
"mean_token_accuracy": 0.6287455558776855,
|
|
"step": 70
|
|
},
|
|
{
|
|
"epoch": 0.11582381729200653,
|
|
"grad_norm": 7.596305847167969,
|
|
"learning_rate": 9.994076631050926e-06,
|
|
"loss": 1.8675,
|
|
"mean_token_accuracy": 0.5812404155731201,
|
|
"step": 71
|
|
},
|
|
{
|
|
"epoch": 0.11745513866231648,
|
|
"grad_norm": 6.764160633087158,
|
|
"learning_rate": 9.99268757523972e-06,
|
|
"loss": 1.5861,
|
|
"mean_token_accuracy": 0.64697265625,
|
|
"step": 72
|
|
},
|
|
{
|
|
"epoch": 0.11908646003262642,
|
|
"grad_norm": 7.583809852600098,
|
|
"learning_rate": 9.991152469418984e-06,
|
|
"loss": 1.3654,
|
|
"mean_token_accuracy": 0.6922652125358582,
|
|
"step": 73
|
|
},
|
|
{
|
|
"epoch": 0.12071778140293637,
|
|
"grad_norm": 7.365781307220459,
|
|
"learning_rate": 9.989471363492523e-06,
|
|
"loss": 1.6449,
|
|
"mean_token_accuracy": 0.6340000033378601,
|
|
"step": 74
|
|
},
|
|
{
|
|
"epoch": 0.12234910277324633,
|
|
"grad_norm": 7.349303722381592,
|
|
"learning_rate": 9.987644312110373e-06,
|
|
"loss": 1.7496,
|
|
"mean_token_accuracy": 0.6141689419746399,
|
|
"step": 75
|
|
},
|
|
{
|
|
"epoch": 0.12398042414355628,
|
|
"grad_norm": 6.4074273109436035,
|
|
"learning_rate": 9.985671374667024e-06,
|
|
"loss": 1.5874,
|
|
"mean_token_accuracy": 0.6464434862136841,
|
|
"step": 76
|
|
},
|
|
{
|
|
"epoch": 0.12561174551386622,
|
|
"grad_norm": 6.483602046966553,
|
|
"learning_rate": 9.98355261529948e-06,
|
|
"loss": 1.6916,
|
|
"mean_token_accuracy": 0.6172904372215271,
|
|
"step": 77
|
|
},
|
|
{
|
|
"epoch": 0.1272430668841762,
|
|
"grad_norm": 6.887275695800781,
|
|
"learning_rate": 9.981288102885185e-06,
|
|
"loss": 1.6873,
|
|
"mean_token_accuracy": 0.6121962666511536,
|
|
"step": 78
|
|
},
|
|
{
|
|
"epoch": 0.12887438825448613,
|
|
"grad_norm": 6.4050703048706055,
|
|
"learning_rate": 9.978877911039772e-06,
|
|
"loss": 1.4187,
|
|
"mean_token_accuracy": 0.6751824617385864,
|
|
"step": 79
|
|
},
|
|
{
|
|
"epoch": 0.13050570962479607,
|
|
"grad_norm": 6.44724178314209,
|
|
"learning_rate": 9.976322118114685e-06,
|
|
"loss": 1.4161,
|
|
"mean_token_accuracy": 0.6592556834220886,
|
|
"step": 80
|
|
},
|
|
{
|
|
"epoch": 0.13213703099510604,
|
|
"grad_norm": 5.995436668395996,
|
|
"learning_rate": 9.97362080719462e-06,
|
|
"loss": 1.3907,
|
|
"mean_token_accuracy": 0.6656084656715393,
|
|
"step": 81
|
|
},
|
|
{
|
|
"epoch": 0.13376835236541598,
|
|
"grad_norm": 6.501825332641602,
|
|
"learning_rate": 9.970774066094825e-06,
|
|
"loss": 1.6026,
|
|
"mean_token_accuracy": 0.603732168674469,
|
|
"step": 82
|
|
},
|
|
{
|
|
"epoch": 0.13539967373572595,
|
|
"grad_norm": 7.173989772796631,
|
|
"learning_rate": 9.967781987358252e-06,
|
|
"loss": 1.7378,
|
|
"mean_token_accuracy": 0.6143959164619446,
|
|
"step": 83
|
|
},
|
|
{
|
|
"epoch": 0.1370309951060359,
|
|
"grad_norm": 6.576292991638184,
|
|
"learning_rate": 9.964644668252544e-06,
|
|
"loss": 1.4204,
|
|
"mean_token_accuracy": 0.6584976315498352,
|
|
"step": 84
|
|
},
|
|
{
|
|
"epoch": 0.13866231647634583,
|
|
"grad_norm": 8.727774620056152,
|
|
"learning_rate": 9.961362210766871e-06,
|
|
"loss": 1.6993,
|
|
"mean_token_accuracy": 0.6126176118850708,
|
|
"step": 85
|
|
},
|
|
{
|
|
"epoch": 0.1402936378466558,
|
|
"grad_norm": 6.580403804779053,
|
|
"learning_rate": 9.957934721608621e-06,
|
|
"loss": 1.6845,
|
|
"mean_token_accuracy": 0.6215676665306091,
|
|
"step": 86
|
|
},
|
|
{
|
|
"epoch": 0.14192495921696574,
|
|
"grad_norm": 5.9920830726623535,
|
|
"learning_rate": 9.954362312199926e-06,
|
|
"loss": 1.3893,
|
|
"mean_token_accuracy": 0.6767676472663879,
|
|
"step": 87
|
|
},
|
|
{
|
|
"epoch": 0.14355628058727568,
|
|
"grad_norm": 5.893803119659424,
|
|
"learning_rate": 9.950645098674037e-06,
|
|
"loss": 1.4447,
|
|
"mean_token_accuracy": 0.6626806259155273,
|
|
"step": 88
|
|
},
|
|
{
|
|
"epoch": 0.14518760195758565,
|
|
"grad_norm": 6.5982770919799805,
|
|
"learning_rate": 9.946783201871558e-06,
|
|
"loss": 1.3436,
|
|
"mean_token_accuracy": 0.6762666702270508,
|
|
"step": 89
|
|
},
|
|
{
|
|
"epoch": 0.1468189233278956,
|
|
"grad_norm": 5.981234550476074,
|
|
"learning_rate": 9.942776747336509e-06,
|
|
"loss": 1.5784,
|
|
"mean_token_accuracy": 0.6174784898757935,
|
|
"step": 90
|
|
},
|
|
{
|
|
"epoch": 0.14845024469820556,
|
|
"grad_norm": 6.088432788848877,
|
|
"learning_rate": 9.938625865312252e-06,
|
|
"loss": 1.7807,
|
|
"mean_token_accuracy": 0.5808597803115845,
|
|
"step": 91
|
|
},
|
|
{
|
|
"epoch": 0.1500815660685155,
|
|
"grad_norm": 6.743659973144531,
|
|
"learning_rate": 9.934330690737247e-06,
|
|
"loss": 1.6376,
|
|
"mean_token_accuracy": 0.604613721370697,
|
|
"step": 92
|
|
},
|
|
{
|
|
"epoch": 0.15171288743882544,
|
|
"grad_norm": 5.764866828918457,
|
|
"learning_rate": 9.929891363240679e-06,
|
|
"loss": 1.6292,
|
|
"mean_token_accuracy": 0.6264821887016296,
|
|
"step": 93
|
|
},
|
|
{
|
|
"epoch": 0.1533442088091354,
|
|
"grad_norm": 5.750985622406006,
|
|
"learning_rate": 9.925308027137906e-06,
|
|
"loss": 1.3667,
|
|
"mean_token_accuracy": 0.6758104562759399,
|
|
"step": 94
|
|
},
|
|
{
|
|
"epoch": 0.15497553017944535,
|
|
"grad_norm": 5.635873317718506,
|
|
"learning_rate": 9.920580831425774e-06,
|
|
"loss": 1.442,
|
|
"mean_token_accuracy": 0.6777954697608948,
|
|
"step": 95
|
|
},
|
|
{
|
|
"epoch": 0.1566068515497553,
|
|
"grad_norm": 5.207980632781982,
|
|
"learning_rate": 9.915709929777773e-06,
|
|
"loss": 1.1315,
|
|
"mean_token_accuracy": 0.7171201705932617,
|
|
"step": 96
|
|
},
|
|
{
|
|
"epoch": 0.15823817292006526,
|
|
"grad_norm": 6.929599761962891,
|
|
"learning_rate": 9.910695480539043e-06,
|
|
"loss": 1.5498,
|
|
"mean_token_accuracy": 0.6462904810905457,
|
|
"step": 97
|
|
},
|
|
{
|
|
"epoch": 0.1598694942903752,
|
|
"grad_norm": 6.597740173339844,
|
|
"learning_rate": 9.905537646721215e-06,
|
|
"loss": 1.3707,
|
|
"mean_token_accuracy": 0.6714513301849365,
|
|
"step": 98
|
|
},
|
|
{
|
|
"epoch": 0.16150081566068517,
|
|
"grad_norm": 5.562872409820557,
|
|
"learning_rate": 9.900236595997138e-06,
|
|
"loss": 1.2183,
|
|
"mean_token_accuracy": 0.709775984287262,
|
|
"step": 99
|
|
},
|
|
{
|
|
"epoch": 0.1631321370309951,
|
|
"grad_norm": 5.840291976928711,
|
|
"learning_rate": 9.89479250069539e-06,
|
|
"loss": 1.2124,
|
|
"mean_token_accuracy": 0.7145169973373413,
|
|
"step": 100
|
|
},
|
|
{
|
|
"epoch": 0.16476345840130505,
|
|
"grad_norm": 5.99063777923584,
|
|
"learning_rate": 9.889205537794715e-06,
|
|
"loss": 1.3492,
|
|
"mean_token_accuracy": 0.6756311655044556,
|
|
"step": 101
|
|
},
|
|
{
|
|
"epoch": 0.16639477977161501,
|
|
"grad_norm": 6.224008560180664,
|
|
"learning_rate": 9.883475888918241e-06,
|
|
"loss": 1.2016,
|
|
"mean_token_accuracy": 0.7054827809333801,
|
|
"step": 102
|
|
},
|
|
{
|
|
"epoch": 0.16802610114192496,
|
|
"grad_norm": 5.562602519989014,
|
|
"learning_rate": 9.87760374032759e-06,
|
|
"loss": 1.5352,
|
|
"mean_token_accuracy": 0.6521076560020447,
|
|
"step": 103
|
|
},
|
|
{
|
|
"epoch": 0.16965742251223492,
|
|
"grad_norm": 5.726022243499756,
|
|
"learning_rate": 9.87158928291682e-06,
|
|
"loss": 1.3858,
|
|
"mean_token_accuracy": 0.6717791557312012,
|
|
"step": 104
|
|
},
|
|
{
|
|
"epoch": 0.17128874388254486,
|
|
"grad_norm": 6.054457664489746,
|
|
"learning_rate": 9.865432712206215e-06,
|
|
"loss": 1.6255,
|
|
"mean_token_accuracy": 0.6333163976669312,
|
|
"step": 105
|
|
},
|
|
{
|
|
"epoch": 0.1729200652528548,
|
|
"grad_norm": 5.757321357727051,
|
|
"learning_rate": 9.859134228335937e-06,
|
|
"loss": 1.3847,
|
|
"mean_token_accuracy": 0.6641345620155334,
|
|
"step": 106
|
|
},
|
|
{
|
|
"epoch": 0.17455138662316477,
|
|
"grad_norm": 5.4531450271606445,
|
|
"learning_rate": 9.852694036059514e-06,
|
|
"loss": 1.4778,
|
|
"mean_token_accuracy": 0.680861234664917,
|
|
"step": 107
|
|
},
|
|
{
|
|
"epoch": 0.1761827079934747,
|
|
"grad_norm": 6.217274188995361,
|
|
"learning_rate": 9.846112344737182e-06,
|
|
"loss": 1.3624,
|
|
"mean_token_accuracy": 0.6645264625549316,
|
|
"step": 108
|
|
},
|
|
{
|
|
"epoch": 0.17781402936378465,
|
|
"grad_norm": 5.447512626647949,
|
|
"learning_rate": 9.839389368329088e-06,
|
|
"loss": 1.5179,
|
|
"mean_token_accuracy": 0.6528394818305969,
|
|
"step": 109
|
|
},
|
|
{
|
|
"epoch": 0.17944535073409462,
|
|
"grad_norm": 6.115851402282715,
|
|
"learning_rate": 9.832525325388326e-06,
|
|
"loss": 1.6997,
|
|
"mean_token_accuracy": 0.6170212626457214,
|
|
"step": 110
|
|
},
|
|
{
|
|
"epoch": 0.18107667210440456,
|
|
"grad_norm": 5.800912857055664,
|
|
"learning_rate": 9.825520439053832e-06,
|
|
"loss": 1.4313,
|
|
"mean_token_accuracy": 0.6626384854316711,
|
|
"step": 111
|
|
},
|
|
{
|
|
"epoch": 0.18270799347471453,
|
|
"grad_norm": 6.369785785675049,
|
|
"learning_rate": 9.818374937043138e-06,
|
|
"loss": 1.5534,
|
|
"mean_token_accuracy": 0.6290909051895142,
|
|
"step": 112
|
|
},
|
|
{
|
|
"epoch": 0.18433931484502447,
|
|
"grad_norm": 6.613420009613037,
|
|
"learning_rate": 9.811089051644959e-06,
|
|
"loss": 1.6318,
|
|
"mean_token_accuracy": 0.6186726689338684,
|
|
"step": 113
|
|
},
|
|
{
|
|
"epoch": 0.1859706362153344,
|
|
"grad_norm": 5.590596675872803,
|
|
"learning_rate": 9.803663019711654e-06,
|
|
"loss": 1.3043,
|
|
"mean_token_accuracy": 0.6894215941429138,
|
|
"step": 114
|
|
},
|
|
{
|
|
"epoch": 0.18760195758564438,
|
|
"grad_norm": 6.427780628204346,
|
|
"learning_rate": 9.796097082651511e-06,
|
|
"loss": 1.6446,
|
|
"mean_token_accuracy": 0.6234225034713745,
|
|
"step": 115
|
|
},
|
|
{
|
|
"epoch": 0.18923327895595432,
|
|
"grad_norm": 6.452088356018066,
|
|
"learning_rate": 9.788391486420914e-06,
|
|
"loss": 1.4595,
|
|
"mean_token_accuracy": 0.6346368789672852,
|
|
"step": 116
|
|
},
|
|
{
|
|
"epoch": 0.19086460032626426,
|
|
"grad_norm": 5.884222984313965,
|
|
"learning_rate": 9.780546481516338e-06,
|
|
"loss": 1.3437,
|
|
"mean_token_accuracy": 0.6792058348655701,
|
|
"step": 117
|
|
},
|
|
{
|
|
"epoch": 0.19249592169657423,
|
|
"grad_norm": 5.718683242797852,
|
|
"learning_rate": 9.772562322966209e-06,
|
|
"loss": 1.2696,
|
|
"mean_token_accuracy": 0.6850185394287109,
|
|
"step": 118
|
|
},
|
|
{
|
|
"epoch": 0.19412724306688417,
|
|
"grad_norm": 5.645365238189697,
|
|
"learning_rate": 9.764439270322612e-06,
|
|
"loss": 1.5184,
|
|
"mean_token_accuracy": 0.6474390625953674,
|
|
"step": 119
|
|
},
|
|
{
|
|
"epoch": 0.19575856443719414,
|
|
"grad_norm": 5.762539386749268,
|
|
"learning_rate": 9.756177587652857e-06,
|
|
"loss": 1.4345,
|
|
"mean_token_accuracy": 0.6544578075408936,
|
|
"step": 120
|
|
},
|
|
{
|
|
"epoch": 0.19738988580750408,
|
|
"grad_norm": 5.77543306350708,
|
|
"learning_rate": 9.74777754353089e-06,
|
|
"loss": 1.7153,
|
|
"mean_token_accuracy": 0.6169678568840027,
|
|
"step": 121
|
|
},
|
|
{
|
|
"epoch": 0.19902120717781402,
|
|
"grad_norm": 5.565819263458252,
|
|
"learning_rate": 9.739239411028565e-06,
|
|
"loss": 1.3033,
|
|
"mean_token_accuracy": 0.6986506581306458,
|
|
"step": 122
|
|
},
|
|
{
|
|
"epoch": 0.200652528548124,
|
|
"grad_norm": 5.547922134399414,
|
|
"learning_rate": 9.730563467706765e-06,
|
|
"loss": 1.327,
|
|
"mean_token_accuracy": 0.683811604976654,
|
|
"step": 123
|
|
},
|
|
{
|
|
"epoch": 0.20228384991843393,
|
|
"grad_norm": 5.765176296234131,
|
|
"learning_rate": 9.721749995606381e-06,
|
|
"loss": 1.3776,
|
|
"mean_token_accuracy": 0.6884735226631165,
|
|
"step": 124
|
|
},
|
|
{
|
|
"epoch": 0.2039151712887439,
|
|
"grad_norm": 5.340542793273926,
|
|
"learning_rate": 9.712799281239142e-06,
|
|
"loss": 1.4246,
|
|
"mean_token_accuracy": 0.6791791915893555,
|
|
"step": 125
|
|
},
|
|
{
|
|
"epoch": 0.20554649265905384,
|
|
"grad_norm": 5.423886775970459,
|
|
"learning_rate": 9.703711615578301e-06,
|
|
"loss": 1.1438,
|
|
"mean_token_accuracy": 0.7353861927986145,
|
|
"step": 126
|
|
},
|
|
{
|
|
"epoch": 0.20717781402936378,
|
|
"grad_norm": 5.641276836395264,
|
|
"learning_rate": 9.694487294049174e-06,
|
|
"loss": 1.4128,
|
|
"mean_token_accuracy": 0.6514989137649536,
|
|
"step": 127
|
|
},
|
|
{
|
|
"epoch": 0.20880913539967375,
|
|
"grad_norm": 5.543446063995361,
|
|
"learning_rate": 9.685126616519545e-06,
|
|
"loss": 1.4135,
|
|
"mean_token_accuracy": 0.6586325764656067,
|
|
"step": 128
|
|
},
|
|
{
|
|
"epoch": 0.21044045676998369,
|
|
"grad_norm": 6.770927906036377,
|
|
"learning_rate": 9.675629887289904e-06,
|
|
"loss": 1.4884,
|
|
"mean_token_accuracy": 0.6546052694320679,
|
|
"step": 129
|
|
},
|
|
{
|
|
"epoch": 0.21207177814029363,
|
|
"grad_norm": 5.887889385223389,
|
|
"learning_rate": 9.665997415083565e-06,
|
|
"loss": 1.4939,
|
|
"mean_token_accuracy": 0.653674840927124,
|
|
"step": 130
|
|
},
|
|
{
|
|
"epoch": 0.2137030995106036,
|
|
"grad_norm": 5.511849880218506,
|
|
"learning_rate": 9.656229513036623e-06,
|
|
"loss": 1.2267,
|
|
"mean_token_accuracy": 0.7116374969482422,
|
|
"step": 131
|
|
},
|
|
{
|
|
"epoch": 0.21533442088091354,
|
|
"grad_norm": 5.637845039367676,
|
|
"learning_rate": 9.646326498687787e-06,
|
|
"loss": 1.5632,
|
|
"mean_token_accuracy": 0.6471421718597412,
|
|
"step": 132
|
|
},
|
|
{
|
|
"epoch": 0.2169657422512235,
|
|
"grad_norm": 5.33619499206543,
|
|
"learning_rate": 9.636288693968039e-06,
|
|
"loss": 1.4464,
|
|
"mean_token_accuracy": 0.656867265701294,
|
|
"step": 133
|
|
},
|
|
{
|
|
"epoch": 0.21859706362153344,
|
|
"grad_norm": 5.903771877288818,
|
|
"learning_rate": 9.626116425190182e-06,
|
|
"loss": 1.5197,
|
|
"mean_token_accuracy": 0.6431440114974976,
|
|
"step": 134
|
|
},
|
|
{
|
|
"epoch": 0.22022838499184338,
|
|
"grad_norm": 5.29071569442749,
|
|
"learning_rate": 9.615810023038228e-06,
|
|
"loss": 1.4022,
|
|
"mean_token_accuracy": 0.646789014339447,
|
|
"step": 135
|
|
},
|
|
{
|
|
"epoch": 0.22185970636215335,
|
|
"grad_norm": 5.770832538604736,
|
|
"learning_rate": 9.605369822556651e-06,
|
|
"loss": 1.3488,
|
|
"mean_token_accuracy": 0.672672688961029,
|
|
"step": 136
|
|
},
|
|
{
|
|
"epoch": 0.2234910277324633,
|
|
"grad_norm": 5.827826023101807,
|
|
"learning_rate": 9.594796163139487e-06,
|
|
"loss": 1.2913,
|
|
"mean_token_accuracy": 0.707563042640686,
|
|
"step": 137
|
|
},
|
|
{
|
|
"epoch": 0.22512234910277323,
|
|
"grad_norm": 6.449001312255859,
|
|
"learning_rate": 9.584089388519307e-06,
|
|
"loss": 1.6024,
|
|
"mean_token_accuracy": 0.6305343508720398,
|
|
"step": 138
|
|
},
|
|
{
|
|
"epoch": 0.2267536704730832,
|
|
"grad_norm": 5.251701831817627,
|
|
"learning_rate": 9.573249846756048e-06,
|
|
"loss": 1.4945,
|
|
"mean_token_accuracy": 0.6551724076271057,
|
|
"step": 139
|
|
},
|
|
{
|
|
"epoch": 0.22838499184339314,
|
|
"grad_norm": 5.719169616699219,
|
|
"learning_rate": 9.562277890225683e-06,
|
|
"loss": 1.4588,
|
|
"mean_token_accuracy": 0.6551551818847656,
|
|
"step": 140
|
|
},
|
|
{
|
|
"epoch": 0.2300163132137031,
|
|
"grad_norm": 5.2488226890563965,
|
|
"learning_rate": 9.551173875608785e-06,
|
|
"loss": 1.235,
|
|
"mean_token_accuracy": 0.6981236338615417,
|
|
"step": 141
|
|
},
|
|
{
|
|
"epoch": 0.23164763458401305,
|
|
"grad_norm": 5.853959083557129,
|
|
"learning_rate": 9.539938163878916e-06,
|
|
"loss": 1.3501,
|
|
"mean_token_accuracy": 0.6693121790885925,
|
|
"step": 142
|
|
},
|
|
{
|
|
"epoch": 0.233278955954323,
|
|
"grad_norm": 5.647499084472656,
|
|
"learning_rate": 9.528571120290894e-06,
|
|
"loss": 1.2444,
|
|
"mean_token_accuracy": 0.7117318511009216,
|
|
"step": 143
|
|
},
|
|
{
|
|
"epoch": 0.23491027732463296,
|
|
"grad_norm": 5.933478832244873,
|
|
"learning_rate": 9.517073114368933e-06,
|
|
"loss": 1.4919,
|
|
"mean_token_accuracy": 0.6552088856697083,
|
|
"step": 144
|
|
},
|
|
{
|
|
"epoch": 0.2365415986949429,
|
|
"grad_norm": 5.842235565185547,
|
|
"learning_rate": 9.505444519894616e-06,
|
|
"loss": 1.52,
|
|
"mean_token_accuracy": 0.6385658979415894,
|
|
"step": 145
|
|
},
|
|
{
|
|
"epoch": 0.23817292006525284,
|
|
"grad_norm": 6.486652374267578,
|
|
"learning_rate": 9.493685714894746e-06,
|
|
"loss": 1.1983,
|
|
"mean_token_accuracy": 0.7004634737968445,
|
|
"step": 146
|
|
},
|
|
{
|
|
"epoch": 0.2398042414355628,
|
|
"grad_norm": 4.8720245361328125,
|
|
"learning_rate": 9.481797081629068e-06,
|
|
"loss": 1.3004,
|
|
"mean_token_accuracy": 0.709541380405426,
|
|
"step": 147
|
|
},
|
|
{
|
|
"epoch": 0.24143556280587275,
|
|
"grad_norm": 5.410114288330078,
|
|
"learning_rate": 9.469779006577822e-06,
|
|
"loss": 1.2591,
|
|
"mean_token_accuracy": 0.690431535243988,
|
|
"step": 148
|
|
},
|
|
{
|
|
"epoch": 0.24306688417618272,
|
|
"grad_norm": 5.812628746032715,
|
|
"learning_rate": 9.4576318804292e-06,
|
|
"loss": 1.612,
|
|
"mean_token_accuracy": 0.6232700943946838,
|
|
"step": 149
|
|
},
|
|
{
|
|
"epoch": 0.24469820554649266,
|
|
"grad_norm": 6.259674072265625,
|
|
"learning_rate": 9.445356098066638e-06,
|
|
"loss": 1.3041,
|
|
"mean_token_accuracy": 0.6718587875366211,
|
|
"step": 150
|
|
},
|
|
{
|
|
"epoch": 0.2463295269168026,
|
|
"grad_norm": 6.436178207397461,
|
|
"learning_rate": 9.43295205855597e-06,
|
|
"loss": 1.6111,
|
|
"mean_token_accuracy": 0.6207820177078247,
|
|
"step": 151
|
|
},
|
|
{
|
|
"epoch": 0.24796084828711257,
|
|
"grad_norm": 5.527941703796387,
|
|
"learning_rate": 9.420420165132466e-06,
|
|
"loss": 1.6642,
|
|
"mean_token_accuracy": 0.6238217949867249,
|
|
"step": 152
|
|
},
|
|
{
|
|
"epoch": 0.2495921696574225,
|
|
"grad_norm": 5.792147159576416,
|
|
"learning_rate": 9.407760825187722e-06,
|
|
"loss": 1.4365,
|
|
"mean_token_accuracy": 0.6555671095848083,
|
|
"step": 153
|
|
},
|
|
{
|
|
"epoch": 0.25122349102773245,
|
|
"grad_norm": 5.005126953125,
|
|
"learning_rate": 9.39497445025641e-06,
|
|
"loss": 1.2446,
|
|
"mean_token_accuracy": 0.7050209045410156,
|
|
"step": 154
|
|
},
|
|
{
|
|
"epoch": 0.2528548123980424,
|
|
"grad_norm": 5.894453048706055,
|
|
"learning_rate": 9.38206145600291e-06,
|
|
"loss": 1.5225,
|
|
"mean_token_accuracy": 0.6514018774032593,
|
|
"step": 155
|
|
},
|
|
{
|
|
"epoch": 0.2544861337683524,
|
|
"grad_norm": 5.637172698974609,
|
|
"learning_rate": 9.369022262207788e-06,
|
|
"loss": 1.5141,
|
|
"mean_token_accuracy": 0.622454047203064,
|
|
"step": 156
|
|
},
|
|
{
|
|
"epoch": 0.2561174551386623,
|
|
"grad_norm": 5.716491222381592,
|
|
"learning_rate": 9.355857292754152e-06,
|
|
"loss": 1.5215,
|
|
"mean_token_accuracy": 0.6571729779243469,
|
|
"step": 157
|
|
},
|
|
{
|
|
"epoch": 0.25774877650897227,
|
|
"grad_norm": 6.088312149047852,
|
|
"learning_rate": 9.342566975613875e-06,
|
|
"loss": 1.5606,
|
|
"mean_token_accuracy": 0.6172152161598206,
|
|
"step": 158
|
|
},
|
|
{
|
|
"epoch": 0.25938009787928223,
|
|
"grad_norm": 6.6313796043396,
|
|
"learning_rate": 9.329151742833678e-06,
|
|
"loss": 1.261,
|
|
"mean_token_accuracy": 0.6948052048683167,
|
|
"step": 159
|
|
},
|
|
{
|
|
"epoch": 0.26101141924959215,
|
|
"grad_norm": 6.572261333465576,
|
|
"learning_rate": 9.315612030521091e-06,
|
|
"loss": 1.174,
|
|
"mean_token_accuracy": 0.7152777910232544,
|
|
"step": 160
|
|
},
|
|
{
|
|
"epoch": 0.2626427406199021,
|
|
"grad_norm": 6.0583882331848145,
|
|
"learning_rate": 9.301948278830273e-06,
|
|
"loss": 1.4,
|
|
"mean_token_accuracy": 0.6757156848907471,
|
|
"step": 161
|
|
},
|
|
{
|
|
"epoch": 0.2642740619902121,
|
|
"grad_norm": 5.715542316436768,
|
|
"learning_rate": 9.288160931947698e-06,
|
|
"loss": 1.3266,
|
|
"mean_token_accuracy": 0.6793855428695679,
|
|
"step": 162
|
|
},
|
|
{
|
|
"epoch": 0.265905383360522,
|
|
"grad_norm": 5.376319408416748,
|
|
"learning_rate": 9.274250438077724e-06,
|
|
"loss": 1.1109,
|
|
"mean_token_accuracy": 0.7322580814361572,
|
|
"step": 163
|
|
},
|
|
{
|
|
"epoch": 0.26753670473083196,
|
|
"grad_norm": 5.3145012855529785,
|
|
"learning_rate": 9.260217249428016e-06,
|
|
"loss": 1.1862,
|
|
"mean_token_accuracy": 0.7048360109329224,
|
|
"step": 164
|
|
},
|
|
{
|
|
"epoch": 0.26916802610114193,
|
|
"grad_norm": 6.1805338859558105,
|
|
"learning_rate": 9.246061822194849e-06,
|
|
"loss": 1.5489,
|
|
"mean_token_accuracy": 0.6458333134651184,
|
|
"step": 165
|
|
},
|
|
{
|
|
"epoch": 0.2707993474714519,
|
|
"grad_norm": 5.672875881195068,
|
|
"learning_rate": 9.231784616548277e-06,
|
|
"loss": 1.3288,
|
|
"mean_token_accuracy": 0.6853932738304138,
|
|
"step": 166
|
|
},
|
|
{
|
|
"epoch": 0.2724306688417618,
|
|
"grad_norm": 5.999112606048584,
|
|
"learning_rate": 9.217386096617175e-06,
|
|
"loss": 1.5361,
|
|
"mean_token_accuracy": 0.6438902616500854,
|
|
"step": 167
|
|
},
|
|
{
|
|
"epoch": 0.2740619902120718,
|
|
"grad_norm": 6.415194511413574,
|
|
"learning_rate": 9.202866730474143e-06,
|
|
"loss": 1.5405,
|
|
"mean_token_accuracy": 0.6401821970939636,
|
|
"step": 168
|
|
},
|
|
{
|
|
"epoch": 0.27569331158238175,
|
|
"grad_norm": 6.119101524353027,
|
|
"learning_rate": 9.188226990120303e-06,
|
|
"loss": 1.4685,
|
|
"mean_token_accuracy": 0.6468571424484253,
|
|
"step": 169
|
|
},
|
|
{
|
|
"epoch": 0.27732463295269166,
|
|
"grad_norm": 5.0899434089660645,
|
|
"learning_rate": 9.173467351469943e-06,
|
|
"loss": 1.1837,
|
|
"mean_token_accuracy": 0.7153804302215576,
|
|
"step": 170
|
|
},
|
|
{
|
|
"epoch": 0.27895595432300163,
|
|
"grad_norm": 5.665865421295166,
|
|
"learning_rate": 9.158588294335055e-06,
|
|
"loss": 1.271,
|
|
"mean_token_accuracy": 0.6892816424369812,
|
|
"step": 171
|
|
},
|
|
{
|
|
"epoch": 0.2805872756933116,
|
|
"grad_norm": 5.781040668487549,
|
|
"learning_rate": 9.14359030240973e-06,
|
|
"loss": 1.1938,
|
|
"mean_token_accuracy": 0.7185488343238831,
|
|
"step": 172
|
|
},
|
|
{
|
|
"epoch": 0.2822185970636215,
|
|
"grad_norm": 4.997267723083496,
|
|
"learning_rate": 9.128473863254438e-06,
|
|
"loss": 1.2519,
|
|
"mean_token_accuracy": 0.6875,
|
|
"step": 173
|
|
},
|
|
{
|
|
"epoch": 0.2838499184339315,
|
|
"grad_norm": 5.392592906951904,
|
|
"learning_rate": 9.113239468280175e-06,
|
|
"loss": 1.5819,
|
|
"mean_token_accuracy": 0.6332082748413086,
|
|
"step": 174
|
|
},
|
|
{
|
|
"epoch": 0.28548123980424145,
|
|
"grad_norm": 4.405828952789307,
|
|
"learning_rate": 9.097887612732495e-06,
|
|
"loss": 0.9685,
|
|
"mean_token_accuracy": 0.7657608985900879,
|
|
"step": 175
|
|
},
|
|
{
|
|
"epoch": 0.28711256117455136,
|
|
"grad_norm": 4.870915412902832,
|
|
"learning_rate": 9.082418795675397e-06,
|
|
"loss": 1.2698,
|
|
"mean_token_accuracy": 0.7014712691307068,
|
|
"step": 176
|
|
},
|
|
{
|
|
"epoch": 0.28874388254486133,
|
|
"grad_norm": 5.485860824584961,
|
|
"learning_rate": 9.066833519975118e-06,
|
|
"loss": 1.3616,
|
|
"mean_token_accuracy": 0.6694870591163635,
|
|
"step": 177
|
|
},
|
|
{
|
|
"epoch": 0.2903752039151713,
|
|
"grad_norm": 5.251032829284668,
|
|
"learning_rate": 9.051132292283772e-06,
|
|
"loss": 1.1863,
|
|
"mean_token_accuracy": 0.6943209767341614,
|
|
"step": 178
|
|
},
|
|
{
|
|
"epoch": 0.29200652528548127,
|
|
"grad_norm": 5.481298923492432,
|
|
"learning_rate": 9.035315623022886e-06,
|
|
"loss": 1.3581,
|
|
"mean_token_accuracy": 0.6696730256080627,
|
|
"step": 179
|
|
},
|
|
{
|
|
"epoch": 0.2936378466557912,
|
|
"grad_norm": 5.111570358276367,
|
|
"learning_rate": 9.019384026366807e-06,
|
|
"loss": 1.3505,
|
|
"mean_token_accuracy": 0.6688086986541748,
|
|
"step": 180
|
|
},
|
|
{
|
|
"epoch": 0.29526916802610115,
|
|
"grad_norm": 4.826779842376709,
|
|
"learning_rate": 9.003338020225986e-06,
|
|
"loss": 1.1635,
|
|
"mean_token_accuracy": 0.7186034321784973,
|
|
"step": 181
|
|
},
|
|
{
|
|
"epoch": 0.2969004893964111,
|
|
"grad_norm": 5.660580635070801,
|
|
"learning_rate": 8.987178126230138e-06,
|
|
"loss": 1.5801,
|
|
"mean_token_accuracy": 0.6331775784492493,
|
|
"step": 182
|
|
},
|
|
{
|
|
"epoch": 0.29853181076672103,
|
|
"grad_norm": 5.761633396148682,
|
|
"learning_rate": 8.97090486971129e-06,
|
|
"loss": 1.1748,
|
|
"mean_token_accuracy": 0.7208150029182434,
|
|
"step": 183
|
|
},
|
|
{
|
|
"epoch": 0.300163132137031,
|
|
"grad_norm": 5.576194763183594,
|
|
"learning_rate": 8.954518779686704e-06,
|
|
"loss": 1.4442,
|
|
"mean_token_accuracy": 0.6586382389068604,
|
|
"step": 184
|
|
},
|
|
{
|
|
"epoch": 0.30179445350734097,
|
|
"grad_norm": 5.576228618621826,
|
|
"learning_rate": 8.938020388841673e-06,
|
|
"loss": 1.3454,
|
|
"mean_token_accuracy": 0.6765140295028687,
|
|
"step": 185
|
|
},
|
|
{
|
|
"epoch": 0.3034257748776509,
|
|
"grad_norm": 4.994912624359131,
|
|
"learning_rate": 8.921410233512211e-06,
|
|
"loss": 1.24,
|
|
"mean_token_accuracy": 0.7072243094444275,
|
|
"step": 186
|
|
},
|
|
{
|
|
"epoch": 0.30505709624796085,
|
|
"grad_norm": 5.298640251159668,
|
|
"learning_rate": 8.904688853667612e-06,
|
|
"loss": 1.3136,
|
|
"mean_token_accuracy": 0.6705882549285889,
|
|
"step": 187
|
|
},
|
|
{
|
|
"epoch": 0.3066884176182708,
|
|
"grad_norm": 5.550191879272461,
|
|
"learning_rate": 8.887856792892902e-06,
|
|
"loss": 1.3868,
|
|
"mean_token_accuracy": 0.6856528520584106,
|
|
"step": 188
|
|
},
|
|
{
|
|
"epoch": 0.3083197389885807,
|
|
"grad_norm": 5.478514671325684,
|
|
"learning_rate": 8.87091459837116e-06,
|
|
"loss": 1.2973,
|
|
"mean_token_accuracy": 0.6864721775054932,
|
|
"step": 189
|
|
},
|
|
{
|
|
"epoch": 0.3099510603588907,
|
|
"grad_norm": 5.3640546798706055,
|
|
"learning_rate": 8.853862820865742e-06,
|
|
"loss": 1.4836,
|
|
"mean_token_accuracy": 0.6382033824920654,
|
|
"step": 190
|
|
},
|
|
{
|
|
"epoch": 0.31158238172920066,
|
|
"grad_norm": 4.50584077835083,
|
|
"learning_rate": 8.83670201470237e-06,
|
|
"loss": 1.0835,
|
|
"mean_token_accuracy": 0.7182095646858215,
|
|
"step": 191
|
|
},
|
|
{
|
|
"epoch": 0.3132137030995106,
|
|
"grad_norm": 5.293252944946289,
|
|
"learning_rate": 8.819432737751097e-06,
|
|
"loss": 1.2622,
|
|
"mean_token_accuracy": 0.6940993666648865,
|
|
"step": 192
|
|
},
|
|
{
|
|
"epoch": 0.31484502446982054,
|
|
"grad_norm": 4.696035861968994,
|
|
"learning_rate": 8.802055551408207e-06,
|
|
"loss": 1.189,
|
|
"mean_token_accuracy": 0.7159493565559387,
|
|
"step": 193
|
|
},
|
|
{
|
|
"epoch": 0.3164763458401305,
|
|
"grad_norm": 4.758869171142578,
|
|
"learning_rate": 8.784571020577926e-06,
|
|
"loss": 1.0363,
|
|
"mean_token_accuracy": 0.7414075136184692,
|
|
"step": 194
|
|
},
|
|
{
|
|
"epoch": 0.3181076672104405,
|
|
"grad_norm": 5.393585681915283,
|
|
"learning_rate": 8.76697971365409e-06,
|
|
"loss": 1.3754,
|
|
"mean_token_accuracy": 0.6663179993629456,
|
|
"step": 195
|
|
},
|
|
{
|
|
"epoch": 0.3197389885807504,
|
|
"grad_norm": 5.480104446411133,
|
|
"learning_rate": 8.74928220250164e-06,
|
|
"loss": 1.7055,
|
|
"mean_token_accuracy": 0.6046082973480225,
|
|
"step": 196
|
|
},
|
|
{
|
|
"epoch": 0.32137030995106036,
|
|
"grad_norm": 5.184609413146973,
|
|
"learning_rate": 8.731479062438056e-06,
|
|
"loss": 1.4335,
|
|
"mean_token_accuracy": 0.6592000126838684,
|
|
"step": 197
|
|
},
|
|
{
|
|
"epoch": 0.32300163132137033,
|
|
"grad_norm": 5.132387638092041,
|
|
"learning_rate": 8.713570872214637e-06,
|
|
"loss": 1.4172,
|
|
"mean_token_accuracy": 0.6633475422859192,
|
|
"step": 198
|
|
},
|
|
{
|
|
"epoch": 0.32463295269168024,
|
|
"grad_norm": 5.561227798461914,
|
|
"learning_rate": 8.695558213997692e-06,
|
|
"loss": 1.5116,
|
|
"mean_token_accuracy": 0.6382217407226562,
|
|
"step": 199
|
|
},
|
|
{
|
|
"epoch": 0.3262642740619902,
|
|
"grad_norm": 6.255463123321533,
|
|
"learning_rate": 8.677441673349622e-06,
|
|
"loss": 1.3863,
|
|
"mean_token_accuracy": 0.6630803942680359,
|
|
"step": 200
|
|
},
|
|
{
|
|
"epoch": 0.3278955954323002,
|
|
"grad_norm": 4.947396755218506,
|
|
"learning_rate": 8.659221839209869e-06,
|
|
"loss": 1.4143,
|
|
"mean_token_accuracy": 0.6645483374595642,
|
|
"step": 201
|
|
},
|
|
{
|
|
"epoch": 0.3295269168026101,
|
|
"grad_norm": 5.235170364379883,
|
|
"learning_rate": 8.640899303875785e-06,
|
|
"loss": 1.2793,
|
|
"mean_token_accuracy": 0.6936695575714111,
|
|
"step": 202
|
|
},
|
|
{
|
|
"epoch": 0.33115823817292006,
|
|
"grad_norm": 5.727679252624512,
|
|
"learning_rate": 8.622474662983372e-06,
|
|
"loss": 1.428,
|
|
"mean_token_accuracy": 0.6479238867759705,
|
|
"step": 203
|
|
},
|
|
{
|
|
"epoch": 0.33278955954323003,
|
|
"grad_norm": 5.557906627655029,
|
|
"learning_rate": 8.60394851548792e-06,
|
|
"loss": 1.3305,
|
|
"mean_token_accuracy": 0.6868632435798645,
|
|
"step": 204
|
|
},
|
|
{
|
|
"epoch": 0.33442088091353994,
|
|
"grad_norm": 5.403807640075684,
|
|
"learning_rate": 8.585321463644525e-06,
|
|
"loss": 1.3701,
|
|
"mean_token_accuracy": 0.6680100560188293,
|
|
"step": 205
|
|
},
|
|
{
|
|
"epoch": 0.3360522022838499,
|
|
"grad_norm": 5.334835052490234,
|
|
"learning_rate": 8.566594112988534e-06,
|
|
"loss": 1.3598,
|
|
"mean_token_accuracy": 0.6583541035652161,
|
|
"step": 206
|
|
},
|
|
{
|
|
"epoch": 0.3376835236541599,
|
|
"grad_norm": 4.983403205871582,
|
|
"learning_rate": 8.547767072315835e-06,
|
|
"loss": 1.2434,
|
|
"mean_token_accuracy": 0.6838777661323547,
|
|
"step": 207
|
|
},
|
|
{
|
|
"epoch": 0.33931484502446985,
|
|
"grad_norm": 5.587502956390381,
|
|
"learning_rate": 8.528840953663086e-06,
|
|
"loss": 1.3061,
|
|
"mean_token_accuracy": 0.688642680644989,
|
|
"step": 208
|
|
},
|
|
{
|
|
"epoch": 0.34094616639477976,
|
|
"grad_norm": 5.853117942810059,
|
|
"learning_rate": 8.5098163722878e-06,
|
|
"loss": 1.4633,
|
|
"mean_token_accuracy": 0.6635462641716003,
|
|
"step": 209
|
|
},
|
|
{
|
|
"epoch": 0.3425774877650897,
|
|
"grad_norm": 5.541942596435547,
|
|
"learning_rate": 8.490693946648364e-06,
|
|
"loss": 1.2622,
|
|
"mean_token_accuracy": 0.7057894468307495,
|
|
"step": 210
|
|
},
|
|
{
|
|
"epoch": 0.3442088091353997,
|
|
"grad_norm": 5.35739278793335,
|
|
"learning_rate": 8.47147429838392e-06,
|
|
"loss": 1.2618,
|
|
"mean_token_accuracy": 0.689638078212738,
|
|
"step": 211
|
|
},
|
|
{
|
|
"epoch": 0.3458401305057096,
|
|
"grad_norm": 5.423904895782471,
|
|
"learning_rate": 8.452158052294158e-06,
|
|
"loss": 1.5032,
|
|
"mean_token_accuracy": 0.6418230533599854,
|
|
"step": 212
|
|
},
|
|
{
|
|
"epoch": 0.3474714518760196,
|
|
"grad_norm": 4.8785223960876465,
|
|
"learning_rate": 8.432745836319007e-06,
|
|
"loss": 1.4344,
|
|
"mean_token_accuracy": 0.6615913510322571,
|
|
"step": 213
|
|
},
|
|
{
|
|
"epoch": 0.34910277324632955,
|
|
"grad_norm": 4.893246650695801,
|
|
"learning_rate": 8.413238281518225e-06,
|
|
"loss": 1.2007,
|
|
"mean_token_accuracy": 0.6991991996765137,
|
|
"step": 214
|
|
},
|
|
{
|
|
"epoch": 0.35073409461663946,
|
|
"grad_norm": 5.7973504066467285,
|
|
"learning_rate": 8.39363602205088e-06,
|
|
"loss": 1.5249,
|
|
"mean_token_accuracy": 0.6353210806846619,
|
|
"step": 215
|
|
},
|
|
{
|
|
"epoch": 0.3523654159869494,
|
|
"grad_norm": 5.406508922576904,
|
|
"learning_rate": 8.373939695154739e-06,
|
|
"loss": 1.2806,
|
|
"mean_token_accuracy": 0.6916395425796509,
|
|
"step": 216
|
|
},
|
|
{
|
|
"epoch": 0.3539967373572594,
|
|
"grad_norm": 4.771231174468994,
|
|
"learning_rate": 8.354149941125539e-06,
|
|
"loss": 1.1256,
|
|
"mean_token_accuracy": 0.7322953343391418,
|
|
"step": 217
|
|
},
|
|
{
|
|
"epoch": 0.3556280587275693,
|
|
"grad_norm": 5.047488689422607,
|
|
"learning_rate": 8.334267403296193e-06,
|
|
"loss": 1.1106,
|
|
"mean_token_accuracy": 0.7239696383476257,
|
|
"step": 218
|
|
},
|
|
{
|
|
"epoch": 0.3572593800978793,
|
|
"grad_norm": 5.410397529602051,
|
|
"learning_rate": 8.314292728015859e-06,
|
|
"loss": 1.182,
|
|
"mean_token_accuracy": 0.7058823704719543,
|
|
"step": 219
|
|
},
|
|
{
|
|
"epoch": 0.35889070146818924,
|
|
"grad_norm": 6.237778663635254,
|
|
"learning_rate": 8.294226564628936e-06,
|
|
"loss": 1.2493,
|
|
"mean_token_accuracy": 0.6834862232208252,
|
|
"step": 220
|
|
},
|
|
{
|
|
"epoch": 0.3605220228384992,
|
|
"grad_norm": 5.143507957458496,
|
|
"learning_rate": 8.274069565453955e-06,
|
|
"loss": 1.352,
|
|
"mean_token_accuracy": 0.6808404326438904,
|
|
"step": 221
|
|
},
|
|
{
|
|
"epoch": 0.3621533442088091,
|
|
"grad_norm": 5.389186859130859,
|
|
"learning_rate": 8.25382238576237e-06,
|
|
"loss": 1.2109,
|
|
"mean_token_accuracy": 0.7188329100608826,
|
|
"step": 222
|
|
},
|
|
{
|
|
"epoch": 0.3637846655791191,
|
|
"grad_norm": 5.256932735443115,
|
|
"learning_rate": 8.23348568375726e-06,
|
|
"loss": 1.3621,
|
|
"mean_token_accuracy": 0.679024875164032,
|
|
"step": 223
|
|
},
|
|
{
|
|
"epoch": 0.36541598694942906,
|
|
"grad_norm": 5.2731146812438965,
|
|
"learning_rate": 8.213060120551923e-06,
|
|
"loss": 1.4888,
|
|
"mean_token_accuracy": 0.644489586353302,
|
|
"step": 224
|
|
},
|
|
{
|
|
"epoch": 0.367047308319739,
|
|
"grad_norm": 5.008488655090332,
|
|
"learning_rate": 8.1925463601484e-06,
|
|
"loss": 1.3388,
|
|
"mean_token_accuracy": 0.6928645372390747,
|
|
"step": 225
|
|
},
|
|
{
|
|
"epoch": 0.36867862969004894,
|
|
"grad_norm": 6.0909247398376465,
|
|
"learning_rate": 8.171945069415877e-06,
|
|
"loss": 1.3308,
|
|
"mean_token_accuracy": 0.6703540086746216,
|
|
"step": 226
|
|
},
|
|
{
|
|
"epoch": 0.3703099510603589,
|
|
"grad_norm": 6.270472526550293,
|
|
"learning_rate": 8.151256918069002e-06,
|
|
"loss": 1.5142,
|
|
"mean_token_accuracy": 0.6341871023178101,
|
|
"step": 227
|
|
},
|
|
{
|
|
"epoch": 0.3719412724306688,
|
|
"grad_norm": 5.570935249328613,
|
|
"learning_rate": 8.130482578646137e-06,
|
|
"loss": 1.1315,
|
|
"mean_token_accuracy": 0.7041916251182556,
|
|
"step": 228
|
|
},
|
|
{
|
|
"epoch": 0.3735725938009788,
|
|
"grad_norm": 5.195607662200928,
|
|
"learning_rate": 8.109622726487463e-06,
|
|
"loss": 1.54,
|
|
"mean_token_accuracy": 0.6397637724876404,
|
|
"step": 229
|
|
},
|
|
{
|
|
"epoch": 0.37520391517128876,
|
|
"grad_norm": 4.792831897735596,
|
|
"learning_rate": 8.088678039713052e-06,
|
|
"loss": 1.2567,
|
|
"mean_token_accuracy": 0.7066537141799927,
|
|
"step": 230
|
|
},
|
|
{
|
|
"epoch": 0.3768352365415987,
|
|
"grad_norm": 5.558446407318115,
|
|
"learning_rate": 8.067649199200807e-06,
|
|
"loss": 1.3282,
|
|
"mean_token_accuracy": 0.6886616945266724,
|
|
"step": 231
|
|
},
|
|
{
|
|
"epoch": 0.37846655791190864,
|
|
"grad_norm": 5.962700366973877,
|
|
"learning_rate": 8.046536888564335e-06,
|
|
"loss": 1.2761,
|
|
"mean_token_accuracy": 0.6899516582489014,
|
|
"step": 232
|
|
},
|
|
{
|
|
"epoch": 0.3800978792822186,
|
|
"grad_norm": 4.565369129180908,
|
|
"learning_rate": 8.025341794130722e-06,
|
|
"loss": 1.1489,
|
|
"mean_token_accuracy": 0.7200378775596619,
|
|
"step": 233
|
|
},
|
|
{
|
|
"epoch": 0.3817292006525285,
|
|
"grad_norm": 5.34097146987915,
|
|
"learning_rate": 8.004064604918219e-06,
|
|
"loss": 1.5369,
|
|
"mean_token_accuracy": 0.6295350193977356,
|
|
"step": 234
|
|
},
|
|
{
|
|
"epoch": 0.3833605220228385,
|
|
"grad_norm": 4.983196258544922,
|
|
"learning_rate": 7.982706012613854e-06,
|
|
"loss": 1.1661,
|
|
"mean_token_accuracy": 0.6999412775039673,
|
|
"step": 235
|
|
},
|
|
{
|
|
"epoch": 0.38499184339314846,
|
|
"grad_norm": 5.128100395202637,
|
|
"learning_rate": 7.961266711550922e-06,
|
|
"loss": 1.345,
|
|
"mean_token_accuracy": 0.6874102354049683,
|
|
"step": 236
|
|
},
|
|
{
|
|
"epoch": 0.3866231647634584,
|
|
"grad_norm": 5.386168479919434,
|
|
"learning_rate": 7.939747398686445e-06,
|
|
"loss": 1.3224,
|
|
"mean_token_accuracy": 0.6796019673347473,
|
|
"step": 237
|
|
},
|
|
{
|
|
"epoch": 0.38825448613376834,
|
|
"grad_norm": 5.458306312561035,
|
|
"learning_rate": 7.918148773578492e-06,
|
|
"loss": 1.4898,
|
|
"mean_token_accuracy": 0.6451776623725891,
|
|
"step": 238
|
|
},
|
|
{
|
|
"epoch": 0.3898858075040783,
|
|
"grad_norm": 5.1783552169799805,
|
|
"learning_rate": 7.896471538363442e-06,
|
|
"loss": 1.5354,
|
|
"mean_token_accuracy": 0.6542155742645264,
|
|
"step": 239
|
|
},
|
|
{
|
|
"epoch": 0.3915171288743883,
|
|
"grad_norm": 5.7401580810546875,
|
|
"learning_rate": 7.874716397733172e-06,
|
|
"loss": 1.4129,
|
|
"mean_token_accuracy": 0.6713286638259888,
|
|
"step": 240
|
|
},
|
|
{
|
|
"epoch": 0.3931484502446982,
|
|
"grad_norm": 5.0389180183410645,
|
|
"learning_rate": 7.852884058912124e-06,
|
|
"loss": 1.4643,
|
|
"mean_token_accuracy": 0.6414728760719299,
|
|
"step": 241
|
|
},
|
|
{
|
|
"epoch": 0.39477977161500816,
|
|
"grad_norm": 4.874699592590332,
|
|
"learning_rate": 7.830975231634341e-06,
|
|
"loss": 1.0325,
|
|
"mean_token_accuracy": 0.740480363368988,
|
|
"step": 242
|
|
},
|
|
{
|
|
"epoch": 0.3964110929853181,
|
|
"grad_norm": 4.744316101074219,
|
|
"learning_rate": 7.808990628120374e-06,
|
|
"loss": 1.154,
|
|
"mean_token_accuracy": 0.7321428656578064,
|
|
"step": 243
|
|
},
|
|
{
|
|
"epoch": 0.39804241435562804,
|
|
"grad_norm": 4.903902530670166,
|
|
"learning_rate": 7.786930963054142e-06,
|
|
"loss": 1.2538,
|
|
"mean_token_accuracy": 0.6969696879386902,
|
|
"step": 244
|
|
},
|
|
{
|
|
"epoch": 0.399673735725938,
|
|
"grad_norm": 4.861123085021973,
|
|
"learning_rate": 7.76479695355969e-06,
|
|
"loss": 1.1761,
|
|
"mean_token_accuracy": 0.7024747133255005,
|
|
"step": 245
|
|
},
|
|
{
|
|
"epoch": 0.401305057096248,
|
|
"grad_norm": 5.309647083282471,
|
|
"learning_rate": 7.742589319177879e-06,
|
|
"loss": 1.2522,
|
|
"mean_token_accuracy": 0.7030481696128845,
|
|
"step": 246
|
|
},
|
|
{
|
|
"epoch": 0.4029363784665579,
|
|
"grad_norm": 4.72802209854126,
|
|
"learning_rate": 7.720308781843003e-06,
|
|
"loss": 1.1986,
|
|
"mean_token_accuracy": 0.7094155550003052,
|
|
"step": 247
|
|
},
|
|
{
|
|
"epoch": 0.40456769983686786,
|
|
"grad_norm": 6.070117473602295,
|
|
"learning_rate": 7.697956065859308e-06,
|
|
"loss": 1.295,
|
|
"mean_token_accuracy": 0.6842672228813171,
|
|
"step": 248
|
|
},
|
|
{
|
|
"epoch": 0.4061990212071778,
|
|
"grad_norm": 4.879459857940674,
|
|
"learning_rate": 7.67553189787745e-06,
|
|
"loss": 1.2096,
|
|
"mean_token_accuracy": 0.686804473400116,
|
|
"step": 249
|
|
},
|
|
{
|
|
"epoch": 0.4078303425774878,
|
|
"grad_norm": 5.451211452484131,
|
|
"learning_rate": 7.653037006870878e-06,
|
|
"loss": 1.4763,
|
|
"mean_token_accuracy": 0.637888491153717,
|
|
"step": 250
|
|
},
|
|
{
|
|
"epoch": 0.4094616639477977,
|
|
"grad_norm": 4.923818588256836,
|
|
"learning_rate": 7.630472124112125e-06,
|
|
"loss": 1.2607,
|
|
"mean_token_accuracy": 0.6872745752334595,
|
|
"step": 251
|
|
},
|
|
{
|
|
"epoch": 0.4110929853181077,
|
|
"grad_norm": 5.415319442749023,
|
|
"learning_rate": 7.607837983149057e-06,
|
|
"loss": 1.1446,
|
|
"mean_token_accuracy": 0.7222545146942139,
|
|
"step": 252
|
|
},
|
|
{
|
|
"epoch": 0.41272430668841764,
|
|
"grad_norm": 5.4529900550842285,
|
|
"learning_rate": 7.585135319780995e-06,
|
|
"loss": 1.4468,
|
|
"mean_token_accuracy": 0.6554580926895142,
|
|
"step": 253
|
|
},
|
|
{
|
|
"epoch": 0.41435562805872755,
|
|
"grad_norm": 5.247809410095215,
|
|
"learning_rate": 7.562364872034823e-06,
|
|
"loss": 1.3883,
|
|
"mean_token_accuracy": 0.6721068024635315,
|
|
"step": 254
|
|
},
|
|
{
|
|
"epoch": 0.4159869494290375,
|
|
"grad_norm": 5.47812557220459,
|
|
"learning_rate": 7.5395273801409854e-06,
|
|
"loss": 1.4343,
|
|
"mean_token_accuracy": 0.6608517169952393,
|
|
"step": 255
|
|
},
|
|
{
|
|
"epoch": 0.4176182707993475,
|
|
"grad_norm": 5.498720645904541,
|
|
"learning_rate": 7.5166235865094174e-06,
|
|
"loss": 1.4222,
|
|
"mean_token_accuracy": 0.6456736326217651,
|
|
"step": 256
|
|
},
|
|
{
|
|
"epoch": 0.4192495921696574,
|
|
"grad_norm": 4.786160945892334,
|
|
"learning_rate": 7.493654235705422e-06,
|
|
"loss": 1.4204,
|
|
"mean_token_accuracy": 0.6773132681846619,
|
|
"step": 257
|
|
},
|
|
{
|
|
"epoch": 0.42088091353996737,
|
|
"grad_norm": 5.397915840148926,
|
|
"learning_rate": 7.470620074425459e-06,
|
|
"loss": 1.4843,
|
|
"mean_token_accuracy": 0.6380900740623474,
|
|
"step": 258
|
|
},
|
|
{
|
|
"epoch": 0.42251223491027734,
|
|
"grad_norm": 5.466760158538818,
|
|
"learning_rate": 7.447521851472872e-06,
|
|
"loss": 1.4852,
|
|
"mean_token_accuracy": 0.6487154364585876,
|
|
"step": 259
|
|
},
|
|
{
|
|
"epoch": 0.42414355628058725,
|
|
"grad_norm": 5.804627895355225,
|
|
"learning_rate": 7.424360317733544e-06,
|
|
"loss": 1.3923,
|
|
"mean_token_accuracy": 0.6545741558074951,
|
|
"step": 260
|
|
},
|
|
{
|
|
"epoch": 0.4257748776508972,
|
|
"grad_norm": 5.381365776062012,
|
|
"learning_rate": 7.401136226151488e-06,
|
|
"loss": 1.4495,
|
|
"mean_token_accuracy": 0.6681222915649414,
|
|
"step": 261
|
|
},
|
|
{
|
|
"epoch": 0.4274061990212072,
|
|
"grad_norm": 4.776740550994873,
|
|
"learning_rate": 7.377850331704377e-06,
|
|
"loss": 1.0082,
|
|
"mean_token_accuracy": 0.7397812604904175,
|
|
"step": 262
|
|
},
|
|
{
|
|
"epoch": 0.4290375203915171,
|
|
"grad_norm": 5.0946149826049805,
|
|
"learning_rate": 7.354503391378992e-06,
|
|
"loss": 1.1745,
|
|
"mean_token_accuracy": 0.7127882838249207,
|
|
"step": 263
|
|
},
|
|
{
|
|
"epoch": 0.43066884176182707,
|
|
"grad_norm": 5.161426067352295,
|
|
"learning_rate": 7.331096164146616e-06,
|
|
"loss": 1.4598,
|
|
"mean_token_accuracy": 0.6507353186607361,
|
|
"step": 264
|
|
},
|
|
{
|
|
"epoch": 0.43230016313213704,
|
|
"grad_norm": 5.14084005355835,
|
|
"learning_rate": 7.307629410938364e-06,
|
|
"loss": 1.3107,
|
|
"mean_token_accuracy": 0.6751173734664917,
|
|
"step": 265
|
|
},
|
|
{
|
|
"epoch": 0.433931484502447,
|
|
"grad_norm": 4.744462966918945,
|
|
"learning_rate": 7.28410389462044e-06,
|
|
"loss": 1.2483,
|
|
"mean_token_accuracy": 0.6957618594169617,
|
|
"step": 266
|
|
},
|
|
{
|
|
"epoch": 0.4355628058727569,
|
|
"grad_norm": 5.074441432952881,
|
|
"learning_rate": 7.260520379969347e-06,
|
|
"loss": 1.2429,
|
|
"mean_token_accuracy": 0.7157652378082275,
|
|
"step": 267
|
|
},
|
|
{
|
|
"epoch": 0.4371941272430669,
|
|
"grad_norm": 5.745429992675781,
|
|
"learning_rate": 7.236879633647018e-06,
|
|
"loss": 1.3938,
|
|
"mean_token_accuracy": 0.6745472550392151,
|
|
"step": 268
|
|
},
|
|
{
|
|
"epoch": 0.43882544861337686,
|
|
"grad_norm": 4.631476879119873,
|
|
"learning_rate": 7.213182424175895e-06,
|
|
"loss": 1.141,
|
|
"mean_token_accuracy": 0.726822018623352,
|
|
"step": 269
|
|
},
|
|
{
|
|
"epoch": 0.44045676998368677,
|
|
"grad_norm": 4.983883857727051,
|
|
"learning_rate": 7.189429521913942e-06,
|
|
"loss": 1.4304,
|
|
"mean_token_accuracy": 0.656844973564148,
|
|
"step": 270
|
|
},
|
|
{
|
|
"epoch": 0.44208809135399674,
|
|
"grad_norm": 5.129734039306641,
|
|
"learning_rate": 7.165621699029615e-06,
|
|
"loss": 1.2641,
|
|
"mean_token_accuracy": 0.7042542099952698,
|
|
"step": 271
|
|
},
|
|
{
|
|
"epoch": 0.4437194127243067,
|
|
"grad_norm": 5.031182765960693,
|
|
"learning_rate": 7.1417597294767405e-06,
|
|
"loss": 1.0971,
|
|
"mean_token_accuracy": 0.7178630828857422,
|
|
"step": 272
|
|
},
|
|
{
|
|
"epoch": 0.4453507340946166,
|
|
"grad_norm": 4.948353290557861,
|
|
"learning_rate": 7.1178443889693694e-06,
|
|
"loss": 0.9821,
|
|
"mean_token_accuracy": 0.7603078484535217,
|
|
"step": 273
|
|
},
|
|
{
|
|
"epoch": 0.4469820554649266,
|
|
"grad_norm": 5.791008472442627,
|
|
"learning_rate": 7.0938764549565605e-06,
|
|
"loss": 1.3631,
|
|
"mean_token_accuracy": 0.6715368032455444,
|
|
"step": 274
|
|
},
|
|
{
|
|
"epoch": 0.44861337683523655,
|
|
"grad_norm": 5.602413654327393,
|
|
"learning_rate": 7.069856706597095e-06,
|
|
"loss": 1.4013,
|
|
"mean_token_accuracy": 0.6645569801330566,
|
|
"step": 275
|
|
},
|
|
{
|
|
"epoch": 0.45024469820554647,
|
|
"grad_norm": 4.785682201385498,
|
|
"learning_rate": 7.04578592473416e-06,
|
|
"loss": 1.2362,
|
|
"mean_token_accuracy": 0.6916077136993408,
|
|
"step": 276
|
|
},
|
|
{
|
|
"epoch": 0.45187601957585644,
|
|
"grad_norm": 4.269145488739014,
|
|
"learning_rate": 7.021664891869955e-06,
|
|
"loss": 1.1638,
|
|
"mean_token_accuracy": 0.7208147048950195,
|
|
"step": 277
|
|
},
|
|
{
|
|
"epoch": 0.4535073409461664,
|
|
"grad_norm": 5.598394870758057,
|
|
"learning_rate": 6.997494392140264e-06,
|
|
"loss": 1.449,
|
|
"mean_token_accuracy": 0.6509479880332947,
|
|
"step": 278
|
|
},
|
|
{
|
|
"epoch": 0.4551386623164764,
|
|
"grad_norm": 4.253592491149902,
|
|
"learning_rate": 6.973275211288953e-06,
|
|
"loss": 0.962,
|
|
"mean_token_accuracy": 0.7480490803718567,
|
|
"step": 279
|
|
},
|
|
{
|
|
"epoch": 0.4567699836867863,
|
|
"grad_norm": 5.125161647796631,
|
|
"learning_rate": 6.949008136642437e-06,
|
|
"loss": 1.3255,
|
|
"mean_token_accuracy": 0.6781437397003174,
|
|
"step": 280
|
|
},
|
|
{
|
|
"epoch": 0.45840130505709625,
|
|
"grad_norm": 5.201466083526611,
|
|
"learning_rate": 6.924693957084079e-06,
|
|
"loss": 1.3969,
|
|
"mean_token_accuracy": 0.6604675650596619,
|
|
"step": 281
|
|
},
|
|
{
|
|
"epoch": 0.4600326264274062,
|
|
"grad_norm": 6.443404674530029,
|
|
"learning_rate": 6.900333463028546e-06,
|
|
"loss": 1.4835,
|
|
"mean_token_accuracy": 0.6526094079017639,
|
|
"step": 282
|
|
},
|
|
{
|
|
"epoch": 0.46166394779771613,
|
|
"grad_norm": 5.083189964294434,
|
|
"learning_rate": 6.8759274463961145e-06,
|
|
"loss": 1.3969,
|
|
"mean_token_accuracy": 0.657814085483551,
|
|
"step": 283
|
|
},
|
|
{
|
|
"epoch": 0.4632952691680261,
|
|
"grad_norm": 5.082605838775635,
|
|
"learning_rate": 6.851476700586926e-06,
|
|
"loss": 1.1498,
|
|
"mean_token_accuracy": 0.7049723863601685,
|
|
"step": 284
|
|
},
|
|
{
|
|
"epoch": 0.46492659053833607,
|
|
"grad_norm": 5.190065860748291,
|
|
"learning_rate": 6.8269820204551985e-06,
|
|
"loss": 1.3005,
|
|
"mean_token_accuracy": 0.6958598494529724,
|
|
"step": 285
|
|
},
|
|
{
|
|
"epoch": 0.466557911908646,
|
|
"grad_norm": 5.341015338897705,
|
|
"learning_rate": 6.802444202283381e-06,
|
|
"loss": 1.3399,
|
|
"mean_token_accuracy": 0.6875981092453003,
|
|
"step": 286
|
|
},
|
|
{
|
|
"epoch": 0.46818923327895595,
|
|
"grad_norm": 5.5725226402282715,
|
|
"learning_rate": 6.777864043756268e-06,
|
|
"loss": 1.2856,
|
|
"mean_token_accuracy": 0.6699952483177185,
|
|
"step": 287
|
|
},
|
|
{
|
|
"epoch": 0.4698205546492659,
|
|
"grad_norm": 5.266445636749268,
|
|
"learning_rate": 6.7532423439350794e-06,
|
|
"loss": 1.4138,
|
|
"mean_token_accuracy": 0.6606606841087341,
|
|
"step": 288
|
|
},
|
|
{
|
|
"epoch": 0.47145187601957583,
|
|
"grad_norm": 4.3366780281066895,
|
|
"learning_rate": 6.728579903231463e-06,
|
|
"loss": 0.9495,
|
|
"mean_token_accuracy": 0.7561102509498596,
|
|
"step": 289
|
|
},
|
|
{
|
|
"epoch": 0.4730831973898858,
|
|
"grad_norm": 5.066049575805664,
|
|
"learning_rate": 6.703877523381495e-06,
|
|
"loss": 1.4154,
|
|
"mean_token_accuracy": 0.6551030874252319,
|
|
"step": 290
|
|
},
|
|
{
|
|
"epoch": 0.47471451876019577,
|
|
"grad_norm": 5.101365089416504,
|
|
"learning_rate": 6.679136007419607e-06,
|
|
"loss": 1.1613,
|
|
"mean_token_accuracy": 0.7094940543174744,
|
|
"step": 291
|
|
},
|
|
{
|
|
"epoch": 0.4763458401305057,
|
|
"grad_norm": 5.026820182800293,
|
|
"learning_rate": 6.654356159652483e-06,
|
|
"loss": 1.1103,
|
|
"mean_token_accuracy": 0.7332636117935181,
|
|
"step": 292
|
|
},
|
|
{
|
|
"epoch": 0.47797716150081565,
|
|
"grad_norm": 5.343873977661133,
|
|
"learning_rate": 6.629538785632912e-06,
|
|
"loss": 1.2417,
|
|
"mean_token_accuracy": 0.6913783550262451,
|
|
"step": 293
|
|
},
|
|
{
|
|
"epoch": 0.4796084828711256,
|
|
"grad_norm": 4.890134334564209,
|
|
"learning_rate": 6.604684692133597e-06,
|
|
"loss": 1.1577,
|
|
"mean_token_accuracy": 0.7185184955596924,
|
|
"step": 294
|
|
},
|
|
{
|
|
"epoch": 0.4812398042414356,
|
|
"grad_norm": 4.612360000610352,
|
|
"learning_rate": 6.579794687120938e-06,
|
|
"loss": 1.1759,
|
|
"mean_token_accuracy": 0.7133533954620361,
|
|
"step": 295
|
|
},
|
|
{
|
|
"epoch": 0.4828711256117455,
|
|
"grad_norm": 5.377026081085205,
|
|
"learning_rate": 6.554869579728753e-06,
|
|
"loss": 1.3571,
|
|
"mean_token_accuracy": 0.6833840012550354,
|
|
"step": 296
|
|
},
|
|
{
|
|
"epoch": 0.48450244698205547,
|
|
"grad_norm": 4.644443988800049,
|
|
"learning_rate": 6.5299101802319905e-06,
|
|
"loss": 1.2068,
|
|
"mean_token_accuracy": 0.678475558757782,
|
|
"step": 297
|
|
},
|
|
{
|
|
"epoch": 0.48613376835236544,
|
|
"grad_norm": 5.2673869132995605,
|
|
"learning_rate": 6.504917300020373e-06,
|
|
"loss": 1.2203,
|
|
"mean_token_accuracy": 0.7017102837562561,
|
|
"step": 298
|
|
},
|
|
{
|
|
"epoch": 0.48776508972267535,
|
|
"grad_norm": 4.513294219970703,
|
|
"learning_rate": 6.479891751572026e-06,
|
|
"loss": 1.0001,
|
|
"mean_token_accuracy": 0.7441986203193665,
|
|
"step": 299
|
|
},
|
|
{
|
|
"epoch": 0.4893964110929853,
|
|
"grad_norm": 5.3284454345703125,
|
|
"learning_rate": 6.454834348427077e-06,
|
|
"loss": 1.4106,
|
|
"mean_token_accuracy": 0.6686747074127197,
|
|
"step": 300
|
|
},
|
|
{
|
|
"epoch": 0.4910277324632953,
|
|
"grad_norm": 4.518412113189697,
|
|
"learning_rate": 6.429745905161183e-06,
|
|
"loss": 1.0715,
|
|
"mean_token_accuracy": 0.7324516773223877,
|
|
"step": 301
|
|
},
|
|
{
|
|
"epoch": 0.4926590538336052,
|
|
"grad_norm": 6.271851062774658,
|
|
"learning_rate": 6.404627237359078e-06,
|
|
"loss": 1.3864,
|
|
"mean_token_accuracy": 0.6706717610359192,
|
|
"step": 302
|
|
},
|
|
{
|
|
"epoch": 0.49429037520391517,
|
|
"grad_norm": 5.340519905090332,
|
|
"learning_rate": 6.379479161588039e-06,
|
|
"loss": 1.3695,
|
|
"mean_token_accuracy": 0.6908436417579651,
|
|
"step": 303
|
|
},
|
|
{
|
|
"epoch": 0.49592169657422513,
|
|
"grad_norm": 5.797330856323242,
|
|
"learning_rate": 6.354302495371352e-06,
|
|
"loss": 1.5499,
|
|
"mean_token_accuracy": 0.6514008641242981,
|
|
"step": 304
|
|
},
|
|
{
|
|
"epoch": 0.49755301794453505,
|
|
"grad_norm": 5.730583667755127,
|
|
"learning_rate": 6.329098057161731e-06,
|
|
"loss": 1.2407,
|
|
"mean_token_accuracy": 0.7070135474205017,
|
|
"step": 305
|
|
},
|
|
{
|
|
"epoch": 0.499184339314845,
|
|
"grad_norm": 5.447910308837891,
|
|
"learning_rate": 6.303866666314715e-06,
|
|
"loss": 1.2594,
|
|
"mean_token_accuracy": 0.6743515729904175,
|
|
"step": 306
|
|
},
|
|
{
|
|
"epoch": 0.5008156606851549,
|
|
"grad_norm": 4.888600826263428,
|
|
"learning_rate": 6.278609143062026e-06,
|
|
"loss": 1.4212,
|
|
"mean_token_accuracy": 0.6533401012420654,
|
|
"step": 307
|
|
},
|
|
{
|
|
"epoch": 0.5024469820554649,
|
|
"grad_norm": 5.023259162902832,
|
|
"learning_rate": 6.2533263084849095e-06,
|
|
"loss": 1.149,
|
|
"mean_token_accuracy": 0.7093348503112793,
|
|
"step": 308
|
|
},
|
|
{
|
|
"epoch": 0.5040783034257749,
|
|
"grad_norm": 5.078878879547119,
|
|
"learning_rate": 6.228018984487443e-06,
|
|
"loss": 1.4057,
|
|
"mean_token_accuracy": 0.6541189551353455,
|
|
"step": 309
|
|
},
|
|
{
|
|
"epoch": 0.5057096247960848,
|
|
"grad_norm": 5.9124298095703125,
|
|
"learning_rate": 6.202687993769811e-06,
|
|
"loss": 1.381,
|
|
"mean_token_accuracy": 0.6719298362731934,
|
|
"step": 310
|
|
},
|
|
{
|
|
"epoch": 0.5073409461663948,
|
|
"grad_norm": 4.667541980743408,
|
|
"learning_rate": 6.177334159801571e-06,
|
|
"loss": 1.2029,
|
|
"mean_token_accuracy": 0.703399121761322,
|
|
"step": 311
|
|
},
|
|
{
|
|
"epoch": 0.5089722675367048,
|
|
"grad_norm": 5.1174116134643555,
|
|
"learning_rate": 6.151958306794878e-06,
|
|
"loss": 1.2424,
|
|
"mean_token_accuracy": 0.6848514080047607,
|
|
"step": 312
|
|
},
|
|
{
|
|
"epoch": 0.5106035889070146,
|
|
"grad_norm": 4.411402702331543,
|
|
"learning_rate": 6.126561259677679e-06,
|
|
"loss": 1.0155,
|
|
"mean_token_accuracy": 0.7456547021865845,
|
|
"step": 313
|
|
},
|
|
{
|
|
"epoch": 0.5122349102773246,
|
|
"grad_norm": 5.111578464508057,
|
|
"learning_rate": 6.101143844066919e-06,
|
|
"loss": 1.5141,
|
|
"mean_token_accuracy": 0.6380020380020142,
|
|
"step": 314
|
|
},
|
|
{
|
|
"epoch": 0.5138662316476346,
|
|
"grad_norm": 4.498473167419434,
|
|
"learning_rate": 6.0757068862416855e-06,
|
|
"loss": 1.0826,
|
|
"mean_token_accuracy": 0.7336174845695496,
|
|
"step": 315
|
|
},
|
|
{
|
|
"epoch": 0.5154975530179445,
|
|
"grad_norm": 4.840219974517822,
|
|
"learning_rate": 6.050251213116356e-06,
|
|
"loss": 1.1874,
|
|
"mean_token_accuracy": 0.7011764645576477,
|
|
"step": 316
|
|
},
|
|
{
|
|
"epoch": 0.5171288743882545,
|
|
"grad_norm": 5.577286243438721,
|
|
"learning_rate": 6.024777652213702e-06,
|
|
"loss": 1.3661,
|
|
"mean_token_accuracy": 0.6853360533714294,
|
|
"step": 317
|
|
},
|
|
{
|
|
"epoch": 0.5187601957585645,
|
|
"grad_norm": 5.854229927062988,
|
|
"learning_rate": 5.9992870316380085e-06,
|
|
"loss": 1.3195,
|
|
"mean_token_accuracy": 0.6762208342552185,
|
|
"step": 318
|
|
},
|
|
{
|
|
"epoch": 0.5203915171288744,
|
|
"grad_norm": 4.742856502532959,
|
|
"learning_rate": 5.973780180048138e-06,
|
|
"loss": 1.3327,
|
|
"mean_token_accuracy": 0.6779661178588867,
|
|
"step": 319
|
|
},
|
|
{
|
|
"epoch": 0.5220228384991843,
|
|
"grad_norm": 4.710707187652588,
|
|
"learning_rate": 5.948257926630594e-06,
|
|
"loss": 1.2339,
|
|
"mean_token_accuracy": 0.6876561641693115,
|
|
"step": 320
|
|
},
|
|
{
|
|
"epoch": 0.5236541598694943,
|
|
"grad_norm": 4.941910266876221,
|
|
"learning_rate": 5.9227211010725774e-06,
|
|
"loss": 1.2255,
|
|
"mean_token_accuracy": 0.6985294222831726,
|
|
"step": 321
|
|
},
|
|
{
|
|
"epoch": 0.5252854812398042,
|
|
"grad_norm": 4.412341594696045,
|
|
"learning_rate": 5.897170533534997e-06,
|
|
"loss": 1.0061,
|
|
"mean_token_accuracy": 0.7371076345443726,
|
|
"step": 322
|
|
},
|
|
{
|
|
"epoch": 0.5269168026101142,
|
|
"grad_norm": 5.1144843101501465,
|
|
"learning_rate": 5.871607054625497e-06,
|
|
"loss": 1.2831,
|
|
"mean_token_accuracy": 0.6948955655097961,
|
|
"step": 323
|
|
},
|
|
{
|
|
"epoch": 0.5285481239804242,
|
|
"grad_norm": 4.727828025817871,
|
|
"learning_rate": 5.846031495371445e-06,
|
|
"loss": 1.1593,
|
|
"mean_token_accuracy": 0.716786801815033,
|
|
"step": 324
|
|
},
|
|
{
|
|
"epoch": 0.5301794453507341,
|
|
"grad_norm": 4.1832451820373535,
|
|
"learning_rate": 5.820444687192922e-06,
|
|
"loss": 0.8687,
|
|
"mean_token_accuracy": 0.7938080430030823,
|
|
"step": 325
|
|
},
|
|
{
|
|
"epoch": 0.531810766721044,
|
|
"grad_norm": 4.599882125854492,
|
|
"learning_rate": 5.794847461875699e-06,
|
|
"loss": 1.2684,
|
|
"mean_token_accuracy": 0.7069767713546753,
|
|
"step": 326
|
|
},
|
|
{
|
|
"epoch": 0.533442088091354,
|
|
"grad_norm": 4.93676233291626,
|
|
"learning_rate": 5.769240651544182e-06,
|
|
"loss": 1.3537,
|
|
"mean_token_accuracy": 0.6710861921310425,
|
|
"step": 327
|
|
},
|
|
{
|
|
"epoch": 0.5350734094616639,
|
|
"grad_norm": 5.627974510192871,
|
|
"learning_rate": 5.74362508863438e-06,
|
|
"loss": 1.1545,
|
|
"mean_token_accuracy": 0.6976369619369507,
|
|
"step": 328
|
|
},
|
|
{
|
|
"epoch": 0.5367047308319739,
|
|
"grad_norm": 4.828066349029541,
|
|
"learning_rate": 5.7180016058668255e-06,
|
|
"loss": 1.3031,
|
|
"mean_token_accuracy": 0.6644359230995178,
|
|
"step": 329
|
|
},
|
|
{
|
|
"epoch": 0.5383360522022839,
|
|
"grad_norm": 4.859476089477539,
|
|
"learning_rate": 5.692371036219517e-06,
|
|
"loss": 1.2398,
|
|
"mean_token_accuracy": 0.6936061382293701,
|
|
"step": 330
|
|
},
|
|
{
|
|
"epoch": 0.5399673735725938,
|
|
"grad_norm": 5.192332744598389,
|
|
"learning_rate": 5.666734212900838e-06,
|
|
"loss": 1.4352,
|
|
"mean_token_accuracy": 0.6518259048461914,
|
|
"step": 331
|
|
},
|
|
{
|
|
"epoch": 0.5415986949429038,
|
|
"grad_norm": 5.073202610015869,
|
|
"learning_rate": 5.641091969322462e-06,
|
|
"loss": 1.4968,
|
|
"mean_token_accuracy": 0.6290949583053589,
|
|
"step": 332
|
|
},
|
|
{
|
|
"epoch": 0.5432300163132137,
|
|
"grad_norm": 5.155499458312988,
|
|
"learning_rate": 5.615445139072276e-06,
|
|
"loss": 1.2214,
|
|
"mean_token_accuracy": 0.6994413137435913,
|
|
"step": 333
|
|
},
|
|
{
|
|
"epoch": 0.5448613376835236,
|
|
"grad_norm": 5.361922264099121,
|
|
"learning_rate": 5.589794555887261e-06,
|
|
"loss": 1.3211,
|
|
"mean_token_accuracy": 0.6952879428863525,
|
|
"step": 334
|
|
},
|
|
{
|
|
"epoch": 0.5464926590538336,
|
|
"grad_norm": 4.938032627105713,
|
|
"learning_rate": 5.564141053626412e-06,
|
|
"loss": 1.0671,
|
|
"mean_token_accuracy": 0.7365955114364624,
|
|
"step": 335
|
|
},
|
|
{
|
|
"epoch": 0.5481239804241436,
|
|
"grad_norm": 5.547269344329834,
|
|
"learning_rate": 5.538485466243609e-06,
|
|
"loss": 1.1093,
|
|
"mean_token_accuracy": 0.7243272662162781,
|
|
"step": 336
|
|
},
|
|
{
|
|
"epoch": 0.5497553017944535,
|
|
"grad_norm": 4.6895318031311035,
|
|
"learning_rate": 5.512828627760519e-06,
|
|
"loss": 1.1681,
|
|
"mean_token_accuracy": 0.721276581287384,
|
|
"step": 337
|
|
},
|
|
{
|
|
"epoch": 0.5513866231647635,
|
|
"grad_norm": 4.660412311553955,
|
|
"learning_rate": 5.487171372239484e-06,
|
|
"loss": 1.0067,
|
|
"mean_token_accuracy": 0.756860613822937,
|
|
"step": 338
|
|
},
|
|
{
|
|
"epoch": 0.5530179445350734,
|
|
"grad_norm": 4.920834541320801,
|
|
"learning_rate": 5.461514533756394e-06,
|
|
"loss": 1.1513,
|
|
"mean_token_accuracy": 0.7190653085708618,
|
|
"step": 339
|
|
},
|
|
{
|
|
"epoch": 0.5546492659053833,
|
|
"grad_norm": 4.938427925109863,
|
|
"learning_rate": 5.435858946373589e-06,
|
|
"loss": 1.2603,
|
|
"mean_token_accuracy": 0.6953125,
|
|
"step": 340
|
|
},
|
|
{
|
|
"epoch": 0.5562805872756933,
|
|
"grad_norm": 5.092044830322266,
|
|
"learning_rate": 5.410205444112739e-06,
|
|
"loss": 1.3949,
|
|
"mean_token_accuracy": 0.6499231457710266,
|
|
"step": 341
|
|
},
|
|
{
|
|
"epoch": 0.5579119086460033,
|
|
"grad_norm": 5.051768779754639,
|
|
"learning_rate": 5.384554860927727e-06,
|
|
"loss": 1.2452,
|
|
"mean_token_accuracy": 0.6954964399337769,
|
|
"step": 342
|
|
},
|
|
{
|
|
"epoch": 0.5595432300163132,
|
|
"grad_norm": 5.367892265319824,
|
|
"learning_rate": 5.35890803067754e-06,
|
|
"loss": 1.4017,
|
|
"mean_token_accuracy": 0.6810073256492615,
|
|
"step": 343
|
|
},
|
|
{
|
|
"epoch": 0.5611745513866232,
|
|
"grad_norm": 5.445290565490723,
|
|
"learning_rate": 5.333265787099165e-06,
|
|
"loss": 1.3892,
|
|
"mean_token_accuracy": 0.6558409929275513,
|
|
"step": 344
|
|
},
|
|
{
|
|
"epoch": 0.5628058727569332,
|
|
"grad_norm": 4.5011091232299805,
|
|
"learning_rate": 5.307628963780486e-06,
|
|
"loss": 1.134,
|
|
"mean_token_accuracy": 0.7164948582649231,
|
|
"step": 345
|
|
},
|
|
{
|
|
"epoch": 0.564437194127243,
|
|
"grad_norm": 4.943375587463379,
|
|
"learning_rate": 5.281998394133177e-06,
|
|
"loss": 1.2984,
|
|
"mean_token_accuracy": 0.6739248633384705,
|
|
"step": 346
|
|
},
|
|
{
|
|
"epoch": 0.566068515497553,
|
|
"grad_norm": 5.216898441314697,
|
|
"learning_rate": 5.256374911365621e-06,
|
|
"loss": 1.0943,
|
|
"mean_token_accuracy": 0.742380678653717,
|
|
"step": 347
|
|
},
|
|
{
|
|
"epoch": 0.567699836867863,
|
|
"grad_norm": 5.109920501708984,
|
|
"learning_rate": 5.2307593484558175e-06,
|
|
"loss": 1.2526,
|
|
"mean_token_accuracy": 0.7040214538574219,
|
|
"step": 348
|
|
},
|
|
{
|
|
"epoch": 0.5693311582381729,
|
|
"grad_norm": 5.156416416168213,
|
|
"learning_rate": 5.205152538124303e-06,
|
|
"loss": 1.3782,
|
|
"mean_token_accuracy": 0.6628924608230591,
|
|
"step": 349
|
|
},
|
|
{
|
|
"epoch": 0.5709624796084829,
|
|
"grad_norm": 4.267928600311279,
|
|
"learning_rate": 5.179555312807079e-06,
|
|
"loss": 1.0406,
|
|
"mean_token_accuracy": 0.7428425550460815,
|
|
"step": 350
|
|
},
|
|
{
|
|
"epoch": 0.5725938009787929,
|
|
"grad_norm": 4.501908302307129,
|
|
"learning_rate": 5.153968504628558e-06,
|
|
"loss": 1.177,
|
|
"mean_token_accuracy": 0.7172932624816895,
|
|
"step": 351
|
|
},
|
|
{
|
|
"epoch": 0.5742251223491027,
|
|
"grad_norm": 4.767745494842529,
|
|
"learning_rate": 5.1283929453745055e-06,
|
|
"loss": 1.0175,
|
|
"mean_token_accuracy": 0.7485062479972839,
|
|
"step": 352
|
|
},
|
|
{
|
|
"epoch": 0.5758564437194127,
|
|
"grad_norm": 4.706437110900879,
|
|
"learning_rate": 5.102829466465005e-06,
|
|
"loss": 1.4045,
|
|
"mean_token_accuracy": 0.6768350601196289,
|
|
"step": 353
|
|
},
|
|
{
|
|
"epoch": 0.5774877650897227,
|
|
"grad_norm": 4.489633083343506,
|
|
"learning_rate": 5.077278898927425e-06,
|
|
"loss": 1.1147,
|
|
"mean_token_accuracy": 0.7224782109260559,
|
|
"step": 354
|
|
},
|
|
{
|
|
"epoch": 0.5791190864600326,
|
|
"grad_norm": 5.124320030212402,
|
|
"learning_rate": 5.051742073369407e-06,
|
|
"loss": 1.3278,
|
|
"mean_token_accuracy": 0.6733261346817017,
|
|
"step": 355
|
|
},
|
|
{
|
|
"epoch": 0.5807504078303426,
|
|
"grad_norm": 4.827151298522949,
|
|
"learning_rate": 5.026219819951865e-06,
|
|
"loss": 1.0634,
|
|
"mean_token_accuracy": 0.7362812757492065,
|
|
"step": 356
|
|
},
|
|
{
|
|
"epoch": 0.5823817292006526,
|
|
"grad_norm": 4.903264045715332,
|
|
"learning_rate": 5.000712968361994e-06,
|
|
"loss": 1.2472,
|
|
"mean_token_accuracy": 0.6971870064735413,
|
|
"step": 357
|
|
},
|
|
{
|
|
"epoch": 0.5840130505709625,
|
|
"grad_norm": 5.2526421546936035,
|
|
"learning_rate": 4.975222347786299e-06,
|
|
"loss": 1.4272,
|
|
"mean_token_accuracy": 0.6600102186203003,
|
|
"step": 358
|
|
},
|
|
{
|
|
"epoch": 0.5856443719412724,
|
|
"grad_norm": 4.291179656982422,
|
|
"learning_rate": 4.949748786883647e-06,
|
|
"loss": 1.058,
|
|
"mean_token_accuracy": 0.7493276000022888,
|
|
"step": 359
|
|
},
|
|
{
|
|
"epoch": 0.5872756933115824,
|
|
"grad_norm": 5.2762041091918945,
|
|
"learning_rate": 4.924293113758314e-06,
|
|
"loss": 1.3768,
|
|
"mean_token_accuracy": 0.6687370538711548,
|
|
"step": 360
|
|
},
|
|
{
|
|
"epoch": 0.5889070146818923,
|
|
"grad_norm": 4.883618354797363,
|
|
"learning_rate": 4.898856155933084e-06,
|
|
"loss": 1.2404,
|
|
"mean_token_accuracy": 0.7043189406394958,
|
|
"step": 361
|
|
},
|
|
{
|
|
"epoch": 0.5905383360522023,
|
|
"grad_norm": 5.123283863067627,
|
|
"learning_rate": 4.873438740322325e-06,
|
|
"loss": 1.2315,
|
|
"mean_token_accuracy": 0.7245850563049316,
|
|
"step": 362
|
|
},
|
|
{
|
|
"epoch": 0.5921696574225123,
|
|
"grad_norm": 5.0717267990112305,
|
|
"learning_rate": 4.8480416932051255e-06,
|
|
"loss": 1.29,
|
|
"mean_token_accuracy": 0.6664901971817017,
|
|
"step": 363
|
|
},
|
|
{
|
|
"epoch": 0.5938009787928222,
|
|
"grad_norm": 5.272220611572266,
|
|
"learning_rate": 4.8226658401984295e-06,
|
|
"loss": 1.4004,
|
|
"mean_token_accuracy": 0.665610134601593,
|
|
"step": 364
|
|
},
|
|
{
|
|
"epoch": 0.5954323001631321,
|
|
"grad_norm": 4.845883846282959,
|
|
"learning_rate": 4.79731200623019e-06,
|
|
"loss": 1.2454,
|
|
"mean_token_accuracy": 0.695147693157196,
|
|
"step": 365
|
|
},
|
|
{
|
|
"epoch": 0.5970636215334421,
|
|
"grad_norm": 4.378237247467041,
|
|
"learning_rate": 4.771981015512559e-06,
|
|
"loss": 0.8819,
|
|
"mean_token_accuracy": 0.7768194079399109,
|
|
"step": 366
|
|
},
|
|
{
|
|
"epoch": 0.598694942903752,
|
|
"grad_norm": 5.140429973602295,
|
|
"learning_rate": 4.746673691515093e-06,
|
|
"loss": 1.2651,
|
|
"mean_token_accuracy": 0.6864282488822937,
|
|
"step": 367
|
|
},
|
|
{
|
|
"epoch": 0.600326264274062,
|
|
"grad_norm": 5.289265155792236,
|
|
"learning_rate": 4.721390856937976e-06,
|
|
"loss": 1.1395,
|
|
"mean_token_accuracy": 0.7040935754776001,
|
|
"step": 368
|
|
},
|
|
{
|
|
"epoch": 0.601957585644372,
|
|
"grad_norm": 5.229256629943848,
|
|
"learning_rate": 4.696133333685286e-06,
|
|
"loss": 1.2456,
|
|
"mean_token_accuracy": 0.6875337362289429,
|
|
"step": 369
|
|
},
|
|
{
|
|
"epoch": 0.6035889070146819,
|
|
"grad_norm": 5.632972240447998,
|
|
"learning_rate": 4.67090194283827e-06,
|
|
"loss": 1.3961,
|
|
"mean_token_accuracy": 0.670738160610199,
|
|
"step": 370
|
|
},
|
|
{
|
|
"epoch": 0.6052202283849919,
|
|
"grad_norm": 4.017366409301758,
|
|
"learning_rate": 4.645697504628649e-06,
|
|
"loss": 0.9787,
|
|
"mean_token_accuracy": 0.7547547817230225,
|
|
"step": 371
|
|
},
|
|
{
|
|
"epoch": 0.6068515497553018,
|
|
"grad_norm": 5.327452182769775,
|
|
"learning_rate": 4.6205208384119626e-06,
|
|
"loss": 1.201,
|
|
"mean_token_accuracy": 0.7167056202888489,
|
|
"step": 372
|
|
},
|
|
{
|
|
"epoch": 0.6084828711256117,
|
|
"grad_norm": 5.935764312744141,
|
|
"learning_rate": 4.595372762640924e-06,
|
|
"loss": 1.6929,
|
|
"mean_token_accuracy": 0.598901093006134,
|
|
"step": 373
|
|
},
|
|
{
|
|
"epoch": 0.6101141924959217,
|
|
"grad_norm": 4.699948310852051,
|
|
"learning_rate": 4.57025409483882e-06,
|
|
"loss": 1.0746,
|
|
"mean_token_accuracy": 0.753125011920929,
|
|
"step": 374
|
|
},
|
|
{
|
|
"epoch": 0.6117455138662317,
|
|
"grad_norm": 4.765663146972656,
|
|
"learning_rate": 4.545165651572926e-06,
|
|
"loss": 1.2652,
|
|
"mean_token_accuracy": 0.707344651222229,
|
|
"step": 375
|
|
},
|
|
{
|
|
"epoch": 0.6133768352365416,
|
|
"grad_norm": 5.047967433929443,
|
|
"learning_rate": 4.520108248427975e-06,
|
|
"loss": 1.3292,
|
|
"mean_token_accuracy": 0.67514967918396,
|
|
"step": 376
|
|
},
|
|
{
|
|
"epoch": 0.6150081566068516,
|
|
"grad_norm": 4.735743045806885,
|
|
"learning_rate": 4.49508269997963e-06,
|
|
"loss": 1.2413,
|
|
"mean_token_accuracy": 0.6973969340324402,
|
|
"step": 377
|
|
},
|
|
{
|
|
"epoch": 0.6166394779771615,
|
|
"grad_norm": 4.652902603149414,
|
|
"learning_rate": 4.470089819768011e-06,
|
|
"loss": 1.088,
|
|
"mean_token_accuracy": 0.7307506203651428,
|
|
"step": 378
|
|
},
|
|
{
|
|
"epoch": 0.6182707993474714,
|
|
"grad_norm": 5.012315273284912,
|
|
"learning_rate": 4.4451304202712486e-06,
|
|
"loss": 1.1939,
|
|
"mean_token_accuracy": 0.6982803344726562,
|
|
"step": 379
|
|
},
|
|
{
|
|
"epoch": 0.6199021207177814,
|
|
"grad_norm": 5.312098026275635,
|
|
"learning_rate": 4.420205312879065e-06,
|
|
"loss": 1.1707,
|
|
"mean_token_accuracy": 0.7069069147109985,
|
|
"step": 380
|
|
},
|
|
{
|
|
"epoch": 0.6215334420880914,
|
|
"grad_norm": 5.2296462059021,
|
|
"learning_rate": 4.395315307866404e-06,
|
|
"loss": 1.4177,
|
|
"mean_token_accuracy": 0.6581963300704956,
|
|
"step": 381
|
|
},
|
|
{
|
|
"epoch": 0.6231647634584013,
|
|
"grad_norm": 5.124307155609131,
|
|
"learning_rate": 4.37046121436709e-06,
|
|
"loss": 1.4677,
|
|
"mean_token_accuracy": 0.6682761907577515,
|
|
"step": 382
|
|
},
|
|
{
|
|
"epoch": 0.6247960848287113,
|
|
"grad_norm": 5.106579303741455,
|
|
"learning_rate": 4.3456438403475174e-06,
|
|
"loss": 1.3623,
|
|
"mean_token_accuracy": 0.670346200466156,
|
|
"step": 383
|
|
},
|
|
{
|
|
"epoch": 0.6264274061990212,
|
|
"grad_norm": 5.028356552124023,
|
|
"learning_rate": 4.320863992580393e-06,
|
|
"loss": 1.2408,
|
|
"mean_token_accuracy": 0.6920192837715149,
|
|
"step": 384
|
|
},
|
|
{
|
|
"epoch": 0.6280587275693311,
|
|
"grad_norm": 4.444116115570068,
|
|
"learning_rate": 4.296122476618507e-06,
|
|
"loss": 1.1429,
|
|
"mean_token_accuracy": 0.704580545425415,
|
|
"step": 385
|
|
},
|
|
{
|
|
"epoch": 0.6296900489396411,
|
|
"grad_norm": 4.459901809692383,
|
|
"learning_rate": 4.2714200967685405e-06,
|
|
"loss": 1.1629,
|
|
"mean_token_accuracy": 0.72246915102005,
|
|
"step": 386
|
|
},
|
|
{
|
|
"epoch": 0.6313213703099511,
|
|
"grad_norm": 6.285675525665283,
|
|
"learning_rate": 4.246757656064924e-06,
|
|
"loss": 1.4322,
|
|
"mean_token_accuracy": 0.6744464635848999,
|
|
"step": 387
|
|
},
|
|
{
|
|
"epoch": 0.632952691680261,
|
|
"grad_norm": 4.624763011932373,
|
|
"learning_rate": 4.222135956243732e-06,
|
|
"loss": 1.2574,
|
|
"mean_token_accuracy": 0.681078314781189,
|
|
"step": 388
|
|
},
|
|
{
|
|
"epoch": 0.634584013050571,
|
|
"grad_norm": 4.69317626953125,
|
|
"learning_rate": 4.19755579771662e-06,
|
|
"loss": 0.912,
|
|
"mean_token_accuracy": 0.7686527371406555,
|
|
"step": 389
|
|
},
|
|
{
|
|
"epoch": 0.636215334420881,
|
|
"grad_norm": 5.052898406982422,
|
|
"learning_rate": 4.173017979544804e-06,
|
|
"loss": 1.2663,
|
|
"mean_token_accuracy": 0.6840921640396118,
|
|
"step": 390
|
|
},
|
|
{
|
|
"epoch": 0.6378466557911908,
|
|
"grad_norm": 4.830198287963867,
|
|
"learning_rate": 4.148523299413075e-06,
|
|
"loss": 1.1944,
|
|
"mean_token_accuracy": 0.7132551670074463,
|
|
"step": 391
|
|
},
|
|
{
|
|
"epoch": 0.6394779771615008,
|
|
"grad_norm": 4.418455600738525,
|
|
"learning_rate": 4.124072553603887e-06,
|
|
"loss": 1.1429,
|
|
"mean_token_accuracy": 0.720703125,
|
|
"step": 392
|
|
},
|
|
{
|
|
"epoch": 0.6411092985318108,
|
|
"grad_norm": 5.233757019042969,
|
|
"learning_rate": 4.099666536971456e-06,
|
|
"loss": 1.4604,
|
|
"mean_token_accuracy": 0.6442708373069763,
|
|
"step": 393
|
|
},
|
|
{
|
|
"epoch": 0.6427406199021207,
|
|
"grad_norm": 5.798137187957764,
|
|
"learning_rate": 4.075306042915922e-06,
|
|
"loss": 1.4581,
|
|
"mean_token_accuracy": 0.6500260233879089,
|
|
"step": 394
|
|
},
|
|
{
|
|
"epoch": 0.6443719412724307,
|
|
"grad_norm": 5.040640354156494,
|
|
"learning_rate": 4.050991863357564e-06,
|
|
"loss": 1.0962,
|
|
"mean_token_accuracy": 0.7073915600776672,
|
|
"step": 395
|
|
},
|
|
{
|
|
"epoch": 0.6460032626427407,
|
|
"grad_norm": 4.5930352210998535,
|
|
"learning_rate": 4.026724788711047e-06,
|
|
"loss": 1.1013,
|
|
"mean_token_accuracy": 0.7120794057846069,
|
|
"step": 396
|
|
},
|
|
{
|
|
"epoch": 0.6476345840130505,
|
|
"grad_norm": 4.828030586242676,
|
|
"learning_rate": 4.002505607859738e-06,
|
|
"loss": 1.1984,
|
|
"mean_token_accuracy": 0.7033898234367371,
|
|
"step": 397
|
|
},
|
|
{
|
|
"epoch": 0.6492659053833605,
|
|
"grad_norm": 4.7295331954956055,
|
|
"learning_rate": 3.978335108130047e-06,
|
|
"loss": 1.0876,
|
|
"mean_token_accuracy": 0.7375543713569641,
|
|
"step": 398
|
|
},
|
|
{
|
|
"epoch": 0.6508972267536705,
|
|
"grad_norm": 5.275457859039307,
|
|
"learning_rate": 3.954214075265842e-06,
|
|
"loss": 1.2306,
|
|
"mean_token_accuracy": 0.7018927335739136,
|
|
"step": 399
|
|
},
|
|
{
|
|
"epoch": 0.6525285481239804,
|
|
"grad_norm": 5.105504989624023,
|
|
"learning_rate": 3.930143293402907e-06,
|
|
"loss": 1.3803,
|
|
"mean_token_accuracy": 0.637172520160675,
|
|
"step": 400
|
|
},
|
|
{
|
|
"epoch": 0.6541598694942904,
|
|
"grad_norm": 4.749675750732422,
|
|
"learning_rate": 3.906123545043441e-06,
|
|
"loss": 1.1234,
|
|
"mean_token_accuracy": 0.7189365029335022,
|
|
"step": 401
|
|
},
|
|
{
|
|
"epoch": 0.6557911908646004,
|
|
"grad_norm": 4.926899433135986,
|
|
"learning_rate": 3.882155611030631e-06,
|
|
"loss": 1.2681,
|
|
"mean_token_accuracy": 0.6934037208557129,
|
|
"step": 402
|
|
},
|
|
{
|
|
"epoch": 0.6574225122349103,
|
|
"grad_norm": 5.266970157623291,
|
|
"learning_rate": 3.858240270523262e-06,
|
|
"loss": 1.3901,
|
|
"mean_token_accuracy": 0.6708167195320129,
|
|
"step": 403
|
|
},
|
|
{
|
|
"epoch": 0.6590538336052202,
|
|
"grad_norm": 4.678843975067139,
|
|
"learning_rate": 3.834378300970385e-06,
|
|
"loss": 1.096,
|
|
"mean_token_accuracy": 0.7200217247009277,
|
|
"step": 404
|
|
},
|
|
{
|
|
"epoch": 0.6606851549755302,
|
|
"grad_norm": 5.216601371765137,
|
|
"learning_rate": 3.8105704780860575e-06,
|
|
"loss": 1.515,
|
|
"mean_token_accuracy": 0.6313887238502502,
|
|
"step": 405
|
|
},
|
|
{
|
|
"epoch": 0.6623164763458401,
|
|
"grad_norm": 5.138411045074463,
|
|
"learning_rate": 3.7868175758241065e-06,
|
|
"loss": 1.2448,
|
|
"mean_token_accuracy": 0.7139689326286316,
|
|
"step": 406
|
|
},
|
|
{
|
|
"epoch": 0.6639477977161501,
|
|
"grad_norm": 4.843050479888916,
|
|
"learning_rate": 3.7631203663529823e-06,
|
|
"loss": 1.3766,
|
|
"mean_token_accuracy": 0.6812297701835632,
|
|
"step": 407
|
|
},
|
|
{
|
|
"epoch": 0.6655791190864601,
|
|
"grad_norm": 4.814765453338623,
|
|
"learning_rate": 3.739479620030655e-06,
|
|
"loss": 1.0831,
|
|
"mean_token_accuracy": 0.7297152280807495,
|
|
"step": 408
|
|
},
|
|
{
|
|
"epoch": 0.66721044045677,
|
|
"grad_norm": 4.954052448272705,
|
|
"learning_rate": 3.715896105379562e-06,
|
|
"loss": 1.2928,
|
|
"mean_token_accuracy": 0.6796213388442993,
|
|
"step": 409
|
|
},
|
|
{
|
|
"epoch": 0.6688417618270799,
|
|
"grad_norm": 4.973556995391846,
|
|
"learning_rate": 3.692370589061639e-06,
|
|
"loss": 1.203,
|
|
"mean_token_accuracy": 0.7126886248588562,
|
|
"step": 410
|
|
},
|
|
{
|
|
"epoch": 0.6704730831973899,
|
|
"grad_norm": 4.508687973022461,
|
|
"learning_rate": 3.668903835853386e-06,
|
|
"loss": 1.0417,
|
|
"mean_token_accuracy": 0.7396226525306702,
|
|
"step": 411
|
|
},
|
|
{
|
|
"epoch": 0.6721044045676998,
|
|
"grad_norm": 4.325466632843018,
|
|
"learning_rate": 3.64549660862101e-06,
|
|
"loss": 1.0965,
|
|
"mean_token_accuracy": 0.7506775259971619,
|
|
"step": 412
|
|
},
|
|
{
|
|
"epoch": 0.6737357259380098,
|
|
"grad_norm": 4.78257417678833,
|
|
"learning_rate": 3.6221496682956236e-06,
|
|
"loss": 1.2328,
|
|
"mean_token_accuracy": 0.6968838572502136,
|
|
"step": 413
|
|
},
|
|
{
|
|
"epoch": 0.6753670473083198,
|
|
"grad_norm": 5.217673301696777,
|
|
"learning_rate": 3.5988637738485146e-06,
|
|
"loss": 1.1468,
|
|
"mean_token_accuracy": 0.7180641293525696,
|
|
"step": 414
|
|
},
|
|
{
|
|
"epoch": 0.6769983686786297,
|
|
"grad_norm": 5.608780384063721,
|
|
"learning_rate": 3.5756396822664595e-06,
|
|
"loss": 1.4427,
|
|
"mean_token_accuracy": 0.6482036113739014,
|
|
"step": 415
|
|
},
|
|
{
|
|
"epoch": 0.6786296900489397,
|
|
"grad_norm": 4.913776397705078,
|
|
"learning_rate": 3.5524781485271287e-06,
|
|
"loss": 1.3126,
|
|
"mean_token_accuracy": 0.703459620475769,
|
|
"step": 416
|
|
},
|
|
{
|
|
"epoch": 0.6802610114192496,
|
|
"grad_norm": 4.990585803985596,
|
|
"learning_rate": 3.5293799255745407e-06,
|
|
"loss": 1.425,
|
|
"mean_token_accuracy": 0.6552053689956665,
|
|
"step": 417
|
|
},
|
|
{
|
|
"epoch": 0.6818923327895595,
|
|
"grad_norm": 5.035621643066406,
|
|
"learning_rate": 3.5063457642945788e-06,
|
|
"loss": 1.3351,
|
|
"mean_token_accuracy": 0.6864407062530518,
|
|
"step": 418
|
|
},
|
|
{
|
|
"epoch": 0.6835236541598695,
|
|
"grad_norm": 5.281700134277344,
|
|
"learning_rate": 3.4833764134905835e-06,
|
|
"loss": 1.2133,
|
|
"mean_token_accuracy": 0.6881720423698425,
|
|
"step": 419
|
|
},
|
|
{
|
|
"epoch": 0.6851549755301795,
|
|
"grad_norm": 4.842787742614746,
|
|
"learning_rate": 3.4604726198590177e-06,
|
|
"loss": 1.1954,
|
|
"mean_token_accuracy": 0.7155085802078247,
|
|
"step": 420
|
|
},
|
|
{
|
|
"epoch": 0.6867862969004894,
|
|
"grad_norm": 4.937472820281982,
|
|
"learning_rate": 3.4376351279651788e-06,
|
|
"loss": 1.3095,
|
|
"mean_token_accuracy": 0.6968302726745605,
|
|
"step": 421
|
|
},
|
|
{
|
|
"epoch": 0.6884176182707994,
|
|
"grad_norm": 4.842049598693848,
|
|
"learning_rate": 3.4148646802190066e-06,
|
|
"loss": 1.0614,
|
|
"mean_token_accuracy": 0.7444320917129517,
|
|
"step": 422
|
|
},
|
|
{
|
|
"epoch": 0.6900489396411092,
|
|
"grad_norm": 4.4336628913879395,
|
|
"learning_rate": 3.392162016850945e-06,
|
|
"loss": 1.0914,
|
|
"mean_token_accuracy": 0.729891300201416,
|
|
"step": 423
|
|
},
|
|
{
|
|
"epoch": 0.6916802610114192,
|
|
"grad_norm": 5.191675186157227,
|
|
"learning_rate": 3.369527875887875e-06,
|
|
"loss": 1.2101,
|
|
"mean_token_accuracy": 0.7204244136810303,
|
|
"step": 424
|
|
},
|
|
{
|
|
"epoch": 0.6933115823817292,
|
|
"grad_norm": 5.435412406921387,
|
|
"learning_rate": 3.346962993129125e-06,
|
|
"loss": 1.2044,
|
|
"mean_token_accuracy": 0.7158119678497314,
|
|
"step": 425
|
|
},
|
|
{
|
|
"epoch": 0.6949429037520392,
|
|
"grad_norm": 4.86824369430542,
|
|
"learning_rate": 3.3244681021225506e-06,
|
|
"loss": 1.1128,
|
|
"mean_token_accuracy": 0.7002801299095154,
|
|
"step": 426
|
|
},
|
|
{
|
|
"epoch": 0.6965742251223491,
|
|
"grad_norm": 4.692442417144775,
|
|
"learning_rate": 3.302043934140693e-06,
|
|
"loss": 1.247,
|
|
"mean_token_accuracy": 0.683964729309082,
|
|
"step": 427
|
|
},
|
|
{
|
|
"epoch": 0.6982055464926591,
|
|
"grad_norm": 4.847585201263428,
|
|
"learning_rate": 3.279691218156998e-06,
|
|
"loss": 1.2886,
|
|
"mean_token_accuracy": 0.6823869347572327,
|
|
"step": 428
|
|
},
|
|
{
|
|
"epoch": 0.6998368678629691,
|
|
"grad_norm": 4.947258472442627,
|
|
"learning_rate": 3.2574106808221206e-06,
|
|
"loss": 1.1626,
|
|
"mean_token_accuracy": 0.7202127575874329,
|
|
"step": 429
|
|
},
|
|
{
|
|
"epoch": 0.7014681892332789,
|
|
"grad_norm": 4.548014163970947,
|
|
"learning_rate": 3.2352030464403117e-06,
|
|
"loss": 1.1406,
|
|
"mean_token_accuracy": 0.7432366013526917,
|
|
"step": 430
|
|
},
|
|
{
|
|
"epoch": 0.7030995106035889,
|
|
"grad_norm": 4.8469743728637695,
|
|
"learning_rate": 3.2130690369458594e-06,
|
|
"loss": 1.2848,
|
|
"mean_token_accuracy": 0.6986584067344666,
|
|
"step": 431
|
|
},
|
|
{
|
|
"epoch": 0.7047308319738989,
|
|
"grad_norm": 4.121768474578857,
|
|
"learning_rate": 3.191009371879627e-06,
|
|
"loss": 0.9523,
|
|
"mean_token_accuracy": 0.7665964365005493,
|
|
"step": 432
|
|
},
|
|
{
|
|
"epoch": 0.7063621533442088,
|
|
"grad_norm": 4.720678329467773,
|
|
"learning_rate": 3.1690247683656617e-06,
|
|
"loss": 1.2706,
|
|
"mean_token_accuracy": 0.6949771642684937,
|
|
"step": 433
|
|
},
|
|
{
|
|
"epoch": 0.7079934747145188,
|
|
"grad_norm": 4.939698219299316,
|
|
"learning_rate": 3.1471159410878784e-06,
|
|
"loss": 1.3505,
|
|
"mean_token_accuracy": 0.6539000272750854,
|
|
"step": 434
|
|
},
|
|
{
|
|
"epoch": 0.7096247960848288,
|
|
"grad_norm": 4.132518291473389,
|
|
"learning_rate": 3.125283602266832e-06,
|
|
"loss": 0.9859,
|
|
"mean_token_accuracy": 0.7509416341781616,
|
|
"step": 435
|
|
},
|
|
{
|
|
"epoch": 0.7112561174551386,
|
|
"grad_norm": 5.197821617126465,
|
|
"learning_rate": 3.1035284616365586e-06,
|
|
"loss": 1.1386,
|
|
"mean_token_accuracy": 0.714631199836731,
|
|
"step": 436
|
|
},
|
|
{
|
|
"epoch": 0.7128874388254486,
|
|
"grad_norm": 4.722414016723633,
|
|
"learning_rate": 3.0818512264215107e-06,
|
|
"loss": 0.9581,
|
|
"mean_token_accuracy": 0.7724301815032959,
|
|
"step": 437
|
|
},
|
|
{
|
|
"epoch": 0.7145187601957586,
|
|
"grad_norm": 4.7281599044799805,
|
|
"learning_rate": 3.060252601313557e-06,
|
|
"loss": 1.1291,
|
|
"mean_token_accuracy": 0.717391312122345,
|
|
"step": 438
|
|
},
|
|
{
|
|
"epoch": 0.7161500815660685,
|
|
"grad_norm": 4.817330360412598,
|
|
"learning_rate": 3.0387332884490806e-06,
|
|
"loss": 1.1184,
|
|
"mean_token_accuracy": 0.725653886795044,
|
|
"step": 439
|
|
},
|
|
{
|
|
"epoch": 0.7177814029363785,
|
|
"grad_norm": 4.662072658538818,
|
|
"learning_rate": 3.0172939873861486e-06,
|
|
"loss": 1.1475,
|
|
"mean_token_accuracy": 0.7279778122901917,
|
|
"step": 440
|
|
},
|
|
{
|
|
"epoch": 0.7194127243066885,
|
|
"grad_norm": 4.278316020965576,
|
|
"learning_rate": 2.995935395081781e-06,
|
|
"loss": 0.9249,
|
|
"mean_token_accuracy": 0.7603439092636108,
|
|
"step": 441
|
|
},
|
|
{
|
|
"epoch": 0.7210440456769984,
|
|
"grad_norm": 4.619575500488281,
|
|
"learning_rate": 2.9746582058692803e-06,
|
|
"loss": 1.0338,
|
|
"mean_token_accuracy": 0.7423180341720581,
|
|
"step": 442
|
|
},
|
|
{
|
|
"epoch": 0.7226753670473083,
|
|
"grad_norm": 4.950908660888672,
|
|
"learning_rate": 2.953463111435666e-06,
|
|
"loss": 1.1649,
|
|
"mean_token_accuracy": 0.7079599499702454,
|
|
"step": 443
|
|
},
|
|
{
|
|
"epoch": 0.7243066884176182,
|
|
"grad_norm": 5.330234050750732,
|
|
"learning_rate": 2.932350800799196e-06,
|
|
"loss": 1.308,
|
|
"mean_token_accuracy": 0.6914836764335632,
|
|
"step": 444
|
|
},
|
|
{
|
|
"epoch": 0.7259380097879282,
|
|
"grad_norm": 5.278100490570068,
|
|
"learning_rate": 2.9113219602869515e-06,
|
|
"loss": 1.5142,
|
|
"mean_token_accuracy": 0.6575073599815369,
|
|
"step": 445
|
|
},
|
|
{
|
|
"epoch": 0.7275693311582382,
|
|
"grad_norm": 4.93350076675415,
|
|
"learning_rate": 2.890377273512538e-06,
|
|
"loss": 1.3363,
|
|
"mean_token_accuracy": 0.6751728057861328,
|
|
"step": 446
|
|
},
|
|
{
|
|
"epoch": 0.7292006525285482,
|
|
"grad_norm": 4.3188910484313965,
|
|
"learning_rate": 2.8695174213538647e-06,
|
|
"loss": 1.0682,
|
|
"mean_token_accuracy": 0.7265364527702332,
|
|
"step": 447
|
|
},
|
|
{
|
|
"epoch": 0.7308319738988581,
|
|
"grad_norm": 4.598663806915283,
|
|
"learning_rate": 2.848743081930998e-06,
|
|
"loss": 1.1568,
|
|
"mean_token_accuracy": 0.7006726264953613,
|
|
"step": 448
|
|
},
|
|
{
|
|
"epoch": 0.732463295269168,
|
|
"grad_norm": 5.178636074066162,
|
|
"learning_rate": 2.8280549305841265e-06,
|
|
"loss": 1.2928,
|
|
"mean_token_accuracy": 0.6874651908874512,
|
|
"step": 449
|
|
},
|
|
{
|
|
"epoch": 0.734094616639478,
|
|
"grad_norm": 5.297123908996582,
|
|
"learning_rate": 2.8074536398516004e-06,
|
|
"loss": 1.2612,
|
|
"mean_token_accuracy": 0.6888131499290466,
|
|
"step": 450
|
|
},
|
|
{
|
|
"epoch": 0.7357259380097879,
|
|
"grad_norm": 5.056674957275391,
|
|
"learning_rate": 2.7869398794480778e-06,
|
|
"loss": 1.1595,
|
|
"mean_token_accuracy": 0.7092235684394836,
|
|
"step": 451
|
|
},
|
|
{
|
|
"epoch": 0.7373572593800979,
|
|
"grad_norm": 4.8112030029296875,
|
|
"learning_rate": 2.7665143162427427e-06,
|
|
"loss": 1.2288,
|
|
"mean_token_accuracy": 0.7047522664070129,
|
|
"step": 452
|
|
},
|
|
{
|
|
"epoch": 0.7389885807504079,
|
|
"grad_norm": 4.844231605529785,
|
|
"learning_rate": 2.746177614237631e-06,
|
|
"loss": 1.3594,
|
|
"mean_token_accuracy": 0.6892874240875244,
|
|
"step": 453
|
|
},
|
|
{
|
|
"epoch": 0.7406199021207178,
|
|
"grad_norm": 5.323098659515381,
|
|
"learning_rate": 2.7259304345460445e-06,
|
|
"loss": 1.4409,
|
|
"mean_token_accuracy": 0.6324736475944519,
|
|
"step": 454
|
|
},
|
|
{
|
|
"epoch": 0.7422512234910277,
|
|
"grad_norm": 4.965455532073975,
|
|
"learning_rate": 2.7057734353710655e-06,
|
|
"loss": 1.2032,
|
|
"mean_token_accuracy": 0.6982530355453491,
|
|
"step": 455
|
|
},
|
|
{
|
|
"epoch": 0.7438825448613376,
|
|
"grad_norm": 4.611636161804199,
|
|
"learning_rate": 2.6857072719841436e-06,
|
|
"loss": 1.0921,
|
|
"mean_token_accuracy": 0.7258726954460144,
|
|
"step": 456
|
|
},
|
|
{
|
|
"epoch": 0.7455138662316476,
|
|
"grad_norm": 5.415761470794678,
|
|
"learning_rate": 2.6657325967038084e-06,
|
|
"loss": 1.4882,
|
|
"mean_token_accuracy": 0.6622621417045593,
|
|
"step": 457
|
|
},
|
|
{
|
|
"epoch": 0.7471451876019576,
|
|
"grad_norm": 5.130191326141357,
|
|
"learning_rate": 2.645850058874463e-06,
|
|
"loss": 1.2448,
|
|
"mean_token_accuracy": 0.6971870064735413,
|
|
"step": 458
|
|
},
|
|
{
|
|
"epoch": 0.7487765089722676,
|
|
"grad_norm": 4.7735748291015625,
|
|
"learning_rate": 2.6260603048452636e-06,
|
|
"loss": 1.2079,
|
|
"mean_token_accuracy": 0.7042531967163086,
|
|
"step": 459
|
|
},
|
|
{
|
|
"epoch": 0.7504078303425775,
|
|
"grad_norm": 4.764122486114502,
|
|
"learning_rate": 2.6063639779491197e-06,
|
|
"loss": 1.3132,
|
|
"mean_token_accuracy": 0.677205502986908,
|
|
"step": 460
|
|
},
|
|
{
|
|
"epoch": 0.7520391517128875,
|
|
"grad_norm": 4.8977556228637695,
|
|
"learning_rate": 2.586761718481776e-06,
|
|
"loss": 1.0483,
|
|
"mean_token_accuracy": 0.7458379864692688,
|
|
"step": 461
|
|
},
|
|
{
|
|
"epoch": 0.7536704730831973,
|
|
"grad_norm": 5.250521183013916,
|
|
"learning_rate": 2.5672541636809957e-06,
|
|
"loss": 1.3854,
|
|
"mean_token_accuracy": 0.6714285612106323,
|
|
"step": 462
|
|
},
|
|
{
|
|
"epoch": 0.7553017944535073,
|
|
"grad_norm": 4.352292537689209,
|
|
"learning_rate": 2.5478419477058446e-06,
|
|
"loss": 1.2105,
|
|
"mean_token_accuracy": 0.714142918586731,
|
|
"step": 463
|
|
},
|
|
{
|
|
"epoch": 0.7569331158238173,
|
|
"grad_norm": 4.649628162384033,
|
|
"learning_rate": 2.52852570161608e-06,
|
|
"loss": 1.1386,
|
|
"mean_token_accuracy": 0.721030056476593,
|
|
"step": 464
|
|
},
|
|
{
|
|
"epoch": 0.7585644371941273,
|
|
"grad_norm": 5.159845352172852,
|
|
"learning_rate": 2.5093060533516357e-06,
|
|
"loss": 1.0597,
|
|
"mean_token_accuracy": 0.7296990156173706,
|
|
"step": 465
|
|
},
|
|
{
|
|
"epoch": 0.7601957585644372,
|
|
"grad_norm": 4.948349475860596,
|
|
"learning_rate": 2.4901836277122e-06,
|
|
"loss": 1.2113,
|
|
"mean_token_accuracy": 0.6993117928504944,
|
|
"step": 466
|
|
},
|
|
{
|
|
"epoch": 0.7618270799347472,
|
|
"grad_norm": 4.682156085968018,
|
|
"learning_rate": 2.4711590463369163e-06,
|
|
"loss": 1.1495,
|
|
"mean_token_accuracy": 0.7079691290855408,
|
|
"step": 467
|
|
},
|
|
{
|
|
"epoch": 0.763458401305057,
|
|
"grad_norm": 4.9600830078125,
|
|
"learning_rate": 2.4522329276841664e-06,
|
|
"loss": 1.2248,
|
|
"mean_token_accuracy": 0.7208632826805115,
|
|
"step": 468
|
|
},
|
|
{
|
|
"epoch": 0.765089722675367,
|
|
"grad_norm": 5.011682033538818,
|
|
"learning_rate": 2.4334058870114685e-06,
|
|
"loss": 1.2514,
|
|
"mean_token_accuracy": 0.690378725528717,
|
|
"step": 469
|
|
},
|
|
{
|
|
"epoch": 0.766721044045677,
|
|
"grad_norm": 6.021939754486084,
|
|
"learning_rate": 2.414678536355476e-06,
|
|
"loss": 1.1848,
|
|
"mean_token_accuracy": 0.7004357576370239,
|
|
"step": 470
|
|
},
|
|
{
|
|
"epoch": 0.768352365415987,
|
|
"grad_norm": 5.621747970581055,
|
|
"learning_rate": 2.3960514845120835e-06,
|
|
"loss": 1.3135,
|
|
"mean_token_accuracy": 0.6799768805503845,
|
|
"step": 471
|
|
},
|
|
{
|
|
"epoch": 0.7699836867862969,
|
|
"grad_norm": 5.001407623291016,
|
|
"learning_rate": 2.377525337016629e-06,
|
|
"loss": 1.1641,
|
|
"mean_token_accuracy": 0.7319232821464539,
|
|
"step": 472
|
|
},
|
|
{
|
|
"epoch": 0.7716150081566069,
|
|
"grad_norm": 4.856801509857178,
|
|
"learning_rate": 2.359100696124217e-06,
|
|
"loss": 1.2248,
|
|
"mean_token_accuracy": 0.7054263353347778,
|
|
"step": 473
|
|
},
|
|
{
|
|
"epoch": 0.7732463295269169,
|
|
"grad_norm": 5.092650890350342,
|
|
"learning_rate": 2.340778160790133e-06,
|
|
"loss": 1.2368,
|
|
"mean_token_accuracy": 0.6984392404556274,
|
|
"step": 474
|
|
},
|
|
{
|
|
"epoch": 0.7748776508972267,
|
|
"grad_norm": 5.131616592407227,
|
|
"learning_rate": 2.32255832665038e-06,
|
|
"loss": 1.1432,
|
|
"mean_token_accuracy": 0.7190889120101929,
|
|
"step": 475
|
|
},
|
|
{
|
|
"epoch": 0.7765089722675367,
|
|
"grad_norm": 5.5193047523498535,
|
|
"learning_rate": 2.3044417860023082e-06,
|
|
"loss": 1.4145,
|
|
"mean_token_accuracy": 0.6792343258857727,
|
|
"step": 476
|
|
},
|
|
{
|
|
"epoch": 0.7781402936378466,
|
|
"grad_norm": 4.5522356033325195,
|
|
"learning_rate": 2.286429127785365e-06,
|
|
"loss": 1.2906,
|
|
"mean_token_accuracy": 0.6974206566810608,
|
|
"step": 477
|
|
},
|
|
{
|
|
"epoch": 0.7797716150081566,
|
|
"grad_norm": 4.760054588317871,
|
|
"learning_rate": 2.2685209375619433e-06,
|
|
"loss": 1.2122,
|
|
"mean_token_accuracy": 0.7080909609794617,
|
|
"step": 478
|
|
},
|
|
{
|
|
"epoch": 0.7814029363784666,
|
|
"grad_norm": 4.7698187828063965,
|
|
"learning_rate": 2.250717797498361e-06,
|
|
"loss": 1.2056,
|
|
"mean_token_accuracy": 0.7150395512580872,
|
|
"step": 479
|
|
},
|
|
{
|
|
"epoch": 0.7830342577487766,
|
|
"grad_norm": 5.215602397918701,
|
|
"learning_rate": 2.2330202863459123e-06,
|
|
"loss": 1.417,
|
|
"mean_token_accuracy": 0.6677489280700684,
|
|
"step": 480
|
|
},
|
|
{
|
|
"epoch": 0.7846655791190864,
|
|
"grad_norm": 5.066779136657715,
|
|
"learning_rate": 2.215428979422074e-06,
|
|
"loss": 1.3455,
|
|
"mean_token_accuracy": 0.6654175519943237,
|
|
"step": 481
|
|
},
|
|
{
|
|
"epoch": 0.7862969004893964,
|
|
"grad_norm": 4.236968994140625,
|
|
"learning_rate": 2.1979444485917957e-06,
|
|
"loss": 1.2404,
|
|
"mean_token_accuracy": 0.7059952020645142,
|
|
"step": 482
|
|
},
|
|
{
|
|
"epoch": 0.7879282218597063,
|
|
"grad_norm": 4.6524224281311035,
|
|
"learning_rate": 2.1805672622489044e-06,
|
|
"loss": 1.2244,
|
|
"mean_token_accuracy": 0.6920965909957886,
|
|
"step": 483
|
|
},
|
|
{
|
|
"epoch": 0.7895595432300163,
|
|
"grad_norm": 4.233443737030029,
|
|
"learning_rate": 2.163297985297633e-06,
|
|
"loss": 1.015,
|
|
"mean_token_accuracy": 0.7436676621437073,
|
|
"step": 484
|
|
},
|
|
{
|
|
"epoch": 0.7911908646003263,
|
|
"grad_norm": 4.818909168243408,
|
|
"learning_rate": 2.1461371791342572e-06,
|
|
"loss": 1.1409,
|
|
"mean_token_accuracy": 0.7303598523139954,
|
|
"step": 485
|
|
},
|
|
{
|
|
"epoch": 0.7928221859706363,
|
|
"grad_norm": 5.0629448890686035,
|
|
"learning_rate": 2.129085401628841e-06,
|
|
"loss": 1.263,
|
|
"mean_token_accuracy": 0.6733444333076477,
|
|
"step": 486
|
|
},
|
|
{
|
|
"epoch": 0.7944535073409462,
|
|
"grad_norm": 5.042863368988037,
|
|
"learning_rate": 2.1121432071071008e-06,
|
|
"loss": 1.2654,
|
|
"mean_token_accuracy": 0.6947311162948608,
|
|
"step": 487
|
|
},
|
|
{
|
|
"epoch": 0.7960848287112561,
|
|
"grad_norm": 4.359389305114746,
|
|
"learning_rate": 2.0953111463323885e-06,
|
|
"loss": 1.09,
|
|
"mean_token_accuracy": 0.7307896018028259,
|
|
"step": 488
|
|
},
|
|
{
|
|
"epoch": 0.797716150081566,
|
|
"grad_norm": 4.828915119171143,
|
|
"learning_rate": 2.07858976648779e-06,
|
|
"loss": 1.3271,
|
|
"mean_token_accuracy": 0.6866028904914856,
|
|
"step": 489
|
|
},
|
|
{
|
|
"epoch": 0.799347471451876,
|
|
"grad_norm": 5.311947822570801,
|
|
"learning_rate": 2.061979611158329e-06,
|
|
"loss": 1.4026,
|
|
"mean_token_accuracy": 0.6727748513221741,
|
|
"step": 490
|
|
},
|
|
{
|
|
"epoch": 0.800978792822186,
|
|
"grad_norm": 5.242700576782227,
|
|
"learning_rate": 2.045481220313298e-06,
|
|
"loss": 1.3683,
|
|
"mean_token_accuracy": 0.6764549016952515,
|
|
"step": 491
|
|
},
|
|
{
|
|
"epoch": 0.802610114192496,
|
|
"grad_norm": 4.709912300109863,
|
|
"learning_rate": 2.0290951302887117e-06,
|
|
"loss": 1.1447,
|
|
"mean_token_accuracy": 0.7429931163787842,
|
|
"step": 492
|
|
},
|
|
{
|
|
"epoch": 0.8042414355628059,
|
|
"grad_norm": 4.1881184577941895,
|
|
"learning_rate": 2.0128218737698653e-06,
|
|
"loss": 1.0764,
|
|
"mean_token_accuracy": 0.7385087013244629,
|
|
"step": 493
|
|
},
|
|
{
|
|
"epoch": 0.8058727569331158,
|
|
"grad_norm": 4.042761325836182,
|
|
"learning_rate": 1.996661979774017e-06,
|
|
"loss": 1.0007,
|
|
"mean_token_accuracy": 0.743196427822113,
|
|
"step": 494
|
|
},
|
|
{
|
|
"epoch": 0.8075040783034257,
|
|
"grad_norm": 4.446390151977539,
|
|
"learning_rate": 1.9806159736331935e-06,
|
|
"loss": 1.0239,
|
|
"mean_token_accuracy": 0.7473176121711731,
|
|
"step": 495
|
|
},
|
|
{
|
|
"epoch": 0.8091353996737357,
|
|
"grad_norm": 4.78018856048584,
|
|
"learning_rate": 1.964684376977115e-06,
|
|
"loss": 1.1063,
|
|
"mean_token_accuracy": 0.7371134161949158,
|
|
"step": 496
|
|
},
|
|
{
|
|
"epoch": 0.8107667210440457,
|
|
"grad_norm": 5.604861736297607,
|
|
"learning_rate": 1.94886770771623e-06,
|
|
"loss": 1.4752,
|
|
"mean_token_accuracy": 0.6601036190986633,
|
|
"step": 497
|
|
},
|
|
{
|
|
"epoch": 0.8123980424143556,
|
|
"grad_norm": 5.058335304260254,
|
|
"learning_rate": 1.933166480024883e-06,
|
|
"loss": 1.055,
|
|
"mean_token_accuracy": 0.7369833588600159,
|
|
"step": 498
|
|
},
|
|
{
|
|
"epoch": 0.8140293637846656,
|
|
"grad_norm": 4.705621242523193,
|
|
"learning_rate": 1.9175812043246034e-06,
|
|
"loss": 1.2298,
|
|
"mean_token_accuracy": 0.6933262944221497,
|
|
"step": 499
|
|
},
|
|
{
|
|
"epoch": 0.8156606851549756,
|
|
"grad_norm": 4.777103424072266,
|
|
"learning_rate": 1.9021123872675062e-06,
|
|
"loss": 1.1538,
|
|
"mean_token_accuracy": 0.7174683809280396,
|
|
"step": 500
|
|
},
|
|
{
|
|
"epoch": 0.8172920065252854,
|
|
"grad_norm": 4.314986705780029,
|
|
"learning_rate": 1.886760531719825e-06,
|
|
"loss": 0.9366,
|
|
"mean_token_accuracy": 0.7647951245307922,
|
|
"step": 501
|
|
},
|
|
{
|
|
"epoch": 0.8189233278955954,
|
|
"grad_norm": 4.484466075897217,
|
|
"learning_rate": 1.8715261367455634e-06,
|
|
"loss": 1.0794,
|
|
"mean_token_accuracy": 0.744053304195404,
|
|
"step": 502
|
|
},
|
|
{
|
|
"epoch": 0.8205546492659054,
|
|
"grad_norm": 4.761155605316162,
|
|
"learning_rate": 1.8564096975902715e-06,
|
|
"loss": 1.1912,
|
|
"mean_token_accuracy": 0.7101010084152222,
|
|
"step": 503
|
|
},
|
|
{
|
|
"epoch": 0.8221859706362153,
|
|
"grad_norm": 5.64600944519043,
|
|
"learning_rate": 1.8414117056649466e-06,
|
|
"loss": 1.3092,
|
|
"mean_token_accuracy": 0.6834645867347717,
|
|
"step": 504
|
|
},
|
|
{
|
|
"epoch": 0.8238172920065253,
|
|
"grad_norm": 4.866972923278809,
|
|
"learning_rate": 1.8265326485300582e-06,
|
|
"loss": 1.0176,
|
|
"mean_token_accuracy": 0.7384013533592224,
|
|
"step": 505
|
|
},
|
|
{
|
|
"epoch": 0.8254486133768353,
|
|
"grad_norm": 4.5388994216918945,
|
|
"learning_rate": 1.8117730098796996e-06,
|
|
"loss": 1.2966,
|
|
"mean_token_accuracy": 0.701646089553833,
|
|
"step": 506
|
|
},
|
|
{
|
|
"epoch": 0.8270799347471451,
|
|
"grad_norm": 4.454381942749023,
|
|
"learning_rate": 1.7971332695258592e-06,
|
|
"loss": 1.1112,
|
|
"mean_token_accuracy": 0.7266221642494202,
|
|
"step": 507
|
|
},
|
|
{
|
|
"epoch": 0.8287112561174551,
|
|
"grad_norm": 4.481594085693359,
|
|
"learning_rate": 1.7826139033828263e-06,
|
|
"loss": 1.2742,
|
|
"mean_token_accuracy": 0.6912720799446106,
|
|
"step": 508
|
|
},
|
|
{
|
|
"epoch": 0.8303425774877651,
|
|
"grad_norm": 4.99500036239624,
|
|
"learning_rate": 1.768215383451723e-06,
|
|
"loss": 1.1617,
|
|
"mean_token_accuracy": 0.710889995098114,
|
|
"step": 509
|
|
},
|
|
{
|
|
"epoch": 0.831973898858075,
|
|
"grad_norm": 4.590748310089111,
|
|
"learning_rate": 1.7539381778051511e-06,
|
|
"loss": 1.046,
|
|
"mean_token_accuracy": 0.7437499761581421,
|
|
"step": 510
|
|
},
|
|
{
|
|
"epoch": 0.833605220228385,
|
|
"grad_norm": 4.781766414642334,
|
|
"learning_rate": 1.7397827505719852e-06,
|
|
"loss": 1.2756,
|
|
"mean_token_accuracy": 0.6818851232528687,
|
|
"step": 511
|
|
},
|
|
{
|
|
"epoch": 0.835236541598695,
|
|
"grad_norm": 4.8062744140625,
|
|
"learning_rate": 1.7257495619222763e-06,
|
|
"loss": 1.2438,
|
|
"mean_token_accuracy": 0.6988636255264282,
|
|
"step": 512
|
|
},
|
|
{
|
|
"epoch": 0.8368678629690048,
|
|
"grad_norm": 4.5913615226745605,
|
|
"learning_rate": 1.7118390680523023e-06,
|
|
"loss": 1.1542,
|
|
"mean_token_accuracy": 0.7089864015579224,
|
|
"step": 513
|
|
},
|
|
{
|
|
"epoch": 0.8384991843393148,
|
|
"grad_norm": 4.614170551300049,
|
|
"learning_rate": 1.6980517211697293e-06,
|
|
"loss": 1.0838,
|
|
"mean_token_accuracy": 0.7278003692626953,
|
|
"step": 514
|
|
},
|
|
{
|
|
"epoch": 0.8401305057096248,
|
|
"grad_norm": 4.4173359870910645,
|
|
"learning_rate": 1.6843879694789095e-06,
|
|
"loss": 1.1843,
|
|
"mean_token_accuracy": 0.7148330807685852,
|
|
"step": 515
|
|
},
|
|
{
|
|
"epoch": 0.8417618270799347,
|
|
"grad_norm": 4.110933303833008,
|
|
"learning_rate": 1.6708482571663238e-06,
|
|
"loss": 1.0402,
|
|
"mean_token_accuracy": 0.7376889586448669,
|
|
"step": 516
|
|
},
|
|
{
|
|
"epoch": 0.8433931484502447,
|
|
"grad_norm": 4.51687479019165,
|
|
"learning_rate": 1.657433024386127e-06,
|
|
"loss": 1.0383,
|
|
"mean_token_accuracy": 0.7657114267349243,
|
|
"step": 517
|
|
},
|
|
{
|
|
"epoch": 0.8450244698205547,
|
|
"grad_norm": 5.177441596984863,
|
|
"learning_rate": 1.6441427072458493e-06,
|
|
"loss": 1.3209,
|
|
"mean_token_accuracy": 0.6875,
|
|
"step": 518
|
|
},
|
|
{
|
|
"epoch": 0.8466557911908646,
|
|
"grad_norm": 4.650432109832764,
|
|
"learning_rate": 1.630977737792212e-06,
|
|
"loss": 1.1279,
|
|
"mean_token_accuracy": 0.7242990732192993,
|
|
"step": 519
|
|
},
|
|
{
|
|
"epoch": 0.8482871125611745,
|
|
"grad_norm": 4.902032852172852,
|
|
"learning_rate": 1.6179385439970897e-06,
|
|
"loss": 1.1124,
|
|
"mean_token_accuracy": 0.7066738605499268,
|
|
"step": 520
|
|
},
|
|
{
|
|
"epoch": 0.8499184339314845,
|
|
"grad_norm": 4.605056285858154,
|
|
"learning_rate": 1.6050255497435902e-06,
|
|
"loss": 1.0645,
|
|
"mean_token_accuracy": 0.7346938848495483,
|
|
"step": 521
|
|
},
|
|
{
|
|
"epoch": 0.8515497553017944,
|
|
"grad_norm": 5.043729305267334,
|
|
"learning_rate": 1.592239174812279e-06,
|
|
"loss": 1.3279,
|
|
"mean_token_accuracy": 0.6896191835403442,
|
|
"step": 522
|
|
},
|
|
{
|
|
"epoch": 0.8531810766721044,
|
|
"grad_norm": 5.051156520843506,
|
|
"learning_rate": 1.5795798348675352e-06,
|
|
"loss": 1.0265,
|
|
"mean_token_accuracy": 0.7457534074783325,
|
|
"step": 523
|
|
},
|
|
{
|
|
"epoch": 0.8548123980424144,
|
|
"grad_norm": 4.62628173828125,
|
|
"learning_rate": 1.5670479414440315e-06,
|
|
"loss": 1.0211,
|
|
"mean_token_accuracy": 0.7560975551605225,
|
|
"step": 524
|
|
},
|
|
{
|
|
"epoch": 0.8564437194127243,
|
|
"grad_norm": 5.277249813079834,
|
|
"learning_rate": 1.5546439019333632e-06,
|
|
"loss": 1.3336,
|
|
"mean_token_accuracy": 0.681064784526825,
|
|
"step": 525
|
|
},
|
|
{
|
|
"epoch": 0.8580750407830342,
|
|
"grad_norm": 4.982065677642822,
|
|
"learning_rate": 1.5423681195707997e-06,
|
|
"loss": 1.4144,
|
|
"mean_token_accuracy": 0.6686686873435974,
|
|
"step": 526
|
|
},
|
|
{
|
|
"epoch": 0.8597063621533442,
|
|
"grad_norm": 4.6587605476379395,
|
|
"learning_rate": 1.5302209934221796e-06,
|
|
"loss": 1.1911,
|
|
"mean_token_accuracy": 0.7020725607872009,
|
|
"step": 527
|
|
},
|
|
{
|
|
"epoch": 0.8613376835236541,
|
|
"grad_norm": 5.415839195251465,
|
|
"learning_rate": 1.5182029183709345e-06,
|
|
"loss": 1.3637,
|
|
"mean_token_accuracy": 0.6866196990013123,
|
|
"step": 528
|
|
},
|
|
{
|
|
"epoch": 0.8629690048939641,
|
|
"grad_norm": 4.830744743347168,
|
|
"learning_rate": 1.5063142851052535e-06,
|
|
"loss": 1.0927,
|
|
"mean_token_accuracy": 0.7163712382316589,
|
|
"step": 529
|
|
},
|
|
{
|
|
"epoch": 0.8646003262642741,
|
|
"grad_norm": 4.314631938934326,
|
|
"learning_rate": 1.4945554801053852e-06,
|
|
"loss": 1.0773,
|
|
"mean_token_accuracy": 0.7398513555526733,
|
|
"step": 530
|
|
},
|
|
{
|
|
"epoch": 0.866231647634584,
|
|
"grad_norm": 4.3542680740356445,
|
|
"learning_rate": 1.4829268856310677e-06,
|
|
"loss": 1.1271,
|
|
"mean_token_accuracy": 0.7248595952987671,
|
|
"step": 531
|
|
},
|
|
{
|
|
"epoch": 0.867862969004894,
|
|
"grad_norm": 4.48630952835083,
|
|
"learning_rate": 1.471428879709107e-06,
|
|
"loss": 1.0675,
|
|
"mean_token_accuracy": 0.7440000176429749,
|
|
"step": 532
|
|
},
|
|
{
|
|
"epoch": 0.8694942903752039,
|
|
"grad_norm": 4.849664211273193,
|
|
"learning_rate": 1.4600618361210857e-06,
|
|
"loss": 1.2855,
|
|
"mean_token_accuracy": 0.713458776473999,
|
|
"step": 533
|
|
},
|
|
{
|
|
"epoch": 0.8711256117455138,
|
|
"grad_norm": 4.989716529846191,
|
|
"learning_rate": 1.448826124391215e-06,
|
|
"loss": 1.2499,
|
|
"mean_token_accuracy": 0.7188649773597717,
|
|
"step": 534
|
|
},
|
|
{
|
|
"epoch": 0.8727569331158238,
|
|
"grad_norm": 4.539302825927734,
|
|
"learning_rate": 1.437722109774317e-06,
|
|
"loss": 1.1633,
|
|
"mean_token_accuracy": 0.7338669300079346,
|
|
"step": 535
|
|
},
|
|
{
|
|
"epoch": 0.8743882544861338,
|
|
"grad_norm": 4.66331148147583,
|
|
"learning_rate": 1.4267501532439526e-06,
|
|
"loss": 1.2576,
|
|
"mean_token_accuracy": 0.6965973377227783,
|
|
"step": 536
|
|
},
|
|
{
|
|
"epoch": 0.8760195758564437,
|
|
"grad_norm": 4.61297607421875,
|
|
"learning_rate": 1.4159106114806943e-06,
|
|
"loss": 1.3736,
|
|
"mean_token_accuracy": 0.6653734445571899,
|
|
"step": 537
|
|
},
|
|
{
|
|
"epoch": 0.8776508972267537,
|
|
"grad_norm": 4.935201644897461,
|
|
"learning_rate": 1.4052038368605156e-06,
|
|
"loss": 1.3792,
|
|
"mean_token_accuracy": 0.6775679588317871,
|
|
"step": 538
|
|
},
|
|
{
|
|
"epoch": 0.8792822185970636,
|
|
"grad_norm": 4.569594383239746,
|
|
"learning_rate": 1.3946301774433502e-06,
|
|
"loss": 1.105,
|
|
"mean_token_accuracy": 0.7271789312362671,
|
|
"step": 539
|
|
},
|
|
{
|
|
"epoch": 0.8809135399673735,
|
|
"grad_norm": 4.568352699279785,
|
|
"learning_rate": 1.3841899769617723e-06,
|
|
"loss": 1.1148,
|
|
"mean_token_accuracy": 0.7321231961250305,
|
|
"step": 540
|
|
},
|
|
{
|
|
"epoch": 0.8825448613376835,
|
|
"grad_norm": 5.049271583557129,
|
|
"learning_rate": 1.3738835748098198e-06,
|
|
"loss": 1.0984,
|
|
"mean_token_accuracy": 0.7366254925727844,
|
|
"step": 541
|
|
},
|
|
{
|
|
"epoch": 0.8841761827079935,
|
|
"grad_norm": 5.136232376098633,
|
|
"learning_rate": 1.3637113060319629e-06,
|
|
"loss": 1.2849,
|
|
"mean_token_accuracy": 0.6897223591804504,
|
|
"step": 542
|
|
},
|
|
{
|
|
"epoch": 0.8858075040783034,
|
|
"grad_norm": 4.453695774078369,
|
|
"learning_rate": 1.3536735013122144e-06,
|
|
"loss": 1.0962,
|
|
"mean_token_accuracy": 0.7319535613059998,
|
|
"step": 543
|
|
},
|
|
{
|
|
"epoch": 0.8874388254486134,
|
|
"grad_norm": 4.621738910675049,
|
|
"learning_rate": 1.3437704869633772e-06,
|
|
"loss": 1.0924,
|
|
"mean_token_accuracy": 0.7451643943786621,
|
|
"step": 544
|
|
},
|
|
{
|
|
"epoch": 0.8890701468189234,
|
|
"grad_norm": 4.363915920257568,
|
|
"learning_rate": 1.334002584916437e-06,
|
|
"loss": 1.2547,
|
|
"mean_token_accuracy": 0.6975655555725098,
|
|
"step": 545
|
|
},
|
|
{
|
|
"epoch": 0.8907014681892332,
|
|
"grad_norm": 4.77221155166626,
|
|
"learning_rate": 1.3243701127100971e-06,
|
|
"loss": 1.1272,
|
|
"mean_token_accuracy": 0.732022762298584,
|
|
"step": 546
|
|
},
|
|
{
|
|
"epoch": 0.8923327895595432,
|
|
"grad_norm": 4.910726070404053,
|
|
"learning_rate": 1.314873383480455e-06,
|
|
"loss": 1.1381,
|
|
"mean_token_accuracy": 0.7128287553787231,
|
|
"step": 547
|
|
},
|
|
{
|
|
"epoch": 0.8939641109298532,
|
|
"grad_norm": 4.650912284851074,
|
|
"learning_rate": 1.3055127059508257e-06,
|
|
"loss": 1.0727,
|
|
"mean_token_accuracy": 0.7480478882789612,
|
|
"step": 548
|
|
},
|
|
    {
      "epoch": 0.8955954323001631,
      "grad_norm": 3.9856724739074707,
      "learning_rate": 1.2962883844217e-06,
      "loss": 0.8759,
      "mean_token_accuracy": 0.7789642214775085,
      "step": 549
    },
    {
      "epoch": 0.8972267536704731,
      "grad_norm": 4.78012752532959,
      "learning_rate": 1.287200718760859e-06,
      "loss": 1.2732,
      "mean_token_accuracy": 0.6914893388748169,
      "step": 550
    },
    {
      "epoch": 0.8988580750407831,
      "grad_norm": 4.302763938903809,
      "learning_rate": 1.27825000439362e-06,
      "loss": 1.0871,
      "mean_token_accuracy": 0.7311992049217224,
      "step": 551
    },
    {
      "epoch": 0.9004893964110929,
      "grad_norm": 4.6384100914001465,
      "learning_rate": 1.2694365322932365e-06,
      "loss": 1.3448,
      "mean_token_accuracy": 0.6719226837158203,
      "step": 552
    },
    {
      "epoch": 0.9021207177814029,
      "grad_norm": 4.745211124420166,
      "learning_rate": 1.2607605889714359e-06,
      "loss": 1.19,
      "mean_token_accuracy": 0.7090080976486206,
      "step": 553
    },
    {
      "epoch": 0.9037520391517129,
      "grad_norm": 4.419302940368652,
      "learning_rate": 1.252222456469111e-06,
      "loss": 0.9335,
      "mean_token_accuracy": 0.774678111076355,
      "step": 554
    },
    {
      "epoch": 0.9053833605220228,
      "grad_norm": 5.066204071044922,
      "learning_rate": 1.2438224123471442e-06,
      "loss": 1.3473,
      "mean_token_accuracy": 0.6653266549110413,
      "step": 555
    },
    {
      "epoch": 0.9070146818923328,
      "grad_norm": 4.375471115112305,
      "learning_rate": 1.2355607296773896e-06,
      "loss": 1.2947,
      "mean_token_accuracy": 0.6962790489196777,
      "step": 556
    },
    {
      "epoch": 0.9086460032626428,
      "grad_norm": 5.035999774932861,
      "learning_rate": 1.2274376770337925e-06,
      "loss": 1.1271,
      "mean_token_accuracy": 0.7255297899246216,
      "step": 557
    },
    {
      "epoch": 0.9102773246329527,
      "grad_norm": 4.534280776977539,
      "learning_rate": 1.2194535184836633e-06,
      "loss": 1.1659,
      "mean_token_accuracy": 0.7146624326705933,
      "step": 558
    },
    {
      "epoch": 0.9119086460032626,
      "grad_norm": 4.192361354827881,
      "learning_rate": 1.2116085135790872e-06,
      "loss": 0.9654,
      "mean_token_accuracy": 0.7518177628517151,
      "step": 559
    },
    {
      "epoch": 0.9135399673735726,
      "grad_norm": 5.638926982879639,
      "learning_rate": 1.2039029173484892e-06,
      "loss": 1.6001,
      "mean_token_accuracy": 0.6247368454933167,
      "step": 560
    },
    {
      "epoch": 0.9151712887438825,
      "grad_norm": 4.600732326507568,
      "learning_rate": 1.1963369802883478e-06,
      "loss": 1.2123,
      "mean_token_accuracy": 0.7063252925872803,
      "step": 561
    },
    {
      "epoch": 0.9168026101141925,
      "grad_norm": 4.525058746337891,
      "learning_rate": 1.1889109483550411e-06,
      "loss": 1.0932,
      "mean_token_accuracy": 0.7251037359237671,
      "step": 562
    },
    {
      "epoch": 0.9184339314845025,
      "grad_norm": 4.5724005699157715,
      "learning_rate": 1.1816250629568632e-06,
      "loss": 1.0861,
      "mean_token_accuracy": 0.7240241765975952,
      "step": 563
    },
    {
      "epoch": 0.9200652528548124,
      "grad_norm": 5.578955173492432,
      "learning_rate": 1.1744795609461683e-06,
      "loss": 1.2629,
      "mean_token_accuracy": 0.6909705400466919,
      "step": 564
    },
    {
      "epoch": 0.9216965742251223,
      "grad_norm": 5.318408489227295,
      "learning_rate": 1.167474674611675e-06,
      "loss": 1.0538,
      "mean_token_accuracy": 0.7338252067565918,
      "step": 565
    },
    {
      "epoch": 0.9233278955954323,
      "grad_norm": 4.251341342926025,
      "learning_rate": 1.1606106316709122e-06,
      "loss": 1.0875,
      "mean_token_accuracy": 0.7354260087013245,
      "step": 566
    },
    {
      "epoch": 0.9249592169657422,
      "grad_norm": 5.110576629638672,
      "learning_rate": 1.1538876552628183e-06,
      "loss": 1.1861,
      "mean_token_accuracy": 0.7216981053352356,
      "step": 567
    },
    {
      "epoch": 0.9265905383360522,
      "grad_norm": 4.9769721031188965,
      "learning_rate": 1.147305963940488e-06,
      "loss": 1.0369,
      "mean_token_accuracy": 0.744041919708252,
      "step": 568
    },
    {
      "epoch": 0.9282218597063622,
      "grad_norm": 5.02736759185791,
      "learning_rate": 1.1408657716640643e-06,
      "loss": 1.5051,
      "mean_token_accuracy": 0.6656504273414612,
      "step": 569
    },
    {
      "epoch": 0.9298531810766721,
      "grad_norm": 4.389795303344727,
      "learning_rate": 1.134567287793787e-06,
      "loss": 1.1081,
      "mean_token_accuracy": 0.7329843044281006,
      "step": 570
    },
    {
      "epoch": 0.9314845024469821,
      "grad_norm": 4.3082427978515625,
      "learning_rate": 1.128410717083182e-06,
      "loss": 1.0839,
      "mean_token_accuracy": 0.729187548160553,
      "step": 571
    },
    {
      "epoch": 0.933115823817292,
      "grad_norm": 5.201175212860107,
      "learning_rate": 1.1223962596724115e-06,
      "loss": 1.2717,
      "mean_token_accuracy": 0.6742309927940369,
      "step": 572
    },
    {
      "epoch": 0.9347471451876019,
      "grad_norm": 4.306964874267578,
      "learning_rate": 1.1165241110817602e-06,
      "loss": 1.1214,
      "mean_token_accuracy": 0.721930742263794,
      "step": 573
    },
    {
      "epoch": 0.9363784665579119,
      "grad_norm": 4.683149814605713,
      "learning_rate": 1.1107944622052857e-06,
      "loss": 1.2618,
      "mean_token_accuracy": 0.6971399188041687,
      "step": 574
    },
    {
      "epoch": 0.9380097879282219,
      "grad_norm": 5.620746612548828,
      "learning_rate": 1.1052074993046102e-06,
      "loss": 1.2447,
      "mean_token_accuracy": 0.6849538087844849,
      "step": 575
    },
    {
      "epoch": 0.9396411092985318,
      "grad_norm": 4.673566818237305,
      "learning_rate": 1.0997634040028643e-06,
      "loss": 1.1948,
      "mean_token_accuracy": 0.7181038856506348,
      "step": 576
    },
    {
      "epoch": 0.9412724306688418,
      "grad_norm": 4.916784763336182,
      "learning_rate": 1.0944623532787844e-06,
      "loss": 1.0561,
      "mean_token_accuracy": 0.7292323708534241,
      "step": 577
    },
    {
      "epoch": 0.9429037520391517,
      "grad_norm": 4.703412055969238,
      "learning_rate": 1.0893045194609596e-06,
      "loss": 1.1676,
      "mean_token_accuracy": 0.7098938226699829,
      "step": 578
    },
    {
      "epoch": 0.9445350734094616,
      "grad_norm": 4.568572521209717,
      "learning_rate": 1.0842900702222283e-06,
      "loss": 1.3776,
      "mean_token_accuracy": 0.6739327907562256,
      "step": 579
    },
    {
      "epoch": 0.9461663947797716,
      "grad_norm": 4.693262577056885,
      "learning_rate": 1.0794191685742276e-06,
      "loss": 1.3097,
      "mean_token_accuracy": 0.6928605437278748,
      "step": 580
    },
    {
      "epoch": 0.9477977161500816,
      "grad_norm": 4.470661640167236,
      "learning_rate": 1.074691972862095e-06,
      "loss": 1.0411,
      "mean_token_accuracy": 0.7344121932983398,
      "step": 581
    },
    {
      "epoch": 0.9494290375203915,
      "grad_norm": 4.835824966430664,
      "learning_rate": 1.070108636759322e-06,
      "loss": 1.037,
      "mean_token_accuracy": 0.7305210828781128,
      "step": 582
    },
    {
      "epoch": 0.9510603588907015,
      "grad_norm": 5.557530403137207,
      "learning_rate": 1.0656693092627534e-06,
      "loss": 1.4262,
      "mean_token_accuracy": 0.6733524203300476,
      "step": 583
    },
    {
      "epoch": 0.9526916802610114,
      "grad_norm": 5.196942329406738,
      "learning_rate": 1.0613741346877498e-06,
      "loss": 1.0261,
      "mean_token_accuracy": 0.7382140755653381,
      "step": 584
    },
    {
      "epoch": 0.9543230016313213,
      "grad_norm": 4.611567497253418,
      "learning_rate": 1.0572232526634918e-06,
      "loss": 1.1281,
      "mean_token_accuracy": 0.7303561568260193,
      "step": 585
    },
    {
      "epoch": 0.9559543230016313,
      "grad_norm": 4.973482608795166,
      "learning_rate": 1.0532167981284437e-06,
      "loss": 1.1927,
      "mean_token_accuracy": 0.7078921794891357,
      "step": 586
    },
    {
      "epoch": 0.9575856443719413,
      "grad_norm": 4.535336017608643,
      "learning_rate": 1.0493549013259644e-06,
      "loss": 1.1746,
      "mean_token_accuracy": 0.736328125,
      "step": 587
    },
    {
      "epoch": 0.9592169657422512,
      "grad_norm": 4.868354320526123,
      "learning_rate": 1.0456376878000754e-06,
      "loss": 1.1741,
      "mean_token_accuracy": 0.7153007984161377,
      "step": 588
    },
    {
      "epoch": 0.9608482871125612,
      "grad_norm": 4.772627353668213,
      "learning_rate": 1.0420652783913794e-06,
      "loss": 1.2043,
      "mean_token_accuracy": 0.7127602696418762,
      "step": 589
    },
    {
      "epoch": 0.9624796084828712,
      "grad_norm": 4.772705554962158,
      "learning_rate": 1.03863778923313e-06,
      "loss": 1.2719,
      "mean_token_accuracy": 0.6881773471832275,
      "step": 590
    },
    {
      "epoch": 0.964110929853181,
      "grad_norm": 4.778547286987305,
      "learning_rate": 1.0353553317474574e-06,
      "loss": 1.0815,
      "mean_token_accuracy": 0.7438063025474548,
      "step": 591
    },
    {
      "epoch": 0.965742251223491,
      "grad_norm": 4.736347198486328,
      "learning_rate": 1.0322180126417494e-06,
      "loss": 1.1622,
      "mean_token_accuracy": 0.7216761708259583,
      "step": 592
    },
    {
      "epoch": 0.967373572593801,
      "grad_norm": 4.148738384246826,
      "learning_rate": 1.0292259339051769e-06,
      "loss": 1.1596,
      "mean_token_accuracy": 0.7182163000106812,
      "step": 593
    },
    {
      "epoch": 0.9690048939641109,
      "grad_norm": 4.727193832397461,
      "learning_rate": 1.026379192805382e-06,
      "loss": 1.4,
      "mean_token_accuracy": 0.6754344701766968,
      "step": 594
    },
    {
      "epoch": 0.9706362153344209,
      "grad_norm": 4.908797264099121,
      "learning_rate": 1.0236778818853158e-06,
      "loss": 1.3418,
      "mean_token_accuracy": 0.6792452931404114,
      "step": 595
    },
    {
      "epoch": 0.9722675367047309,
      "grad_norm": 5.056847095489502,
      "learning_rate": 1.0211220889602289e-06,
      "loss": 1.1988,
      "mean_token_accuracy": 0.715332567691803,
      "step": 596
    },
    {
      "epoch": 0.9738988580750407,
      "grad_norm": 4.906404495239258,
      "learning_rate": 1.018711897114817e-06,
      "loss": 1.3387,
      "mean_token_accuracy": 0.6841831207275391,
      "step": 597
    },
    {
      "epoch": 0.9755301794453507,
      "grad_norm": 4.806992530822754,
      "learning_rate": 1.0164473847005205e-06,
      "loss": 1.2102,
      "mean_token_accuracy": 0.7100494503974915,
      "step": 598
    },
    {
      "epoch": 0.9771615008156607,
      "grad_norm": 4.936591148376465,
      "learning_rate": 1.0143286253329769e-06,
      "loss": 1.1404,
      "mean_token_accuracy": 0.7201149463653564,
      "step": 599
    },
    {
      "epoch": 0.9787928221859706,
      "grad_norm": 4.549412727355957,
      "learning_rate": 1.0123556878896274e-06,
      "loss": 1.2092,
      "mean_token_accuracy": 0.7039577960968018,
      "step": 600
    },
    {
      "epoch": 0.9804241435562806,
      "grad_norm": 4.218964576721191,
      "learning_rate": 1.0105286365074788e-06,
      "loss": 0.9088,
      "mean_token_accuracy": 0.775624692440033,
      "step": 601
    },
    {
      "epoch": 0.9820554649265906,
      "grad_norm": 5.200554847717285,
      "learning_rate": 1.0088475305810178e-06,
      "loss": 1.1501,
      "mean_token_accuracy": 0.7204116582870483,
      "step": 602
    },
    {
      "epoch": 0.9836867862969005,
      "grad_norm": 4.525951385498047,
      "learning_rate": 1.0073124247602805e-06,
      "loss": 1.1539,
      "mean_token_accuracy": 0.7239478826522827,
      "step": 603
    },
    {
      "epoch": 0.9853181076672104,
      "grad_norm": 4.843019008636475,
      "learning_rate": 1.0059233689490742e-06,
      "loss": 1.3085,
      "mean_token_accuracy": 0.6880208253860474,
      "step": 604
    },
    {
      "epoch": 0.9869494290375204,
      "grad_norm": 4.720979690551758,
      "learning_rate": 1.0046804083033585e-06,
      "loss": 0.753,
      "mean_token_accuracy": 0.8125383853912354,
      "step": 605
    },
    {
      "epoch": 0.9885807504078303,
      "grad_norm": 4.803732395172119,
      "learning_rate": 1.0035835832297736e-06,
      "loss": 1.1941,
      "mean_token_accuracy": 0.6987314820289612,
      "step": 606
    },
    {
      "epoch": 0.9902120717781403,
      "grad_norm": 4.542924880981445,
      "learning_rate": 1.00263292938433e-06,
      "loss": 1.0593,
      "mean_token_accuracy": 0.730215847492218,
      "step": 607
    },
    {
      "epoch": 0.9918433931484503,
      "grad_norm": 4.79932975769043,
      "learning_rate": 1.0018284776712475e-06,
      "loss": 1.3496,
      "mean_token_accuracy": 0.6967418789863586,
      "step": 608
    },
    {
      "epoch": 0.9934747145187602,
      "grad_norm": 4.847198963165283,
      "learning_rate": 1.0011702542419498e-06,
      "loss": 1.1661,
      "mean_token_accuracy": 0.7184059023857117,
      "step": 609
    },
    {
      "epoch": 0.9951060358890701,
      "grad_norm": 5.3224406242370605,
      "learning_rate": 1.0006582804942171e-06,
      "loss": 1.2955,
      "mean_token_accuracy": 0.6828246712684631,
      "step": 610
    },
    {
      "epoch": 0.9967373572593801,
      "grad_norm": 4.992889404296875,
      "learning_rate": 1.000292573071488e-06,
      "loss": 1.2474,
      "mean_token_accuracy": 0.7027914524078369,
      "step": 611
    },
    {
      "epoch": 0.99836867862969,
      "grad_norm": 5.054308891296387,
      "learning_rate": 1.000073143862319e-06,
      "loss": 1.1465,
      "mean_token_accuracy": 0.7231833934783936,
      "step": 612
    },
    {
      "epoch": 1.0,
      "grad_norm": 5.045431613922119,
      "learning_rate": 1.0000000000000002e-06,
      "loss": 1.0511,
      "mean_token_accuracy": 0.7292899489402771,
      "step": 613
    },
    {
      "epoch": 1.0,
      "step": 613,
      "total_flos": 1.770321796592042e+18,
      "train_loss": 1.3923614178746209,
      "train_runtime": 2541.2937,
      "train_samples_per_second": 7.707,
      "train_steps_per_second": 0.241
    }
  ],
  "logging_steps": 1,
  "max_steps": 613,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": false,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 1.770321796592042e+18,
  "train_batch_size": 4,
  "trial_name": null,
  "trial_params": null
}