Files
Qwen1.5-MOE-sft-ESFT-transl…/trainer_state.json
ModelHub XC d3da1a674f 初始化项目,由ModelHub XC社区提供模型
Model: jayzou3773/Qwen1.5-MOE-sft-ESFT-translation
Source: Original Platform
2026-05-27 05:08:19 +08:00

5867 lines
158 KiB
JSON

{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 728,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0027472527472527475,
"grad_norm": 79.80936431884766,
"learning_rate": 1.36986301369863e-07,
"loss": 3.2006,
"mean_token_accuracy": 0.5809080600738525,
"step": 1
},
{
"epoch": 0.005494505494505495,
"grad_norm": 73.02354431152344,
"learning_rate": 2.73972602739726e-07,
"loss": 3.0485,
"mean_token_accuracy": 0.5531686544418335,
"step": 2
},
{
"epoch": 0.008241758241758242,
"grad_norm": 90.18173217773438,
"learning_rate": 4.1095890410958903e-07,
"loss": 3.3162,
"mean_token_accuracy": 0.5780051350593567,
"step": 3
},
{
"epoch": 0.01098901098901099,
"grad_norm": 89.79154205322266,
"learning_rate": 5.47945205479452e-07,
"loss": 3.2123,
"mean_token_accuracy": 0.5886699557304382,
"step": 4
},
{
"epoch": 0.013736263736263736,
"grad_norm": 83.19721984863281,
"learning_rate": 6.849315068493151e-07,
"loss": 3.1087,
"mean_token_accuracy": 0.5597964525222778,
"step": 5
},
{
"epoch": 0.016483516483516484,
"grad_norm": 75.79467010498047,
"learning_rate": 8.219178082191781e-07,
"loss": 3.4744,
"mean_token_accuracy": 0.54666668176651,
"step": 6
},
{
"epoch": 0.019230769230769232,
"grad_norm": 63.705596923828125,
"learning_rate": 9.589041095890411e-07,
"loss": 3.7404,
"mean_token_accuracy": 0.48148149251937866,
"step": 7
},
{
"epoch": 0.02197802197802198,
"grad_norm": 53.74971008300781,
"learning_rate": 1.095890410958904e-06,
"loss": 3.3282,
"mean_token_accuracy": 0.5387324094772339,
"step": 8
},
{
"epoch": 0.024725274725274724,
"grad_norm": 52.09133529663086,
"learning_rate": 1.2328767123287673e-06,
"loss": 3.1801,
"mean_token_accuracy": 0.5777262449264526,
"step": 9
},
{
"epoch": 0.027472527472527472,
"grad_norm": 37.37009048461914,
"learning_rate": 1.3698630136986302e-06,
"loss": 2.6745,
"mean_token_accuracy": 0.6029082536697388,
"step": 10
},
{
"epoch": 0.03021978021978022,
"grad_norm": 36.568485260009766,
"learning_rate": 1.5068493150684932e-06,
"loss": 2.8587,
"mean_token_accuracy": 0.590347945690155,
"step": 11
},
{
"epoch": 0.03296703296703297,
"grad_norm": 35.09193801879883,
"learning_rate": 1.6438356164383561e-06,
"loss": 2.4748,
"mean_token_accuracy": 0.6369710564613342,
"step": 12
},
{
"epoch": 0.03571428571428571,
"grad_norm": 29.399085998535156,
"learning_rate": 1.7808219178082193e-06,
"loss": 2.3455,
"mean_token_accuracy": 0.6575654149055481,
"step": 13
},
{
"epoch": 0.038461538461538464,
"grad_norm": 29.879207611083984,
"learning_rate": 1.9178082191780823e-06,
"loss": 2.9237,
"mean_token_accuracy": 0.5609756112098694,
"step": 14
},
{
"epoch": 0.04120879120879121,
"grad_norm": 31.87448501586914,
"learning_rate": 2.0547945205479454e-06,
"loss": 2.8315,
"mean_token_accuracy": 0.5687830448150635,
"step": 15
},
{
"epoch": 0.04395604395604396,
"grad_norm": 29.622562408447266,
"learning_rate": 2.191780821917808e-06,
"loss": 2.7541,
"mean_token_accuracy": 0.6079999804496765,
"step": 16
},
{
"epoch": 0.046703296703296704,
"grad_norm": 46.84980010986328,
"learning_rate": 2.3287671232876713e-06,
"loss": 2.2641,
"mean_token_accuracy": 0.6080626845359802,
"step": 17
},
{
"epoch": 0.04945054945054945,
"grad_norm": 34.44080352783203,
"learning_rate": 2.4657534246575345e-06,
"loss": 2.4124,
"mean_token_accuracy": 0.6391075849533081,
"step": 18
},
{
"epoch": 0.0521978021978022,
"grad_norm": 36.21349334716797,
"learning_rate": 2.6027397260273973e-06,
"loss": 2.2594,
"mean_token_accuracy": 0.6270996928215027,
"step": 19
},
{
"epoch": 0.054945054945054944,
"grad_norm": 39.15977096557617,
"learning_rate": 2.7397260273972604e-06,
"loss": 2.2868,
"mean_token_accuracy": 0.6432291865348816,
"step": 20
},
{
"epoch": 0.057692307692307696,
"grad_norm": 35.869197845458984,
"learning_rate": 2.876712328767123e-06,
"loss": 2.4556,
"mean_token_accuracy": 0.6113360524177551,
"step": 21
},
{
"epoch": 0.06043956043956044,
"grad_norm": 30.194963455200195,
"learning_rate": 3.0136986301369864e-06,
"loss": 2.436,
"mean_token_accuracy": 0.6002538204193115,
"step": 22
},
{
"epoch": 0.06318681318681318,
"grad_norm": 24.54381561279297,
"learning_rate": 3.1506849315068495e-06,
"loss": 2.6554,
"mean_token_accuracy": 0.5426666736602783,
"step": 23
},
{
"epoch": 0.06593406593406594,
"grad_norm": 19.29971694946289,
"learning_rate": 3.2876712328767123e-06,
"loss": 2.4589,
"mean_token_accuracy": 0.5473411083221436,
"step": 24
},
{
"epoch": 0.06868131868131869,
"grad_norm": 16.851980209350586,
"learning_rate": 3.4246575342465754e-06,
"loss": 2.2021,
"mean_token_accuracy": 0.6083832383155823,
"step": 25
},
{
"epoch": 0.07142857142857142,
"grad_norm": 17.44552230834961,
"learning_rate": 3.5616438356164386e-06,
"loss": 1.989,
"mean_token_accuracy": 0.6418079137802124,
"step": 26
},
{
"epoch": 0.07417582417582418,
"grad_norm": 17.299097061157227,
"learning_rate": 3.6986301369863014e-06,
"loss": 1.8649,
"mean_token_accuracy": 0.6751946806907654,
"step": 27
},
{
"epoch": 0.07692307692307693,
"grad_norm": 20.152931213378906,
"learning_rate": 3.8356164383561645e-06,
"loss": 1.9959,
"mean_token_accuracy": 0.6616438627243042,
"step": 28
},
{
"epoch": 0.07967032967032966,
"grad_norm": 16.22498321533203,
"learning_rate": 3.972602739726027e-06,
"loss": 1.7964,
"mean_token_accuracy": 0.6655290126800537,
"step": 29
},
{
"epoch": 0.08241758241758242,
"grad_norm": 16.668590545654297,
"learning_rate": 4.109589041095891e-06,
"loss": 2.1534,
"mean_token_accuracy": 0.6357786655426025,
"step": 30
},
{
"epoch": 0.08516483516483517,
"grad_norm": 17.106897354125977,
"learning_rate": 4.246575342465754e-06,
"loss": 2.0976,
"mean_token_accuracy": 0.6054502129554749,
"step": 31
},
{
"epoch": 0.08791208791208792,
"grad_norm": 17.229530334472656,
"learning_rate": 4.383561643835616e-06,
"loss": 2.0481,
"mean_token_accuracy": 0.6231527328491211,
"step": 32
},
{
"epoch": 0.09065934065934066,
"grad_norm": 15.264318466186523,
"learning_rate": 4.52054794520548e-06,
"loss": 1.9633,
"mean_token_accuracy": 0.6321558952331543,
"step": 33
},
{
"epoch": 0.09340659340659341,
"grad_norm": 16.209901809692383,
"learning_rate": 4.657534246575343e-06,
"loss": 1.9988,
"mean_token_accuracy": 0.6246753334999084,
"step": 34
},
{
"epoch": 0.09615384615384616,
"grad_norm": 15.802133560180664,
"learning_rate": 4.7945205479452054e-06,
"loss": 1.7998,
"mean_token_accuracy": 0.6637499928474426,
"step": 35
},
{
"epoch": 0.0989010989010989,
"grad_norm": 19.071365356445312,
"learning_rate": 4.931506849315069e-06,
"loss": 2.1914,
"mean_token_accuracy": 0.6349413394927979,
"step": 36
},
{
"epoch": 0.10164835164835165,
"grad_norm": 21.21844482421875,
"learning_rate": 5.068493150684932e-06,
"loss": 2.2208,
"mean_token_accuracy": 0.5880758762359619,
"step": 37
},
{
"epoch": 0.1043956043956044,
"grad_norm": 18.13629150390625,
"learning_rate": 5.2054794520547945e-06,
"loss": 1.9187,
"mean_token_accuracy": 0.642346203327179,
"step": 38
},
{
"epoch": 0.10714285714285714,
"grad_norm": 17.24445152282715,
"learning_rate": 5.342465753424658e-06,
"loss": 1.9784,
"mean_token_accuracy": 0.6438547372817993,
"step": 39
},
{
"epoch": 0.10989010989010989,
"grad_norm": 15.44080638885498,
"learning_rate": 5.479452054794521e-06,
"loss": 1.9891,
"mean_token_accuracy": 0.6200241446495056,
"step": 40
},
{
"epoch": 0.11263736263736264,
"grad_norm": 14.55709457397461,
"learning_rate": 5.6164383561643845e-06,
"loss": 1.7106,
"mean_token_accuracy": 0.6681614518165588,
"step": 41
},
{
"epoch": 0.11538461538461539,
"grad_norm": 15.768806457519531,
"learning_rate": 5.753424657534246e-06,
"loss": 2.1038,
"mean_token_accuracy": 0.5827505588531494,
"step": 42
},
{
"epoch": 0.11813186813186813,
"grad_norm": 16.35256576538086,
"learning_rate": 5.89041095890411e-06,
"loss": 2.0303,
"mean_token_accuracy": 0.5822167992591858,
"step": 43
},
{
"epoch": 0.12087912087912088,
"grad_norm": 16.935888290405273,
"learning_rate": 6.027397260273973e-06,
"loss": 1.7315,
"mean_token_accuracy": 0.6608186960220337,
"step": 44
},
{
"epoch": 0.12362637362637363,
"grad_norm": 17.174970626831055,
"learning_rate": 6.164383561643836e-06,
"loss": 1.9942,
"mean_token_accuracy": 0.6288659572601318,
"step": 45
},
{
"epoch": 0.12637362637362637,
"grad_norm": 19.020933151245117,
"learning_rate": 6.301369863013699e-06,
"loss": 1.9312,
"mean_token_accuracy": 0.6689007878303528,
"step": 46
},
{
"epoch": 0.12912087912087913,
"grad_norm": 17.157032012939453,
"learning_rate": 6.438356164383563e-06,
"loss": 2.0065,
"mean_token_accuracy": 0.5868187546730042,
"step": 47
},
{
"epoch": 0.13186813186813187,
"grad_norm": 15.546041488647461,
"learning_rate": 6.5753424657534245e-06,
"loss": 1.5726,
"mean_token_accuracy": 0.7092882990837097,
"step": 48
},
{
"epoch": 0.1346153846153846,
"grad_norm": 14.470401763916016,
"learning_rate": 6.712328767123288e-06,
"loss": 1.6003,
"mean_token_accuracy": 0.681922197341919,
"step": 49
},
{
"epoch": 0.13736263736263737,
"grad_norm": 15.666678428649902,
"learning_rate": 6.849315068493151e-06,
"loss": 1.6609,
"mean_token_accuracy": 0.6787048578262329,
"step": 50
},
{
"epoch": 0.1401098901098901,
"grad_norm": 14.968053817749023,
"learning_rate": 6.9863013698630145e-06,
"loss": 1.9766,
"mean_token_accuracy": 0.6470588445663452,
"step": 51
},
{
"epoch": 0.14285714285714285,
"grad_norm": 15.640029907226562,
"learning_rate": 7.123287671232877e-06,
"loss": 1.736,
"mean_token_accuracy": 0.6700379252433777,
"step": 52
},
{
"epoch": 0.14560439560439561,
"grad_norm": 17.240005493164062,
"learning_rate": 7.260273972602741e-06,
"loss": 1.6045,
"mean_token_accuracy": 0.6543046236038208,
"step": 53
},
{
"epoch": 0.14835164835164835,
"grad_norm": 17.649545669555664,
"learning_rate": 7.397260273972603e-06,
"loss": 2.0,
"mean_token_accuracy": 0.6148648858070374,
"step": 54
},
{
"epoch": 0.1510989010989011,
"grad_norm": 17.608076095581055,
"learning_rate": 7.534246575342466e-06,
"loss": 1.5678,
"mean_token_accuracy": 0.7154255509376526,
"step": 55
},
{
"epoch": 0.15384615384615385,
"grad_norm": 17.627878189086914,
"learning_rate": 7.671232876712329e-06,
"loss": 1.6404,
"mean_token_accuracy": 0.6906377077102661,
"step": 56
},
{
"epoch": 0.1565934065934066,
"grad_norm": 17.066679000854492,
"learning_rate": 7.808219178082192e-06,
"loss": 2.0689,
"mean_token_accuracy": 0.6567164063453674,
"step": 57
},
{
"epoch": 0.15934065934065933,
"grad_norm": 16.802358627319336,
"learning_rate": 7.945205479452055e-06,
"loss": 1.8222,
"mean_token_accuracy": 0.6650062203407288,
"step": 58
},
{
"epoch": 0.1620879120879121,
"grad_norm": 14.752166748046875,
"learning_rate": 8.082191780821919e-06,
"loss": 1.3144,
"mean_token_accuracy": 0.7395243644714355,
"step": 59
},
{
"epoch": 0.16483516483516483,
"grad_norm": 15.987241744995117,
"learning_rate": 8.219178082191782e-06,
"loss": 1.6505,
"mean_token_accuracy": 0.6697613000869751,
"step": 60
},
{
"epoch": 0.16758241758241757,
"grad_norm": 14.781291007995605,
"learning_rate": 8.356164383561644e-06,
"loss": 1.5079,
"mean_token_accuracy": 0.6952491402626038,
"step": 61
},
{
"epoch": 0.17032967032967034,
"grad_norm": 17.22585105895996,
"learning_rate": 8.493150684931507e-06,
"loss": 1.7571,
"mean_token_accuracy": 0.67527174949646,
"step": 62
},
{
"epoch": 0.17307692307692307,
"grad_norm": 14.363125801086426,
"learning_rate": 8.63013698630137e-06,
"loss": 1.2132,
"mean_token_accuracy": 0.7480490803718567,
"step": 63
},
{
"epoch": 0.17582417582417584,
"grad_norm": 14.252264976501465,
"learning_rate": 8.767123287671233e-06,
"loss": 1.3583,
"mean_token_accuracy": 0.751387357711792,
"step": 64
},
{
"epoch": 0.17857142857142858,
"grad_norm": 16.291038513183594,
"learning_rate": 8.904109589041097e-06,
"loss": 1.5959,
"mean_token_accuracy": 0.7064676880836487,
"step": 65
},
{
"epoch": 0.1813186813186813,
"grad_norm": 16.286724090576172,
"learning_rate": 9.04109589041096e-06,
"loss": 2.0605,
"mean_token_accuracy": 0.5750834345817566,
"step": 66
},
{
"epoch": 0.18406593406593408,
"grad_norm": 16.324674606323242,
"learning_rate": 9.178082191780823e-06,
"loss": 1.4098,
"mean_token_accuracy": 0.7394179701805115,
"step": 67
},
{
"epoch": 0.18681318681318682,
"grad_norm": 14.816543579101562,
"learning_rate": 9.315068493150685e-06,
"loss": 1.4708,
"mean_token_accuracy": 0.7154285907745361,
"step": 68
},
{
"epoch": 0.18956043956043955,
"grad_norm": 16.583890914916992,
"learning_rate": 9.452054794520548e-06,
"loss": 1.7538,
"mean_token_accuracy": 0.677135705947876,
"step": 69
},
{
"epoch": 0.19230769230769232,
"grad_norm": 14.951848030090332,
"learning_rate": 9.589041095890411e-06,
"loss": 1.5561,
"mean_token_accuracy": 0.70525062084198,
"step": 70
},
{
"epoch": 0.19505494505494506,
"grad_norm": 16.887245178222656,
"learning_rate": 9.726027397260275e-06,
"loss": 1.6847,
"mean_token_accuracy": 0.6716621518135071,
"step": 71
},
{
"epoch": 0.1978021978021978,
"grad_norm": 16.578994750976562,
"learning_rate": 9.863013698630138e-06,
"loss": 1.7824,
"mean_token_accuracy": 0.644752025604248,
"step": 72
},
{
"epoch": 0.20054945054945056,
"grad_norm": 17.51304817199707,
"learning_rate": 1e-05,
"loss": 1.4727,
"mean_token_accuracy": 0.7205039858818054,
"step": 73
},
{
"epoch": 0.2032967032967033,
"grad_norm": 17.006372451782227,
"learning_rate": 9.999948239456138e-06,
"loss": 2.0252,
"mean_token_accuracy": 0.6419752836227417,
"step": 74
},
{
"epoch": 0.20604395604395603,
"grad_norm": 13.299188613891602,
"learning_rate": 9.999792959015282e-06,
"loss": 1.3618,
"mean_token_accuracy": 0.7111574411392212,
"step": 75
},
{
"epoch": 0.2087912087912088,
"grad_norm": 16.785383224487305,
"learning_rate": 9.999534162249611e-06,
"loss": 1.6636,
"mean_token_accuracy": 0.6866666674613953,
"step": 76
},
{
"epoch": 0.21153846153846154,
"grad_norm": 16.0639591217041,
"learning_rate": 9.999171855112667e-06,
"loss": 1.6716,
"mean_token_accuracy": 0.6938775777816772,
"step": 77
},
{
"epoch": 0.21428571428571427,
"grad_norm": 14.496546745300293,
"learning_rate": 9.998706045939206e-06,
"loss": 1.5081,
"mean_token_accuracy": 0.7064676880836487,
"step": 78
},
{
"epoch": 0.21703296703296704,
"grad_norm": 14.92887020111084,
"learning_rate": 9.998136745445027e-06,
"loss": 1.5817,
"mean_token_accuracy": 0.672703742980957,
"step": 79
},
{
"epoch": 0.21978021978021978,
"grad_norm": 15.17734146118164,
"learning_rate": 9.997463966726706e-06,
"loss": 1.7535,
"mean_token_accuracy": 0.614503800868988,
"step": 80
},
{
"epoch": 0.22252747252747251,
"grad_norm": 13.309514045715332,
"learning_rate": 9.996687725261309e-06,
"loss": 1.4115,
"mean_token_accuracy": 0.6910039186477661,
"step": 81
},
{
"epoch": 0.22527472527472528,
"grad_norm": 13.710087776184082,
"learning_rate": 9.995808038906024e-06,
"loss": 1.9807,
"mean_token_accuracy": 0.6172986030578613,
"step": 82
},
{
"epoch": 0.22802197802197802,
"grad_norm": 11.803552627563477,
"learning_rate": 9.994824927897763e-06,
"loss": 1.3752,
"mean_token_accuracy": 0.6950276494026184,
"step": 83
},
{
"epoch": 0.23076923076923078,
"grad_norm": 10.76941204071045,
"learning_rate": 9.993738414852683e-06,
"loss": 1.3429,
"mean_token_accuracy": 0.7200000286102295,
"step": 84
},
{
"epoch": 0.23351648351648352,
"grad_norm": 12.579700469970703,
"learning_rate": 9.992548524765677e-06,
"loss": 1.6501,
"mean_token_accuracy": 0.6551281809806824,
"step": 85
},
{
"epoch": 0.23626373626373626,
"grad_norm": 12.145017623901367,
"learning_rate": 9.991255285009794e-06,
"loss": 1.7792,
"mean_token_accuracy": 0.6391875743865967,
"step": 86
},
{
"epoch": 0.23901098901098902,
"grad_norm": 12.107645034790039,
"learning_rate": 9.989858725335608e-06,
"loss": 1.7553,
"mean_token_accuracy": 0.6398928761482239,
"step": 87
},
{
"epoch": 0.24175824175824176,
"grad_norm": 11.869401931762695,
"learning_rate": 9.988358877870536e-06,
"loss": 1.4868,
"mean_token_accuracy": 0.7045204043388367,
"step": 88
},
{
"epoch": 0.2445054945054945,
"grad_norm": 11.478519439697266,
"learning_rate": 9.986755777118095e-06,
"loss": 1.6232,
"mean_token_accuracy": 0.6606714725494385,
"step": 89
},
{
"epoch": 0.24725274725274726,
"grad_norm": 12.422060012817383,
"learning_rate": 9.985049459957121e-06,
"loss": 1.1337,
"mean_token_accuracy": 0.7422279715538025,
"step": 90
},
{
"epoch": 0.25,
"grad_norm": 13.169363021850586,
"learning_rate": 9.983239965640902e-06,
"loss": 1.2939,
"mean_token_accuracy": 0.7121729254722595,
"step": 91
},
{
"epoch": 0.25274725274725274,
"grad_norm": 11.118330001831055,
"learning_rate": 9.981327335796284e-06,
"loss": 1.4643,
"mean_token_accuracy": 0.6801661252975464,
"step": 92
},
{
"epoch": 0.2554945054945055,
"grad_norm": 13.81788444519043,
"learning_rate": 9.97931161442272e-06,
"loss": 1.6353,
"mean_token_accuracy": 0.6736111044883728,
"step": 93
},
{
"epoch": 0.25824175824175827,
"grad_norm": 11.797772407531738,
"learning_rate": 9.977192847891245e-06,
"loss": 1.4335,
"mean_token_accuracy": 0.6979038119316101,
"step": 94
},
{
"epoch": 0.260989010989011,
"grad_norm": 11.760414123535156,
"learning_rate": 9.974971084943421e-06,
"loss": 1.5042,
"mean_token_accuracy": 0.6891191601753235,
"step": 95
},
{
"epoch": 0.26373626373626374,
"grad_norm": 12.482735633850098,
"learning_rate": 9.972646376690204e-06,
"loss": 1.3888,
"mean_token_accuracy": 0.7099447250366211,
"step": 96
},
{
"epoch": 0.2664835164835165,
"grad_norm": 11.861308097839355,
"learning_rate": 9.970218776610781e-06,
"loss": 1.6511,
"mean_token_accuracy": 0.6784741282463074,
"step": 97
},
{
"epoch": 0.2692307692307692,
"grad_norm": 11.980021476745605,
"learning_rate": 9.967688340551328e-06,
"loss": 1.4763,
"mean_token_accuracy": 0.6922183632850647,
"step": 98
},
{
"epoch": 0.27197802197802196,
"grad_norm": 12.542531967163086,
"learning_rate": 9.965055126723733e-06,
"loss": 1.7086,
"mean_token_accuracy": 0.6587795615196228,
"step": 99
},
{
"epoch": 0.27472527472527475,
"grad_norm": 11.29738998413086,
"learning_rate": 9.962319195704253e-06,
"loss": 1.2443,
"mean_token_accuracy": 0.7068965435028076,
"step": 100
},
{
"epoch": 0.2774725274725275,
"grad_norm": 10.83761978149414,
"learning_rate": 9.959480610432126e-06,
"loss": 1.3708,
"mean_token_accuracy": 0.7224435806274414,
"step": 101
},
{
"epoch": 0.2802197802197802,
"grad_norm": 11.421622276306152,
"learning_rate": 9.956539436208109e-06,
"loss": 1.2497,
"mean_token_accuracy": 0.704336404800415,
"step": 102
},
{
"epoch": 0.28296703296703296,
"grad_norm": 11.305917739868164,
"learning_rate": 9.953495740692997e-06,
"loss": 1.4686,
"mean_token_accuracy": 0.6876543164253235,
"step": 103
},
{
"epoch": 0.2857142857142857,
"grad_norm": 10.745238304138184,
"learning_rate": 9.950349593906047e-06,
"loss": 1.4166,
"mean_token_accuracy": 0.695652186870575,
"step": 104
},
{
"epoch": 0.28846153846153844,
"grad_norm": 10.237006187438965,
"learning_rate": 9.947101068223379e-06,
"loss": 1.1987,
"mean_token_accuracy": 0.717255711555481,
"step": 105
},
{
"epoch": 0.29120879120879123,
"grad_norm": 14.004864692687988,
"learning_rate": 9.943750238376311e-06,
"loss": 1.9631,
"mean_token_accuracy": 0.5841726660728455,
"step": 106
},
{
"epoch": 0.29395604395604397,
"grad_norm": 12.94350814819336,
"learning_rate": 9.940297181449626e-06,
"loss": 1.5451,
"mean_token_accuracy": 0.6545681953430176,
"step": 107
},
{
"epoch": 0.2967032967032967,
"grad_norm": 11.937135696411133,
"learning_rate": 9.93674197687982e-06,
"loss": 1.5099,
"mean_token_accuracy": 0.6719636917114258,
"step": 108
},
{
"epoch": 0.29945054945054944,
"grad_norm": 12.004481315612793,
"learning_rate": 9.933084706453253e-06,
"loss": 1.362,
"mean_token_accuracy": 0.719239354133606,
"step": 109
},
{
"epoch": 0.3021978021978022,
"grad_norm": 11.889949798583984,
"learning_rate": 9.929325454304288e-06,
"loss": 1.3806,
"mean_token_accuracy": 0.7053072452545166,
"step": 110
},
{
"epoch": 0.30494505494505497,
"grad_norm": 29.812204360961914,
"learning_rate": 9.92546430691334e-06,
"loss": 1.5889,
"mean_token_accuracy": 0.6545681953430176,
"step": 111
},
{
"epoch": 0.3076923076923077,
"grad_norm": 12.241945266723633,
"learning_rate": 9.92150135310489e-06,
"loss": 1.4933,
"mean_token_accuracy": 0.6770833134651184,
"step": 112
},
{
"epoch": 0.31043956043956045,
"grad_norm": 11.248332023620605,
"learning_rate": 9.917436684045452e-06,
"loss": 1.2714,
"mean_token_accuracy": 0.6968973875045776,
"step": 113
},
{
"epoch": 0.3131868131868132,
"grad_norm": 11.750696182250977,
"learning_rate": 9.913270393241456e-06,
"loss": 1.2923,
"mean_token_accuracy": 0.7200000286102295,
"step": 114
},
{
"epoch": 0.3159340659340659,
"grad_norm": 11.194052696228027,
"learning_rate": 9.90900257653712e-06,
"loss": 1.488,
"mean_token_accuracy": 0.678185760974884,
"step": 115
},
{
"epoch": 0.31868131868131866,
"grad_norm": 10.4213228225708,
"learning_rate": 9.904633332112222e-06,
"loss": 1.2861,
"mean_token_accuracy": 0.6877682209014893,
"step": 116
},
{
"epoch": 0.32142857142857145,
"grad_norm": 12.326902389526367,
"learning_rate": 9.900162760479863e-06,
"loss": 1.4382,
"mean_token_accuracy": 0.7132667899131775,
"step": 117
},
{
"epoch": 0.3241758241758242,
"grad_norm": 11.26202392578125,
"learning_rate": 9.895590964484142e-06,
"loss": 1.5174,
"mean_token_accuracy": 0.6674786806106567,
"step": 118
},
{
"epoch": 0.3269230769230769,
"grad_norm": 11.558907508850098,
"learning_rate": 9.890918049297785e-06,
"loss": 1.4946,
"mean_token_accuracy": 0.6675094962120056,
"step": 119
},
{
"epoch": 0.32967032967032966,
"grad_norm": 12.017938613891602,
"learning_rate": 9.886144122419744e-06,
"loss": 1.3344,
"mean_token_accuracy": 0.7115135788917542,
"step": 120
},
{
"epoch": 0.3324175824175824,
"grad_norm": 12.464367866516113,
"learning_rate": 9.881269293672706e-06,
"loss": 1.6151,
"mean_token_accuracy": 0.6619115471839905,
"step": 121
},
{
"epoch": 0.33516483516483514,
"grad_norm": 11.000343322753906,
"learning_rate": 9.87629367520058e-06,
"loss": 1.4913,
"mean_token_accuracy": 0.6726457476615906,
"step": 122
},
{
"epoch": 0.33791208791208793,
"grad_norm": 11.648791313171387,
"learning_rate": 9.871217381465904e-06,
"loss": 1.5451,
"mean_token_accuracy": 0.6589806079864502,
"step": 123
},
{
"epoch": 0.34065934065934067,
"grad_norm": 11.940444946289062,
"learning_rate": 9.866040529247224e-06,
"loss": 1.342,
"mean_token_accuracy": 0.7100591659545898,
"step": 124
},
{
"epoch": 0.3434065934065934,
"grad_norm": 11.262575149536133,
"learning_rate": 9.860763237636397e-06,
"loss": 1.3526,
"mean_token_accuracy": 0.6929922103881836,
"step": 125
},
{
"epoch": 0.34615384615384615,
"grad_norm": 10.675347328186035,
"learning_rate": 9.855385628035866e-06,
"loss": 1.0643,
"mean_token_accuracy": 0.7545126080513,
"step": 126
},
{
"epoch": 0.3489010989010989,
"grad_norm": 12.187606811523438,
"learning_rate": 9.849907824155847e-06,
"loss": 1.4871,
"mean_token_accuracy": 0.6959620118141174,
"step": 127
},
{
"epoch": 0.3516483516483517,
"grad_norm": 12.919577598571777,
"learning_rate": 9.844329952011506e-06,
"loss": 1.8205,
"mean_token_accuracy": 0.6162227392196655,
"step": 128
},
{
"epoch": 0.3543956043956044,
"grad_norm": 12.444584846496582,
"learning_rate": 9.838652139920032e-06,
"loss": 1.5858,
"mean_token_accuracy": 0.6922005414962769,
"step": 129
},
{
"epoch": 0.35714285714285715,
"grad_norm": 11.21202564239502,
"learning_rate": 9.832874518497718e-06,
"loss": 1.2042,
"mean_token_accuracy": 0.7413793206214905,
"step": 130
},
{
"epoch": 0.3598901098901099,
"grad_norm": 10.988180160522461,
"learning_rate": 9.826997220656925e-06,
"loss": 1.289,
"mean_token_accuracy": 0.7080214023590088,
"step": 131
},
{
"epoch": 0.3626373626373626,
"grad_norm": 12.33310604095459,
"learning_rate": 9.821020381603052e-06,
"loss": 1.4773,
"mean_token_accuracy": 0.6791208982467651,
"step": 132
},
{
"epoch": 0.36538461538461536,
"grad_norm": 11.341861724853516,
"learning_rate": 9.814944138831402e-06,
"loss": 1.4338,
"mean_token_accuracy": 0.6756476759910583,
"step": 133
},
{
"epoch": 0.36813186813186816,
"grad_norm": 11.409241676330566,
"learning_rate": 9.808768632124033e-06,
"loss": 1.6764,
"mean_token_accuracy": 0.6054852604866028,
"step": 134
},
{
"epoch": 0.3708791208791209,
"grad_norm": 12.015110969543457,
"learning_rate": 9.802494003546537e-06,
"loss": 1.4402,
"mean_token_accuracy": 0.7052023410797119,
"step": 135
},
{
"epoch": 0.37362637362637363,
"grad_norm": 10.703707695007324,
"learning_rate": 9.79612039744478e-06,
"loss": 1.3956,
"mean_token_accuracy": 0.6877990365028381,
"step": 136
},
{
"epoch": 0.37637362637362637,
"grad_norm": 9.813762664794922,
"learning_rate": 9.789647960441567e-06,
"loss": 1.0888,
"mean_token_accuracy": 0.7485515475273132,
"step": 137
},
{
"epoch": 0.3791208791208791,
"grad_norm": 11.24870777130127,
"learning_rate": 9.78307684143328e-06,
"loss": 1.5276,
"mean_token_accuracy": 0.6972833275794983,
"step": 138
},
{
"epoch": 0.38186813186813184,
"grad_norm": 13.590780258178711,
"learning_rate": 9.77640719158645e-06,
"loss": 1.5919,
"mean_token_accuracy": 0.6567164063453674,
"step": 139
},
{
"epoch": 0.38461538461538464,
"grad_norm": 13.572460174560547,
"learning_rate": 9.769639164334279e-06,
"loss": 1.7118,
"mean_token_accuracy": 0.6849489808082581,
"step": 140
},
{
"epoch": 0.3873626373626374,
"grad_norm": 11.5997896194458,
"learning_rate": 9.76277291537311e-06,
"loss": 1.279,
"mean_token_accuracy": 0.7191435694694519,
"step": 141
},
{
"epoch": 0.3901098901098901,
"grad_norm": 10.374162673950195,
"learning_rate": 9.75580860265884e-06,
"loss": 1.0923,
"mean_token_accuracy": 0.7603210806846619,
"step": 142
},
{
"epoch": 0.39285714285714285,
"grad_norm": 11.554888725280762,
"learning_rate": 9.748746386403308e-06,
"loss": 1.2049,
"mean_token_accuracy": 0.7256177067756653,
"step": 143
},
{
"epoch": 0.3956043956043956,
"grad_norm": 12.095945358276367,
"learning_rate": 9.741586429070574e-06,
"loss": 1.772,
"mean_token_accuracy": 0.6329723000526428,
"step": 144
},
{
"epoch": 0.3983516483516483,
"grad_norm": 11.040619850158691,
"learning_rate": 9.73432889537321e-06,
"loss": 1.2994,
"mean_token_accuracy": 0.7119628190994263,
"step": 145
},
{
"epoch": 0.4010989010989011,
"grad_norm": 13.869733810424805,
"learning_rate": 9.726973952268507e-06,
"loss": 1.9615,
"mean_token_accuracy": 0.5833333134651184,
"step": 146
},
{
"epoch": 0.40384615384615385,
"grad_norm": 16.08857536315918,
"learning_rate": 9.719521768954615e-06,
"loss": 1.767,
"mean_token_accuracy": 0.6537982821464539,
"step": 147
},
{
"epoch": 0.4065934065934066,
"grad_norm": 11.085786819458008,
"learning_rate": 9.71197251686668e-06,
"loss": 1.4405,
"mean_token_accuracy": 0.6847457885742188,
"step": 148
},
{
"epoch": 0.40934065934065933,
"grad_norm": 11.921771049499512,
"learning_rate": 9.704326369672872e-06,
"loss": 1.092,
"mean_token_accuracy": 0.7311015129089355,
"step": 149
},
{
"epoch": 0.41208791208791207,
"grad_norm": 12.389166831970215,
"learning_rate": 9.696583503270409e-06,
"loss": 1.3396,
"mean_token_accuracy": 0.6969001293182373,
"step": 150
},
{
"epoch": 0.41483516483516486,
"grad_norm": 12.603693008422852,
"learning_rate": 9.688744095781501e-06,
"loss": 1.494,
"mean_token_accuracy": 0.6674938201904297,
"step": 151
},
{
"epoch": 0.4175824175824176,
"grad_norm": 12.303590774536133,
"learning_rate": 9.680808327549261e-06,
"loss": 1.3313,
"mean_token_accuracy": 0.7104136943817139,
"step": 152
},
{
"epoch": 0.42032967032967034,
"grad_norm": 11.780409812927246,
"learning_rate": 9.672776381133541e-06,
"loss": 1.3221,
"mean_token_accuracy": 0.7101449370384216,
"step": 153
},
{
"epoch": 0.4230769230769231,
"grad_norm": 11.474019050598145,
"learning_rate": 9.664648441306753e-06,
"loss": 1.5544,
"mean_token_accuracy": 0.6828644275665283,
"step": 154
},
{
"epoch": 0.4258241758241758,
"grad_norm": 11.8672456741333,
"learning_rate": 9.656424695049597e-06,
"loss": 1.6288,
"mean_token_accuracy": 0.6559263467788696,
"step": 155
},
{
"epoch": 0.42857142857142855,
"grad_norm": 11.574856758117676,
"learning_rate": 9.648105331546778e-06,
"loss": 1.1779,
"mean_token_accuracy": 0.7253766059875488,
"step": 156
},
{
"epoch": 0.43131868131868134,
"grad_norm": 13.268657684326172,
"learning_rate": 9.639690542182643e-06,
"loss": 1.6477,
"mean_token_accuracy": 0.6597131490707397,
"step": 157
},
{
"epoch": 0.4340659340659341,
"grad_norm": 11.048521995544434,
"learning_rate": 9.631180520536778e-06,
"loss": 1.476,
"mean_token_accuracy": 0.703125,
"step": 158
},
{
"epoch": 0.4368131868131868,
"grad_norm": 10.296670913696289,
"learning_rate": 9.622575462379562e-06,
"loss": 1.128,
"mean_token_accuracy": 0.7398906946182251,
"step": 159
},
{
"epoch": 0.43956043956043955,
"grad_norm": 10.901885986328125,
"learning_rate": 9.613875565667655e-06,
"loss": 1.4029,
"mean_token_accuracy": 0.6959537863731384,
"step": 160
},
{
"epoch": 0.4423076923076923,
"grad_norm": 11.685770988464355,
"learning_rate": 9.605081030539453e-06,
"loss": 1.4721,
"mean_token_accuracy": 0.6764705777168274,
"step": 161
},
{
"epoch": 0.44505494505494503,
"grad_norm": 12.1082763671875,
"learning_rate": 9.596192059310475e-06,
"loss": 1.4389,
"mean_token_accuracy": 0.7186761498451233,
"step": 162
},
{
"epoch": 0.4478021978021978,
"grad_norm": 11.559240341186523,
"learning_rate": 9.587208856468715e-06,
"loss": 1.5186,
"mean_token_accuracy": 0.6510416865348816,
"step": 163
},
{
"epoch": 0.45054945054945056,
"grad_norm": 11.276078224182129,
"learning_rate": 9.578131628669936e-06,
"loss": 1.3336,
"mean_token_accuracy": 0.7079530358314514,
"step": 164
},
{
"epoch": 0.4532967032967033,
"grad_norm": 13.037327766418457,
"learning_rate": 9.568960584732912e-06,
"loss": 1.5361,
"mean_token_accuracy": 0.6537467837333679,
"step": 165
},
{
"epoch": 0.45604395604395603,
"grad_norm": 13.014458656311035,
"learning_rate": 9.559695935634636e-06,
"loss": 1.7156,
"mean_token_accuracy": 0.6452054977416992,
"step": 166
},
{
"epoch": 0.45879120879120877,
"grad_norm": 12.075533866882324,
"learning_rate": 9.550337894505446e-06,
"loss": 1.2952,
"mean_token_accuracy": 0.7117853164672852,
"step": 167
},
{
"epoch": 0.46153846153846156,
"grad_norm": 12.357090950012207,
"learning_rate": 9.540886676624145e-06,
"loss": 1.3997,
"mean_token_accuracy": 0.6951871514320374,
"step": 168
},
{
"epoch": 0.4642857142857143,
"grad_norm": 11.073513984680176,
"learning_rate": 9.531342499413034e-06,
"loss": 1.1314,
"mean_token_accuracy": 0.7375144958496094,
"step": 169
},
{
"epoch": 0.46703296703296704,
"grad_norm": 12.422121047973633,
"learning_rate": 9.521705582432915e-06,
"loss": 1.7004,
"mean_token_accuracy": 0.6406821012496948,
"step": 170
},
{
"epoch": 0.4697802197802198,
"grad_norm": 11.14840030670166,
"learning_rate": 9.511976147378038e-06,
"loss": 1.3761,
"mean_token_accuracy": 0.709113597869873,
"step": 171
},
{
"epoch": 0.4725274725274725,
"grad_norm": 11.885308265686035,
"learning_rate": 9.502154418071002e-06,
"loss": 1.2528,
"mean_token_accuracy": 0.7129629850387573,
"step": 172
},
{
"epoch": 0.47527472527472525,
"grad_norm": 11.952164649963379,
"learning_rate": 9.492240620457609e-06,
"loss": 1.2345,
"mean_token_accuracy": 0.7546296119689941,
"step": 173
},
{
"epoch": 0.47802197802197804,
"grad_norm": 12.823383331298828,
"learning_rate": 9.48223498260166e-06,
"loss": 1.3316,
"mean_token_accuracy": 0.7302383780479431,
"step": 174
},
{
"epoch": 0.4807692307692308,
"grad_norm": 11.896831512451172,
"learning_rate": 9.472137734679715e-06,
"loss": 1.5243,
"mean_token_accuracy": 0.6763284802436829,
"step": 175
},
{
"epoch": 0.4835164835164835,
"grad_norm": 12.501947402954102,
"learning_rate": 9.461949108975794e-06,
"loss": 1.896,
"mean_token_accuracy": 0.6086404323577881,
"step": 176
},
{
"epoch": 0.48626373626373626,
"grad_norm": 11.727622032165527,
"learning_rate": 9.45166933987603e-06,
"loss": 1.306,
"mean_token_accuracy": 0.7123456597328186,
"step": 177
},
{
"epoch": 0.489010989010989,
"grad_norm": 11.941061019897461,
"learning_rate": 9.44129866386329e-06,
"loss": 1.2725,
"mean_token_accuracy": 0.7091412544250488,
"step": 178
},
{
"epoch": 0.49175824175824173,
"grad_norm": 13.788084030151367,
"learning_rate": 9.430837319511718e-06,
"loss": 1.6164,
"mean_token_accuracy": 0.6568502187728882,
"step": 179
},
{
"epoch": 0.4945054945054945,
"grad_norm": 10.0413818359375,
"learning_rate": 9.420285547481257e-06,
"loss": 1.3372,
"mean_token_accuracy": 0.6839577555656433,
"step": 180
},
{
"epoch": 0.49725274725274726,
"grad_norm": 11.216383934020996,
"learning_rate": 9.409643590512116e-06,
"loss": 1.3565,
"mean_token_accuracy": 0.7152317762374878,
"step": 181
},
{
"epoch": 0.5,
"grad_norm": 12.224161148071289,
"learning_rate": 9.398911693419168e-06,
"loss": 1.2158,
"mean_token_accuracy": 0.7243436574935913,
"step": 182
},
{
"epoch": 0.5027472527472527,
"grad_norm": 10.415431022644043,
"learning_rate": 9.388090103086344e-06,
"loss": 1.2495,
"mean_token_accuracy": 0.7525423765182495,
"step": 183
},
{
"epoch": 0.5054945054945055,
"grad_norm": 12.435955047607422,
"learning_rate": 9.37717906846093e-06,
"loss": 1.3597,
"mean_token_accuracy": 0.7139272093772888,
"step": 184
},
{
"epoch": 0.5082417582417582,
"grad_norm": 11.92297649383545,
"learning_rate": 9.366178840547853e-06,
"loss": 1.2254,
"mean_token_accuracy": 0.7317647337913513,
"step": 185
},
{
"epoch": 0.510989010989011,
"grad_norm": 11.196688652038574,
"learning_rate": 9.355089672403905e-06,
"loss": 1.2686,
"mean_token_accuracy": 0.7352586984634399,
"step": 186
},
{
"epoch": 0.5137362637362637,
"grad_norm": 11.09434986114502,
"learning_rate": 9.343911819131918e-06,
"loss": 1.1463,
"mean_token_accuracy": 0.7363420724868774,
"step": 187
},
{
"epoch": 0.5164835164835165,
"grad_norm": 10.482327461242676,
"learning_rate": 9.332645537874901e-06,
"loss": 1.2708,
"mean_token_accuracy": 0.7420091032981873,
"step": 188
},
{
"epoch": 0.5192307692307693,
"grad_norm": 11.183061599731445,
"learning_rate": 9.321291087810115e-06,
"loss": 1.1896,
"mean_token_accuracy": 0.7237977981567383,
"step": 189
},
{
"epoch": 0.521978021978022,
"grad_norm": 12.56871223449707,
"learning_rate": 9.309848730143122e-06,
"loss": 1.4552,
"mean_token_accuracy": 0.6875817179679871,
"step": 190
},
{
"epoch": 0.5247252747252747,
"grad_norm": 12.282219886779785,
"learning_rate": 9.298318728101769e-06,
"loss": 1.6814,
"mean_token_accuracy": 0.6299212574958801,
"step": 191
},
{
"epoch": 0.5274725274725275,
"grad_norm": 11.732946395874023,
"learning_rate": 9.286701346930134e-06,
"loss": 1.3322,
"mean_token_accuracy": 0.6991474032402039,
"step": 192
},
{
"epoch": 0.5302197802197802,
"grad_norm": 11.771052360534668,
"learning_rate": 9.274996853882426e-06,
"loss": 1.5943,
"mean_token_accuracy": 0.6880733966827393,
"step": 193
},
{
"epoch": 0.532967032967033,
"grad_norm": 12.6982421875,
"learning_rate": 9.263205518216834e-06,
"loss": 1.4098,
"mean_token_accuracy": 0.6879063844680786,
"step": 194
},
{
"epoch": 0.5357142857142857,
"grad_norm": 12.095486640930176,
"learning_rate": 9.251327611189333e-06,
"loss": 1.4982,
"mean_token_accuracy": 0.6825208067893982,
"step": 195
},
{
"epoch": 0.5384615384615384,
"grad_norm": 10.93848991394043,
"learning_rate": 9.239363406047446e-06,
"loss": 1.3762,
"mean_token_accuracy": 0.6978672742843628,
"step": 196
},
{
"epoch": 0.5412087912087912,
"grad_norm": 10.177602767944336,
"learning_rate": 9.227313178023962e-06,
"loss": 1.2093,
"mean_token_accuracy": 0.7204058766365051,
"step": 197
},
{
"epoch": 0.5439560439560439,
"grad_norm": 11.405716896057129,
"learning_rate": 9.21517720433059e-06,
"loss": 1.4351,
"mean_token_accuracy": 0.692307710647583,
"step": 198
},
{
"epoch": 0.5467032967032966,
"grad_norm": 11.433396339416504,
"learning_rate": 9.202955764151597e-06,
"loss": 1.2367,
"mean_token_accuracy": 0.7333333492279053,
"step": 199
},
{
"epoch": 0.5494505494505495,
"grad_norm": 11.622626304626465,
"learning_rate": 9.190649138637378e-06,
"loss": 1.3771,
"mean_token_accuracy": 0.707454264163971,
"step": 200
},
{
"epoch": 0.5521978021978022,
"grad_norm": 10.980391502380371,
"learning_rate": 9.178257610897996e-06,
"loss": 1.3038,
"mean_token_accuracy": 0.6971365809440613,
"step": 201
},
{
"epoch": 0.554945054945055,
"grad_norm": 11.408574104309082,
"learning_rate": 9.16578146599665e-06,
"loss": 1.2083,
"mean_token_accuracy": 0.7157106995582581,
"step": 202
},
{
"epoch": 0.5576923076923077,
"grad_norm": 11.354791641235352,
"learning_rate": 9.153220990943147e-06,
"loss": 1.4099,
"mean_token_accuracy": 0.7070063948631287,
"step": 203
},
{
"epoch": 0.5604395604395604,
"grad_norm": 11.774231910705566,
"learning_rate": 9.140576474687263e-06,
"loss": 1.4604,
"mean_token_accuracy": 0.6878378391265869,
"step": 204
},
{
"epoch": 0.5631868131868132,
"grad_norm": 10.277194023132324,
"learning_rate": 9.127848208112135e-06,
"loss": 1.1073,
"mean_token_accuracy": 0.7497337460517883,
"step": 205
},
{
"epoch": 0.5659340659340659,
"grad_norm": 11.545296669006348,
"learning_rate": 9.115036484027537e-06,
"loss": 1.4562,
"mean_token_accuracy": 0.7032085657119751,
"step": 206
},
{
"epoch": 0.5686813186813187,
"grad_norm": 12.219311714172363,
"learning_rate": 9.10214159716316e-06,
"loss": 1.4314,
"mean_token_accuracy": 0.6811988949775696,
"step": 207
},
{
"epoch": 0.5714285714285714,
"grad_norm": 12.188143730163574,
"learning_rate": 9.08916384416183e-06,
"loss": 1.4823,
"mean_token_accuracy": 0.6658130884170532,
"step": 208
},
{
"epoch": 0.5741758241758241,
"grad_norm": 11.3258056640625,
"learning_rate": 9.076103523572685e-06,
"loss": 1.1717,
"mean_token_accuracy": 0.7044854760169983,
"step": 209
},
{
"epoch": 0.5769230769230769,
"grad_norm": 10.30764102935791,
"learning_rate": 9.0629609358443e-06,
"loss": 1.0972,
"mean_token_accuracy": 0.7533252835273743,
"step": 210
},
{
"epoch": 0.5796703296703297,
"grad_norm": 12.15485954284668,
"learning_rate": 9.049736383317777e-06,
"loss": 1.4388,
"mean_token_accuracy": 0.6901408433914185,
"step": 211
},
{
"epoch": 0.5824175824175825,
"grad_norm": 10.234277725219727,
"learning_rate": 9.0364301702198e-06,
"loss": 1.6417,
"mean_token_accuracy": 0.6692913174629211,
"step": 212
},
{
"epoch": 0.5851648351648352,
"grad_norm": 10.58019733428955,
"learning_rate": 9.023042602655624e-06,
"loss": 1.2144,
"mean_token_accuracy": 0.7276995182037354,
"step": 213
},
{
"epoch": 0.5879120879120879,
"grad_norm": 11.242633819580078,
"learning_rate": 9.009573988602042e-06,
"loss": 1.3192,
"mean_token_accuracy": 0.6910569071769714,
"step": 214
},
{
"epoch": 0.5906593406593407,
"grad_norm": 14.42585277557373,
"learning_rate": 8.99602463790029e-06,
"loss": 1.7263,
"mean_token_accuracy": 0.6614060401916504,
"step": 215
},
{
"epoch": 0.5934065934065934,
"grad_norm": 12.312402725219727,
"learning_rate": 8.98239486224893e-06,
"loss": 1.2264,
"mean_token_accuracy": 0.7335203289985657,
"step": 216
},
{
"epoch": 0.5961538461538461,
"grad_norm": 11.044991493225098,
"learning_rate": 8.968684975196673e-06,
"loss": 1.6332,
"mean_token_accuracy": 0.6666666865348816,
"step": 217
},
{
"epoch": 0.5989010989010989,
"grad_norm": 12.098809242248535,
"learning_rate": 8.954895292135171e-06,
"loss": 1.3326,
"mean_token_accuracy": 0.715923547744751,
"step": 218
},
{
"epoch": 0.6016483516483516,
"grad_norm": 10.962403297424316,
"learning_rate": 8.94102613029175e-06,
"loss": 1.1455,
"mean_token_accuracy": 0.7470588088035583,
"step": 219
},
{
"epoch": 0.6043956043956044,
"grad_norm": 13.998690605163574,
"learning_rate": 8.927077808722127e-06,
"loss": 1.5056,
"mean_token_accuracy": 0.6980891823768616,
"step": 220
},
{
"epoch": 0.6071428571428571,
"grad_norm": 13.010772705078125,
"learning_rate": 8.913050648303064e-06,
"loss": 1.5572,
"mean_token_accuracy": 0.6395863890647888,
"step": 221
},
{
"epoch": 0.6098901098901099,
"grad_norm": 11.286635398864746,
"learning_rate": 8.898944971724983e-06,
"loss": 1.5845,
"mean_token_accuracy": 0.6471264362335205,
"step": 222
},
{
"epoch": 0.6126373626373627,
"grad_norm": 10.34750747680664,
"learning_rate": 8.884761103484548e-06,
"loss": 1.31,
"mean_token_accuracy": 0.7002262473106384,
"step": 223
},
{
"epoch": 0.6153846153846154,
"grad_norm": 11.606867790222168,
"learning_rate": 8.870499369877194e-06,
"loss": 1.2979,
"mean_token_accuracy": 0.7079856991767883,
"step": 224
},
{
"epoch": 0.6181318681318682,
"grad_norm": 11.105297088623047,
"learning_rate": 8.85616009898963e-06,
"loss": 1.1587,
"mean_token_accuracy": 0.7566539645195007,
"step": 225
},
{
"epoch": 0.6208791208791209,
"grad_norm": 11.666239738464355,
"learning_rate": 8.841743620692279e-06,
"loss": 1.1036,
"mean_token_accuracy": 0.7243674993515015,
"step": 226
},
{
"epoch": 0.6236263736263736,
"grad_norm": 11.083747863769531,
"learning_rate": 8.827250266631704e-06,
"loss": 1.2543,
"mean_token_accuracy": 0.7404980063438416,
"step": 227
},
{
"epoch": 0.6263736263736264,
"grad_norm": 10.682806968688965,
"learning_rate": 8.81268037022296e-06,
"loss": 1.2809,
"mean_token_accuracy": 0.711448609828949,
"step": 228
},
{
"epoch": 0.6291208791208791,
"grad_norm": 9.957572937011719,
"learning_rate": 8.798034266641948e-06,
"loss": 1.226,
"mean_token_accuracy": 0.7218863368034363,
"step": 229
},
{
"epoch": 0.6318681318681318,
"grad_norm": 11.039960861206055,
"learning_rate": 8.783312292817681e-06,
"loss": 1.3115,
"mean_token_accuracy": 0.722300112247467,
"step": 230
},
{
"epoch": 0.6346153846153846,
"grad_norm": 12.31747055053711,
"learning_rate": 8.768514787424548e-06,
"loss": 1.6285,
"mean_token_accuracy": 0.643410861492157,
"step": 231
},
{
"epoch": 0.6373626373626373,
"grad_norm": 11.145745277404785,
"learning_rate": 8.753642090874516e-06,
"loss": 1.1148,
"mean_token_accuracy": 0.7636138796806335,
"step": 232
},
{
"epoch": 0.6401098901098901,
"grad_norm": 13.070597648620605,
"learning_rate": 8.7386945453093e-06,
"loss": 1.5973,
"mean_token_accuracy": 0.6937984228134155,
"step": 233
},
{
"epoch": 0.6428571428571429,
"grad_norm": 10.66708755493164,
"learning_rate": 8.723672494592497e-06,
"loss": 0.9787,
"mean_token_accuracy": 0.7600487470626831,
"step": 234
},
{
"epoch": 0.6456043956043956,
"grad_norm": 11.117300033569336,
"learning_rate": 8.708576284301668e-06,
"loss": 1.2219,
"mean_token_accuracy": 0.7341614961624146,
"step": 235
},
{
"epoch": 0.6483516483516484,
"grad_norm": 11.09400749206543,
"learning_rate": 8.693406261720392e-06,
"loss": 1.3723,
"mean_token_accuracy": 0.7157622575759888,
"step": 236
},
{
"epoch": 0.6510989010989011,
"grad_norm": 11.477158546447754,
"learning_rate": 8.67816277583028e-06,
"loss": 1.0769,
"mean_token_accuracy": 0.7450980544090271,
"step": 237
},
{
"epoch": 0.6538461538461539,
"grad_norm": 13.270724296569824,
"learning_rate": 8.66284617730294e-06,
"loss": 1.5841,
"mean_token_accuracy": 0.6744487881660461,
"step": 238
},
{
"epoch": 0.6565934065934066,
"grad_norm": 10.172012329101562,
"learning_rate": 8.647456818491912e-06,
"loss": 1.1889,
"mean_token_accuracy": 0.7316538691520691,
"step": 239
},
{
"epoch": 0.6593406593406593,
"grad_norm": 12.0361967086792,
"learning_rate": 8.63199505342457e-06,
"loss": 1.1995,
"mean_token_accuracy": 0.740841269493103,
"step": 240
},
{
"epoch": 0.6620879120879121,
"grad_norm": 12.539520263671875,
"learning_rate": 8.616461237793962e-06,
"loss": 1.2821,
"mean_token_accuracy": 0.7157490253448486,
"step": 241
},
{
"epoch": 0.6648351648351648,
"grad_norm": 10.934772491455078,
"learning_rate": 8.600855728950645e-06,
"loss": 1.373,
"mean_token_accuracy": 0.7006896734237671,
"step": 242
},
{
"epoch": 0.6675824175824175,
"grad_norm": 8.917642593383789,
"learning_rate": 8.585178885894451e-06,
"loss": 1.314,
"mean_token_accuracy": 0.7070063948631287,
"step": 243
},
{
"epoch": 0.6703296703296703,
"grad_norm": 11.78652572631836,
"learning_rate": 8.569431069266236e-06,
"loss": 1.5815,
"mean_token_accuracy": 0.6636713743209839,
"step": 244
},
{
"epoch": 0.6730769230769231,
"grad_norm": 10.088194847106934,
"learning_rate": 8.553612641339577e-06,
"loss": 1.3323,
"mean_token_accuracy": 0.6963562965393066,
"step": 245
},
{
"epoch": 0.6758241758241759,
"grad_norm": 10.238805770874023,
"learning_rate": 8.537723966012444e-06,
"loss": 1.0912,
"mean_token_accuracy": 0.7577962875366211,
"step": 246
},
{
"epoch": 0.6785714285714286,
"grad_norm": 10.34375,
"learning_rate": 8.521765408798828e-06,
"loss": 1.2058,
"mean_token_accuracy": 0.7399346828460693,
"step": 247
},
{
"epoch": 0.6813186813186813,
"grad_norm": 10.886113166809082,
"learning_rate": 8.505737336820327e-06,
"loss": 1.3668,
"mean_token_accuracy": 0.7172582745552063,
"step": 248
},
{
"epoch": 0.6840659340659341,
"grad_norm": 10.41820240020752,
"learning_rate": 8.48964011879771e-06,
"loss": 1.0878,
"mean_token_accuracy": 0.7511904835700989,
"step": 249
},
{
"epoch": 0.6868131868131868,
"grad_norm": 11.025300025939941,
"learning_rate": 8.473474125042424e-06,
"loss": 1.2859,
"mean_token_accuracy": 0.7300613522529602,
"step": 250
},
{
"epoch": 0.6895604395604396,
"grad_norm": 11.369904518127441,
"learning_rate": 8.457239727448083e-06,
"loss": 1.3666,
"mean_token_accuracy": 0.676508367061615,
"step": 251
},
{
"epoch": 0.6923076923076923,
"grad_norm": 13.08974838256836,
"learning_rate": 8.440937299481906e-06,
"loss": 1.5691,
"mean_token_accuracy": 0.6548347473144531,
"step": 252
},
{
"epoch": 0.695054945054945,
"grad_norm": 12.096307754516602,
"learning_rate": 8.424567216176132e-06,
"loss": 1.4822,
"mean_token_accuracy": 0.6430939435958862,
"step": 253
},
{
"epoch": 0.6978021978021978,
"grad_norm": 33.42136001586914,
"learning_rate": 8.408129854119395e-06,
"loss": 1.4324,
"mean_token_accuracy": 0.6841530203819275,
"step": 254
},
{
"epoch": 0.7005494505494505,
"grad_norm": 10.152215957641602,
"learning_rate": 8.391625591448044e-06,
"loss": 1.4295,
"mean_token_accuracy": 0.6865127682685852,
"step": 255
},
{
"epoch": 0.7032967032967034,
"grad_norm": 10.057063102722168,
"learning_rate": 8.375054807837466e-06,
"loss": 1.2096,
"mean_token_accuracy": 0.7173252105712891,
"step": 256
},
{
"epoch": 0.7060439560439561,
"grad_norm": 9.629958152770996,
"learning_rate": 8.358417884493336e-06,
"loss": 1.179,
"mean_token_accuracy": 0.725824773311615,
"step": 257
},
{
"epoch": 0.7087912087912088,
"grad_norm": 10.766592979431152,
"learning_rate": 8.341715204142854e-06,
"loss": 1.0063,
"mean_token_accuracy": 0.7549751400947571,
"step": 258
},
{
"epoch": 0.7115384615384616,
"grad_norm": 11.319416999816895,
"learning_rate": 8.324947151025941e-06,
"loss": 1.0844,
"mean_token_accuracy": 0.75,
"step": 259
},
{
"epoch": 0.7142857142857143,
"grad_norm": 11.977400779724121,
"learning_rate": 8.308114110886397e-06,
"loss": 1.5588,
"mean_token_accuracy": 0.656862735748291,
"step": 260
},
{
"epoch": 0.717032967032967,
"grad_norm": 16.977502822875977,
"learning_rate": 8.291216470963026e-06,
"loss": 0.9755,
"mean_token_accuracy": 0.7794561982154846,
"step": 261
},
{
"epoch": 0.7197802197802198,
"grad_norm": 11.817448616027832,
"learning_rate": 8.274254619980728e-06,
"loss": 1.2085,
"mean_token_accuracy": 0.7370466589927673,
"step": 262
},
{
"epoch": 0.7225274725274725,
"grad_norm": 12.315993309020996,
"learning_rate": 8.257228948141569e-06,
"loss": 1.3312,
"mean_token_accuracy": 0.6979020833969116,
"step": 263
},
{
"epoch": 0.7252747252747253,
"grad_norm": 12.558300018310547,
"learning_rate": 8.24013984711578e-06,
"loss": 1.2436,
"mean_token_accuracy": 0.7176966071128845,
"step": 264
},
{
"epoch": 0.728021978021978,
"grad_norm": 18.78453254699707,
"learning_rate": 8.22298771003277e-06,
"loss": 1.3972,
"mean_token_accuracy": 0.7003994584083557,
"step": 265
},
{
"epoch": 0.7307692307692307,
"grad_norm": 11.667128562927246,
"learning_rate": 8.205772931472068e-06,
"loss": 1.2294,
"mean_token_accuracy": 0.7415881752967834,
"step": 266
},
{
"epoch": 0.7335164835164835,
"grad_norm": 11.029336929321289,
"learning_rate": 8.188495907454253e-06,
"loss": 1.3288,
"mean_token_accuracy": 0.7267441749572754,
"step": 267
},
{
"epoch": 0.7362637362637363,
"grad_norm": 11.00023365020752,
"learning_rate": 8.171157035431842e-06,
"loss": 1.4495,
"mean_token_accuracy": 0.6796338558197021,
"step": 268
},
{
"epoch": 0.739010989010989,
"grad_norm": 12.021223068237305,
"learning_rate": 8.153756714280143e-06,
"loss": 1.5251,
"mean_token_accuracy": 0.6521739363670349,
"step": 269
},
{
"epoch": 0.7417582417582418,
"grad_norm": 11.635795593261719,
"learning_rate": 8.13629534428808e-06,
"loss": 1.488,
"mean_token_accuracy": 0.6854636669158936,
"step": 270
},
{
"epoch": 0.7445054945054945,
"grad_norm": 11.152154922485352,
"learning_rate": 8.118773327148994e-06,
"loss": 0.9805,
"mean_token_accuracy": 0.7692307829856873,
"step": 271
},
{
"epoch": 0.7472527472527473,
"grad_norm": 11.193713188171387,
"learning_rate": 8.101191065951388e-06,
"loss": 1.482,
"mean_token_accuracy": 0.6787003874778748,
"step": 272
},
{
"epoch": 0.75,
"grad_norm": 10.59757137298584,
"learning_rate": 8.083548965169663e-06,
"loss": 1.2458,
"mean_token_accuracy": 0.7222856879234314,
"step": 273
},
{
"epoch": 0.7527472527472527,
"grad_norm": 11.860565185546875,
"learning_rate": 8.065847430654813e-06,
"loss": 1.3717,
"mean_token_accuracy": 0.6764705777168274,
"step": 274
},
{
"epoch": 0.7554945054945055,
"grad_norm": 12.620820045471191,
"learning_rate": 8.048086869625081e-06,
"loss": 1.3039,
"mean_token_accuracy": 0.6946778893470764,
"step": 275
},
{
"epoch": 0.7582417582417582,
"grad_norm": 10.615486145019531,
"learning_rate": 8.0302676906566e-06,
"loss": 1.0608,
"mean_token_accuracy": 0.7779126167297363,
"step": 276
},
{
"epoch": 0.760989010989011,
"grad_norm": 11.107016563415527,
"learning_rate": 8.012390303673994e-06,
"loss": 1.4002,
"mean_token_accuracy": 0.6892052292823792,
"step": 277
},
{
"epoch": 0.7637362637362637,
"grad_norm": 13.737344741821289,
"learning_rate": 7.994455119940936e-06,
"loss": 1.5131,
"mean_token_accuracy": 0.6741293668746948,
"step": 278
},
{
"epoch": 0.7664835164835165,
"grad_norm": 14.99168586730957,
"learning_rate": 7.976462552050696e-06,
"loss": 1.3036,
"mean_token_accuracy": 0.6979637742042542,
"step": 279
},
{
"epoch": 0.7692307692307693,
"grad_norm": 11.542069435119629,
"learning_rate": 7.958413013916657e-06,
"loss": 1.3555,
"mean_token_accuracy": 0.7072808146476746,
"step": 280
},
{
"epoch": 0.771978021978022,
"grad_norm": 11.233433723449707,
"learning_rate": 7.94030692076278e-06,
"loss": 1.2378,
"mean_token_accuracy": 0.714981734752655,
"step": 281
},
{
"epoch": 0.7747252747252747,
"grad_norm": 12.117866516113281,
"learning_rate": 7.92214468911405e-06,
"loss": 1.3431,
"mean_token_accuracy": 0.6927152276039124,
"step": 282
},
{
"epoch": 0.7774725274725275,
"grad_norm": 12.193938255310059,
"learning_rate": 7.903926736786908e-06,
"loss": 1.2657,
"mean_token_accuracy": 0.7244094610214233,
"step": 283
},
{
"epoch": 0.7802197802197802,
"grad_norm": 10.754172325134277,
"learning_rate": 7.885653482879632e-06,
"loss": 1.1887,
"mean_token_accuracy": 0.7329341173171997,
"step": 284
},
{
"epoch": 0.782967032967033,
"grad_norm": 10.909982681274414,
"learning_rate": 7.867325347762694e-06,
"loss": 1.1202,
"mean_token_accuracy": 0.7314148545265198,
"step": 285
},
{
"epoch": 0.7857142857142857,
"grad_norm": 10.36418628692627,
"learning_rate": 7.848942753069087e-06,
"loss": 1.2096,
"mean_token_accuracy": 0.7448856830596924,
"step": 286
},
{
"epoch": 0.7884615384615384,
"grad_norm": 10.146966934204102,
"learning_rate": 7.830506121684633e-06,
"loss": 1.1391,
"mean_token_accuracy": 0.7517985701560974,
"step": 287
},
{
"epoch": 0.7912087912087912,
"grad_norm": 10.96828842163086,
"learning_rate": 7.812015877738254e-06,
"loss": 1.5554,
"mean_token_accuracy": 0.6592317223548889,
"step": 288
},
{
"epoch": 0.7939560439560439,
"grad_norm": 10.668539047241211,
"learning_rate": 7.793472446592203e-06,
"loss": 1.1544,
"mean_token_accuracy": 0.7598944306373596,
"step": 289
},
{
"epoch": 0.7967032967032966,
"grad_norm": 10.111360549926758,
"learning_rate": 7.774876254832303e-06,
"loss": 1.2486,
"mean_token_accuracy": 0.7079261541366577,
"step": 290
},
{
"epoch": 0.7994505494505495,
"grad_norm": 10.784381866455078,
"learning_rate": 7.756227730258103e-06,
"loss": 1.0485,
"mean_token_accuracy": 0.7605294585227966,
"step": 291
},
{
"epoch": 0.8021978021978022,
"grad_norm": 12.186707496643066,
"learning_rate": 7.737527301873056e-06,
"loss": 1.367,
"mean_token_accuracy": 0.7229254841804504,
"step": 292
},
{
"epoch": 0.804945054945055,
"grad_norm": 10.163819313049316,
"learning_rate": 7.718775399874655e-06,
"loss": 1.046,
"mean_token_accuracy": 0.7559523582458496,
"step": 293
},
{
"epoch": 0.8076923076923077,
"grad_norm": 11.084784507751465,
"learning_rate": 7.699972455644516e-06,
"loss": 1.1147,
"mean_token_accuracy": 0.7242128252983093,
"step": 294
},
{
"epoch": 0.8104395604395604,
"grad_norm": 9.741537094116211,
"learning_rate": 7.681118901738471e-06,
"loss": 1.0944,
"mean_token_accuracy": 0.7550111413002014,
"step": 295
},
{
"epoch": 0.8131868131868132,
"grad_norm": 10.739821434020996,
"learning_rate": 7.662215171876609e-06,
"loss": 1.0543,
"mean_token_accuracy": 0.755667507648468,
"step": 296
},
{
"epoch": 0.8159340659340659,
"grad_norm": 10.812918663024902,
"learning_rate": 7.643261700933305e-06,
"loss": 1.0856,
"mean_token_accuracy": 0.7506459951400757,
"step": 297
},
{
"epoch": 0.8186813186813187,
"grad_norm": 10.698429107666016,
"learning_rate": 7.624258924927209e-06,
"loss": 1.3178,
"mean_token_accuracy": 0.6916950941085815,
"step": 298
},
{
"epoch": 0.8214285714285714,
"grad_norm": 11.797857284545898,
"learning_rate": 7.605207281011219e-06,
"loss": 1.3815,
"mean_token_accuracy": 0.7045769691467285,
"step": 299
},
{
"epoch": 0.8241758241758241,
"grad_norm": 10.237780570983887,
"learning_rate": 7.5861072074624254e-06,
"loss": 1.2108,
"mean_token_accuracy": 0.7392900586128235,
"step": 300
},
{
"epoch": 0.8269230769230769,
"grad_norm": 11.791534423828125,
"learning_rate": 7.566959143672023e-06,
"loss": 1.3458,
"mean_token_accuracy": 0.6741440296173096,
"step": 301
},
{
"epoch": 0.8296703296703297,
"grad_norm": 11.488512992858887,
"learning_rate": 7.5477635301352115e-06,
"loss": 1.2632,
"mean_token_accuracy": 0.7227332592010498,
"step": 302
},
{
"epoch": 0.8324175824175825,
"grad_norm": 11.239252090454102,
"learning_rate": 7.528520808441058e-06,
"loss": 1.3094,
"mean_token_accuracy": 0.7037887573242188,
"step": 303
},
{
"epoch": 0.8351648351648352,
"grad_norm": 11.873623847961426,
"learning_rate": 7.509231421262333e-06,
"loss": 1.2245,
"mean_token_accuracy": 0.7310924530029297,
"step": 304
},
{
"epoch": 0.8379120879120879,
"grad_norm": 9.97056770324707,
"learning_rate": 7.489895812345335e-06,
"loss": 1.1895,
"mean_token_accuracy": 0.7085106372833252,
"step": 305
},
{
"epoch": 0.8406593406593407,
"grad_norm": 10.721016883850098,
"learning_rate": 7.470514426499681e-06,
"loss": 0.9412,
"mean_token_accuracy": 0.7770859003067017,
"step": 306
},
{
"epoch": 0.8434065934065934,
"grad_norm": 10.802633285522461,
"learning_rate": 7.451087709588069e-06,
"loss": 1.1512,
"mean_token_accuracy": 0.7355460524559021,
"step": 307
},
{
"epoch": 0.8461538461538461,
"grad_norm": 11.425252914428711,
"learning_rate": 7.431616108516022e-06,
"loss": 1.406,
"mean_token_accuracy": 0.6849925518035889,
"step": 308
},
{
"epoch": 0.8489010989010989,
"grad_norm": 9.682330131530762,
"learning_rate": 7.4121000712216165e-06,
"loss": 1.0936,
"mean_token_accuracy": 0.7689873576164246,
"step": 309
},
{
"epoch": 0.8516483516483516,
"grad_norm": 11.848267555236816,
"learning_rate": 7.392540046665161e-06,
"loss": 1.4753,
"mean_token_accuracy": 0.680701732635498,
"step": 310
},
{
"epoch": 0.8543956043956044,
"grad_norm": 10.513921737670898,
"learning_rate": 7.372936484818884e-06,
"loss": 1.1327,
"mean_token_accuracy": 0.7203166484832764,
"step": 311
},
{
"epoch": 0.8571428571428571,
"grad_norm": 10.796121597290039,
"learning_rate": 7.353289836656574e-06,
"loss": 1.1179,
"mean_token_accuracy": 0.7421451807022095,
"step": 312
},
{
"epoch": 0.8598901098901099,
"grad_norm": 9.850513458251953,
"learning_rate": 7.333600554143204e-06,
"loss": 1.4695,
"mean_token_accuracy": 0.6694129705429077,
"step": 313
},
{
"epoch": 0.8626373626373627,
"grad_norm": 10.93211555480957,
"learning_rate": 7.313869090224542e-06,
"loss": 1.4422,
"mean_token_accuracy": 0.6784530282020569,
"step": 314
},
{
"epoch": 0.8653846153846154,
"grad_norm": 9.837514877319336,
"learning_rate": 7.29409589881672e-06,
"loss": 1.3123,
"mean_token_accuracy": 0.7209039330482483,
"step": 315
},
{
"epoch": 0.8681318681318682,
"grad_norm": 11.812731742858887,
"learning_rate": 7.274281434795804e-06,
"loss": 1.4742,
"mean_token_accuracy": 0.6945205330848694,
"step": 316
},
{
"epoch": 0.8708791208791209,
"grad_norm": 12.415311813354492,
"learning_rate": 7.254426153987315e-06,
"loss": 1.4583,
"mean_token_accuracy": 0.6542416214942932,
"step": 317
},
{
"epoch": 0.8736263736263736,
"grad_norm": 9.164080619812012,
"learning_rate": 7.234530513155762e-06,
"loss": 1.2034,
"mean_token_accuracy": 0.7345678806304932,
"step": 318
},
{
"epoch": 0.8763736263736264,
"grad_norm": 10.568696022033691,
"learning_rate": 7.214594969994115e-06,
"loss": 1.3091,
"mean_token_accuracy": 0.6913425326347351,
"step": 319
},
{
"epoch": 0.8791208791208791,
"grad_norm": 11.376129150390625,
"learning_rate": 7.1946199831132905e-06,
"loss": 1.1427,
"mean_token_accuracy": 0.730867326259613,
"step": 320
},
{
"epoch": 0.8818681318681318,
"grad_norm": 10.715977668762207,
"learning_rate": 7.174606012031591e-06,
"loss": 1.052,
"mean_token_accuracy": 0.7385203838348389,
"step": 321
},
{
"epoch": 0.8846153846153846,
"grad_norm": 10.006759643554688,
"learning_rate": 7.154553517164139e-06,
"loss": 1.1576,
"mean_token_accuracy": 0.7474167346954346,
"step": 322
},
{
"epoch": 0.8873626373626373,
"grad_norm": 11.648452758789062,
"learning_rate": 7.134462959812287e-06,
"loss": 1.107,
"mean_token_accuracy": 0.7347995042800903,
"step": 323
},
{
"epoch": 0.8901098901098901,
"grad_norm": 11.925621032714844,
"learning_rate": 7.114334802153003e-06,
"loss": 1.8605,
"mean_token_accuracy": 0.5874384045600891,
"step": 324
},
{
"epoch": 0.8928571428571429,
"grad_norm": 11.52076244354248,
"learning_rate": 7.094169507228236e-06,
"loss": 1.3594,
"mean_token_accuracy": 0.7149321436882019,
"step": 325
},
{
"epoch": 0.8956043956043956,
"grad_norm": 11.350264549255371,
"learning_rate": 7.0739675389342665e-06,
"loss": 1.2657,
"mean_token_accuracy": 0.7225130796432495,
"step": 326
},
{
"epoch": 0.8983516483516484,
"grad_norm": 9.646256446838379,
"learning_rate": 7.053729362011034e-06,
"loss": 1.0548,
"mean_token_accuracy": 0.7713310718536377,
"step": 327
},
{
"epoch": 0.9010989010989011,
"grad_norm": 11.415197372436523,
"learning_rate": 7.033455442031451e-06,
"loss": 1.1871,
"mean_token_accuracy": 0.7532097101211548,
"step": 328
},
{
"epoch": 0.9038461538461539,
"grad_norm": 11.284040451049805,
"learning_rate": 7.0131462453906785e-06,
"loss": 1.3412,
"mean_token_accuracy": 0.7117437720298767,
"step": 329
},
{
"epoch": 0.9065934065934066,
"grad_norm": 12.890923500061035,
"learning_rate": 6.9928022392954175e-06,
"loss": 1.502,
"mean_token_accuracy": 0.6699716448783875,
"step": 330
},
{
"epoch": 0.9093406593406593,
"grad_norm": 11.523416519165039,
"learning_rate": 6.972423891753136e-06,
"loss": 1.2782,
"mean_token_accuracy": 0.7075351476669312,
"step": 331
},
{
"epoch": 0.9120879120879121,
"grad_norm": 11.033285140991211,
"learning_rate": 6.9520116715613315e-06,
"loss": 1.4068,
"mean_token_accuracy": 0.6863979697227478,
"step": 332
},
{
"epoch": 0.9148351648351648,
"grad_norm": 12.092578887939453,
"learning_rate": 6.9315660482967185e-06,
"loss": 1.3415,
"mean_token_accuracy": 0.7046035528182983,
"step": 333
},
{
"epoch": 0.9175824175824175,
"grad_norm": 9.621912002563477,
"learning_rate": 6.9110874923044445e-06,
"loss": 0.9547,
"mean_token_accuracy": 0.7870036363601685,
"step": 334
},
{
"epoch": 0.9203296703296703,
"grad_norm": 10.068906784057617,
"learning_rate": 6.890576474687264e-06,
"loss": 1.0756,
"mean_token_accuracy": 0.7344632744789124,
"step": 335
},
{
"epoch": 0.9230769230769231,
"grad_norm": 10.981210708618164,
"learning_rate": 6.8700334672947e-06,
"loss": 1.2718,
"mean_token_accuracy": 0.7097902297973633,
"step": 336
},
{
"epoch": 0.9258241758241759,
"grad_norm": 10.970985412597656,
"learning_rate": 6.849458942712189e-06,
"loss": 1.245,
"mean_token_accuracy": 0.7150127291679382,
"step": 337
},
{
"epoch": 0.9285714285714286,
"grad_norm": 9.914239883422852,
"learning_rate": 6.828853374250213e-06,
"loss": 1.1096,
"mean_token_accuracy": 0.7284382581710815,
"step": 338
},
{
"epoch": 0.9313186813186813,
"grad_norm": 11.386443138122559,
"learning_rate": 6.8082172359334085e-06,
"loss": 1.1512,
"mean_token_accuracy": 0.7243902683258057,
"step": 339
},
{
"epoch": 0.9340659340659341,
"grad_norm": 11.182723999023438,
"learning_rate": 6.7875510024896595e-06,
"loss": 1.0592,
"mean_token_accuracy": 0.7604035139083862,
"step": 340
},
{
"epoch": 0.9368131868131868,
"grad_norm": 10.25452709197998,
"learning_rate": 6.766855149339182e-06,
"loss": 1.2342,
"mean_token_accuracy": 0.7211981415748596,
"step": 341
},
{
"epoch": 0.9395604395604396,
"grad_norm": 11.961226463317871,
"learning_rate": 6.746130152583581e-06,
"loss": 1.3183,
"mean_token_accuracy": 0.7117726802825928,
"step": 342
},
{
"epoch": 0.9423076923076923,
"grad_norm": 10.968716621398926,
"learning_rate": 6.725376488994904e-06,
"loss": 1.08,
"mean_token_accuracy": 0.7506666779518127,
"step": 343
},
{
"epoch": 0.945054945054945,
"grad_norm": 11.551229476928711,
"learning_rate": 6.704594636004669e-06,
"loss": 1.3396,
"mean_token_accuracy": 0.729619562625885,
"step": 344
},
{
"epoch": 0.9478021978021978,
"grad_norm": 11.677756309509277,
"learning_rate": 6.683785071692877e-06,
"loss": 1.3994,
"mean_token_accuracy": 0.6937062740325928,
"step": 345
},
{
"epoch": 0.9505494505494505,
"grad_norm": 13.50044059753418,
"learning_rate": 6.662948274777031e-06,
"loss": 1.3472,
"mean_token_accuracy": 0.6719576716423035,
"step": 346
},
{
"epoch": 0.9532967032967034,
"grad_norm": 10.820409774780273,
"learning_rate": 6.642084724601101e-06,
"loss": 1.0404,
"mean_token_accuracy": 0.7569352984428406,
"step": 347
},
{
"epoch": 0.9560439560439561,
"grad_norm": 11.08627700805664,
"learning_rate": 6.6211949011245116e-06,
"loss": 1.0583,
"mean_token_accuracy": 0.740359902381897,
"step": 348
},
{
"epoch": 0.9587912087912088,
"grad_norm": 11.280929565429688,
"learning_rate": 6.6002792849110966e-06,
"loss": 1.2419,
"mean_token_accuracy": 0.7247820496559143,
"step": 349
},
{
"epoch": 0.9615384615384616,
"grad_norm": 11.241645812988281,
"learning_rate": 6.579338357118039e-06,
"loss": 1.1482,
"mean_token_accuracy": 0.7217597961425781,
"step": 350
},
{
"epoch": 0.9642857142857143,
"grad_norm": 10.611461639404297,
"learning_rate": 6.558372599484817e-06,
"loss": 1.1699,
"mean_token_accuracy": 0.7283372282981873,
"step": 351
},
{
"epoch": 0.967032967032967,
"grad_norm": 11.097305297851562,
"learning_rate": 6.537382494322101e-06,
"loss": 1.1236,
"mean_token_accuracy": 0.7349397540092468,
"step": 352
},
{
"epoch": 0.9697802197802198,
"grad_norm": 11.588397026062012,
"learning_rate": 6.516368524500673e-06,
"loss": 1.2617,
"mean_token_accuracy": 0.7199453711509705,
"step": 353
},
{
"epoch": 0.9725274725274725,
"grad_norm": 11.946807861328125,
"learning_rate": 6.495331173440315e-06,
"loss": 1.1311,
"mean_token_accuracy": 0.736912727355957,
"step": 354
},
{
"epoch": 0.9752747252747253,
"grad_norm": 10.510326385498047,
"learning_rate": 6.474270925098685e-06,
"loss": 1.0399,
"mean_token_accuracy": 0.7677852511405945,
"step": 355
},
{
"epoch": 0.978021978021978,
"grad_norm": 10.737903594970703,
"learning_rate": 6.453188263960186e-06,
"loss": 0.9998,
"mean_token_accuracy": 0.772020697593689,
"step": 356
},
{
"epoch": 0.9807692307692307,
"grad_norm": 11.893712997436523,
"learning_rate": 6.432083675024823e-06,
"loss": 1.5014,
"mean_token_accuracy": 0.701265811920166,
"step": 357
},
{
"epoch": 0.9835164835164835,
"grad_norm": 10.858631134033203,
"learning_rate": 6.410957643797039e-06,
"loss": 1.1924,
"mean_token_accuracy": 0.7433525919914246,
"step": 358
},
{
"epoch": 0.9862637362637363,
"grad_norm": 11.224592208862305,
"learning_rate": 6.389810656274553e-06,
"loss": 1.3194,
"mean_token_accuracy": 0.7270560264587402,
"step": 359
},
{
"epoch": 0.989010989010989,
"grad_norm": 11.10081958770752,
"learning_rate": 6.368643198937176e-06,
"loss": 1.1145,
"mean_token_accuracy": 0.734375,
"step": 360
},
{
"epoch": 0.9917582417582418,
"grad_norm": 10.680919647216797,
"learning_rate": 6.347455758735622e-06,
"loss": 1.1355,
"mean_token_accuracy": 0.7137681245803833,
"step": 361
},
{
"epoch": 0.9945054945054945,
"grad_norm": 12.283355712890625,
"learning_rate": 6.326248823080302e-06,
"loss": 1.2164,
"mean_token_accuracy": 0.7481805086135864,
"step": 362
},
{
"epoch": 0.9972527472527473,
"grad_norm": 10.820568084716797,
"learning_rate": 6.305022879830115e-06,
"loss": 1.1457,
"mean_token_accuracy": 0.739130437374115,
"step": 363
},
{
"epoch": 1.0,
"grad_norm": 11.383041381835938,
"learning_rate": 6.283778417281226e-06,
"loss": 1.1409,
"mean_token_accuracy": 0.735052764415741,
"step": 364
},
{
"epoch": 1.0027472527472527,
"grad_norm": 8.176081657409668,
"learning_rate": 6.262515924155826e-06,
"loss": 0.5403,
"mean_token_accuracy": 0.8576388955116272,
"step": 365
},
{
"epoch": 1.0054945054945055,
"grad_norm": 11.837261199951172,
"learning_rate": 6.2412358895908975e-06,
"loss": 1.077,
"mean_token_accuracy": 0.7554417252540588,
"step": 366
},
{
"epoch": 1.0082417582417582,
"grad_norm": 7.267159461975098,
"learning_rate": 6.219938803126958e-06,
"loss": 0.4787,
"mean_token_accuracy": 0.8890160322189331,
"step": 367
},
{
"epoch": 1.010989010989011,
"grad_norm": 7.923352241516113,
"learning_rate": 6.198625154696797e-06,
"loss": 0.5756,
"mean_token_accuracy": 0.8782935738563538,
"step": 368
},
{
"epoch": 1.0137362637362637,
"grad_norm": 8.488197326660156,
"learning_rate": 6.177295434614207e-06,
"loss": 0.6727,
"mean_token_accuracy": 0.8545688390731812,
"step": 369
},
{
"epoch": 1.0164835164835164,
"grad_norm": 9.032209396362305,
"learning_rate": 6.155950133562705e-06,
"loss": 0.7759,
"mean_token_accuracy": 0.8111273646354675,
"step": 370
},
{
"epoch": 1.0192307692307692,
"grad_norm": 8.959623336791992,
"learning_rate": 6.134589742584243e-06,
"loss": 0.7806,
"mean_token_accuracy": 0.80402010679245,
"step": 371
},
{
"epoch": 1.021978021978022,
"grad_norm": 7.533729076385498,
"learning_rate": 6.113214753067911e-06,
"loss": 0.6383,
"mean_token_accuracy": 0.8503740429878235,
"step": 372
},
{
"epoch": 1.0247252747252746,
"grad_norm": 7.809054374694824,
"learning_rate": 6.091825656738636e-06,
"loss": 0.6482,
"mean_token_accuracy": 0.8529411554336548,
"step": 373
},
{
"epoch": 1.0274725274725274,
"grad_norm": 6.753856658935547,
"learning_rate": 6.070422945645865e-06,
"loss": 0.4424,
"mean_token_accuracy": 0.9049773812294006,
"step": 374
},
{
"epoch": 1.0302197802197801,
"grad_norm": 7.692054748535156,
"learning_rate": 6.049007112152249e-06,
"loss": 0.5784,
"mean_token_accuracy": 0.8744344115257263,
"step": 375
},
{
"epoch": 1.032967032967033,
"grad_norm": 8.008811950683594,
"learning_rate": 6.027578648922319e-06,
"loss": 0.5913,
"mean_token_accuracy": 0.8524971008300781,
"step": 376
},
{
"epoch": 1.0357142857142858,
"grad_norm": 7.686079978942871,
"learning_rate": 6.006138048911146e-06,
"loss": 0.3791,
"mean_token_accuracy": 0.903930127620697,
"step": 377
},
{
"epoch": 1.0384615384615385,
"grad_norm": 7.934598445892334,
"learning_rate": 5.984685805353001e-06,
"loss": 0.5688,
"mean_token_accuracy": 0.8726027607917786,
"step": 378
},
{
"epoch": 1.0412087912087913,
"grad_norm": 9.844480514526367,
"learning_rate": 5.963222411750017e-06,
"loss": 0.6188,
"mean_token_accuracy": 0.8668596148490906,
"step": 379
},
{
"epoch": 1.043956043956044,
"grad_norm": 7.320831298828125,
"learning_rate": 5.941748361860828e-06,
"loss": 0.5387,
"mean_token_accuracy": 0.8523409366607666,
"step": 380
},
{
"epoch": 1.0467032967032968,
"grad_norm": 8.960830688476562,
"learning_rate": 5.920264149689213e-06,
"loss": 0.7356,
"mean_token_accuracy": 0.8335607051849365,
"step": 381
},
{
"epoch": 1.0494505494505495,
"grad_norm": 9.879374504089355,
"learning_rate": 5.898770269472728e-06,
"loss": 0.6628,
"mean_token_accuracy": 0.8490284085273743,
"step": 382
},
{
"epoch": 1.0521978021978022,
"grad_norm": 10.704723358154297,
"learning_rate": 5.877267215671345e-06,
"loss": 0.8533,
"mean_token_accuracy": 0.8047493696212769,
"step": 383
},
{
"epoch": 1.054945054945055,
"grad_norm": 7.644108772277832,
"learning_rate": 5.855755482956065e-06,
"loss": 0.4996,
"mean_token_accuracy": 0.8881909251213074,
"step": 384
},
{
"epoch": 1.0576923076923077,
"grad_norm": 8.324013710021973,
"learning_rate": 5.834235566197551e-06,
"loss": 0.5251,
"mean_token_accuracy": 0.8783930540084839,
"step": 385
},
{
"epoch": 1.0604395604395604,
"grad_norm": 7.874902725219727,
"learning_rate": 5.812707960454731e-06,
"loss": 0.4971,
"mean_token_accuracy": 0.884567141532898,
"step": 386
},
{
"epoch": 1.0631868131868132,
"grad_norm": 7.967838764190674,
"learning_rate": 5.791173160963419e-06,
"loss": 0.5384,
"mean_token_accuracy": 0.8828025460243225,
"step": 387
},
{
"epoch": 1.065934065934066,
"grad_norm": 8.472578048706055,
"learning_rate": 5.769631663124923e-06,
"loss": 0.7063,
"mean_token_accuracy": 0.8527131676673889,
"step": 388
},
{
"epoch": 1.0686813186813187,
"grad_norm": 8.554250717163086,
"learning_rate": 5.748083962494637e-06,
"loss": 0.6025,
"mean_token_accuracy": 0.8615210056304932,
"step": 389
},
{
"epoch": 1.0714285714285714,
"grad_norm": 7.113428592681885,
"learning_rate": 5.7265305547706516e-06,
"loss": 0.4874,
"mean_token_accuracy": 0.8806941509246826,
"step": 390
},
{
"epoch": 1.0741758241758241,
"grad_norm": 7.222031593322754,
"learning_rate": 5.704971935782348e-06,
"loss": 0.4854,
"mean_token_accuracy": 0.8786717653274536,
"step": 391
},
{
"epoch": 1.0769230769230769,
"grad_norm": 8.575010299682617,
"learning_rate": 5.68340860147899e-06,
"loss": 0.5006,
"mean_token_accuracy": 0.8834196925163269,
"step": 392
},
{
"epoch": 1.0796703296703296,
"grad_norm": 8.167390823364258,
"learning_rate": 5.661841047918318e-06,
"loss": 0.5515,
"mean_token_accuracy": 0.8620283007621765,
"step": 393
},
{
"epoch": 1.0824175824175823,
"grad_norm": 8.286924362182617,
"learning_rate": 5.640269771255126e-06,
"loss": 0.5343,
"mean_token_accuracy": 0.8598971962928772,
"step": 394
},
{
"epoch": 1.085164835164835,
"grad_norm": 9.173430442810059,
"learning_rate": 5.6186952677298705e-06,
"loss": 0.6477,
"mean_token_accuracy": 0.8426828980445862,
"step": 395
},
{
"epoch": 1.0879120879120878,
"grad_norm": 8.196123123168945,
"learning_rate": 5.597118033657231e-06,
"loss": 0.637,
"mean_token_accuracy": 0.8584905862808228,
"step": 396
},
{
"epoch": 1.0906593406593406,
"grad_norm": 8.771379470825195,
"learning_rate": 5.5755385654147084e-06,
"loss": 0.5563,
"mean_token_accuracy": 0.8635236024856567,
"step": 397
},
{
"epoch": 1.0934065934065935,
"grad_norm": 9.898123741149902,
"learning_rate": 5.5539573594311945e-06,
"loss": 0.7194,
"mean_token_accuracy": 0.822277843952179,
"step": 398
},
{
"epoch": 1.0961538461538463,
"grad_norm": 8.823380470275879,
"learning_rate": 5.53237491217556e-06,
"loss": 0.6639,
"mean_token_accuracy": 0.8430851101875305,
"step": 399
},
{
"epoch": 1.098901098901099,
"grad_norm": 12.10927963256836,
"learning_rate": 5.510791720145232e-06,
"loss": 0.7262,
"mean_token_accuracy": 0.8203753232955933,
"step": 400
},
{
"epoch": 1.1016483516483517,
"grad_norm": 7.197577476501465,
"learning_rate": 5.489208279854769e-06,
"loss": 0.5494,
"mean_token_accuracy": 0.8551502227783203,
"step": 401
},
{
"epoch": 1.1043956043956045,
"grad_norm": 8.447471618652344,
"learning_rate": 5.467625087824442e-06,
"loss": 0.5537,
"mean_token_accuracy": 0.8692810535430908,
"step": 402
},
{
"epoch": 1.1071428571428572,
"grad_norm": 8.937617301940918,
"learning_rate": 5.446042640568809e-06,
"loss": 0.7289,
"mean_token_accuracy": 0.8333333134651184,
"step": 403
},
{
"epoch": 1.10989010989011,
"grad_norm": 8.270979881286621,
"learning_rate": 5.424461434585293e-06,
"loss": 0.6222,
"mean_token_accuracy": 0.846905529499054,
"step": 404
},
{
"epoch": 1.1126373626373627,
"grad_norm": 7.838191986083984,
"learning_rate": 5.40288196634277e-06,
"loss": 0.6071,
"mean_token_accuracy": 0.8605230450630188,
"step": 405
},
{
"epoch": 1.1153846153846154,
"grad_norm": 8.771527290344238,
"learning_rate": 5.381304732270131e-06,
"loss": 0.5886,
"mean_token_accuracy": 0.8531374931335449,
"step": 406
},
{
"epoch": 1.1181318681318682,
"grad_norm": 7.3298563957214355,
"learning_rate": 5.359730228744876e-06,
"loss": 0.5052,
"mean_token_accuracy": 0.8785796165466309,
"step": 407
},
{
"epoch": 1.120879120879121,
"grad_norm": 8.253058433532715,
"learning_rate": 5.3381589520816855e-06,
"loss": 0.6519,
"mean_token_accuracy": 0.8469135761260986,
"step": 408
},
{
"epoch": 1.1236263736263736,
"grad_norm": 9.247401237487793,
"learning_rate": 5.31659139852101e-06,
"loss": 0.7053,
"mean_token_accuracy": 0.8385093212127686,
"step": 409
},
{
"epoch": 1.1263736263736264,
"grad_norm": 8.64867877960205,
"learning_rate": 5.295028064217653e-06,
"loss": 0.6317,
"mean_token_accuracy": 0.84624844789505,
"step": 410
},
{
"epoch": 1.129120879120879,
"grad_norm": 8.751404762268066,
"learning_rate": 5.27346944522935e-06,
"loss": 0.5933,
"mean_token_accuracy": 0.8576826453208923,
"step": 411
},
{
"epoch": 1.1318681318681318,
"grad_norm": 8.154433250427246,
"learning_rate": 5.2519160375053645e-06,
"loss": 0.4434,
"mean_token_accuracy": 0.9011124968528748,
"step": 412
},
{
"epoch": 1.1346153846153846,
"grad_norm": 8.975844383239746,
"learning_rate": 5.230368336875078e-06,
"loss": 0.7341,
"mean_token_accuracy": 0.8397710919380188,
"step": 413
},
{
"epoch": 1.1373626373626373,
"grad_norm": 10.191604614257812,
"learning_rate": 5.2088268390365804e-06,
"loss": 0.6342,
"mean_token_accuracy": 0.8444666266441345,
"step": 414
},
{
"epoch": 1.14010989010989,
"grad_norm": 8.440400123596191,
"learning_rate": 5.187292039545271e-06,
"loss": 0.6542,
"mean_token_accuracy": 0.8637640476226807,
"step": 415
},
{
"epoch": 1.1428571428571428,
"grad_norm": 7.392390727996826,
"learning_rate": 5.1657644338024525e-06,
"loss": 0.6651,
"mean_token_accuracy": 0.8404371738433838,
"step": 416
},
{
"epoch": 1.1456043956043955,
"grad_norm": 8.26939582824707,
"learning_rate": 5.144244517043936e-06,
"loss": 0.5253,
"mean_token_accuracy": 0.8696275353431702,
"step": 417
},
{
"epoch": 1.1483516483516483,
"grad_norm": 8.410235404968262,
"learning_rate": 5.122732784328657e-06,
"loss": 0.6554,
"mean_token_accuracy": 0.842737078666687,
"step": 418
},
{
"epoch": 1.151098901098901,
"grad_norm": 8.329333305358887,
"learning_rate": 5.1012297305272725e-06,
"loss": 0.6661,
"mean_token_accuracy": 0.8437118530273438,
"step": 419
},
{
"epoch": 1.1538461538461537,
"grad_norm": 7.471403121948242,
"learning_rate": 5.0797358503107875e-06,
"loss": 0.5257,
"mean_token_accuracy": 0.8748419880867004,
"step": 420
},
{
"epoch": 1.1565934065934065,
"grad_norm": 8.804765701293945,
"learning_rate": 5.058251638139173e-06,
"loss": 0.7146,
"mean_token_accuracy": 0.8457831144332886,
"step": 421
},
{
"epoch": 1.1593406593406592,
"grad_norm": 8.508563041687012,
"learning_rate": 5.036777588249983e-06,
"loss": 0.6724,
"mean_token_accuracy": 0.8466413021087646,
"step": 422
},
{
"epoch": 1.1620879120879122,
"grad_norm": 8.738518714904785,
"learning_rate": 5.015314194647001e-06,
"loss": 0.7564,
"mean_token_accuracy": 0.8212974071502686,
"step": 423
},
{
"epoch": 1.164835164835165,
"grad_norm": 8.125940322875977,
"learning_rate": 4.9938619510888575e-06,
"loss": 0.5889,
"mean_token_accuracy": 0.879807710647583,
"step": 424
},
{
"epoch": 1.1675824175824177,
"grad_norm": 7.5376081466674805,
"learning_rate": 4.972421351077682e-06,
"loss": 0.5469,
"mean_token_accuracy": 0.8646616339683533,
"step": 425
},
{
"epoch": 1.1703296703296704,
"grad_norm": 8.586243629455566,
"learning_rate": 4.950992887847752e-06,
"loss": 0.6722,
"mean_token_accuracy": 0.8421768546104431,
"step": 426
},
{
"epoch": 1.1730769230769231,
"grad_norm": 9.106139183044434,
"learning_rate": 4.929577054354138e-06,
"loss": 0.6973,
"mean_token_accuracy": 0.8415716290473938,
"step": 427
},
{
"epoch": 1.1758241758241759,
"grad_norm": 8.343849182128906,
"learning_rate": 4.908174343261366e-06,
"loss": 0.6343,
"mean_token_accuracy": 0.8487690687179565,
"step": 428
},
{
"epoch": 1.1785714285714286,
"grad_norm": 8.809746742248535,
"learning_rate": 4.88678524693209e-06,
"loss": 0.7209,
"mean_token_accuracy": 0.8351351618766785,
"step": 429
},
{
"epoch": 1.1813186813186813,
"grad_norm": 8.554579734802246,
"learning_rate": 4.865410257415758e-06,
"loss": 0.8525,
"mean_token_accuracy": 0.7745803594589233,
"step": 430
},
{
"epoch": 1.184065934065934,
"grad_norm": 6.8353495597839355,
"learning_rate": 4.844049866437296e-06,
"loss": 0.5986,
"mean_token_accuracy": 0.8746867179870605,
"step": 431
},
{
"epoch": 1.1868131868131868,
"grad_norm": 8.058353424072266,
"learning_rate": 4.822704565385796e-06,
"loss": 0.8536,
"mean_token_accuracy": 0.8027842044830322,
"step": 432
},
{
"epoch": 1.1895604395604396,
"grad_norm": 8.545801162719727,
"learning_rate": 4.801374845303205e-06,
"loss": 0.7223,
"mean_token_accuracy": 0.8521836400032043,
"step": 433
},
{
"epoch": 1.1923076923076923,
"grad_norm": 8.993727684020996,
"learning_rate": 4.780061196873044e-06,
"loss": 0.7401,
"mean_token_accuracy": 0.8204181790351868,
"step": 434
},
{
"epoch": 1.195054945054945,
"grad_norm": 7.437719345092773,
"learning_rate": 4.758764110409103e-06,
"loss": 0.7573,
"mean_token_accuracy": 0.8167487978935242,
"step": 435
},
{
"epoch": 1.1978021978021978,
"grad_norm": 7.946503162384033,
"learning_rate": 4.737484075844175e-06,
"loss": 0.5876,
"mean_token_accuracy": 0.8464567065238953,
"step": 436
},
{
"epoch": 1.2005494505494505,
"grad_norm": 7.993088245391846,
"learning_rate": 4.7162215827187765e-06,
"loss": 0.5828,
"mean_token_accuracy": 0.8666666746139526,
"step": 437
},
{
"epoch": 1.2032967032967032,
"grad_norm": 8.567098617553711,
"learning_rate": 4.694977120169886e-06,
"loss": 0.6235,
"mean_token_accuracy": 0.8593023419380188,
"step": 438
},
{
"epoch": 1.206043956043956,
"grad_norm": 8.035146713256836,
"learning_rate": 4.6737511769197e-06,
"loss": 0.6591,
"mean_token_accuracy": 0.8387516140937805,
"step": 439
},
{
"epoch": 1.2087912087912087,
"grad_norm": 9.358229637145996,
"learning_rate": 4.65254424126438e-06,
"loss": 0.4838,
"mean_token_accuracy": 0.8591022491455078,
"step": 440
},
{
"epoch": 1.2115384615384615,
"grad_norm": 7.417166709899902,
"learning_rate": 4.631356801062824e-06,
"loss": 0.5559,
"mean_token_accuracy": 0.881313145160675,
"step": 441
},
{
"epoch": 1.2142857142857142,
"grad_norm": 8.637767791748047,
"learning_rate": 4.6101893437254485e-06,
"loss": 0.5632,
"mean_token_accuracy": 0.8659658432006836,
"step": 442
},
{
"epoch": 1.2170329670329672,
"grad_norm": 7.665891170501709,
"learning_rate": 4.5890423562029605e-06,
"loss": 0.6366,
"mean_token_accuracy": 0.863930881023407,
"step": 443
},
{
"epoch": 1.2197802197802199,
"grad_norm": 8.18127155303955,
"learning_rate": 4.567916324975178e-06,
"loss": 0.5943,
"mean_token_accuracy": 0.8619354963302612,
"step": 444
},
{
"epoch": 1.2225274725274726,
"grad_norm": 7.742138862609863,
"learning_rate": 4.546811736039814e-06,
"loss": 0.6785,
"mean_token_accuracy": 0.8467432856559753,
"step": 445
},
{
"epoch": 1.2252747252747254,
"grad_norm": 7.575841426849365,
"learning_rate": 4.525729074901316e-06,
"loss": 0.5459,
"mean_token_accuracy": 0.8661518692970276,
"step": 446
},
{
"epoch": 1.228021978021978,
"grad_norm": 8.588372230529785,
"learning_rate": 4.504668826559687e-06,
"loss": 0.6267,
"mean_token_accuracy": 0.8547717928886414,
"step": 447
},
{
"epoch": 1.2307692307692308,
"grad_norm": 8.191335678100586,
"learning_rate": 4.483631475499329e-06,
"loss": 0.603,
"mean_token_accuracy": 0.8422301411628723,
"step": 448
},
{
"epoch": 1.2335164835164836,
"grad_norm": 7.857209205627441,
"learning_rate": 4.4626175056779005e-06,
"loss": 0.7386,
"mean_token_accuracy": 0.8307873010635376,
"step": 449
},
{
"epoch": 1.2362637362637363,
"grad_norm": 7.451523303985596,
"learning_rate": 4.441627400515185e-06,
"loss": 0.5647,
"mean_token_accuracy": 0.8670588135719299,
"step": 450
},
{
"epoch": 1.239010989010989,
"grad_norm": 7.155614376068115,
"learning_rate": 4.420661642881961e-06,
"loss": 0.545,
"mean_token_accuracy": 0.874709963798523,
"step": 451
},
{
"epoch": 1.2417582417582418,
"grad_norm": 6.548577785491943,
"learning_rate": 4.399720715088906e-06,
"loss": 0.4698,
"mean_token_accuracy": 0.892816424369812,
"step": 452
},
{
"epoch": 1.2445054945054945,
"grad_norm": 7.577704906463623,
"learning_rate": 4.378805098875491e-06,
"loss": 0.6068,
"mean_token_accuracy": 0.851767361164093,
"step": 453
},
{
"epoch": 1.2472527472527473,
"grad_norm": 8.976188659667969,
"learning_rate": 4.357915275398901e-06,
"loss": 0.8297,
"mean_token_accuracy": 0.8129205703735352,
"step": 454
},
{
"epoch": 1.25,
"grad_norm": 7.827886581420898,
"learning_rate": 4.33705172522297e-06,
"loss": 0.7283,
"mean_token_accuracy": 0.8148936033248901,
"step": 455
},
{
"epoch": 1.2527472527472527,
"grad_norm": 6.6771931648254395,
"learning_rate": 4.316214928307125e-06,
"loss": 0.4674,
"mean_token_accuracy": 0.8864168524742126,
"step": 456
},
{
"epoch": 1.2554945054945055,
"grad_norm": 8.46672248840332,
"learning_rate": 4.295405363995333e-06,
"loss": 0.5265,
"mean_token_accuracy": 0.8727770447731018,
"step": 457
},
{
"epoch": 1.2582417582417582,
"grad_norm": 8.40459156036377,
"learning_rate": 4.274623511005098e-06,
"loss": 0.6493,
"mean_token_accuracy": 0.8478260636329651,
"step": 458
},
{
"epoch": 1.260989010989011,
"grad_norm": 8.433124542236328,
"learning_rate": 4.25386984741642e-06,
"loss": 0.568,
"mean_token_accuracy": 0.8743386268615723,
"step": 459
},
{
"epoch": 1.2637362637362637,
"grad_norm": 7.261688709259033,
"learning_rate": 4.2331448506608196e-06,
"loss": 0.485,
"mean_token_accuracy": 0.8756371140480042,
"step": 460
},
{
"epoch": 1.2664835164835164,
"grad_norm": 8.551594734191895,
"learning_rate": 4.212448997510341e-06,
"loss": 0.627,
"mean_token_accuracy": 0.851037859916687,
"step": 461
},
{
"epoch": 1.2692307692307692,
"grad_norm": 7.449108123779297,
"learning_rate": 4.191782764066592e-06,
"loss": 0.74,
"mean_token_accuracy": 0.8116805553436279,
"step": 462
},
{
"epoch": 1.271978021978022,
"grad_norm": 8.490302085876465,
"learning_rate": 4.171146625749788e-06,
"loss": 0.6644,
"mean_token_accuracy": 0.8355408310890198,
"step": 463
},
{
"epoch": 1.2747252747252746,
"grad_norm": 7.6355180740356445,
"learning_rate": 4.150541057287814e-06,
"loss": 0.5712,
"mean_token_accuracy": 0.872826099395752,
"step": 464
},
{
"epoch": 1.2774725274725274,
"grad_norm": 8.184901237487793,
"learning_rate": 4.129966532705302e-06,
"loss": 0.4505,
"mean_token_accuracy": 0.8914955854415894,
"step": 465
},
{
"epoch": 1.2802197802197801,
"grad_norm": 22.74942398071289,
"learning_rate": 4.109423525312738e-06,
"loss": 0.8091,
"mean_token_accuracy": 0.8078431487083435,
"step": 466
},
{
"epoch": 1.2829670329670328,
"grad_norm": 8.367789268493652,
"learning_rate": 4.088912507695556e-06,
"loss": 0.6103,
"mean_token_accuracy": 0.8558322191238403,
"step": 467
},
{
"epoch": 1.2857142857142856,
"grad_norm": 7.367910861968994,
"learning_rate": 4.068433951703284e-06,
"loss": 0.4561,
"mean_token_accuracy": 0.8980070352554321,
"step": 468
},
{
"epoch": 1.2884615384615383,
"grad_norm": 10.204851150512695,
"learning_rate": 4.04798832843867e-06,
"loss": 0.7424,
"mean_token_accuracy": 0.8163265585899353,
"step": 469
},
{
"epoch": 1.2912087912087913,
"grad_norm": 7.735379219055176,
"learning_rate": 4.027576108246863e-06,
"loss": 0.5422,
"mean_token_accuracy": 0.8828213810920715,
"step": 470
},
{
"epoch": 1.293956043956044,
"grad_norm": 8.325048446655273,
"learning_rate": 4.007197760704586e-06,
"loss": 0.7073,
"mean_token_accuracy": 0.8321759104728699,
"step": 471
},
{
"epoch": 1.2967032967032968,
"grad_norm": 7.73416805267334,
"learning_rate": 3.986853754609323e-06,
"loss": 0.5546,
"mean_token_accuracy": 0.8792401552200317,
"step": 472
},
{
"epoch": 1.2994505494505495,
"grad_norm": 8.063698768615723,
"learning_rate": 3.96654455796855e-06,
"loss": 0.6846,
"mean_token_accuracy": 0.8486292958259583,
"step": 473
},
{
"epoch": 1.3021978021978022,
"grad_norm": 7.270962715148926,
"learning_rate": 3.946270637988967e-06,
"loss": 0.6298,
"mean_token_accuracy": 0.8434125185012817,
"step": 474
},
{
"epoch": 1.304945054945055,
"grad_norm": 6.182555675506592,
"learning_rate": 3.926032461065735e-06,
"loss": 0.4491,
"mean_token_accuracy": 0.8908342123031616,
"step": 475
},
{
"epoch": 1.3076923076923077,
"grad_norm": 8.628203392028809,
"learning_rate": 3.9058304927717665e-06,
"loss": 0.6467,
"mean_token_accuracy": 0.8459495306015015,
"step": 476
},
{
"epoch": 1.3104395604395604,
"grad_norm": 6.947534084320068,
"learning_rate": 3.885665197847e-06,
"loss": 0.4605,
"mean_token_accuracy": 0.891922652721405,
"step": 477
},
{
"epoch": 1.3131868131868132,
"grad_norm": 7.754703044891357,
"learning_rate": 3.865537040187714e-06,
"loss": 0.5706,
"mean_token_accuracy": 0.8667481541633606,
"step": 478
},
{
"epoch": 1.315934065934066,
"grad_norm": 8.421850204467773,
"learning_rate": 3.845446482835864e-06,
"loss": 0.5796,
"mean_token_accuracy": 0.8535211086273193,
"step": 479
},
{
"epoch": 1.3186813186813187,
"grad_norm": 7.543336391448975,
"learning_rate": 3.825393987968412e-06,
"loss": 0.5661,
"mean_token_accuracy": 0.8620689511299133,
"step": 480
},
{
"epoch": 1.3214285714285714,
"grad_norm": 7.205345153808594,
"learning_rate": 3.8053800168867117e-06,
"loss": 0.502,
"mean_token_accuracy": 0.8709288239479065,
"step": 481
},
{
"epoch": 1.3241758241758241,
"grad_norm": 7.576408386230469,
"learning_rate": 3.7854050300058865e-06,
"loss": 0.6592,
"mean_token_accuracy": 0.8460508584976196,
"step": 482
},
{
"epoch": 1.3269230769230769,
"grad_norm": 9.292501449584961,
"learning_rate": 3.765469486844239e-06,
"loss": 0.8449,
"mean_token_accuracy": 0.8119080066680908,
"step": 483
},
{
"epoch": 1.3296703296703296,
"grad_norm": 7.648805141448975,
"learning_rate": 3.745573846012687e-06,
"loss": 0.4898,
"mean_token_accuracy": 0.8732572793960571,
"step": 484
},
{
"epoch": 1.3324175824175823,
"grad_norm": 7.9348530769348145,
"learning_rate": 3.7257185652041994e-06,
"loss": 0.5911,
"mean_token_accuracy": 0.8699284195899963,
"step": 485
},
{
"epoch": 1.335164835164835,
"grad_norm": 8.589659690856934,
"learning_rate": 3.705904101183281e-06,
"loss": 0.5973,
"mean_token_accuracy": 0.8438576459884644,
"step": 486
},
{
"epoch": 1.337912087912088,
"grad_norm": 8.7539701461792,
"learning_rate": 3.6861309097754595e-06,
"loss": 0.7594,
"mean_token_accuracy": 0.8237704634666443,
"step": 487
},
{
"epoch": 1.3406593406593408,
"grad_norm": 8.811039924621582,
"learning_rate": 3.6663994458567977e-06,
"loss": 0.6975,
"mean_token_accuracy": 0.8472585082054138,
"step": 488
},
{
"epoch": 1.3434065934065935,
"grad_norm": 7.768817901611328,
"learning_rate": 3.646710163343429e-06,
"loss": 0.6072,
"mean_token_accuracy": 0.8522167205810547,
"step": 489
},
{
"epoch": 1.3461538461538463,
"grad_norm": 6.7450079917907715,
"learning_rate": 3.6270635151811175e-06,
"loss": 0.433,
"mean_token_accuracy": 0.9020737409591675,
"step": 490
},
{
"epoch": 1.348901098901099,
"grad_norm": 7.87033224105835,
"learning_rate": 3.60745995333484e-06,
"loss": 0.622,
"mean_token_accuracy": 0.8647594451904297,
"step": 491
},
{
"epoch": 1.3516483516483517,
"grad_norm": 8.751251220703125,
"learning_rate": 3.5878999287783866e-06,
"loss": 0.7387,
"mean_token_accuracy": 0.8346773982048035,
"step": 492
},
{
"epoch": 1.3543956043956045,
"grad_norm": 8.54961109161377,
"learning_rate": 3.5683838914839795e-06,
"loss": 0.6551,
"mean_token_accuracy": 0.8329238295555115,
"step": 493
},
{
"epoch": 1.3571428571428572,
"grad_norm": 7.03640079498291,
"learning_rate": 3.5489122904119332e-06,
"loss": 0.4917,
"mean_token_accuracy": 0.884529173374176,
"step": 494
},
{
"epoch": 1.35989010989011,
"grad_norm": 147.5779571533203,
"learning_rate": 3.52948557350032e-06,
"loss": 0.7133,
"mean_token_accuracy": 0.8239277601242065,
"step": 495
},
{
"epoch": 1.3626373626373627,
"grad_norm": 5.867126941680908,
"learning_rate": 3.510104187654666e-06,
"loss": 0.3257,
"mean_token_accuracy": 0.9185360074043274,
"step": 496
},
{
"epoch": 1.3653846153846154,
"grad_norm": 8.838720321655273,
"learning_rate": 3.490768578737669e-06,
"loss": 0.5831,
"mean_token_accuracy": 0.8387096524238586,
"step": 497
},
{
"epoch": 1.3681318681318682,
"grad_norm": 7.148526668548584,
"learning_rate": 3.471479191558944e-06,
"loss": 0.432,
"mean_token_accuracy": 0.8999999761581421,
"step": 498
},
{
"epoch": 1.370879120879121,
"grad_norm": 8.821269989013672,
"learning_rate": 3.452236469864789e-06,
"loss": 0.8936,
"mean_token_accuracy": 0.7850574851036072,
"step": 499
},
{
"epoch": 1.3736263736263736,
"grad_norm": 8.092583656311035,
"learning_rate": 3.433040856327979e-06,
"loss": 0.6344,
"mean_token_accuracy": 0.8361244201660156,
"step": 500
},
{
"epoch": 1.3763736263736264,
"grad_norm": 6.676445007324219,
"learning_rate": 3.413892792537577e-06,
"loss": 0.3909,
"mean_token_accuracy": 0.9048811197280884,
"step": 501
},
{
"epoch": 1.379120879120879,
"grad_norm": 8.763235092163086,
"learning_rate": 3.394792718988783e-06,
"loss": 0.6677,
"mean_token_accuracy": 0.8345499038696289,
"step": 502
},
{
"epoch": 1.3818681318681318,
"grad_norm": 8.628216743469238,
"learning_rate": 3.3757410750727933e-06,
"loss": 0.7486,
"mean_token_accuracy": 0.8381071090698242,
"step": 503
},
{
"epoch": 1.3846153846153846,
"grad_norm": 8.193002700805664,
"learning_rate": 3.356738299066695e-06,
"loss": 0.6746,
"mean_token_accuracy": 0.8546798229217529,
"step": 504
},
{
"epoch": 1.3873626373626373,
"grad_norm": 7.721636772155762,
"learning_rate": 3.3377848281233916e-06,
"loss": 0.4957,
"mean_token_accuracy": 0.8779149651527405,
"step": 505
},
{
"epoch": 1.39010989010989,
"grad_norm": 7.719691753387451,
"learning_rate": 3.318881098261533e-06,
"loss": 0.6541,
"mean_token_accuracy": 0.8626444339752197,
"step": 506
},
{
"epoch": 1.3928571428571428,
"grad_norm": 8.396798133850098,
"learning_rate": 3.300027544355485e-06,
"loss": 0.5343,
"mean_token_accuracy": 0.8785046935081482,
"step": 507
},
{
"epoch": 1.3956043956043955,
"grad_norm": 9.754472732543945,
"learning_rate": 3.2812246001253455e-06,
"loss": 0.7571,
"mean_token_accuracy": 0.8223087191581726,
"step": 508
},
{
"epoch": 1.3983516483516483,
"grad_norm": 8.206571578979492,
"learning_rate": 3.262472698126944e-06,
"loss": 0.7667,
"mean_token_accuracy": 0.8345771431922913,
"step": 509
},
{
"epoch": 1.401098901098901,
"grad_norm": 8.819429397583008,
"learning_rate": 3.2437722697418995e-06,
"loss": 0.7533,
"mean_token_accuracy": 0.8287752866744995,
"step": 510
},
{
"epoch": 1.4038461538461537,
"grad_norm": 6.980717182159424,
"learning_rate": 3.225123745167699e-06,
"loss": 0.4373,
"mean_token_accuracy": 0.898408830165863,
"step": 511
},
{
"epoch": 1.4065934065934065,
"grad_norm": 7.27022647857666,
"learning_rate": 3.206527553407795e-06,
"loss": 0.4416,
"mean_token_accuracy": 0.9033613204956055,
"step": 512
},
{
"epoch": 1.4093406593406592,
"grad_norm": 6.68601655960083,
"learning_rate": 3.1879841222617484e-06,
"loss": 0.5126,
"mean_token_accuracy": 0.8735891580581665,
"step": 513
},
{
"epoch": 1.412087912087912,
"grad_norm": 7.937134265899658,
"learning_rate": 3.169493878315369e-06,
"loss": 0.5541,
"mean_token_accuracy": 0.8812351822853088,
"step": 514
},
{
"epoch": 1.414835164835165,
"grad_norm": 8.167470932006836,
"learning_rate": 3.151057246930914e-06,
"loss": 0.6355,
"mean_token_accuracy": 0.8437935709953308,
"step": 515
},
{
"epoch": 1.4175824175824177,
"grad_norm": 7.2766594886779785,
"learning_rate": 3.1326746522373073e-06,
"loss": 0.4797,
"mean_token_accuracy": 0.8883495330810547,
"step": 516
},
{
"epoch": 1.4203296703296704,
"grad_norm": 8.126523971557617,
"learning_rate": 3.114346517120369e-06,
"loss": 0.6775,
"mean_token_accuracy": 0.8465011119842529,
"step": 517
},
{
"epoch": 1.4230769230769231,
"grad_norm": 9.549711227416992,
"learning_rate": 3.0960732632130923e-06,
"loss": 0.7806,
"mean_token_accuracy": 0.798353910446167,
"step": 518
},
{
"epoch": 1.4258241758241759,
"grad_norm": 8.250473022460938,
"learning_rate": 3.077855310885952e-06,
"loss": 0.6848,
"mean_token_accuracy": 0.8284251093864441,
"step": 519
},
{
"epoch": 1.4285714285714286,
"grad_norm": 8.499979019165039,
"learning_rate": 3.0596930792372227e-06,
"loss": 0.5562,
"mean_token_accuracy": 0.8656361699104309,
"step": 520
},
{
"epoch": 1.4313186813186813,
"grad_norm": 8.121464729309082,
"learning_rate": 3.0415869860833436e-06,
"loss": 0.661,
"mean_token_accuracy": 0.8321428298950195,
"step": 521
},
{
"epoch": 1.434065934065934,
"grad_norm": 8.008270263671875,
"learning_rate": 3.0235374479493053e-06,
"loss": 0.6003,
"mean_token_accuracy": 0.8410689234733582,
"step": 522
},
{
"epoch": 1.4368131868131868,
"grad_norm": 7.292999267578125,
"learning_rate": 3.0055448800590674e-06,
"loss": 0.5489,
"mean_token_accuracy": 0.8599537014961243,
"step": 523
},
{
"epoch": 1.4395604395604396,
"grad_norm": 7.336670875549316,
"learning_rate": 2.987609696326008e-06,
"loss": 0.4491,
"mean_token_accuracy": 0.8777633309364319,
"step": 524
},
{
"epoch": 1.4423076923076923,
"grad_norm": 8.703383445739746,
"learning_rate": 2.9697323093434006e-06,
"loss": 0.6206,
"mean_token_accuracy": 0.8430232405662537,
"step": 525
},
{
"epoch": 1.445054945054945,
"grad_norm": 8.316123962402344,
"learning_rate": 2.951913130374919e-06,
"loss": 0.663,
"mean_token_accuracy": 0.8420427441596985,
"step": 526
},
{
"epoch": 1.4478021978021978,
"grad_norm": 8.570442199707031,
"learning_rate": 2.934152569345189e-06,
"loss": 0.595,
"mean_token_accuracy": 0.868789792060852,
"step": 527
},
{
"epoch": 1.4505494505494505,
"grad_norm": 7.871905326843262,
"learning_rate": 2.9164510348303366e-06,
"loss": 0.6954,
"mean_token_accuracy": 0.8410193920135498,
"step": 528
},
{
"epoch": 1.4532967032967032,
"grad_norm": 7.661655902862549,
"learning_rate": 2.898808934048613e-06,
"loss": 0.6222,
"mean_token_accuracy": 0.8458781242370605,
"step": 529
},
{
"epoch": 1.456043956043956,
"grad_norm": 9.568252563476562,
"learning_rate": 2.8812266728510075e-06,
"loss": 0.5718,
"mean_token_accuracy": 0.8805555701255798,
"step": 530
},
{
"epoch": 1.4587912087912087,
"grad_norm": 7.459258556365967,
"learning_rate": 2.8637046557119217e-06,
"loss": 0.645,
"mean_token_accuracy": 0.8480725884437561,
"step": 531
},
{
"epoch": 1.4615384615384617,
"grad_norm": 8.410935401916504,
"learning_rate": 2.84624328571986e-06,
"loss": 0.686,
"mean_token_accuracy": 0.8389512896537781,
"step": 532
},
{
"epoch": 1.4642857142857144,
"grad_norm": 7.343708515167236,
"learning_rate": 2.8288429645681604e-06,
"loss": 0.4948,
"mean_token_accuracy": 0.8748450875282288,
"step": 533
},
{
"epoch": 1.4670329670329672,
"grad_norm": 8.968671798706055,
"learning_rate": 2.811504092545748e-06,
"loss": 0.6255,
"mean_token_accuracy": 0.851640522480011,
"step": 534
},
{
"epoch": 1.4697802197802199,
"grad_norm": 8.382376670837402,
"learning_rate": 2.794227068527934e-06,
"loss": 0.5991,
"mean_token_accuracy": 0.8453188538551331,
"step": 535
},
{
"epoch": 1.4725274725274726,
"grad_norm": 9.791983604431152,
"learning_rate": 2.7770122899672314e-06,
"loss": 0.9362,
"mean_token_accuracy": 0.7942283749580383,
"step": 536
},
{
"epoch": 1.4752747252747254,
"grad_norm": 8.932596206665039,
"learning_rate": 2.759860152884222e-06,
"loss": 0.6614,
"mean_token_accuracy": 0.8456549644470215,
"step": 537
},
{
"epoch": 1.478021978021978,
"grad_norm": 10.183149337768555,
"learning_rate": 2.742771051858435e-06,
"loss": 0.7975,
"mean_token_accuracy": 0.8182989954948425,
"step": 538
},
{
"epoch": 1.4807692307692308,
"grad_norm": 7.038871765136719,
"learning_rate": 2.7257453800192724e-06,
"loss": 0.5672,
"mean_token_accuracy": 0.8578838109970093,
"step": 539
},
{
"epoch": 1.4835164835164836,
"grad_norm": 6.658552169799805,
"learning_rate": 2.708783529036977e-06,
"loss": 0.4442,
"mean_token_accuracy": 0.8848684430122375,
"step": 540
},
{
"epoch": 1.4862637362637363,
"grad_norm": 8.20215129852295,
"learning_rate": 2.691885889113606e-06,
"loss": 0.7322,
"mean_token_accuracy": 0.831932783126831,
"step": 541
},
{
"epoch": 1.489010989010989,
"grad_norm": 9.458049774169922,
"learning_rate": 2.675052848974059e-06,
"loss": 0.7799,
"mean_token_accuracy": 0.8369565010070801,
"step": 542
},
{
"epoch": 1.4917582417582418,
"grad_norm": 9.558260917663574,
"learning_rate": 2.6582847958571466e-06,
"loss": 0.7362,
"mean_token_accuracy": 0.8473091125488281,
"step": 543
},
{
"epoch": 1.4945054945054945,
"grad_norm": 8.83701229095459,
"learning_rate": 2.6415821155066657e-06,
"loss": 0.693,
"mean_token_accuracy": 0.8454810380935669,
"step": 544
},
{
"epoch": 1.4972527472527473,
"grad_norm": 8.02824592590332,
"learning_rate": 2.6249451921625355e-06,
"loss": 0.6736,
"mean_token_accuracy": 0.8448660969734192,
"step": 545
},
{
"epoch": 1.5,
"grad_norm": 6.946861267089844,
"learning_rate": 2.608374408551958e-06,
"loss": 0.5594,
"mean_token_accuracy": 0.8766839504241943,
"step": 546
},
{
"epoch": 1.5027472527472527,
"grad_norm": 6.514960289001465,
"learning_rate": 2.5918701458806074e-06,
"loss": 0.4437,
"mean_token_accuracy": 0.8953744769096375,
"step": 547
},
{
"epoch": 1.5054945054945055,
"grad_norm": 7.499314785003662,
"learning_rate": 2.575432783823869e-06,
"loss": 0.5534,
"mean_token_accuracy": 0.8698453903198242,
"step": 548
},
{
"epoch": 1.5082417582417582,
"grad_norm": 7.372369766235352,
"learning_rate": 2.5590627005180974e-06,
"loss": 0.5977,
"mean_token_accuracy": 0.8597701191902161,
"step": 549
},
{
"epoch": 1.510989010989011,
"grad_norm": 6.4079365730285645,
"learning_rate": 2.5427602725519185e-06,
"loss": 0.5101,
"mean_token_accuracy": 0.89012211561203,
"step": 550
},
{
"epoch": 1.5137362637362637,
"grad_norm": 8.462462425231934,
"learning_rate": 2.526525874957577e-06,
"loss": 0.7414,
"mean_token_accuracy": 0.8104650974273682,
"step": 551
},
{
"epoch": 1.5164835164835164,
"grad_norm": 7.26004695892334,
"learning_rate": 2.510359881202291e-06,
"loss": 0.5279,
"mean_token_accuracy": 0.8739837408065796,
"step": 552
},
{
"epoch": 1.5192307692307692,
"grad_norm": 9.653335571289062,
"learning_rate": 2.4942626631796737e-06,
"loss": 0.6012,
"mean_token_accuracy": 0.854411780834198,
"step": 553
},
{
"epoch": 1.521978021978022,
"grad_norm": 8.012587547302246,
"learning_rate": 2.4782345912011746e-06,
"loss": 0.7189,
"mean_token_accuracy": 0.8267270922660828,
"step": 554
},
{
"epoch": 1.5247252747252746,
"grad_norm": 6.256516456604004,
"learning_rate": 2.4622760339875586e-06,
"loss": 0.4029,
"mean_token_accuracy": 0.9051833152770996,
"step": 555
},
{
"epoch": 1.5274725274725274,
"grad_norm": 7.887014865875244,
"learning_rate": 2.4463873586604266e-06,
"loss": 0.6252,
"mean_token_accuracy": 0.8506731986999512,
"step": 556
},
{
"epoch": 1.5302197802197801,
"grad_norm": 8.715618133544922,
"learning_rate": 2.430568930733765e-06,
"loss": 0.6903,
"mean_token_accuracy": 0.8444130420684814,
"step": 557
},
{
"epoch": 1.5329670329670328,
"grad_norm": 7.426331996917725,
"learning_rate": 2.4148211141055495e-06,
"loss": 0.6732,
"mean_token_accuracy": 0.8408644199371338,
"step": 558
},
{
"epoch": 1.5357142857142856,
"grad_norm": 7.998394966125488,
"learning_rate": 2.399144271049357e-06,
"loss": 0.5477,
"mean_token_accuracy": 0.8670658469200134,
"step": 559
},
{
"epoch": 1.5384615384615383,
"grad_norm": 8.026217460632324,
"learning_rate": 2.383538762206038e-06,
"loss": 0.5915,
"mean_token_accuracy": 0.8620296716690063,
"step": 560
},
{
"epoch": 1.541208791208791,
"grad_norm": 8.00511360168457,
"learning_rate": 2.3680049465754314e-06,
"loss": 0.5861,
"mean_token_accuracy": 0.8640661835670471,
"step": 561
},
{
"epoch": 1.5439560439560438,
"grad_norm": 8.416573524475098,
"learning_rate": 2.3525431815080895e-06,
"loss": 0.6603,
"mean_token_accuracy": 0.8461538553237915,
"step": 562
},
{
"epoch": 1.5467032967032965,
"grad_norm": 8.667383193969727,
"learning_rate": 2.337153822697061e-06,
"loss": 0.6265,
"mean_token_accuracy": 0.8599269390106201,
"step": 563
},
{
"epoch": 1.5494505494505495,
"grad_norm": 8.645866394042969,
"learning_rate": 2.3218372241697207e-06,
"loss": 0.7027,
"mean_token_accuracy": 0.8388969302177429,
"step": 564
},
{
"epoch": 1.5521978021978022,
"grad_norm": 8.266505241394043,
"learning_rate": 2.306593738279609e-06,
"loss": 0.596,
"mean_token_accuracy": 0.8659549355506897,
"step": 565
},
{
"epoch": 1.554945054945055,
"grad_norm": 7.0971550941467285,
"learning_rate": 2.291423715698334e-06,
"loss": 0.4934,
"mean_token_accuracy": 0.8758782148361206,
"step": 566
},
{
"epoch": 1.5576923076923077,
"grad_norm": 7.9168806076049805,
"learning_rate": 2.276327505407505e-06,
"loss": 0.5401,
"mean_token_accuracy": 0.8573486804962158,
"step": 567
},
{
"epoch": 1.5604395604395604,
"grad_norm": 7.8917059898376465,
"learning_rate": 2.2613054546907007e-06,
"loss": 0.531,
"mean_token_accuracy": 0.8639125227928162,
"step": 568
},
{
"epoch": 1.5631868131868132,
"grad_norm": 7.845230579376221,
"learning_rate": 2.2463579091254865e-06,
"loss": 0.6296,
"mean_token_accuracy": 0.8465408682823181,
"step": 569
},
{
"epoch": 1.565934065934066,
"grad_norm": 7.656919479370117,
"learning_rate": 2.2314852125754546e-06,
"loss": 0.4849,
"mean_token_accuracy": 0.891581654548645,
"step": 570
},
{
"epoch": 1.5686813186813187,
"grad_norm": 7.512231826782227,
"learning_rate": 2.2166877071823195e-06,
"loss": 0.4747,
"mean_token_accuracy": 0.8897150158882141,
"step": 571
},
{
"epoch": 1.5714285714285714,
"grad_norm": 7.536767482757568,
"learning_rate": 2.201965733358053e-06,
"loss": 0.6517,
"mean_token_accuracy": 0.8303167223930359,
"step": 572
},
{
"epoch": 1.5741758241758241,
"grad_norm": 6.979121208190918,
"learning_rate": 2.1873196297770407e-06,
"loss": 0.4826,
"mean_token_accuracy": 0.8690476417541504,
"step": 573
},
{
"epoch": 1.5769230769230769,
"grad_norm": 7.652553081512451,
"learning_rate": 2.172749733368299e-06,
"loss": 0.6127,
"mean_token_accuracy": 0.8521836400032043,
"step": 574
},
{
"epoch": 1.5796703296703298,
"grad_norm": 9.753581047058105,
"learning_rate": 2.158256379307722e-06,
"loss": 1.0171,
"mean_token_accuracy": 0.7713936567306519,
"step": 575
},
{
"epoch": 1.5824175824175826,
"grad_norm": 7.93673038482666,
"learning_rate": 2.143839901010372e-06,
"loss": 0.6781,
"mean_token_accuracy": 0.8463227152824402,
"step": 576
},
{
"epoch": 1.5851648351648353,
"grad_norm": 7.431739330291748,
"learning_rate": 2.1295006301228067e-06,
"loss": 0.5165,
"mean_token_accuracy": 0.8890290260314941,
"step": 577
},
{
"epoch": 1.587912087912088,
"grad_norm": 7.179192543029785,
"learning_rate": 2.1152388965154536e-06,
"loss": 0.5525,
"mean_token_accuracy": 0.8607305884361267,
"step": 578
},
{
"epoch": 1.5906593406593408,
"grad_norm": 8.471110343933105,
"learning_rate": 2.101055028275018e-06,
"loss": 0.8447,
"mean_token_accuracy": 0.805443525314331,
"step": 579
},
{
"epoch": 1.5934065934065935,
"grad_norm": 7.559426784515381,
"learning_rate": 2.0869493516969373e-06,
"loss": 0.6594,
"mean_token_accuracy": 0.8257080316543579,
"step": 580
},
{
"epoch": 1.5961538461538463,
"grad_norm": 7.421477317810059,
"learning_rate": 2.0729221912778736e-06,
"loss": 0.4922,
"mean_token_accuracy": 0.8662053346633911,
"step": 581
},
{
"epoch": 1.598901098901099,
"grad_norm": 7.897608757019043,
"learning_rate": 2.0589738697082518e-06,
"loss": 0.6287,
"mean_token_accuracy": 0.8559006452560425,
"step": 582
},
{
"epoch": 1.6016483516483517,
"grad_norm": 7.810044288635254,
"learning_rate": 2.0451047078648316e-06,
"loss": 0.6793,
"mean_token_accuracy": 0.8473118543624878,
"step": 583
},
{
"epoch": 1.6043956043956045,
"grad_norm": 7.520912170410156,
"learning_rate": 2.031315024803327e-06,
"loss": 0.3915,
"mean_token_accuracy": 0.9056203365325928,
"step": 584
},
{
"epoch": 1.6071428571428572,
"grad_norm": 8.198314666748047,
"learning_rate": 2.0176051377510707e-06,
"loss": 0.6472,
"mean_token_accuracy": 0.8204225301742554,
"step": 585
},
{
"epoch": 1.60989010989011,
"grad_norm": 7.389269828796387,
"learning_rate": 2.003975362099711e-06,
"loss": 0.5766,
"mean_token_accuracy": 0.8601484894752502,
"step": 586
},
{
"epoch": 1.6126373626373627,
"grad_norm": 7.9077887535095215,
"learning_rate": 1.9904260113979594e-06,
"loss": 0.5486,
"mean_token_accuracy": 0.8645833134651184,
"step": 587
},
{
"epoch": 1.6153846153846154,
"grad_norm": 7.626746654510498,
"learning_rate": 1.9769573973443767e-06,
"loss": 0.6089,
"mean_token_accuracy": 0.8592411279678345,
"step": 588
},
{
"epoch": 1.6181318681318682,
"grad_norm": 6.979987144470215,
"learning_rate": 1.9635698297802006e-06,
"loss": 0.3991,
"mean_token_accuracy": 0.887499988079071,
"step": 589
},
{
"epoch": 1.620879120879121,
"grad_norm": 7.271255016326904,
"learning_rate": 1.9502636166822253e-06,
"loss": 0.5482,
"mean_token_accuracy": 0.874015748500824,
"step": 590
},
{
"epoch": 1.6236263736263736,
"grad_norm": 8.575961112976074,
"learning_rate": 1.9370390641557034e-06,
"loss": 0.7048,
"mean_token_accuracy": 0.8509485125541687,
"step": 591
},
{
"epoch": 1.6263736263736264,
"grad_norm": 8.057097434997559,
"learning_rate": 1.923896476427315e-06,
"loss": 0.6843,
"mean_token_accuracy": 0.8231292366981506,
"step": 592
},
{
"epoch": 1.629120879120879,
"grad_norm": 6.481530666351318,
"learning_rate": 1.9108361558381695e-06,
"loss": 0.4746,
"mean_token_accuracy": 0.8867470026016235,
"step": 593
},
{
"epoch": 1.6318681318681318,
"grad_norm": 8.327600479125977,
"learning_rate": 1.8978584028368418e-06,
"loss": 0.6566,
"mean_token_accuracy": 0.8510638475418091,
"step": 594
},
{
"epoch": 1.6346153846153846,
"grad_norm": 7.571817398071289,
"learning_rate": 1.8849635159724644e-06,
"loss": 0.6381,
"mean_token_accuracy": 0.867986798286438,
"step": 595
},
{
"epoch": 1.6373626373626373,
"grad_norm": 8.101214408874512,
"learning_rate": 1.8721517918878663e-06,
"loss": 0.6557,
"mean_token_accuracy": 0.8449612259864807,
"step": 596
},
{
"epoch": 1.64010989010989,
"grad_norm": 7.908919334411621,
"learning_rate": 1.8594235253127373e-06,
"loss": 0.5748,
"mean_token_accuracy": 0.8552787899971008,
"step": 597
},
{
"epoch": 1.6428571428571428,
"grad_norm": 8.520292282104492,
"learning_rate": 1.8467790090568554e-06,
"loss": 0.6607,
"mean_token_accuracy": 0.8362652063369751,
"step": 598
},
{
"epoch": 1.6456043956043955,
"grad_norm": 8.188483238220215,
"learning_rate": 1.8342185340033496e-06,
"loss": 0.6737,
"mean_token_accuracy": 0.8428927659988403,
"step": 599
},
{
"epoch": 1.6483516483516483,
"grad_norm": 8.296211242675781,
"learning_rate": 1.8217423891020058e-06,
"loss": 0.5965,
"mean_token_accuracy": 0.8537930846214294,
"step": 600
},
{
"epoch": 1.651098901098901,
"grad_norm": 7.898365497589111,
"learning_rate": 1.8093508613626221e-06,
"loss": 0.6133,
"mean_token_accuracy": 0.8399999737739563,
"step": 601
},
{
"epoch": 1.6538461538461537,
"grad_norm": 7.735372066497803,
"learning_rate": 1.7970442358484049e-06,
"loss": 0.5112,
"mean_token_accuracy": 0.8731428384780884,
"step": 602
},
{
"epoch": 1.6565934065934065,
"grad_norm": 9.300334930419922,
"learning_rate": 1.7848227956694119e-06,
"loss": 0.7739,
"mean_token_accuracy": 0.8144853711128235,
"step": 603
},
{
"epoch": 1.6593406593406592,
"grad_norm": 7.729982376098633,
"learning_rate": 1.7726868219760407e-06,
"loss": 0.5127,
"mean_token_accuracy": 0.8717647194862366,
"step": 604
},
{
"epoch": 1.662087912087912,
"grad_norm": 8.184350967407227,
"learning_rate": 1.7606365939525544e-06,
"loss": 0.6079,
"mean_token_accuracy": 0.8677042722702026,
"step": 605
},
{
"epoch": 1.6648351648351647,
"grad_norm": 7.653561115264893,
"learning_rate": 1.7486723888106689e-06,
"loss": 0.5513,
"mean_token_accuracy": 0.8632371425628662,
"step": 606
},
{
"epoch": 1.6675824175824174,
"grad_norm": 7.327295303344727,
"learning_rate": 1.736794481783168e-06,
"loss": 0.6586,
"mean_token_accuracy": 0.8426023125648499,
"step": 607
},
{
"epoch": 1.6703296703296702,
"grad_norm": 9.70237922668457,
"learning_rate": 1.7250031461175751e-06,
"loss": 0.5927,
"mean_token_accuracy": 0.8830645084381104,
"step": 608
},
{
"epoch": 1.6730769230769231,
"grad_norm": 8.11240005493164,
"learning_rate": 1.713298653069867e-06,
"loss": 0.6267,
"mean_token_accuracy": 0.8669354915618896,
"step": 609
},
{
"epoch": 1.6758241758241759,
"grad_norm": 7.044604778289795,
"learning_rate": 1.7016812718982315e-06,
"loss": 0.6068,
"mean_token_accuracy": 0.8408163189888,
"step": 610
},
{
"epoch": 1.6785714285714286,
"grad_norm": 8.983325958251953,
"learning_rate": 1.6901512698568798e-06,
"loss": 0.6311,
"mean_token_accuracy": 0.8347339034080505,
"step": 611
},
{
"epoch": 1.6813186813186813,
"grad_norm": 6.355010032653809,
"learning_rate": 1.678708912189887e-06,
"loss": 0.4312,
"mean_token_accuracy": 0.8883978128433228,
"step": 612
},
{
"epoch": 1.684065934065934,
"grad_norm": 7.719866752624512,
"learning_rate": 1.6673544621251005e-06,
"loss": 0.5471,
"mean_token_accuracy": 0.8897338509559631,
"step": 613
},
{
"epoch": 1.6868131868131868,
"grad_norm": 7.775828838348389,
"learning_rate": 1.6560881808680824e-06,
"loss": 0.5938,
"mean_token_accuracy": 0.8479212522506714,
"step": 614
},
{
"epoch": 1.6895604395604396,
"grad_norm": 6.961485385894775,
"learning_rate": 1.6449103275960967e-06,
"loss": 0.5051,
"mean_token_accuracy": 0.8725274801254272,
"step": 615
},
{
"epoch": 1.6923076923076923,
"grad_norm": 7.267236232757568,
"learning_rate": 1.633821159452148e-06,
"loss": 0.5444,
"mean_token_accuracy": 0.8766006827354431,
"step": 616
},
{
"epoch": 1.695054945054945,
"grad_norm": 8.290145874023438,
"learning_rate": 1.6228209315390716e-06,
"loss": 0.5968,
"mean_token_accuracy": 0.8578553795814514,
"step": 617
},
{
"epoch": 1.6978021978021978,
"grad_norm": 7.283031940460205,
"learning_rate": 1.611909896913657e-06,
"loss": 0.5454,
"mean_token_accuracy": 0.8671775460243225,
"step": 618
},
{
"epoch": 1.7005494505494505,
"grad_norm": 6.956562519073486,
"learning_rate": 1.6010883065808318e-06,
"loss": 0.5201,
"mean_token_accuracy": 0.8701456189155579,
"step": 619
},
{
"epoch": 1.7032967032967035,
"grad_norm": 7.430464267730713,
"learning_rate": 1.5903564094878857e-06,
"loss": 0.5259,
"mean_token_accuracy": 0.88968825340271,
"step": 620
},
{
"epoch": 1.7060439560439562,
"grad_norm": 8.572484016418457,
"learning_rate": 1.5797144525187433e-06,
"loss": 0.5999,
"mean_token_accuracy": 0.8608695864677429,
"step": 621
},
{
"epoch": 1.708791208791209,
"grad_norm": 7.143974781036377,
"learning_rate": 1.5691626804882837e-06,
"loss": 0.5746,
"mean_token_accuracy": 0.8566392660140991,
"step": 622
},
{
"epoch": 1.7115384615384617,
"grad_norm": 7.603835582733154,
"learning_rate": 1.5587013361367126e-06,
"loss": 0.4557,
"mean_token_accuracy": 0.9003517031669617,
"step": 623
},
{
"epoch": 1.7142857142857144,
"grad_norm": 8.094274520874023,
"learning_rate": 1.5483306601239708e-06,
"loss": 0.6486,
"mean_token_accuracy": 0.8408796787261963,
"step": 624
},
{
"epoch": 1.7170329670329672,
"grad_norm": 9.519272804260254,
"learning_rate": 1.5380508910242099e-06,
"loss": 0.775,
"mean_token_accuracy": 0.8139534592628479,
"step": 625
},
{
"epoch": 1.7197802197802199,
"grad_norm": 10.531563758850098,
"learning_rate": 1.527862265320287e-06,
"loss": 0.8133,
"mean_token_accuracy": 0.8015267252922058,
"step": 626
},
{
"epoch": 1.7225274725274726,
"grad_norm": 7.760357856750488,
"learning_rate": 1.5177650173983415e-06,
"loss": 0.5966,
"mean_token_accuracy": 0.8634311556816101,
"step": 627
},
{
"epoch": 1.7252747252747254,
"grad_norm": 8.748411178588867,
"learning_rate": 1.507759379542393e-06,
"loss": 0.6808,
"mean_token_accuracy": 0.8243430256843567,
"step": 628
},
{
"epoch": 1.728021978021978,
"grad_norm": 7.159116268157959,
"learning_rate": 1.4978455819289994e-06,
"loss": 0.5778,
"mean_token_accuracy": 0.8440366983413696,
"step": 629
},
{
"epoch": 1.7307692307692308,
"grad_norm": 8.136507987976074,
"learning_rate": 1.4880238526219635e-06,
"loss": 0.6099,
"mean_token_accuracy": 0.8495787978172302,
"step": 630
},
{
"epoch": 1.7335164835164836,
"grad_norm": 7.160614013671875,
"learning_rate": 1.4782944175670857e-06,
"loss": 0.5372,
"mean_token_accuracy": 0.8724604845046997,
"step": 631
},
{
"epoch": 1.7362637362637363,
"grad_norm": 7.923597812652588,
"learning_rate": 1.4686575005869663e-06,
"loss": 0.5452,
"mean_token_accuracy": 0.8799019455909729,
"step": 632
},
{
"epoch": 1.739010989010989,
"grad_norm": 8.121793746948242,
"learning_rate": 1.459113323375856e-06,
"loss": 0.5996,
"mean_token_accuracy": 0.8438761830329895,
"step": 633
},
{
"epoch": 1.7417582417582418,
"grad_norm": 7.278065204620361,
"learning_rate": 1.4496621054945545e-06,
"loss": 0.5784,
"mean_token_accuracy": 0.8584905862808228,
"step": 634
},
{
"epoch": 1.7445054945054945,
"grad_norm": 6.142355918884277,
"learning_rate": 1.4403040643653657e-06,
"loss": 0.3751,
"mean_token_accuracy": 0.9079254269599915,
"step": 635
},
{
"epoch": 1.7472527472527473,
"grad_norm": 7.723720550537109,
"learning_rate": 1.4310394152670886e-06,
"loss": 0.6285,
"mean_token_accuracy": 0.8589263558387756,
"step": 636
},
{
"epoch": 1.75,
"grad_norm": 6.810434341430664,
"learning_rate": 1.4218683713300653e-06,
"loss": 0.4836,
"mean_token_accuracy": 0.8887559771537781,
"step": 637
},
{
"epoch": 1.7527472527472527,
"grad_norm": 6.936891555786133,
"learning_rate": 1.4127911435312857e-06,
"loss": 0.5311,
"mean_token_accuracy": 0.8746702075004578,
"step": 638
},
{
"epoch": 1.7554945054945055,
"grad_norm": 7.792840003967285,
"learning_rate": 1.4038079406895261e-06,
"loss": 0.7044,
"mean_token_accuracy": 0.8347205519676208,
"step": 639
},
{
"epoch": 1.7582417582417582,
"grad_norm": 7.776517868041992,
"learning_rate": 1.3949189694605486e-06,
"loss": 0.538,
"mean_token_accuracy": 0.8720445036888123,
"step": 640
},
{
"epoch": 1.760989010989011,
"grad_norm": 6.852481365203857,
"learning_rate": 1.3861244343323466e-06,
"loss": 0.4217,
"mean_token_accuracy": 0.8895630836486816,
"step": 641
},
{
"epoch": 1.7637362637362637,
"grad_norm": 6.889708518981934,
"learning_rate": 1.3774245376204407e-06,
"loss": 0.419,
"mean_token_accuracy": 0.8810572624206543,
"step": 642
},
{
"epoch": 1.7664835164835164,
"grad_norm": 7.797403335571289,
"learning_rate": 1.3688194794632236e-06,
"loss": 0.5792,
"mean_token_accuracy": 0.8663366436958313,
"step": 643
},
{
"epoch": 1.7692307692307692,
"grad_norm": 7.262906551361084,
"learning_rate": 1.3603094578173587e-06,
"loss": 0.5168,
"mean_token_accuracy": 0.8837209343910217,
"step": 644
},
{
"epoch": 1.771978021978022,
"grad_norm": 6.884127140045166,
"learning_rate": 1.3518946684532224e-06,
"loss": 0.3545,
"mean_token_accuracy": 0.9082462191581726,
"step": 645
},
{
"epoch": 1.7747252747252746,
"grad_norm": 7.663375377655029,
"learning_rate": 1.3435753049504041e-06,
"loss": 0.5832,
"mean_token_accuracy": 0.862023651599884,
"step": 646
},
{
"epoch": 1.7774725274725274,
"grad_norm": 8.101320266723633,
"learning_rate": 1.3353515586932497e-06,
"loss": 0.6461,
"mean_token_accuracy": 0.8532731533050537,
"step": 647
},
{
"epoch": 1.7802197802197801,
"grad_norm": 6.119275093078613,
"learning_rate": 1.32722361886646e-06,
"loss": 0.4742,
"mean_token_accuracy": 0.8779661059379578,
"step": 648
},
{
"epoch": 1.7829670329670328,
"grad_norm": 6.471920490264893,
"learning_rate": 1.3191916724507415e-06,
"loss": 0.3851,
"mean_token_accuracy": 0.8980815410614014,
"step": 649
},
{
"epoch": 1.7857142857142856,
"grad_norm": 7.637779712677002,
"learning_rate": 1.3112559042184993e-06,
"loss": 0.6389,
"mean_token_accuracy": 0.8456140160560608,
"step": 650
},
{
"epoch": 1.7884615384615383,
"grad_norm": 6.639374256134033,
"learning_rate": 1.3034164967295929e-06,
"loss": 0.4687,
"mean_token_accuracy": 0.8847059011459351,
"step": 651
},
{
"epoch": 1.791208791208791,
"grad_norm": 7.081116199493408,
"learning_rate": 1.2956736303271292e-06,
"loss": 0.4666,
"mean_token_accuracy": 0.8781321048736572,
"step": 652
},
{
"epoch": 1.7939560439560438,
"grad_norm": 7.375399589538574,
"learning_rate": 1.2880274831333211e-06,
"loss": 0.5556,
"mean_token_accuracy": 0.8748639822006226,
"step": 653
},
{
"epoch": 1.7967032967032965,
"grad_norm": 7.636590957641602,
"learning_rate": 1.2804782310453842e-06,
"loss": 0.503,
"mean_token_accuracy": 0.8553008437156677,
"step": 654
},
{
"epoch": 1.7994505494505495,
"grad_norm": 7.556530952453613,
"learning_rate": 1.2730260477314943e-06,
"loss": 0.5379,
"mean_token_accuracy": 0.8660826086997986,
"step": 655
},
{
"epoch": 1.8021978021978022,
"grad_norm": 7.6223320960998535,
"learning_rate": 1.2656711046267891e-06,
"loss": 0.5053,
"mean_token_accuracy": 0.8884976506233215,
"step": 656
},
{
"epoch": 1.804945054945055,
"grad_norm": 7.420393466949463,
"learning_rate": 1.2584135709294283e-06,
"loss": 0.5047,
"mean_token_accuracy": 0.8751696348190308,
"step": 657
},
{
"epoch": 1.8076923076923077,
"grad_norm": 7.559555530548096,
"learning_rate": 1.2512536135966938e-06,
"loss": 0.6115,
"mean_token_accuracy": 0.8485838770866394,
"step": 658
},
{
"epoch": 1.8104395604395604,
"grad_norm": 8.376801490783691,
"learning_rate": 1.2441913973411594e-06,
"loss": 0.5606,
"mean_token_accuracy": 0.8773333430290222,
"step": 659
},
{
"epoch": 1.8131868131868132,
"grad_norm": 6.666446685791016,
"learning_rate": 1.2372270846268935e-06,
"loss": 0.4313,
"mean_token_accuracy": 0.8859060406684875,
"step": 660
},
{
"epoch": 1.815934065934066,
"grad_norm": 7.478560447692871,
"learning_rate": 1.2303608356657226e-06,
"loss": 0.5867,
"mean_token_accuracy": 0.8590604066848755,
"step": 661
},
{
"epoch": 1.8186813186813187,
"grad_norm": 7.4808855056762695,
"learning_rate": 1.223592808413551e-06,
"loss": 0.5184,
"mean_token_accuracy": 0.8727959990501404,
"step": 662
},
{
"epoch": 1.8214285714285714,
"grad_norm": 8.410175323486328,
"learning_rate": 1.216923158566721e-06,
"loss": 0.6202,
"mean_token_accuracy": 0.8604027032852173,
"step": 663
},
{
"epoch": 1.8241758241758241,
"grad_norm": 8.690070152282715,
"learning_rate": 1.2103520395584339e-06,
"loss": 0.6715,
"mean_token_accuracy": 0.8455089926719666,
"step": 664
},
{
"epoch": 1.8269230769230769,
"grad_norm": 8.344685554504395,
"learning_rate": 1.2038796025552207e-06,
"loss": 0.5889,
"mean_token_accuracy": 0.8775757551193237,
"step": 665
},
{
"epoch": 1.8296703296703298,
"grad_norm": 7.113524913787842,
"learning_rate": 1.1975059964534628e-06,
"loss": 0.4772,
"mean_token_accuracy": 0.8787062168121338,
"step": 666
},
{
"epoch": 1.8324175824175826,
"grad_norm": 9.631033897399902,
"learning_rate": 1.191231367875969e-06,
"loss": 0.5952,
"mean_token_accuracy": 0.8740053176879883,
"step": 667
},
{
"epoch": 1.8351648351648353,
"grad_norm": 7.207862854003906,
"learning_rate": 1.1850558611685998e-06,
"loss": 0.47,
"mean_token_accuracy": 0.8942093253135681,
"step": 668
},
{
"epoch": 1.837912087912088,
"grad_norm": 7.906933307647705,
"learning_rate": 1.178979618396949e-06,
"loss": 0.5486,
"mean_token_accuracy": 0.8479042053222656,
"step": 669
},
{
"epoch": 1.8406593406593408,
"grad_norm": 7.593602657318115,
"learning_rate": 1.173002779343075e-06,
"loss": 0.5137,
"mean_token_accuracy": 0.8721351027488708,
"step": 670
},
{
"epoch": 1.8434065934065935,
"grad_norm": 6.665176868438721,
"learning_rate": 1.167125481502284e-06,
"loss": 0.5351,
"mean_token_accuracy": 0.8627219200134277,
"step": 671
},
{
"epoch": 1.8461538461538463,
"grad_norm": 7.5192670822143555,
"learning_rate": 1.1613478600799688e-06,
"loss": 0.5553,
"mean_token_accuracy": 0.8680142760276794,
"step": 672
},
{
"epoch": 1.848901098901099,
"grad_norm": 7.737743377685547,
"learning_rate": 1.1556700479884969e-06,
"loss": 0.4324,
"mean_token_accuracy": 0.8904638886451721,
"step": 673
},
{
"epoch": 1.8516483516483517,
"grad_norm": 7.322444438934326,
"learning_rate": 1.150092175844153e-06,
"loss": 0.486,
"mean_token_accuracy": 0.8801897764205933,
"step": 674
},
{
"epoch": 1.8543956043956045,
"grad_norm": 8.201929092407227,
"learning_rate": 1.1446143719641354e-06,
"loss": 0.5865,
"mean_token_accuracy": 0.852223813533783,
"step": 675
},
{
"epoch": 1.8571428571428572,
"grad_norm": 7.934874057769775,
"learning_rate": 1.1392367623636041e-06,
"loss": 0.7429,
"mean_token_accuracy": 0.8293269276618958,
"step": 676
},
{
"epoch": 1.85989010989011,
"grad_norm": 7.658584117889404,
"learning_rate": 1.133959470752779e-06,
"loss": 0.5866,
"mean_token_accuracy": 0.8679039478302002,
"step": 677
},
{
"epoch": 1.8626373626373627,
"grad_norm": 7.818109512329102,
"learning_rate": 1.1287826185340987e-06,
"loss": 0.5487,
"mean_token_accuracy": 0.8636363744735718,
"step": 678
},
{
"epoch": 1.8653846153846154,
"grad_norm": 7.738321304321289,
"learning_rate": 1.1237063247994219e-06,
"loss": 0.6939,
"mean_token_accuracy": 0.8329596519470215,
"step": 679
},
{
"epoch": 1.8681318681318682,
"grad_norm": 8.202414512634277,
"learning_rate": 1.1187307063272948e-06,
"loss": 0.5474,
"mean_token_accuracy": 0.8687415719032288,
"step": 680
},
{
"epoch": 1.870879120879121,
"grad_norm": 9.126557350158691,
"learning_rate": 1.1138558775802582e-06,
"loss": 0.6274,
"mean_token_accuracy": 0.8616071343421936,
"step": 681
},
{
"epoch": 1.8736263736263736,
"grad_norm": 7.537760257720947,
"learning_rate": 1.1090819507022166e-06,
"loss": 0.5639,
"mean_token_accuracy": 0.8717948794364929,
"step": 682
},
{
"epoch": 1.8763736263736264,
"grad_norm": 7.776443004608154,
"learning_rate": 1.1044090355158607e-06,
"loss": 0.5634,
"mean_token_accuracy": 0.8497174978256226,
"step": 683
},
{
"epoch": 1.879120879120879,
"grad_norm": 7.746001720428467,
"learning_rate": 1.0998372395201377e-06,
"loss": 0.5863,
"mean_token_accuracy": 0.8390297889709473,
"step": 684
},
{
"epoch": 1.8818681318681318,
"grad_norm": 8.56049633026123,
"learning_rate": 1.0953666678877789e-06,
"loss": 0.5126,
"mean_token_accuracy": 0.8790435791015625,
"step": 685
},
{
"epoch": 1.8846153846153846,
"grad_norm": 7.443101406097412,
"learning_rate": 1.0909974234628826e-06,
"loss": 0.48,
"mean_token_accuracy": 0.8778833150863647,
"step": 686
},
{
"epoch": 1.8873626373626373,
"grad_norm": 8.6889066696167,
"learning_rate": 1.0867296067585444e-06,
"loss": 0.7503,
"mean_token_accuracy": 0.832425057888031,
"step": 687
},
{
"epoch": 1.89010989010989,
"grad_norm": 8.396735191345215,
"learning_rate": 1.0825633159545498e-06,
"loss": 0.6225,
"mean_token_accuracy": 0.8434886336326599,
"step": 688
},
{
"epoch": 1.8928571428571428,
"grad_norm": 6.919631481170654,
"learning_rate": 1.0784986468951102e-06,
"loss": 0.5102,
"mean_token_accuracy": 0.8629807829856873,
"step": 689
},
{
"epoch": 1.8956043956043955,
"grad_norm": 8.00849723815918,
"learning_rate": 1.0745356930866608e-06,
"loss": 0.6416,
"mean_token_accuracy": 0.8387942314147949,
"step": 690
},
{
"epoch": 1.8983516483516483,
"grad_norm": 6.977931499481201,
"learning_rate": 1.0706745456957125e-06,
"loss": 0.4322,
"mean_token_accuracy": 0.9055214524269104,
"step": 691
},
{
"epoch": 1.901098901098901,
"grad_norm": 8.84980583190918,
"learning_rate": 1.0669152935467473e-06,
"loss": 0.8483,
"mean_token_accuracy": 0.8108747005462646,
"step": 692
},
{
"epoch": 1.9038461538461537,
"grad_norm": 7.389832019805908,
"learning_rate": 1.0632580231201816e-06,
"loss": 0.5073,
"mean_token_accuracy": 0.8797468543052673,
"step": 693
},
{
"epoch": 1.9065934065934065,
"grad_norm": 8.551029205322266,
"learning_rate": 1.0597028185503741e-06,
"loss": 0.7209,
"mean_token_accuracy": 0.8173515796661377,
"step": 694
},
{
"epoch": 1.9093406593406592,
"grad_norm": 8.817502975463867,
"learning_rate": 1.0562497616236902e-06,
"loss": 0.5205,
"mean_token_accuracy": 0.8672566413879395,
"step": 695
},
{
"epoch": 1.912087912087912,
"grad_norm": 7.306185722351074,
"learning_rate": 1.0528989317766207e-06,
"loss": 0.567,
"mean_token_accuracy": 0.8645319938659668,
"step": 696
},
{
"epoch": 1.9148351648351647,
"grad_norm": 6.0902228355407715,
"learning_rate": 1.0496504060939541e-06,
"loss": 0.4767,
"mean_token_accuracy": 0.8842975497245789,
"step": 697
},
{
"epoch": 1.9175824175824174,
"grad_norm": 6.566694736480713,
"learning_rate": 1.0465042593070051e-06,
"loss": 0.4539,
"mean_token_accuracy": 0.8871151804924011,
"step": 698
},
{
"epoch": 1.9203296703296702,
"grad_norm": 7.715193748474121,
"learning_rate": 1.0434605637918922e-06,
"loss": 0.4615,
"mean_token_accuracy": 0.8861111402511597,
"step": 699
},
{
"epoch": 1.9230769230769231,
"grad_norm": 6.1946821212768555,
"learning_rate": 1.040519389567876e-06,
"loss": 0.3546,
"mean_token_accuracy": 0.9018912315368652,
"step": 700
},
{
"epoch": 1.9258241758241759,
"grad_norm": 8.542852401733398,
"learning_rate": 1.0376808042957467e-06,
"loss": 0.7264,
"mean_token_accuracy": 0.8218673467636108,
"step": 701
},
{
"epoch": 1.9285714285714286,
"grad_norm": 8.205451011657715,
"learning_rate": 1.0349448732762673e-06,
"loss": 0.6035,
"mean_token_accuracy": 0.8728476762771606,
"step": 702
},
{
"epoch": 1.9313186813186813,
"grad_norm": 8.581854820251465,
"learning_rate": 1.032311659448672e-06,
"loss": 0.6127,
"mean_token_accuracy": 0.8588390350341797,
"step": 703
},
{
"epoch": 1.934065934065934,
"grad_norm": 6.2623443603515625,
"learning_rate": 1.0297812233892193e-06,
"loss": 0.4931,
"mean_token_accuracy": 0.8853362798690796,
"step": 704
},
{
"epoch": 1.9368131868131868,
"grad_norm": 8.019908905029297,
"learning_rate": 1.0273536233097956e-06,
"loss": 0.622,
"mean_token_accuracy": 0.8511363863945007,
"step": 705
},
{
"epoch": 1.9395604395604396,
"grad_norm": 6.778481960296631,
"learning_rate": 1.02502891505658e-06,
"loss": 0.5602,
"mean_token_accuracy": 0.8511837720870972,
"step": 706
},
{
"epoch": 1.9423076923076923,
"grad_norm": 7.289844512939453,
"learning_rate": 1.0228071521087555e-06,
"loss": 0.5142,
"mean_token_accuracy": 0.8857837319374084,
"step": 707
},
{
"epoch": 1.945054945054945,
"grad_norm": 7.7085466384887695,
"learning_rate": 1.0206883855772813e-06,
"loss": 0.6465,
"mean_token_accuracy": 0.8451536893844604,
"step": 708
},
{
"epoch": 1.9478021978021978,
"grad_norm": 8.750447273254395,
"learning_rate": 1.0186726642037172e-06,
"loss": 0.6184,
"mean_token_accuracy": 0.8487874269485474,
"step": 709
},
{
"epoch": 1.9505494505494505,
"grad_norm": 9.00527286529541,
"learning_rate": 1.0167600343591e-06,
"loss": 0.7001,
"mean_token_accuracy": 0.8461538553237915,
"step": 710
},
{
"epoch": 1.9532967032967035,
"grad_norm": 7.398614406585693,
"learning_rate": 1.0149505400428795e-06,
"loss": 0.5246,
"mean_token_accuracy": 0.869927167892456,
"step": 711
},
{
"epoch": 1.9560439560439562,
"grad_norm": 7.224842548370361,
"learning_rate": 1.0132442228819047e-06,
"loss": 0.4347,
"mean_token_accuracy": 0.9007731676101685,
"step": 712
},
{
"epoch": 1.958791208791209,
"grad_norm": 6.304765224456787,
"learning_rate": 1.0116411221294663e-06,
"loss": 0.3831,
"mean_token_accuracy": 0.9077936410903931,
"step": 713
},
{
"epoch": 1.9615384615384617,
"grad_norm": 8.755301475524902,
"learning_rate": 1.0101412746643932e-06,
"loss": 0.6125,
"mean_token_accuracy": 0.8540462255477905,
"step": 714
},
{
"epoch": 1.9642857142857144,
"grad_norm": 8.6292724609375,
"learning_rate": 1.0087447149902067e-06,
"loss": 0.6039,
"mean_token_accuracy": 0.8370370268821716,
"step": 715
},
{
"epoch": 1.9670329670329672,
"grad_norm": 7.7700347900390625,
"learning_rate": 1.0074514752343238e-06,
"loss": 0.5602,
"mean_token_accuracy": 0.8450184464454651,
"step": 716
},
{
"epoch": 1.9697802197802199,
"grad_norm": 8.478242874145508,
"learning_rate": 1.0062615851473182e-06,
"loss": 0.6344,
"mean_token_accuracy": 0.8534798622131348,
"step": 717
},
{
"epoch": 1.9725274725274726,
"grad_norm": 8.109445571899414,
"learning_rate": 1.0051750721022387e-06,
"loss": 0.5387,
"mean_token_accuracy": 0.8751530051231384,
"step": 718
},
{
"epoch": 1.9752747252747254,
"grad_norm": 6.251338958740234,
"learning_rate": 1.0041919610939768e-06,
"loss": 0.3721,
"mean_token_accuracy": 0.9060240983963013,
"step": 719
},
{
"epoch": 1.978021978021978,
"grad_norm": 8.324958801269531,
"learning_rate": 1.0033122747386922e-06,
"loss": 0.6845,
"mean_token_accuracy": 0.8313725590705872,
"step": 720
},
{
"epoch": 1.9807692307692308,
"grad_norm": 8.283183097839355,
"learning_rate": 1.0025360332732942e-06,
"loss": 0.5906,
"mean_token_accuracy": 0.8554913401603699,
"step": 721
},
{
"epoch": 1.9835164835164836,
"grad_norm": 8.054919242858887,
"learning_rate": 1.0018632545549739e-06,
"loss": 0.7021,
"mean_token_accuracy": 0.8413878679275513,
"step": 722
},
{
"epoch": 1.9862637362637363,
"grad_norm": 7.945398807525635,
"learning_rate": 1.0012939540607945e-06,
"loss": 0.5362,
"mean_token_accuracy": 0.8812903165817261,
"step": 723
},
{
"epoch": 1.989010989010989,
"grad_norm": 7.639397144317627,
"learning_rate": 1.0008281448873346e-06,
"loss": 0.5045,
"mean_token_accuracy": 0.871408998966217,
"step": 724
},
{
"epoch": 1.9917582417582418,
"grad_norm": 7.862728595733643,
"learning_rate": 1.0004658377503893e-06,
"loss": 0.6753,
"mean_token_accuracy": 0.8276283740997314,
"step": 725
},
{
"epoch": 1.9945054945054945,
"grad_norm": 8.535736083984375,
"learning_rate": 1.0002070409847193e-06,
"loss": 0.6547,
"mean_token_accuracy": 0.8429203629493713,
"step": 726
},
{
"epoch": 1.9972527472527473,
"grad_norm": 6.102328300476074,
"learning_rate": 1.0000517605438636e-06,
"loss": 0.3124,
"mean_token_accuracy": 0.9138134717941284,
"step": 727
},
{
"epoch": 2.0,
"grad_norm": 6.113708972930908,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.3288,
"mean_token_accuracy": 0.9122137427330017,
"step": 728
},
{
"epoch": 2.0,
"step": 728,
"total_flos": 4.595247209544417e+17,
"train_loss": 1.052156428714375,
"train_runtime": 2874.0296,
"train_samples_per_second": 8.099,
"train_steps_per_second": 0.253
}
],
"logging_steps": 1,
"max_steps": 728,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 4.595247209544417e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}