Files
Qwen1.5-MOE-sft-gsm/trainer_state.json
ModelHub XC 87d94517b0 初始化项目,由ModelHub XC社区提供模型
Model: jayzou3773/Qwen1.5-MOE-sft-gsm
Source: Original Platform
2026-05-26 22:22:22 +08:00

3787 lines
102 KiB
JSON

{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 468,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004273504273504274,
"grad_norm": 55.73861312866211,
"learning_rate": 2.1276595744680852e-07,
"loss": 1.4264,
"mean_token_accuracy": 0.8229577541351318,
"step": 1
},
{
"epoch": 0.008547008547008548,
"grad_norm": 53.30722427368164,
"learning_rate": 4.2553191489361704e-07,
"loss": 1.3324,
"mean_token_accuracy": 0.8398163318634033,
"step": 2
},
{
"epoch": 0.01282051282051282,
"grad_norm": 57.59785461425781,
"learning_rate": 6.382978723404255e-07,
"loss": 1.4545,
"mean_token_accuracy": 0.8217088580131531,
"step": 3
},
{
"epoch": 0.017094017094017096,
"grad_norm": 52.380863189697266,
"learning_rate": 8.510638297872341e-07,
"loss": 1.3236,
"mean_token_accuracy": 0.8358691334724426,
"step": 4
},
{
"epoch": 0.021367521367521368,
"grad_norm": 56.7705192565918,
"learning_rate": 1.0638297872340427e-06,
"loss": 1.3879,
"mean_token_accuracy": 0.831577479839325,
"step": 5
},
{
"epoch": 0.02564102564102564,
"grad_norm": 47.17439270019531,
"learning_rate": 1.276595744680851e-06,
"loss": 1.4027,
"mean_token_accuracy": 0.824908971786499,
"step": 6
},
{
"epoch": 0.029914529914529916,
"grad_norm": 48.12850570678711,
"learning_rate": 1.4893617021276596e-06,
"loss": 1.376,
"mean_token_accuracy": 0.831479012966156,
"step": 7
},
{
"epoch": 0.03418803418803419,
"grad_norm": 40.80478286743164,
"learning_rate": 1.7021276595744682e-06,
"loss": 1.2739,
"mean_token_accuracy": 0.8457263112068176,
"step": 8
},
{
"epoch": 0.038461538461538464,
"grad_norm": 36.486202239990234,
"learning_rate": 1.9148936170212767e-06,
"loss": 1.184,
"mean_token_accuracy": 0.856406569480896,
"step": 9
},
{
"epoch": 0.042735042735042736,
"grad_norm": 37.32433319091797,
"learning_rate": 2.1276595744680853e-06,
"loss": 1.1372,
"mean_token_accuracy": 0.8697962760925293,
"step": 10
},
{
"epoch": 0.04700854700854701,
"grad_norm": 42.96809005737305,
"learning_rate": 2.340425531914894e-06,
"loss": 1.0039,
"mean_token_accuracy": 0.8862177133560181,
"step": 11
},
{
"epoch": 0.05128205128205128,
"grad_norm": 28.670639038085938,
"learning_rate": 2.553191489361702e-06,
"loss": 0.9532,
"mean_token_accuracy": 0.8876250982284546,
"step": 12
},
{
"epoch": 0.05555555555555555,
"grad_norm": 21.205902099609375,
"learning_rate": 2.765957446808511e-06,
"loss": 0.8262,
"mean_token_accuracy": 0.9083229303359985,
"step": 13
},
{
"epoch": 0.05982905982905983,
"grad_norm": 26.272602081298828,
"learning_rate": 2.978723404255319e-06,
"loss": 0.7653,
"mean_token_accuracy": 0.9099212884902954,
"step": 14
},
{
"epoch": 0.0641025641025641,
"grad_norm": 21.69289207458496,
"learning_rate": 3.191489361702128e-06,
"loss": 0.6729,
"mean_token_accuracy": 0.9244007468223572,
"step": 15
},
{
"epoch": 0.06837606837606838,
"grad_norm": 15.221785545349121,
"learning_rate": 3.4042553191489363e-06,
"loss": 0.5633,
"mean_token_accuracy": 0.9354838728904724,
"step": 16
},
{
"epoch": 0.07264957264957266,
"grad_norm": 13.282171249389648,
"learning_rate": 3.6170212765957453e-06,
"loss": 0.5253,
"mean_token_accuracy": 0.9385498762130737,
"step": 17
},
{
"epoch": 0.07692307692307693,
"grad_norm": 9.240787506103516,
"learning_rate": 3.8297872340425535e-06,
"loss": 0.4977,
"mean_token_accuracy": 0.9378522634506226,
"step": 18
},
{
"epoch": 0.0811965811965812,
"grad_norm": 11.71231746673584,
"learning_rate": 4.042553191489362e-06,
"loss": 0.4495,
"mean_token_accuracy": 0.94464111328125,
"step": 19
},
{
"epoch": 0.08547008547008547,
"grad_norm": 14.272575378417969,
"learning_rate": 4.255319148936171e-06,
"loss": 0.4435,
"mean_token_accuracy": 0.9443333745002747,
"step": 20
},
{
"epoch": 0.08974358974358974,
"grad_norm": 5.9292144775390625,
"learning_rate": 4.468085106382979e-06,
"loss": 0.3692,
"mean_token_accuracy": 0.9512913227081299,
"step": 21
},
{
"epoch": 0.09401709401709402,
"grad_norm": 5.188922882080078,
"learning_rate": 4.680851063829788e-06,
"loss": 0.3901,
"mean_token_accuracy": 0.9478208422660828,
"step": 22
},
{
"epoch": 0.09829059829059829,
"grad_norm": 5.5339274406433105,
"learning_rate": 4.893617021276596e-06,
"loss": 0.3865,
"mean_token_accuracy": 0.9494546055793762,
"step": 23
},
{
"epoch": 0.10256410256410256,
"grad_norm": 6.537012100219727,
"learning_rate": 5.106382978723404e-06,
"loss": 0.3417,
"mean_token_accuracy": 0.9544081091880798,
"step": 24
},
{
"epoch": 0.10683760683760683,
"grad_norm": 4.679383277893066,
"learning_rate": 5.319148936170213e-06,
"loss": 0.3538,
"mean_token_accuracy": 0.9526859521865845,
"step": 25
},
{
"epoch": 0.1111111111111111,
"grad_norm": 4.511115074157715,
"learning_rate": 5.531914893617022e-06,
"loss": 0.346,
"mean_token_accuracy": 0.9522511959075928,
"step": 26
},
{
"epoch": 0.11538461538461539,
"grad_norm": 5.591527938842773,
"learning_rate": 5.744680851063831e-06,
"loss": 0.3842,
"mean_token_accuracy": 0.9494712352752686,
"step": 27
},
{
"epoch": 0.11965811965811966,
"grad_norm": 5.034028053283691,
"learning_rate": 5.957446808510638e-06,
"loss": 0.3603,
"mean_token_accuracy": 0.9508964419364929,
"step": 28
},
{
"epoch": 0.12393162393162394,
"grad_norm": 5.053023815155029,
"learning_rate": 6.170212765957447e-06,
"loss": 0.3497,
"mean_token_accuracy": 0.9484222531318665,
"step": 29
},
{
"epoch": 0.1282051282051282,
"grad_norm": 4.7491536140441895,
"learning_rate": 6.382978723404256e-06,
"loss": 0.3214,
"mean_token_accuracy": 0.9549383521080017,
"step": 30
},
{
"epoch": 0.13247863247863248,
"grad_norm": 4.993784427642822,
"learning_rate": 6.595744680851064e-06,
"loss": 0.3288,
"mean_token_accuracy": 0.9535877704620361,
"step": 31
},
{
"epoch": 0.13675213675213677,
"grad_norm": 5.114716053009033,
"learning_rate": 6.808510638297873e-06,
"loss": 0.3161,
"mean_token_accuracy": 0.9539319276809692,
"step": 32
},
{
"epoch": 0.14102564102564102,
"grad_norm": 5.302931308746338,
"learning_rate": 7.021276595744682e-06,
"loss": 0.3356,
"mean_token_accuracy": 0.9496086239814758,
"step": 33
},
{
"epoch": 0.1452991452991453,
"grad_norm": 4.586728096008301,
"learning_rate": 7.234042553191491e-06,
"loss": 0.2819,
"mean_token_accuracy": 0.9578362107276917,
"step": 34
},
{
"epoch": 0.14957264957264957,
"grad_norm": 4.939296245574951,
"learning_rate": 7.446808510638298e-06,
"loss": 0.2949,
"mean_token_accuracy": 0.9560521841049194,
"step": 35
},
{
"epoch": 0.15384615384615385,
"grad_norm": 4.6308064460754395,
"learning_rate": 7.659574468085107e-06,
"loss": 0.2724,
"mean_token_accuracy": 0.9601955413818359,
"step": 36
},
{
"epoch": 0.1581196581196581,
"grad_norm": 5.236644268035889,
"learning_rate": 7.872340425531916e-06,
"loss": 0.2964,
"mean_token_accuracy": 0.9557088017463684,
"step": 37
},
{
"epoch": 0.1623931623931624,
"grad_norm": 6.120398044586182,
"learning_rate": 8.085106382978723e-06,
"loss": 0.2837,
"mean_token_accuracy": 0.9584332704544067,
"step": 38
},
{
"epoch": 0.16666666666666666,
"grad_norm": 4.9033522605896,
"learning_rate": 8.297872340425532e-06,
"loss": 0.2664,
"mean_token_accuracy": 0.9623565673828125,
"step": 39
},
{
"epoch": 0.17094017094017094,
"grad_norm": 5.031535625457764,
"learning_rate": 8.510638297872341e-06,
"loss": 0.2735,
"mean_token_accuracy": 0.9660788774490356,
"step": 40
},
{
"epoch": 0.1752136752136752,
"grad_norm": 5.569308757781982,
"learning_rate": 8.72340425531915e-06,
"loss": 0.3008,
"mean_token_accuracy": 0.9610214829444885,
"step": 41
},
{
"epoch": 0.1794871794871795,
"grad_norm": 4.788116455078125,
"learning_rate": 8.936170212765958e-06,
"loss": 0.249,
"mean_token_accuracy": 0.9689068794250488,
"step": 42
},
{
"epoch": 0.18376068376068377,
"grad_norm": 4.97907829284668,
"learning_rate": 9.148936170212767e-06,
"loss": 0.2526,
"mean_token_accuracy": 0.9705320596694946,
"step": 43
},
{
"epoch": 0.18803418803418803,
"grad_norm": 5.394193172454834,
"learning_rate": 9.361702127659576e-06,
"loss": 0.2575,
"mean_token_accuracy": 0.9729896783828735,
"step": 44
},
{
"epoch": 0.19230769230769232,
"grad_norm": 5.6487345695495605,
"learning_rate": 9.574468085106385e-06,
"loss": 0.2699,
"mean_token_accuracy": 0.9703598022460938,
"step": 45
},
{
"epoch": 0.19658119658119658,
"grad_norm": 4.73795223236084,
"learning_rate": 9.787234042553192e-06,
"loss": 0.213,
"mean_token_accuracy": 0.9751967191696167,
"step": 46
},
{
"epoch": 0.20085470085470086,
"grad_norm": 5.331301689147949,
"learning_rate": 1e-05,
"loss": 0.2284,
"mean_token_accuracy": 0.9703989624977112,
"step": 47
},
{
"epoch": 0.20512820512820512,
"grad_norm": 5.021146774291992,
"learning_rate": 9.999874710101753e-06,
"loss": 0.2204,
"mean_token_accuracy": 0.9770245552062988,
"step": 48
},
{
"epoch": 0.2094017094017094,
"grad_norm": 5.38365364074707,
"learning_rate": 9.999498847383701e-06,
"loss": 0.2246,
"mean_token_accuracy": 0.9734657406806946,
"step": 49
},
{
"epoch": 0.21367521367521367,
"grad_norm": 5.117666244506836,
"learning_rate": 9.998872432775537e-06,
"loss": 0.2124,
"mean_token_accuracy": 0.9771387577056885,
"step": 50
},
{
"epoch": 0.21794871794871795,
"grad_norm": 5.200557231903076,
"learning_rate": 9.997995501158781e-06,
"loss": 0.1975,
"mean_token_accuracy": 0.9770382046699524,
"step": 51
},
{
"epoch": 0.2222222222222222,
"grad_norm": 5.478715419769287,
"learning_rate": 9.996868101364841e-06,
"loss": 0.2082,
"mean_token_accuracy": 0.9750229716300964,
"step": 52
},
{
"epoch": 0.2264957264957265,
"grad_norm": 5.196822643280029,
"learning_rate": 9.995490296172302e-06,
"loss": 0.1931,
"mean_token_accuracy": 0.9769241809844971,
"step": 53
},
{
"epoch": 0.23076923076923078,
"grad_norm": 5.330948352813721,
"learning_rate": 9.993862162303414e-06,
"loss": 0.1951,
"mean_token_accuracy": 0.9732048511505127,
"step": 54
},
{
"epoch": 0.23504273504273504,
"grad_norm": 4.878313064575195,
"learning_rate": 9.991983790419835e-06,
"loss": 0.1719,
"mean_token_accuracy": 0.9765658378601074,
"step": 55
},
{
"epoch": 0.23931623931623933,
"grad_norm": 5.060050964355469,
"learning_rate": 9.989855285117573e-06,
"loss": 0.1844,
"mean_token_accuracy": 0.9762716889381409,
"step": 56
},
{
"epoch": 0.24358974358974358,
"grad_norm": 4.994210243225098,
"learning_rate": 9.987476764921172e-06,
"loss": 0.1818,
"mean_token_accuracy": 0.9686596393585205,
"step": 57
},
{
"epoch": 0.24786324786324787,
"grad_norm": 4.1533708572387695,
"learning_rate": 9.984848362277094e-06,
"loss": 0.1432,
"mean_token_accuracy": 0.9788950085639954,
"step": 58
},
{
"epoch": 0.25213675213675213,
"grad_norm": 4.930089473724365,
"learning_rate": 9.981970223546365e-06,
"loss": 0.1659,
"mean_token_accuracy": 0.9724494814872742,
"step": 59
},
{
"epoch": 0.2564102564102564,
"grad_norm": 4.344954490661621,
"learning_rate": 9.978842508996411e-06,
"loss": 0.156,
"mean_token_accuracy": 0.9734289050102234,
"step": 60
},
{
"epoch": 0.2606837606837607,
"grad_norm": 4.381039619445801,
"learning_rate": 9.975465392792136e-06,
"loss": 0.1529,
"mean_token_accuracy": 0.971455454826355,
"step": 61
},
{
"epoch": 0.26495726495726496,
"grad_norm": 4.306074619293213,
"learning_rate": 9.971839062986229e-06,
"loss": 0.1589,
"mean_token_accuracy": 0.9690320491790771,
"step": 62
},
{
"epoch": 0.2692307692307692,
"grad_norm": 3.455580949783325,
"learning_rate": 9.967963721508684e-06,
"loss": 0.1269,
"mean_token_accuracy": 0.9736381769180298,
"step": 63
},
{
"epoch": 0.27350427350427353,
"grad_norm": 3.405048370361328,
"learning_rate": 9.963839584155565e-06,
"loss": 0.1266,
"mean_token_accuracy": 0.9724921584129333,
"step": 64
},
{
"epoch": 0.2777777777777778,
"grad_norm": 3.429849147796631,
"learning_rate": 9.95946688057698e-06,
"loss": 0.1309,
"mean_token_accuracy": 0.9714710116386414,
"step": 65
},
{
"epoch": 0.28205128205128205,
"grad_norm": 3.2758281230926514,
"learning_rate": 9.954845854264306e-06,
"loss": 0.1294,
"mean_token_accuracy": 0.9714531898498535,
"step": 66
},
{
"epoch": 0.2863247863247863,
"grad_norm": 2.8356447219848633,
"learning_rate": 9.949976762536612e-06,
"loss": 0.1213,
"mean_token_accuracy": 0.9722399711608887,
"step": 67
},
{
"epoch": 0.2905982905982906,
"grad_norm": 2.8008711338043213,
"learning_rate": 9.944859876526348e-06,
"loss": 0.1204,
"mean_token_accuracy": 0.9703302979469299,
"step": 68
},
{
"epoch": 0.2948717948717949,
"grad_norm": 2.8772833347320557,
"learning_rate": 9.939495481164237e-06,
"loss": 0.1305,
"mean_token_accuracy": 0.9711523652076721,
"step": 69
},
{
"epoch": 0.29914529914529914,
"grad_norm": 2.3752126693725586,
"learning_rate": 9.933883875163411e-06,
"loss": 0.1187,
"mean_token_accuracy": 0.9728144407272339,
"step": 70
},
{
"epoch": 0.3034188034188034,
"grad_norm": 3.0123136043548584,
"learning_rate": 9.928025371002781e-06,
"loss": 0.11,
"mean_token_accuracy": 0.9761629700660706,
"step": 71
},
{
"epoch": 0.3076923076923077,
"grad_norm": 2.6965625286102295,
"learning_rate": 9.921920294909629e-06,
"loss": 0.1242,
"mean_token_accuracy": 0.9750074744224548,
"step": 72
},
{
"epoch": 0.31196581196581197,
"grad_norm": 2.699901819229126,
"learning_rate": 9.915568986841452e-06,
"loss": 0.1281,
"mean_token_accuracy": 0.9753836989402771,
"step": 73
},
{
"epoch": 0.3162393162393162,
"grad_norm": 2.854963779449463,
"learning_rate": 9.908971800467021e-06,
"loss": 0.125,
"mean_token_accuracy": 0.9752618074417114,
"step": 74
},
{
"epoch": 0.32051282051282054,
"grad_norm": 2.7737538814544678,
"learning_rate": 9.902129103146697e-06,
"loss": 0.1298,
"mean_token_accuracy": 0.9693613052368164,
"step": 75
},
{
"epoch": 0.3247863247863248,
"grad_norm": 2.7642276287078857,
"learning_rate": 9.895041275911972e-06,
"loss": 0.1291,
"mean_token_accuracy": 0.9729389548301697,
"step": 76
},
{
"epoch": 0.32905982905982906,
"grad_norm": 3.0446126461029053,
"learning_rate": 9.887708713444242e-06,
"loss": 0.134,
"mean_token_accuracy": 0.9701337218284607,
"step": 77
},
{
"epoch": 0.3333333333333333,
"grad_norm": 2.538485288619995,
"learning_rate": 9.88013182405285e-06,
"loss": 0.1241,
"mean_token_accuracy": 0.9748736023902893,
"step": 78
},
{
"epoch": 0.33760683760683763,
"grad_norm": 2.8316211700439453,
"learning_rate": 9.872311029652322e-06,
"loss": 0.1236,
"mean_token_accuracy": 0.9711246490478516,
"step": 79
},
{
"epoch": 0.3418803418803419,
"grad_norm": 2.848421335220337,
"learning_rate": 9.864246765738901e-06,
"loss": 0.1363,
"mean_token_accuracy": 0.9725619554519653,
"step": 80
},
{
"epoch": 0.34615384615384615,
"grad_norm": 2.8594236373901367,
"learning_rate": 9.855939481366276e-06,
"loss": 0.1327,
"mean_token_accuracy": 0.9708970785140991,
"step": 81
},
{
"epoch": 0.3504273504273504,
"grad_norm": 2.9276626110076904,
"learning_rate": 9.847389639120585e-06,
"loss": 0.1278,
"mean_token_accuracy": 0.969779372215271,
"step": 82
},
{
"epoch": 0.3547008547008547,
"grad_norm": 3.02407169342041,
"learning_rate": 9.838597715094661e-06,
"loss": 0.1291,
"mean_token_accuracy": 0.972348690032959,
"step": 83
},
{
"epoch": 0.358974358974359,
"grad_norm": 3.002563953399658,
"learning_rate": 9.82956419886151e-06,
"loss": 0.1289,
"mean_token_accuracy": 0.9705927968025208,
"step": 84
},
{
"epoch": 0.36324786324786323,
"grad_norm": 3.177736520767212,
"learning_rate": 9.820289593447053e-06,
"loss": 0.1258,
"mean_token_accuracy": 0.9714928865432739,
"step": 85
},
{
"epoch": 0.36752136752136755,
"grad_norm": 3.0293781757354736,
"learning_rate": 9.810774415302124e-06,
"loss": 0.1258,
"mean_token_accuracy": 0.9701533913612366,
"step": 86
},
{
"epoch": 0.3717948717948718,
"grad_norm": 3.531522512435913,
"learning_rate": 9.801019194273702e-06,
"loss": 0.1368,
"mean_token_accuracy": 0.9657713174819946,
"step": 87
},
{
"epoch": 0.37606837606837606,
"grad_norm": 2.783475160598755,
"learning_rate": 9.791024473575405e-06,
"loss": 0.1265,
"mean_token_accuracy": 0.9692285060882568,
"step": 88
},
{
"epoch": 0.3803418803418803,
"grad_norm": 3.15690279006958,
"learning_rate": 9.780790809757254e-06,
"loss": 0.12,
"mean_token_accuracy": 0.9689655303955078,
"step": 89
},
{
"epoch": 0.38461538461538464,
"grad_norm": 3.449833393096924,
"learning_rate": 9.770318772674669e-06,
"loss": 0.1304,
"mean_token_accuracy": 0.9709726572036743,
"step": 90
},
{
"epoch": 0.3888888888888889,
"grad_norm": 2.9852263927459717,
"learning_rate": 9.759608945456745e-06,
"loss": 0.1282,
"mean_token_accuracy": 0.9665982127189636,
"step": 91
},
{
"epoch": 0.39316239316239315,
"grad_norm": 2.830143690109253,
"learning_rate": 9.748661924473777e-06,
"loss": 0.122,
"mean_token_accuracy": 0.9698586463928223,
"step": 92
},
{
"epoch": 0.3974358974358974,
"grad_norm": 3.019658327102661,
"learning_rate": 9.73747831930405e-06,
"loss": 0.1192,
"mean_token_accuracy": 0.9726413488388062,
"step": 93
},
{
"epoch": 0.4017094017094017,
"grad_norm": 3.0762267112731934,
"learning_rate": 9.726058752699898e-06,
"loss": 0.1259,
"mean_token_accuracy": 0.96836256980896,
"step": 94
},
{
"epoch": 0.405982905982906,
"grad_norm": 2.900409698486328,
"learning_rate": 9.714403860553028e-06,
"loss": 0.1286,
"mean_token_accuracy": 0.968357503414154,
"step": 95
},
{
"epoch": 0.41025641025641024,
"grad_norm": 2.963927984237671,
"learning_rate": 9.70251429185911e-06,
"loss": 0.1371,
"mean_token_accuracy": 0.9652521014213562,
"step": 96
},
{
"epoch": 0.41452991452991456,
"grad_norm": 2.7278425693511963,
"learning_rate": 9.690390708681627e-06,
"loss": 0.1223,
"mean_token_accuracy": 0.9720891118049622,
"step": 97
},
{
"epoch": 0.4188034188034188,
"grad_norm": 2.7922236919403076,
"learning_rate": 9.67803378611503e-06,
"loss": 0.1152,
"mean_token_accuracy": 0.9715592265129089,
"step": 98
},
{
"epoch": 0.4230769230769231,
"grad_norm": 3.1420092582702637,
"learning_rate": 9.665444212247127e-06,
"loss": 0.1223,
"mean_token_accuracy": 0.969136655330658,
"step": 99
},
{
"epoch": 0.42735042735042733,
"grad_norm": 2.8342413902282715,
"learning_rate": 9.652622688120776e-06,
"loss": 0.1237,
"mean_token_accuracy": 0.9718309640884399,
"step": 100
},
{
"epoch": 0.43162393162393164,
"grad_norm": 3.100160598754883,
"learning_rate": 9.639569927694845e-06,
"loss": 0.1382,
"mean_token_accuracy": 0.9662887454032898,
"step": 101
},
{
"epoch": 0.4358974358974359,
"grad_norm": 3.214078664779663,
"learning_rate": 9.626286657804455e-06,
"loss": 0.129,
"mean_token_accuracy": 0.9656282067298889,
"step": 102
},
{
"epoch": 0.44017094017094016,
"grad_norm": 2.7865896224975586,
"learning_rate": 9.61277361812051e-06,
"loss": 0.1184,
"mean_token_accuracy": 0.970238983631134,
"step": 103
},
{
"epoch": 0.4444444444444444,
"grad_norm": 3.0797605514526367,
"learning_rate": 9.599031561108506e-06,
"loss": 0.1216,
"mean_token_accuracy": 0.9684609770774841,
"step": 104
},
{
"epoch": 0.44871794871794873,
"grad_norm": 2.7707583904266357,
"learning_rate": 9.585061251986634e-06,
"loss": 0.1259,
"mean_token_accuracy": 0.9682539701461792,
"step": 105
},
{
"epoch": 0.452991452991453,
"grad_norm": 3.061985492706299,
"learning_rate": 9.570863468683161e-06,
"loss": 0.1196,
"mean_token_accuracy": 0.9703070521354675,
"step": 106
},
{
"epoch": 0.45726495726495725,
"grad_norm": 3.291062355041504,
"learning_rate": 9.556439001793125e-06,
"loss": 0.1318,
"mean_token_accuracy": 0.965293824672699,
"step": 107
},
{
"epoch": 0.46153846153846156,
"grad_norm": 2.679776430130005,
"learning_rate": 9.541788654534296e-06,
"loss": 0.1209,
"mean_token_accuracy": 0.9700332880020142,
"step": 108
},
{
"epoch": 0.4658119658119658,
"grad_norm": 3.1325292587280273,
"learning_rate": 9.526913242702459e-06,
"loss": 0.1344,
"mean_token_accuracy": 0.9673547744750977,
"step": 109
},
{
"epoch": 0.4700854700854701,
"grad_norm": 2.857881784439087,
"learning_rate": 9.511813594625987e-06,
"loss": 0.1291,
"mean_token_accuracy": 0.9659602642059326,
"step": 110
},
{
"epoch": 0.47435897435897434,
"grad_norm": 2.7918944358825684,
"learning_rate": 9.49649055111971e-06,
"loss": 0.1252,
"mean_token_accuracy": 0.9653458595275879,
"step": 111
},
{
"epoch": 0.47863247863247865,
"grad_norm": 2.8813693523406982,
"learning_rate": 9.480944965438099e-06,
"loss": 0.128,
"mean_token_accuracy": 0.962127685546875,
"step": 112
},
{
"epoch": 0.4829059829059829,
"grad_norm": 2.667649984359741,
"learning_rate": 9.465177703227755e-06,
"loss": 0.1214,
"mean_token_accuracy": 0.9691765904426575,
"step": 113
},
{
"epoch": 0.48717948717948717,
"grad_norm": 2.925039529800415,
"learning_rate": 9.449189642479203e-06,
"loss": 0.1263,
"mean_token_accuracy": 0.9643491506576538,
"step": 114
},
{
"epoch": 0.49145299145299143,
"grad_norm": 2.7949769496917725,
"learning_rate": 9.432981673477998e-06,
"loss": 0.1241,
"mean_token_accuracy": 0.9685289263725281,
"step": 115
},
{
"epoch": 0.49572649572649574,
"grad_norm": 3.3986713886260986,
"learning_rate": 9.416554698755154e-06,
"loss": 0.1262,
"mean_token_accuracy": 0.9689081907272339,
"step": 116
},
{
"epoch": 0.5,
"grad_norm": 3.4772071838378906,
"learning_rate": 9.399909633036896e-06,
"loss": 0.1284,
"mean_token_accuracy": 0.9670020937919617,
"step": 117
},
{
"epoch": 0.5042735042735043,
"grad_norm": 3.6029865741729736,
"learning_rate": 9.383047403193704e-06,
"loss": 0.1313,
"mean_token_accuracy": 0.9666910171508789,
"step": 118
},
{
"epoch": 0.5085470085470085,
"grad_norm": 3.636085033416748,
"learning_rate": 9.365968948188717e-06,
"loss": 0.1343,
"mean_token_accuracy": 0.9655527472496033,
"step": 119
},
{
"epoch": 0.5128205128205128,
"grad_norm": 3.53393816947937,
"learning_rate": 9.348675219025443e-06,
"loss": 0.1426,
"mean_token_accuracy": 0.961666464805603,
"step": 120
},
{
"epoch": 0.5170940170940171,
"grad_norm": 2.9354088306427,
"learning_rate": 9.331167178694798e-06,
"loss": 0.1188,
"mean_token_accuracy": 0.9681200981140137,
"step": 121
},
{
"epoch": 0.5213675213675214,
"grad_norm": 3.025766611099243,
"learning_rate": 9.313445802121493e-06,
"loss": 0.1242,
"mean_token_accuracy": 0.9681670069694519,
"step": 122
},
{
"epoch": 0.5256410256410257,
"grad_norm": 3.3338441848754883,
"learning_rate": 9.295512076109734e-06,
"loss": 0.1342,
"mean_token_accuracy": 0.966331958770752,
"step": 123
},
{
"epoch": 0.5299145299145299,
"grad_norm": 2.8079636096954346,
"learning_rate": 9.277366999288279e-06,
"loss": 0.1212,
"mean_token_accuracy": 0.9686406254768372,
"step": 124
},
{
"epoch": 0.5341880341880342,
"grad_norm": 3.0100514888763428,
"learning_rate": 9.25901158205483e-06,
"loss": 0.126,
"mean_token_accuracy": 0.967146635055542,
"step": 125
},
{
"epoch": 0.5384615384615384,
"grad_norm": 2.392273426055908,
"learning_rate": 9.240446846519769e-06,
"loss": 0.1141,
"mean_token_accuracy": 0.9710744023323059,
"step": 126
},
{
"epoch": 0.5427350427350427,
"grad_norm": 2.8574609756469727,
"learning_rate": 9.22167382644924e-06,
"loss": 0.1391,
"mean_token_accuracy": 0.9653558135032654,
"step": 127
},
{
"epoch": 0.5470085470085471,
"grad_norm": 2.432234525680542,
"learning_rate": 9.202693567207588e-06,
"loss": 0.1157,
"mean_token_accuracy": 0.9682042002677917,
"step": 128
},
{
"epoch": 0.5512820512820513,
"grad_norm": 2.939648151397705,
"learning_rate": 9.183507125699144e-06,
"loss": 0.1234,
"mean_token_accuracy": 0.9685525298118591,
"step": 129
},
{
"epoch": 0.5555555555555556,
"grad_norm": 2.6586060523986816,
"learning_rate": 9.16411557030938e-06,
"loss": 0.1148,
"mean_token_accuracy": 0.9667240381240845,
"step": 130
},
{
"epoch": 0.5598290598290598,
"grad_norm": 2.9206671714782715,
"learning_rate": 9.144519980845405e-06,
"loss": 0.1218,
"mean_token_accuracy": 0.9657745361328125,
"step": 131
},
{
"epoch": 0.5641025641025641,
"grad_norm": 3.10524320602417,
"learning_rate": 9.124721448475848e-06,
"loss": 0.1258,
"mean_token_accuracy": 0.9668874144554138,
"step": 132
},
{
"epoch": 0.5683760683760684,
"grad_norm": 2.9490950107574463,
"learning_rate": 9.104721075670087e-06,
"loss": 0.1178,
"mean_token_accuracy": 0.9691486358642578,
"step": 133
},
{
"epoch": 0.5726495726495726,
"grad_norm": 3.5055556297302246,
"learning_rate": 9.084519976136867e-06,
"loss": 0.1324,
"mean_token_accuracy": 0.967658281326294,
"step": 134
},
{
"epoch": 0.5769230769230769,
"grad_norm": 3.0520718097686768,
"learning_rate": 9.06411927476228e-06,
"loss": 0.1291,
"mean_token_accuracy": 0.9664340019226074,
"step": 135
},
{
"epoch": 0.5811965811965812,
"grad_norm": 3.246432065963745,
"learning_rate": 9.043520107547123e-06,
"loss": 0.1287,
"mean_token_accuracy": 0.9649555683135986,
"step": 136
},
{
"epoch": 0.5854700854700855,
"grad_norm": 2.822936773300171,
"learning_rate": 9.02272362154365e-06,
"loss": 0.1167,
"mean_token_accuracy": 0.9701053500175476,
"step": 137
},
{
"epoch": 0.5897435897435898,
"grad_norm": 3.3216702938079834,
"learning_rate": 9.00173097479169e-06,
"loss": 0.137,
"mean_token_accuracy": 0.9654337167739868,
"step": 138
},
{
"epoch": 0.594017094017094,
"grad_norm": 3.212130546569824,
"learning_rate": 8.980543336254161e-06,
"loss": 0.1357,
"mean_token_accuracy": 0.9622559547424316,
"step": 139
},
{
"epoch": 0.5982905982905983,
"grad_norm": 2.845677375793457,
"learning_rate": 8.959161885751991e-06,
"loss": 0.1145,
"mean_token_accuracy": 0.9693924784660339,
"step": 140
},
{
"epoch": 0.6025641025641025,
"grad_norm": 3.0614523887634277,
"learning_rate": 8.937587813898402e-06,
"loss": 0.1305,
"mean_token_accuracy": 0.965413510799408,
"step": 141
},
{
"epoch": 0.6068376068376068,
"grad_norm": 2.913874387741089,
"learning_rate": 8.915822322032628e-06,
"loss": 0.1299,
"mean_token_accuracy": 0.9669148325920105,
"step": 142
},
{
"epoch": 0.6111111111111112,
"grad_norm": 2.596005916595459,
"learning_rate": 8.893866622153006e-06,
"loss": 0.1227,
"mean_token_accuracy": 0.9691241383552551,
"step": 143
},
{
"epoch": 0.6153846153846154,
"grad_norm": 2.7618889808654785,
"learning_rate": 8.87172193684949e-06,
"loss": 0.1212,
"mean_token_accuracy": 0.9670799374580383,
"step": 144
},
{
"epoch": 0.6196581196581197,
"grad_norm": 3.188061237335205,
"learning_rate": 8.84938949923558e-06,
"loss": 0.1196,
"mean_token_accuracy": 0.9661862254142761,
"step": 145
},
{
"epoch": 0.6239316239316239,
"grad_norm": 2.9043335914611816,
"learning_rate": 8.826870552879646e-06,
"loss": 0.1158,
"mean_token_accuracy": 0.9694948792457581,
"step": 146
},
{
"epoch": 0.6282051282051282,
"grad_norm": 2.6360220909118652,
"learning_rate": 8.80416635173569e-06,
"loss": 0.1256,
"mean_token_accuracy": 0.9667301774024963,
"step": 147
},
{
"epoch": 0.6324786324786325,
"grad_norm": 2.8086981773376465,
"learning_rate": 8.78127816007351e-06,
"loss": 0.1336,
"mean_token_accuracy": 0.9666719436645508,
"step": 148
},
{
"epoch": 0.6367521367521367,
"grad_norm": 3.122762441635132,
"learning_rate": 8.758207252408306e-06,
"loss": 0.1195,
"mean_token_accuracy": 0.9682682156562805,
"step": 149
},
{
"epoch": 0.6410256410256411,
"grad_norm": 2.828507423400879,
"learning_rate": 8.734954913429715e-06,
"loss": 0.1206,
"mean_token_accuracy": 0.9686484932899475,
"step": 150
},
{
"epoch": 0.6452991452991453,
"grad_norm": 2.836404800415039,
"learning_rate": 8.71152243793026e-06,
"loss": 0.1208,
"mean_token_accuracy": 0.9689119458198547,
"step": 151
},
{
"epoch": 0.6495726495726496,
"grad_norm": 2.7368805408477783,
"learning_rate": 8.687911130733266e-06,
"loss": 0.1189,
"mean_token_accuracy": 0.967898964881897,
"step": 152
},
{
"epoch": 0.6538461538461539,
"grad_norm": 2.948840618133545,
"learning_rate": 8.664122306620185e-06,
"loss": 0.1229,
"mean_token_accuracy": 0.9695394039154053,
"step": 153
},
{
"epoch": 0.6581196581196581,
"grad_norm": 2.9949734210968018,
"learning_rate": 8.640157290257398e-06,
"loss": 0.1181,
"mean_token_accuracy": 0.9687728881835938,
"step": 154
},
{
"epoch": 0.6623931623931624,
"grad_norm": 2.728360652923584,
"learning_rate": 8.61601741612244e-06,
"loss": 0.1218,
"mean_token_accuracy": 0.969561755657196,
"step": 155
},
{
"epoch": 0.6666666666666666,
"grad_norm": 3.2591919898986816,
"learning_rate": 8.591704028429704e-06,
"loss": 0.1242,
"mean_token_accuracy": 0.9672890305519104,
"step": 156
},
{
"epoch": 0.6709401709401709,
"grad_norm": 2.9548633098602295,
"learning_rate": 8.567218481055575e-06,
"loss": 0.129,
"mean_token_accuracy": 0.9685006141662598,
"step": 157
},
{
"epoch": 0.6752136752136753,
"grad_norm": 3.126328229904175,
"learning_rate": 8.542562137463049e-06,
"loss": 0.1297,
"mean_token_accuracy": 0.9670224785804749,
"step": 158
},
{
"epoch": 0.6794871794871795,
"grad_norm": 3.116311550140381,
"learning_rate": 8.517736370625803e-06,
"loss": 0.1282,
"mean_token_accuracy": 0.9657440185546875,
"step": 159
},
{
"epoch": 0.6837606837606838,
"grad_norm": 3.0345067977905273,
"learning_rate": 8.492742562951752e-06,
"loss": 0.1193,
"mean_token_accuracy": 0.9700401425361633,
"step": 160
},
{
"epoch": 0.688034188034188,
"grad_norm": 2.7307636737823486,
"learning_rate": 8.467582106206059e-06,
"loss": 0.1163,
"mean_token_accuracy": 0.9677555561065674,
"step": 161
},
{
"epoch": 0.6923076923076923,
"grad_norm": 3.3693504333496094,
"learning_rate": 8.44225640143364e-06,
"loss": 0.1389,
"mean_token_accuracy": 0.9626268744468689,
"step": 162
},
{
"epoch": 0.6965811965811965,
"grad_norm": 3.258207321166992,
"learning_rate": 8.416766858881155e-06,
"loss": 0.13,
"mean_token_accuracy": 0.9651078581809998,
"step": 163
},
{
"epoch": 0.7008547008547008,
"grad_norm": 2.904632806777954,
"learning_rate": 8.391114897918463e-06,
"loss": 0.1252,
"mean_token_accuracy": 0.9667837023735046,
"step": 164
},
{
"epoch": 0.7051282051282052,
"grad_norm": 3.072021245956421,
"learning_rate": 8.365301946959601e-06,
"loss": 0.1195,
"mean_token_accuracy": 0.9670177102088928,
"step": 165
},
{
"epoch": 0.7094017094017094,
"grad_norm": 2.99420166015625,
"learning_rate": 8.339329443383234e-06,
"loss": 0.1198,
"mean_token_accuracy": 0.9678386449813843,
"step": 166
},
{
"epoch": 0.7136752136752137,
"grad_norm": 3.074570417404175,
"learning_rate": 8.313198833452622e-06,
"loss": 0.1222,
"mean_token_accuracy": 0.968068540096283,
"step": 167
},
{
"epoch": 0.717948717948718,
"grad_norm": 3.298757553100586,
"learning_rate": 8.28691157223508e-06,
"loss": 0.1392,
"mean_token_accuracy": 0.9620700478553772,
"step": 168
},
{
"epoch": 0.7222222222222222,
"grad_norm": 2.632319688796997,
"learning_rate": 8.260469123520955e-06,
"loss": 0.1193,
"mean_token_accuracy": 0.9686574339866638,
"step": 169
},
{
"epoch": 0.7264957264957265,
"grad_norm": 2.8462018966674805,
"learning_rate": 8.233872959742117e-06,
"loss": 0.1345,
"mean_token_accuracy": 0.9627861380577087,
"step": 170
},
{
"epoch": 0.7307692307692307,
"grad_norm": 2.8106915950775146,
"learning_rate": 8.207124561889967e-06,
"loss": 0.1221,
"mean_token_accuracy": 0.9662142395973206,
"step": 171
},
{
"epoch": 0.7350427350427351,
"grad_norm": 2.946042776107788,
"learning_rate": 8.180225419432974e-06,
"loss": 0.1336,
"mean_token_accuracy": 0.9638904929161072,
"step": 172
},
{
"epoch": 0.7393162393162394,
"grad_norm": 2.8250818252563477,
"learning_rate": 8.15317703023372e-06,
"loss": 0.1233,
"mean_token_accuracy": 0.968315839767456,
"step": 173
},
{
"epoch": 0.7435897435897436,
"grad_norm": 2.757537364959717,
"learning_rate": 8.125980900465512e-06,
"loss": 0.1241,
"mean_token_accuracy": 0.9649458527565002,
"step": 174
},
{
"epoch": 0.7478632478632479,
"grad_norm": 3.01009464263916,
"learning_rate": 8.098638544528493e-06,
"loss": 0.1282,
"mean_token_accuracy": 0.9654348492622375,
"step": 175
},
{
"epoch": 0.7521367521367521,
"grad_norm": 2.971163034439087,
"learning_rate": 8.07115148496533e-06,
"loss": 0.1219,
"mean_token_accuracy": 0.9695356488227844,
"step": 176
},
{
"epoch": 0.7564102564102564,
"grad_norm": 2.752361536026001,
"learning_rate": 8.043521252376419e-06,
"loss": 0.1115,
"mean_token_accuracy": 0.9711832404136658,
"step": 177
},
{
"epoch": 0.7606837606837606,
"grad_norm": 3.2744572162628174,
"learning_rate": 8.015749385334662e-06,
"loss": 0.1255,
"mean_token_accuracy": 0.9651964902877808,
"step": 178
},
{
"epoch": 0.7649572649572649,
"grad_norm": 3.198021411895752,
"learning_rate": 7.987837430299793e-06,
"loss": 0.1128,
"mean_token_accuracy": 0.971238911151886,
"step": 179
},
{
"epoch": 0.7692307692307693,
"grad_norm": 3.0890040397644043,
"learning_rate": 7.959786941532257e-06,
"loss": 0.1158,
"mean_token_accuracy": 0.9695550203323364,
"step": 180
},
{
"epoch": 0.7735042735042735,
"grad_norm": 3.1784725189208984,
"learning_rate": 7.93159948100667e-06,
"loss": 0.1237,
"mean_token_accuracy": 0.9680773019790649,
"step": 181
},
{
"epoch": 0.7777777777777778,
"grad_norm": 2.876870632171631,
"learning_rate": 7.903276618324833e-06,
"loss": 0.122,
"mean_token_accuracy": 0.968846321105957,
"step": 182
},
{
"epoch": 0.782051282051282,
"grad_norm": 3.075652599334717,
"learning_rate": 7.874819930628346e-06,
"loss": 0.1288,
"mean_token_accuracy": 0.9640507698059082,
"step": 183
},
{
"epoch": 0.7863247863247863,
"grad_norm": 3.393401861190796,
"learning_rate": 7.846231002510763e-06,
"loss": 0.1298,
"mean_token_accuracy": 0.9649748802185059,
"step": 184
},
{
"epoch": 0.7905982905982906,
"grad_norm": 2.9897360801696777,
"learning_rate": 7.817511425929368e-06,
"loss": 0.1195,
"mean_token_accuracy": 0.9691243171691895,
"step": 185
},
{
"epoch": 0.7948717948717948,
"grad_norm": 3.0758893489837646,
"learning_rate": 7.788662800116534e-06,
"loss": 0.1259,
"mean_token_accuracy": 0.9677749872207642,
"step": 186
},
{
"epoch": 0.7991452991452992,
"grad_norm": 2.974975824356079,
"learning_rate": 7.759686731490655e-06,
"loss": 0.1277,
"mean_token_accuracy": 0.9659624695777893,
"step": 187
},
{
"epoch": 0.8034188034188035,
"grad_norm": 2.820207357406616,
"learning_rate": 7.730584833566704e-06,
"loss": 0.1181,
"mean_token_accuracy": 0.9691581130027771,
"step": 188
},
{
"epoch": 0.8076923076923077,
"grad_norm": 2.644383668899536,
"learning_rate": 7.701358726866384e-06,
"loss": 0.1282,
"mean_token_accuracy": 0.9655576944351196,
"step": 189
},
{
"epoch": 0.811965811965812,
"grad_norm": 2.8675873279571533,
"learning_rate": 7.672010038827888e-06,
"loss": 0.1177,
"mean_token_accuracy": 0.9688689708709717,
"step": 190
},
{
"epoch": 0.8162393162393162,
"grad_norm": 2.505047082901001,
"learning_rate": 7.642540403715279e-06,
"loss": 0.1131,
"mean_token_accuracy": 0.9703559875488281,
"step": 191
},
{
"epoch": 0.8205128205128205,
"grad_norm": 2.9962170124053955,
"learning_rate": 7.6129514625274806e-06,
"loss": 0.1191,
"mean_token_accuracy": 0.9711988568305969,
"step": 192
},
{
"epoch": 0.8247863247863247,
"grad_norm": 2.7384965419769287,
"learning_rate": 7.583244862906906e-06,
"loss": 0.12,
"mean_token_accuracy": 0.9679151773452759,
"step": 193
},
{
"epoch": 0.8290598290598291,
"grad_norm": 2.7203431129455566,
"learning_rate": 7.553422259047712e-06,
"loss": 0.1118,
"mean_token_accuracy": 0.9700168967247009,
"step": 194
},
{
"epoch": 0.8333333333333334,
"grad_norm": 2.715108871459961,
"learning_rate": 7.523485311603672e-06,
"loss": 0.1168,
"mean_token_accuracy": 0.9702435731887817,
"step": 195
},
{
"epoch": 0.8376068376068376,
"grad_norm": 2.69944429397583,
"learning_rate": 7.493435687595725e-06,
"loss": 0.1101,
"mean_token_accuracy": 0.9719541668891907,
"step": 196
},
{
"epoch": 0.8418803418803419,
"grad_norm": 2.8891189098358154,
"learning_rate": 7.463275060319127e-06,
"loss": 0.1156,
"mean_token_accuracy": 0.9676284193992615,
"step": 197
},
{
"epoch": 0.8461538461538461,
"grad_norm": 2.8775413036346436,
"learning_rate": 7.433005109250291e-06,
"loss": 0.1176,
"mean_token_accuracy": 0.9696885943412781,
"step": 198
},
{
"epoch": 0.8504273504273504,
"grad_norm": 2.7444727420806885,
"learning_rate": 7.402627519953251e-06,
"loss": 0.1123,
"mean_token_accuracy": 0.9693893194198608,
"step": 199
},
{
"epoch": 0.8547008547008547,
"grad_norm": 2.682316780090332,
"learning_rate": 7.3721439839858245e-06,
"loss": 0.1166,
"mean_token_accuracy": 0.9664866328239441,
"step": 200
},
{
"epoch": 0.8589743589743589,
"grad_norm": 3.2439863681793213,
"learning_rate": 7.341556198805392e-06,
"loss": 0.124,
"mean_token_accuracy": 0.9668755531311035,
"step": 201
},
{
"epoch": 0.8632478632478633,
"grad_norm": 3.0606460571289062,
"learning_rate": 7.310865867674397e-06,
"loss": 0.1163,
"mean_token_accuracy": 0.96805340051651,
"step": 202
},
{
"epoch": 0.8675213675213675,
"grad_norm": 2.628513813018799,
"learning_rate": 7.28007469956549e-06,
"loss": 0.112,
"mean_token_accuracy": 0.9691203832626343,
"step": 203
},
{
"epoch": 0.8717948717948718,
"grad_norm": 3.0109522342681885,
"learning_rate": 7.249184409066368e-06,
"loss": 0.1193,
"mean_token_accuracy": 0.9659459590911865,
"step": 204
},
{
"epoch": 0.8760683760683761,
"grad_norm": 2.7769620418548584,
"learning_rate": 7.218196716284302e-06,
"loss": 0.1188,
"mean_token_accuracy": 0.9707610607147217,
"step": 205
},
{
"epoch": 0.8803418803418803,
"grad_norm": 3.0632548332214355,
"learning_rate": 7.187113346750345e-06,
"loss": 0.12,
"mean_token_accuracy": 0.9674087166786194,
"step": 206
},
{
"epoch": 0.8846153846153846,
"grad_norm": 2.789855718612671,
"learning_rate": 7.155936031323254e-06,
"loss": 0.1106,
"mean_token_accuracy": 0.9703866243362427,
"step": 207
},
{
"epoch": 0.8888888888888888,
"grad_norm": 3.0573058128356934,
"learning_rate": 7.124666506093112e-06,
"loss": 0.122,
"mean_token_accuracy": 0.9684903621673584,
"step": 208
},
{
"epoch": 0.8931623931623932,
"grad_norm": 3.0812251567840576,
"learning_rate": 7.093306512284642e-06,
"loss": 0.1174,
"mean_token_accuracy": 0.9704350233078003,
"step": 209
},
{
"epoch": 0.8974358974358975,
"grad_norm": 2.5385091304779053,
"learning_rate": 7.061857796160261e-06,
"loss": 0.1043,
"mean_token_accuracy": 0.9734078645706177,
"step": 210
},
{
"epoch": 0.9017094017094017,
"grad_norm": 3.305619239807129,
"learning_rate": 7.030322108922831e-06,
"loss": 0.1268,
"mean_token_accuracy": 0.9653949737548828,
"step": 211
},
{
"epoch": 0.905982905982906,
"grad_norm": 2.835049629211426,
"learning_rate": 6.998701206618153e-06,
"loss": 0.1138,
"mean_token_accuracy": 0.9680963158607483,
"step": 212
},
{
"epoch": 0.9102564102564102,
"grad_norm": 2.810410499572754,
"learning_rate": 6.966996850037168e-06,
"loss": 0.1122,
"mean_token_accuracy": 0.9715486764907837,
"step": 213
},
{
"epoch": 0.9145299145299145,
"grad_norm": 2.569246292114258,
"learning_rate": 6.9352108046179325e-06,
"loss": 0.1085,
"mean_token_accuracy": 0.9726495742797852,
"step": 214
},
{
"epoch": 0.9188034188034188,
"grad_norm": 2.7993946075439453,
"learning_rate": 6.903344840347286e-06,
"loss": 0.1183,
"mean_token_accuracy": 0.9697819352149963,
"step": 215
},
{
"epoch": 0.9230769230769231,
"grad_norm": 2.7210962772369385,
"learning_rate": 6.871400731662303e-06,
"loss": 0.1186,
"mean_token_accuracy": 0.9677559733390808,
"step": 216
},
{
"epoch": 0.9273504273504274,
"grad_norm": 2.9043362140655518,
"learning_rate": 6.839380257351486e-06,
"loss": 0.1093,
"mean_token_accuracy": 0.9705744981765747,
"step": 217
},
{
"epoch": 0.9316239316239316,
"grad_norm": 2.5438663959503174,
"learning_rate": 6.8072852004557085e-06,
"loss": 0.1109,
"mean_token_accuracy": 0.9724981188774109,
"step": 218
},
{
"epoch": 0.9358974358974359,
"grad_norm": 2.694098711013794,
"learning_rate": 6.775117348168934e-06,
"loss": 0.1156,
"mean_token_accuracy": 0.9705842137336731,
"step": 219
},
{
"epoch": 0.9401709401709402,
"grad_norm": 2.7085683345794678,
"learning_rate": 6.742878491738691e-06,
"loss": 0.1058,
"mean_token_accuracy": 0.9714870452880859,
"step": 220
},
{
"epoch": 0.9444444444444444,
"grad_norm": 3.007352828979492,
"learning_rate": 6.71057042636633e-06,
"loss": 0.1179,
"mean_token_accuracy": 0.9679550528526306,
"step": 221
},
{
"epoch": 0.9487179487179487,
"grad_norm": 2.7090036869049072,
"learning_rate": 6.678194951107061e-06,
"loss": 0.1093,
"mean_token_accuracy": 0.9711792469024658,
"step": 222
},
{
"epoch": 0.9529914529914529,
"grad_norm": 2.7703821659088135,
"learning_rate": 6.645753868769773e-06,
"loss": 0.119,
"mean_token_accuracy": 0.9694185256958008,
"step": 223
},
{
"epoch": 0.9572649572649573,
"grad_norm": 2.726327657699585,
"learning_rate": 6.61324898581665e-06,
"loss": 0.1235,
"mean_token_accuracy": 0.9704574346542358,
"step": 224
},
{
"epoch": 0.9615384615384616,
"grad_norm": 2.35150408744812,
"learning_rate": 6.580682112262566e-06,
"loss": 0.1003,
"mean_token_accuracy": 0.9741271138191223,
"step": 225
},
{
"epoch": 0.9658119658119658,
"grad_norm": 2.970930814743042,
"learning_rate": 6.5480550615743124e-06,
"loss": 0.1207,
"mean_token_accuracy": 0.967167854309082,
"step": 226
},
{
"epoch": 0.9700854700854701,
"grad_norm": 2.907005548477173,
"learning_rate": 6.515369650569603e-06,
"loss": 0.1176,
"mean_token_accuracy": 0.9715429544448853,
"step": 227
},
{
"epoch": 0.9743589743589743,
"grad_norm": 2.661025047302246,
"learning_rate": 6.4826276993159155e-06,
"loss": 0.1154,
"mean_token_accuracy": 0.9683908224105835,
"step": 228
},
{
"epoch": 0.9786324786324786,
"grad_norm": 2.9590840339660645,
"learning_rate": 6.449831031029134e-06,
"loss": 0.1145,
"mean_token_accuracy": 0.9706094264984131,
"step": 229
},
{
"epoch": 0.9829059829059829,
"grad_norm": 3.047009229660034,
"learning_rate": 6.416981471972026e-06,
"loss": 0.1144,
"mean_token_accuracy": 0.969413161277771,
"step": 230
},
{
"epoch": 0.9871794871794872,
"grad_norm": 2.5724730491638184,
"learning_rate": 6.384080851352553e-06,
"loss": 0.1014,
"mean_token_accuracy": 0.9736120104789734,
"step": 231
},
{
"epoch": 0.9914529914529915,
"grad_norm": 2.7814061641693115,
"learning_rate": 6.351131001222012e-06,
"loss": 0.1112,
"mean_token_accuracy": 0.9722570776939392,
"step": 232
},
{
"epoch": 0.9957264957264957,
"grad_norm": 3.269582509994507,
"learning_rate": 6.318133756373009e-06,
"loss": 0.122,
"mean_token_accuracy": 0.9661454558372498,
"step": 233
},
{
"epoch": 1.0,
"grad_norm": 2.7873144149780273,
"learning_rate": 6.2850909542373e-06,
"loss": 0.1111,
"mean_token_accuracy": 0.9729042053222656,
"step": 234
},
{
"epoch": 1.0042735042735043,
"grad_norm": 3.190188407897949,
"learning_rate": 6.2520044347834684e-06,
"loss": 0.0988,
"mean_token_accuracy": 0.9759896993637085,
"step": 235
},
{
"epoch": 1.0085470085470085,
"grad_norm": 3.600405693054199,
"learning_rate": 6.218876040414476e-06,
"loss": 0.1298,
"mean_token_accuracy": 0.9713731408119202,
"step": 236
},
{
"epoch": 1.0128205128205128,
"grad_norm": 2.5787041187286377,
"learning_rate": 6.185707615865058e-06,
"loss": 0.0896,
"mean_token_accuracy": 0.9797489643096924,
"step": 237
},
{
"epoch": 1.017094017094017,
"grad_norm": 3.417285203933716,
"learning_rate": 6.152501008099009e-06,
"loss": 0.1086,
"mean_token_accuracy": 0.9717546105384827,
"step": 238
},
{
"epoch": 1.0213675213675213,
"grad_norm": 3.2675163745880127,
"learning_rate": 6.119258066206333e-06,
"loss": 0.1084,
"mean_token_accuracy": 0.9739053249359131,
"step": 239
},
{
"epoch": 1.0256410256410255,
"grad_norm": 3.1262497901916504,
"learning_rate": 6.085980641300278e-06,
"loss": 0.0979,
"mean_token_accuracy": 0.9745363593101501,
"step": 240
},
{
"epoch": 1.0299145299145298,
"grad_norm": 2.747432231903076,
"learning_rate": 6.052670586414255e-06,
"loss": 0.0991,
"mean_token_accuracy": 0.9759474396705627,
"step": 241
},
{
"epoch": 1.0341880341880343,
"grad_norm": 3.2090530395507812,
"learning_rate": 6.019329756398661e-06,
"loss": 0.1136,
"mean_token_accuracy": 0.975712776184082,
"step": 242
},
{
"epoch": 1.0384615384615385,
"grad_norm": 2.662107229232788,
"learning_rate": 5.9859600078175836e-06,
"loss": 0.0946,
"mean_token_accuracy": 0.9759296774864197,
"step": 243
},
{
"epoch": 1.0427350427350428,
"grad_norm": 2.873894214630127,
"learning_rate": 5.952563198845427e-06,
"loss": 0.1103,
"mean_token_accuracy": 0.9721987247467041,
"step": 244
},
{
"epoch": 1.047008547008547,
"grad_norm": 2.198923349380493,
"learning_rate": 5.919141189163431e-06,
"loss": 0.087,
"mean_token_accuracy": 0.9801637530326843,
"step": 245
},
{
"epoch": 1.0512820512820513,
"grad_norm": 2.7435288429260254,
"learning_rate": 5.885695839856129e-06,
"loss": 0.1029,
"mean_token_accuracy": 0.9745575189590454,
"step": 246
},
{
"epoch": 1.0555555555555556,
"grad_norm": 2.4154202938079834,
"learning_rate": 5.852229013307704e-06,
"loss": 0.1004,
"mean_token_accuracy": 0.9773362278938293,
"step": 247
},
{
"epoch": 1.0598290598290598,
"grad_norm": 1.7442923784255981,
"learning_rate": 5.818742573098283e-06,
"loss": 0.079,
"mean_token_accuracy": 0.9817251563072205,
"step": 248
},
{
"epoch": 1.064102564102564,
"grad_norm": 2.7456750869750977,
"learning_rate": 5.785238383900172e-06,
"loss": 0.0906,
"mean_token_accuracy": 0.9783667325973511,
"step": 249
},
{
"epoch": 1.0683760683760684,
"grad_norm": 3.0801470279693604,
"learning_rate": 5.75171831137402e-06,
"loss": 0.1113,
"mean_token_accuracy": 0.970004677772522,
"step": 250
},
{
"epoch": 1.0726495726495726,
"grad_norm": 1.9699081182479858,
"learning_rate": 5.7181842220649245e-06,
"loss": 0.0847,
"mean_token_accuracy": 0.9803862571716309,
"step": 251
},
{
"epoch": 1.0769230769230769,
"grad_norm": 2.0998027324676514,
"learning_rate": 5.6846379832985046e-06,
"loss": 0.0942,
"mean_token_accuracy": 0.9790985584259033,
"step": 252
},
{
"epoch": 1.0811965811965811,
"grad_norm": 2.5801784992218018,
"learning_rate": 5.651081463076911e-06,
"loss": 0.0972,
"mean_token_accuracy": 0.9760162234306335,
"step": 253
},
{
"epoch": 1.0854700854700854,
"grad_norm": 2.551215171813965,
"learning_rate": 5.617516529974812e-06,
"loss": 0.1051,
"mean_token_accuracy": 0.9762815833091736,
"step": 254
},
{
"epoch": 1.0897435897435896,
"grad_norm": 2.5487148761749268,
"learning_rate": 5.583945053035346e-06,
"loss": 0.0921,
"mean_token_accuracy": 0.9765682816505432,
"step": 255
},
{
"epoch": 1.0940170940170941,
"grad_norm": 2.5740602016448975,
"learning_rate": 5.550368901666031e-06,
"loss": 0.1004,
"mean_token_accuracy": 0.9755838513374329,
"step": 256
},
{
"epoch": 1.0982905982905984,
"grad_norm": 2.338292121887207,
"learning_rate": 5.5167899455346875e-06,
"loss": 0.087,
"mean_token_accuracy": 0.9767951369285583,
"step": 257
},
{
"epoch": 1.1025641025641026,
"grad_norm": 2.2318756580352783,
"learning_rate": 5.483210054465313e-06,
"loss": 0.0846,
"mean_token_accuracy": 0.9803149700164795,
"step": 258
},
{
"epoch": 1.106837606837607,
"grad_norm": 2.467461347579956,
"learning_rate": 5.449631098333971e-06,
"loss": 0.0988,
"mean_token_accuracy": 0.9754037857055664,
"step": 259
},
{
"epoch": 1.1111111111111112,
"grad_norm": 2.1578803062438965,
"learning_rate": 5.416054946964657e-06,
"loss": 0.0848,
"mean_token_accuracy": 0.9773491024971008,
"step": 260
},
{
"epoch": 1.1153846153846154,
"grad_norm": 2.831975221633911,
"learning_rate": 5.382483470025188e-06,
"loss": 0.1046,
"mean_token_accuracy": 0.976597785949707,
"step": 261
},
{
"epoch": 1.1196581196581197,
"grad_norm": 2.9957687854766846,
"learning_rate": 5.34891853692309e-06,
"loss": 0.1071,
"mean_token_accuracy": 0.9731133580207825,
"step": 262
},
{
"epoch": 1.123931623931624,
"grad_norm": 2.807539701461792,
"learning_rate": 5.315362016701496e-06,
"loss": 0.1042,
"mean_token_accuracy": 0.9745635390281677,
"step": 263
},
{
"epoch": 1.1282051282051282,
"grad_norm": 2.411320447921753,
"learning_rate": 5.281815777935077e-06,
"loss": 0.0939,
"mean_token_accuracy": 0.9777224063873291,
"step": 264
},
{
"epoch": 1.1324786324786325,
"grad_norm": 2.6769258975982666,
"learning_rate": 5.248281688625984e-06,
"loss": 0.1019,
"mean_token_accuracy": 0.9765105247497559,
"step": 265
},
{
"epoch": 1.1367521367521367,
"grad_norm": 2.6799635887145996,
"learning_rate": 5.214761616099831e-06,
"loss": 0.1106,
"mean_token_accuracy": 0.9744277596473694,
"step": 266
},
{
"epoch": 1.141025641025641,
"grad_norm": 2.4581170082092285,
"learning_rate": 5.18125742690172e-06,
"loss": 0.0975,
"mean_token_accuracy": 0.9779106974601746,
"step": 267
},
{
"epoch": 1.1452991452991452,
"grad_norm": 2.4275119304656982,
"learning_rate": 5.147770986692298e-06,
"loss": 0.0896,
"mean_token_accuracy": 0.977669358253479,
"step": 268
},
{
"epoch": 1.1495726495726495,
"grad_norm": 2.197598934173584,
"learning_rate": 5.114304160143873e-06,
"loss": 0.0904,
"mean_token_accuracy": 0.9766778349876404,
"step": 269
},
{
"epoch": 1.1538461538461537,
"grad_norm": 2.2721002101898193,
"learning_rate": 5.08085881083657e-06,
"loss": 0.0897,
"mean_token_accuracy": 0.977644681930542,
"step": 270
},
{
"epoch": 1.158119658119658,
"grad_norm": 2.1074318885803223,
"learning_rate": 5.047436801154575e-06,
"loss": 0.0877,
"mean_token_accuracy": 0.9794923067092896,
"step": 271
},
{
"epoch": 1.1623931623931625,
"grad_norm": 2.6289446353912354,
"learning_rate": 5.014039992182417e-06,
"loss": 0.0994,
"mean_token_accuracy": 0.9763491153717041,
"step": 272
},
{
"epoch": 1.1666666666666667,
"grad_norm": 2.1914560794830322,
"learning_rate": 4.980670243601341e-06,
"loss": 0.087,
"mean_token_accuracy": 0.9788976907730103,
"step": 273
},
{
"epoch": 1.170940170940171,
"grad_norm": 2.311638116836548,
"learning_rate": 4.947329413585746e-06,
"loss": 0.092,
"mean_token_accuracy": 0.9794085025787354,
"step": 274
},
{
"epoch": 1.1752136752136753,
"grad_norm": 2.0195963382720947,
"learning_rate": 4.914019358699725e-06,
"loss": 0.0755,
"mean_token_accuracy": 0.9825838804244995,
"step": 275
},
{
"epoch": 1.1794871794871795,
"grad_norm": 2.5088884830474854,
"learning_rate": 4.880741933793669e-06,
"loss": 0.1063,
"mean_token_accuracy": 0.9738301038742065,
"step": 276
},
{
"epoch": 1.1837606837606838,
"grad_norm": 2.246293544769287,
"learning_rate": 4.8474989919009915e-06,
"loss": 0.0943,
"mean_token_accuracy": 0.9782775044441223,
"step": 277
},
{
"epoch": 1.188034188034188,
"grad_norm": 2.261032819747925,
"learning_rate": 4.8142923841349435e-06,
"loss": 0.0944,
"mean_token_accuracy": 0.9763620495796204,
"step": 278
},
{
"epoch": 1.1923076923076923,
"grad_norm": 2.5907723903656006,
"learning_rate": 4.781123959585526e-06,
"loss": 0.1054,
"mean_token_accuracy": 0.9734842777252197,
"step": 279
},
{
"epoch": 1.1965811965811965,
"grad_norm": 2.166476011276245,
"learning_rate": 4.747995565216532e-06,
"loss": 0.0817,
"mean_token_accuracy": 0.9802180528640747,
"step": 280
},
{
"epoch": 1.2008547008547008,
"grad_norm": 2.6413872241973877,
"learning_rate": 4.714909045762702e-06,
"loss": 0.1075,
"mean_token_accuracy": 0.9746893644332886,
"step": 281
},
{
"epoch": 1.205128205128205,
"grad_norm": 2.4932637214660645,
"learning_rate": 4.681866243626992e-06,
"loss": 0.1068,
"mean_token_accuracy": 0.9767117500305176,
"step": 282
},
{
"epoch": 1.2094017094017093,
"grad_norm": 1.9535247087478638,
"learning_rate": 4.64886899877799e-06,
"loss": 0.0828,
"mean_token_accuracy": 0.9810839295387268,
"step": 283
},
{
"epoch": 1.2136752136752136,
"grad_norm": 2.248490810394287,
"learning_rate": 4.615919148647449e-06,
"loss": 0.0932,
"mean_token_accuracy": 0.9774066805839539,
"step": 284
},
{
"epoch": 1.217948717948718,
"grad_norm": 2.524912118911743,
"learning_rate": 4.583018528027976e-06,
"loss": 0.0975,
"mean_token_accuracy": 0.9775086641311646,
"step": 285
},
{
"epoch": 1.2222222222222223,
"grad_norm": 2.622223138809204,
"learning_rate": 4.550168968970869e-06,
"loss": 0.0936,
"mean_token_accuracy": 0.9763756990432739,
"step": 286
},
{
"epoch": 1.2264957264957266,
"grad_norm": 2.0964972972869873,
"learning_rate": 4.517372300684087e-06,
"loss": 0.0842,
"mean_token_accuracy": 0.9804538488388062,
"step": 287
},
{
"epoch": 1.2307692307692308,
"grad_norm": 2.625102996826172,
"learning_rate": 4.484630349430398e-06,
"loss": 0.0949,
"mean_token_accuracy": 0.975182294845581,
"step": 288
},
{
"epoch": 1.235042735042735,
"grad_norm": 2.5802104473114014,
"learning_rate": 4.45194493842569e-06,
"loss": 0.0979,
"mean_token_accuracy": 0.9763470888137817,
"step": 289
},
{
"epoch": 1.2393162393162394,
"grad_norm": 2.3211922645568848,
"learning_rate": 4.419317887737434e-06,
"loss": 0.0882,
"mean_token_accuracy": 0.9776690006256104,
"step": 290
},
{
"epoch": 1.2435897435897436,
"grad_norm": 2.8441874980926514,
"learning_rate": 4.386751014183352e-06,
"loss": 0.1091,
"mean_token_accuracy": 0.9710574746131897,
"step": 291
},
{
"epoch": 1.2478632478632479,
"grad_norm": 2.6006760597229004,
"learning_rate": 4.3542461312302264e-06,
"loss": 0.098,
"mean_token_accuracy": 0.9752784967422485,
"step": 292
},
{
"epoch": 1.2521367521367521,
"grad_norm": 2.247154712677002,
"learning_rate": 4.321805048892942e-06,
"loss": 0.0833,
"mean_token_accuracy": 0.9812250137329102,
"step": 293
},
{
"epoch": 1.2564102564102564,
"grad_norm": 2.42104172706604,
"learning_rate": 4.2894295736336725e-06,
"loss": 0.0872,
"mean_token_accuracy": 0.9812902212142944,
"step": 294
},
{
"epoch": 1.2606837606837606,
"grad_norm": 2.7402937412261963,
"learning_rate": 4.257121508261311e-06,
"loss": 0.0995,
"mean_token_accuracy": 0.9761265516281128,
"step": 295
},
{
"epoch": 1.264957264957265,
"grad_norm": 2.2438583374023438,
"learning_rate": 4.224882651831067e-06,
"loss": 0.0892,
"mean_token_accuracy": 0.9804651141166687,
"step": 296
},
{
"epoch": 1.2692307692307692,
"grad_norm": 2.573054075241089,
"learning_rate": 4.192714799544293e-06,
"loss": 0.1079,
"mean_token_accuracy": 0.9739599227905273,
"step": 297
},
{
"epoch": 1.2735042735042734,
"grad_norm": 2.515164852142334,
"learning_rate": 4.1606197426485175e-06,
"loss": 0.1055,
"mean_token_accuracy": 0.9740031957626343,
"step": 298
},
{
"epoch": 1.2777777777777777,
"grad_norm": 2.208859920501709,
"learning_rate": 4.128599268337699e-06,
"loss": 0.0816,
"mean_token_accuracy": 0.9804835319519043,
"step": 299
},
{
"epoch": 1.282051282051282,
"grad_norm": 2.472729444503784,
"learning_rate": 4.096655159652717e-06,
"loss": 0.0953,
"mean_token_accuracy": 0.9765296578407288,
"step": 300
},
{
"epoch": 1.2863247863247862,
"grad_norm": 2.4752037525177,
"learning_rate": 4.064789195382068e-06,
"loss": 0.0997,
"mean_token_accuracy": 0.9748784303665161,
"step": 301
},
{
"epoch": 1.2905982905982907,
"grad_norm": 2.179346799850464,
"learning_rate": 4.033003149962833e-06,
"loss": 0.0883,
"mean_token_accuracy": 0.9797558784484863,
"step": 302
},
{
"epoch": 1.294871794871795,
"grad_norm": 2.3181893825531006,
"learning_rate": 4.00129879338185e-06,
"loss": 0.0914,
"mean_token_accuracy": 0.974800705909729,
"step": 303
},
{
"epoch": 1.2991452991452992,
"grad_norm": 2.3745031356811523,
"learning_rate": 3.96967789107717e-06,
"loss": 0.0958,
"mean_token_accuracy": 0.9788101315498352,
"step": 304
},
{
"epoch": 1.3034188034188035,
"grad_norm": 2.3366518020629883,
"learning_rate": 3.9381422038397395e-06,
"loss": 0.0867,
"mean_token_accuracy": 0.9784852266311646,
"step": 305
},
{
"epoch": 1.3076923076923077,
"grad_norm": 2.771221160888672,
"learning_rate": 3.906693487715358e-06,
"loss": 0.1107,
"mean_token_accuracy": 0.9715330600738525,
"step": 306
},
{
"epoch": 1.311965811965812,
"grad_norm": 2.3197829723358154,
"learning_rate": 3.87533349390689e-06,
"loss": 0.0947,
"mean_token_accuracy": 0.9761702418327332,
"step": 307
},
{
"epoch": 1.3162393162393162,
"grad_norm": 2.152040719985962,
"learning_rate": 3.844063968676748e-06,
"loss": 0.0928,
"mean_token_accuracy": 0.9783110022544861,
"step": 308
},
{
"epoch": 1.3205128205128205,
"grad_norm": 2.331920623779297,
"learning_rate": 3.8128866532496576e-06,
"loss": 0.0911,
"mean_token_accuracy": 0.9766107797622681,
"step": 309
},
{
"epoch": 1.3247863247863247,
"grad_norm": 2.161576986312866,
"learning_rate": 3.7818032837157006e-06,
"loss": 0.0878,
"mean_token_accuracy": 0.9797548055648804,
"step": 310
},
{
"epoch": 1.329059829059829,
"grad_norm": 2.173659086227417,
"learning_rate": 3.750815590933633e-06,
"loss": 0.0912,
"mean_token_accuracy": 0.9780091047286987,
"step": 311
},
{
"epoch": 1.3333333333333333,
"grad_norm": 2.7930123805999756,
"learning_rate": 3.7199253004345114e-06,
"loss": 0.1133,
"mean_token_accuracy": 0.9729244112968445,
"step": 312
},
{
"epoch": 1.3376068376068377,
"grad_norm": 2.3382835388183594,
"learning_rate": 3.6891341323256047e-06,
"loss": 0.0904,
"mean_token_accuracy": 0.9790826439857483,
"step": 313
},
{
"epoch": 1.341880341880342,
"grad_norm": 1.9892425537109375,
"learning_rate": 3.65844380119461e-06,
"loss": 0.0868,
"mean_token_accuracy": 0.9794397950172424,
"step": 314
},
{
"epoch": 1.3461538461538463,
"grad_norm": 2.2188029289245605,
"learning_rate": 3.6278560160141774e-06,
"loss": 0.0878,
"mean_token_accuracy": 0.9780125617980957,
"step": 315
},
{
"epoch": 1.3504273504273505,
"grad_norm": 2.6905603408813477,
"learning_rate": 3.597372480046749e-06,
"loss": 0.0933,
"mean_token_accuracy": 0.9762943387031555,
"step": 316
},
{
"epoch": 1.3547008547008548,
"grad_norm": 2.358496904373169,
"learning_rate": 3.5669948907497108e-06,
"loss": 0.0851,
"mean_token_accuracy": 0.9798140525817871,
"step": 317
},
{
"epoch": 1.358974358974359,
"grad_norm": 2.5938258171081543,
"learning_rate": 3.5367249396808733e-06,
"loss": 0.0974,
"mean_token_accuracy": 0.9744832515716553,
"step": 318
},
{
"epoch": 1.3632478632478633,
"grad_norm": 2.1112005710601807,
"learning_rate": 3.5065643124042746e-06,
"loss": 0.0861,
"mean_token_accuracy": 0.9812760949134827,
"step": 319
},
{
"epoch": 1.3675213675213675,
"grad_norm": 2.4144303798675537,
"learning_rate": 3.4765146883963263e-06,
"loss": 0.0952,
"mean_token_accuracy": 0.9786186814308167,
"step": 320
},
{
"epoch": 1.3717948717948718,
"grad_norm": 2.3461201190948486,
"learning_rate": 3.4465777409522915e-06,
"loss": 0.0883,
"mean_token_accuracy": 0.9787299633026123,
"step": 321
},
{
"epoch": 1.376068376068376,
"grad_norm": 2.0723886489868164,
"learning_rate": 3.4167551370930955e-06,
"loss": 0.0851,
"mean_token_accuracy": 0.9797854423522949,
"step": 322
},
{
"epoch": 1.3803418803418803,
"grad_norm": 2.3695363998413086,
"learning_rate": 3.3870485374725217e-06,
"loss": 0.0949,
"mean_token_accuracy": 0.9754375219345093,
"step": 323
},
{
"epoch": 1.3846153846153846,
"grad_norm": 2.0983939170837402,
"learning_rate": 3.3574595962847234e-06,
"loss": 0.0816,
"mean_token_accuracy": 0.9791032671928406,
"step": 324
},
{
"epoch": 1.3888888888888888,
"grad_norm": 2.357313871383667,
"learning_rate": 3.327989961172112e-06,
"loss": 0.0934,
"mean_token_accuracy": 0.9782047271728516,
"step": 325
},
{
"epoch": 1.393162393162393,
"grad_norm": 2.5017433166503906,
"learning_rate": 3.2986412731336184e-06,
"loss": 0.0876,
"mean_token_accuracy": 0.9789897203445435,
"step": 326
},
{
"epoch": 1.3974358974358974,
"grad_norm": 2.283299684524536,
"learning_rate": 3.269415166433297e-06,
"loss": 0.0828,
"mean_token_accuracy": 0.9798080921173096,
"step": 327
},
{
"epoch": 1.4017094017094016,
"grad_norm": 2.114645481109619,
"learning_rate": 3.2403132685093455e-06,
"loss": 0.0856,
"mean_token_accuracy": 0.9803335666656494,
"step": 328
},
{
"epoch": 1.4059829059829059,
"grad_norm": 2.050668478012085,
"learning_rate": 3.2113371998834677e-06,
"loss": 0.0836,
"mean_token_accuracy": 0.980510950088501,
"step": 329
},
{
"epoch": 1.4102564102564101,
"grad_norm": 2.2558069229125977,
"learning_rate": 3.1824885740706323e-06,
"loss": 0.08,
"mean_token_accuracy": 0.9798475503921509,
"step": 330
},
{
"epoch": 1.4145299145299146,
"grad_norm": 2.7303566932678223,
"learning_rate": 3.1537689974892393e-06,
"loss": 0.1006,
"mean_token_accuracy": 0.9758530855178833,
"step": 331
},
{
"epoch": 1.4188034188034189,
"grad_norm": 2.5855560302734375,
"learning_rate": 3.125180069371655e-06,
"loss": 0.0972,
"mean_token_accuracy": 0.9753068089485168,
"step": 332
},
{
"epoch": 1.4230769230769231,
"grad_norm": 2.2191054821014404,
"learning_rate": 3.0967233816751657e-06,
"loss": 0.0871,
"mean_token_accuracy": 0.9791666865348816,
"step": 333
},
{
"epoch": 1.4273504273504274,
"grad_norm": 2.373244285583496,
"learning_rate": 3.0684005189933317e-06,
"loss": 0.0879,
"mean_token_accuracy": 0.9779411554336548,
"step": 334
},
{
"epoch": 1.4316239316239316,
"grad_norm": 2.3935558795928955,
"learning_rate": 3.040213058467746e-06,
"loss": 0.0944,
"mean_token_accuracy": 0.9777158498764038,
"step": 335
},
{
"epoch": 1.435897435897436,
"grad_norm": 2.688636302947998,
"learning_rate": 3.012162569700209e-06,
"loss": 0.099,
"mean_token_accuracy": 0.9758201241493225,
"step": 336
},
{
"epoch": 1.4401709401709402,
"grad_norm": 2.169508934020996,
"learning_rate": 2.9842506146653395e-06,
"loss": 0.0875,
"mean_token_accuracy": 0.9801456928253174,
"step": 337
},
{
"epoch": 1.4444444444444444,
"grad_norm": 2.194892168045044,
"learning_rate": 2.9564787476235828e-06,
"loss": 0.0803,
"mean_token_accuracy": 0.9811508059501648,
"step": 338
},
{
"epoch": 1.4487179487179487,
"grad_norm": 2.4856936931610107,
"learning_rate": 2.928848515034673e-06,
"loss": 0.0882,
"mean_token_accuracy": 0.9796777367591858,
"step": 339
},
{
"epoch": 1.452991452991453,
"grad_norm": 2.5571961402893066,
"learning_rate": 2.9013614554715084e-06,
"loss": 0.0968,
"mean_token_accuracy": 0.9769884347915649,
"step": 340
},
{
"epoch": 1.4572649572649572,
"grad_norm": 2.440589427947998,
"learning_rate": 2.8740190995344908e-06,
"loss": 0.0924,
"mean_token_accuracy": 0.9782286882400513,
"step": 341
},
{
"epoch": 1.4615384615384617,
"grad_norm": 2.2676053047180176,
"learning_rate": 2.846822969766281e-06,
"loss": 0.0802,
"mean_token_accuracy": 0.9807692170143127,
"step": 342
},
{
"epoch": 1.465811965811966,
"grad_norm": 2.512946605682373,
"learning_rate": 2.8197745805670274e-06,
"loss": 0.1079,
"mean_token_accuracy": 0.9728260636329651,
"step": 343
},
{
"epoch": 1.4700854700854702,
"grad_norm": 2.4237072467803955,
"learning_rate": 2.792875438110033e-06,
"loss": 0.0919,
"mean_token_accuracy": 0.9775099158287048,
"step": 344
},
{
"epoch": 1.4743589743589745,
"grad_norm": 2.0417165756225586,
"learning_rate": 2.766127040257884e-06,
"loss": 0.082,
"mean_token_accuracy": 0.9809138178825378,
"step": 345
},
{
"epoch": 1.4786324786324787,
"grad_norm": 2.1946988105773926,
"learning_rate": 2.739530876479048e-06,
"loss": 0.0853,
"mean_token_accuracy": 0.9817578792572021,
"step": 346
},
{
"epoch": 1.482905982905983,
"grad_norm": 2.1269588470458984,
"learning_rate": 2.7130884277649215e-06,
"loss": 0.0887,
"mean_token_accuracy": 0.978447675704956,
"step": 347
},
{
"epoch": 1.4871794871794872,
"grad_norm": 2.0282142162323,
"learning_rate": 2.6868011665473777e-06,
"loss": 0.0818,
"mean_token_accuracy": 0.981675386428833,
"step": 348
},
{
"epoch": 1.4914529914529915,
"grad_norm": 2.108011484146118,
"learning_rate": 2.660670556616768e-06,
"loss": 0.086,
"mean_token_accuracy": 0.9770215153694153,
"step": 349
},
{
"epoch": 1.4957264957264957,
"grad_norm": 2.2282185554504395,
"learning_rate": 2.634698053040401e-06,
"loss": 0.1159,
"mean_token_accuracy": 0.9792807698249817,
"step": 350
},
{
"epoch": 1.5,
"grad_norm": 1.9703973531723022,
"learning_rate": 2.608885102081539e-06,
"loss": 0.0821,
"mean_token_accuracy": 0.9803772568702698,
"step": 351
},
{
"epoch": 1.5042735042735043,
"grad_norm": 2.060464859008789,
"learning_rate": 2.5832331411188476e-06,
"loss": 0.0854,
"mean_token_accuracy": 0.9798780679702759,
"step": 352
},
{
"epoch": 1.5085470085470085,
"grad_norm": 2.1341521739959717,
"learning_rate": 2.5577435985663614e-06,
"loss": 0.0816,
"mean_token_accuracy": 0.9788466691970825,
"step": 353
},
{
"epoch": 1.5128205128205128,
"grad_norm": 1.9853061437606812,
"learning_rate": 2.5324178937939436e-06,
"loss": 0.0825,
"mean_token_accuracy": 0.9808792471885681,
"step": 354
},
{
"epoch": 1.517094017094017,
"grad_norm": 2.1807961463928223,
"learning_rate": 2.5072574370482493e-06,
"loss": 0.081,
"mean_token_accuracy": 0.9800437092781067,
"step": 355
},
{
"epoch": 1.5213675213675213,
"grad_norm": 1.9949653148651123,
"learning_rate": 2.482263629374197e-06,
"loss": 0.0804,
"mean_token_accuracy": 0.9815117120742798,
"step": 356
},
{
"epoch": 1.5256410256410255,
"grad_norm": 2.1713290214538574,
"learning_rate": 2.457437862536953e-06,
"loss": 0.0785,
"mean_token_accuracy": 0.9811709523200989,
"step": 357
},
{
"epoch": 1.5299145299145298,
"grad_norm": 2.1930902004241943,
"learning_rate": 2.4327815189444255e-06,
"loss": 0.0897,
"mean_token_accuracy": 0.9784946441650391,
"step": 358
},
{
"epoch": 1.534188034188034,
"grad_norm": 2.5312485694885254,
"learning_rate": 2.408295971570297e-06,
"loss": 0.0921,
"mean_token_accuracy": 0.9755170941352844,
"step": 359
},
{
"epoch": 1.5384615384615383,
"grad_norm": 2.149301052093506,
"learning_rate": 2.38398258387756e-06,
"loss": 0.0844,
"mean_token_accuracy": 0.9801076650619507,
"step": 360
},
{
"epoch": 1.5427350427350426,
"grad_norm": 1.987364649772644,
"learning_rate": 2.359842709742603e-06,
"loss": 0.0895,
"mean_token_accuracy": 0.980217456817627,
"step": 361
},
{
"epoch": 1.547008547008547,
"grad_norm": 2.093104839324951,
"learning_rate": 2.3358776933798166e-06,
"loss": 0.0757,
"mean_token_accuracy": 0.9816683530807495,
"step": 362
},
{
"epoch": 1.5512820512820513,
"grad_norm": 1.9293057918548584,
"learning_rate": 2.3120888692667358e-06,
"loss": 0.0768,
"mean_token_accuracy": 0.9828811883926392,
"step": 363
},
{
"epoch": 1.5555555555555556,
"grad_norm": 1.9935529232025146,
"learning_rate": 2.28847756206974e-06,
"loss": 0.083,
"mean_token_accuracy": 0.9815475940704346,
"step": 364
},
{
"epoch": 1.5598290598290598,
"grad_norm": 2.6537835597991943,
"learning_rate": 2.2650450865702876e-06,
"loss": 0.0837,
"mean_token_accuracy": 0.9782962799072266,
"step": 365
},
{
"epoch": 1.564102564102564,
"grad_norm": 2.2035484313964844,
"learning_rate": 2.241792747591695e-06,
"loss": 0.0971,
"mean_token_accuracy": 0.9779199957847595,
"step": 366
},
{
"epoch": 1.5683760683760684,
"grad_norm": 1.952289342880249,
"learning_rate": 2.2187218399264933e-06,
"loss": 0.0765,
"mean_token_accuracy": 0.9826548099517822,
"step": 367
},
{
"epoch": 1.5726495726495726,
"grad_norm": 2.5793616771698,
"learning_rate": 2.1958336482643123e-06,
"loss": 0.0884,
"mean_token_accuracy": 0.9791607856750488,
"step": 368
},
{
"epoch": 1.5769230769230769,
"grad_norm": 2.2669849395751953,
"learning_rate": 2.1731294471203543e-06,
"loss": 0.084,
"mean_token_accuracy": 0.9792264699935913,
"step": 369
},
{
"epoch": 1.5811965811965814,
"grad_norm": 2.0273962020874023,
"learning_rate": 2.1506105007644216e-06,
"loss": 0.0834,
"mean_token_accuracy": 0.9810412526130676,
"step": 370
},
{
"epoch": 1.5854700854700856,
"grad_norm": 2.562767505645752,
"learning_rate": 2.128278063150511e-06,
"loss": 0.0956,
"mean_token_accuracy": 0.9758344888687134,
"step": 371
},
{
"epoch": 1.5897435897435899,
"grad_norm": 2.866821765899658,
"learning_rate": 2.106133377846996e-06,
"loss": 0.0939,
"mean_token_accuracy": 0.9768562912940979,
"step": 372
},
{
"epoch": 1.5940170940170941,
"grad_norm": 2.002687931060791,
"learning_rate": 2.0841776779673715e-06,
"loss": 0.088,
"mean_token_accuracy": 0.9787533283233643,
"step": 373
},
{
"epoch": 1.5982905982905984,
"grad_norm": 2.1603848934173584,
"learning_rate": 2.062412186101596e-06,
"loss": 0.087,
"mean_token_accuracy": 0.9783121347427368,
"step": 374
},
{
"epoch": 1.6025641025641026,
"grad_norm": 2.0110819339752197,
"learning_rate": 2.0408381142480095e-06,
"loss": 0.081,
"mean_token_accuracy": 0.9819519519805908,
"step": 375
},
{
"epoch": 1.606837606837607,
"grad_norm": 2.259678840637207,
"learning_rate": 2.019456663745839e-06,
"loss": 0.0916,
"mean_token_accuracy": 0.9787756204605103,
"step": 376
},
{
"epoch": 1.6111111111111112,
"grad_norm": 2.026271343231201,
"learning_rate": 1.9982690252083127e-06,
"loss": 0.0798,
"mean_token_accuracy": 0.9790022373199463,
"step": 377
},
{
"epoch": 1.6153846153846154,
"grad_norm": 2.129807472229004,
"learning_rate": 1.977276378456352e-06,
"loss": 0.0762,
"mean_token_accuracy": 0.9800582528114319,
"step": 378
},
{
"epoch": 1.6196581196581197,
"grad_norm": 2.276139259338379,
"learning_rate": 1.956479892452878e-06,
"loss": 0.0944,
"mean_token_accuracy": 0.9772907495498657,
"step": 379
},
{
"epoch": 1.623931623931624,
"grad_norm": 1.944844126701355,
"learning_rate": 1.9358807252377226e-06,
"loss": 0.0786,
"mean_token_accuracy": 0.9809095859527588,
"step": 380
},
{
"epoch": 1.6282051282051282,
"grad_norm": 2.225731611251831,
"learning_rate": 1.9154800238631344e-06,
"loss": 0.0839,
"mean_token_accuracy": 0.9785885214805603,
"step": 381
},
{
"epoch": 1.6324786324786325,
"grad_norm": 2.48402738571167,
"learning_rate": 1.8952789243299141e-06,
"loss": 0.0859,
"mean_token_accuracy": 0.9793146252632141,
"step": 382
},
{
"epoch": 1.6367521367521367,
"grad_norm": 2.5232338905334473,
"learning_rate": 1.8752785515241536e-06,
"loss": 0.0978,
"mean_token_accuracy": 0.9753180146217346,
"step": 383
},
{
"epoch": 1.641025641025641,
"grad_norm": 2.413363456726074,
"learning_rate": 1.8554800191545957e-06,
"loss": 0.0961,
"mean_token_accuracy": 0.9776303768157959,
"step": 384
},
{
"epoch": 1.6452991452991452,
"grad_norm": 3.013465404510498,
"learning_rate": 1.8358844296906213e-06,
"loss": 0.1043,
"mean_token_accuracy": 0.9743627309799194,
"step": 385
},
{
"epoch": 1.6495726495726495,
"grad_norm": 1.9660561084747314,
"learning_rate": 1.8164928743008564e-06,
"loss": 0.0783,
"mean_token_accuracy": 0.9812124967575073,
"step": 386
},
{
"epoch": 1.6538461538461537,
"grad_norm": 1.8742467164993286,
"learning_rate": 1.7973064327924128e-06,
"loss": 0.0813,
"mean_token_accuracy": 0.9807978272438049,
"step": 387
},
{
"epoch": 1.658119658119658,
"grad_norm": 2.1209309101104736,
"learning_rate": 1.778326173550761e-06,
"loss": 0.0954,
"mean_token_accuracy": 0.9767441749572754,
"step": 388
},
{
"epoch": 1.6623931623931623,
"grad_norm": 2.1338541507720947,
"learning_rate": 1.7595531534802317e-06,
"loss": 0.0943,
"mean_token_accuracy": 0.9795450568199158,
"step": 389
},
{
"epoch": 1.6666666666666665,
"grad_norm": 1.9863265752792358,
"learning_rate": 1.7409884179451714e-06,
"loss": 0.0888,
"mean_token_accuracy": 0.977613091468811,
"step": 390
},
{
"epoch": 1.6709401709401708,
"grad_norm": 2.217792272567749,
"learning_rate": 1.7226330007117231e-06,
"loss": 0.0915,
"mean_token_accuracy": 0.9778071641921997,
"step": 391
},
{
"epoch": 1.6752136752136753,
"grad_norm": 2.277088165283203,
"learning_rate": 1.7044879238902675e-06,
"loss": 0.0923,
"mean_token_accuracy": 0.9758321046829224,
"step": 392
},
{
"epoch": 1.6794871794871795,
"grad_norm": 2.33744740486145,
"learning_rate": 1.6865541978785083e-06,
"loss": 0.0888,
"mean_token_accuracy": 0.9786233901977539,
"step": 393
},
{
"epoch": 1.6837606837606838,
"grad_norm": 2.055072546005249,
"learning_rate": 1.6688328213052018e-06,
"loss": 0.0815,
"mean_token_accuracy": 0.9813379645347595,
"step": 394
},
{
"epoch": 1.688034188034188,
"grad_norm": 2.1897664070129395,
"learning_rate": 1.6513247809745587e-06,
"loss": 0.0911,
"mean_token_accuracy": 0.9775086641311646,
"step": 395
},
{
"epoch": 1.6923076923076923,
"grad_norm": 2.0002174377441406,
"learning_rate": 1.634031051811284e-06,
"loss": 0.0717,
"mean_token_accuracy": 0.9830272793769836,
"step": 396
},
{
"epoch": 1.6965811965811965,
"grad_norm": 2.155630111694336,
"learning_rate": 1.6169525968062965e-06,
"loss": 0.0884,
"mean_token_accuracy": 0.9808605909347534,
"step": 397
},
{
"epoch": 1.7008547008547008,
"grad_norm": 1.7671974897384644,
"learning_rate": 1.6000903669631052e-06,
"loss": 0.0826,
"mean_token_accuracy": 0.980997622013092,
"step": 398
},
{
"epoch": 1.7051282051282053,
"grad_norm": 2.4356114864349365,
"learning_rate": 1.5834453012448455e-06,
"loss": 0.0939,
"mean_token_accuracy": 0.9777652025222778,
"step": 399
},
{
"epoch": 1.7094017094017095,
"grad_norm": 2.3396825790405273,
"learning_rate": 1.5670183265220046e-06,
"loss": 0.0909,
"mean_token_accuracy": 0.9789909720420837,
"step": 400
},
{
"epoch": 1.7136752136752138,
"grad_norm": 2.167603015899658,
"learning_rate": 1.5508103575207989e-06,
"loss": 0.0854,
"mean_token_accuracy": 0.9791582822799683,
"step": 401
},
{
"epoch": 1.717948717948718,
"grad_norm": 1.838385820388794,
"learning_rate": 1.5348222967722451e-06,
"loss": 0.0786,
"mean_token_accuracy": 0.9818563461303711,
"step": 402
},
{
"epoch": 1.7222222222222223,
"grad_norm": 2.2510430812835693,
"learning_rate": 1.5190550345619021e-06,
"loss": 0.0931,
"mean_token_accuracy": 0.9793668985366821,
"step": 403
},
{
"epoch": 1.7264957264957266,
"grad_norm": 1.8351444005966187,
"learning_rate": 1.503509448880292e-06,
"loss": 0.078,
"mean_token_accuracy": 0.9809679388999939,
"step": 404
},
{
"epoch": 1.7307692307692308,
"grad_norm": 2.311199188232422,
"learning_rate": 1.4881864053740154e-06,
"loss": 0.0862,
"mean_token_accuracy": 0.9793581366539001,
"step": 405
},
{
"epoch": 1.735042735042735,
"grad_norm": 2.3395721912384033,
"learning_rate": 1.473086757297543e-06,
"loss": 0.0812,
"mean_token_accuracy": 0.9812895655632019,
"step": 406
},
{
"epoch": 1.7393162393162394,
"grad_norm": 2.0679736137390137,
"learning_rate": 1.4582113454657057e-06,
"loss": 0.0876,
"mean_token_accuracy": 0.9811984300613403,
"step": 407
},
{
"epoch": 1.7435897435897436,
"grad_norm": 2.3800151348114014,
"learning_rate": 1.4435609982068766e-06,
"loss": 0.0877,
"mean_token_accuracy": 0.9787911176681519,
"step": 408
},
{
"epoch": 1.7478632478632479,
"grad_norm": 2.3753533363342285,
"learning_rate": 1.4291365313168393e-06,
"loss": 0.0889,
"mean_token_accuracy": 0.9775543808937073,
"step": 409
},
{
"epoch": 1.7521367521367521,
"grad_norm": 2.0593650341033936,
"learning_rate": 1.4149387480133674e-06,
"loss": 0.0897,
"mean_token_accuracy": 0.9783128499984741,
"step": 410
},
{
"epoch": 1.7564102564102564,
"grad_norm": 2.45052170753479,
"learning_rate": 1.4009684388914957e-06,
"loss": 0.0955,
"mean_token_accuracy": 0.9760656952857971,
"step": 411
},
{
"epoch": 1.7606837606837606,
"grad_norm": 2.1106207370758057,
"learning_rate": 1.3872263818794918e-06,
"loss": 0.0989,
"mean_token_accuracy": 0.9786368012428284,
"step": 412
},
{
"epoch": 1.764957264957265,
"grad_norm": 2.3723652362823486,
"learning_rate": 1.373713342195548e-06,
"loss": 0.0878,
"mean_token_accuracy": 0.9786877632141113,
"step": 413
},
{
"epoch": 1.7692307692307692,
"grad_norm": 1.7907770872116089,
"learning_rate": 1.3604300723051571e-06,
"loss": 0.0767,
"mean_token_accuracy": 0.9808289408683777,
"step": 414
},
{
"epoch": 1.7735042735042734,
"grad_norm": 2.0177221298217773,
"learning_rate": 1.347377311879225e-06,
"loss": 0.0903,
"mean_token_accuracy": 0.9788788557052612,
"step": 415
},
{
"epoch": 1.7777777777777777,
"grad_norm": 2.277787446975708,
"learning_rate": 1.3345557877528737e-06,
"loss": 0.0931,
"mean_token_accuracy": 0.9771180152893066,
"step": 416
},
{
"epoch": 1.782051282051282,
"grad_norm": 1.9423468112945557,
"learning_rate": 1.3219662138849707e-06,
"loss": 0.082,
"mean_token_accuracy": 0.9799258708953857,
"step": 417
},
{
"epoch": 1.7863247863247862,
"grad_norm": 1.8951685428619385,
"learning_rate": 1.3096092913183741e-06,
"loss": 0.0753,
"mean_token_accuracy": 0.9838992953300476,
"step": 418
},
{
"epoch": 1.7905982905982905,
"grad_norm": 2.024014949798584,
"learning_rate": 1.2974857081408935e-06,
"loss": 0.0865,
"mean_token_accuracy": 0.9807155132293701,
"step": 419
},
{
"epoch": 1.7948717948717947,
"grad_norm": 2.1214489936828613,
"learning_rate": 1.2855961394469728e-06,
"loss": 0.089,
"mean_token_accuracy": 0.9781718850135803,
"step": 420
},
{
"epoch": 1.7991452991452992,
"grad_norm": 2.066615581512451,
"learning_rate": 1.273941247300104e-06,
"loss": 0.0831,
"mean_token_accuracy": 0.9806697368621826,
"step": 421
},
{
"epoch": 1.8034188034188035,
"grad_norm": 1.923561930656433,
"learning_rate": 1.2625216806959522e-06,
"loss": 0.0765,
"mean_token_accuracy": 0.9830482602119446,
"step": 422
},
{
"epoch": 1.8076923076923077,
"grad_norm": 1.823364496231079,
"learning_rate": 1.2513380755262242e-06,
"loss": 0.085,
"mean_token_accuracy": 0.9803729057312012,
"step": 423
},
{
"epoch": 1.811965811965812,
"grad_norm": 2.010810136795044,
"learning_rate": 1.240391054543255e-06,
"loss": 0.0804,
"mean_token_accuracy": 0.9809787273406982,
"step": 424
},
{
"epoch": 1.8162393162393162,
"grad_norm": 2.2021121978759766,
"learning_rate": 1.2296812273253308e-06,
"loss": 0.0832,
"mean_token_accuracy": 0.9796929955482483,
"step": 425
},
{
"epoch": 1.8205128205128205,
"grad_norm": 2.7726950645446777,
"learning_rate": 1.2192091902427471e-06,
"loss": 0.1055,
"mean_token_accuracy": 0.9748052358627319,
"step": 426
},
{
"epoch": 1.8247863247863247,
"grad_norm": 2.122213363647461,
"learning_rate": 1.2089755264245962e-06,
"loss": 0.0836,
"mean_token_accuracy": 0.9801605939865112,
"step": 427
},
{
"epoch": 1.8290598290598292,
"grad_norm": 2.100584030151367,
"learning_rate": 1.1989808057263e-06,
"loss": 0.0844,
"mean_token_accuracy": 0.9800516366958618,
"step": 428
},
{
"epoch": 1.8333333333333335,
"grad_norm": 2.343102216720581,
"learning_rate": 1.1892255846978764e-06,
"loss": 0.0885,
"mean_token_accuracy": 0.9799159169197083,
"step": 429
},
{
"epoch": 1.8376068376068377,
"grad_norm": 2.159567356109619,
"learning_rate": 1.179710406552947e-06,
"loss": 0.0812,
"mean_token_accuracy": 0.9803233742713928,
"step": 430
},
{
"epoch": 1.841880341880342,
"grad_norm": 2.3904519081115723,
"learning_rate": 1.1704358011384918e-06,
"loss": 0.1003,
"mean_token_accuracy": 0.9763982892036438,
"step": 431
},
{
"epoch": 1.8461538461538463,
"grad_norm": 1.9930096864700317,
"learning_rate": 1.1614022849053393e-06,
"loss": 0.0836,
"mean_token_accuracy": 0.9820988774299622,
"step": 432
},
{
"epoch": 1.8504273504273505,
"grad_norm": 1.978892207145691,
"learning_rate": 1.152610360879415e-06,
"loss": 0.0847,
"mean_token_accuracy": 0.979092001914978,
"step": 433
},
{
"epoch": 1.8547008547008548,
"grad_norm": 2.216299295425415,
"learning_rate": 1.1440605186337256e-06,
"loss": 0.0872,
"mean_token_accuracy": 0.9798073768615723,
"step": 434
},
{
"epoch": 1.858974358974359,
"grad_norm": 1.9769878387451172,
"learning_rate": 1.1357532342611006e-06,
"loss": 0.0833,
"mean_token_accuracy": 0.980143666267395,
"step": 435
},
{
"epoch": 1.8632478632478633,
"grad_norm": 2.341925621032715,
"learning_rate": 1.1276889703476789e-06,
"loss": 0.0833,
"mean_token_accuracy": 0.9784598350524902,
"step": 436
},
{
"epoch": 1.8675213675213675,
"grad_norm": 2.1460580825805664,
"learning_rate": 1.1198681759471524e-06,
"loss": 0.0896,
"mean_token_accuracy": 0.9792025089263916,
"step": 437
},
{
"epoch": 1.8717948717948718,
"grad_norm": 2.2955880165100098,
"learning_rate": 1.1122912865557579e-06,
"loss": 0.0887,
"mean_token_accuracy": 0.9784141182899475,
"step": 438
},
{
"epoch": 1.876068376068376,
"grad_norm": 1.7442694902420044,
"learning_rate": 1.1049587240880296e-06,
"loss": 0.0691,
"mean_token_accuracy": 0.9838666319847107,
"step": 439
},
{
"epoch": 1.8803418803418803,
"grad_norm": 2.3275156021118164,
"learning_rate": 1.0978708968533029e-06,
"loss": 0.0863,
"mean_token_accuracy": 0.9796561598777771,
"step": 440
},
{
"epoch": 1.8846153846153846,
"grad_norm": 2.238799810409546,
"learning_rate": 1.09102819953298e-06,
"loss": 0.0989,
"mean_token_accuracy": 0.9774456024169922,
"step": 441
},
{
"epoch": 1.8888888888888888,
"grad_norm": 2.3636116981506348,
"learning_rate": 1.0844310131585498e-06,
"loss": 0.0878,
"mean_token_accuracy": 0.9802519083023071,
"step": 442
},
{
"epoch": 1.893162393162393,
"grad_norm": 2.082477331161499,
"learning_rate": 1.0780797050903713e-06,
"loss": 0.0819,
"mean_token_accuracy": 0.9814638495445251,
"step": 443
},
{
"epoch": 1.8974358974358974,
"grad_norm": 2.0771663188934326,
"learning_rate": 1.07197462899722e-06,
"loss": 0.0852,
"mean_token_accuracy": 0.9814097881317139,
"step": 444
},
{
"epoch": 1.9017094017094016,
"grad_norm": 2.476078510284424,
"learning_rate": 1.066116124836589e-06,
"loss": 0.0937,
"mean_token_accuracy": 0.9798304438591003,
"step": 445
},
{
"epoch": 1.9059829059829059,
"grad_norm": 1.598405122756958,
"learning_rate": 1.0605045188357633e-06,
"loss": 0.0712,
"mean_token_accuracy": 0.9831634759902954,
"step": 446
},
{
"epoch": 1.9102564102564101,
"grad_norm": 2.015641450881958,
"learning_rate": 1.0551401234736524e-06,
"loss": 0.0797,
"mean_token_accuracy": 0.9820832014083862,
"step": 447
},
{
"epoch": 1.9145299145299144,
"grad_norm": 2.4531426429748535,
"learning_rate": 1.0500232374633884e-06,
"loss": 0.0961,
"mean_token_accuracy": 0.976710319519043,
"step": 448
},
{
"epoch": 1.9188034188034186,
"grad_norm": 1.7264620065689087,
"learning_rate": 1.0451541457356949e-06,
"loss": 0.0774,
"mean_token_accuracy": 0.9826287627220154,
"step": 449
},
{
"epoch": 1.9230769230769231,
"grad_norm": 1.9337164163589478,
"learning_rate": 1.0405331194230197e-06,
"loss": 0.08,
"mean_token_accuracy": 0.9816673994064331,
"step": 450
},
{
"epoch": 1.9273504273504274,
"grad_norm": 2.3220560550689697,
"learning_rate": 1.036160415844436e-06,
"loss": 0.087,
"mean_token_accuracy": 0.9797703623771667,
"step": 451
},
{
"epoch": 1.9316239316239316,
"grad_norm": 2.0156548023223877,
"learning_rate": 1.032036278491317e-06,
"loss": 0.0827,
"mean_token_accuracy": 0.980651319026947,
"step": 452
},
{
"epoch": 1.935897435897436,
"grad_norm": 2.087442636489868,
"learning_rate": 1.0281609370137724e-06,
"loss": 0.0854,
"mean_token_accuracy": 0.9798422455787659,
"step": 453
},
{
"epoch": 1.9401709401709402,
"grad_norm": 2.0209758281707764,
"learning_rate": 1.0245346072078642e-06,
"loss": 0.0786,
"mean_token_accuracy": 0.9813392162322998,
"step": 454
},
{
"epoch": 1.9444444444444444,
"grad_norm": 2.3285915851593018,
"learning_rate": 1.0211574910035892e-06,
"loss": 0.0864,
"mean_token_accuracy": 0.980617880821228,
"step": 455
},
{
"epoch": 1.9487179487179487,
"grad_norm": 2.3946189880371094,
"learning_rate": 1.018029776453635e-06,
"loss": 0.0888,
"mean_token_accuracy": 0.9790608286857605,
"step": 456
},
{
"epoch": 1.952991452991453,
"grad_norm": 2.0309736728668213,
"learning_rate": 1.0151516377229062e-06,
"loss": 0.085,
"mean_token_accuracy": 0.9819555878639221,
"step": 457
},
{
"epoch": 1.9572649572649574,
"grad_norm": 2.4477789402008057,
"learning_rate": 1.0125232350788295e-06,
"loss": 0.0968,
"mean_token_accuracy": 0.9767808318138123,
"step": 458
},
{
"epoch": 1.9615384615384617,
"grad_norm": 2.2161478996276855,
"learning_rate": 1.0101447148824265e-06,
"loss": 0.0871,
"mean_token_accuracy": 0.9767209887504578,
"step": 459
},
{
"epoch": 1.965811965811966,
"grad_norm": 2.2850284576416016,
"learning_rate": 1.0080162095801663e-06,
"loss": 0.082,
"mean_token_accuracy": 0.9784152507781982,
"step": 460
},
{
"epoch": 1.9700854700854702,
"grad_norm": 1.9745234251022339,
"learning_rate": 1.0061378376965871e-06,
"loss": 0.0808,
"mean_token_accuracy": 0.9789802432060242,
"step": 461
},
{
"epoch": 1.9743589743589745,
"grad_norm": 1.8992419242858887,
"learning_rate": 1.0045097038276994e-06,
"loss": 0.0783,
"mean_token_accuracy": 0.9831842184066772,
"step": 462
},
{
"epoch": 1.9786324786324787,
"grad_norm": 2.1573259830474854,
"learning_rate": 1.0031318986351587e-06,
"loss": 0.0783,
"mean_token_accuracy": 0.9816881418228149,
"step": 463
},
{
"epoch": 1.982905982905983,
"grad_norm": 2.2348172664642334,
"learning_rate": 1.0020044988412197e-06,
"loss": 0.0893,
"mean_token_accuracy": 0.9791444540023804,
"step": 464
},
{
"epoch": 1.9871794871794872,
"grad_norm": 2.1570818424224854,
"learning_rate": 1.0011275672244635e-06,
"loss": 0.0886,
"mean_token_accuracy": 0.9798603653907776,
"step": 465
},
{
"epoch": 1.9914529914529915,
"grad_norm": 2.324618101119995,
"learning_rate": 1.0005011526162988e-06,
"loss": 0.0888,
"mean_token_accuracy": 0.9777023196220398,
"step": 466
},
{
"epoch": 1.9957264957264957,
"grad_norm": 1.7417622804641724,
"learning_rate": 1.0001252898982478e-06,
"loss": 0.0776,
"mean_token_accuracy": 0.9812348484992981,
"step": 467
},
{
"epoch": 2.0,
"grad_norm": 1.4360781908035278,
"learning_rate": 1.0000000000000002e-06,
"loss": 0.0643,
"mean_token_accuracy": 0.9865781664848328,
"step": 468
},
{
"epoch": 2.0,
"step": 468,
"total_flos": 3.6134539308407194e+17,
"train_loss": 0.15647654232178998,
"train_runtime": 1906.4028,
"train_samples_per_second": 7.84,
"train_steps_per_second": 0.245
}
],
"logging_steps": 1,
"max_steps": 468,
"num_input_tokens_seen": 0,
"num_train_epochs": 2,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": false,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.6134539308407194e+17,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}