Files
titulm-llama-3.2-1b-v1.0/trainer_state.json
ModelHub XC be30a6cde6 初始化项目,由ModelHub XC社区提供模型
Model: hishab/titulm-llama-3.2-1b-v1.0
Source: Original Platform
2026-05-19 11:52:07 +08:00

17606 lines
428 KiB
JSON

{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9999003686360466,
"eval_steps": 500,
"global_step": 2509,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0003985254558134901,
"grad_norm": 1.1682030229557891,
"learning_rate": 1.5384615384615387e-06,
"loss": 1.298,
"step": 1
},
{
"epoch": 0.0007970509116269802,
"grad_norm": 1.1450089305489202,
"learning_rate": 3.0769230769230774e-06,
"loss": 1.2606,
"step": 2
},
{
"epoch": 0.0011955763674404703,
"grad_norm": 1.1741081208500113,
"learning_rate": 4.615384615384616e-06,
"loss": 1.317,
"step": 3
},
{
"epoch": 0.0015941018232539603,
"grad_norm": 0.9416899998464173,
"learning_rate": 6.153846153846155e-06,
"loss": 1.3244,
"step": 4
},
{
"epoch": 0.0019926272790674504,
"grad_norm": 1.17272634152426,
"learning_rate": 7.692307692307694e-06,
"loss": 1.2691,
"step": 5
},
{
"epoch": 0.0023911527348809405,
"grad_norm": 0.8938096517547656,
"learning_rate": 9.230769230769232e-06,
"loss": 1.3528,
"step": 6
},
{
"epoch": 0.0027896781906944306,
"grad_norm": 1.4781168610568196,
"learning_rate": 1.076923076923077e-05,
"loss": 1.3365,
"step": 7
},
{
"epoch": 0.0031882036465079207,
"grad_norm": 1.323556020239157,
"learning_rate": 1.230769230769231e-05,
"loss": 1.3375,
"step": 8
},
{
"epoch": 0.0035867291023214108,
"grad_norm": 1.9185267488446602,
"learning_rate": 1.3846153846153847e-05,
"loss": 1.2539,
"step": 9
},
{
"epoch": 0.003985254558134901,
"grad_norm": 1.674769393300418,
"learning_rate": 1.5384615384615387e-05,
"loss": 1.2846,
"step": 10
},
{
"epoch": 0.004383780013948391,
"grad_norm": 1.509254464009656,
"learning_rate": 1.6923076923076924e-05,
"loss": 1.2827,
"step": 11
},
{
"epoch": 0.004782305469761881,
"grad_norm": 1.3888982819984244,
"learning_rate": 1.8461538461538465e-05,
"loss": 1.323,
"step": 12
},
{
"epoch": 0.0051808309255753715,
"grad_norm": 1.5318573252337477,
"learning_rate": 2e-05,
"loss": 1.2514,
"step": 13
},
{
"epoch": 0.005579356381388861,
"grad_norm": 1.7716074190442104,
"learning_rate": 2.153846153846154e-05,
"loss": 1.224,
"step": 14
},
{
"epoch": 0.005977881837202352,
"grad_norm": 1.5774334632902784,
"learning_rate": 2.3076923076923076e-05,
"loss": 1.2513,
"step": 15
},
{
"epoch": 0.006376407293015841,
"grad_norm": 2.079535848411662,
"learning_rate": 2.461538461538462e-05,
"loss": 1.2324,
"step": 16
},
{
"epoch": 0.006774932748829332,
"grad_norm": 1.6224827783116045,
"learning_rate": 2.6153846153846157e-05,
"loss": 1.2437,
"step": 17
},
{
"epoch": 0.0071734582046428215,
"grad_norm": 1.731628237386042,
"learning_rate": 2.7692307692307694e-05,
"loss": 1.1941,
"step": 18
},
{
"epoch": 0.007571983660456312,
"grad_norm": 1.6486789158728468,
"learning_rate": 2.923076923076923e-05,
"loss": 1.2157,
"step": 19
},
{
"epoch": 0.007970509116269802,
"grad_norm": 1.5878767170549857,
"learning_rate": 3.0769230769230774e-05,
"loss": 1.2282,
"step": 20
},
{
"epoch": 0.008369034572083291,
"grad_norm": 1.769036375459327,
"learning_rate": 3.230769230769231e-05,
"loss": 1.2276,
"step": 21
},
{
"epoch": 0.008767560027896783,
"grad_norm": 1.6262212305434318,
"learning_rate": 3.384615384615385e-05,
"loss": 1.1966,
"step": 22
},
{
"epoch": 0.009166085483710272,
"grad_norm": 1.6178408127403725,
"learning_rate": 3.538461538461539e-05,
"loss": 1.1681,
"step": 23
},
{
"epoch": 0.009564610939523762,
"grad_norm": 1.8576583132326376,
"learning_rate": 3.692307692307693e-05,
"loss": 1.1733,
"step": 24
},
{
"epoch": 0.009963136395337252,
"grad_norm": 2.10735796807257,
"learning_rate": 3.846153846153846e-05,
"loss": 1.1781,
"step": 25
},
{
"epoch": 0.010361661851150743,
"grad_norm": 2.061441058129094,
"learning_rate": 4e-05,
"loss": 1.1766,
"step": 26
},
{
"epoch": 0.010760187306964233,
"grad_norm": 1.689955130193812,
"learning_rate": 3.9999983991661895e-05,
"loss": 1.2193,
"step": 27
},
{
"epoch": 0.011158712762777722,
"grad_norm": 2.089801173287961,
"learning_rate": 3.99999359666732e-05,
"loss": 1.1864,
"step": 28
},
{
"epoch": 0.011557238218591212,
"grad_norm": 2.1810170181408584,
"learning_rate": 3.999985592511079e-05,
"loss": 1.1981,
"step": 29
},
{
"epoch": 0.011955763674404703,
"grad_norm": 1.76919939388419,
"learning_rate": 3.999974386710281e-05,
"loss": 1.0961,
"step": 30
},
{
"epoch": 0.012354289130218193,
"grad_norm": 1.7129497533016933,
"learning_rate": 3.999959979282864e-05,
"loss": 1.1348,
"step": 31
},
{
"epoch": 0.012752814586031683,
"grad_norm": 1.23643910459474,
"learning_rate": 3.999942370251891e-05,
"loss": 1.1678,
"step": 32
},
{
"epoch": 0.013151340041845172,
"grad_norm": 1.9739746593085052,
"learning_rate": 3.999921559645554e-05,
"loss": 1.1677,
"step": 33
},
{
"epoch": 0.013549865497658664,
"grad_norm": 1.2119052688289602,
"learning_rate": 3.9998975474971644e-05,
"loss": 1.1073,
"step": 34
},
{
"epoch": 0.013948390953472153,
"grad_norm": 2.2576004723914758,
"learning_rate": 3.999870333845162e-05,
"loss": 1.1745,
"step": 35
},
{
"epoch": 0.014346916409285643,
"grad_norm": 1.0352975506141129,
"learning_rate": 3.9998399187331125e-05,
"loss": 1.1283,
"step": 36
},
{
"epoch": 0.014745441865099133,
"grad_norm": 2.0996501651362243,
"learning_rate": 3.999806302209705e-05,
"loss": 1.1212,
"step": 37
},
{
"epoch": 0.015143967320912624,
"grad_norm": 1.225013558748945,
"learning_rate": 3.9997694843287546e-05,
"loss": 1.1209,
"step": 38
},
{
"epoch": 0.015542492776726114,
"grad_norm": 1.764970024494282,
"learning_rate": 3.999729465149199e-05,
"loss": 1.1445,
"step": 39
},
{
"epoch": 0.015941018232539603,
"grad_norm": 1.4826459735644733,
"learning_rate": 3.999686244735103e-05,
"loss": 1.1341,
"step": 40
},
{
"epoch": 0.016339543688353095,
"grad_norm": 1.5561072965892055,
"learning_rate": 3.9996398231556565e-05,
"loss": 1.1582,
"step": 41
},
{
"epoch": 0.016738069144166583,
"grad_norm": 1.659587994848137,
"learning_rate": 3.99959020048517e-05,
"loss": 1.0567,
"step": 42
},
{
"epoch": 0.017136594599980074,
"grad_norm": 1.4377552352395278,
"learning_rate": 3.999537376803085e-05,
"loss": 1.1493,
"step": 43
},
{
"epoch": 0.017535120055793565,
"grad_norm": 1.4553863448092164,
"learning_rate": 3.99948135219396e-05,
"loss": 1.135,
"step": 44
},
{
"epoch": 0.017933645511607053,
"grad_norm": 1.2714080589572554,
"learning_rate": 3.9994221267474826e-05,
"loss": 1.1033,
"step": 45
},
{
"epoch": 0.018332170967420545,
"grad_norm": 1.4352549542350836,
"learning_rate": 3.9993597005584625e-05,
"loss": 1.1441,
"step": 46
},
{
"epoch": 0.018730696423234033,
"grad_norm": 1.355132046439583,
"learning_rate": 3.9992940737268344e-05,
"loss": 1.1654,
"step": 47
},
{
"epoch": 0.019129221879047524,
"grad_norm": 1.2739353234997868,
"learning_rate": 3.9992252463576547e-05,
"loss": 1.0932,
"step": 48
},
{
"epoch": 0.019527747334861015,
"grad_norm": 1.5599129273160852,
"learning_rate": 3.9991532185611054e-05,
"loss": 1.1289,
"step": 49
},
{
"epoch": 0.019926272790674503,
"grad_norm": 1.2761680959247894,
"learning_rate": 3.9990779904524915e-05,
"loss": 1.1008,
"step": 50
},
{
"epoch": 0.020324798246487995,
"grad_norm": 1.527368858852383,
"learning_rate": 3.998999562152239e-05,
"loss": 1.0787,
"step": 51
},
{
"epoch": 0.020723323702301486,
"grad_norm": 1.3999003605132498,
"learning_rate": 3.9989179337859e-05,
"loss": 1.0898,
"step": 52
},
{
"epoch": 0.021121849158114974,
"grad_norm": 1.3815814063131917,
"learning_rate": 3.998833105484148e-05,
"loss": 1.1101,
"step": 53
},
{
"epoch": 0.021520374613928465,
"grad_norm": 1.2557770729577848,
"learning_rate": 3.998745077382779e-05,
"loss": 1.069,
"step": 54
},
{
"epoch": 0.021918900069741953,
"grad_norm": 1.3393579953921733,
"learning_rate": 3.99865384962271e-05,
"loss": 1.0726,
"step": 55
},
{
"epoch": 0.022317425525555445,
"grad_norm": 1.8263793843329788,
"learning_rate": 3.998559422349983e-05,
"loss": 1.0557,
"step": 56
},
{
"epoch": 0.022715950981368936,
"grad_norm": 1.0325515310273663,
"learning_rate": 3.99846179571576e-05,
"loss": 1.0813,
"step": 57
},
{
"epoch": 0.023114476437182424,
"grad_norm": 1.660896527304962,
"learning_rate": 3.998360969876325e-05,
"loss": 1.0583,
"step": 58
},
{
"epoch": 0.023513001892995915,
"grad_norm": 0.9235555725660893,
"learning_rate": 3.998256944993083e-05,
"loss": 1.0914,
"step": 59
},
{
"epoch": 0.023911527348809407,
"grad_norm": 1.5716895827106996,
"learning_rate": 3.99814972123256e-05,
"loss": 1.0919,
"step": 60
},
{
"epoch": 0.024310052804622895,
"grad_norm": 1.1583937200957837,
"learning_rate": 3.998039298766405e-05,
"loss": 1.0255,
"step": 61
},
{
"epoch": 0.024708578260436386,
"grad_norm": 1.7286427351895097,
"learning_rate": 3.9979256777713856e-05,
"loss": 1.0395,
"step": 62
},
{
"epoch": 0.025107103716249874,
"grad_norm": 1.1208870057484686,
"learning_rate": 3.9978088584293894e-05,
"loss": 1.0619,
"step": 63
},
{
"epoch": 0.025505629172063365,
"grad_norm": 1.302369859146436,
"learning_rate": 3.997688840927425e-05,
"loss": 1.0526,
"step": 64
},
{
"epoch": 0.025904154627876857,
"grad_norm": 1.4174940189185974,
"learning_rate": 3.997565625457621e-05,
"loss": 1.0629,
"step": 65
},
{
"epoch": 0.026302680083690345,
"grad_norm": 1.232886411420502,
"learning_rate": 3.9974392122172244e-05,
"loss": 1.0289,
"step": 66
},
{
"epoch": 0.026701205539503836,
"grad_norm": 1.3590067350773096,
"learning_rate": 3.9973096014086017e-05,
"loss": 1.0471,
"step": 67
},
{
"epoch": 0.027099730995317328,
"grad_norm": 1.1328281166100345,
"learning_rate": 3.9971767932392386e-05,
"loss": 1.0373,
"step": 68
},
{
"epoch": 0.027498256451130815,
"grad_norm": 1.398126297704576,
"learning_rate": 3.997040787921739e-05,
"loss": 1.01,
"step": 69
},
{
"epoch": 0.027896781906944307,
"grad_norm": 1.121173880074476,
"learning_rate": 3.996901585673824e-05,
"loss": 1.0509,
"step": 70
},
{
"epoch": 0.028295307362757795,
"grad_norm": 1.1562605633658927,
"learning_rate": 3.996759186718334e-05,
"loss": 1.0394,
"step": 71
},
{
"epoch": 0.028693832818571286,
"grad_norm": 1.563700864160097,
"learning_rate": 3.996613591283226e-05,
"loss": 1.0338,
"step": 72
},
{
"epoch": 0.029092358274384778,
"grad_norm": 1.258313908870013,
"learning_rate": 3.9964647996015745e-05,
"loss": 1.0402,
"step": 73
},
{
"epoch": 0.029490883730198265,
"grad_norm": 1.191082490937846,
"learning_rate": 3.996312811911569e-05,
"loss": 1.0405,
"step": 74
},
{
"epoch": 0.029889409186011757,
"grad_norm": 1.1466323991622203,
"learning_rate": 3.996157628456518e-05,
"loss": 1.0211,
"step": 75
},
{
"epoch": 0.030287934641825248,
"grad_norm": 1.528035955341438,
"learning_rate": 3.9959992494848433e-05,
"loss": 1.0462,
"step": 76
},
{
"epoch": 0.030686460097638736,
"grad_norm": 1.324417703714102,
"learning_rate": 3.995837675250084e-05,
"loss": 1.0842,
"step": 77
},
{
"epoch": 0.031084985553452227,
"grad_norm": 0.893828852913908,
"learning_rate": 3.995672906010893e-05,
"loss": 1.0135,
"step": 78
},
{
"epoch": 0.03148351100926572,
"grad_norm": 1.5886985675595782,
"learning_rate": 3.9955049420310386e-05,
"loss": 0.985,
"step": 79
},
{
"epoch": 0.03188203646507921,
"grad_norm": 0.8842933361031705,
"learning_rate": 3.995333783579404e-05,
"loss": 0.9826,
"step": 80
},
{
"epoch": 0.032280561920892695,
"grad_norm": 1.2312713203427161,
"learning_rate": 3.995159430929984e-05,
"loss": 0.9933,
"step": 81
},
{
"epoch": 0.03267908737670619,
"grad_norm": 0.968587201770918,
"learning_rate": 3.99498188436189e-05,
"loss": 1.0305,
"step": 82
},
{
"epoch": 0.03307761283251968,
"grad_norm": 1.354446902187372,
"learning_rate": 3.994801144159343e-05,
"loss": 1.0015,
"step": 83
},
{
"epoch": 0.033476138288333165,
"grad_norm": 0.9815177511320659,
"learning_rate": 3.9946172106116786e-05,
"loss": 1.0419,
"step": 84
},
{
"epoch": 0.03387466374414666,
"grad_norm": 1.4163104282934211,
"learning_rate": 3.994430084013345e-05,
"loss": 1.0693,
"step": 85
},
{
"epoch": 0.03427318919996015,
"grad_norm": 0.9575099047174793,
"learning_rate": 3.994239764663898e-05,
"loss": 1.0352,
"step": 86
},
{
"epoch": 0.034671714655773636,
"grad_norm": 1.3071690946757393,
"learning_rate": 3.99404625286801e-05,
"loss": 0.9971,
"step": 87
},
{
"epoch": 0.03507024011158713,
"grad_norm": 1.0094650013129123,
"learning_rate": 3.993849548935459e-05,
"loss": 1.0347,
"step": 88
},
{
"epoch": 0.03546876556740062,
"grad_norm": 1.2105057500431875,
"learning_rate": 3.993649653181138e-05,
"loss": 1.0249,
"step": 89
},
{
"epoch": 0.03586729102321411,
"grad_norm": 1.1494038215569387,
"learning_rate": 3.9934465659250445e-05,
"loss": 1.07,
"step": 90
},
{
"epoch": 0.036265816479027595,
"grad_norm": 1.3619795030427553,
"learning_rate": 3.993240287492288e-05,
"loss": 0.9727,
"step": 91
},
{
"epoch": 0.03666434193484109,
"grad_norm": 1.0095282991348078,
"learning_rate": 3.993030818213087e-05,
"loss": 1.0542,
"step": 92
},
{
"epoch": 0.03706286739065458,
"grad_norm": 1.233700566815371,
"learning_rate": 3.992818158422766e-05,
"loss": 1.0034,
"step": 93
},
{
"epoch": 0.037461392846468065,
"grad_norm": 1.0447313763347152,
"learning_rate": 3.992602308461758e-05,
"loss": 1.0058,
"step": 94
},
{
"epoch": 0.03785991830228156,
"grad_norm": 1.0696169481085038,
"learning_rate": 3.992383268675603e-05,
"loss": 1.0478,
"step": 95
},
{
"epoch": 0.03825844375809505,
"grad_norm": 1.3030274633669099,
"learning_rate": 3.9921610394149484e-05,
"loss": 0.9885,
"step": 96
},
{
"epoch": 0.038656969213908536,
"grad_norm": 0.9547168721038842,
"learning_rate": 3.991935621035545e-05,
"loss": 1.0126,
"step": 97
},
{
"epoch": 0.03905549466972203,
"grad_norm": 1.0282165364592126,
"learning_rate": 3.9917070138982496e-05,
"loss": 1.0352,
"step": 98
},
{
"epoch": 0.03945402012553552,
"grad_norm": 1.4052288957523145,
"learning_rate": 3.991475218369026e-05,
"loss": 0.9908,
"step": 99
},
{
"epoch": 0.03985254558134901,
"grad_norm": 0.883707027247818,
"learning_rate": 3.99124023481894e-05,
"loss": 1.0155,
"step": 100
},
{
"epoch": 0.0402510710371625,
"grad_norm": 1.0103744787259499,
"learning_rate": 3.991002063624159e-05,
"loss": 1.0398,
"step": 101
},
{
"epoch": 0.04064959649297599,
"grad_norm": 1.3196267795391554,
"learning_rate": 3.9907607051659594e-05,
"loss": 0.9986,
"step": 102
},
{
"epoch": 0.04104812194878948,
"grad_norm": 0.9068591396167901,
"learning_rate": 3.990516159830712e-05,
"loss": 0.988,
"step": 103
},
{
"epoch": 0.04144664740460297,
"grad_norm": 1.3332646337147993,
"learning_rate": 3.9902684280098965e-05,
"loss": 1.0022,
"step": 104
},
{
"epoch": 0.04184517286041646,
"grad_norm": 1.0383165114992166,
"learning_rate": 3.990017510100088e-05,
"loss": 0.9767,
"step": 105
},
{
"epoch": 0.04224369831622995,
"grad_norm": 1.0850955219468192,
"learning_rate": 3.9897634065029656e-05,
"loss": 1.0166,
"step": 106
},
{
"epoch": 0.042642223772043436,
"grad_norm": 1.0137112717519785,
"learning_rate": 3.989506117625306e-05,
"loss": 1.0039,
"step": 107
},
{
"epoch": 0.04304074922785693,
"grad_norm": 1.3161286100477132,
"learning_rate": 3.989245643878987e-05,
"loss": 1.031,
"step": 108
},
{
"epoch": 0.04343927468367042,
"grad_norm": 0.9789302387291591,
"learning_rate": 3.988981985680983e-05,
"loss": 1.0007,
"step": 109
},
{
"epoch": 0.04383780013948391,
"grad_norm": 1.367535024910473,
"learning_rate": 3.9887151434533674e-05,
"loss": 1.018,
"step": 110
},
{
"epoch": 0.0442363255952974,
"grad_norm": 0.7004934620329838,
"learning_rate": 3.988445117623311e-05,
"loss": 0.9821,
"step": 111
},
{
"epoch": 0.04463485105111089,
"grad_norm": 1.158874430209204,
"learning_rate": 3.9881719086230786e-05,
"loss": 0.9865,
"step": 112
},
{
"epoch": 0.04503337650692438,
"grad_norm": 1.152431912909897,
"learning_rate": 3.9878955168900334e-05,
"loss": 0.9645,
"step": 113
},
{
"epoch": 0.04543190196273787,
"grad_norm": 1.1079205102947556,
"learning_rate": 3.987615942866632e-05,
"loss": 0.9582,
"step": 114
},
{
"epoch": 0.04583042741855136,
"grad_norm": 1.1791654374723093,
"learning_rate": 3.987333187000427e-05,
"loss": 1.0214,
"step": 115
},
{
"epoch": 0.04622895287436485,
"grad_norm": 0.936906534851351,
"learning_rate": 3.9870472497440624e-05,
"loss": 1.0127,
"step": 116
},
{
"epoch": 0.04662747833017834,
"grad_norm": 1.092836008794883,
"learning_rate": 3.986758131555278e-05,
"loss": 0.9664,
"step": 117
},
{
"epoch": 0.04702600378599183,
"grad_norm": 1.094413912535255,
"learning_rate": 3.986465832896902e-05,
"loss": 0.9757,
"step": 118
},
{
"epoch": 0.04742452924180532,
"grad_norm": 1.0623495271819532,
"learning_rate": 3.986170354236856e-05,
"loss": 0.9984,
"step": 119
},
{
"epoch": 0.047823054697618814,
"grad_norm": 0.854179583596702,
"learning_rate": 3.985871696048154e-05,
"loss": 0.9864,
"step": 120
},
{
"epoch": 0.0482215801534323,
"grad_norm": 1.0432520232855218,
"learning_rate": 3.9855698588088965e-05,
"loss": 0.9548,
"step": 121
},
{
"epoch": 0.04862010560924579,
"grad_norm": 1.0755622132654334,
"learning_rate": 3.9852648430022754e-05,
"loss": 0.9485,
"step": 122
},
{
"epoch": 0.04901863106505928,
"grad_norm": 1.2217694552157112,
"learning_rate": 3.984956649116571e-05,
"loss": 0.9855,
"step": 123
},
{
"epoch": 0.04941715652087277,
"grad_norm": 1.0275276231271884,
"learning_rate": 3.984645277645149e-05,
"loss": 0.9964,
"step": 124
},
{
"epoch": 0.04981568197668626,
"grad_norm": 1.1178940979524548,
"learning_rate": 3.984330729086464e-05,
"loss": 0.9497,
"step": 125
},
{
"epoch": 0.05021420743249975,
"grad_norm": 0.741923762221831,
"learning_rate": 3.984013003944056e-05,
"loss": 1.0072,
"step": 126
},
{
"epoch": 0.05061273288831324,
"grad_norm": 0.8682737579433879,
"learning_rate": 3.983692102726551e-05,
"loss": 1.0082,
"step": 127
},
{
"epoch": 0.05101125834412673,
"grad_norm": 1.0434473812056535,
"learning_rate": 3.983368025947657e-05,
"loss": 0.9831,
"step": 128
},
{
"epoch": 0.05140978379994022,
"grad_norm": 1.022692118220617,
"learning_rate": 3.983040774126169e-05,
"loss": 0.9566,
"step": 129
},
{
"epoch": 0.051808309255753714,
"grad_norm": 1.2484490098325738,
"learning_rate": 3.9827103477859605e-05,
"loss": 1.0005,
"step": 130
},
{
"epoch": 0.0522068347115672,
"grad_norm": 0.8271462851970588,
"learning_rate": 3.9823767474559905e-05,
"loss": 0.968,
"step": 131
},
{
"epoch": 0.05260536016738069,
"grad_norm": 0.8519476486723382,
"learning_rate": 3.982039973670298e-05,
"loss": 0.9617,
"step": 132
},
{
"epoch": 0.053003885623194184,
"grad_norm": 0.8333279737618872,
"learning_rate": 3.9817000269680005e-05,
"loss": 0.9757,
"step": 133
},
{
"epoch": 0.05340241107900767,
"grad_norm": 0.8703944410797784,
"learning_rate": 3.981356907893298e-05,
"loss": 0.9917,
"step": 134
},
{
"epoch": 0.05380093653482116,
"grad_norm": 0.9994780910035236,
"learning_rate": 3.981010616995465e-05,
"loss": 0.9603,
"step": 135
},
{
"epoch": 0.054199461990634655,
"grad_norm": 1.1123731475641294,
"learning_rate": 3.980661154828857e-05,
"loss": 0.9695,
"step": 136
},
{
"epoch": 0.05459798744644814,
"grad_norm": 0.9337508858933264,
"learning_rate": 3.980308521952905e-05,
"loss": 0.9786,
"step": 137
},
{
"epoch": 0.05499651290226163,
"grad_norm": 0.8773514301553659,
"learning_rate": 3.979952718932116e-05,
"loss": 0.9829,
"step": 138
},
{
"epoch": 0.05539503835807512,
"grad_norm": 0.8259379275752252,
"learning_rate": 3.97959374633607e-05,
"loss": 0.9731,
"step": 139
},
{
"epoch": 0.055793563813888614,
"grad_norm": 0.9481177250720214,
"learning_rate": 3.979231604739423e-05,
"loss": 1.0004,
"step": 140
},
{
"epoch": 0.0561920892697021,
"grad_norm": 1.0333391418969482,
"learning_rate": 3.978866294721904e-05,
"loss": 0.9685,
"step": 141
},
{
"epoch": 0.05659061472551559,
"grad_norm": 0.9955889948584824,
"learning_rate": 3.9784978168683134e-05,
"loss": 0.9716,
"step": 142
},
{
"epoch": 0.056989140181329084,
"grad_norm": 1.0603086583420307,
"learning_rate": 3.978126171768523e-05,
"loss": 0.9801,
"step": 143
},
{
"epoch": 0.05738766563714257,
"grad_norm": 0.812587571522746,
"learning_rate": 3.977751360017474e-05,
"loss": 0.9595,
"step": 144
},
{
"epoch": 0.05778619109295606,
"grad_norm": 0.7781386777987177,
"learning_rate": 3.97737338221518e-05,
"loss": 1.0095,
"step": 145
},
{
"epoch": 0.058184716548769555,
"grad_norm": 0.9828802357688441,
"learning_rate": 3.976992238966719e-05,
"loss": 0.992,
"step": 146
},
{
"epoch": 0.05858324200458304,
"grad_norm": 0.9416827586556631,
"learning_rate": 3.976607930882238e-05,
"loss": 0.9628,
"step": 147
},
{
"epoch": 0.05898176746039653,
"grad_norm": 0.7650913970674944,
"learning_rate": 3.97622045857695e-05,
"loss": 0.9995,
"step": 148
},
{
"epoch": 0.059380292916210026,
"grad_norm": 0.6668203189771907,
"learning_rate": 3.9758298226711346e-05,
"loss": 0.9709,
"step": 149
},
{
"epoch": 0.059778818372023514,
"grad_norm": 0.9120833321517047,
"learning_rate": 3.975436023790135e-05,
"loss": 0.9644,
"step": 150
},
{
"epoch": 0.060177343827837,
"grad_norm": 1.0907868368195024,
"learning_rate": 3.975039062564357e-05,
"loss": 0.9628,
"step": 151
},
{
"epoch": 0.060575869283650496,
"grad_norm": 0.9368612099613929,
"learning_rate": 3.9746389396292705e-05,
"loss": 0.9937,
"step": 152
},
{
"epoch": 0.060974394739463984,
"grad_norm": 0.9737465093992717,
"learning_rate": 3.974235655625405e-05,
"loss": 0.961,
"step": 153
},
{
"epoch": 0.06137292019527747,
"grad_norm": 0.8996382068900802,
"learning_rate": 3.973829211198352e-05,
"loss": 0.9339,
"step": 154
},
{
"epoch": 0.06177144565109096,
"grad_norm": 0.9165314697100433,
"learning_rate": 3.973419606998761e-05,
"loss": 0.9568,
"step": 155
},
{
"epoch": 0.062169971106904455,
"grad_norm": 0.9274654639084001,
"learning_rate": 3.9730068436823395e-05,
"loss": 0.9389,
"step": 156
},
{
"epoch": 0.06256849656271794,
"grad_norm": 0.8441046935557636,
"learning_rate": 3.9725909219098546e-05,
"loss": 0.9388,
"step": 157
},
{
"epoch": 0.06296702201853144,
"grad_norm": 0.9902084616052694,
"learning_rate": 3.972171842347127e-05,
"loss": 0.9596,
"step": 158
},
{
"epoch": 0.06336554747434492,
"grad_norm": 1.1115069818338272,
"learning_rate": 3.9717496056650325e-05,
"loss": 0.9421,
"step": 159
},
{
"epoch": 0.06376407293015841,
"grad_norm": 0.9808461355374265,
"learning_rate": 3.9713242125395035e-05,
"loss": 0.9549,
"step": 160
},
{
"epoch": 0.06416259838597191,
"grad_norm": 0.6838984370781541,
"learning_rate": 3.970895663651523e-05,
"loss": 0.9577,
"step": 161
},
{
"epoch": 0.06456112384178539,
"grad_norm": 0.5849603441312805,
"learning_rate": 3.970463959687127e-05,
"loss": 0.9391,
"step": 162
},
{
"epoch": 0.06495964929759888,
"grad_norm": 0.8012305866704266,
"learning_rate": 3.9700291013374005e-05,
"loss": 0.9749,
"step": 163
},
{
"epoch": 0.06535817475341238,
"grad_norm": 0.9116141961043895,
"learning_rate": 3.969591089298481e-05,
"loss": 0.9734,
"step": 164
},
{
"epoch": 0.06575670020922586,
"grad_norm": 0.7666536547751186,
"learning_rate": 3.9691499242715524e-05,
"loss": 0.9679,
"step": 165
},
{
"epoch": 0.06615522566503935,
"grad_norm": 0.5587510714841003,
"learning_rate": 3.968705606962847e-05,
"loss": 0.9581,
"step": 166
},
{
"epoch": 0.06655375112085285,
"grad_norm": 0.5276592494284221,
"learning_rate": 3.9682581380836415e-05,
"loss": 0.9171,
"step": 167
},
{
"epoch": 0.06695227657666633,
"grad_norm": 0.7394645356756339,
"learning_rate": 3.967807518350261e-05,
"loss": 0.9612,
"step": 168
},
{
"epoch": 0.06735080203247983,
"grad_norm": 1.1007193079182445,
"learning_rate": 3.967353748484071e-05,
"loss": 0.9118,
"step": 169
},
{
"epoch": 0.06774932748829332,
"grad_norm": 1.0581797805010837,
"learning_rate": 3.966896829211483e-05,
"loss": 0.9641,
"step": 170
},
{
"epoch": 0.0681478529441068,
"grad_norm": 0.8757602622657974,
"learning_rate": 3.966436761263949e-05,
"loss": 0.9566,
"step": 171
},
{
"epoch": 0.0685463783999203,
"grad_norm": 0.8687270000650961,
"learning_rate": 3.96597354537796e-05,
"loss": 0.9701,
"step": 172
},
{
"epoch": 0.06894490385573379,
"grad_norm": 1.0166656418615307,
"learning_rate": 3.965507182295049e-05,
"loss": 0.9564,
"step": 173
},
{
"epoch": 0.06934342931154727,
"grad_norm": 0.8215033487256318,
"learning_rate": 3.965037672761785e-05,
"loss": 1.0189,
"step": 174
},
{
"epoch": 0.06974195476736077,
"grad_norm": 0.7260355443552792,
"learning_rate": 3.964565017529775e-05,
"loss": 0.9431,
"step": 175
},
{
"epoch": 0.07014048022317426,
"grad_norm": 0.7653437077317252,
"learning_rate": 3.9640892173556624e-05,
"loss": 0.947,
"step": 176
},
{
"epoch": 0.07053900567898774,
"grad_norm": 0.9116401355112523,
"learning_rate": 3.963610273001122e-05,
"loss": 0.9472,
"step": 177
},
{
"epoch": 0.07093753113480124,
"grad_norm": 0.9609189669126867,
"learning_rate": 3.963128185232866e-05,
"loss": 0.9427,
"step": 178
},
{
"epoch": 0.07133605659061472,
"grad_norm": 0.8565841157727021,
"learning_rate": 3.9626429548226364e-05,
"loss": 0.9477,
"step": 179
},
{
"epoch": 0.07173458204642821,
"grad_norm": 0.7814839364600451,
"learning_rate": 3.962154582547205e-05,
"loss": 0.9094,
"step": 180
},
{
"epoch": 0.07213310750224171,
"grad_norm": 0.7824911161278741,
"learning_rate": 3.961663069188377e-05,
"loss": 0.9647,
"step": 181
},
{
"epoch": 0.07253163295805519,
"grad_norm": 0.8488502117489565,
"learning_rate": 3.9611684155329825e-05,
"loss": 0.9634,
"step": 182
},
{
"epoch": 0.07293015841386868,
"grad_norm": 0.8663407155900105,
"learning_rate": 3.9606706223728796e-05,
"loss": 0.9522,
"step": 183
},
{
"epoch": 0.07332868386968218,
"grad_norm": 0.8427930838971712,
"learning_rate": 3.960169690504952e-05,
"loss": 0.957,
"step": 184
},
{
"epoch": 0.07372720932549566,
"grad_norm": 0.8728940813219989,
"learning_rate": 3.9596656207311096e-05,
"loss": 0.9103,
"step": 185
},
{
"epoch": 0.07412573478130915,
"grad_norm": 0.8964681349142457,
"learning_rate": 3.9591584138582835e-05,
"loss": 0.9783,
"step": 186
},
{
"epoch": 0.07452426023712265,
"grad_norm": 0.747475640936641,
"learning_rate": 3.958648070698428e-05,
"loss": 0.9343,
"step": 187
},
{
"epoch": 0.07492278569293613,
"grad_norm": 0.6081767246649388,
"learning_rate": 3.9581345920685176e-05,
"loss": 0.9426,
"step": 188
},
{
"epoch": 0.07532131114874963,
"grad_norm": 0.646327313636509,
"learning_rate": 3.957617978790546e-05,
"loss": 0.936,
"step": 189
},
{
"epoch": 0.07571983660456312,
"grad_norm": 0.5762067425821266,
"learning_rate": 3.9570982316915245e-05,
"loss": 0.9869,
"step": 190
},
{
"epoch": 0.0761183620603766,
"grad_norm": 0.5277633100224635,
"learning_rate": 3.956575351603484e-05,
"loss": 0.9247,
"step": 191
},
{
"epoch": 0.0765168875161901,
"grad_norm": 0.6079283681455546,
"learning_rate": 3.9560493393634665e-05,
"loss": 0.9003,
"step": 192
},
{
"epoch": 0.07691541297200359,
"grad_norm": 0.6485268072649816,
"learning_rate": 3.955520195813531e-05,
"loss": 0.9428,
"step": 193
},
{
"epoch": 0.07731393842781707,
"grad_norm": 0.6753541169437033,
"learning_rate": 3.954987921800749e-05,
"loss": 0.9546,
"step": 194
},
{
"epoch": 0.07771246388363057,
"grad_norm": 0.6320121035158947,
"learning_rate": 3.954452518177201e-05,
"loss": 0.9425,
"step": 195
},
{
"epoch": 0.07811098933944406,
"grad_norm": 0.8024382967580528,
"learning_rate": 3.953913985799982e-05,
"loss": 0.9575,
"step": 196
},
{
"epoch": 0.07850951479525754,
"grad_norm": 0.6451828329766384,
"learning_rate": 3.95337232553119e-05,
"loss": 0.9618,
"step": 197
},
{
"epoch": 0.07890804025107104,
"grad_norm": 0.5637480570882453,
"learning_rate": 3.952827538237934e-05,
"loss": 0.9436,
"step": 198
},
{
"epoch": 0.07930656570688453,
"grad_norm": 0.6287403860445728,
"learning_rate": 3.952279624792329e-05,
"loss": 0.9585,
"step": 199
},
{
"epoch": 0.07970509116269801,
"grad_norm": 0.6133071011985074,
"learning_rate": 3.9517285860714915e-05,
"loss": 0.9447,
"step": 200
},
{
"epoch": 0.08010361661851151,
"grad_norm": 0.5782665343325509,
"learning_rate": 3.951174422957545e-05,
"loss": 0.9381,
"step": 201
},
{
"epoch": 0.080502142074325,
"grad_norm": 0.5255985375741193,
"learning_rate": 3.950617136337611e-05,
"loss": 0.893,
"step": 202
},
{
"epoch": 0.08090066753013848,
"grad_norm": 0.5926087052436324,
"learning_rate": 3.950056727103813e-05,
"loss": 0.9226,
"step": 203
},
{
"epoch": 0.08129919298595198,
"grad_norm": 0.6283429524618049,
"learning_rate": 3.949493196153274e-05,
"loss": 0.9381,
"step": 204
},
{
"epoch": 0.08169771844176547,
"grad_norm": 0.6457268317630597,
"learning_rate": 3.948926544388112e-05,
"loss": 0.9097,
"step": 205
},
{
"epoch": 0.08209624389757895,
"grad_norm": 0.8396169584539872,
"learning_rate": 3.948356772715443e-05,
"loss": 0.9303,
"step": 206
},
{
"epoch": 0.08249476935339245,
"grad_norm": 0.9970461466822023,
"learning_rate": 3.9477838820473776e-05,
"loss": 0.9218,
"step": 207
},
{
"epoch": 0.08289329480920594,
"grad_norm": 1.1370242066432408,
"learning_rate": 3.9472078733010174e-05,
"loss": 0.9393,
"step": 208
},
{
"epoch": 0.08329182026501943,
"grad_norm": 0.8481740560416752,
"learning_rate": 3.946628747398457e-05,
"loss": 0.9539,
"step": 209
},
{
"epoch": 0.08369034572083292,
"grad_norm": 0.7749044455116462,
"learning_rate": 3.94604650526678e-05,
"loss": 0.9064,
"step": 210
},
{
"epoch": 0.0840888711766464,
"grad_norm": 0.8242769108366514,
"learning_rate": 3.9454611478380604e-05,
"loss": 0.9578,
"step": 211
},
{
"epoch": 0.0844873966324599,
"grad_norm": 0.7060014980899263,
"learning_rate": 3.944872676049358e-05,
"loss": 0.9586,
"step": 212
},
{
"epoch": 0.08488592208827339,
"grad_norm": 0.8645214673367116,
"learning_rate": 3.944281090842718e-05,
"loss": 0.919,
"step": 213
},
{
"epoch": 0.08528444754408687,
"grad_norm": 1.0934973623844684,
"learning_rate": 3.943686393165171e-05,
"loss": 0.955,
"step": 214
},
{
"epoch": 0.08568297299990037,
"grad_norm": 0.8673963340448777,
"learning_rate": 3.943088583968726e-05,
"loss": 0.9304,
"step": 215
},
{
"epoch": 0.08608149845571386,
"grad_norm": 0.835352668198479,
"learning_rate": 3.9424876642103805e-05,
"loss": 0.9615,
"step": 216
},
{
"epoch": 0.08648002391152734,
"grad_norm": 0.8611507271565368,
"learning_rate": 3.9418836348521045e-05,
"loss": 0.929,
"step": 217
},
{
"epoch": 0.08687854936734084,
"grad_norm": 0.8251142229076397,
"learning_rate": 3.941276496860849e-05,
"loss": 0.9642,
"step": 218
},
{
"epoch": 0.08727707482315433,
"grad_norm": 0.7930096914994095,
"learning_rate": 3.9406662512085416e-05,
"loss": 0.9622,
"step": 219
},
{
"epoch": 0.08767560027896781,
"grad_norm": 0.6629634789706741,
"learning_rate": 3.940052898872084e-05,
"loss": 0.9083,
"step": 220
},
{
"epoch": 0.08807412573478131,
"grad_norm": 0.6439473882747895,
"learning_rate": 3.93943644083335e-05,
"loss": 0.9155,
"step": 221
},
{
"epoch": 0.0884726511905948,
"grad_norm": 0.7838839076395734,
"learning_rate": 3.9388168780791883e-05,
"loss": 0.9127,
"step": 222
},
{
"epoch": 0.08887117664640828,
"grad_norm": 0.7675321153839495,
"learning_rate": 3.938194211601416e-05,
"loss": 0.9313,
"step": 223
},
{
"epoch": 0.08926970210222178,
"grad_norm": 0.670858178864275,
"learning_rate": 3.937568442396817e-05,
"loss": 0.9215,
"step": 224
},
{
"epoch": 0.08966822755803527,
"grad_norm": 0.5430100456071535,
"learning_rate": 3.936939571467145e-05,
"loss": 0.9215,
"step": 225
},
{
"epoch": 0.09006675301384875,
"grad_norm": 0.645122412385762,
"learning_rate": 3.9363075998191175e-05,
"loss": 0.9518,
"step": 226
},
{
"epoch": 0.09046527846966225,
"grad_norm": 0.7124302784985599,
"learning_rate": 3.935672528464416e-05,
"loss": 0.9472,
"step": 227
},
{
"epoch": 0.09086380392547574,
"grad_norm": 0.6944932728108557,
"learning_rate": 3.935034358419684e-05,
"loss": 0.9043,
"step": 228
},
{
"epoch": 0.09126232938128923,
"grad_norm": 0.7428731739366404,
"learning_rate": 3.934393090706527e-05,
"loss": 0.9276,
"step": 229
},
{
"epoch": 0.09166085483710272,
"grad_norm": 0.7237371542570604,
"learning_rate": 3.9337487263515065e-05,
"loss": 0.966,
"step": 230
},
{
"epoch": 0.09205938029291622,
"grad_norm": 0.7584658608788947,
"learning_rate": 3.9331012663861435e-05,
"loss": 0.9195,
"step": 231
},
{
"epoch": 0.0924579057487297,
"grad_norm": 0.8151922759638645,
"learning_rate": 3.932450711846914e-05,
"loss": 0.9352,
"step": 232
},
{
"epoch": 0.09285643120454319,
"grad_norm": 0.7799720068156271,
"learning_rate": 3.931797063775246e-05,
"loss": 0.867,
"step": 233
},
{
"epoch": 0.09325495666035669,
"grad_norm": 0.7195572843892059,
"learning_rate": 3.931140323217524e-05,
"loss": 0.9485,
"step": 234
},
{
"epoch": 0.09365348211617017,
"grad_norm": 0.5676394070871306,
"learning_rate": 3.9304804912250785e-05,
"loss": 0.9479,
"step": 235
},
{
"epoch": 0.09405200757198366,
"grad_norm": 0.603554245394414,
"learning_rate": 3.9298175688541916e-05,
"loss": 0.8831,
"step": 236
},
{
"epoch": 0.09445053302779716,
"grad_norm": 0.7416220601956737,
"learning_rate": 3.9291515571660926e-05,
"loss": 0.9537,
"step": 237
},
{
"epoch": 0.09484905848361064,
"grad_norm": 0.7400965861280613,
"learning_rate": 3.928482457226954e-05,
"loss": 0.9087,
"step": 238
},
{
"epoch": 0.09524758393942413,
"grad_norm": 0.8210302591504622,
"learning_rate": 3.927810270107894e-05,
"loss": 0.8909,
"step": 239
},
{
"epoch": 0.09564610939523763,
"grad_norm": 0.7137333890568919,
"learning_rate": 3.9271349968849735e-05,
"loss": 0.9301,
"step": 240
},
{
"epoch": 0.09604463485105111,
"grad_norm": 0.5314296904513427,
"learning_rate": 3.9264566386391925e-05,
"loss": 0.9233,
"step": 241
},
{
"epoch": 0.0964431603068646,
"grad_norm": 0.6166230859092278,
"learning_rate": 3.925775196456488e-05,
"loss": 0.8958,
"step": 242
},
{
"epoch": 0.09684168576267808,
"grad_norm": 0.6958069670048053,
"learning_rate": 3.925090671427739e-05,
"loss": 0.9278,
"step": 243
},
{
"epoch": 0.09724021121849158,
"grad_norm": 0.6889489906309647,
"learning_rate": 3.9244030646487524e-05,
"loss": 0.9453,
"step": 244
},
{
"epoch": 0.09763873667430507,
"grad_norm": 0.6113796976521826,
"learning_rate": 3.923712377220275e-05,
"loss": 0.9042,
"step": 245
},
{
"epoch": 0.09803726213011855,
"grad_norm": 0.5576650794524141,
"learning_rate": 3.9230186102479824e-05,
"loss": 0.9457,
"step": 246
},
{
"epoch": 0.09843578758593205,
"grad_norm": 0.5050600559673174,
"learning_rate": 3.922321764842479e-05,
"loss": 0.9128,
"step": 247
},
{
"epoch": 0.09883431304174554,
"grad_norm": 0.5792206556379802,
"learning_rate": 3.9216218421193e-05,
"loss": 0.9346,
"step": 248
},
{
"epoch": 0.09923283849755903,
"grad_norm": 0.7117260079905121,
"learning_rate": 3.9209188431989044e-05,
"loss": 0.9242,
"step": 249
},
{
"epoch": 0.09963136395337252,
"grad_norm": 0.5411445590412157,
"learning_rate": 3.920212769206676e-05,
"loss": 0.8808,
"step": 250
},
{
"epoch": 0.10002988940918602,
"grad_norm": 0.6383206470777513,
"learning_rate": 3.919503621272924e-05,
"loss": 0.9014,
"step": 251
},
{
"epoch": 0.1004284148649995,
"grad_norm": 0.5870726238645826,
"learning_rate": 3.918791400532874e-05,
"loss": 0.8833,
"step": 252
},
{
"epoch": 0.10082694032081299,
"grad_norm": 0.5677734635394229,
"learning_rate": 3.918076108126675e-05,
"loss": 0.9128,
"step": 253
},
{
"epoch": 0.10122546577662649,
"grad_norm": 0.6397706154970396,
"learning_rate": 3.91735774519939e-05,
"loss": 0.8892,
"step": 254
},
{
"epoch": 0.10162399123243997,
"grad_norm": 0.6025324225700743,
"learning_rate": 3.916636312900999e-05,
"loss": 0.8924,
"step": 255
},
{
"epoch": 0.10202251668825346,
"grad_norm": 0.5993884554898958,
"learning_rate": 3.9159118123863964e-05,
"loss": 0.9249,
"step": 256
},
{
"epoch": 0.10242104214406696,
"grad_norm": 0.5139396732603375,
"learning_rate": 3.915184244815385e-05,
"loss": 0.8977,
"step": 257
},
{
"epoch": 0.10281956759988044,
"grad_norm": 0.582154606226688,
"learning_rate": 3.9144536113526806e-05,
"loss": 0.9064,
"step": 258
},
{
"epoch": 0.10321809305569393,
"grad_norm": 0.5737111995658692,
"learning_rate": 3.9137199131679064e-05,
"loss": 0.9003,
"step": 259
},
{
"epoch": 0.10361661851150743,
"grad_norm": 0.5501293796446101,
"learning_rate": 3.912983151435591e-05,
"loss": 0.9053,
"step": 260
},
{
"epoch": 0.10401514396732091,
"grad_norm": 0.5616191359055138,
"learning_rate": 3.912243327335167e-05,
"loss": 0.9059,
"step": 261
},
{
"epoch": 0.1044136694231344,
"grad_norm": 0.5779090114992178,
"learning_rate": 3.91150044205097e-05,
"loss": 0.9215,
"step": 262
},
{
"epoch": 0.1048121948789479,
"grad_norm": 0.5621858273638006,
"learning_rate": 3.910754496772236e-05,
"loss": 0.9231,
"step": 263
},
{
"epoch": 0.10521072033476138,
"grad_norm": 0.5187195624713219,
"learning_rate": 3.9100054926931e-05,
"loss": 0.9077,
"step": 264
},
{
"epoch": 0.10560924579057487,
"grad_norm": 0.5484074323672972,
"learning_rate": 3.909253431012592e-05,
"loss": 0.8943,
"step": 265
},
{
"epoch": 0.10600777124638837,
"grad_norm": 0.5860726206207597,
"learning_rate": 3.9084983129346386e-05,
"loss": 0.9215,
"step": 266
},
{
"epoch": 0.10640629670220185,
"grad_norm": 0.5785145319929371,
"learning_rate": 3.907740139668058e-05,
"loss": 0.9079,
"step": 267
},
{
"epoch": 0.10680482215801534,
"grad_norm": 0.6293154987830761,
"learning_rate": 3.9069789124265595e-05,
"loss": 0.9199,
"step": 268
},
{
"epoch": 0.10720334761382884,
"grad_norm": 0.6138996610001156,
"learning_rate": 3.906214632428742e-05,
"loss": 0.9307,
"step": 269
},
{
"epoch": 0.10760187306964232,
"grad_norm": 0.5574707399267468,
"learning_rate": 3.90544730089809e-05,
"loss": 0.9235,
"step": 270
},
{
"epoch": 0.10800039852545582,
"grad_norm": 0.49410092240642955,
"learning_rate": 3.904676919062973e-05,
"loss": 0.8892,
"step": 271
},
{
"epoch": 0.10839892398126931,
"grad_norm": 0.551637520171974,
"learning_rate": 3.903903488156646e-05,
"loss": 0.9133,
"step": 272
},
{
"epoch": 0.10879744943708279,
"grad_norm": 0.5731759290280689,
"learning_rate": 3.903127009417244e-05,
"loss": 0.8961,
"step": 273
},
{
"epoch": 0.10919597489289629,
"grad_norm": 0.5762364732869328,
"learning_rate": 3.9023474840877775e-05,
"loss": 0.8803,
"step": 274
},
{
"epoch": 0.10959450034870978,
"grad_norm": 0.6947758285401612,
"learning_rate": 3.901564913416139e-05,
"loss": 0.8906,
"step": 275
},
{
"epoch": 0.10999302580452326,
"grad_norm": 0.7885605570685301,
"learning_rate": 3.9007792986550937e-05,
"loss": 0.9016,
"step": 276
},
{
"epoch": 0.11039155126033676,
"grad_norm": 0.779205530434434,
"learning_rate": 3.8999906410622805e-05,
"loss": 0.909,
"step": 277
},
{
"epoch": 0.11079007671615024,
"grad_norm": 0.9502303024617071,
"learning_rate": 3.899198941900209e-05,
"loss": 0.8972,
"step": 278
},
{
"epoch": 0.11118860217196373,
"grad_norm": 0.7020948558600761,
"learning_rate": 3.898404202436258e-05,
"loss": 0.8992,
"step": 279
},
{
"epoch": 0.11158712762777723,
"grad_norm": 0.6253004452655916,
"learning_rate": 3.8976064239426727e-05,
"loss": 0.8983,
"step": 280
},
{
"epoch": 0.11198565308359071,
"grad_norm": 0.48947446603739525,
"learning_rate": 3.896805607696565e-05,
"loss": 0.9092,
"step": 281
},
{
"epoch": 0.1123841785394042,
"grad_norm": 0.5305885289397677,
"learning_rate": 3.896001754979908e-05,
"loss": 0.8828,
"step": 282
},
{
"epoch": 0.1127827039952177,
"grad_norm": 0.6511594701603155,
"learning_rate": 3.8951948670795356e-05,
"loss": 0.8949,
"step": 283
},
{
"epoch": 0.11318122945103118,
"grad_norm": 0.7838264076235747,
"learning_rate": 3.8943849452871416e-05,
"loss": 0.9061,
"step": 284
},
{
"epoch": 0.11357975490684467,
"grad_norm": 0.8176144028366352,
"learning_rate": 3.8935719908992776e-05,
"loss": 0.9139,
"step": 285
},
{
"epoch": 0.11397828036265817,
"grad_norm": 0.7483860858548197,
"learning_rate": 3.892756005217347e-05,
"loss": 0.9092,
"step": 286
},
{
"epoch": 0.11437680581847165,
"grad_norm": 0.6145924322571729,
"learning_rate": 3.891936989547608e-05,
"loss": 0.9052,
"step": 287
},
{
"epoch": 0.11477533127428514,
"grad_norm": 0.5793992708257767,
"learning_rate": 3.891114945201168e-05,
"loss": 0.9041,
"step": 288
},
{
"epoch": 0.11517385673009864,
"grad_norm": 0.6386399436855802,
"learning_rate": 3.890289873493984e-05,
"loss": 0.8765,
"step": 289
},
{
"epoch": 0.11557238218591212,
"grad_norm": 0.7545452332949172,
"learning_rate": 3.889461775746858e-05,
"loss": 0.9407,
"step": 290
},
{
"epoch": 0.11597090764172562,
"grad_norm": 0.643068181670375,
"learning_rate": 3.888630653285437e-05,
"loss": 0.9044,
"step": 291
},
{
"epoch": 0.11636943309753911,
"grad_norm": 0.4963770968380342,
"learning_rate": 3.887796507440211e-05,
"loss": 0.9244,
"step": 292
},
{
"epoch": 0.11676795855335259,
"grad_norm": 0.5330885480112182,
"learning_rate": 3.8869593395465066e-05,
"loss": 0.9007,
"step": 293
},
{
"epoch": 0.11716648400916609,
"grad_norm": 0.6867642996793515,
"learning_rate": 3.8861191509444926e-05,
"loss": 0.8923,
"step": 294
},
{
"epoch": 0.11756500946497958,
"grad_norm": 0.5931835622625073,
"learning_rate": 3.88527594297917e-05,
"loss": 0.9172,
"step": 295
},
{
"epoch": 0.11796353492079306,
"grad_norm": 0.6693705563895682,
"learning_rate": 3.884429717000376e-05,
"loss": 0.8941,
"step": 296
},
{
"epoch": 0.11836206037660656,
"grad_norm": 0.6616211136884201,
"learning_rate": 3.883580474362777e-05,
"loss": 0.9208,
"step": 297
},
{
"epoch": 0.11876058583242005,
"grad_norm": 0.5241813662858397,
"learning_rate": 3.88272821642587e-05,
"loss": 0.9295,
"step": 298
},
{
"epoch": 0.11915911128823353,
"grad_norm": 0.47998991090285037,
"learning_rate": 3.8818729445539765e-05,
"loss": 0.9134,
"step": 299
},
{
"epoch": 0.11955763674404703,
"grad_norm": 0.5575410784453981,
"learning_rate": 3.881014660116246e-05,
"loss": 0.9264,
"step": 300
},
{
"epoch": 0.11995616219986052,
"grad_norm": 0.5762249128335137,
"learning_rate": 3.880153364486649e-05,
"loss": 0.8924,
"step": 301
},
{
"epoch": 0.120354687655674,
"grad_norm": 0.9257335770621549,
"learning_rate": 3.8792890590439764e-05,
"loss": 0.8861,
"step": 302
},
{
"epoch": 0.1207532131114875,
"grad_norm": 0.5676730409091856,
"learning_rate": 3.878421745171839e-05,
"loss": 0.9112,
"step": 303
},
{
"epoch": 0.12115173856730099,
"grad_norm": 0.4637670476081397,
"learning_rate": 3.87755142425866e-05,
"loss": 0.8917,
"step": 304
},
{
"epoch": 0.12155026402311447,
"grad_norm": 0.5310661309184922,
"learning_rate": 3.8766780976976795e-05,
"loss": 0.9182,
"step": 305
},
{
"epoch": 0.12194878947892797,
"grad_norm": 0.5584733508565086,
"learning_rate": 3.8758017668869484e-05,
"loss": 0.9396,
"step": 306
},
{
"epoch": 0.12234731493474146,
"grad_norm": 0.5545890950572487,
"learning_rate": 3.8749224332293265e-05,
"loss": 0.9016,
"step": 307
},
{
"epoch": 0.12274584039055494,
"grad_norm": 0.5692405766886073,
"learning_rate": 3.874040098132481e-05,
"loss": 0.8543,
"step": 308
},
{
"epoch": 0.12314436584636844,
"grad_norm": 0.5829038395471384,
"learning_rate": 3.873154763008884e-05,
"loss": 0.8766,
"step": 309
},
{
"epoch": 0.12354289130218192,
"grad_norm": 0.6399720498446062,
"learning_rate": 3.872266429275809e-05,
"loss": 0.8924,
"step": 310
},
{
"epoch": 0.12394141675799542,
"grad_norm": 0.5563668304631704,
"learning_rate": 3.871375098355331e-05,
"loss": 0.9351,
"step": 311
},
{
"epoch": 0.12433994221380891,
"grad_norm": 0.4891838671794899,
"learning_rate": 3.8704807716743235e-05,
"loss": 0.9084,
"step": 312
},
{
"epoch": 0.12473846766962239,
"grad_norm": 0.5390514488310643,
"learning_rate": 3.869583450664454e-05,
"loss": 0.9006,
"step": 313
},
{
"epoch": 0.12513699312543589,
"grad_norm": 0.6535963479715494,
"learning_rate": 3.868683136762185e-05,
"loss": 0.8946,
"step": 314
},
{
"epoch": 0.12553551858124937,
"grad_norm": 0.6360077741778746,
"learning_rate": 3.867779831408768e-05,
"loss": 0.8997,
"step": 315
},
{
"epoch": 0.12593404403706288,
"grad_norm": 0.47594974316839744,
"learning_rate": 3.8668735360502474e-05,
"loss": 0.9135,
"step": 316
},
{
"epoch": 0.12633256949287636,
"grad_norm": 0.554635644525251,
"learning_rate": 3.865964252137449e-05,
"loss": 0.9056,
"step": 317
},
{
"epoch": 0.12673109494868984,
"grad_norm": 0.6542560775862073,
"learning_rate": 3.8650519811259856e-05,
"loss": 0.8837,
"step": 318
},
{
"epoch": 0.12712962040450335,
"grad_norm": 0.5504279116926618,
"learning_rate": 3.864136724476252e-05,
"loss": 0.909,
"step": 319
},
{
"epoch": 0.12752814586031683,
"grad_norm": 0.4207363922400064,
"learning_rate": 3.863218483653423e-05,
"loss": 0.9199,
"step": 320
},
{
"epoch": 0.1279266713161303,
"grad_norm": 0.5480250503031011,
"learning_rate": 3.862297260127447e-05,
"loss": 0.9115,
"step": 321
},
{
"epoch": 0.12832519677194382,
"grad_norm": 0.7116612376007252,
"learning_rate": 3.8613730553730525e-05,
"loss": 0.902,
"step": 322
},
{
"epoch": 0.1287237222277573,
"grad_norm": 0.7034441679085705,
"learning_rate": 3.8604458708697354e-05,
"loss": 0.93,
"step": 323
},
{
"epoch": 0.12912224768357078,
"grad_norm": 0.6464461922880574,
"learning_rate": 3.859515708101766e-05,
"loss": 0.9027,
"step": 324
},
{
"epoch": 0.1295207731393843,
"grad_norm": 0.5724183071806952,
"learning_rate": 3.858582568558179e-05,
"loss": 0.9152,
"step": 325
},
{
"epoch": 0.12991929859519777,
"grad_norm": 0.5434975703367534,
"learning_rate": 3.857646453732776e-05,
"loss": 0.8873,
"step": 326
},
{
"epoch": 0.13031782405101125,
"grad_norm": 0.5134121010042222,
"learning_rate": 3.856707365124122e-05,
"loss": 0.8728,
"step": 327
},
{
"epoch": 0.13071634950682476,
"grad_norm": 0.5097236839503941,
"learning_rate": 3.85576530423554e-05,
"loss": 0.911,
"step": 328
},
{
"epoch": 0.13111487496263824,
"grad_norm": 0.5227325664183777,
"learning_rate": 3.854820272575115e-05,
"loss": 0.8658,
"step": 329
},
{
"epoch": 0.13151340041845172,
"grad_norm": 0.6322853032653781,
"learning_rate": 3.853872271655685e-05,
"loss": 0.891,
"step": 330
},
{
"epoch": 0.13191192587426523,
"grad_norm": 0.5184506986493536,
"learning_rate": 3.852921302994841e-05,
"loss": 0.8612,
"step": 331
},
{
"epoch": 0.1323104513300787,
"grad_norm": 0.5046807022502423,
"learning_rate": 3.8519673681149265e-05,
"loss": 0.8994,
"step": 332
},
{
"epoch": 0.1327089767858922,
"grad_norm": 0.5061850051002039,
"learning_rate": 3.851010468543033e-05,
"loss": 0.8849,
"step": 333
},
{
"epoch": 0.1331075022417057,
"grad_norm": 0.4935717896499033,
"learning_rate": 3.850050605810997e-05,
"loss": 0.9285,
"step": 334
},
{
"epoch": 0.13350602769751918,
"grad_norm": 0.4947315091214366,
"learning_rate": 3.8490877814553996e-05,
"loss": 0.9004,
"step": 335
},
{
"epoch": 0.13390455315333266,
"grad_norm": 0.46140205389577676,
"learning_rate": 3.848121997017563e-05,
"loss": 0.9065,
"step": 336
},
{
"epoch": 0.13430307860914617,
"grad_norm": 0.47248289695698514,
"learning_rate": 3.847153254043547e-05,
"loss": 0.8805,
"step": 337
},
{
"epoch": 0.13470160406495965,
"grad_norm": 0.45224697013215626,
"learning_rate": 3.846181554084147e-05,
"loss": 0.896,
"step": 338
},
{
"epoch": 0.13510012952077313,
"grad_norm": 0.527417114425614,
"learning_rate": 3.8452068986948956e-05,
"loss": 0.9383,
"step": 339
},
{
"epoch": 0.13549865497658664,
"grad_norm": 0.5092127958405034,
"learning_rate": 3.844229289436053e-05,
"loss": 0.8961,
"step": 340
},
{
"epoch": 0.13589718043240012,
"grad_norm": 0.4746200986505316,
"learning_rate": 3.8432487278726084e-05,
"loss": 0.9281,
"step": 341
},
{
"epoch": 0.1362957058882136,
"grad_norm": 0.484617132707988,
"learning_rate": 3.842265215574279e-05,
"loss": 0.8799,
"step": 342
},
{
"epoch": 0.1366942313440271,
"grad_norm": 0.472139637172473,
"learning_rate": 3.8412787541155035e-05,
"loss": 0.8571,
"step": 343
},
{
"epoch": 0.1370927567998406,
"grad_norm": 0.4750954980383929,
"learning_rate": 3.840289345075444e-05,
"loss": 0.8997,
"step": 344
},
{
"epoch": 0.13749128225565407,
"grad_norm": 0.5058566298011136,
"learning_rate": 3.839296990037979e-05,
"loss": 0.8947,
"step": 345
},
{
"epoch": 0.13788980771146758,
"grad_norm": 0.5034036144166951,
"learning_rate": 3.838301690591704e-05,
"loss": 0.856,
"step": 346
},
{
"epoch": 0.13828833316728106,
"grad_norm": 0.5109042435371637,
"learning_rate": 3.8373034483299286e-05,
"loss": 0.8676,
"step": 347
},
{
"epoch": 0.13868685862309454,
"grad_norm": 0.5076861609812875,
"learning_rate": 3.836302264850673e-05,
"loss": 0.8899,
"step": 348
},
{
"epoch": 0.13908538407890805,
"grad_norm": 0.48688791345770777,
"learning_rate": 3.835298141756664e-05,
"loss": 0.8952,
"step": 349
},
{
"epoch": 0.13948390953472153,
"grad_norm": 0.4294678692671596,
"learning_rate": 3.8342910806553374e-05,
"loss": 0.896,
"step": 350
},
{
"epoch": 0.13988243499053502,
"grad_norm": 0.4759618640018106,
"learning_rate": 3.83328108315883e-05,
"loss": 0.8925,
"step": 351
},
{
"epoch": 0.14028096044634852,
"grad_norm": 0.498083239156812,
"learning_rate": 3.8322681508839796e-05,
"loss": 0.897,
"step": 352
},
{
"epoch": 0.140679485902162,
"grad_norm": 0.47774282716676997,
"learning_rate": 3.8312522854523236e-05,
"loss": 0.853,
"step": 353
},
{
"epoch": 0.14107801135797549,
"grad_norm": 0.5425614790073936,
"learning_rate": 3.830233488490092e-05,
"loss": 0.9072,
"step": 354
},
{
"epoch": 0.141476536813789,
"grad_norm": 0.607352655774501,
"learning_rate": 3.8292117616282116e-05,
"loss": 0.8849,
"step": 355
},
{
"epoch": 0.14187506226960248,
"grad_norm": 0.622366562638722,
"learning_rate": 3.828187106502295e-05,
"loss": 0.8743,
"step": 356
},
{
"epoch": 0.14227358772541596,
"grad_norm": 0.6880401152515128,
"learning_rate": 3.827159524752646e-05,
"loss": 0.854,
"step": 357
},
{
"epoch": 0.14267211318122944,
"grad_norm": 0.6320544909726663,
"learning_rate": 3.8261290180242524e-05,
"loss": 0.8823,
"step": 358
},
{
"epoch": 0.14307063863704295,
"grad_norm": 0.6117634467858145,
"learning_rate": 3.825095587966784e-05,
"loss": 0.8821,
"step": 359
},
{
"epoch": 0.14346916409285643,
"grad_norm": 0.5586681204591263,
"learning_rate": 3.82405923623459e-05,
"loss": 0.8851,
"step": 360
},
{
"epoch": 0.1438676895486699,
"grad_norm": 0.568103604064326,
"learning_rate": 3.823019964486698e-05,
"loss": 0.8963,
"step": 361
},
{
"epoch": 0.14426621500448342,
"grad_norm": 0.5481484665397642,
"learning_rate": 3.8219777743868095e-05,
"loss": 0.8847,
"step": 362
},
{
"epoch": 0.1446647404602969,
"grad_norm": 0.5839213790650319,
"learning_rate": 3.820932667603297e-05,
"loss": 0.8858,
"step": 363
},
{
"epoch": 0.14506326591611038,
"grad_norm": 0.6803626614692434,
"learning_rate": 3.819884645809203e-05,
"loss": 0.9316,
"step": 364
},
{
"epoch": 0.1454617913719239,
"grad_norm": 0.5826226983177064,
"learning_rate": 3.8188337106822364e-05,
"loss": 0.8926,
"step": 365
},
{
"epoch": 0.14586031682773737,
"grad_norm": 0.4587053421690505,
"learning_rate": 3.8177798639047693e-05,
"loss": 0.9015,
"step": 366
},
{
"epoch": 0.14625884228355085,
"grad_norm": 0.4979532996043012,
"learning_rate": 3.8167231071638355e-05,
"loss": 0.9084,
"step": 367
},
{
"epoch": 0.14665736773936436,
"grad_norm": 0.6060462788501415,
"learning_rate": 3.815663442151127e-05,
"loss": 0.8913,
"step": 368
},
{
"epoch": 0.14705589319517784,
"grad_norm": 0.5719962639011669,
"learning_rate": 3.8146008705629916e-05,
"loss": 0.9119,
"step": 369
},
{
"epoch": 0.14745441865099132,
"grad_norm": 0.49076638405233397,
"learning_rate": 3.813535394100429e-05,
"loss": 0.8802,
"step": 370
},
{
"epoch": 0.14785294410680483,
"grad_norm": 0.49594758931441285,
"learning_rate": 3.81246701446909e-05,
"loss": 0.8639,
"step": 371
},
{
"epoch": 0.1482514695626183,
"grad_norm": 0.5940377132680764,
"learning_rate": 3.8113957333792744e-05,
"loss": 0.87,
"step": 372
},
{
"epoch": 0.1486499950184318,
"grad_norm": 0.5596407953869648,
"learning_rate": 3.810321552545924e-05,
"loss": 0.8875,
"step": 373
},
{
"epoch": 0.1490485204742453,
"grad_norm": 0.5587229850427988,
"learning_rate": 3.8092444736886235e-05,
"loss": 0.8823,
"step": 374
},
{
"epoch": 0.14944704593005878,
"grad_norm": 0.6185912922060778,
"learning_rate": 3.808164498531598e-05,
"loss": 0.8736,
"step": 375
},
{
"epoch": 0.14984557138587226,
"grad_norm": 0.5707944153693156,
"learning_rate": 3.8070816288037076e-05,
"loss": 0.9053,
"step": 376
},
{
"epoch": 0.15024409684168577,
"grad_norm": 0.5131528156556673,
"learning_rate": 3.805995866238446e-05,
"loss": 0.9038,
"step": 377
},
{
"epoch": 0.15064262229749925,
"grad_norm": 0.5289298616408312,
"learning_rate": 3.804907212573941e-05,
"loss": 0.9067,
"step": 378
},
{
"epoch": 0.15104114775331273,
"grad_norm": 0.5460088042514601,
"learning_rate": 3.803815669552944e-05,
"loss": 0.8742,
"step": 379
},
{
"epoch": 0.15143967320912624,
"grad_norm": 0.5901247804029622,
"learning_rate": 3.802721238922835e-05,
"loss": 0.8788,
"step": 380
},
{
"epoch": 0.15183819866493972,
"grad_norm": 0.43400747036846915,
"learning_rate": 3.801623922435615e-05,
"loss": 0.8676,
"step": 381
},
{
"epoch": 0.1522367241207532,
"grad_norm": 0.580607227815199,
"learning_rate": 3.800523721847906e-05,
"loss": 0.9247,
"step": 382
},
{
"epoch": 0.1526352495765667,
"grad_norm": 0.553191736940903,
"learning_rate": 3.7994206389209457e-05,
"loss": 0.8516,
"step": 383
},
{
"epoch": 0.1530337750323802,
"grad_norm": 0.5178209878197958,
"learning_rate": 3.7983146754205866e-05,
"loss": 0.8759,
"step": 384
},
{
"epoch": 0.15343230048819367,
"grad_norm": 0.5241403248580444,
"learning_rate": 3.7972058331172935e-05,
"loss": 0.9084,
"step": 385
},
{
"epoch": 0.15383082594400718,
"grad_norm": 0.4871129484635027,
"learning_rate": 3.796094113786137e-05,
"loss": 0.886,
"step": 386
},
{
"epoch": 0.15422935139982066,
"grad_norm": 0.43638582131414316,
"learning_rate": 3.794979519206796e-05,
"loss": 0.8884,
"step": 387
},
{
"epoch": 0.15462787685563414,
"grad_norm": 0.4833333706695009,
"learning_rate": 3.793862051163551e-05,
"loss": 0.8911,
"step": 388
},
{
"epoch": 0.15502640231144765,
"grad_norm": 0.5314502365145202,
"learning_rate": 3.792741711445283e-05,
"loss": 0.9347,
"step": 389
},
{
"epoch": 0.15542492776726113,
"grad_norm": 0.47578888436804323,
"learning_rate": 3.791618501845469e-05,
"loss": 0.8512,
"step": 390
},
{
"epoch": 0.15582345322307461,
"grad_norm": 0.5374852434985777,
"learning_rate": 3.790492424162181e-05,
"loss": 0.8765,
"step": 391
},
{
"epoch": 0.15622197867888812,
"grad_norm": 0.568861342025691,
"learning_rate": 3.789363480198083e-05,
"loss": 0.88,
"step": 392
},
{
"epoch": 0.1566205041347016,
"grad_norm": 0.5082814585192399,
"learning_rate": 3.788231671760426e-05,
"loss": 0.8846,
"step": 393
},
{
"epoch": 0.15701902959051509,
"grad_norm": 0.5514304292988225,
"learning_rate": 3.787097000661047e-05,
"loss": 0.9023,
"step": 394
},
{
"epoch": 0.1574175550463286,
"grad_norm": 0.5203382428096642,
"learning_rate": 3.785959468716367e-05,
"loss": 0.9036,
"step": 395
},
{
"epoch": 0.15781608050214208,
"grad_norm": 0.43118668216324796,
"learning_rate": 3.7848190777473836e-05,
"loss": 0.8952,
"step": 396
},
{
"epoch": 0.15821460595795556,
"grad_norm": 0.4912071245587214,
"learning_rate": 3.783675829579675e-05,
"loss": 0.8798,
"step": 397
},
{
"epoch": 0.15861313141376907,
"grad_norm": 0.5961696064294701,
"learning_rate": 3.7825297260433904e-05,
"loss": 0.8888,
"step": 398
},
{
"epoch": 0.15901165686958255,
"grad_norm": 0.7191150184982619,
"learning_rate": 3.781380768973252e-05,
"loss": 0.9002,
"step": 399
},
{
"epoch": 0.15941018232539603,
"grad_norm": 0.7060067375415279,
"learning_rate": 3.7802289602085485e-05,
"loss": 0.8741,
"step": 400
},
{
"epoch": 0.15980870778120954,
"grad_norm": 0.5469078244459111,
"learning_rate": 3.779074301593135e-05,
"loss": 0.8786,
"step": 401
},
{
"epoch": 0.16020723323702302,
"grad_norm": 0.4518738436666743,
"learning_rate": 3.777916794975428e-05,
"loss": 0.8641,
"step": 402
},
{
"epoch": 0.1606057586928365,
"grad_norm": 0.7446776049733693,
"learning_rate": 3.776756442208402e-05,
"loss": 0.8841,
"step": 403
},
{
"epoch": 0.16100428414865,
"grad_norm": 0.8590281212461937,
"learning_rate": 3.7755932451495906e-05,
"loss": 0.8589,
"step": 404
},
{
"epoch": 0.1614028096044635,
"grad_norm": 0.8179740795657136,
"learning_rate": 3.774427205661077e-05,
"loss": 0.8997,
"step": 405
},
{
"epoch": 0.16180133506027697,
"grad_norm": 0.6554445877560577,
"learning_rate": 3.773258325609499e-05,
"loss": 0.8686,
"step": 406
},
{
"epoch": 0.16219986051609048,
"grad_norm": 0.5244424483306168,
"learning_rate": 3.7720866068660376e-05,
"loss": 0.8705,
"step": 407
},
{
"epoch": 0.16259838597190396,
"grad_norm": 0.5471724085897548,
"learning_rate": 3.7709120513064196e-05,
"loss": 0.8629,
"step": 408
},
{
"epoch": 0.16299691142771744,
"grad_norm": 0.6834100949875108,
"learning_rate": 3.769734660810915e-05,
"loss": 0.8863,
"step": 409
},
{
"epoch": 0.16339543688353095,
"grad_norm": 0.7279947229048482,
"learning_rate": 3.768554437264329e-05,
"loss": 0.8666,
"step": 410
},
{
"epoch": 0.16379396233934443,
"grad_norm": 0.6176989230226226,
"learning_rate": 3.767371382556003e-05,
"loss": 0.8537,
"step": 411
},
{
"epoch": 0.1641924877951579,
"grad_norm": 0.4903712989166882,
"learning_rate": 3.766185498579813e-05,
"loss": 0.903,
"step": 412
},
{
"epoch": 0.16459101325097142,
"grad_norm": 0.552748741724315,
"learning_rate": 3.76499678723416e-05,
"loss": 0.8765,
"step": 413
},
{
"epoch": 0.1649895387067849,
"grad_norm": 0.6272889269130209,
"learning_rate": 3.763805250421974e-05,
"loss": 0.8738,
"step": 414
},
{
"epoch": 0.16538806416259838,
"grad_norm": 0.5264119048766897,
"learning_rate": 3.762610890050707e-05,
"loss": 0.8776,
"step": 415
},
{
"epoch": 0.1657865896184119,
"grad_norm": 0.5169756029407534,
"learning_rate": 3.761413708032332e-05,
"loss": 0.9039,
"step": 416
},
{
"epoch": 0.16618511507422537,
"grad_norm": 0.5970794940209743,
"learning_rate": 3.760213706283339e-05,
"loss": 0.9157,
"step": 417
},
{
"epoch": 0.16658364053003885,
"grad_norm": 0.5978586824697808,
"learning_rate": 3.759010886724731e-05,
"loss": 0.8627,
"step": 418
},
{
"epoch": 0.16698216598585236,
"grad_norm": 0.6350014516716387,
"learning_rate": 3.757805251282021e-05,
"loss": 0.8924,
"step": 419
},
{
"epoch": 0.16738069144166584,
"grad_norm": 0.4788632516360886,
"learning_rate": 3.756596801885232e-05,
"loss": 0.8823,
"step": 420
},
{
"epoch": 0.16777921689747932,
"grad_norm": 0.4586359434458119,
"learning_rate": 3.755385540468892e-05,
"loss": 0.8929,
"step": 421
},
{
"epoch": 0.1681777423532928,
"grad_norm": 0.44317101728143243,
"learning_rate": 3.7541714689720265e-05,
"loss": 0.8649,
"step": 422
},
{
"epoch": 0.1685762678091063,
"grad_norm": 0.5122716359415467,
"learning_rate": 3.7529545893381645e-05,
"loss": 0.853,
"step": 423
},
{
"epoch": 0.1689747932649198,
"grad_norm": 0.5459289409614204,
"learning_rate": 3.7517349035153265e-05,
"loss": 0.884,
"step": 424
},
{
"epoch": 0.16937331872073327,
"grad_norm": 0.5242102541749672,
"learning_rate": 3.750512413456027e-05,
"loss": 0.8657,
"step": 425
},
{
"epoch": 0.16977184417654678,
"grad_norm": 0.4867591923017328,
"learning_rate": 3.749287121117271e-05,
"loss": 0.8792,
"step": 426
},
{
"epoch": 0.17017036963236026,
"grad_norm": 0.46645737295772005,
"learning_rate": 3.7480590284605456e-05,
"loss": 0.8555,
"step": 427
},
{
"epoch": 0.17056889508817374,
"grad_norm": 0.5173979998559967,
"learning_rate": 3.746828137451825e-05,
"loss": 0.8767,
"step": 428
},
{
"epoch": 0.17096742054398725,
"grad_norm": 0.5369165613294684,
"learning_rate": 3.74559445006156e-05,
"loss": 0.8705,
"step": 429
},
{
"epoch": 0.17136594599980073,
"grad_norm": 0.5189321766211082,
"learning_rate": 3.74435796826468e-05,
"loss": 0.8903,
"step": 430
},
{
"epoch": 0.17176447145561421,
"grad_norm": 0.5153398576442575,
"learning_rate": 3.743118694040585e-05,
"loss": 0.856,
"step": 431
},
{
"epoch": 0.17216299691142772,
"grad_norm": 0.6454497262759452,
"learning_rate": 3.74187662937315e-05,
"loss": 0.9,
"step": 432
},
{
"epoch": 0.1725615223672412,
"grad_norm": 0.49986119364421433,
"learning_rate": 3.740631776250712e-05,
"loss": 0.8445,
"step": 433
},
{
"epoch": 0.17296004782305469,
"grad_norm": 0.48967274132042343,
"learning_rate": 3.7393841366660735e-05,
"loss": 0.8767,
"step": 434
},
{
"epoch": 0.1733585732788682,
"grad_norm": 0.45785208420296847,
"learning_rate": 3.7381337126165e-05,
"loss": 0.9046,
"step": 435
},
{
"epoch": 0.17375709873468168,
"grad_norm": 0.5084392551993347,
"learning_rate": 3.736880506103711e-05,
"loss": 0.8463,
"step": 436
},
{
"epoch": 0.17415562419049516,
"grad_norm": 0.6260870917802238,
"learning_rate": 3.735624519133883e-05,
"loss": 0.8526,
"step": 437
},
{
"epoch": 0.17455414964630867,
"grad_norm": 0.667002011430546,
"learning_rate": 3.734365753717642e-05,
"loss": 0.9163,
"step": 438
},
{
"epoch": 0.17495267510212215,
"grad_norm": 0.5524932335618813,
"learning_rate": 3.7331042118700616e-05,
"loss": 0.8909,
"step": 439
},
{
"epoch": 0.17535120055793563,
"grad_norm": 0.5179221999500747,
"learning_rate": 3.731839895610662e-05,
"loss": 0.8491,
"step": 440
},
{
"epoch": 0.17574972601374914,
"grad_norm": 0.6055468639799181,
"learning_rate": 3.7305728069634024e-05,
"loss": 0.9039,
"step": 441
},
{
"epoch": 0.17614825146956262,
"grad_norm": 0.6369378504491895,
"learning_rate": 3.729302947956681e-05,
"loss": 0.8699,
"step": 442
},
{
"epoch": 0.1765467769253761,
"grad_norm": 0.517132348583334,
"learning_rate": 3.728030320623332e-05,
"loss": 0.8747,
"step": 443
},
{
"epoch": 0.1769453023811896,
"grad_norm": 0.4377714733389691,
"learning_rate": 3.7267549270006195e-05,
"loss": 0.8574,
"step": 444
},
{
"epoch": 0.1773438278370031,
"grad_norm": 0.5519428657517451,
"learning_rate": 3.7254767691302366e-05,
"loss": 0.8716,
"step": 445
},
{
"epoch": 0.17774235329281657,
"grad_norm": 0.5779289605769454,
"learning_rate": 3.724195849058302e-05,
"loss": 0.855,
"step": 446
},
{
"epoch": 0.17814087874863008,
"grad_norm": 0.5189071675619338,
"learning_rate": 3.722912168835356e-05,
"loss": 0.8789,
"step": 447
},
{
"epoch": 0.17853940420444356,
"grad_norm": 0.44907580503791095,
"learning_rate": 3.7216257305163576e-05,
"loss": 0.8659,
"step": 448
},
{
"epoch": 0.17893792966025704,
"grad_norm": 0.5335537287232798,
"learning_rate": 3.7203365361606796e-05,
"loss": 0.896,
"step": 449
},
{
"epoch": 0.17933645511607055,
"grad_norm": 0.5493861171202665,
"learning_rate": 3.719044587832109e-05,
"loss": 0.8547,
"step": 450
},
{
"epoch": 0.17973498057188403,
"grad_norm": 0.4686748664722927,
"learning_rate": 3.71774988759884e-05,
"loss": 0.8288,
"step": 451
},
{
"epoch": 0.1801335060276975,
"grad_norm": 0.4149387142024727,
"learning_rate": 3.716452437533471e-05,
"loss": 0.8596,
"step": 452
},
{
"epoch": 0.18053203148351102,
"grad_norm": 0.4325334501517392,
"learning_rate": 3.715152239713007e-05,
"loss": 0.859,
"step": 453
},
{
"epoch": 0.1809305569393245,
"grad_norm": 0.4976629397106674,
"learning_rate": 3.713849296218847e-05,
"loss": 0.8789,
"step": 454
},
{
"epoch": 0.18132908239513798,
"grad_norm": 0.507007279338876,
"learning_rate": 3.7125436091367866e-05,
"loss": 0.8726,
"step": 455
},
{
"epoch": 0.1817276078509515,
"grad_norm": 0.5348993862470603,
"learning_rate": 3.711235180557014e-05,
"loss": 0.9106,
"step": 456
},
{
"epoch": 0.18212613330676497,
"grad_norm": 0.46294587476217225,
"learning_rate": 3.709924012574107e-05,
"loss": 0.8358,
"step": 457
},
{
"epoch": 0.18252465876257845,
"grad_norm": 0.43107837967105883,
"learning_rate": 3.708610107287026e-05,
"loss": 0.8448,
"step": 458
},
{
"epoch": 0.18292318421839196,
"grad_norm": 0.48433441169264524,
"learning_rate": 3.7072934667991157e-05,
"loss": 0.8677,
"step": 459
},
{
"epoch": 0.18332170967420544,
"grad_norm": 0.5181824793139834,
"learning_rate": 3.705974093218099e-05,
"loss": 0.8867,
"step": 460
},
{
"epoch": 0.18372023513001892,
"grad_norm": 0.5376360855846708,
"learning_rate": 3.704651988656074e-05,
"loss": 0.9073,
"step": 461
},
{
"epoch": 0.18411876058583243,
"grad_norm": 0.5000814848716162,
"learning_rate": 3.703327155229509e-05,
"loss": 0.87,
"step": 462
},
{
"epoch": 0.1845172860416459,
"grad_norm": 0.4780561422951961,
"learning_rate": 3.701999595059244e-05,
"loss": 0.8614,
"step": 463
},
{
"epoch": 0.1849158114974594,
"grad_norm": 0.4722288774763096,
"learning_rate": 3.700669310270481e-05,
"loss": 0.8507,
"step": 464
},
{
"epoch": 0.1853143369532729,
"grad_norm": 0.46238619081900495,
"learning_rate": 3.699336302992786e-05,
"loss": 0.8795,
"step": 465
},
{
"epoch": 0.18571286240908638,
"grad_norm": 0.5217809598476334,
"learning_rate": 3.69800057536008e-05,
"loss": 0.8679,
"step": 466
},
{
"epoch": 0.18611138786489986,
"grad_norm": 0.5670490274865951,
"learning_rate": 3.6966621295106425e-05,
"loss": 0.8821,
"step": 467
},
{
"epoch": 0.18650991332071337,
"grad_norm": 0.5541701975380785,
"learning_rate": 3.695320967587103e-05,
"loss": 0.8671,
"step": 468
},
{
"epoch": 0.18690843877652685,
"grad_norm": 0.48332966121728094,
"learning_rate": 3.693977091736438e-05,
"loss": 0.8543,
"step": 469
},
{
"epoch": 0.18730696423234033,
"grad_norm": 0.4228426707268364,
"learning_rate": 3.6926305041099705e-05,
"loss": 0.8421,
"step": 470
},
{
"epoch": 0.18770548968815384,
"grad_norm": 0.4683111306073849,
"learning_rate": 3.6912812068633626e-05,
"loss": 0.8584,
"step": 471
},
{
"epoch": 0.18810401514396732,
"grad_norm": 0.5422991697909932,
"learning_rate": 3.689929202156615e-05,
"loss": 0.9349,
"step": 472
},
{
"epoch": 0.1885025405997808,
"grad_norm": 0.474589914149524,
"learning_rate": 3.688574492154063e-05,
"loss": 0.8683,
"step": 473
},
{
"epoch": 0.1889010660555943,
"grad_norm": 0.4982233301174737,
"learning_rate": 3.687217079024371e-05,
"loss": 0.8636,
"step": 474
},
{
"epoch": 0.1892995915114078,
"grad_norm": 0.5267276262142256,
"learning_rate": 3.6858569649405336e-05,
"loss": 0.8559,
"step": 475
},
{
"epoch": 0.18969811696722128,
"grad_norm": 0.40458583321271047,
"learning_rate": 3.6844941520798664e-05,
"loss": 0.8432,
"step": 476
},
{
"epoch": 0.19009664242303478,
"grad_norm": 0.38424753205506557,
"learning_rate": 3.683128642624007e-05,
"loss": 0.857,
"step": 477
},
{
"epoch": 0.19049516787884826,
"grad_norm": 0.4997565524770705,
"learning_rate": 3.6817604387589086e-05,
"loss": 0.8763,
"step": 478
},
{
"epoch": 0.19089369333466175,
"grad_norm": 0.38620309944213566,
"learning_rate": 3.680389542674837e-05,
"loss": 0.8402,
"step": 479
},
{
"epoch": 0.19129221879047525,
"grad_norm": 0.4217979959268514,
"learning_rate": 3.679015956566371e-05,
"loss": 0.8921,
"step": 480
},
{
"epoch": 0.19169074424628874,
"grad_norm": 0.6509327369251123,
"learning_rate": 3.6776396826323925e-05,
"loss": 0.8981,
"step": 481
},
{
"epoch": 0.19208926970210222,
"grad_norm": 0.3957479519147936,
"learning_rate": 3.6762607230760884e-05,
"loss": 0.887,
"step": 482
},
{
"epoch": 0.19248779515791573,
"grad_norm": 0.3933212374183316,
"learning_rate": 3.6748790801049435e-05,
"loss": 0.8555,
"step": 483
},
{
"epoch": 0.1928863206137292,
"grad_norm": 0.3942675959179187,
"learning_rate": 3.673494755930737e-05,
"loss": 0.8619,
"step": 484
},
{
"epoch": 0.1932848460695427,
"grad_norm": 0.4102773938392307,
"learning_rate": 3.6721077527695435e-05,
"loss": 0.8684,
"step": 485
},
{
"epoch": 0.19368337152535617,
"grad_norm": 0.5537091771770686,
"learning_rate": 3.670718072841724e-05,
"loss": 0.8657,
"step": 486
},
{
"epoch": 0.19408189698116968,
"grad_norm": 0.4445425000622428,
"learning_rate": 3.6693257183719256e-05,
"loss": 0.8527,
"step": 487
},
{
"epoch": 0.19448042243698316,
"grad_norm": 0.4048218822376927,
"learning_rate": 3.667930691589075e-05,
"loss": 0.8786,
"step": 488
},
{
"epoch": 0.19487894789279664,
"grad_norm": 0.4525605726219098,
"learning_rate": 3.666532994726381e-05,
"loss": 0.8544,
"step": 489
},
{
"epoch": 0.19527747334861015,
"grad_norm": 0.4471569397505119,
"learning_rate": 3.665132630021321e-05,
"loss": 0.8506,
"step": 490
},
{
"epoch": 0.19567599880442363,
"grad_norm": 0.47638751339784896,
"learning_rate": 3.6637295997156475e-05,
"loss": 0.887,
"step": 491
},
{
"epoch": 0.1960745242602371,
"grad_norm": 0.5025010909937182,
"learning_rate": 3.662323906055379e-05,
"loss": 0.8653,
"step": 492
},
{
"epoch": 0.19647304971605062,
"grad_norm": 0.42315599557494776,
"learning_rate": 3.6609155512907966e-05,
"loss": 0.8531,
"step": 493
},
{
"epoch": 0.1968715751718641,
"grad_norm": 0.4543478055892151,
"learning_rate": 3.659504537676444e-05,
"loss": 0.8512,
"step": 494
},
{
"epoch": 0.19727010062767758,
"grad_norm": 0.5089668275890759,
"learning_rate": 3.658090867471118e-05,
"loss": 0.8733,
"step": 495
},
{
"epoch": 0.1976686260834911,
"grad_norm": 0.48725887709055965,
"learning_rate": 3.656674542937869e-05,
"loss": 0.8629,
"step": 496
},
{
"epoch": 0.19806715153930457,
"grad_norm": 0.5284757567578545,
"learning_rate": 3.655255566343999e-05,
"loss": 0.8845,
"step": 497
},
{
"epoch": 0.19846567699511805,
"grad_norm": 0.5026058309669479,
"learning_rate": 3.653833939961053e-05,
"loss": 0.8876,
"step": 498
},
{
"epoch": 0.19886420245093156,
"grad_norm": 0.4169989456283724,
"learning_rate": 3.6524096660648186e-05,
"loss": 0.8713,
"step": 499
},
{
"epoch": 0.19926272790674504,
"grad_norm": 0.43259318912302097,
"learning_rate": 3.650982746935321e-05,
"loss": 0.8463,
"step": 500
},
{
"epoch": 0.19966125336255852,
"grad_norm": 0.47387386790106595,
"learning_rate": 3.6495531848568206e-05,
"loss": 0.8315,
"step": 501
},
{
"epoch": 0.20005977881837203,
"grad_norm": 0.4501351523826911,
"learning_rate": 3.6481209821178104e-05,
"loss": 0.8628,
"step": 502
},
{
"epoch": 0.2004583042741855,
"grad_norm": 0.5285961857854481,
"learning_rate": 3.646686141011008e-05,
"loss": 0.8605,
"step": 503
},
{
"epoch": 0.200856829729999,
"grad_norm": 0.40989354815942786,
"learning_rate": 3.645248663833354e-05,
"loss": 0.8688,
"step": 504
},
{
"epoch": 0.2012553551858125,
"grad_norm": 0.4346950335335224,
"learning_rate": 3.643808552886012e-05,
"loss": 0.873,
"step": 505
},
{
"epoch": 0.20165388064162598,
"grad_norm": 0.5336085053270726,
"learning_rate": 3.6423658104743606e-05,
"loss": 0.8593,
"step": 506
},
{
"epoch": 0.20205240609743946,
"grad_norm": 0.4077411294947737,
"learning_rate": 3.6409204389079896e-05,
"loss": 0.8444,
"step": 507
},
{
"epoch": 0.20245093155325297,
"grad_norm": 0.44445720308169706,
"learning_rate": 3.6394724405007e-05,
"loss": 0.8636,
"step": 508
},
{
"epoch": 0.20284945700906645,
"grad_norm": 0.3987179137110336,
"learning_rate": 3.6380218175704954e-05,
"loss": 0.8897,
"step": 509
},
{
"epoch": 0.20324798246487993,
"grad_norm": 0.4428095828153124,
"learning_rate": 3.636568572439582e-05,
"loss": 0.8471,
"step": 510
},
{
"epoch": 0.20364650792069344,
"grad_norm": 0.46898791636388926,
"learning_rate": 3.6351127074343654e-05,
"loss": 0.8567,
"step": 511
},
{
"epoch": 0.20404503337650692,
"grad_norm": 0.44474651154582173,
"learning_rate": 3.633654224885441e-05,
"loss": 0.848,
"step": 512
},
{
"epoch": 0.2044435588323204,
"grad_norm": 0.4312423546670495,
"learning_rate": 3.632193127127598e-05,
"loss": 0.8693,
"step": 513
},
{
"epoch": 0.2048420842881339,
"grad_norm": 0.49632782286130483,
"learning_rate": 3.630729416499813e-05,
"loss": 0.8814,
"step": 514
},
{
"epoch": 0.2052406097439474,
"grad_norm": 0.45170716058550536,
"learning_rate": 3.6292630953452406e-05,
"loss": 0.8685,
"step": 515
},
{
"epoch": 0.20563913519976088,
"grad_norm": 0.5446213353134834,
"learning_rate": 3.627794166011219e-05,
"loss": 0.8717,
"step": 516
},
{
"epoch": 0.20603766065557438,
"grad_norm": 0.5471560197738125,
"learning_rate": 3.626322630849259e-05,
"loss": 0.8667,
"step": 517
},
{
"epoch": 0.20643618611138786,
"grad_norm": 0.5858086900062635,
"learning_rate": 3.6248484922150445e-05,
"loss": 0.8279,
"step": 518
},
{
"epoch": 0.20683471156720135,
"grad_norm": 0.5915507808065805,
"learning_rate": 3.6233717524684264e-05,
"loss": 0.8647,
"step": 519
},
{
"epoch": 0.20723323702301485,
"grad_norm": 0.5742838245899272,
"learning_rate": 3.62189241397342e-05,
"loss": 0.8756,
"step": 520
},
{
"epoch": 0.20763176247882834,
"grad_norm": 0.4770900993779875,
"learning_rate": 3.620410479098199e-05,
"loss": 0.8595,
"step": 521
},
{
"epoch": 0.20803028793464182,
"grad_norm": 0.4639336066600716,
"learning_rate": 3.618925950215096e-05,
"loss": 0.8539,
"step": 522
},
{
"epoch": 0.20842881339045533,
"grad_norm": 0.5019882836143528,
"learning_rate": 3.617438829700595e-05,
"loss": 0.8461,
"step": 523
},
{
"epoch": 0.2088273388462688,
"grad_norm": 0.4562491167280308,
"learning_rate": 3.615949119935328e-05,
"loss": 0.8631,
"step": 524
},
{
"epoch": 0.2092258643020823,
"grad_norm": 0.46086677639660656,
"learning_rate": 3.614456823304073e-05,
"loss": 0.8489,
"step": 525
},
{
"epoch": 0.2096243897578958,
"grad_norm": 0.44996342982439314,
"learning_rate": 3.61296194219575e-05,
"loss": 0.8554,
"step": 526
},
{
"epoch": 0.21002291521370928,
"grad_norm": 0.4156003055691938,
"learning_rate": 3.6114644790034144e-05,
"loss": 0.8566,
"step": 527
},
{
"epoch": 0.21042144066952276,
"grad_norm": 0.4501085849731328,
"learning_rate": 3.609964436124255e-05,
"loss": 0.8728,
"step": 528
},
{
"epoch": 0.21081996612533627,
"grad_norm": 0.40787146977289557,
"learning_rate": 3.6084618159595935e-05,
"loss": 0.8667,
"step": 529
},
{
"epoch": 0.21121849158114975,
"grad_norm": 0.474878191977019,
"learning_rate": 3.606956620914873e-05,
"loss": 0.8295,
"step": 530
},
{
"epoch": 0.21161701703696323,
"grad_norm": 0.46121373114207476,
"learning_rate": 3.605448853399661e-05,
"loss": 0.8647,
"step": 531
},
{
"epoch": 0.21201554249277674,
"grad_norm": 0.5256057649499315,
"learning_rate": 3.603938515827643e-05,
"loss": 0.8765,
"step": 532
},
{
"epoch": 0.21241406794859022,
"grad_norm": 0.4296063955695742,
"learning_rate": 3.6024256106166194e-05,
"loss": 0.8698,
"step": 533
},
{
"epoch": 0.2128125934044037,
"grad_norm": 0.49055349825343775,
"learning_rate": 3.600910140188498e-05,
"loss": 0.8554,
"step": 534
},
{
"epoch": 0.2132111188602172,
"grad_norm": 0.4028038490785686,
"learning_rate": 3.599392106969296e-05,
"loss": 0.8797,
"step": 535
},
{
"epoch": 0.2136096443160307,
"grad_norm": 0.4426507424773926,
"learning_rate": 3.5978715133891334e-05,
"loss": 0.8433,
"step": 536
},
{
"epoch": 0.21400816977184417,
"grad_norm": 0.4408887572324347,
"learning_rate": 3.596348361882226e-05,
"loss": 0.8919,
"step": 537
},
{
"epoch": 0.21440669522765768,
"grad_norm": 0.4023818298390077,
"learning_rate": 3.594822654886888e-05,
"loss": 0.8219,
"step": 538
},
{
"epoch": 0.21480522068347116,
"grad_norm": 0.5445602848649418,
"learning_rate": 3.593294394845521e-05,
"loss": 0.8561,
"step": 539
},
{
"epoch": 0.21520374613928464,
"grad_norm": 0.44164972512016026,
"learning_rate": 3.5917635842046165e-05,
"loss": 0.8428,
"step": 540
},
{
"epoch": 0.21560227159509815,
"grad_norm": 0.48977170056676267,
"learning_rate": 3.590230225414748e-05,
"loss": 0.8701,
"step": 541
},
{
"epoch": 0.21600079705091163,
"grad_norm": 0.465180272328864,
"learning_rate": 3.588694320930567e-05,
"loss": 0.837,
"step": 542
},
{
"epoch": 0.2163993225067251,
"grad_norm": 0.3718782369142703,
"learning_rate": 3.5871558732108034e-05,
"loss": 0.8491,
"step": 543
},
{
"epoch": 0.21679784796253862,
"grad_norm": 0.4506626708822692,
"learning_rate": 3.5856148847182535e-05,
"loss": 0.8293,
"step": 544
},
{
"epoch": 0.2171963734183521,
"grad_norm": 0.5210277329620194,
"learning_rate": 3.5840713579197856e-05,
"loss": 0.8587,
"step": 545
},
{
"epoch": 0.21759489887416558,
"grad_norm": 0.5358427464347824,
"learning_rate": 3.5825252952863296e-05,
"loss": 0.8251,
"step": 546
},
{
"epoch": 0.2179934243299791,
"grad_norm": 0.48542122022372863,
"learning_rate": 3.5809766992928746e-05,
"loss": 0.8725,
"step": 547
},
{
"epoch": 0.21839194978579257,
"grad_norm": 0.4243230434228638,
"learning_rate": 3.579425572418465e-05,
"loss": 0.8518,
"step": 548
},
{
"epoch": 0.21879047524160605,
"grad_norm": 0.4218795984129036,
"learning_rate": 3.5778719171461975e-05,
"loss": 0.8548,
"step": 549
},
{
"epoch": 0.21918900069741956,
"grad_norm": 0.4590501106129811,
"learning_rate": 3.5763157359632164e-05,
"loss": 0.8531,
"step": 550
},
{
"epoch": 0.21958752615323304,
"grad_norm": 0.46361499771905873,
"learning_rate": 3.574757031360708e-05,
"loss": 0.8817,
"step": 551
},
{
"epoch": 0.21998605160904652,
"grad_norm": 0.47793550002117074,
"learning_rate": 3.5731958058339e-05,
"loss": 0.856,
"step": 552
},
{
"epoch": 0.22038457706486,
"grad_norm": 0.4585859943216561,
"learning_rate": 3.571632061882056e-05,
"loss": 0.8616,
"step": 553
},
{
"epoch": 0.2207831025206735,
"grad_norm": 0.3864454910550978,
"learning_rate": 3.570065802008468e-05,
"loss": 0.8621,
"step": 554
},
{
"epoch": 0.221181627976487,
"grad_norm": 0.42677803227423167,
"learning_rate": 3.56849702872046e-05,
"loss": 0.8824,
"step": 555
},
{
"epoch": 0.22158015343230048,
"grad_norm": 0.4968788156141536,
"learning_rate": 3.5669257445293755e-05,
"loss": 0.8601,
"step": 556
},
{
"epoch": 0.22197867888811398,
"grad_norm": 0.4839933541994568,
"learning_rate": 3.5653519519505803e-05,
"loss": 0.852,
"step": 557
},
{
"epoch": 0.22237720434392746,
"grad_norm": 0.43272119648953283,
"learning_rate": 3.563775653503455e-05,
"loss": 0.8733,
"step": 558
},
{
"epoch": 0.22277572979974095,
"grad_norm": 0.39605938751897557,
"learning_rate": 3.562196851711391e-05,
"loss": 0.8417,
"step": 559
},
{
"epoch": 0.22317425525555445,
"grad_norm": 0.43460908962065953,
"learning_rate": 3.560615549101788e-05,
"loss": 0.8443,
"step": 560
},
{
"epoch": 0.22357278071136794,
"grad_norm": 0.49038667322845025,
"learning_rate": 3.5590317482060474e-05,
"loss": 0.8441,
"step": 561
},
{
"epoch": 0.22397130616718142,
"grad_norm": 0.4896634258033811,
"learning_rate": 3.5574454515595735e-05,
"loss": 0.8216,
"step": 562
},
{
"epoch": 0.22436983162299493,
"grad_norm": 0.47379532355614734,
"learning_rate": 3.5558566617017616e-05,
"loss": 0.8664,
"step": 563
},
{
"epoch": 0.2247683570788084,
"grad_norm": 0.4064914983245694,
"learning_rate": 3.554265381176e-05,
"loss": 0.8195,
"step": 564
},
{
"epoch": 0.2251668825346219,
"grad_norm": 0.44817812465361634,
"learning_rate": 3.552671612529667e-05,
"loss": 0.8251,
"step": 565
},
{
"epoch": 0.2255654079904354,
"grad_norm": 0.5252162424970518,
"learning_rate": 3.5510753583141185e-05,
"loss": 0.8873,
"step": 566
},
{
"epoch": 0.22596393344624888,
"grad_norm": 0.5355671371355674,
"learning_rate": 3.5494766210846936e-05,
"loss": 0.8544,
"step": 567
},
{
"epoch": 0.22636245890206236,
"grad_norm": 0.4819586808295284,
"learning_rate": 3.547875403400705e-05,
"loss": 0.8619,
"step": 568
},
{
"epoch": 0.22676098435787587,
"grad_norm": 0.3961215921893707,
"learning_rate": 3.5462717078254353e-05,
"loss": 0.8687,
"step": 569
},
{
"epoch": 0.22715950981368935,
"grad_norm": 0.4108394698195708,
"learning_rate": 3.5446655369261355e-05,
"loss": 0.8629,
"step": 570
},
{
"epoch": 0.22755803526950283,
"grad_norm": 0.4534157567866205,
"learning_rate": 3.543056893274017e-05,
"loss": 0.843,
"step": 571
},
{
"epoch": 0.22795656072531634,
"grad_norm": 0.5102875270779772,
"learning_rate": 3.541445779444252e-05,
"loss": 0.8485,
"step": 572
},
{
"epoch": 0.22835508618112982,
"grad_norm": 0.3859177522136378,
"learning_rate": 3.5398321980159666e-05,
"loss": 0.8373,
"step": 573
},
{
"epoch": 0.2287536116369433,
"grad_norm": 0.37972963850475683,
"learning_rate": 3.5382161515722354e-05,
"loss": 0.8741,
"step": 574
},
{
"epoch": 0.2291521370927568,
"grad_norm": 0.4136228500070505,
"learning_rate": 3.53659764270008e-05,
"loss": 0.8739,
"step": 575
},
{
"epoch": 0.2295506625485703,
"grad_norm": 0.42386454317477146,
"learning_rate": 3.534976673990465e-05,
"loss": 0.8504,
"step": 576
},
{
"epoch": 0.22994918800438377,
"grad_norm": 0.4131700773814348,
"learning_rate": 3.5333532480382915e-05,
"loss": 0.8325,
"step": 577
},
{
"epoch": 0.23034771346019728,
"grad_norm": 0.4321055766938808,
"learning_rate": 3.5317273674423944e-05,
"loss": 0.842,
"step": 578
},
{
"epoch": 0.23074623891601076,
"grad_norm": 0.4452054733522704,
"learning_rate": 3.5300990348055385e-05,
"loss": 0.8826,
"step": 579
},
{
"epoch": 0.23114476437182424,
"grad_norm": 0.4096599637303119,
"learning_rate": 3.528468252734414e-05,
"loss": 0.8633,
"step": 580
},
{
"epoch": 0.23154328982763775,
"grad_norm": 0.4122953744704833,
"learning_rate": 3.526835023839632e-05,
"loss": 0.8772,
"step": 581
},
{
"epoch": 0.23194181528345123,
"grad_norm": 0.4547152883012281,
"learning_rate": 3.52519935073572e-05,
"loss": 0.8613,
"step": 582
},
{
"epoch": 0.2323403407392647,
"grad_norm": 0.4725670891982683,
"learning_rate": 3.5235612360411196e-05,
"loss": 0.8819,
"step": 583
},
{
"epoch": 0.23273886619507822,
"grad_norm": 0.40729982125282965,
"learning_rate": 3.521920682378179e-05,
"loss": 0.8471,
"step": 584
},
{
"epoch": 0.2331373916508917,
"grad_norm": 0.4348949494906739,
"learning_rate": 3.520277692373154e-05,
"loss": 0.8682,
"step": 585
},
{
"epoch": 0.23353591710670518,
"grad_norm": 0.4881551767292844,
"learning_rate": 3.518632268656196e-05,
"loss": 0.8408,
"step": 586
},
{
"epoch": 0.2339344425625187,
"grad_norm": 0.5373093582603797,
"learning_rate": 3.516984413861357e-05,
"loss": 0.8646,
"step": 587
},
{
"epoch": 0.23433296801833217,
"grad_norm": 0.4789730876955116,
"learning_rate": 3.5153341306265775e-05,
"loss": 0.8489,
"step": 588
},
{
"epoch": 0.23473149347414565,
"grad_norm": 0.5957598632234159,
"learning_rate": 3.5136814215936864e-05,
"loss": 0.8478,
"step": 589
},
{
"epoch": 0.23513001892995916,
"grad_norm": 0.6296888663536283,
"learning_rate": 3.512026289408398e-05,
"loss": 0.866,
"step": 590
},
{
"epoch": 0.23552854438577264,
"grad_norm": 0.5086372892787441,
"learning_rate": 3.5103687367203025e-05,
"loss": 0.8893,
"step": 591
},
{
"epoch": 0.23592706984158612,
"grad_norm": 0.4732493082235356,
"learning_rate": 3.508708766182866e-05,
"loss": 0.8435,
"step": 592
},
{
"epoch": 0.23632559529739963,
"grad_norm": 0.5470935688327907,
"learning_rate": 3.507046380453426e-05,
"loss": 0.8572,
"step": 593
},
{
"epoch": 0.2367241207532131,
"grad_norm": 0.5928161194589755,
"learning_rate": 3.5053815821931865e-05,
"loss": 0.8991,
"step": 594
},
{
"epoch": 0.2371226462090266,
"grad_norm": 0.541542878613048,
"learning_rate": 3.503714374067212e-05,
"loss": 0.843,
"step": 595
},
{
"epoch": 0.2375211716648401,
"grad_norm": 0.5945037136372829,
"learning_rate": 3.502044758744425e-05,
"loss": 0.8313,
"step": 596
},
{
"epoch": 0.23791969712065358,
"grad_norm": 0.5262860873148738,
"learning_rate": 3.500372738897603e-05,
"loss": 0.8302,
"step": 597
},
{
"epoch": 0.23831822257646706,
"grad_norm": 0.47043036160591684,
"learning_rate": 3.498698317203372e-05,
"loss": 0.8483,
"step": 598
},
{
"epoch": 0.23871674803228057,
"grad_norm": 0.48587903119210246,
"learning_rate": 3.497021496342203e-05,
"loss": 0.8435,
"step": 599
},
{
"epoch": 0.23911527348809405,
"grad_norm": 0.5044732980078849,
"learning_rate": 3.495342278998406e-05,
"loss": 0.828,
"step": 600
},
{
"epoch": 0.23951379894390754,
"grad_norm": 0.4739216086634541,
"learning_rate": 3.493660667860131e-05,
"loss": 0.8077,
"step": 601
},
{
"epoch": 0.23991232439972104,
"grad_norm": 0.5102507150713297,
"learning_rate": 3.4919766656193576e-05,
"loss": 0.8558,
"step": 602
},
{
"epoch": 0.24031084985553453,
"grad_norm": 0.4940960518342556,
"learning_rate": 3.490290274971892e-05,
"loss": 0.8655,
"step": 603
},
{
"epoch": 0.240709375311348,
"grad_norm": 0.4976450093350724,
"learning_rate": 3.488601498617367e-05,
"loss": 0.8451,
"step": 604
},
{
"epoch": 0.24110790076716151,
"grad_norm": 0.5068077897232314,
"learning_rate": 3.486910339259231e-05,
"loss": 0.8424,
"step": 605
},
{
"epoch": 0.241506426222975,
"grad_norm": 0.40217075451363676,
"learning_rate": 3.485216799604752e-05,
"loss": 0.8766,
"step": 606
},
{
"epoch": 0.24190495167878848,
"grad_norm": 0.41405398864625936,
"learning_rate": 3.483520882365003e-05,
"loss": 0.8295,
"step": 607
},
{
"epoch": 0.24230347713460199,
"grad_norm": 0.45479094126766634,
"learning_rate": 3.4818225902548666e-05,
"loss": 0.8832,
"step": 608
},
{
"epoch": 0.24270200259041547,
"grad_norm": 0.44930048442037135,
"learning_rate": 3.480121925993026e-05,
"loss": 0.8775,
"step": 609
},
{
"epoch": 0.24310052804622895,
"grad_norm": 0.4305314388039683,
"learning_rate": 3.478418892301962e-05,
"loss": 0.8585,
"step": 610
},
{
"epoch": 0.24349905350204246,
"grad_norm": 0.42635507279318796,
"learning_rate": 3.47671349190795e-05,
"loss": 0.8748,
"step": 611
},
{
"epoch": 0.24389757895785594,
"grad_norm": 0.4470652779000305,
"learning_rate": 3.475005727541049e-05,
"loss": 0.8707,
"step": 612
},
{
"epoch": 0.24429610441366942,
"grad_norm": 0.6860613528881833,
"learning_rate": 3.4732956019351105e-05,
"loss": 0.8586,
"step": 613
},
{
"epoch": 0.24469462986948293,
"grad_norm": 0.4514815132734232,
"learning_rate": 3.471583117827758e-05,
"loss": 0.847,
"step": 614
},
{
"epoch": 0.2450931553252964,
"grad_norm": 0.405387622879431,
"learning_rate": 3.469868277960395e-05,
"loss": 0.8537,
"step": 615
},
{
"epoch": 0.2454916807811099,
"grad_norm": 0.45894128089045466,
"learning_rate": 3.468151085078196e-05,
"loss": 0.8329,
"step": 616
},
{
"epoch": 0.24589020623692337,
"grad_norm": 0.5102574940014621,
"learning_rate": 3.4664315419301e-05,
"loss": 0.8407,
"step": 617
},
{
"epoch": 0.24628873169273688,
"grad_norm": 0.44535784146833973,
"learning_rate": 3.464709651268811e-05,
"loss": 0.8503,
"step": 618
},
{
"epoch": 0.24668725714855036,
"grad_norm": 0.43055173741202407,
"learning_rate": 3.4629854158507884e-05,
"loss": 0.8685,
"step": 619
},
{
"epoch": 0.24708578260436384,
"grad_norm": 0.44729573957137375,
"learning_rate": 3.461258838436248e-05,
"loss": 0.8708,
"step": 620
},
{
"epoch": 0.24748430806017735,
"grad_norm": 0.4062311195130286,
"learning_rate": 3.459529921789153e-05,
"loss": 0.824,
"step": 621
},
{
"epoch": 0.24788283351599083,
"grad_norm": 0.4359478505964142,
"learning_rate": 3.457798668677211e-05,
"loss": 0.849,
"step": 622
},
{
"epoch": 0.2482813589718043,
"grad_norm": 0.4269566124271948,
"learning_rate": 3.456065081871871e-05,
"loss": 0.8504,
"step": 623
},
{
"epoch": 0.24867988442761782,
"grad_norm": 0.39280331015093617,
"learning_rate": 3.454329164148317e-05,
"loss": 0.8529,
"step": 624
},
{
"epoch": 0.2490784098834313,
"grad_norm": 0.414050219224192,
"learning_rate": 3.452590918285465e-05,
"loss": 0.871,
"step": 625
},
{
"epoch": 0.24947693533924478,
"grad_norm": 0.4021318325147454,
"learning_rate": 3.450850347065958e-05,
"loss": 0.841,
"step": 626
},
{
"epoch": 0.2498754607950583,
"grad_norm": 0.4120701796015395,
"learning_rate": 3.4491074532761614e-05,
"loss": 0.8261,
"step": 627
},
{
"epoch": 0.25027398625087177,
"grad_norm": 0.42792903386869047,
"learning_rate": 3.4473622397061576e-05,
"loss": 0.8366,
"step": 628
},
{
"epoch": 0.25067251170668525,
"grad_norm": 0.5026276371812628,
"learning_rate": 3.445614709149744e-05,
"loss": 0.8797,
"step": 629
},
{
"epoch": 0.25107103716249873,
"grad_norm": 0.42307765492760363,
"learning_rate": 3.443864864404427e-05,
"loss": 0.8333,
"step": 630
},
{
"epoch": 0.25146956261831227,
"grad_norm": 0.43146499355102447,
"learning_rate": 3.4421127082714165e-05,
"loss": 0.8745,
"step": 631
},
{
"epoch": 0.25186808807412575,
"grad_norm": 0.4232386337048391,
"learning_rate": 3.4403582435556235e-05,
"loss": 0.8615,
"step": 632
},
{
"epoch": 0.25226661352993923,
"grad_norm": 0.39549286132767947,
"learning_rate": 3.4386014730656554e-05,
"loss": 0.852,
"step": 633
},
{
"epoch": 0.2526651389857527,
"grad_norm": 0.37990676255356576,
"learning_rate": 3.436842399613808e-05,
"loss": 0.8667,
"step": 634
},
{
"epoch": 0.2530636644415662,
"grad_norm": 0.35437344682645827,
"learning_rate": 3.435081026016067e-05,
"loss": 0.8629,
"step": 635
},
{
"epoch": 0.2534621898973797,
"grad_norm": 0.4072267228198412,
"learning_rate": 3.433317355092098e-05,
"loss": 0.863,
"step": 636
},
{
"epoch": 0.2538607153531932,
"grad_norm": 0.4087915210981998,
"learning_rate": 3.431551389665246e-05,
"loss": 0.8629,
"step": 637
},
{
"epoch": 0.2542592408090067,
"grad_norm": 0.39541112177531035,
"learning_rate": 3.429783132562527e-05,
"loss": 0.8431,
"step": 638
},
{
"epoch": 0.2546577662648202,
"grad_norm": 0.449324447165349,
"learning_rate": 3.428012586614628e-05,
"loss": 0.8301,
"step": 639
},
{
"epoch": 0.25505629172063365,
"grad_norm": 0.438103934508987,
"learning_rate": 3.426239754655898e-05,
"loss": 0.8346,
"step": 640
},
{
"epoch": 0.25545481717644714,
"grad_norm": 0.4080543057741031,
"learning_rate": 3.4244646395243456e-05,
"loss": 0.8199,
"step": 641
},
{
"epoch": 0.2558533426322606,
"grad_norm": 0.4781805788640452,
"learning_rate": 3.422687244061636e-05,
"loss": 0.8396,
"step": 642
},
{
"epoch": 0.25625186808807415,
"grad_norm": 0.39665434242169373,
"learning_rate": 3.420907571113085e-05,
"loss": 0.8738,
"step": 643
},
{
"epoch": 0.25665039354388763,
"grad_norm": 0.44427399502026793,
"learning_rate": 3.419125623527651e-05,
"loss": 0.8276,
"step": 644
},
{
"epoch": 0.2570489189997011,
"grad_norm": 0.4971093826856599,
"learning_rate": 3.417341404157938e-05,
"loss": 0.844,
"step": 645
},
{
"epoch": 0.2574474444555146,
"grad_norm": 0.40433006793477544,
"learning_rate": 3.415554915860184e-05,
"loss": 0.8515,
"step": 646
},
{
"epoch": 0.2578459699113281,
"grad_norm": 0.41435122876017727,
"learning_rate": 3.413766161494259e-05,
"loss": 0.8504,
"step": 647
},
{
"epoch": 0.25824449536714156,
"grad_norm": 0.4025721405079423,
"learning_rate": 3.411975143923662e-05,
"loss": 0.8003,
"step": 648
},
{
"epoch": 0.25864302082295504,
"grad_norm": 0.4230151107223422,
"learning_rate": 3.410181866015515e-05,
"loss": 0.8253,
"step": 649
},
{
"epoch": 0.2590415462787686,
"grad_norm": 0.43018219174517974,
"learning_rate": 3.4083863306405576e-05,
"loss": 0.8494,
"step": 650
},
{
"epoch": 0.25944007173458206,
"grad_norm": 0.5580571782658815,
"learning_rate": 3.406588540673143e-05,
"loss": 0.839,
"step": 651
},
{
"epoch": 0.25983859719039554,
"grad_norm": 0.40240838407878654,
"learning_rate": 3.4047884989912355e-05,
"loss": 0.8295,
"step": 652
},
{
"epoch": 0.260237122646209,
"grad_norm": 0.42705376431218756,
"learning_rate": 3.402986208476401e-05,
"loss": 0.8513,
"step": 653
},
{
"epoch": 0.2606356481020225,
"grad_norm": 0.37891252038962947,
"learning_rate": 3.4011816720138076e-05,
"loss": 0.8551,
"step": 654
},
{
"epoch": 0.261034173557836,
"grad_norm": 0.4742754786354608,
"learning_rate": 3.39937489249222e-05,
"loss": 0.8494,
"step": 655
},
{
"epoch": 0.2614326990136495,
"grad_norm": 0.5757481855161607,
"learning_rate": 3.3975658728039894e-05,
"loss": 0.866,
"step": 656
},
{
"epoch": 0.261831224469463,
"grad_norm": 0.41879176964003356,
"learning_rate": 3.395754615845057e-05,
"loss": 0.8199,
"step": 657
},
{
"epoch": 0.2622297499252765,
"grad_norm": 0.3977116381507401,
"learning_rate": 3.393941124514944e-05,
"loss": 0.8464,
"step": 658
},
{
"epoch": 0.26262827538108996,
"grad_norm": 0.4361036030052378,
"learning_rate": 3.3921254017167485e-05,
"loss": 0.8554,
"step": 659
},
{
"epoch": 0.26302680083690344,
"grad_norm": 0.36947748546095344,
"learning_rate": 3.3903074503571414e-05,
"loss": 0.8332,
"step": 660
},
{
"epoch": 0.2634253262927169,
"grad_norm": 0.39322680162826995,
"learning_rate": 3.3884872733463605e-05,
"loss": 0.8522,
"step": 661
},
{
"epoch": 0.26382385174853046,
"grad_norm": 0.4426408711257021,
"learning_rate": 3.386664873598206e-05,
"loss": 0.8439,
"step": 662
},
{
"epoch": 0.26422237720434394,
"grad_norm": 0.40481569528280453,
"learning_rate": 3.384840254030039e-05,
"loss": 0.8463,
"step": 663
},
{
"epoch": 0.2646209026601574,
"grad_norm": 0.486897366169285,
"learning_rate": 3.3830134175627694e-05,
"loss": 0.8383,
"step": 664
},
{
"epoch": 0.2650194281159709,
"grad_norm": 0.4124318747978423,
"learning_rate": 3.3811843671208604e-05,
"loss": 0.8341,
"step": 665
},
{
"epoch": 0.2654179535717844,
"grad_norm": 0.4480853051751989,
"learning_rate": 3.379353105632318e-05,
"loss": 0.8719,
"step": 666
},
{
"epoch": 0.26581647902759786,
"grad_norm": 0.4075223126165696,
"learning_rate": 3.3775196360286864e-05,
"loss": 0.825,
"step": 667
},
{
"epoch": 0.2662150044834114,
"grad_norm": 0.4598432178350243,
"learning_rate": 3.375683961245047e-05,
"loss": 0.8459,
"step": 668
},
{
"epoch": 0.2666135299392249,
"grad_norm": 0.4747860282082611,
"learning_rate": 3.3738460842200095e-05,
"loss": 0.8448,
"step": 669
},
{
"epoch": 0.26701205539503836,
"grad_norm": 0.42550536631714303,
"learning_rate": 3.37200600789571e-05,
"loss": 0.8482,
"step": 670
},
{
"epoch": 0.26741058085085184,
"grad_norm": 0.5014696923841511,
"learning_rate": 3.3701637352178035e-05,
"loss": 0.839,
"step": 671
},
{
"epoch": 0.2678091063066653,
"grad_norm": 0.44071644150719574,
"learning_rate": 3.368319269135464e-05,
"loss": 0.8499,
"step": 672
},
{
"epoch": 0.2682076317624788,
"grad_norm": 0.45694183948733363,
"learning_rate": 3.366472612601374e-05,
"loss": 0.8495,
"step": 673
},
{
"epoch": 0.26860615721829234,
"grad_norm": 0.45776428701146005,
"learning_rate": 3.364623768571725e-05,
"loss": 0.8683,
"step": 674
},
{
"epoch": 0.2690046826741058,
"grad_norm": 0.4300670256635499,
"learning_rate": 3.3627727400062074e-05,
"loss": 0.8409,
"step": 675
},
{
"epoch": 0.2694032081299193,
"grad_norm": 0.4522484813223993,
"learning_rate": 3.360919529868012e-05,
"loss": 0.8549,
"step": 676
},
{
"epoch": 0.2698017335857328,
"grad_norm": 0.46483110883882417,
"learning_rate": 3.3590641411238184e-05,
"loss": 0.8316,
"step": 677
},
{
"epoch": 0.27020025904154626,
"grad_norm": 0.46516087115887955,
"learning_rate": 3.3572065767437974e-05,
"loss": 0.847,
"step": 678
},
{
"epoch": 0.27059878449735975,
"grad_norm": 0.4870114489474851,
"learning_rate": 3.355346839701601e-05,
"loss": 0.866,
"step": 679
},
{
"epoch": 0.2709973099531733,
"grad_norm": 0.4112151077893339,
"learning_rate": 3.353484932974357e-05,
"loss": 0.8747,
"step": 680
},
{
"epoch": 0.27139583540898676,
"grad_norm": 0.39988331169551145,
"learning_rate": 3.35162085954267e-05,
"loss": 0.8491,
"step": 681
},
{
"epoch": 0.27179436086480024,
"grad_norm": 0.4580861040010356,
"learning_rate": 3.3497546223906114e-05,
"loss": 0.8373,
"step": 682
},
{
"epoch": 0.2721928863206137,
"grad_norm": 0.4676988585541286,
"learning_rate": 3.347886224505718e-05,
"loss": 0.8562,
"step": 683
},
{
"epoch": 0.2725914117764272,
"grad_norm": 0.3815018026041965,
"learning_rate": 3.346015668878982e-05,
"loss": 0.8865,
"step": 684
},
{
"epoch": 0.2729899372322407,
"grad_norm": 0.3853282548165928,
"learning_rate": 3.3441429585048544e-05,
"loss": 0.8451,
"step": 685
},
{
"epoch": 0.2733884626880542,
"grad_norm": 0.46857379361810175,
"learning_rate": 3.342268096381233e-05,
"loss": 0.8343,
"step": 686
},
{
"epoch": 0.2737869881438677,
"grad_norm": 0.44893908766670865,
"learning_rate": 3.340391085509458e-05,
"loss": 0.8425,
"step": 687
},
{
"epoch": 0.2741855135996812,
"grad_norm": 0.4623804261603112,
"learning_rate": 3.338511928894315e-05,
"loss": 0.8752,
"step": 688
},
{
"epoch": 0.27458403905549467,
"grad_norm": 0.40030690241398437,
"learning_rate": 3.3366306295440195e-05,
"loss": 0.8854,
"step": 689
},
{
"epoch": 0.27498256451130815,
"grad_norm": 0.41617160670796793,
"learning_rate": 3.3347471904702196e-05,
"loss": 0.8976,
"step": 690
},
{
"epoch": 0.27538108996712163,
"grad_norm": 0.4056939768327828,
"learning_rate": 3.3328616146879886e-05,
"loss": 0.872,
"step": 691
},
{
"epoch": 0.27577961542293516,
"grad_norm": 0.37847852674838545,
"learning_rate": 3.33097390521582e-05,
"loss": 0.8155,
"step": 692
},
{
"epoch": 0.27617814087874865,
"grad_norm": 0.35872927161364443,
"learning_rate": 3.329084065075622e-05,
"loss": 0.8273,
"step": 693
},
{
"epoch": 0.2765766663345621,
"grad_norm": 0.39096155431724333,
"learning_rate": 3.327192097292715e-05,
"loss": 0.8581,
"step": 694
},
{
"epoch": 0.2769751917903756,
"grad_norm": 0.3861177159461641,
"learning_rate": 3.325298004895826e-05,
"loss": 0.8132,
"step": 695
},
{
"epoch": 0.2773737172461891,
"grad_norm": 0.4171747417597138,
"learning_rate": 3.323401790917082e-05,
"loss": 0.8347,
"step": 696
},
{
"epoch": 0.27777224270200257,
"grad_norm": 0.364670807824471,
"learning_rate": 3.321503458392005e-05,
"loss": 0.8415,
"step": 697
},
{
"epoch": 0.2781707681578161,
"grad_norm": 0.331401074927844,
"learning_rate": 3.3196030103595105e-05,
"loss": 0.8459,
"step": 698
},
{
"epoch": 0.2785692936136296,
"grad_norm": 0.43255738046602604,
"learning_rate": 3.317700449861901e-05,
"loss": 0.8335,
"step": 699
},
{
"epoch": 0.27896781906944307,
"grad_norm": 0.33456506773762923,
"learning_rate": 3.315795779944858e-05,
"loss": 0.8647,
"step": 700
},
{
"epoch": 0.27936634452525655,
"grad_norm": 0.3715707582620995,
"learning_rate": 3.313889003657443e-05,
"loss": 0.8547,
"step": 701
},
{
"epoch": 0.27976486998107003,
"grad_norm": 0.3331498560093925,
"learning_rate": 3.311980124052087e-05,
"loss": 0.8447,
"step": 702
},
{
"epoch": 0.2801633954368835,
"grad_norm": 0.4038630202134111,
"learning_rate": 3.3100691441845896e-05,
"loss": 0.8247,
"step": 703
},
{
"epoch": 0.28056192089269705,
"grad_norm": 0.365237203718338,
"learning_rate": 3.308156067114111e-05,
"loss": 0.8737,
"step": 704
},
{
"epoch": 0.28096044634851053,
"grad_norm": 0.4002592791047349,
"learning_rate": 3.3062408959031715e-05,
"loss": 0.8478,
"step": 705
},
{
"epoch": 0.281358971804324,
"grad_norm": 0.34357520687563103,
"learning_rate": 3.304323633617641e-05,
"loss": 0.8233,
"step": 706
},
{
"epoch": 0.2817574972601375,
"grad_norm": 0.3505454925796206,
"learning_rate": 3.3024042833267357e-05,
"loss": 0.8281,
"step": 707
},
{
"epoch": 0.28215602271595097,
"grad_norm": 0.35854787844493347,
"learning_rate": 3.3004828481030197e-05,
"loss": 0.8314,
"step": 708
},
{
"epoch": 0.28255454817176445,
"grad_norm": 0.3633810116569549,
"learning_rate": 3.2985593310223905e-05,
"loss": 0.8337,
"step": 709
},
{
"epoch": 0.282953073627578,
"grad_norm": 0.40905086354028014,
"learning_rate": 3.296633735164078e-05,
"loss": 0.8278,
"step": 710
},
{
"epoch": 0.28335159908339147,
"grad_norm": 0.39198864644450826,
"learning_rate": 3.294706063610642e-05,
"loss": 0.8495,
"step": 711
},
{
"epoch": 0.28375012453920495,
"grad_norm": 0.39676678952183586,
"learning_rate": 3.292776319447965e-05,
"loss": 0.841,
"step": 712
},
{
"epoch": 0.28414864999501843,
"grad_norm": 0.4904457094152149,
"learning_rate": 3.290844505765246e-05,
"loss": 0.8538,
"step": 713
},
{
"epoch": 0.2845471754508319,
"grad_norm": 0.38619534462184524,
"learning_rate": 3.288910625654997e-05,
"loss": 0.831,
"step": 714
},
{
"epoch": 0.2849457009066454,
"grad_norm": 0.3965911327088796,
"learning_rate": 3.28697468221304e-05,
"loss": 0.855,
"step": 715
},
{
"epoch": 0.2853442263624589,
"grad_norm": 0.4104504182776709,
"learning_rate": 3.2850366785384975e-05,
"loss": 0.8312,
"step": 716
},
{
"epoch": 0.2857427518182724,
"grad_norm": 0.39320803615560024,
"learning_rate": 3.2830966177337926e-05,
"loss": 0.8256,
"step": 717
},
{
"epoch": 0.2861412772740859,
"grad_norm": 0.36766055059184494,
"learning_rate": 3.281154502904639e-05,
"loss": 0.8612,
"step": 718
},
{
"epoch": 0.2865398027298994,
"grad_norm": 0.3523821293496536,
"learning_rate": 3.279210337160041e-05,
"loss": 0.8546,
"step": 719
},
{
"epoch": 0.28693832818571285,
"grad_norm": 0.4303479446087632,
"learning_rate": 3.277264123612283e-05,
"loss": 0.843,
"step": 720
},
{
"epoch": 0.28733685364152634,
"grad_norm": 0.37256602383763016,
"learning_rate": 3.275315865376932e-05,
"loss": 0.8525,
"step": 721
},
{
"epoch": 0.2877353790973398,
"grad_norm": 0.372312406331151,
"learning_rate": 3.273365565572824e-05,
"loss": 0.8718,
"step": 722
},
{
"epoch": 0.28813390455315335,
"grad_norm": 0.3748404787253373,
"learning_rate": 3.271413227322064e-05,
"loss": 0.8284,
"step": 723
},
{
"epoch": 0.28853243000896683,
"grad_norm": 0.40949697147874353,
"learning_rate": 3.269458853750023e-05,
"loss": 0.8342,
"step": 724
},
{
"epoch": 0.2889309554647803,
"grad_norm": 0.35759282756001504,
"learning_rate": 3.267502447985328e-05,
"loss": 0.8376,
"step": 725
},
{
"epoch": 0.2893294809205938,
"grad_norm": 0.424890270877448,
"learning_rate": 3.2655440131598585e-05,
"loss": 0.8144,
"step": 726
},
{
"epoch": 0.2897280063764073,
"grad_norm": 0.37228222071530115,
"learning_rate": 3.263583552408744e-05,
"loss": 0.8203,
"step": 727
},
{
"epoch": 0.29012653183222076,
"grad_norm": 0.36804439864776206,
"learning_rate": 3.261621068870355e-05,
"loss": 0.8436,
"step": 728
},
{
"epoch": 0.2905250572880343,
"grad_norm": 0.4010864307131854,
"learning_rate": 3.2596565656863036e-05,
"loss": 0.8211,
"step": 729
},
{
"epoch": 0.2909235827438478,
"grad_norm": 0.43321148633091444,
"learning_rate": 3.257690046001431e-05,
"loss": 0.8659,
"step": 730
},
{
"epoch": 0.29132210819966126,
"grad_norm": 0.37678425829862483,
"learning_rate": 3.255721512963811e-05,
"loss": 0.8549,
"step": 731
},
{
"epoch": 0.29172063365547474,
"grad_norm": 0.38473774610717565,
"learning_rate": 3.253750969724735e-05,
"loss": 0.8584,
"step": 732
},
{
"epoch": 0.2921191591112882,
"grad_norm": 0.3274732323738536,
"learning_rate": 3.251778419438716e-05,
"loss": 0.8197,
"step": 733
},
{
"epoch": 0.2925176845671017,
"grad_norm": 0.37385182013341806,
"learning_rate": 3.2498038652634797e-05,
"loss": 0.8485,
"step": 734
},
{
"epoch": 0.29291621002291524,
"grad_norm": 0.37571422954043315,
"learning_rate": 3.2478273103599587e-05,
"loss": 0.8131,
"step": 735
},
{
"epoch": 0.2933147354787287,
"grad_norm": 1.0205773925944017,
"learning_rate": 3.24584875789229e-05,
"loss": 0.8122,
"step": 736
},
{
"epoch": 0.2937132609345422,
"grad_norm": 0.397474423244844,
"learning_rate": 3.243868211027807e-05,
"loss": 0.8575,
"step": 737
},
{
"epoch": 0.2941117863903557,
"grad_norm": 0.35542654634964194,
"learning_rate": 3.241885672937034e-05,
"loss": 0.8459,
"step": 738
},
{
"epoch": 0.29451031184616916,
"grad_norm": 0.7079812695011942,
"learning_rate": 3.239901146793688e-05,
"loss": 0.8235,
"step": 739
},
{
"epoch": 0.29490883730198264,
"grad_norm": 0.40472908559410964,
"learning_rate": 3.237914635774664e-05,
"loss": 0.8358,
"step": 740
},
{
"epoch": 0.2953073627577962,
"grad_norm": 0.6704919581462614,
"learning_rate": 3.235926143060036e-05,
"loss": 0.881,
"step": 741
},
{
"epoch": 0.29570588821360966,
"grad_norm": 0.373533664396295,
"learning_rate": 3.23393567183305e-05,
"loss": 0.853,
"step": 742
},
{
"epoch": 0.29610441366942314,
"grad_norm": 0.4047009515080516,
"learning_rate": 3.231943225280121e-05,
"loss": 0.8569,
"step": 743
},
{
"epoch": 0.2965029391252366,
"grad_norm": 0.3877536209778869,
"learning_rate": 3.229948806590824e-05,
"loss": 0.835,
"step": 744
},
{
"epoch": 0.2969014645810501,
"grad_norm": 0.4714038839534881,
"learning_rate": 3.227952418957892e-05,
"loss": 0.868,
"step": 745
},
{
"epoch": 0.2972999900368636,
"grad_norm": 0.4463329373269963,
"learning_rate": 3.225954065577209e-05,
"loss": 0.848,
"step": 746
},
{
"epoch": 0.2976985154926771,
"grad_norm": 0.42587530691745,
"learning_rate": 3.223953749647807e-05,
"loss": 0.8607,
"step": 747
},
{
"epoch": 0.2980970409484906,
"grad_norm": 0.4379931392773523,
"learning_rate": 3.221951474371861e-05,
"loss": 0.813,
"step": 748
},
{
"epoch": 0.2984955664043041,
"grad_norm": 0.38309480692550185,
"learning_rate": 3.2199472429546785e-05,
"loss": 0.8474,
"step": 749
},
{
"epoch": 0.29889409186011756,
"grad_norm": 0.3616798063850079,
"learning_rate": 3.2179410586047025e-05,
"loss": 0.8154,
"step": 750
},
{
"epoch": 0.29929261731593104,
"grad_norm": 0.3747541200969163,
"learning_rate": 3.215932924533501e-05,
"loss": 0.8378,
"step": 751
},
{
"epoch": 0.2996911427717445,
"grad_norm": 0.38031077846694633,
"learning_rate": 3.213922843955762e-05,
"loss": 0.8543,
"step": 752
},
{
"epoch": 0.30008966822755806,
"grad_norm": 0.41068418371221344,
"learning_rate": 3.21191082008929e-05,
"loss": 0.8392,
"step": 753
},
{
"epoch": 0.30048819368337154,
"grad_norm": 0.3644597909816924,
"learning_rate": 3.2098968561550024e-05,
"loss": 0.8061,
"step": 754
},
{
"epoch": 0.300886719139185,
"grad_norm": 0.37311229876996665,
"learning_rate": 3.2078809553769195e-05,
"loss": 0.8693,
"step": 755
},
{
"epoch": 0.3012852445949985,
"grad_norm": 0.45016158998524075,
"learning_rate": 3.205863120982164e-05,
"loss": 0.8602,
"step": 756
},
{
"epoch": 0.301683770050812,
"grad_norm": 0.42629280896654315,
"learning_rate": 3.203843356200952e-05,
"loss": 0.8532,
"step": 757
},
{
"epoch": 0.30208229550662546,
"grad_norm": 0.4110371155650319,
"learning_rate": 3.201821664266595e-05,
"loss": 0.8451,
"step": 758
},
{
"epoch": 0.302480820962439,
"grad_norm": 0.4192137078636866,
"learning_rate": 3.199798048415481e-05,
"loss": 0.8436,
"step": 759
},
{
"epoch": 0.3028793464182525,
"grad_norm": 0.4446866796453996,
"learning_rate": 3.197772511887086e-05,
"loss": 0.8235,
"step": 760
},
{
"epoch": 0.30327787187406596,
"grad_norm": 0.433556905913176,
"learning_rate": 3.195745057923957e-05,
"loss": 0.8603,
"step": 761
},
{
"epoch": 0.30367639732987944,
"grad_norm": 0.4114711662961495,
"learning_rate": 3.193715689771709e-05,
"loss": 0.838,
"step": 762
},
{
"epoch": 0.3040749227856929,
"grad_norm": 0.3926214986996156,
"learning_rate": 3.191684410679025e-05,
"loss": 0.8502,
"step": 763
},
{
"epoch": 0.3044734482415064,
"grad_norm": 0.4139928341021709,
"learning_rate": 3.189651223897644e-05,
"loss": 0.8385,
"step": 764
},
{
"epoch": 0.30487197369731994,
"grad_norm": 0.4129548938591373,
"learning_rate": 3.1876161326823615e-05,
"loss": 0.8791,
"step": 765
},
{
"epoch": 0.3052704991531334,
"grad_norm": 0.3955272894598311,
"learning_rate": 3.185579140291019e-05,
"loss": 0.8384,
"step": 766
},
{
"epoch": 0.3056690246089469,
"grad_norm": 0.3585005878079346,
"learning_rate": 3.183540249984504e-05,
"loss": 0.8132,
"step": 767
},
{
"epoch": 0.3060675500647604,
"grad_norm": 0.4212205077030527,
"learning_rate": 3.18149946502674e-05,
"loss": 0.8308,
"step": 768
},
{
"epoch": 0.30646607552057387,
"grad_norm": 0.3638728218380253,
"learning_rate": 3.179456788684685e-05,
"loss": 0.8097,
"step": 769
},
{
"epoch": 0.30686460097638735,
"grad_norm": 0.4024379131636804,
"learning_rate": 3.1774122242283236e-05,
"loss": 0.8401,
"step": 770
},
{
"epoch": 0.3072631264322009,
"grad_norm": 0.4132201236498637,
"learning_rate": 3.175365774930665e-05,
"loss": 0.8111,
"step": 771
},
{
"epoch": 0.30766165188801436,
"grad_norm": 0.3525657580163014,
"learning_rate": 3.1733174440677346e-05,
"loss": 0.8201,
"step": 772
},
{
"epoch": 0.30806017734382785,
"grad_norm": 0.35083612349906135,
"learning_rate": 3.171267234918568e-05,
"loss": 0.815,
"step": 773
},
{
"epoch": 0.3084587027996413,
"grad_norm": 0.4002385012230293,
"learning_rate": 3.169215150765211e-05,
"loss": 0.8168,
"step": 774
},
{
"epoch": 0.3088572282554548,
"grad_norm": 0.3629878196057507,
"learning_rate": 3.1671611948927074e-05,
"loss": 0.8367,
"step": 775
},
{
"epoch": 0.3092557537112683,
"grad_norm": 0.34583616562695413,
"learning_rate": 3.165105370589102e-05,
"loss": 0.8253,
"step": 776
},
{
"epoch": 0.3096542791670818,
"grad_norm": 0.3607827655628309,
"learning_rate": 3.1630476811454246e-05,
"loss": 0.8284,
"step": 777
},
{
"epoch": 0.3100528046228953,
"grad_norm": 0.37546990727594654,
"learning_rate": 3.160988129855697e-05,
"loss": 0.8376,
"step": 778
},
{
"epoch": 0.3104513300787088,
"grad_norm": 0.39969610145426393,
"learning_rate": 3.158926720016917e-05,
"loss": 0.8516,
"step": 779
},
{
"epoch": 0.31084985553452227,
"grad_norm": 0.36953469600153793,
"learning_rate": 3.156863454929059e-05,
"loss": 0.8236,
"step": 780
},
{
"epoch": 0.31124838099033575,
"grad_norm": 0.33928479120444516,
"learning_rate": 3.154798337895067e-05,
"loss": 0.8443,
"step": 781
},
{
"epoch": 0.31164690644614923,
"grad_norm": 0.3966330597527675,
"learning_rate": 3.152731372220852e-05,
"loss": 0.8188,
"step": 782
},
{
"epoch": 0.3120454319019627,
"grad_norm": 0.3946127272938953,
"learning_rate": 3.1506625612152814e-05,
"loss": 0.832,
"step": 783
},
{
"epoch": 0.31244395735777625,
"grad_norm": 0.3785322567375632,
"learning_rate": 3.148591908190178e-05,
"loss": 0.8393,
"step": 784
},
{
"epoch": 0.31284248281358973,
"grad_norm": 0.36331251784056434,
"learning_rate": 3.1465194164603135e-05,
"loss": 0.8403,
"step": 785
},
{
"epoch": 0.3132410082694032,
"grad_norm": 0.35684726071521566,
"learning_rate": 3.1444450893434025e-05,
"loss": 0.8464,
"step": 786
},
{
"epoch": 0.3136395337252167,
"grad_norm": 0.33346839612618157,
"learning_rate": 3.142368930160098e-05,
"loss": 0.8607,
"step": 787
},
{
"epoch": 0.31403805918103017,
"grad_norm": 0.34733144268906585,
"learning_rate": 3.140290942233985e-05,
"loss": 0.858,
"step": 788
},
{
"epoch": 0.31443658463684365,
"grad_norm": 0.3523769266485713,
"learning_rate": 3.138211128891578e-05,
"loss": 0.8245,
"step": 789
},
{
"epoch": 0.3148351100926572,
"grad_norm": 0.3491121768861967,
"learning_rate": 3.136129493462312e-05,
"loss": 0.8394,
"step": 790
},
{
"epoch": 0.31523363554847067,
"grad_norm": 0.3878058197741651,
"learning_rate": 3.134046039278539e-05,
"loss": 0.8406,
"step": 791
},
{
"epoch": 0.31563216100428415,
"grad_norm": 0.3331713976353916,
"learning_rate": 3.131960769675524e-05,
"loss": 0.8205,
"step": 792
},
{
"epoch": 0.31603068646009763,
"grad_norm": 0.3902176893077025,
"learning_rate": 3.1298736879914364e-05,
"loss": 0.8634,
"step": 793
},
{
"epoch": 0.3164292119159111,
"grad_norm": 0.39518447785038,
"learning_rate": 3.127784797567347e-05,
"loss": 0.8298,
"step": 794
},
{
"epoch": 0.3168277373717246,
"grad_norm": 0.3422487336442997,
"learning_rate": 3.125694101747222e-05,
"loss": 0.8613,
"step": 795
},
{
"epoch": 0.31722626282753813,
"grad_norm": 0.33332846452402065,
"learning_rate": 3.123601603877918e-05,
"loss": 0.8502,
"step": 796
},
{
"epoch": 0.3176247882833516,
"grad_norm": 0.6423101526850392,
"learning_rate": 3.121507307309178e-05,
"loss": 0.8338,
"step": 797
},
{
"epoch": 0.3180233137391651,
"grad_norm": 0.38531993142674054,
"learning_rate": 3.11941121539362e-05,
"loss": 0.7963,
"step": 798
},
{
"epoch": 0.3184218391949786,
"grad_norm": 0.3592316503041697,
"learning_rate": 3.1173133314867414e-05,
"loss": 0.8411,
"step": 799
},
{
"epoch": 0.31882036465079205,
"grad_norm": 0.3598280004430287,
"learning_rate": 3.115213658946904e-05,
"loss": 0.8336,
"step": 800
},
{
"epoch": 0.31921889010660554,
"grad_norm": 0.3496111681067253,
"learning_rate": 3.113112201135335e-05,
"loss": 0.8574,
"step": 801
},
{
"epoch": 0.31961741556241907,
"grad_norm": 0.3664242703958735,
"learning_rate": 3.11100896141612e-05,
"loss": 0.8436,
"step": 802
},
{
"epoch": 0.32001594101823255,
"grad_norm": 0.32787991821140705,
"learning_rate": 3.108903943156194e-05,
"loss": 0.8489,
"step": 803
},
{
"epoch": 0.32041446647404603,
"grad_norm": 0.40557517482435224,
"learning_rate": 3.106797149725344e-05,
"loss": 0.8237,
"step": 804
},
{
"epoch": 0.3208129919298595,
"grad_norm": 0.37518817153121636,
"learning_rate": 3.1046885844961946e-05,
"loss": 0.8274,
"step": 805
},
{
"epoch": 0.321211517385673,
"grad_norm": 0.37714764259452016,
"learning_rate": 3.102578250844209e-05,
"loss": 0.8331,
"step": 806
},
{
"epoch": 0.3216100428414865,
"grad_norm": 0.37798047544093105,
"learning_rate": 3.10046615214768e-05,
"loss": 0.8502,
"step": 807
},
{
"epoch": 0.3220085682973,
"grad_norm": 0.4109920014418336,
"learning_rate": 3.098352291787728e-05,
"loss": 0.8227,
"step": 808
},
{
"epoch": 0.3224070937531135,
"grad_norm": 0.4499775221189975,
"learning_rate": 3.09623667314829e-05,
"loss": 0.8247,
"step": 809
},
{
"epoch": 0.322805619208927,
"grad_norm": 0.331922156881542,
"learning_rate": 3.0941192996161215e-05,
"loss": 0.7928,
"step": 810
},
{
"epoch": 0.32320414466474046,
"grad_norm": 0.34248930965498,
"learning_rate": 3.092000174580785e-05,
"loss": 0.8432,
"step": 811
},
{
"epoch": 0.32360267012055394,
"grad_norm": 0.35843509172736904,
"learning_rate": 3.089879301434648e-05,
"loss": 0.8477,
"step": 812
},
{
"epoch": 0.3240011955763674,
"grad_norm": 0.3683897489622322,
"learning_rate": 3.0877566835728755e-05,
"loss": 0.8091,
"step": 813
},
{
"epoch": 0.32439972103218095,
"grad_norm": 0.5313658405862416,
"learning_rate": 3.0856323243934255e-05,
"loss": 0.8279,
"step": 814
},
{
"epoch": 0.32479824648799444,
"grad_norm": 0.37323472384352163,
"learning_rate": 3.083506227297045e-05,
"loss": 0.8326,
"step": 815
},
{
"epoch": 0.3251967719438079,
"grad_norm": 0.39228250684825317,
"learning_rate": 3.0813783956872615e-05,
"loss": 0.8294,
"step": 816
},
{
"epoch": 0.3255952973996214,
"grad_norm": 0.3652945541655549,
"learning_rate": 3.07924883297038e-05,
"loss": 0.846,
"step": 817
},
{
"epoch": 0.3259938228554349,
"grad_norm": 0.3659044128890069,
"learning_rate": 3.0771175425554766e-05,
"loss": 0.8204,
"step": 818
},
{
"epoch": 0.32639234831124836,
"grad_norm": 0.4707331446693342,
"learning_rate": 3.074984527854392e-05,
"loss": 0.8163,
"step": 819
},
{
"epoch": 0.3267908737670619,
"grad_norm": 0.3606528922605574,
"learning_rate": 3.072849792281731e-05,
"loss": 0.8334,
"step": 820
},
{
"epoch": 0.3271893992228754,
"grad_norm": 0.3770070622615337,
"learning_rate": 3.0707133392548474e-05,
"loss": 0.8224,
"step": 821
},
{
"epoch": 0.32758792467868886,
"grad_norm": 0.359650139273174,
"learning_rate": 3.068575172193849e-05,
"loss": 0.8534,
"step": 822
},
{
"epoch": 0.32798645013450234,
"grad_norm": 0.3456572438444792,
"learning_rate": 3.066435294521584e-05,
"loss": 0.889,
"step": 823
},
{
"epoch": 0.3283849755903158,
"grad_norm": 0.3918222247018766,
"learning_rate": 3.064293709663645e-05,
"loss": 0.7898,
"step": 824
},
{
"epoch": 0.3287835010461293,
"grad_norm": 0.4247237481434523,
"learning_rate": 3.0621504210483495e-05,
"loss": 0.8535,
"step": 825
},
{
"epoch": 0.32918202650194284,
"grad_norm": 0.36874426839954455,
"learning_rate": 3.0600054321067486e-05,
"loss": 0.8336,
"step": 826
},
{
"epoch": 0.3295805519577563,
"grad_norm": 0.4207632539441216,
"learning_rate": 3.057858746272611e-05,
"loss": 0.841,
"step": 827
},
{
"epoch": 0.3299790774135698,
"grad_norm": 0.38496904071215293,
"learning_rate": 3.055710366982427e-05,
"loss": 0.8195,
"step": 828
},
{
"epoch": 0.3303776028693833,
"grad_norm": 0.4663868777863652,
"learning_rate": 3.053560297675392e-05,
"loss": 0.8419,
"step": 829
},
{
"epoch": 0.33077612832519676,
"grad_norm": 0.5264881698443798,
"learning_rate": 3.0514085417934112e-05,
"loss": 0.8017,
"step": 830
},
{
"epoch": 0.33117465378101024,
"grad_norm": 0.4647249062040843,
"learning_rate": 3.0492551027810876e-05,
"loss": 0.8468,
"step": 831
},
{
"epoch": 0.3315731792368238,
"grad_norm": 0.305099119380529,
"learning_rate": 3.04709998408572e-05,
"loss": 0.7996,
"step": 832
},
{
"epoch": 0.33197170469263726,
"grad_norm": 0.46977408947791516,
"learning_rate": 3.0449431891572936e-05,
"loss": 0.8474,
"step": 833
},
{
"epoch": 0.33237023014845074,
"grad_norm": 0.44745094401575514,
"learning_rate": 3.0427847214484804e-05,
"loss": 0.8349,
"step": 834
},
{
"epoch": 0.3327687556042642,
"grad_norm": 0.3543202737692515,
"learning_rate": 3.0406245844146273e-05,
"loss": 0.8253,
"step": 835
},
{
"epoch": 0.3331672810600777,
"grad_norm": 0.3933697240001331,
"learning_rate": 3.0384627815137553e-05,
"loss": 0.8125,
"step": 836
},
{
"epoch": 0.3335658065158912,
"grad_norm": 0.5148260560348337,
"learning_rate": 3.0362993162065516e-05,
"loss": 0.8627,
"step": 837
},
{
"epoch": 0.3339643319717047,
"grad_norm": 0.48857380870627215,
"learning_rate": 3.034134191956364e-05,
"loss": 0.8236,
"step": 838
},
{
"epoch": 0.3343628574275182,
"grad_norm": 0.4056146666480351,
"learning_rate": 3.0319674122291977e-05,
"loss": 0.8302,
"step": 839
},
{
"epoch": 0.3347613828833317,
"grad_norm": 0.3904977617394034,
"learning_rate": 3.0297989804937057e-05,
"loss": 0.8167,
"step": 840
},
{
"epoch": 0.33515990833914516,
"grad_norm": 0.5044251985190126,
"learning_rate": 3.027628900221187e-05,
"loss": 0.8233,
"step": 841
},
{
"epoch": 0.33555843379495864,
"grad_norm": 0.4250841928547596,
"learning_rate": 3.025457174885581e-05,
"loss": 0.8281,
"step": 842
},
{
"epoch": 0.3359569592507721,
"grad_norm": 0.3823907908471619,
"learning_rate": 3.0232838079634575e-05,
"loss": 0.8242,
"step": 843
},
{
"epoch": 0.3363554847065856,
"grad_norm": 0.42945934078552406,
"learning_rate": 3.0211088029340154e-05,
"loss": 0.8354,
"step": 844
},
{
"epoch": 0.33675401016239914,
"grad_norm": 0.42902292521578395,
"learning_rate": 3.018932163279078e-05,
"loss": 0.833,
"step": 845
},
{
"epoch": 0.3371525356182126,
"grad_norm": 0.3937451062114422,
"learning_rate": 3.016753892483083e-05,
"loss": 0.7891,
"step": 846
},
{
"epoch": 0.3375510610740261,
"grad_norm": 0.3540399272237491,
"learning_rate": 3.0145739940330786e-05,
"loss": 0.8573,
"step": 847
},
{
"epoch": 0.3379495865298396,
"grad_norm": 0.4084630243877346,
"learning_rate": 3.0123924714187214e-05,
"loss": 0.8234,
"step": 848
},
{
"epoch": 0.33834811198565307,
"grad_norm": 0.42274333879010845,
"learning_rate": 3.0102093281322666e-05,
"loss": 0.8212,
"step": 849
},
{
"epoch": 0.33874663744146655,
"grad_norm": 0.3321533474722135,
"learning_rate": 3.008024567668563e-05,
"loss": 0.8173,
"step": 850
},
{
"epoch": 0.3391451628972801,
"grad_norm": 0.3692564529574208,
"learning_rate": 3.0058381935250495e-05,
"loss": 0.8557,
"step": 851
},
{
"epoch": 0.33954368835309356,
"grad_norm": 0.39610202569549047,
"learning_rate": 3.0036502092017473e-05,
"loss": 0.8654,
"step": 852
},
{
"epoch": 0.33994221380890705,
"grad_norm": 0.3661238023568551,
"learning_rate": 3.0014606182012566e-05,
"loss": 0.8727,
"step": 853
},
{
"epoch": 0.3403407392647205,
"grad_norm": 0.3872040100330332,
"learning_rate": 2.9992694240287474e-05,
"loss": 0.8291,
"step": 854
},
{
"epoch": 0.340739264720534,
"grad_norm": 0.3974606504195108,
"learning_rate": 2.9970766301919583e-05,
"loss": 0.8679,
"step": 855
},
{
"epoch": 0.3411377901763475,
"grad_norm": 0.3938746707369231,
"learning_rate": 2.994882240201188e-05,
"loss": 0.8433,
"step": 856
},
{
"epoch": 0.341536315632161,
"grad_norm": 0.3691659772037152,
"learning_rate": 2.99268625756929e-05,
"loss": 0.8393,
"step": 857
},
{
"epoch": 0.3419348410879745,
"grad_norm": 0.3780103920503278,
"learning_rate": 2.990488685811667e-05,
"loss": 0.8346,
"step": 858
},
{
"epoch": 0.342333366543788,
"grad_norm": 0.4073582614267046,
"learning_rate": 2.9882895284462664e-05,
"loss": 0.8476,
"step": 859
},
{
"epoch": 0.34273189199960147,
"grad_norm": 0.34365964699391127,
"learning_rate": 2.9860887889935744e-05,
"loss": 0.8282,
"step": 860
},
{
"epoch": 0.34313041745541495,
"grad_norm": 0.40120857716998304,
"learning_rate": 2.983886470976608e-05,
"loss": 0.8275,
"step": 861
},
{
"epoch": 0.34352894291122843,
"grad_norm": 0.3959132704688456,
"learning_rate": 2.9816825779209133e-05,
"loss": 0.8251,
"step": 862
},
{
"epoch": 0.34392746836704197,
"grad_norm": 0.4334298136162478,
"learning_rate": 2.9794771133545565e-05,
"loss": 0.822,
"step": 863
},
{
"epoch": 0.34432599382285545,
"grad_norm": 0.3870945760786885,
"learning_rate": 2.977270080808119e-05,
"loss": 0.8251,
"step": 864
},
{
"epoch": 0.34472451927866893,
"grad_norm": 0.37106301614057785,
"learning_rate": 2.975061483814694e-05,
"loss": 0.8545,
"step": 865
},
{
"epoch": 0.3451230447344824,
"grad_norm": 0.38427213586073594,
"learning_rate": 2.9728513259098784e-05,
"loss": 0.8161,
"step": 866
},
{
"epoch": 0.3455215701902959,
"grad_norm": 0.3916565010304088,
"learning_rate": 2.9706396106317675e-05,
"loss": 0.8419,
"step": 867
},
{
"epoch": 0.34592009564610937,
"grad_norm": 0.3709069418845533,
"learning_rate": 2.96842634152095e-05,
"loss": 0.846,
"step": 868
},
{
"epoch": 0.3463186211019229,
"grad_norm": 0.3703215359984664,
"learning_rate": 2.9662115221205015e-05,
"loss": 0.8222,
"step": 869
},
{
"epoch": 0.3467171465577364,
"grad_norm": 0.3464063836842463,
"learning_rate": 2.9639951559759802e-05,
"loss": 0.8036,
"step": 870
},
{
"epoch": 0.34711567201354987,
"grad_norm": 0.35048924216820243,
"learning_rate": 2.9617772466354192e-05,
"loss": 0.818,
"step": 871
},
{
"epoch": 0.34751419746936335,
"grad_norm": 0.374548515628163,
"learning_rate": 2.9595577976493238e-05,
"loss": 0.8199,
"step": 872
},
{
"epoch": 0.34791272292517683,
"grad_norm": 0.37643952630682037,
"learning_rate": 2.9573368125706624e-05,
"loss": 0.825,
"step": 873
},
{
"epoch": 0.3483112483809903,
"grad_norm": 0.3873605831737666,
"learning_rate": 2.9551142949548634e-05,
"loss": 0.8183,
"step": 874
},
{
"epoch": 0.34870977383680385,
"grad_norm": 0.3639676705380599,
"learning_rate": 2.9528902483598076e-05,
"loss": 0.8536,
"step": 875
},
{
"epoch": 0.34910829929261733,
"grad_norm": 0.3239254348822666,
"learning_rate": 2.950664676345824e-05,
"loss": 0.7855,
"step": 876
},
{
"epoch": 0.3495068247484308,
"grad_norm": 0.4333203945657134,
"learning_rate": 2.9484375824756845e-05,
"loss": 0.8377,
"step": 877
},
{
"epoch": 0.3499053502042443,
"grad_norm": 0.366000478962248,
"learning_rate": 2.946208970314595e-05,
"loss": 0.841,
"step": 878
},
{
"epoch": 0.3503038756600578,
"grad_norm": 0.36232443933919917,
"learning_rate": 2.943978843430194e-05,
"loss": 0.8415,
"step": 879
},
{
"epoch": 0.35070240111587125,
"grad_norm": 0.3632587538915808,
"learning_rate": 2.9417472053925435e-05,
"loss": 0.833,
"step": 880
},
{
"epoch": 0.3511009265716848,
"grad_norm": 0.34528067844688565,
"learning_rate": 2.939514059774126e-05,
"loss": 0.8089,
"step": 881
},
{
"epoch": 0.35149945202749827,
"grad_norm": 0.3186838935536136,
"learning_rate": 2.9372794101498353e-05,
"loss": 0.8112,
"step": 882
},
{
"epoch": 0.35189797748331175,
"grad_norm": 0.33496956021034613,
"learning_rate": 2.935043260096975e-05,
"loss": 0.8421,
"step": 883
},
{
"epoch": 0.35229650293912523,
"grad_norm": 0.33411225546854484,
"learning_rate": 2.932805613195249e-05,
"loss": 0.8113,
"step": 884
},
{
"epoch": 0.3526950283949387,
"grad_norm": 0.32478642663480967,
"learning_rate": 2.9305664730267586e-05,
"loss": 0.8046,
"step": 885
},
{
"epoch": 0.3530935538507522,
"grad_norm": 0.3631121635365864,
"learning_rate": 2.9283258431759954e-05,
"loss": 0.8173,
"step": 886
},
{
"epoch": 0.35349207930656573,
"grad_norm": 0.3429622024570721,
"learning_rate": 2.926083727229835e-05,
"loss": 0.8583,
"step": 887
},
{
"epoch": 0.3538906047623792,
"grad_norm": 0.345044521347691,
"learning_rate": 2.923840128777532e-05,
"loss": 0.813,
"step": 888
},
{
"epoch": 0.3542891302181927,
"grad_norm": 0.3694760550020032,
"learning_rate": 2.9215950514107155e-05,
"loss": 0.8315,
"step": 889
},
{
"epoch": 0.3546876556740062,
"grad_norm": 0.34900971672785386,
"learning_rate": 2.9193484987233804e-05,
"loss": 0.8251,
"step": 890
},
{
"epoch": 0.35508618112981966,
"grad_norm": 0.36620900329612915,
"learning_rate": 2.917100474311885e-05,
"loss": 0.8243,
"step": 891
},
{
"epoch": 0.35548470658563314,
"grad_norm": 0.3732972879676541,
"learning_rate": 2.9148509817749424e-05,
"loss": 0.8263,
"step": 892
},
{
"epoch": 0.3558832320414467,
"grad_norm": 0.3754066448612361,
"learning_rate": 2.9126000247136162e-05,
"loss": 0.8549,
"step": 893
},
{
"epoch": 0.35628175749726015,
"grad_norm": 0.37766294343524515,
"learning_rate": 2.910347606731315e-05,
"loss": 0.8642,
"step": 894
},
{
"epoch": 0.35668028295307364,
"grad_norm": 0.3335713482308801,
"learning_rate": 2.9080937314337853e-05,
"loss": 0.8261,
"step": 895
},
{
"epoch": 0.3570788084088871,
"grad_norm": 0.3586058859524884,
"learning_rate": 2.9058384024291064e-05,
"loss": 0.8299,
"step": 896
},
{
"epoch": 0.3574773338647006,
"grad_norm": 0.35518778170798426,
"learning_rate": 2.9035816233276866e-05,
"loss": 0.8664,
"step": 897
},
{
"epoch": 0.3578758593205141,
"grad_norm": 0.3226292379642851,
"learning_rate": 2.901323397742253e-05,
"loss": 0.8176,
"step": 898
},
{
"epoch": 0.3582743847763276,
"grad_norm": 0.2963818087079733,
"learning_rate": 2.8990637292878495e-05,
"loss": 0.8379,
"step": 899
},
{
"epoch": 0.3586729102321411,
"grad_norm": 0.330128684962309,
"learning_rate": 2.896802621581831e-05,
"loss": 0.8069,
"step": 900
},
{
"epoch": 0.3590714356879546,
"grad_norm": 0.30550512523931456,
"learning_rate": 2.8945400782438536e-05,
"loss": 0.8098,
"step": 901
},
{
"epoch": 0.35946996114376806,
"grad_norm": 0.3225722537828969,
"learning_rate": 2.8922761028958735e-05,
"loss": 0.8256,
"step": 902
},
{
"epoch": 0.35986848659958154,
"grad_norm": 0.32436626447460576,
"learning_rate": 2.89001069916214e-05,
"loss": 0.8697,
"step": 903
},
{
"epoch": 0.360267012055395,
"grad_norm": 0.3248090965744356,
"learning_rate": 2.8877438706691876e-05,
"loss": 0.7905,
"step": 904
},
{
"epoch": 0.36066553751120856,
"grad_norm": 0.3423557906931257,
"learning_rate": 2.8854756210458305e-05,
"loss": 0.808,
"step": 905
},
{
"epoch": 0.36106406296702204,
"grad_norm": 0.3533066672835484,
"learning_rate": 2.8832059539231612e-05,
"loss": 0.8158,
"step": 906
},
{
"epoch": 0.3614625884228355,
"grad_norm": 0.3274286434791991,
"learning_rate": 2.88093487293454e-05,
"loss": 0.7964,
"step": 907
},
{
"epoch": 0.361861113878649,
"grad_norm": 0.3549517407326649,
"learning_rate": 2.8786623817155875e-05,
"loss": 0.8459,
"step": 908
},
{
"epoch": 0.3622596393344625,
"grad_norm": 0.3179414770046732,
"learning_rate": 2.8763884839041876e-05,
"loss": 0.8141,
"step": 909
},
{
"epoch": 0.36265816479027596,
"grad_norm": 0.34921190558386694,
"learning_rate": 2.87411318314047e-05,
"loss": 0.8319,
"step": 910
},
{
"epoch": 0.36305669024608944,
"grad_norm": 0.46547909862633313,
"learning_rate": 2.8718364830668153e-05,
"loss": 0.8386,
"step": 911
},
{
"epoch": 0.363455215701903,
"grad_norm": 0.3362430896899564,
"learning_rate": 2.8695583873278402e-05,
"loss": 0.8087,
"step": 912
},
{
"epoch": 0.36385374115771646,
"grad_norm": 0.3421880254638392,
"learning_rate": 2.8672788995703985e-05,
"loss": 0.8288,
"step": 913
},
{
"epoch": 0.36425226661352994,
"grad_norm": 0.33774819740594564,
"learning_rate": 2.864998023443571e-05,
"loss": 0.8284,
"step": 914
},
{
"epoch": 0.3646507920693434,
"grad_norm": 0.32177729327477683,
"learning_rate": 2.862715762598662e-05,
"loss": 0.8086,
"step": 915
},
{
"epoch": 0.3650493175251569,
"grad_norm": 0.31718396437386565,
"learning_rate": 2.8604321206891904e-05,
"loss": 0.8077,
"step": 916
},
{
"epoch": 0.3654478429809704,
"grad_norm": 0.3078535072758799,
"learning_rate": 2.858147101370888e-05,
"loss": 0.815,
"step": 917
},
{
"epoch": 0.3658463684367839,
"grad_norm": 0.3251261011534896,
"learning_rate": 2.855860708301692e-05,
"loss": 0.8154,
"step": 918
},
{
"epoch": 0.3662448938925974,
"grad_norm": 0.32646080328089405,
"learning_rate": 2.8535729451417354e-05,
"loss": 0.8495,
"step": 919
},
{
"epoch": 0.3666434193484109,
"grad_norm": 0.32013473579432894,
"learning_rate": 2.851283815553349e-05,
"loss": 0.8257,
"step": 920
},
{
"epoch": 0.36704194480422436,
"grad_norm": 0.3404460262778686,
"learning_rate": 2.8489933232010486e-05,
"loss": 0.8274,
"step": 921
},
{
"epoch": 0.36744047026003784,
"grad_norm": 0.3179214806128248,
"learning_rate": 2.8467014717515303e-05,
"loss": 0.8221,
"step": 922
},
{
"epoch": 0.3678389957158513,
"grad_norm": 0.3686956431219607,
"learning_rate": 2.8444082648736695e-05,
"loss": 0.8577,
"step": 923
},
{
"epoch": 0.36823752117166486,
"grad_norm": 0.3319571070853765,
"learning_rate": 2.8421137062385077e-05,
"loss": 0.8472,
"step": 924
},
{
"epoch": 0.36863604662747834,
"grad_norm": 0.33391728985772273,
"learning_rate": 2.839817799519252e-05,
"loss": 0.8407,
"step": 925
},
{
"epoch": 0.3690345720832918,
"grad_norm": 0.36377333064615536,
"learning_rate": 2.8375205483912683e-05,
"loss": 0.8062,
"step": 926
},
{
"epoch": 0.3694330975391053,
"grad_norm": 0.3192797421529141,
"learning_rate": 2.8352219565320734e-05,
"loss": 0.8198,
"step": 927
},
{
"epoch": 0.3698316229949188,
"grad_norm": 0.34072810185050395,
"learning_rate": 2.8329220276213312e-05,
"loss": 0.8553,
"step": 928
},
{
"epoch": 0.37023014845073227,
"grad_norm": 0.3510179405385589,
"learning_rate": 2.8306207653408452e-05,
"loss": 0.803,
"step": 929
},
{
"epoch": 0.3706286739065458,
"grad_norm": 0.33046352991412514,
"learning_rate": 2.8283181733745545e-05,
"loss": 0.8196,
"step": 930
},
{
"epoch": 0.3710271993623593,
"grad_norm": 0.3296330314721836,
"learning_rate": 2.826014255408525e-05,
"loss": 0.8113,
"step": 931
},
{
"epoch": 0.37142572481817276,
"grad_norm": 0.32819051407453925,
"learning_rate": 2.823709015130948e-05,
"loss": 0.8363,
"step": 932
},
{
"epoch": 0.37182425027398625,
"grad_norm": 0.32244270165621963,
"learning_rate": 2.8214024562321288e-05,
"loss": 0.8159,
"step": 933
},
{
"epoch": 0.3722227757297997,
"grad_norm": 0.33554287954574435,
"learning_rate": 2.8190945824044854e-05,
"loss": 0.8275,
"step": 934
},
{
"epoch": 0.3726213011856132,
"grad_norm": 0.31619676372667777,
"learning_rate": 2.8167853973425408e-05,
"loss": 0.8237,
"step": 935
},
{
"epoch": 0.37301982664142674,
"grad_norm": 0.3145096541701049,
"learning_rate": 2.8144749047429155e-05,
"loss": 0.8112,
"step": 936
},
{
"epoch": 0.3734183520972402,
"grad_norm": 0.3733084988221381,
"learning_rate": 2.812163108304325e-05,
"loss": 0.8492,
"step": 937
},
{
"epoch": 0.3738168775530537,
"grad_norm": 0.3271910427372345,
"learning_rate": 2.8098500117275708e-05,
"loss": 0.8409,
"step": 938
},
{
"epoch": 0.3742154030088672,
"grad_norm": 0.3506373095855538,
"learning_rate": 2.8075356187155357e-05,
"loss": 0.8255,
"step": 939
},
{
"epoch": 0.37461392846468067,
"grad_norm": 0.3523796388032185,
"learning_rate": 2.805219932973179e-05,
"loss": 0.8198,
"step": 940
},
{
"epoch": 0.37501245392049415,
"grad_norm": 0.31630826125781786,
"learning_rate": 2.8029029582075286e-05,
"loss": 0.8279,
"step": 941
},
{
"epoch": 0.3754109793763077,
"grad_norm": 0.31383140189055664,
"learning_rate": 2.8005846981276758e-05,
"loss": 0.84,
"step": 942
},
{
"epoch": 0.37580950483212117,
"grad_norm": 0.3308152244077927,
"learning_rate": 2.79826515644477e-05,
"loss": 0.8551,
"step": 943
},
{
"epoch": 0.37620803028793465,
"grad_norm": 0.3183707047927005,
"learning_rate": 2.795944336872012e-05,
"loss": 0.835,
"step": 944
},
{
"epoch": 0.37660655574374813,
"grad_norm": 0.34065129082815276,
"learning_rate": 2.7936222431246478e-05,
"loss": 0.8194,
"step": 945
},
{
"epoch": 0.3770050811995616,
"grad_norm": 0.33055758193564483,
"learning_rate": 2.791298878919964e-05,
"loss": 0.8295,
"step": 946
},
{
"epoch": 0.3774036066553751,
"grad_norm": 0.3178548706287361,
"learning_rate": 2.7889742479772793e-05,
"loss": 0.8487,
"step": 947
},
{
"epoch": 0.3778021321111886,
"grad_norm": 0.34056866287653254,
"learning_rate": 2.7866483540179438e-05,
"loss": 0.822,
"step": 948
},
{
"epoch": 0.3782006575670021,
"grad_norm": 0.3530872392015572,
"learning_rate": 2.784321200765326e-05,
"loss": 0.7945,
"step": 949
},
{
"epoch": 0.3785991830228156,
"grad_norm": 0.34823844388780467,
"learning_rate": 2.781992791944811e-05,
"loss": 0.8343,
"step": 950
},
{
"epoch": 0.37899770847862907,
"grad_norm": 0.32473433019889203,
"learning_rate": 2.779663131283795e-05,
"loss": 0.7889,
"step": 951
},
{
"epoch": 0.37939623393444255,
"grad_norm": 0.3440773152101907,
"learning_rate": 2.7773322225116774e-05,
"loss": 0.8085,
"step": 952
},
{
"epoch": 0.37979475939025603,
"grad_norm": 0.3136356275301238,
"learning_rate": 2.7750000693598557e-05,
"loss": 0.7984,
"step": 953
},
{
"epoch": 0.38019328484606957,
"grad_norm": 0.36010994273938446,
"learning_rate": 2.7726666755617198e-05,
"loss": 0.8176,
"step": 954
},
{
"epoch": 0.38059181030188305,
"grad_norm": 0.4042048335792527,
"learning_rate": 2.770332044852645e-05,
"loss": 0.8298,
"step": 955
},
{
"epoch": 0.38099033575769653,
"grad_norm": 0.33696767739158523,
"learning_rate": 2.7679961809699878e-05,
"loss": 0.7998,
"step": 956
},
{
"epoch": 0.38138886121351,
"grad_norm": 0.32263411827838845,
"learning_rate": 2.765659087653077e-05,
"loss": 0.8234,
"step": 957
},
{
"epoch": 0.3817873866693235,
"grad_norm": 0.3199567939883172,
"learning_rate": 2.7633207686432113e-05,
"loss": 0.8108,
"step": 958
},
{
"epoch": 0.382185912125137,
"grad_norm": 0.33168910588991024,
"learning_rate": 2.760981227683651e-05,
"loss": 0.8313,
"step": 959
},
{
"epoch": 0.3825844375809505,
"grad_norm": 0.3238687202666879,
"learning_rate": 2.758640468519611e-05,
"loss": 0.8321,
"step": 960
},
{
"epoch": 0.382982963036764,
"grad_norm": 0.3478685120540082,
"learning_rate": 2.7562984948982595e-05,
"loss": 0.824,
"step": 961
},
{
"epoch": 0.38338148849257747,
"grad_norm": 0.4127997530905888,
"learning_rate": 2.7539553105687063e-05,
"loss": 0.8061,
"step": 962
},
{
"epoch": 0.38378001394839095,
"grad_norm": 0.3571852104724218,
"learning_rate": 2.7516109192820003e-05,
"loss": 0.8401,
"step": 963
},
{
"epoch": 0.38417853940420443,
"grad_norm": 0.33227253978050236,
"learning_rate": 2.749265324791122e-05,
"loss": 0.8522,
"step": 964
},
{
"epoch": 0.3845770648600179,
"grad_norm": 0.5247271121688866,
"learning_rate": 2.7469185308509786e-05,
"loss": 0.8134,
"step": 965
},
{
"epoch": 0.38497559031583145,
"grad_norm": 0.3470222523911159,
"learning_rate": 2.744570541218397e-05,
"loss": 0.7991,
"step": 966
},
{
"epoch": 0.38537411577164493,
"grad_norm": 0.34151142631527753,
"learning_rate": 2.7422213596521183e-05,
"loss": 0.8467,
"step": 967
},
{
"epoch": 0.3857726412274584,
"grad_norm": 0.519889333298418,
"learning_rate": 2.7398709899127927e-05,
"loss": 0.8306,
"step": 968
},
{
"epoch": 0.3861711666832719,
"grad_norm": 0.3258609895102337,
"learning_rate": 2.7375194357629696e-05,
"loss": 0.7873,
"step": 969
},
{
"epoch": 0.3865696921390854,
"grad_norm": 0.4295037852575729,
"learning_rate": 2.7351667009670993e-05,
"loss": 0.8403,
"step": 970
},
{
"epoch": 0.38696821759489886,
"grad_norm": 0.36998924298526037,
"learning_rate": 2.732812789291516e-05,
"loss": 0.8075,
"step": 971
},
{
"epoch": 0.38736674305071234,
"grad_norm": 0.32705437276780996,
"learning_rate": 2.7304577045044433e-05,
"loss": 0.8282,
"step": 972
},
{
"epoch": 0.3877652685065259,
"grad_norm": 0.3340699092845928,
"learning_rate": 2.72810145037598e-05,
"loss": 0.7963,
"step": 973
},
{
"epoch": 0.38816379396233935,
"grad_norm": 0.3503260696592739,
"learning_rate": 2.7257440306780968e-05,
"loss": 0.8606,
"step": 974
},
{
"epoch": 0.38856231941815284,
"grad_norm": 0.3459980112053063,
"learning_rate": 2.7233854491846314e-05,
"loss": 0.7951,
"step": 975
},
{
"epoch": 0.3889608448739663,
"grad_norm": 0.319254119951506,
"learning_rate": 2.721025709671281e-05,
"loss": 0.8032,
"step": 976
},
{
"epoch": 0.3893593703297798,
"grad_norm": 0.4897236117125459,
"learning_rate": 2.7186648159155962e-05,
"loss": 0.8315,
"step": 977
},
{
"epoch": 0.3897578957855933,
"grad_norm": 0.3087529107037527,
"learning_rate": 2.7163027716969755e-05,
"loss": 0.8117,
"step": 978
},
{
"epoch": 0.3901564212414068,
"grad_norm": 0.3275439817021243,
"learning_rate": 2.7139395807966588e-05,
"loss": 0.8346,
"step": 979
},
{
"epoch": 0.3905549466972203,
"grad_norm": 0.3083375926780146,
"learning_rate": 2.7115752469977224e-05,
"loss": 0.8136,
"step": 980
},
{
"epoch": 0.3909534721530338,
"grad_norm": 0.3069416211569783,
"learning_rate": 2.7092097740850712e-05,
"loss": 0.8213,
"step": 981
},
{
"epoch": 0.39135199760884726,
"grad_norm": 0.3138396694972504,
"learning_rate": 2.7068431658454355e-05,
"loss": 0.8405,
"step": 982
},
{
"epoch": 0.39175052306466074,
"grad_norm": 0.3236054977163557,
"learning_rate": 2.7044754260673607e-05,
"loss": 0.8085,
"step": 983
},
{
"epoch": 0.3921490485204742,
"grad_norm": 0.31483246013918365,
"learning_rate": 2.702106558541205e-05,
"loss": 0.8244,
"step": 984
},
{
"epoch": 0.39254757397628776,
"grad_norm": 0.3541307522351268,
"learning_rate": 2.699736567059132e-05,
"loss": 0.8002,
"step": 985
},
{
"epoch": 0.39294609943210124,
"grad_norm": 0.32364536612849215,
"learning_rate": 2.6973654554151028e-05,
"loss": 0.8198,
"step": 986
},
{
"epoch": 0.3933446248879147,
"grad_norm": 0.33398363230386113,
"learning_rate": 2.694993227404875e-05,
"loss": 0.8393,
"step": 987
},
{
"epoch": 0.3937431503437282,
"grad_norm": 0.349530991319565,
"learning_rate": 2.69261988682599e-05,
"loss": 0.821,
"step": 988
},
{
"epoch": 0.3941416757995417,
"grad_norm": 0.3679139832318692,
"learning_rate": 2.690245437477772e-05,
"loss": 0.815,
"step": 989
},
{
"epoch": 0.39454020125535516,
"grad_norm": 0.31671482584430505,
"learning_rate": 2.6878698831613202e-05,
"loss": 0.8636,
"step": 990
},
{
"epoch": 0.3949387267111687,
"grad_norm": 0.3452241320073205,
"learning_rate": 2.6854932276795026e-05,
"loss": 0.8111,
"step": 991
},
{
"epoch": 0.3953372521669822,
"grad_norm": 0.3018394208024079,
"learning_rate": 2.6831154748369485e-05,
"loss": 0.8273,
"step": 992
},
{
"epoch": 0.39573577762279566,
"grad_norm": 0.322632592726802,
"learning_rate": 2.6807366284400457e-05,
"loss": 0.8038,
"step": 993
},
{
"epoch": 0.39613430307860914,
"grad_norm": 0.34241476868414766,
"learning_rate": 2.6783566922969318e-05,
"loss": 0.8158,
"step": 994
},
{
"epoch": 0.3965328285344226,
"grad_norm": 0.35584217533454204,
"learning_rate": 2.675975670217489e-05,
"loss": 0.83,
"step": 995
},
{
"epoch": 0.3969313539902361,
"grad_norm": 0.29169575061351766,
"learning_rate": 2.673593566013338e-05,
"loss": 0.8124,
"step": 996
},
{
"epoch": 0.39732987944604964,
"grad_norm": 0.3705964533467081,
"learning_rate": 2.671210383497832e-05,
"loss": 0.8304,
"step": 997
},
{
"epoch": 0.3977284049018631,
"grad_norm": 0.33331825287941125,
"learning_rate": 2.66882612648605e-05,
"loss": 0.8232,
"step": 998
},
{
"epoch": 0.3981269303576766,
"grad_norm": 0.3379785793208752,
"learning_rate": 2.666440798794791e-05,
"loss": 0.8113,
"step": 999
},
{
"epoch": 0.3985254558134901,
"grad_norm": 0.47824925692484593,
"learning_rate": 2.6640544042425685e-05,
"loss": 0.8411,
"step": 1000
},
{
"epoch": 0.39892398126930356,
"grad_norm": 0.33431552475555065,
"learning_rate": 2.6616669466496037e-05,
"loss": 0.8468,
"step": 1001
},
{
"epoch": 0.39932250672511704,
"grad_norm": 0.34137387833760563,
"learning_rate": 2.6592784298378188e-05,
"loss": 0.8418,
"step": 1002
},
{
"epoch": 0.3997210321809306,
"grad_norm": 0.39365755246331835,
"learning_rate": 2.656888857630833e-05,
"loss": 0.8224,
"step": 1003
},
{
"epoch": 0.40011955763674406,
"grad_norm": 0.49873669187777425,
"learning_rate": 2.654498233853954e-05,
"loss": 0.808,
"step": 1004
},
{
"epoch": 0.40051808309255754,
"grad_norm": 0.3248564941543554,
"learning_rate": 2.652106562334173e-05,
"loss": 0.8139,
"step": 1005
},
{
"epoch": 0.400916608548371,
"grad_norm": 0.37674381602697304,
"learning_rate": 2.649713846900159e-05,
"loss": 0.8295,
"step": 1006
},
{
"epoch": 0.4013151340041845,
"grad_norm": 0.39334509517210275,
"learning_rate": 2.6473200913822514e-05,
"loss": 0.8131,
"step": 1007
},
{
"epoch": 0.401713659459998,
"grad_norm": 0.3224088785864611,
"learning_rate": 2.644925299612455e-05,
"loss": 0.7975,
"step": 1008
},
{
"epoch": 0.4021121849158115,
"grad_norm": 0.4490995597319525,
"learning_rate": 2.642529475424433e-05,
"loss": 0.8337,
"step": 1009
},
{
"epoch": 0.402510710371625,
"grad_norm": 0.34775476784856935,
"learning_rate": 2.6401326226535037e-05,
"loss": 0.81,
"step": 1010
},
{
"epoch": 0.4029092358274385,
"grad_norm": 0.3424684175535075,
"learning_rate": 2.6377347451366278e-05,
"loss": 0.7884,
"step": 1011
},
{
"epoch": 0.40330776128325196,
"grad_norm": 0.32706343083018596,
"learning_rate": 2.6353358467124094e-05,
"loss": 0.8105,
"step": 1012
},
{
"epoch": 0.40370628673906545,
"grad_norm": 0.3531111527591312,
"learning_rate": 2.632935931221087e-05,
"loss": 0.8524,
"step": 1013
},
{
"epoch": 0.4041048121948789,
"grad_norm": 0.3134079131717474,
"learning_rate": 2.6305350025045257e-05,
"loss": 0.8258,
"step": 1014
},
{
"epoch": 0.40450333765069246,
"grad_norm": 0.3041258254708691,
"learning_rate": 2.6281330644062126e-05,
"loss": 0.8363,
"step": 1015
},
{
"epoch": 0.40490186310650594,
"grad_norm": 0.34271831802902314,
"learning_rate": 2.6257301207712536e-05,
"loss": 0.8045,
"step": 1016
},
{
"epoch": 0.4053003885623194,
"grad_norm": 0.31087347936442256,
"learning_rate": 2.6233261754463605e-05,
"loss": 0.8331,
"step": 1017
},
{
"epoch": 0.4056989140181329,
"grad_norm": 0.32439571159756025,
"learning_rate": 2.62092123227985e-05,
"loss": 0.839,
"step": 1018
},
{
"epoch": 0.4060974394739464,
"grad_norm": 0.3077347841509726,
"learning_rate": 2.6185152951216373e-05,
"loss": 0.8078,
"step": 1019
},
{
"epoch": 0.40649596492975987,
"grad_norm": 0.32342264487059186,
"learning_rate": 2.6161083678232277e-05,
"loss": 0.8101,
"step": 1020
},
{
"epoch": 0.4068944903855734,
"grad_norm": 0.4192377215503443,
"learning_rate": 2.6137004542377122e-05,
"loss": 0.8333,
"step": 1021
},
{
"epoch": 0.4072930158413869,
"grad_norm": 0.3193163344884458,
"learning_rate": 2.611291558219759e-05,
"loss": 0.8177,
"step": 1022
},
{
"epoch": 0.40769154129720037,
"grad_norm": 0.36366689035128674,
"learning_rate": 2.608881683625612e-05,
"loss": 0.8339,
"step": 1023
},
{
"epoch": 0.40809006675301385,
"grad_norm": 0.2956336562200817,
"learning_rate": 2.6064708343130787e-05,
"loss": 0.8344,
"step": 1024
},
{
"epoch": 0.40848859220882733,
"grad_norm": 0.35391087494148843,
"learning_rate": 2.604059014141529e-05,
"loss": 0.8243,
"step": 1025
},
{
"epoch": 0.4088871176646408,
"grad_norm": 0.3753489106825966,
"learning_rate": 2.601646226971885e-05,
"loss": 0.816,
"step": 1026
},
{
"epoch": 0.40928564312045435,
"grad_norm": 0.33358896662610243,
"learning_rate": 2.5992324766666194e-05,
"loss": 0.8168,
"step": 1027
},
{
"epoch": 0.4096841685762678,
"grad_norm": 0.3494626801983563,
"learning_rate": 2.5968177670897447e-05,
"loss": 0.8158,
"step": 1028
},
{
"epoch": 0.4100826940320813,
"grad_norm": 0.3311602416729186,
"learning_rate": 2.5944021021068086e-05,
"loss": 0.8289,
"step": 1029
},
{
"epoch": 0.4104812194878948,
"grad_norm": 0.32920620411123275,
"learning_rate": 2.591985485584891e-05,
"loss": 0.8462,
"step": 1030
},
{
"epoch": 0.41087974494370827,
"grad_norm": 0.33365179190960775,
"learning_rate": 2.589567921392593e-05,
"loss": 0.8316,
"step": 1031
},
{
"epoch": 0.41127827039952175,
"grad_norm": 0.2840477218269186,
"learning_rate": 2.587149413400032e-05,
"loss": 0.8243,
"step": 1032
},
{
"epoch": 0.4116767958553353,
"grad_norm": 0.3094870981520638,
"learning_rate": 2.5847299654788384e-05,
"loss": 0.8302,
"step": 1033
},
{
"epoch": 0.41207532131114877,
"grad_norm": 0.33160788932455293,
"learning_rate": 2.5823095815021458e-05,
"loss": 0.8047,
"step": 1034
},
{
"epoch": 0.41247384676696225,
"grad_norm": 0.3296215696895382,
"learning_rate": 2.579888265344586e-05,
"loss": 0.8408,
"step": 1035
},
{
"epoch": 0.41287237222277573,
"grad_norm": 0.31027823213043904,
"learning_rate": 2.5774660208822854e-05,
"loss": 0.797,
"step": 1036
},
{
"epoch": 0.4132708976785892,
"grad_norm": 0.3233755627458931,
"learning_rate": 2.5750428519928542e-05,
"loss": 0.8437,
"step": 1037
},
{
"epoch": 0.4136694231344027,
"grad_norm": 0.3618514564925971,
"learning_rate": 2.572618762555382e-05,
"loss": 0.8202,
"step": 1038
},
{
"epoch": 0.4140679485902162,
"grad_norm": 0.33907339886292404,
"learning_rate": 2.5701937564504345e-05,
"loss": 0.8199,
"step": 1039
},
{
"epoch": 0.4144664740460297,
"grad_norm": 0.3068383167662696,
"learning_rate": 2.5677678375600436e-05,
"loss": 0.8301,
"step": 1040
},
{
"epoch": 0.4148649995018432,
"grad_norm": 0.3790622200712186,
"learning_rate": 2.565341009767701e-05,
"loss": 0.8171,
"step": 1041
},
{
"epoch": 0.41526352495765667,
"grad_norm": 0.3433456715007725,
"learning_rate": 2.562913276958355e-05,
"loss": 0.8431,
"step": 1042
},
{
"epoch": 0.41566205041347015,
"grad_norm": 0.31629971388025424,
"learning_rate": 2.5604846430184034e-05,
"loss": 0.8188,
"step": 1043
},
{
"epoch": 0.41606057586928363,
"grad_norm": 0.36903895294398353,
"learning_rate": 2.5580551118356842e-05,
"loss": 0.7884,
"step": 1044
},
{
"epoch": 0.4164591013250971,
"grad_norm": 0.3504976082604236,
"learning_rate": 2.5556246872994744e-05,
"loss": 0.8139,
"step": 1045
},
{
"epoch": 0.41685762678091065,
"grad_norm": 0.32035221174765094,
"learning_rate": 2.5531933733004785e-05,
"loss": 0.8017,
"step": 1046
},
{
"epoch": 0.41725615223672413,
"grad_norm": 0.3394515589088212,
"learning_rate": 2.550761173730827e-05,
"loss": 0.8029,
"step": 1047
},
{
"epoch": 0.4176546776925376,
"grad_norm": 0.35804587588727005,
"learning_rate": 2.548328092484067e-05,
"loss": 0.8015,
"step": 1048
},
{
"epoch": 0.4180532031483511,
"grad_norm": 0.33858523464707274,
"learning_rate": 2.5458941334551566e-05,
"loss": 0.801,
"step": 1049
},
{
"epoch": 0.4184517286041646,
"grad_norm": 0.3288133650068113,
"learning_rate": 2.5434593005404605e-05,
"loss": 0.8036,
"step": 1050
},
{
"epoch": 0.41885025405997806,
"grad_norm": 0.3424539726833037,
"learning_rate": 2.5410235976377418e-05,
"loss": 0.8028,
"step": 1051
},
{
"epoch": 0.4192487795157916,
"grad_norm": 0.3023013418013977,
"learning_rate": 2.5385870286461547e-05,
"loss": 0.8513,
"step": 1052
},
{
"epoch": 0.4196473049716051,
"grad_norm": 0.34500936623066886,
"learning_rate": 2.536149597466243e-05,
"loss": 0.8254,
"step": 1053
},
{
"epoch": 0.42004583042741855,
"grad_norm": 0.31922631055010225,
"learning_rate": 2.5337113079999278e-05,
"loss": 0.8363,
"step": 1054
},
{
"epoch": 0.42044435588323203,
"grad_norm": 1.3109348539871832,
"learning_rate": 2.5312721641505054e-05,
"loss": 0.8507,
"step": 1055
},
{
"epoch": 0.4208428813390455,
"grad_norm": 0.32480191303613704,
"learning_rate": 2.5288321698226393e-05,
"loss": 0.8271,
"step": 1056
},
{
"epoch": 0.421241406794859,
"grad_norm": 0.37122122754776027,
"learning_rate": 2.5263913289223567e-05,
"loss": 0.8461,
"step": 1057
},
{
"epoch": 0.42163993225067253,
"grad_norm": 0.3268123530148818,
"learning_rate": 2.523949645357036e-05,
"loss": 0.8081,
"step": 1058
},
{
"epoch": 0.422038457706486,
"grad_norm": 0.3751401095220027,
"learning_rate": 2.5215071230354085e-05,
"loss": 0.7995,
"step": 1059
},
{
"epoch": 0.4224369831622995,
"grad_norm": 0.3784425259279124,
"learning_rate": 2.519063765867546e-05,
"loss": 0.8189,
"step": 1060
},
{
"epoch": 0.422835508618113,
"grad_norm": 0.3433963567838051,
"learning_rate": 2.5166195777648565e-05,
"loss": 0.8306,
"step": 1061
},
{
"epoch": 0.42323403407392646,
"grad_norm": 0.3566697956385714,
"learning_rate": 2.5141745626400804e-05,
"loss": 0.8073,
"step": 1062
},
{
"epoch": 0.42363255952973994,
"grad_norm": 0.3406773772854413,
"learning_rate": 2.511728724407279e-05,
"loss": 0.8126,
"step": 1063
},
{
"epoch": 0.4240310849855535,
"grad_norm": 0.3227569499796658,
"learning_rate": 2.509282066981834e-05,
"loss": 0.8547,
"step": 1064
},
{
"epoch": 0.42442961044136696,
"grad_norm": 0.43322555481131175,
"learning_rate": 2.5068345942804372e-05,
"loss": 0.8056,
"step": 1065
},
{
"epoch": 0.42482813589718044,
"grad_norm": 0.3401664677873486,
"learning_rate": 2.5043863102210854e-05,
"loss": 0.8301,
"step": 1066
},
{
"epoch": 0.4252266613529939,
"grad_norm": 0.3308251533254951,
"learning_rate": 2.5019372187230734e-05,
"loss": 0.8109,
"step": 1067
},
{
"epoch": 0.4256251868088074,
"grad_norm": 0.3369938034523319,
"learning_rate": 2.4994873237069922e-05,
"loss": 0.8198,
"step": 1068
},
{
"epoch": 0.4260237122646209,
"grad_norm": 0.3280216989154936,
"learning_rate": 2.4970366290947145e-05,
"loss": 0.8119,
"step": 1069
},
{
"epoch": 0.4264222377204344,
"grad_norm": 0.335124086686642,
"learning_rate": 2.4945851388093953e-05,
"loss": 0.8111,
"step": 1070
},
{
"epoch": 0.4268207631762479,
"grad_norm": 0.32998466833884404,
"learning_rate": 2.4921328567754643e-05,
"loss": 0.7979,
"step": 1071
},
{
"epoch": 0.4272192886320614,
"grad_norm": 0.3245876352671091,
"learning_rate": 2.489679786918617e-05,
"loss": 0.8341,
"step": 1072
},
{
"epoch": 0.42761781408787486,
"grad_norm": 0.339066366216921,
"learning_rate": 2.4872259331658092e-05,
"loss": 0.8412,
"step": 1073
},
{
"epoch": 0.42801633954368834,
"grad_norm": 0.3632142337136183,
"learning_rate": 2.4847712994452552e-05,
"loss": 0.8287,
"step": 1074
},
{
"epoch": 0.4284148649995018,
"grad_norm": 0.31666688457965547,
"learning_rate": 2.4823158896864138e-05,
"loss": 0.8108,
"step": 1075
},
{
"epoch": 0.42881339045531536,
"grad_norm": 0.33156115315753226,
"learning_rate": 2.479859707819989e-05,
"loss": 0.8115,
"step": 1076
},
{
"epoch": 0.42921191591112884,
"grad_norm": 0.3411054033949336,
"learning_rate": 2.47740275777792e-05,
"loss": 0.8132,
"step": 1077
},
{
"epoch": 0.4296104413669423,
"grad_norm": 0.3402952332409344,
"learning_rate": 2.4749450434933743e-05,
"loss": 0.8076,
"step": 1078
},
{
"epoch": 0.4300089668227558,
"grad_norm": 0.3223187993665448,
"learning_rate": 2.472486568900745e-05,
"loss": 0.8426,
"step": 1079
},
{
"epoch": 0.4304074922785693,
"grad_norm": 0.31498257951573805,
"learning_rate": 2.470027337935641e-05,
"loss": 0.8166,
"step": 1080
},
{
"epoch": 0.43080601773438276,
"grad_norm": 0.31243598520804755,
"learning_rate": 2.4675673545348825e-05,
"loss": 0.8295,
"step": 1081
},
{
"epoch": 0.4312045431901963,
"grad_norm": 0.3538419858260921,
"learning_rate": 2.4651066226364943e-05,
"loss": 0.8293,
"step": 1082
},
{
"epoch": 0.4316030686460098,
"grad_norm": 0.31794472665083506,
"learning_rate": 2.462645146179698e-05,
"loss": 0.8099,
"step": 1083
},
{
"epoch": 0.43200159410182326,
"grad_norm": 0.31784933887583533,
"learning_rate": 2.4601829291049098e-05,
"loss": 0.7962,
"step": 1084
},
{
"epoch": 0.43240011955763674,
"grad_norm": 0.42026330060809836,
"learning_rate": 2.45771997535373e-05,
"loss": 0.816,
"step": 1085
},
{
"epoch": 0.4327986450134502,
"grad_norm": 0.33452336084693307,
"learning_rate": 2.4552562888689376e-05,
"loss": 0.8075,
"step": 1086
},
{
"epoch": 0.4331971704692637,
"grad_norm": 0.322413780248328,
"learning_rate": 2.4527918735944853e-05,
"loss": 0.7956,
"step": 1087
},
{
"epoch": 0.43359569592507724,
"grad_norm": 0.32866939422553315,
"learning_rate": 2.4503267334754925e-05,
"loss": 0.8368,
"step": 1088
},
{
"epoch": 0.4339942213808907,
"grad_norm": 0.31934456546936785,
"learning_rate": 2.447860872458239e-05,
"loss": 0.8438,
"step": 1089
},
{
"epoch": 0.4343927468367042,
"grad_norm": 0.32490029875471044,
"learning_rate": 2.4453942944901575e-05,
"loss": 0.8056,
"step": 1090
},
{
"epoch": 0.4347912722925177,
"grad_norm": 0.32929458543358014,
"learning_rate": 2.4429270035198313e-05,
"loss": 0.8037,
"step": 1091
},
{
"epoch": 0.43518979774833116,
"grad_norm": 0.32506473231877164,
"learning_rate": 2.4404590034969822e-05,
"loss": 0.8113,
"step": 1092
},
{
"epoch": 0.43558832320414465,
"grad_norm": 0.29212134247678295,
"learning_rate": 2.437990298372467e-05,
"loss": 0.8005,
"step": 1093
},
{
"epoch": 0.4359868486599582,
"grad_norm": 0.3455754520750264,
"learning_rate": 2.4355208920982744e-05,
"loss": 0.7994,
"step": 1094
},
{
"epoch": 0.43638537411577166,
"grad_norm": 0.30065376764152013,
"learning_rate": 2.4330507886275122e-05,
"loss": 0.8164,
"step": 1095
},
{
"epoch": 0.43678389957158514,
"grad_norm": 0.32131061866768784,
"learning_rate": 2.4305799919144055e-05,
"loss": 0.8316,
"step": 1096
},
{
"epoch": 0.4371824250273986,
"grad_norm": 0.3311309667775356,
"learning_rate": 2.4281085059142892e-05,
"loss": 0.8194,
"step": 1097
},
{
"epoch": 0.4375809504832121,
"grad_norm": 0.32898839768451466,
"learning_rate": 2.4256363345836026e-05,
"loss": 0.8321,
"step": 1098
},
{
"epoch": 0.4379794759390256,
"grad_norm": 0.3065918498699849,
"learning_rate": 2.4231634818798798e-05,
"loss": 0.7826,
"step": 1099
},
{
"epoch": 0.4383780013948391,
"grad_norm": 0.35465547671253245,
"learning_rate": 2.4206899517617485e-05,
"loss": 0.8267,
"step": 1100
},
{
"epoch": 0.4387765268506526,
"grad_norm": 0.3301349903148197,
"learning_rate": 2.4182157481889183e-05,
"loss": 0.8022,
"step": 1101
},
{
"epoch": 0.4391750523064661,
"grad_norm": 0.316437289769763,
"learning_rate": 2.415740875122178e-05,
"loss": 0.8036,
"step": 1102
},
{
"epoch": 0.43957357776227957,
"grad_norm": 0.332243171121802,
"learning_rate": 2.413265336523389e-05,
"loss": 0.8352,
"step": 1103
},
{
"epoch": 0.43997210321809305,
"grad_norm": 0.5376924415941126,
"learning_rate": 2.4107891363554753e-05,
"loss": 0.8306,
"step": 1104
},
{
"epoch": 0.44037062867390653,
"grad_norm": 0.303147057063706,
"learning_rate": 2.4083122785824236e-05,
"loss": 0.7916,
"step": 1105
},
{
"epoch": 0.44076915412972,
"grad_norm": 0.34716257230796316,
"learning_rate": 2.405834767169271e-05,
"loss": 0.7974,
"step": 1106
},
{
"epoch": 0.44116767958553355,
"grad_norm": 0.3205567864972624,
"learning_rate": 2.403356606082101e-05,
"loss": 0.8002,
"step": 1107
},
{
"epoch": 0.441566205041347,
"grad_norm": 0.29598982127864676,
"learning_rate": 2.400877799288039e-05,
"loss": 0.8077,
"step": 1108
},
{
"epoch": 0.4419647304971605,
"grad_norm": 0.3707790401289273,
"learning_rate": 2.398398350755242e-05,
"loss": 0.8119,
"step": 1109
},
{
"epoch": 0.442363255952974,
"grad_norm": 0.35724626182329483,
"learning_rate": 2.3959182644528945e-05,
"loss": 0.8117,
"step": 1110
},
{
"epoch": 0.44276178140878747,
"grad_norm": 0.3194532912667194,
"learning_rate": 2.3934375443512025e-05,
"loss": 0.8052,
"step": 1111
},
{
"epoch": 0.44316030686460095,
"grad_norm": 0.3897881316911469,
"learning_rate": 2.3909561944213876e-05,
"loss": 0.8188,
"step": 1112
},
{
"epoch": 0.4435588323204145,
"grad_norm": 0.31474565450210384,
"learning_rate": 2.3884742186356783e-05,
"loss": 0.8301,
"step": 1113
},
{
"epoch": 0.44395735777622797,
"grad_norm": 0.34893912043486475,
"learning_rate": 2.385991620967305e-05,
"loss": 0.7822,
"step": 1114
},
{
"epoch": 0.44435588323204145,
"grad_norm": 0.34444018169025264,
"learning_rate": 2.383508405390494e-05,
"loss": 0.8036,
"step": 1115
},
{
"epoch": 0.44475440868785493,
"grad_norm": 0.3209220544042362,
"learning_rate": 2.3810245758804614e-05,
"loss": 0.7959,
"step": 1116
},
{
"epoch": 0.4451529341436684,
"grad_norm": 0.3597044151663452,
"learning_rate": 2.378540136413405e-05,
"loss": 0.8029,
"step": 1117
},
{
"epoch": 0.4455514595994819,
"grad_norm": 0.5678063532761977,
"learning_rate": 2.3760550909664987e-05,
"loss": 0.7966,
"step": 1118
},
{
"epoch": 0.44594998505529543,
"grad_norm": 0.3399480220411935,
"learning_rate": 2.373569443517888e-05,
"loss": 0.8075,
"step": 1119
},
{
"epoch": 0.4463485105111089,
"grad_norm": 0.30860916880522943,
"learning_rate": 2.3710831980466825e-05,
"loss": 0.816,
"step": 1120
},
{
"epoch": 0.4467470359669224,
"grad_norm": 0.30451406346046384,
"learning_rate": 2.368596358532947e-05,
"loss": 0.7821,
"step": 1121
},
{
"epoch": 0.44714556142273587,
"grad_norm": 0.3274342257348003,
"learning_rate": 2.3661089289576973e-05,
"loss": 0.8099,
"step": 1122
},
{
"epoch": 0.44754408687854935,
"grad_norm": 0.2990103230908009,
"learning_rate": 2.3636209133028957e-05,
"loss": 0.8438,
"step": 1123
},
{
"epoch": 0.44794261233436283,
"grad_norm": 0.33085965104050497,
"learning_rate": 2.361132315551442e-05,
"loss": 0.8148,
"step": 1124
},
{
"epoch": 0.44834113779017637,
"grad_norm": 0.3235378935161311,
"learning_rate": 2.3586431396871677e-05,
"loss": 0.816,
"step": 1125
},
{
"epoch": 0.44873966324598985,
"grad_norm": 0.30982112537132234,
"learning_rate": 2.3561533896948296e-05,
"loss": 0.8205,
"step": 1126
},
{
"epoch": 0.44913818870180333,
"grad_norm": 0.3148765787287355,
"learning_rate": 2.3536630695601027e-05,
"loss": 0.7902,
"step": 1127
},
{
"epoch": 0.4495367141576168,
"grad_norm": 0.3794802774217404,
"learning_rate": 2.3511721832695767e-05,
"loss": 0.8269,
"step": 1128
},
{
"epoch": 0.4499352396134303,
"grad_norm": 0.3284627503131426,
"learning_rate": 2.3486807348107464e-05,
"loss": 0.8597,
"step": 1129
},
{
"epoch": 0.4503337650692438,
"grad_norm": 0.31901034421618163,
"learning_rate": 2.3461887281720066e-05,
"loss": 0.8024,
"step": 1130
},
{
"epoch": 0.4507322905250573,
"grad_norm": 0.35755058361337694,
"learning_rate": 2.3436961673426456e-05,
"loss": 0.8201,
"step": 1131
},
{
"epoch": 0.4511308159808708,
"grad_norm": 0.37055788579790766,
"learning_rate": 2.3412030563128402e-05,
"loss": 0.8043,
"step": 1132
},
{
"epoch": 0.4515293414366843,
"grad_norm": 0.29135675861869104,
"learning_rate": 2.338709399073645e-05,
"loss": 0.8151,
"step": 1133
},
{
"epoch": 0.45192786689249775,
"grad_norm": 0.3342416376182507,
"learning_rate": 2.336215199616992e-05,
"loss": 0.8368,
"step": 1134
},
{
"epoch": 0.45232639234831123,
"grad_norm": 0.33393406000623976,
"learning_rate": 2.33372046193568e-05,
"loss": 0.8156,
"step": 1135
},
{
"epoch": 0.4527249178041247,
"grad_norm": 0.2962123245077335,
"learning_rate": 2.3312251900233687e-05,
"loss": 0.8133,
"step": 1136
},
{
"epoch": 0.45312344325993825,
"grad_norm": 0.3252453832873177,
"learning_rate": 2.3287293878745746e-05,
"loss": 0.8104,
"step": 1137
},
{
"epoch": 0.45352196871575173,
"grad_norm": 0.31101543033789,
"learning_rate": 2.3262330594846615e-05,
"loss": 0.8116,
"step": 1138
},
{
"epoch": 0.4539204941715652,
"grad_norm": 0.3142215269516538,
"learning_rate": 2.3237362088498366e-05,
"loss": 0.8312,
"step": 1139
},
{
"epoch": 0.4543190196273787,
"grad_norm": 0.3156466217062423,
"learning_rate": 2.3212388399671434e-05,
"loss": 0.8026,
"step": 1140
},
{
"epoch": 0.4547175450831922,
"grad_norm": 0.29130193805422705,
"learning_rate": 2.318740956834453e-05,
"loss": 0.8208,
"step": 1141
},
{
"epoch": 0.45511607053900566,
"grad_norm": 0.31609767343436057,
"learning_rate": 2.3162425634504624e-05,
"loss": 0.8048,
"step": 1142
},
{
"epoch": 0.4555145959948192,
"grad_norm": 0.30627780545918254,
"learning_rate": 2.3137436638146838e-05,
"loss": 0.8256,
"step": 1143
},
{
"epoch": 0.4559131214506327,
"grad_norm": 0.3942343869320896,
"learning_rate": 2.3112442619274408e-05,
"loss": 0.8231,
"step": 1144
},
{
"epoch": 0.45631164690644616,
"grad_norm": 0.30922816387497437,
"learning_rate": 2.3087443617898585e-05,
"loss": 0.8128,
"step": 1145
},
{
"epoch": 0.45671017236225964,
"grad_norm": 0.31257709643441933,
"learning_rate": 2.3062439674038643e-05,
"loss": 0.7816,
"step": 1146
},
{
"epoch": 0.4571086978180731,
"grad_norm": 0.3125099111968418,
"learning_rate": 2.3037430827721724e-05,
"loss": 0.8511,
"step": 1147
},
{
"epoch": 0.4575072232738866,
"grad_norm": 0.3259270287494568,
"learning_rate": 2.3012417118982833e-05,
"loss": 0.8078,
"step": 1148
},
{
"epoch": 0.45790574872970013,
"grad_norm": 0.4841424847659405,
"learning_rate": 2.298739858786477e-05,
"loss": 0.846,
"step": 1149
},
{
"epoch": 0.4583042741855136,
"grad_norm": 0.30651971893302865,
"learning_rate": 2.2962375274418042e-05,
"loss": 0.7836,
"step": 1150
},
{
"epoch": 0.4587027996413271,
"grad_norm": 0.29130109838002205,
"learning_rate": 2.2937347218700814e-05,
"loss": 0.8251,
"step": 1151
},
{
"epoch": 0.4591013250971406,
"grad_norm": 0.29216772346283687,
"learning_rate": 2.2912314460778838e-05,
"loss": 0.7934,
"step": 1152
},
{
"epoch": 0.45949985055295406,
"grad_norm": 0.28659925320048857,
"learning_rate": 2.2887277040725416e-05,
"loss": 0.8132,
"step": 1153
},
{
"epoch": 0.45989837600876754,
"grad_norm": 0.2821978280610863,
"learning_rate": 2.2862234998621276e-05,
"loss": 0.8018,
"step": 1154
},
{
"epoch": 0.4602969014645811,
"grad_norm": 0.3022683438134659,
"learning_rate": 2.2837188374554584e-05,
"loss": 0.8011,
"step": 1155
},
{
"epoch": 0.46069542692039456,
"grad_norm": 0.29620670062698495,
"learning_rate": 2.281213720862081e-05,
"loss": 0.7884,
"step": 1156
},
{
"epoch": 0.46109395237620804,
"grad_norm": 0.2804223684367047,
"learning_rate": 2.2787081540922716e-05,
"loss": 0.8016,
"step": 1157
},
{
"epoch": 0.4614924778320215,
"grad_norm": 0.30149704387252646,
"learning_rate": 2.2762021411570254e-05,
"loss": 0.8044,
"step": 1158
},
{
"epoch": 0.461891003287835,
"grad_norm": 0.28566950350769055,
"learning_rate": 2.273695686068053e-05,
"loss": 0.8113,
"step": 1159
},
{
"epoch": 0.4622895287436485,
"grad_norm": 0.27932263683794883,
"learning_rate": 2.2711887928377725e-05,
"loss": 0.8178,
"step": 1160
},
{
"epoch": 0.462688054199462,
"grad_norm": 0.3504836230780002,
"learning_rate": 2.2686814654793036e-05,
"loss": 0.8276,
"step": 1161
},
{
"epoch": 0.4630865796552755,
"grad_norm": 0.31710148422205037,
"learning_rate": 2.26617370800646e-05,
"loss": 0.8075,
"step": 1162
},
{
"epoch": 0.463485105111089,
"grad_norm": 0.288322551014853,
"learning_rate": 2.2636655244337455e-05,
"loss": 0.8099,
"step": 1163
},
{
"epoch": 0.46388363056690246,
"grad_norm": 0.30696335215944015,
"learning_rate": 2.2611569187763448e-05,
"loss": 0.8167,
"step": 1164
},
{
"epoch": 0.46428215602271594,
"grad_norm": 0.2740251270995111,
"learning_rate": 2.258647895050118e-05,
"loss": 0.8122,
"step": 1165
},
{
"epoch": 0.4646806814785294,
"grad_norm": 0.30100618811204716,
"learning_rate": 2.2561384572715957e-05,
"loss": 0.8124,
"step": 1166
},
{
"epoch": 0.4650792069343429,
"grad_norm": 0.28921422085766796,
"learning_rate": 2.2536286094579717e-05,
"loss": 0.8344,
"step": 1167
},
{
"epoch": 0.46547773239015644,
"grad_norm": 0.30173959947735146,
"learning_rate": 2.2511183556270937e-05,
"loss": 0.8326,
"step": 1168
},
{
"epoch": 0.4658762578459699,
"grad_norm": 0.5060784189623851,
"learning_rate": 2.2486076997974617e-05,
"loss": 0.7857,
"step": 1169
},
{
"epoch": 0.4662747833017834,
"grad_norm": 0.29228478601288754,
"learning_rate": 2.2460966459882184e-05,
"loss": 0.7995,
"step": 1170
},
{
"epoch": 0.4666733087575969,
"grad_norm": 0.31868507689912057,
"learning_rate": 2.2435851982191426e-05,
"loss": 0.8323,
"step": 1171
},
{
"epoch": 0.46707183421341036,
"grad_norm": 0.27865315868245927,
"learning_rate": 2.2410733605106462e-05,
"loss": 0.7983,
"step": 1172
},
{
"epoch": 0.46747035966922384,
"grad_norm": 0.29759002153633596,
"learning_rate": 2.238561136883764e-05,
"loss": 0.8044,
"step": 1173
},
{
"epoch": 0.4678688851250374,
"grad_norm": 0.2846486337810441,
"learning_rate": 2.236048531360147e-05,
"loss": 0.8111,
"step": 1174
},
{
"epoch": 0.46826741058085086,
"grad_norm": 0.3118599392906745,
"learning_rate": 2.2335355479620605e-05,
"loss": 0.802,
"step": 1175
},
{
"epoch": 0.46866593603666434,
"grad_norm": 0.30270097977856236,
"learning_rate": 2.231022190712373e-05,
"loss": 0.802,
"step": 1176
},
{
"epoch": 0.4690644614924778,
"grad_norm": 0.2817261828834847,
"learning_rate": 2.228508463634551e-05,
"loss": 0.8007,
"step": 1177
},
{
"epoch": 0.4694629869482913,
"grad_norm": 0.3274731513059302,
"learning_rate": 2.225994370752655e-05,
"loss": 0.8138,
"step": 1178
},
{
"epoch": 0.4698615124041048,
"grad_norm": 0.2968053602546118,
"learning_rate": 2.2234799160913285e-05,
"loss": 0.8239,
"step": 1179
},
{
"epoch": 0.4702600378599183,
"grad_norm": 0.9004493930737405,
"learning_rate": 2.2209651036757965e-05,
"loss": 0.8121,
"step": 1180
},
{
"epoch": 0.4706585633157318,
"grad_norm": 0.29343035187513045,
"learning_rate": 2.218449937531856e-05,
"loss": 0.8062,
"step": 1181
},
{
"epoch": 0.4710570887715453,
"grad_norm": 0.3251626790620503,
"learning_rate": 2.2159344216858693e-05,
"loss": 0.8171,
"step": 1182
},
{
"epoch": 0.47145561422735877,
"grad_norm": 0.3008660196180082,
"learning_rate": 2.2134185601647595e-05,
"loss": 0.8233,
"step": 1183
},
{
"epoch": 0.47185413968317225,
"grad_norm": 0.31587152291948645,
"learning_rate": 2.2109023569960028e-05,
"loss": 0.7893,
"step": 1184
},
{
"epoch": 0.4722526651389857,
"grad_norm": 0.3109368684781642,
"learning_rate": 2.208385816207622e-05,
"loss": 0.8351,
"step": 1185
},
{
"epoch": 0.47265119059479926,
"grad_norm": 0.3585332576145692,
"learning_rate": 2.2058689418281806e-05,
"loss": 0.8235,
"step": 1186
},
{
"epoch": 0.47304971605061275,
"grad_norm": 0.36347361575702536,
"learning_rate": 2.2033517378867773e-05,
"loss": 0.8333,
"step": 1187
},
{
"epoch": 0.4734482415064262,
"grad_norm": 0.3104981737491085,
"learning_rate": 2.2008342084130357e-05,
"loss": 0.7985,
"step": 1188
},
{
"epoch": 0.4738467669622397,
"grad_norm": 0.29070707839217663,
"learning_rate": 2.1983163574371038e-05,
"loss": 0.8135,
"step": 1189
},
{
"epoch": 0.4742452924180532,
"grad_norm": 0.3019633554231252,
"learning_rate": 2.1957981889896413e-05,
"loss": 0.8042,
"step": 1190
},
{
"epoch": 0.47464381787386667,
"grad_norm": 0.28671960218113185,
"learning_rate": 2.1932797071018176e-05,
"loss": 0.7833,
"step": 1191
},
{
"epoch": 0.4750423433296802,
"grad_norm": 0.30296654651092136,
"learning_rate": 2.1907609158053043e-05,
"loss": 0.802,
"step": 1192
},
{
"epoch": 0.4754408687854937,
"grad_norm": 0.30792479960608926,
"learning_rate": 2.1882418191322667e-05,
"loss": 0.7874,
"step": 1193
},
{
"epoch": 0.47583939424130717,
"grad_norm": 0.39407347199239423,
"learning_rate": 2.18572242111536e-05,
"loss": 0.8171,
"step": 1194
},
{
"epoch": 0.47623791969712065,
"grad_norm": 0.2981154461238015,
"learning_rate": 2.183202725787723e-05,
"loss": 0.8202,
"step": 1195
},
{
"epoch": 0.47663644515293413,
"grad_norm": 0.2883120319508124,
"learning_rate": 2.1806827371829686e-05,
"loss": 0.8354,
"step": 1196
},
{
"epoch": 0.4770349706087476,
"grad_norm": 0.29569950551843616,
"learning_rate": 2.1781624593351788e-05,
"loss": 0.8034,
"step": 1197
},
{
"epoch": 0.47743349606456115,
"grad_norm": 0.2942079747064485,
"learning_rate": 2.175641896278901e-05,
"loss": 0.8423,
"step": 1198
},
{
"epoch": 0.47783202152037463,
"grad_norm": 0.31504833020024914,
"learning_rate": 2.1731210520491365e-05,
"loss": 0.7956,
"step": 1199
},
{
"epoch": 0.4782305469761881,
"grad_norm": 0.27602156434261366,
"learning_rate": 2.1705999306813378e-05,
"loss": 0.7789,
"step": 1200
},
{
"epoch": 0.4786290724320016,
"grad_norm": 0.3159340649254405,
"learning_rate": 2.168078536211403e-05,
"loss": 0.8196,
"step": 1201
},
{
"epoch": 0.47902759788781507,
"grad_norm": 0.30368372482852835,
"learning_rate": 2.1655568726756643e-05,
"loss": 0.8199,
"step": 1202
},
{
"epoch": 0.47942612334362855,
"grad_norm": 0.3082856381822439,
"learning_rate": 2.163034944110886e-05,
"loss": 0.8217,
"step": 1203
},
{
"epoch": 0.4798246487994421,
"grad_norm": 0.30444993184134234,
"learning_rate": 2.1605127545542572e-05,
"loss": 0.81,
"step": 1204
},
{
"epoch": 0.48022317425525557,
"grad_norm": 0.3053503071698002,
"learning_rate": 2.1579903080433837e-05,
"loss": 0.7724,
"step": 1205
},
{
"epoch": 0.48062169971106905,
"grad_norm": 0.2907609764564475,
"learning_rate": 2.1554676086162827e-05,
"loss": 0.7939,
"step": 1206
},
{
"epoch": 0.48102022516688253,
"grad_norm": 0.30438913548426777,
"learning_rate": 2.152944660311378e-05,
"loss": 0.8124,
"step": 1207
},
{
"epoch": 0.481418750622696,
"grad_norm": 0.2916803784401073,
"learning_rate": 2.1504214671674903e-05,
"loss": 0.8002,
"step": 1208
},
{
"epoch": 0.4818172760785095,
"grad_norm": 0.3118580484823128,
"learning_rate": 2.147898033223831e-05,
"loss": 0.8152,
"step": 1209
},
{
"epoch": 0.48221580153432303,
"grad_norm": 0.30562499279688954,
"learning_rate": 2.1453743625200004e-05,
"loss": 0.7978,
"step": 1210
},
{
"epoch": 0.4826143269901365,
"grad_norm": 0.29452400424891173,
"learning_rate": 2.142850459095975e-05,
"loss": 0.8083,
"step": 1211
},
{
"epoch": 0.48301285244595,
"grad_norm": 0.29629314495355424,
"learning_rate": 2.1403263269921046e-05,
"loss": 0.8073,
"step": 1212
},
{
"epoch": 0.4834113779017635,
"grad_norm": 0.32650392294542924,
"learning_rate": 2.1378019702491054e-05,
"loss": 0.7924,
"step": 1213
},
{
"epoch": 0.48380990335757695,
"grad_norm": 0.30150320495591154,
"learning_rate": 2.135277392908053e-05,
"loss": 0.8531,
"step": 1214
},
{
"epoch": 0.48420842881339043,
"grad_norm": 0.310619189184776,
"learning_rate": 2.132752599010376e-05,
"loss": 0.834,
"step": 1215
},
{
"epoch": 0.48460695426920397,
"grad_norm": 0.32562782034606635,
"learning_rate": 2.1302275925978508e-05,
"loss": 0.7904,
"step": 1216
},
{
"epoch": 0.48500547972501745,
"grad_norm": 0.3017176154191894,
"learning_rate": 2.1277023777125915e-05,
"loss": 0.8194,
"step": 1217
},
{
"epoch": 0.48540400518083093,
"grad_norm": 0.32023476312765164,
"learning_rate": 2.1251769583970484e-05,
"loss": 0.7893,
"step": 1218
},
{
"epoch": 0.4858025306366444,
"grad_norm": 0.28781956877783055,
"learning_rate": 2.122651338693998e-05,
"loss": 0.8156,
"step": 1219
},
{
"epoch": 0.4862010560924579,
"grad_norm": 1.2149233879740187,
"learning_rate": 2.1201255226465375e-05,
"loss": 0.8266,
"step": 1220
},
{
"epoch": 0.4865995815482714,
"grad_norm": 0.2992816242260791,
"learning_rate": 2.1175995142980793e-05,
"loss": 0.8263,
"step": 1221
},
{
"epoch": 0.4869981070040849,
"grad_norm": 0.3235204400431873,
"learning_rate": 2.115073317692342e-05,
"loss": 0.8074,
"step": 1222
},
{
"epoch": 0.4873966324598984,
"grad_norm": 0.2995871348511909,
"learning_rate": 2.112546936873347e-05,
"loss": 0.8347,
"step": 1223
},
{
"epoch": 0.4877951579157119,
"grad_norm": 0.3268455050694444,
"learning_rate": 2.110020375885411e-05,
"loss": 0.8104,
"step": 1224
},
{
"epoch": 0.48819368337152536,
"grad_norm": 0.31345643601355155,
"learning_rate": 2.1074936387731367e-05,
"loss": 0.8271,
"step": 1225
},
{
"epoch": 0.48859220882733884,
"grad_norm": 0.37781746616538014,
"learning_rate": 2.1049667295814113e-05,
"loss": 0.8276,
"step": 1226
},
{
"epoch": 0.4889907342831523,
"grad_norm": 0.30667467990270375,
"learning_rate": 2.1024396523553955e-05,
"loss": 0.7966,
"step": 1227
},
{
"epoch": 0.48938925973896585,
"grad_norm": 0.3116435731085305,
"learning_rate": 2.099912411140521e-05,
"loss": 0.801,
"step": 1228
},
{
"epoch": 0.48978778519477933,
"grad_norm": 0.3045824871287522,
"learning_rate": 2.0973850099824807e-05,
"loss": 0.76,
"step": 1229
},
{
"epoch": 0.4901863106505928,
"grad_norm": 0.32180564748889195,
"learning_rate": 2.094857452927224e-05,
"loss": 0.8158,
"step": 1230
},
{
"epoch": 0.4905848361064063,
"grad_norm": 0.9714532194362665,
"learning_rate": 2.09232974402095e-05,
"loss": 0.7917,
"step": 1231
},
{
"epoch": 0.4909833615622198,
"grad_norm": 0.4083517397563029,
"learning_rate": 2.089801887310099e-05,
"loss": 0.7759,
"step": 1232
},
{
"epoch": 0.49138188701803326,
"grad_norm": 0.32375580190481257,
"learning_rate": 2.087273886841351e-05,
"loss": 0.8225,
"step": 1233
},
{
"epoch": 0.49178041247384674,
"grad_norm": 0.29897291559360073,
"learning_rate": 2.0847457466616135e-05,
"loss": 0.8223,
"step": 1234
},
{
"epoch": 0.4921789379296603,
"grad_norm": 0.6264426925966912,
"learning_rate": 2.08221747081802e-05,
"loss": 0.806,
"step": 1235
},
{
"epoch": 0.49257746338547376,
"grad_norm": 0.3393552807659732,
"learning_rate": 2.079689063357919e-05,
"loss": 0.808,
"step": 1236
},
{
"epoch": 0.49297598884128724,
"grad_norm": 0.48064261120943,
"learning_rate": 2.0771605283288716e-05,
"loss": 0.8028,
"step": 1237
},
{
"epoch": 0.4933745142971007,
"grad_norm": 0.30581132700814045,
"learning_rate": 2.074631869778641e-05,
"loss": 0.8067,
"step": 1238
},
{
"epoch": 0.4937730397529142,
"grad_norm": 0.29530312754650695,
"learning_rate": 2.0721030917551905e-05,
"loss": 0.8212,
"step": 1239
},
{
"epoch": 0.4941715652087277,
"grad_norm": 0.29055485043935136,
"learning_rate": 2.0695741983066724e-05,
"loss": 0.8193,
"step": 1240
},
{
"epoch": 0.4945700906645412,
"grad_norm": 0.31170603570838856,
"learning_rate": 2.0670451934814252e-05,
"loss": 0.7959,
"step": 1241
},
{
"epoch": 0.4949686161203547,
"grad_norm": 0.28393384738922395,
"learning_rate": 2.0645160813279657e-05,
"loss": 0.8113,
"step": 1242
},
{
"epoch": 0.4953671415761682,
"grad_norm": 0.31099237786422546,
"learning_rate": 2.0619868658949818e-05,
"loss": 0.8277,
"step": 1243
},
{
"epoch": 0.49576566703198166,
"grad_norm": 0.4543341488542098,
"learning_rate": 2.059457551231327e-05,
"loss": 0.8053,
"step": 1244
},
{
"epoch": 0.49616419248779514,
"grad_norm": 0.3934508739825585,
"learning_rate": 2.0569281413860147e-05,
"loss": 0.821,
"step": 1245
},
{
"epoch": 0.4965627179436086,
"grad_norm": 0.3041220289880547,
"learning_rate": 2.054398640408208e-05,
"loss": 0.7835,
"step": 1246
},
{
"epoch": 0.49696124339942216,
"grad_norm": 0.3121481686636135,
"learning_rate": 2.0518690523472182e-05,
"loss": 0.8196,
"step": 1247
},
{
"epoch": 0.49735976885523564,
"grad_norm": 0.29339385739102847,
"learning_rate": 2.0493393812524967e-05,
"loss": 0.812,
"step": 1248
},
{
"epoch": 0.4977582943110491,
"grad_norm": 0.6381668064023208,
"learning_rate": 2.0468096311736247e-05,
"loss": 0.8051,
"step": 1249
},
{
"epoch": 0.4981568197668626,
"grad_norm": 0.30166068852688105,
"learning_rate": 2.044279806160313e-05,
"loss": 0.787,
"step": 1250
},
{
"epoch": 0.4985553452226761,
"grad_norm": 0.28274022253823955,
"learning_rate": 2.0417499102623903e-05,
"loss": 0.8003,
"step": 1251
},
{
"epoch": 0.49895387067848956,
"grad_norm": 0.3796924292206021,
"learning_rate": 2.0392199475297995e-05,
"loss": 0.7982,
"step": 1252
},
{
"epoch": 0.4993523961343031,
"grad_norm": 0.2853722232096178,
"learning_rate": 2.0366899220125903e-05,
"loss": 0.8013,
"step": 1253
},
{
"epoch": 0.4997509215901166,
"grad_norm": 0.31573490109402036,
"learning_rate": 2.034159837760914e-05,
"loss": 0.8147,
"step": 1254
},
{
"epoch": 0.50014944704593,
"grad_norm": 0.2765481712079679,
"learning_rate": 2.0316296988250138e-05,
"loss": 0.7995,
"step": 1255
},
{
"epoch": 0.5005479725017435,
"grad_norm": 0.2994449499838975,
"learning_rate": 2.029099509255223e-05,
"loss": 0.7946,
"step": 1256
},
{
"epoch": 0.5009464979575571,
"grad_norm": 0.3207532131664091,
"learning_rate": 2.026569273101954e-05,
"loss": 0.8038,
"step": 1257
},
{
"epoch": 0.5013450234133705,
"grad_norm": 0.2829753955420768,
"learning_rate": 2.0240389944156937e-05,
"loss": 0.8001,
"step": 1258
},
{
"epoch": 0.501743548869184,
"grad_norm": 0.27998354424049926,
"learning_rate": 2.021508677246999e-05,
"loss": 0.791,
"step": 1259
},
{
"epoch": 0.5021420743249975,
"grad_norm": 0.2913911881200998,
"learning_rate": 2.018978325646486e-05,
"loss": 0.7914,
"step": 1260
},
{
"epoch": 0.502540599780811,
"grad_norm": 0.26963096722494334,
"learning_rate": 2.0164479436648272e-05,
"loss": 0.8406,
"step": 1261
},
{
"epoch": 0.5029391252366245,
"grad_norm": 0.3010795830435557,
"learning_rate": 2.0139175353527446e-05,
"loss": 0.8078,
"step": 1262
},
{
"epoch": 0.503337650692438,
"grad_norm": 0.30960536952730017,
"learning_rate": 2.0113871047610016e-05,
"loss": 0.8074,
"step": 1263
},
{
"epoch": 0.5037361761482515,
"grad_norm": 0.26906634414413455,
"learning_rate": 2.0088566559403953e-05,
"loss": 0.7935,
"step": 1264
},
{
"epoch": 0.5041347016040649,
"grad_norm": 0.34646731409844644,
"learning_rate": 2.006326192941755e-05,
"loss": 0.8442,
"step": 1265
},
{
"epoch": 0.5045332270598785,
"grad_norm": 0.2726972871873017,
"learning_rate": 2.003795719815931e-05,
"loss": 0.7859,
"step": 1266
},
{
"epoch": 0.5049317525156919,
"grad_norm": 0.3143394544398179,
"learning_rate": 2.0012652406137903e-05,
"loss": 0.8307,
"step": 1267
},
{
"epoch": 0.5053302779715054,
"grad_norm": 0.2631801881501474,
"learning_rate": 1.99873475938621e-05,
"loss": 0.7999,
"step": 1268
},
{
"epoch": 0.505728803427319,
"grad_norm": 0.34508087706819923,
"learning_rate": 1.9962042801840698e-05,
"loss": 0.8091,
"step": 1269
},
{
"epoch": 0.5061273288831324,
"grad_norm": 0.27438242812890384,
"learning_rate": 1.9936738070582455e-05,
"loss": 0.798,
"step": 1270
},
{
"epoch": 0.5065258543389459,
"grad_norm": 0.3025634657688614,
"learning_rate": 1.991143344059605e-05,
"loss": 0.7952,
"step": 1271
},
{
"epoch": 0.5069243797947593,
"grad_norm": 0.2845789431308592,
"learning_rate": 1.988612895238999e-05,
"loss": 0.8374,
"step": 1272
},
{
"epoch": 0.5073229052505729,
"grad_norm": 0.30248035578518695,
"learning_rate": 1.986082464647255e-05,
"loss": 0.7864,
"step": 1273
},
{
"epoch": 0.5077214307063864,
"grad_norm": 0.2950710488906475,
"learning_rate": 1.9835520563351735e-05,
"loss": 0.8288,
"step": 1274
},
{
"epoch": 0.5081199561621998,
"grad_norm": 0.26824757799025784,
"learning_rate": 1.9810216743535146e-05,
"loss": 0.8364,
"step": 1275
},
{
"epoch": 0.5085184816180134,
"grad_norm": 0.2849419128102798,
"learning_rate": 1.9784913227530024e-05,
"loss": 0.8236,
"step": 1276
},
{
"epoch": 0.5089170070738268,
"grad_norm": 0.3103889603819969,
"learning_rate": 1.975961005584307e-05,
"loss": 0.8136,
"step": 1277
},
{
"epoch": 0.5093155325296403,
"grad_norm": 0.6303290018451543,
"learning_rate": 1.9734307268980467e-05,
"loss": 0.8311,
"step": 1278
},
{
"epoch": 0.5097140579854538,
"grad_norm": 0.32732619537234586,
"learning_rate": 1.9709004907447774e-05,
"loss": 0.8221,
"step": 1279
},
{
"epoch": 0.5101125834412673,
"grad_norm": 0.28339108969670607,
"learning_rate": 1.9683703011749862e-05,
"loss": 0.7966,
"step": 1280
},
{
"epoch": 0.5105111088970808,
"grad_norm": 0.3203578395612973,
"learning_rate": 1.965840162239087e-05,
"loss": 0.8137,
"step": 1281
},
{
"epoch": 0.5109096343528943,
"grad_norm": 0.27176747745707136,
"learning_rate": 1.96331007798741e-05,
"loss": 0.8078,
"step": 1282
},
{
"epoch": 0.5113081598087078,
"grad_norm": 0.29516820299549673,
"learning_rate": 1.9607800524702015e-05,
"loss": 0.8209,
"step": 1283
},
{
"epoch": 0.5117066852645212,
"grad_norm": 0.26212656038325677,
"learning_rate": 1.9582500897376104e-05,
"loss": 0.8141,
"step": 1284
},
{
"epoch": 0.5121052107203348,
"grad_norm": 0.28250007105261504,
"learning_rate": 1.955720193839687e-05,
"loss": 0.8278,
"step": 1285
},
{
"epoch": 0.5125037361761483,
"grad_norm": 0.26685759222958566,
"learning_rate": 1.953190368826376e-05,
"loss": 0.8339,
"step": 1286
},
{
"epoch": 0.5129022616319617,
"grad_norm": 0.29022333673533535,
"learning_rate": 1.9506606187475036e-05,
"loss": 0.8315,
"step": 1287
},
{
"epoch": 0.5133007870877753,
"grad_norm": 0.2670289567076886,
"learning_rate": 1.9481309476527825e-05,
"loss": 0.801,
"step": 1288
},
{
"epoch": 0.5136993125435887,
"grad_norm": 0.289510280019879,
"learning_rate": 1.9456013595917928e-05,
"loss": 0.812,
"step": 1289
},
{
"epoch": 0.5140978379994022,
"grad_norm": 0.3128114319953551,
"learning_rate": 1.9430718586139863e-05,
"loss": 0.8095,
"step": 1290
},
{
"epoch": 0.5144963634552157,
"grad_norm": 0.2888978962753298,
"learning_rate": 1.9405424487686732e-05,
"loss": 0.79,
"step": 1291
},
{
"epoch": 0.5148948889110292,
"grad_norm": 0.30521651593807825,
"learning_rate": 1.9380131341050185e-05,
"loss": 0.8137,
"step": 1292
},
{
"epoch": 0.5152934143668427,
"grad_norm": 0.2722787387877988,
"learning_rate": 1.935483918672035e-05,
"loss": 0.8291,
"step": 1293
},
{
"epoch": 0.5156919398226562,
"grad_norm": 0.2863692337341115,
"learning_rate": 1.932954806518575e-05,
"loss": 0.7981,
"step": 1294
},
{
"epoch": 0.5160904652784697,
"grad_norm": 0.2759777323624655,
"learning_rate": 1.9304258016933282e-05,
"loss": 0.8272,
"step": 1295
},
{
"epoch": 0.5164889907342831,
"grad_norm": 0.27713843658608434,
"learning_rate": 1.92789690824481e-05,
"loss": 0.8079,
"step": 1296
},
{
"epoch": 0.5168875161900967,
"grad_norm": 0.2877077017647955,
"learning_rate": 1.92536813022136e-05,
"loss": 0.7918,
"step": 1297
},
{
"epoch": 0.5172860416459101,
"grad_norm": 0.28949094300241585,
"learning_rate": 1.9228394716711288e-05,
"loss": 0.7969,
"step": 1298
},
{
"epoch": 0.5176845671017236,
"grad_norm": 0.29697989743375497,
"learning_rate": 1.9203109366420812e-05,
"loss": 0.7928,
"step": 1299
},
{
"epoch": 0.5180830925575372,
"grad_norm": 0.27889648874882045,
"learning_rate": 1.917782529181981e-05,
"loss": 0.8233,
"step": 1300
},
{
"epoch": 0.5184816180133506,
"grad_norm": 0.3023364181088352,
"learning_rate": 1.9152542533383872e-05,
"loss": 0.8312,
"step": 1301
},
{
"epoch": 0.5188801434691641,
"grad_norm": 0.28357607259449,
"learning_rate": 1.9127261131586503e-05,
"loss": 0.7801,
"step": 1302
},
{
"epoch": 0.5192786689249775,
"grad_norm": 0.2869887242640123,
"learning_rate": 1.910198112689902e-05,
"loss": 0.7965,
"step": 1303
},
{
"epoch": 0.5196771943807911,
"grad_norm": 0.28743333022015244,
"learning_rate": 1.9076702559790514e-05,
"loss": 0.8146,
"step": 1304
},
{
"epoch": 0.5200757198366046,
"grad_norm": 0.284017183782701,
"learning_rate": 1.9051425470727766e-05,
"loss": 0.7865,
"step": 1305
},
{
"epoch": 0.520474245292418,
"grad_norm": 0.29268712998816515,
"learning_rate": 1.9026149900175193e-05,
"loss": 0.7996,
"step": 1306
},
{
"epoch": 0.5208727707482316,
"grad_norm": 0.28999045805168566,
"learning_rate": 1.9000875888594792e-05,
"loss": 0.849,
"step": 1307
},
{
"epoch": 0.521271296204045,
"grad_norm": 0.30459398540455407,
"learning_rate": 1.8975603476446048e-05,
"loss": 0.7935,
"step": 1308
},
{
"epoch": 0.5216698216598585,
"grad_norm": 0.2838650093705641,
"learning_rate": 1.89503327041859e-05,
"loss": 0.8034,
"step": 1309
},
{
"epoch": 0.522068347115672,
"grad_norm": 0.304766254772995,
"learning_rate": 1.8925063612268637e-05,
"loss": 0.846,
"step": 1310
},
{
"epoch": 0.5224668725714855,
"grad_norm": 0.27645008842126473,
"learning_rate": 1.8899796241145903e-05,
"loss": 0.8269,
"step": 1311
},
{
"epoch": 0.522865398027299,
"grad_norm": 0.2952376158549396,
"learning_rate": 1.8874530631266536e-05,
"loss": 0.8369,
"step": 1312
},
{
"epoch": 0.5232639234831125,
"grad_norm": 0.33296861229967156,
"learning_rate": 1.8849266823076578e-05,
"loss": 0.8134,
"step": 1313
},
{
"epoch": 0.523662448938926,
"grad_norm": 0.2866595965213398,
"learning_rate": 1.8824004857019217e-05,
"loss": 0.8192,
"step": 1314
},
{
"epoch": 0.5240609743947394,
"grad_norm": 0.4924611590945922,
"learning_rate": 1.879874477353463e-05,
"loss": 0.7903,
"step": 1315
},
{
"epoch": 0.524459499850553,
"grad_norm": 0.27677116299415827,
"learning_rate": 1.877348661306003e-05,
"loss": 0.8102,
"step": 1316
},
{
"epoch": 0.5248580253063665,
"grad_norm": 0.28883962158261584,
"learning_rate": 1.8748230416029522e-05,
"loss": 0.7984,
"step": 1317
},
{
"epoch": 0.5252565507621799,
"grad_norm": 0.281009978014599,
"learning_rate": 1.8722976222874095e-05,
"loss": 0.8045,
"step": 1318
},
{
"epoch": 0.5256550762179935,
"grad_norm": 0.3095342467124618,
"learning_rate": 1.8697724074021502e-05,
"loss": 0.767,
"step": 1319
},
{
"epoch": 0.5260536016738069,
"grad_norm": 0.29319346123143347,
"learning_rate": 1.8672474009896242e-05,
"loss": 0.8372,
"step": 1320
},
{
"epoch": 0.5264521271296204,
"grad_norm": 0.47782909290265757,
"learning_rate": 1.8647226070919474e-05,
"loss": 0.8488,
"step": 1321
},
{
"epoch": 0.5268506525854338,
"grad_norm": 0.3110245262948928,
"learning_rate": 1.862198029750895e-05,
"loss": 0.7963,
"step": 1322
},
{
"epoch": 0.5272491780412474,
"grad_norm": 0.2917881624752996,
"learning_rate": 1.8596736730078967e-05,
"loss": 0.7952,
"step": 1323
},
{
"epoch": 0.5276477034970609,
"grad_norm": 0.33165379448294435,
"learning_rate": 1.857149540904026e-05,
"loss": 0.8076,
"step": 1324
},
{
"epoch": 0.5280462289528743,
"grad_norm": 0.4239553010821896,
"learning_rate": 1.8546256374800006e-05,
"loss": 0.8028,
"step": 1325
},
{
"epoch": 0.5284447544086879,
"grad_norm": 0.2734465341467207,
"learning_rate": 1.8521019667761697e-05,
"loss": 0.794,
"step": 1326
},
{
"epoch": 0.5288432798645013,
"grad_norm": 0.2629858746393782,
"learning_rate": 1.8495785328325104e-05,
"loss": 0.8112,
"step": 1327
},
{
"epoch": 0.5292418053203148,
"grad_norm": 0.28632746629019823,
"learning_rate": 1.8470553396886222e-05,
"loss": 0.8052,
"step": 1328
},
{
"epoch": 0.5296403307761284,
"grad_norm": 0.2693728963637755,
"learning_rate": 1.8445323913837173e-05,
"loss": 0.797,
"step": 1329
},
{
"epoch": 0.5300388562319418,
"grad_norm": 0.29114792078325186,
"learning_rate": 1.8420096919566173e-05,
"loss": 0.8199,
"step": 1330
},
{
"epoch": 0.5304373816877553,
"grad_norm": 0.2806667770430771,
"learning_rate": 1.8394872454457434e-05,
"loss": 0.7832,
"step": 1331
},
{
"epoch": 0.5308359071435688,
"grad_norm": 0.28182635320788874,
"learning_rate": 1.836965055889115e-05,
"loss": 0.7998,
"step": 1332
},
{
"epoch": 0.5312344325993823,
"grad_norm": 0.3254325490129574,
"learning_rate": 1.8344431273243364e-05,
"loss": 0.8112,
"step": 1333
},
{
"epoch": 0.5316329580551957,
"grad_norm": 0.29483982391186925,
"learning_rate": 1.8319214637885975e-05,
"loss": 0.8025,
"step": 1334
},
{
"epoch": 0.5320314835110093,
"grad_norm": 0.2552432370606682,
"learning_rate": 1.829400069318663e-05,
"loss": 0.7978,
"step": 1335
},
{
"epoch": 0.5324300089668228,
"grad_norm": 0.2923821069068519,
"learning_rate": 1.826878947950864e-05,
"loss": 0.7833,
"step": 1336
},
{
"epoch": 0.5328285344226362,
"grad_norm": 0.26602672952480433,
"learning_rate": 1.8243581037211005e-05,
"loss": 0.7893,
"step": 1337
},
{
"epoch": 0.5332270598784498,
"grad_norm": 0.26880063097474627,
"learning_rate": 1.821837540664822e-05,
"loss": 0.7862,
"step": 1338
},
{
"epoch": 0.5336255853342632,
"grad_norm": 0.2708329335402036,
"learning_rate": 1.8193172628170324e-05,
"loss": 0.8108,
"step": 1339
},
{
"epoch": 0.5340241107900767,
"grad_norm": 0.28229072758383317,
"learning_rate": 1.8167972742122773e-05,
"loss": 0.8675,
"step": 1340
},
{
"epoch": 0.5344226362458903,
"grad_norm": 0.2741069117172231,
"learning_rate": 1.81427757888464e-05,
"loss": 0.8261,
"step": 1341
},
{
"epoch": 0.5348211617017037,
"grad_norm": 0.27606049985568326,
"learning_rate": 1.811758180867734e-05,
"loss": 0.8128,
"step": 1342
},
{
"epoch": 0.5352196871575172,
"grad_norm": 0.27575883416758074,
"learning_rate": 1.8092390841946964e-05,
"loss": 0.7975,
"step": 1343
},
{
"epoch": 0.5356182126133306,
"grad_norm": 0.27470419217590547,
"learning_rate": 1.8067202928981827e-05,
"loss": 0.801,
"step": 1344
},
{
"epoch": 0.5360167380691442,
"grad_norm": 0.2682028369114076,
"learning_rate": 1.804201811010359e-05,
"loss": 0.7992,
"step": 1345
},
{
"epoch": 0.5364152635249576,
"grad_norm": 0.29741163933246206,
"learning_rate": 1.8016836425628972e-05,
"loss": 0.7863,
"step": 1346
},
{
"epoch": 0.5368137889807711,
"grad_norm": 0.2879307582320043,
"learning_rate": 1.7991657915869646e-05,
"loss": 0.7912,
"step": 1347
},
{
"epoch": 0.5372123144365847,
"grad_norm": 0.26970303031329906,
"learning_rate": 1.7966482621132227e-05,
"loss": 0.83,
"step": 1348
},
{
"epoch": 0.5376108398923981,
"grad_norm": 0.2899878970961642,
"learning_rate": 1.7941310581718197e-05,
"loss": 0.8143,
"step": 1349
},
{
"epoch": 0.5380093653482116,
"grad_norm": 0.2749787514839584,
"learning_rate": 1.7916141837923787e-05,
"loss": 0.7954,
"step": 1350
},
{
"epoch": 0.5384078908040251,
"grad_norm": 0.27467702468985844,
"learning_rate": 1.7890976430039982e-05,
"loss": 0.7982,
"step": 1351
},
{
"epoch": 0.5388064162598386,
"grad_norm": 0.2618705303695261,
"learning_rate": 1.786581439835241e-05,
"loss": 0.8195,
"step": 1352
},
{
"epoch": 0.5392049417156521,
"grad_norm": 0.2714594323337975,
"learning_rate": 1.7840655783141313e-05,
"loss": 0.796,
"step": 1353
},
{
"epoch": 0.5396034671714656,
"grad_norm": 0.28811188495556306,
"learning_rate": 1.7815500624681444e-05,
"loss": 0.7994,
"step": 1354
},
{
"epoch": 0.5400019926272791,
"grad_norm": 0.2720623478220906,
"learning_rate": 1.779034896324204e-05,
"loss": 0.8153,
"step": 1355
},
{
"epoch": 0.5404005180830925,
"grad_norm": 0.26375062989547793,
"learning_rate": 1.7765200839086722e-05,
"loss": 0.8091,
"step": 1356
},
{
"epoch": 0.5407990435389061,
"grad_norm": 0.2692041660964484,
"learning_rate": 1.774005629247346e-05,
"loss": 0.8079,
"step": 1357
},
{
"epoch": 0.5411975689947195,
"grad_norm": 0.26724517612106163,
"learning_rate": 1.77149153636545e-05,
"loss": 0.8255,
"step": 1358
},
{
"epoch": 0.541596094450533,
"grad_norm": 0.2767757640601006,
"learning_rate": 1.7689778092876276e-05,
"loss": 0.7899,
"step": 1359
},
{
"epoch": 0.5419946199063466,
"grad_norm": 0.4018120080677502,
"learning_rate": 1.7664644520379398e-05,
"loss": 0.8113,
"step": 1360
},
{
"epoch": 0.54239314536216,
"grad_norm": 0.31258004159467684,
"learning_rate": 1.7639514686398537e-05,
"loss": 0.8172,
"step": 1361
},
{
"epoch": 0.5427916708179735,
"grad_norm": 0.3470011840822337,
"learning_rate": 1.7614388631162365e-05,
"loss": 0.7933,
"step": 1362
},
{
"epoch": 0.543190196273787,
"grad_norm": 0.3043763377673315,
"learning_rate": 1.758926639489354e-05,
"loss": 0.8135,
"step": 1363
},
{
"epoch": 0.5435887217296005,
"grad_norm": 0.2923964849291302,
"learning_rate": 1.7564148017808578e-05,
"loss": 0.7818,
"step": 1364
},
{
"epoch": 0.5439872471854139,
"grad_norm": 0.3065609901064694,
"learning_rate": 1.753903354011783e-05,
"loss": 0.8423,
"step": 1365
},
{
"epoch": 0.5443857726412275,
"grad_norm": 0.2985623055209066,
"learning_rate": 1.751392300202539e-05,
"loss": 0.8157,
"step": 1366
},
{
"epoch": 0.544784298097041,
"grad_norm": 0.2786406179918027,
"learning_rate": 1.7488816443729066e-05,
"loss": 0.8133,
"step": 1367
},
{
"epoch": 0.5451828235528544,
"grad_norm": 0.30926673491457163,
"learning_rate": 1.746371390542029e-05,
"loss": 0.8133,
"step": 1368
},
{
"epoch": 0.545581349008668,
"grad_norm": 0.2641540209794052,
"learning_rate": 1.743861542728404e-05,
"loss": 0.7962,
"step": 1369
},
{
"epoch": 0.5459798744644814,
"grad_norm": 0.29034836879196485,
"learning_rate": 1.7413521049498823e-05,
"loss": 0.8176,
"step": 1370
},
{
"epoch": 0.5463783999202949,
"grad_norm": 0.2768072644524204,
"learning_rate": 1.7388430812236556e-05,
"loss": 0.7693,
"step": 1371
},
{
"epoch": 0.5467769253761084,
"grad_norm": 0.2769206801693697,
"learning_rate": 1.7363344755662555e-05,
"loss": 0.8047,
"step": 1372
},
{
"epoch": 0.5471754508319219,
"grad_norm": 0.36766327627843176,
"learning_rate": 1.733826291993541e-05,
"loss": 0.8223,
"step": 1373
},
{
"epoch": 0.5475739762877354,
"grad_norm": 0.40226420420015246,
"learning_rate": 1.7313185345206968e-05,
"loss": 0.7996,
"step": 1374
},
{
"epoch": 0.5479725017435488,
"grad_norm": 0.2964909563746245,
"learning_rate": 1.728811207162228e-05,
"loss": 0.809,
"step": 1375
},
{
"epoch": 0.5483710271993624,
"grad_norm": 0.28906435974471956,
"learning_rate": 1.7263043139319476e-05,
"loss": 0.755,
"step": 1376
},
{
"epoch": 0.5487695526551758,
"grad_norm": 0.292890255157397,
"learning_rate": 1.7237978588429753e-05,
"loss": 0.8009,
"step": 1377
},
{
"epoch": 0.5491680781109893,
"grad_norm": 0.2781433781639577,
"learning_rate": 1.721291845907729e-05,
"loss": 0.7944,
"step": 1378
},
{
"epoch": 0.5495666035668029,
"grad_norm": 0.2937009571551766,
"learning_rate": 1.7187862791379198e-05,
"loss": 0.8135,
"step": 1379
},
{
"epoch": 0.5499651290226163,
"grad_norm": 0.2912565732468286,
"learning_rate": 1.7162811625445423e-05,
"loss": 0.8388,
"step": 1380
},
{
"epoch": 0.5503636544784298,
"grad_norm": 0.2748876016189558,
"learning_rate": 1.7137765001378724e-05,
"loss": 0.836,
"step": 1381
},
{
"epoch": 0.5507621799342433,
"grad_norm": 0.3163822046309509,
"learning_rate": 1.711272295927459e-05,
"loss": 0.8288,
"step": 1382
},
{
"epoch": 0.5511607053900568,
"grad_norm": 0.27254752708037466,
"learning_rate": 1.7087685539221162e-05,
"loss": 0.8161,
"step": 1383
},
{
"epoch": 0.5515592308458703,
"grad_norm": 0.3125729789680171,
"learning_rate": 1.70626527812992e-05,
"loss": 0.8181,
"step": 1384
},
{
"epoch": 0.5519577563016838,
"grad_norm": 0.29916353607545526,
"learning_rate": 1.703762472558196e-05,
"loss": 0.776,
"step": 1385
},
{
"epoch": 0.5523562817574973,
"grad_norm": 0.333298444535358,
"learning_rate": 1.7012601412135237e-05,
"loss": 0.8271,
"step": 1386
},
{
"epoch": 0.5527548072133107,
"grad_norm": 0.26574557176935226,
"learning_rate": 1.6987582881017173e-05,
"loss": 0.7903,
"step": 1387
},
{
"epoch": 0.5531533326691243,
"grad_norm": 0.30640181668201066,
"learning_rate": 1.6962569172278283e-05,
"loss": 0.8029,
"step": 1388
},
{
"epoch": 0.5535518581249377,
"grad_norm": 0.27259308701491025,
"learning_rate": 1.6937560325961364e-05,
"loss": 0.8145,
"step": 1389
},
{
"epoch": 0.5539503835807512,
"grad_norm": 0.29936679527497784,
"learning_rate": 1.6912556382101415e-05,
"loss": 0.791,
"step": 1390
},
{
"epoch": 0.5543489090365648,
"grad_norm": 0.2708401911735976,
"learning_rate": 1.6887557380725602e-05,
"loss": 0.8067,
"step": 1391
},
{
"epoch": 0.5547474344923782,
"grad_norm": 0.2744964958311244,
"learning_rate": 1.6862563361853165e-05,
"loss": 0.8082,
"step": 1392
},
{
"epoch": 0.5551459599481917,
"grad_norm": 0.27774556322816,
"learning_rate": 1.6837574365495383e-05,
"loss": 0.8201,
"step": 1393
},
{
"epoch": 0.5555444854040051,
"grad_norm": 0.2860333592628782,
"learning_rate": 1.6812590431655473e-05,
"loss": 0.8132,
"step": 1394
},
{
"epoch": 0.5559430108598187,
"grad_norm": 0.2874026887492097,
"learning_rate": 1.678761160032857e-05,
"loss": 0.8031,
"step": 1395
},
{
"epoch": 0.5563415363156322,
"grad_norm": 0.28106720251341816,
"learning_rate": 1.676263791150164e-05,
"loss": 0.8094,
"step": 1396
},
{
"epoch": 0.5567400617714456,
"grad_norm": 0.29522074096111917,
"learning_rate": 1.6737669405153388e-05,
"loss": 0.794,
"step": 1397
},
{
"epoch": 0.5571385872272592,
"grad_norm": 0.273137049734289,
"learning_rate": 1.6712706121254264e-05,
"loss": 0.7904,
"step": 1398
},
{
"epoch": 0.5575371126830726,
"grad_norm": 0.2938729039193004,
"learning_rate": 1.668774809976632e-05,
"loss": 0.8211,
"step": 1399
},
{
"epoch": 0.5579356381388861,
"grad_norm": 0.27893542802339405,
"learning_rate": 1.6662795380643212e-05,
"loss": 0.7831,
"step": 1400
},
{
"epoch": 0.5583341635946996,
"grad_norm": 0.31771721535476655,
"learning_rate": 1.6637848003830086e-05,
"loss": 0.78,
"step": 1401
},
{
"epoch": 0.5587326890505131,
"grad_norm": 0.27599058172210705,
"learning_rate": 1.6612906009263553e-05,
"loss": 0.7996,
"step": 1402
},
{
"epoch": 0.5591312145063266,
"grad_norm": 0.26309299304248956,
"learning_rate": 1.6587969436871608e-05,
"loss": 0.8273,
"step": 1403
},
{
"epoch": 0.5595297399621401,
"grad_norm": 0.2658663776464135,
"learning_rate": 1.6563038326573544e-05,
"loss": 0.7803,
"step": 1404
},
{
"epoch": 0.5599282654179536,
"grad_norm": 0.27453871016555076,
"learning_rate": 1.6538112718279937e-05,
"loss": 0.8192,
"step": 1405
},
{
"epoch": 0.560326790873767,
"grad_norm": 0.30380713206643706,
"learning_rate": 1.651319265189254e-05,
"loss": 0.7841,
"step": 1406
},
{
"epoch": 0.5607253163295806,
"grad_norm": 0.2745314071899381,
"learning_rate": 1.6488278167304243e-05,
"loss": 0.7966,
"step": 1407
},
{
"epoch": 0.5611238417853941,
"grad_norm": 0.27106784806374307,
"learning_rate": 1.6463369304398976e-05,
"loss": 0.782,
"step": 1408
},
{
"epoch": 0.5615223672412075,
"grad_norm": 0.26824801623885447,
"learning_rate": 1.6438466103051708e-05,
"loss": 0.7975,
"step": 1409
},
{
"epoch": 0.5619208926970211,
"grad_norm": 0.315466445265476,
"learning_rate": 1.641356860312833e-05,
"loss": 0.8375,
"step": 1410
},
{
"epoch": 0.5623194181528345,
"grad_norm": 0.26586433303215745,
"learning_rate": 1.6388676844485583e-05,
"loss": 0.7963,
"step": 1411
},
{
"epoch": 0.562717943608648,
"grad_norm": 0.26384331857538773,
"learning_rate": 1.636379086697105e-05,
"loss": 0.811,
"step": 1412
},
{
"epoch": 0.5631164690644614,
"grad_norm": 0.2743841871460786,
"learning_rate": 1.6338910710423034e-05,
"loss": 0.7687,
"step": 1413
},
{
"epoch": 0.563514994520275,
"grad_norm": 0.2598827208531272,
"learning_rate": 1.6314036414670544e-05,
"loss": 0.7926,
"step": 1414
},
{
"epoch": 0.5639135199760885,
"grad_norm": 0.2631333168836199,
"learning_rate": 1.6289168019533182e-05,
"loss": 0.8233,
"step": 1415
},
{
"epoch": 0.5643120454319019,
"grad_norm": 0.274009439927925,
"learning_rate": 1.626430556482112e-05,
"loss": 0.8093,
"step": 1416
},
{
"epoch": 0.5647105708877155,
"grad_norm": 0.2815241084799363,
"learning_rate": 1.623944909033502e-05,
"loss": 0.8386,
"step": 1417
},
{
"epoch": 0.5651090963435289,
"grad_norm": 0.2693426340478129,
"learning_rate": 1.621459863586596e-05,
"loss": 0.7934,
"step": 1418
},
{
"epoch": 0.5655076217993424,
"grad_norm": 0.28640728418548206,
"learning_rate": 1.61897542411954e-05,
"loss": 0.7605,
"step": 1419
},
{
"epoch": 0.565906147255156,
"grad_norm": 0.28566808429395685,
"learning_rate": 1.6164915946095063e-05,
"loss": 0.7836,
"step": 1420
},
{
"epoch": 0.5663046727109694,
"grad_norm": 0.2703972532532415,
"learning_rate": 1.6140083790326963e-05,
"loss": 0.8089,
"step": 1421
},
{
"epoch": 0.5667031981667829,
"grad_norm": 0.2792579130299739,
"learning_rate": 1.6115257813643227e-05,
"loss": 0.8133,
"step": 1422
},
{
"epoch": 0.5671017236225964,
"grad_norm": 0.2729454606681309,
"learning_rate": 1.6090438055786123e-05,
"loss": 0.8097,
"step": 1423
},
{
"epoch": 0.5675002490784099,
"grad_norm": 0.2915157005944316,
"learning_rate": 1.606562455648798e-05,
"loss": 0.8078,
"step": 1424
},
{
"epoch": 0.5678987745342233,
"grad_norm": 0.29032778472704807,
"learning_rate": 1.6040817355471065e-05,
"loss": 0.7931,
"step": 1425
},
{
"epoch": 0.5682972999900369,
"grad_norm": 0.2636401468661431,
"learning_rate": 1.601601649244759e-05,
"loss": 0.8162,
"step": 1426
},
{
"epoch": 0.5686958254458504,
"grad_norm": 0.288342129461046,
"learning_rate": 1.5991222007119614e-05,
"loss": 0.831,
"step": 1427
},
{
"epoch": 0.5690943509016638,
"grad_norm": 0.25892278113322154,
"learning_rate": 1.5966433939178992e-05,
"loss": 0.7956,
"step": 1428
},
{
"epoch": 0.5694928763574774,
"grad_norm": 0.30072057342912867,
"learning_rate": 1.5941652328307296e-05,
"loss": 0.777,
"step": 1429
},
{
"epoch": 0.5698914018132908,
"grad_norm": 0.26806489233741043,
"learning_rate": 1.5916877214175768e-05,
"loss": 0.8291,
"step": 1430
},
{
"epoch": 0.5702899272691043,
"grad_norm": 0.2905448743699399,
"learning_rate": 1.589210863644525e-05,
"loss": 0.8472,
"step": 1431
},
{
"epoch": 0.5706884527249177,
"grad_norm": 0.2982764650867147,
"learning_rate": 1.586734663476612e-05,
"loss": 0.8144,
"step": 1432
},
{
"epoch": 0.5710869781807313,
"grad_norm": 0.2872873533319639,
"learning_rate": 1.584259124877823e-05,
"loss": 0.8113,
"step": 1433
},
{
"epoch": 0.5714855036365448,
"grad_norm": 0.29449735325312454,
"learning_rate": 1.5817842518110827e-05,
"loss": 0.8214,
"step": 1434
},
{
"epoch": 0.5718840290923582,
"grad_norm": 0.39051963343272733,
"learning_rate": 1.5793100482382525e-05,
"loss": 0.7799,
"step": 1435
},
{
"epoch": 0.5722825545481718,
"grad_norm": 0.2616459809836497,
"learning_rate": 1.5768365181201205e-05,
"loss": 0.7777,
"step": 1436
},
{
"epoch": 0.5726810800039852,
"grad_norm": 0.28842653622157877,
"learning_rate": 1.574363665416398e-05,
"loss": 0.7962,
"step": 1437
},
{
"epoch": 0.5730796054597987,
"grad_norm": 0.2641950748942506,
"learning_rate": 1.5718914940857114e-05,
"loss": 0.7991,
"step": 1438
},
{
"epoch": 0.5734781309156123,
"grad_norm": 0.27488209941925706,
"learning_rate": 1.5694200080855952e-05,
"loss": 0.7883,
"step": 1439
},
{
"epoch": 0.5738766563714257,
"grad_norm": 0.26045131988579345,
"learning_rate": 1.5669492113724888e-05,
"loss": 0.7938,
"step": 1440
},
{
"epoch": 0.5742751818272392,
"grad_norm": 0.2974260811653572,
"learning_rate": 1.5644791079017263e-05,
"loss": 0.8168,
"step": 1441
},
{
"epoch": 0.5746737072830527,
"grad_norm": 0.28973731321680374,
"learning_rate": 1.562009701627533e-05,
"loss": 0.7946,
"step": 1442
},
{
"epoch": 0.5750722327388662,
"grad_norm": 0.28100822605068104,
"learning_rate": 1.5595409965030188e-05,
"loss": 0.8041,
"step": 1443
},
{
"epoch": 0.5754707581946796,
"grad_norm": 0.2836905042084171,
"learning_rate": 1.557072996480169e-05,
"loss": 0.7906,
"step": 1444
},
{
"epoch": 0.5758692836504932,
"grad_norm": 0.265117167660616,
"learning_rate": 1.554605705509843e-05,
"loss": 0.8415,
"step": 1445
},
{
"epoch": 0.5762678091063067,
"grad_norm": 0.26306772688466995,
"learning_rate": 1.5521391275417613e-05,
"loss": 0.8292,
"step": 1446
},
{
"epoch": 0.5766663345621201,
"grad_norm": 0.2710950213877723,
"learning_rate": 1.5496732665245085e-05,
"loss": 0.8231,
"step": 1447
},
{
"epoch": 0.5770648600179337,
"grad_norm": 0.2788906456071625,
"learning_rate": 1.5472081264055154e-05,
"loss": 0.8116,
"step": 1448
},
{
"epoch": 0.5774633854737471,
"grad_norm": 0.27310715767259724,
"learning_rate": 1.5447437111310624e-05,
"loss": 0.8271,
"step": 1449
},
{
"epoch": 0.5778619109295606,
"grad_norm": 0.2785035809739301,
"learning_rate": 1.5422800246462706e-05,
"loss": 0.7981,
"step": 1450
},
{
"epoch": 0.5782604363853742,
"grad_norm": 0.27219975804237134,
"learning_rate": 1.5398170708950902e-05,
"loss": 0.7965,
"step": 1451
},
{
"epoch": 0.5786589618411876,
"grad_norm": 0.27506447504088605,
"learning_rate": 1.5373548538203026e-05,
"loss": 0.8201,
"step": 1452
},
{
"epoch": 0.5790574872970011,
"grad_norm": 0.2946170401264071,
"learning_rate": 1.5348933773635067e-05,
"loss": 0.8128,
"step": 1453
},
{
"epoch": 0.5794560127528146,
"grad_norm": 0.3826815086737385,
"learning_rate": 1.532432645465118e-05,
"loss": 0.8173,
"step": 1454
},
{
"epoch": 0.5798545382086281,
"grad_norm": 0.2924952233528226,
"learning_rate": 1.5299726620643595e-05,
"loss": 0.7775,
"step": 1455
},
{
"epoch": 0.5802530636644415,
"grad_norm": 0.2642260239005724,
"learning_rate": 1.5275134310992553e-05,
"loss": 0.8191,
"step": 1456
},
{
"epoch": 0.580651589120255,
"grad_norm": 0.3149422419473645,
"learning_rate": 1.5250549565066262e-05,
"loss": 0.7974,
"step": 1457
},
{
"epoch": 0.5810501145760686,
"grad_norm": 0.27490534215380524,
"learning_rate": 1.5225972422220804e-05,
"loss": 0.804,
"step": 1458
},
{
"epoch": 0.581448640031882,
"grad_norm": 0.2755621466065312,
"learning_rate": 1.5201402921800114e-05,
"loss": 0.8127,
"step": 1459
},
{
"epoch": 0.5818471654876956,
"grad_norm": 0.3037669691142441,
"learning_rate": 1.5176841103135867e-05,
"loss": 0.7912,
"step": 1460
},
{
"epoch": 0.582245690943509,
"grad_norm": 0.25177796617384035,
"learning_rate": 1.5152287005547458e-05,
"loss": 0.8329,
"step": 1461
},
{
"epoch": 0.5826442163993225,
"grad_norm": 0.27341817612876335,
"learning_rate": 1.512774066834191e-05,
"loss": 0.7794,
"step": 1462
},
{
"epoch": 0.583042741855136,
"grad_norm": 0.2406762714221454,
"learning_rate": 1.5103202130813839e-05,
"loss": 0.7918,
"step": 1463
},
{
"epoch": 0.5834412673109495,
"grad_norm": 0.28482104897292554,
"learning_rate": 1.5078671432245362e-05,
"loss": 0.7675,
"step": 1464
},
{
"epoch": 0.583839792766763,
"grad_norm": 0.25741699835096044,
"learning_rate": 1.5054148611906047e-05,
"loss": 0.7924,
"step": 1465
},
{
"epoch": 0.5842383182225764,
"grad_norm": 0.2920808223289217,
"learning_rate": 1.5029633709052864e-05,
"loss": 0.8141,
"step": 1466
},
{
"epoch": 0.58463684367839,
"grad_norm": 0.2807331727085593,
"learning_rate": 1.5005126762930085e-05,
"loss": 0.7992,
"step": 1467
},
{
"epoch": 0.5850353691342034,
"grad_norm": 0.2785002462676359,
"learning_rate": 1.4980627812769273e-05,
"loss": 0.8283,
"step": 1468
},
{
"epoch": 0.5854338945900169,
"grad_norm": 0.48934793357042067,
"learning_rate": 1.4956136897789155e-05,
"loss": 0.8011,
"step": 1469
},
{
"epoch": 0.5858324200458305,
"grad_norm": 0.2541832978215571,
"learning_rate": 1.4931654057195633e-05,
"loss": 0.7957,
"step": 1470
},
{
"epoch": 0.5862309455016439,
"grad_norm": 0.28333216989436416,
"learning_rate": 1.4907179330181667e-05,
"loss": 0.7933,
"step": 1471
},
{
"epoch": 0.5866294709574574,
"grad_norm": 0.25893200895383417,
"learning_rate": 1.4882712755927208e-05,
"loss": 0.8324,
"step": 1472
},
{
"epoch": 0.5870279964132709,
"grad_norm": 0.2964360831302451,
"learning_rate": 1.4858254373599206e-05,
"loss": 0.8116,
"step": 1473
},
{
"epoch": 0.5874265218690844,
"grad_norm": 0.2520201190243798,
"learning_rate": 1.4833804222351437e-05,
"loss": 0.7728,
"step": 1474
},
{
"epoch": 0.5878250473248979,
"grad_norm": 0.28965585570658003,
"learning_rate": 1.4809362341324549e-05,
"loss": 0.8301,
"step": 1475
},
{
"epoch": 0.5882235727807114,
"grad_norm": 0.2680016094991912,
"learning_rate": 1.478492876964592e-05,
"loss": 0.8104,
"step": 1476
},
{
"epoch": 0.5886220982365249,
"grad_norm": 0.29138008709625307,
"learning_rate": 1.4760503546429642e-05,
"loss": 0.7939,
"step": 1477
},
{
"epoch": 0.5890206236923383,
"grad_norm": 0.27301356294256424,
"learning_rate": 1.473608671077644e-05,
"loss": 0.8017,
"step": 1478
},
{
"epoch": 0.5894191491481519,
"grad_norm": 0.27632908308241927,
"learning_rate": 1.4711678301773607e-05,
"loss": 0.7876,
"step": 1479
},
{
"epoch": 0.5898176746039653,
"grad_norm": 0.29739284619714174,
"learning_rate": 1.4687278358494954e-05,
"loss": 0.8396,
"step": 1480
},
{
"epoch": 0.5902162000597788,
"grad_norm": 0.26373275038816285,
"learning_rate": 1.4662886920000727e-05,
"loss": 0.7893,
"step": 1481
},
{
"epoch": 0.5906147255155924,
"grad_norm": 0.28819618380315065,
"learning_rate": 1.463850402533758e-05,
"loss": 0.8096,
"step": 1482
},
{
"epoch": 0.5910132509714058,
"grad_norm": 0.26086188725806075,
"learning_rate": 1.4614129713538456e-05,
"loss": 0.8272,
"step": 1483
},
{
"epoch": 0.5914117764272193,
"grad_norm": 0.2998087493750338,
"learning_rate": 1.4589764023622585e-05,
"loss": 0.811,
"step": 1484
},
{
"epoch": 0.5918103018830327,
"grad_norm": 0.28423477916709305,
"learning_rate": 1.4565406994595402e-05,
"loss": 0.8314,
"step": 1485
},
{
"epoch": 0.5922088273388463,
"grad_norm": 0.4714680189752818,
"learning_rate": 1.4541058665448437e-05,
"loss": 0.8132,
"step": 1486
},
{
"epoch": 0.5926073527946598,
"grad_norm": 0.2832956819184063,
"learning_rate": 1.4516719075159342e-05,
"loss": 0.8201,
"step": 1487
},
{
"epoch": 0.5930058782504732,
"grad_norm": 0.280931582487737,
"learning_rate": 1.4492388262691737e-05,
"loss": 0.8104,
"step": 1488
},
{
"epoch": 0.5934044037062868,
"grad_norm": 0.4352514915841819,
"learning_rate": 1.4468066266995222e-05,
"loss": 0.7969,
"step": 1489
},
{
"epoch": 0.5938029291621002,
"grad_norm": 0.28686409934998564,
"learning_rate": 1.4443753127005264e-05,
"loss": 0.7842,
"step": 1490
},
{
"epoch": 0.5942014546179137,
"grad_norm": 0.2569294965760903,
"learning_rate": 1.4419448881643158e-05,
"loss": 0.8154,
"step": 1491
},
{
"epoch": 0.5945999800737272,
"grad_norm": 0.28382287666623324,
"learning_rate": 1.4395153569815974e-05,
"loss": 0.8105,
"step": 1492
},
{
"epoch": 0.5949985055295407,
"grad_norm": 0.2572203424982894,
"learning_rate": 1.4370867230416451e-05,
"loss": 0.7826,
"step": 1493
},
{
"epoch": 0.5953970309853542,
"grad_norm": 6.465506917099715,
"learning_rate": 1.4346589902323003e-05,
"loss": 0.783,
"step": 1494
},
{
"epoch": 0.5957955564411677,
"grad_norm": 0.39706235846696825,
"learning_rate": 1.432232162439957e-05,
"loss": 0.8166,
"step": 1495
},
{
"epoch": 0.5961940818969812,
"grad_norm": 0.26404445452409736,
"learning_rate": 1.4298062435495661e-05,
"loss": 0.7826,
"step": 1496
},
{
"epoch": 0.5965926073527946,
"grad_norm": 0.3308104505575439,
"learning_rate": 1.4273812374446183e-05,
"loss": 0.795,
"step": 1497
},
{
"epoch": 0.5969911328086082,
"grad_norm": 0.3026458263801191,
"learning_rate": 1.4249571480071467e-05,
"loss": 0.7715,
"step": 1498
},
{
"epoch": 0.5973896582644216,
"grad_norm": 0.28588534412959155,
"learning_rate": 1.4225339791177151e-05,
"loss": 0.7987,
"step": 1499
},
{
"epoch": 0.5977881837202351,
"grad_norm": 0.32101230875160675,
"learning_rate": 1.4201117346554144e-05,
"loss": 0.8046,
"step": 1500
},
{
"epoch": 0.5981867091760487,
"grad_norm": 0.290897264466864,
"learning_rate": 1.4176904184978552e-05,
"loss": 0.8004,
"step": 1501
},
{
"epoch": 0.5985852346318621,
"grad_norm": 0.3026009841483658,
"learning_rate": 1.4152700345211626e-05,
"loss": 0.8065,
"step": 1502
},
{
"epoch": 0.5989837600876756,
"grad_norm": 0.44263950851966477,
"learning_rate": 1.412850586599969e-05,
"loss": 0.8096,
"step": 1503
},
{
"epoch": 0.599382285543489,
"grad_norm": 0.3248532926102643,
"learning_rate": 1.4104320786074078e-05,
"loss": 0.8377,
"step": 1504
},
{
"epoch": 0.5997808109993026,
"grad_norm": 0.28575595840318735,
"learning_rate": 1.408014514415109e-05,
"loss": 0.78,
"step": 1505
},
{
"epoch": 0.6001793364551161,
"grad_norm": 0.2794084216593132,
"learning_rate": 1.4055978978931919e-05,
"loss": 0.784,
"step": 1506
},
{
"epoch": 0.6005778619109295,
"grad_norm": 0.2796315632479643,
"learning_rate": 1.4031822329102558e-05,
"loss": 0.7991,
"step": 1507
},
{
"epoch": 0.6009763873667431,
"grad_norm": 0.29082183486321656,
"learning_rate": 1.4007675233333812e-05,
"loss": 0.7593,
"step": 1508
},
{
"epoch": 0.6013749128225565,
"grad_norm": 0.27442890679937104,
"learning_rate": 1.3983537730281153e-05,
"loss": 0.82,
"step": 1509
},
{
"epoch": 0.60177343827837,
"grad_norm": 0.28240777195387234,
"learning_rate": 1.3959409858584718e-05,
"loss": 0.7895,
"step": 1510
},
{
"epoch": 0.6021719637341835,
"grad_norm": 0.28640189626735446,
"learning_rate": 1.3935291656869216e-05,
"loss": 0.8065,
"step": 1511
},
{
"epoch": 0.602570489189997,
"grad_norm": 0.27042843088562313,
"learning_rate": 1.3911183163743883e-05,
"loss": 0.7875,
"step": 1512
},
{
"epoch": 0.6029690146458105,
"grad_norm": 0.3230930753709,
"learning_rate": 1.3887084417802412e-05,
"loss": 0.7854,
"step": 1513
},
{
"epoch": 0.603367540101624,
"grad_norm": 0.26957683695591095,
"learning_rate": 1.3862995457622883e-05,
"loss": 0.8231,
"step": 1514
},
{
"epoch": 0.6037660655574375,
"grad_norm": 0.2814390906832594,
"learning_rate": 1.3838916321767726e-05,
"loss": 0.8048,
"step": 1515
},
{
"epoch": 0.6041645910132509,
"grad_norm": 0.2654808310179734,
"learning_rate": 1.381484704878363e-05,
"loss": 0.8074,
"step": 1516
},
{
"epoch": 0.6045631164690645,
"grad_norm": 0.26170541781453055,
"learning_rate": 1.379078767720151e-05,
"loss": 0.7921,
"step": 1517
},
{
"epoch": 0.604961641924878,
"grad_norm": 0.26340697807382485,
"learning_rate": 1.3766738245536403e-05,
"loss": 0.7894,
"step": 1518
},
{
"epoch": 0.6053601673806914,
"grad_norm": 0.34917912033176396,
"learning_rate": 1.3742698792287467e-05,
"loss": 0.7979,
"step": 1519
},
{
"epoch": 0.605758692836505,
"grad_norm": 0.2698143223745579,
"learning_rate": 1.371866935593788e-05,
"loss": 0.7705,
"step": 1520
},
{
"epoch": 0.6061572182923184,
"grad_norm": 0.25293807015990133,
"learning_rate": 1.369464997495475e-05,
"loss": 0.7881,
"step": 1521
},
{
"epoch": 0.6065557437481319,
"grad_norm": 0.2713270396836266,
"learning_rate": 1.3670640687789139e-05,
"loss": 0.7931,
"step": 1522
},
{
"epoch": 0.6069542692039454,
"grad_norm": 0.27034783836116744,
"learning_rate": 1.3646641532875911e-05,
"loss": 0.7961,
"step": 1523
},
{
"epoch": 0.6073527946597589,
"grad_norm": 0.27490482613460554,
"learning_rate": 1.362265254863373e-05,
"loss": 0.8147,
"step": 1524
},
{
"epoch": 0.6077513201155724,
"grad_norm": 0.2676216739525722,
"learning_rate": 1.3598673773464972e-05,
"loss": 0.7853,
"step": 1525
},
{
"epoch": 0.6081498455713858,
"grad_norm": 1.272473309536001,
"learning_rate": 1.3574705245755669e-05,
"loss": 0.8089,
"step": 1526
},
{
"epoch": 0.6085483710271994,
"grad_norm": 0.6571572306931123,
"learning_rate": 1.3550747003875458e-05,
"loss": 0.8261,
"step": 1527
},
{
"epoch": 0.6089468964830128,
"grad_norm": 0.2596088192309901,
"learning_rate": 1.3526799086177494e-05,
"loss": 0.8193,
"step": 1528
},
{
"epoch": 0.6093454219388263,
"grad_norm": 0.26444135616895786,
"learning_rate": 1.350286153099842e-05,
"loss": 0.7892,
"step": 1529
},
{
"epoch": 0.6097439473946399,
"grad_norm": 0.25937034971149103,
"learning_rate": 1.3478934376658273e-05,
"loss": 0.8026,
"step": 1530
},
{
"epoch": 0.6101424728504533,
"grad_norm": 0.7431115276392141,
"learning_rate": 1.3455017661460464e-05,
"loss": 0.7932,
"step": 1531
},
{
"epoch": 0.6105409983062668,
"grad_norm": 0.27995622010695853,
"learning_rate": 1.3431111423691677e-05,
"loss": 0.7833,
"step": 1532
},
{
"epoch": 0.6109395237620803,
"grad_norm": 0.2464262348021282,
"learning_rate": 1.3407215701621812e-05,
"loss": 0.796,
"step": 1533
},
{
"epoch": 0.6113380492178938,
"grad_norm": 0.26689725888573773,
"learning_rate": 1.3383330533503971e-05,
"loss": 0.7984,
"step": 1534
},
{
"epoch": 0.6117365746737072,
"grad_norm": 0.281243345103868,
"learning_rate": 1.335945595757432e-05,
"loss": 0.8119,
"step": 1535
},
{
"epoch": 0.6121351001295208,
"grad_norm": 0.2807299105795548,
"learning_rate": 1.3335592012052096e-05,
"loss": 0.8208,
"step": 1536
},
{
"epoch": 0.6125336255853343,
"grad_norm": 0.2822355271519365,
"learning_rate": 1.3311738735139502e-05,
"loss": 0.7958,
"step": 1537
},
{
"epoch": 0.6129321510411477,
"grad_norm": 0.2570136422498892,
"learning_rate": 1.328789616502168e-05,
"loss": 0.7798,
"step": 1538
},
{
"epoch": 0.6133306764969613,
"grad_norm": 0.2602381753045998,
"learning_rate": 1.3264064339866622e-05,
"loss": 0.7952,
"step": 1539
},
{
"epoch": 0.6137292019527747,
"grad_norm": 0.27124645437474926,
"learning_rate": 1.3240243297825112e-05,
"loss": 0.8447,
"step": 1540
},
{
"epoch": 0.6141277274085882,
"grad_norm": 0.2614506972170479,
"learning_rate": 1.3216433077030689e-05,
"loss": 0.8067,
"step": 1541
},
{
"epoch": 0.6145262528644018,
"grad_norm": 0.273112140897487,
"learning_rate": 1.3192633715599548e-05,
"loss": 0.8041,
"step": 1542
},
{
"epoch": 0.6149247783202152,
"grad_norm": 0.24587524256890503,
"learning_rate": 1.3168845251630527e-05,
"loss": 0.7969,
"step": 1543
},
{
"epoch": 0.6153233037760287,
"grad_norm": 0.2931074811806814,
"learning_rate": 1.3145067723204979e-05,
"loss": 0.7919,
"step": 1544
},
{
"epoch": 0.6157218292318422,
"grad_norm": 0.23408431837644428,
"learning_rate": 1.3121301168386796e-05,
"loss": 0.7974,
"step": 1545
},
{
"epoch": 0.6161203546876557,
"grad_norm": 0.2885214636424266,
"learning_rate": 1.3097545625222284e-05,
"loss": 0.8183,
"step": 1546
},
{
"epoch": 0.6165188801434691,
"grad_norm": 0.2565866864664869,
"learning_rate": 1.3073801131740104e-05,
"loss": 0.8187,
"step": 1547
},
{
"epoch": 0.6169174055992827,
"grad_norm": 0.3070425063241222,
"learning_rate": 1.3050067725951258e-05,
"loss": 0.8084,
"step": 1548
},
{
"epoch": 0.6173159310550962,
"grad_norm": 0.3551888980070755,
"learning_rate": 1.3026345445848976e-05,
"loss": 0.7969,
"step": 1549
},
{
"epoch": 0.6177144565109096,
"grad_norm": 0.3309087361846915,
"learning_rate": 1.3002634329408692e-05,
"loss": 0.7573,
"step": 1550
},
{
"epoch": 0.6181129819667232,
"grad_norm": 0.2685150964208705,
"learning_rate": 1.2978934414587955e-05,
"loss": 0.8077,
"step": 1551
},
{
"epoch": 0.6185115074225366,
"grad_norm": 0.28733052685665156,
"learning_rate": 1.2955245739326397e-05,
"loss": 0.807,
"step": 1552
},
{
"epoch": 0.6189100328783501,
"grad_norm": 0.25727837605034215,
"learning_rate": 1.2931568341545649e-05,
"loss": 0.8055,
"step": 1553
},
{
"epoch": 0.6193085583341637,
"grad_norm": 0.28129842927276943,
"learning_rate": 1.2907902259149287e-05,
"loss": 0.8003,
"step": 1554
},
{
"epoch": 0.6197070837899771,
"grad_norm": 0.2650304078824774,
"learning_rate": 1.2884247530022786e-05,
"loss": 0.7906,
"step": 1555
},
{
"epoch": 0.6201056092457906,
"grad_norm": 0.36649121713601185,
"learning_rate": 1.2860604192033414e-05,
"loss": 0.7765,
"step": 1556
},
{
"epoch": 0.620504134701604,
"grad_norm": 0.25867200718505207,
"learning_rate": 1.2836972283030256e-05,
"loss": 0.8186,
"step": 1557
},
{
"epoch": 0.6209026601574176,
"grad_norm": 0.2720817068824379,
"learning_rate": 1.2813351840844046e-05,
"loss": 0.7753,
"step": 1558
},
{
"epoch": 0.621301185613231,
"grad_norm": 0.27600718946732516,
"learning_rate": 1.2789742903287187e-05,
"loss": 0.8002,
"step": 1559
},
{
"epoch": 0.6216997110690445,
"grad_norm": 0.26210695686216645,
"learning_rate": 1.2766145508153689e-05,
"loss": 0.7726,
"step": 1560
},
{
"epoch": 0.6220982365248581,
"grad_norm": 0.27148047901992983,
"learning_rate": 1.2742559693219035e-05,
"loss": 0.8221,
"step": 1561
},
{
"epoch": 0.6224967619806715,
"grad_norm": 0.2506440715577259,
"learning_rate": 1.2718985496240209e-05,
"loss": 0.8161,
"step": 1562
},
{
"epoch": 0.622895287436485,
"grad_norm": 0.2562550466452998,
"learning_rate": 1.2695422954955569e-05,
"loss": 0.812,
"step": 1563
},
{
"epoch": 0.6232938128922985,
"grad_norm": 0.273331861541004,
"learning_rate": 1.2671872107084844e-05,
"loss": 0.7746,
"step": 1564
},
{
"epoch": 0.623692338348112,
"grad_norm": 0.24027870818880687,
"learning_rate": 1.2648332990329016e-05,
"loss": 0.783,
"step": 1565
},
{
"epoch": 0.6240908638039254,
"grad_norm": 0.2751061681477381,
"learning_rate": 1.2624805642370302e-05,
"loss": 0.8006,
"step": 1566
},
{
"epoch": 0.624489389259739,
"grad_norm": 0.2603821217505175,
"learning_rate": 1.2601290100872081e-05,
"loss": 0.8093,
"step": 1567
},
{
"epoch": 0.6248879147155525,
"grad_norm": 0.3093537763083936,
"learning_rate": 1.2577786403478815e-05,
"loss": 0.8071,
"step": 1568
},
{
"epoch": 0.6252864401713659,
"grad_norm": 0.25834846435694175,
"learning_rate": 1.2554294587816039e-05,
"loss": 0.8046,
"step": 1569
},
{
"epoch": 0.6256849656271795,
"grad_norm": 0.2614225968860621,
"learning_rate": 1.253081469149022e-05,
"loss": 0.809,
"step": 1570
},
{
"epoch": 0.6260834910829929,
"grad_norm": 0.2641571048713672,
"learning_rate": 1.2507346752088788e-05,
"loss": 0.8151,
"step": 1571
},
{
"epoch": 0.6264820165388064,
"grad_norm": 0.2570556300174585,
"learning_rate": 1.2483890807180003e-05,
"loss": 0.7807,
"step": 1572
},
{
"epoch": 0.62688054199462,
"grad_norm": 0.25821601421943596,
"learning_rate": 1.2460446894312938e-05,
"loss": 0.8099,
"step": 1573
},
{
"epoch": 0.6272790674504334,
"grad_norm": 0.2631395054682711,
"learning_rate": 1.243701505101741e-05,
"loss": 0.8161,
"step": 1574
},
{
"epoch": 0.6276775929062469,
"grad_norm": 0.24766766238334142,
"learning_rate": 1.2413595314803892e-05,
"loss": 0.7707,
"step": 1575
},
{
"epoch": 0.6280761183620603,
"grad_norm": 0.24707466931883929,
"learning_rate": 1.2390187723163503e-05,
"loss": 0.804,
"step": 1576
},
{
"epoch": 0.6284746438178739,
"grad_norm": 0.2621376069815184,
"learning_rate": 1.2366792313567895e-05,
"loss": 0.8055,
"step": 1577
},
{
"epoch": 0.6288731692736873,
"grad_norm": 0.2455537279746612,
"learning_rate": 1.2343409123469244e-05,
"loss": 0.8099,
"step": 1578
},
{
"epoch": 0.6292716947295008,
"grad_norm": 0.27105059580537544,
"learning_rate": 1.232003819030013e-05,
"loss": 0.7965,
"step": 1579
},
{
"epoch": 0.6296702201853144,
"grad_norm": 0.24578937265717318,
"learning_rate": 1.2296679551473551e-05,
"loss": 0.7871,
"step": 1580
},
{
"epoch": 0.6300687456411278,
"grad_norm": 0.24084765272449513,
"learning_rate": 1.227333324438281e-05,
"loss": 0.7965,
"step": 1581
},
{
"epoch": 0.6304672710969413,
"grad_norm": 0.23922572705746703,
"learning_rate": 1.2249999306401445e-05,
"loss": 0.7936,
"step": 1582
},
{
"epoch": 0.6308657965527548,
"grad_norm": 0.269202817136775,
"learning_rate": 1.2226677774883236e-05,
"loss": 0.8134,
"step": 1583
},
{
"epoch": 0.6312643220085683,
"grad_norm": 0.24194081424246755,
"learning_rate": 1.2203368687162058e-05,
"loss": 0.8036,
"step": 1584
},
{
"epoch": 0.6316628474643818,
"grad_norm": 0.2606593476377602,
"learning_rate": 1.2180072080551899e-05,
"loss": 0.8057,
"step": 1585
},
{
"epoch": 0.6320613729201953,
"grad_norm": 0.25284920681339745,
"learning_rate": 1.215678799234675e-05,
"loss": 0.7793,
"step": 1586
},
{
"epoch": 0.6324598983760088,
"grad_norm": 0.26507641296686857,
"learning_rate": 1.2133516459820565e-05,
"loss": 0.7942,
"step": 1587
},
{
"epoch": 0.6328584238318222,
"grad_norm": 0.25208081960776024,
"learning_rate": 1.2110257520227208e-05,
"loss": 0.8054,
"step": 1588
},
{
"epoch": 0.6332569492876358,
"grad_norm": 0.27064673184332666,
"learning_rate": 1.2087011210800368e-05,
"loss": 0.8022,
"step": 1589
},
{
"epoch": 0.6336554747434492,
"grad_norm": 0.2586090399717606,
"learning_rate": 1.206377756875353e-05,
"loss": 0.7962,
"step": 1590
},
{
"epoch": 0.6340540001992627,
"grad_norm": 0.2758486757724476,
"learning_rate": 1.2040556631279885e-05,
"loss": 0.8141,
"step": 1591
},
{
"epoch": 0.6344525256550763,
"grad_norm": 0.25007000963272646,
"learning_rate": 1.2017348435552308e-05,
"loss": 0.7876,
"step": 1592
},
{
"epoch": 0.6348510511108897,
"grad_norm": 0.28045825131568236,
"learning_rate": 1.1994153018723247e-05,
"loss": 0.7782,
"step": 1593
},
{
"epoch": 0.6352495765667032,
"grad_norm": 0.2559398025371776,
"learning_rate": 1.1970970417924715e-05,
"loss": 0.8016,
"step": 1594
},
{
"epoch": 0.6356481020225166,
"grad_norm": 0.2910472724027498,
"learning_rate": 1.1947800670268218e-05,
"loss": 0.8057,
"step": 1595
},
{
"epoch": 0.6360466274783302,
"grad_norm": 0.26090925545251104,
"learning_rate": 1.1924643812844648e-05,
"loss": 0.8074,
"step": 1596
},
{
"epoch": 0.6364451529341437,
"grad_norm": 0.26077758902957177,
"learning_rate": 1.1901499882724302e-05,
"loss": 0.8125,
"step": 1597
},
{
"epoch": 0.6368436783899571,
"grad_norm": 0.27193415193529746,
"learning_rate": 1.1878368916956758e-05,
"loss": 0.8205,
"step": 1598
},
{
"epoch": 0.6372422038457707,
"grad_norm": 0.24868413662213312,
"learning_rate": 1.1855250952570852e-05,
"loss": 0.8046,
"step": 1599
},
{
"epoch": 0.6376407293015841,
"grad_norm": 0.25516205225914074,
"learning_rate": 1.1832146026574597e-05,
"loss": 0.7823,
"step": 1600
},
{
"epoch": 0.6380392547573976,
"grad_norm": 0.2444397059280007,
"learning_rate": 1.1809054175955148e-05,
"loss": 0.8074,
"step": 1601
},
{
"epoch": 0.6384377802132111,
"grad_norm": 0.2406561292975351,
"learning_rate": 1.1785975437678716e-05,
"loss": 0.7995,
"step": 1602
},
{
"epoch": 0.6388363056690246,
"grad_norm": 0.25213243022945864,
"learning_rate": 1.1762909848690525e-05,
"loss": 0.794,
"step": 1603
},
{
"epoch": 0.6392348311248381,
"grad_norm": 0.250582196145571,
"learning_rate": 1.1739857445914757e-05,
"loss": 0.8081,
"step": 1604
},
{
"epoch": 0.6396333565806516,
"grad_norm": 0.24639126507572728,
"learning_rate": 1.1716818266254462e-05,
"loss": 0.8223,
"step": 1605
},
{
"epoch": 0.6400318820364651,
"grad_norm": 0.2341044085916874,
"learning_rate": 1.169379234659156e-05,
"loss": 0.8122,
"step": 1606
},
{
"epoch": 0.6404304074922785,
"grad_norm": 0.34128549774390465,
"learning_rate": 1.1670779723786697e-05,
"loss": 0.8032,
"step": 1607
},
{
"epoch": 0.6408289329480921,
"grad_norm": 0.33588417532052334,
"learning_rate": 1.1647780434679273e-05,
"loss": 0.7921,
"step": 1608
},
{
"epoch": 0.6412274584039056,
"grad_norm": 0.25140600726539664,
"learning_rate": 1.1624794516087322e-05,
"loss": 0.7937,
"step": 1609
},
{
"epoch": 0.641625983859719,
"grad_norm": 0.23449581497433394,
"learning_rate": 1.160182200480748e-05,
"loss": 0.7835,
"step": 1610
},
{
"epoch": 0.6420245093155326,
"grad_norm": 0.24952525378723442,
"learning_rate": 1.1578862937614935e-05,
"loss": 0.7802,
"step": 1611
},
{
"epoch": 0.642423034771346,
"grad_norm": 0.24961214587481048,
"learning_rate": 1.1555917351263313e-05,
"loss": 0.7823,
"step": 1612
},
{
"epoch": 0.6428215602271595,
"grad_norm": 0.243896806000912,
"learning_rate": 1.1532985282484694e-05,
"loss": 0.7699,
"step": 1613
},
{
"epoch": 0.643220085682973,
"grad_norm": 0.2704485294167498,
"learning_rate": 1.1510066767989522e-05,
"loss": 0.7942,
"step": 1614
},
{
"epoch": 0.6436186111387865,
"grad_norm": 0.24876368726137116,
"learning_rate": 1.1487161844466513e-05,
"loss": 0.8,
"step": 1615
},
{
"epoch": 0.6440171365946,
"grad_norm": 0.24032636669948387,
"learning_rate": 1.1464270548582648e-05,
"loss": 0.7968,
"step": 1616
},
{
"epoch": 0.6444156620504135,
"grad_norm": 0.26676359276330697,
"learning_rate": 1.1441392916983088e-05,
"loss": 0.8146,
"step": 1617
},
{
"epoch": 0.644814187506227,
"grad_norm": 0.24000233827708323,
"learning_rate": 1.1418528986291126e-05,
"loss": 0.813,
"step": 1618
},
{
"epoch": 0.6452127129620404,
"grad_norm": 0.2384982360045188,
"learning_rate": 1.1395678793108106e-05,
"loss": 0.7664,
"step": 1619
},
{
"epoch": 0.645611238417854,
"grad_norm": 0.25233152858510866,
"learning_rate": 1.1372842374013389e-05,
"loss": 0.791,
"step": 1620
},
{
"epoch": 0.6460097638736674,
"grad_norm": 0.23424267270162125,
"learning_rate": 1.135001976556429e-05,
"loss": 0.7872,
"step": 1621
},
{
"epoch": 0.6464082893294809,
"grad_norm": 0.3476922887656111,
"learning_rate": 1.1327211004296013e-05,
"loss": 0.8117,
"step": 1622
},
{
"epoch": 0.6468068147852944,
"grad_norm": 0.25999768296030096,
"learning_rate": 1.1304416126721604e-05,
"loss": 0.8016,
"step": 1623
},
{
"epoch": 0.6472053402411079,
"grad_norm": 0.2386900544989497,
"learning_rate": 1.1281635169331855e-05,
"loss": 0.816,
"step": 1624
},
{
"epoch": 0.6476038656969214,
"grad_norm": 0.23919854850884364,
"learning_rate": 1.1258868168595309e-05,
"loss": 0.7672,
"step": 1625
},
{
"epoch": 0.6480023911527348,
"grad_norm": 0.24292253081996207,
"learning_rate": 1.1236115160958137e-05,
"loss": 0.7876,
"step": 1626
},
{
"epoch": 0.6484009166085484,
"grad_norm": 0.2573324955094864,
"learning_rate": 1.1213376182844118e-05,
"loss": 0.8105,
"step": 1627
},
{
"epoch": 0.6487994420643619,
"grad_norm": 0.2374878968994724,
"learning_rate": 1.1190651270654608e-05,
"loss": 0.7956,
"step": 1628
},
{
"epoch": 0.6491979675201753,
"grad_norm": 0.22808064108307496,
"learning_rate": 1.1167940460768384e-05,
"loss": 0.778,
"step": 1629
},
{
"epoch": 0.6495964929759889,
"grad_norm": 0.24975995163182776,
"learning_rate": 1.11452437895417e-05,
"loss": 0.7927,
"step": 1630
},
{
"epoch": 0.6499950184318023,
"grad_norm": 0.2496302640812307,
"learning_rate": 1.1122561293308134e-05,
"loss": 0.8093,
"step": 1631
},
{
"epoch": 0.6503935438876158,
"grad_norm": 0.2382600490081852,
"learning_rate": 1.1099893008378602e-05,
"loss": 0.7989,
"step": 1632
},
{
"epoch": 0.6507920693434293,
"grad_norm": 0.23966141846275127,
"learning_rate": 1.1077238971041265e-05,
"loss": 0.7737,
"step": 1633
},
{
"epoch": 0.6511905947992428,
"grad_norm": 0.2658481127884238,
"learning_rate": 1.1054599217561466e-05,
"loss": 0.8161,
"step": 1634
},
{
"epoch": 0.6515891202550563,
"grad_norm": 0.23310988570227098,
"learning_rate": 1.10319737841817e-05,
"loss": 0.7965,
"step": 1635
},
{
"epoch": 0.6519876457108698,
"grad_norm": 0.2593756062996178,
"learning_rate": 1.1009362707121506e-05,
"loss": 0.8034,
"step": 1636
},
{
"epoch": 0.6523861711666833,
"grad_norm": 0.25538154058327805,
"learning_rate": 1.098676602257748e-05,
"loss": 0.8041,
"step": 1637
},
{
"epoch": 0.6527846966224967,
"grad_norm": 0.253312859294886,
"learning_rate": 1.0964183766723142e-05,
"loss": 0.8418,
"step": 1638
},
{
"epoch": 0.6531832220783103,
"grad_norm": 0.2492955855138997,
"learning_rate": 1.0941615975708939e-05,
"loss": 0.7821,
"step": 1639
},
{
"epoch": 0.6535817475341238,
"grad_norm": 0.24807305513899183,
"learning_rate": 1.0919062685662154e-05,
"loss": 0.8218,
"step": 1640
},
{
"epoch": 0.6539802729899372,
"grad_norm": 0.24157259403786543,
"learning_rate": 1.0896523932686853e-05,
"loss": 0.8093,
"step": 1641
},
{
"epoch": 0.6543787984457508,
"grad_norm": 0.2887027342486142,
"learning_rate": 1.0873999752863846e-05,
"loss": 0.7708,
"step": 1642
},
{
"epoch": 0.6547773239015642,
"grad_norm": 0.2516367839521763,
"learning_rate": 1.085149018225058e-05,
"loss": 0.8102,
"step": 1643
},
{
"epoch": 0.6551758493573777,
"grad_norm": 0.24924932650750312,
"learning_rate": 1.0828995256881151e-05,
"loss": 0.8155,
"step": 1644
},
{
"epoch": 0.6555743748131911,
"grad_norm": 0.2794672477405356,
"learning_rate": 1.0806515012766196e-05,
"loss": 0.7793,
"step": 1645
},
{
"epoch": 0.6559729002690047,
"grad_norm": 0.2573710085448088,
"learning_rate": 1.0784049485892853e-05,
"loss": 0.7823,
"step": 1646
},
{
"epoch": 0.6563714257248182,
"grad_norm": 0.22754413318247524,
"learning_rate": 1.0761598712224686e-05,
"loss": 0.8244,
"step": 1647
},
{
"epoch": 0.6567699511806316,
"grad_norm": 0.2529075220091104,
"learning_rate": 1.0739162727701655e-05,
"loss": 0.8248,
"step": 1648
},
{
"epoch": 0.6571684766364452,
"grad_norm": 0.23442166283314864,
"learning_rate": 1.0716741568240056e-05,
"loss": 0.7863,
"step": 1649
},
{
"epoch": 0.6575670020922586,
"grad_norm": 0.2310467368157676,
"learning_rate": 1.0694335269732412e-05,
"loss": 0.7935,
"step": 1650
},
{
"epoch": 0.6579655275480721,
"grad_norm": 0.2519609841775046,
"learning_rate": 1.0671943868047514e-05,
"loss": 0.8174,
"step": 1651
},
{
"epoch": 0.6583640530038857,
"grad_norm": 0.23381769850197567,
"learning_rate": 1.0649567399030256e-05,
"loss": 0.8125,
"step": 1652
},
{
"epoch": 0.6587625784596991,
"grad_norm": 0.2248688496445257,
"learning_rate": 1.0627205898501658e-05,
"loss": 0.7631,
"step": 1653
},
{
"epoch": 0.6591611039155126,
"grad_norm": 0.24042601112993525,
"learning_rate": 1.0604859402258749e-05,
"loss": 0.8093,
"step": 1654
},
{
"epoch": 0.6595596293713261,
"grad_norm": 0.23829888619576395,
"learning_rate": 1.0582527946074568e-05,
"loss": 0.757,
"step": 1655
},
{
"epoch": 0.6599581548271396,
"grad_norm": 0.24849887674234067,
"learning_rate": 1.0560211565698065e-05,
"loss": 0.7925,
"step": 1656
},
{
"epoch": 0.660356680282953,
"grad_norm": 0.23966740664443098,
"learning_rate": 1.053791029685405e-05,
"loss": 0.7956,
"step": 1657
},
{
"epoch": 0.6607552057387666,
"grad_norm": 0.2326370782463841,
"learning_rate": 1.0515624175243162e-05,
"loss": 0.7662,
"step": 1658
},
{
"epoch": 0.6611537311945801,
"grad_norm": 0.31722454033580055,
"learning_rate": 1.0493353236541762e-05,
"loss": 0.7802,
"step": 1659
},
{
"epoch": 0.6615522566503935,
"grad_norm": 0.25707007749842065,
"learning_rate": 1.0471097516401936e-05,
"loss": 0.8621,
"step": 1660
},
{
"epoch": 0.6619507821062071,
"grad_norm": 0.24902572963184474,
"learning_rate": 1.0448857050451378e-05,
"loss": 0.7842,
"step": 1661
},
{
"epoch": 0.6623493075620205,
"grad_norm": 0.24955167998517547,
"learning_rate": 1.0426631874293375e-05,
"loss": 0.8294,
"step": 1662
},
{
"epoch": 0.662747833017834,
"grad_norm": 0.23384165302801938,
"learning_rate": 1.0404422023506769e-05,
"loss": 0.79,
"step": 1663
},
{
"epoch": 0.6631463584736476,
"grad_norm": 0.2392972325732434,
"learning_rate": 1.038222753364581e-05,
"loss": 0.8006,
"step": 1664
},
{
"epoch": 0.663544883929461,
"grad_norm": 0.24206783576164856,
"learning_rate": 1.0360048440240211e-05,
"loss": 0.8027,
"step": 1665
},
{
"epoch": 0.6639434093852745,
"grad_norm": 0.22839516664163145,
"learning_rate": 1.0337884778794993e-05,
"loss": 0.7948,
"step": 1666
},
{
"epoch": 0.6643419348410879,
"grad_norm": 0.2402973963775374,
"learning_rate": 1.0315736584790507e-05,
"loss": 0.8151,
"step": 1667
},
{
"epoch": 0.6647404602969015,
"grad_norm": 0.2343262068157496,
"learning_rate": 1.0293603893682327e-05,
"loss": 0.7982,
"step": 1668
},
{
"epoch": 0.6651389857527149,
"grad_norm": 0.23763455582566587,
"learning_rate": 1.0271486740901215e-05,
"loss": 0.8202,
"step": 1669
},
{
"epoch": 0.6655375112085284,
"grad_norm": 0.22857617190624355,
"learning_rate": 1.0249385161853064e-05,
"loss": 0.8043,
"step": 1670
},
{
"epoch": 0.665936036664342,
"grad_norm": 0.23554787626388524,
"learning_rate": 1.0227299191918818e-05,
"loss": 0.7754,
"step": 1671
},
{
"epoch": 0.6663345621201554,
"grad_norm": 0.24185097085110915,
"learning_rate": 1.0205228866454452e-05,
"loss": 0.8149,
"step": 1672
},
{
"epoch": 0.6667330875759689,
"grad_norm": 0.24371976817956506,
"learning_rate": 1.018317422079087e-05,
"loss": 0.7953,
"step": 1673
},
{
"epoch": 0.6671316130317824,
"grad_norm": 0.22651548749239922,
"learning_rate": 1.0161135290233928e-05,
"loss": 0.7856,
"step": 1674
},
{
"epoch": 0.6675301384875959,
"grad_norm": 0.23694878438384515,
"learning_rate": 1.0139112110064265e-05,
"loss": 0.7917,
"step": 1675
},
{
"epoch": 0.6679286639434094,
"grad_norm": 0.23479229889643258,
"learning_rate": 1.0117104715537338e-05,
"loss": 0.7941,
"step": 1676
},
{
"epoch": 0.6683271893992229,
"grad_norm": 0.24124394146663952,
"learning_rate": 1.009511314188334e-05,
"loss": 0.8183,
"step": 1677
},
{
"epoch": 0.6687257148550364,
"grad_norm": 0.22678268771998955,
"learning_rate": 1.0073137424307109e-05,
"loss": 0.785,
"step": 1678
},
{
"epoch": 0.6691242403108498,
"grad_norm": 0.2477335220816568,
"learning_rate": 1.0051177597988122e-05,
"loss": 0.8033,
"step": 1679
},
{
"epoch": 0.6695227657666634,
"grad_norm": 0.23625778900717528,
"learning_rate": 1.0029233698080415e-05,
"loss": 0.8033,
"step": 1680
},
{
"epoch": 0.6699212912224768,
"grad_norm": 0.23825465079514177,
"learning_rate": 1.0007305759712533e-05,
"loss": 0.7735,
"step": 1681
},
{
"epoch": 0.6703198166782903,
"grad_norm": 0.22035417715886807,
"learning_rate": 9.985393817987444e-06,
"loss": 0.8073,
"step": 1682
},
{
"epoch": 0.6707183421341039,
"grad_norm": 0.23849505686477043,
"learning_rate": 9.963497907982532e-06,
"loss": 0.8026,
"step": 1683
},
{
"epoch": 0.6711168675899173,
"grad_norm": 0.2337573641381328,
"learning_rate": 9.94161806474951e-06,
"loss": 0.7889,
"step": 1684
},
{
"epoch": 0.6715153930457308,
"grad_norm": 0.9103894523595338,
"learning_rate": 9.919754323314372e-06,
"loss": 0.792,
"step": 1685
},
{
"epoch": 0.6719139185015442,
"grad_norm": 0.22893455291621617,
"learning_rate": 9.897906718677344e-06,
"loss": 0.782,
"step": 1686
},
{
"epoch": 0.6723124439573578,
"grad_norm": 0.2372100351252991,
"learning_rate": 9.87607528581279e-06,
"loss": 0.8011,
"step": 1687
},
{
"epoch": 0.6727109694131712,
"grad_norm": 0.23111628536958412,
"learning_rate": 9.854260059669225e-06,
"loss": 0.8025,
"step": 1688
},
{
"epoch": 0.6731094948689847,
"grad_norm": 0.2368927356235449,
"learning_rate": 9.832461075169184e-06,
"loss": 0.8033,
"step": 1689
},
{
"epoch": 0.6735080203247983,
"grad_norm": 0.22855325082673575,
"learning_rate": 9.810678367209227e-06,
"loss": 0.7911,
"step": 1690
},
{
"epoch": 0.6739065457806117,
"grad_norm": 0.27522701488615475,
"learning_rate": 9.788911970659848e-06,
"loss": 0.7916,
"step": 1691
},
{
"epoch": 0.6743050712364252,
"grad_norm": 0.24022760398565116,
"learning_rate": 9.767161920365431e-06,
"loss": 0.8037,
"step": 1692
},
{
"epoch": 0.6747035966922387,
"grad_norm": 0.22778160452010449,
"learning_rate": 9.7454282511442e-06,
"loss": 0.8169,
"step": 1693
},
{
"epoch": 0.6751021221480522,
"grad_norm": 0.2307169634206417,
"learning_rate": 9.723710997788134e-06,
"loss": 0.7951,
"step": 1694
},
{
"epoch": 0.6755006476038657,
"grad_norm": 0.2278130241658777,
"learning_rate": 9.702010195062957e-06,
"loss": 0.804,
"step": 1695
},
{
"epoch": 0.6758991730596792,
"grad_norm": 0.23860918505971207,
"learning_rate": 9.68032587770803e-06,
"loss": 0.7775,
"step": 1696
},
{
"epoch": 0.6762976985154927,
"grad_norm": 0.23206722403706048,
"learning_rate": 9.65865808043636e-06,
"loss": 0.7717,
"step": 1697
},
{
"epoch": 0.6766962239713061,
"grad_norm": 0.2424939487602499,
"learning_rate": 9.637006837934491e-06,
"loss": 0.8284,
"step": 1698
},
{
"epoch": 0.6770947494271197,
"grad_norm": 0.2422935170368267,
"learning_rate": 9.61537218486245e-06,
"loss": 0.7982,
"step": 1699
},
{
"epoch": 0.6774932748829331,
"grad_norm": 0.268912315082055,
"learning_rate": 9.593754155853736e-06,
"loss": 0.8025,
"step": 1700
},
{
"epoch": 0.6778918003387466,
"grad_norm": 0.24641465322988168,
"learning_rate": 9.572152785515206e-06,
"loss": 0.796,
"step": 1701
},
{
"epoch": 0.6782903257945602,
"grad_norm": 0.23523832181072415,
"learning_rate": 9.550568108427067e-06,
"loss": 0.7945,
"step": 1702
},
{
"epoch": 0.6786888512503736,
"grad_norm": 0.23985080041043766,
"learning_rate": 9.529000159142806e-06,
"loss": 0.7967,
"step": 1703
},
{
"epoch": 0.6790873767061871,
"grad_norm": 0.24109034221158648,
"learning_rate": 9.507448972189124e-06,
"loss": 0.809,
"step": 1704
},
{
"epoch": 0.6794859021620006,
"grad_norm": 0.3279682419994762,
"learning_rate": 9.485914582065893e-06,
"loss": 0.7976,
"step": 1705
},
{
"epoch": 0.6798844276178141,
"grad_norm": 0.24600000203117356,
"learning_rate": 9.464397023246086e-06,
"loss": 0.798,
"step": 1706
},
{
"epoch": 0.6802829530736276,
"grad_norm": 0.25198127703741363,
"learning_rate": 9.442896330175736e-06,
"loss": 0.7666,
"step": 1707
},
{
"epoch": 0.680681478529441,
"grad_norm": 0.24602873394094937,
"learning_rate": 9.421412537273888e-06,
"loss": 0.8296,
"step": 1708
},
{
"epoch": 0.6810800039852546,
"grad_norm": 0.2462861171716341,
"learning_rate": 9.399945678932518e-06,
"loss": 0.7671,
"step": 1709
},
{
"epoch": 0.681478529441068,
"grad_norm": 0.2356910774374406,
"learning_rate": 9.378495789516511e-06,
"loss": 0.8005,
"step": 1710
},
{
"epoch": 0.6818770548968816,
"grad_norm": 0.26676136395934497,
"learning_rate": 9.357062903363559e-06,
"loss": 0.7966,
"step": 1711
},
{
"epoch": 0.682275580352695,
"grad_norm": 0.22412533500879198,
"learning_rate": 9.335647054784163e-06,
"loss": 0.7837,
"step": 1712
},
{
"epoch": 0.6826741058085085,
"grad_norm": 0.24899863246739254,
"learning_rate": 9.314248278061524e-06,
"loss": 0.8113,
"step": 1713
},
{
"epoch": 0.683072631264322,
"grad_norm": 0.2518131395877076,
"learning_rate": 9.292866607451534e-06,
"loss": 0.7868,
"step": 1714
},
{
"epoch": 0.6834711567201355,
"grad_norm": 0.2384173486107651,
"learning_rate": 9.271502077182697e-06,
"loss": 0.7748,
"step": 1715
},
{
"epoch": 0.683869682175949,
"grad_norm": 0.5967497241397911,
"learning_rate": 9.250154721456075e-06,
"loss": 0.7962,
"step": 1716
},
{
"epoch": 0.6842682076317624,
"grad_norm": 0.24269806832216176,
"learning_rate": 9.22882457444524e-06,
"loss": 0.8026,
"step": 1717
},
{
"epoch": 0.684666733087576,
"grad_norm": 0.23438959649008212,
"learning_rate": 9.207511670296204e-06,
"loss": 0.795,
"step": 1718
},
{
"epoch": 0.6850652585433895,
"grad_norm": 0.24041761239392234,
"learning_rate": 9.186216043127388e-06,
"loss": 0.8214,
"step": 1719
},
{
"epoch": 0.6854637839992029,
"grad_norm": 0.2415192222064715,
"learning_rate": 9.16493772702955e-06,
"loss": 0.7907,
"step": 1720
},
{
"epoch": 0.6858623094550165,
"grad_norm": 0.25457580261405643,
"learning_rate": 9.143676756065752e-06,
"loss": 0.7912,
"step": 1721
},
{
"epoch": 0.6862608349108299,
"grad_norm": 0.24138741526314378,
"learning_rate": 9.122433164271252e-06,
"loss": 0.7952,
"step": 1722
},
{
"epoch": 0.6866593603666434,
"grad_norm": 0.23982959026182568,
"learning_rate": 9.101206985653523e-06,
"loss": 0.8109,
"step": 1723
},
{
"epoch": 0.6870578858224569,
"grad_norm": 0.23128247905861088,
"learning_rate": 9.079998254192157e-06,
"loss": 0.7996,
"step": 1724
},
{
"epoch": 0.6874564112782704,
"grad_norm": 0.23257176458111745,
"learning_rate": 9.058807003838792e-06,
"loss": 0.7959,
"step": 1725
},
{
"epoch": 0.6878549367340839,
"grad_norm": 0.2514299885659865,
"learning_rate": 9.037633268517105e-06,
"loss": 0.8007,
"step": 1726
},
{
"epoch": 0.6882534621898974,
"grad_norm": 0.2296427095516536,
"learning_rate": 9.016477082122727e-06,
"loss": 0.7671,
"step": 1727
},
{
"epoch": 0.6886519876457109,
"grad_norm": 0.24370730489409603,
"learning_rate": 8.995338478523206e-06,
"loss": 0.8123,
"step": 1728
},
{
"epoch": 0.6890505131015243,
"grad_norm": 0.23578511930028617,
"learning_rate": 8.974217491557916e-06,
"loss": 0.7964,
"step": 1729
},
{
"epoch": 0.6894490385573379,
"grad_norm": 0.23684202240770086,
"learning_rate": 8.953114155038059e-06,
"loss": 0.7808,
"step": 1730
},
{
"epoch": 0.6898475640131514,
"grad_norm": 0.22699784086777558,
"learning_rate": 8.932028502746563e-06,
"loss": 0.7959,
"step": 1731
},
{
"epoch": 0.6902460894689648,
"grad_norm": 0.24063862708544978,
"learning_rate": 8.910960568438058e-06,
"loss": 0.789,
"step": 1732
},
{
"epoch": 0.6906446149247784,
"grad_norm": 0.22874206732454588,
"learning_rate": 8.889910385838813e-06,
"loss": 0.7826,
"step": 1733
},
{
"epoch": 0.6910431403805918,
"grad_norm": 0.2250049276809127,
"learning_rate": 8.868877988646656e-06,
"loss": 0.7941,
"step": 1734
},
{
"epoch": 0.6914416658364053,
"grad_norm": 0.22799809229676088,
"learning_rate": 8.847863410530973e-06,
"loss": 0.8039,
"step": 1735
},
{
"epoch": 0.6918401912922187,
"grad_norm": 0.22068818384437014,
"learning_rate": 8.826866685132597e-06,
"loss": 0.764,
"step": 1736
},
{
"epoch": 0.6922387167480323,
"grad_norm": 0.23302636532036256,
"learning_rate": 8.805887846063793e-06,
"loss": 0.7814,
"step": 1737
},
{
"epoch": 0.6926372422038458,
"grad_norm": 0.2235081586612528,
"learning_rate": 8.784926926908228e-06,
"loss": 0.7906,
"step": 1738
},
{
"epoch": 0.6930357676596592,
"grad_norm": 0.23695689079275012,
"learning_rate": 8.763983961220818e-06,
"loss": 0.7948,
"step": 1739
},
{
"epoch": 0.6934342931154728,
"grad_norm": 0.24343892771165315,
"learning_rate": 8.74305898252779e-06,
"loss": 0.777,
"step": 1740
},
{
"epoch": 0.6938328185712862,
"grad_norm": 0.2403895498767754,
"learning_rate": 8.72215202432654e-06,
"loss": 0.8093,
"step": 1741
},
{
"epoch": 0.6942313440270997,
"grad_norm": 0.23104547501067635,
"learning_rate": 8.701263120085643e-06,
"loss": 0.7747,
"step": 1742
},
{
"epoch": 0.6946298694829133,
"grad_norm": 0.2399257360677753,
"learning_rate": 8.680392303244762e-06,
"loss": 0.7887,
"step": 1743
},
{
"epoch": 0.6950283949387267,
"grad_norm": 0.2298960897757004,
"learning_rate": 8.659539607214609e-06,
"loss": 0.805,
"step": 1744
},
{
"epoch": 0.6954269203945402,
"grad_norm": 0.22209674980320604,
"learning_rate": 8.638705065376887e-06,
"loss": 0.7882,
"step": 1745
},
{
"epoch": 0.6958254458503537,
"grad_norm": 0.22996129591563572,
"learning_rate": 8.617888711084225e-06,
"loss": 0.7907,
"step": 1746
},
{
"epoch": 0.6962239713061672,
"grad_norm": 0.23756147299275276,
"learning_rate": 8.597090577660158e-06,
"loss": 0.8248,
"step": 1747
},
{
"epoch": 0.6966224967619806,
"grad_norm": 0.23089712940348142,
"learning_rate": 8.576310698399031e-06,
"loss": 0.7827,
"step": 1748
},
{
"epoch": 0.6970210222177942,
"grad_norm": 0.22154445039007642,
"learning_rate": 8.555549106565981e-06,
"loss": 0.7987,
"step": 1749
},
{
"epoch": 0.6974195476736077,
"grad_norm": 0.2331241726825461,
"learning_rate": 8.534805835396866e-06,
"loss": 0.8262,
"step": 1750
},
{
"epoch": 0.6978180731294211,
"grad_norm": 0.22789526498273438,
"learning_rate": 8.514080918098218e-06,
"loss": 0.7886,
"step": 1751
},
{
"epoch": 0.6982165985852347,
"grad_norm": 0.21821146925663867,
"learning_rate": 8.49337438784719e-06,
"loss": 0.801,
"step": 1752
},
{
"epoch": 0.6986151240410481,
"grad_norm": 0.23508205049301503,
"learning_rate": 8.472686277791485e-06,
"loss": 0.7643,
"step": 1753
},
{
"epoch": 0.6990136494968616,
"grad_norm": 0.22461888065681415,
"learning_rate": 8.452016621049333e-06,
"loss": 0.7991,
"step": 1754
},
{
"epoch": 0.699412174952675,
"grad_norm": 0.21803368130601183,
"learning_rate": 8.431365450709419e-06,
"loss": 0.7987,
"step": 1755
},
{
"epoch": 0.6998107004084886,
"grad_norm": 0.23740898039198863,
"learning_rate": 8.410732799830845e-06,
"loss": 0.7915,
"step": 1756
},
{
"epoch": 0.7002092258643021,
"grad_norm": 0.261735854629893,
"learning_rate": 8.39011870144304e-06,
"loss": 0.7955,
"step": 1757
},
{
"epoch": 0.7006077513201155,
"grad_norm": 0.2180685253328265,
"learning_rate": 8.369523188545756e-06,
"loss": 0.8028,
"step": 1758
},
{
"epoch": 0.7010062767759291,
"grad_norm": 0.2301419951414697,
"learning_rate": 8.348946294108996e-06,
"loss": 0.8103,
"step": 1759
},
{
"epoch": 0.7014048022317425,
"grad_norm": 0.22024932183589127,
"learning_rate": 8.328388051072922e-06,
"loss": 0.7928,
"step": 1760
},
{
"epoch": 0.701803327687556,
"grad_norm": 0.4020336814790439,
"learning_rate": 8.307848492347899e-06,
"loss": 0.8011,
"step": 1761
},
{
"epoch": 0.7022018531433696,
"grad_norm": 0.22024662257821778,
"learning_rate": 8.287327650814323e-06,
"loss": 0.8119,
"step": 1762
},
{
"epoch": 0.702600378599183,
"grad_norm": 0.27996952780116363,
"learning_rate": 8.266825559322667e-06,
"loss": 0.7987,
"step": 1763
},
{
"epoch": 0.7029989040549965,
"grad_norm": 0.22630541171175222,
"learning_rate": 8.246342250693354e-06,
"loss": 0.817,
"step": 1764
},
{
"epoch": 0.70339742951081,
"grad_norm": 0.2290021039403852,
"learning_rate": 8.225877757716768e-06,
"loss": 0.7959,
"step": 1765
},
{
"epoch": 0.7037959549666235,
"grad_norm": 0.2216297139655694,
"learning_rate": 8.205432113153158e-06,
"loss": 0.7791,
"step": 1766
},
{
"epoch": 0.7041944804224369,
"grad_norm": 0.22804574889964005,
"learning_rate": 8.185005349732605e-06,
"loss": 0.8041,
"step": 1767
},
{
"epoch": 0.7045930058782505,
"grad_norm": 0.21915038552906846,
"learning_rate": 8.16459750015497e-06,
"loss": 0.7919,
"step": 1768
},
{
"epoch": 0.704991531334064,
"grad_norm": 0.23641715849802888,
"learning_rate": 8.144208597089814e-06,
"loss": 0.7684,
"step": 1769
},
{
"epoch": 0.7053900567898774,
"grad_norm": 0.23150971294969083,
"learning_rate": 8.123838673176396e-06,
"loss": 0.8268,
"step": 1770
},
{
"epoch": 0.705788582245691,
"grad_norm": 0.22487777470325962,
"learning_rate": 8.103487761023559e-06,
"loss": 0.7952,
"step": 1771
},
{
"epoch": 0.7061871077015044,
"grad_norm": 0.22359225895687845,
"learning_rate": 8.08315589320975e-06,
"loss": 0.7942,
"step": 1772
},
{
"epoch": 0.7065856331573179,
"grad_norm": 0.22728777425623412,
"learning_rate": 8.062843102282916e-06,
"loss": 0.7979,
"step": 1773
},
{
"epoch": 0.7069841586131315,
"grad_norm": 0.32242287769373923,
"learning_rate": 8.042549420760437e-06,
"loss": 0.7758,
"step": 1774
},
{
"epoch": 0.7073826840689449,
"grad_norm": 0.23211148591348726,
"learning_rate": 8.022274881129146e-06,
"loss": 0.7932,
"step": 1775
},
{
"epoch": 0.7077812095247584,
"grad_norm": 0.23149548150957583,
"learning_rate": 8.002019515845194e-06,
"loss": 0.781,
"step": 1776
},
{
"epoch": 0.7081797349805719,
"grad_norm": 0.23571260576059858,
"learning_rate": 7.981783357334061e-06,
"loss": 0.8099,
"step": 1777
},
{
"epoch": 0.7085782604363854,
"grad_norm": 0.23684120441719464,
"learning_rate": 7.961566437990475e-06,
"loss": 0.7925,
"step": 1778
},
{
"epoch": 0.7089767858921988,
"grad_norm": 0.23808835745048676,
"learning_rate": 7.941368790178365e-06,
"loss": 0.8035,
"step": 1779
},
{
"epoch": 0.7093753113480123,
"grad_norm": 0.24734022897944857,
"learning_rate": 7.921190446230813e-06,
"loss": 0.7797,
"step": 1780
},
{
"epoch": 0.7097738368038259,
"grad_norm": 0.2453484186566751,
"learning_rate": 7.901031438449982e-06,
"loss": 0.819,
"step": 1781
},
{
"epoch": 0.7101723622596393,
"grad_norm": 0.22709522154253955,
"learning_rate": 7.880891799107108e-06,
"loss": 0.8394,
"step": 1782
},
{
"epoch": 0.7105708877154528,
"grad_norm": 0.24346320063244078,
"learning_rate": 7.860771560442384e-06,
"loss": 0.8114,
"step": 1783
},
{
"epoch": 0.7109694131712663,
"grad_norm": 0.23923932846526716,
"learning_rate": 7.84067075466499e-06,
"loss": 0.7866,
"step": 1784
},
{
"epoch": 0.7113679386270798,
"grad_norm": 0.24156935661046483,
"learning_rate": 7.820589413952976e-06,
"loss": 0.7792,
"step": 1785
},
{
"epoch": 0.7117664640828933,
"grad_norm": 0.24507452424550918,
"learning_rate": 7.800527570453215e-06,
"loss": 0.7986,
"step": 1786
},
{
"epoch": 0.7121649895387068,
"grad_norm": 0.22251550647565904,
"learning_rate": 7.780485256281402e-06,
"loss": 0.7733,
"step": 1787
},
{
"epoch": 0.7125635149945203,
"grad_norm": 0.2426455233626753,
"learning_rate": 7.760462503521933e-06,
"loss": 0.7954,
"step": 1788
},
{
"epoch": 0.7129620404503337,
"grad_norm": 0.23577702373705983,
"learning_rate": 7.740459344227918e-06,
"loss": 0.7985,
"step": 1789
},
{
"epoch": 0.7133605659061473,
"grad_norm": 0.24472106889910925,
"learning_rate": 7.720475810421088e-06,
"loss": 0.7924,
"step": 1790
},
{
"epoch": 0.7137590913619607,
"grad_norm": 0.23276012167993276,
"learning_rate": 7.700511934091763e-06,
"loss": 0.8098,
"step": 1791
},
{
"epoch": 0.7141576168177742,
"grad_norm": 0.2227128937074685,
"learning_rate": 7.680567747198797e-06,
"loss": 0.8368,
"step": 1792
},
{
"epoch": 0.7145561422735878,
"grad_norm": 0.35446105784971366,
"learning_rate": 7.660643281669502e-06,
"loss": 0.7913,
"step": 1793
},
{
"epoch": 0.7149546677294012,
"grad_norm": 0.23973566003992375,
"learning_rate": 7.640738569399645e-06,
"loss": 0.8357,
"step": 1794
},
{
"epoch": 0.7153531931852147,
"grad_norm": 0.2391665089124275,
"learning_rate": 7.620853642253363e-06,
"loss": 0.8133,
"step": 1795
},
{
"epoch": 0.7157517186410282,
"grad_norm": 0.23522226900870816,
"learning_rate": 7.600988532063125e-06,
"loss": 0.7926,
"step": 1796
},
{
"epoch": 0.7161502440968417,
"grad_norm": 0.24554684562043907,
"learning_rate": 7.58114327062966e-06,
"loss": 0.7709,
"step": 1797
},
{
"epoch": 0.7165487695526552,
"grad_norm": 0.22714750497856911,
"learning_rate": 7.561317889721937e-06,
"loss": 0.7818,
"step": 1798
},
{
"epoch": 0.7169472950084687,
"grad_norm": 0.23827899963595306,
"learning_rate": 7.541512421077106e-06,
"loss": 0.7728,
"step": 1799
},
{
"epoch": 0.7173458204642822,
"grad_norm": 0.48457590255842975,
"learning_rate": 7.521726896400414e-06,
"loss": 0.7739,
"step": 1800
},
{
"epoch": 0.7177443459200956,
"grad_norm": 0.23582475280902745,
"learning_rate": 7.50196134736521e-06,
"loss": 0.8168,
"step": 1801
},
{
"epoch": 0.7181428713759092,
"grad_norm": 0.2380146361056826,
"learning_rate": 7.482215805612847e-06,
"loss": 0.7779,
"step": 1802
},
{
"epoch": 0.7185413968317226,
"grad_norm": 0.2360276005567584,
"learning_rate": 7.462490302752665e-06,
"loss": 0.7864,
"step": 1803
},
{
"epoch": 0.7189399222875361,
"grad_norm": 0.22814213757245871,
"learning_rate": 7.442784870361903e-06,
"loss": 0.8191,
"step": 1804
},
{
"epoch": 0.7193384477433497,
"grad_norm": 0.24107281393643026,
"learning_rate": 7.42309953998569e-06,
"loss": 0.7838,
"step": 1805
},
{
"epoch": 0.7197369731991631,
"grad_norm": 0.24232433035462758,
"learning_rate": 7.4034343431369685e-06,
"loss": 0.7977,
"step": 1806
},
{
"epoch": 0.7201354986549766,
"grad_norm": 0.38084230051806445,
"learning_rate": 7.38378931129645e-06,
"loss": 0.8043,
"step": 1807
},
{
"epoch": 0.72053402411079,
"grad_norm": 0.24397034947179694,
"learning_rate": 7.364164475912572e-06,
"loss": 0.8068,
"step": 1808
},
{
"epoch": 0.7209325495666036,
"grad_norm": 0.4613176607526505,
"learning_rate": 7.344559868401422e-06,
"loss": 0.7877,
"step": 1809
},
{
"epoch": 0.7213310750224171,
"grad_norm": 0.23005075594522995,
"learning_rate": 7.3249755201467335e-06,
"loss": 0.7722,
"step": 1810
},
{
"epoch": 0.7217296004782305,
"grad_norm": 0.2387695579592527,
"learning_rate": 7.305411462499776e-06,
"loss": 0.8201,
"step": 1811
},
{
"epoch": 0.7221281259340441,
"grad_norm": 0.2344269204447853,
"learning_rate": 7.2858677267793635e-06,
"loss": 0.7815,
"step": 1812
},
{
"epoch": 0.7225266513898575,
"grad_norm": 0.2280635583340256,
"learning_rate": 7.26634434427177e-06,
"loss": 0.7814,
"step": 1813
},
{
"epoch": 0.722925176845671,
"grad_norm": 0.2328509307005202,
"learning_rate": 7.246841346230684e-06,
"loss": 0.7695,
"step": 1814
},
{
"epoch": 0.7233237023014845,
"grad_norm": 0.2237984273349448,
"learning_rate": 7.227358763877172e-06,
"loss": 0.8082,
"step": 1815
},
{
"epoch": 0.723722227757298,
"grad_norm": 0.24293928069372236,
"learning_rate": 7.207896628399598e-06,
"loss": 0.8018,
"step": 1816
},
{
"epoch": 0.7241207532131115,
"grad_norm": 0.22708584207065824,
"learning_rate": 7.1884549709536115e-06,
"loss": 0.788,
"step": 1817
},
{
"epoch": 0.724519278668925,
"grad_norm": 0.23024391469364716,
"learning_rate": 7.169033822662077e-06,
"loss": 0.7722,
"step": 1818
},
{
"epoch": 0.7249178041247385,
"grad_norm": 0.21908469252061188,
"learning_rate": 7.149633214615022e-06,
"loss": 0.7757,
"step": 1819
},
{
"epoch": 0.7253163295805519,
"grad_norm": 0.23374912363797343,
"learning_rate": 7.130253177869606e-06,
"loss": 0.8123,
"step": 1820
},
{
"epoch": 0.7257148550363655,
"grad_norm": 0.23339945263366027,
"learning_rate": 7.1108937434500335e-06,
"loss": 0.8145,
"step": 1821
},
{
"epoch": 0.7261133804921789,
"grad_norm": 0.22566815004670457,
"learning_rate": 7.091554942347551e-06,
"loss": 0.7879,
"step": 1822
},
{
"epoch": 0.7265119059479924,
"grad_norm": 0.22495869682272615,
"learning_rate": 7.072236805520358e-06,
"loss": 0.7979,
"step": 1823
},
{
"epoch": 0.726910431403806,
"grad_norm": 0.2376828902036485,
"learning_rate": 7.052939363893583e-06,
"loss": 0.8208,
"step": 1824
},
{
"epoch": 0.7273089568596194,
"grad_norm": 0.23450024068687056,
"learning_rate": 7.033662648359225e-06,
"loss": 0.7824,
"step": 1825
},
{
"epoch": 0.7277074823154329,
"grad_norm": 0.22685374818541473,
"learning_rate": 7.014406689776101e-06,
"loss": 0.7876,
"step": 1826
},
{
"epoch": 0.7281060077712463,
"grad_norm": 0.23011276016836252,
"learning_rate": 6.995171518969808e-06,
"loss": 0.8075,
"step": 1827
},
{
"epoch": 0.7285045332270599,
"grad_norm": 0.24933093286417946,
"learning_rate": 6.975957166732645e-06,
"loss": 0.7662,
"step": 1828
},
{
"epoch": 0.7289030586828734,
"grad_norm": 0.22506531353014372,
"learning_rate": 6.956763663823602e-06,
"loss": 0.7808,
"step": 1829
},
{
"epoch": 0.7293015841386868,
"grad_norm": 0.23401655584722747,
"learning_rate": 6.937591040968288e-06,
"loss": 0.8209,
"step": 1830
},
{
"epoch": 0.7297001095945004,
"grad_norm": 0.24774972767529824,
"learning_rate": 6.918439328858892e-06,
"loss": 0.7712,
"step": 1831
},
{
"epoch": 0.7300986350503138,
"grad_norm": 0.23342909513340782,
"learning_rate": 6.89930855815411e-06,
"loss": 0.7994,
"step": 1832
},
{
"epoch": 0.7304971605061273,
"grad_norm": 0.22394459360997282,
"learning_rate": 6.880198759479133e-06,
"loss": 0.8042,
"step": 1833
},
{
"epoch": 0.7308956859619408,
"grad_norm": 0.23360743949550875,
"learning_rate": 6.861109963425578e-06,
"loss": 0.7916,
"step": 1834
},
{
"epoch": 0.7312942114177543,
"grad_norm": 0.22281906219641856,
"learning_rate": 6.8420422005514266e-06,
"loss": 0.8137,
"step": 1835
},
{
"epoch": 0.7316927368735678,
"grad_norm": 0.22014312278105563,
"learning_rate": 6.822995501380998e-06,
"loss": 0.8021,
"step": 1836
},
{
"epoch": 0.7320912623293813,
"grad_norm": 0.2257715944227968,
"learning_rate": 6.803969896404896e-06,
"loss": 0.784,
"step": 1837
},
{
"epoch": 0.7324897877851948,
"grad_norm": 0.24155855616319677,
"learning_rate": 6.784965416079961e-06,
"loss": 0.7933,
"step": 1838
},
{
"epoch": 0.7328883132410082,
"grad_norm": 0.22107207590046762,
"learning_rate": 6.765982090829189e-06,
"loss": 0.784,
"step": 1839
},
{
"epoch": 0.7332868386968218,
"grad_norm": 0.21216318175362134,
"learning_rate": 6.74701995104174e-06,
"loss": 0.8023,
"step": 1840
},
{
"epoch": 0.7336853641526353,
"grad_norm": 0.2513348774684416,
"learning_rate": 6.728079027072847e-06,
"loss": 0.8255,
"step": 1841
},
{
"epoch": 0.7340838896084487,
"grad_norm": 0.23421026990778565,
"learning_rate": 6.709159349243781e-06,
"loss": 0.8255,
"step": 1842
},
{
"epoch": 0.7344824150642623,
"grad_norm": 0.20679965719103174,
"learning_rate": 6.690260947841809e-06,
"loss": 0.7863,
"step": 1843
},
{
"epoch": 0.7348809405200757,
"grad_norm": 0.24196895097156834,
"learning_rate": 6.671383853120117e-06,
"loss": 0.8162,
"step": 1844
},
{
"epoch": 0.7352794659758892,
"grad_norm": 0.23539184150189893,
"learning_rate": 6.652528095297812e-06,
"loss": 0.7788,
"step": 1845
},
{
"epoch": 0.7356779914317026,
"grad_norm": 0.2158639231432844,
"learning_rate": 6.633693704559814e-06,
"loss": 0.8077,
"step": 1846
},
{
"epoch": 0.7360765168875162,
"grad_norm": 0.23071528135591446,
"learning_rate": 6.614880711056853e-06,
"loss": 0.7774,
"step": 1847
},
{
"epoch": 0.7364750423433297,
"grad_norm": 0.22552702501791788,
"learning_rate": 6.596089144905422e-06,
"loss": 0.7794,
"step": 1848
},
{
"epoch": 0.7368735677991431,
"grad_norm": 0.2330734404526342,
"learning_rate": 6.577319036187679e-06,
"loss": 0.79,
"step": 1849
},
{
"epoch": 0.7372720932549567,
"grad_norm": 0.2265375246131879,
"learning_rate": 6.558570414951462e-06,
"loss": 0.7922,
"step": 1850
},
{
"epoch": 0.7376706187107701,
"grad_norm": 0.22667338696640402,
"learning_rate": 6.539843311210181e-06,
"loss": 0.7796,
"step": 1851
},
{
"epoch": 0.7380691441665836,
"grad_norm": 0.23040531636916783,
"learning_rate": 6.521137754942828e-06,
"loss": 0.8163,
"step": 1852
},
{
"epoch": 0.7384676696223972,
"grad_norm": 0.22397477455791673,
"learning_rate": 6.5024537760938886e-06,
"loss": 0.8049,
"step": 1853
},
{
"epoch": 0.7388661950782106,
"grad_norm": 0.21837702568211942,
"learning_rate": 6.483791404573305e-06,
"loss": 0.7899,
"step": 1854
},
{
"epoch": 0.7392647205340241,
"grad_norm": 0.23621768578628966,
"learning_rate": 6.465150670256441e-06,
"loss": 0.8131,
"step": 1855
},
{
"epoch": 0.7396632459898376,
"grad_norm": 0.22441226758524066,
"learning_rate": 6.446531602984003e-06,
"loss": 0.8044,
"step": 1856
},
{
"epoch": 0.7400617714456511,
"grad_norm": 0.21742047573106374,
"learning_rate": 6.427934232562034e-06,
"loss": 0.7779,
"step": 1857
},
{
"epoch": 0.7404602969014645,
"grad_norm": 0.2177698894735104,
"learning_rate": 6.409358588761814e-06,
"loss": 0.7894,
"step": 1858
},
{
"epoch": 0.7408588223572781,
"grad_norm": 0.22916632915750462,
"learning_rate": 6.39080470131989e-06,
"loss": 0.7928,
"step": 1859
},
{
"epoch": 0.7412573478130916,
"grad_norm": 0.22082966691884467,
"learning_rate": 6.37227259993793e-06,
"loss": 0.7915,
"step": 1860
},
{
"epoch": 0.741655873268905,
"grad_norm": 0.2241200766337397,
"learning_rate": 6.353762314282757e-06,
"loss": 0.7779,
"step": 1861
},
{
"epoch": 0.7420543987247186,
"grad_norm": 0.23702387172593264,
"learning_rate": 6.335273873986267e-06,
"loss": 0.7829,
"step": 1862
},
{
"epoch": 0.742452924180532,
"grad_norm": 0.2527038905168017,
"learning_rate": 6.316807308645367e-06,
"loss": 0.7829,
"step": 1863
},
{
"epoch": 0.7428514496363455,
"grad_norm": 0.23475628446887611,
"learning_rate": 6.2983626478219695e-06,
"loss": 0.7999,
"step": 1864
},
{
"epoch": 0.7432499750921591,
"grad_norm": 0.23416030882805897,
"learning_rate": 6.279939921042906e-06,
"loss": 0.8085,
"step": 1865
},
{
"epoch": 0.7436485005479725,
"grad_norm": 0.23262020269941716,
"learning_rate": 6.261539157799912e-06,
"loss": 0.8256,
"step": 1866
},
{
"epoch": 0.744047026003786,
"grad_norm": 0.217504432107485,
"learning_rate": 6.243160387549534e-06,
"loss": 0.7919,
"step": 1867
},
{
"epoch": 0.7444455514595995,
"grad_norm": 0.22220778420283688,
"learning_rate": 6.224803639713138e-06,
"loss": 0.7531,
"step": 1868
},
{
"epoch": 0.744844076915413,
"grad_norm": 0.21437200486409036,
"learning_rate": 6.206468943676831e-06,
"loss": 0.7965,
"step": 1869
},
{
"epoch": 0.7452426023712264,
"grad_norm": 0.23487795253335572,
"learning_rate": 6.188156328791397e-06,
"loss": 0.8301,
"step": 1870
},
{
"epoch": 0.74564112782704,
"grad_norm": 0.21763886551801245,
"learning_rate": 6.169865824372314e-06,
"loss": 0.7875,
"step": 1871
},
{
"epoch": 0.7460396532828535,
"grad_norm": 0.22604818846373181,
"learning_rate": 6.151597459699621e-06,
"loss": 0.8054,
"step": 1872
},
{
"epoch": 0.7464381787386669,
"grad_norm": 0.21771303595209707,
"learning_rate": 6.133351264017939e-06,
"loss": 0.7735,
"step": 1873
},
{
"epoch": 0.7468367041944804,
"grad_norm": 0.21715354774157822,
"learning_rate": 6.115127266536403e-06,
"loss": 0.7762,
"step": 1874
},
{
"epoch": 0.7472352296502939,
"grad_norm": 0.2157960601894358,
"learning_rate": 6.0969254964285895e-06,
"loss": 0.8153,
"step": 1875
},
{
"epoch": 0.7476337551061074,
"grad_norm": 0.22332780451488388,
"learning_rate": 6.0787459828325166e-06,
"loss": 0.8143,
"step": 1876
},
{
"epoch": 0.748032280561921,
"grad_norm": 0.2309153231971099,
"learning_rate": 6.060588754850562e-06,
"loss": 0.7899,
"step": 1877
},
{
"epoch": 0.7484308060177344,
"grad_norm": 0.22898127613887323,
"learning_rate": 6.042453841549438e-06,
"loss": 0.8309,
"step": 1878
},
{
"epoch": 0.7488293314735479,
"grad_norm": 0.21931059736091962,
"learning_rate": 6.024341271960112e-06,
"loss": 0.7921,
"step": 1879
},
{
"epoch": 0.7492278569293613,
"grad_norm": 0.23434936881308505,
"learning_rate": 6.006251075077809e-06,
"loss": 0.7799,
"step": 1880
},
{
"epoch": 0.7496263823851749,
"grad_norm": 0.2372270380137871,
"learning_rate": 5.988183279861921e-06,
"loss": 0.7829,
"step": 1881
},
{
"epoch": 0.7500249078409883,
"grad_norm": 0.22942099098861327,
"learning_rate": 5.970137915235992e-06,
"loss": 0.7918,
"step": 1882
},
{
"epoch": 0.7504234332968018,
"grad_norm": 0.2355040611383991,
"learning_rate": 5.952115010087654e-06,
"loss": 0.835,
"step": 1883
},
{
"epoch": 0.7508219587526154,
"grad_norm": 0.2239708740237137,
"learning_rate": 5.934114593268572e-06,
"loss": 0.7781,
"step": 1884
},
{
"epoch": 0.7512204842084288,
"grad_norm": 0.21984896769317516,
"learning_rate": 5.916136693594434e-06,
"loss": 0.7862,
"step": 1885
},
{
"epoch": 0.7516190096642423,
"grad_norm": 0.2197233848994438,
"learning_rate": 5.898181339844858e-06,
"loss": 0.8147,
"step": 1886
},
{
"epoch": 0.7520175351200558,
"grad_norm": 0.21853538967964484,
"learning_rate": 5.880248560763384e-06,
"loss": 0.7897,
"step": 1887
},
{
"epoch": 0.7524160605758693,
"grad_norm": 0.2251548690545732,
"learning_rate": 5.862338385057416e-06,
"loss": 0.7984,
"step": 1888
},
{
"epoch": 0.7528145860316827,
"grad_norm": 0.21585033327673825,
"learning_rate": 5.844450841398166e-06,
"loss": 0.7953,
"step": 1889
},
{
"epoch": 0.7532131114874963,
"grad_norm": 0.22933572814422915,
"learning_rate": 5.826585958420625e-06,
"loss": 0.8006,
"step": 1890
},
{
"epoch": 0.7536116369433098,
"grad_norm": 0.22747479613099156,
"learning_rate": 5.80874376472349e-06,
"loss": 0.7598,
"step": 1891
},
{
"epoch": 0.7540101623991232,
"grad_norm": 0.21512314765889684,
"learning_rate": 5.790924288869162e-06,
"loss": 0.8148,
"step": 1892
},
{
"epoch": 0.7544086878549368,
"grad_norm": 0.33438808323630886,
"learning_rate": 5.773127559383638e-06,
"loss": 0.7554,
"step": 1893
},
{
"epoch": 0.7548072133107502,
"grad_norm": 0.22483670938682515,
"learning_rate": 5.755353604756544e-06,
"loss": 0.784,
"step": 1894
},
{
"epoch": 0.7552057387665637,
"grad_norm": 0.21592647946477764,
"learning_rate": 5.737602453441032e-06,
"loss": 0.7715,
"step": 1895
},
{
"epoch": 0.7556042642223773,
"grad_norm": 0.21691744670655036,
"learning_rate": 5.719874133853725e-06,
"loss": 0.7909,
"step": 1896
},
{
"epoch": 0.7560027896781907,
"grad_norm": 0.23150710281578893,
"learning_rate": 5.702168674374735e-06,
"loss": 0.7983,
"step": 1897
},
{
"epoch": 0.7564013151340042,
"grad_norm": 0.22053519786366013,
"learning_rate": 5.6844861033475466e-06,
"loss": 0.764,
"step": 1898
},
{
"epoch": 0.7567998405898176,
"grad_norm": 0.21199239099110317,
"learning_rate": 5.666826449079022e-06,
"loss": 0.7872,
"step": 1899
},
{
"epoch": 0.7571983660456312,
"grad_norm": 0.33723343359752794,
"learning_rate": 5.649189739839331e-06,
"loss": 0.8006,
"step": 1900
},
{
"epoch": 0.7575968915014446,
"grad_norm": 0.22529144997723208,
"learning_rate": 5.63157600386192e-06,
"loss": 0.8264,
"step": 1901
},
{
"epoch": 0.7579954169572581,
"grad_norm": 0.21629640216592316,
"learning_rate": 5.613985269343456e-06,
"loss": 0.7854,
"step": 1902
},
{
"epoch": 0.7583939424130717,
"grad_norm": 0.22311405638594484,
"learning_rate": 5.596417564443768e-06,
"loss": 0.7773,
"step": 1903
},
{
"epoch": 0.7587924678688851,
"grad_norm": 0.21547315103858006,
"learning_rate": 5.578872917285838e-06,
"loss": 0.7626,
"step": 1904
},
{
"epoch": 0.7591909933246986,
"grad_norm": 0.22382658871923508,
"learning_rate": 5.561351355955733e-06,
"loss": 0.8059,
"step": 1905
},
{
"epoch": 0.7595895187805121,
"grad_norm": 0.22341672646153143,
"learning_rate": 5.543852908502565e-06,
"loss": 0.7624,
"step": 1906
},
{
"epoch": 0.7599880442363256,
"grad_norm": 0.21972426758841143,
"learning_rate": 5.526377602938429e-06,
"loss": 0.8004,
"step": 1907
},
{
"epoch": 0.7603865696921391,
"grad_norm": 0.20999907442340116,
"learning_rate": 5.508925467238391e-06,
"loss": 0.7865,
"step": 1908
},
{
"epoch": 0.7607850951479526,
"grad_norm": 0.21874631069378098,
"learning_rate": 5.491496529340425e-06,
"loss": 0.782,
"step": 1909
},
{
"epoch": 0.7611836206037661,
"grad_norm": 0.2171739766459026,
"learning_rate": 5.474090817145352e-06,
"loss": 0.817,
"step": 1910
},
{
"epoch": 0.7615821460595795,
"grad_norm": 0.23395913286116207,
"learning_rate": 5.456708358516833e-06,
"loss": 0.7909,
"step": 1911
},
{
"epoch": 0.7619806715153931,
"grad_norm": 0.3909635390360292,
"learning_rate": 5.439349181281293e-06,
"loss": 0.783,
"step": 1912
},
{
"epoch": 0.7623791969712065,
"grad_norm": 0.21817745960660756,
"learning_rate": 5.422013313227896e-06,
"loss": 0.7968,
"step": 1913
},
{
"epoch": 0.76277772242702,
"grad_norm": 0.2129422416400334,
"learning_rate": 5.404700782108476e-06,
"loss": 0.7986,
"step": 1914
},
{
"epoch": 0.7631762478828336,
"grad_norm": 0.24894816442926734,
"learning_rate": 5.387411615637521e-06,
"loss": 0.7838,
"step": 1915
},
{
"epoch": 0.763574773338647,
"grad_norm": 0.21380121079954537,
"learning_rate": 5.370145841492116e-06,
"loss": 0.8042,
"step": 1916
},
{
"epoch": 0.7639732987944605,
"grad_norm": 0.2145326012345622,
"learning_rate": 5.352903487311893e-06,
"loss": 0.7684,
"step": 1917
},
{
"epoch": 0.764371824250274,
"grad_norm": 0.2238740099248399,
"learning_rate": 5.3356845806990054e-06,
"loss": 0.7789,
"step": 1918
},
{
"epoch": 0.7647703497060875,
"grad_norm": 0.23241336202019805,
"learning_rate": 5.318489149218047e-06,
"loss": 0.7955,
"step": 1919
},
{
"epoch": 0.765168875161901,
"grad_norm": 0.22274065294729253,
"learning_rate": 5.301317220396056e-06,
"loss": 0.7971,
"step": 1920
},
{
"epoch": 0.7655674006177144,
"grad_norm": 0.21450551669208287,
"learning_rate": 5.284168821722429e-06,
"loss": 0.8039,
"step": 1921
},
{
"epoch": 0.765965926073528,
"grad_norm": 0.22005637491103672,
"learning_rate": 5.267043980648905e-06,
"loss": 0.7785,
"step": 1922
},
{
"epoch": 0.7663644515293414,
"grad_norm": 0.21711685516462279,
"learning_rate": 5.249942724589508e-06,
"loss": 0.7748,
"step": 1923
},
{
"epoch": 0.7667629769851549,
"grad_norm": 0.21195870427677962,
"learning_rate": 5.23286508092051e-06,
"loss": 0.7791,
"step": 1924
},
{
"epoch": 0.7671615024409684,
"grad_norm": 0.2215540780948147,
"learning_rate": 5.215811076980384e-06,
"loss": 0.7867,
"step": 1925
},
{
"epoch": 0.7675600278967819,
"grad_norm": 0.2134811799235333,
"learning_rate": 5.1987807400697465e-06,
"loss": 0.8204,
"step": 1926
},
{
"epoch": 0.7679585533525954,
"grad_norm": 0.21126480142948123,
"learning_rate": 5.1817740974513394e-06,
"loss": 0.7744,
"step": 1927
},
{
"epoch": 0.7683570788084089,
"grad_norm": 0.21093921074309108,
"learning_rate": 5.164791176349975e-06,
"loss": 0.7804,
"step": 1928
},
{
"epoch": 0.7687556042642224,
"grad_norm": 0.22232833723691933,
"learning_rate": 5.147832003952482e-06,
"loss": 0.8122,
"step": 1929
},
{
"epoch": 0.7691541297200358,
"grad_norm": 0.21135760176592855,
"learning_rate": 5.130896607407689e-06,
"loss": 0.7837,
"step": 1930
},
{
"epoch": 0.7695526551758494,
"grad_norm": 0.21690410153487147,
"learning_rate": 5.113985013826337e-06,
"loss": 0.8333,
"step": 1931
},
{
"epoch": 0.7699511806316629,
"grad_norm": 0.22611226851018745,
"learning_rate": 5.097097250281089e-06,
"loss": 0.8336,
"step": 1932
},
{
"epoch": 0.7703497060874763,
"grad_norm": 0.21422680254932244,
"learning_rate": 5.080233343806435e-06,
"loss": 0.7925,
"step": 1933
},
{
"epoch": 0.7707482315432899,
"grad_norm": 0.21725411912202952,
"learning_rate": 5.063393321398693e-06,
"loss": 0.7682,
"step": 1934
},
{
"epoch": 0.7711467569991033,
"grad_norm": 0.20486094819815992,
"learning_rate": 5.046577210015941e-06,
"loss": 0.7698,
"step": 1935
},
{
"epoch": 0.7715452824549168,
"grad_norm": 0.21116949065534618,
"learning_rate": 5.029785036577976e-06,
"loss": 0.7839,
"step": 1936
},
{
"epoch": 0.7719438079107303,
"grad_norm": 0.21365660447596332,
"learning_rate": 5.013016827966289e-06,
"loss": 0.794,
"step": 1937
},
{
"epoch": 0.7723423333665438,
"grad_norm": 0.21986116163132582,
"learning_rate": 4.996272611023978e-06,
"loss": 0.8004,
"step": 1938
},
{
"epoch": 0.7727408588223573,
"grad_norm": 0.21667082564742637,
"learning_rate": 4.979552412555757e-06,
"loss": 0.7955,
"step": 1939
},
{
"epoch": 0.7731393842781707,
"grad_norm": 0.2131311718527391,
"learning_rate": 4.962856259327888e-06,
"loss": 0.8222,
"step": 1940
},
{
"epoch": 0.7735379097339843,
"grad_norm": 0.20312498370931167,
"learning_rate": 4.946184178068145e-06,
"loss": 0.7777,
"step": 1941
},
{
"epoch": 0.7739364351897977,
"grad_norm": 0.21157244173886958,
"learning_rate": 4.929536195465743e-06,
"loss": 0.7674,
"step": 1942
},
{
"epoch": 0.7743349606456112,
"grad_norm": 0.21401144119856197,
"learning_rate": 4.9129123381713426e-06,
"loss": 0.8245,
"step": 1943
},
{
"epoch": 0.7747334861014247,
"grad_norm": 0.21771908112415073,
"learning_rate": 4.8963126327969844e-06,
"loss": 0.8122,
"step": 1944
},
{
"epoch": 0.7751320115572382,
"grad_norm": 0.21187987139599745,
"learning_rate": 4.879737105916021e-06,
"loss": 0.8179,
"step": 1945
},
{
"epoch": 0.7755305370130517,
"grad_norm": 0.20845520286257718,
"learning_rate": 4.863185784063136e-06,
"loss": 0.7991,
"step": 1946
},
{
"epoch": 0.7759290624688652,
"grad_norm": 0.21881307944899714,
"learning_rate": 4.8466586937342315e-06,
"loss": 0.7715,
"step": 1947
},
{
"epoch": 0.7763275879246787,
"grad_norm": 0.22037508987905377,
"learning_rate": 4.830155861386441e-06,
"loss": 0.8178,
"step": 1948
},
{
"epoch": 0.7767261133804921,
"grad_norm": 0.2188466732998409,
"learning_rate": 4.813677313438045e-06,
"loss": 0.7931,
"step": 1949
},
{
"epoch": 0.7771246388363057,
"grad_norm": 0.22029271333920605,
"learning_rate": 4.7972230762684695e-06,
"loss": 0.7962,
"step": 1950
},
{
"epoch": 0.7775231642921192,
"grad_norm": 0.21586985458048003,
"learning_rate": 4.78079317621821e-06,
"loss": 0.8035,
"step": 1951
},
{
"epoch": 0.7779216897479326,
"grad_norm": 0.2122373168935699,
"learning_rate": 4.7643876395888076e-06,
"loss": 0.7668,
"step": 1952
},
{
"epoch": 0.7783202152037462,
"grad_norm": 0.20775917857186701,
"learning_rate": 4.748006492642805e-06,
"loss": 0.7786,
"step": 1953
},
{
"epoch": 0.7787187406595596,
"grad_norm": 0.21569140886208557,
"learning_rate": 4.731649761603685e-06,
"loss": 0.8067,
"step": 1954
},
{
"epoch": 0.7791172661153731,
"grad_norm": 0.2131646673455944,
"learning_rate": 4.715317472655863e-06,
"loss": 0.7971,
"step": 1955
},
{
"epoch": 0.7795157915711866,
"grad_norm": 0.2146175074423186,
"learning_rate": 4.699009651944622e-06,
"loss": 0.777,
"step": 1956
},
{
"epoch": 0.7799143170270001,
"grad_norm": 0.21312837734855186,
"learning_rate": 4.682726325576059e-06,
"loss": 0.7932,
"step": 1957
},
{
"epoch": 0.7803128424828136,
"grad_norm": 0.21781795703518547,
"learning_rate": 4.666467519617093e-06,
"loss": 0.8004,
"step": 1958
},
{
"epoch": 0.780711367938627,
"grad_norm": 0.21181093024914874,
"learning_rate": 4.650233260095354e-06,
"loss": 0.7586,
"step": 1959
},
{
"epoch": 0.7811098933944406,
"grad_norm": 0.21750201665933414,
"learning_rate": 4.634023572999207e-06,
"loss": 0.8103,
"step": 1960
},
{
"epoch": 0.781508418850254,
"grad_norm": 0.21261609028271256,
"learning_rate": 4.617838484277654e-06,
"loss": 0.7794,
"step": 1961
},
{
"epoch": 0.7819069443060676,
"grad_norm": 0.22127702762736784,
"learning_rate": 4.601678019840339e-06,
"loss": 0.824,
"step": 1962
},
{
"epoch": 0.7823054697618811,
"grad_norm": 0.21167895347901275,
"learning_rate": 4.585542205557478e-06,
"loss": 0.7872,
"step": 1963
},
{
"epoch": 0.7827039952176945,
"grad_norm": 0.20443014284749786,
"learning_rate": 4.569431067259828e-06,
"loss": 0.768,
"step": 1964
},
{
"epoch": 0.783102520673508,
"grad_norm": 0.21508398213351645,
"learning_rate": 4.553344630738654e-06,
"loss": 0.7972,
"step": 1965
},
{
"epoch": 0.7835010461293215,
"grad_norm": 0.21284922880197987,
"learning_rate": 4.5372829217456515e-06,
"loss": 0.7877,
"step": 1966
},
{
"epoch": 0.783899571585135,
"grad_norm": 0.21149964459483625,
"learning_rate": 4.5212459659929596e-06,
"loss": 0.8317,
"step": 1967
},
{
"epoch": 0.7842980970409484,
"grad_norm": 0.20959662240837698,
"learning_rate": 4.505233789153063e-06,
"loss": 0.7761,
"step": 1968
},
{
"epoch": 0.784696622496762,
"grad_norm": 0.21566004770178748,
"learning_rate": 4.489246416858814e-06,
"loss": 0.7787,
"step": 1969
},
{
"epoch": 0.7850951479525755,
"grad_norm": 0.20948032542954348,
"learning_rate": 4.473283874703336e-06,
"loss": 0.8001,
"step": 1970
},
{
"epoch": 0.7854936734083889,
"grad_norm": 0.21171612340758303,
"learning_rate": 4.457346188239997e-06,
"loss": 0.7846,
"step": 1971
},
{
"epoch": 0.7858921988642025,
"grad_norm": 0.211495224788516,
"learning_rate": 4.4414333829823944e-06,
"loss": 0.8205,
"step": 1972
},
{
"epoch": 0.7862907243200159,
"grad_norm": 0.21182971426196345,
"learning_rate": 4.425545484404272e-06,
"loss": 0.817,
"step": 1973
},
{
"epoch": 0.7866892497758294,
"grad_norm": 0.20652359587837626,
"learning_rate": 4.409682517939527e-06,
"loss": 0.7975,
"step": 1974
},
{
"epoch": 0.787087775231643,
"grad_norm": 0.2039383627589195,
"learning_rate": 4.393844508982124e-06,
"loss": 0.7934,
"step": 1975
},
{
"epoch": 0.7874863006874564,
"grad_norm": 0.20780785483145897,
"learning_rate": 4.3780314828860895e-06,
"loss": 0.7954,
"step": 1976
},
{
"epoch": 0.7878848261432699,
"grad_norm": 0.2072740025638685,
"learning_rate": 4.362243464965452e-06,
"loss": 0.7901,
"step": 1977
},
{
"epoch": 0.7882833515990834,
"grad_norm": 0.19867758615892187,
"learning_rate": 4.346480480494197e-06,
"loss": 0.7606,
"step": 1978
},
{
"epoch": 0.7886818770548969,
"grad_norm": 0.21773075945607415,
"learning_rate": 4.330742554706251e-06,
"loss": 0.8123,
"step": 1979
},
{
"epoch": 0.7890804025107103,
"grad_norm": 0.20266873734956298,
"learning_rate": 4.315029712795404e-06,
"loss": 0.799,
"step": 1980
},
{
"epoch": 0.7894789279665239,
"grad_norm": 0.20650482471845288,
"learning_rate": 4.299341979915324e-06,
"loss": 0.7972,
"step": 1981
},
{
"epoch": 0.7898774534223374,
"grad_norm": 0.20847406865766804,
"learning_rate": 4.283679381179449e-06,
"loss": 0.8187,
"step": 1982
},
{
"epoch": 0.7902759788781508,
"grad_norm": 0.2077737716719368,
"learning_rate": 4.268041941660998e-06,
"loss": 0.8032,
"step": 1983
},
{
"epoch": 0.7906745043339644,
"grad_norm": 0.20859031258363198,
"learning_rate": 4.252429686392927e-06,
"loss": 0.7706,
"step": 1984
},
{
"epoch": 0.7910730297897778,
"grad_norm": 0.20953564600107155,
"learning_rate": 4.236842640367844e-06,
"loss": 0.7902,
"step": 1985
},
{
"epoch": 0.7914715552455913,
"grad_norm": 0.1998647822957012,
"learning_rate": 4.221280828538028e-06,
"loss": 0.785,
"step": 1986
},
{
"epoch": 0.7918700807014049,
"grad_norm": 0.2109037742269456,
"learning_rate": 4.205744275815351e-06,
"loss": 0.788,
"step": 1987
},
{
"epoch": 0.7922686061572183,
"grad_norm": 0.3093393907121497,
"learning_rate": 4.19023300707126e-06,
"loss": 0.8089,
"step": 1988
},
{
"epoch": 0.7926671316130318,
"grad_norm": 0.21256297107207034,
"learning_rate": 4.174747047136707e-06,
"loss": 0.7745,
"step": 1989
},
{
"epoch": 0.7930656570688452,
"grad_norm": 0.5160365968905928,
"learning_rate": 4.159286420802144e-06,
"loss": 0.7948,
"step": 1990
},
{
"epoch": 0.7934641825246588,
"grad_norm": 0.21126289660765277,
"learning_rate": 4.1438511528174665e-06,
"loss": 0.7918,
"step": 1991
},
{
"epoch": 0.7938627079804722,
"grad_norm": 0.21794744648330014,
"learning_rate": 4.1284412678919715e-06,
"loss": 0.7843,
"step": 1992
},
{
"epoch": 0.7942612334362857,
"grad_norm": 0.20868906992268485,
"learning_rate": 4.11305679069433e-06,
"loss": 0.8017,
"step": 1993
},
{
"epoch": 0.7946597588920993,
"grad_norm": 0.21719069879632263,
"learning_rate": 4.097697745852522e-06,
"loss": 0.7973,
"step": 1994
},
{
"epoch": 0.7950582843479127,
"grad_norm": 0.21142187004817078,
"learning_rate": 4.08236415795384e-06,
"loss": 0.7814,
"step": 1995
},
{
"epoch": 0.7954568098037262,
"grad_norm": 0.2039420161311614,
"learning_rate": 4.067056051544793e-06,
"loss": 0.7889,
"step": 1996
},
{
"epoch": 0.7958553352595397,
"grad_norm": 0.24194928974109936,
"learning_rate": 4.051773451131127e-06,
"loss": 0.7682,
"step": 1997
},
{
"epoch": 0.7962538607153532,
"grad_norm": 0.2012545890604259,
"learning_rate": 4.036516381177742e-06,
"loss": 0.7782,
"step": 1998
},
{
"epoch": 0.7966523861711667,
"grad_norm": 0.20970642629605174,
"learning_rate": 4.02128486610867e-06,
"loss": 0.8223,
"step": 1999
},
{
"epoch": 0.7970509116269802,
"grad_norm": 0.20665659488141222,
"learning_rate": 4.006078930307043e-06,
"loss": 0.7812,
"step": 2000
},
{
"epoch": 0.7974494370827937,
"grad_norm": 0.21749421417588286,
"learning_rate": 3.9908985981150275e-06,
"loss": 0.7676,
"step": 2001
},
{
"epoch": 0.7978479625386071,
"grad_norm": 0.20888996451808617,
"learning_rate": 3.975743893833821e-06,
"loss": 0.8185,
"step": 2002
},
{
"epoch": 0.7982464879944207,
"grad_norm": 0.2704077080536192,
"learning_rate": 3.960614841723569e-06,
"loss": 0.7838,
"step": 2003
},
{
"epoch": 0.7986450134502341,
"grad_norm": 0.2088559508207916,
"learning_rate": 3.945511466003391e-06,
"loss": 0.8171,
"step": 2004
},
{
"epoch": 0.7990435389060476,
"grad_norm": 0.20661415959125704,
"learning_rate": 3.930433790851278e-06,
"loss": 0.7754,
"step": 2005
},
{
"epoch": 0.7994420643618612,
"grad_norm": 0.20701920533433565,
"learning_rate": 3.915381840404071e-06,
"loss": 0.7841,
"step": 2006
},
{
"epoch": 0.7998405898176746,
"grad_norm": 0.21927395552931095,
"learning_rate": 3.900355638757452e-06,
"loss": 0.8029,
"step": 2007
},
{
"epoch": 0.8002391152734881,
"grad_norm": 0.20280686560023278,
"learning_rate": 3.885355209965865e-06,
"loss": 0.7794,
"step": 2008
},
{
"epoch": 0.8006376407293015,
"grad_norm": 0.22037706389941072,
"learning_rate": 3.870380578042505e-06,
"loss": 0.8098,
"step": 2009
},
{
"epoch": 0.8010361661851151,
"grad_norm": 0.22041475186669696,
"learning_rate": 3.85543176695927e-06,
"loss": 0.803,
"step": 2010
},
{
"epoch": 0.8014346916409285,
"grad_norm": 0.20998177604491353,
"learning_rate": 3.840508800646725e-06,
"loss": 0.8175,
"step": 2011
},
{
"epoch": 0.801833217096742,
"grad_norm": 0.45165795643816325,
"learning_rate": 3.825611702994061e-06,
"loss": 0.8009,
"step": 2012
},
{
"epoch": 0.8022317425525556,
"grad_norm": 0.21072158850784894,
"learning_rate": 3.810740497849048e-06,
"loss": 0.7807,
"step": 2013
},
{
"epoch": 0.802630268008369,
"grad_norm": 0.2069068117921759,
"learning_rate": 3.7958952090180145e-06,
"loss": 0.8019,
"step": 2014
},
{
"epoch": 0.8030287934641825,
"grad_norm": 0.21068337260203102,
"learning_rate": 3.781075860265806e-06,
"loss": 0.7816,
"step": 2015
},
{
"epoch": 0.803427318919996,
"grad_norm": 0.21398934601155856,
"learning_rate": 3.766282475315741e-06,
"loss": 0.7638,
"step": 2016
},
{
"epoch": 0.8038258443758095,
"grad_norm": 0.20441959178687177,
"learning_rate": 3.7515150778495566e-06,
"loss": 0.806,
"step": 2017
},
{
"epoch": 0.804224369831623,
"grad_norm": 0.21249378504406466,
"learning_rate": 3.7367736915074116e-06,
"loss": 0.7552,
"step": 2018
},
{
"epoch": 0.8046228952874365,
"grad_norm": 0.20661783667193465,
"learning_rate": 3.7220583398878198e-06,
"loss": 0.7926,
"step": 2019
},
{
"epoch": 0.80502142074325,
"grad_norm": 0.2077752476136891,
"learning_rate": 3.7073690465475996e-06,
"loss": 0.8021,
"step": 2020
},
{
"epoch": 0.8054199461990634,
"grad_norm": 0.20570938011934367,
"learning_rate": 3.6927058350018774e-06,
"loss": 0.7833,
"step": 2021
},
{
"epoch": 0.805818471654877,
"grad_norm": 0.2068378623875997,
"learning_rate": 3.678068728724018e-06,
"loss": 0.7916,
"step": 2022
},
{
"epoch": 0.8062169971106904,
"grad_norm": 0.2108307060112381,
"learning_rate": 3.663457751145598e-06,
"loss": 0.8342,
"step": 2023
},
{
"epoch": 0.8066155225665039,
"grad_norm": 0.2078448862912843,
"learning_rate": 3.648872925656357e-06,
"loss": 0.7984,
"step": 2024
},
{
"epoch": 0.8070140480223175,
"grad_norm": 0.21028048335603441,
"learning_rate": 3.6343142756041804e-06,
"loss": 0.8018,
"step": 2025
},
{
"epoch": 0.8074125734781309,
"grad_norm": 0.20117720599120376,
"learning_rate": 3.61978182429505e-06,
"loss": 0.7707,
"step": 2026
},
{
"epoch": 0.8078110989339444,
"grad_norm": 0.20314858168527,
"learning_rate": 3.6052755949930028e-06,
"loss": 0.8014,
"step": 2027
},
{
"epoch": 0.8082096243897579,
"grad_norm": 0.20807347591232647,
"learning_rate": 3.590795610920106e-06,
"loss": 0.7783,
"step": 2028
},
{
"epoch": 0.8086081498455714,
"grad_norm": 0.20632811448011976,
"learning_rate": 3.5763418952563964e-06,
"loss": 0.7887,
"step": 2029
},
{
"epoch": 0.8090066753013849,
"grad_norm": 0.21490462809860467,
"learning_rate": 3.561914471139887e-06,
"loss": 0.7844,
"step": 2030
},
{
"epoch": 0.8094052007571984,
"grad_norm": 0.20507534096776664,
"learning_rate": 3.547513361666468e-06,
"loss": 0.7904,
"step": 2031
},
{
"epoch": 0.8098037262130119,
"grad_norm": 0.20644876557134534,
"learning_rate": 3.5331385898899286e-06,
"loss": 0.7691,
"step": 2032
},
{
"epoch": 0.8102022516688253,
"grad_norm": 0.21240998726372254,
"learning_rate": 3.5187901788219005e-06,
"loss": 0.8199,
"step": 2033
},
{
"epoch": 0.8106007771246388,
"grad_norm": 0.20137624296072554,
"learning_rate": 3.5044681514317923e-06,
"loss": 0.7814,
"step": 2034
},
{
"epoch": 0.8109993025804523,
"grad_norm": 0.2073451450199298,
"learning_rate": 3.4901725306467983e-06,
"loss": 0.7769,
"step": 2035
},
{
"epoch": 0.8113978280362658,
"grad_norm": 0.2134160597885788,
"learning_rate": 3.4759033393518227e-06,
"loss": 0.7811,
"step": 2036
},
{
"epoch": 0.8117963534920793,
"grad_norm": 0.20469419291818344,
"learning_rate": 3.461660600389476e-06,
"loss": 0.7819,
"step": 2037
},
{
"epoch": 0.8121948789478928,
"grad_norm": 0.20376860496093793,
"learning_rate": 3.447444336560013e-06,
"loss": 0.7816,
"step": 2038
},
{
"epoch": 0.8125934044037063,
"grad_norm": 0.41207208863994677,
"learning_rate": 3.4332545706213092e-06,
"loss": 0.7927,
"step": 2039
},
{
"epoch": 0.8129919298595197,
"grad_norm": 0.21507072465785926,
"learning_rate": 3.4190913252888304e-06,
"loss": 0.804,
"step": 2040
},
{
"epoch": 0.8133904553153333,
"grad_norm": 0.20319740876888007,
"learning_rate": 3.4049546232355677e-06,
"loss": 0.7874,
"step": 2041
},
{
"epoch": 0.8137889807711468,
"grad_norm": 0.20241224467511873,
"learning_rate": 3.3908444870920377e-06,
"loss": 0.7805,
"step": 2042
},
{
"epoch": 0.8141875062269602,
"grad_norm": 0.21466864150429207,
"learning_rate": 3.3767609394462177e-06,
"loss": 0.78,
"step": 2043
},
{
"epoch": 0.8145860316827738,
"grad_norm": 0.20218659511290218,
"learning_rate": 3.3627040028435266e-06,
"loss": 0.7801,
"step": 2044
},
{
"epoch": 0.8149845571385872,
"grad_norm": 0.213036870154348,
"learning_rate": 3.3486736997867973e-06,
"loss": 0.7824,
"step": 2045
},
{
"epoch": 0.8153830825944007,
"grad_norm": 0.19949805665039408,
"learning_rate": 3.3346700527361976e-06,
"loss": 0.7955,
"step": 2046
},
{
"epoch": 0.8157816080502142,
"grad_norm": 0.20680232683225422,
"learning_rate": 3.320693084109252e-06,
"loss": 0.7897,
"step": 2047
},
{
"epoch": 0.8161801335060277,
"grad_norm": 0.2000391282113421,
"learning_rate": 3.3067428162807524e-06,
"loss": 0.8005,
"step": 2048
},
{
"epoch": 0.8165786589618412,
"grad_norm": 0.2156772773776592,
"learning_rate": 3.2928192715827635e-06,
"loss": 0.8053,
"step": 2049
},
{
"epoch": 0.8169771844176547,
"grad_norm": 0.39867015204161727,
"learning_rate": 3.2789224723045688e-06,
"loss": 0.7969,
"step": 2050
},
{
"epoch": 0.8173757098734682,
"grad_norm": 0.2046620024871545,
"learning_rate": 3.265052440692633e-06,
"loss": 0.7926,
"step": 2051
},
{
"epoch": 0.8177742353292816,
"grad_norm": 0.2030808711787401,
"learning_rate": 3.2512091989505755e-06,
"loss": 0.7774,
"step": 2052
},
{
"epoch": 0.8181727607850952,
"grad_norm": 0.20949507249814342,
"learning_rate": 3.2373927692391183e-06,
"loss": 0.793,
"step": 2053
},
{
"epoch": 0.8185712862409087,
"grad_norm": 0.20772541980987708,
"learning_rate": 3.2236031736760775e-06,
"loss": 0.7726,
"step": 2054
},
{
"epoch": 0.8189698116967221,
"grad_norm": 0.21408416045479248,
"learning_rate": 3.209840434336291e-06,
"loss": 0.7794,
"step": 2055
},
{
"epoch": 0.8193683371525357,
"grad_norm": 0.22494235529547763,
"learning_rate": 3.196104573251633e-06,
"loss": 0.791,
"step": 2056
},
{
"epoch": 0.8197668626083491,
"grad_norm": 0.20454170412693226,
"learning_rate": 3.1823956124109245e-06,
"loss": 0.7862,
"step": 2057
},
{
"epoch": 0.8201653880641626,
"grad_norm": 0.20433874449012537,
"learning_rate": 3.168713573759934e-06,
"loss": 0.7666,
"step": 2058
},
{
"epoch": 0.820563913519976,
"grad_norm": 0.20661160157593184,
"learning_rate": 3.1550584792013384e-06,
"loss": 0.7433,
"step": 2059
},
{
"epoch": 0.8209624389757896,
"grad_norm": 0.20629809799285342,
"learning_rate": 3.1414303505946674e-06,
"loss": 0.7976,
"step": 2060
},
{
"epoch": 0.8213609644316031,
"grad_norm": 0.2144450649554419,
"learning_rate": 3.1278292097562902e-06,
"loss": 0.8333,
"step": 2061
},
{
"epoch": 0.8217594898874165,
"grad_norm": 0.20822166366362016,
"learning_rate": 3.1142550784593784e-06,
"loss": 0.8266,
"step": 2062
},
{
"epoch": 0.8221580153432301,
"grad_norm": 0.24188329998112856,
"learning_rate": 3.100707978433859e-06,
"loss": 0.7876,
"step": 2063
},
{
"epoch": 0.8225565407990435,
"grad_norm": 0.2048848180047204,
"learning_rate": 3.087187931366382e-06,
"loss": 0.7614,
"step": 2064
},
{
"epoch": 0.822955066254857,
"grad_norm": 0.20470377463967024,
"learning_rate": 3.0736949589003016e-06,
"loss": 0.7781,
"step": 2065
},
{
"epoch": 0.8233535917106706,
"grad_norm": 0.20987934787578208,
"learning_rate": 3.0602290826356264e-06,
"loss": 0.772,
"step": 2066
},
{
"epoch": 0.823752117166484,
"grad_norm": 0.2113936816052613,
"learning_rate": 3.046790324128972e-06,
"loss": 0.7872,
"step": 2067
},
{
"epoch": 0.8241506426222975,
"grad_norm": 0.19957043349861603,
"learning_rate": 3.0333787048935794e-06,
"loss": 0.7887,
"step": 2068
},
{
"epoch": 0.824549168078111,
"grad_norm": 0.3857301817498995,
"learning_rate": 3.019994246399205e-06,
"loss": 0.7882,
"step": 2069
},
{
"epoch": 0.8249476935339245,
"grad_norm": 0.20789973511441273,
"learning_rate": 3.006636970072152e-06,
"loss": 0.8076,
"step": 2070
},
{
"epoch": 0.8253462189897379,
"grad_norm": 0.2058835362862163,
"learning_rate": 2.993306897295194e-06,
"loss": 0.7764,
"step": 2071
},
{
"epoch": 0.8257447444455515,
"grad_norm": 0.20439869423777723,
"learning_rate": 2.980004049407561e-06,
"loss": 0.7764,
"step": 2072
},
{
"epoch": 0.826143269901365,
"grad_norm": 0.19876479503616204,
"learning_rate": 2.9667284477049075e-06,
"loss": 0.7826,
"step": 2073
},
{
"epoch": 0.8265417953571784,
"grad_norm": 0.1982699447253256,
"learning_rate": 2.9534801134392644e-06,
"loss": 0.7757,
"step": 2074
},
{
"epoch": 0.826940320812992,
"grad_norm": 0.20536270507053644,
"learning_rate": 2.9402590678190134e-06,
"loss": 0.7943,
"step": 2075
},
{
"epoch": 0.8273388462688054,
"grad_norm": 0.20479786214195925,
"learning_rate": 2.927065332008847e-06,
"loss": 0.796,
"step": 2076
},
{
"epoch": 0.8277373717246189,
"grad_norm": 0.204692054035632,
"learning_rate": 2.9138989271297525e-06,
"loss": 0.7757,
"step": 2077
},
{
"epoch": 0.8281358971804323,
"grad_norm": 0.2088750085892623,
"learning_rate": 2.900759874258938e-06,
"loss": 0.8125,
"step": 2078
},
{
"epoch": 0.8285344226362459,
"grad_norm": 0.2044102963337698,
"learning_rate": 2.887648194429862e-06,
"loss": 0.7641,
"step": 2079
},
{
"epoch": 0.8289329480920594,
"grad_norm": 0.21327563387382853,
"learning_rate": 2.874563908632142e-06,
"loss": 0.7994,
"step": 2080
},
{
"epoch": 0.8293314735478728,
"grad_norm": 0.2046570896223022,
"learning_rate": 2.8615070378115372e-06,
"loss": 0.8017,
"step": 2081
},
{
"epoch": 0.8297299990036864,
"grad_norm": 0.19812578410366266,
"learning_rate": 2.848477602869937e-06,
"loss": 0.784,
"step": 2082
},
{
"epoch": 0.8301285244594998,
"grad_norm": 0.20601688938227922,
"learning_rate": 2.8354756246652913e-06,
"loss": 0.769,
"step": 2083
},
{
"epoch": 0.8305270499153133,
"grad_norm": 0.2057354048825274,
"learning_rate": 2.822501124011612e-06,
"loss": 0.7847,
"step": 2084
},
{
"epoch": 0.8309255753711269,
"grad_norm": 0.21168604129063812,
"learning_rate": 2.809554121678917e-06,
"loss": 0.8032,
"step": 2085
},
{
"epoch": 0.8313241008269403,
"grad_norm": 0.2100939254517527,
"learning_rate": 2.7966346383932076e-06,
"loss": 0.7874,
"step": 2086
},
{
"epoch": 0.8317226262827538,
"grad_norm": 0.21934203978806813,
"learning_rate": 2.7837426948364334e-06,
"loss": 0.79,
"step": 2087
},
{
"epoch": 0.8321211517385673,
"grad_norm": 0.19759229839235726,
"learning_rate": 2.7708783116464435e-06,
"loss": 0.7655,
"step": 2088
},
{
"epoch": 0.8325196771943808,
"grad_norm": 0.2086778699301496,
"learning_rate": 2.7580415094169865e-06,
"loss": 0.7839,
"step": 2089
},
{
"epoch": 0.8329182026501942,
"grad_norm": 0.21338341723931933,
"learning_rate": 2.745232308697636e-06,
"loss": 0.829,
"step": 2090
},
{
"epoch": 0.8333167281060078,
"grad_norm": 0.21045174950788936,
"learning_rate": 2.732450729993814e-06,
"loss": 0.8096,
"step": 2091
},
{
"epoch": 0.8337152535618213,
"grad_norm": 0.2051766400490156,
"learning_rate": 2.7196967937666865e-06,
"loss": 0.8039,
"step": 2092
},
{
"epoch": 0.8341137790176347,
"grad_norm": 0.19510414251619265,
"learning_rate": 2.706970520433192e-06,
"loss": 0.7793,
"step": 2093
},
{
"epoch": 0.8345123044734483,
"grad_norm": 0.2023242681129976,
"learning_rate": 2.6942719303659837e-06,
"loss": 0.781,
"step": 2094
},
{
"epoch": 0.8349108299292617,
"grad_norm": 0.2030427501132859,
"learning_rate": 2.681601043893387e-06,
"loss": 0.781,
"step": 2095
},
{
"epoch": 0.8353093553850752,
"grad_norm": 0.20888874667008847,
"learning_rate": 2.6689578812993857e-06,
"loss": 0.7694,
"step": 2096
},
{
"epoch": 0.8357078808408888,
"grad_norm": 0.20077367736979854,
"learning_rate": 2.6563424628235845e-06,
"loss": 0.7848,
"step": 2097
},
{
"epoch": 0.8361064062967022,
"grad_norm": 0.21005110509053168,
"learning_rate": 2.6437548086611765e-06,
"loss": 0.7988,
"step": 2098
},
{
"epoch": 0.8365049317525157,
"grad_norm": 0.19800915015594286,
"learning_rate": 2.6311949389628956e-06,
"loss": 0.8021,
"step": 2099
},
{
"epoch": 0.8369034572083291,
"grad_norm": 0.20692630086537173,
"learning_rate": 2.618662873835007e-06,
"loss": 0.796,
"step": 2100
},
{
"epoch": 0.8373019826641427,
"grad_norm": 0.20999876285414867,
"learning_rate": 2.6061586333392684e-06,
"loss": 0.8025,
"step": 2101
},
{
"epoch": 0.8377005081199561,
"grad_norm": 0.20623308075487845,
"learning_rate": 2.5936822374928894e-06,
"loss": 0.7815,
"step": 2102
},
{
"epoch": 0.8380990335757696,
"grad_norm": 0.205638179543828,
"learning_rate": 2.581233706268509e-06,
"loss": 0.802,
"step": 2103
},
{
"epoch": 0.8384975590315832,
"grad_norm": 0.19752040584951092,
"learning_rate": 2.5688130595941486e-06,
"loss": 0.7556,
"step": 2104
},
{
"epoch": 0.8388960844873966,
"grad_norm": 0.20069625765475899,
"learning_rate": 2.55642031735321e-06,
"loss": 0.7889,
"step": 2105
},
{
"epoch": 0.8392946099432101,
"grad_norm": 0.2018781461121737,
"learning_rate": 2.544055499384406e-06,
"loss": 0.8142,
"step": 2106
},
{
"epoch": 0.8396931353990236,
"grad_norm": 0.19475379047238844,
"learning_rate": 2.5317186254817538e-06,
"loss": 0.7663,
"step": 2107
},
{
"epoch": 0.8400916608548371,
"grad_norm": 0.1969342228912807,
"learning_rate": 2.519409715394545e-06,
"loss": 0.7938,
"step": 2108
},
{
"epoch": 0.8404901863106506,
"grad_norm": 0.19895944903191795,
"learning_rate": 2.5071287888272953e-06,
"loss": 0.8051,
"step": 2109
},
{
"epoch": 0.8408887117664641,
"grad_norm": 0.20042877149823382,
"learning_rate": 2.4948758654397342e-06,
"loss": 0.7833,
"step": 2110
},
{
"epoch": 0.8412872372222776,
"grad_norm": 0.19887545472768395,
"learning_rate": 2.4826509648467424e-06,
"loss": 0.7742,
"step": 2111
},
{
"epoch": 0.841685762678091,
"grad_norm": 0.2011722070087204,
"learning_rate": 2.470454106618363e-06,
"loss": 0.7857,
"step": 2112
},
{
"epoch": 0.8420842881339046,
"grad_norm": 0.20180297794597085,
"learning_rate": 2.458285310279738e-06,
"loss": 0.7997,
"step": 2113
},
{
"epoch": 0.842482813589718,
"grad_norm": 0.20055121230743078,
"learning_rate": 2.4461445953110862e-06,
"loss": 0.8014,
"step": 2114
},
{
"epoch": 0.8428813390455315,
"grad_norm": 0.19868315248272878,
"learning_rate": 2.43403198114768e-06,
"loss": 0.774,
"step": 2115
},
{
"epoch": 0.8432798645013451,
"grad_norm": 0.19770045553158802,
"learning_rate": 2.4219474871797942e-06,
"loss": 0.7856,
"step": 2116
},
{
"epoch": 0.8436783899571585,
"grad_norm": 0.20259006469350982,
"learning_rate": 2.409891132752702e-06,
"loss": 0.8102,
"step": 2117
},
{
"epoch": 0.844076915412972,
"grad_norm": 0.2013541403832189,
"learning_rate": 2.3978629371666174e-06,
"loss": 0.7853,
"step": 2118
},
{
"epoch": 0.8444754408687855,
"grad_norm": 0.20033442757315134,
"learning_rate": 2.3858629196766846e-06,
"loss": 0.7877,
"step": 2119
},
{
"epoch": 0.844873966324599,
"grad_norm": 0.21068432536317944,
"learning_rate": 2.3738910994929353e-06,
"loss": 0.766,
"step": 2120
},
{
"epoch": 0.8452724917804125,
"grad_norm": 0.1980119004076494,
"learning_rate": 2.36194749578027e-06,
"loss": 0.7731,
"step": 2121
},
{
"epoch": 0.845671017236226,
"grad_norm": 0.19889954520717595,
"learning_rate": 2.3500321276584103e-06,
"loss": 0.796,
"step": 2122
},
{
"epoch": 0.8460695426920395,
"grad_norm": 0.29416894294679846,
"learning_rate": 2.338145014201878e-06,
"loss": 0.8096,
"step": 2123
},
{
"epoch": 0.8464680681478529,
"grad_norm": 0.19806318324832906,
"learning_rate": 2.326286174439969e-06,
"loss": 0.7997,
"step": 2124
},
{
"epoch": 0.8468665936036665,
"grad_norm": 0.19823684897235574,
"learning_rate": 2.3144556273567132e-06,
"loss": 0.7607,
"step": 2125
},
{
"epoch": 0.8472651190594799,
"grad_norm": 0.18966161568344858,
"learning_rate": 2.30265339189085e-06,
"loss": 0.7804,
"step": 2126
},
{
"epoch": 0.8476636445152934,
"grad_norm": 0.19521990516259677,
"learning_rate": 2.2908794869358044e-06,
"loss": 0.7648,
"step": 2127
},
{
"epoch": 0.848062169971107,
"grad_norm": 0.21019481820981523,
"learning_rate": 2.27913393133963e-06,
"loss": 0.801,
"step": 2128
},
{
"epoch": 0.8484606954269204,
"grad_norm": 0.2044393443918899,
"learning_rate": 2.267416743905018e-06,
"loss": 0.7998,
"step": 2129
},
{
"epoch": 0.8488592208827339,
"grad_norm": 0.1983161340871745,
"learning_rate": 2.255727943389232e-06,
"loss": 0.7829,
"step": 2130
},
{
"epoch": 0.8492577463385473,
"grad_norm": 0.3883686062566025,
"learning_rate": 2.244067548504101e-06,
"loss": 0.7689,
"step": 2131
},
{
"epoch": 0.8496562717943609,
"grad_norm": 0.19823170694060893,
"learning_rate": 2.232435577915981e-06,
"loss": 0.7841,
"step": 2132
},
{
"epoch": 0.8500547972501744,
"grad_norm": 0.2011348839077823,
"learning_rate": 2.2208320502457247e-06,
"loss": 0.7743,
"step": 2133
},
{
"epoch": 0.8504533227059878,
"grad_norm": 0.2678986826453042,
"learning_rate": 2.209256984068653e-06,
"loss": 0.8186,
"step": 2134
},
{
"epoch": 0.8508518481618014,
"grad_norm": 0.38901312200457155,
"learning_rate": 2.1977103979145144e-06,
"loss": 0.7873,
"step": 2135
},
{
"epoch": 0.8512503736176148,
"grad_norm": 0.19801665808383853,
"learning_rate": 2.186192310267481e-06,
"loss": 0.7962,
"step": 2136
},
{
"epoch": 0.8516488990734283,
"grad_norm": 0.19959353534388102,
"learning_rate": 2.174702739566097e-06,
"loss": 0.7875,
"step": 2137
},
{
"epoch": 0.8520474245292418,
"grad_norm": 0.19906997852364527,
"learning_rate": 2.1632417042032582e-06,
"loss": 0.799,
"step": 2138
},
{
"epoch": 0.8524459499850553,
"grad_norm": 0.19383785374266083,
"learning_rate": 2.151809222526171e-06,
"loss": 0.8012,
"step": 2139
},
{
"epoch": 0.8528444754408688,
"grad_norm": 0.20008791840830747,
"learning_rate": 2.140405312836342e-06,
"loss": 0.8034,
"step": 2140
},
{
"epoch": 0.8532430008966823,
"grad_norm": 0.5550294238933178,
"learning_rate": 2.1290299933895375e-06,
"loss": 0.8056,
"step": 2141
},
{
"epoch": 0.8536415263524958,
"grad_norm": 0.19867486415459287,
"learning_rate": 2.1176832823957437e-06,
"loss": 0.7777,
"step": 2142
},
{
"epoch": 0.8540400518083092,
"grad_norm": 0.19676333190679646,
"learning_rate": 2.1063651980191735e-06,
"loss": 0.7915,
"step": 2143
},
{
"epoch": 0.8544385772641228,
"grad_norm": 0.1989409125958559,
"learning_rate": 2.095075758378191e-06,
"loss": 0.8095,
"step": 2144
},
{
"epoch": 0.8548371027199362,
"grad_norm": 0.21328576722717954,
"learning_rate": 2.083814981545316e-06,
"loss": 0.8003,
"step": 2145
},
{
"epoch": 0.8552356281757497,
"grad_norm": 0.20295493914625967,
"learning_rate": 2.0725828855471743e-06,
"loss": 0.8048,
"step": 2146
},
{
"epoch": 0.8556341536315633,
"grad_norm": 0.2074806852443234,
"learning_rate": 2.06137948836449e-06,
"loss": 0.8056,
"step": 2147
},
{
"epoch": 0.8560326790873767,
"grad_norm": 0.1970460127714032,
"learning_rate": 2.0502048079320412e-06,
"loss": 0.7719,
"step": 2148
},
{
"epoch": 0.8564312045431902,
"grad_norm": 0.20135572980918695,
"learning_rate": 2.03905886213863e-06,
"loss": 0.8124,
"step": 2149
},
{
"epoch": 0.8568297299990036,
"grad_norm": 0.19706602719348762,
"learning_rate": 2.0279416688270714e-06,
"loss": 0.8042,
"step": 2150
},
{
"epoch": 0.8572282554548172,
"grad_norm": 0.19351017765851636,
"learning_rate": 2.0168532457941347e-06,
"loss": 0.7817,
"step": 2151
},
{
"epoch": 0.8576267809106307,
"grad_norm": 0.19662641436265876,
"learning_rate": 2.0057936107905496e-06,
"loss": 0.7872,
"step": 2152
},
{
"epoch": 0.8580253063664441,
"grad_norm": 0.19472713717233617,
"learning_rate": 1.994762781520947e-06,
"loss": 0.7959,
"step": 2153
},
{
"epoch": 0.8584238318222577,
"grad_norm": 0.4466872234199686,
"learning_rate": 1.9837607756438506e-06,
"loss": 0.7957,
"step": 2154
},
{
"epoch": 0.8588223572780711,
"grad_norm": 0.19598069824689382,
"learning_rate": 1.972787610771656e-06,
"loss": 0.7728,
"step": 2155
},
{
"epoch": 0.8592208827338846,
"grad_norm": 0.20101685010301282,
"learning_rate": 1.9618433044705653e-06,
"loss": 0.7943,
"step": 2156
},
{
"epoch": 0.8596194081896981,
"grad_norm": 0.298341423595395,
"learning_rate": 1.9509278742605998e-06,
"loss": 0.8152,
"step": 2157
},
{
"epoch": 0.8600179336455116,
"grad_norm": 0.19641318468760852,
"learning_rate": 1.9400413376155414e-06,
"loss": 0.7718,
"step": 2158
},
{
"epoch": 0.8604164591013251,
"grad_norm": 0.20359959382775875,
"learning_rate": 1.929183711962932e-06,
"loss": 0.8166,
"step": 2159
},
{
"epoch": 0.8608149845571386,
"grad_norm": 0.29285934932172486,
"learning_rate": 1.918355014684026e-06,
"loss": 0.8116,
"step": 2160
},
{
"epoch": 0.8612135100129521,
"grad_norm": 0.20081004118069398,
"learning_rate": 1.9075552631137673e-06,
"loss": 0.828,
"step": 2161
},
{
"epoch": 0.8616120354687655,
"grad_norm": 0.19491684359283115,
"learning_rate": 1.8967844745407649e-06,
"loss": 0.8162,
"step": 2162
},
{
"epoch": 0.8620105609245791,
"grad_norm": 0.19931801177242742,
"learning_rate": 1.8860426662072573e-06,
"loss": 0.7646,
"step": 2163
},
{
"epoch": 0.8624090863803926,
"grad_norm": 0.19469429796070387,
"learning_rate": 1.8753298553091004e-06,
"loss": 0.7662,
"step": 2164
},
{
"epoch": 0.862807611836206,
"grad_norm": 0.19523553415875863,
"learning_rate": 1.8646460589957138e-06,
"loss": 0.7675,
"step": 2165
},
{
"epoch": 0.8632061372920196,
"grad_norm": 0.19836255092500826,
"learning_rate": 1.8539912943700921e-06,
"loss": 0.8162,
"step": 2166
},
{
"epoch": 0.863604662747833,
"grad_norm": 0.33046612241829804,
"learning_rate": 1.8433655784887338e-06,
"loss": 0.786,
"step": 2167
},
{
"epoch": 0.8640031882036465,
"grad_norm": 0.20287140254104755,
"learning_rate": 1.832768928361648e-06,
"loss": 0.8033,
"step": 2168
},
{
"epoch": 0.86440171365946,
"grad_norm": 0.19837142562234192,
"learning_rate": 1.8222013609523138e-06,
"loss": 0.7856,
"step": 2169
},
{
"epoch": 0.8648002391152735,
"grad_norm": 0.21103666545418504,
"learning_rate": 1.8116628931776437e-06,
"loss": 0.8434,
"step": 2170
},
{
"epoch": 0.865198764571087,
"grad_norm": 0.19867703712237042,
"learning_rate": 1.801153541907974e-06,
"loss": 0.7698,
"step": 2171
},
{
"epoch": 0.8655972900269004,
"grad_norm": 0.19825876352724692,
"learning_rate": 1.7906733239670338e-06,
"loss": 0.772,
"step": 2172
},
{
"epoch": 0.865995815482714,
"grad_norm": 0.20878459364682986,
"learning_rate": 1.7802222561319116e-06,
"loss": 0.7581,
"step": 2173
},
{
"epoch": 0.8663943409385274,
"grad_norm": 0.2958038314902087,
"learning_rate": 1.7698003551330222e-06,
"loss": 0.7944,
"step": 2174
},
{
"epoch": 0.8667928663943409,
"grad_norm": 0.20169391290837302,
"learning_rate": 1.7594076376541025e-06,
"loss": 0.8066,
"step": 2175
},
{
"epoch": 0.8671913918501545,
"grad_norm": 0.234034044100227,
"learning_rate": 1.749044120332164e-06,
"loss": 0.7721,
"step": 2176
},
{
"epoch": 0.8675899173059679,
"grad_norm": 0.2034910419905341,
"learning_rate": 1.7387098197574782e-06,
"loss": 0.8084,
"step": 2177
},
{
"epoch": 0.8679884427617814,
"grad_norm": 0.2073685879363281,
"learning_rate": 1.7284047524735426e-06,
"loss": 0.7925,
"step": 2178
},
{
"epoch": 0.8683869682175949,
"grad_norm": 0.20037230019907548,
"learning_rate": 1.7181289349770547e-06,
"loss": 0.7811,
"step": 2179
},
{
"epoch": 0.8687854936734084,
"grad_norm": 0.21712284699454534,
"learning_rate": 1.707882383717896e-06,
"loss": 0.7678,
"step": 2180
},
{
"epoch": 0.8691840191292218,
"grad_norm": 0.20117180870370702,
"learning_rate": 1.697665115099083e-06,
"loss": 0.7942,
"step": 2181
},
{
"epoch": 0.8695825445850354,
"grad_norm": 0.194101573652863,
"learning_rate": 1.6874771454767723e-06,
"loss": 0.7824,
"step": 2182
},
{
"epoch": 0.8699810700408489,
"grad_norm": 0.19921324707773355,
"learning_rate": 1.677318491160207e-06,
"loss": 0.7928,
"step": 2183
},
{
"epoch": 0.8703795954966623,
"grad_norm": 0.3229505296718228,
"learning_rate": 1.6671891684117048e-06,
"loss": 0.827,
"step": 2184
},
{
"epoch": 0.8707781209524759,
"grad_norm": 0.19497337244902666,
"learning_rate": 1.6570891934466304e-06,
"loss": 0.8059,
"step": 2185
},
{
"epoch": 0.8711766464082893,
"grad_norm": 0.19561470121792823,
"learning_rate": 1.6470185824333617e-06,
"loss": 0.7976,
"step": 2186
},
{
"epoch": 0.8715751718641028,
"grad_norm": 0.1969078670974646,
"learning_rate": 1.6369773514932786e-06,
"loss": 0.7653,
"step": 2187
},
{
"epoch": 0.8719736973199164,
"grad_norm": 0.19792267780479758,
"learning_rate": 1.6269655167007136e-06,
"loss": 0.7824,
"step": 2188
},
{
"epoch": 0.8723722227757298,
"grad_norm": 0.19510256307880908,
"learning_rate": 1.6169830940829578e-06,
"loss": 0.8068,
"step": 2189
},
{
"epoch": 0.8727707482315433,
"grad_norm": 0.1960870054521117,
"learning_rate": 1.6070300996202126e-06,
"loss": 0.7989,
"step": 2190
},
{
"epoch": 0.8731692736873568,
"grad_norm": 0.19085815051372912,
"learning_rate": 1.5971065492455617e-06,
"loss": 0.7636,
"step": 2191
},
{
"epoch": 0.8735677991431703,
"grad_norm": 0.19882296691960544,
"learning_rate": 1.5872124588449667e-06,
"loss": 0.7659,
"step": 2192
},
{
"epoch": 0.8739663245989837,
"grad_norm": 0.20028837148412157,
"learning_rate": 1.5773478442572154e-06,
"loss": 0.7934,
"step": 2193
},
{
"epoch": 0.8743648500547972,
"grad_norm": 0.19461902920242444,
"learning_rate": 1.5675127212739183e-06,
"loss": 0.7905,
"step": 2194
},
{
"epoch": 0.8747633755106108,
"grad_norm": 0.2016751952111212,
"learning_rate": 1.5577071056394743e-06,
"loss": 0.7862,
"step": 2195
},
{
"epoch": 0.8751619009664242,
"grad_norm": 0.19602147097639658,
"learning_rate": 1.5479310130510428e-06,
"loss": 0.7845,
"step": 2196
},
{
"epoch": 0.8755604264222377,
"grad_norm": 0.19583338180249446,
"learning_rate": 1.5381844591585294e-06,
"loss": 0.7957,
"step": 2197
},
{
"epoch": 0.8759589518780512,
"grad_norm": 0.19403020064241092,
"learning_rate": 1.5284674595645376e-06,
"loss": 0.7963,
"step": 2198
},
{
"epoch": 0.8763574773338647,
"grad_norm": 0.19782089212017984,
"learning_rate": 1.518780029824376e-06,
"loss": 0.7782,
"step": 2199
},
{
"epoch": 0.8767560027896782,
"grad_norm": 0.19942984981212644,
"learning_rate": 1.5091221854460037e-06,
"loss": 0.7975,
"step": 2200
},
{
"epoch": 0.8771545282454917,
"grad_norm": 0.19196702353727593,
"learning_rate": 1.4994939418900334e-06,
"loss": 0.7829,
"step": 2201
},
{
"epoch": 0.8775530537013052,
"grad_norm": 0.19379377172825363,
"learning_rate": 1.4898953145696738e-06,
"loss": 0.7982,
"step": 2202
},
{
"epoch": 0.8779515791571186,
"grad_norm": 0.19506234613903994,
"learning_rate": 1.4803263188507377e-06,
"loss": 0.7954,
"step": 2203
},
{
"epoch": 0.8783501046129322,
"grad_norm": 0.1978506554262955,
"learning_rate": 1.4707869700515965e-06,
"loss": 0.784,
"step": 2204
},
{
"epoch": 0.8787486300687456,
"grad_norm": 0.1980098585833247,
"learning_rate": 1.4612772834431566e-06,
"loss": 0.7569,
"step": 2205
},
{
"epoch": 0.8791471555245591,
"grad_norm": 0.19286242098132406,
"learning_rate": 1.4517972742488518e-06,
"loss": 0.7872,
"step": 2206
},
{
"epoch": 0.8795456809803727,
"grad_norm": 0.19098749250411995,
"learning_rate": 1.4423469576446002e-06,
"loss": 0.7815,
"step": 2207
},
{
"epoch": 0.8799442064361861,
"grad_norm": 0.20211925019195784,
"learning_rate": 1.4329263487587896e-06,
"loss": 0.8205,
"step": 2208
},
{
"epoch": 0.8803427318919996,
"grad_norm": 0.19532927186278154,
"learning_rate": 1.4235354626722431e-06,
"loss": 0.8121,
"step": 2209
},
{
"epoch": 0.8807412573478131,
"grad_norm": 0.1977750810931428,
"learning_rate": 1.4141743144182153e-06,
"loss": 0.7813,
"step": 2210
},
{
"epoch": 0.8811397828036266,
"grad_norm": 0.19358648033690376,
"learning_rate": 1.4048429189823432e-06,
"loss": 0.7455,
"step": 2211
},
{
"epoch": 0.88153830825944,
"grad_norm": 0.19846194328922676,
"learning_rate": 1.3955412913026468e-06,
"loss": 0.7662,
"step": 2212
},
{
"epoch": 0.8819368337152536,
"grad_norm": 0.19353205579063595,
"learning_rate": 1.3862694462694836e-06,
"loss": 0.7835,
"step": 2213
},
{
"epoch": 0.8823353591710671,
"grad_norm": 0.1961760649090444,
"learning_rate": 1.3770273987255322e-06,
"loss": 0.7869,
"step": 2214
},
{
"epoch": 0.8827338846268805,
"grad_norm": 0.198917812531222,
"learning_rate": 1.36781516346578e-06,
"loss": 0.7903,
"step": 2215
},
{
"epoch": 0.883132410082694,
"grad_norm": 0.31377432050732995,
"learning_rate": 1.3586327552374834e-06,
"loss": 0.7966,
"step": 2216
},
{
"epoch": 0.8835309355385075,
"grad_norm": 0.198947487765649,
"learning_rate": 1.349480188740151e-06,
"loss": 0.7845,
"step": 2217
},
{
"epoch": 0.883929460994321,
"grad_norm": 0.19609086834502595,
"learning_rate": 1.3403574786255203e-06,
"loss": 0.8267,
"step": 2218
},
{
"epoch": 0.8843279864501346,
"grad_norm": 0.19456541239424982,
"learning_rate": 1.3312646394975336e-06,
"loss": 0.7844,
"step": 2219
},
{
"epoch": 0.884726511905948,
"grad_norm": 0.18969287146965966,
"learning_rate": 1.322201685912321e-06,
"loss": 0.7561,
"step": 2220
},
{
"epoch": 0.8851250373617615,
"grad_norm": 0.19860314043543428,
"learning_rate": 1.3131686323781567e-06,
"loss": 0.7827,
"step": 2221
},
{
"epoch": 0.8855235628175749,
"grad_norm": 0.19669097960151344,
"learning_rate": 1.3041654933554627e-06,
"loss": 0.8035,
"step": 2222
},
{
"epoch": 0.8859220882733885,
"grad_norm": 0.20094073004540627,
"learning_rate": 1.2951922832567676e-06,
"loss": 0.7944,
"step": 2223
},
{
"epoch": 0.8863206137292019,
"grad_norm": 0.20272095445679028,
"learning_rate": 1.28624901644669e-06,
"loss": 0.8167,
"step": 2224
},
{
"epoch": 0.8867191391850154,
"grad_norm": 0.1953963230612544,
"learning_rate": 1.2773357072419156e-06,
"loss": 0.7721,
"step": 2225
},
{
"epoch": 0.887117664640829,
"grad_norm": 0.19553209878909217,
"learning_rate": 1.2684523699111683e-06,
"loss": 0.7898,
"step": 2226
},
{
"epoch": 0.8875161900966424,
"grad_norm": 0.19739783007158812,
"learning_rate": 1.259599018675197e-06,
"loss": 0.7751,
"step": 2227
},
{
"epoch": 0.8879147155524559,
"grad_norm": 0.2004207680549029,
"learning_rate": 1.2507756677067407e-06,
"loss": 0.7937,
"step": 2228
},
{
"epoch": 0.8883132410082694,
"grad_norm": 0.20274609576106925,
"learning_rate": 1.241982331130518e-06,
"loss": 0.7834,
"step": 2229
},
{
"epoch": 0.8887117664640829,
"grad_norm": 0.191410521331001,
"learning_rate": 1.233219023023211e-06,
"loss": 0.7964,
"step": 2230
},
{
"epoch": 0.8891102919198964,
"grad_norm": 0.19149409444639265,
"learning_rate": 1.2244857574134073e-06,
"loss": 0.8145,
"step": 2231
},
{
"epoch": 0.8895088173757099,
"grad_norm": 0.18942106902230516,
"learning_rate": 1.215782548281621e-06,
"loss": 0.7978,
"step": 2232
},
{
"epoch": 0.8899073428315234,
"grad_norm": 0.1936628231256215,
"learning_rate": 1.2071094095602388e-06,
"loss": 0.7688,
"step": 2233
},
{
"epoch": 0.8903058682873368,
"grad_norm": 0.19500787909123946,
"learning_rate": 1.198466355133514e-06,
"loss": 0.7985,
"step": 2234
},
{
"epoch": 0.8907043937431504,
"grad_norm": 0.19336207767259123,
"learning_rate": 1.1898533988375438e-06,
"loss": 0.7776,
"step": 2235
},
{
"epoch": 0.8911029191989638,
"grad_norm": 0.19306715682597733,
"learning_rate": 1.1812705544602387e-06,
"loss": 0.7781,
"step": 2236
},
{
"epoch": 0.8915014446547773,
"grad_norm": 0.1941966927070721,
"learning_rate": 1.1727178357413082e-06,
"loss": 0.7966,
"step": 2237
},
{
"epoch": 0.8918999701105909,
"grad_norm": 0.19470860439438434,
"learning_rate": 1.1641952563722292e-06,
"loss": 0.7875,
"step": 2238
},
{
"epoch": 0.8922984955664043,
"grad_norm": 0.19439172234266613,
"learning_rate": 1.155702829996239e-06,
"loss": 0.7949,
"step": 2239
},
{
"epoch": 0.8926970210222178,
"grad_norm": 0.19183529464923693,
"learning_rate": 1.1472405702082966e-06,
"loss": 0.8169,
"step": 2240
},
{
"epoch": 0.8930955464780312,
"grad_norm": 0.187219800602026,
"learning_rate": 1.1388084905550767e-06,
"loss": 0.7913,
"step": 2241
},
{
"epoch": 0.8934940719338448,
"grad_norm": 0.19725379366605064,
"learning_rate": 1.1304066045349371e-06,
"loss": 0.7759,
"step": 2242
},
{
"epoch": 0.8938925973896583,
"grad_norm": 0.1992594593840214,
"learning_rate": 1.1220349255978991e-06,
"loss": 0.8375,
"step": 2243
},
{
"epoch": 0.8942911228454717,
"grad_norm": 0.192342482643918,
"learning_rate": 1.1136934671456356e-06,
"loss": 0.7732,
"step": 2244
},
{
"epoch": 0.8946896483012853,
"grad_norm": 0.19446382141607246,
"learning_rate": 1.1053822425314253e-06,
"loss": 0.7787,
"step": 2245
},
{
"epoch": 0.8950881737570987,
"grad_norm": 0.21096981672144022,
"learning_rate": 1.0971012650601653e-06,
"loss": 0.7856,
"step": 2246
},
{
"epoch": 0.8954866992129122,
"grad_norm": 0.19037453116007338,
"learning_rate": 1.0888505479883226e-06,
"loss": 0.8141,
"step": 2247
},
{
"epoch": 0.8958852246687257,
"grad_norm": 0.1936745233440335,
"learning_rate": 1.0806301045239253e-06,
"loss": 0.776,
"step": 2248
},
{
"epoch": 0.8962837501245392,
"grad_norm": 0.19445932650167486,
"learning_rate": 1.0724399478265312e-06,
"loss": 0.7968,
"step": 2249
},
{
"epoch": 0.8966822755803527,
"grad_norm": 0.1942260109414643,
"learning_rate": 1.064280091007226e-06,
"loss": 0.7982,
"step": 2250
},
{
"epoch": 0.8970808010361662,
"grad_norm": 0.19599907378500261,
"learning_rate": 1.056150547128585e-06,
"loss": 0.7812,
"step": 2251
},
{
"epoch": 0.8974793264919797,
"grad_norm": 0.18888785669949568,
"learning_rate": 1.048051329204649e-06,
"loss": 0.7749,
"step": 2252
},
{
"epoch": 0.8978778519477931,
"grad_norm": 0.19413389947923068,
"learning_rate": 1.0399824502009292e-06,
"loss": 0.817,
"step": 2253
},
{
"epoch": 0.8982763774036067,
"grad_norm": 0.19041901167362632,
"learning_rate": 1.0319439230343552e-06,
"loss": 0.7829,
"step": 2254
},
{
"epoch": 0.8986749028594202,
"grad_norm": 0.190265615798965,
"learning_rate": 1.023935760573278e-06,
"loss": 0.7854,
"step": 2255
},
{
"epoch": 0.8990734283152336,
"grad_norm": 0.1917924700076846,
"learning_rate": 1.0159579756374272e-06,
"loss": 0.8021,
"step": 2256
},
{
"epoch": 0.8994719537710472,
"grad_norm": 0.19462841904809697,
"learning_rate": 1.0080105809979134e-06,
"loss": 0.7983,
"step": 2257
},
{
"epoch": 0.8998704792268606,
"grad_norm": 0.19572994397974086,
"learning_rate": 1.0000935893771957e-06,
"loss": 0.7807,
"step": 2258
},
{
"epoch": 0.9002690046826741,
"grad_norm": 0.19368930137185603,
"learning_rate": 9.922070134490625e-07,
"loss": 0.8069,
"step": 2259
},
{
"epoch": 0.9006675301384875,
"grad_norm": 0.18858216628151148,
"learning_rate": 9.843508658386147e-07,
"loss": 0.778,
"step": 2260
},
{
"epoch": 0.9010660555943011,
"grad_norm": 0.1902121814138829,
"learning_rate": 9.765251591222302e-07,
"loss": 0.7545,
"step": 2261
},
{
"epoch": 0.9014645810501146,
"grad_norm": 0.19207716877501332,
"learning_rate": 9.687299058275723e-07,
"loss": 0.8013,
"step": 2262
},
{
"epoch": 0.901863106505928,
"grad_norm": 0.19334913879349405,
"learning_rate": 9.609651184335389e-07,
"loss": 0.7946,
"step": 2263
},
{
"epoch": 0.9022616319617416,
"grad_norm": 0.19358676874591074,
"learning_rate": 9.532308093702691e-07,
"loss": 0.7772,
"step": 2264
},
{
"epoch": 0.902660157417555,
"grad_norm": 0.19148428383209684,
"learning_rate": 9.455269910191101e-07,
"loss": 0.7696,
"step": 2265
},
{
"epoch": 0.9030586828733685,
"grad_norm": 0.19540552907978265,
"learning_rate": 9.378536757125878e-07,
"loss": 0.8139,
"step": 2266
},
{
"epoch": 0.903457208329182,
"grad_norm": 0.19107073335621758,
"learning_rate": 9.302108757344119e-07,
"loss": 0.7858,
"step": 2267
},
{
"epoch": 0.9038557337849955,
"grad_norm": 0.1918963920445226,
"learning_rate": 9.225986033194268e-07,
"loss": 0.7788,
"step": 2268
},
{
"epoch": 0.904254259240809,
"grad_norm": 0.19310448866238283,
"learning_rate": 9.150168706536178e-07,
"loss": 0.7866,
"step": 2269
},
{
"epoch": 0.9046527846966225,
"grad_norm": 0.19687413534571704,
"learning_rate": 9.07465689874083e-07,
"loss": 0.7893,
"step": 2270
},
{
"epoch": 0.905051310152436,
"grad_norm": 0.1914042946404483,
"learning_rate": 8.99945073069004e-07,
"loss": 0.7748,
"step": 2271
},
{
"epoch": 0.9054498356082494,
"grad_norm": 0.2024204777517844,
"learning_rate": 8.924550322776415e-07,
"loss": 0.8568,
"step": 2272
},
{
"epoch": 0.905848361064063,
"grad_norm": 0.19403659491993944,
"learning_rate": 8.849955794903042e-07,
"loss": 0.8056,
"step": 2273
},
{
"epoch": 0.9062468865198765,
"grad_norm": 0.19411389381810215,
"learning_rate": 8.775667266483378e-07,
"loss": 0.7911,
"step": 2274
},
{
"epoch": 0.9066454119756899,
"grad_norm": 0.1924715710067694,
"learning_rate": 8.70168485644094e-07,
"loss": 0.7965,
"step": 2275
},
{
"epoch": 0.9070439374315035,
"grad_norm": 0.20038568330574344,
"learning_rate": 8.628008683209388e-07,
"loss": 0.7843,
"step": 2276
},
{
"epoch": 0.9074424628873169,
"grad_norm": 0.20132103527197703,
"learning_rate": 8.554638864731957e-07,
"loss": 0.7999,
"step": 2277
},
{
"epoch": 0.9078409883431304,
"grad_norm": 0.19240880279129838,
"learning_rate": 8.481575518461538e-07,
"loss": 0.7665,
"step": 2278
},
{
"epoch": 0.9082395137989439,
"grad_norm": 0.19434784566980481,
"learning_rate": 8.408818761360437e-07,
"loss": 0.8056,
"step": 2279
},
{
"epoch": 0.9086380392547574,
"grad_norm": 0.1978390018533812,
"learning_rate": 8.336368709900089e-07,
"loss": 0.8144,
"step": 2280
},
{
"epoch": 0.9090365647105709,
"grad_norm": 0.19566833800627478,
"learning_rate": 8.264225480061028e-07,
"loss": 0.7771,
"step": 2281
},
{
"epoch": 0.9094350901663844,
"grad_norm": 0.1975826569993677,
"learning_rate": 8.192389187332539e-07,
"loss": 0.7938,
"step": 2282
},
{
"epoch": 0.9098336156221979,
"grad_norm": 0.19045983399236568,
"learning_rate": 8.120859946712634e-07,
"loss": 0.7845,
"step": 2283
},
{
"epoch": 0.9102321410780113,
"grad_norm": 0.19130128975193195,
"learning_rate": 8.049637872707672e-07,
"loss": 0.7958,
"step": 2284
},
{
"epoch": 0.9106306665338249,
"grad_norm": 0.19085596321752288,
"learning_rate": 7.978723079332406e-07,
"loss": 0.7612,
"step": 2285
},
{
"epoch": 0.9110291919896384,
"grad_norm": 0.33424704454608156,
"learning_rate": 7.908115680109629e-07,
"loss": 0.7853,
"step": 2286
},
{
"epoch": 0.9114277174454518,
"grad_norm": 0.1954738492496232,
"learning_rate": 7.837815788070035e-07,
"loss": 0.8041,
"step": 2287
},
{
"epoch": 0.9118262429012653,
"grad_norm": 0.19475975240294963,
"learning_rate": 7.767823515752116e-07,
"loss": 0.7872,
"step": 2288
},
{
"epoch": 0.9122247683570788,
"grad_norm": 0.18960360869197374,
"learning_rate": 7.698138975201819e-07,
"loss": 0.8041,
"step": 2289
},
{
"epoch": 0.9126232938128923,
"grad_norm": 0.19589521054226136,
"learning_rate": 7.628762277972534e-07,
"loss": 0.7982,
"step": 2290
},
{
"epoch": 0.9130218192687057,
"grad_norm": 0.1917631141189516,
"learning_rate": 7.559693535124802e-07,
"loss": 0.7938,
"step": 2291
},
{
"epoch": 0.9134203447245193,
"grad_norm": 0.19253740493505767,
"learning_rate": 7.490932857226219e-07,
"loss": 0.7959,
"step": 2292
},
{
"epoch": 0.9138188701803328,
"grad_norm": 0.19582622703851235,
"learning_rate": 7.422480354351202e-07,
"loss": 0.834,
"step": 2293
},
{
"epoch": 0.9142173956361462,
"grad_norm": 0.18995947588533355,
"learning_rate": 7.354336136080809e-07,
"loss": 0.7762,
"step": 2294
},
{
"epoch": 0.9146159210919598,
"grad_norm": 0.18806413991915635,
"learning_rate": 7.286500311502686e-07,
"loss": 0.797,
"step": 2295
},
{
"epoch": 0.9150144465477732,
"grad_norm": 0.19277211114688542,
"learning_rate": 7.218972989210616e-07,
"loss": 0.7763,
"step": 2296
},
{
"epoch": 0.9154129720035867,
"grad_norm": 0.19199075944716948,
"learning_rate": 7.151754277304657e-07,
"loss": 0.7568,
"step": 2297
},
{
"epoch": 0.9158114974594003,
"grad_norm": 0.19072158788713017,
"learning_rate": 7.084844283390823e-07,
"loss": 0.7915,
"step": 2298
},
{
"epoch": 0.9162100229152137,
"grad_norm": 0.19205282392375037,
"learning_rate": 7.018243114580858e-07,
"loss": 0.8034,
"step": 2299
},
{
"epoch": 0.9166085483710272,
"grad_norm": 0.2052923264205816,
"learning_rate": 6.951950877492209e-07,
"loss": 0.7857,
"step": 2300
},
{
"epoch": 0.9170070738268407,
"grad_norm": 0.22779157975014266,
"learning_rate": 6.885967678247652e-07,
"loss": 0.756,
"step": 2301
},
{
"epoch": 0.9174055992826542,
"grad_norm": 0.18774142177297953,
"learning_rate": 6.820293622475427e-07,
"loss": 0.7857,
"step": 2302
},
{
"epoch": 0.9178041247384676,
"grad_norm": 0.19498696530660528,
"learning_rate": 6.754928815308703e-07,
"loss": 0.7991,
"step": 2303
},
{
"epoch": 0.9182026501942812,
"grad_norm": 0.19209043390951142,
"learning_rate": 6.689873361385691e-07,
"loss": 0.8101,
"step": 2304
},
{
"epoch": 0.9186011756500947,
"grad_norm": 0.19290885228459345,
"learning_rate": 6.625127364849371e-07,
"loss": 0.7955,
"step": 2305
},
{
"epoch": 0.9189997011059081,
"grad_norm": 0.1877743297868329,
"learning_rate": 6.560690929347324e-07,
"loss": 0.7844,
"step": 2306
},
{
"epoch": 0.9193982265617217,
"grad_norm": 0.19214675198757558,
"learning_rate": 6.49656415803157e-07,
"loss": 0.7903,
"step": 2307
},
{
"epoch": 0.9197967520175351,
"grad_norm": 0.19219057718417967,
"learning_rate": 6.432747153558416e-07,
"loss": 0.7761,
"step": 2308
},
{
"epoch": 0.9201952774733486,
"grad_norm": 0.18838660622383804,
"learning_rate": 6.369240018088297e-07,
"loss": 0.7947,
"step": 2309
},
{
"epoch": 0.9205938029291622,
"grad_norm": 0.1886108613905356,
"learning_rate": 6.306042853285532e-07,
"loss": 0.7813,
"step": 2310
},
{
"epoch": 0.9209923283849756,
"grad_norm": 0.1925293191301323,
"learning_rate": 6.243155760318332e-07,
"loss": 0.7982,
"step": 2311
},
{
"epoch": 0.9213908538407891,
"grad_norm": 0.19530492377194633,
"learning_rate": 6.180578839858475e-07,
"loss": 0.7885,
"step": 2312
},
{
"epoch": 0.9217893792966025,
"grad_norm": 0.18651121519218392,
"learning_rate": 6.118312192081166e-07,
"loss": 0.7949,
"step": 2313
},
{
"epoch": 0.9221879047524161,
"grad_norm": 0.19338094401034905,
"learning_rate": 6.056355916665024e-07,
"loss": 0.7717,
"step": 2314
},
{
"epoch": 0.9225864302082295,
"grad_norm": 0.18423336706407692,
"learning_rate": 5.994710112791713e-07,
"loss": 0.7811,
"step": 2315
},
{
"epoch": 0.922984955664043,
"grad_norm": 0.18939928604114048,
"learning_rate": 5.933374879145893e-07,
"loss": 0.7755,
"step": 2316
},
{
"epoch": 0.9233834811198566,
"grad_norm": 0.1926905369480336,
"learning_rate": 5.872350313915131e-07,
"loss": 0.8114,
"step": 2317
},
{
"epoch": 0.92378200657567,
"grad_norm": 0.19646582733405174,
"learning_rate": 5.811636514789598e-07,
"loss": 0.7871,
"step": 2318
},
{
"epoch": 0.9241805320314835,
"grad_norm": 0.19298296374648816,
"learning_rate": 5.75123357896199e-07,
"loss": 0.8039,
"step": 2319
},
{
"epoch": 0.924579057487297,
"grad_norm": 0.190458055205602,
"learning_rate": 5.691141603127381e-07,
"loss": 0.7835,
"step": 2320
},
{
"epoch": 0.9249775829431105,
"grad_norm": 0.1915360703578091,
"learning_rate": 5.631360683483001e-07,
"loss": 0.8234,
"step": 2321
},
{
"epoch": 0.925376108398924,
"grad_norm": 0.23747828168438873,
"learning_rate": 5.571890915728206e-07,
"loss": 0.79,
"step": 2322
},
{
"epoch": 0.9257746338547375,
"grad_norm": 0.19153058747182247,
"learning_rate": 5.512732395064224e-07,
"loss": 0.7649,
"step": 2323
},
{
"epoch": 0.926173159310551,
"grad_norm": 0.19727254538457217,
"learning_rate": 5.453885216193988e-07,
"loss": 0.8349,
"step": 2324
},
{
"epoch": 0.9265716847663644,
"grad_norm": 0.1951429962580588,
"learning_rate": 5.395349473322032e-07,
"loss": 0.7978,
"step": 2325
},
{
"epoch": 0.926970210222178,
"grad_norm": 0.18510338682179783,
"learning_rate": 5.337125260154397e-07,
"loss": 0.7777,
"step": 2326
},
{
"epoch": 0.9273687356779914,
"grad_norm": 0.1946540136821385,
"learning_rate": 5.279212669898326e-07,
"loss": 0.8047,
"step": 2327
},
{
"epoch": 0.9277672611338049,
"grad_norm": 0.18491969397571634,
"learning_rate": 5.221611795262283e-07,
"loss": 0.7573,
"step": 2328
},
{
"epoch": 0.9281657865896185,
"grad_norm": 0.19372900242022098,
"learning_rate": 5.164322728455684e-07,
"loss": 0.8202,
"step": 2329
},
{
"epoch": 0.9285643120454319,
"grad_norm": 0.19227025991711344,
"learning_rate": 5.107345561188836e-07,
"loss": 0.7805,
"step": 2330
},
{
"epoch": 0.9289628375012454,
"grad_norm": 0.1884216178497241,
"learning_rate": 5.050680384672668e-07,
"loss": 0.7911,
"step": 2331
},
{
"epoch": 0.9293613629570588,
"grad_norm": 0.19064898796693053,
"learning_rate": 4.994327289618728e-07,
"loss": 0.8286,
"step": 2332
},
{
"epoch": 0.9297598884128724,
"grad_norm": 0.19168131739896943,
"learning_rate": 4.938286366238942e-07,
"loss": 0.7741,
"step": 2333
},
{
"epoch": 0.9301584138686858,
"grad_norm": 0.19521237858027687,
"learning_rate": 4.88255770424555e-07,
"loss": 0.806,
"step": 2334
},
{
"epoch": 0.9305569393244993,
"grad_norm": 0.19197627577848786,
"learning_rate": 4.827141392850876e-07,
"loss": 0.7898,
"step": 2335
},
{
"epoch": 0.9309554647803129,
"grad_norm": 0.19415977793697126,
"learning_rate": 4.772037520767181e-07,
"loss": 0.7764,
"step": 2336
},
{
"epoch": 0.9313539902361263,
"grad_norm": 0.18652836321806102,
"learning_rate": 4.7172461762066356e-07,
"loss": 0.8058,
"step": 2337
},
{
"epoch": 0.9317525156919398,
"grad_norm": 0.1894930310367945,
"learning_rate": 4.662767446881078e-07,
"loss": 0.7747,
"step": 2338
},
{
"epoch": 0.9321510411477533,
"grad_norm": 0.19105060903289703,
"learning_rate": 4.6086014200018793e-07,
"loss": 0.7969,
"step": 2339
},
{
"epoch": 0.9325495666035668,
"grad_norm": 0.18939524754458784,
"learning_rate": 4.5547481822799e-07,
"loss": 0.775,
"step": 2340
},
{
"epoch": 0.9329480920593803,
"grad_norm": 0.18595385386063937,
"learning_rate": 4.5012078199251576e-07,
"loss": 0.7898,
"step": 2341
},
{
"epoch": 0.9333466175151938,
"grad_norm": 0.19033445395137963,
"learning_rate": 4.4479804186469353e-07,
"loss": 0.7734,
"step": 2342
},
{
"epoch": 0.9337451429710073,
"grad_norm": 0.18559439244342524,
"learning_rate": 4.3950660636534084e-07,
"loss": 0.7788,
"step": 2343
},
{
"epoch": 0.9341436684268207,
"grad_norm": 0.20079834918402295,
"learning_rate": 4.342464839651661e-07,
"loss": 0.8214,
"step": 2344
},
{
"epoch": 0.9345421938826343,
"grad_norm": 0.1912887265583529,
"learning_rate": 4.290176830847559e-07,
"loss": 0.7846,
"step": 2345
},
{
"epoch": 0.9349407193384477,
"grad_norm": 0.18960450357073091,
"learning_rate": 4.238202120945478e-07,
"loss": 0.7669,
"step": 2346
},
{
"epoch": 0.9353392447942612,
"grad_norm": 0.19153974072758262,
"learning_rate": 4.186540793148308e-07,
"loss": 0.812,
"step": 2347
},
{
"epoch": 0.9357377702500748,
"grad_norm": 0.18970776862717967,
"learning_rate": 4.13519293015725e-07,
"loss": 0.8019,
"step": 2348
},
{
"epoch": 0.9361362957058882,
"grad_norm": 0.18839937884535282,
"learning_rate": 4.084158614171685e-07,
"loss": 0.7991,
"step": 2349
},
{
"epoch": 0.9365348211617017,
"grad_norm": 0.19300125889679948,
"learning_rate": 4.033437926889061e-07,
"loss": 0.7821,
"step": 2350
},
{
"epoch": 0.9369333466175152,
"grad_norm": 0.18898379977540675,
"learning_rate": 3.983030949504829e-07,
"loss": 0.7919,
"step": 2351
},
{
"epoch": 0.9373318720733287,
"grad_norm": 0.19196900837664088,
"learning_rate": 3.932937762712108e-07,
"loss": 0.7896,
"step": 2352
},
{
"epoch": 0.9377303975291422,
"grad_norm": 0.19925612955481922,
"learning_rate": 3.883158446701796e-07,
"loss": 0.8139,
"step": 2353
},
{
"epoch": 0.9381289229849556,
"grad_norm": 0.18497982043923966,
"learning_rate": 3.833693081162326e-07,
"loss": 0.805,
"step": 2354
},
{
"epoch": 0.9385274484407692,
"grad_norm": 0.19343018564847927,
"learning_rate": 3.784541745279491e-07,
"loss": 0.7965,
"step": 2355
},
{
"epoch": 0.9389259738965826,
"grad_norm": 0.2122663845906148,
"learning_rate": 3.735704517736438e-07,
"loss": 0.7731,
"step": 2356
},
{
"epoch": 0.9393244993523961,
"grad_norm": 0.19304292990391148,
"learning_rate": 3.6871814767134305e-07,
"loss": 0.7985,
"step": 2357
},
{
"epoch": 0.9397230248082096,
"grad_norm": 0.18892205141585297,
"learning_rate": 3.638972699887822e-07,
"loss": 0.8119,
"step": 2358
},
{
"epoch": 0.9401215502640231,
"grad_norm": 0.1913297127895265,
"learning_rate": 3.5910782644338336e-07,
"loss": 0.7902,
"step": 2359
},
{
"epoch": 0.9405200757198366,
"grad_norm": 0.19033240742898452,
"learning_rate": 3.543498247022492e-07,
"loss": 0.7575,
"step": 2360
},
{
"epoch": 0.9409186011756501,
"grad_norm": 0.18332766514040255,
"learning_rate": 3.4962327238215134e-07,
"loss": 0.7598,
"step": 2361
},
{
"epoch": 0.9413171266314636,
"grad_norm": 0.18186761156092401,
"learning_rate": 3.449281770495105e-07,
"loss": 0.7943,
"step": 2362
},
{
"epoch": 0.941715652087277,
"grad_norm": 0.1965958953954756,
"learning_rate": 3.402645462204013e-07,
"loss": 0.8086,
"step": 2363
},
{
"epoch": 0.9421141775430906,
"grad_norm": 0.19135591876413088,
"learning_rate": 3.3563238736051604e-07,
"loss": 0.804,
"step": 2364
},
{
"epoch": 0.9425127029989041,
"grad_norm": 0.18843239133027376,
"learning_rate": 3.310317078851744e-07,
"loss": 0.7751,
"step": 2365
},
{
"epoch": 0.9429112284547175,
"grad_norm": 0.1944972137264629,
"learning_rate": 3.2646251515929597e-07,
"loss": 0.7862,
"step": 2366
},
{
"epoch": 0.9433097539105311,
"grad_norm": 0.19037760409725837,
"learning_rate": 3.2192481649740095e-07,
"loss": 0.8166,
"step": 2367
},
{
"epoch": 0.9437082793663445,
"grad_norm": 0.1871623371191181,
"learning_rate": 3.1741861916359193e-07,
"loss": 0.7655,
"step": 2368
},
{
"epoch": 0.944106804822158,
"grad_norm": 0.18764789979300736,
"learning_rate": 3.129439303715387e-07,
"loss": 0.7942,
"step": 2369
},
{
"epoch": 0.9445053302779715,
"grad_norm": 0.18934527512136454,
"learning_rate": 3.0850075728448e-07,
"loss": 0.8114,
"step": 2370
},
{
"epoch": 0.944903855733785,
"grad_norm": 0.18732805849733797,
"learning_rate": 3.0408910701519303e-07,
"loss": 0.783,
"step": 2371
},
{
"epoch": 0.9453023811895985,
"grad_norm": 0.20572141307841002,
"learning_rate": 2.997089866259972e-07,
"loss": 0.8062,
"step": 2372
},
{
"epoch": 0.945700906645412,
"grad_norm": 0.19247170516758175,
"learning_rate": 2.953604031287349e-07,
"loss": 0.8098,
"step": 2373
},
{
"epoch": 0.9460994321012255,
"grad_norm": 0.18598311478233595,
"learning_rate": 2.910433634847709e-07,
"loss": 0.7549,
"step": 2374
},
{
"epoch": 0.9464979575570389,
"grad_norm": 0.18855187031366835,
"learning_rate": 2.8675787460496816e-07,
"loss": 0.7688,
"step": 2375
},
{
"epoch": 0.9468964830128525,
"grad_norm": 0.18656543479412127,
"learning_rate": 2.8250394334967903e-07,
"loss": 0.7844,
"step": 2376
},
{
"epoch": 0.947295008468666,
"grad_norm": 0.1987614281014416,
"learning_rate": 2.7828157652874054e-07,
"loss": 0.7873,
"step": 2377
},
{
"epoch": 0.9476935339244794,
"grad_norm": 0.18946005010606964,
"learning_rate": 2.7409078090146144e-07,
"loss": 0.7919,
"step": 2378
},
{
"epoch": 0.948092059380293,
"grad_norm": 0.18729259602203205,
"learning_rate": 2.699315631766064e-07,
"loss": 0.7906,
"step": 2379
},
{
"epoch": 0.9484905848361064,
"grad_norm": 0.21584635145950695,
"learning_rate": 2.6580393001239604e-07,
"loss": 0.7525,
"step": 2380
},
{
"epoch": 0.9488891102919199,
"grad_norm": 0.19301764265768684,
"learning_rate": 2.617078880164825e-07,
"loss": 0.796,
"step": 2381
},
{
"epoch": 0.9492876357477333,
"grad_norm": 0.18867210144130342,
"learning_rate": 2.5764344374595187e-07,
"loss": 0.8082,
"step": 2382
},
{
"epoch": 0.9496861612035469,
"grad_norm": 0.18458535962402378,
"learning_rate": 2.5361060370729715e-07,
"loss": 0.7828,
"step": 2383
},
{
"epoch": 0.9500846866593604,
"grad_norm": 0.18940087074242587,
"learning_rate": 2.496093743564321e-07,
"loss": 0.7912,
"step": 2384
},
{
"epoch": 0.9504832121151738,
"grad_norm": 0.1967469512602545,
"learning_rate": 2.4563976209865504e-07,
"loss": 0.795,
"step": 2385
},
{
"epoch": 0.9508817375709874,
"grad_norm": 0.18106777816661615,
"learning_rate": 2.417017732886562e-07,
"loss": 0.7606,
"step": 2386
},
{
"epoch": 0.9512802630268008,
"grad_norm": 0.19029595392071927,
"learning_rate": 2.377954142305039e-07,
"loss": 0.7953,
"step": 2387
},
{
"epoch": 0.9516787884826143,
"grad_norm": 0.1920867643492066,
"learning_rate": 2.3392069117762706e-07,
"loss": 0.7959,
"step": 2388
},
{
"epoch": 0.9520773139384279,
"grad_norm": 0.18771665475461194,
"learning_rate": 2.300776103328173e-07,
"loss": 0.7736,
"step": 2389
},
{
"epoch": 0.9524758393942413,
"grad_norm": 0.18980640789999415,
"learning_rate": 2.2626617784820225e-07,
"loss": 0.7606,
"step": 2390
},
{
"epoch": 0.9528743648500548,
"grad_norm": 0.18734395867405335,
"learning_rate": 2.2248639982525688e-07,
"loss": 0.7989,
"step": 2391
},
{
"epoch": 0.9532728903058683,
"grad_norm": 0.2160018240089795,
"learning_rate": 2.1873828231477433e-07,
"loss": 0.7957,
"step": 2392
},
{
"epoch": 0.9536714157616818,
"grad_norm": 0.1868607933405286,
"learning_rate": 2.150218313168706e-07,
"loss": 0.8183,
"step": 2393
},
{
"epoch": 0.9540699412174952,
"grad_norm": 0.18834121472126297,
"learning_rate": 2.113370527809644e-07,
"loss": 0.7748,
"step": 2394
},
{
"epoch": 0.9544684666733088,
"grad_norm": 0.1866371434060772,
"learning_rate": 2.07683952605775e-07,
"loss": 0.7682,
"step": 2395
},
{
"epoch": 0.9548669921291223,
"grad_norm": 0.19477979464068296,
"learning_rate": 2.0406253663930675e-07,
"loss": 0.7962,
"step": 2396
},
{
"epoch": 0.9552655175849357,
"grad_norm": 0.18710243595192597,
"learning_rate": 2.0047281067884672e-07,
"loss": 0.7971,
"step": 2397
},
{
"epoch": 0.9556640430407493,
"grad_norm": 0.18958183027461045,
"learning_rate": 1.9691478047094924e-07,
"loss": 0.7851,
"step": 2398
},
{
"epoch": 0.9560625684965627,
"grad_norm": 0.18583596188293008,
"learning_rate": 1.9338845171142928e-07,
"loss": 0.7729,
"step": 2399
},
{
"epoch": 0.9564610939523762,
"grad_norm": 0.1846233484812463,
"learning_rate": 1.8989383004535121e-07,
"loss": 0.7797,
"step": 2400
},
{
"epoch": 0.9568596194081896,
"grad_norm": 0.18844365183443165,
"learning_rate": 1.86430921067029e-07,
"loss": 0.7869,
"step": 2401
},
{
"epoch": 0.9572581448640032,
"grad_norm": 0.1928283675611628,
"learning_rate": 1.8299973031999707e-07,
"loss": 0.8196,
"step": 2402
},
{
"epoch": 0.9576566703198167,
"grad_norm": 0.1871302583547213,
"learning_rate": 1.7960026329702618e-07,
"loss": 0.7688,
"step": 2403
},
{
"epoch": 0.9580551957756301,
"grad_norm": 0.1819281786572131,
"learning_rate": 1.762325254400965e-07,
"loss": 0.7745,
"step": 2404
},
{
"epoch": 0.9584537212314437,
"grad_norm": 0.18580491353142833,
"learning_rate": 1.7289652214039775e-07,
"loss": 0.7688,
"step": 2405
},
{
"epoch": 0.9588522466872571,
"grad_norm": 0.18495610694488546,
"learning_rate": 1.6959225873831586e-07,
"loss": 0.7863,
"step": 2406
},
{
"epoch": 0.9592507721430706,
"grad_norm": 0.18788375628377132,
"learning_rate": 1.6631974052342846e-07,
"loss": 0.7826,
"step": 2407
},
{
"epoch": 0.9596492975988842,
"grad_norm": 0.18917657214611222,
"learning_rate": 1.6307897273449168e-07,
"loss": 0.7734,
"step": 2408
},
{
"epoch": 0.9600478230546976,
"grad_norm": 0.18759506045359045,
"learning_rate": 1.5986996055943781e-07,
"loss": 0.7992,
"step": 2409
},
{
"epoch": 0.9604463485105111,
"grad_norm": 0.1916191115268579,
"learning_rate": 1.5669270913536427e-07,
"loss": 0.8289,
"step": 2410
},
{
"epoch": 0.9608448739663246,
"grad_norm": 0.18451542468901574,
"learning_rate": 1.535472235485158e-07,
"loss": 0.7726,
"step": 2411
},
{
"epoch": 0.9612433994221381,
"grad_norm": 0.18676157641440086,
"learning_rate": 1.5043350883429786e-07,
"loss": 0.7922,
"step": 2412
},
{
"epoch": 0.9616419248779515,
"grad_norm": 0.1872437071497714,
"learning_rate": 1.4735156997724765e-07,
"loss": 0.7802,
"step": 2413
},
{
"epoch": 0.9620404503337651,
"grad_norm": 0.18907840330520773,
"learning_rate": 1.4430141191103865e-07,
"loss": 0.7903,
"step": 2414
},
{
"epoch": 0.9624389757895786,
"grad_norm": 0.18712650053474555,
"learning_rate": 1.41283039518465e-07,
"loss": 0.7993,
"step": 2415
},
{
"epoch": 0.962837501245392,
"grad_norm": 0.19060675078211464,
"learning_rate": 1.3829645763144162e-07,
"loss": 0.7952,
"step": 2416
},
{
"epoch": 0.9632360267012056,
"grad_norm": 0.19012885112510405,
"learning_rate": 1.353416710309885e-07,
"loss": 0.7988,
"step": 2417
},
{
"epoch": 0.963634552157019,
"grad_norm": 0.18668957461300054,
"learning_rate": 1.324186844472264e-07,
"loss": 0.7676,
"step": 2418
},
{
"epoch": 0.9640330776128325,
"grad_norm": 0.18246864478928232,
"learning_rate": 1.295275025593745e-07,
"loss": 0.7837,
"step": 2419
},
{
"epoch": 0.9644316030686461,
"grad_norm": 0.18700001914029454,
"learning_rate": 1.2666812999573064e-07,
"loss": 0.7841,
"step": 2420
},
{
"epoch": 0.9648301285244595,
"grad_norm": 0.18732989426938132,
"learning_rate": 1.2384057133367988e-07,
"loss": 0.7682,
"step": 2421
},
{
"epoch": 0.965228653980273,
"grad_norm": 0.18764475457309285,
"learning_rate": 1.2104483109967035e-07,
"loss": 0.7989,
"step": 2422
},
{
"epoch": 0.9656271794360864,
"grad_norm": 0.18205691271384167,
"learning_rate": 1.1828091376921758e-07,
"loss": 0.761,
"step": 2423
},
{
"epoch": 0.9660257048919,
"grad_norm": 0.18817192206412373,
"learning_rate": 1.1554882376689557e-07,
"loss": 0.795,
"step": 2424
},
{
"epoch": 0.9664242303477134,
"grad_norm": 0.18225533752041095,
"learning_rate": 1.1284856546632583e-07,
"loss": 0.7544,
"step": 2425
},
{
"epoch": 0.966822755803527,
"grad_norm": 0.1888654968857259,
"learning_rate": 1.1018014319017056e-07,
"loss": 0.7938,
"step": 2426
},
{
"epoch": 0.9672212812593405,
"grad_norm": 0.18791322397897098,
"learning_rate": 1.0754356121013276e-07,
"loss": 0.8,
"step": 2427
},
{
"epoch": 0.9676198067151539,
"grad_norm": 0.18588223430788398,
"learning_rate": 1.0493882374694287e-07,
"loss": 0.7909,
"step": 2428
},
{
"epoch": 0.9680183321709674,
"grad_norm": 0.18541611026382643,
"learning_rate": 1.0236593497035208e-07,
"loss": 0.7986,
"step": 2429
},
{
"epoch": 0.9684168576267809,
"grad_norm": 0.18672094333830974,
"learning_rate": 9.982489899912573e-08,
"loss": 0.7854,
"step": 2430
},
{
"epoch": 0.9688153830825944,
"grad_norm": 0.1943543110096466,
"learning_rate": 9.731571990104105e-08,
"loss": 0.798,
"step": 2431
},
{
"epoch": 0.9692139085384079,
"grad_norm": 0.19085690756684667,
"learning_rate": 9.483840169287828e-08,
"loss": 0.7845,
"step": 2432
},
{
"epoch": 0.9696124339942214,
"grad_norm": 0.22770637291835538,
"learning_rate": 9.239294834041179e-08,
"loss": 0.8013,
"step": 2433
},
{
"epoch": 0.9700109594500349,
"grad_norm": 0.21119203670687375,
"learning_rate": 8.997936375840566e-08,
"loss": 0.8095,
"step": 2434
},
{
"epoch": 0.9704094849058483,
"grad_norm": 0.18505204962281271,
"learning_rate": 8.759765181060698e-08,
"loss": 0.7804,
"step": 2435
},
{
"epoch": 0.9708080103616619,
"grad_norm": 0.18558037459333185,
"learning_rate": 8.524781630974144e-08,
"loss": 0.7941,
"step": 2436
},
{
"epoch": 0.9712065358174753,
"grad_norm": 0.18719202865767845,
"learning_rate": 8.292986101750222e-08,
"loss": 0.8026,
"step": 2437
},
{
"epoch": 0.9716050612732888,
"grad_norm": 0.18361106837972332,
"learning_rate": 8.064378964455666e-08,
"loss": 0.7835,
"step": 2438
},
{
"epoch": 0.9720035867291024,
"grad_norm": 0.1941218667296314,
"learning_rate": 7.838960585051959e-08,
"loss": 0.7761,
"step": 2439
},
{
"epoch": 0.9724021121849158,
"grad_norm": 0.1875724989599244,
"learning_rate": 7.616731324396887e-08,
"loss": 0.7837,
"step": 2440
},
{
"epoch": 0.9728006376407293,
"grad_norm": 0.19069373972701836,
"learning_rate": 7.397691538242103e-08,
"loss": 0.8045,
"step": 2441
},
{
"epoch": 0.9731991630965428,
"grad_norm": 0.18927346704482184,
"learning_rate": 7.181841577234449e-08,
"loss": 0.8012,
"step": 2442
},
{
"epoch": 0.9735976885523563,
"grad_norm": 0.18820692363643488,
"learning_rate": 6.969181786913304e-08,
"loss": 0.7829,
"step": 2443
},
{
"epoch": 0.9739962140081698,
"grad_norm": 0.18304987062051034,
"learning_rate": 6.759712507711902e-08,
"loss": 0.7697,
"step": 2444
},
{
"epoch": 0.9743947394639833,
"grad_norm": 0.19039574921948385,
"learning_rate": 6.553434074955789e-08,
"loss": 0.7909,
"step": 2445
},
{
"epoch": 0.9747932649197968,
"grad_norm": 0.19093438505230914,
"learning_rate": 6.350346818862374e-08,
"loss": 0.8287,
"step": 2446
},
{
"epoch": 0.9751917903756102,
"grad_norm": 0.18824893139518173,
"learning_rate": 6.150451064540708e-08,
"loss": 0.7963,
"step": 2447
},
{
"epoch": 0.9755903158314237,
"grad_norm": 0.18807173681160894,
"learning_rate": 5.953747131990595e-08,
"loss": 0.7839,
"step": 2448
},
{
"epoch": 0.9759888412872372,
"grad_norm": 0.1833502517025838,
"learning_rate": 5.760235336102149e-08,
"loss": 0.7594,
"step": 2449
},
{
"epoch": 0.9763873667430507,
"grad_norm": 0.3654620173098497,
"learning_rate": 5.569915986656016e-08,
"loss": 0.7682,
"step": 2450
},
{
"epoch": 0.9767858921988642,
"grad_norm": 0.18924038600883106,
"learning_rate": 5.3827893883215964e-08,
"loss": 0.7996,
"step": 2451
},
{
"epoch": 0.9771844176546777,
"grad_norm": 0.1903600412956394,
"learning_rate": 5.198855840657491e-08,
"loss": 0.8085,
"step": 2452
},
{
"epoch": 0.9775829431104912,
"grad_norm": 0.1869548438274652,
"learning_rate": 5.01811563811061e-08,
"loss": 0.8068,
"step": 2453
},
{
"epoch": 0.9779814685663046,
"grad_norm": 0.18864866332212885,
"learning_rate": 4.8405690700161766e-08,
"loss": 0.7886,
"step": 2454
},
{
"epoch": 0.9783799940221182,
"grad_norm": 0.18929120724640708,
"learning_rate": 4.6662164205966143e-08,
"loss": 0.7762,
"step": 2455
},
{
"epoch": 0.9787785194779317,
"grad_norm": 0.1917795489994272,
"learning_rate": 4.495057968961769e-08,
"loss": 0.7884,
"step": 2456
},
{
"epoch": 0.9791770449337451,
"grad_norm": 0.39039385650289893,
"learning_rate": 4.327093989107578e-08,
"loss": 0.7692,
"step": 2457
},
{
"epoch": 0.9795755703895587,
"grad_norm": 0.18426417807456694,
"learning_rate": 4.162324749916735e-08,
"loss": 0.7869,
"step": 2458
},
{
"epoch": 0.9799740958453721,
"grad_norm": 0.18680439549269473,
"learning_rate": 4.0007505151571365e-08,
"loss": 0.751,
"step": 2459
},
{
"epoch": 0.9803726213011856,
"grad_norm": 0.1818392556169463,
"learning_rate": 3.8423715434823264e-08,
"loss": 0.7696,
"step": 2460
},
{
"epoch": 0.9807711467569991,
"grad_norm": 0.18570844625506863,
"learning_rate": 3.6871880884310486e-08,
"loss": 0.7886,
"step": 2461
},
{
"epoch": 0.9811696722128126,
"grad_norm": 0.21101630534654273,
"learning_rate": 3.5352003984259195e-08,
"loss": 0.7831,
"step": 2462
},
{
"epoch": 0.9815681976686261,
"grad_norm": 0.18069209360220204,
"learning_rate": 3.3864087167738705e-08,
"loss": 0.7504,
"step": 2463
},
{
"epoch": 0.9819667231244396,
"grad_norm": 0.1966709155844413,
"learning_rate": 3.240813281666144e-08,
"loss": 0.8465,
"step": 2464
},
{
"epoch": 0.9823652485802531,
"grad_norm": 0.1932914272847854,
"learning_rate": 3.09841432617608e-08,
"loss": 0.8142,
"step": 2465
},
{
"epoch": 0.9827637740360665,
"grad_norm": 0.18925513736084928,
"learning_rate": 2.959212078261553e-08,
"loss": 0.7721,
"step": 2466
},
{
"epoch": 0.98316229949188,
"grad_norm": 0.19004539946693746,
"learning_rate": 2.823206760761643e-08,
"loss": 0.8169,
"step": 2467
},
{
"epoch": 0.9835608249476935,
"grad_norm": 0.1836565966256934,
"learning_rate": 2.690398591398413e-08,
"loss": 0.7499,
"step": 2468
},
{
"epoch": 0.983959350403507,
"grad_norm": 0.18503029280898622,
"learning_rate": 2.5607877827757975e-08,
"loss": 0.8015,
"step": 2469
},
{
"epoch": 0.9843578758593206,
"grad_norm": 0.20802778839564787,
"learning_rate": 2.4343745423791588e-08,
"loss": 0.8196,
"step": 2470
},
{
"epoch": 0.984756401315134,
"grad_norm": 0.18807720140750997,
"learning_rate": 2.3111590725750644e-08,
"loss": 0.7935,
"step": 2471
},
{
"epoch": 0.9851549267709475,
"grad_norm": 0.18198059255273902,
"learning_rate": 2.191141570610844e-08,
"loss": 0.784,
"step": 2472
},
{
"epoch": 0.9855534522267609,
"grad_norm": 0.18391481474432778,
"learning_rate": 2.074322228614589e-08,
"loss": 0.7844,
"step": 2473
},
{
"epoch": 0.9859519776825745,
"grad_norm": 0.18825854264243994,
"learning_rate": 1.9607012335949306e-08,
"loss": 0.7916,
"step": 2474
},
{
"epoch": 0.986350503138388,
"grad_norm": 0.18782508477662568,
"learning_rate": 1.850278767439928e-08,
"loss": 0.7595,
"step": 2475
},
{
"epoch": 0.9867490285942014,
"grad_norm": 0.18321843468534266,
"learning_rate": 1.7430550069175157e-08,
"loss": 0.7797,
"step": 2476
},
{
"epoch": 0.987147554050015,
"grad_norm": 0.18648381218274565,
"learning_rate": 1.6390301236755003e-08,
"loss": 0.7851,
"step": 2477
},
{
"epoch": 0.9875460795058284,
"grad_norm": 0.18500156564930573,
"learning_rate": 1.53820428424023e-08,
"loss": 0.7918,
"step": 2478
},
{
"epoch": 0.9879446049616419,
"grad_norm": 0.18650103609208593,
"learning_rate": 1.4405776500170388e-08,
"loss": 0.7453,
"step": 2479
},
{
"epoch": 0.9883431304174554,
"grad_norm": 0.18474307243866062,
"learning_rate": 1.346150377290023e-08,
"loss": 0.812,
"step": 2480
},
{
"epoch": 0.9887416558732689,
"grad_norm": 0.18500951818710418,
"learning_rate": 1.2549226172213769e-08,
"loss": 0.7823,
"step": 2481
},
{
"epoch": 0.9891401813290824,
"grad_norm": 0.18520599480715474,
"learning_rate": 1.1668945158518352e-08,
"loss": 0.8023,
"step": 2482
},
{
"epoch": 0.9895387067848959,
"grad_norm": 0.18653565333341804,
"learning_rate": 1.0820662140997862e-08,
"loss": 0.803,
"step": 2483
},
{
"epoch": 0.9899372322407094,
"grad_norm": 0.18774709338075257,
"learning_rate": 1.0004378477610489e-08,
"loss": 0.82,
"step": 2484
},
{
"epoch": 0.9903357576965228,
"grad_norm": 0.18995043820529847,
"learning_rate": 9.220095475090951e-09,
"loss": 0.795,
"step": 2485
},
{
"epoch": 0.9907342831523364,
"grad_norm": 0.20276705829265076,
"learning_rate": 8.467814388948282e-09,
"loss": 0.7578,
"step": 2486
},
{
"epoch": 0.9911328086081499,
"grad_norm": 0.18309866901765423,
"learning_rate": 7.747536423456937e-09,
"loss": 0.7786,
"step": 2487
},
{
"epoch": 0.9915313340639633,
"grad_norm": 0.18809540486983145,
"learning_rate": 7.059262731661243e-09,
"loss": 0.8164,
"step": 2488
},
{
"epoch": 0.9919298595197769,
"grad_norm": 0.18502562806451164,
"learning_rate": 6.402994415377617e-09,
"loss": 0.7805,
"step": 2489
},
{
"epoch": 0.9923283849755903,
"grad_norm": 0.1859120952234941,
"learning_rate": 5.7787325251768e-09,
"loss": 0.7834,
"step": 2490
},
{
"epoch": 0.9927269104314038,
"grad_norm": 0.1846277966595508,
"learning_rate": 5.186478060403844e-09,
"loss": 0.7745,
"step": 2491
},
{
"epoch": 0.9931254358872172,
"grad_norm": 0.18693650346096127,
"learning_rate": 4.626231969155903e-09,
"loss": 0.7965,
"step": 2492
},
{
"epoch": 0.9935239613430308,
"grad_norm": 0.18817760919614965,
"learning_rate": 4.0979951482955636e-09,
"loss": 0.817,
"step": 2493
},
{
"epoch": 0.9939224867988443,
"grad_norm": 0.18262042873834777,
"learning_rate": 3.6017684434397348e-09,
"loss": 0.7637,
"step": 2494
},
{
"epoch": 0.9943210122546577,
"grad_norm": 0.18654490339612353,
"learning_rate": 3.1375526489685337e-09,
"loss": 0.7688,
"step": 2495
},
{
"epoch": 0.9947195377104713,
"grad_norm": 0.18607152410344097,
"learning_rate": 2.7053485080141827e-09,
"loss": 0.7929,
"step": 2496
},
{
"epoch": 0.9951180631662847,
"grad_norm": 0.18419897844519703,
"learning_rate": 2.3051567124587894e-09,
"loss": 0.7701,
"step": 2497
},
{
"epoch": 0.9955165886220982,
"grad_norm": 0.19186493723661452,
"learning_rate": 1.936977902949888e-09,
"loss": 0.808,
"step": 2498
},
{
"epoch": 0.9959151140779118,
"grad_norm": 0.19815019961308503,
"learning_rate": 1.6008126688737968e-09,
"loss": 0.7752,
"step": 2499
},
{
"epoch": 0.9963136395337252,
"grad_norm": 0.18417877206027317,
"learning_rate": 1.2966615483800404e-09,
"loss": 0.7644,
"step": 2500
},
{
"epoch": 0.9967121649895387,
"grad_norm": 0.18546354005657695,
"learning_rate": 1.0245250283613672e-09,
"loss": 0.7876,
"step": 2501
},
{
"epoch": 0.9971106904453522,
"grad_norm": 0.1876008897484704,
"learning_rate": 7.844035444648512e-10,
"loss": 0.8233,
"step": 2502
},
{
"epoch": 0.9975092159011657,
"grad_norm": 0.19981491316640151,
"learning_rate": 5.762974810852307e-10,
"loss": 0.7684,
"step": 2503
},
{
"epoch": 0.9979077413569791,
"grad_norm": 0.18665647757266646,
"learning_rate": 4.002071713626876e-10,
"loss": 0.7954,
"step": 2504
},
{
"epoch": 0.9983062668127927,
"grad_norm": 0.1894225254309972,
"learning_rate": 2.5613289719172985e-10,
"loss": 0.788,
"step": 2505
},
{
"epoch": 0.9987047922686062,
"grad_norm": 0.187487701192293,
"learning_rate": 1.440748892100885e-10,
"loss": 0.8017,
"step": 2506
},
{
"epoch": 0.9991033177244196,
"grad_norm": 0.1916417262477468,
"learning_rate": 6.403332680537943e-11,
"loss": 0.803,
"step": 2507
},
{
"epoch": 0.9995018431802332,
"grad_norm": 0.18369932510494486,
"learning_rate": 1.6008338108441936e-11,
"loss": 0.7746,
"step": 2508
},
{
"epoch": 0.9999003686360466,
"grad_norm": 0.19092333059410313,
"learning_rate": 0.0,
"loss": 0.7752,
"step": 2509
},
{
"epoch": 0.9999003686360466,
"step": 2509,
"total_flos": 2400250660651008.0,
"train_loss": 0.8378047743086893,
"train_runtime": 25259.4249,
"train_samples_per_second": 57.218,
"train_steps_per_second": 0.099
}
],
"logging_steps": 1,
"max_steps": 2509,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2400250660651008.0,
"train_batch_size": 9,
"trial_name": null,
"trial_params": null
}