Files
titulm-llama-3.2-1b-v1.1/trainer_state.json
ModelHub XC 20d82bea4d 初始化项目,由ModelHub XC社区提供模型
Model: hishab/titulm-llama-3.2-1b-v1.1
Source: Original Platform
2026-05-21 09:54:18 +08:00

25474 lines
621 KiB
JSON

{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9999311910823643,
"eval_steps": 500,
"global_step": 3633,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0002752356705429024,
"grad_norm": 1.190276107072263,
"learning_rate": 1.0810810810810812e-06,
"loss": 1.3244,
"step": 1
},
{
"epoch": 0.0005504713410858048,
"grad_norm": 1.2084409597297352,
"learning_rate": 2.1621621621621623e-06,
"loss": 1.2942,
"step": 2
},
{
"epoch": 0.0008257070116287071,
"grad_norm": 1.122752677411088,
"learning_rate": 3.2432432432432437e-06,
"loss": 1.3128,
"step": 3
},
{
"epoch": 0.0011009426821716095,
"grad_norm": 1.0924048720350894,
"learning_rate": 4.324324324324325e-06,
"loss": 1.3307,
"step": 4
},
{
"epoch": 0.0013761783527145117,
"grad_norm": 0.9078676095521644,
"learning_rate": 5.405405405405406e-06,
"loss": 1.2978,
"step": 5
},
{
"epoch": 0.0016514140232574141,
"grad_norm": 1.4160148465734577,
"learning_rate": 6.486486486486487e-06,
"loss": 1.2813,
"step": 6
},
{
"epoch": 0.0019266496938003166,
"grad_norm": 1.20323318581664,
"learning_rate": 7.567567567567569e-06,
"loss": 1.258,
"step": 7
},
{
"epoch": 0.002201885364343219,
"grad_norm": 1.7257200500394663,
"learning_rate": 8.64864864864865e-06,
"loss": 1.2677,
"step": 8
},
{
"epoch": 0.0024771210348861214,
"grad_norm": 1.2460047678725148,
"learning_rate": 9.729729729729732e-06,
"loss": 1.2826,
"step": 9
},
{
"epoch": 0.0027523567054290234,
"grad_norm": 2.927602556283335,
"learning_rate": 1.0810810810810812e-05,
"loss": 1.3088,
"step": 10
},
{
"epoch": 0.003027592375971926,
"grad_norm": 2.3291183764604737,
"learning_rate": 1.1891891891891894e-05,
"loss": 1.336,
"step": 11
},
{
"epoch": 0.0033028280465148283,
"grad_norm": 1.5611186960566894,
"learning_rate": 1.2972972972972975e-05,
"loss": 1.251,
"step": 12
},
{
"epoch": 0.0035780637170577307,
"grad_norm": 1.507762186709395,
"learning_rate": 1.4054054054054055e-05,
"loss": 1.2392,
"step": 13
},
{
"epoch": 0.003853299387600633,
"grad_norm": 1.3916628238578803,
"learning_rate": 1.5135135135135138e-05,
"loss": 1.2263,
"step": 14
},
{
"epoch": 0.004128535058143535,
"grad_norm": 1.4410267212915282,
"learning_rate": 1.6216216216216218e-05,
"loss": 1.2525,
"step": 15
},
{
"epoch": 0.004403770728686438,
"grad_norm": 1.3503388717753242,
"learning_rate": 1.72972972972973e-05,
"loss": 1.272,
"step": 16
},
{
"epoch": 0.00467900639922934,
"grad_norm": 1.2435668788235288,
"learning_rate": 1.8378378378378383e-05,
"loss": 1.2334,
"step": 17
},
{
"epoch": 0.004954242069772243,
"grad_norm": 1.332166272341274,
"learning_rate": 1.9459459459459463e-05,
"loss": 1.2487,
"step": 18
},
{
"epoch": 0.005229477740315145,
"grad_norm": 1.3657862921427664,
"learning_rate": 2.054054054054054e-05,
"loss": 1.2216,
"step": 19
},
{
"epoch": 0.005504713410858047,
"grad_norm": 1.3329043032392613,
"learning_rate": 2.1621621621621624e-05,
"loss": 1.2092,
"step": 20
},
{
"epoch": 0.00577994908140095,
"grad_norm": 1.5624375594145068,
"learning_rate": 2.2702702702702705e-05,
"loss": 1.2051,
"step": 21
},
{
"epoch": 0.006055184751943852,
"grad_norm": 1.4664415560783262,
"learning_rate": 2.378378378378379e-05,
"loss": 1.2252,
"step": 22
},
{
"epoch": 0.006330420422486755,
"grad_norm": 1.5659900022345836,
"learning_rate": 2.4864864864864866e-05,
"loss": 1.1595,
"step": 23
},
{
"epoch": 0.006605656093029657,
"grad_norm": 1.4447862720284197,
"learning_rate": 2.594594594594595e-05,
"loss": 1.1715,
"step": 24
},
{
"epoch": 0.006880891763572559,
"grad_norm": 1.2617871597038033,
"learning_rate": 2.702702702702703e-05,
"loss": 1.1759,
"step": 25
},
{
"epoch": 0.0071561274341154614,
"grad_norm": 1.7509911834922023,
"learning_rate": 2.810810810810811e-05,
"loss": 1.2134,
"step": 26
},
{
"epoch": 0.007431363104658363,
"grad_norm": 1.1184189389114705,
"learning_rate": 2.918918918918919e-05,
"loss": 1.1652,
"step": 27
},
{
"epoch": 0.007706598775201266,
"grad_norm": 1.6549112009023732,
"learning_rate": 3.0270270270270275e-05,
"loss": 1.1677,
"step": 28
},
{
"epoch": 0.00798183444574417,
"grad_norm": 1.51189283106951,
"learning_rate": 3.135135135135135e-05,
"loss": 1.222,
"step": 29
},
{
"epoch": 0.00825707011628707,
"grad_norm": 1.6570028356426652,
"learning_rate": 3.2432432432432436e-05,
"loss": 1.1138,
"step": 30
},
{
"epoch": 0.008532305786829973,
"grad_norm": 1.4291449194042198,
"learning_rate": 3.351351351351351e-05,
"loss": 1.2045,
"step": 31
},
{
"epoch": 0.008807541457372876,
"grad_norm": 1.8050828458250154,
"learning_rate": 3.45945945945946e-05,
"loss": 1.17,
"step": 32
},
{
"epoch": 0.009082777127915777,
"grad_norm": 1.7028887034299325,
"learning_rate": 3.567567567567568e-05,
"loss": 1.117,
"step": 33
},
{
"epoch": 0.00935801279845868,
"grad_norm": 1.7579705799438476,
"learning_rate": 3.6756756756756765e-05,
"loss": 1.1313,
"step": 34
},
{
"epoch": 0.009633248469001583,
"grad_norm": 1.841565198358737,
"learning_rate": 3.783783783783784e-05,
"loss": 1.1352,
"step": 35
},
{
"epoch": 0.009908484139544486,
"grad_norm": 1.51806968835451,
"learning_rate": 3.8918918918918926e-05,
"loss": 1.1459,
"step": 36
},
{
"epoch": 0.010183719810087387,
"grad_norm": 2.0701574664333102,
"learning_rate": 4e-05,
"loss": 1.0888,
"step": 37
},
{
"epoch": 0.01045895548063029,
"grad_norm": 1.6155327822146857,
"learning_rate": 3.9999992367613554e-05,
"loss": 1.079,
"step": 38
},
{
"epoch": 0.010734191151173193,
"grad_norm": 1.672991458945634,
"learning_rate": 3.999996947046004e-05,
"loss": 1.0786,
"step": 39
},
{
"epoch": 0.011009426821716094,
"grad_norm": 1.6923603269114176,
"learning_rate": 3.999993130855694e-05,
"loss": 1.1292,
"step": 40
},
{
"epoch": 0.011284662492258997,
"grad_norm": 1.6163864527003113,
"learning_rate": 3.999987788193337e-05,
"loss": 1.0859,
"step": 41
},
{
"epoch": 0.0115598981628019,
"grad_norm": 1.5568500772991893,
"learning_rate": 3.9999809190630105e-05,
"loss": 1.0795,
"step": 42
},
{
"epoch": 0.011835133833344802,
"grad_norm": 1.5109463614457816,
"learning_rate": 3.999972523469959e-05,
"loss": 1.122,
"step": 43
},
{
"epoch": 0.012110369503887703,
"grad_norm": 1.5782539419050023,
"learning_rate": 3.9999626014205895e-05,
"loss": 1.1123,
"step": 44
},
{
"epoch": 0.012385605174430606,
"grad_norm": 1.5604642948556238,
"learning_rate": 3.999951152922474e-05,
"loss": 1.1017,
"step": 45
},
{
"epoch": 0.01266084084497351,
"grad_norm": 1.3465924798161213,
"learning_rate": 3.9999381779843526e-05,
"loss": 1.1362,
"step": 46
},
{
"epoch": 0.01293607651551641,
"grad_norm": 2.2197061958421416,
"learning_rate": 3.999923676616125e-05,
"loss": 1.149,
"step": 47
},
{
"epoch": 0.013211312186059313,
"grad_norm": 1.275559559204292,
"learning_rate": 3.9999076488288625e-05,
"loss": 1.0965,
"step": 48
},
{
"epoch": 0.013486547856602216,
"grad_norm": 2.135373580458631,
"learning_rate": 3.999890094634796e-05,
"loss": 1.1152,
"step": 49
},
{
"epoch": 0.013761783527145117,
"grad_norm": 1.162458419635211,
"learning_rate": 3.999871014047324e-05,
"loss": 1.0577,
"step": 50
},
{
"epoch": 0.01403701919768802,
"grad_norm": 1.668860695031362,
"learning_rate": 3.99985040708101e-05,
"loss": 1.0751,
"step": 51
},
{
"epoch": 0.014312254868230923,
"grad_norm": 1.2352790911765088,
"learning_rate": 3.9998282737515826e-05,
"loss": 1.0523,
"step": 52
},
{
"epoch": 0.014587490538773826,
"grad_norm": 1.6709188838849682,
"learning_rate": 3.999804614075934e-05,
"loss": 1.0922,
"step": 53
},
{
"epoch": 0.014862726209316727,
"grad_norm": 1.2186730260814729,
"learning_rate": 3.9997794280721215e-05,
"loss": 1.0946,
"step": 54
},
{
"epoch": 0.01513796187985963,
"grad_norm": 1.5946952239430119,
"learning_rate": 3.999752715759368e-05,
"loss": 1.0982,
"step": 55
},
{
"epoch": 0.015413197550402533,
"grad_norm": 1.3338876549514764,
"learning_rate": 3.999724477158064e-05,
"loss": 1.0606,
"step": 56
},
{
"epoch": 0.015688433220945434,
"grad_norm": 1.4391389628569506,
"learning_rate": 3.9996947122897594e-05,
"loss": 1.0699,
"step": 57
},
{
"epoch": 0.01596366889148834,
"grad_norm": 1.4049265878583999,
"learning_rate": 3.999663421177173e-05,
"loss": 1.1227,
"step": 58
},
{
"epoch": 0.01623890456203124,
"grad_norm": 1.3947889784751317,
"learning_rate": 3.999630603844187e-05,
"loss": 1.0886,
"step": 59
},
{
"epoch": 0.01651414023257414,
"grad_norm": 1.4043255062790543,
"learning_rate": 3.99959626031585e-05,
"loss": 1.0719,
"step": 60
},
{
"epoch": 0.016789375903117045,
"grad_norm": 1.3519206423249013,
"learning_rate": 3.9995603906183726e-05,
"loss": 1.0834,
"step": 61
},
{
"epoch": 0.017064611573659946,
"grad_norm": 1.422472203995192,
"learning_rate": 3.999522994779133e-05,
"loss": 1.0501,
"step": 62
},
{
"epoch": 0.017339847244202847,
"grad_norm": 1.4681842143007482,
"learning_rate": 3.9994840728266725e-05,
"loss": 1.0426,
"step": 63
},
{
"epoch": 0.017615082914745752,
"grad_norm": 1.5623433348992308,
"learning_rate": 3.999443624790699e-05,
"loss": 1.0442,
"step": 64
},
{
"epoch": 0.017890318585288653,
"grad_norm": 1.2844018013991325,
"learning_rate": 3.999401650702083e-05,
"loss": 1.0532,
"step": 65
},
{
"epoch": 0.018165554255831554,
"grad_norm": 1.4124923773116047,
"learning_rate": 3.999358150592861e-05,
"loss": 1.047,
"step": 66
},
{
"epoch": 0.01844078992637446,
"grad_norm": 1.47024691076268,
"learning_rate": 3.999313124496234e-05,
"loss": 1.0394,
"step": 67
},
{
"epoch": 0.01871602559691736,
"grad_norm": 1.0479405707837466,
"learning_rate": 3.9992665724465686e-05,
"loss": 1.0159,
"step": 68
},
{
"epoch": 0.01899126126746026,
"grad_norm": 1.6155895896664414,
"learning_rate": 3.999218494479393e-05,
"loss": 1.0412,
"step": 69
},
{
"epoch": 0.019266496938003166,
"grad_norm": 1.1554035474855078,
"learning_rate": 3.999168890631404e-05,
"loss": 1.0444,
"step": 70
},
{
"epoch": 0.019541732608546067,
"grad_norm": 1.3916486614853192,
"learning_rate": 3.99911776094046e-05,
"loss": 1.0716,
"step": 71
},
{
"epoch": 0.01981696827908897,
"grad_norm": 1.0307232993736772,
"learning_rate": 3.999065105445586e-05,
"loss": 1.0518,
"step": 72
},
{
"epoch": 0.020092203949631873,
"grad_norm": 1.519072854318784,
"learning_rate": 3.99901092418697e-05,
"loss": 1.0511,
"step": 73
},
{
"epoch": 0.020367439620174774,
"grad_norm": 1.2452532414480466,
"learning_rate": 3.998955217205966e-05,
"loss": 1.0521,
"step": 74
},
{
"epoch": 0.02064267529071768,
"grad_norm": 1.0823724764686737,
"learning_rate": 3.998897984545091e-05,
"loss": 1.0283,
"step": 75
},
{
"epoch": 0.02091791096126058,
"grad_norm": 1.4547887075106387,
"learning_rate": 3.9988392262480274e-05,
"loss": 1.0381,
"step": 76
},
{
"epoch": 0.02119314663180348,
"grad_norm": 0.9223667414315597,
"learning_rate": 3.9987789423596224e-05,
"loss": 1.0271,
"step": 77
},
{
"epoch": 0.021468382302346385,
"grad_norm": 1.4947900030467116,
"learning_rate": 3.998717132925886e-05,
"loss": 1.0095,
"step": 78
},
{
"epoch": 0.021743617972889286,
"grad_norm": 1.0246470847225004,
"learning_rate": 3.998653797993995e-05,
"loss": 1.0274,
"step": 79
},
{
"epoch": 0.022018853643432187,
"grad_norm": 1.586742963776002,
"learning_rate": 3.998588937612287e-05,
"loss": 1.0084,
"step": 80
},
{
"epoch": 0.022294089313975092,
"grad_norm": 1.1499654167390594,
"learning_rate": 3.998522551830267e-05,
"loss": 1.0515,
"step": 81
},
{
"epoch": 0.022569324984517993,
"grad_norm": 1.543529968919974,
"learning_rate": 3.9984546406986045e-05,
"loss": 0.9933,
"step": 82
},
{
"epoch": 0.022844560655060894,
"grad_norm": 0.9475881660146706,
"learning_rate": 3.99838520426913e-05,
"loss": 1.0159,
"step": 83
},
{
"epoch": 0.0231197963256038,
"grad_norm": 1.351513155062428,
"learning_rate": 3.998314242594841e-05,
"loss": 0.9955,
"step": 84
},
{
"epoch": 0.0233950319961467,
"grad_norm": 1.1492618928180698,
"learning_rate": 3.998241755729897e-05,
"loss": 1.0673,
"step": 85
},
{
"epoch": 0.023670267666689605,
"grad_norm": 1.3497208081724184,
"learning_rate": 3.9981677437296244e-05,
"loss": 1.0341,
"step": 86
},
{
"epoch": 0.023945503337232506,
"grad_norm": 1.1251159534366102,
"learning_rate": 3.998092206650511e-05,
"loss": 0.994,
"step": 87
},
{
"epoch": 0.024220739007775407,
"grad_norm": 1.2840664657236376,
"learning_rate": 3.99801514455021e-05,
"loss": 1.0059,
"step": 88
},
{
"epoch": 0.02449597467831831,
"grad_norm": 1.0389167650958175,
"learning_rate": 3.997936557487539e-05,
"loss": 1.0232,
"step": 89
},
{
"epoch": 0.024771210348861213,
"grad_norm": 1.1934490371559106,
"learning_rate": 3.9978564455224764e-05,
"loss": 1.0154,
"step": 90
},
{
"epoch": 0.025046446019404114,
"grad_norm": 1.3565126325882042,
"learning_rate": 3.9977748087161696e-05,
"loss": 1.012,
"step": 91
},
{
"epoch": 0.02532168168994702,
"grad_norm": 1.261096742039059,
"learning_rate": 3.997691647130924e-05,
"loss": 1.0143,
"step": 92
},
{
"epoch": 0.02559691736048992,
"grad_norm": 1.5361429916101677,
"learning_rate": 3.997606960830214e-05,
"loss": 0.9854,
"step": 93
},
{
"epoch": 0.02587215303103282,
"grad_norm": 0.7587337794998066,
"learning_rate": 3.997520749878675e-05,
"loss": 0.9797,
"step": 94
},
{
"epoch": 0.026147388701575725,
"grad_norm": 1.340673980750616,
"learning_rate": 3.997433014342106e-05,
"loss": 1.0083,
"step": 95
},
{
"epoch": 0.026422624372118626,
"grad_norm": 1.442377076418099,
"learning_rate": 3.99734375428747e-05,
"loss": 1.0098,
"step": 96
},
{
"epoch": 0.026697860042661527,
"grad_norm": 1.053217627466467,
"learning_rate": 3.997252969782895e-05,
"loss": 1.0194,
"step": 97
},
{
"epoch": 0.026973095713204432,
"grad_norm": 1.2542749662949786,
"learning_rate": 3.9971606608976694e-05,
"loss": 1.0421,
"step": 98
},
{
"epoch": 0.027248331383747333,
"grad_norm": 1.0310036113535672,
"learning_rate": 3.997066827702248e-05,
"loss": 1.0255,
"step": 99
},
{
"epoch": 0.027523567054290234,
"grad_norm": 1.3425906351732504,
"learning_rate": 3.996971470268248e-05,
"loss": 1.0096,
"step": 100
},
{
"epoch": 0.02779880272483314,
"grad_norm": 1.0033245136899922,
"learning_rate": 3.9968745886684496e-05,
"loss": 1.0278,
"step": 101
},
{
"epoch": 0.02807403839537604,
"grad_norm": 1.2359786623522877,
"learning_rate": 3.996776182976796e-05,
"loss": 1.0016,
"step": 102
},
{
"epoch": 0.028349274065918945,
"grad_norm": 0.8996790282725927,
"learning_rate": 3.996676253268396e-05,
"loss": 0.9778,
"step": 103
},
{
"epoch": 0.028624509736461846,
"grad_norm": 1.122427683480799,
"learning_rate": 3.996574799619518e-05,
"loss": 1.0021,
"step": 104
},
{
"epoch": 0.028899745407004747,
"grad_norm": 1.0186269169389894,
"learning_rate": 3.996471822107596e-05,
"loss": 0.9949,
"step": 105
},
{
"epoch": 0.02917498107754765,
"grad_norm": 1.3313548328442482,
"learning_rate": 3.996367320811227e-05,
"loss": 0.9761,
"step": 106
},
{
"epoch": 0.029450216748090553,
"grad_norm": 0.8643837745261176,
"learning_rate": 3.9962612958101696e-05,
"loss": 1.0035,
"step": 107
},
{
"epoch": 0.029725452418633454,
"grad_norm": 0.98116052105504,
"learning_rate": 3.996153747185347e-05,
"loss": 0.9801,
"step": 108
},
{
"epoch": 0.03000068808917636,
"grad_norm": 1.10691016989535,
"learning_rate": 3.996044675018842e-05,
"loss": 1.0151,
"step": 109
},
{
"epoch": 0.03027592375971926,
"grad_norm": 1.2681369626856358,
"learning_rate": 3.9959340793939064e-05,
"loss": 1.0307,
"step": 110
},
{
"epoch": 0.03055115943026216,
"grad_norm": 1.0276436414413004,
"learning_rate": 3.9958219603949486e-05,
"loss": 0.9759,
"step": 111
},
{
"epoch": 0.030826395100805065,
"grad_norm": 1.420865033430615,
"learning_rate": 3.995708318107543e-05,
"loss": 0.9859,
"step": 112
},
{
"epoch": 0.031101630771347966,
"grad_norm": 0.8500712282512342,
"learning_rate": 3.995593152618425e-05,
"loss": 1.0195,
"step": 113
},
{
"epoch": 0.03137686644189087,
"grad_norm": 1.102496992794187,
"learning_rate": 3.995476464015495e-05,
"loss": 0.9947,
"step": 114
},
{
"epoch": 0.03165210211243377,
"grad_norm": 1.3088189045845282,
"learning_rate": 3.995358252387813e-05,
"loss": 0.9779,
"step": 115
},
{
"epoch": 0.03192733778297668,
"grad_norm": 0.9473377671663621,
"learning_rate": 3.995238517825602e-05,
"loss": 1.0109,
"step": 116
},
{
"epoch": 0.032202573453519574,
"grad_norm": 0.9374545115515334,
"learning_rate": 3.9951172604202494e-05,
"loss": 0.9705,
"step": 117
},
{
"epoch": 0.03247780912406248,
"grad_norm": 1.122739476374123,
"learning_rate": 3.9949944802643036e-05,
"loss": 0.9877,
"step": 118
},
{
"epoch": 0.032753044794605383,
"grad_norm": 1.2181012183983715,
"learning_rate": 3.994870177451474e-05,
"loss": 0.9867,
"step": 119
},
{
"epoch": 0.03302828046514828,
"grad_norm": 1.0190839936162515,
"learning_rate": 3.994744352076634e-05,
"loss": 0.9966,
"step": 120
},
{
"epoch": 0.033303516135691186,
"grad_norm": 1.091612675267103,
"learning_rate": 3.9946170042358185e-05,
"loss": 0.987,
"step": 121
},
{
"epoch": 0.03357875180623409,
"grad_norm": 0.9232945314206387,
"learning_rate": 3.994488134026224e-05,
"loss": 0.9963,
"step": 122
},
{
"epoch": 0.03385398747677699,
"grad_norm": 1.0736749598534863,
"learning_rate": 3.99435774154621e-05,
"loss": 1.0157,
"step": 123
},
{
"epoch": 0.03412922314731989,
"grad_norm": 1.2475900441860863,
"learning_rate": 3.994225826895295e-05,
"loss": 0.9846,
"step": 124
},
{
"epoch": 0.0344044588178628,
"grad_norm": 0.8913702469448699,
"learning_rate": 3.994092390174164e-05,
"loss": 0.9947,
"step": 125
},
{
"epoch": 0.034679694488405695,
"grad_norm": 1.0342284706230065,
"learning_rate": 3.993957431484659e-05,
"loss": 0.9563,
"step": 126
},
{
"epoch": 0.0349549301589486,
"grad_norm": 1.1094942836810213,
"learning_rate": 3.993820950929787e-05,
"loss": 0.9564,
"step": 127
},
{
"epoch": 0.035230165829491504,
"grad_norm": 1.1644167313950693,
"learning_rate": 3.9936829486137145e-05,
"loss": 1.005,
"step": 128
},
{
"epoch": 0.0355054015000344,
"grad_norm": 1.0541434126030913,
"learning_rate": 3.993543424641771e-05,
"loss": 0.9629,
"step": 129
},
{
"epoch": 0.035780637170577306,
"grad_norm": 1.2719003588455873,
"learning_rate": 3.993402379120446e-05,
"loss": 0.9779,
"step": 130
},
{
"epoch": 0.03605587284112021,
"grad_norm": 0.9814772038816815,
"learning_rate": 3.9932598121573906e-05,
"loss": 0.9683,
"step": 131
},
{
"epoch": 0.03633110851166311,
"grad_norm": 1.113127253040516,
"learning_rate": 3.993115723861418e-05,
"loss": 0.9484,
"step": 132
},
{
"epoch": 0.03660634418220601,
"grad_norm": 0.9401838923203238,
"learning_rate": 3.9929701143425014e-05,
"loss": 0.9754,
"step": 133
},
{
"epoch": 0.03688157985274892,
"grad_norm": 1.0452901290660856,
"learning_rate": 3.992822983711776e-05,
"loss": 0.9752,
"step": 134
},
{
"epoch": 0.037156815523291815,
"grad_norm": 1.029671900565212,
"learning_rate": 3.992674332081538e-05,
"loss": 0.9897,
"step": 135
},
{
"epoch": 0.03743205119383472,
"grad_norm": 1.0498191361206552,
"learning_rate": 3.992524159565243e-05,
"loss": 0.9637,
"step": 136
},
{
"epoch": 0.037707286864377625,
"grad_norm": 1.0044264183964862,
"learning_rate": 3.992372466277509e-05,
"loss": 1.0147,
"step": 137
},
{
"epoch": 0.03798252253492052,
"grad_norm": 1.0506663533652345,
"learning_rate": 3.992219252334114e-05,
"loss": 0.9392,
"step": 138
},
{
"epoch": 0.03825775820546343,
"grad_norm": 1.21490462827667,
"learning_rate": 3.992064517851998e-05,
"loss": 1.0044,
"step": 139
},
{
"epoch": 0.03853299387600633,
"grad_norm": 0.9708595219699383,
"learning_rate": 3.9919082629492585e-05,
"loss": 0.9724,
"step": 140
},
{
"epoch": 0.038808229546549236,
"grad_norm": 1.0754728525997272,
"learning_rate": 3.9917504877451563e-05,
"loss": 0.9732,
"step": 141
},
{
"epoch": 0.039083465217092134,
"grad_norm": 1.0770903291389418,
"learning_rate": 3.991591192360112e-05,
"loss": 0.9783,
"step": 142
},
{
"epoch": 0.03935870088763504,
"grad_norm": 1.022206500658949,
"learning_rate": 3.991430376915704e-05,
"loss": 1.003,
"step": 143
},
{
"epoch": 0.03963393655817794,
"grad_norm": 1.2010136364603878,
"learning_rate": 3.991268041534676e-05,
"loss": 0.9622,
"step": 144
},
{
"epoch": 0.03990917222872084,
"grad_norm": 0.9904092925521828,
"learning_rate": 3.991104186340926e-05,
"loss": 0.9903,
"step": 145
},
{
"epoch": 0.040184407899263745,
"grad_norm": 1.2395035830387846,
"learning_rate": 3.990938811459516e-05,
"loss": 0.974,
"step": 146
},
{
"epoch": 0.04045964356980665,
"grad_norm": 0.7353479908408915,
"learning_rate": 3.990771917016665e-05,
"loss": 1.0046,
"step": 147
},
{
"epoch": 0.04073487924034955,
"grad_norm": 0.9920011334434718,
"learning_rate": 3.990603503139755e-05,
"loss": 0.9755,
"step": 148
},
{
"epoch": 0.04101011491089245,
"grad_norm": 1.107219145078194,
"learning_rate": 3.9904335699573245e-05,
"loss": 1.0003,
"step": 149
},
{
"epoch": 0.04128535058143536,
"grad_norm": 0.8548007475694676,
"learning_rate": 3.990262117599074e-05,
"loss": 0.962,
"step": 150
},
{
"epoch": 0.041560586251978254,
"grad_norm": 1.1104937026132244,
"learning_rate": 3.990089146195863e-05,
"loss": 0.9254,
"step": 151
},
{
"epoch": 0.04183582192252116,
"grad_norm": 0.9909627357699444,
"learning_rate": 3.98991465587971e-05,
"loss": 0.9785,
"step": 152
},
{
"epoch": 0.042111057593064063,
"grad_norm": 0.939332432355087,
"learning_rate": 3.98973864678379e-05,
"loss": 0.9868,
"step": 153
},
{
"epoch": 0.04238629326360696,
"grad_norm": 0.9767033192925513,
"learning_rate": 3.989561119042444e-05,
"loss": 0.9537,
"step": 154
},
{
"epoch": 0.042661528934149866,
"grad_norm": 1.0137423941705093,
"learning_rate": 3.989382072791166e-05,
"loss": 0.9414,
"step": 155
},
{
"epoch": 0.04293676460469277,
"grad_norm": 0.9284874715319218,
"learning_rate": 3.98920150816661e-05,
"loss": 0.9842,
"step": 156
},
{
"epoch": 0.04321200027523567,
"grad_norm": 0.9187385752846338,
"learning_rate": 3.989019425306591e-05,
"loss": 0.9935,
"step": 157
},
{
"epoch": 0.04348723594577857,
"grad_norm": 0.9949928953759635,
"learning_rate": 3.9888358243500825e-05,
"loss": 0.9468,
"step": 158
},
{
"epoch": 0.04376247161632148,
"grad_norm": 1.1920415793958623,
"learning_rate": 3.988650705437214e-05,
"loss": 0.93,
"step": 159
},
{
"epoch": 0.044037707286864375,
"grad_norm": 0.9196365103589655,
"learning_rate": 3.9884640687092775e-05,
"loss": 0.9581,
"step": 160
},
{
"epoch": 0.04431294295740728,
"grad_norm": 0.8949804148675419,
"learning_rate": 3.9882759143087194e-05,
"loss": 0.9922,
"step": 161
},
{
"epoch": 0.044588178627950184,
"grad_norm": 0.7927064598466735,
"learning_rate": 3.988086242379148e-05,
"loss": 0.97,
"step": 162
},
{
"epoch": 0.04486341429849308,
"grad_norm": 0.8246249743192302,
"learning_rate": 3.987895053065327e-05,
"loss": 0.9687,
"step": 163
},
{
"epoch": 0.045138649969035986,
"grad_norm": 0.8539801260308312,
"learning_rate": 3.9877023465131806e-05,
"loss": 0.9226,
"step": 164
},
{
"epoch": 0.04541388563957889,
"grad_norm": 0.8531812582063119,
"learning_rate": 3.987508122869789e-05,
"loss": 0.9457,
"step": 165
},
{
"epoch": 0.04568912131012179,
"grad_norm": 0.846264771307889,
"learning_rate": 3.987312382283391e-05,
"loss": 0.9255,
"step": 166
},
{
"epoch": 0.04596435698066469,
"grad_norm": 0.8755538095182204,
"learning_rate": 3.9871151249033844e-05,
"loss": 0.9525,
"step": 167
},
{
"epoch": 0.0462395926512076,
"grad_norm": 0.8199336462623763,
"learning_rate": 3.986916350880323e-05,
"loss": 0.9228,
"step": 168
},
{
"epoch": 0.046514828321750495,
"grad_norm": 0.775083736366338,
"learning_rate": 3.986716060365919e-05,
"loss": 0.9579,
"step": 169
},
{
"epoch": 0.0467900639922934,
"grad_norm": 0.8015872286193737,
"learning_rate": 3.986514253513042e-05,
"loss": 0.9415,
"step": 170
},
{
"epoch": 0.047065299662836305,
"grad_norm": 0.8947424426024164,
"learning_rate": 3.986310930475719e-05,
"loss": 0.9374,
"step": 171
},
{
"epoch": 0.04734053533337921,
"grad_norm": 1.0177453148917428,
"learning_rate": 3.986106091409133e-05,
"loss": 0.9613,
"step": 172
},
{
"epoch": 0.04761577100392211,
"grad_norm": 0.9995155260477647,
"learning_rate": 3.9858997364696254e-05,
"loss": 0.9489,
"step": 173
},
{
"epoch": 0.04789100667446501,
"grad_norm": 1.0058466996884183,
"learning_rate": 3.985691865814695e-05,
"loss": 0.9396,
"step": 174
},
{
"epoch": 0.048166242345007916,
"grad_norm": 0.9309539337174383,
"learning_rate": 3.985482479602996e-05,
"loss": 0.9404,
"step": 175
},
{
"epoch": 0.048441478015550814,
"grad_norm": 0.8007602895755056,
"learning_rate": 3.9852715779943404e-05,
"loss": 0.9477,
"step": 176
},
{
"epoch": 0.04871671368609372,
"grad_norm": 0.5607973976392494,
"learning_rate": 3.985059161149696e-05,
"loss": 0.9446,
"step": 177
},
{
"epoch": 0.04899194935663662,
"grad_norm": 0.6269194351562074,
"learning_rate": 3.984845229231189e-05,
"loss": 0.9043,
"step": 178
},
{
"epoch": 0.04926718502717952,
"grad_norm": 0.8498363383103973,
"learning_rate": 3.984629782402098e-05,
"loss": 0.9572,
"step": 179
},
{
"epoch": 0.049542420697722425,
"grad_norm": 0.971173881009018,
"learning_rate": 3.9844128208268634e-05,
"loss": 0.9583,
"step": 180
},
{
"epoch": 0.04981765636826533,
"grad_norm": 1.0056651754503767,
"learning_rate": 3.9841943446710756e-05,
"loss": 0.928,
"step": 181
},
{
"epoch": 0.05009289203880823,
"grad_norm": 0.8850820710100574,
"learning_rate": 3.983974354101486e-05,
"loss": 0.9501,
"step": 182
},
{
"epoch": 0.05036812770935113,
"grad_norm": 0.6584048206114839,
"learning_rate": 3.983752849286e-05,
"loss": 1.0529,
"step": 183
},
{
"epoch": 0.05064336337989404,
"grad_norm": 0.5918767008080843,
"learning_rate": 3.983529830393677e-05,
"loss": 0.9018,
"step": 184
},
{
"epoch": 0.050918599050436934,
"grad_norm": 0.7851287049394087,
"learning_rate": 3.9833052975947356e-05,
"loss": 0.9542,
"step": 185
},
{
"epoch": 0.05119383472097984,
"grad_norm": 0.8516369448448238,
"learning_rate": 3.9830792510605463e-05,
"loss": 0.9326,
"step": 186
},
{
"epoch": 0.051469070391522743,
"grad_norm": 0.7828498616864874,
"learning_rate": 3.982851690963637e-05,
"loss": 0.9725,
"step": 187
},
{
"epoch": 0.05174430606206564,
"grad_norm": 0.7526449796432927,
"learning_rate": 3.982622617477691e-05,
"loss": 0.9741,
"step": 188
},
{
"epoch": 0.052019541732608546,
"grad_norm": 0.7151234103635482,
"learning_rate": 3.9823920307775464e-05,
"loss": 0.9191,
"step": 189
},
{
"epoch": 0.05229477740315145,
"grad_norm": 0.6920502159129005,
"learning_rate": 3.982159931039194e-05,
"loss": 0.9385,
"step": 190
},
{
"epoch": 0.05257001307369435,
"grad_norm": 0.7381612546258817,
"learning_rate": 3.981926318439782e-05,
"loss": 0.9482,
"step": 191
},
{
"epoch": 0.05284524874423725,
"grad_norm": 0.7023056208374715,
"learning_rate": 3.981691193157614e-05,
"loss": 0.9376,
"step": 192
},
{
"epoch": 0.05312048441478016,
"grad_norm": 0.622343310183164,
"learning_rate": 3.9814545553721456e-05,
"loss": 0.9337,
"step": 193
},
{
"epoch": 0.053395720085323055,
"grad_norm": 0.6795583093281086,
"learning_rate": 3.981216405263987e-05,
"loss": 0.9465,
"step": 194
},
{
"epoch": 0.05367095575586596,
"grad_norm": 0.7025074228756656,
"learning_rate": 3.980976743014905e-05,
"loss": 0.9629,
"step": 195
},
{
"epoch": 0.053946191426408864,
"grad_norm": 0.7154857620479277,
"learning_rate": 3.9807355688078193e-05,
"loss": 0.9609,
"step": 196
},
{
"epoch": 0.05422142709695176,
"grad_norm": 0.6481641291088678,
"learning_rate": 3.9804928828268015e-05,
"loss": 0.9278,
"step": 197
},
{
"epoch": 0.054496662767494666,
"grad_norm": 0.7075225654319801,
"learning_rate": 3.980248685257081e-05,
"loss": 0.9465,
"step": 198
},
{
"epoch": 0.05477189843803757,
"grad_norm": 0.8236382811112433,
"learning_rate": 3.980002976285037e-05,
"loss": 0.9202,
"step": 199
},
{
"epoch": 0.05504713410858047,
"grad_norm": 0.8492664853773008,
"learning_rate": 3.9797557560982056e-05,
"loss": 0.9491,
"step": 200
},
{
"epoch": 0.05532236977912337,
"grad_norm": 0.8531323294396649,
"learning_rate": 3.979507024885274e-05,
"loss": 0.9361,
"step": 201
},
{
"epoch": 0.05559760544966628,
"grad_norm": 1.2050949971243259,
"learning_rate": 3.9792567828360843e-05,
"loss": 0.939,
"step": 202
},
{
"epoch": 0.05587284112020918,
"grad_norm": 0.6054954581158436,
"learning_rate": 3.97900503014163e-05,
"loss": 0.9502,
"step": 203
},
{
"epoch": 0.05614807679075208,
"grad_norm": 0.7022132332194272,
"learning_rate": 3.978751766994059e-05,
"loss": 0.9512,
"step": 204
},
{
"epoch": 0.056423312461294985,
"grad_norm": 0.7524655364093807,
"learning_rate": 3.97849699358667e-05,
"loss": 0.9378,
"step": 205
},
{
"epoch": 0.05669854813183789,
"grad_norm": 0.8229128606866842,
"learning_rate": 3.978240710113919e-05,
"loss": 0.9252,
"step": 206
},
{
"epoch": 0.05697378380238079,
"grad_norm": 0.8015210160942963,
"learning_rate": 3.977982916771408e-05,
"loss": 0.9628,
"step": 207
},
{
"epoch": 0.05724901947292369,
"grad_norm": 0.6550145265716083,
"learning_rate": 3.977723613755897e-05,
"loss": 0.9351,
"step": 208
},
{
"epoch": 0.057524255143466596,
"grad_norm": 0.783410134030528,
"learning_rate": 3.9774628012652965e-05,
"loss": 0.9026,
"step": 209
},
{
"epoch": 0.057799490814009494,
"grad_norm": 0.8364287945657276,
"learning_rate": 3.9772004794986665e-05,
"loss": 0.9052,
"step": 210
},
{
"epoch": 0.0580747264845524,
"grad_norm": 0.8053375394519587,
"learning_rate": 3.976936648656223e-05,
"loss": 0.8964,
"step": 211
},
{
"epoch": 0.0583499621550953,
"grad_norm": 0.8625980591094503,
"learning_rate": 3.976671308939331e-05,
"loss": 0.9051,
"step": 212
},
{
"epoch": 0.0586251978256382,
"grad_norm": 0.9634136358053212,
"learning_rate": 3.976404460550509e-05,
"loss": 0.8621,
"step": 213
},
{
"epoch": 0.058900433496181105,
"grad_norm": 1.0650392212088253,
"learning_rate": 3.976136103693424e-05,
"loss": 0.9111,
"step": 214
},
{
"epoch": 0.05917566916672401,
"grad_norm": 0.8740403534764651,
"learning_rate": 3.9758662385728984e-05,
"loss": 0.9366,
"step": 215
},
{
"epoch": 0.05945090483726691,
"grad_norm": 0.6401848780793619,
"learning_rate": 3.975594865394903e-05,
"loss": 0.9537,
"step": 216
},
{
"epoch": 0.05972614050780981,
"grad_norm": 0.6706070644677035,
"learning_rate": 3.97532198436656e-05,
"loss": 0.9362,
"step": 217
},
{
"epoch": 0.06000137617835272,
"grad_norm": 0.7707750247530193,
"learning_rate": 3.975047595696142e-05,
"loss": 0.9437,
"step": 218
},
{
"epoch": 0.060276611848895614,
"grad_norm": 0.7921067517201671,
"learning_rate": 3.974771699593076e-05,
"loss": 0.9515,
"step": 219
},
{
"epoch": 0.06055184751943852,
"grad_norm": 0.7622632352659637,
"learning_rate": 3.974494296267933e-05,
"loss": 0.9137,
"step": 220
},
{
"epoch": 0.060827083189981423,
"grad_norm": 0.7753020252514123,
"learning_rate": 3.9742153859324403e-05,
"loss": 0.9477,
"step": 221
},
{
"epoch": 0.06110231886052432,
"grad_norm": 0.7746283948680501,
"learning_rate": 3.9739349687994713e-05,
"loss": 0.9404,
"step": 222
},
{
"epoch": 0.061377554531067226,
"grad_norm": 0.6872362412419311,
"learning_rate": 3.9736530450830525e-05,
"loss": 0.9442,
"step": 223
},
{
"epoch": 0.06165279020161013,
"grad_norm": 0.6563306084572785,
"learning_rate": 3.9733696149983586e-05,
"loss": 0.9379,
"step": 224
},
{
"epoch": 0.06192802587215303,
"grad_norm": 0.7495256808670511,
"learning_rate": 3.9730846787617145e-05,
"loss": 0.9649,
"step": 225
},
{
"epoch": 0.06220326154269593,
"grad_norm": 0.7568541570047215,
"learning_rate": 3.972798236590595e-05,
"loss": 0.8936,
"step": 226
},
{
"epoch": 0.06247849721323884,
"grad_norm": 0.7666357668612135,
"learning_rate": 3.972510288703622e-05,
"loss": 0.9227,
"step": 227
},
{
"epoch": 0.06275373288378173,
"grad_norm": 0.6930907336537482,
"learning_rate": 3.9722208353205704e-05,
"loss": 0.9552,
"step": 228
},
{
"epoch": 0.06302896855432465,
"grad_norm": 0.6806540530714671,
"learning_rate": 3.9719298766623614e-05,
"loss": 0.9431,
"step": 229
},
{
"epoch": 0.06330420422486754,
"grad_norm": 0.6249426098911484,
"learning_rate": 3.971637412951066e-05,
"loss": 0.9257,
"step": 230
},
{
"epoch": 0.06357943989541044,
"grad_norm": 0.678997884529542,
"learning_rate": 3.971343444409904e-05,
"loss": 0.9324,
"step": 231
},
{
"epoch": 0.06385467556595335,
"grad_norm": 0.6504053402255093,
"learning_rate": 3.9710479712632435e-05,
"loss": 0.9298,
"step": 232
},
{
"epoch": 0.06412991123649625,
"grad_norm": 0.6813223741520832,
"learning_rate": 3.9707509937366006e-05,
"loss": 0.9234,
"step": 233
},
{
"epoch": 0.06440514690703915,
"grad_norm": 0.5645584348910755,
"learning_rate": 3.9704525120566406e-05,
"loss": 0.899,
"step": 234
},
{
"epoch": 0.06468038257758206,
"grad_norm": 0.6096834703891368,
"learning_rate": 3.970152526451176e-05,
"loss": 0.922,
"step": 235
},
{
"epoch": 0.06495561824812496,
"grad_norm": 0.7075292059001774,
"learning_rate": 3.969851037149167e-05,
"loss": 0.9206,
"step": 236
},
{
"epoch": 0.06523085391866786,
"grad_norm": 0.6718415377168108,
"learning_rate": 3.969548044380722e-05,
"loss": 0.8914,
"step": 237
},
{
"epoch": 0.06550608958921077,
"grad_norm": 0.7192792263292144,
"learning_rate": 3.969243548377098e-05,
"loss": 0.95,
"step": 238
},
{
"epoch": 0.06578132525975366,
"grad_norm": 0.6723385117598139,
"learning_rate": 3.968937549370696e-05,
"loss": 0.9259,
"step": 239
},
{
"epoch": 0.06605656093029656,
"grad_norm": 0.653817726766455,
"learning_rate": 3.9686300475950686e-05,
"loss": 0.9126,
"step": 240
},
{
"epoch": 0.06633179660083947,
"grad_norm": 0.6365110370621555,
"learning_rate": 3.968321043284912e-05,
"loss": 0.9198,
"step": 241
},
{
"epoch": 0.06660703227138237,
"grad_norm": 0.6252107892810178,
"learning_rate": 3.9680105366760686e-05,
"loss": 0.9122,
"step": 242
},
{
"epoch": 0.06688226794192527,
"grad_norm": 0.6904254889862481,
"learning_rate": 3.9676985280055315e-05,
"loss": 0.9172,
"step": 243
},
{
"epoch": 0.06715750361246818,
"grad_norm": 0.6986909512580151,
"learning_rate": 3.9673850175114375e-05,
"loss": 0.9318,
"step": 244
},
{
"epoch": 0.06743273928301108,
"grad_norm": 0.6788305029535093,
"learning_rate": 3.9670700054330685e-05,
"loss": 0.9428,
"step": 245
},
{
"epoch": 0.06770797495355398,
"grad_norm": 0.5920166716231594,
"learning_rate": 3.9667534920108545e-05,
"loss": 0.9142,
"step": 246
},
{
"epoch": 0.06798321062409689,
"grad_norm": 0.5856057224883433,
"learning_rate": 3.966435477486371e-05,
"loss": 0.9186,
"step": 247
},
{
"epoch": 0.06825844629463979,
"grad_norm": 0.6403810123397401,
"learning_rate": 3.966115962102339e-05,
"loss": 0.926,
"step": 248
},
{
"epoch": 0.06853368196518268,
"grad_norm": 0.7177599031300991,
"learning_rate": 3.965794946102625e-05,
"loss": 0.913,
"step": 249
},
{
"epoch": 0.0688089176357256,
"grad_norm": 0.6657273816333562,
"learning_rate": 3.9654724297322406e-05,
"loss": 0.9264,
"step": 250
},
{
"epoch": 0.06908415330626849,
"grad_norm": 0.7383600410114154,
"learning_rate": 3.965148413237342e-05,
"loss": 0.9296,
"step": 251
},
{
"epoch": 0.06935938897681139,
"grad_norm": 0.5409635282957039,
"learning_rate": 3.964822896865234e-05,
"loss": 0.9117,
"step": 252
},
{
"epoch": 0.0696346246473543,
"grad_norm": 0.6016959320603094,
"learning_rate": 3.96449588086436e-05,
"loss": 0.9111,
"step": 253
},
{
"epoch": 0.0699098603178972,
"grad_norm": 0.6359959140668964,
"learning_rate": 3.964167365484312e-05,
"loss": 0.8903,
"step": 254
},
{
"epoch": 0.0701850959884401,
"grad_norm": 0.7068907835626644,
"learning_rate": 3.9638373509758274e-05,
"loss": 0.9083,
"step": 255
},
{
"epoch": 0.07046033165898301,
"grad_norm": 0.7151486793125481,
"learning_rate": 3.9635058375907836e-05,
"loss": 0.9502,
"step": 256
},
{
"epoch": 0.0707355673295259,
"grad_norm": 0.6389786677951766,
"learning_rate": 3.963172825582206e-05,
"loss": 0.9124,
"step": 257
},
{
"epoch": 0.0710108030000688,
"grad_norm": 0.5910610648092535,
"learning_rate": 3.962838315204262e-05,
"loss": 0.9242,
"step": 258
},
{
"epoch": 0.07128603867061171,
"grad_norm": 0.543348196410837,
"learning_rate": 3.962502306712263e-05,
"loss": 0.9436,
"step": 259
},
{
"epoch": 0.07156127434115461,
"grad_norm": 0.4759432647751585,
"learning_rate": 3.962164800362662e-05,
"loss": 0.94,
"step": 260
},
{
"epoch": 0.07183651001169751,
"grad_norm": 0.5489501422865493,
"learning_rate": 3.961825796413059e-05,
"loss": 0.894,
"step": 261
},
{
"epoch": 0.07211174568224042,
"grad_norm": 0.5545140012133268,
"learning_rate": 3.9614852951221945e-05,
"loss": 0.9268,
"step": 262
},
{
"epoch": 0.07238698135278332,
"grad_norm": 0.5176304561516295,
"learning_rate": 3.961143296749952e-05,
"loss": 0.9018,
"step": 263
},
{
"epoch": 0.07266221702332622,
"grad_norm": 0.4936550228340853,
"learning_rate": 3.960799801557357e-05,
"loss": 0.9271,
"step": 264
},
{
"epoch": 0.07293745269386913,
"grad_norm": 0.4649125188981785,
"learning_rate": 3.9604548098065796e-05,
"loss": 0.9009,
"step": 265
},
{
"epoch": 0.07321268836441203,
"grad_norm": 0.45983355235823387,
"learning_rate": 3.96010832176093e-05,
"loss": 0.9095,
"step": 266
},
{
"epoch": 0.07348792403495492,
"grad_norm": 0.46023598503687446,
"learning_rate": 3.9597603376848614e-05,
"loss": 0.9525,
"step": 267
},
{
"epoch": 0.07376315970549784,
"grad_norm": 0.43826928991196545,
"learning_rate": 3.959410857843969e-05,
"loss": 0.9357,
"step": 268
},
{
"epoch": 0.07403839537604073,
"grad_norm": 0.44969426028379234,
"learning_rate": 3.9590598825049896e-05,
"loss": 0.9052,
"step": 269
},
{
"epoch": 0.07431363104658363,
"grad_norm": 0.4172117173156426,
"learning_rate": 3.9587074119358e-05,
"loss": 0.9029,
"step": 270
},
{
"epoch": 0.07458886671712654,
"grad_norm": 0.4484248145931521,
"learning_rate": 3.95835344640542e-05,
"loss": 0.9308,
"step": 271
},
{
"epoch": 0.07486410238766944,
"grad_norm": 0.5135846985483652,
"learning_rate": 3.957997986184011e-05,
"loss": 0.9143,
"step": 272
},
{
"epoch": 0.07513933805821234,
"grad_norm": 0.48945107499007995,
"learning_rate": 3.957641031542872e-05,
"loss": 0.9235,
"step": 273
},
{
"epoch": 0.07541457372875525,
"grad_norm": 0.5754430181789083,
"learning_rate": 3.957282582754445e-05,
"loss": 0.9134,
"step": 274
},
{
"epoch": 0.07568980939929815,
"grad_norm": 0.5531912332257736,
"learning_rate": 3.9569226400923135e-05,
"loss": 0.9126,
"step": 275
},
{
"epoch": 0.07596504506984104,
"grad_norm": 0.6388268251950844,
"learning_rate": 3.956561203831198e-05,
"loss": 0.8906,
"step": 276
},
{
"epoch": 0.07624028074038396,
"grad_norm": 0.7349885262264578,
"learning_rate": 3.9561982742469606e-05,
"loss": 0.9171,
"step": 277
},
{
"epoch": 0.07651551641092685,
"grad_norm": 0.6680870684670991,
"learning_rate": 3.955833851616604e-05,
"loss": 0.873,
"step": 278
},
{
"epoch": 0.07679075208146977,
"grad_norm": 0.6461308971358634,
"learning_rate": 3.95546793621827e-05,
"loss": 0.9046,
"step": 279
},
{
"epoch": 0.07706598775201266,
"grad_norm": 0.6300989584233229,
"learning_rate": 3.955100528331238e-05,
"loss": 0.8672,
"step": 280
},
{
"epoch": 0.07734122342255556,
"grad_norm": 0.6351347433080852,
"learning_rate": 3.9547316282359284e-05,
"loss": 0.9448,
"step": 281
},
{
"epoch": 0.07761645909309847,
"grad_norm": 0.953155967281556,
"learning_rate": 3.954361236213901e-05,
"loss": 0.9118,
"step": 282
},
{
"epoch": 0.07789169476364137,
"grad_norm": 0.7702100160420944,
"learning_rate": 3.9539893525478524e-05,
"loss": 0.9099,
"step": 283
},
{
"epoch": 0.07816693043418427,
"grad_norm": 0.8391527574702072,
"learning_rate": 3.9536159775216185e-05,
"loss": 0.9096,
"step": 284
},
{
"epoch": 0.07844216610472718,
"grad_norm": 0.925785241620294,
"learning_rate": 3.953241111420174e-05,
"loss": 0.9365,
"step": 285
},
{
"epoch": 0.07871740177527008,
"grad_norm": 1.0370904205356115,
"learning_rate": 3.9528647545296306e-05,
"loss": 0.9076,
"step": 286
},
{
"epoch": 0.07899263744581297,
"grad_norm": 0.8333618823361661,
"learning_rate": 3.952486907137239e-05,
"loss": 0.9239,
"step": 287
},
{
"epoch": 0.07926787311635589,
"grad_norm": 0.616154186568232,
"learning_rate": 3.9521075695313864e-05,
"loss": 0.9181,
"step": 288
},
{
"epoch": 0.07954310878689878,
"grad_norm": 0.6315972529810878,
"learning_rate": 3.951726742001599e-05,
"loss": 0.8923,
"step": 289
},
{
"epoch": 0.07981834445744168,
"grad_norm": 0.751290814003144,
"learning_rate": 3.951344424838538e-05,
"loss": 0.9555,
"step": 290
},
{
"epoch": 0.08009358012798459,
"grad_norm": 0.6998218974846498,
"learning_rate": 3.9509606183340026e-05,
"loss": 0.8874,
"step": 291
},
{
"epoch": 0.08036881579852749,
"grad_norm": 0.7352815171351497,
"learning_rate": 3.950575322780929e-05,
"loss": 0.9089,
"step": 292
},
{
"epoch": 0.08064405146907039,
"grad_norm": 0.7023022013545431,
"learning_rate": 3.9501885384733906e-05,
"loss": 0.909,
"step": 293
},
{
"epoch": 0.0809192871396133,
"grad_norm": 0.6749091557752724,
"learning_rate": 3.949800265706595e-05,
"loss": 0.8704,
"step": 294
},
{
"epoch": 0.0811945228101562,
"grad_norm": 0.7312732879936177,
"learning_rate": 3.949410504776887e-05,
"loss": 0.8886,
"step": 295
},
{
"epoch": 0.0814697584806991,
"grad_norm": 0.655108696795065,
"learning_rate": 3.949019255981747e-05,
"loss": 0.942,
"step": 296
},
{
"epoch": 0.081744994151242,
"grad_norm": 0.6761260309562537,
"learning_rate": 3.948626519619793e-05,
"loss": 0.908,
"step": 297
},
{
"epoch": 0.0820202298217849,
"grad_norm": 0.5891569339909765,
"learning_rate": 3.9482322959907745e-05,
"loss": 0.8947,
"step": 298
},
{
"epoch": 0.0822954654923278,
"grad_norm": 0.5268646031762036,
"learning_rate": 3.947836585395579e-05,
"loss": 0.8896,
"step": 299
},
{
"epoch": 0.08257070116287071,
"grad_norm": 0.5572584026668544,
"learning_rate": 3.947439388136228e-05,
"loss": 0.9279,
"step": 300
},
{
"epoch": 0.08284593683341361,
"grad_norm": 0.6016927517733835,
"learning_rate": 3.947040704515878e-05,
"loss": 0.9121,
"step": 301
},
{
"epoch": 0.08312117250395651,
"grad_norm": 0.6196112117746112,
"learning_rate": 3.94664053483882e-05,
"loss": 0.9493,
"step": 302
},
{
"epoch": 0.08339640817449942,
"grad_norm": 0.5729651894313627,
"learning_rate": 3.946238879410478e-05,
"loss": 0.9029,
"step": 303
},
{
"epoch": 0.08367164384504232,
"grad_norm": 0.5416308933869269,
"learning_rate": 3.9458357385374116e-05,
"loss": 0.9092,
"step": 304
},
{
"epoch": 0.08394687951558522,
"grad_norm": 0.5781810718082886,
"learning_rate": 3.945431112527314e-05,
"loss": 0.964,
"step": 305
},
{
"epoch": 0.08422211518612813,
"grad_norm": 0.5592511163596525,
"learning_rate": 3.94502500168901e-05,
"loss": 0.903,
"step": 306
},
{
"epoch": 0.08449735085667102,
"grad_norm": 0.5541896438592137,
"learning_rate": 3.944617406332461e-05,
"loss": 0.8853,
"step": 307
},
{
"epoch": 0.08477258652721392,
"grad_norm": 0.5489645129909775,
"learning_rate": 3.944208326768758e-05,
"loss": 0.911,
"step": 308
},
{
"epoch": 0.08504782219775683,
"grad_norm": 0.5625633636794496,
"learning_rate": 3.9437977633101266e-05,
"loss": 0.8833,
"step": 309
},
{
"epoch": 0.08532305786829973,
"grad_norm": 0.5448235271657504,
"learning_rate": 3.9433857162699245e-05,
"loss": 0.8645,
"step": 310
},
{
"epoch": 0.08559829353884263,
"grad_norm": 0.6148293436890931,
"learning_rate": 3.9429721859626434e-05,
"loss": 0.8982,
"step": 311
},
{
"epoch": 0.08587352920938554,
"grad_norm": 0.5856838059042696,
"learning_rate": 3.942557172703903e-05,
"loss": 0.8764,
"step": 312
},
{
"epoch": 0.08614876487992844,
"grad_norm": 0.6627976946036365,
"learning_rate": 3.94214067681046e-05,
"loss": 0.8854,
"step": 313
},
{
"epoch": 0.08642400055047134,
"grad_norm": 0.723862149221964,
"learning_rate": 3.9417226986001994e-05,
"loss": 0.9025,
"step": 314
},
{
"epoch": 0.08669923622101425,
"grad_norm": 0.649076526741882,
"learning_rate": 3.9413032383921374e-05,
"loss": 0.8537,
"step": 315
},
{
"epoch": 0.08697447189155715,
"grad_norm": 0.5897716399851319,
"learning_rate": 3.940882296506423e-05,
"loss": 0.9179,
"step": 316
},
{
"epoch": 0.08724970756210004,
"grad_norm": 0.5707783248675382,
"learning_rate": 3.940459873264336e-05,
"loss": 0.9182,
"step": 317
},
{
"epoch": 0.08752494323264295,
"grad_norm": 0.5739789279770842,
"learning_rate": 3.940035968988284e-05,
"loss": 0.8827,
"step": 318
},
{
"epoch": 0.08780017890318585,
"grad_norm": 0.587354108382882,
"learning_rate": 3.939610584001809e-05,
"loss": 0.9102,
"step": 319
},
{
"epoch": 0.08807541457372875,
"grad_norm": 0.5812972779745122,
"learning_rate": 3.9391837186295816e-05,
"loss": 0.8915,
"step": 320
},
{
"epoch": 0.08835065024427166,
"grad_norm": 0.5602934710906348,
"learning_rate": 3.9387553731974e-05,
"loss": 0.8849,
"step": 321
},
{
"epoch": 0.08862588591481456,
"grad_norm": 0.5928749356180759,
"learning_rate": 3.9383255480321955e-05,
"loss": 0.896,
"step": 322
},
{
"epoch": 0.08890112158535746,
"grad_norm": 0.4846448604955536,
"learning_rate": 3.937894243462027e-05,
"loss": 0.894,
"step": 323
},
{
"epoch": 0.08917635725590037,
"grad_norm": 0.5364236454230179,
"learning_rate": 3.937461459816082e-05,
"loss": 0.9165,
"step": 324
},
{
"epoch": 0.08945159292644327,
"grad_norm": 0.575559207025926,
"learning_rate": 3.937027197424679e-05,
"loss": 0.901,
"step": 325
},
{
"epoch": 0.08972682859698616,
"grad_norm": 0.647872054045973,
"learning_rate": 3.9365914566192635e-05,
"loss": 0.8753,
"step": 326
},
{
"epoch": 0.09000206426752907,
"grad_norm": 0.6496703700196986,
"learning_rate": 3.936154237732409e-05,
"loss": 0.9088,
"step": 327
},
{
"epoch": 0.09027729993807197,
"grad_norm": 0.6185755595736986,
"learning_rate": 3.9357155410978184e-05,
"loss": 0.9084,
"step": 328
},
{
"epoch": 0.09055253560861487,
"grad_norm": 0.6250487984329662,
"learning_rate": 3.9352753670503216e-05,
"loss": 0.9227,
"step": 329
},
{
"epoch": 0.09082777127915778,
"grad_norm": 0.60775145646772,
"learning_rate": 3.934833715925877e-05,
"loss": 0.8739,
"step": 330
},
{
"epoch": 0.09110300694970068,
"grad_norm": 0.6171269689039475,
"learning_rate": 3.934390588061569e-05,
"loss": 0.8905,
"step": 331
},
{
"epoch": 0.09137824262024358,
"grad_norm": 0.5524209601199233,
"learning_rate": 3.933945983795611e-05,
"loss": 0.8986,
"step": 332
},
{
"epoch": 0.09165347829078649,
"grad_norm": 0.6016660474331017,
"learning_rate": 3.933499903467341e-05,
"loss": 0.9203,
"step": 333
},
{
"epoch": 0.09192871396132939,
"grad_norm": 0.6058072777994923,
"learning_rate": 3.933052347417225e-05,
"loss": 0.9331,
"step": 334
},
{
"epoch": 0.09220394963187228,
"grad_norm": 0.7352857206810751,
"learning_rate": 3.932603315986856e-05,
"loss": 0.8583,
"step": 335
},
{
"epoch": 0.0924791853024152,
"grad_norm": 0.729197233429657,
"learning_rate": 3.932152809518951e-05,
"loss": 0.8843,
"step": 336
},
{
"epoch": 0.0927544209729581,
"grad_norm": 0.6360878987285922,
"learning_rate": 3.931700828357355e-05,
"loss": 0.9146,
"step": 337
},
{
"epoch": 0.09302965664350099,
"grad_norm": 0.5538393227342899,
"learning_rate": 3.9312473728470364e-05,
"loss": 0.8909,
"step": 338
},
{
"epoch": 0.0933048923140439,
"grad_norm": 0.5572788719586435,
"learning_rate": 3.9307924433340906e-05,
"loss": 0.9228,
"step": 339
},
{
"epoch": 0.0935801279845868,
"grad_norm": 0.6080360349844633,
"learning_rate": 3.930336040165738e-05,
"loss": 0.8727,
"step": 340
},
{
"epoch": 0.09385536365512971,
"grad_norm": 0.794099959407607,
"learning_rate": 3.9298781636903215e-05,
"loss": 0.9092,
"step": 341
},
{
"epoch": 0.09413059932567261,
"grad_norm": 0.6791574529442961,
"learning_rate": 3.929418814257311e-05,
"loss": 0.8966,
"step": 342
},
{
"epoch": 0.0944058349962155,
"grad_norm": 0.5248090948237437,
"learning_rate": 3.9289579922173e-05,
"loss": 0.896,
"step": 343
},
{
"epoch": 0.09468107066675842,
"grad_norm": 0.5284805439594273,
"learning_rate": 3.9284956979220056e-05,
"loss": 0.8968,
"step": 344
},
{
"epoch": 0.09495630633730132,
"grad_norm": 0.550975681076434,
"learning_rate": 3.928031931724269e-05,
"loss": 0.9246,
"step": 345
},
{
"epoch": 0.09523154200784421,
"grad_norm": 0.505929866249553,
"learning_rate": 3.927566693978053e-05,
"loss": 0.8796,
"step": 346
},
{
"epoch": 0.09550677767838713,
"grad_norm": 0.5080069783013221,
"learning_rate": 3.927099985038446e-05,
"loss": 0.9042,
"step": 347
},
{
"epoch": 0.09578201334893002,
"grad_norm": 0.5053139389182594,
"learning_rate": 3.926631805261659e-05,
"loss": 0.897,
"step": 348
},
{
"epoch": 0.09605724901947292,
"grad_norm": 0.5291352192477727,
"learning_rate": 3.926162155005024e-05,
"loss": 0.8695,
"step": 349
},
{
"epoch": 0.09633248469001583,
"grad_norm": 0.435108769302938,
"learning_rate": 3.925691034626997e-05,
"loss": 0.8927,
"step": 350
},
{
"epoch": 0.09660772036055873,
"grad_norm": 0.4625701183252576,
"learning_rate": 3.925218444487154e-05,
"loss": 0.9128,
"step": 351
},
{
"epoch": 0.09688295603110163,
"grad_norm": 0.5561823027644259,
"learning_rate": 3.924744384946195e-05,
"loss": 0.8551,
"step": 352
},
{
"epoch": 0.09715819170164454,
"grad_norm": 0.6897758949035243,
"learning_rate": 3.9242688563659406e-05,
"loss": 0.8996,
"step": 353
},
{
"epoch": 0.09743342737218744,
"grad_norm": 0.518046280528702,
"learning_rate": 3.923791859109332e-05,
"loss": 0.8713,
"step": 354
},
{
"epoch": 0.09770866304273033,
"grad_norm": 0.6141786904296741,
"learning_rate": 3.923313393540433e-05,
"loss": 0.9132,
"step": 355
},
{
"epoch": 0.09798389871327325,
"grad_norm": 0.48284229958533237,
"learning_rate": 3.922833460024425e-05,
"loss": 0.9018,
"step": 356
},
{
"epoch": 0.09825913438381614,
"grad_norm": 0.4409973295054774,
"learning_rate": 3.922352058927614e-05,
"loss": 0.8537,
"step": 357
},
{
"epoch": 0.09853437005435904,
"grad_norm": 0.4890342415470555,
"learning_rate": 3.921869190617423e-05,
"loss": 0.881,
"step": 358
},
{
"epoch": 0.09880960572490195,
"grad_norm": 0.5005438507702201,
"learning_rate": 3.921384855462396e-05,
"loss": 0.8769,
"step": 359
},
{
"epoch": 0.09908484139544485,
"grad_norm": 0.47314532654978314,
"learning_rate": 3.920899053832195e-05,
"loss": 0.8736,
"step": 360
},
{
"epoch": 0.09936007706598775,
"grad_norm": 0.5428084046874413,
"learning_rate": 3.920411786097605e-05,
"loss": 0.8566,
"step": 361
},
{
"epoch": 0.09963531273653066,
"grad_norm": 0.6599627349997179,
"learning_rate": 3.919923052630526e-05,
"loss": 0.8874,
"step": 362
},
{
"epoch": 0.09991054840707356,
"grad_norm": 0.6702735192730017,
"learning_rate": 3.9194328538039775e-05,
"loss": 0.9135,
"step": 363
},
{
"epoch": 0.10018578407761645,
"grad_norm": 0.6260073822159783,
"learning_rate": 3.9189411899921e-05,
"loss": 0.8642,
"step": 364
},
{
"epoch": 0.10046101974815937,
"grad_norm": 0.6037436224945719,
"learning_rate": 3.9184480615701496e-05,
"loss": 0.898,
"step": 365
},
{
"epoch": 0.10073625541870226,
"grad_norm": 0.553940568521368,
"learning_rate": 3.917953468914501e-05,
"loss": 0.8849,
"step": 366
},
{
"epoch": 0.10101149108924516,
"grad_norm": 0.5691010429228973,
"learning_rate": 3.917457412402645e-05,
"loss": 0.8892,
"step": 367
},
{
"epoch": 0.10128672675978807,
"grad_norm": 0.5059542629361516,
"learning_rate": 3.916959892413194e-05,
"loss": 0.9121,
"step": 368
},
{
"epoch": 0.10156196243033097,
"grad_norm": 0.48598338363082094,
"learning_rate": 3.9164609093258726e-05,
"loss": 0.8686,
"step": 369
},
{
"epoch": 0.10183719810087387,
"grad_norm": 0.582121236237182,
"learning_rate": 3.9159604635215236e-05,
"loss": 0.8563,
"step": 370
},
{
"epoch": 0.10211243377141678,
"grad_norm": 0.6162442781859768,
"learning_rate": 3.915458555382108e-05,
"loss": 0.8713,
"step": 371
},
{
"epoch": 0.10238766944195968,
"grad_norm": 0.6059270830911037,
"learning_rate": 3.9149551852907e-05,
"loss": 0.8955,
"step": 372
},
{
"epoch": 0.10266290511250258,
"grad_norm": 0.5495886266547979,
"learning_rate": 3.914450353631492e-05,
"loss": 0.9098,
"step": 373
},
{
"epoch": 0.10293814078304549,
"grad_norm": 0.7435991277339683,
"learning_rate": 3.913944060789791e-05,
"loss": 0.9084,
"step": 374
},
{
"epoch": 0.10321337645358838,
"grad_norm": 0.5412944053150593,
"learning_rate": 3.91343630715202e-05,
"loss": 0.8736,
"step": 375
},
{
"epoch": 0.10348861212413128,
"grad_norm": 0.5300055966983039,
"learning_rate": 3.912927093105714e-05,
"loss": 0.8706,
"step": 376
},
{
"epoch": 0.1037638477946742,
"grad_norm": 0.5085340455305501,
"learning_rate": 3.912416419039526e-05,
"loss": 0.8844,
"step": 377
},
{
"epoch": 0.10403908346521709,
"grad_norm": 0.5314131577090296,
"learning_rate": 3.911904285343224e-05,
"loss": 0.8811,
"step": 378
},
{
"epoch": 0.10431431913575999,
"grad_norm": 0.5743431648299027,
"learning_rate": 3.911390692407685e-05,
"loss": 0.8823,
"step": 379
},
{
"epoch": 0.1045895548063029,
"grad_norm": 0.5575204437188148,
"learning_rate": 3.910875640624905e-05,
"loss": 0.8732,
"step": 380
},
{
"epoch": 0.1048647904768458,
"grad_norm": 0.6500868814849747,
"learning_rate": 3.910359130387991e-05,
"loss": 0.8587,
"step": 381
},
{
"epoch": 0.1051400261473887,
"grad_norm": 0.5906306776112186,
"learning_rate": 3.909841162091164e-05,
"loss": 0.9026,
"step": 382
},
{
"epoch": 0.10541526181793161,
"grad_norm": 0.6266630869849823,
"learning_rate": 3.909321736129757e-05,
"loss": 0.8938,
"step": 383
},
{
"epoch": 0.1056904974884745,
"grad_norm": 0.6718226466340169,
"learning_rate": 3.908800852900215e-05,
"loss": 0.8786,
"step": 384
},
{
"epoch": 0.1059657331590174,
"grad_norm": 0.6634999849790058,
"learning_rate": 3.908278512800098e-05,
"loss": 0.8885,
"step": 385
},
{
"epoch": 0.10624096882956031,
"grad_norm": 0.6431127170000246,
"learning_rate": 3.9077547162280754e-05,
"loss": 0.8749,
"step": 386
},
{
"epoch": 0.10651620450010321,
"grad_norm": 0.5710373048152448,
"learning_rate": 3.907229463583928e-05,
"loss": 0.8723,
"step": 387
},
{
"epoch": 0.10679144017064611,
"grad_norm": 0.5416823760384776,
"learning_rate": 3.9067027552685506e-05,
"loss": 0.8954,
"step": 388
},
{
"epoch": 0.10706667584118902,
"grad_norm": 0.4573907707132969,
"learning_rate": 3.906174591683946e-05,
"loss": 0.8981,
"step": 389
},
{
"epoch": 0.10734191151173192,
"grad_norm": 0.5611673580990199,
"learning_rate": 3.90564497323323e-05,
"loss": 0.9131,
"step": 390
},
{
"epoch": 0.10761714718227482,
"grad_norm": 0.6365557897336063,
"learning_rate": 3.905113900320627e-05,
"loss": 0.895,
"step": 391
},
{
"epoch": 0.10789238285281773,
"grad_norm": 0.7015761098854781,
"learning_rate": 3.904581373351474e-05,
"loss": 0.8965,
"step": 392
},
{
"epoch": 0.10816761852336063,
"grad_norm": 0.637562535977938,
"learning_rate": 3.9040473927322136e-05,
"loss": 0.8802,
"step": 393
},
{
"epoch": 0.10844285419390352,
"grad_norm": 0.6240363416564655,
"learning_rate": 3.9035119588704026e-05,
"loss": 0.9175,
"step": 394
},
{
"epoch": 0.10871808986444643,
"grad_norm": 0.5664443709947848,
"learning_rate": 3.902975072174704e-05,
"loss": 0.8742,
"step": 395
},
{
"epoch": 0.10899332553498933,
"grad_norm": 0.5507530290972074,
"learning_rate": 3.9024367330548904e-05,
"loss": 0.8716,
"step": 396
},
{
"epoch": 0.10926856120553223,
"grad_norm": 0.631821090156823,
"learning_rate": 3.901896941921843e-05,
"loss": 0.901,
"step": 397
},
{
"epoch": 0.10954379687607514,
"grad_norm": 0.6293458415988438,
"learning_rate": 3.9013556991875515e-05,
"loss": 0.8666,
"step": 398
},
{
"epoch": 0.10981903254661804,
"grad_norm": 0.5983356080974974,
"learning_rate": 3.900813005265113e-05,
"loss": 0.8703,
"step": 399
},
{
"epoch": 0.11009426821716094,
"grad_norm": 0.4978346446457818,
"learning_rate": 3.9002688605687334e-05,
"loss": 0.8923,
"step": 400
},
{
"epoch": 0.11036950388770385,
"grad_norm": 0.48512440368202475,
"learning_rate": 3.8997232655137234e-05,
"loss": 0.8714,
"step": 401
},
{
"epoch": 0.11064473955824675,
"grad_norm": 0.517562020241431,
"learning_rate": 3.899176220516504e-05,
"loss": 0.8678,
"step": 402
},
{
"epoch": 0.11091997522878966,
"grad_norm": 0.5717948870585363,
"learning_rate": 3.8986277259945996e-05,
"loss": 0.8691,
"step": 403
},
{
"epoch": 0.11119521089933256,
"grad_norm": 0.4991076776447727,
"learning_rate": 3.898077782366643e-05,
"loss": 0.874,
"step": 404
},
{
"epoch": 0.11147044656987545,
"grad_norm": 0.4855826990964506,
"learning_rate": 3.897526390052372e-05,
"loss": 0.8593,
"step": 405
},
{
"epoch": 0.11174568224041836,
"grad_norm": 0.4874229602866942,
"learning_rate": 3.8969735494726306e-05,
"loss": 0.8838,
"step": 406
},
{
"epoch": 0.11202091791096126,
"grad_norm": 0.5330378392189458,
"learning_rate": 3.896419261049369e-05,
"loss": 0.8427,
"step": 407
},
{
"epoch": 0.11229615358150416,
"grad_norm": 0.575968875495886,
"learning_rate": 3.8958635252056404e-05,
"loss": 0.8692,
"step": 408
},
{
"epoch": 0.11257138925204707,
"grad_norm": 0.4801689204644928,
"learning_rate": 3.8953063423656055e-05,
"loss": 0.892,
"step": 409
},
{
"epoch": 0.11284662492258997,
"grad_norm": 0.48737892640435837,
"learning_rate": 3.8947477129545256e-05,
"loss": 0.8883,
"step": 410
},
{
"epoch": 0.11312186059313287,
"grad_norm": 0.5244170455238508,
"learning_rate": 3.89418763739877e-05,
"loss": 0.8641,
"step": 411
},
{
"epoch": 0.11339709626367578,
"grad_norm": 0.5375336765948339,
"learning_rate": 3.8936261161258094e-05,
"loss": 0.879,
"step": 412
},
{
"epoch": 0.11367233193421868,
"grad_norm": 0.5194538613237015,
"learning_rate": 3.893063149564218e-05,
"loss": 0.8546,
"step": 413
},
{
"epoch": 0.11394756760476157,
"grad_norm": 0.5520513158148513,
"learning_rate": 3.8924987381436746e-05,
"loss": 0.8748,
"step": 414
},
{
"epoch": 0.11422280327530449,
"grad_norm": 0.6132063585013063,
"learning_rate": 3.8919328822949587e-05,
"loss": 0.8525,
"step": 415
},
{
"epoch": 0.11449803894584738,
"grad_norm": 0.6751742930556689,
"learning_rate": 3.8913655824499536e-05,
"loss": 0.8704,
"step": 416
},
{
"epoch": 0.11477327461639028,
"grad_norm": 0.7321795805175227,
"learning_rate": 3.890796839041646e-05,
"loss": 0.8755,
"step": 417
},
{
"epoch": 0.11504851028693319,
"grad_norm": 0.5708036927403648,
"learning_rate": 3.890226652504121e-05,
"loss": 0.8703,
"step": 418
},
{
"epoch": 0.11532374595747609,
"grad_norm": 0.504748003198664,
"learning_rate": 3.889655023272568e-05,
"loss": 0.8596,
"step": 419
},
{
"epoch": 0.11559898162801899,
"grad_norm": 0.5900956763318453,
"learning_rate": 3.889081951783276e-05,
"loss": 0.9089,
"step": 420
},
{
"epoch": 0.1158742172985619,
"grad_norm": 0.6466905218632802,
"learning_rate": 3.888507438473636e-05,
"loss": 0.8628,
"step": 421
},
{
"epoch": 0.1161494529691048,
"grad_norm": 0.6078563855062546,
"learning_rate": 3.887931483782137e-05,
"loss": 0.9246,
"step": 422
},
{
"epoch": 0.1164246886396477,
"grad_norm": 0.49866059364732357,
"learning_rate": 3.8873540881483725e-05,
"loss": 0.8576,
"step": 423
},
{
"epoch": 0.1166999243101906,
"grad_norm": 0.5124859820552345,
"learning_rate": 3.8867752520130315e-05,
"loss": 0.8908,
"step": 424
},
{
"epoch": 0.1169751599807335,
"grad_norm": 0.5627280720731888,
"learning_rate": 3.8861949758179044e-05,
"loss": 0.8969,
"step": 425
},
{
"epoch": 0.1172503956512764,
"grad_norm": 0.5759560120018811,
"learning_rate": 3.88561326000588e-05,
"loss": 0.8467,
"step": 426
},
{
"epoch": 0.11752563132181931,
"grad_norm": 0.5862534370058109,
"learning_rate": 3.8850301050209476e-05,
"loss": 0.9076,
"step": 427
},
{
"epoch": 0.11780086699236221,
"grad_norm": 0.5567090118087943,
"learning_rate": 3.8844455113081915e-05,
"loss": 0.8969,
"step": 428
},
{
"epoch": 0.11807610266290511,
"grad_norm": 0.5325378918415913,
"learning_rate": 3.883859479313798e-05,
"loss": 0.8923,
"step": 429
},
{
"epoch": 0.11835133833344802,
"grad_norm": 0.5272783210005675,
"learning_rate": 3.883272009485049e-05,
"loss": 0.8667,
"step": 430
},
{
"epoch": 0.11862657400399092,
"grad_norm": 0.5457322963275876,
"learning_rate": 3.8826831022703245e-05,
"loss": 0.8551,
"step": 431
},
{
"epoch": 0.11890180967453381,
"grad_norm": 0.5741588144845013,
"learning_rate": 3.882092758119099e-05,
"loss": 0.8421,
"step": 432
},
{
"epoch": 0.11917704534507673,
"grad_norm": 0.4836464637866908,
"learning_rate": 3.88150097748195e-05,
"loss": 0.8777,
"step": 433
},
{
"epoch": 0.11945228101561962,
"grad_norm": 0.5898419315572756,
"learning_rate": 3.8809077608105435e-05,
"loss": 0.8443,
"step": 434
},
{
"epoch": 0.11972751668616252,
"grad_norm": 0.5942615878371786,
"learning_rate": 3.8803131085576477e-05,
"loss": 0.8509,
"step": 435
},
{
"epoch": 0.12000275235670543,
"grad_norm": 0.5024995991629244,
"learning_rate": 3.879717021177123e-05,
"loss": 0.9012,
"step": 436
},
{
"epoch": 0.12027798802724833,
"grad_norm": 0.49176826477914476,
"learning_rate": 3.879119499123927e-05,
"loss": 0.9095,
"step": 437
},
{
"epoch": 0.12055322369779123,
"grad_norm": 0.4997512671977748,
"learning_rate": 3.878520542854111e-05,
"loss": 0.8522,
"step": 438
},
{
"epoch": 0.12082845936833414,
"grad_norm": 0.5174557816004738,
"learning_rate": 3.877920152824822e-05,
"loss": 0.8709,
"step": 439
},
{
"epoch": 0.12110369503887704,
"grad_norm": 0.5462519993585703,
"learning_rate": 3.8773183294943015e-05,
"loss": 0.8558,
"step": 440
},
{
"epoch": 0.12137893070941994,
"grad_norm": 0.5669830253709262,
"learning_rate": 3.876715073321883e-05,
"loss": 0.8589,
"step": 441
},
{
"epoch": 0.12165416637996285,
"grad_norm": 0.5559979293663662,
"learning_rate": 3.876110384767996e-05,
"loss": 0.8666,
"step": 442
},
{
"epoch": 0.12192940205050574,
"grad_norm": 0.5352320132499331,
"learning_rate": 3.875504264294161e-05,
"loss": 0.8658,
"step": 443
},
{
"epoch": 0.12220463772104864,
"grad_norm": 0.5097852955806261,
"learning_rate": 3.874896712362994e-05,
"loss": 0.8923,
"step": 444
},
{
"epoch": 0.12247987339159155,
"grad_norm": 0.5679336085640061,
"learning_rate": 3.874287729438201e-05,
"loss": 0.8747,
"step": 445
},
{
"epoch": 0.12275510906213445,
"grad_norm": 0.5717500090072024,
"learning_rate": 3.873677315984582e-05,
"loss": 0.9141,
"step": 446
},
{
"epoch": 0.12303034473267735,
"grad_norm": 0.5351395879024524,
"learning_rate": 3.8730654724680284e-05,
"loss": 0.887,
"step": 447
},
{
"epoch": 0.12330558040322026,
"grad_norm": 0.5056236978853025,
"learning_rate": 3.8724521993555216e-05,
"loss": 0.8712,
"step": 448
},
{
"epoch": 0.12358081607376316,
"grad_norm": 0.445389896216137,
"learning_rate": 3.8718374971151356e-05,
"loss": 0.8856,
"step": 449
},
{
"epoch": 0.12385605174430606,
"grad_norm": 0.4979986798823173,
"learning_rate": 3.871221366216036e-05,
"loss": 0.884,
"step": 450
},
{
"epoch": 0.12413128741484897,
"grad_norm": 0.5098373282634219,
"learning_rate": 3.870603807128477e-05,
"loss": 0.8824,
"step": 451
},
{
"epoch": 0.12440652308539187,
"grad_norm": 0.4884571444922137,
"learning_rate": 3.869984820323804e-05,
"loss": 0.866,
"step": 452
},
{
"epoch": 0.12468175875593476,
"grad_norm": 0.48319386031254,
"learning_rate": 3.86936440627445e-05,
"loss": 0.8622,
"step": 453
},
{
"epoch": 0.12495699442647767,
"grad_norm": 0.4875346343493672,
"learning_rate": 3.868742565453941e-05,
"loss": 0.9008,
"step": 454
},
{
"epoch": 0.12523223009702059,
"grad_norm": 0.44164375087009367,
"learning_rate": 3.868119298336889e-05,
"loss": 0.865,
"step": 455
},
{
"epoch": 0.12550746576756347,
"grad_norm": 0.5185602016440723,
"learning_rate": 3.867494605398996e-05,
"loss": 0.8768,
"step": 456
},
{
"epoch": 0.12578270143810638,
"grad_norm": 0.537626827141255,
"learning_rate": 3.8668684871170514e-05,
"loss": 0.8512,
"step": 457
},
{
"epoch": 0.1260579371086493,
"grad_norm": 0.4693844997367465,
"learning_rate": 3.866240943968932e-05,
"loss": 0.8425,
"step": 458
},
{
"epoch": 0.12633317277919218,
"grad_norm": 0.531515674970887,
"learning_rate": 3.865611976433605e-05,
"loss": 0.8819,
"step": 459
},
{
"epoch": 0.1266084084497351,
"grad_norm": 0.5745310526190663,
"learning_rate": 3.864981584991122e-05,
"loss": 0.8788,
"step": 460
},
{
"epoch": 0.126883644120278,
"grad_norm": 0.5770227255769966,
"learning_rate": 3.864349770122621e-05,
"loss": 0.8797,
"step": 461
},
{
"epoch": 0.12715887979082088,
"grad_norm": 0.5103586863720179,
"learning_rate": 3.863716532310329e-05,
"loss": 0.9062,
"step": 462
},
{
"epoch": 0.1274341154613638,
"grad_norm": 0.4781784565411039,
"learning_rate": 3.863081872037557e-05,
"loss": 0.8687,
"step": 463
},
{
"epoch": 0.1277093511319067,
"grad_norm": 0.4317409857087674,
"learning_rate": 3.862445789788701e-05,
"loss": 0.9079,
"step": 464
},
{
"epoch": 0.1279845868024496,
"grad_norm": 0.44915572043977114,
"learning_rate": 3.8618082860492456e-05,
"loss": 0.8738,
"step": 465
},
{
"epoch": 0.1282598224729925,
"grad_norm": 0.6805523727920348,
"learning_rate": 3.861169361305757e-05,
"loss": 0.8607,
"step": 466
},
{
"epoch": 0.1285350581435354,
"grad_norm": 0.51228196439054,
"learning_rate": 3.860529016045888e-05,
"loss": 0.8927,
"step": 467
},
{
"epoch": 0.1288102938140783,
"grad_norm": 0.672172331204869,
"learning_rate": 3.859887250758374e-05,
"loss": 0.847,
"step": 468
},
{
"epoch": 0.1290855294846212,
"grad_norm": 0.6193259834184921,
"learning_rate": 3.8592440659330354e-05,
"loss": 0.8587,
"step": 469
},
{
"epoch": 0.12936076515516412,
"grad_norm": 0.5338838370354034,
"learning_rate": 3.858599462060776e-05,
"loss": 0.8661,
"step": 470
},
{
"epoch": 0.129636000825707,
"grad_norm": 0.4871517328100876,
"learning_rate": 3.8579534396335835e-05,
"loss": 0.8719,
"step": 471
},
{
"epoch": 0.12991123649624992,
"grad_norm": 0.5398484469959097,
"learning_rate": 3.857305999144525e-05,
"loss": 0.8482,
"step": 472
},
{
"epoch": 0.13018647216679283,
"grad_norm": 1.0737124169159604,
"learning_rate": 3.856657141087753e-05,
"loss": 0.877,
"step": 473
},
{
"epoch": 0.1304617078373357,
"grad_norm": 0.5712118687368332,
"learning_rate": 3.8560068659585006e-05,
"loss": 0.9126,
"step": 474
},
{
"epoch": 0.13073694350787862,
"grad_norm": 0.5299285938372721,
"learning_rate": 3.855355174253084e-05,
"loss": 0.8648,
"step": 475
},
{
"epoch": 0.13101217917842153,
"grad_norm": 0.5832496967442821,
"learning_rate": 3.854702066468899e-05,
"loss": 0.8767,
"step": 476
},
{
"epoch": 0.13128741484896442,
"grad_norm": 0.5768216753062566,
"learning_rate": 3.8540475431044224e-05,
"loss": 0.8955,
"step": 477
},
{
"epoch": 0.13156265051950733,
"grad_norm": 0.5371945157499757,
"learning_rate": 3.8533916046592115e-05,
"loss": 0.8397,
"step": 478
},
{
"epoch": 0.13183788619005024,
"grad_norm": 0.5555649404302065,
"learning_rate": 3.852734251633905e-05,
"loss": 0.8653,
"step": 479
},
{
"epoch": 0.13211312186059312,
"grad_norm": 0.5379325982134389,
"learning_rate": 3.852075484530219e-05,
"loss": 0.8407,
"step": 480
},
{
"epoch": 0.13238835753113604,
"grad_norm": 0.6364764043277225,
"learning_rate": 3.85141530385095e-05,
"loss": 0.8481,
"step": 481
},
{
"epoch": 0.13266359320167895,
"grad_norm": 0.6346413463658178,
"learning_rate": 3.8507537100999746e-05,
"loss": 0.8597,
"step": 482
},
{
"epoch": 0.13293882887222183,
"grad_norm": 0.5880283792103489,
"learning_rate": 3.850090703782246e-05,
"loss": 0.8712,
"step": 483
},
{
"epoch": 0.13321406454276474,
"grad_norm": 0.5177115144515265,
"learning_rate": 3.8494262854037955e-05,
"loss": 0.8448,
"step": 484
},
{
"epoch": 0.13348930021330765,
"grad_norm": 0.4980145821933916,
"learning_rate": 3.848760455471734e-05,
"loss": 0.9094,
"step": 485
},
{
"epoch": 0.13376453588385054,
"grad_norm": 0.547624989774016,
"learning_rate": 3.848093214494248e-05,
"loss": 0.8744,
"step": 486
},
{
"epoch": 0.13403977155439345,
"grad_norm": 0.618583275402833,
"learning_rate": 3.847424562980602e-05,
"loss": 0.8576,
"step": 487
},
{
"epoch": 0.13431500722493636,
"grad_norm": 0.4766724787140889,
"learning_rate": 3.8467545014411365e-05,
"loss": 0.8627,
"step": 488
},
{
"epoch": 0.13459024289547925,
"grad_norm": 0.49314930291044035,
"learning_rate": 3.846083030387268e-05,
"loss": 0.8773,
"step": 489
},
{
"epoch": 0.13486547856602216,
"grad_norm": 0.5726859587483527,
"learning_rate": 3.8454101503314896e-05,
"loss": 0.8688,
"step": 490
},
{
"epoch": 0.13514071423656507,
"grad_norm": 0.5209229799023615,
"learning_rate": 3.84473586178737e-05,
"loss": 0.8446,
"step": 491
},
{
"epoch": 0.13541594990710795,
"grad_norm": 0.5380188644630678,
"learning_rate": 3.8440601652695504e-05,
"loss": 0.8615,
"step": 492
},
{
"epoch": 0.13569118557765086,
"grad_norm": 0.5368345332836999,
"learning_rate": 3.84338306129375e-05,
"loss": 0.872,
"step": 493
},
{
"epoch": 0.13596642124819378,
"grad_norm": 0.4941502507243993,
"learning_rate": 3.842704550376761e-05,
"loss": 0.8813,
"step": 494
},
{
"epoch": 0.13624165691873666,
"grad_norm": 0.510201925526349,
"learning_rate": 3.842024633036448e-05,
"loss": 0.8516,
"step": 495
},
{
"epoch": 0.13651689258927957,
"grad_norm": 0.5859285227055108,
"learning_rate": 3.841343309791751e-05,
"loss": 0.8465,
"step": 496
},
{
"epoch": 0.13679212825982248,
"grad_norm": 0.5051374546024748,
"learning_rate": 3.8406605811626814e-05,
"loss": 0.8764,
"step": 497
},
{
"epoch": 0.13706736393036537,
"grad_norm": 0.4777860450594652,
"learning_rate": 3.8399764476703244e-05,
"loss": 0.8865,
"step": 498
},
{
"epoch": 0.13734259960090828,
"grad_norm": 0.4648482392371114,
"learning_rate": 3.8392909098368377e-05,
"loss": 0.8696,
"step": 499
},
{
"epoch": 0.1376178352714512,
"grad_norm": 0.39194934141636306,
"learning_rate": 3.8386039681854504e-05,
"loss": 0.8735,
"step": 500
},
{
"epoch": 0.13789307094199407,
"grad_norm": 0.48648359915890216,
"learning_rate": 3.837915623240462e-05,
"loss": 0.8688,
"step": 501
},
{
"epoch": 0.13816830661253698,
"grad_norm": 0.5328871694309647,
"learning_rate": 3.837225875527244e-05,
"loss": 0.8696,
"step": 502
},
{
"epoch": 0.1384435422830799,
"grad_norm": 0.46675333860858087,
"learning_rate": 3.8365347255722396e-05,
"loss": 0.8423,
"step": 503
},
{
"epoch": 0.13871877795362278,
"grad_norm": 0.473226686282098,
"learning_rate": 3.835842173902959e-05,
"loss": 0.8478,
"step": 504
},
{
"epoch": 0.1389940136241657,
"grad_norm": 0.43029063072170615,
"learning_rate": 3.835148221047988e-05,
"loss": 0.8599,
"step": 505
},
{
"epoch": 0.1392692492947086,
"grad_norm": 0.3952610348334728,
"learning_rate": 3.834452867536974e-05,
"loss": 0.8493,
"step": 506
},
{
"epoch": 0.1395444849652515,
"grad_norm": 0.46255450767863043,
"learning_rate": 3.8337561139006405e-05,
"loss": 0.8435,
"step": 507
},
{
"epoch": 0.1398197206357944,
"grad_norm": 0.5907346043765216,
"learning_rate": 3.833057960670776e-05,
"loss": 0.867,
"step": 508
},
{
"epoch": 0.1400949563063373,
"grad_norm": 0.41416936627426143,
"learning_rate": 3.832358408380239e-05,
"loss": 0.8642,
"step": 509
},
{
"epoch": 0.1403701919768802,
"grad_norm": 0.3578028940778584,
"learning_rate": 3.8316574575629524e-05,
"loss": 0.8859,
"step": 510
},
{
"epoch": 0.1406454276474231,
"grad_norm": 0.4532996072863284,
"learning_rate": 3.8309551087539116e-05,
"loss": 0.8808,
"step": 511
},
{
"epoch": 0.14092066331796602,
"grad_norm": 0.40684550825360394,
"learning_rate": 3.8302513624891743e-05,
"loss": 0.8676,
"step": 512
},
{
"epoch": 0.1411958989885089,
"grad_norm": 0.40300225878763124,
"learning_rate": 3.8295462193058686e-05,
"loss": 0.8376,
"step": 513
},
{
"epoch": 0.1414711346590518,
"grad_norm": 0.39067970235574856,
"learning_rate": 3.8288396797421855e-05,
"loss": 0.8937,
"step": 514
},
{
"epoch": 0.14174637032959472,
"grad_norm": 0.5695505575761707,
"learning_rate": 3.828131744337384e-05,
"loss": 0.8645,
"step": 515
},
{
"epoch": 0.1420216060001376,
"grad_norm": 0.4526706615787753,
"learning_rate": 3.8274224136317884e-05,
"loss": 0.8576,
"step": 516
},
{
"epoch": 0.14229684167068052,
"grad_norm": 0.4287103017952185,
"learning_rate": 3.8267116881667855e-05,
"loss": 0.8805,
"step": 517
},
{
"epoch": 0.14257207734122343,
"grad_norm": 0.48674747019729414,
"learning_rate": 3.8259995684848306e-05,
"loss": 0.8482,
"step": 518
},
{
"epoch": 0.1428473130117663,
"grad_norm": 0.49325687985959527,
"learning_rate": 3.82528605512944e-05,
"loss": 0.8804,
"step": 519
},
{
"epoch": 0.14312254868230923,
"grad_norm": 0.49645020641649507,
"learning_rate": 3.824571148645194e-05,
"loss": 0.8835,
"step": 520
},
{
"epoch": 0.14339778435285214,
"grad_norm": 0.4717842701616438,
"learning_rate": 3.823854849577738e-05,
"loss": 0.8808,
"step": 521
},
{
"epoch": 0.14367302002339502,
"grad_norm": 0.5017642201403522,
"learning_rate": 3.823137158473778e-05,
"loss": 0.8738,
"step": 522
},
{
"epoch": 0.14394825569393793,
"grad_norm": 0.46900408870612853,
"learning_rate": 3.8224180758810845e-05,
"loss": 0.8466,
"step": 523
},
{
"epoch": 0.14422349136448084,
"grad_norm": 0.4649485101983954,
"learning_rate": 3.821697602348489e-05,
"loss": 0.8637,
"step": 524
},
{
"epoch": 0.14449872703502373,
"grad_norm": 0.7670893359269847,
"learning_rate": 3.820975738425884e-05,
"loss": 0.8791,
"step": 525
},
{
"epoch": 0.14477396270556664,
"grad_norm": 0.4634061219146758,
"learning_rate": 3.8202524846642246e-05,
"loss": 0.8598,
"step": 526
},
{
"epoch": 0.14504919837610955,
"grad_norm": 0.4562083751894467,
"learning_rate": 3.8195278416155266e-05,
"loss": 0.8457,
"step": 527
},
{
"epoch": 0.14532443404665243,
"grad_norm": 0.43986677245180233,
"learning_rate": 3.8188018098328636e-05,
"loss": 0.8606,
"step": 528
},
{
"epoch": 0.14559966971719535,
"grad_norm": 0.45943787267139574,
"learning_rate": 3.8180743898703735e-05,
"loss": 0.844,
"step": 529
},
{
"epoch": 0.14587490538773826,
"grad_norm": 0.4868762429617816,
"learning_rate": 3.81734558228325e-05,
"loss": 0.8649,
"step": 530
},
{
"epoch": 0.14615014105828114,
"grad_norm": 0.5229564492816823,
"learning_rate": 3.816615387627748e-05,
"loss": 0.8808,
"step": 531
},
{
"epoch": 0.14642537672882405,
"grad_norm": 0.47491592303452623,
"learning_rate": 3.8158838064611784e-05,
"loss": 0.8836,
"step": 532
},
{
"epoch": 0.14670061239936696,
"grad_norm": 0.4892872843931489,
"learning_rate": 3.815150839341915e-05,
"loss": 0.8501,
"step": 533
},
{
"epoch": 0.14697584806990985,
"grad_norm": 0.5067111165431507,
"learning_rate": 3.814416486829384e-05,
"loss": 0.8787,
"step": 534
},
{
"epoch": 0.14725108374045276,
"grad_norm": 0.5847903189477168,
"learning_rate": 3.813680749484073e-05,
"loss": 0.8862,
"step": 535
},
{
"epoch": 0.14752631941099567,
"grad_norm": 0.5307891953867472,
"learning_rate": 3.812943627867525e-05,
"loss": 0.8447,
"step": 536
},
{
"epoch": 0.14780155508153855,
"grad_norm": 0.43437401733213127,
"learning_rate": 3.81220512254234e-05,
"loss": 0.8755,
"step": 537
},
{
"epoch": 0.14807679075208147,
"grad_norm": 0.4498193054640528,
"learning_rate": 3.811465234072173e-05,
"loss": 0.863,
"step": 538
},
{
"epoch": 0.14835202642262438,
"grad_norm": 0.510868396643692,
"learning_rate": 3.810723963021737e-05,
"loss": 0.8801,
"step": 539
},
{
"epoch": 0.14862726209316726,
"grad_norm": 0.4608288472149214,
"learning_rate": 3.8099813099567964e-05,
"loss": 0.8661,
"step": 540
},
{
"epoch": 0.14890249776371017,
"grad_norm": 0.4717856489674427,
"learning_rate": 3.809237275444174e-05,
"loss": 0.8366,
"step": 541
},
{
"epoch": 0.14917773343425308,
"grad_norm": 0.5434506620844847,
"learning_rate": 3.808491860051747e-05,
"loss": 0.8596,
"step": 542
},
{
"epoch": 0.14945296910479597,
"grad_norm": 0.5381320292887712,
"learning_rate": 3.8077450643484424e-05,
"loss": 0.8555,
"step": 543
},
{
"epoch": 0.14972820477533888,
"grad_norm": 0.5143549118332782,
"learning_rate": 3.806996888904245e-05,
"loss": 0.8644,
"step": 544
},
{
"epoch": 0.1500034404458818,
"grad_norm": 0.7267877355499845,
"learning_rate": 3.8062473342901925e-05,
"loss": 0.8616,
"step": 545
},
{
"epoch": 0.15027867611642468,
"grad_norm": 0.41183292653113557,
"learning_rate": 3.805496401078372e-05,
"loss": 0.8667,
"step": 546
},
{
"epoch": 0.1505539117869676,
"grad_norm": 0.5024814370294475,
"learning_rate": 3.804744089841926e-05,
"loss": 0.8914,
"step": 547
},
{
"epoch": 0.1508291474575105,
"grad_norm": 0.5942440551592292,
"learning_rate": 3.803990401155046e-05,
"loss": 0.8633,
"step": 548
},
{
"epoch": 0.15110438312805338,
"grad_norm": 0.5566367549335995,
"learning_rate": 3.8032353355929773e-05,
"loss": 0.8756,
"step": 549
},
{
"epoch": 0.1513796187985963,
"grad_norm": 0.49409579251287367,
"learning_rate": 3.802478893732016e-05,
"loss": 0.8683,
"step": 550
},
{
"epoch": 0.1516548544691392,
"grad_norm": 0.3725949960956119,
"learning_rate": 3.801721076149506e-05,
"loss": 0.8706,
"step": 551
},
{
"epoch": 0.1519300901396821,
"grad_norm": 0.4626308585605636,
"learning_rate": 3.8009618834238445e-05,
"loss": 0.8505,
"step": 552
},
{
"epoch": 0.152205325810225,
"grad_norm": 0.5238016550980202,
"learning_rate": 3.8002013161344755e-05,
"loss": 0.864,
"step": 553
},
{
"epoch": 0.1524805614807679,
"grad_norm": 0.5662212591295838,
"learning_rate": 3.7994393748618945e-05,
"loss": 0.8404,
"step": 554
},
{
"epoch": 0.15275579715131082,
"grad_norm": 0.5457119329512516,
"learning_rate": 3.798676060187644e-05,
"loss": 0.8617,
"step": 555
},
{
"epoch": 0.1530310328218537,
"grad_norm": 0.49122969359545865,
"learning_rate": 3.797911372694314e-05,
"loss": 0.8658,
"step": 556
},
{
"epoch": 0.15330626849239662,
"grad_norm": 0.4129007932799576,
"learning_rate": 3.797145312965546e-05,
"loss": 0.8635,
"step": 557
},
{
"epoch": 0.15358150416293953,
"grad_norm": 0.45508532583757116,
"learning_rate": 3.796377881586025e-05,
"loss": 0.8575,
"step": 558
},
{
"epoch": 0.15385673983348241,
"grad_norm": 0.5207853672493407,
"learning_rate": 3.795609079141484e-05,
"loss": 0.8626,
"step": 559
},
{
"epoch": 0.15413197550402533,
"grad_norm": 0.581890744677401,
"learning_rate": 3.7948389062187025e-05,
"loss": 0.8693,
"step": 560
},
{
"epoch": 0.15440721117456824,
"grad_norm": 0.5883911873807134,
"learning_rate": 3.794067363405508e-05,
"loss": 0.846,
"step": 561
},
{
"epoch": 0.15468244684511112,
"grad_norm": 0.5207574905100074,
"learning_rate": 3.79329445129077e-05,
"loss": 0.8247,
"step": 562
},
{
"epoch": 0.15495768251565403,
"grad_norm": 0.4523953811760909,
"learning_rate": 3.792520170464406e-05,
"loss": 0.8442,
"step": 563
},
{
"epoch": 0.15523291818619694,
"grad_norm": 0.49497191299981996,
"learning_rate": 3.7917445215173765e-05,
"loss": 0.8572,
"step": 564
},
{
"epoch": 0.15550815385673983,
"grad_norm": 0.5961479001151687,
"learning_rate": 3.7909675050416864e-05,
"loss": 0.8504,
"step": 565
},
{
"epoch": 0.15578338952728274,
"grad_norm": 0.5553065203732548,
"learning_rate": 3.7901891216303855e-05,
"loss": 0.8497,
"step": 566
},
{
"epoch": 0.15605862519782565,
"grad_norm": 0.4887797088139101,
"learning_rate": 3.789409371877566e-05,
"loss": 0.8654,
"step": 567
},
{
"epoch": 0.15633386086836853,
"grad_norm": 0.4507456310067676,
"learning_rate": 3.7886282563783626e-05,
"loss": 0.8922,
"step": 568
},
{
"epoch": 0.15660909653891145,
"grad_norm": 0.534811993235575,
"learning_rate": 3.787845775728953e-05,
"loss": 0.8766,
"step": 569
},
{
"epoch": 0.15688433220945436,
"grad_norm": 0.5537662622733155,
"learning_rate": 3.7870619305265566e-05,
"loss": 0.8625,
"step": 570
},
{
"epoch": 0.15715956787999724,
"grad_norm": 0.4727809559456603,
"learning_rate": 3.7862767213694347e-05,
"loss": 0.8461,
"step": 571
},
{
"epoch": 0.15743480355054015,
"grad_norm": 0.4418209320709128,
"learning_rate": 3.785490148856889e-05,
"loss": 0.8553,
"step": 572
},
{
"epoch": 0.15771003922108306,
"grad_norm": 0.47389940708387823,
"learning_rate": 3.784702213589262e-05,
"loss": 0.854,
"step": 573
},
{
"epoch": 0.15798527489162595,
"grad_norm": 0.518179401198152,
"learning_rate": 3.7839129161679366e-05,
"loss": 0.8552,
"step": 574
},
{
"epoch": 0.15826051056216886,
"grad_norm": 0.4639622479114093,
"learning_rate": 3.7831222571953344e-05,
"loss": 0.8715,
"step": 575
},
{
"epoch": 0.15853574623271177,
"grad_norm": 0.4323743239097041,
"learning_rate": 3.782330237274918e-05,
"loss": 0.8451,
"step": 576
},
{
"epoch": 0.15881098190325466,
"grad_norm": 0.41998341728671285,
"learning_rate": 3.7815368570111866e-05,
"loss": 0.8561,
"step": 577
},
{
"epoch": 0.15908621757379757,
"grad_norm": 0.37329638579985064,
"learning_rate": 3.780742117009679e-05,
"loss": 0.8597,
"step": 578
},
{
"epoch": 0.15936145324434048,
"grad_norm": 0.3992371734908037,
"learning_rate": 3.779946017876972e-05,
"loss": 0.8547,
"step": 579
},
{
"epoch": 0.15963668891488336,
"grad_norm": 0.46504937163189863,
"learning_rate": 3.7791485602206786e-05,
"loss": 0.8815,
"step": 580
},
{
"epoch": 0.15991192458542627,
"grad_norm": 0.44217107149660534,
"learning_rate": 3.778349744649449e-05,
"loss": 0.8611,
"step": 581
},
{
"epoch": 0.16018716025596919,
"grad_norm": 0.353586171048602,
"learning_rate": 3.777549571772971e-05,
"loss": 0.8401,
"step": 582
},
{
"epoch": 0.16046239592651207,
"grad_norm": 0.44457618940068655,
"learning_rate": 3.776748042201968e-05,
"loss": 0.8659,
"step": 583
},
{
"epoch": 0.16073763159705498,
"grad_norm": 0.45988463481003333,
"learning_rate": 3.775945156548196e-05,
"loss": 0.8532,
"step": 584
},
{
"epoch": 0.1610128672675979,
"grad_norm": 0.4618503545272013,
"learning_rate": 3.77514091542445e-05,
"loss": 0.8598,
"step": 585
},
{
"epoch": 0.16128810293814078,
"grad_norm": 0.6234601884083827,
"learning_rate": 3.774335319444558e-05,
"loss": 0.829,
"step": 586
},
{
"epoch": 0.1615633386086837,
"grad_norm": 0.47049098450575466,
"learning_rate": 3.773528369223382e-05,
"loss": 0.9023,
"step": 587
},
{
"epoch": 0.1618385742792266,
"grad_norm": 0.44398244533292125,
"learning_rate": 3.772720065376817e-05,
"loss": 0.8582,
"step": 588
},
{
"epoch": 0.16211380994976948,
"grad_norm": 0.629968731666768,
"learning_rate": 3.771910408521792e-05,
"loss": 0.8834,
"step": 589
},
{
"epoch": 0.1623890456203124,
"grad_norm": 0.539380515557297,
"learning_rate": 3.771099399276268e-05,
"loss": 0.8411,
"step": 590
},
{
"epoch": 0.1626642812908553,
"grad_norm": 0.4861709192806457,
"learning_rate": 3.7702870382592394e-05,
"loss": 0.8781,
"step": 591
},
{
"epoch": 0.1629395169613982,
"grad_norm": 0.42621808860582056,
"learning_rate": 3.769473326090731e-05,
"loss": 0.8651,
"step": 592
},
{
"epoch": 0.1632147526319411,
"grad_norm": 0.4380200683345255,
"learning_rate": 3.768658263391799e-05,
"loss": 0.8723,
"step": 593
},
{
"epoch": 0.163489988302484,
"grad_norm": 0.451153728165318,
"learning_rate": 3.7678418507845316e-05,
"loss": 0.8783,
"step": 594
},
{
"epoch": 0.1637652239730269,
"grad_norm": 0.422539721328803,
"learning_rate": 3.767024088892046e-05,
"loss": 0.8623,
"step": 595
},
{
"epoch": 0.1640404596435698,
"grad_norm": 0.449545734716924,
"learning_rate": 3.76620497833849e-05,
"loss": 0.8816,
"step": 596
},
{
"epoch": 0.16431569531411272,
"grad_norm": 0.4656625878247244,
"learning_rate": 3.76538451974904e-05,
"loss": 0.8622,
"step": 597
},
{
"epoch": 0.1645909309846556,
"grad_norm": 0.4118909765823919,
"learning_rate": 3.764562713749902e-05,
"loss": 0.855,
"step": 598
},
{
"epoch": 0.16486616665519851,
"grad_norm": 0.43039838699812394,
"learning_rate": 3.7637395609683093e-05,
"loss": 0.8899,
"step": 599
},
{
"epoch": 0.16514140232574143,
"grad_norm": 0.418011137759663,
"learning_rate": 3.7629150620325255e-05,
"loss": 0.8529,
"step": 600
},
{
"epoch": 0.1654166379962843,
"grad_norm": 0.46295060796973236,
"learning_rate": 3.762089217571839e-05,
"loss": 0.8591,
"step": 601
},
{
"epoch": 0.16569187366682722,
"grad_norm": 0.4136084190087584,
"learning_rate": 3.761262028216566e-05,
"loss": 0.8364,
"step": 602
},
{
"epoch": 0.16596710933737013,
"grad_norm": 0.41656759129172366,
"learning_rate": 3.76043349459805e-05,
"loss": 0.8951,
"step": 603
},
{
"epoch": 0.16624234500791302,
"grad_norm": 0.440927658691817,
"learning_rate": 3.75960361734866e-05,
"loss": 0.8554,
"step": 604
},
{
"epoch": 0.16651758067845593,
"grad_norm": 0.43515328904506384,
"learning_rate": 3.75877239710179e-05,
"loss": 0.8583,
"step": 605
},
{
"epoch": 0.16679281634899884,
"grad_norm": 0.3819785628991007,
"learning_rate": 3.757939834491858e-05,
"loss": 0.8571,
"step": 606
},
{
"epoch": 0.16706805201954172,
"grad_norm": 0.3902177082821287,
"learning_rate": 3.7571059301543104e-05,
"loss": 0.8468,
"step": 607
},
{
"epoch": 0.16734328769008464,
"grad_norm": 0.49812295204183166,
"learning_rate": 3.756270684725614e-05,
"loss": 0.8362,
"step": 608
},
{
"epoch": 0.16761852336062755,
"grad_norm": 0.3995156875131695,
"learning_rate": 3.7554340988432606e-05,
"loss": 0.8662,
"step": 609
},
{
"epoch": 0.16789375903117043,
"grad_norm": 0.42250602723841496,
"learning_rate": 3.754596173145765e-05,
"loss": 0.8326,
"step": 610
},
{
"epoch": 0.16816899470171334,
"grad_norm": 0.42173339037171914,
"learning_rate": 3.7537569082726645e-05,
"loss": 0.8757,
"step": 611
},
{
"epoch": 0.16844423037225625,
"grad_norm": 0.36688653992050907,
"learning_rate": 3.7529163048645175e-05,
"loss": 0.8264,
"step": 612
},
{
"epoch": 0.16871946604279914,
"grad_norm": 0.43238645464559056,
"learning_rate": 3.752074363562907e-05,
"loss": 0.8422,
"step": 613
},
{
"epoch": 0.16899470171334205,
"grad_norm": 0.38387386608170954,
"learning_rate": 3.751231085010433e-05,
"loss": 0.8252,
"step": 614
},
{
"epoch": 0.16926993738388496,
"grad_norm": 0.42617280811410624,
"learning_rate": 3.750386469850719e-05,
"loss": 0.8181,
"step": 615
},
{
"epoch": 0.16954517305442784,
"grad_norm": 0.45116202185755655,
"learning_rate": 3.749540518728409e-05,
"loss": 0.8636,
"step": 616
},
{
"epoch": 0.16982040872497076,
"grad_norm": 0.41685101305431854,
"learning_rate": 3.7486932322891646e-05,
"loss": 0.8295,
"step": 617
},
{
"epoch": 0.17009564439551367,
"grad_norm": 0.4028147769083077,
"learning_rate": 3.7478446111796676e-05,
"loss": 0.829,
"step": 618
},
{
"epoch": 0.17037088006605655,
"grad_norm": 0.39845488253498856,
"learning_rate": 3.746994656047618e-05,
"loss": 0.8497,
"step": 619
},
{
"epoch": 0.17064611573659946,
"grad_norm": 0.45034021511122285,
"learning_rate": 3.746143367541736e-05,
"loss": 0.8846,
"step": 620
},
{
"epoch": 0.17092135140714237,
"grad_norm": 0.4333175526376847,
"learning_rate": 3.745290746311756e-05,
"loss": 0.8352,
"step": 621
},
{
"epoch": 0.17119658707768526,
"grad_norm": 0.43033481461769457,
"learning_rate": 3.7444367930084324e-05,
"loss": 0.8601,
"step": 622
},
{
"epoch": 0.17147182274822817,
"grad_norm": 0.49429130468760024,
"learning_rate": 3.7435815082835356e-05,
"loss": 0.8546,
"step": 623
},
{
"epoch": 0.17174705841877108,
"grad_norm": 0.44088834343857985,
"learning_rate": 3.742724892789851e-05,
"loss": 0.8461,
"step": 624
},
{
"epoch": 0.17202229408931397,
"grad_norm": 0.41294866044719825,
"learning_rate": 3.7418669471811815e-05,
"loss": 0.8269,
"step": 625
},
{
"epoch": 0.17229752975985688,
"grad_norm": 1.0199283661633092,
"learning_rate": 3.741007672112345e-05,
"loss": 0.8696,
"step": 626
},
{
"epoch": 0.1725727654303998,
"grad_norm": 0.5058421556080241,
"learning_rate": 3.740147068239171e-05,
"loss": 0.8341,
"step": 627
},
{
"epoch": 0.17284800110094267,
"grad_norm": 0.3778503710292955,
"learning_rate": 3.739285136218508e-05,
"loss": 0.8434,
"step": 628
},
{
"epoch": 0.17312323677148558,
"grad_norm": 0.4399140389760773,
"learning_rate": 3.738421876708215e-05,
"loss": 0.83,
"step": 629
},
{
"epoch": 0.1733984724420285,
"grad_norm": 0.5016172625218125,
"learning_rate": 3.7375572903671654e-05,
"loss": 0.8696,
"step": 630
},
{
"epoch": 0.17367370811257138,
"grad_norm": 0.456216673669721,
"learning_rate": 3.736691377855243e-05,
"loss": 0.8685,
"step": 631
},
{
"epoch": 0.1739489437831143,
"grad_norm": 0.4548718012725048,
"learning_rate": 3.735824139833349e-05,
"loss": 0.8373,
"step": 632
},
{
"epoch": 0.1742241794536572,
"grad_norm": 0.49875651019797274,
"learning_rate": 3.7349555769633905e-05,
"loss": 0.8363,
"step": 633
},
{
"epoch": 0.17449941512420009,
"grad_norm": 0.44963026053063526,
"learning_rate": 3.7340856899082885e-05,
"loss": 0.8564,
"step": 634
},
{
"epoch": 0.174774650794743,
"grad_norm": 0.4499071962134856,
"learning_rate": 3.733214479331976e-05,
"loss": 0.8736,
"step": 635
},
{
"epoch": 0.1750498864652859,
"grad_norm": 0.5002395296179322,
"learning_rate": 3.732341945899392e-05,
"loss": 0.8565,
"step": 636
},
{
"epoch": 0.1753251221358288,
"grad_norm": 0.48806349949471,
"learning_rate": 3.73146809027649e-05,
"loss": 0.8958,
"step": 637
},
{
"epoch": 0.1756003578063717,
"grad_norm": 1.3382369778664218,
"learning_rate": 3.7305929131302295e-05,
"loss": 0.862,
"step": 638
},
{
"epoch": 0.17587559347691462,
"grad_norm": 0.5389643821953385,
"learning_rate": 3.7297164151285784e-05,
"loss": 0.867,
"step": 639
},
{
"epoch": 0.1761508291474575,
"grad_norm": 2.9563400865736167,
"learning_rate": 3.7288385969405165e-05,
"loss": 0.8561,
"step": 640
},
{
"epoch": 0.1764260648180004,
"grad_norm": 0.8251866766193862,
"learning_rate": 3.7279594592360265e-05,
"loss": 0.87,
"step": 641
},
{
"epoch": 0.17670130048854332,
"grad_norm": 1.2182716955143615,
"learning_rate": 3.7270790026861016e-05,
"loss": 0.8344,
"step": 642
},
{
"epoch": 0.1769765361590862,
"grad_norm": 0.7097183478086542,
"learning_rate": 3.726197227962738e-05,
"loss": 0.8327,
"step": 643
},
{
"epoch": 0.17725177182962912,
"grad_norm": 0.8429420008347591,
"learning_rate": 3.725314135738943e-05,
"loss": 0.8435,
"step": 644
},
{
"epoch": 0.17752700750017203,
"grad_norm": 0.8648986343209424,
"learning_rate": 3.724429726688725e-05,
"loss": 0.8657,
"step": 645
},
{
"epoch": 0.1778022431707149,
"grad_norm": 0.754418945875783,
"learning_rate": 3.7235440014870994e-05,
"loss": 0.8107,
"step": 646
},
{
"epoch": 0.17807747884125782,
"grad_norm": 0.7183465853403834,
"learning_rate": 3.7226569608100866e-05,
"loss": 0.8672,
"step": 647
},
{
"epoch": 0.17835271451180074,
"grad_norm": 0.5483584662617327,
"learning_rate": 3.72176860533471e-05,
"loss": 0.839,
"step": 648
},
{
"epoch": 0.17862795018234362,
"grad_norm": 0.6322640426864581,
"learning_rate": 3.720878935738996e-05,
"loss": 0.8734,
"step": 649
},
{
"epoch": 0.17890318585288653,
"grad_norm": 0.7030573116202438,
"learning_rate": 3.719987952701976e-05,
"loss": 0.8489,
"step": 650
},
{
"epoch": 0.17917842152342944,
"grad_norm": 0.6555737552414294,
"learning_rate": 3.7190956569036825e-05,
"loss": 0.8425,
"step": 651
},
{
"epoch": 0.17945365719397233,
"grad_norm": 0.5482130233720278,
"learning_rate": 3.718202049025149e-05,
"loss": 0.8332,
"step": 652
},
{
"epoch": 0.17972889286451524,
"grad_norm": 0.6035641346252707,
"learning_rate": 3.717307129748413e-05,
"loss": 0.8634,
"step": 653
},
{
"epoch": 0.18000412853505815,
"grad_norm": 0.4963480276691412,
"learning_rate": 3.71641089975651e-05,
"loss": 0.8491,
"step": 654
},
{
"epoch": 0.18027936420560103,
"grad_norm": 0.5258399669126971,
"learning_rate": 3.715513359733479e-05,
"loss": 0.8556,
"step": 655
},
{
"epoch": 0.18055459987614395,
"grad_norm": 0.4454387243275687,
"learning_rate": 3.7146145103643564e-05,
"loss": 0.8608,
"step": 656
},
{
"epoch": 0.18082983554668686,
"grad_norm": 0.6320853066798535,
"learning_rate": 3.7137143523351787e-05,
"loss": 0.8599,
"step": 657
},
{
"epoch": 0.18110507121722974,
"grad_norm": 0.46435375045636657,
"learning_rate": 3.712812886332982e-05,
"loss": 0.8246,
"step": 658
},
{
"epoch": 0.18138030688777265,
"grad_norm": 0.4475002613859921,
"learning_rate": 3.7119101130457986e-05,
"loss": 0.8496,
"step": 659
},
{
"epoch": 0.18165554255831556,
"grad_norm": 0.4506156959521839,
"learning_rate": 3.7110060331626605e-05,
"loss": 0.8511,
"step": 660
},
{
"epoch": 0.18193077822885845,
"grad_norm": 0.4521781865366933,
"learning_rate": 3.710100647373597e-05,
"loss": 0.8605,
"step": 661
},
{
"epoch": 0.18220601389940136,
"grad_norm": 0.4469760583626412,
"learning_rate": 3.7091939563696343e-05,
"loss": 0.8713,
"step": 662
},
{
"epoch": 0.18248124956994427,
"grad_norm": 0.4490346194814551,
"learning_rate": 3.708285960842792e-05,
"loss": 0.8406,
"step": 663
},
{
"epoch": 0.18275648524048715,
"grad_norm": 0.45557395803295825,
"learning_rate": 3.707376661486088e-05,
"loss": 0.8568,
"step": 664
},
{
"epoch": 0.18303172091103007,
"grad_norm": 0.5221170090389651,
"learning_rate": 3.7064660589935356e-05,
"loss": 0.8584,
"step": 665
},
{
"epoch": 0.18330695658157298,
"grad_norm": 0.3958832022301931,
"learning_rate": 3.7055541540601414e-05,
"loss": 0.8346,
"step": 666
},
{
"epoch": 0.18358219225211586,
"grad_norm": 0.4171414706934924,
"learning_rate": 3.704640947381905e-05,
"loss": 0.8545,
"step": 667
},
{
"epoch": 0.18385742792265877,
"grad_norm": 0.5460493752890742,
"learning_rate": 3.7037264396558234e-05,
"loss": 0.8679,
"step": 668
},
{
"epoch": 0.18413266359320168,
"grad_norm": 0.4301601757585685,
"learning_rate": 3.7028106315798835e-05,
"loss": 0.851,
"step": 669
},
{
"epoch": 0.18440789926374457,
"grad_norm": 0.6001559118913008,
"learning_rate": 3.7018935238530646e-05,
"loss": 0.8401,
"step": 670
},
{
"epoch": 0.18468313493428748,
"grad_norm": 0.4367149397988626,
"learning_rate": 3.700975117175339e-05,
"loss": 0.8432,
"step": 671
},
{
"epoch": 0.1849583706048304,
"grad_norm": 0.39430011051376235,
"learning_rate": 3.700055412247671e-05,
"loss": 0.8551,
"step": 672
},
{
"epoch": 0.18523360627537327,
"grad_norm": 0.3994623447630073,
"learning_rate": 3.699134409772014e-05,
"loss": 0.8403,
"step": 673
},
{
"epoch": 0.1855088419459162,
"grad_norm": 0.4239377062368287,
"learning_rate": 3.698212110451313e-05,
"loss": 0.8532,
"step": 674
},
{
"epoch": 0.1857840776164591,
"grad_norm": 0.42984171548912564,
"learning_rate": 3.697288514989502e-05,
"loss": 0.8563,
"step": 675
},
{
"epoch": 0.18605931328700198,
"grad_norm": 0.4182100781010544,
"learning_rate": 3.696363624091506e-05,
"loss": 0.8265,
"step": 676
},
{
"epoch": 0.1863345489575449,
"grad_norm": 0.42805379999521154,
"learning_rate": 3.6954374384632364e-05,
"loss": 0.8493,
"step": 677
},
{
"epoch": 0.1866097846280878,
"grad_norm": 0.4592615741308747,
"learning_rate": 3.6945099588115945e-05,
"loss": 0.8312,
"step": 678
},
{
"epoch": 0.18688502029863072,
"grad_norm": 0.41069079842461964,
"learning_rate": 3.693581185844468e-05,
"loss": 0.8698,
"step": 679
},
{
"epoch": 0.1871602559691736,
"grad_norm": 0.39478483232062667,
"learning_rate": 3.692651120270733e-05,
"loss": 0.8495,
"step": 680
},
{
"epoch": 0.1874354916397165,
"grad_norm": 0.4148331618032844,
"learning_rate": 3.691719762800251e-05,
"loss": 0.8364,
"step": 681
},
{
"epoch": 0.18771072731025942,
"grad_norm": 0.39613157370235597,
"learning_rate": 3.690787114143869e-05,
"loss": 0.8362,
"step": 682
},
{
"epoch": 0.1879859629808023,
"grad_norm": 0.38213320383152727,
"learning_rate": 3.689853175013423e-05,
"loss": 0.8596,
"step": 683
},
{
"epoch": 0.18826119865134522,
"grad_norm": 0.4216344319714442,
"learning_rate": 3.6889179461217295e-05,
"loss": 0.8066,
"step": 684
},
{
"epoch": 0.18853643432188813,
"grad_norm": 0.4519848330232041,
"learning_rate": 3.6879814281825924e-05,
"loss": 0.8343,
"step": 685
},
{
"epoch": 0.188811669992431,
"grad_norm": 0.4840066654563424,
"learning_rate": 3.687043621910798e-05,
"loss": 0.889,
"step": 686
},
{
"epoch": 0.18908690566297393,
"grad_norm": 0.4771316148312613,
"learning_rate": 3.6861045280221153e-05,
"loss": 0.8536,
"step": 687
},
{
"epoch": 0.18936214133351684,
"grad_norm": 0.4100103260938992,
"learning_rate": 3.6851641472332985e-05,
"loss": 0.8478,
"step": 688
},
{
"epoch": 0.18963737700405972,
"grad_norm": 0.39765616123941616,
"learning_rate": 3.684222480262082e-05,
"loss": 0.8423,
"step": 689
},
{
"epoch": 0.18991261267460263,
"grad_norm": 0.42154783908146215,
"learning_rate": 3.683279527827182e-05,
"loss": 0.8498,
"step": 690
},
{
"epoch": 0.19018784834514554,
"grad_norm": 0.4393589996269681,
"learning_rate": 3.682335290648297e-05,
"loss": 0.8658,
"step": 691
},
{
"epoch": 0.19046308401568843,
"grad_norm": 0.45500979119470325,
"learning_rate": 3.6813897694461045e-05,
"loss": 0.836,
"step": 692
},
{
"epoch": 0.19073831968623134,
"grad_norm": 0.47142718952846213,
"learning_rate": 3.6804429649422636e-05,
"loss": 0.8267,
"step": 693
},
{
"epoch": 0.19101355535677425,
"grad_norm": 0.4512480273163847,
"learning_rate": 3.679494877859412e-05,
"loss": 0.8418,
"step": 694
},
{
"epoch": 0.19128879102731713,
"grad_norm": 0.47736619973214345,
"learning_rate": 3.678545508921166e-05,
"loss": 0.8421,
"step": 695
},
{
"epoch": 0.19156402669786005,
"grad_norm": 0.4147369289774733,
"learning_rate": 3.67759485885212e-05,
"loss": 0.8729,
"step": 696
},
{
"epoch": 0.19183926236840296,
"grad_norm": 0.38610782159182894,
"learning_rate": 3.676642928377849e-05,
"loss": 0.8418,
"step": 697
},
{
"epoch": 0.19211449803894584,
"grad_norm": 0.5074722657389754,
"learning_rate": 3.675689718224901e-05,
"loss": 0.8565,
"step": 698
},
{
"epoch": 0.19238973370948875,
"grad_norm": 0.41315633992474105,
"learning_rate": 3.674735229120804e-05,
"loss": 0.8436,
"step": 699
},
{
"epoch": 0.19266496938003166,
"grad_norm": 0.4569749943059334,
"learning_rate": 3.6737794617940604e-05,
"loss": 0.8704,
"step": 700
},
{
"epoch": 0.19294020505057455,
"grad_norm": 0.4358495054803389,
"learning_rate": 3.672822416974149e-05,
"loss": 0.8384,
"step": 701
},
{
"epoch": 0.19321544072111746,
"grad_norm": 0.3658382889556743,
"learning_rate": 3.671864095391523e-05,
"loss": 0.8641,
"step": 702
},
{
"epoch": 0.19349067639166037,
"grad_norm": 0.4074752202677212,
"learning_rate": 3.670904497777611e-05,
"loss": 0.8373,
"step": 703
},
{
"epoch": 0.19376591206220325,
"grad_norm": 0.43237299773290594,
"learning_rate": 3.669943624864815e-05,
"loss": 0.8224,
"step": 704
},
{
"epoch": 0.19404114773274617,
"grad_norm": 0.4003017314125147,
"learning_rate": 3.6689814773865103e-05,
"loss": 0.8332,
"step": 705
},
{
"epoch": 0.19431638340328908,
"grad_norm": 0.42792498361506986,
"learning_rate": 3.6680180560770445e-05,
"loss": 0.845,
"step": 706
},
{
"epoch": 0.19459161907383196,
"grad_norm": 0.4460903177214098,
"learning_rate": 3.667053361671738e-05,
"loss": 0.8239,
"step": 707
},
{
"epoch": 0.19486685474437487,
"grad_norm": 0.4235211722879677,
"learning_rate": 3.6660873949068846e-05,
"loss": 0.841,
"step": 708
},
{
"epoch": 0.19514209041491778,
"grad_norm": 0.41710277993607175,
"learning_rate": 3.665120156519745e-05,
"loss": 0.8402,
"step": 709
},
{
"epoch": 0.19541732608546067,
"grad_norm": 0.4357796505973876,
"learning_rate": 3.6641516472485544e-05,
"loss": 0.827,
"step": 710
},
{
"epoch": 0.19569256175600358,
"grad_norm": 0.4222776705197388,
"learning_rate": 3.663181867832515e-05,
"loss": 0.8554,
"step": 711
},
{
"epoch": 0.1959677974265465,
"grad_norm": 0.42764490948061673,
"learning_rate": 3.662210819011802e-05,
"loss": 0.8951,
"step": 712
},
{
"epoch": 0.19624303309708938,
"grad_norm": 0.5324227250448954,
"learning_rate": 3.661238501527556e-05,
"loss": 0.8095,
"step": 713
},
{
"epoch": 0.1965182687676323,
"grad_norm": 0.4212322505107573,
"learning_rate": 3.660264916121888e-05,
"loss": 0.8336,
"step": 714
},
{
"epoch": 0.1967935044381752,
"grad_norm": 0.574041932056587,
"learning_rate": 3.659290063537875e-05,
"loss": 0.8405,
"step": 715
},
{
"epoch": 0.19706874010871808,
"grad_norm": 0.5027051478358007,
"learning_rate": 3.658313944519564e-05,
"loss": 0.8664,
"step": 716
},
{
"epoch": 0.197343975779261,
"grad_norm": 0.5262137971164996,
"learning_rate": 3.657336559811965e-05,
"loss": 0.8487,
"step": 717
},
{
"epoch": 0.1976192114498039,
"grad_norm": 0.4758930292255706,
"learning_rate": 3.6563579101610566e-05,
"loss": 0.8327,
"step": 718
},
{
"epoch": 0.1978944471203468,
"grad_norm": 0.5446657401501271,
"learning_rate": 3.655377996313782e-05,
"loss": 0.855,
"step": 719
},
{
"epoch": 0.1981696827908897,
"grad_norm": 0.4796186891971015,
"learning_rate": 3.654396819018048e-05,
"loss": 0.8481,
"step": 720
},
{
"epoch": 0.1984449184614326,
"grad_norm": 0.48475451769115785,
"learning_rate": 3.653414379022729e-05,
"loss": 0.8354,
"step": 721
},
{
"epoch": 0.1987201541319755,
"grad_norm": 0.43354817713783983,
"learning_rate": 3.6524306770776606e-05,
"loss": 0.8626,
"step": 722
},
{
"epoch": 0.1989953898025184,
"grad_norm": 0.40858146621787894,
"learning_rate": 3.651445713933641e-05,
"loss": 0.8201,
"step": 723
},
{
"epoch": 0.19927062547306132,
"grad_norm": 0.46739570129985547,
"learning_rate": 3.6504594903424335e-05,
"loss": 0.8402,
"step": 724
},
{
"epoch": 0.1995458611436042,
"grad_norm": 0.5960198370070712,
"learning_rate": 3.649472007056762e-05,
"loss": 0.8551,
"step": 725
},
{
"epoch": 0.19982109681414711,
"grad_norm": 0.4249329905811914,
"learning_rate": 3.648483264830311e-05,
"loss": 0.8469,
"step": 726
},
{
"epoch": 0.20009633248469003,
"grad_norm": 0.3884594476823848,
"learning_rate": 3.647493264417727e-05,
"loss": 0.846,
"step": 727
},
{
"epoch": 0.2003715681552329,
"grad_norm": 0.43771188863603633,
"learning_rate": 3.6465020065746174e-05,
"loss": 0.8554,
"step": 728
},
{
"epoch": 0.20064680382577582,
"grad_norm": 0.414388470066882,
"learning_rate": 3.645509492057548e-05,
"loss": 0.8526,
"step": 729
},
{
"epoch": 0.20092203949631873,
"grad_norm": 0.4381310106126896,
"learning_rate": 3.6445157216240434e-05,
"loss": 0.8125,
"step": 730
},
{
"epoch": 0.20119727516686162,
"grad_norm": 0.4489280588883335,
"learning_rate": 3.6435206960325884e-05,
"loss": 0.8379,
"step": 731
},
{
"epoch": 0.20147251083740453,
"grad_norm": 0.4104386042629172,
"learning_rate": 3.6425244160426257e-05,
"loss": 0.8611,
"step": 732
},
{
"epoch": 0.20174774650794744,
"grad_norm": 0.43649844835285395,
"learning_rate": 3.641526882414553e-05,
"loss": 0.8358,
"step": 733
},
{
"epoch": 0.20202298217849032,
"grad_norm": 0.4785645068198297,
"learning_rate": 3.640528095909728e-05,
"loss": 0.8167,
"step": 734
},
{
"epoch": 0.20229821784903324,
"grad_norm": 0.4801829819148601,
"learning_rate": 3.6395280572904624e-05,
"loss": 0.842,
"step": 735
},
{
"epoch": 0.20257345351957615,
"grad_norm": 0.36946401326823053,
"learning_rate": 3.6385267673200247e-05,
"loss": 0.8602,
"step": 736
},
{
"epoch": 0.20284868919011903,
"grad_norm": 0.4071178451879595,
"learning_rate": 3.6375242267626374e-05,
"loss": 0.8362,
"step": 737
},
{
"epoch": 0.20312392486066194,
"grad_norm": 0.4626094932446769,
"learning_rate": 3.636520436383479e-05,
"loss": 0.85,
"step": 738
},
{
"epoch": 0.20339916053120485,
"grad_norm": 0.4444176379580745,
"learning_rate": 3.635515396948681e-05,
"loss": 0.8418,
"step": 739
},
{
"epoch": 0.20367439620174774,
"grad_norm": 0.39943612934790257,
"learning_rate": 3.634509109225328e-05,
"loss": 0.8176,
"step": 740
},
{
"epoch": 0.20394963187229065,
"grad_norm": 0.42746362491863327,
"learning_rate": 3.633501573981458e-05,
"loss": 0.8221,
"step": 741
},
{
"epoch": 0.20422486754283356,
"grad_norm": 0.43955769005816575,
"learning_rate": 3.6324927919860605e-05,
"loss": 0.8418,
"step": 742
},
{
"epoch": 0.20450010321337644,
"grad_norm": 0.40136283800513434,
"learning_rate": 3.631482764009077e-05,
"loss": 0.8597,
"step": 743
},
{
"epoch": 0.20477533888391936,
"grad_norm": 0.4342336125734908,
"learning_rate": 3.6304714908214005e-05,
"loss": 0.8522,
"step": 744
},
{
"epoch": 0.20505057455446227,
"grad_norm": 0.39796792057967034,
"learning_rate": 3.629458973194872e-05,
"loss": 0.8314,
"step": 745
},
{
"epoch": 0.20532581022500515,
"grad_norm": 0.36486618459201503,
"learning_rate": 3.6284452119022864e-05,
"loss": 0.8294,
"step": 746
},
{
"epoch": 0.20560104589554806,
"grad_norm": 0.42153144887888017,
"learning_rate": 3.627430207717384e-05,
"loss": 0.8476,
"step": 747
},
{
"epoch": 0.20587628156609097,
"grad_norm": 0.47187906267962265,
"learning_rate": 3.626413961414856e-05,
"loss": 0.8607,
"step": 748
},
{
"epoch": 0.20615151723663386,
"grad_norm": 0.571093129302097,
"learning_rate": 3.62539647377034e-05,
"loss": 0.849,
"step": 749
},
{
"epoch": 0.20642675290717677,
"grad_norm": 0.39155079928047015,
"learning_rate": 3.624377745560423e-05,
"loss": 0.8316,
"step": 750
},
{
"epoch": 0.20670198857771968,
"grad_norm": 0.4626947473628133,
"learning_rate": 3.6233577775626364e-05,
"loss": 0.8106,
"step": 751
},
{
"epoch": 0.20697722424826256,
"grad_norm": 0.3825681798940843,
"learning_rate": 3.62233657055546e-05,
"loss": 0.8505,
"step": 752
},
{
"epoch": 0.20725245991880548,
"grad_norm": 0.8103835841494073,
"learning_rate": 3.621314125318319e-05,
"loss": 0.8371,
"step": 753
},
{
"epoch": 0.2075276955893484,
"grad_norm": 0.544841934052842,
"learning_rate": 3.620290442631581e-05,
"loss": 0.8554,
"step": 754
},
{
"epoch": 0.20780293125989127,
"grad_norm": 0.4640226605335836,
"learning_rate": 3.619265523276563e-05,
"loss": 0.8572,
"step": 755
},
{
"epoch": 0.20807816693043418,
"grad_norm": 0.4142139456870526,
"learning_rate": 3.6182393680355215e-05,
"loss": 0.8407,
"step": 756
},
{
"epoch": 0.2083534026009771,
"grad_norm": 0.41231709603275885,
"learning_rate": 3.6172119776916574e-05,
"loss": 0.8627,
"step": 757
},
{
"epoch": 0.20862863827151998,
"grad_norm": 0.42718139122733884,
"learning_rate": 3.616183353029116e-05,
"loss": 0.8542,
"step": 758
},
{
"epoch": 0.2089038739420629,
"grad_norm": 0.42746219507199107,
"learning_rate": 3.615153494832982e-05,
"loss": 0.8455,
"step": 759
},
{
"epoch": 0.2091791096126058,
"grad_norm": 0.39937711944269977,
"learning_rate": 3.6141224038892844e-05,
"loss": 0.8575,
"step": 760
},
{
"epoch": 0.20945434528314869,
"grad_norm": 0.40072257740751366,
"learning_rate": 3.613090080984991e-05,
"loss": 0.8381,
"step": 761
},
{
"epoch": 0.2097295809536916,
"grad_norm": 0.4458910693132313,
"learning_rate": 3.6120565269080106e-05,
"loss": 0.8407,
"step": 762
},
{
"epoch": 0.2100048166242345,
"grad_norm": 0.407923953462484,
"learning_rate": 3.611021742447191e-05,
"loss": 0.8258,
"step": 763
},
{
"epoch": 0.2102800522947774,
"grad_norm": 0.4238305847658918,
"learning_rate": 3.6099857283923207e-05,
"loss": 0.813,
"step": 764
},
{
"epoch": 0.2105552879653203,
"grad_norm": 0.47777018385121184,
"learning_rate": 3.608948485534125e-05,
"loss": 0.8392,
"step": 765
},
{
"epoch": 0.21083052363586322,
"grad_norm": 0.3967502537158732,
"learning_rate": 3.607910014664268e-05,
"loss": 0.8059,
"step": 766
},
{
"epoch": 0.2111057593064061,
"grad_norm": 0.4237758123471388,
"learning_rate": 3.60687031657535e-05,
"loss": 0.8245,
"step": 767
},
{
"epoch": 0.211380994976949,
"grad_norm": 0.4774666222648608,
"learning_rate": 3.60582939206091e-05,
"loss": 0.8135,
"step": 768
},
{
"epoch": 0.21165623064749192,
"grad_norm": 0.446412856116806,
"learning_rate": 3.6047872419154214e-05,
"loss": 0.8272,
"step": 769
},
{
"epoch": 0.2119314663180348,
"grad_norm": 0.39678718268544694,
"learning_rate": 3.603743866934293e-05,
"loss": 0.8194,
"step": 770
},
{
"epoch": 0.21220670198857772,
"grad_norm": 0.4710935686138563,
"learning_rate": 3.60269926791387e-05,
"loss": 0.8534,
"step": 771
},
{
"epoch": 0.21248193765912063,
"grad_norm": 0.4720495079718069,
"learning_rate": 3.60165344565143e-05,
"loss": 0.8632,
"step": 772
},
{
"epoch": 0.2127571733296635,
"grad_norm": 0.37049250958887253,
"learning_rate": 3.600606400945184e-05,
"loss": 0.8287,
"step": 773
},
{
"epoch": 0.21303240900020642,
"grad_norm": 0.49193622376753743,
"learning_rate": 3.5995581345942783e-05,
"loss": 0.8417,
"step": 774
},
{
"epoch": 0.21330764467074934,
"grad_norm": 0.5202575001687807,
"learning_rate": 3.5985086473987905e-05,
"loss": 0.8315,
"step": 775
},
{
"epoch": 0.21358288034129222,
"grad_norm": 0.39311401189204315,
"learning_rate": 3.597457940159728e-05,
"loss": 0.8196,
"step": 776
},
{
"epoch": 0.21385811601183513,
"grad_norm": 0.40245893709363184,
"learning_rate": 3.596406013679034e-05,
"loss": 0.8363,
"step": 777
},
{
"epoch": 0.21413335168237804,
"grad_norm": 0.45702828332856205,
"learning_rate": 3.595352868759577e-05,
"loss": 0.8311,
"step": 778
},
{
"epoch": 0.21440858735292093,
"grad_norm": 0.38752198413705274,
"learning_rate": 3.5942985062051584e-05,
"loss": 0.8729,
"step": 779
},
{
"epoch": 0.21468382302346384,
"grad_norm": 0.4205086360306346,
"learning_rate": 3.593242926820509e-05,
"loss": 0.8396,
"step": 780
},
{
"epoch": 0.21495905869400675,
"grad_norm": 0.41145441647441555,
"learning_rate": 3.592186131411288e-05,
"loss": 0.8424,
"step": 781
},
{
"epoch": 0.21523429436454963,
"grad_norm": 0.4291218418547524,
"learning_rate": 3.591128120784081e-05,
"loss": 0.8341,
"step": 782
},
{
"epoch": 0.21550953003509254,
"grad_norm": 0.4155079935445456,
"learning_rate": 3.590068895746405e-05,
"loss": 0.8526,
"step": 783
},
{
"epoch": 0.21578476570563546,
"grad_norm": 1.1445884515085594,
"learning_rate": 3.589008457106699e-05,
"loss": 0.8226,
"step": 784
},
{
"epoch": 0.21606000137617834,
"grad_norm": 0.3764159673685187,
"learning_rate": 3.587946805674333e-05,
"loss": 0.8361,
"step": 785
},
{
"epoch": 0.21633523704672125,
"grad_norm": 0.4766390433135783,
"learning_rate": 3.5868839422595984e-05,
"loss": 0.8187,
"step": 786
},
{
"epoch": 0.21661047271726416,
"grad_norm": 0.5686529336641953,
"learning_rate": 3.5858198676737146e-05,
"loss": 0.8436,
"step": 787
},
{
"epoch": 0.21688570838780705,
"grad_norm": 0.39493700305912877,
"learning_rate": 3.5847545827288245e-05,
"loss": 0.857,
"step": 788
},
{
"epoch": 0.21716094405834996,
"grad_norm": 0.4051005714111124,
"learning_rate": 3.583688088237995e-05,
"loss": 0.8429,
"step": 789
},
{
"epoch": 0.21743617972889287,
"grad_norm": 0.41283129410754954,
"learning_rate": 3.582620385015215e-05,
"loss": 0.8374,
"step": 790
},
{
"epoch": 0.21771141539943575,
"grad_norm": 0.39826594291908823,
"learning_rate": 3.581551473875397e-05,
"loss": 0.8274,
"step": 791
},
{
"epoch": 0.21798665106997867,
"grad_norm": 0.4020013292191615,
"learning_rate": 3.5804813556343764e-05,
"loss": 0.8187,
"step": 792
},
{
"epoch": 0.21826188674052158,
"grad_norm": 0.48337407202357424,
"learning_rate": 3.579410031108908e-05,
"loss": 0.8478,
"step": 793
},
{
"epoch": 0.21853712241106446,
"grad_norm": 0.43589893225941345,
"learning_rate": 3.578337501116668e-05,
"loss": 0.8183,
"step": 794
},
{
"epoch": 0.21881235808160737,
"grad_norm": 0.3969535610324806,
"learning_rate": 3.577263766476253e-05,
"loss": 0.8091,
"step": 795
},
{
"epoch": 0.21908759375215028,
"grad_norm": 0.4299895238949179,
"learning_rate": 3.576188828007178e-05,
"loss": 0.8381,
"step": 796
},
{
"epoch": 0.21936282942269317,
"grad_norm": 0.46392382864977694,
"learning_rate": 3.575112686529879e-05,
"loss": 0.8407,
"step": 797
},
{
"epoch": 0.21963806509323608,
"grad_norm": 0.4419500179183648,
"learning_rate": 3.5740353428657075e-05,
"loss": 0.8226,
"step": 798
},
{
"epoch": 0.219913300763779,
"grad_norm": 0.3947764659546133,
"learning_rate": 3.572956797836934e-05,
"loss": 0.8247,
"step": 799
},
{
"epoch": 0.22018853643432187,
"grad_norm": 0.44553746290385293,
"learning_rate": 3.571877052266747e-05,
"loss": 0.8194,
"step": 800
},
{
"epoch": 0.22046377210486479,
"grad_norm": 0.6022734575775616,
"learning_rate": 3.5707961069792483e-05,
"loss": 0.8232,
"step": 801
},
{
"epoch": 0.2207390077754077,
"grad_norm": 0.40624940254379577,
"learning_rate": 3.5697139627994585e-05,
"loss": 0.8157,
"step": 802
},
{
"epoch": 0.2210142434459506,
"grad_norm": 0.3995554611453746,
"learning_rate": 3.568630620553311e-05,
"loss": 0.7882,
"step": 803
},
{
"epoch": 0.2212894791164935,
"grad_norm": 0.434957866603094,
"learning_rate": 3.567546081067654e-05,
"loss": 0.8205,
"step": 804
},
{
"epoch": 0.2215647147870364,
"grad_norm": 0.4411153780823904,
"learning_rate": 3.566460345170252e-05,
"loss": 0.8203,
"step": 805
},
{
"epoch": 0.22183995045757932,
"grad_norm": 0.3631520885030517,
"learning_rate": 3.565373413689779e-05,
"loss": 0.824,
"step": 806
},
{
"epoch": 0.2221151861281222,
"grad_norm": 0.37775935748126854,
"learning_rate": 3.5642852874558224e-05,
"loss": 0.8335,
"step": 807
},
{
"epoch": 0.2223904217986651,
"grad_norm": 0.41532774700032266,
"learning_rate": 3.563195967298884e-05,
"loss": 0.8428,
"step": 808
},
{
"epoch": 0.22266565746920802,
"grad_norm": 0.43219370366574045,
"learning_rate": 3.5621054540503736e-05,
"loss": 0.844,
"step": 809
},
{
"epoch": 0.2229408931397509,
"grad_norm": 0.4192748580747729,
"learning_rate": 3.561013748542615e-05,
"loss": 0.8239,
"step": 810
},
{
"epoch": 0.22321612881029382,
"grad_norm": 0.4004945451578492,
"learning_rate": 3.559920851608837e-05,
"loss": 0.8288,
"step": 811
},
{
"epoch": 0.22349136448083673,
"grad_norm": 0.37578485115688853,
"learning_rate": 3.558826764083183e-05,
"loss": 0.8174,
"step": 812
},
{
"epoch": 0.2237666001513796,
"grad_norm": 0.3900818322817826,
"learning_rate": 3.557731486800703e-05,
"loss": 0.8362,
"step": 813
},
{
"epoch": 0.22404183582192252,
"grad_norm": 0.4111267758159336,
"learning_rate": 3.556635020597354e-05,
"loss": 0.8534,
"step": 814
},
{
"epoch": 0.22431707149246544,
"grad_norm": 0.3443605977234703,
"learning_rate": 3.5555373663100015e-05,
"loss": 0.8554,
"step": 815
},
{
"epoch": 0.22459230716300832,
"grad_norm": 0.4087510461090667,
"learning_rate": 3.554438524776418e-05,
"loss": 0.847,
"step": 816
},
{
"epoch": 0.22486754283355123,
"grad_norm": 0.3969115734804899,
"learning_rate": 3.5533384968352816e-05,
"loss": 0.812,
"step": 817
},
{
"epoch": 0.22514277850409414,
"grad_norm": 0.3522044017362988,
"learning_rate": 3.5522372833261764e-05,
"loss": 0.8143,
"step": 818
},
{
"epoch": 0.22541801417463703,
"grad_norm": 0.37087751427808874,
"learning_rate": 3.55113488508959e-05,
"loss": 0.8436,
"step": 819
},
{
"epoch": 0.22569324984517994,
"grad_norm": 0.3922001870166564,
"learning_rate": 3.550031302966918e-05,
"loss": 0.8495,
"step": 820
},
{
"epoch": 0.22596848551572285,
"grad_norm": 0.491203905706425,
"learning_rate": 3.548926537800454e-05,
"loss": 0.8486,
"step": 821
},
{
"epoch": 0.22624372118626573,
"grad_norm": 0.39444069165635215,
"learning_rate": 3.547820590433399e-05,
"loss": 0.8125,
"step": 822
},
{
"epoch": 0.22651895685680865,
"grad_norm": 0.3486731037800461,
"learning_rate": 3.546713461709854e-05,
"loss": 0.8501,
"step": 823
},
{
"epoch": 0.22679419252735156,
"grad_norm": 0.38337616001967323,
"learning_rate": 3.5456051524748234e-05,
"loss": 0.8487,
"step": 824
},
{
"epoch": 0.22706942819789444,
"grad_norm": 0.5097332013738162,
"learning_rate": 3.5444956635742107e-05,
"loss": 0.8557,
"step": 825
},
{
"epoch": 0.22734466386843735,
"grad_norm": 0.3759475626959331,
"learning_rate": 3.543384995854821e-05,
"loss": 0.8445,
"step": 826
},
{
"epoch": 0.22761989953898026,
"grad_norm": 0.3863457112218582,
"learning_rate": 3.5422731501643595e-05,
"loss": 0.8318,
"step": 827
},
{
"epoch": 0.22789513520952315,
"grad_norm": 0.46392118866275983,
"learning_rate": 3.541160127351429e-05,
"loss": 0.8483,
"step": 828
},
{
"epoch": 0.22817037088006606,
"grad_norm": 0.405655137806616,
"learning_rate": 3.540045928265531e-05,
"loss": 0.811,
"step": 829
},
{
"epoch": 0.22844560655060897,
"grad_norm": 0.34745418798529587,
"learning_rate": 3.538930553757067e-05,
"loss": 0.8354,
"step": 830
},
{
"epoch": 0.22872084222115185,
"grad_norm": 0.4540177345167058,
"learning_rate": 3.5378140046773324e-05,
"loss": 0.8434,
"step": 831
},
{
"epoch": 0.22899607789169477,
"grad_norm": 0.3282915556725718,
"learning_rate": 3.536696281878521e-05,
"loss": 0.8289,
"step": 832
},
{
"epoch": 0.22927131356223768,
"grad_norm": 0.401827968549487,
"learning_rate": 3.535577386213723e-05,
"loss": 0.8329,
"step": 833
},
{
"epoch": 0.22954654923278056,
"grad_norm": 0.43062890667059867,
"learning_rate": 3.534457318536921e-05,
"loss": 0.813,
"step": 834
},
{
"epoch": 0.22982178490332347,
"grad_norm": 0.38907463282522503,
"learning_rate": 3.5333360797029957e-05,
"loss": 0.8533,
"step": 835
},
{
"epoch": 0.23009702057386638,
"grad_norm": 0.3466293668642444,
"learning_rate": 3.5322136705677186e-05,
"loss": 0.8378,
"step": 836
},
{
"epoch": 0.23037225624440927,
"grad_norm": 0.32815307841346353,
"learning_rate": 3.531090091987757e-05,
"loss": 0.7919,
"step": 837
},
{
"epoch": 0.23064749191495218,
"grad_norm": 0.39176158147379686,
"learning_rate": 3.529965344820668e-05,
"loss": 0.8272,
"step": 838
},
{
"epoch": 0.2309227275854951,
"grad_norm": 0.396333118129171,
"learning_rate": 3.528839429924904e-05,
"loss": 0.8145,
"step": 839
},
{
"epoch": 0.23119796325603797,
"grad_norm": 0.41623749051402525,
"learning_rate": 3.527712348159805e-05,
"loss": 0.8167,
"step": 840
},
{
"epoch": 0.2314731989265809,
"grad_norm": 0.34546793308076407,
"learning_rate": 3.526584100385603e-05,
"loss": 0.8219,
"step": 841
},
{
"epoch": 0.2317484345971238,
"grad_norm": 0.35318250038626503,
"learning_rate": 3.5254546874634226e-05,
"loss": 0.8246,
"step": 842
},
{
"epoch": 0.23202367026766668,
"grad_norm": 0.3649066796385545,
"learning_rate": 3.524324110255273e-05,
"loss": 0.8496,
"step": 843
},
{
"epoch": 0.2322989059382096,
"grad_norm": 0.368191191502741,
"learning_rate": 3.5231923696240564e-05,
"loss": 0.8199,
"step": 844
},
{
"epoch": 0.2325741416087525,
"grad_norm": 0.40493101571231055,
"learning_rate": 3.52205946643356e-05,
"loss": 0.7873,
"step": 845
},
{
"epoch": 0.2328493772792954,
"grad_norm": 0.3625293265094342,
"learning_rate": 3.520925401548459e-05,
"loss": 0.8151,
"step": 846
},
{
"epoch": 0.2331246129498383,
"grad_norm": 0.3600061234109703,
"learning_rate": 3.519790175834316e-05,
"loss": 0.8514,
"step": 847
},
{
"epoch": 0.2333998486203812,
"grad_norm": 0.39105417860306296,
"learning_rate": 3.518653790157579e-05,
"loss": 0.8128,
"step": 848
},
{
"epoch": 0.2336750842909241,
"grad_norm": 0.4051778323219665,
"learning_rate": 3.517516245385582e-05,
"loss": 0.8445,
"step": 849
},
{
"epoch": 0.233950319961467,
"grad_norm": 0.36006749274778344,
"learning_rate": 3.5163775423865426e-05,
"loss": 0.8328,
"step": 850
},
{
"epoch": 0.23422555563200992,
"grad_norm": 0.4466703831778952,
"learning_rate": 3.515237682029563e-05,
"loss": 0.8141,
"step": 851
},
{
"epoch": 0.2345007913025528,
"grad_norm": 0.436716336795539,
"learning_rate": 3.514096665184628e-05,
"loss": 0.807,
"step": 852
},
{
"epoch": 0.23477602697309571,
"grad_norm": 0.38083760181145965,
"learning_rate": 3.512954492722607e-05,
"loss": 0.8126,
"step": 853
},
{
"epoch": 0.23505126264363863,
"grad_norm": 0.36376687064587027,
"learning_rate": 3.5118111655152495e-05,
"loss": 0.8255,
"step": 854
},
{
"epoch": 0.2353264983141815,
"grad_norm": 0.40457155368152997,
"learning_rate": 3.5106666844351865e-05,
"loss": 0.8569,
"step": 855
},
{
"epoch": 0.23560173398472442,
"grad_norm": 0.36572034471340376,
"learning_rate": 3.5095210503559315e-05,
"loss": 0.848,
"step": 856
},
{
"epoch": 0.23587696965526733,
"grad_norm": 0.36066304336468447,
"learning_rate": 3.508374264151876e-05,
"loss": 0.833,
"step": 857
},
{
"epoch": 0.23615220532581022,
"grad_norm": 0.40143929438584175,
"learning_rate": 3.507226326698291e-05,
"loss": 0.8235,
"step": 858
},
{
"epoch": 0.23642744099635313,
"grad_norm": 0.40118410655130093,
"learning_rate": 3.506077238871328e-05,
"loss": 0.8443,
"step": 859
},
{
"epoch": 0.23670267666689604,
"grad_norm": 0.3873017074806495,
"learning_rate": 3.504927001548014e-05,
"loss": 0.8456,
"step": 860
},
{
"epoch": 0.23697791233743892,
"grad_norm": 1.1975254182139037,
"learning_rate": 3.503775615606255e-05,
"loss": 0.8334,
"step": 861
},
{
"epoch": 0.23725314800798183,
"grad_norm": 0.3576509096851949,
"learning_rate": 3.502623081924833e-05,
"loss": 0.8473,
"step": 862
},
{
"epoch": 0.23752838367852475,
"grad_norm": 0.4197276766222959,
"learning_rate": 3.501469401383407e-05,
"loss": 0.8473,
"step": 863
},
{
"epoch": 0.23780361934906763,
"grad_norm": 0.40580866166081847,
"learning_rate": 3.50031457486251e-05,
"loss": 0.8455,
"step": 864
},
{
"epoch": 0.23807885501961054,
"grad_norm": 0.4000441961670813,
"learning_rate": 3.499158603243551e-05,
"loss": 0.8319,
"step": 865
},
{
"epoch": 0.23835409069015345,
"grad_norm": 0.3720020981698597,
"learning_rate": 3.498001487408811e-05,
"loss": 0.8258,
"step": 866
},
{
"epoch": 0.23862932636069634,
"grad_norm": 0.46471451586910617,
"learning_rate": 3.4968432282414455e-05,
"loss": 0.8333,
"step": 867
},
{
"epoch": 0.23890456203123925,
"grad_norm": 0.4924794839189762,
"learning_rate": 3.495683826625485e-05,
"loss": 0.8488,
"step": 868
},
{
"epoch": 0.23917979770178216,
"grad_norm": 0.4557583818564989,
"learning_rate": 3.494523283445826e-05,
"loss": 0.8138,
"step": 869
},
{
"epoch": 0.23945503337232504,
"grad_norm": 0.43558174579410125,
"learning_rate": 3.493361599588243e-05,
"loss": 0.7978,
"step": 870
},
{
"epoch": 0.23973026904286796,
"grad_norm": 0.4093112919080268,
"learning_rate": 3.4921987759393755e-05,
"loss": 0.8347,
"step": 871
},
{
"epoch": 0.24000550471341087,
"grad_norm": 0.38281761969319905,
"learning_rate": 3.491034813386738e-05,
"loss": 0.825,
"step": 872
},
{
"epoch": 0.24028074038395375,
"grad_norm": 0.3666315809393925,
"learning_rate": 3.489869712818709e-05,
"loss": 0.8247,
"step": 873
},
{
"epoch": 0.24055597605449666,
"grad_norm": 0.3871490455459194,
"learning_rate": 3.488703475124541e-05,
"loss": 0.8278,
"step": 874
},
{
"epoch": 0.24083121172503957,
"grad_norm": 0.4241073969704235,
"learning_rate": 3.48753610119435e-05,
"loss": 0.8058,
"step": 875
},
{
"epoch": 0.24110644739558246,
"grad_norm": 0.3803506114056611,
"learning_rate": 3.486367591919121e-05,
"loss": 0.8532,
"step": 876
},
{
"epoch": 0.24138168306612537,
"grad_norm": 0.39639190673605307,
"learning_rate": 3.485197948190706e-05,
"loss": 0.8368,
"step": 877
},
{
"epoch": 0.24165691873666828,
"grad_norm": 0.38092093936016846,
"learning_rate": 3.484027170901822e-05,
"loss": 0.8508,
"step": 878
},
{
"epoch": 0.24193215440721116,
"grad_norm": 0.4019989320183119,
"learning_rate": 3.482855260946052e-05,
"loss": 0.8264,
"step": 879
},
{
"epoch": 0.24220739007775408,
"grad_norm": 0.4237030119016722,
"learning_rate": 3.4816822192178415e-05,
"loss": 0.8616,
"step": 880
},
{
"epoch": 0.242482625748297,
"grad_norm": 0.35898612975232064,
"learning_rate": 3.480508046612502e-05,
"loss": 0.7892,
"step": 881
},
{
"epoch": 0.24275786141883987,
"grad_norm": 0.3768587129155393,
"learning_rate": 3.479332744026208e-05,
"loss": 0.8565,
"step": 882
},
{
"epoch": 0.24303309708938278,
"grad_norm": 0.40306428284909,
"learning_rate": 3.478156312355996e-05,
"loss": 0.8422,
"step": 883
},
{
"epoch": 0.2433083327599257,
"grad_norm": 0.5132061342859415,
"learning_rate": 3.476978752499763e-05,
"loss": 0.8377,
"step": 884
},
{
"epoch": 0.24358356843046858,
"grad_norm": 0.3989743114285761,
"learning_rate": 3.4758000653562695e-05,
"loss": 0.8273,
"step": 885
},
{
"epoch": 0.2438588041010115,
"grad_norm": 0.413623547871739,
"learning_rate": 3.4746202518251344e-05,
"loss": 0.8266,
"step": 886
},
{
"epoch": 0.2441340397715544,
"grad_norm": 0.4401850552238258,
"learning_rate": 3.473439312806836e-05,
"loss": 0.8127,
"step": 887
},
{
"epoch": 0.24440927544209728,
"grad_norm": 0.4554287041333923,
"learning_rate": 3.4722572492027136e-05,
"loss": 0.8554,
"step": 888
},
{
"epoch": 0.2446845111126402,
"grad_norm": 0.4118004994600118,
"learning_rate": 3.4710740619149645e-05,
"loss": 0.8113,
"step": 889
},
{
"epoch": 0.2449597467831831,
"grad_norm": 0.34307074613438765,
"learning_rate": 3.469889751846642e-05,
"loss": 0.8459,
"step": 890
},
{
"epoch": 0.245234982453726,
"grad_norm": 0.4452265504756861,
"learning_rate": 3.468704319901657e-05,
"loss": 0.8261,
"step": 891
},
{
"epoch": 0.2455102181242689,
"grad_norm": 0.4696507396866337,
"learning_rate": 3.467517766984778e-05,
"loss": 0.8404,
"step": 892
},
{
"epoch": 0.24578545379481181,
"grad_norm": 0.43928661680504655,
"learning_rate": 3.466330094001628e-05,
"loss": 0.8513,
"step": 893
},
{
"epoch": 0.2460606894653547,
"grad_norm": 0.4342036451462862,
"learning_rate": 3.4651413018586844e-05,
"loss": 0.809,
"step": 894
},
{
"epoch": 0.2463359251358976,
"grad_norm": 0.4039800086638075,
"learning_rate": 3.4639513914632785e-05,
"loss": 0.8079,
"step": 895
},
{
"epoch": 0.24661116080644052,
"grad_norm": 0.3917827211289552,
"learning_rate": 3.4627603637235966e-05,
"loss": 0.8188,
"step": 896
},
{
"epoch": 0.2468863964769834,
"grad_norm": 0.43830843883671355,
"learning_rate": 3.461568219548678e-05,
"loss": 0.8036,
"step": 897
},
{
"epoch": 0.24716163214752632,
"grad_norm": 0.3466938340677024,
"learning_rate": 3.460374959848412e-05,
"loss": 0.8094,
"step": 898
},
{
"epoch": 0.24743686781806923,
"grad_norm": 0.37533059414164205,
"learning_rate": 3.459180585533542e-05,
"loss": 0.8255,
"step": 899
},
{
"epoch": 0.2477121034886121,
"grad_norm": 0.3563580048663711,
"learning_rate": 3.457985097515659e-05,
"loss": 0.8259,
"step": 900
},
{
"epoch": 0.24798733915915502,
"grad_norm": 0.35500169036390944,
"learning_rate": 3.456788496707206e-05,
"loss": 0.8323,
"step": 901
},
{
"epoch": 0.24826257482969794,
"grad_norm": 0.3379825574292942,
"learning_rate": 3.455590784021476e-05,
"loss": 0.8418,
"step": 902
},
{
"epoch": 0.24853781050024082,
"grad_norm": 0.3182222923242929,
"learning_rate": 3.454391960372608e-05,
"loss": 0.8465,
"step": 903
},
{
"epoch": 0.24881304617078373,
"grad_norm": 0.33768377703810865,
"learning_rate": 3.453192026675591e-05,
"loss": 0.846,
"step": 904
},
{
"epoch": 0.24908828184132664,
"grad_norm": 0.3540391684873382,
"learning_rate": 3.451990983846262e-05,
"loss": 0.8757,
"step": 905
},
{
"epoch": 0.24936351751186953,
"grad_norm": 0.33240189216415617,
"learning_rate": 3.4507888328013024e-05,
"loss": 0.8366,
"step": 906
},
{
"epoch": 0.24963875318241244,
"grad_norm": 0.3925041314170152,
"learning_rate": 3.44958557445824e-05,
"loss": 0.8302,
"step": 907
},
{
"epoch": 0.24991398885295535,
"grad_norm": 0.33449708063817496,
"learning_rate": 3.4483812097354494e-05,
"loss": 0.8069,
"step": 908
},
{
"epoch": 0.25018922452349823,
"grad_norm": 0.3406515455017463,
"learning_rate": 3.4471757395521465e-05,
"loss": 0.8462,
"step": 909
},
{
"epoch": 0.25046446019404117,
"grad_norm": 0.41220154811851184,
"learning_rate": 3.445969164828394e-05,
"loss": 0.8244,
"step": 910
},
{
"epoch": 0.25073969586458406,
"grad_norm": 0.4371018285501689,
"learning_rate": 3.444761486485095e-05,
"loss": 0.7883,
"step": 911
},
{
"epoch": 0.25101493153512694,
"grad_norm": 0.43146670222188904,
"learning_rate": 3.443552705443998e-05,
"loss": 0.8343,
"step": 912
},
{
"epoch": 0.2512901672056699,
"grad_norm": 0.38249638482227416,
"learning_rate": 3.442342822627691e-05,
"loss": 0.8463,
"step": 913
},
{
"epoch": 0.25156540287621276,
"grad_norm": 0.4243860027202627,
"learning_rate": 3.4411318389596026e-05,
"loss": 0.8307,
"step": 914
},
{
"epoch": 0.25184063854675565,
"grad_norm": 0.4463803488699344,
"learning_rate": 3.4399197553640026e-05,
"loss": 0.7895,
"step": 915
},
{
"epoch": 0.2521158742172986,
"grad_norm": 0.44495882971206646,
"learning_rate": 3.4387065727660004e-05,
"loss": 0.7994,
"step": 916
},
{
"epoch": 0.25239110988784147,
"grad_norm": 0.39566251098871,
"learning_rate": 3.437492292091543e-05,
"loss": 0.8299,
"step": 917
},
{
"epoch": 0.25266634555838435,
"grad_norm": 0.4045246757106012,
"learning_rate": 3.436276914267418e-05,
"loss": 0.8511,
"step": 918
},
{
"epoch": 0.2529415812289273,
"grad_norm": 0.42371859497702136,
"learning_rate": 3.4350604402212464e-05,
"loss": 0.8251,
"step": 919
},
{
"epoch": 0.2532168168994702,
"grad_norm": 0.39492548977263753,
"learning_rate": 3.4338428708814903e-05,
"loss": 0.8134,
"step": 920
},
{
"epoch": 0.25349205257001306,
"grad_norm": 0.43958568686797717,
"learning_rate": 3.432624207177444e-05,
"loss": 0.828,
"step": 921
},
{
"epoch": 0.253767288240556,
"grad_norm": 0.4053228077160484,
"learning_rate": 3.43140445003924e-05,
"loss": 0.8058,
"step": 922
},
{
"epoch": 0.2540425239110989,
"grad_norm": 0.42733786027660337,
"learning_rate": 3.430183600397844e-05,
"loss": 0.8469,
"step": 923
},
{
"epoch": 0.25431775958164177,
"grad_norm": 0.4254993160770477,
"learning_rate": 3.4289616591850545e-05,
"loss": 0.8152,
"step": 924
},
{
"epoch": 0.2545929952521847,
"grad_norm": 0.4138441910244267,
"learning_rate": 3.427738627333506e-05,
"loss": 0.8423,
"step": 925
},
{
"epoch": 0.2548682309227276,
"grad_norm": 0.3546967376757614,
"learning_rate": 3.426514505776662e-05,
"loss": 0.8135,
"step": 926
},
{
"epoch": 0.2551434665932705,
"grad_norm": 0.39442007367215237,
"learning_rate": 3.4252892954488194e-05,
"loss": 0.8363,
"step": 927
},
{
"epoch": 0.2554187022638134,
"grad_norm": 0.43492833078367915,
"learning_rate": 3.424062997285108e-05,
"loss": 0.8282,
"step": 928
},
{
"epoch": 0.2556939379343563,
"grad_norm": 0.39500704849323737,
"learning_rate": 3.422835612221484e-05,
"loss": 0.8325,
"step": 929
},
{
"epoch": 0.2559691736048992,
"grad_norm": 0.34174578426227653,
"learning_rate": 3.421607141194736e-05,
"loss": 0.8321,
"step": 930
},
{
"epoch": 0.2562444092754421,
"grad_norm": 0.3850431403355228,
"learning_rate": 3.42037758514248e-05,
"loss": 0.8874,
"step": 931
},
{
"epoch": 0.256519644945985,
"grad_norm": 0.41994000628992534,
"learning_rate": 3.4191469450031615e-05,
"loss": 0.837,
"step": 932
},
{
"epoch": 0.2567948806165279,
"grad_norm": 0.42433709518035356,
"learning_rate": 3.417915221716052e-05,
"loss": 0.8297,
"step": 933
},
{
"epoch": 0.2570701162870708,
"grad_norm": 0.3393954015254882,
"learning_rate": 3.416682416221251e-05,
"loss": 0.8312,
"step": 934
},
{
"epoch": 0.2573453519576137,
"grad_norm": 0.3839116112214657,
"learning_rate": 3.415448529459681e-05,
"loss": 0.8374,
"step": 935
},
{
"epoch": 0.2576205876281566,
"grad_norm": 0.44548769644823044,
"learning_rate": 3.4142135623730954e-05,
"loss": 0.8558,
"step": 936
},
{
"epoch": 0.25789582329869953,
"grad_norm": 0.49983780476601025,
"learning_rate": 3.412977515904067e-05,
"loss": 0.8126,
"step": 937
},
{
"epoch": 0.2581710589692424,
"grad_norm": 0.4240413807664675,
"learning_rate": 3.411740390995994e-05,
"loss": 0.8237,
"step": 938
},
{
"epoch": 0.2584462946397853,
"grad_norm": 0.42031699262819905,
"learning_rate": 3.410502188593099e-05,
"loss": 0.8228,
"step": 939
},
{
"epoch": 0.25872153031032824,
"grad_norm": 0.5173553123655487,
"learning_rate": 3.409262909640425e-05,
"loss": 0.8218,
"step": 940
},
{
"epoch": 0.2589967659808711,
"grad_norm": 0.4245075055820437,
"learning_rate": 3.4080225550838375e-05,
"loss": 0.8268,
"step": 941
},
{
"epoch": 0.259272001651414,
"grad_norm": 0.3727650142067033,
"learning_rate": 3.4067811258700236e-05,
"loss": 0.8258,
"step": 942
},
{
"epoch": 0.25954723732195695,
"grad_norm": 0.5595207476315207,
"learning_rate": 3.40553862294649e-05,
"loss": 0.8124,
"step": 943
},
{
"epoch": 0.25982247299249983,
"grad_norm": 0.49741572494471603,
"learning_rate": 3.4042950472615635e-05,
"loss": 0.8276,
"step": 944
},
{
"epoch": 0.2600977086630427,
"grad_norm": 0.48165510019003055,
"learning_rate": 3.4030503997643876e-05,
"loss": 0.8461,
"step": 945
},
{
"epoch": 0.26037294433358565,
"grad_norm": 0.48855567453007936,
"learning_rate": 3.4018046814049265e-05,
"loss": 0.8302,
"step": 946
},
{
"epoch": 0.26064818000412854,
"grad_norm": 0.436296244261392,
"learning_rate": 3.400557893133961e-05,
"loss": 0.8171,
"step": 947
},
{
"epoch": 0.2609234156746714,
"grad_norm": 0.3988161745438226,
"learning_rate": 3.399310035903087e-05,
"loss": 0.816,
"step": 948
},
{
"epoch": 0.26119865134521436,
"grad_norm": 0.42125979614280584,
"learning_rate": 3.398061110664717e-05,
"loss": 0.807,
"step": 949
},
{
"epoch": 0.26147388701575724,
"grad_norm": 0.4835863160629807,
"learning_rate": 3.3968111183720804e-05,
"loss": 0.8311,
"step": 950
},
{
"epoch": 0.26174912268630013,
"grad_norm": 0.42889734579019506,
"learning_rate": 3.3955600599792186e-05,
"loss": 0.8391,
"step": 951
},
{
"epoch": 0.26202435835684307,
"grad_norm": 0.39552458434404236,
"learning_rate": 3.394307936440989e-05,
"loss": 0.8301,
"step": 952
},
{
"epoch": 0.26229959402738595,
"grad_norm": 0.41065215014959544,
"learning_rate": 3.393054748713059e-05,
"loss": 0.8238,
"step": 953
},
{
"epoch": 0.26257482969792884,
"grad_norm": 0.5533985577656855,
"learning_rate": 3.391800497751911e-05,
"loss": 0.8051,
"step": 954
},
{
"epoch": 0.2628500653684718,
"grad_norm": 0.47953893440947115,
"learning_rate": 3.3905451845148375e-05,
"loss": 0.8269,
"step": 955
},
{
"epoch": 0.26312530103901466,
"grad_norm": 0.40075344854455874,
"learning_rate": 3.3892888099599415e-05,
"loss": 0.8513,
"step": 956
},
{
"epoch": 0.26340053670955754,
"grad_norm": 0.4543731521458455,
"learning_rate": 3.3880313750461376e-05,
"loss": 0.7749,
"step": 957
},
{
"epoch": 0.2636757723801005,
"grad_norm": 0.5196069215957392,
"learning_rate": 3.386772880733149e-05,
"loss": 0.8211,
"step": 958
},
{
"epoch": 0.26395100805064337,
"grad_norm": 0.45811909805641676,
"learning_rate": 3.3855133279815055e-05,
"loss": 0.8347,
"step": 959
},
{
"epoch": 0.26422624372118625,
"grad_norm": 0.372973209850614,
"learning_rate": 3.3842527177525475e-05,
"loss": 0.8385,
"step": 960
},
{
"epoch": 0.2645014793917292,
"grad_norm": 0.44907430913049645,
"learning_rate": 3.382991051008422e-05,
"loss": 0.8073,
"step": 961
},
{
"epoch": 0.26477671506227207,
"grad_norm": 0.42363534517689194,
"learning_rate": 3.381728328712081e-05,
"loss": 0.8453,
"step": 962
},
{
"epoch": 0.26505195073281496,
"grad_norm": 0.376662745066105,
"learning_rate": 3.3804645518272824e-05,
"loss": 0.8403,
"step": 963
},
{
"epoch": 0.2653271864033579,
"grad_norm": 0.49512431710521343,
"learning_rate": 3.379199721318591e-05,
"loss": 0.8253,
"step": 964
},
{
"epoch": 0.2656024220739008,
"grad_norm": 0.4591187483451386,
"learning_rate": 3.377933838151374e-05,
"loss": 0.8265,
"step": 965
},
{
"epoch": 0.26587765774444366,
"grad_norm": 0.4601168383862767,
"learning_rate": 3.376666903291801e-05,
"loss": 0.8297,
"step": 966
},
{
"epoch": 0.2661528934149866,
"grad_norm": 0.3191724881281232,
"learning_rate": 3.375398917706847e-05,
"loss": 0.8404,
"step": 967
},
{
"epoch": 0.2664281290855295,
"grad_norm": 0.4355995669187843,
"learning_rate": 3.3741298823642874e-05,
"loss": 0.8265,
"step": 968
},
{
"epoch": 0.26670336475607237,
"grad_norm": 0.4395611618140551,
"learning_rate": 3.3728597982326985e-05,
"loss": 0.8013,
"step": 969
},
{
"epoch": 0.2669786004266153,
"grad_norm": 0.4021718298563644,
"learning_rate": 3.371588666281458e-05,
"loss": 0.7941,
"step": 970
},
{
"epoch": 0.2672538360971582,
"grad_norm": 0.37411754108701273,
"learning_rate": 3.370316487480743e-05,
"loss": 0.7882,
"step": 971
},
{
"epoch": 0.2675290717677011,
"grad_norm": 0.532414476809795,
"learning_rate": 3.369043262801529e-05,
"loss": 0.8358,
"step": 972
},
{
"epoch": 0.267804307438244,
"grad_norm": 0.45055415041612057,
"learning_rate": 3.367768993215591e-05,
"loss": 0.8406,
"step": 973
},
{
"epoch": 0.2680795431087869,
"grad_norm": 0.38978013914461174,
"learning_rate": 3.3664936796955006e-05,
"loss": 0.8379,
"step": 974
},
{
"epoch": 0.2683547787793298,
"grad_norm": 0.41528979439579355,
"learning_rate": 3.365217323214626e-05,
"loss": 0.829,
"step": 975
},
{
"epoch": 0.2686300144498727,
"grad_norm": 0.46357769975153607,
"learning_rate": 3.363939924747132e-05,
"loss": 0.8224,
"step": 976
},
{
"epoch": 0.2689052501204156,
"grad_norm": 0.3888950297127714,
"learning_rate": 3.362661485267978e-05,
"loss": 0.8354,
"step": 977
},
{
"epoch": 0.2691804857909585,
"grad_norm": 0.35372551270193886,
"learning_rate": 3.36138200575292e-05,
"loss": 0.8172,
"step": 978
},
{
"epoch": 0.26945572146150143,
"grad_norm": 0.3431499868102491,
"learning_rate": 3.360101487178504e-05,
"loss": 0.797,
"step": 979
},
{
"epoch": 0.2697309571320443,
"grad_norm": 0.40984578702915175,
"learning_rate": 3.3588199305220735e-05,
"loss": 0.8154,
"step": 980
},
{
"epoch": 0.2700061928025872,
"grad_norm": 0.34466839402337285,
"learning_rate": 3.35753733676176e-05,
"loss": 0.8384,
"step": 981
},
{
"epoch": 0.27028142847313014,
"grad_norm": 0.3610475270773301,
"learning_rate": 3.3562537068764896e-05,
"loss": 0.8123,
"step": 982
},
{
"epoch": 0.270556664143673,
"grad_norm": 0.37468376503602213,
"learning_rate": 3.354969041845978e-05,
"loss": 0.8073,
"step": 983
},
{
"epoch": 0.2708318998142159,
"grad_norm": 0.41702853112269633,
"learning_rate": 3.3536833426507324e-05,
"loss": 0.8127,
"step": 984
},
{
"epoch": 0.27110713548475884,
"grad_norm": 0.38086610307318625,
"learning_rate": 3.3523966102720465e-05,
"loss": 0.8237,
"step": 985
},
{
"epoch": 0.2713823711553017,
"grad_norm": 0.3759654765913201,
"learning_rate": 3.3511088456920043e-05,
"loss": 0.8222,
"step": 986
},
{
"epoch": 0.2716576068258446,
"grad_norm": 0.35873822918453746,
"learning_rate": 3.349820049893478e-05,
"loss": 0.7961,
"step": 987
},
{
"epoch": 0.27193284249638755,
"grad_norm": 0.41700414745554093,
"learning_rate": 3.348530223860127e-05,
"loss": 0.7967,
"step": 988
},
{
"epoch": 0.27220807816693043,
"grad_norm": 0.383267708528797,
"learning_rate": 3.3472393685763955e-05,
"loss": 0.8263,
"step": 989
},
{
"epoch": 0.2724833138374733,
"grad_norm": 0.3613953730606685,
"learning_rate": 3.345947485027514e-05,
"loss": 0.8469,
"step": 990
},
{
"epoch": 0.27275854950801626,
"grad_norm": 0.4303837046797646,
"learning_rate": 3.344654574199499e-05,
"loss": 0.8041,
"step": 991
},
{
"epoch": 0.27303378517855914,
"grad_norm": 0.4413659571463197,
"learning_rate": 3.343360637079148e-05,
"loss": 0.8245,
"step": 992
},
{
"epoch": 0.273309020849102,
"grad_norm": 0.36170362697171604,
"learning_rate": 3.342065674654046e-05,
"loss": 0.7983,
"step": 993
},
{
"epoch": 0.27358425651964496,
"grad_norm": 0.40823498803765795,
"learning_rate": 3.340769687912557e-05,
"loss": 0.7897,
"step": 994
},
{
"epoch": 0.27385949219018785,
"grad_norm": 0.45626660403090946,
"learning_rate": 3.339472677843829e-05,
"loss": 0.8149,
"step": 995
},
{
"epoch": 0.27413472786073073,
"grad_norm": 0.4089685051647268,
"learning_rate": 3.33817464543779e-05,
"loss": 0.8165,
"step": 996
},
{
"epoch": 0.27440996353127367,
"grad_norm": 0.3690058613659837,
"learning_rate": 3.336875591685148e-05,
"loss": 0.8301,
"step": 997
},
{
"epoch": 0.27468519920181655,
"grad_norm": 0.4561618026234944,
"learning_rate": 3.335575517577391e-05,
"loss": 0.8238,
"step": 998
},
{
"epoch": 0.27496043487235944,
"grad_norm": 0.41220364133671056,
"learning_rate": 3.334274424106787e-05,
"loss": 0.8332,
"step": 999
},
{
"epoch": 0.2752356705429024,
"grad_norm": 0.3625798814772632,
"learning_rate": 3.33297231226638e-05,
"loss": 0.8407,
"step": 1000
},
{
"epoch": 0.27551090621344526,
"grad_norm": 0.3687920746126404,
"learning_rate": 3.331669183049991e-05,
"loss": 0.8484,
"step": 1001
},
{
"epoch": 0.27578614188398815,
"grad_norm": 0.4592938229425808,
"learning_rate": 3.3303650374522205e-05,
"loss": 0.8076,
"step": 1002
},
{
"epoch": 0.2760613775545311,
"grad_norm": 0.4114855454534366,
"learning_rate": 3.3290598764684415e-05,
"loss": 0.7851,
"step": 1003
},
{
"epoch": 0.27633661322507397,
"grad_norm": 0.3776835579461095,
"learning_rate": 3.3277537010948046e-05,
"loss": 0.8194,
"step": 1004
},
{
"epoch": 0.27661184889561685,
"grad_norm": 0.32442630714287224,
"learning_rate": 3.3264465123282316e-05,
"loss": 0.8225,
"step": 1005
},
{
"epoch": 0.2768870845661598,
"grad_norm": 0.4107413621161955,
"learning_rate": 3.32513831116642e-05,
"loss": 0.8312,
"step": 1006
},
{
"epoch": 0.2771623202367027,
"grad_norm": 0.40158455919786795,
"learning_rate": 3.32382909860784e-05,
"loss": 0.8025,
"step": 1007
},
{
"epoch": 0.27743755590724556,
"grad_norm": 0.3565030330111457,
"learning_rate": 3.322518875651734e-05,
"loss": 0.8227,
"step": 1008
},
{
"epoch": 0.2777127915777885,
"grad_norm": 0.38154773925258695,
"learning_rate": 3.321207643298113e-05,
"loss": 0.8302,
"step": 1009
},
{
"epoch": 0.2779880272483314,
"grad_norm": 0.3500103482441849,
"learning_rate": 3.319895402547761e-05,
"loss": 0.8266,
"step": 1010
},
{
"epoch": 0.27826326291887427,
"grad_norm": 0.38055832839320536,
"learning_rate": 3.318582154402232e-05,
"loss": 0.7931,
"step": 1011
},
{
"epoch": 0.2785384985894172,
"grad_norm": 0.4185997398789307,
"learning_rate": 3.3172678998638456e-05,
"loss": 0.8496,
"step": 1012
},
{
"epoch": 0.2788137342599601,
"grad_norm": 0.35735306715304604,
"learning_rate": 3.315952639935692e-05,
"loss": 0.8089,
"step": 1013
},
{
"epoch": 0.279088969930503,
"grad_norm": 0.3652762209734979,
"learning_rate": 3.314636375621631e-05,
"loss": 0.8347,
"step": 1014
},
{
"epoch": 0.2793642056010459,
"grad_norm": 0.35985576189106766,
"learning_rate": 3.3133191079262835e-05,
"loss": 0.8479,
"step": 1015
},
{
"epoch": 0.2796394412715888,
"grad_norm": 0.3571219737509123,
"learning_rate": 3.31200083785504e-05,
"loss": 0.8214,
"step": 1016
},
{
"epoch": 0.2799146769421317,
"grad_norm": 0.3665332659456565,
"learning_rate": 3.310681566414055e-05,
"loss": 0.846,
"step": 1017
},
{
"epoch": 0.2801899126126746,
"grad_norm": 0.5146021670283341,
"learning_rate": 3.309361294610249e-05,
"loss": 0.8226,
"step": 1018
},
{
"epoch": 0.2804651482832175,
"grad_norm": 0.3832445426516723,
"learning_rate": 3.3080400234513014e-05,
"loss": 0.8247,
"step": 1019
},
{
"epoch": 0.2807403839537604,
"grad_norm": 0.3671994816011794,
"learning_rate": 3.30671775394566e-05,
"loss": 0.8047,
"step": 1020
},
{
"epoch": 0.2810156196243033,
"grad_norm": 0.46637880773878493,
"learning_rate": 3.305394487102531e-05,
"loss": 0.8291,
"step": 1021
},
{
"epoch": 0.2812908552948462,
"grad_norm": 0.4159201543332825,
"learning_rate": 3.304070223931883e-05,
"loss": 0.8152,
"step": 1022
},
{
"epoch": 0.2815660909653891,
"grad_norm": 0.3993491676615847,
"learning_rate": 3.302744965444445e-05,
"loss": 0.8258,
"step": 1023
},
{
"epoch": 0.28184132663593203,
"grad_norm": 0.3379334616921281,
"learning_rate": 3.3014187126517047e-05,
"loss": 0.8262,
"step": 1024
},
{
"epoch": 0.2821165623064749,
"grad_norm": 0.3983835614335685,
"learning_rate": 3.3000914665659106e-05,
"loss": 0.8327,
"step": 1025
},
{
"epoch": 0.2823917979770178,
"grad_norm": 0.36513531070552024,
"learning_rate": 3.298763228200067e-05,
"loss": 0.8489,
"step": 1026
},
{
"epoch": 0.28266703364756074,
"grad_norm": 0.37126347427748796,
"learning_rate": 3.297433998567938e-05,
"loss": 0.8117,
"step": 1027
},
{
"epoch": 0.2829422693181036,
"grad_norm": 0.40646996012559816,
"learning_rate": 3.296103778684043e-05,
"loss": 0.8359,
"step": 1028
},
{
"epoch": 0.2832175049886465,
"grad_norm": 0.38317328376311643,
"learning_rate": 3.294772569563656e-05,
"loss": 0.8011,
"step": 1029
},
{
"epoch": 0.28349274065918945,
"grad_norm": 0.3488498488099232,
"learning_rate": 3.293440372222808e-05,
"loss": 0.8146,
"step": 1030
},
{
"epoch": 0.28376797632973233,
"grad_norm": 0.36648213921116923,
"learning_rate": 3.2921071876782824e-05,
"loss": 0.8049,
"step": 1031
},
{
"epoch": 0.2840432120002752,
"grad_norm": 0.4262654259718712,
"learning_rate": 3.2907730169476194e-05,
"loss": 0.7915,
"step": 1032
},
{
"epoch": 0.28431844767081815,
"grad_norm": 0.3832364802699537,
"learning_rate": 3.289437861049108e-05,
"loss": 0.8358,
"step": 1033
},
{
"epoch": 0.28459368334136104,
"grad_norm": 0.39458916831718355,
"learning_rate": 3.288101721001791e-05,
"loss": 0.7942,
"step": 1034
},
{
"epoch": 0.2848689190119039,
"grad_norm": 0.4231162507259696,
"learning_rate": 3.286764597825463e-05,
"loss": 0.7979,
"step": 1035
},
{
"epoch": 0.28514415468244686,
"grad_norm": 0.44727313611900127,
"learning_rate": 3.2854264925406666e-05,
"loss": 0.8358,
"step": 1036
},
{
"epoch": 0.28541939035298974,
"grad_norm": 0.36572940078160615,
"learning_rate": 3.2840874061686965e-05,
"loss": 0.8144,
"step": 1037
},
{
"epoch": 0.2856946260235326,
"grad_norm": 0.3857149157332862,
"learning_rate": 3.2827473397315945e-05,
"loss": 0.8096,
"step": 1038
},
{
"epoch": 0.28596986169407557,
"grad_norm": 0.4044774233752771,
"learning_rate": 3.2814062942521524e-05,
"loss": 0.8095,
"step": 1039
},
{
"epoch": 0.28624509736461845,
"grad_norm": 0.3990764616275733,
"learning_rate": 3.280064270753906e-05,
"loss": 0.7967,
"step": 1040
},
{
"epoch": 0.28652033303516133,
"grad_norm": 0.35521758970608697,
"learning_rate": 3.278721270261142e-05,
"loss": 0.8042,
"step": 1041
},
{
"epoch": 0.2867955687057043,
"grad_norm": 0.30655482545456053,
"learning_rate": 3.2773772937988874e-05,
"loss": 0.7957,
"step": 1042
},
{
"epoch": 0.28707080437624716,
"grad_norm": 0.37026485436533674,
"learning_rate": 3.27603234239292e-05,
"loss": 0.8373,
"step": 1043
},
{
"epoch": 0.28734604004679004,
"grad_norm": 0.36786957025880945,
"learning_rate": 3.2746864170697554e-05,
"loss": 0.812,
"step": 1044
},
{
"epoch": 0.287621275717333,
"grad_norm": 0.3470180637142766,
"learning_rate": 3.273339518856658e-05,
"loss": 0.8179,
"step": 1045
},
{
"epoch": 0.28789651138787586,
"grad_norm": 0.3756904712576622,
"learning_rate": 3.271991648781632e-05,
"loss": 0.7933,
"step": 1046
},
{
"epoch": 0.28817174705841875,
"grad_norm": 0.354941020911814,
"learning_rate": 3.2706428078734246e-05,
"loss": 0.8325,
"step": 1047
},
{
"epoch": 0.2884469827289617,
"grad_norm": 0.33715387407971126,
"learning_rate": 3.269292997161522e-05,
"loss": 0.8081,
"step": 1048
},
{
"epoch": 0.28872221839950457,
"grad_norm": 0.31762205037148555,
"learning_rate": 3.267942217676153e-05,
"loss": 0.811,
"step": 1049
},
{
"epoch": 0.28899745407004745,
"grad_norm": 0.29591205358051814,
"learning_rate": 3.266590470448284e-05,
"loss": 0.8288,
"step": 1050
},
{
"epoch": 0.2892726897405904,
"grad_norm": 0.33213598524888627,
"learning_rate": 3.265237756509621e-05,
"loss": 0.812,
"step": 1051
},
{
"epoch": 0.2895479254111333,
"grad_norm": 0.32687722628273364,
"learning_rate": 3.263884076892608e-05,
"loss": 0.7973,
"step": 1052
},
{
"epoch": 0.28982316108167616,
"grad_norm": 0.33189357675975595,
"learning_rate": 3.2625294326304255e-05,
"loss": 0.8596,
"step": 1053
},
{
"epoch": 0.2900983967522191,
"grad_norm": 0.3245325935760063,
"learning_rate": 3.26117382475699e-05,
"loss": 0.8156,
"step": 1054
},
{
"epoch": 0.290373632422762,
"grad_norm": 0.3244394751137452,
"learning_rate": 3.259817254306953e-05,
"loss": 0.8366,
"step": 1055
},
{
"epoch": 0.29064886809330487,
"grad_norm": 0.3563546261904209,
"learning_rate": 3.258459722315702e-05,
"loss": 0.7998,
"step": 1056
},
{
"epoch": 0.2909241037638478,
"grad_norm": 0.32714568995843457,
"learning_rate": 3.257101229819359e-05,
"loss": 0.7998,
"step": 1057
},
{
"epoch": 0.2911993394343907,
"grad_norm": 0.3236596736833071,
"learning_rate": 3.255741777854778e-05,
"loss": 0.8391,
"step": 1058
},
{
"epoch": 0.2914745751049336,
"grad_norm": 0.31755531763025624,
"learning_rate": 3.254381367459543e-05,
"loss": 0.8079,
"step": 1059
},
{
"epoch": 0.2917498107754765,
"grad_norm": 0.3357666732093731,
"learning_rate": 3.2530199996719735e-05,
"loss": 0.8483,
"step": 1060
},
{
"epoch": 0.2920250464460194,
"grad_norm": 0.5232287946458585,
"learning_rate": 3.251657675531118e-05,
"loss": 0.8395,
"step": 1061
},
{
"epoch": 0.2923002821165623,
"grad_norm": 0.33446335510949543,
"learning_rate": 3.250294396076755e-05,
"loss": 0.8198,
"step": 1062
},
{
"epoch": 0.2925755177871052,
"grad_norm": 0.34175696038594394,
"learning_rate": 3.248930162349391e-05,
"loss": 0.8364,
"step": 1063
},
{
"epoch": 0.2928507534576481,
"grad_norm": 0.3363356035598911,
"learning_rate": 3.247564975390263e-05,
"loss": 0.8256,
"step": 1064
},
{
"epoch": 0.293125989128191,
"grad_norm": 0.34158877826390627,
"learning_rate": 3.246198836241335e-05,
"loss": 0.822,
"step": 1065
},
{
"epoch": 0.29340122479873393,
"grad_norm": 0.3317539657236755,
"learning_rate": 3.244831745945295e-05,
"loss": 0.8239,
"step": 1066
},
{
"epoch": 0.2936764604692768,
"grad_norm": 0.35703551560031244,
"learning_rate": 3.2434637055455603e-05,
"loss": 0.825,
"step": 1067
},
{
"epoch": 0.2939516961398197,
"grad_norm": 0.3576830094256862,
"learning_rate": 3.242094716086273e-05,
"loss": 0.7974,
"step": 1068
},
{
"epoch": 0.29422693181036264,
"grad_norm": 0.32835397231595426,
"learning_rate": 3.240724778612298e-05,
"loss": 0.8021,
"step": 1069
},
{
"epoch": 0.2945021674809055,
"grad_norm": 0.34593052846467287,
"learning_rate": 3.2393538941692245e-05,
"loss": 0.827,
"step": 1070
},
{
"epoch": 0.2947774031514484,
"grad_norm": 0.3470889744327635,
"learning_rate": 3.237982063803365e-05,
"loss": 0.8116,
"step": 1071
},
{
"epoch": 0.29505263882199134,
"grad_norm": 0.32508847968784993,
"learning_rate": 3.236609288561753e-05,
"loss": 0.7863,
"step": 1072
},
{
"epoch": 0.2953278744925342,
"grad_norm": 0.36213804888615797,
"learning_rate": 3.235235569492143e-05,
"loss": 0.8061,
"step": 1073
},
{
"epoch": 0.2956031101630771,
"grad_norm": 0.3197503350817932,
"learning_rate": 3.2338609076430114e-05,
"loss": 0.8305,
"step": 1074
},
{
"epoch": 0.29587834583362005,
"grad_norm": 0.36038210554453803,
"learning_rate": 3.232485304063553e-05,
"loss": 0.8802,
"step": 1075
},
{
"epoch": 0.29615358150416293,
"grad_norm": 0.328607258126315,
"learning_rate": 3.2311087598036825e-05,
"loss": 0.8267,
"step": 1076
},
{
"epoch": 0.2964288171747058,
"grad_norm": 0.3640745973298245,
"learning_rate": 3.229731275914029e-05,
"loss": 0.7978,
"step": 1077
},
{
"epoch": 0.29670405284524876,
"grad_norm": 0.38813330516283573,
"learning_rate": 3.2283528534459446e-05,
"loss": 0.8318,
"step": 1078
},
{
"epoch": 0.29697928851579164,
"grad_norm": 0.34363745354810477,
"learning_rate": 3.2269734934514923e-05,
"loss": 0.8253,
"step": 1079
},
{
"epoch": 0.2972545241863345,
"grad_norm": 0.3504791780475102,
"learning_rate": 3.2255931969834546e-05,
"loss": 0.8036,
"step": 1080
},
{
"epoch": 0.29752975985687746,
"grad_norm": 0.3658093579369949,
"learning_rate": 3.224211965095326e-05,
"loss": 0.8023,
"step": 1081
},
{
"epoch": 0.29780499552742035,
"grad_norm": 0.38628879778930036,
"learning_rate": 3.2228297988413164e-05,
"loss": 0.8164,
"step": 1082
},
{
"epoch": 0.29808023119796323,
"grad_norm": 0.32724059178950327,
"learning_rate": 3.2214466992763483e-05,
"loss": 0.7929,
"step": 1083
},
{
"epoch": 0.29835546686850617,
"grad_norm": 0.38418792709404465,
"learning_rate": 3.2200626674560575e-05,
"loss": 0.807,
"step": 1084
},
{
"epoch": 0.29863070253904905,
"grad_norm": 0.457456422932918,
"learning_rate": 3.2186777044367896e-05,
"loss": 0.8199,
"step": 1085
},
{
"epoch": 0.29890593820959194,
"grad_norm": 0.3808994397435877,
"learning_rate": 3.217291811275603e-05,
"loss": 0.7976,
"step": 1086
},
{
"epoch": 0.2991811738801349,
"grad_norm": 0.3149983883864467,
"learning_rate": 3.215904989030263e-05,
"loss": 0.8054,
"step": 1087
},
{
"epoch": 0.29945640955067776,
"grad_norm": 0.39142336058549454,
"learning_rate": 3.214517238759248e-05,
"loss": 0.7858,
"step": 1088
},
{
"epoch": 0.29973164522122064,
"grad_norm": 0.40642371193343135,
"learning_rate": 3.213128561521742e-05,
"loss": 0.8198,
"step": 1089
},
{
"epoch": 0.3000068808917636,
"grad_norm": 0.38137415650228934,
"learning_rate": 3.211738958377637e-05,
"loss": 0.8409,
"step": 1090
},
{
"epoch": 0.30028211656230647,
"grad_norm": 0.6588977669498615,
"learning_rate": 3.210348430387531e-05,
"loss": 0.7886,
"step": 1091
},
{
"epoch": 0.30055735223284935,
"grad_norm": 0.3811754020394223,
"learning_rate": 3.2089569786127294e-05,
"loss": 0.7909,
"step": 1092
},
{
"epoch": 0.3008325879033923,
"grad_norm": 0.38282563447111506,
"learning_rate": 3.207564604115242e-05,
"loss": 0.8053,
"step": 1093
},
{
"epoch": 0.3011078235739352,
"grad_norm": 0.375145725571009,
"learning_rate": 3.206171307957783e-05,
"loss": 0.7704,
"step": 1094
},
{
"epoch": 0.30138305924447806,
"grad_norm": 0.3985383893005983,
"learning_rate": 3.2047770912037704e-05,
"loss": 0.8437,
"step": 1095
},
{
"epoch": 0.301658294915021,
"grad_norm": 0.3670117256233208,
"learning_rate": 3.203381954917323e-05,
"loss": 0.7936,
"step": 1096
},
{
"epoch": 0.3019335305855639,
"grad_norm": 0.45355775728194997,
"learning_rate": 3.2019859001632635e-05,
"loss": 0.8196,
"step": 1097
},
{
"epoch": 0.30220876625610676,
"grad_norm": 0.37933306920483373,
"learning_rate": 3.2005889280071154e-05,
"loss": 0.8009,
"step": 1098
},
{
"epoch": 0.3024840019266497,
"grad_norm": 0.6413068450592516,
"learning_rate": 3.1991910395151e-05,
"loss": 0.8376,
"step": 1099
},
{
"epoch": 0.3027592375971926,
"grad_norm": 0.3709682164236483,
"learning_rate": 3.1977922357541414e-05,
"loss": 0.8061,
"step": 1100
},
{
"epoch": 0.30303447326773547,
"grad_norm": 0.35389360583624657,
"learning_rate": 3.196392517791861e-05,
"loss": 0.8107,
"step": 1101
},
{
"epoch": 0.3033097089382784,
"grad_norm": 0.30775989343669724,
"learning_rate": 3.194991886696575e-05,
"loss": 0.8128,
"step": 1102
},
{
"epoch": 0.3035849446088213,
"grad_norm": 0.3676167032966343,
"learning_rate": 3.1935903435373026e-05,
"loss": 0.8052,
"step": 1103
},
{
"epoch": 0.3038601802793642,
"grad_norm": 0.34230786073322306,
"learning_rate": 3.192187889383754e-05,
"loss": 0.8067,
"step": 1104
},
{
"epoch": 0.3041354159499071,
"grad_norm": 0.30832081555167196,
"learning_rate": 3.190784525306336e-05,
"loss": 0.8205,
"step": 1105
},
{
"epoch": 0.30441065162045,
"grad_norm": 0.36304152683383306,
"learning_rate": 3.189380252376151e-05,
"loss": 0.8069,
"step": 1106
},
{
"epoch": 0.3046858872909929,
"grad_norm": 0.3615993897222199,
"learning_rate": 3.187975071664994e-05,
"loss": 0.8019,
"step": 1107
},
{
"epoch": 0.3049611229615358,
"grad_norm": 0.32974113357091156,
"learning_rate": 3.186568984245354e-05,
"loss": 0.8283,
"step": 1108
},
{
"epoch": 0.3052363586320787,
"grad_norm": 0.3238031127924285,
"learning_rate": 3.185161991190411e-05,
"loss": 0.8033,
"step": 1109
},
{
"epoch": 0.30551159430262165,
"grad_norm": 0.33615745699182764,
"learning_rate": 3.183754093574035e-05,
"loss": 0.8104,
"step": 1110
},
{
"epoch": 0.30578682997316453,
"grad_norm": 0.36044438230543663,
"learning_rate": 3.1823452924707894e-05,
"loss": 0.8013,
"step": 1111
},
{
"epoch": 0.3060620656437074,
"grad_norm": 0.5764325505023655,
"learning_rate": 3.180935588955926e-05,
"loss": 0.7694,
"step": 1112
},
{
"epoch": 0.30633730131425035,
"grad_norm": 0.357333249303477,
"learning_rate": 3.179524984105383e-05,
"loss": 0.7981,
"step": 1113
},
{
"epoch": 0.30661253698479324,
"grad_norm": 0.3668360038279917,
"learning_rate": 3.178113478995791e-05,
"loss": 0.8327,
"step": 1114
},
{
"epoch": 0.3068877726553361,
"grad_norm": 0.39086386543844476,
"learning_rate": 3.1767010747044635e-05,
"loss": 0.8309,
"step": 1115
},
{
"epoch": 0.30716300832587906,
"grad_norm": 0.32648495720119647,
"learning_rate": 3.175287772309403e-05,
"loss": 0.835,
"step": 1116
},
{
"epoch": 0.30743824399642194,
"grad_norm": 0.8511553863015345,
"learning_rate": 3.1738735728892956e-05,
"loss": 0.8103,
"step": 1117
},
{
"epoch": 0.30771347966696483,
"grad_norm": 0.39164869160745347,
"learning_rate": 3.172458477523514e-05,
"loss": 0.814,
"step": 1118
},
{
"epoch": 0.30798871533750777,
"grad_norm": 0.33960980330211993,
"learning_rate": 3.1710424872921126e-05,
"loss": 0.7888,
"step": 1119
},
{
"epoch": 0.30826395100805065,
"grad_norm": 0.36340957181666206,
"learning_rate": 3.1696256032758304e-05,
"loss": 0.8154,
"step": 1120
},
{
"epoch": 0.30853918667859354,
"grad_norm": 0.3253130095574143,
"learning_rate": 3.168207826556089e-05,
"loss": 0.8096,
"step": 1121
},
{
"epoch": 0.3088144223491365,
"grad_norm": 0.34444129036388177,
"learning_rate": 3.1667891582149886e-05,
"loss": 0.8281,
"step": 1122
},
{
"epoch": 0.30908965801967936,
"grad_norm": 0.34975256000382554,
"learning_rate": 3.165369599335312e-05,
"loss": 0.8155,
"step": 1123
},
{
"epoch": 0.30936489369022224,
"grad_norm": 0.3793158911943503,
"learning_rate": 3.163949151000522e-05,
"loss": 0.8448,
"step": 1124
},
{
"epoch": 0.3096401293607652,
"grad_norm": 0.41887201437302796,
"learning_rate": 3.162527814294761e-05,
"loss": 0.8345,
"step": 1125
},
{
"epoch": 0.30991536503130807,
"grad_norm": 0.3303807453135232,
"learning_rate": 3.161105590302845e-05,
"loss": 0.8057,
"step": 1126
},
{
"epoch": 0.31019060070185095,
"grad_norm": 0.4215554810706794,
"learning_rate": 3.159682480110273e-05,
"loss": 0.8199,
"step": 1127
},
{
"epoch": 0.3104658363723939,
"grad_norm": 0.3410676829644901,
"learning_rate": 3.158258484803216e-05,
"loss": 0.7984,
"step": 1128
},
{
"epoch": 0.3107410720429368,
"grad_norm": 0.3154713819491548,
"learning_rate": 3.156833605468523e-05,
"loss": 0.7947,
"step": 1129
},
{
"epoch": 0.31101630771347966,
"grad_norm": 0.3357186700692159,
"learning_rate": 3.1554078431937184e-05,
"loss": 0.7811,
"step": 1130
},
{
"epoch": 0.3112915433840226,
"grad_norm": 0.2964859029925219,
"learning_rate": 3.153981199066996e-05,
"loss": 0.8289,
"step": 1131
},
{
"epoch": 0.3115667790545655,
"grad_norm": 0.32597761114967777,
"learning_rate": 3.152553674177227e-05,
"loss": 0.8222,
"step": 1132
},
{
"epoch": 0.31184201472510836,
"grad_norm": 0.3353452009647141,
"learning_rate": 3.151125269613955e-05,
"loss": 0.7971,
"step": 1133
},
{
"epoch": 0.3121172503956513,
"grad_norm": 0.32858560039767065,
"learning_rate": 3.1496959864673914e-05,
"loss": 0.8003,
"step": 1134
},
{
"epoch": 0.3123924860661942,
"grad_norm": 0.3157082619075387,
"learning_rate": 3.148265825828422e-05,
"loss": 0.8215,
"step": 1135
},
{
"epoch": 0.31266772173673707,
"grad_norm": 0.3101601481876492,
"learning_rate": 3.1468347887886004e-05,
"loss": 0.8126,
"step": 1136
},
{
"epoch": 0.31294295740728,
"grad_norm": 0.3933483214188846,
"learning_rate": 3.145402876440148e-05,
"loss": 0.7987,
"step": 1137
},
{
"epoch": 0.3132181930778229,
"grad_norm": 0.31823140928731836,
"learning_rate": 3.1439700898759565e-05,
"loss": 0.8061,
"step": 1138
},
{
"epoch": 0.3134934287483658,
"grad_norm": 0.305547483255719,
"learning_rate": 3.142536430189585e-05,
"loss": 0.7949,
"step": 1139
},
{
"epoch": 0.3137686644189087,
"grad_norm": 0.34424414664507813,
"learning_rate": 3.141101898475257e-05,
"loss": 0.8018,
"step": 1140
},
{
"epoch": 0.3140439000894516,
"grad_norm": 0.360493950749892,
"learning_rate": 3.1396664958278614e-05,
"loss": 0.8444,
"step": 1141
},
{
"epoch": 0.3143191357599945,
"grad_norm": 0.3105624439412429,
"learning_rate": 3.138230223342955e-05,
"loss": 0.7923,
"step": 1142
},
{
"epoch": 0.3145943714305374,
"grad_norm": 0.33707412890930577,
"learning_rate": 3.136793082116756e-05,
"loss": 0.8507,
"step": 1143
},
{
"epoch": 0.3148696071010803,
"grad_norm": 0.3196651333111948,
"learning_rate": 3.135355073246146e-05,
"loss": 0.8353,
"step": 1144
},
{
"epoch": 0.3151448427716232,
"grad_norm": 0.30958632526199326,
"learning_rate": 3.133916197828668e-05,
"loss": 0.8093,
"step": 1145
},
{
"epoch": 0.31542007844216613,
"grad_norm": 0.3480763284379594,
"learning_rate": 3.132476456962528e-05,
"loss": 0.8423,
"step": 1146
},
{
"epoch": 0.315695314112709,
"grad_norm": 0.33953480338161784,
"learning_rate": 3.131035851746592e-05,
"loss": 0.8248,
"step": 1147
},
{
"epoch": 0.3159705497832519,
"grad_norm": 0.3453385914088088,
"learning_rate": 3.129594383280386e-05,
"loss": 0.7956,
"step": 1148
},
{
"epoch": 0.31624578545379484,
"grad_norm": 0.3771287825833907,
"learning_rate": 3.1281520526640936e-05,
"loss": 0.8335,
"step": 1149
},
{
"epoch": 0.3165210211243377,
"grad_norm": 0.3368884133886741,
"learning_rate": 3.126708860998557e-05,
"loss": 0.818,
"step": 1150
},
{
"epoch": 0.3167962567948806,
"grad_norm": 0.3303278124290784,
"learning_rate": 3.125264809385278e-05,
"loss": 0.8042,
"step": 1151
},
{
"epoch": 0.31707149246542354,
"grad_norm": 0.4145796934735968,
"learning_rate": 3.1238198989264094e-05,
"loss": 0.8208,
"step": 1152
},
{
"epoch": 0.3173467281359664,
"grad_norm": 0.35755342074383434,
"learning_rate": 3.122374130724765e-05,
"loss": 0.8246,
"step": 1153
},
{
"epoch": 0.3176219638065093,
"grad_norm": 0.34738534531203147,
"learning_rate": 3.1209275058838105e-05,
"loss": 0.8167,
"step": 1154
},
{
"epoch": 0.31789719947705225,
"grad_norm": 0.3211440831630971,
"learning_rate": 3.119480025507665e-05,
"loss": 0.8181,
"step": 1155
},
{
"epoch": 0.31817243514759513,
"grad_norm": 0.3756600592518461,
"learning_rate": 3.1180316907011026e-05,
"loss": 0.8246,
"step": 1156
},
{
"epoch": 0.318447670818138,
"grad_norm": 0.3717852513165001,
"learning_rate": 3.1165825025695484e-05,
"loss": 0.8155,
"step": 1157
},
{
"epoch": 0.31872290648868096,
"grad_norm": 0.3411660067212517,
"learning_rate": 3.1151324622190776e-05,
"loss": 0.8365,
"step": 1158
},
{
"epoch": 0.31899814215922384,
"grad_norm": 0.35723796605780694,
"learning_rate": 3.113681570756417e-05,
"loss": 0.8077,
"step": 1159
},
{
"epoch": 0.3192733778297667,
"grad_norm": 0.37898335129664723,
"learning_rate": 3.112229829288946e-05,
"loss": 0.8076,
"step": 1160
},
{
"epoch": 0.31954861350030966,
"grad_norm": 0.3842451487822477,
"learning_rate": 3.110777238924685e-05,
"loss": 0.8018,
"step": 1161
},
{
"epoch": 0.31982384917085255,
"grad_norm": 0.34774442510600123,
"learning_rate": 3.109323800772312e-05,
"loss": 0.8287,
"step": 1162
},
{
"epoch": 0.32009908484139543,
"grad_norm": 0.321811757823895,
"learning_rate": 3.1078695159411435e-05,
"loss": 0.7819,
"step": 1163
},
{
"epoch": 0.32037432051193837,
"grad_norm": 0.31062206597350395,
"learning_rate": 3.106414385541147e-05,
"loss": 0.7771,
"step": 1164
},
{
"epoch": 0.32064955618248125,
"grad_norm": 0.3462157661062194,
"learning_rate": 3.104958410682935e-05,
"loss": 0.8109,
"step": 1165
},
{
"epoch": 0.32092479185302414,
"grad_norm": 0.3225870300013751,
"learning_rate": 3.1035015924777634e-05,
"loss": 0.8416,
"step": 1166
},
{
"epoch": 0.3212000275235671,
"grad_norm": 0.2908050643911587,
"learning_rate": 3.102043932037532e-05,
"loss": 0.8122,
"step": 1167
},
{
"epoch": 0.32147526319410996,
"grad_norm": 0.32006337424346826,
"learning_rate": 3.1005854304747826e-05,
"loss": 0.852,
"step": 1168
},
{
"epoch": 0.32175049886465285,
"grad_norm": 0.32418150253371214,
"learning_rate": 3.0991260889027025e-05,
"loss": 0.7922,
"step": 1169
},
{
"epoch": 0.3220257345351958,
"grad_norm": 0.3270598384344112,
"learning_rate": 3.097665908435115e-05,
"loss": 0.7983,
"step": 1170
},
{
"epoch": 0.32230097020573867,
"grad_norm": 0.3285607660817388,
"learning_rate": 3.096204890186488e-05,
"loss": 0.8012,
"step": 1171
},
{
"epoch": 0.32257620587628155,
"grad_norm": 0.3270770971277959,
"learning_rate": 3.0947430352719254e-05,
"loss": 0.8058,
"step": 1172
},
{
"epoch": 0.3228514415468245,
"grad_norm": 0.30266960130770704,
"learning_rate": 3.0932803448071726e-05,
"loss": 0.7792,
"step": 1173
},
{
"epoch": 0.3231266772173674,
"grad_norm": 0.29525543430326306,
"learning_rate": 3.091816819908611e-05,
"loss": 0.8084,
"step": 1174
},
{
"epoch": 0.32340191288791026,
"grad_norm": 0.2949338073321175,
"learning_rate": 3.0903524616932604e-05,
"loss": 0.8111,
"step": 1175
},
{
"epoch": 0.3236771485584532,
"grad_norm": 2.409665762699941,
"learning_rate": 3.0888872712787744e-05,
"loss": 0.8098,
"step": 1176
},
{
"epoch": 0.3239523842289961,
"grad_norm": 0.34556104275883914,
"learning_rate": 3.0874212497834436e-05,
"loss": 0.7965,
"step": 1177
},
{
"epoch": 0.32422761989953897,
"grad_norm": 0.33400503870094994,
"learning_rate": 3.0859543983261916e-05,
"loss": 0.8097,
"step": 1178
},
{
"epoch": 0.3245028555700819,
"grad_norm": 0.3159377875922141,
"learning_rate": 3.0844867180265765e-05,
"loss": 0.8028,
"step": 1179
},
{
"epoch": 0.3247780912406248,
"grad_norm": 0.34527808475244703,
"learning_rate": 3.083018210004789e-05,
"loss": 0.7971,
"step": 1180
},
{
"epoch": 0.3250533269111677,
"grad_norm": 0.3571175987053503,
"learning_rate": 3.08154887538165e-05,
"loss": 0.7921,
"step": 1181
},
{
"epoch": 0.3253285625817106,
"grad_norm": 0.3430153890288714,
"learning_rate": 3.080078715278614e-05,
"loss": 0.7938,
"step": 1182
},
{
"epoch": 0.3256037982522535,
"grad_norm": 0.317884097887881,
"learning_rate": 3.078607730817763e-05,
"loss": 0.7941,
"step": 1183
},
{
"epoch": 0.3258790339227964,
"grad_norm": 0.33993810766485266,
"learning_rate": 3.077135923121809e-05,
"loss": 0.8235,
"step": 1184
},
{
"epoch": 0.3261542695933393,
"grad_norm": 0.34211246205653845,
"learning_rate": 3.075663293314093e-05,
"loss": 0.86,
"step": 1185
},
{
"epoch": 0.3264295052638822,
"grad_norm": 0.35413291917176803,
"learning_rate": 3.074189842518584e-05,
"loss": 0.7843,
"step": 1186
},
{
"epoch": 0.3267047409344251,
"grad_norm": 0.3586716027990718,
"learning_rate": 3.072715571859874e-05,
"loss": 0.7954,
"step": 1187
},
{
"epoch": 0.326979976604968,
"grad_norm": 0.32160005633334726,
"learning_rate": 3.071240482463186e-05,
"loss": 0.7991,
"step": 1188
},
{
"epoch": 0.3272552122755109,
"grad_norm": 0.36521159572448836,
"learning_rate": 3.0697645754543636e-05,
"loss": 0.8058,
"step": 1189
},
{
"epoch": 0.3275304479460538,
"grad_norm": 0.331018434067738,
"learning_rate": 3.068287851959877e-05,
"loss": 0.8261,
"step": 1190
},
{
"epoch": 0.32780568361659673,
"grad_norm": 0.37362418936014585,
"learning_rate": 3.066810313106818e-05,
"loss": 0.8238,
"step": 1191
},
{
"epoch": 0.3280809192871396,
"grad_norm": 0.40993410650019435,
"learning_rate": 3.0653319600229e-05,
"loss": 0.8012,
"step": 1192
},
{
"epoch": 0.3283561549576825,
"grad_norm": 0.3378351823805291,
"learning_rate": 3.063852793836462e-05,
"loss": 0.8327,
"step": 1193
},
{
"epoch": 0.32863139062822544,
"grad_norm": 0.3673208048610507,
"learning_rate": 3.062372815676461e-05,
"loss": 0.8315,
"step": 1194
},
{
"epoch": 0.3289066262987683,
"grad_norm": 0.3884739790830478,
"learning_rate": 3.06089202667247e-05,
"loss": 0.7938,
"step": 1195
},
{
"epoch": 0.3291818619693112,
"grad_norm": 0.34186931524257896,
"learning_rate": 3.059410427954687e-05,
"loss": 0.7876,
"step": 1196
},
{
"epoch": 0.32945709763985415,
"grad_norm": 0.32303616322219825,
"learning_rate": 3.057928020653925e-05,
"loss": 0.8208,
"step": 1197
},
{
"epoch": 0.32973233331039703,
"grad_norm": 0.35967963521533275,
"learning_rate": 3.056444805901615e-05,
"loss": 0.8186,
"step": 1198
},
{
"epoch": 0.3300075689809399,
"grad_norm": 0.3594452352052698,
"learning_rate": 3.0549607848298024e-05,
"loss": 0.8048,
"step": 1199
},
{
"epoch": 0.33028280465148285,
"grad_norm": 0.32815987567995447,
"learning_rate": 3.0534759585711505e-05,
"loss": 0.8301,
"step": 1200
},
{
"epoch": 0.33055804032202574,
"grad_norm": 0.3323415741018949,
"learning_rate": 3.0519903282589355e-05,
"loss": 0.8312,
"step": 1201
},
{
"epoch": 0.3308332759925686,
"grad_norm": 0.3596182289098799,
"learning_rate": 3.0505038950270482e-05,
"loss": 0.815,
"step": 1202
},
{
"epoch": 0.33110851166311156,
"grad_norm": 0.3602247707048881,
"learning_rate": 3.049016660009992e-05,
"loss": 0.7734,
"step": 1203
},
{
"epoch": 0.33138374733365444,
"grad_norm": 0.31273028302021144,
"learning_rate": 3.0475286243428824e-05,
"loss": 0.8322,
"step": 1204
},
{
"epoch": 0.3316589830041973,
"grad_norm": 0.36781142044735826,
"learning_rate": 3.0460397891614452e-05,
"loss": 0.8127,
"step": 1205
},
{
"epoch": 0.33193421867474027,
"grad_norm": 0.39408172254577667,
"learning_rate": 3.044550155602017e-05,
"loss": 0.8256,
"step": 1206
},
{
"epoch": 0.33220945434528315,
"grad_norm": 0.3139660280815443,
"learning_rate": 3.043059724801544e-05,
"loss": 0.7946,
"step": 1207
},
{
"epoch": 0.33248469001582603,
"grad_norm": 0.314800186230482,
"learning_rate": 3.0415684978975802e-05,
"loss": 0.8146,
"step": 1208
},
{
"epoch": 0.332759925686369,
"grad_norm": 0.30046207121652113,
"learning_rate": 3.0400764760282872e-05,
"loss": 0.8208,
"step": 1209
},
{
"epoch": 0.33303516135691186,
"grad_norm": 0.3533154546351518,
"learning_rate": 3.0385836603324348e-05,
"loss": 0.8022,
"step": 1210
},
{
"epoch": 0.33331039702745474,
"grad_norm": 0.3108939167879381,
"learning_rate": 3.037090051949397e-05,
"loss": 0.7982,
"step": 1211
},
{
"epoch": 0.3335856326979977,
"grad_norm": 0.307651020086293,
"learning_rate": 3.0355956520191544e-05,
"loss": 0.8243,
"step": 1212
},
{
"epoch": 0.33386086836854056,
"grad_norm": 0.32257166738075094,
"learning_rate": 3.0341004616822888e-05,
"loss": 0.82,
"step": 1213
},
{
"epoch": 0.33413610403908345,
"grad_norm": 0.3251066300297621,
"learning_rate": 3.0326044820799887e-05,
"loss": 0.8236,
"step": 1214
},
{
"epoch": 0.3344113397096264,
"grad_norm": 0.37688942249238155,
"learning_rate": 3.031107714354044e-05,
"loss": 0.8055,
"step": 1215
},
{
"epoch": 0.33468657538016927,
"grad_norm": 0.34475380933433564,
"learning_rate": 3.0296101596468444e-05,
"loss": 0.8088,
"step": 1216
},
{
"epoch": 0.33496181105071215,
"grad_norm": 0.332618427949028,
"learning_rate": 3.0281118191013817e-05,
"loss": 0.7932,
"step": 1217
},
{
"epoch": 0.3352370467212551,
"grad_norm": 0.35575743631868867,
"learning_rate": 3.026612693861248e-05,
"loss": 0.7902,
"step": 1218
},
{
"epoch": 0.335512282391798,
"grad_norm": 0.31644648344381754,
"learning_rate": 3.0251127850706332e-05,
"loss": 0.8479,
"step": 1219
},
{
"epoch": 0.33578751806234086,
"grad_norm": 0.3267765317990865,
"learning_rate": 3.0236120938743256e-05,
"loss": 0.8139,
"step": 1220
},
{
"epoch": 0.3360627537328838,
"grad_norm": 0.3335729983083016,
"learning_rate": 3.022110621417711e-05,
"loss": 0.8171,
"step": 1221
},
{
"epoch": 0.3363379894034267,
"grad_norm": 0.3071777559635056,
"learning_rate": 3.0206083688467714e-05,
"loss": 0.8428,
"step": 1222
},
{
"epoch": 0.33661322507396957,
"grad_norm": 0.31898196862856226,
"learning_rate": 3.0191053373080836e-05,
"loss": 0.7964,
"step": 1223
},
{
"epoch": 0.3368884607445125,
"grad_norm": 0.3322197682551663,
"learning_rate": 3.0176015279488192e-05,
"loss": 0.824,
"step": 1224
},
{
"epoch": 0.3371636964150554,
"grad_norm": 0.3106140921777173,
"learning_rate": 3.016096941916743e-05,
"loss": 0.8096,
"step": 1225
},
{
"epoch": 0.3374389320855983,
"grad_norm": 0.3217879380809505,
"learning_rate": 3.014591580360215e-05,
"loss": 0.7939,
"step": 1226
},
{
"epoch": 0.3377141677561412,
"grad_norm": 0.34349030381089224,
"learning_rate": 3.0130854444281836e-05,
"loss": 0.8313,
"step": 1227
},
{
"epoch": 0.3379894034266841,
"grad_norm": 0.31334713881153914,
"learning_rate": 3.011578535270192e-05,
"loss": 0.7933,
"step": 1228
},
{
"epoch": 0.338264639097227,
"grad_norm": 0.322590390034193,
"learning_rate": 3.0100708540363693e-05,
"loss": 0.7951,
"step": 1229
},
{
"epoch": 0.3385398747677699,
"grad_norm": 0.3220268508502108,
"learning_rate": 3.0085624018774368e-05,
"loss": 0.8019,
"step": 1230
},
{
"epoch": 0.3388151104383128,
"grad_norm": 0.33786748372948533,
"learning_rate": 3.0070531799447037e-05,
"loss": 0.7967,
"step": 1231
},
{
"epoch": 0.3390903461088557,
"grad_norm": 0.3232810238187581,
"learning_rate": 3.0055431893900668e-05,
"loss": 0.7889,
"step": 1232
},
{
"epoch": 0.33936558177939863,
"grad_norm": 0.31511447249732616,
"learning_rate": 3.0040324313660095e-05,
"loss": 0.819,
"step": 1233
},
{
"epoch": 0.3396408174499415,
"grad_norm": 0.32508515061520127,
"learning_rate": 3.002520907025599e-05,
"loss": 0.8422,
"step": 1234
},
{
"epoch": 0.3399160531204844,
"grad_norm": 0.3367511206875014,
"learning_rate": 3.0010086175224904e-05,
"loss": 0.8127,
"step": 1235
},
{
"epoch": 0.34019128879102734,
"grad_norm": 0.3197635030197052,
"learning_rate": 2.9994955640109212e-05,
"loss": 0.8557,
"step": 1236
},
{
"epoch": 0.3404665244615702,
"grad_norm": 0.3329731739747747,
"learning_rate": 2.9979817476457134e-05,
"loss": 0.8161,
"step": 1237
},
{
"epoch": 0.3407417601321131,
"grad_norm": 0.3805828506629627,
"learning_rate": 2.996467169582268e-05,
"loss": 0.8104,
"step": 1238
},
{
"epoch": 0.34101699580265604,
"grad_norm": 0.29477204305043675,
"learning_rate": 2.9949518309765716e-05,
"loss": 0.8476,
"step": 1239
},
{
"epoch": 0.3412922314731989,
"grad_norm": 0.3298880720437482,
"learning_rate": 2.9934357329851873e-05,
"loss": 0.8129,
"step": 1240
},
{
"epoch": 0.3415674671437418,
"grad_norm": 0.2971338055346614,
"learning_rate": 2.9919188767652615e-05,
"loss": 0.8022,
"step": 1241
},
{
"epoch": 0.34184270281428475,
"grad_norm": 0.38020295401976684,
"learning_rate": 2.9904012634745155e-05,
"loss": 0.8616,
"step": 1242
},
{
"epoch": 0.34211793848482763,
"grad_norm": 0.33471548909158555,
"learning_rate": 2.9888828942712526e-05,
"loss": 0.796,
"step": 1243
},
{
"epoch": 0.3423931741553705,
"grad_norm": 0.3140747511717884,
"learning_rate": 2.9873637703143496e-05,
"loss": 0.8197,
"step": 1244
},
{
"epoch": 0.34266840982591346,
"grad_norm": 0.34789922438101994,
"learning_rate": 2.9858438927632604e-05,
"loss": 0.8057,
"step": 1245
},
{
"epoch": 0.34294364549645634,
"grad_norm": 0.3500323011235929,
"learning_rate": 2.9843232627780146e-05,
"loss": 0.8288,
"step": 1246
},
{
"epoch": 0.3432188811669992,
"grad_norm": 0.3518209494114577,
"learning_rate": 2.9828018815192165e-05,
"loss": 0.8365,
"step": 1247
},
{
"epoch": 0.34349411683754216,
"grad_norm": 0.369605092665537,
"learning_rate": 2.981279750148042e-05,
"loss": 0.8176,
"step": 1248
},
{
"epoch": 0.34376935250808505,
"grad_norm": 0.3435550533667174,
"learning_rate": 2.9797568698262408e-05,
"loss": 0.8077,
"step": 1249
},
{
"epoch": 0.34404458817862793,
"grad_norm": 0.32694296796070815,
"learning_rate": 2.9782332417161347e-05,
"loss": 0.7941,
"step": 1250
},
{
"epoch": 0.34431982384917087,
"grad_norm": 0.3236069373962924,
"learning_rate": 2.9767088669806145e-05,
"loss": 0.7937,
"step": 1251
},
{
"epoch": 0.34459505951971375,
"grad_norm": 0.31080488888524316,
"learning_rate": 2.9751837467831425e-05,
"loss": 0.7979,
"step": 1252
},
{
"epoch": 0.34487029519025664,
"grad_norm": 0.5999440469186542,
"learning_rate": 2.9736578822877494e-05,
"loss": 0.794,
"step": 1253
},
{
"epoch": 0.3451455308607996,
"grad_norm": 0.35854535780548275,
"learning_rate": 2.9721312746590346e-05,
"loss": 0.7946,
"step": 1254
},
{
"epoch": 0.34542076653134246,
"grad_norm": 0.36589362982369955,
"learning_rate": 2.9706039250621626e-05,
"loss": 0.7959,
"step": 1255
},
{
"epoch": 0.34569600220188534,
"grad_norm": 0.2922692314970786,
"learning_rate": 2.9690758346628663e-05,
"loss": 0.8008,
"step": 1256
},
{
"epoch": 0.3459712378724283,
"grad_norm": 0.3478361453792954,
"learning_rate": 2.9675470046274432e-05,
"loss": 0.8221,
"step": 1257
},
{
"epoch": 0.34624647354297117,
"grad_norm": 0.3756605850220491,
"learning_rate": 2.966017436122756e-05,
"loss": 0.8077,
"step": 1258
},
{
"epoch": 0.34652170921351405,
"grad_norm": 0.3383026121017178,
"learning_rate": 2.9644871303162303e-05,
"loss": 0.7974,
"step": 1259
},
{
"epoch": 0.346796944884057,
"grad_norm": 0.3072858247518557,
"learning_rate": 2.9629560883758547e-05,
"loss": 0.7879,
"step": 1260
},
{
"epoch": 0.3470721805545999,
"grad_norm": 0.34912142348807734,
"learning_rate": 2.9614243114701793e-05,
"loss": 0.8135,
"step": 1261
},
{
"epoch": 0.34734741622514276,
"grad_norm": 0.37479003634737135,
"learning_rate": 2.959891800768315e-05,
"loss": 0.7965,
"step": 1262
},
{
"epoch": 0.3476226518956857,
"grad_norm": 0.28982446000602097,
"learning_rate": 2.9583585574399335e-05,
"loss": 0.8059,
"step": 1263
},
{
"epoch": 0.3478978875662286,
"grad_norm": 0.3469104066143551,
"learning_rate": 2.9568245826552662e-05,
"loss": 0.7957,
"step": 1264
},
{
"epoch": 0.34817312323677146,
"grad_norm": 0.35034756184161747,
"learning_rate": 2.9552898775851013e-05,
"loss": 0.7733,
"step": 1265
},
{
"epoch": 0.3484483589073144,
"grad_norm": 0.3216610530824517,
"learning_rate": 2.9537544434007844e-05,
"loss": 0.7871,
"step": 1266
},
{
"epoch": 0.3487235945778573,
"grad_norm": 0.34038548926716644,
"learning_rate": 2.9522182812742195e-05,
"loss": 0.8159,
"step": 1267
},
{
"epoch": 0.34899883024840017,
"grad_norm": 0.3129022395956265,
"learning_rate": 2.9506813923778637e-05,
"loss": 0.8493,
"step": 1268
},
{
"epoch": 0.3492740659189431,
"grad_norm": 0.35733246998396123,
"learning_rate": 2.9491437778847305e-05,
"loss": 0.7921,
"step": 1269
},
{
"epoch": 0.349549301589486,
"grad_norm": 0.4164933167458972,
"learning_rate": 2.9476054389683865e-05,
"loss": 0.8324,
"step": 1270
},
{
"epoch": 0.3498245372600289,
"grad_norm": 0.31619142772383657,
"learning_rate": 2.9460663768029523e-05,
"loss": 0.7869,
"step": 1271
},
{
"epoch": 0.3500997729305718,
"grad_norm": 0.3978250102523207,
"learning_rate": 2.944526592563099e-05,
"loss": 0.7979,
"step": 1272
},
{
"epoch": 0.3503750086011147,
"grad_norm": 0.38105837046872004,
"learning_rate": 2.9429860874240487e-05,
"loss": 0.8504,
"step": 1273
},
{
"epoch": 0.3506502442716576,
"grad_norm": 0.451797702235112,
"learning_rate": 2.941444862561575e-05,
"loss": 0.8174,
"step": 1274
},
{
"epoch": 0.3509254799422005,
"grad_norm": 0.45694558929714885,
"learning_rate": 2.939902919152001e-05,
"loss": 0.8196,
"step": 1275
},
{
"epoch": 0.3512007156127434,
"grad_norm": 0.38279750572095345,
"learning_rate": 2.938360258372197e-05,
"loss": 0.8099,
"step": 1276
},
{
"epoch": 0.3514759512832863,
"grad_norm": 0.3679145053121925,
"learning_rate": 2.9368168813995806e-05,
"loss": 0.8013,
"step": 1277
},
{
"epoch": 0.35175118695382923,
"grad_norm": 0.4157338308427359,
"learning_rate": 2.9352727894121177e-05,
"loss": 0.8227,
"step": 1278
},
{
"epoch": 0.3520264226243721,
"grad_norm": 0.38644287527604376,
"learning_rate": 2.9337279835883182e-05,
"loss": 0.8048,
"step": 1279
},
{
"epoch": 0.352301658294915,
"grad_norm": 0.38957270430092533,
"learning_rate": 2.9321824651072387e-05,
"loss": 0.7748,
"step": 1280
},
{
"epoch": 0.35257689396545794,
"grad_norm": 0.3643444602815919,
"learning_rate": 2.9306362351484775e-05,
"loss": 0.8333,
"step": 1281
},
{
"epoch": 0.3528521296360008,
"grad_norm": 0.3937275576769181,
"learning_rate": 2.9290892948921784e-05,
"loss": 0.7821,
"step": 1282
},
{
"epoch": 0.3531273653065437,
"grad_norm": 0.4223826002667787,
"learning_rate": 2.927541645519024e-05,
"loss": 0.7973,
"step": 1283
},
{
"epoch": 0.35340260097708664,
"grad_norm": 0.3636893447079271,
"learning_rate": 2.9259932882102417e-05,
"loss": 0.8181,
"step": 1284
},
{
"epoch": 0.35367783664762953,
"grad_norm": 0.41412924101092213,
"learning_rate": 2.924444224147597e-05,
"loss": 0.8136,
"step": 1285
},
{
"epoch": 0.3539530723181724,
"grad_norm": 0.36713321160115875,
"learning_rate": 2.9228944545133963e-05,
"loss": 0.8078,
"step": 1286
},
{
"epoch": 0.35422830798871535,
"grad_norm": 0.3414686821209659,
"learning_rate": 2.9213439804904826e-05,
"loss": 0.8066,
"step": 1287
},
{
"epoch": 0.35450354365925824,
"grad_norm": 0.3906568751906246,
"learning_rate": 2.9197928032622377e-05,
"loss": 0.7955,
"step": 1288
},
{
"epoch": 0.3547787793298011,
"grad_norm": 0.38845030866559555,
"learning_rate": 2.91824092401258e-05,
"loss": 0.8167,
"step": 1289
},
{
"epoch": 0.35505401500034406,
"grad_norm": 0.3802361599602513,
"learning_rate": 2.916688343925965e-05,
"loss": 0.8086,
"step": 1290
},
{
"epoch": 0.35532925067088694,
"grad_norm": 0.3688825519695599,
"learning_rate": 2.91513506418738e-05,
"loss": 0.8437,
"step": 1291
},
{
"epoch": 0.3556044863414298,
"grad_norm": 0.377317143026439,
"learning_rate": 2.913581085982349e-05,
"loss": 0.8203,
"step": 1292
},
{
"epoch": 0.35587972201197277,
"grad_norm": 0.36144713130927403,
"learning_rate": 2.912026410496929e-05,
"loss": 0.7908,
"step": 1293
},
{
"epoch": 0.35615495768251565,
"grad_norm": 0.3166623645159551,
"learning_rate": 2.910471038917707e-05,
"loss": 0.817,
"step": 1294
},
{
"epoch": 0.35643019335305853,
"grad_norm": 0.3659223836396834,
"learning_rate": 2.9089149724318026e-05,
"loss": 0.8106,
"step": 1295
},
{
"epoch": 0.3567054290236015,
"grad_norm": 0.34589936320896875,
"learning_rate": 2.9073582122268677e-05,
"loss": 0.8201,
"step": 1296
},
{
"epoch": 0.35698066469414436,
"grad_norm": 0.339484705019932,
"learning_rate": 2.9058007594910803e-05,
"loss": 0.8258,
"step": 1297
},
{
"epoch": 0.35725590036468724,
"grad_norm": 0.3230526376260753,
"learning_rate": 2.904242615413149e-05,
"loss": 0.8288,
"step": 1298
},
{
"epoch": 0.3575311360352302,
"grad_norm": 0.3177306538128517,
"learning_rate": 2.902683781182309e-05,
"loss": 0.823,
"step": 1299
},
{
"epoch": 0.35780637170577306,
"grad_norm": 0.3379774983757399,
"learning_rate": 2.9011242579883237e-05,
"loss": 0.8071,
"step": 1300
},
{
"epoch": 0.35808160737631595,
"grad_norm": 0.28028830011157246,
"learning_rate": 2.899564047021481e-05,
"loss": 0.7855,
"step": 1301
},
{
"epoch": 0.3583568430468589,
"grad_norm": 0.3160256126255414,
"learning_rate": 2.898003149472594e-05,
"loss": 0.8253,
"step": 1302
},
{
"epoch": 0.35863207871740177,
"grad_norm": 0.2812495085931235,
"learning_rate": 2.8964415665330005e-05,
"loss": 0.783,
"step": 1303
},
{
"epoch": 0.35890731438794465,
"grad_norm": 0.3304258342718014,
"learning_rate": 2.8948792993945612e-05,
"loss": 0.8093,
"step": 1304
},
{
"epoch": 0.3591825500584876,
"grad_norm": 0.3036447483986763,
"learning_rate": 2.893316349249658e-05,
"loss": 0.8194,
"step": 1305
},
{
"epoch": 0.3594577857290305,
"grad_norm": 0.3009727919107401,
"learning_rate": 2.891752717291195e-05,
"loss": 0.7908,
"step": 1306
},
{
"epoch": 0.35973302139957336,
"grad_norm": 0.36570132605373035,
"learning_rate": 2.8901884047125974e-05,
"loss": 0.8066,
"step": 1307
},
{
"epoch": 0.3600082570701163,
"grad_norm": 0.29539814063237785,
"learning_rate": 2.8886234127078077e-05,
"loss": 0.7843,
"step": 1308
},
{
"epoch": 0.3602834927406592,
"grad_norm": 0.3168307419027681,
"learning_rate": 2.8870577424712885e-05,
"loss": 0.8095,
"step": 1309
},
{
"epoch": 0.36055872841120207,
"grad_norm": 0.27458946917789645,
"learning_rate": 2.8854913951980214e-05,
"loss": 0.7595,
"step": 1310
},
{
"epoch": 0.360833964081745,
"grad_norm": 0.3425052587485172,
"learning_rate": 2.8839243720835007e-05,
"loss": 0.8023,
"step": 1311
},
{
"epoch": 0.3611091997522879,
"grad_norm": 0.47391896360520974,
"learning_rate": 2.8823566743237408e-05,
"loss": 0.8249,
"step": 1312
},
{
"epoch": 0.3613844354228308,
"grad_norm": 0.31557328858822054,
"learning_rate": 2.880788303115269e-05,
"loss": 0.8175,
"step": 1313
},
{
"epoch": 0.3616596710933737,
"grad_norm": 0.3369896790299706,
"learning_rate": 2.879219259655126e-05,
"loss": 0.8222,
"step": 1314
},
{
"epoch": 0.3619349067639166,
"grad_norm": 0.3386795554756731,
"learning_rate": 2.8776495451408677e-05,
"loss": 0.8229,
"step": 1315
},
{
"epoch": 0.3622101424344595,
"grad_norm": 0.3123954344701337,
"learning_rate": 2.8760791607705597e-05,
"loss": 0.8012,
"step": 1316
},
{
"epoch": 0.3624853781050024,
"grad_norm": 0.35211511951310254,
"learning_rate": 2.87450810774278e-05,
"loss": 0.8248,
"step": 1317
},
{
"epoch": 0.3627606137755453,
"grad_norm": 0.31329166669910147,
"learning_rate": 2.8729363872566178e-05,
"loss": 0.8139,
"step": 1318
},
{
"epoch": 0.3630358494460882,
"grad_norm": 0.34213219153214414,
"learning_rate": 2.8713640005116708e-05,
"loss": 0.8237,
"step": 1319
},
{
"epoch": 0.3633110851166311,
"grad_norm": 0.3264988397516051,
"learning_rate": 2.8697909487080445e-05,
"loss": 0.8155,
"step": 1320
},
{
"epoch": 0.363586320787174,
"grad_norm": 0.31116444914451613,
"learning_rate": 2.8682172330463536e-05,
"loss": 0.8031,
"step": 1321
},
{
"epoch": 0.3638615564577169,
"grad_norm": 0.31559275872970716,
"learning_rate": 2.8666428547277186e-05,
"loss": 0.8193,
"step": 1322
},
{
"epoch": 0.36413679212825983,
"grad_norm": 0.5090974545519873,
"learning_rate": 2.865067814953766e-05,
"loss": 0.8016,
"step": 1323
},
{
"epoch": 0.3644120277988027,
"grad_norm": 0.31499198831549413,
"learning_rate": 2.863492114926626e-05,
"loss": 0.7769,
"step": 1324
},
{
"epoch": 0.3646872634693456,
"grad_norm": 0.3064263553389884,
"learning_rate": 2.8619157558489355e-05,
"loss": 0.8053,
"step": 1325
},
{
"epoch": 0.36496249913988854,
"grad_norm": 0.3343110758386495,
"learning_rate": 2.8603387389238313e-05,
"loss": 0.8171,
"step": 1326
},
{
"epoch": 0.3652377348104314,
"grad_norm": 0.3193469070200966,
"learning_rate": 2.8587610653549536e-05,
"loss": 0.7842,
"step": 1327
},
{
"epoch": 0.3655129704809743,
"grad_norm": 0.3160529255063763,
"learning_rate": 2.8571827363464454e-05,
"loss": 0.7788,
"step": 1328
},
{
"epoch": 0.36578820615151725,
"grad_norm": 0.28750606319238275,
"learning_rate": 2.8556037531029468e-05,
"loss": 0.8211,
"step": 1329
},
{
"epoch": 0.36606344182206013,
"grad_norm": 0.3515067314610286,
"learning_rate": 2.854024116829599e-05,
"loss": 0.7957,
"step": 1330
},
{
"epoch": 0.366338677492603,
"grad_norm": 0.3377351526025328,
"learning_rate": 2.852443828732042e-05,
"loss": 0.8351,
"step": 1331
},
{
"epoch": 0.36661391316314595,
"grad_norm": 0.30706008799821316,
"learning_rate": 2.8508628900164122e-05,
"loss": 0.8064,
"step": 1332
},
{
"epoch": 0.36688914883368884,
"grad_norm": 0.3132163933785639,
"learning_rate": 2.849281301889344e-05,
"loss": 0.7672,
"step": 1333
},
{
"epoch": 0.3671643845042317,
"grad_norm": 0.30601176741209374,
"learning_rate": 2.847699065557966e-05,
"loss": 0.7908,
"step": 1334
},
{
"epoch": 0.36743962017477466,
"grad_norm": 0.3184427099143359,
"learning_rate": 2.846116182229902e-05,
"loss": 0.8145,
"step": 1335
},
{
"epoch": 0.36771485584531755,
"grad_norm": 0.30693411282540556,
"learning_rate": 2.84453265311327e-05,
"loss": 0.8238,
"step": 1336
},
{
"epoch": 0.36799009151586043,
"grad_norm": 0.2985348513413893,
"learning_rate": 2.8429484794166798e-05,
"loss": 0.7928,
"step": 1337
},
{
"epoch": 0.36826532718640337,
"grad_norm": 0.31666817103569384,
"learning_rate": 2.841363662349235e-05,
"loss": 0.7872,
"step": 1338
},
{
"epoch": 0.36854056285694625,
"grad_norm": 0.3322566974582257,
"learning_rate": 2.8397782031205295e-05,
"loss": 0.8004,
"step": 1339
},
{
"epoch": 0.36881579852748914,
"grad_norm": 0.3009981595090159,
"learning_rate": 2.8381921029406464e-05,
"loss": 0.8346,
"step": 1340
},
{
"epoch": 0.3690910341980321,
"grad_norm": 0.32567627898081886,
"learning_rate": 2.8366053630201577e-05,
"loss": 0.8052,
"step": 1341
},
{
"epoch": 0.36936626986857496,
"grad_norm": 0.3429285129950298,
"learning_rate": 2.8350179845701267e-05,
"loss": 0.7973,
"step": 1342
},
{
"epoch": 0.36964150553911784,
"grad_norm": 0.32101941136515527,
"learning_rate": 2.8334299688021002e-05,
"loss": 0.7935,
"step": 1343
},
{
"epoch": 0.3699167412096608,
"grad_norm": 0.32910331378716223,
"learning_rate": 2.8318413169281146e-05,
"loss": 0.8145,
"step": 1344
},
{
"epoch": 0.37019197688020367,
"grad_norm": 0.3326953554224791,
"learning_rate": 2.830252030160689e-05,
"loss": 0.7849,
"step": 1345
},
{
"epoch": 0.37046721255074655,
"grad_norm": 0.3236119360588396,
"learning_rate": 2.8286621097128298e-05,
"loss": 0.8243,
"step": 1346
},
{
"epoch": 0.3707424482212895,
"grad_norm": 0.3568165246303928,
"learning_rate": 2.8270715567980248e-05,
"loss": 0.8101,
"step": 1347
},
{
"epoch": 0.3710176838918324,
"grad_norm": 0.3867713763535159,
"learning_rate": 2.825480372630246e-05,
"loss": 0.8066,
"step": 1348
},
{
"epoch": 0.37129291956237526,
"grad_norm": 0.40846663591430193,
"learning_rate": 2.8238885584239458e-05,
"loss": 0.8294,
"step": 1349
},
{
"epoch": 0.3715681552329182,
"grad_norm": 0.3200132627526865,
"learning_rate": 2.8222961153940595e-05,
"loss": 0.7819,
"step": 1350
},
{
"epoch": 0.3718433909034611,
"grad_norm": 0.3602760939053267,
"learning_rate": 2.8207030447560003e-05,
"loss": 0.7826,
"step": 1351
},
{
"epoch": 0.37211862657400396,
"grad_norm": 0.369247157604994,
"learning_rate": 2.819109347725662e-05,
"loss": 0.8268,
"step": 1352
},
{
"epoch": 0.3723938622445469,
"grad_norm": 0.3170393651874839,
"learning_rate": 2.817515025519415e-05,
"loss": 0.7882,
"step": 1353
},
{
"epoch": 0.3726690979150898,
"grad_norm": 0.3326519408046655,
"learning_rate": 2.8159200793541078e-05,
"loss": 0.768,
"step": 1354
},
{
"epoch": 0.3729443335856327,
"grad_norm": 0.34434308946721537,
"learning_rate": 2.8143245104470653e-05,
"loss": 0.7953,
"step": 1355
},
{
"epoch": 0.3732195692561756,
"grad_norm": 0.3623225721725796,
"learning_rate": 2.812728320016087e-05,
"loss": 0.8252,
"step": 1356
},
{
"epoch": 0.3734948049267185,
"grad_norm": 0.315199315152692,
"learning_rate": 2.811131509279448e-05,
"loss": 0.7848,
"step": 1357
},
{
"epoch": 0.37377004059726143,
"grad_norm": 0.36338809342252487,
"learning_rate": 2.8095340794558946e-05,
"loss": 0.7896,
"step": 1358
},
{
"epoch": 0.3740452762678043,
"grad_norm": 0.33988187467434927,
"learning_rate": 2.8079360317646474e-05,
"loss": 0.812,
"step": 1359
},
{
"epoch": 0.3743205119383472,
"grad_norm": 0.2857496207925808,
"learning_rate": 2.8063373674253983e-05,
"loss": 0.7922,
"step": 1360
},
{
"epoch": 0.37459574760889014,
"grad_norm": 0.3330778243852057,
"learning_rate": 2.8047380876583105e-05,
"loss": 0.8094,
"step": 1361
},
{
"epoch": 0.374870983279433,
"grad_norm": 0.3244271784314232,
"learning_rate": 2.8031381936840153e-05,
"loss": 0.8078,
"step": 1362
},
{
"epoch": 0.3751462189499759,
"grad_norm": 0.3250873656533473,
"learning_rate": 2.801537686723613e-05,
"loss": 0.8411,
"step": 1363
},
{
"epoch": 0.37542145462051885,
"grad_norm": 0.34791160182523184,
"learning_rate": 2.7999365679986733e-05,
"loss": 0.8581,
"step": 1364
},
{
"epoch": 0.37569669029106173,
"grad_norm": 0.30304224891942977,
"learning_rate": 2.798334838731232e-05,
"loss": 0.8043,
"step": 1365
},
{
"epoch": 0.3759719259616046,
"grad_norm": 0.3257738410421442,
"learning_rate": 2.79673250014379e-05,
"loss": 0.8315,
"step": 1366
},
{
"epoch": 0.37624716163214755,
"grad_norm": 0.31452388703585527,
"learning_rate": 2.795129553459315e-05,
"loss": 0.8372,
"step": 1367
},
{
"epoch": 0.37652239730269044,
"grad_norm": 0.3269610810342665,
"learning_rate": 2.793525999901237e-05,
"loss": 0.8201,
"step": 1368
},
{
"epoch": 0.3767976329732333,
"grad_norm": 0.3293112124741156,
"learning_rate": 2.79192184069345e-05,
"loss": 0.8111,
"step": 1369
},
{
"epoch": 0.37707286864377626,
"grad_norm": 0.3190169318875583,
"learning_rate": 2.7903170770603113e-05,
"loss": 0.8161,
"step": 1370
},
{
"epoch": 0.37734810431431914,
"grad_norm": 0.335441432819471,
"learning_rate": 2.7887117102266373e-05,
"loss": 0.7934,
"step": 1371
},
{
"epoch": 0.377623339984862,
"grad_norm": 0.3194213789437805,
"learning_rate": 2.787105741417707e-05,
"loss": 0.7942,
"step": 1372
},
{
"epoch": 0.37789857565540497,
"grad_norm": 0.343590881106002,
"learning_rate": 2.7854991718592573e-05,
"loss": 0.8043,
"step": 1373
},
{
"epoch": 0.37817381132594785,
"grad_norm": 0.35997843073088864,
"learning_rate": 2.783892002777484e-05,
"loss": 0.8008,
"step": 1374
},
{
"epoch": 0.37844904699649073,
"grad_norm": 0.3612927520612644,
"learning_rate": 2.7822842353990412e-05,
"loss": 0.8154,
"step": 1375
},
{
"epoch": 0.3787242826670337,
"grad_norm": 0.33095297214331687,
"learning_rate": 2.780675870951039e-05,
"loss": 0.8079,
"step": 1376
},
{
"epoch": 0.37899951833757656,
"grad_norm": 0.31220387603827354,
"learning_rate": 2.779066910661043e-05,
"loss": 0.7997,
"step": 1377
},
{
"epoch": 0.37927475400811944,
"grad_norm": 0.31939710952173245,
"learning_rate": 2.7774573557570743e-05,
"loss": 0.7874,
"step": 1378
},
{
"epoch": 0.3795499896786624,
"grad_norm": 0.32963907224176453,
"learning_rate": 2.775847207467607e-05,
"loss": 0.7906,
"step": 1379
},
{
"epoch": 0.37982522534920526,
"grad_norm": 0.3037381999100857,
"learning_rate": 2.7742364670215686e-05,
"loss": 0.8022,
"step": 1380
},
{
"epoch": 0.38010046101974815,
"grad_norm": 0.3031129728217763,
"learning_rate": 2.772625135648338e-05,
"loss": 0.8284,
"step": 1381
},
{
"epoch": 0.3803756966902911,
"grad_norm": 0.28316725699609346,
"learning_rate": 2.7710132145777465e-05,
"loss": 0.7782,
"step": 1382
},
{
"epoch": 0.38065093236083397,
"grad_norm": 0.34053413103169805,
"learning_rate": 2.7694007050400743e-05,
"loss": 0.7869,
"step": 1383
},
{
"epoch": 0.38092616803137685,
"grad_norm": 0.28214837984365054,
"learning_rate": 2.7677876082660504e-05,
"loss": 0.7928,
"step": 1384
},
{
"epoch": 0.3812014037019198,
"grad_norm": 0.3544908061657199,
"learning_rate": 2.7661739254868534e-05,
"loss": 0.8122,
"step": 1385
},
{
"epoch": 0.3814766393724627,
"grad_norm": 0.32836527286990924,
"learning_rate": 2.7645596579341077e-05,
"loss": 0.8134,
"step": 1386
},
{
"epoch": 0.38175187504300556,
"grad_norm": 0.32829850154101425,
"learning_rate": 2.762944806839885e-05,
"loss": 0.8211,
"step": 1387
},
{
"epoch": 0.3820271107135485,
"grad_norm": 0.33511536227788746,
"learning_rate": 2.7613293734367014e-05,
"loss": 0.8221,
"step": 1388
},
{
"epoch": 0.3823023463840914,
"grad_norm": 0.3394497631985741,
"learning_rate": 2.7597133589575197e-05,
"loss": 0.8226,
"step": 1389
},
{
"epoch": 0.38257758205463427,
"grad_norm": 0.2910092588497054,
"learning_rate": 2.758096764635743e-05,
"loss": 0.7918,
"step": 1390
},
{
"epoch": 0.3828528177251772,
"grad_norm": 0.4023183538104028,
"learning_rate": 2.7564795917052194e-05,
"loss": 0.803,
"step": 1391
},
{
"epoch": 0.3831280533957201,
"grad_norm": 0.36955267265567154,
"learning_rate": 2.7548618414002368e-05,
"loss": 0.793,
"step": 1392
},
{
"epoch": 0.383403289066263,
"grad_norm": 0.31507516369283994,
"learning_rate": 2.7532435149555268e-05,
"loss": 0.7956,
"step": 1393
},
{
"epoch": 0.3836785247368059,
"grad_norm": 0.3773775660294882,
"learning_rate": 2.7516246136062567e-05,
"loss": 0.7838,
"step": 1394
},
{
"epoch": 0.3839537604073488,
"grad_norm": 0.37501625627895047,
"learning_rate": 2.7500051385880347e-05,
"loss": 0.7738,
"step": 1395
},
{
"epoch": 0.3842289960778917,
"grad_norm": 0.2974661971665518,
"learning_rate": 2.748385091136908e-05,
"loss": 0.8174,
"step": 1396
},
{
"epoch": 0.3845042317484346,
"grad_norm": 0.3505183922056784,
"learning_rate": 2.7467644724893583e-05,
"loss": 0.8054,
"step": 1397
},
{
"epoch": 0.3847794674189775,
"grad_norm": 0.30899185617899977,
"learning_rate": 2.7451432838823047e-05,
"loss": 0.7879,
"step": 1398
},
{
"epoch": 0.3850547030895204,
"grad_norm": 0.335348274613658,
"learning_rate": 2.743521526553101e-05,
"loss": 0.8324,
"step": 1399
},
{
"epoch": 0.38532993876006333,
"grad_norm": 0.3633470391262056,
"learning_rate": 2.741899201739536e-05,
"loss": 0.7793,
"step": 1400
},
{
"epoch": 0.3856051744306062,
"grad_norm": 0.35671470264157973,
"learning_rate": 2.7402763106798295e-05,
"loss": 0.7812,
"step": 1401
},
{
"epoch": 0.3858804101011491,
"grad_norm": 0.44871307083826245,
"learning_rate": 2.7386528546126342e-05,
"loss": 0.7731,
"step": 1402
},
{
"epoch": 0.38615564577169204,
"grad_norm": 0.3400205353421276,
"learning_rate": 2.7370288347770358e-05,
"loss": 0.7992,
"step": 1403
},
{
"epoch": 0.3864308814422349,
"grad_norm": 0.3187980198359592,
"learning_rate": 2.7354042524125483e-05,
"loss": 0.8159,
"step": 1404
},
{
"epoch": 0.3867061171127778,
"grad_norm": 0.28815150525493677,
"learning_rate": 2.7337791087591162e-05,
"loss": 0.8013,
"step": 1405
},
{
"epoch": 0.38698135278332074,
"grad_norm": 0.34147251379145943,
"learning_rate": 2.7321534050571115e-05,
"loss": 0.8073,
"step": 1406
},
{
"epoch": 0.3872565884538636,
"grad_norm": 0.3065715702209035,
"learning_rate": 2.7305271425473345e-05,
"loss": 0.7939,
"step": 1407
},
{
"epoch": 0.3875318241244065,
"grad_norm": 0.2981957054027009,
"learning_rate": 2.7289003224710103e-05,
"loss": 0.8513,
"step": 1408
},
{
"epoch": 0.38780705979494945,
"grad_norm": 0.30110790776698665,
"learning_rate": 2.7272729460697927e-05,
"loss": 0.7819,
"step": 1409
},
{
"epoch": 0.38808229546549233,
"grad_norm": 0.30481861252779535,
"learning_rate": 2.7256450145857578e-05,
"loss": 0.8105,
"step": 1410
},
{
"epoch": 0.3883575311360352,
"grad_norm": 0.2987766885062007,
"learning_rate": 2.7240165292614055e-05,
"loss": 0.8198,
"step": 1411
},
{
"epoch": 0.38863276680657816,
"grad_norm": 0.32115251798317024,
"learning_rate": 2.722387491339658e-05,
"loss": 0.8008,
"step": 1412
},
{
"epoch": 0.38890800247712104,
"grad_norm": 0.33777018743725346,
"learning_rate": 2.720757902063861e-05,
"loss": 0.7782,
"step": 1413
},
{
"epoch": 0.3891832381476639,
"grad_norm": 0.3296690616192377,
"learning_rate": 2.71912776267778e-05,
"loss": 0.8215,
"step": 1414
},
{
"epoch": 0.38945847381820686,
"grad_norm": 0.31869005654181193,
"learning_rate": 2.7174970744256e-05,
"loss": 0.7769,
"step": 1415
},
{
"epoch": 0.38973370948874975,
"grad_norm": 0.31483879090434064,
"learning_rate": 2.715865838551925e-05,
"loss": 0.817,
"step": 1416
},
{
"epoch": 0.39000894515929263,
"grad_norm": 0.31195017324740143,
"learning_rate": 2.714234056301778e-05,
"loss": 0.8031,
"step": 1417
},
{
"epoch": 0.39028418082983557,
"grad_norm": 0.3339314715373165,
"learning_rate": 2.7126017289205977e-05,
"loss": 0.8306,
"step": 1418
},
{
"epoch": 0.39055941650037845,
"grad_norm": 0.5165042933029558,
"learning_rate": 2.71096885765424e-05,
"loss": 0.7939,
"step": 1419
},
{
"epoch": 0.39083465217092134,
"grad_norm": 0.3325636248893185,
"learning_rate": 2.7093354437489744e-05,
"loss": 0.823,
"step": 1420
},
{
"epoch": 0.3911098878414643,
"grad_norm": 0.32175997339050016,
"learning_rate": 2.7077014884514867e-05,
"loss": 0.8238,
"step": 1421
},
{
"epoch": 0.39138512351200716,
"grad_norm": 0.3183119811402347,
"learning_rate": 2.7060669930088744e-05,
"loss": 0.7902,
"step": 1422
},
{
"epoch": 0.39166035918255004,
"grad_norm": 0.3190233340184448,
"learning_rate": 2.7044319586686464e-05,
"loss": 0.7957,
"step": 1423
},
{
"epoch": 0.391935594853093,
"grad_norm": 0.358994548429445,
"learning_rate": 2.7027963866787255e-05,
"loss": 0.7982,
"step": 1424
},
{
"epoch": 0.39221083052363587,
"grad_norm": 0.2813243071562666,
"learning_rate": 2.701160278287443e-05,
"loss": 0.7993,
"step": 1425
},
{
"epoch": 0.39248606619417875,
"grad_norm": 0.361100554529105,
"learning_rate": 2.6995236347435402e-05,
"loss": 0.8183,
"step": 1426
},
{
"epoch": 0.3927613018647217,
"grad_norm": 0.297859261527035,
"learning_rate": 2.697886457296166e-05,
"loss": 0.8051,
"step": 1427
},
{
"epoch": 0.3930365375352646,
"grad_norm": 0.3076915112032659,
"learning_rate": 2.6962487471948787e-05,
"loss": 0.8015,
"step": 1428
},
{
"epoch": 0.39331177320580746,
"grad_norm": 0.30013010580384464,
"learning_rate": 2.6946105056896406e-05,
"loss": 0.8217,
"step": 1429
},
{
"epoch": 0.3935870088763504,
"grad_norm": 0.3067900479520812,
"learning_rate": 2.692971734030822e-05,
"loss": 0.8357,
"step": 1430
},
{
"epoch": 0.3938622445468933,
"grad_norm": 0.33042204011575527,
"learning_rate": 2.6913324334691965e-05,
"loss": 0.8187,
"step": 1431
},
{
"epoch": 0.39413748021743616,
"grad_norm": 1.0235278366123177,
"learning_rate": 2.6896926052559412e-05,
"loss": 0.8055,
"step": 1432
},
{
"epoch": 0.3944127158879791,
"grad_norm": 0.3132802404760063,
"learning_rate": 2.688052250642637e-05,
"loss": 0.8033,
"step": 1433
},
{
"epoch": 0.394687951558522,
"grad_norm": 0.2859859821046954,
"learning_rate": 2.6864113708812652e-05,
"loss": 0.8039,
"step": 1434
},
{
"epoch": 0.39496318722906487,
"grad_norm": 0.31601424136481243,
"learning_rate": 2.6847699672242086e-05,
"loss": 0.7931,
"step": 1435
},
{
"epoch": 0.3952384228996078,
"grad_norm": 0.36800909101564766,
"learning_rate": 2.683128040924251e-05,
"loss": 0.8275,
"step": 1436
},
{
"epoch": 0.3955136585701507,
"grad_norm": 0.3155541640988661,
"learning_rate": 2.6814855932345733e-05,
"loss": 0.7825,
"step": 1437
},
{
"epoch": 0.3957888942406936,
"grad_norm": 0.3373869372996058,
"learning_rate": 2.679842625408755e-05,
"loss": 0.7869,
"step": 1438
},
{
"epoch": 0.3960641299112365,
"grad_norm": 0.32653211745613714,
"learning_rate": 2.6781991387007725e-05,
"loss": 0.8131,
"step": 1439
},
{
"epoch": 0.3963393655817794,
"grad_norm": 0.3268275942679181,
"learning_rate": 2.676555134364999e-05,
"loss": 0.7823,
"step": 1440
},
{
"epoch": 0.3966146012523223,
"grad_norm": 0.30157734350933985,
"learning_rate": 2.674910613656201e-05,
"loss": 0.8052,
"step": 1441
},
{
"epoch": 0.3968898369228652,
"grad_norm": 0.33513607003793183,
"learning_rate": 2.6732655778295416e-05,
"loss": 0.7968,
"step": 1442
},
{
"epoch": 0.3971650725934081,
"grad_norm": 0.2988598809491412,
"learning_rate": 2.671620028140575e-05,
"loss": 0.8164,
"step": 1443
},
{
"epoch": 0.397440308263951,
"grad_norm": 0.2974119303314532,
"learning_rate": 2.6699739658452488e-05,
"loss": 0.7867,
"step": 1444
},
{
"epoch": 0.39771554393449393,
"grad_norm": 0.2927689415610301,
"learning_rate": 2.6683273921999e-05,
"loss": 0.7959,
"step": 1445
},
{
"epoch": 0.3979907796050368,
"grad_norm": 0.3034062685357057,
"learning_rate": 2.6666803084612586e-05,
"loss": 0.7609,
"step": 1446
},
{
"epoch": 0.3982660152755797,
"grad_norm": 0.29511262430624435,
"learning_rate": 2.6650327158864423e-05,
"loss": 0.8057,
"step": 1447
},
{
"epoch": 0.39854125094612264,
"grad_norm": 0.31100547478830526,
"learning_rate": 2.663384615732957e-05,
"loss": 0.8007,
"step": 1448
},
{
"epoch": 0.3988164866166655,
"grad_norm": 0.2987495592927288,
"learning_rate": 2.6617360092586973e-05,
"loss": 0.7742,
"step": 1449
},
{
"epoch": 0.3990917222872084,
"grad_norm": 0.29353723403945653,
"learning_rate": 2.6600868977219428e-05,
"loss": 0.7967,
"step": 1450
},
{
"epoch": 0.39936695795775135,
"grad_norm": 0.2943294262418574,
"learning_rate": 2.6584372823813588e-05,
"loss": 0.7832,
"step": 1451
},
{
"epoch": 0.39964219362829423,
"grad_norm": 0.28764564181784025,
"learning_rate": 2.6567871644959954e-05,
"loss": 0.8084,
"step": 1452
},
{
"epoch": 0.3999174292988371,
"grad_norm": 0.30420633598929464,
"learning_rate": 2.6551365453252872e-05,
"loss": 0.83,
"step": 1453
},
{
"epoch": 0.40019266496938005,
"grad_norm": 0.28961885854550073,
"learning_rate": 2.6534854261290504e-05,
"loss": 0.8253,
"step": 1454
},
{
"epoch": 0.40046790063992294,
"grad_norm": 0.29650316278232675,
"learning_rate": 2.651833808167482e-05,
"loss": 0.7987,
"step": 1455
},
{
"epoch": 0.4007431363104658,
"grad_norm": 0.28840641987030097,
"learning_rate": 2.6501816927011616e-05,
"loss": 0.808,
"step": 1456
},
{
"epoch": 0.40101837198100876,
"grad_norm": 0.2632624554330908,
"learning_rate": 2.6485290809910473e-05,
"loss": 0.7983,
"step": 1457
},
{
"epoch": 0.40129360765155164,
"grad_norm": 0.3010079679168447,
"learning_rate": 2.6468759742984763e-05,
"loss": 0.8227,
"step": 1458
},
{
"epoch": 0.4015688433220945,
"grad_norm": 0.28604558779891986,
"learning_rate": 2.6452223738851634e-05,
"loss": 0.8147,
"step": 1459
},
{
"epoch": 0.40184407899263747,
"grad_norm": 0.2976386009217243,
"learning_rate": 2.6435682810132007e-05,
"loss": 0.772,
"step": 1460
},
{
"epoch": 0.40211931466318035,
"grad_norm": 0.2752057004017899,
"learning_rate": 2.641913696945055e-05,
"loss": 0.8028,
"step": 1461
},
{
"epoch": 0.40239455033372323,
"grad_norm": 1.8613068342598982,
"learning_rate": 2.6402586229435694e-05,
"loss": 0.8125,
"step": 1462
},
{
"epoch": 0.4026697860042662,
"grad_norm": 0.33706353100482705,
"learning_rate": 2.63860306027196e-05,
"loss": 0.8084,
"step": 1463
},
{
"epoch": 0.40294502167480906,
"grad_norm": 0.306332125455289,
"learning_rate": 2.636947010193817e-05,
"loss": 0.7956,
"step": 1464
},
{
"epoch": 0.40322025734535194,
"grad_norm": 0.32297963436227717,
"learning_rate": 2.6352904739731007e-05,
"loss": 0.8011,
"step": 1465
},
{
"epoch": 0.4034954930158949,
"grad_norm": 0.874575402840066,
"learning_rate": 2.6336334528741442e-05,
"loss": 0.8164,
"step": 1466
},
{
"epoch": 0.40377072868643776,
"grad_norm": 0.8744408711668231,
"learning_rate": 2.63197594816165e-05,
"loss": 0.8105,
"step": 1467
},
{
"epoch": 0.40404596435698065,
"grad_norm": 0.3646798763416938,
"learning_rate": 2.6303179611006896e-05,
"loss": 0.8017,
"step": 1468
},
{
"epoch": 0.4043212000275236,
"grad_norm": 0.32039312801768444,
"learning_rate": 2.628659492956703e-05,
"loss": 0.8154,
"step": 1469
},
{
"epoch": 0.40459643569806647,
"grad_norm": 0.3728939261903309,
"learning_rate": 2.6270005449954972e-05,
"loss": 0.8188,
"step": 1470
},
{
"epoch": 0.40487167136860935,
"grad_norm": 0.34239940638498284,
"learning_rate": 2.6253411184832454e-05,
"loss": 0.8038,
"step": 1471
},
{
"epoch": 0.4051469070391523,
"grad_norm": 0.34487757127620067,
"learning_rate": 2.6236812146864853e-05,
"loss": 0.7801,
"step": 1472
},
{
"epoch": 0.4054221427096952,
"grad_norm": 0.36325221927535567,
"learning_rate": 2.62202083487212e-05,
"loss": 0.822,
"step": 1473
},
{
"epoch": 0.40569737838023806,
"grad_norm": 0.37044590278077094,
"learning_rate": 2.6203599803074165e-05,
"loss": 0.8536,
"step": 1474
},
{
"epoch": 0.405972614050781,
"grad_norm": 0.4293282038813741,
"learning_rate": 2.6186986522600023e-05,
"loss": 0.7903,
"step": 1475
},
{
"epoch": 0.4062478497213239,
"grad_norm": 0.31220366799140137,
"learning_rate": 2.617036851997867e-05,
"loss": 0.7654,
"step": 1476
},
{
"epoch": 0.40652308539186677,
"grad_norm": 0.3269640608562408,
"learning_rate": 2.6153745807893615e-05,
"loss": 0.7918,
"step": 1477
},
{
"epoch": 0.4067983210624097,
"grad_norm": 0.3257998229141931,
"learning_rate": 2.6137118399031946e-05,
"loss": 0.8108,
"step": 1478
},
{
"epoch": 0.4070735567329526,
"grad_norm": 0.38549640403772,
"learning_rate": 2.612048630608435e-05,
"loss": 0.8208,
"step": 1479
},
{
"epoch": 0.4073487924034955,
"grad_norm": 0.33409143137306907,
"learning_rate": 2.6103849541745085e-05,
"loss": 0.7759,
"step": 1480
},
{
"epoch": 0.4076240280740384,
"grad_norm": 0.300974685979459,
"learning_rate": 2.608720811871196e-05,
"loss": 0.8014,
"step": 1481
},
{
"epoch": 0.4078992637445813,
"grad_norm": 0.3072004155673629,
"learning_rate": 2.607056204968637e-05,
"loss": 0.7928,
"step": 1482
},
{
"epoch": 0.4081744994151242,
"grad_norm": 0.44751117347870023,
"learning_rate": 2.605391134737322e-05,
"loss": 0.7873,
"step": 1483
},
{
"epoch": 0.4084497350856671,
"grad_norm": 0.29147020206747637,
"learning_rate": 2.6037256024480985e-05,
"loss": 0.819,
"step": 1484
},
{
"epoch": 0.40872497075621,
"grad_norm": 0.29787047831873453,
"learning_rate": 2.6020596093721643e-05,
"loss": 0.7967,
"step": 1485
},
{
"epoch": 0.4090002064267529,
"grad_norm": 0.3234064883399612,
"learning_rate": 2.60039315678107e-05,
"loss": 0.8082,
"step": 1486
},
{
"epoch": 0.4092754420972958,
"grad_norm": 0.2822883935726763,
"learning_rate": 2.5987262459467168e-05,
"loss": 0.7919,
"step": 1487
},
{
"epoch": 0.4095506777678387,
"grad_norm": 0.337762827412016,
"learning_rate": 2.597058878141354e-05,
"loss": 0.824,
"step": 1488
},
{
"epoch": 0.4098259134383816,
"grad_norm": 0.3351266198568725,
"learning_rate": 2.5953910546375827e-05,
"loss": 0.8169,
"step": 1489
},
{
"epoch": 0.41010114910892453,
"grad_norm": 0.27981369269275125,
"learning_rate": 2.5937227767083503e-05,
"loss": 0.7986,
"step": 1490
},
{
"epoch": 0.4103763847794674,
"grad_norm": 0.33122050921440876,
"learning_rate": 2.59205404562695e-05,
"loss": 0.7831,
"step": 1491
},
{
"epoch": 0.4106516204500103,
"grad_norm": 0.31103569343053505,
"learning_rate": 2.5903848626670227e-05,
"loss": 0.7963,
"step": 1492
},
{
"epoch": 0.41092685612055324,
"grad_norm": 0.2872677075818124,
"learning_rate": 2.5887152291025532e-05,
"loss": 0.7874,
"step": 1493
},
{
"epoch": 0.4112020917910961,
"grad_norm": 0.2803646269116244,
"learning_rate": 2.5870451462078697e-05,
"loss": 0.8081,
"step": 1494
},
{
"epoch": 0.411477327461639,
"grad_norm": 0.2887233841614559,
"learning_rate": 2.5853746152576443e-05,
"loss": 0.8068,
"step": 1495
},
{
"epoch": 0.41175256313218195,
"grad_norm": 0.2691437300859037,
"learning_rate": 2.5837036375268916e-05,
"loss": 0.807,
"step": 1496
},
{
"epoch": 0.41202779880272483,
"grad_norm": 0.2939469370716576,
"learning_rate": 2.582032214290966e-05,
"loss": 0.8074,
"step": 1497
},
{
"epoch": 0.4123030344732677,
"grad_norm": 0.2962223128230255,
"learning_rate": 2.5803603468255612e-05,
"loss": 0.784,
"step": 1498
},
{
"epoch": 0.41257827014381065,
"grad_norm": 0.30684610616954827,
"learning_rate": 2.5786880364067118e-05,
"loss": 0.8177,
"step": 1499
},
{
"epoch": 0.41285350581435354,
"grad_norm": 0.30165991323175034,
"learning_rate": 2.5770152843107906e-05,
"loss": 0.7854,
"step": 1500
},
{
"epoch": 0.4131287414848964,
"grad_norm": 0.344845393306954,
"learning_rate": 2.5753420918145054e-05,
"loss": 0.7884,
"step": 1501
},
{
"epoch": 0.41340397715543936,
"grad_norm": 0.2749121369417589,
"learning_rate": 2.5736684601949016e-05,
"loss": 0.7875,
"step": 1502
},
{
"epoch": 0.41367921282598225,
"grad_norm": 0.31662733333357823,
"learning_rate": 2.5719943907293604e-05,
"loss": 0.7919,
"step": 1503
},
{
"epoch": 0.41395444849652513,
"grad_norm": 0.4084878913616865,
"learning_rate": 2.5703198846955948e-05,
"loss": 0.7965,
"step": 1504
},
{
"epoch": 0.41422968416706807,
"grad_norm": 0.28272609789134145,
"learning_rate": 2.5686449433716542e-05,
"loss": 0.8028,
"step": 1505
},
{
"epoch": 0.41450491983761095,
"grad_norm": 0.3092023302292874,
"learning_rate": 2.5669695680359173e-05,
"loss": 0.7992,
"step": 1506
},
{
"epoch": 0.41478015550815384,
"grad_norm": 0.29500853646346326,
"learning_rate": 2.5652937599670962e-05,
"loss": 0.83,
"step": 1507
},
{
"epoch": 0.4150553911786968,
"grad_norm": 0.30316421568834717,
"learning_rate": 2.5636175204442317e-05,
"loss": 0.819,
"step": 1508
},
{
"epoch": 0.41533062684923966,
"grad_norm": 0.2837657249146373,
"learning_rate": 2.5619408507466945e-05,
"loss": 0.7702,
"step": 1509
},
{
"epoch": 0.41560586251978254,
"grad_norm": 0.2872567530513789,
"learning_rate": 2.560263752154184e-05,
"loss": 0.8166,
"step": 1510
},
{
"epoch": 0.4158810981903255,
"grad_norm": 0.2933075992543045,
"learning_rate": 2.5585862259467274e-05,
"loss": 0.8066,
"step": 1511
},
{
"epoch": 0.41615633386086837,
"grad_norm": 0.32175892636432013,
"learning_rate": 2.5569082734046765e-05,
"loss": 0.8005,
"step": 1512
},
{
"epoch": 0.41643156953141125,
"grad_norm": 0.29372399113648706,
"learning_rate": 2.555229895808709e-05,
"loss": 0.7922,
"step": 1513
},
{
"epoch": 0.4167068052019542,
"grad_norm": 0.29651349158098117,
"learning_rate": 2.553551094439829e-05,
"loss": 0.7814,
"step": 1514
},
{
"epoch": 0.4169820408724971,
"grad_norm": 0.31559421692998985,
"learning_rate": 2.5518718705793618e-05,
"loss": 0.7965,
"step": 1515
},
{
"epoch": 0.41725727654303996,
"grad_norm": 0.291214389213605,
"learning_rate": 2.5501922255089563e-05,
"loss": 0.8009,
"step": 1516
},
{
"epoch": 0.4175325122135829,
"grad_norm": 0.28998397210337973,
"learning_rate": 2.5485121605105825e-05,
"loss": 0.8044,
"step": 1517
},
{
"epoch": 0.4178077478841258,
"grad_norm": 0.2688054846484204,
"learning_rate": 2.54683167686653e-05,
"loss": 0.8056,
"step": 1518
},
{
"epoch": 0.41808298355466866,
"grad_norm": 0.2938280545832689,
"learning_rate": 2.5451507758594106e-05,
"loss": 0.7715,
"step": 1519
},
{
"epoch": 0.4183582192252116,
"grad_norm": 0.3213269323332592,
"learning_rate": 2.543469458772151e-05,
"loss": 0.8034,
"step": 1520
},
{
"epoch": 0.4186334548957545,
"grad_norm": 0.28593880805933963,
"learning_rate": 2.5417877268879987e-05,
"loss": 0.8068,
"step": 1521
},
{
"epoch": 0.41890869056629737,
"grad_norm": 0.35744553405482726,
"learning_rate": 2.540105581490516e-05,
"loss": 0.7807,
"step": 1522
},
{
"epoch": 0.4191839262368403,
"grad_norm": 0.30165493632045265,
"learning_rate": 2.5384230238635814e-05,
"loss": 0.8216,
"step": 1523
},
{
"epoch": 0.4194591619073832,
"grad_norm": 0.3165521422964494,
"learning_rate": 2.5367400552913876e-05,
"loss": 0.8086,
"step": 1524
},
{
"epoch": 0.4197343975779261,
"grad_norm": 0.3100636628963957,
"learning_rate": 2.5350566770584423e-05,
"loss": 0.7844,
"step": 1525
},
{
"epoch": 0.420009633248469,
"grad_norm": 0.280191236586672,
"learning_rate": 2.5333728904495633e-05,
"loss": 0.7865,
"step": 1526
},
{
"epoch": 0.4202848689190119,
"grad_norm": 0.29718483835011467,
"learning_rate": 2.531688696749882e-05,
"loss": 0.7895,
"step": 1527
},
{
"epoch": 0.4205601045895548,
"grad_norm": 0.3189160667948843,
"learning_rate": 2.5300040972448407e-05,
"loss": 0.7886,
"step": 1528
},
{
"epoch": 0.4208353402600977,
"grad_norm": 0.346648339637172,
"learning_rate": 2.5283190932201905e-05,
"loss": 0.813,
"step": 1529
},
{
"epoch": 0.4211105759306406,
"grad_norm": 0.4196447797342443,
"learning_rate": 2.526633685961992e-05,
"loss": 0.7752,
"step": 1530
},
{
"epoch": 0.4213858116011835,
"grad_norm": 0.3591666823765334,
"learning_rate": 2.5249478767566128e-05,
"loss": 0.7983,
"step": 1531
},
{
"epoch": 0.42166104727172643,
"grad_norm": 0.3157911243028244,
"learning_rate": 2.5232616668907272e-05,
"loss": 0.7752,
"step": 1532
},
{
"epoch": 0.4219362829422693,
"grad_norm": 0.3469253855461703,
"learning_rate": 2.521575057651317e-05,
"loss": 0.8002,
"step": 1533
},
{
"epoch": 0.4222115186128122,
"grad_norm": 0.37750370249049303,
"learning_rate": 2.5198880503256656e-05,
"loss": 0.7877,
"step": 1534
},
{
"epoch": 0.42248675428335514,
"grad_norm": 0.3462659714898315,
"learning_rate": 2.518200646201364e-05,
"loss": 0.8244,
"step": 1535
},
{
"epoch": 0.422761989953898,
"grad_norm": 0.40500355446545444,
"learning_rate": 2.5165128465663035e-05,
"loss": 0.8043,
"step": 1536
},
{
"epoch": 0.4230372256244409,
"grad_norm": 0.3376593793085698,
"learning_rate": 2.5148246527086773e-05,
"loss": 0.8066,
"step": 1537
},
{
"epoch": 0.42331246129498384,
"grad_norm": 0.31106951332736665,
"learning_rate": 2.5131360659169817e-05,
"loss": 0.8054,
"step": 1538
},
{
"epoch": 0.4235876969655267,
"grad_norm": 0.32605582211855666,
"learning_rate": 2.5114470874800106e-05,
"loss": 0.7953,
"step": 1539
},
{
"epoch": 0.4238629326360696,
"grad_norm": 0.32233029068351515,
"learning_rate": 2.509757718686858e-05,
"loss": 0.7968,
"step": 1540
},
{
"epoch": 0.42413816830661255,
"grad_norm": 0.3141658510318051,
"learning_rate": 2.5080679608269143e-05,
"loss": 0.825,
"step": 1541
},
{
"epoch": 0.42441340397715543,
"grad_norm": 0.3429314163930497,
"learning_rate": 2.5063778151898688e-05,
"loss": 0.769,
"step": 1542
},
{
"epoch": 0.4246886396476983,
"grad_norm": 0.3532207907958763,
"learning_rate": 2.504687283065707e-05,
"loss": 0.7781,
"step": 1543
},
{
"epoch": 0.42496387531824126,
"grad_norm": 0.31220809786001236,
"learning_rate": 2.5029963657447063e-05,
"loss": 0.8076,
"step": 1544
},
{
"epoch": 0.42523911098878414,
"grad_norm": 0.34181803029550617,
"learning_rate": 2.5013050645174414e-05,
"loss": 0.7757,
"step": 1545
},
{
"epoch": 0.425514346659327,
"grad_norm": 0.2934744581681451,
"learning_rate": 2.4996133806747786e-05,
"loss": 0.8182,
"step": 1546
},
{
"epoch": 0.42578958232986996,
"grad_norm": 0.2954462476060033,
"learning_rate": 2.4979213155078758e-05,
"loss": 0.8154,
"step": 1547
},
{
"epoch": 0.42606481800041285,
"grad_norm": 0.30627584397965296,
"learning_rate": 2.4962288703081833e-05,
"loss": 0.7958,
"step": 1548
},
{
"epoch": 0.42634005367095573,
"grad_norm": 0.3184444669803208,
"learning_rate": 2.4945360463674408e-05,
"loss": 0.7958,
"step": 1549
},
{
"epoch": 0.42661528934149867,
"grad_norm": 0.29221372217687863,
"learning_rate": 2.492842844977677e-05,
"loss": 0.8376,
"step": 1550
},
{
"epoch": 0.42689052501204156,
"grad_norm": 0.30012765565232413,
"learning_rate": 2.4911492674312072e-05,
"loss": 0.807,
"step": 1551
},
{
"epoch": 0.42716576068258444,
"grad_norm": 0.31353031412169613,
"learning_rate": 2.4894553150206364e-05,
"loss": 0.7936,
"step": 1552
},
{
"epoch": 0.4274409963531274,
"grad_norm": 0.2990620959446403,
"learning_rate": 2.4877609890388544e-05,
"loss": 0.7894,
"step": 1553
},
{
"epoch": 0.42771623202367026,
"grad_norm": 0.3214884522984842,
"learning_rate": 2.4860662907790363e-05,
"loss": 0.7982,
"step": 1554
},
{
"epoch": 0.42799146769421315,
"grad_norm": 0.30848511206629325,
"learning_rate": 2.484371221534641e-05,
"loss": 0.7795,
"step": 1555
},
{
"epoch": 0.4282667033647561,
"grad_norm": 0.289204480093799,
"learning_rate": 2.4826757825994116e-05,
"loss": 0.829,
"step": 1556
},
{
"epoch": 0.42854193903529897,
"grad_norm": 0.28512723873036044,
"learning_rate": 2.480979975267372e-05,
"loss": 0.7994,
"step": 1557
},
{
"epoch": 0.42881717470584185,
"grad_norm": 0.34065328908174497,
"learning_rate": 2.4792838008328273e-05,
"loss": 0.7948,
"step": 1558
},
{
"epoch": 0.4290924103763848,
"grad_norm": 0.2985031897281868,
"learning_rate": 2.4775872605903644e-05,
"loss": 0.8079,
"step": 1559
},
{
"epoch": 0.4293676460469277,
"grad_norm": 0.3198267846931661,
"learning_rate": 2.4758903558348485e-05,
"loss": 0.7749,
"step": 1560
},
{
"epoch": 0.42964288171747056,
"grad_norm": 0.36530245462620264,
"learning_rate": 2.474193087861422e-05,
"loss": 0.7844,
"step": 1561
},
{
"epoch": 0.4299181173880135,
"grad_norm": 0.29563438375387263,
"learning_rate": 2.472495457965506e-05,
"loss": 0.7743,
"step": 1562
},
{
"epoch": 0.4301933530585564,
"grad_norm": 0.2953487472265621,
"learning_rate": 2.470797467442797e-05,
"loss": 0.8117,
"step": 1563
},
{
"epoch": 0.43046858872909927,
"grad_norm": 0.3279910813270692,
"learning_rate": 2.4690991175892663e-05,
"loss": 0.8109,
"step": 1564
},
{
"epoch": 0.4307438243996422,
"grad_norm": 0.32686073979880587,
"learning_rate": 2.467400409701162e-05,
"loss": 0.8147,
"step": 1565
},
{
"epoch": 0.4310190600701851,
"grad_norm": 0.2893424695495428,
"learning_rate": 2.465701345075002e-05,
"loss": 0.8046,
"step": 1566
},
{
"epoch": 0.431294295740728,
"grad_norm": 0.3173272950085369,
"learning_rate": 2.4640019250075788e-05,
"loss": 0.7748,
"step": 1567
},
{
"epoch": 0.4315695314112709,
"grad_norm": 0.27879186742790907,
"learning_rate": 2.4623021507959552e-05,
"loss": 0.8055,
"step": 1568
},
{
"epoch": 0.4318447670818138,
"grad_norm": 0.34456420267891086,
"learning_rate": 2.4606020237374644e-05,
"loss": 0.7962,
"step": 1569
},
{
"epoch": 0.4321200027523567,
"grad_norm": 0.29678832958335566,
"learning_rate": 2.458901545129709e-05,
"loss": 0.7965,
"step": 1570
},
{
"epoch": 0.4323952384228996,
"grad_norm": 0.3142548013817184,
"learning_rate": 2.457200716270561e-05,
"loss": 0.8115,
"step": 1571
},
{
"epoch": 0.4326704740934425,
"grad_norm": 0.29426185891272927,
"learning_rate": 2.455499538458158e-05,
"loss": 0.7971,
"step": 1572
},
{
"epoch": 0.4329457097639854,
"grad_norm": 0.3232925060411943,
"learning_rate": 2.453798012990904e-05,
"loss": 0.8027,
"step": 1573
},
{
"epoch": 0.4332209454345283,
"grad_norm": 0.2851698569966,
"learning_rate": 2.45209614116747e-05,
"loss": 0.8112,
"step": 1574
},
{
"epoch": 0.4334961811050712,
"grad_norm": 0.3710782702944274,
"learning_rate": 2.4503939242867894e-05,
"loss": 0.7781,
"step": 1575
},
{
"epoch": 0.4337714167756141,
"grad_norm": 0.2958423052963948,
"learning_rate": 2.4486913636480614e-05,
"loss": 0.7993,
"step": 1576
},
{
"epoch": 0.43404665244615703,
"grad_norm": 0.284930887135061,
"learning_rate": 2.4469884605507446e-05,
"loss": 0.8023,
"step": 1577
},
{
"epoch": 0.4343218881166999,
"grad_norm": 0.314246053196774,
"learning_rate": 2.445285216294561e-05,
"loss": 0.768,
"step": 1578
},
{
"epoch": 0.4345971237872428,
"grad_norm": 0.28396390598718796,
"learning_rate": 2.443581632179493e-05,
"loss": 0.7908,
"step": 1579
},
{
"epoch": 0.43487235945778574,
"grad_norm": 0.3128320861472054,
"learning_rate": 2.4418777095057803e-05,
"loss": 0.7853,
"step": 1580
},
{
"epoch": 0.4351475951283286,
"grad_norm": 0.30881869705845577,
"learning_rate": 2.4401734495739243e-05,
"loss": 0.8109,
"step": 1581
},
{
"epoch": 0.4354228307988715,
"grad_norm": 0.30120876064722846,
"learning_rate": 2.4384688536846813e-05,
"loss": 0.805,
"step": 1582
},
{
"epoch": 0.43569806646941445,
"grad_norm": 0.31066632616537543,
"learning_rate": 2.4367639231390645e-05,
"loss": 0.7703,
"step": 1583
},
{
"epoch": 0.43597330213995733,
"grad_norm": 0.3004766739033846,
"learning_rate": 2.4350586592383424e-05,
"loss": 0.8056,
"step": 1584
},
{
"epoch": 0.4362485378105002,
"grad_norm": 0.2833664052327661,
"learning_rate": 2.433353063284039e-05,
"loss": 0.7685,
"step": 1585
},
{
"epoch": 0.43652377348104315,
"grad_norm": 0.2811209308675284,
"learning_rate": 2.4316471365779317e-05,
"loss": 0.8157,
"step": 1586
},
{
"epoch": 0.43679900915158604,
"grad_norm": 0.288913620983614,
"learning_rate": 2.4299408804220485e-05,
"loss": 0.7907,
"step": 1587
},
{
"epoch": 0.4370742448221289,
"grad_norm": 0.27966116229705296,
"learning_rate": 2.4282342961186705e-05,
"loss": 0.7655,
"step": 1588
},
{
"epoch": 0.43734948049267186,
"grad_norm": 0.2641961924186609,
"learning_rate": 2.426527384970329e-05,
"loss": 0.7959,
"step": 1589
},
{
"epoch": 0.43762471616321474,
"grad_norm": 0.27668049741714845,
"learning_rate": 2.424820148279803e-05,
"loss": 0.7867,
"step": 1590
},
{
"epoch": 0.43789995183375763,
"grad_norm": 0.2611167188967044,
"learning_rate": 2.423112587350124e-05,
"loss": 0.7984,
"step": 1591
},
{
"epoch": 0.43817518750430057,
"grad_norm": 0.33357550981600415,
"learning_rate": 2.4214047034845673e-05,
"loss": 0.8253,
"step": 1592
},
{
"epoch": 0.43845042317484345,
"grad_norm": 0.26710635956132567,
"learning_rate": 2.419696497986656e-05,
"loss": 0.7881,
"step": 1593
},
{
"epoch": 0.43872565884538633,
"grad_norm": 0.2736293414899826,
"learning_rate": 2.417987972160158e-05,
"loss": 0.7675,
"step": 1594
},
{
"epoch": 0.4390008945159293,
"grad_norm": 0.2941948694624388,
"learning_rate": 2.4162791273090863e-05,
"loss": 0.7713,
"step": 1595
},
{
"epoch": 0.43927613018647216,
"grad_norm": 0.27180507918902364,
"learning_rate": 2.414569964737698e-05,
"loss": 0.8087,
"step": 1596
},
{
"epoch": 0.43955136585701504,
"grad_norm": 0.32201452043854006,
"learning_rate": 2.4128604857504923e-05,
"loss": 0.8115,
"step": 1597
},
{
"epoch": 0.439826601527558,
"grad_norm": 0.27019563722592305,
"learning_rate": 2.4111506916522084e-05,
"loss": 0.7925,
"step": 1598
},
{
"epoch": 0.44010183719810086,
"grad_norm": 0.28629218746674434,
"learning_rate": 2.409440583747828e-05,
"loss": 0.798,
"step": 1599
},
{
"epoch": 0.44037707286864375,
"grad_norm": 0.28224945076498836,
"learning_rate": 2.4077301633425716e-05,
"loss": 0.7882,
"step": 1600
},
{
"epoch": 0.4406523085391867,
"grad_norm": 0.2700656948439867,
"learning_rate": 2.4060194317418974e-05,
"loss": 0.859,
"step": 1601
},
{
"epoch": 0.44092754420972957,
"grad_norm": 0.29623434887566996,
"learning_rate": 2.404308390251503e-05,
"loss": 0.8176,
"step": 1602
},
{
"epoch": 0.4412027798802725,
"grad_norm": 0.25995691825993167,
"learning_rate": 2.4025970401773204e-05,
"loss": 0.7734,
"step": 1603
},
{
"epoch": 0.4414780155508154,
"grad_norm": 0.28578242404804854,
"learning_rate": 2.4008853828255187e-05,
"loss": 0.8247,
"step": 1604
},
{
"epoch": 0.4417532512213583,
"grad_norm": 0.3291469264128354,
"learning_rate": 2.399173419502501e-05,
"loss": 0.8069,
"step": 1605
},
{
"epoch": 0.4420284868919012,
"grad_norm": 0.3093473781894673,
"learning_rate": 2.3974611515149032e-05,
"loss": 0.7878,
"step": 1606
},
{
"epoch": 0.4423037225624441,
"grad_norm": 0.2857840619139393,
"learning_rate": 2.395748580169595e-05,
"loss": 0.7971,
"step": 1607
},
{
"epoch": 0.442578958232987,
"grad_norm": 0.33309283781537863,
"learning_rate": 2.394035706773677e-05,
"loss": 0.8074,
"step": 1608
},
{
"epoch": 0.4428541939035299,
"grad_norm": 0.33075153702648236,
"learning_rate": 2.39232253263448e-05,
"loss": 0.7754,
"step": 1609
},
{
"epoch": 0.4431294295740728,
"grad_norm": 0.27203771265724375,
"learning_rate": 2.390609059059565e-05,
"loss": 0.782,
"step": 1610
},
{
"epoch": 0.4434046652446157,
"grad_norm": 0.33236891628353504,
"learning_rate": 2.3888952873567216e-05,
"loss": 0.7739,
"step": 1611
},
{
"epoch": 0.44367990091515863,
"grad_norm": 0.29014067567314206,
"learning_rate": 2.3871812188339653e-05,
"loss": 0.7897,
"step": 1612
},
{
"epoch": 0.4439551365857015,
"grad_norm": 0.3094146471792101,
"learning_rate": 2.385466854799541e-05,
"loss": 0.7758,
"step": 1613
},
{
"epoch": 0.4442303722562444,
"grad_norm": 0.2973785749542664,
"learning_rate": 2.3837521965619167e-05,
"loss": 0.7878,
"step": 1614
},
{
"epoch": 0.44450560792678734,
"grad_norm": 0.31623424827633717,
"learning_rate": 2.382037245429786e-05,
"loss": 0.8003,
"step": 1615
},
{
"epoch": 0.4447808435973302,
"grad_norm": 0.310821037517096,
"learning_rate": 2.3803220027120654e-05,
"loss": 0.7984,
"step": 1616
},
{
"epoch": 0.4450560792678731,
"grad_norm": 0.2857163022467033,
"learning_rate": 2.378606469717896e-05,
"loss": 0.7953,
"step": 1617
},
{
"epoch": 0.44533131493841605,
"grad_norm": 0.31477276396196974,
"learning_rate": 2.376890647756637e-05,
"loss": 0.7805,
"step": 1618
},
{
"epoch": 0.44560655060895893,
"grad_norm": 0.3108309726428149,
"learning_rate": 2.3751745381378714e-05,
"loss": 0.7957,
"step": 1619
},
{
"epoch": 0.4458817862795018,
"grad_norm": 0.28791878950978966,
"learning_rate": 2.3734581421713987e-05,
"loss": 0.7979,
"step": 1620
},
{
"epoch": 0.44615702195004475,
"grad_norm": 0.31005767280539925,
"learning_rate": 2.3717414611672408e-05,
"loss": 0.7829,
"step": 1621
},
{
"epoch": 0.44643225762058764,
"grad_norm": 0.28154708408818874,
"learning_rate": 2.370024496435634e-05,
"loss": 0.7942,
"step": 1622
},
{
"epoch": 0.4467074932911305,
"grad_norm": 0.3027781268228018,
"learning_rate": 2.368307249287031e-05,
"loss": 0.8059,
"step": 1623
},
{
"epoch": 0.44698272896167346,
"grad_norm": 0.28151340227579136,
"learning_rate": 2.366589721032103e-05,
"loss": 0.8184,
"step": 1624
},
{
"epoch": 0.44725796463221634,
"grad_norm": 0.3363786663035669,
"learning_rate": 2.3648719129817335e-05,
"loss": 0.79,
"step": 1625
},
{
"epoch": 0.4475332003027592,
"grad_norm": 0.2750818479805928,
"learning_rate": 2.363153826447019e-05,
"loss": 0.7688,
"step": 1626
},
{
"epoch": 0.44780843597330217,
"grad_norm": 0.31079540101572517,
"learning_rate": 2.3614354627392703e-05,
"loss": 0.7948,
"step": 1627
},
{
"epoch": 0.44808367164384505,
"grad_norm": 0.2736270642653545,
"learning_rate": 2.359716823170009e-05,
"loss": 0.7741,
"step": 1628
},
{
"epoch": 0.44835890731438793,
"grad_norm": 0.2938174781088623,
"learning_rate": 2.3579979090509672e-05,
"loss": 0.7932,
"step": 1629
},
{
"epoch": 0.4486341429849309,
"grad_norm": 0.3075005581220249,
"learning_rate": 2.3562787216940864e-05,
"loss": 0.8294,
"step": 1630
},
{
"epoch": 0.44890937865547376,
"grad_norm": 0.26738711635634516,
"learning_rate": 2.3545592624115172e-05,
"loss": 0.7724,
"step": 1631
},
{
"epoch": 0.44918461432601664,
"grad_norm": 0.3026137091561077,
"learning_rate": 2.3528395325156175e-05,
"loss": 0.7943,
"step": 1632
},
{
"epoch": 0.4494598499965596,
"grad_norm": 0.3535514366251364,
"learning_rate": 2.3511195333189503e-05,
"loss": 0.802,
"step": 1633
},
{
"epoch": 0.44973508566710246,
"grad_norm": 0.30117982206851973,
"learning_rate": 2.3493992661342865e-05,
"loss": 0.8023,
"step": 1634
},
{
"epoch": 0.45001032133764535,
"grad_norm": 0.2694164912698681,
"learning_rate": 2.3476787322746007e-05,
"loss": 0.7828,
"step": 1635
},
{
"epoch": 0.4502855570081883,
"grad_norm": 0.2945971699512249,
"learning_rate": 2.345957933053071e-05,
"loss": 0.7731,
"step": 1636
},
{
"epoch": 0.45056079267873117,
"grad_norm": 0.6140352459748996,
"learning_rate": 2.3442368697830767e-05,
"loss": 0.8232,
"step": 1637
},
{
"epoch": 0.45083602834927405,
"grad_norm": 0.32155502499418237,
"learning_rate": 2.3425155437782007e-05,
"loss": 0.7794,
"step": 1638
},
{
"epoch": 0.451111264019817,
"grad_norm": 0.2701455300552998,
"learning_rate": 2.3407939563522248e-05,
"loss": 0.7939,
"step": 1639
},
{
"epoch": 0.4513864996903599,
"grad_norm": 0.26950129133550305,
"learning_rate": 2.3390721088191322e-05,
"loss": 0.8323,
"step": 1640
},
{
"epoch": 0.45166173536090276,
"grad_norm": 0.2914499396388273,
"learning_rate": 2.3373500024931025e-05,
"loss": 0.7892,
"step": 1641
},
{
"epoch": 0.4519369710314457,
"grad_norm": 0.27967733941718875,
"learning_rate": 2.3356276386885144e-05,
"loss": 0.8191,
"step": 1642
},
{
"epoch": 0.4522122067019886,
"grad_norm": 0.2900091020222259,
"learning_rate": 2.3339050187199423e-05,
"loss": 0.7908,
"step": 1643
},
{
"epoch": 0.45248744237253147,
"grad_norm": 0.28773498093485295,
"learning_rate": 2.3321821439021556e-05,
"loss": 0.8074,
"step": 1644
},
{
"epoch": 0.4527626780430744,
"grad_norm": 0.45887861211448044,
"learning_rate": 2.3304590155501198e-05,
"loss": 0.7767,
"step": 1645
},
{
"epoch": 0.4530379137136173,
"grad_norm": 0.3183033245742924,
"learning_rate": 2.3287356349789936e-05,
"loss": 0.816,
"step": 1646
},
{
"epoch": 0.4533131493841602,
"grad_norm": 0.3175071359168492,
"learning_rate": 2.327012003504127e-05,
"loss": 0.8024,
"step": 1647
},
{
"epoch": 0.4535883850547031,
"grad_norm": 0.2838076219406021,
"learning_rate": 2.3252881224410612e-05,
"loss": 0.7874,
"step": 1648
},
{
"epoch": 0.453863620725246,
"grad_norm": 0.3208661583070452,
"learning_rate": 2.32356399310553e-05,
"loss": 0.8151,
"step": 1649
},
{
"epoch": 0.4541388563957889,
"grad_norm": 0.2927301112340574,
"learning_rate": 2.321839616813455e-05,
"loss": 0.8261,
"step": 1650
},
{
"epoch": 0.4544140920663318,
"grad_norm": 0.3057884616049347,
"learning_rate": 2.3201149948809473e-05,
"loss": 0.8097,
"step": 1651
},
{
"epoch": 0.4546893277368747,
"grad_norm": 0.29642780321189954,
"learning_rate": 2.3183901286243047e-05,
"loss": 0.8077,
"step": 1652
},
{
"epoch": 0.4549645634074176,
"grad_norm": 0.3064307972670116,
"learning_rate": 2.3166650193600123e-05,
"loss": 0.8146,
"step": 1653
},
{
"epoch": 0.4552397990779605,
"grad_norm": 0.298599499435739,
"learning_rate": 2.3149396684047397e-05,
"loss": 0.782,
"step": 1654
},
{
"epoch": 0.4555150347485034,
"grad_norm": 0.2609796353777113,
"learning_rate": 2.313214077075341e-05,
"loss": 0.8092,
"step": 1655
},
{
"epoch": 0.4557902704190463,
"grad_norm": 0.2982766885059548,
"learning_rate": 2.311488246688854e-05,
"loss": 0.7951,
"step": 1656
},
{
"epoch": 0.45606550608958923,
"grad_norm": 0.2882480134195979,
"learning_rate": 2.309762178562501e-05,
"loss": 0.7873,
"step": 1657
},
{
"epoch": 0.4563407417601321,
"grad_norm": 0.3153400577453351,
"learning_rate": 2.3080358740136822e-05,
"loss": 0.7921,
"step": 1658
},
{
"epoch": 0.456615977430675,
"grad_norm": 0.27485275779932977,
"learning_rate": 2.3063093343599806e-05,
"loss": 0.8,
"step": 1659
},
{
"epoch": 0.45689121310121794,
"grad_norm": 0.2958132200803276,
"learning_rate": 2.3045825609191578e-05,
"loss": 0.7663,
"step": 1660
},
{
"epoch": 0.4571664487717608,
"grad_norm": 0.27874225136860875,
"learning_rate": 2.3028555550091536e-05,
"loss": 0.8159,
"step": 1661
},
{
"epoch": 0.4574416844423037,
"grad_norm": 0.30278994866904807,
"learning_rate": 2.3011283179480862e-05,
"loss": 0.7959,
"step": 1662
},
{
"epoch": 0.45771692011284665,
"grad_norm": 0.2592295382331562,
"learning_rate": 2.2994008510542498e-05,
"loss": 0.7713,
"step": 1663
},
{
"epoch": 0.45799215578338953,
"grad_norm": 0.30398413168142274,
"learning_rate": 2.2976731556461135e-05,
"loss": 0.783,
"step": 1664
},
{
"epoch": 0.4582673914539324,
"grad_norm": 0.27671341614776185,
"learning_rate": 2.2959452330423217e-05,
"loss": 0.8502,
"step": 1665
},
{
"epoch": 0.45854262712447535,
"grad_norm": 0.31941357339594073,
"learning_rate": 2.2942170845616905e-05,
"loss": 0.8339,
"step": 1666
},
{
"epoch": 0.45881786279501824,
"grad_norm": 1.0728719259556911,
"learning_rate": 2.2924887115232113e-05,
"loss": 0.8286,
"step": 1667
},
{
"epoch": 0.4590930984655611,
"grad_norm": 0.32142749336199716,
"learning_rate": 2.2907601152460442e-05,
"loss": 0.7874,
"step": 1668
},
{
"epoch": 0.45936833413610406,
"grad_norm": 0.3430812451270998,
"learning_rate": 2.289031297049521e-05,
"loss": 0.7907,
"step": 1669
},
{
"epoch": 0.45964356980664695,
"grad_norm": 0.33332169085431984,
"learning_rate": 2.2873022582531412e-05,
"loss": 0.786,
"step": 1670
},
{
"epoch": 0.45991880547718983,
"grad_norm": 0.3186064264933752,
"learning_rate": 2.2855730001765763e-05,
"loss": 0.8062,
"step": 1671
},
{
"epoch": 0.46019404114773277,
"grad_norm": 0.31253625370356675,
"learning_rate": 2.2838435241396618e-05,
"loss": 0.7908,
"step": 1672
},
{
"epoch": 0.46046927681827565,
"grad_norm": 0.2969753207919644,
"learning_rate": 2.2821138314624e-05,
"loss": 0.8185,
"step": 1673
},
{
"epoch": 0.46074451248881854,
"grad_norm": 0.3022616202337864,
"learning_rate": 2.2803839234649604e-05,
"loss": 0.8005,
"step": 1674
},
{
"epoch": 0.4610197481593615,
"grad_norm": 0.3258353635434477,
"learning_rate": 2.278653801467675e-05,
"loss": 0.786,
"step": 1675
},
{
"epoch": 0.46129498382990436,
"grad_norm": 0.27754542177239094,
"learning_rate": 2.2769234667910394e-05,
"loss": 0.805,
"step": 1676
},
{
"epoch": 0.46157021950044724,
"grad_norm": 0.30896749485382285,
"learning_rate": 2.2751929207557124e-05,
"loss": 0.7995,
"step": 1677
},
{
"epoch": 0.4618454551709902,
"grad_norm": 0.277123554364044,
"learning_rate": 2.2734621646825145e-05,
"loss": 0.7906,
"step": 1678
},
{
"epoch": 0.46212069084153307,
"grad_norm": 0.36632232653377717,
"learning_rate": 2.2717311998924237e-05,
"loss": 0.7961,
"step": 1679
},
{
"epoch": 0.46239592651207595,
"grad_norm": 0.2791989728634918,
"learning_rate": 2.2700000277065805e-05,
"loss": 0.7912,
"step": 1680
},
{
"epoch": 0.4626711621826189,
"grad_norm": 0.29547976952313004,
"learning_rate": 2.2682686494462822e-05,
"loss": 0.8073,
"step": 1681
},
{
"epoch": 0.4629463978531618,
"grad_norm": 0.29194813535287817,
"learning_rate": 2.2665370664329834e-05,
"loss": 0.7869,
"step": 1682
},
{
"epoch": 0.46322163352370466,
"grad_norm": 0.3007751469987453,
"learning_rate": 2.2648052799882953e-05,
"loss": 0.7873,
"step": 1683
},
{
"epoch": 0.4634968691942476,
"grad_norm": 0.4010424059498456,
"learning_rate": 2.2630732914339836e-05,
"loss": 0.8353,
"step": 1684
},
{
"epoch": 0.4637721048647905,
"grad_norm": 0.3145067506452559,
"learning_rate": 2.2613411020919704e-05,
"loss": 0.8108,
"step": 1685
},
{
"epoch": 0.46404734053533336,
"grad_norm": 0.2933089618615493,
"learning_rate": 2.2596087132843287e-05,
"loss": 0.8128,
"step": 1686
},
{
"epoch": 0.4643225762058763,
"grad_norm": 0.28491725094157117,
"learning_rate": 2.257876126333284e-05,
"loss": 0.7935,
"step": 1687
},
{
"epoch": 0.4645978118764192,
"grad_norm": 0.29896200376966015,
"learning_rate": 2.256143342561214e-05,
"loss": 0.8101,
"step": 1688
},
{
"epoch": 0.46487304754696207,
"grad_norm": 0.3168832733933036,
"learning_rate": 2.2544103632906465e-05,
"loss": 0.8099,
"step": 1689
},
{
"epoch": 0.465148283217505,
"grad_norm": 0.36920663455628144,
"learning_rate": 2.252677189844259e-05,
"loss": 0.7669,
"step": 1690
},
{
"epoch": 0.4654235188880479,
"grad_norm": 0.4136014450183235,
"learning_rate": 2.2509438235448748e-05,
"loss": 0.7976,
"step": 1691
},
{
"epoch": 0.4656987545585908,
"grad_norm": 0.3330953429975218,
"learning_rate": 2.249210265715467e-05,
"loss": 0.7925,
"step": 1692
},
{
"epoch": 0.4659739902291337,
"grad_norm": 0.28640205388627815,
"learning_rate": 2.2474765176791532e-05,
"loss": 0.8072,
"step": 1693
},
{
"epoch": 0.4662492258996766,
"grad_norm": 0.28838476177714323,
"learning_rate": 2.2457425807591988e-05,
"loss": 0.7727,
"step": 1694
},
{
"epoch": 0.4665244615702195,
"grad_norm": 0.2787411327807749,
"learning_rate": 2.2440084562790085e-05,
"loss": 0.8043,
"step": 1695
},
{
"epoch": 0.4667996972407624,
"grad_norm": 0.28745042227663387,
"learning_rate": 2.242274145562136e-05,
"loss": 0.7948,
"step": 1696
},
{
"epoch": 0.4670749329113053,
"grad_norm": 0.27861867576324845,
"learning_rate": 2.2405396499322727e-05,
"loss": 0.7987,
"step": 1697
},
{
"epoch": 0.4673501685818482,
"grad_norm": 0.2670184013285605,
"learning_rate": 2.2388049707132527e-05,
"loss": 0.7943,
"step": 1698
},
{
"epoch": 0.46762540425239113,
"grad_norm": 0.2930704355624698,
"learning_rate": 2.2370701092290506e-05,
"loss": 0.7938,
"step": 1699
},
{
"epoch": 0.467900639922934,
"grad_norm": 0.2721421931599048,
"learning_rate": 2.23533506680378e-05,
"loss": 0.811,
"step": 1700
},
{
"epoch": 0.4681758755934769,
"grad_norm": 0.2698915180238021,
"learning_rate": 2.2335998447616918e-05,
"loss": 0.7921,
"step": 1701
},
{
"epoch": 0.46845111126401984,
"grad_norm": 0.3572038292718193,
"learning_rate": 2.2318644444271746e-05,
"loss": 0.7936,
"step": 1702
},
{
"epoch": 0.4687263469345627,
"grad_norm": 0.2798386134482875,
"learning_rate": 2.2301288671247532e-05,
"loss": 0.8357,
"step": 1703
},
{
"epoch": 0.4690015826051056,
"grad_norm": 0.2776607747426227,
"learning_rate": 2.228393114179087e-05,
"loss": 0.8117,
"step": 1704
},
{
"epoch": 0.46927681827564854,
"grad_norm": 0.3206463207137854,
"learning_rate": 2.2266571869149698e-05,
"loss": 0.7891,
"step": 1705
},
{
"epoch": 0.46955205394619143,
"grad_norm": 0.27957116412654204,
"learning_rate": 2.2249210866573287e-05,
"loss": 0.7742,
"step": 1706
},
{
"epoch": 0.4698272896167343,
"grad_norm": 0.3344496998668106,
"learning_rate": 2.2231848147312224e-05,
"loss": 0.8049,
"step": 1707
},
{
"epoch": 0.47010252528727725,
"grad_norm": 0.29992518479139924,
"learning_rate": 2.2214483724618406e-05,
"loss": 0.7837,
"step": 1708
},
{
"epoch": 0.47037776095782013,
"grad_norm": 0.29207296727357596,
"learning_rate": 2.2197117611745024e-05,
"loss": 0.7987,
"step": 1709
},
{
"epoch": 0.470652996628363,
"grad_norm": 0.3089860093733482,
"learning_rate": 2.217974982194658e-05,
"loss": 0.7949,
"step": 1710
},
{
"epoch": 0.47092823229890596,
"grad_norm": 0.3000964759823666,
"learning_rate": 2.2162380368478836e-05,
"loss": 0.7441,
"step": 1711
},
{
"epoch": 0.47120346796944884,
"grad_norm": 0.31947505972086276,
"learning_rate": 2.214500926459883e-05,
"loss": 0.819,
"step": 1712
},
{
"epoch": 0.4714787036399917,
"grad_norm": 0.25563854247524825,
"learning_rate": 2.212763652356486e-05,
"loss": 0.7923,
"step": 1713
},
{
"epoch": 0.47175393931053466,
"grad_norm": 0.3388312748649513,
"learning_rate": 2.2110262158636474e-05,
"loss": 0.7942,
"step": 1714
},
{
"epoch": 0.47202917498107755,
"grad_norm": 0.2649085883782548,
"learning_rate": 2.2092886183074464e-05,
"loss": 0.7988,
"step": 1715
},
{
"epoch": 0.47230441065162043,
"grad_norm": 0.29774010886809854,
"learning_rate": 2.2075508610140828e-05,
"loss": 0.7762,
"step": 1716
},
{
"epoch": 0.47257964632216337,
"grad_norm": 0.2837838715013209,
"learning_rate": 2.2058129453098826e-05,
"loss": 0.806,
"step": 1717
},
{
"epoch": 0.47285488199270626,
"grad_norm": 0.2728762648799962,
"learning_rate": 2.204074872521288e-05,
"loss": 0.8215,
"step": 1718
},
{
"epoch": 0.47313011766324914,
"grad_norm": 0.2710413956835945,
"learning_rate": 2.2023366439748647e-05,
"loss": 0.8194,
"step": 1719
},
{
"epoch": 0.4734053533337921,
"grad_norm": 0.5573222196343124,
"learning_rate": 2.2005982609972952e-05,
"loss": 0.786,
"step": 1720
},
{
"epoch": 0.47368058900433496,
"grad_norm": 0.2731160307935018,
"learning_rate": 2.1988597249153813e-05,
"loss": 0.7878,
"step": 1721
},
{
"epoch": 0.47395582467487785,
"grad_norm": 0.30399814931812263,
"learning_rate": 2.1971210370560402e-05,
"loss": 0.7796,
"step": 1722
},
{
"epoch": 0.4742310603454208,
"grad_norm": 0.2608241851456652,
"learning_rate": 2.1953821987463062e-05,
"loss": 0.7937,
"step": 1723
},
{
"epoch": 0.47450629601596367,
"grad_norm": 0.2931588669157855,
"learning_rate": 2.193643211313327e-05,
"loss": 0.7971,
"step": 1724
},
{
"epoch": 0.47478153168650655,
"grad_norm": 0.26117651761114935,
"learning_rate": 2.1919040760843663e-05,
"loss": 0.7802,
"step": 1725
},
{
"epoch": 0.4750567673570495,
"grad_norm": 0.2813837478909038,
"learning_rate": 2.1901647943867986e-05,
"loss": 0.7991,
"step": 1726
},
{
"epoch": 0.4753320030275924,
"grad_norm": 0.27209640929023815,
"learning_rate": 2.188425367548111e-05,
"loss": 0.8,
"step": 1727
},
{
"epoch": 0.47560723869813526,
"grad_norm": 0.2664957237643973,
"learning_rate": 2.186685796895901e-05,
"loss": 0.8048,
"step": 1728
},
{
"epoch": 0.4758824743686782,
"grad_norm": 0.2765271449471321,
"learning_rate": 2.1849460837578767e-05,
"loss": 0.7783,
"step": 1729
},
{
"epoch": 0.4761577100392211,
"grad_norm": 0.26359419929274464,
"learning_rate": 2.183206229461854e-05,
"loss": 0.7907,
"step": 1730
},
{
"epoch": 0.47643294570976397,
"grad_norm": 0.2728067104523246,
"learning_rate": 2.1814662353357567e-05,
"loss": 0.7896,
"step": 1731
},
{
"epoch": 0.4767081813803069,
"grad_norm": 0.2770972625384237,
"learning_rate": 2.1797261027076166e-05,
"loss": 0.7618,
"step": 1732
},
{
"epoch": 0.4769834170508498,
"grad_norm": 0.27716393138190776,
"learning_rate": 2.1779858329055688e-05,
"loss": 0.8056,
"step": 1733
},
{
"epoch": 0.4772586527213927,
"grad_norm": 0.7210957908286135,
"learning_rate": 2.176245427257855e-05,
"loss": 0.837,
"step": 1734
},
{
"epoch": 0.4775338883919356,
"grad_norm": 0.2722418496214004,
"learning_rate": 2.1745048870928208e-05,
"loss": 0.7975,
"step": 1735
},
{
"epoch": 0.4778091240624785,
"grad_norm": 0.2627307363683731,
"learning_rate": 2.1727642137389124e-05,
"loss": 0.7886,
"step": 1736
},
{
"epoch": 0.4780843597330214,
"grad_norm": 0.28372534817140965,
"learning_rate": 2.17102340852468e-05,
"loss": 0.759,
"step": 1737
},
{
"epoch": 0.4783595954035643,
"grad_norm": 0.26512671637247087,
"learning_rate": 2.1692824727787736e-05,
"loss": 0.771,
"step": 1738
},
{
"epoch": 0.4786348310741072,
"grad_norm": 0.28252988096499726,
"learning_rate": 2.1675414078299418e-05,
"loss": 0.8153,
"step": 1739
},
{
"epoch": 0.4789100667446501,
"grad_norm": 0.28314757859677153,
"learning_rate": 2.1658002150070332e-05,
"loss": 0.7748,
"step": 1740
},
{
"epoch": 0.479185302415193,
"grad_norm": 0.27183430887823945,
"learning_rate": 2.1640588956389923e-05,
"loss": 0.7949,
"step": 1741
},
{
"epoch": 0.4794605380857359,
"grad_norm": 0.3077901926897006,
"learning_rate": 2.1623174510548627e-05,
"loss": 0.7766,
"step": 1742
},
{
"epoch": 0.4797357737562788,
"grad_norm": 0.27760037753894373,
"learning_rate": 2.160575882583782e-05,
"loss": 0.8078,
"step": 1743
},
{
"epoch": 0.48001100942682173,
"grad_norm": 0.293236444387804,
"learning_rate": 2.1588341915549825e-05,
"loss": 0.7932,
"step": 1744
},
{
"epoch": 0.4802862450973646,
"grad_norm": 0.30811500300258376,
"learning_rate": 2.1570923792977893e-05,
"loss": 0.8057,
"step": 1745
},
{
"epoch": 0.4805614807679075,
"grad_norm": 0.2783070230175767,
"learning_rate": 2.155350447141622e-05,
"loss": 0.8013,
"step": 1746
},
{
"epoch": 0.48083671643845044,
"grad_norm": 0.2572646507800091,
"learning_rate": 2.1536083964159893e-05,
"loss": 0.789,
"step": 1747
},
{
"epoch": 0.4811119521089933,
"grad_norm": 0.28290675903026463,
"learning_rate": 2.1518662284504927e-05,
"loss": 0.798,
"step": 1748
},
{
"epoch": 0.4813871877795362,
"grad_norm": 0.2758544840675627,
"learning_rate": 2.150123944574822e-05,
"loss": 0.7961,
"step": 1749
},
{
"epoch": 0.48166242345007915,
"grad_norm": 0.628865638924377,
"learning_rate": 2.1483815461187553e-05,
"loss": 0.7901,
"step": 1750
},
{
"epoch": 0.48193765912062203,
"grad_norm": 0.2707563624141069,
"learning_rate": 2.1466390344121583e-05,
"loss": 0.8124,
"step": 1751
},
{
"epoch": 0.4822128947911649,
"grad_norm": 0.2831957978634998,
"learning_rate": 2.1448964107849828e-05,
"loss": 0.7904,
"step": 1752
},
{
"epoch": 0.48248813046170785,
"grad_norm": 0.29371461458299014,
"learning_rate": 2.1431536765672676e-05,
"loss": 0.7907,
"step": 1753
},
{
"epoch": 0.48276336613225074,
"grad_norm": 0.2581621035177647,
"learning_rate": 2.1414108330891348e-05,
"loss": 0.7765,
"step": 1754
},
{
"epoch": 0.4830386018027936,
"grad_norm": 0.2814056634036065,
"learning_rate": 2.139667881680789e-05,
"loss": 0.8158,
"step": 1755
},
{
"epoch": 0.48331383747333656,
"grad_norm": 0.2758666530494281,
"learning_rate": 2.137924823672518e-05,
"loss": 0.7859,
"step": 1756
},
{
"epoch": 0.48358907314387944,
"grad_norm": 0.39000091763762096,
"learning_rate": 2.1361816603946922e-05,
"loss": 0.7759,
"step": 1757
},
{
"epoch": 0.48386430881442233,
"grad_norm": 0.29037363845582215,
"learning_rate": 2.1344383931777606e-05,
"loss": 0.792,
"step": 1758
},
{
"epoch": 0.48413954448496527,
"grad_norm": 0.38418032710709565,
"learning_rate": 2.1326950233522515e-05,
"loss": 0.7993,
"step": 1759
},
{
"epoch": 0.48441478015550815,
"grad_norm": 0.29204665923332523,
"learning_rate": 2.130951552248773e-05,
"loss": 0.7665,
"step": 1760
},
{
"epoch": 0.48469001582605103,
"grad_norm": 0.291882163067355,
"learning_rate": 2.1292079811980093e-05,
"loss": 0.7819,
"step": 1761
},
{
"epoch": 0.484965251496594,
"grad_norm": 0.28631367096112953,
"learning_rate": 2.1274643115307207e-05,
"loss": 0.7981,
"step": 1762
},
{
"epoch": 0.48524048716713686,
"grad_norm": 0.28768312205681207,
"learning_rate": 2.125720544577744e-05,
"loss": 0.798,
"step": 1763
},
{
"epoch": 0.48551572283767974,
"grad_norm": 0.34242076178983794,
"learning_rate": 2.1239766816699894e-05,
"loss": 0.7956,
"step": 1764
},
{
"epoch": 0.4857909585082227,
"grad_norm": 0.2854851432802041,
"learning_rate": 2.12223272413844e-05,
"loss": 0.8174,
"step": 1765
},
{
"epoch": 0.48606619417876556,
"grad_norm": 0.26540351697584436,
"learning_rate": 2.120488673314152e-05,
"loss": 0.7867,
"step": 1766
},
{
"epoch": 0.48634142984930845,
"grad_norm": 0.2907226629348622,
"learning_rate": 2.1187445305282525e-05,
"loss": 0.8248,
"step": 1767
},
{
"epoch": 0.4866166655198514,
"grad_norm": 0.2698162490585244,
"learning_rate": 2.117000297111938e-05,
"loss": 0.8054,
"step": 1768
},
{
"epoch": 0.48689190119039427,
"grad_norm": 0.269232138249288,
"learning_rate": 2.115255974396476e-05,
"loss": 0.7755,
"step": 1769
},
{
"epoch": 0.48716713686093716,
"grad_norm": 0.2807591574601917,
"learning_rate": 2.1135115637131994e-05,
"loss": 0.7997,
"step": 1770
},
{
"epoch": 0.4874423725314801,
"grad_norm": 0.2770987432672441,
"learning_rate": 2.1117670663935118e-05,
"loss": 0.778,
"step": 1771
},
{
"epoch": 0.487717608202023,
"grad_norm": 0.2621201805827772,
"learning_rate": 2.1100224837688792e-05,
"loss": 0.7624,
"step": 1772
},
{
"epoch": 0.48799284387256586,
"grad_norm": 0.29584262114495097,
"learning_rate": 2.1082778171708355e-05,
"loss": 0.7917,
"step": 1773
},
{
"epoch": 0.4882680795431088,
"grad_norm": 0.28810584622906893,
"learning_rate": 2.1065330679309766e-05,
"loss": 0.8017,
"step": 1774
},
{
"epoch": 0.4885433152136517,
"grad_norm": 0.3037144161798492,
"learning_rate": 2.1047882373809646e-05,
"loss": 0.7912,
"step": 1775
},
{
"epoch": 0.48881855088419457,
"grad_norm": 0.3487331345848116,
"learning_rate": 2.10304332685252e-05,
"loss": 0.7938,
"step": 1776
},
{
"epoch": 0.4890937865547375,
"grad_norm": 0.29140261836006287,
"learning_rate": 2.1012983376774255e-05,
"loss": 0.7831,
"step": 1777
},
{
"epoch": 0.4893690222252804,
"grad_norm": 0.31938290483864246,
"learning_rate": 2.099553271187526e-05,
"loss": 0.7517,
"step": 1778
},
{
"epoch": 0.4896442578958233,
"grad_norm": 0.30007021053547217,
"learning_rate": 2.0978081287147218e-05,
"loss": 0.7896,
"step": 1779
},
{
"epoch": 0.4899194935663662,
"grad_norm": 0.2546187942290934,
"learning_rate": 2.0960629115909743e-05,
"loss": 0.7926,
"step": 1780
},
{
"epoch": 0.4901947292369091,
"grad_norm": 0.30089950412051,
"learning_rate": 2.0943176211483013e-05,
"loss": 0.7838,
"step": 1781
},
{
"epoch": 0.490469964907452,
"grad_norm": 0.30372815830362443,
"learning_rate": 2.092572258718774e-05,
"loss": 0.7852,
"step": 1782
},
{
"epoch": 0.4907452005779949,
"grad_norm": 0.2836246346667227,
"learning_rate": 2.090826825634522e-05,
"loss": 0.7827,
"step": 1783
},
{
"epoch": 0.4910204362485378,
"grad_norm": 0.28047859446672074,
"learning_rate": 2.0890813232277263e-05,
"loss": 0.7895,
"step": 1784
},
{
"epoch": 0.4912956719190807,
"grad_norm": 0.28040166068412964,
"learning_rate": 2.087335752830622e-05,
"loss": 0.7763,
"step": 1785
},
{
"epoch": 0.49157090758962363,
"grad_norm": 0.4580865112030622,
"learning_rate": 2.0855901157754964e-05,
"loss": 0.8046,
"step": 1786
},
{
"epoch": 0.4918461432601665,
"grad_norm": 0.3264974327831298,
"learning_rate": 2.0838444133946867e-05,
"loss": 0.8223,
"step": 1787
},
{
"epoch": 0.4921213789307094,
"grad_norm": 0.2669283921793564,
"learning_rate": 2.0820986470205805e-05,
"loss": 0.7801,
"step": 1788
},
{
"epoch": 0.49239661460125234,
"grad_norm": 0.47090998360415265,
"learning_rate": 2.0803528179856145e-05,
"loss": 0.8139,
"step": 1789
},
{
"epoch": 0.4926718502717952,
"grad_norm": 0.29972794899024474,
"learning_rate": 2.0786069276222722e-05,
"loss": 0.8035,
"step": 1790
},
{
"epoch": 0.4929470859423381,
"grad_norm": 0.2972130902539023,
"learning_rate": 2.076860977263085e-05,
"loss": 0.7858,
"step": 1791
},
{
"epoch": 0.49322232161288104,
"grad_norm": 0.28561626421570363,
"learning_rate": 2.0751149682406303e-05,
"loss": 0.7854,
"step": 1792
},
{
"epoch": 0.4934975572834239,
"grad_norm": 0.2955980744524161,
"learning_rate": 2.073368901887529e-05,
"loss": 0.7527,
"step": 1793
},
{
"epoch": 0.4937727929539668,
"grad_norm": 0.4196436861149892,
"learning_rate": 2.071622779536446e-05,
"loss": 0.8101,
"step": 1794
},
{
"epoch": 0.49404802862450975,
"grad_norm": 0.2970753345608763,
"learning_rate": 2.0698766025200897e-05,
"loss": 0.8199,
"step": 1795
},
{
"epoch": 0.49432326429505263,
"grad_norm": 0.27878974946916274,
"learning_rate": 2.0681303721712105e-05,
"loss": 0.8113,
"step": 1796
},
{
"epoch": 0.4945984999655955,
"grad_norm": 0.32790368278803866,
"learning_rate": 2.0663840898225982e-05,
"loss": 0.7836,
"step": 1797
},
{
"epoch": 0.49487373563613846,
"grad_norm": 0.2867737693561296,
"learning_rate": 2.064637756807083e-05,
"loss": 0.8134,
"step": 1798
},
{
"epoch": 0.49514897130668134,
"grad_norm": 0.32467707157300846,
"learning_rate": 2.0628913744575344e-05,
"loss": 0.7824,
"step": 1799
},
{
"epoch": 0.4954242069772242,
"grad_norm": 0.29139845858453167,
"learning_rate": 2.061144944106858e-05,
"loss": 0.8198,
"step": 1800
},
{
"epoch": 0.49569944264776716,
"grad_norm": 0.3761268661384045,
"learning_rate": 2.0593984670879973e-05,
"loss": 0.7907,
"step": 1801
},
{
"epoch": 0.49597467831831005,
"grad_norm": 0.2752478707286451,
"learning_rate": 2.0576519447339313e-05,
"loss": 0.8013,
"step": 1802
},
{
"epoch": 0.49624991398885293,
"grad_norm": 0.30508824765776554,
"learning_rate": 2.055905378377673e-05,
"loss": 0.8013,
"step": 1803
},
{
"epoch": 0.49652514965939587,
"grad_norm": 0.24694277564438605,
"learning_rate": 2.0541587693522694e-05,
"loss": 0.7752,
"step": 1804
},
{
"epoch": 0.49680038532993875,
"grad_norm": 0.35293258669054917,
"learning_rate": 2.0524121189908e-05,
"loss": 0.7877,
"step": 1805
},
{
"epoch": 0.49707562100048164,
"grad_norm": 0.2615144415699339,
"learning_rate": 2.050665428626376e-05,
"loss": 0.7906,
"step": 1806
},
{
"epoch": 0.4973508566710246,
"grad_norm": 0.26741314910235753,
"learning_rate": 2.0489186995921392e-05,
"loss": 0.7659,
"step": 1807
},
{
"epoch": 0.49762609234156746,
"grad_norm": 0.27073768541859894,
"learning_rate": 2.0471719332212605e-05,
"loss": 0.8053,
"step": 1808
},
{
"epoch": 0.49790132801211034,
"grad_norm": 0.25624827625159563,
"learning_rate": 2.045425130846939e-05,
"loss": 0.7721,
"step": 1809
},
{
"epoch": 0.4981765636826533,
"grad_norm": 0.27467751612423486,
"learning_rate": 2.0436782938024023e-05,
"loss": 0.7971,
"step": 1810
},
{
"epoch": 0.49845179935319617,
"grad_norm": 0.2540227526578231,
"learning_rate": 2.041931423420904e-05,
"loss": 0.7702,
"step": 1811
},
{
"epoch": 0.49872703502373905,
"grad_norm": 0.2537609456445603,
"learning_rate": 2.0401845210357222e-05,
"loss": 0.8158,
"step": 1812
},
{
"epoch": 0.499002270694282,
"grad_norm": 0.2553053297171385,
"learning_rate": 2.0384375879801622e-05,
"loss": 0.7945,
"step": 1813
},
{
"epoch": 0.4992775063648249,
"grad_norm": 0.23718379217472482,
"learning_rate": 2.036690625587549e-05,
"loss": 0.7967,
"step": 1814
},
{
"epoch": 0.49955274203536776,
"grad_norm": 0.26219879330390655,
"learning_rate": 2.0349436351912327e-05,
"loss": 0.8149,
"step": 1815
},
{
"epoch": 0.4998279777059107,
"grad_norm": 0.26072196556066396,
"learning_rate": 2.0331966181245835e-05,
"loss": 0.7824,
"step": 1816
},
{
"epoch": 0.5001032133764536,
"grad_norm": 0.236613647680266,
"learning_rate": 2.031449575720992e-05,
"loss": 0.7812,
"step": 1817
},
{
"epoch": 0.5003784490469965,
"grad_norm": 0.27423852596815435,
"learning_rate": 2.0297025093138697e-05,
"loss": 0.7727,
"step": 1818
},
{
"epoch": 0.5006536847175393,
"grad_norm": 0.5283423097781088,
"learning_rate": 2.0279554202366443e-05,
"loss": 0.7747,
"step": 1819
},
{
"epoch": 0.5009289203880823,
"grad_norm": 0.2660421101896664,
"learning_rate": 2.026208309822762e-05,
"loss": 0.7889,
"step": 1820
},
{
"epoch": 0.5012041560586252,
"grad_norm": 0.25963909382849104,
"learning_rate": 2.0244611794056846e-05,
"loss": 0.794,
"step": 1821
},
{
"epoch": 0.5014793917291681,
"grad_norm": 0.2871750017668787,
"learning_rate": 2.0227140303188895e-05,
"loss": 0.789,
"step": 1822
},
{
"epoch": 0.501754627399711,
"grad_norm": 0.27434033789371726,
"learning_rate": 2.0209668638958687e-05,
"loss": 0.7897,
"step": 1823
},
{
"epoch": 0.5020298630702539,
"grad_norm": 0.2794859206744288,
"learning_rate": 2.0192196814701278e-05,
"loss": 0.8211,
"step": 1824
},
{
"epoch": 0.5023050987407968,
"grad_norm": 0.27023359462501895,
"learning_rate": 2.0174724843751824e-05,
"loss": 0.7968,
"step": 1825
},
{
"epoch": 0.5025803344113398,
"grad_norm": 0.3088651290159606,
"learning_rate": 2.0157252739445624e-05,
"loss": 0.7835,
"step": 1826
},
{
"epoch": 0.5028555700818826,
"grad_norm": 0.2523274812488868,
"learning_rate": 2.0139780515118054e-05,
"loss": 0.7642,
"step": 1827
},
{
"epoch": 0.5031308057524255,
"grad_norm": 0.2901158820326341,
"learning_rate": 2.0122308184104587e-05,
"loss": 0.7728,
"step": 1828
},
{
"epoch": 0.5034060414229684,
"grad_norm": 0.2656362348103561,
"learning_rate": 2.0104835759740798e-05,
"loss": 0.8049,
"step": 1829
},
{
"epoch": 0.5036812770935113,
"grad_norm": 0.3040262021086047,
"learning_rate": 2.00873632553623e-05,
"loss": 0.7752,
"step": 1830
},
{
"epoch": 0.5039565127640542,
"grad_norm": 0.33692564783429974,
"learning_rate": 2.006989068430479e-05,
"loss": 0.782,
"step": 1831
},
{
"epoch": 0.5042317484345972,
"grad_norm": 0.2838371097622475,
"learning_rate": 2.005241805990401e-05,
"loss": 0.783,
"step": 1832
},
{
"epoch": 0.50450698410514,
"grad_norm": 0.28443192939303713,
"learning_rate": 2.003494539549574e-05,
"loss": 0.8035,
"step": 1833
},
{
"epoch": 0.5047822197756829,
"grad_norm": 0.2793398356762985,
"learning_rate": 2.001747270441579e-05,
"loss": 0.7697,
"step": 1834
},
{
"epoch": 0.5050574554462258,
"grad_norm": 0.27926091910752626,
"learning_rate": 2e-05,
"loss": 0.7907,
"step": 1835
},
{
"epoch": 0.5053326911167687,
"grad_norm": 0.2899739453078647,
"learning_rate": 1.9982527295584217e-05,
"loss": 0.7845,
"step": 1836
},
{
"epoch": 0.5056079267873116,
"grad_norm": 0.2760882542671676,
"learning_rate": 1.996505460450427e-05,
"loss": 0.7749,
"step": 1837
},
{
"epoch": 0.5058831624578546,
"grad_norm": 0.2930290348349952,
"learning_rate": 1.9947581940096e-05,
"loss": 0.7759,
"step": 1838
},
{
"epoch": 0.5061583981283975,
"grad_norm": 0.29413520625087847,
"learning_rate": 1.9930109315695212e-05,
"loss": 0.8076,
"step": 1839
},
{
"epoch": 0.5064336337989404,
"grad_norm": 0.2965867782023049,
"learning_rate": 1.9912636744637704e-05,
"loss": 0.8134,
"step": 1840
},
{
"epoch": 0.5067088694694832,
"grad_norm": 0.2726351152200352,
"learning_rate": 1.989516424025921e-05,
"loss": 0.7884,
"step": 1841
},
{
"epoch": 0.5069841051400261,
"grad_norm": 0.5284990385916277,
"learning_rate": 1.9877691815895416e-05,
"loss": 0.7711,
"step": 1842
},
{
"epoch": 0.507259340810569,
"grad_norm": 0.31078040704691867,
"learning_rate": 1.9860219484881953e-05,
"loss": 0.8002,
"step": 1843
},
{
"epoch": 0.507534576481112,
"grad_norm": 0.274453626099893,
"learning_rate": 1.9842747260554383e-05,
"loss": 0.7682,
"step": 1844
},
{
"epoch": 0.5078098121516549,
"grad_norm": 0.5039990309141663,
"learning_rate": 1.9825275156248183e-05,
"loss": 0.8001,
"step": 1845
},
{
"epoch": 0.5080850478221978,
"grad_norm": 0.26663518393366115,
"learning_rate": 1.9807803185298725e-05,
"loss": 0.8125,
"step": 1846
},
{
"epoch": 0.5083602834927406,
"grad_norm": 0.3302154261670141,
"learning_rate": 1.9790331361041316e-05,
"loss": 0.8097,
"step": 1847
},
{
"epoch": 0.5086355191632835,
"grad_norm": 0.2820575014419362,
"learning_rate": 1.977285969681111e-05,
"loss": 0.791,
"step": 1848
},
{
"epoch": 0.5089107548338264,
"grad_norm": 0.30828900340014714,
"learning_rate": 1.975538820594316e-05,
"loss": 0.8212,
"step": 1849
},
{
"epoch": 0.5091859905043694,
"grad_norm": 0.27770905907922044,
"learning_rate": 1.9737916901772387e-05,
"loss": 0.7995,
"step": 1850
},
{
"epoch": 0.5094612261749123,
"grad_norm": 0.4189477872834542,
"learning_rate": 1.9720445797633564e-05,
"loss": 0.7752,
"step": 1851
},
{
"epoch": 0.5097364618454552,
"grad_norm": 0.27017071599393705,
"learning_rate": 1.9702974906861313e-05,
"loss": 0.8072,
"step": 1852
},
{
"epoch": 0.5100116975159981,
"grad_norm": 0.32253948520203274,
"learning_rate": 1.968550424279008e-05,
"loss": 0.7607,
"step": 1853
},
{
"epoch": 0.510286933186541,
"grad_norm": 0.2849398772803456,
"learning_rate": 1.9668033818754172e-05,
"loss": 0.7822,
"step": 1854
},
{
"epoch": 0.5105621688570838,
"grad_norm": 0.30576670900428615,
"learning_rate": 1.9650563648087676e-05,
"loss": 0.776,
"step": 1855
},
{
"epoch": 0.5108374045276268,
"grad_norm": 0.3059638528133474,
"learning_rate": 1.9633093744124513e-05,
"loss": 0.7778,
"step": 1856
},
{
"epoch": 0.5111126401981697,
"grad_norm": 0.2853091596695262,
"learning_rate": 1.9615624120198385e-05,
"loss": 0.7879,
"step": 1857
},
{
"epoch": 0.5113878758687126,
"grad_norm": 0.279440207179744,
"learning_rate": 1.959815478964278e-05,
"loss": 0.7934,
"step": 1858
},
{
"epoch": 0.5116631115392555,
"grad_norm": 0.26715188895634223,
"learning_rate": 1.9580685765790967e-05,
"loss": 0.7663,
"step": 1859
},
{
"epoch": 0.5119383472097984,
"grad_norm": 0.26912141118388283,
"learning_rate": 1.956321706197598e-05,
"loss": 0.7929,
"step": 1860
},
{
"epoch": 0.5122135828803412,
"grad_norm": 0.25812474718831835,
"learning_rate": 1.9545748691530613e-05,
"loss": 0.7892,
"step": 1861
},
{
"epoch": 0.5124888185508842,
"grad_norm": 0.2782469711985159,
"learning_rate": 1.9528280667787402e-05,
"loss": 0.8091,
"step": 1862
},
{
"epoch": 0.5127640542214271,
"grad_norm": 0.2855279171052471,
"learning_rate": 1.9510813004078615e-05,
"loss": 0.8117,
"step": 1863
},
{
"epoch": 0.51303928989197,
"grad_norm": 0.28253600322665207,
"learning_rate": 1.9493345713736248e-05,
"loss": 0.8074,
"step": 1864
},
{
"epoch": 0.5133145255625129,
"grad_norm": 0.28782847388424193,
"learning_rate": 1.9475878810092005e-05,
"loss": 0.7919,
"step": 1865
},
{
"epoch": 0.5135897612330558,
"grad_norm": 0.27136792881072175,
"learning_rate": 1.9458412306477316e-05,
"loss": 0.8043,
"step": 1866
},
{
"epoch": 0.5138649969035987,
"grad_norm": 0.29449075942078307,
"learning_rate": 1.944094621622328e-05,
"loss": 0.76,
"step": 1867
},
{
"epoch": 0.5141402325741417,
"grad_norm": 0.25669349944292563,
"learning_rate": 1.942348055266069e-05,
"loss": 0.7584,
"step": 1868
},
{
"epoch": 0.5144154682446845,
"grad_norm": 0.26624978552777906,
"learning_rate": 1.940601532912003e-05,
"loss": 0.7965,
"step": 1869
},
{
"epoch": 0.5146907039152274,
"grad_norm": 0.26487169146946327,
"learning_rate": 1.938855055893143e-05,
"loss": 0.7862,
"step": 1870
},
{
"epoch": 0.5149659395857703,
"grad_norm": 0.2638987307765772,
"learning_rate": 1.9371086255424662e-05,
"loss": 0.786,
"step": 1871
},
{
"epoch": 0.5152411752563132,
"grad_norm": 0.25559559372955387,
"learning_rate": 1.9353622431929175e-05,
"loss": 0.7935,
"step": 1872
},
{
"epoch": 0.5155164109268561,
"grad_norm": 0.26630601315009644,
"learning_rate": 1.9336159101774025e-05,
"loss": 0.7826,
"step": 1873
},
{
"epoch": 0.5157916465973991,
"grad_norm": 0.2660509295352382,
"learning_rate": 1.9318696278287905e-05,
"loss": 0.7878,
"step": 1874
},
{
"epoch": 0.516066882267942,
"grad_norm": 0.2615994462412795,
"learning_rate": 1.9301233974799107e-05,
"loss": 0.7931,
"step": 1875
},
{
"epoch": 0.5163421179384848,
"grad_norm": 0.2729844686108098,
"learning_rate": 1.9283772204635544e-05,
"loss": 0.8023,
"step": 1876
},
{
"epoch": 0.5166173536090277,
"grad_norm": 0.31472095061773553,
"learning_rate": 1.9266310981124717e-05,
"loss": 0.8158,
"step": 1877
},
{
"epoch": 0.5168925892795706,
"grad_norm": 0.2829747043779742,
"learning_rate": 1.92488503175937e-05,
"loss": 0.7757,
"step": 1878
},
{
"epoch": 0.5171678249501135,
"grad_norm": 0.266646264944014,
"learning_rate": 1.9231390227369152e-05,
"loss": 0.8025,
"step": 1879
},
{
"epoch": 0.5174430606206565,
"grad_norm": 0.25708171952330294,
"learning_rate": 1.9213930723777285e-05,
"loss": 0.7672,
"step": 1880
},
{
"epoch": 0.5177182962911994,
"grad_norm": 0.2856031088074033,
"learning_rate": 1.919647182014386e-05,
"loss": 0.7851,
"step": 1881
},
{
"epoch": 0.5179935319617422,
"grad_norm": 0.250364937205058,
"learning_rate": 1.9179013529794195e-05,
"loss": 0.8055,
"step": 1882
},
{
"epoch": 0.5182687676322851,
"grad_norm": 0.26899840968706573,
"learning_rate": 1.9161555866053136e-05,
"loss": 0.755,
"step": 1883
},
{
"epoch": 0.518544003302828,
"grad_norm": 0.25350280903092137,
"learning_rate": 1.9144098842245042e-05,
"loss": 0.7899,
"step": 1884
},
{
"epoch": 0.5188192389733709,
"grad_norm": 0.27039801347560255,
"learning_rate": 1.912664247169379e-05,
"loss": 0.7617,
"step": 1885
},
{
"epoch": 0.5190944746439139,
"grad_norm": 0.26826753895162614,
"learning_rate": 1.9109186767722743e-05,
"loss": 0.7804,
"step": 1886
},
{
"epoch": 0.5193697103144568,
"grad_norm": 0.25225340441463456,
"learning_rate": 1.9091731743654792e-05,
"loss": 0.7799,
"step": 1887
},
{
"epoch": 0.5196449459849997,
"grad_norm": 0.2712241046085995,
"learning_rate": 1.907427741281227e-05,
"loss": 0.7956,
"step": 1888
},
{
"epoch": 0.5199201816555425,
"grad_norm": 0.261010355273269,
"learning_rate": 1.905682378851699e-05,
"loss": 0.7806,
"step": 1889
},
{
"epoch": 0.5201954173260854,
"grad_norm": 0.27913054691319983,
"learning_rate": 1.9039370884090256e-05,
"loss": 0.7827,
"step": 1890
},
{
"epoch": 0.5204706529966283,
"grad_norm": 0.26569515334185306,
"learning_rate": 1.9021918712852785e-05,
"loss": 0.7793,
"step": 1891
},
{
"epoch": 0.5207458886671713,
"grad_norm": 0.25150170325041604,
"learning_rate": 1.9004467288124746e-05,
"loss": 0.7626,
"step": 1892
},
{
"epoch": 0.5210211243377142,
"grad_norm": 0.2660344293365876,
"learning_rate": 1.8987016623225748e-05,
"loss": 0.7686,
"step": 1893
},
{
"epoch": 0.5212963600082571,
"grad_norm": 0.2713633212540108,
"learning_rate": 1.896956673147481e-05,
"loss": 0.7753,
"step": 1894
},
{
"epoch": 0.5215715956788,
"grad_norm": 0.260961251206183,
"learning_rate": 1.8952117626190364e-05,
"loss": 0.7677,
"step": 1895
},
{
"epoch": 0.5218468313493428,
"grad_norm": 0.27477723765459183,
"learning_rate": 1.893466932069023e-05,
"loss": 0.7499,
"step": 1896
},
{
"epoch": 0.5221220670198857,
"grad_norm": 0.25867130600828864,
"learning_rate": 1.8917221828291652e-05,
"loss": 0.8165,
"step": 1897
},
{
"epoch": 0.5223973026904287,
"grad_norm": 0.28667815675574226,
"learning_rate": 1.889977516231121e-05,
"loss": 0.805,
"step": 1898
},
{
"epoch": 0.5226725383609716,
"grad_norm": 0.26150363141638605,
"learning_rate": 1.8882329336064892e-05,
"loss": 0.8143,
"step": 1899
},
{
"epoch": 0.5229477740315145,
"grad_norm": 0.2804131727213887,
"learning_rate": 1.886488436286801e-05,
"loss": 0.8133,
"step": 1900
},
{
"epoch": 0.5232230097020574,
"grad_norm": 0.25048597469911027,
"learning_rate": 1.8847440256035252e-05,
"loss": 0.7654,
"step": 1901
},
{
"epoch": 0.5234982453726003,
"grad_norm": 0.26999491057017366,
"learning_rate": 1.8829997028880625e-05,
"loss": 0.8118,
"step": 1902
},
{
"epoch": 0.5237734810431431,
"grad_norm": 0.2775730535331951,
"learning_rate": 1.881255469471748e-05,
"loss": 0.7955,
"step": 1903
},
{
"epoch": 0.5240487167136861,
"grad_norm": 0.2680115716391252,
"learning_rate": 1.8795113266858483e-05,
"loss": 0.7818,
"step": 1904
},
{
"epoch": 0.524323952384229,
"grad_norm": 0.2752545455895527,
"learning_rate": 1.8777672758615604e-05,
"loss": 0.7856,
"step": 1905
},
{
"epoch": 0.5245991880547719,
"grad_norm": 0.27231929454550835,
"learning_rate": 1.8760233183300112e-05,
"loss": 0.8003,
"step": 1906
},
{
"epoch": 0.5248744237253148,
"grad_norm": 0.2798918244464111,
"learning_rate": 1.8742794554222568e-05,
"loss": 0.811,
"step": 1907
},
{
"epoch": 0.5251496593958577,
"grad_norm": 0.286642385052349,
"learning_rate": 1.87253568846928e-05,
"loss": 0.7648,
"step": 1908
},
{
"epoch": 0.5254248950664006,
"grad_norm": 0.2684095848293027,
"learning_rate": 1.8707920188019917e-05,
"loss": 0.7969,
"step": 1909
},
{
"epoch": 0.5257001307369435,
"grad_norm": 0.2719302405508206,
"learning_rate": 1.8690484477512272e-05,
"loss": 0.7954,
"step": 1910
},
{
"epoch": 0.5259753664074864,
"grad_norm": 0.2598519706473702,
"learning_rate": 1.8673049766477488e-05,
"loss": 0.8129,
"step": 1911
},
{
"epoch": 0.5262506020780293,
"grad_norm": 0.2758876019629264,
"learning_rate": 1.86556160682224e-05,
"loss": 0.7725,
"step": 1912
},
{
"epoch": 0.5265258377485722,
"grad_norm": 0.31537757282624546,
"learning_rate": 1.863818339605308e-05,
"loss": 0.7699,
"step": 1913
},
{
"epoch": 0.5268010734191151,
"grad_norm": 0.26261219810372477,
"learning_rate": 1.862075176327482e-05,
"loss": 0.8071,
"step": 1914
},
{
"epoch": 0.527076309089658,
"grad_norm": 0.25963322205954664,
"learning_rate": 1.8603321183192118e-05,
"loss": 0.773,
"step": 1915
},
{
"epoch": 0.527351544760201,
"grad_norm": 0.279823419586967,
"learning_rate": 1.8585891669108662e-05,
"loss": 0.8112,
"step": 1916
},
{
"epoch": 0.5276267804307438,
"grad_norm": 0.2836224263795011,
"learning_rate": 1.856846323432733e-05,
"loss": 0.7739,
"step": 1917
},
{
"epoch": 0.5279020161012867,
"grad_norm": 0.7075412114248868,
"learning_rate": 1.8551035892150176e-05,
"loss": 0.8135,
"step": 1918
},
{
"epoch": 0.5281772517718296,
"grad_norm": 0.27593565525559094,
"learning_rate": 1.853360965587842e-05,
"loss": 0.7884,
"step": 1919
},
{
"epoch": 0.5284524874423725,
"grad_norm": 0.2648960687542547,
"learning_rate": 1.8516184538812454e-05,
"loss": 0.7755,
"step": 1920
},
{
"epoch": 0.5287277231129154,
"grad_norm": 0.27460823148077607,
"learning_rate": 1.8498760554251788e-05,
"loss": 0.7938,
"step": 1921
},
{
"epoch": 0.5290029587834584,
"grad_norm": 0.25980882335955263,
"learning_rate": 1.848133771549508e-05,
"loss": 0.7612,
"step": 1922
},
{
"epoch": 0.5292781944540013,
"grad_norm": 0.2771174473857577,
"learning_rate": 1.8463916035840114e-05,
"loss": 0.7937,
"step": 1923
},
{
"epoch": 0.5295534301245441,
"grad_norm": 0.25927594122103753,
"learning_rate": 1.844649552858379e-05,
"loss": 0.8126,
"step": 1924
},
{
"epoch": 0.529828665795087,
"grad_norm": 0.28591007027338844,
"learning_rate": 1.8429076207022107e-05,
"loss": 0.8046,
"step": 1925
},
{
"epoch": 0.5301039014656299,
"grad_norm": 0.2837803520167671,
"learning_rate": 1.841165808445018e-05,
"loss": 0.8083,
"step": 1926
},
{
"epoch": 0.5303791371361728,
"grad_norm": 0.28474195324148543,
"learning_rate": 1.8394241174162184e-05,
"loss": 0.7906,
"step": 1927
},
{
"epoch": 0.5306543728067158,
"grad_norm": 0.28226403517159054,
"learning_rate": 1.837682548945138e-05,
"loss": 0.7982,
"step": 1928
},
{
"epoch": 0.5309296084772587,
"grad_norm": 0.2950887256233825,
"learning_rate": 1.8359411043610083e-05,
"loss": 0.8103,
"step": 1929
},
{
"epoch": 0.5312048441478016,
"grad_norm": 0.3031647509151272,
"learning_rate": 1.834199784992968e-05,
"loss": 0.8108,
"step": 1930
},
{
"epoch": 0.5314800798183444,
"grad_norm": 0.2709956390615864,
"learning_rate": 1.8324585921700592e-05,
"loss": 0.7783,
"step": 1931
},
{
"epoch": 0.5317553154888873,
"grad_norm": 0.2902736025344367,
"learning_rate": 1.8307175272212267e-05,
"loss": 0.7876,
"step": 1932
},
{
"epoch": 0.5320305511594302,
"grad_norm": 0.2774226425243251,
"learning_rate": 1.82897659147532e-05,
"loss": 0.7913,
"step": 1933
},
{
"epoch": 0.5323057868299732,
"grad_norm": 0.3130545227057113,
"learning_rate": 1.827235786261088e-05,
"loss": 0.7881,
"step": 1934
},
{
"epoch": 0.5325810225005161,
"grad_norm": 0.27805933752825074,
"learning_rate": 1.8254951129071795e-05,
"loss": 0.7695,
"step": 1935
},
{
"epoch": 0.532856258171059,
"grad_norm": 0.2916677849268343,
"learning_rate": 1.8237545727421455e-05,
"loss": 0.8079,
"step": 1936
},
{
"epoch": 0.5331314938416019,
"grad_norm": 0.2772032184415956,
"learning_rate": 1.8220141670944322e-05,
"loss": 0.8093,
"step": 1937
},
{
"epoch": 0.5334067295121447,
"grad_norm": 0.31938560616910966,
"learning_rate": 1.8202738972923848e-05,
"loss": 0.7775,
"step": 1938
},
{
"epoch": 0.5336819651826876,
"grad_norm": 0.28385693152230135,
"learning_rate": 1.8185337646642436e-05,
"loss": 0.7873,
"step": 1939
},
{
"epoch": 0.5339572008532306,
"grad_norm": 0.2920670205085561,
"learning_rate": 1.816793770538147e-05,
"loss": 0.7941,
"step": 1940
},
{
"epoch": 0.5342324365237735,
"grad_norm": 0.2523907852517901,
"learning_rate": 1.8150539162421236e-05,
"loss": 0.7784,
"step": 1941
},
{
"epoch": 0.5345076721943164,
"grad_norm": 0.26872016920154207,
"learning_rate": 1.8133142031040995e-05,
"loss": 0.7688,
"step": 1942
},
{
"epoch": 0.5347829078648593,
"grad_norm": 0.2630262231408708,
"learning_rate": 1.81157463245189e-05,
"loss": 0.782,
"step": 1943
},
{
"epoch": 0.5350581435354022,
"grad_norm": 0.2463270254186401,
"learning_rate": 1.809835205613202e-05,
"loss": 0.7752,
"step": 1944
},
{
"epoch": 0.535333379205945,
"grad_norm": 0.2550154470138386,
"learning_rate": 1.808095923915634e-05,
"loss": 0.8081,
"step": 1945
},
{
"epoch": 0.535608614876488,
"grad_norm": 0.23806834115134007,
"learning_rate": 1.8063567886866732e-05,
"loss": 0.7873,
"step": 1946
},
{
"epoch": 0.5358838505470309,
"grad_norm": 0.28055233795604595,
"learning_rate": 1.804617801253694e-05,
"loss": 0.7951,
"step": 1947
},
{
"epoch": 0.5361590862175738,
"grad_norm": 0.25142578647695374,
"learning_rate": 1.80287896294396e-05,
"loss": 0.7438,
"step": 1948
},
{
"epoch": 0.5364343218881167,
"grad_norm": 0.2848291687917642,
"learning_rate": 1.8011402750846194e-05,
"loss": 0.7922,
"step": 1949
},
{
"epoch": 0.5367095575586596,
"grad_norm": 0.26649225347637134,
"learning_rate": 1.7994017390027055e-05,
"loss": 0.806,
"step": 1950
},
{
"epoch": 0.5369847932292025,
"grad_norm": 0.25283388778282584,
"learning_rate": 1.797663356025136e-05,
"loss": 0.7918,
"step": 1951
},
{
"epoch": 0.5372600288997454,
"grad_norm": 0.26392808637936516,
"learning_rate": 1.795925127478713e-05,
"loss": 0.8285,
"step": 1952
},
{
"epoch": 0.5375352645702883,
"grad_norm": 0.24560111116558545,
"learning_rate": 1.7941870546901178e-05,
"loss": 0.7837,
"step": 1953
},
{
"epoch": 0.5378105002408312,
"grad_norm": 0.28234253483537725,
"learning_rate": 1.7924491389859172e-05,
"loss": 0.7894,
"step": 1954
},
{
"epoch": 0.5380857359113741,
"grad_norm": 0.2594093161334092,
"learning_rate": 1.7907113816925546e-05,
"loss": 0.8012,
"step": 1955
},
{
"epoch": 0.538360971581917,
"grad_norm": 0.2779218796259984,
"learning_rate": 1.788973784136353e-05,
"loss": 0.7862,
"step": 1956
},
{
"epoch": 0.5386362072524599,
"grad_norm": 0.2710859571554646,
"learning_rate": 1.7872363476435142e-05,
"loss": 0.7618,
"step": 1957
},
{
"epoch": 0.5389114429230029,
"grad_norm": 0.2676404382532293,
"learning_rate": 1.7854990735401174e-05,
"loss": 0.8052,
"step": 1958
},
{
"epoch": 0.5391866785935457,
"grad_norm": 0.2915039086597559,
"learning_rate": 1.783761963152117e-05,
"loss": 0.7833,
"step": 1959
},
{
"epoch": 0.5394619142640886,
"grad_norm": 0.2501789605795621,
"learning_rate": 1.782025017805342e-05,
"loss": 0.7843,
"step": 1960
},
{
"epoch": 0.5397371499346315,
"grad_norm": 0.26885451833283674,
"learning_rate": 1.780288238825498e-05,
"loss": 0.7741,
"step": 1961
},
{
"epoch": 0.5400123856051744,
"grad_norm": 0.25660414107103297,
"learning_rate": 1.77855162753816e-05,
"loss": 0.7673,
"step": 1962
},
{
"epoch": 0.5402876212757173,
"grad_norm": 0.2756794470378616,
"learning_rate": 1.776815185268778e-05,
"loss": 0.7916,
"step": 1963
},
{
"epoch": 0.5405628569462603,
"grad_norm": 0.2648244605605683,
"learning_rate": 1.7750789133426716e-05,
"loss": 0.805,
"step": 1964
},
{
"epoch": 0.5408380926168032,
"grad_norm": 0.27631326270263823,
"learning_rate": 1.773342813085031e-05,
"loss": 0.7911,
"step": 1965
},
{
"epoch": 0.541113328287346,
"grad_norm": 0.25810654406129824,
"learning_rate": 1.771606885820914e-05,
"loss": 0.7807,
"step": 1966
},
{
"epoch": 0.5413885639578889,
"grad_norm": 0.2852652284331418,
"learning_rate": 1.7698711328752474e-05,
"loss": 0.793,
"step": 1967
},
{
"epoch": 0.5416637996284318,
"grad_norm": 0.25322695807372597,
"learning_rate": 1.7681355555728257e-05,
"loss": 0.7831,
"step": 1968
},
{
"epoch": 0.5419390352989747,
"grad_norm": 0.27734794936103,
"learning_rate": 1.766400155238309e-05,
"loss": 0.786,
"step": 1969
},
{
"epoch": 0.5422142709695177,
"grad_norm": 0.28437142254364345,
"learning_rate": 1.7646649331962206e-05,
"loss": 0.786,
"step": 1970
},
{
"epoch": 0.5424895066400606,
"grad_norm": 0.2605076942673118,
"learning_rate": 1.76292989077095e-05,
"loss": 0.778,
"step": 1971
},
{
"epoch": 0.5427647423106035,
"grad_norm": 0.2773342212685139,
"learning_rate": 1.7611950292867476e-05,
"loss": 0.77,
"step": 1972
},
{
"epoch": 0.5430399779811463,
"grad_norm": 0.24994322864157123,
"learning_rate": 1.759460350067728e-05,
"loss": 0.7897,
"step": 1973
},
{
"epoch": 0.5433152136516892,
"grad_norm": 0.2923516549091756,
"learning_rate": 1.757725854437865e-05,
"loss": 0.7555,
"step": 1974
},
{
"epoch": 0.5435904493222322,
"grad_norm": 0.2530332743673048,
"learning_rate": 1.7559915437209912e-05,
"loss": 0.7776,
"step": 1975
},
{
"epoch": 0.5438656849927751,
"grad_norm": 0.29298196056658304,
"learning_rate": 1.7542574192408022e-05,
"loss": 0.8423,
"step": 1976
},
{
"epoch": 0.544140920663318,
"grad_norm": 0.2612608891964756,
"learning_rate": 1.752523482320847e-05,
"loss": 0.801,
"step": 1977
},
{
"epoch": 0.5444161563338609,
"grad_norm": 0.26183185205119774,
"learning_rate": 1.7507897342845338e-05,
"loss": 0.7763,
"step": 1978
},
{
"epoch": 0.5446913920044038,
"grad_norm": 0.28206152820613933,
"learning_rate": 1.749056176455126e-05,
"loss": 0.7919,
"step": 1979
},
{
"epoch": 0.5449666276749466,
"grad_norm": 0.24624504473837036,
"learning_rate": 1.747322810155742e-05,
"loss": 0.7645,
"step": 1980
},
{
"epoch": 0.5452418633454896,
"grad_norm": 0.2899629373082364,
"learning_rate": 1.745589636709354e-05,
"loss": 0.7709,
"step": 1981
},
{
"epoch": 0.5455170990160325,
"grad_norm": 0.239110965729579,
"learning_rate": 1.7438566574387864e-05,
"loss": 0.7692,
"step": 1982
},
{
"epoch": 0.5457923346865754,
"grad_norm": 0.25955568535138723,
"learning_rate": 1.742123873666717e-05,
"loss": 0.7918,
"step": 1983
},
{
"epoch": 0.5460675703571183,
"grad_norm": 0.2563500845539961,
"learning_rate": 1.740391286715672e-05,
"loss": 0.7589,
"step": 1984
},
{
"epoch": 0.5463428060276612,
"grad_norm": 0.27264598390987943,
"learning_rate": 1.7386588979080303e-05,
"loss": 0.8072,
"step": 1985
},
{
"epoch": 0.546618041698204,
"grad_norm": 0.25736047422488356,
"learning_rate": 1.7369267085660167e-05,
"loss": 0.7853,
"step": 1986
},
{
"epoch": 0.546893277368747,
"grad_norm": 0.25806570825572667,
"learning_rate": 1.7351947200117057e-05,
"loss": 0.7802,
"step": 1987
},
{
"epoch": 0.5471685130392899,
"grad_norm": 0.2485352254529487,
"learning_rate": 1.7334629335670176e-05,
"loss": 0.7829,
"step": 1988
},
{
"epoch": 0.5474437487098328,
"grad_norm": 0.26105039379455175,
"learning_rate": 1.7317313505537184e-05,
"loss": 0.7842,
"step": 1989
},
{
"epoch": 0.5477189843803757,
"grad_norm": 0.23580315780505323,
"learning_rate": 1.72999997229342e-05,
"loss": 0.7857,
"step": 1990
},
{
"epoch": 0.5479942200509186,
"grad_norm": 0.2597530681592618,
"learning_rate": 1.7282688001075766e-05,
"loss": 0.7875,
"step": 1991
},
{
"epoch": 0.5482694557214615,
"grad_norm": 0.2618080097326894,
"learning_rate": 1.7265378353174865e-05,
"loss": 0.7899,
"step": 1992
},
{
"epoch": 0.5485446913920045,
"grad_norm": 0.2344790619860551,
"learning_rate": 1.724807079244288e-05,
"loss": 0.7602,
"step": 1993
},
{
"epoch": 0.5488199270625473,
"grad_norm": 0.25733670318435226,
"learning_rate": 1.7230765332089613e-05,
"loss": 0.7769,
"step": 1994
},
{
"epoch": 0.5490951627330902,
"grad_norm": 0.24042937004196524,
"learning_rate": 1.721346198532326e-05,
"loss": 0.7698,
"step": 1995
},
{
"epoch": 0.5493703984036331,
"grad_norm": 0.2421860357089892,
"learning_rate": 1.71961607653504e-05,
"loss": 0.7814,
"step": 1996
},
{
"epoch": 0.549645634074176,
"grad_norm": 0.24364077587803198,
"learning_rate": 1.7178861685376004e-05,
"loss": 0.7571,
"step": 1997
},
{
"epoch": 0.5499208697447189,
"grad_norm": 0.2447840232382594,
"learning_rate": 1.7161564758603392e-05,
"loss": 0.7752,
"step": 1998
},
{
"epoch": 0.5501961054152619,
"grad_norm": 0.628668640344974,
"learning_rate": 1.7144269998234244e-05,
"loss": 0.7966,
"step": 1999
},
{
"epoch": 0.5504713410858048,
"grad_norm": 0.2571287619838796,
"learning_rate": 1.712697741746859e-05,
"loss": 0.8053,
"step": 2000
},
{
"epoch": 0.5507465767563476,
"grad_norm": 0.26695455010383273,
"learning_rate": 1.7109687029504805e-05,
"loss": 0.7676,
"step": 2001
},
{
"epoch": 0.5510218124268905,
"grad_norm": 0.2573235757930642,
"learning_rate": 1.709239884753957e-05,
"loss": 0.814,
"step": 2002
},
{
"epoch": 0.5512970480974334,
"grad_norm": 0.2751273845472534,
"learning_rate": 1.707511288476789e-05,
"loss": 0.805,
"step": 2003
},
{
"epoch": 0.5515722837679763,
"grad_norm": 0.272689116456555,
"learning_rate": 1.7057829154383095e-05,
"loss": 0.7824,
"step": 2004
},
{
"epoch": 0.5518475194385193,
"grad_norm": 0.2740587491604914,
"learning_rate": 1.704054766957679e-05,
"loss": 0.7973,
"step": 2005
},
{
"epoch": 0.5521227551090622,
"grad_norm": 0.25333928990399196,
"learning_rate": 1.7023268443538868e-05,
"loss": 0.8045,
"step": 2006
},
{
"epoch": 0.552397990779605,
"grad_norm": 0.26336090249565214,
"learning_rate": 1.700599148945751e-05,
"loss": 0.7995,
"step": 2007
},
{
"epoch": 0.5526732264501479,
"grad_norm": 0.2621200552071566,
"learning_rate": 1.6988716820519145e-05,
"loss": 0.766,
"step": 2008
},
{
"epoch": 0.5529484621206908,
"grad_norm": 0.25582732912748624,
"learning_rate": 1.6971444449908474e-05,
"loss": 0.7864,
"step": 2009
},
{
"epoch": 0.5532236977912337,
"grad_norm": 0.2604799412347714,
"learning_rate": 1.695417439080843e-05,
"loss": 0.7877,
"step": 2010
},
{
"epoch": 0.5534989334617767,
"grad_norm": 0.25328887720164894,
"learning_rate": 1.6936906656400197e-05,
"loss": 0.7656,
"step": 2011
},
{
"epoch": 0.5537741691323196,
"grad_norm": 0.2534440213109559,
"learning_rate": 1.691964125986318e-05,
"loss": 0.7907,
"step": 2012
},
{
"epoch": 0.5540494048028625,
"grad_norm": 0.2476470570149038,
"learning_rate": 1.6902378214374995e-05,
"loss": 0.7697,
"step": 2013
},
{
"epoch": 0.5543246404734054,
"grad_norm": 0.2694213175230382,
"learning_rate": 1.6885117533111463e-05,
"loss": 0.7988,
"step": 2014
},
{
"epoch": 0.5545998761439482,
"grad_norm": 0.30841770855502404,
"learning_rate": 1.68678592292466e-05,
"loss": 0.7796,
"step": 2015
},
{
"epoch": 0.5548751118144911,
"grad_norm": 0.25753374373992016,
"learning_rate": 1.6850603315952613e-05,
"loss": 0.776,
"step": 2016
},
{
"epoch": 0.5551503474850341,
"grad_norm": 0.2437887936932628,
"learning_rate": 1.683334980639988e-05,
"loss": 0.7712,
"step": 2017
},
{
"epoch": 0.555425583155577,
"grad_norm": 0.26321086219079554,
"learning_rate": 1.6816098713756956e-05,
"loss": 0.7709,
"step": 2018
},
{
"epoch": 0.5557008188261199,
"grad_norm": 0.24695977090983962,
"learning_rate": 1.679885005119053e-05,
"loss": 0.7985,
"step": 2019
},
{
"epoch": 0.5559760544966628,
"grad_norm": 0.2764422194698112,
"learning_rate": 1.6781603831865457e-05,
"loss": 0.7687,
"step": 2020
},
{
"epoch": 0.5562512901672056,
"grad_norm": 0.26101368078997494,
"learning_rate": 1.6764360068944706e-05,
"loss": 0.7706,
"step": 2021
},
{
"epoch": 0.5565265258377485,
"grad_norm": 0.2755045165307206,
"learning_rate": 1.6747118775589398e-05,
"loss": 0.769,
"step": 2022
},
{
"epoch": 0.5568017615082915,
"grad_norm": 0.26081237103206856,
"learning_rate": 1.6729879964958744e-05,
"loss": 0.7376,
"step": 2023
},
{
"epoch": 0.5570769971788344,
"grad_norm": 0.27554314183027323,
"learning_rate": 1.6712643650210074e-05,
"loss": 0.7848,
"step": 2024
},
{
"epoch": 0.5573522328493773,
"grad_norm": 0.28565893837510764,
"learning_rate": 1.66954098444988e-05,
"loss": 0.7632,
"step": 2025
},
{
"epoch": 0.5576274685199202,
"grad_norm": 0.2737243329516259,
"learning_rate": 1.6678178560978448e-05,
"loss": 0.8029,
"step": 2026
},
{
"epoch": 0.5579027041904631,
"grad_norm": 0.299381249200942,
"learning_rate": 1.6660949812800584e-05,
"loss": 0.7776,
"step": 2027
},
{
"epoch": 0.558177939861006,
"grad_norm": 0.2638957717972394,
"learning_rate": 1.6643723613114862e-05,
"loss": 0.7969,
"step": 2028
},
{
"epoch": 0.5584531755315489,
"grad_norm": 0.30472460658726175,
"learning_rate": 1.6626499975068982e-05,
"loss": 0.7797,
"step": 2029
},
{
"epoch": 0.5587284112020918,
"grad_norm": 0.2590340483031841,
"learning_rate": 1.6609278911808688e-05,
"loss": 0.7547,
"step": 2030
},
{
"epoch": 0.5590036468726347,
"grad_norm": 0.31529266976023407,
"learning_rate": 1.659206043647776e-05,
"loss": 0.7578,
"step": 2031
},
{
"epoch": 0.5592788825431776,
"grad_norm": 0.25403425411898994,
"learning_rate": 1.6574844562218e-05,
"loss": 0.7751,
"step": 2032
},
{
"epoch": 0.5595541182137205,
"grad_norm": 0.29800038972180426,
"learning_rate": 1.6557631302169236e-05,
"loss": 0.7718,
"step": 2033
},
{
"epoch": 0.5598293538842634,
"grad_norm": 0.2741538551149542,
"learning_rate": 1.6540420669469298e-05,
"loss": 0.7611,
"step": 2034
},
{
"epoch": 0.5601045895548064,
"grad_norm": 0.32261246465357896,
"learning_rate": 1.6523212677253996e-05,
"loss": 0.7896,
"step": 2035
},
{
"epoch": 0.5603798252253492,
"grad_norm": 0.284204794938927,
"learning_rate": 1.650600733865714e-05,
"loss": 0.7836,
"step": 2036
},
{
"epoch": 0.5606550608958921,
"grad_norm": 0.26216419660183365,
"learning_rate": 1.6488804666810504e-05,
"loss": 0.7828,
"step": 2037
},
{
"epoch": 0.560930296566435,
"grad_norm": 0.2957938006575376,
"learning_rate": 1.647160467484384e-05,
"loss": 0.7812,
"step": 2038
},
{
"epoch": 0.5612055322369779,
"grad_norm": 0.24078415224646846,
"learning_rate": 1.6454407375884828e-05,
"loss": 0.759,
"step": 2039
},
{
"epoch": 0.5614807679075208,
"grad_norm": 0.28878220901442014,
"learning_rate": 1.6437212783059136e-05,
"loss": 0.7706,
"step": 2040
},
{
"epoch": 0.5617560035780638,
"grad_norm": 0.24912996279183475,
"learning_rate": 1.642002090949033e-05,
"loss": 0.7904,
"step": 2041
},
{
"epoch": 0.5620312392486067,
"grad_norm": 0.2907681941664777,
"learning_rate": 1.6402831768299913e-05,
"loss": 0.7843,
"step": 2042
},
{
"epoch": 0.5623064749191495,
"grad_norm": 0.23475718522735167,
"learning_rate": 1.63856453726073e-05,
"loss": 0.7858,
"step": 2043
},
{
"epoch": 0.5625817105896924,
"grad_norm": 0.25802734415634354,
"learning_rate": 1.6368461735529816e-05,
"loss": 0.8037,
"step": 2044
},
{
"epoch": 0.5628569462602353,
"grad_norm": 0.22740669438433816,
"learning_rate": 1.635128087018268e-05,
"loss": 0.7536,
"step": 2045
},
{
"epoch": 0.5631321819307782,
"grad_norm": 0.2532030148949126,
"learning_rate": 1.6334102789678973e-05,
"loss": 0.7958,
"step": 2046
},
{
"epoch": 0.5634074176013212,
"grad_norm": 0.24557486295621084,
"learning_rate": 1.631692750712969e-05,
"loss": 0.7848,
"step": 2047
},
{
"epoch": 0.5636826532718641,
"grad_norm": 0.252681918118479,
"learning_rate": 1.6299755035643668e-05,
"loss": 0.7726,
"step": 2048
},
{
"epoch": 0.563957888942407,
"grad_norm": 0.2539413400854405,
"learning_rate": 1.6282585388327596e-05,
"loss": 0.7772,
"step": 2049
},
{
"epoch": 0.5642331246129498,
"grad_norm": 0.25698067561539034,
"learning_rate": 1.6265418578286016e-05,
"loss": 0.7544,
"step": 2050
},
{
"epoch": 0.5645083602834927,
"grad_norm": 0.25552410852604446,
"learning_rate": 1.62482546186213e-05,
"loss": 0.7657,
"step": 2051
},
{
"epoch": 0.5647835959540356,
"grad_norm": 0.25547516308813145,
"learning_rate": 1.6231093522433644e-05,
"loss": 0.7841,
"step": 2052
},
{
"epoch": 0.5650588316245786,
"grad_norm": 0.23919288113864054,
"learning_rate": 1.6213935302821048e-05,
"loss": 0.7812,
"step": 2053
},
{
"epoch": 0.5653340672951215,
"grad_norm": 0.24517970086684646,
"learning_rate": 1.6196779972879342e-05,
"loss": 0.7708,
"step": 2054
},
{
"epoch": 0.5656093029656644,
"grad_norm": 0.24938526180701784,
"learning_rate": 1.6179627545702146e-05,
"loss": 0.759,
"step": 2055
},
{
"epoch": 0.5658845386362072,
"grad_norm": 0.24762322015857288,
"learning_rate": 1.6162478034380843e-05,
"loss": 0.7662,
"step": 2056
},
{
"epoch": 0.5661597743067501,
"grad_norm": 0.24722313649073263,
"learning_rate": 1.61453314520046e-05,
"loss": 0.7777,
"step": 2057
},
{
"epoch": 0.566435009977293,
"grad_norm": 0.25320830188852356,
"learning_rate": 1.612818781166035e-05,
"loss": 0.7807,
"step": 2058
},
{
"epoch": 0.566710245647836,
"grad_norm": 0.36654122738915146,
"learning_rate": 1.6111047126432794e-05,
"loss": 0.7838,
"step": 2059
},
{
"epoch": 0.5669854813183789,
"grad_norm": 0.26133322944692217,
"learning_rate": 1.6093909409404352e-05,
"loss": 0.7798,
"step": 2060
},
{
"epoch": 0.5672607169889218,
"grad_norm": 0.2881526767960742,
"learning_rate": 1.6076774673655204e-05,
"loss": 0.8043,
"step": 2061
},
{
"epoch": 0.5675359526594647,
"grad_norm": 0.2525406598946525,
"learning_rate": 1.6059642932263235e-05,
"loss": 0.8085,
"step": 2062
},
{
"epoch": 0.5678111883300075,
"grad_norm": 0.2498535732259371,
"learning_rate": 1.6042514198304056e-05,
"loss": 0.783,
"step": 2063
},
{
"epoch": 0.5680864240005504,
"grad_norm": 0.2466455408249282,
"learning_rate": 1.602538848485097e-05,
"loss": 0.7676,
"step": 2064
},
{
"epoch": 0.5683616596710934,
"grad_norm": 0.24582045772189817,
"learning_rate": 1.6008265804974998e-05,
"loss": 0.7559,
"step": 2065
},
{
"epoch": 0.5686368953416363,
"grad_norm": 0.25116044544645955,
"learning_rate": 1.599114617174482e-05,
"loss": 0.786,
"step": 2066
},
{
"epoch": 0.5689121310121792,
"grad_norm": 0.2506465479046168,
"learning_rate": 1.5974029598226796e-05,
"loss": 0.7845,
"step": 2067
},
{
"epoch": 0.5691873666827221,
"grad_norm": 0.24848271715394182,
"learning_rate": 1.5956916097484975e-05,
"loss": 0.7795,
"step": 2068
},
{
"epoch": 0.569462602353265,
"grad_norm": 0.2402544233325968,
"learning_rate": 1.593980568258103e-05,
"loss": 0.7936,
"step": 2069
},
{
"epoch": 0.5697378380238078,
"grad_norm": 0.2549298842223705,
"learning_rate": 1.592269836657429e-05,
"loss": 0.752,
"step": 2070
},
{
"epoch": 0.5700130736943508,
"grad_norm": 0.23611264837052431,
"learning_rate": 1.5905594162521725e-05,
"loss": 0.7971,
"step": 2071
},
{
"epoch": 0.5702883093648937,
"grad_norm": 0.24434415474606616,
"learning_rate": 1.5888493083477926e-05,
"loss": 0.7524,
"step": 2072
},
{
"epoch": 0.5705635450354366,
"grad_norm": 0.2503065691490492,
"learning_rate": 1.587139514249509e-05,
"loss": 0.8098,
"step": 2073
},
{
"epoch": 0.5708387807059795,
"grad_norm": 0.24329055635485347,
"learning_rate": 1.5854300352623023e-05,
"loss": 0.7398,
"step": 2074
},
{
"epoch": 0.5711140163765224,
"grad_norm": 0.2572120138053558,
"learning_rate": 1.583720872690914e-05,
"loss": 0.761,
"step": 2075
},
{
"epoch": 0.5713892520470653,
"grad_norm": 0.23979776297460612,
"learning_rate": 1.5820120278398424e-05,
"loss": 0.8041,
"step": 2076
},
{
"epoch": 0.5716644877176082,
"grad_norm": 0.2624330874838944,
"learning_rate": 1.5803035020133448e-05,
"loss": 0.7963,
"step": 2077
},
{
"epoch": 0.5719397233881511,
"grad_norm": 0.23889894527585132,
"learning_rate": 1.578595296515433e-05,
"loss": 0.7865,
"step": 2078
},
{
"epoch": 0.572214959058694,
"grad_norm": 0.25073987996449615,
"learning_rate": 1.5768874126498766e-05,
"loss": 0.7892,
"step": 2079
},
{
"epoch": 0.5724901947292369,
"grad_norm": 0.23814982283441102,
"learning_rate": 1.5751798517201972e-05,
"loss": 0.8236,
"step": 2080
},
{
"epoch": 0.5727654303997798,
"grad_norm": 0.2539968039401214,
"learning_rate": 1.5734726150296725e-05,
"loss": 0.7881,
"step": 2081
},
{
"epoch": 0.5730406660703227,
"grad_norm": 0.25969761411521936,
"learning_rate": 1.57176570388133e-05,
"loss": 0.8042,
"step": 2082
},
{
"epoch": 0.5733159017408657,
"grad_norm": 0.2375301691335384,
"learning_rate": 1.570059119577952e-05,
"loss": 0.7835,
"step": 2083
},
{
"epoch": 0.5735911374114085,
"grad_norm": 0.2626076389318922,
"learning_rate": 1.568352863422069e-05,
"loss": 0.7935,
"step": 2084
},
{
"epoch": 0.5738663730819514,
"grad_norm": 0.23009338452442388,
"learning_rate": 1.5666469367159613e-05,
"loss": 0.7742,
"step": 2085
},
{
"epoch": 0.5741416087524943,
"grad_norm": 0.2619299816428588,
"learning_rate": 1.564941340761658e-05,
"loss": 0.7642,
"step": 2086
},
{
"epoch": 0.5744168444230372,
"grad_norm": 0.25405711124962455,
"learning_rate": 1.563236076860937e-05,
"loss": 0.765,
"step": 2087
},
{
"epoch": 0.5746920800935801,
"grad_norm": 0.2645673429868364,
"learning_rate": 1.56153114631532e-05,
"loss": 0.7861,
"step": 2088
},
{
"epoch": 0.5749673157641231,
"grad_norm": 0.2525486695505516,
"learning_rate": 1.559826550426076e-05,
"loss": 0.7944,
"step": 2089
},
{
"epoch": 0.575242551434666,
"grad_norm": 0.23315936847014512,
"learning_rate": 1.55812229049422e-05,
"loss": 0.7585,
"step": 2090
},
{
"epoch": 0.5755177871052088,
"grad_norm": 0.2551352924648423,
"learning_rate": 1.5564183678205074e-05,
"loss": 0.7463,
"step": 2091
},
{
"epoch": 0.5757930227757517,
"grad_norm": 0.22795328646090968,
"learning_rate": 1.5547147837054392e-05,
"loss": 0.7966,
"step": 2092
},
{
"epoch": 0.5760682584462946,
"grad_norm": 0.24822441078388702,
"learning_rate": 1.553011539449256e-05,
"loss": 0.7869,
"step": 2093
},
{
"epoch": 0.5763434941168375,
"grad_norm": 0.24696033515063354,
"learning_rate": 1.5513086363519392e-05,
"loss": 0.7625,
"step": 2094
},
{
"epoch": 0.5766187297873805,
"grad_norm": 0.23207065303808233,
"learning_rate": 1.5496060757132112e-05,
"loss": 0.7887,
"step": 2095
},
{
"epoch": 0.5768939654579234,
"grad_norm": 0.24032261378587064,
"learning_rate": 1.5479038588325303e-05,
"loss": 0.7783,
"step": 2096
},
{
"epoch": 0.5771692011284663,
"grad_norm": 0.2616143376584418,
"learning_rate": 1.546201987009096e-05,
"loss": 0.7939,
"step": 2097
},
{
"epoch": 0.5774444367990091,
"grad_norm": 0.25238477358610734,
"learning_rate": 1.5445004615418425e-05,
"loss": 0.7854,
"step": 2098
},
{
"epoch": 0.577719672469552,
"grad_norm": 0.27345842456168395,
"learning_rate": 1.5427992837294393e-05,
"loss": 0.7705,
"step": 2099
},
{
"epoch": 0.5779949081400949,
"grad_norm": 0.24797525297088446,
"learning_rate": 1.5410984548702913e-05,
"loss": 0.7754,
"step": 2100
},
{
"epoch": 0.5782701438106379,
"grad_norm": 0.2547197947186458,
"learning_rate": 1.5393979762625363e-05,
"loss": 0.8208,
"step": 2101
},
{
"epoch": 0.5785453794811808,
"grad_norm": 0.24702558543063802,
"learning_rate": 1.5376978492040455e-05,
"loss": 0.77,
"step": 2102
},
{
"epoch": 0.5788206151517237,
"grad_norm": 0.25100614022554396,
"learning_rate": 1.5359980749924212e-05,
"loss": 0.7638,
"step": 2103
},
{
"epoch": 0.5790958508222666,
"grad_norm": 0.2460103820922042,
"learning_rate": 1.534298654924998e-05,
"loss": 0.7929,
"step": 2104
},
{
"epoch": 0.5793710864928094,
"grad_norm": 0.24290709268941307,
"learning_rate": 1.5325995902988386e-05,
"loss": 0.7885,
"step": 2105
},
{
"epoch": 0.5796463221633523,
"grad_norm": 0.23892080646614608,
"learning_rate": 1.530900882410734e-05,
"loss": 0.8172,
"step": 2106
},
{
"epoch": 0.5799215578338953,
"grad_norm": 0.24946689388002227,
"learning_rate": 1.5292025325572035e-05,
"loss": 0.7684,
"step": 2107
},
{
"epoch": 0.5801967935044382,
"grad_norm": 0.23196145630886625,
"learning_rate": 1.5275045420344947e-05,
"loss": 0.7778,
"step": 2108
},
{
"epoch": 0.5804720291749811,
"grad_norm": 0.26101729842998894,
"learning_rate": 1.5258069121385789e-05,
"loss": 0.8088,
"step": 2109
},
{
"epoch": 0.580747264845524,
"grad_norm": 0.24520871895155857,
"learning_rate": 1.5241096441651518e-05,
"loss": 0.7919,
"step": 2110
},
{
"epoch": 0.5810225005160669,
"grad_norm": 0.25198000687125316,
"learning_rate": 1.5224127394096357e-05,
"loss": 0.7777,
"step": 2111
},
{
"epoch": 0.5812977361866097,
"grad_norm": 0.23740133013705317,
"learning_rate": 1.520716199167173e-05,
"loss": 0.7272,
"step": 2112
},
{
"epoch": 0.5815729718571527,
"grad_norm": 0.2415496612068821,
"learning_rate": 1.5190200247326286e-05,
"loss": 0.7951,
"step": 2113
},
{
"epoch": 0.5818482075276956,
"grad_norm": 0.2548145164442953,
"learning_rate": 1.517324217400589e-05,
"loss": 0.7824,
"step": 2114
},
{
"epoch": 0.5821234431982385,
"grad_norm": 0.23201023898433434,
"learning_rate": 1.5156287784653594e-05,
"loss": 0.8018,
"step": 2115
},
{
"epoch": 0.5823986788687814,
"grad_norm": 0.29720187732594855,
"learning_rate": 1.5139337092209645e-05,
"loss": 0.7733,
"step": 2116
},
{
"epoch": 0.5826739145393243,
"grad_norm": 0.23146801505485573,
"learning_rate": 1.5122390109611458e-05,
"loss": 0.8012,
"step": 2117
},
{
"epoch": 0.5829491502098672,
"grad_norm": 0.2482537042383629,
"learning_rate": 1.510544684979364e-05,
"loss": 0.7852,
"step": 2118
},
{
"epoch": 0.5832243858804101,
"grad_norm": 0.24202146603866012,
"learning_rate": 1.5088507325687931e-05,
"loss": 0.7807,
"step": 2119
},
{
"epoch": 0.583499621550953,
"grad_norm": 1.2876778355705045,
"learning_rate": 1.5071571550223238e-05,
"loss": 0.7896,
"step": 2120
},
{
"epoch": 0.5837748572214959,
"grad_norm": 0.2506389255098671,
"learning_rate": 1.5054639536325595e-05,
"loss": 0.791,
"step": 2121
},
{
"epoch": 0.5840500928920388,
"grad_norm": 0.2594330635551027,
"learning_rate": 1.5037711296918169e-05,
"loss": 0.7851,
"step": 2122
},
{
"epoch": 0.5843253285625817,
"grad_norm": 0.257735357416782,
"learning_rate": 1.5020786844921245e-05,
"loss": 0.7968,
"step": 2123
},
{
"epoch": 0.5846005642331246,
"grad_norm": 0.23623757668631695,
"learning_rate": 1.500386619325222e-05,
"loss": 0.7427,
"step": 2124
},
{
"epoch": 0.5848757999036676,
"grad_norm": 0.27831653869214235,
"learning_rate": 1.498694935482559e-05,
"loss": 0.8033,
"step": 2125
},
{
"epoch": 0.5851510355742104,
"grad_norm": 0.25548238246861205,
"learning_rate": 1.497003634255294e-05,
"loss": 0.7979,
"step": 2126
},
{
"epoch": 0.5854262712447533,
"grad_norm": 0.2593386786294403,
"learning_rate": 1.495312716934294e-05,
"loss": 0.7706,
"step": 2127
},
{
"epoch": 0.5857015069152962,
"grad_norm": 0.24010246593866066,
"learning_rate": 1.4936221848101315e-05,
"loss": 0.7941,
"step": 2128
},
{
"epoch": 0.5859767425858391,
"grad_norm": 0.2701389034455053,
"learning_rate": 1.4919320391730862e-05,
"loss": 0.7741,
"step": 2129
},
{
"epoch": 0.586251978256382,
"grad_norm": 0.24203094574073675,
"learning_rate": 1.4902422813131433e-05,
"loss": 0.7661,
"step": 2130
},
{
"epoch": 0.586527213926925,
"grad_norm": 0.2671619521416786,
"learning_rate": 1.4885529125199902e-05,
"loss": 0.7701,
"step": 2131
},
{
"epoch": 0.5868024495974679,
"grad_norm": 0.252204521046989,
"learning_rate": 1.4868639340830185e-05,
"loss": 0.7724,
"step": 2132
},
{
"epoch": 0.5870776852680107,
"grad_norm": 0.2726669507042234,
"learning_rate": 1.4851753472913228e-05,
"loss": 0.7959,
"step": 2133
},
{
"epoch": 0.5873529209385536,
"grad_norm": 0.24867246127598813,
"learning_rate": 1.4834871534336972e-05,
"loss": 0.8058,
"step": 2134
},
{
"epoch": 0.5876281566090965,
"grad_norm": 0.26141394866877476,
"learning_rate": 1.4817993537986368e-05,
"loss": 0.768,
"step": 2135
},
{
"epoch": 0.5879033922796394,
"grad_norm": 0.2587387847788051,
"learning_rate": 1.4801119496743353e-05,
"loss": 0.7864,
"step": 2136
},
{
"epoch": 0.5881786279501824,
"grad_norm": 0.2511675708237603,
"learning_rate": 1.4784249423486845e-05,
"loss": 0.7793,
"step": 2137
},
{
"epoch": 0.5884538636207253,
"grad_norm": 0.25023817845508245,
"learning_rate": 1.4767383331092737e-05,
"loss": 0.7679,
"step": 2138
},
{
"epoch": 0.5887290992912682,
"grad_norm": 0.26930821744504835,
"learning_rate": 1.4750521232433879e-05,
"loss": 0.7976,
"step": 2139
},
{
"epoch": 0.589004334961811,
"grad_norm": 0.23473374959504417,
"learning_rate": 1.4733663140380081e-05,
"loss": 0.7897,
"step": 2140
},
{
"epoch": 0.5892795706323539,
"grad_norm": 0.2706088837656889,
"learning_rate": 1.4716809067798097e-05,
"loss": 0.7771,
"step": 2141
},
{
"epoch": 0.5895548063028968,
"grad_norm": 0.22751531787456653,
"learning_rate": 1.4699959027551598e-05,
"loss": 0.7703,
"step": 2142
},
{
"epoch": 0.5898300419734398,
"grad_norm": 0.2821473648131303,
"learning_rate": 1.4683113032501188e-05,
"loss": 0.7862,
"step": 2143
},
{
"epoch": 0.5901052776439827,
"grad_norm": 0.2381236393678348,
"learning_rate": 1.4666271095504377e-05,
"loss": 0.7868,
"step": 2144
},
{
"epoch": 0.5903805133145256,
"grad_norm": 0.35446051068971185,
"learning_rate": 1.4649433229415588e-05,
"loss": 0.7926,
"step": 2145
},
{
"epoch": 0.5906557489850685,
"grad_norm": 0.23549622075698776,
"learning_rate": 1.4632599447086123e-05,
"loss": 0.793,
"step": 2146
},
{
"epoch": 0.5909309846556113,
"grad_norm": 0.320356279305611,
"learning_rate": 1.461576976136419e-05,
"loss": 0.7905,
"step": 2147
},
{
"epoch": 0.5912062203261542,
"grad_norm": 0.22676829503990834,
"learning_rate": 1.4598944185094843e-05,
"loss": 0.7581,
"step": 2148
},
{
"epoch": 0.5914814559966972,
"grad_norm": 0.2652106589087,
"learning_rate": 1.4582122731120018e-05,
"loss": 0.778,
"step": 2149
},
{
"epoch": 0.5917566916672401,
"grad_norm": 0.23972531109798575,
"learning_rate": 1.4565305412278492e-05,
"loss": 0.7959,
"step": 2150
},
{
"epoch": 0.592031927337783,
"grad_norm": 0.27323471299885144,
"learning_rate": 1.4548492241405902e-05,
"loss": 0.7419,
"step": 2151
},
{
"epoch": 0.5923071630083259,
"grad_norm": 0.24884190065273046,
"learning_rate": 1.4531683231334705e-05,
"loss": 0.789,
"step": 2152
},
{
"epoch": 0.5925823986788687,
"grad_norm": 0.2749619768226162,
"learning_rate": 1.4514878394894179e-05,
"loss": 0.7795,
"step": 2153
},
{
"epoch": 0.5928576343494116,
"grad_norm": 0.24176446091523318,
"learning_rate": 1.449807774491044e-05,
"loss": 0.776,
"step": 2154
},
{
"epoch": 0.5931328700199546,
"grad_norm": 0.26236281769206066,
"learning_rate": 1.4481281294206384e-05,
"loss": 0.7911,
"step": 2155
},
{
"epoch": 0.5934081056904975,
"grad_norm": 0.24418485563127318,
"learning_rate": 1.4464489055601711e-05,
"loss": 0.7624,
"step": 2156
},
{
"epoch": 0.5936833413610404,
"grad_norm": 0.24652423998659218,
"learning_rate": 1.4447701041912913e-05,
"loss": 0.7798,
"step": 2157
},
{
"epoch": 0.5939585770315833,
"grad_norm": 0.25788494067196344,
"learning_rate": 1.4430917265953249e-05,
"loss": 0.7896,
"step": 2158
},
{
"epoch": 0.5942338127021262,
"grad_norm": 0.24422235437206616,
"learning_rate": 1.441413774053274e-05,
"loss": 0.7814,
"step": 2159
},
{
"epoch": 0.594509048372669,
"grad_norm": 0.2698403209678665,
"learning_rate": 1.4397362478458161e-05,
"loss": 0.7979,
"step": 2160
},
{
"epoch": 0.594784284043212,
"grad_norm": 0.4504993518957787,
"learning_rate": 1.438059149253306e-05,
"loss": 0.8036,
"step": 2161
},
{
"epoch": 0.5950595197137549,
"grad_norm": 0.24344810912433718,
"learning_rate": 1.4363824795557688e-05,
"loss": 0.8054,
"step": 2162
},
{
"epoch": 0.5953347553842978,
"grad_norm": 0.2469985797108369,
"learning_rate": 1.4347062400329046e-05,
"loss": 0.7752,
"step": 2163
},
{
"epoch": 0.5956099910548407,
"grad_norm": 0.2490002922314942,
"learning_rate": 1.4330304319640834e-05,
"loss": 0.7929,
"step": 2164
},
{
"epoch": 0.5958852267253836,
"grad_norm": 0.2429649803979391,
"learning_rate": 1.4313550566283466e-05,
"loss": 0.7888,
"step": 2165
},
{
"epoch": 0.5961604623959265,
"grad_norm": 0.24481603235890745,
"learning_rate": 1.4296801153044055e-05,
"loss": 0.7885,
"step": 2166
},
{
"epoch": 0.5964356980664695,
"grad_norm": 0.3080950706307856,
"learning_rate": 1.4280056092706405e-05,
"loss": 0.7915,
"step": 2167
},
{
"epoch": 0.5967109337370123,
"grad_norm": 0.30616858358392884,
"learning_rate": 1.4263315398050986e-05,
"loss": 0.7635,
"step": 2168
},
{
"epoch": 0.5969861694075552,
"grad_norm": 0.24086458449552933,
"learning_rate": 1.4246579081854953e-05,
"loss": 0.7856,
"step": 2169
},
{
"epoch": 0.5972614050780981,
"grad_norm": 0.26227700423509437,
"learning_rate": 1.4229847156892102e-05,
"loss": 0.7935,
"step": 2170
},
{
"epoch": 0.597536640748641,
"grad_norm": 0.22949999843540742,
"learning_rate": 1.4213119635932889e-05,
"loss": 0.8084,
"step": 2171
},
{
"epoch": 0.5978118764191839,
"grad_norm": 0.3656471073242874,
"learning_rate": 1.4196396531744397e-05,
"loss": 0.743,
"step": 2172
},
{
"epoch": 0.5980871120897269,
"grad_norm": 0.22416655439542127,
"learning_rate": 1.4179677857090353e-05,
"loss": 0.7608,
"step": 2173
},
{
"epoch": 0.5983623477602698,
"grad_norm": 0.231795374036869,
"learning_rate": 1.4162963624731083e-05,
"loss": 0.7713,
"step": 2174
},
{
"epoch": 0.5986375834308126,
"grad_norm": 0.23566291636624995,
"learning_rate": 1.4146253847423555e-05,
"loss": 0.7864,
"step": 2175
},
{
"epoch": 0.5989128191013555,
"grad_norm": 0.2517273479510146,
"learning_rate": 1.4129548537921308e-05,
"loss": 0.7865,
"step": 2176
},
{
"epoch": 0.5991880547718984,
"grad_norm": 0.22410139984949565,
"learning_rate": 1.4112847708974471e-05,
"loss": 0.7909,
"step": 2177
},
{
"epoch": 0.5994632904424413,
"grad_norm": 0.26533693472104647,
"learning_rate": 1.4096151373329777e-05,
"loss": 0.7648,
"step": 2178
},
{
"epoch": 0.5997385261129843,
"grad_norm": 0.23050780141550972,
"learning_rate": 1.4079459543730504e-05,
"loss": 0.779,
"step": 2179
},
{
"epoch": 0.6000137617835272,
"grad_norm": 0.265280586075992,
"learning_rate": 1.4062772232916507e-05,
"loss": 0.7648,
"step": 2180
},
{
"epoch": 0.60028899745407,
"grad_norm": 0.23622287788664692,
"learning_rate": 1.4046089453624181e-05,
"loss": 0.7902,
"step": 2181
},
{
"epoch": 0.6005642331246129,
"grad_norm": 0.23610116149772342,
"learning_rate": 1.4029411218586464e-05,
"loss": 0.7497,
"step": 2182
},
{
"epoch": 0.6008394687951558,
"grad_norm": 0.2234032434678829,
"learning_rate": 1.4012737540532842e-05,
"loss": 0.7719,
"step": 2183
},
{
"epoch": 0.6011147044656987,
"grad_norm": 0.2504256253889632,
"learning_rate": 1.3996068432189305e-05,
"loss": 0.7751,
"step": 2184
},
{
"epoch": 0.6013899401362417,
"grad_norm": 0.23610034084470008,
"learning_rate": 1.3979403906278362e-05,
"loss": 0.7867,
"step": 2185
},
{
"epoch": 0.6016651758067846,
"grad_norm": 0.26373980797829544,
"learning_rate": 1.3962743975519021e-05,
"loss": 0.7916,
"step": 2186
},
{
"epoch": 0.6019404114773275,
"grad_norm": 0.2335537381776213,
"learning_rate": 1.3946088652626784e-05,
"loss": 0.8085,
"step": 2187
},
{
"epoch": 0.6022156471478703,
"grad_norm": 0.2576714206143028,
"learning_rate": 1.392943795031364e-05,
"loss": 0.7874,
"step": 2188
},
{
"epoch": 0.6024908828184132,
"grad_norm": 0.2316709720621963,
"learning_rate": 1.391279188128804e-05,
"loss": 0.7803,
"step": 2189
},
{
"epoch": 0.6027661184889561,
"grad_norm": 0.25280051602807724,
"learning_rate": 1.389615045825492e-05,
"loss": 0.7759,
"step": 2190
},
{
"epoch": 0.6030413541594991,
"grad_norm": 0.2367383163771406,
"learning_rate": 1.3879513693915654e-05,
"loss": 0.7881,
"step": 2191
},
{
"epoch": 0.603316589830042,
"grad_norm": 0.2602317282075869,
"learning_rate": 1.386288160096806e-05,
"loss": 0.7609,
"step": 2192
},
{
"epoch": 0.6035918255005849,
"grad_norm": 0.23248407453862296,
"learning_rate": 1.384625419210639e-05,
"loss": 0.7829,
"step": 2193
},
{
"epoch": 0.6038670611711278,
"grad_norm": 0.24736366078303895,
"learning_rate": 1.3829631480021335e-05,
"loss": 0.7729,
"step": 2194
},
{
"epoch": 0.6041422968416706,
"grad_norm": 0.251112860818329,
"learning_rate": 1.3813013477399989e-05,
"loss": 0.7754,
"step": 2195
},
{
"epoch": 0.6044175325122135,
"grad_norm": 0.2375044017911047,
"learning_rate": 1.3796400196925837e-05,
"loss": 0.7754,
"step": 2196
},
{
"epoch": 0.6046927681827565,
"grad_norm": 0.236547571630386,
"learning_rate": 1.3779791651278802e-05,
"loss": 0.7735,
"step": 2197
},
{
"epoch": 0.6049680038532994,
"grad_norm": 0.25860613071581184,
"learning_rate": 1.3763187853135156e-05,
"loss": 0.797,
"step": 2198
},
{
"epoch": 0.6052432395238423,
"grad_norm": 0.2299632050146302,
"learning_rate": 1.3746588815167555e-05,
"loss": 0.7889,
"step": 2199
},
{
"epoch": 0.6055184751943852,
"grad_norm": 0.2538085408409037,
"learning_rate": 1.3729994550045036e-05,
"loss": 0.7933,
"step": 2200
},
{
"epoch": 0.6057937108649281,
"grad_norm": 0.23846684024218845,
"learning_rate": 1.3713405070432977e-05,
"loss": 0.8148,
"step": 2201
},
{
"epoch": 0.6060689465354709,
"grad_norm": 0.24883675188960985,
"learning_rate": 1.369682038899311e-05,
"loss": 0.7836,
"step": 2202
},
{
"epoch": 0.6063441822060139,
"grad_norm": 0.2201432094251142,
"learning_rate": 1.3680240518383502e-05,
"loss": 0.75,
"step": 2203
},
{
"epoch": 0.6066194178765568,
"grad_norm": 0.2442907241332848,
"learning_rate": 1.3663665471258563e-05,
"loss": 0.7948,
"step": 2204
},
{
"epoch": 0.6068946535470997,
"grad_norm": 0.23524850760591698,
"learning_rate": 1.3647095260268994e-05,
"loss": 0.7797,
"step": 2205
},
{
"epoch": 0.6071698892176426,
"grad_norm": 0.24134584500947526,
"learning_rate": 1.3630529898061834e-05,
"loss": 0.7888,
"step": 2206
},
{
"epoch": 0.6074451248881855,
"grad_norm": 0.24028508919377853,
"learning_rate": 1.3613969397280405e-05,
"loss": 0.7939,
"step": 2207
},
{
"epoch": 0.6077203605587284,
"grad_norm": 0.23614908810125707,
"learning_rate": 1.3597413770564316e-05,
"loss": 0.7802,
"step": 2208
},
{
"epoch": 0.6079955962292714,
"grad_norm": 0.24320103802629342,
"learning_rate": 1.3580863030549457e-05,
"loss": 0.7559,
"step": 2209
},
{
"epoch": 0.6082708318998142,
"grad_norm": 0.22571292589650693,
"learning_rate": 1.3564317189868e-05,
"loss": 0.7911,
"step": 2210
},
{
"epoch": 0.6085460675703571,
"grad_norm": 0.24051990040004587,
"learning_rate": 1.3547776261148366e-05,
"loss": 0.7728,
"step": 2211
},
{
"epoch": 0.6088213032409,
"grad_norm": 0.2267786033004235,
"learning_rate": 1.3531240257015239e-05,
"loss": 0.7923,
"step": 2212
},
{
"epoch": 0.6090965389114429,
"grad_norm": 0.2493283813453356,
"learning_rate": 1.351470919008953e-05,
"loss": 0.7787,
"step": 2213
},
{
"epoch": 0.6093717745819858,
"grad_norm": 0.22358057500853282,
"learning_rate": 1.3498183072988391e-05,
"loss": 0.7814,
"step": 2214
},
{
"epoch": 0.6096470102525288,
"grad_norm": 0.3440976131242661,
"learning_rate": 1.3481661918325185e-05,
"loss": 0.753,
"step": 2215
},
{
"epoch": 0.6099222459230716,
"grad_norm": 0.22715054868175705,
"learning_rate": 1.3465145738709506e-05,
"loss": 0.7793,
"step": 2216
},
{
"epoch": 0.6101974815936145,
"grad_norm": 0.2325658496299765,
"learning_rate": 1.3448634546747128e-05,
"loss": 0.7593,
"step": 2217
},
{
"epoch": 0.6104727172641574,
"grad_norm": 0.2322885351765859,
"learning_rate": 1.3432128355040048e-05,
"loss": 0.7619,
"step": 2218
},
{
"epoch": 0.6107479529347003,
"grad_norm": 0.2528217887923534,
"learning_rate": 1.341562717618642e-05,
"loss": 0.7987,
"step": 2219
},
{
"epoch": 0.6110231886052433,
"grad_norm": 0.2392500109552147,
"learning_rate": 1.3399131022780578e-05,
"loss": 0.7536,
"step": 2220
},
{
"epoch": 0.6112984242757862,
"grad_norm": 0.2307629409968488,
"learning_rate": 1.3382639907413033e-05,
"loss": 0.7731,
"step": 2221
},
{
"epoch": 0.6115736599463291,
"grad_norm": 0.2273021766595541,
"learning_rate": 1.3366153842670433e-05,
"loss": 0.7942,
"step": 2222
},
{
"epoch": 0.611848895616872,
"grad_norm": 0.30423518686942114,
"learning_rate": 1.3349672841135586e-05,
"loss": 0.8187,
"step": 2223
},
{
"epoch": 0.6121241312874148,
"grad_norm": 0.23659855677025482,
"learning_rate": 1.3333196915387414e-05,
"loss": 0.7969,
"step": 2224
},
{
"epoch": 0.6123993669579577,
"grad_norm": 0.25569499276193625,
"learning_rate": 1.3316726078001003e-05,
"loss": 0.8072,
"step": 2225
},
{
"epoch": 0.6126746026285007,
"grad_norm": 0.23298867836581347,
"learning_rate": 1.3300260341547519e-05,
"loss": 0.793,
"step": 2226
},
{
"epoch": 0.6129498382990436,
"grad_norm": 0.2511880191579635,
"learning_rate": 1.3283799718594255e-05,
"loss": 0.7997,
"step": 2227
},
{
"epoch": 0.6132250739695865,
"grad_norm": 0.2840334137563068,
"learning_rate": 1.326734422170459e-05,
"loss": 0.7826,
"step": 2228
},
{
"epoch": 0.6135003096401294,
"grad_norm": 0.23042560639985768,
"learning_rate": 1.3250893863437996e-05,
"loss": 0.7754,
"step": 2229
},
{
"epoch": 0.6137755453106722,
"grad_norm": 0.26078713076993054,
"learning_rate": 1.3234448656350018e-05,
"loss": 0.781,
"step": 2230
},
{
"epoch": 0.6140507809812151,
"grad_norm": 0.23830661626654637,
"learning_rate": 1.3218008612992279e-05,
"loss": 0.7803,
"step": 2231
},
{
"epoch": 0.6143260166517581,
"grad_norm": 0.2426937094871854,
"learning_rate": 1.3201573745912453e-05,
"loss": 0.7478,
"step": 2232
},
{
"epoch": 0.614601252322301,
"grad_norm": 0.24652065812213447,
"learning_rate": 1.3185144067654272e-05,
"loss": 0.7812,
"step": 2233
},
{
"epoch": 0.6148764879928439,
"grad_norm": 0.2589077154275541,
"learning_rate": 1.3168719590757495e-05,
"loss": 0.7913,
"step": 2234
},
{
"epoch": 0.6151517236633868,
"grad_norm": 0.24197152701505842,
"learning_rate": 1.315230032775792e-05,
"loss": 0.8002,
"step": 2235
},
{
"epoch": 0.6154269593339297,
"grad_norm": 0.2590383635111659,
"learning_rate": 1.3135886291187356e-05,
"loss": 0.7614,
"step": 2236
},
{
"epoch": 0.6157021950044725,
"grad_norm": 0.24330646988687127,
"learning_rate": 1.311947749357364e-05,
"loss": 0.7548,
"step": 2237
},
{
"epoch": 0.6159774306750155,
"grad_norm": 0.2416648534181719,
"learning_rate": 1.3103073947440596e-05,
"loss": 0.7805,
"step": 2238
},
{
"epoch": 0.6162526663455584,
"grad_norm": 0.2441149733997987,
"learning_rate": 1.308667566530804e-05,
"loss": 0.7625,
"step": 2239
},
{
"epoch": 0.6165279020161013,
"grad_norm": 0.2575334262557978,
"learning_rate": 1.3070282659691782e-05,
"loss": 0.7389,
"step": 2240
},
{
"epoch": 0.6168031376866442,
"grad_norm": 0.24980075078996547,
"learning_rate": 1.3053894943103598e-05,
"loss": 0.7855,
"step": 2241
},
{
"epoch": 0.6170783733571871,
"grad_norm": 0.25545253681161445,
"learning_rate": 1.3037512528051217e-05,
"loss": 0.737,
"step": 2242
},
{
"epoch": 0.61735360902773,
"grad_norm": 0.24647019979471987,
"learning_rate": 1.3021135427038342e-05,
"loss": 0.8051,
"step": 2243
},
{
"epoch": 0.617628844698273,
"grad_norm": 0.26064932550995573,
"learning_rate": 1.3004763652564608e-05,
"loss": 0.7591,
"step": 2244
},
{
"epoch": 0.6179040803688158,
"grad_norm": 0.2558613974831646,
"learning_rate": 1.2988397217125579e-05,
"loss": 0.8032,
"step": 2245
},
{
"epoch": 0.6181793160393587,
"grad_norm": 0.25087162428622795,
"learning_rate": 1.2972036133212747e-05,
"loss": 0.7973,
"step": 2246
},
{
"epoch": 0.6184545517099016,
"grad_norm": 0.24663915403638537,
"learning_rate": 1.295568041331354e-05,
"loss": 0.7727,
"step": 2247
},
{
"epoch": 0.6187297873804445,
"grad_norm": 0.26800782727748,
"learning_rate": 1.2939330069911262e-05,
"loss": 0.7799,
"step": 2248
},
{
"epoch": 0.6190050230509874,
"grad_norm": 0.2410928295502244,
"learning_rate": 1.2922985115485137e-05,
"loss": 0.7862,
"step": 2249
},
{
"epoch": 0.6192802587215304,
"grad_norm": 0.24724726149674725,
"learning_rate": 1.2906645562510261e-05,
"loss": 0.7871,
"step": 2250
},
{
"epoch": 0.6195554943920732,
"grad_norm": 0.24635501240077184,
"learning_rate": 1.2890311423457611e-05,
"loss": 0.7993,
"step": 2251
},
{
"epoch": 0.6198307300626161,
"grad_norm": 0.33680368145251793,
"learning_rate": 1.2873982710794028e-05,
"loss": 0.7655,
"step": 2252
},
{
"epoch": 0.620105965733159,
"grad_norm": 0.2495871575314044,
"learning_rate": 1.2857659436982224e-05,
"loss": 0.7843,
"step": 2253
},
{
"epoch": 0.6203812014037019,
"grad_norm": 0.22796133306734526,
"learning_rate": 1.2841341614480752e-05,
"loss": 0.784,
"step": 2254
},
{
"epoch": 0.6206564370742448,
"grad_norm": 0.26664818691016473,
"learning_rate": 1.2825029255744007e-05,
"loss": 0.7715,
"step": 2255
},
{
"epoch": 0.6209316727447878,
"grad_norm": 0.24331570868163313,
"learning_rate": 1.2808722373222207e-05,
"loss": 0.7999,
"step": 2256
},
{
"epoch": 0.6212069084153307,
"grad_norm": 0.25101474771491167,
"learning_rate": 1.2792420979361397e-05,
"loss": 0.7864,
"step": 2257
},
{
"epoch": 0.6214821440858735,
"grad_norm": 0.24211368156900676,
"learning_rate": 1.2776125086603423e-05,
"loss": 0.7847,
"step": 2258
},
{
"epoch": 0.6217573797564164,
"grad_norm": 0.2503028888344997,
"learning_rate": 1.2759834707385955e-05,
"loss": 0.8151,
"step": 2259
},
{
"epoch": 0.6220326154269593,
"grad_norm": 0.24545898853250367,
"learning_rate": 1.2743549854142423e-05,
"loss": 0.7952,
"step": 2260
},
{
"epoch": 0.6223078510975022,
"grad_norm": 0.2553532121263847,
"learning_rate": 1.2727270539302073e-05,
"loss": 0.797,
"step": 2261
},
{
"epoch": 0.6225830867680452,
"grad_norm": 0.2536941497255776,
"learning_rate": 1.2710996775289898e-05,
"loss": 0.7687,
"step": 2262
},
{
"epoch": 0.6228583224385881,
"grad_norm": 0.23925111841565022,
"learning_rate": 1.2694728574526662e-05,
"loss": 0.7737,
"step": 2263
},
{
"epoch": 0.623133558109131,
"grad_norm": 0.504686627809076,
"learning_rate": 1.2678465949428893e-05,
"loss": 0.7847,
"step": 2264
},
{
"epoch": 0.6234087937796738,
"grad_norm": 0.24621864824989545,
"learning_rate": 1.2662208912408847e-05,
"loss": 0.7871,
"step": 2265
},
{
"epoch": 0.6236840294502167,
"grad_norm": 0.2556656062357497,
"learning_rate": 1.2645957475874526e-05,
"loss": 0.7911,
"step": 2266
},
{
"epoch": 0.6239592651207596,
"grad_norm": 0.24282479045729188,
"learning_rate": 1.2629711652229646e-05,
"loss": 0.7365,
"step": 2267
},
{
"epoch": 0.6242345007913026,
"grad_norm": 0.2567016389692855,
"learning_rate": 1.2613471453873665e-05,
"loss": 0.7627,
"step": 2268
},
{
"epoch": 0.6245097364618455,
"grad_norm": 0.2394284978741426,
"learning_rate": 1.2597236893201712e-05,
"loss": 0.8056,
"step": 2269
},
{
"epoch": 0.6247849721323884,
"grad_norm": 0.2493709173100802,
"learning_rate": 1.2581007982604648e-05,
"loss": 0.7816,
"step": 2270
},
{
"epoch": 0.6250602078029313,
"grad_norm": 0.25440439313110574,
"learning_rate": 1.256478473446899e-05,
"loss": 0.7622,
"step": 2271
},
{
"epoch": 0.6253354434734741,
"grad_norm": 0.2294143326463229,
"learning_rate": 1.2548567161176958e-05,
"loss": 0.7481,
"step": 2272
},
{
"epoch": 0.625610679144017,
"grad_norm": 0.24189268285585241,
"learning_rate": 1.2532355275106422e-05,
"loss": 0.7502,
"step": 2273
},
{
"epoch": 0.62588591481456,
"grad_norm": 0.25155503113058514,
"learning_rate": 1.2516149088630925e-05,
"loss": 0.7783,
"step": 2274
},
{
"epoch": 0.6261611504851029,
"grad_norm": 0.2438130760526398,
"learning_rate": 1.2499948614119653e-05,
"loss": 0.7848,
"step": 2275
},
{
"epoch": 0.6264363861556458,
"grad_norm": 0.261405497193663,
"learning_rate": 1.248375386393744e-05,
"loss": 0.7661,
"step": 2276
},
{
"epoch": 0.6267116218261887,
"grad_norm": 0.22946324746237765,
"learning_rate": 1.246756485044474e-05,
"loss": 0.7643,
"step": 2277
},
{
"epoch": 0.6269868574967316,
"grad_norm": 0.2407726296582577,
"learning_rate": 1.2451381585997636e-05,
"loss": 0.7802,
"step": 2278
},
{
"epoch": 0.6272620931672744,
"grad_norm": 0.23077959991271488,
"learning_rate": 1.2435204082947814e-05,
"loss": 0.8265,
"step": 2279
},
{
"epoch": 0.6275373288378174,
"grad_norm": 0.23362024948011076,
"learning_rate": 1.2419032353642578e-05,
"loss": 0.7813,
"step": 2280
},
{
"epoch": 0.6278125645083603,
"grad_norm": 0.23991786287094416,
"learning_rate": 1.2402866410424807e-05,
"loss": 0.7725,
"step": 2281
},
{
"epoch": 0.6280878001789032,
"grad_norm": 0.22973482496427158,
"learning_rate": 1.2386706265632986e-05,
"loss": 0.79,
"step": 2282
},
{
"epoch": 0.6283630358494461,
"grad_norm": 0.22992772662224578,
"learning_rate": 1.2370551931601158e-05,
"loss": 0.7672,
"step": 2283
},
{
"epoch": 0.628638271519989,
"grad_norm": 0.24191214980316453,
"learning_rate": 1.2354403420658931e-05,
"loss": 0.7727,
"step": 2284
},
{
"epoch": 0.6289135071905319,
"grad_norm": 0.24087795410816093,
"learning_rate": 1.2338260745131474e-05,
"loss": 0.7923,
"step": 2285
},
{
"epoch": 0.6291887428610748,
"grad_norm": 0.2457715965991265,
"learning_rate": 1.2322123917339504e-05,
"loss": 0.8129,
"step": 2286
},
{
"epoch": 0.6294639785316177,
"grad_norm": 0.2259408108952426,
"learning_rate": 1.2305992949599266e-05,
"loss": 0.8071,
"step": 2287
},
{
"epoch": 0.6297392142021606,
"grad_norm": 0.2456367249921771,
"learning_rate": 1.2289867854222543e-05,
"loss": 0.7624,
"step": 2288
},
{
"epoch": 0.6300144498727035,
"grad_norm": 0.3770905799807695,
"learning_rate": 1.2273748643516623e-05,
"loss": 0.758,
"step": 2289
},
{
"epoch": 0.6302896855432464,
"grad_norm": 0.39131468106479894,
"learning_rate": 1.2257635329784323e-05,
"loss": 0.7878,
"step": 2290
},
{
"epoch": 0.6305649212137893,
"grad_norm": 0.24483191205626265,
"learning_rate": 1.2241527925323935e-05,
"loss": 0.756,
"step": 2291
},
{
"epoch": 0.6308401568843323,
"grad_norm": 0.2573993912840869,
"learning_rate": 1.2225426442429265e-05,
"loss": 0.8081,
"step": 2292
},
{
"epoch": 0.6311153925548751,
"grad_norm": 0.41148143613137705,
"learning_rate": 1.2209330893389577e-05,
"loss": 0.8122,
"step": 2293
},
{
"epoch": 0.631390628225418,
"grad_norm": 0.23272816544232663,
"learning_rate": 1.2193241290489616e-05,
"loss": 0.7875,
"step": 2294
},
{
"epoch": 0.6316658638959609,
"grad_norm": 0.24651607793492109,
"learning_rate": 1.2177157646009593e-05,
"loss": 0.7904,
"step": 2295
},
{
"epoch": 0.6319410995665038,
"grad_norm": 0.24265080357975752,
"learning_rate": 1.2161079972225163e-05,
"loss": 0.7822,
"step": 2296
},
{
"epoch": 0.6322163352370467,
"grad_norm": 0.2784699788521609,
"learning_rate": 1.2145008281407428e-05,
"loss": 0.761,
"step": 2297
},
{
"epoch": 0.6324915709075897,
"grad_norm": 0.2315799898253475,
"learning_rate": 1.2128942585822933e-05,
"loss": 0.7773,
"step": 2298
},
{
"epoch": 0.6327668065781326,
"grad_norm": 0.23419047998663908,
"learning_rate": 1.2112882897733634e-05,
"loss": 0.7701,
"step": 2299
},
{
"epoch": 0.6330420422486754,
"grad_norm": 0.23076610103056508,
"learning_rate": 1.2096829229396895e-05,
"loss": 0.7805,
"step": 2300
},
{
"epoch": 0.6333172779192183,
"grad_norm": 0.22687340264460226,
"learning_rate": 1.2080781593065503e-05,
"loss": 0.7664,
"step": 2301
},
{
"epoch": 0.6335925135897612,
"grad_norm": 0.22535323307122473,
"learning_rate": 1.2064740000987638e-05,
"loss": 0.7795,
"step": 2302
},
{
"epoch": 0.6338677492603041,
"grad_norm": 0.2526634922138882,
"learning_rate": 1.2048704465406854e-05,
"loss": 0.7806,
"step": 2303
},
{
"epoch": 0.6341429849308471,
"grad_norm": 0.2356073577776414,
"learning_rate": 1.2032674998562101e-05,
"loss": 0.7967,
"step": 2304
},
{
"epoch": 0.63441822060139,
"grad_norm": 0.23507497560140406,
"learning_rate": 1.2016651612687685e-05,
"loss": 0.7769,
"step": 2305
},
{
"epoch": 0.6346934562719329,
"grad_norm": 0.23090413448333263,
"learning_rate": 1.2000634320013274e-05,
"loss": 0.769,
"step": 2306
},
{
"epoch": 0.6349686919424757,
"grad_norm": 0.28138726478972076,
"learning_rate": 1.1984623132763873e-05,
"loss": 0.7978,
"step": 2307
},
{
"epoch": 0.6352439276130186,
"grad_norm": 0.24092751614783542,
"learning_rate": 1.1968618063159859e-05,
"loss": 0.7643,
"step": 2308
},
{
"epoch": 0.6355191632835615,
"grad_norm": 0.2249205336894784,
"learning_rate": 1.1952619123416903e-05,
"loss": 0.7719,
"step": 2309
},
{
"epoch": 0.6357943989541045,
"grad_norm": 0.2361541579458213,
"learning_rate": 1.1936626325746015e-05,
"loss": 0.7749,
"step": 2310
},
{
"epoch": 0.6360696346246474,
"grad_norm": 0.2482843281120338,
"learning_rate": 1.1920639682353529e-05,
"loss": 0.7908,
"step": 2311
},
{
"epoch": 0.6363448702951903,
"grad_norm": 0.24824941919000415,
"learning_rate": 1.1904659205441061e-05,
"loss": 0.8059,
"step": 2312
},
{
"epoch": 0.6366201059657332,
"grad_norm": 0.23422129086053145,
"learning_rate": 1.1888684907205527e-05,
"loss": 0.7716,
"step": 2313
},
{
"epoch": 0.636895341636276,
"grad_norm": 0.25211344670993874,
"learning_rate": 1.1872716799839132e-05,
"loss": 0.7719,
"step": 2314
},
{
"epoch": 0.6371705773068189,
"grad_norm": 0.23532948350451352,
"learning_rate": 1.1856754895529355e-05,
"loss": 0.7822,
"step": 2315
},
{
"epoch": 0.6374458129773619,
"grad_norm": 0.2406375927427003,
"learning_rate": 1.1840799206458927e-05,
"loss": 0.7701,
"step": 2316
},
{
"epoch": 0.6377210486479048,
"grad_norm": 0.23401301404751848,
"learning_rate": 1.1824849744805855e-05,
"loss": 0.7846,
"step": 2317
},
{
"epoch": 0.6379962843184477,
"grad_norm": 0.24204008195510776,
"learning_rate": 1.1808906522743384e-05,
"loss": 0.7773,
"step": 2318
},
{
"epoch": 0.6382715199889906,
"grad_norm": 0.24039894750688456,
"learning_rate": 1.1792969552439998e-05,
"loss": 0.7635,
"step": 2319
},
{
"epoch": 0.6385467556595334,
"grad_norm": 0.2362156141922952,
"learning_rate": 1.1777038846059411e-05,
"loss": 0.7736,
"step": 2320
},
{
"epoch": 0.6388219913300763,
"grad_norm": 0.24775261630507306,
"learning_rate": 1.176111441576055e-05,
"loss": 0.7862,
"step": 2321
},
{
"epoch": 0.6390972270006193,
"grad_norm": 0.23323874597649452,
"learning_rate": 1.174519627369755e-05,
"loss": 0.7715,
"step": 2322
},
{
"epoch": 0.6393724626711622,
"grad_norm": 0.225345977039023,
"learning_rate": 1.172928443201976e-05,
"loss": 0.7648,
"step": 2323
},
{
"epoch": 0.6396476983417051,
"grad_norm": 0.2395747303079639,
"learning_rate": 1.1713378902871706e-05,
"loss": 0.7797,
"step": 2324
},
{
"epoch": 0.639922934012248,
"grad_norm": 0.24074112221048438,
"learning_rate": 1.1697479698393112e-05,
"loss": 0.7755,
"step": 2325
},
{
"epoch": 0.6401981696827909,
"grad_norm": 0.23765364708162398,
"learning_rate": 1.1681586830718862e-05,
"loss": 0.7727,
"step": 2326
},
{
"epoch": 0.6404734053533337,
"grad_norm": 0.24476468611717303,
"learning_rate": 1.1665700311979e-05,
"loss": 0.8085,
"step": 2327
},
{
"epoch": 0.6407486410238767,
"grad_norm": 0.24529029491177157,
"learning_rate": 1.1649820154298743e-05,
"loss": 0.802,
"step": 2328
},
{
"epoch": 0.6410238766944196,
"grad_norm": 0.24333979288255772,
"learning_rate": 1.1633946369798426e-05,
"loss": 0.7633,
"step": 2329
},
{
"epoch": 0.6412991123649625,
"grad_norm": 0.2311984249345681,
"learning_rate": 1.1618078970593544e-05,
"loss": 0.7631,
"step": 2330
},
{
"epoch": 0.6415743480355054,
"grad_norm": 0.2329726701246018,
"learning_rate": 1.160221796879471e-05,
"loss": 0.8027,
"step": 2331
},
{
"epoch": 0.6418495837060483,
"grad_norm": 0.3710747252261914,
"learning_rate": 1.1586363376507648e-05,
"loss": 0.8146,
"step": 2332
},
{
"epoch": 0.6421248193765912,
"grad_norm": 0.2454256887968694,
"learning_rate": 1.1570515205833206e-05,
"loss": 0.7871,
"step": 2333
},
{
"epoch": 0.6424000550471342,
"grad_norm": 0.2277914598754184,
"learning_rate": 1.1554673468867308e-05,
"loss": 0.8097,
"step": 2334
},
{
"epoch": 0.642675290717677,
"grad_norm": 0.23925652711046633,
"learning_rate": 1.1538838177700993e-05,
"loss": 0.8003,
"step": 2335
},
{
"epoch": 0.6429505263882199,
"grad_norm": 0.25000951173695574,
"learning_rate": 1.1523009344420348e-05,
"loss": 0.771,
"step": 2336
},
{
"epoch": 0.6432257620587628,
"grad_norm": 0.24575782003591679,
"learning_rate": 1.1507186981106564e-05,
"loss": 0.7749,
"step": 2337
},
{
"epoch": 0.6435009977293057,
"grad_norm": 0.24094055600625075,
"learning_rate": 1.1491371099835886e-05,
"loss": 0.7525,
"step": 2338
},
{
"epoch": 0.6437762333998486,
"grad_norm": 0.23616841299124244,
"learning_rate": 1.1475561712679582e-05,
"loss": 0.7947,
"step": 2339
},
{
"epoch": 0.6440514690703916,
"grad_norm": 0.22506594021173423,
"learning_rate": 1.1459758831704018e-05,
"loss": 0.7787,
"step": 2340
},
{
"epoch": 0.6443267047409345,
"grad_norm": 0.24818684843520203,
"learning_rate": 1.144396246897054e-05,
"loss": 0.7648,
"step": 2341
},
{
"epoch": 0.6446019404114773,
"grad_norm": 0.22244800818479413,
"learning_rate": 1.1428172636535551e-05,
"loss": 0.7663,
"step": 2342
},
{
"epoch": 0.6448771760820202,
"grad_norm": 0.24281419515466754,
"learning_rate": 1.1412389346450468e-05,
"loss": 0.7654,
"step": 2343
},
{
"epoch": 0.6451524117525631,
"grad_norm": 0.22878715467206565,
"learning_rate": 1.1396612610761695e-05,
"loss": 0.7773,
"step": 2344
},
{
"epoch": 0.645427647423106,
"grad_norm": 0.24416688028267866,
"learning_rate": 1.1380842441510658e-05,
"loss": 0.7923,
"step": 2345
},
{
"epoch": 0.645702883093649,
"grad_norm": 0.2303559071426893,
"learning_rate": 1.1365078850733738e-05,
"loss": 0.7865,
"step": 2346
},
{
"epoch": 0.6459781187641919,
"grad_norm": 0.23975013965658076,
"learning_rate": 1.1349321850462342e-05,
"loss": 0.8106,
"step": 2347
},
{
"epoch": 0.6462533544347348,
"grad_norm": 0.23104271924792577,
"learning_rate": 1.133357145272282e-05,
"loss": 0.7852,
"step": 2348
},
{
"epoch": 0.6465285901052776,
"grad_norm": 0.23086857881356623,
"learning_rate": 1.1317827669536467e-05,
"loss": 0.7859,
"step": 2349
},
{
"epoch": 0.6468038257758205,
"grad_norm": 0.2872589332126486,
"learning_rate": 1.1302090512919564e-05,
"loss": 0.7876,
"step": 2350
},
{
"epoch": 0.6470790614463634,
"grad_norm": 0.2306238044182807,
"learning_rate": 1.1286359994883302e-05,
"loss": 0.7667,
"step": 2351
},
{
"epoch": 0.6473542971169064,
"grad_norm": 0.32793517018662044,
"learning_rate": 1.1270636127433827e-05,
"loss": 0.784,
"step": 2352
},
{
"epoch": 0.6476295327874493,
"grad_norm": 0.23372195785710242,
"learning_rate": 1.1254918922572205e-05,
"loss": 0.7831,
"step": 2353
},
{
"epoch": 0.6479047684579922,
"grad_norm": 0.2354023056923587,
"learning_rate": 1.1239208392294406e-05,
"loss": 0.7985,
"step": 2354
},
{
"epoch": 0.648180004128535,
"grad_norm": 0.2689494954821914,
"learning_rate": 1.122350454859133e-05,
"loss": 0.7995,
"step": 2355
},
{
"epoch": 0.6484552397990779,
"grad_norm": 0.22864863268338262,
"learning_rate": 1.1207807403448742e-05,
"loss": 0.7862,
"step": 2356
},
{
"epoch": 0.6487304754696208,
"grad_norm": 0.22877383469670426,
"learning_rate": 1.1192116968847313e-05,
"loss": 0.7657,
"step": 2357
},
{
"epoch": 0.6490057111401638,
"grad_norm": 0.24724173580536435,
"learning_rate": 1.11764332567626e-05,
"loss": 0.8074,
"step": 2358
},
{
"epoch": 0.6492809468107067,
"grad_norm": 0.22760314279807323,
"learning_rate": 1.1160756279164996e-05,
"loss": 0.7546,
"step": 2359
},
{
"epoch": 0.6495561824812496,
"grad_norm": 0.24002453943654778,
"learning_rate": 1.1145086048019795e-05,
"loss": 0.7826,
"step": 2360
},
{
"epoch": 0.6498314181517925,
"grad_norm": 0.2140653012670485,
"learning_rate": 1.1129422575287116e-05,
"loss": 0.7602,
"step": 2361
},
{
"epoch": 0.6501066538223353,
"grad_norm": 0.23523993899709766,
"learning_rate": 1.1113765872921933e-05,
"loss": 0.746,
"step": 2362
},
{
"epoch": 0.6503818894928782,
"grad_norm": 0.23171924831741408,
"learning_rate": 1.1098115952874036e-05,
"loss": 0.7613,
"step": 2363
},
{
"epoch": 0.6506571251634212,
"grad_norm": 0.2534435156716113,
"learning_rate": 1.1082472827088053e-05,
"loss": 0.8077,
"step": 2364
},
{
"epoch": 0.6509323608339641,
"grad_norm": 0.23618127991487797,
"learning_rate": 1.1066836507503428e-05,
"loss": 0.7812,
"step": 2365
},
{
"epoch": 0.651207596504507,
"grad_norm": 0.24245911206849247,
"learning_rate": 1.1051207006054394e-05,
"loss": 0.7854,
"step": 2366
},
{
"epoch": 0.6514828321750499,
"grad_norm": 0.22079706440775906,
"learning_rate": 1.1035584334669998e-05,
"loss": 0.7984,
"step": 2367
},
{
"epoch": 0.6517580678455928,
"grad_norm": 0.24154293030207943,
"learning_rate": 1.101996850527406e-05,
"loss": 0.7635,
"step": 2368
},
{
"epoch": 0.6520333035161356,
"grad_norm": 0.21961355942826122,
"learning_rate": 1.1004359529785194e-05,
"loss": 0.7791,
"step": 2369
},
{
"epoch": 0.6523085391866786,
"grad_norm": 0.2394960830275329,
"learning_rate": 1.0988757420116771e-05,
"loss": 0.7948,
"step": 2370
},
{
"epoch": 0.6525837748572215,
"grad_norm": 0.22316095598405364,
"learning_rate": 1.0973162188176915e-05,
"loss": 0.7866,
"step": 2371
},
{
"epoch": 0.6528590105277644,
"grad_norm": 0.23774132487002664,
"learning_rate": 1.0957573845868525e-05,
"loss": 0.7915,
"step": 2372
},
{
"epoch": 0.6531342461983073,
"grad_norm": 0.24861044095870596,
"learning_rate": 1.0941992405089209e-05,
"loss": 0.8048,
"step": 2373
},
{
"epoch": 0.6534094818688502,
"grad_norm": 0.22040595195310883,
"learning_rate": 1.092641787773133e-05,
"loss": 0.7828,
"step": 2374
},
{
"epoch": 0.6536847175393931,
"grad_norm": 0.24109090382083664,
"learning_rate": 1.0910850275681974e-05,
"loss": 0.7785,
"step": 2375
},
{
"epoch": 0.653959953209936,
"grad_norm": 0.23803836777273013,
"learning_rate": 1.0895289610822935e-05,
"loss": 0.7592,
"step": 2376
},
{
"epoch": 0.6542351888804789,
"grad_norm": 0.23187521600298203,
"learning_rate": 1.087973589503072e-05,
"loss": 0.7836,
"step": 2377
},
{
"epoch": 0.6545104245510218,
"grad_norm": 0.23309562053529873,
"learning_rate": 1.0864189140176512e-05,
"loss": 0.7766,
"step": 2378
},
{
"epoch": 0.6547856602215647,
"grad_norm": 0.22371015381882509,
"learning_rate": 1.0848649358126205e-05,
"loss": 0.7896,
"step": 2379
},
{
"epoch": 0.6550608958921076,
"grad_norm": 0.23349125197890194,
"learning_rate": 1.0833116560740361e-05,
"loss": 0.7665,
"step": 2380
},
{
"epoch": 0.6553361315626505,
"grad_norm": 0.22837551334809192,
"learning_rate": 1.0817590759874194e-05,
"loss": 0.7783,
"step": 2381
},
{
"epoch": 0.6556113672331935,
"grad_norm": 0.23338696297514103,
"learning_rate": 1.080207196737763e-05,
"loss": 0.7719,
"step": 2382
},
{
"epoch": 0.6558866029037363,
"grad_norm": 0.23425031111914862,
"learning_rate": 1.0786560195095181e-05,
"loss": 0.7842,
"step": 2383
},
{
"epoch": 0.6561618385742792,
"grad_norm": 0.21035023614697532,
"learning_rate": 1.0771055454866048e-05,
"loss": 0.7708,
"step": 2384
},
{
"epoch": 0.6564370742448221,
"grad_norm": 0.23647780094331225,
"learning_rate": 1.0755557758524033e-05,
"loss": 0.7643,
"step": 2385
},
{
"epoch": 0.656712309915365,
"grad_norm": 0.22346276087431097,
"learning_rate": 1.0740067117897586e-05,
"loss": 0.7624,
"step": 2386
},
{
"epoch": 0.6569875455859079,
"grad_norm": 0.23977669174938704,
"learning_rate": 1.0724583544809768e-05,
"loss": 0.799,
"step": 2387
},
{
"epoch": 0.6572627812564509,
"grad_norm": 0.22182448648270314,
"learning_rate": 1.0709107051078221e-05,
"loss": 0.7723,
"step": 2388
},
{
"epoch": 0.6575380169269938,
"grad_norm": 0.2194907430601821,
"learning_rate": 1.0693637648515228e-05,
"loss": 0.7838,
"step": 2389
},
{
"epoch": 0.6578132525975366,
"grad_norm": 0.229976596291359,
"learning_rate": 1.0678175348927615e-05,
"loss": 0.7704,
"step": 2390
},
{
"epoch": 0.6580884882680795,
"grad_norm": 0.2211099114616119,
"learning_rate": 1.0662720164116815e-05,
"loss": 0.7609,
"step": 2391
},
{
"epoch": 0.6583637239386224,
"grad_norm": 0.2180251462179224,
"learning_rate": 1.0647272105878833e-05,
"loss": 0.7689,
"step": 2392
},
{
"epoch": 0.6586389596091653,
"grad_norm": 0.2203189616623406,
"learning_rate": 1.06318311860042e-05,
"loss": 0.7471,
"step": 2393
},
{
"epoch": 0.6589141952797083,
"grad_norm": 0.22680169811459414,
"learning_rate": 1.0616397416278046e-05,
"loss": 0.777,
"step": 2394
},
{
"epoch": 0.6591894309502512,
"grad_norm": 0.22476041046566922,
"learning_rate": 1.0600970808479997e-05,
"loss": 0.7878,
"step": 2395
},
{
"epoch": 0.6594646666207941,
"grad_norm": 0.2453410077918203,
"learning_rate": 1.0585551374384246e-05,
"loss": 0.7492,
"step": 2396
},
{
"epoch": 0.6597399022913369,
"grad_norm": 0.22364329816030826,
"learning_rate": 1.0570139125759518e-05,
"loss": 0.7596,
"step": 2397
},
{
"epoch": 0.6600151379618798,
"grad_norm": 0.2437942537641047,
"learning_rate": 1.0554734074369017e-05,
"loss": 0.7816,
"step": 2398
},
{
"epoch": 0.6602903736324227,
"grad_norm": 0.2217240731203041,
"learning_rate": 1.0539336231970485e-05,
"loss": 0.7559,
"step": 2399
},
{
"epoch": 0.6605656093029657,
"grad_norm": 0.23474539493899937,
"learning_rate": 1.0523945610316138e-05,
"loss": 0.7722,
"step": 2400
},
{
"epoch": 0.6608408449735086,
"grad_norm": 0.23905008315022286,
"learning_rate": 1.0508562221152699e-05,
"loss": 0.7981,
"step": 2401
},
{
"epoch": 0.6611160806440515,
"grad_norm": 0.2179803554679985,
"learning_rate": 1.0493186076221376e-05,
"loss": 0.7887,
"step": 2402
},
{
"epoch": 0.6613913163145944,
"grad_norm": 0.30493275613170756,
"learning_rate": 1.0477817187257809e-05,
"loss": 0.7689,
"step": 2403
},
{
"epoch": 0.6616665519851372,
"grad_norm": 0.21878972190937115,
"learning_rate": 1.0462455565992161e-05,
"loss": 0.778,
"step": 2404
},
{
"epoch": 0.6619417876556801,
"grad_norm": 0.21404023319142085,
"learning_rate": 1.0447101224148994e-05,
"loss": 0.7717,
"step": 2405
},
{
"epoch": 0.6622170233262231,
"grad_norm": 0.22796253040553002,
"learning_rate": 1.043175417344734e-05,
"loss": 0.7785,
"step": 2406
},
{
"epoch": 0.662492258996766,
"grad_norm": 0.2276333695329629,
"learning_rate": 1.041641442560067e-05,
"loss": 0.7638,
"step": 2407
},
{
"epoch": 0.6627674946673089,
"grad_norm": 0.21183330904074812,
"learning_rate": 1.0401081992316857e-05,
"loss": 0.7583,
"step": 2408
},
{
"epoch": 0.6630427303378518,
"grad_norm": 0.2492603438225398,
"learning_rate": 1.038575688529822e-05,
"loss": 0.7733,
"step": 2409
},
{
"epoch": 0.6633179660083947,
"grad_norm": 0.2280886545169572,
"learning_rate": 1.0370439116241455e-05,
"loss": 0.8024,
"step": 2410
},
{
"epoch": 0.6635932016789375,
"grad_norm": 0.46669864005277895,
"learning_rate": 1.0355128696837702e-05,
"loss": 0.7827,
"step": 2411
},
{
"epoch": 0.6638684373494805,
"grad_norm": 0.2364701944158192,
"learning_rate": 1.033982563877244e-05,
"loss": 0.7802,
"step": 2412
},
{
"epoch": 0.6641436730200234,
"grad_norm": 0.2371027365459466,
"learning_rate": 1.0324529953725568e-05,
"loss": 0.8017,
"step": 2413
},
{
"epoch": 0.6644189086905663,
"grad_norm": 0.2358545000830875,
"learning_rate": 1.0309241653371347e-05,
"loss": 0.7668,
"step": 2414
},
{
"epoch": 0.6646941443611092,
"grad_norm": 0.22808108263937973,
"learning_rate": 1.0293960749378384e-05,
"loss": 0.7726,
"step": 2415
},
{
"epoch": 0.6649693800316521,
"grad_norm": 0.22541924299814917,
"learning_rate": 1.0278687253409662e-05,
"loss": 0.7537,
"step": 2416
},
{
"epoch": 0.665244615702195,
"grad_norm": 0.2477169909305548,
"learning_rate": 1.0263421177122505e-05,
"loss": 0.7952,
"step": 2417
},
{
"epoch": 0.665519851372738,
"grad_norm": 0.23406721745364348,
"learning_rate": 1.0248162532168574e-05,
"loss": 0.799,
"step": 2418
},
{
"epoch": 0.6657950870432808,
"grad_norm": 0.22671711008442896,
"learning_rate": 1.0232911330193861e-05,
"loss": 0.7721,
"step": 2419
},
{
"epoch": 0.6660703227138237,
"grad_norm": 0.23392243064526896,
"learning_rate": 1.021766758283866e-05,
"loss": 0.7963,
"step": 2420
},
{
"epoch": 0.6663455583843666,
"grad_norm": 0.22942307844003806,
"learning_rate": 1.02024313017376e-05,
"loss": 0.7507,
"step": 2421
},
{
"epoch": 0.6666207940549095,
"grad_norm": 0.21580367183403693,
"learning_rate": 1.0187202498519588e-05,
"loss": 0.7794,
"step": 2422
},
{
"epoch": 0.6668960297254524,
"grad_norm": 0.24088443619876312,
"learning_rate": 1.017198118480784e-05,
"loss": 0.7978,
"step": 2423
},
{
"epoch": 0.6671712653959954,
"grad_norm": 0.21582027508119267,
"learning_rate": 1.0156767372219854e-05,
"loss": 0.7913,
"step": 2424
},
{
"epoch": 0.6674465010665382,
"grad_norm": 0.2397917395336763,
"learning_rate": 1.0141561072367396e-05,
"loss": 0.7794,
"step": 2425
},
{
"epoch": 0.6677217367370811,
"grad_norm": 0.23306780283956408,
"learning_rate": 1.0126362296856511e-05,
"loss": 0.7555,
"step": 2426
},
{
"epoch": 0.667996972407624,
"grad_norm": 0.22743305802924532,
"learning_rate": 1.0111171057287477e-05,
"loss": 0.7534,
"step": 2427
},
{
"epoch": 0.6682722080781669,
"grad_norm": 0.22907358964366473,
"learning_rate": 1.0095987365254843e-05,
"loss": 0.766,
"step": 2428
},
{
"epoch": 0.6685474437487098,
"grad_norm": 0.22599697983218686,
"learning_rate": 1.0080811232347396e-05,
"loss": 0.7926,
"step": 2429
},
{
"epoch": 0.6688226794192528,
"grad_norm": 0.2288149061038424,
"learning_rate": 1.006564267014813e-05,
"loss": 0.7393,
"step": 2430
},
{
"epoch": 0.6690979150897957,
"grad_norm": 0.2316541769565441,
"learning_rate": 1.005048169023429e-05,
"loss": 0.7778,
"step": 2431
},
{
"epoch": 0.6693731507603385,
"grad_norm": 0.23462334035713,
"learning_rate": 1.003532830417732e-05,
"loss": 0.7878,
"step": 2432
},
{
"epoch": 0.6696483864308814,
"grad_norm": 0.22281533873076556,
"learning_rate": 1.0020182523542869e-05,
"loss": 0.7815,
"step": 2433
},
{
"epoch": 0.6699236221014243,
"grad_norm": 0.2309986177915826,
"learning_rate": 1.000504435989079e-05,
"loss": 0.7658,
"step": 2434
},
{
"epoch": 0.6701988577719672,
"grad_norm": 0.22486743102674223,
"learning_rate": 9.9899138247751e-06,
"loss": 0.7823,
"step": 2435
},
{
"epoch": 0.6704740934425102,
"grad_norm": 0.23502472525632212,
"learning_rate": 9.974790929744021e-06,
"loss": 0.7657,
"step": 2436
},
{
"epoch": 0.6707493291130531,
"grad_norm": 0.24519665537909452,
"learning_rate": 9.959675686339918e-06,
"loss": 0.7782,
"step": 2437
},
{
"epoch": 0.671024564783596,
"grad_norm": 0.22979213514478425,
"learning_rate": 9.944568106099336e-06,
"loss": 0.7671,
"step": 2438
},
{
"epoch": 0.6712998004541388,
"grad_norm": 0.25216614197100384,
"learning_rate": 9.929468200552963e-06,
"loss": 0.789,
"step": 2439
},
{
"epoch": 0.6715750361246817,
"grad_norm": 0.23075737764323775,
"learning_rate": 9.914375981225632e-06,
"loss": 0.7888,
"step": 2440
},
{
"epoch": 0.6718502717952246,
"grad_norm": 0.22458144886045955,
"learning_rate": 9.899291459636316e-06,
"loss": 0.7749,
"step": 2441
},
{
"epoch": 0.6721255074657676,
"grad_norm": 0.23322259378345364,
"learning_rate": 9.884214647298087e-06,
"loss": 0.7985,
"step": 2442
},
{
"epoch": 0.6724007431363105,
"grad_norm": 0.23492890404689232,
"learning_rate": 9.869145555718162e-06,
"loss": 0.7948,
"step": 2443
},
{
"epoch": 0.6726759788068534,
"grad_norm": 0.22469135354354047,
"learning_rate": 9.854084196397859e-06,
"loss": 0.7704,
"step": 2444
},
{
"epoch": 0.6729512144773963,
"grad_norm": 0.2220893929783055,
"learning_rate": 9.839030580832573e-06,
"loss": 0.776,
"step": 2445
},
{
"epoch": 0.6732264501479391,
"grad_norm": 0.23533516873712618,
"learning_rate": 9.823984720511816e-06,
"loss": 0.7762,
"step": 2446
},
{
"epoch": 0.673501685818482,
"grad_norm": 0.2206522024450872,
"learning_rate": 9.808946626919172e-06,
"loss": 0.8001,
"step": 2447
},
{
"epoch": 0.673776921489025,
"grad_norm": 0.2253775746862108,
"learning_rate": 9.793916311532294e-06,
"loss": 0.8135,
"step": 2448
},
{
"epoch": 0.6740521571595679,
"grad_norm": 0.2269850088529325,
"learning_rate": 9.778893785822894e-06,
"loss": 0.8209,
"step": 2449
},
{
"epoch": 0.6743273928301108,
"grad_norm": 0.3225439821012345,
"learning_rate": 9.763879061256744e-06,
"loss": 0.7663,
"step": 2450
},
{
"epoch": 0.6746026285006537,
"grad_norm": 0.21724346561672142,
"learning_rate": 9.748872149293678e-06,
"loss": 0.7899,
"step": 2451
},
{
"epoch": 0.6748778641711966,
"grad_norm": 0.2376174096802007,
"learning_rate": 9.733873061387527e-06,
"loss": 0.7699,
"step": 2452
},
{
"epoch": 0.6751530998417394,
"grad_norm": 0.21132385126114916,
"learning_rate": 9.718881808986186e-06,
"loss": 0.7823,
"step": 2453
},
{
"epoch": 0.6754283355122824,
"grad_norm": 0.21257845733422154,
"learning_rate": 9.703898403531561e-06,
"loss": 0.7415,
"step": 2454
},
{
"epoch": 0.6757035711828253,
"grad_norm": 0.21390548304356413,
"learning_rate": 9.688922856459563e-06,
"loss": 0.7637,
"step": 2455
},
{
"epoch": 0.6759788068533682,
"grad_norm": 0.21537758501429546,
"learning_rate": 9.673955179200116e-06,
"loss": 0.7669,
"step": 2456
},
{
"epoch": 0.6762540425239111,
"grad_norm": 0.22287252512032207,
"learning_rate": 9.658995383177114e-06,
"loss": 0.7623,
"step": 2457
},
{
"epoch": 0.676529278194454,
"grad_norm": 0.3495194670430522,
"learning_rate": 9.64404347980847e-06,
"loss": 0.7977,
"step": 2458
},
{
"epoch": 0.6768045138649968,
"grad_norm": 0.21585522136163524,
"learning_rate": 9.629099480506034e-06,
"loss": 0.7675,
"step": 2459
},
{
"epoch": 0.6770797495355398,
"grad_norm": 0.22049895208382783,
"learning_rate": 9.614163396675657e-06,
"loss": 0.7688,
"step": 2460
},
{
"epoch": 0.6773549852060827,
"grad_norm": 0.22841228167081598,
"learning_rate": 9.599235239717131e-06,
"loss": 0.7805,
"step": 2461
},
{
"epoch": 0.6776302208766256,
"grad_norm": 0.20867410171905978,
"learning_rate": 9.584315021024205e-06,
"loss": 0.766,
"step": 2462
},
{
"epoch": 0.6779054565471685,
"grad_norm": 0.21957400992775994,
"learning_rate": 9.56940275198457e-06,
"loss": 0.7574,
"step": 2463
},
{
"epoch": 0.6781806922177114,
"grad_norm": 0.22692495941500024,
"learning_rate": 9.554498443979837e-06,
"loss": 0.7628,
"step": 2464
},
{
"epoch": 0.6784559278882544,
"grad_norm": 0.21468268215342132,
"learning_rate": 9.539602108385551e-06,
"loss": 0.7595,
"step": 2465
},
{
"epoch": 0.6787311635587973,
"grad_norm": 0.22226371709403886,
"learning_rate": 9.524713756571185e-06,
"loss": 0.7792,
"step": 2466
},
{
"epoch": 0.6790063992293401,
"grad_norm": 0.2231191835609718,
"learning_rate": 9.509833399900076e-06,
"loss": 0.789,
"step": 2467
},
{
"epoch": 0.679281634899883,
"grad_norm": 0.2230108118422868,
"learning_rate": 9.494961049729521e-06,
"loss": 0.7615,
"step": 2468
},
{
"epoch": 0.6795568705704259,
"grad_norm": 0.2189954800987077,
"learning_rate": 9.480096717410647e-06,
"loss": 0.7934,
"step": 2469
},
{
"epoch": 0.6798321062409688,
"grad_norm": 0.21776374077364447,
"learning_rate": 9.465240414288505e-06,
"loss": 0.7803,
"step": 2470
},
{
"epoch": 0.6801073419115118,
"grad_norm": 0.21921862823179844,
"learning_rate": 9.450392151701983e-06,
"loss": 0.7754,
"step": 2471
},
{
"epoch": 0.6803825775820547,
"grad_norm": 0.2241906980218283,
"learning_rate": 9.435551940983859e-06,
"loss": 0.7765,
"step": 2472
},
{
"epoch": 0.6806578132525976,
"grad_norm": 0.2253310478368545,
"learning_rate": 9.420719793460758e-06,
"loss": 0.795,
"step": 2473
},
{
"epoch": 0.6809330489231404,
"grad_norm": 0.2173925233300184,
"learning_rate": 9.405895720453128e-06,
"loss": 0.7785,
"step": 2474
},
{
"epoch": 0.6812082845936833,
"grad_norm": 0.30708324943746157,
"learning_rate": 9.391079733275306e-06,
"loss": 0.775,
"step": 2475
},
{
"epoch": 0.6814835202642262,
"grad_norm": 0.22534096606040097,
"learning_rate": 9.3762718432354e-06,
"loss": 0.8064,
"step": 2476
},
{
"epoch": 0.6817587559347692,
"grad_norm": 0.2195259041646853,
"learning_rate": 9.361472061635374e-06,
"loss": 0.7918,
"step": 2477
},
{
"epoch": 0.6820339916053121,
"grad_norm": 0.2100772620504874,
"learning_rate": 9.346680399771003e-06,
"loss": 0.7758,
"step": 2478
},
{
"epoch": 0.682309227275855,
"grad_norm": 0.2116302368744064,
"learning_rate": 9.331896868931834e-06,
"loss": 0.7545,
"step": 2479
},
{
"epoch": 0.6825844629463979,
"grad_norm": 0.221247983582461,
"learning_rate": 9.317121480401245e-06,
"loss": 0.7725,
"step": 2480
},
{
"epoch": 0.6828596986169407,
"grad_norm": 0.21351777609821598,
"learning_rate": 9.302354245456367e-06,
"loss": 0.772,
"step": 2481
},
{
"epoch": 0.6831349342874836,
"grad_norm": 0.22766266731837248,
"learning_rate": 9.287595175368143e-06,
"loss": 0.7588,
"step": 2482
},
{
"epoch": 0.6834101699580266,
"grad_norm": 0.22936644820810378,
"learning_rate": 9.272844281401263e-06,
"loss": 0.7675,
"step": 2483
},
{
"epoch": 0.6836854056285695,
"grad_norm": 0.3757291981749503,
"learning_rate": 9.25810157481417e-06,
"loss": 0.7857,
"step": 2484
},
{
"epoch": 0.6839606412991124,
"grad_norm": 0.23193772438613108,
"learning_rate": 9.243367066859077e-06,
"loss": 0.7793,
"step": 2485
},
{
"epoch": 0.6842358769696553,
"grad_norm": 0.24922646157771597,
"learning_rate": 9.228640768781919e-06,
"loss": 0.7559,
"step": 2486
},
{
"epoch": 0.6845111126401981,
"grad_norm": 0.22382613600955226,
"learning_rate": 9.21392269182238e-06,
"loss": 0.7648,
"step": 2487
},
{
"epoch": 0.684786348310741,
"grad_norm": 0.2386437090117118,
"learning_rate": 9.199212847213866e-06,
"loss": 0.7733,
"step": 2488
},
{
"epoch": 0.685061583981284,
"grad_norm": 0.22702347702387204,
"learning_rate": 9.1845112461835e-06,
"loss": 0.7695,
"step": 2489
},
{
"epoch": 0.6853368196518269,
"grad_norm": 0.2560279503686138,
"learning_rate": 9.16981789995212e-06,
"loss": 0.802,
"step": 2490
},
{
"epoch": 0.6856120553223698,
"grad_norm": 0.22291033943009855,
"learning_rate": 9.15513281973424e-06,
"loss": 0.785,
"step": 2491
},
{
"epoch": 0.6858872909929127,
"grad_norm": 0.218627646927319,
"learning_rate": 9.140456016738086e-06,
"loss": 0.7469,
"step": 2492
},
{
"epoch": 0.6861625266634556,
"grad_norm": 0.23721128902782118,
"learning_rate": 9.125787502165573e-06,
"loss": 0.7786,
"step": 2493
},
{
"epoch": 0.6864377623339984,
"grad_norm": 0.2233248862456069,
"learning_rate": 9.11112728721226e-06,
"loss": 0.7737,
"step": 2494
},
{
"epoch": 0.6867129980045414,
"grad_norm": 0.21617977016839804,
"learning_rate": 9.096475383067398e-06,
"loss": 0.7729,
"step": 2495
},
{
"epoch": 0.6869882336750843,
"grad_norm": 0.2324198017017012,
"learning_rate": 9.081831800913885e-06,
"loss": 0.8005,
"step": 2496
},
{
"epoch": 0.6872634693456272,
"grad_norm": 0.4091011695050855,
"learning_rate": 9.067196551928279e-06,
"loss": 0.8117,
"step": 2497
},
{
"epoch": 0.6875387050161701,
"grad_norm": 0.22409578706629474,
"learning_rate": 9.05256964728075e-06,
"loss": 0.7565,
"step": 2498
},
{
"epoch": 0.687813940686713,
"grad_norm": 0.21561504830453673,
"learning_rate": 9.03795109813513e-06,
"loss": 0.784,
"step": 2499
},
{
"epoch": 0.6880891763572559,
"grad_norm": 0.3939017904240304,
"learning_rate": 9.02334091564886e-06,
"loss": 0.8239,
"step": 2500
},
{
"epoch": 0.6883644120277989,
"grad_norm": 0.21486174978119335,
"learning_rate": 9.008739110972986e-06,
"loss": 0.7842,
"step": 2501
},
{
"epoch": 0.6886396476983417,
"grad_norm": 0.21701146730722043,
"learning_rate": 8.994145695252174e-06,
"loss": 0.7635,
"step": 2502
},
{
"epoch": 0.6889148833688846,
"grad_norm": 0.3449154128415657,
"learning_rate": 8.979560679624687e-06,
"loss": 0.7787,
"step": 2503
},
{
"epoch": 0.6891901190394275,
"grad_norm": 0.22169093987856153,
"learning_rate": 8.964984075222368e-06,
"loss": 0.7618,
"step": 2504
},
{
"epoch": 0.6894653547099704,
"grad_norm": 0.2206818582087166,
"learning_rate": 8.950415893170657e-06,
"loss": 0.7735,
"step": 2505
},
{
"epoch": 0.6897405903805133,
"grad_norm": 0.2345879175576268,
"learning_rate": 8.935856144588532e-06,
"loss": 0.7689,
"step": 2506
},
{
"epoch": 0.6900158260510563,
"grad_norm": 0.21893015317455772,
"learning_rate": 8.921304840588578e-06,
"loss": 0.7737,
"step": 2507
},
{
"epoch": 0.6902910617215992,
"grad_norm": 0.22027550928832026,
"learning_rate": 8.906761992276893e-06,
"loss": 0.7777,
"step": 2508
},
{
"epoch": 0.690566297392142,
"grad_norm": 0.2435470123997399,
"learning_rate": 8.89222761075315e-06,
"loss": 0.7964,
"step": 2509
},
{
"epoch": 0.6908415330626849,
"grad_norm": 0.2196593302073358,
"learning_rate": 8.87770170711055e-06,
"loss": 0.75,
"step": 2510
},
{
"epoch": 0.6911167687332278,
"grad_norm": 0.21792435152172385,
"learning_rate": 8.863184292435828e-06,
"loss": 0.7402,
"step": 2511
},
{
"epoch": 0.6913920044037707,
"grad_norm": 0.2235878784830696,
"learning_rate": 8.848675377809235e-06,
"loss": 0.7886,
"step": 2512
},
{
"epoch": 0.6916672400743137,
"grad_norm": 0.2302819263673517,
"learning_rate": 8.834174974304526e-06,
"loss": 0.7951,
"step": 2513
},
{
"epoch": 0.6919424757448566,
"grad_norm": 0.22502686844395442,
"learning_rate": 8.819683092988978e-06,
"loss": 0.7842,
"step": 2514
},
{
"epoch": 0.6922177114153995,
"grad_norm": 0.22555359566475297,
"learning_rate": 8.805199744923356e-06,
"loss": 0.7856,
"step": 2515
},
{
"epoch": 0.6924929470859423,
"grad_norm": 0.21195604623711484,
"learning_rate": 8.790724941161904e-06,
"loss": 0.7728,
"step": 2516
},
{
"epoch": 0.6927681827564852,
"grad_norm": 0.23149331987418773,
"learning_rate": 8.776258692752355e-06,
"loss": 0.7898,
"step": 2517
},
{
"epoch": 0.6930434184270281,
"grad_norm": 0.227401694556156,
"learning_rate": 8.761801010735906e-06,
"loss": 0.7655,
"step": 2518
},
{
"epoch": 0.6933186540975711,
"grad_norm": 0.21109232570009917,
"learning_rate": 8.747351906147225e-06,
"loss": 0.7716,
"step": 2519
},
{
"epoch": 0.693593889768114,
"grad_norm": 0.2207805835482109,
"learning_rate": 8.73291139001443e-06,
"loss": 0.7424,
"step": 2520
},
{
"epoch": 0.6938691254386569,
"grad_norm": 0.24099588906668826,
"learning_rate": 8.718479473359067e-06,
"loss": 0.7848,
"step": 2521
},
{
"epoch": 0.6941443611091997,
"grad_norm": 0.21952719725201358,
"learning_rate": 8.704056167196148e-06,
"loss": 0.7934,
"step": 2522
},
{
"epoch": 0.6944195967797426,
"grad_norm": 0.22370261913303857,
"learning_rate": 8.689641482534083e-06,
"loss": 0.7637,
"step": 2523
},
{
"epoch": 0.6946948324502855,
"grad_norm": 0.2238425402607639,
"learning_rate": 8.675235430374722e-06,
"loss": 0.7738,
"step": 2524
},
{
"epoch": 0.6949700681208285,
"grad_norm": 0.30398606659718463,
"learning_rate": 8.660838021713323e-06,
"loss": 0.807,
"step": 2525
},
{
"epoch": 0.6952453037913714,
"grad_norm": 0.22191070111807776,
"learning_rate": 8.646449267538544e-06,
"loss": 0.7752,
"step": 2526
},
{
"epoch": 0.6955205394619143,
"grad_norm": 0.22203322308020254,
"learning_rate": 8.632069178832445e-06,
"loss": 0.7415,
"step": 2527
},
{
"epoch": 0.6957957751324572,
"grad_norm": 0.23590067454635755,
"learning_rate": 8.617697766570449e-06,
"loss": 0.7796,
"step": 2528
},
{
"epoch": 0.696071010803,
"grad_norm": 0.22047356060959447,
"learning_rate": 8.603335041721386e-06,
"loss": 0.7672,
"step": 2529
},
{
"epoch": 0.6963462464735429,
"grad_norm": 0.22598565446047506,
"learning_rate": 8.588981015247443e-06,
"loss": 0.7847,
"step": 2530
},
{
"epoch": 0.6966214821440859,
"grad_norm": 0.22701575166779644,
"learning_rate": 8.57463569810415e-06,
"loss": 0.7649,
"step": 2531
},
{
"epoch": 0.6968967178146288,
"grad_norm": 0.22044350576235772,
"learning_rate": 8.560299101240436e-06,
"loss": 0.7673,
"step": 2532
},
{
"epoch": 0.6971719534851717,
"grad_norm": 0.215508752325192,
"learning_rate": 8.545971235598524e-06,
"loss": 0.7686,
"step": 2533
},
{
"epoch": 0.6974471891557146,
"grad_norm": 0.22641735357232648,
"learning_rate": 8.531652112114011e-06,
"loss": 0.7628,
"step": 2534
},
{
"epoch": 0.6977224248262575,
"grad_norm": 0.23307235794961992,
"learning_rate": 8.517341741715787e-06,
"loss": 0.7756,
"step": 2535
},
{
"epoch": 0.6979976604968003,
"grad_norm": 0.21664296251972612,
"learning_rate": 8.503040135326088e-06,
"loss": 0.7779,
"step": 2536
},
{
"epoch": 0.6982728961673433,
"grad_norm": 0.22873303552461818,
"learning_rate": 8.488747303860463e-06,
"loss": 0.7883,
"step": 2537
},
{
"epoch": 0.6985481318378862,
"grad_norm": 0.2272053264983479,
"learning_rate": 8.474463258227727e-06,
"loss": 0.7853,
"step": 2538
},
{
"epoch": 0.6988233675084291,
"grad_norm": 0.20952576876841586,
"learning_rate": 8.460188009330049e-06,
"loss": 0.7664,
"step": 2539
},
{
"epoch": 0.699098603178972,
"grad_norm": 0.23100765351637462,
"learning_rate": 8.445921568062826e-06,
"loss": 0.774,
"step": 2540
},
{
"epoch": 0.6993738388495149,
"grad_norm": 0.22587364684668013,
"learning_rate": 8.431663945314766e-06,
"loss": 0.7656,
"step": 2541
},
{
"epoch": 0.6996490745200578,
"grad_norm": 0.22018981683186592,
"learning_rate": 8.417415151967842e-06,
"loss": 0.7827,
"step": 2542
},
{
"epoch": 0.6999243101906008,
"grad_norm": 0.23146857047713387,
"learning_rate": 8.403175198897276e-06,
"loss": 0.7704,
"step": 2543
},
{
"epoch": 0.7001995458611436,
"grad_norm": 0.22218270447001012,
"learning_rate": 8.388944096971556e-06,
"loss": 0.7794,
"step": 2544
},
{
"epoch": 0.7004747815316865,
"grad_norm": 0.22157050442550313,
"learning_rate": 8.374721857052395e-06,
"loss": 0.8121,
"step": 2545
},
{
"epoch": 0.7007500172022294,
"grad_norm": 0.22820270674719595,
"learning_rate": 8.360508489994781e-06,
"loss": 0.7765,
"step": 2546
},
{
"epoch": 0.7010252528727723,
"grad_norm": 0.21783442266235062,
"learning_rate": 8.346304006646884e-06,
"loss": 0.7874,
"step": 2547
},
{
"epoch": 0.7013004885433152,
"grad_norm": 0.21583686423778445,
"learning_rate": 8.33210841785012e-06,
"loss": 0.7603,
"step": 2548
},
{
"epoch": 0.7015757242138582,
"grad_norm": 0.21547658169077147,
"learning_rate": 8.317921734439122e-06,
"loss": 0.7765,
"step": 2549
},
{
"epoch": 0.701850959884401,
"grad_norm": 0.2189600409277528,
"learning_rate": 8.3037439672417e-06,
"loss": 0.7983,
"step": 2550
},
{
"epoch": 0.7021261955549439,
"grad_norm": 0.23042073193250784,
"learning_rate": 8.289575127078877e-06,
"loss": 0.7741,
"step": 2551
},
{
"epoch": 0.7024014312254868,
"grad_norm": 0.21489622516586931,
"learning_rate": 8.275415224764871e-06,
"loss": 0.8043,
"step": 2552
},
{
"epoch": 0.7026766668960297,
"grad_norm": 0.22017222680919535,
"learning_rate": 8.261264271107043e-06,
"loss": 0.7568,
"step": 2553
},
{
"epoch": 0.7029519025665726,
"grad_norm": 0.21867384731949382,
"learning_rate": 8.247122276905976e-06,
"loss": 0.7731,
"step": 2554
},
{
"epoch": 0.7032271382371156,
"grad_norm": 0.22381432608871324,
"learning_rate": 8.232989252955369e-06,
"loss": 0.7767,
"step": 2555
},
{
"epoch": 0.7035023739076585,
"grad_norm": 0.22301434456062752,
"learning_rate": 8.2188652100421e-06,
"loss": 0.7646,
"step": 2556
},
{
"epoch": 0.7037776095782013,
"grad_norm": 0.22163531837428702,
"learning_rate": 8.204750158946173e-06,
"loss": 0.7736,
"step": 2557
},
{
"epoch": 0.7040528452487442,
"grad_norm": 0.23481395694357782,
"learning_rate": 8.190644110440748e-06,
"loss": 0.7832,
"step": 2558
},
{
"epoch": 0.7043280809192871,
"grad_norm": 0.20940601239892792,
"learning_rate": 8.176547075292116e-06,
"loss": 0.7766,
"step": 2559
},
{
"epoch": 0.70460331658983,
"grad_norm": 0.2209708685015769,
"learning_rate": 8.162459064259653e-06,
"loss": 0.7971,
"step": 2560
},
{
"epoch": 0.704878552260373,
"grad_norm": 0.21719861143626087,
"learning_rate": 8.148380088095904e-06,
"loss": 0.7778,
"step": 2561
},
{
"epoch": 0.7051537879309159,
"grad_norm": 0.21693256303005057,
"learning_rate": 8.134310157546466e-06,
"loss": 0.755,
"step": 2562
},
{
"epoch": 0.7054290236014588,
"grad_norm": 0.21658199171877832,
"learning_rate": 8.120249283350061e-06,
"loss": 0.7702,
"step": 2563
},
{
"epoch": 0.7057042592720016,
"grad_norm": 0.22693379810642794,
"learning_rate": 8.1061974762385e-06,
"loss": 0.7756,
"step": 2564
},
{
"epoch": 0.7059794949425445,
"grad_norm": 0.22557625670715278,
"learning_rate": 8.09215474693665e-06,
"loss": 0.7947,
"step": 2565
},
{
"epoch": 0.7062547306130874,
"grad_norm": 0.3560822885307025,
"learning_rate": 8.078121106162475e-06,
"loss": 0.7981,
"step": 2566
},
{
"epoch": 0.7065299662836304,
"grad_norm": 0.21982419278393076,
"learning_rate": 8.064096564626977e-06,
"loss": 0.7747,
"step": 2567
},
{
"epoch": 0.7068052019541733,
"grad_norm": 0.216863233336889,
"learning_rate": 8.050081133034247e-06,
"loss": 0.789,
"step": 2568
},
{
"epoch": 0.7070804376247162,
"grad_norm": 0.2129641511926811,
"learning_rate": 8.036074822081401e-06,
"loss": 0.7775,
"step": 2569
},
{
"epoch": 0.7073556732952591,
"grad_norm": 0.2185402896419739,
"learning_rate": 8.022077642458588e-06,
"loss": 0.7856,
"step": 2570
},
{
"epoch": 0.7076309089658019,
"grad_norm": 0.20996769819736522,
"learning_rate": 8.008089604849008e-06,
"loss": 0.7365,
"step": 2571
},
{
"epoch": 0.7079061446363448,
"grad_norm": 0.21688214146662685,
"learning_rate": 7.994110719928856e-06,
"loss": 0.7757,
"step": 2572
},
{
"epoch": 0.7081813803068878,
"grad_norm": 0.2185082099924713,
"learning_rate": 7.980140998367365e-06,
"loss": 0.7599,
"step": 2573
},
{
"epoch": 0.7084566159774307,
"grad_norm": 0.2336201524887943,
"learning_rate": 7.966180450826768e-06,
"loss": 0.8186,
"step": 2574
},
{
"epoch": 0.7087318516479736,
"grad_norm": 0.21981919163309177,
"learning_rate": 7.952229087962296e-06,
"loss": 0.7776,
"step": 2575
},
{
"epoch": 0.7090070873185165,
"grad_norm": 0.21652149541995094,
"learning_rate": 7.938286920422169e-06,
"loss": 0.7644,
"step": 2576
},
{
"epoch": 0.7092823229890594,
"grad_norm": 0.23425169143460922,
"learning_rate": 7.92435395884758e-06,
"loss": 0.7653,
"step": 2577
},
{
"epoch": 0.7095575586596022,
"grad_norm": 0.21985760553119063,
"learning_rate": 7.910430213872709e-06,
"loss": 0.7609,
"step": 2578
},
{
"epoch": 0.7098327943301452,
"grad_norm": 0.22588238810554612,
"learning_rate": 7.896515696124703e-06,
"loss": 0.7726,
"step": 2579
},
{
"epoch": 0.7101080300006881,
"grad_norm": 0.23218861287312292,
"learning_rate": 7.882610416223644e-06,
"loss": 0.8013,
"step": 2580
},
{
"epoch": 0.710383265671231,
"grad_norm": 0.22362351695436455,
"learning_rate": 7.868714384782588e-06,
"loss": 0.7775,
"step": 2581
},
{
"epoch": 0.7106585013417739,
"grad_norm": 0.24388419406285858,
"learning_rate": 7.854827612407521e-06,
"loss": 0.797,
"step": 2582
},
{
"epoch": 0.7109337370123168,
"grad_norm": 0.21752661884274282,
"learning_rate": 7.840950109697373e-06,
"loss": 0.7888,
"step": 2583
},
{
"epoch": 0.7112089726828597,
"grad_norm": 0.23559152832695637,
"learning_rate": 7.82708188724398e-06,
"loss": 0.7741,
"step": 2584
},
{
"epoch": 0.7114842083534026,
"grad_norm": 0.21694960124158888,
"learning_rate": 7.813222955632107e-06,
"loss": 0.7652,
"step": 2585
},
{
"epoch": 0.7117594440239455,
"grad_norm": 0.21834541915733874,
"learning_rate": 7.799373325439435e-06,
"loss": 0.7905,
"step": 2586
},
{
"epoch": 0.7120346796944884,
"grad_norm": 0.21797658290212968,
"learning_rate": 7.785533007236521e-06,
"loss": 0.7688,
"step": 2587
},
{
"epoch": 0.7123099153650313,
"grad_norm": 0.21881153505452441,
"learning_rate": 7.77170201158684e-06,
"loss": 0.7949,
"step": 2588
},
{
"epoch": 0.7125851510355742,
"grad_norm": 0.21258110515309403,
"learning_rate": 7.757880349046742e-06,
"loss": 0.7845,
"step": 2589
},
{
"epoch": 0.7128603867061171,
"grad_norm": 0.25572637344952137,
"learning_rate": 7.744068030165454e-06,
"loss": 0.7618,
"step": 2590
},
{
"epoch": 0.7131356223766601,
"grad_norm": 0.21293292230523622,
"learning_rate": 7.730265065485082e-06,
"loss": 0.8043,
"step": 2591
},
{
"epoch": 0.713410858047203,
"grad_norm": 0.23308784980622776,
"learning_rate": 7.71647146554056e-06,
"loss": 0.7771,
"step": 2592
},
{
"epoch": 0.7136860937177458,
"grad_norm": 0.23235681475884892,
"learning_rate": 7.702687240859717e-06,
"loss": 0.7834,
"step": 2593
},
{
"epoch": 0.7139613293882887,
"grad_norm": 0.22205098937173648,
"learning_rate": 7.68891240196319e-06,
"loss": 0.758,
"step": 2594
},
{
"epoch": 0.7142365650588316,
"grad_norm": 0.23388667670185762,
"learning_rate": 7.675146959364473e-06,
"loss": 0.7623,
"step": 2595
},
{
"epoch": 0.7145118007293745,
"grad_norm": 0.21123479306711065,
"learning_rate": 7.661390923569889e-06,
"loss": 0.7607,
"step": 2596
},
{
"epoch": 0.7147870363999175,
"grad_norm": 0.4441814421607099,
"learning_rate": 7.647644305078572e-06,
"loss": 0.7899,
"step": 2597
},
{
"epoch": 0.7150622720704604,
"grad_norm": 0.22675347109781566,
"learning_rate": 7.63390711438248e-06,
"loss": 0.7615,
"step": 2598
},
{
"epoch": 0.7153375077410032,
"grad_norm": 0.23992857226961903,
"learning_rate": 7.620179361966356e-06,
"loss": 0.7916,
"step": 2599
},
{
"epoch": 0.7156127434115461,
"grad_norm": 0.24172365626282546,
"learning_rate": 7.606461058307755e-06,
"loss": 0.7608,
"step": 2600
},
{
"epoch": 0.715887979082089,
"grad_norm": 0.24581114930095574,
"learning_rate": 7.592752213877026e-06,
"loss": 0.7643,
"step": 2601
},
{
"epoch": 0.7161632147526319,
"grad_norm": 0.23243543587662152,
"learning_rate": 7.579052839137273e-06,
"loss": 0.7975,
"step": 2602
},
{
"epoch": 0.7164384504231749,
"grad_norm": 0.22742501150177027,
"learning_rate": 7.565362944544396e-06,
"loss": 0.7565,
"step": 2603
},
{
"epoch": 0.7167136860937178,
"grad_norm": 0.20860500190427597,
"learning_rate": 7.551682540547054e-06,
"loss": 0.7661,
"step": 2604
},
{
"epoch": 0.7169889217642607,
"grad_norm": 0.22148520453669318,
"learning_rate": 7.538011637586658e-06,
"loss": 0.7691,
"step": 2605
},
{
"epoch": 0.7172641574348035,
"grad_norm": 0.22797264889547875,
"learning_rate": 7.524350246097374e-06,
"loss": 0.7616,
"step": 2606
},
{
"epoch": 0.7175393931053464,
"grad_norm": 0.2130472130988018,
"learning_rate": 7.510698376506091e-06,
"loss": 0.7753,
"step": 2607
},
{
"epoch": 0.7178146287758893,
"grad_norm": 0.4091533442654354,
"learning_rate": 7.497056039232462e-06,
"loss": 0.7764,
"step": 2608
},
{
"epoch": 0.7180898644464323,
"grad_norm": 0.23280487333957706,
"learning_rate": 7.483423244688828e-06,
"loss": 0.8078,
"step": 2609
},
{
"epoch": 0.7183651001169752,
"grad_norm": 0.21388497928006925,
"learning_rate": 7.46980000328027e-06,
"loss": 0.765,
"step": 2610
},
{
"epoch": 0.7186403357875181,
"grad_norm": 0.23504724160770063,
"learning_rate": 7.456186325404575e-06,
"loss": 0.7808,
"step": 2611
},
{
"epoch": 0.718915571458061,
"grad_norm": 0.23494718344875026,
"learning_rate": 7.44258222145223e-06,
"loss": 0.7801,
"step": 2612
},
{
"epoch": 0.7191908071286038,
"grad_norm": 0.22478128339705314,
"learning_rate": 7.428987701806416e-06,
"loss": 0.774,
"step": 2613
},
{
"epoch": 0.7194660427991467,
"grad_norm": 0.22527016656773594,
"learning_rate": 7.415402776842982e-06,
"loss": 0.7782,
"step": 2614
},
{
"epoch": 0.7197412784696897,
"grad_norm": 0.2248601002155795,
"learning_rate": 7.401827456930477e-06,
"loss": 0.7948,
"step": 2615
},
{
"epoch": 0.7200165141402326,
"grad_norm": 0.257821893062983,
"learning_rate": 7.388261752430115e-06,
"loss": 0.7868,
"step": 2616
},
{
"epoch": 0.7202917498107755,
"grad_norm": 0.2141776789864948,
"learning_rate": 7.374705673695748e-06,
"loss": 0.8008,
"step": 2617
},
{
"epoch": 0.7205669854813184,
"grad_norm": 0.22378162305974467,
"learning_rate": 7.361159231073922e-06,
"loss": 0.7841,
"step": 2618
},
{
"epoch": 0.7208422211518613,
"grad_norm": 0.21435602613738422,
"learning_rate": 7.347622434903787e-06,
"loss": 0.7785,
"step": 2619
},
{
"epoch": 0.7211174568224041,
"grad_norm": 0.22718048855111325,
"learning_rate": 7.3340952955171655e-06,
"loss": 0.7843,
"step": 2620
},
{
"epoch": 0.7213926924929471,
"grad_norm": 0.22841310724341327,
"learning_rate": 7.320577823238475e-06,
"loss": 0.7725,
"step": 2621
},
{
"epoch": 0.72166792816349,
"grad_norm": 0.21325490490438734,
"learning_rate": 7.307070028384782e-06,
"loss": 0.7895,
"step": 2622
},
{
"epoch": 0.7219431638340329,
"grad_norm": 0.2258875597667349,
"learning_rate": 7.293571921265765e-06,
"loss": 0.7666,
"step": 2623
},
{
"epoch": 0.7222183995045758,
"grad_norm": 0.21190356615671044,
"learning_rate": 7.280083512183678e-06,
"loss": 0.7633,
"step": 2624
},
{
"epoch": 0.7224936351751187,
"grad_norm": 0.2231753865614009,
"learning_rate": 7.266604811433424e-06,
"loss": 0.7469,
"step": 2625
},
{
"epoch": 0.7227688708456615,
"grad_norm": 0.22143692791586356,
"learning_rate": 7.253135829302451e-06,
"loss": 0.7748,
"step": 2626
},
{
"epoch": 0.7230441065162045,
"grad_norm": 0.21333224666052628,
"learning_rate": 7.239676576070809e-06,
"loss": 0.7818,
"step": 2627
},
{
"epoch": 0.7233193421867474,
"grad_norm": 0.2187465656916614,
"learning_rate": 7.2262270620111305e-06,
"loss": 0.7926,
"step": 2628
},
{
"epoch": 0.7235945778572903,
"grad_norm": 0.21542351593374082,
"learning_rate": 7.212787297388588e-06,
"loss": 0.8123,
"step": 2629
},
{
"epoch": 0.7238698135278332,
"grad_norm": 0.2182686100645093,
"learning_rate": 7.199357292460945e-06,
"loss": 0.7958,
"step": 2630
},
{
"epoch": 0.7241450491983761,
"grad_norm": 0.22336430210451583,
"learning_rate": 7.185937057478478e-06,
"loss": 0.7758,
"step": 2631
},
{
"epoch": 0.724420284868919,
"grad_norm": 0.21283687484459596,
"learning_rate": 7.172526602684058e-06,
"loss": 0.7828,
"step": 2632
},
{
"epoch": 0.724695520539462,
"grad_norm": 0.21296296324730565,
"learning_rate": 7.159125938313041e-06,
"loss": 0.78,
"step": 2633
},
{
"epoch": 0.7249707562100048,
"grad_norm": 0.2235125890136319,
"learning_rate": 7.145735074593338e-06,
"loss": 0.8013,
"step": 2634
},
{
"epoch": 0.7252459918805477,
"grad_norm": 0.22569966450039847,
"learning_rate": 7.132354021745383e-06,
"loss": 0.8054,
"step": 2635
},
{
"epoch": 0.7255212275510906,
"grad_norm": 0.22299176954010355,
"learning_rate": 7.118982789982096e-06,
"loss": 0.7813,
"step": 2636
},
{
"epoch": 0.7257964632216335,
"grad_norm": 0.21174516512179112,
"learning_rate": 7.105621389508925e-06,
"loss": 0.7489,
"step": 2637
},
{
"epoch": 0.7260716988921764,
"grad_norm": 0.2254521324842919,
"learning_rate": 7.09226983052381e-06,
"loss": 0.7875,
"step": 2638
},
{
"epoch": 0.7263469345627194,
"grad_norm": 0.22056007534564895,
"learning_rate": 7.078928123217175e-06,
"loss": 0.7938,
"step": 2639
},
{
"epoch": 0.7266221702332623,
"grad_norm": 0.2199929946297742,
"learning_rate": 7.065596277771931e-06,
"loss": 0.7815,
"step": 2640
},
{
"epoch": 0.7268974059038051,
"grad_norm": 0.20472138887987787,
"learning_rate": 7.052274304363449e-06,
"loss": 0.7776,
"step": 2641
},
{
"epoch": 0.727172641574348,
"grad_norm": 0.21545903753551834,
"learning_rate": 7.0389622131595835e-06,
"loss": 0.7738,
"step": 2642
},
{
"epoch": 0.7274478772448909,
"grad_norm": 0.21142960653804516,
"learning_rate": 7.0256600143206235e-06,
"loss": 0.7856,
"step": 2643
},
{
"epoch": 0.7277231129154338,
"grad_norm": 0.21948079817216246,
"learning_rate": 7.012367717999331e-06,
"loss": 0.7899,
"step": 2644
},
{
"epoch": 0.7279983485859768,
"grad_norm": 0.2043438503379916,
"learning_rate": 6.9990853343408986e-06,
"loss": 0.7756,
"step": 2645
},
{
"epoch": 0.7282735842565197,
"grad_norm": 0.20985830968379818,
"learning_rate": 6.985812873482953e-06,
"loss": 0.7988,
"step": 2646
},
{
"epoch": 0.7285488199270626,
"grad_norm": 0.2243795238123144,
"learning_rate": 6.97255034555556e-06,
"loss": 0.7971,
"step": 2647
},
{
"epoch": 0.7288240555976054,
"grad_norm": 0.2046682781819276,
"learning_rate": 6.959297760681176e-06,
"loss": 0.7856,
"step": 2648
},
{
"epoch": 0.7290992912681483,
"grad_norm": 0.21705682375699856,
"learning_rate": 6.946055128974694e-06,
"loss": 0.7979,
"step": 2649
},
{
"epoch": 0.7293745269386912,
"grad_norm": 0.23901909549553974,
"learning_rate": 6.932822460543409e-06,
"loss": 0.7705,
"step": 2650
},
{
"epoch": 0.7296497626092342,
"grad_norm": 0.5511118712416953,
"learning_rate": 6.919599765486993e-06,
"loss": 0.7994,
"step": 2651
},
{
"epoch": 0.7299249982797771,
"grad_norm": 0.20488173065189808,
"learning_rate": 6.906387053897523e-06,
"loss": 0.7696,
"step": 2652
},
{
"epoch": 0.73020023395032,
"grad_norm": 0.22057455829477815,
"learning_rate": 6.89318433585945e-06,
"loss": 0.7959,
"step": 2653
},
{
"epoch": 0.7304754696208628,
"grad_norm": 0.2055333890282667,
"learning_rate": 6.879991621449602e-06,
"loss": 0.7684,
"step": 2654
},
{
"epoch": 0.7307507052914057,
"grad_norm": 0.2111660825636503,
"learning_rate": 6.866808920737174e-06,
"loss": 0.73,
"step": 2655
},
{
"epoch": 0.7310259409619486,
"grad_norm": 0.2188215699005884,
"learning_rate": 6.853636243783697e-06,
"loss": 0.7733,
"step": 2656
},
{
"epoch": 0.7313011766324916,
"grad_norm": 0.2145065275849248,
"learning_rate": 6.840473600643081e-06,
"loss": 0.8002,
"step": 2657
},
{
"epoch": 0.7315764123030345,
"grad_norm": 0.23142482708949125,
"learning_rate": 6.8273210013615536e-06,
"loss": 0.7817,
"step": 2658
},
{
"epoch": 0.7318516479735774,
"grad_norm": 0.20594948595110116,
"learning_rate": 6.814178455977689e-06,
"loss": 0.8007,
"step": 2659
},
{
"epoch": 0.7321268836441203,
"grad_norm": 0.21349424936460418,
"learning_rate": 6.801045974522389e-06,
"loss": 0.7615,
"step": 2660
},
{
"epoch": 0.7324021193146631,
"grad_norm": 0.21866381173181135,
"learning_rate": 6.7879235670188705e-06,
"loss": 0.7709,
"step": 2661
},
{
"epoch": 0.732677354985206,
"grad_norm": 0.21029288711314637,
"learning_rate": 6.774811243482667e-06,
"loss": 0.7628,
"step": 2662
},
{
"epoch": 0.732952590655749,
"grad_norm": 0.2209747070892804,
"learning_rate": 6.7617090139216e-06,
"loss": 0.7752,
"step": 2663
},
{
"epoch": 0.7332278263262919,
"grad_norm": 0.23083516086026892,
"learning_rate": 6.7486168883358015e-06,
"loss": 0.7897,
"step": 2664
},
{
"epoch": 0.7335030619968348,
"grad_norm": 0.2112523377093742,
"learning_rate": 6.735534876717695e-06,
"loss": 0.7815,
"step": 2665
},
{
"epoch": 0.7337782976673777,
"grad_norm": 0.20947016260086723,
"learning_rate": 6.722462989051965e-06,
"loss": 0.788,
"step": 2666
},
{
"epoch": 0.7340535333379206,
"grad_norm": 0.22127867347340469,
"learning_rate": 6.709401235315587e-06,
"loss": 0.7916,
"step": 2667
},
{
"epoch": 0.7343287690084634,
"grad_norm": 0.2113364980957063,
"learning_rate": 6.696349625477798e-06,
"loss": 0.7914,
"step": 2668
},
{
"epoch": 0.7346040046790064,
"grad_norm": 0.21443903627483418,
"learning_rate": 6.683308169500094e-06,
"loss": 0.7866,
"step": 2669
},
{
"epoch": 0.7348792403495493,
"grad_norm": 0.22351041775050992,
"learning_rate": 6.670276877336208e-06,
"loss": 0.7639,
"step": 2670
},
{
"epoch": 0.7351544760200922,
"grad_norm": 0.21343161756436144,
"learning_rate": 6.657255758932133e-06,
"loss": 0.7593,
"step": 2671
},
{
"epoch": 0.7354297116906351,
"grad_norm": 0.206875012944444,
"learning_rate": 6.644244824226094e-06,
"loss": 0.7784,
"step": 2672
},
{
"epoch": 0.735704947361178,
"grad_norm": 0.2163725621134461,
"learning_rate": 6.631244083148525e-06,
"loss": 0.7744,
"step": 2673
},
{
"epoch": 0.7359801830317209,
"grad_norm": 0.22142145880785594,
"learning_rate": 6.618253545622104e-06,
"loss": 0.7521,
"step": 2674
},
{
"epoch": 0.7362554187022639,
"grad_norm": 0.20227419570146793,
"learning_rate": 6.60527322156171e-06,
"loss": 0.7424,
"step": 2675
},
{
"epoch": 0.7365306543728067,
"grad_norm": 0.21530029313418675,
"learning_rate": 6.592303120874428e-06,
"loss": 0.7774,
"step": 2676
},
{
"epoch": 0.7368058900433496,
"grad_norm": 0.20766952535418937,
"learning_rate": 6.579343253459545e-06,
"loss": 0.7824,
"step": 2677
},
{
"epoch": 0.7370811257138925,
"grad_norm": 0.209642890188279,
"learning_rate": 6.566393629208523e-06,
"loss": 0.7753,
"step": 2678
},
{
"epoch": 0.7373563613844354,
"grad_norm": 0.2140088731249423,
"learning_rate": 6.553454258005025e-06,
"loss": 0.7922,
"step": 2679
},
{
"epoch": 0.7376315970549783,
"grad_norm": 0.20476957127418594,
"learning_rate": 6.540525149724868e-06,
"loss": 0.7764,
"step": 2680
},
{
"epoch": 0.7379068327255213,
"grad_norm": 0.2180808817119653,
"learning_rate": 6.527606314236053e-06,
"loss": 0.8113,
"step": 2681
},
{
"epoch": 0.7381820683960642,
"grad_norm": 0.1998712377413449,
"learning_rate": 6.514697761398734e-06,
"loss": 0.7628,
"step": 2682
},
{
"epoch": 0.738457304066607,
"grad_norm": 0.22040649973499035,
"learning_rate": 6.501799501065218e-06,
"loss": 0.7783,
"step": 2683
},
{
"epoch": 0.7387325397371499,
"grad_norm": 0.2106578922619492,
"learning_rate": 6.488911543079963e-06,
"loss": 0.7874,
"step": 2684
},
{
"epoch": 0.7390077754076928,
"grad_norm": 0.20820361702320744,
"learning_rate": 6.476033897279544e-06,
"loss": 0.763,
"step": 2685
},
{
"epoch": 0.7392830110782357,
"grad_norm": 0.20952645664031386,
"learning_rate": 6.463166573492683e-06,
"loss": 0.7884,
"step": 2686
},
{
"epoch": 0.7395582467487787,
"grad_norm": 0.21486499686804741,
"learning_rate": 6.450309581540224e-06,
"loss": 0.7806,
"step": 2687
},
{
"epoch": 0.7398334824193216,
"grad_norm": 0.21459816829548498,
"learning_rate": 6.437462931235103e-06,
"loss": 0.7614,
"step": 2688
},
{
"epoch": 0.7401087180898644,
"grad_norm": 0.21430064000588245,
"learning_rate": 6.424626632382407e-06,
"loss": 0.7608,
"step": 2689
},
{
"epoch": 0.7403839537604073,
"grad_norm": 0.21700886976937256,
"learning_rate": 6.411800694779271e-06,
"loss": 0.791,
"step": 2690
},
{
"epoch": 0.7406591894309502,
"grad_norm": 0.22130148583431022,
"learning_rate": 6.398985128214959e-06,
"loss": 0.7775,
"step": 2691
},
{
"epoch": 0.7409344251014931,
"grad_norm": 0.20982250779474793,
"learning_rate": 6.386179942470807e-06,
"loss": 0.7706,
"step": 2692
},
{
"epoch": 0.7412096607720361,
"grad_norm": 0.20401306529238422,
"learning_rate": 6.373385147320219e-06,
"loss": 0.7541,
"step": 2693
},
{
"epoch": 0.741484896442579,
"grad_norm": 0.2195471330562807,
"learning_rate": 6.360600752528689e-06,
"loss": 0.7777,
"step": 2694
},
{
"epoch": 0.7417601321131219,
"grad_norm": 0.2052895874422415,
"learning_rate": 6.3478267678537396e-06,
"loss": 0.7725,
"step": 2695
},
{
"epoch": 0.7420353677836647,
"grad_norm": 0.20624667047981138,
"learning_rate": 6.335063203045e-06,
"loss": 0.7827,
"step": 2696
},
{
"epoch": 0.7423106034542076,
"grad_norm": 0.20785169992857394,
"learning_rate": 6.322310067844091e-06,
"loss": 0.7903,
"step": 2697
},
{
"epoch": 0.7425858391247505,
"grad_norm": 0.21614247749932902,
"learning_rate": 6.3095673719847106e-06,
"loss": 0.7879,
"step": 2698
},
{
"epoch": 0.7428610747952935,
"grad_norm": 0.21383956902640192,
"learning_rate": 6.296835125192578e-06,
"loss": 0.7555,
"step": 2699
},
{
"epoch": 0.7431363104658364,
"grad_norm": 0.20877126594816658,
"learning_rate": 6.284113337185425e-06,
"loss": 0.7712,
"step": 2700
},
{
"epoch": 0.7434115461363793,
"grad_norm": 0.2115766812806403,
"learning_rate": 6.271402017673021e-06,
"loss": 0.7786,
"step": 2701
},
{
"epoch": 0.7436867818069222,
"grad_norm": 0.20546818203490577,
"learning_rate": 6.258701176357132e-06,
"loss": 0.8017,
"step": 2702
},
{
"epoch": 0.743962017477465,
"grad_norm": 0.21199643879353702,
"learning_rate": 6.246010822931532e-06,
"loss": 0.7674,
"step": 2703
},
{
"epoch": 0.7442372531480079,
"grad_norm": 0.21475894347401708,
"learning_rate": 6.2333309670819965e-06,
"loss": 0.7586,
"step": 2704
},
{
"epoch": 0.7445124888185509,
"grad_norm": 0.21509543984093626,
"learning_rate": 6.220661618486268e-06,
"loss": 0.7701,
"step": 2705
},
{
"epoch": 0.7447877244890938,
"grad_norm": 0.20893566086832982,
"learning_rate": 6.208002786814098e-06,
"loss": 0.7659,
"step": 2706
},
{
"epoch": 0.7450629601596367,
"grad_norm": 0.2001635192012533,
"learning_rate": 6.195354481727181e-06,
"loss": 0.7678,
"step": 2707
},
{
"epoch": 0.7453381958301796,
"grad_norm": 0.20883481520896027,
"learning_rate": 6.182716712879198e-06,
"loss": 0.761,
"step": 2708
},
{
"epoch": 0.7456134315007225,
"grad_norm": 0.2084136942059921,
"learning_rate": 6.170089489915792e-06,
"loss": 0.7845,
"step": 2709
},
{
"epoch": 0.7458886671712655,
"grad_norm": 0.20534337376394513,
"learning_rate": 6.157472822474524e-06,
"loss": 0.7601,
"step": 2710
},
{
"epoch": 0.7461639028418083,
"grad_norm": 0.20694835285974103,
"learning_rate": 6.144866720184952e-06,
"loss": 0.7758,
"step": 2711
},
{
"epoch": 0.7464391385123512,
"grad_norm": 0.2129128193705512,
"learning_rate": 6.132271192668518e-06,
"loss": 0.7822,
"step": 2712
},
{
"epoch": 0.7467143741828941,
"grad_norm": 0.20227471703214245,
"learning_rate": 6.119686249538624e-06,
"loss": 0.8066,
"step": 2713
},
{
"epoch": 0.746989609853437,
"grad_norm": 0.209815586879814,
"learning_rate": 6.107111900400589e-06,
"loss": 0.7641,
"step": 2714
},
{
"epoch": 0.7472648455239799,
"grad_norm": 0.21229486837207356,
"learning_rate": 6.094548154851631e-06,
"loss": 0.7967,
"step": 2715
},
{
"epoch": 0.7475400811945229,
"grad_norm": 0.20809716226906377,
"learning_rate": 6.0819950224809024e-06,
"loss": 0.7831,
"step": 2716
},
{
"epoch": 0.7478153168650657,
"grad_norm": 0.21016620242804573,
"learning_rate": 6.069452512869411e-06,
"loss": 0.7676,
"step": 2717
},
{
"epoch": 0.7480905525356086,
"grad_norm": 0.20990730360216506,
"learning_rate": 6.05692063559012e-06,
"loss": 0.7694,
"step": 2718
},
{
"epoch": 0.7483657882061515,
"grad_norm": 0.19635188064107884,
"learning_rate": 6.044399400207817e-06,
"loss": 0.7628,
"step": 2719
},
{
"epoch": 0.7486410238766944,
"grad_norm": 0.21057427708628593,
"learning_rate": 6.031888816279199e-06,
"loss": 0.7869,
"step": 2720
},
{
"epoch": 0.7489162595472373,
"grad_norm": 0.20003959979288685,
"learning_rate": 6.019388893352838e-06,
"loss": 0.7362,
"step": 2721
},
{
"epoch": 0.7491914952177803,
"grad_norm": 0.20147588359078802,
"learning_rate": 6.006899640969142e-06,
"loss": 0.7621,
"step": 2722
},
{
"epoch": 0.7494667308883232,
"grad_norm": 0.21665705345996358,
"learning_rate": 5.994421068660396e-06,
"loss": 0.7796,
"step": 2723
},
{
"epoch": 0.749741966558866,
"grad_norm": 0.212541364525579,
"learning_rate": 5.981953185950735e-06,
"loss": 0.7539,
"step": 2724
},
{
"epoch": 0.7500172022294089,
"grad_norm": 0.2031858588635556,
"learning_rate": 5.969496002356121e-06,
"loss": 0.7842,
"step": 2725
},
{
"epoch": 0.7502924378999518,
"grad_norm": 0.20295736464712008,
"learning_rate": 5.9570495273843705e-06,
"loss": 0.7579,
"step": 2726
},
{
"epoch": 0.7505676735704947,
"grad_norm": 0.21978493607175753,
"learning_rate": 5.944613770535099e-06,
"loss": 0.7839,
"step": 2727
},
{
"epoch": 0.7508429092410377,
"grad_norm": 0.196979851348305,
"learning_rate": 5.9321887412997695e-06,
"loss": 0.7824,
"step": 2728
},
{
"epoch": 0.7511181449115806,
"grad_norm": 0.20890745311280653,
"learning_rate": 5.91977444916163e-06,
"loss": 0.7364,
"step": 2729
},
{
"epoch": 0.7513933805821235,
"grad_norm": 0.2172243799655082,
"learning_rate": 5.907370903595757e-06,
"loss": 0.7797,
"step": 2730
},
{
"epoch": 0.7516686162526663,
"grad_norm": 0.20076579224891278,
"learning_rate": 5.8949781140690166e-06,
"loss": 0.7674,
"step": 2731
},
{
"epoch": 0.7519438519232092,
"grad_norm": 0.20004532117901183,
"learning_rate": 5.882596090040061e-06,
"loss": 0.7473,
"step": 2732
},
{
"epoch": 0.7522190875937521,
"grad_norm": 0.21370352536628204,
"learning_rate": 5.87022484095934e-06,
"loss": 0.7812,
"step": 2733
},
{
"epoch": 0.7524943232642951,
"grad_norm": 0.20146215525752867,
"learning_rate": 5.857864376269051e-06,
"loss": 0.7721,
"step": 2734
},
{
"epoch": 0.752769558934838,
"grad_norm": 0.20877758112932118,
"learning_rate": 5.84551470540319e-06,
"loss": 0.8085,
"step": 2735
},
{
"epoch": 0.7530447946053809,
"grad_norm": 0.21114667619502187,
"learning_rate": 5.833175837787506e-06,
"loss": 0.7746,
"step": 2736
},
{
"epoch": 0.7533200302759238,
"grad_norm": 0.21414604914230712,
"learning_rate": 5.820847782839489e-06,
"loss": 0.7854,
"step": 2737
},
{
"epoch": 0.7535952659464666,
"grad_norm": 0.1981076499949966,
"learning_rate": 5.808530549968392e-06,
"loss": 0.7545,
"step": 2738
},
{
"epoch": 0.7538705016170095,
"grad_norm": 0.21137767766334561,
"learning_rate": 5.796224148575203e-06,
"loss": 0.7645,
"step": 2739
},
{
"epoch": 0.7541457372875525,
"grad_norm": 0.21277508850489377,
"learning_rate": 5.783928588052643e-06,
"loss": 0.7659,
"step": 2740
},
{
"epoch": 0.7544209729580954,
"grad_norm": 0.21471107041164525,
"learning_rate": 5.771643877785167e-06,
"loss": 0.7639,
"step": 2741
},
{
"epoch": 0.7546962086286383,
"grad_norm": 0.21141065538197631,
"learning_rate": 5.759370027148925e-06,
"loss": 0.7552,
"step": 2742
},
{
"epoch": 0.7549714442991812,
"grad_norm": 0.20970651925923653,
"learning_rate": 5.747107045511811e-06,
"loss": 0.7623,
"step": 2743
},
{
"epoch": 0.755246679969724,
"grad_norm": 0.216780701781582,
"learning_rate": 5.73485494223339e-06,
"loss": 0.7896,
"step": 2744
},
{
"epoch": 0.7555219156402669,
"grad_norm": 0.21359290287749802,
"learning_rate": 5.72261372666495e-06,
"loss": 0.7625,
"step": 2745
},
{
"epoch": 0.7557971513108099,
"grad_norm": 0.19829604785715904,
"learning_rate": 5.710383408149456e-06,
"loss": 0.7759,
"step": 2746
},
{
"epoch": 0.7560723869813528,
"grad_norm": 0.21472421287741664,
"learning_rate": 5.698163996021564e-06,
"loss": 0.8087,
"step": 2747
},
{
"epoch": 0.7563476226518957,
"grad_norm": 0.20762047223825586,
"learning_rate": 5.685955499607605e-06,
"loss": 0.7726,
"step": 2748
},
{
"epoch": 0.7566228583224386,
"grad_norm": 0.19779265094083542,
"learning_rate": 5.673757928225563e-06,
"loss": 0.7658,
"step": 2749
},
{
"epoch": 0.7568980939929815,
"grad_norm": 0.20656416297964883,
"learning_rate": 5.6615712911851016e-06,
"loss": 0.7932,
"step": 2750
},
{
"epoch": 0.7571733296635244,
"grad_norm": 0.31567168953732694,
"learning_rate": 5.649395597787544e-06,
"loss": 0.7724,
"step": 2751
},
{
"epoch": 0.7574485653340673,
"grad_norm": 0.2018029260095364,
"learning_rate": 5.6372308573258235e-06,
"loss": 0.772,
"step": 2752
},
{
"epoch": 0.7577238010046102,
"grad_norm": 0.20563668646675162,
"learning_rate": 5.625077079084571e-06,
"loss": 0.7657,
"step": 2753
},
{
"epoch": 0.7579990366751531,
"grad_norm": 0.21391863965197877,
"learning_rate": 5.612934272340001e-06,
"loss": 0.7785,
"step": 2754
},
{
"epoch": 0.758274272345696,
"grad_norm": 0.2073258224296366,
"learning_rate": 5.600802446359981e-06,
"loss": 0.7583,
"step": 2755
},
{
"epoch": 0.7585495080162389,
"grad_norm": 0.2097306152223069,
"learning_rate": 5.588681610403978e-06,
"loss": 0.7875,
"step": 2756
},
{
"epoch": 0.7588247436867818,
"grad_norm": 0.20085077935674445,
"learning_rate": 5.576571773723094e-06,
"loss": 0.7572,
"step": 2757
},
{
"epoch": 0.7590999793573248,
"grad_norm": 0.21460856643656978,
"learning_rate": 5.5644729455600246e-06,
"loss": 0.7873,
"step": 2758
},
{
"epoch": 0.7593752150278676,
"grad_norm": 0.21354432137006993,
"learning_rate": 5.552385135149048e-06,
"loss": 0.769,
"step": 2759
},
{
"epoch": 0.7596504506984105,
"grad_norm": 0.20477988532515246,
"learning_rate": 5.5403083517160686e-06,
"loss": 0.7844,
"step": 2760
},
{
"epoch": 0.7599256863689534,
"grad_norm": 0.20760878856343412,
"learning_rate": 5.5282426044785396e-06,
"loss": 0.765,
"step": 2761
},
{
"epoch": 0.7602009220394963,
"grad_norm": 0.21180288595410768,
"learning_rate": 5.516187902645511e-06,
"loss": 0.7427,
"step": 2762
},
{
"epoch": 0.7604761577100392,
"grad_norm": 0.21179482853742132,
"learning_rate": 5.504144255417605e-06,
"loss": 0.7859,
"step": 2763
},
{
"epoch": 0.7607513933805822,
"grad_norm": 0.20430100175741778,
"learning_rate": 5.492111671986981e-06,
"loss": 0.7817,
"step": 2764
},
{
"epoch": 0.7610266290511251,
"grad_norm": 0.20493524906648022,
"learning_rate": 5.480090161537388e-06,
"loss": 0.7757,
"step": 2765
},
{
"epoch": 0.7613018647216679,
"grad_norm": 0.2062615698229132,
"learning_rate": 5.468079733244096e-06,
"loss": 0.7554,
"step": 2766
},
{
"epoch": 0.7615771003922108,
"grad_norm": 0.20852437361425066,
"learning_rate": 5.45608039627393e-06,
"loss": 0.8011,
"step": 2767
},
{
"epoch": 0.7618523360627537,
"grad_norm": 0.19612811778492137,
"learning_rate": 5.444092159785252e-06,
"loss": 0.8036,
"step": 2768
},
{
"epoch": 0.7621275717332966,
"grad_norm": 0.20814841495296343,
"learning_rate": 5.4321150329279444e-06,
"loss": 0.7653,
"step": 2769
},
{
"epoch": 0.7624028074038396,
"grad_norm": 0.2067981257009054,
"learning_rate": 5.420149024843422e-06,
"loss": 0.7601,
"step": 2770
},
{
"epoch": 0.7626780430743825,
"grad_norm": 0.19340169480293778,
"learning_rate": 5.408194144664589e-06,
"loss": 0.7786,
"step": 2771
},
{
"epoch": 0.7629532787449254,
"grad_norm": 0.20213918063352884,
"learning_rate": 5.396250401515879e-06,
"loss": 0.7573,
"step": 2772
},
{
"epoch": 0.7632285144154682,
"grad_norm": 0.36101439044528943,
"learning_rate": 5.384317804513226e-06,
"loss": 0.7686,
"step": 2773
},
{
"epoch": 0.7635037500860111,
"grad_norm": 0.20121244805137242,
"learning_rate": 5.372396362764032e-06,
"loss": 0.7482,
"step": 2774
},
{
"epoch": 0.763778985756554,
"grad_norm": 0.21229294525049264,
"learning_rate": 5.360486085367223e-06,
"loss": 0.7727,
"step": 2775
},
{
"epoch": 0.764054221427097,
"grad_norm": 0.21460844358425726,
"learning_rate": 5.348586981413167e-06,
"loss": 0.7431,
"step": 2776
},
{
"epoch": 0.7643294570976399,
"grad_norm": 0.20677375955687788,
"learning_rate": 5.33669905998373e-06,
"loss": 0.766,
"step": 2777
},
{
"epoch": 0.7646046927681828,
"grad_norm": 0.20298605650792526,
"learning_rate": 5.324822330152224e-06,
"loss": 0.7729,
"step": 2778
},
{
"epoch": 0.7648799284387257,
"grad_norm": 0.21364603158298678,
"learning_rate": 5.312956800983431e-06,
"loss": 0.7824,
"step": 2779
},
{
"epoch": 0.7651551641092685,
"grad_norm": 0.1980306269101776,
"learning_rate": 5.301102481533588e-06,
"loss": 0.7663,
"step": 2780
},
{
"epoch": 0.7654303997798114,
"grad_norm": 0.21113212712739332,
"learning_rate": 5.289259380850356e-06,
"loss": 0.7536,
"step": 2781
},
{
"epoch": 0.7657056354503544,
"grad_norm": 0.22120168726052686,
"learning_rate": 5.277427507972865e-06,
"loss": 0.8017,
"step": 2782
},
{
"epoch": 0.7659808711208973,
"grad_norm": 0.1991296610849373,
"learning_rate": 5.265606871931646e-06,
"loss": 0.7809,
"step": 2783
},
{
"epoch": 0.7662561067914402,
"grad_norm": 0.205851615356833,
"learning_rate": 5.253797481748664e-06,
"loss": 0.728,
"step": 2784
},
{
"epoch": 0.7665313424619831,
"grad_norm": 0.19935705085680255,
"learning_rate": 5.241999346437312e-06,
"loss": 0.7752,
"step": 2785
},
{
"epoch": 0.766806578132526,
"grad_norm": 0.20025888764342273,
"learning_rate": 5.230212475002372e-06,
"loss": 0.7748,
"step": 2786
},
{
"epoch": 0.7670818138030688,
"grad_norm": 0.2035110499205305,
"learning_rate": 5.218436876440043e-06,
"loss": 0.7666,
"step": 2787
},
{
"epoch": 0.7673570494736118,
"grad_norm": 0.19652921691991823,
"learning_rate": 5.206672559737918e-06,
"loss": 0.7605,
"step": 2788
},
{
"epoch": 0.7676322851441547,
"grad_norm": 0.2000057952894092,
"learning_rate": 5.194919533874978e-06,
"loss": 0.7761,
"step": 2789
},
{
"epoch": 0.7679075208146976,
"grad_norm": 0.20458672467871827,
"learning_rate": 5.1831778078215934e-06,
"loss": 0.7969,
"step": 2790
},
{
"epoch": 0.7681827564852405,
"grad_norm": 0.19592344239650988,
"learning_rate": 5.17144739053949e-06,
"loss": 0.7656,
"step": 2791
},
{
"epoch": 0.7684579921557834,
"grad_norm": 0.20069305957708425,
"learning_rate": 5.159728290981789e-06,
"loss": 0.7448,
"step": 2792
},
{
"epoch": 0.7687332278263262,
"grad_norm": 0.19691865493062355,
"learning_rate": 5.148020518092946e-06,
"loss": 0.7464,
"step": 2793
},
{
"epoch": 0.7690084634968692,
"grad_norm": 0.20856646507200627,
"learning_rate": 5.136324080808794e-06,
"loss": 0.7527,
"step": 2794
},
{
"epoch": 0.7692836991674121,
"grad_norm": 0.20139881996909267,
"learning_rate": 5.124638988056505e-06,
"loss": 0.7661,
"step": 2795
},
{
"epoch": 0.769558934837955,
"grad_norm": 0.19980265147715442,
"learning_rate": 5.112965248754593e-06,
"loss": 0.7623,
"step": 2796
},
{
"epoch": 0.7698341705084979,
"grad_norm": 0.2033409444923468,
"learning_rate": 5.1013028718129125e-06,
"loss": 0.7898,
"step": 2797
},
{
"epoch": 0.7701094061790408,
"grad_norm": 0.2115439104758614,
"learning_rate": 5.08965186613263e-06,
"loss": 0.7751,
"step": 2798
},
{
"epoch": 0.7703846418495837,
"grad_norm": 0.2014661531570698,
"learning_rate": 5.078012240606247e-06,
"loss": 0.7648,
"step": 2799
},
{
"epoch": 0.7706598775201267,
"grad_norm": 0.201704932722407,
"learning_rate": 5.066384004117584e-06,
"loss": 0.7782,
"step": 2800
},
{
"epoch": 0.7709351131906695,
"grad_norm": 0.2664404059344981,
"learning_rate": 5.0547671655417475e-06,
"loss": 0.7784,
"step": 2801
},
{
"epoch": 0.7712103488612124,
"grad_norm": 0.20034439907179086,
"learning_rate": 5.043161733745163e-06,
"loss": 0.7673,
"step": 2802
},
{
"epoch": 0.7714855845317553,
"grad_norm": 0.20392548465807855,
"learning_rate": 5.031567717585544e-06,
"loss": 0.7664,
"step": 2803
},
{
"epoch": 0.7717608202022982,
"grad_norm": 0.20245352307159437,
"learning_rate": 5.019985125911899e-06,
"loss": 0.7615,
"step": 2804
},
{
"epoch": 0.7720360558728411,
"grad_norm": 0.20197079632256648,
"learning_rate": 5.008413967564496e-06,
"loss": 0.7762,
"step": 2805
},
{
"epoch": 0.7723112915433841,
"grad_norm": 0.19548801964183182,
"learning_rate": 4.996854251374901e-06,
"loss": 0.7698,
"step": 2806
},
{
"epoch": 0.772586527213927,
"grad_norm": 0.20770427627275576,
"learning_rate": 4.985305986165934e-06,
"loss": 0.7576,
"step": 2807
},
{
"epoch": 0.7728617628844698,
"grad_norm": 0.20117380867737134,
"learning_rate": 4.973769180751673e-06,
"loss": 0.7814,
"step": 2808
},
{
"epoch": 0.7731369985550127,
"grad_norm": 0.20374138518222956,
"learning_rate": 4.962243843937455e-06,
"loss": 0.7478,
"step": 2809
},
{
"epoch": 0.7734122342255556,
"grad_norm": 0.21250544302689162,
"learning_rate": 4.950729984519864e-06,
"loss": 0.7753,
"step": 2810
},
{
"epoch": 0.7736874698960985,
"grad_norm": 0.2083913435747433,
"learning_rate": 4.939227611286724e-06,
"loss": 0.7653,
"step": 2811
},
{
"epoch": 0.7739627055666415,
"grad_norm": 0.23397763953285183,
"learning_rate": 4.927736733017092e-06,
"loss": 0.7671,
"step": 2812
},
{
"epoch": 0.7742379412371844,
"grad_norm": 0.2072476530387938,
"learning_rate": 4.916257358481245e-06,
"loss": 0.7971,
"step": 2813
},
{
"epoch": 0.7745131769077273,
"grad_norm": 0.2090302491655131,
"learning_rate": 4.904789496440692e-06,
"loss": 0.758,
"step": 2814
},
{
"epoch": 0.7747884125782701,
"grad_norm": 0.2198219751580214,
"learning_rate": 4.893333155648136e-06,
"loss": 0.7874,
"step": 2815
},
{
"epoch": 0.775063648248813,
"grad_norm": 0.20189635971867195,
"learning_rate": 4.881888344847512e-06,
"loss": 0.7698,
"step": 2816
},
{
"epoch": 0.7753388839193559,
"grad_norm": 0.20717508735363308,
"learning_rate": 4.870455072773934e-06,
"loss": 0.7793,
"step": 2817
},
{
"epoch": 0.7756141195898989,
"grad_norm": 0.21574722718069833,
"learning_rate": 4.859033348153721e-06,
"loss": 0.8037,
"step": 2818
},
{
"epoch": 0.7758893552604418,
"grad_norm": 0.20852967761604516,
"learning_rate": 4.847623179704379e-06,
"loss": 0.7787,
"step": 2819
},
{
"epoch": 0.7761645909309847,
"grad_norm": 0.20298686271435418,
"learning_rate": 4.836224576134581e-06,
"loss": 0.7673,
"step": 2820
},
{
"epoch": 0.7764398266015275,
"grad_norm": 0.20542244356161593,
"learning_rate": 4.824837546144183e-06,
"loss": 0.7814,
"step": 2821
},
{
"epoch": 0.7767150622720704,
"grad_norm": 0.20741189143890504,
"learning_rate": 4.813462098424213e-06,
"loss": 0.7466,
"step": 2822
},
{
"epoch": 0.7769902979426133,
"grad_norm": 0.2196502589426676,
"learning_rate": 4.802098241656845e-06,
"loss": 0.7874,
"step": 2823
},
{
"epoch": 0.7772655336131563,
"grad_norm": 0.20375783521133,
"learning_rate": 4.790745984515415e-06,
"loss": 0.7645,
"step": 2824
},
{
"epoch": 0.7775407692836992,
"grad_norm": 0.19447425806545415,
"learning_rate": 4.779405335664404e-06,
"loss": 0.7414,
"step": 2825
},
{
"epoch": 0.7778160049542421,
"grad_norm": 0.20846971348585894,
"learning_rate": 4.7680763037594364e-06,
"loss": 0.7748,
"step": 2826
},
{
"epoch": 0.778091240624785,
"grad_norm": 0.20256123680867555,
"learning_rate": 4.7567588974472734e-06,
"loss": 0.7961,
"step": 2827
},
{
"epoch": 0.7783664762953278,
"grad_norm": 0.19768646062168152,
"learning_rate": 4.745453125365782e-06,
"loss": 0.774,
"step": 2828
},
{
"epoch": 0.7786417119658707,
"grad_norm": 0.20804450589556203,
"learning_rate": 4.734158996143978e-06,
"loss": 0.7688,
"step": 2829
},
{
"epoch": 0.7789169476364137,
"grad_norm": 0.20764714967614814,
"learning_rate": 4.7228765184019644e-06,
"loss": 0.7705,
"step": 2830
},
{
"epoch": 0.7791921833069566,
"grad_norm": 0.2730353583813722,
"learning_rate": 4.711605700750972e-06,
"loss": 0.7574,
"step": 2831
},
{
"epoch": 0.7794674189774995,
"grad_norm": 0.19959559518050152,
"learning_rate": 4.700346551793322e-06,
"loss": 0.7662,
"step": 2832
},
{
"epoch": 0.7797426546480424,
"grad_norm": 0.2036881632392109,
"learning_rate": 4.689099080122434e-06,
"loss": 0.7715,
"step": 2833
},
{
"epoch": 0.7800178903185853,
"grad_norm": 0.2088481445737849,
"learning_rate": 4.67786329432282e-06,
"loss": 0.7939,
"step": 2834
},
{
"epoch": 0.7802931259891281,
"grad_norm": 0.20050090525126577,
"learning_rate": 4.666639202970049e-06,
"loss": 0.7752,
"step": 2835
},
{
"epoch": 0.7805683616596711,
"grad_norm": 0.20564022972980098,
"learning_rate": 4.655426814630793e-06,
"loss": 0.7887,
"step": 2836
},
{
"epoch": 0.780843597330214,
"grad_norm": 0.20903195163303726,
"learning_rate": 4.644226137862782e-06,
"loss": 0.7685,
"step": 2837
},
{
"epoch": 0.7811188330007569,
"grad_norm": 0.1984022960436632,
"learning_rate": 4.63303718121479e-06,
"loss": 0.7549,
"step": 2838
},
{
"epoch": 0.7813940686712998,
"grad_norm": 0.20057372327640952,
"learning_rate": 4.621859953226682e-06,
"loss": 0.7885,
"step": 2839
},
{
"epoch": 0.7816693043418427,
"grad_norm": 0.1994920525224683,
"learning_rate": 4.610694462429337e-06,
"loss": 0.7365,
"step": 2840
},
{
"epoch": 0.7819445400123856,
"grad_norm": 0.20514206637741078,
"learning_rate": 4.599540717344695e-06,
"loss": 0.7638,
"step": 2841
},
{
"epoch": 0.7822197756829286,
"grad_norm": 0.20543267077008986,
"learning_rate": 4.588398726485719e-06,
"loss": 0.75,
"step": 2842
},
{
"epoch": 0.7824950113534714,
"grad_norm": 0.2030935113631456,
"learning_rate": 4.577268498356411e-06,
"loss": 0.7855,
"step": 2843
},
{
"epoch": 0.7827702470240143,
"grad_norm": 0.20641867697581046,
"learning_rate": 4.5661500414517955e-06,
"loss": 0.777,
"step": 2844
},
{
"epoch": 0.7830454826945572,
"grad_norm": 0.20495105897333385,
"learning_rate": 4.555043364257894e-06,
"loss": 0.7742,
"step": 2845
},
{
"epoch": 0.7833207183651001,
"grad_norm": 0.1941721815018396,
"learning_rate": 4.543948475251772e-06,
"loss": 0.7553,
"step": 2846
},
{
"epoch": 0.783595954035643,
"grad_norm": 0.20803754750016493,
"learning_rate": 4.532865382901461e-06,
"loss": 0.7842,
"step": 2847
},
{
"epoch": 0.783871189706186,
"grad_norm": 0.20833329311102658,
"learning_rate": 4.521794095666013e-06,
"loss": 0.7815,
"step": 2848
},
{
"epoch": 0.7841464253767289,
"grad_norm": 0.1995661810791607,
"learning_rate": 4.510734621995465e-06,
"loss": 0.7895,
"step": 2849
},
{
"epoch": 0.7844216610472717,
"grad_norm": 0.20743474785424687,
"learning_rate": 4.499686970330825e-06,
"loss": 0.7634,
"step": 2850
},
{
"epoch": 0.7846968967178146,
"grad_norm": 0.20061320673242355,
"learning_rate": 4.4886511491041e-06,
"loss": 0.7564,
"step": 2851
},
{
"epoch": 0.7849721323883575,
"grad_norm": 0.19742642178470157,
"learning_rate": 4.4776271667382364e-06,
"loss": 0.7537,
"step": 2852
},
{
"epoch": 0.7852473680589004,
"grad_norm": 0.303209575871292,
"learning_rate": 4.466615031647188e-06,
"loss": 0.7715,
"step": 2853
},
{
"epoch": 0.7855226037294434,
"grad_norm": 0.19353483675849117,
"learning_rate": 4.455614752235824e-06,
"loss": 0.7783,
"step": 2854
},
{
"epoch": 0.7857978393999863,
"grad_norm": 0.20233109926630172,
"learning_rate": 4.4446263368999865e-06,
"loss": 0.7697,
"step": 2855
},
{
"epoch": 0.7860730750705291,
"grad_norm": 0.25814347319127223,
"learning_rate": 4.433649794026467e-06,
"loss": 0.7488,
"step": 2856
},
{
"epoch": 0.786348310741072,
"grad_norm": 0.2027664849587621,
"learning_rate": 4.422685131992975e-06,
"loss": 0.777,
"step": 2857
},
{
"epoch": 0.7866235464116149,
"grad_norm": 0.2075529363301236,
"learning_rate": 4.411732359168168e-06,
"loss": 0.8007,
"step": 2858
},
{
"epoch": 0.7868987820821578,
"grad_norm": 0.2069726966220343,
"learning_rate": 4.40079148391163e-06,
"loss": 0.7592,
"step": 2859
},
{
"epoch": 0.7871740177527008,
"grad_norm": 0.19377565222016482,
"learning_rate": 4.3898625145738575e-06,
"loss": 0.7657,
"step": 2860
},
{
"epoch": 0.7874492534232437,
"grad_norm": 0.19292774395307385,
"learning_rate": 4.378945459496264e-06,
"loss": 0.7572,
"step": 2861
},
{
"epoch": 0.7877244890937866,
"grad_norm": 0.1927745991170634,
"learning_rate": 4.3680403270111645e-06,
"loss": 0.7365,
"step": 2862
},
{
"epoch": 0.7879997247643294,
"grad_norm": 0.19572380321966792,
"learning_rate": 4.357147125441783e-06,
"loss": 0.7647,
"step": 2863
},
{
"epoch": 0.7882749604348723,
"grad_norm": 0.20637964893616226,
"learning_rate": 4.346265863102221e-06,
"loss": 0.7365,
"step": 2864
},
{
"epoch": 0.7885501961054152,
"grad_norm": 0.1971231960174484,
"learning_rate": 4.335396548297485e-06,
"loss": 0.7513,
"step": 2865
},
{
"epoch": 0.7888254317759582,
"grad_norm": 0.1929257926222743,
"learning_rate": 4.324539189323458e-06,
"loss": 0.747,
"step": 2866
},
{
"epoch": 0.7891006674465011,
"grad_norm": 0.2525761325444834,
"learning_rate": 4.313693794466893e-06,
"loss": 0.7486,
"step": 2867
},
{
"epoch": 0.789375903117044,
"grad_norm": 0.22952195434899925,
"learning_rate": 4.302860372005422e-06,
"loss": 0.7766,
"step": 2868
},
{
"epoch": 0.7896511387875869,
"grad_norm": 0.2016058593886603,
"learning_rate": 4.292038930207518e-06,
"loss": 0.7764,
"step": 2869
},
{
"epoch": 0.7899263744581297,
"grad_norm": 0.2038852986604692,
"learning_rate": 4.281229477332534e-06,
"loss": 0.7685,
"step": 2870
},
{
"epoch": 0.7902016101286726,
"grad_norm": 0.20278325720176432,
"learning_rate": 4.270432021630662e-06,
"loss": 0.7638,
"step": 2871
},
{
"epoch": 0.7904768457992156,
"grad_norm": 0.19698233401664667,
"learning_rate": 4.25964657134293e-06,
"loss": 0.7851,
"step": 2872
},
{
"epoch": 0.7907520814697585,
"grad_norm": 0.20035466893421386,
"learning_rate": 4.248873134701215e-06,
"loss": 0.7702,
"step": 2873
},
{
"epoch": 0.7910273171403014,
"grad_norm": 0.19584400606937383,
"learning_rate": 4.238111719928219e-06,
"loss": 0.7739,
"step": 2874
},
{
"epoch": 0.7913025528108443,
"grad_norm": 0.21207906139692,
"learning_rate": 4.227362335237472e-06,
"loss": 0.7425,
"step": 2875
},
{
"epoch": 0.7915777884813872,
"grad_norm": 0.21151286179926834,
"learning_rate": 4.216624988833326e-06,
"loss": 0.8108,
"step": 2876
},
{
"epoch": 0.79185302415193,
"grad_norm": 0.2584595519581787,
"learning_rate": 4.205899688910924e-06,
"loss": 0.7767,
"step": 2877
},
{
"epoch": 0.792128259822473,
"grad_norm": 0.2022452169325136,
"learning_rate": 4.195186443656241e-06,
"loss": 0.7623,
"step": 2878
},
{
"epoch": 0.7924034954930159,
"grad_norm": 0.20441117139199405,
"learning_rate": 4.184485261246032e-06,
"loss": 0.7968,
"step": 2879
},
{
"epoch": 0.7926787311635588,
"grad_norm": 0.2063763328636017,
"learning_rate": 4.1737961498478555e-06,
"loss": 0.7875,
"step": 2880
},
{
"epoch": 0.7929539668341017,
"grad_norm": 0.19925364923707437,
"learning_rate": 4.163119117620056e-06,
"loss": 0.7842,
"step": 2881
},
{
"epoch": 0.7932292025046446,
"grad_norm": 0.20247120914161668,
"learning_rate": 4.152454172711755e-06,
"loss": 0.7758,
"step": 2882
},
{
"epoch": 0.7935044381751875,
"grad_norm": 0.21223059537589548,
"learning_rate": 4.141801323262858e-06,
"loss": 0.7941,
"step": 2883
},
{
"epoch": 0.7937796738457304,
"grad_norm": 0.19199658544560622,
"learning_rate": 4.131160577404021e-06,
"loss": 0.7798,
"step": 2884
},
{
"epoch": 0.7940549095162733,
"grad_norm": 0.20041257542187746,
"learning_rate": 4.120531943256676e-06,
"loss": 0.7664,
"step": 2885
},
{
"epoch": 0.7943301451868162,
"grad_norm": 0.20165733492992646,
"learning_rate": 4.1099154289330134e-06,
"loss": 0.7962,
"step": 2886
},
{
"epoch": 0.7946053808573591,
"grad_norm": 0.20314002376987073,
"learning_rate": 4.099311042535956e-06,
"loss": 0.7696,
"step": 2887
},
{
"epoch": 0.794880616527902,
"grad_norm": 0.20175323515167573,
"learning_rate": 4.08871879215919e-06,
"loss": 0.749,
"step": 2888
},
{
"epoch": 0.7951558521984449,
"grad_norm": 0.1912925297454833,
"learning_rate": 4.078138685887125e-06,
"loss": 0.7773,
"step": 2889
},
{
"epoch": 0.7954310878689879,
"grad_norm": 0.19981498598106223,
"learning_rate": 4.067570731794915e-06,
"loss": 0.7435,
"step": 2890
},
{
"epoch": 0.7957063235395307,
"grad_norm": 0.2824001525870759,
"learning_rate": 4.05701493794842e-06,
"loss": 0.7497,
"step": 2891
},
{
"epoch": 0.7959815592100736,
"grad_norm": 0.19586364528959677,
"learning_rate": 4.0464713124042366e-06,
"loss": 0.7549,
"step": 2892
},
{
"epoch": 0.7962567948806165,
"grad_norm": 0.21028430684116986,
"learning_rate": 4.03593986320967e-06,
"loss": 0.7681,
"step": 2893
},
{
"epoch": 0.7965320305511594,
"grad_norm": 0.2153353658282543,
"learning_rate": 4.025420598402721e-06,
"loss": 0.7827,
"step": 2894
},
{
"epoch": 0.7968072662217023,
"grad_norm": 0.1980078731555791,
"learning_rate": 4.014913526012103e-06,
"loss": 0.763,
"step": 2895
},
{
"epoch": 0.7970825018922453,
"grad_norm": 0.19616826789355,
"learning_rate": 4.004418654057218e-06,
"loss": 0.7448,
"step": 2896
},
{
"epoch": 0.7973577375627882,
"grad_norm": 0.22560287627183578,
"learning_rate": 3.993935990548161e-06,
"loss": 0.7554,
"step": 2897
},
{
"epoch": 0.797632973233331,
"grad_norm": 0.20765214829922649,
"learning_rate": 3.983465543485709e-06,
"loss": 0.7949,
"step": 2898
},
{
"epoch": 0.7979082089038739,
"grad_norm": 0.2036517887543124,
"learning_rate": 3.973007320861304e-06,
"loss": 0.7781,
"step": 2899
},
{
"epoch": 0.7981834445744168,
"grad_norm": 0.2004734367854516,
"learning_rate": 3.962561330657073e-06,
"loss": 0.7555,
"step": 2900
},
{
"epoch": 0.7984586802449597,
"grad_norm": 0.199398547264568,
"learning_rate": 3.952127580845791e-06,
"loss": 0.7622,
"step": 2901
},
{
"epoch": 0.7987339159155027,
"grad_norm": 0.20187340908690163,
"learning_rate": 3.941706079390897e-06,
"loss": 0.7719,
"step": 2902
},
{
"epoch": 0.7990091515860456,
"grad_norm": 0.19831027232711532,
"learning_rate": 3.931296834246501e-06,
"loss": 0.767,
"step": 2903
},
{
"epoch": 0.7992843872565885,
"grad_norm": 0.20748317754463497,
"learning_rate": 3.920899853357325e-06,
"loss": 0.7584,
"step": 2904
},
{
"epoch": 0.7995596229271313,
"grad_norm": 0.19223097328129718,
"learning_rate": 3.910515144658758e-06,
"loss": 0.7867,
"step": 2905
},
{
"epoch": 0.7998348585976742,
"grad_norm": 0.20307266762815543,
"learning_rate": 3.9001427160768e-06,
"loss": 0.769,
"step": 2906
},
{
"epoch": 0.8001100942682171,
"grad_norm": 0.5339420397855794,
"learning_rate": 3.889782575528094e-06,
"loss": 0.7565,
"step": 2907
},
{
"epoch": 0.8003853299387601,
"grad_norm": 0.20344106716606247,
"learning_rate": 3.879434730919904e-06,
"loss": 0.7786,
"step": 2908
},
{
"epoch": 0.800660565609303,
"grad_norm": 0.20038922402801615,
"learning_rate": 3.86909919015009e-06,
"loss": 0.7768,
"step": 2909
},
{
"epoch": 0.8009358012798459,
"grad_norm": 0.19495880254516534,
"learning_rate": 3.858775961107157e-06,
"loss": 0.7799,
"step": 2910
},
{
"epoch": 0.8012110369503888,
"grad_norm": 0.19617601320723022,
"learning_rate": 3.8484650516701784e-06,
"loss": 0.7875,
"step": 2911
},
{
"epoch": 0.8014862726209316,
"grad_norm": 0.1920851553900602,
"learning_rate": 3.838166469708844e-06,
"loss": 0.7735,
"step": 2912
},
{
"epoch": 0.8017615082914745,
"grad_norm": 0.20857451692256856,
"learning_rate": 3.827880223083431e-06,
"loss": 0.7998,
"step": 2913
},
{
"epoch": 0.8020367439620175,
"grad_norm": 0.19636130182099734,
"learning_rate": 3.817606319644793e-06,
"loss": 0.7681,
"step": 2914
},
{
"epoch": 0.8023119796325604,
"grad_norm": 0.2007209095200276,
"learning_rate": 3.8073447672343798e-06,
"loss": 0.7863,
"step": 2915
},
{
"epoch": 0.8025872153031033,
"grad_norm": 0.2011422358942804,
"learning_rate": 3.7970955736841887e-06,
"loss": 0.7454,
"step": 2916
},
{
"epoch": 0.8028624509736462,
"grad_norm": 0.20542209496523348,
"learning_rate": 3.7868587468168216e-06,
"loss": 0.7501,
"step": 2917
},
{
"epoch": 0.803137686644189,
"grad_norm": 0.20360489944609322,
"learning_rate": 3.7766342944454047e-06,
"loss": 0.7949,
"step": 2918
},
{
"epoch": 0.8034129223147319,
"grad_norm": 0.19787382286866595,
"learning_rate": 3.7664222243736404e-06,
"loss": 0.7631,
"step": 2919
},
{
"epoch": 0.8036881579852749,
"grad_norm": 0.19776558419990134,
"learning_rate": 3.75622254439578e-06,
"loss": 0.7485,
"step": 2920
},
{
"epoch": 0.8039633936558178,
"grad_norm": 0.2054346946568972,
"learning_rate": 3.7460352622966034e-06,
"loss": 0.7716,
"step": 2921
},
{
"epoch": 0.8042386293263607,
"grad_norm": 0.20142581338538534,
"learning_rate": 3.735860385851444e-06,
"loss": 0.7834,
"step": 2922
},
{
"epoch": 0.8045138649969036,
"grad_norm": 0.1999942983586885,
"learning_rate": 3.725697922826166e-06,
"loss": 0.7574,
"step": 2923
},
{
"epoch": 0.8047891006674465,
"grad_norm": 0.20633088448915526,
"learning_rate": 3.715547880977135e-06,
"loss": 0.7621,
"step": 2924
},
{
"epoch": 0.8050643363379894,
"grad_norm": 0.19525380005448217,
"learning_rate": 3.7054102680512795e-06,
"loss": 0.7787,
"step": 2925
},
{
"epoch": 0.8053395720085323,
"grad_norm": 0.19401713555394456,
"learning_rate": 3.6952850917860007e-06,
"loss": 0.7663,
"step": 2926
},
{
"epoch": 0.8056148076790752,
"grad_norm": 0.2041512462972966,
"learning_rate": 3.685172359909235e-06,
"loss": 0.7695,
"step": 2927
},
{
"epoch": 0.8058900433496181,
"grad_norm": 0.19021372442475737,
"learning_rate": 3.6750720801394014e-06,
"loss": 0.7787,
"step": 2928
},
{
"epoch": 0.806165279020161,
"grad_norm": 0.19983376635489705,
"learning_rate": 3.6649842601854245e-06,
"loss": 0.7661,
"step": 2929
},
{
"epoch": 0.8064405146907039,
"grad_norm": 0.19094784715680338,
"learning_rate": 3.6549089077467258e-06,
"loss": 0.7669,
"step": 2930
},
{
"epoch": 0.8067157503612468,
"grad_norm": 0.18971923430952783,
"learning_rate": 3.6448460305131916e-06,
"loss": 0.7657,
"step": 2931
},
{
"epoch": 0.8069909860317898,
"grad_norm": 0.19290411168702953,
"learning_rate": 3.6347956361652135e-06,
"loss": 0.7557,
"step": 2932
},
{
"epoch": 0.8072662217023326,
"grad_norm": 0.19465580767708632,
"learning_rate": 3.624757732373629e-06,
"loss": 0.7351,
"step": 2933
},
{
"epoch": 0.8075414573728755,
"grad_norm": 0.19469910878182503,
"learning_rate": 3.6147323267997592e-06,
"loss": 0.7553,
"step": 2934
},
{
"epoch": 0.8078166930434184,
"grad_norm": 0.19101637004024657,
"learning_rate": 3.6047194270953846e-06,
"loss": 0.7664,
"step": 2935
},
{
"epoch": 0.8080919287139613,
"grad_norm": 0.2097603453646194,
"learning_rate": 3.5947190409027276e-06,
"loss": 0.7646,
"step": 2936
},
{
"epoch": 0.8083671643845042,
"grad_norm": 0.20919170913934443,
"learning_rate": 3.584731175854479e-06,
"loss": 0.7921,
"step": 2937
},
{
"epoch": 0.8086424000550472,
"grad_norm": 0.1955730678628757,
"learning_rate": 3.5747558395737493e-06,
"loss": 0.7665,
"step": 2938
},
{
"epoch": 0.8089176357255901,
"grad_norm": 0.20074242203864368,
"learning_rate": 3.5647930396741213e-06,
"loss": 0.7552,
"step": 2939
},
{
"epoch": 0.8091928713961329,
"grad_norm": 0.20185678062181947,
"learning_rate": 3.5548427837595735e-06,
"loss": 0.8127,
"step": 2940
},
{
"epoch": 0.8094681070666758,
"grad_norm": 0.19660344000150748,
"learning_rate": 3.54490507942453e-06,
"loss": 0.7876,
"step": 2941
},
{
"epoch": 0.8097433427372187,
"grad_norm": 0.19445674769325583,
"learning_rate": 3.534979934253835e-06,
"loss": 0.7555,
"step": 2942
},
{
"epoch": 0.8100185784077616,
"grad_norm": 0.4918896739297948,
"learning_rate": 3.5250673558227356e-06,
"loss": 0.786,
"step": 2943
},
{
"epoch": 0.8102938140783046,
"grad_norm": 0.20779493364366397,
"learning_rate": 3.5151673516968956e-06,
"loss": 0.7912,
"step": 2944
},
{
"epoch": 0.8105690497488475,
"grad_norm": 0.19431374838052975,
"learning_rate": 3.505279929432386e-06,
"loss": 0.7623,
"step": 2945
},
{
"epoch": 0.8108442854193904,
"grad_norm": 0.20518241247885818,
"learning_rate": 3.495405096575664e-06,
"loss": 0.7666,
"step": 2946
},
{
"epoch": 0.8111195210899332,
"grad_norm": 0.20721678846360644,
"learning_rate": 3.485542860663593e-06,
"loss": 0.783,
"step": 2947
},
{
"epoch": 0.8113947567604761,
"grad_norm": 0.20064913719736718,
"learning_rate": 3.4756932292234e-06,
"loss": 0.7949,
"step": 2948
},
{
"epoch": 0.811669992431019,
"grad_norm": 0.19500803217870402,
"learning_rate": 3.4658562097727177e-06,
"loss": 0.7643,
"step": 2949
},
{
"epoch": 0.811945228101562,
"grad_norm": 0.19439907281721805,
"learning_rate": 3.4560318098195244e-06,
"loss": 0.7589,
"step": 2950
},
{
"epoch": 0.8122204637721049,
"grad_norm": 0.1951219514353315,
"learning_rate": 3.446220036862191e-06,
"loss": 0.752,
"step": 2951
},
{
"epoch": 0.8124956994426478,
"grad_norm": 0.19990388577876386,
"learning_rate": 3.4364208983894387e-06,
"loss": 0.7522,
"step": 2952
},
{
"epoch": 0.8127709351131907,
"grad_norm": 0.19486143805117162,
"learning_rate": 3.426634401880351e-06,
"loss": 0.7498,
"step": 2953
},
{
"epoch": 0.8130461707837335,
"grad_norm": 0.18819736579265198,
"learning_rate": 3.4168605548043663e-06,
"loss": 0.7576,
"step": 2954
},
{
"epoch": 0.8133214064542764,
"grad_norm": 0.1927019017067847,
"learning_rate": 3.4070993646212493e-06,
"loss": 0.7483,
"step": 2955
},
{
"epoch": 0.8135966421248194,
"grad_norm": 0.19342814881717693,
"learning_rate": 3.3973508387811237e-06,
"loss": 0.7859,
"step": 2956
},
{
"epoch": 0.8138718777953623,
"grad_norm": 0.19795873741353534,
"learning_rate": 3.3876149847244454e-06,
"loss": 0.7431,
"step": 2957
},
{
"epoch": 0.8141471134659052,
"grad_norm": 0.2014558814393953,
"learning_rate": 3.377891809881986e-06,
"loss": 0.7834,
"step": 2958
},
{
"epoch": 0.8144223491364481,
"grad_norm": 0.439267306111341,
"learning_rate": 3.368181321674853e-06,
"loss": 0.7731,
"step": 2959
},
{
"epoch": 0.814697584806991,
"grad_norm": 0.19408651237144176,
"learning_rate": 3.3584835275144647e-06,
"loss": 0.7895,
"step": 2960
},
{
"epoch": 0.8149728204775339,
"grad_norm": 0.2024694272879404,
"learning_rate": 3.348798434802556e-06,
"loss": 0.7944,
"step": 2961
},
{
"epoch": 0.8152480561480768,
"grad_norm": 0.19688323788979772,
"learning_rate": 3.339126050931165e-06,
"loss": 0.7733,
"step": 2962
},
{
"epoch": 0.8155232918186197,
"grad_norm": 0.19720016564533846,
"learning_rate": 3.3294663832826204e-06,
"loss": 0.7636,
"step": 2963
},
{
"epoch": 0.8157985274891626,
"grad_norm": 0.19631478262680774,
"learning_rate": 3.3198194392295636e-06,
"loss": 0.7929,
"step": 2964
},
{
"epoch": 0.8160737631597055,
"grad_norm": 0.194271823544458,
"learning_rate": 3.3101852261349053e-06,
"loss": 0.7771,
"step": 2965
},
{
"epoch": 0.8163489988302484,
"grad_norm": 0.19924369045625256,
"learning_rate": 3.300563751351855e-06,
"loss": 0.7604,
"step": 2966
},
{
"epoch": 0.8166242345007914,
"grad_norm": 0.19760410232127573,
"learning_rate": 3.2909550222238916e-06,
"loss": 0.7797,
"step": 2967
},
{
"epoch": 0.8168994701713342,
"grad_norm": 0.196418416252485,
"learning_rate": 3.281359046084771e-06,
"loss": 0.7804,
"step": 2968
},
{
"epoch": 0.8171747058418771,
"grad_norm": 0.19361302100665764,
"learning_rate": 3.271775830258519e-06,
"loss": 0.7388,
"step": 2969
},
{
"epoch": 0.81744994151242,
"grad_norm": 0.20038229350070116,
"learning_rate": 3.2622053820594025e-06,
"loss": 0.773,
"step": 2970
},
{
"epoch": 0.8177251771829629,
"grad_norm": 0.2031306493792435,
"learning_rate": 3.252647708791965e-06,
"loss": 0.8166,
"step": 2971
},
{
"epoch": 0.8180004128535058,
"grad_norm": 0.197840823564769,
"learning_rate": 3.243102817750996e-06,
"loss": 0.7912,
"step": 2972
},
{
"epoch": 0.8182756485240488,
"grad_norm": 0.19223881626719536,
"learning_rate": 3.233570716221517e-06,
"loss": 0.7467,
"step": 2973
},
{
"epoch": 0.8185508841945917,
"grad_norm": 0.1861312366771784,
"learning_rate": 3.224051411478799e-06,
"loss": 0.7426,
"step": 2974
},
{
"epoch": 0.8188261198651345,
"grad_norm": 0.20161153104256666,
"learning_rate": 3.214544910788344e-06,
"loss": 0.7794,
"step": 2975
},
{
"epoch": 0.8191013555356774,
"grad_norm": 0.1983209725800102,
"learning_rate": 3.205051221405886e-06,
"loss": 0.7627,
"step": 2976
},
{
"epoch": 0.8193765912062203,
"grad_norm": 0.19725390707820556,
"learning_rate": 3.195570350577366e-06,
"loss": 0.7879,
"step": 2977
},
{
"epoch": 0.8196518268767632,
"grad_norm": 0.19682838303602035,
"learning_rate": 3.186102305538956e-06,
"loss": 0.7984,
"step": 2978
},
{
"epoch": 0.8199270625473062,
"grad_norm": 0.19339250349237413,
"learning_rate": 3.176647093517038e-06,
"loss": 0.7782,
"step": 2979
},
{
"epoch": 0.8202022982178491,
"grad_norm": 0.1955081265108639,
"learning_rate": 3.1672047217281853e-06,
"loss": 0.783,
"step": 2980
},
{
"epoch": 0.820477533888392,
"grad_norm": 0.19813172300728882,
"learning_rate": 3.157775197379187e-06,
"loss": 0.7688,
"step": 2981
},
{
"epoch": 0.8207527695589348,
"grad_norm": 0.1964422364694359,
"learning_rate": 3.148358527667019e-06,
"loss": 0.7796,
"step": 2982
},
{
"epoch": 0.8210280052294777,
"grad_norm": 0.20253715265668207,
"learning_rate": 3.138954719778848e-06,
"loss": 0.7783,
"step": 2983
},
{
"epoch": 0.8213032409000206,
"grad_norm": 0.19731673602494068,
"learning_rate": 3.1295637808920286e-06,
"loss": 0.7714,
"step": 2984
},
{
"epoch": 0.8215784765705636,
"grad_norm": 0.18730586826954426,
"learning_rate": 3.1201857181740804e-06,
"loss": 0.7644,
"step": 2985
},
{
"epoch": 0.8218537122411065,
"grad_norm": 0.4150346032094395,
"learning_rate": 3.1108205387827085e-06,
"loss": 0.7828,
"step": 2986
},
{
"epoch": 0.8221289479116494,
"grad_norm": 0.20001015037380662,
"learning_rate": 3.1014682498657733e-06,
"loss": 0.7583,
"step": 2987
},
{
"epoch": 0.8224041835821922,
"grad_norm": 0.19220897004573384,
"learning_rate": 3.0921288585613053e-06,
"loss": 0.7742,
"step": 2988
},
{
"epoch": 0.8226794192527351,
"grad_norm": 0.1937326967582978,
"learning_rate": 3.0828023719974975e-06,
"loss": 0.7888,
"step": 2989
},
{
"epoch": 0.822954654923278,
"grad_norm": 0.19002952589220604,
"learning_rate": 3.0734887972926764e-06,
"loss": 0.7444,
"step": 2990
},
{
"epoch": 0.823229890593821,
"grad_norm": 0.19429892427198608,
"learning_rate": 3.0641881415553266e-06,
"loss": 0.773,
"step": 2991
},
{
"epoch": 0.8235051262643639,
"grad_norm": 0.1991475218747388,
"learning_rate": 3.0549004118840606e-06,
"loss": 0.771,
"step": 2992
},
{
"epoch": 0.8237803619349068,
"grad_norm": 0.19603075579799845,
"learning_rate": 3.0456256153676402e-06,
"loss": 0.7506,
"step": 2993
},
{
"epoch": 0.8240555976054497,
"grad_norm": 0.19267081961901716,
"learning_rate": 3.0363637590849483e-06,
"loss": 0.7926,
"step": 2994
},
{
"epoch": 0.8243308332759925,
"grad_norm": 0.1937744240979409,
"learning_rate": 3.0271148501049796e-06,
"loss": 0.7925,
"step": 2995
},
{
"epoch": 0.8246060689465354,
"grad_norm": 0.1952112705672228,
"learning_rate": 3.0178788954868764e-06,
"loss": 0.7967,
"step": 2996
},
{
"epoch": 0.8248813046170784,
"grad_norm": 0.18706297543548323,
"learning_rate": 3.008655902279867e-06,
"loss": 0.7704,
"step": 2997
},
{
"epoch": 0.8251565402876213,
"grad_norm": 0.19281286307768228,
"learning_rate": 2.9994458775232947e-06,
"loss": 0.7863,
"step": 2998
},
{
"epoch": 0.8254317759581642,
"grad_norm": 0.1940332554826848,
"learning_rate": 2.9902488282466135e-06,
"loss": 0.783,
"step": 2999
},
{
"epoch": 0.8257070116287071,
"grad_norm": 0.19919472902227528,
"learning_rate": 2.981064761469359e-06,
"loss": 0.763,
"step": 3000
},
{
"epoch": 0.82598224729925,
"grad_norm": 0.1898812375911402,
"learning_rate": 2.9718936842011727e-06,
"loss": 0.7741,
"step": 3001
},
{
"epoch": 0.8262574829697928,
"grad_norm": 0.19317549723498484,
"learning_rate": 2.962735603441762e-06,
"loss": 0.7943,
"step": 3002
},
{
"epoch": 0.8265327186403358,
"grad_norm": 0.4836962372813598,
"learning_rate": 2.9535905261809492e-06,
"loss": 0.7918,
"step": 3003
},
{
"epoch": 0.8268079543108787,
"grad_norm": 0.2012962845456614,
"learning_rate": 2.9444584593985914e-06,
"loss": 0.7917,
"step": 3004
},
{
"epoch": 0.8270831899814216,
"grad_norm": 0.18626972790480248,
"learning_rate": 2.935339410064646e-06,
"loss": 0.7644,
"step": 3005
},
{
"epoch": 0.8273584256519645,
"grad_norm": 0.1929550257006686,
"learning_rate": 2.9262333851391234e-06,
"loss": 0.7899,
"step": 3006
},
{
"epoch": 0.8276336613225074,
"grad_norm": 0.18986651972753832,
"learning_rate": 2.917140391572084e-06,
"loss": 0.7416,
"step": 3007
},
{
"epoch": 0.8279088969930503,
"grad_norm": 0.19094706258894267,
"learning_rate": 2.908060436303661e-06,
"loss": 0.7583,
"step": 3008
},
{
"epoch": 0.8281841326635933,
"grad_norm": 0.19494245808498553,
"learning_rate": 2.8989935262640245e-06,
"loss": 0.7852,
"step": 3009
},
{
"epoch": 0.8284593683341361,
"grad_norm": 0.1939673339423602,
"learning_rate": 2.8899396683733916e-06,
"loss": 0.7855,
"step": 3010
},
{
"epoch": 0.828734604004679,
"grad_norm": 0.1922472596497471,
"learning_rate": 2.880898869542019e-06,
"loss": 0.7747,
"step": 3011
},
{
"epoch": 0.8290098396752219,
"grad_norm": 0.19109372042741662,
"learning_rate": 2.871871136670188e-06,
"loss": 0.7545,
"step": 3012
},
{
"epoch": 0.8292850753457648,
"grad_norm": 0.19998845581220057,
"learning_rate": 2.8628564766482193e-06,
"loss": 0.8223,
"step": 3013
},
{
"epoch": 0.8295603110163077,
"grad_norm": 0.18875318151334095,
"learning_rate": 2.8538548963564405e-06,
"loss": 0.775,
"step": 3014
},
{
"epoch": 0.8298355466868507,
"grad_norm": 0.18877364565375876,
"learning_rate": 2.844866402665214e-06,
"loss": 0.7682,
"step": 3015
},
{
"epoch": 0.8301107823573936,
"grad_norm": 0.18672888348505698,
"learning_rate": 2.8358910024349006e-06,
"loss": 0.7456,
"step": 3016
},
{
"epoch": 0.8303860180279364,
"grad_norm": 0.32179740836847887,
"learning_rate": 2.8269287025158767e-06,
"loss": 0.7346,
"step": 3017
},
{
"epoch": 0.8306612536984793,
"grad_norm": 0.1887537464602044,
"learning_rate": 2.8179795097485163e-06,
"loss": 0.7658,
"step": 3018
},
{
"epoch": 0.8309364893690222,
"grad_norm": 0.19110158631182372,
"learning_rate": 2.8090434309631852e-06,
"loss": 0.8016,
"step": 3019
},
{
"epoch": 0.8312117250395651,
"grad_norm": 0.19256974946442487,
"learning_rate": 2.8001204729802435e-06,
"loss": 0.7815,
"step": 3020
},
{
"epoch": 0.8314869607101081,
"grad_norm": 0.1970175901032695,
"learning_rate": 2.791210642610045e-06,
"loss": 0.7681,
"step": 3021
},
{
"epoch": 0.831762196380651,
"grad_norm": 0.2015234789428279,
"learning_rate": 2.7823139466529082e-06,
"loss": 0.7663,
"step": 3022
},
{
"epoch": 0.8320374320511938,
"grad_norm": 0.1931552456652713,
"learning_rate": 2.7734303918991367e-06,
"loss": 0.7393,
"step": 3023
},
{
"epoch": 0.8323126677217367,
"grad_norm": 0.1982835634815639,
"learning_rate": 2.764559985129007e-06,
"loss": 0.7899,
"step": 3024
},
{
"epoch": 0.8325879033922796,
"grad_norm": 0.19993477247641417,
"learning_rate": 2.7557027331127572e-06,
"loss": 0.7483,
"step": 3025
},
{
"epoch": 0.8328631390628225,
"grad_norm": 0.20098094993148255,
"learning_rate": 2.746858642610577e-06,
"loss": 0.7763,
"step": 3026
},
{
"epoch": 0.8331383747333655,
"grad_norm": 0.19585662373306542,
"learning_rate": 2.73802772037262e-06,
"loss": 0.7794,
"step": 3027
},
{
"epoch": 0.8334136104039084,
"grad_norm": 0.19508845229334704,
"learning_rate": 2.729209973138998e-06,
"loss": 0.7656,
"step": 3028
},
{
"epoch": 0.8336888460744513,
"grad_norm": 0.19625263210101154,
"learning_rate": 2.720405407639739e-06,
"loss": 0.7887,
"step": 3029
},
{
"epoch": 0.8339640817449941,
"grad_norm": 0.1924255004539693,
"learning_rate": 2.71161403059484e-06,
"loss": 0.7594,
"step": 3030
},
{
"epoch": 0.834239317415537,
"grad_norm": 0.1915629995397315,
"learning_rate": 2.7028358487142137e-06,
"loss": 0.7801,
"step": 3031
},
{
"epoch": 0.8345145530860799,
"grad_norm": 0.19486619175228329,
"learning_rate": 2.6940708686977137e-06,
"loss": 0.7872,
"step": 3032
},
{
"epoch": 0.8347897887566229,
"grad_norm": 0.19743173389535998,
"learning_rate": 2.6853190972351085e-06,
"loss": 0.758,
"step": 3033
},
{
"epoch": 0.8350650244271658,
"grad_norm": 0.19409464429494008,
"learning_rate": 2.6765805410060863e-06,
"loss": 0.7796,
"step": 3034
},
{
"epoch": 0.8353402600977087,
"grad_norm": 0.19205024981287255,
"learning_rate": 2.6678552066802566e-06,
"loss": 0.7703,
"step": 3035
},
{
"epoch": 0.8356154957682516,
"grad_norm": 0.18985458380871004,
"learning_rate": 2.659143100917121e-06,
"loss": 0.7662,
"step": 3036
},
{
"epoch": 0.8358907314387944,
"grad_norm": 0.1936561993222744,
"learning_rate": 2.6504442303661027e-06,
"loss": 0.7665,
"step": 3037
},
{
"epoch": 0.8361659671093373,
"grad_norm": 0.18802118222778885,
"learning_rate": 2.6417586016665174e-06,
"loss": 0.771,
"step": 3038
},
{
"epoch": 0.8364412027798803,
"grad_norm": 0.20100424618090773,
"learning_rate": 2.6330862214475673e-06,
"loss": 0.7877,
"step": 3039
},
{
"epoch": 0.8367164384504232,
"grad_norm": 0.20037185262232557,
"learning_rate": 2.624427096328357e-06,
"loss": 0.7814,
"step": 3040
},
{
"epoch": 0.8369916741209661,
"grad_norm": 0.19698960155073983,
"learning_rate": 2.6157812329178556e-06,
"loss": 0.7892,
"step": 3041
},
{
"epoch": 0.837266909791509,
"grad_norm": 0.19251790664222262,
"learning_rate": 2.6071486378149225e-06,
"loss": 0.7851,
"step": 3042
},
{
"epoch": 0.8375421454620519,
"grad_norm": 0.20020046308820605,
"learning_rate": 2.598529317608296e-06,
"loss": 0.8155,
"step": 3043
},
{
"epoch": 0.8378173811325947,
"grad_norm": 0.19557821252234994,
"learning_rate": 2.5899232788765604e-06,
"loss": 0.7396,
"step": 3044
},
{
"epoch": 0.8380926168031377,
"grad_norm": 0.18915899568465921,
"learning_rate": 2.581330528188186e-06,
"loss": 0.7837,
"step": 3045
},
{
"epoch": 0.8383678524736806,
"grad_norm": 0.19241535460006218,
"learning_rate": 2.5727510721014916e-06,
"loss": 0.7821,
"step": 3046
},
{
"epoch": 0.8386430881442235,
"grad_norm": 0.1917539653288574,
"learning_rate": 2.5641849171646473e-06,
"loss": 0.7711,
"step": 3047
},
{
"epoch": 0.8389183238147664,
"grad_norm": 0.19484514647989906,
"learning_rate": 2.555632069915681e-06,
"loss": 0.7632,
"step": 3048
},
{
"epoch": 0.8391935594853093,
"grad_norm": 0.1926394897604978,
"learning_rate": 2.547092536882445e-06,
"loss": 0.7314,
"step": 3049
},
{
"epoch": 0.8394687951558522,
"grad_norm": 0.19796859732888455,
"learning_rate": 2.5385663245826498e-06,
"loss": 0.7662,
"step": 3050
},
{
"epoch": 0.8397440308263951,
"grad_norm": 0.1915626795030087,
"learning_rate": 2.530053439523823e-06,
"loss": 0.8084,
"step": 3051
},
{
"epoch": 0.840019266496938,
"grad_norm": 0.19644886570060285,
"learning_rate": 2.5215538882033296e-06,
"loss": 0.7609,
"step": 3052
},
{
"epoch": 0.8402945021674809,
"grad_norm": 0.2167925402527184,
"learning_rate": 2.5130676771083585e-06,
"loss": 0.7545,
"step": 3053
},
{
"epoch": 0.8405697378380238,
"grad_norm": 0.20080450993439886,
"learning_rate": 2.5045948127159105e-06,
"loss": 0.7818,
"step": 3054
},
{
"epoch": 0.8408449735085667,
"grad_norm": 0.2092790475384215,
"learning_rate": 2.4961353014928103e-06,
"loss": 0.7866,
"step": 3055
},
{
"epoch": 0.8411202091791096,
"grad_norm": 0.18866886391908752,
"learning_rate": 2.4876891498956758e-06,
"loss": 0.7528,
"step": 3056
},
{
"epoch": 0.8413954448496526,
"grad_norm": 0.21095465191219404,
"learning_rate": 2.4792563643709367e-06,
"loss": 0.8106,
"step": 3057
},
{
"epoch": 0.8416706805201954,
"grad_norm": 0.1888393831679301,
"learning_rate": 2.4708369513548293e-06,
"loss": 0.7708,
"step": 3058
},
{
"epoch": 0.8419459161907383,
"grad_norm": 0.18666594993816893,
"learning_rate": 2.4624309172733597e-06,
"loss": 0.7579,
"step": 3059
},
{
"epoch": 0.8422211518612812,
"grad_norm": 0.18384518346947176,
"learning_rate": 2.4540382685423535e-06,
"loss": 0.7486,
"step": 3060
},
{
"epoch": 0.8424963875318241,
"grad_norm": 0.1842857426357308,
"learning_rate": 2.4456590115673963e-06,
"loss": 0.7396,
"step": 3061
},
{
"epoch": 0.842771623202367,
"grad_norm": 0.18227634287949585,
"learning_rate": 2.437293152743865e-06,
"loss": 0.7548,
"step": 3062
},
{
"epoch": 0.84304685887291,
"grad_norm": 0.18846389925830498,
"learning_rate": 2.4289406984569008e-06,
"loss": 0.7603,
"step": 3063
},
{
"epoch": 0.8433220945434529,
"grad_norm": 0.22281292004390954,
"learning_rate": 2.4206016550814227e-06,
"loss": 0.7945,
"step": 3064
},
{
"epoch": 0.8435973302139957,
"grad_norm": 0.1879150711883053,
"learning_rate": 2.4122760289821144e-06,
"loss": 0.7636,
"step": 3065
},
{
"epoch": 0.8438725658845386,
"grad_norm": 0.19575038549231671,
"learning_rate": 2.4039638265134045e-06,
"loss": 0.7655,
"step": 3066
},
{
"epoch": 0.8441478015550815,
"grad_norm": 0.19200307974339387,
"learning_rate": 2.3956650540195024e-06,
"loss": 0.7688,
"step": 3067
},
{
"epoch": 0.8444230372256244,
"grad_norm": 0.1944801103609571,
"learning_rate": 2.3873797178343417e-06,
"loss": 0.752,
"step": 3068
},
{
"epoch": 0.8446982728961674,
"grad_norm": 0.3400995668774979,
"learning_rate": 2.3791078242816124e-06,
"loss": 0.7687,
"step": 3069
},
{
"epoch": 0.8449735085667103,
"grad_norm": 0.3406649122138235,
"learning_rate": 2.370849379674749e-06,
"loss": 0.7593,
"step": 3070
},
{
"epoch": 0.8452487442372532,
"grad_norm": 0.18803388959424092,
"learning_rate": 2.3626043903169073e-06,
"loss": 0.7539,
"step": 3071
},
{
"epoch": 0.845523979907796,
"grad_norm": 0.19313704606149756,
"learning_rate": 2.3543728625009885e-06,
"loss": 0.7572,
"step": 3072
},
{
"epoch": 0.8457992155783389,
"grad_norm": 0.19190785413980008,
"learning_rate": 2.3461548025096015e-06,
"loss": 0.7487,
"step": 3073
},
{
"epoch": 0.8460744512488818,
"grad_norm": 0.19146675929192586,
"learning_rate": 2.3379502166151015e-06,
"loss": 0.7728,
"step": 3074
},
{
"epoch": 0.8463496869194248,
"grad_norm": 0.1853750993600032,
"learning_rate": 2.3297591110795437e-06,
"loss": 0.7585,
"step": 3075
},
{
"epoch": 0.8466249225899677,
"grad_norm": 0.18446562759880664,
"learning_rate": 2.3215814921546853e-06,
"loss": 0.7436,
"step": 3076
},
{
"epoch": 0.8469001582605106,
"grad_norm": 0.19129834660475867,
"learning_rate": 2.313417366082016e-06,
"loss": 0.7819,
"step": 3077
},
{
"epoch": 0.8471753939310535,
"grad_norm": 0.1923504287410237,
"learning_rate": 2.3052667390926975e-06,
"loss": 0.766,
"step": 3078
},
{
"epoch": 0.8474506296015963,
"grad_norm": 0.18878989305329127,
"learning_rate": 2.297129617407612e-06,
"loss": 0.7693,
"step": 3079
},
{
"epoch": 0.8477258652721392,
"grad_norm": 0.26253976707159266,
"learning_rate": 2.2890060072373288e-06,
"loss": 0.7675,
"step": 3080
},
{
"epoch": 0.8480011009426822,
"grad_norm": 0.19511497948672077,
"learning_rate": 2.280895914782084e-06,
"loss": 0.7673,
"step": 3081
},
{
"epoch": 0.8482763366132251,
"grad_norm": 0.18956334635108352,
"learning_rate": 2.2727993462318376e-06,
"loss": 0.7595,
"step": 3082
},
{
"epoch": 0.848551572283768,
"grad_norm": 0.19191875617856083,
"learning_rate": 2.2647163077661837e-06,
"loss": 0.7675,
"step": 3083
},
{
"epoch": 0.8488268079543109,
"grad_norm": 0.1887789355258137,
"learning_rate": 2.256646805554419e-06,
"loss": 0.7641,
"step": 3084
},
{
"epoch": 0.8491020436248538,
"grad_norm": 0.18819429071318444,
"learning_rate": 2.2485908457555027e-06,
"loss": 0.7295,
"step": 3085
},
{
"epoch": 0.8493772792953966,
"grad_norm": 0.18802130214960133,
"learning_rate": 2.2405484345180438e-06,
"loss": 0.7566,
"step": 3086
},
{
"epoch": 0.8496525149659396,
"grad_norm": 0.19439351714579375,
"learning_rate": 2.232519577980332e-06,
"loss": 0.7339,
"step": 3087
},
{
"epoch": 0.8499277506364825,
"grad_norm": 0.19693049546862057,
"learning_rate": 2.224504282270288e-06,
"loss": 0.7624,
"step": 3088
},
{
"epoch": 0.8502029863070254,
"grad_norm": 0.1856672561841339,
"learning_rate": 2.2165025535055128e-06,
"loss": 0.7638,
"step": 3089
},
{
"epoch": 0.8504782219775683,
"grad_norm": 0.2458584839079196,
"learning_rate": 2.20851439779322e-06,
"loss": 0.7547,
"step": 3090
},
{
"epoch": 0.8507534576481112,
"grad_norm": 0.18778140660727222,
"learning_rate": 2.2005398212302853e-06,
"loss": 0.7702,
"step": 3091
},
{
"epoch": 0.851028693318654,
"grad_norm": 0.18783755518316525,
"learning_rate": 2.192578829903216e-06,
"loss": 0.7663,
"step": 3092
},
{
"epoch": 0.851303928989197,
"grad_norm": 0.1950484061634625,
"learning_rate": 2.18463142988814e-06,
"loss": 0.7838,
"step": 3093
},
{
"epoch": 0.8515791646597399,
"grad_norm": 0.19217844936554562,
"learning_rate": 2.176697627250828e-06,
"loss": 0.7642,
"step": 3094
},
{
"epoch": 0.8518544003302828,
"grad_norm": 0.19072685885204396,
"learning_rate": 2.16877742804666e-06,
"loss": 0.7951,
"step": 3095
},
{
"epoch": 0.8521296360008257,
"grad_norm": 0.1887281877148052,
"learning_rate": 2.160870838320639e-06,
"loss": 0.7711,
"step": 3096
},
{
"epoch": 0.8524048716713686,
"grad_norm": 0.18823243678510676,
"learning_rate": 2.152977864107386e-06,
"loss": 0.764,
"step": 3097
},
{
"epoch": 0.8526801073419115,
"grad_norm": 0.1905286541883655,
"learning_rate": 2.1450985114311163e-06,
"loss": 0.7634,
"step": 3098
},
{
"epoch": 0.8529553430124545,
"grad_norm": 0.18700851525890116,
"learning_rate": 2.137232786305661e-06,
"loss": 0.7843,
"step": 3099
},
{
"epoch": 0.8532305786829973,
"grad_norm": 0.18477582388788863,
"learning_rate": 2.1293806947344398e-06,
"loss": 0.7641,
"step": 3100
},
{
"epoch": 0.8535058143535402,
"grad_norm": 0.1910166465102131,
"learning_rate": 2.1215422427104748e-06,
"loss": 0.7712,
"step": 3101
},
{
"epoch": 0.8537810500240831,
"grad_norm": 0.19196299357818025,
"learning_rate": 2.1137174362163783e-06,
"loss": 0.7778,
"step": 3102
},
{
"epoch": 0.854056285694626,
"grad_norm": 0.21988122045040998,
"learning_rate": 2.1059062812243437e-06,
"loss": 0.7832,
"step": 3103
},
{
"epoch": 0.8543315213651689,
"grad_norm": 0.19395509727436827,
"learning_rate": 2.098108783696149e-06,
"loss": 0.7716,
"step": 3104
},
{
"epoch": 0.8546067570357119,
"grad_norm": 0.1902137491034438,
"learning_rate": 2.09032494958314e-06,
"loss": 0.7617,
"step": 3105
},
{
"epoch": 0.8548819927062548,
"grad_norm": 0.1855414369841561,
"learning_rate": 2.0825547848262405e-06,
"loss": 0.7504,
"step": 3106
},
{
"epoch": 0.8551572283767976,
"grad_norm": 0.20050767045060103,
"learning_rate": 2.0747982953559464e-06,
"loss": 0.7775,
"step": 3107
},
{
"epoch": 0.8554324640473405,
"grad_norm": 0.19249308183899058,
"learning_rate": 2.0670554870923042e-06,
"loss": 0.7588,
"step": 3108
},
{
"epoch": 0.8557076997178834,
"grad_norm": 0.2232603600663087,
"learning_rate": 2.0593263659449247e-06,
"loss": 0.7739,
"step": 3109
},
{
"epoch": 0.8559829353884263,
"grad_norm": 0.18450780097643896,
"learning_rate": 2.0516109378129756e-06,
"loss": 0.761,
"step": 3110
},
{
"epoch": 0.8562581710589693,
"grad_norm": 0.18469898200814766,
"learning_rate": 2.0439092085851685e-06,
"loss": 0.7671,
"step": 3111
},
{
"epoch": 0.8565334067295122,
"grad_norm": 0.18658290947008352,
"learning_rate": 2.0362211841397594e-06,
"loss": 0.7742,
"step": 3112
},
{
"epoch": 0.856808642400055,
"grad_norm": 0.1861967748207594,
"learning_rate": 2.028546870344543e-06,
"loss": 0.7398,
"step": 3113
},
{
"epoch": 0.8570838780705979,
"grad_norm": 0.1912475258522535,
"learning_rate": 2.0208862730568614e-06,
"loss": 0.8127,
"step": 3114
},
{
"epoch": 0.8573591137411408,
"grad_norm": 0.1943396310289039,
"learning_rate": 2.01323939812357e-06,
"loss": 0.774,
"step": 3115
},
{
"epoch": 0.8576343494116837,
"grad_norm": 0.190822139485722,
"learning_rate": 2.0056062513810583e-06,
"loss": 0.78,
"step": 3116
},
{
"epoch": 0.8579095850822267,
"grad_norm": 0.1852874973163684,
"learning_rate": 1.9979868386552436e-06,
"loss": 0.7775,
"step": 3117
},
{
"epoch": 0.8581848207527696,
"grad_norm": 0.19024843533673702,
"learning_rate": 1.990381165761557e-06,
"loss": 0.7629,
"step": 3118
},
{
"epoch": 0.8584600564233125,
"grad_norm": 0.18781757528629295,
"learning_rate": 1.982789238504941e-06,
"loss": 0.7609,
"step": 3119
},
{
"epoch": 0.8587352920938554,
"grad_norm": 0.1911685631427986,
"learning_rate": 1.975211062679845e-06,
"loss": 0.7642,
"step": 3120
},
{
"epoch": 0.8590105277643982,
"grad_norm": 0.19243268059652235,
"learning_rate": 1.967646644070229e-06,
"loss": 0.778,
"step": 3121
},
{
"epoch": 0.8592857634349411,
"grad_norm": 0.18274446698876182,
"learning_rate": 1.960095988449546e-06,
"loss": 0.7502,
"step": 3122
},
{
"epoch": 0.8595609991054841,
"grad_norm": 0.18676681567293904,
"learning_rate": 1.9525591015807465e-06,
"loss": 0.7595,
"step": 3123
},
{
"epoch": 0.859836234776027,
"grad_norm": 0.18750218930674167,
"learning_rate": 1.945035989216284e-06,
"loss": 0.7646,
"step": 3124
},
{
"epoch": 0.8601114704465699,
"grad_norm": 0.19350729645871378,
"learning_rate": 1.937526657098079e-06,
"loss": 0.7515,
"step": 3125
},
{
"epoch": 0.8603867061171128,
"grad_norm": 0.18598660251185678,
"learning_rate": 1.930031110957551e-06,
"loss": 0.7478,
"step": 3126
},
{
"epoch": 0.8606619417876556,
"grad_norm": 0.18797587796946585,
"learning_rate": 1.922549356515582e-06,
"loss": 0.7358,
"step": 3127
},
{
"epoch": 0.8609371774581985,
"grad_norm": 0.19094824376945202,
"learning_rate": 1.915081399482539e-06,
"loss": 0.7729,
"step": 3128
},
{
"epoch": 0.8612124131287415,
"grad_norm": 0.1937358458010856,
"learning_rate": 1.9076272455582635e-06,
"loss": 0.7826,
"step": 3129
},
{
"epoch": 0.8614876487992844,
"grad_norm": 0.39977367190921886,
"learning_rate": 1.9001869004320395e-06,
"loss": 0.7631,
"step": 3130
},
{
"epoch": 0.8617628844698273,
"grad_norm": 0.18933689560820816,
"learning_rate": 1.8927603697826403e-06,
"loss": 0.7727,
"step": 3131
},
{
"epoch": 0.8620381201403702,
"grad_norm": 0.187226965661617,
"learning_rate": 1.8853476592782717e-06,
"loss": 0.7491,
"step": 3132
},
{
"epoch": 0.8623133558109131,
"grad_norm": 0.19019037370739847,
"learning_rate": 1.8779487745766034e-06,
"loss": 0.7904,
"step": 3133
},
{
"epoch": 0.862588591481456,
"grad_norm": 0.1846789350724278,
"learning_rate": 1.870563721324754e-06,
"loss": 0.7587,
"step": 3134
},
{
"epoch": 0.8628638271519989,
"grad_norm": 0.18668370106947693,
"learning_rate": 1.8631925051592748e-06,
"loss": 0.7821,
"step": 3135
},
{
"epoch": 0.8631390628225418,
"grad_norm": 0.19123886465591422,
"learning_rate": 1.8558351317061696e-06,
"loss": 0.7677,
"step": 3136
},
{
"epoch": 0.8634142984930847,
"grad_norm": 0.19004359662777157,
"learning_rate": 1.8484916065808622e-06,
"loss": 0.7772,
"step": 3137
},
{
"epoch": 0.8636895341636276,
"grad_norm": 0.18887911554861778,
"learning_rate": 1.8411619353882182e-06,
"loss": 0.7514,
"step": 3138
},
{
"epoch": 0.8639647698341705,
"grad_norm": 0.18953425988012504,
"learning_rate": 1.833846123722529e-06,
"loss": 0.7806,
"step": 3139
},
{
"epoch": 0.8642400055047134,
"grad_norm": 0.5388338233803865,
"learning_rate": 1.8265441771675019e-06,
"loss": 0.7634,
"step": 3140
},
{
"epoch": 0.8645152411752564,
"grad_norm": 0.19444099108786989,
"learning_rate": 1.8192561012962673e-06,
"loss": 0.7535,
"step": 3141
},
{
"epoch": 0.8647904768457992,
"grad_norm": 0.18353107682211708,
"learning_rate": 1.8119819016713624e-06,
"loss": 0.7502,
"step": 3142
},
{
"epoch": 0.8650657125163421,
"grad_norm": 0.19021247181972498,
"learning_rate": 1.8047215838447397e-06,
"loss": 0.7739,
"step": 3143
},
{
"epoch": 0.865340948186885,
"grad_norm": 0.1911367831611288,
"learning_rate": 1.7974751533577572e-06,
"loss": 0.8046,
"step": 3144
},
{
"epoch": 0.8656161838574279,
"grad_norm": 0.18298775735981768,
"learning_rate": 1.7902426157411622e-06,
"loss": 0.7714,
"step": 3145
},
{
"epoch": 0.8658914195279708,
"grad_norm": 0.4505798359679756,
"learning_rate": 1.783023976515117e-06,
"loss": 0.8052,
"step": 3146
},
{
"epoch": 0.8661666551985138,
"grad_norm": 0.19479139866548645,
"learning_rate": 1.7758192411891584e-06,
"loss": 0.8106,
"step": 3147
},
{
"epoch": 0.8664418908690567,
"grad_norm": 0.1934580864132261,
"learning_rate": 1.7686284152622257e-06,
"loss": 0.7662,
"step": 3148
},
{
"epoch": 0.8667171265395995,
"grad_norm": 0.18951276942974823,
"learning_rate": 1.7614515042226289e-06,
"loss": 0.7829,
"step": 3149
},
{
"epoch": 0.8669923622101424,
"grad_norm": 0.18965836215958998,
"learning_rate": 1.7542885135480636e-06,
"loss": 0.7802,
"step": 3150
},
{
"epoch": 0.8672675978806853,
"grad_norm": 0.18843350597762612,
"learning_rate": 1.7471394487056082e-06,
"loss": 0.774,
"step": 3151
},
{
"epoch": 0.8675428335512282,
"grad_norm": 0.19853461141996423,
"learning_rate": 1.7400043151516955e-06,
"loss": 0.7543,
"step": 3152
},
{
"epoch": 0.8678180692217712,
"grad_norm": 0.1883128381789863,
"learning_rate": 1.7328831183321448e-06,
"loss": 0.7669,
"step": 3153
},
{
"epoch": 0.8680933048923141,
"grad_norm": 0.19324496425162602,
"learning_rate": 1.725775863682122e-06,
"loss": 0.7964,
"step": 3154
},
{
"epoch": 0.868368540562857,
"grad_norm": 0.18926465310836058,
"learning_rate": 1.718682556626161e-06,
"loss": 0.7768,
"step": 3155
},
{
"epoch": 0.8686437762333998,
"grad_norm": 0.186403577544894,
"learning_rate": 1.7116032025781515e-06,
"loss": 0.743,
"step": 3156
},
{
"epoch": 0.8689190119039427,
"grad_norm": 0.18083539936835596,
"learning_rate": 1.7045378069413222e-06,
"loss": 0.7643,
"step": 3157
},
{
"epoch": 0.8691942475744856,
"grad_norm": 0.1872850661697392,
"learning_rate": 1.6974863751082638e-06,
"loss": 0.7674,
"step": 3158
},
{
"epoch": 0.8694694832450286,
"grad_norm": 0.19361785999115488,
"learning_rate": 1.6904489124608892e-06,
"loss": 0.7449,
"step": 3159
},
{
"epoch": 0.8697447189155715,
"grad_norm": 0.19837906360231294,
"learning_rate": 1.6834254243704773e-06,
"loss": 0.7953,
"step": 3160
},
{
"epoch": 0.8700199545861144,
"grad_norm": 0.18428458602536607,
"learning_rate": 1.67641591619762e-06,
"loss": 0.7467,
"step": 3161
},
{
"epoch": 0.8702951902566572,
"grad_norm": 0.18759872592212673,
"learning_rate": 1.6694203932922404e-06,
"loss": 0.7823,
"step": 3162
},
{
"epoch": 0.8705704259272001,
"grad_norm": 0.19769691867793007,
"learning_rate": 1.6624388609935981e-06,
"loss": 0.7689,
"step": 3163
},
{
"epoch": 0.870845661597743,
"grad_norm": 0.19168604251327903,
"learning_rate": 1.6554713246302645e-06,
"loss": 0.7857,
"step": 3164
},
{
"epoch": 0.871120897268286,
"grad_norm": 0.18773544811091583,
"learning_rate": 1.648517789520132e-06,
"loss": 0.7462,
"step": 3165
},
{
"epoch": 0.8713961329388289,
"grad_norm": 0.1935682038227588,
"learning_rate": 1.641578260970409e-06,
"loss": 0.7864,
"step": 3166
},
{
"epoch": 0.8716713686093718,
"grad_norm": 0.18931453176890442,
"learning_rate": 1.6346527442776118e-06,
"loss": 0.7459,
"step": 3167
},
{
"epoch": 0.8719466042799147,
"grad_norm": 0.19666816714300905,
"learning_rate": 1.6277412447275653e-06,
"loss": 0.775,
"step": 3168
},
{
"epoch": 0.8722218399504575,
"grad_norm": 0.1831436738641368,
"learning_rate": 1.620843767595388e-06,
"loss": 0.7758,
"step": 3169
},
{
"epoch": 0.8724970756210004,
"grad_norm": 0.19154673596513266,
"learning_rate": 1.6139603181455022e-06,
"loss": 0.7869,
"step": 3170
},
{
"epoch": 0.8727723112915434,
"grad_norm": 0.19284064944952697,
"learning_rate": 1.6070909016316271e-06,
"loss": 0.7554,
"step": 3171
},
{
"epoch": 0.8730475469620863,
"grad_norm": 0.19356338535767414,
"learning_rate": 1.6002355232967603e-06,
"loss": 0.7748,
"step": 3172
},
{
"epoch": 0.8733227826326292,
"grad_norm": 0.19480134196135337,
"learning_rate": 1.593394188373194e-06,
"loss": 0.7846,
"step": 3173
},
{
"epoch": 0.8735980183031721,
"grad_norm": 0.19437637012350067,
"learning_rate": 1.586566902082498e-06,
"loss": 0.7871,
"step": 3174
},
{
"epoch": 0.873873253973715,
"grad_norm": 0.20935178360694914,
"learning_rate": 1.5797536696355287e-06,
"loss": 0.7568,
"step": 3175
},
{
"epoch": 0.8741484896442578,
"grad_norm": 0.19348409819622703,
"learning_rate": 1.5729544962323972e-06,
"loss": 0.7798,
"step": 3176
},
{
"epoch": 0.8744237253148008,
"grad_norm": 0.1945280528371594,
"learning_rate": 1.5661693870625017e-06,
"loss": 0.7789,
"step": 3177
},
{
"epoch": 0.8746989609853437,
"grad_norm": 0.19121053308573158,
"learning_rate": 1.5593983473045017e-06,
"loss": 0.7547,
"step": 3178
},
{
"epoch": 0.8749741966558866,
"grad_norm": 0.18661067137349469,
"learning_rate": 1.5526413821263097e-06,
"loss": 0.7409,
"step": 3179
},
{
"epoch": 0.8752494323264295,
"grad_norm": 0.1867149380573226,
"learning_rate": 1.5458984966851077e-06,
"loss": 0.7708,
"step": 3180
},
{
"epoch": 0.8755246679969724,
"grad_norm": 0.18681874219225864,
"learning_rate": 1.5391696961273228e-06,
"loss": 0.7559,
"step": 3181
},
{
"epoch": 0.8757999036675153,
"grad_norm": 0.1827532723605175,
"learning_rate": 1.5324549855886405e-06,
"loss": 0.7864,
"step": 3182
},
{
"epoch": 0.8760751393380583,
"grad_norm": 0.18886736509452898,
"learning_rate": 1.525754370193986e-06,
"loss": 0.7458,
"step": 3183
},
{
"epoch": 0.8763503750086011,
"grad_norm": 0.18389280692880228,
"learning_rate": 1.5190678550575256e-06,
"loss": 0.7757,
"step": 3184
},
{
"epoch": 0.876625610679144,
"grad_norm": 0.18341870730942056,
"learning_rate": 1.5123954452826682e-06,
"loss": 0.7369,
"step": 3185
},
{
"epoch": 0.8769008463496869,
"grad_norm": 0.19170232393290346,
"learning_rate": 1.5057371459620518e-06,
"loss": 0.7757,
"step": 3186
},
{
"epoch": 0.8771760820202298,
"grad_norm": 0.18727671536564067,
"learning_rate": 1.4990929621775485e-06,
"loss": 0.747,
"step": 3187
},
{
"epoch": 0.8774513176907727,
"grad_norm": 0.18804982515158827,
"learning_rate": 1.4924628990002576e-06,
"loss": 0.7709,
"step": 3188
},
{
"epoch": 0.8777265533613157,
"grad_norm": 0.19030775851831463,
"learning_rate": 1.4858469614905003e-06,
"loss": 0.7759,
"step": 3189
},
{
"epoch": 0.8780017890318585,
"grad_norm": 0.19100282276274508,
"learning_rate": 1.4792451546978171e-06,
"loss": 0.7866,
"step": 3190
},
{
"epoch": 0.8782770247024014,
"grad_norm": 0.18943433680658,
"learning_rate": 1.4726574836609575e-06,
"loss": 0.7883,
"step": 3191
},
{
"epoch": 0.8785522603729443,
"grad_norm": 0.19052339389646153,
"learning_rate": 1.4660839534078863e-06,
"loss": 0.7429,
"step": 3192
},
{
"epoch": 0.8788274960434872,
"grad_norm": 0.18385255754512617,
"learning_rate": 1.4595245689557834e-06,
"loss": 0.7684,
"step": 3193
},
{
"epoch": 0.8791027317140301,
"grad_norm": 0.18798321377523702,
"learning_rate": 1.4529793353110155e-06,
"loss": 0.7868,
"step": 3194
},
{
"epoch": 0.8793779673845731,
"grad_norm": 0.19142201236211664,
"learning_rate": 1.446448257469164e-06,
"loss": 0.7924,
"step": 3195
},
{
"epoch": 0.879653203055116,
"grad_norm": 0.1886902222370272,
"learning_rate": 1.439931340414995e-06,
"loss": 0.7838,
"step": 3196
},
{
"epoch": 0.8799284387256588,
"grad_norm": 0.1960848842709848,
"learning_rate": 1.4334285891224786e-06,
"loss": 0.772,
"step": 3197
},
{
"epoch": 0.8802036743962017,
"grad_norm": 0.18609669227991066,
"learning_rate": 1.426940008554758e-06,
"loss": 0.7719,
"step": 3198
},
{
"epoch": 0.8804789100667446,
"grad_norm": 0.18753491723105517,
"learning_rate": 1.4204656036641717e-06,
"loss": 0.7658,
"step": 3199
},
{
"epoch": 0.8807541457372875,
"grad_norm": 0.18702009830021854,
"learning_rate": 1.4140053793922403e-06,
"loss": 0.757,
"step": 3200
},
{
"epoch": 0.8810293814078305,
"grad_norm": 0.19158634371881353,
"learning_rate": 1.4075593406696464e-06,
"loss": 0.7774,
"step": 3201
},
{
"epoch": 0.8813046170783734,
"grad_norm": 0.1890662618782119,
"learning_rate": 1.4011274924162655e-06,
"loss": 0.7826,
"step": 3202
},
{
"epoch": 0.8815798527489163,
"grad_norm": 0.18656116002877973,
"learning_rate": 1.3947098395411263e-06,
"loss": 0.7795,
"step": 3203
},
{
"epoch": 0.8818550884194591,
"grad_norm": 0.19052416120103913,
"learning_rate": 1.388306386942433e-06,
"loss": 0.7606,
"step": 3204
},
{
"epoch": 0.882130324090002,
"grad_norm": 0.1810951845191948,
"learning_rate": 1.3819171395075515e-06,
"loss": 0.766,
"step": 3205
},
{
"epoch": 0.882405559760545,
"grad_norm": 0.18871073079663003,
"learning_rate": 1.3755421021129945e-06,
"loss": 0.7537,
"step": 3206
},
{
"epoch": 0.8826807954310879,
"grad_norm": 0.18354284584931382,
"learning_rate": 1.369181279624443e-06,
"loss": 0.7495,
"step": 3207
},
{
"epoch": 0.8829560311016308,
"grad_norm": 0.18181011041140363,
"learning_rate": 1.3628346768967183e-06,
"loss": 0.7373,
"step": 3208
},
{
"epoch": 0.8832312667721737,
"grad_norm": 0.17894662086945293,
"learning_rate": 1.3565022987737897e-06,
"loss": 0.7517,
"step": 3209
},
{
"epoch": 0.8835065024427166,
"grad_norm": 0.2013464428332814,
"learning_rate": 1.3501841500887846e-06,
"loss": 0.7759,
"step": 3210
},
{
"epoch": 0.8837817381132594,
"grad_norm": 0.18302769263600557,
"learning_rate": 1.34388023566395e-06,
"loss": 0.7858,
"step": 3211
},
{
"epoch": 0.8840569737838024,
"grad_norm": 0.18281235658011175,
"learning_rate": 1.3375905603106798e-06,
"loss": 0.777,
"step": 3212
},
{
"epoch": 0.8843322094543453,
"grad_norm": 0.18927640171862636,
"learning_rate": 1.3313151288294933e-06,
"loss": 0.7855,
"step": 3213
},
{
"epoch": 0.8846074451248882,
"grad_norm": 0.18266040214288423,
"learning_rate": 1.3250539460100465e-06,
"loss": 0.7621,
"step": 3214
},
{
"epoch": 0.8848826807954311,
"grad_norm": 0.1883157042438272,
"learning_rate": 1.3188070166311162e-06,
"loss": 0.7755,
"step": 3215
},
{
"epoch": 0.885157916465974,
"grad_norm": 0.18478083174657808,
"learning_rate": 1.3125743454605932e-06,
"loss": 0.7726,
"step": 3216
},
{
"epoch": 0.8854331521365169,
"grad_norm": 0.18882225888077922,
"learning_rate": 1.3063559372555056e-06,
"loss": 0.7568,
"step": 3217
},
{
"epoch": 0.8857083878070598,
"grad_norm": 0.18861334190849974,
"learning_rate": 1.3001517967619704e-06,
"loss": 0.7812,
"step": 3218
},
{
"epoch": 0.8859836234776027,
"grad_norm": 0.18786987256621746,
"learning_rate": 1.293961928715235e-06,
"loss": 0.7429,
"step": 3219
},
{
"epoch": 0.8862588591481456,
"grad_norm": 0.18375418260932208,
"learning_rate": 1.287786337839645e-06,
"loss": 0.7463,
"step": 3220
},
{
"epoch": 0.8865340948186885,
"grad_norm": 0.18021975385807273,
"learning_rate": 1.2816250288486477e-06,
"loss": 0.7551,
"step": 3221
},
{
"epoch": 0.8868093304892314,
"grad_norm": 0.1802494592234665,
"learning_rate": 1.2754780064447947e-06,
"loss": 0.7416,
"step": 3222
},
{
"epoch": 0.8870845661597743,
"grad_norm": 0.1908908424071035,
"learning_rate": 1.2693452753197222e-06,
"loss": 0.7839,
"step": 3223
},
{
"epoch": 0.8873598018303173,
"grad_norm": 0.17959849574191333,
"learning_rate": 1.2632268401541837e-06,
"loss": 0.7536,
"step": 3224
},
{
"epoch": 0.8876350375008601,
"grad_norm": 0.18891118267749493,
"learning_rate": 1.2571227056179924e-06,
"loss": 0.7777,
"step": 3225
},
{
"epoch": 0.887910273171403,
"grad_norm": 0.18665675363449444,
"learning_rate": 1.251032876370062e-06,
"loss": 0.7445,
"step": 3226
},
{
"epoch": 0.8881855088419459,
"grad_norm": 0.1828392690857017,
"learning_rate": 1.244957357058394e-06,
"loss": 0.7591,
"step": 3227
},
{
"epoch": 0.8884607445124888,
"grad_norm": 0.18328334499414642,
"learning_rate": 1.238896152320046e-06,
"loss": 0.7548,
"step": 3228
},
{
"epoch": 0.8887359801830317,
"grad_norm": 0.5178245776751196,
"learning_rate": 1.232849266781173e-06,
"loss": 0.7691,
"step": 3229
},
{
"epoch": 0.8890112158535747,
"grad_norm": 0.1895180941966564,
"learning_rate": 1.22681670505699e-06,
"loss": 0.7907,
"step": 3230
},
{
"epoch": 0.8892864515241176,
"grad_norm": 0.18912205805507937,
"learning_rate": 1.2207984717517785e-06,
"loss": 0.7768,
"step": 3231
},
{
"epoch": 0.8895616871946604,
"grad_norm": 0.1816249035219771,
"learning_rate": 1.2147945714588927e-06,
"loss": 0.7635,
"step": 3232
},
{
"epoch": 0.8898369228652033,
"grad_norm": 0.186735818696158,
"learning_rate": 1.208805008760736e-06,
"loss": 0.7789,
"step": 3233
},
{
"epoch": 0.8901121585357462,
"grad_norm": 0.18086490081164475,
"learning_rate": 1.2028297882287764e-06,
"loss": 0.7433,
"step": 3234
},
{
"epoch": 0.8903873942062891,
"grad_norm": 0.1877576003566219,
"learning_rate": 1.19686891442353e-06,
"loss": 0.7776,
"step": 3235
},
{
"epoch": 0.8906626298768321,
"grad_norm": 0.17979070869113153,
"learning_rate": 1.190922391894569e-06,
"loss": 0.765,
"step": 3236
},
{
"epoch": 0.890937865547375,
"grad_norm": 0.18199895198833935,
"learning_rate": 1.184990225180509e-06,
"loss": 0.7384,
"step": 3237
},
{
"epoch": 0.8912131012179179,
"grad_norm": 0.1853237753597599,
"learning_rate": 1.179072418809004e-06,
"loss": 0.7709,
"step": 3238
},
{
"epoch": 0.8914883368884607,
"grad_norm": 0.18993235985286852,
"learning_rate": 1.1731689772967636e-06,
"loss": 0.7671,
"step": 3239
},
{
"epoch": 0.8917635725590036,
"grad_norm": 0.1801498034640683,
"learning_rate": 1.167279905149512e-06,
"loss": 0.7602,
"step": 3240
},
{
"epoch": 0.8920388082295465,
"grad_norm": 0.17992316087193566,
"learning_rate": 1.1614052068620208e-06,
"loss": 0.7649,
"step": 3241
},
{
"epoch": 0.8923140439000895,
"grad_norm": 0.18311155414803634,
"learning_rate": 1.1555448869180897e-06,
"loss": 0.7397,
"step": 3242
},
{
"epoch": 0.8925892795706324,
"grad_norm": 0.1854289238806252,
"learning_rate": 1.1496989497905342e-06,
"loss": 0.7404,
"step": 3243
},
{
"epoch": 0.8928645152411753,
"grad_norm": 0.18543812283119235,
"learning_rate": 1.1438673999412054e-06,
"loss": 0.7504,
"step": 3244
},
{
"epoch": 0.8931397509117182,
"grad_norm": 0.18522208090112366,
"learning_rate": 1.1380502418209604e-06,
"loss": 0.7775,
"step": 3245
},
{
"epoch": 0.893414986582261,
"grad_norm": 0.1773061329002993,
"learning_rate": 1.132247479869688e-06,
"loss": 0.7415,
"step": 3246
},
{
"epoch": 0.8936902222528039,
"grad_norm": 0.185936720669809,
"learning_rate": 1.1264591185162787e-06,
"loss": 0.767,
"step": 3247
},
{
"epoch": 0.8939654579233469,
"grad_norm": 0.18110163506955634,
"learning_rate": 1.1206851621786275e-06,
"loss": 0.7538,
"step": 3248
},
{
"epoch": 0.8942406935938898,
"grad_norm": 0.18123217858386276,
"learning_rate": 1.114925615263649e-06,
"loss": 0.7726,
"step": 3249
},
{
"epoch": 0.8945159292644327,
"grad_norm": 0.1820051225440009,
"learning_rate": 1.1091804821672448e-06,
"loss": 0.7715,
"step": 3250
},
{
"epoch": 0.8947911649349756,
"grad_norm": 0.1793018545038116,
"learning_rate": 1.1034497672743249e-06,
"loss": 0.7782,
"step": 3251
},
{
"epoch": 0.8950664006055185,
"grad_norm": 0.19374990373635947,
"learning_rate": 1.0977334749587932e-06,
"loss": 0.7709,
"step": 3252
},
{
"epoch": 0.8953416362760613,
"grad_norm": 0.18369560061692597,
"learning_rate": 1.0920316095835437e-06,
"loss": 0.7898,
"step": 3253
},
{
"epoch": 0.8956168719466043,
"grad_norm": 0.18314631607832818,
"learning_rate": 1.0863441755004645e-06,
"loss": 0.7599,
"step": 3254
},
{
"epoch": 0.8958921076171472,
"grad_norm": 0.18303900316550475,
"learning_rate": 1.0806711770504207e-06,
"loss": 0.8034,
"step": 3255
},
{
"epoch": 0.8961673432876901,
"grad_norm": 0.3696098952835843,
"learning_rate": 1.0750126185632626e-06,
"loss": 0.7584,
"step": 3256
},
{
"epoch": 0.896442578958233,
"grad_norm": 0.1811012885139857,
"learning_rate": 1.0693685043578284e-06,
"loss": 0.7614,
"step": 3257
},
{
"epoch": 0.8967178146287759,
"grad_norm": 0.17765567803019613,
"learning_rate": 1.0637388387419146e-06,
"loss": 0.7311,
"step": 3258
},
{
"epoch": 0.8969930502993188,
"grad_norm": 0.18584465651559434,
"learning_rate": 1.058123626012304e-06,
"loss": 0.7791,
"step": 3259
},
{
"epoch": 0.8972682859698617,
"grad_norm": 0.18131319131955945,
"learning_rate": 1.0525228704547464e-06,
"loss": 0.7743,
"step": 3260
},
{
"epoch": 0.8975435216404046,
"grad_norm": 0.18550276417431893,
"learning_rate": 1.0469365763439532e-06,
"loss": 0.7827,
"step": 3261
},
{
"epoch": 0.8978187573109475,
"grad_norm": 0.18588065751800437,
"learning_rate": 1.0413647479435962e-06,
"loss": 0.762,
"step": 3262
},
{
"epoch": 0.8980939929814904,
"grad_norm": 0.1826564939801497,
"learning_rate": 1.0358073895063136e-06,
"loss": 0.7549,
"step": 3263
},
{
"epoch": 0.8983692286520333,
"grad_norm": 0.17734120314937488,
"learning_rate": 1.0302645052736992e-06,
"loss": 0.7505,
"step": 3264
},
{
"epoch": 0.8986444643225762,
"grad_norm": 0.18788877403336884,
"learning_rate": 1.0247360994762888e-06,
"loss": 0.7625,
"step": 3265
},
{
"epoch": 0.8989196999931192,
"grad_norm": 0.1851095058278748,
"learning_rate": 1.0192221763335807e-06,
"loss": 0.7523,
"step": 3266
},
{
"epoch": 0.899194935663662,
"grad_norm": 0.18912561658909227,
"learning_rate": 1.0137227400540128e-06,
"loss": 0.7689,
"step": 3267
},
{
"epoch": 0.8994701713342049,
"grad_norm": 0.18326911598950926,
"learning_rate": 1.0082377948349653e-06,
"loss": 0.7805,
"step": 3268
},
{
"epoch": 0.8997454070047478,
"grad_norm": 0.1824311163067431,
"learning_rate": 1.0027673448627673e-06,
"loss": 0.7798,
"step": 3269
},
{
"epoch": 0.9000206426752907,
"grad_norm": 0.18526199323055903,
"learning_rate": 9.9731139431267e-07,
"loss": 0.7771,
"step": 3270
},
{
"epoch": 0.9002958783458336,
"grad_norm": 0.18144373701305452,
"learning_rate": 9.918699473488714e-07,
"loss": 0.7378,
"step": 3271
},
{
"epoch": 0.9005711140163766,
"grad_norm": 0.18700031340046053,
"learning_rate": 9.864430081244892e-07,
"loss": 0.7962,
"step": 3272
},
{
"epoch": 0.9008463496869195,
"grad_norm": 0.17872148115622588,
"learning_rate": 9.810305807815746e-07,
"loss": 0.7409,
"step": 3273
},
{
"epoch": 0.9011215853574623,
"grad_norm": 0.18592891265931816,
"learning_rate": 9.75632669451101e-07,
"loss": 0.7944,
"step": 3274
},
{
"epoch": 0.9013968210280052,
"grad_norm": 0.1830787386929491,
"learning_rate": 9.702492782529637e-07,
"loss": 0.7658,
"step": 3275
},
{
"epoch": 0.9016720566985481,
"grad_norm": 0.18752019297533465,
"learning_rate": 9.648804112959786e-07,
"loss": 0.7909,
"step": 3276
},
{
"epoch": 0.901947292369091,
"grad_norm": 0.2808642778861853,
"learning_rate": 9.595260726778678e-07,
"loss": 0.7551,
"step": 3277
},
{
"epoch": 0.902222528039634,
"grad_norm": 0.1816910627654414,
"learning_rate": 9.541862664852686e-07,
"loss": 0.7534,
"step": 3278
},
{
"epoch": 0.9024977637101769,
"grad_norm": 0.1851249713793175,
"learning_rate": 9.488609967937323e-07,
"loss": 0.7766,
"step": 3279
},
{
"epoch": 0.9027729993807198,
"grad_norm": 0.18764193289975062,
"learning_rate": 9.435502676677011e-07,
"loss": 0.7863,
"step": 3280
},
{
"epoch": 0.9030482350512626,
"grad_norm": 0.18253717110615045,
"learning_rate": 9.382540831605413e-07,
"loss": 0.7573,
"step": 3281
},
{
"epoch": 0.9033234707218055,
"grad_norm": 0.1840378750178227,
"learning_rate": 9.329724473144974e-07,
"loss": 0.7633,
"step": 3282
},
{
"epoch": 0.9035987063923484,
"grad_norm": 0.1874412097016048,
"learning_rate": 9.277053641607225e-07,
"loss": 0.7579,
"step": 3283
},
{
"epoch": 0.9038739420628914,
"grad_norm": 0.19205818139391206,
"learning_rate": 9.224528377192543e-07,
"loss": 0.7862,
"step": 3284
},
{
"epoch": 0.9041491777334343,
"grad_norm": 0.18176755943939715,
"learning_rate": 9.172148719990237e-07,
"loss": 0.7812,
"step": 3285
},
{
"epoch": 0.9044244134039772,
"grad_norm": 0.18335505958143317,
"learning_rate": 9.119914709978528e-07,
"loss": 0.7919,
"step": 3286
},
{
"epoch": 0.90469964907452,
"grad_norm": 0.2768791477738455,
"learning_rate": 9.067826387024347e-07,
"loss": 0.742,
"step": 3287
},
{
"epoch": 0.9049748847450629,
"grad_norm": 0.18882881787581965,
"learning_rate": 9.015883790883629e-07,
"loss": 0.7872,
"step": 3288
},
{
"epoch": 0.9052501204156058,
"grad_norm": 0.18412990631147902,
"learning_rate": 8.964086961200902e-07,
"loss": 0.7766,
"step": 3289
},
{
"epoch": 0.9055253560861488,
"grad_norm": 0.1794471080088259,
"learning_rate": 8.912435937509501e-07,
"loss": 0.7722,
"step": 3290
},
{
"epoch": 0.9058005917566917,
"grad_norm": 0.18039677462433537,
"learning_rate": 8.860930759231534e-07,
"loss": 0.7598,
"step": 3291
},
{
"epoch": 0.9060758274272346,
"grad_norm": 0.18257445640321843,
"learning_rate": 8.809571465677691e-07,
"loss": 0.7792,
"step": 3292
},
{
"epoch": 0.9063510630977775,
"grad_norm": 0.18575558603530684,
"learning_rate": 8.758358096047414e-07,
"loss": 0.7628,
"step": 3293
},
{
"epoch": 0.9066262987683203,
"grad_norm": 0.318222667109141,
"learning_rate": 8.70729068942866e-07,
"loss": 0.7833,
"step": 3294
},
{
"epoch": 0.9069015344388632,
"grad_norm": 0.18547318358208326,
"learning_rate": 8.656369284798071e-07,
"loss": 0.7788,
"step": 3295
},
{
"epoch": 0.9071767701094062,
"grad_norm": 0.1805376834815158,
"learning_rate": 8.605593921020917e-07,
"loss": 0.7593,
"step": 3296
},
{
"epoch": 0.9074520057799491,
"grad_norm": 0.18492252898820455,
"learning_rate": 8.554964636850815e-07,
"loss": 0.7482,
"step": 3297
},
{
"epoch": 0.907727241450492,
"grad_norm": 0.18150363230715091,
"learning_rate": 8.504481470930037e-07,
"loss": 0.7862,
"step": 3298
},
{
"epoch": 0.9080024771210349,
"grad_norm": 0.17997984413775153,
"learning_rate": 8.454144461789271e-07,
"loss": 0.7625,
"step": 3299
},
{
"epoch": 0.9082777127915778,
"grad_norm": 0.17774947845574218,
"learning_rate": 8.403953647847674e-07,
"loss": 0.7526,
"step": 3300
},
{
"epoch": 0.9085529484621206,
"grad_norm": 0.18690105830088002,
"learning_rate": 8.353909067412824e-07,
"loss": 0.7713,
"step": 3301
},
{
"epoch": 0.9088281841326636,
"grad_norm": 0.1837051954473274,
"learning_rate": 8.304010758680614e-07,
"loss": 0.7865,
"step": 3302
},
{
"epoch": 0.9091034198032065,
"grad_norm": 0.18216246580530657,
"learning_rate": 8.254258759735468e-07,
"loss": 0.7659,
"step": 3303
},
{
"epoch": 0.9093786554737494,
"grad_norm": 0.1829953371094423,
"learning_rate": 8.204653108549965e-07,
"loss": 0.7548,
"step": 3304
},
{
"epoch": 0.9096538911442923,
"grad_norm": 0.17915974817091224,
"learning_rate": 8.155193842985066e-07,
"loss": 0.7875,
"step": 3305
},
{
"epoch": 0.9099291268148352,
"grad_norm": 0.17946220063968904,
"learning_rate": 8.105881000790016e-07,
"loss": 0.7744,
"step": 3306
},
{
"epoch": 0.9102043624853781,
"grad_norm": 0.18736348652154028,
"learning_rate": 8.056714619602246e-07,
"loss": 0.7744,
"step": 3307
},
{
"epoch": 0.910479598155921,
"grad_norm": 0.18232306632741607,
"learning_rate": 8.007694736947491e-07,
"loss": 0.7693,
"step": 3308
},
{
"epoch": 0.9107548338264639,
"grad_norm": 0.18063025095043908,
"learning_rate": 7.958821390239535e-07,
"loss": 0.756,
"step": 3309
},
{
"epoch": 0.9110300694970068,
"grad_norm": 0.1801797098842179,
"learning_rate": 7.910094616780495e-07,
"loss": 0.7509,
"step": 3310
},
{
"epoch": 0.9113053051675497,
"grad_norm": 0.1835824285114398,
"learning_rate": 7.861514453760466e-07,
"loss": 0.7738,
"step": 3311
},
{
"epoch": 0.9115805408380926,
"grad_norm": 0.1813721539661817,
"learning_rate": 7.813080938257722e-07,
"loss": 0.7859,
"step": 3312
},
{
"epoch": 0.9118557765086355,
"grad_norm": 0.1819370221825069,
"learning_rate": 7.764794107238627e-07,
"loss": 0.7584,
"step": 3313
},
{
"epoch": 0.9121310121791785,
"grad_norm": 0.18301458943707027,
"learning_rate": 7.716653997557521e-07,
"loss": 0.7881,
"step": 3314
},
{
"epoch": 0.9124062478497214,
"grad_norm": 0.17943713213846424,
"learning_rate": 7.668660645956794e-07,
"loss": 0.7649,
"step": 3315
},
{
"epoch": 0.9126814835202642,
"grad_norm": 0.1810842977738103,
"learning_rate": 7.62081408906683e-07,
"loss": 0.7749,
"step": 3316
},
{
"epoch": 0.9129567191908071,
"grad_norm": 0.17991730122011682,
"learning_rate": 7.573114363405976e-07,
"loss": 0.7736,
"step": 3317
},
{
"epoch": 0.91323195486135,
"grad_norm": 0.177238683292289,
"learning_rate": 7.52556150538053e-07,
"loss": 0.7663,
"step": 3318
},
{
"epoch": 0.9135071905318929,
"grad_norm": 0.1822944602073359,
"learning_rate": 7.478155551284638e-07,
"loss": 0.7536,
"step": 3319
},
{
"epoch": 0.9137824262024359,
"grad_norm": 0.1814977801758155,
"learning_rate": 7.430896537300381e-07,
"loss": 0.7387,
"step": 3320
},
{
"epoch": 0.9140576618729788,
"grad_norm": 0.17747273869824762,
"learning_rate": 7.383784499497637e-07,
"loss": 0.7648,
"step": 3321
},
{
"epoch": 0.9143328975435216,
"grad_norm": 0.18445974312771046,
"learning_rate": 7.336819473834134e-07,
"loss": 0.8025,
"step": 3322
},
{
"epoch": 0.9146081332140645,
"grad_norm": 0.18042862375122798,
"learning_rate": 7.290001496155418e-07,
"loss": 0.7722,
"step": 3323
},
{
"epoch": 0.9148833688846074,
"grad_norm": 0.1829255169316263,
"learning_rate": 7.243330602194754e-07,
"loss": 0.8095,
"step": 3324
},
{
"epoch": 0.9151586045551503,
"grad_norm": 0.18009375165013491,
"learning_rate": 7.196806827573222e-07,
"loss": 0.7736,
"step": 3325
},
{
"epoch": 0.9154338402256933,
"grad_norm": 0.18169937563497035,
"learning_rate": 7.150430207799486e-07,
"loss": 0.795,
"step": 3326
},
{
"epoch": 0.9157090758962362,
"grad_norm": 0.1788925426445409,
"learning_rate": 7.104200778270032e-07,
"loss": 0.7684,
"step": 3327
},
{
"epoch": 0.9159843115667791,
"grad_norm": 0.18003129703576484,
"learning_rate": 7.058118574268969e-07,
"loss": 0.7671,
"step": 3328
},
{
"epoch": 0.916259547237322,
"grad_norm": 0.18689407052760498,
"learning_rate": 7.012183630967939e-07,
"loss": 0.8137,
"step": 3329
},
{
"epoch": 0.9165347829078648,
"grad_norm": 0.18569610275362458,
"learning_rate": 6.966395983426299e-07,
"loss": 0.7733,
"step": 3330
},
{
"epoch": 0.9168100185784077,
"grad_norm": 0.18134155747735997,
"learning_rate": 6.920755666590961e-07,
"loss": 0.7843,
"step": 3331
},
{
"epoch": 0.9170852542489507,
"grad_norm": 0.17665921722270503,
"learning_rate": 6.875262715296393e-07,
"loss": 0.775,
"step": 3332
},
{
"epoch": 0.9173604899194936,
"grad_norm": 0.25057512052343545,
"learning_rate": 6.829917164264554e-07,
"loss": 0.785,
"step": 3333
},
{
"epoch": 0.9176357255900365,
"grad_norm": 0.18372676596852336,
"learning_rate": 6.784719048104915e-07,
"loss": 0.7837,
"step": 3334
},
{
"epoch": 0.9179109612605794,
"grad_norm": 0.21021110732434006,
"learning_rate": 6.739668401314459e-07,
"loss": 0.7666,
"step": 3335
},
{
"epoch": 0.9181861969311222,
"grad_norm": 0.17999276756879717,
"learning_rate": 6.694765258277524e-07,
"loss": 0.7575,
"step": 3336
},
{
"epoch": 0.9184614326016651,
"grad_norm": 0.18350298484920563,
"learning_rate": 6.650009653265965e-07,
"loss": 0.7385,
"step": 3337
},
{
"epoch": 0.9187366682722081,
"grad_norm": 0.19485152267127173,
"learning_rate": 6.605401620438967e-07,
"loss": 0.7859,
"step": 3338
},
{
"epoch": 0.919011903942751,
"grad_norm": 0.183347840395693,
"learning_rate": 6.560941193843118e-07,
"loss": 0.7595,
"step": 3339
},
{
"epoch": 0.9192871396132939,
"grad_norm": 0.18164410805064746,
"learning_rate": 6.516628407412362e-07,
"loss": 0.7729,
"step": 3340
},
{
"epoch": 0.9195623752838368,
"grad_norm": 0.18128818734487367,
"learning_rate": 6.47246329496789e-07,
"loss": 0.7729,
"step": 3341
},
{
"epoch": 0.9198376109543797,
"grad_norm": 0.1754103639942013,
"learning_rate": 6.428445890218205e-07,
"loss": 0.7674,
"step": 3342
},
{
"epoch": 0.9201128466249225,
"grad_norm": 0.18003330952170885,
"learning_rate": 6.384576226759165e-07,
"loss": 0.7492,
"step": 3343
},
{
"epoch": 0.9203880822954655,
"grad_norm": 0.188265865346989,
"learning_rate": 6.340854338073699e-07,
"loss": 0.8079,
"step": 3344
},
{
"epoch": 0.9206633179660084,
"grad_norm": 0.18198473434318366,
"learning_rate": 6.297280257532112e-07,
"loss": 0.7815,
"step": 3345
},
{
"epoch": 0.9209385536365513,
"grad_norm": 0.1886024420024592,
"learning_rate": 6.25385401839178e-07,
"loss": 0.7756,
"step": 3346
},
{
"epoch": 0.9212137893070942,
"grad_norm": 0.17627773507695044,
"learning_rate": 6.210575653797346e-07,
"loss": 0.7598,
"step": 3347
},
{
"epoch": 0.9214890249776371,
"grad_norm": 0.18682397363854383,
"learning_rate": 6.167445196780475e-07,
"loss": 0.7988,
"step": 3348
},
{
"epoch": 0.92176426064818,
"grad_norm": 0.1814496775862832,
"learning_rate": 6.124462680260035e-07,
"loss": 0.7625,
"step": 3349
},
{
"epoch": 0.922039496318723,
"grad_norm": 0.18931687425075752,
"learning_rate": 6.081628137041917e-07,
"loss": 0.7709,
"step": 3350
},
{
"epoch": 0.9223147319892658,
"grad_norm": 0.18573460812678885,
"learning_rate": 6.038941599819104e-07,
"loss": 0.7779,
"step": 3351
},
{
"epoch": 0.9225899676598087,
"grad_norm": 0.18600778689760136,
"learning_rate": 5.996403101171622e-07,
"loss": 0.7779,
"step": 3352
},
{
"epoch": 0.9228652033303516,
"grad_norm": 0.1794970277809988,
"learning_rate": 5.954012673566479e-07,
"loss": 0.772,
"step": 3353
},
{
"epoch": 0.9231404390008945,
"grad_norm": 0.1776580578157053,
"learning_rate": 5.911770349357704e-07,
"loss": 0.7636,
"step": 3354
},
{
"epoch": 0.9234156746714374,
"grad_norm": 0.17259111731943394,
"learning_rate": 5.869676160786308e-07,
"loss": 0.7531,
"step": 3355
},
{
"epoch": 0.9236909103419804,
"grad_norm": 0.18573617522316438,
"learning_rate": 5.827730139980125e-07,
"loss": 0.7519,
"step": 3356
},
{
"epoch": 0.9239661460125232,
"grad_norm": 0.17880114121294402,
"learning_rate": 5.785932318954035e-07,
"loss": 0.781,
"step": 3357
},
{
"epoch": 0.9242413816830661,
"grad_norm": 0.18487770952312754,
"learning_rate": 5.744282729609696e-07,
"loss": 0.7633,
"step": 3358
},
{
"epoch": 0.924516617353609,
"grad_norm": 0.18177462817697282,
"learning_rate": 5.702781403735746e-07,
"loss": 0.7517,
"step": 3359
},
{
"epoch": 0.9247918530241519,
"grad_norm": 0.29856715115902766,
"learning_rate": 5.66142837300756e-07,
"loss": 0.7806,
"step": 3360
},
{
"epoch": 0.9250670886946948,
"grad_norm": 0.18204168450450275,
"learning_rate": 5.620223668987379e-07,
"loss": 0.7817,
"step": 3361
},
{
"epoch": 0.9253423243652378,
"grad_norm": 0.1780772942381337,
"learning_rate": 5.579167323124268e-07,
"loss": 0.761,
"step": 3362
},
{
"epoch": 0.9256175600357807,
"grad_norm": 0.17719504717407475,
"learning_rate": 5.53825936675394e-07,
"loss": 0.7488,
"step": 3363
},
{
"epoch": 0.9258927957063235,
"grad_norm": 0.18664310097507678,
"learning_rate": 5.497499831098974e-07,
"loss": 0.7939,
"step": 3364
},
{
"epoch": 0.9261680313768664,
"grad_norm": 0.17977919885884985,
"learning_rate": 5.456888747268641e-07,
"loss": 0.7688,
"step": 3365
},
{
"epoch": 0.9264432670474093,
"grad_norm": 0.1773780216528953,
"learning_rate": 5.416426146258835e-07,
"loss": 0.7589,
"step": 3366
},
{
"epoch": 0.9267185027179522,
"grad_norm": 0.18386694009127807,
"learning_rate": 5.376112058952232e-07,
"loss": 0.7802,
"step": 3367
},
{
"epoch": 0.9269937383884952,
"grad_norm": 0.18115229126900081,
"learning_rate": 5.33594651611804e-07,
"loss": 0.7505,
"step": 3368
},
{
"epoch": 0.9272689740590381,
"grad_norm": 0.1817925842703226,
"learning_rate": 5.295929548412227e-07,
"loss": 0.7558,
"step": 3369
},
{
"epoch": 0.927544209729581,
"grad_norm": 0.18434080805609537,
"learning_rate": 5.256061186377226e-07,
"loss": 0.7528,
"step": 3370
},
{
"epoch": 0.9278194454001238,
"grad_norm": 0.18799234590140312,
"learning_rate": 5.216341460442143e-07,
"loss": 0.7779,
"step": 3371
},
{
"epoch": 0.9280946810706667,
"grad_norm": 0.18458407979936434,
"learning_rate": 5.176770400922614e-07,
"loss": 0.8223,
"step": 3372
},
{
"epoch": 0.9283699167412096,
"grad_norm": 0.1753178844170801,
"learning_rate": 5.137348038020751e-07,
"loss": 0.7469,
"step": 3373
},
{
"epoch": 0.9286451524117526,
"grad_norm": 0.1778575071911813,
"learning_rate": 5.098074401825282e-07,
"loss": 0.7538,
"step": 3374
},
{
"epoch": 0.9289203880822955,
"grad_norm": 0.1783933538909113,
"learning_rate": 5.05894952231134e-07,
"loss": 0.7656,
"step": 3375
},
{
"epoch": 0.9291956237528384,
"grad_norm": 0.18309823615905257,
"learning_rate": 5.019973429340552e-07,
"loss": 0.7774,
"step": 3376
},
{
"epoch": 0.9294708594233813,
"grad_norm": 0.18051321250726957,
"learning_rate": 4.981146152661009e-07,
"loss": 0.7804,
"step": 3377
},
{
"epoch": 0.9297460950939241,
"grad_norm": 0.18438418164768278,
"learning_rate": 4.942467721907118e-07,
"loss": 0.789,
"step": 3378
},
{
"epoch": 0.930021330764467,
"grad_norm": 0.17827807824858685,
"learning_rate": 4.903938166599797e-07,
"loss": 0.7528,
"step": 3379
},
{
"epoch": 0.93029656643501,
"grad_norm": 0.18112310348622002,
"learning_rate": 4.865557516146258e-07,
"loss": 0.7756,
"step": 3380
},
{
"epoch": 0.9305718021055529,
"grad_norm": 0.1813875849515388,
"learning_rate": 4.827325799840155e-07,
"loss": 0.7704,
"step": 3381
},
{
"epoch": 0.9308470377760958,
"grad_norm": 0.182498966296108,
"learning_rate": 4.78924304686137e-07,
"loss": 0.8009,
"step": 3382
},
{
"epoch": 0.9311222734466387,
"grad_norm": 0.18239337308281747,
"learning_rate": 4.75130928627614e-07,
"loss": 0.7639,
"step": 3383
},
{
"epoch": 0.9313975091171816,
"grad_norm": 0.18190491248200163,
"learning_rate": 4.713524547036996e-07,
"loss": 0.7566,
"step": 3384
},
{
"epoch": 0.9316727447877244,
"grad_norm": 0.1822250097338557,
"learning_rate": 4.675888857982669e-07,
"loss": 0.8163,
"step": 3385
},
{
"epoch": 0.9319479804582674,
"grad_norm": 0.18217006472188224,
"learning_rate": 4.638402247838203e-07,
"loss": 0.7822,
"step": 3386
},
{
"epoch": 0.9322232161288103,
"grad_norm": 0.1781627244840082,
"learning_rate": 4.6010647452148005e-07,
"loss": 0.7686,
"step": 3387
},
{
"epoch": 0.9324984517993532,
"grad_norm": 0.17740412794334748,
"learning_rate": 4.5638763786099324e-07,
"loss": 0.7596,
"step": 3388
},
{
"epoch": 0.9327736874698961,
"grad_norm": 0.18587976864248756,
"learning_rate": 4.526837176407162e-07,
"loss": 0.7775,
"step": 3389
},
{
"epoch": 0.933048923140439,
"grad_norm": 0.18231904518557465,
"learning_rate": 4.4899471668762517e-07,
"loss": 0.7714,
"step": 3390
},
{
"epoch": 0.9333241588109819,
"grad_norm": 0.177209197535535,
"learning_rate": 4.4532063781730585e-07,
"loss": 0.7742,
"step": 3391
},
{
"epoch": 0.9335993944815248,
"grad_norm": 0.17835140414163642,
"learning_rate": 4.416614838339639e-07,
"loss": 0.7596,
"step": 3392
},
{
"epoch": 0.9338746301520677,
"grad_norm": 0.17697842072296835,
"learning_rate": 4.380172575303987e-07,
"loss": 0.7368,
"step": 3393
},
{
"epoch": 0.9341498658226106,
"grad_norm": 0.17759407218544426,
"learning_rate": 4.3438796168802753e-07,
"loss": 0.7529,
"step": 3394
},
{
"epoch": 0.9344251014931535,
"grad_norm": 0.1837684811218352,
"learning_rate": 4.307735990768702e-07,
"loss": 0.7512,
"step": 3395
},
{
"epoch": 0.9347003371636964,
"grad_norm": 0.1825328000991451,
"learning_rate": 4.2717417245555113e-07,
"loss": 0.7759,
"step": 3396
},
{
"epoch": 0.9349755728342393,
"grad_norm": 0.178967684743946,
"learning_rate": 4.2358968457128615e-07,
"loss": 0.7585,
"step": 3397
},
{
"epoch": 0.9352508085047823,
"grad_norm": 0.17499958571737417,
"learning_rate": 4.200201381598956e-07,
"loss": 0.7743,
"step": 3398
},
{
"epoch": 0.9355260441753251,
"grad_norm": 0.18041431537799826,
"learning_rate": 4.164655359458003e-07,
"loss": 0.7535,
"step": 3399
},
{
"epoch": 0.935801279845868,
"grad_norm": 0.2035188034133509,
"learning_rate": 4.1292588064200334e-07,
"loss": 0.775,
"step": 3400
},
{
"epoch": 0.9360765155164109,
"grad_norm": 0.18313380673794988,
"learning_rate": 4.094011749501103e-07,
"loss": 0.7803,
"step": 3401
},
{
"epoch": 0.9363517511869538,
"grad_norm": 0.21531022816264084,
"learning_rate": 4.0589142156031156e-07,
"loss": 0.7567,
"step": 3402
},
{
"epoch": 0.9366269868574967,
"grad_norm": 0.18203357333827486,
"learning_rate": 4.023966231513887e-07,
"loss": 0.7652,
"step": 3403
},
{
"epoch": 0.9369022225280397,
"grad_norm": 0.18230096273909116,
"learning_rate": 3.9891678239070586e-07,
"loss": 0.7942,
"step": 3404
},
{
"epoch": 0.9371774581985826,
"grad_norm": 0.17834518000836627,
"learning_rate": 3.9545190193420955e-07,
"loss": 0.774,
"step": 3405
},
{
"epoch": 0.9374526938691254,
"grad_norm": 0.17894154877984061,
"learning_rate": 3.920019844264356e-07,
"loss": 0.7573,
"step": 3406
},
{
"epoch": 0.9377279295396683,
"grad_norm": 0.17798035810326152,
"learning_rate": 3.8856703250048866e-07,
"loss": 0.7611,
"step": 3407
},
{
"epoch": 0.9380031652102112,
"grad_norm": 0.18057571880520498,
"learning_rate": 3.8514704877805844e-07,
"loss": 0.7588,
"step": 3408
},
{
"epoch": 0.9382784008807541,
"grad_norm": 0.18886857810380955,
"learning_rate": 3.817420358694102e-07,
"loss": 0.7936,
"step": 3409
},
{
"epoch": 0.9385536365512971,
"grad_norm": 0.18383807064836205,
"learning_rate": 3.783519963733806e-07,
"loss": 0.7722,
"step": 3410
},
{
"epoch": 0.93882887222184,
"grad_norm": 0.2625029611934594,
"learning_rate": 3.7497693287738e-07,
"loss": 0.7902,
"step": 3411
},
{
"epoch": 0.9391041078923829,
"grad_norm": 0.17861620442865095,
"learning_rate": 3.716168479573834e-07,
"loss": 0.7437,
"step": 3412
},
{
"epoch": 0.9393793435629257,
"grad_norm": 0.2251790915406425,
"learning_rate": 3.6827174417794153e-07,
"loss": 0.7849,
"step": 3413
},
{
"epoch": 0.9396545792334686,
"grad_norm": 0.17782609807406816,
"learning_rate": 3.649416240921677e-07,
"loss": 0.7689,
"step": 3414
},
{
"epoch": 0.9399298149040115,
"grad_norm": 0.17468421444719337,
"learning_rate": 3.6162649024173327e-07,
"loss": 0.7336,
"step": 3415
},
{
"epoch": 0.9402050505745545,
"grad_norm": 0.2308708162147,
"learning_rate": 3.583263451568808e-07,
"loss": 0.7678,
"step": 3416
},
{
"epoch": 0.9404802862450974,
"grad_norm": 0.17777308245293016,
"learning_rate": 3.550411913564067e-07,
"loss": 0.7612,
"step": 3417
},
{
"epoch": 0.9407555219156403,
"grad_norm": 0.18466314423449567,
"learning_rate": 3.517710313476652e-07,
"loss": 0.7925,
"step": 3418
},
{
"epoch": 0.9410307575861832,
"grad_norm": 0.1798245361167039,
"learning_rate": 3.485158676265754e-07,
"loss": 0.7947,
"step": 3419
},
{
"epoch": 0.941305993256726,
"grad_norm": 0.1767865918196957,
"learning_rate": 3.452757026775988e-07,
"loss": 0.7784,
"step": 3420
},
{
"epoch": 0.9415812289272689,
"grad_norm": 0.17620272589442937,
"learning_rate": 3.4205053897375497e-07,
"loss": 0.7361,
"step": 3421
},
{
"epoch": 0.9418564645978119,
"grad_norm": 0.1801820070268112,
"learning_rate": 3.3884037897661483e-07,
"loss": 0.7692,
"step": 3422
},
{
"epoch": 0.9421317002683548,
"grad_norm": 0.1799634706851007,
"learning_rate": 3.3564522513629407e-07,
"loss": 0.7474,
"step": 3423
},
{
"epoch": 0.9424069359388977,
"grad_norm": 0.17840757969221127,
"learning_rate": 3.324650798914597e-07,
"loss": 0.7877,
"step": 3424
},
{
"epoch": 0.9426821716094406,
"grad_norm": 0.18213285386900555,
"learning_rate": 3.2929994566932134e-07,
"loss": 0.7877,
"step": 3425
},
{
"epoch": 0.9429574072799835,
"grad_norm": 0.1829158920280716,
"learning_rate": 3.261498248856332e-07,
"loss": 0.7691,
"step": 3426
},
{
"epoch": 0.9432326429505263,
"grad_norm": 0.2714194726349643,
"learning_rate": 3.2301471994468536e-07,
"loss": 0.786,
"step": 3427
},
{
"epoch": 0.9435078786210693,
"grad_norm": 0.17962525664904788,
"learning_rate": 3.198946332393127e-07,
"loss": 0.7743,
"step": 3428
},
{
"epoch": 0.9437831142916122,
"grad_norm": 0.18276403812114503,
"learning_rate": 3.167895671508903e-07,
"loss": 0.7794,
"step": 3429
},
{
"epoch": 0.9440583499621551,
"grad_norm": 0.17930935781084684,
"learning_rate": 3.136995240493157e-07,
"loss": 0.7577,
"step": 3430
},
{
"epoch": 0.944333585632698,
"grad_norm": 0.18165700525680606,
"learning_rate": 3.10624506293038e-07,
"loss": 0.7765,
"step": 3431
},
{
"epoch": 0.9446088213032409,
"grad_norm": 0.17966295155502715,
"learning_rate": 3.0756451622902416e-07,
"loss": 0.7672,
"step": 3432
},
{
"epoch": 0.9448840569737837,
"grad_norm": 0.17842326449913629,
"learning_rate": 3.0451955619278164e-07,
"loss": 0.7696,
"step": 3433
},
{
"epoch": 0.9451592926443267,
"grad_norm": 0.173822163120527,
"learning_rate": 3.014896285083357e-07,
"loss": 0.7393,
"step": 3434
},
{
"epoch": 0.9454345283148696,
"grad_norm": 0.1794448576189847,
"learning_rate": 2.984747354882456e-07,
"loss": 0.7588,
"step": 3435
},
{
"epoch": 0.9457097639854125,
"grad_norm": 0.18133635826751426,
"learning_rate": 2.954748794335993e-07,
"loss": 0.772,
"step": 3436
},
{
"epoch": 0.9459849996559554,
"grad_norm": 0.17803707386819312,
"learning_rate": 2.924900626339966e-07,
"loss": 0.7848,
"step": 3437
},
{
"epoch": 0.9462602353264983,
"grad_norm": 0.1784366672781347,
"learning_rate": 2.895202873675684e-07,
"loss": 0.7754,
"step": 3438
},
{
"epoch": 0.9465354709970412,
"grad_norm": 0.18170351773250304,
"learning_rate": 2.865655559009617e-07,
"loss": 0.7824,
"step": 3439
},
{
"epoch": 0.9468107066675842,
"grad_norm": 0.17819881192043055,
"learning_rate": 2.836258704893391e-07,
"loss": 0.7476,
"step": 3440
},
{
"epoch": 0.947085942338127,
"grad_norm": 0.4997461614693514,
"learning_rate": 2.807012333763881e-07,
"loss": 0.8007,
"step": 3441
},
{
"epoch": 0.9473611780086699,
"grad_norm": 0.1769510006550083,
"learning_rate": 2.7779164679429873e-07,
"loss": 0.7757,
"step": 3442
},
{
"epoch": 0.9476364136792128,
"grad_norm": 0.34088402821254643,
"learning_rate": 2.7489711296378343e-07,
"loss": 0.7744,
"step": 3443
},
{
"epoch": 0.9479116493497557,
"grad_norm": 0.17867595346551646,
"learning_rate": 2.7201763409405726e-07,
"loss": 0.7518,
"step": 3444
},
{
"epoch": 0.9481868850202986,
"grad_norm": 0.18289283467859183,
"learning_rate": 2.6915321238285773e-07,
"loss": 0.7586,
"step": 3445
},
{
"epoch": 0.9484621206908416,
"grad_norm": 0.18000393757159489,
"learning_rate": 2.663038500164161e-07,
"loss": 0.7891,
"step": 3446
},
{
"epoch": 0.9487373563613845,
"grad_norm": 0.17811652924158544,
"learning_rate": 2.634695491694772e-07,
"loss": 0.7802,
"step": 3447
},
{
"epoch": 0.9490125920319273,
"grad_norm": 0.1776492457184233,
"learning_rate": 2.606503120052906e-07,
"loss": 0.7593,
"step": 3448
},
{
"epoch": 0.9492878277024702,
"grad_norm": 0.18636513662190413,
"learning_rate": 2.578461406756061e-07,
"loss": 0.7872,
"step": 3449
},
{
"epoch": 0.9495630633730131,
"grad_norm": 0.17671353163445164,
"learning_rate": 2.55057037320674e-07,
"loss": 0.7709,
"step": 3450
},
{
"epoch": 0.9498382990435561,
"grad_norm": 0.1802777302014596,
"learning_rate": 2.52283004069247e-07,
"loss": 0.7574,
"step": 3451
},
{
"epoch": 0.950113534714099,
"grad_norm": 0.17787202683873812,
"learning_rate": 2.495240430385737e-07,
"loss": 0.7656,
"step": 3452
},
{
"epoch": 0.9503887703846419,
"grad_norm": 0.17901545341346076,
"learning_rate": 2.467801563344052e-07,
"loss": 0.7717,
"step": 3453
},
{
"epoch": 0.9506640060551848,
"grad_norm": 0.17597541158563862,
"learning_rate": 2.4405134605097304e-07,
"loss": 0.7845,
"step": 3454
},
{
"epoch": 0.9509392417257276,
"grad_norm": 0.1797347844276362,
"learning_rate": 2.4133761427101776e-07,
"loss": 0.7682,
"step": 3455
},
{
"epoch": 0.9512144773962705,
"grad_norm": 0.17984792534794747,
"learning_rate": 2.386389630657604e-07,
"loss": 0.7789,
"step": 3456
},
{
"epoch": 0.9514897130668135,
"grad_norm": 0.18737390267781007,
"learning_rate": 2.3595539449491778e-07,
"loss": 0.7722,
"step": 3457
},
{
"epoch": 0.9517649487373564,
"grad_norm": 0.17653028561183995,
"learning_rate": 2.332869106066915e-07,
"loss": 0.7541,
"step": 3458
},
{
"epoch": 0.9520401844078993,
"grad_norm": 0.1803865835967897,
"learning_rate": 2.3063351343777241e-07,
"loss": 0.7788,
"step": 3459
},
{
"epoch": 0.9523154200784422,
"grad_norm": 0.17817562075030466,
"learning_rate": 2.2799520501333606e-07,
"loss": 0.7723,
"step": 3460
},
{
"epoch": 0.952590655748985,
"grad_norm": 0.17930027186013844,
"learning_rate": 2.253719873470406e-07,
"loss": 0.7823,
"step": 3461
},
{
"epoch": 0.9528658914195279,
"grad_norm": 0.17834616787129215,
"learning_rate": 2.2276386244102888e-07,
"loss": 0.7683,
"step": 3462
},
{
"epoch": 0.9531411270900709,
"grad_norm": 0.17608618430327727,
"learning_rate": 2.2017083228592195e-07,
"loss": 0.768,
"step": 3463
},
{
"epoch": 0.9534163627606138,
"grad_norm": 0.1818134089880582,
"learning_rate": 2.1759289886081892e-07,
"loss": 0.7905,
"step": 3464
},
{
"epoch": 0.9536915984311567,
"grad_norm": 0.1801450580653961,
"learning_rate": 2.1503006413330142e-07,
"loss": 0.7651,
"step": 3465
},
{
"epoch": 0.9539668341016996,
"grad_norm": 0.3924989381263502,
"learning_rate": 2.124823300594181e-07,
"loss": 0.7706,
"step": 3466
},
{
"epoch": 0.9542420697722425,
"grad_norm": 0.17912647516824165,
"learning_rate": 2.0994969858370463e-07,
"loss": 0.7426,
"step": 3467
},
{
"epoch": 0.9545173054427853,
"grad_norm": 0.1745049111177619,
"learning_rate": 2.074321716391614e-07,
"loss": 0.7413,
"step": 3468
},
{
"epoch": 0.9547925411133283,
"grad_norm": 0.18118406596132217,
"learning_rate": 2.049297511472581e-07,
"loss": 0.7878,
"step": 3469
},
{
"epoch": 0.9550677767838712,
"grad_norm": 0.1803089579448522,
"learning_rate": 2.024424390179447e-07,
"loss": 0.7702,
"step": 3470
},
{
"epoch": 0.9553430124544141,
"grad_norm": 0.177746866281357,
"learning_rate": 1.999702371496315e-07,
"loss": 0.7828,
"step": 3471
},
{
"epoch": 0.955618248124957,
"grad_norm": 0.17813758130928734,
"learning_rate": 1.975131474291958e-07,
"loss": 0.7779,
"step": 3472
},
{
"epoch": 0.9558934837954999,
"grad_norm": 0.1767147746377242,
"learning_rate": 1.9507117173198864e-07,
"loss": 0.7582,
"step": 3473
},
{
"epoch": 0.9561687194660428,
"grad_norm": 0.25456672245448575,
"learning_rate": 1.9264431192181466e-07,
"loss": 0.7773,
"step": 3474
},
{
"epoch": 0.9564439551365858,
"grad_norm": 0.1789887411926834,
"learning_rate": 1.9023256985095217e-07,
"loss": 0.7461,
"step": 3475
},
{
"epoch": 0.9567191908071286,
"grad_norm": 0.18163546011577508,
"learning_rate": 1.8783594736013322e-07,
"loss": 0.7768,
"step": 3476
},
{
"epoch": 0.9569944264776715,
"grad_norm": 0.2702597219081077,
"learning_rate": 1.8545444627855236e-07,
"loss": 0.7632,
"step": 3477
},
{
"epoch": 0.9572696621482144,
"grad_norm": 0.2997512889466131,
"learning_rate": 1.830880684238645e-07,
"loss": 0.7465,
"step": 3478
},
{
"epoch": 0.9575448978187573,
"grad_norm": 0.2711901813937954,
"learning_rate": 1.8073681560218047e-07,
"loss": 0.77,
"step": 3479
},
{
"epoch": 0.9578201334893002,
"grad_norm": 0.18179524960436003,
"learning_rate": 1.78400689608067e-07,
"loss": 0.767,
"step": 3480
},
{
"epoch": 0.9580953691598432,
"grad_norm": 0.1787474371453518,
"learning_rate": 1.7607969222454446e-07,
"loss": 0.737,
"step": 3481
},
{
"epoch": 0.958370604830386,
"grad_norm": 0.17646985645336227,
"learning_rate": 1.7377382522309138e-07,
"loss": 0.7675,
"step": 3482
},
{
"epoch": 0.9586458405009289,
"grad_norm": 0.1772746613899236,
"learning_rate": 1.714830903636311e-07,
"loss": 0.7884,
"step": 3483
},
{
"epoch": 0.9589210761714718,
"grad_norm": 0.17532594677389737,
"learning_rate": 1.6920748939454058e-07,
"loss": 0.7789,
"step": 3484
},
{
"epoch": 0.9591963118420147,
"grad_norm": 0.1725185422582348,
"learning_rate": 1.669470240526505e-07,
"loss": 0.7441,
"step": 3485
},
{
"epoch": 0.9594715475125576,
"grad_norm": 0.1832051702657459,
"learning_rate": 1.6470169606323193e-07,
"loss": 0.7707,
"step": 3486
},
{
"epoch": 0.9597467831831006,
"grad_norm": 0.18308635605218143,
"learning_rate": 1.6247150714000514e-07,
"loss": 0.7884,
"step": 3487
},
{
"epoch": 0.9600220188536435,
"grad_norm": 0.1780212599011205,
"learning_rate": 1.6025645898513963e-07,
"loss": 0.7769,
"step": 3488
},
{
"epoch": 0.9602972545241864,
"grad_norm": 0.17867323891786976,
"learning_rate": 1.5805655328924308e-07,
"loss": 0.7613,
"step": 3489
},
{
"epoch": 0.9605724901947292,
"grad_norm": 0.1817119219295843,
"learning_rate": 1.5587179173137234e-07,
"loss": 0.7804,
"step": 3490
},
{
"epoch": 0.9608477258652721,
"grad_norm": 0.18151539362345998,
"learning_rate": 1.5370217597901805e-07,
"loss": 0.7813,
"step": 3491
},
{
"epoch": 0.961122961535815,
"grad_norm": 0.1810355088739828,
"learning_rate": 1.5154770768811556e-07,
"loss": 0.7612,
"step": 3492
},
{
"epoch": 0.961398197206358,
"grad_norm": 0.18718055180850396,
"learning_rate": 1.4940838850304063e-07,
"loss": 0.7784,
"step": 3493
},
{
"epoch": 0.9616734328769009,
"grad_norm": 0.17661636434505176,
"learning_rate": 1.4728422005660048e-07,
"loss": 0.7577,
"step": 3494
},
{
"epoch": 0.9619486685474438,
"grad_norm": 0.17718307154213916,
"learning_rate": 1.4517520397004492e-07,
"loss": 0.7757,
"step": 3495
},
{
"epoch": 0.9622239042179866,
"grad_norm": 0.1778218613606195,
"learning_rate": 1.4308134185305522e-07,
"loss": 0.7676,
"step": 3496
},
{
"epoch": 0.9624991398885295,
"grad_norm": 0.18337886620272192,
"learning_rate": 1.4100263530375081e-07,
"loss": 0.7889,
"step": 3497
},
{
"epoch": 0.9627743755590724,
"grad_norm": 0.18216361767310676,
"learning_rate": 1.3893908590867811e-07,
"loss": 0.7822,
"step": 3498
},
{
"epoch": 0.9630496112296154,
"grad_norm": 0.1798065585331738,
"learning_rate": 1.3689069524281728e-07,
"loss": 0.7625,
"step": 3499
},
{
"epoch": 0.9633248469001583,
"grad_norm": 0.17296336534665763,
"learning_rate": 1.3485746486958217e-07,
"loss": 0.7485,
"step": 3500
},
{
"epoch": 0.9636000825707012,
"grad_norm": 0.17646303216234493,
"learning_rate": 1.3283939634081143e-07,
"loss": 0.7751,
"step": 3501
},
{
"epoch": 0.9638753182412441,
"grad_norm": 0.18295128543702405,
"learning_rate": 1.3083649119677078e-07,
"loss": 0.7844,
"step": 3502
},
{
"epoch": 0.964150553911787,
"grad_norm": 0.18151638905946132,
"learning_rate": 1.2884875096615734e-07,
"loss": 0.7903,
"step": 3503
},
{
"epoch": 0.9644257895823298,
"grad_norm": 0.18140173990630296,
"learning_rate": 1.2687617716609092e-07,
"loss": 0.775,
"step": 3504
},
{
"epoch": 0.9647010252528728,
"grad_norm": 0.17404082168705917,
"learning_rate": 1.2491877130211606e-07,
"loss": 0.7329,
"step": 3505
},
{
"epoch": 0.9649762609234157,
"grad_norm": 0.2732951700976934,
"learning_rate": 1.2297653486819994e-07,
"loss": 0.7756,
"step": 3506
},
{
"epoch": 0.9652514965939586,
"grad_norm": 0.17879034445366826,
"learning_rate": 1.2104946934673235e-07,
"loss": 0.7718,
"step": 3507
},
{
"epoch": 0.9655267322645015,
"grad_norm": 0.17793241131942597,
"learning_rate": 1.1913757620852562e-07,
"loss": 0.7533,
"step": 3508
},
{
"epoch": 0.9658019679350444,
"grad_norm": 0.18249359033078905,
"learning_rate": 1.1724085691280806e-07,
"loss": 0.7843,
"step": 3509
},
{
"epoch": 0.9660772036055872,
"grad_norm": 0.17887088090366485,
"learning_rate": 1.1535931290723057e-07,
"loss": 0.7836,
"step": 3510
},
{
"epoch": 0.9663524392761302,
"grad_norm": 0.17649617617448546,
"learning_rate": 1.1349294562786217e-07,
"loss": 0.7488,
"step": 3511
},
{
"epoch": 0.9666276749466731,
"grad_norm": 0.17699676701245734,
"learning_rate": 1.1164175649918341e-07,
"loss": 0.7714,
"step": 3512
},
{
"epoch": 0.966902910617216,
"grad_norm": 0.1802940285321163,
"learning_rate": 1.0980574693409295e-07,
"loss": 0.7283,
"step": 3513
},
{
"epoch": 0.9671781462877589,
"grad_norm": 0.1810028532172541,
"learning_rate": 1.0798491833390767e-07,
"loss": 0.7777,
"step": 3514
},
{
"epoch": 0.9674533819583018,
"grad_norm": 0.18222466272505522,
"learning_rate": 1.0617927208835143e-07,
"loss": 0.7573,
"step": 3515
},
{
"epoch": 0.9677286176288447,
"grad_norm": 0.17622547495446858,
"learning_rate": 1.0438880957556408e-07,
"loss": 0.7675,
"step": 3516
},
{
"epoch": 0.9680038532993877,
"grad_norm": 0.17456426789450077,
"learning_rate": 1.0261353216209691e-07,
"loss": 0.7666,
"step": 3517
},
{
"epoch": 0.9682790889699305,
"grad_norm": 0.6731092038117731,
"learning_rate": 1.008534412029083e-07,
"loss": 0.8104,
"step": 3518
},
{
"epoch": 0.9685543246404734,
"grad_norm": 0.18001065930138233,
"learning_rate": 9.910853804137033e-08,
"loss": 0.7596,
"step": 3519
},
{
"epoch": 0.9688295603110163,
"grad_norm": 0.17976827811068685,
"learning_rate": 9.737882400925768e-08,
"loss": 0.7746,
"step": 3520
},
{
"epoch": 0.9691047959815592,
"grad_norm": 0.17377817363019726,
"learning_rate": 9.566430042675657e-08,
"loss": 0.741,
"step": 3521
},
{
"epoch": 0.9693800316521021,
"grad_norm": 0.1790577389207181,
"learning_rate": 9.396496860245797e-08,
"loss": 0.7662,
"step": 3522
},
{
"epoch": 0.9696552673226451,
"grad_norm": 0.18103421902489195,
"learning_rate": 9.228082983335329e-08,
"loss": 0.7685,
"step": 3523
},
{
"epoch": 0.969930502993188,
"grad_norm": 0.17549445706841693,
"learning_rate": 9.061188540484989e-08,
"loss": 0.7515,
"step": 3524
},
{
"epoch": 0.9702057386637308,
"grad_norm": 0.18297146952209412,
"learning_rate": 8.895813659074437e-08,
"loss": 0.8175,
"step": 3525
},
{
"epoch": 0.9704809743342737,
"grad_norm": 0.17487822209097756,
"learning_rate": 8.731958465324486e-08,
"loss": 0.755,
"step": 3526
},
{
"epoch": 0.9707562100048166,
"grad_norm": 0.1782621251590202,
"learning_rate": 8.569623084295541e-08,
"loss": 0.7545,
"step": 3527
},
{
"epoch": 0.9710314456753595,
"grad_norm": 0.17901393418751282,
"learning_rate": 8.408807639888494e-08,
"loss": 0.7656,
"step": 3528
},
{
"epoch": 0.9713066813459025,
"grad_norm": 0.18147513421709688,
"learning_rate": 8.249512254843827e-08,
"loss": 0.7861,
"step": 3529
},
{
"epoch": 0.9715819170164454,
"grad_norm": 0.17957197077444595,
"learning_rate": 8.091737050741621e-08,
"loss": 0.7782,
"step": 3530
},
{
"epoch": 0.9718571526869882,
"grad_norm": 0.17672929151204175,
"learning_rate": 7.93548214800266e-08,
"loss": 0.7597,
"step": 3531
},
{
"epoch": 0.9721323883575311,
"grad_norm": 0.4349985677603449,
"learning_rate": 7.78074766588599e-08,
"loss": 0.7466,
"step": 3532
},
{
"epoch": 0.972407624028074,
"grad_norm": 0.17633210395280272,
"learning_rate": 7.627533722491364e-08,
"loss": 0.7701,
"step": 3533
},
{
"epoch": 0.9726828596986169,
"grad_norm": 0.17672996696366425,
"learning_rate": 7.475840434757686e-08,
"loss": 0.7604,
"step": 3534
},
{
"epoch": 0.9729580953691599,
"grad_norm": 0.17797749847629954,
"learning_rate": 7.325667918462787e-08,
"loss": 0.7608,
"step": 3535
},
{
"epoch": 0.9732333310397028,
"grad_norm": 0.18212680472703546,
"learning_rate": 7.177016288224315e-08,
"loss": 0.7865,
"step": 3536
},
{
"epoch": 0.9735085667102457,
"grad_norm": 0.1797596592006918,
"learning_rate": 7.02988565749907e-08,
"loss": 0.7647,
"step": 3537
},
{
"epoch": 0.9737838023807885,
"grad_norm": 0.33927059362669343,
"learning_rate": 6.884276138582557e-08,
"loss": 0.8035,
"step": 3538
},
{
"epoch": 0.9740590380513314,
"grad_norm": 0.17928149815434763,
"learning_rate": 6.74018784260988e-08,
"loss": 0.77,
"step": 3539
},
{
"epoch": 0.9743342737218743,
"grad_norm": 0.18172923679065384,
"learning_rate": 6.597620879554623e-08,
"loss": 0.787,
"step": 3540
},
{
"epoch": 0.9746095093924173,
"grad_norm": 0.17697969705309036,
"learning_rate": 6.4565753582293e-08,
"loss": 0.7558,
"step": 3541
},
{
"epoch": 0.9748847450629602,
"grad_norm": 0.17581623954789408,
"learning_rate": 6.317051386285356e-08,
"loss": 0.7744,
"step": 3542
},
{
"epoch": 0.9751599807335031,
"grad_norm": 0.18288959489294285,
"learning_rate": 6.179049070213161e-08,
"loss": 0.7987,
"step": 3543
},
{
"epoch": 0.975435216404046,
"grad_norm": 0.17634802116204348,
"learning_rate": 6.04256851534113e-08,
"loss": 0.766,
"step": 3544
},
{
"epoch": 0.9757104520745888,
"grad_norm": 0.17673599042413227,
"learning_rate": 5.90760982583638e-08,
"loss": 0.7655,
"step": 3545
},
{
"epoch": 0.9759856877451317,
"grad_norm": 0.1820519779333532,
"learning_rate": 5.774173104705183e-08,
"loss": 0.7916,
"step": 3546
},
{
"epoch": 0.9762609234156747,
"grad_norm": 0.175861718108478,
"learning_rate": 5.642258453790961e-08,
"loss": 0.7453,
"step": 3547
},
{
"epoch": 0.9765361590862176,
"grad_norm": 0.17486506031221477,
"learning_rate": 5.511865973776287e-08,
"loss": 0.7406,
"step": 3548
},
{
"epoch": 0.9768113947567605,
"grad_norm": 0.1780123599448829,
"learning_rate": 5.382995764181775e-08,
"loss": 0.7565,
"step": 3549
},
{
"epoch": 0.9770866304273034,
"grad_norm": 0.17599109803309804,
"learning_rate": 5.2556479233663026e-08,
"loss": 0.7611,
"step": 3550
},
{
"epoch": 0.9773618660978463,
"grad_norm": 0.17667839153089893,
"learning_rate": 5.129822548526342e-08,
"loss": 0.7545,
"step": 3551
},
{
"epoch": 0.9776371017683891,
"grad_norm": 0.17280983449231263,
"learning_rate": 5.005519735696851e-08,
"loss": 0.7479,
"step": 3552
},
{
"epoch": 0.9779123374389321,
"grad_norm": 0.176523232961457,
"learning_rate": 4.882739579750606e-08,
"loss": 0.7672,
"step": 3553
},
{
"epoch": 0.978187573109475,
"grad_norm": 0.18141590619449227,
"learning_rate": 4.761482174398202e-08,
"loss": 0.7813,
"step": 3554
},
{
"epoch": 0.9784628087800179,
"grad_norm": 0.17584721367548525,
"learning_rate": 4.641747612187608e-08,
"loss": 0.7656,
"step": 3555
},
{
"epoch": 0.9787380444505608,
"grad_norm": 0.17578752767897704,
"learning_rate": 4.523535984505278e-08,
"loss": 0.7694,
"step": 3556
},
{
"epoch": 0.9790132801211037,
"grad_norm": 0.17692227791635268,
"learning_rate": 4.406847381574819e-08,
"loss": 0.7762,
"step": 3557
},
{
"epoch": 0.9792885157916466,
"grad_norm": 0.17881547873142664,
"learning_rate": 4.291681892457211e-08,
"loss": 0.7646,
"step": 3558
},
{
"epoch": 0.9795637514621895,
"grad_norm": 0.17752552599136098,
"learning_rate": 4.178039605051698e-08,
"loss": 0.7883,
"step": 3559
},
{
"epoch": 0.9798389871327324,
"grad_norm": 0.17557339109858847,
"learning_rate": 4.065920606093787e-08,
"loss": 0.7636,
"step": 3560
},
{
"epoch": 0.9801142228032753,
"grad_norm": 0.17574774842950883,
"learning_rate": 3.9553249811576936e-08,
"loss": 0.7775,
"step": 3561
},
{
"epoch": 0.9803894584738182,
"grad_norm": 0.1772844194397064,
"learning_rate": 3.846252814654117e-08,
"loss": 0.7571,
"step": 3562
},
{
"epoch": 0.9806646941443611,
"grad_norm": 0.17729271362679233,
"learning_rate": 3.738704189830689e-08,
"loss": 0.7679,
"step": 3563
},
{
"epoch": 0.980939929814904,
"grad_norm": 0.17909654849099269,
"learning_rate": 3.632679188773303e-08,
"loss": 0.7684,
"step": 3564
},
{
"epoch": 0.981215165485447,
"grad_norm": 0.1833268540809398,
"learning_rate": 3.528177892403894e-08,
"loss": 0.8016,
"step": 3565
},
{
"epoch": 0.9814904011559898,
"grad_norm": 0.178510730996882,
"learning_rate": 3.425200380481997e-08,
"loss": 0.7641,
"step": 3566
},
{
"epoch": 0.9817656368265327,
"grad_norm": 0.1820364095197272,
"learning_rate": 3.3237467316042937e-08,
"loss": 0.7578,
"step": 3567
},
{
"epoch": 0.9820408724970756,
"grad_norm": 0.18138425675425987,
"learning_rate": 3.2238170232037346e-08,
"loss": 0.7832,
"step": 3568
},
{
"epoch": 0.9823161081676185,
"grad_norm": 0.17304657211950922,
"learning_rate": 3.125411331550643e-08,
"loss": 0.7526,
"step": 3569
},
{
"epoch": 0.9825913438381614,
"grad_norm": 0.17567294905646494,
"learning_rate": 3.028529731752272e-08,
"loss": 0.754,
"step": 3570
},
{
"epoch": 0.9828665795087044,
"grad_norm": 0.17489419630595576,
"learning_rate": 2.9331722977523625e-08,
"loss": 0.7554,
"step": 3571
},
{
"epoch": 0.9831418151792473,
"grad_norm": 0.17745684383307575,
"learning_rate": 2.83933910233114e-08,
"loss": 0.759,
"step": 3572
},
{
"epoch": 0.9834170508497901,
"grad_norm": 0.17206565807239377,
"learning_rate": 2.7470302171057616e-08,
"loss": 0.7384,
"step": 3573
},
{
"epoch": 0.983692286520333,
"grad_norm": 0.17641064197005765,
"learning_rate": 2.6562457125300922e-08,
"loss": 0.7616,
"step": 3574
},
{
"epoch": 0.9839675221908759,
"grad_norm": 0.177807913042134,
"learning_rate": 2.566985657894483e-08,
"loss": 0.7653,
"step": 3575
},
{
"epoch": 0.9842427578614188,
"grad_norm": 0.1790946374687885,
"learning_rate": 2.4792501213253272e-08,
"loss": 0.7548,
"step": 3576
},
{
"epoch": 0.9845179935319618,
"grad_norm": 0.1780544315960312,
"learning_rate": 2.393039169785949e-08,
"loss": 0.7847,
"step": 3577
},
{
"epoch": 0.9847932292025047,
"grad_norm": 0.17834656751545258,
"learning_rate": 2.308352869075936e-08,
"loss": 0.7577,
"step": 3578
},
{
"epoch": 0.9850684648730476,
"grad_norm": 0.1790462243194874,
"learning_rate": 2.2251912838311408e-08,
"loss": 0.7698,
"step": 3579
},
{
"epoch": 0.9853437005435904,
"grad_norm": 0.18434661086620202,
"learning_rate": 2.1435544775234574e-08,
"loss": 0.772,
"step": 3580
},
{
"epoch": 0.9856189362141333,
"grad_norm": 0.1764678047623265,
"learning_rate": 2.0634425124614886e-08,
"loss": 0.7903,
"step": 3581
},
{
"epoch": 0.9858941718846762,
"grad_norm": 0.1906681654294173,
"learning_rate": 1.98485544978988e-08,
"loss": 0.7519,
"step": 3582
},
{
"epoch": 0.9861694075552192,
"grad_norm": 0.1744191667294603,
"learning_rate": 1.9077933494888733e-08,
"loss": 0.7709,
"step": 3583
},
{
"epoch": 0.9864446432257621,
"grad_norm": 0.1760264371211054,
"learning_rate": 1.8322562703758652e-08,
"loss": 0.7524,
"step": 3584
},
{
"epoch": 0.986719878896305,
"grad_norm": 0.1804041052742331,
"learning_rate": 1.758244270103182e-08,
"loss": 0.7555,
"step": 3585
},
{
"epoch": 0.9869951145668479,
"grad_norm": 0.17425152073973565,
"learning_rate": 1.68575740515986e-08,
"loss": 0.7536,
"step": 3586
},
{
"epoch": 0.9872703502373907,
"grad_norm": 0.18022990927311366,
"learning_rate": 1.614795730870311e-08,
"loss": 0.7884,
"step": 3587
},
{
"epoch": 0.9875455859079336,
"grad_norm": 0.1692149686164318,
"learning_rate": 1.545359301395877e-08,
"loss": 0.7498,
"step": 3588
},
{
"epoch": 0.9878208215784766,
"grad_norm": 0.1787623881772803,
"learning_rate": 1.4774481697326093e-08,
"loss": 0.7754,
"step": 3589
},
{
"epoch": 0.9880960572490195,
"grad_norm": 0.17906661051327927,
"learning_rate": 1.411062387713269e-08,
"loss": 0.7665,
"step": 3590
},
{
"epoch": 0.9883712929195624,
"grad_norm": 0.18088041897608118,
"learning_rate": 1.3462020060057701e-08,
"loss": 0.7732,
"step": 3591
},
{
"epoch": 0.9886465285901053,
"grad_norm": 0.5259410023169809,
"learning_rate": 1.2828670741140693e-08,
"loss": 0.7695,
"step": 3592
},
{
"epoch": 0.9889217642606482,
"grad_norm": 0.17698042182388626,
"learning_rate": 1.2210576403779428e-08,
"loss": 0.7936,
"step": 3593
},
{
"epoch": 0.989196999931191,
"grad_norm": 0.1736793906227136,
"learning_rate": 1.1607737519727658e-08,
"loss": 0.7464,
"step": 3594
},
{
"epoch": 0.989472235601734,
"grad_norm": 0.17547954770090057,
"learning_rate": 1.1020154549095108e-08,
"loss": 0.7372,
"step": 3595
},
{
"epoch": 0.9897474712722769,
"grad_norm": 0.1793344090138556,
"learning_rate": 1.0447827940345268e-08,
"loss": 0.7577,
"step": 3596
},
{
"epoch": 0.9900227069428198,
"grad_norm": 0.17520452136961606,
"learning_rate": 9.890758130304268e-09,
"loss": 0.7566,
"step": 3597
},
{
"epoch": 0.9902979426133627,
"grad_norm": 0.18144790080034406,
"learning_rate": 9.348945544147558e-09,
"loss": 0.7877,
"step": 3598
},
{
"epoch": 0.9905731782839056,
"grad_norm": 0.181632924815335,
"learning_rate": 8.822390595404352e-09,
"loss": 0.7832,
"step": 3599
},
{
"epoch": 0.9908484139544484,
"grad_norm": 0.17877247539950303,
"learning_rate": 8.311093685966498e-09,
"loss": 0.7779,
"step": 3600
},
{
"epoch": 0.9911236496249914,
"grad_norm": 0.17896780550493493,
"learning_rate": 7.815055206072952e-09,
"loss": 0.7878,
"step": 3601
},
{
"epoch": 0.9913988852955343,
"grad_norm": 0.17442436243710627,
"learning_rate": 7.3342755343208674e-09,
"loss": 0.7653,
"step": 3602
},
{
"epoch": 0.9916741209660772,
"grad_norm": 0.17777321615401723,
"learning_rate": 6.868755037658937e-09,
"loss": 0.7767,
"step": 3603
},
{
"epoch": 0.9919493566366201,
"grad_norm": 0.17503196499366,
"learning_rate": 6.418494071389614e-09,
"loss": 0.7746,
"step": 3604
},
{
"epoch": 0.992224592307163,
"grad_norm": 0.17898836174974275,
"learning_rate": 5.983492979171335e-09,
"loss": 0.7788,
"step": 3605
},
{
"epoch": 0.9924998279777059,
"grad_norm": 0.18116771238038662,
"learning_rate": 5.563752093011854e-09,
"loss": 0.7624,
"step": 3606
},
{
"epoch": 0.9927750636482489,
"grad_norm": 0.17698711904140782,
"learning_rate": 5.159271733274907e-09,
"loss": 0.786,
"step": 3607
},
{
"epoch": 0.9930502993187917,
"grad_norm": 0.17765865208863188,
"learning_rate": 4.770052208673548e-09,
"loss": 0.7738,
"step": 3608
},
{
"epoch": 0.9933255349893346,
"grad_norm": 0.17657751738507468,
"learning_rate": 4.396093816279035e-09,
"loss": 0.7707,
"step": 3609
},
{
"epoch": 0.9936007706598775,
"grad_norm": 0.17454547847072047,
"learning_rate": 4.037396841507501e-09,
"loss": 0.7575,
"step": 3610
},
{
"epoch": 0.9938760063304204,
"grad_norm": 0.17714898000454996,
"learning_rate": 3.693961558131065e-09,
"loss": 0.7731,
"step": 3611
},
{
"epoch": 0.9941512420009633,
"grad_norm": 0.17622566153292368,
"learning_rate": 3.3657882282733812e-09,
"loss": 0.7573,
"step": 3612
},
{
"epoch": 0.9944264776715063,
"grad_norm": 0.17709442785780347,
"learning_rate": 3.052877102409646e-09,
"loss": 0.7718,
"step": 3613
},
{
"epoch": 0.9947017133420492,
"grad_norm": 0.1791136045353406,
"learning_rate": 2.755228419364375e-09,
"loss": 0.8098,
"step": 3614
},
{
"epoch": 0.994976949012592,
"grad_norm": 0.18038236847042155,
"learning_rate": 2.472842406315845e-09,
"loss": 0.7667,
"step": 3615
},
{
"epoch": 0.9952521846831349,
"grad_norm": 0.17588115954026287,
"learning_rate": 2.205719278789431e-09,
"loss": 0.747,
"step": 3616
},
{
"epoch": 0.9955274203536778,
"grad_norm": 0.17653906931286567,
"learning_rate": 1.9538592406664892e-09,
"loss": 0.7664,
"step": 3617
},
{
"epoch": 0.9958026560242207,
"grad_norm": 0.5789252284335775,
"learning_rate": 1.7172624841754748e-09,
"loss": 0.7929,
"step": 3618
},
{
"epoch": 0.9960778916947637,
"grad_norm": 0.1763033312587939,
"learning_rate": 1.4959291898963836e-09,
"loss": 0.745,
"step": 3619
},
{
"epoch": 0.9963531273653066,
"grad_norm": 0.17840002630427262,
"learning_rate": 1.2898595267585301e-09,
"loss": 0.7756,
"step": 3620
},
{
"epoch": 0.9966283630358495,
"grad_norm": 0.1767728346272913,
"learning_rate": 1.0990536520427696e-09,
"loss": 0.7875,
"step": 3621
},
{
"epoch": 0.9969035987063923,
"grad_norm": 0.1698190573293454,
"learning_rate": 9.235117113792768e-10,
"loss": 0.7447,
"step": 3622
},
{
"epoch": 0.9971788343769352,
"grad_norm": 0.17788327054454803,
"learning_rate": 7.632338387497662e-10,
"loss": 0.7855,
"step": 3623
},
{
"epoch": 0.9974540700474781,
"grad_norm": 0.17666564044937066,
"learning_rate": 6.182201564830514e-10,
"loss": 0.7556,
"step": 3624
},
{
"epoch": 0.9977293057180211,
"grad_norm": 0.1769705344619,
"learning_rate": 4.884707752594864e-10,
"loss": 0.7673,
"step": 3625
},
{
"epoch": 0.998004541388564,
"grad_norm": 0.20211506220176842,
"learning_rate": 3.739857941087444e-10,
"loss": 0.7606,
"step": 3626
},
{
"epoch": 0.9982797770591069,
"grad_norm": 0.17917807655749818,
"learning_rate": 2.747653004098183e-10,
"loss": 0.7782,
"step": 3627
},
{
"epoch": 0.9985550127296497,
"grad_norm": 0.18205947891349653,
"learning_rate": 1.9080936989324117e-10,
"loss": 0.7419,
"step": 3628
},
{
"epoch": 0.9988302484001926,
"grad_norm": 0.17364769553726467,
"learning_rate": 1.221180666344246e-10,
"loss": 0.7561,
"step": 3629
},
{
"epoch": 0.9991054840707355,
"grad_norm": 0.16925193408970945,
"learning_rate": 6.869144306476117e-11,
"loss": 0.7399,
"step": 3630
},
{
"epoch": 0.9993807197412785,
"grad_norm": 0.21789214395467008,
"learning_rate": 3.0529539960522104e-11,
"loss": 0.7566,
"step": 3631
},
{
"epoch": 0.9996559554118214,
"grad_norm": 0.1819224016534888,
"learning_rate": 7.632386447298245e-12,
"loss": 0.7667,
"step": 3632
},
{
"epoch": 0.9999311910823643,
"grad_norm": 0.17616693190335297,
"learning_rate": 0.0,
"loss": 0.7571,
"step": 3633
},
{
"epoch": 0.9999311910823643,
"step": 3633,
"total_flos": 3475585917517824.0,
"train_loss": 0.8154109569577649,
"train_runtime": 36551.3329,
"train_samples_per_second": 57.255,
"train_steps_per_second": 0.099
}
],
"logging_steps": 1,
"max_steps": 3633,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3475585917517824.0,
"train_batch_size": 9,
"trial_name": null,
"trial_params": null
}