Files
Technology-llama3_1_8B_inst…/trainer_state.json
ModelHub XC 14077ced84 初始化项目,由ModelHub XC社区提供模型
Model: BAAI/Technology-llama3_1_8B_instruct
Source: Original Platform
2026-05-18 18:25:18 +08:00

28560 lines
690 KiB
JSON

{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500,
"global_step": 4066,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.00024594195769798326,
"grad_norm": 14.433284759521484,
"learning_rate": 9.832841691248772e-09,
"loss": 1.5028,
"step": 1
},
{
"epoch": 0.0004918839153959665,
"grad_norm": 13.41608715057373,
"learning_rate": 1.9665683382497544e-08,
"loss": 1.5608,
"step": 2
},
{
"epoch": 0.0007378258730939498,
"grad_norm": 15.324729919433594,
"learning_rate": 2.9498525073746314e-08,
"loss": 1.5647,
"step": 3
},
{
"epoch": 0.000983767830791933,
"grad_norm": 11.440258979797363,
"learning_rate": 3.933136676499509e-08,
"loss": 1.4107,
"step": 4
},
{
"epoch": 0.0012297097884899164,
"grad_norm": 14.446358680725098,
"learning_rate": 4.916420845624386e-08,
"loss": 1.3553,
"step": 5
},
{
"epoch": 0.0014756517461878996,
"grad_norm": 11.821146011352539,
"learning_rate": 5.899705014749263e-08,
"loss": 1.3202,
"step": 6
},
{
"epoch": 0.001721593703885883,
"grad_norm": 12.85606861114502,
"learning_rate": 6.88298918387414e-08,
"loss": 1.4712,
"step": 7
},
{
"epoch": 0.001967535661583866,
"grad_norm": 11.844099998474121,
"learning_rate": 7.866273352999017e-08,
"loss": 1.464,
"step": 8
},
{
"epoch": 0.0022134776192818495,
"grad_norm": 12.044977188110352,
"learning_rate": 8.849557522123894e-08,
"loss": 1.4117,
"step": 9
},
{
"epoch": 0.002459419576979833,
"grad_norm": 11.964421272277832,
"learning_rate": 9.832841691248772e-08,
"loss": 1.4596,
"step": 10
},
{
"epoch": 0.0027053615346778162,
"grad_norm": 15.165493965148926,
"learning_rate": 1.0816125860373649e-07,
"loss": 1.3928,
"step": 11
},
{
"epoch": 0.002951303492375799,
"grad_norm": 12.916013717651367,
"learning_rate": 1.1799410029498526e-07,
"loss": 1.6199,
"step": 12
},
{
"epoch": 0.0031972454500737825,
"grad_norm": 14.007545471191406,
"learning_rate": 1.2782694198623402e-07,
"loss": 1.4725,
"step": 13
},
{
"epoch": 0.003443187407771766,
"grad_norm": 14.50717830657959,
"learning_rate": 1.376597836774828e-07,
"loss": 1.4548,
"step": 14
},
{
"epoch": 0.0036891293654697493,
"grad_norm": 12.757678985595703,
"learning_rate": 1.4749262536873157e-07,
"loss": 1.4159,
"step": 15
},
{
"epoch": 0.003935071323167732,
"grad_norm": 13.092570304870605,
"learning_rate": 1.5732546705998035e-07,
"loss": 1.4571,
"step": 16
},
{
"epoch": 0.004181013280865716,
"grad_norm": 13.468591690063477,
"learning_rate": 1.671583087512291e-07,
"loss": 1.4265,
"step": 17
},
{
"epoch": 0.004426955238563699,
"grad_norm": 13.153406143188477,
"learning_rate": 1.7699115044247788e-07,
"loss": 1.4998,
"step": 18
},
{
"epoch": 0.004672897196261682,
"grad_norm": 12.058491706848145,
"learning_rate": 1.8682399213372668e-07,
"loss": 1.3731,
"step": 19
},
{
"epoch": 0.004918839153959666,
"grad_norm": 11.386009216308594,
"learning_rate": 1.9665683382497543e-07,
"loss": 1.4874,
"step": 20
},
{
"epoch": 0.005164781111657649,
"grad_norm": 12.583024024963379,
"learning_rate": 2.064896755162242e-07,
"loss": 1.5002,
"step": 21
},
{
"epoch": 0.0054107230693556324,
"grad_norm": 11.748034477233887,
"learning_rate": 2.1632251720747298e-07,
"loss": 1.5496,
"step": 22
},
{
"epoch": 0.005656665027053615,
"grad_norm": 12.467665672302246,
"learning_rate": 2.2615535889872173e-07,
"loss": 1.479,
"step": 23
},
{
"epoch": 0.005902606984751598,
"grad_norm": 10.625871658325195,
"learning_rate": 2.359882005899705e-07,
"loss": 1.289,
"step": 24
},
{
"epoch": 0.006148548942449582,
"grad_norm": 11.998695373535156,
"learning_rate": 2.458210422812193e-07,
"loss": 1.5457,
"step": 25
},
{
"epoch": 0.006394490900147565,
"grad_norm": 14.956937789916992,
"learning_rate": 2.5565388397246804e-07,
"loss": 1.5454,
"step": 26
},
{
"epoch": 0.006640432857845549,
"grad_norm": 12.903891563415527,
"learning_rate": 2.654867256637168e-07,
"loss": 1.5618,
"step": 27
},
{
"epoch": 0.006886374815543532,
"grad_norm": 14.466906547546387,
"learning_rate": 2.753195673549656e-07,
"loss": 1.5442,
"step": 28
},
{
"epoch": 0.007132316773241515,
"grad_norm": 11.871532440185547,
"learning_rate": 2.8515240904621437e-07,
"loss": 1.4023,
"step": 29
},
{
"epoch": 0.0073782587309394985,
"grad_norm": 12.505096435546875,
"learning_rate": 2.9498525073746315e-07,
"loss": 1.4324,
"step": 30
},
{
"epoch": 0.0076242006886374815,
"grad_norm": 10.027387619018555,
"learning_rate": 3.048180924287119e-07,
"loss": 1.3198,
"step": 31
},
{
"epoch": 0.007870142646335464,
"grad_norm": 10.92414379119873,
"learning_rate": 3.146509341199607e-07,
"loss": 1.5022,
"step": 32
},
{
"epoch": 0.008116084604033447,
"grad_norm": 8.906771659851074,
"learning_rate": 3.244837758112095e-07,
"loss": 1.3973,
"step": 33
},
{
"epoch": 0.008362026561731432,
"grad_norm": 11.143491744995117,
"learning_rate": 3.343166175024582e-07,
"loss": 1.4179,
"step": 34
},
{
"epoch": 0.008607968519429415,
"grad_norm": 9.495902061462402,
"learning_rate": 3.4414945919370703e-07,
"loss": 1.3934,
"step": 35
},
{
"epoch": 0.008853910477127398,
"grad_norm": 8.214719772338867,
"learning_rate": 3.5398230088495575e-07,
"loss": 1.403,
"step": 36
},
{
"epoch": 0.00909985243482538,
"grad_norm": 8.861774444580078,
"learning_rate": 3.6381514257620453e-07,
"loss": 1.2501,
"step": 37
},
{
"epoch": 0.009345794392523364,
"grad_norm": 9.159226417541504,
"learning_rate": 3.7364798426745336e-07,
"loss": 1.4271,
"step": 38
},
{
"epoch": 0.009591736350221348,
"grad_norm": 8.119258880615234,
"learning_rate": 3.834808259587021e-07,
"loss": 1.4046,
"step": 39
},
{
"epoch": 0.009837678307919331,
"grad_norm": 9.15845012664795,
"learning_rate": 3.9331366764995086e-07,
"loss": 1.5162,
"step": 40
},
{
"epoch": 0.010083620265617314,
"grad_norm": 7.5952911376953125,
"learning_rate": 4.0314650934119964e-07,
"loss": 1.3695,
"step": 41
},
{
"epoch": 0.010329562223315297,
"grad_norm": 8.039081573486328,
"learning_rate": 4.129793510324484e-07,
"loss": 1.4608,
"step": 42
},
{
"epoch": 0.01057550418101328,
"grad_norm": 6.648051738739014,
"learning_rate": 4.2281219272369714e-07,
"loss": 1.4511,
"step": 43
},
{
"epoch": 0.010821446138711265,
"grad_norm": 6.49833869934082,
"learning_rate": 4.3264503441494597e-07,
"loss": 1.3147,
"step": 44
},
{
"epoch": 0.011067388096409248,
"grad_norm": 6.514254093170166,
"learning_rate": 4.4247787610619474e-07,
"loss": 1.4768,
"step": 45
},
{
"epoch": 0.01131333005410723,
"grad_norm": 6.159523010253906,
"learning_rate": 4.5231071779744347e-07,
"loss": 1.3734,
"step": 46
},
{
"epoch": 0.011559272011805214,
"grad_norm": 7.21243953704834,
"learning_rate": 4.621435594886923e-07,
"loss": 1.3744,
"step": 47
},
{
"epoch": 0.011805213969503197,
"grad_norm": 6.938387393951416,
"learning_rate": 4.71976401179941e-07,
"loss": 1.2383,
"step": 48
},
{
"epoch": 0.012051155927201181,
"grad_norm": 8.395055770874023,
"learning_rate": 4.818092428711897e-07,
"loss": 1.301,
"step": 49
},
{
"epoch": 0.012297097884899164,
"grad_norm": 6.657160758972168,
"learning_rate": 4.916420845624386e-07,
"loss": 1.3058,
"step": 50
},
{
"epoch": 0.012543039842597147,
"grad_norm": 7.0281267166137695,
"learning_rate": 5.014749262536873e-07,
"loss": 1.2348,
"step": 51
},
{
"epoch": 0.01278898180029513,
"grad_norm": 6.514767646789551,
"learning_rate": 5.113077679449361e-07,
"loss": 1.3201,
"step": 52
},
{
"epoch": 0.013034923757993113,
"grad_norm": 6.319578647613525,
"learning_rate": 5.21140609636185e-07,
"loss": 1.2899,
"step": 53
},
{
"epoch": 0.013280865715691098,
"grad_norm": 6.3555097579956055,
"learning_rate": 5.309734513274336e-07,
"loss": 1.3916,
"step": 54
},
{
"epoch": 0.01352680767338908,
"grad_norm": 6.054043292999268,
"learning_rate": 5.408062930186824e-07,
"loss": 1.3909,
"step": 55
},
{
"epoch": 0.013772749631087064,
"grad_norm": 6.042862892150879,
"learning_rate": 5.506391347099312e-07,
"loss": 1.2999,
"step": 56
},
{
"epoch": 0.014018691588785047,
"grad_norm": 5.838632583618164,
"learning_rate": 5.6047197640118e-07,
"loss": 1.3907,
"step": 57
},
{
"epoch": 0.01426463354648303,
"grad_norm": 5.693017482757568,
"learning_rate": 5.703048180924287e-07,
"loss": 1.256,
"step": 58
},
{
"epoch": 0.014510575504181012,
"grad_norm": 6.072226524353027,
"learning_rate": 5.801376597836775e-07,
"loss": 1.3222,
"step": 59
},
{
"epoch": 0.014756517461878997,
"grad_norm": 6.117721080780029,
"learning_rate": 5.899705014749263e-07,
"loss": 1.3016,
"step": 60
},
{
"epoch": 0.01500245941957698,
"grad_norm": 6.407538890838623,
"learning_rate": 5.998033431661751e-07,
"loss": 1.372,
"step": 61
},
{
"epoch": 0.015248401377274963,
"grad_norm": 5.884194850921631,
"learning_rate": 6.096361848574238e-07,
"loss": 1.4017,
"step": 62
},
{
"epoch": 0.015494343334972946,
"grad_norm": 6.451099872589111,
"learning_rate": 6.194690265486726e-07,
"loss": 1.2657,
"step": 63
},
{
"epoch": 0.01574028529267093,
"grad_norm": 5.339378833770752,
"learning_rate": 6.293018682399214e-07,
"loss": 1.2438,
"step": 64
},
{
"epoch": 0.015986227250368912,
"grad_norm": 5.808299541473389,
"learning_rate": 6.391347099311701e-07,
"loss": 1.261,
"step": 65
},
{
"epoch": 0.016232169208066895,
"grad_norm": 6.163237571716309,
"learning_rate": 6.48967551622419e-07,
"loss": 1.467,
"step": 66
},
{
"epoch": 0.01647811116576488,
"grad_norm": 6.109335899353027,
"learning_rate": 6.588003933136677e-07,
"loss": 1.2095,
"step": 67
},
{
"epoch": 0.016724053123462864,
"grad_norm": 5.411988735198975,
"learning_rate": 6.686332350049164e-07,
"loss": 1.3204,
"step": 68
},
{
"epoch": 0.016969995081160847,
"grad_norm": 5.25512170791626,
"learning_rate": 6.784660766961653e-07,
"loss": 1.2994,
"step": 69
},
{
"epoch": 0.01721593703885883,
"grad_norm": 5.323281764984131,
"learning_rate": 6.882989183874141e-07,
"loss": 1.2787,
"step": 70
},
{
"epoch": 0.017461878996556813,
"grad_norm": 5.55164909362793,
"learning_rate": 6.981317600786627e-07,
"loss": 1.3012,
"step": 71
},
{
"epoch": 0.017707820954254796,
"grad_norm": 5.422927379608154,
"learning_rate": 7.079646017699115e-07,
"loss": 1.2101,
"step": 72
},
{
"epoch": 0.01795376291195278,
"grad_norm": 5.335574150085449,
"learning_rate": 7.177974434611604e-07,
"loss": 1.1741,
"step": 73
},
{
"epoch": 0.01819970486965076,
"grad_norm": 4.826264381408691,
"learning_rate": 7.276302851524091e-07,
"loss": 1.2027,
"step": 74
},
{
"epoch": 0.018445646827348745,
"grad_norm": 5.808948993682861,
"learning_rate": 7.374631268436578e-07,
"loss": 1.292,
"step": 75
},
{
"epoch": 0.018691588785046728,
"grad_norm": 5.526470184326172,
"learning_rate": 7.472959685349067e-07,
"loss": 1.3581,
"step": 76
},
{
"epoch": 0.01893753074274471,
"grad_norm": 5.506979465484619,
"learning_rate": 7.571288102261554e-07,
"loss": 1.2474,
"step": 77
},
{
"epoch": 0.019183472700442697,
"grad_norm": 5.56695556640625,
"learning_rate": 7.669616519174042e-07,
"loss": 1.2866,
"step": 78
},
{
"epoch": 0.01942941465814068,
"grad_norm": 5.4988274574279785,
"learning_rate": 7.76794493608653e-07,
"loss": 1.2945,
"step": 79
},
{
"epoch": 0.019675356615838663,
"grad_norm": 5.414526462554932,
"learning_rate": 7.866273352999017e-07,
"loss": 1.2743,
"step": 80
},
{
"epoch": 0.019921298573536646,
"grad_norm": 5.389462471008301,
"learning_rate": 7.964601769911505e-07,
"loss": 1.2575,
"step": 81
},
{
"epoch": 0.02016724053123463,
"grad_norm": 4.9912309646606445,
"learning_rate": 8.062930186823993e-07,
"loss": 1.222,
"step": 82
},
{
"epoch": 0.02041318248893261,
"grad_norm": 4.9814653396606445,
"learning_rate": 8.161258603736479e-07,
"loss": 1.0013,
"step": 83
},
{
"epoch": 0.020659124446630595,
"grad_norm": 5.490965366363525,
"learning_rate": 8.259587020648968e-07,
"loss": 1.3476,
"step": 84
},
{
"epoch": 0.020905066404328577,
"grad_norm": 5.229772090911865,
"learning_rate": 8.357915437561456e-07,
"loss": 1.4266,
"step": 85
},
{
"epoch": 0.02115100836202656,
"grad_norm": 5.638108253479004,
"learning_rate": 8.456243854473943e-07,
"loss": 1.4258,
"step": 86
},
{
"epoch": 0.021396950319724543,
"grad_norm": 5.234287261962891,
"learning_rate": 8.554572271386432e-07,
"loss": 1.3553,
"step": 87
},
{
"epoch": 0.02164289227742253,
"grad_norm": 5.648282051086426,
"learning_rate": 8.652900688298919e-07,
"loss": 1.3078,
"step": 88
},
{
"epoch": 0.021888834235120513,
"grad_norm": 5.207667827606201,
"learning_rate": 8.751229105211406e-07,
"loss": 1.2688,
"step": 89
},
{
"epoch": 0.022134776192818496,
"grad_norm": 5.576632976531982,
"learning_rate": 8.849557522123895e-07,
"loss": 1.3586,
"step": 90
},
{
"epoch": 0.02238071815051648,
"grad_norm": 5.111441135406494,
"learning_rate": 8.947885939036383e-07,
"loss": 1.3158,
"step": 91
},
{
"epoch": 0.02262666010821446,
"grad_norm": 5.325538635253906,
"learning_rate": 9.046214355948869e-07,
"loss": 1.2532,
"step": 92
},
{
"epoch": 0.022872602065912444,
"grad_norm": 5.437929153442383,
"learning_rate": 9.144542772861357e-07,
"loss": 1.3503,
"step": 93
},
{
"epoch": 0.023118544023610427,
"grad_norm": 5.773566246032715,
"learning_rate": 9.242871189773846e-07,
"loss": 1.3744,
"step": 94
},
{
"epoch": 0.02336448598130841,
"grad_norm": 5.309929847717285,
"learning_rate": 9.341199606686333e-07,
"loss": 1.2577,
"step": 95
},
{
"epoch": 0.023610427939006393,
"grad_norm": 5.189741611480713,
"learning_rate": 9.43952802359882e-07,
"loss": 1.2713,
"step": 96
},
{
"epoch": 0.023856369896704376,
"grad_norm": 5.126186847686768,
"learning_rate": 9.537856440511308e-07,
"loss": 1.1413,
"step": 97
},
{
"epoch": 0.024102311854402363,
"grad_norm": 6.7692718505859375,
"learning_rate": 9.636184857423795e-07,
"loss": 1.2954,
"step": 98
},
{
"epoch": 0.024348253812100346,
"grad_norm": 5.555848121643066,
"learning_rate": 9.734513274336284e-07,
"loss": 1.3818,
"step": 99
},
{
"epoch": 0.02459419576979833,
"grad_norm": 5.35139274597168,
"learning_rate": 9.832841691248773e-07,
"loss": 1.2946,
"step": 100
},
{
"epoch": 0.02484013772749631,
"grad_norm": 5.050895690917969,
"learning_rate": 9.93117010816126e-07,
"loss": 1.3125,
"step": 101
},
{
"epoch": 0.025086079685194294,
"grad_norm": 5.3654093742370605,
"learning_rate": 1.0029498525073746e-06,
"loss": 1.278,
"step": 102
},
{
"epoch": 0.025332021642892277,
"grad_norm": 4.969247341156006,
"learning_rate": 1.0127826941986235e-06,
"loss": 1.1458,
"step": 103
},
{
"epoch": 0.02557796360059026,
"grad_norm": 5.1136155128479,
"learning_rate": 1.0226155358898722e-06,
"loss": 1.2066,
"step": 104
},
{
"epoch": 0.025823905558288243,
"grad_norm": 5.080495357513428,
"learning_rate": 1.032448377581121e-06,
"loss": 1.1618,
"step": 105
},
{
"epoch": 0.026069847515986226,
"grad_norm": 5.241085529327393,
"learning_rate": 1.04228121927237e-06,
"loss": 1.4272,
"step": 106
},
{
"epoch": 0.02631578947368421,
"grad_norm": 5.695695400238037,
"learning_rate": 1.0521140609636186e-06,
"loss": 1.1065,
"step": 107
},
{
"epoch": 0.026561731431382195,
"grad_norm": 4.662106037139893,
"learning_rate": 1.0619469026548673e-06,
"loss": 1.1669,
"step": 108
},
{
"epoch": 0.02680767338908018,
"grad_norm": 5.177834987640381,
"learning_rate": 1.0717797443461161e-06,
"loss": 1.1793,
"step": 109
},
{
"epoch": 0.02705361534677816,
"grad_norm": 5.14351749420166,
"learning_rate": 1.0816125860373648e-06,
"loss": 1.2398,
"step": 110
},
{
"epoch": 0.027299557304476144,
"grad_norm": 5.043832302093506,
"learning_rate": 1.0914454277286137e-06,
"loss": 1.2593,
"step": 111
},
{
"epoch": 0.027545499262174127,
"grad_norm": 5.509718894958496,
"learning_rate": 1.1012782694198624e-06,
"loss": 1.2001,
"step": 112
},
{
"epoch": 0.02779144121987211,
"grad_norm": 5.363673686981201,
"learning_rate": 1.111111111111111e-06,
"loss": 1.3331,
"step": 113
},
{
"epoch": 0.028037383177570093,
"grad_norm": 5.305070877075195,
"learning_rate": 1.12094395280236e-06,
"loss": 1.3525,
"step": 114
},
{
"epoch": 0.028283325135268076,
"grad_norm": 5.061011791229248,
"learning_rate": 1.1307767944936088e-06,
"loss": 1.1691,
"step": 115
},
{
"epoch": 0.02852926709296606,
"grad_norm": 5.197732925415039,
"learning_rate": 1.1406096361848575e-06,
"loss": 1.2408,
"step": 116
},
{
"epoch": 0.028775209050664042,
"grad_norm": 5.326655864715576,
"learning_rate": 1.1504424778761064e-06,
"loss": 1.3098,
"step": 117
},
{
"epoch": 0.029021151008362025,
"grad_norm": 4.740175247192383,
"learning_rate": 1.160275319567355e-06,
"loss": 1.1907,
"step": 118
},
{
"epoch": 0.02926709296606001,
"grad_norm": 5.087549686431885,
"learning_rate": 1.1701081612586037e-06,
"loss": 1.0858,
"step": 119
},
{
"epoch": 0.029513034923757994,
"grad_norm": 5.05009651184082,
"learning_rate": 1.1799410029498526e-06,
"loss": 1.1858,
"step": 120
},
{
"epoch": 0.029758976881455977,
"grad_norm": 4.669824600219727,
"learning_rate": 1.1897738446411015e-06,
"loss": 1.1667,
"step": 121
},
{
"epoch": 0.03000491883915396,
"grad_norm": 5.139840126037598,
"learning_rate": 1.1996066863323501e-06,
"loss": 1.3096,
"step": 122
},
{
"epoch": 0.030250860796851943,
"grad_norm": 4.710155487060547,
"learning_rate": 1.2094395280235988e-06,
"loss": 1.2648,
"step": 123
},
{
"epoch": 0.030496802754549926,
"grad_norm": 5.062249183654785,
"learning_rate": 1.2192723697148477e-06,
"loss": 1.1939,
"step": 124
},
{
"epoch": 0.03074274471224791,
"grad_norm": 5.2216973304748535,
"learning_rate": 1.2291052114060964e-06,
"loss": 1.2587,
"step": 125
},
{
"epoch": 0.030988686669945892,
"grad_norm": 5.196878433227539,
"learning_rate": 1.2389380530973452e-06,
"loss": 1.3093,
"step": 126
},
{
"epoch": 0.031234628627643875,
"grad_norm": 4.620794296264648,
"learning_rate": 1.2487708947885941e-06,
"loss": 1.1423,
"step": 127
},
{
"epoch": 0.03148057058534186,
"grad_norm": 5.154642581939697,
"learning_rate": 1.2586037364798428e-06,
"loss": 1.3356,
"step": 128
},
{
"epoch": 0.03172651254303984,
"grad_norm": 5.006635665893555,
"learning_rate": 1.2684365781710917e-06,
"loss": 1.2241,
"step": 129
},
{
"epoch": 0.031972454500737824,
"grad_norm": 4.912220001220703,
"learning_rate": 1.2782694198623401e-06,
"loss": 1.1423,
"step": 130
},
{
"epoch": 0.032218396458435807,
"grad_norm": 5.370849609375,
"learning_rate": 1.288102261553589e-06,
"loss": 1.1451,
"step": 131
},
{
"epoch": 0.03246433841613379,
"grad_norm": 5.07542085647583,
"learning_rate": 1.297935103244838e-06,
"loss": 1.159,
"step": 132
},
{
"epoch": 0.03271028037383177,
"grad_norm": 5.770486831665039,
"learning_rate": 1.3077679449360866e-06,
"loss": 1.146,
"step": 133
},
{
"epoch": 0.03295622233152976,
"grad_norm": 4.991493225097656,
"learning_rate": 1.3176007866273355e-06,
"loss": 1.1361,
"step": 134
},
{
"epoch": 0.033202164289227745,
"grad_norm": 5.177655220031738,
"learning_rate": 1.3274336283185843e-06,
"loss": 1.275,
"step": 135
},
{
"epoch": 0.03344810624692573,
"grad_norm": 4.936798095703125,
"learning_rate": 1.3372664700098328e-06,
"loss": 1.2577,
"step": 136
},
{
"epoch": 0.03369404820462371,
"grad_norm": 5.102567672729492,
"learning_rate": 1.3470993117010817e-06,
"loss": 1.0906,
"step": 137
},
{
"epoch": 0.033939990162321694,
"grad_norm": 5.409482955932617,
"learning_rate": 1.3569321533923306e-06,
"loss": 1.1717,
"step": 138
},
{
"epoch": 0.03418593212001968,
"grad_norm": 5.59683895111084,
"learning_rate": 1.3667649950835792e-06,
"loss": 1.398,
"step": 139
},
{
"epoch": 0.03443187407771766,
"grad_norm": 5.129553318023682,
"learning_rate": 1.3765978367748281e-06,
"loss": 1.1737,
"step": 140
},
{
"epoch": 0.03467781603541564,
"grad_norm": 5.006252765655518,
"learning_rate": 1.386430678466077e-06,
"loss": 1.3117,
"step": 141
},
{
"epoch": 0.034923757993113626,
"grad_norm": 5.0554304122924805,
"learning_rate": 1.3962635201573255e-06,
"loss": 1.1817,
"step": 142
},
{
"epoch": 0.03516969995081161,
"grad_norm": 5.6455888748168945,
"learning_rate": 1.4060963618485743e-06,
"loss": 1.2613,
"step": 143
},
{
"epoch": 0.03541564190850959,
"grad_norm": 5.636679172515869,
"learning_rate": 1.415929203539823e-06,
"loss": 1.2577,
"step": 144
},
{
"epoch": 0.035661583866207575,
"grad_norm": 5.21352481842041,
"learning_rate": 1.425762045231072e-06,
"loss": 1.3731,
"step": 145
},
{
"epoch": 0.03590752582390556,
"grad_norm": 5.466804504394531,
"learning_rate": 1.4355948869223208e-06,
"loss": 1.2045,
"step": 146
},
{
"epoch": 0.03615346778160354,
"grad_norm": 5.083317279815674,
"learning_rate": 1.4454277286135697e-06,
"loss": 1.3648,
"step": 147
},
{
"epoch": 0.03639940973930152,
"grad_norm": 4.955395221710205,
"learning_rate": 1.4552605703048181e-06,
"loss": 1.1471,
"step": 148
},
{
"epoch": 0.036645351696999506,
"grad_norm": 5.649031639099121,
"learning_rate": 1.465093411996067e-06,
"loss": 1.19,
"step": 149
},
{
"epoch": 0.03689129365469749,
"grad_norm": 4.527042865753174,
"learning_rate": 1.4749262536873157e-06,
"loss": 0.976,
"step": 150
},
{
"epoch": 0.03713723561239547,
"grad_norm": 4.714815616607666,
"learning_rate": 1.4847590953785646e-06,
"loss": 1.1792,
"step": 151
},
{
"epoch": 0.037383177570093455,
"grad_norm": 5.32336950302124,
"learning_rate": 1.4945919370698134e-06,
"loss": 1.2161,
"step": 152
},
{
"epoch": 0.03762911952779144,
"grad_norm": 5.683696269989014,
"learning_rate": 1.5044247787610621e-06,
"loss": 1.4505,
"step": 153
},
{
"epoch": 0.03787506148548942,
"grad_norm": 4.832881927490234,
"learning_rate": 1.5142576204523108e-06,
"loss": 1.206,
"step": 154
},
{
"epoch": 0.03812100344318741,
"grad_norm": 5.346810340881348,
"learning_rate": 1.5240904621435595e-06,
"loss": 1.1325,
"step": 155
},
{
"epoch": 0.038366945400885394,
"grad_norm": 4.778387069702148,
"learning_rate": 1.5339233038348083e-06,
"loss": 1.1576,
"step": 156
},
{
"epoch": 0.03861288735858338,
"grad_norm": 5.221841335296631,
"learning_rate": 1.5437561455260572e-06,
"loss": 1.3054,
"step": 157
},
{
"epoch": 0.03885882931628136,
"grad_norm": 4.841866493225098,
"learning_rate": 1.553588987217306e-06,
"loss": 1.2953,
"step": 158
},
{
"epoch": 0.03910477127397934,
"grad_norm": 5.528676509857178,
"learning_rate": 1.5634218289085548e-06,
"loss": 1.2801,
"step": 159
},
{
"epoch": 0.039350713231677326,
"grad_norm": 5.631914138793945,
"learning_rate": 1.5732546705998034e-06,
"loss": 1.3937,
"step": 160
},
{
"epoch": 0.03959665518937531,
"grad_norm": 5.1835036277771,
"learning_rate": 1.5830875122910521e-06,
"loss": 1.345,
"step": 161
},
{
"epoch": 0.03984259714707329,
"grad_norm": 5.447407245635986,
"learning_rate": 1.592920353982301e-06,
"loss": 1.2887,
"step": 162
},
{
"epoch": 0.040088539104771274,
"grad_norm": 5.357491493225098,
"learning_rate": 1.6027531956735499e-06,
"loss": 1.1747,
"step": 163
},
{
"epoch": 0.04033448106246926,
"grad_norm": 4.951967239379883,
"learning_rate": 1.6125860373647985e-06,
"loss": 1.4059,
"step": 164
},
{
"epoch": 0.04058042302016724,
"grad_norm": 4.798259735107422,
"learning_rate": 1.6224188790560474e-06,
"loss": 1.1574,
"step": 165
},
{
"epoch": 0.04082636497786522,
"grad_norm": 4.532293319702148,
"learning_rate": 1.6322517207472959e-06,
"loss": 1.0914,
"step": 166
},
{
"epoch": 0.041072306935563206,
"grad_norm": 5.223738670349121,
"learning_rate": 1.6420845624385448e-06,
"loss": 1.3109,
"step": 167
},
{
"epoch": 0.04131824889326119,
"grad_norm": 5.186299800872803,
"learning_rate": 1.6519174041297937e-06,
"loss": 1.2986,
"step": 168
},
{
"epoch": 0.04156419085095917,
"grad_norm": 5.0443196296691895,
"learning_rate": 1.6617502458210425e-06,
"loss": 1.2987,
"step": 169
},
{
"epoch": 0.041810132808657155,
"grad_norm": 5.047907829284668,
"learning_rate": 1.6715830875122912e-06,
"loss": 1.1741,
"step": 170
},
{
"epoch": 0.04205607476635514,
"grad_norm": 4.72170352935791,
"learning_rate": 1.68141592920354e-06,
"loss": 1.2421,
"step": 171
},
{
"epoch": 0.04230201672405312,
"grad_norm": 5.275922775268555,
"learning_rate": 1.6912487708947886e-06,
"loss": 1.1582,
"step": 172
},
{
"epoch": 0.042547958681751104,
"grad_norm": 5.2968058586120605,
"learning_rate": 1.7010816125860374e-06,
"loss": 1.2676,
"step": 173
},
{
"epoch": 0.04279390063944909,
"grad_norm": 4.744229316711426,
"learning_rate": 1.7109144542772863e-06,
"loss": 1.1686,
"step": 174
},
{
"epoch": 0.04303984259714708,
"grad_norm": 4.801780700683594,
"learning_rate": 1.720747295968535e-06,
"loss": 1.1562,
"step": 175
},
{
"epoch": 0.04328578455484506,
"grad_norm": 5.448787212371826,
"learning_rate": 1.7305801376597839e-06,
"loss": 1.3898,
"step": 176
},
{
"epoch": 0.04353172651254304,
"grad_norm": 5.518412113189697,
"learning_rate": 1.7404129793510328e-06,
"loss": 1.2674,
"step": 177
},
{
"epoch": 0.043777668470241025,
"grad_norm": 5.0862555503845215,
"learning_rate": 1.7502458210422812e-06,
"loss": 1.1962,
"step": 178
},
{
"epoch": 0.04402361042793901,
"grad_norm": 4.670449256896973,
"learning_rate": 1.76007866273353e-06,
"loss": 1.0377,
"step": 179
},
{
"epoch": 0.04426955238563699,
"grad_norm": 5.244323253631592,
"learning_rate": 1.769911504424779e-06,
"loss": 1.265,
"step": 180
},
{
"epoch": 0.044515494343334974,
"grad_norm": 4.950331211090088,
"learning_rate": 1.7797443461160276e-06,
"loss": 1.2835,
"step": 181
},
{
"epoch": 0.04476143630103296,
"grad_norm": 5.069838047027588,
"learning_rate": 1.7895771878072765e-06,
"loss": 1.2507,
"step": 182
},
{
"epoch": 0.04500737825873094,
"grad_norm": 4.518308162689209,
"learning_rate": 1.7994100294985254e-06,
"loss": 1.2254,
"step": 183
},
{
"epoch": 0.04525332021642892,
"grad_norm": 5.309910297393799,
"learning_rate": 1.8092428711897739e-06,
"loss": 1.2724,
"step": 184
},
{
"epoch": 0.045499262174126906,
"grad_norm": 5.119739055633545,
"learning_rate": 1.8190757128810228e-06,
"loss": 1.1866,
"step": 185
},
{
"epoch": 0.04574520413182489,
"grad_norm": 4.652911186218262,
"learning_rate": 1.8289085545722714e-06,
"loss": 1.0847,
"step": 186
},
{
"epoch": 0.04599114608952287,
"grad_norm": 5.1225996017456055,
"learning_rate": 1.8387413962635203e-06,
"loss": 1.2804,
"step": 187
},
{
"epoch": 0.046237088047220855,
"grad_norm": 5.137363433837891,
"learning_rate": 1.8485742379547692e-06,
"loss": 1.1589,
"step": 188
},
{
"epoch": 0.04648303000491884,
"grad_norm": 5.517624855041504,
"learning_rate": 1.8584070796460179e-06,
"loss": 1.3215,
"step": 189
},
{
"epoch": 0.04672897196261682,
"grad_norm": 4.814191818237305,
"learning_rate": 1.8682399213372665e-06,
"loss": 1.0058,
"step": 190
},
{
"epoch": 0.046974913920314804,
"grad_norm": 5.222909450531006,
"learning_rate": 1.8780727630285154e-06,
"loss": 1.2205,
"step": 191
},
{
"epoch": 0.04722085587801279,
"grad_norm": 5.480452060699463,
"learning_rate": 1.887905604719764e-06,
"loss": 1.2556,
"step": 192
},
{
"epoch": 0.04746679783571077,
"grad_norm": 4.504280090332031,
"learning_rate": 1.897738446411013e-06,
"loss": 1.1421,
"step": 193
},
{
"epoch": 0.04771273979340875,
"grad_norm": 4.9549689292907715,
"learning_rate": 1.9075712881022616e-06,
"loss": 1.0684,
"step": 194
},
{
"epoch": 0.047958681751106735,
"grad_norm": 5.211985111236572,
"learning_rate": 1.9174041297935107e-06,
"loss": 1.2272,
"step": 195
},
{
"epoch": 0.048204623708804725,
"grad_norm": 5.188969612121582,
"learning_rate": 1.927236971484759e-06,
"loss": 1.0958,
"step": 196
},
{
"epoch": 0.04845056566650271,
"grad_norm": 4.800060272216797,
"learning_rate": 1.937069813176008e-06,
"loss": 1.1228,
"step": 197
},
{
"epoch": 0.04869650762420069,
"grad_norm": 5.320250988006592,
"learning_rate": 1.9469026548672567e-06,
"loss": 1.109,
"step": 198
},
{
"epoch": 0.048942449581898674,
"grad_norm": 5.107649803161621,
"learning_rate": 1.9567354965585054e-06,
"loss": 1.2886,
"step": 199
},
{
"epoch": 0.04918839153959666,
"grad_norm": 5.639791965484619,
"learning_rate": 1.9665683382497545e-06,
"loss": 1.238,
"step": 200
},
{
"epoch": 0.04943433349729464,
"grad_norm": 5.078655242919922,
"learning_rate": 1.976401179941003e-06,
"loss": 1.2601,
"step": 201
},
{
"epoch": 0.04968027545499262,
"grad_norm": 5.2277445793151855,
"learning_rate": 1.986234021632252e-06,
"loss": 1.2052,
"step": 202
},
{
"epoch": 0.049926217412690606,
"grad_norm": 4.732024669647217,
"learning_rate": 1.9960668633235005e-06,
"loss": 1.0476,
"step": 203
},
{
"epoch": 0.05017215937038859,
"grad_norm": 5.35389518737793,
"learning_rate": 2.005899705014749e-06,
"loss": 1.3584,
"step": 204
},
{
"epoch": 0.05041810132808657,
"grad_norm": 5.011120796203613,
"learning_rate": 2.0157325467059983e-06,
"loss": 1.3804,
"step": 205
},
{
"epoch": 0.050664043285784555,
"grad_norm": 4.959133148193359,
"learning_rate": 2.025565388397247e-06,
"loss": 1.1694,
"step": 206
},
{
"epoch": 0.05090998524348254,
"grad_norm": 4.890188694000244,
"learning_rate": 2.035398230088496e-06,
"loss": 1.1917,
"step": 207
},
{
"epoch": 0.05115592720118052,
"grad_norm": 4.968293190002441,
"learning_rate": 2.0452310717797443e-06,
"loss": 1.3057,
"step": 208
},
{
"epoch": 0.0514018691588785,
"grad_norm": 4.59281063079834,
"learning_rate": 2.0550639134709934e-06,
"loss": 1.1147,
"step": 209
},
{
"epoch": 0.051647811116576486,
"grad_norm": 4.929031848907471,
"learning_rate": 2.064896755162242e-06,
"loss": 1.336,
"step": 210
},
{
"epoch": 0.05189375307427447,
"grad_norm": 4.891195297241211,
"learning_rate": 2.0747295968534907e-06,
"loss": 1.2033,
"step": 211
},
{
"epoch": 0.05213969503197245,
"grad_norm": 4.8050055503845215,
"learning_rate": 2.08456243854474e-06,
"loss": 1.2201,
"step": 212
},
{
"epoch": 0.052385636989670435,
"grad_norm": 4.942957878112793,
"learning_rate": 2.0943952802359885e-06,
"loss": 1.1069,
"step": 213
},
{
"epoch": 0.05263157894736842,
"grad_norm": 4.826563358306885,
"learning_rate": 2.104228121927237e-06,
"loss": 1.3333,
"step": 214
},
{
"epoch": 0.0528775209050664,
"grad_norm": 5.28056001663208,
"learning_rate": 2.114060963618486e-06,
"loss": 1.1128,
"step": 215
},
{
"epoch": 0.05312346286276439,
"grad_norm": 4.368964672088623,
"learning_rate": 2.1238938053097345e-06,
"loss": 1.0891,
"step": 216
},
{
"epoch": 0.053369404820462374,
"grad_norm": 5.641948223114014,
"learning_rate": 2.1337266470009836e-06,
"loss": 1.1803,
"step": 217
},
{
"epoch": 0.05361534677816036,
"grad_norm": 4.892561912536621,
"learning_rate": 2.1435594886922323e-06,
"loss": 1.0487,
"step": 218
},
{
"epoch": 0.05386128873585834,
"grad_norm": 5.496403217315674,
"learning_rate": 2.153392330383481e-06,
"loss": 1.2108,
"step": 219
},
{
"epoch": 0.05410723069355632,
"grad_norm": 5.252872943878174,
"learning_rate": 2.1632251720747296e-06,
"loss": 1.2721,
"step": 220
},
{
"epoch": 0.054353172651254306,
"grad_norm": 5.141668319702148,
"learning_rate": 2.1730580137659783e-06,
"loss": 1.0979,
"step": 221
},
{
"epoch": 0.05459911460895229,
"grad_norm": 4.924139499664307,
"learning_rate": 2.1828908554572274e-06,
"loss": 1.1037,
"step": 222
},
{
"epoch": 0.05484505656665027,
"grad_norm": 4.911133766174316,
"learning_rate": 2.192723697148476e-06,
"loss": 1.319,
"step": 223
},
{
"epoch": 0.055090998524348254,
"grad_norm": 5.304295539855957,
"learning_rate": 2.2025565388397247e-06,
"loss": 1.2688,
"step": 224
},
{
"epoch": 0.05533694048204624,
"grad_norm": 5.419989585876465,
"learning_rate": 2.212389380530974e-06,
"loss": 1.1471,
"step": 225
},
{
"epoch": 0.05558288243974422,
"grad_norm": 5.312771797180176,
"learning_rate": 2.222222222222222e-06,
"loss": 1.231,
"step": 226
},
{
"epoch": 0.0558288243974422,
"grad_norm": 5.411747932434082,
"learning_rate": 2.232055063913471e-06,
"loss": 1.1036,
"step": 227
},
{
"epoch": 0.056074766355140186,
"grad_norm": 5.019411563873291,
"learning_rate": 2.24188790560472e-06,
"loss": 1.2223,
"step": 228
},
{
"epoch": 0.05632070831283817,
"grad_norm": 5.189014911651611,
"learning_rate": 2.251720747295969e-06,
"loss": 1.2123,
"step": 229
},
{
"epoch": 0.05656665027053615,
"grad_norm": 5.391974449157715,
"learning_rate": 2.2615535889872176e-06,
"loss": 1.2785,
"step": 230
},
{
"epoch": 0.056812592228234135,
"grad_norm": 5.179771900177002,
"learning_rate": 2.2713864306784663e-06,
"loss": 1.1447,
"step": 231
},
{
"epoch": 0.05705853418593212,
"grad_norm": 5.084639549255371,
"learning_rate": 2.281219272369715e-06,
"loss": 1.3101,
"step": 232
},
{
"epoch": 0.0573044761436301,
"grad_norm": 5.014813423156738,
"learning_rate": 2.2910521140609636e-06,
"loss": 1.223,
"step": 233
},
{
"epoch": 0.057550418101328084,
"grad_norm": 4.979683876037598,
"learning_rate": 2.3008849557522127e-06,
"loss": 1.1941,
"step": 234
},
{
"epoch": 0.05779636005902607,
"grad_norm": 5.461153984069824,
"learning_rate": 2.3107177974434614e-06,
"loss": 1.2096,
"step": 235
},
{
"epoch": 0.05804230201672405,
"grad_norm": 5.1997904777526855,
"learning_rate": 2.32055063913471e-06,
"loss": 1.2583,
"step": 236
},
{
"epoch": 0.05828824397442204,
"grad_norm": 5.116849899291992,
"learning_rate": 2.330383480825959e-06,
"loss": 1.1956,
"step": 237
},
{
"epoch": 0.05853418593212002,
"grad_norm": 5.231355667114258,
"learning_rate": 2.3402163225172074e-06,
"loss": 1.2438,
"step": 238
},
{
"epoch": 0.058780127889818005,
"grad_norm": 4.861061096191406,
"learning_rate": 2.3500491642084565e-06,
"loss": 1.2309,
"step": 239
},
{
"epoch": 0.05902606984751599,
"grad_norm": 5.244760513305664,
"learning_rate": 2.359882005899705e-06,
"loss": 1.3202,
"step": 240
},
{
"epoch": 0.05927201180521397,
"grad_norm": 4.998968124389648,
"learning_rate": 2.369714847590954e-06,
"loss": 1.24,
"step": 241
},
{
"epoch": 0.059517953762911954,
"grad_norm": 4.9600510597229,
"learning_rate": 2.379547689282203e-06,
"loss": 1.2138,
"step": 242
},
{
"epoch": 0.05976389572060994,
"grad_norm": 5.108877182006836,
"learning_rate": 2.3893805309734516e-06,
"loss": 1.2381,
"step": 243
},
{
"epoch": 0.06000983767830792,
"grad_norm": 5.000603675842285,
"learning_rate": 2.3992133726647003e-06,
"loss": 1.17,
"step": 244
},
{
"epoch": 0.0602557796360059,
"grad_norm": 5.451155185699463,
"learning_rate": 2.409046214355949e-06,
"loss": 1.2412,
"step": 245
},
{
"epoch": 0.060501721593703886,
"grad_norm": 4.734378814697266,
"learning_rate": 2.4188790560471976e-06,
"loss": 1.1317,
"step": 246
},
{
"epoch": 0.06074766355140187,
"grad_norm": 5.384011268615723,
"learning_rate": 2.4287118977384467e-06,
"loss": 1.135,
"step": 247
},
{
"epoch": 0.06099360550909985,
"grad_norm": 5.565666675567627,
"learning_rate": 2.4385447394296954e-06,
"loss": 1.2907,
"step": 248
},
{
"epoch": 0.061239547466797835,
"grad_norm": 4.783609390258789,
"learning_rate": 2.448377581120944e-06,
"loss": 1.1655,
"step": 249
},
{
"epoch": 0.06148548942449582,
"grad_norm": 4.86632776260376,
"learning_rate": 2.4582104228121927e-06,
"loss": 1.1068,
"step": 250
},
{
"epoch": 0.0617314313821938,
"grad_norm": 4.423895835876465,
"learning_rate": 2.468043264503442e-06,
"loss": 1.0441,
"step": 251
},
{
"epoch": 0.061977373339891784,
"grad_norm": 5.141962051391602,
"learning_rate": 2.4778761061946905e-06,
"loss": 1.3244,
"step": 252
},
{
"epoch": 0.06222331529758977,
"grad_norm": 5.706864356994629,
"learning_rate": 2.487708947885939e-06,
"loss": 1.1427,
"step": 253
},
{
"epoch": 0.06246925725528775,
"grad_norm": 5.406494617462158,
"learning_rate": 2.4975417895771882e-06,
"loss": 1.2417,
"step": 254
},
{
"epoch": 0.06271519921298574,
"grad_norm": 5.109960079193115,
"learning_rate": 2.5073746312684365e-06,
"loss": 1.2977,
"step": 255
},
{
"epoch": 0.06296114117068372,
"grad_norm": 5.585296154022217,
"learning_rate": 2.5172074729596856e-06,
"loss": 1.156,
"step": 256
},
{
"epoch": 0.0632070831283817,
"grad_norm": 4.740612506866455,
"learning_rate": 2.5270403146509343e-06,
"loss": 1.0564,
"step": 257
},
{
"epoch": 0.06345302508607968,
"grad_norm": 5.031524658203125,
"learning_rate": 2.5368731563421834e-06,
"loss": 1.276,
"step": 258
},
{
"epoch": 0.06369896704377767,
"grad_norm": 4.779954433441162,
"learning_rate": 2.546705998033432e-06,
"loss": 1.2055,
"step": 259
},
{
"epoch": 0.06394490900147565,
"grad_norm": 5.662361145019531,
"learning_rate": 2.5565388397246803e-06,
"loss": 1.2503,
"step": 260
},
{
"epoch": 0.06419085095917364,
"grad_norm": 5.775557041168213,
"learning_rate": 2.5663716814159294e-06,
"loss": 1.2449,
"step": 261
},
{
"epoch": 0.06443679291687161,
"grad_norm": 5.2927045822143555,
"learning_rate": 2.576204523107178e-06,
"loss": 1.3944,
"step": 262
},
{
"epoch": 0.0646827348745696,
"grad_norm": 5.039997100830078,
"learning_rate": 2.586037364798427e-06,
"loss": 1.2165,
"step": 263
},
{
"epoch": 0.06492867683226758,
"grad_norm": 4.908417224884033,
"learning_rate": 2.595870206489676e-06,
"loss": 1.1503,
"step": 264
},
{
"epoch": 0.06517461878996557,
"grad_norm": 4.937044620513916,
"learning_rate": 2.605703048180924e-06,
"loss": 1.0014,
"step": 265
},
{
"epoch": 0.06542056074766354,
"grad_norm": 5.102492809295654,
"learning_rate": 2.615535889872173e-06,
"loss": 1.2276,
"step": 266
},
{
"epoch": 0.06566650270536153,
"grad_norm": 4.972696781158447,
"learning_rate": 2.625368731563422e-06,
"loss": 1.25,
"step": 267
},
{
"epoch": 0.06591244466305952,
"grad_norm": 4.781945705413818,
"learning_rate": 2.635201573254671e-06,
"loss": 1.2212,
"step": 268
},
{
"epoch": 0.0661583866207575,
"grad_norm": 5.192379474639893,
"learning_rate": 2.6450344149459196e-06,
"loss": 1.1375,
"step": 269
},
{
"epoch": 0.06640432857845549,
"grad_norm": 5.151814937591553,
"learning_rate": 2.6548672566371687e-06,
"loss": 1.1694,
"step": 270
},
{
"epoch": 0.06665027053615347,
"grad_norm": 4.606880187988281,
"learning_rate": 2.6647000983284173e-06,
"loss": 0.9862,
"step": 271
},
{
"epoch": 0.06689621249385146,
"grad_norm": 5.032959938049316,
"learning_rate": 2.6745329400196656e-06,
"loss": 1.1583,
"step": 272
},
{
"epoch": 0.06714215445154943,
"grad_norm": 5.2365875244140625,
"learning_rate": 2.6843657817109147e-06,
"loss": 1.257,
"step": 273
},
{
"epoch": 0.06738809640924742,
"grad_norm": 5.516077995300293,
"learning_rate": 2.6941986234021634e-06,
"loss": 1.2423,
"step": 274
},
{
"epoch": 0.0676340383669454,
"grad_norm": 4.946442127227783,
"learning_rate": 2.7040314650934125e-06,
"loss": 1.2283,
"step": 275
},
{
"epoch": 0.06787998032464339,
"grad_norm": 5.936635971069336,
"learning_rate": 2.713864306784661e-06,
"loss": 1.2014,
"step": 276
},
{
"epoch": 0.06812592228234136,
"grad_norm": 4.768028736114502,
"learning_rate": 2.7236971484759094e-06,
"loss": 1.1657,
"step": 277
},
{
"epoch": 0.06837186424003935,
"grad_norm": 5.497716426849365,
"learning_rate": 2.7335299901671585e-06,
"loss": 1.1925,
"step": 278
},
{
"epoch": 0.06861780619773733,
"grad_norm": 4.9626240730285645,
"learning_rate": 2.743362831858407e-06,
"loss": 1.2006,
"step": 279
},
{
"epoch": 0.06886374815543532,
"grad_norm": 5.311958312988281,
"learning_rate": 2.7531956735496562e-06,
"loss": 1.2432,
"step": 280
},
{
"epoch": 0.0691096901131333,
"grad_norm": 5.125909328460693,
"learning_rate": 2.763028515240905e-06,
"loss": 1.1505,
"step": 281
},
{
"epoch": 0.06935563207083129,
"grad_norm": 4.748795032501221,
"learning_rate": 2.772861356932154e-06,
"loss": 1.1231,
"step": 282
},
{
"epoch": 0.06960157402852926,
"grad_norm": 4.79762601852417,
"learning_rate": 2.7826941986234022e-06,
"loss": 1.196,
"step": 283
},
{
"epoch": 0.06984751598622725,
"grad_norm": 4.942255973815918,
"learning_rate": 2.792527040314651e-06,
"loss": 1.2022,
"step": 284
},
{
"epoch": 0.07009345794392523,
"grad_norm": 4.787925720214844,
"learning_rate": 2.8023598820059e-06,
"loss": 1.2336,
"step": 285
},
{
"epoch": 0.07033939990162322,
"grad_norm": 4.74533224105835,
"learning_rate": 2.8121927236971487e-06,
"loss": 1.1595,
"step": 286
},
{
"epoch": 0.0705853418593212,
"grad_norm": 5.054858684539795,
"learning_rate": 2.8220255653883978e-06,
"loss": 1.2082,
"step": 287
},
{
"epoch": 0.07083128381701918,
"grad_norm": 5.297298908233643,
"learning_rate": 2.831858407079646e-06,
"loss": 1.2509,
"step": 288
},
{
"epoch": 0.07107722577471717,
"grad_norm": 4.919221878051758,
"learning_rate": 2.8416912487708947e-06,
"loss": 1.2377,
"step": 289
},
{
"epoch": 0.07132316773241515,
"grad_norm": 4.939472198486328,
"learning_rate": 2.851524090462144e-06,
"loss": 1.2353,
"step": 290
},
{
"epoch": 0.07156910969011314,
"grad_norm": 4.771536350250244,
"learning_rate": 2.8613569321533925e-06,
"loss": 1.2725,
"step": 291
},
{
"epoch": 0.07181505164781112,
"grad_norm": 5.260715484619141,
"learning_rate": 2.8711897738446416e-06,
"loss": 1.1555,
"step": 292
},
{
"epoch": 0.0720609936055091,
"grad_norm": 4.726348876953125,
"learning_rate": 2.8810226155358902e-06,
"loss": 1.1028,
"step": 293
},
{
"epoch": 0.07230693556320708,
"grad_norm": 4.783933639526367,
"learning_rate": 2.8908554572271393e-06,
"loss": 1.2282,
"step": 294
},
{
"epoch": 0.07255287752090507,
"grad_norm": 5.237471580505371,
"learning_rate": 2.9006882989183876e-06,
"loss": 1.1297,
"step": 295
},
{
"epoch": 0.07279881947860305,
"grad_norm": 5.009494304656982,
"learning_rate": 2.9105211406096362e-06,
"loss": 1.1755,
"step": 296
},
{
"epoch": 0.07304476143630104,
"grad_norm": 4.967554092407227,
"learning_rate": 2.9203539823008853e-06,
"loss": 1.2376,
"step": 297
},
{
"epoch": 0.07329070339399901,
"grad_norm": 5.214707851409912,
"learning_rate": 2.930186823992134e-06,
"loss": 1.1992,
"step": 298
},
{
"epoch": 0.073536645351697,
"grad_norm": 5.045170307159424,
"learning_rate": 2.940019665683383e-06,
"loss": 1.2603,
"step": 299
},
{
"epoch": 0.07378258730939498,
"grad_norm": 5.371266841888428,
"learning_rate": 2.9498525073746313e-06,
"loss": 1.1729,
"step": 300
},
{
"epoch": 0.07402852926709297,
"grad_norm": 5.1045942306518555,
"learning_rate": 2.95968534906588e-06,
"loss": 1.2093,
"step": 301
},
{
"epoch": 0.07427447122479094,
"grad_norm": 4.824265956878662,
"learning_rate": 2.969518190757129e-06,
"loss": 1.1219,
"step": 302
},
{
"epoch": 0.07452041318248893,
"grad_norm": 5.6183180809021,
"learning_rate": 2.9793510324483778e-06,
"loss": 1.2981,
"step": 303
},
{
"epoch": 0.07476635514018691,
"grad_norm": 4.667074680328369,
"learning_rate": 2.989183874139627e-06,
"loss": 1.1248,
"step": 304
},
{
"epoch": 0.0750122970978849,
"grad_norm": 4.665690898895264,
"learning_rate": 2.999016715830875e-06,
"loss": 1.1454,
"step": 305
},
{
"epoch": 0.07525823905558288,
"grad_norm": 4.970240592956543,
"learning_rate": 3.0088495575221242e-06,
"loss": 1.2058,
"step": 306
},
{
"epoch": 0.07550418101328087,
"grad_norm": 5.330406665802002,
"learning_rate": 3.018682399213373e-06,
"loss": 1.315,
"step": 307
},
{
"epoch": 0.07575012297097884,
"grad_norm": 5.176372528076172,
"learning_rate": 3.0285152409046216e-06,
"loss": 1.2527,
"step": 308
},
{
"epoch": 0.07599606492867683,
"grad_norm": 5.23092794418335,
"learning_rate": 3.0383480825958707e-06,
"loss": 1.1288,
"step": 309
},
{
"epoch": 0.07624200688637482,
"grad_norm": 5.507978439331055,
"learning_rate": 3.048180924287119e-06,
"loss": 1.3008,
"step": 310
},
{
"epoch": 0.0764879488440728,
"grad_norm": 5.0398736000061035,
"learning_rate": 3.058013765978368e-06,
"loss": 1.3394,
"step": 311
},
{
"epoch": 0.07673389080177079,
"grad_norm": 5.554378986358643,
"learning_rate": 3.0678466076696167e-06,
"loss": 1.0992,
"step": 312
},
{
"epoch": 0.07697983275946876,
"grad_norm": 4.78623628616333,
"learning_rate": 3.0776794493608653e-06,
"loss": 1.1514,
"step": 313
},
{
"epoch": 0.07722577471716675,
"grad_norm": 5.364081859588623,
"learning_rate": 3.0875122910521144e-06,
"loss": 1.2883,
"step": 314
},
{
"epoch": 0.07747171667486473,
"grad_norm": 5.01410436630249,
"learning_rate": 3.097345132743363e-06,
"loss": 1.1416,
"step": 315
},
{
"epoch": 0.07771765863256272,
"grad_norm": 4.716882228851318,
"learning_rate": 3.107177974434612e-06,
"loss": 0.9836,
"step": 316
},
{
"epoch": 0.0779636005902607,
"grad_norm": 5.234086513519287,
"learning_rate": 3.1170108161258604e-06,
"loss": 1.33,
"step": 317
},
{
"epoch": 0.07820954254795869,
"grad_norm": 5.257030487060547,
"learning_rate": 3.1268436578171095e-06,
"loss": 1.2449,
"step": 318
},
{
"epoch": 0.07845548450565666,
"grad_norm": 5.876575469970703,
"learning_rate": 3.136676499508358e-06,
"loss": 1.2674,
"step": 319
},
{
"epoch": 0.07870142646335465,
"grad_norm": 5.086026668548584,
"learning_rate": 3.146509341199607e-06,
"loss": 1.2645,
"step": 320
},
{
"epoch": 0.07894736842105263,
"grad_norm": 5.770899295806885,
"learning_rate": 3.156342182890856e-06,
"loss": 1.2091,
"step": 321
},
{
"epoch": 0.07919331037875062,
"grad_norm": 4.825321197509766,
"learning_rate": 3.1661750245821042e-06,
"loss": 1.3536,
"step": 322
},
{
"epoch": 0.0794392523364486,
"grad_norm": 5.291980266571045,
"learning_rate": 3.1760078662733533e-06,
"loss": 1.1345,
"step": 323
},
{
"epoch": 0.07968519429414658,
"grad_norm": 4.682566165924072,
"learning_rate": 3.185840707964602e-06,
"loss": 1.2008,
"step": 324
},
{
"epoch": 0.07993113625184456,
"grad_norm": 4.837527275085449,
"learning_rate": 3.1956735496558507e-06,
"loss": 1.1838,
"step": 325
},
{
"epoch": 0.08017707820954255,
"grad_norm": 5.027462005615234,
"learning_rate": 3.2055063913470998e-06,
"loss": 1.1142,
"step": 326
},
{
"epoch": 0.08042302016724052,
"grad_norm": 4.791934013366699,
"learning_rate": 3.215339233038348e-06,
"loss": 1.196,
"step": 327
},
{
"epoch": 0.08066896212493851,
"grad_norm": 4.958493232727051,
"learning_rate": 3.225172074729597e-06,
"loss": 1.1771,
"step": 328
},
{
"epoch": 0.0809149040826365,
"grad_norm": 4.747767448425293,
"learning_rate": 3.2350049164208458e-06,
"loss": 1.1297,
"step": 329
},
{
"epoch": 0.08116084604033448,
"grad_norm": 4.872331142425537,
"learning_rate": 3.244837758112095e-06,
"loss": 1.0887,
"step": 330
},
{
"epoch": 0.08140678799803247,
"grad_norm": 4.914925575256348,
"learning_rate": 3.2546705998033435e-06,
"loss": 1.1948,
"step": 331
},
{
"epoch": 0.08165272995573045,
"grad_norm": 5.043294429779053,
"learning_rate": 3.2645034414945918e-06,
"loss": 1.2656,
"step": 332
},
{
"epoch": 0.08189867191342844,
"grad_norm": 4.809412479400635,
"learning_rate": 3.274336283185841e-06,
"loss": 1.2906,
"step": 333
},
{
"epoch": 0.08214461387112641,
"grad_norm": 4.647971153259277,
"learning_rate": 3.2841691248770895e-06,
"loss": 1.1609,
"step": 334
},
{
"epoch": 0.0823905558288244,
"grad_norm": 4.671137809753418,
"learning_rate": 3.2940019665683386e-06,
"loss": 1.1648,
"step": 335
},
{
"epoch": 0.08263649778652238,
"grad_norm": 4.642824649810791,
"learning_rate": 3.3038348082595873e-06,
"loss": 1.3051,
"step": 336
},
{
"epoch": 0.08288243974422037,
"grad_norm": 5.096004962921143,
"learning_rate": 3.313667649950836e-06,
"loss": 1.1931,
"step": 337
},
{
"epoch": 0.08312838170191834,
"grad_norm": 5.2832560539245605,
"learning_rate": 3.323500491642085e-06,
"loss": 1.1977,
"step": 338
},
{
"epoch": 0.08337432365961633,
"grad_norm": 4.872321128845215,
"learning_rate": 3.3333333333333333e-06,
"loss": 1.136,
"step": 339
},
{
"epoch": 0.08362026561731431,
"grad_norm": 5.192013740539551,
"learning_rate": 3.3431661750245824e-06,
"loss": 1.1676,
"step": 340
},
{
"epoch": 0.0838662075750123,
"grad_norm": 5.486101150512695,
"learning_rate": 3.352999016715831e-06,
"loss": 1.185,
"step": 341
},
{
"epoch": 0.08411214953271028,
"grad_norm": 4.975491523742676,
"learning_rate": 3.36283185840708e-06,
"loss": 1.1954,
"step": 342
},
{
"epoch": 0.08435809149040827,
"grad_norm": 4.90364408493042,
"learning_rate": 3.372664700098329e-06,
"loss": 1.1561,
"step": 343
},
{
"epoch": 0.08460403344810624,
"grad_norm": 5.160917282104492,
"learning_rate": 3.382497541789577e-06,
"loss": 1.1813,
"step": 344
},
{
"epoch": 0.08484997540580423,
"grad_norm": 4.9448628425598145,
"learning_rate": 3.392330383480826e-06,
"loss": 1.1626,
"step": 345
},
{
"epoch": 0.08509591736350221,
"grad_norm": 5.21720027923584,
"learning_rate": 3.402163225172075e-06,
"loss": 1.336,
"step": 346
},
{
"epoch": 0.0853418593212002,
"grad_norm": 4.7690935134887695,
"learning_rate": 3.411996066863324e-06,
"loss": 1.1621,
"step": 347
},
{
"epoch": 0.08558780127889817,
"grad_norm": 4.849077224731445,
"learning_rate": 3.4218289085545726e-06,
"loss": 1.1366,
"step": 348
},
{
"epoch": 0.08583374323659616,
"grad_norm": 4.87131404876709,
"learning_rate": 3.4316617502458217e-06,
"loss": 1.1137,
"step": 349
},
{
"epoch": 0.08607968519429415,
"grad_norm": 4.9637627601623535,
"learning_rate": 3.44149459193707e-06,
"loss": 1.2139,
"step": 350
},
{
"epoch": 0.08632562715199213,
"grad_norm": 5.254129886627197,
"learning_rate": 3.4513274336283186e-06,
"loss": 1.1547,
"step": 351
},
{
"epoch": 0.08657156910969012,
"grad_norm": 4.950482368469238,
"learning_rate": 3.4611602753195677e-06,
"loss": 1.2748,
"step": 352
},
{
"epoch": 0.0868175110673881,
"grad_norm": 4.760875225067139,
"learning_rate": 3.4709931170108164e-06,
"loss": 1.199,
"step": 353
},
{
"epoch": 0.08706345302508608,
"grad_norm": 4.491692066192627,
"learning_rate": 3.4808259587020655e-06,
"loss": 1.2435,
"step": 354
},
{
"epoch": 0.08730939498278406,
"grad_norm": 4.921182632446289,
"learning_rate": 3.4906588003933138e-06,
"loss": 1.1301,
"step": 355
},
{
"epoch": 0.08755533694048205,
"grad_norm": 4.867427825927734,
"learning_rate": 3.5004916420845624e-06,
"loss": 1.1753,
"step": 356
},
{
"epoch": 0.08780127889818003,
"grad_norm": 5.084958076477051,
"learning_rate": 3.5103244837758115e-06,
"loss": 1.1898,
"step": 357
},
{
"epoch": 0.08804722085587802,
"grad_norm": 4.802285194396973,
"learning_rate": 3.52015732546706e-06,
"loss": 1.1771,
"step": 358
},
{
"epoch": 0.08829316281357599,
"grad_norm": 5.210577011108398,
"learning_rate": 3.5299901671583093e-06,
"loss": 1.2105,
"step": 359
},
{
"epoch": 0.08853910477127398,
"grad_norm": 5.195521354675293,
"learning_rate": 3.539823008849558e-06,
"loss": 1.2016,
"step": 360
},
{
"epoch": 0.08878504672897196,
"grad_norm": 5.51386022567749,
"learning_rate": 3.549655850540807e-06,
"loss": 1.2448,
"step": 361
},
{
"epoch": 0.08903098868666995,
"grad_norm": 5.155383110046387,
"learning_rate": 3.5594886922320553e-06,
"loss": 1.0863,
"step": 362
},
{
"epoch": 0.08927693064436792,
"grad_norm": 4.982314109802246,
"learning_rate": 3.569321533923304e-06,
"loss": 1.1855,
"step": 363
},
{
"epoch": 0.08952287260206591,
"grad_norm": 5.239125728607178,
"learning_rate": 3.579154375614553e-06,
"loss": 1.2556,
"step": 364
},
{
"epoch": 0.08976881455976389,
"grad_norm": 4.9231486320495605,
"learning_rate": 3.5889872173058017e-06,
"loss": 1.1407,
"step": 365
},
{
"epoch": 0.09001475651746188,
"grad_norm": 5.47269344329834,
"learning_rate": 3.598820058997051e-06,
"loss": 1.1971,
"step": 366
},
{
"epoch": 0.09026069847515986,
"grad_norm": 4.96206521987915,
"learning_rate": 3.608652900688299e-06,
"loss": 1.0414,
"step": 367
},
{
"epoch": 0.09050664043285785,
"grad_norm": 5.00238561630249,
"learning_rate": 3.6184857423795477e-06,
"loss": 1.1857,
"step": 368
},
{
"epoch": 0.09075258239055582,
"grad_norm": 5.164708137512207,
"learning_rate": 3.628318584070797e-06,
"loss": 1.122,
"step": 369
},
{
"epoch": 0.09099852434825381,
"grad_norm": 4.941920757293701,
"learning_rate": 3.6381514257620455e-06,
"loss": 1.2182,
"step": 370
},
{
"epoch": 0.0912444663059518,
"grad_norm": 4.8003668785095215,
"learning_rate": 3.6479842674532946e-06,
"loss": 1.3003,
"step": 371
},
{
"epoch": 0.09149040826364978,
"grad_norm": 4.360984802246094,
"learning_rate": 3.657817109144543e-06,
"loss": 1.1509,
"step": 372
},
{
"epoch": 0.09173635022134777,
"grad_norm": 5.907519340515137,
"learning_rate": 3.667649950835792e-06,
"loss": 1.1307,
"step": 373
},
{
"epoch": 0.09198229217904574,
"grad_norm": 4.695765972137451,
"learning_rate": 3.6774827925270406e-06,
"loss": 1.1157,
"step": 374
},
{
"epoch": 0.09222823413674373,
"grad_norm": 5.222832679748535,
"learning_rate": 3.6873156342182893e-06,
"loss": 1.1949,
"step": 375
},
{
"epoch": 0.09247417609444171,
"grad_norm": 4.9251203536987305,
"learning_rate": 3.6971484759095384e-06,
"loss": 1.242,
"step": 376
},
{
"epoch": 0.0927201180521397,
"grad_norm": 4.716278553009033,
"learning_rate": 3.7069813176007866e-06,
"loss": 1.0552,
"step": 377
},
{
"epoch": 0.09296606000983768,
"grad_norm": 4.582760810852051,
"learning_rate": 3.7168141592920357e-06,
"loss": 1.0401,
"step": 378
},
{
"epoch": 0.09321200196753567,
"grad_norm": 4.287074565887451,
"learning_rate": 3.7266470009832844e-06,
"loss": 1.1335,
"step": 379
},
{
"epoch": 0.09345794392523364,
"grad_norm": 4.690394878387451,
"learning_rate": 3.736479842674533e-06,
"loss": 1.1473,
"step": 380
},
{
"epoch": 0.09370388588293163,
"grad_norm": 4.809893608093262,
"learning_rate": 3.746312684365782e-06,
"loss": 1.102,
"step": 381
},
{
"epoch": 0.09394982784062961,
"grad_norm": 4.723176956176758,
"learning_rate": 3.756145526057031e-06,
"loss": 1.0803,
"step": 382
},
{
"epoch": 0.0941957697983276,
"grad_norm": 5.041618347167969,
"learning_rate": 3.76597836774828e-06,
"loss": 1.2223,
"step": 383
},
{
"epoch": 0.09444171175602557,
"grad_norm": 4.952805519104004,
"learning_rate": 3.775811209439528e-06,
"loss": 1.0905,
"step": 384
},
{
"epoch": 0.09468765371372356,
"grad_norm": 5.232875347137451,
"learning_rate": 3.7856440511307773e-06,
"loss": 1.2424,
"step": 385
},
{
"epoch": 0.09493359567142154,
"grad_norm": 5.12797737121582,
"learning_rate": 3.795476892822026e-06,
"loss": 1.1937,
"step": 386
},
{
"epoch": 0.09517953762911953,
"grad_norm": 4.786684036254883,
"learning_rate": 3.8053097345132746e-06,
"loss": 1.1217,
"step": 387
},
{
"epoch": 0.0954254795868175,
"grad_norm": 4.81479549407959,
"learning_rate": 3.815142576204523e-06,
"loss": 1.2081,
"step": 388
},
{
"epoch": 0.0956714215445155,
"grad_norm": 4.833148002624512,
"learning_rate": 3.824975417895772e-06,
"loss": 1.1296,
"step": 389
},
{
"epoch": 0.09591736350221347,
"grad_norm": 5.274966239929199,
"learning_rate": 3.8348082595870215e-06,
"loss": 1.3259,
"step": 390
},
{
"epoch": 0.09616330545991146,
"grad_norm": 4.860105514526367,
"learning_rate": 3.84464110127827e-06,
"loss": 1.2587,
"step": 391
},
{
"epoch": 0.09640924741760945,
"grad_norm": 5.428725242614746,
"learning_rate": 3.854473942969518e-06,
"loss": 1.3203,
"step": 392
},
{
"epoch": 0.09665518937530743,
"grad_norm": 4.516085624694824,
"learning_rate": 3.8643067846607675e-06,
"loss": 1.1947,
"step": 393
},
{
"epoch": 0.09690113133300542,
"grad_norm": 4.536333084106445,
"learning_rate": 3.874139626352016e-06,
"loss": 1.1012,
"step": 394
},
{
"epoch": 0.09714707329070339,
"grad_norm": 4.737096786499023,
"learning_rate": 3.883972468043265e-06,
"loss": 1.2731,
"step": 395
},
{
"epoch": 0.09739301524840138,
"grad_norm": 5.030757904052734,
"learning_rate": 3.8938053097345135e-06,
"loss": 1.1139,
"step": 396
},
{
"epoch": 0.09763895720609936,
"grad_norm": 4.976014614105225,
"learning_rate": 3.903638151425763e-06,
"loss": 1.2306,
"step": 397
},
{
"epoch": 0.09788489916379735,
"grad_norm": 5.37526798248291,
"learning_rate": 3.913470993117011e-06,
"loss": 1.2465,
"step": 398
},
{
"epoch": 0.09813084112149532,
"grad_norm": 5.41494083404541,
"learning_rate": 3.9233038348082595e-06,
"loss": 1.3261,
"step": 399
},
{
"epoch": 0.09837678307919331,
"grad_norm": 5.0262651443481445,
"learning_rate": 3.933136676499509e-06,
"loss": 1.181,
"step": 400
},
{
"epoch": 0.09862272503689129,
"grad_norm": 4.9057793617248535,
"learning_rate": 3.942969518190758e-06,
"loss": 1.1211,
"step": 401
},
{
"epoch": 0.09886866699458928,
"grad_norm": 5.4960784912109375,
"learning_rate": 3.952802359882006e-06,
"loss": 1.2766,
"step": 402
},
{
"epoch": 0.09911460895228726,
"grad_norm": 6.610905647277832,
"learning_rate": 3.962635201573255e-06,
"loss": 1.1534,
"step": 403
},
{
"epoch": 0.09936055090998525,
"grad_norm": 5.0190935134887695,
"learning_rate": 3.972468043264504e-06,
"loss": 1.1279,
"step": 404
},
{
"epoch": 0.09960649286768322,
"grad_norm": 4.573025226593018,
"learning_rate": 3.982300884955752e-06,
"loss": 1.1989,
"step": 405
},
{
"epoch": 0.09985243482538121,
"grad_norm": 5.331722259521484,
"learning_rate": 3.992133726647001e-06,
"loss": 1.3643,
"step": 406
},
{
"epoch": 0.10009837678307919,
"grad_norm": 4.918030261993408,
"learning_rate": 4.0019665683382506e-06,
"loss": 1.2749,
"step": 407
},
{
"epoch": 0.10034431874077718,
"grad_norm": 5.3670334815979,
"learning_rate": 4.011799410029498e-06,
"loss": 1.1294,
"step": 408
},
{
"epoch": 0.10059026069847515,
"grad_norm": 4.918808937072754,
"learning_rate": 4.021632251720748e-06,
"loss": 1.173,
"step": 409
},
{
"epoch": 0.10083620265617314,
"grad_norm": 4.805149078369141,
"learning_rate": 4.031465093411997e-06,
"loss": 1.2984,
"step": 410
},
{
"epoch": 0.10108214461387113,
"grad_norm": 4.879279136657715,
"learning_rate": 4.041297935103245e-06,
"loss": 1.0673,
"step": 411
},
{
"epoch": 0.10132808657156911,
"grad_norm": 5.045076847076416,
"learning_rate": 4.051130776794494e-06,
"loss": 1.1334,
"step": 412
},
{
"epoch": 0.1015740285292671,
"grad_norm": 5.247715473175049,
"learning_rate": 4.060963618485743e-06,
"loss": 1.3094,
"step": 413
},
{
"epoch": 0.10181997048696508,
"grad_norm": 4.646758079528809,
"learning_rate": 4.070796460176992e-06,
"loss": 1.1214,
"step": 414
},
{
"epoch": 0.10206591244466307,
"grad_norm": 5.251107215881348,
"learning_rate": 4.08062930186824e-06,
"loss": 1.2542,
"step": 415
},
{
"epoch": 0.10231185440236104,
"grad_norm": 5.118597030639648,
"learning_rate": 4.090462143559489e-06,
"loss": 1.3133,
"step": 416
},
{
"epoch": 0.10255779636005903,
"grad_norm": 4.799027919769287,
"learning_rate": 4.100294985250738e-06,
"loss": 1.1711,
"step": 417
},
{
"epoch": 0.102803738317757,
"grad_norm": 5.317622661590576,
"learning_rate": 4.110127826941987e-06,
"loss": 1.2629,
"step": 418
},
{
"epoch": 0.103049680275455,
"grad_norm": 5.737770080566406,
"learning_rate": 4.1199606686332355e-06,
"loss": 1.2928,
"step": 419
},
{
"epoch": 0.10329562223315297,
"grad_norm": 4.990627288818359,
"learning_rate": 4.129793510324484e-06,
"loss": 1.2763,
"step": 420
},
{
"epoch": 0.10354156419085096,
"grad_norm": 5.169764518737793,
"learning_rate": 4.139626352015733e-06,
"loss": 1.223,
"step": 421
},
{
"epoch": 0.10378750614854894,
"grad_norm": 5.123920440673828,
"learning_rate": 4.1494591937069815e-06,
"loss": 1.307,
"step": 422
},
{
"epoch": 0.10403344810624693,
"grad_norm": 5.006464004516602,
"learning_rate": 4.15929203539823e-06,
"loss": 1.2465,
"step": 423
},
{
"epoch": 0.1042793900639449,
"grad_norm": 4.915448188781738,
"learning_rate": 4.16912487708948e-06,
"loss": 1.1566,
"step": 424
},
{
"epoch": 0.1045253320216429,
"grad_norm": 5.088415145874023,
"learning_rate": 4.1789577187807275e-06,
"loss": 1.2864,
"step": 425
},
{
"epoch": 0.10477127397934087,
"grad_norm": 5.331376075744629,
"learning_rate": 4.188790560471977e-06,
"loss": 1.2436,
"step": 426
},
{
"epoch": 0.10501721593703886,
"grad_norm": 4.775264263153076,
"learning_rate": 4.198623402163226e-06,
"loss": 1.0795,
"step": 427
},
{
"epoch": 0.10526315789473684,
"grad_norm": 4.242771625518799,
"learning_rate": 4.208456243854474e-06,
"loss": 1.0548,
"step": 428
},
{
"epoch": 0.10550909985243483,
"grad_norm": 4.725460052490234,
"learning_rate": 4.218289085545723e-06,
"loss": 1.2087,
"step": 429
},
{
"epoch": 0.1057550418101328,
"grad_norm": 5.920872688293457,
"learning_rate": 4.228121927236972e-06,
"loss": 1.3336,
"step": 430
},
{
"epoch": 0.10600098376783079,
"grad_norm": 4.9771528244018555,
"learning_rate": 4.23795476892822e-06,
"loss": 1.2278,
"step": 431
},
{
"epoch": 0.10624692572552878,
"grad_norm": 4.9481000900268555,
"learning_rate": 4.247787610619469e-06,
"loss": 1.258,
"step": 432
},
{
"epoch": 0.10649286768322676,
"grad_norm": 5.391326904296875,
"learning_rate": 4.2576204523107186e-06,
"loss": 1.3197,
"step": 433
},
{
"epoch": 0.10673880964092475,
"grad_norm": 4.814979076385498,
"learning_rate": 4.267453294001967e-06,
"loss": 1.1974,
"step": 434
},
{
"epoch": 0.10698475159862272,
"grad_norm": 5.534710884094238,
"learning_rate": 4.277286135693216e-06,
"loss": 1.1649,
"step": 435
},
{
"epoch": 0.10723069355632071,
"grad_norm": 5.084470748901367,
"learning_rate": 4.2871189773844646e-06,
"loss": 1.2397,
"step": 436
},
{
"epoch": 0.10747663551401869,
"grad_norm": 5.160434246063232,
"learning_rate": 4.296951819075713e-06,
"loss": 1.0932,
"step": 437
},
{
"epoch": 0.10772257747171668,
"grad_norm": 5.5964436531066895,
"learning_rate": 4.306784660766962e-06,
"loss": 1.2056,
"step": 438
},
{
"epoch": 0.10796851942941466,
"grad_norm": 5.083948612213135,
"learning_rate": 4.316617502458211e-06,
"loss": 1.1508,
"step": 439
},
{
"epoch": 0.10821446138711265,
"grad_norm": 5.260007381439209,
"learning_rate": 4.326450344149459e-06,
"loss": 1.3275,
"step": 440
},
{
"epoch": 0.10846040334481062,
"grad_norm": 4.9198431968688965,
"learning_rate": 4.336283185840709e-06,
"loss": 1.2086,
"step": 441
},
{
"epoch": 0.10870634530250861,
"grad_norm": 5.611910343170166,
"learning_rate": 4.346116027531957e-06,
"loss": 1.1761,
"step": 442
},
{
"epoch": 0.10895228726020659,
"grad_norm": 4.570343017578125,
"learning_rate": 4.355948869223206e-06,
"loss": 1.0922,
"step": 443
},
{
"epoch": 0.10919822921790458,
"grad_norm": 5.35595703125,
"learning_rate": 4.365781710914455e-06,
"loss": 1.1758,
"step": 444
},
{
"epoch": 0.10944417117560255,
"grad_norm": 4.818915367126465,
"learning_rate": 4.3756145526057035e-06,
"loss": 1.1881,
"step": 445
},
{
"epoch": 0.10969011313330054,
"grad_norm": 4.709303855895996,
"learning_rate": 4.385447394296952e-06,
"loss": 1.1624,
"step": 446
},
{
"epoch": 0.10993605509099852,
"grad_norm": 4.7051920890808105,
"learning_rate": 4.395280235988201e-06,
"loss": 1.2707,
"step": 447
},
{
"epoch": 0.11018199704869651,
"grad_norm": 5.294489860534668,
"learning_rate": 4.4051130776794495e-06,
"loss": 1.2654,
"step": 448
},
{
"epoch": 0.11042793900639448,
"grad_norm": 4.858747482299805,
"learning_rate": 4.414945919370698e-06,
"loss": 1.2718,
"step": 449
},
{
"epoch": 0.11067388096409247,
"grad_norm": 5.114902496337891,
"learning_rate": 4.424778761061948e-06,
"loss": 1.1779,
"step": 450
},
{
"epoch": 0.11091982292179045,
"grad_norm": 5.001521110534668,
"learning_rate": 4.434611602753196e-06,
"loss": 1.2679,
"step": 451
},
{
"epoch": 0.11116576487948844,
"grad_norm": 5.0997090339660645,
"learning_rate": 4.444444444444444e-06,
"loss": 1.2805,
"step": 452
},
{
"epoch": 0.11141170683718643,
"grad_norm": 4.589293479919434,
"learning_rate": 4.454277286135694e-06,
"loss": 1.1483,
"step": 453
},
{
"epoch": 0.1116576487948844,
"grad_norm": 4.40261173248291,
"learning_rate": 4.464110127826942e-06,
"loss": 1.0719,
"step": 454
},
{
"epoch": 0.1119035907525824,
"grad_norm": 4.963085651397705,
"learning_rate": 4.473942969518191e-06,
"loss": 1.2143,
"step": 455
},
{
"epoch": 0.11214953271028037,
"grad_norm": 5.399011135101318,
"learning_rate": 4.48377581120944e-06,
"loss": 1.2583,
"step": 456
},
{
"epoch": 0.11239547466797836,
"grad_norm": 5.436919212341309,
"learning_rate": 4.493608652900689e-06,
"loss": 1.1758,
"step": 457
},
{
"epoch": 0.11264141662567634,
"grad_norm": 5.077342510223389,
"learning_rate": 4.503441494591938e-06,
"loss": 1.2029,
"step": 458
},
{
"epoch": 0.11288735858337433,
"grad_norm": 4.964205741882324,
"learning_rate": 4.513274336283186e-06,
"loss": 1.2779,
"step": 459
},
{
"epoch": 0.1131333005410723,
"grad_norm": 4.857316493988037,
"learning_rate": 4.523107177974435e-06,
"loss": 1.2442,
"step": 460
},
{
"epoch": 0.1133792424987703,
"grad_norm": 4.908639907836914,
"learning_rate": 4.532940019665684e-06,
"loss": 1.2528,
"step": 461
},
{
"epoch": 0.11362518445646827,
"grad_norm": 4.82950496673584,
"learning_rate": 4.5427728613569326e-06,
"loss": 1.1449,
"step": 462
},
{
"epoch": 0.11387112641416626,
"grad_norm": 4.523892879486084,
"learning_rate": 4.552605703048181e-06,
"loss": 1.1318,
"step": 463
},
{
"epoch": 0.11411706837186424,
"grad_norm": 4.862086772918701,
"learning_rate": 4.56243854473943e-06,
"loss": 1.3314,
"step": 464
},
{
"epoch": 0.11436301032956223,
"grad_norm": 4.365109920501709,
"learning_rate": 4.5722713864306786e-06,
"loss": 1.1623,
"step": 465
},
{
"epoch": 0.1146089522872602,
"grad_norm": 5.031253337860107,
"learning_rate": 4.582104228121927e-06,
"loss": 1.2939,
"step": 466
},
{
"epoch": 0.11485489424495819,
"grad_norm": 4.365376949310303,
"learning_rate": 4.591937069813177e-06,
"loss": 0.9707,
"step": 467
},
{
"epoch": 0.11510083620265617,
"grad_norm": 5.1764068603515625,
"learning_rate": 4.6017699115044254e-06,
"loss": 1.0728,
"step": 468
},
{
"epoch": 0.11534677816035416,
"grad_norm": 5.277358531951904,
"learning_rate": 4.611602753195674e-06,
"loss": 1.2572,
"step": 469
},
{
"epoch": 0.11559272011805213,
"grad_norm": 4.926379203796387,
"learning_rate": 4.621435594886923e-06,
"loss": 1.1625,
"step": 470
},
{
"epoch": 0.11583866207575012,
"grad_norm": 4.603894233703613,
"learning_rate": 4.6312684365781714e-06,
"loss": 1.1965,
"step": 471
},
{
"epoch": 0.1160846040334481,
"grad_norm": 4.757040023803711,
"learning_rate": 4.64110127826942e-06,
"loss": 1.0605,
"step": 472
},
{
"epoch": 0.11633054599114609,
"grad_norm": 5.073690891265869,
"learning_rate": 4.650934119960669e-06,
"loss": 1.1374,
"step": 473
},
{
"epoch": 0.11657648794884408,
"grad_norm": 4.405266284942627,
"learning_rate": 4.660766961651918e-06,
"loss": 1.1677,
"step": 474
},
{
"epoch": 0.11682242990654206,
"grad_norm": 4.773069858551025,
"learning_rate": 4.670599803343166e-06,
"loss": 1.2089,
"step": 475
},
{
"epoch": 0.11706837186424005,
"grad_norm": 5.514909744262695,
"learning_rate": 4.680432645034415e-06,
"loss": 1.2596,
"step": 476
},
{
"epoch": 0.11731431382193802,
"grad_norm": 4.623727321624756,
"learning_rate": 4.690265486725664e-06,
"loss": 1.0796,
"step": 477
},
{
"epoch": 0.11756025577963601,
"grad_norm": 4.393792629241943,
"learning_rate": 4.700098328416913e-06,
"loss": 1.0796,
"step": 478
},
{
"epoch": 0.11780619773733399,
"grad_norm": 4.487053394317627,
"learning_rate": 4.709931170108162e-06,
"loss": 1.0495,
"step": 479
},
{
"epoch": 0.11805213969503198,
"grad_norm": 5.203456878662109,
"learning_rate": 4.71976401179941e-06,
"loss": 1.3193,
"step": 480
},
{
"epoch": 0.11829808165272995,
"grad_norm": 4.877032279968262,
"learning_rate": 4.72959685349066e-06,
"loss": 1.0797,
"step": 481
},
{
"epoch": 0.11854402361042794,
"grad_norm": 5.07778787612915,
"learning_rate": 4.739429695181908e-06,
"loss": 1.1809,
"step": 482
},
{
"epoch": 0.11878996556812592,
"grad_norm": 4.726812362670898,
"learning_rate": 4.749262536873156e-06,
"loss": 1.257,
"step": 483
},
{
"epoch": 0.11903590752582391,
"grad_norm": 4.454743385314941,
"learning_rate": 4.759095378564406e-06,
"loss": 1.1319,
"step": 484
},
{
"epoch": 0.11928184948352188,
"grad_norm": 4.983044624328613,
"learning_rate": 4.7689282202556545e-06,
"loss": 1.189,
"step": 485
},
{
"epoch": 0.11952779144121987,
"grad_norm": 4.775365829467773,
"learning_rate": 4.778761061946903e-06,
"loss": 1.1589,
"step": 486
},
{
"epoch": 0.11977373339891785,
"grad_norm": 4.756986618041992,
"learning_rate": 4.788593903638152e-06,
"loss": 1.4408,
"step": 487
},
{
"epoch": 0.12001967535661584,
"grad_norm": 4.609254360198975,
"learning_rate": 4.7984267453294005e-06,
"loss": 1.1891,
"step": 488
},
{
"epoch": 0.12026561731431382,
"grad_norm": 4.845890045166016,
"learning_rate": 4.808259587020649e-06,
"loss": 1.3495,
"step": 489
},
{
"epoch": 0.1205115592720118,
"grad_norm": 4.711799621582031,
"learning_rate": 4.818092428711898e-06,
"loss": 1.201,
"step": 490
},
{
"epoch": 0.12075750122970978,
"grad_norm": 4.964114189147949,
"learning_rate": 4.827925270403147e-06,
"loss": 1.2051,
"step": 491
},
{
"epoch": 0.12100344318740777,
"grad_norm": 4.467300891876221,
"learning_rate": 4.837758112094395e-06,
"loss": 1.1129,
"step": 492
},
{
"epoch": 0.12124938514510576,
"grad_norm": 4.862575054168701,
"learning_rate": 4.847590953785645e-06,
"loss": 1.326,
"step": 493
},
{
"epoch": 0.12149532710280374,
"grad_norm": 5.180316925048828,
"learning_rate": 4.857423795476893e-06,
"loss": 1.1796,
"step": 494
},
{
"epoch": 0.12174126906050173,
"grad_norm": 5.124729633331299,
"learning_rate": 4.867256637168142e-06,
"loss": 1.2146,
"step": 495
},
{
"epoch": 0.1219872110181997,
"grad_norm": 4.446535587310791,
"learning_rate": 4.877089478859391e-06,
"loss": 1.2203,
"step": 496
},
{
"epoch": 0.1222331529758977,
"grad_norm": 4.789176940917969,
"learning_rate": 4.886922320550639e-06,
"loss": 1.1541,
"step": 497
},
{
"epoch": 0.12247909493359567,
"grad_norm": 5.0118021965026855,
"learning_rate": 4.896755162241888e-06,
"loss": 1.1759,
"step": 498
},
{
"epoch": 0.12272503689129366,
"grad_norm": 4.5681233406066895,
"learning_rate": 4.906588003933137e-06,
"loss": 1.2152,
"step": 499
},
{
"epoch": 0.12297097884899164,
"grad_norm": 4.892839431762695,
"learning_rate": 4.9164208456243854e-06,
"loss": 1.1747,
"step": 500
},
{
"epoch": 0.12297097884899164,
"eval_loss": 1.246882438659668,
"eval_runtime": 14.0958,
"eval_samples_per_second": 28.377,
"eval_steps_per_second": 3.547,
"step": 500
},
{
"epoch": 0.12321692080668963,
"grad_norm": 5.102154731750488,
"learning_rate": 4.926253687315635e-06,
"loss": 1.1638,
"step": 501
},
{
"epoch": 0.1234628627643876,
"grad_norm": 4.86465311050415,
"learning_rate": 4.936086529006884e-06,
"loss": 1.2714,
"step": 502
},
{
"epoch": 0.12370880472208559,
"grad_norm": 5.110489368438721,
"learning_rate": 4.945919370698132e-06,
"loss": 1.3355,
"step": 503
},
{
"epoch": 0.12395474667978357,
"grad_norm": 4.535679817199707,
"learning_rate": 4.955752212389381e-06,
"loss": 1.2786,
"step": 504
},
{
"epoch": 0.12420068863748156,
"grad_norm": 5.000831604003906,
"learning_rate": 4.96558505408063e-06,
"loss": 1.416,
"step": 505
},
{
"epoch": 0.12444663059517953,
"grad_norm": 4.667386054992676,
"learning_rate": 4.975417895771878e-06,
"loss": 1.0836,
"step": 506
},
{
"epoch": 0.12469257255287752,
"grad_norm": 4.80437707901001,
"learning_rate": 4.985250737463127e-06,
"loss": 1.2555,
"step": 507
},
{
"epoch": 0.1249385145105755,
"grad_norm": 4.6181840896606445,
"learning_rate": 4.9950835791543765e-06,
"loss": 1.2304,
"step": 508
},
{
"epoch": 0.12518445646827348,
"grad_norm": 4.956863880157471,
"learning_rate": 5.004916420845624e-06,
"loss": 1.1828,
"step": 509
},
{
"epoch": 0.12543039842597148,
"grad_norm": 5.052786350250244,
"learning_rate": 5.014749262536873e-06,
"loss": 1.3617,
"step": 510
},
{
"epoch": 0.12567634038366945,
"grad_norm": 4.308551788330078,
"learning_rate": 5.0245821042281225e-06,
"loss": 1.0029,
"step": 511
},
{
"epoch": 0.12592228234136743,
"grad_norm": 4.81613302230835,
"learning_rate": 5.034414945919371e-06,
"loss": 1.1776,
"step": 512
},
{
"epoch": 0.1261682242990654,
"grad_norm": 4.663824081420898,
"learning_rate": 5.04424778761062e-06,
"loss": 1.1285,
"step": 513
},
{
"epoch": 0.1264141662567634,
"grad_norm": 4.683495044708252,
"learning_rate": 5.0540806293018685e-06,
"loss": 1.0819,
"step": 514
},
{
"epoch": 0.1266601082144614,
"grad_norm": 5.0081658363342285,
"learning_rate": 5.063913470993117e-06,
"loss": 1.1986,
"step": 515
},
{
"epoch": 0.12690605017215936,
"grad_norm": 4.713033676147461,
"learning_rate": 5.073746312684367e-06,
"loss": 1.1248,
"step": 516
},
{
"epoch": 0.12715199212985737,
"grad_norm": 4.9138712882995605,
"learning_rate": 5.083579154375615e-06,
"loss": 1.2223,
"step": 517
},
{
"epoch": 0.12739793408755534,
"grad_norm": 5.038704872131348,
"learning_rate": 5.093411996066864e-06,
"loss": 1.3563,
"step": 518
},
{
"epoch": 0.12764387604525332,
"grad_norm": 5.295085906982422,
"learning_rate": 5.103244837758113e-06,
"loss": 1.1831,
"step": 519
},
{
"epoch": 0.1278898180029513,
"grad_norm": 4.539950370788574,
"learning_rate": 5.1130776794493605e-06,
"loss": 1.2325,
"step": 520
},
{
"epoch": 0.1281357599606493,
"grad_norm": 4.928675651550293,
"learning_rate": 5.122910521140611e-06,
"loss": 1.1826,
"step": 521
},
{
"epoch": 0.12838170191834727,
"grad_norm": 4.398011207580566,
"learning_rate": 5.132743362831859e-06,
"loss": 1.2337,
"step": 522
},
{
"epoch": 0.12862764387604525,
"grad_norm": 5.3660759925842285,
"learning_rate": 5.142576204523107e-06,
"loss": 1.2892,
"step": 523
},
{
"epoch": 0.12887358583374323,
"grad_norm": 4.625879764556885,
"learning_rate": 5.152409046214356e-06,
"loss": 1.2196,
"step": 524
},
{
"epoch": 0.12911952779144123,
"grad_norm": 5.0034379959106445,
"learning_rate": 5.162241887905605e-06,
"loss": 1.2916,
"step": 525
},
{
"epoch": 0.1293654697491392,
"grad_norm": 4.6537909507751465,
"learning_rate": 5.172074729596854e-06,
"loss": 1.0961,
"step": 526
},
{
"epoch": 0.12961141170683718,
"grad_norm": 4.614251136779785,
"learning_rate": 5.181907571288103e-06,
"loss": 1.2348,
"step": 527
},
{
"epoch": 0.12985735366453516,
"grad_norm": 5.129441738128662,
"learning_rate": 5.191740412979352e-06,
"loss": 1.2225,
"step": 528
},
{
"epoch": 0.13010329562223316,
"grad_norm": 5.125554084777832,
"learning_rate": 5.2015732546706e-06,
"loss": 1.1424,
"step": 529
},
{
"epoch": 0.13034923757993114,
"grad_norm": 4.839167594909668,
"learning_rate": 5.211406096361848e-06,
"loss": 1.2356,
"step": 530
},
{
"epoch": 0.1305951795376291,
"grad_norm": 4.581501007080078,
"learning_rate": 5.2212389380530985e-06,
"loss": 1.104,
"step": 531
},
{
"epoch": 0.1308411214953271,
"grad_norm": 4.44268274307251,
"learning_rate": 5.231071779744346e-06,
"loss": 1.1131,
"step": 532
},
{
"epoch": 0.1310870634530251,
"grad_norm": 4.769991874694824,
"learning_rate": 5.240904621435595e-06,
"loss": 1.2967,
"step": 533
},
{
"epoch": 0.13133300541072307,
"grad_norm": 5.067864418029785,
"learning_rate": 5.250737463126844e-06,
"loss": 1.2287,
"step": 534
},
{
"epoch": 0.13157894736842105,
"grad_norm": 4.370571613311768,
"learning_rate": 5.260570304818093e-06,
"loss": 1.0907,
"step": 535
},
{
"epoch": 0.13182488932611905,
"grad_norm": 5.013082981109619,
"learning_rate": 5.270403146509342e-06,
"loss": 1.2052,
"step": 536
},
{
"epoch": 0.13207083128381703,
"grad_norm": 4.653934478759766,
"learning_rate": 5.2802359882005905e-06,
"loss": 1.2515,
"step": 537
},
{
"epoch": 0.132316773241515,
"grad_norm": 4.941091060638428,
"learning_rate": 5.290068829891839e-06,
"loss": 1.1824,
"step": 538
},
{
"epoch": 0.13256271519921298,
"grad_norm": 4.829664707183838,
"learning_rate": 5.299901671583088e-06,
"loss": 1.2563,
"step": 539
},
{
"epoch": 0.13280865715691098,
"grad_norm": 5.109822750091553,
"learning_rate": 5.309734513274337e-06,
"loss": 1.2279,
"step": 540
},
{
"epoch": 0.13305459911460896,
"grad_norm": 4.6525349617004395,
"learning_rate": 5.319567354965586e-06,
"loss": 1.1423,
"step": 541
},
{
"epoch": 0.13330054107230693,
"grad_norm": 4.982812881469727,
"learning_rate": 5.329400196656835e-06,
"loss": 1.2093,
"step": 542
},
{
"epoch": 0.1335464830300049,
"grad_norm": 4.925893306732178,
"learning_rate": 5.3392330383480825e-06,
"loss": 1.3291,
"step": 543
},
{
"epoch": 0.1337924249877029,
"grad_norm": 4.730286598205566,
"learning_rate": 5.349065880039331e-06,
"loss": 1.3485,
"step": 544
},
{
"epoch": 0.1340383669454009,
"grad_norm": 5.008754730224609,
"learning_rate": 5.358898721730581e-06,
"loss": 1.081,
"step": 545
},
{
"epoch": 0.13428430890309886,
"grad_norm": 4.675521373748779,
"learning_rate": 5.368731563421829e-06,
"loss": 1.0322,
"step": 546
},
{
"epoch": 0.13453025086079684,
"grad_norm": 4.617642402648926,
"learning_rate": 5.378564405113078e-06,
"loss": 1.2254,
"step": 547
},
{
"epoch": 0.13477619281849484,
"grad_norm": 4.790796756744385,
"learning_rate": 5.388397246804327e-06,
"loss": 1.1191,
"step": 548
},
{
"epoch": 0.13502213477619282,
"grad_norm": 5.423696041107178,
"learning_rate": 5.398230088495575e-06,
"loss": 1.2991,
"step": 549
},
{
"epoch": 0.1352680767338908,
"grad_norm": 4.624082088470459,
"learning_rate": 5.408062930186825e-06,
"loss": 1.2973,
"step": 550
},
{
"epoch": 0.13551401869158877,
"grad_norm": 4.770191192626953,
"learning_rate": 5.417895771878074e-06,
"loss": 1.2202,
"step": 551
},
{
"epoch": 0.13575996064928678,
"grad_norm": 4.566011905670166,
"learning_rate": 5.427728613569322e-06,
"loss": 1.0959,
"step": 552
},
{
"epoch": 0.13600590260698475,
"grad_norm": 4.546421051025391,
"learning_rate": 5.43756145526057e-06,
"loss": 1.2048,
"step": 553
},
{
"epoch": 0.13625184456468273,
"grad_norm": 4.759641170501709,
"learning_rate": 5.447394296951819e-06,
"loss": 1.2146,
"step": 554
},
{
"epoch": 0.13649778652238073,
"grad_norm": 4.878152847290039,
"learning_rate": 5.457227138643068e-06,
"loss": 1.3211,
"step": 555
},
{
"epoch": 0.1367437284800787,
"grad_norm": 4.256135940551758,
"learning_rate": 5.467059980334317e-06,
"loss": 1.0766,
"step": 556
},
{
"epoch": 0.13698967043777668,
"grad_norm": 4.841675281524658,
"learning_rate": 5.476892822025566e-06,
"loss": 1.1326,
"step": 557
},
{
"epoch": 0.13723561239547466,
"grad_norm": 4.676751136779785,
"learning_rate": 5.486725663716814e-06,
"loss": 1.203,
"step": 558
},
{
"epoch": 0.13748155435317266,
"grad_norm": 4.749851226806641,
"learning_rate": 5.496558505408064e-06,
"loss": 1.305,
"step": 559
},
{
"epoch": 0.13772749631087064,
"grad_norm": 4.195345878601074,
"learning_rate": 5.5063913470993125e-06,
"loss": 1.101,
"step": 560
},
{
"epoch": 0.13797343826856862,
"grad_norm": 4.952095031738281,
"learning_rate": 5.516224188790561e-06,
"loss": 1.2328,
"step": 561
},
{
"epoch": 0.1382193802262666,
"grad_norm": 4.614315986633301,
"learning_rate": 5.52605703048181e-06,
"loss": 1.0974,
"step": 562
},
{
"epoch": 0.1384653221839646,
"grad_norm": 4.25603723526001,
"learning_rate": 5.5358898721730585e-06,
"loss": 1.1716,
"step": 563
},
{
"epoch": 0.13871126414166257,
"grad_norm": 5.431785583496094,
"learning_rate": 5.545722713864308e-06,
"loss": 1.2005,
"step": 564
},
{
"epoch": 0.13895720609936055,
"grad_norm": 4.843262195587158,
"learning_rate": 5.555555555555557e-06,
"loss": 1.233,
"step": 565
},
{
"epoch": 0.13920314805705852,
"grad_norm": 4.503073215484619,
"learning_rate": 5.5653883972468045e-06,
"loss": 1.1241,
"step": 566
},
{
"epoch": 0.13944909001475653,
"grad_norm": 4.525237083435059,
"learning_rate": 5.575221238938053e-06,
"loss": 1.1785,
"step": 567
},
{
"epoch": 0.1396950319724545,
"grad_norm": 4.783951759338379,
"learning_rate": 5.585054080629302e-06,
"loss": 1.1644,
"step": 568
},
{
"epoch": 0.13994097393015248,
"grad_norm": 4.7505693435668945,
"learning_rate": 5.594886922320551e-06,
"loss": 1.2428,
"step": 569
},
{
"epoch": 0.14018691588785046,
"grad_norm": 4.938000202178955,
"learning_rate": 5.6047197640118e-06,
"loss": 1.2633,
"step": 570
},
{
"epoch": 0.14043285784554846,
"grad_norm": 5.320477485656738,
"learning_rate": 5.614552605703049e-06,
"loss": 1.2276,
"step": 571
},
{
"epoch": 0.14067879980324643,
"grad_norm": 5.273099899291992,
"learning_rate": 5.624385447394297e-06,
"loss": 1.3871,
"step": 572
},
{
"epoch": 0.1409247417609444,
"grad_norm": 4.420525550842285,
"learning_rate": 5.634218289085546e-06,
"loss": 1.0948,
"step": 573
},
{
"epoch": 0.1411706837186424,
"grad_norm": 4.461489677429199,
"learning_rate": 5.6440511307767956e-06,
"loss": 1.195,
"step": 574
},
{
"epoch": 0.1414166256763404,
"grad_norm": 4.184628963470459,
"learning_rate": 5.653883972468044e-06,
"loss": 1.0963,
"step": 575
},
{
"epoch": 0.14166256763403837,
"grad_norm": 4.866041660308838,
"learning_rate": 5.663716814159292e-06,
"loss": 1.2925,
"step": 576
},
{
"epoch": 0.14190850959173634,
"grad_norm": 4.348908424377441,
"learning_rate": 5.673549655850541e-06,
"loss": 1.1307,
"step": 577
},
{
"epoch": 0.14215445154943435,
"grad_norm": 4.41391134262085,
"learning_rate": 5.683382497541789e-06,
"loss": 1.184,
"step": 578
},
{
"epoch": 0.14240039350713232,
"grad_norm": 5.683847904205322,
"learning_rate": 5.693215339233039e-06,
"loss": 1.2376,
"step": 579
},
{
"epoch": 0.1426463354648303,
"grad_norm": 4.768433570861816,
"learning_rate": 5.703048180924288e-06,
"loss": 1.2556,
"step": 580
},
{
"epoch": 0.14289227742252827,
"grad_norm": 4.496997833251953,
"learning_rate": 5.712881022615536e-06,
"loss": 1.2357,
"step": 581
},
{
"epoch": 0.14313821938022628,
"grad_norm": 4.176562309265137,
"learning_rate": 5.722713864306785e-06,
"loss": 1.1392,
"step": 582
},
{
"epoch": 0.14338416133792425,
"grad_norm": 4.480749130249023,
"learning_rate": 5.7325467059980344e-06,
"loss": 1.2092,
"step": 583
},
{
"epoch": 0.14363010329562223,
"grad_norm": 4.647942066192627,
"learning_rate": 5.742379547689283e-06,
"loss": 1.1389,
"step": 584
},
{
"epoch": 0.1438760452533202,
"grad_norm": 5.0272088050842285,
"learning_rate": 5.752212389380532e-06,
"loss": 1.2913,
"step": 585
},
{
"epoch": 0.1441219872110182,
"grad_norm": 4.6475138664245605,
"learning_rate": 5.7620452310717805e-06,
"loss": 1.344,
"step": 586
},
{
"epoch": 0.14436792916871619,
"grad_norm": 4.788149833679199,
"learning_rate": 5.771878072763028e-06,
"loss": 1.1309,
"step": 587
},
{
"epoch": 0.14461387112641416,
"grad_norm": 4.921096324920654,
"learning_rate": 5.781710914454279e-06,
"loss": 1.0923,
"step": 588
},
{
"epoch": 0.14485981308411214,
"grad_norm": 4.68422794342041,
"learning_rate": 5.7915437561455265e-06,
"loss": 1.3362,
"step": 589
},
{
"epoch": 0.14510575504181014,
"grad_norm": 4.573172092437744,
"learning_rate": 5.801376597836775e-06,
"loss": 1.2612,
"step": 590
},
{
"epoch": 0.14535169699950812,
"grad_norm": 4.764773845672607,
"learning_rate": 5.811209439528024e-06,
"loss": 1.1988,
"step": 591
},
{
"epoch": 0.1455976389572061,
"grad_norm": 4.6122541427612305,
"learning_rate": 5.8210422812192725e-06,
"loss": 1.2785,
"step": 592
},
{
"epoch": 0.14584358091490407,
"grad_norm": 4.5264153480529785,
"learning_rate": 5.830875122910522e-06,
"loss": 1.1383,
"step": 593
},
{
"epoch": 0.14608952287260207,
"grad_norm": 4.330212593078613,
"learning_rate": 5.840707964601771e-06,
"loss": 1.1764,
"step": 594
},
{
"epoch": 0.14633546483030005,
"grad_norm": 5.840691566467285,
"learning_rate": 5.850540806293019e-06,
"loss": 1.3946,
"step": 595
},
{
"epoch": 0.14658140678799803,
"grad_norm": 4.687470436096191,
"learning_rate": 5.860373647984268e-06,
"loss": 1.2309,
"step": 596
},
{
"epoch": 0.14682734874569603,
"grad_norm": 4.160153865814209,
"learning_rate": 5.870206489675516e-06,
"loss": 1.0558,
"step": 597
},
{
"epoch": 0.147073290703394,
"grad_norm": 4.739638805389404,
"learning_rate": 5.880039331366766e-06,
"loss": 1.2991,
"step": 598
},
{
"epoch": 0.14731923266109198,
"grad_norm": 4.748146057128906,
"learning_rate": 5.889872173058014e-06,
"loss": 1.1726,
"step": 599
},
{
"epoch": 0.14756517461878996,
"grad_norm": 4.089597702026367,
"learning_rate": 5.899705014749263e-06,
"loss": 1.1802,
"step": 600
},
{
"epoch": 0.14781111657648796,
"grad_norm": 5.117847919464111,
"learning_rate": 5.909537856440511e-06,
"loss": 1.0839,
"step": 601
},
{
"epoch": 0.14805705853418594,
"grad_norm": 5.5169596672058105,
"learning_rate": 5.91937069813176e-06,
"loss": 1.2967,
"step": 602
},
{
"epoch": 0.1483030004918839,
"grad_norm": 4.876138210296631,
"learning_rate": 5.9292035398230096e-06,
"loss": 1.2001,
"step": 603
},
{
"epoch": 0.1485489424495819,
"grad_norm": 4.785106658935547,
"learning_rate": 5.939036381514258e-06,
"loss": 1.1802,
"step": 604
},
{
"epoch": 0.1487948844072799,
"grad_norm": 4.597662925720215,
"learning_rate": 5.948869223205507e-06,
"loss": 1.2868,
"step": 605
},
{
"epoch": 0.14904082636497787,
"grad_norm": 4.744992256164551,
"learning_rate": 5.9587020648967556e-06,
"loss": 1.1582,
"step": 606
},
{
"epoch": 0.14928676832267584,
"grad_norm": 4.8757829666137695,
"learning_rate": 5.968534906588005e-06,
"loss": 1.2469,
"step": 607
},
{
"epoch": 0.14953271028037382,
"grad_norm": 4.624231338500977,
"learning_rate": 5.978367748279254e-06,
"loss": 1.1641,
"step": 608
},
{
"epoch": 0.14977865223807182,
"grad_norm": 4.746432781219482,
"learning_rate": 5.9882005899705024e-06,
"loss": 1.1033,
"step": 609
},
{
"epoch": 0.1500245941957698,
"grad_norm": 5.202457427978516,
"learning_rate": 5.99803343166175e-06,
"loss": 1.1433,
"step": 610
},
{
"epoch": 0.15027053615346778,
"grad_norm": 4.58802604675293,
"learning_rate": 6.007866273352999e-06,
"loss": 1.2122,
"step": 611
},
{
"epoch": 0.15051647811116575,
"grad_norm": 4.548409461975098,
"learning_rate": 6.0176991150442484e-06,
"loss": 1.2709,
"step": 612
},
{
"epoch": 0.15076242006886376,
"grad_norm": 5.120619297027588,
"learning_rate": 6.027531956735497e-06,
"loss": 1.2186,
"step": 613
},
{
"epoch": 0.15100836202656173,
"grad_norm": 4.834990978240967,
"learning_rate": 6.037364798426746e-06,
"loss": 1.2919,
"step": 614
},
{
"epoch": 0.1512543039842597,
"grad_norm": 4.7802653312683105,
"learning_rate": 6.0471976401179945e-06,
"loss": 1.3145,
"step": 615
},
{
"epoch": 0.15150024594195768,
"grad_norm": 4.659482002258301,
"learning_rate": 6.057030481809243e-06,
"loss": 1.265,
"step": 616
},
{
"epoch": 0.1517461878996557,
"grad_norm": 4.644939422607422,
"learning_rate": 6.066863323500493e-06,
"loss": 1.3028,
"step": 617
},
{
"epoch": 0.15199212985735366,
"grad_norm": 4.371776103973389,
"learning_rate": 6.076696165191741e-06,
"loss": 1.0834,
"step": 618
},
{
"epoch": 0.15223807181505164,
"grad_norm": 4.938658714294434,
"learning_rate": 6.08652900688299e-06,
"loss": 1.2881,
"step": 619
},
{
"epoch": 0.15248401377274964,
"grad_norm": 5.018941879272461,
"learning_rate": 6.096361848574238e-06,
"loss": 1.417,
"step": 620
},
{
"epoch": 0.15272995573044762,
"grad_norm": 4.695444583892822,
"learning_rate": 6.1061946902654865e-06,
"loss": 1.1562,
"step": 621
},
{
"epoch": 0.1529758976881456,
"grad_norm": 4.598396301269531,
"learning_rate": 6.116027531956736e-06,
"loss": 1.0854,
"step": 622
},
{
"epoch": 0.15322183964584357,
"grad_norm": 4.541428089141846,
"learning_rate": 6.125860373647985e-06,
"loss": 1.1469,
"step": 623
},
{
"epoch": 0.15346778160354158,
"grad_norm": 4.353450775146484,
"learning_rate": 6.135693215339233e-06,
"loss": 1.2039,
"step": 624
},
{
"epoch": 0.15371372356123955,
"grad_norm": 4.244394779205322,
"learning_rate": 6.145526057030482e-06,
"loss": 1.0607,
"step": 625
},
{
"epoch": 0.15395966551893753,
"grad_norm": 5.115127086639404,
"learning_rate": 6.155358898721731e-06,
"loss": 1.1903,
"step": 626
},
{
"epoch": 0.1542056074766355,
"grad_norm": 5.059081554412842,
"learning_rate": 6.16519174041298e-06,
"loss": 1.3584,
"step": 627
},
{
"epoch": 0.1544515494343335,
"grad_norm": 5.2593560218811035,
"learning_rate": 6.175024582104229e-06,
"loss": 1.3378,
"step": 628
},
{
"epoch": 0.15469749139203148,
"grad_norm": 4.927911758422852,
"learning_rate": 6.1848574237954775e-06,
"loss": 1.2336,
"step": 629
},
{
"epoch": 0.15494343334972946,
"grad_norm": 4.760716438293457,
"learning_rate": 6.194690265486726e-06,
"loss": 1.1598,
"step": 630
},
{
"epoch": 0.15518937530742744,
"grad_norm": 4.522549629211426,
"learning_rate": 6.204523107177976e-06,
"loss": 1.244,
"step": 631
},
{
"epoch": 0.15543531726512544,
"grad_norm": 4.8928375244140625,
"learning_rate": 6.214355948869224e-06,
"loss": 1.2518,
"step": 632
},
{
"epoch": 0.15568125922282341,
"grad_norm": 4.510464668273926,
"learning_rate": 6.224188790560472e-06,
"loss": 1.2219,
"step": 633
},
{
"epoch": 0.1559272011805214,
"grad_norm": 4.6836652755737305,
"learning_rate": 6.234021632251721e-06,
"loss": 1.256,
"step": 634
},
{
"epoch": 0.15617314313821937,
"grad_norm": 4.836663722991943,
"learning_rate": 6.2438544739429696e-06,
"loss": 1.3098,
"step": 635
},
{
"epoch": 0.15641908509591737,
"grad_norm": 4.249446392059326,
"learning_rate": 6.253687315634219e-06,
"loss": 1.185,
"step": 636
},
{
"epoch": 0.15666502705361535,
"grad_norm": 4.641504287719727,
"learning_rate": 6.263520157325468e-06,
"loss": 1.1505,
"step": 637
},
{
"epoch": 0.15691096901131332,
"grad_norm": 4.881726264953613,
"learning_rate": 6.273352999016716e-06,
"loss": 1.1778,
"step": 638
},
{
"epoch": 0.15715691096901133,
"grad_norm": 5.355141639709473,
"learning_rate": 6.283185840707965e-06,
"loss": 1.3089,
"step": 639
},
{
"epoch": 0.1574028529267093,
"grad_norm": 4.716489791870117,
"learning_rate": 6.293018682399214e-06,
"loss": 1.3045,
"step": 640
},
{
"epoch": 0.15764879488440728,
"grad_norm": 5.36386251449585,
"learning_rate": 6.302851524090463e-06,
"loss": 1.2879,
"step": 641
},
{
"epoch": 0.15789473684210525,
"grad_norm": 4.525503635406494,
"learning_rate": 6.312684365781712e-06,
"loss": 1.2046,
"step": 642
},
{
"epoch": 0.15814067879980326,
"grad_norm": 4.257623672485352,
"learning_rate": 6.32251720747296e-06,
"loss": 1.2597,
"step": 643
},
{
"epoch": 0.15838662075750123,
"grad_norm": 4.373297691345215,
"learning_rate": 6.3323500491642084e-06,
"loss": 1.1628,
"step": 644
},
{
"epoch": 0.1586325627151992,
"grad_norm": 4.439374923706055,
"learning_rate": 6.342182890855457e-06,
"loss": 1.208,
"step": 645
},
{
"epoch": 0.1588785046728972,
"grad_norm": 4.480494499206543,
"learning_rate": 6.352015732546707e-06,
"loss": 1.2424,
"step": 646
},
{
"epoch": 0.1591244466305952,
"grad_norm": 4.689266204833984,
"learning_rate": 6.361848574237955e-06,
"loss": 1.3095,
"step": 647
},
{
"epoch": 0.15937038858829317,
"grad_norm": 4.431936740875244,
"learning_rate": 6.371681415929204e-06,
"loss": 1.1763,
"step": 648
},
{
"epoch": 0.15961633054599114,
"grad_norm": 5.228087902069092,
"learning_rate": 6.381514257620453e-06,
"loss": 1.1787,
"step": 649
},
{
"epoch": 0.15986227250368912,
"grad_norm": 5.212238311767578,
"learning_rate": 6.391347099311701e-06,
"loss": 1.3186,
"step": 650
},
{
"epoch": 0.16010821446138712,
"grad_norm": 5.05730676651001,
"learning_rate": 6.401179941002951e-06,
"loss": 1.3094,
"step": 651
},
{
"epoch": 0.1603541564190851,
"grad_norm": 4.334603309631348,
"learning_rate": 6.4110127826941995e-06,
"loss": 1.0579,
"step": 652
},
{
"epoch": 0.16060009837678307,
"grad_norm": 4.535286903381348,
"learning_rate": 6.420845624385448e-06,
"loss": 1.0948,
"step": 653
},
{
"epoch": 0.16084604033448105,
"grad_norm": 4.962480068206787,
"learning_rate": 6.430678466076696e-06,
"loss": 1.2605,
"step": 654
},
{
"epoch": 0.16109198229217905,
"grad_norm": 4.703233242034912,
"learning_rate": 6.440511307767946e-06,
"loss": 1.4021,
"step": 655
},
{
"epoch": 0.16133792424987703,
"grad_norm": 4.72304105758667,
"learning_rate": 6.450344149459194e-06,
"loss": 1.1541,
"step": 656
},
{
"epoch": 0.161583866207575,
"grad_norm": 4.576914310455322,
"learning_rate": 6.460176991150443e-06,
"loss": 1.1117,
"step": 657
},
{
"epoch": 0.161829808165273,
"grad_norm": 4.7307257652282715,
"learning_rate": 6.4700098328416915e-06,
"loss": 1.2084,
"step": 658
},
{
"epoch": 0.16207575012297099,
"grad_norm": 4.41452169418335,
"learning_rate": 6.47984267453294e-06,
"loss": 1.1961,
"step": 659
},
{
"epoch": 0.16232169208066896,
"grad_norm": 5.096524715423584,
"learning_rate": 6.48967551622419e-06,
"loss": 1.2748,
"step": 660
},
{
"epoch": 0.16256763403836694,
"grad_norm": 5.203492641448975,
"learning_rate": 6.499508357915438e-06,
"loss": 1.3616,
"step": 661
},
{
"epoch": 0.16281357599606494,
"grad_norm": 4.428399562835693,
"learning_rate": 6.509341199606687e-06,
"loss": 1.2322,
"step": 662
},
{
"epoch": 0.16305951795376292,
"grad_norm": 4.3599653244018555,
"learning_rate": 6.519174041297936e-06,
"loss": 1.2108,
"step": 663
},
{
"epoch": 0.1633054599114609,
"grad_norm": 4.965785980224609,
"learning_rate": 6.5290068829891836e-06,
"loss": 1.2273,
"step": 664
},
{
"epoch": 0.16355140186915887,
"grad_norm": 4.865367412567139,
"learning_rate": 6.538839724680434e-06,
"loss": 1.2612,
"step": 665
},
{
"epoch": 0.16379734382685687,
"grad_norm": 4.749738693237305,
"learning_rate": 6.548672566371682e-06,
"loss": 1.1622,
"step": 666
},
{
"epoch": 0.16404328578455485,
"grad_norm": 4.676555156707764,
"learning_rate": 6.55850540806293e-06,
"loss": 1.1769,
"step": 667
},
{
"epoch": 0.16428922774225282,
"grad_norm": 4.3011860847473145,
"learning_rate": 6.568338249754179e-06,
"loss": 1.0876,
"step": 668
},
{
"epoch": 0.1645351696999508,
"grad_norm": 5.430131912231445,
"learning_rate": 6.578171091445428e-06,
"loss": 1.127,
"step": 669
},
{
"epoch": 0.1647811116576488,
"grad_norm": 4.6941070556640625,
"learning_rate": 6.588003933136677e-06,
"loss": 1.1392,
"step": 670
},
{
"epoch": 0.16502705361534678,
"grad_norm": 4.4265007972717285,
"learning_rate": 6.597836774827926e-06,
"loss": 1.168,
"step": 671
},
{
"epoch": 0.16527299557304476,
"grad_norm": 4.980653762817383,
"learning_rate": 6.607669616519175e-06,
"loss": 1.1834,
"step": 672
},
{
"epoch": 0.16551893753074273,
"grad_norm": 4.849818229675293,
"learning_rate": 6.617502458210423e-06,
"loss": 1.2892,
"step": 673
},
{
"epoch": 0.16576487948844074,
"grad_norm": 4.92747163772583,
"learning_rate": 6.627335299901672e-06,
"loss": 1.1305,
"step": 674
},
{
"epoch": 0.1660108214461387,
"grad_norm": 4.924493789672852,
"learning_rate": 6.6371681415929215e-06,
"loss": 1.3108,
"step": 675
},
{
"epoch": 0.1662567634038367,
"grad_norm": 4.650369167327881,
"learning_rate": 6.64700098328417e-06,
"loss": 1.1135,
"step": 676
},
{
"epoch": 0.16650270536153466,
"grad_norm": 4.913872718811035,
"learning_rate": 6.656833824975418e-06,
"loss": 1.2553,
"step": 677
},
{
"epoch": 0.16674864731923267,
"grad_norm": 4.929379940032959,
"learning_rate": 6.666666666666667e-06,
"loss": 1.1531,
"step": 678
},
{
"epoch": 0.16699458927693064,
"grad_norm": 4.725403785705566,
"learning_rate": 6.676499508357916e-06,
"loss": 1.1306,
"step": 679
},
{
"epoch": 0.16724053123462862,
"grad_norm": 4.480646133422852,
"learning_rate": 6.686332350049165e-06,
"loss": 1.2076,
"step": 680
},
{
"epoch": 0.16748647319232662,
"grad_norm": 4.609620094299316,
"learning_rate": 6.6961651917404135e-06,
"loss": 1.0934,
"step": 681
},
{
"epoch": 0.1677324151500246,
"grad_norm": 4.368680953979492,
"learning_rate": 6.705998033431662e-06,
"loss": 1.1499,
"step": 682
},
{
"epoch": 0.16797835710772258,
"grad_norm": 5.038846969604492,
"learning_rate": 6.715830875122911e-06,
"loss": 1.4123,
"step": 683
},
{
"epoch": 0.16822429906542055,
"grad_norm": 4.79526424407959,
"learning_rate": 6.72566371681416e-06,
"loss": 1.206,
"step": 684
},
{
"epoch": 0.16847024102311856,
"grad_norm": 4.653499126434326,
"learning_rate": 6.735496558505409e-06,
"loss": 1.1216,
"step": 685
},
{
"epoch": 0.16871618298081653,
"grad_norm": 4.479743003845215,
"learning_rate": 6.745329400196658e-06,
"loss": 1.2897,
"step": 686
},
{
"epoch": 0.1689621249385145,
"grad_norm": 5.060058116912842,
"learning_rate": 6.7551622418879055e-06,
"loss": 1.3809,
"step": 687
},
{
"epoch": 0.16920806689621248,
"grad_norm": 4.66033411026001,
"learning_rate": 6.764995083579154e-06,
"loss": 1.1942,
"step": 688
},
{
"epoch": 0.1694540088539105,
"grad_norm": 4.765096664428711,
"learning_rate": 6.774827925270404e-06,
"loss": 1.2669,
"step": 689
},
{
"epoch": 0.16969995081160846,
"grad_norm": 4.375154495239258,
"learning_rate": 6.784660766961652e-06,
"loss": 1.2393,
"step": 690
},
{
"epoch": 0.16994589276930644,
"grad_norm": 4.557894229888916,
"learning_rate": 6.794493608652901e-06,
"loss": 1.3821,
"step": 691
},
{
"epoch": 0.17019183472700442,
"grad_norm": 4.324876308441162,
"learning_rate": 6.80432645034415e-06,
"loss": 1.1423,
"step": 692
},
{
"epoch": 0.17043777668470242,
"grad_norm": 4.888765335083008,
"learning_rate": 6.814159292035398e-06,
"loss": 1.3937,
"step": 693
},
{
"epoch": 0.1706837186424004,
"grad_norm": 4.992369174957275,
"learning_rate": 6.823992133726648e-06,
"loss": 1.2522,
"step": 694
},
{
"epoch": 0.17092966060009837,
"grad_norm": 4.902792453765869,
"learning_rate": 6.833824975417897e-06,
"loss": 1.2794,
"step": 695
},
{
"epoch": 0.17117560255779635,
"grad_norm": 4.9026384353637695,
"learning_rate": 6.843657817109145e-06,
"loss": 1.3031,
"step": 696
},
{
"epoch": 0.17142154451549435,
"grad_norm": 4.355241775512695,
"learning_rate": 6.853490658800394e-06,
"loss": 1.1751,
"step": 697
},
{
"epoch": 0.17166748647319233,
"grad_norm": 5.023930549621582,
"learning_rate": 6.8633235004916435e-06,
"loss": 1.3521,
"step": 698
},
{
"epoch": 0.1719134284308903,
"grad_norm": 5.097218990325928,
"learning_rate": 6.873156342182892e-06,
"loss": 1.159,
"step": 699
},
{
"epoch": 0.1721593703885883,
"grad_norm": 4.883558750152588,
"learning_rate": 6.88298918387414e-06,
"loss": 1.2252,
"step": 700
},
{
"epoch": 0.17240531234628628,
"grad_norm": 4.6442742347717285,
"learning_rate": 6.892822025565389e-06,
"loss": 1.2674,
"step": 701
},
{
"epoch": 0.17265125430398426,
"grad_norm": 4.964976787567139,
"learning_rate": 6.902654867256637e-06,
"loss": 1.1653,
"step": 702
},
{
"epoch": 0.17289719626168223,
"grad_norm": 4.324906349182129,
"learning_rate": 6.912487708947887e-06,
"loss": 1.1451,
"step": 703
},
{
"epoch": 0.17314313821938024,
"grad_norm": 4.339810848236084,
"learning_rate": 6.9223205506391355e-06,
"loss": 1.192,
"step": 704
},
{
"epoch": 0.17338908017707821,
"grad_norm": 4.527713298797607,
"learning_rate": 6.932153392330384e-06,
"loss": 1.1726,
"step": 705
},
{
"epoch": 0.1736350221347762,
"grad_norm": 5.391871452331543,
"learning_rate": 6.941986234021633e-06,
"loss": 1.2844,
"step": 706
},
{
"epoch": 0.17388096409247417,
"grad_norm": 5.112172603607178,
"learning_rate": 6.9518190757128815e-06,
"loss": 1.2985,
"step": 707
},
{
"epoch": 0.17412690605017217,
"grad_norm": 4.9675679206848145,
"learning_rate": 6.961651917404131e-06,
"loss": 1.2328,
"step": 708
},
{
"epoch": 0.17437284800787015,
"grad_norm": 4.7706122398376465,
"learning_rate": 6.97148475909538e-06,
"loss": 1.3077,
"step": 709
},
{
"epoch": 0.17461878996556812,
"grad_norm": 4.689836025238037,
"learning_rate": 6.9813176007866275e-06,
"loss": 1.2856,
"step": 710
},
{
"epoch": 0.1748647319232661,
"grad_norm": 4.635600566864014,
"learning_rate": 6.991150442477876e-06,
"loss": 1.2713,
"step": 711
},
{
"epoch": 0.1751106738809641,
"grad_norm": 4.609842777252197,
"learning_rate": 7.000983284169125e-06,
"loss": 1.2116,
"step": 712
},
{
"epoch": 0.17535661583866208,
"grad_norm": 4.451179027557373,
"learning_rate": 7.010816125860374e-06,
"loss": 1.2297,
"step": 713
},
{
"epoch": 0.17560255779636005,
"grad_norm": 4.454504489898682,
"learning_rate": 7.020648967551623e-06,
"loss": 1.1874,
"step": 714
},
{
"epoch": 0.17584849975405803,
"grad_norm": 4.490920066833496,
"learning_rate": 7.030481809242872e-06,
"loss": 1.1776,
"step": 715
},
{
"epoch": 0.17609444171175603,
"grad_norm": 4.469358921051025,
"learning_rate": 7.04031465093412e-06,
"loss": 1.1896,
"step": 716
},
{
"epoch": 0.176340383669454,
"grad_norm": 4.837884426116943,
"learning_rate": 7.050147492625369e-06,
"loss": 1.3467,
"step": 717
},
{
"epoch": 0.17658632562715199,
"grad_norm": 4.911612510681152,
"learning_rate": 7.0599803343166186e-06,
"loss": 1.2096,
"step": 718
},
{
"epoch": 0.17683226758485,
"grad_norm": 4.676964282989502,
"learning_rate": 7.069813176007867e-06,
"loss": 1.1994,
"step": 719
},
{
"epoch": 0.17707820954254797,
"grad_norm": 4.815658092498779,
"learning_rate": 7.079646017699116e-06,
"loss": 1.2371,
"step": 720
},
{
"epoch": 0.17732415150024594,
"grad_norm": 4.751783847808838,
"learning_rate": 7.089478859390364e-06,
"loss": 1.2446,
"step": 721
},
{
"epoch": 0.17757009345794392,
"grad_norm": 4.765551567077637,
"learning_rate": 7.099311701081614e-06,
"loss": 1.3024,
"step": 722
},
{
"epoch": 0.17781603541564192,
"grad_norm": 4.441276550292969,
"learning_rate": 7.109144542772862e-06,
"loss": 1.187,
"step": 723
},
{
"epoch": 0.1780619773733399,
"grad_norm": 4.236655235290527,
"learning_rate": 7.118977384464111e-06,
"loss": 1.1912,
"step": 724
},
{
"epoch": 0.17830791933103787,
"grad_norm": 4.456976413726807,
"learning_rate": 7.128810226155359e-06,
"loss": 1.2175,
"step": 725
},
{
"epoch": 0.17855386128873585,
"grad_norm": 4.898047924041748,
"learning_rate": 7.138643067846608e-06,
"loss": 1.4176,
"step": 726
},
{
"epoch": 0.17879980324643385,
"grad_norm": 4.41334867477417,
"learning_rate": 7.1484759095378575e-06,
"loss": 1.3333,
"step": 727
},
{
"epoch": 0.17904574520413183,
"grad_norm": 5.014675617218018,
"learning_rate": 7.158308751229106e-06,
"loss": 1.2328,
"step": 728
},
{
"epoch": 0.1792916871618298,
"grad_norm": 5.426655292510986,
"learning_rate": 7.168141592920355e-06,
"loss": 1.0858,
"step": 729
},
{
"epoch": 0.17953762911952778,
"grad_norm": 4.244655609130859,
"learning_rate": 7.1779744346116035e-06,
"loss": 1.2687,
"step": 730
},
{
"epoch": 0.17978357107722578,
"grad_norm": 4.431658744812012,
"learning_rate": 7.187807276302851e-06,
"loss": 1.1153,
"step": 731
},
{
"epoch": 0.18002951303492376,
"grad_norm": 4.588261604309082,
"learning_rate": 7.197640117994102e-06,
"loss": 1.2852,
"step": 732
},
{
"epoch": 0.18027545499262174,
"grad_norm": 4.950037002563477,
"learning_rate": 7.2074729596853495e-06,
"loss": 1.4428,
"step": 733
},
{
"epoch": 0.1805213969503197,
"grad_norm": 5.193053245544434,
"learning_rate": 7.217305801376598e-06,
"loss": 1.3618,
"step": 734
},
{
"epoch": 0.18076733890801772,
"grad_norm": 4.511353969573975,
"learning_rate": 7.227138643067847e-06,
"loss": 1.2168,
"step": 735
},
{
"epoch": 0.1810132808657157,
"grad_norm": 4.991962432861328,
"learning_rate": 7.2369714847590955e-06,
"loss": 1.2608,
"step": 736
},
{
"epoch": 0.18125922282341367,
"grad_norm": 4.3239874839782715,
"learning_rate": 7.246804326450345e-06,
"loss": 1.1895,
"step": 737
},
{
"epoch": 0.18150516478111164,
"grad_norm": 4.280043601989746,
"learning_rate": 7.256637168141594e-06,
"loss": 1.2532,
"step": 738
},
{
"epoch": 0.18175110673880965,
"grad_norm": 4.529305934906006,
"learning_rate": 7.266470009832842e-06,
"loss": 1.2478,
"step": 739
},
{
"epoch": 0.18199704869650762,
"grad_norm": 4.648525238037109,
"learning_rate": 7.276302851524091e-06,
"loss": 1.3039,
"step": 740
},
{
"epoch": 0.1822429906542056,
"grad_norm": 4.457572937011719,
"learning_rate": 7.28613569321534e-06,
"loss": 1.1257,
"step": 741
},
{
"epoch": 0.1824889326119036,
"grad_norm": 4.509995460510254,
"learning_rate": 7.295968534906589e-06,
"loss": 1.1851,
"step": 742
},
{
"epoch": 0.18273487456960158,
"grad_norm": 4.390313148498535,
"learning_rate": 7.305801376597838e-06,
"loss": 1.2793,
"step": 743
},
{
"epoch": 0.18298081652729956,
"grad_norm": 4.375789165496826,
"learning_rate": 7.315634218289086e-06,
"loss": 1.3082,
"step": 744
},
{
"epoch": 0.18322675848499753,
"grad_norm": 4.570054054260254,
"learning_rate": 7.325467059980334e-06,
"loss": 1.172,
"step": 745
},
{
"epoch": 0.18347270044269554,
"grad_norm": 4.373350620269775,
"learning_rate": 7.335299901671584e-06,
"loss": 1.1806,
"step": 746
},
{
"epoch": 0.1837186424003935,
"grad_norm": 5.120793342590332,
"learning_rate": 7.3451327433628326e-06,
"loss": 1.265,
"step": 747
},
{
"epoch": 0.1839645843580915,
"grad_norm": 4.843412399291992,
"learning_rate": 7.354965585054081e-06,
"loss": 1.2129,
"step": 748
},
{
"epoch": 0.18421052631578946,
"grad_norm": 4.784841537475586,
"learning_rate": 7.36479842674533e-06,
"loss": 1.2482,
"step": 749
},
{
"epoch": 0.18445646827348747,
"grad_norm": 4.252614974975586,
"learning_rate": 7.374631268436579e-06,
"loss": 1.0728,
"step": 750
},
{
"epoch": 0.18470241023118544,
"grad_norm": 4.374537944793701,
"learning_rate": 7.384464110127828e-06,
"loss": 1.1011,
"step": 751
},
{
"epoch": 0.18494835218888342,
"grad_norm": 4.210338592529297,
"learning_rate": 7.394296951819077e-06,
"loss": 1.2745,
"step": 752
},
{
"epoch": 0.1851942941465814,
"grad_norm": 4.752880573272705,
"learning_rate": 7.4041297935103254e-06,
"loss": 1.3164,
"step": 753
},
{
"epoch": 0.1854402361042794,
"grad_norm": 4.58962869644165,
"learning_rate": 7.413962635201573e-06,
"loss": 1.1629,
"step": 754
},
{
"epoch": 0.18568617806197737,
"grad_norm": 4.409331321716309,
"learning_rate": 7.423795476892822e-06,
"loss": 1.1616,
"step": 755
},
{
"epoch": 0.18593212001967535,
"grad_norm": 4.713413238525391,
"learning_rate": 7.4336283185840714e-06,
"loss": 1.1744,
"step": 756
},
{
"epoch": 0.18617806197737333,
"grad_norm": 4.763073921203613,
"learning_rate": 7.44346116027532e-06,
"loss": 1.3358,
"step": 757
},
{
"epoch": 0.18642400393507133,
"grad_norm": 4.71395206451416,
"learning_rate": 7.453294001966569e-06,
"loss": 1.275,
"step": 758
},
{
"epoch": 0.1866699458927693,
"grad_norm": 4.639562129974365,
"learning_rate": 7.4631268436578175e-06,
"loss": 1.4193,
"step": 759
},
{
"epoch": 0.18691588785046728,
"grad_norm": 4.445728778839111,
"learning_rate": 7.472959685349066e-06,
"loss": 1.2599,
"step": 760
},
{
"epoch": 0.1871618298081653,
"grad_norm": 4.431678295135498,
"learning_rate": 7.482792527040316e-06,
"loss": 1.0541,
"step": 761
},
{
"epoch": 0.18740777176586326,
"grad_norm": 5.165162086486816,
"learning_rate": 7.492625368731564e-06,
"loss": 1.5183,
"step": 762
},
{
"epoch": 0.18765371372356124,
"grad_norm": 4.514559268951416,
"learning_rate": 7.502458210422813e-06,
"loss": 1.2673,
"step": 763
},
{
"epoch": 0.18789965568125921,
"grad_norm": 5.007501125335693,
"learning_rate": 7.512291052114062e-06,
"loss": 1.2602,
"step": 764
},
{
"epoch": 0.18814559763895722,
"grad_norm": 4.231893539428711,
"learning_rate": 7.5221238938053095e-06,
"loss": 1.0749,
"step": 765
},
{
"epoch": 0.1883915395966552,
"grad_norm": 4.361257076263428,
"learning_rate": 7.53195673549656e-06,
"loss": 1.1553,
"step": 766
},
{
"epoch": 0.18863748155435317,
"grad_norm": 4.663354396820068,
"learning_rate": 7.541789577187808e-06,
"loss": 1.2185,
"step": 767
},
{
"epoch": 0.18888342351205115,
"grad_norm": 4.924999237060547,
"learning_rate": 7.551622418879056e-06,
"loss": 1.3602,
"step": 768
},
{
"epoch": 0.18912936546974915,
"grad_norm": 4.88938570022583,
"learning_rate": 7.561455260570305e-06,
"loss": 1.2068,
"step": 769
},
{
"epoch": 0.18937530742744713,
"grad_norm": 4.793879985809326,
"learning_rate": 7.5712881022615545e-06,
"loss": 1.281,
"step": 770
},
{
"epoch": 0.1896212493851451,
"grad_norm": 5.106657028198242,
"learning_rate": 7.581120943952803e-06,
"loss": 1.3916,
"step": 771
},
{
"epoch": 0.18986719134284308,
"grad_norm": 4.0290446281433105,
"learning_rate": 7.590953785644052e-06,
"loss": 1.0236,
"step": 772
},
{
"epoch": 0.19011313330054108,
"grad_norm": 5.341995716094971,
"learning_rate": 7.6007866273353005e-06,
"loss": 1.3514,
"step": 773
},
{
"epoch": 0.19035907525823906,
"grad_norm": 4.571128845214844,
"learning_rate": 7.610619469026549e-06,
"loss": 1.4266,
"step": 774
},
{
"epoch": 0.19060501721593703,
"grad_norm": 4.511712551116943,
"learning_rate": 7.620452310717799e-06,
"loss": 1.1531,
"step": 775
},
{
"epoch": 0.190850959173635,
"grad_norm": 3.9484376907348633,
"learning_rate": 7.630285152409047e-06,
"loss": 1.2286,
"step": 776
},
{
"epoch": 0.191096901131333,
"grad_norm": 4.544414043426514,
"learning_rate": 7.640117994100296e-06,
"loss": 1.1079,
"step": 777
},
{
"epoch": 0.191342843089031,
"grad_norm": 4.353457450866699,
"learning_rate": 7.649950835791544e-06,
"loss": 1.3103,
"step": 778
},
{
"epoch": 0.19158878504672897,
"grad_norm": 4.181809902191162,
"learning_rate": 7.659783677482793e-06,
"loss": 1.1105,
"step": 779
},
{
"epoch": 0.19183472700442694,
"grad_norm": 4.791296482086182,
"learning_rate": 7.669616519174043e-06,
"loss": 1.221,
"step": 780
},
{
"epoch": 0.19208066896212495,
"grad_norm": 4.565962791442871,
"learning_rate": 7.67944936086529e-06,
"loss": 1.2928,
"step": 781
},
{
"epoch": 0.19232661091982292,
"grad_norm": 5.3249735832214355,
"learning_rate": 7.68928220255654e-06,
"loss": 1.2554,
"step": 782
},
{
"epoch": 0.1925725528775209,
"grad_norm": 4.583021640777588,
"learning_rate": 7.699115044247788e-06,
"loss": 1.1934,
"step": 783
},
{
"epoch": 0.1928184948352189,
"grad_norm": 5.056181907653809,
"learning_rate": 7.708947885939036e-06,
"loss": 1.2541,
"step": 784
},
{
"epoch": 0.19306443679291688,
"grad_norm": 4.707731246948242,
"learning_rate": 7.718780727630285e-06,
"loss": 1.3629,
"step": 785
},
{
"epoch": 0.19331037875061485,
"grad_norm": 4.425346374511719,
"learning_rate": 7.728613569321535e-06,
"loss": 1.1719,
"step": 786
},
{
"epoch": 0.19355632070831283,
"grad_norm": 4.246187686920166,
"learning_rate": 7.738446411012783e-06,
"loss": 1.2646,
"step": 787
},
{
"epoch": 0.19380226266601083,
"grad_norm": 4.148866176605225,
"learning_rate": 7.748279252704032e-06,
"loss": 1.1795,
"step": 788
},
{
"epoch": 0.1940482046237088,
"grad_norm": 4.518898010253906,
"learning_rate": 7.75811209439528e-06,
"loss": 1.1814,
"step": 789
},
{
"epoch": 0.19429414658140678,
"grad_norm": 5.115946292877197,
"learning_rate": 7.76794493608653e-06,
"loss": 1.2867,
"step": 790
},
{
"epoch": 0.19454008853910476,
"grad_norm": 4.885878562927246,
"learning_rate": 7.77777777777778e-06,
"loss": 1.2734,
"step": 791
},
{
"epoch": 0.19478603049680276,
"grad_norm": 4.687307357788086,
"learning_rate": 7.787610619469027e-06,
"loss": 1.4284,
"step": 792
},
{
"epoch": 0.19503197245450074,
"grad_norm": 4.8126959800720215,
"learning_rate": 7.797443461160275e-06,
"loss": 1.2882,
"step": 793
},
{
"epoch": 0.19527791441219872,
"grad_norm": 4.783509731292725,
"learning_rate": 7.807276302851526e-06,
"loss": 1.2496,
"step": 794
},
{
"epoch": 0.1955238563698967,
"grad_norm": 4.238515377044678,
"learning_rate": 7.817109144542774e-06,
"loss": 1.1379,
"step": 795
},
{
"epoch": 0.1957697983275947,
"grad_norm": 4.277228355407715,
"learning_rate": 7.826941986234022e-06,
"loss": 1.2045,
"step": 796
},
{
"epoch": 0.19601574028529267,
"grad_norm": 4.734079837799072,
"learning_rate": 7.836774827925271e-06,
"loss": 1.2931,
"step": 797
},
{
"epoch": 0.19626168224299065,
"grad_norm": 4.6950249671936035,
"learning_rate": 7.846607669616519e-06,
"loss": 1.3034,
"step": 798
},
{
"epoch": 0.19650762420068862,
"grad_norm": 4.282293796539307,
"learning_rate": 7.856440511307769e-06,
"loss": 1.0959,
"step": 799
},
{
"epoch": 0.19675356615838663,
"grad_norm": 4.656702518463135,
"learning_rate": 7.866273352999018e-06,
"loss": 1.2765,
"step": 800
},
{
"epoch": 0.1969995081160846,
"grad_norm": 4.735047817230225,
"learning_rate": 7.876106194690266e-06,
"loss": 1.3996,
"step": 801
},
{
"epoch": 0.19724545007378258,
"grad_norm": 4.372286319732666,
"learning_rate": 7.885939036381515e-06,
"loss": 1.1705,
"step": 802
},
{
"epoch": 0.19749139203148058,
"grad_norm": 4.513209342956543,
"learning_rate": 7.895771878072763e-06,
"loss": 1.2742,
"step": 803
},
{
"epoch": 0.19773733398917856,
"grad_norm": 4.375402927398682,
"learning_rate": 7.905604719764013e-06,
"loss": 1.2168,
"step": 804
},
{
"epoch": 0.19798327594687654,
"grad_norm": 4.131961345672607,
"learning_rate": 7.915437561455262e-06,
"loss": 1.1697,
"step": 805
},
{
"epoch": 0.1982292179045745,
"grad_norm": 3.9675047397613525,
"learning_rate": 7.92527040314651e-06,
"loss": 1.0805,
"step": 806
},
{
"epoch": 0.19847515986227252,
"grad_norm": 4.610249042510986,
"learning_rate": 7.935103244837758e-06,
"loss": 1.3352,
"step": 807
},
{
"epoch": 0.1987211018199705,
"grad_norm": 4.603500843048096,
"learning_rate": 7.944936086529007e-06,
"loss": 1.1791,
"step": 808
},
{
"epoch": 0.19896704377766847,
"grad_norm": 4.787491321563721,
"learning_rate": 7.954768928220257e-06,
"loss": 1.4238,
"step": 809
},
{
"epoch": 0.19921298573536644,
"grad_norm": 4.871586799621582,
"learning_rate": 7.964601769911505e-06,
"loss": 1.296,
"step": 810
},
{
"epoch": 0.19945892769306445,
"grad_norm": 4.333070278167725,
"learning_rate": 7.974434611602754e-06,
"loss": 1.1608,
"step": 811
},
{
"epoch": 0.19970486965076242,
"grad_norm": 4.731893062591553,
"learning_rate": 7.984267453294002e-06,
"loss": 1.2166,
"step": 812
},
{
"epoch": 0.1999508116084604,
"grad_norm": 5.074556827545166,
"learning_rate": 7.994100294985252e-06,
"loss": 1.2615,
"step": 813
},
{
"epoch": 0.20019675356615838,
"grad_norm": 4.46243143081665,
"learning_rate": 8.003933136676501e-06,
"loss": 1.1159,
"step": 814
},
{
"epoch": 0.20044269552385638,
"grad_norm": 4.670242786407471,
"learning_rate": 8.013765978367749e-06,
"loss": 1.2529,
"step": 815
},
{
"epoch": 0.20068863748155436,
"grad_norm": 4.814939022064209,
"learning_rate": 8.023598820058997e-06,
"loss": 1.2338,
"step": 816
},
{
"epoch": 0.20093457943925233,
"grad_norm": 4.680349826812744,
"learning_rate": 8.033431661750246e-06,
"loss": 1.1309,
"step": 817
},
{
"epoch": 0.2011805213969503,
"grad_norm": 4.329833507537842,
"learning_rate": 8.043264503441496e-06,
"loss": 1.1822,
"step": 818
},
{
"epoch": 0.2014264633546483,
"grad_norm": 3.818422794342041,
"learning_rate": 8.053097345132744e-06,
"loss": 1.0593,
"step": 819
},
{
"epoch": 0.2016724053123463,
"grad_norm": 4.431004047393799,
"learning_rate": 8.062930186823993e-06,
"loss": 1.2146,
"step": 820
},
{
"epoch": 0.20191834727004426,
"grad_norm": 4.738842964172363,
"learning_rate": 8.072763028515241e-06,
"loss": 0.9916,
"step": 821
},
{
"epoch": 0.20216428922774227,
"grad_norm": 4.537582874298096,
"learning_rate": 8.08259587020649e-06,
"loss": 1.2934,
"step": 822
},
{
"epoch": 0.20241023118544024,
"grad_norm": 5.207397937774658,
"learning_rate": 8.09242871189774e-06,
"loss": 1.2832,
"step": 823
},
{
"epoch": 0.20265617314313822,
"grad_norm": 4.573858737945557,
"learning_rate": 8.102261553588988e-06,
"loss": 1.1873,
"step": 824
},
{
"epoch": 0.2029021151008362,
"grad_norm": 4.553112983703613,
"learning_rate": 8.112094395280237e-06,
"loss": 1.2133,
"step": 825
},
{
"epoch": 0.2031480570585342,
"grad_norm": 4.460824966430664,
"learning_rate": 8.121927236971485e-06,
"loss": 1.3463,
"step": 826
},
{
"epoch": 0.20339399901623217,
"grad_norm": 4.530635833740234,
"learning_rate": 8.131760078662733e-06,
"loss": 1.1632,
"step": 827
},
{
"epoch": 0.20363994097393015,
"grad_norm": 4.924211025238037,
"learning_rate": 8.141592920353984e-06,
"loss": 1.4664,
"step": 828
},
{
"epoch": 0.20388588293162813,
"grad_norm": 5.084644794464111,
"learning_rate": 8.151425762045232e-06,
"loss": 1.3275,
"step": 829
},
{
"epoch": 0.20413182488932613,
"grad_norm": 4.579489707946777,
"learning_rate": 8.16125860373648e-06,
"loss": 1.1105,
"step": 830
},
{
"epoch": 0.2043777668470241,
"grad_norm": 4.865893840789795,
"learning_rate": 8.17109144542773e-06,
"loss": 1.2985,
"step": 831
},
{
"epoch": 0.20462370880472208,
"grad_norm": 4.561331748962402,
"learning_rate": 8.180924287118977e-06,
"loss": 1.2867,
"step": 832
},
{
"epoch": 0.20486965076242006,
"grad_norm": 4.801970481872559,
"learning_rate": 8.190757128810227e-06,
"loss": 1.3419,
"step": 833
},
{
"epoch": 0.20511559272011806,
"grad_norm": 4.598800182342529,
"learning_rate": 8.200589970501476e-06,
"loss": 1.1795,
"step": 834
},
{
"epoch": 0.20536153467781604,
"grad_norm": 4.441288471221924,
"learning_rate": 8.210422812192724e-06,
"loss": 1.2242,
"step": 835
},
{
"epoch": 0.205607476635514,
"grad_norm": 4.319103240966797,
"learning_rate": 8.220255653883974e-06,
"loss": 1.2029,
"step": 836
},
{
"epoch": 0.205853418593212,
"grad_norm": 4.5912628173828125,
"learning_rate": 8.230088495575221e-06,
"loss": 1.2394,
"step": 837
},
{
"epoch": 0.20609936055091,
"grad_norm": 4.354151725769043,
"learning_rate": 8.239921337266471e-06,
"loss": 1.2773,
"step": 838
},
{
"epoch": 0.20634530250860797,
"grad_norm": 5.08561897277832,
"learning_rate": 8.249754178957719e-06,
"loss": 1.308,
"step": 839
},
{
"epoch": 0.20659124446630595,
"grad_norm": 4.384550094604492,
"learning_rate": 8.259587020648968e-06,
"loss": 1.1063,
"step": 840
},
{
"epoch": 0.20683718642400392,
"grad_norm": 4.515172481536865,
"learning_rate": 8.269419862340216e-06,
"loss": 1.1603,
"step": 841
},
{
"epoch": 0.20708312838170193,
"grad_norm": 4.632203578948975,
"learning_rate": 8.279252704031466e-06,
"loss": 1.31,
"step": 842
},
{
"epoch": 0.2073290703393999,
"grad_norm": 4.643248558044434,
"learning_rate": 8.289085545722715e-06,
"loss": 1.3677,
"step": 843
},
{
"epoch": 0.20757501229709788,
"grad_norm": 4.265455722808838,
"learning_rate": 8.298918387413963e-06,
"loss": 1.2512,
"step": 844
},
{
"epoch": 0.20782095425479588,
"grad_norm": 4.760956764221191,
"learning_rate": 8.308751229105212e-06,
"loss": 1.1834,
"step": 845
},
{
"epoch": 0.20806689621249386,
"grad_norm": 4.727907180786133,
"learning_rate": 8.31858407079646e-06,
"loss": 1.1242,
"step": 846
},
{
"epoch": 0.20831283817019183,
"grad_norm": 4.4515862464904785,
"learning_rate": 8.32841691248771e-06,
"loss": 1.2724,
"step": 847
},
{
"epoch": 0.2085587801278898,
"grad_norm": 4.7609028816223145,
"learning_rate": 8.33824975417896e-06,
"loss": 1.2399,
"step": 848
},
{
"epoch": 0.2088047220855878,
"grad_norm": 4.59127950668335,
"learning_rate": 8.348082595870207e-06,
"loss": 1.2118,
"step": 849
},
{
"epoch": 0.2090506640432858,
"grad_norm": 4.794438362121582,
"learning_rate": 8.357915437561455e-06,
"loss": 1.2834,
"step": 850
},
{
"epoch": 0.20929660600098376,
"grad_norm": 4.6203413009643555,
"learning_rate": 8.367748279252705e-06,
"loss": 1.295,
"step": 851
},
{
"epoch": 0.20954254795868174,
"grad_norm": 4.898770809173584,
"learning_rate": 8.377581120943954e-06,
"loss": 1.4348,
"step": 852
},
{
"epoch": 0.20978848991637974,
"grad_norm": 4.65386962890625,
"learning_rate": 8.387413962635202e-06,
"loss": 1.2562,
"step": 853
},
{
"epoch": 0.21003443187407772,
"grad_norm": 4.632324695587158,
"learning_rate": 8.397246804326451e-06,
"loss": 1.2611,
"step": 854
},
{
"epoch": 0.2102803738317757,
"grad_norm": 4.35437536239624,
"learning_rate": 8.4070796460177e-06,
"loss": 1.1079,
"step": 855
},
{
"epoch": 0.21052631578947367,
"grad_norm": 4.218400001525879,
"learning_rate": 8.416912487708949e-06,
"loss": 1.2071,
"step": 856
},
{
"epoch": 0.21077225774717168,
"grad_norm": 4.317579746246338,
"learning_rate": 8.426745329400198e-06,
"loss": 1.25,
"step": 857
},
{
"epoch": 0.21101819970486965,
"grad_norm": 4.81080436706543,
"learning_rate": 8.436578171091446e-06,
"loss": 1.2645,
"step": 858
},
{
"epoch": 0.21126414166256763,
"grad_norm": 4.47863245010376,
"learning_rate": 8.446411012782696e-06,
"loss": 1.3103,
"step": 859
},
{
"epoch": 0.2115100836202656,
"grad_norm": 4.424753189086914,
"learning_rate": 8.456243854473943e-06,
"loss": 1.328,
"step": 860
},
{
"epoch": 0.2117560255779636,
"grad_norm": 4.8833441734313965,
"learning_rate": 8.466076696165191e-06,
"loss": 1.3097,
"step": 861
},
{
"epoch": 0.21200196753566158,
"grad_norm": 5.188076019287109,
"learning_rate": 8.47590953785644e-06,
"loss": 1.4007,
"step": 862
},
{
"epoch": 0.21224790949335956,
"grad_norm": 4.65690279006958,
"learning_rate": 8.48574237954769e-06,
"loss": 1.3263,
"step": 863
},
{
"epoch": 0.21249385145105756,
"grad_norm": 4.253676891326904,
"learning_rate": 8.495575221238938e-06,
"loss": 1.2104,
"step": 864
},
{
"epoch": 0.21273979340875554,
"grad_norm": 4.652464866638184,
"learning_rate": 8.505408062930188e-06,
"loss": 1.3403,
"step": 865
},
{
"epoch": 0.21298573536645352,
"grad_norm": 4.343818187713623,
"learning_rate": 8.515240904621437e-06,
"loss": 1.1913,
"step": 866
},
{
"epoch": 0.2132316773241515,
"grad_norm": 4.356947898864746,
"learning_rate": 8.525073746312685e-06,
"loss": 1.1454,
"step": 867
},
{
"epoch": 0.2134776192818495,
"grad_norm": 4.582756519317627,
"learning_rate": 8.534906588003934e-06,
"loss": 1.334,
"step": 868
},
{
"epoch": 0.21372356123954747,
"grad_norm": 4.510144233703613,
"learning_rate": 8.544739429695182e-06,
"loss": 1.1541,
"step": 869
},
{
"epoch": 0.21396950319724545,
"grad_norm": 4.630588054656982,
"learning_rate": 8.554572271386432e-06,
"loss": 1.1588,
"step": 870
},
{
"epoch": 0.21421544515494342,
"grad_norm": 4.356663703918457,
"learning_rate": 8.564405113077681e-06,
"loss": 1.3754,
"step": 871
},
{
"epoch": 0.21446138711264143,
"grad_norm": 4.36549711227417,
"learning_rate": 8.574237954768929e-06,
"loss": 1.2835,
"step": 872
},
{
"epoch": 0.2147073290703394,
"grad_norm": 4.282351970672607,
"learning_rate": 8.584070796460177e-06,
"loss": 1.2621,
"step": 873
},
{
"epoch": 0.21495327102803738,
"grad_norm": 4.56622314453125,
"learning_rate": 8.593903638151426e-06,
"loss": 1.298,
"step": 874
},
{
"epoch": 0.21519921298573536,
"grad_norm": 4.594609260559082,
"learning_rate": 8.603736479842674e-06,
"loss": 1.3263,
"step": 875
},
{
"epoch": 0.21544515494343336,
"grad_norm": 4.442915439605713,
"learning_rate": 8.613569321533924e-06,
"loss": 1.2258,
"step": 876
},
{
"epoch": 0.21569109690113134,
"grad_norm": 4.241147041320801,
"learning_rate": 8.623402163225173e-06,
"loss": 1.1935,
"step": 877
},
{
"epoch": 0.2159370388588293,
"grad_norm": 4.175384998321533,
"learning_rate": 8.633235004916421e-06,
"loss": 1.2569,
"step": 878
},
{
"epoch": 0.2161829808165273,
"grad_norm": 4.044576168060303,
"learning_rate": 8.64306784660767e-06,
"loss": 1.1896,
"step": 879
},
{
"epoch": 0.2164289227742253,
"grad_norm": 4.505466938018799,
"learning_rate": 8.652900688298919e-06,
"loss": 1.16,
"step": 880
},
{
"epoch": 0.21667486473192327,
"grad_norm": 4.628767013549805,
"learning_rate": 8.662733529990168e-06,
"loss": 1.3175,
"step": 881
},
{
"epoch": 0.21692080668962124,
"grad_norm": 4.484878063201904,
"learning_rate": 8.672566371681418e-06,
"loss": 1.1373,
"step": 882
},
{
"epoch": 0.21716674864731922,
"grad_norm": 4.332725524902344,
"learning_rate": 8.682399213372665e-06,
"loss": 1.2447,
"step": 883
},
{
"epoch": 0.21741269060501722,
"grad_norm": 4.36234712600708,
"learning_rate": 8.692232055063913e-06,
"loss": 1.1429,
"step": 884
},
{
"epoch": 0.2176586325627152,
"grad_norm": 4.798089027404785,
"learning_rate": 8.702064896755163e-06,
"loss": 1.4961,
"step": 885
},
{
"epoch": 0.21790457452041317,
"grad_norm": 4.843624591827393,
"learning_rate": 8.711897738446412e-06,
"loss": 1.2309,
"step": 886
},
{
"epoch": 0.21815051647811118,
"grad_norm": 4.736741065979004,
"learning_rate": 8.72173058013766e-06,
"loss": 1.4206,
"step": 887
},
{
"epoch": 0.21839645843580915,
"grad_norm": 4.596724033355713,
"learning_rate": 8.73156342182891e-06,
"loss": 1.1819,
"step": 888
},
{
"epoch": 0.21864240039350713,
"grad_norm": 4.625223159790039,
"learning_rate": 8.741396263520157e-06,
"loss": 1.1228,
"step": 889
},
{
"epoch": 0.2188883423512051,
"grad_norm": 4.540004730224609,
"learning_rate": 8.751229105211407e-06,
"loss": 1.2006,
"step": 890
},
{
"epoch": 0.2191342843089031,
"grad_norm": 4.2051801681518555,
"learning_rate": 8.761061946902656e-06,
"loss": 1.2062,
"step": 891
},
{
"epoch": 0.21938022626660109,
"grad_norm": 4.80202579498291,
"learning_rate": 8.770894788593904e-06,
"loss": 1.3389,
"step": 892
},
{
"epoch": 0.21962616822429906,
"grad_norm": 4.599158763885498,
"learning_rate": 8.780727630285154e-06,
"loss": 1.3249,
"step": 893
},
{
"epoch": 0.21987211018199704,
"grad_norm": 3.980557680130005,
"learning_rate": 8.790560471976402e-06,
"loss": 1.2298,
"step": 894
},
{
"epoch": 0.22011805213969504,
"grad_norm": 4.493852138519287,
"learning_rate": 8.800393313667651e-06,
"loss": 1.2582,
"step": 895
},
{
"epoch": 0.22036399409739302,
"grad_norm": 4.683654308319092,
"learning_rate": 8.810226155358899e-06,
"loss": 1.2311,
"step": 896
},
{
"epoch": 0.220609936055091,
"grad_norm": 4.684290885925293,
"learning_rate": 8.820058997050148e-06,
"loss": 1.2489,
"step": 897
},
{
"epoch": 0.22085587801278897,
"grad_norm": 4.432709217071533,
"learning_rate": 8.829891838741396e-06,
"loss": 1.1109,
"step": 898
},
{
"epoch": 0.22110181997048697,
"grad_norm": 4.6879401206970215,
"learning_rate": 8.839724680432646e-06,
"loss": 1.3574,
"step": 899
},
{
"epoch": 0.22134776192818495,
"grad_norm": 4.32175874710083,
"learning_rate": 8.849557522123895e-06,
"loss": 1.0934,
"step": 900
},
{
"epoch": 0.22159370388588293,
"grad_norm": 5.156861782073975,
"learning_rate": 8.859390363815143e-06,
"loss": 1.3298,
"step": 901
},
{
"epoch": 0.2218396458435809,
"grad_norm": 4.37407112121582,
"learning_rate": 8.869223205506393e-06,
"loss": 1.204,
"step": 902
},
{
"epoch": 0.2220855878012789,
"grad_norm": 4.502548694610596,
"learning_rate": 8.87905604719764e-06,
"loss": 1.2974,
"step": 903
},
{
"epoch": 0.22233152975897688,
"grad_norm": 4.405653476715088,
"learning_rate": 8.888888888888888e-06,
"loss": 1.1935,
"step": 904
},
{
"epoch": 0.22257747171667486,
"grad_norm": 4.655777931213379,
"learning_rate": 8.89872173058014e-06,
"loss": 1.2142,
"step": 905
},
{
"epoch": 0.22282341367437286,
"grad_norm": 4.959415912628174,
"learning_rate": 8.908554572271387e-06,
"loss": 1.221,
"step": 906
},
{
"epoch": 0.22306935563207084,
"grad_norm": 4.43116569519043,
"learning_rate": 8.918387413962635e-06,
"loss": 1.0709,
"step": 907
},
{
"epoch": 0.2233152975897688,
"grad_norm": 4.259029388427734,
"learning_rate": 8.928220255653885e-06,
"loss": 1.3253,
"step": 908
},
{
"epoch": 0.2235612395474668,
"grad_norm": 4.000690460205078,
"learning_rate": 8.938053097345133e-06,
"loss": 1.1168,
"step": 909
},
{
"epoch": 0.2238071815051648,
"grad_norm": 4.003090858459473,
"learning_rate": 8.947885939036382e-06,
"loss": 1.1657,
"step": 910
},
{
"epoch": 0.22405312346286277,
"grad_norm": 4.28544807434082,
"learning_rate": 8.957718780727632e-06,
"loss": 1.1052,
"step": 911
},
{
"epoch": 0.22429906542056074,
"grad_norm": 4.451721668243408,
"learning_rate": 8.96755162241888e-06,
"loss": 1.2898,
"step": 912
},
{
"epoch": 0.22454500737825872,
"grad_norm": 4.637101650238037,
"learning_rate": 8.977384464110129e-06,
"loss": 1.0876,
"step": 913
},
{
"epoch": 0.22479094933595672,
"grad_norm": 5.370706081390381,
"learning_rate": 8.987217305801378e-06,
"loss": 1.4321,
"step": 914
},
{
"epoch": 0.2250368912936547,
"grad_norm": 4.144587993621826,
"learning_rate": 8.997050147492626e-06,
"loss": 1.2224,
"step": 915
},
{
"epoch": 0.22528283325135268,
"grad_norm": 5.05446720123291,
"learning_rate": 9.006882989183876e-06,
"loss": 1.2689,
"step": 916
},
{
"epoch": 0.22552877520905065,
"grad_norm": 5.008686542510986,
"learning_rate": 9.016715830875124e-06,
"loss": 1.2975,
"step": 917
},
{
"epoch": 0.22577471716674866,
"grad_norm": 4.562985420227051,
"learning_rate": 9.026548672566371e-06,
"loss": 1.1029,
"step": 918
},
{
"epoch": 0.22602065912444663,
"grad_norm": 4.796685695648193,
"learning_rate": 9.036381514257621e-06,
"loss": 1.3293,
"step": 919
},
{
"epoch": 0.2262666010821446,
"grad_norm": 4.579412937164307,
"learning_rate": 9.04621435594887e-06,
"loss": 1.3149,
"step": 920
},
{
"epoch": 0.22651254303984258,
"grad_norm": 4.34621000289917,
"learning_rate": 9.056047197640118e-06,
"loss": 1.1604,
"step": 921
},
{
"epoch": 0.2267584849975406,
"grad_norm": 4.718410015106201,
"learning_rate": 9.065880039331368e-06,
"loss": 1.2865,
"step": 922
},
{
"epoch": 0.22700442695523856,
"grad_norm": 4.8863372802734375,
"learning_rate": 9.075712881022616e-06,
"loss": 1.2309,
"step": 923
},
{
"epoch": 0.22725036891293654,
"grad_norm": 4.170733451843262,
"learning_rate": 9.085545722713865e-06,
"loss": 1.1464,
"step": 924
},
{
"epoch": 0.22749631087063454,
"grad_norm": 4.47605562210083,
"learning_rate": 9.095378564405115e-06,
"loss": 1.3073,
"step": 925
},
{
"epoch": 0.22774225282833252,
"grad_norm": 4.080425262451172,
"learning_rate": 9.105211406096362e-06,
"loss": 1.1833,
"step": 926
},
{
"epoch": 0.2279881947860305,
"grad_norm": 4.507909297943115,
"learning_rate": 9.11504424778761e-06,
"loss": 1.3204,
"step": 927
},
{
"epoch": 0.22823413674372847,
"grad_norm": 4.358058929443359,
"learning_rate": 9.12487708947886e-06,
"loss": 1.2446,
"step": 928
},
{
"epoch": 0.22848007870142648,
"grad_norm": 4.322032928466797,
"learning_rate": 9.13470993117011e-06,
"loss": 1.176,
"step": 929
},
{
"epoch": 0.22872602065912445,
"grad_norm": 4.095378875732422,
"learning_rate": 9.144542772861357e-06,
"loss": 1.2017,
"step": 930
},
{
"epoch": 0.22897196261682243,
"grad_norm": 4.421671390533447,
"learning_rate": 9.154375614552607e-06,
"loss": 1.2816,
"step": 931
},
{
"epoch": 0.2292179045745204,
"grad_norm": 4.223404884338379,
"learning_rate": 9.164208456243854e-06,
"loss": 1.3022,
"step": 932
},
{
"epoch": 0.2294638465322184,
"grad_norm": 4.696720600128174,
"learning_rate": 9.174041297935104e-06,
"loss": 1.2924,
"step": 933
},
{
"epoch": 0.22970978848991638,
"grad_norm": 4.640844345092773,
"learning_rate": 9.183874139626354e-06,
"loss": 1.3075,
"step": 934
},
{
"epoch": 0.22995573044761436,
"grad_norm": 4.6254143714904785,
"learning_rate": 9.193706981317601e-06,
"loss": 1.2129,
"step": 935
},
{
"epoch": 0.23020167240531234,
"grad_norm": 4.089590549468994,
"learning_rate": 9.203539823008851e-06,
"loss": 1.147,
"step": 936
},
{
"epoch": 0.23044761436301034,
"grad_norm": 4.164431095123291,
"learning_rate": 9.213372664700099e-06,
"loss": 1.1764,
"step": 937
},
{
"epoch": 0.23069355632070832,
"grad_norm": 4.577208042144775,
"learning_rate": 9.223205506391348e-06,
"loss": 1.3463,
"step": 938
},
{
"epoch": 0.2309394982784063,
"grad_norm": 4.4762725830078125,
"learning_rate": 9.233038348082598e-06,
"loss": 1.239,
"step": 939
},
{
"epoch": 0.23118544023610427,
"grad_norm": 4.413786888122559,
"learning_rate": 9.242871189773846e-06,
"loss": 1.236,
"step": 940
},
{
"epoch": 0.23143138219380227,
"grad_norm": 4.638607501983643,
"learning_rate": 9.252704031465093e-06,
"loss": 1.2944,
"step": 941
},
{
"epoch": 0.23167732415150025,
"grad_norm": 4.401584148406982,
"learning_rate": 9.262536873156343e-06,
"loss": 1.3338,
"step": 942
},
{
"epoch": 0.23192326610919822,
"grad_norm": 4.65614128112793,
"learning_rate": 9.272369714847592e-06,
"loss": 1.2297,
"step": 943
},
{
"epoch": 0.2321692080668962,
"grad_norm": 4.3185272216796875,
"learning_rate": 9.28220255653884e-06,
"loss": 1.308,
"step": 944
},
{
"epoch": 0.2324151500245942,
"grad_norm": 4.100996017456055,
"learning_rate": 9.29203539823009e-06,
"loss": 1.1956,
"step": 945
},
{
"epoch": 0.23266109198229218,
"grad_norm": 4.230007648468018,
"learning_rate": 9.301868239921338e-06,
"loss": 1.215,
"step": 946
},
{
"epoch": 0.23290703393999015,
"grad_norm": 4.165582656860352,
"learning_rate": 9.311701081612587e-06,
"loss": 1.2378,
"step": 947
},
{
"epoch": 0.23315297589768816,
"grad_norm": 5.203585147857666,
"learning_rate": 9.321533923303837e-06,
"loss": 1.3277,
"step": 948
},
{
"epoch": 0.23339891785538613,
"grad_norm": 4.351226806640625,
"learning_rate": 9.331366764995084e-06,
"loss": 1.3576,
"step": 949
},
{
"epoch": 0.2336448598130841,
"grad_norm": 4.370736598968506,
"learning_rate": 9.341199606686332e-06,
"loss": 1.3026,
"step": 950
},
{
"epoch": 0.2338908017707821,
"grad_norm": 4.063790798187256,
"learning_rate": 9.351032448377582e-06,
"loss": 1.2483,
"step": 951
},
{
"epoch": 0.2341367437284801,
"grad_norm": 4.213711738586426,
"learning_rate": 9.36086529006883e-06,
"loss": 1.2533,
"step": 952
},
{
"epoch": 0.23438268568617807,
"grad_norm": 5.042483329772949,
"learning_rate": 9.370698131760079e-06,
"loss": 1.414,
"step": 953
},
{
"epoch": 0.23462862764387604,
"grad_norm": 4.558846473693848,
"learning_rate": 9.380530973451329e-06,
"loss": 1.1805,
"step": 954
},
{
"epoch": 0.23487456960157402,
"grad_norm": 4.6492414474487305,
"learning_rate": 9.390363815142576e-06,
"loss": 1.311,
"step": 955
},
{
"epoch": 0.23512051155927202,
"grad_norm": 4.665435314178467,
"learning_rate": 9.400196656833826e-06,
"loss": 1.2862,
"step": 956
},
{
"epoch": 0.23536645351697,
"grad_norm": 4.771575927734375,
"learning_rate": 9.410029498525074e-06,
"loss": 1.302,
"step": 957
},
{
"epoch": 0.23561239547466797,
"grad_norm": 4.262117385864258,
"learning_rate": 9.419862340216323e-06,
"loss": 1.2459,
"step": 958
},
{
"epoch": 0.23585833743236595,
"grad_norm": 4.460323333740234,
"learning_rate": 9.429695181907573e-06,
"loss": 1.1524,
"step": 959
},
{
"epoch": 0.23610427939006395,
"grad_norm": 4.410240173339844,
"learning_rate": 9.43952802359882e-06,
"loss": 1.2795,
"step": 960
},
{
"epoch": 0.23635022134776193,
"grad_norm": 4.420566558837891,
"learning_rate": 9.449360865290068e-06,
"loss": 1.4699,
"step": 961
},
{
"epoch": 0.2365961633054599,
"grad_norm": 4.724861145019531,
"learning_rate": 9.45919370698132e-06,
"loss": 1.3714,
"step": 962
},
{
"epoch": 0.23684210526315788,
"grad_norm": 4.543558597564697,
"learning_rate": 9.469026548672568e-06,
"loss": 1.2182,
"step": 963
},
{
"epoch": 0.23708804722085589,
"grad_norm": 4.339579105377197,
"learning_rate": 9.478859390363815e-06,
"loss": 1.2763,
"step": 964
},
{
"epoch": 0.23733398917855386,
"grad_norm": 4.4870100021362305,
"learning_rate": 9.488692232055065e-06,
"loss": 1.1957,
"step": 965
},
{
"epoch": 0.23757993113625184,
"grad_norm": 4.283409118652344,
"learning_rate": 9.498525073746313e-06,
"loss": 1.293,
"step": 966
},
{
"epoch": 0.23782587309394984,
"grad_norm": 4.850062847137451,
"learning_rate": 9.508357915437562e-06,
"loss": 1.4205,
"step": 967
},
{
"epoch": 0.23807181505164782,
"grad_norm": 4.315372943878174,
"learning_rate": 9.518190757128812e-06,
"loss": 1.2336,
"step": 968
},
{
"epoch": 0.2383177570093458,
"grad_norm": 4.6689252853393555,
"learning_rate": 9.52802359882006e-06,
"loss": 1.2429,
"step": 969
},
{
"epoch": 0.23856369896704377,
"grad_norm": 4.336824893951416,
"learning_rate": 9.537856440511309e-06,
"loss": 1.3703,
"step": 970
},
{
"epoch": 0.23880964092474177,
"grad_norm": 4.368560314178467,
"learning_rate": 9.547689282202557e-06,
"loss": 1.2382,
"step": 971
},
{
"epoch": 0.23905558288243975,
"grad_norm": 4.535837650299072,
"learning_rate": 9.557522123893806e-06,
"loss": 1.3844,
"step": 972
},
{
"epoch": 0.23930152484013772,
"grad_norm": 4.6995768547058105,
"learning_rate": 9.567354965585054e-06,
"loss": 1.3824,
"step": 973
},
{
"epoch": 0.2395474667978357,
"grad_norm": 4.2697577476501465,
"learning_rate": 9.577187807276304e-06,
"loss": 1.1754,
"step": 974
},
{
"epoch": 0.2397934087555337,
"grad_norm": 4.643556118011475,
"learning_rate": 9.587020648967552e-06,
"loss": 1.3161,
"step": 975
},
{
"epoch": 0.24003935071323168,
"grad_norm": 4.218672275543213,
"learning_rate": 9.596853490658801e-06,
"loss": 1.2181,
"step": 976
},
{
"epoch": 0.24028529267092966,
"grad_norm": 4.344313621520996,
"learning_rate": 9.60668633235005e-06,
"loss": 1.2198,
"step": 977
},
{
"epoch": 0.24053123462862763,
"grad_norm": 4.066158771514893,
"learning_rate": 9.616519174041298e-06,
"loss": 1.2191,
"step": 978
},
{
"epoch": 0.24077717658632564,
"grad_norm": 4.58112907409668,
"learning_rate": 9.626352015732548e-06,
"loss": 1.2952,
"step": 979
},
{
"epoch": 0.2410231185440236,
"grad_norm": 5.255346775054932,
"learning_rate": 9.636184857423796e-06,
"loss": 1.606,
"step": 980
},
{
"epoch": 0.2412690605017216,
"grad_norm": 4.927239894866943,
"learning_rate": 9.646017699115045e-06,
"loss": 1.289,
"step": 981
},
{
"epoch": 0.24151500245941956,
"grad_norm": 4.796622276306152,
"learning_rate": 9.655850540806295e-06,
"loss": 1.2547,
"step": 982
},
{
"epoch": 0.24176094441711757,
"grad_norm": 4.983935832977295,
"learning_rate": 9.665683382497543e-06,
"loss": 1.1956,
"step": 983
},
{
"epoch": 0.24200688637481554,
"grad_norm": 4.503929138183594,
"learning_rate": 9.67551622418879e-06,
"loss": 1.3671,
"step": 984
},
{
"epoch": 0.24225282833251352,
"grad_norm": 4.198331356048584,
"learning_rate": 9.68534906588004e-06,
"loss": 1.2058,
"step": 985
},
{
"epoch": 0.24249877029021152,
"grad_norm": 4.6834187507629395,
"learning_rate": 9.69518190757129e-06,
"loss": 1.2508,
"step": 986
},
{
"epoch": 0.2427447122479095,
"grad_norm": 4.183260917663574,
"learning_rate": 9.705014749262537e-06,
"loss": 1.1885,
"step": 987
},
{
"epoch": 0.24299065420560748,
"grad_norm": 4.403391361236572,
"learning_rate": 9.714847590953787e-06,
"loss": 1.1915,
"step": 988
},
{
"epoch": 0.24323659616330545,
"grad_norm": 4.356921195983887,
"learning_rate": 9.724680432645035e-06,
"loss": 1.3797,
"step": 989
},
{
"epoch": 0.24348253812100346,
"grad_norm": 4.08347749710083,
"learning_rate": 9.734513274336284e-06,
"loss": 1.1297,
"step": 990
},
{
"epoch": 0.24372848007870143,
"grad_norm": 4.435698986053467,
"learning_rate": 9.744346116027534e-06,
"loss": 1.1852,
"step": 991
},
{
"epoch": 0.2439744220363994,
"grad_norm": 4.52011251449585,
"learning_rate": 9.754178957718782e-06,
"loss": 1.1734,
"step": 992
},
{
"epoch": 0.24422036399409738,
"grad_norm": 5.166198253631592,
"learning_rate": 9.764011799410031e-06,
"loss": 1.282,
"step": 993
},
{
"epoch": 0.2444663059517954,
"grad_norm": 4.265774726867676,
"learning_rate": 9.773844641101279e-06,
"loss": 1.0647,
"step": 994
},
{
"epoch": 0.24471224790949336,
"grad_norm": 4.522919654846191,
"learning_rate": 9.783677482792527e-06,
"loss": 1.2741,
"step": 995
},
{
"epoch": 0.24495818986719134,
"grad_norm": 4.402100563049316,
"learning_rate": 9.793510324483776e-06,
"loss": 1.3375,
"step": 996
},
{
"epoch": 0.24520413182488932,
"grad_norm": 4.708245754241943,
"learning_rate": 9.803343166175026e-06,
"loss": 1.1975,
"step": 997
},
{
"epoch": 0.24545007378258732,
"grad_norm": 4.366272926330566,
"learning_rate": 9.813176007866274e-06,
"loss": 1.3203,
"step": 998
},
{
"epoch": 0.2456960157402853,
"grad_norm": 4.725527763366699,
"learning_rate": 9.823008849557523e-06,
"loss": 1.3182,
"step": 999
},
{
"epoch": 0.24594195769798327,
"grad_norm": 3.829162120819092,
"learning_rate": 9.832841691248771e-06,
"loss": 1.0856,
"step": 1000
},
{
"epoch": 0.24594195769798327,
"eval_loss": 1.2967066764831543,
"eval_runtime": 13.8803,
"eval_samples_per_second": 28.818,
"eval_steps_per_second": 3.602,
"step": 1000
},
{
"epoch": 0.24618789965568125,
"grad_norm": 4.44356107711792,
"learning_rate": 9.84267453294002e-06,
"loss": 1.2396,
"step": 1001
},
{
"epoch": 0.24643384161337925,
"grad_norm": 4.231010913848877,
"learning_rate": 9.85250737463127e-06,
"loss": 1.2488,
"step": 1002
},
{
"epoch": 0.24667978357107723,
"grad_norm": 4.059149742126465,
"learning_rate": 9.862340216322518e-06,
"loss": 1.1744,
"step": 1003
},
{
"epoch": 0.2469257255287752,
"grad_norm": 4.745471000671387,
"learning_rate": 9.872173058013767e-06,
"loss": 1.2985,
"step": 1004
},
{
"epoch": 0.24717166748647318,
"grad_norm": 4.28933048248291,
"learning_rate": 9.882005899705015e-06,
"loss": 1.0564,
"step": 1005
},
{
"epoch": 0.24741760944417118,
"grad_norm": 4.5860114097595215,
"learning_rate": 9.891838741396265e-06,
"loss": 1.2125,
"step": 1006
},
{
"epoch": 0.24766355140186916,
"grad_norm": 4.273353099822998,
"learning_rate": 9.901671583087512e-06,
"loss": 1.2091,
"step": 1007
},
{
"epoch": 0.24790949335956713,
"grad_norm": 4.638987064361572,
"learning_rate": 9.911504424778762e-06,
"loss": 1.3659,
"step": 1008
},
{
"epoch": 0.24815543531726514,
"grad_norm": 4.579257965087891,
"learning_rate": 9.92133726647001e-06,
"loss": 1.2863,
"step": 1009
},
{
"epoch": 0.24840137727496311,
"grad_norm": 4.147266864776611,
"learning_rate": 9.93117010816126e-06,
"loss": 1.1588,
"step": 1010
},
{
"epoch": 0.2486473192326611,
"grad_norm": 4.675556659698486,
"learning_rate": 9.941002949852509e-06,
"loss": 1.4555,
"step": 1011
},
{
"epoch": 0.24889326119035907,
"grad_norm": 4.675887107849121,
"learning_rate": 9.950835791543757e-06,
"loss": 1.3392,
"step": 1012
},
{
"epoch": 0.24913920314805707,
"grad_norm": 4.406521797180176,
"learning_rate": 9.960668633235006e-06,
"loss": 1.224,
"step": 1013
},
{
"epoch": 0.24938514510575505,
"grad_norm": 4.507264137268066,
"learning_rate": 9.970501474926254e-06,
"loss": 1.225,
"step": 1014
},
{
"epoch": 0.24963108706345302,
"grad_norm": 4.282767295837402,
"learning_rate": 9.980334316617503e-06,
"loss": 1.2113,
"step": 1015
},
{
"epoch": 0.249877029021151,
"grad_norm": 4.425830841064453,
"learning_rate": 9.990167158308753e-06,
"loss": 1.4029,
"step": 1016
},
{
"epoch": 0.250122970978849,
"grad_norm": 4.390415191650391,
"learning_rate": 1e-05,
"loss": 1.3325,
"step": 1017
},
{
"epoch": 0.25036891293654695,
"grad_norm": 5.367855548858643,
"learning_rate": 9.999999933848413e-06,
"loss": 1.3399,
"step": 1018
},
{
"epoch": 0.25061485489424495,
"grad_norm": 4.250563621520996,
"learning_rate": 9.999999735393652e-06,
"loss": 1.2161,
"step": 1019
},
{
"epoch": 0.25086079685194296,
"grad_norm": 4.564309120178223,
"learning_rate": 9.999999404635721e-06,
"loss": 1.2307,
"step": 1020
},
{
"epoch": 0.2511067388096409,
"grad_norm": 4.221138000488281,
"learning_rate": 9.999998941574633e-06,
"loss": 1.3276,
"step": 1021
},
{
"epoch": 0.2513526807673389,
"grad_norm": 4.203039646148682,
"learning_rate": 9.999998346210396e-06,
"loss": 1.0837,
"step": 1022
},
{
"epoch": 0.2515986227250369,
"grad_norm": 4.8519487380981445,
"learning_rate": 9.999997618543027e-06,
"loss": 1.4075,
"step": 1023
},
{
"epoch": 0.25184456468273486,
"grad_norm": 4.403535842895508,
"learning_rate": 9.999996758572546e-06,
"loss": 1.2487,
"step": 1024
},
{
"epoch": 0.25209050664043287,
"grad_norm": 4.543124198913574,
"learning_rate": 9.999995766298975e-06,
"loss": 1.3977,
"step": 1025
},
{
"epoch": 0.2523364485981308,
"grad_norm": 4.522002696990967,
"learning_rate": 9.999994641722343e-06,
"loss": 1.2848,
"step": 1026
},
{
"epoch": 0.2525823905558288,
"grad_norm": 4.414734840393066,
"learning_rate": 9.999993384842674e-06,
"loss": 1.1567,
"step": 1027
},
{
"epoch": 0.2528283325135268,
"grad_norm": 4.074300289154053,
"learning_rate": 9.999991995660007e-06,
"loss": 1.3771,
"step": 1028
},
{
"epoch": 0.25307427447122477,
"grad_norm": 4.63136100769043,
"learning_rate": 9.999990474174375e-06,
"loss": 1.2712,
"step": 1029
},
{
"epoch": 0.2533202164289228,
"grad_norm": 4.197093486785889,
"learning_rate": 9.999988820385822e-06,
"loss": 1.2452,
"step": 1030
},
{
"epoch": 0.2535661583866208,
"grad_norm": 4.269925117492676,
"learning_rate": 9.999987034294387e-06,
"loss": 1.1632,
"step": 1031
},
{
"epoch": 0.2538121003443187,
"grad_norm": 4.661086559295654,
"learning_rate": 9.999985115900121e-06,
"loss": 1.3098,
"step": 1032
},
{
"epoch": 0.25405804230201673,
"grad_norm": 4.5972490310668945,
"learning_rate": 9.999983065203071e-06,
"loss": 1.2158,
"step": 1033
},
{
"epoch": 0.25430398425971473,
"grad_norm": 4.463476181030273,
"learning_rate": 9.999980882203298e-06,
"loss": 1.3109,
"step": 1034
},
{
"epoch": 0.2545499262174127,
"grad_norm": 4.849438190460205,
"learning_rate": 9.999978566900851e-06,
"loss": 1.4068,
"step": 1035
},
{
"epoch": 0.2547958681751107,
"grad_norm": 4.385087013244629,
"learning_rate": 9.999976119295799e-06,
"loss": 1.131,
"step": 1036
},
{
"epoch": 0.25504181013280863,
"grad_norm": 4.06881046295166,
"learning_rate": 9.999973539388201e-06,
"loss": 1.1098,
"step": 1037
},
{
"epoch": 0.25528775209050664,
"grad_norm": 4.2591776847839355,
"learning_rate": 9.99997082717813e-06,
"loss": 1.2983,
"step": 1038
},
{
"epoch": 0.25553369404820464,
"grad_norm": 4.30938720703125,
"learning_rate": 9.999967982665655e-06,
"loss": 1.1812,
"step": 1039
},
{
"epoch": 0.2557796360059026,
"grad_norm": 4.487225532531738,
"learning_rate": 9.999965005850851e-06,
"loss": 1.1944,
"step": 1040
},
{
"epoch": 0.2560255779636006,
"grad_norm": 4.43388557434082,
"learning_rate": 9.999961896733797e-06,
"loss": 1.1834,
"step": 1041
},
{
"epoch": 0.2562715199212986,
"grad_norm": 4.244602203369141,
"learning_rate": 9.999958655314577e-06,
"loss": 1.2445,
"step": 1042
},
{
"epoch": 0.25651746187899654,
"grad_norm": 4.204675674438477,
"learning_rate": 9.999955281593277e-06,
"loss": 1.2156,
"step": 1043
},
{
"epoch": 0.25676340383669455,
"grad_norm": 4.29486608505249,
"learning_rate": 9.999951775569982e-06,
"loss": 1.2145,
"step": 1044
},
{
"epoch": 0.2570093457943925,
"grad_norm": 4.678985595703125,
"learning_rate": 9.999948137244788e-06,
"loss": 1.2094,
"step": 1045
},
{
"epoch": 0.2572552877520905,
"grad_norm": 4.736990451812744,
"learning_rate": 9.999944366617794e-06,
"loss": 1.1769,
"step": 1046
},
{
"epoch": 0.2575012297097885,
"grad_norm": 4.205821514129639,
"learning_rate": 9.999940463689093e-06,
"loss": 1.1401,
"step": 1047
},
{
"epoch": 0.25774717166748645,
"grad_norm": 4.305410861968994,
"learning_rate": 9.999936428458795e-06,
"loss": 1.3224,
"step": 1048
},
{
"epoch": 0.25799311362518446,
"grad_norm": 4.257960319519043,
"learning_rate": 9.999932260927002e-06,
"loss": 1.171,
"step": 1049
},
{
"epoch": 0.25823905558288246,
"grad_norm": 4.50842809677124,
"learning_rate": 9.999927961093826e-06,
"loss": 1.2693,
"step": 1050
},
{
"epoch": 0.2584849975405804,
"grad_norm": 4.264610767364502,
"learning_rate": 9.99992352895938e-06,
"loss": 1.366,
"step": 1051
},
{
"epoch": 0.2587309394982784,
"grad_norm": 4.293087959289551,
"learning_rate": 9.999918964523783e-06,
"loss": 1.1987,
"step": 1052
},
{
"epoch": 0.2589768814559764,
"grad_norm": 4.26725435256958,
"learning_rate": 9.999914267787154e-06,
"loss": 1.3251,
"step": 1053
},
{
"epoch": 0.25922282341367436,
"grad_norm": 4.069000244140625,
"learning_rate": 9.99990943874962e-06,
"loss": 1.2705,
"step": 1054
},
{
"epoch": 0.25946876537137237,
"grad_norm": 4.0899505615234375,
"learning_rate": 9.999904477411306e-06,
"loss": 1.3158,
"step": 1055
},
{
"epoch": 0.2597147073290703,
"grad_norm": 4.36099910736084,
"learning_rate": 9.999899383772344e-06,
"loss": 1.238,
"step": 1056
},
{
"epoch": 0.2599606492867683,
"grad_norm": 4.52928352355957,
"learning_rate": 9.99989415783287e-06,
"loss": 1.2707,
"step": 1057
},
{
"epoch": 0.2602065912444663,
"grad_norm": 3.992438554763794,
"learning_rate": 9.99988879959302e-06,
"loss": 1.1819,
"step": 1058
},
{
"epoch": 0.26045253320216427,
"grad_norm": 4.199254989624023,
"learning_rate": 9.999883309052938e-06,
"loss": 1.3254,
"step": 1059
},
{
"epoch": 0.2606984751598623,
"grad_norm": 4.11007833480835,
"learning_rate": 9.999877686212767e-06,
"loss": 1.1926,
"step": 1060
},
{
"epoch": 0.2609444171175603,
"grad_norm": 4.811441421508789,
"learning_rate": 9.999871931072659e-06,
"loss": 1.1721,
"step": 1061
},
{
"epoch": 0.2611903590752582,
"grad_norm": 4.413527965545654,
"learning_rate": 9.999866043632762e-06,
"loss": 1.3927,
"step": 1062
},
{
"epoch": 0.26143630103295623,
"grad_norm": 4.49337100982666,
"learning_rate": 9.999860023893237e-06,
"loss": 1.3408,
"step": 1063
},
{
"epoch": 0.2616822429906542,
"grad_norm": 4.423419952392578,
"learning_rate": 9.999853871854238e-06,
"loss": 1.3416,
"step": 1064
},
{
"epoch": 0.2619281849483522,
"grad_norm": 4.393714904785156,
"learning_rate": 9.999847587515932e-06,
"loss": 1.3068,
"step": 1065
},
{
"epoch": 0.2621741269060502,
"grad_norm": 4.318437576293945,
"learning_rate": 9.999841170878481e-06,
"loss": 1.2874,
"step": 1066
},
{
"epoch": 0.26242006886374813,
"grad_norm": 4.253035068511963,
"learning_rate": 9.99983462194206e-06,
"loss": 1.2068,
"step": 1067
},
{
"epoch": 0.26266601082144614,
"grad_norm": 4.188569068908691,
"learning_rate": 9.999827940706838e-06,
"loss": 1.1062,
"step": 1068
},
{
"epoch": 0.26291195277914414,
"grad_norm": 4.280446529388428,
"learning_rate": 9.999821127172994e-06,
"loss": 1.2253,
"step": 1069
},
{
"epoch": 0.2631578947368421,
"grad_norm": 4.109984874725342,
"learning_rate": 9.999814181340708e-06,
"loss": 1.3059,
"step": 1070
},
{
"epoch": 0.2634038366945401,
"grad_norm": 4.309023857116699,
"learning_rate": 9.999807103210165e-06,
"loss": 1.2771,
"step": 1071
},
{
"epoch": 0.2636497786522381,
"grad_norm": 4.398236274719238,
"learning_rate": 9.999799892781548e-06,
"loss": 1.3409,
"step": 1072
},
{
"epoch": 0.26389572060993605,
"grad_norm": 4.044312953948975,
"learning_rate": 9.999792550055053e-06,
"loss": 1.0798,
"step": 1073
},
{
"epoch": 0.26414166256763405,
"grad_norm": 4.506936073303223,
"learning_rate": 9.999785075030869e-06,
"loss": 1.2719,
"step": 1074
},
{
"epoch": 0.264387604525332,
"grad_norm": 4.229895114898682,
"learning_rate": 9.999777467709197e-06,
"loss": 1.1702,
"step": 1075
},
{
"epoch": 0.26463354648303,
"grad_norm": 4.1754937171936035,
"learning_rate": 9.99976972809024e-06,
"loss": 1.3104,
"step": 1076
},
{
"epoch": 0.264879488440728,
"grad_norm": 4.377021312713623,
"learning_rate": 9.9997618561742e-06,
"loss": 1.3095,
"step": 1077
},
{
"epoch": 0.26512543039842595,
"grad_norm": 4.537478446960449,
"learning_rate": 9.999753851961285e-06,
"loss": 1.2267,
"step": 1078
},
{
"epoch": 0.26537137235612396,
"grad_norm": 3.954011917114258,
"learning_rate": 9.99974571545171e-06,
"loss": 1.1451,
"step": 1079
},
{
"epoch": 0.26561731431382196,
"grad_norm": 4.308267593383789,
"learning_rate": 9.999737446645685e-06,
"loss": 1.4331,
"step": 1080
},
{
"epoch": 0.2658632562715199,
"grad_norm": 4.663597106933594,
"learning_rate": 9.999729045543433e-06,
"loss": 1.243,
"step": 1081
},
{
"epoch": 0.2661091982292179,
"grad_norm": 3.9186594486236572,
"learning_rate": 9.999720512145176e-06,
"loss": 1.2241,
"step": 1082
},
{
"epoch": 0.26635514018691586,
"grad_norm": 4.098139762878418,
"learning_rate": 9.99971184645114e-06,
"loss": 1.1804,
"step": 1083
},
{
"epoch": 0.26660108214461387,
"grad_norm": 4.57953405380249,
"learning_rate": 9.999703048461554e-06,
"loss": 1.2829,
"step": 1084
},
{
"epoch": 0.26684702410231187,
"grad_norm": 4.3092756271362305,
"learning_rate": 9.999694118176646e-06,
"loss": 1.3622,
"step": 1085
},
{
"epoch": 0.2670929660600098,
"grad_norm": 4.05914831161499,
"learning_rate": 9.99968505559666e-06,
"loss": 1.247,
"step": 1086
},
{
"epoch": 0.2673389080177078,
"grad_norm": 4.135657787322998,
"learning_rate": 9.999675860721832e-06,
"loss": 1.246,
"step": 1087
},
{
"epoch": 0.2675848499754058,
"grad_norm": 4.446312427520752,
"learning_rate": 9.999666533552403e-06,
"loss": 1.32,
"step": 1088
},
{
"epoch": 0.2678307919331038,
"grad_norm": 4.449991226196289,
"learning_rate": 9.999657074088624e-06,
"loss": 1.238,
"step": 1089
},
{
"epoch": 0.2680767338908018,
"grad_norm": 4.564802169799805,
"learning_rate": 9.999647482330744e-06,
"loss": 1.524,
"step": 1090
},
{
"epoch": 0.2683226758484998,
"grad_norm": 4.357064247131348,
"learning_rate": 9.999637758279017e-06,
"loss": 1.2522,
"step": 1091
},
{
"epoch": 0.26856861780619773,
"grad_norm": 4.11984920501709,
"learning_rate": 9.999627901933697e-06,
"loss": 1.1165,
"step": 1092
},
{
"epoch": 0.26881455976389573,
"grad_norm": 4.4941325187683105,
"learning_rate": 9.99961791329505e-06,
"loss": 1.4415,
"step": 1093
},
{
"epoch": 0.2690605017215937,
"grad_norm": 3.9608211517333984,
"learning_rate": 9.999607792363338e-06,
"loss": 1.0824,
"step": 1094
},
{
"epoch": 0.2693064436792917,
"grad_norm": 4.34138822555542,
"learning_rate": 9.999597539138827e-06,
"loss": 1.2254,
"step": 1095
},
{
"epoch": 0.2695523856369897,
"grad_norm": 4.778192520141602,
"learning_rate": 9.999587153621791e-06,
"loss": 1.3306,
"step": 1096
},
{
"epoch": 0.26979832759468764,
"grad_norm": 4.221494674682617,
"learning_rate": 9.999576635812503e-06,
"loss": 1.1109,
"step": 1097
},
{
"epoch": 0.27004426955238564,
"grad_norm": 4.722675323486328,
"learning_rate": 9.999565985711244e-06,
"loss": 1.2592,
"step": 1098
},
{
"epoch": 0.27029021151008364,
"grad_norm": 4.350003719329834,
"learning_rate": 9.999555203318292e-06,
"loss": 1.1485,
"step": 1099
},
{
"epoch": 0.2705361534677816,
"grad_norm": 3.8532097339630127,
"learning_rate": 9.999544288633934e-06,
"loss": 1.1267,
"step": 1100
},
{
"epoch": 0.2707820954254796,
"grad_norm": 4.403013229370117,
"learning_rate": 9.999533241658461e-06,
"loss": 1.3251,
"step": 1101
},
{
"epoch": 0.27102803738317754,
"grad_norm": 4.656615257263184,
"learning_rate": 9.999522062392162e-06,
"loss": 1.2662,
"step": 1102
},
{
"epoch": 0.27127397934087555,
"grad_norm": 4.1301045417785645,
"learning_rate": 9.999510750835333e-06,
"loss": 1.2128,
"step": 1103
},
{
"epoch": 0.27151992129857355,
"grad_norm": 4.044670104980469,
"learning_rate": 9.999499306988276e-06,
"loss": 1.2913,
"step": 1104
},
{
"epoch": 0.2717658632562715,
"grad_norm": 4.900116443634033,
"learning_rate": 9.999487730851292e-06,
"loss": 1.344,
"step": 1105
},
{
"epoch": 0.2720118052139695,
"grad_norm": 4.42169189453125,
"learning_rate": 9.999476022424688e-06,
"loss": 1.34,
"step": 1106
},
{
"epoch": 0.2722577471716675,
"grad_norm": 4.5798258781433105,
"learning_rate": 9.999464181708772e-06,
"loss": 1.3602,
"step": 1107
},
{
"epoch": 0.27250368912936546,
"grad_norm": 4.649616241455078,
"learning_rate": 9.99945220870386e-06,
"loss": 1.2528,
"step": 1108
},
{
"epoch": 0.27274963108706346,
"grad_norm": 4.3989176750183105,
"learning_rate": 9.999440103410267e-06,
"loss": 1.281,
"step": 1109
},
{
"epoch": 0.27299557304476146,
"grad_norm": 4.325901508331299,
"learning_rate": 9.999427865828314e-06,
"loss": 1.3178,
"step": 1110
},
{
"epoch": 0.2732415150024594,
"grad_norm": 4.382983207702637,
"learning_rate": 9.999415495958325e-06,
"loss": 1.2847,
"step": 1111
},
{
"epoch": 0.2734874569601574,
"grad_norm": 4.392940521240234,
"learning_rate": 9.999402993800626e-06,
"loss": 1.3263,
"step": 1112
},
{
"epoch": 0.27373339891785536,
"grad_norm": 4.3835954666137695,
"learning_rate": 9.99939035935555e-06,
"loss": 1.165,
"step": 1113
},
{
"epoch": 0.27397934087555337,
"grad_norm": 4.3408637046813965,
"learning_rate": 9.999377592623428e-06,
"loss": 1.3033,
"step": 1114
},
{
"epoch": 0.27422528283325137,
"grad_norm": 4.2236833572387695,
"learning_rate": 9.999364693604602e-06,
"loss": 1.2588,
"step": 1115
},
{
"epoch": 0.2744712247909493,
"grad_norm": 4.09113073348999,
"learning_rate": 9.99935166229941e-06,
"loss": 1.2628,
"step": 1116
},
{
"epoch": 0.2747171667486473,
"grad_norm": 4.640167713165283,
"learning_rate": 9.9993384987082e-06,
"loss": 1.2142,
"step": 1117
},
{
"epoch": 0.2749631087063453,
"grad_norm": 3.9315268993377686,
"learning_rate": 9.999325202831315e-06,
"loss": 1.0318,
"step": 1118
},
{
"epoch": 0.2752090506640433,
"grad_norm": 4.185249328613281,
"learning_rate": 9.999311774669113e-06,
"loss": 1.3131,
"step": 1119
},
{
"epoch": 0.2754549926217413,
"grad_norm": 4.260148525238037,
"learning_rate": 9.999298214221944e-06,
"loss": 1.2961,
"step": 1120
},
{
"epoch": 0.2757009345794392,
"grad_norm": 4.448310375213623,
"learning_rate": 9.999284521490173e-06,
"loss": 1.3669,
"step": 1121
},
{
"epoch": 0.27594687653713723,
"grad_norm": 3.915771245956421,
"learning_rate": 9.999270696474154e-06,
"loss": 1.2281,
"step": 1122
},
{
"epoch": 0.27619281849483523,
"grad_norm": 4.664644241333008,
"learning_rate": 9.999256739174261e-06,
"loss": 1.2572,
"step": 1123
},
{
"epoch": 0.2764387604525332,
"grad_norm": 4.681879043579102,
"learning_rate": 9.999242649590859e-06,
"loss": 1.3402,
"step": 1124
},
{
"epoch": 0.2766847024102312,
"grad_norm": 4.34698486328125,
"learning_rate": 9.99922842772432e-06,
"loss": 1.2561,
"step": 1125
},
{
"epoch": 0.2769306443679292,
"grad_norm": 4.052037239074707,
"learning_rate": 9.999214073575024e-06,
"loss": 1.1905,
"step": 1126
},
{
"epoch": 0.27717658632562714,
"grad_norm": 4.684157371520996,
"learning_rate": 9.999199587143347e-06,
"loss": 1.3107,
"step": 1127
},
{
"epoch": 0.27742252828332514,
"grad_norm": 4.260867118835449,
"learning_rate": 9.999184968429676e-06,
"loss": 1.3682,
"step": 1128
},
{
"epoch": 0.2776684702410231,
"grad_norm": 4.477982521057129,
"learning_rate": 9.999170217434395e-06,
"loss": 1.288,
"step": 1129
},
{
"epoch": 0.2779144121987211,
"grad_norm": 3.91641902923584,
"learning_rate": 9.999155334157895e-06,
"loss": 1.1306,
"step": 1130
},
{
"epoch": 0.2781603541564191,
"grad_norm": 4.405613899230957,
"learning_rate": 9.999140318600569e-06,
"loss": 1.3036,
"step": 1131
},
{
"epoch": 0.27840629611411705,
"grad_norm": 5.041419506072998,
"learning_rate": 9.999125170762815e-06,
"loss": 1.377,
"step": 1132
},
{
"epoch": 0.27865223807181505,
"grad_norm": 4.5249786376953125,
"learning_rate": 9.999109890645037e-06,
"loss": 1.298,
"step": 1133
},
{
"epoch": 0.27889818002951305,
"grad_norm": 4.366484642028809,
"learning_rate": 9.999094478247635e-06,
"loss": 1.1988,
"step": 1134
},
{
"epoch": 0.279144121987211,
"grad_norm": 4.346078872680664,
"learning_rate": 9.999078933571017e-06,
"loss": 1.2557,
"step": 1135
},
{
"epoch": 0.279390063944909,
"grad_norm": 4.378028392791748,
"learning_rate": 9.999063256615598e-06,
"loss": 1.3308,
"step": 1136
},
{
"epoch": 0.279636005902607,
"grad_norm": 4.195530891418457,
"learning_rate": 9.999047447381788e-06,
"loss": 1.4006,
"step": 1137
},
{
"epoch": 0.27988194786030496,
"grad_norm": 5.043130397796631,
"learning_rate": 9.99903150587001e-06,
"loss": 1.2974,
"step": 1138
},
{
"epoch": 0.28012788981800296,
"grad_norm": 4.176356315612793,
"learning_rate": 9.999015432080681e-06,
"loss": 1.1809,
"step": 1139
},
{
"epoch": 0.2803738317757009,
"grad_norm": 4.18113899230957,
"learning_rate": 9.998999226014229e-06,
"loss": 1.231,
"step": 1140
},
{
"epoch": 0.2806197737333989,
"grad_norm": 4.399631500244141,
"learning_rate": 9.998982887671085e-06,
"loss": 1.1524,
"step": 1141
},
{
"epoch": 0.2808657156910969,
"grad_norm": 4.425261974334717,
"learning_rate": 9.998966417051678e-06,
"loss": 1.4063,
"step": 1142
},
{
"epoch": 0.28111165764879487,
"grad_norm": 3.8110508918762207,
"learning_rate": 9.998949814156445e-06,
"loss": 1.0404,
"step": 1143
},
{
"epoch": 0.28135759960649287,
"grad_norm": 4.220620632171631,
"learning_rate": 9.998933078985826e-06,
"loss": 1.0824,
"step": 1144
},
{
"epoch": 0.2816035415641909,
"grad_norm": 4.5974345207214355,
"learning_rate": 9.998916211540262e-06,
"loss": 1.2077,
"step": 1145
},
{
"epoch": 0.2818494835218888,
"grad_norm": 5.1596150398254395,
"learning_rate": 9.9988992118202e-06,
"loss": 1.1974,
"step": 1146
},
{
"epoch": 0.2820954254795868,
"grad_norm": 4.4813232421875,
"learning_rate": 9.998882079826092e-06,
"loss": 1.2457,
"step": 1147
},
{
"epoch": 0.2823413674372848,
"grad_norm": 4.763134479522705,
"learning_rate": 9.998864815558387e-06,
"loss": 1.3751,
"step": 1148
},
{
"epoch": 0.2825873093949828,
"grad_norm": 4.2806782722473145,
"learning_rate": 9.998847419017546e-06,
"loss": 1.1717,
"step": 1149
},
{
"epoch": 0.2828332513526808,
"grad_norm": 4.183906555175781,
"learning_rate": 9.998829890204027e-06,
"loss": 1.2999,
"step": 1150
},
{
"epoch": 0.28307919331037873,
"grad_norm": 3.9830563068389893,
"learning_rate": 9.998812229118293e-06,
"loss": 1.1804,
"step": 1151
},
{
"epoch": 0.28332513526807673,
"grad_norm": 3.895453453063965,
"learning_rate": 9.998794435760814e-06,
"loss": 1.1454,
"step": 1152
},
{
"epoch": 0.28357107722577474,
"grad_norm": 4.091982364654541,
"learning_rate": 9.99877651013206e-06,
"loss": 1.368,
"step": 1153
},
{
"epoch": 0.2838170191834727,
"grad_norm": 4.30715274810791,
"learning_rate": 9.998758452232505e-06,
"loss": 1.3169,
"step": 1154
},
{
"epoch": 0.2840629611411707,
"grad_norm": 3.9610517024993896,
"learning_rate": 9.998740262062626e-06,
"loss": 1.2716,
"step": 1155
},
{
"epoch": 0.2843089030988687,
"grad_norm": 3.8553271293640137,
"learning_rate": 9.998721939622907e-06,
"loss": 1.1709,
"step": 1156
},
{
"epoch": 0.28455484505656664,
"grad_norm": 4.264291763305664,
"learning_rate": 9.998703484913829e-06,
"loss": 1.2551,
"step": 1157
},
{
"epoch": 0.28480078701426464,
"grad_norm": 4.2499847412109375,
"learning_rate": 9.998684897935883e-06,
"loss": 1.2914,
"step": 1158
},
{
"epoch": 0.2850467289719626,
"grad_norm": 4.331030368804932,
"learning_rate": 9.99866617868956e-06,
"loss": 1.3131,
"step": 1159
},
{
"epoch": 0.2852926709296606,
"grad_norm": 4.358573913574219,
"learning_rate": 9.998647327175354e-06,
"loss": 1.202,
"step": 1160
},
{
"epoch": 0.2855386128873586,
"grad_norm": 4.345164775848389,
"learning_rate": 9.998628343393766e-06,
"loss": 1.2549,
"step": 1161
},
{
"epoch": 0.28578455484505655,
"grad_norm": 5.347385883331299,
"learning_rate": 9.998609227345299e-06,
"loss": 1.2416,
"step": 1162
},
{
"epoch": 0.28603049680275455,
"grad_norm": 4.474326133728027,
"learning_rate": 9.998589979030455e-06,
"loss": 1.1738,
"step": 1163
},
{
"epoch": 0.28627643876045256,
"grad_norm": 4.088446140289307,
"learning_rate": 9.998570598449747e-06,
"loss": 1.2226,
"step": 1164
},
{
"epoch": 0.2865223807181505,
"grad_norm": 4.188636779785156,
"learning_rate": 9.998551085603688e-06,
"loss": 1.2481,
"step": 1165
},
{
"epoch": 0.2867683226758485,
"grad_norm": 4.133874416351318,
"learning_rate": 9.998531440492788e-06,
"loss": 1.1769,
"step": 1166
},
{
"epoch": 0.28701426463354646,
"grad_norm": 4.1376051902771,
"learning_rate": 9.998511663117576e-06,
"loss": 1.2083,
"step": 1167
},
{
"epoch": 0.28726020659124446,
"grad_norm": 4.381062030792236,
"learning_rate": 9.99849175347857e-06,
"loss": 1.2095,
"step": 1168
},
{
"epoch": 0.28750614854894246,
"grad_norm": 3.930952548980713,
"learning_rate": 9.998471711576297e-06,
"loss": 1.2782,
"step": 1169
},
{
"epoch": 0.2877520905066404,
"grad_norm": 4.012092590332031,
"learning_rate": 9.998451537411289e-06,
"loss": 1.2507,
"step": 1170
},
{
"epoch": 0.2879980324643384,
"grad_norm": 3.8631670475006104,
"learning_rate": 9.998431230984078e-06,
"loss": 1.1198,
"step": 1171
},
{
"epoch": 0.2882439744220364,
"grad_norm": 4.268503665924072,
"learning_rate": 9.998410792295203e-06,
"loss": 1.3268,
"step": 1172
},
{
"epoch": 0.28848991637973437,
"grad_norm": 4.196653842926025,
"learning_rate": 9.998390221345203e-06,
"loss": 1.1431,
"step": 1173
},
{
"epoch": 0.28873585833743237,
"grad_norm": 4.212930679321289,
"learning_rate": 9.998369518134624e-06,
"loss": 1.1413,
"step": 1174
},
{
"epoch": 0.2889818002951304,
"grad_norm": 4.544216632843018,
"learning_rate": 9.998348682664014e-06,
"loss": 1.3041,
"step": 1175
},
{
"epoch": 0.2892277422528283,
"grad_norm": 4.2929840087890625,
"learning_rate": 9.998327714933922e-06,
"loss": 1.2045,
"step": 1176
},
{
"epoch": 0.2894736842105263,
"grad_norm": 4.4043169021606445,
"learning_rate": 9.998306614944905e-06,
"loss": 1.2777,
"step": 1177
},
{
"epoch": 0.2897196261682243,
"grad_norm": 4.274484157562256,
"learning_rate": 9.99828538269752e-06,
"loss": 1.2495,
"step": 1178
},
{
"epoch": 0.2899655681259223,
"grad_norm": 4.233809471130371,
"learning_rate": 9.99826401819233e-06,
"loss": 1.2744,
"step": 1179
},
{
"epoch": 0.2902115100836203,
"grad_norm": 4.184529781341553,
"learning_rate": 9.9982425214299e-06,
"loss": 1.3011,
"step": 1180
},
{
"epoch": 0.29045745204131823,
"grad_norm": 4.196940898895264,
"learning_rate": 9.9982208924108e-06,
"loss": 1.1951,
"step": 1181
},
{
"epoch": 0.29070339399901624,
"grad_norm": 4.34601354598999,
"learning_rate": 9.998199131135597e-06,
"loss": 1.2424,
"step": 1182
},
{
"epoch": 0.29094933595671424,
"grad_norm": 4.035940647125244,
"learning_rate": 9.998177237604874e-06,
"loss": 1.203,
"step": 1183
},
{
"epoch": 0.2911952779144122,
"grad_norm": 4.606171131134033,
"learning_rate": 9.998155211819207e-06,
"loss": 1.2398,
"step": 1184
},
{
"epoch": 0.2914412198721102,
"grad_norm": 4.273532867431641,
"learning_rate": 9.998133053779178e-06,
"loss": 1.2409,
"step": 1185
},
{
"epoch": 0.29168716182980814,
"grad_norm": 3.8666553497314453,
"learning_rate": 9.998110763485374e-06,
"loss": 1.1171,
"step": 1186
},
{
"epoch": 0.29193310378750614,
"grad_norm": 3.7808828353881836,
"learning_rate": 9.998088340938385e-06,
"loss": 1.2415,
"step": 1187
},
{
"epoch": 0.29217904574520415,
"grad_norm": 3.943721294403076,
"learning_rate": 9.998065786138804e-06,
"loss": 1.1737,
"step": 1188
},
{
"epoch": 0.2924249877029021,
"grad_norm": 4.628256320953369,
"learning_rate": 9.99804309908723e-06,
"loss": 1.4723,
"step": 1189
},
{
"epoch": 0.2926709296606001,
"grad_norm": 3.9521572589874268,
"learning_rate": 9.99802027978426e-06,
"loss": 1.3441,
"step": 1190
},
{
"epoch": 0.2929168716182981,
"grad_norm": 4.057553291320801,
"learning_rate": 9.997997328230498e-06,
"loss": 1.2866,
"step": 1191
},
{
"epoch": 0.29316281357599605,
"grad_norm": 4.300083637237549,
"learning_rate": 9.997974244426555e-06,
"loss": 1.307,
"step": 1192
},
{
"epoch": 0.29340875553369405,
"grad_norm": 4.545022010803223,
"learning_rate": 9.997951028373038e-06,
"loss": 1.3103,
"step": 1193
},
{
"epoch": 0.29365469749139206,
"grad_norm": 3.993496894836426,
"learning_rate": 9.997927680070563e-06,
"loss": 1.1781,
"step": 1194
},
{
"epoch": 0.29390063944909,
"grad_norm": 4.277355670928955,
"learning_rate": 9.997904199519748e-06,
"loss": 1.4363,
"step": 1195
},
{
"epoch": 0.294146581406788,
"grad_norm": 4.10087251663208,
"learning_rate": 9.997880586721213e-06,
"loss": 1.1875,
"step": 1196
},
{
"epoch": 0.29439252336448596,
"grad_norm": 4.600202560424805,
"learning_rate": 9.997856841675583e-06,
"loss": 1.2881,
"step": 1197
},
{
"epoch": 0.29463846532218396,
"grad_norm": 4.51039981842041,
"learning_rate": 9.997832964383489e-06,
"loss": 1.2599,
"step": 1198
},
{
"epoch": 0.29488440727988197,
"grad_norm": 4.206743240356445,
"learning_rate": 9.997808954845559e-06,
"loss": 1.3924,
"step": 1199
},
{
"epoch": 0.2951303492375799,
"grad_norm": 3.759392261505127,
"learning_rate": 9.99778481306243e-06,
"loss": 1.1605,
"step": 1200
},
{
"epoch": 0.2953762911952779,
"grad_norm": 4.324453830718994,
"learning_rate": 9.997760539034742e-06,
"loss": 1.3562,
"step": 1201
},
{
"epoch": 0.2956222331529759,
"grad_norm": 4.06655740737915,
"learning_rate": 9.997736132763135e-06,
"loss": 1.214,
"step": 1202
},
{
"epoch": 0.29586817511067387,
"grad_norm": 3.814960241317749,
"learning_rate": 9.997711594248259e-06,
"loss": 1.1828,
"step": 1203
},
{
"epoch": 0.2961141170683719,
"grad_norm": 6.979526519775391,
"learning_rate": 9.997686923490757e-06,
"loss": 1.2897,
"step": 1204
},
{
"epoch": 0.2963600590260698,
"grad_norm": 4.219638824462891,
"learning_rate": 9.997662120491286e-06,
"loss": 1.1734,
"step": 1205
},
{
"epoch": 0.2966060009837678,
"grad_norm": 4.423089981079102,
"learning_rate": 9.997637185250502e-06,
"loss": 1.3045,
"step": 1206
},
{
"epoch": 0.29685194294146583,
"grad_norm": 4.261505126953125,
"learning_rate": 9.997612117769064e-06,
"loss": 1.193,
"step": 1207
},
{
"epoch": 0.2970978848991638,
"grad_norm": 3.9490087032318115,
"learning_rate": 9.997586918047635e-06,
"loss": 1.2561,
"step": 1208
},
{
"epoch": 0.2973438268568618,
"grad_norm": 4.19813346862793,
"learning_rate": 9.997561586086881e-06,
"loss": 1.266,
"step": 1209
},
{
"epoch": 0.2975897688145598,
"grad_norm": 4.263858318328857,
"learning_rate": 9.997536121887476e-06,
"loss": 1.1961,
"step": 1210
},
{
"epoch": 0.29783571077225773,
"grad_norm": 4.010316848754883,
"learning_rate": 9.99751052545009e-06,
"loss": 1.3077,
"step": 1211
},
{
"epoch": 0.29808165272995574,
"grad_norm": 4.499833583831787,
"learning_rate": 9.997484796775403e-06,
"loss": 1.2655,
"step": 1212
},
{
"epoch": 0.29832759468765374,
"grad_norm": 4.0379204750061035,
"learning_rate": 9.997458935864093e-06,
"loss": 1.3278,
"step": 1213
},
{
"epoch": 0.2985735366453517,
"grad_norm": 3.8807849884033203,
"learning_rate": 9.997432942716844e-06,
"loss": 1.1821,
"step": 1214
},
{
"epoch": 0.2988194786030497,
"grad_norm": 4.793801784515381,
"learning_rate": 9.997406817334349e-06,
"loss": 1.3143,
"step": 1215
},
{
"epoch": 0.29906542056074764,
"grad_norm": 3.9666733741760254,
"learning_rate": 9.997380559717295e-06,
"loss": 1.2607,
"step": 1216
},
{
"epoch": 0.29931136251844564,
"grad_norm": 3.869178056716919,
"learning_rate": 9.997354169866379e-06,
"loss": 1.2386,
"step": 1217
},
{
"epoch": 0.29955730447614365,
"grad_norm": 4.417367935180664,
"learning_rate": 9.997327647782294e-06,
"loss": 1.358,
"step": 1218
},
{
"epoch": 0.2998032464338416,
"grad_norm": 4.0709943771362305,
"learning_rate": 9.997300993465748e-06,
"loss": 1.202,
"step": 1219
},
{
"epoch": 0.3000491883915396,
"grad_norm": 3.9917142391204834,
"learning_rate": 9.997274206917444e-06,
"loss": 1.0634,
"step": 1220
},
{
"epoch": 0.3002951303492376,
"grad_norm": 4.313510417938232,
"learning_rate": 9.997247288138093e-06,
"loss": 1.2273,
"step": 1221
},
{
"epoch": 0.30054107230693555,
"grad_norm": 4.059592247009277,
"learning_rate": 9.997220237128403e-06,
"loss": 1.1488,
"step": 1222
},
{
"epoch": 0.30078701426463356,
"grad_norm": 4.326138496398926,
"learning_rate": 9.99719305388909e-06,
"loss": 1.2053,
"step": 1223
},
{
"epoch": 0.3010329562223315,
"grad_norm": 4.248202323913574,
"learning_rate": 9.997165738420878e-06,
"loss": 1.2036,
"step": 1224
},
{
"epoch": 0.3012788981800295,
"grad_norm": 4.397487163543701,
"learning_rate": 9.997138290724484e-06,
"loss": 1.3825,
"step": 1225
},
{
"epoch": 0.3015248401377275,
"grad_norm": 4.879924297332764,
"learning_rate": 9.99711071080064e-06,
"loss": 1.2256,
"step": 1226
},
{
"epoch": 0.30177078209542546,
"grad_norm": 4.822386264801025,
"learning_rate": 9.997082998650073e-06,
"loss": 1.3308,
"step": 1227
},
{
"epoch": 0.30201672405312346,
"grad_norm": 4.245731353759766,
"learning_rate": 9.997055154273516e-06,
"loss": 1.2732,
"step": 1228
},
{
"epoch": 0.30226266601082147,
"grad_norm": 4.362673282623291,
"learning_rate": 9.997027177671705e-06,
"loss": 1.0778,
"step": 1229
},
{
"epoch": 0.3025086079685194,
"grad_norm": 4.876657485961914,
"learning_rate": 9.996999068845383e-06,
"loss": 1.3872,
"step": 1230
},
{
"epoch": 0.3027545499262174,
"grad_norm": 4.238354206085205,
"learning_rate": 9.99697082779529e-06,
"loss": 1.2482,
"step": 1231
},
{
"epoch": 0.30300049188391537,
"grad_norm": 4.3712897300720215,
"learning_rate": 9.996942454522176e-06,
"loss": 1.4143,
"step": 1232
},
{
"epoch": 0.30324643384161337,
"grad_norm": 4.201569080352783,
"learning_rate": 9.996913949026791e-06,
"loss": 1.2437,
"step": 1233
},
{
"epoch": 0.3034923757993114,
"grad_norm": 4.147704601287842,
"learning_rate": 9.996885311309892e-06,
"loss": 1.2369,
"step": 1234
},
{
"epoch": 0.3037383177570093,
"grad_norm": 4.359009742736816,
"learning_rate": 9.996856541372233e-06,
"loss": 1.1618,
"step": 1235
},
{
"epoch": 0.30398425971470733,
"grad_norm": 4.279295444488525,
"learning_rate": 9.996827639214575e-06,
"loss": 1.1679,
"step": 1236
},
{
"epoch": 0.30423020167240533,
"grad_norm": 4.618326663970947,
"learning_rate": 9.996798604837686e-06,
"loss": 1.2155,
"step": 1237
},
{
"epoch": 0.3044761436301033,
"grad_norm": 4.427160739898682,
"learning_rate": 9.996769438242332e-06,
"loss": 1.2843,
"step": 1238
},
{
"epoch": 0.3047220855878013,
"grad_norm": 3.919158458709717,
"learning_rate": 9.996740139429285e-06,
"loss": 1.1956,
"step": 1239
},
{
"epoch": 0.3049680275454993,
"grad_norm": 3.995673656463623,
"learning_rate": 9.996710708399322e-06,
"loss": 1.2518,
"step": 1240
},
{
"epoch": 0.30521396950319724,
"grad_norm": 3.7355141639709473,
"learning_rate": 9.996681145153217e-06,
"loss": 1.1439,
"step": 1241
},
{
"epoch": 0.30545991146089524,
"grad_norm": 3.814750909805298,
"learning_rate": 9.996651449691758e-06,
"loss": 1.2576,
"step": 1242
},
{
"epoch": 0.3057058534185932,
"grad_norm": 4.299772262573242,
"learning_rate": 9.996621622015729e-06,
"loss": 1.3482,
"step": 1243
},
{
"epoch": 0.3059517953762912,
"grad_norm": 4.593987464904785,
"learning_rate": 9.996591662125918e-06,
"loss": 1.493,
"step": 1244
},
{
"epoch": 0.3061977373339892,
"grad_norm": 3.9475653171539307,
"learning_rate": 9.996561570023118e-06,
"loss": 1.2791,
"step": 1245
},
{
"epoch": 0.30644367929168714,
"grad_norm": 3.856114387512207,
"learning_rate": 9.996531345708125e-06,
"loss": 1.4271,
"step": 1246
},
{
"epoch": 0.30668962124938515,
"grad_norm": 4.161723613739014,
"learning_rate": 9.99650098918174e-06,
"loss": 1.2638,
"step": 1247
},
{
"epoch": 0.30693556320708315,
"grad_norm": 3.564053535461426,
"learning_rate": 9.996470500444766e-06,
"loss": 1.1701,
"step": 1248
},
{
"epoch": 0.3071815051647811,
"grad_norm": 4.256925582885742,
"learning_rate": 9.99643987949801e-06,
"loss": 1.1984,
"step": 1249
},
{
"epoch": 0.3074274471224791,
"grad_norm": 4.640268325805664,
"learning_rate": 9.99640912634228e-06,
"loss": 1.2591,
"step": 1250
},
{
"epoch": 0.30767338908017705,
"grad_norm": 4.548612117767334,
"learning_rate": 9.996378240978393e-06,
"loss": 1.2289,
"step": 1251
},
{
"epoch": 0.30791933103787505,
"grad_norm": 4.342689037322998,
"learning_rate": 9.996347223407164e-06,
"loss": 1.3137,
"step": 1252
},
{
"epoch": 0.30816527299557306,
"grad_norm": 4.011205196380615,
"learning_rate": 9.996316073629413e-06,
"loss": 1.1546,
"step": 1253
},
{
"epoch": 0.308411214953271,
"grad_norm": 4.1350016593933105,
"learning_rate": 9.996284791645967e-06,
"loss": 1.3098,
"step": 1254
},
{
"epoch": 0.308657156910969,
"grad_norm": 3.961027145385742,
"learning_rate": 9.996253377457654e-06,
"loss": 1.2332,
"step": 1255
},
{
"epoch": 0.308903098868667,
"grad_norm": 4.201021671295166,
"learning_rate": 9.9962218310653e-06,
"loss": 1.2276,
"step": 1256
},
{
"epoch": 0.30914904082636496,
"grad_norm": 3.944270372390747,
"learning_rate": 9.996190152469747e-06,
"loss": 1.2201,
"step": 1257
},
{
"epoch": 0.30939498278406297,
"grad_norm": 4.422118663787842,
"learning_rate": 9.996158341671826e-06,
"loss": 1.2854,
"step": 1258
},
{
"epoch": 0.30964092474176097,
"grad_norm": 3.9929332733154297,
"learning_rate": 9.996126398672386e-06,
"loss": 1.2281,
"step": 1259
},
{
"epoch": 0.3098868666994589,
"grad_norm": 4.106986999511719,
"learning_rate": 9.996094323472265e-06,
"loss": 1.1608,
"step": 1260
},
{
"epoch": 0.3101328086571569,
"grad_norm": 4.0601959228515625,
"learning_rate": 9.996062116072318e-06,
"loss": 1.2239,
"step": 1261
},
{
"epoch": 0.31037875061485487,
"grad_norm": 4.082971096038818,
"learning_rate": 9.996029776473393e-06,
"loss": 1.2239,
"step": 1262
},
{
"epoch": 0.3106246925725529,
"grad_norm": 3.8583247661590576,
"learning_rate": 9.995997304676347e-06,
"loss": 1.1466,
"step": 1263
},
{
"epoch": 0.3108706345302509,
"grad_norm": 4.343196868896484,
"learning_rate": 9.995964700682041e-06,
"loss": 1.2452,
"step": 1264
},
{
"epoch": 0.3111165764879488,
"grad_norm": 4.781988143920898,
"learning_rate": 9.995931964491336e-06,
"loss": 1.2239,
"step": 1265
},
{
"epoch": 0.31136251844564683,
"grad_norm": 4.25830602645874,
"learning_rate": 9.995899096105098e-06,
"loss": 1.4255,
"step": 1266
},
{
"epoch": 0.31160846040334483,
"grad_norm": 4.0899176597595215,
"learning_rate": 9.995866095524195e-06,
"loss": 1.3061,
"step": 1267
},
{
"epoch": 0.3118544023610428,
"grad_norm": 4.066812515258789,
"learning_rate": 9.995832962749506e-06,
"loss": 1.3416,
"step": 1268
},
{
"epoch": 0.3121003443187408,
"grad_norm": 4.409636974334717,
"learning_rate": 9.995799697781902e-06,
"loss": 1.238,
"step": 1269
},
{
"epoch": 0.31234628627643873,
"grad_norm": 4.350776672363281,
"learning_rate": 9.995766300622266e-06,
"loss": 1.403,
"step": 1270
},
{
"epoch": 0.31259222823413674,
"grad_norm": 3.612041711807251,
"learning_rate": 9.99573277127148e-06,
"loss": 1.2664,
"step": 1271
},
{
"epoch": 0.31283817019183474,
"grad_norm": 4.331789970397949,
"learning_rate": 9.995699109730432e-06,
"loss": 1.2944,
"step": 1272
},
{
"epoch": 0.3130841121495327,
"grad_norm": 4.189482688903809,
"learning_rate": 9.995665316000014e-06,
"loss": 1.2094,
"step": 1273
},
{
"epoch": 0.3133300541072307,
"grad_norm": 3.781536102294922,
"learning_rate": 9.99563139008112e-06,
"loss": 1.2159,
"step": 1274
},
{
"epoch": 0.3135759960649287,
"grad_norm": 4.323209285736084,
"learning_rate": 9.995597331974645e-06,
"loss": 1.4003,
"step": 1275
},
{
"epoch": 0.31382193802262665,
"grad_norm": 4.736751556396484,
"learning_rate": 9.995563141681492e-06,
"loss": 1.3872,
"step": 1276
},
{
"epoch": 0.31406787998032465,
"grad_norm": 4.206684112548828,
"learning_rate": 9.995528819202566e-06,
"loss": 1.1693,
"step": 1277
},
{
"epoch": 0.31431382193802265,
"grad_norm": 4.659973621368408,
"learning_rate": 9.995494364538775e-06,
"loss": 1.3435,
"step": 1278
},
{
"epoch": 0.3145597638957206,
"grad_norm": 4.026401519775391,
"learning_rate": 9.995459777691033e-06,
"loss": 1.2667,
"step": 1279
},
{
"epoch": 0.3148057058534186,
"grad_norm": 4.169907093048096,
"learning_rate": 9.995425058660248e-06,
"loss": 1.2207,
"step": 1280
},
{
"epoch": 0.31505164781111655,
"grad_norm": 4.400188446044922,
"learning_rate": 9.995390207447346e-06,
"loss": 1.2804,
"step": 1281
},
{
"epoch": 0.31529758976881456,
"grad_norm": 4.134114742279053,
"learning_rate": 9.995355224053246e-06,
"loss": 1.2129,
"step": 1282
},
{
"epoch": 0.31554353172651256,
"grad_norm": 4.146888256072998,
"learning_rate": 9.995320108478876e-06,
"loss": 1.194,
"step": 1283
},
{
"epoch": 0.3157894736842105,
"grad_norm": 3.903210163116455,
"learning_rate": 9.995284860725162e-06,
"loss": 1.2665,
"step": 1284
},
{
"epoch": 0.3160354156419085,
"grad_norm": 4.293799877166748,
"learning_rate": 9.99524948079304e-06,
"loss": 1.2312,
"step": 1285
},
{
"epoch": 0.3162813575996065,
"grad_norm": 4.2587809562683105,
"learning_rate": 9.99521396868344e-06,
"loss": 1.3389,
"step": 1286
},
{
"epoch": 0.31652729955730446,
"grad_norm": 4.651932239532471,
"learning_rate": 9.99517832439731e-06,
"loss": 1.4016,
"step": 1287
},
{
"epoch": 0.31677324151500247,
"grad_norm": 4.7913289070129395,
"learning_rate": 9.99514254793559e-06,
"loss": 1.2286,
"step": 1288
},
{
"epoch": 0.3170191834727004,
"grad_norm": 4.092424392700195,
"learning_rate": 9.995106639299223e-06,
"loss": 1.1465,
"step": 1289
},
{
"epoch": 0.3172651254303984,
"grad_norm": 4.386559009552002,
"learning_rate": 9.995070598489162e-06,
"loss": 1.3311,
"step": 1290
},
{
"epoch": 0.3175110673880964,
"grad_norm": 4.327492713928223,
"learning_rate": 9.995034425506362e-06,
"loss": 1.1809,
"step": 1291
},
{
"epoch": 0.3177570093457944,
"grad_norm": 4.133778095245361,
"learning_rate": 9.99499812035178e-06,
"loss": 1.2841,
"step": 1292
},
{
"epoch": 0.3180029513034924,
"grad_norm": 3.881964683532715,
"learning_rate": 9.994961683026374e-06,
"loss": 1.2475,
"step": 1293
},
{
"epoch": 0.3182488932611904,
"grad_norm": 4.202156066894531,
"learning_rate": 9.99492511353111e-06,
"loss": 1.2908,
"step": 1294
},
{
"epoch": 0.31849483521888833,
"grad_norm": 4.173476696014404,
"learning_rate": 9.994888411866954e-06,
"loss": 1.32,
"step": 1295
},
{
"epoch": 0.31874077717658633,
"grad_norm": 4.074321269989014,
"learning_rate": 9.994851578034881e-06,
"loss": 1.2127,
"step": 1296
},
{
"epoch": 0.31898671913428434,
"grad_norm": 4.389969348907471,
"learning_rate": 9.994814612035861e-06,
"loss": 1.2994,
"step": 1297
},
{
"epoch": 0.3192326610919823,
"grad_norm": 4.2776312828063965,
"learning_rate": 9.994777513870874e-06,
"loss": 1.4241,
"step": 1298
},
{
"epoch": 0.3194786030496803,
"grad_norm": 3.7780511379241943,
"learning_rate": 9.994740283540905e-06,
"loss": 1.257,
"step": 1299
},
{
"epoch": 0.31972454500737824,
"grad_norm": 4.588326454162598,
"learning_rate": 9.994702921046933e-06,
"loss": 1.3324,
"step": 1300
},
{
"epoch": 0.31997048696507624,
"grad_norm": 3.961690664291382,
"learning_rate": 9.99466542638995e-06,
"loss": 1.2543,
"step": 1301
},
{
"epoch": 0.32021642892277424,
"grad_norm": 4.337667465209961,
"learning_rate": 9.994627799570948e-06,
"loss": 1.1971,
"step": 1302
},
{
"epoch": 0.3204623708804722,
"grad_norm": 4.395909309387207,
"learning_rate": 9.994590040590923e-06,
"loss": 1.3549,
"step": 1303
},
{
"epoch": 0.3207083128381702,
"grad_norm": 4.477030277252197,
"learning_rate": 9.994552149450874e-06,
"loss": 1.2307,
"step": 1304
},
{
"epoch": 0.3209542547958682,
"grad_norm": 4.24371862411499,
"learning_rate": 9.994514126151802e-06,
"loss": 1.2949,
"step": 1305
},
{
"epoch": 0.32120019675356615,
"grad_norm": 4.316308498382568,
"learning_rate": 9.994475970694715e-06,
"loss": 1.2164,
"step": 1306
},
{
"epoch": 0.32144613871126415,
"grad_norm": 4.149338722229004,
"learning_rate": 9.994437683080621e-06,
"loss": 1.1092,
"step": 1307
},
{
"epoch": 0.3216920806689621,
"grad_norm": 4.729922771453857,
"learning_rate": 9.994399263310537e-06,
"loss": 1.3362,
"step": 1308
},
{
"epoch": 0.3219380226266601,
"grad_norm": 4.53164529800415,
"learning_rate": 9.994360711385474e-06,
"loss": 1.3755,
"step": 1309
},
{
"epoch": 0.3221839645843581,
"grad_norm": 4.198465347290039,
"learning_rate": 9.994322027306454e-06,
"loss": 1.2217,
"step": 1310
},
{
"epoch": 0.32242990654205606,
"grad_norm": 4.4573845863342285,
"learning_rate": 9.994283211074504e-06,
"loss": 1.1942,
"step": 1311
},
{
"epoch": 0.32267584849975406,
"grad_norm": 4.0180559158325195,
"learning_rate": 9.994244262690647e-06,
"loss": 1.0172,
"step": 1312
},
{
"epoch": 0.32292179045745206,
"grad_norm": 4.2799153327941895,
"learning_rate": 9.994205182155915e-06,
"loss": 1.3071,
"step": 1313
},
{
"epoch": 0.32316773241515,
"grad_norm": 3.9545605182647705,
"learning_rate": 9.994165969471342e-06,
"loss": 1.2504,
"step": 1314
},
{
"epoch": 0.323413674372848,
"grad_norm": 4.3432230949401855,
"learning_rate": 9.994126624637965e-06,
"loss": 1.3261,
"step": 1315
},
{
"epoch": 0.323659616330546,
"grad_norm": 4.611858367919922,
"learning_rate": 9.994087147656827e-06,
"loss": 1.4207,
"step": 1316
},
{
"epoch": 0.32390555828824397,
"grad_norm": 3.8514256477355957,
"learning_rate": 9.994047538528972e-06,
"loss": 1.231,
"step": 1317
},
{
"epoch": 0.32415150024594197,
"grad_norm": 4.190978527069092,
"learning_rate": 9.994007797255447e-06,
"loss": 1.3506,
"step": 1318
},
{
"epoch": 0.3243974422036399,
"grad_norm": 3.9053215980529785,
"learning_rate": 9.993967923837305e-06,
"loss": 1.2009,
"step": 1319
},
{
"epoch": 0.3246433841613379,
"grad_norm": 4.218817234039307,
"learning_rate": 9.993927918275599e-06,
"loss": 1.1893,
"step": 1320
},
{
"epoch": 0.3248893261190359,
"grad_norm": 3.493891716003418,
"learning_rate": 9.993887780571387e-06,
"loss": 1.0372,
"step": 1321
},
{
"epoch": 0.3251352680767339,
"grad_norm": 3.98374080657959,
"learning_rate": 9.993847510725737e-06,
"loss": 1.1895,
"step": 1322
},
{
"epoch": 0.3253812100344319,
"grad_norm": 4.107308387756348,
"learning_rate": 9.993807108739706e-06,
"loss": 1.2678,
"step": 1323
},
{
"epoch": 0.3256271519921299,
"grad_norm": 4.114229679107666,
"learning_rate": 9.993766574614369e-06,
"loss": 1.3061,
"step": 1324
},
{
"epoch": 0.32587309394982783,
"grad_norm": 4.128052711486816,
"learning_rate": 9.993725908350798e-06,
"loss": 1.2471,
"step": 1325
},
{
"epoch": 0.32611903590752583,
"grad_norm": 4.2025041580200195,
"learning_rate": 9.993685109950068e-06,
"loss": 1.1773,
"step": 1326
},
{
"epoch": 0.3263649778652238,
"grad_norm": 4.171741008758545,
"learning_rate": 9.993644179413258e-06,
"loss": 1.2983,
"step": 1327
},
{
"epoch": 0.3266109198229218,
"grad_norm": 3.887896776199341,
"learning_rate": 9.993603116741452e-06,
"loss": 1.2767,
"step": 1328
},
{
"epoch": 0.3268568617806198,
"grad_norm": 3.936475992202759,
"learning_rate": 9.993561921935736e-06,
"loss": 1.2077,
"step": 1329
},
{
"epoch": 0.32710280373831774,
"grad_norm": 4.181597709655762,
"learning_rate": 9.993520594997201e-06,
"loss": 1.3604,
"step": 1330
},
{
"epoch": 0.32734874569601574,
"grad_norm": 4.368448734283447,
"learning_rate": 9.993479135926937e-06,
"loss": 1.401,
"step": 1331
},
{
"epoch": 0.32759468765371375,
"grad_norm": 3.5575497150421143,
"learning_rate": 9.993437544726048e-06,
"loss": 1.1633,
"step": 1332
},
{
"epoch": 0.3278406296114117,
"grad_norm": 3.974323034286499,
"learning_rate": 9.993395821395628e-06,
"loss": 1.131,
"step": 1333
},
{
"epoch": 0.3280865715691097,
"grad_norm": 4.418570518493652,
"learning_rate": 9.993353965936783e-06,
"loss": 1.3805,
"step": 1334
},
{
"epoch": 0.3283325135268077,
"grad_norm": 4.0409255027771,
"learning_rate": 9.993311978350621e-06,
"loss": 1.3599,
"step": 1335
},
{
"epoch": 0.32857845548450565,
"grad_norm": 4.313515663146973,
"learning_rate": 9.993269858638254e-06,
"loss": 1.3028,
"step": 1336
},
{
"epoch": 0.32882439744220365,
"grad_norm": 4.014878749847412,
"learning_rate": 9.993227606800795e-06,
"loss": 1.172,
"step": 1337
},
{
"epoch": 0.3290703393999016,
"grad_norm": 3.8833000659942627,
"learning_rate": 9.99318522283936e-06,
"loss": 1.1984,
"step": 1338
},
{
"epoch": 0.3293162813575996,
"grad_norm": 3.8810505867004395,
"learning_rate": 9.993142706755076e-06,
"loss": 1.1105,
"step": 1339
},
{
"epoch": 0.3295622233152976,
"grad_norm": 4.059360027313232,
"learning_rate": 9.993100058549065e-06,
"loss": 1.3371,
"step": 1340
},
{
"epoch": 0.32980816527299556,
"grad_norm": 4.108826160430908,
"learning_rate": 9.993057278222454e-06,
"loss": 1.2654,
"step": 1341
},
{
"epoch": 0.33005410723069356,
"grad_norm": 4.094167232513428,
"learning_rate": 9.993014365776376e-06,
"loss": 1.1415,
"step": 1342
},
{
"epoch": 0.33030004918839156,
"grad_norm": 4.271787166595459,
"learning_rate": 9.992971321211968e-06,
"loss": 1.2282,
"step": 1343
},
{
"epoch": 0.3305459911460895,
"grad_norm": 3.92555570602417,
"learning_rate": 9.992928144530369e-06,
"loss": 1.1437,
"step": 1344
},
{
"epoch": 0.3307919331037875,
"grad_norm": 4.119413375854492,
"learning_rate": 9.992884835732718e-06,
"loss": 1.1619,
"step": 1345
},
{
"epoch": 0.33103787506148546,
"grad_norm": 3.9443857669830322,
"learning_rate": 9.992841394820165e-06,
"loss": 1.2384,
"step": 1346
},
{
"epoch": 0.33128381701918347,
"grad_norm": 4.391610622406006,
"learning_rate": 9.992797821793858e-06,
"loss": 1.1193,
"step": 1347
},
{
"epoch": 0.3315297589768815,
"grad_norm": 3.848011016845703,
"learning_rate": 9.992754116654947e-06,
"loss": 1.1819,
"step": 1348
},
{
"epoch": 0.3317757009345794,
"grad_norm": 4.46571683883667,
"learning_rate": 9.992710279404594e-06,
"loss": 1.2933,
"step": 1349
},
{
"epoch": 0.3320216428922774,
"grad_norm": 4.071078300476074,
"learning_rate": 9.992666310043956e-06,
"loss": 1.4274,
"step": 1350
},
{
"epoch": 0.33226758484997543,
"grad_norm": 4.100854873657227,
"learning_rate": 9.992622208574196e-06,
"loss": 1.1828,
"step": 1351
},
{
"epoch": 0.3325135268076734,
"grad_norm": 4.069943904876709,
"learning_rate": 9.992577974996483e-06,
"loss": 1.2885,
"step": 1352
},
{
"epoch": 0.3327594687653714,
"grad_norm": 3.992279052734375,
"learning_rate": 9.992533609311983e-06,
"loss": 1.2257,
"step": 1353
},
{
"epoch": 0.33300541072306933,
"grad_norm": 4.254970073699951,
"learning_rate": 9.992489111521877e-06,
"loss": 1.3992,
"step": 1354
},
{
"epoch": 0.33325135268076733,
"grad_norm": 3.892814874649048,
"learning_rate": 9.992444481627336e-06,
"loss": 1.2388,
"step": 1355
},
{
"epoch": 0.33349729463846534,
"grad_norm": 3.9065937995910645,
"learning_rate": 9.992399719629544e-06,
"loss": 1.3842,
"step": 1356
},
{
"epoch": 0.3337432365961633,
"grad_norm": 3.999619483947754,
"learning_rate": 9.992354825529685e-06,
"loss": 1.2764,
"step": 1357
},
{
"epoch": 0.3339891785538613,
"grad_norm": 3.7184202671051025,
"learning_rate": 9.992309799328947e-06,
"loss": 1.2291,
"step": 1358
},
{
"epoch": 0.3342351205115593,
"grad_norm": 3.8028557300567627,
"learning_rate": 9.99226464102852e-06,
"loss": 1.1917,
"step": 1359
},
{
"epoch": 0.33448106246925724,
"grad_norm": 4.003077983856201,
"learning_rate": 9.992219350629601e-06,
"loss": 1.2015,
"step": 1360
},
{
"epoch": 0.33472700442695524,
"grad_norm": 3.4393386840820312,
"learning_rate": 9.992173928133387e-06,
"loss": 1.0227,
"step": 1361
},
{
"epoch": 0.33497294638465325,
"grad_norm": 4.556160926818848,
"learning_rate": 9.992128373541081e-06,
"loss": 1.2913,
"step": 1362
},
{
"epoch": 0.3352188883423512,
"grad_norm": 4.010769367218018,
"learning_rate": 9.992082686853889e-06,
"loss": 1.1953,
"step": 1363
},
{
"epoch": 0.3354648303000492,
"grad_norm": 4.413011074066162,
"learning_rate": 9.992036868073016e-06,
"loss": 1.1555,
"step": 1364
},
{
"epoch": 0.33571077225774715,
"grad_norm": 4.374898433685303,
"learning_rate": 9.991990917199679e-06,
"loss": 1.2499,
"step": 1365
},
{
"epoch": 0.33595671421544515,
"grad_norm": 4.458083629608154,
"learning_rate": 9.991944834235093e-06,
"loss": 1.2151,
"step": 1366
},
{
"epoch": 0.33620265617314316,
"grad_norm": 4.454595565795898,
"learning_rate": 9.991898619180474e-06,
"loss": 1.3037,
"step": 1367
},
{
"epoch": 0.3364485981308411,
"grad_norm": 3.506692409515381,
"learning_rate": 9.991852272037049e-06,
"loss": 1.1109,
"step": 1368
},
{
"epoch": 0.3366945400885391,
"grad_norm": 3.8959076404571533,
"learning_rate": 9.991805792806042e-06,
"loss": 1.3798,
"step": 1369
},
{
"epoch": 0.3369404820462371,
"grad_norm": 4.032749652862549,
"learning_rate": 9.991759181488682e-06,
"loss": 1.3206,
"step": 1370
},
{
"epoch": 0.33718642400393506,
"grad_norm": 4.060688495635986,
"learning_rate": 9.991712438086205e-06,
"loss": 1.1322,
"step": 1371
},
{
"epoch": 0.33743236596163306,
"grad_norm": 4.409182071685791,
"learning_rate": 9.991665562599848e-06,
"loss": 1.2563,
"step": 1372
},
{
"epoch": 0.337678307919331,
"grad_norm": 4.322493553161621,
"learning_rate": 9.991618555030848e-06,
"loss": 1.4164,
"step": 1373
},
{
"epoch": 0.337924249877029,
"grad_norm": 3.683150053024292,
"learning_rate": 9.991571415380453e-06,
"loss": 1.1207,
"step": 1374
},
{
"epoch": 0.338170191834727,
"grad_norm": 4.255983829498291,
"learning_rate": 9.991524143649908e-06,
"loss": 1.1968,
"step": 1375
},
{
"epoch": 0.33841613379242497,
"grad_norm": 3.9996345043182373,
"learning_rate": 9.991476739840464e-06,
"loss": 1.2829,
"step": 1376
},
{
"epoch": 0.33866207575012297,
"grad_norm": 4.311654090881348,
"learning_rate": 9.991429203953375e-06,
"loss": 1.2637,
"step": 1377
},
{
"epoch": 0.338908017707821,
"grad_norm": 3.941500425338745,
"learning_rate": 9.9913815359899e-06,
"loss": 1.1789,
"step": 1378
},
{
"epoch": 0.3391539596655189,
"grad_norm": 3.928452253341675,
"learning_rate": 9.991333735951299e-06,
"loss": 1.221,
"step": 1379
},
{
"epoch": 0.3393999016232169,
"grad_norm": 4.296792984008789,
"learning_rate": 9.991285803838837e-06,
"loss": 1.2511,
"step": 1380
},
{
"epoch": 0.33964584358091493,
"grad_norm": 4.0416107177734375,
"learning_rate": 9.991237739653782e-06,
"loss": 1.1133,
"step": 1381
},
{
"epoch": 0.3398917855386129,
"grad_norm": 4.5701189041137695,
"learning_rate": 9.991189543397408e-06,
"loss": 1.3019,
"step": 1382
},
{
"epoch": 0.3401377274963109,
"grad_norm": 4.078940391540527,
"learning_rate": 9.99114121507099e-06,
"loss": 1.2318,
"step": 1383
},
{
"epoch": 0.34038366945400883,
"grad_norm": 4.5263142585754395,
"learning_rate": 9.991092754675803e-06,
"loss": 1.2763,
"step": 1384
},
{
"epoch": 0.34062961141170683,
"grad_norm": 3.830003023147583,
"learning_rate": 9.991044162213135e-06,
"loss": 1.0597,
"step": 1385
},
{
"epoch": 0.34087555336940484,
"grad_norm": 4.119638919830322,
"learning_rate": 9.990995437684266e-06,
"loss": 1.1656,
"step": 1386
},
{
"epoch": 0.3411214953271028,
"grad_norm": 4.3407883644104,
"learning_rate": 9.990946581090491e-06,
"loss": 1.3187,
"step": 1387
},
{
"epoch": 0.3413674372848008,
"grad_norm": 4.506727695465088,
"learning_rate": 9.990897592433098e-06,
"loss": 1.3793,
"step": 1388
},
{
"epoch": 0.3416133792424988,
"grad_norm": 4.188359260559082,
"learning_rate": 9.990848471713384e-06,
"loss": 1.4148,
"step": 1389
},
{
"epoch": 0.34185932120019674,
"grad_norm": 3.830191135406494,
"learning_rate": 9.990799218932652e-06,
"loss": 1.2913,
"step": 1390
},
{
"epoch": 0.34210526315789475,
"grad_norm": 3.9672231674194336,
"learning_rate": 9.990749834092201e-06,
"loss": 1.1917,
"step": 1391
},
{
"epoch": 0.3423512051155927,
"grad_norm": 3.530090093612671,
"learning_rate": 9.99070031719334e-06,
"loss": 1.2481,
"step": 1392
},
{
"epoch": 0.3425971470732907,
"grad_norm": 4.092990875244141,
"learning_rate": 9.990650668237381e-06,
"loss": 1.1772,
"step": 1393
},
{
"epoch": 0.3428430890309887,
"grad_norm": 3.979814291000366,
"learning_rate": 9.990600887225634e-06,
"loss": 1.1096,
"step": 1394
},
{
"epoch": 0.34308903098868665,
"grad_norm": 4.4569902420043945,
"learning_rate": 9.99055097415942e-06,
"loss": 1.3051,
"step": 1395
},
{
"epoch": 0.34333497294638465,
"grad_norm": 4.165904521942139,
"learning_rate": 9.990500929040057e-06,
"loss": 1.3523,
"step": 1396
},
{
"epoch": 0.34358091490408266,
"grad_norm": 4.036123275756836,
"learning_rate": 9.99045075186887e-06,
"loss": 1.3177,
"step": 1397
},
{
"epoch": 0.3438268568617806,
"grad_norm": 3.8997507095336914,
"learning_rate": 9.990400442647185e-06,
"loss": 1.304,
"step": 1398
},
{
"epoch": 0.3440727988194786,
"grad_norm": 4.004807472229004,
"learning_rate": 9.990350001376337e-06,
"loss": 1.2003,
"step": 1399
},
{
"epoch": 0.3443187407771766,
"grad_norm": 4.139293193817139,
"learning_rate": 9.99029942805766e-06,
"loss": 1.2686,
"step": 1400
},
{
"epoch": 0.34456468273487456,
"grad_norm": 3.908148765563965,
"learning_rate": 9.990248722692488e-06,
"loss": 1.2316,
"step": 1401
},
{
"epoch": 0.34481062469257256,
"grad_norm": 3.6300764083862305,
"learning_rate": 9.990197885282167e-06,
"loss": 1.1769,
"step": 1402
},
{
"epoch": 0.3450565666502705,
"grad_norm": 4.244349956512451,
"learning_rate": 9.99014691582804e-06,
"loss": 1.2144,
"step": 1403
},
{
"epoch": 0.3453025086079685,
"grad_norm": 3.7288525104522705,
"learning_rate": 9.990095814331457e-06,
"loss": 1.289,
"step": 1404
},
{
"epoch": 0.3455484505656665,
"grad_norm": 3.9309911727905273,
"learning_rate": 9.99004458079377e-06,
"loss": 1.1843,
"step": 1405
},
{
"epoch": 0.34579439252336447,
"grad_norm": 4.265448570251465,
"learning_rate": 9.989993215216334e-06,
"loss": 1.3982,
"step": 1406
},
{
"epoch": 0.3460403344810625,
"grad_norm": 4.1929144859313965,
"learning_rate": 9.989941717600509e-06,
"loss": 1.3392,
"step": 1407
},
{
"epoch": 0.3462862764387605,
"grad_norm": 3.9810891151428223,
"learning_rate": 9.989890087947656e-06,
"loss": 1.0974,
"step": 1408
},
{
"epoch": 0.3465322183964584,
"grad_norm": 4.067968845367432,
"learning_rate": 9.989838326259142e-06,
"loss": 1.295,
"step": 1409
},
{
"epoch": 0.34677816035415643,
"grad_norm": 3.9688150882720947,
"learning_rate": 9.98978643253634e-06,
"loss": 1.3033,
"step": 1410
},
{
"epoch": 0.3470241023118544,
"grad_norm": 4.112033843994141,
"learning_rate": 9.989734406780619e-06,
"loss": 1.2122,
"step": 1411
},
{
"epoch": 0.3472700442695524,
"grad_norm": 4.487918376922607,
"learning_rate": 9.989682248993354e-06,
"loss": 1.3531,
"step": 1412
},
{
"epoch": 0.3475159862272504,
"grad_norm": 4.664383411407471,
"learning_rate": 9.98962995917593e-06,
"loss": 1.332,
"step": 1413
},
{
"epoch": 0.34776192818494833,
"grad_norm": 3.941998243331909,
"learning_rate": 9.989577537329726e-06,
"loss": 1.2224,
"step": 1414
},
{
"epoch": 0.34800787014264634,
"grad_norm": 4.071390628814697,
"learning_rate": 9.989524983456135e-06,
"loss": 1.1669,
"step": 1415
},
{
"epoch": 0.34825381210034434,
"grad_norm": 3.9829518795013428,
"learning_rate": 9.989472297556543e-06,
"loss": 1.2819,
"step": 1416
},
{
"epoch": 0.3484997540580423,
"grad_norm": 3.9541516304016113,
"learning_rate": 9.989419479632345e-06,
"loss": 1.3491,
"step": 1417
},
{
"epoch": 0.3487456960157403,
"grad_norm": 4.193580627441406,
"learning_rate": 9.989366529684938e-06,
"loss": 1.3572,
"step": 1418
},
{
"epoch": 0.3489916379734383,
"grad_norm": 3.9555578231811523,
"learning_rate": 9.989313447715725e-06,
"loss": 1.2327,
"step": 1419
},
{
"epoch": 0.34923757993113624,
"grad_norm": 4.370056629180908,
"learning_rate": 9.98926023372611e-06,
"loss": 1.2729,
"step": 1420
},
{
"epoch": 0.34948352188883425,
"grad_norm": 4.312520503997803,
"learning_rate": 9.989206887717499e-06,
"loss": 1.1766,
"step": 1421
},
{
"epoch": 0.3497294638465322,
"grad_norm": 4.057276725769043,
"learning_rate": 9.989153409691308e-06,
"loss": 1.3496,
"step": 1422
},
{
"epoch": 0.3499754058042302,
"grad_norm": 4.22401237487793,
"learning_rate": 9.989099799648946e-06,
"loss": 1.2697,
"step": 1423
},
{
"epoch": 0.3502213477619282,
"grad_norm": 3.8266212940216064,
"learning_rate": 9.989046057591838e-06,
"loss": 1.2018,
"step": 1424
},
{
"epoch": 0.35046728971962615,
"grad_norm": 3.8117868900299072,
"learning_rate": 9.988992183521401e-06,
"loss": 1.1941,
"step": 1425
},
{
"epoch": 0.35071323167732416,
"grad_norm": 4.115688800811768,
"learning_rate": 9.988938177439063e-06,
"loss": 1.361,
"step": 1426
},
{
"epoch": 0.35095917363502216,
"grad_norm": 3.957404851913452,
"learning_rate": 9.988884039346252e-06,
"loss": 1.2599,
"step": 1427
},
{
"epoch": 0.3512051155927201,
"grad_norm": 3.8914294242858887,
"learning_rate": 9.988829769244402e-06,
"loss": 1.1466,
"step": 1428
},
{
"epoch": 0.3514510575504181,
"grad_norm": 4.085134506225586,
"learning_rate": 9.988775367134947e-06,
"loss": 1.2611,
"step": 1429
},
{
"epoch": 0.35169699950811606,
"grad_norm": 4.2483110427856445,
"learning_rate": 9.988720833019327e-06,
"loss": 1.3318,
"step": 1430
},
{
"epoch": 0.35194294146581406,
"grad_norm": 3.9612669944763184,
"learning_rate": 9.988666166898986e-06,
"loss": 1.2062,
"step": 1431
},
{
"epoch": 0.35218888342351207,
"grad_norm": 4.222630023956299,
"learning_rate": 9.98861136877537e-06,
"loss": 1.1564,
"step": 1432
},
{
"epoch": 0.35243482538121,
"grad_norm": 3.9825825691223145,
"learning_rate": 9.98855643864993e-06,
"loss": 1.2569,
"step": 1433
},
{
"epoch": 0.352680767338908,
"grad_norm": 4.486030578613281,
"learning_rate": 9.988501376524118e-06,
"loss": 1.2121,
"step": 1434
},
{
"epoch": 0.352926709296606,
"grad_norm": 4.222902774810791,
"learning_rate": 9.98844618239939e-06,
"loss": 1.2689,
"step": 1435
},
{
"epoch": 0.35317265125430397,
"grad_norm": 3.9978694915771484,
"learning_rate": 9.98839085627721e-06,
"loss": 1.184,
"step": 1436
},
{
"epoch": 0.353418593212002,
"grad_norm": 4.294037818908691,
"learning_rate": 9.98833539815904e-06,
"loss": 1.328,
"step": 1437
},
{
"epoch": 0.3536645351697,
"grad_norm": 4.356046676635742,
"learning_rate": 9.988279808046345e-06,
"loss": 1.3132,
"step": 1438
},
{
"epoch": 0.3539104771273979,
"grad_norm": 3.762099027633667,
"learning_rate": 9.988224085940599e-06,
"loss": 1.2757,
"step": 1439
},
{
"epoch": 0.35415641908509593,
"grad_norm": 4.602606296539307,
"learning_rate": 9.988168231843278e-06,
"loss": 1.3686,
"step": 1440
},
{
"epoch": 0.3544023610427939,
"grad_norm": 4.1296162605285645,
"learning_rate": 9.988112245755857e-06,
"loss": 1.2975,
"step": 1441
},
{
"epoch": 0.3546483030004919,
"grad_norm": 3.6154232025146484,
"learning_rate": 9.988056127679817e-06,
"loss": 1.1448,
"step": 1442
},
{
"epoch": 0.3548942449581899,
"grad_norm": 3.7397797107696533,
"learning_rate": 9.987999877616643e-06,
"loss": 1.1536,
"step": 1443
},
{
"epoch": 0.35514018691588783,
"grad_norm": 3.9455783367156982,
"learning_rate": 9.987943495567826e-06,
"loss": 1.1811,
"step": 1444
},
{
"epoch": 0.35538612887358584,
"grad_norm": 4.5360565185546875,
"learning_rate": 9.987886981534857e-06,
"loss": 1.3472,
"step": 1445
},
{
"epoch": 0.35563207083128384,
"grad_norm": 4.036190986633301,
"learning_rate": 9.98783033551923e-06,
"loss": 1.3,
"step": 1446
},
{
"epoch": 0.3558780127889818,
"grad_norm": 3.918630838394165,
"learning_rate": 9.987773557522445e-06,
"loss": 1.4098,
"step": 1447
},
{
"epoch": 0.3561239547466798,
"grad_norm": 4.069088935852051,
"learning_rate": 9.987716647546003e-06,
"loss": 1.2403,
"step": 1448
},
{
"epoch": 0.35636989670437774,
"grad_norm": 4.590051651000977,
"learning_rate": 9.987659605591412e-06,
"loss": 1.3342,
"step": 1449
},
{
"epoch": 0.35661583866207575,
"grad_norm": 4.563328266143799,
"learning_rate": 9.98760243166018e-06,
"loss": 1.3754,
"step": 1450
},
{
"epoch": 0.35686178061977375,
"grad_norm": 4.330633163452148,
"learning_rate": 9.987545125753818e-06,
"loss": 1.2659,
"step": 1451
},
{
"epoch": 0.3571077225774717,
"grad_norm": 4.507234573364258,
"learning_rate": 9.987487687873849e-06,
"loss": 1.4255,
"step": 1452
},
{
"epoch": 0.3573536645351697,
"grad_norm": 4.323271751403809,
"learning_rate": 9.987430118021785e-06,
"loss": 1.2991,
"step": 1453
},
{
"epoch": 0.3575996064928677,
"grad_norm": 4.373091220855713,
"learning_rate": 9.987372416199153e-06,
"loss": 1.1932,
"step": 1454
},
{
"epoch": 0.35784554845056565,
"grad_norm": 4.7837443351745605,
"learning_rate": 9.987314582407481e-06,
"loss": 1.2806,
"step": 1455
},
{
"epoch": 0.35809149040826366,
"grad_norm": 4.160711288452148,
"learning_rate": 9.987256616648296e-06,
"loss": 1.2329,
"step": 1456
},
{
"epoch": 0.3583374323659616,
"grad_norm": 4.165082931518555,
"learning_rate": 9.987198518923135e-06,
"loss": 1.2474,
"step": 1457
},
{
"epoch": 0.3585833743236596,
"grad_norm": 4.348196983337402,
"learning_rate": 9.987140289233534e-06,
"loss": 1.2165,
"step": 1458
},
{
"epoch": 0.3588293162813576,
"grad_norm": 4.242162227630615,
"learning_rate": 9.987081927581033e-06,
"loss": 1.3162,
"step": 1459
},
{
"epoch": 0.35907525823905556,
"grad_norm": 4.277111530303955,
"learning_rate": 9.987023433967177e-06,
"loss": 1.1973,
"step": 1460
},
{
"epoch": 0.35932120019675357,
"grad_norm": 3.922065258026123,
"learning_rate": 9.986964808393513e-06,
"loss": 1.2169,
"step": 1461
},
{
"epoch": 0.35956714215445157,
"grad_norm": 4.044286251068115,
"learning_rate": 9.986906050861595e-06,
"loss": 1.185,
"step": 1462
},
{
"epoch": 0.3598130841121495,
"grad_norm": 4.111523628234863,
"learning_rate": 9.986847161372974e-06,
"loss": 1.2983,
"step": 1463
},
{
"epoch": 0.3600590260698475,
"grad_norm": 3.9976391792297363,
"learning_rate": 9.98678813992921e-06,
"loss": 1.2913,
"step": 1464
},
{
"epoch": 0.3603049680275455,
"grad_norm": 4.336568832397461,
"learning_rate": 9.986728986531866e-06,
"loss": 1.2796,
"step": 1465
},
{
"epoch": 0.3605509099852435,
"grad_norm": 4.191849708557129,
"learning_rate": 9.986669701182506e-06,
"loss": 1.3361,
"step": 1466
},
{
"epoch": 0.3607968519429415,
"grad_norm": 4.025372505187988,
"learning_rate": 9.986610283882698e-06,
"loss": 1.2911,
"step": 1467
},
{
"epoch": 0.3610427939006394,
"grad_norm": 3.8140487670898438,
"learning_rate": 9.986550734634015e-06,
"loss": 1.2391,
"step": 1468
},
{
"epoch": 0.36128873585833743,
"grad_norm": 4.178420066833496,
"learning_rate": 9.986491053438035e-06,
"loss": 1.2424,
"step": 1469
},
{
"epoch": 0.36153467781603543,
"grad_norm": 4.482036590576172,
"learning_rate": 9.986431240296333e-06,
"loss": 1.3819,
"step": 1470
},
{
"epoch": 0.3617806197737334,
"grad_norm": 3.8501226902008057,
"learning_rate": 9.986371295210493e-06,
"loss": 1.3861,
"step": 1471
},
{
"epoch": 0.3620265617314314,
"grad_norm": 4.404519557952881,
"learning_rate": 9.986311218182102e-06,
"loss": 1.1572,
"step": 1472
},
{
"epoch": 0.3622725036891294,
"grad_norm": 4.15662956237793,
"learning_rate": 9.98625100921275e-06,
"loss": 1.3082,
"step": 1473
},
{
"epoch": 0.36251844564682734,
"grad_norm": 4.128908157348633,
"learning_rate": 9.98619066830403e-06,
"loss": 1.097,
"step": 1474
},
{
"epoch": 0.36276438760452534,
"grad_norm": 3.9525139331817627,
"learning_rate": 9.986130195457538e-06,
"loss": 1.2539,
"step": 1475
},
{
"epoch": 0.3630103295622233,
"grad_norm": 4.446487903594971,
"learning_rate": 9.986069590674874e-06,
"loss": 1.1668,
"step": 1476
},
{
"epoch": 0.3632562715199213,
"grad_norm": 4.3018388748168945,
"learning_rate": 9.986008853957642e-06,
"loss": 1.2833,
"step": 1477
},
{
"epoch": 0.3635022134776193,
"grad_norm": 4.034856796264648,
"learning_rate": 9.98594798530745e-06,
"loss": 1.2943,
"step": 1478
},
{
"epoch": 0.36374815543531724,
"grad_norm": 4.305031776428223,
"learning_rate": 9.985886984725907e-06,
"loss": 1.2563,
"step": 1479
},
{
"epoch": 0.36399409739301525,
"grad_norm": 3.9727346897125244,
"learning_rate": 9.985825852214627e-06,
"loss": 1.1879,
"step": 1480
},
{
"epoch": 0.36424003935071325,
"grad_norm": 4.060462951660156,
"learning_rate": 9.98576458777523e-06,
"loss": 1.3281,
"step": 1481
},
{
"epoch": 0.3644859813084112,
"grad_norm": 3.846977949142456,
"learning_rate": 9.985703191409336e-06,
"loss": 1.3991,
"step": 1482
},
{
"epoch": 0.3647319232661092,
"grad_norm": 3.9522056579589844,
"learning_rate": 9.985641663118567e-06,
"loss": 1.0881,
"step": 1483
},
{
"epoch": 0.3649778652238072,
"grad_norm": 4.003945827484131,
"learning_rate": 9.985580002904557e-06,
"loss": 1.348,
"step": 1484
},
{
"epoch": 0.36522380718150516,
"grad_norm": 4.316896438598633,
"learning_rate": 9.985518210768932e-06,
"loss": 1.2625,
"step": 1485
},
{
"epoch": 0.36546974913920316,
"grad_norm": 4.064728736877441,
"learning_rate": 9.985456286713328e-06,
"loss": 1.3767,
"step": 1486
},
{
"epoch": 0.3657156910969011,
"grad_norm": 4.102534294128418,
"learning_rate": 9.985394230739388e-06,
"loss": 1.253,
"step": 1487
},
{
"epoch": 0.3659616330545991,
"grad_norm": 3.8190536499023438,
"learning_rate": 9.985332042848746e-06,
"loss": 1.2269,
"step": 1488
},
{
"epoch": 0.3662075750122971,
"grad_norm": 4.212854862213135,
"learning_rate": 9.985269723043054e-06,
"loss": 1.2506,
"step": 1489
},
{
"epoch": 0.36645351696999506,
"grad_norm": 3.9962356090545654,
"learning_rate": 9.985207271323958e-06,
"loss": 1.3721,
"step": 1490
},
{
"epoch": 0.36669945892769307,
"grad_norm": 4.390626430511475,
"learning_rate": 9.985144687693113e-06,
"loss": 1.3612,
"step": 1491
},
{
"epoch": 0.36694540088539107,
"grad_norm": 3.9011051654815674,
"learning_rate": 9.985081972152173e-06,
"loss": 1.1719,
"step": 1492
},
{
"epoch": 0.367191342843089,
"grad_norm": 3.9591224193573,
"learning_rate": 9.9850191247028e-06,
"loss": 1.2216,
"step": 1493
},
{
"epoch": 0.367437284800787,
"grad_norm": 4.051086902618408,
"learning_rate": 9.984956145346652e-06,
"loss": 1.3129,
"step": 1494
},
{
"epoch": 0.36768322675848497,
"grad_norm": 4.458414554595947,
"learning_rate": 9.984893034085401e-06,
"loss": 1.3682,
"step": 1495
},
{
"epoch": 0.367929168716183,
"grad_norm": 4.196484565734863,
"learning_rate": 9.984829790920713e-06,
"loss": 1.2419,
"step": 1496
},
{
"epoch": 0.368175110673881,
"grad_norm": 4.341845512390137,
"learning_rate": 9.984766415854263e-06,
"loss": 1.3705,
"step": 1497
},
{
"epoch": 0.3684210526315789,
"grad_norm": 3.8931732177734375,
"learning_rate": 9.98470290888773e-06,
"loss": 1.3034,
"step": 1498
},
{
"epoch": 0.36866699458927693,
"grad_norm": 3.637355089187622,
"learning_rate": 9.984639270022788e-06,
"loss": 1.2024,
"step": 1499
},
{
"epoch": 0.36891293654697493,
"grad_norm": 3.9868478775024414,
"learning_rate": 9.98457549926113e-06,
"loss": 1.2776,
"step": 1500
},
{
"epoch": 0.36891293654697493,
"eval_loss": 1.2976675033569336,
"eval_runtime": 13.6577,
"eval_samples_per_second": 29.288,
"eval_steps_per_second": 3.661,
"step": 1500
},
{
"epoch": 0.3691588785046729,
"grad_norm": 4.35200309753418,
"learning_rate": 9.984511596604435e-06,
"loss": 1.3627,
"step": 1501
},
{
"epoch": 0.3694048204623709,
"grad_norm": 4.082915306091309,
"learning_rate": 9.9844475620544e-06,
"loss": 1.2828,
"step": 1502
},
{
"epoch": 0.3696507624200689,
"grad_norm": 4.266556739807129,
"learning_rate": 9.984383395612717e-06,
"loss": 1.2074,
"step": 1503
},
{
"epoch": 0.36989670437776684,
"grad_norm": 4.3575825691223145,
"learning_rate": 9.984319097281084e-06,
"loss": 1.2224,
"step": 1504
},
{
"epoch": 0.37014264633546484,
"grad_norm": 4.3033905029296875,
"learning_rate": 9.9842546670612e-06,
"loss": 1.2594,
"step": 1505
},
{
"epoch": 0.3703885882931628,
"grad_norm": 4.1002583503723145,
"learning_rate": 9.984190104954774e-06,
"loss": 1.2364,
"step": 1506
},
{
"epoch": 0.3706345302508608,
"grad_norm": 3.9739556312561035,
"learning_rate": 9.984125410963513e-06,
"loss": 1.1735,
"step": 1507
},
{
"epoch": 0.3708804722085588,
"grad_norm": 4.27018928527832,
"learning_rate": 9.984060585089125e-06,
"loss": 1.2086,
"step": 1508
},
{
"epoch": 0.37112641416625675,
"grad_norm": 4.241909503936768,
"learning_rate": 9.983995627333332e-06,
"loss": 1.2917,
"step": 1509
},
{
"epoch": 0.37137235612395475,
"grad_norm": 3.7687177658081055,
"learning_rate": 9.983930537697848e-06,
"loss": 1.2101,
"step": 1510
},
{
"epoch": 0.37161829808165275,
"grad_norm": 3.850660562515259,
"learning_rate": 9.983865316184398e-06,
"loss": 1.2013,
"step": 1511
},
{
"epoch": 0.3718642400393507,
"grad_norm": 3.9074294567108154,
"learning_rate": 9.983799962794705e-06,
"loss": 1.1792,
"step": 1512
},
{
"epoch": 0.3721101819970487,
"grad_norm": 3.99351167678833,
"learning_rate": 9.9837344775305e-06,
"loss": 1.1244,
"step": 1513
},
{
"epoch": 0.37235612395474665,
"grad_norm": 4.333915710449219,
"learning_rate": 9.983668860393518e-06,
"loss": 1.341,
"step": 1514
},
{
"epoch": 0.37260206591244466,
"grad_norm": 4.06878137588501,
"learning_rate": 9.98360311138549e-06,
"loss": 1.3473,
"step": 1515
},
{
"epoch": 0.37284800787014266,
"grad_norm": 3.9610202312469482,
"learning_rate": 9.98353723050816e-06,
"loss": 1.2099,
"step": 1516
},
{
"epoch": 0.3730939498278406,
"grad_norm": 4.32100772857666,
"learning_rate": 9.98347121776327e-06,
"loss": 1.317,
"step": 1517
},
{
"epoch": 0.3733398917855386,
"grad_norm": 4.548961639404297,
"learning_rate": 9.983405073152566e-06,
"loss": 1.2435,
"step": 1518
},
{
"epoch": 0.3735858337432366,
"grad_norm": 4.236275672912598,
"learning_rate": 9.9833387966778e-06,
"loss": 1.2717,
"step": 1519
},
{
"epoch": 0.37383177570093457,
"grad_norm": 4.046571254730225,
"learning_rate": 9.983272388340723e-06,
"loss": 1.1803,
"step": 1520
},
{
"epoch": 0.37407771765863257,
"grad_norm": 4.154245853424072,
"learning_rate": 9.983205848143096e-06,
"loss": 1.2193,
"step": 1521
},
{
"epoch": 0.3743236596163306,
"grad_norm": 3.8809096813201904,
"learning_rate": 9.983139176086675e-06,
"loss": 1.2655,
"step": 1522
},
{
"epoch": 0.3745696015740285,
"grad_norm": 4.3549323081970215,
"learning_rate": 9.983072372173229e-06,
"loss": 1.2357,
"step": 1523
},
{
"epoch": 0.3748155435317265,
"grad_norm": 4.023801803588867,
"learning_rate": 9.983005436404523e-06,
"loss": 1.179,
"step": 1524
},
{
"epoch": 0.3750614854894245,
"grad_norm": 4.143782138824463,
"learning_rate": 9.982938368782327e-06,
"loss": 1.2997,
"step": 1525
},
{
"epoch": 0.3753074274471225,
"grad_norm": 4.26095724105835,
"learning_rate": 9.98287116930842e-06,
"loss": 1.3146,
"step": 1526
},
{
"epoch": 0.3755533694048205,
"grad_norm": 4.113142013549805,
"learning_rate": 9.982803837984574e-06,
"loss": 1.203,
"step": 1527
},
{
"epoch": 0.37579931136251843,
"grad_norm": 4.285439491271973,
"learning_rate": 9.982736374812576e-06,
"loss": 1.1661,
"step": 1528
},
{
"epoch": 0.37604525332021643,
"grad_norm": 4.054412841796875,
"learning_rate": 9.982668779794209e-06,
"loss": 1.2045,
"step": 1529
},
{
"epoch": 0.37629119527791444,
"grad_norm": 4.231668949127197,
"learning_rate": 9.982601052931261e-06,
"loss": 1.1384,
"step": 1530
},
{
"epoch": 0.3765371372356124,
"grad_norm": 3.8437461853027344,
"learning_rate": 9.982533194225526e-06,
"loss": 1.0949,
"step": 1531
},
{
"epoch": 0.3767830791933104,
"grad_norm": 4.224009037017822,
"learning_rate": 9.982465203678797e-06,
"loss": 1.2172,
"step": 1532
},
{
"epoch": 0.37702902115100834,
"grad_norm": 3.6231439113616943,
"learning_rate": 9.982397081292874e-06,
"loss": 1.1255,
"step": 1533
},
{
"epoch": 0.37727496310870634,
"grad_norm": 4.2984514236450195,
"learning_rate": 9.98232882706956e-06,
"loss": 1.303,
"step": 1534
},
{
"epoch": 0.37752090506640434,
"grad_norm": 4.250051498413086,
"learning_rate": 9.982260441010663e-06,
"loss": 1.2685,
"step": 1535
},
{
"epoch": 0.3777668470241023,
"grad_norm": 3.6218719482421875,
"learning_rate": 9.98219192311799e-06,
"loss": 1.2228,
"step": 1536
},
{
"epoch": 0.3780127889818003,
"grad_norm": 4.175124645233154,
"learning_rate": 9.982123273393353e-06,
"loss": 1.4173,
"step": 1537
},
{
"epoch": 0.3782587309394983,
"grad_norm": 4.646365642547607,
"learning_rate": 9.982054491838573e-06,
"loss": 1.2069,
"step": 1538
},
{
"epoch": 0.37850467289719625,
"grad_norm": 3.9215598106384277,
"learning_rate": 9.981985578455465e-06,
"loss": 1.1479,
"step": 1539
},
{
"epoch": 0.37875061485489425,
"grad_norm": 3.5920276641845703,
"learning_rate": 9.981916533245855e-06,
"loss": 1.1109,
"step": 1540
},
{
"epoch": 0.37899655681259226,
"grad_norm": 3.986285924911499,
"learning_rate": 9.98184735621157e-06,
"loss": 1.2359,
"step": 1541
},
{
"epoch": 0.3792424987702902,
"grad_norm": 3.790491819381714,
"learning_rate": 9.98177804735444e-06,
"loss": 1.159,
"step": 1542
},
{
"epoch": 0.3794884407279882,
"grad_norm": 4.081528663635254,
"learning_rate": 9.9817086066763e-06,
"loss": 1.2184,
"step": 1543
},
{
"epoch": 0.37973438268568616,
"grad_norm": 4.190249443054199,
"learning_rate": 9.981639034178985e-06,
"loss": 1.2673,
"step": 1544
},
{
"epoch": 0.37998032464338416,
"grad_norm": 4.1351776123046875,
"learning_rate": 9.981569329864336e-06,
"loss": 1.3421,
"step": 1545
},
{
"epoch": 0.38022626660108216,
"grad_norm": 4.092532634735107,
"learning_rate": 9.981499493734202e-06,
"loss": 1.2439,
"step": 1546
},
{
"epoch": 0.3804722085587801,
"grad_norm": 4.176187515258789,
"learning_rate": 9.981429525790427e-06,
"loss": 1.2207,
"step": 1547
},
{
"epoch": 0.3807181505164781,
"grad_norm": 4.371399879455566,
"learning_rate": 9.981359426034862e-06,
"loss": 1.3045,
"step": 1548
},
{
"epoch": 0.3809640924741761,
"grad_norm": 4.316927909851074,
"learning_rate": 9.981289194469363e-06,
"loss": 1.1829,
"step": 1549
},
{
"epoch": 0.38121003443187407,
"grad_norm": 4.063778877258301,
"learning_rate": 9.98121883109579e-06,
"loss": 1.2999,
"step": 1550
},
{
"epoch": 0.38145597638957207,
"grad_norm": 3.680040121078491,
"learning_rate": 9.981148335916e-06,
"loss": 1.122,
"step": 1551
},
{
"epoch": 0.38170191834727,
"grad_norm": 4.555853843688965,
"learning_rate": 9.981077708931866e-06,
"loss": 1.3889,
"step": 1552
},
{
"epoch": 0.381947860304968,
"grad_norm": 3.9836840629577637,
"learning_rate": 9.981006950145249e-06,
"loss": 1.3153,
"step": 1553
},
{
"epoch": 0.382193802262666,
"grad_norm": 3.9169604778289795,
"learning_rate": 9.980936059558027e-06,
"loss": 1.2363,
"step": 1554
},
{
"epoch": 0.382439744220364,
"grad_norm": 4.04386043548584,
"learning_rate": 9.980865037172072e-06,
"loss": 1.2237,
"step": 1555
},
{
"epoch": 0.382685686178062,
"grad_norm": 4.5220794677734375,
"learning_rate": 9.980793882989266e-06,
"loss": 1.4363,
"step": 1556
},
{
"epoch": 0.38293162813576,
"grad_norm": 3.977593183517456,
"learning_rate": 9.980722597011489e-06,
"loss": 1.1918,
"step": 1557
},
{
"epoch": 0.38317757009345793,
"grad_norm": 3.889998197555542,
"learning_rate": 9.98065117924063e-06,
"loss": 1.3197,
"step": 1558
},
{
"epoch": 0.38342351205115593,
"grad_norm": 4.14206600189209,
"learning_rate": 9.980579629678579e-06,
"loss": 1.2914,
"step": 1559
},
{
"epoch": 0.3836694540088539,
"grad_norm": 3.9528064727783203,
"learning_rate": 9.980507948327227e-06,
"loss": 1.2815,
"step": 1560
},
{
"epoch": 0.3839153959665519,
"grad_norm": 3.698195219039917,
"learning_rate": 9.980436135188471e-06,
"loss": 1.1452,
"step": 1561
},
{
"epoch": 0.3841613379242499,
"grad_norm": 3.961284875869751,
"learning_rate": 9.980364190264212e-06,
"loss": 1.2717,
"step": 1562
},
{
"epoch": 0.38440727988194784,
"grad_norm": 3.708042860031128,
"learning_rate": 9.980292113556354e-06,
"loss": 1.2592,
"step": 1563
},
{
"epoch": 0.38465322183964584,
"grad_norm": 4.1415510177612305,
"learning_rate": 9.980219905066803e-06,
"loss": 1.2057,
"step": 1564
},
{
"epoch": 0.38489916379734385,
"grad_norm": 4.225846767425537,
"learning_rate": 9.980147564797471e-06,
"loss": 1.2599,
"step": 1565
},
{
"epoch": 0.3851451057550418,
"grad_norm": 4.047697067260742,
"learning_rate": 9.980075092750272e-06,
"loss": 1.2176,
"step": 1566
},
{
"epoch": 0.3853910477127398,
"grad_norm": 4.00889778137207,
"learning_rate": 9.980002488927122e-06,
"loss": 1.2771,
"step": 1567
},
{
"epoch": 0.3856369896704378,
"grad_norm": 4.171459674835205,
"learning_rate": 9.979929753329946e-06,
"loss": 1.3716,
"step": 1568
},
{
"epoch": 0.38588293162813575,
"grad_norm": 3.8912601470947266,
"learning_rate": 9.979856885960664e-06,
"loss": 1.1564,
"step": 1569
},
{
"epoch": 0.38612887358583375,
"grad_norm": 4.820818901062012,
"learning_rate": 9.979783886821206e-06,
"loss": 1.3085,
"step": 1570
},
{
"epoch": 0.3863748155435317,
"grad_norm": 4.262513160705566,
"learning_rate": 9.979710755913506e-06,
"loss": 1.3026,
"step": 1571
},
{
"epoch": 0.3866207575012297,
"grad_norm": 4.084031581878662,
"learning_rate": 9.979637493239494e-06,
"loss": 1.1659,
"step": 1572
},
{
"epoch": 0.3868666994589277,
"grad_norm": 4.470019340515137,
"learning_rate": 9.979564098801113e-06,
"loss": 1.3184,
"step": 1573
},
{
"epoch": 0.38711264141662566,
"grad_norm": 4.055445194244385,
"learning_rate": 9.979490572600304e-06,
"loss": 1.3138,
"step": 1574
},
{
"epoch": 0.38735858337432366,
"grad_norm": 3.9399232864379883,
"learning_rate": 9.97941691463901e-06,
"loss": 1.1557,
"step": 1575
},
{
"epoch": 0.38760452533202167,
"grad_norm": 4.163764953613281,
"learning_rate": 9.979343124919185e-06,
"loss": 1.2881,
"step": 1576
},
{
"epoch": 0.3878504672897196,
"grad_norm": 4.482650279998779,
"learning_rate": 9.979269203442777e-06,
"loss": 1.2689,
"step": 1577
},
{
"epoch": 0.3880964092474176,
"grad_norm": 3.759821891784668,
"learning_rate": 9.979195150211744e-06,
"loss": 1.321,
"step": 1578
},
{
"epoch": 0.38834235120511557,
"grad_norm": 4.495752811431885,
"learning_rate": 9.979120965228045e-06,
"loss": 1.2822,
"step": 1579
},
{
"epoch": 0.38858829316281357,
"grad_norm": 3.283578634262085,
"learning_rate": 9.979046648493643e-06,
"loss": 1.0706,
"step": 1580
},
{
"epoch": 0.3888342351205116,
"grad_norm": 4.0819525718688965,
"learning_rate": 9.978972200010505e-06,
"loss": 1.2195,
"step": 1581
},
{
"epoch": 0.3890801770782095,
"grad_norm": 4.1504974365234375,
"learning_rate": 9.978897619780601e-06,
"loss": 1.3189,
"step": 1582
},
{
"epoch": 0.3893261190359075,
"grad_norm": 4.105747222900391,
"learning_rate": 9.978822907805903e-06,
"loss": 1.3842,
"step": 1583
},
{
"epoch": 0.38957206099360553,
"grad_norm": 4.592813968658447,
"learning_rate": 9.978748064088388e-06,
"loss": 1.3061,
"step": 1584
},
{
"epoch": 0.3898180029513035,
"grad_norm": 4.330500602722168,
"learning_rate": 9.97867308863004e-06,
"loss": 1.3535,
"step": 1585
},
{
"epoch": 0.3900639449090015,
"grad_norm": 4.304264545440674,
"learning_rate": 9.978597981432839e-06,
"loss": 1.3059,
"step": 1586
},
{
"epoch": 0.3903098868666995,
"grad_norm": 3.905421018600464,
"learning_rate": 9.978522742498773e-06,
"loss": 1.3075,
"step": 1587
},
{
"epoch": 0.39055582882439743,
"grad_norm": 3.6789608001708984,
"learning_rate": 9.978447371829833e-06,
"loss": 1.2751,
"step": 1588
},
{
"epoch": 0.39080177078209544,
"grad_norm": 3.802004337310791,
"learning_rate": 9.978371869428014e-06,
"loss": 1.1751,
"step": 1589
},
{
"epoch": 0.3910477127397934,
"grad_norm": 3.72989821434021,
"learning_rate": 9.978296235295315e-06,
"loss": 1.2703,
"step": 1590
},
{
"epoch": 0.3912936546974914,
"grad_norm": 4.063558578491211,
"learning_rate": 9.978220469433735e-06,
"loss": 1.3448,
"step": 1591
},
{
"epoch": 0.3915395966551894,
"grad_norm": 3.3889520168304443,
"learning_rate": 9.97814457184528e-06,
"loss": 1.0255,
"step": 1592
},
{
"epoch": 0.39178553861288734,
"grad_norm": 3.9781992435455322,
"learning_rate": 9.97806854253196e-06,
"loss": 1.2643,
"step": 1593
},
{
"epoch": 0.39203148057058534,
"grad_norm": 3.7891182899475098,
"learning_rate": 9.977992381495782e-06,
"loss": 1.2969,
"step": 1594
},
{
"epoch": 0.39227742252828335,
"grad_norm": 3.7461061477661133,
"learning_rate": 9.977916088738767e-06,
"loss": 1.3367,
"step": 1595
},
{
"epoch": 0.3925233644859813,
"grad_norm": 3.6220154762268066,
"learning_rate": 9.977839664262927e-06,
"loss": 1.1453,
"step": 1596
},
{
"epoch": 0.3927693064436793,
"grad_norm": 4.246983528137207,
"learning_rate": 9.977763108070291e-06,
"loss": 1.2357,
"step": 1597
},
{
"epoch": 0.39301524840137725,
"grad_norm": 4.122408866882324,
"learning_rate": 9.977686420162882e-06,
"loss": 1.2196,
"step": 1598
},
{
"epoch": 0.39326119035907525,
"grad_norm": 4.504385948181152,
"learning_rate": 9.977609600542728e-06,
"loss": 1.2949,
"step": 1599
},
{
"epoch": 0.39350713231677326,
"grad_norm": 4.40198278427124,
"learning_rate": 9.977532649211863e-06,
"loss": 1.5168,
"step": 1600
},
{
"epoch": 0.3937530742744712,
"grad_norm": 3.9915614128112793,
"learning_rate": 9.977455566172323e-06,
"loss": 1.1664,
"step": 1601
},
{
"epoch": 0.3939990162321692,
"grad_norm": 4.282832622528076,
"learning_rate": 9.977378351426149e-06,
"loss": 1.2285,
"step": 1602
},
{
"epoch": 0.3942449581898672,
"grad_norm": 4.277071475982666,
"learning_rate": 9.97730100497538e-06,
"loss": 1.296,
"step": 1603
},
{
"epoch": 0.39449090014756516,
"grad_norm": 4.401856422424316,
"learning_rate": 9.977223526822067e-06,
"loss": 1.3332,
"step": 1604
},
{
"epoch": 0.39473684210526316,
"grad_norm": 4.427542209625244,
"learning_rate": 9.977145916968258e-06,
"loss": 1.2521,
"step": 1605
},
{
"epoch": 0.39498278406296117,
"grad_norm": 4.013346195220947,
"learning_rate": 9.977068175416008e-06,
"loss": 1.1988,
"step": 1606
},
{
"epoch": 0.3952287260206591,
"grad_norm": 3.583876132965088,
"learning_rate": 9.976990302167373e-06,
"loss": 1.1883,
"step": 1607
},
{
"epoch": 0.3954746679783571,
"grad_norm": 3.758452892303467,
"learning_rate": 9.976912297224414e-06,
"loss": 1.0984,
"step": 1608
},
{
"epoch": 0.39572060993605507,
"grad_norm": 3.9342401027679443,
"learning_rate": 9.976834160589195e-06,
"loss": 1.3041,
"step": 1609
},
{
"epoch": 0.39596655189375307,
"grad_norm": 3.9683194160461426,
"learning_rate": 9.976755892263783e-06,
"loss": 1.2984,
"step": 1610
},
{
"epoch": 0.3962124938514511,
"grad_norm": 3.706735849380493,
"learning_rate": 9.976677492250249e-06,
"loss": 1.1704,
"step": 1611
},
{
"epoch": 0.396458435809149,
"grad_norm": 3.6997830867767334,
"learning_rate": 9.976598960550668e-06,
"loss": 1.2594,
"step": 1612
},
{
"epoch": 0.396704377766847,
"grad_norm": 4.189054489135742,
"learning_rate": 9.976520297167118e-06,
"loss": 1.1895,
"step": 1613
},
{
"epoch": 0.39695031972454503,
"grad_norm": 4.212181568145752,
"learning_rate": 9.976441502101682e-06,
"loss": 1.2292,
"step": 1614
},
{
"epoch": 0.397196261682243,
"grad_norm": 3.9526596069335938,
"learning_rate": 9.976362575356443e-06,
"loss": 1.331,
"step": 1615
},
{
"epoch": 0.397442203639941,
"grad_norm": 3.849104404449463,
"learning_rate": 9.976283516933487e-06,
"loss": 1.1768,
"step": 1616
},
{
"epoch": 0.39768814559763893,
"grad_norm": 4.082289218902588,
"learning_rate": 9.976204326834914e-06,
"loss": 1.1543,
"step": 1617
},
{
"epoch": 0.39793408755533693,
"grad_norm": 4.133740425109863,
"learning_rate": 9.976125005062808e-06,
"loss": 1.2407,
"step": 1618
},
{
"epoch": 0.39818002951303494,
"grad_norm": 3.741819143295288,
"learning_rate": 9.976045551619279e-06,
"loss": 1.1622,
"step": 1619
},
{
"epoch": 0.3984259714707329,
"grad_norm": 4.243188381195068,
"learning_rate": 9.975965966506423e-06,
"loss": 1.3984,
"step": 1620
},
{
"epoch": 0.3986719134284309,
"grad_norm": 4.501266956329346,
"learning_rate": 9.975886249726347e-06,
"loss": 1.4878,
"step": 1621
},
{
"epoch": 0.3989178553861289,
"grad_norm": 4.010527610778809,
"learning_rate": 9.97580640128116e-06,
"loss": 1.2852,
"step": 1622
},
{
"epoch": 0.39916379734382684,
"grad_norm": 4.022928714752197,
"learning_rate": 9.975726421172977e-06,
"loss": 1.3947,
"step": 1623
},
{
"epoch": 0.39940973930152485,
"grad_norm": 4.280185222625732,
"learning_rate": 9.975646309403913e-06,
"loss": 1.1953,
"step": 1624
},
{
"epoch": 0.39965568125922285,
"grad_norm": 3.8702661991119385,
"learning_rate": 9.975566065976088e-06,
"loss": 1.133,
"step": 1625
},
{
"epoch": 0.3999016232169208,
"grad_norm": 4.425428867340088,
"learning_rate": 9.975485690891624e-06,
"loss": 1.2547,
"step": 1626
},
{
"epoch": 0.4001475651746188,
"grad_norm": 3.851905107498169,
"learning_rate": 9.975405184152648e-06,
"loss": 1.349,
"step": 1627
},
{
"epoch": 0.40039350713231675,
"grad_norm": 4.119965553283691,
"learning_rate": 9.975324545761292e-06,
"loss": 1.2482,
"step": 1628
},
{
"epoch": 0.40063944909001475,
"grad_norm": 4.167501449584961,
"learning_rate": 9.97524377571969e-06,
"loss": 1.4657,
"step": 1629
},
{
"epoch": 0.40088539104771276,
"grad_norm": 4.113773345947266,
"learning_rate": 9.975162874029976e-06,
"loss": 1.2974,
"step": 1630
},
{
"epoch": 0.4011313330054107,
"grad_norm": 3.97904109954834,
"learning_rate": 9.975081840694293e-06,
"loss": 1.2102,
"step": 1631
},
{
"epoch": 0.4013772749631087,
"grad_norm": 4.283999919891357,
"learning_rate": 9.975000675714786e-06,
"loss": 1.3546,
"step": 1632
},
{
"epoch": 0.4016232169208067,
"grad_norm": 4.008006572723389,
"learning_rate": 9.974919379093602e-06,
"loss": 1.1051,
"step": 1633
},
{
"epoch": 0.40186915887850466,
"grad_norm": 4.084774017333984,
"learning_rate": 9.97483795083289e-06,
"loss": 1.2287,
"step": 1634
},
{
"epoch": 0.40211510083620267,
"grad_norm": 3.944636821746826,
"learning_rate": 9.974756390934807e-06,
"loss": 1.2388,
"step": 1635
},
{
"epoch": 0.4023610427939006,
"grad_norm": 4.034182548522949,
"learning_rate": 9.974674699401512e-06,
"loss": 1.3014,
"step": 1636
},
{
"epoch": 0.4026069847515986,
"grad_norm": 4.437850475311279,
"learning_rate": 9.974592876235163e-06,
"loss": 1.3842,
"step": 1637
},
{
"epoch": 0.4028529267092966,
"grad_norm": 4.000709533691406,
"learning_rate": 9.974510921437929e-06,
"loss": 1.2543,
"step": 1638
},
{
"epoch": 0.40309886866699457,
"grad_norm": 4.223419189453125,
"learning_rate": 9.974428835011975e-06,
"loss": 1.2617,
"step": 1639
},
{
"epoch": 0.4033448106246926,
"grad_norm": 4.35833215713501,
"learning_rate": 9.974346616959476e-06,
"loss": 1.3688,
"step": 1640
},
{
"epoch": 0.4035907525823906,
"grad_norm": 4.026985168457031,
"learning_rate": 9.974264267282607e-06,
"loss": 1.2189,
"step": 1641
},
{
"epoch": 0.4038366945400885,
"grad_norm": 4.261682510375977,
"learning_rate": 9.974181785983545e-06,
"loss": 1.2073,
"step": 1642
},
{
"epoch": 0.40408263649778653,
"grad_norm": 4.410908222198486,
"learning_rate": 9.974099173064475e-06,
"loss": 1.483,
"step": 1643
},
{
"epoch": 0.40432857845548453,
"grad_norm": 4.253917217254639,
"learning_rate": 9.974016428527582e-06,
"loss": 1.3658,
"step": 1644
},
{
"epoch": 0.4045745204131825,
"grad_norm": 4.096005439758301,
"learning_rate": 9.973933552375055e-06,
"loss": 1.3937,
"step": 1645
},
{
"epoch": 0.4048204623708805,
"grad_norm": 3.7077298164367676,
"learning_rate": 9.973850544609088e-06,
"loss": 1.2945,
"step": 1646
},
{
"epoch": 0.40506640432857843,
"grad_norm": 3.853376865386963,
"learning_rate": 9.973767405231876e-06,
"loss": 1.2369,
"step": 1647
},
{
"epoch": 0.40531234628627644,
"grad_norm": 3.9138426780700684,
"learning_rate": 9.973684134245619e-06,
"loss": 1.3357,
"step": 1648
},
{
"epoch": 0.40555828824397444,
"grad_norm": 3.591768503189087,
"learning_rate": 9.973600731652522e-06,
"loss": 1.2155,
"step": 1649
},
{
"epoch": 0.4058042302016724,
"grad_norm": 4.636513710021973,
"learning_rate": 9.973517197454791e-06,
"loss": 1.2629,
"step": 1650
},
{
"epoch": 0.4060501721593704,
"grad_norm": 4.1304931640625,
"learning_rate": 9.973433531654638e-06,
"loss": 1.2791,
"step": 1651
},
{
"epoch": 0.4062961141170684,
"grad_norm": 4.247310638427734,
"learning_rate": 9.973349734254272e-06,
"loss": 1.259,
"step": 1652
},
{
"epoch": 0.40654205607476634,
"grad_norm": 4.0282440185546875,
"learning_rate": 9.973265805255914e-06,
"loss": 1.1615,
"step": 1653
},
{
"epoch": 0.40678799803246435,
"grad_norm": 4.163817882537842,
"learning_rate": 9.973181744661786e-06,
"loss": 1.3508,
"step": 1654
},
{
"epoch": 0.4070339399901623,
"grad_norm": 3.934872627258301,
"learning_rate": 9.973097552474112e-06,
"loss": 1.3734,
"step": 1655
},
{
"epoch": 0.4072798819478603,
"grad_norm": 4.217444896697998,
"learning_rate": 9.973013228695116e-06,
"loss": 1.4066,
"step": 1656
},
{
"epoch": 0.4075258239055583,
"grad_norm": 3.809960126876831,
"learning_rate": 9.972928773327033e-06,
"loss": 1.1462,
"step": 1657
},
{
"epoch": 0.40777176586325625,
"grad_norm": 3.8511695861816406,
"learning_rate": 9.972844186372094e-06,
"loss": 1.2744,
"step": 1658
},
{
"epoch": 0.40801770782095426,
"grad_norm": 4.109602451324463,
"learning_rate": 9.972759467832543e-06,
"loss": 1.2293,
"step": 1659
},
{
"epoch": 0.40826364977865226,
"grad_norm": 3.785579204559326,
"learning_rate": 9.972674617710616e-06,
"loss": 1.1041,
"step": 1660
},
{
"epoch": 0.4085095917363502,
"grad_norm": 3.8357796669006348,
"learning_rate": 9.972589636008561e-06,
"loss": 1.1898,
"step": 1661
},
{
"epoch": 0.4087555336940482,
"grad_norm": 3.9706833362579346,
"learning_rate": 9.972504522728627e-06,
"loss": 1.249,
"step": 1662
},
{
"epoch": 0.40900147565174616,
"grad_norm": 3.9192333221435547,
"learning_rate": 9.972419277873065e-06,
"loss": 1.2358,
"step": 1663
},
{
"epoch": 0.40924741760944416,
"grad_norm": 3.789764165878296,
"learning_rate": 9.972333901444131e-06,
"loss": 1.2749,
"step": 1664
},
{
"epoch": 0.40949335956714217,
"grad_norm": 3.6348984241485596,
"learning_rate": 9.972248393444086e-06,
"loss": 1.1627,
"step": 1665
},
{
"epoch": 0.4097393015248401,
"grad_norm": 3.834049940109253,
"learning_rate": 9.972162753875188e-06,
"loss": 1.1485,
"step": 1666
},
{
"epoch": 0.4099852434825381,
"grad_norm": 4.370472431182861,
"learning_rate": 9.972076982739707e-06,
"loss": 1.2862,
"step": 1667
},
{
"epoch": 0.4102311854402361,
"grad_norm": 4.610459804534912,
"learning_rate": 9.971991080039912e-06,
"loss": 1.3237,
"step": 1668
},
{
"epoch": 0.41047712739793407,
"grad_norm": 4.378916263580322,
"learning_rate": 9.971905045778072e-06,
"loss": 1.2826,
"step": 1669
},
{
"epoch": 0.4107230693556321,
"grad_norm": 4.222982406616211,
"learning_rate": 9.97181887995647e-06,
"loss": 1.453,
"step": 1670
},
{
"epoch": 0.4109690113133301,
"grad_norm": 4.118636608123779,
"learning_rate": 9.971732582577383e-06,
"loss": 1.3375,
"step": 1671
},
{
"epoch": 0.411214953271028,
"grad_norm": 4.104123115539551,
"learning_rate": 9.971646153643095e-06,
"loss": 1.2669,
"step": 1672
},
{
"epoch": 0.41146089522872603,
"grad_norm": 3.8620152473449707,
"learning_rate": 9.971559593155891e-06,
"loss": 1.2034,
"step": 1673
},
{
"epoch": 0.411706837186424,
"grad_norm": 4.500912666320801,
"learning_rate": 9.971472901118062e-06,
"loss": 1.3069,
"step": 1674
},
{
"epoch": 0.411952779144122,
"grad_norm": 3.8667545318603516,
"learning_rate": 9.971386077531903e-06,
"loss": 1.2216,
"step": 1675
},
{
"epoch": 0.41219872110182,
"grad_norm": 4.337331295013428,
"learning_rate": 9.971299122399713e-06,
"loss": 1.3822,
"step": 1676
},
{
"epoch": 0.41244466305951794,
"grad_norm": 3.887868642807007,
"learning_rate": 9.971212035723789e-06,
"loss": 1.1628,
"step": 1677
},
{
"epoch": 0.41269060501721594,
"grad_norm": 3.897681951522827,
"learning_rate": 9.971124817506438e-06,
"loss": 1.2596,
"step": 1678
},
{
"epoch": 0.41293654697491394,
"grad_norm": 4.1923394203186035,
"learning_rate": 9.971037467749968e-06,
"loss": 1.2678,
"step": 1679
},
{
"epoch": 0.4131824889326119,
"grad_norm": 4.215851783752441,
"learning_rate": 9.97094998645669e-06,
"loss": 1.1245,
"step": 1680
},
{
"epoch": 0.4134284308903099,
"grad_norm": 4.476115703582764,
"learning_rate": 9.970862373628917e-06,
"loss": 1.176,
"step": 1681
},
{
"epoch": 0.41367437284800784,
"grad_norm": 3.7906100749969482,
"learning_rate": 9.97077462926897e-06,
"loss": 1.3639,
"step": 1682
},
{
"epoch": 0.41392031480570585,
"grad_norm": 4.204716682434082,
"learning_rate": 9.97068675337917e-06,
"loss": 1.1772,
"step": 1683
},
{
"epoch": 0.41416625676340385,
"grad_norm": 3.9978551864624023,
"learning_rate": 9.970598745961842e-06,
"loss": 1.1967,
"step": 1684
},
{
"epoch": 0.4144121987211018,
"grad_norm": 4.0516510009765625,
"learning_rate": 9.970510607019312e-06,
"loss": 1.1957,
"step": 1685
},
{
"epoch": 0.4146581406787998,
"grad_norm": 4.211540699005127,
"learning_rate": 9.970422336553915e-06,
"loss": 1.3079,
"step": 1686
},
{
"epoch": 0.4149040826364978,
"grad_norm": 4.252230644226074,
"learning_rate": 9.970333934567989e-06,
"loss": 1.3859,
"step": 1687
},
{
"epoch": 0.41515002459419575,
"grad_norm": 4.331723213195801,
"learning_rate": 9.970245401063868e-06,
"loss": 1.3533,
"step": 1688
},
{
"epoch": 0.41539596655189376,
"grad_norm": 4.038358688354492,
"learning_rate": 9.970156736043897e-06,
"loss": 1.373,
"step": 1689
},
{
"epoch": 0.41564190850959176,
"grad_norm": 3.7271597385406494,
"learning_rate": 9.970067939510423e-06,
"loss": 1.1983,
"step": 1690
},
{
"epoch": 0.4158878504672897,
"grad_norm": 4.2716522216796875,
"learning_rate": 9.969979011465796e-06,
"loss": 1.2308,
"step": 1691
},
{
"epoch": 0.4161337924249877,
"grad_norm": 4.540455341339111,
"learning_rate": 9.969889951912366e-06,
"loss": 1.4668,
"step": 1692
},
{
"epoch": 0.41637973438268566,
"grad_norm": 4.217796802520752,
"learning_rate": 9.969800760852492e-06,
"loss": 1.2359,
"step": 1693
},
{
"epoch": 0.41662567634038367,
"grad_norm": 4.16795539855957,
"learning_rate": 9.969711438288536e-06,
"loss": 1.1444,
"step": 1694
},
{
"epoch": 0.41687161829808167,
"grad_norm": 4.215379238128662,
"learning_rate": 9.969621984222856e-06,
"loss": 1.3822,
"step": 1695
},
{
"epoch": 0.4171175602557796,
"grad_norm": 4.4914655685424805,
"learning_rate": 9.969532398657824e-06,
"loss": 1.2947,
"step": 1696
},
{
"epoch": 0.4173635022134776,
"grad_norm": 4.293181419372559,
"learning_rate": 9.96944268159581e-06,
"loss": 1.3215,
"step": 1697
},
{
"epoch": 0.4176094441711756,
"grad_norm": 4.25947380065918,
"learning_rate": 9.969352833039184e-06,
"loss": 1.377,
"step": 1698
},
{
"epoch": 0.4178553861288736,
"grad_norm": 3.9051144123077393,
"learning_rate": 9.969262852990327e-06,
"loss": 1.2268,
"step": 1699
},
{
"epoch": 0.4181013280865716,
"grad_norm": 4.223618984222412,
"learning_rate": 9.96917274145162e-06,
"loss": 1.382,
"step": 1700
},
{
"epoch": 0.4183472700442695,
"grad_norm": 4.1638102531433105,
"learning_rate": 9.969082498425445e-06,
"loss": 1.4335,
"step": 1701
},
{
"epoch": 0.41859321200196753,
"grad_norm": 3.7644331455230713,
"learning_rate": 9.968992123914192e-06,
"loss": 1.1836,
"step": 1702
},
{
"epoch": 0.41883915395966553,
"grad_norm": 3.719470262527466,
"learning_rate": 9.968901617920252e-06,
"loss": 1.213,
"step": 1703
},
{
"epoch": 0.4190850959173635,
"grad_norm": 4.034485340118408,
"learning_rate": 9.96881098044602e-06,
"loss": 1.2187,
"step": 1704
},
{
"epoch": 0.4193310378750615,
"grad_norm": 3.7995450496673584,
"learning_rate": 9.968720211493893e-06,
"loss": 1.2806,
"step": 1705
},
{
"epoch": 0.4195769798327595,
"grad_norm": 3.648852825164795,
"learning_rate": 9.968629311066274e-06,
"loss": 1.1993,
"step": 1706
},
{
"epoch": 0.41982292179045744,
"grad_norm": 3.7486045360565186,
"learning_rate": 9.968538279165568e-06,
"loss": 1.2432,
"step": 1707
},
{
"epoch": 0.42006886374815544,
"grad_norm": 4.137381076812744,
"learning_rate": 9.968447115794182e-06,
"loss": 1.247,
"step": 1708
},
{
"epoch": 0.42031480570585344,
"grad_norm": 3.935755729675293,
"learning_rate": 9.968355820954532e-06,
"loss": 1.2045,
"step": 1709
},
{
"epoch": 0.4205607476635514,
"grad_norm": 4.402259826660156,
"learning_rate": 9.96826439464903e-06,
"loss": 1.204,
"step": 1710
},
{
"epoch": 0.4208066896212494,
"grad_norm": 4.0052008628845215,
"learning_rate": 9.9681728368801e-06,
"loss": 1.3365,
"step": 1711
},
{
"epoch": 0.42105263157894735,
"grad_norm": 3.8940837383270264,
"learning_rate": 9.968081147650159e-06,
"loss": 1.2539,
"step": 1712
},
{
"epoch": 0.42129857353664535,
"grad_norm": 3.725008487701416,
"learning_rate": 9.967989326961636e-06,
"loss": 1.2145,
"step": 1713
},
{
"epoch": 0.42154451549434335,
"grad_norm": 4.130488872528076,
"learning_rate": 9.96789737481696e-06,
"loss": 1.2049,
"step": 1714
},
{
"epoch": 0.4217904574520413,
"grad_norm": 3.9401748180389404,
"learning_rate": 9.967805291218564e-06,
"loss": 1.189,
"step": 1715
},
{
"epoch": 0.4220363994097393,
"grad_norm": 4.398349761962891,
"learning_rate": 9.967713076168887e-06,
"loss": 1.2266,
"step": 1716
},
{
"epoch": 0.4222823413674373,
"grad_norm": 4.069632530212402,
"learning_rate": 9.967620729670367e-06,
"loss": 1.3247,
"step": 1717
},
{
"epoch": 0.42252828332513526,
"grad_norm": 4.243809700012207,
"learning_rate": 9.967528251725446e-06,
"loss": 1.2797,
"step": 1718
},
{
"epoch": 0.42277422528283326,
"grad_norm": 3.600376605987549,
"learning_rate": 9.967435642336574e-06,
"loss": 1.1945,
"step": 1719
},
{
"epoch": 0.4230201672405312,
"grad_norm": 4.197199821472168,
"learning_rate": 9.967342901506199e-06,
"loss": 1.27,
"step": 1720
},
{
"epoch": 0.4232661091982292,
"grad_norm": 4.841041088104248,
"learning_rate": 9.967250029236777e-06,
"loss": 1.3612,
"step": 1721
},
{
"epoch": 0.4235120511559272,
"grad_norm": 4.050363540649414,
"learning_rate": 9.967157025530765e-06,
"loss": 1.1946,
"step": 1722
},
{
"epoch": 0.42375799311362516,
"grad_norm": 3.9456565380096436,
"learning_rate": 9.967063890390623e-06,
"loss": 1.3275,
"step": 1723
},
{
"epoch": 0.42400393507132317,
"grad_norm": 4.360344409942627,
"learning_rate": 9.966970623818814e-06,
"loss": 1.3726,
"step": 1724
},
{
"epoch": 0.42424987702902117,
"grad_norm": 4.225467681884766,
"learning_rate": 9.966877225817811e-06,
"loss": 1.2561,
"step": 1725
},
{
"epoch": 0.4244958189867191,
"grad_norm": 3.996978282928467,
"learning_rate": 9.966783696390082e-06,
"loss": 1.2817,
"step": 1726
},
{
"epoch": 0.4247417609444171,
"grad_norm": 3.943695545196533,
"learning_rate": 9.9666900355381e-06,
"loss": 1.3578,
"step": 1727
},
{
"epoch": 0.4249877029021151,
"grad_norm": 3.917283296585083,
"learning_rate": 9.966596243264347e-06,
"loss": 1.2606,
"step": 1728
},
{
"epoch": 0.4252336448598131,
"grad_norm": 4.180567741394043,
"learning_rate": 9.966502319571303e-06,
"loss": 1.4033,
"step": 1729
},
{
"epoch": 0.4254795868175111,
"grad_norm": 3.8179657459259033,
"learning_rate": 9.966408264461454e-06,
"loss": 1.1919,
"step": 1730
},
{
"epoch": 0.42572552877520903,
"grad_norm": 4.061753749847412,
"learning_rate": 9.966314077937287e-06,
"loss": 1.3266,
"step": 1731
},
{
"epoch": 0.42597147073290703,
"grad_norm": 4.150154113769531,
"learning_rate": 9.966219760001296e-06,
"loss": 1.3061,
"step": 1732
},
{
"epoch": 0.42621741269060504,
"grad_norm": 4.119760990142822,
"learning_rate": 9.966125310655977e-06,
"loss": 1.3379,
"step": 1733
},
{
"epoch": 0.426463354648303,
"grad_norm": 3.901587724685669,
"learning_rate": 9.966030729903826e-06,
"loss": 1.1828,
"step": 1734
},
{
"epoch": 0.426709296606001,
"grad_norm": 3.767381429672241,
"learning_rate": 9.96593601774735e-06,
"loss": 1.2014,
"step": 1735
},
{
"epoch": 0.426955238563699,
"grad_norm": 4.349600315093994,
"learning_rate": 9.965841174189054e-06,
"loss": 1.3269,
"step": 1736
},
{
"epoch": 0.42720118052139694,
"grad_norm": 3.9656548500061035,
"learning_rate": 9.965746199231446e-06,
"loss": 1.2944,
"step": 1737
},
{
"epoch": 0.42744712247909494,
"grad_norm": 3.898622512817383,
"learning_rate": 9.96565109287704e-06,
"loss": 1.4506,
"step": 1738
},
{
"epoch": 0.4276930644367929,
"grad_norm": 3.5621798038482666,
"learning_rate": 9.965555855128351e-06,
"loss": 1.2032,
"step": 1739
},
{
"epoch": 0.4279390063944909,
"grad_norm": 3.7923099994659424,
"learning_rate": 9.965460485987902e-06,
"loss": 1.1046,
"step": 1740
},
{
"epoch": 0.4281849483521889,
"grad_norm": 3.9007411003112793,
"learning_rate": 9.965364985458215e-06,
"loss": 1.2097,
"step": 1741
},
{
"epoch": 0.42843089030988685,
"grad_norm": 3.893603801727295,
"learning_rate": 9.965269353541818e-06,
"loss": 1.335,
"step": 1742
},
{
"epoch": 0.42867683226758485,
"grad_norm": 3.8513543605804443,
"learning_rate": 9.96517359024124e-06,
"loss": 1.3712,
"step": 1743
},
{
"epoch": 0.42892277422528285,
"grad_norm": 4.170489311218262,
"learning_rate": 9.965077695559014e-06,
"loss": 1.2544,
"step": 1744
},
{
"epoch": 0.4291687161829808,
"grad_norm": 3.6286261081695557,
"learning_rate": 9.964981669497679e-06,
"loss": 1.266,
"step": 1745
},
{
"epoch": 0.4294146581406788,
"grad_norm": 3.956113576889038,
"learning_rate": 9.964885512059778e-06,
"loss": 1.211,
"step": 1746
},
{
"epoch": 0.4296606000983768,
"grad_norm": 4.089892387390137,
"learning_rate": 9.964789223247852e-06,
"loss": 1.3403,
"step": 1747
},
{
"epoch": 0.42990654205607476,
"grad_norm": 3.949620008468628,
"learning_rate": 9.964692803064449e-06,
"loss": 1.2373,
"step": 1748
},
{
"epoch": 0.43015248401377276,
"grad_norm": 3.984233856201172,
"learning_rate": 9.964596251512124e-06,
"loss": 1.2647,
"step": 1749
},
{
"epoch": 0.4303984259714707,
"grad_norm": 3.987738847732544,
"learning_rate": 9.964499568593427e-06,
"loss": 1.2121,
"step": 1750
},
{
"epoch": 0.4306443679291687,
"grad_norm": 4.157508373260498,
"learning_rate": 9.96440275431092e-06,
"loss": 1.3879,
"step": 1751
},
{
"epoch": 0.4308903098868667,
"grad_norm": 4.179201602935791,
"learning_rate": 9.964305808667163e-06,
"loss": 1.2926,
"step": 1752
},
{
"epoch": 0.43113625184456467,
"grad_norm": 4.021627902984619,
"learning_rate": 9.964208731664722e-06,
"loss": 1.3075,
"step": 1753
},
{
"epoch": 0.43138219380226267,
"grad_norm": 3.6913962364196777,
"learning_rate": 9.964111523306164e-06,
"loss": 1.1331,
"step": 1754
},
{
"epoch": 0.4316281357599607,
"grad_norm": 3.5320358276367188,
"learning_rate": 9.964014183594062e-06,
"loss": 1.1088,
"step": 1755
},
{
"epoch": 0.4318740777176586,
"grad_norm": 3.894617795944214,
"learning_rate": 9.963916712530995e-06,
"loss": 1.2513,
"step": 1756
},
{
"epoch": 0.4321200196753566,
"grad_norm": 3.9999759197235107,
"learning_rate": 9.963819110119536e-06,
"loss": 1.3303,
"step": 1757
},
{
"epoch": 0.4323659616330546,
"grad_norm": 4.015420913696289,
"learning_rate": 9.963721376362275e-06,
"loss": 1.4071,
"step": 1758
},
{
"epoch": 0.4326119035907526,
"grad_norm": 4.301741123199463,
"learning_rate": 9.96362351126179e-06,
"loss": 1.4341,
"step": 1759
},
{
"epoch": 0.4328578455484506,
"grad_norm": 3.7337379455566406,
"learning_rate": 9.963525514820676e-06,
"loss": 1.1608,
"step": 1760
},
{
"epoch": 0.43310378750614853,
"grad_norm": 3.7618815898895264,
"learning_rate": 9.963427387041525e-06,
"loss": 1.1688,
"step": 1761
},
{
"epoch": 0.43334972946384653,
"grad_norm": 4.34212589263916,
"learning_rate": 9.963329127926933e-06,
"loss": 1.2635,
"step": 1762
},
{
"epoch": 0.43359567142154454,
"grad_norm": 4.024925708770752,
"learning_rate": 9.9632307374795e-06,
"loss": 0.9997,
"step": 1763
},
{
"epoch": 0.4338416133792425,
"grad_norm": 4.0820698738098145,
"learning_rate": 9.96313221570183e-06,
"loss": 1.2656,
"step": 1764
},
{
"epoch": 0.4340875553369405,
"grad_norm": 3.836421012878418,
"learning_rate": 9.963033562596529e-06,
"loss": 1.275,
"step": 1765
},
{
"epoch": 0.43433349729463844,
"grad_norm": 3.6705689430236816,
"learning_rate": 9.96293477816621e-06,
"loss": 1.0522,
"step": 1766
},
{
"epoch": 0.43457943925233644,
"grad_norm": 3.9037721157073975,
"learning_rate": 9.962835862413483e-06,
"loss": 1.2687,
"step": 1767
},
{
"epoch": 0.43482538121003445,
"grad_norm": 3.9312140941619873,
"learning_rate": 9.962736815340968e-06,
"loss": 1.2603,
"step": 1768
},
{
"epoch": 0.4350713231677324,
"grad_norm": 3.8491716384887695,
"learning_rate": 9.962637636951283e-06,
"loss": 1.1136,
"step": 1769
},
{
"epoch": 0.4353172651254304,
"grad_norm": 3.957406520843506,
"learning_rate": 9.962538327247056e-06,
"loss": 1.1913,
"step": 1770
},
{
"epoch": 0.4355632070831284,
"grad_norm": 3.9353902339935303,
"learning_rate": 9.962438886230913e-06,
"loss": 1.2891,
"step": 1771
},
{
"epoch": 0.43580914904082635,
"grad_norm": 3.3843936920166016,
"learning_rate": 9.962339313905487e-06,
"loss": 1.0228,
"step": 1772
},
{
"epoch": 0.43605509099852435,
"grad_norm": 4.093668460845947,
"learning_rate": 9.962239610273409e-06,
"loss": 1.1566,
"step": 1773
},
{
"epoch": 0.43630103295622236,
"grad_norm": 3.9964852333068848,
"learning_rate": 9.962139775337323e-06,
"loss": 1.1917,
"step": 1774
},
{
"epoch": 0.4365469749139203,
"grad_norm": 4.125541687011719,
"learning_rate": 9.962039809099863e-06,
"loss": 1.1978,
"step": 1775
},
{
"epoch": 0.4367929168716183,
"grad_norm": 3.889392375946045,
"learning_rate": 9.961939711563682e-06,
"loss": 1.2235,
"step": 1776
},
{
"epoch": 0.43703885882931626,
"grad_norm": 4.051826477050781,
"learning_rate": 9.961839482731422e-06,
"loss": 1.2936,
"step": 1777
},
{
"epoch": 0.43728480078701426,
"grad_norm": 3.9437143802642822,
"learning_rate": 9.96173912260574e-06,
"loss": 1.1962,
"step": 1778
},
{
"epoch": 0.43753074274471226,
"grad_norm": 4.009792327880859,
"learning_rate": 9.96163863118929e-06,
"loss": 1.3704,
"step": 1779
},
{
"epoch": 0.4377766847024102,
"grad_norm": 4.0439043045043945,
"learning_rate": 9.961538008484729e-06,
"loss": 1.3243,
"step": 1780
},
{
"epoch": 0.4380226266601082,
"grad_norm": 3.9895105361938477,
"learning_rate": 9.961437254494722e-06,
"loss": 1.143,
"step": 1781
},
{
"epoch": 0.4382685686178062,
"grad_norm": 4.0932135581970215,
"learning_rate": 9.961336369221935e-06,
"loss": 1.3113,
"step": 1782
},
{
"epoch": 0.43851451057550417,
"grad_norm": 4.329514980316162,
"learning_rate": 9.961235352669037e-06,
"loss": 1.3068,
"step": 1783
},
{
"epoch": 0.43876045253320217,
"grad_norm": 3.956315279006958,
"learning_rate": 9.9611342048387e-06,
"loss": 1.2115,
"step": 1784
},
{
"epoch": 0.4390063944909001,
"grad_norm": 4.140749931335449,
"learning_rate": 9.9610329257336e-06,
"loss": 1.2845,
"step": 1785
},
{
"epoch": 0.4392523364485981,
"grad_norm": 3.7155213356018066,
"learning_rate": 9.96093151535642e-06,
"loss": 1.2526,
"step": 1786
},
{
"epoch": 0.43949827840629613,
"grad_norm": 3.539806365966797,
"learning_rate": 9.960829973709842e-06,
"loss": 1.2492,
"step": 1787
},
{
"epoch": 0.4397442203639941,
"grad_norm": 4.490574359893799,
"learning_rate": 9.960728300796552e-06,
"loss": 1.2853,
"step": 1788
},
{
"epoch": 0.4399901623216921,
"grad_norm": 4.338521957397461,
"learning_rate": 9.960626496619239e-06,
"loss": 1.3287,
"step": 1789
},
{
"epoch": 0.4402361042793901,
"grad_norm": 3.740466356277466,
"learning_rate": 9.9605245611806e-06,
"loss": 1.1081,
"step": 1790
},
{
"epoch": 0.44048204623708803,
"grad_norm": 4.680780410766602,
"learning_rate": 9.96042249448333e-06,
"loss": 1.3728,
"step": 1791
},
{
"epoch": 0.44072798819478604,
"grad_norm": 4.4023919105529785,
"learning_rate": 9.960320296530131e-06,
"loss": 1.3426,
"step": 1792
},
{
"epoch": 0.44097393015248404,
"grad_norm": 4.023207187652588,
"learning_rate": 9.960217967323706e-06,
"loss": 1.2233,
"step": 1793
},
{
"epoch": 0.441219872110182,
"grad_norm": 4.08532190322876,
"learning_rate": 9.960115506866766e-06,
"loss": 1.2274,
"step": 1794
},
{
"epoch": 0.44146581406788,
"grad_norm": 3.907874584197998,
"learning_rate": 9.960012915162017e-06,
"loss": 1.303,
"step": 1795
},
{
"epoch": 0.44171175602557794,
"grad_norm": 3.9624135494232178,
"learning_rate": 9.959910192212177e-06,
"loss": 1.1904,
"step": 1796
},
{
"epoch": 0.44195769798327594,
"grad_norm": 3.8725876808166504,
"learning_rate": 9.959807338019963e-06,
"loss": 1.2696,
"step": 1797
},
{
"epoch": 0.44220363994097395,
"grad_norm": 4.0243754386901855,
"learning_rate": 9.959704352588098e-06,
"loss": 1.1897,
"step": 1798
},
{
"epoch": 0.4424495818986719,
"grad_norm": 3.6399214267730713,
"learning_rate": 9.959601235919306e-06,
"loss": 1.2556,
"step": 1799
},
{
"epoch": 0.4426955238563699,
"grad_norm": 3.8470723628997803,
"learning_rate": 9.959497988016315e-06,
"loss": 1.1725,
"step": 1800
},
{
"epoch": 0.4429414658140679,
"grad_norm": 3.763153076171875,
"learning_rate": 9.959394608881857e-06,
"loss": 1.2315,
"step": 1801
},
{
"epoch": 0.44318740777176585,
"grad_norm": 3.6007325649261475,
"learning_rate": 9.959291098518668e-06,
"loss": 1.1439,
"step": 1802
},
{
"epoch": 0.44343334972946385,
"grad_norm": 3.7789690494537354,
"learning_rate": 9.95918745692949e-06,
"loss": 1.1571,
"step": 1803
},
{
"epoch": 0.4436792916871618,
"grad_norm": 3.424339532852173,
"learning_rate": 9.959083684117057e-06,
"loss": 1.0905,
"step": 1804
},
{
"epoch": 0.4439252336448598,
"grad_norm": 3.9057133197784424,
"learning_rate": 9.958979780084124e-06,
"loss": 1.2572,
"step": 1805
},
{
"epoch": 0.4441711756025578,
"grad_norm": 3.6565279960632324,
"learning_rate": 9.958875744833435e-06,
"loss": 1.1988,
"step": 1806
},
{
"epoch": 0.44441711756025576,
"grad_norm": 4.0411295890808105,
"learning_rate": 9.958771578367748e-06,
"loss": 1.1759,
"step": 1807
},
{
"epoch": 0.44466305951795376,
"grad_norm": 4.041774749755859,
"learning_rate": 9.958667280689812e-06,
"loss": 1.1629,
"step": 1808
},
{
"epoch": 0.44490900147565177,
"grad_norm": 3.7149252891540527,
"learning_rate": 9.95856285180239e-06,
"loss": 1.0954,
"step": 1809
},
{
"epoch": 0.4451549434333497,
"grad_norm": 3.698456048965454,
"learning_rate": 9.95845829170825e-06,
"loss": 1.2618,
"step": 1810
},
{
"epoch": 0.4454008853910477,
"grad_norm": 3.9970273971557617,
"learning_rate": 9.958353600410152e-06,
"loss": 1.3691,
"step": 1811
},
{
"epoch": 0.4456468273487457,
"grad_norm": 3.862743616104126,
"learning_rate": 9.95824877791087e-06,
"loss": 1.3276,
"step": 1812
},
{
"epoch": 0.44589276930644367,
"grad_norm": 4.14730167388916,
"learning_rate": 9.958143824213176e-06,
"loss": 1.2929,
"step": 1813
},
{
"epoch": 0.4461387112641417,
"grad_norm": 4.0874762535095215,
"learning_rate": 9.958038739319846e-06,
"loss": 1.3246,
"step": 1814
},
{
"epoch": 0.4463846532218396,
"grad_norm": 3.6913561820983887,
"learning_rate": 9.957933523233663e-06,
"loss": 1.3354,
"step": 1815
},
{
"epoch": 0.4466305951795376,
"grad_norm": 4.047941207885742,
"learning_rate": 9.957828175957412e-06,
"loss": 1.4389,
"step": 1816
},
{
"epoch": 0.44687653713723563,
"grad_norm": 4.154895782470703,
"learning_rate": 9.957722697493876e-06,
"loss": 1.3517,
"step": 1817
},
{
"epoch": 0.4471224790949336,
"grad_norm": 3.626936197280884,
"learning_rate": 9.95761708784585e-06,
"loss": 1.1072,
"step": 1818
},
{
"epoch": 0.4473684210526316,
"grad_norm": 3.7108163833618164,
"learning_rate": 9.957511347016128e-06,
"loss": 1.2599,
"step": 1819
},
{
"epoch": 0.4476143630103296,
"grad_norm": 4.13779878616333,
"learning_rate": 9.957405475007507e-06,
"loss": 1.2316,
"step": 1820
},
{
"epoch": 0.44786030496802753,
"grad_norm": 4.061727523803711,
"learning_rate": 9.95729947182279e-06,
"loss": 1.383,
"step": 1821
},
{
"epoch": 0.44810624692572554,
"grad_norm": 4.0549235343933105,
"learning_rate": 9.95719333746478e-06,
"loss": 1.3221,
"step": 1822
},
{
"epoch": 0.4483521888834235,
"grad_norm": 3.776146411895752,
"learning_rate": 9.957087071936285e-06,
"loss": 1.182,
"step": 1823
},
{
"epoch": 0.4485981308411215,
"grad_norm": 4.425314903259277,
"learning_rate": 9.956980675240119e-06,
"loss": 1.3675,
"step": 1824
},
{
"epoch": 0.4488440727988195,
"grad_norm": 3.6694552898406982,
"learning_rate": 9.956874147379095e-06,
"loss": 1.1687,
"step": 1825
},
{
"epoch": 0.44909001475651744,
"grad_norm": 3.9561469554901123,
"learning_rate": 9.956767488356033e-06,
"loss": 1.3405,
"step": 1826
},
{
"epoch": 0.44933595671421545,
"grad_norm": 4.421605587005615,
"learning_rate": 9.956660698173757e-06,
"loss": 1.3147,
"step": 1827
},
{
"epoch": 0.44958189867191345,
"grad_norm": 3.948045253753662,
"learning_rate": 9.956553776835092e-06,
"loss": 1.3816,
"step": 1828
},
{
"epoch": 0.4498278406296114,
"grad_norm": 3.762622833251953,
"learning_rate": 9.956446724342863e-06,
"loss": 1.1048,
"step": 1829
},
{
"epoch": 0.4500737825873094,
"grad_norm": 4.232388973236084,
"learning_rate": 9.956339540699909e-06,
"loss": 1.3365,
"step": 1830
},
{
"epoch": 0.4503197245450074,
"grad_norm": 4.125189781188965,
"learning_rate": 9.956232225909061e-06,
"loss": 1.2889,
"step": 1831
},
{
"epoch": 0.45056566650270535,
"grad_norm": 4.245105743408203,
"learning_rate": 9.956124779973161e-06,
"loss": 1.3246,
"step": 1832
},
{
"epoch": 0.45081160846040336,
"grad_norm": 3.9167020320892334,
"learning_rate": 9.956017202895053e-06,
"loss": 1.1802,
"step": 1833
},
{
"epoch": 0.4510575504181013,
"grad_norm": 4.129471302032471,
"learning_rate": 9.95590949467758e-06,
"loss": 1.1758,
"step": 1834
},
{
"epoch": 0.4513034923757993,
"grad_norm": 4.304544925689697,
"learning_rate": 9.955801655323596e-06,
"loss": 1.2377,
"step": 1835
},
{
"epoch": 0.4515494343334973,
"grad_norm": 4.304721832275391,
"learning_rate": 9.955693684835953e-06,
"loss": 1.1999,
"step": 1836
},
{
"epoch": 0.45179537629119526,
"grad_norm": 4.423414707183838,
"learning_rate": 9.955585583217508e-06,
"loss": 1.3558,
"step": 1837
},
{
"epoch": 0.45204131824889326,
"grad_norm": 4.017286777496338,
"learning_rate": 9.95547735047112e-06,
"loss": 1.1928,
"step": 1838
},
{
"epoch": 0.45228726020659127,
"grad_norm": 4.708923816680908,
"learning_rate": 9.955368986599656e-06,
"loss": 1.3722,
"step": 1839
},
{
"epoch": 0.4525332021642892,
"grad_norm": 3.9128808975219727,
"learning_rate": 9.95526049160598e-06,
"loss": 1.1601,
"step": 1840
},
{
"epoch": 0.4527791441219872,
"grad_norm": 3.869041919708252,
"learning_rate": 9.955151865492965e-06,
"loss": 1.1932,
"step": 1841
},
{
"epoch": 0.45302508607968517,
"grad_norm": 3.822822332382202,
"learning_rate": 9.955043108263485e-06,
"loss": 1.2323,
"step": 1842
},
{
"epoch": 0.4532710280373832,
"grad_norm": 3.7289650440216064,
"learning_rate": 9.954934219920416e-06,
"loss": 1.2364,
"step": 1843
},
{
"epoch": 0.4535169699950812,
"grad_norm": 3.946932554244995,
"learning_rate": 9.954825200466642e-06,
"loss": 1.1655,
"step": 1844
},
{
"epoch": 0.4537629119527791,
"grad_norm": 3.9199132919311523,
"learning_rate": 9.954716049905047e-06,
"loss": 1.3437,
"step": 1845
},
{
"epoch": 0.45400885391047713,
"grad_norm": 4.396664142608643,
"learning_rate": 9.954606768238518e-06,
"loss": 1.227,
"step": 1846
},
{
"epoch": 0.45425479586817513,
"grad_norm": 3.9183101654052734,
"learning_rate": 9.954497355469947e-06,
"loss": 1.2362,
"step": 1847
},
{
"epoch": 0.4545007378258731,
"grad_norm": 4.143077850341797,
"learning_rate": 9.95438781160223e-06,
"loss": 1.1514,
"step": 1848
},
{
"epoch": 0.4547466797835711,
"grad_norm": 3.9387755393981934,
"learning_rate": 9.954278136638265e-06,
"loss": 1.2478,
"step": 1849
},
{
"epoch": 0.4549926217412691,
"grad_norm": 4.0431647300720215,
"learning_rate": 9.954168330580953e-06,
"loss": 1.3395,
"step": 1850
},
{
"epoch": 0.45523856369896704,
"grad_norm": 4.180318355560303,
"learning_rate": 9.954058393433203e-06,
"loss": 1.147,
"step": 1851
},
{
"epoch": 0.45548450565666504,
"grad_norm": 3.8078744411468506,
"learning_rate": 9.95394832519792e-06,
"loss": 1.0851,
"step": 1852
},
{
"epoch": 0.455730447614363,
"grad_norm": 4.728276252746582,
"learning_rate": 9.953838125878019e-06,
"loss": 1.3614,
"step": 1853
},
{
"epoch": 0.455976389572061,
"grad_norm": 3.785836935043335,
"learning_rate": 9.953727795476415e-06,
"loss": 1.2611,
"step": 1854
},
{
"epoch": 0.456222331529759,
"grad_norm": 3.79616117477417,
"learning_rate": 9.953617333996027e-06,
"loss": 1.2098,
"step": 1855
},
{
"epoch": 0.45646827348745694,
"grad_norm": 3.957402229309082,
"learning_rate": 9.953506741439777e-06,
"loss": 1.203,
"step": 1856
},
{
"epoch": 0.45671421544515495,
"grad_norm": 3.8189990520477295,
"learning_rate": 9.953396017810597e-06,
"loss": 1.2032,
"step": 1857
},
{
"epoch": 0.45696015740285295,
"grad_norm": 3.956101417541504,
"learning_rate": 9.953285163111409e-06,
"loss": 1.1473,
"step": 1858
},
{
"epoch": 0.4572060993605509,
"grad_norm": 3.659669876098633,
"learning_rate": 9.95317417734515e-06,
"loss": 1.2744,
"step": 1859
},
{
"epoch": 0.4574520413182489,
"grad_norm": 3.7662646770477295,
"learning_rate": 9.953063060514758e-06,
"loss": 1.3268,
"step": 1860
},
{
"epoch": 0.45769798327594685,
"grad_norm": 3.743382453918457,
"learning_rate": 9.952951812623171e-06,
"loss": 1.1249,
"step": 1861
},
{
"epoch": 0.45794392523364486,
"grad_norm": 3.6752781867980957,
"learning_rate": 9.952840433673334e-06,
"loss": 1.1478,
"step": 1862
},
{
"epoch": 0.45818986719134286,
"grad_norm": 3.5902984142303467,
"learning_rate": 9.952728923668194e-06,
"loss": 1.1366,
"step": 1863
},
{
"epoch": 0.4584358091490408,
"grad_norm": 4.367350101470947,
"learning_rate": 9.9526172826107e-06,
"loss": 1.2519,
"step": 1864
},
{
"epoch": 0.4586817511067388,
"grad_norm": 3.9395902156829834,
"learning_rate": 9.95250551050381e-06,
"loss": 1.3798,
"step": 1865
},
{
"epoch": 0.4589276930644368,
"grad_norm": 4.120596885681152,
"learning_rate": 9.952393607350476e-06,
"loss": 1.1624,
"step": 1866
},
{
"epoch": 0.45917363502213476,
"grad_norm": 4.071078300476074,
"learning_rate": 9.952281573153664e-06,
"loss": 1.1974,
"step": 1867
},
{
"epoch": 0.45941957697983277,
"grad_norm": 3.760327100753784,
"learning_rate": 9.952169407916334e-06,
"loss": 1.2535,
"step": 1868
},
{
"epoch": 0.45966551893753077,
"grad_norm": 5.239642143249512,
"learning_rate": 9.952057111641458e-06,
"loss": 1.3289,
"step": 1869
},
{
"epoch": 0.4599114608952287,
"grad_norm": 4.05585241317749,
"learning_rate": 9.951944684332007e-06,
"loss": 1.3801,
"step": 1870
},
{
"epoch": 0.4601574028529267,
"grad_norm": 3.7878942489624023,
"learning_rate": 9.951832125990953e-06,
"loss": 1.2543,
"step": 1871
},
{
"epoch": 0.46040334481062467,
"grad_norm": 3.8404221534729004,
"learning_rate": 9.951719436621276e-06,
"loss": 1.3234,
"step": 1872
},
{
"epoch": 0.4606492867683227,
"grad_norm": 3.9706597328186035,
"learning_rate": 9.951606616225959e-06,
"loss": 1.3592,
"step": 1873
},
{
"epoch": 0.4608952287260207,
"grad_norm": 3.767059326171875,
"learning_rate": 9.951493664807987e-06,
"loss": 1.1389,
"step": 1874
},
{
"epoch": 0.4611411706837186,
"grad_norm": 3.976389169692993,
"learning_rate": 9.951380582370347e-06,
"loss": 1.3031,
"step": 1875
},
{
"epoch": 0.46138711264141663,
"grad_norm": 4.0901360511779785,
"learning_rate": 9.951267368916034e-06,
"loss": 1.1961,
"step": 1876
},
{
"epoch": 0.46163305459911463,
"grad_norm": 4.12688684463501,
"learning_rate": 9.951154024448039e-06,
"loss": 1.2609,
"step": 1877
},
{
"epoch": 0.4618789965568126,
"grad_norm": 3.9449989795684814,
"learning_rate": 9.951040548969366e-06,
"loss": 1.2144,
"step": 1878
},
{
"epoch": 0.4621249385145106,
"grad_norm": 4.022320747375488,
"learning_rate": 9.950926942483014e-06,
"loss": 1.2642,
"step": 1879
},
{
"epoch": 0.46237088047220853,
"grad_norm": 3.930377721786499,
"learning_rate": 9.950813204991992e-06,
"loss": 1.1378,
"step": 1880
},
{
"epoch": 0.46261682242990654,
"grad_norm": 4.022216796875,
"learning_rate": 9.95069933649931e-06,
"loss": 1.3146,
"step": 1881
},
{
"epoch": 0.46286276438760454,
"grad_norm": 4.252761363983154,
"learning_rate": 9.950585337007977e-06,
"loss": 1.4065,
"step": 1882
},
{
"epoch": 0.4631087063453025,
"grad_norm": 3.5620901584625244,
"learning_rate": 9.950471206521013e-06,
"loss": 1.2516,
"step": 1883
},
{
"epoch": 0.4633546483030005,
"grad_norm": 3.9809978008270264,
"learning_rate": 9.950356945041437e-06,
"loss": 1.1763,
"step": 1884
},
{
"epoch": 0.4636005902606985,
"grad_norm": 4.27463960647583,
"learning_rate": 9.950242552572272e-06,
"loss": 1.2919,
"step": 1885
},
{
"epoch": 0.46384653221839645,
"grad_norm": 3.706299066543579,
"learning_rate": 9.950128029116546e-06,
"loss": 1.1556,
"step": 1886
},
{
"epoch": 0.46409247417609445,
"grad_norm": 4.101749897003174,
"learning_rate": 9.950013374677288e-06,
"loss": 1.2746,
"step": 1887
},
{
"epoch": 0.4643384161337924,
"grad_norm": 3.950719118118286,
"learning_rate": 9.949898589257532e-06,
"loss": 1.1648,
"step": 1888
},
{
"epoch": 0.4645843580914904,
"grad_norm": 3.7022833824157715,
"learning_rate": 9.949783672860318e-06,
"loss": 1.1668,
"step": 1889
},
{
"epoch": 0.4648303000491884,
"grad_norm": 4.024631023406982,
"learning_rate": 9.949668625488683e-06,
"loss": 1.2379,
"step": 1890
},
{
"epoch": 0.46507624200688635,
"grad_norm": 4.10100793838501,
"learning_rate": 9.949553447145673e-06,
"loss": 1.237,
"step": 1891
},
{
"epoch": 0.46532218396458436,
"grad_norm": 4.246479034423828,
"learning_rate": 9.949438137834335e-06,
"loss": 1.3947,
"step": 1892
},
{
"epoch": 0.46556812592228236,
"grad_norm": 3.8668911457061768,
"learning_rate": 9.949322697557719e-06,
"loss": 1.2663,
"step": 1893
},
{
"epoch": 0.4658140678799803,
"grad_norm": 3.9335126876831055,
"learning_rate": 9.949207126318883e-06,
"loss": 1.1418,
"step": 1894
},
{
"epoch": 0.4660600098376783,
"grad_norm": 3.5951130390167236,
"learning_rate": 9.949091424120884e-06,
"loss": 1.2383,
"step": 1895
},
{
"epoch": 0.4663059517953763,
"grad_norm": 3.6274986267089844,
"learning_rate": 9.948975590966782e-06,
"loss": 1.1998,
"step": 1896
},
{
"epoch": 0.46655189375307426,
"grad_norm": 3.9153435230255127,
"learning_rate": 9.948859626859643e-06,
"loss": 1.2111,
"step": 1897
},
{
"epoch": 0.46679783571077227,
"grad_norm": 3.897731304168701,
"learning_rate": 9.948743531802534e-06,
"loss": 1.3876,
"step": 1898
},
{
"epoch": 0.4670437776684702,
"grad_norm": 4.087168216705322,
"learning_rate": 9.94862730579853e-06,
"loss": 1.4078,
"step": 1899
},
{
"epoch": 0.4672897196261682,
"grad_norm": 4.067444324493408,
"learning_rate": 9.948510948850704e-06,
"loss": 1.3298,
"step": 1900
},
{
"epoch": 0.4675356615838662,
"grad_norm": 4.171195030212402,
"learning_rate": 9.948394460962136e-06,
"loss": 1.4606,
"step": 1901
},
{
"epoch": 0.4677816035415642,
"grad_norm": 3.8380208015441895,
"learning_rate": 9.948277842135907e-06,
"loss": 1.1855,
"step": 1902
},
{
"epoch": 0.4680275454992622,
"grad_norm": 4.07598352432251,
"learning_rate": 9.948161092375107e-06,
"loss": 1.291,
"step": 1903
},
{
"epoch": 0.4682734874569602,
"grad_norm": 4.1095871925354,
"learning_rate": 9.948044211682818e-06,
"loss": 1.2147,
"step": 1904
},
{
"epoch": 0.46851942941465813,
"grad_norm": 3.9421350955963135,
"learning_rate": 9.947927200062139e-06,
"loss": 1.223,
"step": 1905
},
{
"epoch": 0.46876537137235613,
"grad_norm": 3.6295506954193115,
"learning_rate": 9.947810057516163e-06,
"loss": 1.2193,
"step": 1906
},
{
"epoch": 0.4690113133300541,
"grad_norm": 4.002089977264404,
"learning_rate": 9.94769278404799e-06,
"loss": 1.2972,
"step": 1907
},
{
"epoch": 0.4692572552877521,
"grad_norm": 4.003655433654785,
"learning_rate": 9.947575379660726e-06,
"loss": 1.3925,
"step": 1908
},
{
"epoch": 0.4695031972454501,
"grad_norm": 4.3048248291015625,
"learning_rate": 9.947457844357473e-06,
"loss": 1.3734,
"step": 1909
},
{
"epoch": 0.46974913920314804,
"grad_norm": 3.9142348766326904,
"learning_rate": 9.947340178141346e-06,
"loss": 1.1578,
"step": 1910
},
{
"epoch": 0.46999508116084604,
"grad_norm": 3.6631715297698975,
"learning_rate": 9.947222381015454e-06,
"loss": 1.0813,
"step": 1911
},
{
"epoch": 0.47024102311854404,
"grad_norm": 4.530338764190674,
"learning_rate": 9.947104452982914e-06,
"loss": 1.2681,
"step": 1912
},
{
"epoch": 0.470486965076242,
"grad_norm": 3.9666531085968018,
"learning_rate": 9.946986394046852e-06,
"loss": 1.2989,
"step": 1913
},
{
"epoch": 0.47073290703394,
"grad_norm": 3.9357080459594727,
"learning_rate": 9.946868204210388e-06,
"loss": 1.241,
"step": 1914
},
{
"epoch": 0.470978848991638,
"grad_norm": 4.137866020202637,
"learning_rate": 9.946749883476648e-06,
"loss": 1.2806,
"step": 1915
},
{
"epoch": 0.47122479094933595,
"grad_norm": 4.093836307525635,
"learning_rate": 9.946631431848764e-06,
"loss": 1.2665,
"step": 1916
},
{
"epoch": 0.47147073290703395,
"grad_norm": 4.002551078796387,
"learning_rate": 9.946512849329872e-06,
"loss": 1.4176,
"step": 1917
},
{
"epoch": 0.4717166748647319,
"grad_norm": 3.9968814849853516,
"learning_rate": 9.946394135923108e-06,
"loss": 1.2887,
"step": 1918
},
{
"epoch": 0.4719626168224299,
"grad_norm": 3.983266830444336,
"learning_rate": 9.946275291631616e-06,
"loss": 1.4572,
"step": 1919
},
{
"epoch": 0.4722085587801279,
"grad_norm": 3.70945143699646,
"learning_rate": 9.946156316458535e-06,
"loss": 1.1693,
"step": 1920
},
{
"epoch": 0.47245450073782586,
"grad_norm": 3.679863214492798,
"learning_rate": 9.94603721040702e-06,
"loss": 1.1506,
"step": 1921
},
{
"epoch": 0.47270044269552386,
"grad_norm": 4.299841403961182,
"learning_rate": 9.945917973480217e-06,
"loss": 1.2486,
"step": 1922
},
{
"epoch": 0.47294638465322186,
"grad_norm": 3.637683629989624,
"learning_rate": 9.945798605681283e-06,
"loss": 1.1922,
"step": 1923
},
{
"epoch": 0.4731923266109198,
"grad_norm": 4.093365669250488,
"learning_rate": 9.94567910701338e-06,
"loss": 1.2957,
"step": 1924
},
{
"epoch": 0.4734382685686178,
"grad_norm": 3.693467855453491,
"learning_rate": 9.945559477479663e-06,
"loss": 1.0778,
"step": 1925
},
{
"epoch": 0.47368421052631576,
"grad_norm": 3.9096295833587646,
"learning_rate": 9.945439717083304e-06,
"loss": 1.4279,
"step": 1926
},
{
"epoch": 0.47393015248401377,
"grad_norm": 3.753743886947632,
"learning_rate": 9.945319825827469e-06,
"loss": 1.0839,
"step": 1927
},
{
"epoch": 0.47417609444171177,
"grad_norm": 3.935260534286499,
"learning_rate": 9.945199803715332e-06,
"loss": 1.368,
"step": 1928
},
{
"epoch": 0.4744220363994097,
"grad_norm": 3.5128021240234375,
"learning_rate": 9.945079650750065e-06,
"loss": 1.1367,
"step": 1929
},
{
"epoch": 0.4746679783571077,
"grad_norm": 4.2295451164245605,
"learning_rate": 9.94495936693485e-06,
"loss": 1.379,
"step": 1930
},
{
"epoch": 0.4749139203148057,
"grad_norm": 3.6616461277008057,
"learning_rate": 9.94483895227287e-06,
"loss": 1.1142,
"step": 1931
},
{
"epoch": 0.4751598622725037,
"grad_norm": 3.932229995727539,
"learning_rate": 9.944718406767313e-06,
"loss": 1.299,
"step": 1932
},
{
"epoch": 0.4754058042302017,
"grad_norm": 4.10984992980957,
"learning_rate": 9.944597730421366e-06,
"loss": 1.2553,
"step": 1933
},
{
"epoch": 0.4756517461878997,
"grad_norm": 3.816882371902466,
"learning_rate": 9.94447692323822e-06,
"loss": 1.3786,
"step": 1934
},
{
"epoch": 0.47589768814559763,
"grad_norm": 4.159842491149902,
"learning_rate": 9.944355985221076e-06,
"loss": 1.2545,
"step": 1935
},
{
"epoch": 0.47614363010329563,
"grad_norm": 3.46396541595459,
"learning_rate": 9.944234916373135e-06,
"loss": 1.1126,
"step": 1936
},
{
"epoch": 0.4763895720609936,
"grad_norm": 3.880298376083374,
"learning_rate": 9.944113716697595e-06,
"loss": 1.2393,
"step": 1937
},
{
"epoch": 0.4766355140186916,
"grad_norm": 3.878533363342285,
"learning_rate": 9.943992386197667e-06,
"loss": 1.1464,
"step": 1938
},
{
"epoch": 0.4768814559763896,
"grad_norm": 3.7061398029327393,
"learning_rate": 9.943870924876562e-06,
"loss": 1.2385,
"step": 1939
},
{
"epoch": 0.47712739793408754,
"grad_norm": 3.633507490158081,
"learning_rate": 9.943749332737491e-06,
"loss": 1.2649,
"step": 1940
},
{
"epoch": 0.47737333989178554,
"grad_norm": 3.605943202972412,
"learning_rate": 9.943627609783675e-06,
"loss": 1.2286,
"step": 1941
},
{
"epoch": 0.47761928184948355,
"grad_norm": 3.586588144302368,
"learning_rate": 9.94350575601833e-06,
"loss": 1.1175,
"step": 1942
},
{
"epoch": 0.4778652238071815,
"grad_norm": 3.727569341659546,
"learning_rate": 9.943383771444686e-06,
"loss": 1.3143,
"step": 1943
},
{
"epoch": 0.4781111657648795,
"grad_norm": 4.317198753356934,
"learning_rate": 9.943261656065966e-06,
"loss": 1.481,
"step": 1944
},
{
"epoch": 0.47835710772257745,
"grad_norm": 4.204014778137207,
"learning_rate": 9.943139409885404e-06,
"loss": 1.352,
"step": 1945
},
{
"epoch": 0.47860304968027545,
"grad_norm": 3.654623031616211,
"learning_rate": 9.943017032906233e-06,
"loss": 1.2121,
"step": 1946
},
{
"epoch": 0.47884899163797345,
"grad_norm": 3.9796745777130127,
"learning_rate": 9.942894525131694e-06,
"loss": 1.3292,
"step": 1947
},
{
"epoch": 0.4790949335956714,
"grad_norm": 3.671651601791382,
"learning_rate": 9.942771886565025e-06,
"loss": 1.1725,
"step": 1948
},
{
"epoch": 0.4793408755533694,
"grad_norm": 3.46272873878479,
"learning_rate": 9.942649117209473e-06,
"loss": 1.0945,
"step": 1949
},
{
"epoch": 0.4795868175110674,
"grad_norm": 4.0094523429870605,
"learning_rate": 9.942526217068288e-06,
"loss": 1.2268,
"step": 1950
},
{
"epoch": 0.47983275946876536,
"grad_norm": 3.6403355598449707,
"learning_rate": 9.942403186144717e-06,
"loss": 1.0922,
"step": 1951
},
{
"epoch": 0.48007870142646336,
"grad_norm": 3.967334747314453,
"learning_rate": 9.942280024442023e-06,
"loss": 1.2868,
"step": 1952
},
{
"epoch": 0.48032464338416136,
"grad_norm": 3.878350019454956,
"learning_rate": 9.942156731963458e-06,
"loss": 1.2058,
"step": 1953
},
{
"epoch": 0.4805705853418593,
"grad_norm": 3.98917818069458,
"learning_rate": 9.942033308712289e-06,
"loss": 1.3768,
"step": 1954
},
{
"epoch": 0.4808165272995573,
"grad_norm": 3.5792605876922607,
"learning_rate": 9.941909754691779e-06,
"loss": 1.2073,
"step": 1955
},
{
"epoch": 0.48106246925725527,
"grad_norm": 3.676313638687134,
"learning_rate": 9.941786069905197e-06,
"loss": 1.1866,
"step": 1956
},
{
"epoch": 0.48130841121495327,
"grad_norm": 3.884124517440796,
"learning_rate": 9.94166225435582e-06,
"loss": 1.1596,
"step": 1957
},
{
"epoch": 0.4815543531726513,
"grad_norm": 3.9484810829162598,
"learning_rate": 9.941538308046921e-06,
"loss": 1.1824,
"step": 1958
},
{
"epoch": 0.4818002951303492,
"grad_norm": 4.352466583251953,
"learning_rate": 9.94141423098178e-06,
"loss": 1.4033,
"step": 1959
},
{
"epoch": 0.4820462370880472,
"grad_norm": 4.573991298675537,
"learning_rate": 9.941290023163678e-06,
"loss": 1.3703,
"step": 1960
},
{
"epoch": 0.48229217904574523,
"grad_norm": 4.16006326675415,
"learning_rate": 9.941165684595904e-06,
"loss": 1.3764,
"step": 1961
},
{
"epoch": 0.4825381210034432,
"grad_norm": 3.6659960746765137,
"learning_rate": 9.94104121528175e-06,
"loss": 1.2199,
"step": 1962
},
{
"epoch": 0.4827840629611412,
"grad_norm": 3.8254969120025635,
"learning_rate": 9.940916615224508e-06,
"loss": 1.103,
"step": 1963
},
{
"epoch": 0.48303000491883913,
"grad_norm": 3.8376665115356445,
"learning_rate": 9.940791884427472e-06,
"loss": 1.2012,
"step": 1964
},
{
"epoch": 0.48327594687653713,
"grad_norm": 3.904587984085083,
"learning_rate": 9.940667022893946e-06,
"loss": 1.2889,
"step": 1965
},
{
"epoch": 0.48352188883423514,
"grad_norm": 3.651560068130493,
"learning_rate": 9.940542030627233e-06,
"loss": 1.1202,
"step": 1966
},
{
"epoch": 0.4837678307919331,
"grad_norm": 3.729316234588623,
"learning_rate": 9.94041690763064e-06,
"loss": 1.121,
"step": 1967
},
{
"epoch": 0.4840137727496311,
"grad_norm": 3.424711227416992,
"learning_rate": 9.94029165390748e-06,
"loss": 1.1006,
"step": 1968
},
{
"epoch": 0.4842597147073291,
"grad_norm": 3.6561214923858643,
"learning_rate": 9.940166269461063e-06,
"loss": 1.1653,
"step": 1969
},
{
"epoch": 0.48450565666502704,
"grad_norm": 4.033416271209717,
"learning_rate": 9.940040754294709e-06,
"loss": 1.2197,
"step": 1970
},
{
"epoch": 0.48475159862272504,
"grad_norm": 3.9733481407165527,
"learning_rate": 9.939915108411742e-06,
"loss": 1.1775,
"step": 1971
},
{
"epoch": 0.48499754058042305,
"grad_norm": 4.134637355804443,
"learning_rate": 9.939789331815481e-06,
"loss": 1.3114,
"step": 1972
},
{
"epoch": 0.485243482538121,
"grad_norm": 4.119969844818115,
"learning_rate": 9.939663424509258e-06,
"loss": 1.2379,
"step": 1973
},
{
"epoch": 0.485489424495819,
"grad_norm": 3.8746256828308105,
"learning_rate": 9.939537386496406e-06,
"loss": 1.2003,
"step": 1974
},
{
"epoch": 0.48573536645351695,
"grad_norm": 4.291780471801758,
"learning_rate": 9.939411217780255e-06,
"loss": 1.4127,
"step": 1975
},
{
"epoch": 0.48598130841121495,
"grad_norm": 3.90728497505188,
"learning_rate": 9.939284918364146e-06,
"loss": 1.2671,
"step": 1976
},
{
"epoch": 0.48622725036891296,
"grad_norm": 4.024746894836426,
"learning_rate": 9.939158488251422e-06,
"loss": 1.214,
"step": 1977
},
{
"epoch": 0.4864731923266109,
"grad_norm": 3.861607789993286,
"learning_rate": 9.939031927445428e-06,
"loss": 1.3681,
"step": 1978
},
{
"epoch": 0.4867191342843089,
"grad_norm": 3.815272331237793,
"learning_rate": 9.938905235949512e-06,
"loss": 1.2555,
"step": 1979
},
{
"epoch": 0.4869650762420069,
"grad_norm": 3.862234115600586,
"learning_rate": 9.938778413767028e-06,
"loss": 1.2809,
"step": 1980
},
{
"epoch": 0.48721101819970486,
"grad_norm": 3.4098005294799805,
"learning_rate": 9.93865146090133e-06,
"loss": 1.1325,
"step": 1981
},
{
"epoch": 0.48745696015740286,
"grad_norm": 4.0395402908325195,
"learning_rate": 9.938524377355777e-06,
"loss": 1.3119,
"step": 1982
},
{
"epoch": 0.4877029021151008,
"grad_norm": 3.775066375732422,
"learning_rate": 9.938397163133734e-06,
"loss": 1.1884,
"step": 1983
},
{
"epoch": 0.4879488440727988,
"grad_norm": 3.618936777114868,
"learning_rate": 9.938269818238565e-06,
"loss": 1.1944,
"step": 1984
},
{
"epoch": 0.4881947860304968,
"grad_norm": 3.6919541358947754,
"learning_rate": 9.93814234267364e-06,
"loss": 1.1996,
"step": 1985
},
{
"epoch": 0.48844072798819477,
"grad_norm": 3.6189892292022705,
"learning_rate": 9.938014736442334e-06,
"loss": 1.0956,
"step": 1986
},
{
"epoch": 0.48868666994589277,
"grad_norm": 4.037132740020752,
"learning_rate": 9.937886999548021e-06,
"loss": 1.2639,
"step": 1987
},
{
"epoch": 0.4889326119035908,
"grad_norm": 4.207380771636963,
"learning_rate": 9.937759131994082e-06,
"loss": 1.1067,
"step": 1988
},
{
"epoch": 0.4891785538612887,
"grad_norm": 3.466580390930176,
"learning_rate": 9.9376311337839e-06,
"loss": 1.1966,
"step": 1989
},
{
"epoch": 0.4894244958189867,
"grad_norm": 3.867206573486328,
"learning_rate": 9.937503004920862e-06,
"loss": 1.1126,
"step": 1990
},
{
"epoch": 0.4896704377766847,
"grad_norm": 3.9632785320281982,
"learning_rate": 9.937374745408361e-06,
"loss": 1.2181,
"step": 1991
},
{
"epoch": 0.4899163797343827,
"grad_norm": 3.7429189682006836,
"learning_rate": 9.937246355249788e-06,
"loss": 1.3753,
"step": 1992
},
{
"epoch": 0.4901623216920807,
"grad_norm": 4.2485175132751465,
"learning_rate": 9.937117834448541e-06,
"loss": 1.1618,
"step": 1993
},
{
"epoch": 0.49040826364977863,
"grad_norm": 3.815783739089966,
"learning_rate": 9.936989183008019e-06,
"loss": 1.19,
"step": 1994
},
{
"epoch": 0.49065420560747663,
"grad_norm": 3.8669593334198,
"learning_rate": 9.93686040093163e-06,
"loss": 1.2259,
"step": 1995
},
{
"epoch": 0.49090014756517464,
"grad_norm": 3.570322036743164,
"learning_rate": 9.936731488222776e-06,
"loss": 1.2814,
"step": 1996
},
{
"epoch": 0.4911460895228726,
"grad_norm": 3.743537187576294,
"learning_rate": 9.936602444884875e-06,
"loss": 1.2242,
"step": 1997
},
{
"epoch": 0.4913920314805706,
"grad_norm": 4.1929521560668945,
"learning_rate": 9.936473270921338e-06,
"loss": 1.1539,
"step": 1998
},
{
"epoch": 0.4916379734382686,
"grad_norm": 3.7359580993652344,
"learning_rate": 9.936343966335582e-06,
"loss": 1.0486,
"step": 1999
},
{
"epoch": 0.49188391539596654,
"grad_norm": 4.1101884841918945,
"learning_rate": 9.93621453113103e-06,
"loss": 1.2879,
"step": 2000
},
{
"epoch": 0.49188391539596654,
"eval_loss": 1.2986953258514404,
"eval_runtime": 13.7999,
"eval_samples_per_second": 28.986,
"eval_steps_per_second": 3.623,
"step": 2000
},
{
"epoch": 0.49212985735366455,
"grad_norm": 3.967435359954834,
"learning_rate": 9.936084965311108e-06,
"loss": 1.1475,
"step": 2001
},
{
"epoch": 0.4923757993113625,
"grad_norm": 4.118821144104004,
"learning_rate": 9.93595526887924e-06,
"loss": 1.3943,
"step": 2002
},
{
"epoch": 0.4926217412690605,
"grad_norm": 4.116652965545654,
"learning_rate": 9.935825441838863e-06,
"loss": 1.3202,
"step": 2003
},
{
"epoch": 0.4928676832267585,
"grad_norm": 3.8607633113861084,
"learning_rate": 9.935695484193411e-06,
"loss": 1.2787,
"step": 2004
},
{
"epoch": 0.49311362518445645,
"grad_norm": 3.862600088119507,
"learning_rate": 9.93556539594632e-06,
"loss": 1.3122,
"step": 2005
},
{
"epoch": 0.49335956714215445,
"grad_norm": 3.6925604343414307,
"learning_rate": 9.935435177101034e-06,
"loss": 1.2038,
"step": 2006
},
{
"epoch": 0.49360550909985246,
"grad_norm": 3.5358126163482666,
"learning_rate": 9.935304827661e-06,
"loss": 1.0529,
"step": 2007
},
{
"epoch": 0.4938514510575504,
"grad_norm": 3.598153829574585,
"learning_rate": 9.935174347629665e-06,
"loss": 1.2226,
"step": 2008
},
{
"epoch": 0.4940973930152484,
"grad_norm": 3.5963072776794434,
"learning_rate": 9.935043737010484e-06,
"loss": 1.1802,
"step": 2009
},
{
"epoch": 0.49434333497294636,
"grad_norm": 4.162814140319824,
"learning_rate": 9.93491299580691e-06,
"loss": 1.2829,
"step": 2010
},
{
"epoch": 0.49458927693064436,
"grad_norm": 3.849273204803467,
"learning_rate": 9.934782124022404e-06,
"loss": 1.195,
"step": 2011
},
{
"epoch": 0.49483521888834237,
"grad_norm": 3.66589617729187,
"learning_rate": 9.93465112166043e-06,
"loss": 1.2865,
"step": 2012
},
{
"epoch": 0.4950811608460403,
"grad_norm": 3.696441173553467,
"learning_rate": 9.934519988724454e-06,
"loss": 1.2806,
"step": 2013
},
{
"epoch": 0.4953271028037383,
"grad_norm": 3.5013318061828613,
"learning_rate": 9.934388725217944e-06,
"loss": 1.2424,
"step": 2014
},
{
"epoch": 0.4955730447614363,
"grad_norm": 3.6759259700775146,
"learning_rate": 9.934257331144374e-06,
"loss": 1.2476,
"step": 2015
},
{
"epoch": 0.49581898671913427,
"grad_norm": 4.029742240905762,
"learning_rate": 9.934125806507224e-06,
"loss": 1.3536,
"step": 2016
},
{
"epoch": 0.4960649286768323,
"grad_norm": 3.8330271244049072,
"learning_rate": 9.93399415130997e-06,
"loss": 1.2729,
"step": 2017
},
{
"epoch": 0.4963108706345303,
"grad_norm": 3.763774871826172,
"learning_rate": 9.933862365556097e-06,
"loss": 1.2589,
"step": 2018
},
{
"epoch": 0.4965568125922282,
"grad_norm": 3.9705257415771484,
"learning_rate": 9.933730449249092e-06,
"loss": 1.1863,
"step": 2019
},
{
"epoch": 0.49680275454992623,
"grad_norm": 3.5106635093688965,
"learning_rate": 9.933598402392446e-06,
"loss": 1.1836,
"step": 2020
},
{
"epoch": 0.4970486965076242,
"grad_norm": 3.7407944202423096,
"learning_rate": 9.933466224989655e-06,
"loss": 1.2678,
"step": 2021
},
{
"epoch": 0.4972946384653222,
"grad_norm": 4.435426712036133,
"learning_rate": 9.933333917044212e-06,
"loss": 1.3141,
"step": 2022
},
{
"epoch": 0.4975405804230202,
"grad_norm": 4.202254772186279,
"learning_rate": 9.933201478559622e-06,
"loss": 1.3371,
"step": 2023
},
{
"epoch": 0.49778652238071813,
"grad_norm": 3.6481056213378906,
"learning_rate": 9.933068909539388e-06,
"loss": 1.3461,
"step": 2024
},
{
"epoch": 0.49803246433841614,
"grad_norm": 4.222822666168213,
"learning_rate": 9.932936209987018e-06,
"loss": 1.2844,
"step": 2025
},
{
"epoch": 0.49827840629611414,
"grad_norm": 3.862213611602783,
"learning_rate": 9.932803379906023e-06,
"loss": 1.4386,
"step": 2026
},
{
"epoch": 0.4985243482538121,
"grad_norm": 3.7118172645568848,
"learning_rate": 9.932670419299917e-06,
"loss": 1.1724,
"step": 2027
},
{
"epoch": 0.4987702902115101,
"grad_norm": 3.9896233081817627,
"learning_rate": 9.932537328172219e-06,
"loss": 1.2116,
"step": 2028
},
{
"epoch": 0.49901623216920804,
"grad_norm": 4.1085333824157715,
"learning_rate": 9.93240410652645e-06,
"loss": 1.3326,
"step": 2029
},
{
"epoch": 0.49926217412690604,
"grad_norm": 3.623394250869751,
"learning_rate": 9.932270754366138e-06,
"loss": 1.2478,
"step": 2030
},
{
"epoch": 0.49950811608460405,
"grad_norm": 3.866910457611084,
"learning_rate": 9.932137271694809e-06,
"loss": 1.1765,
"step": 2031
},
{
"epoch": 0.499754058042302,
"grad_norm": 3.7941431999206543,
"learning_rate": 9.932003658515994e-06,
"loss": 1.1476,
"step": 2032
},
{
"epoch": 0.5,
"grad_norm": 4.244842052459717,
"learning_rate": 9.931869914833233e-06,
"loss": 1.3927,
"step": 2033
},
{
"epoch": 0.500245941957698,
"grad_norm": 4.038697719573975,
"learning_rate": 9.931736040650058e-06,
"loss": 1.2626,
"step": 2034
},
{
"epoch": 0.500491883915396,
"grad_norm": 4.15991735458374,
"learning_rate": 9.931602035970019e-06,
"loss": 1.1629,
"step": 2035
},
{
"epoch": 0.5007378258730939,
"grad_norm": 4.975515365600586,
"learning_rate": 9.931467900796655e-06,
"loss": 1.3475,
"step": 2036
},
{
"epoch": 0.5009837678307919,
"grad_norm": 3.9195396900177,
"learning_rate": 9.93133363513352e-06,
"loss": 1.2324,
"step": 2037
},
{
"epoch": 0.5012297097884899,
"grad_norm": 4.083966255187988,
"learning_rate": 9.931199238984166e-06,
"loss": 1.2811,
"step": 2038
},
{
"epoch": 0.5014756517461879,
"grad_norm": 3.506024122238159,
"learning_rate": 9.931064712352146e-06,
"loss": 1.2404,
"step": 2039
},
{
"epoch": 0.5017215937038859,
"grad_norm": 3.989150047302246,
"learning_rate": 9.930930055241024e-06,
"loss": 1.1874,
"step": 2040
},
{
"epoch": 0.5019675356615839,
"grad_norm": 3.9103245735168457,
"learning_rate": 9.93079526765436e-06,
"loss": 1.3555,
"step": 2041
},
{
"epoch": 0.5022134776192818,
"grad_norm": 3.6908440589904785,
"learning_rate": 9.930660349595724e-06,
"loss": 1.2857,
"step": 2042
},
{
"epoch": 0.5024594195769798,
"grad_norm": 3.6231374740600586,
"learning_rate": 9.930525301068682e-06,
"loss": 1.2437,
"step": 2043
},
{
"epoch": 0.5027053615346778,
"grad_norm": 3.912170886993408,
"learning_rate": 9.930390122076809e-06,
"loss": 1.3454,
"step": 2044
},
{
"epoch": 0.5029513034923758,
"grad_norm": 3.7156691551208496,
"learning_rate": 9.930254812623683e-06,
"loss": 1.2055,
"step": 2045
},
{
"epoch": 0.5031972454500738,
"grad_norm": 3.5612075328826904,
"learning_rate": 9.930119372712883e-06,
"loss": 1.1037,
"step": 2046
},
{
"epoch": 0.5034431874077717,
"grad_norm": 3.7106776237487793,
"learning_rate": 9.929983802347995e-06,
"loss": 1.2478,
"step": 2047
},
{
"epoch": 0.5036891293654697,
"grad_norm": 3.7999792098999023,
"learning_rate": 9.929848101532602e-06,
"loss": 1.1183,
"step": 2048
},
{
"epoch": 0.5039350713231677,
"grad_norm": 3.7028729915618896,
"learning_rate": 9.929712270270299e-06,
"loss": 1.2234,
"step": 2049
},
{
"epoch": 0.5041810132808657,
"grad_norm": 3.797987461090088,
"learning_rate": 9.929576308564679e-06,
"loss": 1.0742,
"step": 2050
},
{
"epoch": 0.5044269552385637,
"grad_norm": 3.8822202682495117,
"learning_rate": 9.929440216419338e-06,
"loss": 1.0826,
"step": 2051
},
{
"epoch": 0.5046728971962616,
"grad_norm": 3.9708447456359863,
"learning_rate": 9.929303993837879e-06,
"loss": 1.2823,
"step": 2052
},
{
"epoch": 0.5049188391539596,
"grad_norm": 3.590642213821411,
"learning_rate": 9.929167640823906e-06,
"loss": 1.1307,
"step": 2053
},
{
"epoch": 0.5051647811116576,
"grad_norm": 3.9430091381073,
"learning_rate": 9.929031157381026e-06,
"loss": 1.3191,
"step": 2054
},
{
"epoch": 0.5054107230693556,
"grad_norm": 3.947977304458618,
"learning_rate": 9.928894543512852e-06,
"loss": 1.4042,
"step": 2055
},
{
"epoch": 0.5056566650270536,
"grad_norm": 3.5514469146728516,
"learning_rate": 9.928757799222997e-06,
"loss": 1.0452,
"step": 2056
},
{
"epoch": 0.5059026069847516,
"grad_norm": 3.8537063598632812,
"learning_rate": 9.928620924515083e-06,
"loss": 1.2804,
"step": 2057
},
{
"epoch": 0.5061485489424495,
"grad_norm": 4.256128787994385,
"learning_rate": 9.928483919392726e-06,
"loss": 1.3699,
"step": 2058
},
{
"epoch": 0.5063944909001475,
"grad_norm": 3.881159782409668,
"learning_rate": 9.928346783859557e-06,
"loss": 1.2048,
"step": 2059
},
{
"epoch": 0.5066404328578455,
"grad_norm": 3.956533193588257,
"learning_rate": 9.9282095179192e-06,
"loss": 1.3071,
"step": 2060
},
{
"epoch": 0.5068863748155436,
"grad_norm": 3.715815782546997,
"learning_rate": 9.928072121575292e-06,
"loss": 1.1568,
"step": 2061
},
{
"epoch": 0.5071323167732416,
"grad_norm": 3.7111854553222656,
"learning_rate": 9.927934594831464e-06,
"loss": 1.1119,
"step": 2062
},
{
"epoch": 0.5073782587309394,
"grad_norm": 3.6359939575195312,
"learning_rate": 9.927796937691359e-06,
"loss": 1.3777,
"step": 2063
},
{
"epoch": 0.5076242006886375,
"grad_norm": 4.061556816101074,
"learning_rate": 9.927659150158615e-06,
"loss": 1.0503,
"step": 2064
},
{
"epoch": 0.5078701426463355,
"grad_norm": 3.513235092163086,
"learning_rate": 9.927521232236881e-06,
"loss": 1.2649,
"step": 2065
},
{
"epoch": 0.5081160846040335,
"grad_norm": 4.023743152618408,
"learning_rate": 9.927383183929806e-06,
"loss": 1.3651,
"step": 2066
},
{
"epoch": 0.5083620265617315,
"grad_norm": 4.000420570373535,
"learning_rate": 9.927245005241044e-06,
"loss": 1.3349,
"step": 2067
},
{
"epoch": 0.5086079685194295,
"grad_norm": 3.6318671703338623,
"learning_rate": 9.92710669617425e-06,
"loss": 1.1868,
"step": 2068
},
{
"epoch": 0.5088539104771274,
"grad_norm": 3.8208847045898438,
"learning_rate": 9.926968256733082e-06,
"loss": 1.4228,
"step": 2069
},
{
"epoch": 0.5090998524348254,
"grad_norm": 3.954742193222046,
"learning_rate": 9.926829686921205e-06,
"loss": 1.2466,
"step": 2070
},
{
"epoch": 0.5093457943925234,
"grad_norm": 3.7758522033691406,
"learning_rate": 9.926690986742287e-06,
"loss": 1.1032,
"step": 2071
},
{
"epoch": 0.5095917363502214,
"grad_norm": 4.136940002441406,
"learning_rate": 9.926552156199995e-06,
"loss": 1.2156,
"step": 2072
},
{
"epoch": 0.5098376783079194,
"grad_norm": 3.8263015747070312,
"learning_rate": 9.926413195298006e-06,
"loss": 1.2441,
"step": 2073
},
{
"epoch": 0.5100836202656173,
"grad_norm": 3.843029022216797,
"learning_rate": 9.926274104039994e-06,
"loss": 1.2781,
"step": 2074
},
{
"epoch": 0.5103295622233153,
"grad_norm": 3.8957035541534424,
"learning_rate": 9.92613488242964e-06,
"loss": 1.2826,
"step": 2075
},
{
"epoch": 0.5105755041810133,
"grad_norm": 4.320259094238281,
"learning_rate": 9.925995530470628e-06,
"loss": 1.1829,
"step": 2076
},
{
"epoch": 0.5108214461387113,
"grad_norm": 4.178318023681641,
"learning_rate": 9.92585604816665e-06,
"loss": 1.2709,
"step": 2077
},
{
"epoch": 0.5110673880964093,
"grad_norm": 3.9186699390411377,
"learning_rate": 9.925716435521388e-06,
"loss": 1.46,
"step": 2078
},
{
"epoch": 0.5113133300541073,
"grad_norm": 3.4883475303649902,
"learning_rate": 9.925576692538544e-06,
"loss": 1.2192,
"step": 2079
},
{
"epoch": 0.5115592720118052,
"grad_norm": 4.000193119049072,
"learning_rate": 9.925436819221811e-06,
"loss": 1.2318,
"step": 2080
},
{
"epoch": 0.5118052139695032,
"grad_norm": 3.6601035594940186,
"learning_rate": 9.925296815574891e-06,
"loss": 1.1287,
"step": 2081
},
{
"epoch": 0.5120511559272012,
"grad_norm": 3.8468799591064453,
"learning_rate": 9.925156681601491e-06,
"loss": 1.1828,
"step": 2082
},
{
"epoch": 0.5122970978848992,
"grad_norm": 3.876176118850708,
"learning_rate": 9.925016417305318e-06,
"loss": 1.1536,
"step": 2083
},
{
"epoch": 0.5125430398425972,
"grad_norm": 3.493006706237793,
"learning_rate": 9.92487602269008e-06,
"loss": 1.2001,
"step": 2084
},
{
"epoch": 0.5127889818002951,
"grad_norm": 3.758375644683838,
"learning_rate": 9.924735497759497e-06,
"loss": 1.1026,
"step": 2085
},
{
"epoch": 0.5130349237579931,
"grad_norm": 3.8290538787841797,
"learning_rate": 9.924594842517283e-06,
"loss": 1.2039,
"step": 2086
},
{
"epoch": 0.5132808657156911,
"grad_norm": 4.018424034118652,
"learning_rate": 9.924454056967164e-06,
"loss": 1.2666,
"step": 2087
},
{
"epoch": 0.5135268076733891,
"grad_norm": 4.096243381500244,
"learning_rate": 9.924313141112863e-06,
"loss": 1.3054,
"step": 2088
},
{
"epoch": 0.5137727496310871,
"grad_norm": 3.8434042930603027,
"learning_rate": 9.924172094958108e-06,
"loss": 1.3055,
"step": 2089
},
{
"epoch": 0.514018691588785,
"grad_norm": 3.610990047454834,
"learning_rate": 9.924030918506633e-06,
"loss": 1.166,
"step": 2090
},
{
"epoch": 0.514264633546483,
"grad_norm": 3.614128828048706,
"learning_rate": 9.923889611762173e-06,
"loss": 1.1907,
"step": 2091
},
{
"epoch": 0.514510575504181,
"grad_norm": 4.061952590942383,
"learning_rate": 9.923748174728466e-06,
"loss": 1.3634,
"step": 2092
},
{
"epoch": 0.514756517461879,
"grad_norm": 4.365742206573486,
"learning_rate": 9.923606607409255e-06,
"loss": 1.298,
"step": 2093
},
{
"epoch": 0.515002459419577,
"grad_norm": 3.9800992012023926,
"learning_rate": 9.923464909808288e-06,
"loss": 1.16,
"step": 2094
},
{
"epoch": 0.515248401377275,
"grad_norm": 3.3853609561920166,
"learning_rate": 9.923323081929311e-06,
"loss": 0.9991,
"step": 2095
},
{
"epoch": 0.5154943433349729,
"grad_norm": 4.26129674911499,
"learning_rate": 9.923181123776079e-06,
"loss": 1.256,
"step": 2096
},
{
"epoch": 0.5157402852926709,
"grad_norm": 3.891066789627075,
"learning_rate": 9.923039035352348e-06,
"loss": 1.2962,
"step": 2097
},
{
"epoch": 0.5159862272503689,
"grad_norm": 3.8612704277038574,
"learning_rate": 9.922896816661878e-06,
"loss": 1.2102,
"step": 2098
},
{
"epoch": 0.5162321692080669,
"grad_norm": 4.163247108459473,
"learning_rate": 9.92275446770843e-06,
"loss": 1.2871,
"step": 2099
},
{
"epoch": 0.5164781111657649,
"grad_norm": 4.322293758392334,
"learning_rate": 9.922611988495775e-06,
"loss": 1.442,
"step": 2100
},
{
"epoch": 0.5167240531234628,
"grad_norm": 3.9699535369873047,
"learning_rate": 9.92246937902768e-06,
"loss": 1.3266,
"step": 2101
},
{
"epoch": 0.5169699950811608,
"grad_norm": 3.2191202640533447,
"learning_rate": 9.922326639307918e-06,
"loss": 1.066,
"step": 2102
},
{
"epoch": 0.5172159370388588,
"grad_norm": 3.8557350635528564,
"learning_rate": 9.922183769340266e-06,
"loss": 1.2004,
"step": 2103
},
{
"epoch": 0.5174618789965568,
"grad_norm": 3.7254083156585693,
"learning_rate": 9.922040769128508e-06,
"loss": 1.2755,
"step": 2104
},
{
"epoch": 0.5177078209542548,
"grad_norm": 3.475280523300171,
"learning_rate": 9.921897638676424e-06,
"loss": 1.1968,
"step": 2105
},
{
"epoch": 0.5179537629119528,
"grad_norm": 4.05603551864624,
"learning_rate": 9.921754377987803e-06,
"loss": 1.163,
"step": 2106
},
{
"epoch": 0.5181997048696507,
"grad_norm": 3.606374740600586,
"learning_rate": 9.921610987066435e-06,
"loss": 1.2534,
"step": 2107
},
{
"epoch": 0.5184456468273487,
"grad_norm": 3.7716317176818848,
"learning_rate": 9.921467465916115e-06,
"loss": 1.2434,
"step": 2108
},
{
"epoch": 0.5186915887850467,
"grad_norm": 4.482640743255615,
"learning_rate": 9.92132381454064e-06,
"loss": 1.2953,
"step": 2109
},
{
"epoch": 0.5189375307427447,
"grad_norm": 3.8885245323181152,
"learning_rate": 9.921180032943813e-06,
"loss": 1.2717,
"step": 2110
},
{
"epoch": 0.5191834727004427,
"grad_norm": 3.9882400035858154,
"learning_rate": 9.921036121129436e-06,
"loss": 1.3811,
"step": 2111
},
{
"epoch": 0.5194294146581406,
"grad_norm": 4.286352634429932,
"learning_rate": 9.920892079101317e-06,
"loss": 1.3744,
"step": 2112
},
{
"epoch": 0.5196753566158386,
"grad_norm": 3.7288010120391846,
"learning_rate": 9.92074790686327e-06,
"loss": 1.2556,
"step": 2113
},
{
"epoch": 0.5199212985735366,
"grad_norm": 3.98539662361145,
"learning_rate": 9.920603604419107e-06,
"loss": 1.3711,
"step": 2114
},
{
"epoch": 0.5201672405312346,
"grad_norm": 3.7938284873962402,
"learning_rate": 9.92045917177265e-06,
"loss": 1.3,
"step": 2115
},
{
"epoch": 0.5204131824889326,
"grad_norm": 3.4648549556732178,
"learning_rate": 9.920314608927714e-06,
"loss": 1.1993,
"step": 2116
},
{
"epoch": 0.5206591244466307,
"grad_norm": 3.928187608718872,
"learning_rate": 9.920169915888132e-06,
"loss": 1.142,
"step": 2117
},
{
"epoch": 0.5209050664043285,
"grad_norm": 3.8039536476135254,
"learning_rate": 9.92002509265773e-06,
"loss": 1.1847,
"step": 2118
},
{
"epoch": 0.5211510083620265,
"grad_norm": 3.7212073802948,
"learning_rate": 9.919880139240338e-06,
"loss": 1.1679,
"step": 2119
},
{
"epoch": 0.5213969503197246,
"grad_norm": 3.897338390350342,
"learning_rate": 9.919735055639792e-06,
"loss": 1.3224,
"step": 2120
},
{
"epoch": 0.5216428922774226,
"grad_norm": 3.7549211978912354,
"learning_rate": 9.919589841859933e-06,
"loss": 1.0967,
"step": 2121
},
{
"epoch": 0.5218888342351206,
"grad_norm": 3.7513699531555176,
"learning_rate": 9.919444497904603e-06,
"loss": 1.1401,
"step": 2122
},
{
"epoch": 0.5221347761928185,
"grad_norm": 4.222444534301758,
"learning_rate": 9.919299023777647e-06,
"loss": 1.3255,
"step": 2123
},
{
"epoch": 0.5223807181505165,
"grad_norm": 3.9544715881347656,
"learning_rate": 9.919153419482914e-06,
"loss": 1.1714,
"step": 2124
},
{
"epoch": 0.5226266601082145,
"grad_norm": 3.5288383960723877,
"learning_rate": 9.919007685024259e-06,
"loss": 1.2024,
"step": 2125
},
{
"epoch": 0.5228726020659125,
"grad_norm": 3.7131669521331787,
"learning_rate": 9.918861820405536e-06,
"loss": 1.1845,
"step": 2126
},
{
"epoch": 0.5231185440236105,
"grad_norm": 3.8817179203033447,
"learning_rate": 9.918715825630605e-06,
"loss": 1.3452,
"step": 2127
},
{
"epoch": 0.5233644859813084,
"grad_norm": 3.616724729537964,
"learning_rate": 9.918569700703329e-06,
"loss": 1.1525,
"step": 2128
},
{
"epoch": 0.5236104279390064,
"grad_norm": 4.315052032470703,
"learning_rate": 9.918423445627577e-06,
"loss": 1.3273,
"step": 2129
},
{
"epoch": 0.5238563698967044,
"grad_norm": 3.92807936668396,
"learning_rate": 9.918277060407215e-06,
"loss": 1.2138,
"step": 2130
},
{
"epoch": 0.5241023118544024,
"grad_norm": 4.177854061126709,
"learning_rate": 9.918130545046118e-06,
"loss": 1.2996,
"step": 2131
},
{
"epoch": 0.5243482538121004,
"grad_norm": 3.4063313007354736,
"learning_rate": 9.917983899548164e-06,
"loss": 1.1174,
"step": 2132
},
{
"epoch": 0.5245941957697984,
"grad_norm": 3.9306552410125732,
"learning_rate": 9.917837123917234e-06,
"loss": 1.3987,
"step": 2133
},
{
"epoch": 0.5248401377274963,
"grad_norm": 4.0166144371032715,
"learning_rate": 9.917690218157209e-06,
"loss": 1.2887,
"step": 2134
},
{
"epoch": 0.5250860796851943,
"grad_norm": 3.9425570964813232,
"learning_rate": 9.917543182271979e-06,
"loss": 1.1646,
"step": 2135
},
{
"epoch": 0.5253320216428923,
"grad_norm": 4.2844390869140625,
"learning_rate": 9.917396016265433e-06,
"loss": 1.3076,
"step": 2136
},
{
"epoch": 0.5255779636005903,
"grad_norm": 3.8552944660186768,
"learning_rate": 9.917248720141466e-06,
"loss": 1.2303,
"step": 2137
},
{
"epoch": 0.5258239055582883,
"grad_norm": 4.2328386306762695,
"learning_rate": 9.917101293903975e-06,
"loss": 1.2847,
"step": 2138
},
{
"epoch": 0.5260698475159862,
"grad_norm": 3.8737504482269287,
"learning_rate": 9.91695373755686e-06,
"loss": 1.1887,
"step": 2139
},
{
"epoch": 0.5263157894736842,
"grad_norm": 3.8874359130859375,
"learning_rate": 9.916806051104026e-06,
"loss": 1.1871,
"step": 2140
},
{
"epoch": 0.5265617314313822,
"grad_norm": 3.8853654861450195,
"learning_rate": 9.91665823454938e-06,
"loss": 1.1276,
"step": 2141
},
{
"epoch": 0.5268076733890802,
"grad_norm": 3.638686180114746,
"learning_rate": 9.916510287896839e-06,
"loss": 1.1423,
"step": 2142
},
{
"epoch": 0.5270536153467782,
"grad_norm": 3.9215664863586426,
"learning_rate": 9.91636221115031e-06,
"loss": 1.3244,
"step": 2143
},
{
"epoch": 0.5272995573044762,
"grad_norm": 3.6984713077545166,
"learning_rate": 9.916214004313715e-06,
"loss": 1.1339,
"step": 2144
},
{
"epoch": 0.5275454992621741,
"grad_norm": 3.809002637863159,
"learning_rate": 9.916065667390977e-06,
"loss": 1.1653,
"step": 2145
},
{
"epoch": 0.5277914412198721,
"grad_norm": 3.771930694580078,
"learning_rate": 9.915917200386017e-06,
"loss": 1.2428,
"step": 2146
},
{
"epoch": 0.5280373831775701,
"grad_norm": 3.664764642715454,
"learning_rate": 9.915768603302769e-06,
"loss": 1.1142,
"step": 2147
},
{
"epoch": 0.5282833251352681,
"grad_norm": 3.921821355819702,
"learning_rate": 9.915619876145159e-06,
"loss": 1.4106,
"step": 2148
},
{
"epoch": 0.5285292670929661,
"grad_norm": 4.273387432098389,
"learning_rate": 9.915471018917126e-06,
"loss": 1.1286,
"step": 2149
},
{
"epoch": 0.528775209050664,
"grad_norm": 3.7936770915985107,
"learning_rate": 9.91532203162261e-06,
"loss": 1.1333,
"step": 2150
},
{
"epoch": 0.529021151008362,
"grad_norm": 3.792909622192383,
"learning_rate": 9.91517291426555e-06,
"loss": 1.1714,
"step": 2151
},
{
"epoch": 0.52926709296606,
"grad_norm": 4.0380635261535645,
"learning_rate": 9.915023666849892e-06,
"loss": 1.1927,
"step": 2152
},
{
"epoch": 0.529513034923758,
"grad_norm": 3.8567912578582764,
"learning_rate": 9.914874289379588e-06,
"loss": 1.2979,
"step": 2153
},
{
"epoch": 0.529758976881456,
"grad_norm": 3.6963160037994385,
"learning_rate": 9.914724781858589e-06,
"loss": 1.2023,
"step": 2154
},
{
"epoch": 0.5300049188391539,
"grad_norm": 3.8904123306274414,
"learning_rate": 9.91457514429085e-06,
"loss": 1.3359,
"step": 2155
},
{
"epoch": 0.5302508607968519,
"grad_norm": 4.023045539855957,
"learning_rate": 9.914425376680332e-06,
"loss": 1.2833,
"step": 2156
},
{
"epoch": 0.5304968027545499,
"grad_norm": 3.6560332775115967,
"learning_rate": 9.914275479030997e-06,
"loss": 1.2047,
"step": 2157
},
{
"epoch": 0.5307427447122479,
"grad_norm": 3.8492634296417236,
"learning_rate": 9.914125451346812e-06,
"loss": 1.3672,
"step": 2158
},
{
"epoch": 0.5309886866699459,
"grad_norm": 3.9805543422698975,
"learning_rate": 9.913975293631747e-06,
"loss": 1.1205,
"step": 2159
},
{
"epoch": 0.5312346286276439,
"grad_norm": 3.5246667861938477,
"learning_rate": 9.913825005889774e-06,
"loss": 1.1076,
"step": 2160
},
{
"epoch": 0.5314805705853418,
"grad_norm": 3.7201192378997803,
"learning_rate": 9.913674588124872e-06,
"loss": 1.1818,
"step": 2161
},
{
"epoch": 0.5317265125430398,
"grad_norm": 3.846723794937134,
"learning_rate": 9.91352404034102e-06,
"loss": 1.335,
"step": 2162
},
{
"epoch": 0.5319724545007378,
"grad_norm": 3.690541982650757,
"learning_rate": 9.9133733625422e-06,
"loss": 1.2507,
"step": 2163
},
{
"epoch": 0.5322183964584358,
"grad_norm": 4.099282264709473,
"learning_rate": 9.913222554732403e-06,
"loss": 1.4178,
"step": 2164
},
{
"epoch": 0.5324643384161338,
"grad_norm": 3.674947738647461,
"learning_rate": 9.913071616915614e-06,
"loss": 1.1614,
"step": 2165
},
{
"epoch": 0.5327102803738317,
"grad_norm": 3.7865703105926514,
"learning_rate": 9.912920549095831e-06,
"loss": 1.2844,
"step": 2166
},
{
"epoch": 0.5329562223315297,
"grad_norm": 3.8465161323547363,
"learning_rate": 9.912769351277051e-06,
"loss": 1.2109,
"step": 2167
},
{
"epoch": 0.5332021642892277,
"grad_norm": 3.9602277278900146,
"learning_rate": 9.912618023463273e-06,
"loss": 1.2794,
"step": 2168
},
{
"epoch": 0.5334481062469257,
"grad_norm": 4.112135887145996,
"learning_rate": 9.912466565658503e-06,
"loss": 1.5095,
"step": 2169
},
{
"epoch": 0.5336940482046237,
"grad_norm": 4.16323709487915,
"learning_rate": 9.912314977866745e-06,
"loss": 1.2315,
"step": 2170
},
{
"epoch": 0.5339399901623217,
"grad_norm": 3.9193637371063232,
"learning_rate": 9.912163260092016e-06,
"loss": 1.3873,
"step": 2171
},
{
"epoch": 0.5341859321200196,
"grad_norm": 3.801978826522827,
"learning_rate": 9.912011412338327e-06,
"loss": 1.2524,
"step": 2172
},
{
"epoch": 0.5344318740777176,
"grad_norm": 4.096787929534912,
"learning_rate": 9.911859434609697e-06,
"loss": 1.4394,
"step": 2173
},
{
"epoch": 0.5346778160354156,
"grad_norm": 4.017948627471924,
"learning_rate": 9.911707326910145e-06,
"loss": 1.3005,
"step": 2174
},
{
"epoch": 0.5349237579931136,
"grad_norm": 3.61559796333313,
"learning_rate": 9.9115550892437e-06,
"loss": 1.3074,
"step": 2175
},
{
"epoch": 0.5351696999508117,
"grad_norm": 3.982522487640381,
"learning_rate": 9.911402721614387e-06,
"loss": 1.2862,
"step": 2176
},
{
"epoch": 0.5354156419085095,
"grad_norm": 3.665327787399292,
"learning_rate": 9.91125022402624e-06,
"loss": 1.1416,
"step": 2177
},
{
"epoch": 0.5356615838662075,
"grad_norm": 3.400005578994751,
"learning_rate": 9.91109759648329e-06,
"loss": 1.0753,
"step": 2178
},
{
"epoch": 0.5359075258239056,
"grad_norm": 3.9149506092071533,
"learning_rate": 9.91094483898958e-06,
"loss": 1.2341,
"step": 2179
},
{
"epoch": 0.5361534677816036,
"grad_norm": 3.8370378017425537,
"learning_rate": 9.910791951549153e-06,
"loss": 1.1859,
"step": 2180
},
{
"epoch": 0.5363994097393016,
"grad_norm": 3.566390037536621,
"learning_rate": 9.910638934166051e-06,
"loss": 1.1548,
"step": 2181
},
{
"epoch": 0.5366453516969996,
"grad_norm": 3.774796962738037,
"learning_rate": 9.910485786844323e-06,
"loss": 1.2864,
"step": 2182
},
{
"epoch": 0.5368912936546975,
"grad_norm": 3.829087257385254,
"learning_rate": 9.910332509588023e-06,
"loss": 1.2937,
"step": 2183
},
{
"epoch": 0.5371372356123955,
"grad_norm": 4.066363334655762,
"learning_rate": 9.910179102401208e-06,
"loss": 1.2717,
"step": 2184
},
{
"epoch": 0.5373831775700935,
"grad_norm": 3.7114639282226562,
"learning_rate": 9.910025565287932e-06,
"loss": 1.3247,
"step": 2185
},
{
"epoch": 0.5376291195277915,
"grad_norm": 3.8042311668395996,
"learning_rate": 9.909871898252264e-06,
"loss": 1.1708,
"step": 2186
},
{
"epoch": 0.5378750614854895,
"grad_norm": 4.228545188903809,
"learning_rate": 9.909718101298266e-06,
"loss": 1.3272,
"step": 2187
},
{
"epoch": 0.5381210034431874,
"grad_norm": 3.591923236846924,
"learning_rate": 9.90956417443001e-06,
"loss": 1.1418,
"step": 2188
},
{
"epoch": 0.5383669454008854,
"grad_norm": 3.8637077808380127,
"learning_rate": 9.90941011765157e-06,
"loss": 1.3119,
"step": 2189
},
{
"epoch": 0.5386128873585834,
"grad_norm": 3.6931893825531006,
"learning_rate": 9.909255930967016e-06,
"loss": 1.2951,
"step": 2190
},
{
"epoch": 0.5388588293162814,
"grad_norm": 4.096282482147217,
"learning_rate": 9.909101614380436e-06,
"loss": 1.2658,
"step": 2191
},
{
"epoch": 0.5391047712739794,
"grad_norm": 3.953883647918701,
"learning_rate": 9.908947167895907e-06,
"loss": 1.2661,
"step": 2192
},
{
"epoch": 0.5393507132316773,
"grad_norm": 3.763455867767334,
"learning_rate": 9.908792591517519e-06,
"loss": 1.3278,
"step": 2193
},
{
"epoch": 0.5395966551893753,
"grad_norm": 3.7555782794952393,
"learning_rate": 9.908637885249364e-06,
"loss": 1.2247,
"step": 2194
},
{
"epoch": 0.5398425971470733,
"grad_norm": 4.057918071746826,
"learning_rate": 9.90848304909553e-06,
"loss": 1.2941,
"step": 2195
},
{
"epoch": 0.5400885391047713,
"grad_norm": 4.127137184143066,
"learning_rate": 9.90832808306012e-06,
"loss": 1.2488,
"step": 2196
},
{
"epoch": 0.5403344810624693,
"grad_norm": 3.885181427001953,
"learning_rate": 9.90817298714723e-06,
"loss": 1.2515,
"step": 2197
},
{
"epoch": 0.5405804230201673,
"grad_norm": 4.153881072998047,
"learning_rate": 9.908017761360966e-06,
"loss": 1.313,
"step": 2198
},
{
"epoch": 0.5408263649778652,
"grad_norm": 3.8210184574127197,
"learning_rate": 9.907862405705436e-06,
"loss": 1.3602,
"step": 2199
},
{
"epoch": 0.5410723069355632,
"grad_norm": 4.095151901245117,
"learning_rate": 9.907706920184747e-06,
"loss": 1.3403,
"step": 2200
},
{
"epoch": 0.5413182488932612,
"grad_norm": 4.248672962188721,
"learning_rate": 9.907551304803018e-06,
"loss": 1.2929,
"step": 2201
},
{
"epoch": 0.5415641908509592,
"grad_norm": 3.753624677658081,
"learning_rate": 9.907395559564365e-06,
"loss": 1.3438,
"step": 2202
},
{
"epoch": 0.5418101328086572,
"grad_norm": 3.935276985168457,
"learning_rate": 9.907239684472909e-06,
"loss": 1.3107,
"step": 2203
},
{
"epoch": 0.5420560747663551,
"grad_norm": 3.7254276275634766,
"learning_rate": 9.907083679532774e-06,
"loss": 1.3467,
"step": 2204
},
{
"epoch": 0.5423020167240531,
"grad_norm": 3.979093074798584,
"learning_rate": 9.906927544748088e-06,
"loss": 1.3087,
"step": 2205
},
{
"epoch": 0.5425479586817511,
"grad_norm": 3.724224090576172,
"learning_rate": 9.906771280122985e-06,
"loss": 1.1919,
"step": 2206
},
{
"epoch": 0.5427939006394491,
"grad_norm": 3.631563425064087,
"learning_rate": 9.906614885661594e-06,
"loss": 1.157,
"step": 2207
},
{
"epoch": 0.5430398425971471,
"grad_norm": 4.078211307525635,
"learning_rate": 9.906458361368061e-06,
"loss": 1.365,
"step": 2208
},
{
"epoch": 0.5432857845548451,
"grad_norm": 4.011194705963135,
"learning_rate": 9.906301707246522e-06,
"loss": 1.0917,
"step": 2209
},
{
"epoch": 0.543531726512543,
"grad_norm": 3.90392804145813,
"learning_rate": 9.906144923301124e-06,
"loss": 1.2067,
"step": 2210
},
{
"epoch": 0.543777668470241,
"grad_norm": 3.4873545169830322,
"learning_rate": 9.905988009536016e-06,
"loss": 1.2143,
"step": 2211
},
{
"epoch": 0.544023610427939,
"grad_norm": 4.265255928039551,
"learning_rate": 9.905830965955348e-06,
"loss": 1.3741,
"step": 2212
},
{
"epoch": 0.544269552385637,
"grad_norm": 3.920677423477173,
"learning_rate": 9.905673792563277e-06,
"loss": 1.2464,
"step": 2213
},
{
"epoch": 0.544515494343335,
"grad_norm": 4.0740275382995605,
"learning_rate": 9.905516489363964e-06,
"loss": 1.2079,
"step": 2214
},
{
"epoch": 0.5447614363010329,
"grad_norm": 4.170569896697998,
"learning_rate": 9.905359056361567e-06,
"loss": 1.2328,
"step": 2215
},
{
"epoch": 0.5450073782587309,
"grad_norm": 3.767235040664673,
"learning_rate": 9.905201493560256e-06,
"loss": 1.3322,
"step": 2216
},
{
"epoch": 0.5452533202164289,
"grad_norm": 4.272323131561279,
"learning_rate": 9.905043800964198e-06,
"loss": 1.4155,
"step": 2217
},
{
"epoch": 0.5454992621741269,
"grad_norm": 3.653945207595825,
"learning_rate": 9.904885978577565e-06,
"loss": 1.161,
"step": 2218
},
{
"epoch": 0.5457452041318249,
"grad_norm": 3.7385060787200928,
"learning_rate": 9.904728026404536e-06,
"loss": 1.3429,
"step": 2219
},
{
"epoch": 0.5459911460895229,
"grad_norm": 3.7524309158325195,
"learning_rate": 9.904569944449287e-06,
"loss": 1.281,
"step": 2220
},
{
"epoch": 0.5462370880472208,
"grad_norm": 3.6796000003814697,
"learning_rate": 9.904411732716003e-06,
"loss": 1.2364,
"step": 2221
},
{
"epoch": 0.5464830300049188,
"grad_norm": 3.611638069152832,
"learning_rate": 9.90425339120887e-06,
"loss": 1.192,
"step": 2222
},
{
"epoch": 0.5467289719626168,
"grad_norm": 3.8202872276306152,
"learning_rate": 9.904094919932076e-06,
"loss": 1.1032,
"step": 2223
},
{
"epoch": 0.5469749139203148,
"grad_norm": 3.6413397789001465,
"learning_rate": 9.903936318889817e-06,
"loss": 1.1389,
"step": 2224
},
{
"epoch": 0.5472208558780128,
"grad_norm": 3.64487886428833,
"learning_rate": 9.90377758808629e-06,
"loss": 1.1281,
"step": 2225
},
{
"epoch": 0.5474667978357107,
"grad_norm": 3.9430019855499268,
"learning_rate": 9.903618727525693e-06,
"loss": 1.1949,
"step": 2226
},
{
"epoch": 0.5477127397934087,
"grad_norm": 3.914421558380127,
"learning_rate": 9.90345973721223e-06,
"loss": 1.2489,
"step": 2227
},
{
"epoch": 0.5479586817511067,
"grad_norm": 3.457014560699463,
"learning_rate": 9.903300617150107e-06,
"loss": 1.2024,
"step": 2228
},
{
"epoch": 0.5482046237088047,
"grad_norm": 3.9405999183654785,
"learning_rate": 9.903141367343536e-06,
"loss": 1.311,
"step": 2229
},
{
"epoch": 0.5484505656665027,
"grad_norm": 3.847198486328125,
"learning_rate": 9.902981987796731e-06,
"loss": 1.3065,
"step": 2230
},
{
"epoch": 0.5486965076242006,
"grad_norm": 3.8664727210998535,
"learning_rate": 9.90282247851391e-06,
"loss": 1.278,
"step": 2231
},
{
"epoch": 0.5489424495818986,
"grad_norm": 3.9664089679718018,
"learning_rate": 9.90266283949929e-06,
"loss": 1.2225,
"step": 2232
},
{
"epoch": 0.5491883915395966,
"grad_norm": 3.702806234359741,
"learning_rate": 9.902503070757101e-06,
"loss": 1.2729,
"step": 2233
},
{
"epoch": 0.5494343334972946,
"grad_norm": 3.9769954681396484,
"learning_rate": 9.902343172291564e-06,
"loss": 1.338,
"step": 2234
},
{
"epoch": 0.5496802754549927,
"grad_norm": 3.7994213104248047,
"learning_rate": 9.902183144106916e-06,
"loss": 1.3621,
"step": 2235
},
{
"epoch": 0.5499262174126907,
"grad_norm": 3.7221739292144775,
"learning_rate": 9.902022986207387e-06,
"loss": 1.1884,
"step": 2236
},
{
"epoch": 0.5501721593703885,
"grad_norm": 3.961033344268799,
"learning_rate": 9.901862698597218e-06,
"loss": 1.3031,
"step": 2237
},
{
"epoch": 0.5504181013280866,
"grad_norm": 3.9215621948242188,
"learning_rate": 9.901702281280647e-06,
"loss": 1.1882,
"step": 2238
},
{
"epoch": 0.5506640432857846,
"grad_norm": 3.8801207542419434,
"learning_rate": 9.901541734261922e-06,
"loss": 1.4134,
"step": 2239
},
{
"epoch": 0.5509099852434826,
"grad_norm": 4.020729064941406,
"learning_rate": 9.90138105754529e-06,
"loss": 1.3136,
"step": 2240
},
{
"epoch": 0.5511559272011806,
"grad_norm": 3.5103695392608643,
"learning_rate": 9.901220251135002e-06,
"loss": 1.1105,
"step": 2241
},
{
"epoch": 0.5514018691588785,
"grad_norm": 4.272065162658691,
"learning_rate": 9.901059315035313e-06,
"loss": 1.2283,
"step": 2242
},
{
"epoch": 0.5516478111165765,
"grad_norm": 3.7734808921813965,
"learning_rate": 9.900898249250483e-06,
"loss": 1.3203,
"step": 2243
},
{
"epoch": 0.5518937530742745,
"grad_norm": 4.140749931335449,
"learning_rate": 9.900737053784772e-06,
"loss": 1.3552,
"step": 2244
},
{
"epoch": 0.5521396950319725,
"grad_norm": 3.8629486560821533,
"learning_rate": 9.900575728642447e-06,
"loss": 1.1814,
"step": 2245
},
{
"epoch": 0.5523856369896705,
"grad_norm": 4.07596492767334,
"learning_rate": 9.900414273827775e-06,
"loss": 1.2867,
"step": 2246
},
{
"epoch": 0.5526315789473685,
"grad_norm": 4.385118007659912,
"learning_rate": 9.900252689345031e-06,
"loss": 1.2185,
"step": 2247
},
{
"epoch": 0.5528775209050664,
"grad_norm": 3.483334541320801,
"learning_rate": 9.900090975198486e-06,
"loss": 1.3169,
"step": 2248
},
{
"epoch": 0.5531234628627644,
"grad_norm": 4.053977012634277,
"learning_rate": 9.899929131392424e-06,
"loss": 1.2221,
"step": 2249
},
{
"epoch": 0.5533694048204624,
"grad_norm": 3.9417266845703125,
"learning_rate": 9.899767157931124e-06,
"loss": 1.2663,
"step": 2250
},
{
"epoch": 0.5536153467781604,
"grad_norm": 3.6486167907714844,
"learning_rate": 9.899605054818874e-06,
"loss": 1.2414,
"step": 2251
},
{
"epoch": 0.5538612887358584,
"grad_norm": 4.290881156921387,
"learning_rate": 9.899442822059963e-06,
"loss": 1.3576,
"step": 2252
},
{
"epoch": 0.5541072306935563,
"grad_norm": 4.092667579650879,
"learning_rate": 9.899280459658682e-06,
"loss": 1.3233,
"step": 2253
},
{
"epoch": 0.5543531726512543,
"grad_norm": 4.383387565612793,
"learning_rate": 9.89911796761933e-06,
"loss": 1.3839,
"step": 2254
},
{
"epoch": 0.5545991146089523,
"grad_norm": 3.9149868488311768,
"learning_rate": 9.898955345946206e-06,
"loss": 1.2491,
"step": 2255
},
{
"epoch": 0.5548450565666503,
"grad_norm": 3.77732253074646,
"learning_rate": 9.89879259464361e-06,
"loss": 1.2702,
"step": 2256
},
{
"epoch": 0.5550909985243483,
"grad_norm": 3.574117422103882,
"learning_rate": 9.898629713715853e-06,
"loss": 1.1796,
"step": 2257
},
{
"epoch": 0.5553369404820462,
"grad_norm": 4.200451374053955,
"learning_rate": 9.898466703167241e-06,
"loss": 1.2617,
"step": 2258
},
{
"epoch": 0.5555828824397442,
"grad_norm": 3.6135802268981934,
"learning_rate": 9.89830356300209e-06,
"loss": 1.2277,
"step": 2259
},
{
"epoch": 0.5558288243974422,
"grad_norm": 3.9752767086029053,
"learning_rate": 9.898140293224716e-06,
"loss": 1.3861,
"step": 2260
},
{
"epoch": 0.5560747663551402,
"grad_norm": 4.422356128692627,
"learning_rate": 9.897976893839437e-06,
"loss": 1.3473,
"step": 2261
},
{
"epoch": 0.5563207083128382,
"grad_norm": 4.22304105758667,
"learning_rate": 9.897813364850581e-06,
"loss": 1.1922,
"step": 2262
},
{
"epoch": 0.5565666502705362,
"grad_norm": 3.488956928253174,
"learning_rate": 9.897649706262474e-06,
"loss": 1.108,
"step": 2263
},
{
"epoch": 0.5568125922282341,
"grad_norm": 3.97953462600708,
"learning_rate": 9.897485918079444e-06,
"loss": 1.1729,
"step": 2264
},
{
"epoch": 0.5570585341859321,
"grad_norm": 3.739452362060547,
"learning_rate": 9.897322000305824e-06,
"loss": 1.3262,
"step": 2265
},
{
"epoch": 0.5573044761436301,
"grad_norm": 3.9541773796081543,
"learning_rate": 9.897157952945957e-06,
"loss": 1.3407,
"step": 2266
},
{
"epoch": 0.5575504181013281,
"grad_norm": 3.636178731918335,
"learning_rate": 9.89699377600418e-06,
"loss": 1.118,
"step": 2267
},
{
"epoch": 0.5577963600590261,
"grad_norm": 4.47128438949585,
"learning_rate": 9.896829469484837e-06,
"loss": 1.4396,
"step": 2268
},
{
"epoch": 0.558042302016724,
"grad_norm": 4.095100402832031,
"learning_rate": 9.896665033392277e-06,
"loss": 1.3949,
"step": 2269
},
{
"epoch": 0.558288243974422,
"grad_norm": 3.8174242973327637,
"learning_rate": 9.896500467730849e-06,
"loss": 1.0782,
"step": 2270
},
{
"epoch": 0.55853418593212,
"grad_norm": 3.531322479248047,
"learning_rate": 9.89633577250491e-06,
"loss": 1.1161,
"step": 2271
},
{
"epoch": 0.558780127889818,
"grad_norm": 3.682903528213501,
"learning_rate": 9.896170947718817e-06,
"loss": 1.3295,
"step": 2272
},
{
"epoch": 0.559026069847516,
"grad_norm": 3.810790777206421,
"learning_rate": 9.89600599337693e-06,
"loss": 1.3107,
"step": 2273
},
{
"epoch": 0.559272011805214,
"grad_norm": 3.445693016052246,
"learning_rate": 9.895840909483616e-06,
"loss": 1.1309,
"step": 2274
},
{
"epoch": 0.5595179537629119,
"grad_norm": 3.731858968734741,
"learning_rate": 9.895675696043243e-06,
"loss": 1.2659,
"step": 2275
},
{
"epoch": 0.5597638957206099,
"grad_norm": 3.6502883434295654,
"learning_rate": 9.895510353060181e-06,
"loss": 1.2625,
"step": 2276
},
{
"epoch": 0.5600098376783079,
"grad_norm": 4.325182914733887,
"learning_rate": 9.895344880538806e-06,
"loss": 1.3612,
"step": 2277
},
{
"epoch": 0.5602557796360059,
"grad_norm": 3.8467063903808594,
"learning_rate": 9.895179278483498e-06,
"loss": 1.2686,
"step": 2278
},
{
"epoch": 0.5605017215937039,
"grad_norm": 3.8830080032348633,
"learning_rate": 9.895013546898635e-06,
"loss": 1.1545,
"step": 2279
},
{
"epoch": 0.5607476635514018,
"grad_norm": 4.065216064453125,
"learning_rate": 9.894847685788607e-06,
"loss": 1.2427,
"step": 2280
},
{
"epoch": 0.5609936055090998,
"grad_norm": 3.713036298751831,
"learning_rate": 9.8946816951578e-06,
"loss": 1.2103,
"step": 2281
},
{
"epoch": 0.5612395474667978,
"grad_norm": 3.562397003173828,
"learning_rate": 9.894515575010606e-06,
"loss": 1.1242,
"step": 2282
},
{
"epoch": 0.5614854894244958,
"grad_norm": 4.16795015335083,
"learning_rate": 9.894349325351422e-06,
"loss": 1.3818,
"step": 2283
},
{
"epoch": 0.5617314313821938,
"grad_norm": 3.6272799968719482,
"learning_rate": 9.894182946184645e-06,
"loss": 1.1888,
"step": 2284
},
{
"epoch": 0.5619773733398918,
"grad_norm": 4.448729991912842,
"learning_rate": 9.894016437514682e-06,
"loss": 1.2134,
"step": 2285
},
{
"epoch": 0.5622233152975897,
"grad_norm": 3.5368943214416504,
"learning_rate": 9.893849799345933e-06,
"loss": 1.313,
"step": 2286
},
{
"epoch": 0.5624692572552877,
"grad_norm": 4.07132625579834,
"learning_rate": 9.893683031682813e-06,
"loss": 1.2708,
"step": 2287
},
{
"epoch": 0.5627151992129857,
"grad_norm": 4.010115146636963,
"learning_rate": 9.893516134529732e-06,
"loss": 1.1907,
"step": 2288
},
{
"epoch": 0.5629611411706837,
"grad_norm": 3.8525285720825195,
"learning_rate": 9.893349107891104e-06,
"loss": 1.168,
"step": 2289
},
{
"epoch": 0.5632070831283817,
"grad_norm": 3.3778891563415527,
"learning_rate": 9.893181951771353e-06,
"loss": 1.1658,
"step": 2290
},
{
"epoch": 0.5634530250860796,
"grad_norm": 3.6644136905670166,
"learning_rate": 9.893014666174898e-06,
"loss": 1.083,
"step": 2291
},
{
"epoch": 0.5636989670437776,
"grad_norm": 3.720546245574951,
"learning_rate": 9.89284725110617e-06,
"loss": 1.1715,
"step": 2292
},
{
"epoch": 0.5639449090014756,
"grad_norm": 3.821873188018799,
"learning_rate": 9.892679706569594e-06,
"loss": 1.2193,
"step": 2293
},
{
"epoch": 0.5641908509591737,
"grad_norm": 3.573883056640625,
"learning_rate": 9.892512032569608e-06,
"loss": 1.2233,
"step": 2294
},
{
"epoch": 0.5644367929168717,
"grad_norm": 3.5404534339904785,
"learning_rate": 9.892344229110646e-06,
"loss": 1.2109,
"step": 2295
},
{
"epoch": 0.5646827348745695,
"grad_norm": 3.5757462978363037,
"learning_rate": 9.892176296197146e-06,
"loss": 1.338,
"step": 2296
},
{
"epoch": 0.5649286768322676,
"grad_norm": 3.6020987033843994,
"learning_rate": 9.892008233833558e-06,
"loss": 1.287,
"step": 2297
},
{
"epoch": 0.5651746187899656,
"grad_norm": 3.8359904289245605,
"learning_rate": 9.891840042024323e-06,
"loss": 1.2574,
"step": 2298
},
{
"epoch": 0.5654205607476636,
"grad_norm": 3.8114006519317627,
"learning_rate": 9.891671720773894e-06,
"loss": 1.1654,
"step": 2299
},
{
"epoch": 0.5656665027053616,
"grad_norm": 4.399141311645508,
"learning_rate": 9.891503270086725e-06,
"loss": 1.1069,
"step": 2300
},
{
"epoch": 0.5659124446630596,
"grad_norm": 3.9028890132904053,
"learning_rate": 9.891334689967273e-06,
"loss": 1.3408,
"step": 2301
},
{
"epoch": 0.5661583866207575,
"grad_norm": 3.8426384925842285,
"learning_rate": 9.89116598042e-06,
"loss": 1.2261,
"step": 2302
},
{
"epoch": 0.5664043285784555,
"grad_norm": 3.808612823486328,
"learning_rate": 9.890997141449367e-06,
"loss": 1.267,
"step": 2303
},
{
"epoch": 0.5666502705361535,
"grad_norm": 4.235655784606934,
"learning_rate": 9.890828173059843e-06,
"loss": 1.4841,
"step": 2304
},
{
"epoch": 0.5668962124938515,
"grad_norm": 3.989626169204712,
"learning_rate": 9.890659075255902e-06,
"loss": 1.3472,
"step": 2305
},
{
"epoch": 0.5671421544515495,
"grad_norm": 3.654944658279419,
"learning_rate": 9.890489848042014e-06,
"loss": 1.1493,
"step": 2306
},
{
"epoch": 0.5673880964092474,
"grad_norm": 3.7547993659973145,
"learning_rate": 9.890320491422658e-06,
"loss": 1.0452,
"step": 2307
},
{
"epoch": 0.5676340383669454,
"grad_norm": 3.6780879497528076,
"learning_rate": 9.890151005402318e-06,
"loss": 1.1736,
"step": 2308
},
{
"epoch": 0.5678799803246434,
"grad_norm": 3.890547513961792,
"learning_rate": 9.889981389985477e-06,
"loss": 1.1092,
"step": 2309
},
{
"epoch": 0.5681259222823414,
"grad_norm": 3.736940860748291,
"learning_rate": 9.889811645176622e-06,
"loss": 1.1904,
"step": 2310
},
{
"epoch": 0.5683718642400394,
"grad_norm": 4.106467247009277,
"learning_rate": 9.889641770980246e-06,
"loss": 1.2723,
"step": 2311
},
{
"epoch": 0.5686178061977374,
"grad_norm": 3.798337936401367,
"learning_rate": 9.889471767400842e-06,
"loss": 1.1706,
"step": 2312
},
{
"epoch": 0.5688637481554353,
"grad_norm": 4.028409957885742,
"learning_rate": 9.88930163444291e-06,
"loss": 1.2384,
"step": 2313
},
{
"epoch": 0.5691096901131333,
"grad_norm": 3.8486855030059814,
"learning_rate": 9.889131372110953e-06,
"loss": 1.139,
"step": 2314
},
{
"epoch": 0.5693556320708313,
"grad_norm": 4.017292499542236,
"learning_rate": 9.888960980409474e-06,
"loss": 1.1871,
"step": 2315
},
{
"epoch": 0.5696015740285293,
"grad_norm": 4.16668701171875,
"learning_rate": 9.888790459342983e-06,
"loss": 1.4213,
"step": 2316
},
{
"epoch": 0.5698475159862273,
"grad_norm": 3.7770137786865234,
"learning_rate": 9.888619808915991e-06,
"loss": 1.2766,
"step": 2317
},
{
"epoch": 0.5700934579439252,
"grad_norm": 4.068838596343994,
"learning_rate": 9.888449029133015e-06,
"loss": 1.3295,
"step": 2318
},
{
"epoch": 0.5703393999016232,
"grad_norm": 3.612476348876953,
"learning_rate": 9.888278119998573e-06,
"loss": 1.1948,
"step": 2319
},
{
"epoch": 0.5705853418593212,
"grad_norm": 3.7787415981292725,
"learning_rate": 9.888107081517187e-06,
"loss": 1.0733,
"step": 2320
},
{
"epoch": 0.5708312838170192,
"grad_norm": 4.032155990600586,
"learning_rate": 9.887935913693385e-06,
"loss": 1.3552,
"step": 2321
},
{
"epoch": 0.5710772257747172,
"grad_norm": 3.8607442378997803,
"learning_rate": 9.887764616531693e-06,
"loss": 1.3704,
"step": 2322
},
{
"epoch": 0.5713231677324152,
"grad_norm": 3.7871932983398438,
"learning_rate": 9.887593190036644e-06,
"loss": 1.1255,
"step": 2323
},
{
"epoch": 0.5715691096901131,
"grad_norm": 4.141458988189697,
"learning_rate": 9.887421634212777e-06,
"loss": 1.2967,
"step": 2324
},
{
"epoch": 0.5718150516478111,
"grad_norm": 3.5625367164611816,
"learning_rate": 9.887249949064628e-06,
"loss": 1.2829,
"step": 2325
},
{
"epoch": 0.5720609936055091,
"grad_norm": 3.6128551959991455,
"learning_rate": 9.887078134596744e-06,
"loss": 1.1517,
"step": 2326
},
{
"epoch": 0.5723069355632071,
"grad_norm": 3.7762503623962402,
"learning_rate": 9.886906190813667e-06,
"loss": 1.194,
"step": 2327
},
{
"epoch": 0.5725528775209051,
"grad_norm": 3.806962013244629,
"learning_rate": 9.886734117719949e-06,
"loss": 1.2868,
"step": 2328
},
{
"epoch": 0.572798819478603,
"grad_norm": 3.955584764480591,
"learning_rate": 9.886561915320143e-06,
"loss": 1.2917,
"step": 2329
},
{
"epoch": 0.573044761436301,
"grad_norm": 4.076197147369385,
"learning_rate": 9.886389583618805e-06,
"loss": 1.2078,
"step": 2330
},
{
"epoch": 0.573290703393999,
"grad_norm": 4.1631598472595215,
"learning_rate": 9.886217122620496e-06,
"loss": 1.3451,
"step": 2331
},
{
"epoch": 0.573536645351697,
"grad_norm": 3.8790793418884277,
"learning_rate": 9.886044532329777e-06,
"loss": 1.2652,
"step": 2332
},
{
"epoch": 0.573782587309395,
"grad_norm": 3.9605658054351807,
"learning_rate": 9.885871812751219e-06,
"loss": 1.3097,
"step": 2333
},
{
"epoch": 0.5740285292670929,
"grad_norm": 3.839874267578125,
"learning_rate": 9.88569896388939e-06,
"loss": 1.2141,
"step": 2334
},
{
"epoch": 0.5742744712247909,
"grad_norm": 4.2126994132995605,
"learning_rate": 9.885525985748862e-06,
"loss": 1.3227,
"step": 2335
},
{
"epoch": 0.5745204131824889,
"grad_norm": 3.8165581226348877,
"learning_rate": 9.885352878334214e-06,
"loss": 1.2092,
"step": 2336
},
{
"epoch": 0.5747663551401869,
"grad_norm": 3.7047832012176514,
"learning_rate": 9.885179641650027e-06,
"loss": 1.2361,
"step": 2337
},
{
"epoch": 0.5750122970978849,
"grad_norm": 4.578978061676025,
"learning_rate": 9.885006275700884e-06,
"loss": 1.4957,
"step": 2338
},
{
"epoch": 0.5752582390555829,
"grad_norm": 3.614168167114258,
"learning_rate": 9.884832780491372e-06,
"loss": 1.1742,
"step": 2339
},
{
"epoch": 0.5755041810132808,
"grad_norm": 4.0630202293396,
"learning_rate": 9.884659156026081e-06,
"loss": 1.3913,
"step": 2340
},
{
"epoch": 0.5757501229709788,
"grad_norm": 3.509317636489868,
"learning_rate": 9.884485402309609e-06,
"loss": 1.1121,
"step": 2341
},
{
"epoch": 0.5759960649286768,
"grad_norm": 3.8720543384552,
"learning_rate": 9.88431151934655e-06,
"loss": 1.3229,
"step": 2342
},
{
"epoch": 0.5762420068863748,
"grad_norm": 3.7598774433135986,
"learning_rate": 9.884137507141508e-06,
"loss": 1.1489,
"step": 2343
},
{
"epoch": 0.5764879488440728,
"grad_norm": 3.8191070556640625,
"learning_rate": 9.883963365699085e-06,
"loss": 1.2199,
"step": 2344
},
{
"epoch": 0.5767338908017707,
"grad_norm": 3.7745580673217773,
"learning_rate": 9.883789095023888e-06,
"loss": 1.4238,
"step": 2345
},
{
"epoch": 0.5769798327594687,
"grad_norm": 3.720015048980713,
"learning_rate": 9.883614695120532e-06,
"loss": 1.1966,
"step": 2346
},
{
"epoch": 0.5772257747171667,
"grad_norm": 4.162359237670898,
"learning_rate": 9.883440165993628e-06,
"loss": 1.2438,
"step": 2347
},
{
"epoch": 0.5774717166748647,
"grad_norm": 3.546009063720703,
"learning_rate": 9.883265507647797e-06,
"loss": 1.2546,
"step": 2348
},
{
"epoch": 0.5777176586325627,
"grad_norm": 4.043363571166992,
"learning_rate": 9.88309072008766e-06,
"loss": 1.2577,
"step": 2349
},
{
"epoch": 0.5779636005902608,
"grad_norm": 3.504254102706909,
"learning_rate": 9.88291580331784e-06,
"loss": 1.1082,
"step": 2350
},
{
"epoch": 0.5782095425479586,
"grad_norm": 3.7835853099823,
"learning_rate": 9.882740757342967e-06,
"loss": 1.2304,
"step": 2351
},
{
"epoch": 0.5784554845056566,
"grad_norm": 3.4296839237213135,
"learning_rate": 9.882565582167673e-06,
"loss": 1.0491,
"step": 2352
},
{
"epoch": 0.5787014264633547,
"grad_norm": 3.8384742736816406,
"learning_rate": 9.882390277796591e-06,
"loss": 1.2312,
"step": 2353
},
{
"epoch": 0.5789473684210527,
"grad_norm": 3.8486239910125732,
"learning_rate": 9.882214844234364e-06,
"loss": 1.2709,
"step": 2354
},
{
"epoch": 0.5791933103787507,
"grad_norm": 3.872349739074707,
"learning_rate": 9.882039281485631e-06,
"loss": 1.0234,
"step": 2355
},
{
"epoch": 0.5794392523364486,
"grad_norm": 3.7806832790374756,
"learning_rate": 9.881863589555038e-06,
"loss": 1.2117,
"step": 2356
},
{
"epoch": 0.5796851942941466,
"grad_norm": 4.388751029968262,
"learning_rate": 9.881687768447235e-06,
"loss": 1.3074,
"step": 2357
},
{
"epoch": 0.5799311362518446,
"grad_norm": 3.6489927768707275,
"learning_rate": 9.881511818166873e-06,
"loss": 1.2045,
"step": 2358
},
{
"epoch": 0.5801770782095426,
"grad_norm": 3.500624418258667,
"learning_rate": 9.881335738718608e-06,
"loss": 1.0317,
"step": 2359
},
{
"epoch": 0.5804230201672406,
"grad_norm": 3.766852378845215,
"learning_rate": 9.881159530107099e-06,
"loss": 1.2661,
"step": 2360
},
{
"epoch": 0.5806689621249385,
"grad_norm": 3.878927230834961,
"learning_rate": 9.880983192337007e-06,
"loss": 1.309,
"step": 2361
},
{
"epoch": 0.5809149040826365,
"grad_norm": 3.4503984451293945,
"learning_rate": 9.880806725413002e-06,
"loss": 1.3155,
"step": 2362
},
{
"epoch": 0.5811608460403345,
"grad_norm": 4.2128777503967285,
"learning_rate": 9.880630129339753e-06,
"loss": 1.2121,
"step": 2363
},
{
"epoch": 0.5814067879980325,
"grad_norm": 3.7699573040008545,
"learning_rate": 9.880453404121927e-06,
"loss": 1.1901,
"step": 2364
},
{
"epoch": 0.5816527299557305,
"grad_norm": 3.6038341522216797,
"learning_rate": 9.880276549764207e-06,
"loss": 1.1483,
"step": 2365
},
{
"epoch": 0.5818986719134285,
"grad_norm": 4.05662727355957,
"learning_rate": 9.88009956627127e-06,
"loss": 1.3103,
"step": 2366
},
{
"epoch": 0.5821446138711264,
"grad_norm": 3.862778425216675,
"learning_rate": 9.879922453647799e-06,
"loss": 1.2966,
"step": 2367
},
{
"epoch": 0.5823905558288244,
"grad_norm": 4.011362552642822,
"learning_rate": 9.879745211898481e-06,
"loss": 1.1685,
"step": 2368
},
{
"epoch": 0.5826364977865224,
"grad_norm": 3.781534433364868,
"learning_rate": 9.879567841028006e-06,
"loss": 1.241,
"step": 2369
},
{
"epoch": 0.5828824397442204,
"grad_norm": 3.6630637645721436,
"learning_rate": 9.879390341041067e-06,
"loss": 1.2228,
"step": 2370
},
{
"epoch": 0.5831283817019184,
"grad_norm": 3.7453813552856445,
"learning_rate": 9.879212711942359e-06,
"loss": 1.1045,
"step": 2371
},
{
"epoch": 0.5833743236596163,
"grad_norm": 3.5799715518951416,
"learning_rate": 9.879034953736586e-06,
"loss": 1.2577,
"step": 2372
},
{
"epoch": 0.5836202656173143,
"grad_norm": 3.7565345764160156,
"learning_rate": 9.878857066428449e-06,
"loss": 1.2622,
"step": 2373
},
{
"epoch": 0.5838662075750123,
"grad_norm": 4.034372806549072,
"learning_rate": 9.878679050022656e-06,
"loss": 1.1868,
"step": 2374
},
{
"epoch": 0.5841121495327103,
"grad_norm": 3.708251714706421,
"learning_rate": 9.878500904523915e-06,
"loss": 1.2358,
"step": 2375
},
{
"epoch": 0.5843580914904083,
"grad_norm": 4.048354625701904,
"learning_rate": 9.878322629936944e-06,
"loss": 1.4041,
"step": 2376
},
{
"epoch": 0.5846040334481063,
"grad_norm": 3.7614636421203613,
"learning_rate": 9.878144226266458e-06,
"loss": 1.1234,
"step": 2377
},
{
"epoch": 0.5848499754058042,
"grad_norm": 4.073982238769531,
"learning_rate": 9.877965693517176e-06,
"loss": 1.2273,
"step": 2378
},
{
"epoch": 0.5850959173635022,
"grad_norm": 4.334047794342041,
"learning_rate": 9.877787031693825e-06,
"loss": 1.4443,
"step": 2379
},
{
"epoch": 0.5853418593212002,
"grad_norm": 3.381683349609375,
"learning_rate": 9.877608240801132e-06,
"loss": 1.1728,
"step": 2380
},
{
"epoch": 0.5855878012788982,
"grad_norm": 3.6854827404022217,
"learning_rate": 9.877429320843826e-06,
"loss": 1.2339,
"step": 2381
},
{
"epoch": 0.5858337432365962,
"grad_norm": 3.830345392227173,
"learning_rate": 9.877250271826643e-06,
"loss": 1.3604,
"step": 2382
},
{
"epoch": 0.5860796851942941,
"grad_norm": 3.8693020343780518,
"learning_rate": 9.877071093754321e-06,
"loss": 1.249,
"step": 2383
},
{
"epoch": 0.5863256271519921,
"grad_norm": 3.6272079944610596,
"learning_rate": 9.8768917866316e-06,
"loss": 1.2233,
"step": 2384
},
{
"epoch": 0.5865715691096901,
"grad_norm": 3.75947904586792,
"learning_rate": 9.876712350463225e-06,
"loss": 1.1623,
"step": 2385
},
{
"epoch": 0.5868175110673881,
"grad_norm": 3.6817471981048584,
"learning_rate": 9.876532785253944e-06,
"loss": 1.231,
"step": 2386
},
{
"epoch": 0.5870634530250861,
"grad_norm": 3.4087533950805664,
"learning_rate": 9.876353091008509e-06,
"loss": 1.1687,
"step": 2387
},
{
"epoch": 0.5873093949827841,
"grad_norm": 3.9816956520080566,
"learning_rate": 9.876173267731673e-06,
"loss": 1.2876,
"step": 2388
},
{
"epoch": 0.587555336940482,
"grad_norm": 4.122218132019043,
"learning_rate": 9.875993315428198e-06,
"loss": 1.1921,
"step": 2389
},
{
"epoch": 0.58780127889818,
"grad_norm": 3.794365882873535,
"learning_rate": 9.87581323410284e-06,
"loss": 1.1859,
"step": 2390
},
{
"epoch": 0.588047220855878,
"grad_norm": 3.9502193927764893,
"learning_rate": 9.87563302376037e-06,
"loss": 1.2787,
"step": 2391
},
{
"epoch": 0.588293162813576,
"grad_norm": 3.8312175273895264,
"learning_rate": 9.875452684405553e-06,
"loss": 1.273,
"step": 2392
},
{
"epoch": 0.588539104771274,
"grad_norm": 4.01531457901001,
"learning_rate": 9.875272216043162e-06,
"loss": 1.2144,
"step": 2393
},
{
"epoch": 0.5887850467289719,
"grad_norm": 3.807788610458374,
"learning_rate": 9.875091618677972e-06,
"loss": 1.3291,
"step": 2394
},
{
"epoch": 0.5890309886866699,
"grad_norm": 3.8778319358825684,
"learning_rate": 9.874910892314761e-06,
"loss": 1.2186,
"step": 2395
},
{
"epoch": 0.5892769306443679,
"grad_norm": 3.70177960395813,
"learning_rate": 9.874730036958312e-06,
"loss": 1.0868,
"step": 2396
},
{
"epoch": 0.5895228726020659,
"grad_norm": 3.8427205085754395,
"learning_rate": 9.874549052613412e-06,
"loss": 1.1755,
"step": 2397
},
{
"epoch": 0.5897688145597639,
"grad_norm": 3.9734294414520264,
"learning_rate": 9.874367939284845e-06,
"loss": 1.3758,
"step": 2398
},
{
"epoch": 0.5900147565174618,
"grad_norm": 3.8278591632843018,
"learning_rate": 9.87418669697741e-06,
"loss": 1.0874,
"step": 2399
},
{
"epoch": 0.5902606984751598,
"grad_norm": 3.5474963188171387,
"learning_rate": 9.874005325695897e-06,
"loss": 1.2756,
"step": 2400
},
{
"epoch": 0.5905066404328578,
"grad_norm": 3.8174307346343994,
"learning_rate": 9.87382382544511e-06,
"loss": 1.2902,
"step": 2401
},
{
"epoch": 0.5907525823905558,
"grad_norm": 3.857332229614258,
"learning_rate": 9.873642196229847e-06,
"loss": 1.2134,
"step": 2402
},
{
"epoch": 0.5909985243482538,
"grad_norm": 3.752425193786621,
"learning_rate": 9.873460438054918e-06,
"loss": 1.2531,
"step": 2403
},
{
"epoch": 0.5912444663059518,
"grad_norm": 3.5243968963623047,
"learning_rate": 9.873278550925129e-06,
"loss": 1.1812,
"step": 2404
},
{
"epoch": 0.5914904082636497,
"grad_norm": 3.836498260498047,
"learning_rate": 9.873096534845296e-06,
"loss": 1.2663,
"step": 2405
},
{
"epoch": 0.5917363502213477,
"grad_norm": 3.8250317573547363,
"learning_rate": 9.872914389820234e-06,
"loss": 1.2005,
"step": 2406
},
{
"epoch": 0.5919822921790457,
"grad_norm": 4.321308135986328,
"learning_rate": 9.872732115854763e-06,
"loss": 1.3642,
"step": 2407
},
{
"epoch": 0.5922282341367437,
"grad_norm": 4.097923278808594,
"learning_rate": 9.872549712953705e-06,
"loss": 1.2996,
"step": 2408
},
{
"epoch": 0.5924741760944418,
"grad_norm": 3.7483978271484375,
"learning_rate": 9.872367181121887e-06,
"loss": 1.3196,
"step": 2409
},
{
"epoch": 0.5927201180521396,
"grad_norm": 3.582465887069702,
"learning_rate": 9.872184520364138e-06,
"loss": 1.1935,
"step": 2410
},
{
"epoch": 0.5929660600098376,
"grad_norm": 3.7812466621398926,
"learning_rate": 9.872001730685295e-06,
"loss": 1.0946,
"step": 2411
},
{
"epoch": 0.5932120019675357,
"grad_norm": 4.134995937347412,
"learning_rate": 9.87181881209019e-06,
"loss": 1.4542,
"step": 2412
},
{
"epoch": 0.5934579439252337,
"grad_norm": 3.6881282329559326,
"learning_rate": 9.871635764583666e-06,
"loss": 1.1849,
"step": 2413
},
{
"epoch": 0.5937038858829317,
"grad_norm": 4.151389122009277,
"learning_rate": 9.871452588170565e-06,
"loss": 1.4416,
"step": 2414
},
{
"epoch": 0.5939498278406297,
"grad_norm": 3.9794461727142334,
"learning_rate": 9.871269282855736e-06,
"loss": 1.2362,
"step": 2415
},
{
"epoch": 0.5941957697983276,
"grad_norm": 3.2122554779052734,
"learning_rate": 9.871085848644028e-06,
"loss": 1.1286,
"step": 2416
},
{
"epoch": 0.5944417117560256,
"grad_norm": 4.001514434814453,
"learning_rate": 9.870902285540293e-06,
"loss": 1.2369,
"step": 2417
},
{
"epoch": 0.5946876537137236,
"grad_norm": 3.537787914276123,
"learning_rate": 9.870718593549393e-06,
"loss": 1.2193,
"step": 2418
},
{
"epoch": 0.5949335956714216,
"grad_norm": 3.61586594581604,
"learning_rate": 9.870534772676183e-06,
"loss": 1.2183,
"step": 2419
},
{
"epoch": 0.5951795376291196,
"grad_norm": 3.8178277015686035,
"learning_rate": 9.870350822925532e-06,
"loss": 1.1223,
"step": 2420
},
{
"epoch": 0.5954254795868175,
"grad_norm": 3.6108362674713135,
"learning_rate": 9.870166744302302e-06,
"loss": 1.2048,
"step": 2421
},
{
"epoch": 0.5956714215445155,
"grad_norm": 3.7912793159484863,
"learning_rate": 9.869982536811372e-06,
"loss": 1.1274,
"step": 2422
},
{
"epoch": 0.5959173635022135,
"grad_norm": 3.9409377574920654,
"learning_rate": 9.869798200457607e-06,
"loss": 1.3024,
"step": 2423
},
{
"epoch": 0.5961633054599115,
"grad_norm": 3.681532859802246,
"learning_rate": 9.869613735245891e-06,
"loss": 1.2807,
"step": 2424
},
{
"epoch": 0.5964092474176095,
"grad_norm": 3.4113893508911133,
"learning_rate": 9.869429141181102e-06,
"loss": 1.2066,
"step": 2425
},
{
"epoch": 0.5966551893753075,
"grad_norm": 3.6721911430358887,
"learning_rate": 9.869244418268126e-06,
"loss": 1.1805,
"step": 2426
},
{
"epoch": 0.5969011313330054,
"grad_norm": 4.063199520111084,
"learning_rate": 9.869059566511849e-06,
"loss": 1.2566,
"step": 2427
},
{
"epoch": 0.5971470732907034,
"grad_norm": 3.4714183807373047,
"learning_rate": 9.868874585917166e-06,
"loss": 1.1147,
"step": 2428
},
{
"epoch": 0.5973930152484014,
"grad_norm": 3.3764400482177734,
"learning_rate": 9.868689476488968e-06,
"loss": 1.2621,
"step": 2429
},
{
"epoch": 0.5976389572060994,
"grad_norm": 3.3269307613372803,
"learning_rate": 9.868504238232153e-06,
"loss": 1.1228,
"step": 2430
},
{
"epoch": 0.5978848991637974,
"grad_norm": 3.7210023403167725,
"learning_rate": 9.868318871151627e-06,
"loss": 1.36,
"step": 2431
},
{
"epoch": 0.5981308411214953,
"grad_norm": 3.4914543628692627,
"learning_rate": 9.868133375252289e-06,
"loss": 1.2319,
"step": 2432
},
{
"epoch": 0.5983767830791933,
"grad_norm": 3.470322608947754,
"learning_rate": 9.867947750539053e-06,
"loss": 1.2536,
"step": 2433
},
{
"epoch": 0.5986227250368913,
"grad_norm": 3.9219858646392822,
"learning_rate": 9.867761997016826e-06,
"loss": 1.2397,
"step": 2434
},
{
"epoch": 0.5988686669945893,
"grad_norm": 3.779573917388916,
"learning_rate": 9.867576114690526e-06,
"loss": 1.2507,
"step": 2435
},
{
"epoch": 0.5991146089522873,
"grad_norm": 3.7905187606811523,
"learning_rate": 9.86739010356507e-06,
"loss": 1.3165,
"step": 2436
},
{
"epoch": 0.5993605509099852,
"grad_norm": 3.5041935443878174,
"learning_rate": 9.867203963645382e-06,
"loss": 1.1754,
"step": 2437
},
{
"epoch": 0.5996064928676832,
"grad_norm": 3.987962007522583,
"learning_rate": 9.867017694936385e-06,
"loss": 1.2075,
"step": 2438
},
{
"epoch": 0.5998524348253812,
"grad_norm": 3.560451030731201,
"learning_rate": 9.866831297443009e-06,
"loss": 1.2202,
"step": 2439
},
{
"epoch": 0.6000983767830792,
"grad_norm": 3.9680826663970947,
"learning_rate": 9.866644771170185e-06,
"loss": 1.2266,
"step": 2440
},
{
"epoch": 0.6003443187407772,
"grad_norm": 3.919506549835205,
"learning_rate": 9.866458116122852e-06,
"loss": 1.2663,
"step": 2441
},
{
"epoch": 0.6005902606984752,
"grad_norm": 3.8842403888702393,
"learning_rate": 9.866271332305945e-06,
"loss": 1.3826,
"step": 2442
},
{
"epoch": 0.6008362026561731,
"grad_norm": 3.462766647338867,
"learning_rate": 9.866084419724407e-06,
"loss": 1.0392,
"step": 2443
},
{
"epoch": 0.6010821446138711,
"grad_norm": 3.695439100265503,
"learning_rate": 9.865897378383187e-06,
"loss": 1.1383,
"step": 2444
},
{
"epoch": 0.6013280865715691,
"grad_norm": 4.157264709472656,
"learning_rate": 9.865710208287231e-06,
"loss": 1.4527,
"step": 2445
},
{
"epoch": 0.6015740285292671,
"grad_norm": 3.701310157775879,
"learning_rate": 9.865522909441494e-06,
"loss": 1.2451,
"step": 2446
},
{
"epoch": 0.6018199704869651,
"grad_norm": 3.710237503051758,
"learning_rate": 9.86533548185093e-06,
"loss": 1.238,
"step": 2447
},
{
"epoch": 0.602065912444663,
"grad_norm": 4.417288303375244,
"learning_rate": 9.865147925520499e-06,
"loss": 1.505,
"step": 2448
},
{
"epoch": 0.602311854402361,
"grad_norm": 3.868680953979492,
"learning_rate": 9.864960240455164e-06,
"loss": 1.3148,
"step": 2449
},
{
"epoch": 0.602557796360059,
"grad_norm": 3.7452638149261475,
"learning_rate": 9.864772426659892e-06,
"loss": 1.2549,
"step": 2450
},
{
"epoch": 0.602803738317757,
"grad_norm": 3.6257851123809814,
"learning_rate": 9.864584484139652e-06,
"loss": 1.2865,
"step": 2451
},
{
"epoch": 0.603049680275455,
"grad_norm": 3.4142003059387207,
"learning_rate": 9.864396412899418e-06,
"loss": 1.1006,
"step": 2452
},
{
"epoch": 0.603295622233153,
"grad_norm": 3.8305604457855225,
"learning_rate": 9.864208212944164e-06,
"loss": 1.1166,
"step": 2453
},
{
"epoch": 0.6035415641908509,
"grad_norm": 3.4892609119415283,
"learning_rate": 9.864019884278873e-06,
"loss": 1.2619,
"step": 2454
},
{
"epoch": 0.6037875061485489,
"grad_norm": 3.745746612548828,
"learning_rate": 9.863831426908526e-06,
"loss": 1.1158,
"step": 2455
},
{
"epoch": 0.6040334481062469,
"grad_norm": 3.7232320308685303,
"learning_rate": 9.86364284083811e-06,
"loss": 1.209,
"step": 2456
},
{
"epoch": 0.6042793900639449,
"grad_norm": 3.913010597229004,
"learning_rate": 9.863454126072616e-06,
"loss": 1.5398,
"step": 2457
},
{
"epoch": 0.6045253320216429,
"grad_norm": 3.8412439823150635,
"learning_rate": 9.863265282617039e-06,
"loss": 1.1465,
"step": 2458
},
{
"epoch": 0.6047712739793408,
"grad_norm": 3.8652286529541016,
"learning_rate": 9.86307631047637e-06,
"loss": 1.246,
"step": 2459
},
{
"epoch": 0.6050172159370388,
"grad_norm": 4.003816604614258,
"learning_rate": 9.862887209655618e-06,
"loss": 1.3487,
"step": 2460
},
{
"epoch": 0.6052631578947368,
"grad_norm": 3.4361114501953125,
"learning_rate": 9.862697980159779e-06,
"loss": 1.0889,
"step": 2461
},
{
"epoch": 0.6055090998524348,
"grad_norm": 3.620920419692993,
"learning_rate": 9.862508621993865e-06,
"loss": 1.2719,
"step": 2462
},
{
"epoch": 0.6057550418101328,
"grad_norm": 3.5006215572357178,
"learning_rate": 9.862319135162883e-06,
"loss": 1.2526,
"step": 2463
},
{
"epoch": 0.6060009837678307,
"grad_norm": 3.714046001434326,
"learning_rate": 9.86212951967185e-06,
"loss": 1.3097,
"step": 2464
},
{
"epoch": 0.6062469257255287,
"grad_norm": 3.460939645767212,
"learning_rate": 9.861939775525782e-06,
"loss": 1.1575,
"step": 2465
},
{
"epoch": 0.6064928676832267,
"grad_norm": 4.454813480377197,
"learning_rate": 9.8617499027297e-06,
"loss": 1.3523,
"step": 2466
},
{
"epoch": 0.6067388096409247,
"grad_norm": 3.896174430847168,
"learning_rate": 9.861559901288628e-06,
"loss": 1.3867,
"step": 2467
},
{
"epoch": 0.6069847515986228,
"grad_norm": 3.920630931854248,
"learning_rate": 9.861369771207592e-06,
"loss": 1.3305,
"step": 2468
},
{
"epoch": 0.6072306935563208,
"grad_norm": 4.136085033416748,
"learning_rate": 9.861179512491625e-06,
"loss": 1.3063,
"step": 2469
},
{
"epoch": 0.6074766355140186,
"grad_norm": 3.9845619201660156,
"learning_rate": 9.860989125145763e-06,
"loss": 1.2899,
"step": 2470
},
{
"epoch": 0.6077225774717167,
"grad_norm": 4.099381923675537,
"learning_rate": 9.86079860917504e-06,
"loss": 1.3646,
"step": 2471
},
{
"epoch": 0.6079685194294147,
"grad_norm": 3.8639416694641113,
"learning_rate": 9.860607964584499e-06,
"loss": 1.3155,
"step": 2472
},
{
"epoch": 0.6082144613871127,
"grad_norm": 3.9099810123443604,
"learning_rate": 9.860417191379184e-06,
"loss": 1.1871,
"step": 2473
},
{
"epoch": 0.6084604033448107,
"grad_norm": 3.7112653255462646,
"learning_rate": 9.860226289564143e-06,
"loss": 1.1501,
"step": 2474
},
{
"epoch": 0.6087063453025086,
"grad_norm": 4.16945743560791,
"learning_rate": 9.860035259144429e-06,
"loss": 1.1276,
"step": 2475
},
{
"epoch": 0.6089522872602066,
"grad_norm": 3.7522618770599365,
"learning_rate": 9.859844100125095e-06,
"loss": 1.1993,
"step": 2476
},
{
"epoch": 0.6091982292179046,
"grad_norm": 3.4567127227783203,
"learning_rate": 9.8596528125112e-06,
"loss": 1.1787,
"step": 2477
},
{
"epoch": 0.6094441711756026,
"grad_norm": 4.3706207275390625,
"learning_rate": 9.859461396307805e-06,
"loss": 1.1778,
"step": 2478
},
{
"epoch": 0.6096901131333006,
"grad_norm": 4.099209785461426,
"learning_rate": 9.859269851519975e-06,
"loss": 1.5133,
"step": 2479
},
{
"epoch": 0.6099360550909986,
"grad_norm": 3.9474663734436035,
"learning_rate": 9.859078178152779e-06,
"loss": 1.3016,
"step": 2480
},
{
"epoch": 0.6101819970486965,
"grad_norm": 3.395908832550049,
"learning_rate": 9.85888637621129e-06,
"loss": 1.118,
"step": 2481
},
{
"epoch": 0.6104279390063945,
"grad_norm": 3.714630126953125,
"learning_rate": 9.858694445700578e-06,
"loss": 1.2238,
"step": 2482
},
{
"epoch": 0.6106738809640925,
"grad_norm": 4.2723774909973145,
"learning_rate": 9.858502386625728e-06,
"loss": 1.2748,
"step": 2483
},
{
"epoch": 0.6109198229217905,
"grad_norm": 3.5212669372558594,
"learning_rate": 9.858310198991818e-06,
"loss": 1.0919,
"step": 2484
},
{
"epoch": 0.6111657648794885,
"grad_norm": 3.811131238937378,
"learning_rate": 9.858117882803938e-06,
"loss": 1.1867,
"step": 2485
},
{
"epoch": 0.6114117068371864,
"grad_norm": 3.574753999710083,
"learning_rate": 9.857925438067169e-06,
"loss": 1.1429,
"step": 2486
},
{
"epoch": 0.6116576487948844,
"grad_norm": 3.9225966930389404,
"learning_rate": 9.857732864786612e-06,
"loss": 1.2749,
"step": 2487
},
{
"epoch": 0.6119035907525824,
"grad_norm": 3.6712236404418945,
"learning_rate": 9.857540162967355e-06,
"loss": 1.2502,
"step": 2488
},
{
"epoch": 0.6121495327102804,
"grad_norm": 3.4748244285583496,
"learning_rate": 9.8573473326145e-06,
"loss": 1.3196,
"step": 2489
},
{
"epoch": 0.6123954746679784,
"grad_norm": 3.5270464420318604,
"learning_rate": 9.857154373733153e-06,
"loss": 1.1575,
"step": 2490
},
{
"epoch": 0.6126414166256764,
"grad_norm": 3.646967887878418,
"learning_rate": 9.856961286328414e-06,
"loss": 1.1513,
"step": 2491
},
{
"epoch": 0.6128873585833743,
"grad_norm": 3.7445261478424072,
"learning_rate": 9.856768070405397e-06,
"loss": 1.1994,
"step": 2492
},
{
"epoch": 0.6131333005410723,
"grad_norm": 3.633859634399414,
"learning_rate": 9.85657472596921e-06,
"loss": 1.1558,
"step": 2493
},
{
"epoch": 0.6133792424987703,
"grad_norm": 3.9202828407287598,
"learning_rate": 9.856381253024973e-06,
"loss": 1.3312,
"step": 2494
},
{
"epoch": 0.6136251844564683,
"grad_norm": 3.8292086124420166,
"learning_rate": 9.856187651577803e-06,
"loss": 1.2737,
"step": 2495
},
{
"epoch": 0.6138711264141663,
"grad_norm": 3.7533907890319824,
"learning_rate": 9.855993921632826e-06,
"loss": 1.1997,
"step": 2496
},
{
"epoch": 0.6141170683718642,
"grad_norm": 3.9467861652374268,
"learning_rate": 9.855800063195163e-06,
"loss": 1.2189,
"step": 2497
},
{
"epoch": 0.6143630103295622,
"grad_norm": 3.610787868499756,
"learning_rate": 9.855606076269947e-06,
"loss": 1.2057,
"step": 2498
},
{
"epoch": 0.6146089522872602,
"grad_norm": 3.583930253982544,
"learning_rate": 9.85541196086231e-06,
"loss": 1.2598,
"step": 2499
},
{
"epoch": 0.6148548942449582,
"grad_norm": 3.546447515487671,
"learning_rate": 9.85521771697739e-06,
"loss": 1.1324,
"step": 2500
},
{
"epoch": 0.6148548942449582,
"eval_loss": 1.276457667350769,
"eval_runtime": 13.7336,
"eval_samples_per_second": 29.126,
"eval_steps_per_second": 3.641,
"step": 2500
},
{
"epoch": 0.6151008362026562,
"grad_norm": 3.8565893173217773,
"learning_rate": 9.855023344620328e-06,
"loss": 1.2019,
"step": 2501
},
{
"epoch": 0.6153467781603541,
"grad_norm": 3.701789617538452,
"learning_rate": 9.85482884379626e-06,
"loss": 1.2676,
"step": 2502
},
{
"epoch": 0.6155927201180521,
"grad_norm": 3.7437520027160645,
"learning_rate": 9.854634214510342e-06,
"loss": 1.1464,
"step": 2503
},
{
"epoch": 0.6158386620757501,
"grad_norm": 3.6945221424102783,
"learning_rate": 9.854439456767718e-06,
"loss": 1.2382,
"step": 2504
},
{
"epoch": 0.6160846040334481,
"grad_norm": 3.5351028442382812,
"learning_rate": 9.854244570573543e-06,
"loss": 1.2631,
"step": 2505
},
{
"epoch": 0.6163305459911461,
"grad_norm": 3.3914003372192383,
"learning_rate": 9.854049555932973e-06,
"loss": 1.1235,
"step": 2506
},
{
"epoch": 0.6165764879488441,
"grad_norm": 4.223696231842041,
"learning_rate": 9.85385441285117e-06,
"loss": 1.2416,
"step": 2507
},
{
"epoch": 0.616822429906542,
"grad_norm": 3.8234920501708984,
"learning_rate": 9.853659141333297e-06,
"loss": 1.145,
"step": 2508
},
{
"epoch": 0.61706837186424,
"grad_norm": 3.865025758743286,
"learning_rate": 9.853463741384518e-06,
"loss": 1.25,
"step": 2509
},
{
"epoch": 0.617314313821938,
"grad_norm": 3.3541924953460693,
"learning_rate": 9.85326821301001e-06,
"loss": 1.0538,
"step": 2510
},
{
"epoch": 0.617560255779636,
"grad_norm": 3.8933675289154053,
"learning_rate": 9.85307255621494e-06,
"loss": 1.3945,
"step": 2511
},
{
"epoch": 0.617806197737334,
"grad_norm": 3.457361936569214,
"learning_rate": 9.852876771004489e-06,
"loss": 1.1758,
"step": 2512
},
{
"epoch": 0.6180521396950319,
"grad_norm": 4.003602027893066,
"learning_rate": 9.852680857383837e-06,
"loss": 1.2872,
"step": 2513
},
{
"epoch": 0.6182980816527299,
"grad_norm": 3.6918816566467285,
"learning_rate": 9.852484815358168e-06,
"loss": 1.1258,
"step": 2514
},
{
"epoch": 0.6185440236104279,
"grad_norm": 4.0276570320129395,
"learning_rate": 9.852288644932668e-06,
"loss": 1.2956,
"step": 2515
},
{
"epoch": 0.6187899655681259,
"grad_norm": 3.8040030002593994,
"learning_rate": 9.85209234611253e-06,
"loss": 1.3797,
"step": 2516
},
{
"epoch": 0.6190359075258239,
"grad_norm": 4.1408610343933105,
"learning_rate": 9.851895918902947e-06,
"loss": 1.3168,
"step": 2517
},
{
"epoch": 0.6192818494835219,
"grad_norm": 3.446748733520508,
"learning_rate": 9.851699363309116e-06,
"loss": 1.2091,
"step": 2518
},
{
"epoch": 0.6195277914412198,
"grad_norm": 3.2624197006225586,
"learning_rate": 9.851502679336237e-06,
"loss": 1.2481,
"step": 2519
},
{
"epoch": 0.6197737333989178,
"grad_norm": 3.8272945880889893,
"learning_rate": 9.851305866989518e-06,
"loss": 1.2737,
"step": 2520
},
{
"epoch": 0.6200196753566158,
"grad_norm": 3.471316337585449,
"learning_rate": 9.851108926274165e-06,
"loss": 1.2821,
"step": 2521
},
{
"epoch": 0.6202656173143138,
"grad_norm": 4.1051201820373535,
"learning_rate": 9.850911857195389e-06,
"loss": 1.3046,
"step": 2522
},
{
"epoch": 0.6205115592720118,
"grad_norm": 3.710334539413452,
"learning_rate": 9.850714659758405e-06,
"loss": 1.3118,
"step": 2523
},
{
"epoch": 0.6207575012297097,
"grad_norm": 3.8819899559020996,
"learning_rate": 9.850517333968428e-06,
"loss": 1.2076,
"step": 2524
},
{
"epoch": 0.6210034431874077,
"grad_norm": 3.9833462238311768,
"learning_rate": 9.850319879830683e-06,
"loss": 1.284,
"step": 2525
},
{
"epoch": 0.6212493851451057,
"grad_norm": 3.718158483505249,
"learning_rate": 9.850122297350395e-06,
"loss": 1.2714,
"step": 2526
},
{
"epoch": 0.6214953271028038,
"grad_norm": 4.074244499206543,
"learning_rate": 9.84992458653279e-06,
"loss": 1.1201,
"step": 2527
},
{
"epoch": 0.6217412690605018,
"grad_norm": 3.8119091987609863,
"learning_rate": 9.8497267473831e-06,
"loss": 1.2741,
"step": 2528
},
{
"epoch": 0.6219872110181998,
"grad_norm": 3.8655405044555664,
"learning_rate": 9.849528779906561e-06,
"loss": 1.2085,
"step": 2529
},
{
"epoch": 0.6222331529758977,
"grad_norm": 3.767866611480713,
"learning_rate": 9.849330684108409e-06,
"loss": 1.0824,
"step": 2530
},
{
"epoch": 0.6224790949335957,
"grad_norm": 4.0652689933776855,
"learning_rate": 9.84913245999389e-06,
"loss": 1.2452,
"step": 2531
},
{
"epoch": 0.6227250368912937,
"grad_norm": 3.3427658081054688,
"learning_rate": 9.848934107568244e-06,
"loss": 1.0707,
"step": 2532
},
{
"epoch": 0.6229709788489917,
"grad_norm": 3.5470614433288574,
"learning_rate": 9.848735626836723e-06,
"loss": 1.2348,
"step": 2533
},
{
"epoch": 0.6232169208066897,
"grad_norm": 3.4297597408294678,
"learning_rate": 9.848537017804579e-06,
"loss": 1.0604,
"step": 2534
},
{
"epoch": 0.6234628627643876,
"grad_norm": 3.8946163654327393,
"learning_rate": 9.848338280477065e-06,
"loss": 1.3043,
"step": 2535
},
{
"epoch": 0.6237088047220856,
"grad_norm": 3.528532028198242,
"learning_rate": 9.848139414859441e-06,
"loss": 1.132,
"step": 2536
},
{
"epoch": 0.6239547466797836,
"grad_norm": 3.940199375152588,
"learning_rate": 9.84794042095697e-06,
"loss": 1.1661,
"step": 2537
},
{
"epoch": 0.6242006886374816,
"grad_norm": 3.5975537300109863,
"learning_rate": 9.847741298774917e-06,
"loss": 1.2071,
"step": 2538
},
{
"epoch": 0.6244466305951796,
"grad_norm": 3.4783473014831543,
"learning_rate": 9.847542048318549e-06,
"loss": 1.1355,
"step": 2539
},
{
"epoch": 0.6246925725528775,
"grad_norm": 3.6598424911499023,
"learning_rate": 9.847342669593141e-06,
"loss": 1.3414,
"step": 2540
},
{
"epoch": 0.6249385145105755,
"grad_norm": 4.064345359802246,
"learning_rate": 9.847143162603966e-06,
"loss": 1.3218,
"step": 2541
},
{
"epoch": 0.6251844564682735,
"grad_norm": 3.883169651031494,
"learning_rate": 9.846943527356306e-06,
"loss": 1.2206,
"step": 2542
},
{
"epoch": 0.6254303984259715,
"grad_norm": 3.6029722690582275,
"learning_rate": 9.846743763855441e-06,
"loss": 1.1106,
"step": 2543
},
{
"epoch": 0.6256763403836695,
"grad_norm": 3.870412826538086,
"learning_rate": 9.84654387210666e-06,
"loss": 1.3225,
"step": 2544
},
{
"epoch": 0.6259222823413675,
"grad_norm": 3.6551971435546875,
"learning_rate": 9.846343852115248e-06,
"loss": 1.1807,
"step": 2545
},
{
"epoch": 0.6261682242990654,
"grad_norm": 3.4712815284729004,
"learning_rate": 9.846143703886502e-06,
"loss": 1.061,
"step": 2546
},
{
"epoch": 0.6264141662567634,
"grad_norm": 3.5085268020629883,
"learning_rate": 9.845943427425715e-06,
"loss": 1.1332,
"step": 2547
},
{
"epoch": 0.6266601082144614,
"grad_norm": 3.6363601684570312,
"learning_rate": 9.845743022738185e-06,
"loss": 1.2317,
"step": 2548
},
{
"epoch": 0.6269060501721594,
"grad_norm": 3.8234384059906006,
"learning_rate": 9.84554248982922e-06,
"loss": 1.2359,
"step": 2549
},
{
"epoch": 0.6271519921298574,
"grad_norm": 3.7432804107666016,
"learning_rate": 9.845341828704123e-06,
"loss": 1.0589,
"step": 2550
},
{
"epoch": 0.6273979340875553,
"grad_norm": 3.8963959217071533,
"learning_rate": 9.845141039368205e-06,
"loss": 1.2956,
"step": 2551
},
{
"epoch": 0.6276438760452533,
"grad_norm": 3.6893107891082764,
"learning_rate": 9.844940121826776e-06,
"loss": 1.2819,
"step": 2552
},
{
"epoch": 0.6278898180029513,
"grad_norm": 4.075024127960205,
"learning_rate": 9.844739076085155e-06,
"loss": 1.3568,
"step": 2553
},
{
"epoch": 0.6281357599606493,
"grad_norm": 3.4101531505584717,
"learning_rate": 9.844537902148661e-06,
"loss": 1.0152,
"step": 2554
},
{
"epoch": 0.6283817019183473,
"grad_norm": 3.285008668899536,
"learning_rate": 9.844336600022619e-06,
"loss": 1.0144,
"step": 2555
},
{
"epoch": 0.6286276438760453,
"grad_norm": 3.7445263862609863,
"learning_rate": 9.844135169712352e-06,
"loss": 1.2178,
"step": 2556
},
{
"epoch": 0.6288735858337432,
"grad_norm": 3.721270799636841,
"learning_rate": 9.843933611223194e-06,
"loss": 1.1233,
"step": 2557
},
{
"epoch": 0.6291195277914412,
"grad_norm": 3.789949893951416,
"learning_rate": 9.843731924560474e-06,
"loss": 1.3167,
"step": 2558
},
{
"epoch": 0.6293654697491392,
"grad_norm": 3.6042096614837646,
"learning_rate": 9.843530109729532e-06,
"loss": 1.0806,
"step": 2559
},
{
"epoch": 0.6296114117068372,
"grad_norm": 3.918835163116455,
"learning_rate": 9.843328166735708e-06,
"loss": 1.3302,
"step": 2560
},
{
"epoch": 0.6298573536645352,
"grad_norm": 4.031132221221924,
"learning_rate": 9.843126095584344e-06,
"loss": 1.2745,
"step": 2561
},
{
"epoch": 0.6301032956222331,
"grad_norm": 3.395883321762085,
"learning_rate": 9.84292389628079e-06,
"loss": 1.2386,
"step": 2562
},
{
"epoch": 0.6303492375799311,
"grad_norm": 3.6201744079589844,
"learning_rate": 9.84272156883039e-06,
"loss": 1.2025,
"step": 2563
},
{
"epoch": 0.6305951795376291,
"grad_norm": 3.6795220375061035,
"learning_rate": 9.842519113238505e-06,
"loss": 1.12,
"step": 2564
},
{
"epoch": 0.6308411214953271,
"grad_norm": 3.8250715732574463,
"learning_rate": 9.842316529510488e-06,
"loss": 1.2252,
"step": 2565
},
{
"epoch": 0.6310870634530251,
"grad_norm": 3.996086597442627,
"learning_rate": 9.8421138176517e-06,
"loss": 1.3039,
"step": 2566
},
{
"epoch": 0.6313330054107231,
"grad_norm": 4.396104335784912,
"learning_rate": 9.841910977667505e-06,
"loss": 1.2482,
"step": 2567
},
{
"epoch": 0.631578947368421,
"grad_norm": 3.8795971870422363,
"learning_rate": 9.841708009563271e-06,
"loss": 1.2223,
"step": 2568
},
{
"epoch": 0.631824889326119,
"grad_norm": 3.4072303771972656,
"learning_rate": 9.841504913344368e-06,
"loss": 1.0508,
"step": 2569
},
{
"epoch": 0.632070831283817,
"grad_norm": 3.3250832557678223,
"learning_rate": 9.841301689016172e-06,
"loss": 1.2217,
"step": 2570
},
{
"epoch": 0.632316773241515,
"grad_norm": 3.6893227100372314,
"learning_rate": 9.841098336584057e-06,
"loss": 1.3336,
"step": 2571
},
{
"epoch": 0.632562715199213,
"grad_norm": 3.7968735694885254,
"learning_rate": 9.840894856053404e-06,
"loss": 1.3954,
"step": 2572
},
{
"epoch": 0.6328086571569109,
"grad_norm": 3.794024705886841,
"learning_rate": 9.8406912474296e-06,
"loss": 1.2475,
"step": 2573
},
{
"epoch": 0.6330545991146089,
"grad_norm": 3.8340249061584473,
"learning_rate": 9.840487510718034e-06,
"loss": 1.2592,
"step": 2574
},
{
"epoch": 0.6333005410723069,
"grad_norm": 3.726025104522705,
"learning_rate": 9.840283645924091e-06,
"loss": 1.2239,
"step": 2575
},
{
"epoch": 0.6335464830300049,
"grad_norm": 3.7280986309051514,
"learning_rate": 9.84007965305317e-06,
"loss": 1.264,
"step": 2576
},
{
"epoch": 0.6337924249877029,
"grad_norm": 3.9969799518585205,
"learning_rate": 9.839875532110668e-06,
"loss": 1.299,
"step": 2577
},
{
"epoch": 0.6340383669454008,
"grad_norm": 3.369959831237793,
"learning_rate": 9.839671283101986e-06,
"loss": 1.1969,
"step": 2578
},
{
"epoch": 0.6342843089030988,
"grad_norm": 3.9305386543273926,
"learning_rate": 9.839466906032527e-06,
"loss": 1.1354,
"step": 2579
},
{
"epoch": 0.6345302508607968,
"grad_norm": 3.6034395694732666,
"learning_rate": 9.8392624009077e-06,
"loss": 1.2475,
"step": 2580
},
{
"epoch": 0.6347761928184948,
"grad_norm": 4.0616044998168945,
"learning_rate": 9.839057767732917e-06,
"loss": 1.296,
"step": 2581
},
{
"epoch": 0.6350221347761928,
"grad_norm": 3.830399990081787,
"learning_rate": 9.838853006513593e-06,
"loss": 1.2969,
"step": 2582
},
{
"epoch": 0.6352680767338909,
"grad_norm": 3.782472848892212,
"learning_rate": 9.838648117255146e-06,
"loss": 1.1881,
"step": 2583
},
{
"epoch": 0.6355140186915887,
"grad_norm": 3.6582870483398438,
"learning_rate": 9.838443099962995e-06,
"loss": 1.237,
"step": 2584
},
{
"epoch": 0.6357599606492867,
"grad_norm": 3.920006275177002,
"learning_rate": 9.838237954642569e-06,
"loss": 1.3413,
"step": 2585
},
{
"epoch": 0.6360059026069848,
"grad_norm": 3.726140260696411,
"learning_rate": 9.838032681299291e-06,
"loss": 1.1999,
"step": 2586
},
{
"epoch": 0.6362518445646828,
"grad_norm": 4.055642127990723,
"learning_rate": 9.837827279938599e-06,
"loss": 1.387,
"step": 2587
},
{
"epoch": 0.6364977865223808,
"grad_norm": 3.7302587032318115,
"learning_rate": 9.837621750565923e-06,
"loss": 1.3568,
"step": 2588
},
{
"epoch": 0.6367437284800787,
"grad_norm": 3.6282408237457275,
"learning_rate": 9.837416093186703e-06,
"loss": 1.3103,
"step": 2589
},
{
"epoch": 0.6369896704377767,
"grad_norm": 3.7042253017425537,
"learning_rate": 9.837210307806381e-06,
"loss": 1.2393,
"step": 2590
},
{
"epoch": 0.6372356123954747,
"grad_norm": 3.919834613800049,
"learning_rate": 9.837004394430402e-06,
"loss": 1.3403,
"step": 2591
},
{
"epoch": 0.6374815543531727,
"grad_norm": 3.612703800201416,
"learning_rate": 9.836798353064217e-06,
"loss": 1.1815,
"step": 2592
},
{
"epoch": 0.6377274963108707,
"grad_norm": 3.7662649154663086,
"learning_rate": 9.836592183713274e-06,
"loss": 1.1174,
"step": 2593
},
{
"epoch": 0.6379734382685687,
"grad_norm": 3.8971362113952637,
"learning_rate": 9.83638588638303e-06,
"loss": 1.2196,
"step": 2594
},
{
"epoch": 0.6382193802262666,
"grad_norm": 3.915012836456299,
"learning_rate": 9.836179461078946e-06,
"loss": 1.3485,
"step": 2595
},
{
"epoch": 0.6384653221839646,
"grad_norm": 3.7826449871063232,
"learning_rate": 9.835972907806479e-06,
"loss": 1.2668,
"step": 2596
},
{
"epoch": 0.6387112641416626,
"grad_norm": 3.435819149017334,
"learning_rate": 9.8357662265711e-06,
"loss": 1.1522,
"step": 2597
},
{
"epoch": 0.6389572060993606,
"grad_norm": 4.0991902351379395,
"learning_rate": 9.835559417378274e-06,
"loss": 1.3876,
"step": 2598
},
{
"epoch": 0.6392031480570586,
"grad_norm": 4.363412857055664,
"learning_rate": 9.835352480233476e-06,
"loss": 1.3295,
"step": 2599
},
{
"epoch": 0.6394490900147565,
"grad_norm": 3.9606986045837402,
"learning_rate": 9.83514541514218e-06,
"loss": 1.3778,
"step": 2600
},
{
"epoch": 0.6396950319724545,
"grad_norm": 3.425114393234253,
"learning_rate": 9.834938222109866e-06,
"loss": 1.2775,
"step": 2601
},
{
"epoch": 0.6399409739301525,
"grad_norm": 3.8080379962921143,
"learning_rate": 9.834730901142016e-06,
"loss": 1.1896,
"step": 2602
},
{
"epoch": 0.6401869158878505,
"grad_norm": 3.6675615310668945,
"learning_rate": 9.834523452244116e-06,
"loss": 1.2219,
"step": 2603
},
{
"epoch": 0.6404328578455485,
"grad_norm": 4.808147430419922,
"learning_rate": 9.834315875421654e-06,
"loss": 1.3538,
"step": 2604
},
{
"epoch": 0.6406787998032464,
"grad_norm": 3.243403434753418,
"learning_rate": 9.834108170680125e-06,
"loss": 1.112,
"step": 2605
},
{
"epoch": 0.6409247417609444,
"grad_norm": 3.8442420959472656,
"learning_rate": 9.833900338025022e-06,
"loss": 1.2117,
"step": 2606
},
{
"epoch": 0.6411706837186424,
"grad_norm": 3.6114561557769775,
"learning_rate": 9.833692377461849e-06,
"loss": 1.2353,
"step": 2607
},
{
"epoch": 0.6414166256763404,
"grad_norm": 3.6206393241882324,
"learning_rate": 9.833484288996105e-06,
"loss": 1.1984,
"step": 2608
},
{
"epoch": 0.6416625676340384,
"grad_norm": 3.721817970275879,
"learning_rate": 9.833276072633296e-06,
"loss": 1.4017,
"step": 2609
},
{
"epoch": 0.6419085095917364,
"grad_norm": 3.824291229248047,
"learning_rate": 9.833067728378933e-06,
"loss": 1.1947,
"step": 2610
},
{
"epoch": 0.6421544515494343,
"grad_norm": 3.501193046569824,
"learning_rate": 9.832859256238529e-06,
"loss": 1.2054,
"step": 2611
},
{
"epoch": 0.6424003935071323,
"grad_norm": 3.8367395401000977,
"learning_rate": 9.8326506562176e-06,
"loss": 1.1631,
"step": 2612
},
{
"epoch": 0.6426463354648303,
"grad_norm": 3.9268546104431152,
"learning_rate": 9.832441928321664e-06,
"loss": 1.2097,
"step": 2613
},
{
"epoch": 0.6428922774225283,
"grad_norm": 3.755028486251831,
"learning_rate": 9.832233072556248e-06,
"loss": 1.1699,
"step": 2614
},
{
"epoch": 0.6431382193802263,
"grad_norm": 3.8658928871154785,
"learning_rate": 9.832024088926874e-06,
"loss": 1.3623,
"step": 2615
},
{
"epoch": 0.6433841613379242,
"grad_norm": 3.537527084350586,
"learning_rate": 9.831814977439073e-06,
"loss": 1.1034,
"step": 2616
},
{
"epoch": 0.6436301032956222,
"grad_norm": 3.4717540740966797,
"learning_rate": 9.831605738098382e-06,
"loss": 1.1272,
"step": 2617
},
{
"epoch": 0.6438760452533202,
"grad_norm": 3.5187344551086426,
"learning_rate": 9.831396370910333e-06,
"loss": 1.195,
"step": 2618
},
{
"epoch": 0.6441219872110182,
"grad_norm": 4.167270183563232,
"learning_rate": 9.831186875880467e-06,
"loss": 1.2448,
"step": 2619
},
{
"epoch": 0.6443679291687162,
"grad_norm": 3.596653699874878,
"learning_rate": 9.830977253014332e-06,
"loss": 1.1511,
"step": 2620
},
{
"epoch": 0.6446138711264142,
"grad_norm": 3.9313220977783203,
"learning_rate": 9.830767502317467e-06,
"loss": 1.344,
"step": 2621
},
{
"epoch": 0.6448598130841121,
"grad_norm": 3.207564353942871,
"learning_rate": 9.830557623795426e-06,
"loss": 1.0994,
"step": 2622
},
{
"epoch": 0.6451057550418101,
"grad_norm": 3.893430471420288,
"learning_rate": 9.830347617453764e-06,
"loss": 1.2626,
"step": 2623
},
{
"epoch": 0.6453516969995081,
"grad_norm": 3.994905471801758,
"learning_rate": 9.830137483298035e-06,
"loss": 1.2368,
"step": 2624
},
{
"epoch": 0.6455976389572061,
"grad_norm": 3.8473575115203857,
"learning_rate": 9.829927221333801e-06,
"loss": 1.1583,
"step": 2625
},
{
"epoch": 0.6458435809149041,
"grad_norm": 3.7361197471618652,
"learning_rate": 9.829716831566628e-06,
"loss": 1.1626,
"step": 2626
},
{
"epoch": 0.646089522872602,
"grad_norm": 3.4324870109558105,
"learning_rate": 9.829506314002077e-06,
"loss": 1.0897,
"step": 2627
},
{
"epoch": 0.6463354648303,
"grad_norm": 3.8036117553710938,
"learning_rate": 9.829295668645724e-06,
"loss": 1.2594,
"step": 2628
},
{
"epoch": 0.646581406787998,
"grad_norm": 3.953852653503418,
"learning_rate": 9.829084895503139e-06,
"loss": 1.2175,
"step": 2629
},
{
"epoch": 0.646827348745696,
"grad_norm": 4.100643157958984,
"learning_rate": 9.828873994579901e-06,
"loss": 1.2811,
"step": 2630
},
{
"epoch": 0.647073290703394,
"grad_norm": 3.483051300048828,
"learning_rate": 9.828662965881591e-06,
"loss": 1.1655,
"step": 2631
},
{
"epoch": 0.647319232661092,
"grad_norm": 3.837907075881958,
"learning_rate": 9.828451809413793e-06,
"loss": 1.4599,
"step": 2632
},
{
"epoch": 0.6475651746187899,
"grad_norm": 4.317206859588623,
"learning_rate": 9.82824052518209e-06,
"loss": 1.2349,
"step": 2633
},
{
"epoch": 0.6478111165764879,
"grad_norm": 4.357658863067627,
"learning_rate": 9.828029113192081e-06,
"loss": 1.259,
"step": 2634
},
{
"epoch": 0.6480570585341859,
"grad_norm": 3.533658742904663,
"learning_rate": 9.827817573449355e-06,
"loss": 1.0935,
"step": 2635
},
{
"epoch": 0.6483030004918839,
"grad_norm": 4.187869548797607,
"learning_rate": 9.827605905959507e-06,
"loss": 1.3819,
"step": 2636
},
{
"epoch": 0.6485489424495819,
"grad_norm": 3.70363450050354,
"learning_rate": 9.827394110728144e-06,
"loss": 1.3304,
"step": 2637
},
{
"epoch": 0.6487948844072798,
"grad_norm": 3.7946808338165283,
"learning_rate": 9.827182187760864e-06,
"loss": 1.208,
"step": 2638
},
{
"epoch": 0.6490408263649778,
"grad_norm": 3.6368050575256348,
"learning_rate": 9.826970137063279e-06,
"loss": 1.106,
"step": 2639
},
{
"epoch": 0.6492867683226758,
"grad_norm": 3.964268684387207,
"learning_rate": 9.826757958641e-06,
"loss": 1.3785,
"step": 2640
},
{
"epoch": 0.6495327102803738,
"grad_norm": 4.022248268127441,
"learning_rate": 9.826545652499638e-06,
"loss": 1.2453,
"step": 2641
},
{
"epoch": 0.6497786522380719,
"grad_norm": 3.840114116668701,
"learning_rate": 9.826333218644814e-06,
"loss": 1.1608,
"step": 2642
},
{
"epoch": 0.6500245941957697,
"grad_norm": 4.180699825286865,
"learning_rate": 9.826120657082148e-06,
"loss": 1.2767,
"step": 2643
},
{
"epoch": 0.6502705361534677,
"grad_norm": 3.9508001804351807,
"learning_rate": 9.825907967817263e-06,
"loss": 1.3382,
"step": 2644
},
{
"epoch": 0.6505164781111658,
"grad_norm": 3.8456339836120605,
"learning_rate": 9.82569515085579e-06,
"loss": 1.1934,
"step": 2645
},
{
"epoch": 0.6507624200688638,
"grad_norm": 4.125998497009277,
"learning_rate": 9.825482206203358e-06,
"loss": 1.3698,
"step": 2646
},
{
"epoch": 0.6510083620265618,
"grad_norm": 3.6823601722717285,
"learning_rate": 9.825269133865603e-06,
"loss": 1.2147,
"step": 2647
},
{
"epoch": 0.6512543039842598,
"grad_norm": 3.886720657348633,
"learning_rate": 9.825055933848161e-06,
"loss": 1.2363,
"step": 2648
},
{
"epoch": 0.6515002459419577,
"grad_norm": 3.7813613414764404,
"learning_rate": 9.824842606156675e-06,
"loss": 1.233,
"step": 2649
},
{
"epoch": 0.6517461878996557,
"grad_norm": 3.8925318717956543,
"learning_rate": 9.82462915079679e-06,
"loss": 1.3466,
"step": 2650
},
{
"epoch": 0.6519921298573537,
"grad_norm": 4.133185863494873,
"learning_rate": 9.824415567774153e-06,
"loss": 1.2873,
"step": 2651
},
{
"epoch": 0.6522380718150517,
"grad_norm": 3.560547113418579,
"learning_rate": 9.824201857094417e-06,
"loss": 1.1701,
"step": 2652
},
{
"epoch": 0.6524840137727497,
"grad_norm": 3.656126022338867,
"learning_rate": 9.823988018763235e-06,
"loss": 1.2103,
"step": 2653
},
{
"epoch": 0.6527299557304476,
"grad_norm": 3.96260142326355,
"learning_rate": 9.823774052786268e-06,
"loss": 1.1918,
"step": 2654
},
{
"epoch": 0.6529758976881456,
"grad_norm": 3.651796340942383,
"learning_rate": 9.823559959169176e-06,
"loss": 1.3109,
"step": 2655
},
{
"epoch": 0.6532218396458436,
"grad_norm": 3.867217540740967,
"learning_rate": 9.823345737917622e-06,
"loss": 1.2631,
"step": 2656
},
{
"epoch": 0.6534677816035416,
"grad_norm": 3.4214119911193848,
"learning_rate": 9.823131389037278e-06,
"loss": 1.1525,
"step": 2657
},
{
"epoch": 0.6537137235612396,
"grad_norm": 3.9086477756500244,
"learning_rate": 9.822916912533815e-06,
"loss": 1.2536,
"step": 2658
},
{
"epoch": 0.6539596655189376,
"grad_norm": 3.3010571002960205,
"learning_rate": 9.822702308412906e-06,
"loss": 1.1807,
"step": 2659
},
{
"epoch": 0.6542056074766355,
"grad_norm": 3.9272680282592773,
"learning_rate": 9.822487576680231e-06,
"loss": 1.2087,
"step": 2660
},
{
"epoch": 0.6544515494343335,
"grad_norm": 3.4240798950195312,
"learning_rate": 9.822272717341474e-06,
"loss": 1.1114,
"step": 2661
},
{
"epoch": 0.6546974913920315,
"grad_norm": 3.9906671047210693,
"learning_rate": 9.822057730402317e-06,
"loss": 1.2894,
"step": 2662
},
{
"epoch": 0.6549434333497295,
"grad_norm": 3.60724139213562,
"learning_rate": 9.821842615868452e-06,
"loss": 1.1279,
"step": 2663
},
{
"epoch": 0.6551893753074275,
"grad_norm": 3.5562360286712646,
"learning_rate": 9.821627373745566e-06,
"loss": 1.1966,
"step": 2664
},
{
"epoch": 0.6554353172651254,
"grad_norm": 3.9176549911499023,
"learning_rate": 9.82141200403936e-06,
"loss": 1.2871,
"step": 2665
},
{
"epoch": 0.6556812592228234,
"grad_norm": 3.544968366622925,
"learning_rate": 9.821196506755529e-06,
"loss": 1.3045,
"step": 2666
},
{
"epoch": 0.6559272011805214,
"grad_norm": 3.361649751663208,
"learning_rate": 9.820980881899778e-06,
"loss": 1.0796,
"step": 2667
},
{
"epoch": 0.6561731431382194,
"grad_norm": 3.7508535385131836,
"learning_rate": 9.820765129477809e-06,
"loss": 1.1771,
"step": 2668
},
{
"epoch": 0.6564190850959174,
"grad_norm": 3.7665042877197266,
"learning_rate": 9.820549249495334e-06,
"loss": 1.1569,
"step": 2669
},
{
"epoch": 0.6566650270536154,
"grad_norm": 3.76166033744812,
"learning_rate": 9.820333241958065e-06,
"loss": 1.2669,
"step": 2670
},
{
"epoch": 0.6569109690113133,
"grad_norm": 3.5752217769622803,
"learning_rate": 9.820117106871717e-06,
"loss": 1.1329,
"step": 2671
},
{
"epoch": 0.6571569109690113,
"grad_norm": 3.6125998497009277,
"learning_rate": 9.81990084424201e-06,
"loss": 1.1865,
"step": 2672
},
{
"epoch": 0.6574028529267093,
"grad_norm": 3.5016555786132812,
"learning_rate": 9.819684454074664e-06,
"loss": 1.1899,
"step": 2673
},
{
"epoch": 0.6576487948844073,
"grad_norm": 3.9676527976989746,
"learning_rate": 9.819467936375407e-06,
"loss": 1.2879,
"step": 2674
},
{
"epoch": 0.6578947368421053,
"grad_norm": 3.7402713298797607,
"learning_rate": 9.819251291149968e-06,
"loss": 1.3108,
"step": 2675
},
{
"epoch": 0.6581406787998032,
"grad_norm": 3.9037301540374756,
"learning_rate": 9.81903451840408e-06,
"loss": 1.2893,
"step": 2676
},
{
"epoch": 0.6583866207575012,
"grad_norm": 3.9056687355041504,
"learning_rate": 9.818817618143477e-06,
"loss": 1.2083,
"step": 2677
},
{
"epoch": 0.6586325627151992,
"grad_norm": 3.873429775238037,
"learning_rate": 9.818600590373898e-06,
"loss": 1.203,
"step": 2678
},
{
"epoch": 0.6588785046728972,
"grad_norm": 3.765023708343506,
"learning_rate": 9.818383435101092e-06,
"loss": 1.3415,
"step": 2679
},
{
"epoch": 0.6591244466305952,
"grad_norm": 3.8221542835235596,
"learning_rate": 9.818166152330795e-06,
"loss": 1.1686,
"step": 2680
},
{
"epoch": 0.6593703885882931,
"grad_norm": 3.4611406326293945,
"learning_rate": 9.817948742068764e-06,
"loss": 1.1111,
"step": 2681
},
{
"epoch": 0.6596163305459911,
"grad_norm": 3.8618388175964355,
"learning_rate": 9.81773120432075e-06,
"loss": 1.3206,
"step": 2682
},
{
"epoch": 0.6598622725036891,
"grad_norm": 3.724785566329956,
"learning_rate": 9.817513539092508e-06,
"loss": 1.2818,
"step": 2683
},
{
"epoch": 0.6601082144613871,
"grad_norm": 3.7814040184020996,
"learning_rate": 9.817295746389798e-06,
"loss": 1.3421,
"step": 2684
},
{
"epoch": 0.6603541564190851,
"grad_norm": 3.7911221981048584,
"learning_rate": 9.817077826218385e-06,
"loss": 1.2455,
"step": 2685
},
{
"epoch": 0.6606000983767831,
"grad_norm": 3.8262228965759277,
"learning_rate": 9.816859778584032e-06,
"loss": 1.2843,
"step": 2686
},
{
"epoch": 0.660846040334481,
"grad_norm": 3.885464906692505,
"learning_rate": 9.816641603492509e-06,
"loss": 1.2804,
"step": 2687
},
{
"epoch": 0.661091982292179,
"grad_norm": 3.8361287117004395,
"learning_rate": 9.81642330094959e-06,
"loss": 1.2029,
"step": 2688
},
{
"epoch": 0.661337924249877,
"grad_norm": 3.694225549697876,
"learning_rate": 9.816204870961054e-06,
"loss": 1.1606,
"step": 2689
},
{
"epoch": 0.661583866207575,
"grad_norm": 4.029979228973389,
"learning_rate": 9.815986313532676e-06,
"loss": 1.2463,
"step": 2690
},
{
"epoch": 0.661829808165273,
"grad_norm": 3.4176061153411865,
"learning_rate": 9.815767628670245e-06,
"loss": 1.0627,
"step": 2691
},
{
"epoch": 0.6620757501229709,
"grad_norm": 3.7972235679626465,
"learning_rate": 9.815548816379542e-06,
"loss": 1.2611,
"step": 2692
},
{
"epoch": 0.6623216920806689,
"grad_norm": 3.853787899017334,
"learning_rate": 9.81532987666636e-06,
"loss": 1.2466,
"step": 2693
},
{
"epoch": 0.6625676340383669,
"grad_norm": 3.881412982940674,
"learning_rate": 9.81511080953649e-06,
"loss": 1.1908,
"step": 2694
},
{
"epoch": 0.6628135759960649,
"grad_norm": 3.8627424240112305,
"learning_rate": 9.81489161499573e-06,
"loss": 1.4224,
"step": 2695
},
{
"epoch": 0.663059517953763,
"grad_norm": 4.178469181060791,
"learning_rate": 9.81467229304988e-06,
"loss": 1.2919,
"step": 2696
},
{
"epoch": 0.663305459911461,
"grad_norm": 4.028983116149902,
"learning_rate": 9.814452843704744e-06,
"loss": 1.2846,
"step": 2697
},
{
"epoch": 0.6635514018691588,
"grad_norm": 3.8426246643066406,
"learning_rate": 9.81423326696613e-06,
"loss": 1.2032,
"step": 2698
},
{
"epoch": 0.6637973438268568,
"grad_norm": 4.0372419357299805,
"learning_rate": 9.814013562839844e-06,
"loss": 1.2777,
"step": 2699
},
{
"epoch": 0.6640432857845548,
"grad_norm": 3.765970468521118,
"learning_rate": 9.813793731331703e-06,
"loss": 1.2179,
"step": 2700
},
{
"epoch": 0.6642892277422529,
"grad_norm": 3.829099178314209,
"learning_rate": 9.813573772447522e-06,
"loss": 1.253,
"step": 2701
},
{
"epoch": 0.6645351696999509,
"grad_norm": 3.5761992931365967,
"learning_rate": 9.813353686193122e-06,
"loss": 1.1652,
"step": 2702
},
{
"epoch": 0.6647811116576487,
"grad_norm": 3.6343367099761963,
"learning_rate": 9.813133472574328e-06,
"loss": 1.1117,
"step": 2703
},
{
"epoch": 0.6650270536153468,
"grad_norm": 3.9197635650634766,
"learning_rate": 9.812913131596964e-06,
"loss": 1.4077,
"step": 2704
},
{
"epoch": 0.6652729955730448,
"grad_norm": 3.4026482105255127,
"learning_rate": 9.812692663266862e-06,
"loss": 1.3216,
"step": 2705
},
{
"epoch": 0.6655189375307428,
"grad_norm": 3.6619067192077637,
"learning_rate": 9.812472067589856e-06,
"loss": 1.1858,
"step": 2706
},
{
"epoch": 0.6657648794884408,
"grad_norm": 3.674340009689331,
"learning_rate": 9.812251344571783e-06,
"loss": 1.1897,
"step": 2707
},
{
"epoch": 0.6660108214461387,
"grad_norm": 3.6868436336517334,
"learning_rate": 9.812030494218484e-06,
"loss": 1.2706,
"step": 2708
},
{
"epoch": 0.6662567634038367,
"grad_norm": 3.622556209564209,
"learning_rate": 9.8118095165358e-06,
"loss": 1.0979,
"step": 2709
},
{
"epoch": 0.6665027053615347,
"grad_norm": 3.737323522567749,
"learning_rate": 9.811588411529582e-06,
"loss": 1.1404,
"step": 2710
},
{
"epoch": 0.6667486473192327,
"grad_norm": 3.951174736022949,
"learning_rate": 9.811367179205677e-06,
"loss": 1.2319,
"step": 2711
},
{
"epoch": 0.6669945892769307,
"grad_norm": 3.7596280574798584,
"learning_rate": 9.811145819569943e-06,
"loss": 1.2948,
"step": 2712
},
{
"epoch": 0.6672405312346287,
"grad_norm": 3.5799005031585693,
"learning_rate": 9.810924332628234e-06,
"loss": 1.2569,
"step": 2713
},
{
"epoch": 0.6674864731923266,
"grad_norm": 3.5244457721710205,
"learning_rate": 9.810702718386413e-06,
"loss": 1.1979,
"step": 2714
},
{
"epoch": 0.6677324151500246,
"grad_norm": 3.494089126586914,
"learning_rate": 9.810480976850342e-06,
"loss": 1.1452,
"step": 2715
},
{
"epoch": 0.6679783571077226,
"grad_norm": 4.149924278259277,
"learning_rate": 9.810259108025889e-06,
"loss": 1.4143,
"step": 2716
},
{
"epoch": 0.6682242990654206,
"grad_norm": 3.699502468109131,
"learning_rate": 9.810037111918925e-06,
"loss": 1.287,
"step": 2717
},
{
"epoch": 0.6684702410231186,
"grad_norm": 3.7696373462677,
"learning_rate": 9.809814988535324e-06,
"loss": 1.3101,
"step": 2718
},
{
"epoch": 0.6687161829808165,
"grad_norm": 3.849297285079956,
"learning_rate": 9.809592737880966e-06,
"loss": 1.1706,
"step": 2719
},
{
"epoch": 0.6689621249385145,
"grad_norm": 4.050236225128174,
"learning_rate": 9.809370359961727e-06,
"loss": 1.2232,
"step": 2720
},
{
"epoch": 0.6692080668962125,
"grad_norm": 4.010958671569824,
"learning_rate": 9.809147854783496e-06,
"loss": 1.3634,
"step": 2721
},
{
"epoch": 0.6694540088539105,
"grad_norm": 3.6007328033447266,
"learning_rate": 9.808925222352158e-06,
"loss": 1.1401,
"step": 2722
},
{
"epoch": 0.6696999508116085,
"grad_norm": 4.1935343742370605,
"learning_rate": 9.808702462673604e-06,
"loss": 1.2156,
"step": 2723
},
{
"epoch": 0.6699458927693065,
"grad_norm": 3.7772552967071533,
"learning_rate": 9.80847957575373e-06,
"loss": 1.265,
"step": 2724
},
{
"epoch": 0.6701918347270044,
"grad_norm": 3.736003875732422,
"learning_rate": 9.808256561598431e-06,
"loss": 1.1885,
"step": 2725
},
{
"epoch": 0.6704377766847024,
"grad_norm": 3.4275758266448975,
"learning_rate": 9.808033420213611e-06,
"loss": 1.1969,
"step": 2726
},
{
"epoch": 0.6706837186424004,
"grad_norm": 4.037476539611816,
"learning_rate": 9.807810151605172e-06,
"loss": 1.3496,
"step": 2727
},
{
"epoch": 0.6709296606000984,
"grad_norm": 3.926862955093384,
"learning_rate": 9.807586755779026e-06,
"loss": 1.0986,
"step": 2728
},
{
"epoch": 0.6711756025577964,
"grad_norm": 3.8330776691436768,
"learning_rate": 9.807363232741078e-06,
"loss": 1.3241,
"step": 2729
},
{
"epoch": 0.6714215445154943,
"grad_norm": 3.7725393772125244,
"learning_rate": 9.807139582497248e-06,
"loss": 1.1868,
"step": 2730
},
{
"epoch": 0.6716674864731923,
"grad_norm": 3.7919273376464844,
"learning_rate": 9.80691580505345e-06,
"loss": 1.3563,
"step": 2731
},
{
"epoch": 0.6719134284308903,
"grad_norm": 3.2115375995635986,
"learning_rate": 9.806691900415607e-06,
"loss": 1.126,
"step": 2732
},
{
"epoch": 0.6721593703885883,
"grad_norm": 3.6278059482574463,
"learning_rate": 9.806467868589644e-06,
"loss": 1.07,
"step": 2733
},
{
"epoch": 0.6724053123462863,
"grad_norm": 3.996083974838257,
"learning_rate": 9.80624370958149e-06,
"loss": 1.2232,
"step": 2734
},
{
"epoch": 0.6726512543039843,
"grad_norm": 3.6387999057769775,
"learning_rate": 9.806019423397073e-06,
"loss": 1.1264,
"step": 2735
},
{
"epoch": 0.6728971962616822,
"grad_norm": 3.719271183013916,
"learning_rate": 9.805795010042331e-06,
"loss": 1.1585,
"step": 2736
},
{
"epoch": 0.6731431382193802,
"grad_norm": 3.44995379447937,
"learning_rate": 9.805570469523203e-06,
"loss": 1.1366,
"step": 2737
},
{
"epoch": 0.6733890801770782,
"grad_norm": 3.7163469791412354,
"learning_rate": 9.805345801845626e-06,
"loss": 1.291,
"step": 2738
},
{
"epoch": 0.6736350221347762,
"grad_norm": 3.6071369647979736,
"learning_rate": 9.805121007015548e-06,
"loss": 1.1831,
"step": 2739
},
{
"epoch": 0.6738809640924742,
"grad_norm": 3.9349443912506104,
"learning_rate": 9.804896085038915e-06,
"loss": 1.2625,
"step": 2740
},
{
"epoch": 0.6741269060501721,
"grad_norm": 3.8176991939544678,
"learning_rate": 9.804671035921682e-06,
"loss": 1.2088,
"step": 2741
},
{
"epoch": 0.6743728480078701,
"grad_norm": 3.5887539386749268,
"learning_rate": 9.804445859669802e-06,
"loss": 1.2772,
"step": 2742
},
{
"epoch": 0.6746187899655681,
"grad_norm": 3.755354642868042,
"learning_rate": 9.804220556289232e-06,
"loss": 1.292,
"step": 2743
},
{
"epoch": 0.6748647319232661,
"grad_norm": 3.562833547592163,
"learning_rate": 9.803995125785937e-06,
"loss": 1.1279,
"step": 2744
},
{
"epoch": 0.6751106738809641,
"grad_norm": 3.811105251312256,
"learning_rate": 9.803769568165878e-06,
"loss": 1.2655,
"step": 2745
},
{
"epoch": 0.675356615838662,
"grad_norm": 3.5226874351501465,
"learning_rate": 9.803543883435028e-06,
"loss": 1.1992,
"step": 2746
},
{
"epoch": 0.67560255779636,
"grad_norm": 3.95967698097229,
"learning_rate": 9.803318071599352e-06,
"loss": 1.2673,
"step": 2747
},
{
"epoch": 0.675848499754058,
"grad_norm": 3.5152807235717773,
"learning_rate": 9.803092132664834e-06,
"loss": 1.2076,
"step": 2748
},
{
"epoch": 0.676094441711756,
"grad_norm": 3.5721638202667236,
"learning_rate": 9.802866066637445e-06,
"loss": 1.0762,
"step": 2749
},
{
"epoch": 0.676340383669454,
"grad_norm": 3.7930197715759277,
"learning_rate": 9.802639873523169e-06,
"loss": 1.2973,
"step": 2750
},
{
"epoch": 0.676586325627152,
"grad_norm": 3.6387012004852295,
"learning_rate": 9.802413553327995e-06,
"loss": 1.1818,
"step": 2751
},
{
"epoch": 0.6768322675848499,
"grad_norm": 3.380821943283081,
"learning_rate": 9.802187106057905e-06,
"loss": 1.1493,
"step": 2752
},
{
"epoch": 0.6770782095425479,
"grad_norm": 3.3943073749542236,
"learning_rate": 9.801960531718898e-06,
"loss": 1.1629,
"step": 2753
},
{
"epoch": 0.6773241515002459,
"grad_norm": 3.9503750801086426,
"learning_rate": 9.801733830316962e-06,
"loss": 1.4481,
"step": 2754
},
{
"epoch": 0.677570093457944,
"grad_norm": 4.3308539390563965,
"learning_rate": 9.801507001858102e-06,
"loss": 1.2742,
"step": 2755
},
{
"epoch": 0.677816035415642,
"grad_norm": 3.667858123779297,
"learning_rate": 9.801280046348316e-06,
"loss": 1.206,
"step": 2756
},
{
"epoch": 0.6780619773733398,
"grad_norm": 3.668693780899048,
"learning_rate": 9.801052963793612e-06,
"loss": 1.0812,
"step": 2757
},
{
"epoch": 0.6783079193310378,
"grad_norm": 3.6538171768188477,
"learning_rate": 9.800825754199996e-06,
"loss": 1.1375,
"step": 2758
},
{
"epoch": 0.6785538612887358,
"grad_norm": 3.2957324981689453,
"learning_rate": 9.800598417573484e-06,
"loss": 1.232,
"step": 2759
},
{
"epoch": 0.6787998032464339,
"grad_norm": 3.817131519317627,
"learning_rate": 9.800370953920087e-06,
"loss": 1.2954,
"step": 2760
},
{
"epoch": 0.6790457452041319,
"grad_norm": 3.3971118927001953,
"learning_rate": 9.800143363245826e-06,
"loss": 1.1112,
"step": 2761
},
{
"epoch": 0.6792916871618299,
"grad_norm": 3.6458561420440674,
"learning_rate": 9.799915645556725e-06,
"loss": 1.253,
"step": 2762
},
{
"epoch": 0.6795376291195278,
"grad_norm": 3.9006423950195312,
"learning_rate": 9.799687800858806e-06,
"loss": 1.0707,
"step": 2763
},
{
"epoch": 0.6797835710772258,
"grad_norm": 3.4585375785827637,
"learning_rate": 9.7994598291581e-06,
"loss": 1.2345,
"step": 2764
},
{
"epoch": 0.6800295130349238,
"grad_norm": 3.239893913269043,
"learning_rate": 9.799231730460638e-06,
"loss": 1.1291,
"step": 2765
},
{
"epoch": 0.6802754549926218,
"grad_norm": 3.9842095375061035,
"learning_rate": 9.799003504772458e-06,
"loss": 1.3853,
"step": 2766
},
{
"epoch": 0.6805213969503198,
"grad_norm": 3.650418281555176,
"learning_rate": 9.798775152099597e-06,
"loss": 1.2353,
"step": 2767
},
{
"epoch": 0.6807673389080177,
"grad_norm": 3.488300323486328,
"learning_rate": 9.798546672448098e-06,
"loss": 1.07,
"step": 2768
},
{
"epoch": 0.6810132808657157,
"grad_norm": 3.7617859840393066,
"learning_rate": 9.798318065824006e-06,
"loss": 1.3579,
"step": 2769
},
{
"epoch": 0.6812592228234137,
"grad_norm": 3.860496997833252,
"learning_rate": 9.798089332233371e-06,
"loss": 1.2225,
"step": 2770
},
{
"epoch": 0.6815051647811117,
"grad_norm": 3.8232083320617676,
"learning_rate": 9.797860471682245e-06,
"loss": 1.3335,
"step": 2771
},
{
"epoch": 0.6817511067388097,
"grad_norm": 3.7715744972229004,
"learning_rate": 9.797631484176685e-06,
"loss": 1.2764,
"step": 2772
},
{
"epoch": 0.6819970486965077,
"grad_norm": 3.538588285446167,
"learning_rate": 9.797402369722749e-06,
"loss": 1.2868,
"step": 2773
},
{
"epoch": 0.6822429906542056,
"grad_norm": 3.5791990756988525,
"learning_rate": 9.797173128326499e-06,
"loss": 1.098,
"step": 2774
},
{
"epoch": 0.6824889326119036,
"grad_norm": 3.5185086727142334,
"learning_rate": 9.796943759994e-06,
"loss": 1.1551,
"step": 2775
},
{
"epoch": 0.6827348745696016,
"grad_norm": 3.973799228668213,
"learning_rate": 9.796714264731324e-06,
"loss": 1.2607,
"step": 2776
},
{
"epoch": 0.6829808165272996,
"grad_norm": 3.926222324371338,
"learning_rate": 9.796484642544542e-06,
"loss": 1.2793,
"step": 2777
},
{
"epoch": 0.6832267584849976,
"grad_norm": 4.027682781219482,
"learning_rate": 9.79625489343973e-06,
"loss": 1.2423,
"step": 2778
},
{
"epoch": 0.6834727004426955,
"grad_norm": 3.638627767562866,
"learning_rate": 9.796025017422967e-06,
"loss": 1.211,
"step": 2779
},
{
"epoch": 0.6837186424003935,
"grad_norm": 3.9291532039642334,
"learning_rate": 9.795795014500338e-06,
"loss": 1.2551,
"step": 2780
},
{
"epoch": 0.6839645843580915,
"grad_norm": 3.4333553314208984,
"learning_rate": 9.795564884677925e-06,
"loss": 1.1933,
"step": 2781
},
{
"epoch": 0.6842105263157895,
"grad_norm": 3.703695297241211,
"learning_rate": 9.795334627961822e-06,
"loss": 1.1867,
"step": 2782
},
{
"epoch": 0.6844564682734875,
"grad_norm": 3.607741117477417,
"learning_rate": 9.795104244358117e-06,
"loss": 1.1809,
"step": 2783
},
{
"epoch": 0.6847024102311854,
"grad_norm": 3.6176626682281494,
"learning_rate": 9.79487373387291e-06,
"loss": 1.1511,
"step": 2784
},
{
"epoch": 0.6849483521888834,
"grad_norm": 3.750548839569092,
"learning_rate": 9.794643096512298e-06,
"loss": 1.2212,
"step": 2785
},
{
"epoch": 0.6851942941465814,
"grad_norm": 3.4478766918182373,
"learning_rate": 9.794412332282386e-06,
"loss": 1.2477,
"step": 2786
},
{
"epoch": 0.6854402361042794,
"grad_norm": 3.781501531600952,
"learning_rate": 9.794181441189277e-06,
"loss": 1.1062,
"step": 2787
},
{
"epoch": 0.6856861780619774,
"grad_norm": 3.910632610321045,
"learning_rate": 9.793950423239082e-06,
"loss": 1.3928,
"step": 2788
},
{
"epoch": 0.6859321200196754,
"grad_norm": 3.397245407104492,
"learning_rate": 9.793719278437917e-06,
"loss": 1.1306,
"step": 2789
},
{
"epoch": 0.6861780619773733,
"grad_norm": 3.6681694984436035,
"learning_rate": 9.793488006791894e-06,
"loss": 1.2475,
"step": 2790
},
{
"epoch": 0.6864240039350713,
"grad_norm": 3.564866304397583,
"learning_rate": 9.793256608307134e-06,
"loss": 1.1438,
"step": 2791
},
{
"epoch": 0.6866699458927693,
"grad_norm": 4.073515892028809,
"learning_rate": 9.79302508298976e-06,
"loss": 1.1602,
"step": 2792
},
{
"epoch": 0.6869158878504673,
"grad_norm": 4.132083415985107,
"learning_rate": 9.792793430845899e-06,
"loss": 1.2762,
"step": 2793
},
{
"epoch": 0.6871618298081653,
"grad_norm": 3.5829763412475586,
"learning_rate": 9.792561651881679e-06,
"loss": 1.2238,
"step": 2794
},
{
"epoch": 0.6874077717658632,
"grad_norm": 3.3725743293762207,
"learning_rate": 9.792329746103236e-06,
"loss": 1.2884,
"step": 2795
},
{
"epoch": 0.6876537137235612,
"grad_norm": 4.227250576019287,
"learning_rate": 9.792097713516701e-06,
"loss": 1.4308,
"step": 2796
},
{
"epoch": 0.6878996556812592,
"grad_norm": 3.8425514698028564,
"learning_rate": 9.79186555412822e-06,
"loss": 1.3082,
"step": 2797
},
{
"epoch": 0.6881455976389572,
"grad_norm": 3.829270601272583,
"learning_rate": 9.791633267943931e-06,
"loss": 1.2698,
"step": 2798
},
{
"epoch": 0.6883915395966552,
"grad_norm": 3.73075532913208,
"learning_rate": 9.791400854969986e-06,
"loss": 1.302,
"step": 2799
},
{
"epoch": 0.6886374815543532,
"grad_norm": 3.8142924308776855,
"learning_rate": 9.79116831521253e-06,
"loss": 1.2903,
"step": 2800
},
{
"epoch": 0.6888834235120511,
"grad_norm": 3.4827301502227783,
"learning_rate": 9.790935648677715e-06,
"loss": 1.1577,
"step": 2801
},
{
"epoch": 0.6891293654697491,
"grad_norm": 3.440375804901123,
"learning_rate": 9.790702855371703e-06,
"loss": 1.1543,
"step": 2802
},
{
"epoch": 0.6893753074274471,
"grad_norm": 3.9827394485473633,
"learning_rate": 9.79046993530065e-06,
"loss": 1.3033,
"step": 2803
},
{
"epoch": 0.6896212493851451,
"grad_norm": 3.5762510299682617,
"learning_rate": 9.79023688847072e-06,
"loss": 1.2556,
"step": 2804
},
{
"epoch": 0.6898671913428431,
"grad_norm": 3.697504758834839,
"learning_rate": 9.79000371488808e-06,
"loss": 1.1126,
"step": 2805
},
{
"epoch": 0.690113133300541,
"grad_norm": 3.685171127319336,
"learning_rate": 9.7897704145589e-06,
"loss": 1.2234,
"step": 2806
},
{
"epoch": 0.690359075258239,
"grad_norm": 4.112430572509766,
"learning_rate": 9.789536987489352e-06,
"loss": 1.3364,
"step": 2807
},
{
"epoch": 0.690605017215937,
"grad_norm": 3.727309465408325,
"learning_rate": 9.789303433685615e-06,
"loss": 1.2272,
"step": 2808
},
{
"epoch": 0.690850959173635,
"grad_norm": 3.8051540851593018,
"learning_rate": 9.789069753153868e-06,
"loss": 1.1893,
"step": 2809
},
{
"epoch": 0.691096901131333,
"grad_norm": 3.6040050983428955,
"learning_rate": 9.78883594590029e-06,
"loss": 1.21,
"step": 2810
},
{
"epoch": 0.6913428430890309,
"grad_norm": 4.278500080108643,
"learning_rate": 9.788602011931076e-06,
"loss": 1.4117,
"step": 2811
},
{
"epoch": 0.6915887850467289,
"grad_norm": 3.919942855834961,
"learning_rate": 9.78836795125241e-06,
"loss": 1.2938,
"step": 2812
},
{
"epoch": 0.6918347270044269,
"grad_norm": 3.4958057403564453,
"learning_rate": 9.788133763870486e-06,
"loss": 1.1733,
"step": 2813
},
{
"epoch": 0.692080668962125,
"grad_norm": 3.466243028640747,
"learning_rate": 9.787899449791502e-06,
"loss": 1.2088,
"step": 2814
},
{
"epoch": 0.692326610919823,
"grad_norm": 3.161153554916382,
"learning_rate": 9.78766500902166e-06,
"loss": 0.9974,
"step": 2815
},
{
"epoch": 0.692572552877521,
"grad_norm": 3.696357488632202,
"learning_rate": 9.78743044156716e-06,
"loss": 1.2196,
"step": 2816
},
{
"epoch": 0.6928184948352188,
"grad_norm": 3.7144525051116943,
"learning_rate": 9.787195747434209e-06,
"loss": 1.2272,
"step": 2817
},
{
"epoch": 0.6930644367929168,
"grad_norm": 3.5107853412628174,
"learning_rate": 9.78696092662902e-06,
"loss": 1.184,
"step": 2818
},
{
"epoch": 0.6933103787506149,
"grad_norm": 3.397926092147827,
"learning_rate": 9.786725979157804e-06,
"loss": 1.0887,
"step": 2819
},
{
"epoch": 0.6935563207083129,
"grad_norm": 3.3558249473571777,
"learning_rate": 9.786490905026779e-06,
"loss": 1.2404,
"step": 2820
},
{
"epoch": 0.6938022626660109,
"grad_norm": 3.625300407409668,
"learning_rate": 9.786255704242164e-06,
"loss": 1.2516,
"step": 2821
},
{
"epoch": 0.6940482046237088,
"grad_norm": 3.7559452056884766,
"learning_rate": 9.786020376810183e-06,
"loss": 1.1916,
"step": 2822
},
{
"epoch": 0.6942941465814068,
"grad_norm": 3.603795289993286,
"learning_rate": 9.785784922737066e-06,
"loss": 1.2314,
"step": 2823
},
{
"epoch": 0.6945400885391048,
"grad_norm": 3.4979734420776367,
"learning_rate": 9.785549342029038e-06,
"loss": 1.2726,
"step": 2824
},
{
"epoch": 0.6947860304968028,
"grad_norm": 3.5504086017608643,
"learning_rate": 9.785313634692336e-06,
"loss": 1.1638,
"step": 2825
},
{
"epoch": 0.6950319724545008,
"grad_norm": 3.815469980239868,
"learning_rate": 9.785077800733196e-06,
"loss": 1.1877,
"step": 2826
},
{
"epoch": 0.6952779144121988,
"grad_norm": 3.3551113605499268,
"learning_rate": 9.784841840157859e-06,
"loss": 1.1253,
"step": 2827
},
{
"epoch": 0.6955238563698967,
"grad_norm": 3.537262201309204,
"learning_rate": 9.784605752972567e-06,
"loss": 1.1585,
"step": 2828
},
{
"epoch": 0.6957697983275947,
"grad_norm": 5.1484694480896,
"learning_rate": 9.784369539183569e-06,
"loss": 1.3999,
"step": 2829
},
{
"epoch": 0.6960157402852927,
"grad_norm": 3.2504913806915283,
"learning_rate": 9.784133198797115e-06,
"loss": 1.1674,
"step": 2830
},
{
"epoch": 0.6962616822429907,
"grad_norm": 4.012274742126465,
"learning_rate": 9.783896731819458e-06,
"loss": 1.2626,
"step": 2831
},
{
"epoch": 0.6965076242006887,
"grad_norm": 3.8755898475646973,
"learning_rate": 9.783660138256856e-06,
"loss": 1.3207,
"step": 2832
},
{
"epoch": 0.6967535661583866,
"grad_norm": 3.599823236465454,
"learning_rate": 9.783423418115568e-06,
"loss": 1.1014,
"step": 2833
},
{
"epoch": 0.6969995081160846,
"grad_norm": 3.759026288986206,
"learning_rate": 9.783186571401856e-06,
"loss": 1.2301,
"step": 2834
},
{
"epoch": 0.6972454500737826,
"grad_norm": 3.8635828495025635,
"learning_rate": 9.782949598121994e-06,
"loss": 1.3065,
"step": 2835
},
{
"epoch": 0.6974913920314806,
"grad_norm": 3.8118181228637695,
"learning_rate": 9.782712498282246e-06,
"loss": 1.3103,
"step": 2836
},
{
"epoch": 0.6977373339891786,
"grad_norm": 3.804013967514038,
"learning_rate": 9.782475271888887e-06,
"loss": 1.1896,
"step": 2837
},
{
"epoch": 0.6979832759468766,
"grad_norm": 3.8026750087738037,
"learning_rate": 9.782237918948196e-06,
"loss": 1.2391,
"step": 2838
},
{
"epoch": 0.6982292179045745,
"grad_norm": 3.9418399333953857,
"learning_rate": 9.782000439466454e-06,
"loss": 1.2176,
"step": 2839
},
{
"epoch": 0.6984751598622725,
"grad_norm": 3.5170843601226807,
"learning_rate": 9.781762833449942e-06,
"loss": 1.1838,
"step": 2840
},
{
"epoch": 0.6987211018199705,
"grad_norm": 3.7262866497039795,
"learning_rate": 9.781525100904949e-06,
"loss": 1.21,
"step": 2841
},
{
"epoch": 0.6989670437776685,
"grad_norm": 3.8299262523651123,
"learning_rate": 9.781287241837764e-06,
"loss": 1.3118,
"step": 2842
},
{
"epoch": 0.6992129857353665,
"grad_norm": 4.011529922485352,
"learning_rate": 9.781049256254682e-06,
"loss": 1.2905,
"step": 2843
},
{
"epoch": 0.6994589276930644,
"grad_norm": 4.057225704193115,
"learning_rate": 9.780811144162002e-06,
"loss": 1.4291,
"step": 2844
},
{
"epoch": 0.6997048696507624,
"grad_norm": 4.0152268409729,
"learning_rate": 9.780572905566022e-06,
"loss": 1.3641,
"step": 2845
},
{
"epoch": 0.6999508116084604,
"grad_norm": 3.5492146015167236,
"learning_rate": 9.780334540473046e-06,
"loss": 1.2951,
"step": 2846
},
{
"epoch": 0.7001967535661584,
"grad_norm": 3.7625820636749268,
"learning_rate": 9.780096048889385e-06,
"loss": 1.22,
"step": 2847
},
{
"epoch": 0.7004426955238564,
"grad_norm": 3.7146875858306885,
"learning_rate": 9.779857430821345e-06,
"loss": 1.2859,
"step": 2848
},
{
"epoch": 0.7006886374815543,
"grad_norm": 3.809061050415039,
"learning_rate": 9.779618686275241e-06,
"loss": 1.418,
"step": 2849
},
{
"epoch": 0.7009345794392523,
"grad_norm": 3.9956254959106445,
"learning_rate": 9.779379815257393e-06,
"loss": 1.1884,
"step": 2850
},
{
"epoch": 0.7011805213969503,
"grad_norm": 4.1078925132751465,
"learning_rate": 9.779140817774117e-06,
"loss": 1.2779,
"step": 2851
},
{
"epoch": 0.7014264633546483,
"grad_norm": 3.4641895294189453,
"learning_rate": 9.778901693831743e-06,
"loss": 1.152,
"step": 2852
},
{
"epoch": 0.7016724053123463,
"grad_norm": 3.8021063804626465,
"learning_rate": 9.778662443436592e-06,
"loss": 1.2624,
"step": 2853
},
{
"epoch": 0.7019183472700443,
"grad_norm": 4.149223327636719,
"learning_rate": 9.778423066595002e-06,
"loss": 1.4627,
"step": 2854
},
{
"epoch": 0.7021642892277422,
"grad_norm": 3.536099910736084,
"learning_rate": 9.7781835633133e-06,
"loss": 1.2838,
"step": 2855
},
{
"epoch": 0.7024102311854402,
"grad_norm": 3.4286208152770996,
"learning_rate": 9.777943933597827e-06,
"loss": 1.2508,
"step": 2856
},
{
"epoch": 0.7026561731431382,
"grad_norm": 3.7258353233337402,
"learning_rate": 9.777704177454923e-06,
"loss": 1.3215,
"step": 2857
},
{
"epoch": 0.7029021151008362,
"grad_norm": 3.6346137523651123,
"learning_rate": 9.777464294890933e-06,
"loss": 1.4125,
"step": 2858
},
{
"epoch": 0.7031480570585342,
"grad_norm": 3.6287670135498047,
"learning_rate": 9.777224285912203e-06,
"loss": 1.2432,
"step": 2859
},
{
"epoch": 0.7033939990162321,
"grad_norm": 3.53383731842041,
"learning_rate": 9.776984150525084e-06,
"loss": 1.2,
"step": 2860
},
{
"epoch": 0.7036399409739301,
"grad_norm": 3.9326975345611572,
"learning_rate": 9.776743888735932e-06,
"loss": 1.3792,
"step": 2861
},
{
"epoch": 0.7038858829316281,
"grad_norm": 3.466139554977417,
"learning_rate": 9.776503500551102e-06,
"loss": 1.1234,
"step": 2862
},
{
"epoch": 0.7041318248893261,
"grad_norm": 3.561073064804077,
"learning_rate": 9.776262985976955e-06,
"loss": 1.1418,
"step": 2863
},
{
"epoch": 0.7043777668470241,
"grad_norm": 3.467527151107788,
"learning_rate": 9.776022345019858e-06,
"loss": 1.0918,
"step": 2864
},
{
"epoch": 0.7046237088047221,
"grad_norm": 3.7290802001953125,
"learning_rate": 9.775781577686177e-06,
"loss": 1.2138,
"step": 2865
},
{
"epoch": 0.70486965076242,
"grad_norm": 3.962554931640625,
"learning_rate": 9.77554068398228e-06,
"loss": 1.3283,
"step": 2866
},
{
"epoch": 0.705115592720118,
"grad_norm": 3.7760744094848633,
"learning_rate": 9.775299663914546e-06,
"loss": 1.1969,
"step": 2867
},
{
"epoch": 0.705361534677816,
"grad_norm": 4.057610988616943,
"learning_rate": 9.775058517489349e-06,
"loss": 1.4732,
"step": 2868
},
{
"epoch": 0.705607476635514,
"grad_norm": 3.4469170570373535,
"learning_rate": 9.77481724471307e-06,
"loss": 1.2041,
"step": 2869
},
{
"epoch": 0.705853418593212,
"grad_norm": 3.8763043880462646,
"learning_rate": 9.774575845592096e-06,
"loss": 1.3294,
"step": 2870
},
{
"epoch": 0.7060993605509099,
"grad_norm": 3.502882957458496,
"learning_rate": 9.774334320132813e-06,
"loss": 1.155,
"step": 2871
},
{
"epoch": 0.7063453025086079,
"grad_norm": 4.266542434692383,
"learning_rate": 9.774092668341611e-06,
"loss": 1.2834,
"step": 2872
},
{
"epoch": 0.706591244466306,
"grad_norm": 3.6888976097106934,
"learning_rate": 9.773850890224885e-06,
"loss": 1.2752,
"step": 2873
},
{
"epoch": 0.706837186424004,
"grad_norm": 4.2063093185424805,
"learning_rate": 9.773608985789034e-06,
"loss": 1.3292,
"step": 2874
},
{
"epoch": 0.707083128381702,
"grad_norm": 3.5431721210479736,
"learning_rate": 9.773366955040455e-06,
"loss": 1.1859,
"step": 2875
},
{
"epoch": 0.7073290703394,
"grad_norm": 3.5552501678466797,
"learning_rate": 9.773124797985559e-06,
"loss": 1.2451,
"step": 2876
},
{
"epoch": 0.7075750122970978,
"grad_norm": 3.6684181690216064,
"learning_rate": 9.772882514630746e-06,
"loss": 1.2311,
"step": 2877
},
{
"epoch": 0.7078209542547959,
"grad_norm": 3.7658634185791016,
"learning_rate": 9.772640104982432e-06,
"loss": 1.3249,
"step": 2878
},
{
"epoch": 0.7080668962124939,
"grad_norm": 3.6768767833709717,
"learning_rate": 9.772397569047029e-06,
"loss": 1.2097,
"step": 2879
},
{
"epoch": 0.7083128381701919,
"grad_norm": 3.673858404159546,
"learning_rate": 9.772154906830954e-06,
"loss": 1.2437,
"step": 2880
},
{
"epoch": 0.7085587801278899,
"grad_norm": 3.96591854095459,
"learning_rate": 9.771912118340633e-06,
"loss": 1.326,
"step": 2881
},
{
"epoch": 0.7088047220855878,
"grad_norm": 3.621366262435913,
"learning_rate": 9.771669203582485e-06,
"loss": 1.1892,
"step": 2882
},
{
"epoch": 0.7090506640432858,
"grad_norm": 3.461729049682617,
"learning_rate": 9.77142616256294e-06,
"loss": 1.1828,
"step": 2883
},
{
"epoch": 0.7092966060009838,
"grad_norm": 3.683854103088379,
"learning_rate": 9.771182995288427e-06,
"loss": 1.2532,
"step": 2884
},
{
"epoch": 0.7095425479586818,
"grad_norm": 3.47440767288208,
"learning_rate": 9.770939701765382e-06,
"loss": 1.1527,
"step": 2885
},
{
"epoch": 0.7097884899163798,
"grad_norm": 3.837763786315918,
"learning_rate": 9.770696282000245e-06,
"loss": 1.3401,
"step": 2886
},
{
"epoch": 0.7100344318740777,
"grad_norm": 3.63362979888916,
"learning_rate": 9.770452735999453e-06,
"loss": 1.2236,
"step": 2887
},
{
"epoch": 0.7102803738317757,
"grad_norm": 3.6551363468170166,
"learning_rate": 9.770209063769453e-06,
"loss": 1.2772,
"step": 2888
},
{
"epoch": 0.7105263157894737,
"grad_norm": 3.9384374618530273,
"learning_rate": 9.76996526531669e-06,
"loss": 1.3429,
"step": 2889
},
{
"epoch": 0.7107722577471717,
"grad_norm": 3.745328187942505,
"learning_rate": 9.769721340647618e-06,
"loss": 1.2927,
"step": 2890
},
{
"epoch": 0.7110181997048697,
"grad_norm": 3.270646810531616,
"learning_rate": 9.76947728976869e-06,
"loss": 1.121,
"step": 2891
},
{
"epoch": 0.7112641416625677,
"grad_norm": 3.600285768508911,
"learning_rate": 9.769233112686363e-06,
"loss": 1.2561,
"step": 2892
},
{
"epoch": 0.7115100836202656,
"grad_norm": 3.549861192703247,
"learning_rate": 9.768988809407101e-06,
"loss": 1.2058,
"step": 2893
},
{
"epoch": 0.7117560255779636,
"grad_norm": 3.678018569946289,
"learning_rate": 9.768744379937364e-06,
"loss": 1.1534,
"step": 2894
},
{
"epoch": 0.7120019675356616,
"grad_norm": 3.594353437423706,
"learning_rate": 9.768499824283623e-06,
"loss": 1.2794,
"step": 2895
},
{
"epoch": 0.7122479094933596,
"grad_norm": 3.7944607734680176,
"learning_rate": 9.76825514245235e-06,
"loss": 1.3293,
"step": 2896
},
{
"epoch": 0.7124938514510576,
"grad_norm": 3.829256296157837,
"learning_rate": 9.768010334450015e-06,
"loss": 1.2832,
"step": 2897
},
{
"epoch": 0.7127397934087555,
"grad_norm": 3.7937026023864746,
"learning_rate": 9.7677654002831e-06,
"loss": 1.2483,
"step": 2898
},
{
"epoch": 0.7129857353664535,
"grad_norm": 3.4295127391815186,
"learning_rate": 9.767520339958083e-06,
"loss": 1.1398,
"step": 2899
},
{
"epoch": 0.7132316773241515,
"grad_norm": 3.81139874458313,
"learning_rate": 9.767275153481452e-06,
"loss": 1.2626,
"step": 2900
},
{
"epoch": 0.7134776192818495,
"grad_norm": 3.6892337799072266,
"learning_rate": 9.767029840859692e-06,
"loss": 1.2358,
"step": 2901
},
{
"epoch": 0.7137235612395475,
"grad_norm": 3.882338285446167,
"learning_rate": 9.766784402099297e-06,
"loss": 1.2294,
"step": 2902
},
{
"epoch": 0.7139695031972455,
"grad_norm": 3.2959883213043213,
"learning_rate": 9.766538837206758e-06,
"loss": 1.2541,
"step": 2903
},
{
"epoch": 0.7142154451549434,
"grad_norm": 3.611804723739624,
"learning_rate": 9.766293146188572e-06,
"loss": 1.2376,
"step": 2904
},
{
"epoch": 0.7144613871126414,
"grad_norm": 4.193320274353027,
"learning_rate": 9.766047329051244e-06,
"loss": 1.5058,
"step": 2905
},
{
"epoch": 0.7147073290703394,
"grad_norm": 3.623293876647949,
"learning_rate": 9.76580138580128e-06,
"loss": 1.158,
"step": 2906
},
{
"epoch": 0.7149532710280374,
"grad_norm": 3.426072120666504,
"learning_rate": 9.76555531644518e-06,
"loss": 1.2268,
"step": 2907
},
{
"epoch": 0.7151992129857354,
"grad_norm": 3.65736985206604,
"learning_rate": 9.765309120989462e-06,
"loss": 1.2315,
"step": 2908
},
{
"epoch": 0.7154451549434333,
"grad_norm": 3.323507070541382,
"learning_rate": 9.765062799440638e-06,
"loss": 1.2323,
"step": 2909
},
{
"epoch": 0.7156910969011313,
"grad_norm": 3.750087261199951,
"learning_rate": 9.764816351805227e-06,
"loss": 1.1862,
"step": 2910
},
{
"epoch": 0.7159370388588293,
"grad_norm": 3.353872776031494,
"learning_rate": 9.764569778089747e-06,
"loss": 1.1779,
"step": 2911
},
{
"epoch": 0.7161829808165273,
"grad_norm": 3.558509349822998,
"learning_rate": 9.764323078300725e-06,
"loss": 1.1825,
"step": 2912
},
{
"epoch": 0.7164289227742253,
"grad_norm": 3.5206387042999268,
"learning_rate": 9.764076252444692e-06,
"loss": 1.202,
"step": 2913
},
{
"epoch": 0.7166748647319232,
"grad_norm": 3.916215181350708,
"learning_rate": 9.763829300528173e-06,
"loss": 1.353,
"step": 2914
},
{
"epoch": 0.7169208066896212,
"grad_norm": 3.5607190132141113,
"learning_rate": 9.763582222557705e-06,
"loss": 1.226,
"step": 2915
},
{
"epoch": 0.7171667486473192,
"grad_norm": 3.4083847999572754,
"learning_rate": 9.763335018539828e-06,
"loss": 1.2527,
"step": 2916
},
{
"epoch": 0.7174126906050172,
"grad_norm": 3.6419665813446045,
"learning_rate": 9.763087688481079e-06,
"loss": 1.2185,
"step": 2917
},
{
"epoch": 0.7176586325627152,
"grad_norm": 3.6154086589813232,
"learning_rate": 9.762840232388007e-06,
"loss": 1.1802,
"step": 2918
},
{
"epoch": 0.7179045745204132,
"grad_norm": 3.657057046890259,
"learning_rate": 9.762592650267156e-06,
"loss": 1.1656,
"step": 2919
},
{
"epoch": 0.7181505164781111,
"grad_norm": 3.642827272415161,
"learning_rate": 9.762344942125079e-06,
"loss": 1.2328,
"step": 2920
},
{
"epoch": 0.7183964584358091,
"grad_norm": 3.8019988536834717,
"learning_rate": 9.762097107968331e-06,
"loss": 1.207,
"step": 2921
},
{
"epoch": 0.7186424003935071,
"grad_norm": 4.130279064178467,
"learning_rate": 9.76184914780347e-06,
"loss": 1.1954,
"step": 2922
},
{
"epoch": 0.7188883423512051,
"grad_norm": 3.901109457015991,
"learning_rate": 9.761601061637055e-06,
"loss": 1.1881,
"step": 2923
},
{
"epoch": 0.7191342843089031,
"grad_norm": 3.46006441116333,
"learning_rate": 9.761352849475655e-06,
"loss": 1.2567,
"step": 2924
},
{
"epoch": 0.719380226266601,
"grad_norm": 4.178497791290283,
"learning_rate": 9.761104511325831e-06,
"loss": 1.2713,
"step": 2925
},
{
"epoch": 0.719626168224299,
"grad_norm": 3.577718734741211,
"learning_rate": 9.760856047194158e-06,
"loss": 1.0972,
"step": 2926
},
{
"epoch": 0.719872110181997,
"grad_norm": 3.3647210597991943,
"learning_rate": 9.760607457087212e-06,
"loss": 1.1447,
"step": 2927
},
{
"epoch": 0.720118052139695,
"grad_norm": 3.6690170764923096,
"learning_rate": 9.76035874101157e-06,
"loss": 1.1957,
"step": 2928
},
{
"epoch": 0.720363994097393,
"grad_norm": 3.7888054847717285,
"learning_rate": 9.76010989897381e-06,
"loss": 1.2102,
"step": 2929
},
{
"epoch": 0.720609936055091,
"grad_norm": 3.7484934329986572,
"learning_rate": 9.759860930980521e-06,
"loss": 1.3495,
"step": 2930
},
{
"epoch": 0.7208558780127889,
"grad_norm": 3.5072691440582275,
"learning_rate": 9.75961183703829e-06,
"loss": 1.2036,
"step": 2931
},
{
"epoch": 0.721101819970487,
"grad_norm": 3.7658798694610596,
"learning_rate": 9.759362617153705e-06,
"loss": 1.2804,
"step": 2932
},
{
"epoch": 0.721347761928185,
"grad_norm": 3.968895196914673,
"learning_rate": 9.759113271333362e-06,
"loss": 1.2743,
"step": 2933
},
{
"epoch": 0.721593703885883,
"grad_norm": 3.5986311435699463,
"learning_rate": 9.758863799583862e-06,
"loss": 1.1564,
"step": 2934
},
{
"epoch": 0.721839645843581,
"grad_norm": 3.421478271484375,
"learning_rate": 9.7586142019118e-06,
"loss": 1.0138,
"step": 2935
},
{
"epoch": 0.7220855878012788,
"grad_norm": 3.6207969188690186,
"learning_rate": 9.758364478323787e-06,
"loss": 1.3793,
"step": 2936
},
{
"epoch": 0.7223315297589769,
"grad_norm": 3.7541399002075195,
"learning_rate": 9.758114628826427e-06,
"loss": 1.3709,
"step": 2937
},
{
"epoch": 0.7225774717166749,
"grad_norm": 3.7359983921051025,
"learning_rate": 9.757864653426333e-06,
"loss": 1.399,
"step": 2938
},
{
"epoch": 0.7228234136743729,
"grad_norm": 3.289734363555908,
"learning_rate": 9.757614552130118e-06,
"loss": 1.275,
"step": 2939
},
{
"epoch": 0.7230693556320709,
"grad_norm": 3.4785022735595703,
"learning_rate": 9.7573643249444e-06,
"loss": 1.2895,
"step": 2940
},
{
"epoch": 0.7233152975897689,
"grad_norm": 4.442722320556641,
"learning_rate": 9.7571139718758e-06,
"loss": 1.3391,
"step": 2941
},
{
"epoch": 0.7235612395474668,
"grad_norm": 3.4361846446990967,
"learning_rate": 9.756863492930946e-06,
"loss": 1.1884,
"step": 2942
},
{
"epoch": 0.7238071815051648,
"grad_norm": 3.847398042678833,
"learning_rate": 9.756612888116461e-06,
"loss": 1.2647,
"step": 2943
},
{
"epoch": 0.7240531234628628,
"grad_norm": 3.4656386375427246,
"learning_rate": 9.75636215743898e-06,
"loss": 1.161,
"step": 2944
},
{
"epoch": 0.7242990654205608,
"grad_norm": 3.348834753036499,
"learning_rate": 9.756111300905132e-06,
"loss": 1.1127,
"step": 2945
},
{
"epoch": 0.7245450073782588,
"grad_norm": 3.8643219470977783,
"learning_rate": 9.755860318521561e-06,
"loss": 1.4208,
"step": 2946
},
{
"epoch": 0.7247909493359567,
"grad_norm": 3.7468278408050537,
"learning_rate": 9.755609210294906e-06,
"loss": 1.1908,
"step": 2947
},
{
"epoch": 0.7250368912936547,
"grad_norm": 3.6190948486328125,
"learning_rate": 9.75535797623181e-06,
"loss": 1.3348,
"step": 2948
},
{
"epoch": 0.7252828332513527,
"grad_norm": 3.818368673324585,
"learning_rate": 9.755106616338922e-06,
"loss": 1.4482,
"step": 2949
},
{
"epoch": 0.7255287752090507,
"grad_norm": 3.9183719158172607,
"learning_rate": 9.754855130622893e-06,
"loss": 1.2421,
"step": 2950
},
{
"epoch": 0.7257747171667487,
"grad_norm": 3.611154794692993,
"learning_rate": 9.754603519090377e-06,
"loss": 1.2609,
"step": 2951
},
{
"epoch": 0.7260206591244466,
"grad_norm": 3.636075019836426,
"learning_rate": 9.754351781748034e-06,
"loss": 1.1701,
"step": 2952
},
{
"epoch": 0.7262666010821446,
"grad_norm": 3.861968517303467,
"learning_rate": 9.754099918602523e-06,
"loss": 1.2491,
"step": 2953
},
{
"epoch": 0.7265125430398426,
"grad_norm": 4.075852394104004,
"learning_rate": 9.753847929660506e-06,
"loss": 1.31,
"step": 2954
},
{
"epoch": 0.7267584849975406,
"grad_norm": 3.6656057834625244,
"learning_rate": 9.753595814928658e-06,
"loss": 1.1624,
"step": 2955
},
{
"epoch": 0.7270044269552386,
"grad_norm": 3.4636499881744385,
"learning_rate": 9.753343574413644e-06,
"loss": 1.1251,
"step": 2956
},
{
"epoch": 0.7272503689129366,
"grad_norm": 3.563232421875,
"learning_rate": 9.753091208122141e-06,
"loss": 1.2266,
"step": 2957
},
{
"epoch": 0.7274963108706345,
"grad_norm": 3.734705924987793,
"learning_rate": 9.752838716060823e-06,
"loss": 1.2133,
"step": 2958
},
{
"epoch": 0.7277422528283325,
"grad_norm": 3.6140034198760986,
"learning_rate": 9.752586098236378e-06,
"loss": 1.2584,
"step": 2959
},
{
"epoch": 0.7279881947860305,
"grad_norm": 4.484870910644531,
"learning_rate": 9.752333354655485e-06,
"loss": 1.3254,
"step": 2960
},
{
"epoch": 0.7282341367437285,
"grad_norm": 3.574359893798828,
"learning_rate": 9.752080485324833e-06,
"loss": 1.2853,
"step": 2961
},
{
"epoch": 0.7284800787014265,
"grad_norm": 3.7556087970733643,
"learning_rate": 9.751827490251114e-06,
"loss": 1.1769,
"step": 2962
},
{
"epoch": 0.7287260206591244,
"grad_norm": 3.3782081604003906,
"learning_rate": 9.751574369441022e-06,
"loss": 1.1203,
"step": 2963
},
{
"epoch": 0.7289719626168224,
"grad_norm": 3.7650489807128906,
"learning_rate": 9.751321122901254e-06,
"loss": 1.215,
"step": 2964
},
{
"epoch": 0.7292179045745204,
"grad_norm": 4.468428134918213,
"learning_rate": 9.751067750638513e-06,
"loss": 1.4191,
"step": 2965
},
{
"epoch": 0.7294638465322184,
"grad_norm": 3.2760260105133057,
"learning_rate": 9.7508142526595e-06,
"loss": 1.1105,
"step": 2966
},
{
"epoch": 0.7297097884899164,
"grad_norm": 3.1928229331970215,
"learning_rate": 9.750560628970924e-06,
"loss": 1.1323,
"step": 2967
},
{
"epoch": 0.7299557304476144,
"grad_norm": 3.866544485092163,
"learning_rate": 9.750306879579499e-06,
"loss": 1.2242,
"step": 2968
},
{
"epoch": 0.7302016724053123,
"grad_norm": 3.6297245025634766,
"learning_rate": 9.750053004491937e-06,
"loss": 1.2108,
"step": 2969
},
{
"epoch": 0.7304476143630103,
"grad_norm": 3.4796388149261475,
"learning_rate": 9.749799003714954e-06,
"loss": 1.283,
"step": 2970
},
{
"epoch": 0.7306935563207083,
"grad_norm": 3.910794973373413,
"learning_rate": 9.749544877255275e-06,
"loss": 1.1354,
"step": 2971
},
{
"epoch": 0.7309394982784063,
"grad_norm": 3.865950107574463,
"learning_rate": 9.74929062511962e-06,
"loss": 1.1598,
"step": 2972
},
{
"epoch": 0.7311854402361043,
"grad_norm": 4.091001033782959,
"learning_rate": 9.749036247314721e-06,
"loss": 1.1911,
"step": 2973
},
{
"epoch": 0.7314313821938022,
"grad_norm": 3.927271842956543,
"learning_rate": 9.748781743847306e-06,
"loss": 1.3786,
"step": 2974
},
{
"epoch": 0.7316773241515002,
"grad_norm": 3.3954954147338867,
"learning_rate": 9.748527114724111e-06,
"loss": 1.121,
"step": 2975
},
{
"epoch": 0.7319232661091982,
"grad_norm": 3.8660738468170166,
"learning_rate": 9.748272359951873e-06,
"loss": 1.2834,
"step": 2976
},
{
"epoch": 0.7321692080668962,
"grad_norm": 3.599734306335449,
"learning_rate": 9.74801747953733e-06,
"loss": 1.1082,
"step": 2977
},
{
"epoch": 0.7324151500245942,
"grad_norm": 3.568272113800049,
"learning_rate": 9.747762473487233e-06,
"loss": 1.3331,
"step": 2978
},
{
"epoch": 0.7326610919822922,
"grad_norm": 3.6521451473236084,
"learning_rate": 9.747507341808323e-06,
"loss": 1.2377,
"step": 2979
},
{
"epoch": 0.7329070339399901,
"grad_norm": 3.5281732082366943,
"learning_rate": 9.747252084507355e-06,
"loss": 1.2657,
"step": 2980
},
{
"epoch": 0.7331529758976881,
"grad_norm": 3.4320714473724365,
"learning_rate": 9.74699670159108e-06,
"loss": 1.1058,
"step": 2981
},
{
"epoch": 0.7333989178553861,
"grad_norm": 3.7616279125213623,
"learning_rate": 9.746741193066261e-06,
"loss": 1.263,
"step": 2982
},
{
"epoch": 0.7336448598130841,
"grad_norm": 4.189964294433594,
"learning_rate": 9.746485558939654e-06,
"loss": 1.3962,
"step": 2983
},
{
"epoch": 0.7338908017707821,
"grad_norm": 3.294914484024048,
"learning_rate": 9.746229799218023e-06,
"loss": 1.1687,
"step": 2984
},
{
"epoch": 0.73413674372848,
"grad_norm": 3.8283896446228027,
"learning_rate": 9.745973913908139e-06,
"loss": 1.2627,
"step": 2985
},
{
"epoch": 0.734382685686178,
"grad_norm": 3.9916651248931885,
"learning_rate": 9.74571790301677e-06,
"loss": 1.1588,
"step": 2986
},
{
"epoch": 0.734628627643876,
"grad_norm": 3.7822256088256836,
"learning_rate": 9.745461766550692e-06,
"loss": 1.2428,
"step": 2987
},
{
"epoch": 0.734874569601574,
"grad_norm": 3.8073809146881104,
"learning_rate": 9.74520550451668e-06,
"loss": 1.284,
"step": 2988
},
{
"epoch": 0.735120511559272,
"grad_norm": 3.796330213546753,
"learning_rate": 9.74494911692152e-06,
"loss": 1.2517,
"step": 2989
},
{
"epoch": 0.7353664535169699,
"grad_norm": 3.492372989654541,
"learning_rate": 9.744692603771993e-06,
"loss": 1.0667,
"step": 2990
},
{
"epoch": 0.735612395474668,
"grad_norm": 3.81075119972229,
"learning_rate": 9.744435965074885e-06,
"loss": 1.2488,
"step": 2991
},
{
"epoch": 0.735858337432366,
"grad_norm": 3.9589345455169678,
"learning_rate": 9.744179200836989e-06,
"loss": 1.4611,
"step": 2992
},
{
"epoch": 0.736104279390064,
"grad_norm": 4.010930061340332,
"learning_rate": 9.743922311065096e-06,
"loss": 1.2064,
"step": 2993
},
{
"epoch": 0.736350221347762,
"grad_norm": 3.753709077835083,
"learning_rate": 9.74366529576601e-06,
"loss": 1.1756,
"step": 2994
},
{
"epoch": 0.73659616330546,
"grad_norm": 3.8984506130218506,
"learning_rate": 9.743408154946525e-06,
"loss": 1.2002,
"step": 2995
},
{
"epoch": 0.7368421052631579,
"grad_norm": 3.8449578285217285,
"learning_rate": 9.743150888613447e-06,
"loss": 1.2847,
"step": 2996
},
{
"epoch": 0.7370880472208559,
"grad_norm": 3.6768381595611572,
"learning_rate": 9.742893496773585e-06,
"loss": 1.1374,
"step": 2997
},
{
"epoch": 0.7373339891785539,
"grad_norm": 3.6121773719787598,
"learning_rate": 9.74263597943375e-06,
"loss": 1.2644,
"step": 2998
},
{
"epoch": 0.7375799311362519,
"grad_norm": 4.121194362640381,
"learning_rate": 9.742378336600756e-06,
"loss": 1.1564,
"step": 2999
},
{
"epoch": 0.7378258730939499,
"grad_norm": 3.404369354248047,
"learning_rate": 9.742120568281417e-06,
"loss": 1.1443,
"step": 3000
},
{
"epoch": 0.7378258730939499,
"eval_loss": 1.2704724073410034,
"eval_runtime": 13.694,
"eval_samples_per_second": 29.21,
"eval_steps_per_second": 3.651,
"step": 3000
},
{
"epoch": 0.7380718150516478,
"grad_norm": 3.5498878955841064,
"learning_rate": 9.741862674482556e-06,
"loss": 1.1763,
"step": 3001
},
{
"epoch": 0.7383177570093458,
"grad_norm": 3.783893585205078,
"learning_rate": 9.741604655210999e-06,
"loss": 1.2208,
"step": 3002
},
{
"epoch": 0.7385636989670438,
"grad_norm": 3.8119053840637207,
"learning_rate": 9.74134651047357e-06,
"loss": 1.3563,
"step": 3003
},
{
"epoch": 0.7388096409247418,
"grad_norm": 3.3641316890716553,
"learning_rate": 9.7410882402771e-06,
"loss": 1.2421,
"step": 3004
},
{
"epoch": 0.7390555828824398,
"grad_norm": 3.657059907913208,
"learning_rate": 9.740829844628428e-06,
"loss": 1.0732,
"step": 3005
},
{
"epoch": 0.7393015248401378,
"grad_norm": 4.262376308441162,
"learning_rate": 9.740571323534385e-06,
"loss": 1.4001,
"step": 3006
},
{
"epoch": 0.7395474667978357,
"grad_norm": 3.3992342948913574,
"learning_rate": 9.740312677001815e-06,
"loss": 1.1748,
"step": 3007
},
{
"epoch": 0.7397934087555337,
"grad_norm": 3.6153645515441895,
"learning_rate": 9.74005390503756e-06,
"loss": 1.3133,
"step": 3008
},
{
"epoch": 0.7400393507132317,
"grad_norm": 3.4278006553649902,
"learning_rate": 9.73979500764847e-06,
"loss": 1.2214,
"step": 3009
},
{
"epoch": 0.7402852926709297,
"grad_norm": 3.415449619293213,
"learning_rate": 9.739535984841392e-06,
"loss": 1.1172,
"step": 3010
},
{
"epoch": 0.7405312346286277,
"grad_norm": 3.863985061645508,
"learning_rate": 9.739276836623184e-06,
"loss": 1.3808,
"step": 3011
},
{
"epoch": 0.7407771765863256,
"grad_norm": 3.5708205699920654,
"learning_rate": 9.739017563000699e-06,
"loss": 1.1905,
"step": 3012
},
{
"epoch": 0.7410231185440236,
"grad_norm": 3.898135185241699,
"learning_rate": 9.738758163980802e-06,
"loss": 1.3806,
"step": 3013
},
{
"epoch": 0.7412690605017216,
"grad_norm": 3.5047130584716797,
"learning_rate": 9.738498639570353e-06,
"loss": 1.1617,
"step": 3014
},
{
"epoch": 0.7415150024594196,
"grad_norm": 3.6734139919281006,
"learning_rate": 9.73823898977622e-06,
"loss": 1.1007,
"step": 3015
},
{
"epoch": 0.7417609444171176,
"grad_norm": 3.9104254245758057,
"learning_rate": 9.737979214605275e-06,
"loss": 1.3017,
"step": 3016
},
{
"epoch": 0.7420068863748155,
"grad_norm": 4.0104780197143555,
"learning_rate": 9.737719314064391e-06,
"loss": 1.3947,
"step": 3017
},
{
"epoch": 0.7422528283325135,
"grad_norm": 4.125523090362549,
"learning_rate": 9.737459288160445e-06,
"loss": 1.2268,
"step": 3018
},
{
"epoch": 0.7424987702902115,
"grad_norm": 3.831815481185913,
"learning_rate": 9.737199136900317e-06,
"loss": 1.3475,
"step": 3019
},
{
"epoch": 0.7427447122479095,
"grad_norm": 3.837393283843994,
"learning_rate": 9.736938860290892e-06,
"loss": 1.2649,
"step": 3020
},
{
"epoch": 0.7429906542056075,
"grad_norm": 3.65067720413208,
"learning_rate": 9.736678458339058e-06,
"loss": 1.1135,
"step": 3021
},
{
"epoch": 0.7432365961633055,
"grad_norm": 3.668827533721924,
"learning_rate": 9.7364179310517e-06,
"loss": 1.2376,
"step": 3022
},
{
"epoch": 0.7434825381210034,
"grad_norm": 3.7406487464904785,
"learning_rate": 9.736157278435719e-06,
"loss": 1.29,
"step": 3023
},
{
"epoch": 0.7437284800787014,
"grad_norm": 3.5215630531311035,
"learning_rate": 9.735896500498008e-06,
"loss": 1.142,
"step": 3024
},
{
"epoch": 0.7439744220363994,
"grad_norm": 3.3483617305755615,
"learning_rate": 9.735635597245468e-06,
"loss": 1.2089,
"step": 3025
},
{
"epoch": 0.7442203639940974,
"grad_norm": 3.6867120265960693,
"learning_rate": 9.735374568685e-06,
"loss": 1.2144,
"step": 3026
},
{
"epoch": 0.7444663059517954,
"grad_norm": 3.65517520904541,
"learning_rate": 9.735113414823517e-06,
"loss": 1.2043,
"step": 3027
},
{
"epoch": 0.7447122479094933,
"grad_norm": 4.00148868560791,
"learning_rate": 9.734852135667923e-06,
"loss": 1.3266,
"step": 3028
},
{
"epoch": 0.7449581898671913,
"grad_norm": 3.7628579139709473,
"learning_rate": 9.734590731225137e-06,
"loss": 1.245,
"step": 3029
},
{
"epoch": 0.7452041318248893,
"grad_norm": 3.820296287536621,
"learning_rate": 9.734329201502072e-06,
"loss": 1.3772,
"step": 3030
},
{
"epoch": 0.7454500737825873,
"grad_norm": 3.931698799133301,
"learning_rate": 9.73406754650565e-06,
"loss": 1.1741,
"step": 3031
},
{
"epoch": 0.7456960157402853,
"grad_norm": 3.634464740753174,
"learning_rate": 9.733805766242794e-06,
"loss": 1.1811,
"step": 3032
},
{
"epoch": 0.7459419576979833,
"grad_norm": 3.655820846557617,
"learning_rate": 9.73354386072043e-06,
"loss": 1.0986,
"step": 3033
},
{
"epoch": 0.7461878996556812,
"grad_norm": 4.007586479187012,
"learning_rate": 9.733281829945491e-06,
"loss": 1.2913,
"step": 3034
},
{
"epoch": 0.7464338416133792,
"grad_norm": 3.6289966106414795,
"learning_rate": 9.733019673924909e-06,
"loss": 1.2161,
"step": 3035
},
{
"epoch": 0.7466797835710772,
"grad_norm": 3.6006522178649902,
"learning_rate": 9.73275739266562e-06,
"loss": 1.0846,
"step": 3036
},
{
"epoch": 0.7469257255287752,
"grad_norm": 3.283454656600952,
"learning_rate": 9.732494986174564e-06,
"loss": 1.0433,
"step": 3037
},
{
"epoch": 0.7471716674864732,
"grad_norm": 4.091580867767334,
"learning_rate": 9.732232454458686e-06,
"loss": 1.2495,
"step": 3038
},
{
"epoch": 0.7474176094441711,
"grad_norm": 3.599839448928833,
"learning_rate": 9.73196979752493e-06,
"loss": 1.2603,
"step": 3039
},
{
"epoch": 0.7476635514018691,
"grad_norm": 3.1823928356170654,
"learning_rate": 9.73170701538025e-06,
"loss": 1.0015,
"step": 3040
},
{
"epoch": 0.7479094933595671,
"grad_norm": 3.691218852996826,
"learning_rate": 9.731444108031597e-06,
"loss": 1.2469,
"step": 3041
},
{
"epoch": 0.7481554353172651,
"grad_norm": 4.038862228393555,
"learning_rate": 9.731181075485929e-06,
"loss": 1.3374,
"step": 3042
},
{
"epoch": 0.7484013772749631,
"grad_norm": 3.6620876789093018,
"learning_rate": 9.730917917750203e-06,
"loss": 1.2218,
"step": 3043
},
{
"epoch": 0.7486473192326611,
"grad_norm": 3.6090919971466064,
"learning_rate": 9.730654634831386e-06,
"loss": 1.315,
"step": 3044
},
{
"epoch": 0.748893261190359,
"grad_norm": 3.6756200790405273,
"learning_rate": 9.730391226736442e-06,
"loss": 1.1818,
"step": 3045
},
{
"epoch": 0.749139203148057,
"grad_norm": 3.319904327392578,
"learning_rate": 9.730127693472343e-06,
"loss": 1.0849,
"step": 3046
},
{
"epoch": 0.749385145105755,
"grad_norm": 3.579397439956665,
"learning_rate": 9.729864035046059e-06,
"loss": 1.2807,
"step": 3047
},
{
"epoch": 0.749631087063453,
"grad_norm": 3.6093971729278564,
"learning_rate": 9.729600251464572e-06,
"loss": 1.2069,
"step": 3048
},
{
"epoch": 0.749877029021151,
"grad_norm": 3.5340816974639893,
"learning_rate": 9.729336342734857e-06,
"loss": 1.1503,
"step": 3049
},
{
"epoch": 0.750122970978849,
"grad_norm": 3.423492431640625,
"learning_rate": 9.729072308863898e-06,
"loss": 1.1173,
"step": 3050
},
{
"epoch": 0.750368912936547,
"grad_norm": 3.824312448501587,
"learning_rate": 9.728808149858684e-06,
"loss": 1.2011,
"step": 3051
},
{
"epoch": 0.750614854894245,
"grad_norm": 3.97580885887146,
"learning_rate": 9.728543865726201e-06,
"loss": 1.1437,
"step": 3052
},
{
"epoch": 0.750860796851943,
"grad_norm": 4.030182838439941,
"learning_rate": 9.728279456473445e-06,
"loss": 1.4221,
"step": 3053
},
{
"epoch": 0.751106738809641,
"grad_norm": 3.714268207550049,
"learning_rate": 9.728014922107414e-06,
"loss": 1.3326,
"step": 3054
},
{
"epoch": 0.7513526807673389,
"grad_norm": 3.893012523651123,
"learning_rate": 9.727750262635105e-06,
"loss": 1.2377,
"step": 3055
},
{
"epoch": 0.7515986227250369,
"grad_norm": 3.469959259033203,
"learning_rate": 9.72748547806352e-06,
"loss": 1.1195,
"step": 3056
},
{
"epoch": 0.7518445646827349,
"grad_norm": 3.7841293811798096,
"learning_rate": 9.727220568399667e-06,
"loss": 1.2166,
"step": 3057
},
{
"epoch": 0.7520905066404329,
"grad_norm": 3.508549928665161,
"learning_rate": 9.726955533650557e-06,
"loss": 1.3407,
"step": 3058
},
{
"epoch": 0.7523364485981309,
"grad_norm": 3.8820338249206543,
"learning_rate": 9.7266903738232e-06,
"loss": 1.2629,
"step": 3059
},
{
"epoch": 0.7525823905558289,
"grad_norm": 3.9238831996917725,
"learning_rate": 9.726425088924616e-06,
"loss": 1.3726,
"step": 3060
},
{
"epoch": 0.7528283325135268,
"grad_norm": 3.409585475921631,
"learning_rate": 9.72615967896182e-06,
"loss": 1.1704,
"step": 3061
},
{
"epoch": 0.7530742744712248,
"grad_norm": 3.361255168914795,
"learning_rate": 9.725894143941838e-06,
"loss": 1.2415,
"step": 3062
},
{
"epoch": 0.7533202164289228,
"grad_norm": 3.4112610816955566,
"learning_rate": 9.725628483871697e-06,
"loss": 1.2384,
"step": 3063
},
{
"epoch": 0.7535661583866208,
"grad_norm": 3.663153886795044,
"learning_rate": 9.725362698758425e-06,
"loss": 1.1991,
"step": 3064
},
{
"epoch": 0.7538121003443188,
"grad_norm": 3.8682594299316406,
"learning_rate": 9.725096788609055e-06,
"loss": 1.2724,
"step": 3065
},
{
"epoch": 0.7540580423020167,
"grad_norm": 3.915640354156494,
"learning_rate": 9.72483075343062e-06,
"loss": 1.2863,
"step": 3066
},
{
"epoch": 0.7543039842597147,
"grad_norm": 4.032073020935059,
"learning_rate": 9.724564593230167e-06,
"loss": 1.227,
"step": 3067
},
{
"epoch": 0.7545499262174127,
"grad_norm": 3.5458414554595947,
"learning_rate": 9.724298308014733e-06,
"loss": 1.3136,
"step": 3068
},
{
"epoch": 0.7547958681751107,
"grad_norm": 3.299471139907837,
"learning_rate": 9.724031897791365e-06,
"loss": 1.1416,
"step": 3069
},
{
"epoch": 0.7550418101328087,
"grad_norm": 3.306596040725708,
"learning_rate": 9.723765362567111e-06,
"loss": 1.1977,
"step": 3070
},
{
"epoch": 0.7552877520905067,
"grad_norm": 3.644564628601074,
"learning_rate": 9.723498702349027e-06,
"loss": 1.2319,
"step": 3071
},
{
"epoch": 0.7555336940482046,
"grad_norm": 4.068142890930176,
"learning_rate": 9.723231917144168e-06,
"loss": 1.3135,
"step": 3072
},
{
"epoch": 0.7557796360059026,
"grad_norm": 3.747575521469116,
"learning_rate": 9.722965006959593e-06,
"loss": 1.264,
"step": 3073
},
{
"epoch": 0.7560255779636006,
"grad_norm": 3.4644269943237305,
"learning_rate": 9.722697971802363e-06,
"loss": 1.1086,
"step": 3074
},
{
"epoch": 0.7562715199212986,
"grad_norm": 3.3824708461761475,
"learning_rate": 9.722430811679548e-06,
"loss": 1.1349,
"step": 3075
},
{
"epoch": 0.7565174618789966,
"grad_norm": 4.0759711265563965,
"learning_rate": 9.722163526598214e-06,
"loss": 1.2836,
"step": 3076
},
{
"epoch": 0.7567634038366945,
"grad_norm": 3.897371292114258,
"learning_rate": 9.721896116565432e-06,
"loss": 1.222,
"step": 3077
},
{
"epoch": 0.7570093457943925,
"grad_norm": 4.024113178253174,
"learning_rate": 9.721628581588282e-06,
"loss": 1.3631,
"step": 3078
},
{
"epoch": 0.7572552877520905,
"grad_norm": 3.3536343574523926,
"learning_rate": 9.72136092167384e-06,
"loss": 1.2124,
"step": 3079
},
{
"epoch": 0.7575012297097885,
"grad_norm": 3.3221099376678467,
"learning_rate": 9.721093136829189e-06,
"loss": 1.1122,
"step": 3080
},
{
"epoch": 0.7577471716674865,
"grad_norm": 3.5662848949432373,
"learning_rate": 9.720825227061417e-06,
"loss": 1.1967,
"step": 3081
},
{
"epoch": 0.7579931136251845,
"grad_norm": 3.9111878871917725,
"learning_rate": 9.72055719237761e-06,
"loss": 1.3196,
"step": 3082
},
{
"epoch": 0.7582390555828824,
"grad_norm": 3.415342092514038,
"learning_rate": 9.720289032784863e-06,
"loss": 1.2552,
"step": 3083
},
{
"epoch": 0.7584849975405804,
"grad_norm": 3.666435956954956,
"learning_rate": 9.720020748290271e-06,
"loss": 1.3513,
"step": 3084
},
{
"epoch": 0.7587309394982784,
"grad_norm": 3.290372371673584,
"learning_rate": 9.71975233890093e-06,
"loss": 1.1846,
"step": 3085
},
{
"epoch": 0.7589768814559764,
"grad_norm": 3.46256422996521,
"learning_rate": 9.719483804623946e-06,
"loss": 1.3121,
"step": 3086
},
{
"epoch": 0.7592228234136744,
"grad_norm": 3.9005322456359863,
"learning_rate": 9.719215145466422e-06,
"loss": 1.1418,
"step": 3087
},
{
"epoch": 0.7594687653713723,
"grad_norm": 3.6891987323760986,
"learning_rate": 9.71894636143547e-06,
"loss": 1.3392,
"step": 3088
},
{
"epoch": 0.7597147073290703,
"grad_norm": 3.5796751976013184,
"learning_rate": 9.7186774525382e-06,
"loss": 1.1215,
"step": 3089
},
{
"epoch": 0.7599606492867683,
"grad_norm": 3.7734873294830322,
"learning_rate": 9.718408418781729e-06,
"loss": 1.1392,
"step": 3090
},
{
"epoch": 0.7602065912444663,
"grad_norm": 3.698126792907715,
"learning_rate": 9.718139260173172e-06,
"loss": 1.2868,
"step": 3091
},
{
"epoch": 0.7604525332021643,
"grad_norm": 3.6336984634399414,
"learning_rate": 9.717869976719655e-06,
"loss": 1.1527,
"step": 3092
},
{
"epoch": 0.7606984751598622,
"grad_norm": 3.608396291732788,
"learning_rate": 9.717600568428303e-06,
"loss": 1.3703,
"step": 3093
},
{
"epoch": 0.7609444171175602,
"grad_norm": 3.720942735671997,
"learning_rate": 9.717331035306243e-06,
"loss": 1.1867,
"step": 3094
},
{
"epoch": 0.7611903590752582,
"grad_norm": 3.7992336750030518,
"learning_rate": 9.717061377360609e-06,
"loss": 1.2067,
"step": 3095
},
{
"epoch": 0.7614363010329562,
"grad_norm": 3.406694173812866,
"learning_rate": 9.716791594598533e-06,
"loss": 1.0843,
"step": 3096
},
{
"epoch": 0.7616822429906542,
"grad_norm": 3.6964335441589355,
"learning_rate": 9.716521687027158e-06,
"loss": 1.2901,
"step": 3097
},
{
"epoch": 0.7619281849483522,
"grad_norm": 4.053557872772217,
"learning_rate": 9.716251654653623e-06,
"loss": 1.3173,
"step": 3098
},
{
"epoch": 0.7621741269060501,
"grad_norm": 3.5947413444519043,
"learning_rate": 9.715981497485074e-06,
"loss": 1.1456,
"step": 3099
},
{
"epoch": 0.7624200688637481,
"grad_norm": 3.634427309036255,
"learning_rate": 9.715711215528661e-06,
"loss": 1.1353,
"step": 3100
},
{
"epoch": 0.7626660108214461,
"grad_norm": 3.6947381496429443,
"learning_rate": 9.715440808791532e-06,
"loss": 1.1633,
"step": 3101
},
{
"epoch": 0.7629119527791441,
"grad_norm": 3.718308448791504,
"learning_rate": 9.715170277280846e-06,
"loss": 1.1633,
"step": 3102
},
{
"epoch": 0.7631578947368421,
"grad_norm": 3.411322832107544,
"learning_rate": 9.71489962100376e-06,
"loss": 1.145,
"step": 3103
},
{
"epoch": 0.76340383669454,
"grad_norm": 3.130209445953369,
"learning_rate": 9.714628839967436e-06,
"loss": 1.0206,
"step": 3104
},
{
"epoch": 0.763649778652238,
"grad_norm": 4.059390068054199,
"learning_rate": 9.714357934179036e-06,
"loss": 1.2552,
"step": 3105
},
{
"epoch": 0.763895720609936,
"grad_norm": 3.591203451156616,
"learning_rate": 9.714086903645733e-06,
"loss": 1.2552,
"step": 3106
},
{
"epoch": 0.764141662567634,
"grad_norm": 3.8266916275024414,
"learning_rate": 9.713815748374698e-06,
"loss": 1.2348,
"step": 3107
},
{
"epoch": 0.764387604525332,
"grad_norm": 3.7525527477264404,
"learning_rate": 9.713544468373102e-06,
"loss": 1.3087,
"step": 3108
},
{
"epoch": 0.7646335464830301,
"grad_norm": 3.93483304977417,
"learning_rate": 9.713273063648128e-06,
"loss": 1.288,
"step": 3109
},
{
"epoch": 0.764879488440728,
"grad_norm": 3.6516032218933105,
"learning_rate": 9.713001534206954e-06,
"loss": 1.1746,
"step": 3110
},
{
"epoch": 0.765125430398426,
"grad_norm": 3.669949531555176,
"learning_rate": 9.712729880056767e-06,
"loss": 1.2096,
"step": 3111
},
{
"epoch": 0.765371372356124,
"grad_norm": 3.7303407192230225,
"learning_rate": 9.712458101204754e-06,
"loss": 1.2571,
"step": 3112
},
{
"epoch": 0.765617314313822,
"grad_norm": 3.9511468410491943,
"learning_rate": 9.712186197658108e-06,
"loss": 1.2149,
"step": 3113
},
{
"epoch": 0.76586325627152,
"grad_norm": 3.5117974281311035,
"learning_rate": 9.711914169424022e-06,
"loss": 1.211,
"step": 3114
},
{
"epoch": 0.7661091982292179,
"grad_norm": 3.692487955093384,
"learning_rate": 9.711642016509696e-06,
"loss": 1.1934,
"step": 3115
},
{
"epoch": 0.7663551401869159,
"grad_norm": 3.7287392616271973,
"learning_rate": 9.711369738922327e-06,
"loss": 1.3947,
"step": 3116
},
{
"epoch": 0.7666010821446139,
"grad_norm": 3.931889772415161,
"learning_rate": 9.711097336669124e-06,
"loss": 1.1548,
"step": 3117
},
{
"epoch": 0.7668470241023119,
"grad_norm": 3.6661202907562256,
"learning_rate": 9.710824809757293e-06,
"loss": 1.1909,
"step": 3118
},
{
"epoch": 0.7670929660600099,
"grad_norm": 3.8811981678009033,
"learning_rate": 9.710552158194047e-06,
"loss": 1.3192,
"step": 3119
},
{
"epoch": 0.7673389080177078,
"grad_norm": 4.159524440765381,
"learning_rate": 9.710279381986597e-06,
"loss": 1.2082,
"step": 3120
},
{
"epoch": 0.7675848499754058,
"grad_norm": 3.754513740539551,
"learning_rate": 9.710006481142166e-06,
"loss": 1.2274,
"step": 3121
},
{
"epoch": 0.7678307919331038,
"grad_norm": 3.592902421951294,
"learning_rate": 9.70973345566797e-06,
"loss": 1.1641,
"step": 3122
},
{
"epoch": 0.7680767338908018,
"grad_norm": 3.4608614444732666,
"learning_rate": 9.709460305571238e-06,
"loss": 1.1357,
"step": 3123
},
{
"epoch": 0.7683226758484998,
"grad_norm": 3.5408005714416504,
"learning_rate": 9.709187030859195e-06,
"loss": 1.1696,
"step": 3124
},
{
"epoch": 0.7685686178061978,
"grad_norm": 3.327002763748169,
"learning_rate": 9.708913631539072e-06,
"loss": 1.2393,
"step": 3125
},
{
"epoch": 0.7688145597638957,
"grad_norm": 3.948559522628784,
"learning_rate": 9.708640107618102e-06,
"loss": 1.3022,
"step": 3126
},
{
"epoch": 0.7690605017215937,
"grad_norm": 3.603264093399048,
"learning_rate": 9.708366459103527e-06,
"loss": 1.0843,
"step": 3127
},
{
"epoch": 0.7693064436792917,
"grad_norm": 3.741516590118408,
"learning_rate": 9.708092686002585e-06,
"loss": 1.1764,
"step": 3128
},
{
"epoch": 0.7695523856369897,
"grad_norm": 3.7511487007141113,
"learning_rate": 9.707818788322518e-06,
"loss": 1.2015,
"step": 3129
},
{
"epoch": 0.7697983275946877,
"grad_norm": 3.7036871910095215,
"learning_rate": 9.707544766070577e-06,
"loss": 1.2423,
"step": 3130
},
{
"epoch": 0.7700442695523856,
"grad_norm": 4.06614875793457,
"learning_rate": 9.707270619254013e-06,
"loss": 1.2456,
"step": 3131
},
{
"epoch": 0.7702902115100836,
"grad_norm": 4.000271320343018,
"learning_rate": 9.706996347880078e-06,
"loss": 1.3308,
"step": 3132
},
{
"epoch": 0.7705361534677816,
"grad_norm": 3.6081089973449707,
"learning_rate": 9.706721951956032e-06,
"loss": 1.1377,
"step": 3133
},
{
"epoch": 0.7707820954254796,
"grad_norm": 3.588655710220337,
"learning_rate": 9.706447431489132e-06,
"loss": 1.327,
"step": 3134
},
{
"epoch": 0.7710280373831776,
"grad_norm": 4.144540786743164,
"learning_rate": 9.706172786486645e-06,
"loss": 1.262,
"step": 3135
},
{
"epoch": 0.7712739793408756,
"grad_norm": 3.4061357975006104,
"learning_rate": 9.705898016955837e-06,
"loss": 1.1731,
"step": 3136
},
{
"epoch": 0.7715199212985735,
"grad_norm": 3.5529541969299316,
"learning_rate": 9.70562312290398e-06,
"loss": 1.0554,
"step": 3137
},
{
"epoch": 0.7717658632562715,
"grad_norm": 3.3206307888031006,
"learning_rate": 9.705348104338347e-06,
"loss": 1.0656,
"step": 3138
},
{
"epoch": 0.7720118052139695,
"grad_norm": 3.426710367202759,
"learning_rate": 9.70507296126621e-06,
"loss": 1.1384,
"step": 3139
},
{
"epoch": 0.7722577471716675,
"grad_norm": 3.7156801223754883,
"learning_rate": 9.70479769369486e-06,
"loss": 1.2144,
"step": 3140
},
{
"epoch": 0.7725036891293655,
"grad_norm": 3.4603748321533203,
"learning_rate": 9.704522301631572e-06,
"loss": 1.1856,
"step": 3141
},
{
"epoch": 0.7727496310870634,
"grad_norm": 3.678009271621704,
"learning_rate": 9.704246785083638e-06,
"loss": 1.2523,
"step": 3142
},
{
"epoch": 0.7729955730447614,
"grad_norm": 3.6632015705108643,
"learning_rate": 9.703971144058345e-06,
"loss": 1.186,
"step": 3143
},
{
"epoch": 0.7732415150024594,
"grad_norm": 3.4851739406585693,
"learning_rate": 9.70369537856299e-06,
"loss": 1.2601,
"step": 3144
},
{
"epoch": 0.7734874569601574,
"grad_norm": 3.3549044132232666,
"learning_rate": 9.703419488604867e-06,
"loss": 1.1338,
"step": 3145
},
{
"epoch": 0.7737333989178554,
"grad_norm": 3.6141183376312256,
"learning_rate": 9.703143474191276e-06,
"loss": 1.1569,
"step": 3146
},
{
"epoch": 0.7739793408755534,
"grad_norm": 3.8467960357666016,
"learning_rate": 9.702867335329522e-06,
"loss": 1.1609,
"step": 3147
},
{
"epoch": 0.7742252828332513,
"grad_norm": 3.5616445541381836,
"learning_rate": 9.702591072026913e-06,
"loss": 1.2524,
"step": 3148
},
{
"epoch": 0.7744712247909493,
"grad_norm": 3.372070789337158,
"learning_rate": 9.702314684290757e-06,
"loss": 1.1475,
"step": 3149
},
{
"epoch": 0.7747171667486473,
"grad_norm": 3.6812331676483154,
"learning_rate": 9.702038172128369e-06,
"loss": 1.2033,
"step": 3150
},
{
"epoch": 0.7749631087063453,
"grad_norm": 3.4577507972717285,
"learning_rate": 9.701761535547065e-06,
"loss": 1.223,
"step": 3151
},
{
"epoch": 0.7752090506640433,
"grad_norm": 3.5089683532714844,
"learning_rate": 9.701484774554162e-06,
"loss": 1.2436,
"step": 3152
},
{
"epoch": 0.7754549926217412,
"grad_norm": 4.283212661743164,
"learning_rate": 9.701207889156989e-06,
"loss": 1.372,
"step": 3153
},
{
"epoch": 0.7757009345794392,
"grad_norm": 3.335488796234131,
"learning_rate": 9.70093087936287e-06,
"loss": 1.0831,
"step": 3154
},
{
"epoch": 0.7759468765371372,
"grad_norm": 3.8869636058807373,
"learning_rate": 9.700653745179131e-06,
"loss": 1.1532,
"step": 3155
},
{
"epoch": 0.7761928184948352,
"grad_norm": 3.890209674835205,
"learning_rate": 9.70037648661311e-06,
"loss": 1.3019,
"step": 3156
},
{
"epoch": 0.7764387604525332,
"grad_norm": 3.497529983520508,
"learning_rate": 9.700099103672143e-06,
"loss": 1.2892,
"step": 3157
},
{
"epoch": 0.7766847024102311,
"grad_norm": 3.6435770988464355,
"learning_rate": 9.69982159636357e-06,
"loss": 1.2665,
"step": 3158
},
{
"epoch": 0.7769306443679291,
"grad_norm": 3.5214977264404297,
"learning_rate": 9.699543964694731e-06,
"loss": 1.2786,
"step": 3159
},
{
"epoch": 0.7771765863256271,
"grad_norm": 3.572211265563965,
"learning_rate": 9.699266208672974e-06,
"loss": 1.1927,
"step": 3160
},
{
"epoch": 0.7774225282833251,
"grad_norm": 3.743227481842041,
"learning_rate": 9.698988328305649e-06,
"loss": 1.2462,
"step": 3161
},
{
"epoch": 0.7776684702410231,
"grad_norm": 3.303778886795044,
"learning_rate": 9.69871032360011e-06,
"loss": 1.0901,
"step": 3162
},
{
"epoch": 0.7779144121987212,
"grad_norm": 3.743983507156372,
"learning_rate": 9.69843219456371e-06,
"loss": 1.2315,
"step": 3163
},
{
"epoch": 0.778160354156419,
"grad_norm": 3.937730073928833,
"learning_rate": 9.69815394120381e-06,
"loss": 1.1536,
"step": 3164
},
{
"epoch": 0.778406296114117,
"grad_norm": 3.2509305477142334,
"learning_rate": 9.697875563527775e-06,
"loss": 1.2792,
"step": 3165
},
{
"epoch": 0.778652238071815,
"grad_norm": 3.4885780811309814,
"learning_rate": 9.697597061542966e-06,
"loss": 1.1983,
"step": 3166
},
{
"epoch": 0.778898180029513,
"grad_norm": 4.029336929321289,
"learning_rate": 9.697318435256757e-06,
"loss": 1.2338,
"step": 3167
},
{
"epoch": 0.7791441219872111,
"grad_norm": 3.936354160308838,
"learning_rate": 9.69703968467652e-06,
"loss": 1.2739,
"step": 3168
},
{
"epoch": 0.779390063944909,
"grad_norm": 3.765928030014038,
"learning_rate": 9.69676080980963e-06,
"loss": 1.287,
"step": 3169
},
{
"epoch": 0.779636005902607,
"grad_norm": 3.386223554611206,
"learning_rate": 9.696481810663466e-06,
"loss": 1.094,
"step": 3170
},
{
"epoch": 0.779881947860305,
"grad_norm": 3.269779920578003,
"learning_rate": 9.696202687245409e-06,
"loss": 1.1291,
"step": 3171
},
{
"epoch": 0.780127889818003,
"grad_norm": 3.425758123397827,
"learning_rate": 9.695923439562846e-06,
"loss": 1.3226,
"step": 3172
},
{
"epoch": 0.780373831775701,
"grad_norm": 3.92987060546875,
"learning_rate": 9.69564406762317e-06,
"loss": 1.1836,
"step": 3173
},
{
"epoch": 0.780619773733399,
"grad_norm": 3.3481290340423584,
"learning_rate": 9.695364571433768e-06,
"loss": 1.1222,
"step": 3174
},
{
"epoch": 0.7808657156910969,
"grad_norm": 3.7971572875976562,
"learning_rate": 9.695084951002036e-06,
"loss": 1.1905,
"step": 3175
},
{
"epoch": 0.7811116576487949,
"grad_norm": 3.423079013824463,
"learning_rate": 9.694805206335375e-06,
"loss": 1.1468,
"step": 3176
},
{
"epoch": 0.7813575996064929,
"grad_norm": 3.511819839477539,
"learning_rate": 9.694525337441189e-06,
"loss": 1.1969,
"step": 3177
},
{
"epoch": 0.7816035415641909,
"grad_norm": 3.363342761993408,
"learning_rate": 9.69424534432688e-06,
"loss": 1.1989,
"step": 3178
},
{
"epoch": 0.7818494835218889,
"grad_norm": 3.732497215270996,
"learning_rate": 9.693965226999858e-06,
"loss": 1.1754,
"step": 3179
},
{
"epoch": 0.7820954254795868,
"grad_norm": 4.158317565917969,
"learning_rate": 9.693684985467533e-06,
"loss": 1.309,
"step": 3180
},
{
"epoch": 0.7823413674372848,
"grad_norm": 3.456357002258301,
"learning_rate": 9.693404619737325e-06,
"loss": 1.0883,
"step": 3181
},
{
"epoch": 0.7825873093949828,
"grad_norm": 3.5511531829833984,
"learning_rate": 9.693124129816648e-06,
"loss": 1.2029,
"step": 3182
},
{
"epoch": 0.7828332513526808,
"grad_norm": 3.6344921588897705,
"learning_rate": 9.692843515712927e-06,
"loss": 1.1704,
"step": 3183
},
{
"epoch": 0.7830791933103788,
"grad_norm": 3.5766234397888184,
"learning_rate": 9.692562777433587e-06,
"loss": 1.1948,
"step": 3184
},
{
"epoch": 0.7833251352680768,
"grad_norm": 3.6300597190856934,
"learning_rate": 9.692281914986055e-06,
"loss": 1.417,
"step": 3185
},
{
"epoch": 0.7835710772257747,
"grad_norm": 3.4715895652770996,
"learning_rate": 9.692000928377765e-06,
"loss": 1.2552,
"step": 3186
},
{
"epoch": 0.7838170191834727,
"grad_norm": 4.031774044036865,
"learning_rate": 9.691719817616148e-06,
"loss": 1.2679,
"step": 3187
},
{
"epoch": 0.7840629611411707,
"grad_norm": 3.5051093101501465,
"learning_rate": 9.691438582708646e-06,
"loss": 1.1317,
"step": 3188
},
{
"epoch": 0.7843089030988687,
"grad_norm": 3.836676597595215,
"learning_rate": 9.6911572236627e-06,
"loss": 1.2768,
"step": 3189
},
{
"epoch": 0.7845548450565667,
"grad_norm": 3.7042040824890137,
"learning_rate": 9.690875740485756e-06,
"loss": 1.2148,
"step": 3190
},
{
"epoch": 0.7848007870142646,
"grad_norm": 3.7261807918548584,
"learning_rate": 9.690594133185259e-06,
"loss": 1.2161,
"step": 3191
},
{
"epoch": 0.7850467289719626,
"grad_norm": 3.710130214691162,
"learning_rate": 9.690312401768663e-06,
"loss": 1.2139,
"step": 3192
},
{
"epoch": 0.7852926709296606,
"grad_norm": 3.456019878387451,
"learning_rate": 9.69003054624342e-06,
"loss": 1.1932,
"step": 3193
},
{
"epoch": 0.7855386128873586,
"grad_norm": 3.5009958744049072,
"learning_rate": 9.689748566616994e-06,
"loss": 1.1462,
"step": 3194
},
{
"epoch": 0.7857845548450566,
"grad_norm": 3.8476274013519287,
"learning_rate": 9.689466462896841e-06,
"loss": 1.2433,
"step": 3195
},
{
"epoch": 0.7860304968027545,
"grad_norm": 3.5826315879821777,
"learning_rate": 9.689184235090427e-06,
"loss": 1.2565,
"step": 3196
},
{
"epoch": 0.7862764387604525,
"grad_norm": 3.685990810394287,
"learning_rate": 9.688901883205219e-06,
"loss": 1.1861,
"step": 3197
},
{
"epoch": 0.7865223807181505,
"grad_norm": 3.5398170948028564,
"learning_rate": 9.68861940724869e-06,
"loss": 1.1017,
"step": 3198
},
{
"epoch": 0.7867683226758485,
"grad_norm": 3.4283969402313232,
"learning_rate": 9.688336807228316e-06,
"loss": 1.1777,
"step": 3199
},
{
"epoch": 0.7870142646335465,
"grad_norm": 3.672309160232544,
"learning_rate": 9.68805408315157e-06,
"loss": 1.2784,
"step": 3200
},
{
"epoch": 0.7872602065912445,
"grad_norm": 3.783125162124634,
"learning_rate": 9.687771235025935e-06,
"loss": 1.1663,
"step": 3201
},
{
"epoch": 0.7875061485489424,
"grad_norm": 3.8163163661956787,
"learning_rate": 9.687488262858898e-06,
"loss": 1.2676,
"step": 3202
},
{
"epoch": 0.7877520905066404,
"grad_norm": 3.51967716217041,
"learning_rate": 9.687205166657945e-06,
"loss": 1.1793,
"step": 3203
},
{
"epoch": 0.7879980324643384,
"grad_norm": 3.3179595470428467,
"learning_rate": 9.686921946430566e-06,
"loss": 1.1702,
"step": 3204
},
{
"epoch": 0.7882439744220364,
"grad_norm": 3.7795214653015137,
"learning_rate": 9.686638602184256e-06,
"loss": 1.2883,
"step": 3205
},
{
"epoch": 0.7884899163797344,
"grad_norm": 3.877755880355835,
"learning_rate": 9.68635513392651e-06,
"loss": 1.4316,
"step": 3206
},
{
"epoch": 0.7887358583374323,
"grad_norm": 3.5156826972961426,
"learning_rate": 9.686071541664833e-06,
"loss": 1.144,
"step": 3207
},
{
"epoch": 0.7889818002951303,
"grad_norm": 3.6731066703796387,
"learning_rate": 9.685787825406726e-06,
"loss": 1.3363,
"step": 3208
},
{
"epoch": 0.7892277422528283,
"grad_norm": 3.658477783203125,
"learning_rate": 9.685503985159698e-06,
"loss": 1.341,
"step": 3209
},
{
"epoch": 0.7894736842105263,
"grad_norm": 3.1788156032562256,
"learning_rate": 9.685220020931258e-06,
"loss": 0.9898,
"step": 3210
},
{
"epoch": 0.7897196261682243,
"grad_norm": 3.5493733882904053,
"learning_rate": 9.684935932728922e-06,
"loss": 1.3417,
"step": 3211
},
{
"epoch": 0.7899655681259223,
"grad_norm": 3.310035228729248,
"learning_rate": 9.684651720560205e-06,
"loss": 1.1154,
"step": 3212
},
{
"epoch": 0.7902115100836202,
"grad_norm": 3.6826462745666504,
"learning_rate": 9.68436738443263e-06,
"loss": 1.273,
"step": 3213
},
{
"epoch": 0.7904574520413182,
"grad_norm": 3.652528762817383,
"learning_rate": 9.684082924353716e-06,
"loss": 1.1178,
"step": 3214
},
{
"epoch": 0.7907033939990162,
"grad_norm": 3.98161244392395,
"learning_rate": 9.683798340330996e-06,
"loss": 1.2063,
"step": 3215
},
{
"epoch": 0.7909493359567142,
"grad_norm": 3.7128539085388184,
"learning_rate": 9.683513632371997e-06,
"loss": 1.1429,
"step": 3216
},
{
"epoch": 0.7911952779144122,
"grad_norm": 3.2264506816864014,
"learning_rate": 9.683228800484252e-06,
"loss": 1.1036,
"step": 3217
},
{
"epoch": 0.7914412198721101,
"grad_norm": 3.6992499828338623,
"learning_rate": 9.682943844675298e-06,
"loss": 1.1977,
"step": 3218
},
{
"epoch": 0.7916871618298081,
"grad_norm": 3.7365591526031494,
"learning_rate": 9.682658764952678e-06,
"loss": 1.3198,
"step": 3219
},
{
"epoch": 0.7919331037875061,
"grad_norm": 3.626595973968506,
"learning_rate": 9.682373561323932e-06,
"loss": 1.2992,
"step": 3220
},
{
"epoch": 0.7921790457452041,
"grad_norm": 3.232018232345581,
"learning_rate": 9.682088233796607e-06,
"loss": 1.1007,
"step": 3221
},
{
"epoch": 0.7924249877029022,
"grad_norm": 3.449819326400757,
"learning_rate": 9.681802782378255e-06,
"loss": 1.1793,
"step": 3222
},
{
"epoch": 0.7926709296606,
"grad_norm": 3.3278839588165283,
"learning_rate": 9.681517207076427e-06,
"loss": 1.0785,
"step": 3223
},
{
"epoch": 0.792916871618298,
"grad_norm": 3.7702910900115967,
"learning_rate": 9.681231507898681e-06,
"loss": 1.2619,
"step": 3224
},
{
"epoch": 0.793162813575996,
"grad_norm": 3.7150464057922363,
"learning_rate": 9.680945684852576e-06,
"loss": 1.1721,
"step": 3225
},
{
"epoch": 0.793408755533694,
"grad_norm": 3.5324935913085938,
"learning_rate": 9.680659737945676e-06,
"loss": 1.2242,
"step": 3226
},
{
"epoch": 0.7936546974913921,
"grad_norm": 3.4929943084716797,
"learning_rate": 9.680373667185548e-06,
"loss": 1.0644,
"step": 3227
},
{
"epoch": 0.7939006394490901,
"grad_norm": 3.5456807613372803,
"learning_rate": 9.680087472579758e-06,
"loss": 1.1406,
"step": 3228
},
{
"epoch": 0.794146581406788,
"grad_norm": 3.737445592880249,
"learning_rate": 9.679801154135883e-06,
"loss": 1.3303,
"step": 3229
},
{
"epoch": 0.794392523364486,
"grad_norm": 3.3128955364227295,
"learning_rate": 9.679514711861496e-06,
"loss": 1.205,
"step": 3230
},
{
"epoch": 0.794638465322184,
"grad_norm": 3.401216745376587,
"learning_rate": 9.679228145764179e-06,
"loss": 1.2703,
"step": 3231
},
{
"epoch": 0.794884407279882,
"grad_norm": 3.5181336402893066,
"learning_rate": 9.678941455851513e-06,
"loss": 1.233,
"step": 3232
},
{
"epoch": 0.79513034923758,
"grad_norm": 3.7592337131500244,
"learning_rate": 9.678654642131085e-06,
"loss": 1.205,
"step": 3233
},
{
"epoch": 0.7953762911952779,
"grad_norm": 3.4549946784973145,
"learning_rate": 9.678367704610483e-06,
"loss": 1.3129,
"step": 3234
},
{
"epoch": 0.7956222331529759,
"grad_norm": 3.529653787612915,
"learning_rate": 9.6780806432973e-06,
"loss": 1.2647,
"step": 3235
},
{
"epoch": 0.7958681751106739,
"grad_norm": 3.915609121322632,
"learning_rate": 9.677793458199134e-06,
"loss": 1.3615,
"step": 3236
},
{
"epoch": 0.7961141170683719,
"grad_norm": 3.4842090606689453,
"learning_rate": 9.67750614932358e-06,
"loss": 1.2194,
"step": 3237
},
{
"epoch": 0.7963600590260699,
"grad_norm": 3.5424623489379883,
"learning_rate": 9.677218716678246e-06,
"loss": 1.1605,
"step": 3238
},
{
"epoch": 0.7966060009837679,
"grad_norm": 3.8243587017059326,
"learning_rate": 9.676931160270733e-06,
"loss": 1.2172,
"step": 3239
},
{
"epoch": 0.7968519429414658,
"grad_norm": 3.4520187377929688,
"learning_rate": 9.676643480108651e-06,
"loss": 1.2525,
"step": 3240
},
{
"epoch": 0.7970978848991638,
"grad_norm": 3.7506327629089355,
"learning_rate": 9.676355676199612e-06,
"loss": 1.0887,
"step": 3241
},
{
"epoch": 0.7973438268568618,
"grad_norm": 3.538532257080078,
"learning_rate": 9.676067748551232e-06,
"loss": 1.3067,
"step": 3242
},
{
"epoch": 0.7975897688145598,
"grad_norm": 3.3441290855407715,
"learning_rate": 9.67577969717113e-06,
"loss": 1.1992,
"step": 3243
},
{
"epoch": 0.7978357107722578,
"grad_norm": 4.1735639572143555,
"learning_rate": 9.675491522066927e-06,
"loss": 1.1561,
"step": 3244
},
{
"epoch": 0.7980816527299557,
"grad_norm": 3.770817995071411,
"learning_rate": 9.67520322324625e-06,
"loss": 1.4994,
"step": 3245
},
{
"epoch": 0.7983275946876537,
"grad_norm": 4.078700065612793,
"learning_rate": 9.674914800716726e-06,
"loss": 1.2238,
"step": 3246
},
{
"epoch": 0.7985735366453517,
"grad_norm": 3.6894965171813965,
"learning_rate": 9.674626254485989e-06,
"loss": 1.3581,
"step": 3247
},
{
"epoch": 0.7988194786030497,
"grad_norm": 3.6449451446533203,
"learning_rate": 9.67433758456167e-06,
"loss": 1.2668,
"step": 3248
},
{
"epoch": 0.7990654205607477,
"grad_norm": 3.7663257122039795,
"learning_rate": 9.674048790951411e-06,
"loss": 1.3379,
"step": 3249
},
{
"epoch": 0.7993113625184457,
"grad_norm": 3.882615327835083,
"learning_rate": 9.673759873662853e-06,
"loss": 1.1293,
"step": 3250
},
{
"epoch": 0.7995573044761436,
"grad_norm": 3.7457032203674316,
"learning_rate": 9.673470832703641e-06,
"loss": 1.3377,
"step": 3251
},
{
"epoch": 0.7998032464338416,
"grad_norm": 3.5431079864501953,
"learning_rate": 9.673181668081422e-06,
"loss": 1.055,
"step": 3252
},
{
"epoch": 0.8000491883915396,
"grad_norm": 3.438239336013794,
"learning_rate": 9.67289237980385e-06,
"loss": 1.2077,
"step": 3253
},
{
"epoch": 0.8002951303492376,
"grad_norm": 3.601931571960449,
"learning_rate": 9.672602967878575e-06,
"loss": 1.2215,
"step": 3254
},
{
"epoch": 0.8005410723069356,
"grad_norm": 3.5580520629882812,
"learning_rate": 9.67231343231326e-06,
"loss": 1.2065,
"step": 3255
},
{
"epoch": 0.8007870142646335,
"grad_norm": 3.780724287033081,
"learning_rate": 9.672023773115561e-06,
"loss": 1.1525,
"step": 3256
},
{
"epoch": 0.8010329562223315,
"grad_norm": 3.4753990173339844,
"learning_rate": 9.671733990293149e-06,
"loss": 1.265,
"step": 3257
},
{
"epoch": 0.8012788981800295,
"grad_norm": 3.542051315307617,
"learning_rate": 9.671444083853689e-06,
"loss": 1.2141,
"step": 3258
},
{
"epoch": 0.8015248401377275,
"grad_norm": 3.4063920974731445,
"learning_rate": 9.671154053804851e-06,
"loss": 1.2152,
"step": 3259
},
{
"epoch": 0.8017707820954255,
"grad_norm": 3.808685064315796,
"learning_rate": 9.67086390015431e-06,
"loss": 1.3237,
"step": 3260
},
{
"epoch": 0.8020167240531234,
"grad_norm": 3.8981597423553467,
"learning_rate": 9.670573622909743e-06,
"loss": 1.3294,
"step": 3261
},
{
"epoch": 0.8022626660108214,
"grad_norm": 3.9953501224517822,
"learning_rate": 9.670283222078833e-06,
"loss": 1.2855,
"step": 3262
},
{
"epoch": 0.8025086079685194,
"grad_norm": 3.3111023902893066,
"learning_rate": 9.669992697669263e-06,
"loss": 1.2735,
"step": 3263
},
{
"epoch": 0.8027545499262174,
"grad_norm": 3.4766428470611572,
"learning_rate": 9.66970204968872e-06,
"loss": 1.2035,
"step": 3264
},
{
"epoch": 0.8030004918839154,
"grad_norm": 3.3331050872802734,
"learning_rate": 9.669411278144896e-06,
"loss": 1.3328,
"step": 3265
},
{
"epoch": 0.8032464338416134,
"grad_norm": 3.529484987258911,
"learning_rate": 9.669120383045483e-06,
"loss": 1.2037,
"step": 3266
},
{
"epoch": 0.8034923757993113,
"grad_norm": 3.819429636001587,
"learning_rate": 9.668829364398179e-06,
"loss": 1.3686,
"step": 3267
},
{
"epoch": 0.8037383177570093,
"grad_norm": 3.2829113006591797,
"learning_rate": 9.668538222210686e-06,
"loss": 1.1953,
"step": 3268
},
{
"epoch": 0.8039842597147073,
"grad_norm": 3.2703518867492676,
"learning_rate": 9.668246956490706e-06,
"loss": 1.2168,
"step": 3269
},
{
"epoch": 0.8042302016724053,
"grad_norm": 3.4624345302581787,
"learning_rate": 9.667955567245949e-06,
"loss": 1.2401,
"step": 3270
},
{
"epoch": 0.8044761436301033,
"grad_norm": 4.107505798339844,
"learning_rate": 9.667664054484121e-06,
"loss": 1.308,
"step": 3271
},
{
"epoch": 0.8047220855878012,
"grad_norm": 3.8514795303344727,
"learning_rate": 9.667372418212936e-06,
"loss": 1.3678,
"step": 3272
},
{
"epoch": 0.8049680275454992,
"grad_norm": 3.720860004425049,
"learning_rate": 9.667080658440116e-06,
"loss": 1.2335,
"step": 3273
},
{
"epoch": 0.8052139695031972,
"grad_norm": 3.296027660369873,
"learning_rate": 9.666788775173377e-06,
"loss": 1.1449,
"step": 3274
},
{
"epoch": 0.8054599114608952,
"grad_norm": 3.6605684757232666,
"learning_rate": 9.666496768420442e-06,
"loss": 1.3448,
"step": 3275
},
{
"epoch": 0.8057058534185932,
"grad_norm": 3.411717414855957,
"learning_rate": 9.66620463818904e-06,
"loss": 1.1772,
"step": 3276
},
{
"epoch": 0.8059517953762912,
"grad_norm": 3.7364063262939453,
"learning_rate": 9.6659123844869e-06,
"loss": 1.2876,
"step": 3277
},
{
"epoch": 0.8061977373339891,
"grad_norm": 3.487536907196045,
"learning_rate": 9.665620007321754e-06,
"loss": 1.1185,
"step": 3278
},
{
"epoch": 0.8064436792916871,
"grad_norm": 3.4797072410583496,
"learning_rate": 9.66532750670134e-06,
"loss": 1.096,
"step": 3279
},
{
"epoch": 0.8066896212493851,
"grad_norm": 3.3688912391662598,
"learning_rate": 9.665034882633398e-06,
"loss": 1.1035,
"step": 3280
},
{
"epoch": 0.8069355632070832,
"grad_norm": 3.5517399311065674,
"learning_rate": 9.66474213512567e-06,
"loss": 1.2556,
"step": 3281
},
{
"epoch": 0.8071815051647812,
"grad_norm": 4.144073486328125,
"learning_rate": 9.664449264185903e-06,
"loss": 1.2126,
"step": 3282
},
{
"epoch": 0.807427447122479,
"grad_norm": 3.6900687217712402,
"learning_rate": 9.664156269821845e-06,
"loss": 1.2905,
"step": 3283
},
{
"epoch": 0.807673389080177,
"grad_norm": 3.445417642593384,
"learning_rate": 9.663863152041251e-06,
"loss": 1.2804,
"step": 3284
},
{
"epoch": 0.807919331037875,
"grad_norm": 3.711273670196533,
"learning_rate": 9.663569910851876e-06,
"loss": 1.3683,
"step": 3285
},
{
"epoch": 0.8081652729955731,
"grad_norm": 3.5577144622802734,
"learning_rate": 9.663276546261479e-06,
"loss": 1.1452,
"step": 3286
},
{
"epoch": 0.8084112149532711,
"grad_norm": 3.8996901512145996,
"learning_rate": 9.662983058277822e-06,
"loss": 1.2107,
"step": 3287
},
{
"epoch": 0.8086571569109691,
"grad_norm": 3.4875006675720215,
"learning_rate": 9.662689446908674e-06,
"loss": 1.2003,
"step": 3288
},
{
"epoch": 0.808903098868667,
"grad_norm": 4.536487579345703,
"learning_rate": 9.6623957121618e-06,
"loss": 1.3286,
"step": 3289
},
{
"epoch": 0.809149040826365,
"grad_norm": 3.779325008392334,
"learning_rate": 9.662101854044977e-06,
"loss": 1.3382,
"step": 3290
},
{
"epoch": 0.809394982784063,
"grad_norm": 3.8925764560699463,
"learning_rate": 9.661807872565977e-06,
"loss": 1.4124,
"step": 3291
},
{
"epoch": 0.809640924741761,
"grad_norm": 3.6961472034454346,
"learning_rate": 9.66151376773258e-06,
"loss": 1.1847,
"step": 3292
},
{
"epoch": 0.809886866699459,
"grad_norm": 4.099465847015381,
"learning_rate": 9.661219539552568e-06,
"loss": 1.1982,
"step": 3293
},
{
"epoch": 0.8101328086571569,
"grad_norm": 3.7477383613586426,
"learning_rate": 9.660925188033728e-06,
"loss": 1.361,
"step": 3294
},
{
"epoch": 0.8103787506148549,
"grad_norm": 3.965240001678467,
"learning_rate": 9.660630713183844e-06,
"loss": 1.2907,
"step": 3295
},
{
"epoch": 0.8106246925725529,
"grad_norm": 3.5968992710113525,
"learning_rate": 9.660336115010716e-06,
"loss": 1.2308,
"step": 3296
},
{
"epoch": 0.8108706345302509,
"grad_norm": 3.902099847793579,
"learning_rate": 9.66004139352213e-06,
"loss": 1.2929,
"step": 3297
},
{
"epoch": 0.8111165764879489,
"grad_norm": 3.872516632080078,
"learning_rate": 9.659746548725893e-06,
"loss": 1.2511,
"step": 3298
},
{
"epoch": 0.8113625184456468,
"grad_norm": 3.6983821392059326,
"learning_rate": 9.659451580629802e-06,
"loss": 1.2442,
"step": 3299
},
{
"epoch": 0.8116084604033448,
"grad_norm": 3.9622886180877686,
"learning_rate": 9.659156489241663e-06,
"loss": 1.3687,
"step": 3300
},
{
"epoch": 0.8118544023610428,
"grad_norm": 3.694780111312866,
"learning_rate": 9.658861274569284e-06,
"loss": 1.1423,
"step": 3301
},
{
"epoch": 0.8121003443187408,
"grad_norm": 3.34562087059021,
"learning_rate": 9.658565936620476e-06,
"loss": 1.098,
"step": 3302
},
{
"epoch": 0.8123462862764388,
"grad_norm": 3.529442548751831,
"learning_rate": 9.658270475403057e-06,
"loss": 1.0885,
"step": 3303
},
{
"epoch": 0.8125922282341368,
"grad_norm": 3.5683534145355225,
"learning_rate": 9.657974890924841e-06,
"loss": 1.1207,
"step": 3304
},
{
"epoch": 0.8128381701918347,
"grad_norm": 3.269733428955078,
"learning_rate": 9.657679183193651e-06,
"loss": 1.0176,
"step": 3305
},
{
"epoch": 0.8130841121495327,
"grad_norm": 3.5215439796447754,
"learning_rate": 9.657383352217313e-06,
"loss": 1.1779,
"step": 3306
},
{
"epoch": 0.8133300541072307,
"grad_norm": 3.5608839988708496,
"learning_rate": 9.657087398003655e-06,
"loss": 1.3221,
"step": 3307
},
{
"epoch": 0.8135759960649287,
"grad_norm": 3.909494161605835,
"learning_rate": 9.656791320560504e-06,
"loss": 1.2164,
"step": 3308
},
{
"epoch": 0.8138219380226267,
"grad_norm": 3.4206089973449707,
"learning_rate": 9.656495119895699e-06,
"loss": 1.1337,
"step": 3309
},
{
"epoch": 0.8140678799803246,
"grad_norm": 3.6243226528167725,
"learning_rate": 9.656198796017075e-06,
"loss": 1.2585,
"step": 3310
},
{
"epoch": 0.8143138219380226,
"grad_norm": 3.5870516300201416,
"learning_rate": 9.655902348932474e-06,
"loss": 1.2914,
"step": 3311
},
{
"epoch": 0.8145597638957206,
"grad_norm": 3.6728293895721436,
"learning_rate": 9.655605778649741e-06,
"loss": 1.2147,
"step": 3312
},
{
"epoch": 0.8148057058534186,
"grad_norm": 3.5845158100128174,
"learning_rate": 9.65530908517672e-06,
"loss": 1.0993,
"step": 3313
},
{
"epoch": 0.8150516478111166,
"grad_norm": 3.4973926544189453,
"learning_rate": 9.655012268521267e-06,
"loss": 1.1951,
"step": 3314
},
{
"epoch": 0.8152975897688146,
"grad_norm": 3.470067262649536,
"learning_rate": 9.65471532869123e-06,
"loss": 1.1871,
"step": 3315
},
{
"epoch": 0.8155435317265125,
"grad_norm": 3.6656458377838135,
"learning_rate": 9.654418265694474e-06,
"loss": 1.1735,
"step": 3316
},
{
"epoch": 0.8157894736842105,
"grad_norm": 3.375108242034912,
"learning_rate": 9.654121079538849e-06,
"loss": 1.1945,
"step": 3317
},
{
"epoch": 0.8160354156419085,
"grad_norm": 3.668471097946167,
"learning_rate": 9.65382377023223e-06,
"loss": 1.162,
"step": 3318
},
{
"epoch": 0.8162813575996065,
"grad_norm": 4.6842122077941895,
"learning_rate": 9.653526337782475e-06,
"loss": 1.2736,
"step": 3319
},
{
"epoch": 0.8165272995573045,
"grad_norm": 4.019695281982422,
"learning_rate": 9.653228782197458e-06,
"loss": 1.2904,
"step": 3320
},
{
"epoch": 0.8167732415150024,
"grad_norm": 3.6398119926452637,
"learning_rate": 9.652931103485053e-06,
"loss": 1.2234,
"step": 3321
},
{
"epoch": 0.8170191834727004,
"grad_norm": 3.664280414581299,
"learning_rate": 9.652633301653135e-06,
"loss": 1.1985,
"step": 3322
},
{
"epoch": 0.8172651254303984,
"grad_norm": 3.457239866256714,
"learning_rate": 9.652335376709587e-06,
"loss": 1.1927,
"step": 3323
},
{
"epoch": 0.8175110673880964,
"grad_norm": 3.6859958171844482,
"learning_rate": 9.65203732866229e-06,
"loss": 1.2855,
"step": 3324
},
{
"epoch": 0.8177570093457944,
"grad_norm": 3.8548355102539062,
"learning_rate": 9.651739157519129e-06,
"loss": 1.3442,
"step": 3325
},
{
"epoch": 0.8180029513034923,
"grad_norm": 3.8403804302215576,
"learning_rate": 9.651440863287996e-06,
"loss": 1.1542,
"step": 3326
},
{
"epoch": 0.8182488932611903,
"grad_norm": 4.100020408630371,
"learning_rate": 9.651142445976785e-06,
"loss": 1.0835,
"step": 3327
},
{
"epoch": 0.8184948352188883,
"grad_norm": 4.021385192871094,
"learning_rate": 9.65084390559339e-06,
"loss": 1.3177,
"step": 3328
},
{
"epoch": 0.8187407771765863,
"grad_norm": 3.47737979888916,
"learning_rate": 9.650545242145713e-06,
"loss": 1.0411,
"step": 3329
},
{
"epoch": 0.8189867191342843,
"grad_norm": 3.6344339847564697,
"learning_rate": 9.650246455641654e-06,
"loss": 1.2252,
"step": 3330
},
{
"epoch": 0.8192326610919823,
"grad_norm": 3.6847071647644043,
"learning_rate": 9.64994754608912e-06,
"loss": 1.2481,
"step": 3331
},
{
"epoch": 0.8194786030496802,
"grad_norm": 3.4429845809936523,
"learning_rate": 9.64964851349602e-06,
"loss": 1.2218,
"step": 3332
},
{
"epoch": 0.8197245450073782,
"grad_norm": 30.464202880859375,
"learning_rate": 9.649349357870272e-06,
"loss": 1.2911,
"step": 3333
},
{
"epoch": 0.8199704869650762,
"grad_norm": 3.4367482662200928,
"learning_rate": 9.649050079219783e-06,
"loss": 1.2416,
"step": 3334
},
{
"epoch": 0.8202164289227742,
"grad_norm": 4.260312557220459,
"learning_rate": 9.64875067755248e-06,
"loss": 1.307,
"step": 3335
},
{
"epoch": 0.8204623708804722,
"grad_norm": 3.809835433959961,
"learning_rate": 9.64845115287628e-06,
"loss": 1.1508,
"step": 3336
},
{
"epoch": 0.8207083128381701,
"grad_norm": 3.9966776371002197,
"learning_rate": 9.648151505199108e-06,
"loss": 1.4059,
"step": 3337
},
{
"epoch": 0.8209542547958681,
"grad_norm": 3.7749853134155273,
"learning_rate": 9.647851734528898e-06,
"loss": 1.2881,
"step": 3338
},
{
"epoch": 0.8212001967535661,
"grad_norm": 3.779608964920044,
"learning_rate": 9.64755184087358e-06,
"loss": 1.1386,
"step": 3339
},
{
"epoch": 0.8214461387112642,
"grad_norm": 3.333245038986206,
"learning_rate": 9.647251824241086e-06,
"loss": 1.007,
"step": 3340
},
{
"epoch": 0.8216920806689622,
"grad_norm": 3.409167766571045,
"learning_rate": 9.646951684639359e-06,
"loss": 1.1194,
"step": 3341
},
{
"epoch": 0.8219380226266602,
"grad_norm": 3.822819948196411,
"learning_rate": 9.64665142207634e-06,
"loss": 1.4828,
"step": 3342
},
{
"epoch": 0.822183964584358,
"grad_norm": 3.4433374404907227,
"learning_rate": 9.64635103655997e-06,
"loss": 1.0673,
"step": 3343
},
{
"epoch": 0.822429906542056,
"grad_norm": 3.8237712383270264,
"learning_rate": 9.646050528098204e-06,
"loss": 1.1923,
"step": 3344
},
{
"epoch": 0.8226758484997541,
"grad_norm": 4.1410040855407715,
"learning_rate": 9.645749896698989e-06,
"loss": 1.2206,
"step": 3345
},
{
"epoch": 0.8229217904574521,
"grad_norm": 3.6630709171295166,
"learning_rate": 9.645449142370282e-06,
"loss": 1.2623,
"step": 3346
},
{
"epoch": 0.8231677324151501,
"grad_norm": 3.4463202953338623,
"learning_rate": 9.645148265120042e-06,
"loss": 1.2761,
"step": 3347
},
{
"epoch": 0.823413674372848,
"grad_norm": 3.59062123298645,
"learning_rate": 9.644847264956226e-06,
"loss": 1.3108,
"step": 3348
},
{
"epoch": 0.823659616330546,
"grad_norm": 3.238532781600952,
"learning_rate": 9.644546141886802e-06,
"loss": 1.1775,
"step": 3349
},
{
"epoch": 0.823905558288244,
"grad_norm": 3.8212950229644775,
"learning_rate": 9.644244895919739e-06,
"loss": 1.2076,
"step": 3350
},
{
"epoch": 0.824151500245942,
"grad_norm": 3.608929395675659,
"learning_rate": 9.643943527063006e-06,
"loss": 1.2318,
"step": 3351
},
{
"epoch": 0.82439744220364,
"grad_norm": 4.057901382446289,
"learning_rate": 9.643642035324576e-06,
"loss": 1.316,
"step": 3352
},
{
"epoch": 0.824643384161338,
"grad_norm": 3.8886983394622803,
"learning_rate": 9.643340420712431e-06,
"loss": 1.2671,
"step": 3353
},
{
"epoch": 0.8248893261190359,
"grad_norm": 3.5764682292938232,
"learning_rate": 9.643038683234549e-06,
"loss": 1.2038,
"step": 3354
},
{
"epoch": 0.8251352680767339,
"grad_norm": 3.7925593852996826,
"learning_rate": 9.642736822898915e-06,
"loss": 1.1804,
"step": 3355
},
{
"epoch": 0.8253812100344319,
"grad_norm": 3.9774246215820312,
"learning_rate": 9.642434839713516e-06,
"loss": 1.2121,
"step": 3356
},
{
"epoch": 0.8256271519921299,
"grad_norm": 3.580204486846924,
"learning_rate": 9.642132733686342e-06,
"loss": 1.2181,
"step": 3357
},
{
"epoch": 0.8258730939498279,
"grad_norm": 3.679941177368164,
"learning_rate": 9.641830504825389e-06,
"loss": 1.2049,
"step": 3358
},
{
"epoch": 0.8261190359075258,
"grad_norm": 3.4778144359588623,
"learning_rate": 9.641528153138652e-06,
"loss": 1.242,
"step": 3359
},
{
"epoch": 0.8263649778652238,
"grad_norm": 3.6073551177978516,
"learning_rate": 9.641225678634132e-06,
"loss": 1.1957,
"step": 3360
},
{
"epoch": 0.8266109198229218,
"grad_norm": 3.260683298110962,
"learning_rate": 9.640923081319836e-06,
"loss": 1.1261,
"step": 3361
},
{
"epoch": 0.8268568617806198,
"grad_norm": 3.6031112670898438,
"learning_rate": 9.640620361203763e-06,
"loss": 1.2661,
"step": 3362
},
{
"epoch": 0.8271028037383178,
"grad_norm": 3.520191192626953,
"learning_rate": 9.64031751829393e-06,
"loss": 1.1735,
"step": 3363
},
{
"epoch": 0.8273487456960157,
"grad_norm": 3.86858868598938,
"learning_rate": 9.640014552598351e-06,
"loss": 1.3994,
"step": 3364
},
{
"epoch": 0.8275946876537137,
"grad_norm": 3.7166342735290527,
"learning_rate": 9.639711464125038e-06,
"loss": 1.3189,
"step": 3365
},
{
"epoch": 0.8278406296114117,
"grad_norm": 3.435774803161621,
"learning_rate": 9.639408252882013e-06,
"loss": 1.1642,
"step": 3366
},
{
"epoch": 0.8280865715691097,
"grad_norm": 3.484919309616089,
"learning_rate": 9.6391049188773e-06,
"loss": 1.1961,
"step": 3367
},
{
"epoch": 0.8283325135268077,
"grad_norm": 3.8796908855438232,
"learning_rate": 9.638801462118924e-06,
"loss": 1.3533,
"step": 3368
},
{
"epoch": 0.8285784554845057,
"grad_norm": 3.630213975906372,
"learning_rate": 9.638497882614915e-06,
"loss": 1.1892,
"step": 3369
},
{
"epoch": 0.8288243974422036,
"grad_norm": 3.451148748397827,
"learning_rate": 9.638194180373307e-06,
"loss": 1.3325,
"step": 3370
},
{
"epoch": 0.8290703393999016,
"grad_norm": 3.386863946914673,
"learning_rate": 9.637890355402135e-06,
"loss": 1.1343,
"step": 3371
},
{
"epoch": 0.8293162813575996,
"grad_norm": 3.3398525714874268,
"learning_rate": 9.63758640770944e-06,
"loss": 1.1579,
"step": 3372
},
{
"epoch": 0.8295622233152976,
"grad_norm": 3.3334174156188965,
"learning_rate": 9.637282337303262e-06,
"loss": 1.1439,
"step": 3373
},
{
"epoch": 0.8298081652729956,
"grad_norm": 3.8025968074798584,
"learning_rate": 9.63697814419165e-06,
"loss": 1.2485,
"step": 3374
},
{
"epoch": 0.8300541072306935,
"grad_norm": 3.6072158813476562,
"learning_rate": 9.636673828382647e-06,
"loss": 1.2283,
"step": 3375
},
{
"epoch": 0.8303000491883915,
"grad_norm": 3.4876487255096436,
"learning_rate": 9.636369389884316e-06,
"loss": 1.1595,
"step": 3376
},
{
"epoch": 0.8305459911460895,
"grad_norm": 3.93485689163208,
"learning_rate": 9.636064828704702e-06,
"loss": 1.2994,
"step": 3377
},
{
"epoch": 0.8307919331037875,
"grad_norm": 3.883328676223755,
"learning_rate": 9.635760144851869e-06,
"loss": 1.2775,
"step": 3378
},
{
"epoch": 0.8310378750614855,
"grad_norm": 3.6326966285705566,
"learning_rate": 9.635455338333878e-06,
"loss": 1.2509,
"step": 3379
},
{
"epoch": 0.8312838170191835,
"grad_norm": 3.177799701690674,
"learning_rate": 9.635150409158798e-06,
"loss": 1.1336,
"step": 3380
},
{
"epoch": 0.8315297589768814,
"grad_norm": 3.483607530593872,
"learning_rate": 9.634845357334692e-06,
"loss": 1.2849,
"step": 3381
},
{
"epoch": 0.8317757009345794,
"grad_norm": 3.381582736968994,
"learning_rate": 9.634540182869634e-06,
"loss": 1.156,
"step": 3382
},
{
"epoch": 0.8320216428922774,
"grad_norm": 3.7483794689178467,
"learning_rate": 9.6342348857717e-06,
"loss": 1.2107,
"step": 3383
},
{
"epoch": 0.8322675848499754,
"grad_norm": 3.719531774520874,
"learning_rate": 9.63392946604897e-06,
"loss": 1.4162,
"step": 3384
},
{
"epoch": 0.8325135268076734,
"grad_norm": 3.4329404830932617,
"learning_rate": 9.63362392370952e-06,
"loss": 1.2373,
"step": 3385
},
{
"epoch": 0.8327594687653713,
"grad_norm": 3.391448974609375,
"learning_rate": 9.63331825876144e-06,
"loss": 1.0625,
"step": 3386
},
{
"epoch": 0.8330054107230693,
"grad_norm": 3.5516183376312256,
"learning_rate": 9.633012471212818e-06,
"loss": 1.3674,
"step": 3387
},
{
"epoch": 0.8332513526807673,
"grad_norm": 3.676013469696045,
"learning_rate": 9.63270656107174e-06,
"loss": 1.2744,
"step": 3388
},
{
"epoch": 0.8334972946384653,
"grad_norm": 3.23000431060791,
"learning_rate": 9.632400528346307e-06,
"loss": 1.262,
"step": 3389
},
{
"epoch": 0.8337432365961633,
"grad_norm": 3.779876232147217,
"learning_rate": 9.632094373044614e-06,
"loss": 1.1747,
"step": 3390
},
{
"epoch": 0.8339891785538613,
"grad_norm": 3.152782440185547,
"learning_rate": 9.631788095174762e-06,
"loss": 1.217,
"step": 3391
},
{
"epoch": 0.8342351205115592,
"grad_norm": 3.870347261428833,
"learning_rate": 9.631481694744854e-06,
"loss": 1.3464,
"step": 3392
},
{
"epoch": 0.8344810624692572,
"grad_norm": 3.6326518058776855,
"learning_rate": 9.631175171763e-06,
"loss": 1.1669,
"step": 3393
},
{
"epoch": 0.8347270044269552,
"grad_norm": 3.376915454864502,
"learning_rate": 9.63086852623731e-06,
"loss": 1.1817,
"step": 3394
},
{
"epoch": 0.8349729463846532,
"grad_norm": 3.5328361988067627,
"learning_rate": 9.630561758175897e-06,
"loss": 1.1465,
"step": 3395
},
{
"epoch": 0.8352188883423513,
"grad_norm": 3.390305519104004,
"learning_rate": 9.630254867586878e-06,
"loss": 1.259,
"step": 3396
},
{
"epoch": 0.8354648303000491,
"grad_norm": 3.5631613731384277,
"learning_rate": 9.629947854478378e-06,
"loss": 1.1777,
"step": 3397
},
{
"epoch": 0.8357107722577471,
"grad_norm": 3.8190736770629883,
"learning_rate": 9.629640718858515e-06,
"loss": 1.2226,
"step": 3398
},
{
"epoch": 0.8359567142154452,
"grad_norm": 3.565189838409424,
"learning_rate": 9.62933346073542e-06,
"loss": 1.2569,
"step": 3399
},
{
"epoch": 0.8362026561731432,
"grad_norm": 3.2934374809265137,
"learning_rate": 9.62902608011722e-06,
"loss": 1.0893,
"step": 3400
},
{
"epoch": 0.8364485981308412,
"grad_norm": 3.7364768981933594,
"learning_rate": 9.628718577012051e-06,
"loss": 1.3516,
"step": 3401
},
{
"epoch": 0.836694540088539,
"grad_norm": 3.9023499488830566,
"learning_rate": 9.628410951428049e-06,
"loss": 1.3629,
"step": 3402
},
{
"epoch": 0.836940482046237,
"grad_norm": 3.6791090965270996,
"learning_rate": 9.628103203373351e-06,
"loss": 1.2087,
"step": 3403
},
{
"epoch": 0.8371864240039351,
"grad_norm": 3.4665589332580566,
"learning_rate": 9.627795332856107e-06,
"loss": 1.1233,
"step": 3404
},
{
"epoch": 0.8374323659616331,
"grad_norm": 3.7642626762390137,
"learning_rate": 9.627487339884457e-06,
"loss": 1.2806,
"step": 3405
},
{
"epoch": 0.8376783079193311,
"grad_norm": 3.523003339767456,
"learning_rate": 9.627179224466554e-06,
"loss": 1.1759,
"step": 3406
},
{
"epoch": 0.8379242498770291,
"grad_norm": 3.626258611679077,
"learning_rate": 9.62687098661055e-06,
"loss": 1.2727,
"step": 3407
},
{
"epoch": 0.838170191834727,
"grad_norm": 3.6509013175964355,
"learning_rate": 9.6265626263246e-06,
"loss": 1.3114,
"step": 3408
},
{
"epoch": 0.838416133792425,
"grad_norm": 3.5062689781188965,
"learning_rate": 9.626254143616868e-06,
"loss": 1.2152,
"step": 3409
},
{
"epoch": 0.838662075750123,
"grad_norm": 3.637770175933838,
"learning_rate": 9.625945538495511e-06,
"loss": 1.2282,
"step": 3410
},
{
"epoch": 0.838908017707821,
"grad_norm": 3.829594850540161,
"learning_rate": 9.625636810968697e-06,
"loss": 1.2468,
"step": 3411
},
{
"epoch": 0.839153959665519,
"grad_norm": 3.714613914489746,
"learning_rate": 9.625327961044596e-06,
"loss": 1.099,
"step": 3412
},
{
"epoch": 0.8393999016232169,
"grad_norm": 3.42441463470459,
"learning_rate": 9.62501898873138e-06,
"loss": 1.1276,
"step": 3413
},
{
"epoch": 0.8396458435809149,
"grad_norm": 3.2468767166137695,
"learning_rate": 9.624709894037224e-06,
"loss": 1.0446,
"step": 3414
},
{
"epoch": 0.8398917855386129,
"grad_norm": 3.7311007976531982,
"learning_rate": 9.624400676970308e-06,
"loss": 1.2669,
"step": 3415
},
{
"epoch": 0.8401377274963109,
"grad_norm": 3.671400547027588,
"learning_rate": 9.624091337538811e-06,
"loss": 1.081,
"step": 3416
},
{
"epoch": 0.8403836694540089,
"grad_norm": 3.6759839057922363,
"learning_rate": 9.623781875750922e-06,
"loss": 1.3575,
"step": 3417
},
{
"epoch": 0.8406296114117069,
"grad_norm": 3.984431505203247,
"learning_rate": 9.623472291614828e-06,
"loss": 1.1992,
"step": 3418
},
{
"epoch": 0.8408755533694048,
"grad_norm": 4.14825963973999,
"learning_rate": 9.623162585138721e-06,
"loss": 1.426,
"step": 3419
},
{
"epoch": 0.8411214953271028,
"grad_norm": 3.5330491065979004,
"learning_rate": 9.622852756330797e-06,
"loss": 1.2092,
"step": 3420
},
{
"epoch": 0.8413674372848008,
"grad_norm": 3.5095181465148926,
"learning_rate": 9.62254280519925e-06,
"loss": 1.1937,
"step": 3421
},
{
"epoch": 0.8416133792424988,
"grad_norm": 3.8394370079040527,
"learning_rate": 9.622232731752288e-06,
"loss": 1.2942,
"step": 3422
},
{
"epoch": 0.8418593212001968,
"grad_norm": 3.416160821914673,
"learning_rate": 9.62192253599811e-06,
"loss": 1.2546,
"step": 3423
},
{
"epoch": 0.8421052631578947,
"grad_norm": 3.6324386596679688,
"learning_rate": 9.621612217944927e-06,
"loss": 1.2618,
"step": 3424
},
{
"epoch": 0.8423512051155927,
"grad_norm": 3.1014630794525146,
"learning_rate": 9.621301777600952e-06,
"loss": 1.1079,
"step": 3425
},
{
"epoch": 0.8425971470732907,
"grad_norm": 3.5070250034332275,
"learning_rate": 9.620991214974394e-06,
"loss": 1.1056,
"step": 3426
},
{
"epoch": 0.8428430890309887,
"grad_norm": 3.618685245513916,
"learning_rate": 9.620680530073474e-06,
"loss": 1.115,
"step": 3427
},
{
"epoch": 0.8430890309886867,
"grad_norm": 3.652184247970581,
"learning_rate": 9.620369722906415e-06,
"loss": 1.2668,
"step": 3428
},
{
"epoch": 0.8433349729463846,
"grad_norm": 3.6702868938446045,
"learning_rate": 9.620058793481437e-06,
"loss": 1.2766,
"step": 3429
},
{
"epoch": 0.8435809149040826,
"grad_norm": 3.639071226119995,
"learning_rate": 9.61974774180677e-06,
"loss": 1.1666,
"step": 3430
},
{
"epoch": 0.8438268568617806,
"grad_norm": 3.7459042072296143,
"learning_rate": 9.619436567890642e-06,
"loss": 1.4068,
"step": 3431
},
{
"epoch": 0.8440727988194786,
"grad_norm": 3.7874209880828857,
"learning_rate": 9.619125271741293e-06,
"loss": 1.3119,
"step": 3432
},
{
"epoch": 0.8443187407771766,
"grad_norm": 3.8997390270233154,
"learning_rate": 9.61881385336695e-06,
"loss": 1.1866,
"step": 3433
},
{
"epoch": 0.8445646827348746,
"grad_norm": 3.845947265625,
"learning_rate": 9.618502312775864e-06,
"loss": 1.2411,
"step": 3434
},
{
"epoch": 0.8448106246925725,
"grad_norm": 3.9683985710144043,
"learning_rate": 9.618190649976271e-06,
"loss": 1.3017,
"step": 3435
},
{
"epoch": 0.8450565666502705,
"grad_norm": 3.4440512657165527,
"learning_rate": 9.617878864976422e-06,
"loss": 1.246,
"step": 3436
},
{
"epoch": 0.8453025086079685,
"grad_norm": 3.550762176513672,
"learning_rate": 9.617566957784563e-06,
"loss": 1.1624,
"step": 3437
},
{
"epoch": 0.8455484505656665,
"grad_norm": 3.6143157482147217,
"learning_rate": 9.617254928408952e-06,
"loss": 1.2772,
"step": 3438
},
{
"epoch": 0.8457943925233645,
"grad_norm": 3.5803792476654053,
"learning_rate": 9.616942776857842e-06,
"loss": 1.1786,
"step": 3439
},
{
"epoch": 0.8460403344810624,
"grad_norm": 3.5607428550720215,
"learning_rate": 9.616630503139498e-06,
"loss": 1.1296,
"step": 3440
},
{
"epoch": 0.8462862764387604,
"grad_norm": 4.056375026702881,
"learning_rate": 9.616318107262175e-06,
"loss": 1.397,
"step": 3441
},
{
"epoch": 0.8465322183964584,
"grad_norm": 3.51149845123291,
"learning_rate": 9.616005589234143e-06,
"loss": 1.305,
"step": 3442
},
{
"epoch": 0.8467781603541564,
"grad_norm": 3.709627151489258,
"learning_rate": 9.615692949063673e-06,
"loss": 1.2576,
"step": 3443
},
{
"epoch": 0.8470241023118544,
"grad_norm": 3.361829996109009,
"learning_rate": 9.615380186759035e-06,
"loss": 1.2303,
"step": 3444
},
{
"epoch": 0.8472700442695524,
"grad_norm": 3.6355934143066406,
"learning_rate": 9.615067302328507e-06,
"loss": 1.2167,
"step": 3445
},
{
"epoch": 0.8475159862272503,
"grad_norm": 3.4305505752563477,
"learning_rate": 9.614754295780367e-06,
"loss": 1.2007,
"step": 3446
},
{
"epoch": 0.8477619281849483,
"grad_norm": 3.576962471008301,
"learning_rate": 9.614441167122898e-06,
"loss": 1.2888,
"step": 3447
},
{
"epoch": 0.8480078701426463,
"grad_norm": 3.591891050338745,
"learning_rate": 9.614127916364383e-06,
"loss": 1.1729,
"step": 3448
},
{
"epoch": 0.8482538121003443,
"grad_norm": 3.830756425857544,
"learning_rate": 9.613814543513117e-06,
"loss": 1.2254,
"step": 3449
},
{
"epoch": 0.8484997540580423,
"grad_norm": 3.612420082092285,
"learning_rate": 9.613501048577385e-06,
"loss": 1.2074,
"step": 3450
},
{
"epoch": 0.8487456960157402,
"grad_norm": 3.673903226852417,
"learning_rate": 9.613187431565486e-06,
"loss": 1.3676,
"step": 3451
},
{
"epoch": 0.8489916379734382,
"grad_norm": 4.410651206970215,
"learning_rate": 9.61287369248572e-06,
"loss": 1.3405,
"step": 3452
},
{
"epoch": 0.8492375799311362,
"grad_norm": 3.7284915447235107,
"learning_rate": 9.612559831346385e-06,
"loss": 1.1308,
"step": 3453
},
{
"epoch": 0.8494835218888342,
"grad_norm": 3.49426007270813,
"learning_rate": 9.612245848155785e-06,
"loss": 1.1869,
"step": 3454
},
{
"epoch": 0.8497294638465323,
"grad_norm": 3.5660176277160645,
"learning_rate": 9.611931742922234e-06,
"loss": 1.2978,
"step": 3455
},
{
"epoch": 0.8499754058042303,
"grad_norm": 3.2111504077911377,
"learning_rate": 9.611617515654039e-06,
"loss": 1.1294,
"step": 3456
},
{
"epoch": 0.8502213477619281,
"grad_norm": 3.5200366973876953,
"learning_rate": 9.611303166359516e-06,
"loss": 1.1751,
"step": 3457
},
{
"epoch": 0.8504672897196262,
"grad_norm": 4.170815944671631,
"learning_rate": 9.610988695046982e-06,
"loss": 1.3342,
"step": 3458
},
{
"epoch": 0.8507132316773242,
"grad_norm": 3.9367728233337402,
"learning_rate": 9.610674101724759e-06,
"loss": 1.3668,
"step": 3459
},
{
"epoch": 0.8509591736350222,
"grad_norm": 3.3173227310180664,
"learning_rate": 9.610359386401172e-06,
"loss": 1.2705,
"step": 3460
},
{
"epoch": 0.8512051155927202,
"grad_norm": 3.8333606719970703,
"learning_rate": 9.610044549084546e-06,
"loss": 1.3747,
"step": 3461
},
{
"epoch": 0.8514510575504181,
"grad_norm": 3.445373296737671,
"learning_rate": 9.609729589783215e-06,
"loss": 1.1092,
"step": 3462
},
{
"epoch": 0.8516969995081161,
"grad_norm": 3.5416650772094727,
"learning_rate": 9.60941450850551e-06,
"loss": 1.2747,
"step": 3463
},
{
"epoch": 0.8519429414658141,
"grad_norm": 3.578150510787964,
"learning_rate": 9.609099305259771e-06,
"loss": 1.1798,
"step": 3464
},
{
"epoch": 0.8521888834235121,
"grad_norm": 3.781416654586792,
"learning_rate": 9.608783980054337e-06,
"loss": 1.4306,
"step": 3465
},
{
"epoch": 0.8524348253812101,
"grad_norm": 3.458303689956665,
"learning_rate": 9.608468532897552e-06,
"loss": 1.122,
"step": 3466
},
{
"epoch": 0.852680767338908,
"grad_norm": 3.6051833629608154,
"learning_rate": 9.608152963797763e-06,
"loss": 1.2912,
"step": 3467
},
{
"epoch": 0.852926709296606,
"grad_norm": 3.294508934020996,
"learning_rate": 9.607837272763321e-06,
"loss": 1.0953,
"step": 3468
},
{
"epoch": 0.853172651254304,
"grad_norm": 3.443876028060913,
"learning_rate": 9.607521459802577e-06,
"loss": 1.0717,
"step": 3469
},
{
"epoch": 0.853418593212002,
"grad_norm": 3.7158493995666504,
"learning_rate": 9.607205524923889e-06,
"loss": 1.2365,
"step": 3470
},
{
"epoch": 0.8536645351697,
"grad_norm": 3.5411548614501953,
"learning_rate": 9.606889468135617e-06,
"loss": 1.1312,
"step": 3471
},
{
"epoch": 0.853910477127398,
"grad_norm": 3.327141284942627,
"learning_rate": 9.606573289446123e-06,
"loss": 1.1484,
"step": 3472
},
{
"epoch": 0.8541564190850959,
"grad_norm": 3.4438464641571045,
"learning_rate": 9.606256988863777e-06,
"loss": 1.0844,
"step": 3473
},
{
"epoch": 0.8544023610427939,
"grad_norm": 3.4056637287139893,
"learning_rate": 9.605940566396944e-06,
"loss": 1.1489,
"step": 3474
},
{
"epoch": 0.8546483030004919,
"grad_norm": 3.910409450531006,
"learning_rate": 9.605624022054e-06,
"loss": 1.3951,
"step": 3475
},
{
"epoch": 0.8548942449581899,
"grad_norm": 3.3325202465057373,
"learning_rate": 9.605307355843317e-06,
"loss": 1.0201,
"step": 3476
},
{
"epoch": 0.8551401869158879,
"grad_norm": 3.4199912548065186,
"learning_rate": 9.604990567773277e-06,
"loss": 1.1711,
"step": 3477
},
{
"epoch": 0.8553861288735858,
"grad_norm": 3.3834235668182373,
"learning_rate": 9.604673657852264e-06,
"loss": 1.2202,
"step": 3478
},
{
"epoch": 0.8556320708312838,
"grad_norm": 3.5715177059173584,
"learning_rate": 9.60435662608866e-06,
"loss": 1.2088,
"step": 3479
},
{
"epoch": 0.8558780127889818,
"grad_norm": 3.355659246444702,
"learning_rate": 9.604039472490856e-06,
"loss": 1.1454,
"step": 3480
},
{
"epoch": 0.8561239547466798,
"grad_norm": 3.6049351692199707,
"learning_rate": 9.603722197067243e-06,
"loss": 1.243,
"step": 3481
},
{
"epoch": 0.8563698967043778,
"grad_norm": 3.7825331687927246,
"learning_rate": 9.603404799826217e-06,
"loss": 1.2284,
"step": 3482
},
{
"epoch": 0.8566158386620758,
"grad_norm": 3.7321994304656982,
"learning_rate": 9.603087280776178e-06,
"loss": 1.2936,
"step": 3483
},
{
"epoch": 0.8568617806197737,
"grad_norm": 3.5653882026672363,
"learning_rate": 9.602769639925524e-06,
"loss": 1.2676,
"step": 3484
},
{
"epoch": 0.8571077225774717,
"grad_norm": 3.6348471641540527,
"learning_rate": 9.602451877282663e-06,
"loss": 1.2907,
"step": 3485
},
{
"epoch": 0.8573536645351697,
"grad_norm": 3.3486251831054688,
"learning_rate": 9.602133992856004e-06,
"loss": 1.2133,
"step": 3486
},
{
"epoch": 0.8575996064928677,
"grad_norm": 3.4686672687530518,
"learning_rate": 9.601815986653955e-06,
"loss": 1.0547,
"step": 3487
},
{
"epoch": 0.8578455484505657,
"grad_norm": 3.5435614585876465,
"learning_rate": 9.601497858684934e-06,
"loss": 1.1614,
"step": 3488
},
{
"epoch": 0.8580914904082636,
"grad_norm": 3.9154298305511475,
"learning_rate": 9.601179608957356e-06,
"loss": 1.3125,
"step": 3489
},
{
"epoch": 0.8583374323659616,
"grad_norm": 3.09698748588562,
"learning_rate": 9.600861237479644e-06,
"loss": 1.0416,
"step": 3490
},
{
"epoch": 0.8585833743236596,
"grad_norm": 3.2912986278533936,
"learning_rate": 9.600542744260221e-06,
"loss": 1.1819,
"step": 3491
},
{
"epoch": 0.8588293162813576,
"grad_norm": 3.7806413173675537,
"learning_rate": 9.600224129307517e-06,
"loss": 1.2713,
"step": 3492
},
{
"epoch": 0.8590752582390556,
"grad_norm": 3.730253219604492,
"learning_rate": 9.599905392629957e-06,
"loss": 1.3489,
"step": 3493
},
{
"epoch": 0.8593212001967536,
"grad_norm": 3.517989158630371,
"learning_rate": 9.599586534235983e-06,
"loss": 1.228,
"step": 3494
},
{
"epoch": 0.8595671421544515,
"grad_norm": 3.986025333404541,
"learning_rate": 9.599267554134029e-06,
"loss": 1.3104,
"step": 3495
},
{
"epoch": 0.8598130841121495,
"grad_norm": 3.301203727722168,
"learning_rate": 9.598948452332531e-06,
"loss": 1.0516,
"step": 3496
},
{
"epoch": 0.8600590260698475,
"grad_norm": 3.6058707237243652,
"learning_rate": 9.598629228839938e-06,
"loss": 1.2306,
"step": 3497
},
{
"epoch": 0.8603049680275455,
"grad_norm": 3.4076058864593506,
"learning_rate": 9.598309883664695e-06,
"loss": 1.1167,
"step": 3498
},
{
"epoch": 0.8605509099852435,
"grad_norm": 3.3237013816833496,
"learning_rate": 9.597990416815253e-06,
"loss": 1.0463,
"step": 3499
},
{
"epoch": 0.8607968519429414,
"grad_norm": 4.048042297363281,
"learning_rate": 9.597670828300062e-06,
"loss": 1.3372,
"step": 3500
},
{
"epoch": 0.8607968519429414,
"eval_loss": 1.2633955478668213,
"eval_runtime": 13.7093,
"eval_samples_per_second": 29.177,
"eval_steps_per_second": 3.647,
"step": 3500
},
{
"epoch": 0.8610427939006394,
"grad_norm": 3.518078088760376,
"learning_rate": 9.597351118127584e-06,
"loss": 1.0962,
"step": 3501
},
{
"epoch": 0.8612887358583374,
"grad_norm": 3.2191073894500732,
"learning_rate": 9.597031286306275e-06,
"loss": 1.1783,
"step": 3502
},
{
"epoch": 0.8615346778160354,
"grad_norm": 3.7829697132110596,
"learning_rate": 9.596711332844598e-06,
"loss": 1.1646,
"step": 3503
},
{
"epoch": 0.8617806197737334,
"grad_norm": 3.553605318069458,
"learning_rate": 9.59639125775102e-06,
"loss": 1.2028,
"step": 3504
},
{
"epoch": 0.8620265617314313,
"grad_norm": 3.673516273498535,
"learning_rate": 9.59607106103401e-06,
"loss": 1.1612,
"step": 3505
},
{
"epoch": 0.8622725036891293,
"grad_norm": 3.6929807662963867,
"learning_rate": 9.595750742702041e-06,
"loss": 1.2035,
"step": 3506
},
{
"epoch": 0.8625184456468273,
"grad_norm": 3.920149564743042,
"learning_rate": 9.595430302763589e-06,
"loss": 1.2906,
"step": 3507
},
{
"epoch": 0.8627643876045253,
"grad_norm": 3.487312078475952,
"learning_rate": 9.595109741227133e-06,
"loss": 1.2707,
"step": 3508
},
{
"epoch": 0.8630103295622233,
"grad_norm": 3.543912172317505,
"learning_rate": 9.594789058101154e-06,
"loss": 1.1744,
"step": 3509
},
{
"epoch": 0.8632562715199213,
"grad_norm": 3.452489137649536,
"learning_rate": 9.59446825339414e-06,
"loss": 1.1751,
"step": 3510
},
{
"epoch": 0.8635022134776192,
"grad_norm": 3.31724214553833,
"learning_rate": 9.594147327114575e-06,
"loss": 1.0873,
"step": 3511
},
{
"epoch": 0.8637481554353172,
"grad_norm": 3.4137630462646484,
"learning_rate": 9.593826279270956e-06,
"loss": 1.1869,
"step": 3512
},
{
"epoch": 0.8639940973930152,
"grad_norm": 3.4184494018554688,
"learning_rate": 9.593505109871777e-06,
"loss": 1.1327,
"step": 3513
},
{
"epoch": 0.8642400393507133,
"grad_norm": 3.4451181888580322,
"learning_rate": 9.593183818925536e-06,
"loss": 1.2994,
"step": 3514
},
{
"epoch": 0.8644859813084113,
"grad_norm": 3.12957501411438,
"learning_rate": 9.592862406440734e-06,
"loss": 1.0897,
"step": 3515
},
{
"epoch": 0.8647319232661091,
"grad_norm": 3.6898419857025146,
"learning_rate": 9.592540872425873e-06,
"loss": 1.1213,
"step": 3516
},
{
"epoch": 0.8649778652238072,
"grad_norm": 3.2780959606170654,
"learning_rate": 9.592219216889467e-06,
"loss": 1.0376,
"step": 3517
},
{
"epoch": 0.8652238071815052,
"grad_norm": 3.5527305603027344,
"learning_rate": 9.591897439840024e-06,
"loss": 1.2379,
"step": 3518
},
{
"epoch": 0.8654697491392032,
"grad_norm": 3.6414365768432617,
"learning_rate": 9.591575541286056e-06,
"loss": 1.2242,
"step": 3519
},
{
"epoch": 0.8657156910969012,
"grad_norm": 3.4203085899353027,
"learning_rate": 9.591253521236085e-06,
"loss": 1.1152,
"step": 3520
},
{
"epoch": 0.8659616330545992,
"grad_norm": 3.730163335800171,
"learning_rate": 9.590931379698629e-06,
"loss": 1.319,
"step": 3521
},
{
"epoch": 0.8662075750122971,
"grad_norm": 4.073503017425537,
"learning_rate": 9.590609116682214e-06,
"loss": 1.4113,
"step": 3522
},
{
"epoch": 0.8664535169699951,
"grad_norm": 4.051136493682861,
"learning_rate": 9.590286732195367e-06,
"loss": 1.3105,
"step": 3523
},
{
"epoch": 0.8666994589276931,
"grad_norm": 4.103650093078613,
"learning_rate": 9.589964226246615e-06,
"loss": 1.2329,
"step": 3524
},
{
"epoch": 0.8669454008853911,
"grad_norm": 3.2931747436523438,
"learning_rate": 9.589641598844497e-06,
"loss": 1.1532,
"step": 3525
},
{
"epoch": 0.8671913428430891,
"grad_norm": 3.65830135345459,
"learning_rate": 9.589318849997548e-06,
"loss": 1.2069,
"step": 3526
},
{
"epoch": 0.867437284800787,
"grad_norm": 3.311760187149048,
"learning_rate": 9.588995979714306e-06,
"loss": 1.148,
"step": 3527
},
{
"epoch": 0.867683226758485,
"grad_norm": 3.636401414871216,
"learning_rate": 9.588672988003316e-06,
"loss": 1.1383,
"step": 3528
},
{
"epoch": 0.867929168716183,
"grad_norm": 3.6535661220550537,
"learning_rate": 9.588349874873126e-06,
"loss": 1.2579,
"step": 3529
},
{
"epoch": 0.868175110673881,
"grad_norm": 3.3090920448303223,
"learning_rate": 9.588026640332285e-06,
"loss": 1.107,
"step": 3530
},
{
"epoch": 0.868421052631579,
"grad_norm": 3.49983811378479,
"learning_rate": 9.587703284389343e-06,
"loss": 1.2165,
"step": 3531
},
{
"epoch": 0.8686669945892769,
"grad_norm": 3.210294246673584,
"learning_rate": 9.587379807052858e-06,
"loss": 1.1411,
"step": 3532
},
{
"epoch": 0.8689129365469749,
"grad_norm": 3.3888683319091797,
"learning_rate": 9.587056208331394e-06,
"loss": 1.1783,
"step": 3533
},
{
"epoch": 0.8691588785046729,
"grad_norm": 3.6505753993988037,
"learning_rate": 9.586732488233506e-06,
"loss": 1.3782,
"step": 3534
},
{
"epoch": 0.8694048204623709,
"grad_norm": 3.4986047744750977,
"learning_rate": 9.586408646767764e-06,
"loss": 1.2199,
"step": 3535
},
{
"epoch": 0.8696507624200689,
"grad_norm": 3.5368027687072754,
"learning_rate": 9.586084683942738e-06,
"loss": 1.1616,
"step": 3536
},
{
"epoch": 0.8698967043777669,
"grad_norm": 3.4359524250030518,
"learning_rate": 9.585760599766997e-06,
"loss": 1.2612,
"step": 3537
},
{
"epoch": 0.8701426463354648,
"grad_norm": 3.8198323249816895,
"learning_rate": 9.585436394249119e-06,
"loss": 1.2351,
"step": 3538
},
{
"epoch": 0.8703885882931628,
"grad_norm": 3.687746047973633,
"learning_rate": 9.585112067397682e-06,
"loss": 1.2212,
"step": 3539
},
{
"epoch": 0.8706345302508608,
"grad_norm": 3.873769760131836,
"learning_rate": 9.584787619221267e-06,
"loss": 1.3438,
"step": 3540
},
{
"epoch": 0.8708804722085588,
"grad_norm": 3.4317786693573,
"learning_rate": 9.58446304972846e-06,
"loss": 1.4619,
"step": 3541
},
{
"epoch": 0.8711264141662568,
"grad_norm": 3.5522406101226807,
"learning_rate": 9.58413835892785e-06,
"loss": 1.2324,
"step": 3542
},
{
"epoch": 0.8713723561239547,
"grad_norm": 3.642183780670166,
"learning_rate": 9.583813546828027e-06,
"loss": 1.2159,
"step": 3543
},
{
"epoch": 0.8716182980816527,
"grad_norm": 3.4028031826019287,
"learning_rate": 9.583488613437587e-06,
"loss": 1.1187,
"step": 3544
},
{
"epoch": 0.8718642400393507,
"grad_norm": 3.764350175857544,
"learning_rate": 9.583163558765127e-06,
"loss": 1.3047,
"step": 3545
},
{
"epoch": 0.8721101819970487,
"grad_norm": 3.258885383605957,
"learning_rate": 9.582838382819249e-06,
"loss": 1.1488,
"step": 3546
},
{
"epoch": 0.8723561239547467,
"grad_norm": 3.402204751968384,
"learning_rate": 9.582513085608557e-06,
"loss": 1.3494,
"step": 3547
},
{
"epoch": 0.8726020659124447,
"grad_norm": 3.6364927291870117,
"learning_rate": 9.582187667141658e-06,
"loss": 1.2024,
"step": 3548
},
{
"epoch": 0.8728480078701426,
"grad_norm": 3.7338478565216064,
"learning_rate": 9.581862127427164e-06,
"loss": 1.28,
"step": 3549
},
{
"epoch": 0.8730939498278406,
"grad_norm": 3.7250819206237793,
"learning_rate": 9.581536466473688e-06,
"loss": 1.1964,
"step": 3550
},
{
"epoch": 0.8733398917855386,
"grad_norm": 3.3996479511260986,
"learning_rate": 9.581210684289847e-06,
"loss": 1.1484,
"step": 3551
},
{
"epoch": 0.8735858337432366,
"grad_norm": 3.880577564239502,
"learning_rate": 9.580884780884261e-06,
"loss": 1.3718,
"step": 3552
},
{
"epoch": 0.8738317757009346,
"grad_norm": 3.493095874786377,
"learning_rate": 9.580558756265557e-06,
"loss": 1.1406,
"step": 3553
},
{
"epoch": 0.8740777176586325,
"grad_norm": 3.5428075790405273,
"learning_rate": 9.580232610442357e-06,
"loss": 1.0877,
"step": 3554
},
{
"epoch": 0.8743236596163305,
"grad_norm": 3.9956471920013428,
"learning_rate": 9.579906343423294e-06,
"loss": 1.29,
"step": 3555
},
{
"epoch": 0.8745696015740285,
"grad_norm": 3.8478176593780518,
"learning_rate": 9.579579955217001e-06,
"loss": 1.2434,
"step": 3556
},
{
"epoch": 0.8748155435317265,
"grad_norm": 3.621389389038086,
"learning_rate": 9.579253445832114e-06,
"loss": 1.2241,
"step": 3557
},
{
"epoch": 0.8750614854894245,
"grad_norm": 3.858661651611328,
"learning_rate": 9.578926815277272e-06,
"loss": 1.2199,
"step": 3558
},
{
"epoch": 0.8753074274471225,
"grad_norm": 4.466431140899658,
"learning_rate": 9.578600063561119e-06,
"loss": 1.2285,
"step": 3559
},
{
"epoch": 0.8755533694048204,
"grad_norm": 3.7060933113098145,
"learning_rate": 9.5782731906923e-06,
"loss": 1.1495,
"step": 3560
},
{
"epoch": 0.8757993113625184,
"grad_norm": 3.6119062900543213,
"learning_rate": 9.577946196679466e-06,
"loss": 1.2425,
"step": 3561
},
{
"epoch": 0.8760452533202164,
"grad_norm": 4.02792501449585,
"learning_rate": 9.577619081531266e-06,
"loss": 1.5054,
"step": 3562
},
{
"epoch": 0.8762911952779144,
"grad_norm": 3.761864185333252,
"learning_rate": 9.57729184525636e-06,
"loss": 1.267,
"step": 3563
},
{
"epoch": 0.8765371372356124,
"grad_norm": 3.48991322517395,
"learning_rate": 9.576964487863405e-06,
"loss": 1.0434,
"step": 3564
},
{
"epoch": 0.8767830791933103,
"grad_norm": 3.56832218170166,
"learning_rate": 9.576637009361061e-06,
"loss": 1.2461,
"step": 3565
},
{
"epoch": 0.8770290211510083,
"grad_norm": 3.4757187366485596,
"learning_rate": 9.576309409757997e-06,
"loss": 1.0954,
"step": 3566
},
{
"epoch": 0.8772749631087063,
"grad_norm": 3.5495965480804443,
"learning_rate": 9.575981689062879e-06,
"loss": 1.3102,
"step": 3567
},
{
"epoch": 0.8775209050664043,
"grad_norm": 3.4084348678588867,
"learning_rate": 9.575653847284379e-06,
"loss": 1.0965,
"step": 3568
},
{
"epoch": 0.8777668470241023,
"grad_norm": 3.586642026901245,
"learning_rate": 9.575325884431173e-06,
"loss": 1.2343,
"step": 3569
},
{
"epoch": 0.8780127889818002,
"grad_norm": 3.312760353088379,
"learning_rate": 9.574997800511939e-06,
"loss": 1.155,
"step": 3570
},
{
"epoch": 0.8782587309394982,
"grad_norm": 3.656358003616333,
"learning_rate": 9.574669595535355e-06,
"loss": 1.2074,
"step": 3571
},
{
"epoch": 0.8785046728971962,
"grad_norm": 4.046459674835205,
"learning_rate": 9.57434126951011e-06,
"loss": 1.2209,
"step": 3572
},
{
"epoch": 0.8787506148548943,
"grad_norm": 3.912782669067383,
"learning_rate": 9.574012822444892e-06,
"loss": 1.3071,
"step": 3573
},
{
"epoch": 0.8789965568125923,
"grad_norm": 3.881004810333252,
"learning_rate": 9.573684254348387e-06,
"loss": 1.2336,
"step": 3574
},
{
"epoch": 0.8792424987702903,
"grad_norm": 3.3354620933532715,
"learning_rate": 9.573355565229291e-06,
"loss": 1.0319,
"step": 3575
},
{
"epoch": 0.8794884407279882,
"grad_norm": 3.529043197631836,
"learning_rate": 9.573026755096305e-06,
"loss": 1.1771,
"step": 3576
},
{
"epoch": 0.8797343826856862,
"grad_norm": 3.588651418685913,
"learning_rate": 9.572697823958126e-06,
"loss": 1.2773,
"step": 3577
},
{
"epoch": 0.8799803246433842,
"grad_norm": 3.4134879112243652,
"learning_rate": 9.57236877182346e-06,
"loss": 1.2163,
"step": 3578
},
{
"epoch": 0.8802262666010822,
"grad_norm": 3.629589557647705,
"learning_rate": 9.572039598701011e-06,
"loss": 1.1494,
"step": 3579
},
{
"epoch": 0.8804722085587802,
"grad_norm": 3.66666316986084,
"learning_rate": 9.57171030459949e-06,
"loss": 1.2951,
"step": 3580
},
{
"epoch": 0.8807181505164781,
"grad_norm": 3.8492770195007324,
"learning_rate": 9.571380889527611e-06,
"loss": 1.3147,
"step": 3581
},
{
"epoch": 0.8809640924741761,
"grad_norm": 3.7097601890563965,
"learning_rate": 9.571051353494091e-06,
"loss": 1.2117,
"step": 3582
},
{
"epoch": 0.8812100344318741,
"grad_norm": 3.1909661293029785,
"learning_rate": 9.57072169650765e-06,
"loss": 1.0594,
"step": 3583
},
{
"epoch": 0.8814559763895721,
"grad_norm": 3.6498262882232666,
"learning_rate": 9.570391918577009e-06,
"loss": 1.1754,
"step": 3584
},
{
"epoch": 0.8817019183472701,
"grad_norm": 3.4765987396240234,
"learning_rate": 9.570062019710895e-06,
"loss": 1.1225,
"step": 3585
},
{
"epoch": 0.8819478603049681,
"grad_norm": 3.5514724254608154,
"learning_rate": 9.569731999918038e-06,
"loss": 1.2704,
"step": 3586
},
{
"epoch": 0.882193802262666,
"grad_norm": 3.5469961166381836,
"learning_rate": 9.56940185920717e-06,
"loss": 1.247,
"step": 3587
},
{
"epoch": 0.882439744220364,
"grad_norm": 3.8188278675079346,
"learning_rate": 9.569071597587027e-06,
"loss": 1.2944,
"step": 3588
},
{
"epoch": 0.882685686178062,
"grad_norm": 3.3976941108703613,
"learning_rate": 9.568741215066349e-06,
"loss": 1.2122,
"step": 3589
},
{
"epoch": 0.88293162813576,
"grad_norm": 3.2454452514648438,
"learning_rate": 9.568410711653876e-06,
"loss": 1.2469,
"step": 3590
},
{
"epoch": 0.883177570093458,
"grad_norm": 3.7621002197265625,
"learning_rate": 9.568080087358353e-06,
"loss": 1.2947,
"step": 3591
},
{
"epoch": 0.8834235120511559,
"grad_norm": 3.173442840576172,
"learning_rate": 9.567749342188533e-06,
"loss": 1.1932,
"step": 3592
},
{
"epoch": 0.8836694540088539,
"grad_norm": 3.4343206882476807,
"learning_rate": 9.567418476153161e-06,
"loss": 1.1508,
"step": 3593
},
{
"epoch": 0.8839153959665519,
"grad_norm": 3.8469834327697754,
"learning_rate": 9.567087489260997e-06,
"loss": 1.1973,
"step": 3594
},
{
"epoch": 0.8841613379242499,
"grad_norm": 4.167831897735596,
"learning_rate": 9.566756381520798e-06,
"loss": 1.2818,
"step": 3595
},
{
"epoch": 0.8844072798819479,
"grad_norm": 3.3074400424957275,
"learning_rate": 9.566425152941325e-06,
"loss": 1.1187,
"step": 3596
},
{
"epoch": 0.8846532218396459,
"grad_norm": 3.7723376750946045,
"learning_rate": 9.566093803531341e-06,
"loss": 1.1951,
"step": 3597
},
{
"epoch": 0.8848991637973438,
"grad_norm": 3.768890619277954,
"learning_rate": 9.565762333299616e-06,
"loss": 1.3481,
"step": 3598
},
{
"epoch": 0.8851451057550418,
"grad_norm": 3.592271089553833,
"learning_rate": 9.56543074225492e-06,
"loss": 1.1768,
"step": 3599
},
{
"epoch": 0.8853910477127398,
"grad_norm": 3.3280892372131348,
"learning_rate": 9.565099030406028e-06,
"loss": 1.1741,
"step": 3600
},
{
"epoch": 0.8856369896704378,
"grad_norm": 3.609210729598999,
"learning_rate": 9.564767197761715e-06,
"loss": 1.1822,
"step": 3601
},
{
"epoch": 0.8858829316281358,
"grad_norm": 3.4467432498931885,
"learning_rate": 9.564435244330764e-06,
"loss": 1.1778,
"step": 3602
},
{
"epoch": 0.8861288735858337,
"grad_norm": 3.437084913253784,
"learning_rate": 9.564103170121957e-06,
"loss": 1.2077,
"step": 3603
},
{
"epoch": 0.8863748155435317,
"grad_norm": 3.8949360847473145,
"learning_rate": 9.563770975144083e-06,
"loss": 1.2707,
"step": 3604
},
{
"epoch": 0.8866207575012297,
"grad_norm": 3.3527045249938965,
"learning_rate": 9.563438659405928e-06,
"loss": 1.056,
"step": 3605
},
{
"epoch": 0.8868666994589277,
"grad_norm": 3.5894086360931396,
"learning_rate": 9.56310622291629e-06,
"loss": 1.2729,
"step": 3606
},
{
"epoch": 0.8871126414166257,
"grad_norm": 3.478212594985962,
"learning_rate": 9.562773665683963e-06,
"loss": 1.3141,
"step": 3607
},
{
"epoch": 0.8873585833743236,
"grad_norm": 3.6984055042266846,
"learning_rate": 9.562440987717747e-06,
"loss": 1.2082,
"step": 3608
},
{
"epoch": 0.8876045253320216,
"grad_norm": 3.3572685718536377,
"learning_rate": 9.562108189026444e-06,
"loss": 1.2742,
"step": 3609
},
{
"epoch": 0.8878504672897196,
"grad_norm": 3.4459142684936523,
"learning_rate": 9.561775269618861e-06,
"loss": 1.1653,
"step": 3610
},
{
"epoch": 0.8880964092474176,
"grad_norm": 3.3658711910247803,
"learning_rate": 9.561442229503809e-06,
"loss": 1.2095,
"step": 3611
},
{
"epoch": 0.8883423512051156,
"grad_norm": 3.302816867828369,
"learning_rate": 9.561109068690098e-06,
"loss": 1.1401,
"step": 3612
},
{
"epoch": 0.8885882931628136,
"grad_norm": 3.382028341293335,
"learning_rate": 9.560775787186544e-06,
"loss": 1.3626,
"step": 3613
},
{
"epoch": 0.8888342351205115,
"grad_norm": 3.7356083393096924,
"learning_rate": 9.560442385001968e-06,
"loss": 1.2545,
"step": 3614
},
{
"epoch": 0.8890801770782095,
"grad_norm": 3.627584457397461,
"learning_rate": 9.560108862145188e-06,
"loss": 1.3291,
"step": 3615
},
{
"epoch": 0.8893261190359075,
"grad_norm": 3.361186981201172,
"learning_rate": 9.55977521862503e-06,
"loss": 1.2416,
"step": 3616
},
{
"epoch": 0.8895720609936055,
"grad_norm": 3.4848992824554443,
"learning_rate": 9.559441454450326e-06,
"loss": 1.215,
"step": 3617
},
{
"epoch": 0.8898180029513035,
"grad_norm": 4.122344017028809,
"learning_rate": 9.559107569629907e-06,
"loss": 1.2816,
"step": 3618
},
{
"epoch": 0.8900639449090014,
"grad_norm": 3.7953083515167236,
"learning_rate": 9.558773564172605e-06,
"loss": 1.123,
"step": 3619
},
{
"epoch": 0.8903098868666994,
"grad_norm": 3.754122734069824,
"learning_rate": 9.55843943808726e-06,
"loss": 1.4134,
"step": 3620
},
{
"epoch": 0.8905558288243974,
"grad_norm": 3.806171417236328,
"learning_rate": 9.55810519138271e-06,
"loss": 1.2293,
"step": 3621
},
{
"epoch": 0.8908017707820954,
"grad_norm": 3.564484119415283,
"learning_rate": 9.557770824067804e-06,
"loss": 1.2251,
"step": 3622
},
{
"epoch": 0.8910477127397934,
"grad_norm": 3.4693071842193604,
"learning_rate": 9.557436336151388e-06,
"loss": 1.1891,
"step": 3623
},
{
"epoch": 0.8912936546974914,
"grad_norm": 3.30888295173645,
"learning_rate": 9.55710172764231e-06,
"loss": 1.1186,
"step": 3624
},
{
"epoch": 0.8915395966551893,
"grad_norm": 3.473057270050049,
"learning_rate": 9.556766998549427e-06,
"loss": 1.2627,
"step": 3625
},
{
"epoch": 0.8917855386128873,
"grad_norm": 3.662726879119873,
"learning_rate": 9.556432148881595e-06,
"loss": 1.3051,
"step": 3626
},
{
"epoch": 0.8920314805705853,
"grad_norm": 3.4826040267944336,
"learning_rate": 9.556097178647677e-06,
"loss": 1.2212,
"step": 3627
},
{
"epoch": 0.8922774225282833,
"grad_norm": 3.7390823364257812,
"learning_rate": 9.555762087856531e-06,
"loss": 1.2847,
"step": 3628
},
{
"epoch": 0.8925233644859814,
"grad_norm": 3.8063580989837646,
"learning_rate": 9.555426876517029e-06,
"loss": 1.1332,
"step": 3629
},
{
"epoch": 0.8927693064436792,
"grad_norm": 3.459592819213867,
"learning_rate": 9.555091544638038e-06,
"loss": 1.2017,
"step": 3630
},
{
"epoch": 0.8930152484013772,
"grad_norm": 3.402221441268921,
"learning_rate": 9.554756092228432e-06,
"loss": 1.2648,
"step": 3631
},
{
"epoch": 0.8932611903590753,
"grad_norm": 3.700427770614624,
"learning_rate": 9.554420519297087e-06,
"loss": 1.2755,
"step": 3632
},
{
"epoch": 0.8935071323167733,
"grad_norm": 3.485463857650757,
"learning_rate": 9.554084825852883e-06,
"loss": 1.1332,
"step": 3633
},
{
"epoch": 0.8937530742744713,
"grad_norm": 3.472724199295044,
"learning_rate": 9.553749011904703e-06,
"loss": 1.1309,
"step": 3634
},
{
"epoch": 0.8939990162321693,
"grad_norm": 3.205716371536255,
"learning_rate": 9.55341307746143e-06,
"loss": 1.134,
"step": 3635
},
{
"epoch": 0.8942449581898672,
"grad_norm": 3.8189027309417725,
"learning_rate": 9.553077022531956e-06,
"loss": 1.2492,
"step": 3636
},
{
"epoch": 0.8944909001475652,
"grad_norm": 3.450049877166748,
"learning_rate": 9.552740847125174e-06,
"loss": 1.1597,
"step": 3637
},
{
"epoch": 0.8947368421052632,
"grad_norm": 3.9682462215423584,
"learning_rate": 9.552404551249977e-06,
"loss": 1.4037,
"step": 3638
},
{
"epoch": 0.8949827840629612,
"grad_norm": 3.5306830406188965,
"learning_rate": 9.552068134915263e-06,
"loss": 1.1533,
"step": 3639
},
{
"epoch": 0.8952287260206592,
"grad_norm": 3.536188840866089,
"learning_rate": 9.551731598129936e-06,
"loss": 1.0826,
"step": 3640
},
{
"epoch": 0.8954746679783571,
"grad_norm": 3.464604377746582,
"learning_rate": 9.551394940902902e-06,
"loss": 1.2059,
"step": 3641
},
{
"epoch": 0.8957206099360551,
"grad_norm": 3.586876630783081,
"learning_rate": 9.551058163243065e-06,
"loss": 1.2567,
"step": 3642
},
{
"epoch": 0.8959665518937531,
"grad_norm": 3.6346936225891113,
"learning_rate": 9.55072126515934e-06,
"loss": 1.2349,
"step": 3643
},
{
"epoch": 0.8962124938514511,
"grad_norm": 3.550907850265503,
"learning_rate": 9.55038424666064e-06,
"loss": 1.2159,
"step": 3644
},
{
"epoch": 0.8964584358091491,
"grad_norm": 3.9755165576934814,
"learning_rate": 9.550047107755883e-06,
"loss": 1.1894,
"step": 3645
},
{
"epoch": 0.896704377766847,
"grad_norm": 3.3944215774536133,
"learning_rate": 9.54970984845399e-06,
"loss": 1.1461,
"step": 3646
},
{
"epoch": 0.896950319724545,
"grad_norm": 3.388507843017578,
"learning_rate": 9.549372468763885e-06,
"loss": 1.1652,
"step": 3647
},
{
"epoch": 0.897196261682243,
"grad_norm": 3.6089677810668945,
"learning_rate": 9.549034968694494e-06,
"loss": 1.1807,
"step": 3648
},
{
"epoch": 0.897442203639941,
"grad_norm": 3.8819568157196045,
"learning_rate": 9.54869734825475e-06,
"loss": 1.3524,
"step": 3649
},
{
"epoch": 0.897688145597639,
"grad_norm": 3.5269556045532227,
"learning_rate": 9.548359607453586e-06,
"loss": 1.2568,
"step": 3650
},
{
"epoch": 0.897934087555337,
"grad_norm": 3.590235710144043,
"learning_rate": 9.548021746299937e-06,
"loss": 1.2628,
"step": 3651
},
{
"epoch": 0.8981800295130349,
"grad_norm": 3.830281972885132,
"learning_rate": 9.547683764802745e-06,
"loss": 1.2499,
"step": 3652
},
{
"epoch": 0.8984259714707329,
"grad_norm": 3.5138533115386963,
"learning_rate": 9.547345662970952e-06,
"loss": 1.2343,
"step": 3653
},
{
"epoch": 0.8986719134284309,
"grad_norm": 3.90680193901062,
"learning_rate": 9.547007440813504e-06,
"loss": 1.2538,
"step": 3654
},
{
"epoch": 0.8989178553861289,
"grad_norm": 3.8625266551971436,
"learning_rate": 9.546669098339351e-06,
"loss": 1.2443,
"step": 3655
},
{
"epoch": 0.8991637973438269,
"grad_norm": 3.465038299560547,
"learning_rate": 9.546330635557449e-06,
"loss": 1.2026,
"step": 3656
},
{
"epoch": 0.8994097393015248,
"grad_norm": 3.3339860439300537,
"learning_rate": 9.545992052476748e-06,
"loss": 1.1484,
"step": 3657
},
{
"epoch": 0.8996556812592228,
"grad_norm": 3.579580307006836,
"learning_rate": 9.545653349106214e-06,
"loss": 1.19,
"step": 3658
},
{
"epoch": 0.8999016232169208,
"grad_norm": 3.585641384124756,
"learning_rate": 9.545314525454803e-06,
"loss": 1.174,
"step": 3659
},
{
"epoch": 0.9001475651746188,
"grad_norm": 4.084930419921875,
"learning_rate": 9.544975581531482e-06,
"loss": 1.3861,
"step": 3660
},
{
"epoch": 0.9003935071323168,
"grad_norm": 3.266632556915283,
"learning_rate": 9.544636517345222e-06,
"loss": 1.1636,
"step": 3661
},
{
"epoch": 0.9006394490900148,
"grad_norm": 3.700571060180664,
"learning_rate": 9.544297332904995e-06,
"loss": 1.1749,
"step": 3662
},
{
"epoch": 0.9008853910477127,
"grad_norm": 3.584488868713379,
"learning_rate": 9.543958028219772e-06,
"loss": 1.2008,
"step": 3663
},
{
"epoch": 0.9011313330054107,
"grad_norm": 3.1556942462921143,
"learning_rate": 9.543618603298536e-06,
"loss": 1.0967,
"step": 3664
},
{
"epoch": 0.9013772749631087,
"grad_norm": 3.7243781089782715,
"learning_rate": 9.543279058150265e-06,
"loss": 1.2094,
"step": 3665
},
{
"epoch": 0.9016232169208067,
"grad_norm": 3.3711061477661133,
"learning_rate": 9.542939392783945e-06,
"loss": 1.198,
"step": 3666
},
{
"epoch": 0.9018691588785047,
"grad_norm": 3.582303762435913,
"learning_rate": 9.542599607208563e-06,
"loss": 1.2118,
"step": 3667
},
{
"epoch": 0.9021151008362026,
"grad_norm": 4.085671424865723,
"learning_rate": 9.542259701433111e-06,
"loss": 1.3369,
"step": 3668
},
{
"epoch": 0.9023610427939006,
"grad_norm": 3.3851683139801025,
"learning_rate": 9.541919675466583e-06,
"loss": 1.257,
"step": 3669
},
{
"epoch": 0.9026069847515986,
"grad_norm": 3.4781084060668945,
"learning_rate": 9.541579529317974e-06,
"loss": 1.2818,
"step": 3670
},
{
"epoch": 0.9028529267092966,
"grad_norm": 3.4545798301696777,
"learning_rate": 9.54123926299629e-06,
"loss": 1.1466,
"step": 3671
},
{
"epoch": 0.9030988686669946,
"grad_norm": 3.8504674434661865,
"learning_rate": 9.540898876510527e-06,
"loss": 1.244,
"step": 3672
},
{
"epoch": 0.9033448106246925,
"grad_norm": 3.661174774169922,
"learning_rate": 9.540558369869698e-06,
"loss": 1.237,
"step": 3673
},
{
"epoch": 0.9035907525823905,
"grad_norm": 4.375135898590088,
"learning_rate": 9.540217743082811e-06,
"loss": 1.285,
"step": 3674
},
{
"epoch": 0.9038366945400885,
"grad_norm": 3.705709934234619,
"learning_rate": 9.539876996158879e-06,
"loss": 1.2623,
"step": 3675
},
{
"epoch": 0.9040826364977865,
"grad_norm": 3.3671743869781494,
"learning_rate": 9.539536129106918e-06,
"loss": 1.2552,
"step": 3676
},
{
"epoch": 0.9043285784554845,
"grad_norm": 3.4459829330444336,
"learning_rate": 9.539195141935949e-06,
"loss": 1.0862,
"step": 3677
},
{
"epoch": 0.9045745204131825,
"grad_norm": 3.558809757232666,
"learning_rate": 9.538854034654991e-06,
"loss": 1.2825,
"step": 3678
},
{
"epoch": 0.9048204623708804,
"grad_norm": 3.3278114795684814,
"learning_rate": 9.538512807273076e-06,
"loss": 1.1142,
"step": 3679
},
{
"epoch": 0.9050664043285784,
"grad_norm": 3.302722930908203,
"learning_rate": 9.538171459799228e-06,
"loss": 1.3482,
"step": 3680
},
{
"epoch": 0.9053123462862764,
"grad_norm": 3.7310121059417725,
"learning_rate": 9.537829992242482e-06,
"loss": 1.2879,
"step": 3681
},
{
"epoch": 0.9055582882439744,
"grad_norm": 3.706099510192871,
"learning_rate": 9.537488404611872e-06,
"loss": 1.2984,
"step": 3682
},
{
"epoch": 0.9058042302016724,
"grad_norm": 3.3489394187927246,
"learning_rate": 9.537146696916439e-06,
"loss": 1.2068,
"step": 3683
},
{
"epoch": 0.9060501721593703,
"grad_norm": 3.2945239543914795,
"learning_rate": 9.536804869165221e-06,
"loss": 1.1744,
"step": 3684
},
{
"epoch": 0.9062961141170683,
"grad_norm": 3.2435789108276367,
"learning_rate": 9.536462921367265e-06,
"loss": 1.196,
"step": 3685
},
{
"epoch": 0.9065420560747663,
"grad_norm": 3.350149393081665,
"learning_rate": 9.53612085353162e-06,
"loss": 1.1232,
"step": 3686
},
{
"epoch": 0.9067879980324643,
"grad_norm": 3.4470150470733643,
"learning_rate": 9.535778665667334e-06,
"loss": 1.0463,
"step": 3687
},
{
"epoch": 0.9070339399901624,
"grad_norm": 4.065612316131592,
"learning_rate": 9.535436357783468e-06,
"loss": 1.2132,
"step": 3688
},
{
"epoch": 0.9072798819478604,
"grad_norm": 3.662580728530884,
"learning_rate": 9.535093929889072e-06,
"loss": 1.1241,
"step": 3689
},
{
"epoch": 0.9075258239055582,
"grad_norm": 3.8478949069976807,
"learning_rate": 9.534751381993213e-06,
"loss": 1.2966,
"step": 3690
},
{
"epoch": 0.9077717658632563,
"grad_norm": 3.6773204803466797,
"learning_rate": 9.53440871410495e-06,
"loss": 1.1712,
"step": 3691
},
{
"epoch": 0.9080177078209543,
"grad_norm": 3.568802833557129,
"learning_rate": 9.534065926233355e-06,
"loss": 1.1134,
"step": 3692
},
{
"epoch": 0.9082636497786523,
"grad_norm": 3.333939552307129,
"learning_rate": 9.533723018387496e-06,
"loss": 1.0495,
"step": 3693
},
{
"epoch": 0.9085095917363503,
"grad_norm": 3.820345640182495,
"learning_rate": 9.533379990576443e-06,
"loss": 1.3153,
"step": 3694
},
{
"epoch": 0.9087555336940482,
"grad_norm": 3.3256940841674805,
"learning_rate": 9.53303684280928e-06,
"loss": 1.2138,
"step": 3695
},
{
"epoch": 0.9090014756517462,
"grad_norm": 3.5115182399749756,
"learning_rate": 9.532693575095081e-06,
"loss": 1.2755,
"step": 3696
},
{
"epoch": 0.9092474176094442,
"grad_norm": 3.9968810081481934,
"learning_rate": 9.532350187442935e-06,
"loss": 1.1822,
"step": 3697
},
{
"epoch": 0.9094933595671422,
"grad_norm": 3.2059152126312256,
"learning_rate": 9.532006679861923e-06,
"loss": 1.1603,
"step": 3698
},
{
"epoch": 0.9097393015248402,
"grad_norm": 3.496220111846924,
"learning_rate": 9.531663052361135e-06,
"loss": 1.2047,
"step": 3699
},
{
"epoch": 0.9099852434825382,
"grad_norm": 3.6531107425689697,
"learning_rate": 9.531319304949664e-06,
"loss": 1.3449,
"step": 3700
},
{
"epoch": 0.9102311854402361,
"grad_norm": 3.455643892288208,
"learning_rate": 9.53097543763661e-06,
"loss": 1.2943,
"step": 3701
},
{
"epoch": 0.9104771273979341,
"grad_norm": 3.72200608253479,
"learning_rate": 9.530631450431065e-06,
"loss": 1.3238,
"step": 3702
},
{
"epoch": 0.9107230693556321,
"grad_norm": 3.4108450412750244,
"learning_rate": 9.530287343342136e-06,
"loss": 1.1788,
"step": 3703
},
{
"epoch": 0.9109690113133301,
"grad_norm": 3.8839097023010254,
"learning_rate": 9.529943116378926e-06,
"loss": 1.2378,
"step": 3704
},
{
"epoch": 0.9112149532710281,
"grad_norm": 3.8138790130615234,
"learning_rate": 9.529598769550545e-06,
"loss": 1.3521,
"step": 3705
},
{
"epoch": 0.911460895228726,
"grad_norm": 3.3680312633514404,
"learning_rate": 9.529254302866101e-06,
"loss": 1.1658,
"step": 3706
},
{
"epoch": 0.911706837186424,
"grad_norm": 3.9882829189300537,
"learning_rate": 9.528909716334716e-06,
"loss": 1.248,
"step": 3707
},
{
"epoch": 0.911952779144122,
"grad_norm": 3.6899056434631348,
"learning_rate": 9.528565009965502e-06,
"loss": 1.2175,
"step": 3708
},
{
"epoch": 0.91219872110182,
"grad_norm": 3.3671367168426514,
"learning_rate": 9.528220183767582e-06,
"loss": 1.176,
"step": 3709
},
{
"epoch": 0.912444663059518,
"grad_norm": 3.83066463470459,
"learning_rate": 9.527875237750079e-06,
"loss": 1.2376,
"step": 3710
},
{
"epoch": 0.9126906050172159,
"grad_norm": 3.7227232456207275,
"learning_rate": 9.527530171922123e-06,
"loss": 1.2275,
"step": 3711
},
{
"epoch": 0.9129365469749139,
"grad_norm": 3.7556822299957275,
"learning_rate": 9.527184986292843e-06,
"loss": 1.1755,
"step": 3712
},
{
"epoch": 0.9131824889326119,
"grad_norm": 3.2641546726226807,
"learning_rate": 9.526839680871373e-06,
"loss": 1.0847,
"step": 3713
},
{
"epoch": 0.9134284308903099,
"grad_norm": 3.5151054859161377,
"learning_rate": 9.52649425566685e-06,
"loss": 1.2111,
"step": 3714
},
{
"epoch": 0.9136743728480079,
"grad_norm": 3.720137357711792,
"learning_rate": 9.526148710688415e-06,
"loss": 1.4513,
"step": 3715
},
{
"epoch": 0.9139203148057059,
"grad_norm": 3.5734195709228516,
"learning_rate": 9.52580304594521e-06,
"loss": 1.0661,
"step": 3716
},
{
"epoch": 0.9141662567634038,
"grad_norm": 3.411285638809204,
"learning_rate": 9.525457261446382e-06,
"loss": 1.1395,
"step": 3717
},
{
"epoch": 0.9144121987211018,
"grad_norm": 3.5341696739196777,
"learning_rate": 9.525111357201081e-06,
"loss": 1.2131,
"step": 3718
},
{
"epoch": 0.9146581406787998,
"grad_norm": 3.789830446243286,
"learning_rate": 9.52476533321846e-06,
"loss": 1.3306,
"step": 3719
},
{
"epoch": 0.9149040826364978,
"grad_norm": 3.8217196464538574,
"learning_rate": 9.524419189507674e-06,
"loss": 1.1937,
"step": 3720
},
{
"epoch": 0.9151500245941958,
"grad_norm": 3.3808183670043945,
"learning_rate": 9.524072926077883e-06,
"loss": 1.2041,
"step": 3721
},
{
"epoch": 0.9153959665518937,
"grad_norm": 3.5508177280426025,
"learning_rate": 9.523726542938249e-06,
"loss": 1.3117,
"step": 3722
},
{
"epoch": 0.9156419085095917,
"grad_norm": 3.4444971084594727,
"learning_rate": 9.52338004009794e-06,
"loss": 1.1421,
"step": 3723
},
{
"epoch": 0.9158878504672897,
"grad_norm": 3.4307329654693604,
"learning_rate": 9.523033417566122e-06,
"loss": 1.1594,
"step": 3724
},
{
"epoch": 0.9161337924249877,
"grad_norm": 3.1649484634399414,
"learning_rate": 9.522686675351964e-06,
"loss": 1.1559,
"step": 3725
},
{
"epoch": 0.9163797343826857,
"grad_norm": 3.529099702835083,
"learning_rate": 9.522339813464647e-06,
"loss": 1.296,
"step": 3726
},
{
"epoch": 0.9166256763403837,
"grad_norm": 3.8384642601013184,
"learning_rate": 9.521992831913346e-06,
"loss": 1.2512,
"step": 3727
},
{
"epoch": 0.9168716182980816,
"grad_norm": 3.1574480533599854,
"learning_rate": 9.521645730707243e-06,
"loss": 1.079,
"step": 3728
},
{
"epoch": 0.9171175602557796,
"grad_norm": 3.2832865715026855,
"learning_rate": 9.52129850985552e-06,
"loss": 1.1362,
"step": 3729
},
{
"epoch": 0.9173635022134776,
"grad_norm": 3.6253788471221924,
"learning_rate": 9.52095116936737e-06,
"loss": 1.2271,
"step": 3730
},
{
"epoch": 0.9176094441711756,
"grad_norm": 3.5513031482696533,
"learning_rate": 9.520603709251981e-06,
"loss": 1.253,
"step": 3731
},
{
"epoch": 0.9178553861288736,
"grad_norm": 3.391063690185547,
"learning_rate": 9.520256129518547e-06,
"loss": 1.1777,
"step": 3732
},
{
"epoch": 0.9181013280865715,
"grad_norm": 3.5013644695281982,
"learning_rate": 9.519908430176262e-06,
"loss": 1.2618,
"step": 3733
},
{
"epoch": 0.9183472700442695,
"grad_norm": 3.8913919925689697,
"learning_rate": 9.519560611234331e-06,
"loss": 1.3364,
"step": 3734
},
{
"epoch": 0.9185932120019675,
"grad_norm": 3.13307785987854,
"learning_rate": 9.519212672701957e-06,
"loss": 1.1319,
"step": 3735
},
{
"epoch": 0.9188391539596655,
"grad_norm": 3.791264533996582,
"learning_rate": 9.518864614588346e-06,
"loss": 1.1454,
"step": 3736
},
{
"epoch": 0.9190850959173635,
"grad_norm": 3.5851056575775146,
"learning_rate": 9.518516436902705e-06,
"loss": 1.2388,
"step": 3737
},
{
"epoch": 0.9193310378750615,
"grad_norm": 3.1594293117523193,
"learning_rate": 9.518168139654251e-06,
"loss": 1.0678,
"step": 3738
},
{
"epoch": 0.9195769798327594,
"grad_norm": 3.159804582595825,
"learning_rate": 9.517819722852199e-06,
"loss": 1.2964,
"step": 3739
},
{
"epoch": 0.9198229217904574,
"grad_norm": 3.3892340660095215,
"learning_rate": 9.517471186505768e-06,
"loss": 1.2755,
"step": 3740
},
{
"epoch": 0.9200688637481554,
"grad_norm": 3.420387029647827,
"learning_rate": 9.51712253062418e-06,
"loss": 1.1174,
"step": 3741
},
{
"epoch": 0.9203148057058534,
"grad_norm": 3.6326241493225098,
"learning_rate": 9.516773755216662e-06,
"loss": 1.1196,
"step": 3742
},
{
"epoch": 0.9205607476635514,
"grad_norm": 3.02589750289917,
"learning_rate": 9.51642486029244e-06,
"loss": 0.9852,
"step": 3743
},
{
"epoch": 0.9208066896212493,
"grad_norm": 3.314985990524292,
"learning_rate": 9.516075845860749e-06,
"loss": 0.9922,
"step": 3744
},
{
"epoch": 0.9210526315789473,
"grad_norm": 3.530643939971924,
"learning_rate": 9.515726711930823e-06,
"loss": 1.1538,
"step": 3745
},
{
"epoch": 0.9212985735366453,
"grad_norm": 3.5981807708740234,
"learning_rate": 9.515377458511902e-06,
"loss": 1.232,
"step": 3746
},
{
"epoch": 0.9215445154943434,
"grad_norm": 3.5246410369873047,
"learning_rate": 9.515028085613223e-06,
"loss": 1.173,
"step": 3747
},
{
"epoch": 0.9217904574520414,
"grad_norm": 3.6701221466064453,
"learning_rate": 9.514678593244033e-06,
"loss": 1.1692,
"step": 3748
},
{
"epoch": 0.9220363994097392,
"grad_norm": 3.6094155311584473,
"learning_rate": 9.514328981413584e-06,
"loss": 1.2556,
"step": 3749
},
{
"epoch": 0.9222823413674373,
"grad_norm": 3.812060832977295,
"learning_rate": 9.51397925013112e-06,
"loss": 1.2741,
"step": 3750
},
{
"epoch": 0.9225282833251353,
"grad_norm": 3.8280935287475586,
"learning_rate": 9.513629399405898e-06,
"loss": 1.3836,
"step": 3751
},
{
"epoch": 0.9227742252828333,
"grad_norm": 3.726642608642578,
"learning_rate": 9.513279429247177e-06,
"loss": 1.2939,
"step": 3752
},
{
"epoch": 0.9230201672405313,
"grad_norm": 3.535461664199829,
"learning_rate": 9.512929339664216e-06,
"loss": 1.2257,
"step": 3753
},
{
"epoch": 0.9232661091982293,
"grad_norm": 3.352865695953369,
"learning_rate": 9.51257913066628e-06,
"loss": 1.2935,
"step": 3754
},
{
"epoch": 0.9235120511559272,
"grad_norm": 3.393155813217163,
"learning_rate": 9.512228802262633e-06,
"loss": 1.1222,
"step": 3755
},
{
"epoch": 0.9237579931136252,
"grad_norm": 3.3420541286468506,
"learning_rate": 9.511878354462546e-06,
"loss": 1.2685,
"step": 3756
},
{
"epoch": 0.9240039350713232,
"grad_norm": 3.2588160037994385,
"learning_rate": 9.511527787275292e-06,
"loss": 1.1871,
"step": 3757
},
{
"epoch": 0.9242498770290212,
"grad_norm": 3.4299204349517822,
"learning_rate": 9.511177100710147e-06,
"loss": 1.2562,
"step": 3758
},
{
"epoch": 0.9244958189867192,
"grad_norm": 3.958904981613159,
"learning_rate": 9.510826294776393e-06,
"loss": 1.259,
"step": 3759
},
{
"epoch": 0.9247417609444171,
"grad_norm": 3.7704057693481445,
"learning_rate": 9.51047536948331e-06,
"loss": 1.1429,
"step": 3760
},
{
"epoch": 0.9249877029021151,
"grad_norm": 3.3050081729888916,
"learning_rate": 9.510124324840183e-06,
"loss": 1.1339,
"step": 3761
},
{
"epoch": 0.9252336448598131,
"grad_norm": 3.813544988632202,
"learning_rate": 9.509773160856303e-06,
"loss": 1.1302,
"step": 3762
},
{
"epoch": 0.9254795868175111,
"grad_norm": 3.7022817134857178,
"learning_rate": 9.509421877540961e-06,
"loss": 1.2369,
"step": 3763
},
{
"epoch": 0.9257255287752091,
"grad_norm": 3.5074825286865234,
"learning_rate": 9.509070474903452e-06,
"loss": 1.1164,
"step": 3764
},
{
"epoch": 0.9259714707329071,
"grad_norm": 3.218583822250366,
"learning_rate": 9.508718952953076e-06,
"loss": 1.0825,
"step": 3765
},
{
"epoch": 0.926217412690605,
"grad_norm": 3.586137294769287,
"learning_rate": 9.50836731169913e-06,
"loss": 1.1394,
"step": 3766
},
{
"epoch": 0.926463354648303,
"grad_norm": 3.3264098167419434,
"learning_rate": 9.508015551150924e-06,
"loss": 1.022,
"step": 3767
},
{
"epoch": 0.926709296606001,
"grad_norm": 3.1757240295410156,
"learning_rate": 9.507663671317764e-06,
"loss": 1.1149,
"step": 3768
},
{
"epoch": 0.926955238563699,
"grad_norm": 3.2074801921844482,
"learning_rate": 9.507311672208958e-06,
"loss": 1.0994,
"step": 3769
},
{
"epoch": 0.927201180521397,
"grad_norm": 3.5946156978607178,
"learning_rate": 9.506959553833826e-06,
"loss": 1.1683,
"step": 3770
},
{
"epoch": 0.9274471224790949,
"grad_norm": 3.3039467334747314,
"learning_rate": 9.50660731620168e-06,
"loss": 1.0147,
"step": 3771
},
{
"epoch": 0.9276930644367929,
"grad_norm": 3.467288017272949,
"learning_rate": 9.506254959321843e-06,
"loss": 1.1196,
"step": 3772
},
{
"epoch": 0.9279390063944909,
"grad_norm": 3.887437343597412,
"learning_rate": 9.505902483203637e-06,
"loss": 1.2486,
"step": 3773
},
{
"epoch": 0.9281849483521889,
"grad_norm": 4.0942912101745605,
"learning_rate": 9.505549887856391e-06,
"loss": 1.3085,
"step": 3774
},
{
"epoch": 0.9284308903098869,
"grad_norm": 3.598442316055298,
"learning_rate": 9.505197173289433e-06,
"loss": 1.1157,
"step": 3775
},
{
"epoch": 0.9286768322675848,
"grad_norm": 3.5747597217559814,
"learning_rate": 9.504844339512096e-06,
"loss": 1.1983,
"step": 3776
},
{
"epoch": 0.9289227742252828,
"grad_norm": 3.8569931983947754,
"learning_rate": 9.504491386533718e-06,
"loss": 1.2756,
"step": 3777
},
{
"epoch": 0.9291687161829808,
"grad_norm": 3.578571319580078,
"learning_rate": 9.504138314363636e-06,
"loss": 1.1433,
"step": 3778
},
{
"epoch": 0.9294146581406788,
"grad_norm": 3.7454118728637695,
"learning_rate": 9.503785123011196e-06,
"loss": 1.2198,
"step": 3779
},
{
"epoch": 0.9296606000983768,
"grad_norm": 4.070187568664551,
"learning_rate": 9.503431812485739e-06,
"loss": 1.3384,
"step": 3780
},
{
"epoch": 0.9299065420560748,
"grad_norm": 3.557666540145874,
"learning_rate": 9.503078382796615e-06,
"loss": 1.1427,
"step": 3781
},
{
"epoch": 0.9301524840137727,
"grad_norm": 3.3730568885803223,
"learning_rate": 9.50272483395318e-06,
"loss": 1.1298,
"step": 3782
},
{
"epoch": 0.9303984259714707,
"grad_norm": 3.527787923812866,
"learning_rate": 9.502371165964786e-06,
"loss": 1.2882,
"step": 3783
},
{
"epoch": 0.9306443679291687,
"grad_norm": 3.583108425140381,
"learning_rate": 9.502017378840791e-06,
"loss": 1.2634,
"step": 3784
},
{
"epoch": 0.9308903098868667,
"grad_norm": 3.760725259780884,
"learning_rate": 9.501663472590557e-06,
"loss": 1.3809,
"step": 3785
},
{
"epoch": 0.9311362518445647,
"grad_norm": 3.5223946571350098,
"learning_rate": 9.501309447223448e-06,
"loss": 1.1304,
"step": 3786
},
{
"epoch": 0.9313821938022626,
"grad_norm": 3.3768486976623535,
"learning_rate": 9.500955302748833e-06,
"loss": 1.2359,
"step": 3787
},
{
"epoch": 0.9316281357599606,
"grad_norm": 3.7514970302581787,
"learning_rate": 9.500601039176083e-06,
"loss": 1.2489,
"step": 3788
},
{
"epoch": 0.9318740777176586,
"grad_norm": 3.5692567825317383,
"learning_rate": 9.50024665651457e-06,
"loss": 1.2221,
"step": 3789
},
{
"epoch": 0.9321200196753566,
"grad_norm": 3.579981565475464,
"learning_rate": 9.499892154773674e-06,
"loss": 1.2258,
"step": 3790
},
{
"epoch": 0.9323659616330546,
"grad_norm": 3.5614571571350098,
"learning_rate": 9.499537533962772e-06,
"loss": 1.141,
"step": 3791
},
{
"epoch": 0.9326119035907526,
"grad_norm": 3.464431047439575,
"learning_rate": 9.49918279409125e-06,
"loss": 1.1896,
"step": 3792
},
{
"epoch": 0.9328578455484505,
"grad_norm": 3.77783203125,
"learning_rate": 9.498827935168492e-06,
"loss": 1.2965,
"step": 3793
},
{
"epoch": 0.9331037875061485,
"grad_norm": 3.5242629051208496,
"learning_rate": 9.498472957203893e-06,
"loss": 1.1915,
"step": 3794
},
{
"epoch": 0.9333497294638465,
"grad_norm": 3.428255796432495,
"learning_rate": 9.498117860206842e-06,
"loss": 1.1313,
"step": 3795
},
{
"epoch": 0.9335956714215445,
"grad_norm": 3.501169443130493,
"learning_rate": 9.497762644186734e-06,
"loss": 1.2004,
"step": 3796
},
{
"epoch": 0.9338416133792425,
"grad_norm": 3.675184488296509,
"learning_rate": 9.49740730915297e-06,
"loss": 1.1762,
"step": 3797
},
{
"epoch": 0.9340875553369404,
"grad_norm": 3.3882060050964355,
"learning_rate": 9.497051855114955e-06,
"loss": 1.1106,
"step": 3798
},
{
"epoch": 0.9343334972946384,
"grad_norm": 3.4326963424682617,
"learning_rate": 9.49669628208209e-06,
"loss": 1.1962,
"step": 3799
},
{
"epoch": 0.9345794392523364,
"grad_norm": 3.735386371612549,
"learning_rate": 9.496340590063787e-06,
"loss": 1.08,
"step": 3800
},
{
"epoch": 0.9348253812100344,
"grad_norm": 3.70676851272583,
"learning_rate": 9.495984779069454e-06,
"loss": 1.183,
"step": 3801
},
{
"epoch": 0.9350713231677324,
"grad_norm": 3.5592615604400635,
"learning_rate": 9.495628849108511e-06,
"loss": 1.297,
"step": 3802
},
{
"epoch": 0.9353172651254305,
"grad_norm": 3.4800803661346436,
"learning_rate": 9.495272800190372e-06,
"loss": 1.104,
"step": 3803
},
{
"epoch": 0.9355632070831283,
"grad_norm": 3.7244813442230225,
"learning_rate": 9.49491663232446e-06,
"loss": 1.2102,
"step": 3804
},
{
"epoch": 0.9358091490408263,
"grad_norm": 3.5701704025268555,
"learning_rate": 9.4945603455202e-06,
"loss": 1.1653,
"step": 3805
},
{
"epoch": 0.9360550909985244,
"grad_norm": 3.5423543453216553,
"learning_rate": 9.494203939787019e-06,
"loss": 1.1375,
"step": 3806
},
{
"epoch": 0.9363010329562224,
"grad_norm": 3.54490327835083,
"learning_rate": 9.493847415134345e-06,
"loss": 1.1388,
"step": 3807
},
{
"epoch": 0.9365469749139204,
"grad_norm": 3.1926064491271973,
"learning_rate": 9.493490771571618e-06,
"loss": 1.1133,
"step": 3808
},
{
"epoch": 0.9367929168716183,
"grad_norm": 3.7149546146392822,
"learning_rate": 9.49313400910827e-06,
"loss": 1.2947,
"step": 3809
},
{
"epoch": 0.9370388588293163,
"grad_norm": 3.5872092247009277,
"learning_rate": 9.492777127753741e-06,
"loss": 1.2427,
"step": 3810
},
{
"epoch": 0.9372848007870143,
"grad_norm": 3.8970701694488525,
"learning_rate": 9.492420127517478e-06,
"loss": 1.276,
"step": 3811
},
{
"epoch": 0.9375307427447123,
"grad_norm": 3.3050131797790527,
"learning_rate": 9.492063008408925e-06,
"loss": 1.0715,
"step": 3812
},
{
"epoch": 0.9377766847024103,
"grad_norm": 3.3639533519744873,
"learning_rate": 9.491705770437531e-06,
"loss": 1.0333,
"step": 3813
},
{
"epoch": 0.9380226266601082,
"grad_norm": 4.0323872566223145,
"learning_rate": 9.49134841361275e-06,
"loss": 1.2571,
"step": 3814
},
{
"epoch": 0.9382685686178062,
"grad_norm": 3.3697216510772705,
"learning_rate": 9.490990937944038e-06,
"loss": 1.2255,
"step": 3815
},
{
"epoch": 0.9385145105755042,
"grad_norm": 3.932177782058716,
"learning_rate": 9.490633343440856e-06,
"loss": 1.1994,
"step": 3816
},
{
"epoch": 0.9387604525332022,
"grad_norm": 3.8653125762939453,
"learning_rate": 9.49027563011266e-06,
"loss": 1.2976,
"step": 3817
},
{
"epoch": 0.9390063944909002,
"grad_norm": 3.731886148452759,
"learning_rate": 9.489917797968921e-06,
"loss": 1.3039,
"step": 3818
},
{
"epoch": 0.9392523364485982,
"grad_norm": 3.3765809535980225,
"learning_rate": 9.489559847019106e-06,
"loss": 1.2403,
"step": 3819
},
{
"epoch": 0.9394982784062961,
"grad_norm": 3.495044708251953,
"learning_rate": 9.489201777272686e-06,
"loss": 1.2942,
"step": 3820
},
{
"epoch": 0.9397442203639941,
"grad_norm": 3.4638824462890625,
"learning_rate": 9.488843588739134e-06,
"loss": 1.2099,
"step": 3821
},
{
"epoch": 0.9399901623216921,
"grad_norm": 3.771656036376953,
"learning_rate": 9.488485281427933e-06,
"loss": 1.2952,
"step": 3822
},
{
"epoch": 0.9402361042793901,
"grad_norm": 3.309021472930908,
"learning_rate": 9.488126855348559e-06,
"loss": 1.2329,
"step": 3823
},
{
"epoch": 0.9404820462370881,
"grad_norm": 5.103731632232666,
"learning_rate": 9.4877683105105e-06,
"loss": 1.2134,
"step": 3824
},
{
"epoch": 0.940727988194786,
"grad_norm": 3.6319146156311035,
"learning_rate": 9.487409646923238e-06,
"loss": 1.3456,
"step": 3825
},
{
"epoch": 0.940973930152484,
"grad_norm": 3.6349234580993652,
"learning_rate": 9.48705086459627e-06,
"loss": 1.2331,
"step": 3826
},
{
"epoch": 0.941219872110182,
"grad_norm": 3.7402396202087402,
"learning_rate": 9.486691963539084e-06,
"loss": 1.2111,
"step": 3827
},
{
"epoch": 0.94146581406788,
"grad_norm": 3.7449779510498047,
"learning_rate": 9.486332943761181e-06,
"loss": 1.2095,
"step": 3828
},
{
"epoch": 0.941711756025578,
"grad_norm": 3.6909730434417725,
"learning_rate": 9.485973805272058e-06,
"loss": 1.2246,
"step": 3829
},
{
"epoch": 0.941957697983276,
"grad_norm": 3.65000319480896,
"learning_rate": 9.48561454808122e-06,
"loss": 1.3278,
"step": 3830
},
{
"epoch": 0.9422036399409739,
"grad_norm": 3.9282188415527344,
"learning_rate": 9.485255172198172e-06,
"loss": 1.3876,
"step": 3831
},
{
"epoch": 0.9424495818986719,
"grad_norm": 3.504951238632202,
"learning_rate": 9.484895677632424e-06,
"loss": 1.0905,
"step": 3832
},
{
"epoch": 0.9426955238563699,
"grad_norm": 3.4208784103393555,
"learning_rate": 9.484536064393487e-06,
"loss": 1.0367,
"step": 3833
},
{
"epoch": 0.9429414658140679,
"grad_norm": 3.340430974960327,
"learning_rate": 9.484176332490878e-06,
"loss": 1.0171,
"step": 3834
},
{
"epoch": 0.9431874077717659,
"grad_norm": 3.95871639251709,
"learning_rate": 9.483816481934116e-06,
"loss": 1.2464,
"step": 3835
},
{
"epoch": 0.9434333497294638,
"grad_norm": 3.461184501647949,
"learning_rate": 9.483456512732722e-06,
"loss": 1.2326,
"step": 3836
},
{
"epoch": 0.9436792916871618,
"grad_norm": 3.397439956665039,
"learning_rate": 9.483096424896224e-06,
"loss": 1.0618,
"step": 3837
},
{
"epoch": 0.9439252336448598,
"grad_norm": 3.773892879486084,
"learning_rate": 9.482736218434144e-06,
"loss": 1.2633,
"step": 3838
},
{
"epoch": 0.9441711756025578,
"grad_norm": 4.458995819091797,
"learning_rate": 9.482375893356018e-06,
"loss": 1.2889,
"step": 3839
},
{
"epoch": 0.9444171175602558,
"grad_norm": 3.7137601375579834,
"learning_rate": 9.482015449671378e-06,
"loss": 1.3037,
"step": 3840
},
{
"epoch": 0.9446630595179538,
"grad_norm": 3.3900792598724365,
"learning_rate": 9.481654887389764e-06,
"loss": 1.3094,
"step": 3841
},
{
"epoch": 0.9449090014756517,
"grad_norm": 3.420044422149658,
"learning_rate": 9.481294206520714e-06,
"loss": 1.1674,
"step": 3842
},
{
"epoch": 0.9451549434333497,
"grad_norm": 3.86102032661438,
"learning_rate": 9.480933407073776e-06,
"loss": 1.2335,
"step": 3843
},
{
"epoch": 0.9454008853910477,
"grad_norm": 3.5613207817077637,
"learning_rate": 9.480572489058493e-06,
"loss": 1.1224,
"step": 3844
},
{
"epoch": 0.9456468273487457,
"grad_norm": 3.4949965476989746,
"learning_rate": 9.480211452484416e-06,
"loss": 1.1378,
"step": 3845
},
{
"epoch": 0.9458927693064437,
"grad_norm": 3.5595834255218506,
"learning_rate": 9.4798502973611e-06,
"loss": 1.282,
"step": 3846
},
{
"epoch": 0.9461387112641416,
"grad_norm": 3.5682520866394043,
"learning_rate": 9.4794890236981e-06,
"loss": 1.2812,
"step": 3847
},
{
"epoch": 0.9463846532218396,
"grad_norm": 3.9022674560546875,
"learning_rate": 9.479127631504973e-06,
"loss": 1.5532,
"step": 3848
},
{
"epoch": 0.9466305951795376,
"grad_norm": 3.2392373085021973,
"learning_rate": 9.478766120791284e-06,
"loss": 1.1253,
"step": 3849
},
{
"epoch": 0.9468765371372356,
"grad_norm": 3.7060301303863525,
"learning_rate": 9.478404491566602e-06,
"loss": 1.1774,
"step": 3850
},
{
"epoch": 0.9471224790949336,
"grad_norm": 3.4534833431243896,
"learning_rate": 9.478042743840492e-06,
"loss": 1.2007,
"step": 3851
},
{
"epoch": 0.9473684210526315,
"grad_norm": 3.597015857696533,
"learning_rate": 9.477680877622528e-06,
"loss": 1.2892,
"step": 3852
},
{
"epoch": 0.9476143630103295,
"grad_norm": 3.3262054920196533,
"learning_rate": 9.477318892922283e-06,
"loss": 1.2013,
"step": 3853
},
{
"epoch": 0.9478603049680275,
"grad_norm": 3.249096632003784,
"learning_rate": 9.476956789749336e-06,
"loss": 1.1139,
"step": 3854
},
{
"epoch": 0.9481062469257255,
"grad_norm": 3.445883274078369,
"learning_rate": 9.47659456811327e-06,
"loss": 1.2276,
"step": 3855
},
{
"epoch": 0.9483521888834235,
"grad_norm": 3.5545454025268555,
"learning_rate": 9.47623222802367e-06,
"loss": 1.2976,
"step": 3856
},
{
"epoch": 0.9485981308411215,
"grad_norm": 3.7074806690216064,
"learning_rate": 9.475869769490122e-06,
"loss": 1.3535,
"step": 3857
},
{
"epoch": 0.9488440727988194,
"grad_norm": 3.244786024093628,
"learning_rate": 9.475507192522215e-06,
"loss": 1.1357,
"step": 3858
},
{
"epoch": 0.9490900147565174,
"grad_norm": 3.5750832557678223,
"learning_rate": 9.475144497129548e-06,
"loss": 1.1874,
"step": 3859
},
{
"epoch": 0.9493359567142154,
"grad_norm": 3.4356906414031982,
"learning_rate": 9.474781683321715e-06,
"loss": 1.0262,
"step": 3860
},
{
"epoch": 0.9495818986719134,
"grad_norm": 3.456873655319214,
"learning_rate": 9.474418751108317e-06,
"loss": 1.0244,
"step": 3861
},
{
"epoch": 0.9498278406296115,
"grad_norm": 3.6558327674865723,
"learning_rate": 9.474055700498956e-06,
"loss": 1.181,
"step": 3862
},
{
"epoch": 0.9500737825873093,
"grad_norm": 3.432169198989868,
"learning_rate": 9.47369253150324e-06,
"loss": 1.1606,
"step": 3863
},
{
"epoch": 0.9503197245450073,
"grad_norm": 3.3367044925689697,
"learning_rate": 9.47332924413078e-06,
"loss": 1.1692,
"step": 3864
},
{
"epoch": 0.9505656665027054,
"grad_norm": 3.685706615447998,
"learning_rate": 9.472965838391187e-06,
"loss": 1.2211,
"step": 3865
},
{
"epoch": 0.9508116084604034,
"grad_norm": 3.667942762374878,
"learning_rate": 9.472602314294074e-06,
"loss": 1.1517,
"step": 3866
},
{
"epoch": 0.9510575504181014,
"grad_norm": 3.767714023590088,
"learning_rate": 9.472238671849067e-06,
"loss": 1.2616,
"step": 3867
},
{
"epoch": 0.9513034923757994,
"grad_norm": 3.7005503177642822,
"learning_rate": 9.471874911065782e-06,
"loss": 1.187,
"step": 3868
},
{
"epoch": 0.9515494343334973,
"grad_norm": 3.6397907733917236,
"learning_rate": 9.471511031953848e-06,
"loss": 1.3042,
"step": 3869
},
{
"epoch": 0.9517953762911953,
"grad_norm": 3.3443901538848877,
"learning_rate": 9.471147034522895e-06,
"loss": 1.1469,
"step": 3870
},
{
"epoch": 0.9520413182488933,
"grad_norm": 3.6902546882629395,
"learning_rate": 9.470782918782547e-06,
"loss": 1.2051,
"step": 3871
},
{
"epoch": 0.9522872602065913,
"grad_norm": 3.2730002403259277,
"learning_rate": 9.470418684742447e-06,
"loss": 1.0646,
"step": 3872
},
{
"epoch": 0.9525332021642893,
"grad_norm": 3.884192705154419,
"learning_rate": 9.470054332412229e-06,
"loss": 1.2368,
"step": 3873
},
{
"epoch": 0.9527791441219872,
"grad_norm": 3.509549617767334,
"learning_rate": 9.469689861801533e-06,
"loss": 1.1073,
"step": 3874
},
{
"epoch": 0.9530250860796852,
"grad_norm": 3.5661025047302246,
"learning_rate": 9.469325272920005e-06,
"loss": 1.1983,
"step": 3875
},
{
"epoch": 0.9532710280373832,
"grad_norm": 3.2048611640930176,
"learning_rate": 9.468960565777294e-06,
"loss": 1.0262,
"step": 3876
},
{
"epoch": 0.9535169699950812,
"grad_norm": 3.376889228820801,
"learning_rate": 9.468595740383047e-06,
"loss": 1.1877,
"step": 3877
},
{
"epoch": 0.9537629119527792,
"grad_norm": 3.7012181282043457,
"learning_rate": 9.468230796746918e-06,
"loss": 1.2635,
"step": 3878
},
{
"epoch": 0.9540088539104771,
"grad_norm": 3.6242101192474365,
"learning_rate": 9.467865734878565e-06,
"loss": 1.3012,
"step": 3879
},
{
"epoch": 0.9542547958681751,
"grad_norm": 3.5750832557678223,
"learning_rate": 9.467500554787648e-06,
"loss": 1.3696,
"step": 3880
},
{
"epoch": 0.9545007378258731,
"grad_norm": 3.355804443359375,
"learning_rate": 9.467135256483828e-06,
"loss": 1.1906,
"step": 3881
},
{
"epoch": 0.9547466797835711,
"grad_norm": 3.4438674449920654,
"learning_rate": 9.466769839976772e-06,
"loss": 1.1748,
"step": 3882
},
{
"epoch": 0.9549926217412691,
"grad_norm": 3.7418315410614014,
"learning_rate": 9.466404305276152e-06,
"loss": 1.2661,
"step": 3883
},
{
"epoch": 0.9552385636989671,
"grad_norm": 3.5406558513641357,
"learning_rate": 9.466038652391633e-06,
"loss": 1.2004,
"step": 3884
},
{
"epoch": 0.955484505656665,
"grad_norm": 3.5044021606445312,
"learning_rate": 9.4656728813329e-06,
"loss": 1.2274,
"step": 3885
},
{
"epoch": 0.955730447614363,
"grad_norm": 3.861959934234619,
"learning_rate": 9.465306992109622e-06,
"loss": 1.3296,
"step": 3886
},
{
"epoch": 0.955976389572061,
"grad_norm": 3.6768319606781006,
"learning_rate": 9.464940984731488e-06,
"loss": 1.2451,
"step": 3887
},
{
"epoch": 0.956222331529759,
"grad_norm": 3.8698320388793945,
"learning_rate": 9.464574859208178e-06,
"loss": 1.3843,
"step": 3888
},
{
"epoch": 0.956468273487457,
"grad_norm": 3.431727170944214,
"learning_rate": 9.464208615549383e-06,
"loss": 1.2413,
"step": 3889
},
{
"epoch": 0.9567142154451549,
"grad_norm": 3.617901563644409,
"learning_rate": 9.463842253764792e-06,
"loss": 1.2311,
"step": 3890
},
{
"epoch": 0.9569601574028529,
"grad_norm": 3.4012129306793213,
"learning_rate": 9.463475773864102e-06,
"loss": 1.2181,
"step": 3891
},
{
"epoch": 0.9572060993605509,
"grad_norm": 3.400784730911255,
"learning_rate": 9.463109175857007e-06,
"loss": 1.1172,
"step": 3892
},
{
"epoch": 0.9574520413182489,
"grad_norm": 3.292860507965088,
"learning_rate": 9.462742459753209e-06,
"loss": 1.1778,
"step": 3893
},
{
"epoch": 0.9576979832759469,
"grad_norm": 3.3770363330841064,
"learning_rate": 9.462375625562411e-06,
"loss": 1.0802,
"step": 3894
},
{
"epoch": 0.9579439252336449,
"grad_norm": 3.3880670070648193,
"learning_rate": 9.46200867329432e-06,
"loss": 1.1235,
"step": 3895
},
{
"epoch": 0.9581898671913428,
"grad_norm": 3.5733585357666016,
"learning_rate": 9.461641602958647e-06,
"loss": 1.352,
"step": 3896
},
{
"epoch": 0.9584358091490408,
"grad_norm": 3.1096608638763428,
"learning_rate": 9.461274414565103e-06,
"loss": 1.1025,
"step": 3897
},
{
"epoch": 0.9586817511067388,
"grad_norm": 3.8797056674957275,
"learning_rate": 9.460907108123404e-06,
"loss": 1.3212,
"step": 3898
},
{
"epoch": 0.9589276930644368,
"grad_norm": 3.5723931789398193,
"learning_rate": 9.46053968364327e-06,
"loss": 1.2497,
"step": 3899
},
{
"epoch": 0.9591736350221348,
"grad_norm": 3.764925956726074,
"learning_rate": 9.460172141134423e-06,
"loss": 1.2879,
"step": 3900
},
{
"epoch": 0.9594195769798327,
"grad_norm": 3.2651586532592773,
"learning_rate": 9.459804480606589e-06,
"loss": 1.0856,
"step": 3901
},
{
"epoch": 0.9596655189375307,
"grad_norm": 3.4858059883117676,
"learning_rate": 9.459436702069497e-06,
"loss": 1.0766,
"step": 3902
},
{
"epoch": 0.9599114608952287,
"grad_norm": 3.4706971645355225,
"learning_rate": 9.459068805532878e-06,
"loss": 1.2289,
"step": 3903
},
{
"epoch": 0.9601574028529267,
"grad_norm": 3.8492400646209717,
"learning_rate": 9.458700791006464e-06,
"loss": 1.2526,
"step": 3904
},
{
"epoch": 0.9604033448106247,
"grad_norm": 3.8789875507354736,
"learning_rate": 9.458332658499998e-06,
"loss": 1.387,
"step": 3905
},
{
"epoch": 0.9606492867683227,
"grad_norm": 3.420001268386841,
"learning_rate": 9.457964408023218e-06,
"loss": 1.2292,
"step": 3906
},
{
"epoch": 0.9608952287260206,
"grad_norm": 3.5160412788391113,
"learning_rate": 9.457596039585868e-06,
"loss": 1.0721,
"step": 3907
},
{
"epoch": 0.9611411706837186,
"grad_norm": 3.3310837745666504,
"learning_rate": 9.457227553197694e-06,
"loss": 1.1096,
"step": 3908
},
{
"epoch": 0.9613871126414166,
"grad_norm": 3.403733253479004,
"learning_rate": 9.456858948868452e-06,
"loss": 1.21,
"step": 3909
},
{
"epoch": 0.9616330545991146,
"grad_norm": 3.5101540088653564,
"learning_rate": 9.45649022660789e-06,
"loss": 1.2312,
"step": 3910
},
{
"epoch": 0.9618789965568126,
"grad_norm": 3.8657493591308594,
"learning_rate": 9.456121386425765e-06,
"loss": 1.385,
"step": 3911
},
{
"epoch": 0.9621249385145105,
"grad_norm": 3.7910027503967285,
"learning_rate": 9.45575242833184e-06,
"loss": 1.2417,
"step": 3912
},
{
"epoch": 0.9623708804722085,
"grad_norm": 3.8486952781677246,
"learning_rate": 9.455383352335875e-06,
"loss": 1.2379,
"step": 3913
},
{
"epoch": 0.9626168224299065,
"grad_norm": 3.4816291332244873,
"learning_rate": 9.455014158447637e-06,
"loss": 1.1972,
"step": 3914
},
{
"epoch": 0.9628627643876045,
"grad_norm": 4.0010223388671875,
"learning_rate": 9.454644846676897e-06,
"loss": 1.4225,
"step": 3915
},
{
"epoch": 0.9631087063453025,
"grad_norm": 3.7106900215148926,
"learning_rate": 9.454275417033423e-06,
"loss": 1.3585,
"step": 3916
},
{
"epoch": 0.9633546483030004,
"grad_norm": 3.5111634731292725,
"learning_rate": 9.453905869526993e-06,
"loss": 1.1629,
"step": 3917
},
{
"epoch": 0.9636005902606984,
"grad_norm": 3.5515894889831543,
"learning_rate": 9.453536204167385e-06,
"loss": 1.12,
"step": 3918
},
{
"epoch": 0.9638465322183964,
"grad_norm": 3.258821725845337,
"learning_rate": 9.453166420964382e-06,
"loss": 1.2491,
"step": 3919
},
{
"epoch": 0.9640924741760944,
"grad_norm": 3.5903799533843994,
"learning_rate": 9.452796519927767e-06,
"loss": 1.2182,
"step": 3920
},
{
"epoch": 0.9643384161337925,
"grad_norm": 3.8884048461914062,
"learning_rate": 9.452426501067326e-06,
"loss": 1.2363,
"step": 3921
},
{
"epoch": 0.9645843580914905,
"grad_norm": 3.356123447418213,
"learning_rate": 9.452056364392856e-06,
"loss": 1.3502,
"step": 3922
},
{
"epoch": 0.9648303000491883,
"grad_norm": 3.50557541847229,
"learning_rate": 9.451686109914144e-06,
"loss": 1.2512,
"step": 3923
},
{
"epoch": 0.9650762420068864,
"grad_norm": 3.561814069747925,
"learning_rate": 9.451315737640992e-06,
"loss": 1.2079,
"step": 3924
},
{
"epoch": 0.9653221839645844,
"grad_norm": 3.7782504558563232,
"learning_rate": 9.450945247583198e-06,
"loss": 1.3143,
"step": 3925
},
{
"epoch": 0.9655681259222824,
"grad_norm": 3.797196865081787,
"learning_rate": 9.450574639750566e-06,
"loss": 1.2199,
"step": 3926
},
{
"epoch": 0.9658140678799804,
"grad_norm": 3.7892367839813232,
"learning_rate": 9.450203914152901e-06,
"loss": 1.1944,
"step": 3927
},
{
"epoch": 0.9660600098376783,
"grad_norm": 3.6078383922576904,
"learning_rate": 9.449833070800017e-06,
"loss": 1.3291,
"step": 3928
},
{
"epoch": 0.9663059517953763,
"grad_norm": 3.565176486968994,
"learning_rate": 9.449462109701723e-06,
"loss": 1.2504,
"step": 3929
},
{
"epoch": 0.9665518937530743,
"grad_norm": 3.721128225326538,
"learning_rate": 9.449091030867837e-06,
"loss": 1.0978,
"step": 3930
},
{
"epoch": 0.9667978357107723,
"grad_norm": 3.257206678390503,
"learning_rate": 9.448719834308175e-06,
"loss": 1.0915,
"step": 3931
},
{
"epoch": 0.9670437776684703,
"grad_norm": 3.873300790786743,
"learning_rate": 9.448348520032561e-06,
"loss": 1.333,
"step": 3932
},
{
"epoch": 0.9672897196261683,
"grad_norm": 3.9430665969848633,
"learning_rate": 9.44797708805082e-06,
"loss": 1.2885,
"step": 3933
},
{
"epoch": 0.9675356615838662,
"grad_norm": 4.052707195281982,
"learning_rate": 9.447605538372782e-06,
"loss": 1.1452,
"step": 3934
},
{
"epoch": 0.9677816035415642,
"grad_norm": 3.6545021533966064,
"learning_rate": 9.447233871008274e-06,
"loss": 1.1646,
"step": 3935
},
{
"epoch": 0.9680275454992622,
"grad_norm": 3.466557264328003,
"learning_rate": 9.446862085967136e-06,
"loss": 1.1715,
"step": 3936
},
{
"epoch": 0.9682734874569602,
"grad_norm": 3.435060501098633,
"learning_rate": 9.446490183259203e-06,
"loss": 1.2322,
"step": 3937
},
{
"epoch": 0.9685194294146582,
"grad_norm": 3.3253560066223145,
"learning_rate": 9.446118162894316e-06,
"loss": 1.2344,
"step": 3938
},
{
"epoch": 0.9687653713723561,
"grad_norm": 3.7697930335998535,
"learning_rate": 9.445746024882317e-06,
"loss": 1.3497,
"step": 3939
},
{
"epoch": 0.9690113133300541,
"grad_norm": 3.271660804748535,
"learning_rate": 9.445373769233055e-06,
"loss": 1.1219,
"step": 3940
},
{
"epoch": 0.9692572552877521,
"grad_norm": 3.298402786254883,
"learning_rate": 9.445001395956381e-06,
"loss": 1.2552,
"step": 3941
},
{
"epoch": 0.9695031972454501,
"grad_norm": 3.2619965076446533,
"learning_rate": 9.444628905062147e-06,
"loss": 1.0157,
"step": 3942
},
{
"epoch": 0.9697491392031481,
"grad_norm": 3.536372661590576,
"learning_rate": 9.44425629656021e-06,
"loss": 1.3314,
"step": 3943
},
{
"epoch": 0.9699950811608461,
"grad_norm": 3.045236587524414,
"learning_rate": 9.443883570460427e-06,
"loss": 1.0602,
"step": 3944
},
{
"epoch": 0.970241023118544,
"grad_norm": 3.6073899269104004,
"learning_rate": 9.443510726772664e-06,
"loss": 1.1723,
"step": 3945
},
{
"epoch": 0.970486965076242,
"grad_norm": 3.655804395675659,
"learning_rate": 9.443137765506784e-06,
"loss": 1.3346,
"step": 3946
},
{
"epoch": 0.97073290703394,
"grad_norm": 3.4012553691864014,
"learning_rate": 9.442764686672658e-06,
"loss": 1.234,
"step": 3947
},
{
"epoch": 0.970978848991638,
"grad_norm": 3.552107334136963,
"learning_rate": 9.442391490280158e-06,
"loss": 1.3179,
"step": 3948
},
{
"epoch": 0.971224790949336,
"grad_norm": 3.3224945068359375,
"learning_rate": 9.442018176339156e-06,
"loss": 1.2053,
"step": 3949
},
{
"epoch": 0.9714707329070339,
"grad_norm": 3.3016951084136963,
"learning_rate": 9.441644744859534e-06,
"loss": 1.0527,
"step": 3950
},
{
"epoch": 0.9717166748647319,
"grad_norm": 3.1873135566711426,
"learning_rate": 9.441271195851168e-06,
"loss": 1.1328,
"step": 3951
},
{
"epoch": 0.9719626168224299,
"grad_norm": 3.5724406242370605,
"learning_rate": 9.440897529323948e-06,
"loss": 1.148,
"step": 3952
},
{
"epoch": 0.9722085587801279,
"grad_norm": 3.2752809524536133,
"learning_rate": 9.440523745287758e-06,
"loss": 1.0886,
"step": 3953
},
{
"epoch": 0.9724545007378259,
"grad_norm": 3.471850633621216,
"learning_rate": 9.44014984375249e-06,
"loss": 1.3555,
"step": 3954
},
{
"epoch": 0.9727004426955238,
"grad_norm": 3.8765337467193604,
"learning_rate": 9.439775824728038e-06,
"loss": 1.2075,
"step": 3955
},
{
"epoch": 0.9729463846532218,
"grad_norm": 5.273101329803467,
"learning_rate": 9.439401688224296e-06,
"loss": 1.1248,
"step": 3956
},
{
"epoch": 0.9731923266109198,
"grad_norm": 3.281752824783325,
"learning_rate": 9.439027434251169e-06,
"loss": 1.2599,
"step": 3957
},
{
"epoch": 0.9734382685686178,
"grad_norm": 3.9435343742370605,
"learning_rate": 9.438653062818555e-06,
"loss": 1.2737,
"step": 3958
},
{
"epoch": 0.9736842105263158,
"grad_norm": 3.492047071456909,
"learning_rate": 9.438278573936363e-06,
"loss": 1.201,
"step": 3959
},
{
"epoch": 0.9739301524840138,
"grad_norm": 3.5101473331451416,
"learning_rate": 9.4379039676145e-06,
"loss": 1.2467,
"step": 3960
},
{
"epoch": 0.9741760944417117,
"grad_norm": 3.2483408451080322,
"learning_rate": 9.437529243862881e-06,
"loss": 1.0941,
"step": 3961
},
{
"epoch": 0.9744220363994097,
"grad_norm": 3.638209581375122,
"learning_rate": 9.437154402691419e-06,
"loss": 1.1489,
"step": 3962
},
{
"epoch": 0.9746679783571077,
"grad_norm": 3.5533618927001953,
"learning_rate": 9.436779444110033e-06,
"loss": 1.182,
"step": 3963
},
{
"epoch": 0.9749139203148057,
"grad_norm": 3.5716166496276855,
"learning_rate": 9.436404368128646e-06,
"loss": 1.2409,
"step": 3964
},
{
"epoch": 0.9751598622725037,
"grad_norm": 3.494746685028076,
"learning_rate": 9.436029174757183e-06,
"loss": 1.0087,
"step": 3965
},
{
"epoch": 0.9754058042302016,
"grad_norm": 3.5767982006073,
"learning_rate": 9.43565386400557e-06,
"loss": 1.469,
"step": 3966
},
{
"epoch": 0.9756517461878996,
"grad_norm": 3.5581271648406982,
"learning_rate": 9.43527843588374e-06,
"loss": 1.1712,
"step": 3967
},
{
"epoch": 0.9758976881455976,
"grad_norm": 3.6011576652526855,
"learning_rate": 9.434902890401623e-06,
"loss": 1.0994,
"step": 3968
},
{
"epoch": 0.9761436301032956,
"grad_norm": 3.7829370498657227,
"learning_rate": 9.434527227569162e-06,
"loss": 1.2132,
"step": 3969
},
{
"epoch": 0.9763895720609936,
"grad_norm": 3.5494561195373535,
"learning_rate": 9.434151447396292e-06,
"loss": 1.2755,
"step": 3970
},
{
"epoch": 0.9766355140186916,
"grad_norm": 3.5181777477264404,
"learning_rate": 9.433775549892959e-06,
"loss": 1.1178,
"step": 3971
},
{
"epoch": 0.9768814559763895,
"grad_norm": 3.73901104927063,
"learning_rate": 9.43339953506911e-06,
"loss": 1.2093,
"step": 3972
},
{
"epoch": 0.9771273979340875,
"grad_norm": 3.502727746963501,
"learning_rate": 9.433023402934693e-06,
"loss": 1.2582,
"step": 3973
},
{
"epoch": 0.9773733398917855,
"grad_norm": 3.5313947200775146,
"learning_rate": 9.432647153499662e-06,
"loss": 1.1749,
"step": 3974
},
{
"epoch": 0.9776192818494835,
"grad_norm": 3.5662155151367188,
"learning_rate": 9.432270786773972e-06,
"loss": 1.3048,
"step": 3975
},
{
"epoch": 0.9778652238071815,
"grad_norm": 2.930616617202759,
"learning_rate": 9.43189430276758e-06,
"loss": 1.0708,
"step": 3976
},
{
"epoch": 0.9781111657648794,
"grad_norm": 3.6719422340393066,
"learning_rate": 9.431517701490453e-06,
"loss": 1.2976,
"step": 3977
},
{
"epoch": 0.9783571077225774,
"grad_norm": 3.6541898250579834,
"learning_rate": 9.431140982952553e-06,
"loss": 1.2427,
"step": 3978
},
{
"epoch": 0.9786030496802754,
"grad_norm": 3.5526115894317627,
"learning_rate": 9.43076414716385e-06,
"loss": 1.1524,
"step": 3979
},
{
"epoch": 0.9788489916379735,
"grad_norm": 3.591541290283203,
"learning_rate": 9.43038719413431e-06,
"loss": 1.2948,
"step": 3980
},
{
"epoch": 0.9790949335956715,
"grad_norm": 3.2846362590789795,
"learning_rate": 9.430010123873913e-06,
"loss": 1.2284,
"step": 3981
},
{
"epoch": 0.9793408755533693,
"grad_norm": 3.9091074466705322,
"learning_rate": 9.429632936392634e-06,
"loss": 1.1577,
"step": 3982
},
{
"epoch": 0.9795868175110674,
"grad_norm": 3.363180637359619,
"learning_rate": 9.429255631700457e-06,
"loss": 1.2196,
"step": 3983
},
{
"epoch": 0.9798327594687654,
"grad_norm": 3.2019002437591553,
"learning_rate": 9.42887820980736e-06,
"loss": 1.2318,
"step": 3984
},
{
"epoch": 0.9800787014264634,
"grad_norm": 3.4849393367767334,
"learning_rate": 9.428500670723334e-06,
"loss": 1.1841,
"step": 3985
},
{
"epoch": 0.9803246433841614,
"grad_norm": 3.7001113891601562,
"learning_rate": 9.428123014458368e-06,
"loss": 1.1997,
"step": 3986
},
{
"epoch": 0.9805705853418594,
"grad_norm": 3.8835556507110596,
"learning_rate": 9.427745241022455e-06,
"loss": 1.2643,
"step": 3987
},
{
"epoch": 0.9808165272995573,
"grad_norm": 3.368734359741211,
"learning_rate": 9.42736735042559e-06,
"loss": 1.2354,
"step": 3988
},
{
"epoch": 0.9810624692572553,
"grad_norm": 3.474813938140869,
"learning_rate": 9.426989342677773e-06,
"loss": 1.1855,
"step": 3989
},
{
"epoch": 0.9813084112149533,
"grad_norm": 3.2981035709381104,
"learning_rate": 9.426611217789009e-06,
"loss": 1.0957,
"step": 3990
},
{
"epoch": 0.9815543531726513,
"grad_norm": 3.8233461380004883,
"learning_rate": 9.426232975769298e-06,
"loss": 1.2497,
"step": 3991
},
{
"epoch": 0.9818002951303493,
"grad_norm": 3.3758599758148193,
"learning_rate": 9.425854616628652e-06,
"loss": 1.1658,
"step": 3992
},
{
"epoch": 0.9820462370880472,
"grad_norm": 3.8683032989501953,
"learning_rate": 9.425476140377084e-06,
"loss": 1.3352,
"step": 3993
},
{
"epoch": 0.9822921790457452,
"grad_norm": 3.2530999183654785,
"learning_rate": 9.425097547024605e-06,
"loss": 1.2565,
"step": 3994
},
{
"epoch": 0.9825381210034432,
"grad_norm": 3.300710916519165,
"learning_rate": 9.424718836581236e-06,
"loss": 1.1846,
"step": 3995
},
{
"epoch": 0.9827840629611412,
"grad_norm": 3.4276251792907715,
"learning_rate": 9.424340009056993e-06,
"loss": 1.1894,
"step": 3996
},
{
"epoch": 0.9830300049188392,
"grad_norm": 3.4976072311401367,
"learning_rate": 9.423961064461906e-06,
"loss": 1.173,
"step": 3997
},
{
"epoch": 0.9832759468765372,
"grad_norm": 3.4860527515411377,
"learning_rate": 9.423582002806e-06,
"loss": 1.2497,
"step": 3998
},
{
"epoch": 0.9835218888342351,
"grad_norm": 3.487330198287964,
"learning_rate": 9.423202824099305e-06,
"loss": 1.2034,
"step": 3999
},
{
"epoch": 0.9837678307919331,
"grad_norm": 3.8622448444366455,
"learning_rate": 9.422823528351855e-06,
"loss": 1.223,
"step": 4000
},
{
"epoch": 0.9837678307919331,
"eval_loss": 1.2509678602218628,
"eval_runtime": 13.6283,
"eval_samples_per_second": 29.351,
"eval_steps_per_second": 3.669,
"step": 4000
},
{
"epoch": 0.9840137727496311,
"grad_norm": 4.094562530517578,
"learning_rate": 9.422444115573683e-06,
"loss": 1.3321,
"step": 4001
},
{
"epoch": 0.9842597147073291,
"grad_norm": 3.702362060546875,
"learning_rate": 9.422064585774832e-06,
"loss": 1.2424,
"step": 4002
},
{
"epoch": 0.9845056566650271,
"grad_norm": 3.285337448120117,
"learning_rate": 9.421684938965346e-06,
"loss": 1.1869,
"step": 4003
},
{
"epoch": 0.984751598622725,
"grad_norm": 3.5119123458862305,
"learning_rate": 9.421305175155266e-06,
"loss": 1.1845,
"step": 4004
},
{
"epoch": 0.984997540580423,
"grad_norm": 3.4742894172668457,
"learning_rate": 9.420925294354644e-06,
"loss": 1.1436,
"step": 4005
},
{
"epoch": 0.985243482538121,
"grad_norm": 3.398460865020752,
"learning_rate": 9.420545296573532e-06,
"loss": 1.1115,
"step": 4006
},
{
"epoch": 0.985489424495819,
"grad_norm": 3.6169159412384033,
"learning_rate": 9.420165181821983e-06,
"loss": 1.2044,
"step": 4007
},
{
"epoch": 0.985735366453517,
"grad_norm": 3.693427324295044,
"learning_rate": 9.419784950110054e-06,
"loss": 1.3031,
"step": 4008
},
{
"epoch": 0.985981308411215,
"grad_norm": 3.8905467987060547,
"learning_rate": 9.419404601447812e-06,
"loss": 1.2555,
"step": 4009
},
{
"epoch": 0.9862272503689129,
"grad_norm": 3.499493360519409,
"learning_rate": 9.419024135845314e-06,
"loss": 1.2625,
"step": 4010
},
{
"epoch": 0.9864731923266109,
"grad_norm": 3.42989182472229,
"learning_rate": 9.418643553312634e-06,
"loss": 1.136,
"step": 4011
},
{
"epoch": 0.9867191342843089,
"grad_norm": 3.911168336868286,
"learning_rate": 9.418262853859838e-06,
"loss": 1.3539,
"step": 4012
},
{
"epoch": 0.9869650762420069,
"grad_norm": 3.4512367248535156,
"learning_rate": 9.417882037497e-06,
"loss": 1.1902,
"step": 4013
},
{
"epoch": 0.9872110181997049,
"grad_norm": 3.241431713104248,
"learning_rate": 9.417501104234201e-06,
"loss": 1.1418,
"step": 4014
},
{
"epoch": 0.9874569601574028,
"grad_norm": 3.5200860500335693,
"learning_rate": 9.417120054081515e-06,
"loss": 1.143,
"step": 4015
},
{
"epoch": 0.9877029021151008,
"grad_norm": 3.689511299133301,
"learning_rate": 9.416738887049026e-06,
"loss": 1.3157,
"step": 4016
},
{
"epoch": 0.9879488440727988,
"grad_norm": 3.661564350128174,
"learning_rate": 9.416357603146824e-06,
"loss": 1.3345,
"step": 4017
},
{
"epoch": 0.9881947860304968,
"grad_norm": 3.3963818550109863,
"learning_rate": 9.415976202384993e-06,
"loss": 1.1748,
"step": 4018
},
{
"epoch": 0.9884407279881948,
"grad_norm": 3.3439180850982666,
"learning_rate": 9.415594684773627e-06,
"loss": 1.121,
"step": 4019
},
{
"epoch": 0.9886866699458927,
"grad_norm": 3.4944005012512207,
"learning_rate": 9.415213050322822e-06,
"loss": 1.1535,
"step": 4020
},
{
"epoch": 0.9889326119035907,
"grad_norm": 3.8324179649353027,
"learning_rate": 9.414831299042676e-06,
"loss": 1.1717,
"step": 4021
},
{
"epoch": 0.9891785538612887,
"grad_norm": 3.4863691329956055,
"learning_rate": 9.41444943094329e-06,
"loss": 1.2869,
"step": 4022
},
{
"epoch": 0.9894244958189867,
"grad_norm": 3.570549249649048,
"learning_rate": 9.414067446034768e-06,
"loss": 1.1814,
"step": 4023
},
{
"epoch": 0.9896704377766847,
"grad_norm": 3.2333450317382812,
"learning_rate": 9.41368534432722e-06,
"loss": 1.1036,
"step": 4024
},
{
"epoch": 0.9899163797343827,
"grad_norm": 3.5444629192352295,
"learning_rate": 9.413303125830752e-06,
"loss": 1.2759,
"step": 4025
},
{
"epoch": 0.9901623216920806,
"grad_norm": 3.7427749633789062,
"learning_rate": 9.412920790555481e-06,
"loss": 1.3634,
"step": 4026
},
{
"epoch": 0.9904082636497786,
"grad_norm": 3.4836504459381104,
"learning_rate": 9.412538338511524e-06,
"loss": 1.1114,
"step": 4027
},
{
"epoch": 0.9906542056074766,
"grad_norm": 3.6351001262664795,
"learning_rate": 9.412155769709e-06,
"loss": 1.1319,
"step": 4028
},
{
"epoch": 0.9909001475651746,
"grad_norm": 3.4052226543426514,
"learning_rate": 9.411773084158034e-06,
"loss": 1.1852,
"step": 4029
},
{
"epoch": 0.9911460895228726,
"grad_norm": 3.523376941680908,
"learning_rate": 9.411390281868747e-06,
"loss": 1.3752,
"step": 4030
},
{
"epoch": 0.9913920314805705,
"grad_norm": 3.6203367710113525,
"learning_rate": 9.411007362851274e-06,
"loss": 1.2091,
"step": 4031
},
{
"epoch": 0.9916379734382685,
"grad_norm": 3.234666109085083,
"learning_rate": 9.410624327115745e-06,
"loss": 0.9977,
"step": 4032
},
{
"epoch": 0.9918839153959665,
"grad_norm": 3.21868634223938,
"learning_rate": 9.410241174672294e-06,
"loss": 1.1103,
"step": 4033
},
{
"epoch": 0.9921298573536645,
"grad_norm": 3.424708127975464,
"learning_rate": 9.409857905531062e-06,
"loss": 1.0448,
"step": 4034
},
{
"epoch": 0.9923757993113625,
"grad_norm": 4.0532612800598145,
"learning_rate": 9.409474519702187e-06,
"loss": 1.3371,
"step": 4035
},
{
"epoch": 0.9926217412690606,
"grad_norm": 3.8172054290771484,
"learning_rate": 9.409091017195817e-06,
"loss": 1.1483,
"step": 4036
},
{
"epoch": 0.9928676832267584,
"grad_norm": 4.097633361816406,
"learning_rate": 9.4087073980221e-06,
"loss": 1.3047,
"step": 4037
},
{
"epoch": 0.9931136251844564,
"grad_norm": 3.5504236221313477,
"learning_rate": 9.408323662191184e-06,
"loss": 1.3312,
"step": 4038
},
{
"epoch": 0.9933595671421545,
"grad_norm": 3.5524344444274902,
"learning_rate": 9.407939809713223e-06,
"loss": 1.1869,
"step": 4039
},
{
"epoch": 0.9936055090998525,
"grad_norm": 3.3190994262695312,
"learning_rate": 9.407555840598377e-06,
"loss": 1.1973,
"step": 4040
},
{
"epoch": 0.9938514510575505,
"grad_norm": 3.934922456741333,
"learning_rate": 9.407171754856804e-06,
"loss": 1.339,
"step": 4041
},
{
"epoch": 0.9940973930152484,
"grad_norm": 3.707014560699463,
"learning_rate": 9.406787552498667e-06,
"loss": 1.3354,
"step": 4042
},
{
"epoch": 0.9943433349729464,
"grad_norm": 3.6166956424713135,
"learning_rate": 9.406403233534134e-06,
"loss": 1.2492,
"step": 4043
},
{
"epoch": 0.9945892769306444,
"grad_norm": 3.4014697074890137,
"learning_rate": 9.40601879797337e-06,
"loss": 1.2149,
"step": 4044
},
{
"epoch": 0.9948352188883424,
"grad_norm": 3.472659111022949,
"learning_rate": 9.405634245826554e-06,
"loss": 1.1623,
"step": 4045
},
{
"epoch": 0.9950811608460404,
"grad_norm": 3.311044692993164,
"learning_rate": 9.405249577103857e-06,
"loss": 1.1866,
"step": 4046
},
{
"epoch": 0.9953271028037384,
"grad_norm": 3.1138179302215576,
"learning_rate": 9.404864791815457e-06,
"loss": 1.1194,
"step": 4047
},
{
"epoch": 0.9955730447614363,
"grad_norm": 3.6343393325805664,
"learning_rate": 9.404479889971538e-06,
"loss": 1.328,
"step": 4048
},
{
"epoch": 0.9958189867191343,
"grad_norm": 3.2858681678771973,
"learning_rate": 9.404094871582284e-06,
"loss": 1.1085,
"step": 4049
},
{
"epoch": 0.9960649286768323,
"grad_norm": 3.55973744392395,
"learning_rate": 9.403709736657882e-06,
"loss": 1.0645,
"step": 4050
},
{
"epoch": 0.9963108706345303,
"grad_norm": 3.8255014419555664,
"learning_rate": 9.403324485208525e-06,
"loss": 1.1739,
"step": 4051
},
{
"epoch": 0.9965568125922283,
"grad_norm": 3.441549301147461,
"learning_rate": 9.402939117244404e-06,
"loss": 1.3294,
"step": 4052
},
{
"epoch": 0.9968027545499262,
"grad_norm": 4.243690013885498,
"learning_rate": 9.40255363277572e-06,
"loss": 1.3512,
"step": 4053
},
{
"epoch": 0.9970486965076242,
"grad_norm": 3.7497622966766357,
"learning_rate": 9.402168031812669e-06,
"loss": 1.2329,
"step": 4054
},
{
"epoch": 0.9972946384653222,
"grad_norm": 3.435392141342163,
"learning_rate": 9.401782314365458e-06,
"loss": 1.1095,
"step": 4055
},
{
"epoch": 0.9975405804230202,
"grad_norm": 3.5515782833099365,
"learning_rate": 9.40139648044429e-06,
"loss": 1.2276,
"step": 4056
},
{
"epoch": 0.9977865223807182,
"grad_norm": 4.4121904373168945,
"learning_rate": 9.401010530059375e-06,
"loss": 1.3025,
"step": 4057
},
{
"epoch": 0.9980324643384161,
"grad_norm": 3.5035877227783203,
"learning_rate": 9.400624463220928e-06,
"loss": 1.1532,
"step": 4058
},
{
"epoch": 0.9982784062961141,
"grad_norm": 4.073094367980957,
"learning_rate": 9.400238279939164e-06,
"loss": 1.2571,
"step": 4059
},
{
"epoch": 0.9985243482538121,
"grad_norm": 3.499570369720459,
"learning_rate": 9.399851980224298e-06,
"loss": 1.1597,
"step": 4060
},
{
"epoch": 0.9987702902115101,
"grad_norm": 3.870664596557617,
"learning_rate": 9.399465564086556e-06,
"loss": 1.2482,
"step": 4061
},
{
"epoch": 0.9990162321692081,
"grad_norm": 3.9459023475646973,
"learning_rate": 9.39907903153616e-06,
"loss": 1.2638,
"step": 4062
},
{
"epoch": 0.9992621741269061,
"grad_norm": 3.538461208343506,
"learning_rate": 9.39869238258334e-06,
"loss": 1.2039,
"step": 4063
},
{
"epoch": 0.999508116084604,
"grad_norm": 3.2853970527648926,
"learning_rate": 9.398305617238326e-06,
"loss": 1.1594,
"step": 4064
},
{
"epoch": 0.999754058042302,
"grad_norm": 3.9652302265167236,
"learning_rate": 9.397918735511352e-06,
"loss": 1.3362,
"step": 4065
},
{
"epoch": 1.0,
"grad_norm": 3.6118056774139404,
"learning_rate": 9.397531737412655e-06,
"loss": 1.1831,
"step": 4066
}
],
"logging_steps": 1.0,
"max_steps": 20330,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500.0,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 1.278722452087636e+18,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}